diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,13633 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.8859357696566998, + "eval_steps": 500, + "global_step": 800, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "completion_length": 73.265625, + "epoch": 0.0011074197120708748, + "grad_norm": 0.47520893812179565, + "kl": 0.0, + "learning_rate": 9.99375e-07, + "loss": 0.000854941550642252, + "reward": 2.2648561000823975, + "reward_std": 0.32521533221006393, + "rewards/GDino": 0.84943026304245, + "rewards/GIT": 0.5776679813861847, + "rewards/HPSv2": 0.2639656066894531, + "rewards/ORM": 0.5737921893596649, + "self_certainty_semantic": -25.4375, + "self_certainty_token": -22.0, + "step": 1 + }, + { + "completion_length": 57.359375, + "epoch": 0.0022148394241417496, + "grad_norm": 0.7006784677505493, + "kl": 0.00151824951171875, + "learning_rate": 9.9875e-07, + "loss": 0.0010380030144006014, + "reward": 1.6890186071395874, + "reward_std": 0.5064275413751602, + "rewards/GDino": 0.7000000476837158, + "rewards/GIT": 0.161313958466053, + "rewards/HPSv2": 0.2509632110595703, + "rewards/ORM": 0.5767413973808289, + "self_certainty_semantic": -25.375, + "self_certainty_token": -20.5625, + "step": 2 + }, + { + "completion_length": 54.640625, + "epoch": 0.0033222591362126247, + "grad_norm": 0.5812113285064697, + "kl": 0.001556396484375, + "learning_rate": 9.98125e-07, + "loss": -0.0055133504793047905, + "reward": 1.5832943320274353, + "reward_std": 0.3882431983947754, + "rewards/GDino": 0.6165956258773804, + "rewards/GIT": 0.3970412313938141, + "rewards/HPSv2": 0.24474143981933594, + "rewards/ORM": 0.3249160535633564, + "self_certainty_semantic": -25.1875, + "self_certainty_token": -20.8125, + "step": 3 + }, + { + "completion_length": 63.578125, + "epoch": 0.004429678848283499, + "grad_norm": 0.6130731105804443, + "kl": 0.001605987548828125, + "learning_rate": 9.975e-07, + "loss": -0.005623435601592064, + "reward": 2.1563462018966675, + "reward_std": 0.3505118489265442, + "rewards/GDino": 0.8188963234424591, + "rewards/GIT": 0.4581628292798996, + "rewards/HPSv2": 0.24955368041992188, + "rewards/ORM": 0.6297334432601929, + "self_certainty_semantic": -25.5, + "self_certainty_token": -22.0, + "step": 4 + }, + { + "completion_length": 57.65625, + "epoch": 0.005537098560354375, + "grad_norm": 0.8068524599075317, + "kl": 0.00165557861328125, + "learning_rate": 9.968749999999999e-07, + "loss": -0.0018901200965046883, + "reward": 1.6294466853141785, + "reward_std": 0.3914882242679596, + "rewards/GDino": 0.6075743436813354, + "rewards/GIT": 0.2503758817911148, + "rewards/HPSv2": 0.2523918151855469, + "rewards/ORM": 0.5191046893596649, + "self_certainty_semantic": -25.4375, + "self_certainty_token": -21.8125, + "step": 5 + }, + { + "completion_length": 65.8125, + "epoch": 0.006644518272425249, + "grad_norm": 74728.3515625, + "kl": 228.00085067749023, + "learning_rate": 9.9625e-07, + "loss": 2.2879227567464113, + "reward": 2.15460866689682, + "reward_std": 0.18937285244464874, + "rewards/GDino": 0.7502027153968811, + "rewards/GIT": 0.4551280438899994, + "rewards/HPSv2": 0.2774028778076172, + "rewards/ORM": 0.671875, + "self_certainty_semantic": -25.3125, + "self_certainty_token": -22.625, + "step": 6 + }, + { + "completion_length": 65.640625, + "epoch": 0.007751937984496124, + "grad_norm": 0.9850716590881348, + "kl": 0.001739501953125, + "learning_rate": 9.956249999999999e-07, + "loss": -0.009785129223018885, + "reward": 1.6486687660217285, + "reward_std": 0.55589759349823, + "rewards/GDino": 0.5765624940395355, + "rewards/GIT": 0.15754839032888412, + "rewards/HPSv2": 0.2522296905517578, + "rewards/ORM": 0.6623281538486481, + "self_certainty_semantic": -25.5625, + "self_certainty_token": -22.1875, + "step": 7 + }, + { + "completion_length": 65.796875, + "epoch": 0.008859357696566999, + "grad_norm": 0.8074976801872253, + "kl": 0.001628875732421875, + "learning_rate": 9.95e-07, + "loss": 0.0002866658614948392, + "reward": 1.7531355023384094, + "reward_std": 0.3834189176559448, + "rewards/GDino": 0.7171875536441803, + "rewards/GIT": 0.3904750794172287, + "rewards/HPSv2": 0.2441272735595703, + "rewards/ORM": 0.4013456404209137, + "self_certainty_semantic": -25.4375, + "self_certainty_token": -20.5, + "step": 8 + }, + { + "completion_length": 61.53125, + "epoch": 0.009966777408637873, + "grad_norm": 0.5135362148284912, + "kl": 0.001628875732421875, + "learning_rate": 9.94375e-07, + "loss": -0.002820038120262325, + "reward": 2.1886491775512695, + "reward_std": 0.5042529106140137, + "rewards/GDino": 0.800000011920929, + "rewards/GIT": 0.3224633187055588, + "rewards/HPSv2": 0.2661018371582031, + "rewards/ORM": 0.8000838756561279, + "self_certainty_semantic": -25.4375, + "self_certainty_token": -21.5, + "step": 9 + }, + { + "completion_length": 66.9375, + "epoch": 0.01107419712070875, + "grad_norm": 1.035406231880188, + "kl": 0.001590728759765625, + "learning_rate": 9.9375e-07, + "loss": 0.010037540923804045, + "reward": 1.8388126492500305, + "reward_std": 0.385573148727417, + "rewards/GDino": 0.729426920413971, + "rewards/GIT": 0.47063055634498596, + "rewards/HPSv2": 0.25093841552734375, + "rewards/ORM": 0.3878167122602463, + "self_certainty_semantic": -25.375, + "self_certainty_token": -20.75, + "step": 10 + }, + { + "completion_length": 54.65625, + "epoch": 0.012181616832779624, + "grad_norm": 0.6659172773361206, + "kl": 0.00159454345703125, + "learning_rate": 9.93125e-07, + "loss": -0.010986692272126675, + "reward": 2.312160015106201, + "reward_std": 0.3424924612045288, + "rewards/GDino": 0.7864583432674408, + "rewards/GIT": 0.5519254580140114, + "rewards/HPSv2": 0.2634601593017578, + "rewards/ORM": 0.710316002368927, + "self_certainty_semantic": -25.3125, + "self_certainty_token": -20.9375, + "step": 11 + }, + { + "completion_length": 65.6875, + "epoch": 0.013289036544850499, + "grad_norm": 0.4100457727909088, + "kl": 0.00152587890625, + "learning_rate": 9.925e-07, + "loss": -0.0020649502985179424, + "reward": 1.831676721572876, + "reward_std": 0.37266574054956436, + "rewards/GDino": 0.6748343408107758, + "rewards/GIT": 0.3966377377510071, + "rewards/HPSv2": 0.2431049346923828, + "rewards/ORM": 0.5170995742082596, + "self_certainty_semantic": -25.5, + "self_certainty_token": -21.5, + "step": 12 + }, + { + "completion_length": 62.15625, + "epoch": 0.014396456256921373, + "grad_norm": 1.1354421377182007, + "kl": 0.0016326904296875, + "learning_rate": 9.91875e-07, + "loss": -0.0013978920178487897, + "reward": 1.7478299736976624, + "reward_std": 0.3111024349927902, + "rewards/GDino": 0.7122170925140381, + "rewards/GIT": 0.28808362782001495, + "rewards/HPSv2": 0.2510089874267578, + "rewards/ORM": 0.4965202957391739, + "self_certainty_semantic": -25.4375, + "self_certainty_token": -21.6875, + "step": 13 + }, + { + "completion_length": 63.734375, + "epoch": 0.015503875968992248, + "grad_norm": 171.63954162597656, + "kl": 11.750831604003906, + "learning_rate": 9.912499999999998e-07, + "loss": 0.11320369923487306, + "reward": 1.820958137512207, + "reward_std": 0.6430586874485016, + "rewards/GDino": 0.7286913394927979, + "rewards/GIT": 0.39159613847732544, + "rewards/HPSv2": 0.222503662109375, + "rewards/ORM": 0.47816696763038635, + "self_certainty_semantic": -25.375, + "self_certainty_token": -21.875, + "step": 14 + }, + { + "completion_length": 64.796875, + "epoch": 0.016611295681063124, + "grad_norm": 1.790418267250061, + "kl": 0.001697540283203125, + "learning_rate": 9.90625e-07, + "loss": -0.0012796747614629567, + "reward": 2.4724700450897217, + "reward_std": 0.361017182469368, + "rewards/GDino": 0.8982033133506775, + "rewards/GIT": 0.5411243438720703, + "rewards/HPSv2": 0.2581005096435547, + "rewards/ORM": 0.7750419676303864, + "self_certainty_semantic": -25.625, + "self_certainty_token": -21.8125, + "step": 15 + }, + { + "completion_length": 65.078125, + "epoch": 0.017718715393133997, + "grad_norm": 0.38361120223999023, + "kl": 0.0015869140625, + "learning_rate": 9.9e-07, + "loss": 0.006866331794299185, + "reward": 1.5055131912231445, + "reward_std": 0.40322621166706085, + "rewards/GDino": 0.651562511920929, + "rewards/GIT": 0.2843637466430664, + "rewards/HPSv2": 0.24664592742919922, + "rewards/ORM": 0.32294100522994995, + "self_certainty_semantic": -25.5, + "self_certainty_token": -21.125, + "step": 16 + }, + { + "completion_length": 70.25, + "epoch": 0.018826135105204873, + "grad_norm": 1.0185045003890991, + "kl": 0.001552581787109375, + "learning_rate": 9.89375e-07, + "loss": -0.010323233203962445, + "reward": 1.5897727608680725, + "reward_std": 0.530043363571167, + "rewards/GDino": 0.5529386103153229, + "rewards/GIT": 0.2131059616804123, + "rewards/HPSv2": 0.2552909851074219, + "rewards/ORM": 0.5684372782707214, + "self_certainty_semantic": -25.3125, + "self_certainty_token": -20.5, + "step": 17 + }, + { + "completion_length": 66.34375, + "epoch": 0.019933554817275746, + "grad_norm": 0.4375481605529785, + "kl": 0.00156402587890625, + "learning_rate": 9.8875e-07, + "loss": -0.00136462040245533, + "reward": 2.063610315322876, + "reward_std": 0.42642320692539215, + "rewards/GDino": 0.7955474257469177, + "rewards/GIT": 0.5150393098592758, + "rewards/HPSv2": 0.22445201873779297, + "rewards/ORM": 0.528571605682373, + "self_certainty_semantic": -25.4375, + "self_certainty_token": -20.75, + "step": 18 + }, + { + "completion_length": 59.21875, + "epoch": 0.021040974529346623, + "grad_norm": 0.3959902226924896, + "kl": 0.00164031982421875, + "learning_rate": 9.88125e-07, + "loss": -0.0053134458139538765, + "reward": 1.5237417221069336, + "reward_std": 0.4693976193666458, + "rewards/GDino": 0.701702356338501, + "rewards/GIT": 0.2579326629638672, + "rewards/HPSv2": 0.24812698364257812, + "rewards/ORM": 0.3159796893596649, + "self_certainty_semantic": -25.1875, + "self_certainty_token": -21.5625, + "step": 19 + }, + { + "completion_length": 61.484375, + "epoch": 0.0221483942414175, + "grad_norm": 0.5081169605255127, + "kl": 0.001689910888671875, + "learning_rate": 9.875e-07, + "loss": 0.0003520832397043705, + "reward": 1.9516127109527588, + "reward_std": 0.2731045335531235, + "rewards/GDino": 0.6437798738479614, + "rewards/GIT": 0.4635310173034668, + "rewards/HPSv2": 0.24121475219726562, + "rewards/ORM": 0.6030870825052261, + "self_certainty_semantic": -25.1875, + "self_certainty_token": -21.5625, + "step": 20 + }, + { + "completion_length": 55.546875, + "epoch": 0.023255813953488372, + "grad_norm": 0.4565694034099579, + "kl": 0.001667022705078125, + "learning_rate": 9.86875e-07, + "loss": 0.0016932454891502857, + "reward": 2.180082321166992, + "reward_std": 0.5037369430065155, + "rewards/GDino": 0.7953125238418579, + "rewards/GIT": 0.45517681539058685, + "rewards/HPSv2": 0.2586212158203125, + "rewards/ORM": 0.6709719300270081, + "self_certainty_semantic": -25.25, + "self_certainty_token": -21.9375, + "step": 21 + }, + { + "completion_length": 68.75, + "epoch": 0.024363233665559248, + "grad_norm": 0.45827633142471313, + "kl": 0.001712799072265625, + "learning_rate": 9.862499999999999e-07, + "loss": 0.0007174527272582054, + "reward": 1.8721013069152832, + "reward_std": 0.4303991347551346, + "rewards/GDino": 0.6911458671092987, + "rewards/GIT": 0.36048486828804016, + "rewards/HPSv2": 0.2603263854980469, + "rewards/ORM": 0.5601442009210587, + "self_certainty_semantic": -25.6875, + "self_certainty_token": -22.4375, + "step": 22 + }, + { + "completion_length": 58.609375, + "epoch": 0.02547065337763012, + "grad_norm": 0.6875389218330383, + "kl": 0.00162506103515625, + "learning_rate": 9.85625e-07, + "loss": -0.004631380317732692, + "reward": 1.9805514812469482, + "reward_std": 0.5138447731733322, + "rewards/GDino": 0.706105500459671, + "rewards/GIT": 0.4199465811252594, + "rewards/HPSv2": 0.26941490173339844, + "rewards/ORM": 0.5850843787193298, + "self_certainty_semantic": -25.4375, + "self_certainty_token": -20.9375, + "step": 23 + }, + { + "completion_length": 60.859375, + "epoch": 0.026578073089700997, + "grad_norm": 0.5052416324615479, + "kl": 0.001667022705078125, + "learning_rate": 9.849999999999999e-07, + "loss": -0.0046843914315104485, + "reward": 2.368114173412323, + "reward_std": 0.4367552697658539, + "rewards/GDino": 0.815625011920929, + "rewards/GIT": 0.633857935667038, + "rewards/HPSv2": 0.25930213928222656, + "rewards/ORM": 0.6593290567398071, + "self_certainty_semantic": -25.75, + "self_certainty_token": -21.875, + "step": 24 + }, + { + "completion_length": 61.078125, + "epoch": 0.02768549280177187, + "grad_norm": 0.6162320971488953, + "kl": 0.001617431640625, + "learning_rate": 9.84375e-07, + "loss": -0.005464642075821757, + "reward": 1.9494624137878418, + "reward_std": 0.40468768775463104, + "rewards/GDino": 0.6967671811580658, + "rewards/GIT": 0.40975040197372437, + "rewards/HPSv2": 0.26043701171875, + "rewards/ORM": 0.5825077295303345, + "self_certainty_semantic": -25.5, + "self_certainty_token": -21.3125, + "step": 25 + }, + { + "completion_length": 50.734375, + "epoch": 0.028792912513842746, + "grad_norm": 2.8454437255859375, + "kl": 0.001804351806640625, + "learning_rate": 9.8375e-07, + "loss": -0.006305628921836615, + "reward": 2.190965175628662, + "reward_std": 0.44982025027275085, + "rewards/GDino": 0.7243013381958008, + "rewards/GIT": 0.5294483602046967, + "rewards/HPSv2": 0.2750282287597656, + "rewards/ORM": 0.6621872782707214, + "self_certainty_semantic": -25.3125, + "self_certainty_token": -22.375, + "step": 26 + }, + { + "completion_length": 62.484375, + "epoch": 0.029900332225913623, + "grad_norm": 0.4033506512641907, + "kl": 0.0016021728515625, + "learning_rate": 9.83125e-07, + "loss": -0.0016465974040329456, + "reward": 1.9733637571334839, + "reward_std": 0.44280076026916504, + "rewards/GDino": 0.7363362908363342, + "rewards/GIT": 0.4528593420982361, + "rewards/HPSv2": 0.24550628662109375, + "rewards/ORM": 0.5386618673801422, + "self_certainty_semantic": -25.5, + "self_certainty_token": -22.375, + "step": 27 + }, + { + "completion_length": 65.046875, + "epoch": 0.031007751937984496, + "grad_norm": 0.559298574924469, + "kl": 0.00167083740234375, + "learning_rate": 9.825e-07, + "loss": 0.004501585033722222, + "reward": 1.4280173778533936, + "reward_std": 0.27060839533805847, + "rewards/GDino": 0.5987553596496582, + "rewards/GIT": 0.10973574221134186, + "rewards/HPSv2": 0.2664012908935547, + "rewards/ORM": 0.453125, + "self_certainty_semantic": -25.4375, + "self_certainty_token": -20.9375, + "step": 28 + }, + { + "completion_length": 55.5625, + "epoch": 0.03211517165005537, + "grad_norm": 0.42233753204345703, + "kl": 0.00168609619140625, + "learning_rate": 9.81875e-07, + "loss": -0.005473613273352385, + "reward": 2.4506709575653076, + "reward_std": 0.20222720131278038, + "rewards/GDino": 0.8296874761581421, + "rewards/GIT": 0.605083167552948, + "rewards/HPSv2": 0.285858154296875, + "rewards/ORM": 0.7300421893596649, + "self_certainty_semantic": -25.6875, + "self_certainty_token": -20.9375, + "step": 29 + }, + { + "completion_length": 57.640625, + "epoch": 0.03322259136212625, + "grad_norm": 0.5650274157524109, + "kl": 0.0016326904296875, + "learning_rate": 9.8125e-07, + "loss": 0.0003150699194520712, + "reward": 2.489137649536133, + "reward_std": 0.4210814982652664, + "rewards/GDino": 0.8948009014129639, + "rewards/GIT": 0.586266428232193, + "rewards/HPSv2": 0.24865341186523438, + "rewards/ORM": 0.7594169676303864, + "self_certainty_semantic": -25.3125, + "self_certainty_token": -21.0625, + "step": 30 + }, + { + "completion_length": 78.78125, + "epoch": 0.03433001107419712, + "grad_norm": 0.6762183308601379, + "kl": 0.001613616943359375, + "learning_rate": 9.806249999999998e-07, + "loss": 0.007568572706077248, + "reward": 1.8555968403816223, + "reward_std": 0.2906922847032547, + "rewards/GDino": 0.5989583432674408, + "rewards/GIT": 0.38505683839321136, + "rewards/HPSv2": 0.2403736114501953, + "rewards/ORM": 0.6312080323696136, + "self_certainty_semantic": -25.5625, + "self_certainty_token": -20.5625, + "step": 31 + }, + { + "completion_length": 62.5, + "epoch": 0.035437430786267994, + "grad_norm": 0.4184902012348175, + "kl": 0.001628875732421875, + "learning_rate": 9.8e-07, + "loss": 0.007896744413301349, + "reward": 1.495099127292633, + "reward_std": 0.3622882664203644, + "rewards/GDino": 0.6791666448116302, + "rewards/GIT": 0.25104063749313354, + "rewards/HPSv2": 0.23050880432128906, + "rewards/ORM": 0.3343829959630966, + "self_certainty_semantic": -25.3125, + "self_certainty_token": -22.0625, + "step": 32 + }, + { + "completion_length": 70.109375, + "epoch": 0.036544850498338874, + "grad_norm": 0.47143352031707764, + "kl": 0.0016937255859375, + "learning_rate": 9.79375e-07, + "loss": 0.00709247519262135, + "reward": 2.3964842557907104, + "reward_std": 0.5415211468935013, + "rewards/GDino": 0.897656261920929, + "rewards/GIT": 0.6205766499042511, + "rewards/HPSv2": 0.2254810333251953, + "rewards/ORM": 0.6527703106403351, + "self_certainty_semantic": -25.3125, + "self_certainty_token": -21.625, + "step": 33 + }, + { + "completion_length": 55.53125, + "epoch": 0.03765227021040975, + "grad_norm": 0.45762747526168823, + "kl": 0.001678466796875, + "learning_rate": 9.7875e-07, + "loss": 0.020488019566982985, + "reward": 1.9143174886703491, + "reward_std": 0.2841227799654007, + "rewards/GDino": 0.6593749821186066, + "rewards/GIT": 0.4214262217283249, + "rewards/HPSv2": 0.2424945831298828, + "rewards/ORM": 0.5910216569900513, + "self_certainty_semantic": -25.3125, + "self_certainty_token": -21.75, + "step": 34 + }, + { + "completion_length": 55.828125, + "epoch": 0.03875968992248062, + "grad_norm": 0.3845841884613037, + "kl": 0.00167083740234375, + "learning_rate": 9.78125e-07, + "loss": 0.01862273830920458, + "reward": 2.274049997329712, + "reward_std": 0.28603486716747284, + "rewards/GDino": 0.7786458432674408, + "rewards/GIT": 0.5405041128396988, + "rewards/HPSv2": 0.23740386962890625, + "rewards/ORM": 0.7174962311983109, + "self_certainty_semantic": -25.3125, + "self_certainty_token": -21.25, + "step": 35 + }, + { + "completion_length": 63.234375, + "epoch": 0.03986710963455149, + "grad_norm": 0.5729533433914185, + "kl": 0.001678466796875, + "learning_rate": 9.775e-07, + "loss": -0.002963901497423649, + "reward": 1.8639960289001465, + "reward_std": 0.3890039473772049, + "rewards/GDino": 0.6255208253860474, + "rewards/GIT": 0.42713797092437744, + "rewards/HPSv2": 0.24535751342773438, + "rewards/ORM": 0.5659796744585037, + "self_certainty_semantic": -25.625, + "self_certainty_token": -21.3125, + "step": 36 + }, + { + "completion_length": 63.09375, + "epoch": 0.04097452934662237, + "grad_norm": 0.47338196635246277, + "kl": 0.001888275146484375, + "learning_rate": 9.76875e-07, + "loss": 0.008916446007788181, + "reward": 1.9735829830169678, + "reward_std": 0.5416238605976105, + "rewards/GDino": 0.7008762061595917, + "rewards/GIT": 0.3141380175948143, + "rewards/HPSv2": 0.2595968246459961, + "rewards/ORM": 0.6989719867706299, + "self_certainty_semantic": -25.375, + "self_certainty_token": -23.125, + "step": 37 + }, + { + "completion_length": 58.640625, + "epoch": 0.042081949058693245, + "grad_norm": 1.639336347579956, + "kl": 0.001651763916015625, + "learning_rate": 9.7625e-07, + "loss": -0.0003745388239622116, + "reward": 1.8843677639961243, + "reward_std": 0.27646802365779877, + "rewards/GDino": 0.7309310734272003, + "rewards/GIT": 0.2879854440689087, + "rewards/HPSv2": 0.25732994079589844, + "rewards/ORM": 0.6081212311983109, + "self_certainty_semantic": -25.375, + "self_certainty_token": -21.0625, + "step": 38 + }, + { + "completion_length": 54.453125, + "epoch": 0.04318936877076412, + "grad_norm": 0.4438176453113556, + "kl": 0.00176239013671875, + "learning_rate": 9.756249999999999e-07, + "loss": -0.004410726949572563, + "reward": 2.3740460872650146, + "reward_std": 0.26216618716716766, + "rewards/GDino": 0.8794216811656952, + "rewards/GIT": 0.480433389544487, + "rewards/HPSv2": 0.2703990936279297, + "rewards/ORM": 0.7437919676303864, + "self_certainty_semantic": -25.5, + "self_certainty_token": -21.0, + "step": 39 + }, + { + "completion_length": 64.65625, + "epoch": 0.044296788482835, + "grad_norm": 0.9789016246795654, + "kl": 0.0017242431640625, + "learning_rate": 9.75e-07, + "loss": -0.0008055282523855567, + "reward": 2.2535433769226074, + "reward_std": 0.46909773349761963, + "rewards/GDino": 0.8751652538776398, + "rewards/GIT": 0.4070926010608673, + "rewards/HPSv2": 0.2731647491455078, + "rewards/ORM": 0.6981207877397537, + "self_certainty_semantic": -25.4375, + "self_certainty_token": -20.625, + "step": 40 + }, + { + "completion_length": 60.3125, + "epoch": 0.04540420819490587, + "grad_norm": 0.39339736104011536, + "kl": 0.001697540283203125, + "learning_rate": 9.743749999999999e-07, + "loss": -0.0026839073980227113, + "reward": 1.926289677619934, + "reward_std": 0.21494604647159576, + "rewards/GDino": 0.6536072194576263, + "rewards/GIT": 0.38067150115966797, + "rewards/HPSv2": 0.2470531463623047, + "rewards/ORM": 0.6449578106403351, + "self_certainty_semantic": -25.375, + "self_certainty_token": -21.0, + "step": 41 + }, + { + "completion_length": 55.734375, + "epoch": 0.046511627906976744, + "grad_norm": 0.43325623869895935, + "kl": 0.001575469970703125, + "learning_rate": 9.7375e-07, + "loss": 0.01566000678576529, + "reward": 2.2492642402648926, + "reward_std": 0.545527771115303, + "rewards/GDino": 0.8451037406921387, + "rewards/GIT": 0.4486817270517349, + "rewards/HPSv2": 0.2523536682128906, + "rewards/ORM": 0.703125, + "self_certainty_semantic": -25.4375, + "self_certainty_token": -21.125, + "step": 42 + }, + { + "completion_length": 77.859375, + "epoch": 0.047619047619047616, + "grad_norm": 0.6008194088935852, + "kl": 0.00209808349609375, + "learning_rate": 9.73125e-07, + "loss": 0.009053934598341584, + "reward": 1.752554178237915, + "reward_std": 0.3711804449558258, + "rewards/GDino": 0.6425288617610931, + "rewards/GIT": 0.38656318187713623, + "rewards/HPSv2": 0.23595809936523438, + "rewards/ORM": 0.4875040054321289, + "self_certainty_semantic": -25.6875, + "self_certainty_token": -21.8125, + "step": 43 + }, + { + "completion_length": 64.859375, + "epoch": 0.048726467331118496, + "grad_norm": 0.4626310169696808, + "kl": 0.001750946044921875, + "learning_rate": 9.725e-07, + "loss": 0.00038470514118671417, + "reward": 2.837794542312622, + "reward_std": 0.3451881557703018, + "rewards/GDino": 0.9479166865348816, + "rewards/GIT": 0.7795328795909882, + "rewards/HPSv2": 0.26932334899902344, + "rewards/ORM": 0.8410216569900513, + "self_certainty_semantic": -25.5625, + "self_certainty_token": -21.5625, + "step": 44 + }, + { + "completion_length": 66.921875, + "epoch": 0.04983388704318937, + "grad_norm": 1.3941670656204224, + "kl": 0.001880645751953125, + "learning_rate": 9.71875e-07, + "loss": -0.012070931028574705, + "reward": 2.561403751373291, + "reward_std": 0.48213036358356476, + "rewards/GDino": 0.9039532244205475, + "rewards/GIT": 0.5467919409275055, + "rewards/HPSv2": 0.2617225646972656, + "rewards/ORM": 0.8489359319210052, + "self_certainty_semantic": -25.5625, + "self_certainty_token": -21.9375, + "step": 45 + }, + { + "completion_length": 59.625, + "epoch": 0.05094130675526024, + "grad_norm": 0.5365378260612488, + "kl": 0.001949310302734375, + "learning_rate": 9.712499999999998e-07, + "loss": 0.01103684725239873, + "reward": 2.0622146129608154, + "reward_std": 0.40072987973690033, + "rewards/GDino": 0.645312488079071, + "rewards/GIT": 0.33725525438785553, + "rewards/HPSv2": 0.2619609832763672, + "rewards/ORM": 0.8176859617233276, + "self_certainty_semantic": -25.25, + "self_certainty_token": -22.3125, + "step": 46 + }, + { + "completion_length": 64.6875, + "epoch": 0.05204872646733112, + "grad_norm": 0.5151812434196472, + "kl": 0.001766204833984375, + "learning_rate": 9.70625e-07, + "loss": -0.004148014355450869, + "reward": 1.7916635870933533, + "reward_std": 0.31147970259189606, + "rewards/GDino": 0.7293796539306641, + "rewards/GIT": 0.20818163454532623, + "rewards/HPSv2": 0.27945709228515625, + "rewards/ORM": 0.5746453106403351, + "self_certainty_semantic": -25.3125, + "self_certainty_token": -21.5625, + "step": 47 + }, + { + "completion_length": 56.25, + "epoch": 0.053156146179401995, + "grad_norm": 0.7559373378753662, + "kl": 0.001861572265625, + "learning_rate": 9.7e-07, + "loss": -0.002030523493885994, + "reward": 1.4302473068237305, + "reward_std": 0.4484506845474243, + "rewards/GDino": 0.6244329512119293, + "rewards/GIT": 0.0, + "rewards/HPSv2": 0.2752876281738281, + "rewards/ORM": 0.5305267572402954, + "self_certainty_semantic": -25.4375, + "self_certainty_token": -20.875, + "step": 48 + }, + { + "completion_length": 61.21875, + "epoch": 0.05426356589147287, + "grad_norm": 0.46310731768608093, + "kl": 0.00177764892578125, + "learning_rate": 9.69375e-07, + "loss": 0.0054672048427164555, + "reward": 1.9361683130264282, + "reward_std": 0.3801421523094177, + "rewards/GDino": 0.7904821038246155, + "rewards/GIT": 0.2458050437271595, + "rewards/HPSv2": 0.25890541076660156, + "rewards/ORM": 0.640975683927536, + "self_certainty_semantic": -25.5, + "self_certainty_token": -21.6875, + "step": 49 + }, + { + "completion_length": 61.921875, + "epoch": 0.05537098560354374, + "grad_norm": 0.5111473798751831, + "kl": 0.002353668212890625, + "learning_rate": 9.6875e-07, + "loss": 0.0035089042503386736, + "reward": 2.212684750556946, + "reward_std": 0.3874351307749748, + "rewards/GDino": 0.7840971350669861, + "rewards/GIT": 0.42198260873556137, + "rewards/HPSv2": 0.25807952880859375, + "rewards/ORM": 0.7485254108905792, + "self_certainty_semantic": -25.4375, + "self_certainty_token": -22.625, + "step": 50 + }, + { + "completion_length": 57.796875, + "epoch": 0.05647840531561462, + "grad_norm": 0.4804292917251587, + "kl": 0.001743316650390625, + "learning_rate": 9.68125e-07, + "loss": -0.0010273723164573312, + "reward": 1.8951371908187866, + "reward_std": 0.5679852366447449, + "rewards/GDino": 0.7922006845474243, + "rewards/GIT": 0.27185457944869995, + "rewards/HPSv2": 0.2777671813964844, + "rewards/ORM": 0.5533146858215332, + "self_certainty_semantic": -25.625, + "self_certainty_token": -22.0, + "step": 51 + }, + { + "completion_length": 62.140625, + "epoch": 0.05758582502768549, + "grad_norm": 0.5876587629318237, + "kl": 0.001842498779296875, + "learning_rate": 9.675e-07, + "loss": 0.010319232940673828, + "reward": 2.453005313873291, + "reward_std": 0.35728050768375397, + "rewards/GDino": 0.917187511920929, + "rewards/GIT": 0.6651300191879272, + "rewards/HPSv2": 0.27350807189941406, + "rewards/ORM": 0.5971797704696655, + "self_certainty_semantic": -25.4375, + "self_certainty_token": -21.0625, + "step": 52 + }, + { + "completion_length": 57.046875, + "epoch": 0.058693244739756366, + "grad_norm": 0.5244357585906982, + "kl": 0.00168609619140625, + "learning_rate": 9.66875e-07, + "loss": 0.0012504801852628589, + "reward": 1.8911731839179993, + "reward_std": 0.3232653737068176, + "rewards/GDino": 0.7297230660915375, + "rewards/GIT": 0.3948078155517578, + "rewards/HPSv2": 0.24039649963378906, + "rewards/ORM": 0.5262457728385925, + "self_certainty_semantic": -25.3125, + "self_certainty_token": -21.25, + "step": 53 + }, + { + "completion_length": 68.921875, + "epoch": 0.059800664451827246, + "grad_norm": 0.5011692047119141, + "kl": 0.0017547607421875, + "learning_rate": 9.6625e-07, + "loss": -0.001990929711610079, + "reward": 1.5346381068229675, + "reward_std": 0.5364750325679779, + "rewards/GDino": 0.5896078050136566, + "rewards/GIT": 0.2611962556838989, + "rewards/HPSv2": 0.24633407592773438, + "rewards/ORM": 0.4375000149011612, + "self_certainty_semantic": -25.5, + "self_certainty_token": -21.5625, + "step": 54 + }, + { + "completion_length": 65.28125, + "epoch": 0.06090808416389812, + "grad_norm": 0.43720903992652893, + "kl": 0.001796722412109375, + "learning_rate": 9.65625e-07, + "loss": 0.011945425532758236, + "reward": 1.7657405734062195, + "reward_std": 0.5052186846733093, + "rewards/GDino": 0.7055748403072357, + "rewards/GIT": 0.3213713690638542, + "rewards/HPSv2": 0.26223182678222656, + "rewards/ORM": 0.4765625, + "self_certainty_semantic": -25.5, + "self_certainty_token": -21.8125, + "step": 55 + }, + { + "completion_length": 72.15625, + "epoch": 0.06201550387596899, + "grad_norm": 0.6576823592185974, + "kl": 0.00201416015625, + "learning_rate": 9.649999999999999e-07, + "loss": 0.010990551207214594, + "reward": 2.0798487663269043, + "reward_std": 0.5881477892398834, + "rewards/GDino": 0.7611979246139526, + "rewards/GIT": 0.38940075039863586, + "rewards/HPSv2": 0.25081634521484375, + "rewards/ORM": 0.678433746099472, + "self_certainty_semantic": -25.125, + "self_certainty_token": -21.8125, + "step": 56 + }, + { + "completion_length": 53.84375, + "epoch": 0.06312292358803986, + "grad_norm": 0.5109694600105286, + "kl": 0.001708984375, + "learning_rate": 9.64375e-07, + "loss": -0.009197955019772053, + "reward": 1.825343132019043, + "reward_std": 0.49610868096351624, + "rewards/GDino": 0.7342002689838409, + "rewards/GIT": 0.27930086851119995, + "rewards/HPSv2": 0.2493419647216797, + "rewards/ORM": 0.5625, + "self_certainty_semantic": -25.6875, + "self_certainty_token": -21.5, + "step": 57 + }, + { + "completion_length": 54.671875, + "epoch": 0.06423034330011074, + "grad_norm": 0.48297855257987976, + "kl": 0.0018157958984375, + "learning_rate": 9.637499999999999e-07, + "loss": -2.7031637728214264e-05, + "reward": 1.9436655044555664, + "reward_std": 0.5841460824012756, + "rewards/GDino": 0.7508301734924316, + "rewards/GIT": 0.36742376536130905, + "rewards/HPSv2": 0.24603271484375, + "rewards/ORM": 0.579378753900528, + "self_certainty_semantic": -25.5625, + "self_certainty_token": -21.6875, + "step": 58 + }, + { + "completion_length": 57.34375, + "epoch": 0.06533776301218161, + "grad_norm": 1.5652471780776978, + "kl": 0.00185394287109375, + "learning_rate": 9.63125e-07, + "loss": -0.0014887296129018068, + "reward": 2.154895305633545, + "reward_std": 0.5548917800188065, + "rewards/GDino": 0.7907229363918304, + "rewards/GIT": 0.44339829683303833, + "rewards/HPSv2": 0.2567615509033203, + "rewards/ORM": 0.664012536406517, + "self_certainty_semantic": -25.5625, + "self_certainty_token": -21.0625, + "step": 59 + }, + { + "completion_length": 52.0625, + "epoch": 0.0664451827242525, + "grad_norm": 0.8647972941398621, + "kl": 0.00200653076171875, + "learning_rate": 9.624999999999999e-07, + "loss": -0.004864218062721193, + "reward": 2.183086931705475, + "reward_std": 0.27265597879886627, + "rewards/GDino": 0.8968750238418579, + "rewards/GIT": 0.4909053146839142, + "rewards/HPSv2": 0.2511100769042969, + "rewards/ORM": 0.544196605682373, + "self_certainty_semantic": -25.25, + "self_certainty_token": -20.8125, + "step": 60 + }, + { + "completion_length": 78.421875, + "epoch": 0.06755260243632337, + "grad_norm": 0.6149311065673828, + "kl": 0.0018310546875, + "learning_rate": 9.61875e-07, + "loss": -0.003399772336706519, + "reward": 2.3938775062561035, + "reward_std": 0.3266971558332443, + "rewards/GDino": 0.7299478650093079, + "rewards/GIT": 0.6572037935256958, + "rewards/HPSv2": 0.26293373107910156, + "rewards/ORM": 0.743791937828064, + "self_certainty_semantic": -25.5, + "self_certainty_token": -20.5, + "step": 61 + }, + { + "completion_length": 71.796875, + "epoch": 0.06866002214839424, + "grad_norm": 0.8106938600540161, + "kl": 0.00188446044921875, + "learning_rate": 9.6125e-07, + "loss": -0.004746791877551004, + "reward": 2.3078866004943848, + "reward_std": 0.4594850391149521, + "rewards/GDino": 0.7886728346347809, + "rewards/GIT": 0.6039779186248779, + "rewards/HPSv2": 0.2555561065673828, + "rewards/ORM": 0.6596797406673431, + "self_certainty_semantic": -25.5625, + "self_certainty_token": -21.125, + "step": 62 + }, + { + "completion_length": 57.703125, + "epoch": 0.06976744186046512, + "grad_norm": 0.5699672102928162, + "kl": 0.00218963623046875, + "learning_rate": 9.606249999999998e-07, + "loss": 0.005022911122068763, + "reward": 2.2111340165138245, + "reward_std": 0.6219878196716309, + "rewards/GDino": 0.794545441865921, + "rewards/GIT": 0.45049863308668137, + "rewards/HPSv2": 0.24386024475097656, + "rewards/ORM": 0.7222297191619873, + "self_certainty_semantic": -25.25, + "self_certainty_token": -21.9375, + "step": 63 + }, + { + "completion_length": 78.453125, + "epoch": 0.07087486157253599, + "grad_norm": 0.7573527693748474, + "kl": 0.0022125244140625, + "learning_rate": 9.6e-07, + "loss": 0.013895762618631124, + "reward": 1.6789215207099915, + "reward_std": 0.15597553551197052, + "rewards/GDino": 0.7209441661834717, + "rewards/GIT": 0.31718890368938446, + "rewards/HPSv2": 0.26105499267578125, + "rewards/ORM": 0.37973345816135406, + "self_certainty_semantic": -25.5625, + "self_certainty_token": -21.5625, + "step": 64 + }, + { + "completion_length": 63.59375, + "epoch": 0.07198228128460686, + "grad_norm": 0.4424923360347748, + "kl": 0.0020599365234375, + "learning_rate": 9.59375e-07, + "loss": 0.0005846736021339893, + "reward": 2.195925712585449, + "reward_std": 0.5788445174694061, + "rewards/GDino": 0.7169270515441895, + "rewards/GIT": 0.6367218196392059, + "rewards/HPSv2": 0.2345561981201172, + "rewards/ORM": 0.6077205836772919, + "self_certainty_semantic": -25.4375, + "self_certainty_token": -21.75, + "step": 65 + }, + { + "completion_length": 67.6875, + "epoch": 0.07308970099667775, + "grad_norm": 0.5050013661384583, + "kl": 0.00211334228515625, + "learning_rate": 9.5875e-07, + "loss": 0.010172993643209338, + "reward": 2.220258355140686, + "reward_std": 0.30588236451148987, + "rewards/GDino": 0.7442708909511566, + "rewards/GIT": 0.47482602298259735, + "rewards/HPSv2": 0.25937461853027344, + "rewards/ORM": 0.7417868673801422, + "self_certainty_semantic": -25.375, + "self_certainty_token": -21.5, + "step": 66 + }, + { + "completion_length": 72.75, + "epoch": 0.07419712070874862, + "grad_norm": 0.47647950053215027, + "kl": 0.001953125, + "learning_rate": 9.58125e-07, + "loss": 0.002580178901553154, + "reward": 2.3537763357162476, + "reward_std": 0.2857324182987213, + "rewards/GDino": 0.852263331413269, + "rewards/GIT": 0.5637244433164597, + "rewards/HPSv2": 0.2550220489501953, + "rewards/ORM": 0.6827665567398071, + "self_certainty_semantic": -25.4375, + "self_certainty_token": -21.4375, + "step": 67 + }, + { + "completion_length": 60.109375, + "epoch": 0.0753045404208195, + "grad_norm": 0.45224544405937195, + "kl": 0.0021209716796875, + "learning_rate": 9.575e-07, + "loss": 0.002825574716553092, + "reward": 1.613221287727356, + "reward_std": 0.332104429602623, + "rewards/GDino": 0.6193348169326782, + "rewards/GIT": 0.2909398823976517, + "rewards/HPSv2": 0.2551765441894531, + "rewards/ORM": 0.4477700889110565, + "self_certainty_semantic": -25.25, + "self_certainty_token": -21.0625, + "step": 68 + }, + { + "completion_length": 72.6875, + "epoch": 0.07641196013289037, + "grad_norm": 0.688894510269165, + "kl": 0.002315521240234375, + "learning_rate": 9.56875e-07, + "loss": 0.012800770811736584, + "reward": 2.1092969179153442, + "reward_std": 0.36874186992645264, + "rewards/GDino": 0.8054687678813934, + "rewards/GIT": 0.3866874873638153, + "rewards/HPSv2": 0.26236534118652344, + "rewards/ORM": 0.6547753810882568, + "self_certainty_semantic": -25.625, + "self_certainty_token": -21.5, + "step": 69 + }, + { + "completion_length": 60.640625, + "epoch": 0.07751937984496124, + "grad_norm": 0.45330390334129333, + "kl": 0.00215911865234375, + "learning_rate": 9.5625e-07, + "loss": -0.0010713667143136263, + "reward": 1.552397072315216, + "reward_std": 0.39455118775367737, + "rewards/GDino": 0.6554375886917114, + "rewards/GIT": 0.22663478553295135, + "rewards/HPSv2": 0.2546577453613281, + "rewards/ORM": 0.41566696763038635, + "self_certainty_semantic": -25.25, + "self_certainty_token": -20.75, + "step": 70 + }, + { + "completion_length": 76.515625, + "epoch": 0.07862679955703211, + "grad_norm": 0.5808414220809937, + "kl": 0.00222015380859375, + "learning_rate": 9.556249999999999e-07, + "loss": 0.0038980550598353148, + "reward": 1.9476300477981567, + "reward_std": 0.38603267073631287, + "rewards/GDino": 0.7262610197067261, + "rewards/GIT": 0.30087296664714813, + "rewards/HPSv2": 0.26424598693847656, + "rewards/ORM": 0.6562500149011612, + "self_certainty_semantic": -25.4375, + "self_certainty_token": -19.9375, + "step": 71 + }, + { + "completion_length": 57.15625, + "epoch": 0.07973421926910298, + "grad_norm": 0.3693688213825226, + "kl": 0.00208282470703125, + "learning_rate": 9.55e-07, + "loss": -0.00035159417893737555, + "reward": 1.9391373991966248, + "reward_std": 0.3963821530342102, + "rewards/GDino": 0.6879567801952362, + "rewards/GIT": 0.4622843265533447, + "rewards/HPSv2": 0.24675464630126953, + "rewards/ORM": 0.5421415567398071, + "self_certainty_semantic": -25.0625, + "self_certainty_token": -20.9375, + "step": 72 + }, + { + "completion_length": 66.65625, + "epoch": 0.08084163898117387, + "grad_norm": 0.6215986013412476, + "kl": 0.0024871826171875, + "learning_rate": 9.543749999999999e-07, + "loss": 0.003838272183202207, + "reward": 2.1008963584899902, + "reward_std": 0.4600249230861664, + "rewards/GDino": 0.8240202069282532, + "rewards/GIT": 0.48449917137622833, + "rewards/HPSv2": 0.24818038940429688, + "rewards/ORM": 0.5441965609788895, + "self_certainty_semantic": -25.4375, + "self_certainty_token": -21.5, + "step": 73 + }, + { + "completion_length": 60.859375, + "epoch": 0.08194905869324474, + "grad_norm": 0.43593713641166687, + "kl": 0.0030364990234375, + "learning_rate": 9.5375e-07, + "loss": 0.002844013855792582, + "reward": 2.297879934310913, + "reward_std": 0.2846696451306343, + "rewards/GDino": 0.84375, + "rewards/GIT": 0.5265894532203674, + "rewards/HPSv2": 0.2544116973876953, + "rewards/ORM": 0.6731287837028503, + "self_certainty_semantic": -25.4375, + "self_certainty_token": -21.75, + "step": 74 + }, + { + "completion_length": 68.703125, + "epoch": 0.08305647840531562, + "grad_norm": 0.48668116331100464, + "kl": 0.002227783203125, + "learning_rate": 9.53125e-07, + "loss": -0.0021062323357909918, + "reward": 1.7519539594650269, + "reward_std": 0.3109753131866455, + "rewards/GDino": 0.6498888432979584, + "rewards/GIT": 0.2745012864470482, + "rewards/HPSv2": 0.26706886291503906, + "rewards/ORM": 0.5604948848485947, + "self_certainty_semantic": -25.5, + "self_certainty_token": -20.625, + "step": 75 + }, + { + "completion_length": 70.25, + "epoch": 0.08416389811738649, + "grad_norm": 0.5122522711753845, + "kl": 0.00208282470703125, + "learning_rate": 9.525e-07, + "loss": -0.00045439647510647774, + "reward": 2.371267318725586, + "reward_std": 0.4085633456707001, + "rewards/GDino": 0.8135416805744171, + "rewards/GIT": 0.6540948301553726, + "rewards/HPSv2": 0.2650108337402344, + "rewards/ORM": 0.6386198997497559, + "self_certainty_semantic": -25.6875, + "self_certainty_token": -20.75, + "step": 76 + }, + { + "completion_length": 62.875, + "epoch": 0.08527131782945736, + "grad_norm": 0.505736768245697, + "kl": 0.0037689208984375, + "learning_rate": 9.51875e-07, + "loss": -0.006699402409140021, + "reward": 1.5121636986732483, + "reward_std": 0.5349836349487305, + "rewards/GDino": 0.616510659456253, + "rewards/GIT": 0.18113864213228226, + "rewards/HPSv2": 0.228485107421875, + "rewards/ORM": 0.48602940142154694, + "self_certainty_semantic": -25.125, + "self_certainty_token": -21.875, + "step": 77 + }, + { + "completion_length": 65.8125, + "epoch": 0.08637873754152824, + "grad_norm": 0.4759610593318939, + "kl": 0.0022735595703125, + "learning_rate": 9.5125e-07, + "loss": 0.0014968996401876211, + "reward": 1.9482250213623047, + "reward_std": 0.38150524348020554, + "rewards/GDino": 0.7646995186805725, + "rewards/GIT": 0.31973105669021606, + "rewards/HPSv2": 0.2705249786376953, + "rewards/ORM": 0.5932694524526596, + "self_certainty_semantic": -25.5, + "self_certainty_token": -21.125, + "step": 78 + }, + { + "completion_length": 72.609375, + "epoch": 0.08748615725359911, + "grad_norm": 0.4961722195148468, + "kl": 0.00247955322265625, + "learning_rate": 9.50625e-07, + "loss": 0.00820195721462369, + "reward": 2.2431598901748657, + "reward_std": 0.19805177673697472, + "rewards/GDino": 0.8183182775974274, + "rewards/GIT": 0.60882468521595, + "rewards/HPSv2": 0.2628040313720703, + "rewards/ORM": 0.5532128810882568, + "self_certainty_semantic": -25.4375, + "self_certainty_token": -21.5625, + "step": 79 + }, + { + "completion_length": 66.0625, + "epoch": 0.08859357696567, + "grad_norm": 0.5290701389312744, + "kl": 0.00308990478515625, + "learning_rate": 9.499999999999999e-07, + "loss": -0.001018086913973093, + "reward": 1.7054139375686646, + "reward_std": 0.4478110671043396, + "rewards/GDino": 0.6419965624809265, + "rewards/GIT": 0.19029075652360916, + "rewards/HPSv2": 0.2727680206298828, + "rewards/ORM": 0.6003586649894714, + "self_certainty_semantic": -25.4375, + "self_certainty_token": -21.5, + "step": 80 + }, + { + "completion_length": 69.75, + "epoch": 0.08970099667774087, + "grad_norm": 0.530961811542511, + "kl": 0.00331878662109375, + "learning_rate": 9.493749999999999e-07, + "loss": -0.0018104221671819687, + "reward": 2.1294270157814026, + "reward_std": 0.30140096694231033, + "rewards/GDino": 0.7601194977760315, + "rewards/GIT": 0.36138176918029785, + "rewards/HPSv2": 0.27007102966308594, + "rewards/ORM": 0.7378547042608261, + "self_certainty_semantic": -25.5625, + "self_certainty_token": -20.25, + "step": 81 + }, + { + "completion_length": 62.25, + "epoch": 0.09080841638981174, + "grad_norm": 0.5380280017852783, + "kl": 0.0029449462890625, + "learning_rate": 9.487499999999999e-07, + "loss": 0.0027263425290584564, + "reward": 1.7531540989875793, + "reward_std": 0.40144187211990356, + "rewards/GDino": 0.6388830840587616, + "rewards/GIT": 0.3787819594144821, + "rewards/HPSv2": 0.26526451110839844, + "rewards/ORM": 0.4702245742082596, + "self_certainty_semantic": -25.625, + "self_certainty_token": -21.3125, + "step": 82 + }, + { + "completion_length": 57.125, + "epoch": 0.09191583610188261, + "grad_norm": 0.46656447649002075, + "kl": 0.00229644775390625, + "learning_rate": 9.481249999999999e-07, + "loss": 0.0034079640172421932, + "reward": 2.1076533794403076, + "reward_std": 0.3496774584054947, + "rewards/GDino": 0.8086712956428528, + "rewards/GIT": 0.44665491580963135, + "rewards/HPSv2": 0.2527198791503906, + "rewards/ORM": 0.5996073186397552, + "self_certainty_semantic": -25.5, + "self_certainty_token": -21.875, + "step": 83 + }, + { + "completion_length": 77.609375, + "epoch": 0.09302325581395349, + "grad_norm": 0.7098491787910461, + "kl": 0.003326416015625, + "learning_rate": 9.474999999999999e-07, + "loss": -0.015582434833049774, + "reward": 2.0792417526245117, + "reward_std": 0.405472531914711, + "rewards/GDino": 0.8217203617095947, + "rewards/GIT": 0.6337592005729675, + "rewards/HPSv2": 0.2409496307373047, + "rewards/ORM": 0.3828125, + "self_certainty_semantic": -25.4375, + "self_certainty_token": -21.25, + "step": 84 + }, + { + "completion_length": 70.0, + "epoch": 0.09413067552602436, + "grad_norm": 0.453952431678772, + "kl": 0.0030059814453125, + "learning_rate": 9.468749999999999e-07, + "loss": -0.008341801585629582, + "reward": 1.7731398940086365, + "reward_std": 0.43146421015262604, + "rewards/GDino": 0.6217962503433228, + "rewards/GIT": 0.33136892318725586, + "rewards/HPSv2": 0.2414989471435547, + "rewards/ORM": 0.5784757435321808, + "self_certainty_semantic": -25.375, + "self_certainty_token": -21.8125, + "step": 85 + }, + { + "completion_length": 55.46875, + "epoch": 0.09523809523809523, + "grad_norm": 0.6065813302993774, + "kl": 0.0029296875, + "learning_rate": 9.462499999999999e-07, + "loss": -0.004339609295129776, + "reward": 2.3409087657928467, + "reward_std": 0.33414456248283386, + "rewards/GDino": 0.843651682138443, + "rewards/GIT": 0.3478253483772278, + "rewards/HPSv2": 0.2929649353027344, + "rewards/ORM": 0.8564667999744415, + "self_certainty_semantic": -25.6875, + "self_certainty_token": -21.6875, + "step": 86 + }, + { + "completion_length": 71.796875, + "epoch": 0.09634551495016612, + "grad_norm": 0.6815423965454102, + "kl": 0.0028076171875, + "learning_rate": 9.45625e-07, + "loss": 0.004890406038612127, + "reward": 2.096968352794647, + "reward_std": 0.4522961378097534, + "rewards/GDino": 0.7090134918689728, + "rewards/GIT": 0.4619881361722946, + "rewards/HPSv2": 0.26172447204589844, + "rewards/ORM": 0.6642423272132874, + "self_certainty_semantic": -25.375, + "self_certainty_token": -21.3125, + "step": 87 + }, + { + "completion_length": 62.921875, + "epoch": 0.09745293466223699, + "grad_norm": 0.37047135829925537, + "kl": 0.00237274169921875, + "learning_rate": 9.45e-07, + "loss": -0.007989626843482256, + "reward": 2.100303888320923, + "reward_std": 0.39728429913520813, + "rewards/GDino": 0.8100375235080719, + "rewards/GIT": 0.4551214128732681, + "rewards/HPSv2": 0.2669391632080078, + "rewards/ORM": 0.5682056248188019, + "self_certainty_semantic": -25.5, + "self_certainty_token": -21.625, + "step": 88 + }, + { + "completion_length": 61.25, + "epoch": 0.09856035437430787, + "grad_norm": 0.3903006613254547, + "kl": 0.0033111572265625, + "learning_rate": 9.44375e-07, + "loss": -0.0016460134647786617, + "reward": 2.1185483932495117, + "reward_std": 0.34406720101833344, + "rewards/GDino": 0.7301153540611267, + "rewards/GIT": 0.4342738687992096, + "rewards/HPSv2": 0.25724220275878906, + "rewards/ORM": 0.6969169676303864, + "self_certainty_semantic": -25.625, + "self_certainty_token": -21.0625, + "step": 89 + }, + { + "completion_length": 64.734375, + "epoch": 0.09966777408637874, + "grad_norm": 0.6106704473495483, + "kl": 0.002532958984375, + "learning_rate": 9.4375e-07, + "loss": 0.0018994538113474846, + "reward": 2.281058669090271, + "reward_std": 0.4019897133111954, + "rewards/GDino": 0.8515625298023224, + "rewards/GIT": 0.602006196975708, + "rewards/HPSv2": 0.2570476531982422, + "rewards/ORM": 0.5704423487186432, + "self_certainty_semantic": -25.625, + "self_certainty_token": -20.875, + "step": 90 + }, + { + "completion_length": 70.625, + "epoch": 0.10077519379844961, + "grad_norm": 0.6082563996315002, + "kl": 0.0025634765625, + "learning_rate": 9.43125e-07, + "loss": -0.001378488726913929, + "reward": 1.7446696758270264, + "reward_std": 0.48222504556179047, + "rewards/GDino": 0.6369770467281342, + "rewards/GIT": 0.4495050609111786, + "rewards/HPSv2": 0.2379169464111328, + "rewards/ORM": 0.42027057707309723, + "self_certainty_semantic": -25.5, + "self_certainty_token": -21.25, + "step": 91 + }, + { + "completion_length": 69.328125, + "epoch": 0.10188261351052048, + "grad_norm": 0.3885723054409027, + "kl": 0.00247955322265625, + "learning_rate": 9.425e-07, + "loss": 0.0029599489644169807, + "reward": 1.6940485835075378, + "reward_std": 0.48791858553886414, + "rewards/GDino": 0.7451692521572113, + "rewards/GIT": 0.3888908475637436, + "rewards/HPSv2": 0.23882293701171875, + "rewards/ORM": 0.32116562128067017, + "self_certainty_semantic": -25.5, + "self_certainty_token": -20.9375, + "step": 92 + }, + { + "completion_length": 78.96875, + "epoch": 0.10299003322259136, + "grad_norm": 2.441729784011841, + "kl": 0.00281524658203125, + "learning_rate": 9.41875e-07, + "loss": 0.0027102059684693813, + "reward": 2.098644495010376, + "reward_std": 0.5861929953098297, + "rewards/GDino": 0.7753971815109253, + "rewards/GIT": 0.33432240784168243, + "rewards/HPSv2": 0.24440956115722656, + "rewards/ORM": 0.7445152401924133, + "self_certainty_semantic": -25.375, + "self_certainty_token": -21.3125, + "step": 93 + }, + { + "completion_length": 53.640625, + "epoch": 0.10409745293466224, + "grad_norm": 1.843809962272644, + "kl": 0.00298309326171875, + "learning_rate": 9.4125e-07, + "loss": -0.002976842690259218, + "reward": 2.022274136543274, + "reward_std": 0.3149227201938629, + "rewards/GDino": 0.7854060530662537, + "rewards/GIT": 0.20830318331718445, + "rewards/HPSv2": 0.2829475402832031, + "rewards/ORM": 0.7456172108650208, + "self_certainty_semantic": -25.5625, + "self_certainty_token": -21.4375, + "step": 94 + }, + { + "completion_length": 73.8125, + "epoch": 0.10520487264673312, + "grad_norm": 0.4806905686855316, + "kl": 0.0027923583984375, + "learning_rate": 9.40625e-07, + "loss": 0.0057201930321753025, + "reward": 2.5528862476348877, + "reward_std": 0.3981771767139435, + "rewards/GDino": 0.9458979666233063, + "rewards/GIT": 0.7319882810115814, + "rewards/HPSv2": 0.265625, + "rewards/ORM": 0.609375, + "self_certainty_semantic": -25.625, + "self_certainty_token": -22.375, + "step": 95 + }, + { + "completion_length": 71.578125, + "epoch": 0.10631229235880399, + "grad_norm": 1.3328330516815186, + "kl": 0.00286865234375, + "learning_rate": 9.399999999999999e-07, + "loss": 0.006992874434217811, + "reward": 2.4351861476898193, + "reward_std": 0.25794728100299835, + "rewards/GDino": 0.9020833373069763, + "rewards/GIT": 0.6907803118228912, + "rewards/HPSv2": 0.2606678009033203, + "rewards/ORM": 0.5816546380519867, + "self_certainty_semantic": -25.6875, + "self_certainty_token": -20.125, + "step": 96 + }, + { + "completion_length": 60.703125, + "epoch": 0.10741971207087486, + "grad_norm": 0.5019268989562988, + "kl": 0.003326416015625, + "learning_rate": 9.393749999999999e-07, + "loss": 0.011835527839139104, + "reward": 1.6200063824653625, + "reward_std": 0.4240207076072693, + "rewards/GDino": 0.6504360437393188, + "rewards/GIT": 0.18544349074363708, + "rewards/HPSv2": 0.2720832824707031, + "rewards/ORM": 0.5120435357093811, + "self_certainty_semantic": -25.375, + "self_certainty_token": -21.4375, + "step": 97 + }, + { + "completion_length": 68.578125, + "epoch": 0.10852713178294573, + "grad_norm": 0.38334423303604126, + "kl": 0.003143310546875, + "learning_rate": 9.387499999999999e-07, + "loss": 0.0015034456737339497, + "reward": 1.9381686449050903, + "reward_std": 0.46784070134162903, + "rewards/GDino": 0.7850436270236969, + "rewards/GIT": 0.3971538841724396, + "rewards/HPSv2": 0.2517681121826172, + "rewards/ORM": 0.5042029470205307, + "self_certainty_semantic": -25.4375, + "self_certainty_token": -22.0, + "step": 98 + }, + { + "completion_length": 72.234375, + "epoch": 0.10963455149501661, + "grad_norm": 1.5332801342010498, + "kl": 0.0026702880859375, + "learning_rate": 9.381249999999999e-07, + "loss": 0.0014210238587111235, + "reward": 2.1606199741363525, + "reward_std": 0.4609396979212761, + "rewards/GDino": 0.800000011920929, + "rewards/GIT": 0.6965132355690002, + "rewards/HPSv2": 0.2425823211669922, + "rewards/ORM": 0.4215243309736252, + "self_certainty_semantic": -25.625, + "self_certainty_token": -22.125, + "step": 99 + }, + { + "completion_length": 64.859375, + "epoch": 0.11074197120708748, + "grad_norm": 0.4810887575149536, + "kl": 0.0039520263671875, + "learning_rate": 9.374999999999999e-07, + "loss": -0.006660776911303401, + "reward": 2.0300318002700806, + "reward_std": 0.49300554394721985, + "rewards/GDino": 0.6639764606952667, + "rewards/GIT": 0.41904042661190033, + "rewards/HPSv2": 0.25483131408691406, + "rewards/ORM": 0.6921834945678711, + "self_certainty_semantic": -25.5625, + "self_certainty_token": -21.5, + "step": 100 + }, + { + "completion_length": 59.671875, + "epoch": 0.11184939091915837, + "grad_norm": 0.6347000002861023, + "kl": 0.0032196044921875, + "learning_rate": 9.368749999999999e-07, + "loss": 0.007826576009392738, + "reward": 2.343237042427063, + "reward_std": 0.29696404933929443, + "rewards/GDino": 0.8815763592720032, + "rewards/GIT": 0.5084297135472298, + "rewards/HPSv2": 0.27715301513671875, + "rewards/ORM": 0.6760779917240143, + "self_certainty_semantic": -25.8125, + "self_certainty_token": -20.4375, + "step": 101 + }, + { + "completion_length": 54.6875, + "epoch": 0.11295681063122924, + "grad_norm": 0.433162659406662, + "kl": 0.00323486328125, + "learning_rate": 9.3625e-07, + "loss": -0.0018342176917940378, + "reward": 2.244241714477539, + "reward_std": 0.3847181349992752, + "rewards/GDino": 0.7636502981185913, + "rewards/GIT": 0.5041892230510712, + "rewards/HPSv2": 0.26613616943359375, + "rewards/ORM": 0.7102660238742828, + "self_certainty_semantic": -25.375, + "self_certainty_token": -21.75, + "step": 102 + }, + { + "completion_length": 67.0, + "epoch": 0.11406423034330011, + "grad_norm": 0.4709942042827606, + "kl": 0.0036163330078125, + "learning_rate": 9.35625e-07, + "loss": -0.0053715279791504145, + "reward": 1.7866063117980957, + "reward_std": 0.48569220304489136, + "rewards/GDino": 0.6912583708763123, + "rewards/GIT": 0.2119271606206894, + "rewards/HPSv2": 0.26636314392089844, + "rewards/ORM": 0.6170576214790344, + "self_certainty_semantic": -25.4375, + "self_certainty_token": -21.6875, + "step": 103 + }, + { + "completion_length": 72.9375, + "epoch": 0.11517165005537099, + "grad_norm": 0.4063447415828705, + "kl": 0.00260162353515625, + "learning_rate": 9.35e-07, + "loss": 0.002629161812365055, + "reward": 2.2642691135406494, + "reward_std": 0.34077706933021545, + "rewards/GDino": 0.83519247174263, + "rewards/GIT": 0.5088042318820953, + "rewards/HPSv2": 0.2578144073486328, + "rewards/ORM": 0.6624580323696136, + "self_certainty_semantic": -25.5625, + "self_certainty_token": -22.25, + "step": 104 + }, + { + "completion_length": 60.078125, + "epoch": 0.11627906976744186, + "grad_norm": 0.46488699316978455, + "kl": 0.002288818359375, + "learning_rate": 9.34375e-07, + "loss": -0.003600445226766169, + "reward": 2.1485623121261597, + "reward_std": 0.4569554626941681, + "rewards/GDino": 0.7578125, + "rewards/GIT": 0.5468153655529022, + "rewards/HPSv2": 0.2572956085205078, + "rewards/ORM": 0.586638867855072, + "self_certainty_semantic": -25.5, + "self_certainty_token": -21.0, + "step": 105 + }, + { + "completion_length": 82.796875, + "epoch": 0.11738648947951273, + "grad_norm": 0.6562625765800476, + "kl": 0.00269317626953125, + "learning_rate": 9.3375e-07, + "loss": 0.006768202409148216, + "reward": 1.9783158898353577, + "reward_std": 0.1888652741909027, + "rewards/GDino": 0.7153646051883698, + "rewards/GIT": 0.5914923697710037, + "rewards/HPSv2": 0.2652587890625, + "rewards/ORM": 0.4062000662088394, + "self_certainty_semantic": -25.5, + "self_certainty_token": -21.0625, + "step": 106 + }, + { + "completion_length": 65.59375, + "epoch": 0.1184939091915836, + "grad_norm": 0.45307597517967224, + "kl": 0.003082275390625, + "learning_rate": 9.33125e-07, + "loss": 0.004376767203211784, + "reward": 2.5454152822494507, + "reward_std": 0.3043108731508255, + "rewards/GDino": 0.9536458253860474, + "rewards/GIT": 0.7616239190101624, + "rewards/HPSv2": 0.25897979736328125, + "rewards/ORM": 0.5711656212806702, + "self_certainty_semantic": -25.5, + "self_certainty_token": -20.6875, + "step": 107 + }, + { + "completion_length": 61.734375, + "epoch": 0.11960132890365449, + "grad_norm": 0.41155651211738586, + "kl": 0.0034942626953125, + "learning_rate": 9.325e-07, + "loss": 0.00791933387517929, + "reward": 2.225056529045105, + "reward_std": 0.2606152221560478, + "rewards/GDino": 0.7756550312042236, + "rewards/GIT": 0.44980524480342865, + "rewards/HPSv2": 0.2855796813964844, + "rewards/ORM": 0.7140165567398071, + "self_certainty_semantic": -25.5625, + "self_certainty_token": -20.875, + "step": 108 + }, + { + "completion_length": 62.3125, + "epoch": 0.12070874861572536, + "grad_norm": 0.5856253504753113, + "kl": 0.00328826904296875, + "learning_rate": 9.31875e-07, + "loss": -0.014065259601920843, + "reward": 2.116065502166748, + "reward_std": 0.42074093222618103, + "rewards/GDino": 0.8158511817455292, + "rewards/GIT": 0.5546791851520538, + "rewards/HPSv2": 0.26972389221191406, + "rewards/ORM": 0.4758111536502838, + "self_certainty_semantic": -25.625, + "self_certainty_token": -22.0625, + "step": 109 + }, + { + "completion_length": 53.6875, + "epoch": 0.12181616832779624, + "grad_norm": 0.47900426387786865, + "kl": 0.00299835205078125, + "learning_rate": 9.3125e-07, + "loss": 0.004598683924996294, + "reward": 2.2211345434188843, + "reward_std": 0.4559909552335739, + "rewards/GDino": 0.843098521232605, + "rewards/GIT": 0.39484143257141113, + "rewards/HPSv2": 0.23913192749023438, + "rewards/ORM": 0.7440627217292786, + "self_certainty_semantic": -25.4375, + "self_certainty_token": -22.0625, + "step": 110 + }, + { + "completion_length": 62.5625, + "epoch": 0.12292358803986711, + "grad_norm": 0.5505498051643372, + "kl": 0.00334930419921875, + "learning_rate": 9.30625e-07, + "loss": -0.009575113654136658, + "reward": 1.8931084871292114, + "reward_std": 0.3895595818758011, + "rewards/GDino": 0.6988297700881958, + "rewards/GIT": 0.34851039946079254, + "rewards/HPSv2": 0.2725067138671875, + "rewards/ORM": 0.5732617080211639, + "self_certainty_semantic": -25.4375, + "self_certainty_token": -22.1875, + "step": 111 + }, + { + "completion_length": 66.125, + "epoch": 0.12403100775193798, + "grad_norm": 0.5518302321434021, + "kl": 0.0044097900390625, + "learning_rate": 9.3e-07, + "loss": 0.001083985436707735, + "reward": 2.1159579753875732, + "reward_std": 0.3097255080938339, + "rewards/GDino": 0.7588914632797241, + "rewards/GIT": 0.3177434876561165, + "rewards/HPSv2": 0.2764263153076172, + "rewards/ORM": 0.7628966867923737, + "self_certainty_semantic": -25.5625, + "self_certainty_token": -22.125, + "step": 112 + }, + { + "completion_length": 57.203125, + "epoch": 0.12513842746400886, + "grad_norm": 0.5670230388641357, + "kl": 0.00327301025390625, + "learning_rate": 9.293749999999999e-07, + "loss": 0.013281037099659443, + "reward": 1.6267165541648865, + "reward_std": 0.36898210644721985, + "rewards/GDino": 0.6410032212734222, + "rewards/GIT": 0.2818482890725136, + "rewards/HPSv2": 0.26859092712402344, + "rewards/ORM": 0.4352741092443466, + "self_certainty_semantic": -25.3125, + "self_certainty_token": -21.25, + "step": 113 + }, + { + "completion_length": 68.296875, + "epoch": 0.12624584717607973, + "grad_norm": 0.6704270243644714, + "kl": 0.00307464599609375, + "learning_rate": 9.287499999999999e-07, + "loss": 0.00015758577501401305, + "reward": 2.3069713711738586, + "reward_std": 0.36960119009017944, + "rewards/GDino": 0.7588542103767395, + "rewards/GIT": 0.6726887226104736, + "rewards/HPSv2": 0.2751197814941406, + "rewards/ORM": 0.6003087162971497, + "self_certainty_semantic": -25.4375, + "self_certainty_token": -21.25, + "step": 114 + }, + { + "completion_length": 63.890625, + "epoch": 0.1273532668881506, + "grad_norm": 0.6844286918640137, + "kl": 0.00408935546875, + "learning_rate": 9.281249999999999e-07, + "loss": 0.0020853045862168074, + "reward": 2.1885178685188293, + "reward_std": 0.35547153651714325, + "rewards/GDino": 0.718020498752594, + "rewards/GIT": 0.5492343008518219, + "rewards/HPSv2": 0.2481842041015625, + "rewards/ORM": 0.6730788052082062, + "self_certainty_semantic": -25.4375, + "self_certainty_token": -20.75, + "step": 115 + }, + { + "completion_length": 75.21875, + "epoch": 0.12846068660022147, + "grad_norm": 0.5827351212501526, + "kl": 0.003021240234375, + "learning_rate": 9.274999999999999e-07, + "loss": 0.0005021943943575025, + "reward": 2.2085607051849365, + "reward_std": 0.391997292637825, + "rewards/GDino": 0.7475058436393738, + "rewards/GIT": 0.5436886698007584, + "rewards/HPSv2": 0.26111602783203125, + "rewards/ORM": 0.65625, + "self_certainty_semantic": -25.5, + "self_certainty_token": -20.75, + "step": 116 + }, + { + "completion_length": 66.234375, + "epoch": 0.12956810631229235, + "grad_norm": 8.78965950012207, + "kl": 0.158905029296875, + "learning_rate": 9.268749999999999e-07, + "loss": -0.0129257976077497, + "reward": 2.4095414876937866, + "reward_std": 0.2911904752254486, + "rewards/GDino": 0.8304687738418579, + "rewards/GIT": 0.6444451212882996, + "rewards/HPSv2": 0.27797698974609375, + "rewards/ORM": 0.6566506326198578, + "self_certainty_semantic": -25.625, + "self_certainty_token": -21.75, + "step": 117 + }, + { + "completion_length": 64.578125, + "epoch": 0.13067552602436322, + "grad_norm": 0.6560596823692322, + "kl": 0.00417327880859375, + "learning_rate": 9.2625e-07, + "loss": 0.0029480335651896894, + "reward": 1.8815761804580688, + "reward_std": 0.3823118060827255, + "rewards/GDino": 0.7314696907997131, + "rewards/GIT": 0.41885554790496826, + "rewards/HPSv2": 0.24540138244628906, + "rewards/ORM": 0.4858495891094208, + "self_certainty_semantic": -25.5625, + "self_certainty_token": -20.625, + "step": 118 + }, + { + "completion_length": 56.828125, + "epoch": 0.13178294573643412, + "grad_norm": 1.9917776584625244, + "kl": 0.0042877197265625, + "learning_rate": 9.25625e-07, + "loss": -0.01110410038381815, + "reward": 2.270492196083069, + "reward_std": 0.5458246767520905, + "rewards/GDino": 0.7566670179367065, + "rewards/GIT": 0.5055328160524368, + "rewards/HPSv2": 0.26803016662597656, + "rewards/ORM": 0.7402622997760773, + "self_certainty_semantic": -25.5625, + "self_certainty_token": -21.875, + "step": 119 + }, + { + "completion_length": 72.3125, + "epoch": 0.132890365448505, + "grad_norm": 0.510168194770813, + "kl": 0.00420379638671875, + "learning_rate": 9.25e-07, + "loss": -0.013864397071301937, + "reward": 1.973584771156311, + "reward_std": 0.4184395670890808, + "rewards/GDino": 0.7117854058742523, + "rewards/GIT": 0.43370192497968674, + "rewards/HPSv2": 0.26166534423828125, + "rewards/ORM": 0.5664321482181549, + "self_certainty_semantic": -25.4375, + "self_certainty_token": -21.3125, + "step": 120 + }, + { + "completion_length": 77.171875, + "epoch": 0.13399778516057587, + "grad_norm": 0.5348736643791199, + "kl": 0.00298309326171875, + "learning_rate": 9.243749999999999e-07, + "loss": 0.004201958421617746, + "reward": 1.9280533194541931, + "reward_std": 0.4291805773973465, + "rewards/GDino": 0.7109375, + "rewards/GIT": 0.38363416492938995, + "rewards/HPSv2": 0.25235748291015625, + "rewards/ORM": 0.5811240971088409, + "self_certainty_semantic": -25.4375, + "self_certainty_token": -21.0625, + "step": 121 + }, + { + "completion_length": 70.53125, + "epoch": 0.13510520487264674, + "grad_norm": 0.49879971146583557, + "kl": 0.00412750244140625, + "learning_rate": 9.237499999999999e-07, + "loss": -0.0026759039610624313, + "reward": 1.9971369504928589, + "reward_std": 0.2551337629556656, + "rewards/GDino": 0.72983318567276, + "rewards/GIT": 0.34402593970298767, + "rewards/HPSv2": 0.2877368927001953, + "rewards/ORM": 0.6355408430099487, + "self_certainty_semantic": -25.5, + "self_certainty_token": -22.125, + "step": 122 + }, + { + "completion_length": 64.640625, + "epoch": 0.1362126245847176, + "grad_norm": 0.4230790436267853, + "kl": 0.00341033935546875, + "learning_rate": 9.23125e-07, + "loss": -0.002337672747671604, + "reward": 2.0281134843826294, + "reward_std": 0.3781726509332657, + "rewards/GDino": 0.7874999940395355, + "rewards/GIT": 0.4591221511363983, + "rewards/HPSv2": 0.2555961608886719, + "rewards/ORM": 0.5258950889110565, + "self_certainty_semantic": -25.5625, + "self_certainty_token": -21.5625, + "step": 123 + }, + { + "completion_length": 59.375, + "epoch": 0.13732004429678848, + "grad_norm": 0.9666682481765747, + "kl": 0.00328826904296875, + "learning_rate": 9.225e-07, + "loss": -0.010707761626690626, + "reward": 2.219977855682373, + "reward_std": 0.396147683262825, + "rewards/GDino": 0.7934323251247406, + "rewards/GIT": 0.4874458909034729, + "rewards/HPSv2": 0.2524528503417969, + "rewards/ORM": 0.6866468489170074, + "self_certainty_semantic": -25.5, + "self_certainty_token": -21.0, + "step": 124 + }, + { + "completion_length": 67.421875, + "epoch": 0.13842746400885936, + "grad_norm": 0.4701387286186218, + "kl": 0.00374603271484375, + "learning_rate": 9.21875e-07, + "loss": -0.008014392806217074, + "reward": 2.166910171508789, + "reward_std": 0.44899792969226837, + "rewards/GDino": 0.7873771488666534, + "rewards/GIT": 0.5715728402137756, + "rewards/HPSv2": 0.25487709045410156, + "rewards/ORM": 0.5530830323696136, + "self_certainty_semantic": -25.6875, + "self_certainty_token": -20.9375, + "step": 125 + }, + { + "completion_length": 60.46875, + "epoch": 0.13953488372093023, + "grad_norm": 0.6960640549659729, + "kl": 0.0052337646484375, + "learning_rate": 9.2125e-07, + "loss": 0.005524930078536272, + "reward": 1.941537857055664, + "reward_std": 0.3068820387125015, + "rewards/GDino": 0.69914710521698, + "rewards/GIT": 0.31967807561159134, + "rewards/HPSv2": 0.26458740234375, + "rewards/ORM": 0.6581252217292786, + "self_certainty_semantic": -25.5, + "self_certainty_token": -21.0, + "step": 126 + }, + { + "completion_length": 65.90625, + "epoch": 0.1406423034330011, + "grad_norm": 0.5266240239143372, + "kl": 0.0050506591796875, + "learning_rate": 9.20625e-07, + "loss": -0.008795970119535923, + "reward": 2.2745760679244995, + "reward_std": 0.35941246151924133, + "rewards/GDino": 0.7357383072376251, + "rewards/GIT": 0.42085812985897064, + "rewards/HPSv2": 0.2789630889892578, + "rewards/ORM": 0.8390165567398071, + "self_certainty_semantic": -25.5625, + "self_certainty_token": -21.8125, + "step": 127 + }, + { + "completion_length": 62.0, + "epoch": 0.14174972314507198, + "grad_norm": 1.2693217992782593, + "kl": 0.00701904296875, + "learning_rate": 9.2e-07, + "loss": -0.013476235326379538, + "reward": 1.8667319416999817, + "reward_std": 0.5579482614994049, + "rewards/GDino": 0.6687500178813934, + "rewards/GIT": 0.240242637693882, + "rewards/HPSv2": 0.2608222961425781, + "rewards/ORM": 0.6969169527292252, + "self_certainty_semantic": -25.5625, + "self_certainty_token": -20.8125, + "step": 128 + }, + { + "completion_length": 72.53125, + "epoch": 0.14285714285714285, + "grad_norm": 1.4665846824645996, + "kl": 0.0047454833984375, + "learning_rate": 9.19375e-07, + "loss": -0.006278489250689745, + "reward": 2.076420545578003, + "reward_std": 0.36895356327295303, + "rewards/GDino": 0.739062488079071, + "rewards/GIT": 0.41109369695186615, + "rewards/HPSv2": 0.2513103485107422, + "rewards/ORM": 0.6749540567398071, + "self_certainty_semantic": -25.5625, + "self_certainty_token": -21.875, + "step": 129 + }, + { + "completion_length": 58.046875, + "epoch": 0.14396456256921372, + "grad_norm": 0.7384111285209656, + "kl": 0.00390625, + "learning_rate": 9.187499999999999e-07, + "loss": -0.0109781245701015, + "reward": 1.9833685159683228, + "reward_std": 0.39847198128700256, + "rewards/GDino": 0.7729166448116302, + "rewards/GIT": 0.4782646894454956, + "rewards/HPSv2": 0.24262619018554688, + "rewards/ORM": 0.48956090211868286, + "self_certainty_semantic": -25.625, + "self_certainty_token": -21.375, + "step": 130 + }, + { + "completion_length": 72.0, + "epoch": 0.1450719822812846, + "grad_norm": 0.46645256876945496, + "kl": 0.00476837158203125, + "learning_rate": 9.181249999999999e-07, + "loss": 0.006110590882599354, + "reward": 1.885680913925171, + "reward_std": 0.4655804932117462, + "rewards/GDino": 0.7249231338500977, + "rewards/GIT": 0.35940520465373993, + "rewards/HPSv2": 0.2583580017089844, + "rewards/ORM": 0.5429946184158325, + "self_certainty_semantic": -25.375, + "self_certainty_token": -22.0625, + "step": 131 + }, + { + "completion_length": 53.21875, + "epoch": 0.1461794019933555, + "grad_norm": 0.5023438930511475, + "kl": 0.00583648681640625, + "learning_rate": 9.174999999999999e-07, + "loss": -0.0056219237158074975, + "reward": 2.1214953660964966, + "reward_std": 0.5559927821159363, + "rewards/GDino": 0.8054038286209106, + "rewards/GIT": 0.4245864748954773, + "rewards/HPSv2": 0.2713184356689453, + "rewards/ORM": 0.6201866269111633, + "self_certainty_semantic": -25.75, + "self_certainty_token": -22.0625, + "step": 132 + }, + { + "completion_length": 75.6875, + "epoch": 0.14728682170542637, + "grad_norm": 0.6622663140296936, + "kl": 0.00439453125, + "learning_rate": 9.168749999999999e-07, + "loss": 0.009899101918563247, + "reward": 2.593212366104126, + "reward_std": 0.17419864609837532, + "rewards/GDino": 0.7739583253860474, + "rewards/GIT": 0.6746057868003845, + "rewards/HPSv2": 0.2743816375732422, + "rewards/ORM": 0.8702665567398071, + "self_certainty_semantic": -25.75, + "self_certainty_token": -21.75, + "step": 133 + }, + { + "completion_length": 68.90625, + "epoch": 0.14839424141749724, + "grad_norm": 0.41897183656692505, + "kl": 0.0034942626953125, + "learning_rate": 9.1625e-07, + "loss": 0.002212307066656649, + "reward": 1.978962779045105, + "reward_std": 0.45697829127311707, + "rewards/GDino": 0.7175242900848389, + "rewards/GIT": 0.5035496056079865, + "rewards/HPSv2": 0.24994659423828125, + "rewards/ORM": 0.5079423785209656, + "self_certainty_semantic": -25.625, + "self_certainty_token": -20.3125, + "step": 134 + }, + { + "completion_length": 62.859375, + "epoch": 0.14950166112956811, + "grad_norm": 0.5371299386024475, + "kl": 0.00482177734375, + "learning_rate": 9.15625e-07, + "loss": 0.005879509728401899, + "reward": 2.0941214561462402, + "reward_std": 0.47014716267585754, + "rewards/GDino": 0.774738609790802, + "rewards/GIT": 0.4917849898338318, + "rewards/HPSv2": 0.267425537109375, + "rewards/ORM": 0.5601723045110703, + "self_certainty_semantic": -25.6875, + "self_certainty_token": -20.1875, + "step": 135 + }, + { + "completion_length": 76.578125, + "epoch": 0.150609080841639, + "grad_norm": 0.48601874709129333, + "kl": 0.004486083984375, + "learning_rate": 9.15e-07, + "loss": -0.0003573829308152199, + "reward": 1.8426015377044678, + "reward_std": 0.2483576349914074, + "rewards/GDino": 0.684923529624939, + "rewards/GIT": 0.3237183541059494, + "rewards/HPSv2": 0.2632465362548828, + "rewards/ORM": 0.5707131624221802, + "self_certainty_semantic": -25.5, + "self_certainty_token": -21.0625, + "step": 136 + }, + { + "completion_length": 70.078125, + "epoch": 0.15171650055370986, + "grad_norm": 0.5911806225776672, + "kl": 0.0052947998046875, + "learning_rate": 9.14375e-07, + "loss": -0.008954334072768688, + "reward": 2.0952707529067993, + "reward_std": 0.42313070595264435, + "rewards/GDino": 0.7640625238418579, + "rewards/GIT": 0.5078665241599083, + "rewards/HPSv2": 0.25115394592285156, + "rewards/ORM": 0.5721877217292786, + "self_certainty_semantic": -25.6875, + "self_certainty_token": -21.4375, + "step": 137 + }, + { + "completion_length": 61.75, + "epoch": 0.15282392026578073, + "grad_norm": 0.6094731688499451, + "kl": 0.00860595703125, + "learning_rate": 9.137499999999999e-07, + "loss": -0.00691208359785378, + "reward": 1.8424771428108215, + "reward_std": 0.3106200248003006, + "rewards/GDino": 0.6280561089515686, + "rewards/GIT": 0.2153022214770317, + "rewards/HPSv2": 0.2725563049316406, + "rewards/ORM": 0.7265625, + "self_certainty_semantic": -25.4375, + "self_certainty_token": -21.0, + "step": 138 + }, + { + "completion_length": 76.21875, + "epoch": 0.1539313399778516, + "grad_norm": 0.7681946754455566, + "kl": 0.0045166015625, + "learning_rate": 9.131249999999999e-07, + "loss": 0.006304489565081894, + "reward": 2.0444042682647705, + "reward_std": 0.4021482616662979, + "rewards/GDino": 0.7844302356243134, + "rewards/GIT": 0.33466267585754395, + "rewards/HPSv2": 0.26512908935546875, + "rewards/ORM": 0.6601821780204773, + "self_certainty_semantic": -25.375, + "self_certainty_token": -22.0, + "step": 139 + }, + { + "completion_length": 64.78125, + "epoch": 0.15503875968992248, + "grad_norm": 0.404694527387619, + "kl": 0.00445556640625, + "learning_rate": 9.124999999999999e-07, + "loss": 0.0074170518782921135, + "reward": 2.199423849582672, + "reward_std": 0.3181084841489792, + "rewards/GDino": 0.8405935764312744, + "rewards/GIT": 0.5380776524543762, + "rewards/HPSv2": 0.2516937255859375, + "rewards/ORM": 0.5690587759017944, + "self_certainty_semantic": -25.5, + "self_certainty_token": -21.375, + "step": 140 + }, + { + "completion_length": 80.1875, + "epoch": 0.15614617940199335, + "grad_norm": 0.521449089050293, + "kl": 0.00370025634765625, + "learning_rate": 9.11875e-07, + "loss": 0.01646838476881385, + "reward": 2.4023600816726685, + "reward_std": 0.17732174694538116, + "rewards/GDino": 0.6875, + "rewards/GIT": 0.7328296601772308, + "rewards/HPSv2": 0.24770545959472656, + "rewards/ORM": 0.7343250513076782, + "self_certainty_semantic": -25.5625, + "self_certainty_token": -21.125, + "step": 141 + }, + { + "completion_length": 68.546875, + "epoch": 0.15725359911406422, + "grad_norm": 0.4444400370121002, + "kl": 0.006500244140625, + "learning_rate": 9.1125e-07, + "loss": -0.0020874282345175743, + "reward": 2.2395375967025757, + "reward_std": 0.37212860584259033, + "rewards/GDino": 0.7598958611488342, + "rewards/GIT": 0.5187265872955322, + "rewards/HPSv2": 0.2597951889038086, + "rewards/ORM": 0.7011198401451111, + "self_certainty_semantic": -25.625, + "self_certainty_token": -21.0625, + "step": 142 + }, + { + "completion_length": 60.453125, + "epoch": 0.1583610188261351, + "grad_norm": 0.5732141137123108, + "kl": 0.006134033203125, + "learning_rate": 9.10625e-07, + "loss": -0.0019202656112611294, + "reward": 1.9194607138633728, + "reward_std": 0.5088343024253845, + "rewards/GDino": 0.705212414264679, + "rewards/GIT": 0.3693596422672272, + "rewards/HPSv2": 0.2593517303466797, + "rewards/ORM": 0.5855368673801422, + "self_certainty_semantic": -25.5, + "self_certainty_token": -21.125, + "step": 143 + }, + { + "completion_length": 69.53125, + "epoch": 0.15946843853820597, + "grad_norm": 0.5136631727218628, + "kl": 0.00463104248046875, + "learning_rate": 9.1e-07, + "loss": -0.0024181478656828403, + "reward": 2.1130378246307373, + "reward_std": 0.3436143696308136, + "rewards/GDino": 0.6970658600330353, + "rewards/GIT": 0.5147460252046585, + "rewards/HPSv2": 0.2531890869140625, + "rewards/ORM": 0.6480368673801422, + "self_certainty_semantic": -25.625, + "self_certainty_token": -21.3125, + "step": 144 + }, + { + "completion_length": 63.234375, + "epoch": 0.16057585825027684, + "grad_norm": 0.425749808549881, + "kl": 0.0057220458984375, + "learning_rate": 9.09375e-07, + "loss": 0.0033237107563763857, + "reward": 1.907556176185608, + "reward_std": 0.3990510255098343, + "rewards/GDino": 0.7011643946170807, + "rewards/GIT": 0.3098641186952591, + "rewards/HPSv2": 0.28241920471191406, + "rewards/ORM": 0.6141084432601929, + "self_certainty_semantic": -25.5625, + "self_certainty_token": -22.5625, + "step": 145 + }, + { + "completion_length": 62.953125, + "epoch": 0.16168327796234774, + "grad_norm": 0.5104310512542725, + "kl": 0.0064239501953125, + "learning_rate": 9.087499999999999e-07, + "loss": 0.010284929594490677, + "reward": 2.080387771129608, + "reward_std": 0.4294509291648865, + "rewards/GDino": 0.8376201391220093, + "rewards/GIT": 0.3540365919470787, + "rewards/HPSv2": 0.27114295959472656, + "rewards/ORM": 0.6175881326198578, + "self_certainty_semantic": -25.625, + "self_certainty_token": -21.5, + "step": 146 + }, + { + "completion_length": 67.625, + "epoch": 0.16279069767441862, + "grad_norm": 0.5227380394935608, + "kl": 0.0070343017578125, + "learning_rate": 9.081249999999999e-07, + "loss": 0.003552068490535021, + "reward": 1.605971097946167, + "reward_std": 0.3158091753721237, + "rewards/GDino": 0.6382401585578918, + "rewards/GIT": 0.19080179929733276, + "rewards/HPSv2": 0.25063323974609375, + "rewards/ORM": 0.5262957215309143, + "self_certainty_semantic": -25.375, + "self_certainty_token": -22.9375, + "step": 147 + }, + { + "completion_length": 69.53125, + "epoch": 0.1638981173864895, + "grad_norm": 0.5913640260696411, + "kl": 0.008758544921875, + "learning_rate": 9.074999999999999e-07, + "loss": 0.0023775382433086634, + "reward": 2.265665352344513, + "reward_std": 0.3249353617429733, + "rewards/GDino": 0.8458716571331024, + "rewards/GIT": 0.38859403878450394, + "rewards/HPSv2": 0.27611541748046875, + "rewards/ORM": 0.7550841569900513, + "self_certainty_semantic": -25.9375, + "self_certainty_token": -21.625, + "step": 148 + }, + { + "completion_length": 69.390625, + "epoch": 0.16500553709856036, + "grad_norm": 0.6509791016578674, + "kl": 0.0075836181640625, + "learning_rate": 9.068749999999999e-07, + "loss": -0.010468412889167666, + "reward": 2.1014277935028076, + "reward_std": 0.29370661079883575, + "rewards/GDino": 0.7491666674613953, + "rewards/GIT": 0.3259096145629883, + "rewards/HPSv2": 0.2623310089111328, + "rewards/ORM": 0.7640205323696136, + "self_certainty_semantic": -25.75, + "self_certainty_token": -22.0, + "step": 149 + }, + { + "completion_length": 64.046875, + "epoch": 0.16611295681063123, + "grad_norm": 2.004599094390869, + "kl": 0.01568603515625, + "learning_rate": 9.0625e-07, + "loss": -0.003110084217041731, + "reward": 2.0497288703918457, + "reward_std": 0.46643751859664917, + "rewards/GDino": 0.7837072014808655, + "rewards/GIT": 0.31941479444503784, + "rewards/HPSv2": 0.2623157501220703, + "rewards/ORM": 0.6842910945415497, + "self_certainty_semantic": -25.5, + "self_certainty_token": -21.875, + "step": 150 + }, + { + "completion_length": 68.875, + "epoch": 0.1672203765227021, + "grad_norm": 1.2010647058486938, + "kl": 0.0079498291015625, + "learning_rate": 9.05625e-07, + "loss": 0.0036378083750605583, + "reward": 2.19494891166687, + "reward_std": 0.5349652469158173, + "rewards/GDino": 0.7948823869228363, + "rewards/GIT": 0.3874897435307503, + "rewards/HPSv2": 0.2666778564453125, + "rewards/ORM": 0.7458988428115845, + "self_certainty_semantic": -25.75, + "self_certainty_token": -21.75, + "step": 151 + }, + { + "completion_length": 65.46875, + "epoch": 0.16832779623477298, + "grad_norm": 0.4594494700431824, + "kl": 0.0051727294921875, + "learning_rate": 9.05e-07, + "loss": 0.0013301910366863012, + "reward": 2.1984575986862183, + "reward_std": 0.2301565483212471, + "rewards/GDino": 0.8368903398513794, + "rewards/GIT": 0.4207738786935806, + "rewards/HPSv2": 0.27980995178222656, + "rewards/ORM": 0.6609834432601929, + "self_certainty_semantic": -25.6875, + "self_certainty_token": -21.3125, + "step": 152 + }, + { + "completion_length": 60.875, + "epoch": 0.16943521594684385, + "grad_norm": 0.584158182144165, + "kl": 0.006622314453125, + "learning_rate": 9.04375e-07, + "loss": -0.006514292559586465, + "reward": 2.2534468173980713, + "reward_std": 0.3471103012561798, + "rewards/GDino": 0.7832907140254974, + "rewards/GIT": 0.6241410374641418, + "rewards/HPSv2": 0.2647590637207031, + "rewards/ORM": 0.5812558829784393, + "self_certainty_semantic": -25.625, + "self_certainty_token": -21.8125, + "step": 153 + }, + { + "completion_length": 71.234375, + "epoch": 0.17054263565891473, + "grad_norm": 0.3877808153629303, + "kl": 0.0067138671875, + "learning_rate": 9.0375e-07, + "loss": -0.00840937439352274, + "reward": 1.5600855946540833, + "reward_std": 0.1888522505760193, + "rewards/GDino": 0.6892416477203369, + "rewards/GIT": 0.1894538253545761, + "rewards/HPSv2": 0.26103973388671875, + "rewards/ORM": 0.42035043239593506, + "self_certainty_semantic": -25.4375, + "self_certainty_token": -23.125, + "step": 154 + }, + { + "completion_length": 75.15625, + "epoch": 0.1716500553709856, + "grad_norm": 0.48354580998420715, + "kl": 0.0096588134765625, + "learning_rate": 9.031249999999999e-07, + "loss": 0.019050699658691883, + "reward": 2.116607189178467, + "reward_std": 0.290459081530571, + "rewards/GDino": 0.6718750298023224, + "rewards/GIT": 0.4389065280556679, + "rewards/HPSv2": 0.26484203338623047, + "rewards/ORM": 0.7409836649894714, + "self_certainty_semantic": -25.625, + "self_certainty_token": -20.625, + "step": 155 + }, + { + "completion_length": 70.28125, + "epoch": 0.17275747508305647, + "grad_norm": 0.48019152879714966, + "kl": 0.00909423828125, + "learning_rate": 9.024999999999999e-07, + "loss": -0.006820322363637388, + "reward": 1.7913519144058228, + "reward_std": 0.4075485020875931, + "rewards/GDino": 0.6470568478107452, + "rewards/GIT": 0.21577580273151398, + "rewards/HPSv2": 0.2772235870361328, + "rewards/ORM": 0.6512957215309143, + "self_certainty_semantic": -25.5625, + "self_certainty_token": -21.125, + "step": 156 + }, + { + "completion_length": 79.765625, + "epoch": 0.17386489479512734, + "grad_norm": 0.4524085223674774, + "kl": 0.0062713623046875, + "learning_rate": 9.018749999999999e-07, + "loss": -0.008496122900396585, + "reward": 2.5269054174423218, + "reward_std": 0.3125455528497696, + "rewards/GDino": 0.8450000286102295, + "rewards/GIT": 0.7050136923789978, + "rewards/HPSv2": 0.24599647521972656, + "rewards/ORM": 0.7308953106403351, + "self_certainty_semantic": -25.5625, + "self_certainty_token": -22.1875, + "step": 157 + }, + { + "completion_length": 64.5, + "epoch": 0.17497231450719822, + "grad_norm": 0.43005651235580444, + "kl": 0.009552001953125, + "learning_rate": 9.0125e-07, + "loss": 0.005564866121858358, + "reward": 2.3001022338867188, + "reward_std": 0.2847408503293991, + "rewards/GDino": 0.8344532251358032, + "rewards/GIT": 0.420885294675827, + "rewards/HPSv2": 0.27132606506347656, + "rewards/ORM": 0.7734375298023224, + "self_certainty_semantic": -25.625, + "self_certainty_token": -21.0, + "step": 158 + }, + { + "completion_length": 70.171875, + "epoch": 0.1760797342192691, + "grad_norm": 0.6674854159355164, + "kl": 0.009185791015625, + "learning_rate": 9.00625e-07, + "loss": 0.001701198983937502, + "reward": 2.222777843475342, + "reward_std": 0.4929357320070267, + "rewards/GDino": 0.7640625238418579, + "rewards/GIT": 0.48828309774398804, + "rewards/HPSv2": 0.2673072814941406, + "rewards/ORM": 0.703125, + "self_certainty_semantic": -25.5, + "self_certainty_token": -20.6875, + "step": 159 + }, + { + "completion_length": 64.421875, + "epoch": 0.17718715393134, + "grad_norm": 0.4401383399963379, + "kl": 0.008697509765625, + "learning_rate": 9e-07, + "loss": 0.0025870297104120255, + "reward": 1.7824512124061584, + "reward_std": 0.44338105618953705, + "rewards/GDino": 0.7084426283836365, + "rewards/GIT": 0.286900594830513, + "rewards/HPSv2": 0.2784423828125, + "rewards/ORM": 0.5086656212806702, + "self_certainty_semantic": -25.5625, + "self_certainty_token": -22.0, + "step": 160 + }, + { + "completion_length": 69.328125, + "epoch": 0.17829457364341086, + "grad_norm": 0.6274824142456055, + "kl": 0.008209228515625, + "learning_rate": 8.99375e-07, + "loss": 0.006771775893867016, + "reward": 2.080656409263611, + "reward_std": 0.4039708971977234, + "rewards/GDino": 0.7284385859966278, + "rewards/GIT": 0.4118357300758362, + "rewards/HPSv2": 0.2606945037841797, + "rewards/ORM": 0.6796875, + "self_certainty_semantic": -25.5, + "self_certainty_token": -21.4375, + "step": 161 + }, + { + "completion_length": 87.765625, + "epoch": 0.17940199335548174, + "grad_norm": 0.713962972164154, + "kl": 0.00885009765625, + "learning_rate": 8.9875e-07, + "loss": 0.001781372120603919, + "reward": 2.2108030319213867, + "reward_std": 0.23567625507712364, + "rewards/GDino": 0.9036458432674408, + "rewards/GIT": 0.5173117220401764, + "rewards/HPSv2": 0.2613239288330078, + "rewards/ORM": 0.5285216420888901, + "self_certainty_semantic": -25.8125, + "self_certainty_token": -21.875, + "step": 162 + }, + { + "completion_length": 65.375, + "epoch": 0.1805094130675526, + "grad_norm": 0.45745736360549927, + "kl": 0.010772705078125, + "learning_rate": 8.981249999999999e-07, + "loss": -0.001884209574200213, + "reward": 2.169035792350769, + "reward_std": 0.27702826261520386, + "rewards/GDino": 0.7578125, + "rewards/GIT": 0.6291100382804871, + "rewards/HPSv2": 0.24835586547851562, + "rewards/ORM": 0.5337574481964111, + "self_certainty_semantic": -25.625, + "self_certainty_token": -20.875, + "step": 163 + }, + { + "completion_length": 66.15625, + "epoch": 0.18161683277962348, + "grad_norm": 0.4001372456550598, + "kl": 0.011199951171875, + "learning_rate": 8.974999999999999e-07, + "loss": -0.004290862008929253, + "reward": 2.6795451641082764, + "reward_std": 0.3354812413454056, + "rewards/GDino": 0.8685008883476257, + "rewards/GIT": 0.7786318361759186, + "rewards/HPSv2": 0.27187156677246094, + "rewards/ORM": 0.7605409026145935, + "self_certainty_semantic": -25.5625, + "self_certainty_token": -20.625, + "step": 164 + }, + { + "completion_length": 71.453125, + "epoch": 0.18272425249169436, + "grad_norm": 0.6596059799194336, + "kl": 0.00909423828125, + "learning_rate": 8.96875e-07, + "loss": -0.0067337434738874435, + "reward": 2.3466144800186157, + "reward_std": 0.29852450639009476, + "rewards/GDino": 0.8130539357662201, + "rewards/GIT": 0.49434708058834076, + "rewards/HPSv2": 0.2721138000488281, + "rewards/ORM": 0.7670996189117432, + "self_certainty_semantic": -25.5625, + "self_certainty_token": -22.875, + "step": 165 + }, + { + "completion_length": 79.859375, + "epoch": 0.18383167220376523, + "grad_norm": 0.41807329654693604, + "kl": 0.01123046875, + "learning_rate": 8.9625e-07, + "loss": 0.010698896832764149, + "reward": 2.1671139001846313, + "reward_std": 0.37620842456817627, + "rewards/GDino": 0.7225366532802582, + "rewards/GIT": 0.46812044084072113, + "rewards/HPSv2": 0.2448101043701172, + "rewards/ORM": 0.7316466867923737, + "self_certainty_semantic": -25.5, + "self_certainty_token": -21.3125, + "step": 166 + }, + { + "completion_length": 68.921875, + "epoch": 0.1849390919158361, + "grad_norm": 0.4884219467639923, + "kl": 0.010955810546875, + "learning_rate": 8.95625e-07, + "loss": 0.0020176093094050884, + "reward": 1.979174256324768, + "reward_std": 0.43148648738861084, + "rewards/GDino": 0.7630714476108551, + "rewards/GIT": 0.49030545353889465, + "rewards/HPSv2": 0.2582511901855469, + "rewards/ORM": 0.46754617989063263, + "self_certainty_semantic": -25.5625, + "self_certainty_token": -22.0, + "step": 167 + }, + { + "completion_length": 76.453125, + "epoch": 0.18604651162790697, + "grad_norm": 0.4840864837169647, + "kl": 0.00423431396484375, + "learning_rate": 8.95e-07, + "loss": -0.0033226923551410437, + "reward": 2.049097418785095, + "reward_std": 0.2925217002630234, + "rewards/GDino": 0.7759547531604767, + "rewards/GIT": 0.5475737899541855, + "rewards/HPSv2": 0.25574493408203125, + "rewards/ORM": 0.4698239266872406, + "self_certainty_semantic": -25.5, + "self_certainty_token": -21.4375, + "step": 168 + }, + { + "completion_length": 70.6875, + "epoch": 0.18715393133997785, + "grad_norm": 0.6547427773475647, + "kl": 0.0087890625, + "learning_rate": 8.94375e-07, + "loss": -0.00017379922792315483, + "reward": 2.19344425201416, + "reward_std": 0.3008778989315033, + "rewards/GDino": 0.8275851011276245, + "rewards/GIT": 0.45398683845996857, + "rewards/HPSv2": 0.2814655303955078, + "rewards/ORM": 0.6304067671298981, + "self_certainty_semantic": -25.75, + "self_certainty_token": -21.625, + "step": 169 + }, + { + "completion_length": 82.03125, + "epoch": 0.18826135105204872, + "grad_norm": 0.5040526390075684, + "kl": 0.0142364501953125, + "learning_rate": 8.9375e-07, + "loss": -0.007077913731336594, + "reward": 2.0542516708374023, + "reward_std": 0.3690732419490814, + "rewards/GDino": 0.7519437670707703, + "rewards/GIT": 0.40589363873004913, + "rewards/HPSv2": 0.2560100555419922, + "rewards/ORM": 0.6404041647911072, + "self_certainty_semantic": -25.5, + "self_certainty_token": -21.0, + "step": 170 + }, + { + "completion_length": 64.203125, + "epoch": 0.1893687707641196, + "grad_norm": 0.4935157299041748, + "kl": 0.012420654296875, + "learning_rate": 8.931249999999999e-07, + "loss": 0.0035545220598578453, + "reward": 2.274348735809326, + "reward_std": 0.2875422090291977, + "rewards/GDino": 0.7699261903762817, + "rewards/GIT": 0.5473942309617996, + "rewards/HPSv2": 0.2648448944091797, + "rewards/ORM": 0.6921834945678711, + "self_certainty_semantic": -25.5625, + "self_certainty_token": -21.0, + "step": 171 + }, + { + "completion_length": 74.953125, + "epoch": 0.19047619047619047, + "grad_norm": 0.4935402274131775, + "kl": 0.0087738037109375, + "learning_rate": 8.924999999999999e-07, + "loss": 0.004996137693524361, + "reward": 1.6501405239105225, + "reward_std": 0.3322151154279709, + "rewards/GDino": 0.5804118067026138, + "rewards/GIT": 0.419575035572052, + "rewards/HPSv2": 0.25256919860839844, + "rewards/ORM": 0.39758437871932983, + "self_certainty_semantic": -25.5, + "self_certainty_token": -21.5625, + "step": 172 + }, + { + "completion_length": 63.296875, + "epoch": 0.19158361018826134, + "grad_norm": 1.0840739011764526, + "kl": 0.0174560546875, + "learning_rate": 8.918749999999999e-07, + "loss": 0.0033964416943490505, + "reward": 2.1245768666267395, + "reward_std": 0.29341885447502136, + "rewards/GDino": 0.8359375298023224, + "rewards/GIT": 0.3758692592382431, + "rewards/HPSv2": 0.2845611572265625, + "rewards/ORM": 0.6282089054584503, + "self_certainty_semantic": -25.6875, + "self_certainty_token": -22.0, + "step": 173 + }, + { + "completion_length": 80.53125, + "epoch": 0.19269102990033224, + "grad_norm": 0.4756031036376953, + "kl": 0.0066070556640625, + "learning_rate": 8.912499999999999e-07, + "loss": -0.001147494971519336, + "reward": 2.2244513034820557, + "reward_std": 0.3234108239412308, + "rewards/GDino": 0.7939131259918213, + "rewards/GIT": 0.5430482923984528, + "rewards/HPSv2": 0.2594108581542969, + "rewards/ORM": 0.6280790567398071, + "self_certainty_semantic": -25.5625, + "self_certainty_token": -21.125, + "step": 174 + }, + { + "completion_length": 63.796875, + "epoch": 0.1937984496124031, + "grad_norm": 0.8507784605026245, + "kl": 0.01806640625, + "learning_rate": 8.906249999999999e-07, + "loss": -0.0049158919136971235, + "reward": 2.211203694343567, + "reward_std": 0.30844441056251526, + "rewards/GDino": 0.7877604365348816, + "rewards/GIT": 0.5168893337249756, + "rewards/HPSv2": 0.2628498077392578, + "rewards/ORM": 0.6437040567398071, + "self_certainty_semantic": -25.6875, + "self_certainty_token": -21.125, + "step": 175 + }, + { + "completion_length": 85.40625, + "epoch": 0.19490586932447398, + "grad_norm": 0.4818137586116791, + "kl": 0.00640869140625, + "learning_rate": 8.9e-07, + "loss": -0.0028424898628145456, + "reward": 1.9287346601486206, + "reward_std": 0.36689065396785736, + "rewards/GDino": 0.7782090902328491, + "rewards/GIT": 0.4271218478679657, + "rewards/HPSv2": 0.262115478515625, + "rewards/ORM": 0.461288183927536, + "self_certainty_semantic": -25.625, + "self_certainty_token": -20.75, + "step": 176 + }, + { + "completion_length": 74.65625, + "epoch": 0.19601328903654486, + "grad_norm": 0.5553709864616394, + "kl": 0.014068603515625, + "learning_rate": 8.89375e-07, + "loss": -0.00260241178330034, + "reward": 2.277731418609619, + "reward_std": 0.36928629875183105, + "rewards/GDino": 0.7465280592441559, + "rewards/GIT": 0.4939851015806198, + "rewards/HPSv2": 0.2715930938720703, + "rewards/ORM": 0.765625, + "self_certainty_semantic": -25.875, + "self_certainty_token": -21.25, + "step": 177 + }, + { + "completion_length": 77.53125, + "epoch": 0.19712070874861573, + "grad_norm": 0.812800407409668, + "kl": 0.0077972412109375, + "learning_rate": 8.8875e-07, + "loss": -0.007587546017020941, + "reward": 2.0915766954421997, + "reward_std": 0.39137691259384155, + "rewards/GDino": 0.745751827955246, + "rewards/GIT": 0.40190117061138153, + "rewards/HPSv2": 0.2661113739013672, + "rewards/ORM": 0.6778122782707214, + "self_certainty_semantic": -25.6875, + "self_certainty_token": -21.25, + "step": 178 + }, + { + "completion_length": 64.765625, + "epoch": 0.1982281284606866, + "grad_norm": 0.8705865740776062, + "kl": 0.01080322265625, + "learning_rate": 8.88125e-07, + "loss": -0.00909736379981041, + "reward": 2.4661701917648315, + "reward_std": 0.1972077488899231, + "rewards/GDino": 0.8959279954433441, + "rewards/GIT": 0.5798787474632263, + "rewards/HPSv2": 0.2825050354003906, + "rewards/ORM": 0.7078584432601929, + "self_certainty_semantic": -25.5, + "self_certainty_token": -22.4375, + "step": 179 + }, + { + "completion_length": 75.1875, + "epoch": 0.19933554817275748, + "grad_norm": 1.3513967990875244, + "kl": 0.0105743408203125, + "learning_rate": 8.874999999999999e-07, + "loss": 0.023300296626985073, + "reward": 1.805686593055725, + "reward_std": 0.4569002389907837, + "rewards/GDino": 0.748356282711029, + "rewards/GIT": 0.34142881631851196, + "rewards/HPSv2": 0.2596473693847656, + "rewards/ORM": 0.4562540054321289, + "self_certainty_semantic": -25.6875, + "self_certainty_token": -21.5625, + "step": 180 + }, + { + "completion_length": 74.171875, + "epoch": 0.20044296788482835, + "grad_norm": 0.49861499667167664, + "kl": 0.00799560546875, + "learning_rate": 8.86875e-07, + "loss": 0.005896527087315917, + "reward": 1.8344124555587769, + "reward_std": 0.33161167800426483, + "rewards/GDino": 0.6484833061695099, + "rewards/GIT": 0.3188634589314461, + "rewards/HPSv2": 0.2792530059814453, + "rewards/ORM": 0.587812751531601, + "self_certainty_semantic": -25.625, + "self_certainty_token": -21.75, + "step": 181 + }, + { + "completion_length": 66.53125, + "epoch": 0.20155038759689922, + "grad_norm": 0.518588125705719, + "kl": 0.021148681640625, + "learning_rate": 8.8625e-07, + "loss": -0.0032154046930372715, + "reward": 1.6775782704353333, + "reward_std": 0.4542950987815857, + "rewards/GDino": 0.6909389793872833, + "rewards/GIT": 0.31735002249479294, + "rewards/HPSv2": 0.27741050720214844, + "rewards/ORM": 0.39187873899936676, + "self_certainty_semantic": -25.6875, + "self_certainty_token": -22.0625, + "step": 182 + }, + { + "completion_length": 83.171875, + "epoch": 0.2026578073089701, + "grad_norm": 0.4635794758796692, + "kl": 0.015838623046875, + "learning_rate": 8.85625e-07, + "loss": 0.006844737799838185, + "reward": 1.8692994713783264, + "reward_std": 0.3296326994895935, + "rewards/GDino": 0.7293833494186401, + "rewards/GIT": 0.34990622848272324, + "rewards/HPSv2": 0.2678260803222656, + "rewards/ORM": 0.5221837162971497, + "self_certainty_semantic": -25.6875, + "self_certainty_token": -21.4375, + "step": 183 + }, + { + "completion_length": 63.4375, + "epoch": 0.20376522702104097, + "grad_norm": 0.5085333585739136, + "kl": 0.0120849609375, + "learning_rate": 8.85e-07, + "loss": -0.0026784827932715416, + "reward": 2.799358606338501, + "reward_std": 0.1885242909193039, + "rewards/GDino": 0.925000011920929, + "rewards/GIT": 0.7545149028301239, + "rewards/HPSv2": 0.26367759704589844, + "rewards/ORM": 0.8561660945415497, + "self_certainty_semantic": -25.5, + "self_certainty_token": -21.1875, + "step": 184 + }, + { + "completion_length": 65.859375, + "epoch": 0.20487264673311184, + "grad_norm": 0.5494704842567444, + "kl": 0.013671875, + "learning_rate": 8.84375e-07, + "loss": -0.003346539626363665, + "reward": 2.0845471620559692, + "reward_std": 0.5152666121721268, + "rewards/GDino": 0.7945332229137421, + "rewards/GIT": 0.2876994013786316, + "rewards/HPSv2": 0.27262306213378906, + "rewards/ORM": 0.7296914756298065, + "self_certainty_semantic": -25.6875, + "self_certainty_token": -22.125, + "step": 185 + }, + { + "completion_length": 71.6875, + "epoch": 0.2059800664451827, + "grad_norm": 0.5301854014396667, + "kl": 0.011871337890625, + "learning_rate": 8.8375e-07, + "loss": -0.0013000170001760125, + "reward": 2.0686882734298706, + "reward_std": 0.40786902606487274, + "rewards/GDino": 0.6654029488563538, + "rewards/GIT": 0.3254973590373993, + "rewards/HPSv2": 0.240997314453125, + "rewards/ORM": 0.8367905914783478, + "self_certainty_semantic": -25.625, + "self_certainty_token": -21.0625, + "step": 186 + }, + { + "completion_length": 76.890625, + "epoch": 0.2070874861572536, + "grad_norm": 0.4597737789154053, + "kl": 0.011993408203125, + "learning_rate": 8.83125e-07, + "loss": 0.016351854777894914, + "reward": 2.200950801372528, + "reward_std": 0.35277409851551056, + "rewards/GDino": 0.7939618229866028, + "rewards/GIT": 0.5313694775104523, + "rewards/HPSv2": 0.26030731201171875, + "rewards/ORM": 0.615312248468399, + "self_certainty_semantic": -25.5625, + "self_certainty_token": -21.25, + "step": 187 + }, + { + "completion_length": 65.65625, + "epoch": 0.2081949058693245, + "grad_norm": 0.5319734811782837, + "kl": 0.010162353515625, + "learning_rate": 8.824999999999999e-07, + "loss": 0.00020685815252363682, + "reward": 2.099229574203491, + "reward_std": 0.360196590423584, + "rewards/GDino": 0.7534400224685669, + "rewards/GIT": 0.27092792093753815, + "rewards/HPSv2": 0.2623615264892578, + "rewards/ORM": 0.8125, + "self_certainty_semantic": -25.6875, + "self_certainty_token": -21.625, + "step": 188 + }, + { + "completion_length": 71.703125, + "epoch": 0.20930232558139536, + "grad_norm": 0.7321242690086365, + "kl": 0.0094451904296875, + "learning_rate": 8.818749999999999e-07, + "loss": -0.004028161056339741, + "reward": 2.337135910987854, + "reward_std": 0.31387007236480713, + "rewards/GDino": 0.7773648500442505, + "rewards/GIT": 0.5682414174079895, + "rewards/HPSv2": 0.27951812744140625, + "rewards/ORM": 0.7120114862918854, + "self_certainty_semantic": -25.75, + "self_certainty_token": -21.25, + "step": 189 + }, + { + "completion_length": 70.203125, + "epoch": 0.21040974529346623, + "grad_norm": 1.9930344820022583, + "kl": 0.0136566162109375, + "learning_rate": 8.812499999999999e-07, + "loss": 0.008943180087953806, + "reward": 2.5060739517211914, + "reward_std": 0.16241375356912613, + "rewards/GDino": 0.9254540205001831, + "rewards/GIT": 0.454538494348526, + "rewards/HPSv2": 0.2667064666748047, + "rewards/ORM": 0.859375, + "self_certainty_semantic": -25.6875, + "self_certainty_token": -21.75, + "step": 190 + }, + { + "completion_length": 64.796875, + "epoch": 0.2115171650055371, + "grad_norm": 0.4348452091217041, + "kl": 0.007415771484375, + "learning_rate": 8.806249999999999e-07, + "loss": -0.006945850793272257, + "reward": 2.5402393341064453, + "reward_std": 0.2529807686805725, + "rewards/GDino": 0.8751335144042969, + "rewards/GIT": 0.6033133119344711, + "rewards/HPSv2": 0.27858734130859375, + "rewards/ORM": 0.7832051813602448, + "self_certainty_semantic": -25.625, + "self_certainty_token": -21.5625, + "step": 191 + }, + { + "completion_length": 89.921875, + "epoch": 0.21262458471760798, + "grad_norm": 0.7680485248565674, + "kl": 0.012481689453125, + "learning_rate": 8.799999999999999e-07, + "loss": 0.005377613822929561, + "reward": 1.8802450299263, + "reward_std": 0.3106888607144356, + "rewards/GDino": 0.6456713378429413, + "rewards/GIT": 0.4135439097881317, + "rewards/HPSv2": 0.2503166198730469, + "rewards/ORM": 0.5707131326198578, + "self_certainty_semantic": -25.5625, + "self_certainty_token": -21.125, + "step": 192 + }, + { + "completion_length": 69.78125, + "epoch": 0.21373200442967885, + "grad_norm": 0.5264883637428284, + "kl": 0.010955810546875, + "learning_rate": 8.793749999999999e-07, + "loss": 0.008317717118188739, + "reward": 1.861718237400055, + "reward_std": 0.4164891242980957, + "rewards/GDino": 0.7109375596046448, + "rewards/GIT": 0.21486494690179825, + "rewards/HPSv2": 0.2839984893798828, + "rewards/ORM": 0.6519171893596649, + "self_certainty_semantic": -25.8125, + "self_certainty_token": -22.1875, + "step": 193 + }, + { + "completion_length": 74.75, + "epoch": 0.21483942414174972, + "grad_norm": 0.5414590835571289, + "kl": 0.0074462890625, + "learning_rate": 8.7875e-07, + "loss": -0.0021489104256033897, + "reward": 1.963248074054718, + "reward_std": 0.4292799085378647, + "rewards/GDino": 0.8057583570480347, + "rewards/GIT": 0.5115346312522888, + "rewards/HPSv2": 0.26822662353515625, + "rewards/ORM": 0.37772834300994873, + "self_certainty_semantic": -25.8125, + "self_certainty_token": -21.25, + "step": 194 + }, + { + "completion_length": 68.203125, + "epoch": 0.2159468438538206, + "grad_norm": 0.45540449023246765, + "kl": 0.01312255859375, + "learning_rate": 8.78125e-07, + "loss": -0.004703107755631208, + "reward": 2.011273205280304, + "reward_std": 0.4216621667146683, + "rewards/GDino": 0.7242187261581421, + "rewards/GIT": 0.5994383990764618, + "rewards/HPSv2": 0.27542877197265625, + "rewards/ORM": 0.41218727827072144, + "self_certainty_semantic": -25.6875, + "self_certainty_token": -21.6875, + "step": 195 + }, + { + "completion_length": 79.5, + "epoch": 0.21705426356589147, + "grad_norm": 0.5480747818946838, + "kl": 0.007293701171875, + "learning_rate": 8.774999999999999e-07, + "loss": -0.001077285036444664, + "reward": 2.287221312522888, + "reward_std": 0.3154482841491699, + "rewards/GDino": 0.7235225439071655, + "rewards/GIT": 0.5517593622207642, + "rewards/HPSv2": 0.2792186737060547, + "rewards/ORM": 0.7327205836772919, + "self_certainty_semantic": -25.6875, + "self_certainty_token": -21.6875, + "step": 196 + }, + { + "completion_length": 67.34375, + "epoch": 0.21816168327796234, + "grad_norm": 0.648148238658905, + "kl": 0.01416015625, + "learning_rate": 8.76875e-07, + "loss": 0.0010744737228378654, + "reward": 2.3249343037605286, + "reward_std": 0.40621738135814667, + "rewards/GDino": 0.7385416626930237, + "rewards/GIT": 0.4809828922152519, + "rewards/HPSv2": 0.2538471221923828, + "rewards/ORM": 0.8515625, + "self_certainty_semantic": -25.75, + "self_certainty_token": -20.625, + "step": 197 + }, + { + "completion_length": 74.59375, + "epoch": 0.21926910299003322, + "grad_norm": 0.978819727897644, + "kl": 0.01177978515625, + "learning_rate": 8.7625e-07, + "loss": 0.004215072840452194, + "reward": 2.1429388523101807, + "reward_std": 0.3008539155125618, + "rewards/GDino": 0.8473958671092987, + "rewards/GIT": 0.5675143599510193, + "rewards/HPSv2": 0.2627582550048828, + "rewards/ORM": 0.4652703106403351, + "self_certainty_semantic": -25.625, + "self_certainty_token": -21.4375, + "step": 198 + }, + { + "completion_length": 65.515625, + "epoch": 0.2203765227021041, + "grad_norm": 0.6454822421073914, + "kl": 0.01220703125, + "learning_rate": 8.75625e-07, + "loss": -0.0005628032376989722, + "reward": 2.50363028049469, + "reward_std": 0.3133077025413513, + "rewards/GDino": 0.8082683682441711, + "rewards/GIT": 0.6633397042751312, + "rewards/HPSv2": 0.2600593566894531, + "rewards/ORM": 0.7719629406929016, + "self_certainty_semantic": -25.5, + "self_certainty_token": -22.0, + "step": 199 + }, + { + "completion_length": 61.59375, + "epoch": 0.22148394241417496, + "grad_norm": 0.6677749156951904, + "kl": 0.0155029296875, + "learning_rate": 8.75e-07, + "loss": 0.0032004087697714567, + "reward": 2.0826478004455566, + "reward_std": 0.48166391253471375, + "rewards/GDino": 0.7572438716888428, + "rewards/GIT": 0.2937658578157425, + "rewards/HPSv2": 0.27109718322753906, + "rewards/ORM": 0.7605408430099487, + "self_certainty_semantic": -25.75, + "self_certainty_token": -20.5625, + "step": 200 + }, + { + "completion_length": 63.96875, + "epoch": 0.22259136212624583, + "grad_norm": 0.5104448199272156, + "kl": 0.009307861328125, + "learning_rate": 8.74375e-07, + "loss": -0.003562572179362178, + "reward": 2.1382156014442444, + "reward_std": 0.3752055764198303, + "rewards/GDino": 0.7233067750930786, + "rewards/GIT": 0.41389697045087814, + "rewards/HPSv2": 0.2719917297363281, + "rewards/ORM": 0.7290200591087341, + "self_certainty_semantic": -25.5, + "self_certainty_token": -21.375, + "step": 201 + }, + { + "completion_length": 76.40625, + "epoch": 0.22369878183831673, + "grad_norm": 0.626039445400238, + "kl": 0.01519775390625, + "learning_rate": 8.7375e-07, + "loss": 0.010693363845348358, + "reward": 2.4189499616622925, + "reward_std": 0.3681239038705826, + "rewards/GDino": 0.8692708611488342, + "rewards/GIT": 0.527855783700943, + "rewards/HPSv2": 0.2807598114013672, + "rewards/ORM": 0.7410636246204376, + "self_certainty_semantic": -25.75, + "self_certainty_token": -22.0625, + "step": 202 + }, + { + "completion_length": 70.53125, + "epoch": 0.2248062015503876, + "grad_norm": 0.3975130319595337, + "kl": 0.0157623291015625, + "learning_rate": 8.73125e-07, + "loss": 0.000663774786517024, + "reward": 2.5051724910736084, + "reward_std": 0.25397956371307373, + "rewards/GDino": 0.8890624940395355, + "rewards/GIT": 0.7177164554595947, + "rewards/HPSv2": 0.2733936309814453, + "rewards/ORM": 0.6249999701976776, + "self_certainty_semantic": -25.625, + "self_certainty_token": -21.9375, + "step": 203 + }, + { + "completion_length": 67.078125, + "epoch": 0.22591362126245848, + "grad_norm": 0.4357939660549164, + "kl": 0.01806640625, + "learning_rate": 8.725e-07, + "loss": 0.00449561863206327, + "reward": 2.077805757522583, + "reward_std": 0.3087446913123131, + "rewards/GDino": 0.76171875, + "rewards/GIT": 0.42673608660697937, + "rewards/HPSv2": 0.27997589111328125, + "rewards/ORM": 0.609375, + "self_certainty_semantic": -25.625, + "self_certainty_token": -21.875, + "step": 204 + }, + { + "completion_length": 64.015625, + "epoch": 0.22702104097452935, + "grad_norm": 0.6120555996894836, + "kl": 0.01470947265625, + "learning_rate": 8.718749999999999e-07, + "loss": -0.004034913959912956, + "reward": 2.361166477203369, + "reward_std": 0.4172802269458771, + "rewards/GDino": 0.7758493423461914, + "rewards/GIT": 0.5358432680368423, + "rewards/HPSv2": 0.2727775573730469, + "rewards/ORM": 0.7766963839530945, + "self_certainty_semantic": -25.6875, + "self_certainty_token": -21.0625, + "step": 205 + }, + { + "completion_length": 66.28125, + "epoch": 0.22812846068660023, + "grad_norm": 0.5106468200683594, + "kl": 0.0121612548828125, + "learning_rate": 8.712499999999999e-07, + "loss": -0.0029943487606942654, + "reward": 2.395945906639099, + "reward_std": 0.2518894746899605, + "rewards/GDino": 0.7812500298023224, + "rewards/GIT": 0.44443757832050323, + "rewards/HPSv2": 0.283966064453125, + "rewards/ORM": 0.8862921893596649, + "self_certainty_semantic": -25.8125, + "self_certainty_token": -21.875, + "step": 206 + }, + { + "completion_length": 80.71875, + "epoch": 0.2292358803986711, + "grad_norm": 0.5985101461410522, + "kl": 0.008514404296875, + "learning_rate": 8.706249999999999e-07, + "loss": -0.009422333678230643, + "reward": 2.2342270612716675, + "reward_std": 0.39967483282089233, + "rewards/GDino": 0.8109811544418335, + "rewards/GIT": 0.4852132052183151, + "rewards/HPSv2": 0.2630786895751953, + "rewards/ORM": 0.6749540567398071, + "self_certainty_semantic": -25.5625, + "self_certainty_token": -22.625, + "step": 207 + }, + { + "completion_length": 72.046875, + "epoch": 0.23034330011074197, + "grad_norm": 0.48401689529418945, + "kl": 0.027587890625, + "learning_rate": 8.699999999999999e-07, + "loss": -0.00892256060615182, + "reward": 2.50811767578125, + "reward_std": 0.23593301326036453, + "rewards/GDino": 0.8662500083446503, + "rewards/GIT": 0.6928490549325943, + "rewards/HPSv2": 0.2693309783935547, + "rewards/ORM": 0.6796875, + "self_certainty_semantic": -25.75, + "self_certainty_token": -22.1875, + "step": 208 + }, + { + "completion_length": 67.953125, + "epoch": 0.23145071982281284, + "grad_norm": 0.7185308933258057, + "kl": 0.03375244140625, + "learning_rate": 8.693749999999999e-07, + "loss": 0.008592829457484186, + "reward": 1.9527746438980103, + "reward_std": 0.44384250044822693, + "rewards/GDino": 0.7293368875980377, + "rewards/GIT": 0.380715548992157, + "rewards/HPSv2": 0.27080535888671875, + "rewards/ORM": 0.571916937828064, + "self_certainty_semantic": -25.6875, + "self_certainty_token": -21.6875, + "step": 209 + }, + { + "completion_length": 70.515625, + "epoch": 0.23255813953488372, + "grad_norm": 0.5452485084533691, + "kl": 0.01983642578125, + "learning_rate": 8.687499999999999e-07, + "loss": 0.008935668971389532, + "reward": 2.4658610820770264, + "reward_std": 0.3191976174712181, + "rewards/GDino": 0.8201898336410522, + "rewards/GIT": 0.4580434560775757, + "rewards/HPSv2": 0.28137779235839844, + "rewards/ORM": 0.90625, + "self_certainty_semantic": -25.6875, + "self_certainty_token": -21.6875, + "step": 210 + }, + { + "completion_length": 67.890625, + "epoch": 0.2336655592469546, + "grad_norm": 0.6218281388282776, + "kl": 0.01678466796875, + "learning_rate": 8.681249999999999e-07, + "loss": 0.003950295504182577, + "reward": 2.025146007537842, + "reward_std": 0.498775839805603, + "rewards/GDino": 0.6994791030883789, + "rewards/GIT": 0.4393797814846039, + "rewards/HPSv2": 0.2803916931152344, + "rewards/ORM": 0.6058953106403351, + "self_certainty_semantic": -25.6875, + "self_certainty_token": -21.375, + "step": 211 + }, + { + "completion_length": 59.34375, + "epoch": 0.23477297895902546, + "grad_norm": 0.43229976296424866, + "kl": 0.0074615478515625, + "learning_rate": 8.675000000000001e-07, + "loss": 0.005056330235674977, + "reward": 1.901893436908722, + "reward_std": 0.49373389780521393, + "rewards/GDino": 0.7495389878749847, + "rewards/GIT": 0.2199169397354126, + "rewards/HPSv2": 0.27065086364746094, + "rewards/ORM": 0.6617866456508636, + "self_certainty_semantic": -25.6875, + "self_certainty_token": -21.3125, + "step": 212 + }, + { + "completion_length": 84.03125, + "epoch": 0.23588039867109634, + "grad_norm": 0.43608731031417847, + "kl": 0.024810791015625, + "learning_rate": 8.66875e-07, + "loss": 0.010223755147308111, + "reward": 2.4147619009017944, + "reward_std": 0.3146657347679138, + "rewards/GDino": 0.852343738079071, + "rewards/GIT": 0.6123765110969543, + "rewards/HPSv2": 0.28365135192871094, + "rewards/ORM": 0.6663902103900909, + "self_certainty_semantic": -25.875, + "self_certainty_token": -22.25, + "step": 213 + }, + { + "completion_length": 77.75, + "epoch": 0.2369878183831672, + "grad_norm": 0.5804117918014526, + "kl": 0.021240234375, + "learning_rate": 8.6625e-07, + "loss": 0.007496127160266042, + "reward": 1.7721906900405884, + "reward_std": 0.48169347643852234, + "rewards/GDino": 0.6889558434486389, + "rewards/GIT": 0.28805774822831154, + "rewards/HPSv2": 0.28142738342285156, + "rewards/ORM": 0.5137497633695602, + "self_certainty_semantic": -25.625, + "self_certainty_token": -21.5, + "step": 214 + }, + { + "completion_length": 70.515625, + "epoch": 0.23809523809523808, + "grad_norm": 0.7613699436187744, + "kl": 0.0208740234375, + "learning_rate": 8.65625e-07, + "loss": 0.002267889678478241, + "reward": 2.477326512336731, + "reward_std": 0.33458858728408813, + "rewards/GDino": 0.8195984661579132, + "rewards/GIT": 0.6178127527236938, + "rewards/HPSv2": 0.2648735046386719, + "rewards/ORM": 0.7750419676303864, + "self_certainty_semantic": -25.4375, + "self_certainty_token": -21.1875, + "step": 215 + }, + { + "completion_length": 62.015625, + "epoch": 0.23920265780730898, + "grad_norm": 0.4545797109603882, + "kl": 0.01617431640625, + "learning_rate": 8.65e-07, + "loss": 0.0009205628884956241, + "reward": 2.2222498059272766, + "reward_std": 0.36637741327285767, + "rewards/GDino": 0.8047255873680115, + "rewards/GIT": 0.4453047811985016, + "rewards/HPSv2": 0.2862739562988281, + "rewards/ORM": 0.6859454959630966, + "self_certainty_semantic": -25.6875, + "self_certainty_token": -22.125, + "step": 216 + }, + { + "completion_length": 74.65625, + "epoch": 0.24031007751937986, + "grad_norm": 0.4494488835334778, + "kl": 0.008087158203125, + "learning_rate": 8.64375e-07, + "loss": -0.008069702424108982, + "reward": 2.0422152280807495, + "reward_std": 0.4399893283843994, + "rewards/GDino": 0.7447916567325592, + "rewards/GIT": 0.5272943079471588, + "rewards/HPSv2": 0.2591876983642578, + "rewards/ORM": 0.5109415352344513, + "self_certainty_semantic": -25.75, + "self_certainty_token": -21.5625, + "step": 217 + }, + { + "completion_length": 72.578125, + "epoch": 0.24141749723145073, + "grad_norm": 0.5540902614593506, + "kl": 0.019287109375, + "learning_rate": 8.6375e-07, + "loss": -0.01052069931756705, + "reward": 1.696807324886322, + "reward_std": 0.3257126286625862, + "rewards/GDino": 0.633100837469101, + "rewards/GIT": 0.3013424575328827, + "rewards/HPSv2": 0.25147247314453125, + "rewards/ORM": 0.5108915567398071, + "self_certainty_semantic": -25.6875, + "self_certainty_token": -21.375, + "step": 218 + }, + { + "completion_length": 64.328125, + "epoch": 0.2425249169435216, + "grad_norm": 0.6624598503112793, + "kl": 0.020233154296875, + "learning_rate": 8.63125e-07, + "loss": 0.00015211279969662428, + "reward": 2.0258015394210815, + "reward_std": 0.2695777714252472, + "rewards/GDino": 0.6833088994026184, + "rewards/GIT": 0.45557114481925964, + "rewards/HPSv2": 0.28227996826171875, + "rewards/ORM": 0.6046415567398071, + "self_certainty_semantic": -25.6875, + "self_certainty_token": -22.0, + "step": 219 + }, + { + "completion_length": 72.375, + "epoch": 0.24363233665559247, + "grad_norm": 1.318785309791565, + "kl": 0.0072784423828125, + "learning_rate": 8.625e-07, + "loss": -0.0014179093122947961, + "reward": 2.136075735092163, + "reward_std": 0.28762874752283096, + "rewards/GDino": 0.8410985469818115, + "rewards/GIT": 0.595182478427887, + "rewards/HPSv2": 0.2622947692871094, + "rewards/ORM": 0.4375, + "self_certainty_semantic": -25.75, + "self_certainty_token": -20.375, + "step": 220 + }, + { + "completion_length": 64.5625, + "epoch": 0.24473975636766335, + "grad_norm": 0.5440139770507812, + "kl": 0.01751708984375, + "learning_rate": 8.618749999999999e-07, + "loss": 0.00410419749096036, + "reward": 1.7209655046463013, + "reward_std": 0.49389104545116425, + "rewards/GDino": 0.6376400589942932, + "rewards/GIT": 0.22850769758224487, + "rewards/HPSv2": 0.2633934020996094, + "rewards/ORM": 0.5914241969585419, + "self_certainty_semantic": -25.75, + "self_certainty_token": -21.5, + "step": 221 + }, + { + "completion_length": 64.421875, + "epoch": 0.24584717607973422, + "grad_norm": 0.39151084423065186, + "kl": 0.02105712890625, + "learning_rate": 8.612499999999999e-07, + "loss": 0.0002729548141360283, + "reward": 1.9599390029907227, + "reward_std": 0.3968782275915146, + "rewards/GDino": 0.6790578365325928, + "rewards/GIT": 0.4293278604745865, + "rewards/HPSv2": 0.2733783721923828, + "rewards/ORM": 0.5781749486923218, + "self_certainty_semantic": -25.5625, + "self_certainty_token": -21.625, + "step": 222 + }, + { + "completion_length": 64.40625, + "epoch": 0.2469545957918051, + "grad_norm": 0.9094283580780029, + "kl": 0.01580810546875, + "learning_rate": 8.606249999999999e-07, + "loss": -0.0013667852617800236, + "reward": 2.021697998046875, + "reward_std": 0.4791509807109833, + "rewards/GDino": 0.7039418518543243, + "rewards/GIT": 0.2243501842021942, + "rewards/HPSv2": 0.27252197265625, + "rewards/ORM": 0.8208840191364288, + "self_certainty_semantic": -25.75, + "self_certainty_token": -21.5, + "step": 223 + }, + { + "completion_length": 55.859375, + "epoch": 0.24806201550387597, + "grad_norm": 0.390924334526062, + "kl": 0.01751708984375, + "learning_rate": 8.599999999999999e-07, + "loss": -0.003383996314369142, + "reward": 2.2570880651474, + "reward_std": 0.2598092332482338, + "rewards/GDino": 0.7818973660469055, + "rewards/GIT": 0.46117232739925385, + "rewards/HPSv2": 0.26709747314453125, + "rewards/ORM": 0.7469209432601929, + "self_certainty_semantic": -25.625, + "self_certainty_token": -21.375, + "step": 224 + }, + { + "completion_length": 72.171875, + "epoch": 0.24916943521594684, + "grad_norm": 0.4422501027584076, + "kl": 0.0198974609375, + "learning_rate": 8.593749999999999e-07, + "loss": -0.0018981220200657845, + "reward": 2.231780171394348, + "reward_std": 0.44659605622291565, + "rewards/GDino": 0.7838541567325592, + "rewards/GIT": 0.575921356678009, + "rewards/HPSv2": 0.2408466339111328, + "rewards/ORM": 0.6311580836772919, + "self_certainty_semantic": -25.625, + "self_certainty_token": -21.3125, + "step": 225 + }, + { + "completion_length": 71.515625, + "epoch": 0.2502768549280177, + "grad_norm": 0.5690449476242065, + "kl": 0.030975341796875, + "learning_rate": 8.587499999999999e-07, + "loss": 0.0036481586284935474, + "reward": 2.3258167505264282, + "reward_std": 0.2903416156768799, + "rewards/GDino": 0.8587138652801514, + "rewards/GIT": 0.4227441996335983, + "rewards/HPSv2": 0.2740001678466797, + "rewards/ORM": 0.7703584432601929, + "self_certainty_semantic": -25.75, + "self_certainty_token": -22.375, + "step": 226 + }, + { + "completion_length": 76.265625, + "epoch": 0.2513842746400886, + "grad_norm": 0.7145232558250427, + "kl": 0.0177001953125, + "learning_rate": 8.581249999999999e-07, + "loss": -0.008261570241302252, + "reward": 1.9078750610351562, + "reward_std": 0.4406122863292694, + "rewards/GDino": 0.6768103837966919, + "rewards/GIT": 0.4432929754257202, + "rewards/HPSv2": 0.25499725341796875, + "rewards/ORM": 0.5327745378017426, + "self_certainty_semantic": -25.5625, + "self_certainty_token": -21.875, + "step": 227 + }, + { + "completion_length": 72.75, + "epoch": 0.25249169435215946, + "grad_norm": 0.41743382811546326, + "kl": 0.011627197265625, + "learning_rate": 8.575e-07, + "loss": -0.0010735401883721352, + "reward": 1.964881420135498, + "reward_std": 0.3422084152698517, + "rewards/GDino": 0.8187020123004913, + "rewards/GIT": 0.21802851557731628, + "rewards/HPSv2": 0.2918586730957031, + "rewards/ORM": 0.6362922191619873, + "self_certainty_semantic": -25.875, + "self_certainty_token": -20.9375, + "step": 228 + }, + { + "completion_length": 73.078125, + "epoch": 0.25359911406423036, + "grad_norm": 0.7283642292022705, + "kl": 0.021484375, + "learning_rate": 8.568750000000001e-07, + "loss": -0.004298686049878597, + "reward": 2.1181740760803223, + "reward_std": 0.2697337493300438, + "rewards/GDino": 0.645624965429306, + "rewards/GIT": 0.3558032214641571, + "rewards/HPSv2": 0.270538330078125, + "rewards/ORM": 0.8462075889110565, + "self_certainty_semantic": -25.6875, + "self_certainty_token": -21.375, + "step": 229 + }, + { + "completion_length": 72.765625, + "epoch": 0.2547065337763012, + "grad_norm": 0.6476506590843201, + "kl": 0.0166015625, + "learning_rate": 8.5625e-07, + "loss": -0.004904653993435204, + "reward": 2.1424243450164795, + "reward_std": 0.3121884614229202, + "rewards/GDino": 0.740700364112854, + "rewards/GIT": 0.3593500852584839, + "rewards/HPSv2": 0.27527427673339844, + "rewards/ORM": 0.7670995891094208, + "self_certainty_semantic": -25.875, + "self_certainty_token": -22.3125, + "step": 230 + }, + { + "completion_length": 69.171875, + "epoch": 0.2558139534883721, + "grad_norm": 0.4526921510696411, + "kl": 0.019744873046875, + "learning_rate": 8.55625e-07, + "loss": 0.0003523953491821885, + "reward": 1.910792589187622, + "reward_std": 0.46326547861099243, + "rewards/GDino": 0.706869900226593, + "rewards/GIT": 0.3568519949913025, + "rewards/HPSv2": 0.26113319396972656, + "rewards/ORM": 0.5859375, + "self_certainty_semantic": -25.625, + "self_certainty_token": -22.0, + "step": 231 + }, + { + "completion_length": 75.609375, + "epoch": 0.25692137320044295, + "grad_norm": 0.47148004174232483, + "kl": 0.01458740234375, + "learning_rate": 8.55e-07, + "loss": -0.005282421130686998, + "reward": 2.063035786151886, + "reward_std": 0.38309258222579956, + "rewards/GDino": 0.7156915068626404, + "rewards/GIT": 0.46562977135181427, + "rewards/HPSv2": 0.26252174377441406, + "rewards/ORM": 0.6191926300525665, + "self_certainty_semantic": -25.625, + "self_certainty_token": -21.4375, + "step": 232 + }, + { + "completion_length": 77.578125, + "epoch": 0.25802879291251385, + "grad_norm": 0.7205236554145813, + "kl": 0.016693115234375, + "learning_rate": 8.54375e-07, + "loss": 0.005424320697784424, + "reward": 2.174328565597534, + "reward_std": 0.28449372947216034, + "rewards/GDino": 0.7531249523162842, + "rewards/GIT": 0.30038363486528397, + "rewards/HPSv2": 0.2665290832519531, + "rewards/ORM": 0.8542908430099487, + "self_certainty_semantic": -25.625, + "self_certainty_token": -21.625, + "step": 233 + }, + { + "completion_length": 67.78125, + "epoch": 0.2591362126245847, + "grad_norm": 0.49076974391937256, + "kl": 0.02978515625, + "learning_rate": 8.5375e-07, + "loss": -0.01528711523860693, + "reward": 2.099589467048645, + "reward_std": 0.3918275982141495, + "rewards/GDino": 0.80809485912323, + "rewards/GIT": 0.311017170548439, + "rewards/HPSv2": 0.2805614471435547, + "rewards/ORM": 0.6999160349369049, + "self_certainty_semantic": -25.6875, + "self_certainty_token": -22.0625, + "step": 234 + }, + { + "completion_length": 67.34375, + "epoch": 0.2602436323366556, + "grad_norm": 0.5299943089485168, + "kl": 0.014190673828125, + "learning_rate": 8.53125e-07, + "loss": 0.00436694361269474, + "reward": 1.8783327341079712, + "reward_std": 0.5424820780754089, + "rewards/GDino": 0.7234554290771484, + "rewards/GIT": 0.3852204605937004, + "rewards/HPSv2": 0.27126121520996094, + "rewards/ORM": 0.49839554727077484, + "self_certainty_semantic": -25.6875, + "self_certainty_token": -21.9375, + "step": 235 + }, + { + "completion_length": 78.203125, + "epoch": 0.26135105204872644, + "grad_norm": 1.0189565420150757, + "kl": 0.010833740234375, + "learning_rate": 8.525e-07, + "loss": 0.005404738476499915, + "reward": 1.477653980255127, + "reward_std": 0.4136325716972351, + "rewards/GDino": 0.7152182459831238, + "rewards/GIT": 0.23550968617200851, + "rewards/HPSv2": 0.25348854064941406, + "rewards/ORM": 0.2734375, + "self_certainty_semantic": -25.5, + "self_certainty_token": -22.1875, + "step": 236 + }, + { + "completion_length": 68.25, + "epoch": 0.26245847176079734, + "grad_norm": 0.642930805683136, + "kl": 0.09527587890625, + "learning_rate": 8.51875e-07, + "loss": -0.0037159734638407826, + "reward": 2.431071162223816, + "reward_std": 0.399463415145874, + "rewards/GDino": 0.8427083194255829, + "rewards/GIT": 0.5333812236785889, + "rewards/HPSv2": 0.2799396514892578, + "rewards/ORM": 0.7750419676303864, + "self_certainty_semantic": -25.6875, + "self_certainty_token": -22.3125, + "step": 237 + }, + { + "completion_length": 64.9375, + "epoch": 0.26356589147286824, + "grad_norm": 0.4478345215320587, + "kl": 0.01031494140625, + "learning_rate": 8.512499999999999e-07, + "loss": 0.00545497820712626, + "reward": 2.0591735243797302, + "reward_std": 0.20043298602104187, + "rewards/GDino": 0.8530542254447937, + "rewards/GIT": 0.46807297319173813, + "rewards/HPSv2": 0.26335906982421875, + "rewards/ORM": 0.47468723356723785, + "self_certainty_semantic": -25.9375, + "self_certainty_token": -20.6875, + "step": 238 + }, + { + "completion_length": 80.0625, + "epoch": 0.2646733111849391, + "grad_norm": 0.5843000411987305, + "kl": 0.0184326171875, + "learning_rate": 8.506249999999999e-07, + "loss": -0.0013531917938962579, + "reward": 2.2810004353523254, + "reward_std": 0.22595498710870743, + "rewards/GDino": 0.6398958265781403, + "rewards/GIT": 0.5575685948133469, + "rewards/HPSv2": 0.2814750671386719, + "rewards/ORM": 0.8020609319210052, + "self_certainty_semantic": -25.625, + "self_certainty_token": -20.8125, + "step": 239 + }, + { + "completion_length": 71.171875, + "epoch": 0.26578073089701, + "grad_norm": 0.5621791481971741, + "kl": 0.0168609619140625, + "learning_rate": 8.499999999999999e-07, + "loss": 0.006901541026309133, + "reward": 2.2162342071533203, + "reward_std": 0.08602850884199142, + "rewards/GDino": 0.8477180302143097, + "rewards/GIT": 0.4770164489746094, + "rewards/HPSv2": 0.26997947692871094, + "rewards/ORM": 0.6215203106403351, + "self_certainty_semantic": -25.6875, + "self_certainty_token": -21.625, + "step": 240 + }, + { + "completion_length": 71.015625, + "epoch": 0.26688815060908083, + "grad_norm": 1.044856309890747, + "kl": 0.02154541015625, + "learning_rate": 8.493749999999999e-07, + "loss": -0.004785971017554402, + "reward": 2.2813735008239746, + "reward_std": 0.25123290345072746, + "rewards/GDino": 0.8095787167549133, + "rewards/GIT": 0.6019963622093201, + "rewards/HPSv2": 0.2698402404785156, + "rewards/ORM": 0.599958062171936, + "self_certainty_semantic": -25.9375, + "self_certainty_token": -22.0, + "step": 241 + }, + { + "completion_length": 70.96875, + "epoch": 0.26799557032115173, + "grad_norm": 0.558193564414978, + "kl": 0.03350830078125, + "learning_rate": 8.487499999999999e-07, + "loss": -0.007248041685670614, + "reward": 1.875414788722992, + "reward_std": 0.22249843925237656, + "rewards/GDino": 0.7162744402885437, + "rewards/GIT": 0.35095490515232086, + "rewards/HPSv2": 0.26055908203125, + "rewards/ORM": 0.5476263463497162, + "self_certainty_semantic": -25.625, + "self_certainty_token": -21.5, + "step": 242 + }, + { + "completion_length": 61.96875, + "epoch": 0.2691029900332226, + "grad_norm": 0.8121842741966248, + "kl": 0.034423828125, + "learning_rate": 8.481249999999999e-07, + "loss": 0.004763577948324382, + "reward": 1.673361897468567, + "reward_std": 0.5223372876644135, + "rewards/GDino": 0.6624484360218048, + "rewards/GIT": 0.24029508233070374, + "rewards/HPSv2": 0.26320648193359375, + "rewards/ORM": 0.5074118822813034, + "self_certainty_semantic": -25.625, + "self_certainty_token": -20.625, + "step": 243 + }, + { + "completion_length": 64.78125, + "epoch": 0.2702104097452935, + "grad_norm": 0.5268959403038025, + "kl": 0.0262451171875, + "learning_rate": 8.475e-07, + "loss": 0.003346863901242614, + "reward": 1.9098018407821655, + "reward_std": 0.2704559862613678, + "rewards/GDino": 0.737314760684967, + "rewards/GIT": 0.2029709815979004, + "rewards/HPSv2": 0.26165771484375, + "rewards/ORM": 0.7078584730625153, + "self_certainty_semantic": -25.4375, + "self_certainty_token": -21.4375, + "step": 244 + }, + { + "completion_length": 69.1875, + "epoch": 0.2713178294573643, + "grad_norm": 0.7254846096038818, + "kl": 0.01483154296875, + "learning_rate": 8.46875e-07, + "loss": 0.009370718151330948, + "reward": 1.6544832587242126, + "reward_std": 0.40473152697086334, + "rewards/GDino": 0.7218703925609589, + "rewards/GIT": 0.26652586460113525, + "rewards/HPSv2": 0.2653217315673828, + "rewards/ORM": 0.40076524019241333, + "self_certainty_semantic": -25.8125, + "self_certainty_token": -22.3125, + "step": 245 + }, + { + "completion_length": 83.21875, + "epoch": 0.2724252491694352, + "grad_norm": 0.7293990850448608, + "kl": 0.02117919921875, + "learning_rate": 8.462499999999999e-07, + "loss": -0.0010048565454781055, + "reward": 2.144185781478882, + "reward_std": 0.38181324303150177, + "rewards/GDino": 0.811366617679596, + "rewards/GIT": 0.5691226869821548, + "rewards/HPSv2": 0.254058837890625, + "rewards/ORM": 0.5096377730369568, + "self_certainty_semantic": -25.8125, + "self_certainty_token": -20.875, + "step": 246 + }, + { + "completion_length": 72.125, + "epoch": 0.27353266888150607, + "grad_norm": 0.6859191656112671, + "kl": 0.03387451171875, + "learning_rate": 8.45625e-07, + "loss": -0.008051293902099133, + "reward": 2.1632007360458374, + "reward_std": 0.37384991347789764, + "rewards/GDino": 0.8133151531219482, + "rewards/GIT": 0.40972038358449936, + "rewards/HPSv2": 0.26703643798828125, + "rewards/ORM": 0.6731287837028503, + "self_certainty_semantic": -25.625, + "self_certainty_token": -21.125, + "step": 247 + }, + { + "completion_length": 74.1875, + "epoch": 0.27464008859357697, + "grad_norm": 0.722845733165741, + "kl": 0.023712158203125, + "learning_rate": 8.45e-07, + "loss": 0.012526229955255985, + "reward": 1.6368342638015747, + "reward_std": 0.4980652183294296, + "rewards/GDino": 0.7129978537559509, + "rewards/GIT": 0.22368073463439941, + "rewards/HPSv2": 0.28719520568847656, + "rewards/ORM": 0.4129604697227478, + "self_certainty_semantic": -25.75, + "self_certainty_token": -22.125, + "step": 248 + }, + { + "completion_length": 74.5625, + "epoch": 0.2757475083056478, + "grad_norm": 0.49814099073410034, + "kl": 0.019317626953125, + "learning_rate": 8.44375e-07, + "loss": -0.0019536763429641724, + "reward": 2.5881810188293457, + "reward_std": 0.2942521944642067, + "rewards/GDino": 0.8687500059604645, + "rewards/GIT": 0.70334193110466, + "rewards/HPSv2": 0.2790355682373047, + "rewards/ORM": 0.737053394317627, + "self_certainty_semantic": -25.8125, + "self_certainty_token": -21.3125, + "step": 249 + }, + { + "completion_length": 73.359375, + "epoch": 0.2768549280177187, + "grad_norm": 0.5322409272193909, + "kl": 0.034423828125, + "learning_rate": 8.4375e-07, + "loss": 0.010972056537866592, + "reward": 2.538747787475586, + "reward_std": 0.269253209233284, + "rewards/GDino": 0.859375, + "rewards/GIT": 0.8257900178432465, + "rewards/HPSv2": 0.26576995849609375, + "rewards/ORM": 0.5878127217292786, + "self_certainty_semantic": -25.6875, + "self_certainty_token": -21.5, + "step": 250 + }, + { + "completion_length": 69.96875, + "epoch": 0.2779623477297896, + "grad_norm": 0.43763142824172974, + "kl": 0.014007568359375, + "learning_rate": 8.43125e-07, + "loss": -0.004584175767377019, + "reward": 2.5028269290924072, + "reward_std": 0.3188868314027786, + "rewards/GDino": 0.878125011920929, + "rewards/GIT": 0.7406049966812134, + "rewards/HPSv2": 0.24962997436523438, + "rewards/ORM": 0.6344668865203857, + "self_certainty_semantic": -25.9375, + "self_certainty_token": -20.9375, + "step": 251 + }, + { + "completion_length": 75.609375, + "epoch": 0.27906976744186046, + "grad_norm": 1.2747613191604614, + "kl": 0.05096435546875, + "learning_rate": 8.425e-07, + "loss": 0.0056330859661102295, + "reward": 1.8026528358459473, + "reward_std": 0.3838294893503189, + "rewards/GDino": 0.7048274576663971, + "rewards/GIT": 0.18401113897562027, + "rewards/HPSv2": 0.2747936248779297, + "rewards/ORM": 0.6390205323696136, + "self_certainty_semantic": -25.625, + "self_certainty_token": -22.375, + "step": 252 + }, + { + "completion_length": 58.328125, + "epoch": 0.28017718715393136, + "grad_norm": 0.49052363634109497, + "kl": 0.011322021484375, + "learning_rate": 8.41875e-07, + "loss": -0.007860599551349878, + "reward": 2.464292287826538, + "reward_std": 0.34174694865942, + "rewards/GDino": 0.8541666567325592, + "rewards/GIT": 0.6527195274829865, + "rewards/HPSv2": 0.24892616271972656, + "rewards/ORM": 0.7084799110889435, + "self_certainty_semantic": -25.875, + "self_certainty_token": -22.0, + "step": 253 + }, + { + "completion_length": 64.75, + "epoch": 0.2812846068660022, + "grad_norm": 2.073457956314087, + "kl": 0.02685546875, + "learning_rate": 8.4125e-07, + "loss": -0.011055355425924063, + "reward": 2.175750970840454, + "reward_std": 0.26358360797166824, + "rewards/GDino": 0.8023437261581421, + "rewards/GIT": 0.6660144329071045, + "rewards/HPSv2": 0.2644081115722656, + "rewards/ORM": 0.44298477470874786, + "self_certainty_semantic": -25.625, + "self_certainty_token": -21.5, + "step": 254 + }, + { + "completion_length": 71.09375, + "epoch": 0.2823920265780731, + "grad_norm": 0.4680713415145874, + "kl": 0.02752685546875, + "learning_rate": 8.406249999999999e-07, + "loss": 0.008177328621968627, + "reward": 2.0434359312057495, + "reward_std": 0.1861441507935524, + "rewards/GDino": 0.6919757723808289, + "rewards/GIT": 0.3358978107571602, + "rewards/HPSv2": 0.2795829772949219, + "rewards/ORM": 0.7359794676303864, + "self_certainty_semantic": -25.75, + "self_certainty_token": -21.875, + "step": 255 + }, + { + "completion_length": 67.28125, + "epoch": 0.28349944629014395, + "grad_norm": 0.4970144033432007, + "kl": 0.015625, + "learning_rate": 8.399999999999999e-07, + "loss": -0.002479484537616372, + "reward": 1.8094860911369324, + "reward_std": 0.33552980422973633, + "rewards/GDino": 0.6692599654197693, + "rewards/GIT": 0.360467329621315, + "rewards/HPSv2": 0.2643547058105469, + "rewards/ORM": 0.5154041647911072, + "self_certainty_semantic": -25.5625, + "self_certainty_token": -21.375, + "step": 256 + }, + { + "completion_length": 65.46875, + "epoch": 0.28460686600221485, + "grad_norm": 0.6144054532051086, + "kl": 0.0169677734375, + "learning_rate": 8.393749999999999e-07, + "loss": 0.004478918854147196, + "reward": 1.7510342001914978, + "reward_std": 0.38688288629055023, + "rewards/GDino": 0.6829120516777039, + "rewards/GIT": 0.3309401273727417, + "rewards/HPSv2": 0.2636985778808594, + "rewards/ORM": 0.4734834134578705, + "self_certainty_semantic": -25.6875, + "self_certainty_token": -21.25, + "step": 257 + }, + { + "completion_length": 83.109375, + "epoch": 0.2857142857142857, + "grad_norm": 0.7908658385276794, + "kl": 0.017425537109375, + "learning_rate": 8.387499999999999e-07, + "loss": 0.008913073223084211, + "reward": 1.6875710487365723, + "reward_std": 0.5354342758655548, + "rewards/GDino": 0.5822916924953461, + "rewards/GIT": 0.18823493272066116, + "rewards/HPSv2": 0.27369117736816406, + "rewards/ORM": 0.6433533132076263, + "self_certainty_semantic": -25.875, + "self_certainty_token": -21.0, + "step": 258 + }, + { + "completion_length": 64.734375, + "epoch": 0.2868217054263566, + "grad_norm": 0.5669509172439575, + "kl": 0.01910400390625, + "learning_rate": 8.38125e-07, + "loss": -0.007796134799718857, + "reward": 2.570701003074646, + "reward_std": 0.17354267835617065, + "rewards/GDino": 0.8989583253860474, + "rewards/GIT": 0.7911819815635681, + "rewards/HPSv2": 0.2594928741455078, + "rewards/ORM": 0.6210678368806839, + "self_certainty_semantic": -25.875, + "self_certainty_token": -21.0625, + "step": 259 + }, + { + "completion_length": 72.84375, + "epoch": 0.28792912513842744, + "grad_norm": 0.43997663259506226, + "kl": 0.05230712890625, + "learning_rate": 8.375e-07, + "loss": -0.004798144684173167, + "reward": 1.865119218826294, + "reward_std": 0.5158642530441284, + "rewards/GDino": 0.6778468787670135, + "rewards/GIT": 0.33176329731941223, + "rewards/HPSv2": 0.2755088806152344, + "rewards/ORM": 0.5800002217292786, + "self_certainty_semantic": -26.0, + "self_certainty_token": -22.125, + "step": 260 + }, + { + "completion_length": 74.875, + "epoch": 0.28903654485049834, + "grad_norm": 0.5706507563591003, + "kl": 0.02374267578125, + "learning_rate": 8.36875e-07, + "loss": 0.004061129409819841, + "reward": 2.234776735305786, + "reward_std": 0.3810935467481613, + "rewards/GDino": 0.7571678161621094, + "rewards/GIT": 0.52987040579319, + "rewards/HPSv2": 0.2680511474609375, + "rewards/ORM": 0.6796875, + "self_certainty_semantic": -25.6875, + "self_certainty_token": -20.6875, + "step": 261 + }, + { + "completion_length": 74.5, + "epoch": 0.2901439645625692, + "grad_norm": 0.46872520446777344, + "kl": 0.01873779296875, + "learning_rate": 8.3625e-07, + "loss": -0.00692132324911654, + "reward": 2.576533555984497, + "reward_std": 0.38655444979667664, + "rewards/GDino": 0.8760416209697723, + "rewards/GIT": 0.6822678744792938, + "rewards/HPSv2": 0.26166534423828125, + "rewards/ORM": 0.7565587162971497, + "self_certainty_semantic": -25.875, + "self_certainty_token": -22.1875, + "step": 262 + }, + { + "completion_length": 76.5625, + "epoch": 0.2912513842746401, + "grad_norm": 0.46919816732406616, + "kl": 0.02130126953125, + "learning_rate": 8.356249999999999e-07, + "loss": -0.0069916946813464165, + "reward": 2.5552332401275635, + "reward_std": 0.2706700414419174, + "rewards/GDino": 0.8554803431034088, + "rewards/GIT": 0.5103462636470795, + "rewards/HPSv2": 0.2714118957519531, + "rewards/ORM": 0.9179946780204773, + "self_certainty_semantic": -25.75, + "self_certainty_token": -21.4375, + "step": 263 + }, + { + "completion_length": 72.765625, + "epoch": 0.292358803986711, + "grad_norm": 0.5821876525878906, + "kl": 0.0244140625, + "learning_rate": 8.349999999999999e-07, + "loss": -0.005310273729264736, + "reward": 2.485088586807251, + "reward_std": 0.3456519544124603, + "rewards/GDino": 0.838541716337204, + "rewards/GIT": 0.6395441293716431, + "rewards/HPSv2": 0.2694988250732422, + "rewards/ORM": 0.7375039756298065, + "self_certainty_semantic": -25.6875, + "self_certainty_token": -21.125, + "step": 264 + }, + { + "completion_length": 76.34375, + "epoch": 0.29346622369878184, + "grad_norm": 0.44273218512535095, + "kl": 0.02325439453125, + "learning_rate": 8.34375e-07, + "loss": 0.00350103247910738, + "reward": 2.10404896736145, + "reward_std": 0.5246832072734833, + "rewards/GDino": 0.8096200525760651, + "rewards/GIT": 0.3854048401117325, + "rewards/HPSv2": 0.27233123779296875, + "rewards/ORM": 0.6366928219795227, + "self_certainty_semantic": -25.8125, + "self_certainty_token": -21.0625, + "step": 265 + }, + { + "completion_length": 75.109375, + "epoch": 0.29457364341085274, + "grad_norm": 0.4895631968975067, + "kl": 0.016632080078125, + "learning_rate": 8.3375e-07, + "loss": -0.00301961723016575, + "reward": 1.9082358479499817, + "reward_std": 0.3556567281484604, + "rewards/GDino": 0.6507861316204071, + "rewards/GIT": 0.3627527952194214, + "rewards/HPSv2": 0.2768878936767578, + "rewards/ORM": 0.6178089678287506, + "self_certainty_semantic": -25.75, + "self_certainty_token": -21.0, + "step": 266 + }, + { + "completion_length": 78.015625, + "epoch": 0.2956810631229236, + "grad_norm": 0.5989522933959961, + "kl": 0.01922607421875, + "learning_rate": 8.33125e-07, + "loss": -0.01323670195415616, + "reward": 2.559838652610779, + "reward_std": 0.33854877948760986, + "rewards/GDino": 0.8420600295066833, + "rewards/GIT": 0.5396290123462677, + "rewards/HPSv2": 0.2910041809082031, + "rewards/ORM": 0.8871452808380127, + "self_certainty_semantic": -26.0625, + "self_certainty_token": -21.625, + "step": 267 + }, + { + "completion_length": 78.328125, + "epoch": 0.2967884828349945, + "grad_norm": 0.44201359152793884, + "kl": 0.0205078125, + "learning_rate": 8.325e-07, + "loss": -0.01491079293191433, + "reward": 2.0024473071098328, + "reward_std": 0.34718990325927734, + "rewards/GDino": 0.7285216152667999, + "rewards/GIT": 0.3823501020669937, + "rewards/HPSv2": 0.26154136657714844, + "rewards/ORM": 0.6300341486930847, + "self_certainty_semantic": -25.625, + "self_certainty_token": -22.0, + "step": 268 + }, + { + "completion_length": 83.40625, + "epoch": 0.2978959025470653, + "grad_norm": 0.8317415118217468, + "kl": 0.0340576171875, + "learning_rate": 8.31875e-07, + "loss": -0.0009441054426133633, + "reward": 2.3879988193511963, + "reward_std": 0.3649384081363678, + "rewards/GDino": 0.8694140315055847, + "rewards/GIT": 0.6274352371692657, + "rewards/HPSv2": 0.26614952087402344, + "rewards/ORM": 0.625, + "self_certainty_semantic": -25.6875, + "self_certainty_token": -20.6875, + "step": 269 + }, + { + "completion_length": 80.125, + "epoch": 0.29900332225913623, + "grad_norm": 0.6886855363845825, + "kl": 0.02099609375, + "learning_rate": 8.3125e-07, + "loss": -0.012794415233656764, + "reward": 2.0984017848968506, + "reward_std": 0.1863284632563591, + "rewards/GDino": 0.7838541865348816, + "rewards/GIT": 0.43376635760068893, + "rewards/HPSv2": 0.27881813049316406, + "rewards/ORM": 0.6019631624221802, + "self_certainty_semantic": -25.75, + "self_certainty_token": -21.3125, + "step": 270 + }, + { + "completion_length": 79.578125, + "epoch": 0.3001107419712071, + "grad_norm": 0.4688260853290558, + "kl": 0.028076171875, + "learning_rate": 8.306249999999999e-07, + "loss": -0.008426547283306718, + "reward": 1.7719020247459412, + "reward_std": 0.5150116533041, + "rewards/GDino": 0.7138240337371826, + "rewards/GIT": 0.2824729457497597, + "rewards/HPSv2": 0.26485443115234375, + "rewards/ORM": 0.5107506066560745, + "self_certainty_semantic": -25.5625, + "self_certainty_token": -21.6875, + "step": 271 + }, + { + "completion_length": 70.484375, + "epoch": 0.301218161683278, + "grad_norm": 0.7626622319221497, + "kl": 0.0302734375, + "learning_rate": 8.299999999999999e-07, + "loss": -0.007479890366084874, + "reward": 1.8375248312950134, + "reward_std": 0.39174318313598633, + "rewards/GDino": 0.6447004973888397, + "rewards/GIT": 0.23957757651805878, + "rewards/HPSv2": 0.2825756072998047, + "rewards/ORM": 0.6706711649894714, + "self_certainty_semantic": -25.6875, + "self_certainty_token": -21.9375, + "step": 272 + }, + { + "completion_length": 87.921875, + "epoch": 0.3023255813953488, + "grad_norm": 2.0423707962036133, + "kl": 0.02557373046875, + "learning_rate": 8.293749999999999e-07, + "loss": -0.0029805664089508355, + "reward": 1.7430112957954407, + "reward_std": 0.2507159113883972, + "rewards/GDino": 0.6535544991493225, + "rewards/GIT": 0.31878431141376495, + "rewards/HPSv2": 0.2754058837890625, + "rewards/ORM": 0.4952665716409683, + "self_certainty_semantic": -25.6875, + "self_certainty_token": -22.0625, + "step": 273 + }, + { + "completion_length": 81.4375, + "epoch": 0.3034330011074197, + "grad_norm": 0.7430813908576965, + "kl": 0.0899658203125, + "learning_rate": 8.287499999999999e-07, + "loss": -0.001141307526268065, + "reward": 2.2495153546333313, + "reward_std": 0.34138451516628265, + "rewards/GDino": 0.8024103045463562, + "rewards/GIT": 0.4980108290910721, + "rewards/HPSv2": 0.2546348571777344, + "rewards/ORM": 0.6944593787193298, + "self_certainty_semantic": -25.6875, + "self_certainty_token": -22.0625, + "step": 274 + }, + { + "completion_length": 82.0, + "epoch": 0.30454042081949056, + "grad_norm": 0.6249475479125977, + "kl": 0.021484375, + "learning_rate": 8.28125e-07, + "loss": -0.006782526383176446, + "reward": 2.718081474304199, + "reward_std": 0.2731604278087616, + "rewards/GDino": 0.8783854246139526, + "rewards/GIT": 0.782598078250885, + "rewards/HPSv2": 0.2852649688720703, + "rewards/ORM": 0.771833062171936, + "self_certainty_semantic": -25.75, + "self_certainty_token": -21.5625, + "step": 275 + }, + { + "completion_length": 67.046875, + "epoch": 0.30564784053156147, + "grad_norm": 0.4724781811237335, + "kl": 0.01434326171875, + "learning_rate": 8.275e-07, + "loss": 0.00598024798091501, + "reward": 2.1995996236801147, + "reward_std": 0.31263478100299835, + "rewards/GDino": 0.7332243323326111, + "rewards/GIT": 0.4423800855875015, + "rewards/HPSv2": 0.27051544189453125, + "rewards/ORM": 0.7534796893596649, + "self_certainty_semantic": -25.75, + "self_certainty_token": -21.4375, + "step": 276 + }, + { + "completion_length": 84.015625, + "epoch": 0.3067552602436323, + "grad_norm": 0.7819753289222717, + "kl": 0.0323486328125, + "learning_rate": 8.26875e-07, + "loss": -0.0006253474857658148, + "reward": 2.2662617564201355, + "reward_std": 0.33760039508342743, + "rewards/GDino": 0.7433146238327026, + "rewards/GIT": 0.5460084825754166, + "rewards/HPSv2": 0.27779579162597656, + "rewards/ORM": 0.6991429030895233, + "self_certainty_semantic": -25.6875, + "self_certainty_token": -21.5, + "step": 277 + }, + { + "completion_length": 69.65625, + "epoch": 0.3078626799557032, + "grad_norm": 0.4542302191257477, + "kl": 0.0260009765625, + "learning_rate": 8.2625e-07, + "loss": 0.0030680494382977486, + "reward": 2.175543189048767, + "reward_std": 0.4059564173221588, + "rewards/GDino": 0.8273285925388336, + "rewards/GIT": 0.6166777014732361, + "rewards/HPSv2": 0.2625160217285156, + "rewards/ORM": 0.4690207839012146, + "self_certainty_semantic": -25.75, + "self_certainty_token": -21.1875, + "step": 278 + }, + { + "completion_length": 73.71875, + "epoch": 0.3089700996677741, + "grad_norm": 0.5411505103111267, + "kl": 0.019134521484375, + "learning_rate": 8.25625e-07, + "loss": -0.010078638093546033, + "reward": 2.640411376953125, + "reward_std": 0.28325602412223816, + "rewards/GDino": 0.8475841879844666, + "rewards/GIT": 0.771721363067627, + "rewards/HPSv2": 0.266693115234375, + "rewards/ORM": 0.7544127404689789, + "self_certainty_semantic": -25.875, + "self_certainty_token": -21.0, + "step": 279 + }, + { + "completion_length": 73.46875, + "epoch": 0.31007751937984496, + "grad_norm": 0.46741002798080444, + "kl": 0.026458740234375, + "learning_rate": 8.249999999999999e-07, + "loss": -0.0019589242292568088, + "reward": 2.315522074699402, + "reward_std": 0.3852066993713379, + "rewards/GDino": 0.8072916269302368, + "rewards/GIT": 0.51693394780159, + "rewards/HPSv2": 0.2585258483886719, + "rewards/ORM": 0.732770562171936, + "self_certainty_semantic": -25.625, + "self_certainty_token": -22.25, + "step": 280 + }, + { + "completion_length": 73.0625, + "epoch": 0.31118493909191586, + "grad_norm": 0.5686728954315186, + "kl": 0.033203125, + "learning_rate": 8.243749999999999e-07, + "loss": -0.00037761888233944774, + "reward": 2.1060147285461426, + "reward_std": 0.2314438670873642, + "rewards/GDino": 0.7503374814987183, + "rewards/GIT": 0.43248776346445084, + "rewards/HPSv2": 0.27953529357910156, + "rewards/ORM": 0.6436541378498077, + "self_certainty_semantic": -25.5625, + "self_certainty_token": -21.8125, + "step": 281 + }, + { + "completion_length": 89.703125, + "epoch": 0.3122923588039867, + "grad_norm": 1.0470083951950073, + "kl": 0.02166748046875, + "learning_rate": 8.2375e-07, + "loss": -0.0013394411653280258, + "reward": 2.125031590461731, + "reward_std": 0.24564317613840103, + "rewards/GDino": 0.8199155628681183, + "rewards/GIT": 0.5837800800800323, + "rewards/HPSv2": 0.2696857452392578, + "rewards/ORM": 0.45165039598941803, + "self_certainty_semantic": -26.0, + "self_certainty_token": -20.9375, + "step": 282 + }, + { + "completion_length": 83.28125, + "epoch": 0.3133997785160576, + "grad_norm": 0.6488218903541565, + "kl": 0.0267333984375, + "learning_rate": 8.23125e-07, + "loss": 0.005290511529892683, + "reward": 2.1087766885757446, + "reward_std": 0.31704336404800415, + "rewards/GDino": 0.7598958313465118, + "rewards/GIT": 0.4413727596402168, + "rewards/HPSv2": 0.25473785400390625, + "rewards/ORM": 0.6527703106403351, + "self_certainty_semantic": -25.8125, + "self_certainty_token": -21.1875, + "step": 283 + }, + { + "completion_length": 78.765625, + "epoch": 0.31450719822812845, + "grad_norm": 0.8609553575515747, + "kl": 0.03515625, + "learning_rate": 8.225e-07, + "loss": 0.0021090602967888117, + "reward": 1.7876678705215454, + "reward_std": 0.3264614939689636, + "rewards/GDino": 0.7270833253860474, + "rewards/GIT": 0.3341338261961937, + "rewards/HPSv2": 0.26984596252441406, + "rewards/ORM": 0.4566046893596649, + "self_certainty_semantic": -25.5625, + "self_certainty_token": -21.9375, + "step": 284 + }, + { + "completion_length": 67.046875, + "epoch": 0.31561461794019935, + "grad_norm": 0.6733913421630859, + "kl": 0.049072265625, + "learning_rate": 8.21875e-07, + "loss": -0.009910227498039603, + "reward": 1.693782925605774, + "reward_std": 0.39314794540405273, + "rewards/GDino": 0.7284678220748901, + "rewards/GIT": 0.18779370188713074, + "rewards/HPSv2": 0.27591705322265625, + "rewards/ORM": 0.5016044080257416, + "self_certainty_semantic": -25.75, + "self_certainty_token": -21.1875, + "step": 285 + }, + { + "completion_length": 78.75, + "epoch": 0.3167220376522702, + "grad_norm": 0.4943682849407196, + "kl": 0.018035888671875, + "learning_rate": 8.2125e-07, + "loss": -0.0012577057350426912, + "reward": 2.193196415901184, + "reward_std": 0.46209120750427246, + "rewards/GDino": 0.8109375238418579, + "rewards/GIT": 0.4691374748945236, + "rewards/HPSv2": 0.26856422424316406, + "rewards/ORM": 0.6445571482181549, + "self_certainty_semantic": -25.5625, + "self_certainty_token": -21.1875, + "step": 286 + }, + { + "completion_length": 75.4375, + "epoch": 0.3178294573643411, + "grad_norm": 0.6116342544555664, + "kl": 0.0264892578125, + "learning_rate": 8.20625e-07, + "loss": 0.0010706414468586445, + "reward": 2.312258720397949, + "reward_std": 0.3226209282875061, + "rewards/GDino": 0.7510090172290802, + "rewards/GIT": 0.5955759733915329, + "rewards/HPSv2": 0.27696990966796875, + "rewards/ORM": 0.6887038052082062, + "self_certainty_semantic": -25.75, + "self_certainty_token": -21.5, + "step": 287 + }, + { + "completion_length": 80.921875, + "epoch": 0.31893687707641194, + "grad_norm": 0.5414326190948486, + "kl": 0.02142333984375, + "learning_rate": 8.199999999999999e-07, + "loss": -0.0018570725806057453, + "reward": 2.3252252340316772, + "reward_std": 0.4416055828332901, + "rewards/GDino": 0.7875000238418579, + "rewards/GIT": 0.44259513914585114, + "rewards/HPSv2": 0.25606346130371094, + "rewards/ORM": 0.8390664756298065, + "self_certainty_semantic": -25.625, + "self_certainty_token": -21.875, + "step": 288 + }, + { + "completion_length": 73.453125, + "epoch": 0.32004429678848284, + "grad_norm": 2.0414440631866455, + "kl": 0.0384521484375, + "learning_rate": 8.193749999999999e-07, + "loss": -0.0028190852608531713, + "reward": 2.363397717475891, + "reward_std": 0.38728441298007965, + "rewards/GDino": 0.7780884504318237, + "rewards/GIT": 0.454902321100235, + "rewards/HPSv2": 0.2644233703613281, + "rewards/ORM": 0.8659836649894714, + "self_certainty_semantic": -25.875, + "self_certainty_token": -21.6875, + "step": 289 + }, + { + "completion_length": 93.84375, + "epoch": 0.3211517165005537, + "grad_norm": 0.46440112590789795, + "kl": 0.0213623046875, + "learning_rate": 8.187499999999999e-07, + "loss": -0.003084618365392089, + "reward": 1.9635199308395386, + "reward_std": 0.3792533278465271, + "rewards/GDino": 0.6178125143051147, + "rewards/GIT": 0.32137173414230347, + "rewards/HPSv2": 0.27581024169921875, + "rewards/ORM": 0.7485254108905792, + "self_certainty_semantic": -25.6875, + "self_certainty_token": -22.0, + "step": 290 + }, + { + "completion_length": 78.65625, + "epoch": 0.3222591362126246, + "grad_norm": 0.6124201416969299, + "kl": 0.0443115234375, + "learning_rate": 8.18125e-07, + "loss": 0.0013587521389126778, + "reward": 1.9497240781784058, + "reward_std": 0.3417292982339859, + "rewards/GDino": 0.8001987338066101, + "rewards/GIT": 0.39713574945926666, + "rewards/HPSv2": 0.26921844482421875, + "rewards/ORM": 0.483171209692955, + "self_certainty_semantic": -25.5625, + "self_certainty_token": -21.8125, + "step": 291 + }, + { + "completion_length": 92.625, + "epoch": 0.3233665559246955, + "grad_norm": 0.9040305018424988, + "kl": 0.027099609375, + "learning_rate": 8.175e-07, + "loss": 0.0031354378443211317, + "reward": 2.339022397994995, + "reward_std": 0.21652893722057343, + "rewards/GDino": 0.839062511920929, + "rewards/GIT": 0.5841934829950333, + "rewards/HPSv2": 0.27198219299316406, + "rewards/ORM": 0.6437839716672897, + "self_certainty_semantic": -25.5, + "self_certainty_token": -21.125, + "step": 292 + }, + { + "completion_length": 79.234375, + "epoch": 0.32447397563676633, + "grad_norm": 0.46516144275665283, + "kl": 0.0208740234375, + "learning_rate": 8.16875e-07, + "loss": 0.009912369772791862, + "reward": 2.337222397327423, + "reward_std": 0.2715953439474106, + "rewards/GDino": 0.7020833194255829, + "rewards/GIT": 0.46142156422138214, + "rewards/HPSv2": 0.2674674987792969, + "rewards/ORM": 0.90625, + "self_certainty_semantic": -25.875, + "self_certainty_token": -20.875, + "step": 293 + }, + { + "completion_length": 77.421875, + "epoch": 0.32558139534883723, + "grad_norm": 0.5907319784164429, + "kl": 0.0245361328125, + "learning_rate": 8.1625e-07, + "loss": -0.008728538639843464, + "reward": 2.7809219360351562, + "reward_std": 0.24086545407772064, + "rewards/GDino": 0.882291704416275, + "rewards/GIT": 0.6462251394987106, + "rewards/HPSv2": 0.28365516662597656, + "rewards/ORM": 0.96875, + "self_certainty_semantic": -25.5625, + "self_certainty_token": -21.75, + "step": 294 + }, + { + "completion_length": 83.265625, + "epoch": 0.3266888150609081, + "grad_norm": 1.4958977699279785, + "kl": 0.0491943359375, + "learning_rate": 8.15625e-07, + "loss": 0.0032177013345062733, + "reward": 1.7095434069633484, + "reward_std": 0.4502948075532913, + "rewards/GDino": 0.6662150025367737, + "rewards/GIT": 0.10812210291624069, + "rewards/HPSv2": 0.2896270751953125, + "rewards/ORM": 0.6455792784690857, + "self_certainty_semantic": -25.8125, + "self_certainty_token": -21.9375, + "step": 295 + }, + { + "completion_length": 73.25, + "epoch": 0.327796234772979, + "grad_norm": 6.574085712432861, + "kl": 0.02142333984375, + "learning_rate": 8.149999999999999e-07, + "loss": 0.0013078644406050444, + "reward": 2.7796449661254883, + "reward_std": 0.2162124067544937, + "rewards/GDino": 0.9666666984558105, + "rewards/GIT": 0.8142045736312866, + "rewards/HPSv2": 0.2789497375488281, + "rewards/ORM": 0.7198239415884018, + "self_certainty_semantic": -26.0, + "self_certainty_token": -20.5625, + "step": 296 + }, + { + "completion_length": 81.25, + "epoch": 0.3289036544850498, + "grad_norm": 2.1547293663024902, + "kl": 0.056884765625, + "learning_rate": 8.143749999999999e-07, + "loss": 0.01906517706811428, + "reward": 1.9463022351264954, + "reward_std": 0.301323801279068, + "rewards/GDino": 0.816184788942337, + "rewards/GIT": 0.2949872240424156, + "rewards/HPSv2": 0.2578582763671875, + "rewards/ORM": 0.5772718787193298, + "self_certainty_semantic": -25.8125, + "self_certainty_token": -22.25, + "step": 297 + }, + { + "completion_length": 79.46875, + "epoch": 0.3300110741971207, + "grad_norm": 0.6863793134689331, + "kl": 0.0230712890625, + "learning_rate": 8.137499999999999e-07, + "loss": -0.005342667340300977, + "reward": 2.2086856365203857, + "reward_std": 0.255329854786396, + "rewards/GDino": 0.8382692337036133, + "rewards/GIT": 0.6078545451164246, + "rewards/HPSv2": 0.2703742980957031, + "rewards/ORM": 0.4921875, + "self_certainty_semantic": -25.8125, + "self_certainty_token": -22.0, + "step": 298 + }, + { + "completion_length": 77.984375, + "epoch": 0.33111849390919157, + "grad_norm": 1.2127599716186523, + "kl": 0.02545166015625, + "learning_rate": 8.131249999999999e-07, + "loss": -0.0011417875648476183, + "reward": 2.55258309841156, + "reward_std": 0.41294097900390625, + "rewards/GDino": 0.8872395753860474, + "rewards/GIT": 0.7489987313747406, + "rewards/HPSv2": 0.2650489807128906, + "rewards/ORM": 0.6512957215309143, + "self_certainty_semantic": -25.875, + "self_certainty_token": -21.6875, + "step": 299 + }, + { + "completion_length": 73.25, + "epoch": 0.33222591362126247, + "grad_norm": 0.4840102791786194, + "kl": 0.02264404296875, + "learning_rate": 8.125e-07, + "loss": 0.0034709569881670177, + "reward": 2.5574848651885986, + "reward_std": 0.20182272791862488, + "rewards/GDino": 0.7973958551883698, + "rewards/GIT": 0.6920412927865982, + "rewards/HPSv2": 0.2663593292236328, + "rewards/ORM": 0.8016884028911591, + "self_certainty_semantic": -25.8125, + "self_certainty_token": -21.1875, + "step": 300 + }, + { + "completion_length": 88.65625, + "epoch": 0.3333333333333333, + "grad_norm": 0.4527220129966736, + "kl": 0.021453857421875, + "learning_rate": 8.11875e-07, + "loss": 0.005656351568177342, + "reward": 2.1882619857788086, + "reward_std": 0.3654931038618088, + "rewards/GDino": 0.7827093601226807, + "rewards/GIT": 0.4531768709421158, + "rewards/HPSv2": 0.255859375, + "rewards/ORM": 0.6965163350105286, + "self_certainty_semantic": -26.0, + "self_certainty_token": -21.5625, + "step": 301 + }, + { + "completion_length": 82.03125, + "epoch": 0.3344407530454042, + "grad_norm": 0.5959556698799133, + "kl": 0.023193359375, + "learning_rate": 8.1125e-07, + "loss": 0.006475352216511965, + "reward": 2.2507412433624268, + "reward_std": 0.34599871933460236, + "rewards/GDino": 0.8190558552742004, + "rewards/GIT": 0.4401208460330963, + "rewards/HPSv2": 0.26786041259765625, + "rewards/ORM": 0.7237042486667633, + "self_certainty_semantic": -25.6875, + "self_certainty_token": -21.375, + "step": 302 + }, + { + "completion_length": 70.890625, + "epoch": 0.33554817275747506, + "grad_norm": 0.5255715847015381, + "kl": 0.0272216796875, + "learning_rate": 8.10625e-07, + "loss": -0.001262905541807413, + "reward": 1.8884990215301514, + "reward_std": 0.4090830981731415, + "rewards/GDino": 0.6540113091468811, + "rewards/GIT": 0.17955374717712402, + "rewards/HPSv2": 0.27841758728027344, + "rewards/ORM": 0.7765165269374847, + "self_certainty_semantic": -25.8125, + "self_certainty_token": -23.1875, + "step": 303 + }, + { + "completion_length": 77.75, + "epoch": 0.33665559246954596, + "grad_norm": 0.4031485915184021, + "kl": 0.019073486328125, + "learning_rate": 8.1e-07, + "loss": -0.0022556333569809794, + "reward": 2.027117609977722, + "reward_std": 0.46946775913238525, + "rewards/GDino": 0.7257461845874786, + "rewards/GIT": 0.417344406247139, + "rewards/HPSv2": 0.2757759094238281, + "rewards/ORM": 0.6082510948181152, + "self_certainty_semantic": -25.8125, + "self_certainty_token": -21.625, + "step": 304 + }, + { + "completion_length": 78.375, + "epoch": 0.3377630121816168, + "grad_norm": 1.838289737701416, + "kl": 0.06536865234375, + "learning_rate": 8.093749999999999e-07, + "loss": -0.005356153065804392, + "reward": 1.969985008239746, + "reward_std": 0.3424445614218712, + "rewards/GDino": 0.718147873878479, + "rewards/GIT": 0.3563241511583328, + "rewards/HPSv2": 0.2647056579589844, + "rewards/ORM": 0.6308073699474335, + "self_certainty_semantic": -25.8125, + "self_certainty_token": -22.3125, + "step": 305 + }, + { + "completion_length": 89.703125, + "epoch": 0.3388704318936877, + "grad_norm": 1.6125619411468506, + "kl": 0.03436279296875, + "learning_rate": 8.087499999999999e-07, + "loss": -0.00043197721242904663, + "reward": 1.5673499703407288, + "reward_std": 0.45585790276527405, + "rewards/GDino": 0.6186152696609497, + "rewards/GIT": 0.20087126642465591, + "rewards/HPSv2": 0.2628669738769531, + "rewards/ORM": 0.4849964678287506, + "self_certainty_semantic": -25.875, + "self_certainty_token": -22.375, + "step": 306 + }, + { + "completion_length": 81.390625, + "epoch": 0.3399778516057586, + "grad_norm": 1.2003884315490723, + "kl": 0.01556396484375, + "learning_rate": 8.08125e-07, + "loss": -0.0018971394747495651, + "reward": 2.113324999809265, + "reward_std": 0.27119018137454987, + "rewards/GDino": 0.7391741275787354, + "rewards/GIT": 0.6144535839557648, + "rewards/HPSv2": 0.25809288024902344, + "rewards/ORM": 0.5016044676303864, + "self_certainty_semantic": -25.9375, + "self_certainty_token": -21.25, + "step": 307 + }, + { + "completion_length": 81.4375, + "epoch": 0.34108527131782945, + "grad_norm": 0.4800432324409485, + "kl": 0.022216796875, + "learning_rate": 8.075e-07, + "loss": -0.004123867256566882, + "reward": 2.46367871761322, + "reward_std": 0.2835230827331543, + "rewards/GDino": 0.9177083075046539, + "rewards/GIT": 0.6127268970012665, + "rewards/HPSv2": 0.2661018371582031, + "rewards/ORM": 0.6671415567398071, + "self_certainty_semantic": -25.875, + "self_certainty_token": -21.1875, + "step": 308 + }, + { + "completion_length": 84.890625, + "epoch": 0.34219269102990035, + "grad_norm": 0.6846179962158203, + "kl": 0.02520751953125, + "learning_rate": 8.06875e-07, + "loss": -0.00010142242535948753, + "reward": 2.089680314064026, + "reward_std": 0.2894028127193451, + "rewards/GDino": 0.7455952167510986, + "rewards/GIT": 0.3249504566192627, + "rewards/HPSv2": 0.2928428649902344, + "rewards/ORM": 0.7262917459011078, + "self_certainty_semantic": -25.75, + "self_certainty_token": -21.5625, + "step": 309 + }, + { + "completion_length": 78.71875, + "epoch": 0.3433001107419712, + "grad_norm": 0.5359929203987122, + "kl": 0.03472900390625, + "learning_rate": 8.0625e-07, + "loss": -0.002803298644721508, + "reward": 2.7762473821640015, + "reward_std": 0.2040305808186531, + "rewards/GDino": 0.9227638244628906, + "rewards/GIT": 0.6611425876617432, + "rewards/HPSv2": 0.2548408508300781, + "rewards/ORM": 0.9375, + "self_certainty_semantic": -25.875, + "self_certainty_token": -21.375, + "step": 310 + }, + { + "completion_length": 70.71875, + "epoch": 0.3444075304540421, + "grad_norm": 0.42147886753082275, + "kl": 0.016876220703125, + "learning_rate": 8.05625e-07, + "loss": -0.006496510934084654, + "reward": 2.523200273513794, + "reward_std": 0.2876928895711899, + "rewards/GDino": 0.8453125357627869, + "rewards/GIT": 0.5663794428110123, + "rewards/HPSv2": 0.2585620880126953, + "rewards/ORM": 0.8529461622238159, + "self_certainty_semantic": -25.75, + "self_certainty_token": -21.0, + "step": 311 + }, + { + "completion_length": 70.34375, + "epoch": 0.34551495016611294, + "grad_norm": 0.6237341165542603, + "kl": 0.0382080078125, + "learning_rate": 8.05e-07, + "loss": 0.0011792382574640214, + "reward": 2.307464361190796, + "reward_std": 0.33230580389499664, + "rewards/GDino": 0.8716782331466675, + "rewards/GIT": 0.4658700153231621, + "rewards/HPSv2": 0.2794170379638672, + "rewards/ORM": 0.6904991567134857, + "self_certainty_semantic": -25.75, + "self_certainty_token": -21.5, + "step": 312 + }, + { + "completion_length": 81.25, + "epoch": 0.34662236987818384, + "grad_norm": 0.483153760433197, + "kl": 0.024169921875, + "learning_rate": 8.043749999999999e-07, + "loss": 0.0035759536549448967, + "reward": 1.6738215684890747, + "reward_std": 0.3843349516391754, + "rewards/GDino": 0.5876509547233582, + "rewards/GIT": 0.29283201694488525, + "rewards/HPSv2": 0.2635631561279297, + "rewards/ORM": 0.5297753810882568, + "self_certainty_semantic": -25.5625, + "self_certainty_token": -21.1875, + "step": 313 + }, + { + "completion_length": 86.796875, + "epoch": 0.3477297895902547, + "grad_norm": 2.524872064590454, + "kl": 0.04315185546875, + "learning_rate": 8.037499999999999e-07, + "loss": -0.013335694558918476, + "reward": 1.9714076519012451, + "reward_std": 0.2786209136247635, + "rewards/GDino": 0.8052083551883698, + "rewards/GIT": 0.45579826831817627, + "rewards/HPSv2": 0.251068115234375, + "rewards/ORM": 0.45933302491903305, + "self_certainty_semantic": -25.5625, + "self_certainty_token": -20.5625, + "step": 314 + }, + { + "completion_length": 77.65625, + "epoch": 0.3488372093023256, + "grad_norm": 0.6192425489425659, + "kl": 0.02899169921875, + "learning_rate": 8.031249999999999e-07, + "loss": -4.310737131163478e-05, + "reward": 2.4499722719192505, + "reward_std": 0.2559589520096779, + "rewards/GDino": 0.846875011920929, + "rewards/GIT": 0.6022514998912811, + "rewards/HPSv2": 0.27861595153808594, + "rewards/ORM": 0.7222296893596649, + "self_certainty_semantic": -25.8125, + "self_certainty_token": -21.9375, + "step": 315 + }, + { + "completion_length": 78.171875, + "epoch": 0.34994462901439644, + "grad_norm": 0.5415306091308594, + "kl": 0.0198974609375, + "learning_rate": 8.024999999999999e-07, + "loss": 0.00023352215066552162, + "reward": 2.4360201358795166, + "reward_std": 0.3337967246770859, + "rewards/GDino": 0.7447916269302368, + "rewards/GIT": 0.5895867943763733, + "rewards/HPSv2": 0.2578916549682617, + "rewards/ORM": 0.84375, + "self_certainty_semantic": -25.6875, + "self_certainty_token": -22.1875, + "step": 316 + }, + { + "completion_length": 85.984375, + "epoch": 0.35105204872646734, + "grad_norm": 0.9716246128082275, + "kl": 0.05963134765625, + "learning_rate": 8.018749999999999e-07, + "loss": 0.017504149582237005, + "reward": 1.6575236916542053, + "reward_std": 0.36309416592121124, + "rewards/GDino": 0.6602314710617065, + "rewards/GIT": 0.22271956503391266, + "rewards/HPSv2": 0.28050994873046875, + "rewards/ORM": 0.49406272172927856, + "self_certainty_semantic": -25.75, + "self_certainty_token": -21.3125, + "step": 317 + }, + { + "completion_length": 79.8125, + "epoch": 0.3521594684385382, + "grad_norm": 0.6221381425857544, + "kl": 0.0673828125, + "learning_rate": 8.0125e-07, + "loss": -0.015689235646277666, + "reward": 2.1572564244270325, + "reward_std": 0.41349759697914124, + "rewards/GDino": 0.7671919465065002, + "rewards/GIT": 0.45393163710832596, + "rewards/HPSv2": 0.27384376525878906, + "rewards/ORM": 0.662289023399353, + "self_certainty_semantic": -25.8125, + "self_certainty_token": -21.6875, + "step": 318 + }, + { + "completion_length": 79.3125, + "epoch": 0.3532668881506091, + "grad_norm": 0.41829535365104675, + "kl": 0.023162841796875, + "learning_rate": 8.00625e-07, + "loss": -0.00858006183989346, + "reward": 1.9483414888381958, + "reward_std": 0.3017558604478836, + "rewards/GDino": 0.6993304491043091, + "rewards/GIT": 0.268573135137558, + "rewards/HPSv2": 0.2617378234863281, + "rewards/ORM": 0.7187000811100006, + "self_certainty_semantic": -25.8125, + "self_certainty_token": -21.0, + "step": 319 + }, + { + "completion_length": 85.75, + "epoch": 0.35437430786268, + "grad_norm": 0.4535002112388611, + "kl": 0.0247802734375, + "learning_rate": 8e-07, + "loss": -0.0024052427615970373, + "reward": 2.0683862566947937, + "reward_std": 0.33450697362422943, + "rewards/GDino": 0.7158985733985901, + "rewards/GIT": 0.5421321392059326, + "rewards/HPSv2": 0.2665596008300781, + "rewards/ORM": 0.5437959656119347, + "self_certainty_semantic": -25.8125, + "self_certainty_token": -21.75, + "step": 320 + }, + { + "completion_length": 81.296875, + "epoch": 0.3554817275747508, + "grad_norm": 0.5315317511558533, + "kl": 0.026611328125, + "learning_rate": 7.993749999999999e-07, + "loss": -0.009409249760210514, + "reward": 2.1791539788246155, + "reward_std": 0.3816552609205246, + "rewards/GDino": 0.8082575500011444, + "rewards/GIT": 0.4406343102455139, + "rewards/HPSv2": 0.26690101623535156, + "rewards/ORM": 0.6633611619472504, + "self_certainty_semantic": -25.8125, + "self_certainty_token": -21.0, + "step": 321 + }, + { + "completion_length": 62.4375, + "epoch": 0.35658914728682173, + "grad_norm": 0.528831958770752, + "kl": 0.02685546875, + "learning_rate": 7.9875e-07, + "loss": 0.016546542290598154, + "reward": 2.6317901611328125, + "reward_std": 0.2945253401994705, + "rewards/GDino": 0.903124988079071, + "rewards/GIT": 0.6644484996795654, + "rewards/HPSv2": 0.2814922332763672, + "rewards/ORM": 0.7827245891094208, + "self_certainty_semantic": -25.8125, + "self_certainty_token": -21.8125, + "step": 322 + }, + { + "completion_length": 70.6875, + "epoch": 0.3576965669988926, + "grad_norm": 0.48459064960479736, + "kl": 0.034423828125, + "learning_rate": 7.98125e-07, + "loss": 0.0045278145698830485, + "reward": 2.248463988304138, + "reward_std": 0.21271561086177826, + "rewards/GDino": 0.792187511920929, + "rewards/GIT": 0.5167302191257477, + "rewards/HPSv2": 0.2770881652832031, + "rewards/ORM": 0.6624580770730972, + "self_certainty_semantic": -25.6875, + "self_certainty_token": -21.75, + "step": 323 + }, + { + "completion_length": 78.078125, + "epoch": 0.3588039867109635, + "grad_norm": 0.9986751675605774, + "kl": 0.03704833984375, + "learning_rate": 7.975e-07, + "loss": 0.0015258773928508162, + "reward": 2.2912731170654297, + "reward_std": 0.2479892298579216, + "rewards/GDino": 0.8150812387466431, + "rewards/GIT": 0.5762419700622559, + "rewards/HPSv2": 0.2679405212402344, + "rewards/ORM": 0.6320093274116516, + "self_certainty_semantic": -25.9375, + "self_certainty_token": -20.0, + "step": 324 + }, + { + "completion_length": 87.3125, + "epoch": 0.3599114064230343, + "grad_norm": 0.5009543299674988, + "kl": 0.015380859375, + "learning_rate": 7.96875e-07, + "loss": 0.00816858746111393, + "reward": 2.3778595328330994, + "reward_std": 0.43483346700668335, + "rewards/GDino": 0.8713316917419434, + "rewards/GIT": 0.5178200602531433, + "rewards/HPSv2": 0.24491596221923828, + "rewards/ORM": 0.743791937828064, + "self_certainty_semantic": -26.125, + "self_certainty_token": -22.6875, + "step": 325 + }, + { + "completion_length": 80.640625, + "epoch": 0.3610188261351052, + "grad_norm": 4.34770296604459e+17, + "kl": 2.1532835718365184e+16, + "learning_rate": 7.9625e-07, + "loss": 215359592333312.0, + "reward": 2.1344000101089478, + "reward_std": 0.1903739497065544, + "rewards/GDino": 0.8049311637878418, + "rewards/GIT": 0.44934192299842834, + "rewards/HPSv2": 0.270751953125, + "rewards/ORM": 0.609375, + "self_certainty_semantic": -25.875, + "self_certainty_token": -21.6875, + "step": 326 + }, + { + "completion_length": 71.703125, + "epoch": 0.36212624584717606, + "grad_norm": 0.591279149055481, + "kl": 0.0194091796875, + "learning_rate": 7.95625e-07, + "loss": -0.0025365690235048532, + "reward": 2.1500454545021057, + "reward_std": 0.2505127191543579, + "rewards/GDino": 0.6991045475006104, + "rewards/GIT": 0.5611639469861984, + "rewards/HPSv2": 0.27692222595214844, + "rewards/ORM": 0.6128546893596649, + "self_certainty_semantic": -25.8125, + "self_certainty_token": -21.5, + "step": 327 + }, + { + "completion_length": 66.25, + "epoch": 0.36323366555924697, + "grad_norm": 0.49124178290367126, + "kl": 0.0302734375, + "learning_rate": 7.95e-07, + "loss": 0.000966892926953733, + "reward": 2.234583079814911, + "reward_std": 0.24727845191955566, + "rewards/GDino": 0.8250000178813934, + "rewards/GIT": 0.42907945811748505, + "rewards/HPSv2": 0.3059501647949219, + "rewards/ORM": 0.674553394317627, + "self_certainty_semantic": -25.9375, + "self_certainty_token": -22.1875, + "step": 328 + }, + { + "completion_length": 94.390625, + "epoch": 0.3643410852713178, + "grad_norm": 0.926494836807251, + "kl": 0.015045166015625, + "learning_rate": 7.94375e-07, + "loss": 0.00036594929406419396, + "reward": 2.232884407043457, + "reward_std": 0.2973213344812393, + "rewards/GDino": 0.7332743704319, + "rewards/GIT": 0.4874793738126755, + "rewards/HPSv2": 0.2777557373046875, + "rewards/ORM": 0.7343749403953552, + "self_certainty_semantic": -26.0625, + "self_certainty_token": -21.75, + "step": 329 + }, + { + "completion_length": 76.640625, + "epoch": 0.3654485049833887, + "grad_norm": 0.4620281159877777, + "kl": 0.02886962890625, + "learning_rate": 7.937499999999999e-07, + "loss": -0.004971426445990801, + "reward": 2.011380136013031, + "reward_std": 0.32141920924186707, + "rewards/GDino": 0.727248340845108, + "rewards/GIT": 0.3900819420814514, + "rewards/HPSv2": 0.2651176452636719, + "rewards/ORM": 0.6289321780204773, + "self_certainty_semantic": -25.625, + "self_certainty_token": -22.0, + "step": 330 + }, + { + "completion_length": 81.15625, + "epoch": 0.36655592469545956, + "grad_norm": 0.7666285634040833, + "kl": 0.0321044921875, + "learning_rate": 7.931249999999999e-07, + "loss": 0.0011679597664624453, + "reward": 2.653956890106201, + "reward_std": 0.13854296877980232, + "rewards/GDino": 0.885937511920929, + "rewards/GIT": 0.7691447138786316, + "rewards/HPSv2": 0.2644996643066406, + "rewards/ORM": 0.734375, + "self_certainty_semantic": -25.8125, + "self_certainty_token": -21.375, + "step": 331 + }, + { + "completion_length": 88.3125, + "epoch": 0.36766334440753046, + "grad_norm": 0.5758018493652344, + "kl": 0.01702880859375, + "learning_rate": 7.924999999999999e-07, + "loss": -0.0048087649047374725, + "reward": 1.8430108428001404, + "reward_std": 0.2730695307254791, + "rewards/GDino": 0.7293344736099243, + "rewards/GIT": 0.3678126037120819, + "rewards/HPSv2": 0.2587604522705078, + "rewards/ORM": 0.48710331320762634, + "self_certainty_semantic": -25.8125, + "self_certainty_token": -21.875, + "step": 332 + }, + { + "completion_length": 84.390625, + "epoch": 0.3687707641196013, + "grad_norm": 0.9169139266014099, + "kl": 0.0267333984375, + "learning_rate": 7.918749999999999e-07, + "loss": -0.0021060709841549397, + "reward": 2.4026538133621216, + "reward_std": 0.2340044304728508, + "rewards/GDino": 0.81524857878685, + "rewards/GIT": 0.44082213938236237, + "rewards/HPSv2": 0.28846168518066406, + "rewards/ORM": 0.858121246099472, + "self_certainty_semantic": -25.9375, + "self_certainty_token": -21.5, + "step": 333 + }, + { + "completion_length": 76.578125, + "epoch": 0.3698781838316722, + "grad_norm": 0.5896627902984619, + "kl": 0.032470703125, + "learning_rate": 7.912499999999999e-07, + "loss": 0.014832689426839352, + "reward": 2.310631573200226, + "reward_std": 0.39878983795642853, + "rewards/GDino": 0.8589580357074738, + "rewards/GIT": 0.5403054803609848, + "rewards/HPSv2": 0.25203895568847656, + "rewards/ORM": 0.6593290567398071, + "self_certainty_semantic": -25.9375, + "self_certainty_token": -21.5, + "step": 334 + }, + { + "completion_length": 81.125, + "epoch": 0.3709856035437431, + "grad_norm": 0.6335277557373047, + "kl": 0.02618408203125, + "learning_rate": 7.90625e-07, + "loss": -0.005854415707290173, + "reward": 1.9042538404464722, + "reward_std": 0.37834763526916504, + "rewards/GDino": 0.8026451170444489, + "rewards/GIT": 0.3777245283126831, + "rewards/HPSv2": 0.270538330078125, + "rewards/ORM": 0.45334580540657043, + "self_certainty_semantic": -25.8125, + "self_certainty_token": -21.5, + "step": 335 + }, + { + "completion_length": 86.09375, + "epoch": 0.37209302325581395, + "grad_norm": 0.48027098178863525, + "kl": 0.0247802734375, + "learning_rate": 7.9e-07, + "loss": -0.005090413382276893, + "reward": 2.291101813316345, + "reward_std": 0.407858744263649, + "rewards/GDino": 0.7903645634651184, + "rewards/GIT": 0.7357276082038879, + "rewards/HPSv2": 0.2775554656982422, + "rewards/ORM": 0.48745404183864594, + "self_certainty_semantic": -26.0625, + "self_certainty_token": -21.6875, + "step": 336 + }, + { + "completion_length": 78.0, + "epoch": 0.37320044296788485, + "grad_norm": 1.5485831499099731, + "kl": 0.022857666015625, + "learning_rate": 7.893750000000001e-07, + "loss": -0.0022913700668141246, + "reward": 2.0822601914405823, + "reward_std": 0.3321680426597595, + "rewards/GDino": 0.7484375536441803, + "rewards/GIT": 0.39538896083831787, + "rewards/HPSv2": 0.2601299285888672, + "rewards/ORM": 0.6783038675785065, + "self_certainty_semantic": -25.9375, + "self_certainty_token": -21.625, + "step": 337 + }, + { + "completion_length": 76.46875, + "epoch": 0.3743078626799557, + "grad_norm": 0.4994075894355774, + "kl": 0.02423095703125, + "learning_rate": 7.8875e-07, + "loss": 0.00693045777734369, + "reward": 1.9913100004196167, + "reward_std": 0.4366358071565628, + "rewards/GDino": 0.7917910814285278, + "rewards/GIT": 0.5114566683769226, + "rewards/HPSv2": 0.26731109619140625, + "rewards/ORM": 0.42075108736753464, + "self_certainty_semantic": -25.875, + "self_certainty_token": -21.3125, + "step": 338 + }, + { + "completion_length": 85.734375, + "epoch": 0.3754152823920266, + "grad_norm": 0.6367454528808594, + "kl": 0.033203125, + "learning_rate": 7.88125e-07, + "loss": 0.005203233566135168, + "reward": 2.3903461694717407, + "reward_std": 0.3316587954759598, + "rewards/GDino": 0.7497395575046539, + "rewards/GIT": 0.5981788337230682, + "rewards/HPSv2": 0.27332305908203125, + "rewards/ORM": 0.7691046893596649, + "self_certainty_semantic": -25.8125, + "self_certainty_token": -22.0625, + "step": 339 + }, + { + "completion_length": 73.515625, + "epoch": 0.37652270210409744, + "grad_norm": 0.44640496373176575, + "kl": 0.02545166015625, + "learning_rate": 7.875e-07, + "loss": -0.0023915348574519157, + "reward": 2.3552005290985107, + "reward_std": 0.34541261196136475, + "rewards/GDino": 0.7578125, + "rewards/GIT": 0.4575464129447937, + "rewards/HPSv2": 0.27921295166015625, + "rewards/ORM": 0.8606287837028503, + "self_certainty_semantic": -25.75, + "self_certainty_token": -21.375, + "step": 340 + }, + { + "completion_length": 76.9375, + "epoch": 0.37763012181616834, + "grad_norm": 0.7208606600761414, + "kl": 0.04327392578125, + "learning_rate": 7.86875e-07, + "loss": 0.006299447733908892, + "reward": 1.769343376159668, + "reward_std": 0.4668227732181549, + "rewards/GDino": 0.6656007468700409, + "rewards/GIT": 0.23980124294757843, + "rewards/HPSv2": 0.25531768798828125, + "rewards/ORM": 0.6086236536502838, + "self_certainty_semantic": -25.5625, + "self_certainty_token": -21.375, + "step": 341 + }, + { + "completion_length": 78.59375, + "epoch": 0.3787375415282392, + "grad_norm": 0.4430847465991974, + "kl": 0.03216552734375, + "learning_rate": 7.8625e-07, + "loss": -0.005237360019236803, + "reward": 2.1384899616241455, + "reward_std": 0.3752764165401459, + "rewards/GDino": 0.7345833480358124, + "rewards/GIT": 0.4388365373015404, + "rewards/HPSv2": 0.2710113525390625, + "rewards/ORM": 0.694058746099472, + "self_certainty_semantic": -25.75, + "self_certainty_token": -21.9375, + "step": 342 + }, + { + "completion_length": 65.34375, + "epoch": 0.3798449612403101, + "grad_norm": 0.4848972260951996, + "kl": 0.0263671875, + "learning_rate": 7.85625e-07, + "loss": 0.013073518872261047, + "reward": 2.126249670982361, + "reward_std": 0.38274161517620087, + "rewards/GDino": 0.8289884030818939, + "rewards/GIT": 0.2857021316885948, + "rewards/HPSv2": 0.2662925720214844, + "rewards/ORM": 0.7452665567398071, + "self_certainty_semantic": -25.8125, + "self_certainty_token": -21.375, + "step": 343 + }, + { + "completion_length": 75.0, + "epoch": 0.38095238095238093, + "grad_norm": 2.974632501602173, + "kl": 0.0523681640625, + "learning_rate": 7.85e-07, + "loss": -0.007122250506654382, + "reward": 2.442666530609131, + "reward_std": 0.3234139457345009, + "rewards/GDino": 0.830729216337204, + "rewards/GIT": 0.7763712406158447, + "rewards/HPSv2": 0.2387371063232422, + "rewards/ORM": 0.5968290418386459, + "self_certainty_semantic": -25.8125, + "self_certainty_token": -22.0625, + "step": 344 + }, + { + "completion_length": 87.53125, + "epoch": 0.38205980066445183, + "grad_norm": 0.3997608721256256, + "kl": 0.02191162109375, + "learning_rate": 7.84375e-07, + "loss": 0.009387207683175802, + "reward": 1.6372931599617004, + "reward_std": 0.5579112768173218, + "rewards/GDino": 0.6632326543331146, + "rewards/GIT": 0.2669598236680031, + "rewards/HPSv2": 0.2503662109375, + "rewards/ORM": 0.4567345529794693, + "self_certainty_semantic": -25.5625, + "self_certainty_token": -22.25, + "step": 345 + }, + { + "completion_length": 78.671875, + "epoch": 0.3831672203765227, + "grad_norm": 0.5440374612808228, + "kl": 0.023681640625, + "learning_rate": 7.837499999999999e-07, + "loss": -0.004060751176439226, + "reward": 2.5331764221191406, + "reward_std": 0.30825961381196976, + "rewards/GDino": 0.7956249713897705, + "rewards/GIT": 0.6618549525737762, + "rewards/HPSv2": 0.26337623596191406, + "rewards/ORM": 0.8123202323913574, + "self_certainty_semantic": -25.8125, + "self_certainty_token": -22.25, + "step": 346 + }, + { + "completion_length": 78.859375, + "epoch": 0.3842746400885936, + "grad_norm": 0.505639374256134, + "kl": 0.027587890625, + "learning_rate": 7.831249999999999e-07, + "loss": 0.006256320746615529, + "reward": 1.9418827891349792, + "reward_std": 0.39945825934410095, + "rewards/GDino": 0.6593631207942963, + "rewards/GIT": 0.31888166069984436, + "rewards/HPSv2": 0.2855548858642578, + "rewards/ORM": 0.678083062171936, + "self_certainty_semantic": -25.8125, + "self_certainty_token": -22.3125, + "step": 347 + }, + { + "completion_length": 83.796875, + "epoch": 0.3853820598006645, + "grad_norm": 0.6219062209129333, + "kl": 0.02642822265625, + "learning_rate": 7.824999999999999e-07, + "loss": 0.004094981588423252, + "reward": 2.0063390731811523, + "reward_std": 0.3849373310804367, + "rewards/GDino": 0.7137661874294281, + "rewards/GIT": 0.38513660430908203, + "rewards/HPSv2": 0.2584552764892578, + "rewards/ORM": 0.6489809155464172, + "self_certainty_semantic": -26.0625, + "self_certainty_token": -20.5, + "step": 348 + }, + { + "completion_length": 92.359375, + "epoch": 0.3864894795127353, + "grad_norm": 0.6767128705978394, + "kl": 0.0465087890625, + "learning_rate": 7.818749999999999e-07, + "loss": 0.00501623225864023, + "reward": 1.9770677089691162, + "reward_std": 0.3419999033212662, + "rewards/GDino": 0.7295942902565002, + "rewards/GIT": 0.3469092845916748, + "rewards/HPSv2": 0.2744102478027344, + "rewards/ORM": 0.6261538863182068, + "self_certainty_semantic": -25.875, + "self_certainty_token": -22.625, + "step": 349 + }, + { + "completion_length": 85.40625, + "epoch": 0.3875968992248062, + "grad_norm": 0.904482364654541, + "kl": 0.0482177734375, + "learning_rate": 7.812499999999999e-07, + "loss": 0.008412390714511275, + "reward": 2.1975533962249756, + "reward_std": 0.3165140748023987, + "rewards/GDino": 0.7537752985954285, + "rewards/GIT": 0.41514359414577484, + "rewards/HPSv2": 0.28171348571777344, + "rewards/ORM": 0.7469209432601929, + "self_certainty_semantic": -25.9375, + "self_certainty_token": -21.375, + "step": 350 + }, + { + "completion_length": 82.625, + "epoch": 0.38870431893687707, + "grad_norm": 0.39913347363471985, + "kl": 0.0225830078125, + "learning_rate": 7.806249999999999e-07, + "loss": 0.0012628886615857482, + "reward": 2.383103609085083, + "reward_std": 0.39389656484127045, + "rewards/GDino": 0.855949878692627, + "rewards/GIT": 0.4103764295578003, + "rewards/HPSv2": 0.26708984375, + "rewards/ORM": 0.8496872782707214, + "self_certainty_semantic": -25.75, + "self_certainty_token": -21.3125, + "step": 351 + }, + { + "completion_length": 84.828125, + "epoch": 0.38981173864894797, + "grad_norm": 0.6845752000808716, + "kl": 0.03411865234375, + "learning_rate": 7.799999999999999e-07, + "loss": 0.012704014778137207, + "reward": 2.0514729022979736, + "reward_std": 0.38871151208877563, + "rewards/GDino": 0.7418749928474426, + "rewards/GIT": 0.30121954530477524, + "rewards/HPSv2": 0.2696704864501953, + "rewards/ORM": 0.7387077808380127, + "self_certainty_semantic": -25.875, + "self_certainty_token": -22.4375, + "step": 352 + }, + { + "completion_length": 76.53125, + "epoch": 0.3909191583610188, + "grad_norm": 0.5380039215087891, + "kl": 0.0360107421875, + "learning_rate": 7.793750000000001e-07, + "loss": -0.00018121302127838135, + "reward": 1.9169389605522156, + "reward_std": 0.33840206265449524, + "rewards/GDino": 0.5888020992279053, + "rewards/GIT": 0.3554147705435753, + "rewards/HPSv2": 0.27184486389160156, + "rewards/ORM": 0.7008772194385529, + "self_certainty_semantic": -25.6875, + "self_certainty_token": -21.75, + "step": 353 + }, + { + "completion_length": 71.3125, + "epoch": 0.3920265780730897, + "grad_norm": 0.6473866105079651, + "kl": 0.0369873046875, + "learning_rate": 7.787500000000001e-07, + "loss": 0.009591558366082609, + "reward": 2.060921609401703, + "reward_std": 0.5267337262630463, + "rewards/GDino": 0.6770313084125519, + "rewards/GIT": 0.3380723297595978, + "rewards/HPSv2": 0.28131675720214844, + "rewards/ORM": 0.7645011246204376, + "self_certainty_semantic": -25.75, + "self_certainty_token": -23.0, + "step": 354 + }, + { + "completion_length": 73.375, + "epoch": 0.39313399778516056, + "grad_norm": 1.9103351831436157, + "kl": 0.03515625, + "learning_rate": 7.78125e-07, + "loss": 0.009471733821555972, + "reward": 2.468719720840454, + "reward_std": 0.3037705421447754, + "rewards/GDino": 0.824194073677063, + "rewards/GIT": 0.6634758412837982, + "rewards/HPSv2": 0.2754173278808594, + "rewards/ORM": 0.7056325078010559, + "self_certainty_semantic": -25.875, + "self_certainty_token": -22.25, + "step": 355 + }, + { + "completion_length": 81.640625, + "epoch": 0.39424141749723146, + "grad_norm": 0.47359681129455566, + "kl": 0.0191650390625, + "learning_rate": 7.775e-07, + "loss": 0.01774417981505394, + "reward": 2.1485215425491333, + "reward_std": 0.37546610832214355, + "rewards/GDino": 0.7694036066532135, + "rewards/GIT": 0.2857285439968109, + "rewards/HPSv2": 0.28088951110839844, + "rewards/ORM": 0.8125, + "self_certainty_semantic": -25.875, + "self_certainty_token": -22.5, + "step": 356 + }, + { + "completion_length": 82.78125, + "epoch": 0.3953488372093023, + "grad_norm": 0.4700465500354767, + "kl": 0.028564453125, + "learning_rate": 7.76875e-07, + "loss": -0.005447798175737262, + "reward": 1.9955376386642456, + "reward_std": 0.3978916108608246, + "rewards/GDino": 0.7125872671604156, + "rewards/GIT": 0.29513833671808243, + "rewards/HPSv2": 0.2846870422363281, + "rewards/ORM": 0.703125, + "self_certainty_semantic": -26.0625, + "self_certainty_token": -22.3125, + "step": 357 + }, + { + "completion_length": 84.15625, + "epoch": 0.3964562569213732, + "grad_norm": 0.5537763237953186, + "kl": 0.04052734375, + "learning_rate": 7.7625e-07, + "loss": -0.005253390525467694, + "reward": 2.2486732006073, + "reward_std": 0.33913009613752365, + "rewards/GDino": 0.7710938155651093, + "rewards/GIT": 0.5208555310964584, + "rewards/HPSv2": 0.2630157470703125, + "rewards/ORM": 0.6937080323696136, + "self_certainty_semantic": -26.0, + "self_certainty_token": -20.4375, + "step": 358 + }, + { + "completion_length": 78.84375, + "epoch": 0.39756367663344405, + "grad_norm": 0.4036509692668915, + "kl": 0.02471923828125, + "learning_rate": 7.75625e-07, + "loss": 0.004020060820039362, + "reward": 1.3866900205612183, + "reward_std": 0.45843446254730225, + "rewards/GDino": 0.5991981625556946, + "rewards/GIT": 0.18800346553325653, + "rewards/HPSv2": 0.24882888793945312, + "rewards/ORM": 0.35065943002700806, + "self_certainty_semantic": -25.9375, + "self_certainty_token": -21.9375, + "step": 359 + }, + { + "completion_length": 75.21875, + "epoch": 0.39867109634551495, + "grad_norm": 0.5162832736968994, + "kl": 0.021728515625, + "learning_rate": 7.75e-07, + "loss": 0.0037150949938222766, + "reward": 1.9392863512039185, + "reward_std": 0.2793232351541519, + "rewards/GDino": 0.701914981007576, + "rewards/GIT": 0.5795804336667061, + "rewards/HPSv2": 0.27355384826660156, + "rewards/ORM": 0.3842370957136154, + "self_certainty_semantic": -25.75, + "self_certainty_token": -21.0, + "step": 360 + }, + { + "completion_length": 76.4375, + "epoch": 0.3997785160575858, + "grad_norm": 0.45564430952072144, + "kl": 0.02691650390625, + "learning_rate": 7.74375e-07, + "loss": 0.0029305103234946728, + "reward": 2.0641271471977234, + "reward_std": 0.368464857339859, + "rewards/GDino": 0.7391301393508911, + "rewards/GIT": 0.3999723941087723, + "rewards/HPSv2": 0.27350807189941406, + "rewards/ORM": 0.6515165269374847, + "self_certainty_semantic": -25.9375, + "self_certainty_token": -20.6875, + "step": 361 + }, + { + "completion_length": 83.953125, + "epoch": 0.4008859357696567, + "grad_norm": 0.9035529494285583, + "kl": 0.0772705078125, + "learning_rate": 7.7375e-07, + "loss": 0.003972394741140306, + "reward": 1.934463918209076, + "reward_std": 0.4396786019206047, + "rewards/GDino": 0.7246715724468231, + "rewards/GIT": 0.46111059188842773, + "rewards/HPSv2": 0.28046226501464844, + "rewards/ORM": 0.46821947395801544, + "self_certainty_semantic": -25.875, + "self_certainty_token": -21.75, + "step": 362 + }, + { + "completion_length": 92.75, + "epoch": 0.4019933554817276, + "grad_norm": 0.5282062888145447, + "kl": 0.019317626953125, + "learning_rate": 7.731249999999999e-07, + "loss": -0.008066414389759302, + "reward": 2.07886004447937, + "reward_std": 0.4132898300886154, + "rewards/GDino": 0.7536458075046539, + "rewards/GIT": 0.4562801867723465, + "rewards/HPSv2": 0.2626380920410156, + "rewards/ORM": 0.6062959432601929, + "self_certainty_semantic": -25.75, + "self_certainty_token": -20.8125, + "step": 363 + }, + { + "completion_length": 83.484375, + "epoch": 0.40310077519379844, + "grad_norm": 0.3879307508468628, + "kl": 0.04705810546875, + "learning_rate": 7.724999999999999e-07, + "loss": -0.0018501741578802466, + "reward": 2.4320976734161377, + "reward_std": 0.2695165127515793, + "rewards/GDino": 0.796875, + "rewards/GIT": 0.5824678391218185, + "rewards/HPSv2": 0.28354835510253906, + "rewards/ORM": 0.7692064642906189, + "self_certainty_semantic": -25.8125, + "self_certainty_token": -21.75, + "step": 364 + }, + { + "completion_length": 85.34375, + "epoch": 0.40420819490586934, + "grad_norm": 0.5758140683174133, + "kl": 0.0352783203125, + "learning_rate": 7.718749999999999e-07, + "loss": 0.0057990700006484985, + "reward": 2.019951283931732, + "reward_std": 0.30692145973443985, + "rewards/GDino": 0.6002083420753479, + "rewards/GIT": 0.28720255196094513, + "rewards/HPSv2": 0.27191162109375, + "rewards/ORM": 0.8606287240982056, + "self_certainty_semantic": -25.6875, + "self_certainty_token": -21.6875, + "step": 365 + }, + { + "completion_length": 71.171875, + "epoch": 0.4053156146179402, + "grad_norm": 25.993791580200195, + "kl": 0.9073486328125, + "learning_rate": 7.712499999999999e-07, + "loss": 0.010981484781950712, + "reward": 1.686427891254425, + "reward_std": 0.507771372795105, + "rewards/GDino": 0.6281927824020386, + "rewards/GIT": 0.19865846633911133, + "rewards/HPSv2": 0.2666797637939453, + "rewards/ORM": 0.5928968787193298, + "self_certainty_semantic": -25.875, + "self_certainty_token": -23.375, + "step": 366 + }, + { + "completion_length": 73.765625, + "epoch": 0.4064230343300111, + "grad_norm": 2.63863205909729, + "kl": 0.03125, + "learning_rate": 7.706249999999999e-07, + "loss": 0.00012735230848193169, + "reward": 2.2389276027679443, + "reward_std": 0.4072023332118988, + "rewards/GDino": 0.7541632354259491, + "rewards/GIT": 0.45902618765830994, + "rewards/HPSv2": 0.28417205810546875, + "rewards/ORM": 0.741566002368927, + "self_certainty_semantic": -25.875, + "self_certainty_token": -22.25, + "step": 367 + }, + { + "completion_length": 72.296875, + "epoch": 0.40753045404208194, + "grad_norm": 0.935273289680481, + "kl": 0.02752685546875, + "learning_rate": 7.699999999999999e-07, + "loss": 0.0032109934836626053, + "reward": 2.2100645303726196, + "reward_std": 0.24064482748508453, + "rewards/GDino": 0.8498496413230896, + "rewards/GIT": 0.5447873771190643, + "rewards/HPSv2": 0.2853813171386719, + "rewards/ORM": 0.5300461947917938, + "self_certainty_semantic": -25.8125, + "self_certainty_token": -21.875, + "step": 368 + }, + { + "completion_length": 77.703125, + "epoch": 0.40863787375415284, + "grad_norm": 0.8915330767631531, + "kl": 0.1737060546875, + "learning_rate": 7.69375e-07, + "loss": -0.004847385222092271, + "reward": 2.4846267104148865, + "reward_std": 0.34741681814193726, + "rewards/GDino": 0.8923705220222473, + "rewards/GIT": 0.6399442255496979, + "rewards/HPSv2": 0.2816905975341797, + "rewards/ORM": 0.6706212610006332, + "self_certainty_semantic": -26.0625, + "self_certainty_token": -22.5625, + "step": 369 + }, + { + "completion_length": 78.828125, + "epoch": 0.4097452934662237, + "grad_norm": 0.4684216380119324, + "kl": 0.03460693359375, + "learning_rate": 7.6875e-07, + "loss": 0.006017133709974587, + "reward": 2.400377631187439, + "reward_std": 0.3661707490682602, + "rewards/GDino": 0.7559739351272583, + "rewards/GIT": 0.5203657299280167, + "rewards/HPSv2": 0.2681427001953125, + "rewards/ORM": 0.8558953106403351, + "self_certainty_semantic": -26.0, + "self_certainty_token": -21.5, + "step": 370 + }, + { + "completion_length": 81.90625, + "epoch": 0.4108527131782946, + "grad_norm": 0.39840996265411377, + "kl": 0.0191650390625, + "learning_rate": 7.68125e-07, + "loss": 0.007917450740933418, + "reward": 2.394679307937622, + "reward_std": 0.26683831214904785, + "rewards/GDino": 0.8869791626930237, + "rewards/GIT": 0.7277960479259491, + "rewards/HPSv2": 0.2607994079589844, + "rewards/ORM": 0.5191046893596649, + "self_certainty_semantic": -26.0, + "self_certainty_token": -22.0625, + "step": 371 + }, + { + "completion_length": 68.34375, + "epoch": 0.4119601328903654, + "grad_norm": 0.5618812441825867, + "kl": 0.02484130859375, + "learning_rate": 7.675e-07, + "loss": 0.0053215608932077885, + "reward": 1.9621981978416443, + "reward_std": 0.3503369837999344, + "rewards/GDino": 0.6614684462547302, + "rewards/GIT": 0.44001519680023193, + "rewards/HPSv2": 0.2732524871826172, + "rewards/ORM": 0.5874620378017426, + "self_certainty_semantic": -25.75, + "self_certainty_token": -21.3125, + "step": 372 + }, + { + "completion_length": 77.453125, + "epoch": 0.4130675526024363, + "grad_norm": 0.44254979491233826, + "kl": 0.0338134765625, + "learning_rate": 7.66875e-07, + "loss": 0.012260682880878448, + "reward": 1.9219316244125366, + "reward_std": 0.4732651710510254, + "rewards/GDino": 0.720096230506897, + "rewards/GIT": 0.2833433449268341, + "rewards/HPSv2": 0.27478790283203125, + "rewards/ORM": 0.6437040567398071, + "self_certainty_semantic": -25.9375, + "self_certainty_token": -21.9375, + "step": 373 + }, + { + "completion_length": 77.109375, + "epoch": 0.4141749723145072, + "grad_norm": 0.5283539295196533, + "kl": 0.01953125, + "learning_rate": 7.6625e-07, + "loss": 0.00025802222080528736, + "reward": 2.1784894466400146, + "reward_std": 0.37603603303432465, + "rewards/GDino": 0.8346986174583435, + "rewards/GIT": 0.5240668132901192, + "rewards/HPSv2": 0.2767810821533203, + "rewards/ORM": 0.5429428815841675, + "self_certainty_semantic": -25.8125, + "self_certainty_token": -21.875, + "step": 374 + }, + { + "completion_length": 65.1875, + "epoch": 0.4152823920265781, + "grad_norm": 2.0193865299224854, + "kl": 0.0350341796875, + "learning_rate": 7.65625e-07, + "loss": 0.00023457035422325134, + "reward": 2.463273048400879, + "reward_std": 0.16159752011299133, + "rewards/GDino": 0.8671875, + "rewards/GIT": 0.7136387228965759, + "rewards/HPSv2": 0.2778053283691406, + "rewards/ORM": 0.6046415567398071, + "self_certainty_semantic": -25.9375, + "self_certainty_token": -22.125, + "step": 375 + }, + { + "completion_length": 72.15625, + "epoch": 0.416389811738649, + "grad_norm": 0.6591690182685852, + "kl": 0.0289306640625, + "learning_rate": 7.65e-07, + "loss": -0.0017245884519070387, + "reward": 2.2452900409698486, + "reward_std": 0.490168958902359, + "rewards/GDino": 0.7729970812797546, + "rewards/GIT": 0.4367716535925865, + "rewards/HPSv2": 0.25975608825683594, + "rewards/ORM": 0.7757652103900909, + "self_certainty_semantic": -25.875, + "self_certainty_token": -21.625, + "step": 376 + }, + { + "completion_length": 76.203125, + "epoch": 0.4174972314507198, + "grad_norm": 0.5766474604606628, + "kl": 0.02191162109375, + "learning_rate": 7.64375e-07, + "loss": -0.0004694787785410881, + "reward": 1.9068648219108582, + "reward_std": 0.16421005129814148, + "rewards/GDino": 0.6833183169364929, + "rewards/GIT": 0.3441741615533829, + "rewards/HPSv2": 0.26651763916015625, + "rewards/ORM": 0.6128546595573425, + "self_certainty_semantic": -26.0, + "self_certainty_token": -21.0, + "step": 377 + }, + { + "completion_length": 86.546875, + "epoch": 0.4186046511627907, + "grad_norm": 0.43956464529037476, + "kl": 0.02410888671875, + "learning_rate": 7.6375e-07, + "loss": 0.004958470817655325, + "reward": 2.4844208359718323, + "reward_std": 0.3209882155060768, + "rewards/GDino": 0.7713975608348846, + "rewards/GIT": 0.5327824056148529, + "rewards/HPSv2": 0.28255462646484375, + "rewards/ORM": 0.8976861834526062, + "self_certainty_semantic": -26.0, + "self_certainty_token": -21.0625, + "step": 378 + }, + { + "completion_length": 82.75, + "epoch": 0.41971207087486156, + "grad_norm": 1.2485466003417969, + "kl": 0.05816650390625, + "learning_rate": 7.63125e-07, + "loss": -0.002994309877976775, + "reward": 2.16240918636322, + "reward_std": 0.2650664523243904, + "rewards/GDino": 0.776936948299408, + "rewards/GIT": 0.5451656579971313, + "rewards/HPSv2": 0.2638359069824219, + "rewards/ORM": 0.5764705985784531, + "self_certainty_semantic": -25.8125, + "self_certainty_token": -22.0, + "step": 379 + }, + { + "completion_length": 85.890625, + "epoch": 0.42081949058693247, + "grad_norm": 0.6214731931686401, + "kl": 0.0277099609375, + "learning_rate": 7.624999999999999e-07, + "loss": -0.00421567028388381, + "reward": 1.8477151989936829, + "reward_std": 0.32977864146232605, + "rewards/GDino": 0.7990064024925232, + "rewards/GIT": 0.5198927968740463, + "rewards/HPSv2": 0.28627777099609375, + "rewards/ORM": 0.2425382286310196, + "self_certainty_semantic": -25.9375, + "self_certainty_token": -21.5625, + "step": 380 + }, + { + "completion_length": 84.375, + "epoch": 0.4219269102990033, + "grad_norm": 0.7152870893478394, + "kl": 0.02960205078125, + "learning_rate": 7.618749999999999e-07, + "loss": -0.00914668245241046, + "reward": 2.547330617904663, + "reward_std": 0.318975493311882, + "rewards/GDino": 0.8286458253860474, + "rewards/GIT": 0.724987268447876, + "rewards/HPSv2": 0.2702140808105469, + "rewards/ORM": 0.7234834134578705, + "self_certainty_semantic": -25.875, + "self_certainty_token": -20.4375, + "step": 381 + }, + { + "completion_length": 87.328125, + "epoch": 0.4230343300110742, + "grad_norm": 0.5311893820762634, + "kl": 0.031005859375, + "learning_rate": 7.612499999999999e-07, + "loss": -0.01361012738198042, + "reward": 2.263441562652588, + "reward_std": 0.2558389827609062, + "rewards/GDino": 0.7953124642372131, + "rewards/GIT": 0.5018891543149948, + "rewards/HPSv2": 0.27873992919921875, + "rewards/ORM": 0.6875, + "self_certainty_semantic": -26.0, + "self_certainty_token": -20.5, + "step": 382 + }, + { + "completion_length": 76.890625, + "epoch": 0.42414174972314506, + "grad_norm": 0.6475590467453003, + "kl": 0.02301025390625, + "learning_rate": 7.606249999999999e-07, + "loss": -0.001263815094716847, + "reward": 2.1975873708724976, + "reward_std": 0.3748020529747009, + "rewards/GDino": 0.7668351829051971, + "rewards/GIT": 0.5400517359375954, + "rewards/HPSv2": 0.2606163024902344, + "rewards/ORM": 0.6300841569900513, + "self_certainty_semantic": -25.9375, + "self_certainty_token": -22.0625, + "step": 383 + }, + { + "completion_length": 83.34375, + "epoch": 0.42524916943521596, + "grad_norm": 0.507254421710968, + "kl": 0.02801513671875, + "learning_rate": 7.599999999999999e-07, + "loss": -0.0008333073928952217, + "reward": 2.077458620071411, + "reward_std": 0.3855893015861511, + "rewards/GDino": 0.7702149152755737, + "rewards/GIT": 0.5419944673776627, + "rewards/HPSv2": 0.2733325958251953, + "rewards/ORM": 0.4919167459011078, + "self_certainty_semantic": -25.875, + "self_certainty_token": -21.9375, + "step": 384 + }, + { + "completion_length": 85.40625, + "epoch": 0.4263565891472868, + "grad_norm": 0.4501701593399048, + "kl": 0.03662109375, + "learning_rate": 7.59375e-07, + "loss": 0.005623898236081004, + "reward": 2.138138771057129, + "reward_std": 0.31487397849559784, + "rewards/GDino": 0.7385719418525696, + "rewards/GIT": 0.3306322917342186, + "rewards/HPSv2": 0.2595634460449219, + "rewards/ORM": 0.8093710243701935, + "self_certainty_semantic": -25.8125, + "self_certainty_token": -21.25, + "step": 385 + }, + { + "completion_length": 74.640625, + "epoch": 0.4274640088593577, + "grad_norm": 1.0451889038085938, + "kl": 0.03448486328125, + "learning_rate": 7.5875e-07, + "loss": 0.008310600649565458, + "reward": 2.5433003902435303, + "reward_std": 0.3450489193201065, + "rewards/GDino": 0.8338541686534882, + "rewards/GIT": 0.5790800303220749, + "rewards/HPSv2": 0.2816619873046875, + "rewards/ORM": 0.8487042486667633, + "self_certainty_semantic": -25.875, + "self_certainty_token": -21.9375, + "step": 386 + }, + { + "completion_length": 73.375, + "epoch": 0.42857142857142855, + "grad_norm": 0.5248279571533203, + "kl": 0.0316162109375, + "learning_rate": 7.58125e-07, + "loss": -0.004152324283495545, + "reward": 2.282692015171051, + "reward_std": 0.3422551453113556, + "rewards/GDino": 0.8517140746116638, + "rewards/GIT": 0.49808675050735474, + "rewards/HPSv2": 0.2647666931152344, + "rewards/ORM": 0.6681245267391205, + "self_certainty_semantic": -25.875, + "self_certainty_token": -21.1875, + "step": 387 + }, + { + "completion_length": 74.078125, + "epoch": 0.42967884828349945, + "grad_norm": 0.38542476296424866, + "kl": 0.013763427734375, + "learning_rate": 7.575e-07, + "loss": -0.010505566373467445, + "reward": 2.6636276245117188, + "reward_std": 0.2106212005019188, + "rewards/GDino": 0.8841937184333801, + "rewards/GIT": 0.7084439396858215, + "rewards/HPSv2": 0.2803230285644531, + "rewards/ORM": 0.790666937828064, + "self_certainty_semantic": -25.8125, + "self_certainty_token": -22.25, + "step": 388 + }, + { + "completion_length": 76.390625, + "epoch": 0.43078626799557035, + "grad_norm": 0.6815649271011353, + "kl": 0.03594970703125, + "learning_rate": 7.56875e-07, + "loss": -0.0032428253907710314, + "reward": 2.090184509754181, + "reward_std": 0.5343265533447266, + "rewards/GDino": 0.7562500238418579, + "rewards/GIT": 0.44777335971593857, + "rewards/HPSv2": 0.26241493225097656, + "rewards/ORM": 0.623746246099472, + "self_certainty_semantic": -25.875, + "self_certainty_token": -21.25, + "step": 389 + }, + { + "completion_length": 78.015625, + "epoch": 0.4318936877076412, + "grad_norm": 0.4324207901954651, + "kl": 0.01995849609375, + "learning_rate": 7.5625e-07, + "loss": -1.2263190001249313e-06, + "reward": 2.483983278274536, + "reward_std": 0.22743677347898483, + "rewards/GDino": 0.8252314329147339, + "rewards/GIT": 0.6573226451873779, + "rewards/HPSv2": 0.26531982421875, + "rewards/ORM": 0.7361093312501907, + "self_certainty_semantic": -26.0, + "self_certainty_token": -21.25, + "step": 390 + }, + { + "completion_length": 78.890625, + "epoch": 0.4330011074197121, + "grad_norm": 0.40318140387535095, + "kl": 0.0306396484375, + "learning_rate": 7.55625e-07, + "loss": 0.01058862719219178, + "reward": 2.4253766536712646, + "reward_std": 0.247165247797966, + "rewards/GDino": 0.9132010638713837, + "rewards/GIT": 0.6424136161804199, + "rewards/HPSv2": 0.25565338134765625, + "rewards/ORM": 0.6141084432601929, + "self_certainty_semantic": -25.8125, + "self_certainty_token": -22.375, + "step": 391 + }, + { + "completion_length": 87.84375, + "epoch": 0.43410852713178294, + "grad_norm": 0.506926417350769, + "kl": 0.02825927734375, + "learning_rate": 7.55e-07, + "loss": 0.005374419270083308, + "reward": 2.0307316184043884, + "reward_std": 0.41478703916072845, + "rewards/GDino": 0.7457704544067383, + "rewards/GIT": 0.3928220123052597, + "rewards/HPSv2": 0.2746009826660156, + "rewards/ORM": 0.617538183927536, + "self_certainty_semantic": -25.8125, + "self_certainty_token": -21.4375, + "step": 392 + }, + { + "completion_length": 68.0, + "epoch": 0.43521594684385384, + "grad_norm": 0.4594443142414093, + "kl": 0.037353515625, + "learning_rate": 7.54375e-07, + "loss": 0.003534393385052681, + "reward": 2.255183219909668, + "reward_std": 0.19336728006601334, + "rewards/GDino": 0.8642210066318512, + "rewards/GIT": 0.47932907938957214, + "rewards/HPSv2": 0.28462791442871094, + "rewards/ORM": 0.6270051002502441, + "self_certainty_semantic": -25.75, + "self_certainty_token": -22.875, + "step": 393 + }, + { + "completion_length": 82.0, + "epoch": 0.4363233665559247, + "grad_norm": 0.5013278126716614, + "kl": 0.02581787109375, + "learning_rate": 7.5375e-07, + "loss": -0.0056536816991865635, + "reward": 2.371790647506714, + "reward_std": 0.19828242808580399, + "rewards/GDino": 0.8962720036506653, + "rewards/GIT": 0.4048049747943878, + "rewards/HPSv2": 0.2725849151611328, + "rewards/ORM": 0.798128753900528, + "self_certainty_semantic": -26.0, + "self_certainty_token": -23.0625, + "step": 394 + }, + { + "completion_length": 79.140625, + "epoch": 0.4374307862679956, + "grad_norm": 0.7290600538253784, + "kl": 0.071044921875, + "learning_rate": 7.53125e-07, + "loss": -0.0031574114691466093, + "reward": 2.314516067504883, + "reward_std": 0.3586508333683014, + "rewards/GDino": 0.8972340226173401, + "rewards/GIT": 0.5144131779670715, + "rewards/HPSv2": 0.27438926696777344, + "rewards/ORM": 0.6284796893596649, + "self_certainty_semantic": -25.9375, + "self_certainty_token": -21.1875, + "step": 395 + }, + { + "completion_length": 76.109375, + "epoch": 0.43853820598006643, + "grad_norm": 0.6519272327423096, + "kl": 0.0380859375, + "learning_rate": 7.524999999999999e-07, + "loss": -0.0067711935844272375, + "reward": 2.2616201043128967, + "reward_std": 0.38303279876708984, + "rewards/GDino": 0.8086700439453125, + "rewards/GIT": 0.43244993686676025, + "rewards/HPSv2": 0.2732067108154297, + "rewards/ORM": 0.7472934424877167, + "self_certainty_semantic": -25.8125, + "self_certainty_token": -21.1875, + "step": 396 + }, + { + "completion_length": 79.28125, + "epoch": 0.43964562569213733, + "grad_norm": 0.5021886229515076, + "kl": 0.024169921875, + "learning_rate": 7.518749999999999e-07, + "loss": 0.004822302144020796, + "reward": 2.49921977519989, + "reward_std": 0.35611972212791443, + "rewards/GDino": 0.884239137172699, + "rewards/GIT": 0.6691879034042358, + "rewards/HPSv2": 0.2534294128417969, + "rewards/ORM": 0.6923633217811584, + "self_certainty_semantic": -25.875, + "self_certainty_token": -21.6875, + "step": 397 + }, + { + "completion_length": 71.9375, + "epoch": 0.4407530454042082, + "grad_norm": 0.7414788007736206, + "kl": 0.05810546875, + "learning_rate": 7.512499999999999e-07, + "loss": -8.996110409498215e-05, + "reward": 2.6626181602478027, + "reward_std": 0.20449738949537277, + "rewards/GDino": 0.7895833253860474, + "rewards/GIT": 0.6252684891223907, + "rewards/HPSv2": 0.27901649475097656, + "rewards/ORM": 0.96875, + "self_certainty_semantic": -25.875, + "self_certainty_token": -21.0, + "step": 398 + }, + { + "completion_length": 84.78125, + "epoch": 0.4418604651162791, + "grad_norm": 0.5590401291847229, + "kl": 0.03173828125, + "learning_rate": 7.506249999999999e-07, + "loss": -0.008660833351314068, + "reward": 1.7432748675346375, + "reward_std": 0.41746358573436737, + "rewards/GDino": 0.5843387842178345, + "rewards/GIT": 0.23352296650409698, + "rewards/HPSv2": 0.2535381317138672, + "rewards/ORM": 0.6718749701976776, + "self_certainty_semantic": -25.75, + "self_certainty_token": -21.1875, + "step": 399 + }, + { + "completion_length": 83.21875, + "epoch": 0.4429678848283499, + "grad_norm": 0.648642897605896, + "kl": 0.0411376953125, + "learning_rate": 7.5e-07, + "loss": -0.0011475947685539722, + "reward": 2.1194006204605103, + "reward_std": 0.28101272881031036, + "rewards/GDino": 0.7772284150123596, + "rewards/GIT": 0.43165211379528046, + "rewards/HPSv2": 0.2793121337890625, + "rewards/ORM": 0.6312080323696136, + "self_certainty_semantic": -25.75, + "self_certainty_token": -21.0625, + "step": 400 + }, + { + "completion_length": 80.65625, + "epoch": 0.4440753045404208, + "grad_norm": 0.8936266303062439, + "kl": 0.05169677734375, + "learning_rate": 7.49375e-07, + "loss": -0.0011689523234963417, + "reward": 2.9162869453430176, + "reward_std": 0.23950693011283875, + "rewards/GDino": 0.964062511920929, + "rewards/GIT": 0.7574390172958374, + "rewards/HPSv2": 0.26857757568359375, + "rewards/ORM": 0.9262078106403351, + "self_certainty_semantic": -25.8125, + "self_certainty_token": -21.5, + "step": 401 + }, + { + "completion_length": 71.109375, + "epoch": 0.44518272425249167, + "grad_norm": 1.048969030380249, + "kl": 0.05523681640625, + "learning_rate": 7.4875e-07, + "loss": -0.0033314225147478282, + "reward": 2.2260234355926514, + "reward_std": 0.2816730886697769, + "rewards/GDino": 0.7569573223590851, + "rewards/GIT": 0.45972205698490143, + "rewards/HPSv2": 0.29059410095214844, + "rewards/ORM": 0.71875, + "self_certainty_semantic": -25.75, + "self_certainty_token": -21.8125, + "step": 402 + }, + { + "completion_length": 74.40625, + "epoch": 0.44629014396456257, + "grad_norm": 0.5850540399551392, + "kl": 0.02862548828125, + "learning_rate": 7.48125e-07, + "loss": -0.0013043317594565451, + "reward": 1.9029690027236938, + "reward_std": 0.2511463537812233, + "rewards/GDino": 0.5718749463558197, + "rewards/GIT": 0.23306283354759216, + "rewards/HPSv2": 0.28798866271972656, + "rewards/ORM": 0.8100424408912659, + "self_certainty_semantic": -25.8125, + "self_certainty_token": -21.875, + "step": 403 + }, + { + "completion_length": 76.578125, + "epoch": 0.44739756367663347, + "grad_norm": 0.8278297185897827, + "kl": 0.02764892578125, + "learning_rate": 7.475e-07, + "loss": -0.003656167769804597, + "reward": 2.104754090309143, + "reward_std": 0.40367451310157776, + "rewards/GDino": 0.7633451819419861, + "rewards/GIT": 0.39402763545513153, + "rewards/HPSv2": 0.26260948181152344, + "rewards/ORM": 0.6847716867923737, + "self_certainty_semantic": -25.8125, + "self_certainty_token": -21.8125, + "step": 404 + }, + { + "completion_length": 86.046875, + "epoch": 0.4485049833887043, + "grad_norm": 0.743391215801239, + "kl": 0.051025390625, + "learning_rate": 7.468749999999999e-07, + "loss": 0.012512540211901069, + "reward": 2.741988182067871, + "reward_std": 0.29120295494794846, + "rewards/GDino": 0.8853493332862854, + "rewards/GIT": 0.6646924316883087, + "rewards/HPSv2": 0.2762794494628906, + "rewards/ORM": 0.9156669676303864, + "self_certainty_semantic": -25.8125, + "self_certainty_token": -21.625, + "step": 405 + }, + { + "completion_length": 84.296875, + "epoch": 0.4496124031007752, + "grad_norm": 0.8006733059883118, + "kl": 0.0548095703125, + "learning_rate": 7.4625e-07, + "loss": -0.003613928332924843, + "reward": 1.7359102964401245, + "reward_std": 0.39680202305316925, + "rewards/GDino": 0.6715972423553467, + "rewards/GIT": 0.037226274609565735, + "rewards/HPSv2": 0.2840461730957031, + "rewards/ORM": 0.7430405914783478, + "self_certainty_semantic": -26.0625, + "self_certainty_token": -21.1875, + "step": 406 + }, + { + "completion_length": 84.015625, + "epoch": 0.45071982281284606, + "grad_norm": 0.43640708923339844, + "kl": 0.03436279296875, + "learning_rate": 7.45625e-07, + "loss": -0.0018186537781730294, + "reward": 2.098067879676819, + "reward_std": 0.3177693039178848, + "rewards/GDino": 0.7636567950248718, + "rewards/GIT": 0.42283183336257935, + "rewards/HPSv2": 0.27170562744140625, + "rewards/ORM": 0.6398736536502838, + "self_certainty_semantic": -25.875, + "self_certainty_token": -22.3125, + "step": 407 + }, + { + "completion_length": 84.65625, + "epoch": 0.45182724252491696, + "grad_norm": 0.654191792011261, + "kl": 0.0341796875, + "learning_rate": 7.45e-07, + "loss": -0.0013926629908382893, + "reward": 2.1406124234199524, + "reward_std": 0.3711487799882889, + "rewards/GDino": 0.7993550896644592, + "rewards/GIT": 0.39600491523742676, + "rewards/HPSv2": 0.2621650695800781, + "rewards/ORM": 0.6830872893333435, + "self_certainty_semantic": -25.875, + "self_certainty_token": -22.625, + "step": 408 + }, + { + "completion_length": 74.265625, + "epoch": 0.4529346622369878, + "grad_norm": 0.563645601272583, + "kl": 0.0347900390625, + "learning_rate": 7.44375e-07, + "loss": 0.003390789031982422, + "reward": 2.7270385026931763, + "reward_std": 0.29976649582386017, + "rewards/GDino": 0.979687511920929, + "rewards/GIT": 0.7152339220046997, + "rewards/HPSv2": 0.2788581848144531, + "rewards/ORM": 0.7532588541507721, + "self_certainty_semantic": -26.0, + "self_certainty_token": -21.4375, + "step": 409 + }, + { + "completion_length": 66.078125, + "epoch": 0.4540420819490587, + "grad_norm": 0.49395498633384705, + "kl": 0.04254150390625, + "learning_rate": 7.4375e-07, + "loss": -0.0026592229842208326, + "reward": 2.0098077058792114, + "reward_std": 0.12201930209994316, + "rewards/GDino": 0.7845315933227539, + "rewards/GIT": 0.45276427268981934, + "rewards/HPSv2": 0.2677783966064453, + "rewards/ORM": 0.5047334432601929, + "self_certainty_semantic": -25.9375, + "self_certainty_token": -22.375, + "step": 410 + }, + { + "completion_length": 91.953125, + "epoch": 0.45514950166112955, + "grad_norm": 0.46236976981163025, + "kl": 0.02215576171875, + "learning_rate": 7.43125e-07, + "loss": -0.00293766800314188, + "reward": 2.5302971601486206, + "reward_std": 0.3805113881826401, + "rewards/GDino": 0.9305559694766998, + "rewards/GIT": 0.684164822101593, + "rewards/HPSv2": 0.2784309387207031, + "rewards/ORM": 0.6371452808380127, + "self_certainty_semantic": -26.0625, + "self_certainty_token": -21.875, + "step": 411 + }, + { + "completion_length": 79.0, + "epoch": 0.45625692137320045, + "grad_norm": 0.4665317237377167, + "kl": 0.0347900390625, + "learning_rate": 7.425e-07, + "loss": -0.008466975064948201, + "reward": 2.459402084350586, + "reward_std": 0.20596018806099892, + "rewards/GDino": 0.8504825830459595, + "rewards/GIT": 0.45759011059999466, + "rewards/HPSv2": 0.28253746032714844, + "rewards/ORM": 0.868791937828064, + "self_certainty_semantic": -25.875, + "self_certainty_token": -20.9375, + "step": 412 + }, + { + "completion_length": 77.828125, + "epoch": 0.4573643410852713, + "grad_norm": 0.49716901779174805, + "kl": 0.0384521484375, + "learning_rate": 7.418749999999999e-07, + "loss": -0.0018519368022680283, + "reward": 1.693007469177246, + "reward_std": 0.4901855140924454, + "rewards/GDino": 0.7433426082134247, + "rewards/GIT": 0.11231328174471855, + "rewards/HPSv2": 0.2725238800048828, + "rewards/ORM": 0.5648276954889297, + "self_certainty_semantic": -26.0, + "self_certainty_token": -21.5625, + "step": 413 + }, + { + "completion_length": 82.640625, + "epoch": 0.4584717607973422, + "grad_norm": 0.7909552454948425, + "kl": 0.0501708984375, + "learning_rate": 7.412499999999999e-07, + "loss": -0.010068975854665041, + "reward": 2.2669637203216553, + "reward_std": 0.3244923800230026, + "rewards/GDino": 0.7585968375205994, + "rewards/GIT": 0.3361337333917618, + "rewards/HPSv2": 0.28313255310058594, + "rewards/ORM": 0.8891004323959351, + "self_certainty_semantic": -26.0, + "self_certainty_token": -21.3125, + "step": 414 + }, + { + "completion_length": 79.09375, + "epoch": 0.45957918050941304, + "grad_norm": 0.43149980902671814, + "kl": 0.03082275390625, + "learning_rate": 7.406249999999999e-07, + "loss": -0.006011321907863021, + "reward": 2.1418988704681396, + "reward_std": 0.5438865721225739, + "rewards/GDino": 0.7610452175140381, + "rewards/GIT": 0.3183070868253708, + "rewards/HPSv2": 0.270355224609375, + "rewards/ORM": 0.7921914756298065, + "self_certainty_semantic": -25.75, + "self_certainty_token": -22.6875, + "step": 415 + }, + { + "completion_length": 65.28125, + "epoch": 0.46068660022148394, + "grad_norm": 0.5704305768013, + "kl": 0.03826904296875, + "learning_rate": 7.4e-07, + "loss": -0.004705146478954703, + "reward": 1.9789756536483765, + "reward_std": 0.3610726371407509, + "rewards/GDino": 0.6869430541992188, + "rewards/GIT": 0.3546784371137619, + "rewards/HPSv2": 0.2608757019042969, + "rewards/ORM": 0.6764785945415497, + "self_certainty_semantic": -25.875, + "self_certainty_token": -21.8125, + "step": 416 + }, + { + "completion_length": 82.921875, + "epoch": 0.46179401993355484, + "grad_norm": 1.534477949142456, + "kl": 0.018585205078125, + "learning_rate": 7.39375e-07, + "loss": 0.009809148497879505, + "reward": 2.0984532237052917, + "reward_std": 0.3655596971511841, + "rewards/GDino": 0.7439702451229095, + "rewards/GIT": 0.5340979695320129, + "rewards/HPSv2": 0.2582855224609375, + "rewards/ORM": 0.5620993673801422, + "self_certainty_semantic": -25.875, + "self_certainty_token": -21.625, + "step": 417 + }, + { + "completion_length": 71.984375, + "epoch": 0.4629014396456257, + "grad_norm": 0.6864891052246094, + "kl": 0.0355224609375, + "learning_rate": 7.3875e-07, + "loss": -0.007062543882057071, + "reward": 1.971714973449707, + "reward_std": 0.2827928885817528, + "rewards/GDino": 0.7771302461624146, + "rewards/GIT": 0.503301203250885, + "rewards/HPSv2": 0.2641334533691406, + "rewards/ORM": 0.42715001106262207, + "self_certainty_semantic": -25.625, + "self_certainty_token": -22.1875, + "step": 418 + }, + { + "completion_length": 75.375, + "epoch": 0.4640088593576966, + "grad_norm": 1.6586036682128906, + "kl": 0.05633544921875, + "learning_rate": 7.38125e-07, + "loss": -0.006635851226747036, + "reward": 2.1996541023254395, + "reward_std": 0.3620283454656601, + "rewards/GDino": 0.750344455242157, + "rewards/GIT": 0.5618848949670792, + "rewards/HPSv2": 0.25415992736816406, + "rewards/ORM": 0.63326495885849, + "self_certainty_semantic": -25.8125, + "self_certainty_token": -21.625, + "step": 419 + }, + { + "completion_length": 94.265625, + "epoch": 0.46511627906976744, + "grad_norm": 0.6408512592315674, + "kl": 0.01483154296875, + "learning_rate": 7.375e-07, + "loss": 0.0037272414192557335, + "reward": 2.4466946125030518, + "reward_std": 0.2856600284576416, + "rewards/GDino": 0.8534864485263824, + "rewards/GIT": 0.6760710775852203, + "rewards/HPSv2": 0.2839241027832031, + "rewards/ORM": 0.6332131326198578, + "self_certainty_semantic": -26.1875, + "self_certainty_token": -22.375, + "step": 420 + }, + { + "completion_length": 71.890625, + "epoch": 0.46622369878183834, + "grad_norm": 0.4374409317970276, + "kl": 0.03546142578125, + "learning_rate": 7.368749999999999e-07, + "loss": -0.005359282658901066, + "reward": 2.329770088195801, + "reward_std": 0.2165464088320732, + "rewards/GDino": 0.793749988079071, + "rewards/GIT": 0.5585953593254089, + "rewards/HPSv2": 0.2817115783691406, + "rewards/ORM": 0.6957131624221802, + "self_certainty_semantic": -26.0625, + "self_certainty_token": -21.5, + "step": 421 + }, + { + "completion_length": 85.640625, + "epoch": 0.4673311184939092, + "grad_norm": 0.5857760310173035, + "kl": 0.03125, + "learning_rate": 7.362499999999999e-07, + "loss": -0.016356450505554676, + "reward": 2.1780741810798645, + "reward_std": 0.19313644617795944, + "rewards/GDino": 0.7867951095104218, + "rewards/GIT": 0.5323421508073807, + "rewards/HPSv2": 0.2773323059082031, + "rewards/ORM": 0.5816046893596649, + "self_certainty_semantic": -25.875, + "self_certainty_token": -21.875, + "step": 422 + }, + { + "completion_length": 84.734375, + "epoch": 0.4684385382059801, + "grad_norm": 0.4287838935852051, + "kl": 0.03082275390625, + "learning_rate": 7.356249999999999e-07, + "loss": -0.011429834179580212, + "reward": 2.2586333751678467, + "reward_std": 0.2548673450946808, + "rewards/GDino": 0.822375625371933, + "rewards/GIT": 0.5989014655351639, + "rewards/HPSv2": 0.2845439910888672, + "rewards/ORM": 0.5528122633695602, + "self_certainty_semantic": -25.8125, + "self_certainty_token": -22.4375, + "step": 423 + }, + { + "completion_length": 79.4375, + "epoch": 0.4695459579180509, + "grad_norm": 0.5285993218421936, + "kl": 0.0333251953125, + "learning_rate": 7.35e-07, + "loss": -0.009189559612423182, + "reward": 1.9543618559837341, + "reward_std": 0.3820193409919739, + "rewards/GDino": 0.7618323564529419, + "rewards/GIT": 0.3905602991580963, + "rewards/HPSv2": 0.2703685760498047, + "rewards/ORM": 0.5316007137298584, + "self_certainty_semantic": -25.625, + "self_certainty_token": -21.25, + "step": 424 + }, + { + "completion_length": 92.890625, + "epoch": 0.4706533776301218, + "grad_norm": 0.9444605708122253, + "kl": 0.07952880859375, + "learning_rate": 7.34375e-07, + "loss": 0.015706528909504414, + "reward": 2.006572186946869, + "reward_std": 0.2646910846233368, + "rewards/GDino": 0.7495222091674805, + "rewards/GIT": 0.4039381295442581, + "rewards/HPSv2": 0.28194618225097656, + "rewards/ORM": 0.5711656212806702, + "self_certainty_semantic": -25.75, + "self_certainty_token": -21.6875, + "step": 425 + }, + { + "completion_length": 77.015625, + "epoch": 0.4717607973421927, + "grad_norm": 0.7699408531188965, + "kl": 0.0650634765625, + "learning_rate": 7.3375e-07, + "loss": 0.00010991387534886599, + "reward": 2.07556688785553, + "reward_std": 0.2595227137207985, + "rewards/GDino": 0.7702007293701172, + "rewards/GIT": 0.35001032054424286, + "rewards/HPSv2": 0.2557106018066406, + "rewards/ORM": 0.6996453106403351, + "self_certainty_semantic": -25.9375, + "self_certainty_token": -22.3125, + "step": 426 + }, + { + "completion_length": 85.625, + "epoch": 0.4728682170542636, + "grad_norm": 0.41860419511795044, + "kl": 0.015625, + "learning_rate": 7.33125e-07, + "loss": -0.012647074181586504, + "reward": 2.859202742576599, + "reward_std": 0.15569165349006653, + "rewards/GDino": 0.9583333134651184, + "rewards/GIT": 0.9431213140487671, + "rewards/HPSv2": 0.2643108367919922, + "rewards/ORM": 0.6934372633695602, + "self_certainty_semantic": -26.1875, + "self_certainty_token": -20.875, + "step": 427 + }, + { + "completion_length": 75.34375, + "epoch": 0.4739756367663344, + "grad_norm": 1.0548057556152344, + "kl": 0.05499267578125, + "learning_rate": 7.325e-07, + "loss": 0.006077921250835061, + "reward": 2.0637649297714233, + "reward_std": 0.312567874789238, + "rewards/GDino": 0.8416501581668854, + "rewards/GIT": 0.3275400698184967, + "rewards/HPSv2": 0.28148841857910156, + "rewards/ORM": 0.6130862981081009, + "self_certainty_semantic": -26.0, + "self_certainty_token": -22.0625, + "step": 428 + }, + { + "completion_length": 86.328125, + "epoch": 0.4750830564784053, + "grad_norm": 0.398201048374176, + "kl": 0.03582763671875, + "learning_rate": 7.31875e-07, + "loss": -0.013247306575067341, + "reward": 1.7798218727111816, + "reward_std": 0.25667745620012283, + "rewards/GDino": 0.6844161748886108, + "rewards/GIT": 0.232561856508255, + "rewards/HPSv2": 0.26967620849609375, + "rewards/ORM": 0.593167632818222, + "self_certainty_semantic": -25.6875, + "self_certainty_token": -21.5, + "step": 429 + }, + { + "completion_length": 84.40625, + "epoch": 0.47619047619047616, + "grad_norm": 0.48394981026649475, + "kl": 0.05133056640625, + "learning_rate": 7.312499999999999e-07, + "loss": 0.0062074142042547464, + "reward": 1.9840147495269775, + "reward_std": 0.315598726272583, + "rewards/GDino": 0.6340554356575012, + "rewards/GIT": 0.3108940124511719, + "rewards/HPSv2": 0.2783946990966797, + "rewards/ORM": 0.7606706917285919, + "self_certainty_semantic": -25.75, + "self_certainty_token": -21.5, + "step": 430 + }, + { + "completion_length": 91.890625, + "epoch": 0.47729789590254706, + "grad_norm": 0.842060923576355, + "kl": 0.0592041015625, + "learning_rate": 7.306249999999999e-07, + "loss": 0.00015048013301566243, + "reward": 2.2891291975975037, + "reward_std": 0.32246553897857666, + "rewards/GDino": 0.7656250298023224, + "rewards/GIT": 0.5505922138690948, + "rewards/HPSv2": 0.27787017822265625, + "rewards/ORM": 0.6950417160987854, + "self_certainty_semantic": -25.8125, + "self_certainty_token": -21.125, + "step": 431 + }, + { + "completion_length": 88.734375, + "epoch": 0.47840531561461797, + "grad_norm": 0.6684581637382507, + "kl": 0.03704833984375, + "learning_rate": 7.3e-07, + "loss": -0.000526083167642355, + "reward": 1.8135395646095276, + "reward_std": 0.3005647659301758, + "rewards/GDino": 0.6779249012470245, + "rewards/GIT": 0.3108871951699257, + "rewards/HPSv2": 0.2630805969238281, + "rewards/ORM": 0.561646893620491, + "self_certainty_semantic": -26.0, + "self_certainty_token": -22.0, + "step": 432 + }, + { + "completion_length": 80.671875, + "epoch": 0.4795127353266888, + "grad_norm": 0.784157931804657, + "kl": 0.053466796875, + "learning_rate": 7.29375e-07, + "loss": 0.0008348370902240276, + "reward": 2.0788557529449463, + "reward_std": 0.3638414815068245, + "rewards/GDino": 0.7038869261741638, + "rewards/GIT": 0.3587188050150871, + "rewards/HPSv2": 0.2833995819091797, + "rewards/ORM": 0.7328504323959351, + "self_certainty_semantic": -26.0, + "self_certainty_token": -23.25, + "step": 433 + }, + { + "completion_length": 90.234375, + "epoch": 0.4806201550387597, + "grad_norm": 0.48919686675071716, + "kl": 0.026275634765625, + "learning_rate": 7.2875e-07, + "loss": 0.0023571313358843327, + "reward": 2.3130220770835876, + "reward_std": 0.3182874023914337, + "rewards/GDino": 0.8460593223571777, + "rewards/GIT": 0.5327989757061005, + "rewards/HPSv2": 0.26193809509277344, + "rewards/ORM": 0.6722257137298584, + "self_certainty_semantic": -25.9375, + "self_certainty_token": -22.5, + "step": 434 + }, + { + "completion_length": 81.75, + "epoch": 0.48172757475083056, + "grad_norm": 0.5001962184906006, + "kl": 0.0504150390625, + "learning_rate": 7.28125e-07, + "loss": -0.008762945421040058, + "reward": 1.9592769145965576, + "reward_std": 0.34958457946777344, + "rewards/GDino": 0.7855294942855835, + "rewards/GIT": 0.40955016016960144, + "rewards/HPSv2": 0.26173973083496094, + "rewards/ORM": 0.5024575889110565, + "self_certainty_semantic": -25.9375, + "self_certainty_token": -21.75, + "step": 435 + }, + { + "completion_length": 73.4375, + "epoch": 0.48283499446290146, + "grad_norm": 0.4027310907840729, + "kl": 0.03509521484375, + "learning_rate": 7.275e-07, + "loss": -0.002915327437222004, + "reward": 2.1434993743896484, + "reward_std": 0.1968846172094345, + "rewards/GDino": 0.768848329782486, + "rewards/GIT": 0.4478500932455063, + "rewards/HPSv2": 0.28890419006347656, + "rewards/ORM": 0.6378966569900513, + "self_certainty_semantic": -26.0, + "self_certainty_token": -21.3125, + "step": 436 + }, + { + "completion_length": 85.796875, + "epoch": 0.4839424141749723, + "grad_norm": 0.3869122564792633, + "kl": 0.03369140625, + "learning_rate": 7.26875e-07, + "loss": -0.004824966192245483, + "reward": 2.3838918209075928, + "reward_std": 0.25147470086812973, + "rewards/GDino": 0.8401833176612854, + "rewards/GIT": 0.6571991443634033, + "rewards/HPSv2": 0.26744651794433594, + "rewards/ORM": 0.6190627217292786, + "self_certainty_semantic": -25.9375, + "self_certainty_token": -21.625, + "step": 437 + }, + { + "completion_length": 94.359375, + "epoch": 0.4850498338870432, + "grad_norm": 0.48070138692855835, + "kl": 0.023162841796875, + "learning_rate": 7.262499999999999e-07, + "loss": -0.010622862726449966, + "reward": 2.399035692214966, + "reward_std": 0.2306915745139122, + "rewards/GDino": 0.7903645932674408, + "rewards/GIT": 0.6274235248565674, + "rewards/HPSv2": 0.2690563201904297, + "rewards/ORM": 0.7121912837028503, + "self_certainty_semantic": -26.0, + "self_certainty_token": -21.8125, + "step": 438 + }, + { + "completion_length": 85.578125, + "epoch": 0.48615725359911405, + "grad_norm": 0.5042142271995544, + "kl": 0.05328369140625, + "learning_rate": 7.256249999999999e-07, + "loss": -0.013352732639759779, + "reward": 2.354739785194397, + "reward_std": 0.3064749836921692, + "rewards/GDino": 0.8458828330039978, + "rewards/GIT": 0.5764823853969574, + "rewards/HPSv2": 0.2873668670654297, + "rewards/ORM": 0.6450077444314957, + "self_certainty_semantic": -25.9375, + "self_certainty_token": -21.9375, + "step": 439 + }, + { + "completion_length": 81.984375, + "epoch": 0.48726467331118495, + "grad_norm": 0.8611792325973511, + "kl": 0.050537109375, + "learning_rate": 7.249999999999999e-07, + "loss": -0.005849999142810702, + "reward": 2.077091336250305, + "reward_std": 0.1811300478875637, + "rewards/GDino": 0.7373460233211517, + "rewards/GIT": 0.3319435864686966, + "rewards/HPSv2": 0.2855720520019531, + "rewards/ORM": 0.7222296893596649, + "self_certainty_semantic": -25.9375, + "self_certainty_token": -22.0, + "step": 440 + }, + { + "completion_length": 70.984375, + "epoch": 0.4883720930232558, + "grad_norm": 0.4863640367984772, + "kl": 0.0521240234375, + "learning_rate": 7.243749999999999e-07, + "loss": -0.0025071899872273207, + "reward": 2.5115854740142822, + "reward_std": 0.3233245015144348, + "rewards/GDino": 0.8832213282585144, + "rewards/GIT": 0.5558635890483856, + "rewards/HPSv2": 0.2819633483886719, + "rewards/ORM": 0.7905370891094208, + "self_certainty_semantic": -25.9375, + "self_certainty_token": -22.5, + "step": 441 + }, + { + "completion_length": 86.90625, + "epoch": 0.4894795127353267, + "grad_norm": 0.6643816232681274, + "kl": 0.03045654296875, + "learning_rate": 7.2375e-07, + "loss": -0.005832670722156763, + "reward": 1.703209102153778, + "reward_std": 0.49579574912786484, + "rewards/GDino": 0.645632416009903, + "rewards/GIT": 0.24031662940979004, + "rewards/HPSv2": 0.26605224609375, + "rewards/ORM": 0.5512078106403351, + "self_certainty_semantic": -26.0, + "self_certainty_token": -21.125, + "step": 442 + }, + { + "completion_length": 72.671875, + "epoch": 0.49058693244739754, + "grad_norm": 2.2237584590911865, + "kl": 0.039794921875, + "learning_rate": 7.23125e-07, + "loss": 0.00023815128952264786, + "reward": 1.7414924502372742, + "reward_std": 0.3599114716053009, + "rewards/GDino": 0.6700520515441895, + "rewards/GIT": 0.43203869462013245, + "rewards/HPSv2": 0.2526569366455078, + "rewards/ORM": 0.3867447078227997, + "self_certainty_semantic": -25.75, + "self_certainty_token": -21.6875, + "step": 443 + }, + { + "completion_length": 92.34375, + "epoch": 0.49169435215946844, + "grad_norm": 0.4640725255012512, + "kl": 0.0460205078125, + "learning_rate": 7.225e-07, + "loss": -0.004428665153682232, + "reward": 2.0310418605804443, + "reward_std": 0.3259080797433853, + "rewards/GDino": 0.724789559841156, + "rewards/GIT": 0.4121911972761154, + "rewards/HPSv2": 0.2721900939941406, + "rewards/ORM": 0.6218710243701935, + "self_certainty_semantic": -26.0, + "self_certainty_token": -21.4375, + "step": 444 + }, + { + "completion_length": 66.515625, + "epoch": 0.49280177187153934, + "grad_norm": 0.6725968718528748, + "kl": 0.0498046875, + "learning_rate": 7.21875e-07, + "loss": 0.0030636144801974297, + "reward": 2.7512794733047485, + "reward_std": 0.1399773694574833, + "rewards/GDino": 0.9078124761581421, + "rewards/GIT": 0.6798087060451508, + "rewards/HPSv2": 0.28865814208984375, + "rewards/ORM": 0.875, + "self_certainty_semantic": -26.1875, + "self_certainty_token": -23.0625, + "step": 445 + }, + { + "completion_length": 89.421875, + "epoch": 0.4939091915836102, + "grad_norm": 0.43778151273727417, + "kl": 0.0401611328125, + "learning_rate": 7.212499999999999e-07, + "loss": 0.006576362065970898, + "reward": 2.292953133583069, + "reward_std": 0.2412705421447754, + "rewards/GDino": 0.8648694455623627, + "rewards/GIT": 0.47470442950725555, + "rewards/HPSv2": 0.2799797058105469, + "rewards/ORM": 0.6733995378017426, + "self_certainty_semantic": -25.9375, + "self_certainty_token": -21.625, + "step": 446 + }, + { + "completion_length": 88.3125, + "epoch": 0.4950166112956811, + "grad_norm": 0.5815999507904053, + "kl": 0.053955078125, + "learning_rate": 7.206249999999999e-07, + "loss": 0.007724498165771365, + "reward": 2.314146637916565, + "reward_std": 0.298708438873291, + "rewards/GDino": 0.778325617313385, + "rewards/GIT": 0.4846403896808624, + "rewards/HPSv2": 0.2820758819580078, + "rewards/ORM": 0.7691046893596649, + "self_certainty_semantic": -26.0, + "self_certainty_token": -22.5, + "step": 447 + }, + { + "completion_length": 89.015625, + "epoch": 0.49612403100775193, + "grad_norm": 0.610797107219696, + "kl": 0.039306640625, + "learning_rate": 7.2e-07, + "loss": -0.00025889265816658735, + "reward": 2.5553557872772217, + "reward_std": 0.3365253210067749, + "rewards/GDino": 0.8934768736362457, + "rewards/GIT": 0.5267609506845474, + "rewards/HPSv2": 0.2717609405517578, + "rewards/ORM": 0.8633570969104767, + "self_certainty_semantic": -25.8125, + "self_certainty_token": -21.3125, + "step": 448 + }, + { + "completion_length": 88.328125, + "epoch": 0.49723145071982283, + "grad_norm": 0.9571331143379211, + "kl": 0.0404052734375, + "learning_rate": 7.19375e-07, + "loss": -0.01380598871037364, + "reward": 2.071900963783264, + "reward_std": 0.2538035809993744, + "rewards/GDino": 0.7991504073143005, + "rewards/GIT": 0.35837409645318985, + "rewards/HPSv2": 0.28504371643066406, + "rewards/ORM": 0.6293328106403351, + "self_certainty_semantic": -26.0, + "self_certainty_token": -21.875, + "step": 449 + }, + { + "completion_length": 83.890625, + "epoch": 0.4983388704318937, + "grad_norm": 0.9305670261383057, + "kl": 0.0462646484375, + "learning_rate": 7.1875e-07, + "loss": -0.0037036877183709294, + "reward": 1.786819338798523, + "reward_std": 0.4241926223039627, + "rewards/GDino": 0.8178980052471161, + "rewards/GIT": 0.3817112445831299, + "rewards/HPSv2": 0.26502227783203125, + "rewards/ORM": 0.32218773663043976, + "self_certainty_semantic": -25.875, + "self_certainty_token": -22.0, + "step": 450 + }, + { + "completion_length": 89.359375, + "epoch": 0.4994462901439646, + "grad_norm": 0.4716930687427521, + "kl": 0.03607177734375, + "learning_rate": 7.18125e-07, + "loss": 0.008189344312995672, + "reward": 1.721399962902069, + "reward_std": 0.22800400853157043, + "rewards/GDino": 0.7420409619808197, + "rewards/GIT": 0.4152991771697998, + "rewards/HPSv2": 0.2706146240234375, + "rewards/ORM": 0.29344524443149567, + "self_certainty_semantic": -25.9375, + "self_certainty_token": -22.75, + "step": 451 + }, + { + "completion_length": 88.921875, + "epoch": 0.5005537098560354, + "grad_norm": 0.44959157705307007, + "kl": 0.07904052734375, + "learning_rate": 7.175e-07, + "loss": -0.008612239500507712, + "reward": 2.08866810798645, + "reward_std": 0.31104299426078796, + "rewards/GDino": 0.7471995055675507, + "rewards/GIT": 0.3345036208629608, + "rewards/HPSv2": 0.28125572204589844, + "rewards/ORM": 0.7257093489170074, + "self_certainty_semantic": -25.75, + "self_certainty_token": -21.1875, + "step": 452 + }, + { + "completion_length": 96.6875, + "epoch": 0.5016611295681063, + "grad_norm": 0.5273993015289307, + "kl": 0.048828125, + "learning_rate": 7.16875e-07, + "loss": -0.008911502780392766, + "reward": 2.3552602529525757, + "reward_std": 0.30835530906915665, + "rewards/GDino": 0.7447711825370789, + "rewards/GIT": 0.6296440958976746, + "rewards/HPSv2": 0.26204490661621094, + "rewards/ORM": 0.7187999188899994, + "self_certainty_semantic": -25.5625, + "self_certainty_token": -22.25, + "step": 453 + }, + { + "completion_length": 77.84375, + "epoch": 0.5027685492801772, + "grad_norm": 0.7640430331230164, + "kl": 0.03045654296875, + "learning_rate": 7.1625e-07, + "loss": -0.0042315139435231686, + "reward": 2.4679338932037354, + "reward_std": 0.41722583770751953, + "rewards/GDino": 0.8896875381469727, + "rewards/GIT": 0.6417616009712219, + "rewards/HPSv2": 0.25840187072753906, + "rewards/ORM": 0.678083062171936, + "self_certainty_semantic": -25.875, + "self_certainty_token": -21.75, + "step": 454 + }, + { + "completion_length": 85.71875, + "epoch": 0.5038759689922481, + "grad_norm": 0.6293461322784424, + "kl": 0.049560546875, + "learning_rate": 7.156249999999999e-07, + "loss": -0.0022096349857747555, + "reward": 2.077404499053955, + "reward_std": 0.1506524756550789, + "rewards/GDino": 0.6732477396726608, + "rewards/GIT": 0.3189016580581665, + "rewards/HPSv2": 0.27190208435058594, + "rewards/ORM": 0.8133530914783478, + "self_certainty_semantic": -25.8125, + "self_certainty_token": -22.1875, + "step": 455 + }, + { + "completion_length": 83.9375, + "epoch": 0.5049833887043189, + "grad_norm": 1.3855115175247192, + "kl": 0.0716552734375, + "learning_rate": 7.149999999999999e-07, + "loss": 0.001122410292737186, + "reward": 2.082254648208618, + "reward_std": 0.4632801115512848, + "rewards/GDino": 0.7531865835189819, + "rewards/GIT": 0.4373682588338852, + "rewards/HPSv2": 0.2716541290283203, + "rewards/ORM": 0.6200457215309143, + "self_certainty_semantic": -26.3125, + "self_certainty_token": -21.5625, + "step": 456 + }, + { + "completion_length": 84.953125, + "epoch": 0.5060908084163898, + "grad_norm": 0.5272361636161804, + "kl": 0.0565185546875, + "learning_rate": 7.143749999999999e-07, + "loss": -0.014877717941999435, + "reward": 2.0980849862098694, + "reward_std": 0.5083387047052383, + "rewards/GDino": 0.7351427972316742, + "rewards/GIT": 0.29914718866348267, + "rewards/HPSv2": 0.2981700897216797, + "rewards/ORM": 0.7656250298023224, + "self_certainty_semantic": -25.9375, + "self_certainty_token": -22.25, + "step": 457 + }, + { + "completion_length": 90.796875, + "epoch": 0.5071982281284607, + "grad_norm": 0.7672439217567444, + "kl": 0.038330078125, + "learning_rate": 7.137499999999999e-07, + "loss": -0.002860535169020295, + "reward": 2.387991786003113, + "reward_std": 0.33203044533729553, + "rewards/GDino": 0.8920667171478271, + "rewards/GIT": 0.6445018947124481, + "rewards/HPSv2": 0.2889232635498047, + "rewards/ORM": 0.5625, + "self_certainty_semantic": -26.0, + "self_certainty_token": -21.0, + "step": 458 + }, + { + "completion_length": 86.921875, + "epoch": 0.5083056478405316, + "grad_norm": 0.4916004240512848, + "kl": 0.03045654296875, + "learning_rate": 7.13125e-07, + "loss": 0.01850858621764928, + "reward": 2.1592776775360107, + "reward_std": 0.25633590668439865, + "rewards/GDino": 0.8275542855262756, + "rewards/GIT": 0.5902497172355652, + "rewards/HPSv2": 0.25870323181152344, + "rewards/ORM": 0.48277053236961365, + "self_certainty_semantic": -25.875, + "self_certainty_token": -21.5, + "step": 459 + }, + { + "completion_length": 82.1875, + "epoch": 0.5094130675526024, + "grad_norm": 0.6774526238441467, + "kl": 0.09375, + "learning_rate": 7.125e-07, + "loss": 0.005978217581287026, + "reward": 2.035232424736023, + "reward_std": 0.39641426503658295, + "rewards/GDino": 0.7663669586181641, + "rewards/GIT": 0.22609353065490723, + "rewards/HPSv2": 0.2731647491455078, + "rewards/ORM": 0.7696070969104767, + "self_certainty_semantic": -26.0, + "self_certainty_token": -22.9375, + "step": 460 + }, + { + "completion_length": 83.84375, + "epoch": 0.5105204872646734, + "grad_norm": 0.4391137361526489, + "kl": 0.03045654296875, + "learning_rate": 7.11875e-07, + "loss": 0.0027710344875231385, + "reward": 1.9793134927749634, + "reward_std": 0.39203909039497375, + "rewards/GDino": 0.7310473620891571, + "rewards/GIT": 0.2419200837612152, + "rewards/HPSv2": 0.2859916687011719, + "rewards/ORM": 0.720354437828064, + "self_certainty_semantic": -25.875, + "self_certainty_token": -21.875, + "step": 461 + }, + { + "completion_length": 88.5, + "epoch": 0.5116279069767442, + "grad_norm": 1.3002902269363403, + "kl": 0.0302734375, + "learning_rate": 7.1125e-07, + "loss": -0.003607785329222679, + "reward": 1.8144089579582214, + "reward_std": 0.27291738986968994, + "rewards/GDino": 0.6941651701927185, + "rewards/GIT": 0.42088769376277924, + "rewards/HPSv2": 0.2555980682373047, + "rewards/ORM": 0.4437580108642578, + "self_certainty_semantic": -25.9375, + "self_certainty_token": -21.8125, + "step": 462 + }, + { + "completion_length": 84.546875, + "epoch": 0.512735326688815, + "grad_norm": 1.2651053667068481, + "kl": 0.0697021484375, + "learning_rate": 7.106249999999999e-07, + "loss": -0.002405138802714646, + "reward": 2.020813226699829, + "reward_std": 0.21075522154569626, + "rewards/GDino": 0.7722398042678833, + "rewards/GIT": 0.1233956515789032, + "rewards/HPSv2": 0.28763580322265625, + "rewards/ORM": 0.837541937828064, + "self_certainty_semantic": -25.875, + "self_certainty_token": -22.4375, + "step": 463 + }, + { + "completion_length": 77.53125, + "epoch": 0.5138427464008859, + "grad_norm": 1.0401899814605713, + "kl": 0.042236328125, + "learning_rate": 7.1e-07, + "loss": 0.0035787662491202354, + "reward": 1.9480950236320496, + "reward_std": 0.23023036122322083, + "rewards/GDino": 0.7306839227676392, + "rewards/GIT": 0.3279959112405777, + "rewards/HPSv2": 0.25968170166015625, + "rewards/ORM": 0.6297334432601929, + "self_certainty_semantic": -25.8125, + "self_certainty_token": -21.75, + "step": 464 + }, + { + "completion_length": 79.109375, + "epoch": 0.5149501661129569, + "grad_norm": 0.48940005898475647, + "kl": 0.05712890625, + "learning_rate": 7.09375e-07, + "loss": -0.005857666896190494, + "reward": 2.180624008178711, + "reward_std": 0.2805362194776535, + "rewards/GDino": 0.716775506734848, + "rewards/GIT": 0.3440728187561035, + "rewards/HPSv2": 0.29165077209472656, + "rewards/ORM": 0.828125, + "self_certainty_semantic": -26.0625, + "self_certainty_token": -21.125, + "step": 465 + }, + { + "completion_length": 80.171875, + "epoch": 0.5160575858250277, + "grad_norm": 0.42522236704826355, + "kl": 0.02484130859375, + "learning_rate": 7.0875e-07, + "loss": 0.005051356623880565, + "reward": 1.9770900011062622, + "reward_std": 0.32918836176395416, + "rewards/GDino": 0.7397170662879944, + "rewards/GIT": 0.3399002403020859, + "rewards/HPSv2": 0.27868080139160156, + "rewards/ORM": 0.6187919676303864, + "self_certainty_semantic": -26.25, + "self_certainty_token": -22.875, + "step": 466 + }, + { + "completion_length": 88.875, + "epoch": 0.5171650055370985, + "grad_norm": 0.7818877696990967, + "kl": 0.0535888671875, + "learning_rate": 7.08125e-07, + "loss": 0.0023874612525105476, + "reward": 2.3585548400878906, + "reward_std": 0.2254454642534256, + "rewards/GDino": 0.7899121940135956, + "rewards/GIT": 0.37046997249126434, + "rewards/HPSv2": 0.2723655700683594, + "rewards/ORM": 0.9258071482181549, + "self_certainty_semantic": -26.0625, + "self_certainty_token": -22.5, + "step": 467 + }, + { + "completion_length": 83.171875, + "epoch": 0.5182724252491694, + "grad_norm": 0.7084239721298218, + "kl": 0.06085205078125, + "learning_rate": 7.075e-07, + "loss": -0.0017908139852806926, + "reward": 2.103492498397827, + "reward_std": 0.2575216367840767, + "rewards/GDino": 0.8437500298023224, + "rewards/GIT": 0.383937731385231, + "rewards/HPSv2": 0.26790428161621094, + "rewards/ORM": 0.6079004257917404, + "self_certainty_semantic": -25.8125, + "self_certainty_token": -22.3125, + "step": 468 + }, + { + "completion_length": 76.71875, + "epoch": 0.5193798449612403, + "grad_norm": 0.7291202545166016, + "kl": 0.049560546875, + "learning_rate": 7.06875e-07, + "loss": -0.0025954623706638813, + "reward": 1.7694249153137207, + "reward_std": 0.3203016445040703, + "rewards/GDino": 0.6941194236278534, + "rewards/GIT": 0.19523531198501587, + "rewards/HPSv2": 0.2690906524658203, + "rewards/ORM": 0.610979437828064, + "self_certainty_semantic": -25.9375, + "self_certainty_token": -21.5625, + "step": 469 + }, + { + "completion_length": 81.734375, + "epoch": 0.5204872646733112, + "grad_norm": 0.5116159319877625, + "kl": 0.0477294921875, + "learning_rate": 7.0625e-07, + "loss": 0.0012024766765534878, + "reward": 2.0922325253486633, + "reward_std": 0.2755435109138489, + "rewards/GDino": 0.7483008503913879, + "rewards/GIT": 0.515093594789505, + "rewards/HPSv2": 0.27562522888183594, + "rewards/ORM": 0.5532129406929016, + "self_certainty_semantic": -26.0, + "self_certainty_token": -21.0, + "step": 470 + }, + { + "completion_length": 85.125, + "epoch": 0.521594684385382, + "grad_norm": 0.4147425889968872, + "kl": 0.0477294921875, + "learning_rate": 7.056249999999999e-07, + "loss": 0.00042333872988820076, + "reward": 2.648235559463501, + "reward_std": 0.13014497607946396, + "rewards/GDino": 0.8161458671092987, + "rewards/GIT": 0.5663179308176041, + "rewards/HPSv2": 0.28139686584472656, + "rewards/ORM": 0.984375, + "self_certainty_semantic": -26.125, + "self_certainty_token": -22.5625, + "step": 471 + }, + { + "completion_length": 78.625, + "epoch": 0.5227021040974529, + "grad_norm": 0.6925135254859924, + "kl": 0.05615234375, + "learning_rate": 7.049999999999999e-07, + "loss": -0.007782004773616791, + "reward": 1.8986712098121643, + "reward_std": 0.37742742896080017, + "rewards/GDino": 0.7116584181785583, + "rewards/GIT": 0.171549990773201, + "rewards/HPSv2": 0.28108787536621094, + "rewards/ORM": 0.734375, + "self_certainty_semantic": -26.0625, + "self_certainty_token": -22.4375, + "step": 472 + }, + { + "completion_length": 80.96875, + "epoch": 0.5238095238095238, + "grad_norm": 0.4296283721923828, + "kl": 0.0281982421875, + "learning_rate": 7.043749999999999e-07, + "loss": 0.00030455179512500763, + "reward": 2.6370961666107178, + "reward_std": 0.2901841849088669, + "rewards/GDino": 0.8401041626930237, + "rewards/GIT": 0.6279455721378326, + "rewards/HPSv2": 0.25458335876464844, + "rewards/ORM": 0.9144631326198578, + "self_certainty_semantic": -26.0, + "self_certainty_token": -22.0, + "step": 473 + }, + { + "completion_length": 90.0, + "epoch": 0.5249169435215947, + "grad_norm": 0.6720231771469116, + "kl": 0.03106689453125, + "learning_rate": 7.037499999999999e-07, + "loss": -0.005071162478998303, + "reward": 2.136423110961914, + "reward_std": 0.2358795627951622, + "rewards/GDino": 0.7747220396995544, + "rewards/GIT": 0.3631821498274803, + "rewards/HPSv2": 0.26909828186035156, + "rewards/ORM": 0.7294207215309143, + "self_certainty_semantic": -25.9375, + "self_certainty_token": -22.4375, + "step": 474 + }, + { + "completion_length": 79.234375, + "epoch": 0.5260243632336655, + "grad_norm": 0.5109792947769165, + "kl": 0.044677734375, + "learning_rate": 7.031249999999999e-07, + "loss": -0.01136302575469017, + "reward": 1.844740867614746, + "reward_std": 0.3263453245162964, + "rewards/GDino": 0.7186122834682465, + "rewards/GIT": 0.32664740830659866, + "rewards/HPSv2": 0.2563457489013672, + "rewards/ORM": 0.5431355237960815, + "self_certainty_semantic": -26.1875, + "self_certainty_token": -21.375, + "step": 475 + }, + { + "completion_length": 81.96875, + "epoch": 0.5271317829457365, + "grad_norm": 1.8045639991760254, + "kl": 0.068359375, + "learning_rate": 7.024999999999999e-07, + "loss": 0.004477499169297516, + "reward": 1.9433278441429138, + "reward_std": 0.44146284461021423, + "rewards/GDino": 0.7682976722717285, + "rewards/GIT": 0.0959073156118393, + "rewards/HPSv2": 0.2813148498535156, + "rewards/ORM": 0.797808051109314, + "self_certainty_semantic": -26.0, + "self_certainty_token": -21.9375, + "step": 476 + }, + { + "completion_length": 82.671875, + "epoch": 0.5282392026578073, + "grad_norm": 0.5051248669624329, + "kl": 0.04541015625, + "learning_rate": 7.01875e-07, + "loss": 0.0008793969172984362, + "reward": 2.0551332235336304, + "reward_std": 0.3801504373550415, + "rewards/GDino": 0.7015988528728485, + "rewards/GIT": 0.277570903301239, + "rewards/HPSv2": 0.27320098876953125, + "rewards/ORM": 0.8027623295783997, + "self_certainty_semantic": -26.0, + "self_certainty_token": -22.8125, + "step": 477 + }, + { + "completion_length": 75.34375, + "epoch": 0.5293466223698782, + "grad_norm": 0.46568962931632996, + "kl": 0.0419921875, + "learning_rate": 7.0125e-07, + "loss": 0.004621566738933325, + "reward": 2.723947286605835, + "reward_std": 0.3426435887813568, + "rewards/GDino": 0.9222352504730225, + "rewards/GIT": 0.676825225353241, + "rewards/HPSv2": 0.26899147033691406, + "rewards/ORM": 0.8558952808380127, + "self_certainty_semantic": -25.8125, + "self_certainty_token": -22.1875, + "step": 478 + }, + { + "completion_length": 75.796875, + "epoch": 0.530454042081949, + "grad_norm": 0.5428593158721924, + "kl": 0.02685546875, + "learning_rate": 7.006250000000001e-07, + "loss": -0.009032552363350987, + "reward": 2.1365994215011597, + "reward_std": 0.334768146276474, + "rewards/GDino": 0.7453124821186066, + "rewards/GIT": 0.5772874057292938, + "rewards/HPSv2": 0.26592254638671875, + "rewards/ORM": 0.5480769276618958, + "self_certainty_semantic": -26.0625, + "self_certainty_token": -21.875, + "step": 479 + }, + { + "completion_length": 76.53125, + "epoch": 0.53156146179402, + "grad_norm": 0.37428608536720276, + "kl": 0.03173828125, + "learning_rate": 7e-07, + "loss": -0.0008319772314280272, + "reward": 1.8188861012458801, + "reward_std": 0.35283350944519043, + "rewards/GDino": 0.6008494794368744, + "rewards/GIT": 0.12165512889623642, + "rewards/HPSv2": 0.27954864501953125, + "rewards/ORM": 0.8168328404426575, + "self_certainty_semantic": -26.0, + "self_certainty_token": -22.125, + "step": 480 + }, + { + "completion_length": 85.375, + "epoch": 0.5326688815060908, + "grad_norm": 0.5326999425888062, + "kl": 0.03955078125, + "learning_rate": 6.99375e-07, + "loss": -0.0045750674325972795, + "reward": 2.0283864736557007, + "reward_std": 0.28144776821136475, + "rewards/GDino": 0.6503430604934692, + "rewards/GIT": 0.23870204389095306, + "rewards/HPSv2": 0.2799663543701172, + "rewards/ORM": 0.859375, + "self_certainty_semantic": -26.0, + "self_certainty_token": -21.75, + "step": 481 + }, + { + "completion_length": 81.90625, + "epoch": 0.5337763012181617, + "grad_norm": 0.6752430200576782, + "kl": 0.03057861328125, + "learning_rate": 6.9875e-07, + "loss": -0.0032791628036648035, + "reward": 2.68773090839386, + "reward_std": 0.25259730219841003, + "rewards/GDino": 0.8796875178813934, + "rewards/GIT": 0.7561887502670288, + "rewards/HPSv2": 0.27220916748046875, + "rewards/ORM": 0.7796455323696136, + "self_certainty_semantic": -25.9375, + "self_certainty_token": -23.25, + "step": 482 + }, + { + "completion_length": 80.5625, + "epoch": 0.5348837209302325, + "grad_norm": 0.43474993109703064, + "kl": 0.036865234375, + "learning_rate": 6.98125e-07, + "loss": -0.008354386780411005, + "reward": 2.1858333349227905, + "reward_std": 0.32648710906505585, + "rewards/GDino": 0.796875, + "rewards/GIT": 0.5572735965251923, + "rewards/HPSv2": 0.2603893280029297, + "rewards/ORM": 0.571295440196991, + "self_certainty_semantic": -26.25, + "self_certainty_token": -22.375, + "step": 483 + }, + { + "completion_length": 86.875, + "epoch": 0.5359911406423035, + "grad_norm": 0.6366690397262573, + "kl": 0.0369873046875, + "learning_rate": 6.975e-07, + "loss": -0.0077962810173630714, + "reward": 2.305320680141449, + "reward_std": 0.4473063796758652, + "rewards/GDino": 0.878125011920929, + "rewards/GIT": 0.5372260212898254, + "rewards/HPSv2": 0.26001548767089844, + "rewards/ORM": 0.6299542784690857, + "self_certainty_semantic": -26.0625, + "self_certainty_token": -21.875, + "step": 484 + }, + { + "completion_length": 87.859375, + "epoch": 0.5370985603543743, + "grad_norm": 0.5098552107810974, + "kl": 0.0472412109375, + "learning_rate": 6.96875e-07, + "loss": 0.001916782173793763, + "reward": 2.481204390525818, + "reward_std": 0.25730209797620773, + "rewards/GDino": 0.898908793926239, + "rewards/GIT": 0.6178049445152283, + "rewards/HPSv2": 0.2770404815673828, + "rewards/ORM": 0.6874500215053558, + "self_certainty_semantic": -25.9375, + "self_certainty_token": -22.0, + "step": 485 + }, + { + "completion_length": 95.921875, + "epoch": 0.5382059800664452, + "grad_norm": 0.6449098587036133, + "kl": 0.04168701171875, + "learning_rate": 6.9625e-07, + "loss": 0.002262353547848761, + "reward": 2.033765494823456, + "reward_std": 0.1919030249118805, + "rewards/GDino": 0.7796054184436798, + "rewards/GIT": 0.5904161334037781, + "rewards/HPSv2": 0.2731189727783203, + "rewards/ORM": 0.390625, + "self_certainty_semantic": -26.0625, + "self_certainty_token": -21.0, + "step": 486 + }, + { + "completion_length": 91.28125, + "epoch": 0.5393133997785161, + "grad_norm": 0.48420625925064087, + "kl": 0.023193359375, + "learning_rate": 6.95625e-07, + "loss": 0.0003378549590706825, + "reward": 2.1127328872680664, + "reward_std": 0.1549435406923294, + "rewards/GDino": 0.729057788848877, + "rewards/GIT": 0.6239954233169556, + "rewards/HPSv2": 0.25967979431152344, + "rewards/ORM": 0.5, + "self_certainty_semantic": -26.125, + "self_certainty_token": -22.75, + "step": 487 + }, + { + "completion_length": 90.21875, + "epoch": 0.540420819490587, + "grad_norm": 0.40716660022735596, + "kl": 0.0474853515625, + "learning_rate": 6.949999999999999e-07, + "loss": -0.008702049497514963, + "reward": 1.9484134316444397, + "reward_std": 0.20615732669830322, + "rewards/GDino": 0.757031261920929, + "rewards/GIT": 0.35134750604629517, + "rewards/HPSv2": 0.2849464416503906, + "rewards/ORM": 0.555088147521019, + "self_certainty_semantic": -25.9375, + "self_certainty_token": -22.8125, + "step": 488 + }, + { + "completion_length": 86.109375, + "epoch": 0.5415282392026578, + "grad_norm": 8.682135581970215, + "kl": 0.0513916015625, + "learning_rate": 6.943749999999999e-07, + "loss": 0.011015530675649643, + "reward": 1.966869831085205, + "reward_std": 0.29309169948101044, + "rewards/GDino": 0.7390032708644867, + "rewards/GIT": 0.3247649073600769, + "rewards/HPSv2": 0.26903533935546875, + "rewards/ORM": 0.634066253900528, + "self_certainty_semantic": -26.125, + "self_certainty_token": -21.3125, + "step": 489 + }, + { + "completion_length": 78.78125, + "epoch": 0.5426356589147286, + "grad_norm": 0.5013965964317322, + "kl": 0.0498046875, + "learning_rate": 6.937499999999999e-07, + "loss": 0.00042570335790514946, + "reward": 1.7634241580963135, + "reward_std": 0.3654480427503586, + "rewards/GDino": 0.6242809295654297, + "rewards/GIT": 0.23153883963823318, + "rewards/HPSv2": 0.267730712890625, + "rewards/ORM": 0.6398736834526062, + "self_certainty_semantic": -25.75, + "self_certainty_token": -22.5, + "step": 490 + }, + { + "completion_length": 83.3125, + "epoch": 0.5437430786267996, + "grad_norm": 8.46993637084961, + "kl": 0.0572509765625, + "learning_rate": 6.931249999999999e-07, + "loss": -0.005473015829920769, + "reward": 1.509476125240326, + "reward_std": 0.4046163856983185, + "rewards/GDino": 0.5425868630409241, + "rewards/GIT": 0.1505599021911621, + "rewards/HPSv2": 0.2785205841064453, + "rewards/ORM": 0.537808746099472, + "self_certainty_semantic": -25.8125, + "self_certainty_token": -22.125, + "step": 491 + }, + { + "completion_length": 74.375, + "epoch": 0.5448504983388704, + "grad_norm": 1.0680968761444092, + "kl": 0.18212890625, + "learning_rate": 6.924999999999999e-07, + "loss": -0.007677004672586918, + "reward": 1.7566925287246704, + "reward_std": 0.46585220098495483, + "rewards/GDino": 0.7024907469749451, + "rewards/GIT": 0.27498502284288406, + "rewards/HPSv2": 0.2792167663574219, + "rewards/ORM": 0.5, + "self_certainty_semantic": -25.9375, + "self_certainty_token": -22.4375, + "step": 492 + }, + { + "completion_length": 89.765625, + "epoch": 0.5459579180509413, + "grad_norm": 2.779191017150879, + "kl": 0.04620361328125, + "learning_rate": 6.918749999999999e-07, + "loss": -0.005050210049375892, + "reward": 2.510376453399658, + "reward_std": 0.25831417739391327, + "rewards/GDino": 0.7604166865348816, + "rewards/GIT": 0.5310951620340347, + "rewards/HPSv2": 0.2955150604248047, + "rewards/ORM": 0.9233495891094208, + "self_certainty_semantic": -25.875, + "self_certainty_token": -22.4375, + "step": 493 + }, + { + "completion_length": 87.1875, + "epoch": 0.5470653377630121, + "grad_norm": 0.41609928011894226, + "kl": 0.0230712890625, + "learning_rate": 6.9125e-07, + "loss": -0.003919895680155605, + "reward": 2.7028353214263916, + "reward_std": 0.15342215448617935, + "rewards/GDino": 0.8833333551883698, + "rewards/GIT": 0.6946229934692383, + "rewards/HPSv2": 0.2701873779296875, + "rewards/ORM": 0.8546915352344513, + "self_certainty_semantic": -26.125, + "self_certainty_token": -20.9375, + "step": 494 + }, + { + "completion_length": 100.03125, + "epoch": 0.5481727574750831, + "grad_norm": 0.5169051289558411, + "kl": 0.02117919921875, + "learning_rate": 6.906250000000001e-07, + "loss": 0.01825833600014448, + "reward": 2.146196663379669, + "reward_std": 0.30665238201618195, + "rewards/GDino": 0.8857923150062561, + "rewards/GIT": 0.606977790594101, + "rewards/HPSv2": 0.2580680847167969, + "rewards/ORM": 0.39535844326019287, + "self_certainty_semantic": -25.9375, + "self_certainty_token": -22.1875, + "step": 495 + }, + { + "completion_length": 89.359375, + "epoch": 0.5492801771871539, + "grad_norm": 0.549789309501648, + "kl": 0.03253173828125, + "learning_rate": 6.9e-07, + "loss": -0.0010672432836145163, + "reward": 2.2737563848495483, + "reward_std": 0.26829127222299576, + "rewards/GDino": 0.8921581208705902, + "rewards/GIT": 0.5237478911876678, + "rewards/HPSv2": 0.2749919891357422, + "rewards/ORM": 0.5828584581613541, + "self_certainty_semantic": -25.9375, + "self_certainty_token": -21.3125, + "step": 496 + }, + { + "completion_length": 78.28125, + "epoch": 0.5503875968992248, + "grad_norm": 0.43911007046699524, + "kl": 0.037353515625, + "learning_rate": 6.89375e-07, + "loss": -0.00983960134908557, + "reward": 2.2258400917053223, + "reward_std": 0.26544705033302307, + "rewards/GDino": 0.780023843050003, + "rewards/GIT": 0.34170494228601456, + "rewards/HPSv2": 0.2825450897216797, + "rewards/ORM": 0.821566253900528, + "self_certainty_semantic": -26.0625, + "self_certainty_token": -21.6875, + "step": 497 + }, + { + "completion_length": 83.46875, + "epoch": 0.5514950166112956, + "grad_norm": 0.478037029504776, + "kl": 0.052978515625, + "learning_rate": 6.8875e-07, + "loss": 0.0019646910950541496, + "reward": 2.4899743795394897, + "reward_std": 0.31686463952064514, + "rewards/GDino": 0.8994792103767395, + "rewards/GIT": 0.6088346242904663, + "rewards/HPSv2": 0.2777843475341797, + "rewards/ORM": 0.7038763463497162, + "self_certainty_semantic": -25.75, + "self_certainty_token": -21.8125, + "step": 498 + }, + { + "completion_length": 80.84375, + "epoch": 0.5526024363233666, + "grad_norm": 0.7774894833564758, + "kl": 0.02783203125, + "learning_rate": 6.88125e-07, + "loss": 0.010321385692805052, + "reward": 2.206755518913269, + "reward_std": 0.33102765679359436, + "rewards/GDino": 0.757864236831665, + "rewards/GIT": 0.44185641407966614, + "rewards/HPSv2": 0.276641845703125, + "rewards/ORM": 0.7303928732872009, + "self_certainty_semantic": -26.0, + "self_certainty_token": -21.625, + "step": 499 + }, + { + "completion_length": 89.828125, + "epoch": 0.5537098560354374, + "grad_norm": 0.5191186666488647, + "kl": 0.061767578125, + "learning_rate": 6.875e-07, + "loss": 0.008163958555087447, + "reward": 2.7401968240737915, + "reward_std": 0.15466349571943283, + "rewards/GDino": 0.8918681740760803, + "rewards/GIT": 0.7040586173534393, + "rewards/HPSv2": 0.2982940673828125, + "rewards/ORM": 0.8459759056568146, + "self_certainty_semantic": -26.0625, + "self_certainty_token": -21.5, + "step": 500 + }, + { + "completion_length": 88.546875, + "epoch": 0.5548172757475083, + "grad_norm": 0.504323422908783, + "kl": 0.0311279296875, + "learning_rate": 6.86875e-07, + "loss": 0.006702080834656954, + "reward": 2.1681349873542786, + "reward_std": 0.32211561501026154, + "rewards/GDino": 0.7432291507720947, + "rewards/GIT": 0.6123772263526917, + "rewards/HPSv2": 0.2656536102294922, + "rewards/ORM": 0.5468750298023224, + "self_certainty_semantic": -25.875, + "self_certainty_token": -20.9375, + "step": 501 + }, + { + "completion_length": 98.03125, + "epoch": 0.5559246954595792, + "grad_norm": 1.1771929264068604, + "kl": 0.041015625, + "learning_rate": 6.8625e-07, + "loss": 0.015669922344386578, + "reward": 2.2861039638519287, + "reward_std": 0.1799641251564026, + "rewards/GDino": 0.7887386381626129, + "rewards/GIT": 0.5292036086320877, + "rewards/HPSv2": 0.2833900451660156, + "rewards/ORM": 0.6847716569900513, + "self_certainty_semantic": -26.0, + "self_certainty_token": -22.625, + "step": 502 + }, + { + "completion_length": 87.015625, + "epoch": 0.5570321151716501, + "grad_norm": 0.4914226830005646, + "kl": 0.046142578125, + "learning_rate": 6.85625e-07, + "loss": -0.0016496748430654407, + "reward": 1.7729777693748474, + "reward_std": 0.4089376628398895, + "rewards/GDino": 0.7127688527107239, + "rewards/GIT": 0.2334435135126114, + "rewards/HPSv2": 0.26912879943847656, + "rewards/ORM": 0.5576366782188416, + "self_certainty_semantic": -26.0, + "self_certainty_token": -22.1875, + "step": 503 + }, + { + "completion_length": 81.5, + "epoch": 0.5581395348837209, + "grad_norm": 0.5661796927452087, + "kl": 0.053466796875, + "learning_rate": 6.85e-07, + "loss": 0.0025135306641459465, + "reward": 1.9099263548851013, + "reward_std": 0.16836580261588097, + "rewards/GDino": 0.6708312928676605, + "rewards/GIT": 0.3587403893470764, + "rewards/HPSv2": 0.2674999237060547, + "rewards/ORM": 0.6128546893596649, + "self_certainty_semantic": -25.8125, + "self_certainty_token": -21.8125, + "step": 504 + }, + { + "completion_length": 91.765625, + "epoch": 0.5592469545957918, + "grad_norm": 0.43467390537261963, + "kl": 0.03009033203125, + "learning_rate": 6.843749999999999e-07, + "loss": -0.001930622267536819, + "reward": 1.9086629748344421, + "reward_std": 0.3665133863687515, + "rewards/GDino": 0.69776451587677, + "rewards/GIT": 0.2310735583305359, + "rewards/HPSv2": 0.2817840576171875, + "rewards/ORM": 0.6980409026145935, + "self_certainty_semantic": -26.1875, + "self_certainty_token": -22.3125, + "step": 505 + }, + { + "completion_length": 78.265625, + "epoch": 0.5603543743078627, + "grad_norm": 0.40074095129966736, + "kl": 0.0390625, + "learning_rate": 6.837499999999999e-07, + "loss": 0.005050136474892497, + "reward": 2.1269019842147827, + "reward_std": 0.3213868960738182, + "rewards/GDino": 0.7054687738418579, + "rewards/GIT": 0.5505602061748505, + "rewards/HPSv2": 0.28408241271972656, + "rewards/ORM": 0.5867906212806702, + "self_certainty_semantic": -26.0625, + "self_certainty_token": -21.9375, + "step": 506 + }, + { + "completion_length": 77.765625, + "epoch": 0.5614617940199336, + "grad_norm": 0.7651181817054749, + "kl": 0.0592041015625, + "learning_rate": 6.831249999999999e-07, + "loss": 0.0014199577271938324, + "reward": 2.0831799507141113, + "reward_std": 0.3173952251672745, + "rewards/GDino": 0.7435263097286224, + "rewards/GIT": 0.3333684056997299, + "rewards/HPSv2": 0.2785987854003906, + "rewards/ORM": 0.7276863753795624, + "self_certainty_semantic": -25.875, + "self_certainty_token": -21.875, + "step": 507 + }, + { + "completion_length": 77.78125, + "epoch": 0.5625692137320044, + "grad_norm": 0.6624107956886292, + "kl": 0.0797119140625, + "learning_rate": 6.824999999999999e-07, + "loss": 0.011807034723460674, + "reward": 2.404119551181793, + "reward_std": 0.199082151055336, + "rewards/GDino": 0.784372866153717, + "rewards/GIT": 0.5635663121938705, + "rewards/HPSv2": 0.2940349578857422, + "rewards/ORM": 0.7621453106403351, + "self_certainty_semantic": -26.125, + "self_certainty_token": -21.5625, + "step": 508 + }, + { + "completion_length": 77.0625, + "epoch": 0.5636766334440753, + "grad_norm": 0.4723074734210968, + "kl": 0.06640625, + "learning_rate": 6.818749999999999e-07, + "loss": 0.003355602500960231, + "reward": 2.441024899482727, + "reward_std": 0.27172519266605377, + "rewards/GDino": 0.9154693782329559, + "rewards/GIT": 0.5399381816387177, + "rewards/HPSv2": 0.27901268005371094, + "rewards/ORM": 0.7066046893596649, + "self_certainty_semantic": -26.0625, + "self_certainty_token": -22.9375, + "step": 509 + }, + { + "completion_length": 85.21875, + "epoch": 0.5647840531561462, + "grad_norm": 0.5089316964149475, + "kl": 0.029541015625, + "learning_rate": 6.8125e-07, + "loss": 0.010979706654325128, + "reward": 2.0142000913619995, + "reward_std": 0.38842645287513733, + "rewards/GDino": 0.6785135865211487, + "rewards/GIT": 0.30009591579437256, + "rewards/HPSv2": 0.2801837921142578, + "rewards/ORM": 0.7554067671298981, + "self_certainty_semantic": -26.0, + "self_certainty_token": -22.375, + "step": 510 + }, + { + "completion_length": 90.65625, + "epoch": 0.5658914728682171, + "grad_norm": 4.600841522216797, + "kl": 0.074951171875, + "learning_rate": 6.80625e-07, + "loss": -0.0066879980731755495, + "reward": 1.9691088199615479, + "reward_std": 0.40035973489284515, + "rewards/GDino": 0.675000011920929, + "rewards/GIT": 0.5345175862312317, + "rewards/HPSv2": 0.2616462707519531, + "rewards/ORM": 0.4979449361562729, + "self_certainty_semantic": -25.9375, + "self_certainty_token": -20.8125, + "step": 511 + }, + { + "completion_length": 96.453125, + "epoch": 0.5669988925802879, + "grad_norm": 0.5065847039222717, + "kl": 0.0377197265625, + "learning_rate": 6.800000000000001e-07, + "loss": -0.0035036103799939156, + "reward": 1.905118465423584, + "reward_std": 0.3332052379846573, + "rewards/GDino": 0.6995465755462646, + "rewards/GIT": 0.2819361090660095, + "rewards/HPSv2": 0.26618194580078125, + "rewards/ORM": 0.6574538052082062, + "self_certainty_semantic": -25.9375, + "self_certainty_token": -21.4375, + "step": 512 + }, + { + "completion_length": 87.125, + "epoch": 0.5681063122923588, + "grad_norm": 1.3110729455947876, + "kl": 0.058349609375, + "learning_rate": 6.79375e-07, + "loss": -0.0048042405396699905, + "reward": 2.2505825757980347, + "reward_std": 0.2556573376059532, + "rewards/GDino": 0.8099638223648071, + "rewards/GIT": 0.5112008154392242, + "rewards/HPSv2": 0.2662086486816406, + "rewards/ORM": 0.6632093787193298, + "self_certainty_semantic": -25.875, + "self_certainty_token": -22.375, + "step": 513 + }, + { + "completion_length": 76.5625, + "epoch": 0.5692137320044297, + "grad_norm": 0.6024096608161926, + "kl": 0.052734375, + "learning_rate": 6.7875e-07, + "loss": 0.0006829132325947285, + "reward": 2.126915156841278, + "reward_std": 0.38184772431850433, + "rewards/GDino": 0.7540458142757416, + "rewards/GIT": 0.33178190886974335, + "rewards/HPSv2": 0.2767162322998047, + "rewards/ORM": 0.764371246099472, + "self_certainty_semantic": -26.0625, + "self_certainty_token": -22.3125, + "step": 514 + }, + { + "completion_length": 95.796875, + "epoch": 0.5703211517165006, + "grad_norm": 0.4496208429336548, + "kl": 0.054931640625, + "learning_rate": 6.78125e-07, + "loss": 0.0021472845692187548, + "reward": 1.964326798915863, + "reward_std": 0.3818511813879013, + "rewards/GDino": 0.7132591307163239, + "rewards/GIT": 0.30606234073638916, + "rewards/HPSv2": 0.2753562927246094, + "rewards/ORM": 0.669649064540863, + "self_certainty_semantic": -25.9375, + "self_certainty_token": -22.375, + "step": 515 + }, + { + "completion_length": 84.734375, + "epoch": 0.5714285714285714, + "grad_norm": 0.4120882451534271, + "kl": 0.0426025390625, + "learning_rate": 6.775e-07, + "loss": -0.0064294025069102645, + "reward": 2.275146961212158, + "reward_std": 0.27916960418224335, + "rewards/GDino": 0.8743277788162231, + "rewards/GIT": 0.5724380016326904, + "rewards/HPSv2": 0.2752981185913086, + "rewards/ORM": 0.5530830472707748, + "self_certainty_semantic": -26.0, + "self_certainty_token": -21.8125, + "step": 516 + }, + { + "completion_length": 88.359375, + "epoch": 0.5725359911406424, + "grad_norm": 0.4154956638813019, + "kl": 0.0433349609375, + "learning_rate": 6.76875e-07, + "loss": -0.003980438224971294, + "reward": 1.9036627411842346, + "reward_std": 0.32538601756095886, + "rewards/GDino": 0.6978917419910431, + "rewards/GIT": 0.2857309505343437, + "rewards/HPSv2": 0.2794151306152344, + "rewards/ORM": 0.6406250149011612, + "self_certainty_semantic": -26.1875, + "self_certainty_token": -21.1875, + "step": 517 + }, + { + "completion_length": 87.046875, + "epoch": 0.5736434108527132, + "grad_norm": 0.4812760055065155, + "kl": 0.035888671875, + "learning_rate": 6.7625e-07, + "loss": 0.005491095129400492, + "reward": 2.3508909940719604, + "reward_std": 0.303069606423378, + "rewards/GDino": 0.8647581338882446, + "rewards/GIT": 0.517970398068428, + "rewards/HPSv2": 0.2732505798339844, + "rewards/ORM": 0.694911852478981, + "self_certainty_semantic": -26.0625, + "self_certainty_token": -22.0625, + "step": 518 + }, + { + "completion_length": 91.546875, + "epoch": 0.574750830564784, + "grad_norm": 0.40644869208335876, + "kl": 0.02532958984375, + "learning_rate": 6.75625e-07, + "loss": -0.013489187695086002, + "reward": 2.473989486694336, + "reward_std": 0.27227312326431274, + "rewards/GDino": 0.7588541805744171, + "rewards/GIT": 0.7009795010089874, + "rewards/HPSv2": 0.26723480224609375, + "rewards/ORM": 0.7469209432601929, + "self_certainty_semantic": -26.0625, + "self_certainty_token": -21.875, + "step": 519 + }, + { + "completion_length": 81.5, + "epoch": 0.5758582502768549, + "grad_norm": 0.9652438759803772, + "kl": 0.0491943359375, + "learning_rate": 6.75e-07, + "loss": -4.144746344536543e-05, + "reward": 2.7075356245040894, + "reward_std": 0.1276433803141117, + "rewards/GDino": 0.893750011920929, + "rewards/GIT": 0.6622239053249359, + "rewards/HPSv2": 0.2765617370605469, + "rewards/ORM": 0.875, + "self_certainty_semantic": -26.0625, + "self_certainty_token": -22.125, + "step": 520 + }, + { + "completion_length": 92.75, + "epoch": 0.5769656699889258, + "grad_norm": 0.72523033618927, + "kl": 0.09503173828125, + "learning_rate": 6.743749999999999e-07, + "loss": -0.011197114363312721, + "reward": 2.160744071006775, + "reward_std": 0.24259518086910248, + "rewards/GDino": 0.7633025646209717, + "rewards/GIT": 0.6313807964324951, + "rewards/HPSv2": 0.26360321044921875, + "rewards/ORM": 0.5024575740098953, + "self_certainty_semantic": -26.0625, + "self_certainty_token": -21.25, + "step": 521 + }, + { + "completion_length": 93.8125, + "epoch": 0.5780730897009967, + "grad_norm": 0.44977495074272156, + "kl": 0.03802490234375, + "learning_rate": 6.737499999999999e-07, + "loss": -0.004941140301525593, + "reward": 1.4902530312538147, + "reward_std": 0.4094521999359131, + "rewards/GDino": 0.5036250948905945, + "rewards/GIT": 0.1529332399368286, + "rewards/HPSv2": 0.2778034210205078, + "rewards/ORM": 0.5558912754058838, + "self_certainty_semantic": -25.9375, + "self_certainty_token": -21.875, + "step": 522 + }, + { + "completion_length": 76.75, + "epoch": 0.5791805094130675, + "grad_norm": 0.7043271660804749, + "kl": 0.0584716796875, + "learning_rate": 6.731249999999999e-07, + "loss": 0.009275438962504268, + "reward": 2.0554347038269043, + "reward_std": 0.42759716510772705, + "rewards/GDino": 0.7391814589500427, + "rewards/GIT": 0.3492523282766342, + "rewards/HPSv2": 0.29512596130371094, + "rewards/ORM": 0.671875, + "self_certainty_semantic": -25.9375, + "self_certainty_token": -21.9375, + "step": 523 + }, + { + "completion_length": 86.640625, + "epoch": 0.5802879291251384, + "grad_norm": 0.5973113179206848, + "kl": 0.04345703125, + "learning_rate": 6.724999999999999e-07, + "loss": 0.010272693820297718, + "reward": 2.2810587882995605, + "reward_std": 0.3110383152961731, + "rewards/GDino": 0.7970583140850067, + "rewards/GIT": 0.510815218091011, + "rewards/HPSv2": 0.2897472381591797, + "rewards/ORM": 0.6834379732608795, + "self_certainty_semantic": -25.875, + "self_certainty_token": -22.25, + "step": 524 + }, + { + "completion_length": 82.34375, + "epoch": 0.5813953488372093, + "grad_norm": 0.6339185833930969, + "kl": 0.09228515625, + "learning_rate": 6.718749999999999e-07, + "loss": 0.006851315614767373, + "reward": 1.6905651688575745, + "reward_std": 0.4879930168390274, + "rewards/GDino": 0.6248437464237213, + "rewards/GIT": 0.18320457637310028, + "rewards/HPSv2": 0.2635040283203125, + "rewards/ORM": 0.6190127730369568, + "self_certainty_semantic": -25.9375, + "self_certainty_token": -22.125, + "step": 525 + }, + { + "completion_length": 94.4375, + "epoch": 0.5825027685492802, + "grad_norm": 0.9958269596099854, + "kl": 0.082275390625, + "learning_rate": 6.7125e-07, + "loss": 0.0015276819467544556, + "reward": 2.3374764919281006, + "reward_std": 0.31033264100551605, + "rewards/GDino": 0.7830190360546112, + "rewards/GIT": 0.44163867086172104, + "rewards/HPSv2": 0.28964805603027344, + "rewards/ORM": 0.8231706917285919, + "self_certainty_semantic": -26.0625, + "self_certainty_token": -22.4375, + "step": 526 + }, + { + "completion_length": 84.984375, + "epoch": 0.583610188261351, + "grad_norm": 0.6254690885543823, + "kl": 0.072998046875, + "learning_rate": 6.70625e-07, + "loss": 0.008951655472628772, + "reward": 2.053499758243561, + "reward_std": 0.324885755777359, + "rewards/GDino": 0.8319855034351349, + "rewards/GIT": 0.35926371067762375, + "rewards/HPSv2": 0.27198028564453125, + "rewards/ORM": 0.5902703106403351, + "self_certainty_semantic": -25.9375, + "self_certainty_token": -22.0625, + "step": 527 + }, + { + "completion_length": 78.375, + "epoch": 0.584717607973422, + "grad_norm": 0.43770545721054077, + "kl": 0.03857421875, + "learning_rate": 6.7e-07, + "loss": 5.6162942200899124e-05, + "reward": 2.194950580596924, + "reward_std": 0.24773486703634262, + "rewards/GDino": 0.7339285016059875, + "rewards/GIT": 0.5228977501392365, + "rewards/HPSv2": 0.2709827423095703, + "rewards/ORM": 0.6671415567398071, + "self_certainty_semantic": -26.0, + "self_certainty_token": -22.1875, + "step": 528 + }, + { + "completion_length": 95.125, + "epoch": 0.5858250276854928, + "grad_norm": 0.5197089910507202, + "kl": 0.0491943359375, + "learning_rate": 6.69375e-07, + "loss": -0.007315342780202627, + "reward": 2.323517680168152, + "reward_std": 0.28804992139339447, + "rewards/GDino": 0.7729908227920532, + "rewards/GIT": 0.4792102128267288, + "rewards/HPSv2": 0.26497459411621094, + "rewards/ORM": 0.8063418865203857, + "self_certainty_semantic": -26.0625, + "self_certainty_token": -21.375, + "step": 529 + }, + { + "completion_length": 82.15625, + "epoch": 0.5869324473975637, + "grad_norm": 0.45114606618881226, + "kl": 0.0389404296875, + "learning_rate": 6.6875e-07, + "loss": -0.0003208932466804981, + "reward": 1.992837905883789, + "reward_std": 0.06786906532943249, + "rewards/GDino": 0.6179687678813934, + "rewards/GIT": 0.33767586946487427, + "rewards/HPSv2": 0.28719329833984375, + "rewards/ORM": 0.75, + "self_certainty_semantic": -26.0625, + "self_certainty_token": -21.9375, + "step": 530 + }, + { + "completion_length": 86.25, + "epoch": 0.5880398671096345, + "grad_norm": 1.115891695022583, + "kl": 0.09912109375, + "learning_rate": 6.68125e-07, + "loss": 0.007942123105749488, + "reward": 1.9083644151687622, + "reward_std": 0.3120027482509613, + "rewards/GDino": 0.6493186950683594, + "rewards/GIT": 0.2294761836528778, + "rewards/HPSv2": 0.2951946258544922, + "rewards/ORM": 0.7343749701976776, + "self_certainty_semantic": -26.1875, + "self_certainty_token": -21.5625, + "step": 531 + }, + { + "completion_length": 87.96875, + "epoch": 0.5891472868217055, + "grad_norm": 0.7807652950286865, + "kl": 0.04638671875, + "learning_rate": 6.675e-07, + "loss": -0.002125584869645536, + "reward": 2.6828184127807617, + "reward_std": 0.20224052667617798, + "rewards/GDino": 0.9359375238418579, + "rewards/GIT": 0.6774542331695557, + "rewards/HPSv2": 0.2679481506347656, + "rewards/ORM": 0.8014785647392273, + "self_certainty_semantic": -26.125, + "self_certainty_token": -21.3125, + "step": 532 + }, + { + "completion_length": 79.921875, + "epoch": 0.5902547065337763, + "grad_norm": 0.7299689650535583, + "kl": 0.0576171875, + "learning_rate": 6.66875e-07, + "loss": 0.0013833704870194197, + "reward": 2.4694089889526367, + "reward_std": 0.22859349846839905, + "rewards/GDino": 0.7641555666923523, + "rewards/GIT": 0.4485471844673157, + "rewards/HPSv2": 0.2961692810058594, + "rewards/ORM": 0.9605368673801422, + "self_certainty_semantic": -26.125, + "self_certainty_token": -21.8125, + "step": 533 + }, + { + "completion_length": 92.8125, + "epoch": 0.5913621262458472, + "grad_norm": 0.5747195482254028, + "kl": 0.0699462890625, + "learning_rate": 6.6625e-07, + "loss": 0.0023823545780032873, + "reward": 2.3093079924583435, + "reward_std": 0.31313300132751465, + "rewards/GDino": 0.7953124940395355, + "rewards/GIT": 0.5104830265045166, + "rewards/HPSv2": 0.25672149658203125, + "rewards/ORM": 0.7467910647392273, + "self_certainty_semantic": -25.8125, + "self_certainty_token": -22.3125, + "step": 534 + }, + { + "completion_length": 86.625, + "epoch": 0.592469545957918, + "grad_norm": 0.6912729144096375, + "kl": 0.1064453125, + "learning_rate": 6.65625e-07, + "loss": -0.004116041818633676, + "reward": 2.109492301940918, + "reward_std": 0.2375968098640442, + "rewards/GDino": 0.6330729126930237, + "rewards/GIT": 0.4475910812616348, + "rewards/HPSv2": 0.28356170654296875, + "rewards/ORM": 0.7452665865421295, + "self_certainty_semantic": -25.875, + "self_certainty_token": -21.5, + "step": 535 + }, + { + "completion_length": 88.0625, + "epoch": 0.593576965669989, + "grad_norm": 0.4220062792301178, + "kl": 0.0458984375, + "learning_rate": 6.65e-07, + "loss": -0.0006557479500770569, + "reward": 2.047886908054352, + "reward_std": 0.3713255673646927, + "rewards/GDino": 0.7879342436790466, + "rewards/GIT": 0.4085572734475136, + "rewards/HPSv2": 0.2780036926269531, + "rewards/ORM": 0.5733915567398071, + "self_certainty_semantic": -26.125, + "self_certainty_token": -22.375, + "step": 536 + }, + { + "completion_length": 94.15625, + "epoch": 0.5946843853820598, + "grad_norm": 0.5534602403640747, + "kl": 0.0478515625, + "learning_rate": 6.64375e-07, + "loss": 0.009019973687827587, + "reward": 2.1167913675308228, + "reward_std": 0.33687959611415863, + "rewards/GDino": 0.7530189454555511, + "rewards/GIT": 0.5457257926464081, + "rewards/HPSv2": 0.2676420211791992, + "rewards/ORM": 0.5504046380519867, + "self_certainty_semantic": -26.125, + "self_certainty_token": -21.4375, + "step": 537 + }, + { + "completion_length": 79.234375, + "epoch": 0.5957918050941307, + "grad_norm": 0.6641038060188293, + "kl": 0.0592041015625, + "learning_rate": 6.637499999999999e-07, + "loss": -0.002933267503976822, + "reward": 1.5367609858512878, + "reward_std": 0.306125745177269, + "rewards/GDino": 0.5065802335739136, + "rewards/GIT": 0.11172741651535034, + "rewards/HPSv2": 0.29237937927246094, + "rewards/ORM": 0.6260739862918854, + "self_certainty_semantic": -25.8125, + "self_certainty_token": -21.9375, + "step": 538 + }, + { + "completion_length": 94.546875, + "epoch": 0.5968992248062015, + "grad_norm": 0.4160470962524414, + "kl": 0.0311279296875, + "learning_rate": 6.631249999999999e-07, + "loss": 0.005283329752273858, + "reward": 2.2515803575515747, + "reward_std": 0.320987805724144, + "rewards/GDino": 0.7828125059604645, + "rewards/GIT": 0.600108414888382, + "rewards/HPSv2": 0.2847270965576172, + "rewards/ORM": 0.5839323550462723, + "self_certainty_semantic": -26.0625, + "self_certainty_token": -21.9375, + "step": 539 + }, + { + "completion_length": 87.859375, + "epoch": 0.5980066445182725, + "grad_norm": 0.5893110036849976, + "kl": 0.062744140625, + "learning_rate": 6.624999999999999e-07, + "loss": -0.007368577877059579, + "reward": 2.0375999808311462, + "reward_std": 0.26507988572120667, + "rewards/GDino": 0.7440606653690338, + "rewards/GIT": 0.32250893115997314, + "rewards/HPSv2": 0.2862586975097656, + "rewards/ORM": 0.6847716569900513, + "self_certainty_semantic": -26.0, + "self_certainty_token": -21.8125, + "step": 540 + }, + { + "completion_length": 89.09375, + "epoch": 0.5991140642303433, + "grad_norm": 0.4172793924808502, + "kl": 0.0267333984375, + "learning_rate": 6.618749999999999e-07, + "loss": -0.01182193262502551, + "reward": 1.8913142085075378, + "reward_std": 0.2503943145275116, + "rewards/GDino": 0.7961800992488861, + "rewards/GIT": 0.5071080029010773, + "rewards/HPSv2": 0.26953887939453125, + "rewards/ORM": 0.31848718971014023, + "self_certainty_semantic": -26.0, + "self_certainty_token": -23.25, + "step": 541 + }, + { + "completion_length": 80.90625, + "epoch": 0.6002214839424141, + "grad_norm": 0.42525559663772583, + "kl": 0.03662109375, + "learning_rate": 6.6125e-07, + "loss": 0.012238860595971346, + "reward": 1.9371423721313477, + "reward_std": 0.30036894977092743, + "rewards/GDino": 0.7022413313388824, + "rewards/GIT": 0.28929826617240906, + "rewards/HPSv2": 0.2781105041503906, + "rewards/ORM": 0.6674922555685043, + "self_certainty_semantic": -25.9375, + "self_certainty_token": -21.625, + "step": 542 + }, + { + "completion_length": 85.4375, + "epoch": 0.6013289036544851, + "grad_norm": 0.8508775234222412, + "kl": 0.041259765625, + "learning_rate": 6.60625e-07, + "loss": 0.0013948945561423898, + "reward": 1.8919480443000793, + "reward_std": 0.32225915789604187, + "rewards/GDino": 0.7847888469696045, + "rewards/GIT": 0.3053089827299118, + "rewards/HPSv2": 0.2678718566894531, + "rewards/ORM": 0.5339783430099487, + "self_certainty_semantic": -26.125, + "self_certainty_token": -21.9375, + "step": 543 + }, + { + "completion_length": 89.625, + "epoch": 0.602436323366556, + "grad_norm": 0.39894726872444153, + "kl": 0.03411865234375, + "learning_rate": 6.6e-07, + "loss": 0.005057943519204855, + "reward": 2.560256600379944, + "reward_std": 0.4743453115224838, + "rewards/GDino": 0.8500000238418579, + "rewards/GIT": 0.6654288470745087, + "rewards/HPSv2": 0.25701904296875, + "rewards/ORM": 0.7878087162971497, + "self_certainty_semantic": -26.125, + "self_certainty_token": -22.75, + "step": 544 + }, + { + "completion_length": 85.40625, + "epoch": 0.6035437430786268, + "grad_norm": 0.6981701850891113, + "kl": 0.0286865234375, + "learning_rate": 6.59375e-07, + "loss": 0.003844509134069085, + "reward": 2.205296814441681, + "reward_std": 0.20136026293039322, + "rewards/GDino": 0.7255208492279053, + "rewards/GIT": 0.6519614011049271, + "rewards/HPSv2": 0.27152252197265625, + "rewards/ORM": 0.5562919676303864, + "self_certainty_semantic": -26.0625, + "self_certainty_token": -20.625, + "step": 545 + }, + { + "completion_length": 91.15625, + "epoch": 0.6046511627906976, + "grad_norm": 0.4563887119293213, + "kl": 0.0521240234375, + "learning_rate": 6.587499999999999e-07, + "loss": -0.006130927940830588, + "reward": 1.5699166059494019, + "reward_std": 0.3112071678042412, + "rewards/GDino": 0.6445845067501068, + "rewards/GIT": 0.3175094947218895, + "rewards/HPSv2": 0.2695293426513672, + "rewards/ORM": 0.33829329907894135, + "self_certainty_semantic": -26.0625, + "self_certainty_token": -21.6875, + "step": 546 + }, + { + "completion_length": 84.796875, + "epoch": 0.6057585825027686, + "grad_norm": 0.6241567134857178, + "kl": 0.0640869140625, + "learning_rate": 6.581249999999999e-07, + "loss": -0.0015397676033899188, + "reward": 2.1990838646888733, + "reward_std": 0.46344010531902313, + "rewards/GDino": 0.7604524791240692, + "rewards/GIT": 0.35258936882019043, + "rewards/HPSv2": 0.2892169952392578, + "rewards/ORM": 0.7968250513076782, + "self_certainty_semantic": -25.9375, + "self_certainty_token": -22.75, + "step": 547 + }, + { + "completion_length": 90.765625, + "epoch": 0.6068660022148394, + "grad_norm": 1.061600923538208, + "kl": 0.0372314453125, + "learning_rate": 6.575e-07, + "loss": 0.008668731665238738, + "reward": 2.206001937389374, + "reward_std": 0.27403025329113007, + "rewards/GDino": 0.7100606560707092, + "rewards/GIT": 0.3726941794157028, + "rewards/HPSv2": 0.2794971466064453, + "rewards/ORM": 0.84375, + "self_certainty_semantic": -26.0625, + "self_certainty_token": -22.0, + "step": 548 + }, + { + "completion_length": 78.859375, + "epoch": 0.6079734219269103, + "grad_norm": 0.5541984438896179, + "kl": 0.0540771484375, + "learning_rate": 6.56875e-07, + "loss": -0.010515473783016205, + "reward": 2.3538352251052856, + "reward_std": 0.32379475235939026, + "rewards/GDino": 0.8174417316913605, + "rewards/GIT": 0.5599153190851212, + "rewards/HPSv2": 0.28277015686035156, + "rewards/ORM": 0.6937080323696136, + "self_certainty_semantic": -26.0625, + "self_certainty_token": -22.25, + "step": 549 + }, + { + "completion_length": 84.734375, + "epoch": 0.6090808416389811, + "grad_norm": 0.5860046744346619, + "kl": 0.0697021484375, + "learning_rate": 6.5625e-07, + "loss": -0.004955586977303028, + "reward": 2.4616087675094604, + "reward_std": 0.2783765345811844, + "rewards/GDino": 0.796875, + "rewards/GIT": 0.45293718576431274, + "rewards/HPSv2": 0.27902984619140625, + "rewards/ORM": 0.9327665269374847, + "self_certainty_semantic": -26.0625, + "self_certainty_token": -21.625, + "step": 550 + }, + { + "completion_length": 92.1875, + "epoch": 0.6101882613510521, + "grad_norm": 1.0782103538513184, + "kl": 0.093994140625, + "learning_rate": 6.55625e-07, + "loss": 0.016621893271803856, + "reward": 2.214840531349182, + "reward_std": 0.4911406636238098, + "rewards/GDino": 0.8352398872375488, + "rewards/GIT": 0.41603805869817734, + "rewards/HPSv2": 0.2964210510253906, + "rewards/ORM": 0.6671415567398071, + "self_certainty_semantic": -26.0625, + "self_certainty_token": -23.5625, + "step": 551 + }, + { + "completion_length": 85.578125, + "epoch": 0.6112956810631229, + "grad_norm": 0.432542085647583, + "kl": 0.1011962890625, + "learning_rate": 6.55e-07, + "loss": 0.0025141866644844413, + "reward": 2.256702184677124, + "reward_std": 0.29623982310295105, + "rewards/GDino": 0.8192631006240845, + "rewards/GIT": 0.5296911746263504, + "rewards/HPSv2": 0.28748130798339844, + "rewards/ORM": 0.6202665567398071, + "self_certainty_semantic": -25.9375, + "self_certainty_token": -22.375, + "step": 552 + }, + { + "completion_length": 84.9375, + "epoch": 0.6124031007751938, + "grad_norm": 0.6483874917030334, + "kl": 0.0374755859375, + "learning_rate": 6.54375e-07, + "loss": -0.0038166206795722246, + "reward": 1.9581068754196167, + "reward_std": 0.33014117181301117, + "rewards/GDino": 0.6503263413906097, + "rewards/GIT": 0.2168239951133728, + "rewards/HPSv2": 0.2891273498535156, + "rewards/ORM": 0.8018293082714081, + "self_certainty_semantic": -26.0625, + "self_certainty_token": -21.625, + "step": 553 + }, + { + "completion_length": 84.421875, + "epoch": 0.6135105204872646, + "grad_norm": 0.5570255517959595, + "kl": 0.0426025390625, + "learning_rate": 6.5375e-07, + "loss": 0.00738596566952765, + "reward": 1.8431515097618103, + "reward_std": 0.37364087998867035, + "rewards/GDino": 0.6967909634113312, + "rewards/GIT": 0.22302396595478058, + "rewards/HPSv2": 0.287445068359375, + "rewards/ORM": 0.6358915567398071, + "self_certainty_semantic": -25.9375, + "self_certainty_token": -23.125, + "step": 554 + }, + { + "completion_length": 77.640625, + "epoch": 0.6146179401993356, + "grad_norm": 0.475253701210022, + "kl": 0.035400390625, + "learning_rate": 6.531249999999999e-07, + "loss": 0.0036281460197642446, + "reward": 1.683140754699707, + "reward_std": 0.47642695903778076, + "rewards/GDino": 0.6110014021396637, + "rewards/GIT": 0.29766006767749786, + "rewards/HPSv2": 0.27327537536621094, + "rewards/ORM": 0.501203790307045, + "self_certainty_semantic": -25.75, + "self_certainty_token": -22.8125, + "step": 555 + }, + { + "completion_length": 88.78125, + "epoch": 0.6157253599114064, + "grad_norm": 4.969493865966797, + "kl": 0.0921630859375, + "learning_rate": 6.524999999999999e-07, + "loss": 0.002552055288106203, + "reward": 2.559138059616089, + "reward_std": 0.35178404301404953, + "rewards/GDino": 0.8671875, + "rewards/GIT": 0.5760512053966522, + "rewards/HPSv2": 0.2577781677246094, + "rewards/ORM": 0.858121246099472, + "self_certainty_semantic": -26.125, + "self_certainty_token": -20.8125, + "step": 556 + }, + { + "completion_length": 82.140625, + "epoch": 0.6168327796234773, + "grad_norm": 0.6524704694747925, + "kl": 0.040771484375, + "learning_rate": 6.51875e-07, + "loss": -0.011345499660819769, + "reward": 1.991989016532898, + "reward_std": 0.44549626111984253, + "rewards/GDino": 0.8012259900569916, + "rewards/GIT": 0.2607608772814274, + "rewards/HPSv2": 0.2759389877319336, + "rewards/ORM": 0.6540631651878357, + "self_certainty_semantic": -25.8125, + "self_certainty_token": -21.125, + "step": 557 + }, + { + "completion_length": 91.5625, + "epoch": 0.6179401993355482, + "grad_norm": 0.5774140954017639, + "kl": 0.0523681640625, + "learning_rate": 6.5125e-07, + "loss": 0.0027473480440676212, + "reward": 1.6934953927993774, + "reward_std": 0.32764413952827454, + "rewards/GDino": 0.6785344779491425, + "rewards/GIT": 0.28618551790714264, + "rewards/HPSv2": 0.2818584442138672, + "rewards/ORM": 0.44691696763038635, + "self_certainty_semantic": -26.0625, + "self_certainty_token": -22.375, + "step": 558 + }, + { + "completion_length": 81.828125, + "epoch": 0.6190476190476191, + "grad_norm": 0.7214290499687195, + "kl": 0.0400390625, + "learning_rate": 6.50625e-07, + "loss": 0.0006662113592028618, + "reward": 2.187069296836853, + "reward_std": 0.23079095780849457, + "rewards/GDino": 0.7869532406330109, + "rewards/GIT": 0.3952046409249306, + "rewards/HPSv2": 0.2861614227294922, + "rewards/ORM": 0.71875, + "self_certainty_semantic": -26.0625, + "self_certainty_token": -22.25, + "step": 559 + }, + { + "completion_length": 79.109375, + "epoch": 0.6201550387596899, + "grad_norm": 0.8327608704566956, + "kl": 0.06396484375, + "learning_rate": 6.5e-07, + "loss": 0.005268956534564495, + "reward": 1.4079316854476929, + "reward_std": 0.4312395751476288, + "rewards/GDino": 0.565241128206253, + "rewards/GIT": 0.30078378319740295, + "rewards/HPSv2": 0.2668647766113281, + "rewards/ORM": 0.27504195272922516, + "self_certainty_semantic": -25.9375, + "self_certainty_token": -21.875, + "step": 560 + }, + { + "completion_length": 84.5, + "epoch": 0.6212624584717608, + "grad_norm": 0.7923188209533691, + "kl": 0.083740234375, + "learning_rate": 6.49375e-07, + "loss": 0.002008352486882359, + "reward": 1.9170043468475342, + "reward_std": 0.40060944855213165, + "rewards/GDino": 0.6858661472797394, + "rewards/GIT": 0.32867125421762466, + "rewards/HPSv2": 0.28693389892578125, + "rewards/ORM": 0.6155331134796143, + "self_certainty_semantic": -26.0625, + "self_certainty_token": -21.6875, + "step": 561 + }, + { + "completion_length": 79.6875, + "epoch": 0.6223698781838317, + "grad_norm": 1.3077260255813599, + "kl": 0.05670166015625, + "learning_rate": 6.4875e-07, + "loss": -0.009227622300386429, + "reward": 2.106406509876251, + "reward_std": 0.335910826921463, + "rewards/GDino": 0.8199536800384521, + "rewards/GIT": 0.42982835322618484, + "rewards/HPSv2": 0.2717609405517578, + "rewards/ORM": 0.5848635137081146, + "self_certainty_semantic": -26.1875, + "self_certainty_token": -21.3125, + "step": 562 + }, + { + "completion_length": 80.171875, + "epoch": 0.6234772978959026, + "grad_norm": 0.5299350619316101, + "kl": 0.07373046875, + "learning_rate": 6.481249999999999e-07, + "loss": -0.005908059189096093, + "reward": 2.262375831604004, + "reward_std": 0.3717510998249054, + "rewards/GDino": 0.859304666519165, + "rewards/GIT": 0.48717375099658966, + "rewards/HPSv2": 0.27237510681152344, + "rewards/ORM": 0.6435223519802094, + "self_certainty_semantic": -26.0625, + "self_certainty_token": -22.0625, + "step": 563 + }, + { + "completion_length": 94.015625, + "epoch": 0.6245847176079734, + "grad_norm": 0.6650445461273193, + "kl": 0.04644775390625, + "learning_rate": 6.474999999999999e-07, + "loss": -0.015223762020468712, + "reward": 2.4286820888519287, + "reward_std": 0.1945188194513321, + "rewards/GDino": 0.880752831697464, + "rewards/GIT": 0.42761753499507904, + "rewards/HPSv2": 0.2921867370605469, + "rewards/ORM": 0.828125, + "self_certainty_semantic": -26.25, + "self_certainty_token": -22.1875, + "step": 564 + }, + { + "completion_length": 79.71875, + "epoch": 0.6256921373200443, + "grad_norm": 0.502757728099823, + "kl": 0.0584716796875, + "learning_rate": 6.46875e-07, + "loss": -0.0028280543629080057, + "reward": 1.7525089979171753, + "reward_std": 0.3420318365097046, + "rewards/GDino": 0.6839133501052856, + "rewards/GIT": 0.09051510691642761, + "rewards/HPSv2": 0.2742042541503906, + "rewards/ORM": 0.7038763463497162, + "self_certainty_semantic": -26.1875, + "self_certainty_token": -22.25, + "step": 565 + }, + { + "completion_length": 82.484375, + "epoch": 0.6267995570321152, + "grad_norm": 0.4326266050338745, + "kl": 0.0440673828125, + "learning_rate": 6.4625e-07, + "loss": -0.0022651476319879293, + "reward": 1.9366894364356995, + "reward_std": 0.2688908018171787, + "rewards/GDino": 0.7198602259159088, + "rewards/GIT": 0.3348108232021332, + "rewards/HPSv2": 0.276123046875, + "rewards/ORM": 0.6058952808380127, + "self_certainty_semantic": -26.0625, + "self_certainty_token": -22.6875, + "step": 566 + }, + { + "completion_length": 85.9375, + "epoch": 0.627906976744186, + "grad_norm": 1.1396733522415161, + "kl": 0.0379638671875, + "learning_rate": 6.45625e-07, + "loss": 0.00432520592585206, + "reward": 2.3249639868736267, + "reward_std": 0.43076588213443756, + "rewards/GDino": 0.7687499523162842, + "rewards/GIT": 0.5119449347257614, + "rewards/HPSv2": 0.2824745178222656, + "rewards/ORM": 0.7617946267127991, + "self_certainty_semantic": -26.125, + "self_certainty_token": -22.3125, + "step": 567 + }, + { + "completion_length": 84.234375, + "epoch": 0.6290143964562569, + "grad_norm": 0.6943775415420532, + "kl": 0.042236328125, + "learning_rate": 6.45e-07, + "loss": 0.014131693402305245, + "reward": 1.9375189542770386, + "reward_std": 0.2767423838376999, + "rewards/GDino": 0.7555198967456818, + "rewards/GIT": 0.4167647063732147, + "rewards/HPSv2": 0.27898406982421875, + "rewards/ORM": 0.48625022172927856, + "self_certainty_semantic": -25.75, + "self_certainty_token": -21.0, + "step": 568 + }, + { + "completion_length": 83.734375, + "epoch": 0.6301218161683277, + "grad_norm": 0.5247687697410583, + "kl": 0.0458984375, + "learning_rate": 6.44375e-07, + "loss": -0.0024925548350438476, + "reward": 2.4779163599014282, + "reward_std": 0.2677724361419678, + "rewards/GDino": 0.8284472227096558, + "rewards/GIT": 0.6041100770235062, + "rewards/HPSv2": 0.28594207763671875, + "rewards/ORM": 0.7594169676303864, + "self_certainty_semantic": -26.125, + "self_certainty_token": -21.5625, + "step": 569 + }, + { + "completion_length": 89.484375, + "epoch": 0.6312292358803987, + "grad_norm": 0.8421794772148132, + "kl": 0.04107666015625, + "learning_rate": 6.4375e-07, + "loss": -0.00400165724568069, + "reward": 1.952150285243988, + "reward_std": 0.27587440609931946, + "rewards/GDino": 0.7627604305744171, + "rewards/GIT": 0.5746227502822876, + "rewards/HPSv2": 0.27101707458496094, + "rewards/ORM": 0.34375, + "self_certainty_semantic": -26.0, + "self_certainty_token": -21.875, + "step": 570 + }, + { + "completion_length": 87.25, + "epoch": 0.6323366555924695, + "grad_norm": 0.486047625541687, + "kl": 0.04345703125, + "learning_rate": 6.431249999999999e-07, + "loss": -0.01550593227148056, + "reward": 2.606563448905945, + "reward_std": 0.2680031508207321, + "rewards/GDino": 0.9061827063560486, + "rewards/GIT": 0.674159437417984, + "rewards/HPSv2": 0.2789497375488281, + "rewards/ORM": 0.7472716569900513, + "self_certainty_semantic": -26.25, + "self_certainty_token": -22.5625, + "step": 571 + }, + { + "completion_length": 92.5, + "epoch": 0.6334440753045404, + "grad_norm": 0.5024401545524597, + "kl": 0.02978515625, + "learning_rate": 6.424999999999999e-07, + "loss": 0.008753098547458649, + "reward": 2.340450704097748, + "reward_std": 0.2569350227713585, + "rewards/GDino": 0.7218749821186066, + "rewards/GIT": 0.5201355963945389, + "rewards/HPSv2": 0.27031517028808594, + "rewards/ORM": 0.828125, + "self_certainty_semantic": -25.9375, + "self_certainty_token": -22.25, + "step": 572 + }, + { + "completion_length": 83.46875, + "epoch": 0.6345514950166113, + "grad_norm": 0.4357914924621582, + "kl": 0.060302734375, + "learning_rate": 6.41875e-07, + "loss": -0.004691538168117404, + "reward": 2.274852752685547, + "reward_std": 0.3174278810620308, + "rewards/GDino": 0.831250011920929, + "rewards/GIT": 0.5334950983524323, + "rewards/HPSv2": 0.2726116180419922, + "rewards/ORM": 0.6374960094690323, + "self_certainty_semantic": -25.875, + "self_certainty_token": -21.4375, + "step": 573 + }, + { + "completion_length": 90.78125, + "epoch": 0.6356589147286822, + "grad_norm": 0.8525876998901367, + "kl": 0.067138671875, + "learning_rate": 6.4125e-07, + "loss": -0.0007205374422483146, + "reward": 2.1640985012054443, + "reward_std": 0.34151124954223633, + "rewards/GDino": 0.6809311807155609, + "rewards/GIT": 0.46346134692430496, + "rewards/HPSv2": 0.27318572998046875, + "rewards/ORM": 0.7465203106403351, + "self_certainty_semantic": -26.0, + "self_certainty_token": -21.6875, + "step": 574 + }, + { + "completion_length": 79.78125, + "epoch": 0.636766334440753, + "grad_norm": 0.685072124004364, + "kl": 0.05609130859375, + "learning_rate": 6.40625e-07, + "loss": 0.007571035530418158, + "reward": 2.712357759475708, + "reward_std": 0.19092348963022232, + "rewards/GDino": 0.9664917588233948, + "rewards/GIT": 0.7502661943435669, + "rewards/HPSv2": 0.26122474670410156, + "rewards/ORM": 0.734375, + "self_certainty_semantic": -26.0, + "self_certainty_token": -21.5, + "step": 575 + }, + { + "completion_length": 87.65625, + "epoch": 0.6378737541528239, + "grad_norm": 0.5097111463546753, + "kl": 0.0457763671875, + "learning_rate": 6.4e-07, + "loss": 0.0013417699374258518, + "reward": 2.00359308719635, + "reward_std": 0.33664438128471375, + "rewards/GDino": 0.7250159382820129, + "rewards/GIT": 0.32381221652030945, + "rewards/HPSv2": 0.28449440002441406, + "rewards/ORM": 0.670270562171936, + "self_certainty_semantic": -26.0625, + "self_certainty_token": -21.75, + "step": 576 + }, + { + "completion_length": 86.453125, + "epoch": 0.6389811738648948, + "grad_norm": 0.48539066314697266, + "kl": 0.0482177734375, + "learning_rate": 6.39375e-07, + "loss": -0.007292540278285742, + "reward": 1.8100631833076477, + "reward_std": 0.3537253439426422, + "rewards/GDino": 0.6475874781608582, + "rewards/GIT": 0.2818602919578552, + "rewards/HPSv2": 0.26976585388183594, + "rewards/ORM": 0.6108495593070984, + "self_certainty_semantic": -25.875, + "self_certainty_token": -21.4375, + "step": 577 + }, + { + "completion_length": 77.34375, + "epoch": 0.6400885935769657, + "grad_norm": 0.5300226211547852, + "kl": 0.0638427734375, + "learning_rate": 6.3875e-07, + "loss": 0.0006619710475206375, + "reward": 2.1816067695617676, + "reward_std": 0.24885310232639313, + "rewards/GDino": 0.7344487607479095, + "rewards/GIT": 0.44169679284095764, + "rewards/HPSv2": 0.27399444580078125, + "rewards/ORM": 0.7314668297767639, + "self_certainty_semantic": -25.875, + "self_certainty_token": -22.6875, + "step": 578 + }, + { + "completion_length": 86.90625, + "epoch": 0.6411960132890365, + "grad_norm": 1.2398159503936768, + "kl": 0.0491943359375, + "learning_rate": 6.38125e-07, + "loss": 0.013081982266157866, + "reward": 2.242226243019104, + "reward_std": 0.33919021487236023, + "rewards/GDino": 0.7380208373069763, + "rewards/GIT": 0.5399396270513535, + "rewards/HPSv2": 0.2767658233642578, + "rewards/ORM": 0.6875, + "self_certainty_semantic": -25.875, + "self_certainty_token": -22.1875, + "step": 579 + }, + { + "completion_length": 82.578125, + "epoch": 0.6423034330011074, + "grad_norm": 0.47190362215042114, + "kl": 0.052490234375, + "learning_rate": 6.374999999999999e-07, + "loss": -0.0009382915450260043, + "reward": 2.110963821411133, + "reward_std": 0.23535766452550888, + "rewards/GDino": 0.7357682585716248, + "rewards/GIT": 0.42082274705171585, + "rewards/HPSv2": 0.2749061584472656, + "rewards/ORM": 0.6794666647911072, + "self_certainty_semantic": -25.9375, + "self_certainty_token": -21.9375, + "step": 580 + }, + { + "completion_length": 83.609375, + "epoch": 0.6434108527131783, + "grad_norm": 0.5913388729095459, + "kl": 0.0322265625, + "learning_rate": 6.368749999999999e-07, + "loss": -0.003956899279728532, + "reward": 2.055175304412842, + "reward_std": 0.31813955307006836, + "rewards/GDino": 0.6616897583007812, + "rewards/GIT": 0.46293869614601135, + "rewards/HPSv2": 0.2770252227783203, + "rewards/ORM": 0.6535216271877289, + "self_certainty_semantic": -26.0625, + "self_certainty_token": -21.875, + "step": 581 + }, + { + "completion_length": 87.203125, + "epoch": 0.6445182724252492, + "grad_norm": 0.44226452708244324, + "kl": 0.05224609375, + "learning_rate": 6.362499999999999e-07, + "loss": -0.003679857822135091, + "reward": 2.4167869091033936, + "reward_std": 0.35033319890499115, + "rewards/GDino": 0.8650817573070526, + "rewards/GIT": 0.5176351368427277, + "rewards/HPSv2": 0.28732872009277344, + "rewards/ORM": 0.7467411458492279, + "self_certainty_semantic": -26.0, + "self_certainty_token": -21.8125, + "step": 582 + }, + { + "completion_length": 92.4375, + "epoch": 0.64562569213732, + "grad_norm": 0.5307609438896179, + "kl": 0.033203125, + "learning_rate": 6.35625e-07, + "loss": -0.004225422162562609, + "reward": 1.872029721736908, + "reward_std": 0.2955891862511635, + "rewards/GDino": 0.6934732794761658, + "rewards/GIT": 0.3931538015604019, + "rewards/HPSv2": 0.2833976745605469, + "rewards/ORM": 0.5020050927996635, + "self_certainty_semantic": -26.0, + "self_certainty_token": -21.75, + "step": 583 + }, + { + "completion_length": 88.65625, + "epoch": 0.646733111849391, + "grad_norm": 1.7252110242843628, + "kl": 0.0882568359375, + "learning_rate": 6.35e-07, + "loss": -0.0022117127664387226, + "reward": 1.9745519161224365, + "reward_std": 0.38848157227039337, + "rewards/GDino": 0.6942708194255829, + "rewards/GIT": 0.33099639415740967, + "rewards/HPSv2": 0.2804088592529297, + "rewards/ORM": 0.6688758432865143, + "self_certainty_semantic": -26.125, + "self_certainty_token": -22.0, + "step": 584 + }, + { + "completion_length": 97.8125, + "epoch": 0.6478405315614618, + "grad_norm": 0.5100652575492859, + "kl": 0.0596923828125, + "learning_rate": 6.34375e-07, + "loss": 0.008778128772974014, + "reward": 1.9344983100891113, + "reward_std": 0.3392782509326935, + "rewards/GDino": 0.6828030347824097, + "rewards/GIT": 0.21010816097259521, + "rewards/HPSv2": 0.29284095764160156, + "rewards/ORM": 0.7487462162971497, + "self_certainty_semantic": -26.125, + "self_certainty_token": -21.0625, + "step": 585 + }, + { + "completion_length": 85.53125, + "epoch": 0.6489479512735327, + "grad_norm": 0.45188453793525696, + "kl": 0.051513671875, + "learning_rate": 6.3375e-07, + "loss": 0.0005252244882285595, + "reward": 2.5051465034484863, + "reward_std": 0.28443823754787445, + "rewards/GDino": 0.8956741094589233, + "rewards/GIT": 0.5864372551441193, + "rewards/HPSv2": 0.27303504943847656, + "rewards/ORM": 0.75, + "self_certainty_semantic": -26.0625, + "self_certainty_token": -23.25, + "step": 586 + }, + { + "completion_length": 89.25, + "epoch": 0.6500553709856035, + "grad_norm": 0.6196621656417847, + "kl": 0.0416259765625, + "learning_rate": 6.33125e-07, + "loss": 0.013191667851060629, + "reward": 2.3972784280776978, + "reward_std": 0.28022703528404236, + "rewards/GDino": 0.8989353477954865, + "rewards/GIT": 0.47387292981147766, + "rewards/HPSv2": 0.2728157043457031, + "rewards/ORM": 0.7516543865203857, + "self_certainty_semantic": -26.1875, + "self_certainty_token": -22.875, + "step": 587 + }, + { + "completion_length": 91.203125, + "epoch": 0.6511627906976745, + "grad_norm": 0.7645460367202759, + "kl": 0.0732421875, + "learning_rate": 6.324999999999999e-07, + "loss": 0.004564342787489295, + "reward": 2.506924331188202, + "reward_std": 0.22133537381887436, + "rewards/GDino": 0.7828124761581421, + "rewards/GIT": 0.555272288620472, + "rewards/HPSv2": 0.27508544921875, + "rewards/ORM": 0.8937540054321289, + "self_certainty_semantic": -26.25, + "self_certainty_token": -21.5625, + "step": 588 + }, + { + "completion_length": 89.1875, + "epoch": 0.6522702104097453, + "grad_norm": 1.2343937158584595, + "kl": 0.0782470703125, + "learning_rate": 6.31875e-07, + "loss": -0.0024571301182731986, + "reward": 2.39216947555542, + "reward_std": 0.20161084830760956, + "rewards/GDino": 0.836245059967041, + "rewards/GIT": 0.5851792842149734, + "rewards/HPSv2": 0.2832450866699219, + "rewards/ORM": 0.6875000149011612, + "self_certainty_semantic": -26.25, + "self_certainty_token": -21.875, + "step": 589 + }, + { + "completion_length": 89.515625, + "epoch": 0.6533776301218162, + "grad_norm": 0.4632999300956726, + "kl": 0.0506591796875, + "learning_rate": 6.3125e-07, + "loss": -0.0008371882140636444, + "reward": 2.4015121459960938, + "reward_std": 0.29084639251232147, + "rewards/GDino": 0.8374806642532349, + "rewards/GIT": 0.4534076303243637, + "rewards/HPSv2": 0.2889385223388672, + "rewards/ORM": 0.8216853141784668, + "self_certainty_semantic": -26.0625, + "self_certainty_token": -21.625, + "step": 590 + }, + { + "completion_length": 90.96875, + "epoch": 0.654485049833887, + "grad_norm": 0.7144702076911926, + "kl": 0.0919189453125, + "learning_rate": 6.30625e-07, + "loss": 0.00751919113099575, + "reward": 2.3566391468048096, + "reward_std": 0.2903212755918503, + "rewards/GDino": 0.813281238079071, + "rewards/GIT": 0.6010564863681793, + "rewards/HPSv2": 0.2751598358154297, + "rewards/ORM": 0.6671415567398071, + "self_certainty_semantic": -25.9375, + "self_certainty_token": -22.6875, + "step": 591 + }, + { + "completion_length": 89.640625, + "epoch": 0.655592469545958, + "grad_norm": 0.5201144814491272, + "kl": 0.0362548828125, + "learning_rate": 6.3e-07, + "loss": -0.0002889030147343874, + "reward": 1.9802441596984863, + "reward_std": 0.25242312252521515, + "rewards/GDino": 0.7437500059604645, + "rewards/GIT": 0.460840679705143, + "rewards/HPSv2": 0.28507041931152344, + "rewards/ORM": 0.49058300256729126, + "self_certainty_semantic": -25.9375, + "self_certainty_token": -22.9375, + "step": 592 + }, + { + "completion_length": 86.78125, + "epoch": 0.6566998892580288, + "grad_norm": 1.2016290426254272, + "kl": 0.03131103515625, + "learning_rate": 6.29375e-07, + "loss": 0.005808655638247728, + "reward": 2.4578179121017456, + "reward_std": 0.2993794307112694, + "rewards/GDino": 0.8857253789901733, + "rewards/GIT": 0.61774942278862, + "rewards/HPSv2": 0.2781352996826172, + "rewards/ORM": 0.6762078106403351, + "self_certainty_semantic": -26.0, + "self_certainty_token": -21.6875, + "step": 593 + }, + { + "completion_length": 82.1875, + "epoch": 0.6578073089700996, + "grad_norm": 0.5428626537322998, + "kl": 0.044677734375, + "learning_rate": 6.2875e-07, + "loss": 0.0007634558714926243, + "reward": 2.1941815614700317, + "reward_std": 0.26104915142059326, + "rewards/GDino": 0.8031250536441803, + "rewards/GIT": 0.46738259494304657, + "rewards/HPSv2": 0.2811737060546875, + "rewards/ORM": 0.6425002217292786, + "self_certainty_semantic": -26.125, + "self_certainty_token": -22.0, + "step": 594 + }, + { + "completion_length": 90.328125, + "epoch": 0.6589147286821705, + "grad_norm": 1.1422066688537598, + "kl": 0.0782470703125, + "learning_rate": 6.28125e-07, + "loss": -0.0011294980067759752, + "reward": 1.889723300933838, + "reward_std": 0.3280980587005615, + "rewards/GDino": 0.6966421604156494, + "rewards/GIT": 0.21704591810703278, + "rewards/HPSv2": 0.29947662353515625, + "rewards/ORM": 0.6765585243701935, + "self_certainty_semantic": -26.1875, + "self_certainty_token": -21.1875, + "step": 595 + }, + { + "completion_length": 90.734375, + "epoch": 0.6600221483942414, + "grad_norm": 0.4109950363636017, + "kl": 0.03240966796875, + "learning_rate": 6.274999999999999e-07, + "loss": 0.006392950075678527, + "reward": 2.395110607147217, + "reward_std": 0.22606350481510162, + "rewards/GDino": 0.822717010974884, + "rewards/GIT": 0.5249864757061005, + "rewards/HPSv2": 0.2739696502685547, + "rewards/ORM": 0.7734375, + "self_certainty_semantic": -26.125, + "self_certainty_token": -22.375, + "step": 596 + }, + { + "completion_length": 80.046875, + "epoch": 0.6611295681063123, + "grad_norm": 0.6307306289672852, + "kl": 0.051025390625, + "learning_rate": 6.268749999999999e-07, + "loss": -0.009424115298315883, + "reward": 1.9029319286346436, + "reward_std": 0.48359400033950806, + "rewards/GDino": 0.7328821122646332, + "rewards/GIT": 0.2205829918384552, + "rewards/HPSv2": 0.29880332946777344, + "rewards/ORM": 0.6506634652614594, + "self_certainty_semantic": -26.0, + "self_certainty_token": -22.0, + "step": 597 + }, + { + "completion_length": 90.453125, + "epoch": 0.6622369878183831, + "grad_norm": 0.5556089878082275, + "kl": 0.03564453125, + "learning_rate": 6.262499999999999e-07, + "loss": -0.0023141358979046345, + "reward": 2.6856178045272827, + "reward_std": 0.1517586186528206, + "rewards/GDino": 0.8005208373069763, + "rewards/GIT": 0.745490312576294, + "rewards/HPSv2": 0.2646064758300781, + "rewards/ORM": 0.875, + "self_certainty_semantic": -25.9375, + "self_certainty_token": -22.375, + "step": 598 + }, + { + "completion_length": 94.25, + "epoch": 0.6633444075304541, + "grad_norm": 0.799374520778656, + "kl": 0.0625, + "learning_rate": 6.256249999999999e-07, + "loss": -0.0034456118009984493, + "reward": 2.145764470100403, + "reward_std": 0.19438766688108444, + "rewards/GDino": 0.8263446092605591, + "rewards/GIT": 0.4010432958602905, + "rewards/HPSv2": 0.2715435028076172, + "rewards/ORM": 0.646833062171936, + "self_certainty_semantic": -26.25, + "self_certainty_token": -22.625, + "step": 599 + }, + { + "completion_length": 83.90625, + "epoch": 0.6644518272425249, + "grad_norm": 0.4506126046180725, + "kl": 0.0699462890625, + "learning_rate": 6.249999999999999e-07, + "loss": 0.009309231070801616, + "reward": 2.2850306034088135, + "reward_std": 0.3040882647037506, + "rewards/GDino": 0.8235052525997162, + "rewards/GIT": 0.3513979911804199, + "rewards/HPSv2": 0.28486061096191406, + "rewards/ORM": 0.8252668082714081, + "self_certainty_semantic": -26.0625, + "self_certainty_token": -22.3125, + "step": 600 + }, + { + "completion_length": 91.296875, + "epoch": 0.6655592469545958, + "grad_norm": 1.332478642463684, + "kl": 0.091064453125, + "learning_rate": 6.24375e-07, + "loss": 0.0034302781568840146, + "reward": 2.148199498653412, + "reward_std": 0.3696499466896057, + "rewards/GDino": 0.7866345643997192, + "rewards/GIT": 0.5184628069400787, + "rewards/HPSv2": 0.2872905731201172, + "rewards/ORM": 0.555811420083046, + "self_certainty_semantic": -26.0, + "self_certainty_token": -22.375, + "step": 601 + }, + { + "completion_length": 92.40625, + "epoch": 0.6666666666666666, + "grad_norm": 0.46582087874412537, + "kl": 0.05322265625, + "learning_rate": 6.2375e-07, + "loss": -0.0017802356742322445, + "reward": 2.2475972175598145, + "reward_std": 0.3318783938884735, + "rewards/GDino": 0.7598356008529663, + "rewards/GIT": 0.3386750742793083, + "rewards/HPSv2": 0.2866325378417969, + "rewards/ORM": 0.8624540567398071, + "self_certainty_semantic": -26.1875, + "self_certainty_token": -21.9375, + "step": 602 + }, + { + "completion_length": 90.875, + "epoch": 0.6677740863787376, + "grad_norm": 1.0787734985351562, + "kl": 0.0360107421875, + "learning_rate": 6.23125e-07, + "loss": -0.011680092196911573, + "reward": 2.3878101110458374, + "reward_std": 0.33506520837545395, + "rewards/GDino": 0.8369791209697723, + "rewards/GIT": 0.5997322201728821, + "rewards/HPSv2": 0.26525306701660156, + "rewards/ORM": 0.6858456134796143, + "self_certainty_semantic": -26.0625, + "self_certainty_token": -22.0, + "step": 603 + }, + { + "completion_length": 91.234375, + "epoch": 0.6688815060908084, + "grad_norm": 0.6582486033439636, + "kl": 0.0450439453125, + "learning_rate": 6.225000000000001e-07, + "loss": 0.007074265740811825, + "reward": 2.60383677482605, + "reward_std": 0.3062824010848999, + "rewards/GDino": 0.8807291388511658, + "rewards/GIT": 0.7336847484111786, + "rewards/HPSv2": 0.2575054168701172, + "rewards/ORM": 0.7319174408912659, + "self_certainty_semantic": -26.1875, + "self_certainty_token": -20.625, + "step": 604 + }, + { + "completion_length": 78.734375, + "epoch": 0.6699889258028793, + "grad_norm": 0.7976247668266296, + "kl": 0.0537109375, + "learning_rate": 6.21875e-07, + "loss": 0.004971811547875404, + "reward": 2.385424852371216, + "reward_std": 0.23453293181955814, + "rewards/GDino": 0.8042107224464417, + "rewards/GIT": 0.5736879110336304, + "rewards/HPSv2": 0.2902507781982422, + "rewards/ORM": 0.7172753810882568, + "self_certainty_semantic": -25.9375, + "self_certainty_token": -22.3125, + "step": 605 + }, + { + "completion_length": 81.40625, + "epoch": 0.6710963455149501, + "grad_norm": 1.5215224027633667, + "kl": 0.090087890625, + "learning_rate": 6.2125e-07, + "loss": 0.005752496188506484, + "reward": 1.9925259351730347, + "reward_std": 0.27252747118473053, + "rewards/GDino": 0.8082012832164764, + "rewards/GIT": 0.4314420223236084, + "rewards/HPSv2": 0.2672538757324219, + "rewards/ORM": 0.48562876880168915, + "self_certainty_semantic": -26.0625, + "self_certainty_token": -22.0625, + "step": 606 + }, + { + "completion_length": 96.25, + "epoch": 0.6722037652270211, + "grad_norm": 0.46339181065559387, + "kl": 0.039306640625, + "learning_rate": 6.20625e-07, + "loss": 0.0020465159323066473, + "reward": 2.6440305709838867, + "reward_std": 0.200993612408638, + "rewards/GDino": 0.8648234605789185, + "rewards/GIT": 0.8019833564758301, + "rewards/HPSv2": 0.28316497802734375, + "rewards/ORM": 0.694058746099472, + "self_certainty_semantic": -26.0625, + "self_certainty_token": -22.125, + "step": 607 + }, + { + "completion_length": 83.25, + "epoch": 0.6733111849390919, + "grad_norm": 0.5913336277008057, + "kl": 0.0364990234375, + "learning_rate": 6.2e-07, + "loss": 0.0073931904044002295, + "reward": 2.083902657032013, + "reward_std": 0.3354155719280243, + "rewards/GDino": 0.7391253411769867, + "rewards/GIT": 0.30637510120868683, + "rewards/HPSv2": 0.2884521484375, + "rewards/ORM": 0.7499500811100006, + "self_certainty_semantic": -26.0625, + "self_certainty_token": -22.75, + "step": 608 + }, + { + "completion_length": 75.296875, + "epoch": 0.6744186046511628, + "grad_norm": 0.53142911195755, + "kl": 0.0523681640625, + "learning_rate": 6.19375e-07, + "loss": 0.01127101352903992, + "reward": 2.790825366973877, + "reward_std": 0.2711338773369789, + "rewards/GDino": 0.901562511920929, + "rewards/GIT": 0.7290374040603638, + "rewards/HPSv2": 0.2899589538574219, + "rewards/ORM": 0.8702665567398071, + "self_certainty_semantic": -26.0625, + "self_certainty_token": -22.5, + "step": 609 + }, + { + "completion_length": 90.65625, + "epoch": 0.6755260243632336, + "grad_norm": 0.5681328773498535, + "kl": 0.060546875, + "learning_rate": 6.1875e-07, + "loss": 0.005634253611788154, + "reward": 1.8878324031829834, + "reward_std": 0.43909256905317307, + "rewards/GDino": 0.7042554318904877, + "rewards/GIT": 0.22821517288684845, + "rewards/HPSv2": 0.2720928192138672, + "rewards/ORM": 0.6832689344882965, + "self_certainty_semantic": -26.1875, + "self_certainty_token": -22.3125, + "step": 610 + }, + { + "completion_length": 83.375, + "epoch": 0.6766334440753046, + "grad_norm": 0.6791914701461792, + "kl": 0.0673828125, + "learning_rate": 6.18125e-07, + "loss": -0.004875717218965292, + "reward": 2.2724517583847046, + "reward_std": 0.32523632049560547, + "rewards/GDino": 0.7335502207279205, + "rewards/GIT": 0.3345900699496269, + "rewards/HPSv2": 0.2824363708496094, + "rewards/ORM": 0.921875, + "self_certainty_semantic": -26.0625, + "self_certainty_token": -22.4375, + "step": 611 + }, + { + "completion_length": 98.765625, + "epoch": 0.6777408637873754, + "grad_norm": 0.6255651116371155, + "kl": 0.067138671875, + "learning_rate": 6.175e-07, + "loss": -0.006845309631898999, + "reward": 2.2029558420181274, + "reward_std": 0.28774645924568176, + "rewards/GDino": 0.779369056224823, + "rewards/GIT": 0.5393540412187576, + "rewards/HPSv2": 0.28432464599609375, + "rewards/ORM": 0.5999081134796143, + "self_certainty_semantic": -26.3125, + "self_certainty_token": -21.4375, + "step": 612 + }, + { + "completion_length": 96.921875, + "epoch": 0.6788482834994463, + "grad_norm": 0.47570472955703735, + "kl": 0.0482177734375, + "learning_rate": 6.168749999999999e-07, + "loss": -0.01025804365053773, + "reward": 2.656336784362793, + "reward_std": 0.10137559846043587, + "rewards/GDino": 0.9705497622489929, + "rewards/GIT": 0.7900106310844421, + "rewards/HPSv2": 0.2801933288574219, + "rewards/ORM": 0.615583062171936, + "self_certainty_semantic": -26.1875, + "self_certainty_token": -22.125, + "step": 613 + }, + { + "completion_length": 89.59375, + "epoch": 0.6799557032115172, + "grad_norm": 0.46403968334198, + "kl": 0.02880859375, + "learning_rate": 6.162499999999999e-07, + "loss": -0.0035571649204939604, + "reward": 2.3885494470596313, + "reward_std": 0.24163620918989182, + "rewards/GDino": 0.8179687559604645, + "rewards/GIT": 0.5839528441429138, + "rewards/HPSv2": 0.2670745849609375, + "rewards/ORM": 0.7195532023906708, + "self_certainty_semantic": -26.0, + "self_certainty_token": -22.5, + "step": 614 + }, + { + "completion_length": 77.859375, + "epoch": 0.6810631229235881, + "grad_norm": 0.722669780254364, + "kl": 0.0537109375, + "learning_rate": 6.156249999999999e-07, + "loss": 0.0036343755200505257, + "reward": 2.0100202560424805, + "reward_std": 0.31795068085193634, + "rewards/GDino": 0.8412032127380371, + "rewards/GIT": 0.30803602933883667, + "rewards/HPSv2": 0.2885932922363281, + "rewards/ORM": 0.572187751531601, + "self_certainty_semantic": -26.125, + "self_certainty_token": -22.375, + "step": 615 + }, + { + "completion_length": 88.96875, + "epoch": 0.6821705426356589, + "grad_norm": 1.019282341003418, + "kl": 0.0684814453125, + "learning_rate": 6.149999999999999e-07, + "loss": -0.008072856580838561, + "reward": 1.6745604872703552, + "reward_std": 0.4092772305011749, + "rewards/GDino": 0.5860664546489716, + "rewards/GIT": 0.1839316338300705, + "rewards/HPSv2": 0.261932373046875, + "rewards/ORM": 0.6426301002502441, + "self_certainty_semantic": -25.875, + "self_certainty_token": -22.125, + "step": 616 + }, + { + "completion_length": 91.0625, + "epoch": 0.6832779623477298, + "grad_norm": 0.6552587747573853, + "kl": 0.05078125, + "learning_rate": 6.143749999999999e-07, + "loss": -0.003226218745112419, + "reward": 2.455517292022705, + "reward_std": 0.21070951223373413, + "rewards/GDino": 0.8002007901668549, + "rewards/GIT": 0.460027776658535, + "rewards/HPSv2": 0.2917671203613281, + "rewards/ORM": 0.9035216569900513, + "self_certainty_semantic": -26.0625, + "self_certainty_token": -22.125, + "step": 617 + }, + { + "completion_length": 98.921875, + "epoch": 0.6843853820598007, + "grad_norm": 0.4279713034629822, + "kl": 0.0482177734375, + "learning_rate": 6.1375e-07, + "loss": -0.003949227626435459, + "reward": 2.228174865245819, + "reward_std": 0.2691446468234062, + "rewards/GDino": 0.789654016494751, + "rewards/GIT": 0.5287028551101685, + "rewards/HPSv2": 0.2726726531982422, + "rewards/ORM": 0.6371453106403351, + "self_certainty_semantic": -26.0625, + "self_certainty_token": -21.9375, + "step": 618 + }, + { + "completion_length": 90.09375, + "epoch": 0.6854928017718716, + "grad_norm": 0.6847978830337524, + "kl": 0.0457763671875, + "learning_rate": 6.13125e-07, + "loss": 0.01479427795857191, + "reward": 2.0829684734344482, + "reward_std": 0.34162892401218414, + "rewards/GDino": 0.7051222920417786, + "rewards/GIT": 0.4035489559173584, + "rewards/HPSv2": 0.28554344177246094, + "rewards/ORM": 0.688753753900528, + "self_certainty_semantic": -25.9375, + "self_certainty_token": -22.6875, + "step": 619 + }, + { + "completion_length": 101.65625, + "epoch": 0.6866002214839424, + "grad_norm": 1.227352261543274, + "kl": 0.029541015625, + "learning_rate": 6.125000000000001e-07, + "loss": -0.001065580639988184, + "reward": 2.6353999376296997, + "reward_std": 0.23471395671367645, + "rewards/GDino": 0.8964560925960541, + "rewards/GIT": 0.6640630960464478, + "rewards/HPSv2": 0.27800559997558594, + "rewards/ORM": 0.796875, + "self_certainty_semantic": -26.0625, + "self_certainty_token": -22.5625, + "step": 620 + }, + { + "completion_length": 87.25, + "epoch": 0.6877076411960132, + "grad_norm": 0.6171965003013611, + "kl": 0.068115234375, + "learning_rate": 6.11875e-07, + "loss": 0.013384459540247917, + "reward": 1.891997218132019, + "reward_std": 0.26739221066236496, + "rewards/GDino": 0.6377556174993515, + "rewards/GIT": 0.22970262169837952, + "rewards/HPSv2": 0.3026599884033203, + "rewards/ORM": 0.7218790054321289, + "self_certainty_semantic": -26.125, + "self_certainty_token": -22.1875, + "step": 621 + }, + { + "completion_length": 82.453125, + "epoch": 0.6888150609080842, + "grad_norm": 1.6894267797470093, + "kl": 0.13427734375, + "learning_rate": 6.1125e-07, + "loss": -0.004799117799848318, + "reward": 2.2798895835876465, + "reward_std": 0.31005166471004486, + "rewards/GDino": 0.8250000476837158, + "rewards/GIT": 0.44419097900390625, + "rewards/HPSv2": 0.2922992706298828, + "rewards/ORM": 0.7183992862701416, + "self_certainty_semantic": -26.125, + "self_certainty_token": -22.9375, + "step": 622 + }, + { + "completion_length": 103.578125, + "epoch": 0.689922480620155, + "grad_norm": 0.4514748156070709, + "kl": 0.03399658203125, + "learning_rate": 6.10625e-07, + "loss": 0.001322310883551836, + "reward": 2.572785496711731, + "reward_std": 0.27341755479574203, + "rewards/GDino": 0.8939887881278992, + "rewards/GIT": 0.7880756258964539, + "rewards/HPSv2": 0.27813720703125, + "rewards/ORM": 0.6125839352607727, + "self_certainty_semantic": -26.1875, + "self_certainty_token": -21.0625, + "step": 623 + }, + { + "completion_length": 87.90625, + "epoch": 0.6910299003322259, + "grad_norm": 0.4110758304595947, + "kl": 0.0892333984375, + "learning_rate": 6.1e-07, + "loss": 0.007087853271514177, + "reward": 1.9042915105819702, + "reward_std": 0.32226330786943436, + "rewards/GDino": 0.6263091564178467, + "rewards/GIT": 0.35193467140197754, + "rewards/HPSv2": 0.2724761962890625, + "rewards/ORM": 0.6535715758800507, + "self_certainty_semantic": -26.0, + "self_certainty_token": -23.0, + "step": 624 + }, + { + "completion_length": 84.421875, + "epoch": 0.6921373200442967, + "grad_norm": 0.6037589311599731, + "kl": 0.036376953125, + "learning_rate": 6.09375e-07, + "loss": -0.007811628980562091, + "reward": 2.3651981353759766, + "reward_std": 0.3844504654407501, + "rewards/GDino": 0.7807291448116302, + "rewards/GIT": 0.5242247134447098, + "rewards/HPSv2": 0.2703285217285156, + "rewards/ORM": 0.7899156212806702, + "self_certainty_semantic": -25.9375, + "self_certainty_token": -21.8125, + "step": 625 + }, + { + "completion_length": 86.265625, + "epoch": 0.6932447397563677, + "grad_norm": 2.0771775245666504, + "kl": 0.0914306640625, + "learning_rate": 6.0875e-07, + "loss": 0.007114154053851962, + "reward": 1.6251343488693237, + "reward_std": 0.19571924954652786, + "rewards/GDino": 0.6897631585597992, + "rewards/GIT": 0.22550922632217407, + "rewards/HPSv2": 0.2786998748779297, + "rewards/ORM": 0.4311620742082596, + "self_certainty_semantic": -26.0625, + "self_certainty_token": -21.875, + "step": 626 + }, + { + "completion_length": 90.390625, + "epoch": 0.6943521594684385, + "grad_norm": 0.4613959789276123, + "kl": 0.0413818359375, + "learning_rate": 6.08125e-07, + "loss": 0.00543033005669713, + "reward": 2.0488085746765137, + "reward_std": 0.251141682267189, + "rewards/GDino": 0.7159949243068695, + "rewards/GIT": 0.41834934055805206, + "rewards/HPSv2": 0.2758445739746094, + "rewards/ORM": 0.6386198699474335, + "self_certainty_semantic": -25.875, + "self_certainty_token": -21.5625, + "step": 627 + }, + { + "completion_length": 88.6875, + "epoch": 0.6954595791805094, + "grad_norm": 0.4723363518714905, + "kl": 0.0428466796875, + "learning_rate": 6.075e-07, + "loss": -0.007554895128123462, + "reward": 2.4085551500320435, + "reward_std": 0.3689408004283905, + "rewards/GDino": 0.7526527941226959, + "rewards/GIT": 0.5323520451784134, + "rewards/HPSv2": 0.2742137908935547, + "rewards/ORM": 0.8493366241455078, + "self_certainty_semantic": -26.1875, + "self_certainty_token": -22.4375, + "step": 628 + }, + { + "completion_length": 81.25, + "epoch": 0.6965669988925803, + "grad_norm": 0.478963166475296, + "kl": 0.026611328125, + "learning_rate": 6.06875e-07, + "loss": 0.0004908200353384018, + "reward": 2.441234588623047, + "reward_std": 0.31722216308116913, + "rewards/GDino": 0.866549015045166, + "rewards/GIT": 0.6477379202842712, + "rewards/HPSv2": 0.28163909912109375, + "rewards/ORM": 0.6453084945678711, + "self_certainty_semantic": -26.0, + "self_certainty_token": -22.0625, + "step": 629 + }, + { + "completion_length": 86.515625, + "epoch": 0.6976744186046512, + "grad_norm": 0.4405619204044342, + "kl": 0.043701171875, + "learning_rate": 6.062499999999999e-07, + "loss": 0.0011022216640412807, + "reward": 2.03541362285614, + "reward_std": 0.1785994917154312, + "rewards/GDino": 0.7722096741199493, + "rewards/GIT": 0.3379756510257721, + "rewards/HPSv2": 0.2831287384033203, + "rewards/ORM": 0.6420996189117432, + "self_certainty_semantic": -26.125, + "self_certainty_token": -21.6875, + "step": 630 + }, + { + "completion_length": 88.828125, + "epoch": 0.698781838316722, + "grad_norm": 1.2109594345092773, + "kl": 0.0855712890625, + "learning_rate": 6.056249999999999e-07, + "loss": -0.0023857366759330034, + "reward": 2.27842253446579, + "reward_std": 0.3590303510427475, + "rewards/GDino": 0.8026753067970276, + "rewards/GIT": 0.3442394435405731, + "rewards/HPSv2": 0.29401588439941406, + "rewards/ORM": 0.8374919891357422, + "self_certainty_semantic": -26.125, + "self_certainty_token": -22.4375, + "step": 631 + }, + { + "completion_length": 84.703125, + "epoch": 0.6998892580287929, + "grad_norm": 0.66162109375, + "kl": 0.041259765625, + "learning_rate": 6.049999999999999e-07, + "loss": -0.0012856603134423494, + "reward": 1.8185372352600098, + "reward_std": 0.287272110581398, + "rewards/GDino": 0.713621973991394, + "rewards/GIT": 0.3100564107298851, + "rewards/HPSv2": 0.28852081298828125, + "rewards/ORM": 0.5063379108905792, + "self_certainty_semantic": -26.0625, + "self_certainty_token": -22.8125, + "step": 632 + }, + { + "completion_length": 85.59375, + "epoch": 0.7009966777408638, + "grad_norm": 0.4788581132888794, + "kl": 0.0352783203125, + "learning_rate": 6.043749999999999e-07, + "loss": 0.0007031826535239816, + "reward": 1.9657421708106995, + "reward_std": 0.4616681933403015, + "rewards/GDino": 0.7202981412410736, + "rewards/GIT": 0.39410945773124695, + "rewards/HPSv2": 0.27789306640625, + "rewards/ORM": 0.5734414756298065, + "self_certainty_semantic": -26.1875, + "self_certainty_token": -22.5625, + "step": 633 + }, + { + "completion_length": 102.0, + "epoch": 0.7021040974529347, + "grad_norm": 1.0284152030944824, + "kl": 0.0772705078125, + "learning_rate": 6.037499999999999e-07, + "loss": 0.014419443905353546, + "reward": 2.3417510986328125, + "reward_std": 0.33683477342128754, + "rewards/GDino": 0.8628135919570923, + "rewards/GIT": 0.5719771087169647, + "rewards/HPSv2": 0.2858924865722656, + "rewards/ORM": 0.6210678070783615, + "self_certainty_semantic": -26.125, + "self_certainty_token": -22.5625, + "step": 634 + }, + { + "completion_length": 87.1875, + "epoch": 0.7032115171650055, + "grad_norm": 0.9903973340988159, + "kl": 0.05126953125, + "learning_rate": 6.031249999999999e-07, + "loss": 0.002168428327422589, + "reward": 1.907868504524231, + "reward_std": 0.35160693526268005, + "rewards/GDino": 0.6744791865348816, + "rewards/GIT": 0.33536963164806366, + "rewards/HPSv2": 0.2734203338623047, + "rewards/ORM": 0.6245993375778198, + "self_certainty_semantic": -25.6875, + "self_certainty_token": -21.6875, + "step": 635 + }, + { + "completion_length": 88.375, + "epoch": 0.7043189368770764, + "grad_norm": 0.5040203928947449, + "kl": 0.038330078125, + "learning_rate": 6.025000000000001e-07, + "loss": -0.006203518947586417, + "reward": 1.7218048572540283, + "reward_std": 0.1598360240459442, + "rewards/GDino": 0.6406362056732178, + "rewards/GIT": 0.43583452701568604, + "rewards/HPSv2": 0.26958274841308594, + "rewards/ORM": 0.3757513463497162, + "self_certainty_semantic": -26.0625, + "self_certainty_token": -20.875, + "step": 636 + }, + { + "completion_length": 99.03125, + "epoch": 0.7054263565891473, + "grad_norm": 0.8950693607330322, + "kl": 0.0357666015625, + "learning_rate": 6.018750000000001e-07, + "loss": 0.0047954951878637075, + "reward": 2.144571542739868, + "reward_std": 0.18925148993730545, + "rewards/GDino": 0.7445312440395355, + "rewards/GIT": 0.5563637316226959, + "rewards/HPSv2": 0.28626060485839844, + "rewards/ORM": 0.5574158728122711, + "self_certainty_semantic": -26.1875, + "self_certainty_token": -23.0, + "step": 637 + }, + { + "completion_length": 89.390625, + "epoch": 0.7065337763012182, + "grad_norm": 1.9158759117126465, + "kl": 0.059814453125, + "learning_rate": 6.0125e-07, + "loss": -0.005621961550787091, + "reward": 1.7748108506202698, + "reward_std": 0.31341874599456787, + "rewards/GDino": 0.6069283038377762, + "rewards/GIT": 0.23655302077531815, + "rewards/HPSv2": 0.2688713073730469, + "rewards/ORM": 0.6624580323696136, + "self_certainty_semantic": -26.125, + "self_certainty_token": -22.0625, + "step": 638 + }, + { + "completion_length": 102.375, + "epoch": 0.707641196013289, + "grad_norm": 0.559564471244812, + "kl": 0.0560302734375, + "learning_rate": 6.00625e-07, + "loss": -0.0025928550167009234, + "reward": 2.1050148606300354, + "reward_std": 0.374173641204834, + "rewards/GDino": 0.6609375178813934, + "rewards/GIT": 0.5544036030769348, + "rewards/HPSv2": 0.2807903289794922, + "rewards/ORM": 0.6088833510875702, + "self_certainty_semantic": -26.1875, + "self_certainty_token": -21.0625, + "step": 639 + }, + { + "completion_length": 93.921875, + "epoch": 0.70874861572536, + "grad_norm": 18.474855422973633, + "kl": 0.043701171875, + "learning_rate": 6e-07, + "loss": -0.006503427983261645, + "reward": 2.3460196256637573, + "reward_std": 0.21795154362916946, + "rewards/GDino": 0.8889912366867065, + "rewards/GIT": 0.44075681269168854, + "rewards/HPSv2": 0.28970909118652344, + "rewards/ORM": 0.7265625298023224, + "self_certainty_semantic": -26.125, + "self_certainty_token": -21.1875, + "step": 640 + }, + { + "completion_length": 84.34375, + "epoch": 0.7098560354374308, + "grad_norm": 0.4927503168582916, + "kl": 0.03375244140625, + "learning_rate": 5.99375e-07, + "loss": -0.0028927952516824007, + "reward": 2.3492602109909058, + "reward_std": 0.18391503393650055, + "rewards/GDino": 0.7559124827384949, + "rewards/GIT": 0.46299484372138977, + "rewards/HPSv2": 0.28339385986328125, + "rewards/ORM": 0.8469589054584503, + "self_certainty_semantic": -26.1875, + "self_certainty_token": -21.8125, + "step": 641 + }, + { + "completion_length": 86.796875, + "epoch": 0.7109634551495017, + "grad_norm": 0.5018269419670105, + "kl": 0.05908203125, + "learning_rate": 5.9875e-07, + "loss": 0.0004683835431933403, + "reward": 2.1911041736602783, + "reward_std": 0.25724170356988907, + "rewards/GDino": 0.7239583134651184, + "rewards/GIT": 0.43615947663784027, + "rewards/HPSv2": 0.26648521423339844, + "rewards/ORM": 0.7645010948181152, + "self_certainty_semantic": -26.125, + "self_certainty_token": -21.0625, + "step": 642 + }, + { + "completion_length": 93.09375, + "epoch": 0.7120708748615725, + "grad_norm": 0.4329701066017151, + "kl": 0.0357666015625, + "learning_rate": 5.98125e-07, + "loss": 0.008556351996958256, + "reward": 1.7531054019927979, + "reward_std": 0.31839413940906525, + "rewards/GDino": 0.7208990454673767, + "rewards/GIT": 0.22525273263454437, + "rewards/HPSv2": 0.2837867736816406, + "rewards/ORM": 0.5231667011976242, + "self_certainty_semantic": -26.1875, + "self_certainty_token": -22.6875, + "step": 643 + }, + { + "completion_length": 91.78125, + "epoch": 0.7131782945736435, + "grad_norm": 0.7012612223625183, + "kl": 0.0526123046875, + "learning_rate": 5.975e-07, + "loss": -0.0026120016700588167, + "reward": 2.1655226349830627, + "reward_std": 0.17083656042814255, + "rewards/GDino": 0.6968750059604645, + "rewards/GIT": 0.46584388613700867, + "rewards/HPSv2": 0.2684288024902344, + "rewards/ORM": 0.734375, + "self_certainty_semantic": -26.0, + "self_certainty_token": -21.1875, + "step": 644 + }, + { + "completion_length": 99.6875, + "epoch": 0.7142857142857143, + "grad_norm": 0.48661601543426514, + "kl": 0.05084228515625, + "learning_rate": 5.96875e-07, + "loss": -0.007944982033222914, + "reward": 2.1803592443466187, + "reward_std": 0.33926379680633545, + "rewards/GDino": 0.8054166734218597, + "rewards/GIT": 0.5937389731407166, + "rewards/HPSv2": 0.25525856018066406, + "rewards/ORM": 0.5259450227022171, + "self_certainty_semantic": -26.0, + "self_certainty_token": -21.75, + "step": 645 + }, + { + "completion_length": 90.390625, + "epoch": 0.7153931339977851, + "grad_norm": 0.4743451178073883, + "kl": 0.0416259765625, + "learning_rate": 5.962499999999999e-07, + "loss": -0.0012699789367616177, + "reward": 1.9671456813812256, + "reward_std": 0.36079831421375275, + "rewards/GDino": 0.7288096249103546, + "rewards/GIT": 0.471624955534935, + "rewards/HPSv2": 0.27112388610839844, + "rewards/ORM": 0.4955873116850853, + "self_certainty_semantic": -25.875, + "self_certainty_token": -22.0, + "step": 646 + }, + { + "completion_length": 92.0, + "epoch": 0.716500553709856, + "grad_norm": 0.5281607508659363, + "kl": 0.038330078125, + "learning_rate": 5.956249999999999e-07, + "loss": -0.0008692734409123659, + "reward": 1.9515712261199951, + "reward_std": 0.32006649672985077, + "rewards/GDino": 0.7853873670101166, + "rewards/GIT": 0.3109753280878067, + "rewards/HPSv2": 0.2876243591308594, + "rewards/ORM": 0.5675841569900513, + "self_certainty_semantic": -26.0625, + "self_certainty_token": -22.875, + "step": 647 + }, + { + "completion_length": 80.3125, + "epoch": 0.717607973421927, + "grad_norm": 0.5372815132141113, + "kl": 0.046875, + "learning_rate": 5.949999999999999e-07, + "loss": 3.8998667150735855e-05, + "reward": 2.462640166282654, + "reward_std": 0.31203581392765045, + "rewards/GDino": 0.8565733134746552, + "rewards/GIT": 0.5576457232236862, + "rewards/HPSv2": 0.2857952117919922, + "rewards/ORM": 0.7626258730888367, + "self_certainty_semantic": -26.3125, + "self_certainty_token": -23.5, + "step": 648 + }, + { + "completion_length": 85.828125, + "epoch": 0.7187153931339978, + "grad_norm": 0.4850459396839142, + "kl": 0.0391845703125, + "learning_rate": 5.943749999999999e-07, + "loss": -0.0029084045672789216, + "reward": 2.1491920948028564, + "reward_std": 0.38251978158950806, + "rewards/GDino": 0.7852162718772888, + "rewards/GIT": 0.22060729563236237, + "rewards/HPSv2": 0.288726806640625, + "rewards/ORM": 0.8546415567398071, + "self_certainty_semantic": -26.125, + "self_certainty_token": -22.0, + "step": 649 + }, + { + "completion_length": 78.75, + "epoch": 0.7198228128460686, + "grad_norm": 0.5087283253669739, + "kl": 0.06396484375, + "learning_rate": 5.937499999999999e-07, + "loss": -0.0033320622169412673, + "reward": 2.029821515083313, + "reward_std": 0.4616316854953766, + "rewards/GDino": 0.7132953703403473, + "rewards/GIT": 0.21734549850225449, + "rewards/HPSv2": 0.2928886413574219, + "rewards/ORM": 0.806291937828064, + "self_certainty_semantic": -26.1875, + "self_certainty_token": -22.8125, + "step": 650 + }, + { + "completion_length": 99.296875, + "epoch": 0.7209302325581395, + "grad_norm": 0.49115225672721863, + "kl": 0.0416259765625, + "learning_rate": 5.93125e-07, + "loss": 0.00135955517180264, + "reward": 1.96121746301651, + "reward_std": 0.3768788278102875, + "rewards/GDino": 0.6516396403312683, + "rewards/GIT": 0.42501816153526306, + "rewards/HPSv2": 0.27518463134765625, + "rewards/ORM": 0.6093750149011612, + "self_certainty_semantic": -26.1875, + "self_certainty_token": -21.5625, + "step": 651 + }, + { + "completion_length": 87.703125, + "epoch": 0.7220376522702104, + "grad_norm": 1.3749279975891113, + "kl": 0.0721435546875, + "learning_rate": 5.925e-07, + "loss": -0.0019788409117609262, + "reward": 2.5001988410949707, + "reward_std": 0.28632427006959915, + "rewards/GDino": 0.85992431640625, + "rewards/GIT": 0.515458732843399, + "rewards/HPSv2": 0.27004432678222656, + "rewards/ORM": 0.8547714054584503, + "self_certainty_semantic": -26.125, + "self_certainty_token": -21.5625, + "step": 652 + }, + { + "completion_length": 77.625, + "epoch": 0.7231450719822813, + "grad_norm": 0.5144050717353821, + "kl": 0.039306640625, + "learning_rate": 5.91875e-07, + "loss": -5.321597564034164e-05, + "reward": 1.9586353302001953, + "reward_std": 0.28463270515203476, + "rewards/GDino": 0.7114152014255524, + "rewards/GIT": 0.3402389585971832, + "rewards/HPSv2": 0.2871971130371094, + "rewards/ORM": 0.6197840571403503, + "self_certainty_semantic": -26.125, + "self_certainty_token": -22.5, + "step": 653 + }, + { + "completion_length": 89.640625, + "epoch": 0.7242524916943521, + "grad_norm": 0.7271938920021057, + "kl": 0.0478515625, + "learning_rate": 5.912500000000001e-07, + "loss": 0.009882757207378745, + "reward": 2.0267902612686157, + "reward_std": 0.4139002412557602, + "rewards/GDino": 0.7397133708000183, + "rewards/GIT": 0.21871892362833023, + "rewards/HPSv2": 0.2781715393066406, + "rewards/ORM": 0.7901863753795624, + "self_certainty_semantic": -26.0625, + "self_certainty_token": -21.375, + "step": 654 + }, + { + "completion_length": 89.859375, + "epoch": 0.7253599114064231, + "grad_norm": 1.4332215785980225, + "kl": 0.07794189453125, + "learning_rate": 5.90625e-07, + "loss": 0.0034206161508336663, + "reward": 1.9203814268112183, + "reward_std": 0.4245748221874237, + "rewards/GDino": 0.7836858332157135, + "rewards/GIT": 0.3265160620212555, + "rewards/HPSv2": 0.29290008544921875, + "rewards/ORM": 0.5172794163227081, + "self_certainty_semantic": -25.9375, + "self_certainty_token": -21.625, + "step": 655 + }, + { + "completion_length": 84.46875, + "epoch": 0.7264673311184939, + "grad_norm": 0.9126909375190735, + "kl": 0.0577392578125, + "learning_rate": 5.9e-07, + "loss": 0.000609748880378902, + "reward": 2.0030502676963806, + "reward_std": 0.30357377976179123, + "rewards/GDino": 0.7379720211029053, + "rewards/GIT": 0.34063340723514557, + "rewards/HPSv2": 0.28034019470214844, + "rewards/ORM": 0.6441046893596649, + "self_certainty_semantic": -26.0, + "self_certainty_token": -21.9375, + "step": 656 + }, + { + "completion_length": 81.84375, + "epoch": 0.7275747508305648, + "grad_norm": 0.6337254047393799, + "kl": 0.0828857421875, + "learning_rate": 5.89375e-07, + "loss": 0.002044278895482421, + "reward": 2.298175096511841, + "reward_std": 0.4422539472579956, + "rewards/GDino": 0.7859342396259308, + "rewards/GIT": 0.5971966683864594, + "rewards/HPSv2": 0.2759437561035156, + "rewards/ORM": 0.6391004323959351, + "self_certainty_semantic": -26.125, + "self_certainty_token": -22.1875, + "step": 657 + }, + { + "completion_length": 90.265625, + "epoch": 0.7286821705426356, + "grad_norm": 0.47765862941741943, + "kl": 0.0377197265625, + "learning_rate": 5.8875e-07, + "loss": -0.006116768810898066, + "reward": 2.934214949607849, + "reward_std": 0.13987672328948975, + "rewards/GDino": 0.8651041388511658, + "rewards/GIT": 0.8034897148609161, + "rewards/HPSv2": 0.2734336853027344, + "rewards/ORM": 0.9921875, + "self_certainty_semantic": -26.375, + "self_certainty_token": -21.5625, + "step": 658 + }, + { + "completion_length": 95.984375, + "epoch": 0.7297895902547066, + "grad_norm": 0.9334122538566589, + "kl": 0.0389404296875, + "learning_rate": 5.88125e-07, + "loss": 0.0013720933347940445, + "reward": 2.270847797393799, + "reward_std": 0.1525878980755806, + "rewards/GDino": 0.8497982323169708, + "rewards/GIT": 0.522528275847435, + "rewards/HPSv2": 0.27352142333984375, + "rewards/ORM": 0.625, + "self_certainty_semantic": -26.125, + "self_certainty_token": -20.875, + "step": 659 + }, + { + "completion_length": 94.046875, + "epoch": 0.7308970099667774, + "grad_norm": 0.5421406030654907, + "kl": 0.02777099609375, + "learning_rate": 5.875e-07, + "loss": 0.0007960067596286535, + "reward": 2.1377129554748535, + "reward_std": 0.2098780944943428, + "rewards/GDino": 0.7686106562614441, + "rewards/GIT": 0.5266330242156982, + "rewards/HPSv2": 0.2670726776123047, + "rewards/ORM": 0.5753966569900513, + "self_certainty_semantic": -26.125, + "self_certainty_token": -21.875, + "step": 660 + }, + { + "completion_length": 86.890625, + "epoch": 0.7320044296788483, + "grad_norm": 0.4109729826450348, + "kl": 0.07568359375, + "learning_rate": 5.86875e-07, + "loss": -4.6757631935179234e-05, + "reward": 1.2150006294250488, + "reward_std": 0.32698993384838104, + "rewards/GDino": 0.48252616822719574, + "rewards/GIT": 0.0, + "rewards/HPSv2": 0.2839031219482422, + "rewards/ORM": 0.4485713839530945, + "self_certainty_semantic": -26.1875, + "self_certainty_token": -22.5, + "step": 661 + }, + { + "completion_length": 98.453125, + "epoch": 0.7331118493909191, + "grad_norm": 0.47569000720977783, + "kl": 0.0914306640625, + "learning_rate": 5.8625e-07, + "loss": 0.0017794049344956875, + "reward": 2.277660608291626, + "reward_std": 0.3446338027715683, + "rewards/GDino": 0.8357483148574829, + "rewards/GIT": 0.5733911991119385, + "rewards/HPSv2": 0.27134132385253906, + "rewards/ORM": 0.5971797108650208, + "self_certainty_semantic": -26.125, + "self_certainty_token": -22.1875, + "step": 662 + }, + { + "completion_length": 81.078125, + "epoch": 0.7342192691029901, + "grad_norm": 0.43906712532043457, + "kl": 0.0491943359375, + "learning_rate": 5.856249999999999e-07, + "loss": -0.007730189710855484, + "reward": 2.8684195280075073, + "reward_std": 0.23546351492404938, + "rewards/GDino": 0.909375011920929, + "rewards/GIT": 0.8069201111793518, + "rewards/HPSv2": 0.29274940490722656, + "rewards/ORM": 0.859375, + "self_certainty_semantic": -26.125, + "self_certainty_token": -21.9375, + "step": 663 + }, + { + "completion_length": 79.9375, + "epoch": 0.7353266888150609, + "grad_norm": 0.6633722186088562, + "kl": 0.03643798828125, + "learning_rate": 5.849999999999999e-07, + "loss": -0.00819562585093081, + "reward": 2.459098696708679, + "reward_std": 0.21784447878599167, + "rewards/GDino": 0.8858754634857178, + "rewards/GIT": 0.5450869351625443, + "rewards/HPSv2": 0.2877330780029297, + "rewards/ORM": 0.7404032647609711, + "self_certainty_semantic": -26.0, + "self_certainty_token": -22.6875, + "step": 664 + }, + { + "completion_length": 96.359375, + "epoch": 0.7364341085271318, + "grad_norm": 0.4794686436653137, + "kl": 0.03826904296875, + "learning_rate": 5.843749999999999e-07, + "loss": -0.007845424814149737, + "reward": 2.1590399742126465, + "reward_std": 0.3060216009616852, + "rewards/GDino": 0.7568039298057556, + "rewards/GIT": 0.33352863788604736, + "rewards/HPSv2": 0.2825031280517578, + "rewards/ORM": 0.7862043082714081, + "self_certainty_semantic": -26.125, + "self_certainty_token": -21.3125, + "step": 665 + }, + { + "completion_length": 82.609375, + "epoch": 0.7375415282392026, + "grad_norm": 0.6454534530639648, + "kl": 0.037109375, + "learning_rate": 5.837499999999999e-07, + "loss": -0.0030233769211918116, + "reward": 2.9405006170272827, + "reward_std": 0.11489935591816902, + "rewards/GDino": 0.9309895634651184, + "rewards/GIT": 0.8688841164112091, + "rewards/HPSv2": 0.2656269073486328, + "rewards/ORM": 0.875, + "self_certainty_semantic": -26.25, + "self_certainty_token": -22.8125, + "step": 666 + }, + { + "completion_length": 82.140625, + "epoch": 0.7386489479512736, + "grad_norm": 0.5992516279220581, + "kl": 0.078125, + "learning_rate": 5.83125e-07, + "loss": 0.005287746200338006, + "reward": 2.20145845413208, + "reward_std": 0.3559674769639969, + "rewards/GDino": 0.7640625238418579, + "rewards/GIT": 0.4115181192755699, + "rewards/HPSv2": 0.2874927520751953, + "rewards/ORM": 0.7383852005004883, + "self_certainty_semantic": -26.0625, + "self_certainty_token": -22.375, + "step": 667 + }, + { + "completion_length": 92.828125, + "epoch": 0.7397563676633444, + "grad_norm": 0.5516700744628906, + "kl": 0.0400390625, + "learning_rate": 5.825e-07, + "loss": 0.006443792022764683, + "reward": 2.182730197906494, + "reward_std": 0.2277146726846695, + "rewards/GDino": 0.7573208212852478, + "rewards/GIT": 0.4878613352775574, + "rewards/HPSv2": 0.279693603515625, + "rewards/ORM": 0.657854437828064, + "self_certainty_semantic": -25.875, + "self_certainty_token": -23.1875, + "step": 668 + }, + { + "completion_length": 82.140625, + "epoch": 0.7408637873754153, + "grad_norm": 0.5028103590011597, + "kl": 0.0654296875, + "learning_rate": 5.81875e-07, + "loss": 0.0030899150297045708, + "reward": 2.574856996536255, + "reward_std": 0.242995984852314, + "rewards/GDino": 0.8393476009368896, + "rewards/GIT": 0.5640667974948883, + "rewards/HPSv2": 0.2855510711669922, + "rewards/ORM": 0.8858915269374847, + "self_certainty_semantic": -25.9375, + "self_certainty_token": -21.875, + "step": 669 + }, + { + "completion_length": 88.75, + "epoch": 0.7419712070874862, + "grad_norm": 0.7942551970481873, + "kl": 0.15283203125, + "learning_rate": 5.8125e-07, + "loss": 0.0035495543852448463, + "reward": 2.1698633432388306, + "reward_std": 0.2921285629272461, + "rewards/GDino": 0.6650677919387817, + "rewards/GIT": 0.40071188658475876, + "rewards/HPSv2": 0.28537559509277344, + "rewards/ORM": 0.8187080323696136, + "self_certainty_semantic": -26.125, + "self_certainty_token": -22.0625, + "step": 670 + }, + { + "completion_length": 98.203125, + "epoch": 0.743078626799557, + "grad_norm": 1.30418062210083, + "kl": 0.066162109375, + "learning_rate": 5.806249999999999e-07, + "loss": -0.0039024970028549433, + "reward": 2.4580671787261963, + "reward_std": 0.24044052511453629, + "rewards/GDino": 0.797656238079071, + "rewards/GIT": 0.47725047171115875, + "rewards/HPSv2": 0.28659820556640625, + "rewards/ORM": 0.8965622782707214, + "self_certainty_semantic": -26.125, + "self_certainty_token": -21.625, + "step": 671 + }, + { + "completion_length": 85.8125, + "epoch": 0.7441860465116279, + "grad_norm": 0.474373996257782, + "kl": 0.0401611328125, + "learning_rate": 5.8e-07, + "loss": 0.0028850508388131857, + "reward": 2.158820152282715, + "reward_std": 0.29940100759267807, + "rewards/GDino": 0.7359375357627869, + "rewards/GIT": 0.5142717063426971, + "rewards/HPSv2": 0.267364501953125, + "rewards/ORM": 0.6412464678287506, + "self_certainty_semantic": -26.125, + "self_certainty_token": -21.5625, + "step": 672 + }, + { + "completion_length": 87.328125, + "epoch": 0.7452934662236987, + "grad_norm": 0.4354628622531891, + "kl": 0.04736328125, + "learning_rate": 5.79375e-07, + "loss": 0.004038871731609106, + "reward": 1.8933424949645996, + "reward_std": 0.484218567609787, + "rewards/GDino": 0.7562132477760315, + "rewards/GIT": 0.20988446474075317, + "rewards/HPSv2": 0.2931785583496094, + "rewards/ORM": 0.6340662240982056, + "self_certainty_semantic": -26.0625, + "self_certainty_token": -22.6875, + "step": 673 + }, + { + "completion_length": 81.0625, + "epoch": 0.7464008859357697, + "grad_norm": 1.0575191974639893, + "kl": 0.075927734375, + "learning_rate": 5.7875e-07, + "loss": 0.0003490226808935404, + "reward": 1.7971633672714233, + "reward_std": 0.3968241363763809, + "rewards/GDino": 0.7667253613471985, + "rewards/GIT": 0.20426715165376663, + "rewards/HPSv2": 0.2824249267578125, + "rewards/ORM": 0.5437460243701935, + "self_certainty_semantic": -26.0625, + "self_certainty_token": -23.0625, + "step": 674 + }, + { + "completion_length": 99.71875, + "epoch": 0.7475083056478405, + "grad_norm": 0.46759819984436035, + "kl": 0.03936767578125, + "learning_rate": 5.78125e-07, + "loss": -0.002479708520695567, + "reward": 2.41166353225708, + "reward_std": 0.3170488774776459, + "rewards/GDino": 0.8225728571414948, + "rewards/GIT": 0.5386542826890945, + "rewards/HPSv2": 0.2753944396972656, + "rewards/ORM": 0.7750419676303864, + "self_certainty_semantic": -26.4375, + "self_certainty_token": -21.8125, + "step": 675 + }, + { + "completion_length": 92.078125, + "epoch": 0.7486157253599114, + "grad_norm": 13.425639152526855, + "kl": 0.14306640625, + "learning_rate": 5.775e-07, + "loss": -0.005679744761437178, + "reward": 2.0987464785575867, + "reward_std": 0.2669459879398346, + "rewards/GDino": 0.7425667941570282, + "rewards/GIT": 0.47009122371673584, + "rewards/HPSv2": 0.29506683349609375, + "rewards/ORM": 0.5910216271877289, + "self_certainty_semantic": -26.0625, + "self_certainty_token": -23.25, + "step": 676 + }, + { + "completion_length": 87.0, + "epoch": 0.7497231450719822, + "grad_norm": 0.47715461254119873, + "kl": 0.0343017578125, + "learning_rate": 5.76875e-07, + "loss": -0.012283400632441044, + "reward": 2.190652549266815, + "reward_std": 0.34489260613918304, + "rewards/GDino": 0.7928432524204254, + "rewards/GIT": 0.45851051807403564, + "rewards/HPSv2": 0.2838001251220703, + "rewards/ORM": 0.6554986387491226, + "self_certainty_semantic": -26.1875, + "self_certainty_token": -22.0, + "step": 677 + }, + { + "completion_length": 78.984375, + "epoch": 0.7508305647840532, + "grad_norm": 0.5902923345565796, + "kl": 0.0955810546875, + "learning_rate": 5.7625e-07, + "loss": 0.005321919452399015, + "reward": 1.901140034198761, + "reward_std": 0.3233024924993515, + "rewards/GDino": 0.7454414367675781, + "rewards/GIT": 0.3008659929037094, + "rewards/HPSv2": 0.2813911437988281, + "rewards/ORM": 0.5734414756298065, + "self_certainty_semantic": -25.9375, + "self_certainty_token": -22.5, + "step": 678 + }, + { + "completion_length": 86.234375, + "epoch": 0.751937984496124, + "grad_norm": 0.6141394376754761, + "kl": 0.0430908203125, + "learning_rate": 5.75625e-07, + "loss": 0.007074729772284627, + "reward": 2.083330512046814, + "reward_std": 0.3626834899187088, + "rewards/GDino": 0.7508443593978882, + "rewards/GIT": 0.43764275312423706, + "rewards/HPSv2": 0.2729225158691406, + "rewards/ORM": 0.6219209730625153, + "self_certainty_semantic": -26.0625, + "self_certainty_token": -21.1875, + "step": 679 + }, + { + "completion_length": 91.0625, + "epoch": 0.7530454042081949, + "grad_norm": 0.45878922939300537, + "kl": 0.0361328125, + "learning_rate": 5.749999999999999e-07, + "loss": 0.0010946787660941482, + "reward": 2.5365670919418335, + "reward_std": 0.26428209990262985, + "rewards/GDino": 0.9187500178813934, + "rewards/GIT": 0.5718556642532349, + "rewards/HPSv2": 0.2721233367919922, + "rewards/ORM": 0.7738381326198578, + "self_certainty_semantic": -26.125, + "self_certainty_token": -22.25, + "step": 680 + }, + { + "completion_length": 90.375, + "epoch": 0.7541528239202658, + "grad_norm": 0.8023860454559326, + "kl": 0.02838134765625, + "learning_rate": 5.743749999999999e-07, + "loss": 0.0012773984344676137, + "reward": 2.599917769432068, + "reward_std": 0.20768918097019196, + "rewards/GDino": 0.8890624940395355, + "rewards/GIT": 0.6180640459060669, + "rewards/HPSv2": 0.26158714294433594, + "rewards/ORM": 0.8312040269374847, + "self_certainty_semantic": -26.125, + "self_certainty_token": -23.5, + "step": 681 + }, + { + "completion_length": 82.796875, + "epoch": 0.7552602436323367, + "grad_norm": 0.5059024691581726, + "kl": 0.0511474609375, + "learning_rate": 5.737499999999999e-07, + "loss": 0.006995198084041476, + "reward": 1.8690189719200134, + "reward_std": 0.34537334740161896, + "rewards/GDino": 0.6528332829475403, + "rewards/GIT": 0.20674297213554382, + "rewards/HPSv2": 0.29310035705566406, + "rewards/ORM": 0.7163423001766205, + "self_certainty_semantic": -26.125, + "self_certainty_token": -22.0, + "step": 682 + }, + { + "completion_length": 100.953125, + "epoch": 0.7563676633444075, + "grad_norm": 0.7043551802635193, + "kl": 0.0711669921875, + "learning_rate": 5.73125e-07, + "loss": -0.007771526928991079, + "reward": 1.7987099885940552, + "reward_std": 0.3775853365659714, + "rewards/GDino": 0.5704167187213898, + "rewards/GIT": 0.3323868587613106, + "rewards/HPSv2": 0.2709064483642578, + "rewards/ORM": 0.6249999701976776, + "self_certainty_semantic": -25.9375, + "self_certainty_token": -22.125, + "step": 683 + }, + { + "completion_length": 86.0, + "epoch": 0.7574750830564784, + "grad_norm": 0.5185422897338867, + "kl": 0.02825927734375, + "learning_rate": 5.725e-07, + "loss": -0.003711120574735105, + "reward": 2.5120363235473633, + "reward_std": 0.25669051706790924, + "rewards/GDino": 0.7823846340179443, + "rewards/GIT": 0.6351113021373749, + "rewards/HPSv2": 0.2785606384277344, + "rewards/ORM": 0.8159796893596649, + "self_certainty_semantic": -26.0, + "self_certainty_token": -22.8125, + "step": 684 + }, + { + "completion_length": 97.296875, + "epoch": 0.7585825027685493, + "grad_norm": 0.5249005556106567, + "kl": 0.0628662109375, + "learning_rate": 5.71875e-07, + "loss": -0.007049170322716236, + "reward": 2.028254985809326, + "reward_std": 0.31377512216567993, + "rewards/GDino": 0.7800102829933167, + "rewards/GIT": 0.4365968704223633, + "rewards/HPSv2": 0.28061866760253906, + "rewards/ORM": 0.5310291945934296, + "self_certainty_semantic": -25.9375, + "self_certainty_token": -22.625, + "step": 685 + }, + { + "completion_length": 93.21875, + "epoch": 0.7596899224806202, + "grad_norm": 0.5600687861442566, + "kl": 0.0361328125, + "learning_rate": 5.7125e-07, + "loss": -0.013361352030187845, + "reward": 1.9606686234474182, + "reward_std": 0.28846976161003113, + "rewards/GDino": 0.65625, + "rewards/GIT": 0.42764804512262344, + "rewards/HPSv2": 0.26404571533203125, + "rewards/ORM": 0.6127248406410217, + "self_certainty_semantic": -26.0, + "self_certainty_token": -21.75, + "step": 686 + }, + { + "completion_length": 103.171875, + "epoch": 0.760797342192691, + "grad_norm": 0.8746234178543091, + "kl": 0.0511474609375, + "learning_rate": 5.70625e-07, + "loss": 0.003181913634762168, + "reward": 2.1306235790252686, + "reward_std": 0.4023451805114746, + "rewards/GDino": 0.7613615393638611, + "rewards/GIT": 0.5234895050525665, + "rewards/HPSv2": 0.26639366149902344, + "rewards/ORM": 0.5793787837028503, + "self_certainty_semantic": -26.1875, + "self_certainty_token": -22.6875, + "step": 687 + }, + { + "completion_length": 104.921875, + "epoch": 0.7619047619047619, + "grad_norm": 0.5565656423568726, + "kl": 0.03314208984375, + "learning_rate": 5.699999999999999e-07, + "loss": 0.008015180006623268, + "reward": 2.1197550296783447, + "reward_std": 0.3410971313714981, + "rewards/GDino": 0.7706657946109772, + "rewards/GIT": 0.4775826036930084, + "rewards/HPSv2": 0.2748985290527344, + "rewards/ORM": 0.5966082215309143, + "self_certainty_semantic": -26.125, + "self_certainty_token": -21.9375, + "step": 688 + }, + { + "completion_length": 92.34375, + "epoch": 0.7630121816168328, + "grad_norm": 0.4583781957626343, + "kl": 0.044677734375, + "learning_rate": 5.69375e-07, + "loss": -0.0036846445873379707, + "reward": 2.1093239188194275, + "reward_std": 0.23726678267121315, + "rewards/GDino": 0.8014558851718903, + "rewards/GIT": 0.45863087475299835, + "rewards/HPSv2": 0.27521324157714844, + "rewards/ORM": 0.5740238130092621, + "self_certainty_semantic": -26.0625, + "self_certainty_token": -21.625, + "step": 689 + }, + { + "completion_length": 89.265625, + "epoch": 0.7641196013289037, + "grad_norm": 1.573523998260498, + "kl": 0.0704345703125, + "learning_rate": 5.6875e-07, + "loss": 0.0040516487788408995, + "reward": 2.264855980873108, + "reward_std": 0.38254983723163605, + "rewards/GDino": 0.8170240521430969, + "rewards/GIT": 0.4234756827354431, + "rewards/HPSv2": 0.28342247009277344, + "rewards/ORM": 0.7409337162971497, + "self_certainty_semantic": -26.0625, + "self_certainty_token": -21.9375, + "step": 690 + }, + { + "completion_length": 88.859375, + "epoch": 0.7652270210409745, + "grad_norm": 1.1988757848739624, + "kl": 0.035888671875, + "learning_rate": 5.68125e-07, + "loss": 0.003176323603838682, + "reward": 2.044719099998474, + "reward_std": 0.30086271464824677, + "rewards/GDino": 0.6574947237968445, + "rewards/GIT": 0.4014148786664009, + "rewards/HPSv2": 0.28969573974609375, + "rewards/ORM": 0.6961137652397156, + "self_certainty_semantic": -26.0625, + "self_certainty_token": -21.75, + "step": 691 + }, + { + "completion_length": 87.09375, + "epoch": 0.7663344407530454, + "grad_norm": 0.45111867785453796, + "kl": 0.04534912109375, + "learning_rate": 5.675e-07, + "loss": 0.0026908922009170055, + "reward": 1.9436599016189575, + "reward_std": 0.3347000330686569, + "rewards/GDino": 0.745746523141861, + "rewards/GIT": 0.5528430640697479, + "rewards/HPSv2": 0.2727985382080078, + "rewards/ORM": 0.3722716271877289, + "self_certainty_semantic": -26.125, + "self_certainty_token": -22.375, + "step": 692 + }, + { + "completion_length": 91.1875, + "epoch": 0.7674418604651163, + "grad_norm": 0.6132892966270447, + "kl": 0.0775146484375, + "learning_rate": 5.66875e-07, + "loss": 0.01438837987370789, + "reward": 2.446964740753174, + "reward_std": 0.3332870677113533, + "rewards/GDino": 0.846495509147644, + "rewards/GIT": 0.45031992346048355, + "rewards/HPSv2": 0.2673368453979492, + "rewards/ORM": 0.8828125, + "self_certainty_semantic": -26.125, + "self_certainty_token": -23.25, + "step": 693 + }, + { + "completion_length": 78.953125, + "epoch": 0.7685492801771872, + "grad_norm": 0.6678088307380676, + "kl": 0.05621337890625, + "learning_rate": 5.6625e-07, + "loss": 0.001470165210776031, + "reward": 2.206181526184082, + "reward_std": 0.3061055392026901, + "rewards/GDino": 0.7747617959976196, + "rewards/GIT": 0.33542168140411377, + "rewards/HPSv2": 0.2773399353027344, + "rewards/ORM": 0.8186581134796143, + "self_certainty_semantic": -26.0, + "self_certainty_token": -21.9375, + "step": 694 + }, + { + "completion_length": 91.984375, + "epoch": 0.769656699889258, + "grad_norm": 0.5633465647697449, + "kl": 0.02764892578125, + "learning_rate": 5.65625e-07, + "loss": 0.0018216867465525866, + "reward": 2.123205065727234, + "reward_std": 0.25228351354599, + "rewards/GDino": 0.7946834862232208, + "rewards/GIT": 0.523629367351532, + "rewards/HPSv2": 0.2567634582519531, + "rewards/ORM": 0.5481287688016891, + "self_certainty_semantic": -25.875, + "self_certainty_token": -22.5, + "step": 695 + }, + { + "completion_length": 89.328125, + "epoch": 0.770764119601329, + "grad_norm": 0.5084026455879211, + "kl": 0.04736328125, + "learning_rate": 5.649999999999999e-07, + "loss": 0.010489812702871859, + "reward": 1.9719426035881042, + "reward_std": 0.3934820741415024, + "rewards/GDino": 0.7802965641021729, + "rewards/GIT": 0.2579066641628742, + "rewards/HPSv2": 0.2900352478027344, + "rewards/ORM": 0.6437040567398071, + "self_certainty_semantic": -26.1875, + "self_certainty_token": -21.875, + "step": 696 + }, + { + "completion_length": 89.484375, + "epoch": 0.7718715393133998, + "grad_norm": 0.84053635597229, + "kl": 0.0677490234375, + "learning_rate": 5.643749999999999e-07, + "loss": -0.003930016187950969, + "reward": 2.1548564434051514, + "reward_std": 0.20971987396478653, + "rewards/GDino": 0.7154485583305359, + "rewards/GIT": 0.4372923672199249, + "rewards/HPSv2": 0.2676105499267578, + "rewards/ORM": 0.7345048785209656, + "self_certainty_semantic": -26.0, + "self_certainty_token": -21.0, + "step": 697 + }, + { + "completion_length": 77.671875, + "epoch": 0.7729789590254706, + "grad_norm": 0.4470304250717163, + "kl": 0.0628662109375, + "learning_rate": 5.637499999999999e-07, + "loss": -0.008008429780602455, + "reward": 2.058813691139221, + "reward_std": 0.24794085323810577, + "rewards/GDino": 0.735086590051651, + "rewards/GIT": 0.33821845054626465, + "rewards/HPSv2": 0.2885417938232422, + "rewards/ORM": 0.6969668865203857, + "self_certainty_semantic": -26.0, + "self_certainty_token": -22.25, + "step": 698 + }, + { + "completion_length": 87.59375, + "epoch": 0.7740863787375415, + "grad_norm": 0.7203311324119568, + "kl": 0.05419921875, + "learning_rate": 5.63125e-07, + "loss": -0.005206214264035225, + "reward": 2.4558401107788086, + "reward_std": 0.26295703649520874, + "rewards/GDino": 0.7619472146034241, + "rewards/GIT": 0.5532310307025909, + "rewards/HPSv2": 0.28602027893066406, + "rewards/ORM": 0.8546415567398071, + "self_certainty_semantic": -26.1875, + "self_certainty_token": -22.5625, + "step": 699 + }, + { + "completion_length": 85.703125, + "epoch": 0.7751937984496124, + "grad_norm": 0.6874542236328125, + "kl": 0.0460205078125, + "learning_rate": 5.625e-07, + "loss": 0.0033479061676189303, + "reward": 2.1118282079696655, + "reward_std": 0.3882039487361908, + "rewards/GDino": 0.7321169972419739, + "rewards/GIT": 0.429663822054863, + "rewards/HPSv2": 0.27544403076171875, + "rewards/ORM": 0.6746033430099487, + "self_certainty_semantic": -25.9375, + "self_certainty_token": -22.0, + "step": 700 + }, + { + "completion_length": 99.140625, + "epoch": 0.7763012181616833, + "grad_norm": 0.6015984416007996, + "kl": 0.064697265625, + "learning_rate": 5.61875e-07, + "loss": 0.0012769303284585476, + "reward": 2.248101830482483, + "reward_std": 0.2704643979668617, + "rewards/GDino": 0.7317708134651184, + "rewards/GIT": 0.35989028215408325, + "rewards/HPSv2": 0.29147911071777344, + "rewards/ORM": 0.864961564540863, + "self_certainty_semantic": -26.0625, + "self_certainty_token": -21.1875, + "step": 701 + }, + { + "completion_length": 91.28125, + "epoch": 0.7774086378737541, + "grad_norm": 0.44662216305732727, + "kl": 0.0250244140625, + "learning_rate": 5.6125e-07, + "loss": -0.008534628665074706, + "reward": 2.4955825805664062, + "reward_std": 0.21344706416130066, + "rewards/GDino": 0.87129807472229, + "rewards/GIT": 0.5853111147880554, + "rewards/HPSv2": 0.27585601806640625, + "rewards/ORM": 0.7631174623966217, + "self_certainty_semantic": -25.9375, + "self_certainty_token": -22.625, + "step": 702 + }, + { + "completion_length": 90.125, + "epoch": 0.778516057585825, + "grad_norm": 0.5787423849105835, + "kl": 0.06494140625, + "learning_rate": 5.60625e-07, + "loss": 0.0007364919874817133, + "reward": 2.1611085534095764, + "reward_std": 0.23944087326526642, + "rewards/GDino": 0.8036986589431763, + "rewards/GIT": 0.45144329965114594, + "rewards/HPSv2": 0.26186180114746094, + "rewards/ORM": 0.6441046893596649, + "self_certainty_semantic": -26.0625, + "self_certainty_token": -21.6875, + "step": 703 + }, + { + "completion_length": 88.59375, + "epoch": 0.7796234772978959, + "grad_norm": 0.46850064396858215, + "kl": 0.0391845703125, + "learning_rate": 5.6e-07, + "loss": -0.0037639520596712828, + "reward": 2.155102014541626, + "reward_std": 0.3035145103931427, + "rewards/GDino": 0.7438715398311615, + "rewards/GIT": 0.328866109251976, + "rewards/HPSv2": 0.2971820831298828, + "rewards/ORM": 0.7851822078227997, + "self_certainty_semantic": -26.0625, + "self_certainty_token": -22.25, + "step": 704 + }, + { + "completion_length": 86.328125, + "epoch": 0.7807308970099668, + "grad_norm": 0.4746936559677124, + "kl": 0.0570068359375, + "learning_rate": 5.593749999999999e-07, + "loss": 0.0013905295636504889, + "reward": 2.029343366622925, + "reward_std": 0.4462120532989502, + "rewards/GDino": 0.7991890907287598, + "rewards/GIT": 0.2879885137081146, + "rewards/HPSv2": 0.27649879455566406, + "rewards/ORM": 0.665666937828064, + "self_certainty_semantic": -26.0625, + "self_certainty_token": -21.5625, + "step": 705 + }, + { + "completion_length": 82.046875, + "epoch": 0.7818383167220376, + "grad_norm": 0.9424302577972412, + "kl": 0.0546875, + "learning_rate": 5.587499999999999e-07, + "loss": 0.002509636804461479, + "reward": 1.6777660250663757, + "reward_std": 0.3301454186439514, + "rewards/GDino": 0.6864481568336487, + "rewards/GIT": 0.3247561678290367, + "rewards/HPSv2": 0.288482666015625, + "rewards/ORM": 0.37807904183864594, + "self_certainty_semantic": -26.125, + "self_certainty_token": -22.4375, + "step": 706 + }, + { + "completion_length": 88.03125, + "epoch": 0.7829457364341085, + "grad_norm": 0.6290081739425659, + "kl": 0.0531005859375, + "learning_rate": 5.58125e-07, + "loss": 0.008092005038633943, + "reward": 1.9476159811019897, + "reward_std": 0.3564329296350479, + "rewards/GDino": 0.6500000059604645, + "rewards/GIT": 0.22398784011602402, + "rewards/HPSv2": 0.2810859680175781, + "rewards/ORM": 0.7925421595573425, + "self_certainty_semantic": -25.875, + "self_certainty_token": -21.8125, + "step": 707 + }, + { + "completion_length": 91.90625, + "epoch": 0.7840531561461794, + "grad_norm": 1.164078712463379, + "kl": 0.04241943359375, + "learning_rate": 5.575e-07, + "loss": -0.00011753547005355358, + "reward": 2.2988924980163574, + "reward_std": 0.18473602831363678, + "rewards/GDino": 0.8202758133411407, + "rewards/GIT": 0.6436163187026978, + "rewards/HPSv2": 0.28339195251464844, + "rewards/ORM": 0.5516084432601929, + "self_certainty_semantic": -26.1875, + "self_certainty_token": -22.6875, + "step": 708 + }, + { + "completion_length": 84.6875, + "epoch": 0.7851605758582503, + "grad_norm": 0.9914599657058716, + "kl": 0.069091796875, + "learning_rate": 5.56875e-07, + "loss": -0.002006992930546403, + "reward": 1.8607905507087708, + "reward_std": 0.3021601140499115, + "rewards/GDino": 0.739496499300003, + "rewards/GIT": 0.38740313053131104, + "rewards/HPSv2": 0.2628650665283203, + "rewards/ORM": 0.47102586925029755, + "self_certainty_semantic": -26.125, + "self_certainty_token": -23.125, + "step": 709 + }, + { + "completion_length": 85.421875, + "epoch": 0.7862679955703211, + "grad_norm": 0.5876775979995728, + "kl": 0.0347900390625, + "learning_rate": 5.5625e-07, + "loss": 0.003620319301262498, + "reward": 2.615724205970764, + "reward_std": 0.2742619216442108, + "rewards/GDino": 0.8987156748771667, + "rewards/GIT": 0.6156101077795029, + "rewards/HPSv2": 0.25804901123046875, + "rewards/ORM": 0.8433493673801422, + "self_certainty_semantic": -26.3125, + "self_certainty_token": -21.0, + "step": 710 + }, + { + "completion_length": 82.296875, + "epoch": 0.7873754152823921, + "grad_norm": 0.6201152801513672, + "kl": 0.0458984375, + "learning_rate": 5.55625e-07, + "loss": 0.0001488246489316225, + "reward": 2.211663603782654, + "reward_std": 0.30881257355213165, + "rewards/GDino": 0.7431795001029968, + "rewards/GIT": 0.35855987668037415, + "rewards/HPSv2": 0.28961181640625, + "rewards/ORM": 0.8203125, + "self_certainty_semantic": -25.9375, + "self_certainty_token": -21.75, + "step": 711 + }, + { + "completion_length": 74.5625, + "epoch": 0.7884828349944629, + "grad_norm": 0.5009872913360596, + "kl": 0.0623779296875, + "learning_rate": 5.55e-07, + "loss": -0.0008970615454018116, + "reward": 2.5346819162368774, + "reward_std": 0.3168572187423706, + "rewards/GDino": 0.902751088142395, + "rewards/GIT": 0.403171181678772, + "rewards/HPSv2": 0.3143463134765625, + "rewards/ORM": 0.9144132137298584, + "self_certainty_semantic": -26.125, + "self_certainty_token": -22.4375, + "step": 712 + }, + { + "completion_length": 98.015625, + "epoch": 0.7895902547065338, + "grad_norm": 0.41728198528289795, + "kl": 0.0634765625, + "learning_rate": 5.543749999999999e-07, + "loss": -0.0010938001796603203, + "reward": 2.234713613986969, + "reward_std": 0.30005037784576416, + "rewards/GDino": 0.8278032839298248, + "rewards/GIT": 0.43945807218551636, + "rewards/HPSv2": 0.2834320068359375, + "rewards/ORM": 0.6840203106403351, + "self_certainty_semantic": -26.0, + "self_certainty_token": -22.125, + "step": 713 + }, + { + "completion_length": 90.765625, + "epoch": 0.7906976744186046, + "grad_norm": 0.6176280379295349, + "kl": 0.0565185546875, + "learning_rate": 5.5375e-07, + "loss": -0.0070942766033113, + "reward": 2.0352079272270203, + "reward_std": 0.19180814176797867, + "rewards/GDino": 0.6948780119419098, + "rewards/GIT": 0.5464113652706146, + "rewards/HPSv2": 0.2782936096191406, + "rewards/ORM": 0.515625, + "self_certainty_semantic": -25.875, + "self_certainty_token": -21.875, + "step": 714 + }, + { + "completion_length": 107.78125, + "epoch": 0.7918050941306756, + "grad_norm": 0.49854108691215515, + "kl": 0.059814453125, + "learning_rate": 5.53125e-07, + "loss": 0.02198478812351823, + "reward": 2.2549163103103638, + "reward_std": 0.24842631816864014, + "rewards/GDino": 0.764163613319397, + "rewards/GIT": 0.3365834578871727, + "rewards/HPSv2": 0.27135658264160156, + "rewards/ORM": 0.8828125, + "self_certainty_semantic": -26.125, + "self_certainty_token": -22.625, + "step": 715 + }, + { + "completion_length": 86.796875, + "epoch": 0.7929125138427464, + "grad_norm": 1.8349144458770752, + "kl": 0.0816650390625, + "learning_rate": 5.525e-07, + "loss": -0.009212985401973128, + "reward": 1.952660858631134, + "reward_std": 0.4517306387424469, + "rewards/GDino": 0.7594016194343567, + "rewards/GIT": 0.26519767940044403, + "rewards/HPSv2": 0.28430747985839844, + "rewards/ORM": 0.6437540054321289, + "self_certainty_semantic": -26.125, + "self_certainty_token": -23.1875, + "step": 716 + }, + { + "completion_length": 86.375, + "epoch": 0.7940199335548173, + "grad_norm": 1.4869712591171265, + "kl": 0.06591796875, + "learning_rate": 5.51875e-07, + "loss": 0.0020673180115409195, + "reward": 1.709627091884613, + "reward_std": 0.0876375325024128, + "rewards/GDino": 0.7316266000270844, + "rewards/GIT": 0.34074390679597855, + "rewards/HPSv2": 0.2622566223144531, + "rewards/ORM": 0.375, + "self_certainty_semantic": -26.0625, + "self_certainty_token": -21.8125, + "step": 717 + }, + { + "completion_length": 96.15625, + "epoch": 0.7951273532668881, + "grad_norm": 0.5024635791778564, + "kl": 0.064208984375, + "learning_rate": 5.5125e-07, + "loss": -0.0038087358698248863, + "reward": 2.3133697509765625, + "reward_std": 0.1827942095696926, + "rewards/GDino": 0.8182291984558105, + "rewards/GIT": 0.40707404911518097, + "rewards/HPSv2": 0.28177452087402344, + "rewards/ORM": 0.806291937828064, + "self_certainty_semantic": -26.125, + "self_certainty_token": -21.9375, + "step": 718 + }, + { + "completion_length": 88.703125, + "epoch": 0.7962347729789591, + "grad_norm": 0.6536163687705994, + "kl": 0.068603515625, + "learning_rate": 5.50625e-07, + "loss": -0.0003267263527959585, + "reward": 2.1304445266723633, + "reward_std": 0.3168393522500992, + "rewards/GDino": 0.7446388006210327, + "rewards/GIT": 0.3202046602964401, + "rewards/HPSv2": 0.2940387725830078, + "rewards/ORM": 0.7715622782707214, + "self_certainty_semantic": -26.125, + "self_certainty_token": -22.75, + "step": 719 + }, + { + "completion_length": 102.78125, + "epoch": 0.7973421926910299, + "grad_norm": 0.6415807008743286, + "kl": 0.0399169921875, + "learning_rate": 5.5e-07, + "loss": 0.001682810252532363, + "reward": 2.3370330333709717, + "reward_std": 0.2455347180366516, + "rewards/GDino": 0.8281573057174683, + "rewards/GIT": 0.4782037138938904, + "rewards/HPSv2": 0.28107261657714844, + "rewards/ORM": 0.7495993375778198, + "self_certainty_semantic": -26.4375, + "self_certainty_token": -22.3125, + "step": 720 + }, + { + "completion_length": 85.578125, + "epoch": 0.7984496124031008, + "grad_norm": 0.5702308416366577, + "kl": 0.078125, + "learning_rate": 5.493749999999999e-07, + "loss": -0.004434834118001163, + "reward": 2.1352263689041138, + "reward_std": 0.28349077701568604, + "rewards/GDino": 0.8023764491081238, + "rewards/GIT": 0.3593817874789238, + "rewards/HPSv2": 0.28722190856933594, + "rewards/ORM": 0.6862462162971497, + "self_certainty_semantic": -26.0, + "self_certainty_token": -23.0625, + "step": 721 + }, + { + "completion_length": 82.375, + "epoch": 0.7995570321151716, + "grad_norm": 0.4467609226703644, + "kl": 0.05419921875, + "learning_rate": 5.487499999999999e-07, + "loss": 0.0017573630902916193, + "reward": 2.256095767021179, + "reward_std": 0.36546366661787033, + "rewards/GDino": 0.8544391691684723, + "rewards/GIT": 0.4117707312107086, + "rewards/HPSv2": 0.29296875, + "rewards/ORM": 0.6969169527292252, + "self_certainty_semantic": -26.125, + "self_certainty_token": -22.8125, + "step": 722 + }, + { + "completion_length": 95.3125, + "epoch": 0.8006644518272426, + "grad_norm": 0.6223501563072205, + "kl": 0.0474853515625, + "learning_rate": 5.481249999999999e-07, + "loss": 0.002001094399020076, + "reward": 1.8290563225746155, + "reward_std": 0.3082116022706032, + "rewards/GDino": 0.6932956278324127, + "rewards/GIT": 0.32308153808116913, + "rewards/HPSv2": 0.2833843231201172, + "rewards/ORM": 0.5292948186397552, + "self_certainty_semantic": -26.1875, + "self_certainty_token": -21.75, + "step": 723 + }, + { + "completion_length": 83.5625, + "epoch": 0.8017718715393134, + "grad_norm": 0.4794311225414276, + "kl": 0.042236328125, + "learning_rate": 5.474999999999999e-07, + "loss": 0.00010992749594151974, + "reward": 1.9405070543289185, + "reward_std": 0.3689955174922943, + "rewards/GDino": 0.6669535338878632, + "rewards/GIT": 0.30875134468078613, + "rewards/HPSv2": 0.26761436462402344, + "rewards/ORM": 0.6971876621246338, + "self_certainty_semantic": -26.0625, + "self_certainty_token": -22.3125, + "step": 724 + }, + { + "completion_length": 97.421875, + "epoch": 0.8028792912513842, + "grad_norm": 0.44672688841819763, + "kl": 0.03033447265625, + "learning_rate": 5.46875e-07, + "loss": -0.006343247136101127, + "reward": 2.1785714626312256, + "reward_std": 0.2569301053881645, + "rewards/GDino": 0.8311122059822083, + "rewards/GIT": 0.6043714135885239, + "rewards/HPSv2": 0.27286338806152344, + "rewards/ORM": 0.4702245891094208, + "self_certainty_semantic": -26.0625, + "self_certainty_token": -21.5625, + "step": 725 + }, + { + "completion_length": 96.953125, + "epoch": 0.8039867109634552, + "grad_norm": 2.038681745529175, + "kl": 0.096435546875, + "learning_rate": 5.4625e-07, + "loss": 7.860548794269562e-05, + "reward": 1.9912248849868774, + "reward_std": 0.3472580313682556, + "rewards/GDino": 0.7380475103855133, + "rewards/GIT": 0.5096151679754257, + "rewards/HPSv2": 0.2786426544189453, + "rewards/ORM": 0.4649196267127991, + "self_certainty_semantic": -26.125, + "self_certainty_token": -22.0, + "step": 726 + }, + { + "completion_length": 88.421875, + "epoch": 0.805094130675526, + "grad_norm": 0.798835039138794, + "kl": 0.03460693359375, + "learning_rate": 5.45625e-07, + "loss": 0.005517042591236532, + "reward": 1.949616551399231, + "reward_std": 0.44648005068302155, + "rewards/GDino": 0.6722205281257629, + "rewards/GIT": 0.2286173701286316, + "rewards/HPSv2": 0.25931549072265625, + "rewards/ORM": 0.7894631326198578, + "self_certainty_semantic": -26.0, + "self_certainty_token": -21.8125, + "step": 727 + }, + { + "completion_length": 87.234375, + "epoch": 0.8062015503875969, + "grad_norm": 0.41654542088508606, + "kl": 0.0634765625, + "learning_rate": 5.45e-07, + "loss": -0.002666122862137854, + "reward": 2.406186044216156, + "reward_std": 0.39095108211040497, + "rewards/GDino": 0.9046874940395355, + "rewards/GIT": 0.597604289650917, + "rewards/HPSv2": 0.2636699676513672, + "rewards/ORM": 0.6402243673801422, + "self_certainty_semantic": -25.9375, + "self_certainty_token": -22.8125, + "step": 728 + }, + { + "completion_length": 91.515625, + "epoch": 0.8073089700996677, + "grad_norm": 0.46109533309936523, + "kl": 0.0682373046875, + "learning_rate": 5.44375e-07, + "loss": 0.001184340100735426, + "reward": 2.5900955200195312, + "reward_std": 0.2786618620157242, + "rewards/GDino": 0.9135416448116302, + "rewards/GIT": 0.6375645399093628, + "rewards/HPSv2": 0.2612190246582031, + "rewards/ORM": 0.7777703106403351, + "self_certainty_semantic": -26.1875, + "self_certainty_token": -22.125, + "step": 729 + }, + { + "completion_length": 92.484375, + "epoch": 0.8084163898117387, + "grad_norm": 0.576739490032196, + "kl": 0.0413818359375, + "learning_rate": 5.4375e-07, + "loss": -0.001984517090022564, + "reward": 2.024367094039917, + "reward_std": 0.3431331217288971, + "rewards/GDino": 0.7703756093978882, + "rewards/GIT": 0.41060225665569305, + "rewards/HPSv2": 0.2749519348144531, + "rewards/ORM": 0.5684372335672379, + "self_certainty_semantic": -26.125, + "self_certainty_token": -21.5, + "step": 730 + }, + { + "completion_length": 90.234375, + "epoch": 0.8095238095238095, + "grad_norm": 1.2651491165161133, + "kl": 0.078125, + "learning_rate": 5.43125e-07, + "loss": -0.007466933340765536, + "reward": 1.6747112274169922, + "reward_std": 0.3407853692770004, + "rewards/GDino": 0.6279157549142838, + "rewards/GIT": 0.21330980956554413, + "rewards/HPSv2": 0.28192710876464844, + "rewards/ORM": 0.5515585243701935, + "self_certainty_semantic": -25.9375, + "self_certainty_token": -22.375, + "step": 731 + }, + { + "completion_length": 82.984375, + "epoch": 0.8106312292358804, + "grad_norm": 0.5761509537696838, + "kl": 0.06591796875, + "learning_rate": 5.425e-07, + "loss": -0.0003554471768438816, + "reward": 2.157442092895508, + "reward_std": 0.4448174238204956, + "rewards/GDino": 0.6964643895626068, + "rewards/GIT": 0.43324337899684906, + "rewards/HPSv2": 0.2703723907470703, + "rewards/ORM": 0.7573619186878204, + "self_certainty_semantic": -26.1875, + "self_certainty_token": -21.75, + "step": 732 + }, + { + "completion_length": 89.53125, + "epoch": 0.8117386489479512, + "grad_norm": 0.722574770450592, + "kl": 0.06103515625, + "learning_rate": 5.41875e-07, + "loss": 0.010946301277726889, + "reward": 2.1100489497184753, + "reward_std": 0.3279705345630646, + "rewards/GDino": 0.6987462043762207, + "rewards/GIT": 0.42046140134334564, + "rewards/HPSv2": 0.2703571319580078, + "rewards/ORM": 0.7204843461513519, + "self_certainty_semantic": -26.125, + "self_certainty_token": -21.4375, + "step": 733 + }, + { + "completion_length": 89.9375, + "epoch": 0.8128460686600222, + "grad_norm": 1.0958141088485718, + "kl": 0.0460205078125, + "learning_rate": 5.4125e-07, + "loss": -0.007840800797566772, + "reward": 1.9543083906173706, + "reward_std": 0.3317900747060776, + "rewards/GDino": 0.664845883846283, + "rewards/GIT": 0.5042918026447296, + "rewards/HPSv2": 0.26338768005371094, + "rewards/ORM": 0.5217830985784531, + "self_certainty_semantic": -26.0, + "self_certainty_token": -21.6875, + "step": 734 + }, + { + "completion_length": 86.25, + "epoch": 0.813953488372093, + "grad_norm": 0.7565427422523499, + "kl": 0.052490234375, + "learning_rate": 5.40625e-07, + "loss": -0.0017323634820058942, + "reward": 1.813037097454071, + "reward_std": 0.35334640741348267, + "rewards/GDino": 0.6798712313175201, + "rewards/GIT": 0.26263442635536194, + "rewards/HPSv2": 0.28481483459472656, + "rewards/ORM": 0.5857166945934296, + "self_certainty_semantic": -26.0, + "self_certainty_token": -22.625, + "step": 735 + }, + { + "completion_length": 84.25, + "epoch": 0.8150609080841639, + "grad_norm": 0.5775710344314575, + "kl": 0.04412841796875, + "learning_rate": 5.4e-07, + "loss": 0.0016251013148576021, + "reward": 2.3052210807800293, + "reward_std": 0.33411508798599243, + "rewards/GDino": 0.8637592792510986, + "rewards/GIT": 0.31685441732406616, + "rewards/HPSv2": 0.2863140106201172, + "rewards/ORM": 0.8382933139801025, + "self_certainty_semantic": -26.125, + "self_certainty_token": -22.0625, + "step": 736 + }, + { + "completion_length": 93.0625, + "epoch": 0.8161683277962348, + "grad_norm": 0.6606467366218567, + "kl": 0.036376953125, + "learning_rate": 5.39375e-07, + "loss": -0.0011851803865283728, + "reward": 2.2007476687431335, + "reward_std": 0.18558945506811142, + "rewards/GDino": 0.6635416448116302, + "rewards/GIT": 0.6344006955623627, + "rewards/HPSv2": 0.2934303283691406, + "rewards/ORM": 0.609375, + "self_certainty_semantic": -26.1875, + "self_certainty_token": -22.4375, + "step": 737 + }, + { + "completion_length": 96.296875, + "epoch": 0.8172757475083057, + "grad_norm": 2.831019401550293, + "kl": 0.1197509765625, + "learning_rate": 5.387499999999999e-07, + "loss": 0.009909008163958788, + "reward": 1.9760617017745972, + "reward_std": 0.32775287330150604, + "rewards/GDino": 0.7859875857830048, + "rewards/GIT": 0.338260181248188, + "rewards/HPSv2": 0.26775169372558594, + "rewards/ORM": 0.5840622931718826, + "self_certainty_semantic": -26.0625, + "self_certainty_token": -22.0, + "step": 738 + }, + { + "completion_length": 93.9375, + "epoch": 0.8183831672203765, + "grad_norm": 0.44521647691726685, + "kl": 0.0445556640625, + "learning_rate": 5.381249999999999e-07, + "loss": 0.011125812772661448, + "reward": 2.201336145401001, + "reward_std": 0.2787298932671547, + "rewards/GDino": 0.7437500357627869, + "rewards/GIT": 0.4365048035979271, + "rewards/HPSv2": 0.26166439056396484, + "rewards/ORM": 0.7594169974327087, + "self_certainty_semantic": -26.1875, + "self_certainty_token": -21.5625, + "step": 739 + }, + { + "completion_length": 92.25, + "epoch": 0.8194905869324474, + "grad_norm": 1.0732746124267578, + "kl": 0.0709228515625, + "learning_rate": 5.374999999999999e-07, + "loss": -0.0038921566447243094, + "reward": 2.1084020137786865, + "reward_std": 0.30020764470100403, + "rewards/GDino": 0.7505642473697662, + "rewards/GIT": 0.363937608897686, + "rewards/HPSv2": 0.2900238037109375, + "rewards/ORM": 0.7038763165473938, + "self_certainty_semantic": -26.0625, + "self_certainty_token": -22.125, + "step": 740 + }, + { + "completion_length": 91.953125, + "epoch": 0.8205980066445183, + "grad_norm": 0.5425018668174744, + "kl": 0.103515625, + "learning_rate": 5.368749999999999e-07, + "loss": -0.00010758591815829277, + "reward": 2.3353304862976074, + "reward_std": 0.2773418575525284, + "rewards/GDino": 0.804405003786087, + "rewards/GIT": 0.32148073613643646, + "rewards/HPSv2": 0.2840900421142578, + "rewards/ORM": 0.9253547191619873, + "self_certainty_semantic": -26.3125, + "self_certainty_token": -22.0, + "step": 741 + }, + { + "completion_length": 81.03125, + "epoch": 0.8217054263565892, + "grad_norm": 38.27437973022461, + "kl": 0.3857421875, + "learning_rate": 5.3625e-07, + "loss": -0.008703663712367415, + "reward": 2.112104058265686, + "reward_std": 0.25958235561847687, + "rewards/GDino": 0.7278973162174225, + "rewards/GIT": 0.4252029210329056, + "rewards/HPSv2": 0.27819252014160156, + "rewards/ORM": 0.6808113753795624, + "self_certainty_semantic": -26.0, + "self_certainty_token": -21.9375, + "step": 742 + }, + { + "completion_length": 82.796875, + "epoch": 0.82281284606866, + "grad_norm": 0.3979380130767822, + "kl": 0.0498046875, + "learning_rate": 5.35625e-07, + "loss": 0.008123829029500484, + "reward": 2.501998543739319, + "reward_std": 0.25658877938985825, + "rewards/GDino": 0.9072916507720947, + "rewards/GIT": 0.6435754597187042, + "rewards/HPSv2": 0.2733192443847656, + "rewards/ORM": 0.6778123080730438, + "self_certainty_semantic": -26.0625, + "self_certainty_token": -22.5, + "step": 743 + }, + { + "completion_length": 77.65625, + "epoch": 0.8239202657807309, + "grad_norm": 0.5012292861938477, + "kl": 0.09228515625, + "learning_rate": 5.35e-07, + "loss": 0.007661529583856463, + "reward": 2.2044920921325684, + "reward_std": 0.23868468776345253, + "rewards/GDino": 0.7408381700515747, + "rewards/GIT": 0.4643561914563179, + "rewards/HPSv2": 0.2843780517578125, + "rewards/ORM": 0.7149195969104767, + "self_certainty_semantic": -26.0625, + "self_certainty_token": -21.875, + "step": 744 + }, + { + "completion_length": 97.8125, + "epoch": 0.8250276854928018, + "grad_norm": 0.42284345626831055, + "kl": 0.06005859375, + "learning_rate": 5.343750000000001e-07, + "loss": -0.003911999461706728, + "reward": 1.8918339014053345, + "reward_std": 0.2917911037802696, + "rewards/GDino": 0.6985442638397217, + "rewards/GIT": 0.45280955731868744, + "rewards/HPSv2": 0.28673362731933594, + "rewards/ORM": 0.4537464678287506, + "self_certainty_semantic": -26.1875, + "self_certainty_token": -22.375, + "step": 745 + }, + { + "completion_length": 93.015625, + "epoch": 0.8261351052048727, + "grad_norm": 0.5211785435676575, + "kl": 0.0570068359375, + "learning_rate": 5.3375e-07, + "loss": 0.0066674211993813515, + "reward": 2.1869595646858215, + "reward_std": 0.17587076127529144, + "rewards/GDino": 0.7884093523025513, + "rewards/GIT": 0.3536282554268837, + "rewards/HPSv2": 0.294921875, + "rewards/ORM": 0.75, + "self_certainty_semantic": -26.0, + "self_certainty_token": -21.8125, + "step": 746 + }, + { + "completion_length": 99.828125, + "epoch": 0.8272425249169435, + "grad_norm": 1.4175063371658325, + "kl": 0.10009765625, + "learning_rate": 5.33125e-07, + "loss": -0.002889312105253339, + "reward": 2.2799965143203735, + "reward_std": 0.2823447212576866, + "rewards/GDino": 0.7717703282833099, + "rewards/GIT": 0.46097248792648315, + "rewards/HPSv2": 0.2851085662841797, + "rewards/ORM": 0.7621452808380127, + "self_certainty_semantic": -26.0, + "self_certainty_token": -20.9375, + "step": 747 + }, + { + "completion_length": 84.921875, + "epoch": 0.8283499446290143, + "grad_norm": 0.5928605794906616, + "kl": 0.063232421875, + "learning_rate": 5.325e-07, + "loss": 0.0008637142600491643, + "reward": 2.248483180999756, + "reward_std": 0.38476191461086273, + "rewards/GDino": 0.870312511920929, + "rewards/GIT": 0.5309998989105225, + "rewards/HPSv2": 0.28038787841796875, + "rewards/ORM": 0.5667828321456909, + "self_certainty_semantic": -26.25, + "self_certainty_token": -22.1875, + "step": 748 + }, + { + "completion_length": 99.34375, + "epoch": 0.8294573643410853, + "grad_norm": 1.2739650011062622, + "kl": 0.0650634765625, + "learning_rate": 5.31875e-07, + "loss": 0.0008982536382973194, + "reward": 2.1500863432884216, + "reward_std": 0.35770243406295776, + "rewards/GDino": 0.756407231092453, + "rewards/GIT": 0.4540431275963783, + "rewards/HPSv2": 0.2765064239501953, + "rewards/ORM": 0.663129448890686, + "self_certainty_semantic": -26.125, + "self_certainty_token": -22.1875, + "step": 749 + }, + { + "completion_length": 83.203125, + "epoch": 0.8305647840531561, + "grad_norm": 0.5421486496925354, + "kl": 0.0657958984375, + "learning_rate": 5.3125e-07, + "loss": -0.007062981836497784, + "reward": 1.8963716626167297, + "reward_std": 0.16904155164957047, + "rewards/GDino": 0.7278782725334167, + "rewards/GIT": 0.30352286249399185, + "rewards/HPSv2": 0.2747001647949219, + "rewards/ORM": 0.5902703106403351, + "self_certainty_semantic": -26.125, + "self_certainty_token": -22.25, + "step": 750 + }, + { + "completion_length": 92.453125, + "epoch": 0.831672203765227, + "grad_norm": 0.5977789163589478, + "kl": 0.0570068359375, + "learning_rate": 5.30625e-07, + "loss": -0.003586571430787444, + "reward": 2.115361213684082, + "reward_std": 0.4803231358528137, + "rewards/GDino": 0.7299371659755707, + "rewards/GIT": 0.3196816146373749, + "rewards/HPSv2": 0.2770805358886719, + "rewards/ORM": 0.7886618673801422, + "self_certainty_semantic": -26.1875, + "self_certainty_token": -22.0625, + "step": 751 + }, + { + "completion_length": 90.46875, + "epoch": 0.832779623477298, + "grad_norm": 0.6801989674568176, + "kl": 0.0579833984375, + "learning_rate": 5.3e-07, + "loss": -0.01345699792727828, + "reward": 2.0743667483329773, + "reward_std": 0.26966211199760437, + "rewards/GDino": 0.694890022277832, + "rewards/GIT": 0.42347368597984314, + "rewards/HPSv2": 0.2767162322998047, + "rewards/ORM": 0.6792868971824646, + "self_certainty_semantic": -26.125, + "self_certainty_token": -22.0, + "step": 752 + }, + { + "completion_length": 90.53125, + "epoch": 0.8338870431893688, + "grad_norm": 0.6047552824020386, + "kl": 0.069091796875, + "learning_rate": 5.29375e-07, + "loss": -0.002249690005555749, + "reward": 2.2418558597564697, + "reward_std": 0.37794698774814606, + "rewards/GDino": 0.8119381666183472, + "rewards/GIT": 0.5427019596099854, + "rewards/HPSv2": 0.2650737762451172, + "rewards/ORM": 0.6221418678760529, + "self_certainty_semantic": -26.0625, + "self_certainty_token": -22.8125, + "step": 753 + }, + { + "completion_length": 87.578125, + "epoch": 0.8349944629014396, + "grad_norm": 0.5467040538787842, + "kl": 0.0897216796875, + "learning_rate": 5.2875e-07, + "loss": -0.012971555814146996, + "reward": 2.0573065280914307, + "reward_std": 0.27202725410461426, + "rewards/GDino": 0.7703125476837158, + "rewards/GIT": 0.41489122807979584, + "rewards/HPSv2": 0.2783527374267578, + "rewards/ORM": 0.59375, + "self_certainty_semantic": -26.1875, + "self_certainty_token": -22.4375, + "step": 754 + }, + { + "completion_length": 97.3125, + "epoch": 0.8361018826135105, + "grad_norm": 0.7234301567077637, + "kl": 0.0594482421875, + "learning_rate": 5.281249999999999e-07, + "loss": -0.006349579431116581, + "reward": 2.547545313835144, + "reward_std": 0.25917133688926697, + "rewards/GDino": 0.8570312559604645, + "rewards/GIT": 0.8000683486461639, + "rewards/HPSv2": 0.2732582092285156, + "rewards/ORM": 0.6171875, + "self_certainty_semantic": -26.25, + "self_certainty_token": -21.9375, + "step": 755 + }, + { + "completion_length": 89.921875, + "epoch": 0.8372093023255814, + "grad_norm": 0.4962906837463379, + "kl": 0.03131103515625, + "learning_rate": 5.274999999999999e-07, + "loss": -0.0025049750693142414, + "reward": 1.889495849609375, + "reward_std": 0.18460319191217422, + "rewards/GDino": 0.7844638228416443, + "rewards/GIT": 0.5320337116718292, + "rewards/HPSv2": 0.2508106231689453, + "rewards/ORM": 0.32218772172927856, + "self_certainty_semantic": -26.25, + "self_certainty_token": -21.25, + "step": 756 + }, + { + "completion_length": 94.4375, + "epoch": 0.8383167220376523, + "grad_norm": 0.46375179290771484, + "kl": 0.0706787109375, + "learning_rate": 5.268749999999999e-07, + "loss": -0.00581313855946064, + "reward": 2.1863173246383667, + "reward_std": 0.24684715270996094, + "rewards/GDino": 0.8971692025661469, + "rewards/GIT": 0.3403056859970093, + "rewards/HPSv2": 0.27822113037109375, + "rewards/ORM": 0.6706212162971497, + "self_certainty_semantic": -26.375, + "self_certainty_token": -22.625, + "step": 757 + }, + { + "completion_length": 95.140625, + "epoch": 0.8394241417497231, + "grad_norm": 0.45831671357154846, + "kl": 0.061279296875, + "learning_rate": 5.262499999999999e-07, + "loss": 0.0077309084590524435, + "reward": 1.8927339315414429, + "reward_std": 0.2615918070077896, + "rewards/GDino": 0.6197390854358673, + "rewards/GIT": 0.43407680094242096, + "rewards/HPSv2": 0.27208518981933594, + "rewards/ORM": 0.5668328106403351, + "self_certainty_semantic": -26.1875, + "self_certainty_token": -22.4375, + "step": 758 + }, + { + "completion_length": 79.765625, + "epoch": 0.840531561461794, + "grad_norm": 2.4955954551696777, + "kl": 0.0518798828125, + "learning_rate": 5.256249999999999e-07, + "loss": -0.0012904456816613674, + "reward": 2.257834553718567, + "reward_std": 0.21857471764087677, + "rewards/GDino": 0.7868121266365051, + "rewards/GIT": 0.3580681383609772, + "rewards/HPSv2": 0.27974510192871094, + "rewards/ORM": 0.8332091271877289, + "self_certainty_semantic": -26.125, + "self_certainty_token": -23.0625, + "step": 759 + }, + { + "completion_length": 91.578125, + "epoch": 0.8416389811738649, + "grad_norm": 0.4915505349636078, + "kl": 0.0350341796875, + "learning_rate": 5.25e-07, + "loss": -0.00024785241112113, + "reward": 1.9139019846916199, + "reward_std": 0.44706013798713684, + "rewards/GDino": 0.7436053156852722, + "rewards/GIT": 0.4668561816215515, + "rewards/HPSv2": 0.2475872039794922, + "rewards/ORM": 0.4558533579111099, + "self_certainty_semantic": -25.875, + "self_certainty_token": -21.375, + "step": 760 + }, + { + "completion_length": 91.828125, + "epoch": 0.8427464008859358, + "grad_norm": 0.4805181324481964, + "kl": 0.065185546875, + "learning_rate": 5.243750000000001e-07, + "loss": 0.0018783179111778736, + "reward": 2.342796564102173, + "reward_std": 0.4181799590587616, + "rewards/GDino": 0.8932035863399506, + "rewards/GIT": 0.5167878717184067, + "rewards/HPSv2": 0.2656135559082031, + "rewards/ORM": 0.6671915054321289, + "self_certainty_semantic": -26.125, + "self_certainty_token": -22.1875, + "step": 761 + }, + { + "completion_length": 77.5625, + "epoch": 0.8438538205980066, + "grad_norm": 0.5046535134315491, + "kl": 0.0579833984375, + "learning_rate": 5.237500000000001e-07, + "loss": -0.001801848877221346, + "reward": 2.5508885383605957, + "reward_std": 0.2785831242799759, + "rewards/GDino": 0.8223984837532043, + "rewards/GIT": 0.6196585893630981, + "rewards/HPSv2": 0.26508140563964844, + "rewards/ORM": 0.84375, + "self_certainty_semantic": -26.0625, + "self_certainty_token": -21.4375, + "step": 762 + }, + { + "completion_length": 78.125, + "epoch": 0.8449612403100775, + "grad_norm": 0.4450807571411133, + "kl": 0.0341796875, + "learning_rate": 5.23125e-07, + "loss": 0.003032726002857089, + "reward": 2.5107150077819824, + "reward_std": 0.2531661242246628, + "rewards/GDino": 0.817289412021637, + "rewards/GIT": 0.5925834476947784, + "rewards/HPSv2": 0.2774505615234375, + "rewards/ORM": 0.8233915567398071, + "self_certainty_semantic": -26.1875, + "self_certainty_token": -21.4375, + "step": 763 + }, + { + "completion_length": 103.375, + "epoch": 0.8460686600221484, + "grad_norm": 1.0239514112472534, + "kl": 0.044677734375, + "learning_rate": 5.225e-07, + "loss": -0.009909462183713913, + "reward": 2.346928358078003, + "reward_std": 0.22015134245157242, + "rewards/GDino": 0.8049172759056091, + "rewards/GIT": 0.47102078795433044, + "rewards/HPSv2": 0.29447364807128906, + "rewards/ORM": 0.7765165269374847, + "self_certainty_semantic": -26.1875, + "self_certainty_token": -22.1875, + "step": 764 + }, + { + "completion_length": 94.125, + "epoch": 0.8471760797342193, + "grad_norm": 0.5059560537338257, + "kl": 0.060302734375, + "learning_rate": 5.21875e-07, + "loss": 0.0009553208947181702, + "reward": 1.9430459141731262, + "reward_std": 0.3332773894071579, + "rewards/GDino": 0.7025792598724365, + "rewards/GIT": 0.3476816713809967, + "rewards/HPSv2": 0.2869396209716797, + "rewards/ORM": 0.6058453619480133, + "self_certainty_semantic": -26.125, + "self_certainty_token": -22.3125, + "step": 765 + }, + { + "completion_length": 95.53125, + "epoch": 0.8482834994462901, + "grad_norm": 0.8066332936286926, + "kl": 0.052490234375, + "learning_rate": 5.2125e-07, + "loss": 0.002569503616541624, + "reward": 2.2811152935028076, + "reward_std": 0.4060298502445221, + "rewards/GDino": 0.8662107586860657, + "rewards/GIT": 0.44846072793006897, + "rewards/HPSv2": 0.2616233825683594, + "rewards/ORM": 0.7048204243183136, + "self_certainty_semantic": -26.125, + "self_certainty_token": -22.9375, + "step": 766 + }, + { + "completion_length": 80.65625, + "epoch": 0.8493909191583611, + "grad_norm": 0.4587434232234955, + "kl": 0.0555419921875, + "learning_rate": 5.20625e-07, + "loss": 0.00809703441336751, + "reward": 2.583795666694641, + "reward_std": 0.23198994994163513, + "rewards/GDino": 0.9273055791854858, + "rewards/GIT": 0.549352616071701, + "rewards/HPSv2": 0.285400390625, + "rewards/ORM": 0.821737140417099, + "self_certainty_semantic": -26.25, + "self_certainty_token": -22.25, + "step": 767 + }, + { + "completion_length": 76.921875, + "epoch": 0.8504983388704319, + "grad_norm": 0.5769164562225342, + "kl": 0.09033203125, + "learning_rate": 5.2e-07, + "loss": -0.0009879026329144835, + "reward": 2.194294571876526, + "reward_std": 0.3342202305793762, + "rewards/GDino": 0.7172603607177734, + "rewards/GIT": 0.46812377870082855, + "rewards/HPSv2": 0.27801513671875, + "rewards/ORM": 0.7308952808380127, + "self_certainty_semantic": -25.8125, + "self_certainty_token": -21.875, + "step": 768 + }, + { + "completion_length": 80.75, + "epoch": 0.8516057585825028, + "grad_norm": 1.1964291334152222, + "kl": 0.077880859375, + "learning_rate": 5.19375e-07, + "loss": 0.0018652451690286398, + "reward": 2.228595495223999, + "reward_std": 0.26266513764858246, + "rewards/GDino": 0.7993140816688538, + "rewards/GIT": 0.3308692201972008, + "rewards/HPSv2": 0.28465843200683594, + "rewards/ORM": 0.8137537240982056, + "self_certainty_semantic": -26.25, + "self_certainty_token": -23.1875, + "step": 769 + }, + { + "completion_length": 95.6875, + "epoch": 0.8527131782945736, + "grad_norm": 0.5226348042488098, + "kl": 0.0523681640625, + "learning_rate": 5.1875e-07, + "loss": -0.007094914093613625, + "reward": 2.0200713872909546, + "reward_std": 0.22966229170560837, + "rewards/GDino": 0.7793743014335632, + "rewards/GIT": 0.3045049011707306, + "rewards/HPSv2": 0.28690147399902344, + "rewards/ORM": 0.6492905914783478, + "self_certainty_semantic": -26.0, + "self_certainty_token": -22.3125, + "step": 770 + }, + { + "completion_length": 75.6875, + "epoch": 0.8538205980066446, + "grad_norm": 0.6323092579841614, + "kl": 0.0673828125, + "learning_rate": 5.181249999999999e-07, + "loss": -0.0020664860494434834, + "reward": 2.0138555765151978, + "reward_std": 0.4761499762535095, + "rewards/GDino": 0.7040707767009735, + "rewards/GIT": 0.3145817220211029, + "rewards/HPSv2": 0.27645301818847656, + "rewards/ORM": 0.71875, + "self_certainty_semantic": -26.25, + "self_certainty_token": -22.6875, + "step": 771 + }, + { + "completion_length": 90.09375, + "epoch": 0.8549280177187154, + "grad_norm": 0.49237439036369324, + "kl": 0.076416015625, + "learning_rate": 5.174999999999999e-07, + "loss": -0.0016249592299573123, + "reward": 1.883021593093872, + "reward_std": 0.3885272890329361, + "rewards/GDino": 0.6244699656963348, + "rewards/GIT": 0.12319418787956238, + "rewards/HPSv2": 0.31054115295410156, + "rewards/ORM": 0.8248161971569061, + "self_certainty_semantic": -26.0, + "self_certainty_token": -22.375, + "step": 772 + }, + { + "completion_length": 94.03125, + "epoch": 0.8560354374307863, + "grad_norm": 0.47091352939605713, + "kl": 0.0382080078125, + "learning_rate": 5.168749999999999e-07, + "loss": 0.004989234614185989, + "reward": 2.6032445430755615, + "reward_std": 0.27818436920642853, + "rewards/GDino": 0.8882812559604645, + "rewards/GIT": 0.717700719833374, + "rewards/HPSv2": 0.2879295349121094, + "rewards/ORM": 0.7093330323696136, + "self_certainty_semantic": -26.1875, + "self_certainty_token": -22.875, + "step": 773 + }, + { + "completion_length": 88.015625, + "epoch": 0.8571428571428571, + "grad_norm": 0.4584091603755951, + "kl": 0.0439453125, + "learning_rate": 5.162499999999999e-07, + "loss": -0.007258395431563258, + "reward": 2.1296576261520386, + "reward_std": 0.33893588185310364, + "rewards/GDino": 0.8480814099311829, + "rewards/GIT": 0.4219045042991638, + "rewards/HPSv2": 0.28467559814453125, + "rewards/ORM": 0.5749960243701935, + "self_certainty_semantic": -26.0, + "self_certainty_token": -22.5, + "step": 774 + }, + { + "completion_length": 93.0625, + "epoch": 0.858250276854928, + "grad_norm": 0.5792718529701233, + "kl": 0.071533203125, + "learning_rate": 5.156249999999999e-07, + "loss": 0.00012620631605386734, + "reward": 1.8640543818473816, + "reward_std": 0.31884703040122986, + "rewards/GDino": 0.6935815513134003, + "rewards/GIT": 0.23067793250083923, + "rewards/HPSv2": 0.3131904602050781, + "rewards/ORM": 0.626604437828064, + "self_certainty_semantic": -26.0, + "self_certainty_token": -22.4375, + "step": 775 + }, + { + "completion_length": 87.390625, + "epoch": 0.8593576965669989, + "grad_norm": 1.0393602848052979, + "kl": 0.056884765625, + "learning_rate": 5.149999999999999e-07, + "loss": -0.0038496393244713545, + "reward": 2.2068997621536255, + "reward_std": 0.2295629158616066, + "rewards/GDino": 0.8344791829586029, + "rewards/GIT": 0.2208092212677002, + "rewards/HPSv2": 0.292236328125, + "rewards/ORM": 0.859375, + "self_certainty_semantic": -26.1875, + "self_certainty_token": -21.625, + "step": 776 + }, + { + "completion_length": 94.015625, + "epoch": 0.8604651162790697, + "grad_norm": 0.5387282967567444, + "kl": 0.050048828125, + "learning_rate": 5.14375e-07, + "loss": 0.0010228599421679974, + "reward": 2.5055289268493652, + "reward_std": 0.2184959053993225, + "rewards/GDino": 0.8076685070991516, + "rewards/GIT": 0.595233678817749, + "rewards/HPSv2": 0.30432701110839844, + "rewards/ORM": 0.7982996702194214, + "self_certainty_semantic": -26.25, + "self_certainty_token": -22.3125, + "step": 777 + }, + { + "completion_length": 89.046875, + "epoch": 0.8615725359911407, + "grad_norm": 0.4880676865577698, + "kl": 0.0477294921875, + "learning_rate": 5.137500000000001e-07, + "loss": 0.01086664735339582, + "reward": 2.0751246213912964, + "reward_std": 0.32341043651103973, + "rewards/GDino": 0.7633355259895325, + "rewards/GIT": 0.43662890046834946, + "rewards/HPSv2": 0.2818107604980469, + "rewards/ORM": 0.5933493673801422, + "self_certainty_semantic": -26.1875, + "self_certainty_token": -21.625, + "step": 778 + }, + { + "completion_length": 96.125, + "epoch": 0.8626799557032115, + "grad_norm": 1.5523226261138916, + "kl": 0.1392822265625, + "learning_rate": 5.131250000000001e-07, + "loss": 0.001118135405704379, + "reward": 2.253826379776001, + "reward_std": 0.264050155878067, + "rewards/GDino": 0.8408099710941315, + "rewards/GIT": 0.5711329728364944, + "rewards/HPSv2": 0.2841968536376953, + "rewards/ORM": 0.5576865971088409, + "self_certainty_semantic": -26.3125, + "self_certainty_token": -21.75, + "step": 779 + }, + { + "completion_length": 95.78125, + "epoch": 0.8637873754152824, + "grad_norm": 1.023186206817627, + "kl": 0.050048828125, + "learning_rate": 5.125e-07, + "loss": 0.006281588808633387, + "reward": 1.9050045013427734, + "reward_std": 0.3339765965938568, + "rewards/GDino": 0.688654750585556, + "rewards/GIT": 0.3326198533177376, + "rewards/HPSv2": 0.2743549346923828, + "rewards/ORM": 0.609375, + "self_certainty_semantic": -26.3125, + "self_certainty_token": -22.0625, + "step": 780 + }, + { + "completion_length": 92.796875, + "epoch": 0.8648947951273532, + "grad_norm": 0.8491339087486267, + "kl": 0.0823974609375, + "learning_rate": 5.11875e-07, + "loss": -0.007737545995041728, + "reward": 2.456026792526245, + "reward_std": 0.22171350568532944, + "rewards/GDino": 0.7824384868144989, + "rewards/GIT": 0.5615817904472351, + "rewards/HPSv2": 0.2869606018066406, + "rewards/ORM": 0.8250459432601929, + "self_certainty_semantic": -26.375, + "self_certainty_token": -21.9375, + "step": 781 + }, + { + "completion_length": 94.390625, + "epoch": 0.8660022148394242, + "grad_norm": 0.6378558278083801, + "kl": 0.07958984375, + "learning_rate": 5.1125e-07, + "loss": -0.0035567248705774546, + "reward": 2.4704430103302, + "reward_std": 0.16776273399591446, + "rewards/GDino": 0.7778480350971222, + "rewards/GIT": 0.5651848614215851, + "rewards/HPSv2": 0.2789268493652344, + "rewards/ORM": 0.8484834432601929, + "self_certainty_semantic": -26.25, + "self_certainty_token": -21.625, + "step": 782 + }, + { + "completion_length": 88.125, + "epoch": 0.867109634551495, + "grad_norm": 0.5412376523017883, + "kl": 0.039794921875, + "learning_rate": 5.10625e-07, + "loss": -0.0016353868413716555, + "reward": 2.490343689918518, + "reward_std": 0.22540678828954697, + "rewards/GDino": 0.9086370766162872, + "rewards/GIT": 0.3894339054822922, + "rewards/HPSv2": 0.2907562255859375, + "rewards/ORM": 0.9015165567398071, + "self_certainty_semantic": -26.1875, + "self_certainty_token": -22.5, + "step": 783 + }, + { + "completion_length": 86.203125, + "epoch": 0.8682170542635659, + "grad_norm": 0.7530696988105774, + "kl": 0.062255859375, + "learning_rate": 5.1e-07, + "loss": -0.0035481791710481048, + "reward": 1.9575245380401611, + "reward_std": 0.15946803614497185, + "rewards/GDino": 0.7408254146575928, + "rewards/GIT": 0.31404776126146317, + "rewards/HPSv2": 0.3059520721435547, + "rewards/ORM": 0.5966991782188416, + "self_certainty_semantic": -26.1875, + "self_certainty_token": -22.25, + "step": 784 + }, + { + "completion_length": 91.625, + "epoch": 0.8693244739756367, + "grad_norm": 0.559535026550293, + "kl": 0.045166015625, + "learning_rate": 5.09375e-07, + "loss": 0.009843888226896524, + "reward": 2.0219587683677673, + "reward_std": 0.41755446791648865, + "rewards/GDino": 0.656224250793457, + "rewards/GIT": 0.31519366800785065, + "rewards/HPSv2": 0.2849159240722656, + "rewards/ORM": 0.765625, + "self_certainty_semantic": -26.375, + "self_certainty_token": -22.125, + "step": 785 + }, + { + "completion_length": 93.828125, + "epoch": 0.8704318936877077, + "grad_norm": 0.8398950099945068, + "kl": 0.07666015625, + "learning_rate": 5.0875e-07, + "loss": -0.004492704989388585, + "reward": 2.2774981260299683, + "reward_std": 0.38220036029815674, + "rewards/GDino": 0.7403751015663147, + "rewards/GIT": 0.4549301117658615, + "rewards/HPSv2": 0.2786293029785156, + "rewards/ORM": 0.8035636246204376, + "self_certainty_semantic": -26.0625, + "self_certainty_token": -22.9375, + "step": 786 + }, + { + "completion_length": 92.8125, + "epoch": 0.8715393133997785, + "grad_norm": 0.9490128755569458, + "kl": 0.0667724609375, + "learning_rate": 5.08125e-07, + "loss": -0.005953711923211813, + "reward": 1.9052014350891113, + "reward_std": 0.25719454139471054, + "rewards/GDino": 0.6563822031021118, + "rewards/GIT": 0.28882522135972977, + "rewards/HPSv2": 0.2690143585205078, + "rewards/ORM": 0.6909796893596649, + "self_certainty_semantic": -26.25, + "self_certainty_token": -22.8125, + "step": 787 + }, + { + "completion_length": 79.6875, + "epoch": 0.8726467331118494, + "grad_norm": 0.7004393339157104, + "kl": 0.072265625, + "learning_rate": 5.074999999999999e-07, + "loss": 0.0019673602655529976, + "reward": 2.702498197555542, + "reward_std": 0.25008123368024826, + "rewards/GDino": 0.9359375238418579, + "rewards/GIT": 0.6444357335567474, + "rewards/HPSv2": 0.2881126403808594, + "rewards/ORM": 0.8340122699737549, + "self_certainty_semantic": -26.125, + "self_certainty_token": -22.125, + "step": 788 + }, + { + "completion_length": 90.265625, + "epoch": 0.8737541528239202, + "grad_norm": 0.7166645526885986, + "kl": 0.0556640625, + "learning_rate": 5.068749999999999e-07, + "loss": 0.001658524852246046, + "reward": 2.3782191276550293, + "reward_std": 0.2895669490098953, + "rewards/GDino": 0.8661348819732666, + "rewards/GIT": 0.5619442015886307, + "rewards/HPSv2": 0.27826499938964844, + "rewards/ORM": 0.671875, + "self_certainty_semantic": -26.375, + "self_certainty_token": -22.125, + "step": 789 + }, + { + "completion_length": 78.0, + "epoch": 0.8748615725359912, + "grad_norm": 0.4692370891571045, + "kl": 0.044921875, + "learning_rate": 5.062499999999999e-07, + "loss": 0.0036736687179654837, + "reward": 2.17470920085907, + "reward_std": 0.15269017592072487, + "rewards/GDino": 0.8616602122783661, + "rewards/GIT": 0.5441394448280334, + "rewards/HPSv2": 0.2845344543457031, + "rewards/ORM": 0.484375, + "self_certainty_semantic": -26.375, + "self_certainty_token": -22.375, + "step": 790 + }, + { + "completion_length": 84.546875, + "epoch": 0.875968992248062, + "grad_norm": 0.48845577239990234, + "kl": 0.079833984375, + "learning_rate": 5.056249999999999e-07, + "loss": -0.0011137682013213634, + "reward": 2.3695492148399353, + "reward_std": 0.24598948657512665, + "rewards/GDino": 0.8458333313465118, + "rewards/GIT": 0.5067659318447113, + "rewards/HPSv2": 0.27709007263183594, + "rewards/ORM": 0.7398597598075867, + "self_certainty_semantic": -26.0, + "self_certainty_token": -22.0625, + "step": 791 + }, + { + "completion_length": 94.390625, + "epoch": 0.8770764119601329, + "grad_norm": 0.3991207778453827, + "kl": 0.0240478515625, + "learning_rate": 5.049999999999999e-07, + "loss": -0.0010369333904236555, + "reward": 2.205116868019104, + "reward_std": 0.19120776653289795, + "rewards/GDino": 0.7229166626930237, + "rewards/GIT": 0.5277722105383873, + "rewards/HPSv2": 0.28067779541015625, + "rewards/ORM": 0.673750251531601, + "self_certainty_semantic": -25.9375, + "self_certainty_token": -22.0, + "step": 792 + }, + { + "completion_length": 88.71875, + "epoch": 0.8781838316722038, + "grad_norm": 1.1761263608932495, + "kl": 0.0478515625, + "learning_rate": 5.04375e-07, + "loss": 0.005101980525068939, + "reward": 2.3470221757888794, + "reward_std": 0.16771353781223297, + "rewards/GDino": 0.8018229305744171, + "rewards/GIT": 0.5189051479101181, + "rewards/HPSv2": 0.2953987121582031, + "rewards/ORM": 0.7308952808380127, + "self_certainty_semantic": -25.9375, + "self_certainty_token": -22.1875, + "step": 793 + }, + { + "completion_length": 88.65625, + "epoch": 0.8792912513842747, + "grad_norm": 0.6012600064277649, + "kl": 0.0672607421875, + "learning_rate": 5.0375e-07, + "loss": -0.004321191692724824, + "reward": 2.4209903478622437, + "reward_std": 0.3858864903450012, + "rewards/GDino": 0.778124988079071, + "rewards/GIT": 0.5576039850711823, + "rewards/HPSv2": 0.2678070068359375, + "rewards/ORM": 0.8174543380737305, + "self_certainty_semantic": -26.25, + "self_certainty_token": -22.1875, + "step": 794 + }, + { + "completion_length": 83.6875, + "epoch": 0.8803986710963455, + "grad_norm": 0.3953668177127838, + "kl": 0.0341796875, + "learning_rate": 5.031250000000001e-07, + "loss": 0.0030281259678304195, + "reward": 2.122947931289673, + "reward_std": 0.23448041081428528, + "rewards/GDino": 0.7804083824157715, + "rewards/GIT": 0.5292441993951797, + "rewards/HPSv2": 0.2820453643798828, + "rewards/ORM": 0.5312499850988388, + "self_certainty_semantic": -26.0, + "self_certainty_token": -21.5, + "step": 795 + }, + { + "completion_length": 95.078125, + "epoch": 0.8815060908084164, + "grad_norm": 0.3811919391155243, + "kl": 0.0419921875, + "learning_rate": 5.025e-07, + "loss": 0.01182631985284388, + "reward": 2.218349575996399, + "reward_std": 0.3526668846607208, + "rewards/GDino": 0.7362289130687714, + "rewards/GIT": 0.4473404586315155, + "rewards/HPSv2": 0.302490234375, + "rewards/ORM": 0.7322899401187897, + "self_certainty_semantic": -26.125, + "self_certainty_token": -21.75, + "step": 796 + }, + { + "completion_length": 93.8125, + "epoch": 0.8826135105204873, + "grad_norm": 0.4822554588317871, + "kl": 0.045166015625, + "learning_rate": 5.01875e-07, + "loss": -0.015912785660475492, + "reward": 2.493619680404663, + "reward_std": 0.2686537802219391, + "rewards/GDino": 0.864062488079071, + "rewards/GIT": 0.5744251310825348, + "rewards/HPSv2": 0.28022003173828125, + "rewards/ORM": 0.7749121189117432, + "self_certainty_semantic": -26.1875, + "self_certainty_token": -22.375, + "step": 797 + }, + { + "completion_length": 102.828125, + "epoch": 0.8837209302325582, + "grad_norm": 0.8867488503456116, + "kl": 0.052490234375, + "learning_rate": 5.0125e-07, + "loss": 0.012860861606895924, + "reward": 1.5301844477653503, + "reward_std": 0.25543633103370667, + "rewards/GDino": 0.6588541269302368, + "rewards/GIT": 0.3966846615076065, + "rewards/HPSv2": 0.26370811462402344, + "rewards/ORM": 0.2109375, + "self_certainty_semantic": -26.0, + "self_certainty_token": -22.125, + "step": 798 + }, + { + "completion_length": 96.578125, + "epoch": 0.884828349944629, + "grad_norm": 0.5731300115585327, + "kl": 0.0601806640625, + "learning_rate": 5.00625e-07, + "loss": -0.005381495226174593, + "reward": 1.8169286251068115, + "reward_std": 0.2464291751384735, + "rewards/GDino": 0.6291365325450897, + "rewards/GIT": 0.3078873082995415, + "rewards/HPSv2": 0.28861236572265625, + "rewards/ORM": 0.5912924110889435, + "self_certainty_semantic": -26.375, + "self_certainty_token": -22.0, + "step": 799 + }, + { + "completion_length": 87.328125, + "epoch": 0.8859357696566998, + "grad_norm": 1.1065376996994019, + "kl": 0.0673828125, + "learning_rate": 5e-07, + "loss": 0.010202036239206791, + "reward": 2.076913356781006, + "reward_std": 0.34223534166812897, + "rewards/GDino": 0.7920229732990265, + "rewards/GIT": 0.5558872669935226, + "rewards/HPSv2": 0.2712745666503906, + "rewards/ORM": 0.4577286019921303, + "self_certainty_semantic": -26.3125, + "self_certainty_token": -21.8125, + "step": 800 + } + ], + "logging_steps": 1.0, + "max_steps": 1600, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}