| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 0.4429678848283499, | |
| "eval_steps": 500, | |
| "global_step": 400, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "completion_length": 73.265625, | |
| "epoch": 0.0011074197120708748, | |
| "grad_norm": 0.47520893812179565, | |
| "kl": 0.0, | |
| "learning_rate": 9.99375e-07, | |
| "loss": 0.000854941550642252, | |
| "reward": 2.2648561000823975, | |
| "reward_std": 0.32521533221006393, | |
| "rewards/GDino": 0.84943026304245, | |
| "rewards/GIT": 0.5776679813861847, | |
| "rewards/HPSv2": 0.2639656066894531, | |
| "rewards/ORM": 0.5737921893596649, | |
| "self_certainty_semantic": -25.4375, | |
| "self_certainty_token": -22.0, | |
| "step": 1 | |
| }, | |
| { | |
| "completion_length": 57.359375, | |
| "epoch": 0.0022148394241417496, | |
| "grad_norm": 0.7006784677505493, | |
| "kl": 0.00151824951171875, | |
| "learning_rate": 9.9875e-07, | |
| "loss": 0.0010380030144006014, | |
| "reward": 1.6890186071395874, | |
| "reward_std": 0.5064275413751602, | |
| "rewards/GDino": 0.7000000476837158, | |
| "rewards/GIT": 0.161313958466053, | |
| "rewards/HPSv2": 0.2509632110595703, | |
| "rewards/ORM": 0.5767413973808289, | |
| "self_certainty_semantic": -25.375, | |
| "self_certainty_token": -20.5625, | |
| "step": 2 | |
| }, | |
| { | |
| "completion_length": 54.640625, | |
| "epoch": 0.0033222591362126247, | |
| "grad_norm": 0.5812113285064697, | |
| "kl": 0.001556396484375, | |
| "learning_rate": 9.98125e-07, | |
| "loss": -0.0055133504793047905, | |
| "reward": 1.5832943320274353, | |
| "reward_std": 0.3882431983947754, | |
| "rewards/GDino": 0.6165956258773804, | |
| "rewards/GIT": 0.3970412313938141, | |
| "rewards/HPSv2": 0.24474143981933594, | |
| "rewards/ORM": 0.3249160535633564, | |
| "self_certainty_semantic": -25.1875, | |
| "self_certainty_token": -20.8125, | |
| "step": 3 | |
| }, | |
| { | |
| "completion_length": 63.578125, | |
| "epoch": 0.004429678848283499, | |
| "grad_norm": 0.6130731105804443, | |
| "kl": 0.001605987548828125, | |
| "learning_rate": 9.975e-07, | |
| "loss": -0.005623435601592064, | |
| "reward": 2.1563462018966675, | |
| "reward_std": 0.3505118489265442, | |
| "rewards/GDino": 0.8188963234424591, | |
| "rewards/GIT": 0.4581628292798996, | |
| "rewards/HPSv2": 0.24955368041992188, | |
| "rewards/ORM": 0.6297334432601929, | |
| "self_certainty_semantic": -25.5, | |
| "self_certainty_token": -22.0, | |
| "step": 4 | |
| }, | |
| { | |
| "completion_length": 57.65625, | |
| "epoch": 0.005537098560354375, | |
| "grad_norm": 0.8068524599075317, | |
| "kl": 0.00165557861328125, | |
| "learning_rate": 9.968749999999999e-07, | |
| "loss": -0.0018901200965046883, | |
| "reward": 1.6294466853141785, | |
| "reward_std": 0.3914882242679596, | |
| "rewards/GDino": 0.6075743436813354, | |
| "rewards/GIT": 0.2503758817911148, | |
| "rewards/HPSv2": 0.2523918151855469, | |
| "rewards/ORM": 0.5191046893596649, | |
| "self_certainty_semantic": -25.4375, | |
| "self_certainty_token": -21.8125, | |
| "step": 5 | |
| }, | |
| { | |
| "completion_length": 65.8125, | |
| "epoch": 0.006644518272425249, | |
| "grad_norm": 74728.3515625, | |
| "kl": 228.00085067749023, | |
| "learning_rate": 9.9625e-07, | |
| "loss": 2.2879227567464113, | |
| "reward": 2.15460866689682, | |
| "reward_std": 0.18937285244464874, | |
| "rewards/GDino": 0.7502027153968811, | |
| "rewards/GIT": 0.4551280438899994, | |
| "rewards/HPSv2": 0.2774028778076172, | |
| "rewards/ORM": 0.671875, | |
| "self_certainty_semantic": -25.3125, | |
| "self_certainty_token": -22.625, | |
| "step": 6 | |
| }, | |
| { | |
| "completion_length": 65.640625, | |
| "epoch": 0.007751937984496124, | |
| "grad_norm": 0.9850716590881348, | |
| "kl": 0.001739501953125, | |
| "learning_rate": 9.956249999999999e-07, | |
| "loss": -0.009785129223018885, | |
| "reward": 1.6486687660217285, | |
| "reward_std": 0.55589759349823, | |
| "rewards/GDino": 0.5765624940395355, | |
| "rewards/GIT": 0.15754839032888412, | |
| "rewards/HPSv2": 0.2522296905517578, | |
| "rewards/ORM": 0.6623281538486481, | |
| "self_certainty_semantic": -25.5625, | |
| "self_certainty_token": -22.1875, | |
| "step": 7 | |
| }, | |
| { | |
| "completion_length": 65.796875, | |
| "epoch": 0.008859357696566999, | |
| "grad_norm": 0.8074976801872253, | |
| "kl": 0.001628875732421875, | |
| "learning_rate": 9.95e-07, | |
| "loss": 0.0002866658614948392, | |
| "reward": 1.7531355023384094, | |
| "reward_std": 0.3834189176559448, | |
| "rewards/GDino": 0.7171875536441803, | |
| "rewards/GIT": 0.3904750794172287, | |
| "rewards/HPSv2": 0.2441272735595703, | |
| "rewards/ORM": 0.4013456404209137, | |
| "self_certainty_semantic": -25.4375, | |
| "self_certainty_token": -20.5, | |
| "step": 8 | |
| }, | |
| { | |
| "completion_length": 61.53125, | |
| "epoch": 0.009966777408637873, | |
| "grad_norm": 0.5135362148284912, | |
| "kl": 0.001628875732421875, | |
| "learning_rate": 9.94375e-07, | |
| "loss": -0.002820038120262325, | |
| "reward": 2.1886491775512695, | |
| "reward_std": 0.5042529106140137, | |
| "rewards/GDino": 0.800000011920929, | |
| "rewards/GIT": 0.3224633187055588, | |
| "rewards/HPSv2": 0.2661018371582031, | |
| "rewards/ORM": 0.8000838756561279, | |
| "self_certainty_semantic": -25.4375, | |
| "self_certainty_token": -21.5, | |
| "step": 9 | |
| }, | |
| { | |
| "completion_length": 66.9375, | |
| "epoch": 0.01107419712070875, | |
| "grad_norm": 1.035406231880188, | |
| "kl": 0.001590728759765625, | |
| "learning_rate": 9.9375e-07, | |
| "loss": 0.010037540923804045, | |
| "reward": 1.8388126492500305, | |
| "reward_std": 0.385573148727417, | |
| "rewards/GDino": 0.729426920413971, | |
| "rewards/GIT": 0.47063055634498596, | |
| "rewards/HPSv2": 0.25093841552734375, | |
| "rewards/ORM": 0.3878167122602463, | |
| "self_certainty_semantic": -25.375, | |
| "self_certainty_token": -20.75, | |
| "step": 10 | |
| }, | |
| { | |
| "completion_length": 54.65625, | |
| "epoch": 0.012181616832779624, | |
| "grad_norm": 0.6659172773361206, | |
| "kl": 0.00159454345703125, | |
| "learning_rate": 9.93125e-07, | |
| "loss": -0.010986692272126675, | |
| "reward": 2.312160015106201, | |
| "reward_std": 0.3424924612045288, | |
| "rewards/GDino": 0.7864583432674408, | |
| "rewards/GIT": 0.5519254580140114, | |
| "rewards/HPSv2": 0.2634601593017578, | |
| "rewards/ORM": 0.710316002368927, | |
| "self_certainty_semantic": -25.3125, | |
| "self_certainty_token": -20.9375, | |
| "step": 11 | |
| }, | |
| { | |
| "completion_length": 65.6875, | |
| "epoch": 0.013289036544850499, | |
| "grad_norm": 0.4100457727909088, | |
| "kl": 0.00152587890625, | |
| "learning_rate": 9.925e-07, | |
| "loss": -0.0020649502985179424, | |
| "reward": 1.831676721572876, | |
| "reward_std": 0.37266574054956436, | |
| "rewards/GDino": 0.6748343408107758, | |
| "rewards/GIT": 0.3966377377510071, | |
| "rewards/HPSv2": 0.2431049346923828, | |
| "rewards/ORM": 0.5170995742082596, | |
| "self_certainty_semantic": -25.5, | |
| "self_certainty_token": -21.5, | |
| "step": 12 | |
| }, | |
| { | |
| "completion_length": 62.15625, | |
| "epoch": 0.014396456256921373, | |
| "grad_norm": 1.1354421377182007, | |
| "kl": 0.0016326904296875, | |
| "learning_rate": 9.91875e-07, | |
| "loss": -0.0013978920178487897, | |
| "reward": 1.7478299736976624, | |
| "reward_std": 0.3111024349927902, | |
| "rewards/GDino": 0.7122170925140381, | |
| "rewards/GIT": 0.28808362782001495, | |
| "rewards/HPSv2": 0.2510089874267578, | |
| "rewards/ORM": 0.4965202957391739, | |
| "self_certainty_semantic": -25.4375, | |
| "self_certainty_token": -21.6875, | |
| "step": 13 | |
| }, | |
| { | |
| "completion_length": 63.734375, | |
| "epoch": 0.015503875968992248, | |
| "grad_norm": 171.63954162597656, | |
| "kl": 11.750831604003906, | |
| "learning_rate": 9.912499999999998e-07, | |
| "loss": 0.11320369923487306, | |
| "reward": 1.820958137512207, | |
| "reward_std": 0.6430586874485016, | |
| "rewards/GDino": 0.7286913394927979, | |
| "rewards/GIT": 0.39159613847732544, | |
| "rewards/HPSv2": 0.222503662109375, | |
| "rewards/ORM": 0.47816696763038635, | |
| "self_certainty_semantic": -25.375, | |
| "self_certainty_token": -21.875, | |
| "step": 14 | |
| }, | |
| { | |
| "completion_length": 64.796875, | |
| "epoch": 0.016611295681063124, | |
| "grad_norm": 1.790418267250061, | |
| "kl": 0.001697540283203125, | |
| "learning_rate": 9.90625e-07, | |
| "loss": -0.0012796747614629567, | |
| "reward": 2.4724700450897217, | |
| "reward_std": 0.361017182469368, | |
| "rewards/GDino": 0.8982033133506775, | |
| "rewards/GIT": 0.5411243438720703, | |
| "rewards/HPSv2": 0.2581005096435547, | |
| "rewards/ORM": 0.7750419676303864, | |
| "self_certainty_semantic": -25.625, | |
| "self_certainty_token": -21.8125, | |
| "step": 15 | |
| }, | |
| { | |
| "completion_length": 65.078125, | |
| "epoch": 0.017718715393133997, | |
| "grad_norm": 0.38361120223999023, | |
| "kl": 0.0015869140625, | |
| "learning_rate": 9.9e-07, | |
| "loss": 0.006866331794299185, | |
| "reward": 1.5055131912231445, | |
| "reward_std": 0.40322621166706085, | |
| "rewards/GDino": 0.651562511920929, | |
| "rewards/GIT": 0.2843637466430664, | |
| "rewards/HPSv2": 0.24664592742919922, | |
| "rewards/ORM": 0.32294100522994995, | |
| "self_certainty_semantic": -25.5, | |
| "self_certainty_token": -21.125, | |
| "step": 16 | |
| }, | |
| { | |
| "completion_length": 70.25, | |
| "epoch": 0.018826135105204873, | |
| "grad_norm": 1.0185045003890991, | |
| "kl": 0.001552581787109375, | |
| "learning_rate": 9.89375e-07, | |
| "loss": -0.010323233203962445, | |
| "reward": 1.5897727608680725, | |
| "reward_std": 0.530043363571167, | |
| "rewards/GDino": 0.5529386103153229, | |
| "rewards/GIT": 0.2131059616804123, | |
| "rewards/HPSv2": 0.2552909851074219, | |
| "rewards/ORM": 0.5684372782707214, | |
| "self_certainty_semantic": -25.3125, | |
| "self_certainty_token": -20.5, | |
| "step": 17 | |
| }, | |
| { | |
| "completion_length": 66.34375, | |
| "epoch": 0.019933554817275746, | |
| "grad_norm": 0.4375481605529785, | |
| "kl": 0.00156402587890625, | |
| "learning_rate": 9.8875e-07, | |
| "loss": -0.00136462040245533, | |
| "reward": 2.063610315322876, | |
| "reward_std": 0.42642320692539215, | |
| "rewards/GDino": 0.7955474257469177, | |
| "rewards/GIT": 0.5150393098592758, | |
| "rewards/HPSv2": 0.22445201873779297, | |
| "rewards/ORM": 0.528571605682373, | |
| "self_certainty_semantic": -25.4375, | |
| "self_certainty_token": -20.75, | |
| "step": 18 | |
| }, | |
| { | |
| "completion_length": 59.21875, | |
| "epoch": 0.021040974529346623, | |
| "grad_norm": 0.3959902226924896, | |
| "kl": 0.00164031982421875, | |
| "learning_rate": 9.88125e-07, | |
| "loss": -0.0053134458139538765, | |
| "reward": 1.5237417221069336, | |
| "reward_std": 0.4693976193666458, | |
| "rewards/GDino": 0.701702356338501, | |
| "rewards/GIT": 0.2579326629638672, | |
| "rewards/HPSv2": 0.24812698364257812, | |
| "rewards/ORM": 0.3159796893596649, | |
| "self_certainty_semantic": -25.1875, | |
| "self_certainty_token": -21.5625, | |
| "step": 19 | |
| }, | |
| { | |
| "completion_length": 61.484375, | |
| "epoch": 0.0221483942414175, | |
| "grad_norm": 0.5081169605255127, | |
| "kl": 0.001689910888671875, | |
| "learning_rate": 9.875e-07, | |
| "loss": 0.0003520832397043705, | |
| "reward": 1.9516127109527588, | |
| "reward_std": 0.2731045335531235, | |
| "rewards/GDino": 0.6437798738479614, | |
| "rewards/GIT": 0.4635310173034668, | |
| "rewards/HPSv2": 0.24121475219726562, | |
| "rewards/ORM": 0.6030870825052261, | |
| "self_certainty_semantic": -25.1875, | |
| "self_certainty_token": -21.5625, | |
| "step": 20 | |
| }, | |
| { | |
| "completion_length": 55.546875, | |
| "epoch": 0.023255813953488372, | |
| "grad_norm": 0.4565694034099579, | |
| "kl": 0.001667022705078125, | |
| "learning_rate": 9.86875e-07, | |
| "loss": 0.0016932454891502857, | |
| "reward": 2.180082321166992, | |
| "reward_std": 0.5037369430065155, | |
| "rewards/GDino": 0.7953125238418579, | |
| "rewards/GIT": 0.45517681539058685, | |
| "rewards/HPSv2": 0.2586212158203125, | |
| "rewards/ORM": 0.6709719300270081, | |
| "self_certainty_semantic": -25.25, | |
| "self_certainty_token": -21.9375, | |
| "step": 21 | |
| }, | |
| { | |
| "completion_length": 68.75, | |
| "epoch": 0.024363233665559248, | |
| "grad_norm": 0.45827633142471313, | |
| "kl": 0.001712799072265625, | |
| "learning_rate": 9.862499999999999e-07, | |
| "loss": 0.0007174527272582054, | |
| "reward": 1.8721013069152832, | |
| "reward_std": 0.4303991347551346, | |
| "rewards/GDino": 0.6911458671092987, | |
| "rewards/GIT": 0.36048486828804016, | |
| "rewards/HPSv2": 0.2603263854980469, | |
| "rewards/ORM": 0.5601442009210587, | |
| "self_certainty_semantic": -25.6875, | |
| "self_certainty_token": -22.4375, | |
| "step": 22 | |
| }, | |
| { | |
| "completion_length": 58.609375, | |
| "epoch": 0.02547065337763012, | |
| "grad_norm": 0.6875389218330383, | |
| "kl": 0.00162506103515625, | |
| "learning_rate": 9.85625e-07, | |
| "loss": -0.004631380317732692, | |
| "reward": 1.9805514812469482, | |
| "reward_std": 0.5138447731733322, | |
| "rewards/GDino": 0.706105500459671, | |
| "rewards/GIT": 0.4199465811252594, | |
| "rewards/HPSv2": 0.26941490173339844, | |
| "rewards/ORM": 0.5850843787193298, | |
| "self_certainty_semantic": -25.4375, | |
| "self_certainty_token": -20.9375, | |
| "step": 23 | |
| }, | |
| { | |
| "completion_length": 60.859375, | |
| "epoch": 0.026578073089700997, | |
| "grad_norm": 0.5052416324615479, | |
| "kl": 0.001667022705078125, | |
| "learning_rate": 9.849999999999999e-07, | |
| "loss": -0.0046843914315104485, | |
| "reward": 2.368114173412323, | |
| "reward_std": 0.4367552697658539, | |
| "rewards/GDino": 0.815625011920929, | |
| "rewards/GIT": 0.633857935667038, | |
| "rewards/HPSv2": 0.25930213928222656, | |
| "rewards/ORM": 0.6593290567398071, | |
| "self_certainty_semantic": -25.75, | |
| "self_certainty_token": -21.875, | |
| "step": 24 | |
| }, | |
| { | |
| "completion_length": 61.078125, | |
| "epoch": 0.02768549280177187, | |
| "grad_norm": 0.6162320971488953, | |
| "kl": 0.001617431640625, | |
| "learning_rate": 9.84375e-07, | |
| "loss": -0.005464642075821757, | |
| "reward": 1.9494624137878418, | |
| "reward_std": 0.40468768775463104, | |
| "rewards/GDino": 0.6967671811580658, | |
| "rewards/GIT": 0.40975040197372437, | |
| "rewards/HPSv2": 0.26043701171875, | |
| "rewards/ORM": 0.5825077295303345, | |
| "self_certainty_semantic": -25.5, | |
| "self_certainty_token": -21.3125, | |
| "step": 25 | |
| }, | |
| { | |
| "completion_length": 50.734375, | |
| "epoch": 0.028792912513842746, | |
| "grad_norm": 2.8454437255859375, | |
| "kl": 0.001804351806640625, | |
| "learning_rate": 9.8375e-07, | |
| "loss": -0.006305628921836615, | |
| "reward": 2.190965175628662, | |
| "reward_std": 0.44982025027275085, | |
| "rewards/GDino": 0.7243013381958008, | |
| "rewards/GIT": 0.5294483602046967, | |
| "rewards/HPSv2": 0.2750282287597656, | |
| "rewards/ORM": 0.6621872782707214, | |
| "self_certainty_semantic": -25.3125, | |
| "self_certainty_token": -22.375, | |
| "step": 26 | |
| }, | |
| { | |
| "completion_length": 62.484375, | |
| "epoch": 0.029900332225913623, | |
| "grad_norm": 0.4033506512641907, | |
| "kl": 0.0016021728515625, | |
| "learning_rate": 9.83125e-07, | |
| "loss": -0.0016465974040329456, | |
| "reward": 1.9733637571334839, | |
| "reward_std": 0.44280076026916504, | |
| "rewards/GDino": 0.7363362908363342, | |
| "rewards/GIT": 0.4528593420982361, | |
| "rewards/HPSv2": 0.24550628662109375, | |
| "rewards/ORM": 0.5386618673801422, | |
| "self_certainty_semantic": -25.5, | |
| "self_certainty_token": -22.375, | |
| "step": 27 | |
| }, | |
| { | |
| "completion_length": 65.046875, | |
| "epoch": 0.031007751937984496, | |
| "grad_norm": 0.559298574924469, | |
| "kl": 0.00167083740234375, | |
| "learning_rate": 9.825e-07, | |
| "loss": 0.004501585033722222, | |
| "reward": 1.4280173778533936, | |
| "reward_std": 0.27060839533805847, | |
| "rewards/GDino": 0.5987553596496582, | |
| "rewards/GIT": 0.10973574221134186, | |
| "rewards/HPSv2": 0.2664012908935547, | |
| "rewards/ORM": 0.453125, | |
| "self_certainty_semantic": -25.4375, | |
| "self_certainty_token": -20.9375, | |
| "step": 28 | |
| }, | |
| { | |
| "completion_length": 55.5625, | |
| "epoch": 0.03211517165005537, | |
| "grad_norm": 0.42233753204345703, | |
| "kl": 0.00168609619140625, | |
| "learning_rate": 9.81875e-07, | |
| "loss": -0.005473613273352385, | |
| "reward": 2.4506709575653076, | |
| "reward_std": 0.20222720131278038, | |
| "rewards/GDino": 0.8296874761581421, | |
| "rewards/GIT": 0.605083167552948, | |
| "rewards/HPSv2": 0.285858154296875, | |
| "rewards/ORM": 0.7300421893596649, | |
| "self_certainty_semantic": -25.6875, | |
| "self_certainty_token": -20.9375, | |
| "step": 29 | |
| }, | |
| { | |
| "completion_length": 57.640625, | |
| "epoch": 0.03322259136212625, | |
| "grad_norm": 0.5650274157524109, | |
| "kl": 0.0016326904296875, | |
| "learning_rate": 9.8125e-07, | |
| "loss": 0.0003150699194520712, | |
| "reward": 2.489137649536133, | |
| "reward_std": 0.4210814982652664, | |
| "rewards/GDino": 0.8948009014129639, | |
| "rewards/GIT": 0.586266428232193, | |
| "rewards/HPSv2": 0.24865341186523438, | |
| "rewards/ORM": 0.7594169676303864, | |
| "self_certainty_semantic": -25.3125, | |
| "self_certainty_token": -21.0625, | |
| "step": 30 | |
| }, | |
| { | |
| "completion_length": 78.78125, | |
| "epoch": 0.03433001107419712, | |
| "grad_norm": 0.6762183308601379, | |
| "kl": 0.001613616943359375, | |
| "learning_rate": 9.806249999999998e-07, | |
| "loss": 0.007568572706077248, | |
| "reward": 1.8555968403816223, | |
| "reward_std": 0.2906922847032547, | |
| "rewards/GDino": 0.5989583432674408, | |
| "rewards/GIT": 0.38505683839321136, | |
| "rewards/HPSv2": 0.2403736114501953, | |
| "rewards/ORM": 0.6312080323696136, | |
| "self_certainty_semantic": -25.5625, | |
| "self_certainty_token": -20.5625, | |
| "step": 31 | |
| }, | |
| { | |
| "completion_length": 62.5, | |
| "epoch": 0.035437430786267994, | |
| "grad_norm": 0.4184902012348175, | |
| "kl": 0.001628875732421875, | |
| "learning_rate": 9.8e-07, | |
| "loss": 0.007896744413301349, | |
| "reward": 1.495099127292633, | |
| "reward_std": 0.3622882664203644, | |
| "rewards/GDino": 0.6791666448116302, | |
| "rewards/GIT": 0.25104063749313354, | |
| "rewards/HPSv2": 0.23050880432128906, | |
| "rewards/ORM": 0.3343829959630966, | |
| "self_certainty_semantic": -25.3125, | |
| "self_certainty_token": -22.0625, | |
| "step": 32 | |
| }, | |
| { | |
| "completion_length": 70.109375, | |
| "epoch": 0.036544850498338874, | |
| "grad_norm": 0.47143352031707764, | |
| "kl": 0.0016937255859375, | |
| "learning_rate": 9.79375e-07, | |
| "loss": 0.00709247519262135, | |
| "reward": 2.3964842557907104, | |
| "reward_std": 0.5415211468935013, | |
| "rewards/GDino": 0.897656261920929, | |
| "rewards/GIT": 0.6205766499042511, | |
| "rewards/HPSv2": 0.2254810333251953, | |
| "rewards/ORM": 0.6527703106403351, | |
| "self_certainty_semantic": -25.3125, | |
| "self_certainty_token": -21.625, | |
| "step": 33 | |
| }, | |
| { | |
| "completion_length": 55.53125, | |
| "epoch": 0.03765227021040975, | |
| "grad_norm": 0.45762747526168823, | |
| "kl": 0.001678466796875, | |
| "learning_rate": 9.7875e-07, | |
| "loss": 0.020488019566982985, | |
| "reward": 1.9143174886703491, | |
| "reward_std": 0.2841227799654007, | |
| "rewards/GDino": 0.6593749821186066, | |
| "rewards/GIT": 0.4214262217283249, | |
| "rewards/HPSv2": 0.2424945831298828, | |
| "rewards/ORM": 0.5910216569900513, | |
| "self_certainty_semantic": -25.3125, | |
| "self_certainty_token": -21.75, | |
| "step": 34 | |
| }, | |
| { | |
| "completion_length": 55.828125, | |
| "epoch": 0.03875968992248062, | |
| "grad_norm": 0.3845841884613037, | |
| "kl": 0.00167083740234375, | |
| "learning_rate": 9.78125e-07, | |
| "loss": 0.01862273830920458, | |
| "reward": 2.274049997329712, | |
| "reward_std": 0.28603486716747284, | |
| "rewards/GDino": 0.7786458432674408, | |
| "rewards/GIT": 0.5405041128396988, | |
| "rewards/HPSv2": 0.23740386962890625, | |
| "rewards/ORM": 0.7174962311983109, | |
| "self_certainty_semantic": -25.3125, | |
| "self_certainty_token": -21.25, | |
| "step": 35 | |
| }, | |
| { | |
| "completion_length": 63.234375, | |
| "epoch": 0.03986710963455149, | |
| "grad_norm": 0.5729533433914185, | |
| "kl": 0.001678466796875, | |
| "learning_rate": 9.775e-07, | |
| "loss": -0.002963901497423649, | |
| "reward": 1.8639960289001465, | |
| "reward_std": 0.3890039473772049, | |
| "rewards/GDino": 0.6255208253860474, | |
| "rewards/GIT": 0.42713797092437744, | |
| "rewards/HPSv2": 0.24535751342773438, | |
| "rewards/ORM": 0.5659796744585037, | |
| "self_certainty_semantic": -25.625, | |
| "self_certainty_token": -21.3125, | |
| "step": 36 | |
| }, | |
| { | |
| "completion_length": 63.09375, | |
| "epoch": 0.04097452934662237, | |
| "grad_norm": 0.47338196635246277, | |
| "kl": 0.001888275146484375, | |
| "learning_rate": 9.76875e-07, | |
| "loss": 0.008916446007788181, | |
| "reward": 1.9735829830169678, | |
| "reward_std": 0.5416238605976105, | |
| "rewards/GDino": 0.7008762061595917, | |
| "rewards/GIT": 0.3141380175948143, | |
| "rewards/HPSv2": 0.2595968246459961, | |
| "rewards/ORM": 0.6989719867706299, | |
| "self_certainty_semantic": -25.375, | |
| "self_certainty_token": -23.125, | |
| "step": 37 | |
| }, | |
| { | |
| "completion_length": 58.640625, | |
| "epoch": 0.042081949058693245, | |
| "grad_norm": 1.639336347579956, | |
| "kl": 0.001651763916015625, | |
| "learning_rate": 9.7625e-07, | |
| "loss": -0.0003745388239622116, | |
| "reward": 1.8843677639961243, | |
| "reward_std": 0.27646802365779877, | |
| "rewards/GDino": 0.7309310734272003, | |
| "rewards/GIT": 0.2879854440689087, | |
| "rewards/HPSv2": 0.25732994079589844, | |
| "rewards/ORM": 0.6081212311983109, | |
| "self_certainty_semantic": -25.375, | |
| "self_certainty_token": -21.0625, | |
| "step": 38 | |
| }, | |
| { | |
| "completion_length": 54.453125, | |
| "epoch": 0.04318936877076412, | |
| "grad_norm": 0.4438176453113556, | |
| "kl": 0.00176239013671875, | |
| "learning_rate": 9.756249999999999e-07, | |
| "loss": -0.004410726949572563, | |
| "reward": 2.3740460872650146, | |
| "reward_std": 0.26216618716716766, | |
| "rewards/GDino": 0.8794216811656952, | |
| "rewards/GIT": 0.480433389544487, | |
| "rewards/HPSv2": 0.2703990936279297, | |
| "rewards/ORM": 0.7437919676303864, | |
| "self_certainty_semantic": -25.5, | |
| "self_certainty_token": -21.0, | |
| "step": 39 | |
| }, | |
| { | |
| "completion_length": 64.65625, | |
| "epoch": 0.044296788482835, | |
| "grad_norm": 0.9789016246795654, | |
| "kl": 0.0017242431640625, | |
| "learning_rate": 9.75e-07, | |
| "loss": -0.0008055282523855567, | |
| "reward": 2.2535433769226074, | |
| "reward_std": 0.46909773349761963, | |
| "rewards/GDino": 0.8751652538776398, | |
| "rewards/GIT": 0.4070926010608673, | |
| "rewards/HPSv2": 0.2731647491455078, | |
| "rewards/ORM": 0.6981207877397537, | |
| "self_certainty_semantic": -25.4375, | |
| "self_certainty_token": -20.625, | |
| "step": 40 | |
| }, | |
| { | |
| "completion_length": 60.3125, | |
| "epoch": 0.04540420819490587, | |
| "grad_norm": 0.39339736104011536, | |
| "kl": 0.001697540283203125, | |
| "learning_rate": 9.743749999999999e-07, | |
| "loss": -0.0026839073980227113, | |
| "reward": 1.926289677619934, | |
| "reward_std": 0.21494604647159576, | |
| "rewards/GDino": 0.6536072194576263, | |
| "rewards/GIT": 0.38067150115966797, | |
| "rewards/HPSv2": 0.2470531463623047, | |
| "rewards/ORM": 0.6449578106403351, | |
| "self_certainty_semantic": -25.375, | |
| "self_certainty_token": -21.0, | |
| "step": 41 | |
| }, | |
| { | |
| "completion_length": 55.734375, | |
| "epoch": 0.046511627906976744, | |
| "grad_norm": 0.43325623869895935, | |
| "kl": 0.001575469970703125, | |
| "learning_rate": 9.7375e-07, | |
| "loss": 0.01566000678576529, | |
| "reward": 2.2492642402648926, | |
| "reward_std": 0.545527771115303, | |
| "rewards/GDino": 0.8451037406921387, | |
| "rewards/GIT": 0.4486817270517349, | |
| "rewards/HPSv2": 0.2523536682128906, | |
| "rewards/ORM": 0.703125, | |
| "self_certainty_semantic": -25.4375, | |
| "self_certainty_token": -21.125, | |
| "step": 42 | |
| }, | |
| { | |
| "completion_length": 77.859375, | |
| "epoch": 0.047619047619047616, | |
| "grad_norm": 0.6008194088935852, | |
| "kl": 0.00209808349609375, | |
| "learning_rate": 9.73125e-07, | |
| "loss": 0.009053934598341584, | |
| "reward": 1.752554178237915, | |
| "reward_std": 0.3711804449558258, | |
| "rewards/GDino": 0.6425288617610931, | |
| "rewards/GIT": 0.38656318187713623, | |
| "rewards/HPSv2": 0.23595809936523438, | |
| "rewards/ORM": 0.4875040054321289, | |
| "self_certainty_semantic": -25.6875, | |
| "self_certainty_token": -21.8125, | |
| "step": 43 | |
| }, | |
| { | |
| "completion_length": 64.859375, | |
| "epoch": 0.048726467331118496, | |
| "grad_norm": 0.4626310169696808, | |
| "kl": 0.001750946044921875, | |
| "learning_rate": 9.725e-07, | |
| "loss": 0.00038470514118671417, | |
| "reward": 2.837794542312622, | |
| "reward_std": 0.3451881557703018, | |
| "rewards/GDino": 0.9479166865348816, | |
| "rewards/GIT": 0.7795328795909882, | |
| "rewards/HPSv2": 0.26932334899902344, | |
| "rewards/ORM": 0.8410216569900513, | |
| "self_certainty_semantic": -25.5625, | |
| "self_certainty_token": -21.5625, | |
| "step": 44 | |
| }, | |
| { | |
| "completion_length": 66.921875, | |
| "epoch": 0.04983388704318937, | |
| "grad_norm": 1.3941670656204224, | |
| "kl": 0.001880645751953125, | |
| "learning_rate": 9.71875e-07, | |
| "loss": -0.012070931028574705, | |
| "reward": 2.561403751373291, | |
| "reward_std": 0.48213036358356476, | |
| "rewards/GDino": 0.9039532244205475, | |
| "rewards/GIT": 0.5467919409275055, | |
| "rewards/HPSv2": 0.2617225646972656, | |
| "rewards/ORM": 0.8489359319210052, | |
| "self_certainty_semantic": -25.5625, | |
| "self_certainty_token": -21.9375, | |
| "step": 45 | |
| }, | |
| { | |
| "completion_length": 59.625, | |
| "epoch": 0.05094130675526024, | |
| "grad_norm": 0.5365378260612488, | |
| "kl": 0.001949310302734375, | |
| "learning_rate": 9.712499999999998e-07, | |
| "loss": 0.01103684725239873, | |
| "reward": 2.0622146129608154, | |
| "reward_std": 0.40072987973690033, | |
| "rewards/GDino": 0.645312488079071, | |
| "rewards/GIT": 0.33725525438785553, | |
| "rewards/HPSv2": 0.2619609832763672, | |
| "rewards/ORM": 0.8176859617233276, | |
| "self_certainty_semantic": -25.25, | |
| "self_certainty_token": -22.3125, | |
| "step": 46 | |
| }, | |
| { | |
| "completion_length": 64.6875, | |
| "epoch": 0.05204872646733112, | |
| "grad_norm": 0.5151812434196472, | |
| "kl": 0.001766204833984375, | |
| "learning_rate": 9.70625e-07, | |
| "loss": -0.004148014355450869, | |
| "reward": 1.7916635870933533, | |
| "reward_std": 0.31147970259189606, | |
| "rewards/GDino": 0.7293796539306641, | |
| "rewards/GIT": 0.20818163454532623, | |
| "rewards/HPSv2": 0.27945709228515625, | |
| "rewards/ORM": 0.5746453106403351, | |
| "self_certainty_semantic": -25.3125, | |
| "self_certainty_token": -21.5625, | |
| "step": 47 | |
| }, | |
| { | |
| "completion_length": 56.25, | |
| "epoch": 0.053156146179401995, | |
| "grad_norm": 0.7559373378753662, | |
| "kl": 0.001861572265625, | |
| "learning_rate": 9.7e-07, | |
| "loss": -0.002030523493885994, | |
| "reward": 1.4302473068237305, | |
| "reward_std": 0.4484506845474243, | |
| "rewards/GDino": 0.6244329512119293, | |
| "rewards/GIT": 0.0, | |
| "rewards/HPSv2": 0.2752876281738281, | |
| "rewards/ORM": 0.5305267572402954, | |
| "self_certainty_semantic": -25.4375, | |
| "self_certainty_token": -20.875, | |
| "step": 48 | |
| }, | |
| { | |
| "completion_length": 61.21875, | |
| "epoch": 0.05426356589147287, | |
| "grad_norm": 0.46310731768608093, | |
| "kl": 0.00177764892578125, | |
| "learning_rate": 9.69375e-07, | |
| "loss": 0.0054672048427164555, | |
| "reward": 1.9361683130264282, | |
| "reward_std": 0.3801421523094177, | |
| "rewards/GDino": 0.7904821038246155, | |
| "rewards/GIT": 0.2458050437271595, | |
| "rewards/HPSv2": 0.25890541076660156, | |
| "rewards/ORM": 0.640975683927536, | |
| "self_certainty_semantic": -25.5, | |
| "self_certainty_token": -21.6875, | |
| "step": 49 | |
| }, | |
| { | |
| "completion_length": 61.921875, | |
| "epoch": 0.05537098560354374, | |
| "grad_norm": 0.5111473798751831, | |
| "kl": 0.002353668212890625, | |
| "learning_rate": 9.6875e-07, | |
| "loss": 0.0035089042503386736, | |
| "reward": 2.212684750556946, | |
| "reward_std": 0.3874351307749748, | |
| "rewards/GDino": 0.7840971350669861, | |
| "rewards/GIT": 0.42198260873556137, | |
| "rewards/HPSv2": 0.25807952880859375, | |
| "rewards/ORM": 0.7485254108905792, | |
| "self_certainty_semantic": -25.4375, | |
| "self_certainty_token": -22.625, | |
| "step": 50 | |
| }, | |
| { | |
| "completion_length": 57.796875, | |
| "epoch": 0.05647840531561462, | |
| "grad_norm": 0.4804292917251587, | |
| "kl": 0.001743316650390625, | |
| "learning_rate": 9.68125e-07, | |
| "loss": -0.0010273723164573312, | |
| "reward": 1.8951371908187866, | |
| "reward_std": 0.5679852366447449, | |
| "rewards/GDino": 0.7922006845474243, | |
| "rewards/GIT": 0.27185457944869995, | |
| "rewards/HPSv2": 0.2777671813964844, | |
| "rewards/ORM": 0.5533146858215332, | |
| "self_certainty_semantic": -25.625, | |
| "self_certainty_token": -22.0, | |
| "step": 51 | |
| }, | |
| { | |
| "completion_length": 62.140625, | |
| "epoch": 0.05758582502768549, | |
| "grad_norm": 0.5876587629318237, | |
| "kl": 0.001842498779296875, | |
| "learning_rate": 9.675e-07, | |
| "loss": 0.010319232940673828, | |
| "reward": 2.453005313873291, | |
| "reward_std": 0.35728050768375397, | |
| "rewards/GDino": 0.917187511920929, | |
| "rewards/GIT": 0.6651300191879272, | |
| "rewards/HPSv2": 0.27350807189941406, | |
| "rewards/ORM": 0.5971797704696655, | |
| "self_certainty_semantic": -25.4375, | |
| "self_certainty_token": -21.0625, | |
| "step": 52 | |
| }, | |
| { | |
| "completion_length": 57.046875, | |
| "epoch": 0.058693244739756366, | |
| "grad_norm": 0.5244357585906982, | |
| "kl": 0.00168609619140625, | |
| "learning_rate": 9.66875e-07, | |
| "loss": 0.0012504801852628589, | |
| "reward": 1.8911731839179993, | |
| "reward_std": 0.3232653737068176, | |
| "rewards/GDino": 0.7297230660915375, | |
| "rewards/GIT": 0.3948078155517578, | |
| "rewards/HPSv2": 0.24039649963378906, | |
| "rewards/ORM": 0.5262457728385925, | |
| "self_certainty_semantic": -25.3125, | |
| "self_certainty_token": -21.25, | |
| "step": 53 | |
| }, | |
| { | |
| "completion_length": 68.921875, | |
| "epoch": 0.059800664451827246, | |
| "grad_norm": 0.5011692047119141, | |
| "kl": 0.0017547607421875, | |
| "learning_rate": 9.6625e-07, | |
| "loss": -0.001990929711610079, | |
| "reward": 1.5346381068229675, | |
| "reward_std": 0.5364750325679779, | |
| "rewards/GDino": 0.5896078050136566, | |
| "rewards/GIT": 0.2611962556838989, | |
| "rewards/HPSv2": 0.24633407592773438, | |
| "rewards/ORM": 0.4375000149011612, | |
| "self_certainty_semantic": -25.5, | |
| "self_certainty_token": -21.5625, | |
| "step": 54 | |
| }, | |
| { | |
| "completion_length": 65.28125, | |
| "epoch": 0.06090808416389812, | |
| "grad_norm": 0.43720903992652893, | |
| "kl": 0.001796722412109375, | |
| "learning_rate": 9.65625e-07, | |
| "loss": 0.011945425532758236, | |
| "reward": 1.7657405734062195, | |
| "reward_std": 0.5052186846733093, | |
| "rewards/GDino": 0.7055748403072357, | |
| "rewards/GIT": 0.3213713690638542, | |
| "rewards/HPSv2": 0.26223182678222656, | |
| "rewards/ORM": 0.4765625, | |
| "self_certainty_semantic": -25.5, | |
| "self_certainty_token": -21.8125, | |
| "step": 55 | |
| }, | |
| { | |
| "completion_length": 72.15625, | |
| "epoch": 0.06201550387596899, | |
| "grad_norm": 0.6576823592185974, | |
| "kl": 0.00201416015625, | |
| "learning_rate": 9.649999999999999e-07, | |
| "loss": 0.010990551207214594, | |
| "reward": 2.0798487663269043, | |
| "reward_std": 0.5881477892398834, | |
| "rewards/GDino": 0.7611979246139526, | |
| "rewards/GIT": 0.38940075039863586, | |
| "rewards/HPSv2": 0.25081634521484375, | |
| "rewards/ORM": 0.678433746099472, | |
| "self_certainty_semantic": -25.125, | |
| "self_certainty_token": -21.8125, | |
| "step": 56 | |
| }, | |
| { | |
| "completion_length": 53.84375, | |
| "epoch": 0.06312292358803986, | |
| "grad_norm": 0.5109694600105286, | |
| "kl": 0.001708984375, | |
| "learning_rate": 9.64375e-07, | |
| "loss": -0.009197955019772053, | |
| "reward": 1.825343132019043, | |
| "reward_std": 0.49610868096351624, | |
| "rewards/GDino": 0.7342002689838409, | |
| "rewards/GIT": 0.27930086851119995, | |
| "rewards/HPSv2": 0.2493419647216797, | |
| "rewards/ORM": 0.5625, | |
| "self_certainty_semantic": -25.6875, | |
| "self_certainty_token": -21.5, | |
| "step": 57 | |
| }, | |
| { | |
| "completion_length": 54.671875, | |
| "epoch": 0.06423034330011074, | |
| "grad_norm": 0.48297855257987976, | |
| "kl": 0.0018157958984375, | |
| "learning_rate": 9.637499999999999e-07, | |
| "loss": -2.7031637728214264e-05, | |
| "reward": 1.9436655044555664, | |
| "reward_std": 0.5841460824012756, | |
| "rewards/GDino": 0.7508301734924316, | |
| "rewards/GIT": 0.36742376536130905, | |
| "rewards/HPSv2": 0.24603271484375, | |
| "rewards/ORM": 0.579378753900528, | |
| "self_certainty_semantic": -25.5625, | |
| "self_certainty_token": -21.6875, | |
| "step": 58 | |
| }, | |
| { | |
| "completion_length": 57.34375, | |
| "epoch": 0.06533776301218161, | |
| "grad_norm": 1.5652471780776978, | |
| "kl": 0.00185394287109375, | |
| "learning_rate": 9.63125e-07, | |
| "loss": -0.0014887296129018068, | |
| "reward": 2.154895305633545, | |
| "reward_std": 0.5548917800188065, | |
| "rewards/GDino": 0.7907229363918304, | |
| "rewards/GIT": 0.44339829683303833, | |
| "rewards/HPSv2": 0.2567615509033203, | |
| "rewards/ORM": 0.664012536406517, | |
| "self_certainty_semantic": -25.5625, | |
| "self_certainty_token": -21.0625, | |
| "step": 59 | |
| }, | |
| { | |
| "completion_length": 52.0625, | |
| "epoch": 0.0664451827242525, | |
| "grad_norm": 0.8647972941398621, | |
| "kl": 0.00200653076171875, | |
| "learning_rate": 9.624999999999999e-07, | |
| "loss": -0.004864218062721193, | |
| "reward": 2.183086931705475, | |
| "reward_std": 0.27265597879886627, | |
| "rewards/GDino": 0.8968750238418579, | |
| "rewards/GIT": 0.4909053146839142, | |
| "rewards/HPSv2": 0.2511100769042969, | |
| "rewards/ORM": 0.544196605682373, | |
| "self_certainty_semantic": -25.25, | |
| "self_certainty_token": -20.8125, | |
| "step": 60 | |
| }, | |
| { | |
| "completion_length": 78.421875, | |
| "epoch": 0.06755260243632337, | |
| "grad_norm": 0.6149311065673828, | |
| "kl": 0.0018310546875, | |
| "learning_rate": 9.61875e-07, | |
| "loss": -0.003399772336706519, | |
| "reward": 2.3938775062561035, | |
| "reward_std": 0.3266971558332443, | |
| "rewards/GDino": 0.7299478650093079, | |
| "rewards/GIT": 0.6572037935256958, | |
| "rewards/HPSv2": 0.26293373107910156, | |
| "rewards/ORM": 0.743791937828064, | |
| "self_certainty_semantic": -25.5, | |
| "self_certainty_token": -20.5, | |
| "step": 61 | |
| }, | |
| { | |
| "completion_length": 71.796875, | |
| "epoch": 0.06866002214839424, | |
| "grad_norm": 0.8106938600540161, | |
| "kl": 0.00188446044921875, | |
| "learning_rate": 9.6125e-07, | |
| "loss": -0.004746791877551004, | |
| "reward": 2.3078866004943848, | |
| "reward_std": 0.4594850391149521, | |
| "rewards/GDino": 0.7886728346347809, | |
| "rewards/GIT": 0.6039779186248779, | |
| "rewards/HPSv2": 0.2555561065673828, | |
| "rewards/ORM": 0.6596797406673431, | |
| "self_certainty_semantic": -25.5625, | |
| "self_certainty_token": -21.125, | |
| "step": 62 | |
| }, | |
| { | |
| "completion_length": 57.703125, | |
| "epoch": 0.06976744186046512, | |
| "grad_norm": 0.5699672102928162, | |
| "kl": 0.00218963623046875, | |
| "learning_rate": 9.606249999999998e-07, | |
| "loss": 0.005022911122068763, | |
| "reward": 2.2111340165138245, | |
| "reward_std": 0.6219878196716309, | |
| "rewards/GDino": 0.794545441865921, | |
| "rewards/GIT": 0.45049863308668137, | |
| "rewards/HPSv2": 0.24386024475097656, | |
| "rewards/ORM": 0.7222297191619873, | |
| "self_certainty_semantic": -25.25, | |
| "self_certainty_token": -21.9375, | |
| "step": 63 | |
| }, | |
| { | |
| "completion_length": 78.453125, | |
| "epoch": 0.07087486157253599, | |
| "grad_norm": 0.7573527693748474, | |
| "kl": 0.0022125244140625, | |
| "learning_rate": 9.6e-07, | |
| "loss": 0.013895762618631124, | |
| "reward": 1.6789215207099915, | |
| "reward_std": 0.15597553551197052, | |
| "rewards/GDino": 0.7209441661834717, | |
| "rewards/GIT": 0.31718890368938446, | |
| "rewards/HPSv2": 0.26105499267578125, | |
| "rewards/ORM": 0.37973345816135406, | |
| "self_certainty_semantic": -25.5625, | |
| "self_certainty_token": -21.5625, | |
| "step": 64 | |
| }, | |
| { | |
| "completion_length": 63.59375, | |
| "epoch": 0.07198228128460686, | |
| "grad_norm": 0.4424923360347748, | |
| "kl": 0.0020599365234375, | |
| "learning_rate": 9.59375e-07, | |
| "loss": 0.0005846736021339893, | |
| "reward": 2.195925712585449, | |
| "reward_std": 0.5788445174694061, | |
| "rewards/GDino": 0.7169270515441895, | |
| "rewards/GIT": 0.6367218196392059, | |
| "rewards/HPSv2": 0.2345561981201172, | |
| "rewards/ORM": 0.6077205836772919, | |
| "self_certainty_semantic": -25.4375, | |
| "self_certainty_token": -21.75, | |
| "step": 65 | |
| }, | |
| { | |
| "completion_length": 67.6875, | |
| "epoch": 0.07308970099667775, | |
| "grad_norm": 0.5050013661384583, | |
| "kl": 0.00211334228515625, | |
| "learning_rate": 9.5875e-07, | |
| "loss": 0.010172993643209338, | |
| "reward": 2.220258355140686, | |
| "reward_std": 0.30588236451148987, | |
| "rewards/GDino": 0.7442708909511566, | |
| "rewards/GIT": 0.47482602298259735, | |
| "rewards/HPSv2": 0.25937461853027344, | |
| "rewards/ORM": 0.7417868673801422, | |
| "self_certainty_semantic": -25.375, | |
| "self_certainty_token": -21.5, | |
| "step": 66 | |
| }, | |
| { | |
| "completion_length": 72.75, | |
| "epoch": 0.07419712070874862, | |
| "grad_norm": 0.47647950053215027, | |
| "kl": 0.001953125, | |
| "learning_rate": 9.58125e-07, | |
| "loss": 0.002580178901553154, | |
| "reward": 2.3537763357162476, | |
| "reward_std": 0.2857324182987213, | |
| "rewards/GDino": 0.852263331413269, | |
| "rewards/GIT": 0.5637244433164597, | |
| "rewards/HPSv2": 0.2550220489501953, | |
| "rewards/ORM": 0.6827665567398071, | |
| "self_certainty_semantic": -25.4375, | |
| "self_certainty_token": -21.4375, | |
| "step": 67 | |
| }, | |
| { | |
| "completion_length": 60.109375, | |
| "epoch": 0.0753045404208195, | |
| "grad_norm": 0.45224544405937195, | |
| "kl": 0.0021209716796875, | |
| "learning_rate": 9.575e-07, | |
| "loss": 0.002825574716553092, | |
| "reward": 1.613221287727356, | |
| "reward_std": 0.332104429602623, | |
| "rewards/GDino": 0.6193348169326782, | |
| "rewards/GIT": 0.2909398823976517, | |
| "rewards/HPSv2": 0.2551765441894531, | |
| "rewards/ORM": 0.4477700889110565, | |
| "self_certainty_semantic": -25.25, | |
| "self_certainty_token": -21.0625, | |
| "step": 68 | |
| }, | |
| { | |
| "completion_length": 72.6875, | |
| "epoch": 0.07641196013289037, | |
| "grad_norm": 0.688894510269165, | |
| "kl": 0.002315521240234375, | |
| "learning_rate": 9.56875e-07, | |
| "loss": 0.012800770811736584, | |
| "reward": 2.1092969179153442, | |
| "reward_std": 0.36874186992645264, | |
| "rewards/GDino": 0.8054687678813934, | |
| "rewards/GIT": 0.3866874873638153, | |
| "rewards/HPSv2": 0.26236534118652344, | |
| "rewards/ORM": 0.6547753810882568, | |
| "self_certainty_semantic": -25.625, | |
| "self_certainty_token": -21.5, | |
| "step": 69 | |
| }, | |
| { | |
| "completion_length": 60.640625, | |
| "epoch": 0.07751937984496124, | |
| "grad_norm": 0.45330390334129333, | |
| "kl": 0.00215911865234375, | |
| "learning_rate": 9.5625e-07, | |
| "loss": -0.0010713667143136263, | |
| "reward": 1.552397072315216, | |
| "reward_std": 0.39455118775367737, | |
| "rewards/GDino": 0.6554375886917114, | |
| "rewards/GIT": 0.22663478553295135, | |
| "rewards/HPSv2": 0.2546577453613281, | |
| "rewards/ORM": 0.41566696763038635, | |
| "self_certainty_semantic": -25.25, | |
| "self_certainty_token": -20.75, | |
| "step": 70 | |
| }, | |
| { | |
| "completion_length": 76.515625, | |
| "epoch": 0.07862679955703211, | |
| "grad_norm": 0.5808414220809937, | |
| "kl": 0.00222015380859375, | |
| "learning_rate": 9.556249999999999e-07, | |
| "loss": 0.0038980550598353148, | |
| "reward": 1.9476300477981567, | |
| "reward_std": 0.38603267073631287, | |
| "rewards/GDino": 0.7262610197067261, | |
| "rewards/GIT": 0.30087296664714813, | |
| "rewards/HPSv2": 0.26424598693847656, | |
| "rewards/ORM": 0.6562500149011612, | |
| "self_certainty_semantic": -25.4375, | |
| "self_certainty_token": -19.9375, | |
| "step": 71 | |
| }, | |
| { | |
| "completion_length": 57.15625, | |
| "epoch": 0.07973421926910298, | |
| "grad_norm": 0.3693688213825226, | |
| "kl": 0.00208282470703125, | |
| "learning_rate": 9.55e-07, | |
| "loss": -0.00035159417893737555, | |
| "reward": 1.9391373991966248, | |
| "reward_std": 0.3963821530342102, | |
| "rewards/GDino": 0.6879567801952362, | |
| "rewards/GIT": 0.4622843265533447, | |
| "rewards/HPSv2": 0.24675464630126953, | |
| "rewards/ORM": 0.5421415567398071, | |
| "self_certainty_semantic": -25.0625, | |
| "self_certainty_token": -20.9375, | |
| "step": 72 | |
| }, | |
| { | |
| "completion_length": 66.65625, | |
| "epoch": 0.08084163898117387, | |
| "grad_norm": 0.6215986013412476, | |
| "kl": 0.0024871826171875, | |
| "learning_rate": 9.543749999999999e-07, | |
| "loss": 0.003838272183202207, | |
| "reward": 2.1008963584899902, | |
| "reward_std": 0.4600249230861664, | |
| "rewards/GDino": 0.8240202069282532, | |
| "rewards/GIT": 0.48449917137622833, | |
| "rewards/HPSv2": 0.24818038940429688, | |
| "rewards/ORM": 0.5441965609788895, | |
| "self_certainty_semantic": -25.4375, | |
| "self_certainty_token": -21.5, | |
| "step": 73 | |
| }, | |
| { | |
| "completion_length": 60.859375, | |
| "epoch": 0.08194905869324474, | |
| "grad_norm": 0.43593713641166687, | |
| "kl": 0.0030364990234375, | |
| "learning_rate": 9.5375e-07, | |
| "loss": 0.002844013855792582, | |
| "reward": 2.297879934310913, | |
| "reward_std": 0.2846696451306343, | |
| "rewards/GDino": 0.84375, | |
| "rewards/GIT": 0.5265894532203674, | |
| "rewards/HPSv2": 0.2544116973876953, | |
| "rewards/ORM": 0.6731287837028503, | |
| "self_certainty_semantic": -25.4375, | |
| "self_certainty_token": -21.75, | |
| "step": 74 | |
| }, | |
| { | |
| "completion_length": 68.703125, | |
| "epoch": 0.08305647840531562, | |
| "grad_norm": 0.48668116331100464, | |
| "kl": 0.002227783203125, | |
| "learning_rate": 9.53125e-07, | |
| "loss": -0.0021062323357909918, | |
| "reward": 1.7519539594650269, | |
| "reward_std": 0.3109753131866455, | |
| "rewards/GDino": 0.6498888432979584, | |
| "rewards/GIT": 0.2745012864470482, | |
| "rewards/HPSv2": 0.26706886291503906, | |
| "rewards/ORM": 0.5604948848485947, | |
| "self_certainty_semantic": -25.5, | |
| "self_certainty_token": -20.625, | |
| "step": 75 | |
| }, | |
| { | |
| "completion_length": 70.25, | |
| "epoch": 0.08416389811738649, | |
| "grad_norm": 0.5122522711753845, | |
| "kl": 0.00208282470703125, | |
| "learning_rate": 9.525e-07, | |
| "loss": -0.00045439647510647774, | |
| "reward": 2.371267318725586, | |
| "reward_std": 0.4085633456707001, | |
| "rewards/GDino": 0.8135416805744171, | |
| "rewards/GIT": 0.6540948301553726, | |
| "rewards/HPSv2": 0.2650108337402344, | |
| "rewards/ORM": 0.6386198997497559, | |
| "self_certainty_semantic": -25.6875, | |
| "self_certainty_token": -20.75, | |
| "step": 76 | |
| }, | |
| { | |
| "completion_length": 62.875, | |
| "epoch": 0.08527131782945736, | |
| "grad_norm": 0.505736768245697, | |
| "kl": 0.0037689208984375, | |
| "learning_rate": 9.51875e-07, | |
| "loss": -0.006699402409140021, | |
| "reward": 1.5121636986732483, | |
| "reward_std": 0.5349836349487305, | |
| "rewards/GDino": 0.616510659456253, | |
| "rewards/GIT": 0.18113864213228226, | |
| "rewards/HPSv2": 0.228485107421875, | |
| "rewards/ORM": 0.48602940142154694, | |
| "self_certainty_semantic": -25.125, | |
| "self_certainty_token": -21.875, | |
| "step": 77 | |
| }, | |
| { | |
| "completion_length": 65.8125, | |
| "epoch": 0.08637873754152824, | |
| "grad_norm": 0.4759610593318939, | |
| "kl": 0.0022735595703125, | |
| "learning_rate": 9.5125e-07, | |
| "loss": 0.0014968996401876211, | |
| "reward": 1.9482250213623047, | |
| "reward_std": 0.38150524348020554, | |
| "rewards/GDino": 0.7646995186805725, | |
| "rewards/GIT": 0.31973105669021606, | |
| "rewards/HPSv2": 0.2705249786376953, | |
| "rewards/ORM": 0.5932694524526596, | |
| "self_certainty_semantic": -25.5, | |
| "self_certainty_token": -21.125, | |
| "step": 78 | |
| }, | |
| { | |
| "completion_length": 72.609375, | |
| "epoch": 0.08748615725359911, | |
| "grad_norm": 0.4961722195148468, | |
| "kl": 0.00247955322265625, | |
| "learning_rate": 9.50625e-07, | |
| "loss": 0.00820195721462369, | |
| "reward": 2.2431598901748657, | |
| "reward_std": 0.19805177673697472, | |
| "rewards/GDino": 0.8183182775974274, | |
| "rewards/GIT": 0.60882468521595, | |
| "rewards/HPSv2": 0.2628040313720703, | |
| "rewards/ORM": 0.5532128810882568, | |
| "self_certainty_semantic": -25.4375, | |
| "self_certainty_token": -21.5625, | |
| "step": 79 | |
| }, | |
| { | |
| "completion_length": 66.0625, | |
| "epoch": 0.08859357696567, | |
| "grad_norm": 0.5290701389312744, | |
| "kl": 0.00308990478515625, | |
| "learning_rate": 9.499999999999999e-07, | |
| "loss": -0.001018086913973093, | |
| "reward": 1.7054139375686646, | |
| "reward_std": 0.4478110671043396, | |
| "rewards/GDino": 0.6419965624809265, | |
| "rewards/GIT": 0.19029075652360916, | |
| "rewards/HPSv2": 0.2727680206298828, | |
| "rewards/ORM": 0.6003586649894714, | |
| "self_certainty_semantic": -25.4375, | |
| "self_certainty_token": -21.5, | |
| "step": 80 | |
| }, | |
| { | |
| "completion_length": 69.75, | |
| "epoch": 0.08970099667774087, | |
| "grad_norm": 0.530961811542511, | |
| "kl": 0.00331878662109375, | |
| "learning_rate": 9.493749999999999e-07, | |
| "loss": -0.0018104221671819687, | |
| "reward": 2.1294270157814026, | |
| "reward_std": 0.30140096694231033, | |
| "rewards/GDino": 0.7601194977760315, | |
| "rewards/GIT": 0.36138176918029785, | |
| "rewards/HPSv2": 0.27007102966308594, | |
| "rewards/ORM": 0.7378547042608261, | |
| "self_certainty_semantic": -25.5625, | |
| "self_certainty_token": -20.25, | |
| "step": 81 | |
| }, | |
| { | |
| "completion_length": 62.25, | |
| "epoch": 0.09080841638981174, | |
| "grad_norm": 0.5380280017852783, | |
| "kl": 0.0029449462890625, | |
| "learning_rate": 9.487499999999999e-07, | |
| "loss": 0.0027263425290584564, | |
| "reward": 1.7531540989875793, | |
| "reward_std": 0.40144187211990356, | |
| "rewards/GDino": 0.6388830840587616, | |
| "rewards/GIT": 0.3787819594144821, | |
| "rewards/HPSv2": 0.26526451110839844, | |
| "rewards/ORM": 0.4702245742082596, | |
| "self_certainty_semantic": -25.625, | |
| "self_certainty_token": -21.3125, | |
| "step": 82 | |
| }, | |
| { | |
| "completion_length": 57.125, | |
| "epoch": 0.09191583610188261, | |
| "grad_norm": 0.46656447649002075, | |
| "kl": 0.00229644775390625, | |
| "learning_rate": 9.481249999999999e-07, | |
| "loss": 0.0034079640172421932, | |
| "reward": 2.1076533794403076, | |
| "reward_std": 0.3496774584054947, | |
| "rewards/GDino": 0.8086712956428528, | |
| "rewards/GIT": 0.44665491580963135, | |
| "rewards/HPSv2": 0.2527198791503906, | |
| "rewards/ORM": 0.5996073186397552, | |
| "self_certainty_semantic": -25.5, | |
| "self_certainty_token": -21.875, | |
| "step": 83 | |
| }, | |
| { | |
| "completion_length": 77.609375, | |
| "epoch": 0.09302325581395349, | |
| "grad_norm": 0.7098491787910461, | |
| "kl": 0.003326416015625, | |
| "learning_rate": 9.474999999999999e-07, | |
| "loss": -0.015582434833049774, | |
| "reward": 2.0792417526245117, | |
| "reward_std": 0.405472531914711, | |
| "rewards/GDino": 0.8217203617095947, | |
| "rewards/GIT": 0.6337592005729675, | |
| "rewards/HPSv2": 0.2409496307373047, | |
| "rewards/ORM": 0.3828125, | |
| "self_certainty_semantic": -25.4375, | |
| "self_certainty_token": -21.25, | |
| "step": 84 | |
| }, | |
| { | |
| "completion_length": 70.0, | |
| "epoch": 0.09413067552602436, | |
| "grad_norm": 0.453952431678772, | |
| "kl": 0.0030059814453125, | |
| "learning_rate": 9.468749999999999e-07, | |
| "loss": -0.008341801585629582, | |
| "reward": 1.7731398940086365, | |
| "reward_std": 0.43146421015262604, | |
| "rewards/GDino": 0.6217962503433228, | |
| "rewards/GIT": 0.33136892318725586, | |
| "rewards/HPSv2": 0.2414989471435547, | |
| "rewards/ORM": 0.5784757435321808, | |
| "self_certainty_semantic": -25.375, | |
| "self_certainty_token": -21.8125, | |
| "step": 85 | |
| }, | |
| { | |
| "completion_length": 55.46875, | |
| "epoch": 0.09523809523809523, | |
| "grad_norm": 0.6065813302993774, | |
| "kl": 0.0029296875, | |
| "learning_rate": 9.462499999999999e-07, | |
| "loss": -0.004339609295129776, | |
| "reward": 2.3409087657928467, | |
| "reward_std": 0.33414456248283386, | |
| "rewards/GDino": 0.843651682138443, | |
| "rewards/GIT": 0.3478253483772278, | |
| "rewards/HPSv2": 0.2929649353027344, | |
| "rewards/ORM": 0.8564667999744415, | |
| "self_certainty_semantic": -25.6875, | |
| "self_certainty_token": -21.6875, | |
| "step": 86 | |
| }, | |
| { | |
| "completion_length": 71.796875, | |
| "epoch": 0.09634551495016612, | |
| "grad_norm": 0.6815423965454102, | |
| "kl": 0.0028076171875, | |
| "learning_rate": 9.45625e-07, | |
| "loss": 0.004890406038612127, | |
| "reward": 2.096968352794647, | |
| "reward_std": 0.4522961378097534, | |
| "rewards/GDino": 0.7090134918689728, | |
| "rewards/GIT": 0.4619881361722946, | |
| "rewards/HPSv2": 0.26172447204589844, | |
| "rewards/ORM": 0.6642423272132874, | |
| "self_certainty_semantic": -25.375, | |
| "self_certainty_token": -21.3125, | |
| "step": 87 | |
| }, | |
| { | |
| "completion_length": 62.921875, | |
| "epoch": 0.09745293466223699, | |
| "grad_norm": 0.37047135829925537, | |
| "kl": 0.00237274169921875, | |
| "learning_rate": 9.45e-07, | |
| "loss": -0.007989626843482256, | |
| "reward": 2.100303888320923, | |
| "reward_std": 0.39728429913520813, | |
| "rewards/GDino": 0.8100375235080719, | |
| "rewards/GIT": 0.4551214128732681, | |
| "rewards/HPSv2": 0.2669391632080078, | |
| "rewards/ORM": 0.5682056248188019, | |
| "self_certainty_semantic": -25.5, | |
| "self_certainty_token": -21.625, | |
| "step": 88 | |
| }, | |
| { | |
| "completion_length": 61.25, | |
| "epoch": 0.09856035437430787, | |
| "grad_norm": 0.3903006613254547, | |
| "kl": 0.0033111572265625, | |
| "learning_rate": 9.44375e-07, | |
| "loss": -0.0016460134647786617, | |
| "reward": 2.1185483932495117, | |
| "reward_std": 0.34406720101833344, | |
| "rewards/GDino": 0.7301153540611267, | |
| "rewards/GIT": 0.4342738687992096, | |
| "rewards/HPSv2": 0.25724220275878906, | |
| "rewards/ORM": 0.6969169676303864, | |
| "self_certainty_semantic": -25.625, | |
| "self_certainty_token": -21.0625, | |
| "step": 89 | |
| }, | |
| { | |
| "completion_length": 64.734375, | |
| "epoch": 0.09966777408637874, | |
| "grad_norm": 0.6106704473495483, | |
| "kl": 0.002532958984375, | |
| "learning_rate": 9.4375e-07, | |
| "loss": 0.0018994538113474846, | |
| "reward": 2.281058669090271, | |
| "reward_std": 0.4019897133111954, | |
| "rewards/GDino": 0.8515625298023224, | |
| "rewards/GIT": 0.602006196975708, | |
| "rewards/HPSv2": 0.2570476531982422, | |
| "rewards/ORM": 0.5704423487186432, | |
| "self_certainty_semantic": -25.625, | |
| "self_certainty_token": -20.875, | |
| "step": 90 | |
| }, | |
| { | |
| "completion_length": 70.625, | |
| "epoch": 0.10077519379844961, | |
| "grad_norm": 0.6082563996315002, | |
| "kl": 0.0025634765625, | |
| "learning_rate": 9.43125e-07, | |
| "loss": -0.001378488726913929, | |
| "reward": 1.7446696758270264, | |
| "reward_std": 0.48222504556179047, | |
| "rewards/GDino": 0.6369770467281342, | |
| "rewards/GIT": 0.4495050609111786, | |
| "rewards/HPSv2": 0.2379169464111328, | |
| "rewards/ORM": 0.42027057707309723, | |
| "self_certainty_semantic": -25.5, | |
| "self_certainty_token": -21.25, | |
| "step": 91 | |
| }, | |
| { | |
| "completion_length": 69.328125, | |
| "epoch": 0.10188261351052048, | |
| "grad_norm": 0.3885723054409027, | |
| "kl": 0.00247955322265625, | |
| "learning_rate": 9.425e-07, | |
| "loss": 0.0029599489644169807, | |
| "reward": 1.6940485835075378, | |
| "reward_std": 0.48791858553886414, | |
| "rewards/GDino": 0.7451692521572113, | |
| "rewards/GIT": 0.3888908475637436, | |
| "rewards/HPSv2": 0.23882293701171875, | |
| "rewards/ORM": 0.32116562128067017, | |
| "self_certainty_semantic": -25.5, | |
| "self_certainty_token": -20.9375, | |
| "step": 92 | |
| }, | |
| { | |
| "completion_length": 78.96875, | |
| "epoch": 0.10299003322259136, | |
| "grad_norm": 2.441729784011841, | |
| "kl": 0.00281524658203125, | |
| "learning_rate": 9.41875e-07, | |
| "loss": 0.0027102059684693813, | |
| "reward": 2.098644495010376, | |
| "reward_std": 0.5861929953098297, | |
| "rewards/GDino": 0.7753971815109253, | |
| "rewards/GIT": 0.33432240784168243, | |
| "rewards/HPSv2": 0.24440956115722656, | |
| "rewards/ORM": 0.7445152401924133, | |
| "self_certainty_semantic": -25.375, | |
| "self_certainty_token": -21.3125, | |
| "step": 93 | |
| }, | |
| { | |
| "completion_length": 53.640625, | |
| "epoch": 0.10409745293466224, | |
| "grad_norm": 1.843809962272644, | |
| "kl": 0.00298309326171875, | |
| "learning_rate": 9.4125e-07, | |
| "loss": -0.002976842690259218, | |
| "reward": 2.022274136543274, | |
| "reward_std": 0.3149227201938629, | |
| "rewards/GDino": 0.7854060530662537, | |
| "rewards/GIT": 0.20830318331718445, | |
| "rewards/HPSv2": 0.2829475402832031, | |
| "rewards/ORM": 0.7456172108650208, | |
| "self_certainty_semantic": -25.5625, | |
| "self_certainty_token": -21.4375, | |
| "step": 94 | |
| }, | |
| { | |
| "completion_length": 73.8125, | |
| "epoch": 0.10520487264673312, | |
| "grad_norm": 0.4806905686855316, | |
| "kl": 0.0027923583984375, | |
| "learning_rate": 9.40625e-07, | |
| "loss": 0.0057201930321753025, | |
| "reward": 2.5528862476348877, | |
| "reward_std": 0.3981771767139435, | |
| "rewards/GDino": 0.9458979666233063, | |
| "rewards/GIT": 0.7319882810115814, | |
| "rewards/HPSv2": 0.265625, | |
| "rewards/ORM": 0.609375, | |
| "self_certainty_semantic": -25.625, | |
| "self_certainty_token": -22.375, | |
| "step": 95 | |
| }, | |
| { | |
| "completion_length": 71.578125, | |
| "epoch": 0.10631229235880399, | |
| "grad_norm": 1.3328330516815186, | |
| "kl": 0.00286865234375, | |
| "learning_rate": 9.399999999999999e-07, | |
| "loss": 0.006992874434217811, | |
| "reward": 2.4351861476898193, | |
| "reward_std": 0.25794728100299835, | |
| "rewards/GDino": 0.9020833373069763, | |
| "rewards/GIT": 0.6907803118228912, | |
| "rewards/HPSv2": 0.2606678009033203, | |
| "rewards/ORM": 0.5816546380519867, | |
| "self_certainty_semantic": -25.6875, | |
| "self_certainty_token": -20.125, | |
| "step": 96 | |
| }, | |
| { | |
| "completion_length": 60.703125, | |
| "epoch": 0.10741971207087486, | |
| "grad_norm": 0.5019268989562988, | |
| "kl": 0.003326416015625, | |
| "learning_rate": 9.393749999999999e-07, | |
| "loss": 0.011835527839139104, | |
| "reward": 1.6200063824653625, | |
| "reward_std": 0.4240207076072693, | |
| "rewards/GDino": 0.6504360437393188, | |
| "rewards/GIT": 0.18544349074363708, | |
| "rewards/HPSv2": 0.2720832824707031, | |
| "rewards/ORM": 0.5120435357093811, | |
| "self_certainty_semantic": -25.375, | |
| "self_certainty_token": -21.4375, | |
| "step": 97 | |
| }, | |
| { | |
| "completion_length": 68.578125, | |
| "epoch": 0.10852713178294573, | |
| "grad_norm": 0.38334423303604126, | |
| "kl": 0.003143310546875, | |
| "learning_rate": 9.387499999999999e-07, | |
| "loss": 0.0015034456737339497, | |
| "reward": 1.9381686449050903, | |
| "reward_std": 0.46784070134162903, | |
| "rewards/GDino": 0.7850436270236969, | |
| "rewards/GIT": 0.3971538841724396, | |
| "rewards/HPSv2": 0.2517681121826172, | |
| "rewards/ORM": 0.5042029470205307, | |
| "self_certainty_semantic": -25.4375, | |
| "self_certainty_token": -22.0, | |
| "step": 98 | |
| }, | |
| { | |
| "completion_length": 72.234375, | |
| "epoch": 0.10963455149501661, | |
| "grad_norm": 1.5332801342010498, | |
| "kl": 0.0026702880859375, | |
| "learning_rate": 9.381249999999999e-07, | |
| "loss": 0.0014210238587111235, | |
| "reward": 2.1606199741363525, | |
| "reward_std": 0.4609396979212761, | |
| "rewards/GDino": 0.800000011920929, | |
| "rewards/GIT": 0.6965132355690002, | |
| "rewards/HPSv2": 0.2425823211669922, | |
| "rewards/ORM": 0.4215243309736252, | |
| "self_certainty_semantic": -25.625, | |
| "self_certainty_token": -22.125, | |
| "step": 99 | |
| }, | |
| { | |
| "completion_length": 64.859375, | |
| "epoch": 0.11074197120708748, | |
| "grad_norm": 0.4810887575149536, | |
| "kl": 0.0039520263671875, | |
| "learning_rate": 9.374999999999999e-07, | |
| "loss": -0.006660776911303401, | |
| "reward": 2.0300318002700806, | |
| "reward_std": 0.49300554394721985, | |
| "rewards/GDino": 0.6639764606952667, | |
| "rewards/GIT": 0.41904042661190033, | |
| "rewards/HPSv2": 0.25483131408691406, | |
| "rewards/ORM": 0.6921834945678711, | |
| "self_certainty_semantic": -25.5625, | |
| "self_certainty_token": -21.5, | |
| "step": 100 | |
| }, | |
| { | |
| "completion_length": 59.671875, | |
| "epoch": 0.11184939091915837, | |
| "grad_norm": 0.6347000002861023, | |
| "kl": 0.0032196044921875, | |
| "learning_rate": 9.368749999999999e-07, | |
| "loss": 0.007826576009392738, | |
| "reward": 2.343237042427063, | |
| "reward_std": 0.29696404933929443, | |
| "rewards/GDino": 0.8815763592720032, | |
| "rewards/GIT": 0.5084297135472298, | |
| "rewards/HPSv2": 0.27715301513671875, | |
| "rewards/ORM": 0.6760779917240143, | |
| "self_certainty_semantic": -25.8125, | |
| "self_certainty_token": -20.4375, | |
| "step": 101 | |
| }, | |
| { | |
| "completion_length": 54.6875, | |
| "epoch": 0.11295681063122924, | |
| "grad_norm": 0.433162659406662, | |
| "kl": 0.00323486328125, | |
| "learning_rate": 9.3625e-07, | |
| "loss": -0.0018342176917940378, | |
| "reward": 2.244241714477539, | |
| "reward_std": 0.3847181349992752, | |
| "rewards/GDino": 0.7636502981185913, | |
| "rewards/GIT": 0.5041892230510712, | |
| "rewards/HPSv2": 0.26613616943359375, | |
| "rewards/ORM": 0.7102660238742828, | |
| "self_certainty_semantic": -25.375, | |
| "self_certainty_token": -21.75, | |
| "step": 102 | |
| }, | |
| { | |
| "completion_length": 67.0, | |
| "epoch": 0.11406423034330011, | |
| "grad_norm": 0.4709942042827606, | |
| "kl": 0.0036163330078125, | |
| "learning_rate": 9.35625e-07, | |
| "loss": -0.0053715279791504145, | |
| "reward": 1.7866063117980957, | |
| "reward_std": 0.48569220304489136, | |
| "rewards/GDino": 0.6912583708763123, | |
| "rewards/GIT": 0.2119271606206894, | |
| "rewards/HPSv2": 0.26636314392089844, | |
| "rewards/ORM": 0.6170576214790344, | |
| "self_certainty_semantic": -25.4375, | |
| "self_certainty_token": -21.6875, | |
| "step": 103 | |
| }, | |
| { | |
| "completion_length": 72.9375, | |
| "epoch": 0.11517165005537099, | |
| "grad_norm": 0.4063447415828705, | |
| "kl": 0.00260162353515625, | |
| "learning_rate": 9.35e-07, | |
| "loss": 0.002629161812365055, | |
| "reward": 2.2642691135406494, | |
| "reward_std": 0.34077706933021545, | |
| "rewards/GDino": 0.83519247174263, | |
| "rewards/GIT": 0.5088042318820953, | |
| "rewards/HPSv2": 0.2578144073486328, | |
| "rewards/ORM": 0.6624580323696136, | |
| "self_certainty_semantic": -25.5625, | |
| "self_certainty_token": -22.25, | |
| "step": 104 | |
| }, | |
| { | |
| "completion_length": 60.078125, | |
| "epoch": 0.11627906976744186, | |
| "grad_norm": 0.46488699316978455, | |
| "kl": 0.002288818359375, | |
| "learning_rate": 9.34375e-07, | |
| "loss": -0.003600445226766169, | |
| "reward": 2.1485623121261597, | |
| "reward_std": 0.4569554626941681, | |
| "rewards/GDino": 0.7578125, | |
| "rewards/GIT": 0.5468153655529022, | |
| "rewards/HPSv2": 0.2572956085205078, | |
| "rewards/ORM": 0.586638867855072, | |
| "self_certainty_semantic": -25.5, | |
| "self_certainty_token": -21.0, | |
| "step": 105 | |
| }, | |
| { | |
| "completion_length": 82.796875, | |
| "epoch": 0.11738648947951273, | |
| "grad_norm": 0.6562625765800476, | |
| "kl": 0.00269317626953125, | |
| "learning_rate": 9.3375e-07, | |
| "loss": 0.006768202409148216, | |
| "reward": 1.9783158898353577, | |
| "reward_std": 0.1888652741909027, | |
| "rewards/GDino": 0.7153646051883698, | |
| "rewards/GIT": 0.5914923697710037, | |
| "rewards/HPSv2": 0.2652587890625, | |
| "rewards/ORM": 0.4062000662088394, | |
| "self_certainty_semantic": -25.5, | |
| "self_certainty_token": -21.0625, | |
| "step": 106 | |
| }, | |
| { | |
| "completion_length": 65.59375, | |
| "epoch": 0.1184939091915836, | |
| "grad_norm": 0.45307597517967224, | |
| "kl": 0.003082275390625, | |
| "learning_rate": 9.33125e-07, | |
| "loss": 0.004376767203211784, | |
| "reward": 2.5454152822494507, | |
| "reward_std": 0.3043108731508255, | |
| "rewards/GDino": 0.9536458253860474, | |
| "rewards/GIT": 0.7616239190101624, | |
| "rewards/HPSv2": 0.25897979736328125, | |
| "rewards/ORM": 0.5711656212806702, | |
| "self_certainty_semantic": -25.5, | |
| "self_certainty_token": -20.6875, | |
| "step": 107 | |
| }, | |
| { | |
| "completion_length": 61.734375, | |
| "epoch": 0.11960132890365449, | |
| "grad_norm": 0.41155651211738586, | |
| "kl": 0.0034942626953125, | |
| "learning_rate": 9.325e-07, | |
| "loss": 0.00791933387517929, | |
| "reward": 2.225056529045105, | |
| "reward_std": 0.2606152221560478, | |
| "rewards/GDino": 0.7756550312042236, | |
| "rewards/GIT": 0.44980524480342865, | |
| "rewards/HPSv2": 0.2855796813964844, | |
| "rewards/ORM": 0.7140165567398071, | |
| "self_certainty_semantic": -25.5625, | |
| "self_certainty_token": -20.875, | |
| "step": 108 | |
| }, | |
| { | |
| "completion_length": 62.3125, | |
| "epoch": 0.12070874861572536, | |
| "grad_norm": 0.5856253504753113, | |
| "kl": 0.00328826904296875, | |
| "learning_rate": 9.31875e-07, | |
| "loss": -0.014065259601920843, | |
| "reward": 2.116065502166748, | |
| "reward_std": 0.42074093222618103, | |
| "rewards/GDino": 0.8158511817455292, | |
| "rewards/GIT": 0.5546791851520538, | |
| "rewards/HPSv2": 0.26972389221191406, | |
| "rewards/ORM": 0.4758111536502838, | |
| "self_certainty_semantic": -25.625, | |
| "self_certainty_token": -22.0625, | |
| "step": 109 | |
| }, | |
| { | |
| "completion_length": 53.6875, | |
| "epoch": 0.12181616832779624, | |
| "grad_norm": 0.47900426387786865, | |
| "kl": 0.00299835205078125, | |
| "learning_rate": 9.3125e-07, | |
| "loss": 0.004598683924996294, | |
| "reward": 2.2211345434188843, | |
| "reward_std": 0.4559909552335739, | |
| "rewards/GDino": 0.843098521232605, | |
| "rewards/GIT": 0.39484143257141113, | |
| "rewards/HPSv2": 0.23913192749023438, | |
| "rewards/ORM": 0.7440627217292786, | |
| "self_certainty_semantic": -25.4375, | |
| "self_certainty_token": -22.0625, | |
| "step": 110 | |
| }, | |
| { | |
| "completion_length": 62.5625, | |
| "epoch": 0.12292358803986711, | |
| "grad_norm": 0.5505498051643372, | |
| "kl": 0.00334930419921875, | |
| "learning_rate": 9.30625e-07, | |
| "loss": -0.009575113654136658, | |
| "reward": 1.8931084871292114, | |
| "reward_std": 0.3895595818758011, | |
| "rewards/GDino": 0.6988297700881958, | |
| "rewards/GIT": 0.34851039946079254, | |
| "rewards/HPSv2": 0.2725067138671875, | |
| "rewards/ORM": 0.5732617080211639, | |
| "self_certainty_semantic": -25.4375, | |
| "self_certainty_token": -22.1875, | |
| "step": 111 | |
| }, | |
| { | |
| "completion_length": 66.125, | |
| "epoch": 0.12403100775193798, | |
| "grad_norm": 0.5518302321434021, | |
| "kl": 0.0044097900390625, | |
| "learning_rate": 9.3e-07, | |
| "loss": 0.001083985436707735, | |
| "reward": 2.1159579753875732, | |
| "reward_std": 0.3097255080938339, | |
| "rewards/GDino": 0.7588914632797241, | |
| "rewards/GIT": 0.3177434876561165, | |
| "rewards/HPSv2": 0.2764263153076172, | |
| "rewards/ORM": 0.7628966867923737, | |
| "self_certainty_semantic": -25.5625, | |
| "self_certainty_token": -22.125, | |
| "step": 112 | |
| }, | |
| { | |
| "completion_length": 57.203125, | |
| "epoch": 0.12513842746400886, | |
| "grad_norm": 0.5670230388641357, | |
| "kl": 0.00327301025390625, | |
| "learning_rate": 9.293749999999999e-07, | |
| "loss": 0.013281037099659443, | |
| "reward": 1.6267165541648865, | |
| "reward_std": 0.36898210644721985, | |
| "rewards/GDino": 0.6410032212734222, | |
| "rewards/GIT": 0.2818482890725136, | |
| "rewards/HPSv2": 0.26859092712402344, | |
| "rewards/ORM": 0.4352741092443466, | |
| "self_certainty_semantic": -25.3125, | |
| "self_certainty_token": -21.25, | |
| "step": 113 | |
| }, | |
| { | |
| "completion_length": 68.296875, | |
| "epoch": 0.12624584717607973, | |
| "grad_norm": 0.6704270243644714, | |
| "kl": 0.00307464599609375, | |
| "learning_rate": 9.287499999999999e-07, | |
| "loss": 0.00015758577501401305, | |
| "reward": 2.3069713711738586, | |
| "reward_std": 0.36960119009017944, | |
| "rewards/GDino": 0.7588542103767395, | |
| "rewards/GIT": 0.6726887226104736, | |
| "rewards/HPSv2": 0.2751197814941406, | |
| "rewards/ORM": 0.6003087162971497, | |
| "self_certainty_semantic": -25.4375, | |
| "self_certainty_token": -21.25, | |
| "step": 114 | |
| }, | |
| { | |
| "completion_length": 63.890625, | |
| "epoch": 0.1273532668881506, | |
| "grad_norm": 0.6844286918640137, | |
| "kl": 0.00408935546875, | |
| "learning_rate": 9.281249999999999e-07, | |
| "loss": 0.0020853045862168074, | |
| "reward": 2.1885178685188293, | |
| "reward_std": 0.35547153651714325, | |
| "rewards/GDino": 0.718020498752594, | |
| "rewards/GIT": 0.5492343008518219, | |
| "rewards/HPSv2": 0.2481842041015625, | |
| "rewards/ORM": 0.6730788052082062, | |
| "self_certainty_semantic": -25.4375, | |
| "self_certainty_token": -20.75, | |
| "step": 115 | |
| }, | |
| { | |
| "completion_length": 75.21875, | |
| "epoch": 0.12846068660022147, | |
| "grad_norm": 0.5827351212501526, | |
| "kl": 0.003021240234375, | |
| "learning_rate": 9.274999999999999e-07, | |
| "loss": 0.0005021943943575025, | |
| "reward": 2.2085607051849365, | |
| "reward_std": 0.391997292637825, | |
| "rewards/GDino": 0.7475058436393738, | |
| "rewards/GIT": 0.5436886698007584, | |
| "rewards/HPSv2": 0.26111602783203125, | |
| "rewards/ORM": 0.65625, | |
| "self_certainty_semantic": -25.5, | |
| "self_certainty_token": -20.75, | |
| "step": 116 | |
| }, | |
| { | |
| "completion_length": 66.234375, | |
| "epoch": 0.12956810631229235, | |
| "grad_norm": 8.78965950012207, | |
| "kl": 0.158905029296875, | |
| "learning_rate": 9.268749999999999e-07, | |
| "loss": -0.0129257976077497, | |
| "reward": 2.4095414876937866, | |
| "reward_std": 0.2911904752254486, | |
| "rewards/GDino": 0.8304687738418579, | |
| "rewards/GIT": 0.6444451212882996, | |
| "rewards/HPSv2": 0.27797698974609375, | |
| "rewards/ORM": 0.6566506326198578, | |
| "self_certainty_semantic": -25.625, | |
| "self_certainty_token": -21.75, | |
| "step": 117 | |
| }, | |
| { | |
| "completion_length": 64.578125, | |
| "epoch": 0.13067552602436322, | |
| "grad_norm": 0.6560596823692322, | |
| "kl": 0.00417327880859375, | |
| "learning_rate": 9.2625e-07, | |
| "loss": 0.0029480335651896894, | |
| "reward": 1.8815761804580688, | |
| "reward_std": 0.3823118060827255, | |
| "rewards/GDino": 0.7314696907997131, | |
| "rewards/GIT": 0.41885554790496826, | |
| "rewards/HPSv2": 0.24540138244628906, | |
| "rewards/ORM": 0.4858495891094208, | |
| "self_certainty_semantic": -25.5625, | |
| "self_certainty_token": -20.625, | |
| "step": 118 | |
| }, | |
| { | |
| "completion_length": 56.828125, | |
| "epoch": 0.13178294573643412, | |
| "grad_norm": 1.9917776584625244, | |
| "kl": 0.0042877197265625, | |
| "learning_rate": 9.25625e-07, | |
| "loss": -0.01110410038381815, | |
| "reward": 2.270492196083069, | |
| "reward_std": 0.5458246767520905, | |
| "rewards/GDino": 0.7566670179367065, | |
| "rewards/GIT": 0.5055328160524368, | |
| "rewards/HPSv2": 0.26803016662597656, | |
| "rewards/ORM": 0.7402622997760773, | |
| "self_certainty_semantic": -25.5625, | |
| "self_certainty_token": -21.875, | |
| "step": 119 | |
| }, | |
| { | |
| "completion_length": 72.3125, | |
| "epoch": 0.132890365448505, | |
| "grad_norm": 0.510168194770813, | |
| "kl": 0.00420379638671875, | |
| "learning_rate": 9.25e-07, | |
| "loss": -0.013864397071301937, | |
| "reward": 1.973584771156311, | |
| "reward_std": 0.4184395670890808, | |
| "rewards/GDino": 0.7117854058742523, | |
| "rewards/GIT": 0.43370192497968674, | |
| "rewards/HPSv2": 0.26166534423828125, | |
| "rewards/ORM": 0.5664321482181549, | |
| "self_certainty_semantic": -25.4375, | |
| "self_certainty_token": -21.3125, | |
| "step": 120 | |
| }, | |
| { | |
| "completion_length": 77.171875, | |
| "epoch": 0.13399778516057587, | |
| "grad_norm": 0.5348736643791199, | |
| "kl": 0.00298309326171875, | |
| "learning_rate": 9.243749999999999e-07, | |
| "loss": 0.004201958421617746, | |
| "reward": 1.9280533194541931, | |
| "reward_std": 0.4291805773973465, | |
| "rewards/GDino": 0.7109375, | |
| "rewards/GIT": 0.38363416492938995, | |
| "rewards/HPSv2": 0.25235748291015625, | |
| "rewards/ORM": 0.5811240971088409, | |
| "self_certainty_semantic": -25.4375, | |
| "self_certainty_token": -21.0625, | |
| "step": 121 | |
| }, | |
| { | |
| "completion_length": 70.53125, | |
| "epoch": 0.13510520487264674, | |
| "grad_norm": 0.49879971146583557, | |
| "kl": 0.00412750244140625, | |
| "learning_rate": 9.237499999999999e-07, | |
| "loss": -0.0026759039610624313, | |
| "reward": 1.9971369504928589, | |
| "reward_std": 0.2551337629556656, | |
| "rewards/GDino": 0.72983318567276, | |
| "rewards/GIT": 0.34402593970298767, | |
| "rewards/HPSv2": 0.2877368927001953, | |
| "rewards/ORM": 0.6355408430099487, | |
| "self_certainty_semantic": -25.5, | |
| "self_certainty_token": -22.125, | |
| "step": 122 | |
| }, | |
| { | |
| "completion_length": 64.640625, | |
| "epoch": 0.1362126245847176, | |
| "grad_norm": 0.4230790436267853, | |
| "kl": 0.00341033935546875, | |
| "learning_rate": 9.23125e-07, | |
| "loss": -0.002337672747671604, | |
| "reward": 2.0281134843826294, | |
| "reward_std": 0.3781726509332657, | |
| "rewards/GDino": 0.7874999940395355, | |
| "rewards/GIT": 0.4591221511363983, | |
| "rewards/HPSv2": 0.2555961608886719, | |
| "rewards/ORM": 0.5258950889110565, | |
| "self_certainty_semantic": -25.5625, | |
| "self_certainty_token": -21.5625, | |
| "step": 123 | |
| }, | |
| { | |
| "completion_length": 59.375, | |
| "epoch": 0.13732004429678848, | |
| "grad_norm": 0.9666682481765747, | |
| "kl": 0.00328826904296875, | |
| "learning_rate": 9.225e-07, | |
| "loss": -0.010707761626690626, | |
| "reward": 2.219977855682373, | |
| "reward_std": 0.396147683262825, | |
| "rewards/GDino": 0.7934323251247406, | |
| "rewards/GIT": 0.4874458909034729, | |
| "rewards/HPSv2": 0.2524528503417969, | |
| "rewards/ORM": 0.6866468489170074, | |
| "self_certainty_semantic": -25.5, | |
| "self_certainty_token": -21.0, | |
| "step": 124 | |
| }, | |
| { | |
| "completion_length": 67.421875, | |
| "epoch": 0.13842746400885936, | |
| "grad_norm": 0.4701387286186218, | |
| "kl": 0.00374603271484375, | |
| "learning_rate": 9.21875e-07, | |
| "loss": -0.008014392806217074, | |
| "reward": 2.166910171508789, | |
| "reward_std": 0.44899792969226837, | |
| "rewards/GDino": 0.7873771488666534, | |
| "rewards/GIT": 0.5715728402137756, | |
| "rewards/HPSv2": 0.25487709045410156, | |
| "rewards/ORM": 0.5530830323696136, | |
| "self_certainty_semantic": -25.6875, | |
| "self_certainty_token": -20.9375, | |
| "step": 125 | |
| }, | |
| { | |
| "completion_length": 60.46875, | |
| "epoch": 0.13953488372093023, | |
| "grad_norm": 0.6960640549659729, | |
| "kl": 0.0052337646484375, | |
| "learning_rate": 9.2125e-07, | |
| "loss": 0.005524930078536272, | |
| "reward": 1.941537857055664, | |
| "reward_std": 0.3068820387125015, | |
| "rewards/GDino": 0.69914710521698, | |
| "rewards/GIT": 0.31967807561159134, | |
| "rewards/HPSv2": 0.26458740234375, | |
| "rewards/ORM": 0.6581252217292786, | |
| "self_certainty_semantic": -25.5, | |
| "self_certainty_token": -21.0, | |
| "step": 126 | |
| }, | |
| { | |
| "completion_length": 65.90625, | |
| "epoch": 0.1406423034330011, | |
| "grad_norm": 0.5266240239143372, | |
| "kl": 0.0050506591796875, | |
| "learning_rate": 9.20625e-07, | |
| "loss": -0.008795970119535923, | |
| "reward": 2.2745760679244995, | |
| "reward_std": 0.35941246151924133, | |
| "rewards/GDino": 0.7357383072376251, | |
| "rewards/GIT": 0.42085812985897064, | |
| "rewards/HPSv2": 0.2789630889892578, | |
| "rewards/ORM": 0.8390165567398071, | |
| "self_certainty_semantic": -25.5625, | |
| "self_certainty_token": -21.8125, | |
| "step": 127 | |
| }, | |
| { | |
| "completion_length": 62.0, | |
| "epoch": 0.14174972314507198, | |
| "grad_norm": 1.2693217992782593, | |
| "kl": 0.00701904296875, | |
| "learning_rate": 9.2e-07, | |
| "loss": -0.013476235326379538, | |
| "reward": 1.8667319416999817, | |
| "reward_std": 0.5579482614994049, | |
| "rewards/GDino": 0.6687500178813934, | |
| "rewards/GIT": 0.240242637693882, | |
| "rewards/HPSv2": 0.2608222961425781, | |
| "rewards/ORM": 0.6969169527292252, | |
| "self_certainty_semantic": -25.5625, | |
| "self_certainty_token": -20.8125, | |
| "step": 128 | |
| }, | |
| { | |
| "completion_length": 72.53125, | |
| "epoch": 0.14285714285714285, | |
| "grad_norm": 1.4665846824645996, | |
| "kl": 0.0047454833984375, | |
| "learning_rate": 9.19375e-07, | |
| "loss": -0.006278489250689745, | |
| "reward": 2.076420545578003, | |
| "reward_std": 0.36895356327295303, | |
| "rewards/GDino": 0.739062488079071, | |
| "rewards/GIT": 0.41109369695186615, | |
| "rewards/HPSv2": 0.2513103485107422, | |
| "rewards/ORM": 0.6749540567398071, | |
| "self_certainty_semantic": -25.5625, | |
| "self_certainty_token": -21.875, | |
| "step": 129 | |
| }, | |
| { | |
| "completion_length": 58.046875, | |
| "epoch": 0.14396456256921372, | |
| "grad_norm": 0.7384111285209656, | |
| "kl": 0.00390625, | |
| "learning_rate": 9.187499999999999e-07, | |
| "loss": -0.0109781245701015, | |
| "reward": 1.9833685159683228, | |
| "reward_std": 0.39847198128700256, | |
| "rewards/GDino": 0.7729166448116302, | |
| "rewards/GIT": 0.4782646894454956, | |
| "rewards/HPSv2": 0.24262619018554688, | |
| "rewards/ORM": 0.48956090211868286, | |
| "self_certainty_semantic": -25.625, | |
| "self_certainty_token": -21.375, | |
| "step": 130 | |
| }, | |
| { | |
| "completion_length": 72.0, | |
| "epoch": 0.1450719822812846, | |
| "grad_norm": 0.46645256876945496, | |
| "kl": 0.00476837158203125, | |
| "learning_rate": 9.181249999999999e-07, | |
| "loss": 0.006110590882599354, | |
| "reward": 1.885680913925171, | |
| "reward_std": 0.4655804932117462, | |
| "rewards/GDino": 0.7249231338500977, | |
| "rewards/GIT": 0.35940520465373993, | |
| "rewards/HPSv2": 0.2583580017089844, | |
| "rewards/ORM": 0.5429946184158325, | |
| "self_certainty_semantic": -25.375, | |
| "self_certainty_token": -22.0625, | |
| "step": 131 | |
| }, | |
| { | |
| "completion_length": 53.21875, | |
| "epoch": 0.1461794019933555, | |
| "grad_norm": 0.5023438930511475, | |
| "kl": 0.00583648681640625, | |
| "learning_rate": 9.174999999999999e-07, | |
| "loss": -0.0056219237158074975, | |
| "reward": 2.1214953660964966, | |
| "reward_std": 0.5559927821159363, | |
| "rewards/GDino": 0.8054038286209106, | |
| "rewards/GIT": 0.4245864748954773, | |
| "rewards/HPSv2": 0.2713184356689453, | |
| "rewards/ORM": 0.6201866269111633, | |
| "self_certainty_semantic": -25.75, | |
| "self_certainty_token": -22.0625, | |
| "step": 132 | |
| }, | |
| { | |
| "completion_length": 75.6875, | |
| "epoch": 0.14728682170542637, | |
| "grad_norm": 0.6622663140296936, | |
| "kl": 0.00439453125, | |
| "learning_rate": 9.168749999999999e-07, | |
| "loss": 0.009899101918563247, | |
| "reward": 2.593212366104126, | |
| "reward_std": 0.17419864609837532, | |
| "rewards/GDino": 0.7739583253860474, | |
| "rewards/GIT": 0.6746057868003845, | |
| "rewards/HPSv2": 0.2743816375732422, | |
| "rewards/ORM": 0.8702665567398071, | |
| "self_certainty_semantic": -25.75, | |
| "self_certainty_token": -21.75, | |
| "step": 133 | |
| }, | |
| { | |
| "completion_length": 68.90625, | |
| "epoch": 0.14839424141749724, | |
| "grad_norm": 0.41897183656692505, | |
| "kl": 0.0034942626953125, | |
| "learning_rate": 9.1625e-07, | |
| "loss": 0.002212307066656649, | |
| "reward": 1.978962779045105, | |
| "reward_std": 0.45697829127311707, | |
| "rewards/GDino": 0.7175242900848389, | |
| "rewards/GIT": 0.5035496056079865, | |
| "rewards/HPSv2": 0.24994659423828125, | |
| "rewards/ORM": 0.5079423785209656, | |
| "self_certainty_semantic": -25.625, | |
| "self_certainty_token": -20.3125, | |
| "step": 134 | |
| }, | |
| { | |
| "completion_length": 62.859375, | |
| "epoch": 0.14950166112956811, | |
| "grad_norm": 0.5371299386024475, | |
| "kl": 0.00482177734375, | |
| "learning_rate": 9.15625e-07, | |
| "loss": 0.005879509728401899, | |
| "reward": 2.0941214561462402, | |
| "reward_std": 0.47014716267585754, | |
| "rewards/GDino": 0.774738609790802, | |
| "rewards/GIT": 0.4917849898338318, | |
| "rewards/HPSv2": 0.267425537109375, | |
| "rewards/ORM": 0.5601723045110703, | |
| "self_certainty_semantic": -25.6875, | |
| "self_certainty_token": -20.1875, | |
| "step": 135 | |
| }, | |
| { | |
| "completion_length": 76.578125, | |
| "epoch": 0.150609080841639, | |
| "grad_norm": 0.48601874709129333, | |
| "kl": 0.004486083984375, | |
| "learning_rate": 9.15e-07, | |
| "loss": -0.0003573829308152199, | |
| "reward": 1.8426015377044678, | |
| "reward_std": 0.2483576349914074, | |
| "rewards/GDino": 0.684923529624939, | |
| "rewards/GIT": 0.3237183541059494, | |
| "rewards/HPSv2": 0.2632465362548828, | |
| "rewards/ORM": 0.5707131624221802, | |
| "self_certainty_semantic": -25.5, | |
| "self_certainty_token": -21.0625, | |
| "step": 136 | |
| }, | |
| { | |
| "completion_length": 70.078125, | |
| "epoch": 0.15171650055370986, | |
| "grad_norm": 0.5911806225776672, | |
| "kl": 0.0052947998046875, | |
| "learning_rate": 9.14375e-07, | |
| "loss": -0.008954334072768688, | |
| "reward": 2.0952707529067993, | |
| "reward_std": 0.42313070595264435, | |
| "rewards/GDino": 0.7640625238418579, | |
| "rewards/GIT": 0.5078665241599083, | |
| "rewards/HPSv2": 0.25115394592285156, | |
| "rewards/ORM": 0.5721877217292786, | |
| "self_certainty_semantic": -25.6875, | |
| "self_certainty_token": -21.4375, | |
| "step": 137 | |
| }, | |
| { | |
| "completion_length": 61.75, | |
| "epoch": 0.15282392026578073, | |
| "grad_norm": 0.6094731688499451, | |
| "kl": 0.00860595703125, | |
| "learning_rate": 9.137499999999999e-07, | |
| "loss": -0.00691208359785378, | |
| "reward": 1.8424771428108215, | |
| "reward_std": 0.3106200248003006, | |
| "rewards/GDino": 0.6280561089515686, | |
| "rewards/GIT": 0.2153022214770317, | |
| "rewards/HPSv2": 0.2725563049316406, | |
| "rewards/ORM": 0.7265625, | |
| "self_certainty_semantic": -25.4375, | |
| "self_certainty_token": -21.0, | |
| "step": 138 | |
| }, | |
| { | |
| "completion_length": 76.21875, | |
| "epoch": 0.1539313399778516, | |
| "grad_norm": 0.7681946754455566, | |
| "kl": 0.0045166015625, | |
| "learning_rate": 9.131249999999999e-07, | |
| "loss": 0.006304489565081894, | |
| "reward": 2.0444042682647705, | |
| "reward_std": 0.4021482616662979, | |
| "rewards/GDino": 0.7844302356243134, | |
| "rewards/GIT": 0.33466267585754395, | |
| "rewards/HPSv2": 0.26512908935546875, | |
| "rewards/ORM": 0.6601821780204773, | |
| "self_certainty_semantic": -25.375, | |
| "self_certainty_token": -22.0, | |
| "step": 139 | |
| }, | |
| { | |
| "completion_length": 64.78125, | |
| "epoch": 0.15503875968992248, | |
| "grad_norm": 0.404694527387619, | |
| "kl": 0.00445556640625, | |
| "learning_rate": 9.124999999999999e-07, | |
| "loss": 0.0074170518782921135, | |
| "reward": 2.199423849582672, | |
| "reward_std": 0.3181084841489792, | |
| "rewards/GDino": 0.8405935764312744, | |
| "rewards/GIT": 0.5380776524543762, | |
| "rewards/HPSv2": 0.2516937255859375, | |
| "rewards/ORM": 0.5690587759017944, | |
| "self_certainty_semantic": -25.5, | |
| "self_certainty_token": -21.375, | |
| "step": 140 | |
| }, | |
| { | |
| "completion_length": 80.1875, | |
| "epoch": 0.15614617940199335, | |
| "grad_norm": 0.521449089050293, | |
| "kl": 0.00370025634765625, | |
| "learning_rate": 9.11875e-07, | |
| "loss": 0.01646838476881385, | |
| "reward": 2.4023600816726685, | |
| "reward_std": 0.17732174694538116, | |
| "rewards/GDino": 0.6875, | |
| "rewards/GIT": 0.7328296601772308, | |
| "rewards/HPSv2": 0.24770545959472656, | |
| "rewards/ORM": 0.7343250513076782, | |
| "self_certainty_semantic": -25.5625, | |
| "self_certainty_token": -21.125, | |
| "step": 141 | |
| }, | |
| { | |
| "completion_length": 68.546875, | |
| "epoch": 0.15725359911406422, | |
| "grad_norm": 0.4444400370121002, | |
| "kl": 0.006500244140625, | |
| "learning_rate": 9.1125e-07, | |
| "loss": -0.0020874282345175743, | |
| "reward": 2.2395375967025757, | |
| "reward_std": 0.37212860584259033, | |
| "rewards/GDino": 0.7598958611488342, | |
| "rewards/GIT": 0.5187265872955322, | |
| "rewards/HPSv2": 0.2597951889038086, | |
| "rewards/ORM": 0.7011198401451111, | |
| "self_certainty_semantic": -25.625, | |
| "self_certainty_token": -21.0625, | |
| "step": 142 | |
| }, | |
| { | |
| "completion_length": 60.453125, | |
| "epoch": 0.1583610188261351, | |
| "grad_norm": 0.5732141137123108, | |
| "kl": 0.006134033203125, | |
| "learning_rate": 9.10625e-07, | |
| "loss": -0.0019202656112611294, | |
| "reward": 1.9194607138633728, | |
| "reward_std": 0.5088343024253845, | |
| "rewards/GDino": 0.705212414264679, | |
| "rewards/GIT": 0.3693596422672272, | |
| "rewards/HPSv2": 0.2593517303466797, | |
| "rewards/ORM": 0.5855368673801422, | |
| "self_certainty_semantic": -25.5, | |
| "self_certainty_token": -21.125, | |
| "step": 143 | |
| }, | |
| { | |
| "completion_length": 69.53125, | |
| "epoch": 0.15946843853820597, | |
| "grad_norm": 0.5136631727218628, | |
| "kl": 0.00463104248046875, | |
| "learning_rate": 9.1e-07, | |
| "loss": -0.0024181478656828403, | |
| "reward": 2.1130378246307373, | |
| "reward_std": 0.3436143696308136, | |
| "rewards/GDino": 0.6970658600330353, | |
| "rewards/GIT": 0.5147460252046585, | |
| "rewards/HPSv2": 0.2531890869140625, | |
| "rewards/ORM": 0.6480368673801422, | |
| "self_certainty_semantic": -25.625, | |
| "self_certainty_token": -21.3125, | |
| "step": 144 | |
| }, | |
| { | |
| "completion_length": 63.234375, | |
| "epoch": 0.16057585825027684, | |
| "grad_norm": 0.425749808549881, | |
| "kl": 0.0057220458984375, | |
| "learning_rate": 9.09375e-07, | |
| "loss": 0.0033237107563763857, | |
| "reward": 1.907556176185608, | |
| "reward_std": 0.3990510255098343, | |
| "rewards/GDino": 0.7011643946170807, | |
| "rewards/GIT": 0.3098641186952591, | |
| "rewards/HPSv2": 0.28241920471191406, | |
| "rewards/ORM": 0.6141084432601929, | |
| "self_certainty_semantic": -25.5625, | |
| "self_certainty_token": -22.5625, | |
| "step": 145 | |
| }, | |
| { | |
| "completion_length": 62.953125, | |
| "epoch": 0.16168327796234774, | |
| "grad_norm": 0.5104310512542725, | |
| "kl": 0.0064239501953125, | |
| "learning_rate": 9.087499999999999e-07, | |
| "loss": 0.010284929594490677, | |
| "reward": 2.080387771129608, | |
| "reward_std": 0.4294509291648865, | |
| "rewards/GDino": 0.8376201391220093, | |
| "rewards/GIT": 0.3540365919470787, | |
| "rewards/HPSv2": 0.27114295959472656, | |
| "rewards/ORM": 0.6175881326198578, | |
| "self_certainty_semantic": -25.625, | |
| "self_certainty_token": -21.5, | |
| "step": 146 | |
| }, | |
| { | |
| "completion_length": 67.625, | |
| "epoch": 0.16279069767441862, | |
| "grad_norm": 0.5227380394935608, | |
| "kl": 0.0070343017578125, | |
| "learning_rate": 9.081249999999999e-07, | |
| "loss": 0.003552068490535021, | |
| "reward": 1.605971097946167, | |
| "reward_std": 0.3158091753721237, | |
| "rewards/GDino": 0.6382401585578918, | |
| "rewards/GIT": 0.19080179929733276, | |
| "rewards/HPSv2": 0.25063323974609375, | |
| "rewards/ORM": 0.5262957215309143, | |
| "self_certainty_semantic": -25.375, | |
| "self_certainty_token": -22.9375, | |
| "step": 147 | |
| }, | |
| { | |
| "completion_length": 69.53125, | |
| "epoch": 0.1638981173864895, | |
| "grad_norm": 0.5913640260696411, | |
| "kl": 0.008758544921875, | |
| "learning_rate": 9.074999999999999e-07, | |
| "loss": 0.0023775382433086634, | |
| "reward": 2.265665352344513, | |
| "reward_std": 0.3249353617429733, | |
| "rewards/GDino": 0.8458716571331024, | |
| "rewards/GIT": 0.38859403878450394, | |
| "rewards/HPSv2": 0.27611541748046875, | |
| "rewards/ORM": 0.7550841569900513, | |
| "self_certainty_semantic": -25.9375, | |
| "self_certainty_token": -21.625, | |
| "step": 148 | |
| }, | |
| { | |
| "completion_length": 69.390625, | |
| "epoch": 0.16500553709856036, | |
| "grad_norm": 0.6509791016578674, | |
| "kl": 0.0075836181640625, | |
| "learning_rate": 9.068749999999999e-07, | |
| "loss": -0.010468412889167666, | |
| "reward": 2.1014277935028076, | |
| "reward_std": 0.29370661079883575, | |
| "rewards/GDino": 0.7491666674613953, | |
| "rewards/GIT": 0.3259096145629883, | |
| "rewards/HPSv2": 0.2623310089111328, | |
| "rewards/ORM": 0.7640205323696136, | |
| "self_certainty_semantic": -25.75, | |
| "self_certainty_token": -22.0, | |
| "step": 149 | |
| }, | |
| { | |
| "completion_length": 64.046875, | |
| "epoch": 0.16611295681063123, | |
| "grad_norm": 2.004599094390869, | |
| "kl": 0.01568603515625, | |
| "learning_rate": 9.0625e-07, | |
| "loss": -0.003110084217041731, | |
| "reward": 2.0497288703918457, | |
| "reward_std": 0.46643751859664917, | |
| "rewards/GDino": 0.7837072014808655, | |
| "rewards/GIT": 0.31941479444503784, | |
| "rewards/HPSv2": 0.2623157501220703, | |
| "rewards/ORM": 0.6842910945415497, | |
| "self_certainty_semantic": -25.5, | |
| "self_certainty_token": -21.875, | |
| "step": 150 | |
| }, | |
| { | |
| "completion_length": 68.875, | |
| "epoch": 0.1672203765227021, | |
| "grad_norm": 1.2010647058486938, | |
| "kl": 0.0079498291015625, | |
| "learning_rate": 9.05625e-07, | |
| "loss": 0.0036378083750605583, | |
| "reward": 2.19494891166687, | |
| "reward_std": 0.5349652469158173, | |
| "rewards/GDino": 0.7948823869228363, | |
| "rewards/GIT": 0.3874897435307503, | |
| "rewards/HPSv2": 0.2666778564453125, | |
| "rewards/ORM": 0.7458988428115845, | |
| "self_certainty_semantic": -25.75, | |
| "self_certainty_token": -21.75, | |
| "step": 151 | |
| }, | |
| { | |
| "completion_length": 65.46875, | |
| "epoch": 0.16832779623477298, | |
| "grad_norm": 0.4594494700431824, | |
| "kl": 0.0051727294921875, | |
| "learning_rate": 9.05e-07, | |
| "loss": 0.0013301910366863012, | |
| "reward": 2.1984575986862183, | |
| "reward_std": 0.2301565483212471, | |
| "rewards/GDino": 0.8368903398513794, | |
| "rewards/GIT": 0.4207738786935806, | |
| "rewards/HPSv2": 0.27980995178222656, | |
| "rewards/ORM": 0.6609834432601929, | |
| "self_certainty_semantic": -25.6875, | |
| "self_certainty_token": -21.3125, | |
| "step": 152 | |
| }, | |
| { | |
| "completion_length": 60.875, | |
| "epoch": 0.16943521594684385, | |
| "grad_norm": 0.584158182144165, | |
| "kl": 0.006622314453125, | |
| "learning_rate": 9.04375e-07, | |
| "loss": -0.006514292559586465, | |
| "reward": 2.2534468173980713, | |
| "reward_std": 0.3471103012561798, | |
| "rewards/GDino": 0.7832907140254974, | |
| "rewards/GIT": 0.6241410374641418, | |
| "rewards/HPSv2": 0.2647590637207031, | |
| "rewards/ORM": 0.5812558829784393, | |
| "self_certainty_semantic": -25.625, | |
| "self_certainty_token": -21.8125, | |
| "step": 153 | |
| }, | |
| { | |
| "completion_length": 71.234375, | |
| "epoch": 0.17054263565891473, | |
| "grad_norm": 0.3877808153629303, | |
| "kl": 0.0067138671875, | |
| "learning_rate": 9.0375e-07, | |
| "loss": -0.00840937439352274, | |
| "reward": 1.5600855946540833, | |
| "reward_std": 0.1888522505760193, | |
| "rewards/GDino": 0.6892416477203369, | |
| "rewards/GIT": 0.1894538253545761, | |
| "rewards/HPSv2": 0.26103973388671875, | |
| "rewards/ORM": 0.42035043239593506, | |
| "self_certainty_semantic": -25.4375, | |
| "self_certainty_token": -23.125, | |
| "step": 154 | |
| }, | |
| { | |
| "completion_length": 75.15625, | |
| "epoch": 0.1716500553709856, | |
| "grad_norm": 0.48354580998420715, | |
| "kl": 0.0096588134765625, | |
| "learning_rate": 9.031249999999999e-07, | |
| "loss": 0.019050699658691883, | |
| "reward": 2.116607189178467, | |
| "reward_std": 0.290459081530571, | |
| "rewards/GDino": 0.6718750298023224, | |
| "rewards/GIT": 0.4389065280556679, | |
| "rewards/HPSv2": 0.26484203338623047, | |
| "rewards/ORM": 0.7409836649894714, | |
| "self_certainty_semantic": -25.625, | |
| "self_certainty_token": -20.625, | |
| "step": 155 | |
| }, | |
| { | |
| "completion_length": 70.28125, | |
| "epoch": 0.17275747508305647, | |
| "grad_norm": 0.48019152879714966, | |
| "kl": 0.00909423828125, | |
| "learning_rate": 9.024999999999999e-07, | |
| "loss": -0.006820322363637388, | |
| "reward": 1.7913519144058228, | |
| "reward_std": 0.4075485020875931, | |
| "rewards/GDino": 0.6470568478107452, | |
| "rewards/GIT": 0.21577580273151398, | |
| "rewards/HPSv2": 0.2772235870361328, | |
| "rewards/ORM": 0.6512957215309143, | |
| "self_certainty_semantic": -25.5625, | |
| "self_certainty_token": -21.125, | |
| "step": 156 | |
| }, | |
| { | |
| "completion_length": 79.765625, | |
| "epoch": 0.17386489479512734, | |
| "grad_norm": 0.4524085223674774, | |
| "kl": 0.0062713623046875, | |
| "learning_rate": 9.018749999999999e-07, | |
| "loss": -0.008496122900396585, | |
| "reward": 2.5269054174423218, | |
| "reward_std": 0.3125455528497696, | |
| "rewards/GDino": 0.8450000286102295, | |
| "rewards/GIT": 0.7050136923789978, | |
| "rewards/HPSv2": 0.24599647521972656, | |
| "rewards/ORM": 0.7308953106403351, | |
| "self_certainty_semantic": -25.5625, | |
| "self_certainty_token": -22.1875, | |
| "step": 157 | |
| }, | |
| { | |
| "completion_length": 64.5, | |
| "epoch": 0.17497231450719822, | |
| "grad_norm": 0.43005651235580444, | |
| "kl": 0.009552001953125, | |
| "learning_rate": 9.0125e-07, | |
| "loss": 0.005564866121858358, | |
| "reward": 2.3001022338867188, | |
| "reward_std": 0.2847408503293991, | |
| "rewards/GDino": 0.8344532251358032, | |
| "rewards/GIT": 0.420885294675827, | |
| "rewards/HPSv2": 0.27132606506347656, | |
| "rewards/ORM": 0.7734375298023224, | |
| "self_certainty_semantic": -25.625, | |
| "self_certainty_token": -21.0, | |
| "step": 158 | |
| }, | |
| { | |
| "completion_length": 70.171875, | |
| "epoch": 0.1760797342192691, | |
| "grad_norm": 0.6674854159355164, | |
| "kl": 0.009185791015625, | |
| "learning_rate": 9.00625e-07, | |
| "loss": 0.001701198983937502, | |
| "reward": 2.222777843475342, | |
| "reward_std": 0.4929357320070267, | |
| "rewards/GDino": 0.7640625238418579, | |
| "rewards/GIT": 0.48828309774398804, | |
| "rewards/HPSv2": 0.2673072814941406, | |
| "rewards/ORM": 0.703125, | |
| "self_certainty_semantic": -25.5, | |
| "self_certainty_token": -20.6875, | |
| "step": 159 | |
| }, | |
| { | |
| "completion_length": 64.421875, | |
| "epoch": 0.17718715393134, | |
| "grad_norm": 0.4401383399963379, | |
| "kl": 0.008697509765625, | |
| "learning_rate": 9e-07, | |
| "loss": 0.0025870297104120255, | |
| "reward": 1.7824512124061584, | |
| "reward_std": 0.44338105618953705, | |
| "rewards/GDino": 0.7084426283836365, | |
| "rewards/GIT": 0.286900594830513, | |
| "rewards/HPSv2": 0.2784423828125, | |
| "rewards/ORM": 0.5086656212806702, | |
| "self_certainty_semantic": -25.5625, | |
| "self_certainty_token": -22.0, | |
| "step": 160 | |
| }, | |
| { | |
| "completion_length": 69.328125, | |
| "epoch": 0.17829457364341086, | |
| "grad_norm": 0.6274824142456055, | |
| "kl": 0.008209228515625, | |
| "learning_rate": 8.99375e-07, | |
| "loss": 0.006771775893867016, | |
| "reward": 2.080656409263611, | |
| "reward_std": 0.4039708971977234, | |
| "rewards/GDino": 0.7284385859966278, | |
| "rewards/GIT": 0.4118357300758362, | |
| "rewards/HPSv2": 0.2606945037841797, | |
| "rewards/ORM": 0.6796875, | |
| "self_certainty_semantic": -25.5, | |
| "self_certainty_token": -21.4375, | |
| "step": 161 | |
| }, | |
| { | |
| "completion_length": 87.765625, | |
| "epoch": 0.17940199335548174, | |
| "grad_norm": 0.713962972164154, | |
| "kl": 0.00885009765625, | |
| "learning_rate": 8.9875e-07, | |
| "loss": 0.001781372120603919, | |
| "reward": 2.2108030319213867, | |
| "reward_std": 0.23567625507712364, | |
| "rewards/GDino": 0.9036458432674408, | |
| "rewards/GIT": 0.5173117220401764, | |
| "rewards/HPSv2": 0.2613239288330078, | |
| "rewards/ORM": 0.5285216420888901, | |
| "self_certainty_semantic": -25.8125, | |
| "self_certainty_token": -21.875, | |
| "step": 162 | |
| }, | |
| { | |
| "completion_length": 65.375, | |
| "epoch": 0.1805094130675526, | |
| "grad_norm": 0.45745736360549927, | |
| "kl": 0.010772705078125, | |
| "learning_rate": 8.981249999999999e-07, | |
| "loss": -0.001884209574200213, | |
| "reward": 2.169035792350769, | |
| "reward_std": 0.27702826261520386, | |
| "rewards/GDino": 0.7578125, | |
| "rewards/GIT": 0.6291100382804871, | |
| "rewards/HPSv2": 0.24835586547851562, | |
| "rewards/ORM": 0.5337574481964111, | |
| "self_certainty_semantic": -25.625, | |
| "self_certainty_token": -20.875, | |
| "step": 163 | |
| }, | |
| { | |
| "completion_length": 66.15625, | |
| "epoch": 0.18161683277962348, | |
| "grad_norm": 0.4001372456550598, | |
| "kl": 0.011199951171875, | |
| "learning_rate": 8.974999999999999e-07, | |
| "loss": -0.004290862008929253, | |
| "reward": 2.6795451641082764, | |
| "reward_std": 0.3354812413454056, | |
| "rewards/GDino": 0.8685008883476257, | |
| "rewards/GIT": 0.7786318361759186, | |
| "rewards/HPSv2": 0.27187156677246094, | |
| "rewards/ORM": 0.7605409026145935, | |
| "self_certainty_semantic": -25.5625, | |
| "self_certainty_token": -20.625, | |
| "step": 164 | |
| }, | |
| { | |
| "completion_length": 71.453125, | |
| "epoch": 0.18272425249169436, | |
| "grad_norm": 0.6596059799194336, | |
| "kl": 0.00909423828125, | |
| "learning_rate": 8.96875e-07, | |
| "loss": -0.0067337434738874435, | |
| "reward": 2.3466144800186157, | |
| "reward_std": 0.29852450639009476, | |
| "rewards/GDino": 0.8130539357662201, | |
| "rewards/GIT": 0.49434708058834076, | |
| "rewards/HPSv2": 0.2721138000488281, | |
| "rewards/ORM": 0.7670996189117432, | |
| "self_certainty_semantic": -25.5625, | |
| "self_certainty_token": -22.875, | |
| "step": 165 | |
| }, | |
| { | |
| "completion_length": 79.859375, | |
| "epoch": 0.18383167220376523, | |
| "grad_norm": 0.41807329654693604, | |
| "kl": 0.01123046875, | |
| "learning_rate": 8.9625e-07, | |
| "loss": 0.010698896832764149, | |
| "reward": 2.1671139001846313, | |
| "reward_std": 0.37620842456817627, | |
| "rewards/GDino": 0.7225366532802582, | |
| "rewards/GIT": 0.46812044084072113, | |
| "rewards/HPSv2": 0.2448101043701172, | |
| "rewards/ORM": 0.7316466867923737, | |
| "self_certainty_semantic": -25.5, | |
| "self_certainty_token": -21.3125, | |
| "step": 166 | |
| }, | |
| { | |
| "completion_length": 68.921875, | |
| "epoch": 0.1849390919158361, | |
| "grad_norm": 0.4884219467639923, | |
| "kl": 0.010955810546875, | |
| "learning_rate": 8.95625e-07, | |
| "loss": 0.0020176093094050884, | |
| "reward": 1.979174256324768, | |
| "reward_std": 0.43148648738861084, | |
| "rewards/GDino": 0.7630714476108551, | |
| "rewards/GIT": 0.49030545353889465, | |
| "rewards/HPSv2": 0.2582511901855469, | |
| "rewards/ORM": 0.46754617989063263, | |
| "self_certainty_semantic": -25.5625, | |
| "self_certainty_token": -22.0, | |
| "step": 167 | |
| }, | |
| { | |
| "completion_length": 76.453125, | |
| "epoch": 0.18604651162790697, | |
| "grad_norm": 0.4840864837169647, | |
| "kl": 0.00423431396484375, | |
| "learning_rate": 8.95e-07, | |
| "loss": -0.0033226923551410437, | |
| "reward": 2.049097418785095, | |
| "reward_std": 0.2925217002630234, | |
| "rewards/GDino": 0.7759547531604767, | |
| "rewards/GIT": 0.5475737899541855, | |
| "rewards/HPSv2": 0.25574493408203125, | |
| "rewards/ORM": 0.4698239266872406, | |
| "self_certainty_semantic": -25.5, | |
| "self_certainty_token": -21.4375, | |
| "step": 168 | |
| }, | |
| { | |
| "completion_length": 70.6875, | |
| "epoch": 0.18715393133997785, | |
| "grad_norm": 0.6547427773475647, | |
| "kl": 0.0087890625, | |
| "learning_rate": 8.94375e-07, | |
| "loss": -0.00017379922792315483, | |
| "reward": 2.19344425201416, | |
| "reward_std": 0.3008778989315033, | |
| "rewards/GDino": 0.8275851011276245, | |
| "rewards/GIT": 0.45398683845996857, | |
| "rewards/HPSv2": 0.2814655303955078, | |
| "rewards/ORM": 0.6304067671298981, | |
| "self_certainty_semantic": -25.75, | |
| "self_certainty_token": -21.625, | |
| "step": 169 | |
| }, | |
| { | |
| "completion_length": 82.03125, | |
| "epoch": 0.18826135105204872, | |
| "grad_norm": 0.5040526390075684, | |
| "kl": 0.0142364501953125, | |
| "learning_rate": 8.9375e-07, | |
| "loss": -0.007077913731336594, | |
| "reward": 2.0542516708374023, | |
| "reward_std": 0.3690732419490814, | |
| "rewards/GDino": 0.7519437670707703, | |
| "rewards/GIT": 0.40589363873004913, | |
| "rewards/HPSv2": 0.2560100555419922, | |
| "rewards/ORM": 0.6404041647911072, | |
| "self_certainty_semantic": -25.5, | |
| "self_certainty_token": -21.0, | |
| "step": 170 | |
| }, | |
| { | |
| "completion_length": 64.203125, | |
| "epoch": 0.1893687707641196, | |
| "grad_norm": 0.4935157299041748, | |
| "kl": 0.012420654296875, | |
| "learning_rate": 8.931249999999999e-07, | |
| "loss": 0.0035545220598578453, | |
| "reward": 2.274348735809326, | |
| "reward_std": 0.2875422090291977, | |
| "rewards/GDino": 0.7699261903762817, | |
| "rewards/GIT": 0.5473942309617996, | |
| "rewards/HPSv2": 0.2648448944091797, | |
| "rewards/ORM": 0.6921834945678711, | |
| "self_certainty_semantic": -25.5625, | |
| "self_certainty_token": -21.0, | |
| "step": 171 | |
| }, | |
| { | |
| "completion_length": 74.953125, | |
| "epoch": 0.19047619047619047, | |
| "grad_norm": 0.4935402274131775, | |
| "kl": 0.0087738037109375, | |
| "learning_rate": 8.924999999999999e-07, | |
| "loss": 0.004996137693524361, | |
| "reward": 1.6501405239105225, | |
| "reward_std": 0.3322151154279709, | |
| "rewards/GDino": 0.5804118067026138, | |
| "rewards/GIT": 0.419575035572052, | |
| "rewards/HPSv2": 0.25256919860839844, | |
| "rewards/ORM": 0.39758437871932983, | |
| "self_certainty_semantic": -25.5, | |
| "self_certainty_token": -21.5625, | |
| "step": 172 | |
| }, | |
| { | |
| "completion_length": 63.296875, | |
| "epoch": 0.19158361018826134, | |
| "grad_norm": 1.0840739011764526, | |
| "kl": 0.0174560546875, | |
| "learning_rate": 8.918749999999999e-07, | |
| "loss": 0.0033964416943490505, | |
| "reward": 2.1245768666267395, | |
| "reward_std": 0.29341885447502136, | |
| "rewards/GDino": 0.8359375298023224, | |
| "rewards/GIT": 0.3758692592382431, | |
| "rewards/HPSv2": 0.2845611572265625, | |
| "rewards/ORM": 0.6282089054584503, | |
| "self_certainty_semantic": -25.6875, | |
| "self_certainty_token": -22.0, | |
| "step": 173 | |
| }, | |
| { | |
| "completion_length": 80.53125, | |
| "epoch": 0.19269102990033224, | |
| "grad_norm": 0.4756031036376953, | |
| "kl": 0.0066070556640625, | |
| "learning_rate": 8.912499999999999e-07, | |
| "loss": -0.001147494971519336, | |
| "reward": 2.2244513034820557, | |
| "reward_std": 0.3234108239412308, | |
| "rewards/GDino": 0.7939131259918213, | |
| "rewards/GIT": 0.5430482923984528, | |
| "rewards/HPSv2": 0.2594108581542969, | |
| "rewards/ORM": 0.6280790567398071, | |
| "self_certainty_semantic": -25.5625, | |
| "self_certainty_token": -21.125, | |
| "step": 174 | |
| }, | |
| { | |
| "completion_length": 63.796875, | |
| "epoch": 0.1937984496124031, | |
| "grad_norm": 0.8507784605026245, | |
| "kl": 0.01806640625, | |
| "learning_rate": 8.906249999999999e-07, | |
| "loss": -0.0049158919136971235, | |
| "reward": 2.211203694343567, | |
| "reward_std": 0.30844441056251526, | |
| "rewards/GDino": 0.7877604365348816, | |
| "rewards/GIT": 0.5168893337249756, | |
| "rewards/HPSv2": 0.2628498077392578, | |
| "rewards/ORM": 0.6437040567398071, | |
| "self_certainty_semantic": -25.6875, | |
| "self_certainty_token": -21.125, | |
| "step": 175 | |
| }, | |
| { | |
| "completion_length": 85.40625, | |
| "epoch": 0.19490586932447398, | |
| "grad_norm": 0.4818137586116791, | |
| "kl": 0.00640869140625, | |
| "learning_rate": 8.9e-07, | |
| "loss": -0.0028424898628145456, | |
| "reward": 1.9287346601486206, | |
| "reward_std": 0.36689065396785736, | |
| "rewards/GDino": 0.7782090902328491, | |
| "rewards/GIT": 0.4271218478679657, | |
| "rewards/HPSv2": 0.262115478515625, | |
| "rewards/ORM": 0.461288183927536, | |
| "self_certainty_semantic": -25.625, | |
| "self_certainty_token": -20.75, | |
| "step": 176 | |
| }, | |
| { | |
| "completion_length": 74.65625, | |
| "epoch": 0.19601328903654486, | |
| "grad_norm": 0.5553709864616394, | |
| "kl": 0.014068603515625, | |
| "learning_rate": 8.89375e-07, | |
| "loss": -0.00260241178330034, | |
| "reward": 2.277731418609619, | |
| "reward_std": 0.36928629875183105, | |
| "rewards/GDino": 0.7465280592441559, | |
| "rewards/GIT": 0.4939851015806198, | |
| "rewards/HPSv2": 0.2715930938720703, | |
| "rewards/ORM": 0.765625, | |
| "self_certainty_semantic": -25.875, | |
| "self_certainty_token": -21.25, | |
| "step": 177 | |
| }, | |
| { | |
| "completion_length": 77.53125, | |
| "epoch": 0.19712070874861573, | |
| "grad_norm": 0.812800407409668, | |
| "kl": 0.0077972412109375, | |
| "learning_rate": 8.8875e-07, | |
| "loss": -0.007587546017020941, | |
| "reward": 2.0915766954421997, | |
| "reward_std": 0.39137691259384155, | |
| "rewards/GDino": 0.745751827955246, | |
| "rewards/GIT": 0.40190117061138153, | |
| "rewards/HPSv2": 0.2661113739013672, | |
| "rewards/ORM": 0.6778122782707214, | |
| "self_certainty_semantic": -25.6875, | |
| "self_certainty_token": -21.25, | |
| "step": 178 | |
| }, | |
| { | |
| "completion_length": 64.765625, | |
| "epoch": 0.1982281284606866, | |
| "grad_norm": 0.8705865740776062, | |
| "kl": 0.01080322265625, | |
| "learning_rate": 8.88125e-07, | |
| "loss": -0.00909736379981041, | |
| "reward": 2.4661701917648315, | |
| "reward_std": 0.1972077488899231, | |
| "rewards/GDino": 0.8959279954433441, | |
| "rewards/GIT": 0.5798787474632263, | |
| "rewards/HPSv2": 0.2825050354003906, | |
| "rewards/ORM": 0.7078584432601929, | |
| "self_certainty_semantic": -25.5, | |
| "self_certainty_token": -22.4375, | |
| "step": 179 | |
| }, | |
| { | |
| "completion_length": 75.1875, | |
| "epoch": 0.19933554817275748, | |
| "grad_norm": 1.3513967990875244, | |
| "kl": 0.0105743408203125, | |
| "learning_rate": 8.874999999999999e-07, | |
| "loss": 0.023300296626985073, | |
| "reward": 1.805686593055725, | |
| "reward_std": 0.4569002389907837, | |
| "rewards/GDino": 0.748356282711029, | |
| "rewards/GIT": 0.34142881631851196, | |
| "rewards/HPSv2": 0.2596473693847656, | |
| "rewards/ORM": 0.4562540054321289, | |
| "self_certainty_semantic": -25.6875, | |
| "self_certainty_token": -21.5625, | |
| "step": 180 | |
| }, | |
| { | |
| "completion_length": 74.171875, | |
| "epoch": 0.20044296788482835, | |
| "grad_norm": 0.49861499667167664, | |
| "kl": 0.00799560546875, | |
| "learning_rate": 8.86875e-07, | |
| "loss": 0.005896527087315917, | |
| "reward": 1.8344124555587769, | |
| "reward_std": 0.33161167800426483, | |
| "rewards/GDino": 0.6484833061695099, | |
| "rewards/GIT": 0.3188634589314461, | |
| "rewards/HPSv2": 0.2792530059814453, | |
| "rewards/ORM": 0.587812751531601, | |
| "self_certainty_semantic": -25.625, | |
| "self_certainty_token": -21.75, | |
| "step": 181 | |
| }, | |
| { | |
| "completion_length": 66.53125, | |
| "epoch": 0.20155038759689922, | |
| "grad_norm": 0.518588125705719, | |
| "kl": 0.021148681640625, | |
| "learning_rate": 8.8625e-07, | |
| "loss": -0.0032154046930372715, | |
| "reward": 1.6775782704353333, | |
| "reward_std": 0.4542950987815857, | |
| "rewards/GDino": 0.6909389793872833, | |
| "rewards/GIT": 0.31735002249479294, | |
| "rewards/HPSv2": 0.27741050720214844, | |
| "rewards/ORM": 0.39187873899936676, | |
| "self_certainty_semantic": -25.6875, | |
| "self_certainty_token": -22.0625, | |
| "step": 182 | |
| }, | |
| { | |
| "completion_length": 83.171875, | |
| "epoch": 0.2026578073089701, | |
| "grad_norm": 0.4635794758796692, | |
| "kl": 0.015838623046875, | |
| "learning_rate": 8.85625e-07, | |
| "loss": 0.006844737799838185, | |
| "reward": 1.8692994713783264, | |
| "reward_std": 0.3296326994895935, | |
| "rewards/GDino": 0.7293833494186401, | |
| "rewards/GIT": 0.34990622848272324, | |
| "rewards/HPSv2": 0.2678260803222656, | |
| "rewards/ORM": 0.5221837162971497, | |
| "self_certainty_semantic": -25.6875, | |
| "self_certainty_token": -21.4375, | |
| "step": 183 | |
| }, | |
| { | |
| "completion_length": 63.4375, | |
| "epoch": 0.20376522702104097, | |
| "grad_norm": 0.5085333585739136, | |
| "kl": 0.0120849609375, | |
| "learning_rate": 8.85e-07, | |
| "loss": -0.0026784827932715416, | |
| "reward": 2.799358606338501, | |
| "reward_std": 0.1885242909193039, | |
| "rewards/GDino": 0.925000011920929, | |
| "rewards/GIT": 0.7545149028301239, | |
| "rewards/HPSv2": 0.26367759704589844, | |
| "rewards/ORM": 0.8561660945415497, | |
| "self_certainty_semantic": -25.5, | |
| "self_certainty_token": -21.1875, | |
| "step": 184 | |
| }, | |
| { | |
| "completion_length": 65.859375, | |
| "epoch": 0.20487264673311184, | |
| "grad_norm": 0.5494704842567444, | |
| "kl": 0.013671875, | |
| "learning_rate": 8.84375e-07, | |
| "loss": -0.003346539626363665, | |
| "reward": 2.0845471620559692, | |
| "reward_std": 0.5152666121721268, | |
| "rewards/GDino": 0.7945332229137421, | |
| "rewards/GIT": 0.2876994013786316, | |
| "rewards/HPSv2": 0.27262306213378906, | |
| "rewards/ORM": 0.7296914756298065, | |
| "self_certainty_semantic": -25.6875, | |
| "self_certainty_token": -22.125, | |
| "step": 185 | |
| }, | |
| { | |
| "completion_length": 71.6875, | |
| "epoch": 0.2059800664451827, | |
| "grad_norm": 0.5301854014396667, | |
| "kl": 0.011871337890625, | |
| "learning_rate": 8.8375e-07, | |
| "loss": -0.0013000170001760125, | |
| "reward": 2.0686882734298706, | |
| "reward_std": 0.40786902606487274, | |
| "rewards/GDino": 0.6654029488563538, | |
| "rewards/GIT": 0.3254973590373993, | |
| "rewards/HPSv2": 0.240997314453125, | |
| "rewards/ORM": 0.8367905914783478, | |
| "self_certainty_semantic": -25.625, | |
| "self_certainty_token": -21.0625, | |
| "step": 186 | |
| }, | |
| { | |
| "completion_length": 76.890625, | |
| "epoch": 0.2070874861572536, | |
| "grad_norm": 0.4597737789154053, | |
| "kl": 0.011993408203125, | |
| "learning_rate": 8.83125e-07, | |
| "loss": 0.016351854777894914, | |
| "reward": 2.200950801372528, | |
| "reward_std": 0.35277409851551056, | |
| "rewards/GDino": 0.7939618229866028, | |
| "rewards/GIT": 0.5313694775104523, | |
| "rewards/HPSv2": 0.26030731201171875, | |
| "rewards/ORM": 0.615312248468399, | |
| "self_certainty_semantic": -25.5625, | |
| "self_certainty_token": -21.25, | |
| "step": 187 | |
| }, | |
| { | |
| "completion_length": 65.65625, | |
| "epoch": 0.2081949058693245, | |
| "grad_norm": 0.5319734811782837, | |
| "kl": 0.010162353515625, | |
| "learning_rate": 8.824999999999999e-07, | |
| "loss": 0.00020685815252363682, | |
| "reward": 2.099229574203491, | |
| "reward_std": 0.360196590423584, | |
| "rewards/GDino": 0.7534400224685669, | |
| "rewards/GIT": 0.27092792093753815, | |
| "rewards/HPSv2": 0.2623615264892578, | |
| "rewards/ORM": 0.8125, | |
| "self_certainty_semantic": -25.6875, | |
| "self_certainty_token": -21.625, | |
| "step": 188 | |
| }, | |
| { | |
| "completion_length": 71.703125, | |
| "epoch": 0.20930232558139536, | |
| "grad_norm": 0.7321242690086365, | |
| "kl": 0.0094451904296875, | |
| "learning_rate": 8.818749999999999e-07, | |
| "loss": -0.004028161056339741, | |
| "reward": 2.337135910987854, | |
| "reward_std": 0.31387007236480713, | |
| "rewards/GDino": 0.7773648500442505, | |
| "rewards/GIT": 0.5682414174079895, | |
| "rewards/HPSv2": 0.27951812744140625, | |
| "rewards/ORM": 0.7120114862918854, | |
| "self_certainty_semantic": -25.75, | |
| "self_certainty_token": -21.25, | |
| "step": 189 | |
| }, | |
| { | |
| "completion_length": 70.203125, | |
| "epoch": 0.21040974529346623, | |
| "grad_norm": 1.9930344820022583, | |
| "kl": 0.0136566162109375, | |
| "learning_rate": 8.812499999999999e-07, | |
| "loss": 0.008943180087953806, | |
| "reward": 2.5060739517211914, | |
| "reward_std": 0.16241375356912613, | |
| "rewards/GDino": 0.9254540205001831, | |
| "rewards/GIT": 0.454538494348526, | |
| "rewards/HPSv2": 0.2667064666748047, | |
| "rewards/ORM": 0.859375, | |
| "self_certainty_semantic": -25.6875, | |
| "self_certainty_token": -21.75, | |
| "step": 190 | |
| }, | |
| { | |
| "completion_length": 64.796875, | |
| "epoch": 0.2115171650055371, | |
| "grad_norm": 0.4348452091217041, | |
| "kl": 0.007415771484375, | |
| "learning_rate": 8.806249999999999e-07, | |
| "loss": -0.006945850793272257, | |
| "reward": 2.5402393341064453, | |
| "reward_std": 0.2529807686805725, | |
| "rewards/GDino": 0.8751335144042969, | |
| "rewards/GIT": 0.6033133119344711, | |
| "rewards/HPSv2": 0.27858734130859375, | |
| "rewards/ORM": 0.7832051813602448, | |
| "self_certainty_semantic": -25.625, | |
| "self_certainty_token": -21.5625, | |
| "step": 191 | |
| }, | |
| { | |
| "completion_length": 89.921875, | |
| "epoch": 0.21262458471760798, | |
| "grad_norm": 0.7680485248565674, | |
| "kl": 0.012481689453125, | |
| "learning_rate": 8.799999999999999e-07, | |
| "loss": 0.005377613822929561, | |
| "reward": 1.8802450299263, | |
| "reward_std": 0.3106888607144356, | |
| "rewards/GDino": 0.6456713378429413, | |
| "rewards/GIT": 0.4135439097881317, | |
| "rewards/HPSv2": 0.2503166198730469, | |
| "rewards/ORM": 0.5707131326198578, | |
| "self_certainty_semantic": -25.5625, | |
| "self_certainty_token": -21.125, | |
| "step": 192 | |
| }, | |
| { | |
| "completion_length": 69.78125, | |
| "epoch": 0.21373200442967885, | |
| "grad_norm": 0.5264883637428284, | |
| "kl": 0.010955810546875, | |
| "learning_rate": 8.793749999999999e-07, | |
| "loss": 0.008317717118188739, | |
| "reward": 1.861718237400055, | |
| "reward_std": 0.4164891242980957, | |
| "rewards/GDino": 0.7109375596046448, | |
| "rewards/GIT": 0.21486494690179825, | |
| "rewards/HPSv2": 0.2839984893798828, | |
| "rewards/ORM": 0.6519171893596649, | |
| "self_certainty_semantic": -25.8125, | |
| "self_certainty_token": -22.1875, | |
| "step": 193 | |
| }, | |
| { | |
| "completion_length": 74.75, | |
| "epoch": 0.21483942414174972, | |
| "grad_norm": 0.5414590835571289, | |
| "kl": 0.0074462890625, | |
| "learning_rate": 8.7875e-07, | |
| "loss": -0.0021489104256033897, | |
| "reward": 1.963248074054718, | |
| "reward_std": 0.4292799085378647, | |
| "rewards/GDino": 0.8057583570480347, | |
| "rewards/GIT": 0.5115346312522888, | |
| "rewards/HPSv2": 0.26822662353515625, | |
| "rewards/ORM": 0.37772834300994873, | |
| "self_certainty_semantic": -25.8125, | |
| "self_certainty_token": -21.25, | |
| "step": 194 | |
| }, | |
| { | |
| "completion_length": 68.203125, | |
| "epoch": 0.2159468438538206, | |
| "grad_norm": 0.45540449023246765, | |
| "kl": 0.01312255859375, | |
| "learning_rate": 8.78125e-07, | |
| "loss": -0.004703107755631208, | |
| "reward": 2.011273205280304, | |
| "reward_std": 0.4216621667146683, | |
| "rewards/GDino": 0.7242187261581421, | |
| "rewards/GIT": 0.5994383990764618, | |
| "rewards/HPSv2": 0.27542877197265625, | |
| "rewards/ORM": 0.41218727827072144, | |
| "self_certainty_semantic": -25.6875, | |
| "self_certainty_token": -21.6875, | |
| "step": 195 | |
| }, | |
| { | |
| "completion_length": 79.5, | |
| "epoch": 0.21705426356589147, | |
| "grad_norm": 0.5480747818946838, | |
| "kl": 0.007293701171875, | |
| "learning_rate": 8.774999999999999e-07, | |
| "loss": -0.001077285036444664, | |
| "reward": 2.287221312522888, | |
| "reward_std": 0.3154482841491699, | |
| "rewards/GDino": 0.7235225439071655, | |
| "rewards/GIT": 0.5517593622207642, | |
| "rewards/HPSv2": 0.2792186737060547, | |
| "rewards/ORM": 0.7327205836772919, | |
| "self_certainty_semantic": -25.6875, | |
| "self_certainty_token": -21.6875, | |
| "step": 196 | |
| }, | |
| { | |
| "completion_length": 67.34375, | |
| "epoch": 0.21816168327796234, | |
| "grad_norm": 0.648148238658905, | |
| "kl": 0.01416015625, | |
| "learning_rate": 8.76875e-07, | |
| "loss": 0.0010744737228378654, | |
| "reward": 2.3249343037605286, | |
| "reward_std": 0.40621738135814667, | |
| "rewards/GDino": 0.7385416626930237, | |
| "rewards/GIT": 0.4809828922152519, | |
| "rewards/HPSv2": 0.2538471221923828, | |
| "rewards/ORM": 0.8515625, | |
| "self_certainty_semantic": -25.75, | |
| "self_certainty_token": -20.625, | |
| "step": 197 | |
| }, | |
| { | |
| "completion_length": 74.59375, | |
| "epoch": 0.21926910299003322, | |
| "grad_norm": 0.978819727897644, | |
| "kl": 0.01177978515625, | |
| "learning_rate": 8.7625e-07, | |
| "loss": 0.004215072840452194, | |
| "reward": 2.1429388523101807, | |
| "reward_std": 0.3008539155125618, | |
| "rewards/GDino": 0.8473958671092987, | |
| "rewards/GIT": 0.5675143599510193, | |
| "rewards/HPSv2": 0.2627582550048828, | |
| "rewards/ORM": 0.4652703106403351, | |
| "self_certainty_semantic": -25.625, | |
| "self_certainty_token": -21.4375, | |
| "step": 198 | |
| }, | |
| { | |
| "completion_length": 65.515625, | |
| "epoch": 0.2203765227021041, | |
| "grad_norm": 0.6454822421073914, | |
| "kl": 0.01220703125, | |
| "learning_rate": 8.75625e-07, | |
| "loss": -0.0005628032376989722, | |
| "reward": 2.50363028049469, | |
| "reward_std": 0.3133077025413513, | |
| "rewards/GDino": 0.8082683682441711, | |
| "rewards/GIT": 0.6633397042751312, | |
| "rewards/HPSv2": 0.2600593566894531, | |
| "rewards/ORM": 0.7719629406929016, | |
| "self_certainty_semantic": -25.5, | |
| "self_certainty_token": -22.0, | |
| "step": 199 | |
| }, | |
| { | |
| "completion_length": 61.59375, | |
| "epoch": 0.22148394241417496, | |
| "grad_norm": 0.6677749156951904, | |
| "kl": 0.0155029296875, | |
| "learning_rate": 8.75e-07, | |
| "loss": 0.0032004087697714567, | |
| "reward": 2.0826478004455566, | |
| "reward_std": 0.48166391253471375, | |
| "rewards/GDino": 0.7572438716888428, | |
| "rewards/GIT": 0.2937658578157425, | |
| "rewards/HPSv2": 0.27109718322753906, | |
| "rewards/ORM": 0.7605408430099487, | |
| "self_certainty_semantic": -25.75, | |
| "self_certainty_token": -20.5625, | |
| "step": 200 | |
| }, | |
| { | |
| "completion_length": 63.96875, | |
| "epoch": 0.22259136212624583, | |
| "grad_norm": 0.5104448199272156, | |
| "kl": 0.009307861328125, | |
| "learning_rate": 8.74375e-07, | |
| "loss": -0.003562572179362178, | |
| "reward": 2.1382156014442444, | |
| "reward_std": 0.3752055764198303, | |
| "rewards/GDino": 0.7233067750930786, | |
| "rewards/GIT": 0.41389697045087814, | |
| "rewards/HPSv2": 0.2719917297363281, | |
| "rewards/ORM": 0.7290200591087341, | |
| "self_certainty_semantic": -25.5, | |
| "self_certainty_token": -21.375, | |
| "step": 201 | |
| }, | |
| { | |
| "completion_length": 76.40625, | |
| "epoch": 0.22369878183831673, | |
| "grad_norm": 0.626039445400238, | |
| "kl": 0.01519775390625, | |
| "learning_rate": 8.7375e-07, | |
| "loss": 0.010693363845348358, | |
| "reward": 2.4189499616622925, | |
| "reward_std": 0.3681239038705826, | |
| "rewards/GDino": 0.8692708611488342, | |
| "rewards/GIT": 0.527855783700943, | |
| "rewards/HPSv2": 0.2807598114013672, | |
| "rewards/ORM": 0.7410636246204376, | |
| "self_certainty_semantic": -25.75, | |
| "self_certainty_token": -22.0625, | |
| "step": 202 | |
| }, | |
| { | |
| "completion_length": 70.53125, | |
| "epoch": 0.2248062015503876, | |
| "grad_norm": 0.3975130319595337, | |
| "kl": 0.0157623291015625, | |
| "learning_rate": 8.73125e-07, | |
| "loss": 0.000663774786517024, | |
| "reward": 2.5051724910736084, | |
| "reward_std": 0.25397956371307373, | |
| "rewards/GDino": 0.8890624940395355, | |
| "rewards/GIT": 0.7177164554595947, | |
| "rewards/HPSv2": 0.2733936309814453, | |
| "rewards/ORM": 0.6249999701976776, | |
| "self_certainty_semantic": -25.625, | |
| "self_certainty_token": -21.9375, | |
| "step": 203 | |
| }, | |
| { | |
| "completion_length": 67.078125, | |
| "epoch": 0.22591362126245848, | |
| "grad_norm": 0.4357939660549164, | |
| "kl": 0.01806640625, | |
| "learning_rate": 8.725e-07, | |
| "loss": 0.00449561863206327, | |
| "reward": 2.077805757522583, | |
| "reward_std": 0.3087446913123131, | |
| "rewards/GDino": 0.76171875, | |
| "rewards/GIT": 0.42673608660697937, | |
| "rewards/HPSv2": 0.27997589111328125, | |
| "rewards/ORM": 0.609375, | |
| "self_certainty_semantic": -25.625, | |
| "self_certainty_token": -21.875, | |
| "step": 204 | |
| }, | |
| { | |
| "completion_length": 64.015625, | |
| "epoch": 0.22702104097452935, | |
| "grad_norm": 0.6120555996894836, | |
| "kl": 0.01470947265625, | |
| "learning_rate": 8.718749999999999e-07, | |
| "loss": -0.004034913959912956, | |
| "reward": 2.361166477203369, | |
| "reward_std": 0.4172802269458771, | |
| "rewards/GDino": 0.7758493423461914, | |
| "rewards/GIT": 0.5358432680368423, | |
| "rewards/HPSv2": 0.2727775573730469, | |
| "rewards/ORM": 0.7766963839530945, | |
| "self_certainty_semantic": -25.6875, | |
| "self_certainty_token": -21.0625, | |
| "step": 205 | |
| }, | |
| { | |
| "completion_length": 66.28125, | |
| "epoch": 0.22812846068660023, | |
| "grad_norm": 0.5106468200683594, | |
| "kl": 0.0121612548828125, | |
| "learning_rate": 8.712499999999999e-07, | |
| "loss": -0.0029943487606942654, | |
| "reward": 2.395945906639099, | |
| "reward_std": 0.2518894746899605, | |
| "rewards/GDino": 0.7812500298023224, | |
| "rewards/GIT": 0.44443757832050323, | |
| "rewards/HPSv2": 0.283966064453125, | |
| "rewards/ORM": 0.8862921893596649, | |
| "self_certainty_semantic": -25.8125, | |
| "self_certainty_token": -21.875, | |
| "step": 206 | |
| }, | |
| { | |
| "completion_length": 80.71875, | |
| "epoch": 0.2292358803986711, | |
| "grad_norm": 0.5985101461410522, | |
| "kl": 0.008514404296875, | |
| "learning_rate": 8.706249999999999e-07, | |
| "loss": -0.009422333678230643, | |
| "reward": 2.2342270612716675, | |
| "reward_std": 0.39967483282089233, | |
| "rewards/GDino": 0.8109811544418335, | |
| "rewards/GIT": 0.4852132052183151, | |
| "rewards/HPSv2": 0.2630786895751953, | |
| "rewards/ORM": 0.6749540567398071, | |
| "self_certainty_semantic": -25.5625, | |
| "self_certainty_token": -22.625, | |
| "step": 207 | |
| }, | |
| { | |
| "completion_length": 72.046875, | |
| "epoch": 0.23034330011074197, | |
| "grad_norm": 0.48401689529418945, | |
| "kl": 0.027587890625, | |
| "learning_rate": 8.699999999999999e-07, | |
| "loss": -0.00892256060615182, | |
| "reward": 2.50811767578125, | |
| "reward_std": 0.23593301326036453, | |
| "rewards/GDino": 0.8662500083446503, | |
| "rewards/GIT": 0.6928490549325943, | |
| "rewards/HPSv2": 0.2693309783935547, | |
| "rewards/ORM": 0.6796875, | |
| "self_certainty_semantic": -25.75, | |
| "self_certainty_token": -22.1875, | |
| "step": 208 | |
| }, | |
| { | |
| "completion_length": 67.953125, | |
| "epoch": 0.23145071982281284, | |
| "grad_norm": 0.7185308933258057, | |
| "kl": 0.03375244140625, | |
| "learning_rate": 8.693749999999999e-07, | |
| "loss": 0.008592829457484186, | |
| "reward": 1.9527746438980103, | |
| "reward_std": 0.44384250044822693, | |
| "rewards/GDino": 0.7293368875980377, | |
| "rewards/GIT": 0.380715548992157, | |
| "rewards/HPSv2": 0.27080535888671875, | |
| "rewards/ORM": 0.571916937828064, | |
| "self_certainty_semantic": -25.6875, | |
| "self_certainty_token": -21.6875, | |
| "step": 209 | |
| }, | |
| { | |
| "completion_length": 70.515625, | |
| "epoch": 0.23255813953488372, | |
| "grad_norm": 0.5452485084533691, | |
| "kl": 0.01983642578125, | |
| "learning_rate": 8.687499999999999e-07, | |
| "loss": 0.008935668971389532, | |
| "reward": 2.4658610820770264, | |
| "reward_std": 0.3191976174712181, | |
| "rewards/GDino": 0.8201898336410522, | |
| "rewards/GIT": 0.4580434560775757, | |
| "rewards/HPSv2": 0.28137779235839844, | |
| "rewards/ORM": 0.90625, | |
| "self_certainty_semantic": -25.6875, | |
| "self_certainty_token": -21.6875, | |
| "step": 210 | |
| }, | |
| { | |
| "completion_length": 67.890625, | |
| "epoch": 0.2336655592469546, | |
| "grad_norm": 0.6218281388282776, | |
| "kl": 0.01678466796875, | |
| "learning_rate": 8.681249999999999e-07, | |
| "loss": 0.003950295504182577, | |
| "reward": 2.025146007537842, | |
| "reward_std": 0.498775839805603, | |
| "rewards/GDino": 0.6994791030883789, | |
| "rewards/GIT": 0.4393797814846039, | |
| "rewards/HPSv2": 0.2803916931152344, | |
| "rewards/ORM": 0.6058953106403351, | |
| "self_certainty_semantic": -25.6875, | |
| "self_certainty_token": -21.375, | |
| "step": 211 | |
| }, | |
| { | |
| "completion_length": 59.34375, | |
| "epoch": 0.23477297895902546, | |
| "grad_norm": 0.43229976296424866, | |
| "kl": 0.0074615478515625, | |
| "learning_rate": 8.675000000000001e-07, | |
| "loss": 0.005056330235674977, | |
| "reward": 1.901893436908722, | |
| "reward_std": 0.49373389780521393, | |
| "rewards/GDino": 0.7495389878749847, | |
| "rewards/GIT": 0.2199169397354126, | |
| "rewards/HPSv2": 0.27065086364746094, | |
| "rewards/ORM": 0.6617866456508636, | |
| "self_certainty_semantic": -25.6875, | |
| "self_certainty_token": -21.3125, | |
| "step": 212 | |
| }, | |
| { | |
| "completion_length": 84.03125, | |
| "epoch": 0.23588039867109634, | |
| "grad_norm": 0.43608731031417847, | |
| "kl": 0.024810791015625, | |
| "learning_rate": 8.66875e-07, | |
| "loss": 0.010223755147308111, | |
| "reward": 2.4147619009017944, | |
| "reward_std": 0.3146657347679138, | |
| "rewards/GDino": 0.852343738079071, | |
| "rewards/GIT": 0.6123765110969543, | |
| "rewards/HPSv2": 0.28365135192871094, | |
| "rewards/ORM": 0.6663902103900909, | |
| "self_certainty_semantic": -25.875, | |
| "self_certainty_token": -22.25, | |
| "step": 213 | |
| }, | |
| { | |
| "completion_length": 77.75, | |
| "epoch": 0.2369878183831672, | |
| "grad_norm": 0.5804117918014526, | |
| "kl": 0.021240234375, | |
| "learning_rate": 8.6625e-07, | |
| "loss": 0.007496127160266042, | |
| "reward": 1.7721906900405884, | |
| "reward_std": 0.48169347643852234, | |
| "rewards/GDino": 0.6889558434486389, | |
| "rewards/GIT": 0.28805774822831154, | |
| "rewards/HPSv2": 0.28142738342285156, | |
| "rewards/ORM": 0.5137497633695602, | |
| "self_certainty_semantic": -25.625, | |
| "self_certainty_token": -21.5, | |
| "step": 214 | |
| }, | |
| { | |
| "completion_length": 70.515625, | |
| "epoch": 0.23809523809523808, | |
| "grad_norm": 0.7613699436187744, | |
| "kl": 0.0208740234375, | |
| "learning_rate": 8.65625e-07, | |
| "loss": 0.002267889678478241, | |
| "reward": 2.477326512336731, | |
| "reward_std": 0.33458858728408813, | |
| "rewards/GDino": 0.8195984661579132, | |
| "rewards/GIT": 0.6178127527236938, | |
| "rewards/HPSv2": 0.2648735046386719, | |
| "rewards/ORM": 0.7750419676303864, | |
| "self_certainty_semantic": -25.4375, | |
| "self_certainty_token": -21.1875, | |
| "step": 215 | |
| }, | |
| { | |
| "completion_length": 62.015625, | |
| "epoch": 0.23920265780730898, | |
| "grad_norm": 0.4545797109603882, | |
| "kl": 0.01617431640625, | |
| "learning_rate": 8.65e-07, | |
| "loss": 0.0009205628884956241, | |
| "reward": 2.2222498059272766, | |
| "reward_std": 0.36637741327285767, | |
| "rewards/GDino": 0.8047255873680115, | |
| "rewards/GIT": 0.4453047811985016, | |
| "rewards/HPSv2": 0.2862739562988281, | |
| "rewards/ORM": 0.6859454959630966, | |
| "self_certainty_semantic": -25.6875, | |
| "self_certainty_token": -22.125, | |
| "step": 216 | |
| }, | |
| { | |
| "completion_length": 74.65625, | |
| "epoch": 0.24031007751937986, | |
| "grad_norm": 0.4494488835334778, | |
| "kl": 0.008087158203125, | |
| "learning_rate": 8.64375e-07, | |
| "loss": -0.008069702424108982, | |
| "reward": 2.0422152280807495, | |
| "reward_std": 0.4399893283843994, | |
| "rewards/GDino": 0.7447916567325592, | |
| "rewards/GIT": 0.5272943079471588, | |
| "rewards/HPSv2": 0.2591876983642578, | |
| "rewards/ORM": 0.5109415352344513, | |
| "self_certainty_semantic": -25.75, | |
| "self_certainty_token": -21.5625, | |
| "step": 217 | |
| }, | |
| { | |
| "completion_length": 72.578125, | |
| "epoch": 0.24141749723145073, | |
| "grad_norm": 0.5540902614593506, | |
| "kl": 0.019287109375, | |
| "learning_rate": 8.6375e-07, | |
| "loss": -0.01052069931756705, | |
| "reward": 1.696807324886322, | |
| "reward_std": 0.3257126286625862, | |
| "rewards/GDino": 0.633100837469101, | |
| "rewards/GIT": 0.3013424575328827, | |
| "rewards/HPSv2": 0.25147247314453125, | |
| "rewards/ORM": 0.5108915567398071, | |
| "self_certainty_semantic": -25.6875, | |
| "self_certainty_token": -21.375, | |
| "step": 218 | |
| }, | |
| { | |
| "completion_length": 64.328125, | |
| "epoch": 0.2425249169435216, | |
| "grad_norm": 0.6624598503112793, | |
| "kl": 0.020233154296875, | |
| "learning_rate": 8.63125e-07, | |
| "loss": 0.00015211279969662428, | |
| "reward": 2.0258015394210815, | |
| "reward_std": 0.2695777714252472, | |
| "rewards/GDino": 0.6833088994026184, | |
| "rewards/GIT": 0.45557114481925964, | |
| "rewards/HPSv2": 0.28227996826171875, | |
| "rewards/ORM": 0.6046415567398071, | |
| "self_certainty_semantic": -25.6875, | |
| "self_certainty_token": -22.0, | |
| "step": 219 | |
| }, | |
| { | |
| "completion_length": 72.375, | |
| "epoch": 0.24363233665559247, | |
| "grad_norm": 1.318785309791565, | |
| "kl": 0.0072784423828125, | |
| "learning_rate": 8.625e-07, | |
| "loss": -0.0014179093122947961, | |
| "reward": 2.136075735092163, | |
| "reward_std": 0.28762874752283096, | |
| "rewards/GDino": 0.8410985469818115, | |
| "rewards/GIT": 0.595182478427887, | |
| "rewards/HPSv2": 0.2622947692871094, | |
| "rewards/ORM": 0.4375, | |
| "self_certainty_semantic": -25.75, | |
| "self_certainty_token": -20.375, | |
| "step": 220 | |
| }, | |
| { | |
| "completion_length": 64.5625, | |
| "epoch": 0.24473975636766335, | |
| "grad_norm": 0.5440139770507812, | |
| "kl": 0.01751708984375, | |
| "learning_rate": 8.618749999999999e-07, | |
| "loss": 0.00410419749096036, | |
| "reward": 1.7209655046463013, | |
| "reward_std": 0.49389104545116425, | |
| "rewards/GDino": 0.6376400589942932, | |
| "rewards/GIT": 0.22850769758224487, | |
| "rewards/HPSv2": 0.2633934020996094, | |
| "rewards/ORM": 0.5914241969585419, | |
| "self_certainty_semantic": -25.75, | |
| "self_certainty_token": -21.5, | |
| "step": 221 | |
| }, | |
| { | |
| "completion_length": 64.421875, | |
| "epoch": 0.24584717607973422, | |
| "grad_norm": 0.39151084423065186, | |
| "kl": 0.02105712890625, | |
| "learning_rate": 8.612499999999999e-07, | |
| "loss": 0.0002729548141360283, | |
| "reward": 1.9599390029907227, | |
| "reward_std": 0.3968782275915146, | |
| "rewards/GDino": 0.6790578365325928, | |
| "rewards/GIT": 0.4293278604745865, | |
| "rewards/HPSv2": 0.2733783721923828, | |
| "rewards/ORM": 0.5781749486923218, | |
| "self_certainty_semantic": -25.5625, | |
| "self_certainty_token": -21.625, | |
| "step": 222 | |
| }, | |
| { | |
| "completion_length": 64.40625, | |
| "epoch": 0.2469545957918051, | |
| "grad_norm": 0.9094283580780029, | |
| "kl": 0.01580810546875, | |
| "learning_rate": 8.606249999999999e-07, | |
| "loss": -0.0013667852617800236, | |
| "reward": 2.021697998046875, | |
| "reward_std": 0.4791509807109833, | |
| "rewards/GDino": 0.7039418518543243, | |
| "rewards/GIT": 0.2243501842021942, | |
| "rewards/HPSv2": 0.27252197265625, | |
| "rewards/ORM": 0.8208840191364288, | |
| "self_certainty_semantic": -25.75, | |
| "self_certainty_token": -21.5, | |
| "step": 223 | |
| }, | |
| { | |
| "completion_length": 55.859375, | |
| "epoch": 0.24806201550387597, | |
| "grad_norm": 0.390924334526062, | |
| "kl": 0.01751708984375, | |
| "learning_rate": 8.599999999999999e-07, | |
| "loss": -0.003383996314369142, | |
| "reward": 2.2570880651474, | |
| "reward_std": 0.2598092332482338, | |
| "rewards/GDino": 0.7818973660469055, | |
| "rewards/GIT": 0.46117232739925385, | |
| "rewards/HPSv2": 0.26709747314453125, | |
| "rewards/ORM": 0.7469209432601929, | |
| "self_certainty_semantic": -25.625, | |
| "self_certainty_token": -21.375, | |
| "step": 224 | |
| }, | |
| { | |
| "completion_length": 72.171875, | |
| "epoch": 0.24916943521594684, | |
| "grad_norm": 0.4422501027584076, | |
| "kl": 0.0198974609375, | |
| "learning_rate": 8.593749999999999e-07, | |
| "loss": -0.0018981220200657845, | |
| "reward": 2.231780171394348, | |
| "reward_std": 0.44659605622291565, | |
| "rewards/GDino": 0.7838541567325592, | |
| "rewards/GIT": 0.575921356678009, | |
| "rewards/HPSv2": 0.2408466339111328, | |
| "rewards/ORM": 0.6311580836772919, | |
| "self_certainty_semantic": -25.625, | |
| "self_certainty_token": -21.3125, | |
| "step": 225 | |
| }, | |
| { | |
| "completion_length": 71.515625, | |
| "epoch": 0.2502768549280177, | |
| "grad_norm": 0.5690449476242065, | |
| "kl": 0.030975341796875, | |
| "learning_rate": 8.587499999999999e-07, | |
| "loss": 0.0036481586284935474, | |
| "reward": 2.3258167505264282, | |
| "reward_std": 0.2903416156768799, | |
| "rewards/GDino": 0.8587138652801514, | |
| "rewards/GIT": 0.4227441996335983, | |
| "rewards/HPSv2": 0.2740001678466797, | |
| "rewards/ORM": 0.7703584432601929, | |
| "self_certainty_semantic": -25.75, | |
| "self_certainty_token": -22.375, | |
| "step": 226 | |
| }, | |
| { | |
| "completion_length": 76.265625, | |
| "epoch": 0.2513842746400886, | |
| "grad_norm": 0.7145232558250427, | |
| "kl": 0.0177001953125, | |
| "learning_rate": 8.581249999999999e-07, | |
| "loss": -0.008261570241302252, | |
| "reward": 1.9078750610351562, | |
| "reward_std": 0.4406122863292694, | |
| "rewards/GDino": 0.6768103837966919, | |
| "rewards/GIT": 0.4432929754257202, | |
| "rewards/HPSv2": 0.25499725341796875, | |
| "rewards/ORM": 0.5327745378017426, | |
| "self_certainty_semantic": -25.5625, | |
| "self_certainty_token": -21.875, | |
| "step": 227 | |
| }, | |
| { | |
| "completion_length": 72.75, | |
| "epoch": 0.25249169435215946, | |
| "grad_norm": 0.41743382811546326, | |
| "kl": 0.011627197265625, | |
| "learning_rate": 8.575e-07, | |
| "loss": -0.0010735401883721352, | |
| "reward": 1.964881420135498, | |
| "reward_std": 0.3422084152698517, | |
| "rewards/GDino": 0.8187020123004913, | |
| "rewards/GIT": 0.21802851557731628, | |
| "rewards/HPSv2": 0.2918586730957031, | |
| "rewards/ORM": 0.6362922191619873, | |
| "self_certainty_semantic": -25.875, | |
| "self_certainty_token": -20.9375, | |
| "step": 228 | |
| }, | |
| { | |
| "completion_length": 73.078125, | |
| "epoch": 0.25359911406423036, | |
| "grad_norm": 0.7283642292022705, | |
| "kl": 0.021484375, | |
| "learning_rate": 8.568750000000001e-07, | |
| "loss": -0.004298686049878597, | |
| "reward": 2.1181740760803223, | |
| "reward_std": 0.2697337493300438, | |
| "rewards/GDino": 0.645624965429306, | |
| "rewards/GIT": 0.3558032214641571, | |
| "rewards/HPSv2": 0.270538330078125, | |
| "rewards/ORM": 0.8462075889110565, | |
| "self_certainty_semantic": -25.6875, | |
| "self_certainty_token": -21.375, | |
| "step": 229 | |
| }, | |
| { | |
| "completion_length": 72.765625, | |
| "epoch": 0.2547065337763012, | |
| "grad_norm": 0.6476506590843201, | |
| "kl": 0.0166015625, | |
| "learning_rate": 8.5625e-07, | |
| "loss": -0.004904653993435204, | |
| "reward": 2.1424243450164795, | |
| "reward_std": 0.3121884614229202, | |
| "rewards/GDino": 0.740700364112854, | |
| "rewards/GIT": 0.3593500852584839, | |
| "rewards/HPSv2": 0.27527427673339844, | |
| "rewards/ORM": 0.7670995891094208, | |
| "self_certainty_semantic": -25.875, | |
| "self_certainty_token": -22.3125, | |
| "step": 230 | |
| }, | |
| { | |
| "completion_length": 69.171875, | |
| "epoch": 0.2558139534883721, | |
| "grad_norm": 0.4526921510696411, | |
| "kl": 0.019744873046875, | |
| "learning_rate": 8.55625e-07, | |
| "loss": 0.0003523953491821885, | |
| "reward": 1.910792589187622, | |
| "reward_std": 0.46326547861099243, | |
| "rewards/GDino": 0.706869900226593, | |
| "rewards/GIT": 0.3568519949913025, | |
| "rewards/HPSv2": 0.26113319396972656, | |
| "rewards/ORM": 0.5859375, | |
| "self_certainty_semantic": -25.625, | |
| "self_certainty_token": -22.0, | |
| "step": 231 | |
| }, | |
| { | |
| "completion_length": 75.609375, | |
| "epoch": 0.25692137320044295, | |
| "grad_norm": 0.47148004174232483, | |
| "kl": 0.01458740234375, | |
| "learning_rate": 8.55e-07, | |
| "loss": -0.005282421130686998, | |
| "reward": 2.063035786151886, | |
| "reward_std": 0.38309258222579956, | |
| "rewards/GDino": 0.7156915068626404, | |
| "rewards/GIT": 0.46562977135181427, | |
| "rewards/HPSv2": 0.26252174377441406, | |
| "rewards/ORM": 0.6191926300525665, | |
| "self_certainty_semantic": -25.625, | |
| "self_certainty_token": -21.4375, | |
| "step": 232 | |
| }, | |
| { | |
| "completion_length": 77.578125, | |
| "epoch": 0.25802879291251385, | |
| "grad_norm": 0.7205236554145813, | |
| "kl": 0.016693115234375, | |
| "learning_rate": 8.54375e-07, | |
| "loss": 0.005424320697784424, | |
| "reward": 2.174328565597534, | |
| "reward_std": 0.28449372947216034, | |
| "rewards/GDino": 0.7531249523162842, | |
| "rewards/GIT": 0.30038363486528397, | |
| "rewards/HPSv2": 0.2665290832519531, | |
| "rewards/ORM": 0.8542908430099487, | |
| "self_certainty_semantic": -25.625, | |
| "self_certainty_token": -21.625, | |
| "step": 233 | |
| }, | |
| { | |
| "completion_length": 67.78125, | |
| "epoch": 0.2591362126245847, | |
| "grad_norm": 0.49076974391937256, | |
| "kl": 0.02978515625, | |
| "learning_rate": 8.5375e-07, | |
| "loss": -0.01528711523860693, | |
| "reward": 2.099589467048645, | |
| "reward_std": 0.3918275982141495, | |
| "rewards/GDino": 0.80809485912323, | |
| "rewards/GIT": 0.311017170548439, | |
| "rewards/HPSv2": 0.2805614471435547, | |
| "rewards/ORM": 0.6999160349369049, | |
| "self_certainty_semantic": -25.6875, | |
| "self_certainty_token": -22.0625, | |
| "step": 234 | |
| }, | |
| { | |
| "completion_length": 67.34375, | |
| "epoch": 0.2602436323366556, | |
| "grad_norm": 0.5299943089485168, | |
| "kl": 0.014190673828125, | |
| "learning_rate": 8.53125e-07, | |
| "loss": 0.00436694361269474, | |
| "reward": 1.8783327341079712, | |
| "reward_std": 0.5424820780754089, | |
| "rewards/GDino": 0.7234554290771484, | |
| "rewards/GIT": 0.3852204605937004, | |
| "rewards/HPSv2": 0.27126121520996094, | |
| "rewards/ORM": 0.49839554727077484, | |
| "self_certainty_semantic": -25.6875, | |
| "self_certainty_token": -21.9375, | |
| "step": 235 | |
| }, | |
| { | |
| "completion_length": 78.203125, | |
| "epoch": 0.26135105204872644, | |
| "grad_norm": 1.0189565420150757, | |
| "kl": 0.010833740234375, | |
| "learning_rate": 8.525e-07, | |
| "loss": 0.005404738476499915, | |
| "reward": 1.477653980255127, | |
| "reward_std": 0.4136325716972351, | |
| "rewards/GDino": 0.7152182459831238, | |
| "rewards/GIT": 0.23550968617200851, | |
| "rewards/HPSv2": 0.25348854064941406, | |
| "rewards/ORM": 0.2734375, | |
| "self_certainty_semantic": -25.5, | |
| "self_certainty_token": -22.1875, | |
| "step": 236 | |
| }, | |
| { | |
| "completion_length": 68.25, | |
| "epoch": 0.26245847176079734, | |
| "grad_norm": 0.642930805683136, | |
| "kl": 0.09527587890625, | |
| "learning_rate": 8.51875e-07, | |
| "loss": -0.0037159734638407826, | |
| "reward": 2.431071162223816, | |
| "reward_std": 0.399463415145874, | |
| "rewards/GDino": 0.8427083194255829, | |
| "rewards/GIT": 0.5333812236785889, | |
| "rewards/HPSv2": 0.2799396514892578, | |
| "rewards/ORM": 0.7750419676303864, | |
| "self_certainty_semantic": -25.6875, | |
| "self_certainty_token": -22.3125, | |
| "step": 237 | |
| }, | |
| { | |
| "completion_length": 64.9375, | |
| "epoch": 0.26356589147286824, | |
| "grad_norm": 0.4478345215320587, | |
| "kl": 0.01031494140625, | |
| "learning_rate": 8.512499999999999e-07, | |
| "loss": 0.00545497820712626, | |
| "reward": 2.0591735243797302, | |
| "reward_std": 0.20043298602104187, | |
| "rewards/GDino": 0.8530542254447937, | |
| "rewards/GIT": 0.46807297319173813, | |
| "rewards/HPSv2": 0.26335906982421875, | |
| "rewards/ORM": 0.47468723356723785, | |
| "self_certainty_semantic": -25.9375, | |
| "self_certainty_token": -20.6875, | |
| "step": 238 | |
| }, | |
| { | |
| "completion_length": 80.0625, | |
| "epoch": 0.2646733111849391, | |
| "grad_norm": 0.5843000411987305, | |
| "kl": 0.0184326171875, | |
| "learning_rate": 8.506249999999999e-07, | |
| "loss": -0.0013531917938962579, | |
| "reward": 2.2810004353523254, | |
| "reward_std": 0.22595498710870743, | |
| "rewards/GDino": 0.6398958265781403, | |
| "rewards/GIT": 0.5575685948133469, | |
| "rewards/HPSv2": 0.2814750671386719, | |
| "rewards/ORM": 0.8020609319210052, | |
| "self_certainty_semantic": -25.625, | |
| "self_certainty_token": -20.8125, | |
| "step": 239 | |
| }, | |
| { | |
| "completion_length": 71.171875, | |
| "epoch": 0.26578073089701, | |
| "grad_norm": 0.5621791481971741, | |
| "kl": 0.0168609619140625, | |
| "learning_rate": 8.499999999999999e-07, | |
| "loss": 0.006901541026309133, | |
| "reward": 2.2162342071533203, | |
| "reward_std": 0.08602850884199142, | |
| "rewards/GDino": 0.8477180302143097, | |
| "rewards/GIT": 0.4770164489746094, | |
| "rewards/HPSv2": 0.26997947692871094, | |
| "rewards/ORM": 0.6215203106403351, | |
| "self_certainty_semantic": -25.6875, | |
| "self_certainty_token": -21.625, | |
| "step": 240 | |
| }, | |
| { | |
| "completion_length": 71.015625, | |
| "epoch": 0.26688815060908083, | |
| "grad_norm": 1.044856309890747, | |
| "kl": 0.02154541015625, | |
| "learning_rate": 8.493749999999999e-07, | |
| "loss": -0.004785971017554402, | |
| "reward": 2.2813735008239746, | |
| "reward_std": 0.25123290345072746, | |
| "rewards/GDino": 0.8095787167549133, | |
| "rewards/GIT": 0.6019963622093201, | |
| "rewards/HPSv2": 0.2698402404785156, | |
| "rewards/ORM": 0.599958062171936, | |
| "self_certainty_semantic": -25.9375, | |
| "self_certainty_token": -22.0, | |
| "step": 241 | |
| }, | |
| { | |
| "completion_length": 70.96875, | |
| "epoch": 0.26799557032115173, | |
| "grad_norm": 0.558193564414978, | |
| "kl": 0.03350830078125, | |
| "learning_rate": 8.487499999999999e-07, | |
| "loss": -0.007248041685670614, | |
| "reward": 1.875414788722992, | |
| "reward_std": 0.22249843925237656, | |
| "rewards/GDino": 0.7162744402885437, | |
| "rewards/GIT": 0.35095490515232086, | |
| "rewards/HPSv2": 0.26055908203125, | |
| "rewards/ORM": 0.5476263463497162, | |
| "self_certainty_semantic": -25.625, | |
| "self_certainty_token": -21.5, | |
| "step": 242 | |
| }, | |
| { | |
| "completion_length": 61.96875, | |
| "epoch": 0.2691029900332226, | |
| "grad_norm": 0.8121842741966248, | |
| "kl": 0.034423828125, | |
| "learning_rate": 8.481249999999999e-07, | |
| "loss": 0.004763577948324382, | |
| "reward": 1.673361897468567, | |
| "reward_std": 0.5223372876644135, | |
| "rewards/GDino": 0.6624484360218048, | |
| "rewards/GIT": 0.24029508233070374, | |
| "rewards/HPSv2": 0.26320648193359375, | |
| "rewards/ORM": 0.5074118822813034, | |
| "self_certainty_semantic": -25.625, | |
| "self_certainty_token": -20.625, | |
| "step": 243 | |
| }, | |
| { | |
| "completion_length": 64.78125, | |
| "epoch": 0.2702104097452935, | |
| "grad_norm": 0.5268959403038025, | |
| "kl": 0.0262451171875, | |
| "learning_rate": 8.475e-07, | |
| "loss": 0.003346863901242614, | |
| "reward": 1.9098018407821655, | |
| "reward_std": 0.2704559862613678, | |
| "rewards/GDino": 0.737314760684967, | |
| "rewards/GIT": 0.2029709815979004, | |
| "rewards/HPSv2": 0.26165771484375, | |
| "rewards/ORM": 0.7078584730625153, | |
| "self_certainty_semantic": -25.4375, | |
| "self_certainty_token": -21.4375, | |
| "step": 244 | |
| }, | |
| { | |
| "completion_length": 69.1875, | |
| "epoch": 0.2713178294573643, | |
| "grad_norm": 0.7254846096038818, | |
| "kl": 0.01483154296875, | |
| "learning_rate": 8.46875e-07, | |
| "loss": 0.009370718151330948, | |
| "reward": 1.6544832587242126, | |
| "reward_std": 0.40473152697086334, | |
| "rewards/GDino": 0.7218703925609589, | |
| "rewards/GIT": 0.26652586460113525, | |
| "rewards/HPSv2": 0.2653217315673828, | |
| "rewards/ORM": 0.40076524019241333, | |
| "self_certainty_semantic": -25.8125, | |
| "self_certainty_token": -22.3125, | |
| "step": 245 | |
| }, | |
| { | |
| "completion_length": 83.21875, | |
| "epoch": 0.2724252491694352, | |
| "grad_norm": 0.7293990850448608, | |
| "kl": 0.02117919921875, | |
| "learning_rate": 8.462499999999999e-07, | |
| "loss": -0.0010048565454781055, | |
| "reward": 2.144185781478882, | |
| "reward_std": 0.38181324303150177, | |
| "rewards/GDino": 0.811366617679596, | |
| "rewards/GIT": 0.5691226869821548, | |
| "rewards/HPSv2": 0.254058837890625, | |
| "rewards/ORM": 0.5096377730369568, | |
| "self_certainty_semantic": -25.8125, | |
| "self_certainty_token": -20.875, | |
| "step": 246 | |
| }, | |
| { | |
| "completion_length": 72.125, | |
| "epoch": 0.27353266888150607, | |
| "grad_norm": 0.6859191656112671, | |
| "kl": 0.03387451171875, | |
| "learning_rate": 8.45625e-07, | |
| "loss": -0.008051293902099133, | |
| "reward": 2.1632007360458374, | |
| "reward_std": 0.37384991347789764, | |
| "rewards/GDino": 0.8133151531219482, | |
| "rewards/GIT": 0.40972038358449936, | |
| "rewards/HPSv2": 0.26703643798828125, | |
| "rewards/ORM": 0.6731287837028503, | |
| "self_certainty_semantic": -25.625, | |
| "self_certainty_token": -21.125, | |
| "step": 247 | |
| }, | |
| { | |
| "completion_length": 74.1875, | |
| "epoch": 0.27464008859357697, | |
| "grad_norm": 0.722845733165741, | |
| "kl": 0.023712158203125, | |
| "learning_rate": 8.45e-07, | |
| "loss": 0.012526229955255985, | |
| "reward": 1.6368342638015747, | |
| "reward_std": 0.4980652183294296, | |
| "rewards/GDino": 0.7129978537559509, | |
| "rewards/GIT": 0.22368073463439941, | |
| "rewards/HPSv2": 0.28719520568847656, | |
| "rewards/ORM": 0.4129604697227478, | |
| "self_certainty_semantic": -25.75, | |
| "self_certainty_token": -22.125, | |
| "step": 248 | |
| }, | |
| { | |
| "completion_length": 74.5625, | |
| "epoch": 0.2757475083056478, | |
| "grad_norm": 0.49814099073410034, | |
| "kl": 0.019317626953125, | |
| "learning_rate": 8.44375e-07, | |
| "loss": -0.0019536763429641724, | |
| "reward": 2.5881810188293457, | |
| "reward_std": 0.2942521944642067, | |
| "rewards/GDino": 0.8687500059604645, | |
| "rewards/GIT": 0.70334193110466, | |
| "rewards/HPSv2": 0.2790355682373047, | |
| "rewards/ORM": 0.737053394317627, | |
| "self_certainty_semantic": -25.8125, | |
| "self_certainty_token": -21.3125, | |
| "step": 249 | |
| }, | |
| { | |
| "completion_length": 73.359375, | |
| "epoch": 0.2768549280177187, | |
| "grad_norm": 0.5322409272193909, | |
| "kl": 0.034423828125, | |
| "learning_rate": 8.4375e-07, | |
| "loss": 0.010972056537866592, | |
| "reward": 2.538747787475586, | |
| "reward_std": 0.269253209233284, | |
| "rewards/GDino": 0.859375, | |
| "rewards/GIT": 0.8257900178432465, | |
| "rewards/HPSv2": 0.26576995849609375, | |
| "rewards/ORM": 0.5878127217292786, | |
| "self_certainty_semantic": -25.6875, | |
| "self_certainty_token": -21.5, | |
| "step": 250 | |
| }, | |
| { | |
| "completion_length": 69.96875, | |
| "epoch": 0.2779623477297896, | |
| "grad_norm": 0.43763142824172974, | |
| "kl": 0.014007568359375, | |
| "learning_rate": 8.43125e-07, | |
| "loss": -0.004584175767377019, | |
| "reward": 2.5028269290924072, | |
| "reward_std": 0.3188868314027786, | |
| "rewards/GDino": 0.878125011920929, | |
| "rewards/GIT": 0.7406049966812134, | |
| "rewards/HPSv2": 0.24962997436523438, | |
| "rewards/ORM": 0.6344668865203857, | |
| "self_certainty_semantic": -25.9375, | |
| "self_certainty_token": -20.9375, | |
| "step": 251 | |
| }, | |
| { | |
| "completion_length": 75.609375, | |
| "epoch": 0.27906976744186046, | |
| "grad_norm": 1.2747613191604614, | |
| "kl": 0.05096435546875, | |
| "learning_rate": 8.425e-07, | |
| "loss": 0.0056330859661102295, | |
| "reward": 1.8026528358459473, | |
| "reward_std": 0.3838294893503189, | |
| "rewards/GDino": 0.7048274576663971, | |
| "rewards/GIT": 0.18401113897562027, | |
| "rewards/HPSv2": 0.2747936248779297, | |
| "rewards/ORM": 0.6390205323696136, | |
| "self_certainty_semantic": -25.625, | |
| "self_certainty_token": -22.375, | |
| "step": 252 | |
| }, | |
| { | |
| "completion_length": 58.328125, | |
| "epoch": 0.28017718715393136, | |
| "grad_norm": 0.49052363634109497, | |
| "kl": 0.011322021484375, | |
| "learning_rate": 8.41875e-07, | |
| "loss": -0.007860599551349878, | |
| "reward": 2.464292287826538, | |
| "reward_std": 0.34174694865942, | |
| "rewards/GDino": 0.8541666567325592, | |
| "rewards/GIT": 0.6527195274829865, | |
| "rewards/HPSv2": 0.24892616271972656, | |
| "rewards/ORM": 0.7084799110889435, | |
| "self_certainty_semantic": -25.875, | |
| "self_certainty_token": -22.0, | |
| "step": 253 | |
| }, | |
| { | |
| "completion_length": 64.75, | |
| "epoch": 0.2812846068660022, | |
| "grad_norm": 2.073457956314087, | |
| "kl": 0.02685546875, | |
| "learning_rate": 8.4125e-07, | |
| "loss": -0.011055355425924063, | |
| "reward": 2.175750970840454, | |
| "reward_std": 0.26358360797166824, | |
| "rewards/GDino": 0.8023437261581421, | |
| "rewards/GIT": 0.6660144329071045, | |
| "rewards/HPSv2": 0.2644081115722656, | |
| "rewards/ORM": 0.44298477470874786, | |
| "self_certainty_semantic": -25.625, | |
| "self_certainty_token": -21.5, | |
| "step": 254 | |
| }, | |
| { | |
| "completion_length": 71.09375, | |
| "epoch": 0.2823920265780731, | |
| "grad_norm": 0.4680713415145874, | |
| "kl": 0.02752685546875, | |
| "learning_rate": 8.406249999999999e-07, | |
| "loss": 0.008177328621968627, | |
| "reward": 2.0434359312057495, | |
| "reward_std": 0.1861441507935524, | |
| "rewards/GDino": 0.6919757723808289, | |
| "rewards/GIT": 0.3358978107571602, | |
| "rewards/HPSv2": 0.2795829772949219, | |
| "rewards/ORM": 0.7359794676303864, | |
| "self_certainty_semantic": -25.75, | |
| "self_certainty_token": -21.875, | |
| "step": 255 | |
| }, | |
| { | |
| "completion_length": 67.28125, | |
| "epoch": 0.28349944629014395, | |
| "grad_norm": 0.4970144033432007, | |
| "kl": 0.015625, | |
| "learning_rate": 8.399999999999999e-07, | |
| "loss": -0.002479484537616372, | |
| "reward": 1.8094860911369324, | |
| "reward_std": 0.33552980422973633, | |
| "rewards/GDino": 0.6692599654197693, | |
| "rewards/GIT": 0.360467329621315, | |
| "rewards/HPSv2": 0.2643547058105469, | |
| "rewards/ORM": 0.5154041647911072, | |
| "self_certainty_semantic": -25.5625, | |
| "self_certainty_token": -21.375, | |
| "step": 256 | |
| }, | |
| { | |
| "completion_length": 65.46875, | |
| "epoch": 0.28460686600221485, | |
| "grad_norm": 0.6144054532051086, | |
| "kl": 0.0169677734375, | |
| "learning_rate": 8.393749999999999e-07, | |
| "loss": 0.004478918854147196, | |
| "reward": 1.7510342001914978, | |
| "reward_std": 0.38688288629055023, | |
| "rewards/GDino": 0.6829120516777039, | |
| "rewards/GIT": 0.3309401273727417, | |
| "rewards/HPSv2": 0.2636985778808594, | |
| "rewards/ORM": 0.4734834134578705, | |
| "self_certainty_semantic": -25.6875, | |
| "self_certainty_token": -21.25, | |
| "step": 257 | |
| }, | |
| { | |
| "completion_length": 83.109375, | |
| "epoch": 0.2857142857142857, | |
| "grad_norm": 0.7908658385276794, | |
| "kl": 0.017425537109375, | |
| "learning_rate": 8.387499999999999e-07, | |
| "loss": 0.008913073223084211, | |
| "reward": 1.6875710487365723, | |
| "reward_std": 0.5354342758655548, | |
| "rewards/GDino": 0.5822916924953461, | |
| "rewards/GIT": 0.18823493272066116, | |
| "rewards/HPSv2": 0.27369117736816406, | |
| "rewards/ORM": 0.6433533132076263, | |
| "self_certainty_semantic": -25.875, | |
| "self_certainty_token": -21.0, | |
| "step": 258 | |
| }, | |
| { | |
| "completion_length": 64.734375, | |
| "epoch": 0.2868217054263566, | |
| "grad_norm": 0.5669509172439575, | |
| "kl": 0.01910400390625, | |
| "learning_rate": 8.38125e-07, | |
| "loss": -0.007796134799718857, | |
| "reward": 2.570701003074646, | |
| "reward_std": 0.17354267835617065, | |
| "rewards/GDino": 0.8989583253860474, | |
| "rewards/GIT": 0.7911819815635681, | |
| "rewards/HPSv2": 0.2594928741455078, | |
| "rewards/ORM": 0.6210678368806839, | |
| "self_certainty_semantic": -25.875, | |
| "self_certainty_token": -21.0625, | |
| "step": 259 | |
| }, | |
| { | |
| "completion_length": 72.84375, | |
| "epoch": 0.28792912513842744, | |
| "grad_norm": 0.43997663259506226, | |
| "kl": 0.05230712890625, | |
| "learning_rate": 8.375e-07, | |
| "loss": -0.004798144684173167, | |
| "reward": 1.865119218826294, | |
| "reward_std": 0.5158642530441284, | |
| "rewards/GDino": 0.6778468787670135, | |
| "rewards/GIT": 0.33176329731941223, | |
| "rewards/HPSv2": 0.2755088806152344, | |
| "rewards/ORM": 0.5800002217292786, | |
| "self_certainty_semantic": -26.0, | |
| "self_certainty_token": -22.125, | |
| "step": 260 | |
| }, | |
| { | |
| "completion_length": 74.875, | |
| "epoch": 0.28903654485049834, | |
| "grad_norm": 0.5706507563591003, | |
| "kl": 0.02374267578125, | |
| "learning_rate": 8.36875e-07, | |
| "loss": 0.004061129409819841, | |
| "reward": 2.234776735305786, | |
| "reward_std": 0.3810935467481613, | |
| "rewards/GDino": 0.7571678161621094, | |
| "rewards/GIT": 0.52987040579319, | |
| "rewards/HPSv2": 0.2680511474609375, | |
| "rewards/ORM": 0.6796875, | |
| "self_certainty_semantic": -25.6875, | |
| "self_certainty_token": -20.6875, | |
| "step": 261 | |
| }, | |
| { | |
| "completion_length": 74.5, | |
| "epoch": 0.2901439645625692, | |
| "grad_norm": 0.46872520446777344, | |
| "kl": 0.01873779296875, | |
| "learning_rate": 8.3625e-07, | |
| "loss": -0.00692132324911654, | |
| "reward": 2.576533555984497, | |
| "reward_std": 0.38655444979667664, | |
| "rewards/GDino": 0.8760416209697723, | |
| "rewards/GIT": 0.6822678744792938, | |
| "rewards/HPSv2": 0.26166534423828125, | |
| "rewards/ORM": 0.7565587162971497, | |
| "self_certainty_semantic": -25.875, | |
| "self_certainty_token": -22.1875, | |
| "step": 262 | |
| }, | |
| { | |
| "completion_length": 76.5625, | |
| "epoch": 0.2912513842746401, | |
| "grad_norm": 0.46919816732406616, | |
| "kl": 0.02130126953125, | |
| "learning_rate": 8.356249999999999e-07, | |
| "loss": -0.0069916946813464165, | |
| "reward": 2.5552332401275635, | |
| "reward_std": 0.2706700414419174, | |
| "rewards/GDino": 0.8554803431034088, | |
| "rewards/GIT": 0.5103462636470795, | |
| "rewards/HPSv2": 0.2714118957519531, | |
| "rewards/ORM": 0.9179946780204773, | |
| "self_certainty_semantic": -25.75, | |
| "self_certainty_token": -21.4375, | |
| "step": 263 | |
| }, | |
| { | |
| "completion_length": 72.765625, | |
| "epoch": 0.292358803986711, | |
| "grad_norm": 0.5821876525878906, | |
| "kl": 0.0244140625, | |
| "learning_rate": 8.349999999999999e-07, | |
| "loss": -0.005310273729264736, | |
| "reward": 2.485088586807251, | |
| "reward_std": 0.3456519544124603, | |
| "rewards/GDino": 0.838541716337204, | |
| "rewards/GIT": 0.6395441293716431, | |
| "rewards/HPSv2": 0.2694988250732422, | |
| "rewards/ORM": 0.7375039756298065, | |
| "self_certainty_semantic": -25.6875, | |
| "self_certainty_token": -21.125, | |
| "step": 264 | |
| }, | |
| { | |
| "completion_length": 76.34375, | |
| "epoch": 0.29346622369878184, | |
| "grad_norm": 0.44273218512535095, | |
| "kl": 0.02325439453125, | |
| "learning_rate": 8.34375e-07, | |
| "loss": 0.00350103247910738, | |
| "reward": 2.10404896736145, | |
| "reward_std": 0.5246832072734833, | |
| "rewards/GDino": 0.8096200525760651, | |
| "rewards/GIT": 0.3854048401117325, | |
| "rewards/HPSv2": 0.27233123779296875, | |
| "rewards/ORM": 0.6366928219795227, | |
| "self_certainty_semantic": -25.8125, | |
| "self_certainty_token": -21.0625, | |
| "step": 265 | |
| }, | |
| { | |
| "completion_length": 75.109375, | |
| "epoch": 0.29457364341085274, | |
| "grad_norm": 0.4895631968975067, | |
| "kl": 0.016632080078125, | |
| "learning_rate": 8.3375e-07, | |
| "loss": -0.00301961723016575, | |
| "reward": 1.9082358479499817, | |
| "reward_std": 0.3556567281484604, | |
| "rewards/GDino": 0.6507861316204071, | |
| "rewards/GIT": 0.3627527952194214, | |
| "rewards/HPSv2": 0.2768878936767578, | |
| "rewards/ORM": 0.6178089678287506, | |
| "self_certainty_semantic": -25.75, | |
| "self_certainty_token": -21.0, | |
| "step": 266 | |
| }, | |
| { | |
| "completion_length": 78.015625, | |
| "epoch": 0.2956810631229236, | |
| "grad_norm": 0.5989522933959961, | |
| "kl": 0.01922607421875, | |
| "learning_rate": 8.33125e-07, | |
| "loss": -0.01323670195415616, | |
| "reward": 2.559838652610779, | |
| "reward_std": 0.33854877948760986, | |
| "rewards/GDino": 0.8420600295066833, | |
| "rewards/GIT": 0.5396290123462677, | |
| "rewards/HPSv2": 0.2910041809082031, | |
| "rewards/ORM": 0.8871452808380127, | |
| "self_certainty_semantic": -26.0625, | |
| "self_certainty_token": -21.625, | |
| "step": 267 | |
| }, | |
| { | |
| "completion_length": 78.328125, | |
| "epoch": 0.2967884828349945, | |
| "grad_norm": 0.44201359152793884, | |
| "kl": 0.0205078125, | |
| "learning_rate": 8.325e-07, | |
| "loss": -0.01491079293191433, | |
| "reward": 2.0024473071098328, | |
| "reward_std": 0.34718990325927734, | |
| "rewards/GDino": 0.7285216152667999, | |
| "rewards/GIT": 0.3823501020669937, | |
| "rewards/HPSv2": 0.26154136657714844, | |
| "rewards/ORM": 0.6300341486930847, | |
| "self_certainty_semantic": -25.625, | |
| "self_certainty_token": -22.0, | |
| "step": 268 | |
| }, | |
| { | |
| "completion_length": 83.40625, | |
| "epoch": 0.2978959025470653, | |
| "grad_norm": 0.8317415118217468, | |
| "kl": 0.0340576171875, | |
| "learning_rate": 8.31875e-07, | |
| "loss": -0.0009441054426133633, | |
| "reward": 2.3879988193511963, | |
| "reward_std": 0.3649384081363678, | |
| "rewards/GDino": 0.8694140315055847, | |
| "rewards/GIT": 0.6274352371692657, | |
| "rewards/HPSv2": 0.26614952087402344, | |
| "rewards/ORM": 0.625, | |
| "self_certainty_semantic": -25.6875, | |
| "self_certainty_token": -20.6875, | |
| "step": 269 | |
| }, | |
| { | |
| "completion_length": 80.125, | |
| "epoch": 0.29900332225913623, | |
| "grad_norm": 0.6886855363845825, | |
| "kl": 0.02099609375, | |
| "learning_rate": 8.3125e-07, | |
| "loss": -0.012794415233656764, | |
| "reward": 2.0984017848968506, | |
| "reward_std": 0.1863284632563591, | |
| "rewards/GDino": 0.7838541865348816, | |
| "rewards/GIT": 0.43376635760068893, | |
| "rewards/HPSv2": 0.27881813049316406, | |
| "rewards/ORM": 0.6019631624221802, | |
| "self_certainty_semantic": -25.75, | |
| "self_certainty_token": -21.3125, | |
| "step": 270 | |
| }, | |
| { | |
| "completion_length": 79.578125, | |
| "epoch": 0.3001107419712071, | |
| "grad_norm": 0.4688260853290558, | |
| "kl": 0.028076171875, | |
| "learning_rate": 8.306249999999999e-07, | |
| "loss": -0.008426547283306718, | |
| "reward": 1.7719020247459412, | |
| "reward_std": 0.5150116533041, | |
| "rewards/GDino": 0.7138240337371826, | |
| "rewards/GIT": 0.2824729457497597, | |
| "rewards/HPSv2": 0.26485443115234375, | |
| "rewards/ORM": 0.5107506066560745, | |
| "self_certainty_semantic": -25.5625, | |
| "self_certainty_token": -21.6875, | |
| "step": 271 | |
| }, | |
| { | |
| "completion_length": 70.484375, | |
| "epoch": 0.301218161683278, | |
| "grad_norm": 0.7626622319221497, | |
| "kl": 0.0302734375, | |
| "learning_rate": 8.299999999999999e-07, | |
| "loss": -0.007479890366084874, | |
| "reward": 1.8375248312950134, | |
| "reward_std": 0.39174318313598633, | |
| "rewards/GDino": 0.6447004973888397, | |
| "rewards/GIT": 0.23957757651805878, | |
| "rewards/HPSv2": 0.2825756072998047, | |
| "rewards/ORM": 0.6706711649894714, | |
| "self_certainty_semantic": -25.6875, | |
| "self_certainty_token": -21.9375, | |
| "step": 272 | |
| }, | |
| { | |
| "completion_length": 87.921875, | |
| "epoch": 0.3023255813953488, | |
| "grad_norm": 2.0423707962036133, | |
| "kl": 0.02557373046875, | |
| "learning_rate": 8.293749999999999e-07, | |
| "loss": -0.0029805664089508355, | |
| "reward": 1.7430112957954407, | |
| "reward_std": 0.2507159113883972, | |
| "rewards/GDino": 0.6535544991493225, | |
| "rewards/GIT": 0.31878431141376495, | |
| "rewards/HPSv2": 0.2754058837890625, | |
| "rewards/ORM": 0.4952665716409683, | |
| "self_certainty_semantic": -25.6875, | |
| "self_certainty_token": -22.0625, | |
| "step": 273 | |
| }, | |
| { | |
| "completion_length": 81.4375, | |
| "epoch": 0.3034330011074197, | |
| "grad_norm": 0.7430813908576965, | |
| "kl": 0.0899658203125, | |
| "learning_rate": 8.287499999999999e-07, | |
| "loss": -0.001141307526268065, | |
| "reward": 2.2495153546333313, | |
| "reward_std": 0.34138451516628265, | |
| "rewards/GDino": 0.8024103045463562, | |
| "rewards/GIT": 0.4980108290910721, | |
| "rewards/HPSv2": 0.2546348571777344, | |
| "rewards/ORM": 0.6944593787193298, | |
| "self_certainty_semantic": -25.6875, | |
| "self_certainty_token": -22.0625, | |
| "step": 274 | |
| }, | |
| { | |
| "completion_length": 82.0, | |
| "epoch": 0.30454042081949056, | |
| "grad_norm": 0.6249475479125977, | |
| "kl": 0.021484375, | |
| "learning_rate": 8.28125e-07, | |
| "loss": -0.006782526383176446, | |
| "reward": 2.718081474304199, | |
| "reward_std": 0.2731604278087616, | |
| "rewards/GDino": 0.8783854246139526, | |
| "rewards/GIT": 0.782598078250885, | |
| "rewards/HPSv2": 0.2852649688720703, | |
| "rewards/ORM": 0.771833062171936, | |
| "self_certainty_semantic": -25.75, | |
| "self_certainty_token": -21.5625, | |
| "step": 275 | |
| }, | |
| { | |
| "completion_length": 67.046875, | |
| "epoch": 0.30564784053156147, | |
| "grad_norm": 0.4724781811237335, | |
| "kl": 0.01434326171875, | |
| "learning_rate": 8.275e-07, | |
| "loss": 0.00598024798091501, | |
| "reward": 2.1995996236801147, | |
| "reward_std": 0.31263478100299835, | |
| "rewards/GDino": 0.7332243323326111, | |
| "rewards/GIT": 0.4423800855875015, | |
| "rewards/HPSv2": 0.27051544189453125, | |
| "rewards/ORM": 0.7534796893596649, | |
| "self_certainty_semantic": -25.75, | |
| "self_certainty_token": -21.4375, | |
| "step": 276 | |
| }, | |
| { | |
| "completion_length": 84.015625, | |
| "epoch": 0.3067552602436323, | |
| "grad_norm": 0.7819753289222717, | |
| "kl": 0.0323486328125, | |
| "learning_rate": 8.26875e-07, | |
| "loss": -0.0006253474857658148, | |
| "reward": 2.2662617564201355, | |
| "reward_std": 0.33760039508342743, | |
| "rewards/GDino": 0.7433146238327026, | |
| "rewards/GIT": 0.5460084825754166, | |
| "rewards/HPSv2": 0.27779579162597656, | |
| "rewards/ORM": 0.6991429030895233, | |
| "self_certainty_semantic": -25.6875, | |
| "self_certainty_token": -21.5, | |
| "step": 277 | |
| }, | |
| { | |
| "completion_length": 69.65625, | |
| "epoch": 0.3078626799557032, | |
| "grad_norm": 0.4542302191257477, | |
| "kl": 0.0260009765625, | |
| "learning_rate": 8.2625e-07, | |
| "loss": 0.0030680494382977486, | |
| "reward": 2.175543189048767, | |
| "reward_std": 0.4059564173221588, | |
| "rewards/GDino": 0.8273285925388336, | |
| "rewards/GIT": 0.6166777014732361, | |
| "rewards/HPSv2": 0.2625160217285156, | |
| "rewards/ORM": 0.4690207839012146, | |
| "self_certainty_semantic": -25.75, | |
| "self_certainty_token": -21.1875, | |
| "step": 278 | |
| }, | |
| { | |
| "completion_length": 73.71875, | |
| "epoch": 0.3089700996677741, | |
| "grad_norm": 0.5411505103111267, | |
| "kl": 0.019134521484375, | |
| "learning_rate": 8.25625e-07, | |
| "loss": -0.010078638093546033, | |
| "reward": 2.640411376953125, | |
| "reward_std": 0.28325602412223816, | |
| "rewards/GDino": 0.8475841879844666, | |
| "rewards/GIT": 0.771721363067627, | |
| "rewards/HPSv2": 0.266693115234375, | |
| "rewards/ORM": 0.7544127404689789, | |
| "self_certainty_semantic": -25.875, | |
| "self_certainty_token": -21.0, | |
| "step": 279 | |
| }, | |
| { | |
| "completion_length": 73.46875, | |
| "epoch": 0.31007751937984496, | |
| "grad_norm": 0.46741002798080444, | |
| "kl": 0.026458740234375, | |
| "learning_rate": 8.249999999999999e-07, | |
| "loss": -0.0019589242292568088, | |
| "reward": 2.315522074699402, | |
| "reward_std": 0.3852066993713379, | |
| "rewards/GDino": 0.8072916269302368, | |
| "rewards/GIT": 0.51693394780159, | |
| "rewards/HPSv2": 0.2585258483886719, | |
| "rewards/ORM": 0.732770562171936, | |
| "self_certainty_semantic": -25.625, | |
| "self_certainty_token": -22.25, | |
| "step": 280 | |
| }, | |
| { | |
| "completion_length": 73.0625, | |
| "epoch": 0.31118493909191586, | |
| "grad_norm": 0.5686728954315186, | |
| "kl": 0.033203125, | |
| "learning_rate": 8.243749999999999e-07, | |
| "loss": -0.00037761888233944774, | |
| "reward": 2.1060147285461426, | |
| "reward_std": 0.2314438670873642, | |
| "rewards/GDino": 0.7503374814987183, | |
| "rewards/GIT": 0.43248776346445084, | |
| "rewards/HPSv2": 0.27953529357910156, | |
| "rewards/ORM": 0.6436541378498077, | |
| "self_certainty_semantic": -25.5625, | |
| "self_certainty_token": -21.8125, | |
| "step": 281 | |
| }, | |
| { | |
| "completion_length": 89.703125, | |
| "epoch": 0.3122923588039867, | |
| "grad_norm": 1.0470083951950073, | |
| "kl": 0.02166748046875, | |
| "learning_rate": 8.2375e-07, | |
| "loss": -0.0013394411653280258, | |
| "reward": 2.125031590461731, | |
| "reward_std": 0.24564317613840103, | |
| "rewards/GDino": 0.8199155628681183, | |
| "rewards/GIT": 0.5837800800800323, | |
| "rewards/HPSv2": 0.2696857452392578, | |
| "rewards/ORM": 0.45165039598941803, | |
| "self_certainty_semantic": -26.0, | |
| "self_certainty_token": -20.9375, | |
| "step": 282 | |
| }, | |
| { | |
| "completion_length": 83.28125, | |
| "epoch": 0.3133997785160576, | |
| "grad_norm": 0.6488218903541565, | |
| "kl": 0.0267333984375, | |
| "learning_rate": 8.23125e-07, | |
| "loss": 0.005290511529892683, | |
| "reward": 2.1087766885757446, | |
| "reward_std": 0.31704336404800415, | |
| "rewards/GDino": 0.7598958313465118, | |
| "rewards/GIT": 0.4413727596402168, | |
| "rewards/HPSv2": 0.25473785400390625, | |
| "rewards/ORM": 0.6527703106403351, | |
| "self_certainty_semantic": -25.8125, | |
| "self_certainty_token": -21.1875, | |
| "step": 283 | |
| }, | |
| { | |
| "completion_length": 78.765625, | |
| "epoch": 0.31450719822812845, | |
| "grad_norm": 0.8609553575515747, | |
| "kl": 0.03515625, | |
| "learning_rate": 8.225e-07, | |
| "loss": 0.0021090602967888117, | |
| "reward": 1.7876678705215454, | |
| "reward_std": 0.3264614939689636, | |
| "rewards/GDino": 0.7270833253860474, | |
| "rewards/GIT": 0.3341338261961937, | |
| "rewards/HPSv2": 0.26984596252441406, | |
| "rewards/ORM": 0.4566046893596649, | |
| "self_certainty_semantic": -25.5625, | |
| "self_certainty_token": -21.9375, | |
| "step": 284 | |
| }, | |
| { | |
| "completion_length": 67.046875, | |
| "epoch": 0.31561461794019935, | |
| "grad_norm": 0.6733913421630859, | |
| "kl": 0.049072265625, | |
| "learning_rate": 8.21875e-07, | |
| "loss": -0.009910227498039603, | |
| "reward": 1.693782925605774, | |
| "reward_std": 0.39314794540405273, | |
| "rewards/GDino": 0.7284678220748901, | |
| "rewards/GIT": 0.18779370188713074, | |
| "rewards/HPSv2": 0.27591705322265625, | |
| "rewards/ORM": 0.5016044080257416, | |
| "self_certainty_semantic": -25.75, | |
| "self_certainty_token": -21.1875, | |
| "step": 285 | |
| }, | |
| { | |
| "completion_length": 78.75, | |
| "epoch": 0.3167220376522702, | |
| "grad_norm": 0.4943682849407196, | |
| "kl": 0.018035888671875, | |
| "learning_rate": 8.2125e-07, | |
| "loss": -0.0012577057350426912, | |
| "reward": 2.193196415901184, | |
| "reward_std": 0.46209120750427246, | |
| "rewards/GDino": 0.8109375238418579, | |
| "rewards/GIT": 0.4691374748945236, | |
| "rewards/HPSv2": 0.26856422424316406, | |
| "rewards/ORM": 0.6445571482181549, | |
| "self_certainty_semantic": -25.5625, | |
| "self_certainty_token": -21.1875, | |
| "step": 286 | |
| }, | |
| { | |
| "completion_length": 75.4375, | |
| "epoch": 0.3178294573643411, | |
| "grad_norm": 0.6116342544555664, | |
| "kl": 0.0264892578125, | |
| "learning_rate": 8.20625e-07, | |
| "loss": 0.0010706414468586445, | |
| "reward": 2.312258720397949, | |
| "reward_std": 0.3226209282875061, | |
| "rewards/GDino": 0.7510090172290802, | |
| "rewards/GIT": 0.5955759733915329, | |
| "rewards/HPSv2": 0.27696990966796875, | |
| "rewards/ORM": 0.6887038052082062, | |
| "self_certainty_semantic": -25.75, | |
| "self_certainty_token": -21.5, | |
| "step": 287 | |
| }, | |
| { | |
| "completion_length": 80.921875, | |
| "epoch": 0.31893687707641194, | |
| "grad_norm": 0.5414326190948486, | |
| "kl": 0.02142333984375, | |
| "learning_rate": 8.199999999999999e-07, | |
| "loss": -0.0018570725806057453, | |
| "reward": 2.3252252340316772, | |
| "reward_std": 0.4416055828332901, | |
| "rewards/GDino": 0.7875000238418579, | |
| "rewards/GIT": 0.44259513914585114, | |
| "rewards/HPSv2": 0.25606346130371094, | |
| "rewards/ORM": 0.8390664756298065, | |
| "self_certainty_semantic": -25.625, | |
| "self_certainty_token": -21.875, | |
| "step": 288 | |
| }, | |
| { | |
| "completion_length": 73.453125, | |
| "epoch": 0.32004429678848284, | |
| "grad_norm": 2.0414440631866455, | |
| "kl": 0.0384521484375, | |
| "learning_rate": 8.193749999999999e-07, | |
| "loss": -0.0028190852608531713, | |
| "reward": 2.363397717475891, | |
| "reward_std": 0.38728441298007965, | |
| "rewards/GDino": 0.7780884504318237, | |
| "rewards/GIT": 0.454902321100235, | |
| "rewards/HPSv2": 0.2644233703613281, | |
| "rewards/ORM": 0.8659836649894714, | |
| "self_certainty_semantic": -25.875, | |
| "self_certainty_token": -21.6875, | |
| "step": 289 | |
| }, | |
| { | |
| "completion_length": 93.84375, | |
| "epoch": 0.3211517165005537, | |
| "grad_norm": 0.46440112590789795, | |
| "kl": 0.0213623046875, | |
| "learning_rate": 8.187499999999999e-07, | |
| "loss": -0.003084618365392089, | |
| "reward": 1.9635199308395386, | |
| "reward_std": 0.3792533278465271, | |
| "rewards/GDino": 0.6178125143051147, | |
| "rewards/GIT": 0.32137173414230347, | |
| "rewards/HPSv2": 0.27581024169921875, | |
| "rewards/ORM": 0.7485254108905792, | |
| "self_certainty_semantic": -25.6875, | |
| "self_certainty_token": -22.0, | |
| "step": 290 | |
| }, | |
| { | |
| "completion_length": 78.65625, | |
| "epoch": 0.3222591362126246, | |
| "grad_norm": 0.6124201416969299, | |
| "kl": 0.0443115234375, | |
| "learning_rate": 8.18125e-07, | |
| "loss": 0.0013587521389126778, | |
| "reward": 1.9497240781784058, | |
| "reward_std": 0.3417292982339859, | |
| "rewards/GDino": 0.8001987338066101, | |
| "rewards/GIT": 0.39713574945926666, | |
| "rewards/HPSv2": 0.26921844482421875, | |
| "rewards/ORM": 0.483171209692955, | |
| "self_certainty_semantic": -25.5625, | |
| "self_certainty_token": -21.8125, | |
| "step": 291 | |
| }, | |
| { | |
| "completion_length": 92.625, | |
| "epoch": 0.3233665559246955, | |
| "grad_norm": 0.9040305018424988, | |
| "kl": 0.027099609375, | |
| "learning_rate": 8.175e-07, | |
| "loss": 0.0031354378443211317, | |
| "reward": 2.339022397994995, | |
| "reward_std": 0.21652893722057343, | |
| "rewards/GDino": 0.839062511920929, | |
| "rewards/GIT": 0.5841934829950333, | |
| "rewards/HPSv2": 0.27198219299316406, | |
| "rewards/ORM": 0.6437839716672897, | |
| "self_certainty_semantic": -25.5, | |
| "self_certainty_token": -21.125, | |
| "step": 292 | |
| }, | |
| { | |
| "completion_length": 79.234375, | |
| "epoch": 0.32447397563676633, | |
| "grad_norm": 0.46516144275665283, | |
| "kl": 0.0208740234375, | |
| "learning_rate": 8.16875e-07, | |
| "loss": 0.009912369772791862, | |
| "reward": 2.337222397327423, | |
| "reward_std": 0.2715953439474106, | |
| "rewards/GDino": 0.7020833194255829, | |
| "rewards/GIT": 0.46142156422138214, | |
| "rewards/HPSv2": 0.2674674987792969, | |
| "rewards/ORM": 0.90625, | |
| "self_certainty_semantic": -25.875, | |
| "self_certainty_token": -20.875, | |
| "step": 293 | |
| }, | |
| { | |
| "completion_length": 77.421875, | |
| "epoch": 0.32558139534883723, | |
| "grad_norm": 0.5907319784164429, | |
| "kl": 0.0245361328125, | |
| "learning_rate": 8.1625e-07, | |
| "loss": -0.008728538639843464, | |
| "reward": 2.7809219360351562, | |
| "reward_std": 0.24086545407772064, | |
| "rewards/GDino": 0.882291704416275, | |
| "rewards/GIT": 0.6462251394987106, | |
| "rewards/HPSv2": 0.28365516662597656, | |
| "rewards/ORM": 0.96875, | |
| "self_certainty_semantic": -25.5625, | |
| "self_certainty_token": -21.75, | |
| "step": 294 | |
| }, | |
| { | |
| "completion_length": 83.265625, | |
| "epoch": 0.3266888150609081, | |
| "grad_norm": 1.4958977699279785, | |
| "kl": 0.0491943359375, | |
| "learning_rate": 8.15625e-07, | |
| "loss": 0.0032177013345062733, | |
| "reward": 1.7095434069633484, | |
| "reward_std": 0.4502948075532913, | |
| "rewards/GDino": 0.6662150025367737, | |
| "rewards/GIT": 0.10812210291624069, | |
| "rewards/HPSv2": 0.2896270751953125, | |
| "rewards/ORM": 0.6455792784690857, | |
| "self_certainty_semantic": -25.8125, | |
| "self_certainty_token": -21.9375, | |
| "step": 295 | |
| }, | |
| { | |
| "completion_length": 73.25, | |
| "epoch": 0.327796234772979, | |
| "grad_norm": 6.574085712432861, | |
| "kl": 0.02142333984375, | |
| "learning_rate": 8.149999999999999e-07, | |
| "loss": 0.0013078644406050444, | |
| "reward": 2.7796449661254883, | |
| "reward_std": 0.2162124067544937, | |
| "rewards/GDino": 0.9666666984558105, | |
| "rewards/GIT": 0.8142045736312866, | |
| "rewards/HPSv2": 0.2789497375488281, | |
| "rewards/ORM": 0.7198239415884018, | |
| "self_certainty_semantic": -26.0, | |
| "self_certainty_token": -20.5625, | |
| "step": 296 | |
| }, | |
| { | |
| "completion_length": 81.25, | |
| "epoch": 0.3289036544850498, | |
| "grad_norm": 2.1547293663024902, | |
| "kl": 0.056884765625, | |
| "learning_rate": 8.143749999999999e-07, | |
| "loss": 0.01906517706811428, | |
| "reward": 1.9463022351264954, | |
| "reward_std": 0.301323801279068, | |
| "rewards/GDino": 0.816184788942337, | |
| "rewards/GIT": 0.2949872240424156, | |
| "rewards/HPSv2": 0.2578582763671875, | |
| "rewards/ORM": 0.5772718787193298, | |
| "self_certainty_semantic": -25.8125, | |
| "self_certainty_token": -22.25, | |
| "step": 297 | |
| }, | |
| { | |
| "completion_length": 79.46875, | |
| "epoch": 0.3300110741971207, | |
| "grad_norm": 0.6863793134689331, | |
| "kl": 0.0230712890625, | |
| "learning_rate": 8.137499999999999e-07, | |
| "loss": -0.005342667340300977, | |
| "reward": 2.2086856365203857, | |
| "reward_std": 0.255329854786396, | |
| "rewards/GDino": 0.8382692337036133, | |
| "rewards/GIT": 0.6078545451164246, | |
| "rewards/HPSv2": 0.2703742980957031, | |
| "rewards/ORM": 0.4921875, | |
| "self_certainty_semantic": -25.8125, | |
| "self_certainty_token": -22.0, | |
| "step": 298 | |
| }, | |
| { | |
| "completion_length": 77.984375, | |
| "epoch": 0.33111849390919157, | |
| "grad_norm": 1.2127599716186523, | |
| "kl": 0.02545166015625, | |
| "learning_rate": 8.131249999999999e-07, | |
| "loss": -0.0011417875648476183, | |
| "reward": 2.55258309841156, | |
| "reward_std": 0.41294097900390625, | |
| "rewards/GDino": 0.8872395753860474, | |
| "rewards/GIT": 0.7489987313747406, | |
| "rewards/HPSv2": 0.2650489807128906, | |
| "rewards/ORM": 0.6512957215309143, | |
| "self_certainty_semantic": -25.875, | |
| "self_certainty_token": -21.6875, | |
| "step": 299 | |
| }, | |
| { | |
| "completion_length": 73.25, | |
| "epoch": 0.33222591362126247, | |
| "grad_norm": 0.4840102791786194, | |
| "kl": 0.02264404296875, | |
| "learning_rate": 8.125e-07, | |
| "loss": 0.0034709569881670177, | |
| "reward": 2.5574848651885986, | |
| "reward_std": 0.20182272791862488, | |
| "rewards/GDino": 0.7973958551883698, | |
| "rewards/GIT": 0.6920412927865982, | |
| "rewards/HPSv2": 0.2663593292236328, | |
| "rewards/ORM": 0.8016884028911591, | |
| "self_certainty_semantic": -25.8125, | |
| "self_certainty_token": -21.1875, | |
| "step": 300 | |
| }, | |
| { | |
| "completion_length": 88.65625, | |
| "epoch": 0.3333333333333333, | |
| "grad_norm": 0.4527220129966736, | |
| "kl": 0.021453857421875, | |
| "learning_rate": 8.11875e-07, | |
| "loss": 0.005656351568177342, | |
| "reward": 2.1882619857788086, | |
| "reward_std": 0.3654931038618088, | |
| "rewards/GDino": 0.7827093601226807, | |
| "rewards/GIT": 0.4531768709421158, | |
| "rewards/HPSv2": 0.255859375, | |
| "rewards/ORM": 0.6965163350105286, | |
| "self_certainty_semantic": -26.0, | |
| "self_certainty_token": -21.5625, | |
| "step": 301 | |
| }, | |
| { | |
| "completion_length": 82.03125, | |
| "epoch": 0.3344407530454042, | |
| "grad_norm": 0.5959556698799133, | |
| "kl": 0.023193359375, | |
| "learning_rate": 8.1125e-07, | |
| "loss": 0.006475352216511965, | |
| "reward": 2.2507412433624268, | |
| "reward_std": 0.34599871933460236, | |
| "rewards/GDino": 0.8190558552742004, | |
| "rewards/GIT": 0.4401208460330963, | |
| "rewards/HPSv2": 0.26786041259765625, | |
| "rewards/ORM": 0.7237042486667633, | |
| "self_certainty_semantic": -25.6875, | |
| "self_certainty_token": -21.375, | |
| "step": 302 | |
| }, | |
| { | |
| "completion_length": 70.890625, | |
| "epoch": 0.33554817275747506, | |
| "grad_norm": 0.5255715847015381, | |
| "kl": 0.0272216796875, | |
| "learning_rate": 8.10625e-07, | |
| "loss": -0.001262905541807413, | |
| "reward": 1.8884990215301514, | |
| "reward_std": 0.4090830981731415, | |
| "rewards/GDino": 0.6540113091468811, | |
| "rewards/GIT": 0.17955374717712402, | |
| "rewards/HPSv2": 0.27841758728027344, | |
| "rewards/ORM": 0.7765165269374847, | |
| "self_certainty_semantic": -25.8125, | |
| "self_certainty_token": -23.1875, | |
| "step": 303 | |
| }, | |
| { | |
| "completion_length": 77.75, | |
| "epoch": 0.33665559246954596, | |
| "grad_norm": 0.4031485915184021, | |
| "kl": 0.019073486328125, | |
| "learning_rate": 8.1e-07, | |
| "loss": -0.0022556333569809794, | |
| "reward": 2.027117609977722, | |
| "reward_std": 0.46946775913238525, | |
| "rewards/GDino": 0.7257461845874786, | |
| "rewards/GIT": 0.417344406247139, | |
| "rewards/HPSv2": 0.2757759094238281, | |
| "rewards/ORM": 0.6082510948181152, | |
| "self_certainty_semantic": -25.8125, | |
| "self_certainty_token": -21.625, | |
| "step": 304 | |
| }, | |
| { | |
| "completion_length": 78.375, | |
| "epoch": 0.3377630121816168, | |
| "grad_norm": 1.838289737701416, | |
| "kl": 0.06536865234375, | |
| "learning_rate": 8.093749999999999e-07, | |
| "loss": -0.005356153065804392, | |
| "reward": 1.969985008239746, | |
| "reward_std": 0.3424445614218712, | |
| "rewards/GDino": 0.718147873878479, | |
| "rewards/GIT": 0.3563241511583328, | |
| "rewards/HPSv2": 0.2647056579589844, | |
| "rewards/ORM": 0.6308073699474335, | |
| "self_certainty_semantic": -25.8125, | |
| "self_certainty_token": -22.3125, | |
| "step": 305 | |
| }, | |
| { | |
| "completion_length": 89.703125, | |
| "epoch": 0.3388704318936877, | |
| "grad_norm": 1.6125619411468506, | |
| "kl": 0.03436279296875, | |
| "learning_rate": 8.087499999999999e-07, | |
| "loss": -0.00043197721242904663, | |
| "reward": 1.5673499703407288, | |
| "reward_std": 0.45585790276527405, | |
| "rewards/GDino": 0.6186152696609497, | |
| "rewards/GIT": 0.20087126642465591, | |
| "rewards/HPSv2": 0.2628669738769531, | |
| "rewards/ORM": 0.4849964678287506, | |
| "self_certainty_semantic": -25.875, | |
| "self_certainty_token": -22.375, | |
| "step": 306 | |
| }, | |
| { | |
| "completion_length": 81.390625, | |
| "epoch": 0.3399778516057586, | |
| "grad_norm": 1.2003884315490723, | |
| "kl": 0.01556396484375, | |
| "learning_rate": 8.08125e-07, | |
| "loss": -0.0018971394747495651, | |
| "reward": 2.113324999809265, | |
| "reward_std": 0.27119018137454987, | |
| "rewards/GDino": 0.7391741275787354, | |
| "rewards/GIT": 0.6144535839557648, | |
| "rewards/HPSv2": 0.25809288024902344, | |
| "rewards/ORM": 0.5016044676303864, | |
| "self_certainty_semantic": -25.9375, | |
| "self_certainty_token": -21.25, | |
| "step": 307 | |
| }, | |
| { | |
| "completion_length": 81.4375, | |
| "epoch": 0.34108527131782945, | |
| "grad_norm": 0.4800432324409485, | |
| "kl": 0.022216796875, | |
| "learning_rate": 8.075e-07, | |
| "loss": -0.004123867256566882, | |
| "reward": 2.46367871761322, | |
| "reward_std": 0.2835230827331543, | |
| "rewards/GDino": 0.9177083075046539, | |
| "rewards/GIT": 0.6127268970012665, | |
| "rewards/HPSv2": 0.2661018371582031, | |
| "rewards/ORM": 0.6671415567398071, | |
| "self_certainty_semantic": -25.875, | |
| "self_certainty_token": -21.1875, | |
| "step": 308 | |
| }, | |
| { | |
| "completion_length": 84.890625, | |
| "epoch": 0.34219269102990035, | |
| "grad_norm": 0.6846179962158203, | |
| "kl": 0.02520751953125, | |
| "learning_rate": 8.06875e-07, | |
| "loss": -0.00010142242535948753, | |
| "reward": 2.089680314064026, | |
| "reward_std": 0.2894028127193451, | |
| "rewards/GDino": 0.7455952167510986, | |
| "rewards/GIT": 0.3249504566192627, | |
| "rewards/HPSv2": 0.2928428649902344, | |
| "rewards/ORM": 0.7262917459011078, | |
| "self_certainty_semantic": -25.75, | |
| "self_certainty_token": -21.5625, | |
| "step": 309 | |
| }, | |
| { | |
| "completion_length": 78.71875, | |
| "epoch": 0.3433001107419712, | |
| "grad_norm": 0.5359929203987122, | |
| "kl": 0.03472900390625, | |
| "learning_rate": 8.0625e-07, | |
| "loss": -0.002803298644721508, | |
| "reward": 2.7762473821640015, | |
| "reward_std": 0.2040305808186531, | |
| "rewards/GDino": 0.9227638244628906, | |
| "rewards/GIT": 0.6611425876617432, | |
| "rewards/HPSv2": 0.2548408508300781, | |
| "rewards/ORM": 0.9375, | |
| "self_certainty_semantic": -25.875, | |
| "self_certainty_token": -21.375, | |
| "step": 310 | |
| }, | |
| { | |
| "completion_length": 70.71875, | |
| "epoch": 0.3444075304540421, | |
| "grad_norm": 0.42147886753082275, | |
| "kl": 0.016876220703125, | |
| "learning_rate": 8.05625e-07, | |
| "loss": -0.006496510934084654, | |
| "reward": 2.523200273513794, | |
| "reward_std": 0.2876928895711899, | |
| "rewards/GDino": 0.8453125357627869, | |
| "rewards/GIT": 0.5663794428110123, | |
| "rewards/HPSv2": 0.2585620880126953, | |
| "rewards/ORM": 0.8529461622238159, | |
| "self_certainty_semantic": -25.75, | |
| "self_certainty_token": -21.0, | |
| "step": 311 | |
| }, | |
| { | |
| "completion_length": 70.34375, | |
| "epoch": 0.34551495016611294, | |
| "grad_norm": 0.6237341165542603, | |
| "kl": 0.0382080078125, | |
| "learning_rate": 8.05e-07, | |
| "loss": 0.0011792382574640214, | |
| "reward": 2.307464361190796, | |
| "reward_std": 0.33230580389499664, | |
| "rewards/GDino": 0.8716782331466675, | |
| "rewards/GIT": 0.4658700153231621, | |
| "rewards/HPSv2": 0.2794170379638672, | |
| "rewards/ORM": 0.6904991567134857, | |
| "self_certainty_semantic": -25.75, | |
| "self_certainty_token": -21.5, | |
| "step": 312 | |
| }, | |
| { | |
| "completion_length": 81.25, | |
| "epoch": 0.34662236987818384, | |
| "grad_norm": 0.483153760433197, | |
| "kl": 0.024169921875, | |
| "learning_rate": 8.043749999999999e-07, | |
| "loss": 0.0035759536549448967, | |
| "reward": 1.6738215684890747, | |
| "reward_std": 0.3843349516391754, | |
| "rewards/GDino": 0.5876509547233582, | |
| "rewards/GIT": 0.29283201694488525, | |
| "rewards/HPSv2": 0.2635631561279297, | |
| "rewards/ORM": 0.5297753810882568, | |
| "self_certainty_semantic": -25.5625, | |
| "self_certainty_token": -21.1875, | |
| "step": 313 | |
| }, | |
| { | |
| "completion_length": 86.796875, | |
| "epoch": 0.3477297895902547, | |
| "grad_norm": 2.524872064590454, | |
| "kl": 0.04315185546875, | |
| "learning_rate": 8.037499999999999e-07, | |
| "loss": -0.013335694558918476, | |
| "reward": 1.9714076519012451, | |
| "reward_std": 0.2786209136247635, | |
| "rewards/GDino": 0.8052083551883698, | |
| "rewards/GIT": 0.45579826831817627, | |
| "rewards/HPSv2": 0.251068115234375, | |
| "rewards/ORM": 0.45933302491903305, | |
| "self_certainty_semantic": -25.5625, | |
| "self_certainty_token": -20.5625, | |
| "step": 314 | |
| }, | |
| { | |
| "completion_length": 77.65625, | |
| "epoch": 0.3488372093023256, | |
| "grad_norm": 0.6192425489425659, | |
| "kl": 0.02899169921875, | |
| "learning_rate": 8.031249999999999e-07, | |
| "loss": -4.310737131163478e-05, | |
| "reward": 2.4499722719192505, | |
| "reward_std": 0.2559589520096779, | |
| "rewards/GDino": 0.846875011920929, | |
| "rewards/GIT": 0.6022514998912811, | |
| "rewards/HPSv2": 0.27861595153808594, | |
| "rewards/ORM": 0.7222296893596649, | |
| "self_certainty_semantic": -25.8125, | |
| "self_certainty_token": -21.9375, | |
| "step": 315 | |
| }, | |
| { | |
| "completion_length": 78.171875, | |
| "epoch": 0.34994462901439644, | |
| "grad_norm": 0.5415306091308594, | |
| "kl": 0.0198974609375, | |
| "learning_rate": 8.024999999999999e-07, | |
| "loss": 0.00023352215066552162, | |
| "reward": 2.4360201358795166, | |
| "reward_std": 0.3337967246770859, | |
| "rewards/GDino": 0.7447916269302368, | |
| "rewards/GIT": 0.5895867943763733, | |
| "rewards/HPSv2": 0.2578916549682617, | |
| "rewards/ORM": 0.84375, | |
| "self_certainty_semantic": -25.6875, | |
| "self_certainty_token": -22.1875, | |
| "step": 316 | |
| }, | |
| { | |
| "completion_length": 85.984375, | |
| "epoch": 0.35105204872646734, | |
| "grad_norm": 0.9716246128082275, | |
| "kl": 0.05963134765625, | |
| "learning_rate": 8.018749999999999e-07, | |
| "loss": 0.017504149582237005, | |
| "reward": 1.6575236916542053, | |
| "reward_std": 0.36309416592121124, | |
| "rewards/GDino": 0.6602314710617065, | |
| "rewards/GIT": 0.22271956503391266, | |
| "rewards/HPSv2": 0.28050994873046875, | |
| "rewards/ORM": 0.49406272172927856, | |
| "self_certainty_semantic": -25.75, | |
| "self_certainty_token": -21.3125, | |
| "step": 317 | |
| }, | |
| { | |
| "completion_length": 79.8125, | |
| "epoch": 0.3521594684385382, | |
| "grad_norm": 0.6221381425857544, | |
| "kl": 0.0673828125, | |
| "learning_rate": 8.0125e-07, | |
| "loss": -0.015689235646277666, | |
| "reward": 2.1572564244270325, | |
| "reward_std": 0.41349759697914124, | |
| "rewards/GDino": 0.7671919465065002, | |
| "rewards/GIT": 0.45393163710832596, | |
| "rewards/HPSv2": 0.27384376525878906, | |
| "rewards/ORM": 0.662289023399353, | |
| "self_certainty_semantic": -25.8125, | |
| "self_certainty_token": -21.6875, | |
| "step": 318 | |
| }, | |
| { | |
| "completion_length": 79.3125, | |
| "epoch": 0.3532668881506091, | |
| "grad_norm": 0.41829535365104675, | |
| "kl": 0.023162841796875, | |
| "learning_rate": 8.00625e-07, | |
| "loss": -0.00858006183989346, | |
| "reward": 1.9483414888381958, | |
| "reward_std": 0.3017558604478836, | |
| "rewards/GDino": 0.6993304491043091, | |
| "rewards/GIT": 0.268573135137558, | |
| "rewards/HPSv2": 0.2617378234863281, | |
| "rewards/ORM": 0.7187000811100006, | |
| "self_certainty_semantic": -25.8125, | |
| "self_certainty_token": -21.0, | |
| "step": 319 | |
| }, | |
| { | |
| "completion_length": 85.75, | |
| "epoch": 0.35437430786268, | |
| "grad_norm": 0.4535002112388611, | |
| "kl": 0.0247802734375, | |
| "learning_rate": 8e-07, | |
| "loss": -0.0024052427615970373, | |
| "reward": 2.0683862566947937, | |
| "reward_std": 0.33450697362422943, | |
| "rewards/GDino": 0.7158985733985901, | |
| "rewards/GIT": 0.5421321392059326, | |
| "rewards/HPSv2": 0.2665596008300781, | |
| "rewards/ORM": 0.5437959656119347, | |
| "self_certainty_semantic": -25.8125, | |
| "self_certainty_token": -21.75, | |
| "step": 320 | |
| }, | |
| { | |
| "completion_length": 81.296875, | |
| "epoch": 0.3554817275747508, | |
| "grad_norm": 0.5315317511558533, | |
| "kl": 0.026611328125, | |
| "learning_rate": 7.993749999999999e-07, | |
| "loss": -0.009409249760210514, | |
| "reward": 2.1791539788246155, | |
| "reward_std": 0.3816552609205246, | |
| "rewards/GDino": 0.8082575500011444, | |
| "rewards/GIT": 0.4406343102455139, | |
| "rewards/HPSv2": 0.26690101623535156, | |
| "rewards/ORM": 0.6633611619472504, | |
| "self_certainty_semantic": -25.8125, | |
| "self_certainty_token": -21.0, | |
| "step": 321 | |
| }, | |
| { | |
| "completion_length": 62.4375, | |
| "epoch": 0.35658914728682173, | |
| "grad_norm": 0.528831958770752, | |
| "kl": 0.02685546875, | |
| "learning_rate": 7.9875e-07, | |
| "loss": 0.016546542290598154, | |
| "reward": 2.6317901611328125, | |
| "reward_std": 0.2945253401994705, | |
| "rewards/GDino": 0.903124988079071, | |
| "rewards/GIT": 0.6644484996795654, | |
| "rewards/HPSv2": 0.2814922332763672, | |
| "rewards/ORM": 0.7827245891094208, | |
| "self_certainty_semantic": -25.8125, | |
| "self_certainty_token": -21.8125, | |
| "step": 322 | |
| }, | |
| { | |
| "completion_length": 70.6875, | |
| "epoch": 0.3576965669988926, | |
| "grad_norm": 0.48459064960479736, | |
| "kl": 0.034423828125, | |
| "learning_rate": 7.98125e-07, | |
| "loss": 0.0045278145698830485, | |
| "reward": 2.248463988304138, | |
| "reward_std": 0.21271561086177826, | |
| "rewards/GDino": 0.792187511920929, | |
| "rewards/GIT": 0.5167302191257477, | |
| "rewards/HPSv2": 0.2770881652832031, | |
| "rewards/ORM": 0.6624580770730972, | |
| "self_certainty_semantic": -25.6875, | |
| "self_certainty_token": -21.75, | |
| "step": 323 | |
| }, | |
| { | |
| "completion_length": 78.078125, | |
| "epoch": 0.3588039867109635, | |
| "grad_norm": 0.9986751675605774, | |
| "kl": 0.03704833984375, | |
| "learning_rate": 7.975e-07, | |
| "loss": 0.0015258773928508162, | |
| "reward": 2.2912731170654297, | |
| "reward_std": 0.2479892298579216, | |
| "rewards/GDino": 0.8150812387466431, | |
| "rewards/GIT": 0.5762419700622559, | |
| "rewards/HPSv2": 0.2679405212402344, | |
| "rewards/ORM": 0.6320093274116516, | |
| "self_certainty_semantic": -25.9375, | |
| "self_certainty_token": -20.0, | |
| "step": 324 | |
| }, | |
| { | |
| "completion_length": 87.3125, | |
| "epoch": 0.3599114064230343, | |
| "grad_norm": 0.5009543299674988, | |
| "kl": 0.015380859375, | |
| "learning_rate": 7.96875e-07, | |
| "loss": 0.00816858746111393, | |
| "reward": 2.3778595328330994, | |
| "reward_std": 0.43483346700668335, | |
| "rewards/GDino": 0.8713316917419434, | |
| "rewards/GIT": 0.5178200602531433, | |
| "rewards/HPSv2": 0.24491596221923828, | |
| "rewards/ORM": 0.743791937828064, | |
| "self_certainty_semantic": -26.125, | |
| "self_certainty_token": -22.6875, | |
| "step": 325 | |
| }, | |
| { | |
| "completion_length": 80.640625, | |
| "epoch": 0.3610188261351052, | |
| "grad_norm": 4.34770296604459e+17, | |
| "kl": 2.1532835718365184e+16, | |
| "learning_rate": 7.9625e-07, | |
| "loss": 215359592333312.0, | |
| "reward": 2.1344000101089478, | |
| "reward_std": 0.1903739497065544, | |
| "rewards/GDino": 0.8049311637878418, | |
| "rewards/GIT": 0.44934192299842834, | |
| "rewards/HPSv2": 0.270751953125, | |
| "rewards/ORM": 0.609375, | |
| "self_certainty_semantic": -25.875, | |
| "self_certainty_token": -21.6875, | |
| "step": 326 | |
| }, | |
| { | |
| "completion_length": 71.703125, | |
| "epoch": 0.36212624584717606, | |
| "grad_norm": 0.591279149055481, | |
| "kl": 0.0194091796875, | |
| "learning_rate": 7.95625e-07, | |
| "loss": -0.0025365690235048532, | |
| "reward": 2.1500454545021057, | |
| "reward_std": 0.2505127191543579, | |
| "rewards/GDino": 0.6991045475006104, | |
| "rewards/GIT": 0.5611639469861984, | |
| "rewards/HPSv2": 0.27692222595214844, | |
| "rewards/ORM": 0.6128546893596649, | |
| "self_certainty_semantic": -25.8125, | |
| "self_certainty_token": -21.5, | |
| "step": 327 | |
| }, | |
| { | |
| "completion_length": 66.25, | |
| "epoch": 0.36323366555924697, | |
| "grad_norm": 0.49124178290367126, | |
| "kl": 0.0302734375, | |
| "learning_rate": 7.95e-07, | |
| "loss": 0.000966892926953733, | |
| "reward": 2.234583079814911, | |
| "reward_std": 0.24727845191955566, | |
| "rewards/GDino": 0.8250000178813934, | |
| "rewards/GIT": 0.42907945811748505, | |
| "rewards/HPSv2": 0.3059501647949219, | |
| "rewards/ORM": 0.674553394317627, | |
| "self_certainty_semantic": -25.9375, | |
| "self_certainty_token": -22.1875, | |
| "step": 328 | |
| }, | |
| { | |
| "completion_length": 94.390625, | |
| "epoch": 0.3643410852713178, | |
| "grad_norm": 0.926494836807251, | |
| "kl": 0.015045166015625, | |
| "learning_rate": 7.94375e-07, | |
| "loss": 0.00036594929406419396, | |
| "reward": 2.232884407043457, | |
| "reward_std": 0.2973213344812393, | |
| "rewards/GDino": 0.7332743704319, | |
| "rewards/GIT": 0.4874793738126755, | |
| "rewards/HPSv2": 0.2777557373046875, | |
| "rewards/ORM": 0.7343749403953552, | |
| "self_certainty_semantic": -26.0625, | |
| "self_certainty_token": -21.75, | |
| "step": 329 | |
| }, | |
| { | |
| "completion_length": 76.640625, | |
| "epoch": 0.3654485049833887, | |
| "grad_norm": 0.4620281159877777, | |
| "kl": 0.02886962890625, | |
| "learning_rate": 7.937499999999999e-07, | |
| "loss": -0.004971426445990801, | |
| "reward": 2.011380136013031, | |
| "reward_std": 0.32141920924186707, | |
| "rewards/GDino": 0.727248340845108, | |
| "rewards/GIT": 0.3900819420814514, | |
| "rewards/HPSv2": 0.2651176452636719, | |
| "rewards/ORM": 0.6289321780204773, | |
| "self_certainty_semantic": -25.625, | |
| "self_certainty_token": -22.0, | |
| "step": 330 | |
| }, | |
| { | |
| "completion_length": 81.15625, | |
| "epoch": 0.36655592469545956, | |
| "grad_norm": 0.7666285634040833, | |
| "kl": 0.0321044921875, | |
| "learning_rate": 7.931249999999999e-07, | |
| "loss": 0.0011679597664624453, | |
| "reward": 2.653956890106201, | |
| "reward_std": 0.13854296877980232, | |
| "rewards/GDino": 0.885937511920929, | |
| "rewards/GIT": 0.7691447138786316, | |
| "rewards/HPSv2": 0.2644996643066406, | |
| "rewards/ORM": 0.734375, | |
| "self_certainty_semantic": -25.8125, | |
| "self_certainty_token": -21.375, | |
| "step": 331 | |
| }, | |
| { | |
| "completion_length": 88.3125, | |
| "epoch": 0.36766334440753046, | |
| "grad_norm": 0.5758018493652344, | |
| "kl": 0.01702880859375, | |
| "learning_rate": 7.924999999999999e-07, | |
| "loss": -0.0048087649047374725, | |
| "reward": 1.8430108428001404, | |
| "reward_std": 0.2730695307254791, | |
| "rewards/GDino": 0.7293344736099243, | |
| "rewards/GIT": 0.3678126037120819, | |
| "rewards/HPSv2": 0.2587604522705078, | |
| "rewards/ORM": 0.48710331320762634, | |
| "self_certainty_semantic": -25.8125, | |
| "self_certainty_token": -21.875, | |
| "step": 332 | |
| }, | |
| { | |
| "completion_length": 84.390625, | |
| "epoch": 0.3687707641196013, | |
| "grad_norm": 0.9169139266014099, | |
| "kl": 0.0267333984375, | |
| "learning_rate": 7.918749999999999e-07, | |
| "loss": -0.0021060709841549397, | |
| "reward": 2.4026538133621216, | |
| "reward_std": 0.2340044304728508, | |
| "rewards/GDino": 0.81524857878685, | |
| "rewards/GIT": 0.44082213938236237, | |
| "rewards/HPSv2": 0.28846168518066406, | |
| "rewards/ORM": 0.858121246099472, | |
| "self_certainty_semantic": -25.9375, | |
| "self_certainty_token": -21.5, | |
| "step": 333 | |
| }, | |
| { | |
| "completion_length": 76.578125, | |
| "epoch": 0.3698781838316722, | |
| "grad_norm": 0.5896627902984619, | |
| "kl": 0.032470703125, | |
| "learning_rate": 7.912499999999999e-07, | |
| "loss": 0.014832689426839352, | |
| "reward": 2.310631573200226, | |
| "reward_std": 0.39878983795642853, | |
| "rewards/GDino": 0.8589580357074738, | |
| "rewards/GIT": 0.5403054803609848, | |
| "rewards/HPSv2": 0.25203895568847656, | |
| "rewards/ORM": 0.6593290567398071, | |
| "self_certainty_semantic": -25.9375, | |
| "self_certainty_token": -21.5, | |
| "step": 334 | |
| }, | |
| { | |
| "completion_length": 81.125, | |
| "epoch": 0.3709856035437431, | |
| "grad_norm": 0.6335277557373047, | |
| "kl": 0.02618408203125, | |
| "learning_rate": 7.90625e-07, | |
| "loss": -0.005854415707290173, | |
| "reward": 1.9042538404464722, | |
| "reward_std": 0.37834763526916504, | |
| "rewards/GDino": 0.8026451170444489, | |
| "rewards/GIT": 0.3777245283126831, | |
| "rewards/HPSv2": 0.270538330078125, | |
| "rewards/ORM": 0.45334580540657043, | |
| "self_certainty_semantic": -25.8125, | |
| "self_certainty_token": -21.5, | |
| "step": 335 | |
| }, | |
| { | |
| "completion_length": 86.09375, | |
| "epoch": 0.37209302325581395, | |
| "grad_norm": 0.48027098178863525, | |
| "kl": 0.0247802734375, | |
| "learning_rate": 7.9e-07, | |
| "loss": -0.005090413382276893, | |
| "reward": 2.291101813316345, | |
| "reward_std": 0.407858744263649, | |
| "rewards/GDino": 0.7903645634651184, | |
| "rewards/GIT": 0.7357276082038879, | |
| "rewards/HPSv2": 0.2775554656982422, | |
| "rewards/ORM": 0.48745404183864594, | |
| "self_certainty_semantic": -26.0625, | |
| "self_certainty_token": -21.6875, | |
| "step": 336 | |
| }, | |
| { | |
| "completion_length": 78.0, | |
| "epoch": 0.37320044296788485, | |
| "grad_norm": 1.5485831499099731, | |
| "kl": 0.022857666015625, | |
| "learning_rate": 7.893750000000001e-07, | |
| "loss": -0.0022913700668141246, | |
| "reward": 2.0822601914405823, | |
| "reward_std": 0.3321680426597595, | |
| "rewards/GDino": 0.7484375536441803, | |
| "rewards/GIT": 0.39538896083831787, | |
| "rewards/HPSv2": 0.2601299285888672, | |
| "rewards/ORM": 0.6783038675785065, | |
| "self_certainty_semantic": -25.9375, | |
| "self_certainty_token": -21.625, | |
| "step": 337 | |
| }, | |
| { | |
| "completion_length": 76.46875, | |
| "epoch": 0.3743078626799557, | |
| "grad_norm": 0.4994075894355774, | |
| "kl": 0.02423095703125, | |
| "learning_rate": 7.8875e-07, | |
| "loss": 0.00693045777734369, | |
| "reward": 1.9913100004196167, | |
| "reward_std": 0.4366358071565628, | |
| "rewards/GDino": 0.7917910814285278, | |
| "rewards/GIT": 0.5114566683769226, | |
| "rewards/HPSv2": 0.26731109619140625, | |
| "rewards/ORM": 0.42075108736753464, | |
| "self_certainty_semantic": -25.875, | |
| "self_certainty_token": -21.3125, | |
| "step": 338 | |
| }, | |
| { | |
| "completion_length": 85.734375, | |
| "epoch": 0.3754152823920266, | |
| "grad_norm": 0.6367454528808594, | |
| "kl": 0.033203125, | |
| "learning_rate": 7.88125e-07, | |
| "loss": 0.005203233566135168, | |
| "reward": 2.3903461694717407, | |
| "reward_std": 0.3316587954759598, | |
| "rewards/GDino": 0.7497395575046539, | |
| "rewards/GIT": 0.5981788337230682, | |
| "rewards/HPSv2": 0.27332305908203125, | |
| "rewards/ORM": 0.7691046893596649, | |
| "self_certainty_semantic": -25.8125, | |
| "self_certainty_token": -22.0625, | |
| "step": 339 | |
| }, | |
| { | |
| "completion_length": 73.515625, | |
| "epoch": 0.37652270210409744, | |
| "grad_norm": 0.44640496373176575, | |
| "kl": 0.02545166015625, | |
| "learning_rate": 7.875e-07, | |
| "loss": -0.0023915348574519157, | |
| "reward": 2.3552005290985107, | |
| "reward_std": 0.34541261196136475, | |
| "rewards/GDino": 0.7578125, | |
| "rewards/GIT": 0.4575464129447937, | |
| "rewards/HPSv2": 0.27921295166015625, | |
| "rewards/ORM": 0.8606287837028503, | |
| "self_certainty_semantic": -25.75, | |
| "self_certainty_token": -21.375, | |
| "step": 340 | |
| }, | |
| { | |
| "completion_length": 76.9375, | |
| "epoch": 0.37763012181616834, | |
| "grad_norm": 0.7208606600761414, | |
| "kl": 0.04327392578125, | |
| "learning_rate": 7.86875e-07, | |
| "loss": 0.006299447733908892, | |
| "reward": 1.769343376159668, | |
| "reward_std": 0.4668227732181549, | |
| "rewards/GDino": 0.6656007468700409, | |
| "rewards/GIT": 0.23980124294757843, | |
| "rewards/HPSv2": 0.25531768798828125, | |
| "rewards/ORM": 0.6086236536502838, | |
| "self_certainty_semantic": -25.5625, | |
| "self_certainty_token": -21.375, | |
| "step": 341 | |
| }, | |
| { | |
| "completion_length": 78.59375, | |
| "epoch": 0.3787375415282392, | |
| "grad_norm": 0.4430847465991974, | |
| "kl": 0.03216552734375, | |
| "learning_rate": 7.8625e-07, | |
| "loss": -0.005237360019236803, | |
| "reward": 2.1384899616241455, | |
| "reward_std": 0.3752764165401459, | |
| "rewards/GDino": 0.7345833480358124, | |
| "rewards/GIT": 0.4388365373015404, | |
| "rewards/HPSv2": 0.2710113525390625, | |
| "rewards/ORM": 0.694058746099472, | |
| "self_certainty_semantic": -25.75, | |
| "self_certainty_token": -21.9375, | |
| "step": 342 | |
| }, | |
| { | |
| "completion_length": 65.34375, | |
| "epoch": 0.3798449612403101, | |
| "grad_norm": 0.4848972260951996, | |
| "kl": 0.0263671875, | |
| "learning_rate": 7.85625e-07, | |
| "loss": 0.013073518872261047, | |
| "reward": 2.126249670982361, | |
| "reward_std": 0.38274161517620087, | |
| "rewards/GDino": 0.8289884030818939, | |
| "rewards/GIT": 0.2857021316885948, | |
| "rewards/HPSv2": 0.2662925720214844, | |
| "rewards/ORM": 0.7452665567398071, | |
| "self_certainty_semantic": -25.8125, | |
| "self_certainty_token": -21.375, | |
| "step": 343 | |
| }, | |
| { | |
| "completion_length": 75.0, | |
| "epoch": 0.38095238095238093, | |
| "grad_norm": 2.974632501602173, | |
| "kl": 0.0523681640625, | |
| "learning_rate": 7.85e-07, | |
| "loss": -0.007122250506654382, | |
| "reward": 2.442666530609131, | |
| "reward_std": 0.3234139457345009, | |
| "rewards/GDino": 0.830729216337204, | |
| "rewards/GIT": 0.7763712406158447, | |
| "rewards/HPSv2": 0.2387371063232422, | |
| "rewards/ORM": 0.5968290418386459, | |
| "self_certainty_semantic": -25.8125, | |
| "self_certainty_token": -22.0625, | |
| "step": 344 | |
| }, | |
| { | |
| "completion_length": 87.53125, | |
| "epoch": 0.38205980066445183, | |
| "grad_norm": 0.3997608721256256, | |
| "kl": 0.02191162109375, | |
| "learning_rate": 7.84375e-07, | |
| "loss": 0.009387207683175802, | |
| "reward": 1.6372931599617004, | |
| "reward_std": 0.5579112768173218, | |
| "rewards/GDino": 0.6632326543331146, | |
| "rewards/GIT": 0.2669598236680031, | |
| "rewards/HPSv2": 0.2503662109375, | |
| "rewards/ORM": 0.4567345529794693, | |
| "self_certainty_semantic": -25.5625, | |
| "self_certainty_token": -22.25, | |
| "step": 345 | |
| }, | |
| { | |
| "completion_length": 78.671875, | |
| "epoch": 0.3831672203765227, | |
| "grad_norm": 0.5440374612808228, | |
| "kl": 0.023681640625, | |
| "learning_rate": 7.837499999999999e-07, | |
| "loss": -0.004060751176439226, | |
| "reward": 2.5331764221191406, | |
| "reward_std": 0.30825961381196976, | |
| "rewards/GDino": 0.7956249713897705, | |
| "rewards/GIT": 0.6618549525737762, | |
| "rewards/HPSv2": 0.26337623596191406, | |
| "rewards/ORM": 0.8123202323913574, | |
| "self_certainty_semantic": -25.8125, | |
| "self_certainty_token": -22.25, | |
| "step": 346 | |
| }, | |
| { | |
| "completion_length": 78.859375, | |
| "epoch": 0.3842746400885936, | |
| "grad_norm": 0.505639374256134, | |
| "kl": 0.027587890625, | |
| "learning_rate": 7.831249999999999e-07, | |
| "loss": 0.006256320746615529, | |
| "reward": 1.9418827891349792, | |
| "reward_std": 0.39945825934410095, | |
| "rewards/GDino": 0.6593631207942963, | |
| "rewards/GIT": 0.31888166069984436, | |
| "rewards/HPSv2": 0.2855548858642578, | |
| "rewards/ORM": 0.678083062171936, | |
| "self_certainty_semantic": -25.8125, | |
| "self_certainty_token": -22.3125, | |
| "step": 347 | |
| }, | |
| { | |
| "completion_length": 83.796875, | |
| "epoch": 0.3853820598006645, | |
| "grad_norm": 0.6219062209129333, | |
| "kl": 0.02642822265625, | |
| "learning_rate": 7.824999999999999e-07, | |
| "loss": 0.004094981588423252, | |
| "reward": 2.0063390731811523, | |
| "reward_std": 0.3849373310804367, | |
| "rewards/GDino": 0.7137661874294281, | |
| "rewards/GIT": 0.38513660430908203, | |
| "rewards/HPSv2": 0.2584552764892578, | |
| "rewards/ORM": 0.6489809155464172, | |
| "self_certainty_semantic": -26.0625, | |
| "self_certainty_token": -20.5, | |
| "step": 348 | |
| }, | |
| { | |
| "completion_length": 92.359375, | |
| "epoch": 0.3864894795127353, | |
| "grad_norm": 0.6767128705978394, | |
| "kl": 0.0465087890625, | |
| "learning_rate": 7.818749999999999e-07, | |
| "loss": 0.00501623225864023, | |
| "reward": 1.9770677089691162, | |
| "reward_std": 0.3419999033212662, | |
| "rewards/GDino": 0.7295942902565002, | |
| "rewards/GIT": 0.3469092845916748, | |
| "rewards/HPSv2": 0.2744102478027344, | |
| "rewards/ORM": 0.6261538863182068, | |
| "self_certainty_semantic": -25.875, | |
| "self_certainty_token": -22.625, | |
| "step": 349 | |
| }, | |
| { | |
| "completion_length": 85.40625, | |
| "epoch": 0.3875968992248062, | |
| "grad_norm": 0.904482364654541, | |
| "kl": 0.0482177734375, | |
| "learning_rate": 7.812499999999999e-07, | |
| "loss": 0.008412390714511275, | |
| "reward": 2.1975533962249756, | |
| "reward_std": 0.3165140748023987, | |
| "rewards/GDino": 0.7537752985954285, | |
| "rewards/GIT": 0.41514359414577484, | |
| "rewards/HPSv2": 0.28171348571777344, | |
| "rewards/ORM": 0.7469209432601929, | |
| "self_certainty_semantic": -25.9375, | |
| "self_certainty_token": -21.375, | |
| "step": 350 | |
| }, | |
| { | |
| "completion_length": 82.625, | |
| "epoch": 0.38870431893687707, | |
| "grad_norm": 0.39913347363471985, | |
| "kl": 0.0225830078125, | |
| "learning_rate": 7.806249999999999e-07, | |
| "loss": 0.0012628886615857482, | |
| "reward": 2.383103609085083, | |
| "reward_std": 0.39389656484127045, | |
| "rewards/GDino": 0.855949878692627, | |
| "rewards/GIT": 0.4103764295578003, | |
| "rewards/HPSv2": 0.26708984375, | |
| "rewards/ORM": 0.8496872782707214, | |
| "self_certainty_semantic": -25.75, | |
| "self_certainty_token": -21.3125, | |
| "step": 351 | |
| }, | |
| { | |
| "completion_length": 84.828125, | |
| "epoch": 0.38981173864894797, | |
| "grad_norm": 0.6845752000808716, | |
| "kl": 0.03411865234375, | |
| "learning_rate": 7.799999999999999e-07, | |
| "loss": 0.012704014778137207, | |
| "reward": 2.0514729022979736, | |
| "reward_std": 0.38871151208877563, | |
| "rewards/GDino": 0.7418749928474426, | |
| "rewards/GIT": 0.30121954530477524, | |
| "rewards/HPSv2": 0.2696704864501953, | |
| "rewards/ORM": 0.7387077808380127, | |
| "self_certainty_semantic": -25.875, | |
| "self_certainty_token": -22.4375, | |
| "step": 352 | |
| }, | |
| { | |
| "completion_length": 76.53125, | |
| "epoch": 0.3909191583610188, | |
| "grad_norm": 0.5380039215087891, | |
| "kl": 0.0360107421875, | |
| "learning_rate": 7.793750000000001e-07, | |
| "loss": -0.00018121302127838135, | |
| "reward": 1.9169389605522156, | |
| "reward_std": 0.33840206265449524, | |
| "rewards/GDino": 0.5888020992279053, | |
| "rewards/GIT": 0.3554147705435753, | |
| "rewards/HPSv2": 0.27184486389160156, | |
| "rewards/ORM": 0.7008772194385529, | |
| "self_certainty_semantic": -25.6875, | |
| "self_certainty_token": -21.75, | |
| "step": 353 | |
| }, | |
| { | |
| "completion_length": 71.3125, | |
| "epoch": 0.3920265780730897, | |
| "grad_norm": 0.6473866105079651, | |
| "kl": 0.0369873046875, | |
| "learning_rate": 7.787500000000001e-07, | |
| "loss": 0.009591558366082609, | |
| "reward": 2.060921609401703, | |
| "reward_std": 0.5267337262630463, | |
| "rewards/GDino": 0.6770313084125519, | |
| "rewards/GIT": 0.3380723297595978, | |
| "rewards/HPSv2": 0.28131675720214844, | |
| "rewards/ORM": 0.7645011246204376, | |
| "self_certainty_semantic": -25.75, | |
| "self_certainty_token": -23.0, | |
| "step": 354 | |
| }, | |
| { | |
| "completion_length": 73.375, | |
| "epoch": 0.39313399778516056, | |
| "grad_norm": 1.9103351831436157, | |
| "kl": 0.03515625, | |
| "learning_rate": 7.78125e-07, | |
| "loss": 0.009471733821555972, | |
| "reward": 2.468719720840454, | |
| "reward_std": 0.3037705421447754, | |
| "rewards/GDino": 0.824194073677063, | |
| "rewards/GIT": 0.6634758412837982, | |
| "rewards/HPSv2": 0.2754173278808594, | |
| "rewards/ORM": 0.7056325078010559, | |
| "self_certainty_semantic": -25.875, | |
| "self_certainty_token": -22.25, | |
| "step": 355 | |
| }, | |
| { | |
| "completion_length": 81.640625, | |
| "epoch": 0.39424141749723146, | |
| "grad_norm": 0.47359681129455566, | |
| "kl": 0.0191650390625, | |
| "learning_rate": 7.775e-07, | |
| "loss": 0.01774417981505394, | |
| "reward": 2.1485215425491333, | |
| "reward_std": 0.37546610832214355, | |
| "rewards/GDino": 0.7694036066532135, | |
| "rewards/GIT": 0.2857285439968109, | |
| "rewards/HPSv2": 0.28088951110839844, | |
| "rewards/ORM": 0.8125, | |
| "self_certainty_semantic": -25.875, | |
| "self_certainty_token": -22.5, | |
| "step": 356 | |
| }, | |
| { | |
| "completion_length": 82.78125, | |
| "epoch": 0.3953488372093023, | |
| "grad_norm": 0.4700465500354767, | |
| "kl": 0.028564453125, | |
| "learning_rate": 7.76875e-07, | |
| "loss": -0.005447798175737262, | |
| "reward": 1.9955376386642456, | |
| "reward_std": 0.3978916108608246, | |
| "rewards/GDino": 0.7125872671604156, | |
| "rewards/GIT": 0.29513833671808243, | |
| "rewards/HPSv2": 0.2846870422363281, | |
| "rewards/ORM": 0.703125, | |
| "self_certainty_semantic": -26.0625, | |
| "self_certainty_token": -22.3125, | |
| "step": 357 | |
| }, | |
| { | |
| "completion_length": 84.15625, | |
| "epoch": 0.3964562569213732, | |
| "grad_norm": 0.5537763237953186, | |
| "kl": 0.04052734375, | |
| "learning_rate": 7.7625e-07, | |
| "loss": -0.005253390525467694, | |
| "reward": 2.2486732006073, | |
| "reward_std": 0.33913009613752365, | |
| "rewards/GDino": 0.7710938155651093, | |
| "rewards/GIT": 0.5208555310964584, | |
| "rewards/HPSv2": 0.2630157470703125, | |
| "rewards/ORM": 0.6937080323696136, | |
| "self_certainty_semantic": -26.0, | |
| "self_certainty_token": -20.4375, | |
| "step": 358 | |
| }, | |
| { | |
| "completion_length": 78.84375, | |
| "epoch": 0.39756367663344405, | |
| "grad_norm": 0.4036509692668915, | |
| "kl": 0.02471923828125, | |
| "learning_rate": 7.75625e-07, | |
| "loss": 0.004020060820039362, | |
| "reward": 1.3866900205612183, | |
| "reward_std": 0.45843446254730225, | |
| "rewards/GDino": 0.5991981625556946, | |
| "rewards/GIT": 0.18800346553325653, | |
| "rewards/HPSv2": 0.24882888793945312, | |
| "rewards/ORM": 0.35065943002700806, | |
| "self_certainty_semantic": -25.9375, | |
| "self_certainty_token": -21.9375, | |
| "step": 359 | |
| }, | |
| { | |
| "completion_length": 75.21875, | |
| "epoch": 0.39867109634551495, | |
| "grad_norm": 0.5162832736968994, | |
| "kl": 0.021728515625, | |
| "learning_rate": 7.75e-07, | |
| "loss": 0.0037150949938222766, | |
| "reward": 1.9392863512039185, | |
| "reward_std": 0.2793232351541519, | |
| "rewards/GDino": 0.701914981007576, | |
| "rewards/GIT": 0.5795804336667061, | |
| "rewards/HPSv2": 0.27355384826660156, | |
| "rewards/ORM": 0.3842370957136154, | |
| "self_certainty_semantic": -25.75, | |
| "self_certainty_token": -21.0, | |
| "step": 360 | |
| }, | |
| { | |
| "completion_length": 76.4375, | |
| "epoch": 0.3997785160575858, | |
| "grad_norm": 0.45564430952072144, | |
| "kl": 0.02691650390625, | |
| "learning_rate": 7.74375e-07, | |
| "loss": 0.0029305103234946728, | |
| "reward": 2.0641271471977234, | |
| "reward_std": 0.368464857339859, | |
| "rewards/GDino": 0.7391301393508911, | |
| "rewards/GIT": 0.3999723941087723, | |
| "rewards/HPSv2": 0.27350807189941406, | |
| "rewards/ORM": 0.6515165269374847, | |
| "self_certainty_semantic": -25.9375, | |
| "self_certainty_token": -20.6875, | |
| "step": 361 | |
| }, | |
| { | |
| "completion_length": 83.953125, | |
| "epoch": 0.4008859357696567, | |
| "grad_norm": 0.9035529494285583, | |
| "kl": 0.0772705078125, | |
| "learning_rate": 7.7375e-07, | |
| "loss": 0.003972394741140306, | |
| "reward": 1.934463918209076, | |
| "reward_std": 0.4396786019206047, | |
| "rewards/GDino": 0.7246715724468231, | |
| "rewards/GIT": 0.46111059188842773, | |
| "rewards/HPSv2": 0.28046226501464844, | |
| "rewards/ORM": 0.46821947395801544, | |
| "self_certainty_semantic": -25.875, | |
| "self_certainty_token": -21.75, | |
| "step": 362 | |
| }, | |
| { | |
| "completion_length": 92.75, | |
| "epoch": 0.4019933554817276, | |
| "grad_norm": 0.5282062888145447, | |
| "kl": 0.019317626953125, | |
| "learning_rate": 7.731249999999999e-07, | |
| "loss": -0.008066414389759302, | |
| "reward": 2.07886004447937, | |
| "reward_std": 0.4132898300886154, | |
| "rewards/GDino": 0.7536458075046539, | |
| "rewards/GIT": 0.4562801867723465, | |
| "rewards/HPSv2": 0.2626380920410156, | |
| "rewards/ORM": 0.6062959432601929, | |
| "self_certainty_semantic": -25.75, | |
| "self_certainty_token": -20.8125, | |
| "step": 363 | |
| }, | |
| { | |
| "completion_length": 83.484375, | |
| "epoch": 0.40310077519379844, | |
| "grad_norm": 0.3879307508468628, | |
| "kl": 0.04705810546875, | |
| "learning_rate": 7.724999999999999e-07, | |
| "loss": -0.0018501741578802466, | |
| "reward": 2.4320976734161377, | |
| "reward_std": 0.2695165127515793, | |
| "rewards/GDino": 0.796875, | |
| "rewards/GIT": 0.5824678391218185, | |
| "rewards/HPSv2": 0.28354835510253906, | |
| "rewards/ORM": 0.7692064642906189, | |
| "self_certainty_semantic": -25.8125, | |
| "self_certainty_token": -21.75, | |
| "step": 364 | |
| }, | |
| { | |
| "completion_length": 85.34375, | |
| "epoch": 0.40420819490586934, | |
| "grad_norm": 0.5758140683174133, | |
| "kl": 0.0352783203125, | |
| "learning_rate": 7.718749999999999e-07, | |
| "loss": 0.0057990700006484985, | |
| "reward": 2.019951283931732, | |
| "reward_std": 0.30692145973443985, | |
| "rewards/GDino": 0.6002083420753479, | |
| "rewards/GIT": 0.28720255196094513, | |
| "rewards/HPSv2": 0.27191162109375, | |
| "rewards/ORM": 0.8606287240982056, | |
| "self_certainty_semantic": -25.6875, | |
| "self_certainty_token": -21.6875, | |
| "step": 365 | |
| }, | |
| { | |
| "completion_length": 71.171875, | |
| "epoch": 0.4053156146179402, | |
| "grad_norm": 25.993791580200195, | |
| "kl": 0.9073486328125, | |
| "learning_rate": 7.712499999999999e-07, | |
| "loss": 0.010981484781950712, | |
| "reward": 1.686427891254425, | |
| "reward_std": 0.507771372795105, | |
| "rewards/GDino": 0.6281927824020386, | |
| "rewards/GIT": 0.19865846633911133, | |
| "rewards/HPSv2": 0.2666797637939453, | |
| "rewards/ORM": 0.5928968787193298, | |
| "self_certainty_semantic": -25.875, | |
| "self_certainty_token": -23.375, | |
| "step": 366 | |
| }, | |
| { | |
| "completion_length": 73.765625, | |
| "epoch": 0.4064230343300111, | |
| "grad_norm": 2.63863205909729, | |
| "kl": 0.03125, | |
| "learning_rate": 7.706249999999999e-07, | |
| "loss": 0.00012735230848193169, | |
| "reward": 2.2389276027679443, | |
| "reward_std": 0.4072023332118988, | |
| "rewards/GDino": 0.7541632354259491, | |
| "rewards/GIT": 0.45902618765830994, | |
| "rewards/HPSv2": 0.28417205810546875, | |
| "rewards/ORM": 0.741566002368927, | |
| "self_certainty_semantic": -25.875, | |
| "self_certainty_token": -22.25, | |
| "step": 367 | |
| }, | |
| { | |
| "completion_length": 72.296875, | |
| "epoch": 0.40753045404208194, | |
| "grad_norm": 0.935273289680481, | |
| "kl": 0.02752685546875, | |
| "learning_rate": 7.699999999999999e-07, | |
| "loss": 0.0032109934836626053, | |
| "reward": 2.2100645303726196, | |
| "reward_std": 0.24064482748508453, | |
| "rewards/GDino": 0.8498496413230896, | |
| "rewards/GIT": 0.5447873771190643, | |
| "rewards/HPSv2": 0.2853813171386719, | |
| "rewards/ORM": 0.5300461947917938, | |
| "self_certainty_semantic": -25.8125, | |
| "self_certainty_token": -21.875, | |
| "step": 368 | |
| }, | |
| { | |
| "completion_length": 77.703125, | |
| "epoch": 0.40863787375415284, | |
| "grad_norm": 0.8915330767631531, | |
| "kl": 0.1737060546875, | |
| "learning_rate": 7.69375e-07, | |
| "loss": -0.004847385222092271, | |
| "reward": 2.4846267104148865, | |
| "reward_std": 0.34741681814193726, | |
| "rewards/GDino": 0.8923705220222473, | |
| "rewards/GIT": 0.6399442255496979, | |
| "rewards/HPSv2": 0.2816905975341797, | |
| "rewards/ORM": 0.6706212610006332, | |
| "self_certainty_semantic": -26.0625, | |
| "self_certainty_token": -22.5625, | |
| "step": 369 | |
| }, | |
| { | |
| "completion_length": 78.828125, | |
| "epoch": 0.4097452934662237, | |
| "grad_norm": 0.4684216380119324, | |
| "kl": 0.03460693359375, | |
| "learning_rate": 7.6875e-07, | |
| "loss": 0.006017133709974587, | |
| "reward": 2.400377631187439, | |
| "reward_std": 0.3661707490682602, | |
| "rewards/GDino": 0.7559739351272583, | |
| "rewards/GIT": 0.5203657299280167, | |
| "rewards/HPSv2": 0.2681427001953125, | |
| "rewards/ORM": 0.8558953106403351, | |
| "self_certainty_semantic": -26.0, | |
| "self_certainty_token": -21.5, | |
| "step": 370 | |
| }, | |
| { | |
| "completion_length": 81.90625, | |
| "epoch": 0.4108527131782946, | |
| "grad_norm": 0.39840996265411377, | |
| "kl": 0.0191650390625, | |
| "learning_rate": 7.68125e-07, | |
| "loss": 0.007917450740933418, | |
| "reward": 2.394679307937622, | |
| "reward_std": 0.26683831214904785, | |
| "rewards/GDino": 0.8869791626930237, | |
| "rewards/GIT": 0.7277960479259491, | |
| "rewards/HPSv2": 0.2607994079589844, | |
| "rewards/ORM": 0.5191046893596649, | |
| "self_certainty_semantic": -26.0, | |
| "self_certainty_token": -22.0625, | |
| "step": 371 | |
| }, | |
| { | |
| "completion_length": 68.34375, | |
| "epoch": 0.4119601328903654, | |
| "grad_norm": 0.5618812441825867, | |
| "kl": 0.02484130859375, | |
| "learning_rate": 7.675e-07, | |
| "loss": 0.0053215608932077885, | |
| "reward": 1.9621981978416443, | |
| "reward_std": 0.3503369837999344, | |
| "rewards/GDino": 0.6614684462547302, | |
| "rewards/GIT": 0.44001519680023193, | |
| "rewards/HPSv2": 0.2732524871826172, | |
| "rewards/ORM": 0.5874620378017426, | |
| "self_certainty_semantic": -25.75, | |
| "self_certainty_token": -21.3125, | |
| "step": 372 | |
| }, | |
| { | |
| "completion_length": 77.453125, | |
| "epoch": 0.4130675526024363, | |
| "grad_norm": 0.44254979491233826, | |
| "kl": 0.0338134765625, | |
| "learning_rate": 7.66875e-07, | |
| "loss": 0.012260682880878448, | |
| "reward": 1.9219316244125366, | |
| "reward_std": 0.4732651710510254, | |
| "rewards/GDino": 0.720096230506897, | |
| "rewards/GIT": 0.2833433449268341, | |
| "rewards/HPSv2": 0.27478790283203125, | |
| "rewards/ORM": 0.6437040567398071, | |
| "self_certainty_semantic": -25.9375, | |
| "self_certainty_token": -21.9375, | |
| "step": 373 | |
| }, | |
| { | |
| "completion_length": 77.109375, | |
| "epoch": 0.4141749723145072, | |
| "grad_norm": 0.5283539295196533, | |
| "kl": 0.01953125, | |
| "learning_rate": 7.6625e-07, | |
| "loss": 0.00025802222080528736, | |
| "reward": 2.1784894466400146, | |
| "reward_std": 0.37603603303432465, | |
| "rewards/GDino": 0.8346986174583435, | |
| "rewards/GIT": 0.5240668132901192, | |
| "rewards/HPSv2": 0.2767810821533203, | |
| "rewards/ORM": 0.5429428815841675, | |
| "self_certainty_semantic": -25.8125, | |
| "self_certainty_token": -21.875, | |
| "step": 374 | |
| }, | |
| { | |
| "completion_length": 65.1875, | |
| "epoch": 0.4152823920265781, | |
| "grad_norm": 2.0193865299224854, | |
| "kl": 0.0350341796875, | |
| "learning_rate": 7.65625e-07, | |
| "loss": 0.00023457035422325134, | |
| "reward": 2.463273048400879, | |
| "reward_std": 0.16159752011299133, | |
| "rewards/GDino": 0.8671875, | |
| "rewards/GIT": 0.7136387228965759, | |
| "rewards/HPSv2": 0.2778053283691406, | |
| "rewards/ORM": 0.6046415567398071, | |
| "self_certainty_semantic": -25.9375, | |
| "self_certainty_token": -22.125, | |
| "step": 375 | |
| }, | |
| { | |
| "completion_length": 72.15625, | |
| "epoch": 0.416389811738649, | |
| "grad_norm": 0.6591690182685852, | |
| "kl": 0.0289306640625, | |
| "learning_rate": 7.65e-07, | |
| "loss": -0.0017245884519070387, | |
| "reward": 2.2452900409698486, | |
| "reward_std": 0.490168958902359, | |
| "rewards/GDino": 0.7729970812797546, | |
| "rewards/GIT": 0.4367716535925865, | |
| "rewards/HPSv2": 0.25975608825683594, | |
| "rewards/ORM": 0.7757652103900909, | |
| "self_certainty_semantic": -25.875, | |
| "self_certainty_token": -21.625, | |
| "step": 376 | |
| }, | |
| { | |
| "completion_length": 76.203125, | |
| "epoch": 0.4174972314507198, | |
| "grad_norm": 0.5766474604606628, | |
| "kl": 0.02191162109375, | |
| "learning_rate": 7.64375e-07, | |
| "loss": -0.0004694787785410881, | |
| "reward": 1.9068648219108582, | |
| "reward_std": 0.16421005129814148, | |
| "rewards/GDino": 0.6833183169364929, | |
| "rewards/GIT": 0.3441741615533829, | |
| "rewards/HPSv2": 0.26651763916015625, | |
| "rewards/ORM": 0.6128546595573425, | |
| "self_certainty_semantic": -26.0, | |
| "self_certainty_token": -21.0, | |
| "step": 377 | |
| }, | |
| { | |
| "completion_length": 86.546875, | |
| "epoch": 0.4186046511627907, | |
| "grad_norm": 0.43956464529037476, | |
| "kl": 0.02410888671875, | |
| "learning_rate": 7.6375e-07, | |
| "loss": 0.004958470817655325, | |
| "reward": 2.4844208359718323, | |
| "reward_std": 0.3209882155060768, | |
| "rewards/GDino": 0.7713975608348846, | |
| "rewards/GIT": 0.5327824056148529, | |
| "rewards/HPSv2": 0.28255462646484375, | |
| "rewards/ORM": 0.8976861834526062, | |
| "self_certainty_semantic": -26.0, | |
| "self_certainty_token": -21.0625, | |
| "step": 378 | |
| }, | |
| { | |
| "completion_length": 82.75, | |
| "epoch": 0.41971207087486156, | |
| "grad_norm": 1.2485466003417969, | |
| "kl": 0.05816650390625, | |
| "learning_rate": 7.63125e-07, | |
| "loss": -0.002994309877976775, | |
| "reward": 2.16240918636322, | |
| "reward_std": 0.2650664523243904, | |
| "rewards/GDino": 0.776936948299408, | |
| "rewards/GIT": 0.5451656579971313, | |
| "rewards/HPSv2": 0.2638359069824219, | |
| "rewards/ORM": 0.5764705985784531, | |
| "self_certainty_semantic": -25.8125, | |
| "self_certainty_token": -22.0, | |
| "step": 379 | |
| }, | |
| { | |
| "completion_length": 85.890625, | |
| "epoch": 0.42081949058693247, | |
| "grad_norm": 0.6214731931686401, | |
| "kl": 0.0277099609375, | |
| "learning_rate": 7.624999999999999e-07, | |
| "loss": -0.00421567028388381, | |
| "reward": 1.8477151989936829, | |
| "reward_std": 0.32977864146232605, | |
| "rewards/GDino": 0.7990064024925232, | |
| "rewards/GIT": 0.5198927968740463, | |
| "rewards/HPSv2": 0.28627777099609375, | |
| "rewards/ORM": 0.2425382286310196, | |
| "self_certainty_semantic": -25.9375, | |
| "self_certainty_token": -21.5625, | |
| "step": 380 | |
| }, | |
| { | |
| "completion_length": 84.375, | |
| "epoch": 0.4219269102990033, | |
| "grad_norm": 0.7152870893478394, | |
| "kl": 0.02960205078125, | |
| "learning_rate": 7.618749999999999e-07, | |
| "loss": -0.00914668245241046, | |
| "reward": 2.547330617904663, | |
| "reward_std": 0.318975493311882, | |
| "rewards/GDino": 0.8286458253860474, | |
| "rewards/GIT": 0.724987268447876, | |
| "rewards/HPSv2": 0.2702140808105469, | |
| "rewards/ORM": 0.7234834134578705, | |
| "self_certainty_semantic": -25.875, | |
| "self_certainty_token": -20.4375, | |
| "step": 381 | |
| }, | |
| { | |
| "completion_length": 87.328125, | |
| "epoch": 0.4230343300110742, | |
| "grad_norm": 0.5311893820762634, | |
| "kl": 0.031005859375, | |
| "learning_rate": 7.612499999999999e-07, | |
| "loss": -0.01361012738198042, | |
| "reward": 2.263441562652588, | |
| "reward_std": 0.2558389827609062, | |
| "rewards/GDino": 0.7953124642372131, | |
| "rewards/GIT": 0.5018891543149948, | |
| "rewards/HPSv2": 0.27873992919921875, | |
| "rewards/ORM": 0.6875, | |
| "self_certainty_semantic": -26.0, | |
| "self_certainty_token": -20.5, | |
| "step": 382 | |
| }, | |
| { | |
| "completion_length": 76.890625, | |
| "epoch": 0.42414174972314506, | |
| "grad_norm": 0.6475590467453003, | |
| "kl": 0.02301025390625, | |
| "learning_rate": 7.606249999999999e-07, | |
| "loss": -0.001263815094716847, | |
| "reward": 2.1975873708724976, | |
| "reward_std": 0.3748020529747009, | |
| "rewards/GDino": 0.7668351829051971, | |
| "rewards/GIT": 0.5400517359375954, | |
| "rewards/HPSv2": 0.2606163024902344, | |
| "rewards/ORM": 0.6300841569900513, | |
| "self_certainty_semantic": -25.9375, | |
| "self_certainty_token": -22.0625, | |
| "step": 383 | |
| }, | |
| { | |
| "completion_length": 83.34375, | |
| "epoch": 0.42524916943521596, | |
| "grad_norm": 0.507254421710968, | |
| "kl": 0.02801513671875, | |
| "learning_rate": 7.599999999999999e-07, | |
| "loss": -0.0008333073928952217, | |
| "reward": 2.077458620071411, | |
| "reward_std": 0.3855893015861511, | |
| "rewards/GDino": 0.7702149152755737, | |
| "rewards/GIT": 0.5419944673776627, | |
| "rewards/HPSv2": 0.2733325958251953, | |
| "rewards/ORM": 0.4919167459011078, | |
| "self_certainty_semantic": -25.875, | |
| "self_certainty_token": -21.9375, | |
| "step": 384 | |
| }, | |
| { | |
| "completion_length": 85.40625, | |
| "epoch": 0.4263565891472868, | |
| "grad_norm": 0.4501701593399048, | |
| "kl": 0.03662109375, | |
| "learning_rate": 7.59375e-07, | |
| "loss": 0.005623898236081004, | |
| "reward": 2.138138771057129, | |
| "reward_std": 0.31487397849559784, | |
| "rewards/GDino": 0.7385719418525696, | |
| "rewards/GIT": 0.3306322917342186, | |
| "rewards/HPSv2": 0.2595634460449219, | |
| "rewards/ORM": 0.8093710243701935, | |
| "self_certainty_semantic": -25.8125, | |
| "self_certainty_token": -21.25, | |
| "step": 385 | |
| }, | |
| { | |
| "completion_length": 74.640625, | |
| "epoch": 0.4274640088593577, | |
| "grad_norm": 1.0451889038085938, | |
| "kl": 0.03448486328125, | |
| "learning_rate": 7.5875e-07, | |
| "loss": 0.008310600649565458, | |
| "reward": 2.5433003902435303, | |
| "reward_std": 0.3450489193201065, | |
| "rewards/GDino": 0.8338541686534882, | |
| "rewards/GIT": 0.5790800303220749, | |
| "rewards/HPSv2": 0.2816619873046875, | |
| "rewards/ORM": 0.8487042486667633, | |
| "self_certainty_semantic": -25.875, | |
| "self_certainty_token": -21.9375, | |
| "step": 386 | |
| }, | |
| { | |
| "completion_length": 73.375, | |
| "epoch": 0.42857142857142855, | |
| "grad_norm": 0.5248279571533203, | |
| "kl": 0.0316162109375, | |
| "learning_rate": 7.58125e-07, | |
| "loss": -0.004152324283495545, | |
| "reward": 2.282692015171051, | |
| "reward_std": 0.3422551453113556, | |
| "rewards/GDino": 0.8517140746116638, | |
| "rewards/GIT": 0.49808675050735474, | |
| "rewards/HPSv2": 0.2647666931152344, | |
| "rewards/ORM": 0.6681245267391205, | |
| "self_certainty_semantic": -25.875, | |
| "self_certainty_token": -21.1875, | |
| "step": 387 | |
| }, | |
| { | |
| "completion_length": 74.078125, | |
| "epoch": 0.42967884828349945, | |
| "grad_norm": 0.38542476296424866, | |
| "kl": 0.013763427734375, | |
| "learning_rate": 7.575e-07, | |
| "loss": -0.010505566373467445, | |
| "reward": 2.6636276245117188, | |
| "reward_std": 0.2106212005019188, | |
| "rewards/GDino": 0.8841937184333801, | |
| "rewards/GIT": 0.7084439396858215, | |
| "rewards/HPSv2": 0.2803230285644531, | |
| "rewards/ORM": 0.790666937828064, | |
| "self_certainty_semantic": -25.8125, | |
| "self_certainty_token": -22.25, | |
| "step": 388 | |
| }, | |
| { | |
| "completion_length": 76.390625, | |
| "epoch": 0.43078626799557035, | |
| "grad_norm": 0.6815649271011353, | |
| "kl": 0.03594970703125, | |
| "learning_rate": 7.56875e-07, | |
| "loss": -0.0032428253907710314, | |
| "reward": 2.090184509754181, | |
| "reward_std": 0.5343265533447266, | |
| "rewards/GDino": 0.7562500238418579, | |
| "rewards/GIT": 0.44777335971593857, | |
| "rewards/HPSv2": 0.26241493225097656, | |
| "rewards/ORM": 0.623746246099472, | |
| "self_certainty_semantic": -25.875, | |
| "self_certainty_token": -21.25, | |
| "step": 389 | |
| }, | |
| { | |
| "completion_length": 78.015625, | |
| "epoch": 0.4318936877076412, | |
| "grad_norm": 0.4324207901954651, | |
| "kl": 0.01995849609375, | |
| "learning_rate": 7.5625e-07, | |
| "loss": -1.2263190001249313e-06, | |
| "reward": 2.483983278274536, | |
| "reward_std": 0.22743677347898483, | |
| "rewards/GDino": 0.8252314329147339, | |
| "rewards/GIT": 0.6573226451873779, | |
| "rewards/HPSv2": 0.26531982421875, | |
| "rewards/ORM": 0.7361093312501907, | |
| "self_certainty_semantic": -26.0, | |
| "self_certainty_token": -21.25, | |
| "step": 390 | |
| }, | |
| { | |
| "completion_length": 78.890625, | |
| "epoch": 0.4330011074197121, | |
| "grad_norm": 0.40318140387535095, | |
| "kl": 0.0306396484375, | |
| "learning_rate": 7.55625e-07, | |
| "loss": 0.01058862719219178, | |
| "reward": 2.4253766536712646, | |
| "reward_std": 0.247165247797966, | |
| "rewards/GDino": 0.9132010638713837, | |
| "rewards/GIT": 0.6424136161804199, | |
| "rewards/HPSv2": 0.25565338134765625, | |
| "rewards/ORM": 0.6141084432601929, | |
| "self_certainty_semantic": -25.8125, | |
| "self_certainty_token": -22.375, | |
| "step": 391 | |
| }, | |
| { | |
| "completion_length": 87.84375, | |
| "epoch": 0.43410852713178294, | |
| "grad_norm": 0.506926417350769, | |
| "kl": 0.02825927734375, | |
| "learning_rate": 7.55e-07, | |
| "loss": 0.005374419270083308, | |
| "reward": 2.0307316184043884, | |
| "reward_std": 0.41478703916072845, | |
| "rewards/GDino": 0.7457704544067383, | |
| "rewards/GIT": 0.3928220123052597, | |
| "rewards/HPSv2": 0.2746009826660156, | |
| "rewards/ORM": 0.617538183927536, | |
| "self_certainty_semantic": -25.8125, | |
| "self_certainty_token": -21.4375, | |
| "step": 392 | |
| }, | |
| { | |
| "completion_length": 68.0, | |
| "epoch": 0.43521594684385384, | |
| "grad_norm": 0.4594443142414093, | |
| "kl": 0.037353515625, | |
| "learning_rate": 7.54375e-07, | |
| "loss": 0.003534393385052681, | |
| "reward": 2.255183219909668, | |
| "reward_std": 0.19336728006601334, | |
| "rewards/GDino": 0.8642210066318512, | |
| "rewards/GIT": 0.47932907938957214, | |
| "rewards/HPSv2": 0.28462791442871094, | |
| "rewards/ORM": 0.6270051002502441, | |
| "self_certainty_semantic": -25.75, | |
| "self_certainty_token": -22.875, | |
| "step": 393 | |
| }, | |
| { | |
| "completion_length": 82.0, | |
| "epoch": 0.4363233665559247, | |
| "grad_norm": 0.5013278126716614, | |
| "kl": 0.02581787109375, | |
| "learning_rate": 7.5375e-07, | |
| "loss": -0.0056536816991865635, | |
| "reward": 2.371790647506714, | |
| "reward_std": 0.19828242808580399, | |
| "rewards/GDino": 0.8962720036506653, | |
| "rewards/GIT": 0.4048049747943878, | |
| "rewards/HPSv2": 0.2725849151611328, | |
| "rewards/ORM": 0.798128753900528, | |
| "self_certainty_semantic": -26.0, | |
| "self_certainty_token": -23.0625, | |
| "step": 394 | |
| }, | |
| { | |
| "completion_length": 79.140625, | |
| "epoch": 0.4374307862679956, | |
| "grad_norm": 0.7290600538253784, | |
| "kl": 0.071044921875, | |
| "learning_rate": 7.53125e-07, | |
| "loss": -0.0031574114691466093, | |
| "reward": 2.314516067504883, | |
| "reward_std": 0.3586508333683014, | |
| "rewards/GDino": 0.8972340226173401, | |
| "rewards/GIT": 0.5144131779670715, | |
| "rewards/HPSv2": 0.27438926696777344, | |
| "rewards/ORM": 0.6284796893596649, | |
| "self_certainty_semantic": -25.9375, | |
| "self_certainty_token": -21.1875, | |
| "step": 395 | |
| }, | |
| { | |
| "completion_length": 76.109375, | |
| "epoch": 0.43853820598006643, | |
| "grad_norm": 0.6519272327423096, | |
| "kl": 0.0380859375, | |
| "learning_rate": 7.524999999999999e-07, | |
| "loss": -0.0067711935844272375, | |
| "reward": 2.2616201043128967, | |
| "reward_std": 0.38303279876708984, | |
| "rewards/GDino": 0.8086700439453125, | |
| "rewards/GIT": 0.43244993686676025, | |
| "rewards/HPSv2": 0.2732067108154297, | |
| "rewards/ORM": 0.7472934424877167, | |
| "self_certainty_semantic": -25.8125, | |
| "self_certainty_token": -21.1875, | |
| "step": 396 | |
| }, | |
| { | |
| "completion_length": 79.28125, | |
| "epoch": 0.43964562569213733, | |
| "grad_norm": 0.5021886229515076, | |
| "kl": 0.024169921875, | |
| "learning_rate": 7.518749999999999e-07, | |
| "loss": 0.004822302144020796, | |
| "reward": 2.49921977519989, | |
| "reward_std": 0.35611972212791443, | |
| "rewards/GDino": 0.884239137172699, | |
| "rewards/GIT": 0.6691879034042358, | |
| "rewards/HPSv2": 0.2534294128417969, | |
| "rewards/ORM": 0.6923633217811584, | |
| "self_certainty_semantic": -25.875, | |
| "self_certainty_token": -21.6875, | |
| "step": 397 | |
| }, | |
| { | |
| "completion_length": 71.9375, | |
| "epoch": 0.4407530454042082, | |
| "grad_norm": 0.7414788007736206, | |
| "kl": 0.05810546875, | |
| "learning_rate": 7.512499999999999e-07, | |
| "loss": -8.996110409498215e-05, | |
| "reward": 2.6626181602478027, | |
| "reward_std": 0.20449738949537277, | |
| "rewards/GDino": 0.7895833253860474, | |
| "rewards/GIT": 0.6252684891223907, | |
| "rewards/HPSv2": 0.27901649475097656, | |
| "rewards/ORM": 0.96875, | |
| "self_certainty_semantic": -25.875, | |
| "self_certainty_token": -21.0, | |
| "step": 398 | |
| }, | |
| { | |
| "completion_length": 84.78125, | |
| "epoch": 0.4418604651162791, | |
| "grad_norm": 0.5590401291847229, | |
| "kl": 0.03173828125, | |
| "learning_rate": 7.506249999999999e-07, | |
| "loss": -0.008660833351314068, | |
| "reward": 1.7432748675346375, | |
| "reward_std": 0.41746358573436737, | |
| "rewards/GDino": 0.5843387842178345, | |
| "rewards/GIT": 0.23352296650409698, | |
| "rewards/HPSv2": 0.2535381317138672, | |
| "rewards/ORM": 0.6718749701976776, | |
| "self_certainty_semantic": -25.75, | |
| "self_certainty_token": -21.1875, | |
| "step": 399 | |
| }, | |
| { | |
| "completion_length": 83.21875, | |
| "epoch": 0.4429678848283499, | |
| "grad_norm": 0.648642897605896, | |
| "kl": 0.0411376953125, | |
| "learning_rate": 7.5e-07, | |
| "loss": -0.0011475947685539722, | |
| "reward": 2.1194006204605103, | |
| "reward_std": 0.28101272881031036, | |
| "rewards/GDino": 0.7772284150123596, | |
| "rewards/GIT": 0.43165211379528046, | |
| "rewards/HPSv2": 0.2793121337890625, | |
| "rewards/ORM": 0.6312080323696136, | |
| "self_certainty_semantic": -25.75, | |
| "self_certainty_token": -21.0625, | |
| "step": 400 | |
| } | |
| ], | |
| "logging_steps": 1.0, | |
| "max_steps": 1600, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 2, | |
| "save_steps": 100, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 0.0, | |
| "train_batch_size": 1, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |