t2i-step100 / trainer_state.json
YhangChen's picture
Initial model files upload
df36ac8 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.11074197120708748,
"eval_steps": 500,
"global_step": 100,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"completion_length": 73.265625,
"epoch": 0.0011074197120708748,
"grad_norm": 0.47520893812179565,
"kl": 0.0,
"learning_rate": 9.99375e-07,
"loss": 0.000854941550642252,
"reward": 2.2648561000823975,
"reward_std": 0.32521533221006393,
"rewards/GDino": 0.84943026304245,
"rewards/GIT": 0.5776679813861847,
"rewards/HPSv2": 0.2639656066894531,
"rewards/ORM": 0.5737921893596649,
"self_certainty_semantic": -25.4375,
"self_certainty_token": -22.0,
"step": 1
},
{
"completion_length": 57.359375,
"epoch": 0.0022148394241417496,
"grad_norm": 0.7006784677505493,
"kl": 0.00151824951171875,
"learning_rate": 9.9875e-07,
"loss": 0.0010380030144006014,
"reward": 1.6890186071395874,
"reward_std": 0.5064275413751602,
"rewards/GDino": 0.7000000476837158,
"rewards/GIT": 0.161313958466053,
"rewards/HPSv2": 0.2509632110595703,
"rewards/ORM": 0.5767413973808289,
"self_certainty_semantic": -25.375,
"self_certainty_token": -20.5625,
"step": 2
},
{
"completion_length": 54.640625,
"epoch": 0.0033222591362126247,
"grad_norm": 0.5812113285064697,
"kl": 0.001556396484375,
"learning_rate": 9.98125e-07,
"loss": -0.0055133504793047905,
"reward": 1.5832943320274353,
"reward_std": 0.3882431983947754,
"rewards/GDino": 0.6165956258773804,
"rewards/GIT": 0.3970412313938141,
"rewards/HPSv2": 0.24474143981933594,
"rewards/ORM": 0.3249160535633564,
"self_certainty_semantic": -25.1875,
"self_certainty_token": -20.8125,
"step": 3
},
{
"completion_length": 63.578125,
"epoch": 0.004429678848283499,
"grad_norm": 0.6130731105804443,
"kl": 0.001605987548828125,
"learning_rate": 9.975e-07,
"loss": -0.005623435601592064,
"reward": 2.1563462018966675,
"reward_std": 0.3505118489265442,
"rewards/GDino": 0.8188963234424591,
"rewards/GIT": 0.4581628292798996,
"rewards/HPSv2": 0.24955368041992188,
"rewards/ORM": 0.6297334432601929,
"self_certainty_semantic": -25.5,
"self_certainty_token": -22.0,
"step": 4
},
{
"completion_length": 57.65625,
"epoch": 0.005537098560354375,
"grad_norm": 0.8068524599075317,
"kl": 0.00165557861328125,
"learning_rate": 9.968749999999999e-07,
"loss": -0.0018901200965046883,
"reward": 1.6294466853141785,
"reward_std": 0.3914882242679596,
"rewards/GDino": 0.6075743436813354,
"rewards/GIT": 0.2503758817911148,
"rewards/HPSv2": 0.2523918151855469,
"rewards/ORM": 0.5191046893596649,
"self_certainty_semantic": -25.4375,
"self_certainty_token": -21.8125,
"step": 5
},
{
"completion_length": 65.8125,
"epoch": 0.006644518272425249,
"grad_norm": 74728.3515625,
"kl": 228.00085067749023,
"learning_rate": 9.9625e-07,
"loss": 2.2879227567464113,
"reward": 2.15460866689682,
"reward_std": 0.18937285244464874,
"rewards/GDino": 0.7502027153968811,
"rewards/GIT": 0.4551280438899994,
"rewards/HPSv2": 0.2774028778076172,
"rewards/ORM": 0.671875,
"self_certainty_semantic": -25.3125,
"self_certainty_token": -22.625,
"step": 6
},
{
"completion_length": 65.640625,
"epoch": 0.007751937984496124,
"grad_norm": 0.9850716590881348,
"kl": 0.001739501953125,
"learning_rate": 9.956249999999999e-07,
"loss": -0.009785129223018885,
"reward": 1.6486687660217285,
"reward_std": 0.55589759349823,
"rewards/GDino": 0.5765624940395355,
"rewards/GIT": 0.15754839032888412,
"rewards/HPSv2": 0.2522296905517578,
"rewards/ORM": 0.6623281538486481,
"self_certainty_semantic": -25.5625,
"self_certainty_token": -22.1875,
"step": 7
},
{
"completion_length": 65.796875,
"epoch": 0.008859357696566999,
"grad_norm": 0.8074976801872253,
"kl": 0.001628875732421875,
"learning_rate": 9.95e-07,
"loss": 0.0002866658614948392,
"reward": 1.7531355023384094,
"reward_std": 0.3834189176559448,
"rewards/GDino": 0.7171875536441803,
"rewards/GIT": 0.3904750794172287,
"rewards/HPSv2": 0.2441272735595703,
"rewards/ORM": 0.4013456404209137,
"self_certainty_semantic": -25.4375,
"self_certainty_token": -20.5,
"step": 8
},
{
"completion_length": 61.53125,
"epoch": 0.009966777408637873,
"grad_norm": 0.5135362148284912,
"kl": 0.001628875732421875,
"learning_rate": 9.94375e-07,
"loss": -0.002820038120262325,
"reward": 2.1886491775512695,
"reward_std": 0.5042529106140137,
"rewards/GDino": 0.800000011920929,
"rewards/GIT": 0.3224633187055588,
"rewards/HPSv2": 0.2661018371582031,
"rewards/ORM": 0.8000838756561279,
"self_certainty_semantic": -25.4375,
"self_certainty_token": -21.5,
"step": 9
},
{
"completion_length": 66.9375,
"epoch": 0.01107419712070875,
"grad_norm": 1.035406231880188,
"kl": 0.001590728759765625,
"learning_rate": 9.9375e-07,
"loss": 0.010037540923804045,
"reward": 1.8388126492500305,
"reward_std": 0.385573148727417,
"rewards/GDino": 0.729426920413971,
"rewards/GIT": 0.47063055634498596,
"rewards/HPSv2": 0.25093841552734375,
"rewards/ORM": 0.3878167122602463,
"self_certainty_semantic": -25.375,
"self_certainty_token": -20.75,
"step": 10
},
{
"completion_length": 54.65625,
"epoch": 0.012181616832779624,
"grad_norm": 0.6659172773361206,
"kl": 0.00159454345703125,
"learning_rate": 9.93125e-07,
"loss": -0.010986692272126675,
"reward": 2.312160015106201,
"reward_std": 0.3424924612045288,
"rewards/GDino": 0.7864583432674408,
"rewards/GIT": 0.5519254580140114,
"rewards/HPSv2": 0.2634601593017578,
"rewards/ORM": 0.710316002368927,
"self_certainty_semantic": -25.3125,
"self_certainty_token": -20.9375,
"step": 11
},
{
"completion_length": 65.6875,
"epoch": 0.013289036544850499,
"grad_norm": 0.4100457727909088,
"kl": 0.00152587890625,
"learning_rate": 9.925e-07,
"loss": -0.0020649502985179424,
"reward": 1.831676721572876,
"reward_std": 0.37266574054956436,
"rewards/GDino": 0.6748343408107758,
"rewards/GIT": 0.3966377377510071,
"rewards/HPSv2": 0.2431049346923828,
"rewards/ORM": 0.5170995742082596,
"self_certainty_semantic": -25.5,
"self_certainty_token": -21.5,
"step": 12
},
{
"completion_length": 62.15625,
"epoch": 0.014396456256921373,
"grad_norm": 1.1354421377182007,
"kl": 0.0016326904296875,
"learning_rate": 9.91875e-07,
"loss": -0.0013978920178487897,
"reward": 1.7478299736976624,
"reward_std": 0.3111024349927902,
"rewards/GDino": 0.7122170925140381,
"rewards/GIT": 0.28808362782001495,
"rewards/HPSv2": 0.2510089874267578,
"rewards/ORM": 0.4965202957391739,
"self_certainty_semantic": -25.4375,
"self_certainty_token": -21.6875,
"step": 13
},
{
"completion_length": 63.734375,
"epoch": 0.015503875968992248,
"grad_norm": 171.63954162597656,
"kl": 11.750831604003906,
"learning_rate": 9.912499999999998e-07,
"loss": 0.11320369923487306,
"reward": 1.820958137512207,
"reward_std": 0.6430586874485016,
"rewards/GDino": 0.7286913394927979,
"rewards/GIT": 0.39159613847732544,
"rewards/HPSv2": 0.222503662109375,
"rewards/ORM": 0.47816696763038635,
"self_certainty_semantic": -25.375,
"self_certainty_token": -21.875,
"step": 14
},
{
"completion_length": 64.796875,
"epoch": 0.016611295681063124,
"grad_norm": 1.790418267250061,
"kl": 0.001697540283203125,
"learning_rate": 9.90625e-07,
"loss": -0.0012796747614629567,
"reward": 2.4724700450897217,
"reward_std": 0.361017182469368,
"rewards/GDino": 0.8982033133506775,
"rewards/GIT": 0.5411243438720703,
"rewards/HPSv2": 0.2581005096435547,
"rewards/ORM": 0.7750419676303864,
"self_certainty_semantic": -25.625,
"self_certainty_token": -21.8125,
"step": 15
},
{
"completion_length": 65.078125,
"epoch": 0.017718715393133997,
"grad_norm": 0.38361120223999023,
"kl": 0.0015869140625,
"learning_rate": 9.9e-07,
"loss": 0.006866331794299185,
"reward": 1.5055131912231445,
"reward_std": 0.40322621166706085,
"rewards/GDino": 0.651562511920929,
"rewards/GIT": 0.2843637466430664,
"rewards/HPSv2": 0.24664592742919922,
"rewards/ORM": 0.32294100522994995,
"self_certainty_semantic": -25.5,
"self_certainty_token": -21.125,
"step": 16
},
{
"completion_length": 70.25,
"epoch": 0.018826135105204873,
"grad_norm": 1.0185045003890991,
"kl": 0.001552581787109375,
"learning_rate": 9.89375e-07,
"loss": -0.010323233203962445,
"reward": 1.5897727608680725,
"reward_std": 0.530043363571167,
"rewards/GDino": 0.5529386103153229,
"rewards/GIT": 0.2131059616804123,
"rewards/HPSv2": 0.2552909851074219,
"rewards/ORM": 0.5684372782707214,
"self_certainty_semantic": -25.3125,
"self_certainty_token": -20.5,
"step": 17
},
{
"completion_length": 66.34375,
"epoch": 0.019933554817275746,
"grad_norm": 0.4375481605529785,
"kl": 0.00156402587890625,
"learning_rate": 9.8875e-07,
"loss": -0.00136462040245533,
"reward": 2.063610315322876,
"reward_std": 0.42642320692539215,
"rewards/GDino": 0.7955474257469177,
"rewards/GIT": 0.5150393098592758,
"rewards/HPSv2": 0.22445201873779297,
"rewards/ORM": 0.528571605682373,
"self_certainty_semantic": -25.4375,
"self_certainty_token": -20.75,
"step": 18
},
{
"completion_length": 59.21875,
"epoch": 0.021040974529346623,
"grad_norm": 0.3959902226924896,
"kl": 0.00164031982421875,
"learning_rate": 9.88125e-07,
"loss": -0.0053134458139538765,
"reward": 1.5237417221069336,
"reward_std": 0.4693976193666458,
"rewards/GDino": 0.701702356338501,
"rewards/GIT": 0.2579326629638672,
"rewards/HPSv2": 0.24812698364257812,
"rewards/ORM": 0.3159796893596649,
"self_certainty_semantic": -25.1875,
"self_certainty_token": -21.5625,
"step": 19
},
{
"completion_length": 61.484375,
"epoch": 0.0221483942414175,
"grad_norm": 0.5081169605255127,
"kl": 0.001689910888671875,
"learning_rate": 9.875e-07,
"loss": 0.0003520832397043705,
"reward": 1.9516127109527588,
"reward_std": 0.2731045335531235,
"rewards/GDino": 0.6437798738479614,
"rewards/GIT": 0.4635310173034668,
"rewards/HPSv2": 0.24121475219726562,
"rewards/ORM": 0.6030870825052261,
"self_certainty_semantic": -25.1875,
"self_certainty_token": -21.5625,
"step": 20
},
{
"completion_length": 55.546875,
"epoch": 0.023255813953488372,
"grad_norm": 0.4565694034099579,
"kl": 0.001667022705078125,
"learning_rate": 9.86875e-07,
"loss": 0.0016932454891502857,
"reward": 2.180082321166992,
"reward_std": 0.5037369430065155,
"rewards/GDino": 0.7953125238418579,
"rewards/GIT": 0.45517681539058685,
"rewards/HPSv2": 0.2586212158203125,
"rewards/ORM": 0.6709719300270081,
"self_certainty_semantic": -25.25,
"self_certainty_token": -21.9375,
"step": 21
},
{
"completion_length": 68.75,
"epoch": 0.024363233665559248,
"grad_norm": 0.45827633142471313,
"kl": 0.001712799072265625,
"learning_rate": 9.862499999999999e-07,
"loss": 0.0007174527272582054,
"reward": 1.8721013069152832,
"reward_std": 0.4303991347551346,
"rewards/GDino": 0.6911458671092987,
"rewards/GIT": 0.36048486828804016,
"rewards/HPSv2": 0.2603263854980469,
"rewards/ORM": 0.5601442009210587,
"self_certainty_semantic": -25.6875,
"self_certainty_token": -22.4375,
"step": 22
},
{
"completion_length": 58.609375,
"epoch": 0.02547065337763012,
"grad_norm": 0.6875389218330383,
"kl": 0.00162506103515625,
"learning_rate": 9.85625e-07,
"loss": -0.004631380317732692,
"reward": 1.9805514812469482,
"reward_std": 0.5138447731733322,
"rewards/GDino": 0.706105500459671,
"rewards/GIT": 0.4199465811252594,
"rewards/HPSv2": 0.26941490173339844,
"rewards/ORM": 0.5850843787193298,
"self_certainty_semantic": -25.4375,
"self_certainty_token": -20.9375,
"step": 23
},
{
"completion_length": 60.859375,
"epoch": 0.026578073089700997,
"grad_norm": 0.5052416324615479,
"kl": 0.001667022705078125,
"learning_rate": 9.849999999999999e-07,
"loss": -0.0046843914315104485,
"reward": 2.368114173412323,
"reward_std": 0.4367552697658539,
"rewards/GDino": 0.815625011920929,
"rewards/GIT": 0.633857935667038,
"rewards/HPSv2": 0.25930213928222656,
"rewards/ORM": 0.6593290567398071,
"self_certainty_semantic": -25.75,
"self_certainty_token": -21.875,
"step": 24
},
{
"completion_length": 61.078125,
"epoch": 0.02768549280177187,
"grad_norm": 0.6162320971488953,
"kl": 0.001617431640625,
"learning_rate": 9.84375e-07,
"loss": -0.005464642075821757,
"reward": 1.9494624137878418,
"reward_std": 0.40468768775463104,
"rewards/GDino": 0.6967671811580658,
"rewards/GIT": 0.40975040197372437,
"rewards/HPSv2": 0.26043701171875,
"rewards/ORM": 0.5825077295303345,
"self_certainty_semantic": -25.5,
"self_certainty_token": -21.3125,
"step": 25
},
{
"completion_length": 50.734375,
"epoch": 0.028792912513842746,
"grad_norm": 2.8454437255859375,
"kl": 0.001804351806640625,
"learning_rate": 9.8375e-07,
"loss": -0.006305628921836615,
"reward": 2.190965175628662,
"reward_std": 0.44982025027275085,
"rewards/GDino": 0.7243013381958008,
"rewards/GIT": 0.5294483602046967,
"rewards/HPSv2": 0.2750282287597656,
"rewards/ORM": 0.6621872782707214,
"self_certainty_semantic": -25.3125,
"self_certainty_token": -22.375,
"step": 26
},
{
"completion_length": 62.484375,
"epoch": 0.029900332225913623,
"grad_norm": 0.4033506512641907,
"kl": 0.0016021728515625,
"learning_rate": 9.83125e-07,
"loss": -0.0016465974040329456,
"reward": 1.9733637571334839,
"reward_std": 0.44280076026916504,
"rewards/GDino": 0.7363362908363342,
"rewards/GIT": 0.4528593420982361,
"rewards/HPSv2": 0.24550628662109375,
"rewards/ORM": 0.5386618673801422,
"self_certainty_semantic": -25.5,
"self_certainty_token": -22.375,
"step": 27
},
{
"completion_length": 65.046875,
"epoch": 0.031007751937984496,
"grad_norm": 0.559298574924469,
"kl": 0.00167083740234375,
"learning_rate": 9.825e-07,
"loss": 0.004501585033722222,
"reward": 1.4280173778533936,
"reward_std": 0.27060839533805847,
"rewards/GDino": 0.5987553596496582,
"rewards/GIT": 0.10973574221134186,
"rewards/HPSv2": 0.2664012908935547,
"rewards/ORM": 0.453125,
"self_certainty_semantic": -25.4375,
"self_certainty_token": -20.9375,
"step": 28
},
{
"completion_length": 55.5625,
"epoch": 0.03211517165005537,
"grad_norm": 0.42233753204345703,
"kl": 0.00168609619140625,
"learning_rate": 9.81875e-07,
"loss": -0.005473613273352385,
"reward": 2.4506709575653076,
"reward_std": 0.20222720131278038,
"rewards/GDino": 0.8296874761581421,
"rewards/GIT": 0.605083167552948,
"rewards/HPSv2": 0.285858154296875,
"rewards/ORM": 0.7300421893596649,
"self_certainty_semantic": -25.6875,
"self_certainty_token": -20.9375,
"step": 29
},
{
"completion_length": 57.640625,
"epoch": 0.03322259136212625,
"grad_norm": 0.5650274157524109,
"kl": 0.0016326904296875,
"learning_rate": 9.8125e-07,
"loss": 0.0003150699194520712,
"reward": 2.489137649536133,
"reward_std": 0.4210814982652664,
"rewards/GDino": 0.8948009014129639,
"rewards/GIT": 0.586266428232193,
"rewards/HPSv2": 0.24865341186523438,
"rewards/ORM": 0.7594169676303864,
"self_certainty_semantic": -25.3125,
"self_certainty_token": -21.0625,
"step": 30
},
{
"completion_length": 78.78125,
"epoch": 0.03433001107419712,
"grad_norm": 0.6762183308601379,
"kl": 0.001613616943359375,
"learning_rate": 9.806249999999998e-07,
"loss": 0.007568572706077248,
"reward": 1.8555968403816223,
"reward_std": 0.2906922847032547,
"rewards/GDino": 0.5989583432674408,
"rewards/GIT": 0.38505683839321136,
"rewards/HPSv2": 0.2403736114501953,
"rewards/ORM": 0.6312080323696136,
"self_certainty_semantic": -25.5625,
"self_certainty_token": -20.5625,
"step": 31
},
{
"completion_length": 62.5,
"epoch": 0.035437430786267994,
"grad_norm": 0.4184902012348175,
"kl": 0.001628875732421875,
"learning_rate": 9.8e-07,
"loss": 0.007896744413301349,
"reward": 1.495099127292633,
"reward_std": 0.3622882664203644,
"rewards/GDino": 0.6791666448116302,
"rewards/GIT": 0.25104063749313354,
"rewards/HPSv2": 0.23050880432128906,
"rewards/ORM": 0.3343829959630966,
"self_certainty_semantic": -25.3125,
"self_certainty_token": -22.0625,
"step": 32
},
{
"completion_length": 70.109375,
"epoch": 0.036544850498338874,
"grad_norm": 0.47143352031707764,
"kl": 0.0016937255859375,
"learning_rate": 9.79375e-07,
"loss": 0.00709247519262135,
"reward": 2.3964842557907104,
"reward_std": 0.5415211468935013,
"rewards/GDino": 0.897656261920929,
"rewards/GIT": 0.6205766499042511,
"rewards/HPSv2": 0.2254810333251953,
"rewards/ORM": 0.6527703106403351,
"self_certainty_semantic": -25.3125,
"self_certainty_token": -21.625,
"step": 33
},
{
"completion_length": 55.53125,
"epoch": 0.03765227021040975,
"grad_norm": 0.45762747526168823,
"kl": 0.001678466796875,
"learning_rate": 9.7875e-07,
"loss": 0.020488019566982985,
"reward": 1.9143174886703491,
"reward_std": 0.2841227799654007,
"rewards/GDino": 0.6593749821186066,
"rewards/GIT": 0.4214262217283249,
"rewards/HPSv2": 0.2424945831298828,
"rewards/ORM": 0.5910216569900513,
"self_certainty_semantic": -25.3125,
"self_certainty_token": -21.75,
"step": 34
},
{
"completion_length": 55.828125,
"epoch": 0.03875968992248062,
"grad_norm": 0.3845841884613037,
"kl": 0.00167083740234375,
"learning_rate": 9.78125e-07,
"loss": 0.01862273830920458,
"reward": 2.274049997329712,
"reward_std": 0.28603486716747284,
"rewards/GDino": 0.7786458432674408,
"rewards/GIT": 0.5405041128396988,
"rewards/HPSv2": 0.23740386962890625,
"rewards/ORM": 0.7174962311983109,
"self_certainty_semantic": -25.3125,
"self_certainty_token": -21.25,
"step": 35
},
{
"completion_length": 63.234375,
"epoch": 0.03986710963455149,
"grad_norm": 0.5729533433914185,
"kl": 0.001678466796875,
"learning_rate": 9.775e-07,
"loss": -0.002963901497423649,
"reward": 1.8639960289001465,
"reward_std": 0.3890039473772049,
"rewards/GDino": 0.6255208253860474,
"rewards/GIT": 0.42713797092437744,
"rewards/HPSv2": 0.24535751342773438,
"rewards/ORM": 0.5659796744585037,
"self_certainty_semantic": -25.625,
"self_certainty_token": -21.3125,
"step": 36
},
{
"completion_length": 63.09375,
"epoch": 0.04097452934662237,
"grad_norm": 0.47338196635246277,
"kl": 0.001888275146484375,
"learning_rate": 9.76875e-07,
"loss": 0.008916446007788181,
"reward": 1.9735829830169678,
"reward_std": 0.5416238605976105,
"rewards/GDino": 0.7008762061595917,
"rewards/GIT": 0.3141380175948143,
"rewards/HPSv2": 0.2595968246459961,
"rewards/ORM": 0.6989719867706299,
"self_certainty_semantic": -25.375,
"self_certainty_token": -23.125,
"step": 37
},
{
"completion_length": 58.640625,
"epoch": 0.042081949058693245,
"grad_norm": 1.639336347579956,
"kl": 0.001651763916015625,
"learning_rate": 9.7625e-07,
"loss": -0.0003745388239622116,
"reward": 1.8843677639961243,
"reward_std": 0.27646802365779877,
"rewards/GDino": 0.7309310734272003,
"rewards/GIT": 0.2879854440689087,
"rewards/HPSv2": 0.25732994079589844,
"rewards/ORM": 0.6081212311983109,
"self_certainty_semantic": -25.375,
"self_certainty_token": -21.0625,
"step": 38
},
{
"completion_length": 54.453125,
"epoch": 0.04318936877076412,
"grad_norm": 0.4438176453113556,
"kl": 0.00176239013671875,
"learning_rate": 9.756249999999999e-07,
"loss": -0.004410726949572563,
"reward": 2.3740460872650146,
"reward_std": 0.26216618716716766,
"rewards/GDino": 0.8794216811656952,
"rewards/GIT": 0.480433389544487,
"rewards/HPSv2": 0.2703990936279297,
"rewards/ORM": 0.7437919676303864,
"self_certainty_semantic": -25.5,
"self_certainty_token": -21.0,
"step": 39
},
{
"completion_length": 64.65625,
"epoch": 0.044296788482835,
"grad_norm": 0.9789016246795654,
"kl": 0.0017242431640625,
"learning_rate": 9.75e-07,
"loss": -0.0008055282523855567,
"reward": 2.2535433769226074,
"reward_std": 0.46909773349761963,
"rewards/GDino": 0.8751652538776398,
"rewards/GIT": 0.4070926010608673,
"rewards/HPSv2": 0.2731647491455078,
"rewards/ORM": 0.6981207877397537,
"self_certainty_semantic": -25.4375,
"self_certainty_token": -20.625,
"step": 40
},
{
"completion_length": 60.3125,
"epoch": 0.04540420819490587,
"grad_norm": 0.39339736104011536,
"kl": 0.001697540283203125,
"learning_rate": 9.743749999999999e-07,
"loss": -0.0026839073980227113,
"reward": 1.926289677619934,
"reward_std": 0.21494604647159576,
"rewards/GDino": 0.6536072194576263,
"rewards/GIT": 0.38067150115966797,
"rewards/HPSv2": 0.2470531463623047,
"rewards/ORM": 0.6449578106403351,
"self_certainty_semantic": -25.375,
"self_certainty_token": -21.0,
"step": 41
},
{
"completion_length": 55.734375,
"epoch": 0.046511627906976744,
"grad_norm": 0.43325623869895935,
"kl": 0.001575469970703125,
"learning_rate": 9.7375e-07,
"loss": 0.01566000678576529,
"reward": 2.2492642402648926,
"reward_std": 0.545527771115303,
"rewards/GDino": 0.8451037406921387,
"rewards/GIT": 0.4486817270517349,
"rewards/HPSv2": 0.2523536682128906,
"rewards/ORM": 0.703125,
"self_certainty_semantic": -25.4375,
"self_certainty_token": -21.125,
"step": 42
},
{
"completion_length": 77.859375,
"epoch": 0.047619047619047616,
"grad_norm": 0.6008194088935852,
"kl": 0.00209808349609375,
"learning_rate": 9.73125e-07,
"loss": 0.009053934598341584,
"reward": 1.752554178237915,
"reward_std": 0.3711804449558258,
"rewards/GDino": 0.6425288617610931,
"rewards/GIT": 0.38656318187713623,
"rewards/HPSv2": 0.23595809936523438,
"rewards/ORM": 0.4875040054321289,
"self_certainty_semantic": -25.6875,
"self_certainty_token": -21.8125,
"step": 43
},
{
"completion_length": 64.859375,
"epoch": 0.048726467331118496,
"grad_norm": 0.4626310169696808,
"kl": 0.001750946044921875,
"learning_rate": 9.725e-07,
"loss": 0.00038470514118671417,
"reward": 2.837794542312622,
"reward_std": 0.3451881557703018,
"rewards/GDino": 0.9479166865348816,
"rewards/GIT": 0.7795328795909882,
"rewards/HPSv2": 0.26932334899902344,
"rewards/ORM": 0.8410216569900513,
"self_certainty_semantic": -25.5625,
"self_certainty_token": -21.5625,
"step": 44
},
{
"completion_length": 66.921875,
"epoch": 0.04983388704318937,
"grad_norm": 1.3941670656204224,
"kl": 0.001880645751953125,
"learning_rate": 9.71875e-07,
"loss": -0.012070931028574705,
"reward": 2.561403751373291,
"reward_std": 0.48213036358356476,
"rewards/GDino": 0.9039532244205475,
"rewards/GIT": 0.5467919409275055,
"rewards/HPSv2": 0.2617225646972656,
"rewards/ORM": 0.8489359319210052,
"self_certainty_semantic": -25.5625,
"self_certainty_token": -21.9375,
"step": 45
},
{
"completion_length": 59.625,
"epoch": 0.05094130675526024,
"grad_norm": 0.5365378260612488,
"kl": 0.001949310302734375,
"learning_rate": 9.712499999999998e-07,
"loss": 0.01103684725239873,
"reward": 2.0622146129608154,
"reward_std": 0.40072987973690033,
"rewards/GDino": 0.645312488079071,
"rewards/GIT": 0.33725525438785553,
"rewards/HPSv2": 0.2619609832763672,
"rewards/ORM": 0.8176859617233276,
"self_certainty_semantic": -25.25,
"self_certainty_token": -22.3125,
"step": 46
},
{
"completion_length": 64.6875,
"epoch": 0.05204872646733112,
"grad_norm": 0.5151812434196472,
"kl": 0.001766204833984375,
"learning_rate": 9.70625e-07,
"loss": -0.004148014355450869,
"reward": 1.7916635870933533,
"reward_std": 0.31147970259189606,
"rewards/GDino": 0.7293796539306641,
"rewards/GIT": 0.20818163454532623,
"rewards/HPSv2": 0.27945709228515625,
"rewards/ORM": 0.5746453106403351,
"self_certainty_semantic": -25.3125,
"self_certainty_token": -21.5625,
"step": 47
},
{
"completion_length": 56.25,
"epoch": 0.053156146179401995,
"grad_norm": 0.7559373378753662,
"kl": 0.001861572265625,
"learning_rate": 9.7e-07,
"loss": -0.002030523493885994,
"reward": 1.4302473068237305,
"reward_std": 0.4484506845474243,
"rewards/GDino": 0.6244329512119293,
"rewards/GIT": 0.0,
"rewards/HPSv2": 0.2752876281738281,
"rewards/ORM": 0.5305267572402954,
"self_certainty_semantic": -25.4375,
"self_certainty_token": -20.875,
"step": 48
},
{
"completion_length": 61.21875,
"epoch": 0.05426356589147287,
"grad_norm": 0.46310731768608093,
"kl": 0.00177764892578125,
"learning_rate": 9.69375e-07,
"loss": 0.0054672048427164555,
"reward": 1.9361683130264282,
"reward_std": 0.3801421523094177,
"rewards/GDino": 0.7904821038246155,
"rewards/GIT": 0.2458050437271595,
"rewards/HPSv2": 0.25890541076660156,
"rewards/ORM": 0.640975683927536,
"self_certainty_semantic": -25.5,
"self_certainty_token": -21.6875,
"step": 49
},
{
"completion_length": 61.921875,
"epoch": 0.05537098560354374,
"grad_norm": 0.5111473798751831,
"kl": 0.002353668212890625,
"learning_rate": 9.6875e-07,
"loss": 0.0035089042503386736,
"reward": 2.212684750556946,
"reward_std": 0.3874351307749748,
"rewards/GDino": 0.7840971350669861,
"rewards/GIT": 0.42198260873556137,
"rewards/HPSv2": 0.25807952880859375,
"rewards/ORM": 0.7485254108905792,
"self_certainty_semantic": -25.4375,
"self_certainty_token": -22.625,
"step": 50
},
{
"completion_length": 57.796875,
"epoch": 0.05647840531561462,
"grad_norm": 0.4804292917251587,
"kl": 0.001743316650390625,
"learning_rate": 9.68125e-07,
"loss": -0.0010273723164573312,
"reward": 1.8951371908187866,
"reward_std": 0.5679852366447449,
"rewards/GDino": 0.7922006845474243,
"rewards/GIT": 0.27185457944869995,
"rewards/HPSv2": 0.2777671813964844,
"rewards/ORM": 0.5533146858215332,
"self_certainty_semantic": -25.625,
"self_certainty_token": -22.0,
"step": 51
},
{
"completion_length": 62.140625,
"epoch": 0.05758582502768549,
"grad_norm": 0.5876587629318237,
"kl": 0.001842498779296875,
"learning_rate": 9.675e-07,
"loss": 0.010319232940673828,
"reward": 2.453005313873291,
"reward_std": 0.35728050768375397,
"rewards/GDino": 0.917187511920929,
"rewards/GIT": 0.6651300191879272,
"rewards/HPSv2": 0.27350807189941406,
"rewards/ORM": 0.5971797704696655,
"self_certainty_semantic": -25.4375,
"self_certainty_token": -21.0625,
"step": 52
},
{
"completion_length": 57.046875,
"epoch": 0.058693244739756366,
"grad_norm": 0.5244357585906982,
"kl": 0.00168609619140625,
"learning_rate": 9.66875e-07,
"loss": 0.0012504801852628589,
"reward": 1.8911731839179993,
"reward_std": 0.3232653737068176,
"rewards/GDino": 0.7297230660915375,
"rewards/GIT": 0.3948078155517578,
"rewards/HPSv2": 0.24039649963378906,
"rewards/ORM": 0.5262457728385925,
"self_certainty_semantic": -25.3125,
"self_certainty_token": -21.25,
"step": 53
},
{
"completion_length": 68.921875,
"epoch": 0.059800664451827246,
"grad_norm": 0.5011692047119141,
"kl": 0.0017547607421875,
"learning_rate": 9.6625e-07,
"loss": -0.001990929711610079,
"reward": 1.5346381068229675,
"reward_std": 0.5364750325679779,
"rewards/GDino": 0.5896078050136566,
"rewards/GIT": 0.2611962556838989,
"rewards/HPSv2": 0.24633407592773438,
"rewards/ORM": 0.4375000149011612,
"self_certainty_semantic": -25.5,
"self_certainty_token": -21.5625,
"step": 54
},
{
"completion_length": 65.28125,
"epoch": 0.06090808416389812,
"grad_norm": 0.43720903992652893,
"kl": 0.001796722412109375,
"learning_rate": 9.65625e-07,
"loss": 0.011945425532758236,
"reward": 1.7657405734062195,
"reward_std": 0.5052186846733093,
"rewards/GDino": 0.7055748403072357,
"rewards/GIT": 0.3213713690638542,
"rewards/HPSv2": 0.26223182678222656,
"rewards/ORM": 0.4765625,
"self_certainty_semantic": -25.5,
"self_certainty_token": -21.8125,
"step": 55
},
{
"completion_length": 72.15625,
"epoch": 0.06201550387596899,
"grad_norm": 0.6576823592185974,
"kl": 0.00201416015625,
"learning_rate": 9.649999999999999e-07,
"loss": 0.010990551207214594,
"reward": 2.0798487663269043,
"reward_std": 0.5881477892398834,
"rewards/GDino": 0.7611979246139526,
"rewards/GIT": 0.38940075039863586,
"rewards/HPSv2": 0.25081634521484375,
"rewards/ORM": 0.678433746099472,
"self_certainty_semantic": -25.125,
"self_certainty_token": -21.8125,
"step": 56
},
{
"completion_length": 53.84375,
"epoch": 0.06312292358803986,
"grad_norm": 0.5109694600105286,
"kl": 0.001708984375,
"learning_rate": 9.64375e-07,
"loss": -0.009197955019772053,
"reward": 1.825343132019043,
"reward_std": 0.49610868096351624,
"rewards/GDino": 0.7342002689838409,
"rewards/GIT": 0.27930086851119995,
"rewards/HPSv2": 0.2493419647216797,
"rewards/ORM": 0.5625,
"self_certainty_semantic": -25.6875,
"self_certainty_token": -21.5,
"step": 57
},
{
"completion_length": 54.671875,
"epoch": 0.06423034330011074,
"grad_norm": 0.48297855257987976,
"kl": 0.0018157958984375,
"learning_rate": 9.637499999999999e-07,
"loss": -2.7031637728214264e-05,
"reward": 1.9436655044555664,
"reward_std": 0.5841460824012756,
"rewards/GDino": 0.7508301734924316,
"rewards/GIT": 0.36742376536130905,
"rewards/HPSv2": 0.24603271484375,
"rewards/ORM": 0.579378753900528,
"self_certainty_semantic": -25.5625,
"self_certainty_token": -21.6875,
"step": 58
},
{
"completion_length": 57.34375,
"epoch": 0.06533776301218161,
"grad_norm": 1.5652471780776978,
"kl": 0.00185394287109375,
"learning_rate": 9.63125e-07,
"loss": -0.0014887296129018068,
"reward": 2.154895305633545,
"reward_std": 0.5548917800188065,
"rewards/GDino": 0.7907229363918304,
"rewards/GIT": 0.44339829683303833,
"rewards/HPSv2": 0.2567615509033203,
"rewards/ORM": 0.664012536406517,
"self_certainty_semantic": -25.5625,
"self_certainty_token": -21.0625,
"step": 59
},
{
"completion_length": 52.0625,
"epoch": 0.0664451827242525,
"grad_norm": 0.8647972941398621,
"kl": 0.00200653076171875,
"learning_rate": 9.624999999999999e-07,
"loss": -0.004864218062721193,
"reward": 2.183086931705475,
"reward_std": 0.27265597879886627,
"rewards/GDino": 0.8968750238418579,
"rewards/GIT": 0.4909053146839142,
"rewards/HPSv2": 0.2511100769042969,
"rewards/ORM": 0.544196605682373,
"self_certainty_semantic": -25.25,
"self_certainty_token": -20.8125,
"step": 60
},
{
"completion_length": 78.421875,
"epoch": 0.06755260243632337,
"grad_norm": 0.6149311065673828,
"kl": 0.0018310546875,
"learning_rate": 9.61875e-07,
"loss": -0.003399772336706519,
"reward": 2.3938775062561035,
"reward_std": 0.3266971558332443,
"rewards/GDino": 0.7299478650093079,
"rewards/GIT": 0.6572037935256958,
"rewards/HPSv2": 0.26293373107910156,
"rewards/ORM": 0.743791937828064,
"self_certainty_semantic": -25.5,
"self_certainty_token": -20.5,
"step": 61
},
{
"completion_length": 71.796875,
"epoch": 0.06866002214839424,
"grad_norm": 0.8106938600540161,
"kl": 0.00188446044921875,
"learning_rate": 9.6125e-07,
"loss": -0.004746791877551004,
"reward": 2.3078866004943848,
"reward_std": 0.4594850391149521,
"rewards/GDino": 0.7886728346347809,
"rewards/GIT": 0.6039779186248779,
"rewards/HPSv2": 0.2555561065673828,
"rewards/ORM": 0.6596797406673431,
"self_certainty_semantic": -25.5625,
"self_certainty_token": -21.125,
"step": 62
},
{
"completion_length": 57.703125,
"epoch": 0.06976744186046512,
"grad_norm": 0.5699672102928162,
"kl": 0.00218963623046875,
"learning_rate": 9.606249999999998e-07,
"loss": 0.005022911122068763,
"reward": 2.2111340165138245,
"reward_std": 0.6219878196716309,
"rewards/GDino": 0.794545441865921,
"rewards/GIT": 0.45049863308668137,
"rewards/HPSv2": 0.24386024475097656,
"rewards/ORM": 0.7222297191619873,
"self_certainty_semantic": -25.25,
"self_certainty_token": -21.9375,
"step": 63
},
{
"completion_length": 78.453125,
"epoch": 0.07087486157253599,
"grad_norm": 0.7573527693748474,
"kl": 0.0022125244140625,
"learning_rate": 9.6e-07,
"loss": 0.013895762618631124,
"reward": 1.6789215207099915,
"reward_std": 0.15597553551197052,
"rewards/GDino": 0.7209441661834717,
"rewards/GIT": 0.31718890368938446,
"rewards/HPSv2": 0.26105499267578125,
"rewards/ORM": 0.37973345816135406,
"self_certainty_semantic": -25.5625,
"self_certainty_token": -21.5625,
"step": 64
},
{
"completion_length": 63.59375,
"epoch": 0.07198228128460686,
"grad_norm": 0.4424923360347748,
"kl": 0.0020599365234375,
"learning_rate": 9.59375e-07,
"loss": 0.0005846736021339893,
"reward": 2.195925712585449,
"reward_std": 0.5788445174694061,
"rewards/GDino": 0.7169270515441895,
"rewards/GIT": 0.6367218196392059,
"rewards/HPSv2": 0.2345561981201172,
"rewards/ORM": 0.6077205836772919,
"self_certainty_semantic": -25.4375,
"self_certainty_token": -21.75,
"step": 65
},
{
"completion_length": 67.6875,
"epoch": 0.07308970099667775,
"grad_norm": 0.5050013661384583,
"kl": 0.00211334228515625,
"learning_rate": 9.5875e-07,
"loss": 0.010172993643209338,
"reward": 2.220258355140686,
"reward_std": 0.30588236451148987,
"rewards/GDino": 0.7442708909511566,
"rewards/GIT": 0.47482602298259735,
"rewards/HPSv2": 0.25937461853027344,
"rewards/ORM": 0.7417868673801422,
"self_certainty_semantic": -25.375,
"self_certainty_token": -21.5,
"step": 66
},
{
"completion_length": 72.75,
"epoch": 0.07419712070874862,
"grad_norm": 0.47647950053215027,
"kl": 0.001953125,
"learning_rate": 9.58125e-07,
"loss": 0.002580178901553154,
"reward": 2.3537763357162476,
"reward_std": 0.2857324182987213,
"rewards/GDino": 0.852263331413269,
"rewards/GIT": 0.5637244433164597,
"rewards/HPSv2": 0.2550220489501953,
"rewards/ORM": 0.6827665567398071,
"self_certainty_semantic": -25.4375,
"self_certainty_token": -21.4375,
"step": 67
},
{
"completion_length": 60.109375,
"epoch": 0.0753045404208195,
"grad_norm": 0.45224544405937195,
"kl": 0.0021209716796875,
"learning_rate": 9.575e-07,
"loss": 0.002825574716553092,
"reward": 1.613221287727356,
"reward_std": 0.332104429602623,
"rewards/GDino": 0.6193348169326782,
"rewards/GIT": 0.2909398823976517,
"rewards/HPSv2": 0.2551765441894531,
"rewards/ORM": 0.4477700889110565,
"self_certainty_semantic": -25.25,
"self_certainty_token": -21.0625,
"step": 68
},
{
"completion_length": 72.6875,
"epoch": 0.07641196013289037,
"grad_norm": 0.688894510269165,
"kl": 0.002315521240234375,
"learning_rate": 9.56875e-07,
"loss": 0.012800770811736584,
"reward": 2.1092969179153442,
"reward_std": 0.36874186992645264,
"rewards/GDino": 0.8054687678813934,
"rewards/GIT": 0.3866874873638153,
"rewards/HPSv2": 0.26236534118652344,
"rewards/ORM": 0.6547753810882568,
"self_certainty_semantic": -25.625,
"self_certainty_token": -21.5,
"step": 69
},
{
"completion_length": 60.640625,
"epoch": 0.07751937984496124,
"grad_norm": 0.45330390334129333,
"kl": 0.00215911865234375,
"learning_rate": 9.5625e-07,
"loss": -0.0010713667143136263,
"reward": 1.552397072315216,
"reward_std": 0.39455118775367737,
"rewards/GDino": 0.6554375886917114,
"rewards/GIT": 0.22663478553295135,
"rewards/HPSv2": 0.2546577453613281,
"rewards/ORM": 0.41566696763038635,
"self_certainty_semantic": -25.25,
"self_certainty_token": -20.75,
"step": 70
},
{
"completion_length": 76.515625,
"epoch": 0.07862679955703211,
"grad_norm": 0.5808414220809937,
"kl": 0.00222015380859375,
"learning_rate": 9.556249999999999e-07,
"loss": 0.0038980550598353148,
"reward": 1.9476300477981567,
"reward_std": 0.38603267073631287,
"rewards/GDino": 0.7262610197067261,
"rewards/GIT": 0.30087296664714813,
"rewards/HPSv2": 0.26424598693847656,
"rewards/ORM": 0.6562500149011612,
"self_certainty_semantic": -25.4375,
"self_certainty_token": -19.9375,
"step": 71
},
{
"completion_length": 57.15625,
"epoch": 0.07973421926910298,
"grad_norm": 0.3693688213825226,
"kl": 0.00208282470703125,
"learning_rate": 9.55e-07,
"loss": -0.00035159417893737555,
"reward": 1.9391373991966248,
"reward_std": 0.3963821530342102,
"rewards/GDino": 0.6879567801952362,
"rewards/GIT": 0.4622843265533447,
"rewards/HPSv2": 0.24675464630126953,
"rewards/ORM": 0.5421415567398071,
"self_certainty_semantic": -25.0625,
"self_certainty_token": -20.9375,
"step": 72
},
{
"completion_length": 66.65625,
"epoch": 0.08084163898117387,
"grad_norm": 0.6215986013412476,
"kl": 0.0024871826171875,
"learning_rate": 9.543749999999999e-07,
"loss": 0.003838272183202207,
"reward": 2.1008963584899902,
"reward_std": 0.4600249230861664,
"rewards/GDino": 0.8240202069282532,
"rewards/GIT": 0.48449917137622833,
"rewards/HPSv2": 0.24818038940429688,
"rewards/ORM": 0.5441965609788895,
"self_certainty_semantic": -25.4375,
"self_certainty_token": -21.5,
"step": 73
},
{
"completion_length": 60.859375,
"epoch": 0.08194905869324474,
"grad_norm": 0.43593713641166687,
"kl": 0.0030364990234375,
"learning_rate": 9.5375e-07,
"loss": 0.002844013855792582,
"reward": 2.297879934310913,
"reward_std": 0.2846696451306343,
"rewards/GDino": 0.84375,
"rewards/GIT": 0.5265894532203674,
"rewards/HPSv2": 0.2544116973876953,
"rewards/ORM": 0.6731287837028503,
"self_certainty_semantic": -25.4375,
"self_certainty_token": -21.75,
"step": 74
},
{
"completion_length": 68.703125,
"epoch": 0.08305647840531562,
"grad_norm": 0.48668116331100464,
"kl": 0.002227783203125,
"learning_rate": 9.53125e-07,
"loss": -0.0021062323357909918,
"reward": 1.7519539594650269,
"reward_std": 0.3109753131866455,
"rewards/GDino": 0.6498888432979584,
"rewards/GIT": 0.2745012864470482,
"rewards/HPSv2": 0.26706886291503906,
"rewards/ORM": 0.5604948848485947,
"self_certainty_semantic": -25.5,
"self_certainty_token": -20.625,
"step": 75
},
{
"completion_length": 70.25,
"epoch": 0.08416389811738649,
"grad_norm": 0.5122522711753845,
"kl": 0.00208282470703125,
"learning_rate": 9.525e-07,
"loss": -0.00045439647510647774,
"reward": 2.371267318725586,
"reward_std": 0.4085633456707001,
"rewards/GDino": 0.8135416805744171,
"rewards/GIT": 0.6540948301553726,
"rewards/HPSv2": 0.2650108337402344,
"rewards/ORM": 0.6386198997497559,
"self_certainty_semantic": -25.6875,
"self_certainty_token": -20.75,
"step": 76
},
{
"completion_length": 62.875,
"epoch": 0.08527131782945736,
"grad_norm": 0.505736768245697,
"kl": 0.0037689208984375,
"learning_rate": 9.51875e-07,
"loss": -0.006699402409140021,
"reward": 1.5121636986732483,
"reward_std": 0.5349836349487305,
"rewards/GDino": 0.616510659456253,
"rewards/GIT": 0.18113864213228226,
"rewards/HPSv2": 0.228485107421875,
"rewards/ORM": 0.48602940142154694,
"self_certainty_semantic": -25.125,
"self_certainty_token": -21.875,
"step": 77
},
{
"completion_length": 65.8125,
"epoch": 0.08637873754152824,
"grad_norm": 0.4759610593318939,
"kl": 0.0022735595703125,
"learning_rate": 9.5125e-07,
"loss": 0.0014968996401876211,
"reward": 1.9482250213623047,
"reward_std": 0.38150524348020554,
"rewards/GDino": 0.7646995186805725,
"rewards/GIT": 0.31973105669021606,
"rewards/HPSv2": 0.2705249786376953,
"rewards/ORM": 0.5932694524526596,
"self_certainty_semantic": -25.5,
"self_certainty_token": -21.125,
"step": 78
},
{
"completion_length": 72.609375,
"epoch": 0.08748615725359911,
"grad_norm": 0.4961722195148468,
"kl": 0.00247955322265625,
"learning_rate": 9.50625e-07,
"loss": 0.00820195721462369,
"reward": 2.2431598901748657,
"reward_std": 0.19805177673697472,
"rewards/GDino": 0.8183182775974274,
"rewards/GIT": 0.60882468521595,
"rewards/HPSv2": 0.2628040313720703,
"rewards/ORM": 0.5532128810882568,
"self_certainty_semantic": -25.4375,
"self_certainty_token": -21.5625,
"step": 79
},
{
"completion_length": 66.0625,
"epoch": 0.08859357696567,
"grad_norm": 0.5290701389312744,
"kl": 0.00308990478515625,
"learning_rate": 9.499999999999999e-07,
"loss": -0.001018086913973093,
"reward": 1.7054139375686646,
"reward_std": 0.4478110671043396,
"rewards/GDino": 0.6419965624809265,
"rewards/GIT": 0.19029075652360916,
"rewards/HPSv2": 0.2727680206298828,
"rewards/ORM": 0.6003586649894714,
"self_certainty_semantic": -25.4375,
"self_certainty_token": -21.5,
"step": 80
},
{
"completion_length": 69.75,
"epoch": 0.08970099667774087,
"grad_norm": 0.530961811542511,
"kl": 0.00331878662109375,
"learning_rate": 9.493749999999999e-07,
"loss": -0.0018104221671819687,
"reward": 2.1294270157814026,
"reward_std": 0.30140096694231033,
"rewards/GDino": 0.7601194977760315,
"rewards/GIT": 0.36138176918029785,
"rewards/HPSv2": 0.27007102966308594,
"rewards/ORM": 0.7378547042608261,
"self_certainty_semantic": -25.5625,
"self_certainty_token": -20.25,
"step": 81
},
{
"completion_length": 62.25,
"epoch": 0.09080841638981174,
"grad_norm": 0.5380280017852783,
"kl": 0.0029449462890625,
"learning_rate": 9.487499999999999e-07,
"loss": 0.0027263425290584564,
"reward": 1.7531540989875793,
"reward_std": 0.40144187211990356,
"rewards/GDino": 0.6388830840587616,
"rewards/GIT": 0.3787819594144821,
"rewards/HPSv2": 0.26526451110839844,
"rewards/ORM": 0.4702245742082596,
"self_certainty_semantic": -25.625,
"self_certainty_token": -21.3125,
"step": 82
},
{
"completion_length": 57.125,
"epoch": 0.09191583610188261,
"grad_norm": 0.46656447649002075,
"kl": 0.00229644775390625,
"learning_rate": 9.481249999999999e-07,
"loss": 0.0034079640172421932,
"reward": 2.1076533794403076,
"reward_std": 0.3496774584054947,
"rewards/GDino": 0.8086712956428528,
"rewards/GIT": 0.44665491580963135,
"rewards/HPSv2": 0.2527198791503906,
"rewards/ORM": 0.5996073186397552,
"self_certainty_semantic": -25.5,
"self_certainty_token": -21.875,
"step": 83
},
{
"completion_length": 77.609375,
"epoch": 0.09302325581395349,
"grad_norm": 0.7098491787910461,
"kl": 0.003326416015625,
"learning_rate": 9.474999999999999e-07,
"loss": -0.015582434833049774,
"reward": 2.0792417526245117,
"reward_std": 0.405472531914711,
"rewards/GDino": 0.8217203617095947,
"rewards/GIT": 0.6337592005729675,
"rewards/HPSv2": 0.2409496307373047,
"rewards/ORM": 0.3828125,
"self_certainty_semantic": -25.4375,
"self_certainty_token": -21.25,
"step": 84
},
{
"completion_length": 70.0,
"epoch": 0.09413067552602436,
"grad_norm": 0.453952431678772,
"kl": 0.0030059814453125,
"learning_rate": 9.468749999999999e-07,
"loss": -0.008341801585629582,
"reward": 1.7731398940086365,
"reward_std": 0.43146421015262604,
"rewards/GDino": 0.6217962503433228,
"rewards/GIT": 0.33136892318725586,
"rewards/HPSv2": 0.2414989471435547,
"rewards/ORM": 0.5784757435321808,
"self_certainty_semantic": -25.375,
"self_certainty_token": -21.8125,
"step": 85
},
{
"completion_length": 55.46875,
"epoch": 0.09523809523809523,
"grad_norm": 0.6065813302993774,
"kl": 0.0029296875,
"learning_rate": 9.462499999999999e-07,
"loss": -0.004339609295129776,
"reward": 2.3409087657928467,
"reward_std": 0.33414456248283386,
"rewards/GDino": 0.843651682138443,
"rewards/GIT": 0.3478253483772278,
"rewards/HPSv2": 0.2929649353027344,
"rewards/ORM": 0.8564667999744415,
"self_certainty_semantic": -25.6875,
"self_certainty_token": -21.6875,
"step": 86
},
{
"completion_length": 71.796875,
"epoch": 0.09634551495016612,
"grad_norm": 0.6815423965454102,
"kl": 0.0028076171875,
"learning_rate": 9.45625e-07,
"loss": 0.004890406038612127,
"reward": 2.096968352794647,
"reward_std": 0.4522961378097534,
"rewards/GDino": 0.7090134918689728,
"rewards/GIT": 0.4619881361722946,
"rewards/HPSv2": 0.26172447204589844,
"rewards/ORM": 0.6642423272132874,
"self_certainty_semantic": -25.375,
"self_certainty_token": -21.3125,
"step": 87
},
{
"completion_length": 62.921875,
"epoch": 0.09745293466223699,
"grad_norm": 0.37047135829925537,
"kl": 0.00237274169921875,
"learning_rate": 9.45e-07,
"loss": -0.007989626843482256,
"reward": 2.100303888320923,
"reward_std": 0.39728429913520813,
"rewards/GDino": 0.8100375235080719,
"rewards/GIT": 0.4551214128732681,
"rewards/HPSv2": 0.2669391632080078,
"rewards/ORM": 0.5682056248188019,
"self_certainty_semantic": -25.5,
"self_certainty_token": -21.625,
"step": 88
},
{
"completion_length": 61.25,
"epoch": 0.09856035437430787,
"grad_norm": 0.3903006613254547,
"kl": 0.0033111572265625,
"learning_rate": 9.44375e-07,
"loss": -0.0016460134647786617,
"reward": 2.1185483932495117,
"reward_std": 0.34406720101833344,
"rewards/GDino": 0.7301153540611267,
"rewards/GIT": 0.4342738687992096,
"rewards/HPSv2": 0.25724220275878906,
"rewards/ORM": 0.6969169676303864,
"self_certainty_semantic": -25.625,
"self_certainty_token": -21.0625,
"step": 89
},
{
"completion_length": 64.734375,
"epoch": 0.09966777408637874,
"grad_norm": 0.6106704473495483,
"kl": 0.002532958984375,
"learning_rate": 9.4375e-07,
"loss": 0.0018994538113474846,
"reward": 2.281058669090271,
"reward_std": 0.4019897133111954,
"rewards/GDino": 0.8515625298023224,
"rewards/GIT": 0.602006196975708,
"rewards/HPSv2": 0.2570476531982422,
"rewards/ORM": 0.5704423487186432,
"self_certainty_semantic": -25.625,
"self_certainty_token": -20.875,
"step": 90
},
{
"completion_length": 70.625,
"epoch": 0.10077519379844961,
"grad_norm": 0.6082563996315002,
"kl": 0.0025634765625,
"learning_rate": 9.43125e-07,
"loss": -0.001378488726913929,
"reward": 1.7446696758270264,
"reward_std": 0.48222504556179047,
"rewards/GDino": 0.6369770467281342,
"rewards/GIT": 0.4495050609111786,
"rewards/HPSv2": 0.2379169464111328,
"rewards/ORM": 0.42027057707309723,
"self_certainty_semantic": -25.5,
"self_certainty_token": -21.25,
"step": 91
},
{
"completion_length": 69.328125,
"epoch": 0.10188261351052048,
"grad_norm": 0.3885723054409027,
"kl": 0.00247955322265625,
"learning_rate": 9.425e-07,
"loss": 0.0029599489644169807,
"reward": 1.6940485835075378,
"reward_std": 0.48791858553886414,
"rewards/GDino": 0.7451692521572113,
"rewards/GIT": 0.3888908475637436,
"rewards/HPSv2": 0.23882293701171875,
"rewards/ORM": 0.32116562128067017,
"self_certainty_semantic": -25.5,
"self_certainty_token": -20.9375,
"step": 92
},
{
"completion_length": 78.96875,
"epoch": 0.10299003322259136,
"grad_norm": 2.441729784011841,
"kl": 0.00281524658203125,
"learning_rate": 9.41875e-07,
"loss": 0.0027102059684693813,
"reward": 2.098644495010376,
"reward_std": 0.5861929953098297,
"rewards/GDino": 0.7753971815109253,
"rewards/GIT": 0.33432240784168243,
"rewards/HPSv2": 0.24440956115722656,
"rewards/ORM": 0.7445152401924133,
"self_certainty_semantic": -25.375,
"self_certainty_token": -21.3125,
"step": 93
},
{
"completion_length": 53.640625,
"epoch": 0.10409745293466224,
"grad_norm": 1.843809962272644,
"kl": 0.00298309326171875,
"learning_rate": 9.4125e-07,
"loss": -0.002976842690259218,
"reward": 2.022274136543274,
"reward_std": 0.3149227201938629,
"rewards/GDino": 0.7854060530662537,
"rewards/GIT": 0.20830318331718445,
"rewards/HPSv2": 0.2829475402832031,
"rewards/ORM": 0.7456172108650208,
"self_certainty_semantic": -25.5625,
"self_certainty_token": -21.4375,
"step": 94
},
{
"completion_length": 73.8125,
"epoch": 0.10520487264673312,
"grad_norm": 0.4806905686855316,
"kl": 0.0027923583984375,
"learning_rate": 9.40625e-07,
"loss": 0.0057201930321753025,
"reward": 2.5528862476348877,
"reward_std": 0.3981771767139435,
"rewards/GDino": 0.9458979666233063,
"rewards/GIT": 0.7319882810115814,
"rewards/HPSv2": 0.265625,
"rewards/ORM": 0.609375,
"self_certainty_semantic": -25.625,
"self_certainty_token": -22.375,
"step": 95
},
{
"completion_length": 71.578125,
"epoch": 0.10631229235880399,
"grad_norm": 1.3328330516815186,
"kl": 0.00286865234375,
"learning_rate": 9.399999999999999e-07,
"loss": 0.006992874434217811,
"reward": 2.4351861476898193,
"reward_std": 0.25794728100299835,
"rewards/GDino": 0.9020833373069763,
"rewards/GIT": 0.6907803118228912,
"rewards/HPSv2": 0.2606678009033203,
"rewards/ORM": 0.5816546380519867,
"self_certainty_semantic": -25.6875,
"self_certainty_token": -20.125,
"step": 96
},
{
"completion_length": 60.703125,
"epoch": 0.10741971207087486,
"grad_norm": 0.5019268989562988,
"kl": 0.003326416015625,
"learning_rate": 9.393749999999999e-07,
"loss": 0.011835527839139104,
"reward": 1.6200063824653625,
"reward_std": 0.4240207076072693,
"rewards/GDino": 0.6504360437393188,
"rewards/GIT": 0.18544349074363708,
"rewards/HPSv2": 0.2720832824707031,
"rewards/ORM": 0.5120435357093811,
"self_certainty_semantic": -25.375,
"self_certainty_token": -21.4375,
"step": 97
},
{
"completion_length": 68.578125,
"epoch": 0.10852713178294573,
"grad_norm": 0.38334423303604126,
"kl": 0.003143310546875,
"learning_rate": 9.387499999999999e-07,
"loss": 0.0015034456737339497,
"reward": 1.9381686449050903,
"reward_std": 0.46784070134162903,
"rewards/GDino": 0.7850436270236969,
"rewards/GIT": 0.3971538841724396,
"rewards/HPSv2": 0.2517681121826172,
"rewards/ORM": 0.5042029470205307,
"self_certainty_semantic": -25.4375,
"self_certainty_token": -22.0,
"step": 98
},
{
"completion_length": 72.234375,
"epoch": 0.10963455149501661,
"grad_norm": 1.5332801342010498,
"kl": 0.0026702880859375,
"learning_rate": 9.381249999999999e-07,
"loss": 0.0014210238587111235,
"reward": 2.1606199741363525,
"reward_std": 0.4609396979212761,
"rewards/GDino": 0.800000011920929,
"rewards/GIT": 0.6965132355690002,
"rewards/HPSv2": 0.2425823211669922,
"rewards/ORM": 0.4215243309736252,
"self_certainty_semantic": -25.625,
"self_certainty_token": -22.125,
"step": 99
},
{
"completion_length": 64.859375,
"epoch": 0.11074197120708748,
"grad_norm": 0.4810887575149536,
"kl": 0.0039520263671875,
"learning_rate": 9.374999999999999e-07,
"loss": -0.006660776911303401,
"reward": 2.0300318002700806,
"reward_std": 0.49300554394721985,
"rewards/GDino": 0.6639764606952667,
"rewards/GIT": 0.41904042661190033,
"rewards/HPSv2": 0.25483131408691406,
"rewards/ORM": 0.6921834945678711,
"self_certainty_semantic": -25.5625,
"self_certainty_token": -21.5,
"step": 100
}
],
"logging_steps": 1.0,
"max_steps": 1600,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}