iris-step100 / trainer_state.json
YhangChen's picture
Initial model files upload
9b131b0 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.11074197120708748,
"eval_steps": 500,
"global_step": 100,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"completion_length": 73.265625,
"epoch": 0.0011074197120708748,
"grad_norm": 0.5076314806938171,
"kl": 0.0,
"learning_rate": 9.99375e-07,
"loss": -0.018259915290400386,
"reward": 2.2648561000823975,
"reward_std": 0.32521533221006393,
"rewards/GDino": 0.84943026304245,
"rewards/GIT": 0.5776679813861847,
"rewards/HPSv2": 0.2639656066894531,
"rewards/ORM": 0.5737921893596649,
"self_certainty_semantic": -25.4375,
"self_certainty_token": -22.0,
"step": 1
},
{
"completion_length": 56.0,
"epoch": 0.0022148394241417496,
"grad_norm": 0.5364330410957336,
"kl": 0.001522064208984375,
"learning_rate": 9.9875e-07,
"loss": 0.00348748016403988,
"reward": 1.7680926322937012,
"reward_std": 0.41801488399505615,
"rewards/GDino": 0.6529064476490021,
"rewards/GIT": 0.19494981318712234,
"rewards/HPSv2": 0.24983596801757812,
"rewards/ORM": 0.6704004406929016,
"self_certainty_semantic": -25.4375,
"self_certainty_token": -21.0,
"step": 2
},
{
"completion_length": 55.4375,
"epoch": 0.0033222591362126247,
"grad_norm": 0.5614722967147827,
"kl": 0.001556396484375,
"learning_rate": 9.98125e-07,
"loss": 0.01565772108733654,
"reward": 1.6570448875427246,
"reward_std": 0.3965621292591095,
"rewards/GDino": 0.6382372081279755,
"rewards/GIT": 0.37795570492744446,
"rewards/HPSv2": 0.24709796905517578,
"rewards/ORM": 0.3937540017068386,
"self_certainty_semantic": -25.1875,
"self_certainty_token": -20.9375,
"step": 3
},
{
"completion_length": 65.34375,
"epoch": 0.004429678848283499,
"grad_norm": 2.5736770629882812,
"kl": 0.0016021728515625,
"learning_rate": 9.975e-07,
"loss": -0.0012893765233457088,
"reward": 2.061529755592346,
"reward_std": 0.4106704443693161,
"rewards/GDino": 0.7796730995178223,
"rewards/GIT": 0.43717896938323975,
"rewards/HPSv2": 0.24744796752929688,
"rewards/ORM": 0.5972296595573425,
"self_certainty_semantic": -25.5,
"self_certainty_token": -22.0,
"step": 4
},
{
"completion_length": 63.578125,
"epoch": 0.005537098560354375,
"grad_norm": 0.48238250613212585,
"kl": 0.001575469970703125,
"learning_rate": 9.968749999999999e-07,
"loss": 0.020129199139773846,
"reward": 1.5302643775939941,
"reward_std": 0.44902199506759644,
"rewards/GDino": 0.6246840953826904,
"rewards/GIT": 0.23608428239822388,
"rewards/HPSv2": 0.2453451156616211,
"rewards/ORM": 0.42415088415145874,
"self_certainty_semantic": -25.625,
"self_certainty_token": -22.1875,
"step": 5
},
{
"completion_length": 60.65625,
"epoch": 0.006644518272425249,
"grad_norm": 0.8221905827522278,
"kl": 0.001674652099609375,
"learning_rate": 9.9625e-07,
"loss": 0.0192068200558424,
"reward": 2.1602972745895386,
"reward_std": 0.23134037852287292,
"rewards/GDino": 0.783700168132782,
"rewards/GIT": 0.452057421207428,
"rewards/HPSv2": 0.274627685546875,
"rewards/ORM": 0.6499120593070984,
"self_certainty_semantic": -25.3125,
"self_certainty_token": -22.0,
"step": 6
},
{
"completion_length": 65.453125,
"epoch": 0.007751937984496124,
"grad_norm": 0.433403879404068,
"kl": 0.0016021728515625,
"learning_rate": 9.956249999999999e-07,
"loss": 0.028950304724276066,
"reward": 1.7097668647766113,
"reward_std": 0.5880981385707855,
"rewards/GDino": 0.5914062708616257,
"rewards/GIT": 0.15753822773694992,
"rewards/HPSv2": 0.25023555755615234,
"rewards/ORM": 0.7105867862701416,
"self_certainty_semantic": -25.5,
"self_certainty_token": -21.75,
"step": 7
},
{
"completion_length": 74.90625,
"epoch": 0.008859357696566999,
"grad_norm": 0.41245806217193604,
"kl": 0.00152587890625,
"learning_rate": 9.95e-07,
"loss": -0.016540683340281248,
"reward": 1.785366177558899,
"reward_std": 0.39637817442417145,
"rewards/GDino": 0.7011832594871521,
"rewards/GIT": 0.3848375529050827,
"rewards/HPSv2": 0.2445659637451172,
"rewards/ORM": 0.45477938652038574,
"self_certainty_semantic": -25.375,
"self_certainty_token": -20.875,
"step": 8
},
{
"completion_length": 61.828125,
"epoch": 0.009966777408637873,
"grad_norm": 0.3924250602722168,
"kl": 0.001617431640625,
"learning_rate": 9.94375e-07,
"loss": 0.03069412149488926,
"reward": 2.0813064575195312,
"reward_std": 0.5435488224029541,
"rewards/GDino": 0.736801415681839,
"rewards/GIT": 0.32275132089853287,
"rewards/HPSv2": 0.26233673095703125,
"rewards/ORM": 0.759416937828064,
"self_certainty_semantic": -25.5625,
"self_certainty_token": -21.1875,
"step": 9
},
{
"completion_length": 62.796875,
"epoch": 0.01107419712070875,
"grad_norm": 0.5886948704719543,
"kl": 0.00164031982421875,
"learning_rate": 9.9375e-07,
"loss": -0.009089878294616938,
"reward": 1.8167259693145752,
"reward_std": 0.4427160769701004,
"rewards/GDino": 0.6997816860675812,
"rewards/GIT": 0.4742187559604645,
"rewards/HPSv2": 0.2480792999267578,
"rewards/ORM": 0.3946462571620941,
"self_certainty_semantic": -25.5,
"self_certainty_token": -21.125,
"step": 10
},
{
"completion_length": 64.09375,
"epoch": 0.012181616832779624,
"grad_norm": 0.6388463377952576,
"kl": 0.0016326904296875,
"learning_rate": 9.93125e-07,
"loss": -0.011163983959704638,
"reward": 2.250586152076721,
"reward_std": 0.29546695202589035,
"rewards/GDino": 0.7932291626930237,
"rewards/GIT": 0.5437096580862999,
"rewards/HPSv2": 0.25614356994628906,
"rewards/ORM": 0.657503753900528,
"self_certainty_semantic": -25.25,
"self_certainty_token": -21.0625,
"step": 11
},
{
"completion_length": 73.265625,
"epoch": 0.013289036544850499,
"grad_norm": 0.37963175773620605,
"kl": 0.001583099365234375,
"learning_rate": 9.925e-07,
"loss": 0.009535952471196651,
"reward": 1.8723560571670532,
"reward_std": 0.48824670910835266,
"rewards/GDino": 0.671429455280304,
"rewards/GIT": 0.4155814051628113,
"rewards/HPSv2": 0.2387409210205078,
"rewards/ORM": 0.5466042459011078,
"self_certainty_semantic": -25.4375,
"self_certainty_token": -21.625,
"step": 12
},
{
"completion_length": 55.015625,
"epoch": 0.014396456256921373,
"grad_norm": 0.5844080448150635,
"kl": 0.001674652099609375,
"learning_rate": 9.91875e-07,
"loss": 0.0034986711107194424,
"reward": 1.7595484256744385,
"reward_std": 0.3697086051106453,
"rewards/GDino": 0.7100214958190918,
"rewards/GIT": 0.26869260519742966,
"rewards/HPSv2": 0.24958419799804688,
"rewards/ORM": 0.53125,
"self_certainty_semantic": -25.25,
"self_certainty_token": -21.375,
"step": 13
},
{
"completion_length": 55.65625,
"epoch": 0.015503875968992248,
"grad_norm": 0.5192797780036926,
"kl": 0.001674652099609375,
"learning_rate": 9.912499999999998e-07,
"loss": 0.010001872200518847,
"reward": 2.201015591621399,
"reward_std": 0.4899330288171768,
"rewards/GDino": 0.8140625059604645,
"rewards/GIT": 0.4328514188528061,
"rewards/HPSv2": 0.2431640625,
"rewards/ORM": 0.7109375,
"self_certainty_semantic": -25.4375,
"self_certainty_token": -22.0,
"step": 14
},
{
"completion_length": 64.0,
"epoch": 0.016611295681063124,
"grad_norm": 0.46844616532325745,
"kl": 0.00174713134765625,
"learning_rate": 9.90625e-07,
"loss": 0.0017675042618066072,
"reward": 2.433342456817627,
"reward_std": 0.33736473321914673,
"rewards/GDino": 0.9153576791286469,
"rewards/GIT": 0.5124611556529999,
"rewards/HPSv2": 0.2507901191711426,
"rewards/ORM": 0.7547334432601929,
"self_certainty_semantic": -25.4375,
"self_certainty_token": -21.6875,
"step": 15
},
{
"completion_length": 53.203125,
"epoch": 0.017718715393133997,
"grad_norm": 0.49579355120658875,
"kl": 0.001758575439453125,
"learning_rate": 9.9e-07,
"loss": 0.003856237977743149,
"reward": 1.6368815302848816,
"reward_std": 0.42226114869117737,
"rewards/GDino": 0.6432631015777588,
"rewards/GIT": 0.2906690910458565,
"rewards/HPSv2": 0.25169944763183594,
"rewards/ORM": 0.45124977827072144,
"self_certainty_semantic": -25.3125,
"self_certainty_token": -20.8125,
"step": 16
},
{
"completion_length": 76.28125,
"epoch": 0.018826135105204873,
"grad_norm": 0.5296036601066589,
"kl": 0.001590728759765625,
"learning_rate": 9.89375e-07,
"loss": -0.003345506265759468,
"reward": 1.7861530184745789,
"reward_std": 0.5057752877473831,
"rewards/GDino": 0.6293700635433197,
"rewards/GIT": 0.2197464406490326,
"rewards/HPSv2": 0.26516151428222656,
"rewards/ORM": 0.671875,
"self_certainty_semantic": -25.5,
"self_certainty_token": -20.6875,
"step": 17
},
{
"completion_length": 58.0625,
"epoch": 0.019933554817275746,
"grad_norm": 0.6577962636947632,
"kl": 0.00174713134765625,
"learning_rate": 9.8875e-07,
"loss": -0.019500677473843098,
"reward": 2.303292393684387,
"reward_std": 0.2609405145049095,
"rewards/GDino": 0.8339102566242218,
"rewards/GIT": 0.5853700041770935,
"rewards/HPSv2": 0.24338722229003906,
"rewards/ORM": 0.640625,
"self_certainty_semantic": -25.1875,
"self_certainty_token": -21.25,
"step": 18
},
{
"completion_length": 50.859375,
"epoch": 0.021040974529346623,
"grad_norm": 0.3543226718902588,
"kl": 0.00182342529296875,
"learning_rate": 9.88125e-07,
"loss": -0.00019507110118865967,
"reward": 1.6344053149223328,
"reward_std": 0.47374215722084045,
"rewards/GDino": 0.705148845911026,
"rewards/GIT": 0.2559727430343628,
"rewards/HPSv2": 0.2541370391845703,
"rewards/ORM": 0.41914665699005127,
"self_certainty_semantic": -25.25,
"self_certainty_token": -22.3125,
"step": 19
},
{
"completion_length": 65.921875,
"epoch": 0.0221483942414175,
"grad_norm": 0.5358290672302246,
"kl": 0.001781463623046875,
"learning_rate": 9.875e-07,
"loss": 0.007933363318443298,
"reward": 1.9504321217536926,
"reward_std": 0.3728322237730026,
"rewards/GDino": 0.6606760025024414,
"rewards/GIT": 0.48046815395355225,
"rewards/HPSv2": 0.24678802490234375,
"rewards/ORM": 0.5625,
"self_certainty_semantic": -25.125,
"self_certainty_token": -21.0625,
"step": 20
},
{
"completion_length": 59.3125,
"epoch": 0.023255813953488372,
"grad_norm": 2.0912797451019287,
"kl": 0.001811981201171875,
"learning_rate": 9.86875e-07,
"loss": -0.004398644436150789,
"reward": 2.252086877822876,
"reward_std": 0.44888848066329956,
"rewards/GDino": 0.798213005065918,
"rewards/GIT": 0.4853799045085907,
"rewards/HPSv2": 0.25956153869628906,
"rewards/ORM": 0.7089323997497559,
"self_certainty_semantic": -25.1875,
"self_certainty_token": -22.75,
"step": 21
},
{
"completion_length": 52.265625,
"epoch": 0.024363233665559248,
"grad_norm": 0.5790585875511169,
"kl": 0.00191497802734375,
"learning_rate": 9.862499999999999e-07,
"loss": 0.006876260507851839,
"reward": 1.9933909177780151,
"reward_std": 0.32367050647735596,
"rewards/GDino": 0.7134387493133545,
"rewards/GIT": 0.41087181866168976,
"rewards/HPSv2": 0.2721214294433594,
"rewards/ORM": 0.5969589203596115,
"self_certainty_semantic": -25.5,
"self_certainty_token": -22.375,
"step": 22
},
{
"completion_length": 59.375,
"epoch": 0.02547065337763012,
"grad_norm": 0.45692723989486694,
"kl": 0.001697540283203125,
"learning_rate": 9.85625e-07,
"loss": -0.00792664848268032,
"reward": 2.015365242958069,
"reward_std": 0.48256243765354156,
"rewards/GDino": 0.724082350730896,
"rewards/GIT": 0.42729710042476654,
"rewards/HPSv2": 0.2667560577392578,
"rewards/ORM": 0.5972296893596649,
"self_certainty_semantic": -25.375,
"self_certainty_token": -20.6875,
"step": 23
},
{
"completion_length": 55.203125,
"epoch": 0.026578073089700997,
"grad_norm": 0.46439889073371887,
"kl": 0.0016937255859375,
"learning_rate": 9.849999999999999e-07,
"loss": 0.0024933242239058018,
"reward": 2.460409939289093,
"reward_std": 0.4443647414445877,
"rewards/GDino": 0.8454739451408386,
"rewards/GIT": 0.6258784532546997,
"rewards/HPSv2": 0.2624950408935547,
"rewards/ORM": 0.7265625,
"self_certainty_semantic": -25.6875,
"self_certainty_token": -21.25,
"step": 24
},
{
"completion_length": 60.15625,
"epoch": 0.02768549280177187,
"grad_norm": 0.47176027297973633,
"kl": 0.001880645751953125,
"learning_rate": 9.84375e-07,
"loss": 0.005812188144773245,
"reward": 2.0174233317375183,
"reward_std": 0.40724658966064453,
"rewards/GDino": 0.7186038792133331,
"rewards/GIT": 0.4156235605478287,
"rewards/HPSv2": 0.26485633850097656,
"rewards/ORM": 0.6183395236730576,
"self_certainty_semantic": -25.5,
"self_certainty_token": -22.0,
"step": 25
},
{
"completion_length": 53.21875,
"epoch": 0.028792912513842746,
"grad_norm": 0.716375470161438,
"kl": 0.00209808349609375,
"learning_rate": 9.8375e-07,
"loss": 0.02397427149116993,
"reward": 2.186239182949066,
"reward_std": 0.46710920333862305,
"rewards/GDino": 0.7593750059604645,
"rewards/GIT": 0.5171153843402863,
"rewards/HPSv2": 0.2734565734863281,
"rewards/ORM": 0.6362921893596649,
"self_certainty_semantic": -25.25,
"self_certainty_token": -23.0,
"step": 26
},
{
"completion_length": 58.421875,
"epoch": 0.029900332225913623,
"grad_norm": 0.428893119096756,
"kl": 0.00171661376953125,
"learning_rate": 9.83125e-07,
"loss": -0.005866332910954952,
"reward": 1.9681838750839233,
"reward_std": 0.3645169883966446,
"rewards/GDino": 0.7666666209697723,
"rewards/GIT": 0.4486802965402603,
"rewards/HPSv2": 0.2419452667236328,
"rewards/ORM": 0.5108915567398071,
"self_certainty_semantic": -25.375,
"self_certainty_token": -21.4375,
"step": 27
},
{
"completion_length": 63.328125,
"epoch": 0.031007751937984496,
"grad_norm": 0.5334203243255615,
"kl": 0.002010345458984375,
"learning_rate": 9.825e-07,
"loss": 0.012586410157382488,
"reward": 1.4134111404418945,
"reward_std": 0.3155324012041092,
"rewards/GDino": 0.6005972325801849,
"rewards/GIT": 0.11092349141836166,
"rewards/HPSv2": 0.2596569061279297,
"rewards/ORM": 0.44223344326019287,
"self_certainty_semantic": -25.3125,
"self_certainty_token": -20.5625,
"step": 28
},
{
"completion_length": 56.25,
"epoch": 0.03211517165005537,
"grad_norm": 0.40832045674324036,
"kl": 0.001819610595703125,
"learning_rate": 9.81875e-07,
"loss": 0.010300841182470322,
"reward": 2.465680956840515,
"reward_std": 0.298002652823925,
"rewards/GDino": 0.862500011920929,
"rewards/GIT": 0.6107669174671173,
"rewards/HPSv2": 0.28375244140625,
"rewards/ORM": 0.7086615860462189,
"self_certainty_semantic": -25.375,
"self_certainty_token": -21.0625,
"step": 29
},
{
"completion_length": 54.953125,
"epoch": 0.03322259136212625,
"grad_norm": 0.4050670266151428,
"kl": 0.002025604248046875,
"learning_rate": 9.8125e-07,
"loss": -0.001845305785536766,
"reward": 2.476737856864929,
"reward_std": 0.3756887763738632,
"rewards/GDino": 0.8967152833938599,
"rewards/GIT": 0.551719531416893,
"rewards/HPSv2": 0.24522781372070312,
"rewards/ORM": 0.7830752730369568,
"self_certainty_semantic": -25.1875,
"self_certainty_token": -21.25,
"step": 30
},
{
"completion_length": 74.84375,
"epoch": 0.03433001107419712,
"grad_norm": 0.7089686393737793,
"kl": 0.001865386962890625,
"learning_rate": 9.806249999999998e-07,
"loss": 0.023707949556410313,
"reward": 1.831493854522705,
"reward_std": 0.37860143184661865,
"rewards/GDino": 0.6287499666213989,
"rewards/GIT": 0.3833145350217819,
"rewards/HPSv2": 0.2413043975830078,
"rewards/ORM": 0.578125,
"self_certainty_semantic": -25.4375,
"self_certainty_token": -20.9375,
"step": 31
},
{
"completion_length": 71.0625,
"epoch": 0.035437430786267994,
"grad_norm": 0.45204266905784607,
"kl": 0.00200653076171875,
"learning_rate": 9.8e-07,
"loss": 0.014695112593472004,
"reward": 1.5279032588005066,
"reward_std": 0.5042913109064102,
"rewards/GDino": 0.6702238023281097,
"rewards/GIT": 0.24817809462547302,
"rewards/HPSv2": 0.2356252670288086,
"rewards/ORM": 0.37387609481811523,
"self_certainty_semantic": -25.375,
"self_certainty_token": -22.5625,
"step": 32
},
{
"completion_length": 59.703125,
"epoch": 0.036544850498338874,
"grad_norm": 0.4359590411186218,
"kl": 0.00201416015625,
"learning_rate": 9.79375e-07,
"loss": 0.00610552029684186,
"reward": 2.3108657598495483,
"reward_std": 0.4415571391582489,
"rewards/GDino": 0.8515625,
"rewards/GIT": 0.6067334115505219,
"rewards/HPSv2": 0.22726917266845703,
"rewards/ORM": 0.6253007054328918,
"self_certainty_semantic": -25.375,
"self_certainty_token": -21.5,
"step": 33
},
{
"completion_length": 58.046875,
"epoch": 0.03765227021040975,
"grad_norm": 0.5853399038314819,
"kl": 0.002033233642578125,
"learning_rate": 9.7875e-07,
"loss": 0.023541483096778393,
"reward": 2.012690246105194,
"reward_std": 0.4660336524248123,
"rewards/GDino": 0.6989582777023315,
"rewards/GIT": 0.40700431168079376,
"rewards/HPSv2": 0.24774932861328125,
"rewards/ORM": 0.6589783728122711,
"self_certainty_semantic": -25.3125,
"self_certainty_token": -21.8125,
"step": 34
},
{
"completion_length": 56.90625,
"epoch": 0.03875968992248062,
"grad_norm": 0.3787715435028076,
"kl": 0.001888275146484375,
"learning_rate": 9.78125e-07,
"loss": 0.003942073322832584,
"reward": 2.452033281326294,
"reward_std": 0.3410096764564514,
"rewards/GDino": 0.8359375298023224,
"rewards/GIT": 0.567652553319931,
"rewards/HPSv2": 0.2418804168701172,
"rewards/ORM": 0.806562751531601,
"self_certainty_semantic": -25.3125,
"self_certainty_token": -21.3125,
"step": 35
},
{
"completion_length": 66.0,
"epoch": 0.03986710963455149,
"grad_norm": 0.5305721163749695,
"kl": 0.005157470703125,
"learning_rate": 9.775e-07,
"loss": -0.003781900042667985,
"reward": 1.8618011474609375,
"reward_std": 0.4120703786611557,
"rewards/GDino": 0.6453125476837158,
"rewards/GIT": 0.4281370937824249,
"rewards/HPSv2": 0.24621009826660156,
"rewards/ORM": 0.5421415567398071,
"self_certainty_semantic": -25.5625,
"self_certainty_token": -20.9375,
"step": 36
},
{
"completion_length": 51.40625,
"epoch": 0.04097452934662237,
"grad_norm": 0.46515390276908875,
"kl": 0.002716064453125,
"learning_rate": 9.76875e-07,
"loss": 0.006902199704200029,
"reward": 1.9485998153686523,
"reward_std": 0.42147715389728546,
"rewards/GDino": 0.6951449513435364,
"rewards/GIT": 0.31057579815387726,
"rewards/HPSv2": 0.26158714294433594,
"rewards/ORM": 0.681291937828064,
"self_certainty_semantic": -25.3125,
"self_certainty_token": -22.1875,
"step": 37
},
{
"completion_length": 71.03125,
"epoch": 0.042081949058693245,
"grad_norm": 0.951810896396637,
"kl": 0.00226593017578125,
"learning_rate": 9.7625e-07,
"loss": 0.03428783547133207,
"reward": 1.9112213850021362,
"reward_std": 0.30633312463760376,
"rewards/GDino": 0.7401995956897736,
"rewards/GIT": 0.30288365483283997,
"rewards/HPSv2": 0.2552833557128906,
"rewards/ORM": 0.6128547042608261,
"self_certainty_semantic": -25.375,
"self_certainty_token": -21.125,
"step": 38
},
{
"completion_length": 67.6875,
"epoch": 0.04318936877076412,
"grad_norm": 0.6357575058937073,
"kl": 0.01482391357421875,
"learning_rate": 9.756249999999999e-07,
"loss": 0.023865018505603075,
"reward": 2.345404624938965,
"reward_std": 0.31367097795009613,
"rewards/GDino": 0.8703815042972565,
"rewards/GIT": 0.4902418553829193,
"rewards/HPSv2": 0.26603126525878906,
"rewards/ORM": 0.71875,
"self_certainty_semantic": -25.5625,
"self_certainty_token": -21.75,
"step": 39
},
{
"completion_length": 59.671875,
"epoch": 0.044296788482835,
"grad_norm": 0.5422465801239014,
"kl": 0.00281524658203125,
"learning_rate": 9.75e-07,
"loss": -0.018710695207118988,
"reward": 2.222834825515747,
"reward_std": 0.42842796444892883,
"rewards/GDino": 0.8634105622768402,
"rewards/GIT": 0.40908148139715195,
"rewards/HPSv2": 0.27498817443847656,
"rewards/ORM": 0.6753546893596649,
"self_certainty_semantic": -25.3125,
"self_certainty_token": -21.125,
"step": 40
},
{
"completion_length": 60.9375,
"epoch": 0.04540420819490587,
"grad_norm": 0.7511593103408813,
"kl": 0.00299072265625,
"learning_rate": 9.743749999999999e-07,
"loss": 0.005782268475741148,
"reward": 1.8980144262313843,
"reward_std": 0.3208035007119179,
"rewards/GDino": 0.6784752607345581,
"rewards/GIT": 0.3914954513311386,
"rewards/HPSv2": 0.24643898010253906,
"rewards/ORM": 0.5816046595573425,
"self_certainty_semantic": -25.375,
"self_certainty_token": -21.25,
"step": 41
},
{
"completion_length": 48.4375,
"epoch": 0.046511627906976744,
"grad_norm": 0.5177002549171448,
"kl": 0.0025177001953125,
"learning_rate": 9.7375e-07,
"loss": 0.045526545494794846,
"reward": 2.269711136817932,
"reward_std": 0.48014624416828156,
"rewards/GDino": 0.8855312466621399,
"rewards/GIT": 0.4437972754240036,
"rewards/HPSv2": 0.2572154998779297,
"rewards/ORM": 0.6831671893596649,
"self_certainty_semantic": -25.3125,
"self_certainty_token": -20.5625,
"step": 42
},
{
"completion_length": 67.875,
"epoch": 0.047619047619047616,
"grad_norm": 0.5885121822357178,
"kl": 0.002044677734375,
"learning_rate": 9.73125e-07,
"loss": 0.013573684729635715,
"reward": 1.6382005214691162,
"reward_std": 0.38919302821159363,
"rewards/GDino": 0.6114583313465118,
"rewards/GIT": 0.3806646466255188,
"rewards/HPSv2": 0.23286819458007812,
"rewards/ORM": 0.41320937871932983,
"self_certainty_semantic": -25.375,
"self_certainty_token": -21.0625,
"step": 43
},
{
"completion_length": 54.4375,
"epoch": 0.048726467331118496,
"grad_norm": 0.40727919340133667,
"kl": 0.0020751953125,
"learning_rate": 9.725e-07,
"loss": -0.01244093757122755,
"reward": 2.8831005096435547,
"reward_std": 0.31665875762701035,
"rewards/GDino": 0.9588541388511658,
"rewards/GIT": 0.7738310992717743,
"rewards/HPSv2": 0.2601909637451172,
"rewards/ORM": 0.8902243673801422,
"self_certainty_semantic": -25.5625,
"self_certainty_token": -21.0625,
"step": 44
},
{
"completion_length": 54.90625,
"epoch": 0.04983388704318937,
"grad_norm": 0.4928445816040039,
"kl": 0.0024566650390625,
"learning_rate": 9.71875e-07,
"loss": 0.00010553281754255295,
"reward": 2.4343937635421753,
"reward_std": 0.5984751731157303,
"rewards/GDino": 0.862500011920929,
"rewards/GIT": 0.5139474421739578,
"rewards/HPSv2": 0.26379966735839844,
"rewards/ORM": 0.7941466569900513,
"self_certainty_semantic": -25.5625,
"self_certainty_token": -21.25,
"step": 45
},
{
"completion_length": 56.078125,
"epoch": 0.05094130675526024,
"grad_norm": 0.37051326036453247,
"kl": 0.00231170654296875,
"learning_rate": 9.712499999999998e-07,
"loss": 0.007893505971878767,
"reward": 1.9575175046920776,
"reward_std": 0.3945648521184921,
"rewards/GDino": 0.5999999940395355,
"rewards/GIT": 0.32395021617412567,
"rewards/HPSv2": 0.26719093322753906,
"rewards/ORM": 0.7663763463497162,
"self_certainty_semantic": -25.1875,
"self_certainty_token": -22.0625,
"step": 46
},
{
"completion_length": 55.171875,
"epoch": 0.05204872646733112,
"grad_norm": 0.8945181369781494,
"kl": 0.0025634765625,
"learning_rate": 9.70625e-07,
"loss": -0.0013387980870902538,
"reward": 1.836871862411499,
"reward_std": 0.23468619585037231,
"rewards/GDino": 0.7209739089012146,
"rewards/GIT": 0.22856376320123672,
"rewards/HPSv2": 0.27921295166015625,
"rewards/ORM": 0.608121246099472,
"self_certainty_semantic": -25.3125,
"self_certainty_token": -21.6875,
"step": 47
},
{
"completion_length": 57.984375,
"epoch": 0.053156146179401995,
"grad_norm": 1.6689982414245605,
"kl": 0.00267791748046875,
"learning_rate": 9.7e-07,
"loss": 0.022647732868790627,
"reward": 1.454766035079956,
"reward_std": 0.40884387493133545,
"rewards/GDino": 0.6050891876220703,
"rewards/GIT": 0.0,
"rewards/HPSv2": 0.2698974609375,
"rewards/ORM": 0.5797793865203857,
"self_certainty_semantic": -25.3125,
"self_certainty_token": -21.625,
"step": 48
},
{
"completion_length": 58.046875,
"epoch": 0.05426356589147287,
"grad_norm": 0.4761441648006439,
"kl": 0.002048492431640625,
"learning_rate": 9.69375e-07,
"loss": 0.016307475278154016,
"reward": 1.9066129326820374,
"reward_std": 0.5319462567567825,
"rewards/GDino": 0.7744874656200409,
"rewards/GIT": 0.2370736114680767,
"rewards/HPSv2": 0.2514495849609375,
"rewards/ORM": 0.6436022371053696,
"self_certainty_semantic": -25.25,
"self_certainty_token": -21.625,
"step": 49
},
{
"completion_length": 61.0,
"epoch": 0.05537098560354374,
"grad_norm": 0.8074173331260681,
"kl": 0.0040283203125,
"learning_rate": 9.6875e-07,
"loss": 0.005913220578804612,
"reward": 2.0915883779525757,
"reward_std": 0.5395111739635468,
"rewards/GDino": 0.7859093248844147,
"rewards/GIT": 0.3929952085018158,
"rewards/HPSv2": 0.25482940673828125,
"rewards/ORM": 0.657854437828064,
"self_certainty_semantic": -25.5625,
"self_certainty_token": -22.25,
"step": 50
},
{
"completion_length": 44.359375,
"epoch": 0.05647840531561462,
"grad_norm": 0.5618427991867065,
"kl": 0.002471923828125,
"learning_rate": 9.68125e-07,
"loss": -0.003945098840631545,
"reward": 1.8058671951293945,
"reward_std": 0.5712144523859024,
"rewards/GDino": 0.7815796732902527,
"rewards/GIT": 0.2604931816458702,
"rewards/HPSv2": 0.27115440368652344,
"rewards/ORM": 0.49263995885849,
"self_certainty_semantic": -25.4375,
"self_certainty_token": -22.625,
"step": 51
},
{
"completion_length": 48.0,
"epoch": 0.05758582502768549,
"grad_norm": 107.57159423828125,
"kl": 26.37615966796875,
"learning_rate": 9.675e-07,
"loss": 0.27801212295889854,
"reward": 2.4165316820144653,
"reward_std": 0.2998274937272072,
"rewards/GDino": 0.9244791567325592,
"rewards/GIT": 0.6574473828077316,
"rewards/HPSv2": 0.2756366729736328,
"rewards/ORM": 0.5589684545993805,
"self_certainty_semantic": -25.4375,
"self_certainty_token": -21.375,
"step": 52
},
{
"completion_length": 52.140625,
"epoch": 0.058693244739756366,
"grad_norm": 0.4408358931541443,
"kl": 0.00232696533203125,
"learning_rate": 9.66875e-07,
"loss": 0.013528472045436502,
"reward": 1.8899905681610107,
"reward_std": 0.4558149725198746,
"rewards/GDino": 0.730059951543808,
"rewards/GIT": 0.39098620414733887,
"rewards/HPSv2": 0.24242782592773438,
"rewards/ORM": 0.5265165567398071,
"self_certainty_semantic": -25.0,
"self_certainty_token": -20.5625,
"step": 53
},
{
"completion_length": 58.390625,
"epoch": 0.059800664451827246,
"grad_norm": 0.48384228348731995,
"kl": 0.00225067138671875,
"learning_rate": 9.6625e-07,
"loss": 0.005568797350861132,
"reward": 1.638724684715271,
"reward_std": 0.41337575018405914,
"rewards/GDino": 0.6137361526489258,
"rewards/GIT": 0.24863167852163315,
"rewards/HPSv2": 0.24831581115722656,
"rewards/ORM": 0.5280410945415497,
"self_certainty_semantic": -25.4375,
"self_certainty_token": -20.9375,
"step": 54
},
{
"completion_length": 50.234375,
"epoch": 0.06090808416389812,
"grad_norm": 0.46963369846343994,
"kl": 0.0026397705078125,
"learning_rate": 9.65625e-07,
"loss": 0.009267964400351048,
"reward": 1.7191376686096191,
"reward_std": 0.521537572145462,
"rewards/GDino": 0.7086881995201111,
"rewards/GIT": 0.3270767852663994,
"rewards/HPSv2": 0.2678356170654297,
"rewards/ORM": 0.4155370891094208,
"self_certainty_semantic": -25.5,
"self_certainty_token": -21.6875,
"step": 55
},
{
"completion_length": 59.953125,
"epoch": 0.06201550387596899,
"grad_norm": 0.6913841366767883,
"kl": 0.0024261474609375,
"learning_rate": 9.649999999999999e-07,
"loss": 0.03414425998926163,
"reward": 1.9336698055267334,
"reward_std": 0.45749759674072266,
"rewards/GDino": 0.6963726580142975,
"rewards/GIT": 0.38425514101982117,
"rewards/HPSv2": 0.2471466064453125,
"rewards/ORM": 0.6058953106403351,
"self_certainty_semantic": -25.0625,
"self_certainty_token": -21.875,
"step": 56
},
{
"completion_length": 50.765625,
"epoch": 0.06312292358803986,
"grad_norm": 0.5066769123077393,
"kl": 0.002532958984375,
"learning_rate": 9.64375e-07,
"loss": 0.009842937346547842,
"reward": 1.8338811993598938,
"reward_std": 0.3951306492090225,
"rewards/GDino": 0.7909577786922455,
"rewards/GIT": 0.24781160056591034,
"rewards/HPSv2": 0.2509651184082031,
"rewards/ORM": 0.5441466569900513,
"self_certainty_semantic": -25.5,
"self_certainty_token": -21.4375,
"step": 57
},
{
"completion_length": 52.9375,
"epoch": 0.06423034330011074,
"grad_norm": 0.37791869044303894,
"kl": 0.002685546875,
"learning_rate": 9.637499999999999e-07,
"loss": 0.024126023054122925,
"reward": 1.8852884769439697,
"reward_std": 0.46756890416145325,
"rewards/GDino": 0.732811689376831,
"rewards/GIT": 0.38145140558481216,
"rewards/HPSv2": 0.2541465759277344,
"rewards/ORM": 0.5168787688016891,
"self_certainty_semantic": -25.4375,
"self_certainty_token": -21.25,
"step": 58
},
{
"completion_length": 47.234375,
"epoch": 0.06533776301218161,
"grad_norm": 0.7410405278205872,
"kl": 0.0026092529296875,
"learning_rate": 9.63125e-07,
"loss": -0.01674468442797661,
"reward": 2.3462648391723633,
"reward_std": 0.2433818019926548,
"rewards/GDino": 0.8425607979297638,
"rewards/GIT": 0.46571947634220123,
"rewards/HPSv2": 0.2664222717285156,
"rewards/ORM": 0.771562248468399,
"self_certainty_semantic": -25.375,
"self_certainty_token": -21.25,
"step": 59
},
{
"completion_length": 45.015625,
"epoch": 0.0664451827242525,
"grad_norm": 0.5326105952262878,
"kl": 0.0026397705078125,
"learning_rate": 9.624999999999999e-07,
"loss": 0.003804182168096304,
"reward": 2.036432147026062,
"reward_std": 0.3990803211927414,
"rewards/GDino": 0.8798050284385681,
"rewards/GIT": 0.4744318723678589,
"rewards/HPSv2": 0.238006591796875,
"rewards/ORM": 0.44418865442276,
"self_certainty_semantic": -25.3125,
"self_certainty_token": -20.6875,
"step": 60
},
{
"completion_length": 65.0,
"epoch": 0.06755260243632337,
"grad_norm": 0.5713196396827698,
"kl": 0.00235748291015625,
"learning_rate": 9.61875e-07,
"loss": 0.04368375800549984,
"reward": 2.1398236751556396,
"reward_std": 0.3530130609869957,
"rewards/GDino": 0.7138020694255829,
"rewards/GIT": 0.644903838634491,
"rewards/HPSv2": 0.2529468536376953,
"rewards/ORM": 0.5281709432601929,
"self_certainty_semantic": -25.25,
"self_certainty_token": -20.6875,
"step": 61
},
{
"completion_length": 54.9375,
"epoch": 0.06866002214839424,
"grad_norm": 5.612445831298828,
"kl": 0.00339508056640625,
"learning_rate": 9.6125e-07,
"loss": 0.008875304833054543,
"reward": 2.497900605201721,
"reward_std": 0.41675496101379395,
"rewards/GDino": 0.872697502374649,
"rewards/GIT": 0.601748138666153,
"rewards/HPSv2": 0.2640380859375,
"rewards/ORM": 0.759416937828064,
"self_certainty_semantic": -25.3125,
"self_certainty_token": -20.875,
"step": 62
},
{
"completion_length": 49.734375,
"epoch": 0.06976744186046512,
"grad_norm": 0.5861217379570007,
"kl": 0.003326416015625,
"learning_rate": 9.606249999999998e-07,
"loss": 0.01025251136161387,
"reward": 2.2640050053596497,
"reward_std": 0.48744213581085205,
"rewards/GDino": 0.8172852694988251,
"rewards/GIT": 0.44742196798324585,
"rewards/HPSv2": 0.2430896759033203,
"rewards/ORM": 0.7562080323696136,
"self_certainty_semantic": -25.125,
"self_certainty_token": -22.1875,
"step": 63
},
{
"completion_length": 64.375,
"epoch": 0.07087486157253599,
"grad_norm": 0.39266109466552734,
"kl": 0.00298309326171875,
"learning_rate": 9.6e-07,
"loss": -0.005469436291605234,
"reward": 1.6910768747329712,
"reward_std": 0.2151722088456154,
"rewards/GDino": 0.7097718715667725,
"rewards/GIT": 0.32366037368774414,
"rewards/HPSv2": 0.2576026916503906,
"rewards/ORM": 0.40004195272922516,
"self_certainty_semantic": -25.5,
"self_certainty_token": -21.9375,
"step": 64
},
{
"completion_length": 61.515625,
"epoch": 0.07198228128460686,
"grad_norm": 0.705937922000885,
"kl": 0.002685546875,
"learning_rate": 9.59375e-07,
"loss": 0.010601098649203777,
"reward": 2.128853142261505,
"reward_std": 0.4351096749305725,
"rewards/GDino": 0.7197916805744171,
"rewards/GIT": 0.6168824732303619,
"rewards/HPSv2": 0.23163414001464844,
"rewards/ORM": 0.5605448335409164,
"self_certainty_semantic": -25.25,
"self_certainty_token": -22.75,
"step": 65
},
{
"completion_length": 49.0,
"epoch": 0.07308970099667775,
"grad_norm": 0.4427480101585388,
"kl": 0.002899169921875,
"learning_rate": 9.5875e-07,
"loss": 0.02646360918879509,
"reward": 2.1654986143112183,
"reward_std": 0.37753987312316895,
"rewards/GDino": 0.6895833611488342,
"rewards/GIT": 0.48387444019317627,
"rewards/HPSv2": 0.2579364776611328,
"rewards/ORM": 0.7341042160987854,
"self_certainty_semantic": -25.25,
"self_certainty_token": -21.5625,
"step": 66
},
{
"completion_length": 63.140625,
"epoch": 0.07419712070874862,
"grad_norm": 0.7619237899780273,
"kl": 0.00284576416015625,
"learning_rate": 9.58125e-07,
"loss": 0.026691121514886618,
"reward": 2.3450592160224915,
"reward_std": 0.2740027904510498,
"rewards/GDino": 0.8025760054588318,
"rewards/GIT": 0.5677543580532074,
"rewards/HPSv2": 0.2594585418701172,
"rewards/ORM": 0.7152703106403351,
"self_certainty_semantic": -25.25,
"self_certainty_token": -21.875,
"step": 67
},
{
"completion_length": 50.171875,
"epoch": 0.0753045404208195,
"grad_norm": 0.4760603904724121,
"kl": 0.0030517578125,
"learning_rate": 9.575e-07,
"loss": 0.022392848506569862,
"reward": 1.6361079216003418,
"reward_std": 0.33574268221855164,
"rewards/GDino": 0.6061920523643494,
"rewards/GIT": 0.31722745299339294,
"rewards/HPSv2": 0.2595634460449219,
"rewards/ORM": 0.453125,
"self_certainty_semantic": -25.0625,
"self_certainty_token": -21.5,
"step": 68
},
{
"completion_length": 55.046875,
"epoch": 0.07641196013289037,
"grad_norm": 0.5907943248748779,
"kl": 0.00336456298828125,
"learning_rate": 9.56875e-07,
"loss": -0.0030646873638033867,
"reward": 2.119426429271698,
"reward_std": 0.298831045627594,
"rewards/GDino": 0.8028125166893005,
"rewards/GIT": 0.3893257826566696,
"rewards/HPSv2": 0.2710380554199219,
"rewards/ORM": 0.65625,
"self_certainty_semantic": -25.4375,
"self_certainty_token": -21.4375,
"step": 69
},
{
"completion_length": 51.140625,
"epoch": 0.07751937984496124,
"grad_norm": 0.47751373052597046,
"kl": 0.00359344482421875,
"learning_rate": 9.5625e-07,
"loss": -0.011344656813889742,
"reward": 1.4646188020706177,
"reward_std": 0.5817874372005463,
"rewards/GDino": 0.5935695767402649,
"rewards/GIT": 0.23897356167435646,
"rewards/HPSv2": 0.25234222412109375,
"rewards/ORM": 0.37973344326019287,
"self_certainty_semantic": -24.875,
"self_certainty_token": -20.9375,
"step": 70
},
{
"completion_length": 55.609375,
"epoch": 0.07862679955703211,
"grad_norm": 0.5281980633735657,
"kl": 0.0030364990234375,
"learning_rate": 9.556249999999999e-07,
"loss": -0.023217559792101383,
"reward": 1.856022596359253,
"reward_std": 0.4435942769050598,
"rewards/GDino": 0.6947268545627594,
"rewards/GIT": 0.28702250868082047,
"rewards/HPSv2": 0.26489830017089844,
"rewards/ORM": 0.609375,
"self_certainty_semantic": -25.375,
"self_certainty_token": -20.5,
"step": 71
},
{
"completion_length": 42.390625,
"epoch": 0.07973421926910298,
"grad_norm": 0.4538242518901825,
"kl": 0.002960205078125,
"learning_rate": 9.55e-07,
"loss": 0.016265914775431156,
"reward": 1.911847174167633,
"reward_std": 0.4016146659851074,
"rewards/GDino": 0.6731474995613098,
"rewards/GIT": 0.46439771354198456,
"rewards/HPSv2": 0.2497406005859375,
"rewards/ORM": 0.524561420083046,
"self_certainty_semantic": -25.0,
"self_certainty_token": -20.875,
"step": 72
},
{
"completion_length": 53.28125,
"epoch": 0.08084163898117387,
"grad_norm": 0.5773823261260986,
"kl": 0.00330352783203125,
"learning_rate": 9.543749999999999e-07,
"loss": -0.0016377167776226997,
"reward": 2.114488363265991,
"reward_std": 0.44427454471588135,
"rewards/GDino": 0.8240922689437866,
"rewards/GIT": 0.4950668513774872,
"rewards/HPSv2": 0.24412155151367188,
"rewards/ORM": 0.5512077808380127,
"self_certainty_semantic": -25.375,
"self_certainty_token": -21.0,
"step": 73
},
{
"completion_length": 56.296875,
"epoch": 0.08194905869324474,
"grad_norm": 0.43449509143829346,
"kl": 0.0035247802734375,
"learning_rate": 9.5375e-07,
"loss": 0.03005522396415472,
"reward": 2.32301664352417,
"reward_std": 0.22542773187160492,
"rewards/GDino": 0.864062488079071,
"rewards/GIT": 0.5282620340585709,
"rewards/HPSv2": 0.25408363342285156,
"rewards/ORM": 0.6766084730625153,
"self_certainty_semantic": -25.25,
"self_certainty_token": -22.25,
"step": 74
},
{
"completion_length": 67.6875,
"epoch": 0.08305647840531562,
"grad_norm": 0.4218258857727051,
"kl": 0.0028228759765625,
"learning_rate": 9.53125e-07,
"loss": 0.015081442426890135,
"reward": 1.7625158429145813,
"reward_std": 0.4334114193916321,
"rewards/GDino": 0.6663236618041992,
"rewards/GIT": 0.26877461373806,
"rewards/HPSv2": 0.2647876739501953,
"rewards/ORM": 0.5626298785209656,
"self_certainty_semantic": -25.1875,
"self_certainty_token": -21.125,
"step": 75
},
{
"completion_length": 62.15625,
"epoch": 0.08416389811738649,
"grad_norm": 0.45278123021125793,
"kl": 0.00312042236328125,
"learning_rate": 9.525e-07,
"loss": 0.01650754688307643,
"reward": 2.2938578128814697,
"reward_std": 0.5077499151229858,
"rewards/GDino": 0.7734375,
"rewards/GIT": 0.6401466727256775,
"rewards/HPSv2": 0.2568778991699219,
"rewards/ORM": 0.6233955323696136,
"self_certainty_semantic": -25.4375,
"self_certainty_token": -21.125,
"step": 76
},
{
"completion_length": 50.984375,
"epoch": 0.08527131782945736,
"grad_norm": 0.5513558387756348,
"kl": 0.004730224609375,
"learning_rate": 9.51875e-07,
"loss": -0.008258584188297391,
"reward": 1.6354877948760986,
"reward_std": 0.5420883148908615,
"rewards/GDino": 0.643737405538559,
"rewards/GIT": 0.20579323172569275,
"rewards/HPSv2": 0.2405567169189453,
"rewards/ORM": 0.5454003810882568,
"self_certainty_semantic": -25.0625,
"self_certainty_token": -22.1875,
"step": 77
},
{
"completion_length": 56.390625,
"epoch": 0.08637873754152824,
"grad_norm": 0.9578920602798462,
"kl": 0.00360107421875,
"learning_rate": 9.5125e-07,
"loss": 0.0016261041164398193,
"reward": 2.061507523059845,
"reward_std": 0.2758500352501869,
"rewards/GDino": 0.7561410367488861,
"rewards/GIT": 0.33666322380304337,
"rewards/HPSv2": 0.2762489318847656,
"rewards/ORM": 0.6924542784690857,
"self_certainty_semantic": -25.3125,
"self_certainty_token": -21.1875,
"step": 78
},
{
"completion_length": 57.375,
"epoch": 0.08748615725359911,
"grad_norm": 0.46459418535232544,
"kl": 0.004241943359375,
"learning_rate": 9.50625e-07,
"loss": -0.019409675151109695,
"reward": 2.298323154449463,
"reward_std": 0.22066934406757355,
"rewards/GDino": 0.8136925399303436,
"rewards/GIT": 0.6333461850881577,
"rewards/HPSv2": 0.27008056640625,
"rewards/ORM": 0.5812040567398071,
"self_certainty_semantic": -25.3125,
"self_certainty_token": -21.875,
"step": 79
},
{
"completion_length": 60.5625,
"epoch": 0.08859357696567,
"grad_norm": 0.4274587631225586,
"kl": 0.004058837890625,
"learning_rate": 9.499999999999999e-07,
"loss": 0.013256619684398174,
"reward": 1.6786987781524658,
"reward_std": 0.3984425514936447,
"rewards/GDino": 0.6007516384124756,
"rewards/GIT": 0.18326736986637115,
"rewards/HPSv2": 0.2720355987548828,
"rewards/ORM": 0.6226442158222198,
"self_certainty_semantic": -25.5,
"self_certainty_token": -21.3125,
"step": 80
},
{
"completion_length": 58.03125,
"epoch": 0.08970099667774087,
"grad_norm": 0.9172859191894531,
"kl": 0.00426483154296875,
"learning_rate": 9.493749999999999e-07,
"loss": 0.003496276680380106,
"reward": 2.106017231941223,
"reward_std": 0.30050399899482727,
"rewards/GDino": 0.7440759837627411,
"rewards/GIT": 0.3581302911043167,
"rewards/HPSv2": 0.27126121520996094,
"rewards/ORM": 0.7325496971607208,
"self_certainty_semantic": -25.3125,
"self_certainty_token": -20.875,
"step": 81
},
{
"completion_length": 49.5625,
"epoch": 0.09080841638981174,
"grad_norm": 0.4841405153274536,
"kl": 0.00412750244140625,
"learning_rate": 9.487499999999999e-07,
"loss": 0.025506282225251198,
"reward": 1.6879253387451172,
"reward_std": 0.42353254556655884,
"rewards/GDino": 0.6098452508449554,
"rewards/GIT": 0.38033944368362427,
"rewards/HPSv2": 0.2658271789550781,
"rewards/ORM": 0.43191343545913696,
"self_certainty_semantic": -25.4375,
"self_certainty_token": -20.9375,
"step": 82
},
{
"completion_length": 48.328125,
"epoch": 0.09191583610188261,
"grad_norm": 0.492243230342865,
"kl": 0.00345611572265625,
"learning_rate": 9.481249999999999e-07,
"loss": -0.0034960508346557617,
"reward": 2.1111596822738647,
"reward_std": 0.41540510952472687,
"rewards/GDino": 0.7717877924442291,
"rewards/GIT": 0.4860316216945648,
"rewards/HPSv2": 0.2670021057128906,
"rewards/ORM": 0.5863381326198578,
"self_certainty_semantic": -25.25,
"self_certainty_token": -21.4375,
"step": 83
},
{
"completion_length": 66.3125,
"epoch": 0.09302325581395349,
"grad_norm": 0.5617808699607849,
"kl": 0.004180908203125,
"learning_rate": 9.474999999999999e-07,
"loss": 0.003248518332839012,
"reward": 2.094790816307068,
"reward_std": 0.3879907354712486,
"rewards/GDino": 0.7973622679710388,
"rewards/GIT": 0.632976621389389,
"rewards/HPSv2": 0.24137306213378906,
"rewards/ORM": 0.4230788052082062,
"self_certainty_semantic": -25.25,
"self_certainty_token": -21.1875,
"step": 84
},
{
"completion_length": 51.40625,
"epoch": 0.09413067552602436,
"grad_norm": 0.5695884823799133,
"kl": 0.003204345703125,
"learning_rate": 9.468749999999999e-07,
"loss": 0.012543351389467716,
"reward": 1.8675293326377869,
"reward_std": 0.4282868355512619,
"rewards/GDino": 0.6550000011920929,
"rewards/GIT": 0.33260630816221237,
"rewards/HPSv2": 0.24515533447265625,
"rewards/ORM": 0.6347676515579224,
"self_certainty_semantic": -25.0625,
"self_certainty_token": -21.375,
"step": 85
},
{
"completion_length": 48.296875,
"epoch": 0.09523809523809523,
"grad_norm": 0.46590158343315125,
"kl": 0.00469970703125,
"learning_rate": 9.462499999999999e-07,
"loss": 0.00347991194576025,
"reward": 2.2731298208236694,
"reward_std": 0.383390873670578,
"rewards/GDino": 0.8246111273765564,
"rewards/GIT": 0.33447980135679245,
"rewards/HPSv2": 0.29212188720703125,
"rewards/ORM": 0.821916937828064,
"self_certainty_semantic": -25.375,
"self_certainty_token": -22.0,
"step": 86
},
{
"completion_length": 54.390625,
"epoch": 0.09634551495016612,
"grad_norm": 0.5397853255271912,
"kl": 0.004425048828125,
"learning_rate": 9.45625e-07,
"loss": 0.008617566898465157,
"reward": 2.2459940314292908,
"reward_std": 0.4676859378814697,
"rewards/GDino": 0.7356771230697632,
"rewards/GIT": 0.46453191339969635,
"rewards/HPSv2": 0.26766395568847656,
"rewards/ORM": 0.7781210243701935,
"self_certainty_semantic": -25.25,
"self_certainty_token": -21.0,
"step": 87
},
{
"completion_length": 42.15625,
"epoch": 0.09745293466223699,
"grad_norm": 0.48280662298202515,
"kl": 0.00406646728515625,
"learning_rate": 9.45e-07,
"loss": 0.016791983507573605,
"reward": 2.1528985500335693,
"reward_std": 0.44025059044361115,
"rewards/GDino": 0.7985424101352692,
"rewards/GIT": 0.47699007391929626,
"rewards/HPSv2": 0.2789325714111328,
"rewards/ORM": 0.5984334945678711,
"self_certainty_semantic": -25.375,
"self_certainty_token": -20.875,
"step": 88
},
{
"completion_length": 48.25,
"epoch": 0.09856035437430787,
"grad_norm": 0.4512772560119629,
"kl": 0.00405120849609375,
"learning_rate": 9.44375e-07,
"loss": -0.009609811007976532,
"reward": 2.155352771282196,
"reward_std": 0.3193782642483711,
"rewards/GDino": 0.7525902688503265,
"rewards/GIT": 0.4481022357940674,
"rewards/HPSv2": 0.2619743347167969,
"rewards/ORM": 0.6926859021186829,
"self_certainty_semantic": -25.375,
"self_certainty_token": -21.125,
"step": 89
},
{
"completion_length": 47.78125,
"epoch": 0.09966777408637874,
"grad_norm": 0.5204576849937439,
"kl": 0.004425048828125,
"learning_rate": 9.4375e-07,
"loss": -0.017570611089468002,
"reward": 2.3318194150924683,
"reward_std": 0.3641355484724045,
"rewards/GDino": 0.854687511920929,
"rewards/GIT": 0.6271218061447144,
"rewards/HPSv2": 0.26286888122558594,
"rewards/ORM": 0.5871412754058838,
"self_certainty_semantic": -25.5,
"self_certainty_token": -21.4375,
"step": 90
},
{
"completion_length": 54.9375,
"epoch": 0.10077519379844961,
"grad_norm": 0.7515896558761597,
"kl": 0.0042266845703125,
"learning_rate": 9.43125e-07,
"loss": 0.022024651989340782,
"reward": 1.7255874276161194,
"reward_std": 0.3924099802970886,
"rewards/GDino": 0.6800954043865204,
"rewards/GIT": 0.41760827600955963,
"rewards/HPSv2": 0.22957611083984375,
"rewards/ORM": 0.3983076214790344,
"self_certainty_semantic": -24.875,
"self_certainty_token": -21.125,
"step": 91
},
{
"completion_length": 51.90625,
"epoch": 0.10188261351052048,
"grad_norm": 0.6844750046730042,
"kl": 0.004730224609375,
"learning_rate": 9.425e-07,
"loss": 0.017017286270856857,
"reward": 1.7472361326217651,
"reward_std": 0.49342362582683563,
"rewards/GDino": 0.7615998685359955,
"rewards/GIT": 0.3799494504928589,
"rewards/HPSv2": 0.2450580596923828,
"rewards/ORM": 0.36062875390052795,
"self_certainty_semantic": -25.5,
"self_certainty_token": -21.0625,
"step": 92
},
{
"completion_length": 52.5625,
"epoch": 0.10299003322259136,
"grad_norm": 0.476144403219223,
"kl": 0.004547119140625,
"learning_rate": 9.41875e-07,
"loss": -0.006627652794122696,
"reward": 2.3529324531555176,
"reward_std": 0.38789400458335876,
"rewards/GDino": 0.8122400343418121,
"rewards/GIT": 0.40920257568359375,
"rewards/HPSv2": 0.25894737243652344,
"rewards/ORM": 0.8725424408912659,
"self_certainty_semantic": -25.1875,
"self_certainty_token": -20.75,
"step": 93
},
{
"completion_length": 45.234375,
"epoch": 0.10409745293466224,
"grad_norm": 0.4303518235683441,
"kl": 0.00390625,
"learning_rate": 9.4125e-07,
"loss": 0.002329372800886631,
"reward": 2.063507556915283,
"reward_std": 0.4875355362892151,
"rewards/GDino": 0.8157378733158112,
"rewards/GIT": 0.2162991166114807,
"rewards/HPSv2": 0.2860240936279297,
"rewards/ORM": 0.7454463839530945,
"self_certainty_semantic": -25.3125,
"self_certainty_token": -22.25,
"step": 94
},
{
"completion_length": 54.21875,
"epoch": 0.10520487264673312,
"grad_norm": 0.9745371341705322,
"kl": 0.004852294921875,
"learning_rate": 9.40625e-07,
"loss": 0.015892890747636557,
"reward": 2.4900766611099243,
"reward_std": 0.33158986270427704,
"rewards/GDino": 0.9456690549850464,
"rewards/GIT": 0.7110534906387329,
"rewards/HPSv2": 0.2568836212158203,
"rewards/ORM": 0.5764705836772919,
"self_certainty_semantic": -25.625,
"self_certainty_token": -21.3125,
"step": 95
},
{
"completion_length": 62.953125,
"epoch": 0.10631229235880399,
"grad_norm": 1.6108874082565308,
"kl": 0.00475311279296875,
"learning_rate": 9.399999999999999e-07,
"loss": 0.012537557166069746,
"reward": 2.4274561405181885,
"reward_std": 0.3028244078159332,
"rewards/GDino": 0.9155160486698151,
"rewards/GIT": 0.6933247745037079,
"rewards/HPSv2": 0.25919437408447266,
"rewards/ORM": 0.5594209432601929,
"self_certainty_semantic": -25.375,
"self_certainty_token": -20.3125,
"step": 96
},
{
"completion_length": 44.890625,
"epoch": 0.10741971207087486,
"grad_norm": 0.42777886986732483,
"kl": 0.0064697265625,
"learning_rate": 9.393749999999999e-07,
"loss": 0.006582918576896191,
"reward": 1.7229499220848083,
"reward_std": 0.29571742564439774,
"rewards/GDino": 0.6976552903652191,
"rewards/GIT": 0.17514611035585403,
"rewards/HPSv2": 0.2757740020751953,
"rewards/ORM": 0.5743745565414429,
"self_certainty_semantic": -25.25,
"self_certainty_token": -21.5,
"step": 97
},
{
"completion_length": 52.78125,
"epoch": 0.10852713178294573,
"grad_norm": 0.4346785247325897,
"kl": 0.00446319580078125,
"learning_rate": 9.387499999999999e-07,
"loss": 0.010664775501936674,
"reward": 1.9896260499954224,
"reward_std": 0.5384568274021149,
"rewards/GDino": 0.7534899115562439,
"rewards/GIT": 0.416723370552063,
"rewards/HPSv2": 0.25490760803222656,
"rewards/ORM": 0.5645051300525665,
"self_certainty_semantic": -25.25,
"self_certainty_token": -21.4375,
"step": 98
},
{
"completion_length": 53.546875,
"epoch": 0.10963455149501661,
"grad_norm": 0.4226502478122711,
"kl": 0.00543212890625,
"learning_rate": 9.381249999999999e-07,
"loss": -0.009754271944984794,
"reward": 2.1711018085479736,
"reward_std": 0.3036491945385933,
"rewards/GDino": 0.8239583373069763,
"rewards/GIT": 0.6844146698713303,
"rewards/HPSv2": 0.24951934814453125,
"rewards/ORM": 0.41320937871932983,
"self_certainty_semantic": -25.25,
"self_certainty_token": -22.0,
"step": 99
},
{
"completion_length": 48.0625,
"epoch": 0.11074197120708748,
"grad_norm": 0.4250389039516449,
"kl": 0.00537109375,
"learning_rate": 9.374999999999999e-07,
"loss": -0.014408082235604525,
"reward": 1.9375371932983398,
"reward_std": 0.4484590142965317,
"rewards/GDino": 0.6897697150707245,
"rewards/GIT": 0.4094943553209305,
"rewards/HPSv2": 0.2472515106201172,
"rewards/ORM": 0.5910216569900513,
"self_certainty_semantic": -25.25,
"self_certainty_token": -21.4375,
"step": 100
}
],
"logging_steps": 1.0,
"max_steps": 1600,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}