diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,10233 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.6644518272425249, + "eval_steps": 500, + "global_step": 600, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "completion_length": 73.265625, + "epoch": 0.0011074197120708748, + "grad_norm": 0.5076314806938171, + "kl": 0.0, + "learning_rate": 9.99375e-07, + "loss": -0.018259915290400386, + "reward": 2.2648561000823975, + "reward_std": 0.32521533221006393, + "rewards/GDino": 0.84943026304245, + "rewards/GIT": 0.5776679813861847, + "rewards/HPSv2": 0.2639656066894531, + "rewards/ORM": 0.5737921893596649, + "self_certainty_semantic": -25.4375, + "self_certainty_token": -22.0, + "step": 1 + }, + { + "completion_length": 56.0, + "epoch": 0.0022148394241417496, + "grad_norm": 0.5364330410957336, + "kl": 0.001522064208984375, + "learning_rate": 9.9875e-07, + "loss": 0.00348748016403988, + "reward": 1.7680926322937012, + "reward_std": 0.41801488399505615, + "rewards/GDino": 0.6529064476490021, + "rewards/GIT": 0.19494981318712234, + "rewards/HPSv2": 0.24983596801757812, + "rewards/ORM": 0.6704004406929016, + "self_certainty_semantic": -25.4375, + "self_certainty_token": -21.0, + "step": 2 + }, + { + "completion_length": 55.4375, + "epoch": 0.0033222591362126247, + "grad_norm": 0.5614722967147827, + "kl": 0.001556396484375, + "learning_rate": 9.98125e-07, + "loss": 0.01565772108733654, + "reward": 1.6570448875427246, + "reward_std": 0.3965621292591095, + "rewards/GDino": 0.6382372081279755, + "rewards/GIT": 0.37795570492744446, + "rewards/HPSv2": 0.24709796905517578, + "rewards/ORM": 0.3937540017068386, + "self_certainty_semantic": -25.1875, + "self_certainty_token": -20.9375, + "step": 3 + }, + { + "completion_length": 65.34375, + "epoch": 0.004429678848283499, + "grad_norm": 2.5736770629882812, + "kl": 0.0016021728515625, + "learning_rate": 9.975e-07, + "loss": -0.0012893765233457088, + "reward": 2.061529755592346, + "reward_std": 0.4106704443693161, + "rewards/GDino": 0.7796730995178223, + "rewards/GIT": 0.43717896938323975, + "rewards/HPSv2": 0.24744796752929688, + "rewards/ORM": 0.5972296595573425, + "self_certainty_semantic": -25.5, + "self_certainty_token": -22.0, + "step": 4 + }, + { + "completion_length": 63.578125, + "epoch": 0.005537098560354375, + "grad_norm": 0.48238250613212585, + "kl": 0.001575469970703125, + "learning_rate": 9.968749999999999e-07, + "loss": 0.020129199139773846, + "reward": 1.5302643775939941, + "reward_std": 0.44902199506759644, + "rewards/GDino": 0.6246840953826904, + "rewards/GIT": 0.23608428239822388, + "rewards/HPSv2": 0.2453451156616211, + "rewards/ORM": 0.42415088415145874, + "self_certainty_semantic": -25.625, + "self_certainty_token": -22.1875, + "step": 5 + }, + { + "completion_length": 60.65625, + "epoch": 0.006644518272425249, + "grad_norm": 0.8221905827522278, + "kl": 0.001674652099609375, + "learning_rate": 9.9625e-07, + "loss": 0.0192068200558424, + "reward": 2.1602972745895386, + "reward_std": 0.23134037852287292, + "rewards/GDino": 0.783700168132782, + "rewards/GIT": 0.452057421207428, + "rewards/HPSv2": 0.274627685546875, + "rewards/ORM": 0.6499120593070984, + "self_certainty_semantic": -25.3125, + "self_certainty_token": -22.0, + "step": 6 + }, + { + "completion_length": 65.453125, + "epoch": 0.007751937984496124, + "grad_norm": 0.433403879404068, + "kl": 0.0016021728515625, + "learning_rate": 9.956249999999999e-07, + "loss": 0.028950304724276066, + "reward": 1.7097668647766113, + "reward_std": 0.5880981385707855, + "rewards/GDino": 0.5914062708616257, + "rewards/GIT": 0.15753822773694992, + "rewards/HPSv2": 0.25023555755615234, + "rewards/ORM": 0.7105867862701416, + "self_certainty_semantic": -25.5, + "self_certainty_token": -21.75, + "step": 7 + }, + { + "completion_length": 74.90625, + "epoch": 0.008859357696566999, + "grad_norm": 0.41245806217193604, + "kl": 0.00152587890625, + "learning_rate": 9.95e-07, + "loss": -0.016540683340281248, + "reward": 1.785366177558899, + "reward_std": 0.39637817442417145, + "rewards/GDino": 0.7011832594871521, + "rewards/GIT": 0.3848375529050827, + "rewards/HPSv2": 0.2445659637451172, + "rewards/ORM": 0.45477938652038574, + "self_certainty_semantic": -25.375, + "self_certainty_token": -20.875, + "step": 8 + }, + { + "completion_length": 61.828125, + "epoch": 0.009966777408637873, + "grad_norm": 0.3924250602722168, + "kl": 0.001617431640625, + "learning_rate": 9.94375e-07, + "loss": 0.03069412149488926, + "reward": 2.0813064575195312, + "reward_std": 0.5435488224029541, + "rewards/GDino": 0.736801415681839, + "rewards/GIT": 0.32275132089853287, + "rewards/HPSv2": 0.26233673095703125, + "rewards/ORM": 0.759416937828064, + "self_certainty_semantic": -25.5625, + "self_certainty_token": -21.1875, + "step": 9 + }, + { + "completion_length": 62.796875, + "epoch": 0.01107419712070875, + "grad_norm": 0.5886948704719543, + "kl": 0.00164031982421875, + "learning_rate": 9.9375e-07, + "loss": -0.009089878294616938, + "reward": 1.8167259693145752, + "reward_std": 0.4427160769701004, + "rewards/GDino": 0.6997816860675812, + "rewards/GIT": 0.4742187559604645, + "rewards/HPSv2": 0.2480792999267578, + "rewards/ORM": 0.3946462571620941, + "self_certainty_semantic": -25.5, + "self_certainty_token": -21.125, + "step": 10 + }, + { + "completion_length": 64.09375, + "epoch": 0.012181616832779624, + "grad_norm": 0.6388463377952576, + "kl": 0.0016326904296875, + "learning_rate": 9.93125e-07, + "loss": -0.011163983959704638, + "reward": 2.250586152076721, + "reward_std": 0.29546695202589035, + "rewards/GDino": 0.7932291626930237, + "rewards/GIT": 0.5437096580862999, + "rewards/HPSv2": 0.25614356994628906, + "rewards/ORM": 0.657503753900528, + "self_certainty_semantic": -25.25, + "self_certainty_token": -21.0625, + "step": 11 + }, + { + "completion_length": 73.265625, + "epoch": 0.013289036544850499, + "grad_norm": 0.37963175773620605, + "kl": 0.001583099365234375, + "learning_rate": 9.925e-07, + "loss": 0.009535952471196651, + "reward": 1.8723560571670532, + "reward_std": 0.48824670910835266, + "rewards/GDino": 0.671429455280304, + "rewards/GIT": 0.4155814051628113, + "rewards/HPSv2": 0.2387409210205078, + "rewards/ORM": 0.5466042459011078, + "self_certainty_semantic": -25.4375, + "self_certainty_token": -21.625, + "step": 12 + }, + { + "completion_length": 55.015625, + "epoch": 0.014396456256921373, + "grad_norm": 0.5844080448150635, + "kl": 0.001674652099609375, + "learning_rate": 9.91875e-07, + "loss": 0.0034986711107194424, + "reward": 1.7595484256744385, + "reward_std": 0.3697086051106453, + "rewards/GDino": 0.7100214958190918, + "rewards/GIT": 0.26869260519742966, + "rewards/HPSv2": 0.24958419799804688, + "rewards/ORM": 0.53125, + "self_certainty_semantic": -25.25, + "self_certainty_token": -21.375, + "step": 13 + }, + { + "completion_length": 55.65625, + "epoch": 0.015503875968992248, + "grad_norm": 0.5192797780036926, + "kl": 0.001674652099609375, + "learning_rate": 9.912499999999998e-07, + "loss": 0.010001872200518847, + "reward": 2.201015591621399, + "reward_std": 0.4899330288171768, + "rewards/GDino": 0.8140625059604645, + "rewards/GIT": 0.4328514188528061, + "rewards/HPSv2": 0.2431640625, + "rewards/ORM": 0.7109375, + "self_certainty_semantic": -25.4375, + "self_certainty_token": -22.0, + "step": 14 + }, + { + "completion_length": 64.0, + "epoch": 0.016611295681063124, + "grad_norm": 0.46844616532325745, + "kl": 0.00174713134765625, + "learning_rate": 9.90625e-07, + "loss": 0.0017675042618066072, + "reward": 2.433342456817627, + "reward_std": 0.33736473321914673, + "rewards/GDino": 0.9153576791286469, + "rewards/GIT": 0.5124611556529999, + "rewards/HPSv2": 0.2507901191711426, + "rewards/ORM": 0.7547334432601929, + "self_certainty_semantic": -25.4375, + "self_certainty_token": -21.6875, + "step": 15 + }, + { + "completion_length": 53.203125, + "epoch": 0.017718715393133997, + "grad_norm": 0.49579355120658875, + "kl": 0.001758575439453125, + "learning_rate": 9.9e-07, + "loss": 0.003856237977743149, + "reward": 1.6368815302848816, + "reward_std": 0.42226114869117737, + "rewards/GDino": 0.6432631015777588, + "rewards/GIT": 0.2906690910458565, + "rewards/HPSv2": 0.25169944763183594, + "rewards/ORM": 0.45124977827072144, + "self_certainty_semantic": -25.3125, + "self_certainty_token": -20.8125, + "step": 16 + }, + { + "completion_length": 76.28125, + "epoch": 0.018826135105204873, + "grad_norm": 0.5296036601066589, + "kl": 0.001590728759765625, + "learning_rate": 9.89375e-07, + "loss": -0.003345506265759468, + "reward": 1.7861530184745789, + "reward_std": 0.5057752877473831, + "rewards/GDino": 0.6293700635433197, + "rewards/GIT": 0.2197464406490326, + "rewards/HPSv2": 0.26516151428222656, + "rewards/ORM": 0.671875, + "self_certainty_semantic": -25.5, + "self_certainty_token": -20.6875, + "step": 17 + }, + { + "completion_length": 58.0625, + "epoch": 0.019933554817275746, + "grad_norm": 0.6577962636947632, + "kl": 0.00174713134765625, + "learning_rate": 9.8875e-07, + "loss": -0.019500677473843098, + "reward": 2.303292393684387, + "reward_std": 0.2609405145049095, + "rewards/GDino": 0.8339102566242218, + "rewards/GIT": 0.5853700041770935, + "rewards/HPSv2": 0.24338722229003906, + "rewards/ORM": 0.640625, + "self_certainty_semantic": -25.1875, + "self_certainty_token": -21.25, + "step": 18 + }, + { + "completion_length": 50.859375, + "epoch": 0.021040974529346623, + "grad_norm": 0.3543226718902588, + "kl": 0.00182342529296875, + "learning_rate": 9.88125e-07, + "loss": -0.00019507110118865967, + "reward": 1.6344053149223328, + "reward_std": 0.47374215722084045, + "rewards/GDino": 0.705148845911026, + "rewards/GIT": 0.2559727430343628, + "rewards/HPSv2": 0.2541370391845703, + "rewards/ORM": 0.41914665699005127, + "self_certainty_semantic": -25.25, + "self_certainty_token": -22.3125, + "step": 19 + }, + { + "completion_length": 65.921875, + "epoch": 0.0221483942414175, + "grad_norm": 0.5358290672302246, + "kl": 0.001781463623046875, + "learning_rate": 9.875e-07, + "loss": 0.007933363318443298, + "reward": 1.9504321217536926, + "reward_std": 0.3728322237730026, + "rewards/GDino": 0.6606760025024414, + "rewards/GIT": 0.48046815395355225, + "rewards/HPSv2": 0.24678802490234375, + "rewards/ORM": 0.5625, + "self_certainty_semantic": -25.125, + "self_certainty_token": -21.0625, + "step": 20 + }, + { + "completion_length": 59.3125, + "epoch": 0.023255813953488372, + "grad_norm": 2.0912797451019287, + "kl": 0.001811981201171875, + "learning_rate": 9.86875e-07, + "loss": -0.004398644436150789, + "reward": 2.252086877822876, + "reward_std": 0.44888848066329956, + "rewards/GDino": 0.798213005065918, + "rewards/GIT": 0.4853799045085907, + "rewards/HPSv2": 0.25956153869628906, + "rewards/ORM": 0.7089323997497559, + "self_certainty_semantic": -25.1875, + "self_certainty_token": -22.75, + "step": 21 + }, + { + "completion_length": 52.265625, + "epoch": 0.024363233665559248, + "grad_norm": 0.5790585875511169, + "kl": 0.00191497802734375, + "learning_rate": 9.862499999999999e-07, + "loss": 0.006876260507851839, + "reward": 1.9933909177780151, + "reward_std": 0.32367050647735596, + "rewards/GDino": 0.7134387493133545, + "rewards/GIT": 0.41087181866168976, + "rewards/HPSv2": 0.2721214294433594, + "rewards/ORM": 0.5969589203596115, + "self_certainty_semantic": -25.5, + "self_certainty_token": -22.375, + "step": 22 + }, + { + "completion_length": 59.375, + "epoch": 0.02547065337763012, + "grad_norm": 0.45692723989486694, + "kl": 0.001697540283203125, + "learning_rate": 9.85625e-07, + "loss": -0.00792664848268032, + "reward": 2.015365242958069, + "reward_std": 0.48256243765354156, + "rewards/GDino": 0.724082350730896, + "rewards/GIT": 0.42729710042476654, + "rewards/HPSv2": 0.2667560577392578, + "rewards/ORM": 0.5972296893596649, + "self_certainty_semantic": -25.375, + "self_certainty_token": -20.6875, + "step": 23 + }, + { + "completion_length": 55.203125, + "epoch": 0.026578073089700997, + "grad_norm": 0.46439889073371887, + "kl": 0.0016937255859375, + "learning_rate": 9.849999999999999e-07, + "loss": 0.0024933242239058018, + "reward": 2.460409939289093, + "reward_std": 0.4443647414445877, + "rewards/GDino": 0.8454739451408386, + "rewards/GIT": 0.6258784532546997, + "rewards/HPSv2": 0.2624950408935547, + "rewards/ORM": 0.7265625, + "self_certainty_semantic": -25.6875, + "self_certainty_token": -21.25, + "step": 24 + }, + { + "completion_length": 60.15625, + "epoch": 0.02768549280177187, + "grad_norm": 0.47176027297973633, + "kl": 0.001880645751953125, + "learning_rate": 9.84375e-07, + "loss": 0.005812188144773245, + "reward": 2.0174233317375183, + "reward_std": 0.40724658966064453, + "rewards/GDino": 0.7186038792133331, + "rewards/GIT": 0.4156235605478287, + "rewards/HPSv2": 0.26485633850097656, + "rewards/ORM": 0.6183395236730576, + "self_certainty_semantic": -25.5, + "self_certainty_token": -22.0, + "step": 25 + }, + { + "completion_length": 53.21875, + "epoch": 0.028792912513842746, + "grad_norm": 0.716375470161438, + "kl": 0.00209808349609375, + "learning_rate": 9.8375e-07, + "loss": 0.02397427149116993, + "reward": 2.186239182949066, + "reward_std": 0.46710920333862305, + "rewards/GDino": 0.7593750059604645, + "rewards/GIT": 0.5171153843402863, + "rewards/HPSv2": 0.2734565734863281, + "rewards/ORM": 0.6362921893596649, + "self_certainty_semantic": -25.25, + "self_certainty_token": -23.0, + "step": 26 + }, + { + "completion_length": 58.421875, + "epoch": 0.029900332225913623, + "grad_norm": 0.428893119096756, + "kl": 0.00171661376953125, + "learning_rate": 9.83125e-07, + "loss": -0.005866332910954952, + "reward": 1.9681838750839233, + "reward_std": 0.3645169883966446, + "rewards/GDino": 0.7666666209697723, + "rewards/GIT": 0.4486802965402603, + "rewards/HPSv2": 0.2419452667236328, + "rewards/ORM": 0.5108915567398071, + "self_certainty_semantic": -25.375, + "self_certainty_token": -21.4375, + "step": 27 + }, + { + "completion_length": 63.328125, + "epoch": 0.031007751937984496, + "grad_norm": 0.5334203243255615, + "kl": 0.002010345458984375, + "learning_rate": 9.825e-07, + "loss": 0.012586410157382488, + "reward": 1.4134111404418945, + "reward_std": 0.3155324012041092, + "rewards/GDino": 0.6005972325801849, + "rewards/GIT": 0.11092349141836166, + "rewards/HPSv2": 0.2596569061279297, + "rewards/ORM": 0.44223344326019287, + "self_certainty_semantic": -25.3125, + "self_certainty_token": -20.5625, + "step": 28 + }, + { + "completion_length": 56.25, + "epoch": 0.03211517165005537, + "grad_norm": 0.40832045674324036, + "kl": 0.001819610595703125, + "learning_rate": 9.81875e-07, + "loss": 0.010300841182470322, + "reward": 2.465680956840515, + "reward_std": 0.298002652823925, + "rewards/GDino": 0.862500011920929, + "rewards/GIT": 0.6107669174671173, + "rewards/HPSv2": 0.28375244140625, + "rewards/ORM": 0.7086615860462189, + "self_certainty_semantic": -25.375, + "self_certainty_token": -21.0625, + "step": 29 + }, + { + "completion_length": 54.953125, + "epoch": 0.03322259136212625, + "grad_norm": 0.4050670266151428, + "kl": 0.002025604248046875, + "learning_rate": 9.8125e-07, + "loss": -0.001845305785536766, + "reward": 2.476737856864929, + "reward_std": 0.3756887763738632, + "rewards/GDino": 0.8967152833938599, + "rewards/GIT": 0.551719531416893, + "rewards/HPSv2": 0.24522781372070312, + "rewards/ORM": 0.7830752730369568, + "self_certainty_semantic": -25.1875, + "self_certainty_token": -21.25, + "step": 30 + }, + { + "completion_length": 74.84375, + "epoch": 0.03433001107419712, + "grad_norm": 0.7089686393737793, + "kl": 0.001865386962890625, + "learning_rate": 9.806249999999998e-07, + "loss": 0.023707949556410313, + "reward": 1.831493854522705, + "reward_std": 0.37860143184661865, + "rewards/GDino": 0.6287499666213989, + "rewards/GIT": 0.3833145350217819, + "rewards/HPSv2": 0.2413043975830078, + "rewards/ORM": 0.578125, + "self_certainty_semantic": -25.4375, + "self_certainty_token": -20.9375, + "step": 31 + }, + { + "completion_length": 71.0625, + "epoch": 0.035437430786267994, + "grad_norm": 0.45204266905784607, + "kl": 0.00200653076171875, + "learning_rate": 9.8e-07, + "loss": 0.014695112593472004, + "reward": 1.5279032588005066, + "reward_std": 0.5042913109064102, + "rewards/GDino": 0.6702238023281097, + "rewards/GIT": 0.24817809462547302, + "rewards/HPSv2": 0.2356252670288086, + "rewards/ORM": 0.37387609481811523, + "self_certainty_semantic": -25.375, + "self_certainty_token": -22.5625, + "step": 32 + }, + { + "completion_length": 59.703125, + "epoch": 0.036544850498338874, + "grad_norm": 0.4359590411186218, + "kl": 0.00201416015625, + "learning_rate": 9.79375e-07, + "loss": 0.00610552029684186, + "reward": 2.3108657598495483, + "reward_std": 0.4415571391582489, + "rewards/GDino": 0.8515625, + "rewards/GIT": 0.6067334115505219, + "rewards/HPSv2": 0.22726917266845703, + "rewards/ORM": 0.6253007054328918, + "self_certainty_semantic": -25.375, + "self_certainty_token": -21.5, + "step": 33 + }, + { + "completion_length": 58.046875, + "epoch": 0.03765227021040975, + "grad_norm": 0.5853399038314819, + "kl": 0.002033233642578125, + "learning_rate": 9.7875e-07, + "loss": 0.023541483096778393, + "reward": 2.012690246105194, + "reward_std": 0.4660336524248123, + "rewards/GDino": 0.6989582777023315, + "rewards/GIT": 0.40700431168079376, + "rewards/HPSv2": 0.24774932861328125, + "rewards/ORM": 0.6589783728122711, + "self_certainty_semantic": -25.3125, + "self_certainty_token": -21.8125, + "step": 34 + }, + { + "completion_length": 56.90625, + "epoch": 0.03875968992248062, + "grad_norm": 0.3787715435028076, + "kl": 0.001888275146484375, + "learning_rate": 9.78125e-07, + "loss": 0.003942073322832584, + "reward": 2.452033281326294, + "reward_std": 0.3410096764564514, + "rewards/GDino": 0.8359375298023224, + "rewards/GIT": 0.567652553319931, + "rewards/HPSv2": 0.2418804168701172, + "rewards/ORM": 0.806562751531601, + "self_certainty_semantic": -25.3125, + "self_certainty_token": -21.3125, + "step": 35 + }, + { + "completion_length": 66.0, + "epoch": 0.03986710963455149, + "grad_norm": 0.5305721163749695, + "kl": 0.005157470703125, + "learning_rate": 9.775e-07, + "loss": -0.003781900042667985, + "reward": 1.8618011474609375, + "reward_std": 0.4120703786611557, + "rewards/GDino": 0.6453125476837158, + "rewards/GIT": 0.4281370937824249, + "rewards/HPSv2": 0.24621009826660156, + "rewards/ORM": 0.5421415567398071, + "self_certainty_semantic": -25.5625, + "self_certainty_token": -20.9375, + "step": 36 + }, + { + "completion_length": 51.40625, + "epoch": 0.04097452934662237, + "grad_norm": 0.46515390276908875, + "kl": 0.002716064453125, + "learning_rate": 9.76875e-07, + "loss": 0.006902199704200029, + "reward": 1.9485998153686523, + "reward_std": 0.42147715389728546, + "rewards/GDino": 0.6951449513435364, + "rewards/GIT": 0.31057579815387726, + "rewards/HPSv2": 0.26158714294433594, + "rewards/ORM": 0.681291937828064, + "self_certainty_semantic": -25.3125, + "self_certainty_token": -22.1875, + "step": 37 + }, + { + "completion_length": 71.03125, + "epoch": 0.042081949058693245, + "grad_norm": 0.951810896396637, + "kl": 0.00226593017578125, + "learning_rate": 9.7625e-07, + "loss": 0.03428783547133207, + "reward": 1.9112213850021362, + "reward_std": 0.30633312463760376, + "rewards/GDino": 0.7401995956897736, + "rewards/GIT": 0.30288365483283997, + "rewards/HPSv2": 0.2552833557128906, + "rewards/ORM": 0.6128547042608261, + "self_certainty_semantic": -25.375, + "self_certainty_token": -21.125, + "step": 38 + }, + { + "completion_length": 67.6875, + "epoch": 0.04318936877076412, + "grad_norm": 0.6357575058937073, + "kl": 0.01482391357421875, + "learning_rate": 9.756249999999999e-07, + "loss": 0.023865018505603075, + "reward": 2.345404624938965, + "reward_std": 0.31367097795009613, + "rewards/GDino": 0.8703815042972565, + "rewards/GIT": 0.4902418553829193, + "rewards/HPSv2": 0.26603126525878906, + "rewards/ORM": 0.71875, + "self_certainty_semantic": -25.5625, + "self_certainty_token": -21.75, + "step": 39 + }, + { + "completion_length": 59.671875, + "epoch": 0.044296788482835, + "grad_norm": 0.5422465801239014, + "kl": 0.00281524658203125, + "learning_rate": 9.75e-07, + "loss": -0.018710695207118988, + "reward": 2.222834825515747, + "reward_std": 0.42842796444892883, + "rewards/GDino": 0.8634105622768402, + "rewards/GIT": 0.40908148139715195, + "rewards/HPSv2": 0.27498817443847656, + "rewards/ORM": 0.6753546893596649, + "self_certainty_semantic": -25.3125, + "self_certainty_token": -21.125, + "step": 40 + }, + { + "completion_length": 60.9375, + "epoch": 0.04540420819490587, + "grad_norm": 0.7511593103408813, + "kl": 0.00299072265625, + "learning_rate": 9.743749999999999e-07, + "loss": 0.005782268475741148, + "reward": 1.8980144262313843, + "reward_std": 0.3208035007119179, + "rewards/GDino": 0.6784752607345581, + "rewards/GIT": 0.3914954513311386, + "rewards/HPSv2": 0.24643898010253906, + "rewards/ORM": 0.5816046595573425, + "self_certainty_semantic": -25.375, + "self_certainty_token": -21.25, + "step": 41 + }, + { + "completion_length": 48.4375, + "epoch": 0.046511627906976744, + "grad_norm": 0.5177002549171448, + "kl": 0.0025177001953125, + "learning_rate": 9.7375e-07, + "loss": 0.045526545494794846, + "reward": 2.269711136817932, + "reward_std": 0.48014624416828156, + "rewards/GDino": 0.8855312466621399, + "rewards/GIT": 0.4437972754240036, + "rewards/HPSv2": 0.2572154998779297, + "rewards/ORM": 0.6831671893596649, + "self_certainty_semantic": -25.3125, + "self_certainty_token": -20.5625, + "step": 42 + }, + { + "completion_length": 67.875, + "epoch": 0.047619047619047616, + "grad_norm": 0.5885121822357178, + "kl": 0.002044677734375, + "learning_rate": 9.73125e-07, + "loss": 0.013573684729635715, + "reward": 1.6382005214691162, + "reward_std": 0.38919302821159363, + "rewards/GDino": 0.6114583313465118, + "rewards/GIT": 0.3806646466255188, + "rewards/HPSv2": 0.23286819458007812, + "rewards/ORM": 0.41320937871932983, + "self_certainty_semantic": -25.375, + "self_certainty_token": -21.0625, + "step": 43 + }, + { + "completion_length": 54.4375, + "epoch": 0.048726467331118496, + "grad_norm": 0.40727919340133667, + "kl": 0.0020751953125, + "learning_rate": 9.725e-07, + "loss": -0.01244093757122755, + "reward": 2.8831005096435547, + "reward_std": 0.31665875762701035, + "rewards/GDino": 0.9588541388511658, + "rewards/GIT": 0.7738310992717743, + "rewards/HPSv2": 0.2601909637451172, + "rewards/ORM": 0.8902243673801422, + "self_certainty_semantic": -25.5625, + "self_certainty_token": -21.0625, + "step": 44 + }, + { + "completion_length": 54.90625, + "epoch": 0.04983388704318937, + "grad_norm": 0.4928445816040039, + "kl": 0.0024566650390625, + "learning_rate": 9.71875e-07, + "loss": 0.00010553281754255295, + "reward": 2.4343937635421753, + "reward_std": 0.5984751731157303, + "rewards/GDino": 0.862500011920929, + "rewards/GIT": 0.5139474421739578, + "rewards/HPSv2": 0.26379966735839844, + "rewards/ORM": 0.7941466569900513, + "self_certainty_semantic": -25.5625, + "self_certainty_token": -21.25, + "step": 45 + }, + { + "completion_length": 56.078125, + "epoch": 0.05094130675526024, + "grad_norm": 0.37051326036453247, + "kl": 0.00231170654296875, + "learning_rate": 9.712499999999998e-07, + "loss": 0.007893505971878767, + "reward": 1.9575175046920776, + "reward_std": 0.3945648521184921, + "rewards/GDino": 0.5999999940395355, + "rewards/GIT": 0.32395021617412567, + "rewards/HPSv2": 0.26719093322753906, + "rewards/ORM": 0.7663763463497162, + "self_certainty_semantic": -25.1875, + "self_certainty_token": -22.0625, + "step": 46 + }, + { + "completion_length": 55.171875, + "epoch": 0.05204872646733112, + "grad_norm": 0.8945181369781494, + "kl": 0.0025634765625, + "learning_rate": 9.70625e-07, + "loss": -0.0013387980870902538, + "reward": 1.836871862411499, + "reward_std": 0.23468619585037231, + "rewards/GDino": 0.7209739089012146, + "rewards/GIT": 0.22856376320123672, + "rewards/HPSv2": 0.27921295166015625, + "rewards/ORM": 0.608121246099472, + "self_certainty_semantic": -25.3125, + "self_certainty_token": -21.6875, + "step": 47 + }, + { + "completion_length": 57.984375, + "epoch": 0.053156146179401995, + "grad_norm": 1.6689982414245605, + "kl": 0.00267791748046875, + "learning_rate": 9.7e-07, + "loss": 0.022647732868790627, + "reward": 1.454766035079956, + "reward_std": 0.40884387493133545, + "rewards/GDino": 0.6050891876220703, + "rewards/GIT": 0.0, + "rewards/HPSv2": 0.2698974609375, + "rewards/ORM": 0.5797793865203857, + "self_certainty_semantic": -25.3125, + "self_certainty_token": -21.625, + "step": 48 + }, + { + "completion_length": 58.046875, + "epoch": 0.05426356589147287, + "grad_norm": 0.4761441648006439, + "kl": 0.002048492431640625, + "learning_rate": 9.69375e-07, + "loss": 0.016307475278154016, + "reward": 1.9066129326820374, + "reward_std": 0.5319462567567825, + "rewards/GDino": 0.7744874656200409, + "rewards/GIT": 0.2370736114680767, + "rewards/HPSv2": 0.2514495849609375, + "rewards/ORM": 0.6436022371053696, + "self_certainty_semantic": -25.25, + "self_certainty_token": -21.625, + "step": 49 + }, + { + "completion_length": 61.0, + "epoch": 0.05537098560354374, + "grad_norm": 0.8074173331260681, + "kl": 0.0040283203125, + "learning_rate": 9.6875e-07, + "loss": 0.005913220578804612, + "reward": 2.0915883779525757, + "reward_std": 0.5395111739635468, + "rewards/GDino": 0.7859093248844147, + "rewards/GIT": 0.3929952085018158, + "rewards/HPSv2": 0.25482940673828125, + "rewards/ORM": 0.657854437828064, + "self_certainty_semantic": -25.5625, + "self_certainty_token": -22.25, + "step": 50 + }, + { + "completion_length": 44.359375, + "epoch": 0.05647840531561462, + "grad_norm": 0.5618427991867065, + "kl": 0.002471923828125, + "learning_rate": 9.68125e-07, + "loss": -0.003945098840631545, + "reward": 1.8058671951293945, + "reward_std": 0.5712144523859024, + "rewards/GDino": 0.7815796732902527, + "rewards/GIT": 0.2604931816458702, + "rewards/HPSv2": 0.27115440368652344, + "rewards/ORM": 0.49263995885849, + "self_certainty_semantic": -25.4375, + "self_certainty_token": -22.625, + "step": 51 + }, + { + "completion_length": 48.0, + "epoch": 0.05758582502768549, + "grad_norm": 107.57159423828125, + "kl": 26.37615966796875, + "learning_rate": 9.675e-07, + "loss": 0.27801212295889854, + "reward": 2.4165316820144653, + "reward_std": 0.2998274937272072, + "rewards/GDino": 0.9244791567325592, + "rewards/GIT": 0.6574473828077316, + "rewards/HPSv2": 0.2756366729736328, + "rewards/ORM": 0.5589684545993805, + "self_certainty_semantic": -25.4375, + "self_certainty_token": -21.375, + "step": 52 + }, + { + "completion_length": 52.140625, + "epoch": 0.058693244739756366, + "grad_norm": 0.4408358931541443, + "kl": 0.00232696533203125, + "learning_rate": 9.66875e-07, + "loss": 0.013528472045436502, + "reward": 1.8899905681610107, + "reward_std": 0.4558149725198746, + "rewards/GDino": 0.730059951543808, + "rewards/GIT": 0.39098620414733887, + "rewards/HPSv2": 0.24242782592773438, + "rewards/ORM": 0.5265165567398071, + "self_certainty_semantic": -25.0, + "self_certainty_token": -20.5625, + "step": 53 + }, + { + "completion_length": 58.390625, + "epoch": 0.059800664451827246, + "grad_norm": 0.48384228348731995, + "kl": 0.00225067138671875, + "learning_rate": 9.6625e-07, + "loss": 0.005568797350861132, + "reward": 1.638724684715271, + "reward_std": 0.41337575018405914, + "rewards/GDino": 0.6137361526489258, + "rewards/GIT": 0.24863167852163315, + "rewards/HPSv2": 0.24831581115722656, + "rewards/ORM": 0.5280410945415497, + "self_certainty_semantic": -25.4375, + "self_certainty_token": -20.9375, + "step": 54 + }, + { + "completion_length": 50.234375, + "epoch": 0.06090808416389812, + "grad_norm": 0.46963369846343994, + "kl": 0.0026397705078125, + "learning_rate": 9.65625e-07, + "loss": 0.009267964400351048, + "reward": 1.7191376686096191, + "reward_std": 0.521537572145462, + "rewards/GDino": 0.7086881995201111, + "rewards/GIT": 0.3270767852663994, + "rewards/HPSv2": 0.2678356170654297, + "rewards/ORM": 0.4155370891094208, + "self_certainty_semantic": -25.5, + "self_certainty_token": -21.6875, + "step": 55 + }, + { + "completion_length": 59.953125, + "epoch": 0.06201550387596899, + "grad_norm": 0.6913841366767883, + "kl": 0.0024261474609375, + "learning_rate": 9.649999999999999e-07, + "loss": 0.03414425998926163, + "reward": 1.9336698055267334, + "reward_std": 0.45749759674072266, + "rewards/GDino": 0.6963726580142975, + "rewards/GIT": 0.38425514101982117, + "rewards/HPSv2": 0.2471466064453125, + "rewards/ORM": 0.6058953106403351, + "self_certainty_semantic": -25.0625, + "self_certainty_token": -21.875, + "step": 56 + }, + { + "completion_length": 50.765625, + "epoch": 0.06312292358803986, + "grad_norm": 0.5066769123077393, + "kl": 0.002532958984375, + "learning_rate": 9.64375e-07, + "loss": 0.009842937346547842, + "reward": 1.8338811993598938, + "reward_std": 0.3951306492090225, + "rewards/GDino": 0.7909577786922455, + "rewards/GIT": 0.24781160056591034, + "rewards/HPSv2": 0.2509651184082031, + "rewards/ORM": 0.5441466569900513, + "self_certainty_semantic": -25.5, + "self_certainty_token": -21.4375, + "step": 57 + }, + { + "completion_length": 52.9375, + "epoch": 0.06423034330011074, + "grad_norm": 0.37791869044303894, + "kl": 0.002685546875, + "learning_rate": 9.637499999999999e-07, + "loss": 0.024126023054122925, + "reward": 1.8852884769439697, + "reward_std": 0.46756890416145325, + "rewards/GDino": 0.732811689376831, + "rewards/GIT": 0.38145140558481216, + "rewards/HPSv2": 0.2541465759277344, + "rewards/ORM": 0.5168787688016891, + "self_certainty_semantic": -25.4375, + "self_certainty_token": -21.25, + "step": 58 + }, + { + "completion_length": 47.234375, + "epoch": 0.06533776301218161, + "grad_norm": 0.7410405278205872, + "kl": 0.0026092529296875, + "learning_rate": 9.63125e-07, + "loss": -0.01674468442797661, + "reward": 2.3462648391723633, + "reward_std": 0.2433818019926548, + "rewards/GDino": 0.8425607979297638, + "rewards/GIT": 0.46571947634220123, + "rewards/HPSv2": 0.2664222717285156, + "rewards/ORM": 0.771562248468399, + "self_certainty_semantic": -25.375, + "self_certainty_token": -21.25, + "step": 59 + }, + { + "completion_length": 45.015625, + "epoch": 0.0664451827242525, + "grad_norm": 0.5326105952262878, + "kl": 0.0026397705078125, + "learning_rate": 9.624999999999999e-07, + "loss": 0.003804182168096304, + "reward": 2.036432147026062, + "reward_std": 0.3990803211927414, + "rewards/GDino": 0.8798050284385681, + "rewards/GIT": 0.4744318723678589, + "rewards/HPSv2": 0.238006591796875, + "rewards/ORM": 0.44418865442276, + "self_certainty_semantic": -25.3125, + "self_certainty_token": -20.6875, + "step": 60 + }, + { + "completion_length": 65.0, + "epoch": 0.06755260243632337, + "grad_norm": 0.5713196396827698, + "kl": 0.00235748291015625, + "learning_rate": 9.61875e-07, + "loss": 0.04368375800549984, + "reward": 2.1398236751556396, + "reward_std": 0.3530130609869957, + "rewards/GDino": 0.7138020694255829, + "rewards/GIT": 0.644903838634491, + "rewards/HPSv2": 0.2529468536376953, + "rewards/ORM": 0.5281709432601929, + "self_certainty_semantic": -25.25, + "self_certainty_token": -20.6875, + "step": 61 + }, + { + "completion_length": 54.9375, + "epoch": 0.06866002214839424, + "grad_norm": 5.612445831298828, + "kl": 0.00339508056640625, + "learning_rate": 9.6125e-07, + "loss": 0.008875304833054543, + "reward": 2.497900605201721, + "reward_std": 0.41675496101379395, + "rewards/GDino": 0.872697502374649, + "rewards/GIT": 0.601748138666153, + "rewards/HPSv2": 0.2640380859375, + "rewards/ORM": 0.759416937828064, + "self_certainty_semantic": -25.3125, + "self_certainty_token": -20.875, + "step": 62 + }, + { + "completion_length": 49.734375, + "epoch": 0.06976744186046512, + "grad_norm": 0.5861217379570007, + "kl": 0.003326416015625, + "learning_rate": 9.606249999999998e-07, + "loss": 0.01025251136161387, + "reward": 2.2640050053596497, + "reward_std": 0.48744213581085205, + "rewards/GDino": 0.8172852694988251, + "rewards/GIT": 0.44742196798324585, + "rewards/HPSv2": 0.2430896759033203, + "rewards/ORM": 0.7562080323696136, + "self_certainty_semantic": -25.125, + "self_certainty_token": -22.1875, + "step": 63 + }, + { + "completion_length": 64.375, + "epoch": 0.07087486157253599, + "grad_norm": 0.39266109466552734, + "kl": 0.00298309326171875, + "learning_rate": 9.6e-07, + "loss": -0.005469436291605234, + "reward": 1.6910768747329712, + "reward_std": 0.2151722088456154, + "rewards/GDino": 0.7097718715667725, + "rewards/GIT": 0.32366037368774414, + "rewards/HPSv2": 0.2576026916503906, + "rewards/ORM": 0.40004195272922516, + "self_certainty_semantic": -25.5, + "self_certainty_token": -21.9375, + "step": 64 + }, + { + "completion_length": 61.515625, + "epoch": 0.07198228128460686, + "grad_norm": 0.705937922000885, + "kl": 0.002685546875, + "learning_rate": 9.59375e-07, + "loss": 0.010601098649203777, + "reward": 2.128853142261505, + "reward_std": 0.4351096749305725, + "rewards/GDino": 0.7197916805744171, + "rewards/GIT": 0.6168824732303619, + "rewards/HPSv2": 0.23163414001464844, + "rewards/ORM": 0.5605448335409164, + "self_certainty_semantic": -25.25, + "self_certainty_token": -22.75, + "step": 65 + }, + { + "completion_length": 49.0, + "epoch": 0.07308970099667775, + "grad_norm": 0.4427480101585388, + "kl": 0.002899169921875, + "learning_rate": 9.5875e-07, + "loss": 0.02646360918879509, + "reward": 2.1654986143112183, + "reward_std": 0.37753987312316895, + "rewards/GDino": 0.6895833611488342, + "rewards/GIT": 0.48387444019317627, + "rewards/HPSv2": 0.2579364776611328, + "rewards/ORM": 0.7341042160987854, + "self_certainty_semantic": -25.25, + "self_certainty_token": -21.5625, + "step": 66 + }, + { + "completion_length": 63.140625, + "epoch": 0.07419712070874862, + "grad_norm": 0.7619237899780273, + "kl": 0.00284576416015625, + "learning_rate": 9.58125e-07, + "loss": 0.026691121514886618, + "reward": 2.3450592160224915, + "reward_std": 0.2740027904510498, + "rewards/GDino": 0.8025760054588318, + "rewards/GIT": 0.5677543580532074, + "rewards/HPSv2": 0.2594585418701172, + "rewards/ORM": 0.7152703106403351, + "self_certainty_semantic": -25.25, + "self_certainty_token": -21.875, + "step": 67 + }, + { + "completion_length": 50.171875, + "epoch": 0.0753045404208195, + "grad_norm": 0.4760603904724121, + "kl": 0.0030517578125, + "learning_rate": 9.575e-07, + "loss": 0.022392848506569862, + "reward": 1.6361079216003418, + "reward_std": 0.33574268221855164, + "rewards/GDino": 0.6061920523643494, + "rewards/GIT": 0.31722745299339294, + "rewards/HPSv2": 0.2595634460449219, + "rewards/ORM": 0.453125, + "self_certainty_semantic": -25.0625, + "self_certainty_token": -21.5, + "step": 68 + }, + { + "completion_length": 55.046875, + "epoch": 0.07641196013289037, + "grad_norm": 0.5907943248748779, + "kl": 0.00336456298828125, + "learning_rate": 9.56875e-07, + "loss": -0.0030646873638033867, + "reward": 2.119426429271698, + "reward_std": 0.298831045627594, + "rewards/GDino": 0.8028125166893005, + "rewards/GIT": 0.3893257826566696, + "rewards/HPSv2": 0.2710380554199219, + "rewards/ORM": 0.65625, + "self_certainty_semantic": -25.4375, + "self_certainty_token": -21.4375, + "step": 69 + }, + { + "completion_length": 51.140625, + "epoch": 0.07751937984496124, + "grad_norm": 0.47751373052597046, + "kl": 0.00359344482421875, + "learning_rate": 9.5625e-07, + "loss": -0.011344656813889742, + "reward": 1.4646188020706177, + "reward_std": 0.5817874372005463, + "rewards/GDino": 0.5935695767402649, + "rewards/GIT": 0.23897356167435646, + "rewards/HPSv2": 0.25234222412109375, + "rewards/ORM": 0.37973344326019287, + "self_certainty_semantic": -24.875, + "self_certainty_token": -20.9375, + "step": 70 + }, + { + "completion_length": 55.609375, + "epoch": 0.07862679955703211, + "grad_norm": 0.5281980633735657, + "kl": 0.0030364990234375, + "learning_rate": 9.556249999999999e-07, + "loss": -0.023217559792101383, + "reward": 1.856022596359253, + "reward_std": 0.4435942769050598, + "rewards/GDino": 0.6947268545627594, + "rewards/GIT": 0.28702250868082047, + "rewards/HPSv2": 0.26489830017089844, + "rewards/ORM": 0.609375, + "self_certainty_semantic": -25.375, + "self_certainty_token": -20.5, + "step": 71 + }, + { + "completion_length": 42.390625, + "epoch": 0.07973421926910298, + "grad_norm": 0.4538242518901825, + "kl": 0.002960205078125, + "learning_rate": 9.55e-07, + "loss": 0.016265914775431156, + "reward": 1.911847174167633, + "reward_std": 0.4016146659851074, + "rewards/GDino": 0.6731474995613098, + "rewards/GIT": 0.46439771354198456, + "rewards/HPSv2": 0.2497406005859375, + "rewards/ORM": 0.524561420083046, + "self_certainty_semantic": -25.0, + "self_certainty_token": -20.875, + "step": 72 + }, + { + "completion_length": 53.28125, + "epoch": 0.08084163898117387, + "grad_norm": 0.5773823261260986, + "kl": 0.00330352783203125, + "learning_rate": 9.543749999999999e-07, + "loss": -0.0016377167776226997, + "reward": 2.114488363265991, + "reward_std": 0.44427454471588135, + "rewards/GDino": 0.8240922689437866, + "rewards/GIT": 0.4950668513774872, + "rewards/HPSv2": 0.24412155151367188, + "rewards/ORM": 0.5512077808380127, + "self_certainty_semantic": -25.375, + "self_certainty_token": -21.0, + "step": 73 + }, + { + "completion_length": 56.296875, + "epoch": 0.08194905869324474, + "grad_norm": 0.43449509143829346, + "kl": 0.0035247802734375, + "learning_rate": 9.5375e-07, + "loss": 0.03005522396415472, + "reward": 2.32301664352417, + "reward_std": 0.22542773187160492, + "rewards/GDino": 0.864062488079071, + "rewards/GIT": 0.5282620340585709, + "rewards/HPSv2": 0.25408363342285156, + "rewards/ORM": 0.6766084730625153, + "self_certainty_semantic": -25.25, + "self_certainty_token": -22.25, + "step": 74 + }, + { + "completion_length": 67.6875, + "epoch": 0.08305647840531562, + "grad_norm": 0.4218258857727051, + "kl": 0.0028228759765625, + "learning_rate": 9.53125e-07, + "loss": 0.015081442426890135, + "reward": 1.7625158429145813, + "reward_std": 0.4334114193916321, + "rewards/GDino": 0.6663236618041992, + "rewards/GIT": 0.26877461373806, + "rewards/HPSv2": 0.2647876739501953, + "rewards/ORM": 0.5626298785209656, + "self_certainty_semantic": -25.1875, + "self_certainty_token": -21.125, + "step": 75 + }, + { + "completion_length": 62.15625, + "epoch": 0.08416389811738649, + "grad_norm": 0.45278123021125793, + "kl": 0.00312042236328125, + "learning_rate": 9.525e-07, + "loss": 0.01650754688307643, + "reward": 2.2938578128814697, + "reward_std": 0.5077499151229858, + "rewards/GDino": 0.7734375, + "rewards/GIT": 0.6401466727256775, + "rewards/HPSv2": 0.2568778991699219, + "rewards/ORM": 0.6233955323696136, + "self_certainty_semantic": -25.4375, + "self_certainty_token": -21.125, + "step": 76 + }, + { + "completion_length": 50.984375, + "epoch": 0.08527131782945736, + "grad_norm": 0.5513558387756348, + "kl": 0.004730224609375, + "learning_rate": 9.51875e-07, + "loss": -0.008258584188297391, + "reward": 1.6354877948760986, + "reward_std": 0.5420883148908615, + "rewards/GDino": 0.643737405538559, + "rewards/GIT": 0.20579323172569275, + "rewards/HPSv2": 0.2405567169189453, + "rewards/ORM": 0.5454003810882568, + "self_certainty_semantic": -25.0625, + "self_certainty_token": -22.1875, + "step": 77 + }, + { + "completion_length": 56.390625, + "epoch": 0.08637873754152824, + "grad_norm": 0.9578920602798462, + "kl": 0.00360107421875, + "learning_rate": 9.5125e-07, + "loss": 0.0016261041164398193, + "reward": 2.061507523059845, + "reward_std": 0.2758500352501869, + "rewards/GDino": 0.7561410367488861, + "rewards/GIT": 0.33666322380304337, + "rewards/HPSv2": 0.2762489318847656, + "rewards/ORM": 0.6924542784690857, + "self_certainty_semantic": -25.3125, + "self_certainty_token": -21.1875, + "step": 78 + }, + { + "completion_length": 57.375, + "epoch": 0.08748615725359911, + "grad_norm": 0.46459418535232544, + "kl": 0.004241943359375, + "learning_rate": 9.50625e-07, + "loss": -0.019409675151109695, + "reward": 2.298323154449463, + "reward_std": 0.22066934406757355, + "rewards/GDino": 0.8136925399303436, + "rewards/GIT": 0.6333461850881577, + "rewards/HPSv2": 0.27008056640625, + "rewards/ORM": 0.5812040567398071, + "self_certainty_semantic": -25.3125, + "self_certainty_token": -21.875, + "step": 79 + }, + { + "completion_length": 60.5625, + "epoch": 0.08859357696567, + "grad_norm": 0.4274587631225586, + "kl": 0.004058837890625, + "learning_rate": 9.499999999999999e-07, + "loss": 0.013256619684398174, + "reward": 1.6786987781524658, + "reward_std": 0.3984425514936447, + "rewards/GDino": 0.6007516384124756, + "rewards/GIT": 0.18326736986637115, + "rewards/HPSv2": 0.2720355987548828, + "rewards/ORM": 0.6226442158222198, + "self_certainty_semantic": -25.5, + "self_certainty_token": -21.3125, + "step": 80 + }, + { + "completion_length": 58.03125, + "epoch": 0.08970099667774087, + "grad_norm": 0.9172859191894531, + "kl": 0.00426483154296875, + "learning_rate": 9.493749999999999e-07, + "loss": 0.003496276680380106, + "reward": 2.106017231941223, + "reward_std": 0.30050399899482727, + "rewards/GDino": 0.7440759837627411, + "rewards/GIT": 0.3581302911043167, + "rewards/HPSv2": 0.27126121520996094, + "rewards/ORM": 0.7325496971607208, + "self_certainty_semantic": -25.3125, + "self_certainty_token": -20.875, + "step": 81 + }, + { + "completion_length": 49.5625, + "epoch": 0.09080841638981174, + "grad_norm": 0.4841405153274536, + "kl": 0.00412750244140625, + "learning_rate": 9.487499999999999e-07, + "loss": 0.025506282225251198, + "reward": 1.6879253387451172, + "reward_std": 0.42353254556655884, + "rewards/GDino": 0.6098452508449554, + "rewards/GIT": 0.38033944368362427, + "rewards/HPSv2": 0.2658271789550781, + "rewards/ORM": 0.43191343545913696, + "self_certainty_semantic": -25.4375, + "self_certainty_token": -20.9375, + "step": 82 + }, + { + "completion_length": 48.328125, + "epoch": 0.09191583610188261, + "grad_norm": 0.492243230342865, + "kl": 0.00345611572265625, + "learning_rate": 9.481249999999999e-07, + "loss": -0.0034960508346557617, + "reward": 2.1111596822738647, + "reward_std": 0.41540510952472687, + "rewards/GDino": 0.7717877924442291, + "rewards/GIT": 0.4860316216945648, + "rewards/HPSv2": 0.2670021057128906, + "rewards/ORM": 0.5863381326198578, + "self_certainty_semantic": -25.25, + "self_certainty_token": -21.4375, + "step": 83 + }, + { + "completion_length": 66.3125, + "epoch": 0.09302325581395349, + "grad_norm": 0.5617808699607849, + "kl": 0.004180908203125, + "learning_rate": 9.474999999999999e-07, + "loss": 0.003248518332839012, + "reward": 2.094790816307068, + "reward_std": 0.3879907354712486, + "rewards/GDino": 0.7973622679710388, + "rewards/GIT": 0.632976621389389, + "rewards/HPSv2": 0.24137306213378906, + "rewards/ORM": 0.4230788052082062, + "self_certainty_semantic": -25.25, + "self_certainty_token": -21.1875, + "step": 84 + }, + { + "completion_length": 51.40625, + "epoch": 0.09413067552602436, + "grad_norm": 0.5695884823799133, + "kl": 0.003204345703125, + "learning_rate": 9.468749999999999e-07, + "loss": 0.012543351389467716, + "reward": 1.8675293326377869, + "reward_std": 0.4282868355512619, + "rewards/GDino": 0.6550000011920929, + "rewards/GIT": 0.33260630816221237, + "rewards/HPSv2": 0.24515533447265625, + "rewards/ORM": 0.6347676515579224, + "self_certainty_semantic": -25.0625, + "self_certainty_token": -21.375, + "step": 85 + }, + { + "completion_length": 48.296875, + "epoch": 0.09523809523809523, + "grad_norm": 0.46590158343315125, + "kl": 0.00469970703125, + "learning_rate": 9.462499999999999e-07, + "loss": 0.00347991194576025, + "reward": 2.2731298208236694, + "reward_std": 0.383390873670578, + "rewards/GDino": 0.8246111273765564, + "rewards/GIT": 0.33447980135679245, + "rewards/HPSv2": 0.29212188720703125, + "rewards/ORM": 0.821916937828064, + "self_certainty_semantic": -25.375, + "self_certainty_token": -22.0, + "step": 86 + }, + { + "completion_length": 54.390625, + "epoch": 0.09634551495016612, + "grad_norm": 0.5397853255271912, + "kl": 0.004425048828125, + "learning_rate": 9.45625e-07, + "loss": 0.008617566898465157, + "reward": 2.2459940314292908, + "reward_std": 0.4676859378814697, + "rewards/GDino": 0.7356771230697632, + "rewards/GIT": 0.46453191339969635, + "rewards/HPSv2": 0.26766395568847656, + "rewards/ORM": 0.7781210243701935, + "self_certainty_semantic": -25.25, + "self_certainty_token": -21.0, + "step": 87 + }, + { + "completion_length": 42.15625, + "epoch": 0.09745293466223699, + "grad_norm": 0.48280662298202515, + "kl": 0.00406646728515625, + "learning_rate": 9.45e-07, + "loss": 0.016791983507573605, + "reward": 2.1528985500335693, + "reward_std": 0.44025059044361115, + "rewards/GDino": 0.7985424101352692, + "rewards/GIT": 0.47699007391929626, + "rewards/HPSv2": 0.2789325714111328, + "rewards/ORM": 0.5984334945678711, + "self_certainty_semantic": -25.375, + "self_certainty_token": -20.875, + "step": 88 + }, + { + "completion_length": 48.25, + "epoch": 0.09856035437430787, + "grad_norm": 0.4512772560119629, + "kl": 0.00405120849609375, + "learning_rate": 9.44375e-07, + "loss": -0.009609811007976532, + "reward": 2.155352771282196, + "reward_std": 0.3193782642483711, + "rewards/GDino": 0.7525902688503265, + "rewards/GIT": 0.4481022357940674, + "rewards/HPSv2": 0.2619743347167969, + "rewards/ORM": 0.6926859021186829, + "self_certainty_semantic": -25.375, + "self_certainty_token": -21.125, + "step": 89 + }, + { + "completion_length": 47.78125, + "epoch": 0.09966777408637874, + "grad_norm": 0.5204576849937439, + "kl": 0.004425048828125, + "learning_rate": 9.4375e-07, + "loss": -0.017570611089468002, + "reward": 2.3318194150924683, + "reward_std": 0.3641355484724045, + "rewards/GDino": 0.854687511920929, + "rewards/GIT": 0.6271218061447144, + "rewards/HPSv2": 0.26286888122558594, + "rewards/ORM": 0.5871412754058838, + "self_certainty_semantic": -25.5, + "self_certainty_token": -21.4375, + "step": 90 + }, + { + "completion_length": 54.9375, + "epoch": 0.10077519379844961, + "grad_norm": 0.7515896558761597, + "kl": 0.0042266845703125, + "learning_rate": 9.43125e-07, + "loss": 0.022024651989340782, + "reward": 1.7255874276161194, + "reward_std": 0.3924099802970886, + "rewards/GDino": 0.6800954043865204, + "rewards/GIT": 0.41760827600955963, + "rewards/HPSv2": 0.22957611083984375, + "rewards/ORM": 0.3983076214790344, + "self_certainty_semantic": -24.875, + "self_certainty_token": -21.125, + "step": 91 + }, + { + "completion_length": 51.90625, + "epoch": 0.10188261351052048, + "grad_norm": 0.6844750046730042, + "kl": 0.004730224609375, + "learning_rate": 9.425e-07, + "loss": 0.017017286270856857, + "reward": 1.7472361326217651, + "reward_std": 0.49342362582683563, + "rewards/GDino": 0.7615998685359955, + "rewards/GIT": 0.3799494504928589, + "rewards/HPSv2": 0.2450580596923828, + "rewards/ORM": 0.36062875390052795, + "self_certainty_semantic": -25.5, + "self_certainty_token": -21.0625, + "step": 92 + }, + { + "completion_length": 52.5625, + "epoch": 0.10299003322259136, + "grad_norm": 0.476144403219223, + "kl": 0.004547119140625, + "learning_rate": 9.41875e-07, + "loss": -0.006627652794122696, + "reward": 2.3529324531555176, + "reward_std": 0.38789400458335876, + "rewards/GDino": 0.8122400343418121, + "rewards/GIT": 0.40920257568359375, + "rewards/HPSv2": 0.25894737243652344, + "rewards/ORM": 0.8725424408912659, + "self_certainty_semantic": -25.1875, + "self_certainty_token": -20.75, + "step": 93 + }, + { + "completion_length": 45.234375, + "epoch": 0.10409745293466224, + "grad_norm": 0.4303518235683441, + "kl": 0.00390625, + "learning_rate": 9.4125e-07, + "loss": 0.002329372800886631, + "reward": 2.063507556915283, + "reward_std": 0.4875355362892151, + "rewards/GDino": 0.8157378733158112, + "rewards/GIT": 0.2162991166114807, + "rewards/HPSv2": 0.2860240936279297, + "rewards/ORM": 0.7454463839530945, + "self_certainty_semantic": -25.3125, + "self_certainty_token": -22.25, + "step": 94 + }, + { + "completion_length": 54.21875, + "epoch": 0.10520487264673312, + "grad_norm": 0.9745371341705322, + "kl": 0.004852294921875, + "learning_rate": 9.40625e-07, + "loss": 0.015892890747636557, + "reward": 2.4900766611099243, + "reward_std": 0.33158986270427704, + "rewards/GDino": 0.9456690549850464, + "rewards/GIT": 0.7110534906387329, + "rewards/HPSv2": 0.2568836212158203, + "rewards/ORM": 0.5764705836772919, + "self_certainty_semantic": -25.625, + "self_certainty_token": -21.3125, + "step": 95 + }, + { + "completion_length": 62.953125, + "epoch": 0.10631229235880399, + "grad_norm": 1.6108874082565308, + "kl": 0.00475311279296875, + "learning_rate": 9.399999999999999e-07, + "loss": 0.012537557166069746, + "reward": 2.4274561405181885, + "reward_std": 0.3028244078159332, + "rewards/GDino": 0.9155160486698151, + "rewards/GIT": 0.6933247745037079, + "rewards/HPSv2": 0.25919437408447266, + "rewards/ORM": 0.5594209432601929, + "self_certainty_semantic": -25.375, + "self_certainty_token": -20.3125, + "step": 96 + }, + { + "completion_length": 44.890625, + "epoch": 0.10741971207087486, + "grad_norm": 0.42777886986732483, + "kl": 0.0064697265625, + "learning_rate": 9.393749999999999e-07, + "loss": 0.006582918576896191, + "reward": 1.7229499220848083, + "reward_std": 0.29571742564439774, + "rewards/GDino": 0.6976552903652191, + "rewards/GIT": 0.17514611035585403, + "rewards/HPSv2": 0.2757740020751953, + "rewards/ORM": 0.5743745565414429, + "self_certainty_semantic": -25.25, + "self_certainty_token": -21.5, + "step": 97 + }, + { + "completion_length": 52.78125, + "epoch": 0.10852713178294573, + "grad_norm": 0.4346785247325897, + "kl": 0.00446319580078125, + "learning_rate": 9.387499999999999e-07, + "loss": 0.010664775501936674, + "reward": 1.9896260499954224, + "reward_std": 0.5384568274021149, + "rewards/GDino": 0.7534899115562439, + "rewards/GIT": 0.416723370552063, + "rewards/HPSv2": 0.25490760803222656, + "rewards/ORM": 0.5645051300525665, + "self_certainty_semantic": -25.25, + "self_certainty_token": -21.4375, + "step": 98 + }, + { + "completion_length": 53.546875, + "epoch": 0.10963455149501661, + "grad_norm": 0.4226502478122711, + "kl": 0.00543212890625, + "learning_rate": 9.381249999999999e-07, + "loss": -0.009754271944984794, + "reward": 2.1711018085479736, + "reward_std": 0.3036491945385933, + "rewards/GDino": 0.8239583373069763, + "rewards/GIT": 0.6844146698713303, + "rewards/HPSv2": 0.24951934814453125, + "rewards/ORM": 0.41320937871932983, + "self_certainty_semantic": -25.25, + "self_certainty_token": -22.0, + "step": 99 + }, + { + "completion_length": 48.0625, + "epoch": 0.11074197120708748, + "grad_norm": 0.4250389039516449, + "kl": 0.00537109375, + "learning_rate": 9.374999999999999e-07, + "loss": -0.014408082235604525, + "reward": 1.9375371932983398, + "reward_std": 0.4484590142965317, + "rewards/GDino": 0.6897697150707245, + "rewards/GIT": 0.4094943553209305, + "rewards/HPSv2": 0.2472515106201172, + "rewards/ORM": 0.5910216569900513, + "self_certainty_semantic": -25.25, + "self_certainty_token": -21.4375, + "step": 100 + }, + { + "completion_length": 53.546875, + "epoch": 0.11184939091915837, + "grad_norm": 0.9339170455932617, + "kl": 0.00566864013671875, + "learning_rate": 9.368749999999999e-07, + "loss": 0.003982411697506905, + "reward": 2.2582755088806152, + "reward_std": 0.41422703862190247, + "rewards/GDino": 0.8809943795204163, + "rewards/GIT": 0.5047063827514648, + "rewards/HPSv2": 0.27569580078125, + "rewards/ORM": 0.5968790352344513, + "self_certainty_semantic": -25.6875, + "self_certainty_token": -20.6875, + "step": 101 + }, + { + "completion_length": 49.59375, + "epoch": 0.11295681063122924, + "grad_norm": 0.3932209610939026, + "kl": 0.0052032470703125, + "learning_rate": 9.3625e-07, + "loss": -0.015652057249099016, + "reward": 2.285743832588196, + "reward_std": 0.4700127840042114, + "rewards/GDino": 0.7444302141666412, + "rewards/GIT": 0.5256561636924744, + "rewards/HPSv2": 0.2712440490722656, + "rewards/ORM": 0.7444134056568146, + "self_certainty_semantic": -25.125, + "self_certainty_token": -21.5625, + "step": 102 + }, + { + "completion_length": 54.6875, + "epoch": 0.11406423034330011, + "grad_norm": 0.44802016019821167, + "kl": 0.006866455078125, + "learning_rate": 9.35625e-07, + "loss": 0.004784752381965518, + "reward": 1.801784873008728, + "reward_std": 0.5139727592468262, + "rewards/GDino": 0.6817658245563507, + "rewards/GIT": 0.18415232002735138, + "rewards/HPSv2": 0.2674713134765625, + "rewards/ORM": 0.6683953106403351, + "self_certainty_semantic": -25.125, + "self_certainty_token": -21.5, + "step": 103 + }, + { + "completion_length": 41.671875, + "epoch": 0.11517165005537099, + "grad_norm": 0.5610178709030151, + "kl": 0.0048675537109375, + "learning_rate": 9.35e-07, + "loss": 0.012662995606660843, + "reward": 2.2165188789367676, + "reward_std": 0.4234919399023056, + "rewards/GDino": 0.8522021770477295, + "rewards/GIT": 0.4973383694887161, + "rewards/HPSv2": 0.25607872009277344, + "rewards/ORM": 0.6108995676040649, + "self_certainty_semantic": -25.3125, + "self_certainty_token": -21.875, + "step": 104 + }, + { + "completion_length": 48.609375, + "epoch": 0.11627906976744186, + "grad_norm": 0.5205046534538269, + "kl": 0.00438690185546875, + "learning_rate": 9.34375e-07, + "loss": 0.010773615911602974, + "reward": 1.9969267845153809, + "reward_std": 0.462581530213356, + "rewards/GDino": 0.739062488079071, + "rewards/GIT": 0.5269564837217331, + "rewards/HPSv2": 0.2528705596923828, + "rewards/ORM": 0.4780370891094208, + "self_certainty_semantic": -25.25, + "self_certainty_token": -20.9375, + "step": 105 + }, + { + "completion_length": 53.09375, + "epoch": 0.11738648947951273, + "grad_norm": 0.4945337772369385, + "kl": 0.00521087646484375, + "learning_rate": 9.3375e-07, + "loss": -0.015708873979747295, + "reward": 1.9371621012687683, + "reward_std": 0.3034388795495033, + "rewards/GDino": 0.7275120615959167, + "rewards/GIT": 0.5758572816848755, + "rewards/HPSv2": 0.26700592041015625, + "rewards/ORM": 0.3667868673801422, + "self_certainty_semantic": -25.25, + "self_certainty_token": -20.75, + "step": 106 + }, + { + "completion_length": 47.890625, + "epoch": 0.1184939091915836, + "grad_norm": 0.5665311217308044, + "kl": 0.007598876953125, + "learning_rate": 9.33125e-07, + "loss": -0.003110818797722459, + "reward": 2.5077059268951416, + "reward_std": 0.4203761965036392, + "rewards/GDino": 0.9479166865348816, + "rewards/GIT": 0.7534970641136169, + "rewards/HPSv2": 0.2581634521484375, + "rewards/ORM": 0.548128753900528, + "self_certainty_semantic": -25.1875, + "self_certainty_token": -20.6875, + "step": 107 + }, + { + "completion_length": 42.859375, + "epoch": 0.11960132890365449, + "grad_norm": 0.5378500819206238, + "kl": 0.0073089599609375, + "learning_rate": 9.325e-07, + "loss": -0.0038322817999869585, + "reward": 2.261056423187256, + "reward_std": 0.27708302438259125, + "rewards/GDino": 0.7834341526031494, + "rewards/GIT": 0.4610650986433029, + "rewards/HPSv2": 0.28486061096191406, + "rewards/ORM": 0.731696605682373, + "self_certainty_semantic": -25.375, + "self_certainty_token": -20.75, + "step": 108 + }, + { + "completion_length": 50.828125, + "epoch": 0.12070874861572536, + "grad_norm": 0.4483701288700104, + "kl": 0.0067138671875, + "learning_rate": 9.31875e-07, + "loss": 0.021743599325418472, + "reward": 2.118706166744232, + "reward_std": 0.3737848997116089, + "rewards/GDino": 0.793749988079071, + "rewards/GIT": 0.5421168804168701, + "rewards/HPSv2": 0.27524757385253906, + "rewards/ORM": 0.5075916796922684, + "self_certainty_semantic": -25.25, + "self_certainty_token": -22.4375, + "step": 109 + }, + { + "completion_length": 44.40625, + "epoch": 0.12181616832779624, + "grad_norm": 5.2752509117126465, + "kl": 0.005615234375, + "learning_rate": 9.3125e-07, + "loss": -0.008640175685286522, + "reward": 2.223414659500122, + "reward_std": 0.46220165491104126, + "rewards/GDino": 0.8336420953273773, + "rewards/GIT": 0.4199307709932327, + "rewards/HPSv2": 0.24015045166015625, + "rewards/ORM": 0.7296914756298065, + "self_certainty_semantic": -25.1875, + "self_certainty_token": -21.9375, + "step": 110 + }, + { + "completion_length": 54.234375, + "epoch": 0.12292358803986711, + "grad_norm": 1.4650001525878906, + "kl": 0.0050811767578125, + "learning_rate": 9.30625e-07, + "loss": 0.01813027122989297, + "reward": 1.980902910232544, + "reward_std": 0.3374909907579422, + "rewards/GDino": 0.7292370200157166, + "rewards/GIT": 0.3778613805770874, + "rewards/HPSv2": 0.27063751220703125, + "rewards/ORM": 0.603166937828064, + "self_certainty_semantic": -25.1875, + "self_certainty_token": -21.875, + "step": 111 + }, + { + "completion_length": 42.015625, + "epoch": 0.12403100775193798, + "grad_norm": 2.8602488040924072, + "kl": 0.014862060546875, + "learning_rate": 9.3e-07, + "loss": 0.001048431033268571, + "reward": 2.127329468727112, + "reward_std": 0.3023644834756851, + "rewards/GDino": 0.7870483100414276, + "rewards/GIT": 0.3371267020702362, + "rewards/HPSv2": 0.2804222106933594, + "rewards/ORM": 0.7227321267127991, + "self_certainty_semantic": -25.375, + "self_certainty_token": -21.5625, + "step": 112 + }, + { + "completion_length": 40.921875, + "epoch": 0.12513842746400886, + "grad_norm": 0.47990044951438904, + "kl": 0.006256103515625, + "learning_rate": 9.293749999999999e-07, + "loss": -0.0006602238863706589, + "reward": 1.6797617077827454, + "reward_std": 0.38670530915260315, + "rewards/GDino": 0.6658706367015839, + "rewards/GIT": 0.29068493843078613, + "rewards/HPSv2": 0.2770404815673828, + "rewards/ORM": 0.44616562128067017, + "self_certainty_semantic": -25.1875, + "self_certainty_token": -21.8125, + "step": 113 + }, + { + "completion_length": 45.734375, + "epoch": 0.12624584717607973, + "grad_norm": 0.7622689008712769, + "kl": 0.006622314453125, + "learning_rate": 9.287499999999999e-07, + "loss": 0.01737637398764491, + "reward": 2.336304783821106, + "reward_std": 0.2698482424020767, + "rewards/GDino": 0.7757812738418579, + "rewards/GIT": 0.6553223580121994, + "rewards/HPSv2": 0.2794780731201172, + "rewards/ORM": 0.6257232129573822, + "self_certainty_semantic": -25.0625, + "self_certainty_token": -21.6875, + "step": 114 + }, + { + "completion_length": 49.65625, + "epoch": 0.1273532668881506, + "grad_norm": 0.5282474756240845, + "kl": 0.007537841796875, + "learning_rate": 9.281249999999999e-07, + "loss": 0.016482284292578697, + "reward": 2.164160192012787, + "reward_std": 0.3744830787181854, + "rewards/GDino": 0.710269957780838, + "rewards/GIT": 0.5369775593280792, + "rewards/HPSv2": 0.2497711181640625, + "rewards/ORM": 0.6671415567398071, + "self_certainty_semantic": -25.375, + "self_certainty_token": -20.6875, + "step": 115 + }, + { + "completion_length": 51.953125, + "epoch": 0.12846068660022147, + "grad_norm": 0.477001816034317, + "kl": 0.0054931640625, + "learning_rate": 9.274999999999999e-07, + "loss": 0.005994495470076799, + "reward": 2.3729158639907837, + "reward_std": 0.29741741716861725, + "rewards/GDino": 0.7775339484214783, + "rewards/GIT": 0.54022216796875, + "rewards/HPSv2": 0.26922607421875, + "rewards/ORM": 0.7859334647655487, + "self_certainty_semantic": -25.1875, + "self_certainty_token": -20.5, + "step": 116 + }, + { + "completion_length": 47.203125, + "epoch": 0.12956810631229235, + "grad_norm": 0.5086694955825806, + "kl": 0.007476806640625, + "learning_rate": 9.268749999999999e-07, + "loss": -0.011362070217728615, + "reward": 2.5480291843414307, + "reward_std": 0.21939973533153534, + "rewards/GDino": 0.8421875238418579, + "rewards/GIT": 0.6468265354633331, + "rewards/HPSv2": 0.28557777404785156, + "rewards/ORM": 0.7734375, + "self_certainty_semantic": -25.3125, + "self_certainty_token": -22.1875, + "step": 117 + }, + { + "completion_length": 47.96875, + "epoch": 0.13067552602436322, + "grad_norm": 0.4937967360019684, + "kl": 0.008392333984375, + "learning_rate": 9.2625e-07, + "loss": 0.021687609143555164, + "reward": 1.9581258296966553, + "reward_std": 0.4126932621002197, + "rewards/GDino": 0.7584865391254425, + "rewards/GIT": 0.40825480222702026, + "rewards/HPSv2": 0.2484416961669922, + "rewards/ORM": 0.5429428219795227, + "self_certainty_semantic": -25.375, + "self_certainty_token": -21.25, + "step": 118 + }, + { + "completion_length": 42.109375, + "epoch": 0.13178294573643412, + "grad_norm": 0.5172230005264282, + "kl": 0.008087158203125, + "learning_rate": 9.25625e-07, + "loss": -0.019072898663580418, + "reward": 2.281066656112671, + "reward_std": 0.5171918570995331, + "rewards/GDino": 0.777093768119812, + "rewards/GIT": 0.48773056268692017, + "rewards/HPSv2": 0.2693214416503906, + "rewards/ORM": 0.7469209730625153, + "self_certainty_semantic": -25.1875, + "self_certainty_token": -21.6875, + "step": 119 + }, + { + "completion_length": 47.03125, + "epoch": 0.132890365448505, + "grad_norm": 0.5932603478431702, + "kl": 0.006134033203125, + "learning_rate": 9.25e-07, + "loss": -0.013699718751013279, + "reward": 1.8187614679336548, + "reward_std": 0.3037511110305786, + "rewards/GDino": 0.6585085391998291, + "rewards/GIT": 0.44199828803539276, + "rewards/HPSv2": 0.2613792419433594, + "rewards/ORM": 0.4568754881620407, + "self_certainty_semantic": -25.125, + "self_certainty_token": -21.25, + "step": 120 + }, + { + "completion_length": 43.09375, + "epoch": 0.13399778516057587, + "grad_norm": 0.5679785013198853, + "kl": 0.0063629150390625, + "learning_rate": 9.243749999999999e-07, + "loss": 0.00992331630550325, + "reward": 2.001632511615753, + "reward_std": 0.43007975816726685, + "rewards/GDino": 0.7286458313465118, + "rewards/GIT": 0.4185456335544586, + "rewards/HPSv2": 0.25841522216796875, + "rewards/ORM": 0.5960258543491364, + "self_certainty_semantic": -25.0, + "self_certainty_token": -21.4375, + "step": 121 + }, + { + "completion_length": 45.125, + "epoch": 0.13510520487264674, + "grad_norm": 0.5153163075447083, + "kl": 0.0077056884765625, + "learning_rate": 9.237499999999999e-07, + "loss": 0.023711273446679115, + "reward": 1.9004549980163574, + "reward_std": 0.30516810715198517, + "rewards/GDino": 0.7201891243457794, + "rewards/GIT": 0.3620809018611908, + "rewards/HPSv2": 0.2901439666748047, + "rewards/ORM": 0.5280410796403885, + "self_certainty_semantic": -25.25, + "self_certainty_token": -20.9375, + "step": 122 + }, + { + "completion_length": 40.984375, + "epoch": 0.1362126245847176, + "grad_norm": 0.5419678092002869, + "kl": 0.0060882568359375, + "learning_rate": 9.23125e-07, + "loss": 0.002633487805724144, + "reward": 1.8504286408424377, + "reward_std": 0.48781776428222656, + "rewards/GDino": 0.7166666686534882, + "rewards/GIT": 0.484364315867424, + "rewards/HPSv2": 0.25711822509765625, + "rewards/ORM": 0.39227938652038574, + "self_certainty_semantic": -25.1875, + "self_certainty_token": -21.375, + "step": 123 + }, + { + "completion_length": 54.109375, + "epoch": 0.13732004429678848, + "grad_norm": 3.3991947174072266, + "kl": 0.00714111328125, + "learning_rate": 9.225e-07, + "loss": 0.010619609151035547, + "reward": 2.2070562839508057, + "reward_std": 0.4152011424303055, + "rewards/GDino": 0.7853051722049713, + "rewards/GIT": 0.4775720238685608, + "rewards/HPSv2": 0.25039100646972656, + "rewards/ORM": 0.6937879621982574, + "self_certainty_semantic": -25.375, + "self_certainty_token": -21.625, + "step": 124 + }, + { + "completion_length": 52.390625, + "epoch": 0.13842746400885936, + "grad_norm": 0.6660559773445129, + "kl": 0.00579833984375, + "learning_rate": 9.21875e-07, + "loss": 0.030846341978758574, + "reward": 2.0939546823501587, + "reward_std": 0.3831700086593628, + "rewards/GDino": 0.8140697479248047, + "rewards/GIT": 0.5598450750112534, + "rewards/HPSv2": 0.24968528747558594, + "rewards/ORM": 0.47035445272922516, + "self_certainty_semantic": -25.5625, + "self_certainty_token": -21.625, + "step": 125 + }, + { + "completion_length": 40.546875, + "epoch": 0.13953488372093023, + "grad_norm": 0.4604582190513611, + "kl": 0.0074005126953125, + "learning_rate": 9.2125e-07, + "loss": -0.00871883099898696, + "reward": 1.8355292081832886, + "reward_std": 0.3674861043691635, + "rewards/GDino": 0.7088627219200134, + "rewards/GIT": 0.29635151475667953, + "rewards/HPSv2": 0.25839805603027344, + "rewards/ORM": 0.5719168931245804, + "self_certainty_semantic": -25.3125, + "self_certainty_token": -21.125, + "step": 126 + }, + { + "completion_length": 54.71875, + "epoch": 0.1406423034330011, + "grad_norm": 0.8080440759658813, + "kl": 0.0090179443359375, + "learning_rate": 9.20625e-07, + "loss": -0.01499070762656629, + "reward": 2.144508123397827, + "reward_std": 0.3519645929336548, + "rewards/GDino": 0.7293722033500671, + "rewards/GIT": 0.42373301088809967, + "rewards/HPSv2": 0.27573204040527344, + "rewards/ORM": 0.7156709134578705, + "self_certainty_semantic": -25.1875, + "self_certainty_token": -21.625, + "step": 127 + }, + { + "completion_length": 51.46875, + "epoch": 0.14174972314507198, + "grad_norm": 0.5151174068450928, + "kl": 0.007843017578125, + "learning_rate": 9.2e-07, + "loss": 0.011134594678878784, + "reward": 1.9095789790153503, + "reward_std": 0.5153420865535736, + "rewards/GDino": 0.7141143381595612, + "rewards/GIT": 0.26926109194755554, + "rewards/HPSv2": 0.26451873779296875, + "rewards/ORM": 0.6616848409175873, + "self_certainty_semantic": -25.1875, + "self_certainty_token": -21.25, + "step": 128 + }, + { + "completion_length": 49.1875, + "epoch": 0.14285714285714285, + "grad_norm": 0.5960005521774292, + "kl": 0.0080413818359375, + "learning_rate": 9.19375e-07, + "loss": -0.0018172780983150005, + "reward": 2.0566558837890625, + "reward_std": 0.29967472702264786, + "rewards/GDino": 0.7159374058246613, + "rewards/GIT": 0.4233300983905792, + "rewards/HPSv2": 0.25041770935058594, + "rewards/ORM": 0.6669707000255585, + "self_certainty_semantic": -25.125, + "self_certainty_token": -21.0625, + "step": 129 + }, + { + "completion_length": 47.046875, + "epoch": 0.14396456256921372, + "grad_norm": 0.5405691862106323, + "kl": 0.0067596435546875, + "learning_rate": 9.187499999999999e-07, + "loss": 0.032197900116443634, + "reward": 1.9141977429389954, + "reward_std": 0.455816388130188, + "rewards/GDino": 0.7374999821186066, + "rewards/GIT": 0.4731251299381256, + "rewards/HPSv2": 0.2379016876220703, + "rewards/ORM": 0.46567094326019287, + "self_certainty_semantic": -25.375, + "self_certainty_token": -21.1875, + "step": 130 + }, + { + "completion_length": 44.125, + "epoch": 0.1450719822812846, + "grad_norm": 0.524912416934967, + "kl": 0.009796142578125, + "learning_rate": 9.181249999999999e-07, + "loss": -0.0008213929831981659, + "reward": 1.8893061876296997, + "reward_std": 0.3467349708080292, + "rewards/GDino": 0.7044448256492615, + "rewards/GIT": 0.35102422535419464, + "rewards/HPSv2": 0.26580047607421875, + "rewards/ORM": 0.5680365860462189, + "self_certainty_semantic": -25.125, + "self_certainty_token": -21.8125, + "step": 131 + }, + { + "completion_length": 34.625, + "epoch": 0.1461794019933555, + "grad_norm": 0.8808599710464478, + "kl": 0.011566162109375, + "learning_rate": 9.174999999999999e-07, + "loss": -0.003930883482098579, + "reward": 2.177262306213379, + "reward_std": 0.5171742737293243, + "rewards/GDino": 0.8225250542163849, + "rewards/GIT": 0.4376496821641922, + "rewards/HPSv2": 0.28387451171875, + "rewards/ORM": 0.6332131624221802, + "self_certainty_semantic": -25.1875, + "self_certainty_token": -21.8125, + "step": 132 + }, + { + "completion_length": 48.515625, + "epoch": 0.14728682170542637, + "grad_norm": 0.5650766491889954, + "kl": 0.006622314453125, + "learning_rate": 9.168749999999999e-07, + "loss": -0.00018032779917120934, + "reward": 2.5767033100128174, + "reward_std": 0.21334625780582428, + "rewards/GDino": 0.768750011920929, + "rewards/GIT": 0.6930812895298004, + "rewards/HPSv2": 0.2711219787597656, + "rewards/ORM": 0.8437499701976776, + "self_certainty_semantic": -25.5, + "self_certainty_token": -21.3125, + "step": 133 + }, + { + "completion_length": 44.640625, + "epoch": 0.14839424141749724, + "grad_norm": 0.5156822800636292, + "kl": 0.004974365234375, + "learning_rate": 9.1625e-07, + "loss": 0.002806268632411957, + "reward": 1.9863407611846924, + "reward_std": 0.49281907081604004, + "rewards/GDino": 0.7205729484558105, + "rewards/GIT": 0.500615194439888, + "rewards/HPSv2": 0.25568580627441406, + "rewards/ORM": 0.5094669014215469, + "self_certainty_semantic": -25.3125, + "self_certainty_token": -20.5, + "step": 134 + }, + { + "completion_length": 45.390625, + "epoch": 0.14950166112956811, + "grad_norm": 0.8464062809944153, + "kl": 0.0070343017578125, + "learning_rate": 9.15625e-07, + "loss": -0.019531114026904106, + "reward": 2.0325437784194946, + "reward_std": 0.40662893652915955, + "rewards/GDino": 0.7845472693443298, + "rewards/GIT": 0.48149144649505615, + "rewards/HPSv2": 0.2637767791748047, + "rewards/ORM": 0.5027283281087875, + "self_certainty_semantic": -25.25, + "self_certainty_token": -20.3125, + "step": 135 + }, + { + "completion_length": 51.34375, + "epoch": 0.150609080841639, + "grad_norm": 0.5737318992614746, + "kl": 0.0070343017578125, + "learning_rate": 9.15e-07, + "loss": -0.0035284715704619884, + "reward": 1.6841511130332947, + "reward_std": 0.3555753082036972, + "rewards/GDino": 0.6274834871292114, + "rewards/GIT": 0.27718986570835114, + "rewards/HPSv2": 0.2579154968261719, + "rewards/ORM": 0.521562248468399, + "self_certainty_semantic": -25.0, + "self_certainty_token": -21.0, + "step": 136 + }, + { + "completion_length": 44.890625, + "epoch": 0.15171650055370986, + "grad_norm": 0.9722269773483276, + "kl": 0.009033203125, + "learning_rate": 9.14375e-07, + "loss": -0.010386745911091566, + "reward": 2.115400731563568, + "reward_std": 0.4073975533246994, + "rewards/GDino": 0.7504827678203583, + "rewards/GIT": 0.5390563532710075, + "rewards/HPSv2": 0.25206947326660156, + "rewards/ORM": 0.5737921595573425, + "self_certainty_semantic": -25.375, + "self_certainty_token": -21.625, + "step": 137 + }, + { + "completion_length": 45.75, + "epoch": 0.15282392026578073, + "grad_norm": 0.7052740454673767, + "kl": 0.010101318359375, + "learning_rate": 9.137499999999999e-07, + "loss": -0.009516147896647453, + "reward": 1.8091301918029785, + "reward_std": 0.3067747950553894, + "rewards/GDino": 0.6395186185836792, + "rewards/GIT": 0.19299907237291336, + "rewards/HPSv2": 0.26455116271972656, + "rewards/ORM": 0.7120613753795624, + "self_certainty_semantic": -25.0625, + "self_certainty_token": -21.1875, + "step": 138 + }, + { + "completion_length": 52.609375, + "epoch": 0.1539313399778516, + "grad_norm": 0.49324488639831543, + "kl": 0.0060577392578125, + "learning_rate": 9.131249999999999e-07, + "loss": 0.019678042270243168, + "reward": 2.08823698759079, + "reward_std": 0.3092179298400879, + "rewards/GDino": 0.7869435846805573, + "rewards/GIT": 0.3419180363416672, + "rewards/HPSv2": 0.27410125732421875, + "rewards/ORM": 0.6852740943431854, + "self_certainty_semantic": -25.3125, + "self_certainty_token": -21.5625, + "step": 139 + }, + { + "completion_length": 44.65625, + "epoch": 0.15503875968992248, + "grad_norm": 0.46719232201576233, + "kl": 0.005218505859375, + "learning_rate": 9.124999999999999e-07, + "loss": -0.016016804613173008, + "reward": 2.2374342679977417, + "reward_std": 0.4363926351070404, + "rewards/GDino": 0.8431436419487, + "rewards/GIT": 0.5656594336032867, + "rewards/HPSv2": 0.25456809997558594, + "rewards/ORM": 0.5740630030632019, + "self_certainty_semantic": -25.25, + "self_certainty_token": -21.5, + "step": 140 + }, + { + "completion_length": 58.671875, + "epoch": 0.15614617940199335, + "grad_norm": 0.4546290934085846, + "kl": 0.004730224609375, + "learning_rate": 9.11875e-07, + "loss": -0.0007884092628955841, + "reward": 2.369232416152954, + "reward_std": 0.263886496424675, + "rewards/GDino": 0.7109375298023224, + "rewards/GIT": 0.7504715323448181, + "rewards/HPSv2": 0.245635986328125, + "rewards/ORM": 0.6621872782707214, + "self_certainty_semantic": -25.3125, + "self_certainty_token": -20.75, + "step": 141 + }, + { + "completion_length": 42.4375, + "epoch": 0.15725359911406422, + "grad_norm": 0.9214792251586914, + "kl": 0.009521484375, + "learning_rate": 9.1125e-07, + "loss": 0.003012734232470393, + "reward": 2.1930705904960632, + "reward_std": 0.39259086549282074, + "rewards/GDino": 0.7755208611488342, + "rewards/GIT": 0.5075857639312744, + "rewards/HPSv2": 0.2599220275878906, + "rewards/ORM": 0.6500419527292252, + "self_certainty_semantic": -25.4375, + "self_certainty_token": -20.625, + "step": 142 + }, + { + "completion_length": 44.984375, + "epoch": 0.1583610188261351, + "grad_norm": 0.622131884098053, + "kl": 0.007476806640625, + "learning_rate": 9.10625e-07, + "loss": 0.012470124522224069, + "reward": 1.945671796798706, + "reward_std": 0.3904338628053665, + "rewards/GDino": 0.7124259173870087, + "rewards/GIT": 0.36576879024505615, + "rewards/HPSv2": 0.2677898406982422, + "rewards/ORM": 0.599687248468399, + "self_certainty_semantic": -25.0625, + "self_certainty_token": -20.5625, + "step": 143 + }, + { + "completion_length": 50.4375, + "epoch": 0.15946843853820597, + "grad_norm": 0.5398712158203125, + "kl": 0.007080078125, + "learning_rate": 9.1e-07, + "loss": 0.009707295335829258, + "reward": 1.9473342895507812, + "reward_std": 0.411212295293808, + "rewards/GDino": 0.6565104126930237, + "rewards/GIT": 0.4918062835931778, + "rewards/HPSv2": 0.24593448638916016, + "rewards/ORM": 0.5530830472707748, + "self_certainty_semantic": -25.1875, + "self_certainty_token": -21.1875, + "step": 144 + }, + { + "completion_length": 48.828125, + "epoch": 0.16057585825027684, + "grad_norm": 0.44240400195121765, + "kl": 0.0082550048828125, + "learning_rate": 9.09375e-07, + "loss": -0.015047748805955052, + "reward": 1.8875170946121216, + "reward_std": 0.37340451776981354, + "rewards/GDino": 0.6789085865020752, + "rewards/GIT": 0.31243064999580383, + "rewards/HPSv2": 0.2805948257446289, + "rewards/ORM": 0.6155830323696136, + "self_certainty_semantic": -25.3125, + "self_certainty_token": -21.375, + "step": 145 + }, + { + "completion_length": 42.234375, + "epoch": 0.16168327796234774, + "grad_norm": 0.6678792238235474, + "kl": 0.0080718994140625, + "learning_rate": 9.087499999999999e-07, + "loss": 0.015477177686989307, + "reward": 2.123531699180603, + "reward_std": 0.4475431591272354, + "rewards/GDino": 0.8569894731044769, + "rewards/GIT": 0.35753606259822845, + "rewards/HPSv2": 0.26838111877441406, + "rewards/ORM": 0.640625, + "self_certainty_semantic": -25.3125, + "self_certainty_token": -21.5, + "step": 146 + }, + { + "completion_length": 47.921875, + "epoch": 0.16279069767441862, + "grad_norm": 0.4761756658554077, + "kl": 0.007293701171875, + "learning_rate": 9.081249999999999e-07, + "loss": -0.008836451917886734, + "reward": 1.6021055579185486, + "reward_std": 0.30197490751743317, + "rewards/GDino": 0.6556249558925629, + "rewards/GIT": 0.1688888967037201, + "rewards/HPSv2": 0.24634170532226562, + "rewards/ORM": 0.5312500298023224, + "self_certainty_semantic": -25.0625, + "self_certainty_token": -22.0, + "step": 147 + }, + { + "completion_length": 46.921875, + "epoch": 0.1638981173864895, + "grad_norm": 0.6984077095985413, + "kl": 0.011322021484375, + "learning_rate": 9.074999999999999e-07, + "loss": -0.001696310006082058, + "reward": 2.1800928115844727, + "reward_std": 0.22076455503702164, + "rewards/GDino": 0.8397657871246338, + "rewards/GIT": 0.3664309233427048, + "rewards/HPSv2": 0.2707710266113281, + "rewards/ORM": 0.703125, + "self_certainty_semantic": -25.4375, + "self_certainty_token": -21.625, + "step": 148 + }, + { + "completion_length": 42.984375, + "epoch": 0.16500553709856036, + "grad_norm": 0.6648632884025574, + "kl": 0.008514404296875, + "learning_rate": 9.068749999999999e-07, + "loss": 0.0001570945605635643, + "reward": 2.0893077850341797, + "reward_std": 0.40299197286367416, + "rewards/GDino": 0.7465624809265137, + "rewards/GIT": 0.32270002365112305, + "rewards/HPSv2": 0.2650909423828125, + "rewards/ORM": 0.7549542784690857, + "self_certainty_semantic": -25.375, + "self_certainty_token": -21.0, + "step": 149 + }, + { + "completion_length": 46.765625, + "epoch": 0.16611295681063123, + "grad_norm": 0.4624801278114319, + "kl": 0.00653076171875, + "learning_rate": 9.0625e-07, + "loss": -0.00981504051014781, + "reward": 1.8526134490966797, + "reward_std": 0.49954167008399963, + "rewards/GDino": 0.7183263897895813, + "rewards/GIT": 0.28864332288503647, + "rewards/HPSv2": 0.2530975341796875, + "rewards/ORM": 0.5925461798906326, + "self_certainty_semantic": -25.0625, + "self_certainty_token": -21.3125, + "step": 150 + }, + { + "completion_length": 44.265625, + "epoch": 0.1672203765227021, + "grad_norm": 0.6167420744895935, + "kl": 0.008636474609375, + "learning_rate": 9.05625e-07, + "loss": 0.015115905553102493, + "reward": 2.337567687034607, + "reward_std": 0.42467789351940155, + "rewards/GDino": 0.8551518619060516, + "rewards/GIT": 0.40152132511138916, + "rewards/HPSv2": 0.2702198028564453, + "rewards/ORM": 0.8106746673583984, + "self_certainty_semantic": -25.3125, + "self_certainty_token": -21.25, + "step": 151 + }, + { + "completion_length": 49.578125, + "epoch": 0.16832779623477298, + "grad_norm": 0.8095238208770752, + "kl": 0.006683349609375, + "learning_rate": 9.05e-07, + "loss": -0.005870660301297903, + "reward": 2.1157608032226562, + "reward_std": 0.20268037915229797, + "rewards/GDino": 0.8022373914718628, + "rewards/GIT": 0.4189887195825577, + "rewards/HPSv2": 0.27895164489746094, + "rewards/ORM": 0.6155830323696136, + "self_certainty_semantic": -25.375, + "self_certainty_token": -21.625, + "step": 152 + }, + { + "completion_length": 41.84375, + "epoch": 0.16943521594684385, + "grad_norm": 0.5198939442634583, + "kl": 0.0080718994140625, + "learning_rate": 9.04375e-07, + "loss": -0.014349173055961728, + "reward": 2.219430923461914, + "reward_std": 0.31192296743392944, + "rewards/GDino": 0.7789298295974731, + "rewards/GIT": 0.6309380829334259, + "rewards/HPSv2": 0.26394176483154297, + "rewards/ORM": 0.5456212162971497, + "self_certainty_semantic": -25.3125, + "self_certainty_token": -22.125, + "step": 153 + }, + { + "completion_length": 39.796875, + "epoch": 0.17054263565891473, + "grad_norm": 0.4530474543571472, + "kl": 0.00823974609375, + "learning_rate": 9.0375e-07, + "loss": -0.009022563113830984, + "reward": 1.62737375497818, + "reward_std": 0.3316381424665451, + "rewards/GDino": 0.753333568572998, + "rewards/GIT": 0.17652657628059387, + "rewards/HPSv2": 0.2600135803222656, + "rewards/ORM": 0.4375, + "self_certainty_semantic": -25.125, + "self_certainty_token": -22.25, + "step": 154 + }, + { + "completion_length": 56.828125, + "epoch": 0.1716500553709856, + "grad_norm": 0.5859449505805969, + "kl": 0.01031494140625, + "learning_rate": 9.031249999999999e-07, + "loss": -0.006706917891278863, + "reward": 2.134092330932617, + "reward_std": 0.3308701366186142, + "rewards/GDino": 0.6453125327825546, + "rewards/GIT": 0.44731535762548447, + "rewards/HPSv2": 0.26494789123535156, + "rewards/ORM": 0.7765165567398071, + "self_certainty_semantic": -25.4375, + "self_certainty_token": -20.6875, + "step": 155 + }, + { + "completion_length": 41.078125, + "epoch": 0.17275747508305647, + "grad_norm": 0.5353882908821106, + "kl": 0.010894775390625, + "learning_rate": 9.024999999999999e-07, + "loss": 0.0037739332765340805, + "reward": 1.8532127141952515, + "reward_std": 0.3209614157676697, + "rewards/GDino": 0.6619158685207367, + "rewards/GIT": 0.23281337320804596, + "rewards/HPSv2": 0.2831287384033203, + "rewards/ORM": 0.6753546595573425, + "self_certainty_semantic": -25.25, + "self_certainty_token": -21.125, + "step": 156 + }, + { + "completion_length": 54.296875, + "epoch": 0.17386489479512734, + "grad_norm": 0.45864060521125793, + "kl": 0.0081787109375, + "learning_rate": 9.018749999999999e-07, + "loss": 0.01829966064542532, + "reward": 2.4461565017700195, + "reward_std": 0.3625805824995041, + "rewards/GDino": 0.8224999904632568, + "rewards/GIT": 0.7057079672813416, + "rewards/HPSv2": 0.24219322204589844, + "rewards/ORM": 0.6757553368806839, + "self_certainty_semantic": -25.25, + "self_certainty_token": -21.8125, + "step": 157 + }, + { + "completion_length": 35.96875, + "epoch": 0.17497231450719822, + "grad_norm": 0.6962554454803467, + "kl": 0.01031494140625, + "learning_rate": 9.0125e-07, + "loss": 0.018664106726646423, + "reward": 2.2535247802734375, + "reward_std": 0.3470146059989929, + "rewards/GDino": 0.8081650137901306, + "rewards/GIT": 0.43000543117523193, + "rewards/HPSv2": 0.2705402374267578, + "rewards/ORM": 0.7448140382766724, + "self_certainty_semantic": -25.125, + "self_certainty_token": -21.375, + "step": 158 + }, + { + "completion_length": 48.421875, + "epoch": 0.1760797342192691, + "grad_norm": 0.7126027941703796, + "kl": 0.0084686279296875, + "learning_rate": 9.00625e-07, + "loss": -0.01690885704010725, + "reward": 2.103760838508606, + "reward_std": 0.42456358671188354, + "rewards/GDino": 0.7740625143051147, + "rewards/GIT": 0.5006528943777084, + "rewards/HPSv2": 0.2603874206542969, + "rewards/ORM": 0.5686581134796143, + "self_certainty_semantic": -25.0625, + "self_certainty_token": -20.8125, + "step": 159 + }, + { + "completion_length": 47.15625, + "epoch": 0.17718715393134, + "grad_norm": 0.4899819791316986, + "kl": 0.011016845703125, + "learning_rate": 9e-07, + "loss": 0.011189845390617847, + "reward": 1.9280957579612732, + "reward_std": 0.3808829113841057, + "rewards/GDino": 0.7389523684978485, + "rewards/GIT": 0.29086190462112427, + "rewards/HPSv2": 0.27908897399902344, + "rewards/ORM": 0.6191926002502441, + "self_certainty_semantic": -25.25, + "self_certainty_token": -21.875, + "step": 160 + }, + { + "completion_length": 48.0, + "epoch": 0.17829457364341086, + "grad_norm": 0.8614944815635681, + "kl": 0.010162353515625, + "learning_rate": 8.99375e-07, + "loss": -0.0047345394268631935, + "reward": 2.126296818256378, + "reward_std": 0.4349767565727234, + "rewards/GDino": 0.6996158957481384, + "rewards/GIT": 0.4218016564846039, + "rewards/HPSv2": 0.26108741760253906, + "rewards/ORM": 0.743791937828064, + "self_certainty_semantic": -25.1875, + "self_certainty_token": -20.75, + "step": 161 + }, + { + "completion_length": 57.40625, + "epoch": 0.17940199335548174, + "grad_norm": 0.4764331877231598, + "kl": 0.00958251953125, + "learning_rate": 8.9875e-07, + "loss": -0.02189162978902459, + "reward": 2.1861079335212708, + "reward_std": 0.2511523813009262, + "rewards/GDino": 0.9083333313465118, + "rewards/GIT": 0.5160972326993942, + "rewards/HPSv2": 0.2581977844238281, + "rewards/ORM": 0.5034796893596649, + "self_certainty_semantic": -25.4375, + "self_certainty_token": -21.5, + "step": 162 + }, + { + "completion_length": 41.796875, + "epoch": 0.1805094130675526, + "grad_norm": 0.63126540184021, + "kl": 0.01348876953125, + "learning_rate": 8.981249999999999e-07, + "loss": -0.010512399720028043, + "reward": 2.1882660388946533, + "reward_std": 0.26342111825942993, + "rewards/GDino": 0.7701247036457062, + "rewards/GIT": 0.637158066034317, + "rewards/HPSv2": 0.24700498580932617, + "rewards/ORM": 0.5339783430099487, + "self_certainty_semantic": -25.125, + "self_certainty_token": -20.625, + "step": 163 + }, + { + "completion_length": 43.78125, + "epoch": 0.18161683277962348, + "grad_norm": 0.4358772039413452, + "kl": 0.01190185546875, + "learning_rate": 8.974999999999999e-07, + "loss": -0.006003182148560882, + "reward": 2.6546283960342407, + "reward_std": 0.3610512763261795, + "rewards/GDino": 0.8648440539836884, + "rewards/GIT": 0.7706953585147858, + "rewards/HPSv2": 0.269439697265625, + "rewards/ORM": 0.749649316072464, + "self_certainty_semantic": -25.25, + "self_certainty_token": -20.6875, + "step": 164 + }, + { + "completion_length": 48.09375, + "epoch": 0.18272425249169436, + "grad_norm": 1.585694432258606, + "kl": 0.01043701171875, + "learning_rate": 8.96875e-07, + "loss": 8.579343557357788e-05, + "reward": 2.331842541694641, + "reward_std": 0.3782896548509598, + "rewards/GDino": 0.7984375357627869, + "rewards/GIT": 0.5101843625307083, + "rewards/HPSv2": 0.27196693420410156, + "rewards/ORM": 0.751253753900528, + "self_certainty_semantic": -25.375, + "self_certainty_token": -21.8125, + "step": 165 + }, + { + "completion_length": 45.34375, + "epoch": 0.18383167220376523, + "grad_norm": 0.45005306601524353, + "kl": 0.01312255859375, + "learning_rate": 8.9625e-07, + "loss": -0.006219237111508846, + "reward": 2.192749261856079, + "reward_std": 0.2947642654180527, + "rewards/GDino": 0.7351250052452087, + "rewards/GIT": 0.45546063780784607, + "rewards/HPSv2": 0.24868392944335938, + "rewards/ORM": 0.7534796893596649, + "self_certainty_semantic": -25.0625, + "self_certainty_token": -21.25, + "step": 166 + }, + { + "completion_length": 43.015625, + "epoch": 0.1849390919158361, + "grad_norm": 0.5231027007102966, + "kl": 0.010772705078125, + "learning_rate": 8.95625e-07, + "loss": -0.007611713605001569, + "reward": 2.0297417044639587, + "reward_std": 0.4215656816959381, + "rewards/GDino": 0.7420200109481812, + "rewards/GIT": 0.5049543976783752, + "rewards/HPSv2": 0.26178741455078125, + "rewards/ORM": 0.5209799110889435, + "self_certainty_semantic": -25.375, + "self_certainty_token": -21.75, + "step": 167 + }, + { + "completion_length": 53.46875, + "epoch": 0.18604651162790697, + "grad_norm": 0.4332805871963501, + "kl": 0.0062408447265625, + "learning_rate": 8.95e-07, + "loss": -0.004800099181011319, + "reward": 2.05144202709198, + "reward_std": 0.3129463642835617, + "rewards/GDino": 0.7780522406101227, + "rewards/GIT": 0.5233069062232971, + "rewards/HPSv2": 0.2612953186035156, + "rewards/ORM": 0.4887876957654953, + "self_certainty_semantic": -25.0, + "self_certainty_token": -21.1875, + "step": 168 + }, + { + "completion_length": 37.90625, + "epoch": 0.18715393133997785, + "grad_norm": 0.45867836475372314, + "kl": 0.01751708984375, + "learning_rate": 8.94375e-07, + "loss": 0.004684945801272988, + "reward": 2.2036190032958984, + "reward_std": 0.26221713423728943, + "rewards/GDino": 0.8604569435119629, + "rewards/GIT": 0.4576933681964874, + "rewards/HPSv2": 0.28287315368652344, + "rewards/ORM": 0.6025954186916351, + "self_certainty_semantic": -25.3125, + "self_certainty_token": -21.5, + "step": 169 + }, + { + "completion_length": 41.375, + "epoch": 0.18826135105204872, + "grad_norm": 0.4206317365169525, + "kl": 0.009979248046875, + "learning_rate": 8.9375e-07, + "loss": 0.0024775206111371517, + "reward": 1.943125069141388, + "reward_std": 0.4646635055541992, + "rewards/GDino": 0.7305906116962433, + "rewards/GIT": 0.4165241867303848, + "rewards/HPSv2": 0.24480247497558594, + "rewards/ORM": 0.5512078106403351, + "self_certainty_semantic": -25.0625, + "self_certainty_token": -21.125, + "step": 170 + }, + { + "completion_length": 40.703125, + "epoch": 0.1893687707641196, + "grad_norm": 0.697102963924408, + "kl": 0.010833740234375, + "learning_rate": 8.931249999999999e-07, + "loss": -0.003134746104478836, + "reward": 2.203751564025879, + "reward_std": 0.3228776603937149, + "rewards/GDino": 0.7603735029697418, + "rewards/GIT": 0.5309787690639496, + "rewards/HPSv2": 0.2583751678466797, + "rewards/ORM": 0.654024064540863, + "self_certainty_semantic": -25.1875, + "self_certainty_token": -21.1875, + "step": 171 + }, + { + "completion_length": 50.515625, + "epoch": 0.19047619047619047, + "grad_norm": 0.5617944002151489, + "kl": 0.008392333984375, + "learning_rate": 8.924999999999999e-07, + "loss": 0.03539674496278167, + "reward": 1.7965713739395142, + "reward_std": 0.3860231041908264, + "rewards/GDino": 0.6837728917598724, + "rewards/GIT": 0.4299345314502716, + "rewards/HPSv2": 0.2496967315673828, + "rewards/ORM": 0.4331671893596649, + "self_certainty_semantic": -25.1875, + "self_certainty_token": -21.25, + "step": 172 + }, + { + "completion_length": 35.5625, + "epoch": 0.19158361018826134, + "grad_norm": 0.4585064649581909, + "kl": 0.017242431640625, + "learning_rate": 8.918749999999999e-07, + "loss": 0.002358448226004839, + "reward": 2.1448813676834106, + "reward_std": 0.31261830031871796, + "rewards/GDino": 0.834684431552887, + "rewards/GIT": 0.36106863617897034, + "rewards/HPSv2": 0.28739356994628906, + "rewards/ORM": 0.6617347896099091, + "self_certainty_semantic": -25.125, + "self_certainty_token": -22.5625, + "step": 173 + }, + { + "completion_length": 46.859375, + "epoch": 0.19269102990033224, + "grad_norm": 0.42562612891197205, + "kl": 0.00872802734375, + "learning_rate": 8.912499999999999e-07, + "loss": 0.0036601885221898556, + "reward": 2.2070860862731934, + "reward_std": 0.3847656697034836, + "rewards/GDino": 0.7723565697669983, + "rewards/GIT": 0.5430227518081665, + "rewards/HPSv2": 0.25701904296875, + "rewards/ORM": 0.634687751531601, + "self_certainty_semantic": -25.0, + "self_certainty_token": -20.875, + "step": 174 + }, + { + "completion_length": 46.0, + "epoch": 0.1937984496124031, + "grad_norm": 0.5808805227279663, + "kl": 0.0078277587890625, + "learning_rate": 8.906249999999999e-07, + "loss": 0.032253723591566086, + "reward": 2.157870888710022, + "reward_std": 0.2512262612581253, + "rewards/GDino": 0.7799479365348816, + "rewards/GIT": 0.5305047780275345, + "rewards/HPSv2": 0.2598762512207031, + "rewards/ORM": 0.587541937828064, + "self_certainty_semantic": -25.1875, + "self_certainty_token": -21.125, + "step": 175 + }, + { + "completion_length": 58.375, + "epoch": 0.19490586932447398, + "grad_norm": 0.5789065957069397, + "kl": 0.0071868896484375, + "learning_rate": 8.9e-07, + "loss": -0.014507739804685116, + "reward": 1.955940306186676, + "reward_std": 0.3165567219257355, + "rewards/GDino": 0.7864583432674408, + "rewards/GIT": 0.4266613572835922, + "rewards/HPSv2": 0.25844573974609375, + "rewards/ORM": 0.484375, + "self_certainty_semantic": -25.3125, + "self_certainty_token": -20.9375, + "step": 176 + }, + { + "completion_length": 51.328125, + "epoch": 0.19601328903654486, + "grad_norm": 1.309583067893982, + "kl": 0.013214111328125, + "learning_rate": 8.89375e-07, + "loss": -0.0020376548636704683, + "reward": 2.2406471967697144, + "reward_std": 0.3685739040374756, + "rewards/GDino": 0.7480616569519043, + "rewards/GIT": 0.4961736798286438, + "rewards/HPSv2": 0.25730323791503906, + "rewards/ORM": 0.7391084432601929, + "self_certainty_semantic": -25.4375, + "self_certainty_token": -22.25, + "step": 177 + }, + { + "completion_length": 56.609375, + "epoch": 0.19712070874861573, + "grad_norm": 0.6938892602920532, + "kl": 0.008270263671875, + "learning_rate": 8.8875e-07, + "loss": 0.005612233653664589, + "reward": 2.1296470165252686, + "reward_std": 0.3742387443780899, + "rewards/GDino": 0.7609374821186066, + "rewards/GIT": 0.4125446677207947, + "rewards/HPSv2": 0.2701892852783203, + "rewards/ORM": 0.6859754621982574, + "self_certainty_semantic": -25.375, + "self_certainty_token": -21.125, + "step": 178 + }, + { + "completion_length": 37.234375, + "epoch": 0.1982281284606866, + "grad_norm": 0.4790467917919159, + "kl": 0.006927490234375, + "learning_rate": 8.88125e-07, + "loss": 0.01884503196924925, + "reward": 2.431138515472412, + "reward_std": 0.24652785062789917, + "rewards/GDino": 0.8910974562168121, + "rewards/GIT": 0.5796742737293243, + "rewards/HPSv2": 0.28255462646484375, + "rewards/ORM": 0.677812248468399, + "self_certainty_semantic": -25.125, + "self_certainty_token": -22.375, + "step": 179 + }, + { + "completion_length": 45.921875, + "epoch": 0.19933554817275748, + "grad_norm": 0.4324551522731781, + "kl": 0.011749267578125, + "learning_rate": 8.874999999999999e-07, + "loss": -0.016566987615078688, + "reward": 1.7641431093215942, + "reward_std": 0.42914003133773804, + "rewards/GDino": 0.7295474410057068, + "rewards/GIT": 0.32154107093811035, + "rewards/HPSv2": 0.25917816162109375, + "rewards/ORM": 0.4538763463497162, + "self_certainty_semantic": -25.125, + "self_certainty_token": -21.5, + "step": 180 + }, + { + "completion_length": 44.921875, + "epoch": 0.20044296788482835, + "grad_norm": 0.7271856665611267, + "kl": 0.008270263671875, + "learning_rate": 8.86875e-07, + "loss": -0.01033696997910738, + "reward": 1.8466984629631042, + "reward_std": 0.33254362642765045, + "rewards/GDino": 0.6390625238418579, + "rewards/GIT": 0.35668525099754333, + "rewards/HPSv2": 0.27863311767578125, + "rewards/ORM": 0.5723176002502441, + "self_certainty_semantic": -25.0625, + "self_certainty_token": -21.125, + "step": 181 + }, + { + "completion_length": 40.84375, + "epoch": 0.20155038759689922, + "grad_norm": 0.5530015230178833, + "kl": 0.02142333984375, + "learning_rate": 8.8625e-07, + "loss": -0.008886129595339298, + "reward": 1.7842278480529785, + "reward_std": 0.4365523010492325, + "rewards/GDino": 0.7104989886283875, + "rewards/GIT": 0.34373709559440613, + "rewards/HPSv2": 0.27834129333496094, + "rewards/ORM": 0.4516504108905792, + "self_certainty_semantic": -25.1875, + "self_certainty_token": -22.0, + "step": 182 + }, + { + "completion_length": 51.34375, + "epoch": 0.2026578073089701, + "grad_norm": 0.545352041721344, + "kl": 0.015411376953125, + "learning_rate": 8.85625e-07, + "loss": -0.010915862862020731, + "reward": 1.9055233001708984, + "reward_std": 0.38334617018699646, + "rewards/GDino": 0.7308869063854218, + "rewards/GIT": 0.3478083163499832, + "rewards/HPSv2": 0.2643280029296875, + "rewards/ORM": 0.5625000298023224, + "self_certainty_semantic": -25.25, + "self_certainty_token": -21.5, + "step": 183 + }, + { + "completion_length": 38.890625, + "epoch": 0.20376522702104097, + "grad_norm": 0.5411937832832336, + "kl": 0.01177978515625, + "learning_rate": 8.85e-07, + "loss": 0.013757664943113923, + "reward": 2.716238021850586, + "reward_std": 0.29777073860168457, + "rewards/GDino": 0.925000011920929, + "rewards/GIT": 0.7263616919517517, + "rewards/HPSv2": 0.2595672607421875, + "rewards/ORM": 0.805308997631073, + "self_certainty_semantic": -25.0, + "self_certainty_token": -21.4375, + "step": 184 + }, + { + "completion_length": 41.65625, + "epoch": 0.20487264673311184, + "grad_norm": 1.4033399820327759, + "kl": 0.016143798828125, + "learning_rate": 8.84375e-07, + "loss": 0.016424793750047684, + "reward": 2.2067692279815674, + "reward_std": 0.44875267148017883, + "rewards/GDino": 0.8457056879997253, + "rewards/GIT": 0.31159064173698425, + "rewards/HPSv2": 0.275634765625, + "rewards/ORM": 0.7738381326198578, + "self_certainty_semantic": -25.3125, + "self_certainty_token": -22.375, + "step": 185 + }, + { + "completion_length": 40.21875, + "epoch": 0.2059800664451827, + "grad_norm": 0.545001208782196, + "kl": 0.0111083984375, + "learning_rate": 8.8375e-07, + "loss": 0.010479988530278206, + "reward": 1.9079334735870361, + "reward_std": 0.3735552281141281, + "rewards/GDino": 0.6260845363140106, + "rewards/GIT": 0.33919139206409454, + "rewards/HPSv2": 0.2270364761352539, + "rewards/ORM": 0.7156209945678711, + "self_certainty_semantic": -24.9375, + "self_certainty_token": -21.375, + "step": 186 + }, + { + "completion_length": 42.25, + "epoch": 0.2070874861572536, + "grad_norm": 0.5201876759529114, + "kl": 0.012786865234375, + "learning_rate": 8.83125e-07, + "loss": -0.005474693141877651, + "reward": 2.160323202610016, + "reward_std": 0.380416139960289, + "rewards/GDino": 0.8049721121788025, + "rewards/GIT": 0.5423709452152252, + "rewards/HPSv2": 0.25914573669433594, + "rewards/ORM": 0.5538343787193298, + "self_certainty_semantic": -25.0625, + "self_certainty_token": -20.625, + "step": 187 + }, + { + "completion_length": 41.546875, + "epoch": 0.2081949058693245, + "grad_norm": 1.0598704814910889, + "kl": 0.0123291015625, + "learning_rate": 8.824999999999999e-07, + "loss": -0.005072480300441384, + "reward": 2.198424220085144, + "reward_std": 0.318149596452713, + "rewards/GDino": 0.77506023645401, + "rewards/GIT": 0.2563297525048256, + "rewards/HPSv2": 0.26827430725097656, + "rewards/ORM": 0.8987601101398468, + "self_certainty_semantic": -25.3125, + "self_certainty_token": -21.6875, + "step": 188 + }, + { + "completion_length": 45.859375, + "epoch": 0.20930232558139536, + "grad_norm": 1.0103754997253418, + "kl": 0.0113983154296875, + "learning_rate": 8.818749999999999e-07, + "loss": 0.006917888764292002, + "reward": 2.355333089828491, + "reward_std": 0.26609520614147186, + "rewards/GDino": 0.7847018539905548, + "rewards/GIT": 0.5574060827493668, + "rewards/HPSv2": 0.2811260223388672, + "rewards/ORM": 0.7320991158485413, + "self_certainty_semantic": -25.1875, + "self_certainty_token": -20.8125, + "step": 189 + }, + { + "completion_length": 43.21875, + "epoch": 0.21040974529346623, + "grad_norm": 1.6472647190093994, + "kl": 0.021514892578125, + "learning_rate": 8.812499999999999e-07, + "loss": 0.024224724620580673, + "reward": 2.3357620239257812, + "reward_std": 0.36145227402448654, + "rewards/GDino": 0.9042215049266815, + "rewards/GIT": 0.44460529088974, + "rewards/HPSv2": 0.26693153381347656, + "rewards/ORM": 0.720003753900528, + "self_certainty_semantic": -25.0625, + "self_certainty_token": -21.125, + "step": 190 + }, + { + "completion_length": 39.84375, + "epoch": 0.2115171650055371, + "grad_norm": 0.4037325084209442, + "kl": 0.010467529296875, + "learning_rate": 8.806249999999999e-07, + "loss": 0.011584978085011244, + "reward": 2.569461226463318, + "reward_std": 0.31772294640541077, + "rewards/GDino": 0.8936654925346375, + "rewards/GIT": 0.6064814329147339, + "rewards/HPSv2": 0.28333091735839844, + "rewards/ORM": 0.7859834432601929, + "self_certainty_semantic": -25.1875, + "self_certainty_token": -21.75, + "step": 191 + }, + { + "completion_length": 54.203125, + "epoch": 0.21262458471760798, + "grad_norm": 1.364973545074463, + "kl": 0.02630615234375, + "learning_rate": 8.799999999999999e-07, + "loss": -0.045025499537587166, + "reward": 1.7345000505447388, + "reward_std": 0.2504274845123291, + "rewards/GDino": 0.6147945821285248, + "rewards/GIT": 0.4107672870159149, + "rewards/HPSv2": 0.24933433532714844, + "rewards/ORM": 0.45960381627082825, + "self_certainty_semantic": -25.375, + "self_certainty_token": -20.4375, + "step": 192 + }, + { + "completion_length": 41.25, + "epoch": 0.21373200442967885, + "grad_norm": 0.4461517333984375, + "kl": 0.0135498046875, + "learning_rate": 8.793749999999999e-07, + "loss": -0.006084040272980928, + "reward": 1.8086916208267212, + "reward_std": 0.4172170013189316, + "rewards/GDino": 0.6766185760498047, + "rewards/GIT": 0.20515873283147812, + "rewards/HPSv2": 0.2862892150878906, + "rewards/ORM": 0.640625, + "self_certainty_semantic": -25.375, + "self_certainty_token": -21.875, + "step": 193 + }, + { + "completion_length": 46.046875, + "epoch": 0.21483942414174972, + "grad_norm": 0.6739068031311035, + "kl": 0.00982666015625, + "learning_rate": 8.7875e-07, + "loss": 0.005553322844207287, + "reward": 2.0827959775924683, + "reward_std": 0.38654056191444397, + "rewards/GDino": 0.8122715353965759, + "rewards/GIT": 0.5369465202093124, + "rewards/HPSv2": 0.2711658477783203, + "rewards/ORM": 0.46241210401058197, + "self_certainty_semantic": -25.375, + "self_certainty_token": -20.875, + "step": 194 + }, + { + "completion_length": 41.984375, + "epoch": 0.2159468438538206, + "grad_norm": 0.5966407060623169, + "kl": 0.01617431640625, + "learning_rate": 8.78125e-07, + "loss": 0.006598036969080567, + "reward": 2.0439035892486572, + "reward_std": 0.28816351294517517, + "rewards/GDino": 0.7164062261581421, + "rewards/GIT": 0.583609938621521, + "rewards/HPSv2": 0.27428436279296875, + "rewards/ORM": 0.46960312128067017, + "self_certainty_semantic": -25.1875, + "self_certainty_token": -21.5, + "step": 195 + }, + { + "completion_length": 52.09375, + "epoch": 0.21705426356589147, + "grad_norm": 0.4805348217487335, + "kl": 0.007720947265625, + "learning_rate": 8.774999999999999e-07, + "loss": -0.005165549926459789, + "reward": 2.242133378982544, + "reward_std": 0.2688770145177841, + "rewards/GDino": 0.7020833194255829, + "rewards/GIT": 0.539276048541069, + "rewards/HPSv2": 0.2836284637451172, + "rewards/ORM": 0.7171455323696136, + "self_certainty_semantic": -25.375, + "self_certainty_token": -21.625, + "step": 196 + }, + { + "completion_length": 43.796875, + "epoch": 0.21816168327796234, + "grad_norm": 0.635823667049408, + "kl": 0.01513671875, + "learning_rate": 8.76875e-07, + "loss": -0.0036483434960246086, + "reward": 2.2711516618728638, + "reward_std": 0.4390978515148163, + "rewards/GDino": 0.6989583671092987, + "rewards/GIT": 0.5045495182275772, + "rewards/HPSv2": 0.25634765625, + "rewards/ORM": 0.8112961947917938, + "self_certainty_semantic": -25.3125, + "self_certainty_token": -20.875, + "step": 197 + }, + { + "completion_length": 42.03125, + "epoch": 0.21926910299003322, + "grad_norm": 1.2338166236877441, + "kl": 0.012359619140625, + "learning_rate": 8.7625e-07, + "loss": 0.007895001443102956, + "reward": 2.048970103263855, + "reward_std": 0.40481944382190704, + "rewards/GDino": 0.8076697587966919, + "rewards/GIT": 0.5579419136047363, + "rewards/HPSv2": 0.2553253173828125, + "rewards/ORM": 0.42803309112787247, + "self_certainty_semantic": -25.1875, + "self_certainty_token": -21.875, + "step": 198 + }, + { + "completion_length": 45.390625, + "epoch": 0.2203765227021041, + "grad_norm": 0.5174763798713684, + "kl": 0.01416015625, + "learning_rate": 8.75625e-07, + "loss": 0.016770444810390472, + "reward": 2.5844353437423706, + "reward_std": 0.2750231549143791, + "rewards/GDino": 0.8541243076324463, + "rewards/GIT": 0.6642543226480484, + "rewards/HPSv2": 0.25846099853515625, + "rewards/ORM": 0.8075956404209137, + "self_certainty_semantic": -25.1875, + "self_certainty_token": -21.875, + "step": 199 + }, + { + "completion_length": 36.875, + "epoch": 0.22148394241417496, + "grad_norm": 0.509035587310791, + "kl": 0.01654052734375, + "learning_rate": 8.75e-07, + "loss": 0.006852276623249054, + "reward": 2.065316915512085, + "reward_std": 0.3316378742456436, + "rewards/GDino": 0.7682685256004333, + "rewards/GIT": 0.2910846248269081, + "rewards/HPSv2": 0.2672557830810547, + "rewards/ORM": 0.7387078404426575, + "self_certainty_semantic": -25.4375, + "self_certainty_token": -21.625, + "step": 200 + }, + { + "completion_length": 42.046875, + "epoch": 0.22259136212624583, + "grad_norm": 0.5931203961372375, + "kl": 0.012725830078125, + "learning_rate": 8.74375e-07, + "loss": 0.006003358401358128, + "reward": 2.1099300384521484, + "reward_std": 0.5154563784599304, + "rewards/GDino": 0.7727163732051849, + "rewards/GIT": 0.3842439502477646, + "rewards/HPSv2": 0.2718486785888672, + "rewards/ORM": 0.6811210215091705, + "self_certainty_semantic": -25.0, + "self_certainty_token": -21.4375, + "step": 201 + }, + { + "completion_length": 44.171875, + "epoch": 0.22369878183831673, + "grad_norm": 0.4835968613624573, + "kl": 0.0213623046875, + "learning_rate": 8.7375e-07, + "loss": 0.00394545728340745, + "reward": 2.427845597267151, + "reward_std": 0.36134691536426544, + "rewards/GDino": 0.8869982063770294, + "rewards/GIT": 0.5014592558145523, + "rewards/HPSv2": 0.28162574768066406, + "rewards/ORM": 0.7577625215053558, + "self_certainty_semantic": -25.4375, + "self_certainty_token": -21.8125, + "step": 202 + }, + { + "completion_length": 36.453125, + "epoch": 0.2248062015503876, + "grad_norm": 0.46910953521728516, + "kl": 0.0118408203125, + "learning_rate": 8.73125e-07, + "loss": 0.002786251250654459, + "reward": 2.49591064453125, + "reward_std": 0.31263355910778046, + "rewards/GDino": 0.8656250238418579, + "rewards/GIT": 0.7141720354557037, + "rewards/HPSv2": 0.27200889587402344, + "rewards/ORM": 0.6441047191619873, + "self_certainty_semantic": -25.0625, + "self_certainty_token": -21.875, + "step": 203 + }, + { + "completion_length": 40.125, + "epoch": 0.22591362126245848, + "grad_norm": 0.467896968126297, + "kl": 0.02435302734375, + "learning_rate": 8.725e-07, + "loss": -0.011271225987002254, + "reward": 2.057776093482971, + "reward_std": 0.1546005792915821, + "rewards/GDino": 0.7309895753860474, + "rewards/GIT": 0.4367304742336273, + "rewards/HPSv2": 0.27594757080078125, + "rewards/ORM": 0.6141084581613541, + "self_certainty_semantic": -25.3125, + "self_certainty_token": -21.5, + "step": 204 + }, + { + "completion_length": 33.484375, + "epoch": 0.22702104097452935, + "grad_norm": 0.7759230136871338, + "kl": 0.01708984375, + "learning_rate": 8.718749999999999e-07, + "loss": 0.027633000165224075, + "reward": 2.2592638731002808, + "reward_std": 0.3684349060058594, + "rewards/GDino": 0.7562500238418579, + "rewards/GIT": 0.5480997711420059, + "rewards/HPSv2": 0.26626014709472656, + "rewards/ORM": 0.6886538565158844, + "self_certainty_semantic": -25.25, + "self_certainty_token": -20.9375, + "step": 205 + }, + { + "completion_length": 45.046875, + "epoch": 0.22812846068660023, + "grad_norm": 0.4551739990711212, + "kl": 0.012664794921875, + "learning_rate": 8.712499999999999e-07, + "loss": 0.0007739269640296698, + "reward": 2.30593478679657, + "reward_std": 0.2534303367137909, + "rewards/GDino": 0.7875000238418579, + "rewards/GIT": 0.40002302825450897, + "rewards/HPSv2": 0.2788429260253906, + "rewards/ORM": 0.8395689129829407, + "self_certainty_semantic": -25.375, + "self_certainty_token": -22.8125, + "step": 206 + }, + { + "completion_length": 39.53125, + "epoch": 0.2292358803986711, + "grad_norm": 0.5830913782119751, + "kl": 0.01336669921875, + "learning_rate": 8.706249999999999e-07, + "loss": 0.01708681881427765, + "reward": 2.227774500846863, + "reward_std": 0.3704899549484253, + "rewards/GDino": 0.8083823919296265, + "rewards/GIT": 0.4898236393928528, + "rewards/HPSv2": 0.25934791564941406, + "rewards/ORM": 0.6702206134796143, + "self_certainty_semantic": -24.875, + "self_certainty_token": -21.5, + "step": 207 + }, + { + "completion_length": 38.5625, + "epoch": 0.23034330011074197, + "grad_norm": 0.3691738247871399, + "kl": 0.02581787109375, + "learning_rate": 8.699999999999999e-07, + "loss": 0.004591751378029585, + "reward": 2.358023166656494, + "reward_std": 0.17707649432122707, + "rewards/GDino": 0.8265625238418579, + "rewards/GIT": 0.6824776232242584, + "rewards/HPSv2": 0.2587127685546875, + "rewards/ORM": 0.5902703106403351, + "self_certainty_semantic": -25.25, + "self_certainty_token": -22.1875, + "step": 208 + }, + { + "completion_length": 40.3125, + "epoch": 0.23145071982281284, + "grad_norm": 0.9062660336494446, + "kl": 0.02685546875, + "learning_rate": 8.693749999999999e-07, + "loss": 0.006221079733222723, + "reward": 1.8361099362373352, + "reward_std": 0.4263295829296112, + "rewards/GDino": 0.7508396208286285, + "rewards/GIT": 0.3777461498975754, + "rewards/HPSv2": 0.266815185546875, + "rewards/ORM": 0.4407089203596115, + "self_certainty_semantic": -25.25, + "self_certainty_token": -21.5625, + "step": 209 + }, + { + "completion_length": 38.5625, + "epoch": 0.23255813953488372, + "grad_norm": 0.748408317565918, + "kl": 0.026611328125, + "learning_rate": 8.687499999999999e-07, + "loss": -0.002506181481294334, + "reward": 2.4682434797286987, + "reward_std": 0.2377740740776062, + "rewards/GDino": 0.8195167779922485, + "rewards/GIT": 0.46161703020334244, + "rewards/HPSv2": 0.27737998962402344, + "rewards/ORM": 0.9097296893596649, + "self_certainty_semantic": -25.3125, + "self_certainty_token": -22.0625, + "step": 210 + }, + { + "completion_length": 41.796875, + "epoch": 0.2336655592469546, + "grad_norm": 1.765519142150879, + "kl": 0.02984619140625, + "learning_rate": 8.681249999999999e-07, + "loss": -0.0039495378732681274, + "reward": 1.913986623287201, + "reward_std": 0.5612352639436722, + "rewards/GDino": 0.6622395515441895, + "rewards/GIT": 0.43646256625652313, + "rewards/HPSv2": 0.2698841094970703, + "rewards/ORM": 0.545400395989418, + "self_certainty_semantic": -25.3125, + "self_certainty_token": -21.5, + "step": 211 + }, + { + "completion_length": 34.96875, + "epoch": 0.23477297895902546, + "grad_norm": 0.6982265710830688, + "kl": 0.0120849609375, + "learning_rate": 8.675000000000001e-07, + "loss": -0.029174044728279114, + "reward": 1.9864473342895508, + "reward_std": 0.4800384193658829, + "rewards/GDino": 0.7475498020648956, + "rewards/GIT": 0.22028353065252304, + "rewards/HPSv2": 0.2693653106689453, + "rewards/ORM": 0.7492486536502838, + "self_certainty_semantic": -25.3125, + "self_certainty_token": -21.5625, + "step": 212 + }, + { + "completion_length": 43.296875, + "epoch": 0.23588039867109634, + "grad_norm": 0.5722474455833435, + "kl": 0.016571044921875, + "learning_rate": 8.66875e-07, + "loss": -0.0017778393812477589, + "reward": 2.3667664527893066, + "reward_std": 0.40382666885852814, + "rewards/GDino": 0.8726562261581421, + "rewards/GIT": 0.6178667545318604, + "rewards/HPSv2": 0.2806682586669922, + "rewards/ORM": 0.5955753028392792, + "self_certainty_semantic": -25.4375, + "self_certainty_token": -21.5625, + "step": 213 + }, + { + "completion_length": 39.46875, + "epoch": 0.2369878183831672, + "grad_norm": 0.4423932433128357, + "kl": 0.02313232421875, + "learning_rate": 8.6625e-07, + "loss": 0.020447181537747383, + "reward": 1.8110211491584778, + "reward_std": 0.3642263114452362, + "rewards/GDino": 0.6858590543270111, + "rewards/GIT": 0.2922380678355694, + "rewards/HPSv2": 0.2856483459472656, + "rewards/ORM": 0.5472756624221802, + "self_certainty_semantic": -25.1875, + "self_certainty_token": -20.5625, + "step": 214 + }, + { + "completion_length": 35.453125, + "epoch": 0.23809523809523808, + "grad_norm": 0.525648295879364, + "kl": 0.02679443359375, + "learning_rate": 8.65625e-07, + "loss": -0.004891848191618919, + "reward": 2.4582409858703613, + "reward_std": 0.25197018682956696, + "rewards/GDino": 0.8230374455451965, + "rewards/GIT": 0.6004615277051926, + "rewards/HPSv2": 0.262908935546875, + "rewards/ORM": 0.7718330323696136, + "self_certainty_semantic": -24.9375, + "self_certainty_token": -21.125, + "step": 215 + }, + { + "completion_length": 35.09375, + "epoch": 0.23920265780730898, + "grad_norm": 2.2020962238311768, + "kl": 0.02142333984375, + "learning_rate": 8.65e-07, + "loss": 0.009721468668431044, + "reward": 2.365583837032318, + "reward_std": 0.21866833791136742, + "rewards/GDino": 0.8238094449043274, + "rewards/GIT": 0.441422201693058, + "rewards/HPSv2": 0.2851238250732422, + "rewards/ORM": 0.8152283430099487, + "self_certainty_semantic": -25.125, + "self_certainty_token": -21.5, + "step": 216 + }, + { + "completion_length": 54.84375, + "epoch": 0.24031007751937986, + "grad_norm": 0.45819738507270813, + "kl": 0.012054443359375, + "learning_rate": 8.64375e-07, + "loss": -0.02724960818886757, + "reward": 2.1141871213912964, + "reward_std": 0.31379085779190063, + "rewards/GDino": 0.7791666686534882, + "rewards/GIT": 0.5268686562776566, + "rewards/HPSv2": 0.25792694091796875, + "rewards/ORM": 0.5502248406410217, + "self_certainty_semantic": -25.0625, + "self_certainty_token": -21.0625, + "step": 217 + }, + { + "completion_length": 40.4375, + "epoch": 0.24141749723145073, + "grad_norm": 0.5628157258033752, + "kl": 0.0264892578125, + "learning_rate": 8.6375e-07, + "loss": -0.007330425316467881, + "reward": 1.6426080465316772, + "reward_std": 0.24193137884140015, + "rewards/GDino": 0.6079951226711273, + "rewards/GIT": 0.3009786158800125, + "rewards/HPSv2": 0.2601509094238281, + "rewards/ORM": 0.47348344326019287, + "self_certainty_semantic": -25.0625, + "self_certainty_token": -21.125, + "step": 218 + }, + { + "completion_length": 34.625, + "epoch": 0.2425249169435216, + "grad_norm": 0.4862461984157562, + "kl": 0.0218505859375, + "learning_rate": 8.63125e-07, + "loss": 0.014741807244718075, + "reward": 2.004266083240509, + "reward_std": 0.32091251015663147, + "rewards/GDino": 0.6978627145290375, + "rewards/GIT": 0.45149359107017517, + "rewards/HPSv2": 0.27678489685058594, + "rewards/ORM": 0.578125, + "self_certainty_semantic": -25.3125, + "self_certainty_token": -21.4375, + "step": 219 + }, + { + "completion_length": 39.875, + "epoch": 0.24363233665559247, + "grad_norm": 1.1945821046829224, + "kl": 0.011016845703125, + "learning_rate": 8.625e-07, + "loss": 0.02643737755715847, + "reward": 2.016072630882263, + "reward_std": 0.323383167386055, + "rewards/GDino": 0.8033246994018555, + "rewards/GIT": 0.5735934674739838, + "rewards/HPSv2": 0.25634193420410156, + "rewards/ORM": 0.3828125, + "self_certainty_semantic": -25.25, + "self_certainty_token": -21.25, + "step": 220 + }, + { + "completion_length": 35.546875, + "epoch": 0.24473975636766335, + "grad_norm": 0.9685459136962891, + "kl": 0.01617431640625, + "learning_rate": 8.618749999999999e-07, + "loss": -0.01451091282069683, + "reward": 1.7261735200881958, + "reward_std": 0.4147494286298752, + "rewards/GDino": 0.6085260510444641, + "rewards/GIT": 0.2463730275630951, + "rewards/HPSv2": 0.2583179473876953, + "rewards/ORM": 0.6129564046859741, + "self_certainty_semantic": -25.125, + "self_certainty_token": -21.5, + "step": 221 + }, + { + "completion_length": 34.890625, + "epoch": 0.24584717607973422, + "grad_norm": 0.5475857257843018, + "kl": 0.022491455078125, + "learning_rate": 8.612499999999999e-07, + "loss": 0.033416807651519775, + "reward": 1.9957242012023926, + "reward_std": 0.5047579407691956, + "rewards/GDino": 0.658912181854248, + "rewards/GIT": 0.4291614145040512, + "rewards/HPSv2": 0.271759033203125, + "rewards/ORM": 0.6358915567398071, + "self_certainty_semantic": -25.125, + "self_certainty_token": -21.25, + "step": 222 + }, + { + "completion_length": 36.515625, + "epoch": 0.2469545957918051, + "grad_norm": 0.4916452169418335, + "kl": 0.0184326171875, + "learning_rate": 8.606249999999999e-07, + "loss": -0.00917936791665852, + "reward": 2.0419200658798218, + "reward_std": 0.5100755244493484, + "rewards/GDino": 0.729717344045639, + "rewards/GIT": 0.21226511895656586, + "rewards/HPSv2": 0.2703380584716797, + "rewards/ORM": 0.8295995891094208, + "self_certainty_semantic": -25.3125, + "self_certainty_token": -21.75, + "step": 223 + }, + { + "completion_length": 30.46875, + "epoch": 0.24806201550387597, + "grad_norm": 0.5015460252761841, + "kl": 0.0233154296875, + "learning_rate": 8.599999999999999e-07, + "loss": -0.0018524383194744587, + "reward": 2.36031711101532, + "reward_std": 0.19695958495140076, + "rewards/GDino": 0.820055365562439, + "rewards/GIT": 0.45940345525741577, + "rewards/HPSv2": 0.2683582305908203, + "rewards/ORM": 0.8125, + "self_certainty_semantic": -25.0625, + "self_certainty_token": -21.1875, + "step": 224 + }, + { + "completion_length": 41.0, + "epoch": 0.24916943521594684, + "grad_norm": 0.5461787581443787, + "kl": 0.025390625, + "learning_rate": 8.593749999999999e-07, + "loss": -0.009375374997034669, + "reward": 2.1164638996124268, + "reward_std": 0.4978756159543991, + "rewards/GDino": 0.7708333432674408, + "rewards/GIT": 0.5871168673038483, + "rewards/HPSv2": 0.237884521484375, + "rewards/ORM": 0.5206292420625687, + "self_certainty_semantic": -25.0, + "self_certainty_token": -21.1875, + "step": 225 + }, + { + "completion_length": 30.96875, + "epoch": 0.2502768549280177, + "grad_norm": 0.46695247292518616, + "kl": 0.016510009765625, + "learning_rate": 8.587499999999999e-07, + "loss": -0.0015379427932202816, + "reward": 2.3315041065216064, + "reward_std": 0.35161057114601135, + "rewards/GDino": 0.8348958492279053, + "rewards/GIT": 0.41105780750513077, + "rewards/HPSv2": 0.271575927734375, + "rewards/ORM": 0.8139745891094208, + "self_certainty_semantic": -25.25, + "self_certainty_token": -21.4375, + "step": 226 + }, + { + "completion_length": 38.703125, + "epoch": 0.2513842746400886, + "grad_norm": 0.5733257532119751, + "kl": 0.01409912109375, + "learning_rate": 8.581249999999999e-07, + "loss": -0.003186628222465515, + "reward": 1.9469320178031921, + "reward_std": 0.47824153304100037, + "rewards/GDino": 0.7051927149295807, + "rewards/GIT": 0.42935653030872345, + "rewards/HPSv2": 0.2548370361328125, + "rewards/ORM": 0.5575457066297531, + "self_certainty_semantic": -25.0625, + "self_certainty_token": -21.4375, + "step": 227 + }, + { + "completion_length": 43.234375, + "epoch": 0.25249169435215946, + "grad_norm": 0.40495043992996216, + "kl": 0.016357421875, + "learning_rate": 8.575e-07, + "loss": -0.00824080128222704, + "reward": 1.9755643606185913, + "reward_std": 0.3469914644956589, + "rewards/GDino": 0.8501654267311096, + "rewards/GIT": 0.19020378589630127, + "rewards/HPSv2": 0.2816734313964844, + "rewards/ORM": 0.6535216271877289, + "self_certainty_semantic": -25.25, + "self_certainty_token": -21.5, + "step": 228 + }, + { + "completion_length": 40.453125, + "epoch": 0.25359911406423036, + "grad_norm": 0.6345773935317993, + "kl": 0.019287109375, + "learning_rate": 8.568750000000001e-07, + "loss": 0.0025403383187949657, + "reward": 2.108458161354065, + "reward_std": 0.28643767535686493, + "rewards/GDino": 0.6715624928474426, + "rewards/GIT": 0.3562132343649864, + "rewards/HPSv2": 0.2619743347167969, + "rewards/ORM": 0.818708062171936, + "self_certainty_semantic": -25.1875, + "self_certainty_token": -21.0625, + "step": 229 + }, + { + "completion_length": 38.8125, + "epoch": 0.2547065337763012, + "grad_norm": 0.5043112635612488, + "kl": 0.01556396484375, + "learning_rate": 8.5625e-07, + "loss": 0.009474070742726326, + "reward": 2.0315651893615723, + "reward_std": 0.4527067393064499, + "rewards/GDino": 0.7280016541481018, + "rewards/GIT": 0.34433120489120483, + "rewards/HPSv2": 0.2717323303222656, + "rewards/ORM": 0.6875, + "self_certainty_semantic": -25.1875, + "self_certainty_token": -22.0625, + "step": 230 + }, + { + "completion_length": 39.75, + "epoch": 0.2558139534883721, + "grad_norm": 0.7560561299324036, + "kl": 0.0228271484375, + "learning_rate": 8.55625e-07, + "loss": 0.03068174561485648, + "reward": 1.8725414276123047, + "reward_std": 0.37937965989112854, + "rewards/GDino": 0.7148861289024353, + "rewards/GIT": 0.35717466473579407, + "rewards/HPSv2": 0.258209228515625, + "rewards/ORM": 0.5422714352607727, + "self_certainty_semantic": -24.9375, + "self_certainty_token": -21.0625, + "step": 231 + }, + { + "completion_length": 39.296875, + "epoch": 0.25692137320044295, + "grad_norm": 1.4232704639434814, + "kl": 0.013641357421875, + "learning_rate": 8.55e-07, + "loss": 0.007876838091760874, + "reward": 1.9952596426010132, + "reward_std": 0.4007260948419571, + "rewards/GDino": 0.7144716680049896, + "rewards/GIT": 0.4604896456003189, + "rewards/HPSv2": 0.25359535217285156, + "rewards/ORM": 0.5667029619216919, + "self_certainty_semantic": -25.125, + "self_certainty_token": -21.0, + "step": 232 + }, + { + "completion_length": 52.671875, + "epoch": 0.25802879291251385, + "grad_norm": 0.553196907043457, + "kl": 0.01513671875, + "learning_rate": 8.54375e-07, + "loss": -0.002937940414994955, + "reward": 2.1501262187957764, + "reward_std": 0.3424035310745239, + "rewards/GDino": 0.7523437142372131, + "rewards/GIT": 0.3163189962506294, + "rewards/HPSv2": 0.2654838562011719, + "rewards/ORM": 0.8159796893596649, + "self_certainty_semantic": -25.25, + "self_certainty_token": -21.4375, + "step": 233 + }, + { + "completion_length": 35.40625, + "epoch": 0.2591362126245847, + "grad_norm": 4.139526844024658, + "kl": 0.033203125, + "learning_rate": 8.5375e-07, + "loss": 0.007489941082894802, + "reward": 2.0956884622573853, + "reward_std": 0.25674766674637794, + "rewards/GDino": 0.8046875, + "rewards/GIT": 0.30966047942638397, + "rewards/HPSv2": 0.2782154083251953, + "rewards/ORM": 0.703125, + "self_certainty_semantic": -25.1875, + "self_certainty_token": -20.9375, + "step": 234 + }, + { + "completion_length": 39.875, + "epoch": 0.2602436323366556, + "grad_norm": 0.509060263633728, + "kl": 0.0189208984375, + "learning_rate": 8.53125e-07, + "loss": 0.004633883247151971, + "reward": 1.9408237934112549, + "reward_std": 0.4254070520401001, + "rewards/GDino": 0.7614295780658722, + "rewards/GIT": 0.3818518742918968, + "rewards/HPSv2": 0.27335357666015625, + "rewards/ORM": 0.5241888463497162, + "self_certainty_semantic": -25.125, + "self_certainty_token": -21.625, + "step": 235 + }, + { + "completion_length": 36.375, + "epoch": 0.26135105204872644, + "grad_norm": 0.4405996799468994, + "kl": 0.0157470703125, + "learning_rate": 8.525e-07, + "loss": -0.010419015074148774, + "reward": 1.502617061138153, + "reward_std": 0.4193390905857086, + "rewards/GDino": 0.7078101933002472, + "rewards/GIT": 0.2298043891787529, + "rewards/HPSv2": 0.25250244140625, + "rewards/ORM": 0.3125000149011612, + "self_certainty_semantic": -25.125, + "self_certainty_token": -21.4375, + "step": 236 + }, + { + "completion_length": 39.0, + "epoch": 0.26245847176079734, + "grad_norm": 0.505985677242279, + "kl": 0.01690673828125, + "learning_rate": 8.51875e-07, + "loss": -0.016914513893425465, + "reward": 2.266430974006653, + "reward_std": 0.4797457307577133, + "rewards/GDino": 0.7984375059604645, + "rewards/GIT": 0.49608829617500305, + "rewards/HPSv2": 0.27498817443847656, + "rewards/ORM": 0.6969169676303864, + "self_certainty_semantic": -25.125, + "self_certainty_token": -21.625, + "step": 237 + }, + { + "completion_length": 35.3125, + "epoch": 0.26356589147286824, + "grad_norm": 0.484314888715744, + "kl": 0.0130615234375, + "learning_rate": 8.512499999999999e-07, + "loss": 0.030976343899965286, + "reward": 1.991338849067688, + "reward_std": 0.2668539509177208, + "rewards/GDino": 0.8419585227966309, + "rewards/GIT": 0.44607045501470566, + "rewards/HPSv2": 0.26688385009765625, + "rewards/ORM": 0.436426043510437, + "self_certainty_semantic": -25.3125, + "self_certainty_token": -20.125, + "step": 238 + }, + { + "completion_length": 44.828125, + "epoch": 0.2646733111849391, + "grad_norm": 0.7890397310256958, + "kl": 0.01763916015625, + "learning_rate": 8.506249999999999e-07, + "loss": -0.0025457546580582857, + "reward": 2.0841389894485474, + "reward_std": 0.19053151458501816, + "rewards/GDino": 0.6302083730697632, + "rewards/GIT": 0.5595559030771255, + "rewards/HPSv2": 0.2693748474121094, + "rewards/ORM": 0.625, + "self_certainty_semantic": -25.125, + "self_certainty_token": -20.4375, + "step": 239 + }, + { + "completion_length": 44.859375, + "epoch": 0.26578073089701, + "grad_norm": 0.8127689361572266, + "kl": 0.022491455078125, + "learning_rate": 8.499999999999999e-07, + "loss": 0.0033242302015423775, + "reward": 2.0860930681228638, + "reward_std": 0.22588412836194038, + "rewards/GDino": 0.7918341755867004, + "rewards/GIT": 0.4664677008986473, + "rewards/HPSv2": 0.2652912139892578, + "rewards/ORM": 0.5625, + "self_certainty_semantic": -25.25, + "self_certainty_token": -21.375, + "step": 240 + }, + { + "completion_length": 38.6875, + "epoch": 0.26688815060908083, + "grad_norm": 0.7459165453910828, + "kl": 0.0216064453125, + "learning_rate": 8.493749999999999e-07, + "loss": -0.018312662839889526, + "reward": 2.1917264461517334, + "reward_std": 0.19818688184022903, + "rewards/GDino": 0.765625, + "rewards/GIT": 0.5940662920475006, + "rewards/HPSv2": 0.2635478973388672, + "rewards/ORM": 0.5684872269630432, + "self_certainty_semantic": -25.125, + "self_certainty_token": -22.0625, + "step": 241 + }, + { + "completion_length": 40.1875, + "epoch": 0.26799557032115173, + "grad_norm": 0.528344988822937, + "kl": 0.013427734375, + "learning_rate": 8.487499999999999e-07, + "loss": 0.0015740576200187206, + "reward": 1.820701777935028, + "reward_std": 0.2884362041950226, + "rewards/GDino": 0.7261866927146912, + "rewards/GIT": 0.32748638093471527, + "rewards/HPSv2": 0.2674293518066406, + "rewards/ORM": 0.4995993375778198, + "self_certainty_semantic": -25.125, + "self_certainty_token": -21.75, + "step": 242 + }, + { + "completion_length": 35.640625, + "epoch": 0.2691029900332226, + "grad_norm": 0.6940242648124695, + "kl": 0.0177001953125, + "learning_rate": 8.481249999999999e-07, + "loss": -0.0034725347068160772, + "reward": 1.5581411123275757, + "reward_std": 0.4635423719882965, + "rewards/GDino": 0.6132907867431641, + "rewards/GIT": 0.22624823451042175, + "rewards/HPSv2": 0.2540550231933594, + "rewards/ORM": 0.4645470678806305, + "self_certainty_semantic": -25.125, + "self_certainty_token": -20.625, + "step": 243 + }, + { + "completion_length": 41.46875, + "epoch": 0.2702104097452935, + "grad_norm": 0.5053713321685791, + "kl": 0.0189208984375, + "learning_rate": 8.475e-07, + "loss": -0.0003409823402762413, + "reward": 1.8292673826217651, + "reward_std": 0.3081812709569931, + "rewards/GDino": 0.7110464870929718, + "rewards/GIT": 0.18649065494537354, + "rewards/HPSv2": 0.2602558135986328, + "rewards/ORM": 0.6714743673801422, + "self_certainty_semantic": -24.875, + "self_certainty_token": -21.25, + "step": 244 + }, + { + "completion_length": 36.484375, + "epoch": 0.2713178294573643, + "grad_norm": 0.6690962910652161, + "kl": 0.0186767578125, + "learning_rate": 8.46875e-07, + "loss": -0.020096718333661556, + "reward": 1.7160680294036865, + "reward_std": 0.4093341752886772, + "rewards/GDino": 0.7332078814506531, + "rewards/GIT": 0.2718503773212433, + "rewards/HPSv2": 0.26757240295410156, + "rewards/ORM": 0.44343726336956024, + "self_certainty_semantic": -25.25, + "self_certainty_token": -21.9375, + "step": 245 + }, + { + "completion_length": 42.96875, + "epoch": 0.2724252491694352, + "grad_norm": 0.7113038897514343, + "kl": 0.0272216796875, + "learning_rate": 8.462499999999999e-07, + "loss": 0.021475711837410927, + "reward": 2.2206690311431885, + "reward_std": 0.2255793958902359, + "rewards/GDino": 0.8505983948707581, + "rewards/GIT": 0.5832930952310562, + "rewards/HPSv2": 0.24771499633789062, + "rewards/ORM": 0.5390624701976776, + "self_certainty_semantic": -25.25, + "self_certainty_token": -21.0625, + "step": 246 + }, + { + "completion_length": 39.3125, + "epoch": 0.27353266888150607, + "grad_norm": 1.8593120574951172, + "kl": 0.0225830078125, + "learning_rate": 8.45625e-07, + "loss": -0.00028415233828127384, + "reward": 2.0454858541488647, + "reward_std": 0.3580029234290123, + "rewards/GDino": 0.7631928622722626, + "rewards/GIT": 0.3862551599740982, + "rewards/HPSv2": 0.2628746032714844, + "rewards/ORM": 0.6331632137298584, + "self_certainty_semantic": -25.0, + "self_certainty_token": -20.9375, + "step": 247 + }, + { + "completion_length": 36.125, + "epoch": 0.27464008859357697, + "grad_norm": 0.592446506023407, + "kl": 0.02227783203125, + "learning_rate": 8.45e-07, + "loss": 0.02284400351345539, + "reward": 1.5269662737846375, + "reward_std": 0.4807903617620468, + "rewards/GDino": 0.6762161254882812, + "rewards/GIT": 0.23257604241371155, + "rewards/HPSv2": 0.28420257568359375, + "rewards/ORM": 0.3339715301990509, + "self_certainty_semantic": -25.0625, + "self_certainty_token": -22.3125, + "step": 248 + }, + { + "completion_length": 38.28125, + "epoch": 0.2757475083056478, + "grad_norm": 0.6646723747253418, + "kl": 0.01947021484375, + "learning_rate": 8.44375e-07, + "loss": -0.0015255426988005638, + "reward": 2.369264245033264, + "reward_std": 0.45240701735019684, + "rewards/GDino": 0.803423672914505, + "rewards/GIT": 0.6715860366821289, + "rewards/HPSv2": 0.27072906494140625, + "rewards/ORM": 0.6235254108905792, + "self_certainty_semantic": -25.0625, + "self_certainty_token": -21.5, + "step": 249 + }, + { + "completion_length": 39.859375, + "epoch": 0.2768549280177187, + "grad_norm": 0.5067113637924194, + "kl": 0.02447509765625, + "learning_rate": 8.4375e-07, + "loss": 0.010950385592877865, + "reward": 2.4666439294815063, + "reward_std": 0.2641718164086342, + "rewards/GDino": 0.8098958134651184, + "rewards/GIT": 0.8221855163574219, + "rewards/HPSv2": 0.2626457214355469, + "rewards/ORM": 0.571916937828064, + "self_certainty_semantic": -25.25, + "self_certainty_token": -21.1875, + "step": 250 + }, + { + "completion_length": 48.34375, + "epoch": 0.2779623477297896, + "grad_norm": 0.5123341679573059, + "kl": 0.014495849609375, + "learning_rate": 8.43125e-07, + "loss": -0.007521981373429298, + "reward": 2.471664547920227, + "reward_std": 0.3560677170753479, + "rewards/GDino": 0.870312511920929, + "rewards/GIT": 0.761746883392334, + "rewards/HPSv2": 0.2522430419921875, + "rewards/ORM": 0.587362140417099, + "self_certainty_semantic": -25.3125, + "self_certainty_token": -20.8125, + "step": 251 + }, + { + "completion_length": 39.125, + "epoch": 0.27906976744186046, + "grad_norm": 0.5467274785041809, + "kl": 0.0211181640625, + "learning_rate": 8.425e-07, + "loss": 0.0022668561432510614, + "reward": 1.6477148532867432, + "reward_std": 0.5759541094303131, + "rewards/GDino": 0.6465478837490082, + "rewards/GIT": 0.18030283600091934, + "rewards/HPSv2": 0.2717132568359375, + "rewards/ORM": 0.549150824546814, + "self_certainty_semantic": -25.125, + "self_certainty_token": -21.25, + "step": 252 + }, + { + "completion_length": 29.4375, + "epoch": 0.28017718715393136, + "grad_norm": 0.4424716532230377, + "kl": 0.01275634765625, + "learning_rate": 8.41875e-07, + "loss": 0.00682451156899333, + "reward": 2.520264148712158, + "reward_std": 0.2268836908042431, + "rewards/GDino": 0.8885416686534882, + "rewards/GIT": 0.6353528201580048, + "rewards/HPSv2": 0.2576618194580078, + "rewards/ORM": 0.7387077957391739, + "self_certainty_semantic": -25.5, + "self_certainty_token": -21.375, + "step": 253 + }, + { + "completion_length": 33.453125, + "epoch": 0.2812846068660022, + "grad_norm": 1.276292324066162, + "kl": 0.0240478515625, + "learning_rate": 8.4125e-07, + "loss": 0.021866907365620136, + "reward": 2.1942864656448364, + "reward_std": 0.2500537782907486, + "rewards/GDino": 0.7828124761581421, + "rewards/GIT": 0.6722497642040253, + "rewards/HPSv2": 0.2669944763183594, + "rewards/ORM": 0.4722296893596649, + "self_certainty_semantic": -25.125, + "self_certainty_token": -21.6875, + "step": 254 + }, + { + "completion_length": 43.296875, + "epoch": 0.2823920265780731, + "grad_norm": 0.6294586658477783, + "kl": 0.02001953125, + "learning_rate": 8.406249999999999e-07, + "loss": -0.013118594884872437, + "reward": 2.0315213203430176, + "reward_std": 0.26577161997556686, + "rewards/GDino": 0.6790449321269989, + "rewards/GIT": 0.3302696421742439, + "rewards/HPSv2": 0.2811431884765625, + "rewards/ORM": 0.7410635948181152, + "self_certainty_semantic": -25.125, + "self_certainty_token": -22.0625, + "step": 255 + }, + { + "completion_length": 38.234375, + "epoch": 0.28349944629014395, + "grad_norm": 0.47897863388061523, + "kl": 0.02569580078125, + "learning_rate": 8.399999999999999e-07, + "loss": 0.007225595414638519, + "reward": 1.8496492505073547, + "reward_std": 0.4634602516889572, + "rewards/GDino": 0.6612989604473114, + "rewards/GIT": 0.3624468967318535, + "rewards/HPSv2": 0.2617988586425781, + "rewards/ORM": 0.564104437828064, + "self_certainty_semantic": -24.875, + "self_certainty_token": -21.125, + "step": 256 + }, + { + "completion_length": 34.90625, + "epoch": 0.28460686600221485, + "grad_norm": 1.0091742277145386, + "kl": 0.011627197265625, + "learning_rate": 8.393749999999999e-07, + "loss": -0.003027831669896841, + "reward": 1.818339228630066, + "reward_std": 0.34789541363716125, + "rewards/GDino": 0.68548583984375, + "rewards/GIT": 0.3273261487483978, + "rewards/HPSv2": 0.26115989685058594, + "rewards/ORM": 0.5443674623966217, + "self_certainty_semantic": -25.25, + "self_certainty_token": -21.875, + "step": 257 + }, + { + "completion_length": 42.328125, + "epoch": 0.2857142857142857, + "grad_norm": 0.9769588708877563, + "kl": 0.01727294921875, + "learning_rate": 8.387499999999999e-07, + "loss": -0.027014064602553844, + "reward": 1.6609314680099487, + "reward_std": 0.5478866696357727, + "rewards/GDino": 0.5854167342185974, + "rewards/GIT": 0.18769004940986633, + "rewards/HPSv2": 0.27184104919433594, + "rewards/ORM": 0.6159836798906326, + "self_certainty_semantic": -25.25, + "self_certainty_token": -21.0625, + "step": 258 + }, + { + "completion_length": 34.203125, + "epoch": 0.2868217054263566, + "grad_norm": 0.4485263526439667, + "kl": 0.019287109375, + "learning_rate": 8.38125e-07, + "loss": 0.018385295988991857, + "reward": 2.5785293579101562, + "reward_std": 0.17483927309513092, + "rewards/GDino": 0.9424871504306793, + "rewards/GIT": 0.7919199466705322, + "rewards/HPSv2": 0.2565803527832031, + "rewards/ORM": 0.5875419676303864, + "self_certainty_semantic": -25.3125, + "self_certainty_token": -21.1875, + "step": 259 + }, + { + "completion_length": 37.203125, + "epoch": 0.28792912513842744, + "grad_norm": 0.449808806180954, + "kl": 0.0335693359375, + "learning_rate": 8.375e-07, + "loss": 0.014663147740066051, + "reward": 1.8461943864822388, + "reward_std": 0.5021175146102905, + "rewards/GDino": 0.6706274747848511, + "rewards/GIT": 0.3253522589802742, + "rewards/HPSv2": 0.27150726318359375, + "rewards/ORM": 0.5787073373794556, + "self_certainty_semantic": -25.0625, + "self_certainty_token": -22.3125, + "step": 260 + }, + { + "completion_length": 43.40625, + "epoch": 0.28903654485049834, + "grad_norm": 0.6159392595291138, + "kl": 0.02618408203125, + "learning_rate": 8.36875e-07, + "loss": 0.006308391690254211, + "reward": 2.2187058329582214, + "reward_std": 0.3276343122124672, + "rewards/GDino": 0.7517669200897217, + "rewards/GIT": 0.5226505398750305, + "rewards/HPSv2": 0.2646007537841797, + "rewards/ORM": 0.6796875, + "self_certainty_semantic": -25.1875, + "self_certainty_token": -21.3125, + "step": 261 + }, + { + "completion_length": 40.84375, + "epoch": 0.2901439645625692, + "grad_norm": 0.4515726864337921, + "kl": 0.0157470703125, + "learning_rate": 8.3625e-07, + "loss": -0.025780703872442245, + "reward": 2.513529419898987, + "reward_std": 0.377413272857666, + "rewards/GDino": 0.8145833611488342, + "rewards/GIT": 0.6835201978683472, + "rewards/HPSv2": 0.2530097961425781, + "rewards/ORM": 0.7624161243438721, + "self_certainty_semantic": -25.375, + "self_certainty_token": -22.0, + "step": 262 + }, + { + "completion_length": 47.8125, + "epoch": 0.2912513842746401, + "grad_norm": 0.6491768956184387, + "kl": 0.01971435546875, + "learning_rate": 8.356249999999999e-07, + "loss": -0.010318214073777199, + "reward": 2.480243444442749, + "reward_std": 0.38080593943595886, + "rewards/GDino": 0.8481250107288361, + "rewards/GIT": 0.49081389605998993, + "rewards/HPSv2": 0.25723838806152344, + "rewards/ORM": 0.884066253900528, + "self_certainty_semantic": -25.3125, + "self_certainty_token": -21.75, + "step": 263 + }, + { + "completion_length": 42.140625, + "epoch": 0.292358803986711, + "grad_norm": 0.635397732257843, + "kl": 0.027191162109375, + "learning_rate": 8.349999999999999e-07, + "loss": -0.010353719699196517, + "reward": 2.442339539527893, + "reward_std": 0.32322730123996735, + "rewards/GDino": 0.8117945790290833, + "rewards/GIT": 0.6421481668949127, + "rewards/HPSv2": 0.2681722640991211, + "rewards/ORM": 0.7202245593070984, + "self_certainty_semantic": -25.375, + "self_certainty_token": -20.5625, + "step": 264 + }, + { + "completion_length": 43.703125, + "epoch": 0.29346622369878184, + "grad_norm": 0.4443694055080414, + "kl": 0.0185546875, + "learning_rate": 8.34375e-07, + "loss": 0.02428914955817163, + "reward": 2.2206848859786987, + "reward_std": 0.4008527547121048, + "rewards/GDino": 0.8173159956932068, + "rewards/GIT": 0.43181733787059784, + "rewards/HPSv2": 0.27811431884765625, + "rewards/ORM": 0.693437248468399, + "self_certainty_semantic": -25.25, + "self_certainty_token": -21.0625, + "step": 265 + }, + { + "completion_length": 43.875, + "epoch": 0.29457364341085274, + "grad_norm": 0.5283258557319641, + "kl": 0.015289306640625, + "learning_rate": 8.3375e-07, + "loss": 0.019315090496093035, + "reward": 1.8423295617103577, + "reward_std": 0.337589330971241, + "rewards/GDino": 0.6214767098426819, + "rewards/GIT": 0.3616393804550171, + "rewards/HPSv2": 0.27019691467285156, + "rewards/ORM": 0.5890165567398071, + "self_certainty_semantic": -25.1875, + "self_certainty_token": -21.0, + "step": 266 + }, + { + "completion_length": 42.578125, + "epoch": 0.2956810631229236, + "grad_norm": 0.5958583950996399, + "kl": 0.02301025390625, + "learning_rate": 8.33125e-07, + "loss": -0.010422109626233578, + "reward": 2.478596568107605, + "reward_std": 0.2979115843772888, + "rewards/GDino": 0.8439387381076813, + "rewards/GIT": 0.5261026620864868, + "rewards/HPSv2": 0.28636741638183594, + "rewards/ORM": 0.8221877217292786, + "self_certainty_semantic": -25.375, + "self_certainty_token": -21.9375, + "step": 267 + }, + { + "completion_length": 38.8125, + "epoch": 0.2967884828349945, + "grad_norm": 0.5406501889228821, + "kl": 0.0189208984375, + "learning_rate": 8.325e-07, + "loss": -0.011615738272666931, + "reward": 2.1584006547927856, + "reward_std": 0.3037988841533661, + "rewards/GDino": 0.7368161380290985, + "rewards/GIT": 0.3939068764448166, + "rewards/HPSv2": 0.2691917419433594, + "rewards/ORM": 0.7584857940673828, + "self_certainty_semantic": -25.0625, + "self_certainty_token": -21.4375, + "step": 268 + }, + { + "completion_length": 40.375, + "epoch": 0.2978959025470653, + "grad_norm": 0.5237568616867065, + "kl": 0.01934814453125, + "learning_rate": 8.31875e-07, + "loss": -0.022221547085791826, + "reward": 2.410371422767639, + "reward_std": 0.32327765971422195, + "rewards/GDino": 0.8763707876205444, + "rewards/GIT": 0.6529507040977478, + "rewards/HPSv2": 0.25867652893066406, + "rewards/ORM": 0.622373417019844, + "self_certainty_semantic": -25.125, + "self_certainty_token": -21.0625, + "step": 269 + }, + { + "completion_length": 42.40625, + "epoch": 0.29900332225913623, + "grad_norm": 0.5596421957015991, + "kl": 0.02099609375, + "learning_rate": 8.3125e-07, + "loss": 0.0010367396753281355, + "reward": 1.9419864416122437, + "reward_std": 0.3841004818677902, + "rewards/GDino": 0.7421875298023224, + "rewards/GIT": 0.40068522095680237, + "rewards/HPSv2": 0.26871681213378906, + "rewards/ORM": 0.5303968787193298, + "self_certainty_semantic": -25.0, + "self_certainty_token": -21.625, + "step": 270 + }, + { + "completion_length": 36.515625, + "epoch": 0.3001107419712071, + "grad_norm": 0.4112670421600342, + "kl": 0.02191162109375, + "learning_rate": 8.306249999999999e-07, + "loss": 0.008409947156906128, + "reward": 1.5938773155212402, + "reward_std": 0.4974990785121918, + "rewards/GDino": 0.7035039365291595, + "rewards/GIT": 0.2642563134431839, + "rewards/HPSv2": 0.24611282348632812, + "rewards/ORM": 0.3800041973590851, + "self_certainty_semantic": -24.875, + "self_certainty_token": -22.3125, + "step": 271 + }, + { + "completion_length": 42.453125, + "epoch": 0.301218161683278, + "grad_norm": 1.410166621208191, + "kl": 0.018798828125, + "learning_rate": 8.299999999999999e-07, + "loss": -0.024008065927773714, + "reward": 1.7751591801643372, + "reward_std": 0.4845600575208664, + "rewards/GDino": 0.6045292317867279, + "rewards/GIT": 0.2201373279094696, + "rewards/HPSv2": 0.27045440673828125, + "rewards/ORM": 0.6800382286310196, + "self_certainty_semantic": -25.0625, + "self_certainty_token": -21.375, + "step": 272 + }, + { + "completion_length": 34.78125, + "epoch": 0.3023255813953488, + "grad_norm": 1.887034296989441, + "kl": 0.01806640625, + "learning_rate": 8.293749999999999e-07, + "loss": 0.00430523045361042, + "reward": 1.7456142902374268, + "reward_std": 0.3797690123319626, + "rewards/GDino": 0.658672958612442, + "rewards/GIT": 0.3142998740077019, + "rewards/HPSv2": 0.2761211395263672, + "rewards/ORM": 0.4965203106403351, + "self_certainty_semantic": -25.0625, + "self_certainty_token": -22.0625, + "step": 273 + }, + { + "completion_length": 35.328125, + "epoch": 0.3034330011074197, + "grad_norm": 0.9875066876411438, + "kl": 0.02935791015625, + "learning_rate": 8.287499999999999e-07, + "loss": 0.013776570558547974, + "reward": 2.0887337923049927, + "reward_std": 0.39243339002132416, + "rewards/GDino": 0.7666075229644775, + "rewards/GIT": 0.5064676105976105, + "rewards/HPSv2": 0.25235748291015625, + "rewards/ORM": 0.5633012652397156, + "self_certainty_semantic": -25.125, + "self_certainty_token": -21.125, + "step": 274 + }, + { + "completion_length": 42.015625, + "epoch": 0.30454042081949056, + "grad_norm": 0.5050070285797119, + "kl": 0.019317626953125, + "learning_rate": 8.28125e-07, + "loss": 0.01794680766761303, + "reward": 2.768193483352661, + "reward_std": 0.20595969259738922, + "rewards/GDino": 0.8856770694255829, + "rewards/GIT": 0.7785017788410187, + "rewards/HPSv2": 0.2793693542480469, + "rewards/ORM": 0.8246453106403351, + "self_certainty_semantic": -25.25, + "self_certainty_token": -21.875, + "step": 275 + }, + { + "completion_length": 36.234375, + "epoch": 0.30564784053156147, + "grad_norm": 0.8548435568809509, + "kl": 0.015167236328125, + "learning_rate": 8.275e-07, + "loss": -0.001379463355988264, + "reward": 2.29227876663208, + "reward_std": 0.3256339356303215, + "rewards/GDino": 0.7592664659023285, + "rewards/GIT": 0.4296427518129349, + "rewards/HPSv2": 0.27176475524902344, + "rewards/ORM": 0.8316046893596649, + "self_certainty_semantic": -25.125, + "self_certainty_token": -22.0625, + "step": 276 + }, + { + "completion_length": 38.875, + "epoch": 0.3067552602436323, + "grad_norm": 0.49069884419441223, + "kl": 0.02545166015625, + "learning_rate": 8.26875e-07, + "loss": 0.003206422086805105, + "reward": 2.004135310649872, + "reward_std": 0.4736744314432144, + "rewards/GDino": 0.6578125059604645, + "rewards/GIT": 0.5337191522121429, + "rewards/HPSv2": 0.2700614929199219, + "rewards/ORM": 0.5425421595573425, + "self_certainty_semantic": -25.125, + "self_certainty_token": -21.75, + "step": 277 + }, + { + "completion_length": 33.703125, + "epoch": 0.3078626799557032, + "grad_norm": 0.5431483387947083, + "kl": 0.029052734375, + "learning_rate": 8.2625e-07, + "loss": -0.01096310350112617, + "reward": 2.36252760887146, + "reward_std": 0.3281755894422531, + "rewards/GDino": 0.8864299952983856, + "rewards/GIT": 0.6353955864906311, + "rewards/HPSv2": 0.2625770568847656, + "rewards/ORM": 0.578125, + "self_certainty_semantic": -25.125, + "self_certainty_token": -20.875, + "step": 278 + }, + { + "completion_length": 40.84375, + "epoch": 0.3089700996677741, + "grad_norm": 0.6272196173667908, + "kl": 0.017791748046875, + "learning_rate": 8.25625e-07, + "loss": 0.005458365194499493, + "reward": 2.53033185005188, + "reward_std": 0.3404112756252289, + "rewards/GDino": 0.8331500887870789, + "rewards/GIT": 0.7181890606880188, + "rewards/HPSv2": 0.2650260925292969, + "rewards/ORM": 0.7139666378498077, + "self_certainty_semantic": -25.0625, + "self_certainty_token": -21.25, + "step": 279 + }, + { + "completion_length": 29.625, + "epoch": 0.31007751937984496, + "grad_norm": 0.4168836176395416, + "kl": 0.013580322265625, + "learning_rate": 8.249999999999999e-07, + "loss": -0.020269228611141443, + "reward": 2.1206799745559692, + "reward_std": 0.36074501276016235, + "rewards/GDino": 0.7140624821186066, + "rewards/GIT": 0.5316184759140015, + "rewards/HPSv2": 0.24579620361328125, + "rewards/ORM": 0.6292029619216919, + "self_certainty_semantic": -25.0, + "self_certainty_token": -21.9375, + "step": 280 + }, + { + "completion_length": 31.390625, + "epoch": 0.31118493909191586, + "grad_norm": 0.4693773090839386, + "kl": 0.02471923828125, + "learning_rate": 8.243749999999999e-07, + "loss": 0.006131879985332489, + "reward": 2.087433636188507, + "reward_std": 0.34090328216552734, + "rewards/GDino": 0.7668008506298065, + "rewards/GIT": 0.41047535091638565, + "rewards/HPSv2": 0.28122520446777344, + "rewards/ORM": 0.6289321780204773, + "self_certainty_semantic": -25.1875, + "self_certainty_token": -20.875, + "step": 281 + }, + { + "completion_length": 48.78125, + "epoch": 0.3122923588039867, + "grad_norm": 1.8625261783599854, + "kl": 0.02197265625, + "learning_rate": 8.2375e-07, + "loss": 0.011913509108126163, + "reward": 2.104038119316101, + "reward_std": 0.27633778750896454, + "rewards/GDino": 0.7975816428661346, + "rewards/GIT": 0.6180287599563599, + "rewards/HPSv2": 0.26529884338378906, + "rewards/ORM": 0.42312875390052795, + "self_certainty_semantic": -25.3125, + "self_certainty_token": -20.875, + "step": 282 + }, + { + "completion_length": 37.53125, + "epoch": 0.3133997785160576, + "grad_norm": 0.5229981541633606, + "kl": 0.014862060546875, + "learning_rate": 8.23125e-07, + "loss": 0.020383010618388653, + "reward": 2.0289142727851868, + "reward_std": 0.30830617249011993, + "rewards/GDino": 0.7265625298023224, + "rewards/GIT": 0.4469078704714775, + "rewards/HPSv2": 0.24579811096191406, + "rewards/ORM": 0.6096457839012146, + "self_certainty_semantic": -25.1875, + "self_certainty_token": -21.25, + "step": 283 + }, + { + "completion_length": 39.609375, + "epoch": 0.31450719822812845, + "grad_norm": 0.4969702661037445, + "kl": 0.02374267578125, + "learning_rate": 8.225e-07, + "loss": -0.000512864557094872, + "reward": 1.7745079398155212, + "reward_std": 0.2966308742761612, + "rewards/GDino": 0.717956930398941, + "rewards/GIT": 0.34320028126239777, + "rewards/HPSv2": 0.2720203399658203, + "rewards/ORM": 0.4413303881883621, + "self_certainty_semantic": -25.125, + "self_certainty_token": -21.8125, + "step": 284 + }, + { + "completion_length": 44.375, + "epoch": 0.31561461794019935, + "grad_norm": 0.7119840383529663, + "kl": 0.0361328125, + "learning_rate": 8.21875e-07, + "loss": 0.01481186505407095, + "reward": 1.5489041805267334, + "reward_std": 0.393330454826355, + "rewards/GDino": 0.6588811278343201, + "rewards/GIT": 0.18679189682006836, + "rewards/HPSv2": 0.2698822021484375, + "rewards/ORM": 0.4333488643169403, + "self_certainty_semantic": -25.1875, + "self_certainty_token": -21.5625, + "step": 285 + }, + { + "completion_length": 37.34375, + "epoch": 0.3167220376522702, + "grad_norm": 0.5771685242652893, + "kl": 0.01953125, + "learning_rate": 8.2125e-07, + "loss": -0.01632743887603283, + "reward": 2.1853543519973755, + "reward_std": 0.4599858373403549, + "rewards/GDino": 0.7937499582767487, + "rewards/GIT": 0.4548158571124077, + "rewards/HPSv2": 0.27312660217285156, + "rewards/ORM": 0.6636618673801422, + "self_certainty_semantic": -25.0, + "self_certainty_token": -21.5, + "step": 286 + }, + { + "completion_length": 32.453125, + "epoch": 0.3178294573643411, + "grad_norm": 0.4807415306568146, + "kl": 0.0235595703125, + "learning_rate": 8.20625e-07, + "loss": 0.009378439281135798, + "reward": 2.3349956274032593, + "reward_std": 0.3700142502784729, + "rewards/GDino": 0.7700349688529968, + "rewards/GIT": 0.5914836376905441, + "rewards/HPSv2": 0.2726278305053711, + "rewards/ORM": 0.7008491158485413, + "self_certainty_semantic": -25.125, + "self_certainty_token": -21.375, + "step": 287 + }, + { + "completion_length": 35.734375, + "epoch": 0.31893687707641194, + "grad_norm": 0.5664615035057068, + "kl": 0.0255126953125, + "learning_rate": 8.199999999999999e-07, + "loss": 0.0027993861585855484, + "reward": 2.357239305973053, + "reward_std": 0.34706495702266693, + "rewards/GDino": 0.7578125, + "rewards/GIT": 0.4684627875685692, + "rewards/HPSv2": 0.25521278381347656, + "rewards/ORM": 0.8757513463497162, + "self_certainty_semantic": -25.1875, + "self_certainty_token": -21.625, + "step": 288 + }, + { + "completion_length": 39.1875, + "epoch": 0.32004429678848284, + "grad_norm": 0.8079530000686646, + "kl": 0.01953125, + "learning_rate": 8.193749999999999e-07, + "loss": -0.018345186486840248, + "reward": 2.2459497451782227, + "reward_std": 0.36141130328178406, + "rewards/GDino": 0.7167181968688965, + "rewards/GIT": 0.463926300406456, + "rewards/HPSv2": 0.2644481658935547, + "rewards/ORM": 0.8008571267127991, + "self_certainty_semantic": -25.125, + "self_certainty_token": -20.875, + "step": 289 + }, + { + "completion_length": 49.59375, + "epoch": 0.3211517165005537, + "grad_norm": 0.6423276662826538, + "kl": 0.02008056640625, + "learning_rate": 8.187499999999999e-07, + "loss": 0.004360657767392695, + "reward": 2.068402886390686, + "reward_std": 0.3118213042616844, + "rewards/GDino": 0.6897880434989929, + "rewards/GIT": 0.3221408724784851, + "rewards/HPSv2": 0.2720451354980469, + "rewards/ORM": 0.7844289541244507, + "self_certainty_semantic": -25.25, + "self_certainty_token": -21.3125, + "step": 290 + }, + { + "completion_length": 40.578125, + "epoch": 0.3222591362126246, + "grad_norm": 1.5936462879180908, + "kl": 0.02667236328125, + "learning_rate": 8.18125e-07, + "loss": 0.00672217458486557, + "reward": 1.89811110496521, + "reward_std": 0.3131570816040039, + "rewards/GDino": 0.7552083432674408, + "rewards/GIT": 0.38471363484859467, + "rewards/HPSv2": 0.26104736328125, + "rewards/ORM": 0.4971417486667633, + "self_certainty_semantic": -25.1875, + "self_certainty_token": -21.0625, + "step": 291 + }, + { + "completion_length": 52.265625, + "epoch": 0.3233665559246955, + "grad_norm": 0.5862070918083191, + "kl": 0.0191650390625, + "learning_rate": 8.175e-07, + "loss": 0.0030853250063955784, + "reward": 2.2179116010665894, + "reward_std": 0.354862704873085, + "rewards/GDino": 0.8304687440395355, + "rewards/GIT": 0.5982599407434464, + "rewards/HPSv2": 0.26502227783203125, + "rewards/ORM": 0.5241607576608658, + "self_certainty_semantic": -25.0, + "self_certainty_token": -21.0, + "step": 292 + }, + { + "completion_length": 44.359375, + "epoch": 0.32447397563676633, + "grad_norm": 0.49480700492858887, + "kl": 0.025146484375, + "learning_rate": 8.16875e-07, + "loss": 0.00254095159471035, + "reward": 2.3431068062782288, + "reward_std": 0.2980917990207672, + "rewards/GDino": 0.7105664908885956, + "rewards/GIT": 0.45115914195775986, + "rewards/HPSv2": 0.2673187255859375, + "rewards/ORM": 0.9140625, + "self_certainty_semantic": -25.25, + "self_certainty_token": -20.9375, + "step": 293 + }, + { + "completion_length": 37.453125, + "epoch": 0.32558139534883723, + "grad_norm": 0.567101776599884, + "kl": 0.031494140625, + "learning_rate": 8.1625e-07, + "loss": -0.0005339896306395531, + "reward": 2.6543023586273193, + "reward_std": 0.3742424249649048, + "rewards/GDino": 0.8531249761581421, + "rewards/GIT": 0.5934174060821533, + "rewards/HPSv2": 0.2702598571777344, + "rewards/ORM": 0.9375, + "self_certainty_semantic": -25.0625, + "self_certainty_token": -21.625, + "step": 294 + }, + { + "completion_length": 42.46875, + "epoch": 0.3266888150609081, + "grad_norm": 0.7111007571220398, + "kl": 0.0341796875, + "learning_rate": 8.15625e-07, + "loss": 0.006241308408789337, + "reward": 1.6757753491401672, + "reward_std": 0.4623751789331436, + "rewards/GDino": 0.6648381352424622, + "rewards/GIT": 0.07737746834754944, + "rewards/HPSv2": 0.2832469940185547, + "rewards/ORM": 0.6503127217292786, + "self_certainty_semantic": -25.125, + "self_certainty_token": -21.75, + "step": 295 + }, + { + "completion_length": 36.21875, + "epoch": 0.327796234772979, + "grad_norm": 0.563860297203064, + "kl": 0.02325439453125, + "learning_rate": 8.149999999999999e-07, + "loss": 0.019053890835493803, + "reward": 2.774321436882019, + "reward_std": 0.19439150393009186, + "rewards/GDino": 0.971875011920929, + "rewards/GIT": 0.7940655946731567, + "rewards/HPSv2": 0.2748088836669922, + "rewards/ORM": 0.733571857213974, + "self_certainty_semantic": -25.625, + "self_certainty_token": -20.3125, + "step": 296 + }, + { + "completion_length": 35.34375, + "epoch": 0.3289036544850498, + "grad_norm": 0.5320677161216736, + "kl": 0.02813720703125, + "learning_rate": 8.143749999999999e-07, + "loss": -8.378783240914345e-05, + "reward": 1.9089605808258057, + "reward_std": 0.33931228518486023, + "rewards/GDino": 0.7917934358119965, + "rewards/GIT": 0.2872927859425545, + "rewards/HPSv2": 0.26018333435058594, + "rewards/ORM": 0.5696910321712494, + "self_certainty_semantic": -25.0625, + "self_certainty_token": -22.125, + "step": 297 + }, + { + "completion_length": 39.84375, + "epoch": 0.3300110741971207, + "grad_norm": 0.6157541871070862, + "kl": 0.02362060546875, + "learning_rate": 8.137499999999999e-07, + "loss": 0.0025934623554348946, + "reward": 2.2229180932044983, + "reward_std": 0.28714166209101677, + "rewards/GDino": 0.8539158701896667, + "rewards/GIT": 0.6188642829656601, + "rewards/HPSv2": 0.2704963684082031, + "rewards/ORM": 0.47964154183864594, + "self_certainty_semantic": -25.3125, + "self_certainty_token": -21.4375, + "step": 298 + }, + { + "completion_length": 40.296875, + "epoch": 0.33111849390919157, + "grad_norm": 0.5113131403923035, + "kl": 0.016204833984375, + "learning_rate": 8.131249999999999e-07, + "loss": -0.00016327621415257454, + "reward": 2.4606435298919678, + "reward_std": 0.4053623676300049, + "rewards/GDino": 0.8729166686534882, + "rewards/GIT": 0.7344014346599579, + "rewards/HPSv2": 0.2575702667236328, + "rewards/ORM": 0.5957551002502441, + "self_certainty_semantic": -25.25, + "self_certainty_token": -21.3125, + "step": 299 + }, + { + "completion_length": 34.75, + "epoch": 0.33222591362126247, + "grad_norm": 0.7174614071846008, + "kl": 0.02789306640625, + "learning_rate": 8.125e-07, + "loss": -0.008093067444860935, + "reward": 2.5811712741851807, + "reward_std": 0.26995332539081573, + "rewards/GDino": 0.8097889721393585, + "rewards/GIT": 0.7062003910541534, + "rewards/HPSv2": 0.2668323516845703, + "rewards/ORM": 0.7983495891094208, + "self_certainty_semantic": -25.125, + "self_certainty_token": -21.25, + "step": 300 + }, + { + "completion_length": 40.921875, + "epoch": 0.3333333333333333, + "grad_norm": 0.5175552368164062, + "kl": 0.021728515625, + "learning_rate": 8.11875e-07, + "loss": -0.0006439080461859703, + "reward": 2.1311718225479126, + "reward_std": 0.41609956324100494, + "rewards/GDino": 0.7800666689872742, + "rewards/GIT": 0.42304590344429016, + "rewards/HPSv2": 0.24529266357421875, + "rewards/ORM": 0.6827665567398071, + "self_certainty_semantic": -25.4375, + "self_certainty_token": -21.75, + "step": 301 + }, + { + "completion_length": 39.75, + "epoch": 0.3344407530454042, + "grad_norm": 0.4989144802093506, + "kl": 0.02520751953125, + "learning_rate": 8.1125e-07, + "loss": -0.0295205507427454, + "reward": 2.2222087383270264, + "reward_std": 0.28523801267147064, + "rewards/GDino": 0.807892918586731, + "rewards/GIT": 0.4448024183511734, + "rewards/HPSv2": 0.2647838592529297, + "rewards/ORM": 0.704729437828064, + "self_certainty_semantic": -25.125, + "self_certainty_token": -21.5, + "step": 302 + }, + { + "completion_length": 34.4375, + "epoch": 0.33554817275747506, + "grad_norm": 2.3144850730895996, + "kl": 0.0274658203125, + "learning_rate": 8.10625e-07, + "loss": 3.9184000343084335e-05, + "reward": 1.9459970593452454, + "reward_std": 0.3054793253540993, + "rewards/GDino": 0.6584998369216919, + "rewards/GIT": 0.17534568905830383, + "rewards/HPSv2": 0.2799644470214844, + "rewards/ORM": 0.8321870565414429, + "self_certainty_semantic": -25.125, + "self_certainty_token": -22.0, + "step": 303 + }, + { + "completion_length": 35.15625, + "epoch": 0.33665559246954596, + "grad_norm": 0.49459758400917053, + "kl": 0.01934814453125, + "learning_rate": 8.1e-07, + "loss": 0.024405019357800484, + "reward": 2.1167579889297485, + "reward_std": 0.40585383772850037, + "rewards/GDino": 0.729687511920929, + "rewards/GIT": 0.4396758899092674, + "rewards/HPSv2": 0.27677345275878906, + "rewards/ORM": 0.670621246099472, + "self_certainty_semantic": -25.25, + "self_certainty_token": -21.6875, + "step": 304 + }, + { + "completion_length": 43.234375, + "epoch": 0.3377630121816168, + "grad_norm": 0.4376680552959442, + "kl": 0.02001953125, + "learning_rate": 8.093749999999999e-07, + "loss": 0.016018358757719398, + "reward": 1.974160075187683, + "reward_std": 0.36807504296302795, + "rewards/GDino": 0.7174843549728394, + "rewards/GIT": 0.35652345418930054, + "rewards/HPSv2": 0.25952720642089844, + "rewards/ORM": 0.640625, + "self_certainty_semantic": -25.0625, + "self_certainty_token": -21.8125, + "step": 305 + }, + { + "completion_length": 41.0, + "epoch": 0.3388704318936877, + "grad_norm": 0.4617772400379181, + "kl": 0.02471923828125, + "learning_rate": 8.087499999999999e-07, + "loss": -0.016240317840129137, + "reward": 1.7467725276947021, + "reward_std": 0.5162160992622375, + "rewards/GDino": 0.6519080102443695, + "rewards/GIT": 0.19782309979200363, + "rewards/HPSv2": 0.27204132080078125, + "rewards/ORM": 0.625, + "self_certainty_semantic": -25.25, + "self_certainty_token": -21.625, + "step": 306 + }, + { + "completion_length": 43.171875, + "epoch": 0.3399778516057586, + "grad_norm": 0.5335010886192322, + "kl": 0.02056884765625, + "learning_rate": 8.08125e-07, + "loss": 0.01701421057805419, + "reward": 2.1066606044769287, + "reward_std": 0.22635112702846527, + "rewards/GDino": 0.7486607134342194, + "rewards/GIT": 0.6341286897659302, + "rewards/HPSv2": 0.25512123107910156, + "rewards/ORM": 0.46875, + "self_certainty_semantic": -25.25, + "self_certainty_token": -21.6875, + "step": 307 + }, + { + "completion_length": 40.453125, + "epoch": 0.34108527131782945, + "grad_norm": 0.524975061416626, + "kl": 0.02642822265625, + "learning_rate": 8.075e-07, + "loss": -0.009714136831462383, + "reward": 2.396199941635132, + "reward_std": 0.33491872251033783, + "rewards/GDino": 0.9330095648765564, + "rewards/GIT": 0.6121770441532135, + "rewards/HPSv2": 0.2553081512451172, + "rewards/ORM": 0.5957051813602448, + "self_certainty_semantic": -25.0625, + "self_certainty_token": -21.75, + "step": 308 + }, + { + "completion_length": 35.671875, + "epoch": 0.34219269102990035, + "grad_norm": 0.6639419198036194, + "kl": 0.0311279296875, + "learning_rate": 8.06875e-07, + "loss": -0.029759632423520088, + "reward": 1.9770995378494263, + "reward_std": 0.2913192883133888, + "rewards/GDino": 0.7039497792720795, + "rewards/GIT": 0.32885563373565674, + "rewards/HPSv2": 0.2879142761230469, + "rewards/ORM": 0.656379908323288, + "self_certainty_semantic": -25.0625, + "self_certainty_token": -21.375, + "step": 309 + }, + { + "completion_length": 34.53125, + "epoch": 0.3433001107419712, + "grad_norm": 0.5760204195976257, + "kl": 0.02392578125, + "learning_rate": 8.0625e-07, + "loss": -0.006422802805900574, + "reward": 2.6306651830673218, + "reward_std": 0.45363010466098785, + "rewards/GDino": 0.8823047280311584, + "rewards/GIT": 0.6377567946910858, + "rewards/HPSv2": 0.25002479553222656, + "rewards/ORM": 0.8605788052082062, + "self_certainty_semantic": -25.1875, + "self_certainty_token": -21.375, + "step": 310 + }, + { + "completion_length": 37.0, + "epoch": 0.3444075304540421, + "grad_norm": 0.46515122056007385, + "kl": 0.0203857421875, + "learning_rate": 8.05625e-07, + "loss": 0.004794539883732796, + "reward": 2.3176556825637817, + "reward_std": 0.28090114146471024, + "rewards/GDino": 0.7750000357627869, + "rewards/GIT": 0.5293775945901871, + "rewards/HPSv2": 0.25220680236816406, + "rewards/ORM": 0.7610713541507721, + "self_certainty_semantic": -25.1875, + "self_certainty_token": -20.9375, + "step": 311 + }, + { + "completion_length": 39.09375, + "epoch": 0.34551495016611294, + "grad_norm": 1.104447841644287, + "kl": 0.02801513671875, + "learning_rate": 8.05e-07, + "loss": 0.007791162468492985, + "reward": 2.1565089225769043, + "reward_std": 0.5575137436389923, + "rewards/GDino": 0.8127583563327789, + "rewards/GIT": 0.4446650370955467, + "rewards/HPSv2": 0.26712608337402344, + "rewards/ORM": 0.6319593787193298, + "self_certainty_semantic": -25.0625, + "self_certainty_token": -21.5625, + "step": 312 + }, + { + "completion_length": 45.46875, + "epoch": 0.34662236987818384, + "grad_norm": 0.5203121304512024, + "kl": 0.072509765625, + "learning_rate": 8.043749999999999e-07, + "loss": 0.012506457045674324, + "reward": 1.833688497543335, + "reward_std": 0.3239995241165161, + "rewards/GDino": 0.6667128503322601, + "rewards/GIT": 0.2989926040172577, + "rewards/HPSv2": 0.2667713165283203, + "rewards/ORM": 0.6012117862701416, + "self_certainty_semantic": -25.0625, + "self_certainty_token": -21.1875, + "step": 313 + }, + { + "completion_length": 40.03125, + "epoch": 0.3477297895902547, + "grad_norm": 0.5220968127250671, + "kl": 0.029296875, + "learning_rate": 8.037499999999999e-07, + "loss": 0.015390763524919748, + "reward": 1.8818817138671875, + "reward_std": 0.3023659586906433, + "rewards/GDino": 0.7861979305744171, + "rewards/GIT": 0.461479514837265, + "rewards/HPSv2": 0.24705886840820312, + "rewards/ORM": 0.3871453106403351, + "self_certainty_semantic": -25.0625, + "self_certainty_token": -21.0, + "step": 314 + }, + { + "completion_length": 40.84375, + "epoch": 0.3488372093023256, + "grad_norm": 0.6556162238121033, + "kl": 0.02447509765625, + "learning_rate": 8.031249999999999e-07, + "loss": -0.011867262073792517, + "reward": 2.328538656234741, + "reward_std": 0.3933458775281906, + "rewards/GDino": 0.8490088582038879, + "rewards/GIT": 0.5727739632129669, + "rewards/HPSv2": 0.27246856689453125, + "rewards/ORM": 0.6342871189117432, + "self_certainty_semantic": -25.1875, + "self_certainty_token": -21.6875, + "step": 315 + }, + { + "completion_length": 45.859375, + "epoch": 0.34994462901439644, + "grad_norm": 0.9298399686813354, + "kl": 0.0322265625, + "learning_rate": 8.024999999999999e-07, + "loss": 0.004026470240205526, + "reward": 2.328371286392212, + "reward_std": 0.3506554663181305, + "rewards/GDino": 0.6822916865348816, + "rewards/GIT": 0.5789465606212616, + "rewards/HPSv2": 0.25463294982910156, + "rewards/ORM": 0.8125, + "self_certainty_semantic": -25.125, + "self_certainty_token": -21.375, + "step": 316 + }, + { + "completion_length": 41.609375, + "epoch": 0.35105204872646734, + "grad_norm": 1.1766077280044556, + "kl": 0.03387451171875, + "learning_rate": 8.018749999999999e-07, + "loss": 0.0020321097690612078, + "reward": 1.7646815776824951, + "reward_std": 0.3271322548389435, + "rewards/GDino": 0.7106184661388397, + "rewards/GIT": 0.21883262693881989, + "rewards/HPSv2": 0.28487586975097656, + "rewards/ORM": 0.5503546893596649, + "self_certainty_semantic": -25.25, + "self_certainty_token": -21.1875, + "step": 317 + }, + { + "completion_length": 36.171875, + "epoch": 0.3521594684385382, + "grad_norm": 0.5149301290512085, + "kl": 0.025634765625, + "learning_rate": 8.0125e-07, + "loss": -0.03277874179184437, + "reward": 2.138643741607666, + "reward_std": 0.3318810313940048, + "rewards/GDino": 0.7371024787425995, + "rewards/GIT": 0.4322855994105339, + "rewards/HPSv2": 0.26688194274902344, + "rewards/ORM": 0.7023736536502838, + "self_certainty_semantic": -25.125, + "self_certainty_token": -21.5, + "step": 318 + }, + { + "completion_length": 37.6875, + "epoch": 0.3532668881506091, + "grad_norm": 1.4863990545272827, + "kl": 0.02178955078125, + "learning_rate": 8.00625e-07, + "loss": -0.002279828302562237, + "reward": 1.8946493268013, + "reward_std": 0.27233169972896576, + "rewards/GDino": 0.7110094726085663, + "rewards/GIT": 0.23650505393743515, + "rewards/HPSv2": 0.2566356658935547, + "rewards/ORM": 0.6904990971088409, + "self_certainty_semantic": -25.125, + "self_certainty_token": -21.125, + "step": 319 + }, + { + "completion_length": 47.5, + "epoch": 0.35437430786268, + "grad_norm": 0.43619126081466675, + "kl": 0.023193359375, + "learning_rate": 8e-07, + "loss": 0.003124894807115197, + "reward": 2.0905760526657104, + "reward_std": 0.3077670633792877, + "rewards/GDino": 0.7233002185821533, + "rewards/GIT": 0.5371533036231995, + "rewards/HPSv2": 0.2641429901123047, + "rewards/ORM": 0.5659796893596649, + "self_certainty_semantic": -25.125, + "self_certainty_token": -21.625, + "step": 320 + }, + { + "completion_length": 45.390625, + "epoch": 0.3554817275747508, + "grad_norm": 0.5557716488838196, + "kl": 0.037109375, + "learning_rate": 7.993749999999999e-07, + "loss": -0.010770568624138832, + "reward": 2.225831151008606, + "reward_std": 0.3986949622631073, + "rewards/GDino": 0.8189918994903564, + "rewards/GIT": 0.45973852276802063, + "rewards/HPSv2": 0.2627296447753906, + "rewards/ORM": 0.6843710243701935, + "self_certainty_semantic": -25.0625, + "self_certainty_token": -20.875, + "step": 321 + }, + { + "completion_length": 29.953125, + "epoch": 0.35658914728682173, + "grad_norm": 0.6458404660224915, + "kl": 0.03125, + "learning_rate": 7.9875e-07, + "loss": -0.01060107909142971, + "reward": 2.633362889289856, + "reward_std": 0.4656294733285904, + "rewards/GDino": 0.9093749523162842, + "rewards/GIT": 0.6372530907392502, + "rewards/HPSv2": 0.2742347717285156, + "rewards/ORM": 0.8124999701976776, + "self_certainty_semantic": -25.25, + "self_certainty_token": -22.125, + "step": 322 + }, + { + "completion_length": 33.609375, + "epoch": 0.3576965669988926, + "grad_norm": 0.7263385653495789, + "kl": 0.03936767578125, + "learning_rate": 7.98125e-07, + "loss": 0.017037604935467243, + "reward": 2.223346173763275, + "reward_std": 0.27071254700422287, + "rewards/GDino": 0.8520833551883698, + "rewards/GIT": 0.49767685681581497, + "rewards/HPSv2": 0.26546478271484375, + "rewards/ORM": 0.6081212162971497, + "self_certainty_semantic": -25.0625, + "self_certainty_token": -21.8125, + "step": 323 + }, + { + "completion_length": 44.796875, + "epoch": 0.3588039867109635, + "grad_norm": 0.6808414459228516, + "kl": 0.0328369140625, + "learning_rate": 7.975e-07, + "loss": 0.006063681095838547, + "reward": 2.1634711027145386, + "reward_std": 0.39905455708503723, + "rewards/GDino": 0.7807291746139526, + "rewards/GIT": 0.5616367310285568, + "rewards/HPSv2": 0.2618141174316406, + "rewards/ORM": 0.5592910945415497, + "self_certainty_semantic": -25.375, + "self_certainty_token": -20.5, + "step": 324 + }, + { + "completion_length": 34.78125, + "epoch": 0.3599114064230343, + "grad_norm": 0.6440313458442688, + "kl": 0.0216064453125, + "learning_rate": 7.96875e-07, + "loss": 0.0036031128838658333, + "reward": 2.2728426456451416, + "reward_std": 0.36220329999923706, + "rewards/GDino": 0.8558787703514099, + "rewards/GIT": 0.5083786845207214, + "rewards/HPSv2": 0.24144363403320312, + "rewards/ORM": 0.6671415269374847, + "self_certainty_semantic": -25.4375, + "self_certainty_token": -21.75, + "step": 325 + }, + { + "completion_length": 40.875, + "epoch": 0.3610188261351052, + "grad_norm": 3325586688.0, + "kl": 7208960.011962891, + "learning_rate": 7.9625e-07, + "loss": 72052.51197006088, + "reward": 2.1175559759140015, + "reward_std": 0.23405619710683823, + "rewards/GDino": 0.7784741222858429, + "rewards/GIT": 0.4605119079351425, + "rewards/HPSv2": 0.2704486846923828, + "rewards/ORM": 0.6081212311983109, + "self_certainty_semantic": -25.125, + "self_certainty_token": -21.8125, + "step": 326 + }, + { + "completion_length": 34.125, + "epoch": 0.36212624584717606, + "grad_norm": 2.4330286979675293, + "kl": 0.0244140625, + "learning_rate": 7.95625e-07, + "loss": 0.006208475679159164, + "reward": 2.148754298686981, + "reward_std": 0.17882763594388962, + "rewards/GDino": 0.6925546824932098, + "rewards/GIT": 0.5509190559387207, + "rewards/HPSv2": 0.2755470275878906, + "rewards/ORM": 0.6297334432601929, + "self_certainty_semantic": -25.0625, + "self_certainty_token": -21.5625, + "step": 327 + }, + { + "completion_length": 28.40625, + "epoch": 0.36323366555924697, + "grad_norm": 0.5411335229873657, + "kl": 0.04150390625, + "learning_rate": 7.95e-07, + "loss": 0.002802429720759392, + "reward": 2.179747223854065, + "reward_std": 0.22097515314817429, + "rewards/GDino": 0.796875, + "rewards/GIT": 0.4172445684671402, + "rewards/HPSv2": 0.29375267028808594, + "rewards/ORM": 0.671875, + "self_certainty_semantic": -25.0625, + "self_certainty_token": -21.9375, + "step": 328 + }, + { + "completion_length": 49.09375, + "epoch": 0.3643410852713178, + "grad_norm": 0.6255419254302979, + "kl": 0.01898193359375, + "learning_rate": 7.94375e-07, + "loss": -0.01904204487800598, + "reward": 2.2398595809936523, + "reward_std": 0.2658727616071701, + "rewards/GDino": 0.7032954692840576, + "rewards/GIT": 0.4768924117088318, + "rewards/HPSv2": 0.2772178649902344, + "rewards/ORM": 0.7824538052082062, + "self_certainty_semantic": -25.3125, + "self_certainty_token": -21.625, + "step": 329 + }, + { + "completion_length": 40.21875, + "epoch": 0.3654485049833887, + "grad_norm": 0.5795601606369019, + "kl": 0.03448486328125, + "learning_rate": 7.937499999999999e-07, + "loss": -0.028916708659380674, + "reward": 1.9032581448554993, + "reward_std": 0.3265683054924011, + "rewards/GDino": 0.7300936579704285, + "rewards/GIT": 0.3637319356203079, + "rewards/HPSv2": 0.2578239440917969, + "rewards/ORM": 0.5516084283590317, + "self_certainty_semantic": -25.0625, + "self_certainty_token": -21.125, + "step": 330 + }, + { + "completion_length": 39.0625, + "epoch": 0.36655592469545956, + "grad_norm": 0.5067495703697205, + "kl": 0.02813720703125, + "learning_rate": 7.931249999999999e-07, + "loss": -0.009640714153647423, + "reward": 2.4922925233840942, + "reward_std": 0.3301289528608322, + "rewards/GDino": 0.870312511920929, + "rewards/GIT": 0.7084252834320068, + "rewards/HPSv2": 0.2510967254638672, + "rewards/ORM": 0.6624580323696136, + "self_certainty_semantic": -25.125, + "self_certainty_token": -21.625, + "step": 331 + }, + { + "completion_length": 47.796875, + "epoch": 0.36766334440753046, + "grad_norm": 0.671696662902832, + "kl": 0.02166748046875, + "learning_rate": 7.924999999999999e-07, + "loss": -0.004493948072195053, + "reward": 1.7546527981758118, + "reward_std": 0.38349293172359467, + "rewards/GDino": 0.7021358013153076, + "rewards/GIT": 0.32257433235645294, + "rewards/HPSv2": 0.25150489807128906, + "rewards/ORM": 0.47843775153160095, + "self_certainty_semantic": -25.3125, + "self_certainty_token": -21.0625, + "step": 332 + }, + { + "completion_length": 45.03125, + "epoch": 0.3687707641196013, + "grad_norm": 0.5712724924087524, + "kl": 0.0302734375, + "learning_rate": 7.918749999999999e-07, + "loss": 0.019796861335635185, + "reward": 2.2836796045303345, + "reward_std": 0.3291999250650406, + "rewards/GDino": 0.7703441679477692, + "rewards/GIT": 0.4380648583173752, + "rewards/HPSv2": 0.2811241149902344, + "rewards/ORM": 0.7941466867923737, + "self_certainty_semantic": -25.0625, + "self_certainty_token": -21.5625, + "step": 333 + }, + { + "completion_length": 34.546875, + "epoch": 0.3698781838316722, + "grad_norm": 0.4348975718021393, + "kl": 0.01947021484375, + "learning_rate": 7.912499999999999e-07, + "loss": 0.032695122761651874, + "reward": 2.472940444946289, + "reward_std": 0.2854895293712616, + "rewards/GDino": 0.8704217672348022, + "rewards/GIT": 0.5610889345407486, + "rewards/HPSv2": 0.25870513916015625, + "rewards/ORM": 0.7827245891094208, + "self_certainty_semantic": -25.125, + "self_certainty_token": -21.125, + "step": 334 + }, + { + "completion_length": 45.109375, + "epoch": 0.3709856035437431, + "grad_norm": 0.5861725807189941, + "kl": 0.03179931640625, + "learning_rate": 7.90625e-07, + "loss": 0.001954459585249424, + "reward": 2.100743532180786, + "reward_std": 0.34255510568618774, + "rewards/GDino": 0.8146567642688751, + "rewards/GIT": 0.3769591152667999, + "rewards/HPSv2": 0.27979469299316406, + "rewards/ORM": 0.6293327808380127, + "self_certainty_semantic": -25.0625, + "self_certainty_token": -21.625, + "step": 335 + }, + { + "completion_length": 47.125, + "epoch": 0.37209302325581395, + "grad_norm": 0.47423750162124634, + "kl": 0.02886962890625, + "learning_rate": 7.9e-07, + "loss": -0.0015065963380038738, + "reward": 2.343234896659851, + "reward_std": 0.2731991782784462, + "rewards/GDino": 0.8291987180709839, + "rewards/GIT": 0.7442143559455872, + "rewards/HPSv2": 0.27597999572753906, + "rewards/ORM": 0.49384191632270813, + "self_certainty_semantic": -25.3125, + "self_certainty_token": -21.6875, + "step": 336 + }, + { + "completion_length": 47.265625, + "epoch": 0.37320044296788485, + "grad_norm": 0.45597055554389954, + "kl": 0.02423095703125, + "learning_rate": 7.893750000000001e-07, + "loss": 0.025022207759320736, + "reward": 1.9880129098892212, + "reward_std": 0.31796349585056305, + "rewards/GDino": 0.7098958492279053, + "rewards/GIT": 0.37319713830947876, + "rewards/HPSv2": 0.2521495819091797, + "rewards/ORM": 0.6527703106403351, + "self_certainty_semantic": -25.375, + "self_certainty_token": -21.3125, + "step": 337 + }, + { + "completion_length": 37.234375, + "epoch": 0.3743078626799557, + "grad_norm": 0.5559476017951965, + "kl": 0.03076171875, + "learning_rate": 7.8875e-07, + "loss": -0.0021500587463378906, + "reward": 1.8510813117027283, + "reward_std": 0.4456440210342407, + "rewards/GDino": 0.7363041043281555, + "rewards/GIT": 0.4989383816719055, + "rewards/HPSv2": 0.2646770477294922, + "rewards/ORM": 0.3511618450284004, + "self_certainty_semantic": -25.25, + "self_certainty_token": -21.4375, + "step": 338 + }, + { + "completion_length": 37.109375, + "epoch": 0.3754152823920266, + "grad_norm": 0.7247738838195801, + "kl": 0.025390625, + "learning_rate": 7.88125e-07, + "loss": 0.005743648856878281, + "reward": 2.28269100189209, + "reward_std": 0.4758919030427933, + "rewards/GDino": 0.7302083075046539, + "rewards/GIT": 0.5845783352851868, + "rewards/HPSv2": 0.26825904846191406, + "rewards/ORM": 0.6996453106403351, + "self_certainty_semantic": -25.0625, + "self_certainty_token": -21.625, + "step": 339 + }, + { + "completion_length": 33.828125, + "epoch": 0.37652270210409744, + "grad_norm": 0.46468424797058105, + "kl": 0.028076171875, + "learning_rate": 7.875e-07, + "loss": 0.0004999474622309208, + "reward": 2.349074602127075, + "reward_std": 0.37045538425445557, + "rewards/GDino": 0.7835937142372131, + "rewards/GIT": 0.4427451193332672, + "rewards/HPSv2": 0.2726478576660156, + "rewards/ORM": 0.8500878810882568, + "self_certainty_semantic": -25.125, + "self_certainty_token": -21.375, + "step": 340 + }, + { + "completion_length": 37.140625, + "epoch": 0.37763012181616834, + "grad_norm": 0.978127121925354, + "kl": 0.033447265625, + "learning_rate": 7.86875e-07, + "loss": -0.014791712863370776, + "reward": 1.7075093984603882, + "reward_std": 0.4154247045516968, + "rewards/GDino": 0.6365624964237213, + "rewards/GIT": 0.23427681624889374, + "rewards/HPSv2": 0.2517547607421875, + "rewards/ORM": 0.5849153995513916, + "self_certainty_semantic": -25.0, + "self_certainty_token": -21.5, + "step": 341 + }, + { + "completion_length": 39.640625, + "epoch": 0.3787375415282392, + "grad_norm": 0.5854309797286987, + "kl": 0.02081298828125, + "learning_rate": 7.8625e-07, + "loss": 0.008088980801403522, + "reward": 1.816591739654541, + "reward_std": 0.43020693957805634, + "rewards/GDino": 0.6936458945274353, + "rewards/GIT": 0.40889062732458115, + "rewards/HPSv2": 0.2503395080566406, + "rewards/ORM": 0.4637157917022705, + "self_certainty_semantic": -25.0, + "self_certainty_token": -21.3125, + "step": 342 + }, + { + "completion_length": 33.859375, + "epoch": 0.3798449612403101, + "grad_norm": 0.5576260089874268, + "kl": 0.02655029296875, + "learning_rate": 7.85625e-07, + "loss": 0.007988112978637218, + "reward": 2.101312041282654, + "reward_std": 0.2812753766775131, + "rewards/GDino": 0.7948440611362457, + "rewards/GIT": 0.2880048602819443, + "rewards/HPSv2": 0.2610511779785156, + "rewards/ORM": 0.7574118673801422, + "self_certainty_semantic": -25.1875, + "self_certainty_token": -21.4375, + "step": 343 + }, + { + "completion_length": 39.25, + "epoch": 0.38095238095238093, + "grad_norm": 0.546230673789978, + "kl": 0.0289306640625, + "learning_rate": 7.85e-07, + "loss": 0.010598460678011179, + "reward": 2.3708373308181763, + "reward_std": 0.32361893355846405, + "rewards/GDino": 0.7421875298023224, + "rewards/GIT": 0.7563790380954742, + "rewards/HPSv2": 0.2398090362548828, + "rewards/ORM": 0.632461816072464, + "self_certainty_semantic": -25.0625, + "self_certainty_token": -21.625, + "step": 344 + }, + { + "completion_length": 43.375, + "epoch": 0.38205980066445183, + "grad_norm": 0.4513307213783264, + "kl": 0.0263671875, + "learning_rate": 7.84375e-07, + "loss": -0.004892995581030846, + "reward": 1.5629626512527466, + "reward_std": 0.5399304926395416, + "rewards/GDino": 0.6694765985012054, + "rewards/GIT": 0.28154174983501434, + "rewards/HPSv2": 0.23694419860839844, + "rewards/ORM": 0.375, + "self_certainty_semantic": -24.8125, + "self_certainty_token": -21.9375, + "step": 345 + }, + { + "completion_length": 40.375, + "epoch": 0.3831672203765227, + "grad_norm": 0.6939501762390137, + "kl": 0.0322265625, + "learning_rate": 7.837499999999999e-07, + "loss": -0.0008523159194737673, + "reward": 2.43539160490036, + "reward_std": 0.2919086888432503, + "rewards/GDino": 0.7796874940395355, + "rewards/GIT": 0.6580008417367935, + "rewards/HPSv2": 0.25712013244628906, + "rewards/ORM": 0.740583062171936, + "self_certainty_semantic": -25.3125, + "self_certainty_token": -21.9375, + "step": 346 + }, + { + "completion_length": 40.0, + "epoch": 0.3842746400885936, + "grad_norm": 0.6701419353485107, + "kl": 0.03582763671875, + "learning_rate": 7.831249999999999e-07, + "loss": 0.002183706732466817, + "reward": 1.8965783715248108, + "reward_std": 0.35588257014751434, + "rewards/GDino": 0.6681874692440033, + "rewards/GIT": 0.30046548694372177, + "rewards/HPSv2": 0.27935791015625, + "rewards/ORM": 0.6485673785209656, + "self_certainty_semantic": -25.3125, + "self_certainty_token": -22.375, + "step": 347 + }, + { + "completion_length": 53.6875, + "epoch": 0.3853820598006645, + "grad_norm": 0.5544401407241821, + "kl": 0.036376953125, + "learning_rate": 7.824999999999999e-07, + "loss": 0.017971528694033623, + "reward": 1.9810231924057007, + "reward_std": 0.416934609413147, + "rewards/GDino": 0.6915624737739563, + "rewards/GIT": 0.36784209311008453, + "rewards/HPSv2": 0.2577857971191406, + "rewards/ORM": 0.6638327836990356, + "self_certainty_semantic": -25.375, + "self_certainty_token": -20.625, + "step": 348 + }, + { + "completion_length": 41.96875, + "epoch": 0.3864894795127353, + "grad_norm": 0.9691958427429199, + "kl": 0.02850341796875, + "learning_rate": 7.818749999999999e-07, + "loss": 0.006191606633365154, + "reward": 1.978796124458313, + "reward_std": 0.2658599987626076, + "rewards/GDino": 0.7494639158248901, + "rewards/GIT": 0.3184107020497322, + "rewards/HPSv2": 0.2684211730957031, + "rewards/ORM": 0.6425002217292786, + "self_certainty_semantic": -25.25, + "self_certainty_token": -22.1875, + "step": 349 + }, + { + "completion_length": 42.1875, + "epoch": 0.3875968992248062, + "grad_norm": 5.835413455963135, + "kl": 0.04150390625, + "learning_rate": 7.812499999999999e-07, + "loss": -0.00010933470912277699, + "reward": 2.14810848236084, + "reward_std": 0.40148836374282837, + "rewards/GDino": 0.7505297660827637, + "rewards/GIT": 0.42371469736099243, + "rewards/HPSv2": 0.2747211456298828, + "rewards/ORM": 0.6991429328918457, + "self_certainty_semantic": -25.3125, + "self_certainty_token": -21.125, + "step": 350 + }, + { + "completion_length": 35.765625, + "epoch": 0.38870431893687707, + "grad_norm": 0.4737742841243744, + "kl": 0.03106689453125, + "learning_rate": 7.806249999999999e-07, + "loss": 0.005023700650781393, + "reward": 2.1896114349365234, + "reward_std": 0.5353913605213165, + "rewards/GDino": 0.8089349865913391, + "rewards/GIT": 0.3830580413341522, + "rewards/HPSv2": 0.2619895935058594, + "rewards/ORM": 0.735628753900528, + "self_certainty_semantic": -24.75, + "self_certainty_token": -22.1875, + "step": 351 + }, + { + "completion_length": 34.25, + "epoch": 0.38981173864894797, + "grad_norm": 0.4819023907184601, + "kl": 0.0419921875, + "learning_rate": 7.799999999999999e-07, + "loss": 0.0019328959751874208, + "reward": 1.978476643562317, + "reward_std": 0.36969317495822906, + "rewards/GDino": 0.6918750107288361, + "rewards/GIT": 0.29961538314819336, + "rewards/HPSv2": 0.2604236602783203, + "rewards/ORM": 0.7265625, + "self_certainty_semantic": -25.25, + "self_certainty_token": -21.8125, + "step": 352 + }, + { + "completion_length": 45.15625, + "epoch": 0.3909191583610188, + "grad_norm": 0.5503346920013428, + "kl": 0.0274658203125, + "learning_rate": 7.793750000000001e-07, + "loss": 0.019882426131516695, + "reward": 1.7672075629234314, + "reward_std": 0.40149015188217163, + "rewards/GDino": 0.5861979126930237, + "rewards/GIT": 0.35189656913280487, + "rewards/HPSv2": 0.26621246337890625, + "rewards/ORM": 0.5629006028175354, + "self_certainty_semantic": -25.25, + "self_certainty_token": -22.0, + "step": 353 + }, + { + "completion_length": 30.78125, + "epoch": 0.3920265780730897, + "grad_norm": 0.47999808192253113, + "kl": 0.03118896484375, + "learning_rate": 7.787500000000001e-07, + "loss": 0.015973938745446503, + "reward": 2.0040203332901, + "reward_std": 0.5554044544696808, + "rewards/GDino": 0.666580930352211, + "rewards/GIT": 0.35180309414863586, + "rewards/HPSv2": 0.2794322967529297, + "rewards/ORM": 0.7062040567398071, + "self_certainty_semantic": -25.0625, + "self_certainty_token": -22.1875, + "step": 354 + }, + { + "completion_length": 37.171875, + "epoch": 0.39313399778516056, + "grad_norm": 0.6516053080558777, + "kl": 0.03759765625, + "learning_rate": 7.78125e-07, + "loss": 0.02263426687568426, + "reward": 2.3876103162765503, + "reward_std": 0.2892715036869049, + "rewards/GDino": 0.7893574237823486, + "rewards/GIT": 0.6562651693820953, + "rewards/HPSv2": 0.2697620391845703, + "rewards/ORM": 0.6722257137298584, + "self_certainty_semantic": -25.25, + "self_certainty_token": -22.0625, + "step": 355 + }, + { + "completion_length": 47.40625, + "epoch": 0.39424141749723146, + "grad_norm": 0.5215231776237488, + "kl": 0.0260009765625, + "learning_rate": 7.775e-07, + "loss": -0.006568143609911203, + "reward": 2.1998695135116577, + "reward_std": 0.3379738926887512, + "rewards/GDino": 0.7524849772453308, + "rewards/GIT": 0.30117174983024597, + "rewards/HPSv2": 0.28067970275878906, + "rewards/ORM": 0.8655331134796143, + "self_certainty_semantic": -25.0, + "self_certainty_token": -21.5, + "step": 356 + }, + { + "completion_length": 39.484375, + "epoch": 0.3953488372093023, + "grad_norm": 0.44946345686912537, + "kl": 0.032470703125, + "learning_rate": 7.76875e-07, + "loss": -0.0038691945374011993, + "reward": 1.9695230722427368, + "reward_std": 0.3742541968822479, + "rewards/GDino": 0.6920569837093353, + "rewards/GIT": 0.31663452088832855, + "rewards/HPSv2": 0.2782859802246094, + "rewards/ORM": 0.6825457215309143, + "self_certainty_semantic": -25.3125, + "self_certainty_token": -22.0625, + "step": 357 + }, + { + "completion_length": 44.4375, + "epoch": 0.3964562569213732, + "grad_norm": 0.5392388701438904, + "kl": 0.03759765625, + "learning_rate": 7.7625e-07, + "loss": -0.012778437230736017, + "reward": 2.211210250854492, + "reward_std": 0.4425947368144989, + "rewards/GDino": 0.7417187690734863, + "rewards/GIT": 0.5134639292955399, + "rewards/HPSv2": 0.26231956481933594, + "rewards/ORM": 0.6937080323696136, + "self_certainty_semantic": -25.25, + "self_certainty_token": -21.0, + "step": 358 + }, + { + "completion_length": 49.65625, + "epoch": 0.39756367663344405, + "grad_norm": 0.5134049654006958, + "kl": 0.02960205078125, + "learning_rate": 7.75625e-07, + "loss": -0.012967417016625404, + "reward": 1.412535309791565, + "reward_std": 0.45060451328754425, + "rewards/GDino": 0.6461889147758484, + "rewards/GIT": 0.19755424559116364, + "rewards/HPSv2": 0.2508354187011719, + "rewards/ORM": 0.31795668601989746, + "self_certainty_semantic": -25.3125, + "self_certainty_token": -21.5625, + "step": 359 + }, + { + "completion_length": 35.609375, + "epoch": 0.39867109634551495, + "grad_norm": 1.2789876461029053, + "kl": 0.0343017578125, + "learning_rate": 7.75e-07, + "loss": -0.009992476087063551, + "reward": 2.073417007923126, + "reward_std": 0.38225309550762177, + "rewards/GDino": 0.7351497113704681, + "rewards/GIT": 0.5839340090751648, + "rewards/HPSv2": 0.27143287658691406, + "rewards/ORM": 0.48290038108825684, + "self_certainty_semantic": -25.1875, + "self_certainty_token": -21.1875, + "step": 360 + }, + { + "completion_length": 39.578125, + "epoch": 0.3997785160575858, + "grad_norm": 0.5060411691665649, + "kl": 0.03515625, + "learning_rate": 7.74375e-07, + "loss": -0.005461947526782751, + "reward": 2.1116931438446045, + "reward_std": 0.4044407308101654, + "rewards/GDino": 0.7036125659942627, + "rewards/GIT": 0.44938327372074127, + "rewards/HPSv2": 0.2743263244628906, + "rewards/ORM": 0.6843709945678711, + "self_certainty_semantic": -25.25, + "self_certainty_token": -21.125, + "step": 361 + }, + { + "completion_length": 46.796875, + "epoch": 0.4008859357696567, + "grad_norm": 0.5340003371238708, + "kl": 0.03424072265625, + "learning_rate": 7.7375e-07, + "loss": -0.006476349895820022, + "reward": 1.877703845500946, + "reward_std": 0.4012419879436493, + "rewards/GDino": 0.7290347218513489, + "rewards/GIT": 0.45925381779670715, + "rewards/HPSv2": 0.2832450866699219, + "rewards/ORM": 0.4061700701713562, + "self_certainty_semantic": -25.375, + "self_certainty_token": -22.375, + "step": 362 + }, + { + "completion_length": 49.234375, + "epoch": 0.4019933554817276, + "grad_norm": 0.6717630624771118, + "kl": 0.0328369140625, + "learning_rate": 7.731249999999999e-07, + "loss": -0.008118146331980824, + "reward": 1.9765561819076538, + "reward_std": 0.37670013308525085, + "rewards/GDino": 0.6909117698669434, + "rewards/GIT": 0.4758438169956207, + "rewards/HPSv2": 0.2585926055908203, + "rewards/ORM": 0.5512078106403351, + "self_certainty_semantic": -25.125, + "self_certainty_token": -20.5625, + "step": 363 + }, + { + "completion_length": 41.453125, + "epoch": 0.40310077519379844, + "grad_norm": 0.7043358087539673, + "kl": 0.0404052734375, + "learning_rate": 7.724999999999999e-07, + "loss": 0.03979423549026251, + "reward": 2.4432953596115112, + "reward_std": 0.2629931718111038, + "rewards/GDino": 0.7979166805744171, + "rewards/GIT": 0.5859342813491821, + "rewards/HPSv2": 0.28001976013183594, + "rewards/ORM": 0.7794246673583984, + "self_certainty_semantic": -25.3125, + "self_certainty_token": -21.5625, + "step": 364 + }, + { + "completion_length": 44.8125, + "epoch": 0.40420819490586934, + "grad_norm": 0.6534931063652039, + "kl": 0.055908203125, + "learning_rate": 7.718749999999999e-07, + "loss": -0.007944567129015923, + "reward": 1.8853873014450073, + "reward_std": 0.39929990470409393, + "rewards/GDino": 0.5773958265781403, + "rewards/GIT": 0.29893604665994644, + "rewards/HPSv2": 0.2607097625732422, + "rewards/ORM": 0.7483455240726471, + "self_certainty_semantic": -25.125, + "self_certainty_token": -21.25, + "step": 365 + }, + { + "completion_length": 41.546875, + "epoch": 0.4053156146179402, + "grad_norm": 0.5383189916610718, + "kl": 0.048095703125, + "learning_rate": 7.712499999999999e-07, + "loss": 0.013786105439066887, + "reward": 1.8352028131484985, + "reward_std": 0.4158010184764862, + "rewards/GDino": 0.6937500238418579, + "rewards/GIT": 0.19948893785476685, + "rewards/HPSv2": 0.26660919189453125, + "rewards/ORM": 0.6753546893596649, + "self_certainty_semantic": -25.375, + "self_certainty_token": -22.625, + "step": 366 + }, + { + "completion_length": 42.734375, + "epoch": 0.4064230343300111, + "grad_norm": 0.5055248141288757, + "kl": 0.0382080078125, + "learning_rate": 7.706249999999999e-07, + "loss": 0.01024163793772459, + "reward": 2.259072184562683, + "reward_std": 0.45819520950317383, + "rewards/GDino": 0.7351614236831665, + "rewards/GIT": 0.4725143015384674, + "rewards/HPSv2": 0.2722015380859375, + "rewards/ORM": 0.7791949510574341, + "self_certainty_semantic": -25.25, + "self_certainty_token": -22.3125, + "step": 367 + }, + { + "completion_length": 35.171875, + "epoch": 0.40753045404208194, + "grad_norm": 0.9312332272529602, + "kl": 0.0435791015625, + "learning_rate": 7.699999999999999e-07, + "loss": -0.02036240417510271, + "reward": 2.231248438358307, + "reward_std": 0.2120247781276703, + "rewards/GDino": 0.8772628009319305, + "rewards/GIT": 0.5012119859457016, + "rewards/HPSv2": 0.2806358337402344, + "rewards/ORM": 0.5721378028392792, + "self_certainty_semantic": -25.25, + "self_certainty_token": -22.3125, + "step": 368 + }, + { + "completion_length": 33.625, + "epoch": 0.40863787375415284, + "grad_norm": 2.0985968112945557, + "kl": 0.0401611328125, + "learning_rate": 7.69375e-07, + "loss": 0.0018158061429858208, + "reward": 2.397477388381958, + "reward_std": 0.40679168701171875, + "rewards/GDino": 0.8972452282905579, + "rewards/GIT": 0.629754438996315, + "rewards/HPSv2": 0.27672767639160156, + "rewards/ORM": 0.59375, + "self_certainty_semantic": -25.375, + "self_certainty_token": -21.5, + "step": 369 + }, + { + "completion_length": 44.203125, + "epoch": 0.4097452934662237, + "grad_norm": 0.6264777779579163, + "kl": 0.02813720703125, + "learning_rate": 7.6875e-07, + "loss": 0.022082864306867123, + "reward": 2.269244074821472, + "reward_std": 0.44469980895519257, + "rewards/GDino": 0.7298951148986816, + "rewards/GIT": 0.5341291725635529, + "rewards/HPSv2": 0.2665119171142578, + "rewards/ORM": 0.7387078404426575, + "self_certainty_semantic": -25.25, + "self_certainty_token": -21.5, + "step": 370 + }, + { + "completion_length": 38.28125, + "epoch": 0.4108527131782946, + "grad_norm": 0.6259306073188782, + "kl": 0.01837158203125, + "learning_rate": 7.68125e-07, + "loss": -0.0007002539932727814, + "reward": 2.4523496627807617, + "reward_std": 0.2647937461733818, + "rewards/GDino": 0.9088541865348816, + "rewards/GIT": 0.7134121358394623, + "rewards/HPSv2": 0.2628498077392578, + "rewards/ORM": 0.5672334432601929, + "self_certainty_semantic": -25.1875, + "self_certainty_token": -21.8125, + "step": 371 + }, + { + "completion_length": 45.765625, + "epoch": 0.4119601328903654, + "grad_norm": 0.6793559789657593, + "kl": 0.0372314453125, + "learning_rate": 7.675e-07, + "loss": 0.033544646576046944, + "reward": 2.0182183980941772, + "reward_std": 0.32042600214481354, + "rewards/GDino": 0.6397264897823334, + "rewards/GIT": 0.44353990256786346, + "rewards/HPSv2": 0.2689342498779297, + "rewards/ORM": 0.6660176515579224, + "self_certainty_semantic": -24.875, + "self_certainty_token": -21.1875, + "step": 372 + }, + { + "completion_length": 42.484375, + "epoch": 0.4130675526024363, + "grad_norm": 0.5748891234397888, + "kl": 0.041259765625, + "learning_rate": 7.66875e-07, + "loss": -0.012200751341879368, + "reward": 1.907440185546875, + "reward_std": 0.5503653585910797, + "rewards/GDino": 0.7475000023841858, + "rewards/GIT": 0.27429039031267166, + "rewards/HPSv2": 0.27011680603027344, + "rewards/ORM": 0.6155330836772919, + "self_certainty_semantic": -25.25, + "self_certainty_token": -22.5, + "step": 373 + }, + { + "completion_length": 45.109375, + "epoch": 0.4141749723145072, + "grad_norm": 0.6625725626945496, + "kl": 0.03057861328125, + "learning_rate": 7.6625e-07, + "loss": -0.005671075778082013, + "reward": 2.1532318592071533, + "reward_std": 0.3619081676006317, + "rewards/GDino": 0.8202139139175415, + "rewards/GIT": 0.5140966400504112, + "rewards/HPSv2": 0.2699394226074219, + "rewards/ORM": 0.5489819049835205, + "self_certainty_semantic": -25.25, + "self_certainty_token": -21.6875, + "step": 374 + }, + { + "completion_length": 33.640625, + "epoch": 0.4152823920265781, + "grad_norm": 0.5296221375465393, + "kl": 0.0487060546875, + "learning_rate": 7.65625e-07, + "loss": -0.0007148468866944313, + "reward": 2.4465746879577637, + "reward_std": 0.23157501220703125, + "rewards/GDino": 0.8728718757629395, + "rewards/GIT": 0.6998476386070251, + "rewards/HPSv2": 0.2731456756591797, + "rewards/ORM": 0.6007093787193298, + "self_certainty_semantic": -25.125, + "self_certainty_token": -21.5, + "step": 375 + }, + { + "completion_length": 34.859375, + "epoch": 0.416389811738649, + "grad_norm": 0.5901670455932617, + "kl": 0.04443359375, + "learning_rate": 7.65e-07, + "loss": 0.0027307383716106415, + "reward": 2.1622787714004517, + "reward_std": 0.42810335755348206, + "rewards/GDino": 0.7800533771514893, + "rewards/GIT": 0.408367857336998, + "rewards/HPSv2": 0.25921058654785156, + "rewards/ORM": 0.7146469056606293, + "self_certainty_semantic": -25.0, + "self_certainty_token": -21.25, + "step": 376 + }, + { + "completion_length": 46.25, + "epoch": 0.4174972314507198, + "grad_norm": 0.6768283843994141, + "kl": 0.03204345703125, + "learning_rate": 7.64375e-07, + "loss": 0.021322970744222403, + "reward": 1.8171244859695435, + "reward_std": 0.18127407226711512, + "rewards/GDino": 0.6758842766284943, + "rewards/GIT": 0.3382781371474266, + "rewards/HPSv2": 0.2610912322998047, + "rewards/ORM": 0.5418707728385925, + "self_certainty_semantic": -25.375, + "self_certainty_token": -21.3125, + "step": 377 + }, + { + "completion_length": 54.15625, + "epoch": 0.4186046511627907, + "grad_norm": 0.5024600625038147, + "kl": 0.034912109375, + "learning_rate": 7.6375e-07, + "loss": 0.0020371037535369396, + "reward": 2.338524103164673, + "reward_std": 0.43607713282108307, + "rewards/GDino": 0.7437655329704285, + "rewards/GIT": 0.506206676363945, + "rewards/HPSv2": 0.27953147888183594, + "rewards/ORM": 0.8090203106403351, + "self_certainty_semantic": -25.4375, + "self_certainty_token": -21.1875, + "step": 378 + }, + { + "completion_length": 39.265625, + "epoch": 0.41971207087486156, + "grad_norm": 1.9171770811080933, + "kl": 0.03411865234375, + "learning_rate": 7.63125e-07, + "loss": 0.002377159893512726, + "reward": 2.2468537092208862, + "reward_std": 0.20854417979717255, + "rewards/GDino": 0.7836942076683044, + "rewards/GIT": 0.5418351590633392, + "rewards/HPSv2": 0.260040283203125, + "rewards/ORM": 0.6612841784954071, + "self_certainty_semantic": -25.25, + "self_certainty_token": -22.3125, + "step": 379 + }, + { + "completion_length": 39.484375, + "epoch": 0.42081949058693247, + "grad_norm": 1.0356440544128418, + "kl": 0.03045654296875, + "learning_rate": 7.624999999999999e-07, + "loss": -0.014718352816998959, + "reward": 1.871756136417389, + "reward_std": 0.3412375822663307, + "rewards/GDino": 0.7783447802066803, + "rewards/GIT": 0.5043638348579407, + "rewards/HPSv2": 0.27927589416503906, + "rewards/ORM": 0.30977167189121246, + "self_certainty_semantic": -25.0, + "self_certainty_token": -21.6875, + "step": 380 + }, + { + "completion_length": 40.96875, + "epoch": 0.4219269102990033, + "grad_norm": 0.632671594619751, + "kl": 0.048095703125, + "learning_rate": 7.618749999999999e-07, + "loss": -0.01816024724394083, + "reward": 2.3965006470680237, + "reward_std": 0.34211961925029755, + "rewards/GDino": 0.8125, + "rewards/GIT": 0.6881425678730011, + "rewards/HPSv2": 0.2642993927001953, + "rewards/ORM": 0.631558746099472, + "self_certainty_semantic": -25.125, + "self_certainty_token": -21.0, + "step": 381 + }, + { + "completion_length": 42.890625, + "epoch": 0.4230343300110742, + "grad_norm": 0.5367975831031799, + "kl": 0.03173828125, + "learning_rate": 7.612499999999999e-07, + "loss": 0.005420586094260216, + "reward": 2.1896389722824097, + "reward_std": 0.3109910786151886, + "rewards/GDino": 0.7874999940395355, + "rewards/GIT": 0.49838483333587646, + "rewards/HPSv2": 0.27602577209472656, + "rewards/ORM": 0.6277283728122711, + "self_certainty_semantic": -25.25, + "self_certainty_token": -20.75, + "step": 382 + }, + { + "completion_length": 42.171875, + "epoch": 0.42414174972314506, + "grad_norm": 0.3789452910423279, + "kl": 0.03533935546875, + "learning_rate": 7.606249999999999e-07, + "loss": 0.014071432873606682, + "reward": 1.9684094190597534, + "reward_std": 0.5238681137561798, + "rewards/GDino": 0.7198299765586853, + "rewards/GIT": 0.5013139694929123, + "rewards/HPSv2": 0.2457408905029297, + "rewards/ORM": 0.5015245378017426, + "self_certainty_semantic": -25.3125, + "self_certainty_token": -22.25, + "step": 383 + }, + { + "completion_length": 40.90625, + "epoch": 0.42524916943521596, + "grad_norm": 2.8351011276245117, + "kl": 0.0416259765625, + "learning_rate": 7.599999999999999e-07, + "loss": -0.021148506551980972, + "reward": 2.102011501789093, + "reward_std": 0.2582581639289856, + "rewards/GDino": 0.7812051475048065, + "rewards/GIT": 0.5529660955071449, + "rewards/HPSv2": 0.27136993408203125, + "rewards/ORM": 0.4964703768491745, + "self_certainty_semantic": -25.1875, + "self_certainty_token": -21.25, + "step": 384 + }, + { + "completion_length": 51.046875, + "epoch": 0.4263565891472868, + "grad_norm": 0.6955005526542664, + "kl": 0.0413818359375, + "learning_rate": 7.59375e-07, + "loss": 0.0069068125449121, + "reward": 2.098905622959137, + "reward_std": 0.2885952666401863, + "rewards/GDino": 0.7392346262931824, + "rewards/GIT": 0.32319432497024536, + "rewards/HPSv2": 0.26218605041503906, + "rewards/ORM": 0.7742906510829926, + "self_certainty_semantic": -25.1875, + "self_certainty_token": -20.875, + "step": 385 + }, + { + "completion_length": 29.90625, + "epoch": 0.4274640088593577, + "grad_norm": 0.8517731428146362, + "kl": 0.0516357421875, + "learning_rate": 7.5875e-07, + "loss": 0.006788903381675482, + "reward": 2.4118454456329346, + "reward_std": 0.3538888171315193, + "rewards/GDino": 0.785937488079071, + "rewards/GIT": 0.5628680139780045, + "rewards/HPSv2": 0.28178977966308594, + "rewards/ORM": 0.78125, + "self_certainty_semantic": -25.1875, + "self_certainty_token": -22.4375, + "step": 386 + }, + { + "completion_length": 36.765625, + "epoch": 0.42857142857142855, + "grad_norm": 0.6707489490509033, + "kl": 0.0411376953125, + "learning_rate": 7.58125e-07, + "loss": 0.014098147861659527, + "reward": 2.149623155593872, + "reward_std": 0.42522846162319183, + "rewards/GDino": 0.8425147533416748, + "rewards/GIT": 0.49068495631217957, + "rewards/HPSv2": 0.26511573791503906, + "rewards/ORM": 0.5513076782226562, + "self_certainty_semantic": -25.125, + "self_certainty_token": -21.5625, + "step": 387 + }, + { + "completion_length": 37.625, + "epoch": 0.42967884828349945, + "grad_norm": 0.4950082004070282, + "kl": 0.0244140625, + "learning_rate": 7.575e-07, + "loss": -0.012308157398365438, + "reward": 2.5664467811584473, + "reward_std": 0.319002628326416, + "rewards/GDino": 0.8304032683372498, + "rewards/GIT": 0.6985468864440918, + "rewards/HPSv2": 0.2795543670654297, + "rewards/ORM": 0.7579423785209656, + "self_certainty_semantic": -25.0625, + "self_certainty_token": -22.125, + "step": 388 + }, + { + "completion_length": 41.625, + "epoch": 0.43078626799557035, + "grad_norm": 0.7242435216903687, + "kl": 0.0421142578125, + "learning_rate": 7.56875e-07, + "loss": -0.0010866057127714157, + "reward": 2.184369921684265, + "reward_std": 0.47019390761852264, + "rewards/GDino": 0.7906249761581421, + "rewards/GIT": 0.46294473111629486, + "rewards/HPSv2": 0.2710704803466797, + "rewards/ORM": 0.6597297042608261, + "self_certainty_semantic": -25.3125, + "self_certainty_token": -21.625, + "step": 389 + }, + { + "completion_length": 50.125, + "epoch": 0.4318936877076412, + "grad_norm": 1.1640534400939941, + "kl": 0.030517578125, + "learning_rate": 7.5625e-07, + "loss": -0.019844084046781063, + "reward": 2.396089255809784, + "reward_std": 0.24368739873170853, + "rewards/GDino": 0.7795874178409576, + "rewards/GIT": 0.6525524854660034, + "rewards/HPSv2": 0.26082420349121094, + "rewards/ORM": 0.703125, + "self_certainty_semantic": -25.25, + "self_certainty_token": -21.25, + "step": 390 + }, + { + "completion_length": 35.984375, + "epoch": 0.4330011074197121, + "grad_norm": 0.5103877782821655, + "kl": 0.0311279296875, + "learning_rate": 7.55625e-07, + "loss": -0.008262321585789323, + "reward": 2.406006336212158, + "reward_std": 0.23674829304218292, + "rewards/GDino": 0.8994598388671875, + "rewards/GIT": 0.6523403823375702, + "rewards/HPSv2": 0.2604560852050781, + "rewards/ORM": 0.5937500149011612, + "self_certainty_semantic": -25.125, + "self_certainty_token": -21.9375, + "step": 391 + }, + { + "completion_length": 43.921875, + "epoch": 0.43410852713178294, + "grad_norm": 0.497820645570755, + "kl": 0.042724609375, + "learning_rate": 7.55e-07, + "loss": -0.00259082525735721, + "reward": 1.8813644647598267, + "reward_std": 0.4379560351371765, + "rewards/GDino": 0.7333734929561615, + "rewards/GIT": 0.3741501718759537, + "rewards/HPSv2": 0.2706317901611328, + "rewards/ORM": 0.5032089352607727, + "self_certainty_semantic": -25.1875, + "self_certainty_token": -21.5, + "step": 392 + }, + { + "completion_length": 30.484375, + "epoch": 0.43521594684385384, + "grad_norm": 0.6349219083786011, + "kl": 0.0517578125, + "learning_rate": 7.54375e-07, + "loss": -0.013653552625328302, + "reward": 2.2154151797294617, + "reward_std": 0.2131977081298828, + "rewards/GDino": 0.8221572637557983, + "rewards/GIT": 0.4915757179260254, + "rewards/HPSv2": 0.28302001953125, + "rewards/ORM": 0.6186620593070984, + "self_certainty_semantic": -25.0625, + "self_certainty_token": -22.8125, + "step": 393 + }, + { + "completion_length": 39.859375, + "epoch": 0.4363233665559247, + "grad_norm": 0.569675087928772, + "kl": 0.03436279296875, + "learning_rate": 7.5375e-07, + "loss": -0.006760713644325733, + "reward": 2.4199535846710205, + "reward_std": 0.302301824092865, + "rewards/GDino": 0.9265625178813934, + "rewards/GIT": 0.4195697158575058, + "rewards/HPSv2": 0.264801025390625, + "rewards/ORM": 0.8090203106403351, + "self_certainty_semantic": -25.1875, + "self_certainty_token": -22.4375, + "step": 394 + }, + { + "completion_length": 38.171875, + "epoch": 0.4374307862679956, + "grad_norm": 1.2075111865997314, + "kl": 0.0443115234375, + "learning_rate": 7.53125e-07, + "loss": 0.009720378555357456, + "reward": 2.379997491836548, + "reward_std": 0.34344032406806946, + "rewards/GDino": 0.8730711340904236, + "rewards/GIT": 0.5321757197380066, + "rewards/HPSv2": 0.2716255187988281, + "rewards/ORM": 0.7031249701976776, + "self_certainty_semantic": -25.375, + "self_certainty_token": -21.5625, + "step": 395 + }, + { + "completion_length": 38.6875, + "epoch": 0.43853820598006643, + "grad_norm": 342.09454345703125, + "kl": 5.0242919921875, + "learning_rate": 7.524999999999999e-07, + "loss": 0.027052484452724457, + "reward": 2.2779553532600403, + "reward_std": 0.4027100205421448, + "rewards/GDino": 0.8293097615242004, + "rewards/GIT": 0.4535738229751587, + "rewards/HPSv2": 0.2744464874267578, + "rewards/ORM": 0.7206252217292786, + "self_certainty_semantic": -25.0, + "self_certainty_token": -21.25, + "step": 396 + }, + { + "completion_length": 38.515625, + "epoch": 0.43964562569213733, + "grad_norm": 0.5501837730407715, + "kl": 0.03302001953125, + "learning_rate": 7.518749999999999e-07, + "loss": -0.02949504740536213, + "reward": 2.547436833381653, + "reward_std": 0.37221017479896545, + "rewards/GDino": 0.928906261920929, + "rewards/GIT": 0.6719013452529907, + "rewards/HPSv2": 0.2500629425048828, + "rewards/ORM": 0.696566253900528, + "self_certainty_semantic": -25.125, + "self_certainty_token": -21.625, + "step": 397 + }, + { + "completion_length": 36.8125, + "epoch": 0.4407530454042082, + "grad_norm": 1.14122474193573, + "kl": 0.0433349609375, + "learning_rate": 7.512499999999999e-07, + "loss": 0.0011832071468234062, + "reward": 2.6030901670455933, + "reward_std": 0.367518350481987, + "rewards/GDino": 0.8026041388511658, + "rewards/GIT": 0.6320919096469879, + "rewards/HPSv2": 0.2777690887451172, + "rewards/ORM": 0.890625, + "self_certainty_semantic": -25.1875, + "self_certainty_token": -21.1875, + "step": 398 + }, + { + "completion_length": 46.546875, + "epoch": 0.4418604651162791, + "grad_norm": 0.5975333452224731, + "kl": 0.04150390625, + "learning_rate": 7.506249999999999e-07, + "loss": 0.016775197349488735, + "reward": 1.8540863394737244, + "reward_std": 0.29581695050001144, + "rewards/GDino": 0.6284334063529968, + "rewards/GIT": 0.23308932036161423, + "rewards/HPSv2": 0.25658416748046875, + "rewards/ORM": 0.7359794676303864, + "self_certainty_semantic": -25.0625, + "self_certainty_token": -21.5625, + "step": 399 + }, + { + "completion_length": 36.34375, + "epoch": 0.4429678848283499, + "grad_norm": 1.3476744890213013, + "kl": 0.0401611328125, + "learning_rate": 7.5e-07, + "loss": -0.013706179801374674, + "reward": 2.096378445625305, + "reward_std": 0.362843781709671, + "rewards/GDino": 0.7787023782730103, + "rewards/GIT": 0.4101857915520668, + "rewards/HPSv2": 0.27315330505371094, + "rewards/ORM": 0.6343370079994202, + "self_certainty_semantic": -25.125, + "self_certainty_token": -21.0625, + "step": 400 + }, + { + "completion_length": 34.140625, + "epoch": 0.4440753045404208, + "grad_norm": 0.7070680260658264, + "kl": 0.03662109375, + "learning_rate": 7.49375e-07, + "loss": -0.017339009791612625, + "reward": 2.837607264518738, + "reward_std": 0.2649471387267113, + "rewards/GDino": 0.971875011920929, + "rewards/GIT": 0.7277626097202301, + "rewards/HPSv2": 0.2670707702636719, + "rewards/ORM": 0.8708988130092621, + "self_certainty_semantic": -25.1875, + "self_certainty_token": -21.4375, + "step": 401 + }, + { + "completion_length": 36.9375, + "epoch": 0.44518272425249167, + "grad_norm": 0.466122567653656, + "kl": 0.038818359375, + "learning_rate": 7.4875e-07, + "loss": -0.00474184425547719, + "reward": 2.21277391910553, + "reward_std": 0.2819427028298378, + "rewards/GDino": 0.7564797103404999, + "rewards/GIT": 0.4613865911960602, + "rewards/HPSv2": 0.2831687927246094, + "rewards/ORM": 0.7117387652397156, + "self_certainty_semantic": -25.1875, + "self_certainty_token": -22.4375, + "step": 402 + }, + { + "completion_length": 41.96875, + "epoch": 0.44629014396456257, + "grad_norm": 0.6223385334014893, + "kl": 0.041015625, + "learning_rate": 7.48125e-07, + "loss": 0.006069277413189411, + "reward": 1.8274257183074951, + "reward_std": 0.277290478348732, + "rewards/GDino": 0.5828124582767487, + "rewards/GIT": 0.23541270196437836, + "rewards/HPSv2": 0.2838916778564453, + "rewards/ORM": 0.725308746099472, + "self_certainty_semantic": -25.0625, + "self_certainty_token": -21.75, + "step": 403 + }, + { + "completion_length": 41.828125, + "epoch": 0.44739756367663347, + "grad_norm": 0.8799866437911987, + "kl": 0.045654296875, + "learning_rate": 7.475e-07, + "loss": 0.0017385446117259562, + "reward": 1.9244996309280396, + "reward_std": 0.3593953996896744, + "rewards/GDino": 0.6904172003269196, + "rewards/GIT": 0.3588860034942627, + "rewards/HPSv2": 0.2580089569091797, + "rewards/ORM": 0.6171875, + "self_certainty_semantic": -25.1875, + "self_certainty_token": -22.0, + "step": 404 + }, + { + "completion_length": 49.75, + "epoch": 0.4485049833887043, + "grad_norm": 1.2903233766555786, + "kl": 0.0443115234375, + "learning_rate": 7.468749999999999e-07, + "loss": 0.004371013026684523, + "reward": 2.7583900690078735, + "reward_std": 0.2406376302242279, + "rewards/GDino": 0.8804058730602264, + "rewards/GIT": 0.6657536029815674, + "rewards/HPSv2": 0.2673187255859375, + "rewards/ORM": 0.9449118673801422, + "self_certainty_semantic": -25.25, + "self_certainty_token": -21.4375, + "step": 405 + }, + { + "completion_length": 43.109375, + "epoch": 0.4496124031007752, + "grad_norm": 1.4691380262374878, + "kl": 0.046630859375, + "learning_rate": 7.4625e-07, + "loss": 0.016827881336212158, + "reward": 1.6681452989578247, + "reward_std": 0.37892957031726837, + "rewards/GDino": 0.6188920736312866, + "rewards/GIT": 0.03339381515979767, + "rewards/HPSv2": 0.282958984375, + "rewards/ORM": 0.7329003810882568, + "self_certainty_semantic": -25.25, + "self_certainty_token": -21.375, + "step": 406 + }, + { + "completion_length": 38.0, + "epoch": 0.45071982281284606, + "grad_norm": 0.7037851810455322, + "kl": 0.0367431640625, + "learning_rate": 7.45625e-07, + "loss": -0.017478429013863206, + "reward": 1.9953652024269104, + "reward_std": 0.364028662443161, + "rewards/GDino": 0.7403169274330139, + "rewards/GIT": 0.406859390437603, + "rewards/HPSv2": 0.2685394287109375, + "rewards/ORM": 0.5796495079994202, + "self_certainty_semantic": -24.9375, + "self_certainty_token": -22.25, + "step": 407 + }, + { + "completion_length": 44.0625, + "epoch": 0.45182724252491696, + "grad_norm": 0.5131052136421204, + "kl": 0.0306396484375, + "learning_rate": 7.45e-07, + "loss": 0.005040338612161577, + "reward": 2.041200637817383, + "reward_std": 0.3807089328765869, + "rewards/GDino": 0.7476771473884583, + "rewards/GIT": 0.4045151174068451, + "rewards/HPSv2": 0.25945472717285156, + "rewards/ORM": 0.6295536458492279, + "self_certainty_semantic": -25.0625, + "self_certainty_token": -22.125, + "step": 408 + }, + { + "completion_length": 38.34375, + "epoch": 0.4529346622369878, + "grad_norm": 0.636464536190033, + "kl": 0.0491943359375, + "learning_rate": 7.44375e-07, + "loss": -0.03190365340560675, + "reward": 2.7317603826522827, + "reward_std": 0.4104766994714737, + "rewards/GDino": 0.9453125, + "rewards/GIT": 0.7198237776756287, + "rewards/HPSv2": 0.27568626403808594, + "rewards/ORM": 0.7909377217292786, + "self_certainty_semantic": -25.1875, + "self_certainty_token": -20.9375, + "step": 409 + }, + { + "completion_length": 33.546875, + "epoch": 0.4540420819490587, + "grad_norm": 0.8647944927215576, + "kl": 0.042724609375, + "learning_rate": 7.4375e-07, + "loss": -0.003960296045988798, + "reward": 2.0273808240890503, + "reward_std": 0.3602750897407532, + "rewards/GDino": 0.8088895082473755, + "rewards/GIT": 0.4431849420070648, + "rewards/HPSv2": 0.2676239013671875, + "rewards/ORM": 0.5076826512813568, + "self_certainty_semantic": -25.0625, + "self_certainty_token": -22.375, + "step": 410 + }, + { + "completion_length": 42.140625, + "epoch": 0.45514950166112955, + "grad_norm": 0.4983558654785156, + "kl": 0.0391845703125, + "learning_rate": 7.43125e-07, + "loss": -0.019663702347315848, + "reward": 2.568248152732849, + "reward_std": 0.22275355458259583, + "rewards/GDino": 0.9209606051445007, + "rewards/GIT": 0.6949225664138794, + "rewards/HPSv2": 0.2756767272949219, + "rewards/ORM": 0.6766883730888367, + "self_certainty_semantic": -25.25, + "self_certainty_token": -21.8125, + "step": 411 + }, + { + "completion_length": 36.234375, + "epoch": 0.45625692137320045, + "grad_norm": 2.719592332839966, + "kl": 0.0440673828125, + "learning_rate": 7.425e-07, + "loss": -0.012157580815255642, + "reward": 2.434681475162506, + "reward_std": 0.13174384832382202, + "rewards/GDino": 0.8247999548912048, + "rewards/GIT": 0.46177390962839127, + "rewards/HPSv2": 0.27869415283203125, + "rewards/ORM": 0.869413435459137, + "self_certainty_semantic": -25.1875, + "self_certainty_token": -21.0, + "step": 412 + }, + { + "completion_length": 36.046875, + "epoch": 0.4573643410852713, + "grad_norm": 0.5472164750099182, + "kl": 0.0443115234375, + "learning_rate": 7.418749999999999e-07, + "loss": 0.001380805391818285, + "reward": 1.6478554010391235, + "reward_std": 0.39775100350379944, + "rewards/GDino": 0.739335834980011, + "rewards/GIT": 0.12186351418495178, + "rewards/HPSv2": 0.271331787109375, + "rewards/ORM": 0.5153242349624634, + "self_certainty_semantic": -25.3125, + "self_certainty_token": -22.0, + "step": 413 + }, + { + "completion_length": 42.03125, + "epoch": 0.4584717607973422, + "grad_norm": 0.9818385243415833, + "kl": 0.045654296875, + "learning_rate": 7.412499999999999e-07, + "loss": 0.00032033328898251057, + "reward": 2.244621515274048, + "reward_std": 0.3542056977748871, + "rewards/GDino": 0.7868304252624512, + "rewards/GIT": 0.3334681913256645, + "rewards/HPSv2": 0.28057289123535156, + "rewards/ORM": 0.84375, + "self_certainty_semantic": -25.3125, + "self_certainty_token": -21.75, + "step": 414 + }, + { + "completion_length": 44.421875, + "epoch": 0.45957918050941304, + "grad_norm": 1.3210840225219727, + "kl": 0.043212890625, + "learning_rate": 7.406249999999999e-07, + "loss": 0.01814082730561495, + "reward": 2.018928349018097, + "reward_std": 0.5640377998352051, + "rewards/GDino": 0.7119726538658142, + "rewards/GIT": 0.3103228211402893, + "rewards/HPSv2": 0.26573753356933594, + "rewards/ORM": 0.7308953106403351, + "self_certainty_semantic": -25.125, + "self_certainty_token": -22.1875, + "step": 415 + }, + { + "completion_length": 37.140625, + "epoch": 0.46068660022148394, + "grad_norm": 0.6978505253791809, + "kl": 0.040283203125, + "learning_rate": 7.4e-07, + "loss": 0.02266329526901245, + "reward": 1.9420585632324219, + "reward_std": 0.22528188675642014, + "rewards/GDino": 0.6741819083690643, + "rewards/GIT": 0.326813742518425, + "rewards/HPSv2": 0.25824642181396484, + "rewards/ORM": 0.6828164756298065, + "self_certainty_semantic": -25.125, + "self_certainty_token": -22.375, + "step": 416 + }, + { + "completion_length": 47.140625, + "epoch": 0.46179401993355484, + "grad_norm": 0.6955863833427429, + "kl": 0.02899169921875, + "learning_rate": 7.39375e-07, + "loss": -0.022133303806185722, + "reward": 2.2546093463897705, + "reward_std": 0.37237268686294556, + "rewards/GDino": 0.8068858683109283, + "rewards/GIT": 0.51471146941185, + "rewards/HPSv2": 0.25958251953125, + "rewards/ORM": 0.6734296083450317, + "self_certainty_semantic": -25.125, + "self_certainty_token": -21.4375, + "step": 417 + }, + { + "completion_length": 36.25, + "epoch": 0.4629014396456257, + "grad_norm": 0.6875625252723694, + "kl": 0.0550537109375, + "learning_rate": 7.3875e-07, + "loss": -0.02608813438564539, + "reward": 1.9164315462112427, + "reward_std": 0.4128989577293396, + "rewards/GDino": 0.737500011920929, + "rewards/GIT": 0.48389841616153717, + "rewards/HPSv2": 0.25507545471191406, + "rewards/ORM": 0.4399575889110565, + "self_certainty_semantic": -24.9375, + "self_certainty_token": -21.8125, + "step": 418 + }, + { + "completion_length": 42.359375, + "epoch": 0.4640088593576966, + "grad_norm": 0.5099749565124512, + "kl": 0.035888671875, + "learning_rate": 7.38125e-07, + "loss": 0.012332713231444359, + "reward": 2.1047881841659546, + "reward_std": 0.3690176010131836, + "rewards/GDino": 0.6977547407150269, + "rewards/GIT": 0.5606772899627686, + "rewards/HPSv2": 0.24872398376464844, + "rewards/ORM": 0.5976322144269943, + "self_certainty_semantic": -25.0, + "self_certainty_token": -21.875, + "step": 419 + }, + { + "completion_length": 47.796875, + "epoch": 0.46511627906976744, + "grad_norm": 0.7968041896820068, + "kl": 0.02825927734375, + "learning_rate": 7.375e-07, + "loss": -0.007890310138463974, + "reward": 2.5399848222732544, + "reward_std": 0.2056763395667076, + "rewards/GDino": 0.9003763794898987, + "rewards/GIT": 0.6921442151069641, + "rewards/HPSv2": 0.28192710876464844, + "rewards/ORM": 0.6655370742082596, + "self_certainty_semantic": -25.4375, + "self_certainty_token": -22.1875, + "step": 420 + }, + { + "completion_length": 40.078125, + "epoch": 0.46622369878183834, + "grad_norm": 0.5168143510818481, + "kl": 0.039794921875, + "learning_rate": 7.368749999999999e-07, + "loss": 7.763970643281937e-06, + "reward": 2.2306275367736816, + "reward_std": 0.19530287384986877, + "rewards/GDino": 0.7578124701976776, + "rewards/GIT": 0.5601051151752472, + "rewards/HPSv2": 0.2732887268066406, + "rewards/ORM": 0.6394211947917938, + "self_certainty_semantic": -25.1875, + "self_certainty_token": -21.9375, + "step": 421 + }, + { + "completion_length": 47.09375, + "epoch": 0.4673311184939092, + "grad_norm": 0.6787537932395935, + "kl": 0.0335693359375, + "learning_rate": 7.362499999999999e-07, + "loss": 0.023679735139012337, + "reward": 2.0749627351760864, + "reward_std": 0.2599632367491722, + "rewards/GDino": 0.7587704658508301, + "rewards/GIT": 0.5343009531497955, + "rewards/HPSv2": 0.2624359130859375, + "rewards/ORM": 0.5194553732872009, + "self_certainty_semantic": -25.0, + "self_certainty_token": -22.1875, + "step": 422 + }, + { + "completion_length": 45.15625, + "epoch": 0.4684385382059801, + "grad_norm": 0.5317529439926147, + "kl": 0.03717041015625, + "learning_rate": 7.356249999999999e-07, + "loss": 0.0031719468533992767, + "reward": 2.253452777862549, + "reward_std": 0.2863696217536926, + "rewards/GDino": 0.8162786066532135, + "rewards/GIT": 0.5946633368730545, + "rewards/HPSv2": 0.2769317626953125, + "rewards/ORM": 0.5655790567398071, + "self_certainty_semantic": -25.0625, + "self_certainty_token": -22.375, + "step": 423 + }, + { + "completion_length": 43.578125, + "epoch": 0.4695459579180509, + "grad_norm": 1.1308730840682983, + "kl": 0.033447265625, + "learning_rate": 7.35e-07, + "loss": -0.0021982135949656367, + "reward": 1.969843864440918, + "reward_std": 0.39374539256095886, + "rewards/GDino": 0.7518229186534882, + "rewards/GIT": 0.3873000741004944, + "rewards/HPSv2": 0.2738075256347656, + "rewards/ORM": 0.556913435459137, + "self_certainty_semantic": -24.9375, + "self_certainty_token": -21.75, + "step": 424 + }, + { + "completion_length": 44.140625, + "epoch": 0.4706533776301218, + "grad_norm": 0.9367016553878784, + "kl": 0.0478515625, + "learning_rate": 7.34375e-07, + "loss": 0.0006617754697799683, + "reward": 1.942332684993744, + "reward_std": 0.3015865758061409, + "rewards/GDino": 0.7671928405761719, + "rewards/GIT": 0.37167659401893616, + "rewards/HPSv2": 0.27842140197753906, + "rewards/ORM": 0.5250419676303864, + "self_certainty_semantic": -24.9375, + "self_certainty_token": -21.375, + "step": 425 + }, + { + "completion_length": 39.921875, + "epoch": 0.4717607973421927, + "grad_norm": 0.5315147638320923, + "kl": 0.03662109375, + "learning_rate": 7.3375e-07, + "loss": 0.010861005634069443, + "reward": 1.9369670152664185, + "reward_std": 0.2872409373521805, + "rewards/GDino": 0.7253109216690063, + "rewards/GIT": 0.3525398448109627, + "rewards/HPSv2": 0.2622871398925781, + "rewards/ORM": 0.5968290567398071, + "self_certainty_semantic": -25.125, + "self_certainty_token": -21.9375, + "step": 426 + }, + { + "completion_length": 49.578125, + "epoch": 0.4728682170542636, + "grad_norm": 1.4368230104446411, + "kl": 0.02911376953125, + "learning_rate": 7.33125e-07, + "loss": 0.030126910656690598, + "reward": 2.7364684343338013, + "reward_std": 0.26522858440876007, + "rewards/GDino": 0.953125, + "rewards/GIT": 0.9096876680850983, + "rewards/HPSv2": 0.2548637390136719, + "rewards/ORM": 0.618791937828064, + "self_certainty_semantic": -25.3125, + "self_certainty_token": -21.125, + "step": 427 + }, + { + "completion_length": 31.109375, + "epoch": 0.4739756367663344, + "grad_norm": 0.8413992524147034, + "kl": 0.036865234375, + "learning_rate": 7.325e-07, + "loss": 0.0009733177721500397, + "reward": 2.0109975337982178, + "reward_std": 0.3149056136608124, + "rewards/GDino": 0.8508215546607971, + "rewards/GIT": 0.31817829608917236, + "rewards/HPSv2": 0.2774925231933594, + "rewards/ORM": 0.5645051002502441, + "self_certainty_semantic": -25.1875, + "self_certainty_token": -22.5625, + "step": 428 + }, + { + "completion_length": 38.0625, + "epoch": 0.4750830564784053, + "grad_norm": 0.5556603074073792, + "kl": 0.055908203125, + "learning_rate": 7.31875e-07, + "loss": -0.01675282884389162, + "reward": 1.731520414352417, + "reward_std": 0.5266979932785034, + "rewards/GDino": 0.6363127827644348, + "rewards/GIT": 0.21719177067279816, + "rewards/HPSv2": 0.26899147033691406, + "rewards/ORM": 0.6090243458747864, + "self_certainty_semantic": -25.0625, + "self_certainty_token": -21.25, + "step": 429 + }, + { + "completion_length": 48.203125, + "epoch": 0.47619047619047616, + "grad_norm": 0.5894951820373535, + "kl": 0.052734375, + "learning_rate": 7.312499999999999e-07, + "loss": 0.027036309242248535, + "reward": 1.872725009918213, + "reward_std": 0.39114080369472504, + "rewards/GDino": 0.6521097123622894, + "rewards/GIT": 0.2845214605331421, + "rewards/HPSv2": 0.2642688751220703, + "rewards/ORM": 0.6718250513076782, + "self_certainty_semantic": -24.9375, + "self_certainty_token": -21.875, + "step": 430 + }, + { + "completion_length": 55.046875, + "epoch": 0.47729789590254706, + "grad_norm": 0.5126261711120605, + "kl": 0.04833984375, + "learning_rate": 7.306249999999999e-07, + "loss": -0.004950821399688721, + "reward": 2.2202555537223816, + "reward_std": 0.2636689841747284, + "rewards/GDino": 0.7635050415992737, + "rewards/GIT": 0.5405832678079605, + "rewards/HPSv2": 0.27193260192871094, + "rewards/ORM": 0.6442345380783081, + "self_certainty_semantic": -25.125, + "self_certainty_token": -21.0625, + "step": 431 + }, + { + "completion_length": 42.40625, + "epoch": 0.47840531561461797, + "grad_norm": 0.8860141038894653, + "kl": 0.039794921875, + "learning_rate": 7.3e-07, + "loss": 0.005071769934147596, + "reward": 1.8272140622138977, + "reward_std": 0.2828010469675064, + "rewards/GDino": 0.6431768834590912, + "rewards/GIT": 0.29858846962451935, + "rewards/HPSv2": 0.252716064453125, + "rewards/ORM": 0.632732629776001, + "self_certainty_semantic": -25.0625, + "self_certainty_token": -21.875, + "step": 432 + }, + { + "completion_length": 38.375, + "epoch": 0.4795127353266888, + "grad_norm": 1.3662358522415161, + "kl": 0.0472412109375, + "learning_rate": 7.29375e-07, + "loss": 0.0014977240934967995, + "reward": 2.062899112701416, + "reward_std": 0.5158149152994156, + "rewards/GDino": 0.7304880619049072, + "rewards/GIT": 0.3485347479581833, + "rewards/HPSv2": 0.2774925231933594, + "rewards/ORM": 0.7063838839530945, + "self_certainty_semantic": -25.1875, + "self_certainty_token": -22.0625, + "step": 433 + }, + { + "completion_length": 43.515625, + "epoch": 0.4806201550387597, + "grad_norm": 0.49138492345809937, + "kl": 0.0408935546875, + "learning_rate": 7.2875e-07, + "loss": -0.023660541512072086, + "reward": 2.264481842517853, + "reward_std": 0.3346661627292633, + "rewards/GDino": 0.8219202160835266, + "rewards/GIT": 0.5395843833684921, + "rewards/HPSv2": 0.2608776092529297, + "rewards/ORM": 0.6420996189117432, + "self_certainty_semantic": -25.0625, + "self_certainty_token": -22.375, + "step": 434 + }, + { + "completion_length": 52.78125, + "epoch": 0.48172757475083056, + "grad_norm": 0.7937164902687073, + "kl": 0.0439453125, + "learning_rate": 7.28125e-07, + "loss": -0.005721581168472767, + "reward": 1.99281644821167, + "reward_std": 0.4047849476337433, + "rewards/GDino": 0.8185839653015137, + "rewards/GIT": 0.39505699276924133, + "rewards/HPSv2": 0.2604713439941406, + "rewards/ORM": 0.5187040269374847, + "self_certainty_semantic": -25.1875, + "self_certainty_token": -21.6875, + "step": 435 + }, + { + "completion_length": 42.078125, + "epoch": 0.48283499446290146, + "grad_norm": 1.5256606340408325, + "kl": 0.0430908203125, + "learning_rate": 7.275e-07, + "loss": -0.008313903585076332, + "reward": 2.0975226163864136, + "reward_std": 0.3734524995088577, + "rewards/GDino": 0.7363665997982025, + "rewards/GIT": 0.4460279196500778, + "rewards/HPSv2": 0.2857952117919922, + "rewards/ORM": 0.6293327808380127, + "self_certainty_semantic": -25.125, + "self_certainty_token": -22.0625, + "step": 436 + }, + { + "completion_length": 50.0625, + "epoch": 0.4839424141749723, + "grad_norm": 0.49155765771865845, + "kl": 0.0369873046875, + "learning_rate": 7.26875e-07, + "loss": 0.021884786896407604, + "reward": 2.283669114112854, + "reward_std": 0.3140418156981468, + "rewards/GDino": 0.8053280115127563, + "rewards/GIT": 0.6532137989997864, + "rewards/HPSv2": 0.25914764404296875, + "rewards/ORM": 0.5659796893596649, + "self_certainty_semantic": -25.0625, + "self_certainty_token": -22.1875, + "step": 437 + }, + { + "completion_length": 47.6875, + "epoch": 0.4850498338870432, + "grad_norm": 0.6744347810745239, + "kl": 0.0421142578125, + "learning_rate": 7.262499999999999e-07, + "loss": -0.01202036626636982, + "reward": 2.3710286617279053, + "reward_std": 0.2711103931069374, + "rewards/GDino": 0.7963541746139526, + "rewards/GIT": 0.6432071924209595, + "rewards/HPSv2": 0.25798797607421875, + "rewards/ORM": 0.6734794974327087, + "self_certainty_semantic": -25.125, + "self_certainty_token": -21.875, + "step": 438 + }, + { + "completion_length": 40.703125, + "epoch": 0.48615725359911405, + "grad_norm": 0.4731282591819763, + "kl": 0.0450439453125, + "learning_rate": 7.256249999999999e-07, + "loss": -0.0025211619213223457, + "reward": 2.326407551765442, + "reward_std": 0.3486577272415161, + "rewards/GDino": 0.8489097654819489, + "rewards/GIT": 0.5543791800737381, + "rewards/HPSv2": 0.2765064239501953, + "rewards/ORM": 0.6466122269630432, + "self_certainty_semantic": -25.0625, + "self_certainty_token": -21.6875, + "step": 439 + }, + { + "completion_length": 42.25, + "epoch": 0.48726467331118495, + "grad_norm": 1.9416545629501343, + "kl": 0.0430908203125, + "learning_rate": 7.249999999999999e-07, + "loss": 0.026762551395222545, + "reward": 2.0378406643867493, + "reward_std": 0.3052428662776947, + "rewards/GDino": 0.7424247860908508, + "rewards/GIT": 0.3270244523882866, + "rewards/HPSv2": 0.28089141845703125, + "rewards/ORM": 0.6875, + "self_certainty_semantic": -25.1875, + "self_certainty_token": -22.3125, + "step": 440 + }, + { + "completion_length": 38.78125, + "epoch": 0.4883720930232558, + "grad_norm": 0.5084402561187744, + "kl": 0.046875, + "learning_rate": 7.243749999999999e-07, + "loss": -0.001582828350365162, + "reward": 2.3616641759872437, + "reward_std": 0.35844916850328445, + "rewards/GDino": 0.8143379390239716, + "rewards/GIT": 0.5581052601337433, + "rewards/HPSv2": 0.27395057678222656, + "rewards/ORM": 0.7152703106403351, + "self_certainty_semantic": -25.125, + "self_certainty_token": -22.25, + "step": 441 + }, + { + "completion_length": 44.125, + "epoch": 0.4894795127353267, + "grad_norm": 0.8177931904792786, + "kl": 0.0474853515625, + "learning_rate": 7.2375e-07, + "loss": 0.005483056418597698, + "reward": 1.7684773802757263, + "reward_std": 0.5146408230066299, + "rewards/GDino": 0.664166659116745, + "rewards/GIT": 0.25258100032806396, + "rewards/HPSv2": 0.2692718505859375, + "rewards/ORM": 0.5824578404426575, + "self_certainty_semantic": -25.3125, + "self_certainty_token": -21.75, + "step": 442 + }, + { + "completion_length": 35.5, + "epoch": 0.49058693244739754, + "grad_norm": 0.6697931885719299, + "kl": 0.0384521484375, + "learning_rate": 7.23125e-07, + "loss": -0.001371758058667183, + "reward": 1.694158136844635, + "reward_std": 0.26968376338481903, + "rewards/GDino": 0.6942708194255829, + "rewards/GIT": 0.43527622520923615, + "rewards/HPSv2": 0.24617385864257812, + "rewards/ORM": 0.31843727827072144, + "self_certainty_semantic": -25.0, + "self_certainty_token": -21.875, + "step": 443 + }, + { + "completion_length": 43.078125, + "epoch": 0.49169435215946844, + "grad_norm": 0.6402589082717896, + "kl": 0.0433349609375, + "learning_rate": 7.225e-07, + "loss": 0.03987434785813093, + "reward": 1.8952922821044922, + "reward_std": 0.36800456047058105, + "rewards/GDino": 0.7403267621994019, + "rewards/GIT": 0.38624581694602966, + "rewards/HPSv2": 0.2577781677246094, + "rewards/ORM": 0.5109414905309677, + "self_certainty_semantic": -25.125, + "self_certainty_token": -21.0, + "step": 444 + }, + { + "completion_length": 30.0, + "epoch": 0.49280177187153934, + "grad_norm": 0.7532635927200317, + "kl": 0.041259765625, + "learning_rate": 7.21875e-07, + "loss": 0.010319485329091549, + "reward": 2.7416329383850098, + "reward_std": 0.13061758875846863, + "rewards/GDino": 0.879687488079071, + "rewards/GIT": 0.6991226822137833, + "rewards/HPSv2": 0.2878227233886719, + "rewards/ORM": 0.875, + "self_certainty_semantic": -25.25, + "self_certainty_token": -22.8125, + "step": 445 + }, + { + "completion_length": 38.9375, + "epoch": 0.4939091915836102, + "grad_norm": 0.4900251030921936, + "kl": 0.052490234375, + "learning_rate": 7.212499999999999e-07, + "loss": 0.0012075770646333694, + "reward": 2.3121979236602783, + "reward_std": 0.12953293696045876, + "rewards/GDino": 0.8581786453723907, + "rewards/GIT": 0.47472766786813736, + "rewards/HPSv2": 0.2680034637451172, + "rewards/ORM": 0.711288183927536, + "self_certainty_semantic": -25.125, + "self_certainty_token": -21.8125, + "step": 446 + }, + { + "completion_length": 43.234375, + "epoch": 0.4950166112956811, + "grad_norm": 0.5694748759269714, + "kl": 0.0531005859375, + "learning_rate": 7.206249999999999e-07, + "loss": -0.01422290992923081, + "reward": 2.2101300954818726, + "reward_std": 0.2848183512687683, + "rewards/GDino": 0.7993446290493011, + "rewards/GIT": 0.4786403179168701, + "rewards/HPSv2": 0.27276611328125, + "rewards/ORM": 0.6593789905309677, + "self_certainty_semantic": -25.125, + "self_certainty_token": -23.0625, + "step": 447 + }, + { + "completion_length": 40.34375, + "epoch": 0.49612403100775193, + "grad_norm": 0.5506007671356201, + "kl": 0.041748046875, + "learning_rate": 7.2e-07, + "loss": 0.02106620604172349, + "reward": 2.4321142435073853, + "reward_std": 0.41992174088954926, + "rewards/GDino": 0.8411885499954224, + "rewards/GIT": 0.4576234966516495, + "rewards/HPSv2": 0.2591552734375, + "rewards/ORM": 0.8741468787193298, + "self_certainty_semantic": -24.9375, + "self_certainty_token": -21.4375, + "step": 448 + }, + { + "completion_length": 40.078125, + "epoch": 0.49723145071982283, + "grad_norm": 0.6929682493209839, + "kl": 0.046630859375, + "learning_rate": 7.19375e-07, + "loss": 0.0026401374489068985, + "reward": 2.082669258117676, + "reward_std": 0.22932946681976318, + "rewards/GDino": 0.8020833730697632, + "rewards/GIT": 0.363376647233963, + "rewards/HPSv2": 0.28600120544433594, + "rewards/ORM": 0.6312080323696136, + "self_certainty_semantic": -25.125, + "self_certainty_token": -21.6875, + "step": 449 + }, + { + "completion_length": 42.890625, + "epoch": 0.4983388704318937, + "grad_norm": 0.6465428471565247, + "kl": 0.0411376953125, + "learning_rate": 7.1875e-07, + "loss": -0.003926795674487948, + "reward": 1.7539690732955933, + "reward_std": 0.381552517414093, + "rewards/GDino": 0.8213628232479095, + "rewards/GIT": 0.3709906339645386, + "rewards/HPSv2": 0.2616615295410156, + "rewards/ORM": 0.29995405673980713, + "self_certainty_semantic": -25.0, + "self_certainty_token": -22.125, + "step": 450 + }, + { + "completion_length": 46.078125, + "epoch": 0.4994462901439646, + "grad_norm": 0.4404943883419037, + "kl": 0.0408935546875, + "learning_rate": 7.18125e-07, + "loss": -0.0011517745442688465, + "reward": 1.81854248046875, + "reward_std": 0.21245457977056503, + "rewards/GDino": 0.7781844735145569, + "rewards/GIT": 0.4284133017063141, + "rewards/HPSv2": 0.2648448944091797, + "rewards/ORM": 0.34709984064102173, + "self_certainty_semantic": -25.125, + "self_certainty_token": -22.5625, + "step": 451 + }, + { + "completion_length": 48.15625, + "epoch": 0.5005537098560354, + "grad_norm": 0.7872177958488464, + "kl": 0.0487060546875, + "learning_rate": 7.175e-07, + "loss": -0.008891448844224215, + "reward": 2.0443437695503235, + "reward_std": 0.3621833920478821, + "rewards/GDino": 0.7558982670307159, + "rewards/GIT": 0.328241229057312, + "rewards/HPSv2": 0.27618408203125, + "rewards/ORM": 0.6840202808380127, + "self_certainty_semantic": -25.125, + "self_certainty_token": -21.1875, + "step": 452 + }, + { + "completion_length": 41.515625, + "epoch": 0.5016611295681063, + "grad_norm": 0.6438757181167603, + "kl": 0.0477294921875, + "learning_rate": 7.16875e-07, + "loss": 0.013278962578624487, + "reward": 2.3407278060913086, + "reward_std": 0.29479558765888214, + "rewards/GDino": 0.7766183912754059, + "rewards/GIT": 0.6374973952770233, + "rewards/HPSv2": 0.2586193084716797, + "rewards/ORM": 0.6679927408695221, + "self_certainty_semantic": -24.75, + "self_certainty_token": -22.3125, + "step": 453 + }, + { + "completion_length": 44.0625, + "epoch": 0.5027685492801772, + "grad_norm": 0.609624445438385, + "kl": 0.0462646484375, + "learning_rate": 7.1625e-07, + "loss": -0.0013964123791083694, + "reward": 2.5196316242218018, + "reward_std": 0.3144952356815338, + "rewards/GDino": 0.8671875, + "rewards/GIT": 0.6943239569664001, + "rewards/HPSv2": 0.2564697265625, + "rewards/ORM": 0.7016504108905792, + "self_certainty_semantic": -25.1875, + "self_certainty_token": -21.3125, + "step": 454 + }, + { + "completion_length": 44.859375, + "epoch": 0.5038759689922481, + "grad_norm": 0.5280178189277649, + "kl": 0.04541015625, + "learning_rate": 7.156249999999999e-07, + "loss": -0.027535758446902037, + "reward": 1.9731789231300354, + "reward_std": 0.2811855897307396, + "rewards/GDino": 0.6670139133930206, + "rewards/GIT": 0.3297960162162781, + "rewards/HPSv2": 0.2701148986816406, + "rewards/ORM": 0.7062540054321289, + "self_certainty_semantic": -25.0625, + "self_certainty_token": -22.375, + "step": 455 + }, + { + "completion_length": 49.84375, + "epoch": 0.5049833887043189, + "grad_norm": 0.4724181294441223, + "kl": 0.041259765625, + "learning_rate": 7.149999999999999e-07, + "loss": -0.008982280036434531, + "reward": 2.0365023612976074, + "reward_std": 0.37235787510871887, + "rewards/GDino": 0.7172095775604248, + "rewards/GIT": 0.42584167420864105, + "rewards/HPSv2": 0.2715301513671875, + "rewards/ORM": 0.6219209432601929, + "self_certainty_semantic": -25.4375, + "self_certainty_token": -22.1875, + "step": 456 + }, + { + "completion_length": 44.765625, + "epoch": 0.5060908084163898, + "grad_norm": 0.9888372421264648, + "kl": 0.0479736328125, + "learning_rate": 7.143749999999999e-07, + "loss": -0.025254069827497005, + "reward": 2.0628309845924377, + "reward_std": 0.5208294987678528, + "rewards/GDino": 0.7491490542888641, + "rewards/GIT": 0.32147393375635147, + "rewards/HPSv2": 0.29552268981933594, + "rewards/ORM": 0.6966853141784668, + "self_certainty_semantic": -25.1875, + "self_certainty_token": -22.6875, + "step": 457 + }, + { + "completion_length": 54.09375, + "epoch": 0.5071982281284607, + "grad_norm": 1.0077027082443237, + "kl": 0.0501708984375, + "learning_rate": 7.137499999999999e-07, + "loss": -0.001974734477698803, + "reward": 2.337610900402069, + "reward_std": 0.31742343306541443, + "rewards/GDino": 0.8954645991325378, + "rewards/GIT": 0.6387971341609955, + "rewards/HPSv2": 0.2842445373535156, + "rewards/ORM": 0.5191046893596649, + "self_certainty_semantic": -25.3125, + "self_certainty_token": -21.125, + "step": 458 + }, + { + "completion_length": 33.984375, + "epoch": 0.5083056478405316, + "grad_norm": 0.6579184532165527, + "kl": 0.0506591796875, + "learning_rate": 7.13125e-07, + "loss": 0.01580179762095213, + "reward": 2.240155339241028, + "reward_std": 0.23216913640499115, + "rewards/GDino": 0.8633578717708588, + "rewards/GIT": 0.6162318885326385, + "rewards/HPSv2": 0.26489830017089844, + "rewards/ORM": 0.4956671893596649, + "self_certainty_semantic": -24.9375, + "self_certainty_token": -21.125, + "step": 459 + }, + { + "completion_length": 53.359375, + "epoch": 0.5094130675526024, + "grad_norm": 1.016530156135559, + "kl": 0.06396484375, + "learning_rate": 7.125e-07, + "loss": 0.006311747245490551, + "reward": 1.799654245376587, + "reward_std": 0.45648741722106934, + "rewards/GDino": 0.6749999523162842, + "rewards/GIT": 0.25362446904182434, + "rewards/HPSv2": 0.2591972351074219, + "rewards/ORM": 0.6118325591087341, + "self_certainty_semantic": -25.125, + "self_certainty_token": -22.0625, + "step": 460 + }, + { + "completion_length": 46.015625, + "epoch": 0.5105204872646734, + "grad_norm": 0.9420766234397888, + "kl": 0.035888671875, + "learning_rate": 7.11875e-07, + "loss": -0.0029141909908503294, + "reward": 1.8296540975570679, + "reward_std": 0.4803127646446228, + "rewards/GDino": 0.6914098262786865, + "rewards/GIT": 0.23670585453510284, + "rewards/HPSv2": 0.2769889831542969, + "rewards/ORM": 0.624549388885498, + "self_certainty_semantic": -25.125, + "self_certainty_token": -21.875, + "step": 461 + }, + { + "completion_length": 39.1875, + "epoch": 0.5116279069767442, + "grad_norm": 0.7059751749038696, + "kl": 0.048095703125, + "learning_rate": 7.1125e-07, + "loss": 0.02869965275749564, + "reward": 1.7591858506202698, + "reward_std": 0.25960099697113037, + "rewards/GDino": 0.6792242527008057, + "rewards/GIT": 0.39417822659015656, + "rewards/HPSv2": 0.24828338623046875, + "rewards/ORM": 0.4375, + "self_certainty_semantic": -25.1875, + "self_certainty_token": -21.4375, + "step": 462 + }, + { + "completion_length": 36.21875, + "epoch": 0.512735326688815, + "grad_norm": 0.9423550963401794, + "kl": 0.0567626953125, + "learning_rate": 7.106249999999999e-07, + "loss": 0.017706943210214376, + "reward": 1.816788673400879, + "reward_std": 0.3961597681045532, + "rewards/GDino": 0.693546324968338, + "rewards/GIT": 0.12270250916481018, + "rewards/HPSv2": 0.28178977966308594, + "rewards/ORM": 0.71875, + "self_certainty_semantic": -25.25, + "self_certainty_token": -22.0, + "step": 463 + }, + { + "completion_length": 44.1875, + "epoch": 0.5138427464008859, + "grad_norm": 0.6439041495323181, + "kl": 0.04931640625, + "learning_rate": 7.1e-07, + "loss": 0.02229080768302083, + "reward": 1.8793537616729736, + "reward_std": 0.32218582928180695, + "rewards/GDino": 0.6969353556632996, + "rewards/GIT": 0.3158310651779175, + "rewards/HPSv2": 0.25373268127441406, + "rewards/ORM": 0.6128546893596649, + "self_certainty_semantic": -24.8125, + "self_certainty_token": -21.5, + "step": 464 + }, + { + "completion_length": 43.890625, + "epoch": 0.5149501661129569, + "grad_norm": 0.5628231167793274, + "kl": 0.0523681640625, + "learning_rate": 7.09375e-07, + "loss": 0.04102162458002567, + "reward": 2.2252548933029175, + "reward_std": 0.09786996431648731, + "rewards/GDino": 0.7161866128444672, + "rewards/GIT": 0.34939881414175034, + "rewards/HPSv2": 0.2862739562988281, + "rewards/ORM": 0.8733955323696136, + "self_certainty_semantic": -25.3125, + "self_certainty_token": -21.5625, + "step": 465 + }, + { + "completion_length": 48.578125, + "epoch": 0.5160575858250277, + "grad_norm": 0.6555954813957214, + "kl": 0.0419921875, + "learning_rate": 7.0875e-07, + "loss": -0.019896306097507477, + "reward": 1.9761043787002563, + "reward_std": 0.3829188644886017, + "rewards/GDino": 0.7688298523426056, + "rewards/GIT": 0.3276664614677429, + "rewards/HPSv2": 0.2706336975097656, + "rewards/ORM": 0.6089743673801422, + "self_certainty_semantic": -25.375, + "self_certainty_token": -22.5, + "step": 466 + }, + { + "completion_length": 44.34375, + "epoch": 0.5171650055370985, + "grad_norm": 0.5814468860626221, + "kl": 0.0567626953125, + "learning_rate": 7.08125e-07, + "loss": -0.001267156796529889, + "reward": 2.0606996417045593, + "reward_std": 0.3757035583257675, + "rewards/GDino": 0.7373589873313904, + "rewards/GIT": 0.3534126728773117, + "rewards/HPSv2": 0.2546577453613281, + "rewards/ORM": 0.7152702808380127, + "self_certainty_semantic": -25.1875, + "self_certainty_token": -22.0625, + "step": 467 + }, + { + "completion_length": 37.09375, + "epoch": 0.5182724252491694, + "grad_norm": 0.7002123594284058, + "kl": 0.060302734375, + "learning_rate": 7.075e-07, + "loss": 0.014457742450758815, + "reward": 2.033020555973053, + "reward_std": 0.2667345404624939, + "rewards/GDino": 0.8099178373813629, + "rewards/GIT": 0.3804194927215576, + "rewards/HPSv2": 0.2658119201660156, + "rewards/ORM": 0.5768712759017944, + "self_certainty_semantic": -24.8125, + "self_certainty_token": -22.3125, + "step": 468 + }, + { + "completion_length": 35.625, + "epoch": 0.5193798449612403, + "grad_norm": 0.7956823706626892, + "kl": 0.0457763671875, + "learning_rate": 7.06875e-07, + "loss": -0.002700265496969223, + "reward": 1.8269972205162048, + "reward_std": 0.31584332883358, + "rewards/GDino": 0.6803904473781586, + "rewards/GIT": 0.19897165894508362, + "rewards/HPSv2": 0.263214111328125, + "rewards/ORM": 0.6844209432601929, + "self_certainty_semantic": -24.875, + "self_certainty_token": -21.875, + "step": 469 + }, + { + "completion_length": 39.328125, + "epoch": 0.5204872646733112, + "grad_norm": 1.1874123811721802, + "kl": 0.0533447265625, + "learning_rate": 7.0625e-07, + "loss": 0.013945101760327816, + "reward": 1.7942054867744446, + "reward_std": 0.504248172044754, + "rewards/GDino": 0.6401041746139526, + "rewards/GIT": 0.4749217629432678, + "rewards/HPSv2": 0.2554292678833008, + "rewards/ORM": 0.42375023663043976, + "self_certainty_semantic": -25.0625, + "self_certainty_token": -21.0, + "step": 470 + }, + { + "completion_length": 43.375, + "epoch": 0.521594684385382, + "grad_norm": 0.7392427325248718, + "kl": 0.05810546875, + "learning_rate": 7.056249999999999e-07, + "loss": -0.0033886469900608063, + "reward": 2.60396945476532, + "reward_std": 0.2168251946568489, + "rewards/GDino": 0.8484375476837158, + "rewards/GIT": 0.5769221186637878, + "rewards/HPSv2": 0.27235984802246094, + "rewards/ORM": 0.90625, + "self_certainty_semantic": -25.125, + "self_certainty_token": -22.125, + "step": 471 + }, + { + "completion_length": 41.125, + "epoch": 0.5227021040974529, + "grad_norm": 0.5476402044296265, + "kl": 0.05908203125, + "learning_rate": 7.049999999999999e-07, + "loss": -6.132526323199272e-06, + "reward": 1.770015001296997, + "reward_std": 0.42023611068725586, + "rewards/GDino": 0.6564308404922485, + "rewards/GIT": 0.18880262970924377, + "rewards/HPSv2": 0.27821922302246094, + "rewards/ORM": 0.6465622782707214, + "self_certainty_semantic": -25.25, + "self_certainty_token": -22.0, + "step": 472 + }, + { + "completion_length": 37.53125, + "epoch": 0.5238095238095238, + "grad_norm": 0.5314967632293701, + "kl": 0.0400390625, + "learning_rate": 7.043749999999999e-07, + "loss": 0.008513603825122118, + "reward": 2.6033971309661865, + "reward_std": 0.18285945057868958, + "rewards/GDino": 0.7744791507720947, + "rewards/GIT": 0.6311581432819366, + "rewards/HPSv2": 0.25057220458984375, + "rewards/ORM": 0.9471877217292786, + "self_certainty_semantic": -25.0625, + "self_certainty_token": -21.625, + "step": 473 + }, + { + "completion_length": 40.234375, + "epoch": 0.5249169435215947, + "grad_norm": 0.5836285352706909, + "kl": 0.04296875, + "learning_rate": 7.037499999999999e-07, + "loss": 0.007610062952153385, + "reward": 2.063364267349243, + "reward_std": 0.23948609083890915, + "rewards/GDino": 0.7378768920898438, + "rewards/GIT": 0.3602711111307144, + "rewards/HPSv2": 0.26682472229003906, + "rewards/ORM": 0.6983915567398071, + "self_certainty_semantic": -25.25, + "self_certainty_token": -22.25, + "step": 474 + }, + { + "completion_length": 40.453125, + "epoch": 0.5260243632336655, + "grad_norm": 0.6393899917602539, + "kl": 0.0455322265625, + "learning_rate": 7.031249999999999e-07, + "loss": 0.013668630854226649, + "reward": 1.6752720475196838, + "reward_std": 0.504058450460434, + "rewards/GDino": 0.6374790370464325, + "rewards/GIT": 0.31694649904966354, + "rewards/HPSv2": 0.2520465850830078, + "rewards/ORM": 0.4687999337911606, + "self_certainty_semantic": -25.3125, + "self_certainty_token": -21.3125, + "step": 475 + }, + { + "completion_length": 41.8125, + "epoch": 0.5271317829457365, + "grad_norm": 0.5585540533065796, + "kl": 0.05126953125, + "learning_rate": 7.024999999999999e-07, + "loss": 0.013834381476044655, + "reward": 1.9375313520431519, + "reward_std": 0.4423487037420273, + "rewards/GDino": 0.7547129392623901, + "rewards/GIT": 0.10238027572631836, + "rewards/HPSv2": 0.2806549072265625, + "rewards/ORM": 0.7997831404209137, + "self_certainty_semantic": -25.25, + "self_certainty_token": -21.9375, + "step": 476 + }, + { + "completion_length": 33.4375, + "epoch": 0.5282392026578073, + "grad_norm": 0.6089532971382141, + "kl": 0.0474853515625, + "learning_rate": 7.01875e-07, + "loss": -0.0002646814100444317, + "reward": 1.9899710416793823, + "reward_std": 0.5306534469127655, + "rewards/GDino": 0.6599739193916321, + "rewards/GIT": 0.2705283910036087, + "rewards/HPSv2": 0.2672252655029297, + "rewards/ORM": 0.7922433316707611, + "self_certainty_semantic": -25.3125, + "self_certainty_token": -22.3125, + "step": 477 + }, + { + "completion_length": 39.796875, + "epoch": 0.5293466223698782, + "grad_norm": 0.6699002385139465, + "kl": 0.0443115234375, + "learning_rate": 7.0125e-07, + "loss": 0.007601808290928602, + "reward": 2.5651779174804688, + "reward_std": 0.430521622300148, + "rewards/GDino": 0.8947426080703735, + "rewards/GIT": 0.617702305316925, + "rewards/HPSv2": 0.2605915069580078, + "rewards/ORM": 0.7921415567398071, + "self_certainty_semantic": -24.8125, + "self_certainty_token": -21.375, + "step": 478 + }, + { + "completion_length": 41.6875, + "epoch": 0.530454042081949, + "grad_norm": 0.5357719659805298, + "kl": 0.069580078125, + "learning_rate": 7.006250000000001e-07, + "loss": -0.017411263193935156, + "reward": 2.1919682025909424, + "reward_std": 0.34072282910346985, + "rewards/GDino": 0.7859375178813934, + "rewards/GIT": 0.5476135611534119, + "rewards/HPSv2": 0.26539039611816406, + "rewards/ORM": 0.5930267721414566, + "self_certainty_semantic": -25.25, + "self_certainty_token": -21.875, + "step": 479 + }, + { + "completion_length": 44.296875, + "epoch": 0.53156146179402, + "grad_norm": 0.4808535575866699, + "kl": 0.04248046875, + "learning_rate": 7e-07, + "loss": -0.012577058747410774, + "reward": 1.7866081595420837, + "reward_std": 0.3765677809715271, + "rewards/GDino": 0.5795279145240784, + "rewards/GIT": 0.12062875926494598, + "rewards/HPSv2": 0.2770805358886719, + "rewards/ORM": 0.8093709945678711, + "self_certainty_semantic": -25.125, + "self_certainty_token": -22.4375, + "step": 480 + }, + { + "completion_length": 42.03125, + "epoch": 0.5326688815060908, + "grad_norm": 0.6451517343521118, + "kl": 0.0438232421875, + "learning_rate": 6.99375e-07, + "loss": 0.009285415522754192, + "reward": 2.0640020966529846, + "reward_std": 0.2619232162833214, + "rewards/GDino": 0.6719355881214142, + "rewards/GIT": 0.23923633992671967, + "rewards/HPSv2": 0.2778301239013672, + "rewards/ORM": 0.875, + "self_certainty_semantic": -25.3125, + "self_certainty_token": -21.75, + "step": 481 + }, + { + "completion_length": 37.234375, + "epoch": 0.5337763012181617, + "grad_norm": 1.7300623655319214, + "kl": 0.053466796875, + "learning_rate": 6.9875e-07, + "loss": -0.008384064538404346, + "reward": 2.6090811491012573, + "reward_std": 0.2747221887111664, + "rewards/GDino": 0.854687511920929, + "rewards/GIT": 0.7133489549160004, + "rewards/HPSv2": 0.2663536071777344, + "rewards/ORM": 0.7746912240982056, + "self_certainty_semantic": -25.3125, + "self_certainty_token": -22.25, + "step": 482 + }, + { + "completion_length": 38.3125, + "epoch": 0.5348837209302325, + "grad_norm": 0.597758948802948, + "kl": 0.0496826171875, + "learning_rate": 6.98125e-07, + "loss": -0.006705485051497817, + "reward": 2.23333203792572, + "reward_std": 0.232535682618618, + "rewards/GDino": 0.8164223432540894, + "rewards/GIT": 0.5625110119581223, + "rewards/HPSv2": 0.2606487274169922, + "rewards/ORM": 0.59375, + "self_certainty_semantic": -25.375, + "self_certainty_token": -21.8125, + "step": 483 + }, + { + "completion_length": 40.453125, + "epoch": 0.5359911406423035, + "grad_norm": 0.4974672496318817, + "kl": 0.04736328125, + "learning_rate": 6.975e-07, + "loss": 0.02514065010473132, + "reward": 2.259411573410034, + "reward_std": 0.37527331709861755, + "rewards/GDino": 0.8667246103286743, + "rewards/GIT": 0.5178156942129135, + "rewards/HPSv2": 0.254425048828125, + "rewards/ORM": 0.6204463392496109, + "self_certainty_semantic": -25.25, + "self_certainty_token": -21.625, + "step": 484 + }, + { + "completion_length": 39.359375, + "epoch": 0.5370985603543743, + "grad_norm": 0.5150190591812134, + "kl": 0.0445556640625, + "learning_rate": 6.96875e-07, + "loss": -0.016253040172159672, + "reward": 2.223387122154236, + "reward_std": 0.4909762889146805, + "rewards/GDino": 0.8627164661884308, + "rewards/GIT": 0.5708933770656586, + "rewards/HPSv2": 0.26195716857910156, + "rewards/ORM": 0.5278202295303345, + "self_certainty_semantic": -25.125, + "self_certainty_token": -21.75, + "step": 485 + }, + { + "completion_length": 49.515625, + "epoch": 0.5382059800664452, + "grad_norm": 0.6220833659172058, + "kl": 0.0450439453125, + "learning_rate": 6.9625e-07, + "loss": -0.014192406553775072, + "reward": 1.9840824604034424, + "reward_std": 0.2602059319615364, + "rewards/GDino": 0.7390625476837158, + "rewards/GIT": 0.6071967780590057, + "rewards/HPSv2": 0.26282310485839844, + "rewards/ORM": 0.375, + "self_certainty_semantic": -25.125, + "self_certainty_token": -21.3125, + "step": 486 + }, + { + "completion_length": 53.03125, + "epoch": 0.5393133997785161, + "grad_norm": 1.2206982374191284, + "kl": 0.053466796875, + "learning_rate": 6.95625e-07, + "loss": -0.028607182204723358, + "reward": 1.9813396334648132, + "reward_std": 0.25878557562828064, + "rewards/GDino": 0.7200272381305695, + "rewards/GIT": 0.5861987769603729, + "rewards/HPSv2": 0.2532386779785156, + "rewards/ORM": 0.421875, + "self_certainty_semantic": -25.25, + "self_certainty_token": -22.75, + "step": 487 + }, + { + "completion_length": 51.328125, + "epoch": 0.540420819490587, + "grad_norm": 0.9301148056983948, + "kl": 0.0439453125, + "learning_rate": 6.949999999999999e-07, + "loss": -0.02558594848960638, + "reward": 1.8908702731132507, + "reward_std": 0.1930691972374916, + "rewards/GDino": 0.745572954416275, + "rewards/GIT": 0.3518243879079819, + "rewards/HPSv2": 0.2747688293457031, + "rewards/ORM": 0.5187040269374847, + "self_certainty_semantic": -25.0625, + "self_certainty_token": -22.125, + "step": 488 + }, + { + "completion_length": 42.25, + "epoch": 0.5415282392026578, + "grad_norm": 0.5403162240982056, + "kl": 0.05615234375, + "learning_rate": 6.943749999999999e-07, + "loss": -0.014918024884536862, + "reward": 1.8645083904266357, + "reward_std": 0.31887492537498474, + "rewards/GDino": 0.7039845585823059, + "rewards/GIT": 0.31486817449331284, + "rewards/HPSv2": 0.2575721740722656, + "rewards/ORM": 0.5880835056304932, + "self_certainty_semantic": -25.3125, + "self_certainty_token": -21.3125, + "step": 489 + }, + { + "completion_length": 53.9375, + "epoch": 0.5426356589147286, + "grad_norm": 0.6085987687110901, + "kl": 0.0477294921875, + "learning_rate": 6.937499999999999e-07, + "loss": 0.026606434024870396, + "reward": 1.5878762006759644, + "reward_std": 0.39282889664173126, + "rewards/GDino": 0.5839112102985382, + "rewards/GIT": 0.23308433592319489, + "rewards/HPSv2": 0.25993919372558594, + "rewards/ORM": 0.5109414830803871, + "self_certainty_semantic": -24.9375, + "self_certainty_token": -22.0, + "step": 490 + }, + { + "completion_length": 43.1875, + "epoch": 0.5437430786267996, + "grad_norm": 0.7043938040733337, + "kl": 0.0709228515625, + "learning_rate": 6.931249999999999e-07, + "loss": -0.0003202613443136215, + "reward": 1.4926478266716003, + "reward_std": 0.4612387716770172, + "rewards/GDino": 0.5785077512264252, + "rewards/GIT": 0.12771567702293396, + "rewards/HPSv2": 0.272003173828125, + "rewards/ORM": 0.5144212394952774, + "self_certainty_semantic": -25.0, + "self_certainty_token": -22.1875, + "step": 491 + }, + { + "completion_length": 39.0625, + "epoch": 0.5448504983388704, + "grad_norm": 1.6215249300003052, + "kl": 0.0604248046875, + "learning_rate": 6.924999999999999e-07, + "loss": 0.003573375754058361, + "reward": 1.6965920329093933, + "reward_std": 0.33052287995815277, + "rewards/GDino": 0.7088837027549744, + "rewards/GIT": 0.2778472602367401, + "rewards/HPSv2": 0.27584075927734375, + "rewards/ORM": 0.4340203106403351, + "self_certainty_semantic": -25.0625, + "self_certainty_token": -22.75, + "step": 492 + }, + { + "completion_length": 42.953125, + "epoch": 0.5459579180509413, + "grad_norm": 0.6043370366096497, + "kl": 0.0589599609375, + "learning_rate": 6.918749999999999e-07, + "loss": 0.0050405459478497505, + "reward": 2.4023711681365967, + "reward_std": 0.3025038540363312, + "rewards/GDino": 0.710416704416275, + "rewards/GIT": 0.5207570940256119, + "rewards/HPSv2": 0.28365135192871094, + "rewards/ORM": 0.8875459730625153, + "self_certainty_semantic": -25.125, + "self_certainty_token": -21.9375, + "step": 493 + }, + { + "completion_length": 43.625, + "epoch": 0.5470653377630121, + "grad_norm": 0.5131484270095825, + "kl": 0.0443115234375, + "learning_rate": 6.9125e-07, + "loss": 0.0010174026247113943, + "reward": 2.6679136753082275, + "reward_std": 0.16655711084604263, + "rewards/GDino": 0.8572916984558105, + "rewards/GIT": 0.7125807106494904, + "rewards/HPSv2": 0.26370811462402344, + "rewards/ORM": 0.8343330323696136, + "self_certainty_semantic": -25.3125, + "self_certainty_token": -21.4375, + "step": 494 + }, + { + "completion_length": 46.671875, + "epoch": 0.5481727574750831, + "grad_norm": 0.5884466171264648, + "kl": 0.04248046875, + "learning_rate": 6.906250000000001e-07, + "loss": 0.004825513111427426, + "reward": 2.057460904121399, + "reward_std": 0.3856964707374573, + "rewards/GDino": 0.8639159798622131, + "rewards/GIT": 0.6097074598073959, + "rewards/HPSv2": 0.25571250915527344, + "rewards/ORM": 0.3281249925494194, + "self_certainty_semantic": -25.1875, + "self_certainty_token": -22.3125, + "step": 495 + }, + { + "completion_length": 46.515625, + "epoch": 0.5492801771871539, + "grad_norm": 0.9797373414039612, + "kl": 0.053466796875, + "learning_rate": 6.9e-07, + "loss": 0.010836861794814467, + "reward": 2.183039903640747, + "reward_std": 0.3088700920343399, + "rewards/GDino": 0.87109375, + "rewards/GIT": 0.5162058174610138, + "rewards/HPSv2": 0.2704277038574219, + "rewards/ORM": 0.5253127366304398, + "self_certainty_semantic": -25.1875, + "self_certainty_token": -21.25, + "step": 496 + }, + { + "completion_length": 42.328125, + "epoch": 0.5503875968992248, + "grad_norm": 0.5162177085876465, + "kl": 0.052001953125, + "learning_rate": 6.89375e-07, + "loss": 0.012685938272625208, + "reward": 2.2096526622772217, + "reward_std": 0.38902442157268524, + "rewards/GDino": 0.7760285139083862, + "rewards/GIT": 0.30912603437900543, + "rewards/HPSv2": 0.2770366668701172, + "rewards/ORM": 0.8474613428115845, + "self_certainty_semantic": -25.25, + "self_certainty_token": -22.0, + "step": 497 + }, + { + "completion_length": 49.71875, + "epoch": 0.5514950166112956, + "grad_norm": 0.5989282727241516, + "kl": 0.06103515625, + "learning_rate": 6.8875e-07, + "loss": 0.01426611002534628, + "reward": 2.2355239391326904, + "reward_std": 0.5061717927455902, + "rewards/GDino": 0.8239583373069763, + "rewards/GIT": 0.5775501430034637, + "rewards/HPSv2": 0.26257896423339844, + "rewards/ORM": 0.5714363753795624, + "self_certainty_semantic": -25.1875, + "self_certainty_token": -22.0625, + "step": 498 + }, + { + "completion_length": 41.828125, + "epoch": 0.5526024363233666, + "grad_norm": 0.8058765530586243, + "kl": 0.052490234375, + "learning_rate": 6.88125e-07, + "loss": 0.00940165319480002, + "reward": 2.1786409616470337, + "reward_std": 0.4084944427013397, + "rewards/GDino": 0.76282998919487, + "rewards/GIT": 0.4385463073849678, + "rewards/HPSv2": 0.2747611999511719, + "rewards/ORM": 0.7025035321712494, + "self_certainty_semantic": -25.25, + "self_certainty_token": -21.6875, + "step": 499 + }, + { + "completion_length": 48.0, + "epoch": 0.5537098560354374, + "grad_norm": 0.7646964192390442, + "kl": 0.0482177734375, + "learning_rate": 6.875e-07, + "loss": 0.0026139335241168737, + "reward": 2.6670485734939575, + "reward_std": 0.15402748435735703, + "rewards/GDino": 0.8484375178813934, + "rewards/GIT": 0.7150631248950958, + "rewards/HPSv2": 0.2910480499267578, + "rewards/ORM": 0.8125, + "self_certainty_semantic": -25.25, + "self_certainty_token": -21.5, + "step": 500 + }, + { + "completion_length": 46.234375, + "epoch": 0.5548172757475083, + "grad_norm": 0.6809297800064087, + "kl": 0.048828125, + "learning_rate": 6.86875e-07, + "loss": 0.03922370169311762, + "reward": 2.008356213569641, + "reward_std": 0.3604857325553894, + "rewards/GDino": 0.6921875178813934, + "rewards/GIT": 0.5941764116287231, + "rewards/HPSv2": 0.26070404052734375, + "rewards/ORM": 0.461288183927536, + "self_certainty_semantic": -25.0, + "self_certainty_token": -21.5625, + "step": 501 + }, + { + "completion_length": 47.171875, + "epoch": 0.5559246954595792, + "grad_norm": 0.849571168422699, + "kl": 0.049560546875, + "learning_rate": 6.8625e-07, + "loss": -0.007896499708294868, + "reward": 2.224105715751648, + "reward_std": 0.42195719480514526, + "rewards/GDino": 0.8002032041549683, + "rewards/GIT": 0.5095578879117966, + "rewards/HPSv2": 0.2751941680908203, + "rewards/ORM": 0.6391504108905792, + "self_certainty_semantic": -24.9375, + "self_certainty_token": -21.625, + "step": 502 + }, + { + "completion_length": 39.40625, + "epoch": 0.5570321151716501, + "grad_norm": 0.4625159502029419, + "kl": 0.0596923828125, + "learning_rate": 6.85625e-07, + "loss": -0.011805586516857147, + "reward": 1.6174015998840332, + "reward_std": 0.4419572353363037, + "rewards/GDino": 0.6323729455471039, + "rewards/GIT": 0.2029045894742012, + "rewards/HPSv2": 0.25756263732910156, + "rewards/ORM": 0.5245613753795624, + "self_certainty_semantic": -25.0625, + "self_certainty_token": -21.9375, + "step": 503 + }, + { + "completion_length": 36.578125, + "epoch": 0.5581395348837209, + "grad_norm": 0.6870086193084717, + "kl": 0.05029296875, + "learning_rate": 6.85e-07, + "loss": 0.007269066758453846, + "reward": 1.8199999928474426, + "reward_std": 0.36270634829998016, + "rewards/GDino": 0.6532475501298904, + "rewards/GIT": 0.34954261779785156, + "rewards/HPSv2": 0.2609977722167969, + "rewards/ORM": 0.5562120378017426, + "self_certainty_semantic": -25.0625, + "self_certainty_token": -21.8125, + "step": 504 + }, + { + "completion_length": 42.75, + "epoch": 0.5592469545957918, + "grad_norm": 0.8303393721580505, + "kl": 0.0489501953125, + "learning_rate": 6.843749999999999e-07, + "loss": -0.017457664478570223, + "reward": 1.8187562227249146, + "reward_std": 0.5198449939489365, + "rewards/GDino": 0.6842133402824402, + "rewards/GIT": 0.2273491695523262, + "rewards/HPSv2": 0.2782115936279297, + "rewards/ORM": 0.6289821267127991, + "self_certainty_semantic": -25.375, + "self_certainty_token": -21.9375, + "step": 505 + }, + { + "completion_length": 38.328125, + "epoch": 0.5603543743078627, + "grad_norm": 0.5167096257209778, + "kl": 0.0445556640625, + "learning_rate": 6.837499999999999e-07, + "loss": -0.04000814538449049, + "reward": 2.066583573818207, + "reward_std": 0.22248844802379608, + "rewards/GDino": 0.738374799489975, + "rewards/GIT": 0.5225924551486969, + "rewards/HPSv2": 0.2765922546386719, + "rewards/ORM": 0.529024064540863, + "self_certainty_semantic": -25.0625, + "self_certainty_token": -22.0625, + "step": 506 + }, + { + "completion_length": 36.359375, + "epoch": 0.5614617940199336, + "grad_norm": 0.6051176190376282, + "kl": 0.0595703125, + "learning_rate": 6.831249999999999e-07, + "loss": -0.009458316955715418, + "reward": 1.9375656843185425, + "reward_std": 0.3073306977748871, + "rewards/GDino": 0.6690048575401306, + "rewards/GIT": 0.319953590631485, + "rewards/HPSv2": 0.2732524871826172, + "rewards/ORM": 0.6753546893596649, + "self_certainty_semantic": -25.1875, + "self_certainty_token": -21.4375, + "step": 507 + }, + { + "completion_length": 41.671875, + "epoch": 0.5625692137320044, + "grad_norm": 0.6827051043510437, + "kl": 0.05029296875, + "learning_rate": 6.824999999999999e-07, + "loss": -0.017990117892622948, + "reward": 2.437025785446167, + "reward_std": 0.23294425010681152, + "rewards/GDino": 0.8168140351772308, + "rewards/GIT": 0.5631050169467926, + "rewards/HPSv2": 0.2876014709472656, + "rewards/ORM": 0.7695053219795227, + "self_certainty_semantic": -25.3125, + "self_certainty_token": -21.9375, + "step": 508 + }, + { + "completion_length": 45.40625, + "epoch": 0.5636766334440753, + "grad_norm": 0.5109193325042725, + "kl": 0.05078125, + "learning_rate": 6.818749999999999e-07, + "loss": 0.009160971734672785, + "reward": 2.4056625366210938, + "reward_std": 0.36375945806503296, + "rewards/GDino": 0.8914958536624908, + "rewards/GIT": 0.539769321680069, + "rewards/HPSv2": 0.2717247009277344, + "rewards/ORM": 0.70267254114151, + "self_certainty_semantic": -25.25, + "self_certainty_token": -22.1875, + "step": 509 + }, + { + "completion_length": 42.5, + "epoch": 0.5647840531561462, + "grad_norm": 0.5111158490180969, + "kl": 0.0447998046875, + "learning_rate": 6.8125e-07, + "loss": -0.0029866271652281284, + "reward": 1.9386223554611206, + "reward_std": 0.4446902275085449, + "rewards/GDino": 0.6332381367683411, + "rewards/GIT": 0.3204441964626312, + "rewards/HPSv2": 0.28069114685058594, + "rewards/ORM": 0.7042488753795624, + "self_certainty_semantic": -24.9375, + "self_certainty_token": -22.5625, + "step": 510 + }, + { + "completion_length": 50.6875, + "epoch": 0.5658914728682171, + "grad_norm": 1.1105425357818604, + "kl": 0.0596923828125, + "learning_rate": 6.80625e-07, + "loss": 0.012685866095125675, + "reward": 1.8053985834121704, + "reward_std": 0.3349771797657013, + "rewards/GDino": 0.6785425245761871, + "rewards/GIT": 0.5215012729167938, + "rewards/HPSv2": 0.2517871856689453, + "rewards/ORM": 0.35356760025024414, + "self_certainty_semantic": -25.1875, + "self_certainty_token": -21.0625, + "step": 511 + }, + { + "completion_length": 56.921875, + "epoch": 0.5669988925802879, + "grad_norm": 0.5685505867004395, + "kl": 0.0518798828125, + "learning_rate": 6.800000000000001e-07, + "loss": 0.01260447409003973, + "reward": 1.8695366978645325, + "reward_std": 0.4191875457763672, + "rewards/GDino": 0.6864760220050812, + "rewards/GIT": 0.2722553610801697, + "rewards/HPSv2": 0.2592887878417969, + "rewards/ORM": 0.6515165269374847, + "self_certainty_semantic": -25.0625, + "self_certainty_token": -21.6875, + "step": 512 + }, + { + "completion_length": 41.671875, + "epoch": 0.5681063122923588, + "grad_norm": 0.7231737971305847, + "kl": 0.0457763671875, + "learning_rate": 6.79375e-07, + "loss": -0.0018274849280714989, + "reward": 2.0952327251434326, + "reward_std": 0.35603393614292145, + "rewards/GDino": 0.7697945237159729, + "rewards/GIT": 0.5224983841180801, + "rewards/HPSv2": 0.2513313293457031, + "rewards/ORM": 0.5516084432601929, + "self_certainty_semantic": -25.1875, + "self_certainty_token": -22.0, + "step": 513 + }, + { + "completion_length": 42.625, + "epoch": 0.5692137320044297, + "grad_norm": 0.5151664614677429, + "kl": 0.058837890625, + "learning_rate": 6.7875e-07, + "loss": 0.012911509722471237, + "reward": 1.971530795097351, + "reward_std": 0.5368378758430481, + "rewards/GDino": 0.6933850646018982, + "rewards/GIT": 0.315007820725441, + "rewards/HPSv2": 0.263092041015625, + "rewards/ORM": 0.7000459432601929, + "self_certainty_semantic": -25.0625, + "self_certainty_token": -22.625, + "step": 514 + }, + { + "completion_length": 49.640625, + "epoch": 0.5703211517165006, + "grad_norm": 0.6750500202178955, + "kl": 0.063720703125, + "learning_rate": 6.78125e-07, + "loss": -0.012025434523820877, + "reward": 1.9398669004440308, + "reward_std": 0.3209155201911926, + "rewards/GDino": 0.6847161054611206, + "rewards/GIT": 0.30489368736743927, + "rewards/HPSv2": 0.2687225341796875, + "rewards/ORM": 0.6815346777439117, + "self_certainty_semantic": -25.0625, + "self_certainty_token": -21.5, + "step": 515 + }, + { + "completion_length": 41.390625, + "epoch": 0.5714285714285714, + "grad_norm": 0.6152828931808472, + "kl": 0.0460205078125, + "learning_rate": 6.775e-07, + "loss": 0.009829665534198284, + "reward": 2.2633460760116577, + "reward_std": 0.21108636260032654, + "rewards/GDino": 0.8759043514728546, + "rewards/GIT": 0.5751205831766129, + "rewards/HPSv2": 0.2716541290283203, + "rewards/ORM": 0.5406669825315475, + "self_certainty_semantic": -25.25, + "self_certainty_token": -21.9375, + "step": 516 + }, + { + "completion_length": 43.453125, + "epoch": 0.5725359911406424, + "grad_norm": 0.793476402759552, + "kl": 0.060546875, + "learning_rate": 6.76875e-07, + "loss": -0.0033091302029788494, + "reward": 1.8790732622146606, + "reward_std": 0.2955966591835022, + "rewards/GDino": 0.7067020535469055, + "rewards/GIT": 0.2911861911416054, + "rewards/HPSv2": 0.2660026550292969, + "rewards/ORM": 0.6151823997497559, + "self_certainty_semantic": -25.1875, + "self_certainty_token": -21.625, + "step": 517 + }, + { + "completion_length": 54.265625, + "epoch": 0.5736434108527132, + "grad_norm": 0.8740166425704956, + "kl": 0.0469970703125, + "learning_rate": 6.7625e-07, + "loss": 0.008451959118247032, + "reward": 2.325771689414978, + "reward_std": 0.2714989632368088, + "rewards/GDino": 0.8236979246139526, + "rewards/GIT": 0.5366794764995575, + "rewards/HPSv2": 0.2731609344482422, + "rewards/ORM": 0.6922334432601929, + "self_certainty_semantic": -25.1875, + "self_certainty_token": -21.5, + "step": 518 + }, + { + "completion_length": 43.953125, + "epoch": 0.574750830564784, + "grad_norm": 0.7303571701049805, + "kl": 0.044189453125, + "learning_rate": 6.75625e-07, + "loss": 0.022691648453474045, + "reward": 2.4649245738983154, + "reward_std": 0.3035038635134697, + "rewards/GDino": 0.7510416805744171, + "rewards/GIT": 0.6965360343456268, + "rewards/HPSv2": 0.26261329650878906, + "rewards/ORM": 0.7547334432601929, + "self_certainty_semantic": -25.0625, + "self_certainty_token": -22.3125, + "step": 519 + }, + { + "completion_length": 41.421875, + "epoch": 0.5758582502768549, + "grad_norm": 0.8882773518562317, + "kl": 0.0509033203125, + "learning_rate": 6.75e-07, + "loss": -0.015754117514006793, + "reward": 2.516376256942749, + "reward_std": 0.41609495878219604, + "rewards/GDino": 0.8531250059604645, + "rewards/GIT": 0.644033282995224, + "rewards/HPSv2": 0.26301002502441406, + "rewards/ORM": 0.7562080323696136, + "self_certainty_semantic": -25.1875, + "self_certainty_token": -22.4375, + "step": 520 + }, + { + "completion_length": 46.1875, + "epoch": 0.5769656699889258, + "grad_norm": 0.6410781145095825, + "kl": 0.0504150390625, + "learning_rate": 6.743749999999999e-07, + "loss": 0.035128320567309856, + "reward": 2.121917724609375, + "reward_std": 0.28992293775081635, + "rewards/GDino": 0.800000011920929, + "rewards/GIT": 0.6208551824092865, + "rewards/HPSv2": 0.25101661682128906, + "rewards/ORM": 0.4500459283590317, + "self_certainty_semantic": -25.0625, + "self_certainty_token": -21.3125, + "step": 521 + }, + { + "completion_length": 50.453125, + "epoch": 0.5780730897009967, + "grad_norm": 0.8646048903465271, + "kl": 0.0638427734375, + "learning_rate": 6.737499999999999e-07, + "loss": 0.029995298013091087, + "reward": 1.3939986824989319, + "reward_std": 0.40169383585453033, + "rewards/GDino": 0.46041668951511383, + "rewards/GIT": 0.14088189601898193, + "rewards/HPSv2": 0.26492977142333984, + "rewards/ORM": 0.5277703106403351, + "self_certainty_semantic": -25.0, + "self_certainty_token": -22.0, + "step": 522 + }, + { + "completion_length": 40.296875, + "epoch": 0.5791805094130675, + "grad_norm": 0.675642728805542, + "kl": 0.0726318359375, + "learning_rate": 6.731249999999999e-07, + "loss": -0.021141795441508293, + "reward": 1.9347034692764282, + "reward_std": 0.49275273084640503, + "rewards/GDino": 0.6882331371307373, + "rewards/GIT": 0.35019098967313766, + "rewards/HPSv2": 0.2879371643066406, + "rewards/ORM": 0.6083420813083649, + "self_certainty_semantic": -25.0, + "self_certainty_token": -21.875, + "step": 523 + }, + { + "completion_length": 42.8125, + "epoch": 0.5802879291251384, + "grad_norm": 0.8465986847877502, + "kl": 0.068359375, + "learning_rate": 6.724999999999999e-07, + "loss": -0.0018713045865297318, + "reward": 2.3469094038009644, + "reward_std": 0.3619401305913925, + "rewards/GDino": 0.8242942988872528, + "rewards/GIT": 0.49616317451000214, + "rewards/HPSv2": 0.28113555908203125, + "rewards/ORM": 0.7453164756298065, + "self_certainty_semantic": -25.0, + "self_certainty_token": -22.1875, + "step": 524 + }, + { + "completion_length": 53.890625, + "epoch": 0.5813953488372093, + "grad_norm": 0.5991634726524353, + "kl": 0.06494140625, + "learning_rate": 6.718749999999999e-07, + "loss": 0.0026051076129078865, + "reward": 1.6451754570007324, + "reward_std": 0.4450817406177521, + "rewards/GDino": 0.6027501821517944, + "rewards/GIT": 0.18362435698509216, + "rewards/HPSv2": 0.2547416687011719, + "rewards/ORM": 0.6040592342615128, + "self_certainty_semantic": -25.125, + "self_certainty_token": -21.9375, + "step": 525 + }, + { + "completion_length": 47.328125, + "epoch": 0.5825027685492802, + "grad_norm": 0.9581777453422546, + "kl": 0.072998046875, + "learning_rate": 6.7125e-07, + "loss": -0.03072733711451292, + "reward": 2.296738624572754, + "reward_std": 0.3741179406642914, + "rewards/GDino": 0.7462376356124878, + "rewards/GIT": 0.4357232600450516, + "rewards/HPSv2": 0.28830718994140625, + "rewards/ORM": 0.8264706134796143, + "self_certainty_semantic": -25.3125, + "self_certainty_token": -22.8125, + "step": 526 + }, + { + "completion_length": 45.140625, + "epoch": 0.583610188261351, + "grad_norm": 0.5074369311332703, + "kl": 0.0526123046875, + "learning_rate": 6.70625e-07, + "loss": 0.03284440189599991, + "reward": 1.8881090879440308, + "reward_std": 0.32576023042201996, + "rewards/GDino": 0.7507218718528748, + "rewards/GIT": 0.3530580550432205, + "rewards/HPSv2": 0.25963783264160156, + "rewards/ORM": 0.524691253900528, + "self_certainty_semantic": -25.1875, + "self_certainty_token": -21.875, + "step": 527 + }, + { + "completion_length": 46.203125, + "epoch": 0.584717607973422, + "grad_norm": 0.5117936730384827, + "kl": 0.0582275390625, + "learning_rate": 6.7e-07, + "loss": 0.03193348180502653, + "reward": 2.1283347606658936, + "reward_std": 0.35501185059547424, + "rewards/GDino": 0.739062488079071, + "rewards/GIT": 0.5069300383329391, + "rewards/HPSv2": 0.2556877136230469, + "rewards/ORM": 0.6266543865203857, + "self_certainty_semantic": -25.0625, + "self_certainty_token": -22.0, + "step": 528 + }, + { + "completion_length": 55.734375, + "epoch": 0.5858250276854928, + "grad_norm": 1.0097836256027222, + "kl": 0.0501708984375, + "learning_rate": 6.69375e-07, + "loss": 0.0001771491952240467, + "reward": 2.1930066347122192, + "reward_std": 0.2546389251947403, + "rewards/GDino": 0.7655479907989502, + "rewards/GIT": 0.4856642931699753, + "rewards/HPSv2": 0.25183677673339844, + "rewards/ORM": 0.6899575591087341, + "self_certainty_semantic": -25.3125, + "self_certainty_token": -21.4375, + "step": 529 + }, + { + "completion_length": 50.078125, + "epoch": 0.5869324473975637, + "grad_norm": 0.8520405888557434, + "kl": 0.064697265625, + "learning_rate": 6.6875e-07, + "loss": -0.019556816201657057, + "reward": 1.9101565480232239, + "reward_std": 0.2041948214173317, + "rewards/GDino": 0.607291653752327, + "rewards/GIT": 0.3278944045305252, + "rewards/HPSv2": 0.2765789031982422, + "rewards/ORM": 0.6983915418386459, + "self_certainty_semantic": -25.1875, + "self_certainty_token": -21.5625, + "step": 530 + }, + { + "completion_length": 42.984375, + "epoch": 0.5880398671096345, + "grad_norm": 0.534782350063324, + "kl": 0.07373046875, + "learning_rate": 6.68125e-07, + "loss": 0.00037076231092214584, + "reward": 1.7505834698677063, + "reward_std": 0.27002371847629547, + "rewards/GDino": 0.5543570518493652, + "rewards/GIT": 0.2095174342393875, + "rewards/HPSv2": 0.2848377227783203, + "rewards/ORM": 0.701871246099472, + "self_certainty_semantic": -25.375, + "self_certainty_token": -22.25, + "step": 531 + }, + { + "completion_length": 37.921875, + "epoch": 0.5891472868217055, + "grad_norm": 0.8290634155273438, + "kl": 0.049072265625, + "learning_rate": 6.675e-07, + "loss": -0.007146769668906927, + "reward": 2.579927086830139, + "reward_std": 0.23733402788639069, + "rewards/GDino": 0.8890624940395355, + "rewards/GIT": 0.6916349530220032, + "rewards/HPSv2": 0.26325035095214844, + "rewards/ORM": 0.7359794676303864, + "self_certainty_semantic": -25.0625, + "self_certainty_token": -21.6875, + "step": 532 + }, + { + "completion_length": 43.734375, + "epoch": 0.5902547065337763, + "grad_norm": 0.6242944598197937, + "kl": 0.05859375, + "learning_rate": 6.66875e-07, + "loss": 0.0026651606895029545, + "reward": 2.338057041168213, + "reward_std": 0.2587292790412903, + "rewards/GDino": 0.7275370955467224, + "rewards/GIT": 0.42005516588687897, + "rewards/HPSv2": 0.2885475158691406, + "rewards/ORM": 0.9019171893596649, + "self_certainty_semantic": -25.25, + "self_certainty_token": -21.9375, + "step": 533 + }, + { + "completion_length": 41.046875, + "epoch": 0.5913621262458472, + "grad_norm": 0.6595937013626099, + "kl": 0.059326171875, + "learning_rate": 6.6625e-07, + "loss": -0.010991135379299521, + "reward": 2.0818028450012207, + "reward_std": 0.30971798300743103, + "rewards/GDino": 0.784375011920929, + "rewards/GIT": 0.4618041664361954, + "rewards/HPSv2": 0.24357986450195312, + "rewards/ORM": 0.5920437723398209, + "self_certainty_semantic": -24.9375, + "self_certainty_token": -21.9375, + "step": 534 + }, + { + "completion_length": 38.921875, + "epoch": 0.592469545957918, + "grad_norm": 0.6231927275657654, + "kl": 0.0654296875, + "learning_rate": 6.65625e-07, + "loss": 0.019878892228007317, + "reward": 1.9771916270256042, + "reward_std": 0.33388233184814453, + "rewards/GDino": 0.5947916507720947, + "rewards/GIT": 0.45582324266433716, + "rewards/HPSv2": 0.27800941467285156, + "rewards/ORM": 0.6485673487186432, + "self_certainty_semantic": -24.9375, + "self_certainty_token": -21.5, + "step": 535 + }, + { + "completion_length": 41.0625, + "epoch": 0.593576965669989, + "grad_norm": 0.6234171986579895, + "kl": 0.0654296875, + "learning_rate": 6.65e-07, + "loss": 0.011152018792927265, + "reward": 1.9419553875923157, + "reward_std": 0.3548129200935364, + "rewards/GDino": 0.7523582577705383, + "rewards/GIT": 0.39946797490119934, + "rewards/HPSv2": 0.2728996276855469, + "rewards/ORM": 0.517229437828064, + "self_certainty_semantic": -25.25, + "self_certainty_token": -22.1875, + "step": 536 + }, + { + "completion_length": 45.0625, + "epoch": 0.5946843853820598, + "grad_norm": 1.1540968418121338, + "kl": 0.09033203125, + "learning_rate": 6.64375e-07, + "loss": 0.013285273686051369, + "reward": 2.122566819190979, + "reward_std": 0.4439195245504379, + "rewards/GDino": 0.7579460740089417, + "rewards/GIT": 0.5050171613693237, + "rewards/HPSv2": 0.2603168487548828, + "rewards/ORM": 0.5992866456508636, + "self_certainty_semantic": -25.3125, + "self_certainty_token": -21.3125, + "step": 537 + }, + { + "completion_length": 48.796875, + "epoch": 0.5957918050941307, + "grad_norm": 1.2856310606002808, + "kl": 0.0609130859375, + "learning_rate": 6.637499999999999e-07, + "loss": 0.005113992607221007, + "reward": 1.5898514986038208, + "reward_std": 0.4098888337612152, + "rewards/GDino": 0.4893845319747925, + "rewards/GIT": 0.10379865020513535, + "rewards/HPSv2": 0.29241943359375, + "rewards/ORM": 0.7042489051818848, + "self_certainty_semantic": -25.0625, + "self_certainty_token": -21.9375, + "step": 538 + }, + { + "completion_length": 49.84375, + "epoch": 0.5968992248062015, + "grad_norm": 0.49107345938682556, + "kl": 0.0557861328125, + "learning_rate": 6.631249999999999e-07, + "loss": -0.026968090794980526, + "reward": 2.1781532764434814, + "reward_std": 0.38456064462661743, + "rewards/GDino": 0.7330729365348816, + "rewards/GIT": 0.5828709006309509, + "rewards/HPSv2": 0.28301048278808594, + "rewards/ORM": 0.579198956489563, + "self_certainty_semantic": -25.125, + "self_certainty_token": -22.0, + "step": 539 + }, + { + "completion_length": 54.734375, + "epoch": 0.5980066445182725, + "grad_norm": 0.5697686672210693, + "kl": 0.0615234375, + "learning_rate": 6.624999999999999e-07, + "loss": -0.012464655563235283, + "reward": 1.8495140075683594, + "reward_std": 0.17595785856246948, + "rewards/GDino": 0.6983504891395569, + "rewards/GIT": 0.305370956659317, + "rewards/HPSv2": 0.26691627502441406, + "rewards/ORM": 0.5788763463497162, + "self_certainty_semantic": -25.1875, + "self_certainty_token": -22.0625, + "step": 540 + }, + { + "completion_length": 48.203125, + "epoch": 0.5991140642303433, + "grad_norm": 0.4882778525352478, + "kl": 0.0517578125, + "learning_rate": 6.618749999999999e-07, + "loss": 0.0061897411942481995, + "reward": 1.9610798954963684, + "reward_std": 0.23265547305345535, + "rewards/GDino": 0.7828344106674194, + "rewards/GIT": 0.5123494043946266, + "rewards/HPSv2": 0.2717914581298828, + "rewards/ORM": 0.3941046893596649, + "self_certainty_semantic": -25.3125, + "self_certainty_token": -23.0, + "step": 541 + }, + { + "completion_length": 46.9375, + "epoch": 0.6002214839424141, + "grad_norm": 0.8239565491676331, + "kl": 0.11328125, + "learning_rate": 6.6125e-07, + "loss": 0.007412359584122896, + "reward": 1.7606784105300903, + "reward_std": 0.41101209819316864, + "rewards/GDino": 0.6593928635120392, + "rewards/GIT": 0.24423927813768387, + "rewards/HPSv2": 0.2684822082519531, + "rewards/ORM": 0.5885640680789948, + "self_certainty_semantic": -24.9375, + "self_certainty_token": -21.5, + "step": 542 + }, + { + "completion_length": 44.96875, + "epoch": 0.6013289036544851, + "grad_norm": 0.5161306858062744, + "kl": 0.053955078125, + "learning_rate": 6.60625e-07, + "loss": 0.019626203924417496, + "reward": 1.916968584060669, + "reward_std": 0.350626140832901, + "rewards/GDino": 0.8481511175632477, + "rewards/GIT": 0.2836497351527214, + "rewards/HPSv2": 0.26445865631103516, + "rewards/ORM": 0.5207091271877289, + "self_certainty_semantic": -25.1875, + "self_certainty_token": -22.0625, + "step": 543 + }, + { + "completion_length": 45.203125, + "epoch": 0.602436323366556, + "grad_norm": 0.7409946918487549, + "kl": 0.0550537109375, + "learning_rate": 6.6e-07, + "loss": 0.0009375737281516194, + "reward": 2.439863920211792, + "reward_std": 0.417378693819046, + "rewards/GDino": 0.8464202582836151, + "rewards/GIT": 0.6017903387546539, + "rewards/HPSv2": 0.2494659423828125, + "rewards/ORM": 0.7421875, + "self_certainty_semantic": -25.0625, + "self_certainty_token": -22.375, + "step": 544 + }, + { + "completion_length": 41.953125, + "epoch": 0.6035437430786268, + "grad_norm": 1.7777810096740723, + "kl": 0.0738525390625, + "learning_rate": 6.59375e-07, + "loss": -0.0030171696562319994, + "reward": 2.1651780009269714, + "reward_std": 0.23080424219369888, + "rewards/GDino": 0.7197917103767395, + "rewards/GIT": 0.6421350538730621, + "rewards/HPSv2": 0.2720012664794922, + "rewards/ORM": 0.53125, + "self_certainty_semantic": -25.25, + "self_certainty_token": -21.0625, + "step": 545 + }, + { + "completion_length": 56.328125, + "epoch": 0.6046511627906976, + "grad_norm": 0.6303988695144653, + "kl": 0.072998046875, + "learning_rate": 6.587499999999999e-07, + "loss": 0.018343636766076088, + "reward": 1.5270656943321228, + "reward_std": 0.33213473856449127, + "rewards/GDino": 0.6508311331272125, + "rewards/GIT": 0.3160218670964241, + "rewards/HPSv2": 0.25780296325683594, + "rewards/ORM": 0.30240975320339203, + "self_certainty_semantic": -25.125, + "self_certainty_token": -22.6875, + "step": 546 + }, + { + "completion_length": 47.578125, + "epoch": 0.6057585825027686, + "grad_norm": 2.654705762863159, + "kl": 0.38427734375, + "learning_rate": 6.581249999999999e-07, + "loss": -0.01044553192332387, + "reward": 2.2832573652267456, + "reward_std": 0.3163461983203888, + "rewards/GDino": 0.7944755554199219, + "rewards/GIT": 0.3390696793794632, + "rewards/HPSv2": 0.2950706481933594, + "rewards/ORM": 0.8546415269374847, + "self_certainty_semantic": -25.125, + "self_certainty_token": -22.3125, + "step": 547 + }, + { + "completion_length": 46.03125, + "epoch": 0.6068660022148394, + "grad_norm": 0.6309791803359985, + "kl": 0.064697265625, + "learning_rate": 6.575e-07, + "loss": 0.018591823987662792, + "reward": 2.0436050295829773, + "reward_std": 0.3530414402484894, + "rewards/GDino": 0.6515625417232513, + "rewards/GIT": 0.35474079102277756, + "rewards/HPSv2": 0.2654685974121094, + "rewards/ORM": 0.771833062171936, + "self_certainty_semantic": -25.125, + "self_certainty_token": -21.8125, + "step": 548 + }, + { + "completion_length": 43.3125, + "epoch": 0.6079734219269103, + "grad_norm": 1.3774304389953613, + "kl": 0.069580078125, + "learning_rate": 6.56875e-07, + "loss": 0.01293690619058907, + "reward": 2.2617204189300537, + "reward_std": 0.34871113300323486, + "rewards/GDino": 0.7810466587543488, + "rewards/GIT": 0.5572087168693542, + "rewards/HPSv2": 0.270965576171875, + "rewards/ORM": 0.6524995565414429, + "self_certainty_semantic": -25.375, + "self_certainty_token": -22.5, + "step": 549 + }, + { + "completion_length": 45.65625, + "epoch": 0.6090808416389811, + "grad_norm": 1.2470241785049438, + "kl": 0.0673828125, + "learning_rate": 6.5625e-07, + "loss": 0.02263116929680109, + "reward": 2.3779579401016235, + "reward_std": 0.21147555857896805, + "rewards/GDino": 0.7546875476837158, + "rewards/GIT": 0.45893964171409607, + "rewards/HPSv2": 0.26897239685058594, + "rewards/ORM": 0.8953584432601929, + "self_certainty_semantic": -25.25, + "self_certainty_token": -21.9375, + "step": 550 + }, + { + "completion_length": 43.9375, + "epoch": 0.6101882613510521, + "grad_norm": 0.9305184483528137, + "kl": 0.088623046875, + "learning_rate": 6.55625e-07, + "loss": 0.002473963424563408, + "reward": 2.317904472351074, + "reward_std": 0.35045669972896576, + "rewards/GDino": 0.8548519015312195, + "rewards/GIT": 0.43362539261579514, + "rewards/HPSv2": 0.29938507080078125, + "rewards/ORM": 0.7300421893596649, + "self_certainty_semantic": -25.25, + "self_certainty_token": -23.1875, + "step": 551 + }, + { + "completion_length": 37.625, + "epoch": 0.6112956810631229, + "grad_norm": 0.545630156993866, + "kl": 0.0693359375, + "learning_rate": 6.55e-07, + "loss": -0.002531451638787985, + "reward": 2.08464252948761, + "reward_std": 0.323154479265213, + "rewards/GDino": 0.8056005537509918, + "rewards/GIT": 0.5031750798225403, + "rewards/HPSv2": 0.2711334228515625, + "rewards/ORM": 0.5047334432601929, + "self_certainty_semantic": -25.0625, + "self_certainty_token": -21.625, + "step": 552 + }, + { + "completion_length": 46.0625, + "epoch": 0.6124031007751938, + "grad_norm": 0.7531204223632812, + "kl": 0.0604248046875, + "learning_rate": 6.54375e-07, + "loss": -0.017301646526902914, + "reward": 1.8435069918632507, + "reward_std": 0.4002540707588196, + "rewards/GDino": 0.6233262568712234, + "rewards/GIT": 0.20414622128009796, + "rewards/HPSv2": 0.27692604064941406, + "rewards/ORM": 0.7391084432601929, + "self_certainty_semantic": -25.1875, + "self_certainty_token": -21.625, + "step": 553 + }, + { + "completion_length": 49.9375, + "epoch": 0.6135105204872646, + "grad_norm": 0.5987286567687988, + "kl": 0.087158203125, + "learning_rate": 6.5375e-07, + "loss": -0.010272628860548139, + "reward": 1.7656413316726685, + "reward_std": 0.474995493888855, + "rewards/GDino": 0.6400487124919891, + "rewards/GIT": 0.2195979207754135, + "rewards/HPSv2": 0.27911949157714844, + "rewards/ORM": 0.6268752217292786, + "self_certainty_semantic": -25.1875, + "self_certainty_token": -22.3125, + "step": 554 + }, + { + "completion_length": 49.140625, + "epoch": 0.6146179401993356, + "grad_norm": 0.5443540215492249, + "kl": 0.0611572265625, + "learning_rate": 6.531249999999999e-07, + "loss": -0.013051381334662437, + "reward": 1.713015377521515, + "reward_std": 0.4085501432418823, + "rewards/GDino": 0.6116544008255005, + "rewards/GIT": 0.3198617100715637, + "rewards/HPSv2": 0.2673988342285156, + "rewards/ORM": 0.5141004621982574, + "self_certainty_semantic": -24.9375, + "self_certainty_token": -22.5625, + "step": 555 + }, + { + "completion_length": 45.171875, + "epoch": 0.6157253599114064, + "grad_norm": 0.6643732786178589, + "kl": 0.081298828125, + "learning_rate": 6.524999999999999e-07, + "loss": -0.016484496649354696, + "reward": 2.402197480201721, + "reward_std": 0.4949145019054413, + "rewards/GDino": 0.8265624642372131, + "rewards/GIT": 0.5415551960468292, + "rewards/HPSv2": 0.24849700927734375, + "rewards/ORM": 0.7855828106403351, + "self_certainty_semantic": -25.1875, + "self_certainty_token": -20.875, + "step": 556 + }, + { + "completion_length": 39.078125, + "epoch": 0.6168327796234773, + "grad_norm": 0.6011128425598145, + "kl": 0.06201171875, + "learning_rate": 6.51875e-07, + "loss": 0.0228618448600173, + "reward": 1.9158042073249817, + "reward_std": 0.4142046123743057, + "rewards/GDino": 0.7750618755817413, + "rewards/GIT": 0.25893043726682663, + "rewards/HPSv2": 0.27260780334472656, + "rewards/ORM": 0.609204113483429, + "self_certainty_semantic": -25.0, + "self_certainty_token": -21.6875, + "step": 557 + }, + { + "completion_length": 51.59375, + "epoch": 0.6179401993355482, + "grad_norm": 0.657895565032959, + "kl": 0.052978515625, + "learning_rate": 6.5125e-07, + "loss": -0.04735218919813633, + "reward": 1.7417017817497253, + "reward_std": 0.2977404296398163, + "rewards/GDino": 0.6653882265090942, + "rewards/GIT": 0.30595987290143967, + "rewards/HPSv2": 0.2765617370605469, + "rewards/ORM": 0.49379195272922516, + "self_certainty_semantic": -25.0625, + "self_certainty_token": -22.125, + "step": 558 + }, + { + "completion_length": 44.828125, + "epoch": 0.6190476190476191, + "grad_norm": 0.6480852365493774, + "kl": 0.065673828125, + "learning_rate": 6.50625e-07, + "loss": 0.022880353964865208, + "reward": 2.21392822265625, + "reward_std": 0.25253190100193024, + "rewards/GDino": 0.7878020107746124, + "rewards/GIT": 0.39861495047807693, + "rewards/HPSv2": 0.2787151336669922, + "rewards/ORM": 0.7487961649894714, + "self_certainty_semantic": -25.25, + "self_certainty_token": -22.1875, + "step": 559 + }, + { + "completion_length": 43.546875, + "epoch": 0.6201550387596899, + "grad_norm": 1.5846436023712158, + "kl": 0.09521484375, + "learning_rate": 6.5e-07, + "loss": 0.013549871277064085, + "reward": 1.3660652041435242, + "reward_std": 0.46586059033870697, + "rewards/GDino": 0.5709560066461563, + "rewards/GIT": 0.2729204297065735, + "rewards/HPSv2": 0.2628517150878906, + "rewards/ORM": 0.25933703780174255, + "self_certainty_semantic": -25.0625, + "self_certainty_token": -22.0625, + "step": 560 + }, + { + "completion_length": 47.953125, + "epoch": 0.6212624584717608, + "grad_norm": 0.6772641539573669, + "kl": 0.087890625, + "learning_rate": 6.49375e-07, + "loss": -0.008351297117769718, + "reward": 1.8580278754234314, + "reward_std": 0.31344228982925415, + "rewards/GDino": 0.6416242718696594, + "rewards/GIT": 0.347418375313282, + "rewards/HPSv2": 0.27796363830566406, + "rewards/ORM": 0.5910216569900513, + "self_certainty_semantic": -25.25, + "self_certainty_token": -21.6875, + "step": 561 + }, + { + "completion_length": 50.21875, + "epoch": 0.6223698781838317, + "grad_norm": 0.513234555721283, + "kl": 0.07275390625, + "learning_rate": 6.4875e-07, + "loss": -0.012114224955439568, + "reward": 2.127378523349762, + "reward_std": 0.2737106531858444, + "rewards/GDino": 0.825808048248291, + "rewards/GIT": 0.4280584901571274, + "rewards/HPSv2": 0.27159881591796875, + "rewards/ORM": 0.6019132137298584, + "self_certainty_semantic": -25.25, + "self_certainty_token": -21.9375, + "step": 562 + }, + { + "completion_length": 45.15625, + "epoch": 0.6234772978959026, + "grad_norm": 0.96412193775177, + "kl": 0.0859375, + "learning_rate": 6.481249999999999e-07, + "loss": -0.03636417631059885, + "reward": 2.0414137840270996, + "reward_std": 0.4964071810245514, + "rewards/GDino": 0.7853887677192688, + "rewards/GIT": 0.43233297765254974, + "rewards/HPSv2": 0.26239585876464844, + "rewards/ORM": 0.5612961649894714, + "self_certainty_semantic": -25.0, + "self_certainty_token": -22.375, + "step": 563 + }, + { + "completion_length": 65.8125, + "epoch": 0.6245847176079734, + "grad_norm": 0.5135129690170288, + "kl": 0.06005859375, + "learning_rate": 6.474999999999999e-07, + "loss": -0.001872258260846138, + "reward": 2.3118677139282227, + "reward_std": 0.3188101053237915, + "rewards/GDino": 0.830009251832962, + "rewards/GIT": 0.41938507556915283, + "rewards/HPSv2": 0.2812232971191406, + "rewards/ORM": 0.78125, + "self_certainty_semantic": -25.125, + "self_certainty_token": -21.875, + "step": 564 + }, + { + "completion_length": 47.171875, + "epoch": 0.6256921373200443, + "grad_norm": 2.157308340072632, + "kl": 0.099365234375, + "learning_rate": 6.46875e-07, + "loss": 0.008208409883081913, + "reward": 1.8709299564361572, + "reward_std": 0.30264636874198914, + "rewards/GDino": 0.7342717945575714, + "rewards/GIT": 0.08047330379486084, + "rewards/HPSv2": 0.2749347686767578, + "rewards/ORM": 0.78125, + "self_certainty_semantic": -25.375, + "self_certainty_token": -22.25, + "step": 565 + }, + { + "completion_length": 52.296875, + "epoch": 0.6267995570321152, + "grad_norm": 1.326660394668579, + "kl": 0.085693359375, + "learning_rate": 6.4625e-07, + "loss": -0.00620048982091248, + "reward": 1.7999188899993896, + "reward_std": 0.1955185905098915, + "rewards/GDino": 0.6784451305866241, + "rewards/GIT": 0.3365602195262909, + "rewards/HPSv2": 0.2645549774169922, + "rewards/ORM": 0.5203584432601929, + "self_certainty_semantic": -25.1875, + "self_certainty_token": -23.5, + "step": 566 + }, + { + "completion_length": 51.6875, + "epoch": 0.627906976744186, + "grad_norm": 1.2671253681182861, + "kl": 0.068115234375, + "learning_rate": 6.45625e-07, + "loss": 0.012142400722950697, + "reward": 2.22979599237442, + "reward_std": 0.3625608831644058, + "rewards/GDino": 0.7203125059604645, + "rewards/GIT": 0.5460533201694489, + "rewards/HPSv2": 0.2686481475830078, + "rewards/ORM": 0.6947819888591766, + "self_certainty_semantic": -25.1875, + "self_certainty_token": -22.1875, + "step": 567 + }, + { + "completion_length": 48.21875, + "epoch": 0.6290143964562569, + "grad_norm": 0.7895189523696899, + "kl": 0.0791015625, + "learning_rate": 6.45e-07, + "loss": -0.01707265805453062, + "reward": 1.8749691247940063, + "reward_std": 0.39064884185791016, + "rewards/GDino": 0.7415328025817871, + "rewards/GIT": 0.42620430886745453, + "rewards/HPSv2": 0.2739849090576172, + "rewards/ORM": 0.4332471340894699, + "self_certainty_semantic": -24.9375, + "self_certainty_token": -21.5, + "step": 568 + }, + { + "completion_length": 44.75, + "epoch": 0.6301218161683277, + "grad_norm": 0.7421867251396179, + "kl": 0.0712890625, + "learning_rate": 6.44375e-07, + "loss": 0.01910808007232845, + "reward": 2.3217188119888306, + "reward_std": 0.3109547942876816, + "rewards/GDino": 0.7889922857284546, + "rewards/GIT": 0.5959710776805878, + "rewards/HPSv2": 0.2705860137939453, + "rewards/ORM": 0.6661693751811981, + "self_certainty_semantic": -25.25, + "self_certainty_token": -21.875, + "step": 569 + }, + { + "completion_length": 56.25, + "epoch": 0.6312292358803987, + "grad_norm": 0.5583733320236206, + "kl": 0.0609130859375, + "learning_rate": 6.4375e-07, + "loss": 0.0022791270166635513, + "reward": 1.8600765466690063, + "reward_std": 0.34499575197696686, + "rewards/GDino": 0.7367187440395355, + "rewards/GIT": 0.5528475940227509, + "rewards/HPSv2": 0.2670764923095703, + "rewards/ORM": 0.30343372374773026, + "self_certainty_semantic": -25.0625, + "self_certainty_token": -22.0, + "step": 570 + }, + { + "completion_length": 46.171875, + "epoch": 0.6323366555924695, + "grad_norm": 0.45145487785339355, + "kl": 0.07373046875, + "learning_rate": 6.431249999999999e-07, + "loss": 0.021353797987103462, + "reward": 2.602142810821533, + "reward_std": 0.2599358335137367, + "rewards/GDino": 0.9237105846405029, + "rewards/GIT": 0.6711409389972687, + "rewards/HPSv2": 0.27599525451660156, + "rewards/ORM": 0.7312959432601929, + "self_certainty_semantic": -25.5625, + "self_certainty_token": -22.4375, + "step": 571 + }, + { + "completion_length": 54.3125, + "epoch": 0.6334440753045404, + "grad_norm": 1.0492053031921387, + "kl": 0.0517578125, + "learning_rate": 6.424999999999999e-07, + "loss": 0.012994482181966305, + "reward": 2.169149875640869, + "reward_std": 0.3728819489479065, + "rewards/GDino": 0.6718750447034836, + "rewards/GIT": 0.4776107966899872, + "rewards/HPSv2": 0.2552928924560547, + "rewards/ORM": 0.7643712162971497, + "self_certainty_semantic": -25.0, + "self_certainty_token": -22.75, + "step": 572 + }, + { + "completion_length": 54.234375, + "epoch": 0.6345514950166113, + "grad_norm": 0.47676345705986023, + "kl": 0.068359375, + "learning_rate": 6.41875e-07, + "loss": 0.009478969499468803, + "reward": 2.287571668624878, + "reward_std": 0.3681768923997879, + "rewards/GDino": 0.8140625059604645, + "rewards/GIT": 0.5471410304307938, + "rewards/HPSv2": 0.26578521728515625, + "rewards/ORM": 0.6605828404426575, + "self_certainty_semantic": -25.0, + "self_certainty_token": -21.6875, + "step": 573 + }, + { + "completion_length": 42.875, + "epoch": 0.6356589147286822, + "grad_norm": 0.7573015689849854, + "kl": 0.07275390625, + "learning_rate": 6.4125e-07, + "loss": -0.0035428733099251986, + "reward": 2.2345346212387085, + "reward_std": 0.26802630722522736, + "rewards/GDino": 0.699855774641037, + "rewards/GIT": 0.45403826981782913, + "rewards/HPSv2": 0.27089691162109375, + "rewards/ORM": 0.8097435235977173, + "self_certainty_semantic": -25.125, + "self_certainty_token": -22.8125, + "step": 574 + }, + { + "completion_length": 38.515625, + "epoch": 0.636766334440753, + "grad_norm": 1.071760892868042, + "kl": 0.071533203125, + "learning_rate": 6.40625e-07, + "loss": -0.007476763799786568, + "reward": 2.5980905294418335, + "reward_std": 0.24887971580028534, + "rewards/GDino": 0.9643161594867706, + "rewards/GIT": 0.7012321054935455, + "rewards/HPSv2": 0.2537078857421875, + "rewards/ORM": 0.6788344085216522, + "self_certainty_semantic": -24.9375, + "self_certainty_token": -22.375, + "step": 575 + }, + { + "completion_length": 56.453125, + "epoch": 0.6378737541528239, + "grad_norm": 1.1068315505981445, + "kl": 0.079833984375, + "learning_rate": 6.4e-07, + "loss": 0.004310366697609425, + "reward": 1.926874816417694, + "reward_std": 0.3794712871313095, + "rewards/GDino": 0.7066236138343811, + "rewards/GIT": 0.3324836865067482, + "rewards/HPSv2": 0.27245521545410156, + "rewards/ORM": 0.615312248468399, + "self_certainty_semantic": -25.125, + "self_certainty_token": -21.5625, + "step": 576 + }, + { + "completion_length": 54.5625, + "epoch": 0.6389811738648948, + "grad_norm": 0.6730782389640808, + "kl": 0.09765625, + "learning_rate": 6.39375e-07, + "loss": -0.009048221400007606, + "reward": 1.715060293674469, + "reward_std": 0.37638457119464874, + "rewards/GDino": 0.6072686910629272, + "rewards/GIT": 0.28505346179008484, + "rewards/HPSv2": 0.2617626190185547, + "rewards/ORM": 0.5609754621982574, + "self_certainty_semantic": -25.125, + "self_certainty_token": -22.25, + "step": 577 + }, + { + "completion_length": 49.078125, + "epoch": 0.6400885935769657, + "grad_norm": 0.5849512219429016, + "kl": 0.078369140625, + "learning_rate": 6.3875e-07, + "loss": -0.0033787088468670845, + "reward": 2.0598644018173218, + "reward_std": 0.3881510943174362, + "rewards/GDino": 0.7094090580940247, + "rewards/GIT": 0.42222118377685547, + "rewards/HPSv2": 0.25662994384765625, + "rewards/ORM": 0.6716042459011078, + "self_certainty_semantic": -25.0625, + "self_certainty_token": -22.375, + "step": 578 + }, + { + "completion_length": 49.859375, + "epoch": 0.6411960132890365, + "grad_norm": 2.933789014816284, + "kl": 0.073486328125, + "learning_rate": 6.38125e-07, + "loss": 0.016461540944874287, + "reward": 2.3235161304473877, + "reward_std": 0.4396570473909378, + "rewards/GDino": 0.7677083313465118, + "rewards/GIT": 0.5476280003786087, + "rewards/HPSv2": 0.2765331268310547, + "rewards/ORM": 0.7316466569900513, + "self_certainty_semantic": -25.125, + "self_certainty_token": -22.5, + "step": 579 + }, + { + "completion_length": 45.90625, + "epoch": 0.6423034330011074, + "grad_norm": 0.9947912096977234, + "kl": 0.072998046875, + "learning_rate": 6.374999999999999e-07, + "loss": -0.015327150467783213, + "reward": 2.086591362953186, + "reward_std": 0.34800001978874207, + "rewards/GDino": 0.7345395088195801, + "rewards/GIT": 0.40558674931526184, + "rewards/HPSv2": 0.2715110778808594, + "rewards/ORM": 0.6749540567398071, + "self_certainty_semantic": -25.0625, + "self_certainty_token": -21.75, + "step": 580 + }, + { + "completion_length": 44.46875, + "epoch": 0.6434108527131783, + "grad_norm": 0.6177259683609009, + "kl": 0.086669921875, + "learning_rate": 6.368749999999999e-07, + "loss": 0.011248688213527203, + "reward": 1.933541178703308, + "reward_std": 0.35859090089797974, + "rewards/GDino": 0.6562248468399048, + "rewards/GIT": 0.44320014119148254, + "rewards/HPSv2": 0.2723674774169922, + "rewards/ORM": 0.561748668551445, + "self_certainty_semantic": -25.0625, + "self_certainty_token": -22.3125, + "step": 581 + }, + { + "completion_length": 38.6875, + "epoch": 0.6445182724252492, + "grad_norm": 0.5649813413619995, + "kl": 0.087890625, + "learning_rate": 6.362499999999999e-07, + "loss": -0.02157511841505766, + "reward": 2.146281599998474, + "reward_std": 0.467723548412323, + "rewards/GDino": 0.817384660243988, + "rewards/GIT": 0.5064477473497391, + "rewards/HPSv2": 0.2739696502685547, + "rewards/ORM": 0.5484794974327087, + "self_certainty_semantic": -25.1875, + "self_certainty_token": -22.75, + "step": 582 + }, + { + "completion_length": 57.4375, + "epoch": 0.64562569213732, + "grad_norm": 1.1887096166610718, + "kl": 0.071044921875, + "learning_rate": 6.35625e-07, + "loss": -0.009709077654406428, + "reward": 1.8209284543991089, + "reward_std": 0.4260648190975189, + "rewards/GDino": 0.7038745880126953, + "rewards/GIT": 0.388409286737442, + "rewards/HPSv2": 0.27118682861328125, + "rewards/ORM": 0.4574578106403351, + "self_certainty_semantic": -25.1875, + "self_certainty_token": -22.0625, + "step": 583 + }, + { + "completion_length": 55.859375, + "epoch": 0.646733111849391, + "grad_norm": 1.0990617275238037, + "kl": 0.10107421875, + "learning_rate": 6.35e-07, + "loss": 4.3497420847415924e-05, + "reward": 1.8322890996932983, + "reward_std": 0.47952717542648315, + "rewards/GDino": 0.6861795485019684, + "rewards/GIT": 0.322199709713459, + "rewards/HPSv2": 0.27663421630859375, + "rewards/ORM": 0.547275647521019, + "self_certainty_semantic": -25.375, + "self_certainty_token": -21.625, + "step": 584 + }, + { + "completion_length": 54.171875, + "epoch": 0.6478405315614618, + "grad_norm": 0.743022620677948, + "kl": 0.099365234375, + "learning_rate": 6.34375e-07, + "loss": -0.007025235798209906, + "reward": 1.7467429041862488, + "reward_std": 0.3646506369113922, + "rewards/GDino": 0.6056530177593231, + "rewards/GIT": 0.2070682942867279, + "rewards/HPSv2": 0.2825050354003906, + "rewards/ORM": 0.6515165567398071, + "self_certainty_semantic": -25.3125, + "self_certainty_token": -21.625, + "step": 585 + }, + { + "completion_length": 47.171875, + "epoch": 0.6489479512735327, + "grad_norm": 0.9150655269622803, + "kl": 0.0694580078125, + "learning_rate": 6.3375e-07, + "loss": -0.008931130170822144, + "reward": 2.2964425086975098, + "reward_std": 0.4491335302591324, + "rewards/GDino": 0.8038741052150726, + "rewards/GIT": 0.5696861445903778, + "rewards/HPSv2": 0.2661018371582031, + "rewards/ORM": 0.656780481338501, + "self_certainty_semantic": -25.25, + "self_certainty_token": -22.875, + "step": 586 + }, + { + "completion_length": 48.75, + "epoch": 0.6500553709856035, + "grad_norm": 0.4439801871776581, + "kl": 0.071533203125, + "learning_rate": 6.33125e-07, + "loss": 0.03945009782910347, + "reward": 2.233885407447815, + "reward_std": 0.44437268376350403, + "rewards/GDino": 0.8034193813800812, + "rewards/GIT": 0.47745344042778015, + "rewards/HPSv2": 0.26355743408203125, + "rewards/ORM": 0.6894551515579224, + "self_certainty_semantic": -25.3125, + "self_certainty_token": -22.25, + "step": 587 + }, + { + "completion_length": 52.828125, + "epoch": 0.6511627906976745, + "grad_norm": 0.7273818254470825, + "kl": 0.104736328125, + "learning_rate": 6.324999999999999e-07, + "loss": -0.04699191078543663, + "reward": 2.418140411376953, + "reward_std": 0.25689420104026794, + "rewards/GDino": 0.729385107755661, + "rewards/GIT": 0.5350233688950539, + "rewards/HPSv2": 0.2727947235107422, + "rewards/ORM": 0.8809372782707214, + "self_certainty_semantic": -25.3125, + "self_certainty_token": -21.8125, + "step": 588 + }, + { + "completion_length": 52.671875, + "epoch": 0.6522702104097453, + "grad_norm": 1.8505158424377441, + "kl": 0.0869140625, + "learning_rate": 6.31875e-07, + "loss": -0.020862075500190258, + "reward": 2.3336164951324463, + "reward_std": 0.19946596771478653, + "rewards/GDino": 0.7767625749111176, + "rewards/GIT": 0.5896276831626892, + "rewards/HPSv2": 0.2785224914550781, + "rewards/ORM": 0.6887038052082062, + "self_certainty_semantic": -25.25, + "self_certainty_token": -21.9375, + "step": 589 + }, + { + "completion_length": 53.5, + "epoch": 0.6533776301218162, + "grad_norm": 0.5300537347793579, + "kl": 0.094482421875, + "learning_rate": 6.3125e-07, + "loss": -0.010553581640124321, + "reward": 2.2597015500068665, + "reward_std": 0.3002111464738846, + "rewards/GDino": 0.7936822772026062, + "rewards/GIT": 0.44989803433418274, + "rewards/HPSv2": 0.27987098693847656, + "rewards/ORM": 0.7362502217292786, + "self_certainty_semantic": -25.1875, + "self_certainty_token": -21.9375, + "step": 590 + }, + { + "completion_length": 42.5625, + "epoch": 0.654485049833887, + "grad_norm": 0.583197295665741, + "kl": 0.0830078125, + "learning_rate": 6.30625e-07, + "loss": 0.015054948627948761, + "reward": 2.329821825027466, + "reward_std": 0.23674922436475754, + "rewards/GDino": 0.7871354520320892, + "rewards/GIT": 0.6012963056564331, + "rewards/HPSv2": 0.2722434997558594, + "rewards/ORM": 0.6691466569900513, + "self_certainty_semantic": -25.0625, + "self_certainty_token": -22.3125, + "step": 591 + }, + { + "completion_length": 50.15625, + "epoch": 0.655592469545958, + "grad_norm": 1.685902714729309, + "kl": 0.08154296875, + "learning_rate": 6.3e-07, + "loss": -0.013527771923691034, + "reward": 1.8206582069396973, + "reward_std": 0.36716659367084503, + "rewards/GDino": 0.675000011920929, + "rewards/GIT": 0.4574429839849472, + "rewards/HPSv2": 0.2741527557373047, + "rewards/ORM": 0.4140624850988388, + "self_certainty_semantic": -25.125, + "self_certainty_token": -22.75, + "step": 592 + }, + { + "completion_length": 56.390625, + "epoch": 0.6566998892580288, + "grad_norm": 0.5346245169639587, + "kl": 0.067626953125, + "learning_rate": 6.29375e-07, + "loss": -0.028868052177131176, + "reward": 2.3374955654144287, + "reward_std": 0.448678582906723, + "rewards/GDino": 0.875, + "rewards/GIT": 0.5937578082084656, + "rewards/HPSv2": 0.2687797546386719, + "rewards/ORM": 0.599958062171936, + "self_certainty_semantic": -25.1875, + "self_certainty_token": -22.0625, + "step": 593 + }, + { + "completion_length": 53.46875, + "epoch": 0.6578073089700996, + "grad_norm": 0.5602062940597534, + "kl": 0.09130859375, + "learning_rate": 6.2875e-07, + "loss": 0.014588311780244112, + "reward": 2.076681613922119, + "reward_std": 0.2987568974494934, + "rewards/GDino": 0.7875000238418579, + "rewards/GIT": 0.46318748593330383, + "rewards/HPSv2": 0.27095603942871094, + "rewards/ORM": 0.555038183927536, + "self_certainty_semantic": -25.25, + "self_certainty_token": -22.1875, + "step": 594 + }, + { + "completion_length": 50.234375, + "epoch": 0.6589147286821705, + "grad_norm": 0.8814988136291504, + "kl": 0.115234375, + "learning_rate": 6.28125e-07, + "loss": -0.004488900303840637, + "reward": 1.7216359376907349, + "reward_std": 0.32969560474157333, + "rewards/GDino": 0.6518171429634094, + "rewards/GIT": 0.2219809964299202, + "rewards/HPSv2": 0.2915458679199219, + "rewards/ORM": 0.5562919676303864, + "self_certainty_semantic": -25.1875, + "self_certainty_token": -22.0, + "step": 595 + }, + { + "completion_length": 44.859375, + "epoch": 0.6600221483942414, + "grad_norm": 0.6903232336044312, + "kl": 0.080322265625, + "learning_rate": 6.274999999999999e-07, + "loss": -0.0038359593600034714, + "reward": 2.2345484495162964, + "reward_std": 0.35491831600666046, + "rewards/GDino": 0.7885048389434814, + "rewards/GIT": 0.49162155389785767, + "rewards/HPSv2": 0.2669219970703125, + "rewards/ORM": 0.6875, + "self_certainty_semantic": -25.3125, + "self_certainty_token": -22.625, + "step": 596 + }, + { + "completion_length": 44.109375, + "epoch": 0.6611295681063123, + "grad_norm": 0.6238202452659607, + "kl": 0.11376953125, + "learning_rate": 6.268749999999999e-07, + "loss": 0.010757120093330741, + "reward": 1.9933987259864807, + "reward_std": 0.43311460316181183, + "rewards/GDino": 0.7434804737567902, + "rewards/GIT": 0.22067713737487793, + "rewards/HPSv2": 0.28347206115722656, + "rewards/ORM": 0.7457689642906189, + "self_certainty_semantic": -25.1875, + "self_certainty_token": -22.5, + "step": 597 + }, + { + "completion_length": 47.0, + "epoch": 0.6622369878183831, + "grad_norm": 0.6406247615814209, + "kl": 0.081787109375, + "learning_rate": 6.262499999999999e-07, + "loss": -0.0002176426351070404, + "reward": 2.7149163484573364, + "reward_std": 0.18262185156345367, + "rewards/GDino": 0.8359375, + "rewards/GIT": 0.7479822635650635, + "rewards/HPSv2": 0.2559967041015625, + "rewards/ORM": 0.875, + "self_certainty_semantic": -25.1875, + "self_certainty_token": -22.4375, + "step": 598 + }, + { + "completion_length": 54.984375, + "epoch": 0.6633444075304541, + "grad_norm": 0.46733200550079346, + "kl": 0.076416015625, + "learning_rate": 6.256249999999999e-07, + "loss": 0.008587295189499855, + "reward": 2.046528160572052, + "reward_std": 0.30855168402194977, + "rewards/GDino": 0.7562500238418579, + "rewards/GIT": 0.4039812982082367, + "rewards/HPSv2": 0.26134681701660156, + "rewards/ORM": 0.624950036406517, + "self_certainty_semantic": -25.1875, + "self_certainty_token": -22.4375, + "step": 599 + }, + { + "completion_length": 67.96875, + "epoch": 0.6644518272425249, + "grad_norm": 0.49819114804267883, + "kl": 0.067626953125, + "learning_rate": 6.249999999999999e-07, + "loss": 0.037328097969293594, + "reward": 2.071701407432556, + "reward_std": 0.4433322548866272, + "rewards/GDino": 0.7885022759437561, + "rewards/GIT": 0.33964233100414276, + "rewards/HPSv2": 0.27628517150878906, + "rewards/ORM": 0.6672714054584503, + "self_certainty_semantic": -25.1875, + "self_certainty_token": -22.4375, + "step": 600 + } + ], + "logging_steps": 1.0, + "max_steps": 1600, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}