{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 50, "global_step": 2699, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.003705934127020892, "grad_norm": 215.50672912597656, "learning_rate": 4.983327158206743e-07, "logits/chosen": -6.070415496826172, "logits/rejected": -6.099751949310303, "logps/chosen": -984.36767578125, "logps/rejected": -897.9577026367188, "loss": 0.6962, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": 0.055581189692020416, "rewards/margins": 0.0006150867557153106, "rewards/rejected": 0.05496610328555107, "step": 10 }, { "epoch": 0.007411868254041784, "grad_norm": 203.74668884277344, "learning_rate": 4.964801778436458e-07, "logits/chosen": -6.290555477142334, "logits/rejected": -6.2849812507629395, "logps/chosen": -932.5144653320312, "logps/rejected": -823.7965698242188, "loss": 0.675, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.27587246894836426, "rewards/margins": 0.05210857465863228, "rewards/rejected": 0.22376389801502228, "step": 20 }, { "epoch": 0.011117802381062676, "grad_norm": 201.33848571777344, "learning_rate": 4.946276398666173e-07, "logits/chosen": -6.240113735198975, "logits/rejected": -6.196782112121582, "logps/chosen": -981.3587646484375, "logps/rejected": -879.1512451171875, "loss": 0.67, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.4677122235298157, "rewards/margins": 0.07471133768558502, "rewards/rejected": 0.39300084114074707, "step": 30 }, { "epoch": 0.014823736508083569, "grad_norm": 201.69285583496094, "learning_rate": 4.927751018895887e-07, "logits/chosen": -6.257163047790527, "logits/rejected": -6.211418151855469, "logps/chosen": -998.1868286132812, "logps/rejected": -941.7491455078125, "loss": 0.6987, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": 0.5558674931526184, "rewards/margins": 0.02628883719444275, "rewards/rejected": 0.5295786261558533, "step": 40 }, { "epoch": 0.01852967063510446, "grad_norm": 213.06088256835938, "learning_rate": 4.909225639125602e-07, "logits/chosen": -6.227558612823486, "logits/rejected": -6.338425636291504, "logps/chosen": -1029.257080078125, "logps/rejected": -952.8382568359375, "loss": 0.6643, "rewards/accuracies": 0.59375, "rewards/chosen": 0.59141606092453, "rewards/margins": 0.09994185715913773, "rewards/rejected": 0.4914742112159729, "step": 50 }, { "epoch": 0.01852967063510446, "eval_logits/chosen": NaN, "eval_logits/rejected": -6.348860263824463, "eval_logps/chosen": -1146.9443359375, "eval_logps/rejected": -1055.7943115234375, "eval_loss": 0.6688504815101624, "eval_rewards/accuracies": 0.5936188101768494, "eval_rewards/chosen": 0.7400967478752136, "eval_rewards/margins": 0.09535637497901917, "eval_rewards/rejected": 0.6447404623031616, "eval_runtime": 173.9141, "eval_samples_per_second": 6.848, "eval_steps_per_second": 6.848, "step": 50 }, { "epoch": 0.02223560476212535, "grad_norm": 173.28237915039062, "learning_rate": 4.890700259355317e-07, "logits/chosen": -6.145205974578857, "logits/rejected": -6.158076763153076, "logps/chosen": -918.4729614257812, "logps/rejected": -808.6856079101562, "loss": 0.6631, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.6317356824874878, "rewards/margins": 0.10025952756404877, "rewards/rejected": 0.5314761400222778, "step": 60 }, { "epoch": 0.025941538889146246, "grad_norm": 201.4814910888672, "learning_rate": 4.872174879585031e-07, "logits/chosen": -5.990462779998779, "logits/rejected": -6.091423988342285, "logps/chosen": -903.2649536132812, "logps/rejected": -869.3302001953125, "loss": 0.6871, "rewards/accuracies": 0.53125, "rewards/chosen": 0.7280756831169128, "rewards/margins": 0.06912466883659363, "rewards/rejected": 0.6589510440826416, "step": 70 }, { "epoch": 0.029647473016167138, "grad_norm": 247.76434326171875, "learning_rate": 4.853649499814746e-07, "logits/chosen": -6.052975177764893, "logits/rejected": NaN, "logps/chosen": -994.2741088867188, "logps/rejected": -887.9376220703125, "loss": 0.6728, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": 0.8349438905715942, "rewards/margins": 0.10054773092269897, "rewards/rejected": 0.7343961000442505, "step": 80 }, { "epoch": 0.03335340714318803, "grad_norm": 241.12693786621094, "learning_rate": 4.835124120044461e-07, "logits/chosen": -6.244847774505615, "logits/rejected": -6.195946216583252, "logps/chosen": -953.9434814453125, "logps/rejected": -816.5172119140625, "loss": 0.6708, "rewards/accuracies": 0.581250011920929, "rewards/chosen": 0.7360488176345825, "rewards/margins": 0.10302430391311646, "rewards/rejected": 0.6330245733261108, "step": 90 }, { "epoch": 0.03705934127020892, "grad_norm": 222.10406494140625, "learning_rate": 4.816598740274175e-07, "logits/chosen": -6.177041530609131, "logits/rejected": -6.0963640213012695, "logps/chosen": -1006.8380737304688, "logps/rejected": -819.0447998046875, "loss": 0.6398, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.8563984036445618, "rewards/margins": 0.1775047481060028, "rewards/rejected": 0.6788936853408813, "step": 100 }, { "epoch": 0.03705934127020892, "eval_logits/chosen": NaN, "eval_logits/rejected": -6.329836368560791, "eval_logps/chosen": -1141.4117431640625, "eval_logps/rejected": -1050.9219970703125, "eval_loss": 0.6667112708091736, "eval_rewards/accuracies": 0.5869017839431763, "eval_rewards/chosen": 1.2933688163757324, "eval_rewards/margins": 0.16138586401939392, "eval_rewards/rejected": 1.1319829225540161, "eval_runtime": 174.0404, "eval_samples_per_second": 6.843, "eval_steps_per_second": 6.843, "step": 100 }, { "epoch": 0.040765275397229815, "grad_norm": 176.17654418945312, "learning_rate": 4.79807336050389e-07, "logits/chosen": NaN, "logits/rejected": -6.241061210632324, "logps/chosen": -934.3814697265625, "logps/rejected": -809.5549926757812, "loss": 0.6532, "rewards/accuracies": 0.625, "rewards/chosen": 1.0543756484985352, "rewards/margins": 0.2122875154018402, "rewards/rejected": 0.8420880436897278, "step": 110 }, { "epoch": 0.0444712095242507, "grad_norm": 205.62350463867188, "learning_rate": 4.779547980733605e-07, "logits/chosen": -6.2480058670043945, "logits/rejected": -6.166928291320801, "logps/chosen": -996.0416259765625, "logps/rejected": -834.2034912109375, "loss": 0.6599, "rewards/accuracies": 0.59375, "rewards/chosen": 1.0299097299575806, "rewards/margins": 0.1862904280424118, "rewards/rejected": 0.8436192274093628, "step": 120 }, { "epoch": 0.0481771436512716, "grad_norm": 191.88278198242188, "learning_rate": 4.7610226009633197e-07, "logits/chosen": -6.192694664001465, "logits/rejected": -6.168017387390137, "logps/chosen": -957.9847412109375, "logps/rejected": -854.0274658203125, "loss": 0.6462, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.9972726702690125, "rewards/margins": 0.20573964715003967, "rewards/rejected": 0.7915329933166504, "step": 130 }, { "epoch": 0.05188307777829249, "grad_norm": 164.00282287597656, "learning_rate": 4.742497221193034e-07, "logits/chosen": -6.139467239379883, "logits/rejected": -6.123991966247559, "logps/chosen": -1019.96435546875, "logps/rejected": -879.1519775390625, "loss": 0.6461, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 1.0306396484375, "rewards/margins": 0.21070317924022675, "rewards/rejected": 0.8199363946914673, "step": 140 }, { "epoch": 0.05558901190531338, "grad_norm": 192.38653564453125, "learning_rate": 4.7239718414227493e-07, "logits/chosen": -6.214751243591309, "logits/rejected": -6.185935020446777, "logps/chosen": -984.6964721679688, "logps/rejected": -895.7721557617188, "loss": 0.661, "rewards/accuracies": 0.606249988079071, "rewards/chosen": 0.9771279096603394, "rewards/margins": 0.13651703298091888, "rewards/rejected": 0.8406108617782593, "step": 150 }, { "epoch": 0.05558901190531338, "eval_logits/chosen": NaN, "eval_logits/rejected": -6.335906982421875, "eval_logps/chosen": -1141.2706298828125, "eval_logps/rejected": -1050.9266357421875, "eval_loss": 0.6631777882575989, "eval_rewards/accuracies": 0.5961377024650574, "eval_rewards/chosen": 1.3074761629104614, "eval_rewards/margins": 0.17596358060836792, "eval_rewards/rejected": 1.1315125226974487, "eval_runtime": 174.258, "eval_samples_per_second": 6.835, "eval_steps_per_second": 6.835, "step": 150 }, { "epoch": 0.059294946032334275, "grad_norm": 174.99664306640625, "learning_rate": 4.705446461652464e-07, "logits/chosen": -6.193015098571777, "logits/rejected": -6.1321492195129395, "logps/chosen": -1026.88037109375, "logps/rejected": -896.2667846679688, "loss": 0.6248, "rewards/accuracies": 0.65625, "rewards/chosen": 1.183638572692871, "rewards/margins": 0.23824377357959747, "rewards/rejected": 0.9453946352005005, "step": 160 }, { "epoch": 0.06300088015935516, "grad_norm": 192.7080078125, "learning_rate": 4.6869210818821784e-07, "logits/chosen": -6.125895023345947, "logits/rejected": -6.105996131896973, "logps/chosen": -891.0772705078125, "logps/rejected": -783.1829223632812, "loss": 0.6062, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 1.0523326396942139, "rewards/margins": 0.29890042543411255, "rewards/rejected": 0.7534322142601013, "step": 170 }, { "epoch": 0.06670681428637606, "grad_norm": 213.39122009277344, "learning_rate": 4.668395702111893e-07, "logits/chosen": -6.171431064605713, "logits/rejected": -6.141777992248535, "logps/chosen": -950.2371215820312, "logps/rejected": -837.3072509765625, "loss": 0.6254, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 1.1676208972930908, "rewards/margins": 0.2787976562976837, "rewards/rejected": 0.8888231515884399, "step": 180 }, { "epoch": 0.07041274841339695, "grad_norm": 187.03749084472656, "learning_rate": 4.649870322341608e-07, "logits/chosen": -6.19333028793335, "logits/rejected": -6.163342475891113, "logps/chosen": -958.0759887695312, "logps/rejected": -845.267578125, "loss": 0.6568, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 1.1464588642120361, "rewards/margins": 0.23536348342895508, "rewards/rejected": 0.9110953211784363, "step": 190 }, { "epoch": 0.07411868254041784, "grad_norm": 150.2332763671875, "learning_rate": 4.6313449425713225e-07, "logits/chosen": -6.098294734954834, "logits/rejected": -6.079904556274414, "logps/chosen": -920.4479370117188, "logps/rejected": -867.9637451171875, "loss": 0.6275, "rewards/accuracies": 0.625, "rewards/chosen": 1.1195321083068848, "rewards/margins": 0.24117548763751984, "rewards/rejected": 0.8783566355705261, "step": 200 }, { "epoch": 0.07411868254041784, "eval_logits/chosen": NaN, "eval_logits/rejected": -6.339965343475342, "eval_logps/chosen": -1140.9739990234375, "eval_logps/rejected": -1050.8843994140625, "eval_loss": 0.6571991443634033, "eval_rewards/accuracies": 0.6154491901397705, "eval_rewards/chosen": 1.3371424674987793, "eval_rewards/margins": 0.20140083134174347, "eval_rewards/rejected": 1.1357417106628418, "eval_runtime": 174.4949, "eval_samples_per_second": 6.825, "eval_steps_per_second": 6.825, "step": 200 }, { "epoch": 0.07782461666743874, "grad_norm": 189.8514862060547, "learning_rate": 4.6128195628010375e-07, "logits/chosen": -6.194195747375488, "logits/rejected": -6.146265983581543, "logps/chosen": -915.0256958007812, "logps/rejected": -845.5064697265625, "loss": 0.6606, "rewards/accuracies": 0.65625, "rewards/chosen": 0.961922287940979, "rewards/margins": 0.19818060100078583, "rewards/rejected": 0.763741672039032, "step": 210 }, { "epoch": 0.08153055079445963, "grad_norm": 189.69139099121094, "learning_rate": 4.594294183030752e-07, "logits/chosen": -6.284877300262451, "logits/rejected": -6.263852119445801, "logps/chosen": -1026.146484375, "logps/rejected": -913.3385009765625, "loss": 0.6511, "rewards/accuracies": 0.625, "rewards/chosen": 0.9875092506408691, "rewards/margins": 0.21295031905174255, "rewards/rejected": 0.7745589017868042, "step": 220 }, { "epoch": 0.08523648492148052, "grad_norm": 264.7326354980469, "learning_rate": 4.575768803260467e-07, "logits/chosen": -6.017802715301514, "logits/rejected": -6.151331424713135, "logps/chosen": -846.173828125, "logps/rejected": -828.7672119140625, "loss": 0.6848, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": 0.7509580254554749, "rewards/margins": 0.0957236960530281, "rewards/rejected": 0.6552343368530273, "step": 230 }, { "epoch": 0.0889424190485014, "grad_norm": 224.5631561279297, "learning_rate": 4.557243423490181e-07, "logits/chosen": -6.283064365386963, "logits/rejected": -6.176108360290527, "logps/chosen": -940.2021484375, "logps/rejected": -812.8372802734375, "loss": 0.6557, "rewards/accuracies": 0.625, "rewards/chosen": 0.8100060224533081, "rewards/margins": 0.16767463088035583, "rewards/rejected": 0.6423314213752747, "step": 240 }, { "epoch": 0.09264835317552231, "grad_norm": 176.48463439941406, "learning_rate": 4.538718043719896e-07, "logits/chosen": -6.0912933349609375, "logits/rejected": -6.187921524047852, "logps/chosen": -1011.1219482421875, "logps/rejected": -848.7685546875, "loss": 0.6152, "rewards/accuracies": 0.65625, "rewards/chosen": 0.857901930809021, "rewards/margins": 0.25349634885787964, "rewards/rejected": 0.6044055819511414, "step": 250 }, { "epoch": 0.09264835317552231, "eval_logits/chosen": NaN, "eval_logits/rejected": -6.341000080108643, "eval_logps/chosen": -1144.0086669921875, "eval_logps/rejected": -1053.47119140625, "eval_loss": 0.6642729043960571, "eval_rewards/accuracies": 0.5801846981048584, "eval_rewards/chosen": 1.0336804389953613, "eval_rewards/margins": 0.1566334068775177, "eval_rewards/rejected": 0.8770471215248108, "eval_runtime": 174.2528, "eval_samples_per_second": 6.835, "eval_steps_per_second": 6.835, "step": 250 }, { "epoch": 0.0963542873025432, "grad_norm": 194.99929809570312, "learning_rate": 4.5201926639496107e-07, "logits/chosen": -6.046469688415527, "logits/rejected": -6.070072650909424, "logps/chosen": -825.3790283203125, "logps/rejected": -784.54052734375, "loss": 0.6457, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": 0.7765441536903381, "rewards/margins": 0.17580363154411316, "rewards/rejected": 0.6007404923439026, "step": 260 }, { "epoch": 0.10006022142956408, "grad_norm": 232.19290161132812, "learning_rate": 4.5016672841793257e-07, "logits/chosen": -6.138308525085449, "logits/rejected": -6.1265411376953125, "logps/chosen": -966.5224609375, "logps/rejected": -880.6701049804688, "loss": 0.6047, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.0221220254898071, "rewards/margins": 0.32319146394729614, "rewards/rejected": 0.6989305019378662, "step": 270 }, { "epoch": 0.10376615555658499, "grad_norm": 199.44570922851562, "learning_rate": 4.48314190440904e-07, "logits/chosen": -6.254446983337402, "logits/rejected": -6.203383922576904, "logps/chosen": -974.5589599609375, "logps/rejected": -894.5767822265625, "loss": 0.6625, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 0.9916531443595886, "rewards/margins": 0.18319669365882874, "rewards/rejected": 0.8084564208984375, "step": 280 }, { "epoch": 0.10747208968360587, "grad_norm": 195.33485412597656, "learning_rate": 4.4646165246387553e-07, "logits/chosen": -6.29571533203125, "logits/rejected": -6.212879180908203, "logps/chosen": -883.5402221679688, "logps/rejected": -783.243408203125, "loss": 0.6307, "rewards/accuracies": 0.625, "rewards/chosen": 0.9049898386001587, "rewards/margins": 0.2685369849205017, "rewards/rejected": 0.6364529132843018, "step": 290 }, { "epoch": 0.11117802381062676, "grad_norm": 214.13623046875, "learning_rate": 4.44609114486847e-07, "logits/chosen": NaN, "logits/rejected": -6.118219375610352, "logps/chosen": -953.02001953125, "logps/rejected": -822.9397583007812, "loss": 0.6433, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 0.9801859855651855, "rewards/margins": 0.20221543312072754, "rewards/rejected": 0.7779706120491028, "step": 300 }, { "epoch": 0.11117802381062676, "eval_logits/chosen": NaN, "eval_logits/rejected": -6.331676483154297, "eval_logps/chosen": -1140.791748046875, "eval_logps/rejected": -1050.74365234375, "eval_loss": 0.6641189455986023, "eval_rewards/accuracies": 0.6078925132751465, "eval_rewards/chosen": 1.355363130569458, "eval_rewards/margins": 0.20555777847766876, "eval_rewards/rejected": 1.1498054265975952, "eval_runtime": 174.1181, "eval_samples_per_second": 6.84, "eval_steps_per_second": 6.84, "step": 300 }, { "epoch": 0.11488395793764766, "grad_norm": 151.80670166015625, "learning_rate": 4.4275657650981843e-07, "logits/chosen": -6.154031276702881, "logits/rejected": -6.207782745361328, "logps/chosen": -944.8739013671875, "logps/rejected": -844.357421875, "loss": 0.6034, "rewards/accuracies": 0.731249988079071, "rewards/chosen": 1.1336545944213867, "rewards/margins": 0.3580685555934906, "rewards/rejected": 0.7755860090255737, "step": 310 }, { "epoch": 0.11858989206466855, "grad_norm": 199.51202392578125, "learning_rate": 4.409040385327899e-07, "logits/chosen": -6.148090839385986, "logits/rejected": -6.058573246002197, "logps/chosen": -870.2352294921875, "logps/rejected": -785.9710693359375, "loss": 0.6577, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 1.0765498876571655, "rewards/margins": 0.24500849843025208, "rewards/rejected": 0.8315415382385254, "step": 320 }, { "epoch": 0.12229582619168944, "grad_norm": 178.98838806152344, "learning_rate": 4.390515005557614e-07, "logits/chosen": -6.161218166351318, "logits/rejected": -6.021878242492676, "logps/chosen": -900.61279296875, "logps/rejected": -716.9248046875, "loss": 0.5969, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 1.0982271432876587, "rewards/margins": 0.3538793623447418, "rewards/rejected": 0.7443478107452393, "step": 330 }, { "epoch": 0.12600176031871033, "grad_norm": 193.34494018554688, "learning_rate": 4.3719896257873284e-07, "logits/chosen": -6.12771463394165, "logits/rejected": -6.165745735168457, "logps/chosen": -938.45068359375, "logps/rejected": -867.9703369140625, "loss": 0.673, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 1.1986232995986938, "rewards/margins": 0.20064759254455566, "rewards/rejected": 0.9979757070541382, "step": 340 }, { "epoch": 0.12970769444573121, "grad_norm": 176.0118865966797, "learning_rate": 4.3534642460170435e-07, "logits/chosen": -6.143977165222168, "logits/rejected": -6.176082611083984, "logps/chosen": -921.1917114257812, "logps/rejected": -874.5079345703125, "loss": 0.623, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 1.1385135650634766, "rewards/margins": 0.2807597219944, "rewards/rejected": 0.8577538728713989, "step": 350 }, { "epoch": 0.12970769444573121, "eval_logits/chosen": NaN, "eval_logits/rejected": -6.329718589782715, "eval_logps/chosen": -1140.4703369140625, "eval_logps/rejected": -1050.4912109375, "eval_loss": 0.6621597409248352, "eval_rewards/accuracies": 0.6011754870414734, "eval_rewards/chosen": 1.3875113725662231, "eval_rewards/margins": 0.21245607733726501, "eval_rewards/rejected": 1.1750552654266357, "eval_runtime": 173.5305, "eval_samples_per_second": 6.863, "eval_steps_per_second": 6.863, "step": 350 }, { "epoch": 0.13341362857275213, "grad_norm": 191.9296875, "learning_rate": 4.334938866246758e-07, "logits/chosen": -6.161706447601318, "logits/rejected": -6.022977828979492, "logps/chosen": -955.3453979492188, "logps/rejected": -816.9100341796875, "loss": 0.6297, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 1.105089545249939, "rewards/margins": 0.2945402264595032, "rewards/rejected": 0.8105493783950806, "step": 360 }, { "epoch": 0.13711956269977302, "grad_norm": 208.82858276367188, "learning_rate": 4.3164134864764725e-07, "logits/chosen": -6.0739240646362305, "logits/rejected": -6.2275519371032715, "logps/chosen": -819.2732543945312, "logps/rejected": -865.2078247070312, "loss": 0.7031, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.8856269717216492, "rewards/margins": 0.11217892169952393, "rewards/rejected": 0.77344810962677, "step": 370 }, { "epoch": 0.1408254968267939, "grad_norm": 180.92535400390625, "learning_rate": 4.297888106706187e-07, "logits/chosen": -6.172797203063965, "logits/rejected": -6.124629497528076, "logps/chosen": -935.8492431640625, "logps/rejected": -818.2625732421875, "loss": 0.6061, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 1.0353577136993408, "rewards/margins": 0.3515828251838684, "rewards/rejected": 0.683775007724762, "step": 380 }, { "epoch": 0.1445314309538148, "grad_norm": 214.0187225341797, "learning_rate": 4.2793627269359016e-07, "logits/chosen": -6.215329170227051, "logits/rejected": -6.21909236907959, "logps/chosen": -1081.975341796875, "logps/rejected": -913.9468994140625, "loss": 0.5974, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.190735101699829, "rewards/margins": 0.33388403058052063, "rewards/rejected": 0.8568509817123413, "step": 390 }, { "epoch": 0.14823736508083568, "grad_norm": 176.70887756347656, "learning_rate": 4.2608373471656166e-07, "logits/chosen": -6.156098365783691, "logits/rejected": -6.209042072296143, "logps/chosen": -976.1398315429688, "logps/rejected": -873.5931396484375, "loss": 0.6532, "rewards/accuracies": 0.59375, "rewards/chosen": 1.1283199787139893, "rewards/margins": 0.22726468741893768, "rewards/rejected": 0.9010552167892456, "step": 400 }, { "epoch": 0.14823736508083568, "eval_logits/chosen": NaN, "eval_logits/rejected": -6.327154636383057, "eval_logps/chosen": -1140.1060791015625, "eval_logps/rejected": -1050.1492919921875, "eval_loss": 0.6659889817237854, "eval_rewards/accuracies": 0.6028547286987305, "eval_rewards/chosen": 1.4239270687103271, "eval_rewards/margins": 0.2146778702735901, "eval_rewards/rejected": 1.2092490196228027, "eval_runtime": 173.6869, "eval_samples_per_second": 6.857, "eval_steps_per_second": 6.857, "step": 400 }, { "epoch": 0.15194329920785657, "grad_norm": 169.63328552246094, "learning_rate": 4.242311967395331e-07, "logits/chosen": -6.142382621765137, "logits/rejected": -6.167304992675781, "logps/chosen": -947.6598510742188, "logps/rejected": -787.0640869140625, "loss": 0.6019, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 1.2996371984481812, "rewards/margins": 0.3752737045288086, "rewards/rejected": 0.9243636131286621, "step": 410 }, { "epoch": 0.15564923333487748, "grad_norm": 270.06866455078125, "learning_rate": 4.223786587625046e-07, "logits/chosen": -6.093822002410889, "logits/rejected": -6.110901832580566, "logps/chosen": -937.8591918945312, "logps/rejected": -855.3360595703125, "loss": 0.6348, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 1.346130132675171, "rewards/margins": 0.3189659118652344, "rewards/rejected": 1.0271642208099365, "step": 420 }, { "epoch": 0.15935516746189837, "grad_norm": 124.60543823242188, "learning_rate": 4.2052612078547607e-07, "logits/chosen": -6.153736591339111, "logits/rejected": -6.0936784744262695, "logps/chosen": -891.9788208007812, "logps/rejected": -771.3781127929688, "loss": 0.6269, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 1.1474323272705078, "rewards/margins": 0.3255314826965332, "rewards/rejected": 0.8219007253646851, "step": 430 }, { "epoch": 0.16306110158891926, "grad_norm": 169.39234924316406, "learning_rate": 4.186735828084476e-07, "logits/chosen": -6.129828453063965, "logits/rejected": -6.149449348449707, "logps/chosen": -891.6807861328125, "logps/rejected": -785.4395751953125, "loss": 0.6103, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 1.2507822513580322, "rewards/margins": 0.34615927934646606, "rewards/rejected": 0.9046230316162109, "step": 440 }, { "epoch": 0.16676703571594015, "grad_norm": 182.51864624023438, "learning_rate": 4.16821044831419e-07, "logits/chosen": -6.106880187988281, "logits/rejected": -6.003333568572998, "logps/chosen": -994.0611572265625, "logps/rejected": -866.1448974609375, "loss": 0.6798, "rewards/accuracies": 0.59375, "rewards/chosen": 1.1819612979888916, "rewards/margins": 0.22734245657920837, "rewards/rejected": 0.9546189308166504, "step": 450 }, { "epoch": 0.16676703571594015, "eval_logits/chosen": NaN, "eval_logits/rejected": -6.315321445465088, "eval_logps/chosen": -1139.016845703125, "eval_logps/rejected": -1049.19189453125, "eval_loss": 0.6655510067939758, "eval_rewards/accuracies": 0.6053736209869385, "eval_rewards/chosen": 1.5328552722930908, "eval_rewards/margins": 0.22787250578403473, "eval_rewards/rejected": 1.3049829006195068, "eval_runtime": 174.0654, "eval_samples_per_second": 6.842, "eval_steps_per_second": 6.842, "step": 450 }, { "epoch": 0.17047296984296104, "grad_norm": 175.39608764648438, "learning_rate": 4.149685068543905e-07, "logits/chosen": -6.059579372406006, "logits/rejected": -6.104693412780762, "logps/chosen": -972.4683837890625, "logps/rejected": -834.6124877929688, "loss": 0.6116, "rewards/accuracies": 0.606249988079071, "rewards/chosen": 1.3684813976287842, "rewards/margins": 0.38127750158309937, "rewards/rejected": 0.9872040748596191, "step": 460 }, { "epoch": 0.17417890396998192, "grad_norm": 230.60934448242188, "learning_rate": 4.1311596887736194e-07, "logits/chosen": -6.06960916519165, "logits/rejected": -6.0307111740112305, "logps/chosen": -868.5339965820312, "logps/rejected": -845.7874755859375, "loss": 0.6496, "rewards/accuracies": 0.65625, "rewards/chosen": 1.1452016830444336, "rewards/margins": 0.25945180654525757, "rewards/rejected": 0.8857498168945312, "step": 470 }, { "epoch": 0.1778848380970028, "grad_norm": 196.1241912841797, "learning_rate": 4.1126343090033344e-07, "logits/chosen": -6.163808822631836, "logits/rejected": -6.103111267089844, "logps/chosen": -993.4736328125, "logps/rejected": -810.2939453125, "loss": 0.5573, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 1.1358219385147095, "rewards/margins": 0.4707656502723694, "rewards/rejected": 0.6650562286376953, "step": 480 }, { "epoch": 0.18159077222402373, "grad_norm": 225.72552490234375, "learning_rate": 4.094108929233049e-07, "logits/chosen": -6.20804500579834, "logits/rejected": -6.198565483093262, "logps/chosen": -967.2512817382812, "logps/rejected": -881.41552734375, "loss": 0.6359, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 1.2158067226409912, "rewards/margins": 0.3084403872489929, "rewards/rejected": 0.9073662757873535, "step": 490 }, { "epoch": 0.18529670635104462, "grad_norm": 212.1605224609375, "learning_rate": 4.075583549462764e-07, "logits/chosen": -6.132593631744385, "logits/rejected": -6.051444053649902, "logps/chosen": -943.2779541015625, "logps/rejected": -822.4968872070312, "loss": 0.6209, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 1.2602580785751343, "rewards/margins": 0.3520536720752716, "rewards/rejected": 0.9082044363021851, "step": 500 }, { "epoch": 0.18529670635104462, "eval_logits/chosen": NaN, "eval_logits/rejected": -6.321292400360107, "eval_logps/chosen": -1139.3175048828125, "eval_logps/rejected": -1049.5101318359375, "eval_loss": 0.6620848774909973, "eval_rewards/accuracies": 0.6011754870414734, "eval_rewards/chosen": 1.5027841329574585, "eval_rewards/margins": 0.2296140044927597, "eval_rewards/rejected": 1.2731702327728271, "eval_runtime": 174.6102, "eval_samples_per_second": 6.821, "eval_steps_per_second": 6.821, "step": 500 }, { "epoch": 0.1890026404780655, "grad_norm": 153.60992431640625, "learning_rate": 4.0570581696924785e-07, "logits/chosen": -5.990109443664551, "logits/rejected": -5.986026763916016, "logps/chosen": -898.7940673828125, "logps/rejected": -799.6437377929688, "loss": 0.6165, "rewards/accuracies": 0.65625, "rewards/chosen": 1.1576520204544067, "rewards/margins": 0.3633851110935211, "rewards/rejected": 0.794266939163208, "step": 510 }, { "epoch": 0.1927085746050864, "grad_norm": 172.2601776123047, "learning_rate": 4.038532789922193e-07, "logits/chosen": -6.226175308227539, "logits/rejected": -6.190736293792725, "logps/chosen": -883.1309814453125, "logps/rejected": -794.0902099609375, "loss": 0.6212, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 1.1897079944610596, "rewards/margins": 0.35320332646369934, "rewards/rejected": 0.8365045785903931, "step": 520 }, { "epoch": 0.19641450873210728, "grad_norm": 192.50827026367188, "learning_rate": 4.0200074101519076e-07, "logits/chosen": -6.055853843688965, "logits/rejected": NaN, "logps/chosen": -956.99267578125, "logps/rejected": -887.1090087890625, "loss": 0.6176, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": 1.1858432292938232, "rewards/margins": 0.29170137643814087, "rewards/rejected": 0.8941418528556824, "step": 530 }, { "epoch": 0.20012044285912817, "grad_norm": 207.009521484375, "learning_rate": 4.0014820303816226e-07, "logits/chosen": -6.144883155822754, "logits/rejected": NaN, "logps/chosen": -929.91943359375, "logps/rejected": -834.5404052734375, "loss": 0.6305, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 0.9971332550048828, "rewards/margins": 0.3494023382663727, "rewards/rejected": 0.6477310061454773, "step": 540 }, { "epoch": 0.20382637698614908, "grad_norm": 224.73924255371094, "learning_rate": 3.982956650611337e-07, "logits/chosen": -6.141475677490234, "logits/rejected": -6.262620449066162, "logps/chosen": -966.7326049804688, "logps/rejected": -890.4728393554688, "loss": 0.6286, "rewards/accuracies": 0.625, "rewards/chosen": 0.9653292894363403, "rewards/margins": 0.29025983810424805, "rewards/rejected": 0.6750694513320923, "step": 550 }, { "epoch": 0.20382637698614908, "eval_logits/chosen": NaN, "eval_logits/rejected": -6.334640979766846, "eval_logps/chosen": -1142.1336669921875, "eval_logps/rejected": -1052.1123046875, "eval_loss": 0.660417377948761, "eval_rewards/accuracies": 0.6179680824279785, "eval_rewards/chosen": 1.2211687564849854, "eval_rewards/margins": 0.2082298845052719, "eval_rewards/rejected": 1.0129389762878418, "eval_runtime": 174.5819, "eval_samples_per_second": 6.822, "eval_steps_per_second": 6.822, "step": 550 }, { "epoch": 0.20753231111316997, "grad_norm": 318.4903869628906, "learning_rate": 3.964431270841052e-07, "logits/chosen": -6.1527791023254395, "logits/rejected": -6.1851677894592285, "logps/chosen": -932.6305541992188, "logps/rejected": -875.3505859375, "loss": 0.7095, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": 1.0057893991470337, "rewards/margins": 0.13861322402954102, "rewards/rejected": 0.8671760559082031, "step": 560 }, { "epoch": 0.21123824524019086, "grad_norm": 176.8316192626953, "learning_rate": 3.9459058910707667e-07, "logits/chosen": -6.179142951965332, "logits/rejected": -6.174668788909912, "logps/chosen": -1010.54052734375, "logps/rejected": -875.7761840820312, "loss": 0.607, "rewards/accuracies": 0.6875, "rewards/chosen": 1.096440076828003, "rewards/margins": 0.36430811882019043, "rewards/rejected": 0.7321318984031677, "step": 570 }, { "epoch": 0.21494417936721175, "grad_norm": 215.4343719482422, "learning_rate": 3.927380511300482e-07, "logits/chosen": -6.139018535614014, "logits/rejected": -6.070583343505859, "logps/chosen": -961.7306518554688, "logps/rejected": -831.40380859375, "loss": 0.6366, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 1.0454758405685425, "rewards/margins": 0.31583863496780396, "rewards/rejected": 0.7296372652053833, "step": 580 }, { "epoch": 0.21865011349423263, "grad_norm": 202.57000732421875, "learning_rate": 3.908855131530196e-07, "logits/chosen": -6.21251106262207, "logits/rejected": -6.157750606536865, "logps/chosen": -1023.9791259765625, "logps/rejected": -936.6031494140625, "loss": 0.6376, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 1.146539568901062, "rewards/margins": 0.26862841844558716, "rewards/rejected": 0.8779112100601196, "step": 590 }, { "epoch": 0.22235604762125352, "grad_norm": 182.68316650390625, "learning_rate": 3.890329751759911e-07, "logits/chosen": -6.045320987701416, "logits/rejected": -6.124794006347656, "logps/chosen": -956.4278564453125, "logps/rejected": -807.8424682617188, "loss": 0.6274, "rewards/accuracies": 0.625, "rewards/chosen": 1.1107687950134277, "rewards/margins": 0.26531320810317993, "rewards/rejected": 0.845455527305603, "step": 600 }, { "epoch": 0.22235604762125352, "eval_logits/chosen": NaN, "eval_logits/rejected": -6.337012767791748, "eval_logps/chosen": -1139.7646484375, "eval_logps/rejected": -1050.012451171875, "eval_loss": 0.6612704396247864, "eval_rewards/accuracies": 0.6179680824279785, "eval_rewards/chosen": 1.458066701889038, "eval_rewards/margins": 0.23514851927757263, "eval_rewards/rejected": 1.2229182720184326, "eval_runtime": 174.599, "eval_samples_per_second": 6.821, "eval_steps_per_second": 6.821, "step": 600 }, { "epoch": 0.22606198174827444, "grad_norm": 190.41612243652344, "learning_rate": 3.8718043719896253e-07, "logits/chosen": -6.1819024085998535, "logits/rejected": -6.2130818367004395, "logps/chosen": -982.65869140625, "logps/rejected": -875.1731567382812, "loss": 0.6125, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 1.2317478656768799, "rewards/margins": 0.363824725151062, "rewards/rejected": 0.8679230809211731, "step": 610 }, { "epoch": 0.22976791587529533, "grad_norm": 147.1990966796875, "learning_rate": 3.8532789922193404e-07, "logits/chosen": -6.232724666595459, "logits/rejected": -6.308589458465576, "logps/chosen": -914.0718994140625, "logps/rejected": -837.0066528320312, "loss": 0.6067, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 1.2401645183563232, "rewards/margins": 0.37453925609588623, "rewards/rejected": 0.8656252026557922, "step": 620 }, { "epoch": 0.2334738500023162, "grad_norm": 200.28050231933594, "learning_rate": 3.834753612449055e-07, "logits/chosen": -6.240880012512207, "logits/rejected": -6.253479957580566, "logps/chosen": -951.05712890625, "logps/rejected": -853.3023681640625, "loss": 0.6128, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 1.3768174648284912, "rewards/margins": 0.3513033986091614, "rewards/rejected": 1.0255142450332642, "step": 630 }, { "epoch": 0.2371797841293371, "grad_norm": 223.86203002929688, "learning_rate": 3.81622823267877e-07, "logits/chosen": -6.115043640136719, "logits/rejected": -6.189513206481934, "logps/chosen": -951.8463745117188, "logps/rejected": -889.0245971679688, "loss": 0.706, "rewards/accuracies": 0.581250011920929, "rewards/chosen": 1.218057632446289, "rewards/margins": 0.1631278544664383, "rewards/rejected": 1.0549296140670776, "step": 640 }, { "epoch": 0.240885718256358, "grad_norm": 211.12872314453125, "learning_rate": 3.7977028529084845e-07, "logits/chosen": -6.2889084815979, "logits/rejected": -6.2076416015625, "logps/chosen": -1071.9197998046875, "logps/rejected": -939.8733520507812, "loss": 0.6382, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 1.2747665643692017, "rewards/margins": 0.3331315815448761, "rewards/rejected": 0.941635012626648, "step": 650 }, { "epoch": 0.240885718256358, "eval_logits/chosen": NaN, "eval_logits/rejected": -6.333639621734619, "eval_logps/chosen": -1140.6971435546875, "eval_logps/rejected": -1050.910888671875, "eval_loss": 0.6616818904876709, "eval_rewards/accuracies": 0.6204869747161865, "eval_rewards/chosen": 1.36481773853302, "eval_rewards/margins": 0.23173516988754272, "eval_rewards/rejected": 1.133082628250122, "eval_runtime": 174.4739, "eval_samples_per_second": 6.826, "eval_steps_per_second": 6.826, "step": 650 }, { "epoch": 0.24459165238337888, "grad_norm": 157.5903778076172, "learning_rate": 3.779177473138199e-07, "logits/chosen": -6.22428560256958, "logits/rejected": -6.160924434661865, "logps/chosen": -881.8909301757812, "logps/rejected": -848.2203979492188, "loss": 0.6146, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 1.0156034231185913, "rewards/margins": 0.35982877016067505, "rewards/rejected": 0.6557747721672058, "step": 660 }, { "epoch": 0.24829758651039976, "grad_norm": 210.9625244140625, "learning_rate": 3.7606520933679135e-07, "logits/chosen": -6.125003337860107, "logits/rejected": -6.0754289627075195, "logps/chosen": -884.8665771484375, "logps/rejected": -789.2760620117188, "loss": 0.651, "rewards/accuracies": 0.65625, "rewards/chosen": 1.0800001621246338, "rewards/margins": 0.25831884145736694, "rewards/rejected": 0.8216812014579773, "step": 670 }, { "epoch": 0.25200352063742065, "grad_norm": 172.888916015625, "learning_rate": 3.7421267135976286e-07, "logits/chosen": -6.232366561889648, "logits/rejected": -6.1383185386657715, "logps/chosen": -960.3611450195312, "logps/rejected": -835.4730224609375, "loss": 0.6177, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 1.2130458354949951, "rewards/margins": 0.35953769087791443, "rewards/rejected": 0.8535081744194031, "step": 680 }, { "epoch": 0.25570945476444157, "grad_norm": 186.8795623779297, "learning_rate": 3.723601333827343e-07, "logits/chosen": -6.141107082366943, "logits/rejected": -6.195657253265381, "logps/chosen": -938.1901245117188, "logps/rejected": -814.3914184570312, "loss": 0.6728, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.9760942459106445, "rewards/margins": 0.2641645669937134, "rewards/rejected": 0.7119296789169312, "step": 690 }, { "epoch": 0.25941538889146243, "grad_norm": 152.13043212890625, "learning_rate": 3.705075954057058e-07, "logits/chosen": -6.24930477142334, "logits/rejected": -6.1736040115356445, "logps/chosen": -936.4893798828125, "logps/rejected": -848.609375, "loss": 0.5967, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 1.0887714624404907, "rewards/margins": 0.3751378059387207, "rewards/rejected": 0.71363365650177, "step": 700 }, { "epoch": 0.25941538889146243, "eval_logits/chosen": NaN, "eval_logits/rejected": -6.339428901672363, "eval_logps/chosen": -1141.5142822265625, "eval_logps/rejected": -1051.634521484375, "eval_loss": 0.6639354825019836, "eval_rewards/accuracies": 0.6154491901397705, "eval_rewards/chosen": 1.2831051349639893, "eval_rewards/margins": 0.22238638997077942, "eval_rewards/rejected": 1.0607186555862427, "eval_runtime": 174.5031, "eval_samples_per_second": 6.825, "eval_steps_per_second": 6.825, "step": 700 }, { "epoch": 0.26312132301848334, "grad_norm": 189.0890655517578, "learning_rate": 3.6865505742867727e-07, "logits/chosen": -6.229816436767578, "logits/rejected": -6.161192893981934, "logps/chosen": -850.07568359375, "logps/rejected": -789.8104248046875, "loss": 0.6791, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.9719418287277222, "rewards/margins": 0.19261571764945984, "rewards/rejected": 0.7793260812759399, "step": 710 }, { "epoch": 0.26682725714550426, "grad_norm": 243.8544921875, "learning_rate": 3.668025194516488e-07, "logits/chosen": -6.192295551300049, "logits/rejected": -6.1518754959106445, "logps/chosen": -950.3997802734375, "logps/rejected": -804.35009765625, "loss": 0.623, "rewards/accuracies": 0.625, "rewards/chosen": 1.1841661930084229, "rewards/margins": 0.35910895466804504, "rewards/rejected": 0.8250571489334106, "step": 720 }, { "epoch": 0.2705331912725251, "grad_norm": 194.04550170898438, "learning_rate": 3.649499814746202e-07, "logits/chosen": -6.118433952331543, "logits/rejected": -6.096287727355957, "logps/chosen": -998.7361450195312, "logps/rejected": -866.0007934570312, "loss": 0.5717, "rewards/accuracies": 0.71875, "rewards/chosen": 1.341344952583313, "rewards/margins": 0.4694565236568451, "rewards/rejected": 0.8718884587287903, "step": 730 }, { "epoch": 0.27423912539954604, "grad_norm": 199.3968963623047, "learning_rate": 3.630974434975917e-07, "logits/chosen": -6.136265754699707, "logits/rejected": -6.221334457397461, "logps/chosen": -959.4265747070312, "logps/rejected": -911.4915771484375, "loss": 0.6336, "rewards/accuracies": 0.581250011920929, "rewards/chosen": 1.3660612106323242, "rewards/margins": 0.3232826888561249, "rewards/rejected": 1.0427783727645874, "step": 740 }, { "epoch": 0.2779450595265669, "grad_norm": 184.0609130859375, "learning_rate": 3.6124490552056313e-07, "logits/chosen": -6.145341873168945, "logits/rejected": -6.150424480438232, "logps/chosen": -933.7684326171875, "logps/rejected": -828.2578125, "loss": 0.6562, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.300537109375, "rewards/margins": 0.30143502354621887, "rewards/rejected": 0.999101996421814, "step": 750 }, { "epoch": 0.2779450595265669, "eval_logits/chosen": NaN, "eval_logits/rejected": -6.332278251647949, "eval_logps/chosen": -1140.3084716796875, "eval_logps/rejected": -1050.5599365234375, "eval_loss": 0.6633999347686768, "eval_rewards/accuracies": 0.6171284914016724, "eval_rewards/chosen": 1.4036915302276611, "eval_rewards/margins": 0.23550742864608765, "eval_rewards/rejected": 1.1681841611862183, "eval_runtime": 174.2841, "eval_samples_per_second": 6.834, "eval_steps_per_second": 6.834, "step": 750 }, { "epoch": 0.2816509936535878, "grad_norm": 187.0355224609375, "learning_rate": 3.5939236754353464e-07, "logits/chosen": -6.186856746673584, "logits/rejected": -6.131129264831543, "logps/chosen": -941.7112426757812, "logps/rejected": -817.9475708007812, "loss": 0.5649, "rewards/accuracies": 0.71875, "rewards/chosen": 1.3290807008743286, "rewards/margins": 0.5047949552536011, "rewards/rejected": 0.8242858052253723, "step": 760 }, { "epoch": 0.2853569277806087, "grad_norm": 211.35397338867188, "learning_rate": 3.575398295665061e-07, "logits/chosen": -6.122169494628906, "logits/rejected": -6.1151204109191895, "logps/chosen": -887.787109375, "logps/rejected": -862.0372314453125, "loss": 0.6854, "rewards/accuracies": 0.606249988079071, "rewards/chosen": 1.1365787982940674, "rewards/margins": 0.2167353630065918, "rewards/rejected": 0.9198434948921204, "step": 770 }, { "epoch": 0.2890628619076296, "grad_norm": 198.17198181152344, "learning_rate": 3.556872915894776e-07, "logits/chosen": -6.241828918457031, "logits/rejected": -6.265792369842529, "logps/chosen": -929.9652099609375, "logps/rejected": -877.3856201171875, "loss": 0.614, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 1.117902398109436, "rewards/margins": 0.3560691475868225, "rewards/rejected": 0.7618332505226135, "step": 780 }, { "epoch": 0.2927687960346505, "grad_norm": 150.22120666503906, "learning_rate": 3.5383475361244905e-07, "logits/chosen": -6.19686222076416, "logits/rejected": -6.209750175476074, "logps/chosen": -1054.142578125, "logps/rejected": -874.3284912109375, "loss": 0.5739, "rewards/accuracies": 0.6875, "rewards/chosen": 1.2843711376190186, "rewards/margins": 0.4667224884033203, "rewards/rejected": 0.817648708820343, "step": 790 }, { "epoch": 0.29647473016167136, "grad_norm": 243.4776611328125, "learning_rate": 3.519822156354205e-07, "logits/chosen": -6.137392520904541, "logits/rejected": -6.096640586853027, "logps/chosen": -926.0994262695312, "logps/rejected": -875.486328125, "loss": 0.6676, "rewards/accuracies": 0.625, "rewards/chosen": 1.1599671840667725, "rewards/margins": 0.2768460214138031, "rewards/rejected": 0.883121132850647, "step": 800 }, { "epoch": 0.29647473016167136, "eval_logits/chosen": NaN, "eval_logits/rejected": -6.3336334228515625, "eval_logps/chosen": -1140.1082763671875, "eval_logps/rejected": -1050.391845703125, "eval_loss": 0.6643325090408325, "eval_rewards/accuracies": 0.6272040009498596, "eval_rewards/chosen": 1.4237107038497925, "eval_rewards/margins": 0.23872110247612, "eval_rewards/rejected": 1.1849894523620605, "eval_runtime": 174.4194, "eval_samples_per_second": 6.828, "eval_steps_per_second": 6.828, "step": 800 }, { "epoch": 0.3001806642886923, "grad_norm": 199.17247009277344, "learning_rate": 3.5012967765839195e-07, "logits/chosen": -6.241001129150391, "logits/rejected": -6.098967552185059, "logps/chosen": -920.8816528320312, "logps/rejected": -898.8351440429688, "loss": 0.7121, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": 1.0589300394058228, "rewards/margins": 0.1631755530834198, "rewards/rejected": 0.8957546353340149, "step": 810 }, { "epoch": 0.30388659841571314, "grad_norm": 250.08518981933594, "learning_rate": 3.4827713968136346e-07, "logits/chosen": -6.267752647399902, "logits/rejected": -6.312867164611816, "logps/chosen": -1013.2420654296875, "logps/rejected": -958.6898193359375, "loss": 0.6355, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 1.1633565425872803, "rewards/margins": 0.3349772095680237, "rewards/rejected": 0.8283793330192566, "step": 820 }, { "epoch": 0.30759253254273405, "grad_norm": 208.069580078125, "learning_rate": 3.464246017043349e-07, "logits/chosen": -6.2897796630859375, "logits/rejected": -6.228161811828613, "logps/chosen": -933.79541015625, "logps/rejected": -842.0236206054688, "loss": 0.64, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 1.0309927463531494, "rewards/margins": 0.28416553139686584, "rewards/rejected": 0.7468270659446716, "step": 830 }, { "epoch": 0.31129846666975497, "grad_norm": 232.0865478515625, "learning_rate": 3.445720637273064e-07, "logits/chosen": -6.142411708831787, "logits/rejected": -6.164281845092773, "logps/chosen": -1016.1708984375, "logps/rejected": -898.3994140625, "loss": 0.6169, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 1.0414842367172241, "rewards/margins": 0.35715141892433167, "rewards/rejected": 0.6843328475952148, "step": 840 }, { "epoch": 0.31500440079677583, "grad_norm": 193.65232849121094, "learning_rate": 3.4271952575027787e-07, "logits/chosen": -6.228142738342285, "logits/rejected": -6.197975158691406, "logps/chosen": -931.4905395507812, "logps/rejected": -856.0863037109375, "loss": 0.6805, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 1.047786831855774, "rewards/margins": 0.20453695952892303, "rewards/rejected": 0.8432496786117554, "step": 850 }, { "epoch": 0.31500440079677583, "eval_logits/chosen": NaN, "eval_logits/rejected": -6.3389668464660645, "eval_logps/chosen": -1141.093505859375, "eval_logps/rejected": -1051.356201171875, "eval_loss": 0.6609283089637756, "eval_rewards/accuracies": 0.6120907068252563, "eval_rewards/chosen": 1.3251850605010986, "eval_rewards/margins": 0.23663325607776642, "eval_rewards/rejected": 1.0885517597198486, "eval_runtime": 174.4195, "eval_samples_per_second": 6.828, "eval_steps_per_second": 6.828, "step": 850 }, { "epoch": 0.31871033492379675, "grad_norm": 191.5203094482422, "learning_rate": 3.4086698777324937e-07, "logits/chosen": -6.188792705535889, "logits/rejected": -6.236809730529785, "logps/chosen": -977.2009887695312, "logps/rejected": -865.64306640625, "loss": 0.5709, "rewards/accuracies": 0.768750011920929, "rewards/chosen": 1.2319726943969727, "rewards/margins": 0.48221221566200256, "rewards/rejected": 0.7497605085372925, "step": 860 }, { "epoch": 0.3224162690508176, "grad_norm": 179.01022338867188, "learning_rate": 3.3901444979622077e-07, "logits/chosen": -6.048168659210205, "logits/rejected": -6.131080627441406, "logps/chosen": -959.7575073242188, "logps/rejected": -822.2799072265625, "loss": 0.6798, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": 1.2015607357025146, "rewards/margins": 0.25019967555999756, "rewards/rejected": 0.9513611793518066, "step": 870 }, { "epoch": 0.3261222031778385, "grad_norm": 164.24993896484375, "learning_rate": 3.371619118191923e-07, "logits/chosen": -6.164304256439209, "logits/rejected": -6.14687442779541, "logps/chosen": -984.1392822265625, "logps/rejected": -887.4852294921875, "loss": 0.5921, "rewards/accuracies": 0.6875, "rewards/chosen": 1.326623558998108, "rewards/margins": 0.4440253674983978, "rewards/rejected": 0.8825982213020325, "step": 880 }, { "epoch": 0.3298281373048594, "grad_norm": 208.62112426757812, "learning_rate": 3.3530937384216373e-07, "logits/chosen": -6.055702209472656, "logits/rejected": -6.137775421142578, "logps/chosen": -966.544921875, "logps/rejected": -852.4841918945312, "loss": 0.6072, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": 1.298642873764038, "rewards/margins": 0.36649665236473083, "rewards/rejected": 0.932146430015564, "step": 890 }, { "epoch": 0.3335340714318803, "grad_norm": 198.2984619140625, "learning_rate": 3.3345683586513524e-07, "logits/chosen": -6.077668190002441, "logits/rejected": -6.017401218414307, "logps/chosen": -968.5540771484375, "logps/rejected": -825.6281127929688, "loss": 0.5936, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 1.3333479166030884, "rewards/margins": 0.48964110016822815, "rewards/rejected": 0.8437067866325378, "step": 900 }, { "epoch": 0.3335340714318803, "eval_logits/chosen": NaN, "eval_logits/rejected": -6.333832263946533, "eval_logps/chosen": -1138.702392578125, "eval_logps/rejected": -1049.201904296875, "eval_loss": 0.6653527021408081, "eval_rewards/accuracies": 0.6322417855262756, "eval_rewards/chosen": 1.564302682876587, "eval_rewards/margins": 0.26032954454421997, "eval_rewards/rejected": 1.3039733171463013, "eval_runtime": 174.3951, "eval_samples_per_second": 6.829, "eval_steps_per_second": 6.829, "step": 900 }, { "epoch": 0.3372400055589012, "grad_norm": 172.08016967773438, "learning_rate": 3.316042978881067e-07, "logits/chosen": -6.152904033660889, "logits/rejected": -6.063776969909668, "logps/chosen": -934.8038940429688, "logps/rejected": -874.7457885742188, "loss": 0.6239, "rewards/accuracies": 0.6875, "rewards/chosen": 1.3152906894683838, "rewards/margins": 0.36238130927085876, "rewards/rejected": 0.9529093503952026, "step": 910 }, { "epoch": 0.3409459396859221, "grad_norm": 154.69894409179688, "learning_rate": 3.297517599110782e-07, "logits/chosen": -6.253002166748047, "logits/rejected": -6.253316402435303, "logps/chosen": -1026.7967529296875, "logps/rejected": -909.5032348632812, "loss": 0.6319, "rewards/accuracies": 0.6875, "rewards/chosen": 1.4157216548919678, "rewards/margins": 0.3379477262496948, "rewards/rejected": 1.077773928642273, "step": 920 }, { "epoch": 0.344651873812943, "grad_norm": 128.03990173339844, "learning_rate": 3.278992219340496e-07, "logits/chosen": -6.163126468658447, "logits/rejected": -6.272846698760986, "logps/chosen": -1002.2288208007812, "logps/rejected": -850.8206787109375, "loss": 0.5969, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 1.2641438245773315, "rewards/margins": 0.45238691568374634, "rewards/rejected": 0.81175696849823, "step": 930 }, { "epoch": 0.34835780793996385, "grad_norm": 126.5301284790039, "learning_rate": 3.260466839570211e-07, "logits/chosen": -6.24020528793335, "logits/rejected": -6.305496692657471, "logps/chosen": -831.75244140625, "logps/rejected": -811.0787353515625, "loss": 0.6274, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 1.032156229019165, "rewards/margins": 0.3172861337661743, "rewards/rejected": 0.7148701548576355, "step": 940 }, { "epoch": 0.35206374206698476, "grad_norm": 211.2845916748047, "learning_rate": 3.2419414597999255e-07, "logits/chosen": -6.139876365661621, "logits/rejected": -6.111436367034912, "logps/chosen": -975.7356567382812, "logps/rejected": -844.0784912109375, "loss": 0.6325, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 1.1637299060821533, "rewards/margins": 0.3424530327320099, "rewards/rejected": 0.8212767839431763, "step": 950 }, { "epoch": 0.35206374206698476, "eval_logits/chosen": NaN, "eval_logits/rejected": -6.342590808868408, "eval_logps/chosen": -1141.1739501953125, "eval_logps/rejected": -1051.595458984375, "eval_loss": 0.6582168936729431, "eval_rewards/accuracies": 0.6213266253471375, "eval_rewards/chosen": 1.3171454668045044, "eval_rewards/margins": 0.25251859426498413, "eval_rewards/rejected": 1.064626932144165, "eval_runtime": 174.1358, "eval_samples_per_second": 6.839, "eval_steps_per_second": 6.839, "step": 950 }, { "epoch": 0.3557696761940056, "grad_norm": 203.0992431640625, "learning_rate": 3.2234160800296406e-07, "logits/chosen": -6.084280967712402, "logits/rejected": -6.071610450744629, "logps/chosen": -821.4600830078125, "logps/rejected": -727.051025390625, "loss": 0.6376, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": 1.0315455198287964, "rewards/margins": 0.3190918266773224, "rewards/rejected": 0.7124537229537964, "step": 960 }, { "epoch": 0.35947561032102654, "grad_norm": 180.57797241210938, "learning_rate": 3.204890700259355e-07, "logits/chosen": -6.084843635559082, "logits/rejected": -6.00299072265625, "logps/chosen": -981.2433471679688, "logps/rejected": -845.26123046875, "loss": 0.5913, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 1.1229795217514038, "rewards/margins": 0.41206812858581543, "rewards/rejected": 0.7109113931655884, "step": 970 }, { "epoch": 0.36318154444804746, "grad_norm": 207.84568786621094, "learning_rate": 3.18636532048907e-07, "logits/chosen": -6.196352958679199, "logits/rejected": -6.150284767150879, "logps/chosen": -915.8230590820312, "logps/rejected": -826.0857543945312, "loss": 0.6367, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 1.011979579925537, "rewards/margins": 0.326684832572937, "rewards/rejected": 0.6852947473526001, "step": 980 }, { "epoch": 0.3668874785750683, "grad_norm": 169.62612915039062, "learning_rate": 3.1678399407187847e-07, "logits/chosen": -6.155111312866211, "logits/rejected": -6.252329349517822, "logps/chosen": -945.7684326171875, "logps/rejected": -895.0965576171875, "loss": 0.6591, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 1.0974936485290527, "rewards/margins": 0.277460515499115, "rewards/rejected": 0.820033073425293, "step": 990 }, { "epoch": 0.37059341270208923, "grad_norm": 164.93836975097656, "learning_rate": 3.1493145609484997e-07, "logits/chosen": -6.1941423416137695, "logits/rejected": -6.119546413421631, "logps/chosen": -859.4827270507812, "logps/rejected": -778.8285522460938, "loss": 0.614, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 1.124022126197815, "rewards/margins": 0.3852692246437073, "rewards/rejected": 0.7387528419494629, "step": 1000 }, { "epoch": 0.37059341270208923, "eval_logits/chosen": NaN, "eval_logits/rejected": -6.3517584800720215, "eval_logps/chosen": -1141.3780517578125, "eval_logps/rejected": -1051.7454833984375, "eval_loss": 0.6584185361862183, "eval_rewards/accuracies": 0.6196473836898804, "eval_rewards/chosen": 1.296731948852539, "eval_rewards/margins": 0.24711348116397858, "eval_rewards/rejected": 1.0496186017990112, "eval_runtime": 174.4804, "eval_samples_per_second": 6.826, "eval_steps_per_second": 6.826, "step": 1000 }, { "epoch": 0.3742993468291101, "grad_norm": 161.15005493164062, "learning_rate": 3.1307891811782137e-07, "logits/chosen": -6.150378227233887, "logits/rejected": -6.203757286071777, "logps/chosen": -1034.891357421875, "logps/rejected": -888.6922607421875, "loss": 0.6111, "rewards/accuracies": 0.65625, "rewards/chosen": 1.073899507522583, "rewards/margins": 0.3846450746059418, "rewards/rejected": 0.6892544031143188, "step": 1010 }, { "epoch": 0.378005280956131, "grad_norm": 167.26861572265625, "learning_rate": 3.112263801407929e-07, "logits/chosen": -6.080620765686035, "logits/rejected": -6.124800682067871, "logps/chosen": -882.3255615234375, "logps/rejected": -834.7425537109375, "loss": 0.6356, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": 1.0985982418060303, "rewards/margins": 0.2840558588504791, "rewards/rejected": 0.8145424127578735, "step": 1020 }, { "epoch": 0.3817112150831519, "grad_norm": 143.760009765625, "learning_rate": 3.0937384216376433e-07, "logits/chosen": NaN, "logits/rejected": -6.1283392906188965, "logps/chosen": -931.8167114257812, "logps/rejected": -770.5465087890625, "loss": 0.5515, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": 1.2119407653808594, "rewards/margins": 0.5438046455383301, "rewards/rejected": 0.6681360006332397, "step": 1030 }, { "epoch": 0.3854171492101728, "grad_norm": 213.75035095214844, "learning_rate": 3.0752130418673583e-07, "logits/chosen": -6.2023396492004395, "logits/rejected": -6.126180648803711, "logps/chosen": -923.2513427734375, "logps/rejected": -764.4282836914062, "loss": 0.6242, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 1.2077882289886475, "rewards/margins": 0.35475489497184753, "rewards/rejected": 0.8530333638191223, "step": 1040 }, { "epoch": 0.3891230833371937, "grad_norm": 129.1380157470703, "learning_rate": 3.056687662097073e-07, "logits/chosen": -6.151089668273926, "logits/rejected": -6.196557998657227, "logps/chosen": -891.5910034179688, "logps/rejected": -825.3084716796875, "loss": 0.5819, "rewards/accuracies": 0.65625, "rewards/chosen": 1.2349982261657715, "rewards/margins": 0.49733766913414, "rewards/rejected": 0.7376605272293091, "step": 1050 }, { "epoch": 0.3891230833371937, "eval_logits/chosen": NaN, "eval_logits/rejected": -6.354092597961426, "eval_logps/chosen": -1138.8973388671875, "eval_logps/rejected": -1049.481689453125, "eval_loss": 0.6607492566108704, "eval_rewards/accuracies": 0.6213266253471375, "eval_rewards/chosen": 1.544799566268921, "eval_rewards/margins": 0.26879334449768066, "eval_rewards/rejected": 1.2760061025619507, "eval_runtime": 174.2371, "eval_samples_per_second": 6.836, "eval_steps_per_second": 6.836, "step": 1050 }, { "epoch": 0.39282901746421456, "grad_norm": 164.7945556640625, "learning_rate": 3.038162282326788e-07, "logits/chosen": -6.143533229827881, "logits/rejected": -6.127655982971191, "logps/chosen": -845.65234375, "logps/rejected": -816.9622802734375, "loss": 0.6427, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.2775068283081055, "rewards/margins": 0.3459513187408447, "rewards/rejected": 0.9315555691719055, "step": 1060 }, { "epoch": 0.3965349515912355, "grad_norm": 149.87388610839844, "learning_rate": 3.019636902556502e-07, "logits/chosen": -6.187637805938721, "logits/rejected": -6.129674434661865, "logps/chosen": -885.9992065429688, "logps/rejected": -778.6902465820312, "loss": 0.608, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 1.1672834157943726, "rewards/margins": 0.3892399072647095, "rewards/rejected": 0.7780434489250183, "step": 1070 }, { "epoch": 0.40024088571825633, "grad_norm": 166.63389587402344, "learning_rate": 3.001111522786217e-07, "logits/chosen": -6.271850109100342, "logits/rejected": -6.208528995513916, "logps/chosen": -913.72998046875, "logps/rejected": -773.72412109375, "loss": 0.6183, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 1.156951904296875, "rewards/margins": 0.3150864243507385, "rewards/rejected": 0.8418653607368469, "step": 1080 }, { "epoch": 0.40394681984527725, "grad_norm": 168.648681640625, "learning_rate": 2.9825861430159315e-07, "logits/chosen": -6.246833801269531, "logits/rejected": -6.256269931793213, "logps/chosen": -948.3607177734375, "logps/rejected": -898.9035034179688, "loss": 0.6266, "rewards/accuracies": 0.65625, "rewards/chosen": 1.2156215906143188, "rewards/margins": 0.34718313813209534, "rewards/rejected": 0.8684385418891907, "step": 1090 }, { "epoch": 0.40765275397229817, "grad_norm": 174.19361877441406, "learning_rate": 2.9640607632456465e-07, "logits/chosen": -6.159620761871338, "logits/rejected": -6.141018867492676, "logps/chosen": -986.7599487304688, "logps/rejected": -843.1695556640625, "loss": 0.5832, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.3146113157272339, "rewards/margins": 0.4466518461704254, "rewards/rejected": 0.8679596185684204, "step": 1100 }, { "epoch": 0.40765275397229817, "eval_logits/chosen": NaN, "eval_logits/rejected": -6.353438854217529, "eval_logps/chosen": -1139.6771240234375, "eval_logps/rejected": -1050.29736328125, "eval_loss": 0.6590859293937683, "eval_rewards/accuracies": 0.6179680824279785, "eval_rewards/chosen": 1.466820478439331, "eval_rewards/margins": 0.2723851799964905, "eval_rewards/rejected": 1.1944352388381958, "eval_runtime": 174.1419, "eval_samples_per_second": 6.839, "eval_steps_per_second": 6.839, "step": 1100 }, { "epoch": 0.411358688099319, "grad_norm": 207.91915893554688, "learning_rate": 2.945535383475361e-07, "logits/chosen": -6.101978302001953, "logits/rejected": -6.107656955718994, "logps/chosen": -853.7755737304688, "logps/rejected": -771.1502685546875, "loss": 0.6212, "rewards/accuracies": 0.625, "rewards/chosen": 1.1558765172958374, "rewards/margins": 0.4091036915779114, "rewards/rejected": 0.7467728853225708, "step": 1110 }, { "epoch": 0.41506462222633994, "grad_norm": 172.02349853515625, "learning_rate": 2.927010003705076e-07, "logits/chosen": -6.207159996032715, "logits/rejected": -6.222277641296387, "logps/chosen": -924.0470581054688, "logps/rejected": -801.3013916015625, "loss": 0.5918, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 1.1804696321487427, "rewards/margins": 0.443875253200531, "rewards/rejected": 0.7365943193435669, "step": 1120 }, { "epoch": 0.4187705563533608, "grad_norm": 147.4911346435547, "learning_rate": 2.9084846239347906e-07, "logits/chosen": -6.112509727478027, "logits/rejected": -6.115456581115723, "logps/chosen": -937.400390625, "logps/rejected": -851.7019653320312, "loss": 0.5924, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 1.4129273891448975, "rewards/margins": 0.4493323266506195, "rewards/rejected": 0.9635950326919556, "step": 1130 }, { "epoch": 0.4224764904803817, "grad_norm": 188.7657012939453, "learning_rate": 2.8899592441645057e-07, "logits/chosen": -6.175480842590332, "logits/rejected": -6.202576637268066, "logps/chosen": -860.0646362304688, "logps/rejected": -785.8237915039062, "loss": 0.5818, "rewards/accuracies": 0.71875, "rewards/chosen": 1.2907373905181885, "rewards/margins": 0.48255085945129395, "rewards/rejected": 0.8081865310668945, "step": 1140 }, { "epoch": 0.4261824246074026, "grad_norm": 195.689208984375, "learning_rate": 2.8714338643942197e-07, "logits/chosen": -6.1582794189453125, "logits/rejected": -6.1286234855651855, "logps/chosen": -882.5172729492188, "logps/rejected": -781.136962890625, "loss": 0.6334, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": 1.112781286239624, "rewards/margins": 0.32989996671676636, "rewards/rejected": 0.7828812599182129, "step": 1150 }, { "epoch": 0.4261824246074026, "eval_logits/chosen": NaN, "eval_logits/rejected": -6.354198932647705, "eval_logps/chosen": -1139.8521728515625, "eval_logps/rejected": -1050.4593505859375, "eval_loss": 0.6620603203773499, "eval_rewards/accuracies": 0.6230058670043945, "eval_rewards/chosen": 1.449316382408142, "eval_rewards/margins": 0.27107417583465576, "eval_rewards/rejected": 1.1782420873641968, "eval_runtime": 174.0926, "eval_samples_per_second": 6.841, "eval_steps_per_second": 6.841, "step": 1150 }, { "epoch": 0.4298883587344235, "grad_norm": 196.42591857910156, "learning_rate": 2.852908484623935e-07, "logits/chosen": -6.155422687530518, "logits/rejected": -6.14513635635376, "logps/chosen": -881.4910888671875, "logps/rejected": -846.4808349609375, "loss": 0.5939, "rewards/accuracies": 0.65625, "rewards/chosen": 1.1745377779006958, "rewards/margins": 0.4163404405117035, "rewards/rejected": 0.7581971883773804, "step": 1160 }, { "epoch": 0.4335942928614444, "grad_norm": 143.9552001953125, "learning_rate": 2.834383104853649e-07, "logits/chosen": -6.306971549987793, "logits/rejected": -6.230139255523682, "logps/chosen": -952.1017456054688, "logps/rejected": -841.35498046875, "loss": 0.6176, "rewards/accuracies": 0.6875, "rewards/chosen": 1.0589168071746826, "rewards/margins": 0.36980077624320984, "rewards/rejected": 0.6891158819198608, "step": 1170 }, { "epoch": 0.43730022698846527, "grad_norm": 160.4393310546875, "learning_rate": 2.8158577250833643e-07, "logits/chosen": -6.1785125732421875, "logits/rejected": -6.113655090332031, "logps/chosen": -856.7884521484375, "logps/rejected": -804.0035400390625, "loss": 0.6001, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 1.080625295639038, "rewards/margins": 0.3962119221687317, "rewards/rejected": 0.6844133138656616, "step": 1180 }, { "epoch": 0.4410061611154862, "grad_norm": 149.27992248535156, "learning_rate": 2.797332345313079e-07, "logits/chosen": -6.293272972106934, "logits/rejected": -6.266045570373535, "logps/chosen": -1081.994140625, "logps/rejected": -923.7223510742188, "loss": 0.5861, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 1.324501395225525, "rewards/margins": 0.486712783575058, "rewards/rejected": 0.8377887606620789, "step": 1190 }, { "epoch": 0.44471209524250704, "grad_norm": 158.3959197998047, "learning_rate": 2.778806965542794e-07, "logits/chosen": -6.168979644775391, "logits/rejected": -6.1329731941223145, "logps/chosen": -876.14697265625, "logps/rejected": -876.2384643554688, "loss": 0.6212, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 1.1340038776397705, "rewards/margins": 0.32338947057724, "rewards/rejected": 0.8106144070625305, "step": 1200 }, { "epoch": 0.44471209524250704, "eval_logits/chosen": NaN, "eval_logits/rejected": -6.352243900299072, "eval_logps/chosen": -1139.7337646484375, "eval_logps/rejected": -1050.3447265625, "eval_loss": 0.6622124910354614, "eval_rewards/accuracies": 0.6272040009498596, "eval_rewards/chosen": 1.4611579179763794, "eval_rewards/margins": 0.27146124839782715, "eval_rewards/rejected": 1.1896967887878418, "eval_runtime": 174.1161, "eval_samples_per_second": 6.84, "eval_steps_per_second": 6.84, "step": 1200 }, { "epoch": 0.44841802936952796, "grad_norm": 159.97509765625, "learning_rate": 2.760281585772508e-07, "logits/chosen": -6.155325889587402, "logits/rejected": -6.213382720947266, "logps/chosen": -876.2828369140625, "logps/rejected": -887.4181518554688, "loss": 0.6392, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 1.1726298332214355, "rewards/margins": 0.33511778712272644, "rewards/rejected": 0.8375120162963867, "step": 1210 }, { "epoch": 0.4521239634965489, "grad_norm": 149.97601318359375, "learning_rate": 2.741756206002223e-07, "logits/chosen": -6.089978218078613, "logits/rejected": -6.215594291687012, "logps/chosen": -1006.5177612304688, "logps/rejected": -837.3438720703125, "loss": 0.5759, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 1.1888597011566162, "rewards/margins": 0.4549834132194519, "rewards/rejected": 0.7338763475418091, "step": 1220 }, { "epoch": 0.45582989762356974, "grad_norm": 124.17080688476562, "learning_rate": 2.7232308262319375e-07, "logits/chosen": -6.237065315246582, "logits/rejected": -6.162973403930664, "logps/chosen": -900.43505859375, "logps/rejected": -769.6177978515625, "loss": 0.5828, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 1.1558102369308472, "rewards/margins": 0.5107904672622681, "rewards/rejected": 0.6450197100639343, "step": 1230 }, { "epoch": 0.45953583175059065, "grad_norm": 209.32017517089844, "learning_rate": 2.7047054464616525e-07, "logits/chosen": -6.1265363693237305, "logits/rejected": -6.136591911315918, "logps/chosen": -850.1053466796875, "logps/rejected": -755.8872680664062, "loss": 0.6007, "rewards/accuracies": 0.6875, "rewards/chosen": 1.0844950675964355, "rewards/margins": 0.4297495484352112, "rewards/rejected": 0.6547454595565796, "step": 1240 }, { "epoch": 0.4632417658776115, "grad_norm": 230.33175659179688, "learning_rate": 2.686180066691367e-07, "logits/chosen": -6.183014392852783, "logits/rejected": -6.165501117706299, "logps/chosen": -842.3179931640625, "logps/rejected": -821.1053466796875, "loss": 0.6189, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.9313210248947144, "rewards/margins": 0.330872505903244, "rewards/rejected": 0.6004485487937927, "step": 1250 }, { "epoch": 0.4632417658776115, "eval_logits/chosen": NaN, "eval_logits/rejected": -6.354373931884766, "eval_logps/chosen": -1141.2061767578125, "eval_logps/rejected": -1051.6622314453125, "eval_loss": 0.6636425852775574, "eval_rewards/accuracies": 0.6204869747161865, "eval_rewards/chosen": 1.31391179561615, "eval_rewards/margins": 0.2559622824192047, "eval_rewards/rejected": 1.0579496622085571, "eval_runtime": 174.2497, "eval_samples_per_second": 6.835, "eval_steps_per_second": 6.835, "step": 1250 }, { "epoch": 0.4669477000046324, "grad_norm": 212.96624755859375, "learning_rate": 2.667654686921082e-07, "logits/chosen": -6.100918292999268, "logits/rejected": NaN, "logps/chosen": -1028.841552734375, "logps/rejected": -886.3626708984375, "loss": 0.613, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 1.181762933731079, "rewards/margins": 0.37267082929611206, "rewards/rejected": 0.8090922236442566, "step": 1260 }, { "epoch": 0.4706536341316533, "grad_norm": 162.5323944091797, "learning_rate": 2.6491293071507966e-07, "logits/chosen": -6.123560428619385, "logits/rejected": -6.198370933532715, "logps/chosen": -930.6388549804688, "logps/rejected": -765.5624389648438, "loss": 0.6066, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 1.0889371633529663, "rewards/margins": 0.40622156858444214, "rewards/rejected": 0.6827155351638794, "step": 1270 }, { "epoch": 0.4743595682586742, "grad_norm": 129.6339874267578, "learning_rate": 2.630603927380511e-07, "logits/chosen": -6.171866416931152, "logits/rejected": -6.194737434387207, "logps/chosen": -922.0595703125, "logps/rejected": -880.8753051757812, "loss": 0.6099, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 1.1603076457977295, "rewards/margins": 0.38798120617866516, "rewards/rejected": 0.7723264694213867, "step": 1280 }, { "epoch": 0.4780655023856951, "grad_norm": 185.04010009765625, "learning_rate": 2.6120785476102257e-07, "logits/chosen": -6.216259479522705, "logits/rejected": -6.08756685256958, "logps/chosen": -940.3963623046875, "logps/rejected": -871.3863525390625, "loss": 0.587, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.1315703392028809, "rewards/margins": 0.41449031233787537, "rewards/rejected": 0.7170801162719727, "step": 1290 }, { "epoch": 0.481771436512716, "grad_norm": 188.83816528320312, "learning_rate": 2.5935531678399407e-07, "logits/chosen": -6.360658645629883, "logits/rejected": -6.3473029136657715, "logps/chosen": -941.73876953125, "logps/rejected": -903.9150390625, "loss": 0.581, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 1.2506906986236572, "rewards/margins": 0.4942537844181061, "rewards/rejected": 0.7564369440078735, "step": 1300 }, { "epoch": 0.481771436512716, "eval_logits/chosen": NaN, "eval_logits/rejected": -6.355961322784424, "eval_logps/chosen": -1139.5653076171875, "eval_logps/rejected": -1050.153076171875, "eval_loss": 0.6662114262580872, "eval_rewards/accuracies": 0.6288833022117615, "eval_rewards/chosen": 1.4780066013336182, "eval_rewards/margins": 0.26914337277412415, "eval_rewards/rejected": 1.2088632583618164, "eval_runtime": 174.4932, "eval_samples_per_second": 6.825, "eval_steps_per_second": 6.825, "step": 1300 }, { "epoch": 0.4854773706397369, "grad_norm": 176.7130584716797, "learning_rate": 2.575027788069655e-07, "logits/chosen": -6.2104811668396, "logits/rejected": -6.247377395629883, "logps/chosen": -863.90869140625, "logps/rejected": -790.2784423828125, "loss": 0.5685, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 1.1237311363220215, "rewards/margins": 0.48923033475875854, "rewards/rejected": 0.6345008015632629, "step": 1310 }, { "epoch": 0.48918330476675775, "grad_norm": 181.07481384277344, "learning_rate": 2.5565024082993703e-07, "logits/chosen": -6.231227397918701, "logits/rejected": NaN, "logps/chosen": -985.6842651367188, "logps/rejected": -883.3465576171875, "loss": 0.5585, "rewards/accuracies": 0.71875, "rewards/chosen": 1.381658911705017, "rewards/margins": 0.5166347026824951, "rewards/rejected": 0.8650242686271667, "step": 1320 }, { "epoch": 0.49288923889377867, "grad_norm": 203.50254821777344, "learning_rate": 2.537977028529085e-07, "logits/chosen": -6.048904895782471, "logits/rejected": -6.121670722961426, "logps/chosen": -907.2009887695312, "logps/rejected": -842.6783447265625, "loss": 0.6369, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 1.19151771068573, "rewards/margins": 0.38030725717544556, "rewards/rejected": 0.8112104535102844, "step": 1330 }, { "epoch": 0.49659517302079953, "grad_norm": 154.94403076171875, "learning_rate": 2.5194516487588e-07, "logits/chosen": -6.1506195068359375, "logits/rejected": -6.043631076812744, "logps/chosen": -921.5447387695312, "logps/rejected": -728.7830810546875, "loss": 0.5662, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 1.1830615997314453, "rewards/margins": 0.491299569606781, "rewards/rejected": 0.6917620897293091, "step": 1340 }, { "epoch": 0.5003011071478205, "grad_norm": 227.9466094970703, "learning_rate": 2.500926268988514e-07, "logits/chosen": -6.187090873718262, "logits/rejected": -6.22959041595459, "logps/chosen": -829.8533935546875, "logps/rejected": -727.3782348632812, "loss": 0.5804, "rewards/accuracies": 0.6875, "rewards/chosen": 1.0849952697753906, "rewards/margins": 0.42699941992759705, "rewards/rejected": 0.657995879650116, "step": 1350 }, { "epoch": 0.5003011071478205, "eval_logits/chosen": NaN, "eval_logits/rejected": -6.349131107330322, "eval_logps/chosen": -1137.9072265625, "eval_logps/rejected": -1048.7745361328125, "eval_loss": 0.6663568019866943, "eval_rewards/accuracies": 0.6213266253471375, "eval_rewards/chosen": 1.6438101530075073, "eval_rewards/margins": 0.2971048057079315, "eval_rewards/rejected": 1.346705436706543, "eval_runtime": 174.0406, "eval_samples_per_second": 6.843, "eval_steps_per_second": 6.843, "step": 1350 }, { "epoch": 0.5040070412748413, "grad_norm": 193.2044219970703, "learning_rate": 2.482400889218229e-07, "logits/chosen": -6.1281938552856445, "logits/rejected": -6.117993354797363, "logps/chosen": -1060.016357421875, "logps/rejected": -954.3433837890625, "loss": 0.5773, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 1.4718914031982422, "rewards/margins": 0.5141991376876831, "rewards/rejected": 0.9576921463012695, "step": 1360 }, { "epoch": 0.5077129754018622, "grad_norm": 203.0106658935547, "learning_rate": 2.4638755094479434e-07, "logits/chosen": -6.16585111618042, "logits/rejected": -6.127178192138672, "logps/chosen": -917.00146484375, "logps/rejected": -872.7060546875, "loss": 0.6158, "rewards/accuracies": 0.65625, "rewards/chosen": 1.3071753978729248, "rewards/margins": 0.41641944646835327, "rewards/rejected": 0.8907560110092163, "step": 1370 }, { "epoch": 0.5114189095288831, "grad_norm": 135.35690307617188, "learning_rate": 2.4453501296776585e-07, "logits/chosen": -6.066123962402344, "logits/rejected": -6.084324836730957, "logps/chosen": -859.6808471679688, "logps/rejected": -741.0250854492188, "loss": 0.6318, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 1.2575219869613647, "rewards/margins": 0.3475838005542755, "rewards/rejected": 0.9099382162094116, "step": 1380 }, { "epoch": 0.515124843655904, "grad_norm": 171.01341247558594, "learning_rate": 2.426824749907373e-07, "logits/chosen": -6.1828107833862305, "logits/rejected": -6.259852886199951, "logps/chosen": -894.8861083984375, "logps/rejected": -804.0977783203125, "loss": 0.5773, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 1.3444898128509521, "rewards/margins": 0.5704382658004761, "rewards/rejected": 0.7740517258644104, "step": 1390 }, { "epoch": 0.5188307777829249, "grad_norm": 250.60726928710938, "learning_rate": 2.4082993701370875e-07, "logits/chosen": -6.185898780822754, "logits/rejected": -6.244287014007568, "logps/chosen": -987.5439453125, "logps/rejected": -882.2205200195312, "loss": 0.5984, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 1.4197866916656494, "rewards/margins": 0.5091265439987183, "rewards/rejected": 0.9106601476669312, "step": 1400 }, { "epoch": 0.5188307777829249, "eval_logits/chosen": NaN, "eval_logits/rejected": -6.353067398071289, "eval_logps/chosen": -1137.5255126953125, "eval_logps/rejected": -1048.356689453125, "eval_loss": 0.6677223443984985, "eval_rewards/accuracies": 0.6120907068252563, "eval_rewards/chosen": 1.6819899082183838, "eval_rewards/margins": 0.29349878430366516, "eval_rewards/rejected": 1.388491153717041, "eval_runtime": 174.2641, "eval_samples_per_second": 6.834, "eval_steps_per_second": 6.834, "step": 1400 }, { "epoch": 0.5225367119099458, "grad_norm": 240.53480529785156, "learning_rate": 2.3897739903668026e-07, "logits/chosen": -6.275177001953125, "logits/rejected": -6.181919574737549, "logps/chosen": -965.3531494140625, "logps/rejected": -786.497802734375, "loss": 0.6224, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 1.2736237049102783, "rewards/margins": 0.4058550298213959, "rewards/rejected": 0.8677686452865601, "step": 1410 }, { "epoch": 0.5262426460369667, "grad_norm": 165.14010620117188, "learning_rate": 2.371248610596517e-07, "logits/chosen": -6.211658954620361, "logits/rejected": -6.0974249839782715, "logps/chosen": -882.7542724609375, "logps/rejected": -741.9580078125, "loss": 0.5812, "rewards/accuracies": 0.6875, "rewards/chosen": 1.35499107837677, "rewards/margins": 0.5230444073677063, "rewards/rejected": 0.8319465517997742, "step": 1420 }, { "epoch": 0.5299485801639876, "grad_norm": 143.9077606201172, "learning_rate": 2.352723230826232e-07, "logits/chosen": -6.10528039932251, "logits/rejected": -6.128796100616455, "logps/chosen": -933.9381103515625, "logps/rejected": -815.3626098632812, "loss": 0.5783, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 1.3778315782546997, "rewards/margins": 0.5523862242698669, "rewards/rejected": 0.8254453539848328, "step": 1430 }, { "epoch": 0.5336545142910085, "grad_norm": 249.5879669189453, "learning_rate": 2.3341978510559464e-07, "logits/chosen": -6.190318584442139, "logits/rejected": -6.109808921813965, "logps/chosen": -988.9300537109375, "logps/rejected": -904.2208862304688, "loss": 0.6759, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 1.5002410411834717, "rewards/margins": 0.37281566858291626, "rewards/rejected": 1.1274254322052002, "step": 1440 }, { "epoch": 0.5373604484180293, "grad_norm": 158.98834228515625, "learning_rate": 2.3156724712856612e-07, "logits/chosen": NaN, "logits/rejected": -6.147918701171875, "logps/chosen": -897.3717651367188, "logps/rejected": -858.4124755859375, "loss": 0.6545, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 1.2706643342971802, "rewards/margins": 0.30182304978370667, "rewards/rejected": 0.9688412547111511, "step": 1450 }, { "epoch": 0.5373604484180293, "eval_logits/chosen": NaN, "eval_logits/rejected": -6.35387659072876, "eval_logps/chosen": -1137.998046875, "eval_logps/rejected": -1048.7669677734375, "eval_loss": 0.6649725437164307, "eval_rewards/accuracies": 0.6255247592926025, "eval_rewards/chosen": 1.6347370147705078, "eval_rewards/margins": 0.2872615456581116, "eval_rewards/rejected": 1.3474754095077515, "eval_runtime": 174.8955, "eval_samples_per_second": 6.81, "eval_steps_per_second": 6.81, "step": 1450 }, { "epoch": 0.5410663825450502, "grad_norm": 202.708740234375, "learning_rate": 2.297147091515376e-07, "logits/chosen": -6.149045944213867, "logits/rejected": -6.200368881225586, "logps/chosen": -1000.2081909179688, "logps/rejected": -880.8834838867188, "loss": 0.609, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 1.3515934944152832, "rewards/margins": 0.4020705223083496, "rewards/rejected": 0.949522852897644, "step": 1460 }, { "epoch": 0.5447723166720712, "grad_norm": 378.9857482910156, "learning_rate": 2.2786217117450905e-07, "logits/chosen": -6.143518447875977, "logits/rejected": -6.165432929992676, "logps/chosen": -974.8902587890625, "logps/rejected": -885.5198974609375, "loss": 0.6535, "rewards/accuracies": 0.625, "rewards/chosen": 1.1508817672729492, "rewards/margins": 0.333347886800766, "rewards/rejected": 0.8175338506698608, "step": 1470 }, { "epoch": 0.5484782507990921, "grad_norm": 156.70301818847656, "learning_rate": 2.2600963319748053e-07, "logits/chosen": -6.1991682052612305, "logits/rejected": -6.233222007751465, "logps/chosen": -1007.1458129882812, "logps/rejected": -901.8226318359375, "loss": 0.5823, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 1.3570195436477661, "rewards/margins": 0.5376918315887451, "rewards/rejected": 0.819327712059021, "step": 1480 }, { "epoch": 0.552184184926113, "grad_norm": 194.2936248779297, "learning_rate": 2.24157095220452e-07, "logits/chosen": -6.096491813659668, "logits/rejected": -6.074574947357178, "logps/chosen": -922.29833984375, "logps/rejected": -841.58251953125, "loss": 0.6448, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 1.311821699142456, "rewards/margins": 0.2840858995914459, "rewards/rejected": 1.027735710144043, "step": 1490 }, { "epoch": 0.5558901190531338, "grad_norm": 138.76791381835938, "learning_rate": 2.223045572434235e-07, "logits/chosen": -6.16571569442749, "logits/rejected": -6.221317768096924, "logps/chosen": -931.6896362304688, "logps/rejected": -879.6935424804688, "loss": 0.6187, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 1.2591129541397095, "rewards/margins": 0.4034864008426666, "rewards/rejected": 0.8556264638900757, "step": 1500 }, { "epoch": 0.5558901190531338, "eval_logits/chosen": NaN, "eval_logits/rejected": -6.352556228637695, "eval_logps/chosen": -1138.7109375, "eval_logps/rejected": -1049.4052734375, "eval_loss": 0.6670076847076416, "eval_rewards/accuracies": 0.6196473836898804, "eval_rewards/chosen": 1.5634312629699707, "eval_rewards/margins": 0.2797936499118805, "eval_rewards/rejected": 1.2836376428604126, "eval_runtime": 174.7794, "eval_samples_per_second": 6.814, "eval_steps_per_second": 6.814, "step": 1500 }, { "epoch": 0.5595960531801547, "grad_norm": 196.8204345703125, "learning_rate": 2.2045201926639494e-07, "logits/chosen": -6.262406349182129, "logits/rejected": -6.286099433898926, "logps/chosen": -926.6876831054688, "logps/rejected": -719.3727416992188, "loss": 0.5834, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.2394769191741943, "rewards/margins": 0.5388771891593933, "rewards/rejected": 0.7005997896194458, "step": 1510 }, { "epoch": 0.5633019873071756, "grad_norm": 181.23269653320312, "learning_rate": 2.1859948128936642e-07, "logits/chosen": -6.15579080581665, "logits/rejected": NaN, "logps/chosen": -907.208984375, "logps/rejected": -789.55615234375, "loss": 0.6841, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": 1.3312886953353882, "rewards/margins": 0.29294928908348083, "rewards/rejected": 1.038339614868164, "step": 1520 }, { "epoch": 0.5670079214341965, "grad_norm": 249.67068481445312, "learning_rate": 2.167469433123379e-07, "logits/chosen": -6.171587944030762, "logits/rejected": -6.20114803314209, "logps/chosen": -1026.1175537109375, "logps/rejected": -928.2333984375, "loss": 0.6224, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 1.4529210329055786, "rewards/margins": 0.44856566190719604, "rewards/rejected": 1.0043553113937378, "step": 1530 }, { "epoch": 0.5707138555612175, "grad_norm": 161.0312042236328, "learning_rate": 2.1489440533530935e-07, "logits/chosen": -6.143443584442139, "logits/rejected": -6.267470836639404, "logps/chosen": -933.2398681640625, "logps/rejected": -907.0738525390625, "loss": 0.6238, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 1.4245777130126953, "rewards/margins": 0.4305481016635895, "rewards/rejected": 0.9940296411514282, "step": 1540 }, { "epoch": 0.5744197896882383, "grad_norm": 156.3391571044922, "learning_rate": 2.1304186735828083e-07, "logits/chosen": -6.159350395202637, "logits/rejected": -6.219527244567871, "logps/chosen": -970.1463012695312, "logps/rejected": -861.26416015625, "loss": 0.6633, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.3536326885223389, "rewards/margins": 0.36367741227149963, "rewards/rejected": 0.9899552464485168, "step": 1550 }, { "epoch": 0.5744197896882383, "eval_logits/chosen": NaN, "eval_logits/rejected": -6.356298923492432, "eval_logps/chosen": -1139.156005859375, "eval_logps/rejected": -1049.7542724609375, "eval_loss": 0.6682325601577759, "eval_rewards/accuracies": 0.6162888407707214, "eval_rewards/chosen": 1.51894211769104, "eval_rewards/margins": 0.27020886540412903, "eval_rewards/rejected": 1.2487331628799438, "eval_runtime": 174.6376, "eval_samples_per_second": 6.82, "eval_steps_per_second": 6.82, "step": 1550 }, { "epoch": 0.5781257238152592, "grad_norm": 223.8711395263672, "learning_rate": 2.111893293812523e-07, "logits/chosen": -6.078129768371582, "logits/rejected": -6.09130859375, "logps/chosen": -902.1170043945312, "logps/rejected": -816.3302001953125, "loss": 0.6542, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.166666030883789, "rewards/margins": 0.32527679204940796, "rewards/rejected": 0.8413891792297363, "step": 1560 }, { "epoch": 0.5818316579422801, "grad_norm": 146.7541961669922, "learning_rate": 2.093367914042238e-07, "logits/chosen": -6.186091899871826, "logits/rejected": -6.257566928863525, "logps/chosen": -953.74853515625, "logps/rejected": -876.87451171875, "loss": 0.6237, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 1.2467100620269775, "rewards/margins": 0.4072350859642029, "rewards/rejected": 0.8394750356674194, "step": 1570 }, { "epoch": 0.585537592069301, "grad_norm": 201.52931213378906, "learning_rate": 2.0748425342719524e-07, "logits/chosen": -6.206198215484619, "logits/rejected": -6.1201300621032715, "logps/chosen": -973.3306884765625, "logps/rejected": -831.2630004882812, "loss": 0.6483, "rewards/accuracies": 0.65625, "rewards/chosen": 1.2103111743927002, "rewards/margins": 0.3353997766971588, "rewards/rejected": 0.8749113082885742, "step": 1580 }, { "epoch": 0.5892435261963218, "grad_norm": 157.7445526123047, "learning_rate": 2.0563171545016672e-07, "logits/chosen": -6.155628681182861, "logits/rejected": -6.149939060211182, "logps/chosen": -978.5584716796875, "logps/rejected": -856.3310546875, "loss": 0.5719, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.2967727184295654, "rewards/margins": 0.4965516924858093, "rewards/rejected": 0.8002211451530457, "step": 1590 }, { "epoch": 0.5929494603233427, "grad_norm": 161.9136505126953, "learning_rate": 2.037791774731382e-07, "logits/chosen": -6.121860027313232, "logits/rejected": -6.133907794952393, "logps/chosen": -928.7197265625, "logps/rejected": -803.7809448242188, "loss": 0.6081, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 1.312783122062683, "rewards/margins": 0.4244639277458191, "rewards/rejected": 0.8883193135261536, "step": 1600 }, { "epoch": 0.5929494603233427, "eval_logits/chosen": NaN, "eval_logits/rejected": -6.356760025024414, "eval_logps/chosen": -1139.2073974609375, "eval_logps/rejected": -1049.739990234375, "eval_loss": 0.6694273352622986, "eval_rewards/accuracies": 0.6171284914016724, "eval_rewards/chosen": 1.513792872428894, "eval_rewards/margins": 0.2636261582374573, "eval_rewards/rejected": 1.250166654586792, "eval_runtime": 174.1847, "eval_samples_per_second": 6.838, "eval_steps_per_second": 6.838, "step": 1600 }, { "epoch": 0.5966553944503636, "grad_norm": 110.379638671875, "learning_rate": 2.0192663949610965e-07, "logits/chosen": -6.110042572021484, "logits/rejected": -6.157367706298828, "logps/chosen": -851.4879760742188, "logps/rejected": -773.2803955078125, "loss": 0.5877, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 1.2313387393951416, "rewards/margins": 0.5156643390655518, "rewards/rejected": 0.7156744599342346, "step": 1610 }, { "epoch": 0.6003613285773846, "grad_norm": 196.35398864746094, "learning_rate": 2.0007410151908113e-07, "logits/chosen": -6.248660087585449, "logits/rejected": -6.268450736999512, "logps/chosen": -1027.9803466796875, "logps/rejected": -942.8600463867188, "loss": 0.5961, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 1.4811532497406006, "rewards/margins": 0.4853101670742035, "rewards/rejected": 0.9958430528640747, "step": 1620 }, { "epoch": 0.6040672627044055, "grad_norm": 147.44473266601562, "learning_rate": 1.982215635420526e-07, "logits/chosen": -6.094088554382324, "logits/rejected": NaN, "logps/chosen": -970.6212158203125, "logps/rejected": -842.7681884765625, "loss": 0.668, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": 1.3443838357925415, "rewards/margins": 0.37295302748680115, "rewards/rejected": 0.971430778503418, "step": 1630 }, { "epoch": 0.6077731968314263, "grad_norm": 163.9195556640625, "learning_rate": 1.963690255650241e-07, "logits/chosen": -6.2662577629089355, "logits/rejected": -6.079975128173828, "logps/chosen": -937.974609375, "logps/rejected": -781.6622314453125, "loss": 0.5775, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 1.30463707447052, "rewards/margins": 0.5367504358291626, "rewards/rejected": 0.7678866982460022, "step": 1640 }, { "epoch": 0.6114791309584472, "grad_norm": 162.06088256835938, "learning_rate": 1.9451648758799554e-07, "logits/chosen": -6.107717990875244, "logits/rejected": -6.13240909576416, "logps/chosen": -855.8272705078125, "logps/rejected": -743.3189086914062, "loss": 0.6199, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 1.208332896232605, "rewards/margins": 0.32260221242904663, "rewards/rejected": 0.8857306241989136, "step": 1650 }, { "epoch": 0.6114791309584472, "eval_logits/chosen": NaN, "eval_logits/rejected": -6.351521015167236, "eval_logps/chosen": -1138.5380859375, "eval_logps/rejected": -1049.13720703125, "eval_loss": 0.67048579454422, "eval_rewards/accuracies": 0.6087321639060974, "eval_rewards/chosen": 1.580714464187622, "eval_rewards/margins": 0.2702693045139313, "eval_rewards/rejected": 1.3104450702667236, "eval_runtime": 174.7091, "eval_samples_per_second": 6.817, "eval_steps_per_second": 6.817, "step": 1650 }, { "epoch": 0.6151850650854681, "grad_norm": 209.0579833984375, "learning_rate": 1.9266394961096702e-07, "logits/chosen": -6.237910270690918, "logits/rejected": -6.252842903137207, "logps/chosen": -996.7249755859375, "logps/rejected": -873.1549072265625, "loss": 0.6163, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 1.4242780208587646, "rewards/margins": 0.4079625606536865, "rewards/rejected": 1.0163153409957886, "step": 1660 }, { "epoch": 0.618890999212489, "grad_norm": 152.62171936035156, "learning_rate": 1.908114116339385e-07, "logits/chosen": -6.184769630432129, "logits/rejected": -6.171984672546387, "logps/chosen": -904.5120849609375, "logps/rejected": -842.2957153320312, "loss": 0.6201, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 1.3089262247085571, "rewards/margins": 0.4640830457210541, "rewards/rejected": 0.8448432087898254, "step": 1670 }, { "epoch": 0.6225969333395099, "grad_norm": 163.1865997314453, "learning_rate": 1.8895887365690995e-07, "logits/chosen": -6.068234443664551, "logits/rejected": -6.1052398681640625, "logps/chosen": -892.0081787109375, "logps/rejected": -850.9469604492188, "loss": 0.5911, "rewards/accuracies": 0.6875, "rewards/chosen": 1.2593834400177002, "rewards/margins": 0.4649893641471863, "rewards/rejected": 0.7943940162658691, "step": 1680 }, { "epoch": 0.6263028674665307, "grad_norm": 194.63597106933594, "learning_rate": 1.8710633567988143e-07, "logits/chosen": -6.147702217102051, "logits/rejected": -6.189964294433594, "logps/chosen": -944.7171630859375, "logps/rejected": -848.2835693359375, "loss": 0.594, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 1.3190233707427979, "rewards/margins": 0.39705803990364075, "rewards/rejected": 0.9219652414321899, "step": 1690 }, { "epoch": 0.6300088015935517, "grad_norm": 196.50355529785156, "learning_rate": 1.852537977028529e-07, "logits/chosen": -6.1947102546691895, "logits/rejected": -6.1808247566223145, "logps/chosen": -886.1546020507812, "logps/rejected": -801.2207641601562, "loss": 0.6283, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 1.2352696657180786, "rewards/margins": 0.33806803822517395, "rewards/rejected": 0.8972015380859375, "step": 1700 }, { "epoch": 0.6300088015935517, "eval_logits/chosen": NaN, "eval_logits/rejected": -6.353530406951904, "eval_logps/chosen": -1139.465576171875, "eval_logps/rejected": -1050.0250244140625, "eval_loss": 0.6684470772743225, "eval_rewards/accuracies": 0.6246851682662964, "eval_rewards/chosen": 1.4879825115203857, "eval_rewards/margins": 0.26630899310112, "eval_rewards/rejected": 1.2216734886169434, "eval_runtime": 174.6538, "eval_samples_per_second": 6.819, "eval_steps_per_second": 6.819, "step": 1700 }, { "epoch": 0.6337147357205726, "grad_norm": 148.55259704589844, "learning_rate": 1.834012597258244e-07, "logits/chosen": -6.227621078491211, "logits/rejected": -6.297041893005371, "logps/chosen": -928.7427978515625, "logps/rejected": -803.3206176757812, "loss": 0.5915, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 1.285189151763916, "rewards/margins": 0.5148311853408813, "rewards/rejected": 0.7703579664230347, "step": 1710 }, { "epoch": 0.6374206698475935, "grad_norm": 144.889404296875, "learning_rate": 1.8154872174879584e-07, "logits/chosen": -6.107190132141113, "logits/rejected": -6.062108993530273, "logps/chosen": -983.3319091796875, "logps/rejected": -892.3099365234375, "loss": 0.6625, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": 1.1306359767913818, "rewards/margins": 0.367345929145813, "rewards/rejected": 0.7632900476455688, "step": 1720 }, { "epoch": 0.6411266039746144, "grad_norm": 196.45310974121094, "learning_rate": 1.7969618377176732e-07, "logits/chosen": -6.020756721496582, "logits/rejected": -6.166621208190918, "logps/chosen": -843.7468872070312, "logps/rejected": -768.1483154296875, "loss": 0.6033, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 1.140453577041626, "rewards/margins": 0.39432471990585327, "rewards/rejected": 0.7461288571357727, "step": 1730 }, { "epoch": 0.6448325381016352, "grad_norm": 151.2347412109375, "learning_rate": 1.778436457947388e-07, "logits/chosen": -6.10150671005249, "logits/rejected": -6.154606819152832, "logps/chosen": -896.15185546875, "logps/rejected": -831.7677612304688, "loss": 0.5964, "rewards/accuracies": 0.6875, "rewards/chosen": 1.2138025760650635, "rewards/margins": 0.4559560716152191, "rewards/rejected": 0.757846474647522, "step": 1740 }, { "epoch": 0.6485384722286561, "grad_norm": 226.45729064941406, "learning_rate": 1.7599110781771025e-07, "logits/chosen": -6.205390453338623, "logits/rejected": -6.212441444396973, "logps/chosen": -1027.686767578125, "logps/rejected": -986.68603515625, "loss": 0.5979, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 1.3980613946914673, "rewards/margins": 0.47088512778282166, "rewards/rejected": 0.9271761775016785, "step": 1750 }, { "epoch": 0.6485384722286561, "eval_logits/chosen": NaN, "eval_logits/rejected": -6.354175090789795, "eval_logps/chosen": -1139.527587890625, "eval_logps/rejected": -1050.107666015625, "eval_loss": 0.665941596031189, "eval_rewards/accuracies": 0.6246851682662964, "eval_rewards/chosen": 1.4817659854888916, "eval_rewards/margins": 0.2683611810207367, "eval_rewards/rejected": 1.2134050130844116, "eval_runtime": 174.0278, "eval_samples_per_second": 6.844, "eval_steps_per_second": 6.844, "step": 1750 }, { "epoch": 0.652244406355677, "grad_norm": 144.0729217529297, "learning_rate": 1.7413856984068173e-07, "logits/chosen": -6.174568176269531, "logits/rejected": -6.175555229187012, "logps/chosen": -911.03125, "logps/rejected": -848.8049926757812, "loss": 0.5949, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 1.23678457736969, "rewards/margins": 0.4119951128959656, "rewards/rejected": 0.8247894048690796, "step": 1760 }, { "epoch": 0.655950340482698, "grad_norm": 181.58526611328125, "learning_rate": 1.722860318636532e-07, "logits/chosen": -6.1877946853637695, "logits/rejected": -6.193057060241699, "logps/chosen": -931.5440673828125, "logps/rejected": -827.9782104492188, "loss": 0.6147, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 1.2314589023590088, "rewards/margins": 0.3570247292518616, "rewards/rejected": 0.8744341135025024, "step": 1770 }, { "epoch": 0.6596562746097188, "grad_norm": 152.58062744140625, "learning_rate": 1.7043349388662469e-07, "logits/chosen": -6.187155246734619, "logits/rejected": -6.14534854888916, "logps/chosen": -855.6112060546875, "logps/rejected": -809.7816162109375, "loss": 0.6337, "rewards/accuracies": 0.625, "rewards/chosen": 1.152524709701538, "rewards/margins": 0.3496701121330261, "rewards/rejected": 0.8028545379638672, "step": 1780 }, { "epoch": 0.6633622087367397, "grad_norm": 172.72300720214844, "learning_rate": 1.6858095590959614e-07, "logits/chosen": -6.147979259490967, "logits/rejected": -6.196808815002441, "logps/chosen": -1047.6871337890625, "logps/rejected": -875.8937377929688, "loss": 0.5804, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 1.3021422624588013, "rewards/margins": 0.5173267126083374, "rewards/rejected": 0.7848155498504639, "step": 1790 }, { "epoch": 0.6670681428637606, "grad_norm": 157.03228759765625, "learning_rate": 1.6672841793256762e-07, "logits/chosen": -6.194676876068115, "logits/rejected": -6.176183223724365, "logps/chosen": -866.8079223632812, "logps/rejected": -801.6738891601562, "loss": 0.6229, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 1.2280833721160889, "rewards/margins": 0.37192708253860474, "rewards/rejected": 0.8561564683914185, "step": 1800 }, { "epoch": 0.6670681428637606, "eval_logits/chosen": NaN, "eval_logits/rejected": -6.353016376495361, "eval_logps/chosen": -1139.4063720703125, "eval_logps/rejected": -1050.0079345703125, "eval_loss": 0.6658960580825806, "eval_rewards/accuracies": 0.6154491901397705, "eval_rewards/chosen": 1.4938946962356567, "eval_rewards/margins": 0.27052515745162964, "eval_rewards/rejected": 1.2233693599700928, "eval_runtime": 174.6275, "eval_samples_per_second": 6.82, "eval_steps_per_second": 6.82, "step": 1800 }, { "epoch": 0.6707740769907815, "grad_norm": 192.70468139648438, "learning_rate": 1.648758799555391e-07, "logits/chosen": -6.1675310134887695, "logits/rejected": -6.276528835296631, "logps/chosen": -952.8025512695312, "logps/rejected": -811.5380859375, "loss": 0.6054, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 1.2332537174224854, "rewards/margins": 0.4917047917842865, "rewards/rejected": 0.7415488958358765, "step": 1810 }, { "epoch": 0.6744800111178024, "grad_norm": 204.32887268066406, "learning_rate": 1.6302334197851055e-07, "logits/chosen": -6.1494927406311035, "logits/rejected": -6.112942695617676, "logps/chosen": -909.9085693359375, "logps/rejected": -847.9560546875, "loss": 0.5969, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 1.291352391242981, "rewards/margins": 0.5246464014053345, "rewards/rejected": 0.7667059898376465, "step": 1820 }, { "epoch": 0.6781859452448232, "grad_norm": 129.51596069335938, "learning_rate": 1.6117080400148203e-07, "logits/chosen": -6.111174583435059, "logits/rejected": -6.139338970184326, "logps/chosen": -1006.2116088867188, "logps/rejected": -902.53076171875, "loss": 0.6243, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 1.3876721858978271, "rewards/margins": 0.43100985884666443, "rewards/rejected": 0.9566623568534851, "step": 1830 }, { "epoch": 0.6818918793718441, "grad_norm": 198.752197265625, "learning_rate": 1.593182660244535e-07, "logits/chosen": -6.178341388702393, "logits/rejected": -6.032105922698975, "logps/chosen": -931.4904174804688, "logps/rejected": -842.7693481445312, "loss": 0.6636, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.2749412059783936, "rewards/margins": 0.29882222414016724, "rewards/rejected": 0.9761190414428711, "step": 1840 }, { "epoch": 0.6855978134988651, "grad_norm": 173.8763885498047, "learning_rate": 1.5746572804742499e-07, "logits/chosen": -6.294638156890869, "logits/rejected": -6.279183387756348, "logps/chosen": -1006.7406005859375, "logps/rejected": -953.181640625, "loss": 0.6777, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 1.4022231101989746, "rewards/margins": 0.36820927262306213, "rewards/rejected": 1.0340137481689453, "step": 1850 }, { "epoch": 0.6855978134988651, "eval_logits/chosen": NaN, "eval_logits/rejected": -6.3531341552734375, "eval_logps/chosen": -1139.8416748046875, "eval_logps/rejected": -1050.22802734375, "eval_loss": 0.6716598868370056, "eval_rewards/accuracies": 0.6078925132751465, "eval_rewards/chosen": 1.4503740072250366, "eval_rewards/margins": 0.24901418387889862, "eval_rewards/rejected": 1.2013598680496216, "eval_runtime": 174.1779, "eval_samples_per_second": 6.838, "eval_steps_per_second": 6.838, "step": 1850 }, { "epoch": 0.689303747625886, "grad_norm": 166.79347229003906, "learning_rate": 1.5561319007039644e-07, "logits/chosen": -6.272482872009277, "logits/rejected": -6.234023094177246, "logps/chosen": -851.1373291015625, "logps/rejected": -780.5647583007812, "loss": 0.5844, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 1.1571626663208008, "rewards/margins": 0.47672972083091736, "rewards/rejected": 0.680432915687561, "step": 1860 }, { "epoch": 0.6930096817529069, "grad_norm": 151.9160919189453, "learning_rate": 1.5376065209336792e-07, "logits/chosen": -6.101273059844971, "logits/rejected": -6.033238887786865, "logps/chosen": -865.0284423828125, "logps/rejected": -818.7235107421875, "loss": 0.6109, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 1.0943622589111328, "rewards/margins": 0.34644466638565063, "rewards/rejected": 0.7479175925254822, "step": 1870 }, { "epoch": 0.6967156158799277, "grad_norm": 148.6206817626953, "learning_rate": 1.519081141163394e-07, "logits/chosen": -6.229578971862793, "logits/rejected": -6.284262657165527, "logps/chosen": -960.0841674804688, "logps/rejected": -855.2936401367188, "loss": 0.6012, "rewards/accuracies": 0.6875, "rewards/chosen": 1.3213304281234741, "rewards/margins": 0.48967212438583374, "rewards/rejected": 0.8316582441329956, "step": 1880 }, { "epoch": 0.7004215500069486, "grad_norm": 158.1896514892578, "learning_rate": 1.5005557613931085e-07, "logits/chosen": -6.0919904708862305, "logits/rejected": -6.142601013183594, "logps/chosen": -897.45556640625, "logps/rejected": -893.6177978515625, "loss": 0.6336, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 1.1414662599563599, "rewards/margins": 0.35405582189559937, "rewards/rejected": 0.7874104380607605, "step": 1890 }, { "epoch": 0.7041274841339695, "grad_norm": 248.24693298339844, "learning_rate": 1.4820303816228233e-07, "logits/chosen": -6.12492036819458, "logits/rejected": -6.182600498199463, "logps/chosen": -882.8030395507812, "logps/rejected": -820.3277587890625, "loss": 0.5957, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 1.1524447202682495, "rewards/margins": 0.43523526191711426, "rewards/rejected": 0.71720951795578, "step": 1900 }, { "epoch": 0.7041274841339695, "eval_logits/chosen": NaN, "eval_logits/rejected": -6.356428146362305, "eval_logps/chosen": -1141.09130859375, "eval_logps/rejected": -1051.447998046875, "eval_loss": 0.668496310710907, "eval_rewards/accuracies": 0.6146095991134644, "eval_rewards/chosen": 1.3254036903381348, "eval_rewards/margins": 0.24603614211082458, "eval_rewards/rejected": 1.0793676376342773, "eval_runtime": 174.5224, "eval_samples_per_second": 6.824, "eval_steps_per_second": 6.824, "step": 1900 }, { "epoch": 0.7078334182609904, "grad_norm": 246.97654724121094, "learning_rate": 1.463505001852538e-07, "logits/chosen": -6.261816024780273, "logits/rejected": -6.296639442443848, "logps/chosen": -889.0240478515625, "logps/rejected": -812.9447021484375, "loss": 0.6209, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 1.1191017627716064, "rewards/margins": 0.4275694489479065, "rewards/rejected": 0.6915323138237, "step": 1910 }, { "epoch": 0.7115393523880112, "grad_norm": 187.32501220703125, "learning_rate": 1.4449796220822528e-07, "logits/chosen": -6.17855167388916, "logits/rejected": -6.182552337646484, "logps/chosen": -1076.29833984375, "logps/rejected": -952.2566528320312, "loss": 0.5959, "rewards/accuracies": 0.6875, "rewards/chosen": 1.351000428199768, "rewards/margins": 0.5084677338600159, "rewards/rejected": 0.8425326347351074, "step": 1920 }, { "epoch": 0.7152452865150322, "grad_norm": 189.5558624267578, "learning_rate": 1.4264542423119674e-07, "logits/chosen": -6.147101402282715, "logits/rejected": -6.108138084411621, "logps/chosen": -975.3698120117188, "logps/rejected": -849.8056640625, "loss": 0.6439, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 1.0261156558990479, "rewards/margins": 0.3257748484611511, "rewards/rejected": 0.7003408670425415, "step": 1930 }, { "epoch": 0.7189512206420531, "grad_norm": 188.531494140625, "learning_rate": 1.4079288625416822e-07, "logits/chosen": -6.304642200469971, "logits/rejected": -6.310070991516113, "logps/chosen": -951.8046875, "logps/rejected": -888.8732299804688, "loss": 0.623, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 1.1070719957351685, "rewards/margins": 0.3869238495826721, "rewards/rejected": 0.7201482057571411, "step": 1940 }, { "epoch": 0.722657154769074, "grad_norm": 198.22499084472656, "learning_rate": 1.389403482771397e-07, "logits/chosen": -6.263820648193359, "logits/rejected": NaN, "logps/chosen": -928.9371337890625, "logps/rejected": -846.8762817382812, "loss": 0.6375, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 1.0606300830841064, "rewards/margins": 0.30186527967453003, "rewards/rejected": 0.7587647438049316, "step": 1950 }, { "epoch": 0.722657154769074, "eval_logits/chosen": NaN, "eval_logits/rejected": -6.365767955780029, "eval_logps/chosen": -1140.9320068359375, "eval_logps/rejected": -1051.3402099609375, "eval_loss": 0.6660320162773132, "eval_rewards/accuracies": 0.6263644099235535, "eval_rewards/chosen": 1.341342568397522, "eval_rewards/margins": 0.2511833608150482, "eval_rewards/rejected": 1.0901591777801514, "eval_runtime": 174.295, "eval_samples_per_second": 6.833, "eval_steps_per_second": 6.833, "step": 1950 }, { "epoch": 0.7263630888960949, "grad_norm": 158.5247039794922, "learning_rate": 1.3708781030011115e-07, "logits/chosen": -6.31934118270874, "logits/rejected": -6.218142986297607, "logps/chosen": -926.21826171875, "logps/rejected": -798.2918090820312, "loss": 0.607, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 1.078592300415039, "rewards/margins": 0.38651174306869507, "rewards/rejected": 0.6920806169509888, "step": 1960 }, { "epoch": 0.7300690230231157, "grad_norm": 178.37115478515625, "learning_rate": 1.3523527232308263e-07, "logits/chosen": NaN, "logits/rejected": -6.076174736022949, "logps/chosen": -980.4775390625, "logps/rejected": -835.7880859375, "loss": 0.5914, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 1.232055902481079, "rewards/margins": 0.5219300389289856, "rewards/rejected": 0.710125744342804, "step": 1970 }, { "epoch": 0.7337749571501366, "grad_norm": 164.35330200195312, "learning_rate": 1.333827343460541e-07, "logits/chosen": -6.220945835113525, "logits/rejected": -6.0729475021362305, "logps/chosen": -925.7268676757812, "logps/rejected": -770.1295776367188, "loss": 0.6359, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 1.143280267715454, "rewards/margins": 0.3542799651622772, "rewards/rejected": 0.7890002131462097, "step": 1980 }, { "epoch": 0.7374808912771575, "grad_norm": 168.5455322265625, "learning_rate": 1.3153019636902556e-07, "logits/chosen": -6.159814834594727, "logits/rejected": -6.193203926086426, "logps/chosen": -896.9928588867188, "logps/rejected": -814.4002075195312, "loss": 0.5956, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 1.1392806768417358, "rewards/margins": 0.39641261100769043, "rewards/rejected": 0.7428680658340454, "step": 1990 }, { "epoch": 0.7411868254041785, "grad_norm": 178.9434814453125, "learning_rate": 1.2967765839199704e-07, "logits/chosen": -6.179207801818848, "logits/rejected": -6.206517219543457, "logps/chosen": -962.1814575195312, "logps/rejected": -875.90869140625, "loss": 0.5861, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 1.4602222442626953, "rewards/margins": 0.5008874535560608, "rewards/rejected": 0.9593348503112793, "step": 2000 }, { "epoch": 0.7411868254041785, "eval_logits/chosen": NaN, "eval_logits/rejected": -6.363770008087158, "eval_logps/chosen": -1139.4501953125, "eval_logps/rejected": -1049.95703125, "eval_loss": 0.6680687069892883, "eval_rewards/accuracies": 0.6154491901397705, "eval_rewards/chosen": 1.4895213842391968, "eval_rewards/margins": 0.2610515356063843, "eval_rewards/rejected": 1.2284698486328125, "eval_runtime": 174.3568, "eval_samples_per_second": 6.831, "eval_steps_per_second": 6.831, "step": 2000 }, { "epoch": 0.7448927595311994, "grad_norm": 236.87335205078125, "learning_rate": 1.2782512041496851e-07, "logits/chosen": -6.2562150955200195, "logits/rejected": -6.181870937347412, "logps/chosen": -1007.4918823242188, "logps/rejected": -940.9990234375, "loss": 0.64, "rewards/accuracies": 0.65625, "rewards/chosen": 1.4098269939422607, "rewards/margins": 0.383728563785553, "rewards/rejected": 1.0260984897613525, "step": 2010 }, { "epoch": 0.7485986936582202, "grad_norm": 180.59913635253906, "learning_rate": 1.2597258243794e-07, "logits/chosen": -6.184215068817139, "logits/rejected": -6.141203880310059, "logps/chosen": -925.7356567382812, "logps/rejected": -824.5137939453125, "loss": 0.5902, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 1.2621865272521973, "rewards/margins": 0.42541566491127014, "rewards/rejected": 0.8367708921432495, "step": 2020 }, { "epoch": 0.7523046277852411, "grad_norm": 192.83140563964844, "learning_rate": 1.2412004446091145e-07, "logits/chosen": -6.163074970245361, "logits/rejected": -6.230958461761475, "logps/chosen": -957.0022583007812, "logps/rejected": -863.4744873046875, "loss": 0.5727, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 1.3720769882202148, "rewards/margins": 0.5459089279174805, "rewards/rejected": 0.8261680603027344, "step": 2030 }, { "epoch": 0.756010561912262, "grad_norm": 128.6780242919922, "learning_rate": 1.2226750648388292e-07, "logits/chosen": -6.119555473327637, "logits/rejected": -6.239079475402832, "logps/chosen": -927.7706909179688, "logps/rejected": -826.4373779296875, "loss": 0.5713, "rewards/accuracies": 0.6875, "rewards/chosen": 1.3195993900299072, "rewards/margins": 0.4843137264251709, "rewards/rejected": 0.8352855443954468, "step": 2040 }, { "epoch": 0.7597164960392829, "grad_norm": 244.9542694091797, "learning_rate": 1.2041496850685438e-07, "logits/chosen": -6.327083110809326, "logits/rejected": -6.312042236328125, "logps/chosen": -1045.3717041015625, "logps/rejected": -913.8675537109375, "loss": 0.5965, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.329573392868042, "rewards/margins": 0.427541583776474, "rewards/rejected": 0.9020318984985352, "step": 2050 }, { "epoch": 0.7597164960392829, "eval_logits/chosen": NaN, "eval_logits/rejected": -6.36568546295166, "eval_logps/chosen": -1140.65673828125, "eval_logps/rejected": -1051.03271484375, "eval_loss": 0.6708235740661621, "eval_rewards/accuracies": 0.6112510561943054, "eval_rewards/chosen": 1.3688610792160034, "eval_rewards/margins": 0.24796034395694733, "eval_rewards/rejected": 1.1209006309509277, "eval_runtime": 174.13, "eval_samples_per_second": 6.84, "eval_steps_per_second": 6.84, "step": 2050 }, { "epoch": 0.7634224301663038, "grad_norm": 154.33078002929688, "learning_rate": 1.1856243052982586e-07, "logits/chosen": -6.247130393981934, "logits/rejected": -6.239518642425537, "logps/chosen": -976.1388549804688, "logps/rejected": -855.1339111328125, "loss": 0.5808, "rewards/accuracies": 0.71875, "rewards/chosen": 1.2261674404144287, "rewards/margins": 0.4568137526512146, "rewards/rejected": 0.7693536877632141, "step": 2060 }, { "epoch": 0.7671283642933246, "grad_norm": 152.96670532226562, "learning_rate": 1.1670989255279732e-07, "logits/chosen": -6.12724494934082, "logits/rejected": -6.189521312713623, "logps/chosen": -933.6554565429688, "logps/rejected": -847.5482177734375, "loss": 0.6256, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 1.2696754932403564, "rewards/margins": 0.4418273866176605, "rewards/rejected": 0.8278481364250183, "step": 2070 }, { "epoch": 0.7708342984203456, "grad_norm": 171.41937255859375, "learning_rate": 1.148573545757688e-07, "logits/chosen": -6.24149227142334, "logits/rejected": -6.186224937438965, "logps/chosen": -952.6795654296875, "logps/rejected": -866.2142333984375, "loss": 0.5913, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 1.2056645154953003, "rewards/margins": 0.43794241547584534, "rewards/rejected": 0.7677222490310669, "step": 2080 }, { "epoch": 0.7745402325473665, "grad_norm": 202.81951904296875, "learning_rate": 1.1300481659874027e-07, "logits/chosen": -6.235200881958008, "logits/rejected": -6.2063446044921875, "logps/chosen": -846.9156494140625, "logps/rejected": -822.10205078125, "loss": 0.6708, "rewards/accuracies": 0.606249988079071, "rewards/chosen": 1.0181329250335693, "rewards/margins": 0.2550104558467865, "rewards/rejected": 0.7631224393844604, "step": 2090 }, { "epoch": 0.7782461666743874, "grad_norm": 183.90293884277344, "learning_rate": 1.1115227862171175e-07, "logits/chosen": -6.047713279724121, "logits/rejected": -6.1482133865356445, "logps/chosen": -942.37060546875, "logps/rejected": -869.7897338867188, "loss": 0.5837, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 1.1601864099502563, "rewards/margins": 0.4576262831687927, "rewards/rejected": 0.7025600671768188, "step": 2100 }, { "epoch": 0.7782461666743874, "eval_logits/chosen": NaN, "eval_logits/rejected": -6.365157127380371, "eval_logps/chosen": -1140.8760986328125, "eval_logps/rejected": -1051.16064453125, "eval_loss": 0.6740830540657043, "eval_rewards/accuracies": 0.5994962453842163, "eval_rewards/chosen": 1.3469244241714478, "eval_rewards/margins": 0.23882101476192474, "eval_rewards/rejected": 1.1081035137176514, "eval_runtime": 174.0197, "eval_samples_per_second": 6.844, "eval_steps_per_second": 6.844, "step": 2100 }, { "epoch": 0.7819521008014082, "grad_norm": 165.31671142578125, "learning_rate": 1.0929974064468321e-07, "logits/chosen": -6.240053653717041, "logits/rejected": -6.137989044189453, "logps/chosen": -927.2156372070312, "logps/rejected": -787.1029052734375, "loss": 0.5752, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 1.1965583562850952, "rewards/margins": 0.5078359842300415, "rewards/rejected": 0.6887223124504089, "step": 2110 }, { "epoch": 0.7856580349284291, "grad_norm": 187.2274932861328, "learning_rate": 1.0744720266765468e-07, "logits/chosen": -6.173762321472168, "logits/rejected": -6.2248053550720215, "logps/chosen": -946.9639892578125, "logps/rejected": -851.115234375, "loss": 0.5905, "rewards/accuracies": 0.65625, "rewards/chosen": 1.1326261758804321, "rewards/margins": 0.43694519996643066, "rewards/rejected": 0.6956809163093567, "step": 2120 }, { "epoch": 0.78936396905545, "grad_norm": 203.85891723632812, "learning_rate": 1.0559466469062616e-07, "logits/chosen": -6.200101375579834, "logits/rejected": -6.217113494873047, "logps/chosen": -970.5763549804688, "logps/rejected": -895.93310546875, "loss": 0.6369, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 1.2285493612289429, "rewards/margins": 0.3296011984348297, "rewards/rejected": 0.8989483118057251, "step": 2130 }, { "epoch": 0.793069903182471, "grad_norm": 204.75799560546875, "learning_rate": 1.0374212671359762e-07, "logits/chosen": -6.152576446533203, "logits/rejected": -6.173853874206543, "logps/chosen": -1001.7930908203125, "logps/rejected": -783.9508056640625, "loss": 0.607, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 1.1725000143051147, "rewards/margins": 0.419525682926178, "rewards/rejected": 0.7529743909835815, "step": 2140 }, { "epoch": 0.7967758373094919, "grad_norm": 217.63076782226562, "learning_rate": 1.018895887365691e-07, "logits/chosen": -6.167757987976074, "logits/rejected": -6.10471248626709, "logps/chosen": -996.0086669921875, "logps/rejected": -872.7799072265625, "loss": 0.6336, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 1.3676767349243164, "rewards/margins": 0.4060121476650238, "rewards/rejected": 0.9616644978523254, "step": 2150 }, { "epoch": 0.7967758373094919, "eval_logits/chosen": NaN, "eval_logits/rejected": -6.359668731689453, "eval_logps/chosen": -1139.7021484375, "eval_logps/rejected": -1050.19970703125, "eval_loss": 0.6704598665237427, "eval_rewards/accuracies": 0.6137699484825134, "eval_rewards/chosen": 1.4643239974975586, "eval_rewards/margins": 0.2601255178451538, "eval_rewards/rejected": 1.2041983604431152, "eval_runtime": 174.4958, "eval_samples_per_second": 6.825, "eval_steps_per_second": 6.825, "step": 2150 }, { "epoch": 0.8004817714365127, "grad_norm": 203.3338165283203, "learning_rate": 1.0003705075954057e-07, "logits/chosen": -6.282981872558594, "logits/rejected": -6.3550286293029785, "logps/chosen": -897.6851806640625, "logps/rejected": -796.6698608398438, "loss": 0.6354, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 1.2425462007522583, "rewards/margins": 0.3381389081478119, "rewards/rejected": 0.9044073224067688, "step": 2160 }, { "epoch": 0.8041877055635336, "grad_norm": 164.21310424804688, "learning_rate": 9.818451278251204e-08, "logits/chosen": -6.134262561798096, "logits/rejected": -6.197000503540039, "logps/chosen": -919.0861206054688, "logps/rejected": -864.6526489257812, "loss": 0.6287, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 1.2566717863082886, "rewards/margins": 0.4035015106201172, "rewards/rejected": 0.8531702160835266, "step": 2170 }, { "epoch": 0.8078936396905545, "grad_norm": 153.4619598388672, "learning_rate": 9.633197480548351e-08, "logits/chosen": -6.275097846984863, "logits/rejected": NaN, "logps/chosen": -924.9786376953125, "logps/rejected": -775.713134765625, "loss": 0.5827, "rewards/accuracies": 0.6875, "rewards/chosen": 1.353480577468872, "rewards/margins": 0.5435738563537598, "rewards/rejected": 0.8099066019058228, "step": 2180 }, { "epoch": 0.8115995738175754, "grad_norm": 150.305419921875, "learning_rate": 9.447943682845498e-08, "logits/chosen": -6.12339973449707, "logits/rejected": -6.132723808288574, "logps/chosen": -996.9190673828125, "logps/rejected": -849.7203369140625, "loss": 0.6283, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 1.3306301832199097, "rewards/margins": 0.4646291732788086, "rewards/rejected": 0.8660010099411011, "step": 2190 }, { "epoch": 0.8153055079445963, "grad_norm": 137.26937866210938, "learning_rate": 9.262689885142645e-08, "logits/chosen": -6.194148063659668, "logits/rejected": -6.139514923095703, "logps/chosen": -874.9927978515625, "logps/rejected": -870.7437744140625, "loss": 0.6213, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 1.2261251211166382, "rewards/margins": 0.37579071521759033, "rewards/rejected": 0.8503344655036926, "step": 2200 }, { "epoch": 0.8153055079445963, "eval_logits/chosen": NaN, "eval_logits/rejected": -6.357300758361816, "eval_logps/chosen": -1138.9786376953125, "eval_logps/rejected": -1049.5445556640625, "eval_loss": 0.6698047518730164, "eval_rewards/accuracies": 0.6120907068252563, "eval_rewards/chosen": 1.5366746187210083, "eval_rewards/margins": 0.26695773005485535, "eval_rewards/rejected": 1.269716739654541, "eval_runtime": 174.7913, "eval_samples_per_second": 6.814, "eval_steps_per_second": 6.814, "step": 2200 }, { "epoch": 0.8190114420716171, "grad_norm": 195.36293029785156, "learning_rate": 9.077436087439792e-08, "logits/chosen": -6.112738609313965, "logits/rejected": -6.180370807647705, "logps/chosen": -958.05810546875, "logps/rejected": -914.5363159179688, "loss": 0.6249, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 1.3648970127105713, "rewards/margins": 0.38803738355636597, "rewards/rejected": 0.9768595695495605, "step": 2210 }, { "epoch": 0.822717376198638, "grad_norm": 218.26048278808594, "learning_rate": 8.89218228973694e-08, "logits/chosen": -6.269371032714844, "logits/rejected": -6.25061559677124, "logps/chosen": -948.0462036132812, "logps/rejected": -886.0133666992188, "loss": 0.6872, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.214468002319336, "rewards/margins": 0.23822224140167236, "rewards/rejected": 0.9762457013130188, "step": 2220 }, { "epoch": 0.826423310325659, "grad_norm": 211.05848693847656, "learning_rate": 8.706928492034086e-08, "logits/chosen": -6.225638389587402, "logits/rejected": -6.232121467590332, "logps/chosen": -908.8572998046875, "logps/rejected": -863.0309448242188, "loss": 0.6504, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 1.1848371028900146, "rewards/margins": 0.3589955270290375, "rewards/rejected": 0.8258415460586548, "step": 2230 }, { "epoch": 0.8301292444526799, "grad_norm": 225.31521606445312, "learning_rate": 8.521674694331234e-08, "logits/chosen": -6.275576591491699, "logits/rejected": -6.208001136779785, "logps/chosen": -863.1882934570312, "logps/rejected": -742.2914428710938, "loss": 0.5924, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 1.097625970840454, "rewards/margins": 0.4352286756038666, "rewards/rejected": 0.6623972058296204, "step": 2240 }, { "epoch": 0.8338351785797008, "grad_norm": 229.04183959960938, "learning_rate": 8.336420896628381e-08, "logits/chosen": -6.226241111755371, "logits/rejected": -6.138689994812012, "logps/chosen": -1008.5877685546875, "logps/rejected": -819.7078247070312, "loss": 0.6272, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 1.2099117040634155, "rewards/margins": 0.3881527781486511, "rewards/rejected": 0.8217589259147644, "step": 2250 }, { "epoch": 0.8338351785797008, "eval_logits/chosen": NaN, "eval_logits/rejected": -6.360867500305176, "eval_logps/chosen": -1140.3447265625, "eval_logps/rejected": -1050.7974853515625, "eval_loss": 0.6691888570785522, "eval_rewards/accuracies": 0.6146095991134644, "eval_rewards/chosen": 1.4000587463378906, "eval_rewards/margins": 0.2556445896625519, "eval_rewards/rejected": 1.1444141864776611, "eval_runtime": 174.9273, "eval_samples_per_second": 6.809, "eval_steps_per_second": 6.809, "step": 2250 }, { "epoch": 0.8375411127067216, "grad_norm": 179.97850036621094, "learning_rate": 8.151167098925527e-08, "logits/chosen": -6.222306251525879, "logits/rejected": -6.101675510406494, "logps/chosen": -929.2380981445312, "logps/rejected": -836.9358520507812, "loss": 0.5694, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.3586682081222534, "rewards/margins": 0.4944392740726471, "rewards/rejected": 0.8642290234565735, "step": 2260 }, { "epoch": 0.8412470468337425, "grad_norm": 187.8804473876953, "learning_rate": 7.965913301222675e-08, "logits/chosen": -6.19569730758667, "logits/rejected": -6.270641326904297, "logps/chosen": -982.6009521484375, "logps/rejected": -859.2730712890625, "loss": 0.6045, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 1.1823841333389282, "rewards/margins": 0.3857570290565491, "rewards/rejected": 0.7966271638870239, "step": 2270 }, { "epoch": 0.8449529809607634, "grad_norm": 250.08534240722656, "learning_rate": 7.780659503519822e-08, "logits/chosen": -6.17086935043335, "logits/rejected": -6.222559452056885, "logps/chosen": -1034.419677734375, "logps/rejected": -955.6583251953125, "loss": 0.659, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 1.3033745288848877, "rewards/margins": 0.39839601516723633, "rewards/rejected": 0.9049783945083618, "step": 2280 }, { "epoch": 0.8486589150877843, "grad_norm": 148.72601318359375, "learning_rate": 7.59540570581697e-08, "logits/chosen": -6.179747581481934, "logits/rejected": -6.1192827224731445, "logps/chosen": -986.9896240234375, "logps/rejected": -826.3615112304688, "loss": 0.5721, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 1.323201298713684, "rewards/margins": 0.5028241276741028, "rewards/rejected": 0.8203772306442261, "step": 2290 }, { "epoch": 0.8523648492148052, "grad_norm": 178.256591796875, "learning_rate": 7.410151908114116e-08, "logits/chosen": -6.32712459564209, "logits/rejected": -6.3251752853393555, "logps/chosen": -896.9401245117188, "logps/rejected": -792.0689697265625, "loss": 0.6251, "rewards/accuracies": 0.65625, "rewards/chosen": 1.158760905265808, "rewards/margins": 0.38926878571510315, "rewards/rejected": 0.7694920897483826, "step": 2300 }, { "epoch": 0.8523648492148052, "eval_logits/chosen": NaN, "eval_logits/rejected": -6.361525535583496, "eval_logps/chosen": -1140.1575927734375, "eval_logps/rejected": -1050.596923828125, "eval_loss": 0.667841911315918, "eval_rewards/accuracies": 0.6095718145370483, "eval_rewards/chosen": 1.418774962425232, "eval_rewards/margins": 0.25429922342300415, "eval_rewards/rejected": 1.164475917816162, "eval_runtime": 174.9606, "eval_samples_per_second": 6.807, "eval_steps_per_second": 6.807, "step": 2300 }, { "epoch": 0.8560707833418261, "grad_norm": 289.4090270996094, "learning_rate": 7.224898110411264e-08, "logits/chosen": -6.040954113006592, "logits/rejected": -6.1116743087768555, "logps/chosen": -868.0579833984375, "logps/rejected": -742.7337646484375, "loss": 0.5972, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 1.136728286743164, "rewards/margins": 0.48759302496910095, "rewards/rejected": 0.6491352915763855, "step": 2310 }, { "epoch": 0.859776717468847, "grad_norm": 209.7076416015625, "learning_rate": 7.039644312708411e-08, "logits/chosen": -6.146378993988037, "logits/rejected": -6.205878257751465, "logps/chosen": -955.7279052734375, "logps/rejected": -776.606201171875, "loss": 0.6427, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 1.3216975927352905, "rewards/margins": 0.4136362671852112, "rewards/rejected": 0.9080612063407898, "step": 2320 }, { "epoch": 0.8634826515958679, "grad_norm": 157.4209442138672, "learning_rate": 6.854390515005557e-08, "logits/chosen": -6.1600141525268555, "logits/rejected": -6.090916633605957, "logps/chosen": -1095.3663330078125, "logps/rejected": -915.3992919921875, "loss": 0.5802, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 1.4250476360321045, "rewards/margins": 0.5543738007545471, "rewards/rejected": 0.8706739544868469, "step": 2330 }, { "epoch": 0.8671885857228888, "grad_norm": 202.49034118652344, "learning_rate": 6.669136717302705e-08, "logits/chosen": -6.110904216766357, "logits/rejected": -6.1261210441589355, "logps/chosen": -967.0671997070312, "logps/rejected": -858.1019287109375, "loss": 0.6087, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 1.3673769235610962, "rewards/margins": 0.5143194198608398, "rewards/rejected": 0.8530575037002563, "step": 2340 }, { "epoch": 0.8708945198499096, "grad_norm": 103.83200073242188, "learning_rate": 6.483882919599852e-08, "logits/chosen": -6.204503059387207, "logits/rejected": -6.187712669372559, "logps/chosen": -872.6140747070312, "logps/rejected": -760.0308227539062, "loss": 0.6168, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 1.1666477918624878, "rewards/margins": 0.4573976993560791, "rewards/rejected": 0.7092500925064087, "step": 2350 }, { "epoch": 0.8708945198499096, "eval_logits/chosen": NaN, "eval_logits/rejected": -6.359028339385986, "eval_logps/chosen": -1138.91455078125, "eval_logps/rejected": -1049.516845703125, "eval_loss": 0.668138325214386, "eval_rewards/accuracies": 0.6221662759780884, "eval_rewards/chosen": 1.543074131011963, "eval_rewards/margins": 0.2705841064453125, "eval_rewards/rejected": 1.2724900245666504, "eval_runtime": 174.7239, "eval_samples_per_second": 6.816, "eval_steps_per_second": 6.816, "step": 2350 }, { "epoch": 0.8746004539769305, "grad_norm": 226.57864379882812, "learning_rate": 6.298629121897e-08, "logits/chosen": NaN, "logits/rejected": -6.086965084075928, "logps/chosen": -982.00390625, "logps/rejected": -850.6180419921875, "loss": 0.6142, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 1.2618882656097412, "rewards/margins": 0.42980679869651794, "rewards/rejected": 0.8320814967155457, "step": 2360 }, { "epoch": 0.8783063881039515, "grad_norm": 250.1316680908203, "learning_rate": 6.113375324194146e-08, "logits/chosen": -6.260158061981201, "logits/rejected": -6.332103252410889, "logps/chosen": -921.3849487304688, "logps/rejected": -801.4791259765625, "loss": 0.6029, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 1.3854167461395264, "rewards/margins": 0.49748069047927856, "rewards/rejected": 0.8879362344741821, "step": 2370 }, { "epoch": 0.8820123222309724, "grad_norm": 257.1830749511719, "learning_rate": 5.928121526491293e-08, "logits/chosen": -6.317752361297607, "logits/rejected": NaN, "logps/chosen": -978.55126953125, "logps/rejected": -801.6576538085938, "loss": 0.5736, "rewards/accuracies": 0.75, "rewards/chosen": 1.3101433515548706, "rewards/margins": 0.5170674324035645, "rewards/rejected": 0.7930759787559509, "step": 2380 }, { "epoch": 0.8857182563579933, "grad_norm": 174.4718475341797, "learning_rate": 5.74286772878844e-08, "logits/chosen": -6.182769298553467, "logits/rejected": NaN, "logps/chosen": -841.9371948242188, "logps/rejected": -779.7135009765625, "loss": 0.6186, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 1.247337818145752, "rewards/margins": 0.39551234245300293, "rewards/rejected": 0.8518252372741699, "step": 2390 }, { "epoch": 0.8894241904850141, "grad_norm": 150.2713165283203, "learning_rate": 5.557613931085587e-08, "logits/chosen": NaN, "logits/rejected": -6.047909736633301, "logps/chosen": -949.482421875, "logps/rejected": -869.8361206054688, "loss": 0.6183, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 1.4006798267364502, "rewards/margins": 0.3790472149848938, "rewards/rejected": 1.0216325521469116, "step": 2400 }, { "epoch": 0.8894241904850141, "eval_logits/chosen": NaN, "eval_logits/rejected": -6.359555244445801, "eval_logps/chosen": -1138.1962890625, "eval_logps/rejected": -1048.9058837890625, "eval_loss": 0.6646097898483276, "eval_rewards/accuracies": 0.6221662759780884, "eval_rewards/chosen": 1.6149109601974487, "eval_rewards/margins": 0.2813268303871155, "eval_rewards/rejected": 1.333584189414978, "eval_runtime": 174.9379, "eval_samples_per_second": 6.808, "eval_steps_per_second": 6.808, "step": 2400 }, { "epoch": 0.893130124612035, "grad_norm": 142.9062042236328, "learning_rate": 5.372360133382734e-08, "logits/chosen": -6.116464138031006, "logits/rejected": -6.133796691894531, "logps/chosen": -922.8450317382812, "logps/rejected": -898.6383666992188, "loss": 0.633, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 1.3375623226165771, "rewards/margins": 0.38691186904907227, "rewards/rejected": 0.9506505131721497, "step": 2410 }, { "epoch": 0.8968360587390559, "grad_norm": 194.79432678222656, "learning_rate": 5.187106335679881e-08, "logits/chosen": -6.197569370269775, "logits/rejected": -6.240142345428467, "logps/chosen": -906.27001953125, "logps/rejected": -837.5906372070312, "loss": 0.6182, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 1.3941477537155151, "rewards/margins": 0.47745227813720703, "rewards/rejected": 0.9166954159736633, "step": 2420 }, { "epoch": 0.9005419928660768, "grad_norm": 176.5300750732422, "learning_rate": 5.001852537977028e-08, "logits/chosen": -6.1893310546875, "logits/rejected": -6.254434585571289, "logps/chosen": -949.20654296875, "logps/rejected": -864.1427001953125, "loss": 0.6234, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 1.4380910396575928, "rewards/margins": 0.3952023983001709, "rewards/rejected": 1.0428886413574219, "step": 2430 }, { "epoch": 0.9042479269930978, "grad_norm": 204.15846252441406, "learning_rate": 4.8165987402741755e-08, "logits/chosen": -6.22286319732666, "logits/rejected": -6.251989841461182, "logps/chosen": -1061.7998046875, "logps/rejected": -907.6027221679688, "loss": 0.6386, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 1.4448530673980713, "rewards/margins": 0.4636596739292145, "rewards/rejected": 0.9811934232711792, "step": 2440 }, { "epoch": 0.9079538611201186, "grad_norm": 172.95408630371094, "learning_rate": 4.631344942571323e-08, "logits/chosen": -6.1700005531311035, "logits/rejected": -6.175479412078857, "logps/chosen": -931.7879028320312, "logps/rejected": -793.9867553710938, "loss": 0.5927, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.3887077569961548, "rewards/margins": 0.46491020917892456, "rewards/rejected": 0.9237974882125854, "step": 2450 }, { "epoch": 0.9079538611201186, "eval_logits/chosen": NaN, "eval_logits/rejected": -6.360927581787109, "eval_logps/chosen": -1138.57568359375, "eval_logps/rejected": -1049.185302734375, "eval_loss": 0.6698666214942932, "eval_rewards/accuracies": 0.6179680824279785, "eval_rewards/chosen": 1.576967477798462, "eval_rewards/margins": 0.2713308334350586, "eval_rewards/rejected": 1.3056366443634033, "eval_runtime": 174.8797, "eval_samples_per_second": 6.81, "eval_steps_per_second": 6.81, "step": 2450 }, { "epoch": 0.9116597952471395, "grad_norm": 199.45687866210938, "learning_rate": 4.44609114486847e-08, "logits/chosen": -6.102917194366455, "logits/rejected": -6.159636974334717, "logps/chosen": -846.6583251953125, "logps/rejected": -778.142333984375, "loss": 0.6016, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 1.2215101718902588, "rewards/margins": 0.4805546700954437, "rewards/rejected": 0.7409554719924927, "step": 2460 }, { "epoch": 0.9153657293741604, "grad_norm": 148.47500610351562, "learning_rate": 4.260837347165617e-08, "logits/chosen": -6.2448930740356445, "logits/rejected": -6.158895015716553, "logps/chosen": -1002.0369262695312, "logps/rejected": -890.6515502929688, "loss": 0.5622, "rewards/accuracies": 0.731249988079071, "rewards/chosen": 1.374596357345581, "rewards/margins": 0.5444284081459045, "rewards/rejected": 0.8301678895950317, "step": 2470 }, { "epoch": 0.9190716635011813, "grad_norm": 186.5249481201172, "learning_rate": 4.075583549462764e-08, "logits/chosen": -6.199591159820557, "logits/rejected": -6.20455265045166, "logps/chosen": -952.9064331054688, "logps/rejected": -825.30859375, "loss": 0.6153, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 1.4213062524795532, "rewards/margins": 0.4216051697731018, "rewards/rejected": 0.9997010231018066, "step": 2480 }, { "epoch": 0.9227775976282021, "grad_norm": 175.28465270996094, "learning_rate": 3.890329751759911e-08, "logits/chosen": -6.196984767913818, "logits/rejected": -6.227551460266113, "logps/chosen": -967.3463745117188, "logps/rejected": -785.5040283203125, "loss": 0.5873, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 1.3924157619476318, "rewards/margins": 0.5140901803970337, "rewards/rejected": 0.8783254623413086, "step": 2490 }, { "epoch": 0.926483531755223, "grad_norm": 205.52532958984375, "learning_rate": 3.705075954057058e-08, "logits/chosen": -6.220477104187012, "logits/rejected": -6.22896146774292, "logps/chosen": -926.861328125, "logps/rejected": -857.521484375, "loss": 0.6039, "rewards/accuracies": 0.65625, "rewards/chosen": 1.3103541135787964, "rewards/margins": 0.4116063117980957, "rewards/rejected": 0.8987478017807007, "step": 2500 }, { "epoch": 0.926483531755223, "eval_logits/chosen": NaN, "eval_logits/rejected": -6.359811305999756, "eval_logps/chosen": -1138.34375, "eval_logps/rejected": -1049.01904296875, "eval_loss": 0.6664721965789795, "eval_rewards/accuracies": 0.6263644099235535, "eval_rewards/chosen": 1.6001617908477783, "eval_rewards/margins": 0.2778994143009186, "eval_rewards/rejected": 1.3222622871398926, "eval_runtime": 175.0068, "eval_samples_per_second": 6.805, "eval_steps_per_second": 6.805, "step": 2500 }, { "epoch": 0.9301894658822439, "grad_norm": 126.9505615234375, "learning_rate": 3.5198221563542054e-08, "logits/chosen": NaN, "logits/rejected": -6.187649726867676, "logps/chosen": -1037.003173828125, "logps/rejected": -877.7476806640625, "loss": 0.5535, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 1.5431692600250244, "rewards/margins": 0.6853631734848022, "rewards/rejected": 0.8578060865402222, "step": 2510 }, { "epoch": 0.9338954000092649, "grad_norm": 217.86968994140625, "learning_rate": 3.3345683586513526e-08, "logits/chosen": -6.283205032348633, "logits/rejected": -6.221907615661621, "logps/chosen": -924.9992065429688, "logps/rejected": -855.3511962890625, "loss": 0.6438, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 1.3765745162963867, "rewards/margins": 0.3500059247016907, "rewards/rejected": 1.0265684127807617, "step": 2520 }, { "epoch": 0.9376013341362858, "grad_norm": 170.4639434814453, "learning_rate": 3.1493145609485e-08, "logits/chosen": -6.257707595825195, "logits/rejected": -6.27487850189209, "logps/chosen": -903.9068603515625, "logps/rejected": -843.6154174804688, "loss": 0.6395, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 1.2748887538909912, "rewards/margins": 0.3705710768699646, "rewards/rejected": 0.9043177366256714, "step": 2530 }, { "epoch": 0.9413072682633066, "grad_norm": 150.2277069091797, "learning_rate": 2.9640607632456464e-08, "logits/chosen": -6.218627452850342, "logits/rejected": -6.296253204345703, "logps/chosen": -1027.5751953125, "logps/rejected": -868.8278198242188, "loss": 0.6068, "rewards/accuracies": 0.65625, "rewards/chosen": 1.4749407768249512, "rewards/margins": 0.5608575940132141, "rewards/rejected": 0.9140831232070923, "step": 2540 }, { "epoch": 0.9450132023903275, "grad_norm": 176.552490234375, "learning_rate": 2.7788069655427936e-08, "logits/chosen": -6.115626335144043, "logits/rejected": -6.002920150756836, "logps/chosen": -835.7297973632812, "logps/rejected": -763.3617553710938, "loss": 0.6288, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 1.1424133777618408, "rewards/margins": 0.3475190997123718, "rewards/rejected": 0.7948943376541138, "step": 2550 }, { "epoch": 0.9450132023903275, "eval_logits/chosen": NaN, "eval_logits/rejected": -6.3612494468688965, "eval_logps/chosen": -1138.4256591796875, "eval_logps/rejected": -1049.066162109375, "eval_loss": 0.6675523519515991, "eval_rewards/accuracies": 0.6146095991134644, "eval_rewards/chosen": 1.5919551849365234, "eval_rewards/margins": 0.2744098901748657, "eval_rewards/rejected": 1.3175454139709473, "eval_runtime": 174.7172, "eval_samples_per_second": 6.817, "eval_steps_per_second": 6.817, "step": 2550 }, { "epoch": 0.9487191365173484, "grad_norm": 168.2140350341797, "learning_rate": 2.5935531678399405e-08, "logits/chosen": -6.1927971839904785, "logits/rejected": -6.197493553161621, "logps/chosen": -991.3941650390625, "logps/rejected": -906.1048583984375, "loss": 0.6289, "rewards/accuracies": 0.625, "rewards/chosen": 1.511885404586792, "rewards/margins": 0.4927326738834381, "rewards/rejected": 1.0191527605056763, "step": 2560 }, { "epoch": 0.9524250706443693, "grad_norm": 171.1510009765625, "learning_rate": 2.4082993701370877e-08, "logits/chosen": -6.209620475769043, "logits/rejected": -6.170471668243408, "logps/chosen": -925.6842041015625, "logps/rejected": -813.4041137695312, "loss": 0.6095, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 1.367976188659668, "rewards/margins": 0.4230473041534424, "rewards/rejected": 0.9449288249015808, "step": 2570 }, { "epoch": 0.9561310047713902, "grad_norm": 194.6319580078125, "learning_rate": 2.223045572434235e-08, "logits/chosen": -6.165289878845215, "logits/rejected": -6.122786521911621, "logps/chosen": -965.0526123046875, "logps/rejected": -807.0406494140625, "loss": 0.5615, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 1.4062001705169678, "rewards/margins": 0.6220154166221619, "rewards/rejected": 0.7841848134994507, "step": 2580 }, { "epoch": 0.959836938898411, "grad_norm": 187.33181762695312, "learning_rate": 2.037791774731382e-08, "logits/chosen": -6.2167792320251465, "logits/rejected": NaN, "logps/chosen": -980.70263671875, "logps/rejected": -831.4769287109375, "loss": 0.568, "rewards/accuracies": 0.6875, "rewards/chosen": 1.3932929039001465, "rewards/margins": 0.5573422908782959, "rewards/rejected": 0.8359505534172058, "step": 2590 }, { "epoch": 0.963542873025432, "grad_norm": 142.72923278808594, "learning_rate": 1.852537977028529e-08, "logits/chosen": -6.158575534820557, "logits/rejected": -6.203221321105957, "logps/chosen": -921.6607666015625, "logps/rejected": -842.1689453125, "loss": 0.5714, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 1.325329065322876, "rewards/margins": 0.510645866394043, "rewards/rejected": 0.8146833181381226, "step": 2600 }, { "epoch": 0.963542873025432, "eval_logits/chosen": NaN, "eval_logits/rejected": -6.361992835998535, "eval_logps/chosen": -1138.678955078125, "eval_logps/rejected": -1049.255615234375, "eval_loss": 0.6706271171569824, "eval_rewards/accuracies": 0.6120907068252563, "eval_rewards/chosen": 1.566640853881836, "eval_rewards/margins": 0.26803797483444214, "eval_rewards/rejected": 1.2986030578613281, "eval_runtime": 174.9793, "eval_samples_per_second": 6.807, "eval_steps_per_second": 6.807, "step": 2600 }, { "epoch": 0.9672488071524529, "grad_norm": 190.34542846679688, "learning_rate": 1.6672841793256763e-08, "logits/chosen": -6.201694488525391, "logits/rejected": -6.2331223487854, "logps/chosen": -881.90576171875, "logps/rejected": -822.5655517578125, "loss": 0.6516, "rewards/accuracies": 0.59375, "rewards/chosen": 1.2538020610809326, "rewards/margins": 0.3467678427696228, "rewards/rejected": 0.9070342183113098, "step": 2610 }, { "epoch": 0.9709547412794738, "grad_norm": 273.5181884765625, "learning_rate": 1.4820303816228232e-08, "logits/chosen": -6.193233489990234, "logits/rejected": -6.104687690734863, "logps/chosen": -866.4398193359375, "logps/rejected": -785.888916015625, "loss": 0.6333, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 1.187961220741272, "rewards/margins": 0.3987189829349518, "rewards/rejected": 0.7892423868179321, "step": 2620 }, { "epoch": 0.9746606754064947, "grad_norm": 230.2846221923828, "learning_rate": 1.2967765839199703e-08, "logits/chosen": -6.259491920471191, "logits/rejected": -6.279690265655518, "logps/chosen": -791.6710815429688, "logps/rejected": -694.3341674804688, "loss": 0.6119, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 1.2341381311416626, "rewards/margins": 0.4778687059879303, "rewards/rejected": 0.7562695741653442, "step": 2630 }, { "epoch": 0.9783666095335155, "grad_norm": 219.6723175048828, "learning_rate": 1.1115227862171175e-08, "logits/chosen": -6.238982677459717, "logits/rejected": -6.274487495422363, "logps/chosen": -911.5924072265625, "logps/rejected": -826.4364013671875, "loss": 0.6341, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 1.3268905878067017, "rewards/margins": 0.3773989975452423, "rewards/rejected": 0.9494916200637817, "step": 2640 }, { "epoch": 0.9820725436605364, "grad_norm": 228.98863220214844, "learning_rate": 9.262689885142645e-09, "logits/chosen": -6.187376976013184, "logits/rejected": -6.207940578460693, "logps/chosen": -867.3173828125, "logps/rejected": -846.06298828125, "loss": 0.6064, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 1.1946938037872314, "rewards/margins": 0.3911550045013428, "rewards/rejected": 0.8035389184951782, "step": 2650 }, { "epoch": 0.9820725436605364, "eval_logits/chosen": NaN, "eval_logits/rejected": -6.362624645233154, "eval_logps/chosen": -1138.8880615234375, "eval_logps/rejected": -1049.453125, "eval_loss": 0.6678369641304016, "eval_rewards/accuracies": 0.6246851682662964, "eval_rewards/chosen": 1.5457268953323364, "eval_rewards/margins": 0.2668676972389221, "eval_rewards/rejected": 1.2788591384887695, "eval_runtime": 174.9484, "eval_samples_per_second": 6.808, "eval_steps_per_second": 6.808, "step": 2650 }, { "epoch": 0.9857784777875573, "grad_norm": 147.26541137695312, "learning_rate": 7.410151908114116e-09, "logits/chosen": -6.185873508453369, "logits/rejected": -6.177300930023193, "logps/chosen": -981.9752197265625, "logps/rejected": -823.0641479492188, "loss": 0.557, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 1.3892064094543457, "rewards/margins": 0.5826338529586792, "rewards/rejected": 0.8065725564956665, "step": 2660 }, { "epoch": 0.9894844119145783, "grad_norm": 143.0415802001953, "learning_rate": 5.5576139310855874e-09, "logits/chosen": -6.007561683654785, "logits/rejected": -6.002453327178955, "logps/chosen": -852.1906127929688, "logps/rejected": -811.2535400390625, "loss": 0.5599, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 1.2815325260162354, "rewards/margins": 0.49190282821655273, "rewards/rejected": 0.7896297574043274, "step": 2670 }, { "epoch": 0.9931903460415991, "grad_norm": 210.65750122070312, "learning_rate": 3.705075954057058e-09, "logits/chosen": -6.069428443908691, "logits/rejected": -6.068325996398926, "logps/chosen": -1039.2158203125, "logps/rejected": -915.1730346679688, "loss": 0.5495, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 1.5940990447998047, "rewards/margins": 0.5972550511360168, "rewards/rejected": 0.9968441128730774, "step": 2680 }, { "epoch": 0.99689628016862, "grad_norm": 211.33241271972656, "learning_rate": 1.852537977028529e-09, "logits/chosen": -6.273778438568115, "logits/rejected": -6.336636543273926, "logps/chosen": -950.8792724609375, "logps/rejected": -863.4602661132812, "loss": 0.6386, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.4326883554458618, "rewards/margins": 0.3855132758617401, "rewards/rejected": 1.0471750497817993, "step": 2690 } ], "logging_steps": 10, "max_steps": 2699, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }