{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.999214865218529, "eval_steps": 1000, "global_step": 3820, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0005234231876472127, "grad_norm": 210.4916534423828, "learning_rate": 0.0, "logits/chosen": -5.20703125, "logits/rejected": -5.03125, "logps/chosen": -469.0, "logps/rejected": -450.25, "loss": 0.6914, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.005234231876472127, "grad_norm": 208.33755493164062, "learning_rate": 2.3560209424083768e-08, "logits/chosen": -4.896918296813965, "logits/rejected": -4.93077278137207, "logps/chosen": -423.1875, "logps/rejected": -356.0138854980469, "loss": 0.7117, "rewards/accuracies": 0.2013888955116272, "rewards/chosen": 0.0572916679084301, "rewards/margins": 0.0681423619389534, "rewards/rejected": -0.0108506940305233, "step": 10 }, { "epoch": 0.010468463752944255, "grad_norm": 166.18814086914062, "learning_rate": 4.973821989528795e-08, "logits/chosen": -4.843554496765137, "logits/rejected": -4.822265625, "logps/chosen": -348.23748779296875, "logps/rejected": -334.4906311035156, "loss": 0.7587, "rewards/accuracies": 0.24062499403953552, "rewards/chosen": 0.01308593712747097, "rewards/margins": -0.0033203125931322575, "rewards/rejected": 0.01640624925494194, "step": 20 }, { "epoch": 0.015702695629416383, "grad_norm": 174.186279296875, "learning_rate": 7.591623036649214e-08, "logits/chosen": -4.884179592132568, "logits/rejected": -4.926171779632568, "logps/chosen": -399.9375, "logps/rejected": -337.7749938964844, "loss": 0.7698, "rewards/accuracies": 0.20937499403953552, "rewards/chosen": -0.02890625037252903, "rewards/margins": -0.03945312649011612, "rewards/rejected": 0.010546875186264515, "step": 30 }, { "epoch": 0.02093692750588851, "grad_norm": 174.37091064453125, "learning_rate": 
1.0209424083769633e-07, "logits/chosen": -4.883984565734863, "logits/rejected": -4.83984375, "logps/chosen": -391.60626220703125, "logps/rejected": -348.7875061035156, "loss": 0.7086, "rewards/accuracies": 0.26249998807907104, "rewards/chosen": 0.06796874850988388, "rewards/margins": 0.09003905951976776, "rewards/rejected": -0.02207031287252903, "step": 40 }, { "epoch": 0.02617115938236064, "grad_norm": 184.66021728515625, "learning_rate": 1.282722513089005e-07, "logits/chosen": -4.859765529632568, "logits/rejected": -4.878320217132568, "logps/chosen": -362.89373779296875, "logps/rejected": -329.4125061035156, "loss": 0.7445, "rewards/accuracies": 0.24375000596046448, "rewards/chosen": 0.03046875074505806, "rewards/margins": 0.0023437500931322575, "rewards/rejected": 0.02812499925494194, "step": 50 }, { "epoch": 0.031405391258832765, "grad_norm": 189.43434143066406, "learning_rate": 1.544502617801047e-07, "logits/chosen": -4.886132717132568, "logits/rejected": -4.882226467132568, "logps/chosen": -423.5406188964844, "logps/rejected": -364.35626220703125, "loss": 0.7487, "rewards/accuracies": 0.20937499403953552, "rewards/chosen": -0.0035156249068677425, "rewards/margins": 0.00390625, "rewards/rejected": -0.0074218749068677425, "step": 60 }, { "epoch": 0.036639623135304895, "grad_norm": 174.75437927246094, "learning_rate": 1.8062827225130888e-07, "logits/chosen": -4.896679878234863, "logits/rejected": -4.800195217132568, "logps/chosen": -381.2749938964844, "logps/rejected": -337.3812561035156, "loss": 0.7801, "rewards/accuracies": 0.23749999701976776, "rewards/chosen": -0.01992187462747097, "rewards/margins": -0.04863281175494194, "rewards/rejected": 0.02871093712747097, "step": 70 }, { "epoch": 0.04187385501177702, "grad_norm": 202.25637817382812, "learning_rate": 2.0680628272251307e-07, "logits/chosen": -4.924609184265137, "logits/rejected": -4.921288967132568, "logps/chosen": -398.34375, "logps/rejected": -356.13751220703125, "loss": 0.7728, "rewards/accuracies": 
0.21562500298023224, "rewards/chosen": 0.0013671874767169356, "rewards/margins": -0.03535156324505806, "rewards/rejected": 0.03671874850988388, "step": 80 }, { "epoch": 0.04710808688824915, "grad_norm": 187.97657775878906, "learning_rate": 2.3298429319371725e-07, "logits/chosen": -4.907617092132568, "logits/rejected": -4.8369140625, "logps/chosen": -349.3062438964844, "logps/rejected": -316.0874938964844, "loss": 0.7582, "rewards/accuracies": 0.22812500596046448, "rewards/chosen": -0.0025390624068677425, "rewards/margins": -0.01503906212747097, "rewards/rejected": 0.012500000186264515, "step": 90 }, { "epoch": 0.05234231876472128, "grad_norm": 175.65379333496094, "learning_rate": 2.591623036649215e-07, "logits/chosen": -4.946484565734863, "logits/rejected": -4.925976753234863, "logps/chosen": -376.78125, "logps/rejected": -320.1312561035156, "loss": 0.7598, "rewards/accuracies": 0.265625, "rewards/chosen": -0.01679687574505806, "rewards/margins": -0.02421874925494194, "rewards/rejected": 0.0074218749068677425, "step": 100 }, { "epoch": 0.05757655064119341, "grad_norm": 170.0375518798828, "learning_rate": 2.853403141361256e-07, "logits/chosen": -4.845312595367432, "logits/rejected": -4.8720703125, "logps/chosen": -386.703125, "logps/rejected": -325.6937561035156, "loss": 0.7507, "rewards/accuracies": 0.2593750059604645, "rewards/chosen": 0.05878906324505806, "rewards/margins": 0.06015624850988388, "rewards/rejected": -0.0013671874767169356, "step": 110 }, { "epoch": 0.06281078251766553, "grad_norm": 197.53945922851562, "learning_rate": 3.1151832460732986e-07, "logits/chosen": -4.958788871765137, "logits/rejected": -4.956445217132568, "logps/chosen": -435.625, "logps/rejected": -389.0874938964844, "loss": 0.7531, "rewards/accuracies": 0.22812500596046448, "rewards/chosen": 0.02558593824505806, "rewards/margins": 0.0283203125, "rewards/rejected": -0.0027343749534338713, "step": 120 }, { "epoch": 0.06804501439413765, "grad_norm": 195.65884399414062, "learning_rate": 
3.37696335078534e-07, "logits/chosen": -4.831738471984863, "logits/rejected": -4.8544921875, "logps/chosen": -409.33123779296875, "logps/rejected": -352.84375, "loss": 0.739, "rewards/accuracies": 0.24062499403953552, "rewards/chosen": 0.05214843899011612, "rewards/margins": 0.01230468787252903, "rewards/rejected": 0.03984374925494194, "step": 130 }, { "epoch": 0.07327924627060979, "grad_norm": 181.20448303222656, "learning_rate": 3.6387434554973823e-07, "logits/chosen": -4.8798828125, "logits/rejected": -4.847851753234863, "logps/chosen": -382.37188720703125, "logps/rejected": -325.21875, "loss": 0.806, "rewards/accuracies": 0.2750000059604645, "rewards/chosen": 0.03554687649011612, "rewards/margins": 0.01328125037252903, "rewards/rejected": 0.02226562425494194, "step": 140 }, { "epoch": 0.07851347814708191, "grad_norm": 223.26377868652344, "learning_rate": 3.900523560209424e-07, "logits/chosen": -4.886914253234863, "logits/rejected": -4.841406345367432, "logps/chosen": -390.17498779296875, "logps/rejected": -341.46875, "loss": 0.7377, "rewards/accuracies": 0.2593750059604645, "rewards/chosen": 0.07207031548023224, "rewards/margins": 0.04746093600988388, "rewards/rejected": 0.02460937574505806, "step": 150 }, { "epoch": 0.08374771002355404, "grad_norm": 177.750732421875, "learning_rate": 4.162303664921466e-07, "logits/chosen": -4.925390720367432, "logits/rejected": -4.885156154632568, "logps/chosen": -401.07501220703125, "logps/rejected": -343.5625, "loss": 0.6785, "rewards/accuracies": 0.2906250059604645, "rewards/chosen": 0.15644530951976776, "rewards/margins": 0.15839843451976776, "rewards/rejected": -0.001953125, "step": 160 }, { "epoch": 0.08898194190002617, "grad_norm": 192.75547790527344, "learning_rate": 4.424083769633508e-07, "logits/chosen": -4.857617378234863, "logits/rejected": -4.904492378234863, "logps/chosen": -401.5687561035156, "logps/rejected": -358.92498779296875, "loss": 0.6825, "rewards/accuracies": 0.39375001192092896, "rewards/chosen": 
0.22578124701976776, "rewards/margins": 0.21152344346046448, "rewards/rejected": 0.01425781287252903, "step": 170 }, { "epoch": 0.0942161737764983, "grad_norm": 141.7459259033203, "learning_rate": 4.6858638743455497e-07, "logits/chosen": -4.9287109375, "logits/rejected": -4.867578029632568, "logps/chosen": -400.8812561035156, "logps/rejected": -355.60626220703125, "loss": 0.6838, "rewards/accuracies": 0.35624998807907104, "rewards/chosen": 0.24589844048023224, "rewards/margins": 0.18886718153953552, "rewards/rejected": 0.05703125149011612, "step": 180 }, { "epoch": 0.09945040565297043, "grad_norm": 208.60675048828125, "learning_rate": 4.947643979057592e-07, "logits/chosen": -4.891211032867432, "logits/rejected": -4.859961032867432, "logps/chosen": -412.54998779296875, "logps/rejected": -349.1937561035156, "loss": 0.7194, "rewards/accuracies": 0.390625, "rewards/chosen": 0.23964843153953552, "rewards/margins": 0.20683594048023224, "rewards/rejected": 0.03281249850988388, "step": 190 }, { "epoch": 0.10468463752944256, "grad_norm": 192.67474365234375, "learning_rate": 5.209424083769634e-07, "logits/chosen": -4.878710746765137, "logits/rejected": -4.849023342132568, "logps/chosen": -401.2875061035156, "logps/rejected": -354.4937438964844, "loss": 0.7203, "rewards/accuracies": 0.38749998807907104, "rewards/chosen": 0.3539062440395355, "rewards/margins": 0.18769530951976776, "rewards/rejected": 0.16621093451976776, "step": 200 }, { "epoch": 0.10991886940591468, "grad_norm": 189.45266723632812, "learning_rate": 5.471204188481675e-07, "logits/chosen": -4.937695503234863, "logits/rejected": -4.834570407867432, "logps/chosen": -368.56561279296875, "logps/rejected": -340.140625, "loss": 0.8076, "rewards/accuracies": 0.34062498807907104, "rewards/chosen": 0.294921875, "rewards/margins": -0.008984374813735485, "rewards/rejected": 0.30390626192092896, "step": 210 }, { "epoch": 0.11515310128238682, "grad_norm": 218.45777893066406, "learning_rate": 5.732984293193717e-07, 
"logits/chosen": -4.9306640625, "logits/rejected": -4.875586032867432, "logps/chosen": -406.35626220703125, "logps/rejected": -369.3687438964844, "loss": 0.7322, "rewards/accuracies": 0.4124999940395355, "rewards/chosen": 0.38984376192092896, "rewards/margins": 0.22089843451976776, "rewards/rejected": 0.1689453125, "step": 220 }, { "epoch": 0.12038733315885894, "grad_norm": 210.7808074951172, "learning_rate": 5.994764397905759e-07, "logits/chosen": -4.845703125, "logits/rejected": -4.899023532867432, "logps/chosen": -394.7093811035156, "logps/rejected": -351.515625, "loss": 0.703, "rewards/accuracies": 0.4468750059604645, "rewards/chosen": 0.4593749940395355, "rewards/margins": 0.25175780057907104, "rewards/rejected": 0.20761719346046448, "step": 230 }, { "epoch": 0.12562156503533106, "grad_norm": 171.84983825683594, "learning_rate": 6.256544502617801e-07, "logits/chosen": -4.934179782867432, "logits/rejected": -4.895117282867432, "logps/chosen": -384.29998779296875, "logps/rejected": -341.23748779296875, "loss": 0.697, "rewards/accuracies": 0.45625001192092896, "rewards/chosen": 0.48808592557907104, "rewards/margins": 0.2568359375, "rewards/rejected": 0.23125000298023224, "step": 240 }, { "epoch": 0.13085579691180318, "grad_norm": 152.40406799316406, "learning_rate": 6.518324607329843e-07, "logits/chosen": -4.951757907867432, "logits/rejected": -4.938867092132568, "logps/chosen": -412.26251220703125, "logps/rejected": -355.98748779296875, "loss": 0.6396, "rewards/accuracies": 0.47187501192092896, "rewards/chosen": 0.5455077886581421, "rewards/margins": 0.3726562559604645, "rewards/rejected": 0.1728515625, "step": 250 }, { "epoch": 0.1360900287882753, "grad_norm": 170.38427734375, "learning_rate": 6.780104712041884e-07, "logits/chosen": -4.8974609375, "logits/rejected": -4.955859184265137, "logps/chosen": -408.92498779296875, "logps/rejected": -360.7875061035156, "loss": 0.694, "rewards/accuracies": 0.47187501192092896, "rewards/chosen": 0.6585937738418579, 
"rewards/margins": 0.33183592557907104, "rewards/rejected": 0.3267578184604645, "step": 260 }, { "epoch": 0.14132426066474746, "grad_norm": 156.3699188232422, "learning_rate": 7.041884816753927e-07, "logits/chosen": -4.799218654632568, "logits/rejected": -4.916015625, "logps/chosen": -393.5375061035156, "logps/rejected": -354.3374938964844, "loss": 0.6516, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": 0.6753906011581421, "rewards/margins": 0.4056640565395355, "rewards/rejected": 0.26972657442092896, "step": 270 }, { "epoch": 0.14655849254121958, "grad_norm": 197.5117950439453, "learning_rate": 7.303664921465969e-07, "logits/chosen": -4.9560546875, "logits/rejected": -4.938867092132568, "logps/chosen": -391.15625, "logps/rejected": -360.38751220703125, "loss": 0.737, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": 0.689648449420929, "rewards/margins": 0.28300780057907104, "rewards/rejected": 0.4066406190395355, "step": 280 }, { "epoch": 0.1517927244176917, "grad_norm": 172.09805297851562, "learning_rate": 7.56544502617801e-07, "logits/chosen": -4.994140625, "logits/rejected": -4.981249809265137, "logps/chosen": -439.08123779296875, "logps/rejected": -375.16876220703125, "loss": 0.7421, "rewards/accuracies": 0.4906249940395355, "rewards/chosen": 0.700976550579071, "rewards/margins": 0.3330078125, "rewards/rejected": 0.36796873807907104, "step": 290 }, { "epoch": 0.15702695629416383, "grad_norm": 193.1634063720703, "learning_rate": 7.827225130890051e-07, "logits/chosen": -4.9052734375, "logits/rejected": -4.923632621765137, "logps/chosen": -368.65313720703125, "logps/rejected": -338.1312561035156, "loss": 0.6573, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": 0.576953113079071, "rewards/margins": 0.4130859375, "rewards/rejected": 0.16386719048023224, "step": 300 }, { "epoch": 0.16226118817063595, "grad_norm": 174.55418395996094, "learning_rate": 8.089005235602095e-07, "logits/chosen": -4.867383003234863, "logits/rejected": 
-4.823437690734863, "logps/chosen": -388.3812561035156, "logps/rejected": -328.98126220703125, "loss": 0.6988, "rewards/accuracies": 0.4593749940395355, "rewards/chosen": 0.5927734375, "rewards/margins": 0.31816405057907104, "rewards/rejected": 0.27460938692092896, "step": 310 }, { "epoch": 0.16749542004710807, "grad_norm": 229.78353881835938, "learning_rate": 8.350785340314136e-07, "logits/chosen": -4.961523532867432, "logits/rejected": -4.855859279632568, "logps/chosen": -385.63751220703125, "logps/rejected": -337.03436279296875, "loss": 0.6617, "rewards/accuracies": 0.5531250238418579, "rewards/chosen": 0.8177734613418579, "rewards/margins": 0.4732421934604645, "rewards/rejected": 0.34453123807907104, "step": 320 }, { "epoch": 0.17272965192358022, "grad_norm": 244.8257598876953, "learning_rate": 8.612565445026177e-07, "logits/chosen": -4.949804782867432, "logits/rejected": -4.877343654632568, "logps/chosen": -373.4937438964844, "logps/rejected": -326.4125061035156, "loss": 0.6691, "rewards/accuracies": 0.512499988079071, "rewards/chosen": 0.782031238079071, "rewards/margins": 0.44648438692092896, "rewards/rejected": 0.3355468809604645, "step": 330 }, { "epoch": 0.17796388380005235, "grad_norm": 199.0854034423828, "learning_rate": 8.874345549738219e-07, "logits/chosen": -4.966796875, "logits/rejected": -4.977734565734863, "logps/chosen": -397.3812561035156, "logps/rejected": -338.82501220703125, "loss": 0.6806, "rewards/accuracies": 0.5218750238418579, "rewards/chosen": 0.7076171636581421, "rewards/margins": 0.4193359315395355, "rewards/rejected": 0.28828126192092896, "step": 340 }, { "epoch": 0.18319811567652447, "grad_norm": 211.42018127441406, "learning_rate": 9.136125654450262e-07, "logits/chosen": -4.867968559265137, "logits/rejected": -4.908203125, "logps/chosen": -358.13751220703125, "logps/rejected": -345.5375061035156, "loss": 0.6386, "rewards/accuracies": 0.543749988079071, "rewards/chosen": 0.629101574420929, "rewards/margins": 0.523242175579071, 
"rewards/rejected": 0.10585937649011612, "step": 350 }, { "epoch": 0.1884323475529966, "grad_norm": 170.07656860351562, "learning_rate": 9.397905759162303e-07, "logits/chosen": -4.985937595367432, "logits/rejected": -4.934179782867432, "logps/chosen": -413.78125, "logps/rejected": -371.0375061035156, "loss": 0.6967, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": 0.505859375, "rewards/margins": 0.38593751192092896, "rewards/rejected": 0.11992187798023224, "step": 360 }, { "epoch": 0.19366657942946872, "grad_norm": 128.83041381835938, "learning_rate": 9.659685863874345e-07, "logits/chosen": -4.901562690734863, "logits/rejected": -4.8740234375, "logps/chosen": -361.8687438964844, "logps/rejected": -300.23748779296875, "loss": 0.5959, "rewards/accuracies": 0.565625011920929, "rewards/chosen": 0.556640625, "rewards/margins": 0.5914062261581421, "rewards/rejected": -0.03476562350988388, "step": 370 }, { "epoch": 0.19890081130594087, "grad_norm": 168.8352508544922, "learning_rate": 9.921465968586386e-07, "logits/chosen": -4.880078315734863, "logits/rejected": -4.859765529632568, "logps/chosen": -384.54998779296875, "logps/rejected": -364.17498779296875, "loss": 0.7032, "rewards/accuracies": 0.53125, "rewards/chosen": 0.7109375, "rewards/margins": 0.40644532442092896, "rewards/rejected": 0.30449217557907104, "step": 380 }, { "epoch": 0.204135043182413, "grad_norm": 180.12904357910156, "learning_rate": 9.999897712489534e-07, "logits/chosen": -5.007616996765137, "logits/rejected": -4.939062595367432, "logps/chosen": -396.9125061035156, "logps/rejected": -344.4750061035156, "loss": 0.6555, "rewards/accuracies": 0.5218750238418579, "rewards/chosen": 0.7738281488418579, "rewards/margins": 0.5560547113418579, "rewards/rejected": 0.2177734375, "step": 390 }, { "epoch": 0.2093692750588851, "grad_norm": 141.98025512695312, "learning_rate": 9.999396722513154e-07, "logits/chosen": -4.9541015625, "logits/rejected": -4.874218940734863, "logps/chosen": -367.75, 
"logps/rejected": -311.9624938964844, "loss": 0.6593, "rewards/accuracies": 0.543749988079071, "rewards/chosen": 0.7554687261581421, "rewards/margins": 0.5160156488418579, "rewards/rejected": 0.23945312201976776, "step": 400 }, { "epoch": 0.21460350693535724, "grad_norm": 167.3540496826172, "learning_rate": 9.99847828434916e-07, "logits/chosen": -4.950390815734863, "logits/rejected": -4.970117092132568, "logps/chosen": -372.09375, "logps/rejected": -355.6625061035156, "loss": 0.7244, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.8095703125, "rewards/margins": 0.4154296815395355, "rewards/rejected": 0.3941406309604645, "step": 410 }, { "epoch": 0.21983773881182936, "grad_norm": 164.46804809570312, "learning_rate": 9.99714247468688e-07, "logits/chosen": -4.939648628234863, "logits/rejected": -4.837079048156738, "logps/chosen": -393.45623779296875, "logps/rejected": -366.6187438964844, "loss": 0.6817, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.8345702886581421, "rewards/margins": 0.4537109434604645, "rewards/rejected": 0.380859375, "step": 420 }, { "epoch": 0.22507197068830148, "grad_norm": 217.72833251953125, "learning_rate": 9.995389405066031e-07, "logits/chosen": -4.853906154632568, "logits/rejected": -4.872460842132568, "logps/chosen": -395.4375, "logps/rejected": -350.4624938964844, "loss": 0.6919, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 0.7529296875, "rewards/margins": 0.4263671934604645, "rewards/rejected": 0.3265624940395355, "step": 430 }, { "epoch": 0.23030620256477363, "grad_norm": 228.83482360839844, "learning_rate": 9.993219221867424e-07, "logits/chosen": -4.986914157867432, "logits/rejected": -5.002148628234863, "logps/chosen": -383.95623779296875, "logps/rejected": -349.1875, "loss": 0.6646, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.8466796875, "rewards/margins": 0.486328125, "rewards/rejected": 0.3603515625, "step": 440 }, { "epoch": 0.23554043444124576, "grad_norm": 
166.70730590820312, "learning_rate": 9.990632106300731e-07, "logits/chosen": -4.781054496765137, "logits/rejected": -4.8046875, "logps/chosen": -378.8500061035156, "logps/rejected": -370.20001220703125, "loss": 0.6392, "rewards/accuracies": 0.5406249761581421, "rewards/chosen": 0.9384765625, "rewards/margins": 0.607714831829071, "rewards/rejected": 0.33076173067092896, "step": 450 }, { "epoch": 0.24077466631771788, "grad_norm": 192.34707641601562, "learning_rate": 9.98762827438936e-07, "logits/chosen": -4.938672065734863, "logits/rejected": -4.898828029632568, "logps/chosen": -402.6937561035156, "logps/rejected": -335.46875, "loss": 0.6623, "rewards/accuracies": 0.5406249761581421, "rewards/chosen": 1.0177733898162842, "rewards/margins": 0.52734375, "rewards/rejected": 0.49042969942092896, "step": 460 }, { "epoch": 0.24600889819419, "grad_norm": 201.65838623046875, "learning_rate": 9.98420797695241e-07, "logits/chosen": -4.892968654632568, "logits/rejected": -4.875195503234863, "logps/chosen": -410.29998779296875, "logps/rejected": -370.57501220703125, "loss": 0.6401, "rewards/accuracies": 0.596875011920929, "rewards/chosen": 1.177343726158142, "rewards/margins": 0.697265625, "rewards/rejected": 0.4800781309604645, "step": 470 }, { "epoch": 0.2512431300706621, "grad_norm": 225.55172729492188, "learning_rate": 9.980371499583729e-07, "logits/chosen": -4.888574123382568, "logits/rejected": -4.891015529632568, "logps/chosen": -370.0218811035156, "logps/rejected": -333.6656188964844, "loss": 0.6756, "rewards/accuracies": 0.5843750238418579, "rewards/chosen": 1.047265648841858, "rewards/margins": 0.6171875, "rewards/rejected": 0.4300781190395355, "step": 480 }, { "epoch": 0.2564773619471343, "grad_norm": 164.9784393310547, "learning_rate": 9.976119162628079e-07, "logits/chosen": -4.900195121765137, "logits/rejected": -4.890038967132568, "logps/chosen": -413.57501220703125, "logps/rejected": -345.61248779296875, "loss": 0.6205, "rewards/accuracies": 0.6031249761581421, 
"rewards/chosen": 1.1925780773162842, "rewards/margins": 0.687304675579071, "rewards/rejected": 0.5052734613418579, "step": 490 }, { "epoch": 0.26171159382360637, "grad_norm": 233.83506774902344, "learning_rate": 9.971451321154368e-07, "logits/chosen": -4.8896484375, "logits/rejected": -4.883203029632568, "logps/chosen": -397.6499938964844, "logps/rejected": -340.90625, "loss": 0.6385, "rewards/accuracies": 0.5531250238418579, "rewards/chosen": 1.1494140625, "rewards/margins": 0.702343761920929, "rewards/rejected": 0.44707030057907104, "step": 500 }, { "epoch": 0.2669458257000785, "grad_norm": 198.21002197265625, "learning_rate": 9.966368364926017e-07, "logits/chosen": -4.929491996765137, "logits/rejected": -4.838671684265137, "logps/chosen": -365.60626220703125, "logps/rejected": -322.2749938964844, "loss": 0.6276, "rewards/accuracies": 0.559374988079071, "rewards/chosen": 0.9931640625, "rewards/margins": 0.629101574420929, "rewards/rejected": 0.36406248807907104, "step": 510 }, { "epoch": 0.2721800575765506, "grad_norm": 205.15029907226562, "learning_rate": 9.960870718368407e-07, "logits/chosen": -4.963476657867432, "logits/rejected": -4.922656059265137, "logps/chosen": -406.7562561035156, "logps/rejected": -355.85626220703125, "loss": 0.6432, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.136328101158142, "rewards/margins": 0.7568359375, "rewards/rejected": 0.3794921934604645, "step": 520 }, { "epoch": 0.27741428945302277, "grad_norm": 237.28872680664062, "learning_rate": 9.954958840533446e-07, "logits/chosen": -4.900000095367432, "logits/rejected": -4.8251953125, "logps/chosen": -382.5, "logps/rejected": -351.04998779296875, "loss": 0.6802, "rewards/accuracies": 0.5625, "rewards/chosen": 0.8763672113418579, "rewards/margins": 0.5835937261581421, "rewards/rejected": 0.29277342557907104, "step": 530 }, { "epoch": 0.2826485213294949, "grad_norm": 211.749755859375, "learning_rate": 9.948633225061229e-07, "logits/chosen": -4.866991996765137, 
"logits/rejected": -4.947461128234863, "logps/chosen": -402.265625, "logps/rejected": -341.4437561035156, "loss": 0.5959, "rewards/accuracies": 0.6156250238418579, "rewards/chosen": 1.039453148841858, "rewards/margins": 0.8275390863418579, "rewards/rejected": 0.2119140625, "step": 540 }, { "epoch": 0.287882753205967, "grad_norm": 181.6391143798828, "learning_rate": 9.94189440013883e-07, "logits/chosen": -4.931250095367432, "logits/rejected": -4.865429878234863, "logps/chosen": -384.4312438964844, "logps/rejected": -322.8531188964844, "loss": 0.6945, "rewards/accuracies": 0.534375011920929, "rewards/chosen": 0.8609374761581421, "rewards/margins": 0.4603515565395355, "rewards/rejected": 0.40058594942092896, "step": 550 }, { "epoch": 0.29311698508243916, "grad_norm": 136.0985107421875, "learning_rate": 9.93474292845619e-07, "logits/chosen": -4.931640625, "logits/rejected": -4.855078220367432, "logps/chosen": -388.3125, "logps/rejected": -359.29376220703125, "loss": 0.6286, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 1.0783202648162842, "rewards/margins": 0.6087890863418579, "rewards/rejected": 0.46953123807907104, "step": 560 }, { "epoch": 0.29835121695891126, "grad_norm": 185.28721618652344, "learning_rate": 9.927179407159138e-07, "logits/chosen": -4.910937309265137, "logits/rejected": -4.874218940734863, "logps/chosen": -404.10626220703125, "logps/rejected": -343.1812438964844, "loss": 0.6732, "rewards/accuracies": 0.609375, "rewards/chosen": 1.1181640625, "rewards/margins": 0.647753894329071, "rewards/rejected": 0.47041016817092896, "step": 570 }, { "epoch": 0.3035854488353834, "grad_norm": 163.57351684570312, "learning_rate": 9.919204467799522e-07, "logits/chosen": -4.954687595367432, "logits/rejected": -4.9677734375, "logps/chosen": -381.3125, "logps/rejected": -341.6312561035156, "loss": 0.652, "rewards/accuracies": 0.606249988079071, "rewards/chosen": 1.240820288658142, "rewards/margins": 0.773632824420929, "rewards/rejected": 0.4671874940395355, 
"step": 580 }, { "epoch": 0.30881968071185556, "grad_norm": 266.8165588378906, "learning_rate": 9.910818776282485e-07, "logits/chosen": -4.872460842132568, "logits/rejected": -4.8974609375, "logps/chosen": -381.07501220703125, "logps/rejected": -333.73748779296875, "loss": 0.6804, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": 1.093359351158142, "rewards/margins": 0.6187499761581421, "rewards/rejected": 0.474609375, "step": 590 }, { "epoch": 0.31405391258832765, "grad_norm": 171.8970489501953, "learning_rate": 9.902023032810858e-07, "logits/chosen": -4.9111328125, "logits/rejected": -4.847265720367432, "logps/chosen": -345.38751220703125, "logps/rejected": -333.09375, "loss": 0.6499, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": 1.248046875, "rewards/margins": 0.5960937738418579, "rewards/rejected": 0.6519531011581421, "step": 600 }, { "epoch": 0.3192881444647998, "grad_norm": 144.56777954101562, "learning_rate": 9.892817971826687e-07, "logits/chosen": -4.935742378234863, "logits/rejected": -4.869336128234863, "logps/chosen": -373.67498779296875, "logps/rejected": -349.59375, "loss": 0.6506, "rewards/accuracies": 0.596875011920929, "rewards/chosen": 1.2042968273162842, "rewards/margins": 0.709765613079071, "rewards/rejected": 0.4945312440395355, "step": 610 }, { "epoch": 0.3245223763412719, "grad_norm": 166.99978637695312, "learning_rate": 9.883204361949916e-07, "logits/chosen": -4.963281154632568, "logits/rejected": -4.999804496765137, "logps/chosen": -424.2749938964844, "logps/rejected": -343.01251220703125, "loss": 0.675, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": 0.940136730670929, "rewards/margins": 0.613964855670929, "rewards/rejected": 0.326171875, "step": 620 }, { "epoch": 0.32975660821774405, "grad_norm": 170.0765380859375, "learning_rate": 9.873183005914202e-07, "logits/chosen": -4.957226753234863, "logits/rejected": -4.872656345367432, "logps/chosen": -370.36248779296875, "logps/rejected": -333.54376220703125, 
"loss": 0.6338, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": 0.9603515863418579, "rewards/margins": 0.626953125, "rewards/rejected": 0.3333984315395355, "step": 630 }, { "epoch": 0.33499084009421615, "grad_norm": 190.66220092773438, "learning_rate": 9.86275474049989e-07, "logits/chosen": -4.953320503234863, "logits/rejected": -4.85546875, "logps/chosen": -380.45001220703125, "logps/rejected": -321.96563720703125, "loss": 0.6809, "rewards/accuracies": 0.528124988079071, "rewards/chosen": 0.7210937738418579, "rewards/margins": 0.5755859613418579, "rewards/rejected": 0.1455078125, "step": 640 }, { "epoch": 0.3402250719706883, "grad_norm": 129.71112060546875, "learning_rate": 9.851920436464144e-07, "logits/chosen": -4.977148532867432, "logits/rejected": -4.921288967132568, "logps/chosen": -369.2875061035156, "logps/rejected": -339.9312438964844, "loss": 0.7151, "rewards/accuracies": 0.559374988079071, "rewards/chosen": 0.8255859613418579, "rewards/margins": 0.5738281011581421, "rewards/rejected": 0.25175780057907104, "step": 650 }, { "epoch": 0.34545930384716045, "grad_norm": 197.14109802246094, "learning_rate": 9.840680998468231e-07, "logits/chosen": -4.959179878234863, "logits/rejected": -4.951952934265137, "logps/chosen": -360.3343811035156, "logps/rejected": -340.51251220703125, "loss": 0.6309, "rewards/accuracies": 0.590624988079071, "rewards/chosen": 0.8583984375, "rewards/margins": 0.7035156488418579, "rewards/rejected": 0.15488281846046448, "step": 660 }, { "epoch": 0.35069353572363254, "grad_norm": 185.25253295898438, "learning_rate": 9.82903736500199e-07, "logits/chosen": -4.889062404632568, "logits/rejected": -4.888867378234863, "logps/chosen": -377.29998779296875, "logps/rejected": -347.38751220703125, "loss": 0.6674, "rewards/accuracies": 0.581250011920929, "rewards/chosen": 0.86328125, "rewards/margins": 0.5814453363418579, "rewards/rejected": 0.2818359434604645, "step": 670 }, { "epoch": 0.3559277676001047, "grad_norm": 136.94049072265625, 
"learning_rate": 9.81699050830546e-07, "logits/chosen": -4.853711128234863, "logits/rejected": -4.849804878234863, "logps/chosen": -366.6812438964844, "logps/rejected": -342.4937438964844, "loss": 0.6587, "rewards/accuracies": 0.559374988079071, "rewards/chosen": 0.813671886920929, "rewards/margins": 0.6734374761581421, "rewards/rejected": 0.14023438096046448, "step": 680 }, { "epoch": 0.3611619994765768, "grad_norm": 147.21014404296875, "learning_rate": 9.804541434287716e-07, "logits/chosen": -4.9560546875, "logits/rejected": -4.939843654632568, "logps/chosen": -403.11248779296875, "logps/rejected": -367.82501220703125, "loss": 0.6308, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": 1.1335937976837158, "rewards/margins": 0.7738281488418579, "rewards/rejected": 0.3597656190395355, "step": 690 }, { "epoch": 0.36639623135304894, "grad_norm": 182.96368408203125, "learning_rate": 9.791691182442852e-07, "logits/chosen": -4.942773342132568, "logits/rejected": -4.930468559265137, "logps/chosen": -388.9750061035156, "logps/rejected": -339.29376220703125, "loss": 0.6703, "rewards/accuracies": 0.534375011920929, "rewards/chosen": 1.1164062023162842, "rewards/margins": 0.637890636920929, "rewards/rejected": 0.478515625, "step": 700 }, { "epoch": 0.3716304632295211, "grad_norm": 228.25791931152344, "learning_rate": 9.7784408257632e-07, "logits/chosen": -4.923047065734863, "logits/rejected": -4.917187690734863, "logps/chosen": -401.70001220703125, "logps/rejected": -342.5562438964844, "loss": 0.6096, "rewards/accuracies": 0.6031249761581421, "rewards/chosen": 1.4675781726837158, "rewards/margins": 0.7367187738418579, "rewards/rejected": 0.7308593988418579, "step": 710 }, { "epoch": 0.3768646951059932, "grad_norm": 141.3728485107422, "learning_rate": 9.764791470649727e-07, "logits/chosen": -4.928124904632568, "logits/rejected": -4.915820121765137, "logps/chosen": -364.64373779296875, "logps/rejected": -331.79376220703125, "loss": 0.6055, "rewards/accuracies": 
0.6031249761581421, "rewards/chosen": 1.405859351158142, "rewards/margins": 0.771289050579071, "rewards/rejected": 0.634570300579071, "step": 720 }, { "epoch": 0.38209892698246534, "grad_norm": 154.76612854003906, "learning_rate": 9.750744256819658e-07, "logits/chosen": -4.958984375, "logits/rejected": -4.895703315734863, "logps/chosen": -351.35626220703125, "logps/rejected": -316.70001220703125, "loss": 0.6395, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 1.345117211341858, "rewards/margins": 0.7164062261581421, "rewards/rejected": 0.628710925579071, "step": 730 }, { "epoch": 0.38733315885893743, "grad_norm": 180.64218139648438, "learning_rate": 9.736300357211307e-07, "logits/chosen": -4.930078029632568, "logits/rejected": -4.916796684265137, "logps/chosen": -384.1312561035156, "logps/rejected": -324.73126220703125, "loss": 0.6116, "rewards/accuracies": 0.596875011920929, "rewards/chosen": 1.3830077648162842, "rewards/margins": 0.760546863079071, "rewards/rejected": 0.6224609613418579, "step": 740 }, { "epoch": 0.3925673907354096, "grad_norm": 210.36679077148438, "learning_rate": 9.721460977886135e-07, "logits/chosen": -4.892187595367432, "logits/rejected": -4.879101753234863, "logps/chosen": -379.78125, "logps/rejected": -328.375, "loss": 0.6149, "rewards/accuracies": 0.5718749761581421, "rewards/chosen": 1.2996094226837158, "rewards/margins": 0.759765625, "rewards/rejected": 0.539843738079071, "step": 750 }, { "epoch": 0.39780162261188173, "grad_norm": 172.40696716308594, "learning_rate": 9.706227357928043e-07, "logits/chosen": -4.931836128234863, "logits/rejected": -4.895117282867432, "logps/chosen": -372.4624938964844, "logps/rejected": -343.86248779296875, "loss": 0.6295, "rewards/accuracies": 0.596875011920929, "rewards/chosen": 1.308007836341858, "rewards/margins": 0.7974609136581421, "rewards/rejected": 0.510546863079071, "step": 760 }, { "epoch": 0.40303585448835383, "grad_norm": 193.9097442626953, "learning_rate": 9.690600769339914e-07, 
"logits/chosen": -4.938281059265137, "logits/rejected": -4.889843940734863, "logps/chosen": -374.609375, "logps/rejected": -339.3999938964844, "loss": 0.7294, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 1.3720703125, "rewards/margins": 0.5689452886581421, "rewards/rejected": 0.8031250238418579, "step": 770 }, { "epoch": 0.408270086364826, "grad_norm": 217.3723602294922, "learning_rate": 9.6745825169374e-07, "logits/chosen": -4.968359470367432, "logits/rejected": -4.871679782867432, "logps/chosen": -407.14373779296875, "logps/rejected": -350.8187561035156, "loss": 0.6663, "rewards/accuracies": 0.59375, "rewards/chosen": 1.4031250476837158, "rewards/margins": 0.7689453363418579, "rewards/rejected": 0.6341797113418579, "step": 780 }, { "epoch": 0.4135043182412981, "grad_norm": 152.49200439453125, "learning_rate": 9.658173938239966e-07, "logits/chosen": -4.896679878234863, "logits/rejected": -4.890038967132568, "logps/chosen": -368.0062561035156, "logps/rejected": -356.3374938964844, "loss": 0.65, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 1.2138671875, "rewards/margins": 0.7095702886581421, "rewards/rejected": 0.5042968988418579, "step": 790 }, { "epoch": 0.4187385501177702, "grad_norm": 261.2742004394531, "learning_rate": 9.64137640335921e-07, "logits/chosen": -4.945116996765137, "logits/rejected": -4.999804496765137, "logps/chosen": -391.8687438964844, "logps/rejected": -351.64373779296875, "loss": 0.7386, "rewards/accuracies": 0.5093749761581421, "rewards/chosen": 1.044531226158142, "rewards/margins": 0.4583984315395355, "rewards/rejected": 0.586132824420929, "step": 800 }, { "epoch": 0.4239727819942423, "grad_norm": 187.15277099609375, "learning_rate": 9.624191314884461e-07, "logits/chosen": -4.956250190734863, "logits/rejected": -4.9150390625, "logps/chosen": -369.2124938964844, "logps/rejected": -345.20001220703125, "loss": 0.6706, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": 1.0304687023162842, "rewards/margins": 
0.613476574420929, "rewards/rejected": 0.4169921875, "step": 810 }, { "epoch": 0.42920701387071447, "grad_norm": 157.43338012695312, "learning_rate": 9.606620107765662e-07, "logits/chosen": -4.924023628234863, "logits/rejected": -4.976366996765137, "logps/chosen": -363.2749938964844, "logps/rejected": -326.07501220703125, "loss": 0.5971, "rewards/accuracies": 0.59375, "rewards/chosen": 1.0558593273162842, "rewards/margins": 0.7662109136581421, "rewards/rejected": 0.2896484434604645, "step": 820 }, { "epoch": 0.4344412457471866, "grad_norm": 184.99835205078125, "learning_rate": 9.58866424919355e-07, "logits/chosen": -4.889062404632568, "logits/rejected": -4.844531059265137, "logps/chosen": -354.6875, "logps/rejected": -317.70623779296875, "loss": 0.7243, "rewards/accuracies": 0.534375011920929, "rewards/chosen": 0.8960937261581421, "rewards/margins": 0.48710936307907104, "rewards/rejected": 0.40898436307907104, "step": 830 }, { "epoch": 0.4396754776236587, "grad_norm": 162.27297973632812, "learning_rate": 9.570325238477148e-07, "logits/chosen": -5.000390529632568, "logits/rejected": -4.886523246765137, "logps/chosen": -399.2562561035156, "logps/rejected": -335.4312438964844, "loss": 0.6241, "rewards/accuracies": 0.625, "rewards/chosen": 1.19921875, "rewards/margins": 0.8248046636581421, "rewards/rejected": 0.3744140565395355, "step": 840 }, { "epoch": 0.44490970950013087, "grad_norm": 234.09226989746094, "learning_rate": 9.551604606918575e-07, "logits/chosen": -4.928906440734863, "logits/rejected": -4.858788967132568, "logps/chosen": -372.3374938964844, "logps/rejected": -318.2250061035156, "loss": 0.6925, "rewards/accuracies": 0.534375011920929, "rewards/chosen": 1.0636718273162842, "rewards/margins": 0.45332032442092896, "rewards/rejected": 0.6103515625, "step": 850 }, { "epoch": 0.45014394137660296, "grad_norm": 179.9008026123047, "learning_rate": 9.532503917685178e-07, "logits/chosen": -4.999609470367432, "logits/rejected": -4.960156440734863, "logps/chosen": 
-372.4125061035156, "logps/rejected": -361.7562561035156, "loss": 0.6536, "rewards/accuracies": 0.59375, "rewards/chosen": 1.26171875, "rewards/margins": 0.6871093511581421, "rewards/rejected": 0.5746093988418579, "step": 860 }, { "epoch": 0.4553781732530751, "grad_norm": 116.42308044433594, "learning_rate": 9.513024765679012e-07, "logits/chosen": -4.938672065734863, "logits/rejected": -4.885156154632568, "logps/chosen": -375.53125, "logps/rejected": -341.3187561035156, "loss": 0.6173, "rewards/accuracies": 0.5843750238418579, "rewards/chosen": 1.206640601158142, "rewards/margins": 0.7935546636581421, "rewards/rejected": 0.4130859375, "step": 870 }, { "epoch": 0.46061240512954726, "grad_norm": 181.52101135253906, "learning_rate": 9.493168777403662e-07, "logits/chosen": -4.920117378234863, "logits/rejected": -4.913281440734863, "logps/chosen": -379.01251220703125, "logps/rejected": -328.5625, "loss": 0.6217, "rewards/accuracies": 0.609375, "rewards/chosen": 1.252343773841858, "rewards/margins": 0.7464843988418579, "rewards/rejected": 0.505859375, "step": 880 }, { "epoch": 0.46584663700601936, "grad_norm": 264.9549255371094, "learning_rate": 9.472937610828436e-07, "logits/chosen": -4.936718940734863, "logits/rejected": -4.8896484375, "logps/chosen": -373.125, "logps/rejected": -346.01251220703125, "loss": 0.6213, "rewards/accuracies": 0.5843750238418579, "rewards/chosen": 1.2058594226837158, "rewards/margins": 0.7333984375, "rewards/rejected": 0.47246092557907104, "step": 890 }, { "epoch": 0.4710808688824915, "grad_norm": 191.10272216796875, "learning_rate": 9.452332955249919e-07, "logits/chosen": -4.933398246765137, "logits/rejected": -4.836718559265137, "logps/chosen": -382.2250061035156, "logps/rejected": -360.4624938964844, "loss": 0.6295, "rewards/accuracies": 0.596875011920929, "rewards/chosen": 1.0427734851837158, "rewards/margins": 0.709765613079071, "rewards/rejected": 0.3330078125, "step": 900 }, { "epoch": 0.4763151007589636, "grad_norm": 243.35546875, 
"learning_rate": 9.431356531150925e-07, "logits/chosen": -4.950976371765137, "logits/rejected": -4.892187595367432, "logps/chosen": -384.0375061035156, "logps/rejected": -351.39373779296875, "loss": 0.6983, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 1.1298828125, "rewards/margins": 0.615234375, "rewards/rejected": 0.5146484375, "step": 910 }, { "epoch": 0.48154933263543576, "grad_norm": 210.2702178955078, "learning_rate": 9.410010090056828e-07, "logits/chosen": -4.9072265625, "logits/rejected": -4.929296970367432, "logps/chosen": -387.55938720703125, "logps/rejected": -338.84686279296875, "loss": 0.6665, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": 1.264062523841858, "rewards/margins": 0.708984375, "rewards/rejected": 0.5550781488418579, "step": 920 }, { "epoch": 0.48678356451190785, "grad_norm": 159.64968872070312, "learning_rate": 9.388295414389318e-07, "logits/chosen": -4.980078220367432, "logits/rejected": -4.9892578125, "logps/chosen": -408.3062438964844, "logps/rejected": -365.08123779296875, "loss": 0.6093, "rewards/accuracies": 0.640625, "rewards/chosen": 1.5750000476837158, "rewards/margins": 0.9380859136581421, "rewards/rejected": 0.636914074420929, "step": 930 }, { "epoch": 0.49201779638838, "grad_norm": 113.91616821289062, "learning_rate": 9.366214317317562e-07, "logits/chosen": -4.94921875, "logits/rejected": -4.903515815734863, "logps/chosen": -391.0062561035156, "logps/rejected": -332.65625, "loss": 0.5856, "rewards/accuracies": 0.621874988079071, "rewards/chosen": 1.4443359375, "rewards/margins": 0.953125, "rewards/rejected": 0.4912109375, "step": 940 }, { "epoch": 0.49725202826485215, "grad_norm": 207.63845825195312, "learning_rate": 9.343768642606813e-07, "logits/chosen": -4.997265815734863, "logits/rejected": -4.937695503234863, "logps/chosen": -414.1937561035156, "logps/rejected": -361.9375, "loss": 0.663, "rewards/accuracies": 0.581250011920929, "rewards/chosen": 1.3605468273162842, "rewards/margins": 
0.777148425579071, "rewards/rejected": 0.5833984613418579, "step": 950 }, { "epoch": 0.5024862601413242, "grad_norm": 160.97222900390625, "learning_rate": 9.320960264464448e-07, "logits/chosen": -4.920312404632568, "logits/rejected": -4.906836032867432, "logps/chosen": -378.0, "logps/rejected": -332.75, "loss": 0.5982, "rewards/accuracies": 0.5843750238418579, "rewards/chosen": 1.3767578601837158, "rewards/margins": 0.823437511920929, "rewards/rejected": 0.5533202886581421, "step": 960 }, { "epoch": 0.5077204920177963, "grad_norm": 218.504150390625, "learning_rate": 9.29779108738348e-07, "logits/chosen": -4.912890434265137, "logits/rejected": -4.897851467132568, "logps/chosen": -385.4437561035156, "logps/rejected": -344.57501220703125, "loss": 0.6216, "rewards/accuracies": 0.6156250238418579, "rewards/chosen": 1.2353515625, "rewards/margins": 0.7802734375, "rewards/rejected": 0.455078125, "step": 970 }, { "epoch": 0.5129547238942685, "grad_norm": 130.3459014892578, "learning_rate": 9.274263045983528e-07, "logits/chosen": -5.009179592132568, "logits/rejected": -4.911913871765137, "logps/chosen": -408.01873779296875, "logps/rejected": -347.6625061035156, "loss": 0.5959, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 1.7990233898162842, "rewards/margins": 0.9683593511581421, "rewards/rejected": 0.8306640386581421, "step": 980 }, { "epoch": 0.5181889557707406, "grad_norm": 170.50540161132812, "learning_rate": 9.250378104849275e-07, "logits/chosen": -4.947070121765137, "logits/rejected": -4.922070503234863, "logps/chosen": -371.64373779296875, "logps/rejected": -335.2437438964844, "loss": 0.6833, "rewards/accuracies": 0.59375, "rewards/chosen": 1.717382788658142, "rewards/margins": 0.7242187261581421, "rewards/rejected": 0.9931640625, "step": 990 }, { "epoch": 0.5234231876472127, "grad_norm": 144.2186737060547, "learning_rate": 9.226138258366436e-07, "logits/chosen": -4.9814453125, "logits/rejected": -4.927929878234863, "logps/chosen": -385.5874938964844, 
"logps/rejected": -324.9750061035156, "loss": 0.598, "rewards/accuracies": 0.6343749761581421, "rewards/chosen": 1.8515625, "rewards/margins": 0.8843749761581421, "rewards/rejected": 0.9671875238418579, "step": 1000 }, { "epoch": 0.5234231876472127, "eval_logits/chosen": -4.991390705108643, "eval_logits/rejected": -4.945499897003174, "eval_logps/chosen": -391.65301513671875, "eval_logps/rejected": -346.6080017089844, "eval_loss": 0.6368173956871033, "eval_rewards/accuracies": 0.5879999995231628, "eval_rewards/chosen": 1.6621874570846558, "eval_rewards/margins": 0.8043749928474426, "eval_rewards/rejected": 0.8578125238418579, "eval_runtime": 246.2066, "eval_samples_per_second": 8.123, "eval_steps_per_second": 2.031, "step": 1000 }, { "epoch": 0.528657419523685, "grad_norm": 144.90234375, "learning_rate": 9.201545530555213e-07, "logits/chosen": -4.931250095367432, "logits/rejected": -4.950390815734863, "logps/chosen": -383.4937438964844, "logps/rejected": -345.4937438964844, "loss": 0.6257, "rewards/accuracies": 0.578125, "rewards/chosen": 1.6681640148162842, "rewards/margins": 0.840039074420929, "rewards/rejected": 0.828125, "step": 1010 }, { "epoch": 0.533891651400157, "grad_norm": 196.239990234375, "learning_rate": 9.176601974901304e-07, "logits/chosen": -4.970703125, "logits/rejected": -5.041796684265137, "logps/chosen": -406.09375, "logps/rejected": -341.73748779296875, "loss": 0.7137, "rewards/accuracies": 0.546875, "rewards/chosen": 1.6193358898162842, "rewards/margins": 0.655078113079071, "rewards/rejected": 0.9642578363418579, "step": 1020 }, { "epoch": 0.5391258832766291, "grad_norm": 161.8279571533203, "learning_rate": 9.151309674184427e-07, "logits/chosen": -4.842577934265137, "logits/rejected": -4.944531440734863, "logps/chosen": -373.3203125, "logps/rejected": -319.89373779296875, "loss": 0.6613, "rewards/accuracies": 0.609375, "rewards/chosen": 1.3367187976837158, "rewards/margins": 0.7310546636581421, "rewards/rejected": 0.605664074420929, "step": 
1030 }, { "epoch": 0.5443601151531012, "grad_norm": 132.836669921875, "learning_rate": 9.125670740304409e-07, "logits/chosen": -4.990820407867432, "logits/rejected": -4.958398342132568, "logps/chosen": -374.1343688964844, "logps/rejected": -348.85626220703125, "loss": 0.7005, "rewards/accuracies": 0.578125, "rewards/chosen": 1.2927734851837158, "rewards/margins": 0.7640625238418579, "rewards/rejected": 0.5287109613418579, "step": 1040 }, { "epoch": 0.5495943470295734, "grad_norm": 203.34718322753906, "learning_rate": 9.099687314104858e-07, "logits/chosen": -4.945898532867432, "logits/rejected": -4.908203125, "logps/chosen": -392.98748779296875, "logps/rejected": -360.5249938964844, "loss": 0.6067, "rewards/accuracies": 0.606249988079071, "rewards/chosen": 1.2414062023162842, "rewards/margins": 0.850781261920929, "rewards/rejected": 0.390625, "step": 1050 }, { "epoch": 0.5548285789060455, "grad_norm": 175.07394409179688, "learning_rate": 9.073361565194381e-07, "logits/chosen": -4.931836128234863, "logits/rejected": -4.895117282867432, "logps/chosen": -381.45623779296875, "logps/rejected": -324.46875, "loss": 0.6317, "rewards/accuracies": 0.59375, "rewards/chosen": 1.271093726158142, "rewards/margins": 0.803515613079071, "rewards/rejected": 0.46757811307907104, "step": 1060 }, { "epoch": 0.5600628107825176, "grad_norm": 177.40318298339844, "learning_rate": 9.046695691765435e-07, "logits/chosen": -5.020117282867432, "logits/rejected": -5.027929782867432, "logps/chosen": -403.92498779296875, "logps/rejected": -330.57501220703125, "loss": 0.6155, "rewards/accuracies": 0.640625, "rewards/chosen": 1.393164038658142, "rewards/margins": 0.9267578125, "rewards/rejected": 0.4664062559604645, "step": 1070 }, { "epoch": 0.5652970426589898, "grad_norm": 171.57205200195312, "learning_rate": 9.019691920410778e-07, "logits/chosen": -4.941601753234863, "logits/rejected": -4.912695407867432, "logps/chosen": -384.60626220703125, "logps/rejected": -301.3687438964844, "loss": 0.667, 
"rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.3781249523162842, "rewards/margins": 0.871874988079071, "rewards/rejected": 0.5062500238418579, "step": 1080 }, { "epoch": 0.5705312745354619, "grad_norm": 225.4108428955078, "learning_rate": 8.992352505937547e-07, "logits/chosen": -4.901757717132568, "logits/rejected": -4.865820407867432, "logps/chosen": -370.59375, "logps/rejected": -298.79376220703125, "loss": 0.643, "rewards/accuracies": 0.5843750238418579, "rewards/chosen": 1.251562476158142, "rewards/margins": 0.6986328363418579, "rewards/rejected": 0.552929699420929, "step": 1090 }, { "epoch": 0.575765506411934, "grad_norm": 159.1053466796875, "learning_rate": 8.964679731178984e-07, "logits/chosen": -4.915234565734863, "logits/rejected": -4.926171779632568, "logps/chosen": -368.17498779296875, "logps/rejected": -343.28125, "loss": 0.6611, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 1.602929711341858, "rewards/margins": 0.7421875, "rewards/rejected": 0.8607422113418579, "step": 1100 }, { "epoch": 0.5809997382884062, "grad_norm": 157.74752807617188, "learning_rate": 8.936675906803815e-07, "logits/chosen": -4.977929592132568, "logits/rejected": -4.948046684265137, "logps/chosen": -405.76251220703125, "logps/rejected": -361.60626220703125, "loss": 0.7074, "rewards/accuracies": 0.543749988079071, "rewards/chosen": 1.430273413658142, "rewards/margins": 0.662890613079071, "rewards/rejected": 0.767382800579071, "step": 1110 }, { "epoch": 0.5862339701648783, "grad_norm": 212.01077270507812, "learning_rate": 8.908343371123319e-07, "logits/chosen": -4.939843654632568, "logits/rejected": -4.8525390625, "logps/chosen": -439.5375061035156, "logps/rejected": -379.95623779296875, "loss": 0.6123, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": 1.4445312023162842, "rewards/margins": 0.96875, "rewards/rejected": 0.47578126192092896, "step": 1120 }, { "epoch": 0.5914682020413504, "grad_norm": 179.49246215820312, "learning_rate": 
8.879684489896071e-07, "logits/chosen": -4.925195217132568, "logits/rejected": -4.915429592132568, "logps/chosen": -395.23126220703125, "logps/rejected": -354.3500061035156, "loss": 0.6493, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 1.2939453125, "rewards/margins": 0.712695300579071, "rewards/rejected": 0.581250011920929, "step": 1130 }, { "epoch": 0.5967024339178225, "grad_norm": 131.85044860839844, "learning_rate": 8.850701656130407e-07, "logits/chosen": -4.907812595367432, "logits/rejected": -4.919335842132568, "logps/chosen": -412.89373779296875, "logps/rejected": -328.09375, "loss": 0.5895, "rewards/accuracies": 0.65625, "rewards/chosen": 1.2941405773162842, "rewards/margins": 0.790234386920929, "rewards/rejected": 0.50390625, "step": 1140 }, { "epoch": 0.6019366657942947, "grad_norm": 195.68203735351562, "learning_rate": 8.821397289884605e-07, "logits/chosen": -5.012890815734863, "logits/rejected": -4.979882717132568, "logps/chosen": -390.48748779296875, "logps/rejected": -330.01251220703125, "loss": 0.6785, "rewards/accuracies": 0.559374988079071, "rewards/chosen": 1.3611328601837158, "rewards/margins": 0.6683593988418579, "rewards/rejected": 0.6927734613418579, "step": 1150 }, { "epoch": 0.6071708976707668, "grad_norm": 169.86941528320312, "learning_rate": 8.791773838064811e-07, "logits/chosen": -4.842187404632568, "logits/rejected": -4.866406440734863, "logps/chosen": -346.10626220703125, "logps/rejected": -296.70623779296875, "loss": 0.6212, "rewards/accuracies": 0.5531250238418579, "rewards/chosen": 1.258398413658142, "rewards/margins": 0.7396484613418579, "rewards/rejected": 0.518750011920929, "step": 1160 }, { "epoch": 0.6124051295472389, "grad_norm": 165.14083862304688, "learning_rate": 8.76183377422073e-07, "logits/chosen": -4.868847846984863, "logits/rejected": -4.882031440734863, "logps/chosen": -380.46875, "logps/rejected": -348.6625061035156, "loss": 0.647, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 
1.4162108898162842, "rewards/margins": 0.8017578125, "rewards/rejected": 0.614453136920929, "step": 1170 }, { "epoch": 0.6176393614237111, "grad_norm": 130.41561889648438, "learning_rate": 8.731579598339079e-07, "logits/chosen": -4.954687595367432, "logits/rejected": -4.914453029632568, "logps/chosen": -391.2406311035156, "logps/rejected": -332.21875, "loss": 0.6067, "rewards/accuracies": 0.625, "rewards/chosen": 1.5021483898162842, "rewards/margins": 0.954882800579071, "rewards/rejected": 0.5472656488418579, "step": 1180 }, { "epoch": 0.6228735933001832, "grad_norm": 132.53321838378906, "learning_rate": 8.701013836634832e-07, "logits/chosen": -4.849413871765137, "logits/rejected": -4.857421875, "logps/chosen": -364.2093811035156, "logps/rejected": -324.4750061035156, "loss": 0.6168, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": 1.4675781726837158, "rewards/margins": 0.815625011920929, "rewards/rejected": 0.6519531011581421, "step": 1190 }, { "epoch": 0.6281078251766553, "grad_norm": 153.72689819335938, "learning_rate": 8.670139041340298e-07, "logits/chosen": -4.862988471984863, "logits/rejected": -4.842577934265137, "logps/chosen": -372.52032470703125, "logps/rejected": -338.484375, "loss": 0.5889, "rewards/accuracies": 0.609375, "rewards/chosen": 1.5966796875, "rewards/margins": 0.923632800579071, "rewards/rejected": 0.673046886920929, "step": 1200 }, { "epoch": 0.6333420570531274, "grad_norm": 162.49276733398438, "learning_rate": 8.638957790491998e-07, "logits/chosen": -4.878320217132568, "logits/rejected": -4.880663871765137, "logps/chosen": -355.22186279296875, "logps/rejected": -313.79998779296875, "loss": 0.5915, "rewards/accuracies": 0.628125011920929, "rewards/chosen": 1.660546898841858, "rewards/margins": 0.916015625, "rewards/rejected": 0.7445312738418579, "step": 1210 }, { "epoch": 0.6385762889295996, "grad_norm": 180.18032836914062, "learning_rate": 8.607472687715407e-07, "logits/chosen": -4.961328029632568, "logits/rejected": 
-4.867968559265137, "logps/chosen": -385.31561279296875, "logps/rejected": -321.89373779296875, "loss": 0.6499, "rewards/accuracies": 0.621874988079071, "rewards/chosen": 1.6570312976837158, "rewards/margins": 0.8404296636581421, "rewards/rejected": 0.816601574420929, "step": 1220 }, { "epoch": 0.6438105208060717, "grad_norm": 179.06814575195312, "learning_rate": 8.575686362007543e-07, "logits/chosen": -4.886523246765137, "logits/rejected": -4.902929782867432, "logps/chosen": -416.9375, "logps/rejected": -334.98748779296875, "loss": 0.5994, "rewards/accuracies": 0.628125011920929, "rewards/chosen": 1.916601538658142, "rewards/margins": 1.1232421398162842, "rewards/rejected": 0.7933593988418579, "step": 1230 }, { "epoch": 0.6490447526825438, "grad_norm": 153.20079040527344, "learning_rate": 8.543601467517459e-07, "logits/chosen": -4.845312595367432, "logits/rejected": -4.864062309265137, "logps/chosen": -363.7906188964844, "logps/rejected": -331.6875, "loss": 0.664, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 1.6882812976837158, "rewards/margins": 0.8193359375, "rewards/rejected": 0.868945300579071, "step": 1240 }, { "epoch": 0.654278984559016, "grad_norm": 214.4299774169922, "learning_rate": 8.511220683324607e-07, "logits/chosen": -4.970703125, "logits/rejected": -4.931640625, "logps/chosen": -370.92498779296875, "logps/rejected": -337.875, "loss": 0.6863, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 1.5880858898162842, "rewards/margins": 0.713671863079071, "rewards/rejected": 0.8744140863418579, "step": 1250 }, { "epoch": 0.6595132164354881, "grad_norm": 143.33926391601562, "learning_rate": 8.478546713215151e-07, "logits/chosen": -4.959374904632568, "logits/rejected": -4.909570217132568, "logps/chosen": -406.6625061035156, "logps/rejected": -335.3374938964844, "loss": 0.6251, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": 1.804296851158142, "rewards/margins": 0.9888671636581421, "rewards/rejected": 0.8154296875, "step": 
1260 }, { "epoch": 0.6647474483119602, "grad_norm": 167.4635467529297, "learning_rate": 8.445582285456195e-07, "logits/chosen": -4.932031154632568, "logits/rejected": -4.851758003234863, "logps/chosen": -355.79376220703125, "logps/rejected": -336.7749938964844, "loss": 0.5958, "rewards/accuracies": 0.621874988079071, "rewards/chosen": 1.63427734375, "rewards/margins": 0.929003894329071, "rewards/rejected": 0.705273449420929, "step": 1270 }, { "epoch": 0.6699816801884323, "grad_norm": 184.37896728515625, "learning_rate": 8.412330152567964e-07, "logits/chosen": -4.932031154632568, "logits/rejected": -4.939453125, "logps/chosen": -396.8374938964844, "logps/rejected": -336.10626220703125, "loss": 0.6535, "rewards/accuracies": 0.6031249761581421, "rewards/chosen": 1.560156226158142, "rewards/margins": 0.858593761920929, "rewards/rejected": 0.7015625238418579, "step": 1280 }, { "epoch": 0.6752159120649045, "grad_norm": 214.57464599609375, "learning_rate": 8.378793091093989e-07, "logits/chosen": -4.939062595367432, "logits/rejected": -4.944921970367432, "logps/chosen": -373.7406311035156, "logps/rejected": -350.07501220703125, "loss": 0.6391, "rewards/accuracies": 0.59375, "rewards/chosen": 1.6337890625, "rewards/margins": 0.984179675579071, "rewards/rejected": 0.649609386920929, "step": 1290 }, { "epoch": 0.6804501439413766, "grad_norm": 212.1385040283203, "learning_rate": 8.344973901369252e-07, "logits/chosen": -4.877148628234863, "logits/rejected": -4.854101657867432, "logps/chosen": -398.7562561035156, "logps/rejected": -333.6625061035156, "loss": 0.6681, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 1.3857421875, "rewards/margins": 0.6949218511581421, "rewards/rejected": 0.6908203363418579, "step": 1300 }, { "epoch": 0.6856843758178487, "grad_norm": 166.56057739257812, "learning_rate": 8.310875407286363e-07, "logits/chosen": -4.944140434265137, "logits/rejected": -4.923047065734863, "logps/chosen": -399.48748779296875, "logps/rejected": 
-360.1187438964844, "loss": 0.5787, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 1.438085913658142, "rewards/margins": 0.96875, "rewards/rejected": 0.4693359434604645, "step": 1310 }, { "epoch": 0.6909186076943209, "grad_norm": 173.50343322753906, "learning_rate": 8.276500456059762e-07, "logits/chosen": -4.961133003234863, "logits/rejected": -4.940625190734863, "logps/chosen": -366.0625, "logps/rejected": -328.7875061035156, "loss": 0.6501, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.4646484851837158, "rewards/margins": 0.8236328363418579, "rewards/rejected": 0.6410156488418579, "step": 1320 }, { "epoch": 0.696152839570793, "grad_norm": 212.61402893066406, "learning_rate": 8.241851917987987e-07, "logits/chosen": -4.988085746765137, "logits/rejected": -4.942773342132568, "logps/chosen": -417.98126220703125, "logps/rejected": -362.6499938964844, "loss": 0.633, "rewards/accuracies": 0.596875011920929, "rewards/chosen": 1.5908203125, "rewards/margins": 0.870312511920929, "rewards/rejected": 0.720507800579071, "step": 1330 }, { "epoch": 0.7013870714472651, "grad_norm": 191.54884338378906, "learning_rate": 8.206932686213996e-07, "logits/chosen": -4.96875, "logits/rejected": -4.950781345367432, "logps/chosen": -400.85626220703125, "logps/rejected": -338.57501220703125, "loss": 0.6086, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 1.5476562976837158, "rewards/margins": 0.960742175579071, "rewards/rejected": 0.5869140625, "step": 1340 }, { "epoch": 0.7066213033237373, "grad_norm": 172.55946350097656, "learning_rate": 8.171745676483592e-07, "logits/chosen": -4.954297065734863, "logits/rejected": -4.958788871765137, "logps/chosen": -383.82501220703125, "logps/rejected": -350.41876220703125, "loss": 0.6771, "rewards/accuracies": 0.590624988079071, "rewards/chosen": 1.4638671875, "rewards/margins": 0.7953125238418579, "rewards/rejected": 0.6685546636581421, "step": 1350 }, { "epoch": 0.7118555352002094, "grad_norm": 
143.5701904296875, "learning_rate": 8.13629382690196e-07, "logits/chosen": -4.911718845367432, "logits/rejected": -4.889062404632568, "logps/chosen": -412.7250061035156, "logps/rejected": -362.0843811035156, "loss": 0.7298, "rewards/accuracies": 0.5406249761581421, "rewards/chosen": 1.3662109375, "rewards/margins": 0.824511706829071, "rewards/rejected": 0.541796863079071, "step": 1360 }, { "epoch": 0.7170897670766815, "grad_norm": 178.42764282226562, "learning_rate": 8.100580097688341e-07, "logits/chosen": -4.877343654632568, "logits/rejected": null, "logps/chosen": -379.86248779296875, "logps/rejected": -325.48126220703125, "loss": 0.6003, "rewards/accuracies": 0.640625, "rewards/chosen": 1.516992211341858, "rewards/margins": 0.971484363079071, "rewards/rejected": 0.5455077886581421, "step": 1370 }, { "epoch": 0.7223239989531536, "grad_norm": 158.03274536132812, "learning_rate": 8.064607470928844e-07, "logits/chosen": -4.902539253234863, "logits/rejected": -4.937109470367432, "logps/chosen": -369.3812561035156, "logps/rejected": -333.85626220703125, "loss": 0.614, "rewards/accuracies": 0.6343749761581421, "rewards/chosen": 1.4841797351837158, "rewards/margins": 0.9359375238418579, "rewards/rejected": 0.5482422113418579, "step": 1380 }, { "epoch": 0.7275582308296258, "grad_norm": 202.49014282226562, "learning_rate": 8.028378950327452e-07, "logits/chosen": -4.907617092132568, "logits/rejected": -4.905468940734863, "logps/chosen": -417.6312561035156, "logps/rejected": -358.76873779296875, "loss": 0.5693, "rewards/accuracies": 0.6031249761581421, "rewards/chosen": 1.5138671398162842, "rewards/margins": 0.986328125, "rewards/rejected": 0.527539074420929, "step": 1390 }, { "epoch": 0.7327924627060979, "grad_norm": 211.05780029296875, "learning_rate": 7.99189756095521e-07, "logits/chosen": -4.934374809265137, "logits/rejected": -4.918164253234863, "logps/chosen": -385.5, "logps/rejected": -336.1499938964844, "loss": 0.7152, "rewards/accuracies": 0.6000000238418579, 
"rewards/chosen": 1.411523461341858, "rewards/margins": 0.7962890863418579, "rewards/rejected": 0.615234375, "step": 1400 }, { "epoch": 0.73802669458257, "grad_norm": 166.3219451904297, "learning_rate": 7.955166348997632e-07, "logits/chosen": -4.90576171875, "logits/rejected": -4.845312595367432, "logps/chosen": -358.84375, "logps/rejected": -343.9624938964844, "loss": 0.6367, "rewards/accuracies": 0.6031249761581421, "rewards/chosen": 1.358984351158142, "rewards/margins": 0.8291015625, "rewards/rejected": 0.5298827886581421, "step": 1410 }, { "epoch": 0.7432609264590422, "grad_norm": 220.4535675048828, "learning_rate": 7.918188381500343e-07, "logits/chosen": -4.828711032867432, "logits/rejected": -4.87890625, "logps/chosen": -371.125, "logps/rejected": -322.32501220703125, "loss": 0.7043, "rewards/accuracies": 0.5843750238418579, "rewards/chosen": 1.428125023841858, "rewards/margins": 0.7484375238418579, "rewards/rejected": 0.6796875, "step": 1420 }, { "epoch": 0.7484951583355143, "grad_norm": 133.87107849121094, "learning_rate": 7.880966746112995e-07, "logits/chosen": -4.9150390625, "logits/rejected": -4.8720703125, "logps/chosen": -410.04998779296875, "logps/rejected": -361.6781311035156, "loss": 0.6296, "rewards/accuracies": 0.606249988079071, "rewards/chosen": 1.658203125, "rewards/margins": 0.9189453125, "rewards/rejected": 0.7392578125, "step": 1430 }, { "epoch": 0.7537293902119864, "grad_norm": 179.61343383789062, "learning_rate": 7.843504550831423e-07, "logits/chosen": -4.96875, "logits/rejected": -4.903124809265137, "logps/chosen": -385.23126220703125, "logps/rejected": -350.29998779296875, "loss": 0.6381, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 1.608789086341858, "rewards/margins": 0.8431640863418579, "rewards/rejected": 0.765625, "step": 1440 }, { "epoch": 0.7589636220884585, "grad_norm": 180.80857849121094, "learning_rate": 7.805804923738157e-07, "logits/chosen": -4.959374904632568, "logits/rejected": -4.87890625, "logps/chosen": 
-375.375, "logps/rejected": -336.65625, "loss": 0.6703, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.2628905773162842, "rewards/margins": 0.8023437261581421, "rewards/rejected": 0.4605468809604645, "step": 1450 }, { "epoch": 0.7641978539649307, "grad_norm": 196.63768005371094, "learning_rate": 7.76787101274121e-07, "logits/chosen": -4.9287109375, "logits/rejected": -4.883984565734863, "logps/chosen": -372.48126220703125, "logps/rejected": -336.6000061035156, "loss": 0.67, "rewards/accuracies": 0.606249988079071, "rewards/chosen": 1.5146484375, "rewards/margins": 0.8949218988418579, "rewards/rejected": 0.6197265386581421, "step": 1460 }, { "epoch": 0.7694320858414028, "grad_norm": 167.95272827148438, "learning_rate": 7.729705985311232e-07, "logits/chosen": -4.915820121765137, "logits/rejected": -4.877734184265137, "logps/chosen": -382.546875, "logps/rejected": -325.9624938964844, "loss": 0.6208, "rewards/accuracies": 0.628125011920929, "rewards/chosen": 1.3701171875, "rewards/margins": 0.9404296875, "rewards/rejected": 0.4296875, "step": 1470 }, { "epoch": 0.7746663177178749, "grad_norm": 138.77662658691406, "learning_rate": 7.69131302821703e-07, "logits/chosen": -4.902734279632568, "logits/rejected": -4.931640625, "logps/chosen": -376.8125, "logps/rejected": -322.0562438964844, "loss": 0.6716, "rewards/accuracies": 0.590624988079071, "rewards/chosen": 1.383398413658142, "rewards/margins": 0.810351550579071, "rewards/rejected": 0.573046863079071, "step": 1480 }, { "epoch": 0.7799005495943471, "grad_norm": 154.5187530517578, "learning_rate": 7.652695347259475e-07, "logits/chosen": -4.971093654632568, "logits/rejected": -4.904296875, "logps/chosen": -395.67498779296875, "logps/rejected": -362.1875, "loss": 0.6566, "rewards/accuracies": 0.5718749761581421, "rewards/chosen": 1.2384765148162842, "rewards/margins": 0.980273425579071, "rewards/rejected": 0.2582031190395355, "step": 1490 }, { "epoch": 0.7851347814708192, "grad_norm": 162.64390563964844, 
"learning_rate": 7.613856167003811e-07, "logits/chosen": -4.978125095367432, "logits/rejected": -4.976366996765137, "logps/chosen": -351.88751220703125, "logps/rejected": -321.60626220703125, "loss": 0.5678, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 1.232421875, "rewards/margins": 1.012304663658142, "rewards/rejected": 0.22011718153953552, "step": 1500 }, { "epoch": 0.7903690133472913, "grad_norm": 136.67044067382812, "learning_rate": 7.574798730510415e-07, "logits/chosen": -4.864062309265137, "logits/rejected": -4.8857421875, "logps/chosen": -352.73126220703125, "logps/rejected": -328.5249938964844, "loss": 0.6382, "rewards/accuracies": 0.581250011920929, "rewards/chosen": 1.1335937976837158, "rewards/margins": 0.8216797113418579, "rewards/rejected": 0.3119140565395355, "step": 1510 }, { "epoch": 0.7956032452237635, "grad_norm": 120.07150268554688, "learning_rate": 7.53552629906399e-07, "logits/chosen": -4.921093940734863, "logits/rejected": -4.826952934265137, "logps/chosen": -374.2437438964844, "logps/rejected": -318.73748779296875, "loss": 0.6295, "rewards/accuracies": 0.596875011920929, "rewards/chosen": 1.1589844226837158, "rewards/margins": 0.9205077886581421, "rewards/rejected": 0.23847655951976776, "step": 1520 }, { "epoch": 0.8008374771002356, "grad_norm": 199.56382751464844, "learning_rate": 7.496042151901265e-07, "logits/chosen": -4.826464653015137, "logits/rejected": -4.866406440734863, "logps/chosen": -354.5562438964844, "logps/rejected": -372.2437438964844, "loss": 0.6421, "rewards/accuracies": 0.6156250238418579, "rewards/chosen": 1.2468750476837158, "rewards/margins": 0.923828125, "rewards/rejected": 0.32304686307907104, "step": 1530 }, { "epoch": 0.8060717089767077, "grad_norm": 138.1069793701172, "learning_rate": 7.456349585937164e-07, "logits/chosen": -5.021679878234863, "logits/rejected": -5.015429496765137, "logps/chosen": -389.125, "logps/rejected": -352.8500061035156, "loss": 0.6521, "rewards/accuracies": 0.6031249761581421, 
"rewards/chosen": 1.2244141101837158, "rewards/margins": 0.8359375, "rewards/rejected": 0.38847655057907104, "step": 1540 }, { "epoch": 0.8113059408531798, "grad_norm": 147.0940704345703, "learning_rate": 7.41645191548952e-07, "logits/chosen": -4.946484565734863, "logits/rejected": -4.922070503234863, "logps/chosen": -399.3062438964844, "logps/rejected": -357.29376220703125, "loss": 0.5846, "rewards/accuracies": 0.6031249761581421, "rewards/chosen": 1.4021484851837158, "rewards/margins": 0.9046875238418579, "rewards/rejected": 0.4974609315395355, "step": 1550 }, { "epoch": 0.816540172729652, "grad_norm": 215.054931640625, "learning_rate": 7.376352472002336e-07, "logits/chosen": -4.926953315734863, "logits/rejected": -4.902539253234863, "logps/chosen": -418.234375, "logps/rejected": -363.98748779296875, "loss": 0.6698, "rewards/accuracies": 0.5718749761581421, "rewards/chosen": 1.3435547351837158, "rewards/margins": 0.82421875, "rewards/rejected": 0.519335925579071, "step": 1560 }, { "epoch": 0.821774404606124, "grad_norm": 125.07649993896484, "learning_rate": 7.336054603767603e-07, "logits/chosen": -4.989648342132568, "logits/rejected": -4.915625095367432, "logps/chosen": -369.0625, "logps/rejected": -336.0062561035156, "loss": 0.665, "rewards/accuracies": 0.559374988079071, "rewards/chosen": 1.2111327648162842, "rewards/margins": 0.876953125, "rewards/rejected": 0.33417969942092896, "step": 1570 }, { "epoch": 0.8270086364825961, "grad_norm": 169.52810668945312, "learning_rate": 7.295561675645719e-07, "logits/chosen": -4.963671684265137, "logits/rejected": -4.933203220367432, "logps/chosen": -392.5375061035156, "logps/rejected": -365.4624938964844, "loss": 0.7313, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 1.33984375, "rewards/margins": 0.7457031011581421, "rewards/rejected": 0.5941406488418579, "step": 1580 }, { "epoch": 0.8322428683590684, "grad_norm": 198.43309020996094, "learning_rate": 7.254877068784535e-07, "logits/chosen": -4.94140625, 
"logits/rejected": -4.841992378234863, "logps/chosen": -350.5249938964844, "logps/rejected": -335.8812561035156, "loss": 0.606, "rewards/accuracies": 0.609375, "rewards/chosen": 1.3818359375, "rewards/margins": 0.8568359613418579, "rewards/rejected": 0.5249999761581421, "step": 1590 }, { "epoch": 0.8374771002355405, "grad_norm": 199.1750030517578, "learning_rate": 7.214004180337011e-07, "logits/chosen": -4.902929782867432, "logits/rejected": -4.961133003234863, "logps/chosen": -385.1812438964844, "logps/rejected": -364.9624938964844, "loss": 0.5664, "rewards/accuracies": 0.6031249761581421, "rewards/chosen": 1.5080077648162842, "rewards/margins": 0.9750000238418579, "rewards/rejected": 0.533007800579071, "step": 1600 }, { "epoch": 0.8427113321120125, "grad_norm": 184.01495361328125, "learning_rate": 7.172946423177573e-07, "logits/chosen": -4.888281345367432, "logits/rejected": -4.905468940734863, "logps/chosen": -422.60626220703125, "logps/rejected": -354.03125, "loss": 0.5972, "rewards/accuracies": 0.640625, "rewards/chosen": 1.7673828601837158, "rewards/margins": 1.047265648841858, "rewards/rejected": 0.7201172113418579, "step": 1610 }, { "epoch": 0.8479455639884846, "grad_norm": 125.60320281982422, "learning_rate": 7.131707225617124e-07, "logits/chosen": -4.973437309265137, "logits/rejected": -4.959374904632568, "logps/chosen": -377.0, "logps/rejected": -341.5874938964844, "loss": 0.6654, "rewards/accuracies": 0.5625, "rewards/chosen": 1.550195336341858, "rewards/margins": 0.8360351324081421, "rewards/rejected": 0.714160144329071, "step": 1620 }, { "epoch": 0.8531797958649568, "grad_norm": 173.63601684570312, "learning_rate": 7.090290031116797e-07, "logits/chosen": -4.939648628234863, "logits/rejected": -4.907617092132568, "logps/chosen": -403.4156188964844, "logps/rejected": -358.82501220703125, "loss": 0.5993, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.349023461341858, "rewards/margins": 0.9779297113418579, "rewards/rejected": 0.37109375, 
"step": 1630 }, { "epoch": 0.8584140277414289, "grad_norm": 163.71136474609375, "learning_rate": 7.048698298000411e-07, "logits/chosen": -4.870312690734863, "logits/rejected": -4.928320407867432, "logps/chosen": -404.70623779296875, "logps/rejected": -375.48126220703125, "loss": 0.6343, "rewards/accuracies": 0.581250011920929, "rewards/chosen": 1.4181640148162842, "rewards/margins": 0.845507800579071, "rewards/rejected": 0.5726562738418579, "step": 1640 }, { "epoch": 0.863648259617901, "grad_norm": 222.723876953125, "learning_rate": 7.006935499165714e-07, "logits/chosen": -4.963476657867432, "logits/rejected": -4.891406059265137, "logps/chosen": -401.8500061035156, "logps/rejected": -383.86248779296875, "loss": 0.6256, "rewards/accuracies": 0.625, "rewards/chosen": 1.230078101158142, "rewards/margins": 0.8802734613418579, "rewards/rejected": 0.34980469942092896, "step": 1650 }, { "epoch": 0.8688824914943732, "grad_norm": 118.47099304199219, "learning_rate": 6.965005121794388e-07, "logits/chosen": -4.827538967132568, "logits/rejected": -4.814745903015137, "logps/chosen": -356.0062561035156, "logps/rejected": -299.87188720703125, "loss": 0.5816, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 1.1697266101837158, "rewards/margins": 0.912304699420929, "rewards/rejected": 0.2574218809604645, "step": 1660 }, { "epoch": 0.8741167233708453, "grad_norm": 192.42030334472656, "learning_rate": 6.922910667060881e-07, "logits/chosen": -5.006249904632568, "logits/rejected": -4.976171970367432, "logps/chosen": -404.4312438964844, "logps/rejected": -368.13751220703125, "loss": 0.6376, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 1.1650390625, "rewards/margins": 0.9501953125, "rewards/rejected": 0.21484375, "step": 1670 }, { "epoch": 0.8793509552473174, "grad_norm": 160.41163635253906, "learning_rate": 6.880655649840044e-07, "logits/chosen": -4.873632907867432, "logits/rejected": -4.816601753234863, "logps/chosen": -402.09375, "logps/rejected": 
-373.3687438964844, "loss": 0.6158, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 1.276953101158142, "rewards/margins": 0.9955078363418579, "rewards/rejected": 0.28144532442092896, "step": 1680 }, { "epoch": 0.8845851871237895, "grad_norm": 168.89537048339844, "learning_rate": 6.838243598413657e-07, "logits/chosen": -4.887499809265137, "logits/rejected": -4.804101467132568, "logps/chosen": -375.28125, "logps/rejected": -331.28125, "loss": 0.5757, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 1.162500023841858, "rewards/margins": 0.9521484375, "rewards/rejected": 0.21035155653953552, "step": 1690 }, { "epoch": 0.8898194190002617, "grad_norm": 159.43746948242188, "learning_rate": 6.795678054175811e-07, "logits/chosen": -4.938086032867432, "logits/rejected": NaN, "logps/chosen": -434.98748779296875, "logps/rejected": -336.28125, "loss": 0.6174, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 1.70703125, "rewards/margins": 1.0734374523162842, "rewards/rejected": 0.633593738079071, "step": 1700 }, { "epoch": 0.8950536508767338, "grad_norm": 162.35177612304688, "learning_rate": 6.752962571337198e-07, "logits/chosen": -4.886523246765137, "logits/rejected": -4.9345703125, "logps/chosen": -431.8374938964844, "logps/rejected": -354.60626220703125, "loss": 0.6132, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 1.6115233898162842, "rewards/margins": 1.1064453125, "rewards/rejected": 0.505078136920929, "step": 1710 }, { "epoch": 0.9002878827532059, "grad_norm": 142.42416381835938, "learning_rate": 6.710100716628344e-07, "logits/chosen": -4.923828125, "logits/rejected": -4.818749904632568, "logps/chosen": -374.79376220703125, "logps/rejected": -329.53125, "loss": 0.6071, "rewards/accuracies": 0.6156250238418579, "rewards/chosen": 1.328125, "rewards/margins": 0.864062488079071, "rewards/rejected": 0.46406251192092896, "step": 1720 }, { "epoch": 0.9055221146296781, "grad_norm": 166.4989776611328, "learning_rate": 
6.66709606900178e-07, "logits/chosen": -4.9091796875, "logits/rejected": -4.919140815734863, "logps/chosen": -381.046875, "logps/rejected": -339.5687561035156, "loss": 0.6385, "rewards/accuracies": 0.581250011920929, "rewards/chosen": 1.3269531726837158, "rewards/margins": 0.899609386920929, "rewards/rejected": 0.4273437559604645, "step": 1730 }, { "epoch": 0.9107563465061502, "grad_norm": 234.83023071289062, "learning_rate": 6.62395221933321e-07, "logits/chosen": -4.978906154632568, "logits/rejected": -4.918359279632568, "logps/chosen": -393.22186279296875, "logps/rejected": -337.9937438964844, "loss": 0.6745, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 1.357031226158142, "rewards/margins": 0.791796863079071, "rewards/rejected": 0.565234363079071, "step": 1740 }, { "epoch": 0.9159905783826223, "grad_norm": 177.23922729492188, "learning_rate": 6.580672770121663e-07, "logits/chosen": -4.906933784484863, "logits/rejected": -4.962890625, "logps/chosen": -403.9750061035156, "logps/rejected": -333.84375, "loss": 0.6506, "rewards/accuracies": 0.621874988079071, "rewards/chosen": 1.5724608898162842, "rewards/margins": 0.9105468988418579, "rewards/rejected": 0.661914050579071, "step": 1750 }, { "epoch": 0.9212248102590945, "grad_norm": 216.88160705566406, "learning_rate": 6.537261335188695e-07, "logits/chosen": -4.89111328125, "logits/rejected": -4.900781154632568, "logps/chosen": -386.15625, "logps/rejected": -357.5874938964844, "loss": 0.6707, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": 1.619140625, "rewards/margins": 0.9765625, "rewards/rejected": 0.642578125, "step": 1760 }, { "epoch": 0.9264590421355666, "grad_norm": 181.79258728027344, "learning_rate": 6.493721539376629e-07, "logits/chosen": NaN, "logits/rejected": -4.872460842132568, "logps/chosen": -389.28125, "logps/rejected": -361.04376220703125, "loss": 0.6145, "rewards/accuracies": 0.590624988079071, "rewards/chosen": 1.3533203601837158, "rewards/margins": 0.860546886920929, 
"rewards/rejected": 0.4927734434604645, "step": 1770 }, { "epoch": 0.9316932740120387, "grad_norm": 208.49319458007812, "learning_rate": 6.450057018245887e-07, "logits/chosen": -4.882031440734863, "logits/rejected": -4.924609184265137, "logps/chosen": -373.7749938964844, "logps/rejected": -341.23126220703125, "loss": 0.7103, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 1.3869140148162842, "rewards/margins": 0.8130859136581421, "rewards/rejected": 0.5738281011581421, "step": 1780 }, { "epoch": 0.9369275058885108, "grad_norm": 134.74072265625, "learning_rate": 6.406271417771417e-07, "logits/chosen": -4.969531059265137, "logits/rejected": -5.007031440734863, "logps/chosen": -401.95623779296875, "logps/rejected": -380.9750061035156, "loss": 0.6091, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 1.6064453125, "rewards/margins": 1.104101538658142, "rewards/rejected": 0.5023437738418579, "step": 1790 }, { "epoch": 0.942161737764983, "grad_norm": 171.42649841308594, "learning_rate": 6.362368394038253e-07, "logits/chosen": -4.910351753234863, "logits/rejected": -4.897265434265137, "logps/chosen": -365.66876220703125, "logps/rejected": -343.375, "loss": 0.6408, "rewards/accuracies": 0.59375, "rewards/chosen": 1.3917968273162842, "rewards/margins": 0.897656261920929, "rewards/rejected": 0.494140625, "step": 1800 }, { "epoch": 0.9473959696414551, "grad_norm": 114.25118255615234, "learning_rate": 6.318351612936251e-07, "logits/chosen": -4.850976467132568, "logits/rejected": -4.774609565734863, "logps/chosen": -363.95001220703125, "logps/rejected": -312.53125, "loss": 0.6541, "rewards/accuracies": 0.5531250238418579, "rewards/chosen": 1.3136718273162842, "rewards/margins": 0.841796875, "rewards/rejected": 0.47187501192092896, "step": 1810 }, { "epoch": 0.9526302015179272, "grad_norm": 159.3045654296875, "learning_rate": 6.27422474985396e-07, "logits/chosen": -4.926367282867432, "logits/rejected": -4.9013671875, "logps/chosen": -389.125, 
"logps/rejected": -313.79998779296875, "loss": 0.6071, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.5886719226837158, "rewards/margins": 0.958789050579071, "rewards/rejected": 0.6298828125, "step": 1820 }, { "epoch": 0.9578644333943994, "grad_norm": 221.9844207763672, "learning_rate": 6.229991489371753e-07, "logits/chosen": -4.910546779632568, "logits/rejected": -4.9111328125, "logps/chosen": -373.8125, "logps/rejected": -345.90625, "loss": 0.6855, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": 1.5886719226837158, "rewards/margins": 0.785351574420929, "rewards/rejected": 0.8033202886581421, "step": 1830 }, { "epoch": 0.9630986652708715, "grad_norm": 193.92507934570312, "learning_rate": 6.185655524954147e-07, "logits/chosen": -4.938672065734863, "logits/rejected": -4.956640720367432, "logps/chosen": -387.70623779296875, "logps/rejected": -347.2875061035156, "loss": 0.581, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 1.5148437023162842, "rewards/margins": 1.0539062023162842, "rewards/rejected": 0.4609375, "step": 1840 }, { "epoch": 0.9683328971473436, "grad_norm": 216.8731231689453, "learning_rate": 6.141220558641415e-07, "logits/chosen": -4.947070121765137, "logits/rejected": -4.956445217132568, "logps/chosen": -460.01873779296875, "logps/rejected": -384.41876220703125, "loss": 0.6168, "rewards/accuracies": 0.640625, "rewards/chosen": 1.6025390625, "rewards/margins": 1.1396484375, "rewards/rejected": 0.462890625, "step": 1850 }, { "epoch": 0.9735671290238157, "grad_norm": 114.47911834716797, "learning_rate": 6.096690300740452e-07, "logits/chosen": -4.8544921875, "logits/rejected": -4.810742378234863, "logps/chosen": -345.4750061035156, "logps/rejected": -307.84375, "loss": 0.5575, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 1.350195288658142, "rewards/margins": 0.979296863079071, "rewards/rejected": 0.37089842557907104, "step": 1860 }, { "epoch": 0.9788013609002879, "grad_norm": 180.49423217773438, 
"learning_rate": 6.052068469514983e-07, "logits/chosen": -4.933398246765137, "logits/rejected": -4.896093845367432, "logps/chosen": -382.9375, "logps/rejected": -338.0687561035156, "loss": 0.5899, "rewards/accuracies": 0.625, "rewards/chosen": 1.393945336341858, "rewards/margins": 1.044921875, "rewards/rejected": 0.3490234315395355, "step": 1870 }, { "epoch": 0.98403559277676, "grad_norm": 187.371337890625, "learning_rate": 6.007358790875071e-07, "logits/chosen": -4.919921875, "logits/rejected": -4.8916015625, "logps/chosen": -384.8125, "logps/rejected": -351.9375, "loss": 0.6323, "rewards/accuracies": 0.606249988079071, "rewards/chosen": 1.2810547351837158, "rewards/margins": 0.92578125, "rewards/rejected": 0.35527342557907104, "step": 1880 }, { "epoch": 0.9892698246532321, "grad_norm": 175.0353240966797, "learning_rate": 5.962564998066017e-07, "logits/chosen": -4.9541015625, "logits/rejected": -4.867578029632568, "logps/chosen": -408.17498779296875, "logps/rejected": -379.53125, "loss": 0.6667, "rewards/accuracies": 0.5625, "rewards/chosen": 1.1785156726837158, "rewards/margins": 0.717578113079071, "rewards/rejected": 0.4609375, "step": 1890 }, { "epoch": 0.9945040565297043, "grad_norm": 172.1247100830078, "learning_rate": 5.917690831356632e-07, "logits/chosen": -4.98828125, "logits/rejected": -4.936132907867432, "logps/chosen": -403.5375061035156, "logps/rejected": -357.3374938964844, "loss": 0.6926, "rewards/accuracies": 0.559374988079071, "rewards/chosen": 1.2490234375, "rewards/margins": 0.8423827886581421, "rewards/rejected": 0.4066406190395355, "step": 1900 }, { "epoch": 0.9997382884061764, "grad_norm": 194.06443786621094, "learning_rate": 5.872740037726918e-07, "logits/chosen": -4.913671970367432, "logits/rejected": -4.895703315734863, "logps/chosen": -411.40625, "logps/rejected": -348.21875, "loss": 0.5596, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 1.272851586341858, "rewards/margins": 1.1232421398162842, "rewards/rejected": 
0.14960937201976776, "step": 1910 }, { "epoch": 1.004710808688825, "grad_norm": 168.50564575195312, "learning_rate": 5.82771637055521e-07, "logits/chosen": -4.8932976722717285, "logits/rejected": -4.887335300445557, "logps/chosen": -343.7894592285156, "logps/rejected": -326.09869384765625, "loss": 0.2841, "rewards/accuracies": 0.8651315569877625, "rewards/chosen": 1.963404655456543, "rewards/margins": 2.168996810913086, "rewards/rejected": -0.2055921107530594, "step": 1920 }, { "epoch": 1.009945040565297, "grad_norm": 60.31418228149414, "learning_rate": 5.78262358930476e-07, "logits/chosen": -4.884375095367432, "logits/rejected": -4.882616996765137, "logps/chosen": -370.48748779296875, "logps/rejected": -322.625, "loss": 0.2411, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.232421875, "rewards/margins": 2.7894530296325684, "rewards/rejected": -0.5570312738418579, "step": 1930 }, { "epoch": 1.0151792724417692, "grad_norm": 62.882877349853516, "learning_rate": 5.737465459209825e-07, "logits/chosen": -4.902539253234863, "logits/rejected": -4.908593654632568, "logps/chosen": -376.0874938964844, "logps/rejected": -338.3125, "loss": 0.3033, "rewards/accuracies": 0.831250011920929, "rewards/chosen": 2.1244139671325684, "rewards/margins": 2.427929639816284, "rewards/rejected": -0.30351561307907104, "step": 1940 }, { "epoch": 1.0204135043182414, "grad_norm": 54.096405029296875, "learning_rate": 5.692245750961274e-07, "logits/chosen": -4.907617092132568, "logits/rejected": -4.908984184265137, "logps/chosen": -400.2875061035156, "logps/rejected": -353.59375, "loss": 0.2533, "rewards/accuracies": 0.856249988079071, "rewards/chosen": 2.226757764816284, "rewards/margins": 2.6015625, "rewards/rejected": -0.3746093809604645, "step": 1950 }, { "epoch": 1.0256477361947134, "grad_norm": 54.655601501464844, "learning_rate": 5.646968240391729e-07, "logits/chosen": -4.991796970367432, "logits/rejected": -4.936718940734863, "logps/chosen": -392.53436279296875, 
"logps/rejected": -331.03125, "loss": 0.2129, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 2.5396485328674316, "rewards/margins": 2.8111329078674316, "rewards/rejected": -0.271484375, "step": 1960 }, { "epoch": 1.0308819680711856, "grad_norm": 44.23270797729492, "learning_rate": 5.601636708160296e-07, "logits/chosen": -4.857421875, "logits/rejected": -4.794335842132568, "logps/chosen": -386.57501220703125, "logps/rejected": -350.91876220703125, "loss": 0.2373, "rewards/accuracies": 0.875, "rewards/chosen": 2.3017578125, "rewards/margins": 2.551953077316284, "rewards/rejected": -0.25019532442092896, "step": 1970 }, { "epoch": 1.0361161999476576, "grad_norm": 87.35701751708984, "learning_rate": 5.55625493943687e-07, "logits/chosen": -4.939062595367432, "logits/rejected": -4.922265529632568, "logps/chosen": -379.01873779296875, "logps/rejected": -326.29376220703125, "loss": 0.2368, "rewards/accuracies": 0.859375, "rewards/chosen": 2.471874952316284, "rewards/margins": 2.732421875, "rewards/rejected": -0.2607421875, "step": 1980 }, { "epoch": 1.0413504318241298, "grad_norm": 78.54818725585938, "learning_rate": 5.510826723586078e-07, "logits/chosen": -5.026757717132568, "logits/rejected": -4.898046970367432, "logps/chosen": -399.7250061035156, "logps/rejected": -349.2749938964844, "loss": 0.2787, "rewards/accuracies": 0.84375, "rewards/chosen": 2.4009766578674316, "rewards/margins": 2.603515625, "rewards/rejected": -0.20234374701976776, "step": 1990 }, { "epoch": 1.046584663700602, "grad_norm": 67.9875259399414, "learning_rate": 5.465355853850871e-07, "logits/chosen": -4.983984470367432, "logits/rejected": -4.919726371765137, "logps/chosen": -384.8187561035156, "logps/rejected": -346.76251220703125, "loss": 0.2668, "rewards/accuracies": 0.878125011920929, "rewards/chosen": 2.5220704078674316, "rewards/margins": 2.5537109375, "rewards/rejected": -0.03164062649011612, "step": 2000 }, { "epoch": 1.046584663700602, "eval_logits/chosen": -4.985968589782715, 
"eval_logits/rejected": -4.93931245803833, "eval_logps/chosen": -391.64849853515625, "eval_logps/rejected": -346.8900146484375, "eval_loss": 0.645520031452179, "eval_rewards/accuracies": 0.6115000247955322, "eval_rewards/chosen": 1.6663438081741333, "eval_rewards/margins": 0.9365624785423279, "eval_rewards/rejected": 0.7297812700271606, "eval_runtime": 245.3356, "eval_samples_per_second": 8.152, "eval_steps_per_second": 2.038, "step": 2000 }, { "epoch": 1.051818895577074, "grad_norm": 74.96888732910156, "learning_rate": 5.41984612703579e-07, "logits/chosen": -5.0869140625, "logits/rejected": -4.9599609375, "logps/chosen": -383.61248779296875, "logps/rejected": -343.46875, "loss": 0.2217, "rewards/accuracies": 0.9156249761581421, "rewards/chosen": 2.5923829078674316, "rewards/margins": 2.7876954078674316, "rewards/rejected": -0.1953125, "step": 2010 }, { "epoch": 1.0570531274535462, "grad_norm": 53.94102478027344, "learning_rate": 5.37430134318992e-07, "logits/chosen": -4.963476657867432, "logits/rejected": -4.937109470367432, "logps/chosen": -389.93438720703125, "logps/rejected": -335.5, "loss": 0.2719, "rewards/accuracies": 0.856249988079071, "rewards/chosen": 2.5130858421325684, "rewards/margins": 2.5259766578674316, "rewards/rejected": -0.01279296912252903, "step": 2020 }, { "epoch": 1.0622873593300184, "grad_norm": 74.42227935791016, "learning_rate": 5.328725305289612e-07, "logits/chosen": -4.899804592132568, "logits/rejected": -4.884375095367432, "logps/chosen": -384.09375, "logps/rejected": -348.20623779296875, "loss": 0.2571, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": 2.3919920921325684, "rewards/margins": 2.663867235183716, "rewards/rejected": -0.2718749940395355, "step": 2030 }, { "epoch": 1.0675215912064904, "grad_norm": 59.78847885131836, "learning_rate": 5.283121818920911e-07, "logits/chosen": -4.978711128234863, "logits/rejected": -5.005859375, "logps/chosen": -419.3687438964844, "logps/rejected": -367.98748779296875, "loss": 0.2419, 
"rewards/accuracies": 0.8968750238418579, "rewards/chosen": 2.4642577171325684, "rewards/margins": 2.6527342796325684, "rewards/rejected": -0.1884765625, "step": 2040 }, { "epoch": 1.0727558230829626, "grad_norm": 64.66287231445312, "learning_rate": 5.237494691961808e-07, "logits/chosen": -4.956445217132568, "logits/rejected": -4.892578125, "logps/chosen": -387.07501220703125, "logps/rejected": -343.56561279296875, "loss": 0.2306, "rewards/accuracies": 0.893750011920929, "rewards/chosen": 2.2974610328674316, "rewards/margins": 2.5179686546325684, "rewards/rejected": -0.22050781548023224, "step": 2050 }, { "epoch": 1.0779900549594348, "grad_norm": 68.53085327148438, "learning_rate": 5.191847734264272e-07, "logits/chosen": -4.952538967132568, "logits/rejected": -4.972265720367432, "logps/chosen": -383.26873779296875, "logps/rejected": -341.75, "loss": 0.2569, "rewards/accuracies": 0.859375, "rewards/chosen": 2.283007860183716, "rewards/margins": 2.440234422683716, "rewards/rejected": -0.15703125298023224, "step": 2060 }, { "epoch": 1.0832242868359068, "grad_norm": 69.19928741455078, "learning_rate": 5.146184757336133e-07, "logits/chosen": -4.997656345367432, "logits/rejected": -4.974218845367432, "logps/chosen": -353.8687438964844, "logps/rejected": -320.78125, "loss": 0.2585, "rewards/accuracies": 0.878125011920929, "rewards/chosen": 2.4677734375, "rewards/margins": 2.5068359375, "rewards/rejected": -0.0390625, "step": 2070 }, { "epoch": 1.088458518712379, "grad_norm": 73.40951538085938, "learning_rate": 5.100509574022827e-07, "logits/chosen": -4.9599609375, "logits/rejected": -4.957812309265137, "logps/chosen": -393.79998779296875, "logps/rejected": -361.20623779296875, "loss": 0.2544, "rewards/accuracies": 0.846875011920929, "rewards/chosen": 2.5498046875, "rewards/margins": 2.5523438453674316, "rewards/rejected": -0.0027343749534338713, "step": 2080 }, { "epoch": 1.0936927505888512, "grad_norm": 73.82657623291016, "learning_rate": 5.054825998189012e-07, 
"logits/chosen": -4.938867092132568, "logits/rejected": -4.915625095367432, "logps/chosen": -379.8125, "logps/rejected": -366.41876220703125, "loss": 0.2322, "rewards/accuracies": 0.8843749761581421, "rewards/chosen": 2.5576171875, "rewards/margins": 2.7880859375, "rewards/rejected": -0.23066405951976776, "step": 2090 }, { "epoch": 1.0989269824653232, "grad_norm": 50.599388122558594, "learning_rate": 5.009137844400127e-07, "logits/chosen": -4.993945121765137, "logits/rejected": -4.975781440734863, "logps/chosen": -400.70623779296875, "logps/rejected": -330.59375, "loss": 0.238, "rewards/accuracies": 0.878125011920929, "rewards/chosen": 2.6611328125, "rewards/margins": 2.726757764816284, "rewards/rejected": -0.0654296875, "step": 2100 }, { "epoch": 1.1041612143417954, "grad_norm": 46.873443603515625, "learning_rate": 4.963448927603866e-07, "logits/chosen": -4.898633003234863, "logits/rejected": -4.884765625, "logps/chosen": -364.32501220703125, "logps/rejected": -327.9906311035156, "loss": 0.2234, "rewards/accuracies": 0.871874988079071, "rewards/chosen": 2.4544920921325684, "rewards/margins": 2.6001954078674316, "rewards/rejected": -0.14570312201976776, "step": 2110 }, { "epoch": 1.1093954462182674, "grad_norm": 51.72285842895508, "learning_rate": 4.917763062811631e-07, "logits/chosen": -4.929491996765137, "logits/rejected": -4.903710842132568, "logps/chosen": -368.45623779296875, "logps/rejected": -355.5687561035156, "loss": 0.2638, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 2.5433592796325684, "rewards/margins": 2.753710985183716, "rewards/rejected": -0.21035155653953552, "step": 2120 }, { "epoch": 1.1146296780947396, "grad_norm": 57.314247131347656, "learning_rate": 4.872084064779983e-07, "logits/chosen": -4.908203125, "logits/rejected": -4.8779296875, "logps/chosen": -374.328125, "logps/rejected": -322.7124938964844, "loss": 0.241, "rewards/accuracies": 0.8968750238418579, "rewards/chosen": 2.5791015625, "rewards/margins": 2.6552734375, 
"rewards/rejected": -0.07636718451976776, "step": 2130 }, { "epoch": 1.1198639099712118, "grad_norm": 76.68910217285156, "learning_rate": 4.826415747692117e-07, "logits/chosen": -4.901562690734863, "logits/rejected": -4.875195503234863, "logps/chosen": -365.3187561035156, "logps/rejected": -330.0062561035156, "loss": 0.2737, "rewards/accuracies": 0.878125011920929, "rewards/chosen": 2.1650390625, "rewards/margins": 2.5414061546325684, "rewards/rejected": -0.37617188692092896, "step": 2140 }, { "epoch": 1.1250981418476838, "grad_norm": 116.68999481201172, "learning_rate": 4.780761924839365e-07, "logits/chosen": -4.8671875, "logits/rejected": -4.887109279632568, "logps/chosen": -364.12188720703125, "logps/rejected": -341.2124938964844, "loss": 0.2771, "rewards/accuracies": 0.856249988079071, "rewards/chosen": 2.296093702316284, "rewards/margins": 2.568164110183716, "rewards/rejected": -0.27265626192092896, "step": 2150 }, { "epoch": 1.130332373724156, "grad_norm": 57.82802963256836, "learning_rate": 4.7351264083027954e-07, "logits/chosen": -4.919140815734863, "logits/rejected": -4.878515720367432, "logps/chosen": -364.1937561035156, "logps/rejected": -336.0406188964844, "loss": 0.2627, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 2.097851514816284, "rewards/margins": 2.4853515625, "rewards/rejected": -0.38749998807907104, "step": 2160 }, { "epoch": 1.1355666056006282, "grad_norm": 68.37049865722656, "learning_rate": 4.689513008634906e-07, "logits/chosen": -4.999609470367432, "logits/rejected": -5.017968654632568, "logps/chosen": -416.5062561035156, "logps/rejected": -363.20623779296875, "loss": 0.2276, "rewards/accuracies": 0.875, "rewards/chosen": 2.353710889816284, "rewards/margins": 3.09228515625, "rewards/rejected": -0.73828125, "step": 2170 }, { "epoch": 1.1408008374771001, "grad_norm": 35.549407958984375, "learning_rate": 4.6439255345414475e-07, "logits/chosen": -4.923242092132568, "logits/rejected": -4.947851657867432, "logps/chosen": 
-398.1187438964844, "logps/rejected": -356.53125, "loss": 0.2645, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": 2.0933594703674316, "rewards/margins": 2.680468797683716, "rewards/rejected": -0.5869140625, "step": 2180 }, { "epoch": 1.1460350693535724, "grad_norm": 55.24445724487305, "learning_rate": 4.5983677925633836e-07, "logits/chosen": -4.909375190734863, "logits/rejected": -4.917578220367432, "logps/chosen": -388.5874938964844, "logps/rejected": -346.88751220703125, "loss": 0.242, "rewards/accuracies": 0.871874988079071, "rewards/chosen": 2.294140577316284, "rewards/margins": 2.731250047683716, "rewards/rejected": -0.4371093809604645, "step": 2190 }, { "epoch": 1.1512693012300446, "grad_norm": 59.90499496459961, "learning_rate": 4.5528435867590595e-07, "logits/chosen": -5.030468940734863, "logits/rejected": -4.964453220367432, "logps/chosen": -406.85626220703125, "logps/rejected": -363.04998779296875, "loss": 0.2505, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": 2.452343702316284, "rewards/margins": 2.865039110183716, "rewards/rejected": -0.41289061307907104, "step": 2200 }, { "epoch": 1.1565035331065165, "grad_norm": 78.35042572021484, "learning_rate": 4.507356718386556e-07, "logits/chosen": -4.877734184265137, "logits/rejected": -4.879687309265137, "logps/chosen": -373.7875061035156, "logps/rejected": -344.2250061035156, "loss": 0.2261, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": 2.4234375953674316, "rewards/margins": 2.846484422683716, "rewards/rejected": -0.4228515625, "step": 2210 }, { "epoch": 1.1617377649829888, "grad_norm": 83.09690856933594, "learning_rate": 4.461910985586298e-07, "logits/chosen": -4.975781440734863, "logits/rejected": -4.931445121765137, "logps/chosen": -388.70623779296875, "logps/rejected": -347.35626220703125, "loss": 0.218, "rewards/accuracies": 0.903124988079071, "rewards/chosen": 2.574414014816284, "rewards/margins": 2.8560547828674316, "rewards/rejected": -0.2818359434604645, "step": 
2220 }, { "epoch": 1.166971996859461, "grad_norm": 75.56656646728516, "learning_rate": 4.4165101830638937e-07, "logits/chosen": -4.910351753234863, "logits/rejected": -4.954297065734863, "logps/chosen": -379.09375, "logps/rejected": -347.3500061035156, "loss": 0.2437, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 2.480664014816284, "rewards/margins": 2.6923828125, "rewards/rejected": -0.21152344346046448, "step": 2230 }, { "epoch": 1.172206228735933, "grad_norm": 118.14814758300781, "learning_rate": 4.3711581017732866e-07, "logits/chosen": -4.850976467132568, "logits/rejected": NaN, "logps/chosen": -369.6187438964844, "logps/rejected": -342.13751220703125, "loss": 0.2805, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 2.346484422683716, "rewards/margins": 2.742968797683716, "rewards/rejected": -0.396484375, "step": 2240 }, { "epoch": 1.1774404606124051, "grad_norm": 67.8809814453125, "learning_rate": 4.325858528600214e-07, "logits/chosen": -4.995507717132568, "logits/rejected": -4.978906154632568, "logps/chosen": -369.4437561035156, "logps/rejected": -358.8812561035156, "loss": 0.2502, "rewards/accuracies": 0.8656250238418579, "rewards/chosen": 2.5513672828674316, "rewards/margins": 2.708789110183716, "rewards/rejected": -0.15742187201976776, "step": 2250 }, { "epoch": 1.1826746924888774, "grad_norm": 76.58126068115234, "learning_rate": 4.280615246046001e-07, "logits/chosen": -4.966601371765137, "logits/rejected": -4.994531154632568, "logps/chosen": -384.4937438964844, "logps/rejected": -347.0218811035156, "loss": 0.2324, "rewards/accuracies": 0.878125011920929, "rewards/chosen": 2.514453172683716, "rewards/margins": 2.820507764816284, "rewards/rejected": -0.3060546815395355, "step": 2260 }, { "epoch": 1.1879089243653493, "grad_norm": 96.10749053955078, "learning_rate": 4.235432031911719e-07, "logits/chosen": -4.909765720367432, "logits/rejected": -4.976953029632568, "logps/chosen": -398.3187561035156, "logps/rejected": -354.51873779296875, 
"loss": 0.2273, "rewards/accuracies": 0.909375011920929, "rewards/chosen": 2.4349608421325684, "rewards/margins": 2.828320264816284, "rewards/rejected": -0.3931640684604645, "step": 2270 }, { "epoch": 1.1931431562418215, "grad_norm": 36.97243118286133, "learning_rate": 4.190312658982747e-07, "logits/chosen": -5.048828125, "logits/rejected": -5.011523246765137, "logps/chosen": -381.58123779296875, "logps/rejected": -353.48126220703125, "loss": 0.2351, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": 2.3939452171325684, "rewards/margins": 2.7671875953674316, "rewards/rejected": -0.37324219942092896, "step": 2280 }, { "epoch": 1.1983773881182938, "grad_norm": 47.53986740112305, "learning_rate": 4.145260894713738e-07, "logits/chosen": -4.933984279632568, "logits/rejected": -4.8203125, "logps/chosen": -368.8187561035156, "logps/rejected": -331.3687438964844, "loss": 0.2506, "rewards/accuracies": 0.871874988079071, "rewards/chosen": 2.228710889816284, "rewards/margins": 2.793164014816284, "rewards/rejected": -0.564648449420929, "step": 2290 }, { "epoch": 1.2036116199947657, "grad_norm": 58.13813018798828, "learning_rate": 4.1002805009140464e-07, "logits/chosen": -4.984375, "logits/rejected": -4.928515434265137, "logps/chosen": -376.4624938964844, "logps/rejected": -346.1812438964844, "loss": 0.2379, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": 2.217578172683716, "rewards/margins": 2.5972657203674316, "rewards/rejected": -0.37968748807907104, "step": 2300 }, { "epoch": 1.208845851871238, "grad_norm": 39.71947479248047, "learning_rate": 4.055375233433608e-07, "logits/chosen": -5.012499809265137, "logits/rejected": -4.912499904632568, "logps/chosen": -376.9312438964844, "logps/rejected": -329.20001220703125, "loss": 0.2367, "rewards/accuracies": 0.90625, "rewards/chosen": 2.112499952316284, "rewards/margins": 2.7300782203674316, "rewards/rejected": -0.6175781488418579, "step": 2310 }, { "epoch": 1.2140800837477101, "grad_norm": 112.1904067993164, 
"learning_rate": 4.010548841849336e-07, "logits/chosen": -4.931250095367432, "logits/rejected": -4.865429878234863, "logps/chosen": -406.125, "logps/rejected": -326.5625, "loss": 0.2743, "rewards/accuracies": 0.856249988079071, "rewards/chosen": 2.323046922683716, "rewards/margins": 2.7718749046325684, "rewards/rejected": -0.4488281309604645, "step": 2320 }, { "epoch": 1.2193143156241821, "grad_norm": 78.99494171142578, "learning_rate": 3.9658050691520243e-07, "logits/chosen": -4.9091796875, "logits/rejected": -4.903906345367432, "logps/chosen": -369.9281311035156, "logps/rejected": -345.23748779296875, "loss": 0.2513, "rewards/accuracies": 0.893750011920929, "rewards/chosen": 2.247851610183716, "rewards/margins": 2.8666014671325684, "rewards/rejected": -0.6187499761581421, "step": 2330 }, { "epoch": 1.2245485475006543, "grad_norm": 109.476806640625, "learning_rate": 3.921147651433822e-07, "logits/chosen": null, "logits/rejected": -4.919140815734863, "logps/chosen": -401.16876220703125, "logps/rejected": -353.3187561035156, "loss": 0.2577, "rewards/accuracies": 0.8656250238418579, "rewards/chosen": 2.333984375, "rewards/margins": 2.9466795921325684, "rewards/rejected": -0.6126953363418579, "step": 2340 }, { "epoch": 1.2297827793771263, "grad_norm": 46.796295166015625, "learning_rate": 3.8765803175762547e-07, "logits/chosen": -4.849511623382568, "logits/rejected": -4.8603515625, "logps/chosen": -369.9593811035156, "logps/rejected": -336.3343811035156, "loss": 0.3129, "rewards/accuracies": 0.828125, "rewards/chosen": 1.989843726158142, "rewards/margins": 2.5755858421325684, "rewards/rejected": -0.585742175579071, "step": 2350 }, { "epoch": 1.2350170112535985, "grad_norm": 90.4599838256836, "learning_rate": 3.832106788938873e-07, "logits/chosen": -4.973046779632568, "logits/rejected": -4.943945407867432, "logps/chosen": -381.9312438964844, "logps/rejected": -350.25, "loss": 0.2312, "rewards/accuracies": 0.890625, "rewards/chosen": 2.37890625, "rewards/margins": 
2.8921875953674316, "rewards/rejected": -0.5132812261581421, "step": 2360 }, { "epoch": 1.2402512431300707, "grad_norm": 82.85678100585938, "learning_rate": 3.7877307790485204e-07, "logits/chosen": -4.908984184265137, "logits/rejected": -4.899218559265137, "logps/chosen": -378.9312438964844, "logps/rejected": -312.8500061035156, "loss": 0.2501, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 2.094531297683716, "rewards/margins": 2.6470704078674316, "rewards/rejected": -0.552734375, "step": 2370 }, { "epoch": 1.2454854750065427, "grad_norm": 96.87503814697266, "learning_rate": 3.7434559932892527e-07, "logits/chosen": -5.006640434265137, "logits/rejected": -4.9267578125, "logps/chosen": -418.45623779296875, "logps/rejected": -350.4312438964844, "loss": 0.2299, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": 2.431835889816284, "rewards/margins": 3.082812547683716, "rewards/rejected": -0.651171863079071, "step": 2380 }, { "epoch": 1.250719706883015, "grad_norm": 79.17874908447266, "learning_rate": 3.699286128592939e-07, "logits/chosen": -4.991601467132568, "logits/rejected": -4.909960746765137, "logps/chosen": -423.78125, "logps/rejected": -365.0249938964844, "loss": 0.2314, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": 2.4359374046325684, "rewards/margins": 3.0562500953674316, "rewards/rejected": -0.6205078363418579, "step": 2390 }, { "epoch": 1.255953938759487, "grad_norm": 68.54492950439453, "learning_rate": 3.655224873130571e-07, "logits/chosen": -4.928320407867432, "logits/rejected": -4.9189453125, "logps/chosen": -410.15625, "logps/rejected": -359.3187561035156, "loss": 0.2251, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 2.441601514816284, "rewards/margins": 3.0054688453674316, "rewards/rejected": -0.5638672113418579, "step": 2400 }, { "epoch": 1.2611881706359591, "grad_norm": 44.794761657714844, "learning_rate": 3.611275906004298e-07, "logits/chosen": -4.94140625, "logits/rejected": -4.926562309265137, 
"logps/chosen": -400.1000061035156, "logps/rejected": -353.46875, "loss": 0.2377, "rewards/accuracies": 0.875, "rewards/chosen": 2.7152342796325684, "rewards/margins": 2.9957032203674316, "rewards/rejected": -0.2802734375, "step": 2410 }, { "epoch": 1.2664224025124313, "grad_norm": 55.99201965332031, "learning_rate": 3.5674428969402306e-07, "logits/chosen": -5.028124809265137, "logits/rejected": -4.926562309265137, "logps/chosen": -414.7875061035156, "logps/rejected": -356.6968688964844, "loss": 0.2463, "rewards/accuracies": 0.878125011920929, "rewards/chosen": 2.388867139816284, "rewards/margins": 2.8080077171325684, "rewards/rejected": -0.4195312559604645, "step": 2420 }, { "epoch": 1.2716566343889033, "grad_norm": 56.941383361816406, "learning_rate": 3.523729505982008e-07, "logits/chosen": -4.903710842132568, "logits/rejected": -4.906054496765137, "logps/chosen": -356.3843688964844, "logps/rejected": -309.8843688964844, "loss": 0.2887, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": 2.256054639816284, "rewards/margins": 2.5619139671325684, "rewards/rejected": -0.30585938692092896, "step": 2430 }, { "epoch": 1.2768908662653755, "grad_norm": 72.36155700683594, "learning_rate": 3.480139383185199e-07, "logits/chosen": -4.971289157867432, "logits/rejected": -4.980273246765137, "logps/chosen": -396.48126220703125, "logps/rejected": -360.7437438964844, "loss": 0.2632, "rewards/accuracies": 0.828125, "rewards/chosen": 2.380078077316284, "rewards/margins": 2.7652344703674316, "rewards/rejected": -0.3851562440395355, "step": 2440 }, { "epoch": 1.2821250981418477, "grad_norm": 34.55730438232422, "learning_rate": 3.436676168312508e-07, "logits/chosen": -4.944921970367432, "logits/rejected": -4.951171875, "logps/chosen": -389.1187438964844, "logps/rejected": -337.2875061035156, "loss": 0.1976, "rewards/accuracies": 0.921875, "rewards/chosen": 2.6058592796325684, "rewards/margins": 3.14453125, "rewards/rejected": -0.539257824420929, "step": 2450 }, { "epoch": 
1.2873593300183197, "grad_norm": 40.22491455078125, "learning_rate": 3.393343490529874e-07, "logits/chosen": -4.990429878234863, "logits/rejected": -4.953125, "logps/chosen": -373.5874938964844, "logps/rejected": -318.79998779296875, "loss": 0.2214, "rewards/accuracies": 0.878125011920929, "rewards/chosen": 2.4576172828674316, "rewards/margins": 2.864062547683716, "rewards/rejected": -0.4066406190395355, "step": 2460 }, { "epoch": 1.292593561894792, "grad_norm": 102.94782257080078, "learning_rate": 3.35014496810342e-07, "logits/chosen": -4.963281154632568, "logits/rejected": -4.998437404632568, "logps/chosen": -400.2437438964844, "logps/rejected": -359.1937561035156, "loss": 0.2433, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 2.50390625, "rewards/margins": 2.947460889816284, "rewards/rejected": -0.44355469942092896, "step": 2470 }, { "epoch": 1.2978277937712641, "grad_norm": 52.22513198852539, "learning_rate": 3.3070842080973365e-07, "logits/chosen": -4.932812690734863, "logits/rejected": -4.923047065734863, "logps/chosen": -363.6812438964844, "logps/rejected": -320.07501220703125, "loss": 0.2448, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": 2.428906202316284, "rewards/margins": 2.8285155296325684, "rewards/rejected": -0.3998046815395355, "step": 2480 }, { "epoch": 1.303062025647736, "grad_norm": 69.16712188720703, "learning_rate": 3.264164806072691e-07, "logits/chosen": -5.05078125, "logits/rejected": -5.073437690734863, "logps/chosen": -426.4750061035156, "logps/rejected": -353.6000061035156, "loss": 0.2077, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 2.7314453125, "rewards/margins": 3.125195264816284, "rewards/rejected": -0.39453125, "step": 2490 }, { "epoch": 1.3082962575242083, "grad_norm": 102.37962341308594, "learning_rate": 3.221390345787205e-07, "logits/chosen": -4.889452934265137, "logits/rejected": -4.817578315734863, "logps/chosen": -370.3500061035156, "logps/rejected": -355.6625061035156, "loss": 0.2444, 
"rewards/accuracies": 0.890625, "rewards/chosen": 2.345898389816284, "rewards/margins": 2.8837890625, "rewards/rejected": -0.537890613079071, "step": 2500 }, { "epoch": 1.3135304894006805, "grad_norm": 92.7837905883789, "learning_rate": 3.178764398895999e-07, "logits/chosen": -4.877734184265137, "logits/rejected": -4.872265815734863, "logps/chosen": -359.88751220703125, "logps/rejected": -334.8125, "loss": 0.2433, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 2.2601561546325684, "rewards/margins": 2.8324217796325684, "rewards/rejected": -0.572070300579071, "step": 2510 }, { "epoch": 1.3187647212771525, "grad_norm": 100.86676788330078, "learning_rate": 3.1362905246533733e-07, "logits/chosen": -4.859570503234863, "logits/rejected": -4.906640529632568, "logps/chosen": -347.79998779296875, "logps/rejected": -337.0, "loss": 0.2659, "rewards/accuracies": 0.878125011920929, "rewards/chosen": 2.224414110183716, "rewards/margins": 2.6773438453674316, "rewards/rejected": -0.45292967557907104, "step": 2520 }, { "epoch": 1.3239989531536247, "grad_norm": 123.00813293457031, "learning_rate": 3.093972269615602e-07, "logits/chosen": -4.939843654632568, "logits/rejected": -4.920312404632568, "logps/chosen": -360.23126220703125, "logps/rejected": -323.1937561035156, "loss": 0.2781, "rewards/accuracies": 0.84375, "rewards/chosen": 2.260937452316284, "rewards/margins": 2.7173829078674316, "rewards/rejected": -0.4564453065395355, "step": 2530 }, { "epoch": 1.329233185030097, "grad_norm": 69.66950225830078, "learning_rate": 3.051813167344807e-07, "logits/chosen": -4.928515434265137, "logits/rejected": -4.857031345367432, "logps/chosen": -369.6499938964844, "logps/rejected": -342.21875, "loss": 0.2526, "rewards/accuracies": 0.8656250238418579, "rewards/chosen": 2.353271484375, "rewards/margins": 2.8285155296325684, "rewards/rejected": -0.47539061307907104, "step": 2540 }, { "epoch": 1.334467416906569, "grad_norm": 72.55719757080078, "learning_rate": 3.009816738113891e-07, 
"logits/chosen": -4.937109470367432, "logits/rejected": -4.958788871765137, "logps/chosen": -396.54998779296875, "logps/rejected": -354.60626220703125, "loss": 0.2286, "rewards/accuracies": 0.8968750238418579, "rewards/chosen": 2.2650389671325684, "rewards/margins": 2.900585889816284, "rewards/rejected": -0.635546863079071, "step": 2550 }, { "epoch": 1.339701648783041, "grad_norm": 96.79957580566406, "learning_rate": 2.967986488612611e-07, "logits/chosen": -4.9521484375, "logits/rejected": -4.937890529632568, "logps/chosen": -395.7875061035156, "logps/rejected": -342.1625061035156, "loss": 0.2458, "rewards/accuracies": 0.878125011920929, "rewards/chosen": 2.552539110183716, "rewards/margins": 2.956835985183716, "rewards/rejected": -0.40449219942092896, "step": 2560 }, { "epoch": 1.3449358806595133, "grad_norm": 59.690975189208984, "learning_rate": 2.92632591165476e-07, "logits/chosen": -5.039453029632568, "logits/rejected": -4.9697265625, "logps/chosen": -391.9125061035156, "logps/rejected": -347.4375, "loss": 0.2241, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.538281202316284, "rewards/margins": 3.0658202171325684, "rewards/rejected": -0.5279296636581421, "step": 2570 }, { "epoch": 1.3501701125359853, "grad_norm": 53.605316162109375, "learning_rate": 2.884838485886531e-07, "logits/chosen": -4.906836032867432, "logits/rejected": -4.842968940734863, "logps/chosen": -371.6968688964844, "logps/rejected": -326.375, "loss": 0.2727, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 2.492382764816284, "rewards/margins": 2.8648438453674316, "rewards/rejected": -0.37226563692092896, "step": 2580 }, { "epoch": 1.3554043444124575, "grad_norm": 77.24868774414062, "learning_rate": 2.8435276754960316e-07, "logits/chosen": -4.9951171875, "logits/rejected": -4.946093559265137, "logps/chosen": -375.26873779296875, "logps/rejected": -365.3187561035156, "loss": 0.3203, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 2.1714844703674316, 
"rewards/margins": 2.57421875, "rewards/rejected": -0.402587890625, "step": 2590 }, { "epoch": 1.3606385762889297, "grad_norm": 59.24010467529297, "learning_rate": 2.802396929924042e-07, "logits/chosen": -4.954492092132568, "logits/rejected": -4.826757907867432, "logps/chosen": -359.73748779296875, "logps/rejected": -328.046875, "loss": 0.2425, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": 2.259765625, "rewards/margins": 2.7972655296325684, "rewards/rejected": -0.5376952886581421, "step": 2600 }, { "epoch": 1.3658728081654017, "grad_norm": 74.72590637207031, "learning_rate": 2.761449683575979e-07, "logits/chosen": -4.900000095367432, "logits/rejected": -4.919531345367432, "logps/chosen": -357.296875, "logps/rejected": -334.96875, "loss": 0.2071, "rewards/accuracies": 0.903124988079071, "rewards/chosen": 2.4410157203674316, "rewards/margins": 3.023242235183716, "rewards/rejected": -0.582226574420929, "step": 2610 }, { "epoch": 1.371107040041874, "grad_norm": 87.253173828125, "learning_rate": 2.720689355535133e-07, "logits/chosen": -4.9033203125, "logits/rejected": -4.883984565734863, "logps/chosen": -341.98126220703125, "logps/rejected": -303.5687561035156, "loss": 0.2622, "rewards/accuracies": 0.878125011920929, "rewards/chosen": 2.1673827171325684, "rewards/margins": 2.537890672683716, "rewards/rejected": -0.3705078065395355, "step": 2620 }, { "epoch": 1.376341271918346, "grad_norm": 69.79808807373047, "learning_rate": 2.680119349277163e-07, "logits/chosen": -4.940820217132568, "logits/rejected": -4.848828315734863, "logps/chosen": -371.5562438964844, "logps/rejected": -350.7875061035156, "loss": 0.2349, "rewards/accuracies": 0.878125011920929, "rewards/chosen": 2.3643555641174316, "rewards/margins": 2.701367139816284, "rewards/rejected": -0.3369140625, "step": 2630 }, { "epoch": 1.381575503794818, "grad_norm": 56.120361328125, "learning_rate": 2.639743052385917e-07, "logits/chosen": -5.014062404632568, "logits/rejected": -4.997851371765137, 
"logps/chosen": -389.9437561035156, "logps/rejected": -335.42498779296875, "loss": 0.2701, "rewards/accuracies": 0.8656250238418579, "rewards/chosen": 2.588085889816284, "rewards/margins": 2.818164110183716, "rewards/rejected": -0.23027344048023224, "step": 2640 }, { "epoch": 1.3868097356712903, "grad_norm": 65.84300994873047, "learning_rate": 2.599563836270564e-07, "logits/chosen": -4.965039253234863, "logits/rejected": -5.0, "logps/chosen": -401.09375, "logps/rejected": -363.1187438964844, "loss": 0.2306, "rewards/accuracies": 0.875, "rewards/chosen": 2.4185547828674316, "rewards/margins": 2.8472657203674316, "rewards/rejected": -0.4287109375, "step": 2650 }, { "epoch": 1.3920439675477625, "grad_norm": 45.584815979003906, "learning_rate": 2.55958505588409e-07, "logits/chosen": -4.946093559265137, "logits/rejected": -4.980859279632568, "logps/chosen": -405.0687561035156, "logps/rejected": -364.7250061035156, "loss": 0.2327, "rewards/accuracies": 0.890625, "rewards/chosen": 2.44140625, "rewards/margins": 3.0033202171325684, "rewards/rejected": -0.561718761920929, "step": 2660 }, { "epoch": 1.3972781994242345, "grad_norm": 99.71664428710938, "learning_rate": 2.519810049443152e-07, "logits/chosen": -4.970703125, "logits/rejected": -4.933789253234863, "logps/chosen": -384.6000061035156, "logps/rejected": -324.0062561035156, "loss": 0.2176, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": 2.5091795921325684, "rewards/margins": 2.876171827316284, "rewards/rejected": -0.36699217557907104, "step": 2670 }, { "epoch": 1.4025124313007067, "grad_norm": 89.31651306152344, "learning_rate": 2.4802421381493405e-07, "logits/chosen": -4.931250095367432, "logits/rejected": -4.904101371765137, "logps/chosen": -396.45001220703125, "logps/rejected": -361.52813720703125, "loss": 0.2371, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": 2.5414061546325684, "rewards/margins": 3.0113282203674316, "rewards/rejected": -0.46992188692092896, "step": 2680 }, { "epoch": 
1.4077466631771787, "grad_norm": 95.61458587646484, "learning_rate": 2.440884625911861e-07, "logits/chosen": -4.889062404632568, "logits/rejected": -4.907617092132568, "logps/chosen": -370.75, "logps/rejected": -322.85626220703125, "loss": 0.2485, "rewards/accuracies": 0.8968750238418579, "rewards/chosen": 2.578320264816284, "rewards/margins": 2.932421922683716, "rewards/rejected": -0.35429686307907104, "step": 2690 }, { "epoch": 1.4129808950536509, "grad_norm": 48.88241958618164, "learning_rate": 2.4017407990716597e-07, "logits/chosen": -4.9853515625, "logits/rejected": -4.889355659484863, "logps/chosen": -397.48126220703125, "logps/rejected": -372.3062438964844, "loss": 0.2182, "rewards/accuracies": 0.903124988079071, "rewards/chosen": 2.41015625, "rewards/margins": 2.883984327316284, "rewards/rejected": -0.4740234315395355, "step": 2700 }, { "epoch": 1.418215126930123, "grad_norm": 69.61422729492188, "learning_rate": 2.3628139261270135e-07, "logits/chosen": -4.847070217132568, "logits/rejected": -4.800976753234863, "logps/chosen": -358.6499938964844, "logps/rejected": -308.45001220703125, "loss": 0.274, "rewards/accuracies": 0.846875011920929, "rewards/chosen": 2.2535157203674316, "rewards/margins": 2.5884766578674316, "rewards/rejected": -0.3349609375, "step": 2710 }, { "epoch": 1.423449358806595, "grad_norm": 70.08207702636719, "learning_rate": 2.3241072574606102e-07, "logits/chosen": -4.863085746765137, "logits/rejected": -4.841210842132568, "logps/chosen": -380.0718688964844, "logps/rejected": -336.5249938964844, "loss": 0.2373, "rewards/accuracies": 0.8656250238418579, "rewards/chosen": 2.456249952316284, "rewards/margins": 2.755078077316284, "rewards/rejected": -0.298828125, "step": 2720 }, { "epoch": 1.4286835906830673, "grad_norm": 108.25699615478516, "learning_rate": 2.285624025068143e-07, "logits/chosen": -4.928906440734863, "logits/rejected": -4.945898532867432, "logps/chosen": -368.26251220703125, "logps/rejected": -341.04376220703125, "loss": 
0.3042, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": 2.263671875, "rewards/margins": 2.716601610183716, "rewards/rejected": -0.453125, "step": 2730 }, { "epoch": 1.4339178225595393, "grad_norm": 126.85956573486328, "learning_rate": 2.247367442288446e-07, "logits/chosen": -4.969140529632568, "logits/rejected": -4.921484470367432, "logps/chosen": -386.3374938964844, "logps/rejected": -326.51873779296875, "loss": 0.2715, "rewards/accuracies": 0.8656250238418579, "rewards/chosen": 2.284960985183716, "rewards/margins": 2.5882811546325684, "rewards/rejected": -0.30351561307907104, "step": 2740 }, { "epoch": 1.4391520544360115, "grad_norm": 123.19251251220703, "learning_rate": 2.209340703535169e-07, "logits/chosen": -4.9365234375, "logits/rejected": -4.944531440734863, "logps/chosen": -372.375, "logps/rejected": -365.01873779296875, "loss": 0.2462, "rewards/accuracies": 0.871874988079071, "rewards/chosen": 2.5208983421325684, "rewards/margins": 2.841992139816284, "rewards/rejected": -0.3208984434604645, "step": 2750 }, { "epoch": 1.4443862863124837, "grad_norm": 61.12422561645508, "learning_rate": 2.171546984030056e-07, "logits/chosen": -4.8955078125, "logits/rejected": -4.868750095367432, "logps/chosen": -400.45623779296875, "logps/rejected": -323.54998779296875, "loss": 0.2206, "rewards/accuracies": 0.856249988079071, "rewards/chosen": 2.463085889816284, "rewards/margins": 2.9483399391174316, "rewards/rejected": -0.4852539002895355, "step": 2760 }, { "epoch": 1.4496205181889557, "grad_norm": 54.24504852294922, "learning_rate": 2.1339894395378067e-07, "logits/chosen": -4.916211128234863, "logits/rejected": -4.928515434265137, "logps/chosen": -377.65625, "logps/rejected": -348.98748779296875, "loss": 0.2601, "rewards/accuracies": 0.878125011920929, "rewards/chosen": 2.469921827316284, "rewards/margins": 2.9175782203674316, "rewards/rejected": -0.4482421875, "step": 2770 }, { "epoch": 1.4548547500654279, "grad_norm": 64.151123046875, "learning_rate": 
2.096671206102582e-07, "logits/chosen": -4.880663871765137, "logits/rejected": -4.913476467132568, "logps/chosen": -358.01251220703125, "logps/rejected": -327.5062561035156, "loss": 0.2538, "rewards/accuracies": 0.890625, "rewards/chosen": 2.448046922683716, "rewards/margins": 2.739062547683716, "rewards/rejected": -0.29121094942092896, "step": 2780 }, { "epoch": 1.4600889819419, "grad_norm": 59.61539077758789, "learning_rate": 2.0595953997861326e-07, "logits/chosen": -4.99609375, "logits/rejected": -4.982031345367432, "logps/chosen": -398.17498779296875, "logps/rejected": -374.73126220703125, "loss": 0.2142, "rewards/accuracies": 0.8843749761581421, "rewards/chosen": 2.585742235183716, "rewards/margins": 3.0992188453674316, "rewards/rejected": -0.513867199420929, "step": 2790 }, { "epoch": 1.465323213818372, "grad_norm": 49.19828796386719, "learning_rate": 2.0227651164076153e-07, "logits/chosen": -4.9931640625, "logits/rejected": -5.001953125, "logps/chosen": -365.3687438964844, "logps/rejected": -352.85626220703125, "loss": 0.2335, "rewards/accuracies": 0.859375, "rewards/chosen": 2.6205077171325684, "rewards/margins": 2.941210985183716, "rewards/rejected": -0.3208984434604645, "step": 2800 }, { "epoch": 1.4705574456948443, "grad_norm": 76.77084350585938, "learning_rate": 1.986183431285095e-07, "logits/chosen": -4.971093654632568, "logits/rejected": -4.912890434265137, "logps/chosen": -392.0625, "logps/rejected": -370.3812561035156, "loss": 0.2214, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": 2.473437547683716, "rewards/margins": 2.940234422683716, "rewards/rejected": -0.4671874940395355, "step": 2810 }, { "epoch": 1.4757916775713165, "grad_norm": 71.76081085205078, "learning_rate": 1.9498533989787508e-07, "logits/chosen": -4.878613471984863, "logits/rejected": -4.786334037780762, "logps/chosen": -366.29998779296875, "logps/rejected": -343.54998779296875, "loss": 0.236, "rewards/accuracies": 0.875, "rewards/chosen": 2.3017578125, "rewards/margins": 
2.793750047683716, "rewards/rejected": -0.49199217557907104, "step": 2820 }, { "epoch": 1.4810259094477884, "grad_norm": 69.72960662841797, "learning_rate": 1.9137780530358255e-07, "logits/chosen": -4.914453029632568, "logits/rejected": -4.9453125, "logps/chosen": -362.48748779296875, "logps/rejected": -353.91876220703125, "loss": 0.2212, "rewards/accuracies": 0.8843749761581421, "rewards/chosen": 2.5328125953674316, "rewards/margins": 2.9712891578674316, "rewards/rejected": -0.4384765625, "step": 2830 }, { "epoch": 1.4862601413242607, "grad_norm": 77.40349578857422, "learning_rate": 1.8779604057373232e-07, "logits/chosen": -4.992773532867432, "logits/rejected": -4.929296970367432, "logps/chosen": -411.50311279296875, "logps/rejected": -356.93438720703125, "loss": 0.2398, "rewards/accuracies": 0.893750011920929, "rewards/chosen": 2.7005858421325684, "rewards/margins": 3.003124952316284, "rewards/rejected": -0.3023437559604645, "step": 2840 }, { "epoch": 1.4914943732007329, "grad_norm": 55.684852600097656, "learning_rate": 1.842403447846485e-07, "logits/chosen": -4.970312595367432, "logits/rejected": -4.931836128234863, "logps/chosen": -388.2250061035156, "logps/rejected": -353.96563720703125, "loss": 0.2158, "rewards/accuracies": 0.8843749761581421, "rewards/chosen": 2.5423827171325684, "rewards/margins": 3.010937452316284, "rewards/rejected": -0.4681640565395355, "step": 2850 }, { "epoch": 1.4967286050772048, "grad_norm": 72.16903686523438, "learning_rate": 1.8071101483590657e-07, "logits/chosen": -4.944140434265137, "logits/rejected": -4.874413967132568, "logps/chosen": -430.2250061035156, "logps/rejected": -371.82501220703125, "loss": 0.221, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": 2.6324219703674316, "rewards/margins": 3.277148485183716, "rewards/rejected": -0.6449218988418579, "step": 2860 }, { "epoch": 1.501962836953677, "grad_norm": 64.22447967529297, "learning_rate": 1.772083454255413e-07, "logits/chosen": -4.961133003234863, 
"logits/rejected": -5.019921779632568, "logps/chosen": -406.40625, "logps/rejected": -359.37811279296875, "loss": 0.2077, "rewards/accuracies": 0.893750011920929, "rewards/chosen": 2.8511719703674316, "rewards/margins": 3.1966795921325684, "rewards/rejected": -0.34589844942092896, "step": 2870 }, { "epoch": 1.5071970688301493, "grad_norm": 78.3137435913086, "learning_rate": 1.7373262902544057e-07, "logits/chosen": -4.8740234375, "logits/rejected": -4.925000190734863, "logps/chosen": -398.5874938964844, "logps/rejected": -360.09375, "loss": 0.2454, "rewards/accuracies": 0.8656250238418579, "rewards/chosen": 2.3998045921325684, "rewards/margins": 2.9488282203674316, "rewards/rejected": -0.549023449420929, "step": 2880 }, { "epoch": 1.5124313007066212, "grad_norm": 67.20500946044922, "learning_rate": 1.7028415585692335e-07, "logits/chosen": -4.958203315734863, "logits/rejected": -4.9169921875, "logps/chosen": -396.1937561035156, "logps/rejected": -358.21875, "loss": 0.225, "rewards/accuracies": 0.90625, "rewards/chosen": 2.624218702316284, "rewards/margins": 3.110156297683716, "rewards/rejected": -0.48554688692092896, "step": 2890 }, { "epoch": 1.5176655325830934, "grad_norm": 49.70489501953125, "learning_rate": 1.668632138665071e-07, "logits/chosen": -4.911523342132568, "logits/rejected": -4.883398532867432, "logps/chosen": -371.0625, "logps/rejected": -355.46875, "loss": 0.245, "rewards/accuracies": 0.8656250238418579, "rewards/chosen": 2.466601610183716, "rewards/margins": 2.877734422683716, "rewards/rejected": -0.4115234315395355, "step": 2900 }, { "epoch": 1.5228997644595657, "grad_norm": 65.62977600097656, "learning_rate": 1.6347008870186346e-07, "logits/chosen": -4.897656440734863, "logits/rejected": -4.897656440734863, "logps/chosen": -394.2124938964844, "logps/rejected": -361.3125, "loss": 0.2328, "rewards/accuracies": 0.8843749761581421, "rewards/chosen": 2.5625, "rewards/margins": 2.998046875, "rewards/rejected": -0.43574219942092896, "step": 2910 }, { 
"epoch": 1.5281339963360376, "grad_norm": 105.50262451171875, "learning_rate": 1.6010506368796718e-07, "logits/chosen": -4.908984184265137, "logits/rejected": -4.954882621765137, "logps/chosen": -393.0562438964844, "logps/rejected": -369.7250061035156, "loss": 0.2317, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 2.283203125, "rewards/margins": 2.9546875953674316, "rewards/rejected": -0.671679675579071, "step": 2920 }, { "epoch": 1.5333682282125098, "grad_norm": 109.94066619873047, "learning_rate": 1.5676841980343852e-07, "logits/chosen": -4.994531154632568, "logits/rejected": -4.940234184265137, "logps/chosen": -404.84375, "logps/rejected": -363.1312561035156, "loss": 0.2324, "rewards/accuracies": 0.871874988079071, "rewards/chosen": 2.5113282203674316, "rewards/margins": 3.0267577171325684, "rewards/rejected": -0.515429675579071, "step": 2930 }, { "epoch": 1.538602460088982, "grad_norm": 47.26187515258789, "learning_rate": 1.5346043565708167e-07, "logits/chosen": -4.947070121765137, "logits/rejected": -4.921484470367432, "logps/chosen": -361.37188720703125, "logps/rejected": -330.3187561035156, "loss": 0.2336, "rewards/accuracies": 0.90625, "rewards/chosen": 2.348437547683716, "rewards/margins": 2.843554735183716, "rewards/rejected": -0.4951171875, "step": 2940 }, { "epoch": 1.543836691965454, "grad_norm": 71.30323028564453, "learning_rate": 1.5018138746462077e-07, "logits/chosen": -4.907422065734863, "logits/rejected": -4.978711128234863, "logps/chosen": -387.203125, "logps/rejected": -340.48748779296875, "loss": 0.2679, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 2.2132811546325684, "rewards/margins": 2.7132811546325684, "rewards/rejected": -0.5, "step": 2950 }, { "epoch": 1.5490709238419262, "grad_norm": 55.89011001586914, "learning_rate": 1.4693154902563642e-07, "logits/chosen": -4.958788871765137, "logits/rejected": -4.928320407867432, "logps/chosen": -362.15625, "logps/rejected": -326.8500061035156, "loss": 0.2161, 
"rewards/accuracies": 0.8843749761581421, "rewards/chosen": 2.3705077171325684, "rewards/margins": 2.869335889816284, "rewards/rejected": -0.49882811307907104, "step": 2960 }, { "epoch": 1.5543051557183984, "grad_norm": 82.12539672851562, "learning_rate": 1.4371119170070273e-07, "logits/chosen": -4.854589939117432, "logits/rejected": -4.9404296875, "logps/chosen": -378.0625, "logps/rejected": -340.4750061035156, "loss": 0.2285, "rewards/accuracies": 0.890625, "rewards/chosen": 2.469531297683716, "rewards/margins": 2.9546875953674316, "rewards/rejected": -0.4849609434604645, "step": 2970 }, { "epoch": 1.5595393875948704, "grad_norm": 63.915653228759766, "learning_rate": 1.4052058438873004e-07, "logits/chosen": -4.964062690734863, "logits/rejected": -4.918749809265137, "logps/chosen": -398.4125061035156, "logps/rejected": -368.5687561035156, "loss": 0.2228, "rewards/accuracies": 0.903124988079071, "rewards/chosen": 2.4234375953674316, "rewards/margins": 3.0283203125, "rewards/rejected": -0.6048828363418579, "step": 2980 }, { "epoch": 1.5647736194713424, "grad_norm": 59.789005279541016, "learning_rate": 1.3735999350451043e-07, "logits/chosen": -4.948828220367432, "logits/rejected": -4.914843559265137, "logps/chosen": -392.6187438964844, "logps/rejected": -353.04376220703125, "loss": 0.1967, "rewards/accuracies": 0.909375011920929, "rewards/chosen": 2.397656202316284, "rewards/margins": 3.0794920921325684, "rewards/rejected": -0.6820312738418579, "step": 2990 }, { "epoch": 1.5700078513478148, "grad_norm": 115.76825714111328, "learning_rate": 1.3422968295647325e-07, "logits/chosen": -5.005468845367432, "logits/rejected": -5.006249904632568, "logps/chosen": -408.4437561035156, "logps/rejected": -335.0687561035156, "loss": 0.2652, "rewards/accuracies": 0.8968750238418579, "rewards/chosen": 2.6246094703674316, "rewards/margins": 2.881640672683716, "rewards/rejected": -0.25664061307907104, "step": 3000 }, { "epoch": 1.5700078513478148, "eval_logits/chosen": 
-4.989828109741211, "eval_logits/rejected": -4.943406105041504, "eval_logps/chosen": -392.08099365234375, "eval_logps/rejected": -347.6390075683594, "eval_loss": 0.6350286602973938, "eval_rewards/accuracies": 0.6269999742507935, "eval_rewards/chosen": 1.486984372138977, "eval_rewards/margins": 1.1169062852859497, "eval_rewards/rejected": 0.37007811665534973, "eval_runtime": 245.3467, "eval_samples_per_second": 8.152, "eval_steps_per_second": 2.038, "step": 3000 }, { "epoch": 1.5752420832242868, "grad_norm": 35.10647964477539, "learning_rate": 1.3112991412464825e-07, "logits/chosen": -4.952734470367432, "logits/rejected": -4.896093845367432, "logps/chosen": -400.20623779296875, "logps/rejected": -352.34375, "loss": 0.2123, "rewards/accuracies": 0.8843749761581421, "rewards/chosen": 2.657421827316284, "rewards/margins": 3.255859375, "rewards/rejected": -0.598828136920929, "step": 3010 }, { "epoch": 1.5804763151007588, "grad_norm": 39.59328842163086, "learning_rate": 1.2806094583884114e-07, "logits/chosen": -5.006054878234863, "logits/rejected": -4.945898532867432, "logps/chosen": -384.59375, "logps/rejected": -338.83123779296875, "loss": 0.2495, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": 2.366406202316284, "rewards/margins": 2.7642579078674316, "rewards/rejected": -0.3980468809604645, "step": 3020 }, { "epoch": 1.5857105469772312, "grad_norm": 62.59682846069336, "learning_rate": 1.2502303435702043e-07, "logits/chosen": -4.912109375, "logits/rejected": -4.942187309265137, "logps/chosen": -382.54376220703125, "logps/rejected": -352.4125061035156, "loss": 0.2394, "rewards/accuracies": 0.8968750238418579, "rewards/chosen": 2.450000047683716, "rewards/margins": 3.1070313453674316, "rewards/rejected": -0.657031238079071, "step": 3030 }, { "epoch": 1.5909447788537032, "grad_norm": 66.94029998779297, "learning_rate": 1.2201643334392082e-07, "logits/chosen": -4.944531440734863, "logits/rejected": -4.9130859375, "logps/chosen": -368.76873779296875, 
"logps/rejected": -350.42498779296875, "loss": 0.2499, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 2.353320360183716, "rewards/margins": 2.7894530296325684, "rewards/rejected": -0.43574219942092896, "step": 3040 }, { "epoch": 1.5961790107301752, "grad_norm": 46.61616897583008, "learning_rate": 1.1904139384986123e-07, "logits/chosen": -4.927343845367432, "logits/rejected": -4.915820121765137, "logps/chosen": -388.59375, "logps/rejected": -345.7875061035156, "loss": 0.2505, "rewards/accuracies": 0.846875011920929, "rewards/chosen": 2.388671875, "rewards/margins": 2.7431640625, "rewards/rejected": -0.35429686307907104, "step": 3050 }, { "epoch": 1.6014132426066476, "grad_norm": 50.75476837158203, "learning_rate": 1.1609816428978359e-07, "logits/chosen": -4.940722465515137, "logits/rejected": -4.960351467132568, "logps/chosen": -423.53125, "logps/rejected": -386.4937438964844, "loss": 0.2045, "rewards/accuracies": 0.8843749761581421, "rewards/chosen": 2.556640625, "rewards/margins": 3.1878905296325684, "rewards/rejected": -0.6312500238418579, "step": 3060 }, { "epoch": 1.6066474744831196, "grad_norm": 48.430912017822266, "learning_rate": 1.1318699042250918e-07, "logits/chosen": -5.049218654632568, "logits/rejected": -4.967968940734863, "logps/chosen": -387.9125061035156, "logps/rejected": -358.3187561035156, "loss": 0.2332, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": 2.526562452316284, "rewards/margins": 2.8768553733825684, "rewards/rejected": -0.35009765625, "step": 3070 }, { "epoch": 1.6118817063595916, "grad_norm": 49.43867111206055, "learning_rate": 1.10308115330218e-07, "logits/chosen": -4.908203125, "logits/rejected": -4.865820407867432, "logps/chosen": -383.4468688964844, "logps/rejected": -334.98748779296875, "loss": 0.2408, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": 2.422656297683716, "rewards/margins": 2.9886717796325684, "rewards/rejected": -0.5654296875, "step": 3080 }, { "epoch": 1.6171159382360638, 
"grad_norm": 124.98592376708984, "learning_rate": 1.0746177939815171e-07, "logits/chosen": -5.013867378234863, "logits/rejected": -4.925000190734863, "logps/chosen": -386.38751220703125, "logps/rejected": -342.1499938964844, "loss": 0.2529, "rewards/accuracies": 0.8656250238418579, "rewards/chosen": 2.3623046875, "rewards/margins": 2.7867188453674316, "rewards/rejected": -0.4248046875, "step": 3090 }, { "epoch": 1.622350170112536, "grad_norm": 38.59086990356445, "learning_rate": 1.0464822029454179e-07, "logits/chosen": -4.906054496765137, "logits/rejected": -4.875, "logps/chosen": -389.0874938964844, "logps/rejected": -381.4750061035156, "loss": 0.2447, "rewards/accuracies": 0.856249988079071, "rewards/chosen": 2.5, "rewards/margins": 3.15234375, "rewards/rejected": -0.65234375, "step": 3100 }, { "epoch": 1.627584401989008, "grad_norm": 65.8700180053711, "learning_rate": 1.0186767295076359e-07, "logits/chosen": -4.922265529632568, "logits/rejected": -4.895312309265137, "logps/chosen": -387.57501220703125, "logps/rejected": -352.25, "loss": 0.2672, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 2.3671875, "rewards/margins": 2.696093797683716, "rewards/rejected": -0.32890623807907104, "step": 3110 }, { "epoch": 1.6328186338654802, "grad_norm": 118.82145690917969, "learning_rate": 9.91203695417201e-08, "logits/chosen": -4.96484375, "logits/rejected": -4.932421684265137, "logps/chosen": -384.9437561035156, "logps/rejected": -334.4281311035156, "loss": 0.2448, "rewards/accuracies": 0.8843749761581421, "rewards/chosen": 2.456249952316284, "rewards/margins": 2.823437452316284, "rewards/rejected": -0.3671875, "step": 3120 }, { "epoch": 1.6380528657419524, "grad_norm": 66.63347625732422, "learning_rate": 9.640653946645527e-08, "logits/chosen": -4.909375190734863, "logits/rejected": -4.900976657867432, "logps/chosen": -404.9375, "logps/rejected": -337.1000061035156, "loss": 0.2351, "rewards/accuracies": 0.875, "rewards/chosen": 2.392578125, "rewards/margins": 
2.8373045921325684, "rewards/rejected": -0.4447265565395355, "step": 3130 }, { "epoch": 1.6432870976184244, "grad_norm": 65.4285659790039, "learning_rate": 9.372640932899962e-08, "logits/chosen": -4.957617282867432, "logits/rejected": -4.933984279632568, "logps/chosen": -353.54998779296875, "logps/rejected": -326.70623779296875, "loss": 0.2412, "rewards/accuracies": 0.878125011920929, "rewards/chosen": 2.4521484375, "rewards/margins": 2.831249952316284, "rewards/rejected": -0.37890625, "step": 3140 }, { "epoch": 1.6485213294948966, "grad_norm": 54.59511947631836, "learning_rate": 9.108020291944835e-08, "logits/chosen": -4.943945407867432, "logits/rejected": -4.872460842132568, "logps/chosen": -392.54376220703125, "logps/rejected": -330.6499938964844, "loss": 0.2232, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": 2.428906202316284, "rewards/margins": 2.80859375, "rewards/rejected": -0.3794921934604645, "step": 3150 }, { "epoch": 1.6537555613713688, "grad_norm": 98.5328369140625, "learning_rate": 8.84681411952749e-08, "logits/chosen": -4.919335842132568, "logits/rejected": -4.9375, "logps/chosen": -383.03436279296875, "logps/rejected": -359.921875, "loss": 0.241, "rewards/accuracies": 0.856249988079071, "rewards/chosen": 2.5853514671325684, "rewards/margins": 2.862109422683716, "rewards/rejected": -0.2767578065395355, "step": 3160 }, { "epoch": 1.6589897932478408, "grad_norm": 78.06737518310547, "learning_rate": 8.589044226288156e-08, "logits/chosen": -4.914453029632568, "logits/rejected": -4.8916015625, "logps/chosen": -390.09375, "logps/rejected": -333.60626220703125, "loss": 0.2437, "rewards/accuracies": 0.859375, "rewards/chosen": 2.5648436546325684, "rewards/margins": 2.928515672683716, "rewards/rejected": -0.3631835877895355, "step": 3170 }, { "epoch": 1.664224025124313, "grad_norm": 108.66213989257812, "learning_rate": 8.334732135938761e-08, "logits/chosen": -4.923047065734863, "logits/rejected": -4.921093940734863, "logps/chosen": 
-376.3999938964844, "logps/rejected": -337.01251220703125, "loss": 0.2139, "rewards/accuracies": 0.893750011920929, "rewards/chosen": 2.584179639816284, "rewards/margins": 2.985546827316284, "rewards/rejected": -0.4009765684604645, "step": 3180 }, { "epoch": 1.6694582570007852, "grad_norm": 34.4354248046875, "learning_rate": 8.08389908346565e-08, "logits/chosen": -4.950781345367432, "logits/rejected": -5.0048828125, "logps/chosen": -414.5375061035156, "logps/rejected": -349.1187438964844, "loss": 0.2079, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 2.7281250953674316, "rewards/margins": 3.1263670921325684, "rewards/rejected": -0.39824217557907104, "step": 3190 }, { "epoch": 1.6746924888772572, "grad_norm": 75.2740249633789, "learning_rate": 7.836566013356521e-08, "logits/chosen": -4.90087890625, "logits/rejected": -4.908203125, "logps/chosen": -390.03125, "logps/rejected": -356.265625, "loss": 0.2179, "rewards/accuracies": 0.8843749761581421, "rewards/chosen": 2.472851514816284, "rewards/margins": 2.9048829078674316, "rewards/rejected": -0.431640625, "step": 3200 }, { "epoch": 1.6799267207537294, "grad_norm": 85.98029327392578, "learning_rate": 7.59275357785154e-08, "logits/chosen": -4.944531440734863, "logits/rejected": -4.900390625, "logps/chosen": -384.4624938964844, "logps/rejected": -348.23126220703125, "loss": 0.267, "rewards/accuracies": 0.875, "rewards/chosen": 2.4486327171325684, "rewards/margins": 2.7984375953674316, "rewards/rejected": -0.3499999940395355, "step": 3210 }, { "epoch": 1.6851609526302016, "grad_norm": 82.18426513671875, "learning_rate": 7.352482135218929e-08, "logits/chosen": -4.9853515625, "logits/rejected": -4.929101467132568, "logps/chosen": -396.23748779296875, "logps/rejected": -358.78125, "loss": 0.2489, "rewards/accuracies": 0.8656250238418579, "rewards/chosen": 2.490234375, "rewards/margins": 2.830078125, "rewards/rejected": -0.33964842557907104, "step": 3220 }, { "epoch": 1.6903951845066736, "grad_norm": 
98.95600128173828, "learning_rate": 7.115771748054994e-08, "logits/chosen": -4.972265720367432, "logits/rejected": -4.897070407867432, "logps/chosen": -402.04376220703125, "logps/rejected": -343.2437438964844, "loss": 0.2452, "rewards/accuracies": 0.875, "rewards/chosen": 2.739453077316284, "rewards/margins": 3.1927733421325684, "rewards/rejected": -0.4527343809604645, "step": 3230 }, { "epoch": 1.6956294163831458, "grad_norm": 94.96117401123047, "learning_rate": 6.882642181608938e-08, "logits/chosen": -4.9501953125, "logits/rejected": -4.890625, "logps/chosen": -377.26873779296875, "logps/rejected": -349.63751220703125, "loss": 0.2724, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 2.407031297683716, "rewards/margins": 2.749218702316284, "rewards/rejected": -0.3423828184604645, "step": 3240 }, { "epoch": 1.700863648259618, "grad_norm": 75.37129974365234, "learning_rate": 6.653112902132468e-08, "logits/chosen": -5.006445407867432, "logits/rejected": -4.967577934265137, "logps/chosen": -376.54998779296875, "logps/rejected": -356.1031188964844, "loss": 0.219, "rewards/accuracies": 0.903124988079071, "rewards/chosen": 2.551953077316284, "rewards/margins": 3.0953125953674316, "rewards/rejected": -0.543749988079071, "step": 3250 }, { "epoch": 1.70609788013609, "grad_norm": 66.41917419433594, "learning_rate": 6.427203075254389e-08, "logits/chosen": -4.857226371765137, "logits/rejected": -4.83203125, "logps/chosen": -362.30938720703125, "logps/rejected": -308.640625, "loss": 0.2841, "rewards/accuracies": 0.859375, "rewards/chosen": 2.074414014816284, "rewards/margins": 2.5992188453674316, "rewards/rejected": -0.5248047113418579, "step": 3260 }, { "epoch": 1.7113321120125622, "grad_norm": 118.2750244140625, "learning_rate": 6.204931564380212e-08, "logits/chosen": -5.010156154632568, "logits/rejected": -4.974413871765137, "logps/chosen": -409.45623779296875, "logps/rejected": -380.59375, "loss": 0.2533, "rewards/accuracies": 0.875, "rewards/chosen": 
2.5443358421325684, "rewards/margins": 3.0205078125, "rewards/rejected": -0.4761718809604645, "step": 3270 }, { "epoch": 1.7165663438890344, "grad_norm": 54.80821990966797, "learning_rate": 5.98631692911713e-08, "logits/chosen": -4.957226753234863, "logits/rejected": -4.9169921875, "logps/chosen": -397.625, "logps/rejected": -335.5062561035156, "loss": 0.2602, "rewards/accuracies": 0.8656250238418579, "rewards/chosen": 2.584765672683716, "rewards/margins": 2.8857421875, "rewards/rejected": -0.30078125, "step": 3280 }, { "epoch": 1.7218005757655064, "grad_norm": 45.44811248779297, "learning_rate": 5.7713774237242716e-08, "logits/chosen": null, "logits/rejected": -4.859765529632568, "logps/chosen": -392.25, "logps/rejected": -349.61248779296875, "loss": 0.2278, "rewards/accuracies": 0.878125011920929, "rewards/chosen": 2.603515625, "rewards/margins": 2.9478516578674316, "rewards/rejected": -0.3443359434604645, "step": 3290 }, { "epoch": 1.7270348076419786, "grad_norm": 53.89841842651367, "learning_rate": 5.5601309955884965e-08, "logits/chosen": -4.878320217132568, "logits/rejected": -4.924218654632568, "logps/chosen": -378.07501220703125, "logps/rejected": -338.28125, "loss": 0.2499, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 2.169921875, "rewards/margins": 2.637500047683716, "rewards/rejected": -0.4673828184604645, "step": 3300 }, { "epoch": 1.7322690395184508, "grad_norm": 62.9951286315918, "learning_rate": 5.352595283725758e-08, "logits/chosen": -4.916406154632568, "logits/rejected": -4.887890815734863, "logps/chosen": -391.1875, "logps/rejected": -339.20001220703125, "loss": 0.2225, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.575000047683716, "rewards/margins": 2.950976610183716, "rewards/rejected": -0.3759765625, "step": 3310 }, { "epoch": 1.7375032713949228, "grad_norm": 61.876556396484375, "learning_rate": 5.1487876173082704e-08, "logits/chosen": -4.929296970367432, "logits/rejected": -4.953515529632568, "logps/chosen": 
-397.4937438964844, "logps/rejected": -363.34375, "loss": 0.2382, "rewards/accuracies": 0.893750011920929, "rewards/chosen": 2.44921875, "rewards/margins": 2.8521485328674316, "rewards/rejected": -0.4029296934604645, "step": 3320 }, { "epoch": 1.7427375032713948, "grad_norm": 59.831703186035156, "learning_rate": 4.948725014217514e-08, "logits/chosen": -4.898828029632568, "logits/rejected": -4.838086128234863, "logps/chosen": -391.26873779296875, "logps/rejected": -333.70623779296875, "loss": 0.2382, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": 2.5179686546325684, "rewards/margins": 2.8492188453674316, "rewards/rejected": -0.3316406309604645, "step": 3330 }, { "epoch": 1.7479717351478672, "grad_norm": 24.349123001098633, "learning_rate": 4.752424179623299e-08, "logits/chosen": -5.004101753234863, "logits/rejected": -4.898828029632568, "logps/chosen": -371.3500061035156, "logps/rejected": -373.1000061035156, "loss": 0.2499, "rewards/accuracies": 0.875, "rewards/chosen": 2.662109375, "rewards/margins": 3.157421827316284, "rewards/rejected": -0.49531251192092896, "step": 3340 }, { "epoch": 1.7532059670243392, "grad_norm": 67.24871826171875, "learning_rate": 4.559901504588809e-08, "logits/chosen": -4.942187309265137, "logits/rejected": -4.926562309265137, "logps/chosen": -390.48126220703125, "logps/rejected": -322.0062561035156, "loss": 0.2214, "rewards/accuracies": 0.8656250238418579, "rewards/chosen": 2.580273389816284, "rewards/margins": 3.0230469703674316, "rewards/rejected": -0.44316405057907104, "step": 3350 }, { "epoch": 1.7584401989008112, "grad_norm": 42.043582916259766, "learning_rate": 4.371173064702011e-08, "logits/chosen": -4.916796684265137, "logits/rejected": -4.927929878234863, "logps/chosen": -344.45001220703125, "logps/rejected": -342.26251220703125, "loss": 0.2431, "rewards/accuracies": 0.875, "rewards/chosen": 2.260546922683716, "rewards/margins": 2.7464842796325684, "rewards/rejected": -0.4857421815395355, "step": 3360 }, { "epoch": 
1.7636744307772836, "grad_norm": 54.07103729248047, "learning_rate": 4.1862546187333145e-08, "logits/chosen": -4.983788967132568, "logits/rejected": -4.91650390625, "logps/chosen": -406.0, "logps/rejected": -342.625, "loss": 0.2161, "rewards/accuracies": 0.893750011920929, "rewards/chosen": 2.5542969703674316, "rewards/margins": 2.9365234375, "rewards/rejected": -0.3822265565395355, "step": 3370 }, { "epoch": 1.7689086626537556, "grad_norm": 121.5853271484375, "learning_rate": 4.005161607319746e-08, "logits/chosen": -4.967382907867432, "logits/rejected": -4.864843845367432, "logps/chosen": -390.23748779296875, "logps/rejected": -356.6499938964844, "loss": 0.2221, "rewards/accuracies": 0.903124988079071, "rewards/chosen": 2.435742139816284, "rewards/margins": 3.141406297683716, "rewards/rejected": -0.7056640386581421, "step": 3380 }, { "epoch": 1.7741428945302276, "grad_norm": 75.98016357421875, "learning_rate": 3.827909151675651e-08, "logits/chosen": -4.952929496765137, "logits/rejected": -4.971093654632568, "logps/chosen": -374.4937438964844, "logps/rejected": -349.3500061035156, "loss": 0.2275, "rewards/accuracies": 0.875, "rewards/chosen": 2.5048828125, "rewards/margins": 2.796679735183716, "rewards/rejected": -0.29179686307907104, "step": 3390 }, { "epoch": 1.7793771264067, "grad_norm": 120.496826171875, "learning_rate": 3.6545120523300554e-08, "logits/chosen": -4.9609375, "logits/rejected": -4.908203125, "logps/chosen": -379.7875061035156, "logps/rejected": -331.6937561035156, "loss": 0.2741, "rewards/accuracies": 0.856249988079071, "rewards/chosen": 2.555468797683716, "rewards/margins": 3.017382860183716, "rewards/rejected": -0.46210938692092896, "step": 3400 }, { "epoch": 1.784611358283172, "grad_norm": 79.13348388671875, "learning_rate": 3.484984787890854e-08, "logits/chosen": -4.980859279632568, "logits/rejected": -4.9609375, "logps/chosen": -387.2437438964844, "logps/rejected": -349.83123779296875, "loss": 0.2493, "rewards/accuracies": 0.846875011920929, 
"rewards/chosen": 2.3951172828674316, "rewards/margins": 2.8138670921325684, "rewards/rejected": -0.41874998807907104, "step": 3410 }, { "epoch": 1.789845590159644, "grad_norm": 63.74520492553711, "learning_rate": 3.3193415138358605e-08, "logits/chosen": -4.97265625, "logits/rejected": -4.861132621765137, "logps/chosen": -390.73748779296875, "logps/rejected": -329.20001220703125, "loss": 0.2348, "rewards/accuracies": 0.846875011920929, "rewards/chosen": 2.45703125, "rewards/margins": 2.834179639816284, "rewards/rejected": -0.3773437440395355, "step": 3420 }, { "epoch": 1.7950798220361162, "grad_norm": 75.20218658447266, "learning_rate": 3.1575960613307697e-08, "logits/chosen": -4.920507907867432, "logits/rejected": -4.9482421875, "logps/chosen": -347.2875061035156, "logps/rejected": -305.08123779296875, "loss": 0.2721, "rewards/accuracies": 0.871874988079071, "rewards/chosen": 2.2958984375, "rewards/margins": 2.6490235328674316, "rewards/rejected": -0.3529296815395355, "step": 3430 }, { "epoch": 1.8003140539125884, "grad_norm": 74.88561248779297, "learning_rate": 2.99976193607433e-08, "logits/chosen": -5.017382621765137, "logits/rejected": -4.966406345367432, "logps/chosen": -423.6000061035156, "logps/rejected": -394.14373779296875, "loss": 0.2184, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": 2.6128907203674316, "rewards/margins": 3.1011719703674316, "rewards/rejected": -0.4876953065395355, "step": 3440 }, { "epoch": 1.8055482857890603, "grad_norm": 36.27398681640625, "learning_rate": 2.8458523171705606e-08, "logits/chosen": -4.9306640625, "logits/rejected": -4.905077934265137, "logps/chosen": -369.2875061035156, "logps/rejected": -346.61248779296875, "loss": 0.2271, "rewards/accuracies": 0.875, "rewards/chosen": 2.598437547683716, "rewards/margins": 2.9615235328674316, "rewards/rejected": -0.36308592557907104, "step": 3450 }, { "epoch": 1.8107825176655326, "grad_norm": 106.27587890625, "learning_rate": 2.6958800560283766e-08, "logits/chosen": 
-4.945116996765137, "logits/rejected": -4.926171779632568, "logps/chosen": -385.1656188964844, "logps/rejected": -366.28125, "loss": 0.225, "rewards/accuracies": 0.8968750238418579, "rewards/chosen": 2.4693360328674316, "rewards/margins": 3.021484375, "rewards/rejected": -0.5521484613418579, "step": 3460 }, { "epoch": 1.8160167495420048, "grad_norm": 114.32062530517578, "learning_rate": 2.5498576752884083e-08, "logits/chosen": -4.911523342132568, "logits/rejected": -4.808398246765137, "logps/chosen": -370.59375, "logps/rejected": -318.5, "loss": 0.2369, "rewards/accuracies": 0.875, "rewards/chosen": 2.5111327171325684, "rewards/margins": 2.847851514816284, "rewards/rejected": -0.3369140625, "step": 3470 }, { "epoch": 1.8212509814184767, "grad_norm": 93.86100769042969, "learning_rate": 2.4077973677774255e-08, "logits/chosen": -4.959179878234863, "logits/rejected": -4.963086128234863, "logps/chosen": -432.6625061035156, "logps/rejected": -376.3187561035156, "loss": 0.2782, "rewards/accuracies": 0.840624988079071, "rewards/chosen": 2.467578172683716, "rewards/margins": 2.8101563453674316, "rewards/rejected": -0.34257811307907104, "step": 3480 }, { "epoch": 1.826485213294949, "grad_norm": 83.02394104003906, "learning_rate": 2.2697109954902262e-08, "logits/chosen": -4.8544921875, "logits/rejected": -4.8017578125, "logps/chosen": -372.0625, "logps/rejected": -348.2124938964844, "loss": 0.2225, "rewards/accuracies": 0.8968750238418579, "rewards/chosen": 2.44140625, "rewards/margins": 3.040820360183716, "rewards/rejected": -0.5992187261581421, "step": 3490 }, { "epoch": 1.8317194451714212, "grad_norm": 94.27802276611328, "learning_rate": 2.13561008859916e-08, "logits/chosen": -4.964257717132568, "logits/rejected": -4.963476657867432, "logps/chosen": -389.70623779296875, "logps/rejected": -330.51251220703125, "loss": 0.2423, "rewards/accuracies": 0.871874988079071, "rewards/chosen": 2.515820264816284, "rewards/margins": 2.8675780296325684, "rewards/rejected": 
-0.3521484434604645, "step": 3500 }, { "epoch": 1.8369536770478931, "grad_norm": 58.17540740966797, "learning_rate": 2.0055058444913507e-08, "logits/chosen": -4.898633003234863, "logits/rejected": -4.909570217132568, "logps/chosen": -355.20001220703125, "logps/rejected": -337.36248779296875, "loss": 0.2599, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 2.283398389816284, "rewards/margins": 2.8490233421325684, "rewards/rejected": -0.5658203363418579, "step": 3510 }, { "epoch": 1.8421879089243653, "grad_norm": 59.796470642089844, "learning_rate": 1.879409126833753e-08, "logits/chosen": -4.994336128234863, "logits/rejected": -5.017578125, "logps/chosen": -386.12188720703125, "logps/rejected": -335.0687561035156, "loss": 0.2146, "rewards/accuracies": 0.90625, "rewards/chosen": 2.4814453125, "rewards/margins": 2.8916015625, "rewards/rejected": -0.41015625, "step": 3520 }, { "epoch": 1.8474221408008376, "grad_norm": 45.44034194946289, "learning_rate": 1.757330464665996e-08, "logits/chosen": -4.912499904632568, "logits/rejected": -4.899609565734863, "logps/chosen": -372.46875, "logps/rejected": -357.98748779296875, "loss": 0.2328, "rewards/accuracies": 0.871874988079071, "rewards/chosen": 2.476757764816284, "rewards/margins": 3.15625, "rewards/rejected": -0.6800781488418579, "step": 3530 }, { "epoch": 1.8526563726773095, "grad_norm": 65.31453704833984, "learning_rate": 1.639280051521241e-08, "logits/chosen": -4.944531440734863, "logits/rejected": -4.916601657867432, "logps/chosen": -399.8125, "logps/rejected": -339.3999938964844, "loss": 0.227, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 2.487109422683716, "rewards/margins": 2.870312452316284, "rewards/rejected": -0.3832031190395355, "step": 3540 }, { "epoch": 1.8578906045537817, "grad_norm": 51.45854949951172, "learning_rate": 1.525267744575015e-08, "logits/chosen": -4.949414253234863, "logits/rejected": -4.865038871765137, "logps/chosen": -404.1812438964844, "logps/rejected": 
-366.2250061035156, "loss": 0.2607, "rewards/accuracies": 0.859375, "rewards/chosen": 2.47265625, "rewards/margins": 2.949414014816284, "rewards/rejected": -0.4769531190395355, "step": 3550 }, { "epoch": 1.863124836430254, "grad_norm": 60.1088981628418, "learning_rate": 1.4153030638221375e-08, "logits/chosen": -4.958203315734863, "logits/rejected": -4.958203315734863, "logps/chosen": -368.4125061035156, "logps/rejected": -341.81561279296875, "loss": 0.2381, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": 2.4730467796325684, "rewards/margins": 2.8033204078674316, "rewards/rejected": -0.3306640684604645, "step": 3560 }, { "epoch": 1.868359068306726, "grad_norm": 41.17192077636719, "learning_rate": 1.309395191281798e-08, "logits/chosen": -4.960741996765137, "logits/rejected": -4.969336032867432, "logps/chosen": -402.375, "logps/rejected": -334.3374938964844, "loss": 0.2175, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": 2.686328172683716, "rewards/margins": 3.005664110183716, "rewards/rejected": -0.3189453184604645, "step": 3570 }, { "epoch": 1.8735933001831981, "grad_norm": 94.69064331054688, "learning_rate": 1.207552970230885e-08, "logits/chosen": -4.971484184265137, "logits/rejected": -4.9296875, "logps/chosen": -366.25, "logps/rejected": -353.0687561035156, "loss": 0.2567, "rewards/accuracies": 0.871874988079071, "rewards/chosen": 2.3587889671325684, "rewards/margins": 2.6753907203674316, "rewards/rejected": -0.31621092557907104, "step": 3580 }, { "epoch": 1.8788275320596703, "grad_norm": 88.20918273925781, "learning_rate": 1.1097849044655494e-08, "logits/chosen": -4.822656154632568, "logits/rejected": -4.860449314117432, "logps/chosen": -357.83123779296875, "logps/rejected": -325.3687438964844, "loss": 0.2512, "rewards/accuracies": 0.856249988079071, "rewards/chosen": 2.359375, "rewards/margins": 2.7056641578674316, "rewards/rejected": -0.3462890684604645, "step": 3590 }, { "epoch": 1.8840617639361423, "grad_norm": 46.47334671020508, 
"learning_rate": 1.0160991575911382e-08, "logits/chosen": -4.903906345367432, "logits/rejected": -4.89453125, "logps/chosen": -379.4375, "logps/rejected": -340.1000061035156, "loss": 0.2449, "rewards/accuracies": 0.8843749761581421, "rewards/chosen": 2.5462889671325684, "rewards/margins": 2.9056639671325684, "rewards/rejected": -0.359375, "step": 3600 }, { "epoch": 1.8892959958126145, "grad_norm": 41.36199188232422, "learning_rate": 9.265035523405628e-09, "logits/chosen": -4.874609470367432, "logits/rejected": -4.875, "logps/chosen": -371.29998779296875, "logps/rejected": -315.36248779296875, "loss": 0.2477, "rewards/accuracies": 0.871874988079071, "rewards/chosen": 2.414257764816284, "rewards/margins": 2.7884764671325684, "rewards/rejected": -0.37421876192092896, "step": 3610 }, { "epoch": 1.8945302276890867, "grad_norm": 68.25157928466797, "learning_rate": 8.410055699210716e-09, "logits/chosen": -4.889062404632568, "logits/rejected": -4.949999809265137, "logps/chosen": -341.4937438964844, "logps/rejected": -336.9624938964844, "loss": 0.2711, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": 2.271679639816284, "rewards/margins": 2.66015625, "rewards/rejected": -0.388671875, "step": 3620 }, { "epoch": 1.8997644595655587, "grad_norm": 79.20165252685547, "learning_rate": 7.59612349389599e-09, "logits/chosen": -4.876562595367432, "logits/rejected": -4.8720703125, "logps/chosen": -371.63751220703125, "logps/rejected": -317.72967529296875, "loss": 0.2545, "rewards/accuracies": 0.859375, "rewards/chosen": 2.3128905296325684, "rewards/margins": 2.7681641578674316, "rewards/rejected": -0.4554687440395355, "step": 3630 }, { "epoch": 1.904998691442031, "grad_norm": 45.96782684326172, "learning_rate": 6.823306870566314e-09, "logits/chosen": -4.975195407867432, "logits/rejected": -4.934765815734863, "logps/chosen": -401.2124938964844, "logps/rejected": -324.96875, "loss": 0.2331, "rewards/accuracies": 0.8968750238418579, "rewards/chosen": 2.505859375, 
"rewards/margins": 2.9339842796325684, "rewards/rejected": -0.42792969942092896, "step": 3640 }, { "epoch": 1.9102329233185031, "grad_norm": 90.01760864257812, "learning_rate": 6.0916703591873396e-09, "logits/chosen": -4.998242378234863, "logits/rejected": -4.94140625, "logps/chosen": -351.2437438964844, "logps/rejected": -334.15625, "loss": 0.227, "rewards/accuracies": 0.8968750238418579, "rewards/chosen": 2.326171875, "rewards/margins": 2.830078125, "rewards/rejected": -0.50390625, "step": 3650 }, { "epoch": 1.9154671551949751, "grad_norm": 102.63104248046875, "learning_rate": 5.401275051197196e-09, "logits/chosen": -4.8974609375, "logits/rejected": -4.918554782867432, "logps/chosen": -359.1000061035156, "logps/rejected": -321.32501220703125, "loss": 0.2518, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": 2.328320264816284, "rewards/margins": 2.5953125953674316, "rewards/rejected": -0.2671875059604645, "step": 3660 }, { "epoch": 1.920701387071447, "grad_norm": 117.56713104248047, "learning_rate": 4.752178594405465e-09, "logits/chosen": -4.902441501617432, "logits/rejected": -4.883593559265137, "logps/chosen": -379.7093811035156, "logps/rejected": -351.51251220703125, "loss": 0.2209, "rewards/accuracies": 0.8843749761581421, "rewards/chosen": 2.631054639816284, "rewards/margins": 3.0482420921325684, "rewards/rejected": -0.41679686307907104, "step": 3670 }, { "epoch": 1.9259356189479195, "grad_norm": 93.31014251708984, "learning_rate": 4.144435188179529e-09, "logits/chosen": -4.938086032867432, "logits/rejected": -4.873632907867432, "logps/chosen": -383.0, "logps/rejected": -348.78125, "loss": 0.2677, "rewards/accuracies": 0.8343750238418579, "rewards/chosen": 2.305859327316284, "rewards/margins": 2.5205078125, "rewards/rejected": -0.21435546875, "step": 3680 }, { "epoch": 1.9311698508243915, "grad_norm": 84.78623962402344, "learning_rate": 3.5780955789187497e-09, "logits/chosen": -4.940820217132568, "logits/rejected": -4.899218559265137, 
"logps/chosen": -383.96563720703125, "logps/rejected": -362.0375061035156, "loss": 0.2531, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": 2.4800782203674316, "rewards/margins": 2.891406297683716, "rewards/rejected": -0.41132813692092896, "step": 3690 }, { "epoch": 1.9364040827008635, "grad_norm": 49.219505310058594, "learning_rate": 3.0532070558177415e-09, "logits/chosen": -4.896093845367432, "logits/rejected": -4.931250095367432, "logps/chosen": -377.6000061035156, "logps/rejected": -343.4125061035156, "loss": 0.245, "rewards/accuracies": 0.846875011920929, "rewards/chosen": 2.5425782203674316, "rewards/margins": 2.929492235183716, "rewards/rejected": -0.3873046934604645, "step": 3700 }, { "epoch": 1.941638314577336, "grad_norm": 78.91031646728516, "learning_rate": 2.5698134469169243e-09, "logits/chosen": -4.953125, "logits/rejected": -4.934374809265137, "logps/chosen": -385.0218811035156, "logps/rejected": -330.13751220703125, "loss": 0.2366, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 2.4019532203674316, "rewards/margins": 2.8238282203674316, "rewards/rejected": -0.421875, "step": 3710 }, { "epoch": 1.946872546453808, "grad_norm": 57.12163162231445, "learning_rate": 2.127955115443725e-09, "logits/chosen": -4.944531440734863, "logits/rejected": -4.906445503234863, "logps/chosen": -369.9312438964844, "logps/rejected": -337.4125061035156, "loss": 0.2577, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": 2.2890625, "rewards/margins": 2.8275389671325684, "rewards/rejected": -0.538867175579071, "step": 3720 }, { "epoch": 1.95210677833028, "grad_norm": 78.6098403930664, "learning_rate": 1.727668956441497e-09, "logits/chosen": -4.959374904632568, "logits/rejected": -4.991406440734863, "logps/chosen": -399.7437438964844, "logps/rejected": -331.73748779296875, "loss": 0.2523, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": 2.375, "rewards/margins": 2.626171827316284, "rewards/rejected": -0.25117188692092896, "step": 3730 
}, { "epoch": 1.957341010206752, "grad_norm": 55.76271057128906, "learning_rate": 1.3689883936894298e-09, "logits/chosen": -4.943359375, "logits/rejected": -4.986132621765137, "logps/chosen": -409.4437561035156, "logps/rejected": -367.39373779296875, "loss": 0.2241, "rewards/accuracies": 0.878125011920929, "rewards/chosen": 2.6058592796325684, "rewards/margins": 3.0054688453674316, "rewards/rejected": -0.4000000059604645, "step": 3740 }, { "epoch": 1.9625752420832243, "grad_norm": 111.84916687011719, "learning_rate": 1.051943376911224e-09, "logits/chosen": -5.040234565734863, "logits/rejected": -4.931445121765137, "logps/chosen": -404.1187438964844, "logps/rejected": -339.7437438964844, "loss": 0.2449, "rewards/accuracies": 0.856249988079071, "rewards/chosen": 2.505664110183716, "rewards/margins": 2.7621092796325684, "rewards/rejected": -0.2568359375, "step": 3750 }, { "epoch": 1.9678094739596963, "grad_norm": 59.06704330444336, "learning_rate": 7.765603792745934e-10, "logits/chosen": -4.946484565734863, "logits/rejected": -4.961523532867432, "logps/chosen": -389.45623779296875, "logps/rejected": -345.01873779296875, "loss": 0.2778, "rewards/accuracies": 0.890625, "rewards/chosen": 2.509765625, "rewards/margins": 2.9810547828674316, "rewards/rejected": -0.47148436307907104, "step": 3760 }, { "epoch": 1.9730437058361685, "grad_norm": 92.71817779541016, "learning_rate": 5.428623951805322e-10, "logits/chosen": -4.9384765625, "logits/rejected": -4.805273532867432, "logps/chosen": -378.29376220703125, "logps/rejected": -325.1000061035156, "loss": 0.2532, "rewards/accuracies": 0.859375, "rewards/chosen": 2.3115234375, "rewards/margins": 2.801562547683716, "rewards/rejected": -0.49003905057907104, "step": 3770 }, { "epoch": 1.9782779377126407, "grad_norm": 43.97341537475586, "learning_rate": 3.508689383435182e-10, "logits/chosen": -4.916211128234863, "logits/rejected": null, "logps/chosen": -366.04998779296875, "logps/rejected": -316.4125061035156, "loss": 0.2257, 
"rewards/accuracies": 0.875, "rewards/chosen": 2.4408202171325684, "rewards/margins": 2.8275389671325684, "rewards/rejected": -0.38671875, "step": 3780 }, { "epoch": 1.9835121695891127, "grad_norm": 45.00206756591797, "learning_rate": 2.0059604016192665e-10, "logits/chosen": -4.951952934265137, "logits/rejected": -4.9853515625, "logps/chosen": -430.82501220703125, "logps/rejected": -361.9375, "loss": 0.2083, "rewards/accuracies": 0.9156249761581421, "rewards/chosen": 2.6509766578674316, "rewards/margins": 3.0660157203674316, "rewards/rejected": -0.4150390625, "step": 3790 }, { "epoch": 1.988746401465585, "grad_norm": 40.261680603027344, "learning_rate": 9.205624837949066e-11, "logits/chosen": -4.899023532867432, "logits/rejected": -4.893945217132568, "logps/chosen": -383.76873779296875, "logps/rejected": -349.4312438964844, "loss": 0.2323, "rewards/accuracies": 0.8968750238418579, "rewards/chosen": 2.610546827316284, "rewards/margins": 3.113085985183716, "rewards/rejected": -0.5029296875, "step": 3800 }, { "epoch": 1.993980633342057, "grad_norm": 101.87580871582031, "learning_rate": 2.5258626037638618e-11, "logits/chosen": -5.003710746765137, "logits/rejected": -4.898633003234863, "logps/chosen": -397.82501220703125, "logps/rejected": -342.40625, "loss": 0.2884, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": 2.4287109375, "rewards/margins": 2.8919920921325684, "rewards/rejected": -0.4634765684604645, "step": 3810 }, { "epoch": 1.999214865218529, "grad_norm": 106.40267944335938, "learning_rate": 2.087507185999371e-13, "logits/chosen": -4.965624809265137, "logits/rejected": -4.913671970367432, "logps/chosen": -391.70623779296875, "logps/rejected": -372.9375, "loss": 0.2534, "rewards/accuracies": 0.8656250238418579, "rewards/chosen": 2.5560545921325684, "rewards/margins": 2.908203125, "rewards/rejected": -0.3519531190395355, "step": 3820 }, { "epoch": 1.999214865218529, "step": 3820, "total_flos": 0.0, "train_loss": 0.449998592206945, "train_runtime": 
37572.0632, "train_samples_per_second": 3.254, "train_steps_per_second": 0.102 } ], "logging_steps": 10, "max_steps": 3820, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }