{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 100, "global_step": 2500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0004, "grad_norm": 7034011.299815918, "learning_rate": 2e-09, "logits/chosen": -2.3609464168548584, "logits/rejected": -2.4021644592285156, "logps/chosen": -72.32479858398438, "logps/rejected": -106.78115844726562, "loss": 138817.4219, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.004, "grad_norm": 5234909.800991719, "learning_rate": 2e-08, "logits/chosen": -2.3249125480651855, "logits/rejected": -2.3054325580596924, "logps/chosen": -70.72610473632812, "logps/rejected": -68.99564361572266, "loss": 125594.3333, "rewards/accuracies": 0.375, "rewards/chosen": 6.445489361794898e-06, "rewards/margins": -2.8922620913363062e-05, "rewards/rejected": 3.536810982041061e-05, "step": 10 }, { "epoch": 0.008, "grad_norm": 6739339.394495674, "learning_rate": 4e-08, "logits/chosen": -2.3423686027526855, "logits/rejected": -2.3319311141967773, "logps/chosen": -72.6821060180664, "logps/rejected": -76.68476867675781, "loss": 128657.6, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 7.994120824150741e-05, "rewards/margins": 0.00015745378914289176, "rewards/rejected": -7.751256634946913e-05, "step": 20 }, { "epoch": 0.012, "grad_norm": 6270796.093737289, "learning_rate": 6e-08, "logits/chosen": -2.3667407035827637, "logits/rejected": -2.366872549057007, "logps/chosen": -86.75081634521484, "logps/rejected": -96.1201171875, "loss": 129234.2, "rewards/accuracies": 0.5, "rewards/chosen": -9.695839253254235e-05, "rewards/margins": -2.3627610062249005e-05, "rewards/rejected": -7.333078247029334e-05, "step": 30 }, { "epoch": 0.016, "grad_norm": 6487964.338687279, "learning_rate": 8e-08, "logits/chosen": -2.330949068069458, "logits/rejected": -2.304487466812134, "logps/chosen": -70.66746520996094, "logps/rejected": -76.26786804199219, "loss": 132677.1375, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.00011891069880221039, "rewards/margins": 2.9055625418550335e-05, "rewards/rejected": -0.00014796630421187729, "step": 40 }, { "epoch": 0.02, "grad_norm": 5808382.473785103, "learning_rate": 1e-07, "logits/chosen": -2.3761391639709473, "logits/rejected": -2.4001965522766113, "logps/chosen": -64.84712219238281, "logps/rejected": -85.47789001464844, "loss": 131065.9, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.00034615895128808916, "rewards/margins": 0.00022984863608144224, "rewards/rejected": -0.0005760076455771923, "step": 50 }, { "epoch": 0.024, "grad_norm": 4282982.05143524, "learning_rate": 1.2e-07, "logits/chosen": -2.3628551959991455, "logits/rejected": -2.325425386428833, "logps/chosen": -76.96721649169922, "logps/rejected": -81.25682067871094, "loss": 126675.9375, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.0004624463617801666, "rewards/margins": 0.0005581938894465566, "rewards/rejected": -0.001020640367642045, "step": 60 }, { "epoch": 0.028, "grad_norm": 7692791.228759594, "learning_rate": 1.4e-07, "logits/chosen": -2.3956987857818604, "logits/rejected": -2.4127683639526367, "logps/chosen": -71.30229187011719, "logps/rejected": -74.56432342529297, "loss": 134539.0625, "rewards/accuracies": 0.5625, "rewards/chosen": -0.0008852133760228753, "rewards/margins": 9.680164657766e-05, "rewards/rejected": -0.0009820150444284081, "step": 70 }, { "epoch": 0.032, "grad_norm": 5583562.2282863185, "learning_rate": 1.6e-07, "logits/chosen": -2.3956503868103027, "logits/rejected": -2.350037097930908, "logps/chosen": -75.2908706665039, "logps/rejected": -77.54324340820312, "loss": 125353.275, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.0011667849030345678, "rewards/margins": 0.00017681324970908463, "rewards/rejected": -0.0013435978908091784, "step": 80 }, { "epoch": 0.036, "grad_norm": 7411307.114758692, "learning_rate": 1.8e-07, "logits/chosen": -2.3411900997161865, "logits/rejected": -2.32747220993042, "logps/chosen": -76.68790435791016, "logps/rejected": -77.229736328125, "loss": 134888.05, "rewards/accuracies": 0.5, "rewards/chosen": -0.0010598390363156796, "rewards/margins": 0.00033332061138935387, "rewards/rejected": -0.001393159618601203, "step": 90 }, { "epoch": 0.04, "grad_norm": 6105565.361340655, "learning_rate": 2e-07, "logits/chosen": -2.2321219444274902, "logits/rejected": -2.25978684425354, "logps/chosen": -69.29805755615234, "logps/rejected": -70.91548156738281, "loss": 128186.8375, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.0023287434596568346, "rewards/margins": 0.0005028151208534837, "rewards/rejected": -0.0028315584640949965, "step": 100 }, { "epoch": 0.044, "grad_norm": 7344396.598489226, "learning_rate": 2.1999999999999998e-07, "logits/chosen": -2.1552886962890625, "logits/rejected": -2.167569637298584, "logps/chosen": -70.13446044921875, "logps/rejected": -86.2125015258789, "loss": 129394.525, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.005608140490949154, "rewards/margins": 0.004356575198471546, "rewards/rejected": -0.009964716620743275, "step": 110 }, { "epoch": 0.048, "grad_norm": 9496198.88633965, "learning_rate": 2.4e-07, "logits/chosen": -1.9701220989227295, "logits/rejected": -1.9230693578720093, "logps/chosen": -110.21456146240234, "logps/rejected": -117.58609771728516, "loss": 129791.4125, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.01813032478094101, "rewards/margins": 0.003227741690352559, "rewards/rejected": -0.02135806903243065, "step": 120 }, { "epoch": 0.052, "grad_norm": 10004391.66976001, "learning_rate": 2.6e-07, "logits/chosen": -2.015622854232788, "logits/rejected": -2.026458740234375, "logps/chosen": -100.22117614746094, "logps/rejected": -107.8635482788086, "loss": 129000.775, "rewards/accuracies": 0.625, "rewards/chosen": -0.015937697142362595, "rewards/margins": 0.008421100676059723, "rewards/rejected": -0.024358797818422318, "step": 130 }, { "epoch": 0.056, "grad_norm": 10735198.371540312, "learning_rate": 2.8e-07, "logits/chosen": -1.9458515644073486, "logits/rejected": -1.964914321899414, "logps/chosen": -100.13922882080078, "logps/rejected": -123.00533294677734, "loss": 132137.4875, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.024147292599081993, "rewards/margins": 0.011009057983756065, "rewards/rejected": -0.03515635430812836, "step": 140 }, { "epoch": 0.06, "grad_norm": 7516435.594198061, "learning_rate": 3e-07, "logits/chosen": -1.9916763305664062, "logits/rejected": -1.9970576763153076, "logps/chosen": -89.16957092285156, "logps/rejected": -104.74922180175781, "loss": 125561.5375, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.020500652492046356, "rewards/margins": 0.011713030748069286, "rewards/rejected": -0.03221368417143822, "step": 150 }, { "epoch": 0.064, "grad_norm": 7027700.618512748, "learning_rate": 3.2e-07, "logits/chosen": -2.159398078918457, "logits/rejected": -2.1420938968658447, "logps/chosen": -76.12110900878906, "logps/rejected": -94.0234603881836, "loss": 124492.5625, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.014212280511856079, "rewards/margins": 0.01188136450946331, "rewards/rejected": -0.02609364315867424, "step": 160 }, { "epoch": 0.068, "grad_norm": 8112303.913406089, "learning_rate": 3.4000000000000003e-07, "logits/chosen": -2.103625774383545, "logits/rejected": -2.0622053146362305, "logps/chosen": -101.08997344970703, "logps/rejected": -129.8938446044922, "loss": 120661.3375, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.013411534950137138, "rewards/margins": 0.01583944633603096, "rewards/rejected": -0.029250985011458397, "step": 170 }, { "epoch": 0.072, "grad_norm": 8237392.210524994, "learning_rate": 3.6e-07, "logits/chosen": -2.1253855228424072, "logits/rejected": -2.123330593109131, "logps/chosen": -80.26612854003906, "logps/rejected": -116.00606536865234, "loss": 125813.875, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.014310337603092194, "rewards/margins": 0.027810264378786087, "rewards/rejected": -0.04212059825658798, "step": 180 }, { "epoch": 0.076, "grad_norm": 8482656.343559477, "learning_rate": 3.7999999999999996e-07, "logits/chosen": -2.122274875640869, "logits/rejected": -2.0924127101898193, "logps/chosen": -70.27191162109375, "logps/rejected": -88.20128631591797, "loss": 122274.4125, "rewards/accuracies": 0.6875, "rewards/chosen": -0.011920216493308544, "rewards/margins": 0.012561318464577198, "rewards/rejected": -0.02448153682053089, "step": 190 }, { "epoch": 0.08, "grad_norm": 15995751.957306068, "learning_rate": 4e-07, "logits/chosen": -2.16402006149292, "logits/rejected": -2.1568922996520996, "logps/chosen": -84.34500885009766, "logps/rejected": -106.30509185791016, "loss": 124034.3625, "rewards/accuracies": 0.5625, "rewards/chosen": -0.019195228815078735, "rewards/margins": 0.019908469170331955, "rewards/rejected": -0.03910370171070099, "step": 200 }, { "epoch": 0.084, "grad_norm": 9976473.779353945, "learning_rate": 4.1999999999999995e-07, "logits/chosen": -2.1947314739227295, "logits/rejected": -2.155924081802368, "logps/chosen": -85.31925964355469, "logps/rejected": -116.8820571899414, "loss": 133085.9375, "rewards/accuracies": 0.625, "rewards/chosen": -0.016604367643594742, "rewards/margins": 0.010993210598826408, "rewards/rejected": -0.027597576379776, "step": 210 }, { "epoch": 0.088, "grad_norm": 7143746.706174395, "learning_rate": 4.3999999999999997e-07, "logits/chosen": -2.181243419647217, "logits/rejected": -2.1664962768554688, "logps/chosen": -74.75950622558594, "logps/rejected": -87.78418731689453, "loss": 127414.575, "rewards/accuracies": 0.5625, "rewards/chosen": -0.014525257050991058, "rewards/margins": 0.003328158985823393, "rewards/rejected": -0.01785341463983059, "step": 220 }, { "epoch": 0.092, "grad_norm": 9204902.414337158, "learning_rate": 4.6e-07, "logits/chosen": -2.108741044998169, "logits/rejected": -2.048841953277588, "logps/chosen": -78.65644073486328, "logps/rejected": -95.38871765136719, "loss": 127270.9375, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.013477807864546776, "rewards/margins": 0.008071732707321644, "rewards/rejected": -0.021549541503190994, "step": 230 }, { "epoch": 0.096, "grad_norm": 6495004.829819743, "learning_rate": 4.8e-07, "logits/chosen": -2.111128330230713, "logits/rejected": -2.0940356254577637, "logps/chosen": -92.46067810058594, "logps/rejected": -117.76658630371094, "loss": 122517.0, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.015996446833014488, "rewards/margins": 0.015014531090855598, "rewards/rejected": -0.031010976061224937, "step": 240 }, { "epoch": 0.1, "grad_norm": 3914167.3327231077, "learning_rate": 5e-07, "logits/chosen": -2.129955768585205, "logits/rejected": -2.1201798915863037, "logps/chosen": -91.17083740234375, "logps/rejected": -122.591064453125, "loss": 126768.9, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.017495278269052505, "rewards/margins": 0.019053593277931213, "rewards/rejected": -0.03654887527227402, "step": 250 }, { "epoch": 0.104, "grad_norm": 9274304.06015198, "learning_rate": 4.977777777777777e-07, "logits/chosen": -2.1294829845428467, "logits/rejected": -2.1332812309265137, "logps/chosen": -86.34416198730469, "logps/rejected": -110.06596374511719, "loss": 123661.6875, "rewards/accuracies": 0.625, "rewards/chosen": -0.019683022052049637, "rewards/margins": 0.017954688519239426, "rewards/rejected": -0.03763771429657936, "step": 260 }, { "epoch": 0.108, "grad_norm": 5830092.205559107, "learning_rate": 4.955555555555556e-07, "logits/chosen": -2.1960532665252686, "logits/rejected": -2.2218282222747803, "logps/chosen": -94.33865356445312, "logps/rejected": -113.83499908447266, "loss": 125464.4375, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.018877552822232246, "rewards/margins": 0.010512979701161385, "rewards/rejected": -0.02939053252339363, "step": 270 }, { "epoch": 0.112, "grad_norm": 8438742.306721646, "learning_rate": 4.933333333333333e-07, "logits/chosen": -2.257341146469116, "logits/rejected": -2.3109583854675293, "logps/chosen": -94.39613342285156, "logps/rejected": -118.54805755615234, "loss": 126082.3375, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.021040180698037148, "rewards/margins": 0.01762349344789982, "rewards/rejected": -0.038663674145936966, "step": 280 }, { "epoch": 0.116, "grad_norm": 7914235.455845386, "learning_rate": 4.91111111111111e-07, "logits/chosen": -2.327070713043213, "logits/rejected": -2.3570895195007324, "logps/chosen": -87.95598602294922, "logps/rejected": -108.05839538574219, "loss": 129599.7125, "rewards/accuracies": 0.5625, "rewards/chosen": -0.0221787728369236, "rewards/margins": 0.01672218181192875, "rewards/rejected": -0.0389009527862072, "step": 290 }, { "epoch": 0.12, "grad_norm": 6457742.625324568, "learning_rate": 4.888888888888889e-07, "logits/chosen": -2.4012951850891113, "logits/rejected": -2.4345765113830566, "logps/chosen": -74.74217224121094, "logps/rejected": -100.60356140136719, "loss": 124479.875, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.018593108281493187, "rewards/margins": 0.021032758057117462, "rewards/rejected": -0.0396258682012558, "step": 300 }, { "epoch": 0.124, "grad_norm": 6578431.206476413, "learning_rate": 4.866666666666666e-07, "logits/chosen": -2.452791452407837, "logits/rejected": -2.4812235832214355, "logps/chosen": -95.68658447265625, "logps/rejected": -111.42750549316406, "loss": 126451.425, "rewards/accuracies": 0.5, "rewards/chosen": -0.018242117017507553, "rewards/margins": 0.01176674384623766, "rewards/rejected": -0.030008861795067787, "step": 310 }, { "epoch": 0.128, "grad_norm": 10851035.518032862, "learning_rate": 4.844444444444445e-07, "logits/chosen": -2.4537229537963867, "logits/rejected": -2.4691717624664307, "logps/chosen": -82.9326171875, "logps/rejected": -116.93620300292969, "loss": 123506.3125, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.02308080717921257, "rewards/margins": 0.024886813014745712, "rewards/rejected": -0.04796762019395828, "step": 320 }, { "epoch": 0.132, "grad_norm": 9223772.443364851, "learning_rate": 4.822222222222222e-07, "logits/chosen": -2.391624927520752, "logits/rejected": -2.407311201095581, "logps/chosen": -91.67464447021484, "logps/rejected": -117.0147705078125, "loss": 121261.9125, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.023308029398322105, "rewards/margins": 0.017484817653894424, "rewards/rejected": -0.04079284518957138, "step": 330 }, { "epoch": 0.136, "grad_norm": 8085358.939583512, "learning_rate": 4.8e-07, "logits/chosen": -2.48149037361145, "logits/rejected": -2.4932546615600586, "logps/chosen": -96.05111694335938, "logps/rejected": -131.0735626220703, "loss": 126914.2875, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.02739790640771389, "rewards/margins": 0.029247354716062546, "rewards/rejected": -0.056645262986421585, "step": 340 }, { "epoch": 0.14, "grad_norm": 7944883.3667990295, "learning_rate": 4.777777777777778e-07, "logits/chosen": -2.45344877243042, "logits/rejected": -2.5137851238250732, "logps/chosen": -89.93304443359375, "logps/rejected": -108.8600845336914, "loss": 122914.55, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.0250026136636734, "rewards/margins": 0.014374235644936562, "rewards/rejected": -0.03937685117125511, "step": 350 }, { "epoch": 0.144, "grad_norm": 11202153.92151104, "learning_rate": 4.7555555555555554e-07, "logits/chosen": -2.569916248321533, "logits/rejected": -2.5970470905303955, "logps/chosen": -95.05995178222656, "logps/rejected": -127.2323226928711, "loss": 124243.4125, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.025854643434286118, "rewards/margins": 0.023228293284773827, "rewards/rejected": -0.049082934856414795, "step": 360 }, { "epoch": 0.148, "grad_norm": 6901221.964419149, "learning_rate": 4.733333333333333e-07, "logits/chosen": -2.4675538539886475, "logits/rejected": -2.4503865242004395, "logps/chosen": -85.31706237792969, "logps/rejected": -102.17588806152344, "loss": 127540.3375, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.024571221321821213, "rewards/margins": 0.011107890866696835, "rewards/rejected": -0.03567911311984062, "step": 370 }, { "epoch": 0.152, "grad_norm": 6993857.423860367, "learning_rate": 4.711111111111111e-07, "logits/chosen": -2.459782123565674, "logits/rejected": -2.48356032371521, "logps/chosen": -110.59651184082031, "logps/rejected": -130.7666778564453, "loss": 127438.7, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.031783945858478546, "rewards/margins": 0.014480188488960266, "rewards/rejected": -0.046264130622148514, "step": 380 }, { "epoch": 0.156, "grad_norm": 6436648.717203954, "learning_rate": 4.6888888888888887e-07, "logits/chosen": -2.4548838138580322, "logits/rejected": -2.456662654876709, "logps/chosen": -117.49080657958984, "logps/rejected": -128.62191772460938, "loss": 126004.45, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.027950212359428406, "rewards/margins": 0.008381237275898457, "rewards/rejected": -0.036331452429294586, "step": 390 }, { "epoch": 0.16, "grad_norm": 7569273.392881057, "learning_rate": 4.6666666666666666e-07, "logits/chosen": -2.4525585174560547, "logits/rejected": -2.4519400596618652, "logps/chosen": -104.88145446777344, "logps/rejected": -128.08416748046875, "loss": 126857.475, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.022735530510544777, "rewards/margins": 0.01596837118268013, "rewards/rejected": -0.038703907281160355, "step": 400 }, { "epoch": 0.164, "grad_norm": 6861745.545448723, "learning_rate": 4.644444444444444e-07, "logits/chosen": -2.5066254138946533, "logits/rejected": -2.5164337158203125, "logps/chosen": -86.57884216308594, "logps/rejected": -119.33331298828125, "loss": 124486.5125, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.02158975414931774, "rewards/margins": 0.030435938388109207, "rewards/rejected": -0.0520256944000721, "step": 410 }, { "epoch": 0.168, "grad_norm": 6923216.083132582, "learning_rate": 4.622222222222222e-07, "logits/chosen": -2.4752566814422607, "logits/rejected": -2.463294506072998, "logps/chosen": -85.61151885986328, "logps/rejected": -102.90364837646484, "loss": 124946.475, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.020689889788627625, "rewards/margins": 0.011657947674393654, "rewards/rejected": -0.03234783932566643, "step": 420 }, { "epoch": 0.172, "grad_norm": 7450190.250939408, "learning_rate": 4.6e-07, "logits/chosen": -2.547010660171509, "logits/rejected": -2.531663179397583, "logps/chosen": -97.37740325927734, "logps/rejected": -135.26629638671875, "loss": 131081.35, "rewards/accuracies": 0.5625, "rewards/chosen": -0.025491004809737206, "rewards/margins": 0.02565266191959381, "rewards/rejected": -0.051143668591976166, "step": 430 }, { "epoch": 0.176, "grad_norm": 6469045.345880665, "learning_rate": 4.577777777777778e-07, "logits/chosen": -2.6610159873962402, "logits/rejected": -2.659968852996826, "logps/chosen": -99.9223861694336, "logps/rejected": -124.8481216430664, "loss": 117640.1, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.027902353554964066, "rewards/margins": 0.019689548760652542, "rewards/rejected": -0.04759190231561661, "step": 440 }, { "epoch": 0.18, "grad_norm": 7434403.705215201, "learning_rate": 4.555555555555555e-07, "logits/chosen": -2.700005054473877, "logits/rejected": -2.6503853797912598, "logps/chosen": -96.16465759277344, "logps/rejected": -124.81199645996094, "loss": 123096.9, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.026438185945153236, "rewards/margins": 0.018792394548654556, "rewards/rejected": -0.04523057863116264, "step": 450 }, { "epoch": 0.184, "grad_norm": 7874629.531526446, "learning_rate": 4.5333333333333326e-07, "logits/chosen": -2.724388837814331, "logits/rejected": -2.7652335166931152, "logps/chosen": -101.94267272949219, "logps/rejected": -110.65840148925781, "loss": 128125.7, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.024334359914064407, "rewards/margins": 0.005520271137356758, "rewards/rejected": -0.029854634776711464, "step": 460 }, { "epoch": 0.188, "grad_norm": 7560431.171192426, "learning_rate": 4.511111111111111e-07, "logits/chosen": -2.6493871212005615, "logits/rejected": -2.6714179515838623, "logps/chosen": -105.4653549194336, "logps/rejected": -150.9534149169922, "loss": 122614.7375, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.02969837561249733, "rewards/margins": 0.027200985699892044, "rewards/rejected": -0.056899357587099075, "step": 470 }, { "epoch": 0.192, "grad_norm": 8394101.213808972, "learning_rate": 4.4888888888888885e-07, "logits/chosen": -2.495974063873291, "logits/rejected": -2.4936139583587646, "logps/chosen": -105.5544662475586, "logps/rejected": -139.51068115234375, "loss": 127604.6125, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.026578962802886963, "rewards/margins": 0.027212362736463547, "rewards/rejected": -0.05379132181406021, "step": 480 }, { "epoch": 0.196, "grad_norm": 8149957.80282127, "learning_rate": 4.4666666666666664e-07, "logits/chosen": -2.44439959526062, "logits/rejected": -2.4615180492401123, "logps/chosen": -115.50093078613281, "logps/rejected": -153.07492065429688, "loss": 120568.475, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.02712417207658291, "rewards/margins": 0.03037952445447445, "rewards/rejected": -0.05750369280576706, "step": 490 }, { "epoch": 0.2, "grad_norm": 9689223.456248827, "learning_rate": 4.444444444444444e-07, "logits/chosen": -2.4836983680725098, "logits/rejected": -2.4954071044921875, "logps/chosen": -108.3786392211914, "logps/rejected": -131.80975341796875, "loss": 130260.4375, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.03161335736513138, "rewards/margins": 0.019194485619664192, "rewards/rejected": -0.05080784484744072, "step": 500 }, { "epoch": 0.204, "grad_norm": 9433316.334462296, "learning_rate": 4.4222222222222223e-07, "logits/chosen": -2.637115955352783, "logits/rejected": -2.6541285514831543, "logps/chosen": -106.7952880859375, "logps/rejected": -119.38945007324219, "loss": 125224.65, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -0.02900713123381138, "rewards/margins": 0.011991321109235287, "rewards/rejected": -0.04099845141172409, "step": 510 }, { "epoch": 0.208, "grad_norm": 6547291.880532919, "learning_rate": 4.3999999999999997e-07, "logits/chosen": -2.6971993446350098, "logits/rejected": -2.6676185131073, "logps/chosen": -89.63105773925781, "logps/rejected": -119.62776947021484, "loss": 121176.4, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.025825385004281998, "rewards/margins": 0.021441804245114326, "rewards/rejected": -0.04726719111204147, "step": 520 }, { "epoch": 0.212, "grad_norm": 6024322.818937677, "learning_rate": 4.3777777777777776e-07, "logits/chosen": -2.732637405395508, "logits/rejected": -2.718721866607666, "logps/chosen": -84.20713806152344, "logps/rejected": -120.58128356933594, "loss": 118993.7, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.023359743878245354, "rewards/margins": 0.02675134316086769, "rewards/rejected": -0.0501110777258873, "step": 530 }, { "epoch": 0.216, "grad_norm": 6460660.035353449, "learning_rate": 4.355555555555555e-07, "logits/chosen": -2.488724708557129, "logits/rejected": -2.504575490951538, "logps/chosen": -98.4452133178711, "logps/rejected": -128.78163146972656, "loss": 123458.275, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.026253730058670044, "rewards/margins": 0.024806631729006767, "rewards/rejected": -0.05106035992503166, "step": 540 }, { "epoch": 0.22, "grad_norm": 8461699.658062043, "learning_rate": 4.3333333333333335e-07, "logits/chosen": -2.5291218757629395, "logits/rejected": -2.536240339279175, "logps/chosen": -100.16661071777344, "logps/rejected": -149.42355346679688, "loss": 124909.075, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.027327323332428932, "rewards/margins": 0.036261945962905884, "rewards/rejected": -0.06358926743268967, "step": 550 }, { "epoch": 0.224, "grad_norm": 6921956.012662945, "learning_rate": 4.311111111111111e-07, "logits/chosen": -2.628513813018799, "logits/rejected": -2.6058189868927, "logps/chosen": -107.0963363647461, "logps/rejected": -113.91001892089844, "loss": 127428.7375, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.03390585258603096, "rewards/margins": 0.005185864400118589, "rewards/rejected": -0.039091721177101135, "step": 560 }, { "epoch": 0.228, "grad_norm": 8197834.099947786, "learning_rate": 4.2888888888888883e-07, "logits/chosen": -2.561366558074951, "logits/rejected": -2.561455249786377, "logps/chosen": -105.6274185180664, "logps/rejected": -136.18792724609375, "loss": 126906.175, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.031804267317056656, "rewards/margins": 0.02525196596980095, "rewards/rejected": -0.057056229561567307, "step": 570 }, { "epoch": 0.232, "grad_norm": 8846278.644102238, "learning_rate": 4.266666666666667e-07, "logits/chosen": -2.6525886058807373, "logits/rejected": -2.588754653930664, "logps/chosen": -136.8567657470703, "logps/rejected": -178.04689025878906, "loss": 132691.7875, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.04016602411866188, "rewards/margins": 0.027137309312820435, "rewards/rejected": -0.06730332970619202, "step": 580 }, { "epoch": 0.236, "grad_norm": 6333324.696405062, "learning_rate": 4.244444444444444e-07, "logits/chosen": -2.4680473804473877, "logits/rejected": -2.465623378753662, "logps/chosen": -123.52412414550781, "logps/rejected": -135.17831420898438, "loss": 129527.5, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.03627150505781174, "rewards/margins": 0.012457914650440216, "rewards/rejected": -0.048729415982961655, "step": 590 }, { "epoch": 0.24, "grad_norm": 7036898.682503791, "learning_rate": 4.222222222222222e-07, "logits/chosen": -2.584538459777832, "logits/rejected": -2.579874038696289, "logps/chosen": -99.02510070800781, "logps/rejected": -107.3072280883789, "loss": 128720.725, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.03315219283103943, "rewards/margins": 0.003997699357569218, "rewards/rejected": -0.03714989498257637, "step": 600 }, { "epoch": 0.244, "grad_norm": 7159293.986125982, "learning_rate": 4.1999999999999995e-07, "logits/chosen": -2.649663209915161, "logits/rejected": -2.6600253582000732, "logps/chosen": -101.28582000732422, "logps/rejected": -119.93243408203125, "loss": 129047.95, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.031149577349424362, "rewards/margins": 0.011909973807632923, "rewards/rejected": -0.04305955022573471, "step": 610 }, { "epoch": 0.248, "grad_norm": 7467292.937718221, "learning_rate": 4.177777777777778e-07, "logits/chosen": -2.572115898132324, "logits/rejected": -2.525055408477783, "logps/chosen": -105.48246765136719, "logps/rejected": -131.28749084472656, "loss": 122677.825, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.03393036499619484, "rewards/margins": 0.017631059512495995, "rewards/rejected": -0.051561422646045685, "step": 620 }, { "epoch": 0.252, "grad_norm": 6649301.452495339, "learning_rate": 4.1555555555555554e-07, "logits/chosen": -2.5688421726226807, "logits/rejected": -2.5850729942321777, "logps/chosen": -106.14555358886719, "logps/rejected": -141.26199340820312, "loss": 124169.6, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.027926957234740257, "rewards/margins": 0.031329791992902756, "rewards/rejected": -0.05925675109028816, "step": 630 }, { "epoch": 0.256, "grad_norm": 5701852.577919224, "learning_rate": 4.1333333333333333e-07, "logits/chosen": -2.5793604850769043, "logits/rejected": -2.6216492652893066, "logps/chosen": -94.40669250488281, "logps/rejected": -141.09725952148438, "loss": 119443.9125, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.029792586341500282, "rewards/margins": 0.0306295957416296, "rewards/rejected": -0.06042218208312988, "step": 640 }, { "epoch": 0.26, "grad_norm": 7828661.867350275, "learning_rate": 4.1111111111111107e-07, "logits/chosen": -2.5071213245391846, "logits/rejected": -2.5167384147644043, "logps/chosen": -100.64595794677734, "logps/rejected": -135.44271850585938, "loss": 127055.0875, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.03445083647966385, "rewards/margins": 0.021719755604863167, "rewards/rejected": -0.056170590221881866, "step": 650 }, { "epoch": 0.264, "grad_norm": 7230836.816701007, "learning_rate": 4.088888888888889e-07, "logits/chosen": -2.49631404876709, "logits/rejected": -2.536076068878174, "logps/chosen": -101.35478210449219, "logps/rejected": -124.0557632446289, "loss": 128004.0625, "rewards/accuracies": 0.6875, "rewards/chosen": -0.03691136837005615, "rewards/margins": 0.013063013553619385, "rewards/rejected": -0.04997437819838524, "step": 660 }, { "epoch": 0.268, "grad_norm": 9181742.684525523, "learning_rate": 4.0666666666666666e-07, "logits/chosen": -2.5028629302978516, "logits/rejected": -2.5143425464630127, "logps/chosen": -114.0814437866211, "logps/rejected": -130.69984436035156, "loss": 132355.1, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.03546031937003136, "rewards/margins": 0.012831469066441059, "rewards/rejected": -0.04829178377985954, "step": 670 }, { "epoch": 0.272, "grad_norm": 5953835.069109496, "learning_rate": 4.044444444444444e-07, "logits/chosen": -2.3693368434906006, "logits/rejected": -2.3933675289154053, "logps/chosen": -96.07215881347656, "logps/rejected": -133.98353576660156, "loss": 122900.6875, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.029592838138341904, "rewards/margins": 0.028876733034849167, "rewards/rejected": -0.05846957489848137, "step": 680 }, { "epoch": 0.276, "grad_norm": 9742020.165182771, "learning_rate": 4.022222222222222e-07, "logits/chosen": -2.362238645553589, "logits/rejected": -2.3378891944885254, "logps/chosen": -128.8614501953125, "logps/rejected": -144.07827758789062, "loss": 125425.0375, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.03773171454668045, "rewards/margins": 0.013301841914653778, "rewards/rejected": -0.05103355646133423, "step": 690 }, { "epoch": 0.28, "grad_norm": 5698717.703929527, "learning_rate": 4e-07, "logits/chosen": -2.4838829040527344, "logits/rejected": -2.50342059135437, "logps/chosen": -111.4194107055664, "logps/rejected": -129.75094604492188, "loss": 125053.125, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.03320156782865524, "rewards/margins": 0.017379306256771088, "rewards/rejected": -0.05058088153600693, "step": 700 }, { "epoch": 0.284, "grad_norm": 8985877.177134423, "learning_rate": 3.977777777777778e-07, "logits/chosen": -2.495482921600342, "logits/rejected": -2.4964957237243652, "logps/chosen": -122.45703125, "logps/rejected": -142.83428955078125, "loss": 128369.4125, "rewards/accuracies": 0.5625, "rewards/chosen": -0.03306376561522484, "rewards/margins": 0.016845058649778366, "rewards/rejected": -0.049908824265003204, "step": 710 }, { "epoch": 0.288, "grad_norm": 6293531.48238979, "learning_rate": 3.955555555555555e-07, "logits/chosen": -2.391200065612793, "logits/rejected": -2.4489917755126953, "logps/chosen": -98.37910461425781, "logps/rejected": -126.7882080078125, "loss": 127457.0, "rewards/accuracies": 0.625, "rewards/chosen": -0.029601294547319412, "rewards/margins": 0.01803305558860302, "rewards/rejected": -0.04763435199856758, "step": 720 }, { "epoch": 0.292, "grad_norm": 7897192.107065841, "learning_rate": 3.933333333333333e-07, "logits/chosen": -2.4582555294036865, "logits/rejected": -2.4759137630462646, "logps/chosen": -98.81364440917969, "logps/rejected": -144.63548278808594, "loss": 123344.0875, "rewards/accuracies": 0.625, "rewards/chosen": -0.029745137318968773, "rewards/margins": 0.03918560594320297, "rewards/rejected": -0.06893075257539749, "step": 730 }, { "epoch": 0.296, "grad_norm": 6327010.4235527, "learning_rate": 3.911111111111111e-07, "logits/chosen": -2.578629732131958, "logits/rejected": -2.5296969413757324, "logps/chosen": -93.8338623046875, "logps/rejected": -119.47264099121094, "loss": 130155.3875, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.031289439648389816, "rewards/margins": 0.02071293443441391, "rewards/rejected": -0.052002377808094025, "step": 740 }, { "epoch": 0.3, "grad_norm": 6618521.62330563, "learning_rate": 3.888888888888889e-07, "logits/chosen": -2.5220892429351807, "logits/rejected": -2.533546209335327, "logps/chosen": -115.8681869506836, "logps/rejected": -141.22885131835938, "loss": 119614.875, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.027656156569719315, "rewards/margins": 0.018271705135703087, "rewards/rejected": -0.04592785984277725, "step": 750 }, { "epoch": 0.304, "grad_norm": 10187163.188769344, "learning_rate": 3.8666666666666664e-07, "logits/chosen": -2.5118823051452637, "logits/rejected": -2.504910945892334, "logps/chosen": -101.41685485839844, "logps/rejected": -148.4906005859375, "loss": 126592.4875, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.033656515181064606, "rewards/margins": 0.03055490553379059, "rewards/rejected": -0.06421142816543579, "step": 760 }, { "epoch": 0.308, "grad_norm": 5303031.219251705, "learning_rate": 3.8444444444444443e-07, "logits/chosen": -2.430718421936035, "logits/rejected": -2.42429518699646, "logps/chosen": -104.3238754272461, "logps/rejected": -152.8984832763672, "loss": 120932.45, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.02648145519196987, "rewards/margins": 0.034856077283620834, "rewards/rejected": -0.06133753061294556, "step": 770 }, { "epoch": 0.312, "grad_norm": 7788606.775046531, "learning_rate": 3.822222222222222e-07, "logits/chosen": -2.440924882888794, "logits/rejected": -2.420545816421509, "logps/chosen": -115.5602035522461, "logps/rejected": -155.5218048095703, "loss": 125781.1125, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.033296506851911545, "rewards/margins": 0.026110276579856873, "rewards/rejected": -0.059406787157058716, "step": 780 }, { "epoch": 0.316, "grad_norm": 8342190.296045118, "learning_rate": 3.7999999999999996e-07, "logits/chosen": -2.476647138595581, "logits/rejected": -2.469700813293457, "logps/chosen": -128.41778564453125, "logps/rejected": -171.7101287841797, "loss": 123186.7, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.036413662135601044, "rewards/margins": 0.02540646493434906, "rewards/rejected": -0.0618201307952404, "step": 790 }, { "epoch": 0.32, "grad_norm": 6557257.336959155, "learning_rate": 3.7777777777777775e-07, "logits/chosen": -2.5064010620117188, "logits/rejected": -2.4810726642608643, "logps/chosen": -92.31603240966797, "logps/rejected": -137.16452026367188, "loss": 122089.9625, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.026451414451003075, "rewards/margins": 0.033910416066646576, "rewards/rejected": -0.0603618249297142, "step": 800 }, { "epoch": 0.324, "grad_norm": 6904823.359277868, "learning_rate": 3.755555555555555e-07, "logits/chosen": -2.4048755168914795, "logits/rejected": -2.4163169860839844, "logps/chosen": -103.93983459472656, "logps/rejected": -146.17251586914062, "loss": 125580.95, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.034494899213314056, "rewards/margins": 0.028069961816072464, "rewards/rejected": -0.06256486475467682, "step": 810 }, { "epoch": 0.328, "grad_norm": 7723585.3043322805, "learning_rate": 3.7333333333333334e-07, "logits/chosen": -2.3999173641204834, "logits/rejected": -2.3777313232421875, "logps/chosen": -98.98230743408203, "logps/rejected": -113.75994873046875, "loss": 128410.575, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.0317092090845108, "rewards/margins": 0.005884192418307066, "rewards/rejected": -0.037593401968479156, "step": 820 }, { "epoch": 0.332, "grad_norm": 5742576.756144718, "learning_rate": 3.711111111111111e-07, "logits/chosen": -2.3831002712249756, "logits/rejected": -2.3882925510406494, "logps/chosen": -94.30198669433594, "logps/rejected": -129.58078002929688, "loss": 126154.8625, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.02777601219713688, "rewards/margins": 0.01992596499621868, "rewards/rejected": -0.04770197719335556, "step": 830 }, { "epoch": 0.336, "grad_norm": 7609192.056262804, "learning_rate": 3.688888888888889e-07, "logits/chosen": -2.434124708175659, "logits/rejected": -2.4821083545684814, "logps/chosen": -96.55790710449219, "logps/rejected": -122.23677062988281, "loss": 123363.6125, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.026610519737005234, "rewards/margins": 0.02544989623129368, "rewards/rejected": -0.05206041410565376, "step": 840 }, { "epoch": 0.34, "grad_norm": 8034675.762755762, "learning_rate": 3.666666666666666e-07, "logits/chosen": -2.3713624477386475, "logits/rejected": -2.366699457168579, "logps/chosen": -115.35933685302734, "logps/rejected": -129.19752502441406, "loss": 126533.75, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.0336347371339798, "rewards/margins": 0.0172466691583395, "rewards/rejected": -0.05088140815496445, "step": 850 }, { "epoch": 0.344, "grad_norm": 5975456.469702007, "learning_rate": 3.6444444444444446e-07, "logits/chosen": -2.527848482131958, "logits/rejected": -2.5321898460388184, "logps/chosen": -101.052734375, "logps/rejected": -146.85986328125, "loss": 117886.0, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.026471996679902077, "rewards/margins": 0.038311395794153214, "rewards/rejected": -0.06478338688611984, "step": 860 }, { "epoch": 0.348, "grad_norm": 6906670.4436170915, "learning_rate": 3.622222222222222e-07, "logits/chosen": -2.5193216800689697, "logits/rejected": -2.55930495262146, "logps/chosen": -103.41465759277344, "logps/rejected": -136.6109161376953, "loss": 129100.275, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.029144983738660812, "rewards/margins": 0.02550877258181572, "rewards/rejected": -0.05465375632047653, "step": 870 }, { "epoch": 0.352, "grad_norm": 7471407.013777157, "learning_rate": 3.6e-07, "logits/chosen": -2.451826572418213, "logits/rejected": -2.4506518840789795, "logps/chosen": -98.89453125, "logps/rejected": -141.08523559570312, "loss": 122662.65, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.025945227593183517, "rewards/margins": 0.03901258111000061, "rewards/rejected": -0.06495781242847443, "step": 880 }, { "epoch": 0.356, "grad_norm": 6728467.134306638, "learning_rate": 3.5777777777777773e-07, "logits/chosen": -2.5389392375946045, "logits/rejected": -2.5246338844299316, "logps/chosen": -120.97161865234375, "logps/rejected": -168.5087432861328, "loss": 122320.4375, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.03245999664068222, "rewards/margins": 0.04135856777429581, "rewards/rejected": -0.07381855696439743, "step": 890 }, { "epoch": 0.36, "grad_norm": 7104916.695741348, "learning_rate": 3.5555555555555553e-07, "logits/chosen": -2.5385003089904785, "logits/rejected": -2.5232315063476562, "logps/chosen": -87.90635681152344, "logps/rejected": -116.77327728271484, "loss": 126440.375, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.032553546130657196, "rewards/margins": 0.01972118578851223, "rewards/rejected": -0.052274733781814575, "step": 900 }, { "epoch": 0.364, "grad_norm": 5970259.1415026905, "learning_rate": 3.533333333333333e-07, "logits/chosen": -2.508732557296753, "logits/rejected": -2.5209858417510986, "logps/chosen": -101.33818054199219, "logps/rejected": -125.9253921508789, "loss": 121322.675, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.03439612686634064, "rewards/margins": 0.02015717700123787, "rewards/rejected": -0.054553307592868805, "step": 910 }, { "epoch": 0.368, "grad_norm": 8365175.917846098, "learning_rate": 3.5111111111111106e-07, "logits/chosen": -2.4121992588043213, "logits/rejected": -2.403446674346924, "logps/chosen": -110.45259857177734, "logps/rejected": -146.91270446777344, "loss": 124905.35, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.03642472252249718, "rewards/margins": 0.029706323519349098, "rewards/rejected": -0.06613104045391083, "step": 920 }, { "epoch": 0.372, "grad_norm": 7152589.843566105, "learning_rate": 3.488888888888889e-07, "logits/chosen": -2.4605088233947754, "logits/rejected": -2.5225741863250732, "logps/chosen": -104.7183609008789, "logps/rejected": -129.02984619140625, "loss": 125537.525, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.030679216608405113, "rewards/margins": 0.023534944280982018, "rewards/rejected": -0.05421415716409683, "step": 930 }, { "epoch": 0.376, "grad_norm": 7902918.244011351, "learning_rate": 3.4666666666666665e-07, "logits/chosen": -2.4962337017059326, "logits/rejected": -2.560391664505005, "logps/chosen": -88.20844268798828, "logps/rejected": -139.12539672851562, "loss": 120117.675, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.023946184664964676, "rewards/margins": 0.035228628665208817, "rewards/rejected": -0.05917481333017349, "step": 940 }, { "epoch": 0.38, "grad_norm": 7058088.493943834, "learning_rate": 3.4444444444444444e-07, "logits/chosen": -2.4186971187591553, "logits/rejected": -2.479079484939575, "logps/chosen": -109.65245056152344, "logps/rejected": -145.39144897460938, "loss": 131717.475, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.035849470645189285, "rewards/margins": 0.030028488487005234, "rewards/rejected": -0.06587796658277512, "step": 950 }, { "epoch": 0.384, "grad_norm": 9519920.624854606, "learning_rate": 3.422222222222222e-07, "logits/chosen": -2.388206958770752, "logits/rejected": -2.4197933673858643, "logps/chosen": -130.57398986816406, "logps/rejected": -206.50595092773438, "loss": 121014.6125, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.03395534306764603, "rewards/margins": 0.04584265127778053, "rewards/rejected": -0.07979799807071686, "step": 960 }, { "epoch": 0.388, "grad_norm": 6508580.208947205, "learning_rate": 3.4000000000000003e-07, "logits/chosen": -2.4630963802337646, "logits/rejected": -2.4617621898651123, "logps/chosen": -97.78787231445312, "logps/rejected": -137.83377075195312, "loss": 129298.375, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.03728308528661728, "rewards/margins": 0.032224711030721664, "rewards/rejected": -0.06950780749320984, "step": 970 }, { "epoch": 0.392, "grad_norm": 8870008.273394352, "learning_rate": 3.3777777777777777e-07, "logits/chosen": -2.346057176589966, "logits/rejected": -2.388782262802124, "logps/chosen": -99.29377746582031, "logps/rejected": -174.5662841796875, "loss": 123724.2875, "rewards/accuracies": 0.6875, "rewards/chosen": -0.025686006993055344, "rewards/margins": 0.048049140721559525, "rewards/rejected": -0.07373513281345367, "step": 980 }, { "epoch": 0.396, "grad_norm": 8011988.902105641, "learning_rate": 3.3555555555555556e-07, "logits/chosen": -2.392421007156372, "logits/rejected": -2.373751163482666, "logps/chosen": -98.69950866699219, "logps/rejected": -133.2752227783203, "loss": 122756.525, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.029735039919614792, "rewards/margins": 0.027242526412010193, "rewards/rejected": -0.05697755888104439, "step": 990 }, { "epoch": 0.4, "grad_norm": 9489098.770004237, "learning_rate": 3.333333333333333e-07, "logits/chosen": -2.326141595840454, "logits/rejected": -2.350912094116211, "logps/chosen": -113.98426818847656, "logps/rejected": -161.37271118164062, "loss": 120187.8125, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.03596794605255127, "rewards/margins": 0.03252139315009117, "rewards/rejected": -0.06848934292793274, "step": 1000 }, { "epoch": 0.404, "grad_norm": 7831638.304742374, "learning_rate": 3.311111111111111e-07, "logits/chosen": -2.4753966331481934, "logits/rejected": -2.4720969200134277, "logps/chosen": -93.01488494873047, "logps/rejected": -129.34759521484375, "loss": 123952.0625, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.029807209968566895, "rewards/margins": 0.028074974194169044, "rewards/rejected": -0.05788217857480049, "step": 1010 }, { "epoch": 0.408, "grad_norm": 9432041.545640234, "learning_rate": 3.288888888888889e-07, "logits/chosen": -2.4366466999053955, "logits/rejected": -2.4423012733459473, "logps/chosen": -104.15936279296875, "logps/rejected": -127.2192611694336, "loss": 126318.675, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.03481433913111687, "rewards/margins": 0.019000452011823654, "rewards/rejected": -0.05381479114294052, "step": 1020 }, { "epoch": 0.412, "grad_norm": 8015912.365619365, "learning_rate": 3.2666666666666663e-07, "logits/chosen": -2.55189847946167, "logits/rejected": -2.544987440109253, "logps/chosen": -107.0470199584961, "logps/rejected": -162.27200317382812, "loss": 121624.775, "rewards/accuracies": 0.625, "rewards/chosen": -0.034415554255247116, "rewards/margins": 0.04376577213406563, "rewards/rejected": -0.07818132638931274, "step": 1030 }, { "epoch": 0.416, "grad_norm": 6289031.406175271, "learning_rate": 3.244444444444444e-07, "logits/chosen": -2.46891450881958, "logits/rejected": -2.4879581928253174, "logps/chosen": -95.2767562866211, "logps/rejected": -143.16920471191406, "loss": 121056.125, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.031673580408096313, "rewards/margins": 0.03125213831663132, "rewards/rejected": -0.06292571872472763, "step": 1040 }, { "epoch": 0.42, "grad_norm": 8474897.39551202, "learning_rate": 3.222222222222222e-07, "logits/chosen": -2.5424282550811768, "logits/rejected": -2.5662999153137207, "logps/chosen": -111.43955993652344, "logps/rejected": -153.2001190185547, "loss": 122305.1625, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.034298766404390335, "rewards/margins": 0.029853323474526405, "rewards/rejected": -0.06415208429098129, "step": 1050 }, { "epoch": 0.424, "grad_norm": 7118829.871963521, "learning_rate": 3.2e-07, "logits/chosen": -2.5913608074188232, "logits/rejected": -2.607485294342041, "logps/chosen": -110.69046783447266, "logps/rejected": -146.65023803710938, "loss": 120745.8125, "rewards/accuracies": 0.6875, "rewards/chosen": -0.02970978617668152, "rewards/margins": 0.02858895994722843, "rewards/rejected": -0.0582987479865551, "step": 1060 }, { "epoch": 0.428, "grad_norm": 6184465.945134909, "learning_rate": 3.1777777777777775e-07, "logits/chosen": -2.61472225189209, "logits/rejected": -2.6045632362365723, "logps/chosen": -118.44596862792969, "logps/rejected": -141.05259704589844, "loss": 119233.675, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.030257636681199074, "rewards/margins": 0.027423173189163208, "rewards/rejected": -0.05768080800771713, "step": 1070 }, { "epoch": 0.432, "grad_norm": 8384350.327301131, "learning_rate": 3.1555555555555554e-07, "logits/chosen": -2.5862181186676025, "logits/rejected": -2.5921201705932617, "logps/chosen": -107.20783996582031, "logps/rejected": -146.17771911621094, "loss": 123723.375, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.028097758069634438, "rewards/margins": 0.03525155037641525, "rewards/rejected": -0.06334930658340454, "step": 1080 }, { "epoch": 0.436, "grad_norm": 6672967.979703066, "learning_rate": 3.1333333333333333e-07, "logits/chosen": -2.5249099731445312, "logits/rejected": -2.5252327919006348, "logps/chosen": -82.09342956542969, "logps/rejected": -124.22785949707031, "loss": 125011.25, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.02518371120095253, "rewards/margins": 0.0352654904127121, "rewards/rejected": -0.06044920161366463, "step": 1090 }, { "epoch": 0.44, "grad_norm": 8650903.887605142, "learning_rate": 3.111111111111111e-07, "logits/chosen": -2.4037208557128906, "logits/rejected": -2.4414310455322266, "logps/chosen": -97.84339141845703, "logps/rejected": -142.56967163085938, "loss": 119550.375, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.029644513502717018, "rewards/margins": 0.03777293115854263, "rewards/rejected": -0.0674174427986145, "step": 1100 }, { "epoch": 0.444, "grad_norm": 6832446.375717195, "learning_rate": 3.0888888888888887e-07, "logits/chosen": -2.4431042671203613, "logits/rejected": -2.456604480743408, "logps/chosen": -107.29023742675781, "logps/rejected": -135.47000122070312, "loss": 121918.525, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.03453664109110832, "rewards/margins": 0.019270362332463264, "rewards/rejected": -0.05380700156092644, "step": 1110 }, { "epoch": 0.448, "grad_norm": 6428580.094510342, "learning_rate": 3.066666666666666e-07, "logits/chosen": -2.515045642852783, "logits/rejected": -2.51838755607605, "logps/chosen": -117.26094818115234, "logps/rejected": -153.82720947265625, "loss": 127075.725, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.031917281448841095, "rewards/margins": 0.024132903665304184, "rewards/rejected": -0.05605018883943558, "step": 1120 }, { "epoch": 0.452, "grad_norm": 7059143.842956961, "learning_rate": 3.0444444444444445e-07, "logits/chosen": -2.4530346393585205, "logits/rejected": -2.4681754112243652, "logps/chosen": -106.003662109375, "logps/rejected": -166.18124389648438, "loss": 125480.3875, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.03530178219079971, "rewards/margins": 0.03853614255785942, "rewards/rejected": -0.07383792102336884, "step": 1130 }, { "epoch": 0.456, "grad_norm": 7003392.120968072, "learning_rate": 3.022222222222222e-07, "logits/chosen": -2.526331901550293, "logits/rejected": -2.5595362186431885, "logps/chosen": -103.13214111328125, "logps/rejected": -151.08694458007812, "loss": 122330.225, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.0366961732506752, "rewards/margins": 0.045485951006412506, "rewards/rejected": -0.08218212425708771, "step": 1140 }, { "epoch": 0.46, "grad_norm": 8693683.344797961, "learning_rate": 3e-07, "logits/chosen": -2.570127248764038, "logits/rejected": -2.5569756031036377, "logps/chosen": -118.91337585449219, "logps/rejected": -157.19509887695312, "loss": 121470.875, "rewards/accuracies": 0.6875, "rewards/chosen": -0.030104130506515503, "rewards/margins": 0.03460243344306946, "rewards/rejected": -0.06470657885074615, "step": 1150 }, { "epoch": 0.464, "grad_norm": 9534213.097492808, "learning_rate": 2.9777777777777773e-07, "logits/chosen": -2.5106987953186035, "logits/rejected": -2.4634547233581543, "logps/chosen": -89.20586395263672, "logps/rejected": -149.52505493164062, "loss": 116528.675, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.02851666882634163, "rewards/margins": 0.04616966471076012, "rewards/rejected": -0.07468634098768234, "step": 1160 }, { "epoch": 0.468, "grad_norm": 8777276.722714778, "learning_rate": 2.9555555555555557e-07, "logits/chosen": -2.488731861114502, "logits/rejected": -2.4706058502197266, "logps/chosen": -112.614501953125, "logps/rejected": -158.08847045898438, "loss": 123641.3375, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.037982989102602005, "rewards/margins": 0.035983096808195114, "rewards/rejected": -0.07396609336137772, "step": 1170 }, { "epoch": 0.472, "grad_norm": 7805149.222381511, "learning_rate": 2.933333333333333e-07, "logits/chosen": -2.4993319511413574, "logits/rejected": -2.4856998920440674, "logps/chosen": -103.1233901977539, "logps/rejected": -121.76432037353516, "loss": 122802.05, "rewards/accuracies": 0.625, "rewards/chosen": -0.03225432708859444, "rewards/margins": 0.015282504260540009, "rewards/rejected": -0.04753682762384415, "step": 1180 }, { "epoch": 0.476, "grad_norm": 9127383.125403812, "learning_rate": 2.911111111111111e-07, "logits/chosen": -2.3260111808776855, "logits/rejected": -2.384988784790039, "logps/chosen": -111.52685546875, "logps/rejected": -160.5020751953125, "loss": 125717.575, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.03155955299735069, "rewards/margins": 0.043639086186885834, "rewards/rejected": -0.07519863545894623, "step": 1190 }, { "epoch": 0.48, "grad_norm": 8277710.5533771645, "learning_rate": 2.8888888888888885e-07, "logits/chosen": -2.4511587619781494, "logits/rejected": -2.473177433013916, "logps/chosen": -101.6271743774414, "logps/rejected": -154.0011749267578, "loss": 119860.575, "rewards/accuracies": 0.625, "rewards/chosen": -0.03446241840720177, "rewards/margins": 0.0441209152340889, "rewards/rejected": -0.07858333736658096, "step": 1200 }, { "epoch": 0.484, "grad_norm": 9000426.122325586, "learning_rate": 2.866666666666667e-07, "logits/chosen": -2.352407693862915, "logits/rejected": -2.3879640102386475, "logps/chosen": -111.2037353515625, "logps/rejected": -155.64559936523438, "loss": 125368.55, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.033860720694065094, "rewards/margins": 0.028398964554071426, "rewards/rejected": -0.06225968152284622, "step": 1210 }, { "epoch": 0.488, "grad_norm": 7439506.210055379, "learning_rate": 2.8444444444444443e-07, "logits/chosen": -2.4001262187957764, "logits/rejected": -2.406364917755127, "logps/chosen": -96.29491424560547, "logps/rejected": -133.1979522705078, "loss": 124972.3125, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.031061682850122452, "rewards/margins": 0.028077807277441025, "rewards/rejected": -0.05913949012756348, "step": 1220 }, { "epoch": 0.492, "grad_norm": 8522710.668188507, "learning_rate": 2.8222222222222217e-07, "logits/chosen": -2.455540418624878, "logits/rejected": -2.5077738761901855, "logps/chosen": -118.55711364746094, "logps/rejected": -183.99917602539062, "loss": 123136.65, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.03442969545722008, "rewards/margins": 0.04292518272995949, "rewards/rejected": -0.07735487818717957, "step": 1230 }, { "epoch": 0.496, "grad_norm": 7800702.638161312, "learning_rate": 2.8e-07, "logits/chosen": -2.4330172538757324, "logits/rejected": -2.4518215656280518, "logps/chosen": -106.44327545166016, "logps/rejected": -148.51351928710938, "loss": 123850.675, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.032119907438755035, "rewards/margins": 0.03575160354375839, "rewards/rejected": -0.06787151843309402, "step": 1240 }, { "epoch": 0.5, "grad_norm": 6293849.548335739, "learning_rate": 2.7777777777777776e-07, "logits/chosen": -2.5701098442077637, "logits/rejected": -2.563598394393921, "logps/chosen": -112.28812408447266, "logps/rejected": -145.1765594482422, "loss": 122208.9125, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.034340750426054, "rewards/margins": 0.029792601242661476, "rewards/rejected": -0.06413334608078003, "step": 1250 }, { "epoch": 0.504, "grad_norm": 8164963.35867732, "learning_rate": 2.7555555555555555e-07, "logits/chosen": -2.5201942920684814, "logits/rejected": -2.491058111190796, "logps/chosen": -116.97322845458984, "logps/rejected": -161.62765502929688, "loss": 123380.825, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.04085034877061844, "rewards/margins": 0.03332878276705742, "rewards/rejected": -0.07417913526296616, "step": 1260 }, { "epoch": 0.508, "grad_norm": 7878494.751400375, "learning_rate": 2.733333333333333e-07, "logits/chosen": -2.5432791709899902, "logits/rejected": -2.5216073989868164, "logps/chosen": -104.03746032714844, "logps/rejected": -157.35208129882812, "loss": 115568.125, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.02601642534136772, "rewards/margins": 0.04680890962481499, "rewards/rejected": -0.07282533496618271, "step": 1270 }, { "epoch": 0.512, "grad_norm": 8078756.180573847, "learning_rate": 2.7111111111111114e-07, "logits/chosen": -2.4107511043548584, "logits/rejected": -2.4441466331481934, "logps/chosen": -119.5540771484375, "logps/rejected": -161.68992614746094, "loss": 125619.95, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.0344010666012764, "rewards/margins": 0.039026811718940735, "rewards/rejected": -0.07342787086963654, "step": 1280 }, { "epoch": 0.516, "grad_norm": 8353367.375300777, "learning_rate": 2.688888888888889e-07, "logits/chosen": -2.553907632827759, "logits/rejected": -2.5770821571350098, "logps/chosen": -119.15167236328125, "logps/rejected": -144.7032012939453, "loss": 121825.9875, "rewards/accuracies": 0.5625, "rewards/chosen": -0.03453459218144417, "rewards/margins": 0.01783282496035099, "rewards/rejected": -0.05236741900444031, "step": 1290 }, { "epoch": 0.52, "grad_norm": 7928099.234468471, "learning_rate": 2.6666666666666667e-07, "logits/chosen": -2.4872889518737793, "logits/rejected": -2.4608452320098877, "logps/chosen": -97.89281463623047, "logps/rejected": -121.05744934082031, "loss": 121883.65, "rewards/accuracies": 0.625, "rewards/chosen": -0.028054479509592056, "rewards/margins": 0.019761094823479652, "rewards/rejected": -0.04781556874513626, "step": 1300 }, { "epoch": 0.524, "grad_norm": 7247015.687306507, "learning_rate": 2.644444444444444e-07, "logits/chosen": -2.516070604324341, "logits/rejected": -2.552224636077881, "logps/chosen": -106.81401062011719, "logps/rejected": -151.16502380371094, "loss": 115673.95, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.025754611939191818, "rewards/margins": 0.03385675325989723, "rewards/rejected": -0.05961136147379875, "step": 1310 }, { "epoch": 0.528, "grad_norm": 7422418.597697799, "learning_rate": 2.6222222222222226e-07, "logits/chosen": -2.524921417236328, "logits/rejected": -2.5199413299560547, "logps/chosen": -91.73576354980469, "logps/rejected": -131.1287384033203, "loss": 122971.8125, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.031464457511901855, "rewards/margins": 0.029294759035110474, "rewards/rejected": -0.06075920909643173, "step": 1320 }, { "epoch": 0.532, "grad_norm": 6692184.86494848, "learning_rate": 2.6e-07, "logits/chosen": -2.4487650394439697, "logits/rejected": -2.42417573928833, "logps/chosen": -105.08377838134766, "logps/rejected": -152.42251586914062, "loss": 124211.9125, "rewards/accuracies": 0.5625, "rewards/chosen": -0.03637874126434326, "rewards/margins": 0.04163909703493118, "rewards/rejected": -0.07801783829927444, "step": 1330 }, { "epoch": 0.536, "grad_norm": 8776394.815220755, "learning_rate": 2.5777777777777774e-07, "logits/chosen": -2.426945209503174, "logits/rejected": -2.4211270809173584, "logps/chosen": -100.32334899902344, "logps/rejected": -158.4632568359375, "loss": 119074.8125, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.02933502197265625, "rewards/margins": 0.04685003310441971, "rewards/rejected": -0.07618506252765656, "step": 1340 }, { "epoch": 0.54, "grad_norm": 7921315.937162929, "learning_rate": 2.5555555555555553e-07, "logits/chosen": -2.4435505867004395, "logits/rejected": -2.4641623497009277, "logps/chosen": -97.31932067871094, "logps/rejected": -139.09976196289062, "loss": 119875.825, "rewards/accuracies": 0.6875, "rewards/chosen": -0.03033040091395378, "rewards/margins": 0.03286002576351166, "rewards/rejected": -0.06319043040275574, "step": 1350 }, { "epoch": 0.544, "grad_norm": 7064909.318018072, "learning_rate": 2.533333333333333e-07, "logits/chosen": -2.4914050102233887, "logits/rejected": -2.4965710639953613, "logps/chosen": -117.77690124511719, "logps/rejected": -150.23826599121094, "loss": 119877.625, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.0384187288582325, "rewards/margins": 0.02663787081837654, "rewards/rejected": -0.06505659967660904, "step": 1360 }, { "epoch": 0.548, "grad_norm": 7218217.506885473, "learning_rate": 2.511111111111111e-07, "logits/chosen": -2.4381630420684814, "logits/rejected": -2.4487814903259277, "logps/chosen": -102.14595794677734, "logps/rejected": -154.1560516357422, "loss": 112836.2875, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.029405618086457253, "rewards/margins": 0.04147377982735634, "rewards/rejected": -0.07087938487529755, "step": 1370 }, { "epoch": 0.552, "grad_norm": 8751055.476830697, "learning_rate": 2.4888888888888886e-07, "logits/chosen": -2.3979814052581787, "logits/rejected": -2.4224982261657715, "logps/chosen": -112.16400146484375, "logps/rejected": -158.16567993164062, "loss": 123894.4, "rewards/accuracies": 0.625, "rewards/chosen": -0.031505607068538666, "rewards/margins": 0.03382585197687149, "rewards/rejected": -0.06533145159482956, "step": 1380 }, { "epoch": 0.556, "grad_norm": 10628471.178826654, "learning_rate": 2.4666666666666665e-07, "logits/chosen": -2.42820143699646, "logits/rejected": -2.439427375793457, "logps/chosen": -86.3087387084961, "logps/rejected": -118.8100814819336, "loss": 122684.5, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.028887376189231873, "rewards/margins": 0.024274542927742004, "rewards/rejected": -0.05316191911697388, "step": 1390 }, { "epoch": 0.56, "grad_norm": 7340403.161227933, "learning_rate": 2.4444444444444445e-07, "logits/chosen": -2.3746609687805176, "logits/rejected": -2.39011812210083, "logps/chosen": -112.76751708984375, "logps/rejected": -155.74038696289062, "loss": 121104.9875, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.032277911901474, "rewards/margins": 0.032715652137994766, "rewards/rejected": -0.06499356776475906, "step": 1400 }, { "epoch": 0.564, "grad_norm": 7097101.153173888, "learning_rate": 2.4222222222222224e-07, "logits/chosen": -2.440396547317505, "logits/rejected": -2.4504265785217285, "logps/chosen": -104.66170501708984, "logps/rejected": -160.94100952148438, "loss": 117720.675, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.019881997257471085, "rewards/margins": 0.04168248176574707, "rewards/rejected": -0.06156448274850845, "step": 1410 }, { "epoch": 0.568, "grad_norm": 5809898.3420226695, "learning_rate": 2.4e-07, "logits/chosen": -2.445601463317871, "logits/rejected": -2.4264917373657227, "logps/chosen": -95.17555236816406, "logps/rejected": -143.88890075683594, "loss": 120431.9875, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.03070194646716118, "rewards/margins": 0.04166869446635246, "rewards/rejected": -0.07237063348293304, "step": 1420 }, { "epoch": 0.572, "grad_norm": 4988644.975711128, "learning_rate": 2.3777777777777777e-07, "logits/chosen": -2.4375877380371094, "logits/rejected": -2.441622257232666, "logps/chosen": -90.29289245605469, "logps/rejected": -128.98793029785156, "loss": 119927.5, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.027714330703020096, "rewards/margins": 0.029140587896108627, "rewards/rejected": -0.056854914873838425, "step": 1430 }, { "epoch": 0.576, "grad_norm": 7822455.89349568, "learning_rate": 2.3555555555555554e-07, "logits/chosen": -2.458700656890869, "logits/rejected": -2.4986705780029297, "logps/chosen": -117.4685287475586, "logps/rejected": -151.8163299560547, "loss": 123864.1, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.03203599527478218, "rewards/margins": 0.031274113804101944, "rewards/rejected": -0.06331010907888412, "step": 1440 }, { "epoch": 0.58, "grad_norm": 14175243.183000157, "learning_rate": 2.3333333333333333e-07, "logits/chosen": -2.4828193187713623, "logits/rejected": -2.479646682739258, "logps/chosen": -103.30030822753906, "logps/rejected": -158.0785369873047, "loss": 124849.6125, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.032672982662916183, "rewards/margins": 0.04446934536099434, "rewards/rejected": -0.07714232802391052, "step": 1450 }, { "epoch": 0.584, "grad_norm": 6091294.506455895, "learning_rate": 2.311111111111111e-07, "logits/chosen": -2.39980411529541, "logits/rejected": -2.382380723953247, "logps/chosen": -95.15815734863281, "logps/rejected": -165.42840576171875, "loss": 123090.975, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.027900245040655136, "rewards/margins": 0.05805187299847603, "rewards/rejected": -0.08595212548971176, "step": 1460 }, { "epoch": 0.588, "grad_norm": 6711707.485572769, "learning_rate": 2.288888888888889e-07, "logits/chosen": -2.4506986141204834, "logits/rejected": -2.460022449493408, "logps/chosen": -99.18559265136719, "logps/rejected": -133.17782592773438, "loss": 120549.275, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.02819480374455452, "rewards/margins": 0.026910748332738876, "rewards/rejected": -0.0551055483520031, "step": 1470 }, { "epoch": 0.592, "grad_norm": 7790254.418823996, "learning_rate": 2.2666666666666663e-07, "logits/chosen": -2.512367010116577, "logits/rejected": -2.5243959426879883, "logps/chosen": -124.3129653930664, "logps/rejected": -171.23019409179688, "loss": 119269.2875, "rewards/accuracies": 0.5625, "rewards/chosen": -0.03208141773939133, "rewards/margins": 0.038219161331653595, "rewards/rejected": -0.07030057162046432, "step": 1480 }, { "epoch": 0.596, "grad_norm": 7436386.208544868, "learning_rate": 2.2444444444444442e-07, "logits/chosen": -2.45894193649292, "logits/rejected": -2.438983678817749, "logps/chosen": -81.72193908691406, "logps/rejected": -120.81268310546875, "loss": 119383.7875, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.023425322026014328, "rewards/margins": 0.031169379130005836, "rewards/rejected": -0.054594703018665314, "step": 1490 }, { "epoch": 0.6, "grad_norm": 6098566.562808592, "learning_rate": 2.222222222222222e-07, "logits/chosen": -2.4491584300994873, "logits/rejected": -2.4278082847595215, "logps/chosen": -109.7543716430664, "logps/rejected": -137.11195373535156, "loss": 122590.0875, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.03018758073449135, "rewards/margins": 0.019970091059803963, "rewards/rejected": -0.05015767365694046, "step": 1500 }, { "epoch": 0.604, "grad_norm": 5676696.337246529, "learning_rate": 2.1999999999999998e-07, "logits/chosen": -2.3899359703063965, "logits/rejected": -2.3801169395446777, "logps/chosen": -125.1462173461914, "logps/rejected": -157.1474151611328, "loss": 123626.575, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.035767387598752975, "rewards/margins": 0.029546618461608887, "rewards/rejected": -0.06531400978565216, "step": 1510 }, { "epoch": 0.608, "grad_norm": 6408066.768940639, "learning_rate": 2.1777777777777775e-07, "logits/chosen": -2.511399984359741, "logits/rejected": -2.556304693222046, "logps/chosen": -102.42405700683594, "logps/rejected": -140.045654296875, "loss": 124030.1, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.026485705748200417, "rewards/margins": 0.027697976678609848, "rewards/rejected": -0.054183680564165115, "step": 1520 }, { "epoch": 0.612, "grad_norm": 5703068.150369112, "learning_rate": 2.1555555555555554e-07, "logits/chosen": -2.328718900680542, "logits/rejected": -2.3354427814483643, "logps/chosen": -104.07130432128906, "logps/rejected": -145.46546936035156, "loss": 119267.475, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.035999588668346405, "rewards/margins": 0.03417082130908966, "rewards/rejected": -0.07017041742801666, "step": 1530 }, { "epoch": 0.616, "grad_norm": 6771552.786870031, "learning_rate": 2.1333333333333334e-07, "logits/chosen": -2.3548483848571777, "logits/rejected": -2.3827786445617676, "logps/chosen": -101.12074279785156, "logps/rejected": -159.588623046875, "loss": 122909.8, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.030131155624985695, "rewards/margins": 0.04275660961866379, "rewards/rejected": -0.07288776338100433, "step": 1540 }, { "epoch": 0.62, "grad_norm": 6949724.758162161, "learning_rate": 2.111111111111111e-07, "logits/chosen": -2.3255906105041504, "logits/rejected": -2.3439719676971436, "logps/chosen": -108.85466003417969, "logps/rejected": -158.19845581054688, "loss": 123216.3625, "rewards/accuracies": 0.5625, "rewards/chosen": -0.02804810181260109, "rewards/margins": 0.046050988137722015, "rewards/rejected": -0.0740990936756134, "step": 1550 }, { "epoch": 0.624, "grad_norm": 9265006.397727864, "learning_rate": 2.088888888888889e-07, "logits/chosen": -2.3858678340911865, "logits/rejected": -2.3714261054992676, "logps/chosen": -107.79356384277344, "logps/rejected": -142.68203735351562, "loss": 128108.675, "rewards/accuracies": 0.625, "rewards/chosen": -0.0350666344165802, "rewards/margins": 0.025052737444639206, "rewards/rejected": -0.0601193793118, "step": 1560 }, { "epoch": 0.628, "grad_norm": 8448160.384168701, "learning_rate": 2.0666666666666666e-07, "logits/chosen": -2.302703857421875, "logits/rejected": -2.317382335662842, "logps/chosen": -106.1356201171875, "logps/rejected": -165.3655242919922, "loss": 119009.5625, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.02817341312766075, "rewards/margins": 0.043338801711797714, "rewards/rejected": -0.07151221483945847, "step": 1570 }, { "epoch": 0.632, "grad_norm": 6797346.176978063, "learning_rate": 2.0444444444444446e-07, "logits/chosen": -2.3419301509857178, "logits/rejected": -2.360405683517456, "logps/chosen": -100.89833068847656, "logps/rejected": -136.70143127441406, "loss": 121410.0125, "rewards/accuracies": 0.625, "rewards/chosen": -0.026873702183365822, "rewards/margins": 0.03099043294787407, "rewards/rejected": -0.05786413699388504, "step": 1580 }, { "epoch": 0.636, "grad_norm": 7957900.602760819, "learning_rate": 2.022222222222222e-07, "logits/chosen": -2.325685739517212, "logits/rejected": -2.3536834716796875, "logps/chosen": -86.800048828125, "logps/rejected": -118.13228607177734, "loss": 121560.0125, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.027103954926133156, "rewards/margins": 0.023416386917233467, "rewards/rejected": -0.050520338118076324, "step": 1590 }, { "epoch": 0.64, "grad_norm": 7524893.989731965, "learning_rate": 2e-07, "logits/chosen": -2.379504919052124, "logits/rejected": -2.364494562149048, "logps/chosen": -92.89854431152344, "logps/rejected": -144.94566345214844, "loss": 118599.6, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.027095776051282883, "rewards/margins": 0.04017645865678787, "rewards/rejected": -0.06727223098278046, "step": 1600 }, { "epoch": 0.644, "grad_norm": 7808682.74577819, "learning_rate": 1.9777777777777776e-07, "logits/chosen": -2.462635040283203, "logits/rejected": -2.3920907974243164, "logps/chosen": -123.39002990722656, "logps/rejected": -168.66348266601562, "loss": 123616.1375, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.02962644398212433, "rewards/margins": 0.026749838143587112, "rewards/rejected": -0.05637627840042114, "step": 1610 }, { "epoch": 0.648, "grad_norm": 7508118.164730088, "learning_rate": 1.9555555555555555e-07, "logits/chosen": -2.3979110717773438, "logits/rejected": -2.394200563430786, "logps/chosen": -113.19117736816406, "logps/rejected": -152.71365356445312, "loss": 122617.65, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.0326806977391243, "rewards/margins": 0.03160088136792183, "rewards/rejected": -0.06428157538175583, "step": 1620 }, { "epoch": 0.652, "grad_norm": 9886937.518719045, "learning_rate": 1.9333333333333332e-07, "logits/chosen": -2.4126977920532227, "logits/rejected": -2.4325013160705566, "logps/chosen": -91.72865295410156, "logps/rejected": -134.23829650878906, "loss": 125535.1625, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.029735634103417397, "rewards/margins": 0.03641930967569351, "rewards/rejected": -0.06615494191646576, "step": 1630 }, { "epoch": 0.656, "grad_norm": 5382013.583341201, "learning_rate": 1.911111111111111e-07, "logits/chosen": -2.3764185905456543, "logits/rejected": -2.382178544998169, "logps/chosen": -91.31561279296875, "logps/rejected": -137.80345153808594, "loss": 111719.4375, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.01971900835633278, "rewards/margins": 0.04298390820622444, "rewards/rejected": -0.06270290911197662, "step": 1640 }, { "epoch": 0.66, "grad_norm": 6409699.176115095, "learning_rate": 1.8888888888888888e-07, "logits/chosen": -2.330606698989868, "logits/rejected": -2.312016010284424, "logps/chosen": -114.25804138183594, "logps/rejected": -165.2264404296875, "loss": 122533.2125, "rewards/accuracies": 0.625, "rewards/chosen": -0.026577278971672058, "rewards/margins": 0.0414334274828434, "rewards/rejected": -0.06801070272922516, "step": 1650 }, { "epoch": 0.664, "grad_norm": 6977532.54737261, "learning_rate": 1.8666666666666667e-07, "logits/chosen": -2.320591449737549, "logits/rejected": -2.3048043251037598, "logps/chosen": -117.83941650390625, "logps/rejected": -176.08311462402344, "loss": 118259.3375, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.030085504055023193, "rewards/margins": 0.03497748449444771, "rewards/rejected": -0.0650629848241806, "step": 1660 }, { "epoch": 0.668, "grad_norm": 7151487.887768503, "learning_rate": 1.8444444444444444e-07, "logits/chosen": -2.325629949569702, "logits/rejected": -2.37601637840271, "logps/chosen": -95.17213439941406, "logps/rejected": -149.96546936035156, "loss": 117774.0875, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.028405601158738136, "rewards/margins": 0.04639150947332382, "rewards/rejected": -0.07479710876941681, "step": 1670 }, { "epoch": 0.672, "grad_norm": 5729479.50083699, "learning_rate": 1.8222222222222223e-07, "logits/chosen": -2.4593453407287598, "logits/rejected": -2.5003867149353027, "logps/chosen": -96.59799194335938, "logps/rejected": -145.85215759277344, "loss": 123032.9, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.03039277158677578, "rewards/margins": 0.034516870975494385, "rewards/rejected": -0.06490965187549591, "step": 1680 }, { "epoch": 0.676, "grad_norm": 6685189.148553702, "learning_rate": 1.8e-07, "logits/chosen": -2.420285224914551, "logits/rejected": -2.405721426010132, "logps/chosen": -101.75447082519531, "logps/rejected": -151.68434143066406, "loss": 121099.5375, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.033398691564798355, "rewards/margins": 0.04428454115986824, "rewards/rejected": -0.0776832327246666, "step": 1690 }, { "epoch": 0.68, "grad_norm": 6727764.202542203, "learning_rate": 1.7777777777777776e-07, "logits/chosen": -2.5021843910217285, "logits/rejected": -2.475396156311035, "logps/chosen": -103.27125549316406, "logps/rejected": -146.76480102539062, "loss": 124458.15, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.027325114235281944, "rewards/margins": 0.04307966306805611, "rewards/rejected": -0.0704047828912735, "step": 1700 }, { "epoch": 0.684, "grad_norm": 8951493.717829395, "learning_rate": 1.7555555555555553e-07, "logits/chosen": -2.359651565551758, "logits/rejected": -2.3753628730773926, "logps/chosen": -92.47693634033203, "logps/rejected": -143.73085021972656, "loss": 122129.25, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.021263476461172104, "rewards/margins": 0.047465912997722626, "rewards/rejected": -0.06872939318418503, "step": 1710 }, { "epoch": 0.688, "grad_norm": 6117231.523934995, "learning_rate": 1.7333333333333332e-07, "logits/chosen": -2.3933348655700684, "logits/rejected": -2.424694538116455, "logps/chosen": -101.28050231933594, "logps/rejected": -161.67837524414062, "loss": 114482.8625, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.025342673063278198, "rewards/margins": 0.051556408405303955, "rewards/rejected": -0.07689908146858215, "step": 1720 }, { "epoch": 0.692, "grad_norm": 8463223.381087825, "learning_rate": 1.711111111111111e-07, "logits/chosen": -2.4435369968414307, "logits/rejected": -2.421454668045044, "logps/chosen": -94.87931823730469, "logps/rejected": -127.29423522949219, "loss": 117968.2625, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.030688803642988205, "rewards/margins": 0.025830263271927834, "rewards/rejected": -0.05651906877756119, "step": 1730 }, { "epoch": 0.696, "grad_norm": 8481439.735231701, "learning_rate": 1.6888888888888888e-07, "logits/chosen": -2.422682285308838, "logits/rejected": -2.4244956970214844, "logps/chosen": -103.77542877197266, "logps/rejected": -137.70925903320312, "loss": 120156.825, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.028543483465909958, "rewards/margins": 0.021968597546219826, "rewards/rejected": -0.05051208287477493, "step": 1740 }, { "epoch": 0.7, "grad_norm": 9216619.77120461, "learning_rate": 1.6666666666666665e-07, "logits/chosen": -2.505079507827759, "logits/rejected": -2.477112054824829, "logps/chosen": -105.45587158203125, "logps/rejected": -140.0381317138672, "loss": 113155.975, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.02943289838731289, "rewards/margins": 0.03403625637292862, "rewards/rejected": -0.06346915662288666, "step": 1750 }, { "epoch": 0.704, "grad_norm": 7248683.781183183, "learning_rate": 1.6444444444444444e-07, "logits/chosen": -2.480299949645996, "logits/rejected": -2.470823287963867, "logps/chosen": -122.30104064941406, "logps/rejected": -164.48951721191406, "loss": 117051.6, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.029943257570266724, "rewards/margins": 0.036385588347911835, "rewards/rejected": -0.06632884591817856, "step": 1760 }, { "epoch": 0.708, "grad_norm": 8402038.296433628, "learning_rate": 1.622222222222222e-07, "logits/chosen": -2.472791910171509, "logits/rejected": -2.4726855754852295, "logps/chosen": -114.65007019042969, "logps/rejected": -153.65573120117188, "loss": 120727.275, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.03354992717504501, "rewards/margins": 0.03721586614847183, "rewards/rejected": -0.07076579332351685, "step": 1770 }, { "epoch": 0.712, "grad_norm": 9719574.93632757, "learning_rate": 1.6e-07, "logits/chosen": -2.4326717853546143, "logits/rejected": -2.421159267425537, "logps/chosen": -115.16218566894531, "logps/rejected": -164.61166381835938, "loss": 122159.4125, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.031878646463155746, "rewards/margins": 0.04045126959681511, "rewards/rejected": -0.07232991605997086, "step": 1780 }, { "epoch": 0.716, "grad_norm": 7274636.012655509, "learning_rate": 1.5777777777777777e-07, "logits/chosen": -2.3492178916931152, "logits/rejected": -2.3669705390930176, "logps/chosen": -110.37937927246094, "logps/rejected": -148.16090393066406, "loss": 122808.1875, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.033610500395298004, "rewards/margins": 0.031820036470890045, "rewards/rejected": -0.06543054431676865, "step": 1790 }, { "epoch": 0.72, "grad_norm": 7960487.891716203, "learning_rate": 1.5555555555555556e-07, "logits/chosen": -2.5343594551086426, "logits/rejected": -2.509129762649536, "logps/chosen": -112.82719421386719, "logps/rejected": -139.89376831054688, "loss": 124301.7875, "rewards/accuracies": 0.5625, "rewards/chosen": -0.031808070838451385, "rewards/margins": 0.015915410593152046, "rewards/rejected": -0.04772348329424858, "step": 1800 }, { "epoch": 0.724, "grad_norm": 7184987.914163716, "learning_rate": 1.533333333333333e-07, "logits/chosen": -2.5088343620300293, "logits/rejected": -2.5166666507720947, "logps/chosen": -104.326904296875, "logps/rejected": -150.2886505126953, "loss": 128243.85, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.034587159752845764, "rewards/margins": 0.03759396821260452, "rewards/rejected": -0.07218112796545029, "step": 1810 }, { "epoch": 0.728, "grad_norm": 10039229.460908188, "learning_rate": 1.511111111111111e-07, "logits/chosen": -2.45367693901062, "logits/rejected": -2.4529166221618652, "logps/chosen": -107.16294860839844, "logps/rejected": -140.8982696533203, "loss": 124612.4625, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.03276178613305092, "rewards/margins": 0.03512474521994591, "rewards/rejected": -0.06788653880357742, "step": 1820 }, { "epoch": 0.732, "grad_norm": 7124197.534522493, "learning_rate": 1.4888888888888886e-07, "logits/chosen": -2.4414725303649902, "logits/rejected": -2.422440767288208, "logps/chosen": -84.65110778808594, "logps/rejected": -130.88641357421875, "loss": 125120.0625, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.018780285492539406, "rewards/margins": 0.03842353820800781, "rewards/rejected": -0.05720382183790207, "step": 1830 }, { "epoch": 0.736, "grad_norm": 9253360.097651359, "learning_rate": 1.4666666666666666e-07, "logits/chosen": -2.3342652320861816, "logits/rejected": -2.3714077472686768, "logps/chosen": -101.37190246582031, "logps/rejected": -128.89321899414062, "loss": 120892.2875, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.02679322101175785, "rewards/margins": 0.027122925966978073, "rewards/rejected": -0.053916145116090775, "step": 1840 }, { "epoch": 0.74, "grad_norm": 7423694.38438217, "learning_rate": 1.4444444444444442e-07, "logits/chosen": -2.4630966186523438, "logits/rejected": -2.413529634475708, "logps/chosen": -103.9755630493164, "logps/rejected": -133.18716430664062, "loss": 123775.5125, "rewards/accuracies": 0.625, "rewards/chosen": -0.03415101021528244, "rewards/margins": 0.02861020900309086, "rewards/rejected": -0.06276122480630875, "step": 1850 }, { "epoch": 0.744, "grad_norm": 5647373.604720096, "learning_rate": 1.4222222222222222e-07, "logits/chosen": -2.464050769805908, "logits/rejected": -2.501260757446289, "logps/chosen": -125.53248596191406, "logps/rejected": -175.7611083984375, "loss": 117425.8375, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.02989697828888893, "rewards/margins": 0.040645621716976166, "rewards/rejected": -0.0705425962805748, "step": 1860 }, { "epoch": 0.748, "grad_norm": 8230581.203213356, "learning_rate": 1.4e-07, "logits/chosen": -2.5151472091674805, "logits/rejected": -2.509429454803467, "logps/chosen": -92.0722885131836, "logps/rejected": -167.84060668945312, "loss": 119465.4875, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.02767838165163994, "rewards/margins": 0.05135294049978256, "rewards/rejected": -0.0790313258767128, "step": 1870 }, { "epoch": 0.752, "grad_norm": 12754476.984279104, "learning_rate": 1.3777777777777778e-07, "logits/chosen": -2.5021023750305176, "logits/rejected": -2.484841823577881, "logps/chosen": -94.76568603515625, "logps/rejected": -139.48953247070312, "loss": 118982.3875, "rewards/accuracies": 0.625, "rewards/chosen": -0.02872481569647789, "rewards/margins": 0.03676723688840866, "rewards/rejected": -0.06549205631017685, "step": 1880 }, { "epoch": 0.756, "grad_norm": 6983911.433265186, "learning_rate": 1.3555555555555557e-07, "logits/chosen": -2.5138821601867676, "logits/rejected": -2.5218331813812256, "logps/chosen": -89.62753295898438, "logps/rejected": -128.04603576660156, "loss": 120175.3625, "rewards/accuracies": 0.625, "rewards/chosen": -0.025366436690092087, "rewards/margins": 0.028226271271705627, "rewards/rejected": -0.053592704236507416, "step": 1890 }, { "epoch": 0.76, "grad_norm": 7315179.12228925, "learning_rate": 1.3333333333333334e-07, "logits/chosen": -2.3722000122070312, "logits/rejected": -2.361008644104004, "logps/chosen": -106.6434326171875, "logps/rejected": -154.347412109375, "loss": 118594.25, "rewards/accuracies": 0.5625, "rewards/chosen": -0.028675338253378868, "rewards/margins": 0.03925652056932449, "rewards/rejected": -0.06793185323476791, "step": 1900 }, { "epoch": 0.764, "grad_norm": 6191576.733772953, "learning_rate": 1.3111111111111113e-07, "logits/chosen": -2.361506938934326, "logits/rejected": -2.416877269744873, "logps/chosen": -121.11296081542969, "logps/rejected": -182.23646545410156, "loss": 120340.05, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.027058254927396774, "rewards/margins": 0.04295941814780235, "rewards/rejected": -0.07001767307519913, "step": 1910 }, { "epoch": 0.768, "grad_norm": 7960326.332116376, "learning_rate": 1.2888888888888887e-07, "logits/chosen": -2.3773555755615234, "logits/rejected": -2.393650531768799, "logps/chosen": -86.46308135986328, "logps/rejected": -132.29776000976562, "loss": 122114.95, "rewards/accuracies": 0.625, "rewards/chosen": -0.02390897274017334, "rewards/margins": 0.037305351346731186, "rewards/rejected": -0.061214327812194824, "step": 1920 }, { "epoch": 0.772, "grad_norm": 8545237.759950696, "learning_rate": 1.2666666666666666e-07, "logits/chosen": -2.3730955123901367, "logits/rejected": -2.346529960632324, "logps/chosen": -98.66458129882812, "logps/rejected": -133.67532348632812, "loss": 121957.9, "rewards/accuracies": 0.6875, "rewards/chosen": -0.030175382271409035, "rewards/margins": 0.031160462647676468, "rewards/rejected": -0.06133584305644035, "step": 1930 }, { "epoch": 0.776, "grad_norm": 7541759.4377164915, "learning_rate": 1.2444444444444443e-07, "logits/chosen": -2.4905495643615723, "logits/rejected": -2.467737913131714, "logps/chosen": -97.59669494628906, "logps/rejected": -144.62152099609375, "loss": 120647.3875, "rewards/accuracies": 0.625, "rewards/chosen": -0.029411058872938156, "rewards/margins": 0.03424420207738876, "rewards/rejected": -0.06365526467561722, "step": 1940 }, { "epoch": 0.78, "grad_norm": 9641484.377151929, "learning_rate": 1.2222222222222222e-07, "logits/chosen": -2.49107027053833, "logits/rejected": -2.490995407104492, "logps/chosen": -101.06126403808594, "logps/rejected": -136.541748046875, "loss": 121525.775, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.026407599449157715, "rewards/margins": 0.03157456964254379, "rewards/rejected": -0.05798216909170151, "step": 1950 }, { "epoch": 0.784, "grad_norm": 5545873.184552746, "learning_rate": 1.2e-07, "logits/chosen": -2.5349438190460205, "logits/rejected": -2.5639233589172363, "logps/chosen": -100.9637222290039, "logps/rejected": -172.61062622070312, "loss": 119077.0375, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.019367219880223274, "rewards/margins": 0.057732999324798584, "rewards/rejected": -0.0771002247929573, "step": 1960 }, { "epoch": 0.788, "grad_norm": 9492873.423361877, "learning_rate": 1.1777777777777777e-07, "logits/chosen": -2.358701229095459, "logits/rejected": -2.379284381866455, "logps/chosen": -92.12281036376953, "logps/rejected": -166.2657012939453, "loss": 114722.3, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.02654215320944786, "rewards/margins": 0.06289757788181305, "rewards/rejected": -0.08943972736597061, "step": 1970 }, { "epoch": 0.792, "grad_norm": 8128359.015235812, "learning_rate": 1.1555555555555555e-07, "logits/chosen": -2.445798397064209, "logits/rejected": -2.463740110397339, "logps/chosen": -111.72492980957031, "logps/rejected": -154.72195434570312, "loss": 123011.1125, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.029620587825775146, "rewards/margins": 0.03493895009160042, "rewards/rejected": -0.06455953419208527, "step": 1980 }, { "epoch": 0.796, "grad_norm": 9306300.401598651, "learning_rate": 1.1333333333333332e-07, "logits/chosen": -2.417834520339966, "logits/rejected": -2.4220337867736816, "logps/chosen": -91.2249526977539, "logps/rejected": -162.97373962402344, "loss": 118218.9, "rewards/accuracies": 0.6875, "rewards/chosen": -0.023723283782601357, "rewards/margins": 0.057645224034786224, "rewards/rejected": -0.08136852085590363, "step": 1990 }, { "epoch": 0.8, "grad_norm": 8765512.962268472, "learning_rate": 1.111111111111111e-07, "logits/chosen": -2.40264630317688, "logits/rejected": -2.427499294281006, "logps/chosen": -96.80880737304688, "logps/rejected": -145.74375915527344, "loss": 120947.8375, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.02879420481622219, "rewards/margins": 0.04018041118979454, "rewards/rejected": -0.06897461414337158, "step": 2000 }, { "epoch": 0.804, "grad_norm": 7358207.507871911, "learning_rate": 1.0888888888888888e-07, "logits/chosen": -2.491158962249756, "logits/rejected": -2.4770848751068115, "logps/chosen": -109.10551452636719, "logps/rejected": -167.1204071044922, "loss": 116044.225, "rewards/accuracies": 0.6875, "rewards/chosen": -0.02867368422448635, "rewards/margins": 0.04405529797077179, "rewards/rejected": -0.07272897660732269, "step": 2010 }, { "epoch": 0.808, "grad_norm": 6847868.235200222, "learning_rate": 1.0666666666666667e-07, "logits/chosen": -2.4977283477783203, "logits/rejected": -2.4795002937316895, "logps/chosen": -92.58442687988281, "logps/rejected": -137.3270721435547, "loss": 120623.725, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.02764459326863289, "rewards/margins": 0.029529035091400146, "rewards/rejected": -0.057173628360033035, "step": 2020 }, { "epoch": 0.812, "grad_norm": 7345224.131775115, "learning_rate": 1.0444444444444445e-07, "logits/chosen": -2.581921100616455, "logits/rejected": -2.5818488597869873, "logps/chosen": -94.2405014038086, "logps/rejected": -151.82931518554688, "loss": 118029.625, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.026779672130942345, "rewards/margins": 0.04871240258216858, "rewards/rejected": -0.07549206912517548, "step": 2030 }, { "epoch": 0.816, "grad_norm": 8179129.121592561, "learning_rate": 1.0222222222222223e-07, "logits/chosen": -2.510425090789795, "logits/rejected": -2.5082054138183594, "logps/chosen": -98.96800231933594, "logps/rejected": -132.51150512695312, "loss": 119617.05, "rewards/accuracies": 0.6875, "rewards/chosen": -0.02764180861413479, "rewards/margins": 0.024158382788300514, "rewards/rejected": -0.0518001914024353, "step": 2040 }, { "epoch": 0.82, "grad_norm": 8464195.380143736, "learning_rate": 1e-07, "logits/chosen": -2.4244985580444336, "logits/rejected": -2.43390154838562, "logps/chosen": -106.4210205078125, "logps/rejected": -164.44268798828125, "loss": 120873.525, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.029137443751096725, "rewards/margins": 0.049099259078502655, "rewards/rejected": -0.07823669910430908, "step": 2050 }, { "epoch": 0.824, "grad_norm": 7635743.090822039, "learning_rate": 9.777777777777778e-08, "logits/chosen": -2.45817494392395, "logits/rejected": -2.472324848175049, "logps/chosen": -102.89268493652344, "logps/rejected": -155.74337768554688, "loss": 120353.5125, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.0331621877849102, "rewards/margins": 0.045612066984176636, "rewards/rejected": -0.07877425849437714, "step": 2060 }, { "epoch": 0.828, "grad_norm": 6226582.326517582, "learning_rate": 9.555555555555556e-08, "logits/chosen": -2.4659340381622314, "logits/rejected": -2.475663661956787, "logps/chosen": -94.48152923583984, "logps/rejected": -128.58985900878906, "loss": 123484.35, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.02351956069469452, "rewards/margins": 0.03220217674970627, "rewards/rejected": -0.05572172999382019, "step": 2070 }, { "epoch": 0.832, "grad_norm": 9949795.202652398, "learning_rate": 9.333333333333334e-08, "logits/chosen": -2.4222323894500732, "logits/rejected": -2.42124342918396, "logps/chosen": -103.56013488769531, "logps/rejected": -154.19448852539062, "loss": 122220.075, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.032791610807180405, "rewards/margins": 0.03772992268204689, "rewards/rejected": -0.0705215334892273, "step": 2080 }, { "epoch": 0.836, "grad_norm": 7095174.456895947, "learning_rate": 9.111111111111112e-08, "logits/chosen": -2.5458078384399414, "logits/rejected": -2.5440893173217773, "logps/chosen": -104.78141784667969, "logps/rejected": -135.82858276367188, "loss": 121374.0375, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.026525821536779404, "rewards/margins": 0.02192816510796547, "rewards/rejected": -0.04845398664474487, "step": 2090 }, { "epoch": 0.84, "grad_norm": 10736924.474295698, "learning_rate": 8.888888888888888e-08, "logits/chosen": -2.4108989238739014, "logits/rejected": -2.442476749420166, "logps/chosen": -109.81797790527344, "logps/rejected": -154.10580444335938, "loss": 118612.975, "rewards/accuracies": 0.625, "rewards/chosen": -0.030571172013878822, "rewards/margins": 0.04257971793413162, "rewards/rejected": -0.0731508880853653, "step": 2100 }, { "epoch": 0.844, "grad_norm": 7793006.873257276, "learning_rate": 8.666666666666666e-08, "logits/chosen": -2.408449411392212, "logits/rejected": -2.401078939437866, "logps/chosen": -98.3482894897461, "logps/rejected": -144.44593811035156, "loss": 119906.3125, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.025788147002458572, "rewards/margins": 0.0393107533454895, "rewards/rejected": -0.06509890407323837, "step": 2110 }, { "epoch": 0.848, "grad_norm": 8262242.910041632, "learning_rate": 8.444444444444444e-08, "logits/chosen": -2.328664779663086, "logits/rejected": -2.3485660552978516, "logps/chosen": -115.07906341552734, "logps/rejected": -163.2090301513672, "loss": 118916.3625, "rewards/accuracies": 0.625, "rewards/chosen": -0.028421631082892418, "rewards/margins": 0.04112589359283447, "rewards/rejected": -0.06954751908779144, "step": 2120 }, { "epoch": 0.852, "grad_norm": 7788645.367897111, "learning_rate": 8.222222222222222e-08, "logits/chosen": -2.381272554397583, "logits/rejected": -2.3988587856292725, "logps/chosen": -98.18142700195312, "logps/rejected": -158.4501190185547, "loss": 114372.2, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.027454352006316185, "rewards/margins": 0.04615364223718643, "rewards/rejected": -0.07360798865556717, "step": 2130 }, { "epoch": 0.856, "grad_norm": 7241736.462505716, "learning_rate": 8e-08, "logits/chosen": -2.384153127670288, "logits/rejected": -2.4060769081115723, "logps/chosen": -112.68023681640625, "logps/rejected": -133.3109588623047, "loss": 122776.875, "rewards/accuracies": 0.625, "rewards/chosen": -0.035701293498277664, "rewards/margins": 0.014023616909980774, "rewards/rejected": -0.04972491040825844, "step": 2140 }, { "epoch": 0.86, "grad_norm": 7653248.620162212, "learning_rate": 7.777777777777778e-08, "logits/chosen": -2.382563591003418, "logits/rejected": -2.4016215801239014, "logps/chosen": -91.17201232910156, "logps/rejected": -136.58473205566406, "loss": 117265.6125, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.025211047381162643, "rewards/margins": 0.03577146679162979, "rewards/rejected": -0.060982514172792435, "step": 2150 }, { "epoch": 0.864, "grad_norm": 7731438.19804926, "learning_rate": 7.555555555555555e-08, "logits/chosen": -2.3781442642211914, "logits/rejected": -2.3403193950653076, "logps/chosen": -121.6617431640625, "logps/rejected": -155.835205078125, "loss": 120368.2625, "rewards/accuracies": 0.625, "rewards/chosen": -0.027185499668121338, "rewards/margins": 0.031700123101472855, "rewards/rejected": -0.05888562276959419, "step": 2160 }, { "epoch": 0.868, "grad_norm": 8267606.777008629, "learning_rate": 7.333333333333333e-08, "logits/chosen": -2.3575565814971924, "logits/rejected": -2.386939287185669, "logps/chosen": -101.64804077148438, "logps/rejected": -148.3137664794922, "loss": 115195.7375, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.023973077535629272, "rewards/margins": 0.044489845633506775, "rewards/rejected": -0.06846292316913605, "step": 2170 }, { "epoch": 0.872, "grad_norm": 9005598.684897516, "learning_rate": 7.111111111111111e-08, "logits/chosen": -2.3442575931549072, "logits/rejected": -2.366135597229004, "logps/chosen": -103.1920166015625, "logps/rejected": -157.7998504638672, "loss": 118398.575, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.03024686500430107, "rewards/margins": 0.04585784301161766, "rewards/rejected": -0.07610471546649933, "step": 2180 }, { "epoch": 0.876, "grad_norm": 9533026.874445997, "learning_rate": 6.888888888888889e-08, "logits/chosen": -2.380309581756592, "logits/rejected": -2.356041431427002, "logps/chosen": -110.62892150878906, "logps/rejected": -141.9584197998047, "loss": 121318.95, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.03162779659032822, "rewards/margins": 0.02817652001976967, "rewards/rejected": -0.059804320335388184, "step": 2190 }, { "epoch": 0.88, "grad_norm": 10673576.986433594, "learning_rate": 6.666666666666667e-08, "logits/chosen": -2.3506739139556885, "logits/rejected": -2.372131586074829, "logps/chosen": -100.54646301269531, "logps/rejected": -135.2465362548828, "loss": 126468.65, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.03161459043622017, "rewards/margins": 0.02985607460141182, "rewards/rejected": -0.06147066876292229, "step": 2200 }, { "epoch": 0.884, "grad_norm": 8461999.794241536, "learning_rate": 6.444444444444443e-08, "logits/chosen": -2.362769842147827, "logits/rejected": -2.406325578689575, "logps/chosen": -97.97603607177734, "logps/rejected": -142.4342041015625, "loss": 127463.35, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.03243636339902878, "rewards/margins": 0.029870545491576195, "rewards/rejected": -0.062306903302669525, "step": 2210 }, { "epoch": 0.888, "grad_norm": 7368698.764464808, "learning_rate": 6.222222222222221e-08, "logits/chosen": -2.3991339206695557, "logits/rejected": -2.4037344455718994, "logps/chosen": -93.59549713134766, "logps/rejected": -162.4844970703125, "loss": 120745.575, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.027704676613211632, "rewards/margins": 0.05211573839187622, "rewards/rejected": -0.0798204094171524, "step": 2220 }, { "epoch": 0.892, "grad_norm": 8665468.38774931, "learning_rate": 6e-08, "logits/chosen": -2.4542791843414307, "logits/rejected": -2.4168543815612793, "logps/chosen": -93.68408966064453, "logps/rejected": -148.40516662597656, "loss": 118598.825, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.03159097954630852, "rewards/margins": 0.04323247820138931, "rewards/rejected": -0.07482346147298813, "step": 2230 }, { "epoch": 0.896, "grad_norm": 7933695.302057784, "learning_rate": 5.7777777777777775e-08, "logits/chosen": -2.3784477710723877, "logits/rejected": -2.3589086532592773, "logps/chosen": -92.63359832763672, "logps/rejected": -134.8274383544922, "loss": 120496.3875, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.028888177126646042, "rewards/margins": 0.03402668610215187, "rewards/rejected": -0.06291486322879791, "step": 2240 }, { "epoch": 0.9, "grad_norm": 9969155.359289682, "learning_rate": 5.555555555555555e-08, "logits/chosen": -2.2903988361358643, "logits/rejected": -2.3016340732574463, "logps/chosen": -85.76798248291016, "logps/rejected": -127.5031967163086, "loss": 120240.075, "rewards/accuracies": 0.6875, "rewards/chosen": -0.02621074579656124, "rewards/margins": 0.03313954919576645, "rewards/rejected": -0.05935030058026314, "step": 2250 }, { "epoch": 0.904, "grad_norm": 9157992.023374882, "learning_rate": 5.3333333333333334e-08, "logits/chosen": -2.336127996444702, "logits/rejected": -2.2945499420166016, "logps/chosen": -91.5769271850586, "logps/rejected": -135.12843322753906, "loss": 120103.1125, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.027380824089050293, "rewards/margins": 0.03264584392309189, "rewards/rejected": -0.06002666801214218, "step": 2260 }, { "epoch": 0.908, "grad_norm": 8951108.084423833, "learning_rate": 5.1111111111111114e-08, "logits/chosen": -2.347435712814331, "logits/rejected": -2.3315415382385254, "logps/chosen": -113.215576171875, "logps/rejected": -128.22955322265625, "loss": 124138.05, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.032336726784706116, "rewards/margins": 0.017136305570602417, "rewards/rejected": -0.04947303608059883, "step": 2270 }, { "epoch": 0.912, "grad_norm": 8242314.92846825, "learning_rate": 4.888888888888889e-08, "logits/chosen": -2.4864022731781006, "logits/rejected": -2.4819133281707764, "logps/chosen": -105.97566223144531, "logps/rejected": -151.5330047607422, "loss": 120462.5125, "rewards/accuracies": 0.6875, "rewards/chosen": -0.026608863845467567, "rewards/margins": 0.0363970547914505, "rewards/rejected": -0.06300591677427292, "step": 2280 }, { "epoch": 0.916, "grad_norm": 9094101.689046768, "learning_rate": 4.666666666666667e-08, "logits/chosen": -2.4374148845672607, "logits/rejected": -2.4441134929656982, "logps/chosen": -103.56266784667969, "logps/rejected": -172.01528930664062, "loss": 121326.7125, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.02742699161171913, "rewards/margins": 0.06065355986356735, "rewards/rejected": -0.08808055520057678, "step": 2290 }, { "epoch": 0.92, "grad_norm": 10641032.764093434, "learning_rate": 4.444444444444444e-08, "logits/chosen": -2.4069132804870605, "logits/rejected": -2.4145545959472656, "logps/chosen": -82.15616607666016, "logps/rejected": -143.5443115234375, "loss": 115663.2125, "rewards/accuracies": 0.8125, "rewards/chosen": -0.02152777649462223, "rewards/margins": 0.0520428791642189, "rewards/rejected": -0.07357065379619598, "step": 2300 }, { "epoch": 0.924, "grad_norm": 12548436.959082082, "learning_rate": 4.222222222222222e-08, "logits/chosen": -2.448031187057495, "logits/rejected": -2.4767231941223145, "logps/chosen": -115.41275787353516, "logps/rejected": -162.3332977294922, "loss": 123697.6375, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.024739082902669907, "rewards/margins": 0.045056119561195374, "rewards/rejected": -0.06979519873857498, "step": 2310 }, { "epoch": 0.928, "grad_norm": 8095941.315550662, "learning_rate": 4e-08, "logits/chosen": -2.3472633361816406, "logits/rejected": -2.34629225730896, "logps/chosen": -96.70509338378906, "logps/rejected": -156.36058044433594, "loss": 123115.8875, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.02763884700834751, "rewards/margins": 0.04638766124844551, "rewards/rejected": -0.07402651011943817, "step": 2320 }, { "epoch": 0.932, "grad_norm": 9261661.294557055, "learning_rate": 3.7777777777777774e-08, "logits/chosen": -2.379624128341675, "logits/rejected": -2.39247727394104, "logps/chosen": -84.12887573242188, "logps/rejected": -139.15478515625, "loss": 116878.95, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.0225827656686306, "rewards/margins": 0.04350755736231804, "rewards/rejected": -0.06609033048152924, "step": 2330 }, { "epoch": 0.936, "grad_norm": 11782492.59177351, "learning_rate": 3.5555555555555554e-08, "logits/chosen": -2.453583240509033, "logits/rejected": -2.4496898651123047, "logps/chosen": -101.22218322753906, "logps/rejected": -146.49490356445312, "loss": 116143.6875, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.0275820791721344, "rewards/margins": 0.036730751395225525, "rewards/rejected": -0.06431283056735992, "step": 2340 }, { "epoch": 0.94, "grad_norm": 7788195.554054044, "learning_rate": 3.3333333333333334e-08, "logits/chosen": -2.3583855628967285, "logits/rejected": -2.325178623199463, "logps/chosen": -86.75674438476562, "logps/rejected": -152.60946655273438, "loss": 122277.5875, "rewards/accuracies": 0.6875, "rewards/chosen": -0.02565554343163967, "rewards/margins": 0.05285739153623581, "rewards/rejected": -0.07851293683052063, "step": 2350 }, { "epoch": 0.944, "grad_norm": 9203769.822900785, "learning_rate": 3.111111111111111e-08, "logits/chosen": -2.3866257667541504, "logits/rejected": -2.3973793983459473, "logps/chosen": -103.6191177368164, "logps/rejected": -177.63768005371094, "loss": 119365.9125, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.03137093037366867, "rewards/margins": 0.06034323573112488, "rewards/rejected": -0.09171417355537415, "step": 2360 }, { "epoch": 0.948, "grad_norm": 9423001.082938906, "learning_rate": 2.8888888888888887e-08, "logits/chosen": -2.467428684234619, "logits/rejected": -2.4925427436828613, "logps/chosen": -104.86148834228516, "logps/rejected": -129.6764373779297, "loss": 123624.5625, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.030190538614988327, "rewards/margins": 0.025643909350037575, "rewards/rejected": -0.05583444982767105, "step": 2370 }, { "epoch": 0.952, "grad_norm": 8004704.995606078, "learning_rate": 2.6666666666666667e-08, "logits/chosen": -2.478123188018799, "logits/rejected": -2.4850118160247803, "logps/chosen": -78.22926330566406, "logps/rejected": -149.72119140625, "loss": 111276.7625, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.020721960812807083, "rewards/margins": 0.06527476012706757, "rewards/rejected": -0.08599671721458435, "step": 2380 }, { "epoch": 0.956, "grad_norm": 9750139.945790045, "learning_rate": 2.4444444444444444e-08, "logits/chosen": -2.4585745334625244, "logits/rejected": -2.4466397762298584, "logps/chosen": -103.5519790649414, "logps/rejected": -164.13674926757812, "loss": 121310.4375, "rewards/accuracies": 0.625, "rewards/chosen": -0.0302131325006485, "rewards/margins": 0.04971124976873398, "rewards/rejected": -0.07992438226938248, "step": 2390 }, { "epoch": 0.96, "grad_norm": 7278490.232350556, "learning_rate": 2.222222222222222e-08, "logits/chosen": -2.43925404548645, "logits/rejected": -2.438615322113037, "logps/chosen": -105.12055969238281, "logps/rejected": -164.0720977783203, "loss": 118679.0375, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.02877245843410492, "rewards/margins": 0.052051056176424026, "rewards/rejected": -0.08082351088523865, "step": 2400 }, { "epoch": 0.964, "grad_norm": 10483716.8937703, "learning_rate": 2e-08, "logits/chosen": -2.4821510314941406, "logits/rejected": -2.46364426612854, "logps/chosen": -101.2098159790039, "logps/rejected": -143.97183227539062, "loss": 117960.6, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.03032396174967289, "rewards/margins": 0.03740059584379196, "rewards/rejected": -0.0677245557308197, "step": 2410 }, { "epoch": 0.968, "grad_norm": 12173721.82019396, "learning_rate": 1.7777777777777777e-08, "logits/chosen": -2.379589557647705, "logits/rejected": -2.4067189693450928, "logps/chosen": -122.52884674072266, "logps/rejected": -172.4658966064453, "loss": 124411.0625, "rewards/accuracies": 0.6875, "rewards/chosen": -0.03309926018118858, "rewards/margins": 0.042371779680252075, "rewards/rejected": -0.07547104358673096, "step": 2420 }, { "epoch": 0.972, "grad_norm": 9706266.63311398, "learning_rate": 1.5555555555555554e-08, "logits/chosen": -2.436565637588501, "logits/rejected": -2.4684276580810547, "logps/chosen": -109.63997650146484, "logps/rejected": -152.08335876464844, "loss": 122077.7, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.03295578807592392, "rewards/margins": 0.03596381098031998, "rewards/rejected": -0.0689195990562439, "step": 2430 }, { "epoch": 0.976, "grad_norm": 5892646.249318525, "learning_rate": 1.3333333333333334e-08, "logits/chosen": -2.531766891479492, "logits/rejected": -2.5235583782196045, "logps/chosen": -125.18925476074219, "logps/rejected": -160.97665405273438, "loss": 123783.1, "rewards/accuracies": 0.75, "rewards/chosen": -0.026068557053804398, "rewards/margins": 0.030096372589468956, "rewards/rejected": -0.0561649315059185, "step": 2440 }, { "epoch": 0.98, "grad_norm": 10534775.155367365, "learning_rate": 1.111111111111111e-08, "logits/chosen": -2.369849443435669, "logits/rejected": -2.381124258041382, "logps/chosen": -103.98789978027344, "logps/rejected": -156.1312255859375, "loss": 119931.5125, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.02474259026348591, "rewards/margins": 0.03790457919239998, "rewards/rejected": -0.06264716386795044, "step": 2450 }, { "epoch": 0.984, "grad_norm": 9946607.778478429, "learning_rate": 8.888888888888889e-09, "logits/chosen": -2.487614393234253, "logits/rejected": -2.465937614440918, "logps/chosen": -111.37520599365234, "logps/rejected": -137.24546813964844, "loss": 124467.0125, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.03445830196142197, "rewards/margins": 0.02246803045272827, "rewards/rejected": -0.05692633241415024, "step": 2460 }, { "epoch": 0.988, "grad_norm": 12838026.989788342, "learning_rate": 6.666666666666667e-09, "logits/chosen": -2.3559987545013428, "logits/rejected": -2.3515543937683105, "logps/chosen": -103.37791442871094, "logps/rejected": -170.76101684570312, "loss": 123802.275, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.028120238333940506, "rewards/margins": 0.047586239874362946, "rewards/rejected": -0.07570647448301315, "step": 2470 }, { "epoch": 0.992, "grad_norm": 8735124.614081156, "learning_rate": 4.444444444444444e-09, "logits/chosen": -2.4489405155181885, "logits/rejected": -2.4833984375, "logps/chosen": -95.29798889160156, "logps/rejected": -143.02792358398438, "loss": 118030.2125, "rewards/accuracies": 0.75, "rewards/chosen": -0.025047576054930687, "rewards/margins": 0.04086794704198837, "rewards/rejected": -0.06591552495956421, "step": 2480 }, { "epoch": 0.996, "grad_norm": 9991241.9099312, "learning_rate": 2.222222222222222e-09, "logits/chosen": -2.351677417755127, "logits/rejected": -2.272353410720825, "logps/chosen": -93.49928283691406, "logps/rejected": -141.19610595703125, "loss": 118406.9625, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.02804369106888771, "rewards/margins": 0.03656899183988571, "rewards/rejected": -0.06461267918348312, "step": 2490 }, { "epoch": 1.0, "grad_norm": 11977840.992411703, "learning_rate": 0.0, "logits/chosen": -2.5079355239868164, "logits/rejected": -2.510873556137085, "logps/chosen": -100.9855728149414, "logps/rejected": -157.40550231933594, "loss": 117766.25, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.029782477766275406, "rewards/margins": 0.0452786386013031, "rewards/rejected": -0.07506111264228821, "step": 2500 } ], "logging_steps": 10, "max_steps": 2500, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }