{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 100, "global_step": 2495, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0004008016032064128, "grad_norm": 3947236.9376629265, "learning_rate": 2e-09, "logits/chosen": -2.5464653968811035, "logits/rejected": -2.4981484413146973, "logps/chosen": -136.25015258789062, "logps/rejected": -109.48806762695312, "loss": 125090.2344, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.004008016032064128, "grad_norm": 4131433.2054548617, "learning_rate": 2e-08, "logits/chosen": -2.444033145904541, "logits/rejected": -2.4536919593811035, "logps/chosen": -96.72305297851562, "logps/rejected": -102.78682708740234, "loss": 128262.9167, "rewards/accuracies": 0.4861111044883728, "rewards/chosen": 1.4120871128397994e-05, "rewards/margins": 5.588051863014698e-06, "rewards/rejected": 8.532813808415085e-06, "step": 10 }, { "epoch": 0.008016032064128256, "grad_norm": 4082913.592970218, "learning_rate": 4e-08, "logits/chosen": -2.470759153366089, "logits/rejected": -2.4879543781280518, "logps/chosen": -82.20399475097656, "logps/rejected": -95.1635513305664, "loss": 128748.0, "rewards/accuracies": 0.5, "rewards/chosen": -0.00010425537766423076, "rewards/margins": -2.428081279504113e-05, "rewards/rejected": -7.997456850716844e-05, "step": 20 }, { "epoch": 0.012024048096192385, "grad_norm": 3699942.5510203396, "learning_rate": 6e-08, "logits/chosen": -2.3510866165161133, "logits/rejected": -2.3375275135040283, "logps/chosen": -90.31131744384766, "logps/rejected": -91.30790710449219, "loss": 128316.2875, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -8.024895214475691e-05, "rewards/margins": 9.57045704126358e-05, "rewards/rejected": -0.00017595352255739272, "step": 30 }, { "epoch": 0.01603206412825651, "grad_norm": 3526899.1699538147, "learning_rate": 8e-08, "logits/chosen": -2.4227395057678223, "logits/rejected": -2.427928924560547, "logps/chosen": -74.48422241210938, "logps/rejected": -77.61112213134766, "loss": 126226.3125, "rewards/accuracies": 0.5, "rewards/chosen": -0.00019940172205679119, "rewards/margins": -5.970364873064682e-05, "rewards/rejected": -0.00013969806605018675, "step": 40 }, { "epoch": 0.02004008016032064, "grad_norm": 4491084.202014744, "learning_rate": 1e-07, "logits/chosen": -2.4444451332092285, "logits/rejected": -2.4303643703460693, "logps/chosen": -82.40409088134766, "logps/rejected": -85.80543518066406, "loss": 129160.9375, "rewards/accuracies": 0.5625, "rewards/chosen": -0.00018717416969593614, "rewards/margins": 0.00014039597590453923, "rewards/rejected": -0.0003275701601523906, "step": 50 }, { "epoch": 0.02404809619238477, "grad_norm": 3403622.9497329933, "learning_rate": 1.2e-07, "logits/chosen": -2.4172229766845703, "logits/rejected": -2.395040988922119, "logps/chosen": -93.23040008544922, "logps/rejected": -97.07014465332031, "loss": 125728.05, "rewards/accuracies": 0.4124999940395355, "rewards/chosen": -0.0002615585399325937, "rewards/margins": -0.0001315469853579998, "rewards/rejected": -0.00013001154002267867, "step": 60 }, { "epoch": 0.028056112224448898, "grad_norm": 4044511.014833911, "learning_rate": 1.4e-07, "logits/chosen": -2.3989176750183105, "logits/rejected": -2.3830370903015137, "logps/chosen": -101.8390121459961, "logps/rejected": -108.08101654052734, "loss": 130639.9625, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.00044045006507076323, "rewards/margins": -9.915141708916053e-05, "rewards/rejected": -0.00034129866980947554, "step": 70 }, { "epoch": 0.03206412825651302, "grad_norm": 5583101.963034321, "learning_rate": 1.6e-07, "logits/chosen": -2.4113218784332275, "logits/rejected": -2.394385814666748, "logps/chosen": -93.59923553466797, "logps/rejected": -96.75505828857422, "loss": 133003.7125, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.00045990836224518716, "rewards/margins": 0.0003325659781694412, "rewards/rejected": -0.0007924743695184588, "step": 80 }, { "epoch": 0.036072144288577156, "grad_norm": 6062402.798193364, "learning_rate": 1.8e-07, "logits/chosen": -2.2995922565460205, "logits/rejected": -2.2753472328186035, "logps/chosen": -83.42291259765625, "logps/rejected": -92.72061920166016, "loss": 130032.6125, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.001521119149401784, "rewards/margins": 0.0012046361807733774, "rewards/rejected": -0.0027257553301751614, "step": 90 }, { "epoch": 0.04008016032064128, "grad_norm": 6312836.0228954, "learning_rate": 2e-07, "logits/chosen": -2.2923099994659424, "logits/rejected": -2.303053379058838, "logps/chosen": -107.588134765625, "logps/rejected": -121.3271484375, "loss": 129288.9125, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.009896589443087578, "rewards/margins": 0.0033134943805634975, "rewards/rejected": -0.013210085220634937, "step": 100 }, { "epoch": 0.04408817635270541, "grad_norm": 5590976.547776195, "learning_rate": 2.1999999999999998e-07, "logits/chosen": -2.3039541244506836, "logits/rejected": -2.3309550285339355, "logps/chosen": -112.95283508300781, "logps/rejected": -126.42842102050781, "loss": 128575.45, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.012949028983712196, "rewards/margins": 0.0049244253896176815, "rewards/rejected": -0.017873454838991165, "step": 110 }, { "epoch": 0.04809619238476954, "grad_norm": 9019678.387801899, "learning_rate": 2.4e-07, "logits/chosen": -2.1888508796691895, "logits/rejected": -2.189389705657959, "logps/chosen": -112.77528381347656, "logps/rejected": -131.09449768066406, "loss": 124154.3125, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.02229696698486805, "rewards/margins": 0.01256654690951109, "rewards/rejected": -0.034863512963056564, "step": 120 }, { "epoch": 0.052104208416833664, "grad_norm": 7017232.640334902, "learning_rate": 2.6e-07, "logits/chosen": -2.3326258659362793, "logits/rejected": -2.3331451416015625, "logps/chosen": -103.95518493652344, "logps/rejected": -114.6365966796875, "loss": 127130.725, "rewards/accuracies": 0.5625, "rewards/chosen": -0.01884140633046627, "rewards/margins": 0.0011669672094285488, "rewards/rejected": -0.020008374005556107, "step": 130 }, { "epoch": 0.056112224448897796, "grad_norm": 8047688.165085967, "learning_rate": 2.8e-07, "logits/chosen": -2.2432637214660645, "logits/rejected": -2.2273428440093994, "logps/chosen": -100.75127410888672, "logps/rejected": -108.84329986572266, "loss": 125544.7375, "rewards/accuracies": 0.625, "rewards/chosen": -0.019932765513658524, "rewards/margins": 0.006595449987798929, "rewards/rejected": -0.026528215035796165, "step": 140 }, { "epoch": 0.06012024048096192, "grad_norm": 13389878.80906382, "learning_rate": 3e-07, "logits/chosen": -2.338097095489502, "logits/rejected": -2.334582805633545, "logps/chosen": -98.87701416015625, "logps/rejected": -128.06649780273438, "loss": 126277.5125, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.01577303186058998, "rewards/margins": 0.00698325177654624, "rewards/rejected": -0.022756287828087807, "step": 150 }, { "epoch": 0.06412825651302605, "grad_norm": 6547486.513058976, "learning_rate": 3.2e-07, "logits/chosen": -2.2872040271759033, "logits/rejected": -2.318220615386963, "logps/chosen": -107.16642761230469, "logps/rejected": -128.63902282714844, "loss": 130629.6125, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.009940323419868946, "rewards/margins": 0.004813443869352341, "rewards/rejected": -0.014753768220543861, "step": 160 }, { "epoch": 0.06813627254509018, "grad_norm": 6238269.898134831, "learning_rate": 3.4000000000000003e-07, "logits/chosen": -2.283688545227051, "logits/rejected": -2.269543170928955, "logps/chosen": -112.38804626464844, "logps/rejected": -124.51107025146484, "loss": 129337.075, "rewards/accuracies": 0.625, "rewards/chosen": -0.018186267465353012, "rewards/margins": 0.010698455385863781, "rewards/rejected": -0.028884723782539368, "step": 170 }, { "epoch": 0.07214428857715431, "grad_norm": 5333508.312286028, "learning_rate": 3.6e-07, "logits/chosen": -2.440035104751587, "logits/rejected": -2.416351318359375, "logps/chosen": -115.54080963134766, "logps/rejected": -128.35433959960938, "loss": 125903.175, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.01606156677007675, "rewards/margins": 0.0027716129552572966, "rewards/rejected": -0.018833179026842117, "step": 180 }, { "epoch": 0.07615230460921844, "grad_norm": 6363941.004816477, "learning_rate": 3.7999999999999996e-07, "logits/chosen": -2.302215337753296, "logits/rejected": -2.316080093383789, "logps/chosen": -95.27733612060547, "logps/rejected": -109.7214126586914, "loss": 128074.3375, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.013280262239277363, "rewards/margins": 0.004476197995245457, "rewards/rejected": -0.01775646023452282, "step": 190 }, { "epoch": 0.08016032064128256, "grad_norm": 6604467.189655725, "learning_rate": 4e-07, "logits/chosen": -2.4023125171661377, "logits/rejected": -2.403869152069092, "logps/chosen": -120.79121398925781, "logps/rejected": -123.46217346191406, "loss": 130248.2, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.009212212637066841, "rewards/margins": 0.0011829538270831108, "rewards/rejected": -0.010395165532827377, "step": 200 }, { "epoch": 0.0841683366733467, "grad_norm": 6023064.099431328, "learning_rate": 4.1999999999999995e-07, "logits/chosen": -2.456587553024292, "logits/rejected": -2.45320725440979, "logps/chosen": -97.8330307006836, "logps/rejected": -110.1967544555664, "loss": 132337.3375, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.010994483716785908, "rewards/margins": 0.003299609525129199, "rewards/rejected": -0.01429409347474575, "step": 210 }, { "epoch": 0.08817635270541083, "grad_norm": 6070060.076923608, "learning_rate": 4.3999999999999997e-07, "logits/chosen": -2.464625835418701, "logits/rejected": -2.4630608558654785, "logps/chosen": -101.9610366821289, "logps/rejected": -116.52901458740234, "loss": 122057.3875, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.015559064224362373, "rewards/margins": 0.006916286889463663, "rewards/rejected": -0.02247535064816475, "step": 220 }, { "epoch": 0.09218436873747494, "grad_norm": 6531050.132289726, "learning_rate": 4.6e-07, "logits/chosen": -2.5064964294433594, "logits/rejected": -2.4663119316101074, "logps/chosen": -113.6077880859375, "logps/rejected": -136.72740173339844, "loss": 126837.725, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.02166624739766121, "rewards/margins": 0.015731699764728546, "rewards/rejected": -0.03739794343709946, "step": 230 }, { "epoch": 0.09619238476953908, "grad_norm": 8249199.68586373, "learning_rate": 4.8e-07, "logits/chosen": -2.4191393852233887, "logits/rejected": -2.4175992012023926, "logps/chosen": -140.10438537597656, "logps/rejected": -161.83901977539062, "loss": 127447.7375, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.024259308353066444, "rewards/margins": 0.015592202544212341, "rewards/rejected": -0.039851509034633636, "step": 240 }, { "epoch": 0.10020040080160321, "grad_norm": 6273689.257003604, "learning_rate": 5e-07, "logits/chosen": -2.4710445404052734, "logits/rejected": -2.49545955657959, "logps/chosen": -125.069091796875, "logps/rejected": -141.25308227539062, "loss": 127510.825, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.02523133158683777, "rewards/margins": 0.014551195316016674, "rewards/rejected": -0.03978252038359642, "step": 250 }, { "epoch": 0.10420841683366733, "grad_norm": 4863381.252711604, "learning_rate": 4.97772828507795e-07, "logits/chosen": -2.508707046508789, "logits/rejected": -2.5528035163879395, "logps/chosen": -109.48052978515625, "logps/rejected": -121.25135803222656, "loss": 127932.3875, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.008669688366353512, "rewards/margins": 0.0026229789946228266, "rewards/rejected": -0.01129266805946827, "step": 260 }, { "epoch": 0.10821643286573146, "grad_norm": 7439978.719265488, "learning_rate": 4.955456570155902e-07, "logits/chosen": -2.661339282989502, "logits/rejected": -2.612370729446411, "logps/chosen": -104.54673767089844, "logps/rejected": -126.21573638916016, "loss": 129504.125, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.013199085369706154, "rewards/margins": 0.013015885837376118, "rewards/rejected": -0.026214972138404846, "step": 270 }, { "epoch": 0.11222444889779559, "grad_norm": 5652432.617099802, "learning_rate": 4.933184855233853e-07, "logits/chosen": -2.657796621322632, "logits/rejected": -2.6554348468780518, "logps/chosen": -120.7027587890625, "logps/rejected": -123.74530029296875, "loss": 133293.2, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.019954059273004532, "rewards/margins": 0.002449373248964548, "rewards/rejected": -0.022403430193662643, "step": 280 }, { "epoch": 0.11623246492985972, "grad_norm": 8254991.879081396, "learning_rate": 4.910913140311803e-07, "logits/chosen": -2.7399675846099854, "logits/rejected": -2.7726333141326904, "logps/chosen": -100.21595001220703, "logps/rejected": -131.60617065429688, "loss": 123595.3, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.015642058104276657, "rewards/margins": 0.015260448679327965, "rewards/rejected": -0.030902501195669174, "step": 290 }, { "epoch": 0.12024048096192384, "grad_norm": 8927344.576164661, "learning_rate": 4.888641425389755e-07, "logits/chosen": -2.6996548175811768, "logits/rejected": -2.74585223197937, "logps/chosen": -113.76595306396484, "logps/rejected": -144.14627075195312, "loss": 126853.8875, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.02661210298538208, "rewards/margins": 0.016480224207043648, "rewards/rejected": -0.04309232532978058, "step": 300 }, { "epoch": 0.12424849699398798, "grad_norm": 6996156.236412326, "learning_rate": 4.866369710467706e-07, "logits/chosen": -2.6147875785827637, "logits/rejected": -2.6017518043518066, "logps/chosen": -103.03385925292969, "logps/rejected": -116.8309097290039, "loss": 128838.425, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.0185169018805027, "rewards/margins": 0.004754557274281979, "rewards/rejected": -0.023271460086107254, "step": 310 }, { "epoch": 0.1282565130260521, "grad_norm": 7524659.80788171, "learning_rate": 4.844097995545656e-07, "logits/chosen": -2.696021556854248, "logits/rejected": -2.6883420944213867, "logps/chosen": -122.92547607421875, "logps/rejected": -142.40811157226562, "loss": 122181.3125, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.015210810117423534, "rewards/margins": 0.01843477226793766, "rewards/rejected": -0.03364557772874832, "step": 320 }, { "epoch": 0.13226452905811623, "grad_norm": 6072329.149131962, "learning_rate": 4.821826280623608e-07, "logits/chosen": -2.4829201698303223, "logits/rejected": -2.50555419921875, "logps/chosen": -117.76029968261719, "logps/rejected": -145.81637573242188, "loss": 124020.45, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.020629018545150757, "rewards/margins": 0.017869364470243454, "rewards/rejected": -0.03849838301539421, "step": 330 }, { "epoch": 0.13627254509018036, "grad_norm": 8789561.131088747, "learning_rate": 4.799554565701559e-07, "logits/chosen": -2.572274684906006, "logits/rejected": -2.599792957305908, "logps/chosen": -102.4592514038086, "logps/rejected": -136.51661682128906, "loss": 117515.9625, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.024631744250655174, "rewards/margins": 0.019154489040374756, "rewards/rejected": -0.04378623515367508, "step": 340 }, { "epoch": 0.1402805611222445, "grad_norm": 4632119.649847494, "learning_rate": 4.77728285077951e-07, "logits/chosen": -2.4988906383514404, "logits/rejected": -2.4986491203308105, "logps/chosen": -109.21330261230469, "logps/rejected": -121.016357421875, "loss": 126712.85, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.0240558423101902, "rewards/margins": 0.011534234508872032, "rewards/rejected": -0.035590074956417084, "step": 350 }, { "epoch": 0.14428857715430862, "grad_norm": 6705809.181653678, "learning_rate": 4.7550111358574605e-07, "logits/chosen": -2.439885377883911, "logits/rejected": -2.488706111907959, "logps/chosen": -121.1862564086914, "logps/rejected": -154.49058532714844, "loss": 130547.825, "rewards/accuracies": 0.625, "rewards/chosen": -0.02486741915345192, "rewards/margins": 0.023590799421072006, "rewards/rejected": -0.04845822602510452, "step": 360 }, { "epoch": 0.14829659318637275, "grad_norm": 9372987.582902173, "learning_rate": 4.7327394209354114e-07, "logits/chosen": -2.326481819152832, "logits/rejected": -2.269331693649292, "logps/chosen": -128.03793334960938, "logps/rejected": -144.99484252929688, "loss": 132171.75, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.028904268518090248, "rewards/margins": 0.015704263001680374, "rewards/rejected": -0.04460852965712547, "step": 370 }, { "epoch": 0.1523046092184369, "grad_norm": 7264491.777537584, "learning_rate": 4.710467706013363e-07, "logits/chosen": -2.2833657264709473, "logits/rejected": -2.276210308074951, "logps/chosen": -104.51480865478516, "logps/rejected": -115.93243408203125, "loss": 124968.1875, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.01820625737309456, "rewards/margins": 0.008318779990077019, "rewards/rejected": -0.026525039225816727, "step": 380 }, { "epoch": 0.156312625250501, "grad_norm": 9483300.54549026, "learning_rate": 4.6881959910913137e-07, "logits/chosen": -2.3335509300231934, "logits/rejected": -2.3318705558776855, "logps/chosen": -118.47274017333984, "logps/rejected": -159.32736206054688, "loss": 124226.0375, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.024884693324565887, "rewards/margins": 0.03282006457448006, "rewards/rejected": -0.057704757899045944, "step": 390 }, { "epoch": 0.16032064128256512, "grad_norm": 5705393.312794033, "learning_rate": 4.6659242761692646e-07, "logits/chosen": -2.1676976680755615, "logits/rejected": -2.2024545669555664, "logps/chosen": -128.0613250732422, "logps/rejected": -154.72410583496094, "loss": 131599.275, "rewards/accuracies": 0.625, "rewards/chosen": -0.03110039234161377, "rewards/margins": 0.020628096535801888, "rewards/rejected": -0.05172848701477051, "step": 400 }, { "epoch": 0.16432865731462926, "grad_norm": 7429207.687017749, "learning_rate": 4.643652561247216e-07, "logits/chosen": -2.377800464630127, "logits/rejected": -2.3630149364471436, "logps/chosen": -114.31864929199219, "logps/rejected": -124.80494689941406, "loss": 129026.4125, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.022269196808338165, "rewards/margins": 0.007764645852148533, "rewards/rejected": -0.030033841729164124, "step": 410 }, { "epoch": 0.1683366733466934, "grad_norm": 6398357.85123705, "learning_rate": 4.621380846325167e-07, "logits/chosen": -2.467796802520752, "logits/rejected": -2.486076831817627, "logps/chosen": -117.53851318359375, "logps/rejected": -140.3090362548828, "loss": 124892.1125, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.01722443476319313, "rewards/margins": 0.01843501813709736, "rewards/rejected": -0.03565945476293564, "step": 420 }, { "epoch": 0.17234468937875752, "grad_norm": 6277691.218109459, "learning_rate": 4.5991091314031177e-07, "logits/chosen": -2.3002543449401855, "logits/rejected": -2.2977206707000732, "logps/chosen": -104.3665771484375, "logps/rejected": -117.7842788696289, "loss": 127817.725, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.017184479162096977, "rewards/margins": 0.005776461213827133, "rewards/rejected": -0.02296094223856926, "step": 430 }, { "epoch": 0.17635270541082165, "grad_norm": 5524654.29238764, "learning_rate": 4.5768374164810686e-07, "logits/chosen": -2.3842499256134033, "logits/rejected": -2.3653392791748047, "logps/chosen": -106.9764633178711, "logps/rejected": -128.93316650390625, "loss": 125052.9375, "rewards/accuracies": 0.6875, "rewards/chosen": -0.01907220296561718, "rewards/margins": 0.017310332506895065, "rewards/rejected": -0.036382537335157394, "step": 440 }, { "epoch": 0.18036072144288579, "grad_norm": 5698111.597390376, "learning_rate": 4.55456570155902e-07, "logits/chosen": -2.4640183448791504, "logits/rejected": -2.4729461669921875, "logps/chosen": -116.1183090209961, "logps/rejected": -126.81109619140625, "loss": 125015.2, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.02313784509897232, "rewards/margins": 0.007788621820509434, "rewards/rejected": -0.03092646598815918, "step": 450 }, { "epoch": 0.1843687374749499, "grad_norm": 8646711.65753647, "learning_rate": 4.532293986636971e-07, "logits/chosen": -2.2500667572021484, "logits/rejected": -2.2651727199554443, "logps/chosen": -130.732666015625, "logps/rejected": -146.08314514160156, "loss": 128072.8125, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.0223550908267498, "rewards/margins": 0.015680748969316483, "rewards/rejected": -0.03803584352135658, "step": 460 }, { "epoch": 0.18837675350701402, "grad_norm": 7633681.996131759, "learning_rate": 4.510022271714922e-07, "logits/chosen": -2.5176281929016113, "logits/rejected": -2.5100059509277344, "logps/chosen": -120.24742126464844, "logps/rejected": -144.9837188720703, "loss": 128354.825, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.02441255934536457, "rewards/margins": 0.01506769098341465, "rewards/rejected": -0.03948025032877922, "step": 470 }, { "epoch": 0.19238476953907815, "grad_norm": 5929632.8733501285, "learning_rate": 4.487750556792873e-07, "logits/chosen": -2.497260093688965, "logits/rejected": -2.4669649600982666, "logps/chosen": -129.90042114257812, "logps/rejected": -138.57948303222656, "loss": 126991.15, "rewards/accuracies": 0.5, "rewards/chosen": -0.02791530452668667, "rewards/margins": 0.007056623697280884, "rewards/rejected": -0.0349719300866127, "step": 480 }, { "epoch": 0.1963927855711423, "grad_norm": 7507688.616841515, "learning_rate": 4.465478841870824e-07, "logits/chosen": -2.4561104774475098, "logits/rejected": -2.406879186630249, "logps/chosen": -105.32928466796875, "logps/rejected": -111.3759765625, "loss": 133037.95, "rewards/accuracies": 0.625, "rewards/chosen": -0.02306142821907997, "rewards/margins": 0.006820513866841793, "rewards/rejected": -0.02988194301724434, "step": 490 }, { "epoch": 0.20040080160320642, "grad_norm": 6114303.37476775, "learning_rate": 4.443207126948775e-07, "logits/chosen": -2.5643982887268066, "logits/rejected": -2.5802197456359863, "logps/chosen": -113.5167236328125, "logps/rejected": -129.633544921875, "loss": 126710.2375, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.02158009074628353, "rewards/margins": 0.008723837323486805, "rewards/rejected": -0.030303925275802612, "step": 500 }, { "epoch": 0.20440881763527055, "grad_norm": 5739769.080992031, "learning_rate": 4.420935412026726e-07, "logits/chosen": -2.506775379180908, "logits/rejected": -2.5049405097961426, "logps/chosen": -113.4487075805664, "logps/rejected": -121.42533874511719, "loss": 128230.775, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.023860003799200058, "rewards/margins": 0.004126023501157761, "rewards/rejected": -0.027986029163002968, "step": 510 }, { "epoch": 0.20841683366733466, "grad_norm": 8712683.142514465, "learning_rate": 4.398663697104677e-07, "logits/chosen": -2.2912344932556152, "logits/rejected": -2.2889084815979004, "logps/chosen": -118.3155746459961, "logps/rejected": -136.2932891845703, "loss": 125754.4375, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.02143458090722561, "rewards/margins": 0.01660776697099209, "rewards/rejected": -0.0380423478782177, "step": 520 }, { "epoch": 0.2124248496993988, "grad_norm": 5141990.524270266, "learning_rate": 4.376391982182628e-07, "logits/chosen": -2.2442755699157715, "logits/rejected": -2.258594274520874, "logps/chosen": -119.4423599243164, "logps/rejected": -132.0382080078125, "loss": 126731.7625, "rewards/accuracies": 0.6875, "rewards/chosen": -0.02435903809964657, "rewards/margins": 0.011756391264498234, "rewards/rejected": -0.03611543029546738, "step": 530 }, { "epoch": 0.21643286573146292, "grad_norm": 7685918.890454359, "learning_rate": 4.3541202672605785e-07, "logits/chosen": -2.4173481464385986, "logits/rejected": -2.4380507469177246, "logps/chosen": -118.99906158447266, "logps/rejected": -138.0341339111328, "loss": 125104.65, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.02268015593290329, "rewards/margins": 0.014481584541499615, "rewards/rejected": -0.03716174140572548, "step": 540 }, { "epoch": 0.22044088176352705, "grad_norm": 6743097.627817877, "learning_rate": 4.33184855233853e-07, "logits/chosen": -2.3929710388183594, "logits/rejected": -2.3947911262512207, "logps/chosen": -139.71694946289062, "logps/rejected": -141.9007110595703, "loss": 130289.3125, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -0.028682414442300797, "rewards/margins": 0.004017618950456381, "rewards/rejected": -0.032700031995773315, "step": 550 }, { "epoch": 0.22444889779559118, "grad_norm": 7880234.192936395, "learning_rate": 4.309576837416481e-07, "logits/chosen": -2.3357808589935303, "logits/rejected": -2.3500537872314453, "logps/chosen": -99.57920837402344, "logps/rejected": -132.2980499267578, "loss": 123243.8375, "rewards/accuracies": 0.625, "rewards/chosen": -0.020820502191781998, "rewards/margins": 0.028303777799010277, "rewards/rejected": -0.04912428557872772, "step": 560 }, { "epoch": 0.22845691382765532, "grad_norm": 5626858.656154921, "learning_rate": 4.2873051224944316e-07, "logits/chosen": -2.4428811073303223, "logits/rejected": -2.437586784362793, "logps/chosen": -108.9743881225586, "logps/rejected": -138.54994201660156, "loss": 126263.475, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.019286539405584335, "rewards/margins": 0.02130548655986786, "rewards/rejected": -0.04059202969074249, "step": 570 }, { "epoch": 0.23246492985971945, "grad_norm": 5580269.332486439, "learning_rate": 4.2650334075723825e-07, "logits/chosen": -2.4557948112487793, "logits/rejected": -2.452768564224243, "logps/chosen": -116.21510314941406, "logps/rejected": -130.24020385742188, "loss": 123232.45, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.025895819067955017, "rewards/margins": 0.007201328873634338, "rewards/rejected": -0.033097147941589355, "step": 580 }, { "epoch": 0.23647294589178355, "grad_norm": 4797099.664206871, "learning_rate": 4.242761692650334e-07, "logits/chosen": -2.418750762939453, "logits/rejected": -2.4155123233795166, "logps/chosen": -118.62259674072266, "logps/rejected": -143.041015625, "loss": 128013.3125, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.02427416667342186, "rewards/margins": 0.013805478811264038, "rewards/rejected": -0.0380796417593956, "step": 590 }, { "epoch": 0.24048096192384769, "grad_norm": 4972883.224938656, "learning_rate": 4.220489977728285e-07, "logits/chosen": -2.3690855503082275, "logits/rejected": -2.3753814697265625, "logps/chosen": -106.62162017822266, "logps/rejected": -125.63492584228516, "loss": 129292.35, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.023660266771912575, "rewards/margins": 0.00939355418086052, "rewards/rejected": -0.03305382281541824, "step": 600 }, { "epoch": 0.24448897795591182, "grad_norm": 6565230.079037476, "learning_rate": 4.1982182628062357e-07, "logits/chosen": -2.433472156524658, "logits/rejected": -2.440901279449463, "logps/chosen": -100.97681427001953, "logps/rejected": -121.0699234008789, "loss": 125732.875, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.021025976166129112, "rewards/margins": 0.012437298893928528, "rewards/rejected": -0.03346327692270279, "step": 610 }, { "epoch": 0.24849699398797595, "grad_norm": 5435060.714939647, "learning_rate": 4.175946547884187e-07, "logits/chosen": -2.4333367347717285, "logits/rejected": -2.4552102088928223, "logps/chosen": -123.1323013305664, "logps/rejected": -142.62408447265625, "loss": 124078.4875, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.025617394596338272, "rewards/margins": 0.008992002345621586, "rewards/rejected": -0.03460939601063728, "step": 620 }, { "epoch": 0.25250501002004005, "grad_norm": 6547015.310004752, "learning_rate": 4.153674832962138e-07, "logits/chosen": -2.4262237548828125, "logits/rejected": -2.444304943084717, "logps/chosen": -112.52055358886719, "logps/rejected": -144.73129272460938, "loss": 126401.4375, "rewards/accuracies": 0.625, "rewards/chosen": -0.02900712564587593, "rewards/margins": 0.020192446187138557, "rewards/rejected": -0.04919956251978874, "step": 630 }, { "epoch": 0.2565130260521042, "grad_norm": 6237814.549776749, "learning_rate": 4.131403118040089e-07, "logits/chosen": -2.631423234939575, "logits/rejected": -2.640061140060425, "logps/chosen": -123.40995788574219, "logps/rejected": -139.66795349121094, "loss": 125416.05, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.020980000495910645, "rewards/margins": 0.02331816963851452, "rewards/rejected": -0.04429817199707031, "step": 640 }, { "epoch": 0.2605210420841683, "grad_norm": 3386484.505424407, "learning_rate": 4.1091314031180397e-07, "logits/chosen": -2.5698630809783936, "logits/rejected": -2.5268971920013428, "logps/chosen": -115.55632019042969, "logps/rejected": -129.51779174804688, "loss": 126294.025, "rewards/accuracies": 0.625, "rewards/chosen": -0.025962088257074356, "rewards/margins": 0.012019636109471321, "rewards/rejected": -0.03798172250390053, "step": 650 }, { "epoch": 0.26452905811623245, "grad_norm": 5974701.756903167, "learning_rate": 4.086859688195991e-07, "logits/chosen": -2.542069673538208, "logits/rejected": -2.558957815170288, "logps/chosen": -128.98672485351562, "logps/rejected": -164.79867553710938, "loss": 123138.7625, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.025483956560492516, "rewards/margins": 0.02371075749397278, "rewards/rejected": -0.04919471591711044, "step": 660 }, { "epoch": 0.2685370741482966, "grad_norm": 8618946.302311558, "learning_rate": 4.064587973273942e-07, "logits/chosen": -2.5645318031311035, "logits/rejected": -2.5552051067352295, "logps/chosen": -105.97404479980469, "logps/rejected": -128.65032958984375, "loss": 125055.6, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.029579663649201393, "rewards/margins": 0.009542147628962994, "rewards/rejected": -0.03912181407213211, "step": 670 }, { "epoch": 0.2725450901803607, "grad_norm": 6026442.863047762, "learning_rate": 4.042316258351893e-07, "logits/chosen": -2.4789493083953857, "logits/rejected": -2.4921040534973145, "logps/chosen": -125.647705078125, "logps/rejected": -129.52207946777344, "loss": 124416.45, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.02354869619011879, "rewards/margins": 0.010550996288657188, "rewards/rejected": -0.03409969061613083, "step": 680 }, { "epoch": 0.27655310621242485, "grad_norm": 6693842.633577265, "learning_rate": 4.0200445434298443e-07, "logits/chosen": -2.3823628425598145, "logits/rejected": -2.3753132820129395, "logps/chosen": -126.91679382324219, "logps/rejected": -137.48599243164062, "loss": 125631.625, "rewards/accuracies": 0.625, "rewards/chosen": -0.029714182019233704, "rewards/margins": 0.013311244547367096, "rewards/rejected": -0.0430254265666008, "step": 690 }, { "epoch": 0.280561122244489, "grad_norm": 6734657.314157365, "learning_rate": 3.997772828507795e-07, "logits/chosen": -2.5848867893218994, "logits/rejected": -2.5814270973205566, "logps/chosen": -106.8030776977539, "logps/rejected": -145.386962890625, "loss": 120258.175, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.028242800384759903, "rewards/margins": 0.025544622913002968, "rewards/rejected": -0.05378742143511772, "step": 700 }, { "epoch": 0.2845691382765531, "grad_norm": 6187383.542849222, "learning_rate": 3.975501113585746e-07, "logits/chosen": -2.4657511711120605, "logits/rejected": -2.4647703170776367, "logps/chosen": -141.2967071533203, "logps/rejected": -156.6244354248047, "loss": 126171.675, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.030644794926047325, "rewards/margins": 0.012586990371346474, "rewards/rejected": -0.0432317890226841, "step": 710 }, { "epoch": 0.28857715430861725, "grad_norm": 7209217.2713718135, "learning_rate": 3.9532293986636975e-07, "logits/chosen": -2.5702593326568604, "logits/rejected": -2.580562114715576, "logps/chosen": -115.55619049072266, "logps/rejected": -149.7683868408203, "loss": 121775.1125, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.030658628791570663, "rewards/margins": 0.024698719382286072, "rewards/rejected": -0.05535735562443733, "step": 720 }, { "epoch": 0.2925851703406814, "grad_norm": 7345760.592322324, "learning_rate": 3.930957683741648e-07, "logits/chosen": -2.4842042922973633, "logits/rejected": -2.519537925720215, "logps/chosen": -130.01364135742188, "logps/rejected": -145.00106811523438, "loss": 125944.025, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.031169170513749123, "rewards/margins": 0.012248598039150238, "rewards/rejected": -0.04341777041554451, "step": 730 }, { "epoch": 0.2965931863727455, "grad_norm": 7769041.8821440255, "learning_rate": 3.9086859688195987e-07, "logits/chosen": -2.4142649173736572, "logits/rejected": -2.3964760303497314, "logps/chosen": -110.42384338378906, "logps/rejected": -147.5558319091797, "loss": 128450.175, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.03280683606863022, "rewards/margins": 0.02526194415986538, "rewards/rejected": -0.058068789541721344, "step": 740 }, { "epoch": 0.30060120240480964, "grad_norm": 5837766.548604017, "learning_rate": 3.8864142538975496e-07, "logits/chosen": -2.490830421447754, "logits/rejected": -2.4848005771636963, "logps/chosen": -119.2578353881836, "logps/rejected": -136.1200714111328, "loss": 129370.05, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.027966167777776718, "rewards/margins": 0.01382518745958805, "rewards/rejected": -0.04179135337471962, "step": 750 }, { "epoch": 0.3046092184368738, "grad_norm": 5653605.464138263, "learning_rate": 3.864142538975501e-07, "logits/chosen": -2.6354494094848633, "logits/rejected": -2.6398041248321533, "logps/chosen": -119.77166748046875, "logps/rejected": -135.96331787109375, "loss": 125964.9875, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.02680077590048313, "rewards/margins": 0.009917078539729118, "rewards/rejected": -0.03671785444021225, "step": 760 }, { "epoch": 0.30861723446893785, "grad_norm": 6202101.053709776, "learning_rate": 3.841870824053452e-07, "logits/chosen": -2.656554698944092, "logits/rejected": -2.6409945487976074, "logps/chosen": -112.60661315917969, "logps/rejected": -128.2964630126953, "loss": 125094.25, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.0260836873203516, "rewards/margins": 0.011154638603329659, "rewards/rejected": -0.03723832219839096, "step": 770 }, { "epoch": 0.312625250501002, "grad_norm": 6165732.580106268, "learning_rate": 3.819599109131403e-07, "logits/chosen": -2.7159509658813477, "logits/rejected": -2.7287096977233887, "logps/chosen": -102.42594909667969, "logps/rejected": -126.30348205566406, "loss": 129263.9, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.021063674241304398, "rewards/margins": 0.02199883759021759, "rewards/rejected": -0.043062515556812286, "step": 780 }, { "epoch": 0.3166332665330661, "grad_norm": 6848518.542334836, "learning_rate": 3.797327394209354e-07, "logits/chosen": -2.675846576690674, "logits/rejected": -2.706200361251831, "logps/chosen": -118.5123062133789, "logps/rejected": -142.47341918945312, "loss": 125051.95, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.024104077368974686, "rewards/margins": 0.020355774089694023, "rewards/rejected": -0.04445984959602356, "step": 790 }, { "epoch": 0.32064128256513025, "grad_norm": 6903511.395968328, "learning_rate": 3.775055679287305e-07, "logits/chosen": -2.699876308441162, "logits/rejected": -2.663015127182007, "logps/chosen": -129.7513427734375, "logps/rejected": -159.34945678710938, "loss": 123485.125, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.029758721590042114, "rewards/margins": 0.023346439003944397, "rewards/rejected": -0.05310516431927681, "step": 800 }, { "epoch": 0.3246492985971944, "grad_norm": 7400217.269596528, "learning_rate": 3.752783964365256e-07, "logits/chosen": -2.5823917388916016, "logits/rejected": -2.597344398498535, "logps/chosen": -125.24183654785156, "logps/rejected": -148.7626953125, "loss": 124403.4625, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.0231742262840271, "rewards/margins": 0.02486516162753105, "rewards/rejected": -0.04803938418626785, "step": 810 }, { "epoch": 0.3286573146292585, "grad_norm": 10022891.443810735, "learning_rate": 3.730512249443207e-07, "logits/chosen": -2.5580251216888428, "logits/rejected": -2.556856393814087, "logps/chosen": -133.8833465576172, "logps/rejected": -167.32559204101562, "loss": 128231.225, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.038261737674474716, "rewards/margins": 0.021149639040231705, "rewards/rejected": -0.05941138416528702, "step": 820 }, { "epoch": 0.33266533066132264, "grad_norm": 7872004.700402355, "learning_rate": 3.708240534521158e-07, "logits/chosen": -2.505337953567505, "logits/rejected": -2.528937816619873, "logps/chosen": -132.82406616210938, "logps/rejected": -155.55078125, "loss": 127033.425, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.02652103267610073, "rewards/margins": 0.02101508341729641, "rewards/rejected": -0.04753611236810684, "step": 830 }, { "epoch": 0.3366733466933868, "grad_norm": 7096774.331416015, "learning_rate": 3.685968819599109e-07, "logits/chosen": -2.5368692874908447, "logits/rejected": -2.535719156265259, "logps/chosen": -108.67805480957031, "logps/rejected": -144.25912475585938, "loss": 124284.075, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.02542160078883171, "rewards/margins": 0.026436615735292435, "rewards/rejected": -0.051858216524124146, "step": 840 }, { "epoch": 0.3406813627254509, "grad_norm": 10050853.271825453, "learning_rate": 3.66369710467706e-07, "logits/chosen": -2.5603981018066406, "logits/rejected": -2.5634753704071045, "logps/chosen": -129.12669372558594, "logps/rejected": -153.24151611328125, "loss": 131637.4125, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.031090397387742996, "rewards/margins": 0.013068552128970623, "rewards/rejected": -0.044158950448036194, "step": 850 }, { "epoch": 0.34468937875751504, "grad_norm": 8788645.52475432, "learning_rate": 3.6414253897550114e-07, "logits/chosen": -2.5309653282165527, "logits/rejected": -2.4787347316741943, "logps/chosen": -102.19111633300781, "logps/rejected": -118.55183410644531, "loss": 125632.975, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.024080926552414894, "rewards/margins": 0.006669840309768915, "rewards/rejected": -0.030750762671232224, "step": 860 }, { "epoch": 0.3486973947895792, "grad_norm": 8574869.451608999, "learning_rate": 3.619153674832962e-07, "logits/chosen": -2.6231815814971924, "logits/rejected": -2.587998390197754, "logps/chosen": -105.8035659790039, "logps/rejected": -130.13351440429688, "loss": 124524.9125, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.020694701001048088, "rewards/margins": 0.015317901968955994, "rewards/rejected": -0.03601260110735893, "step": 870 }, { "epoch": 0.3527054108216433, "grad_norm": 6665344.596513341, "learning_rate": 3.596881959910913e-07, "logits/chosen": -2.6597867012023926, "logits/rejected": -2.6807284355163574, "logps/chosen": -127.19599914550781, "logps/rejected": -153.52798461914062, "loss": 127034.3875, "rewards/accuracies": 0.5625, "rewards/chosen": -0.025522371754050255, "rewards/margins": 0.014715611934661865, "rewards/rejected": -0.04023798182606697, "step": 880 }, { "epoch": 0.35671342685370744, "grad_norm": 5233225.260742817, "learning_rate": 3.574610244988864e-07, "logits/chosen": -2.5910658836364746, "logits/rejected": -2.5633342266082764, "logps/chosen": -146.96466064453125, "logps/rejected": -160.8887481689453, "loss": 129368.775, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.027096951380372047, "rewards/margins": 0.007173667661845684, "rewards/rejected": -0.03427061811089516, "step": 890 }, { "epoch": 0.36072144288577157, "grad_norm": 7630203.262076153, "learning_rate": 3.5523385300668154e-07, "logits/chosen": -2.676250457763672, "logits/rejected": -2.657402753829956, "logps/chosen": -101.4141845703125, "logps/rejected": -121.3133544921875, "loss": 118632.3875, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.022349627688527107, "rewards/margins": 0.016511743888258934, "rewards/rejected": -0.03886137530207634, "step": 900 }, { "epoch": 0.36472945891783565, "grad_norm": 6160335.089101694, "learning_rate": 3.530066815144766e-07, "logits/chosen": -2.663553237915039, "logits/rejected": -2.6835107803344727, "logps/chosen": -117.61933898925781, "logps/rejected": -127.96868896484375, "loss": 127476.4875, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.028628546744585037, "rewards/margins": 0.009166366420686245, "rewards/rejected": -0.03779491409659386, "step": 910 }, { "epoch": 0.3687374749498998, "grad_norm": 5857190.0709482, "learning_rate": 3.5077951002227166e-07, "logits/chosen": -2.57889986038208, "logits/rejected": -2.5917673110961914, "logps/chosen": -99.30012512207031, "logps/rejected": -117.08067321777344, "loss": 126853.9125, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.023543711751699448, "rewards/margins": 0.010370884090662003, "rewards/rejected": -0.03391459211707115, "step": 920 }, { "epoch": 0.3727454909819639, "grad_norm": 11769408.84668034, "learning_rate": 3.485523385300668e-07, "logits/chosen": -2.5779290199279785, "logits/rejected": -2.543435573577881, "logps/chosen": -124.24088287353516, "logps/rejected": -146.30453491210938, "loss": 125690.2375, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.027858158573508263, "rewards/margins": 0.017768610268831253, "rewards/rejected": -0.045626770704984665, "step": 930 }, { "epoch": 0.37675350701402804, "grad_norm": 6339046.89248737, "learning_rate": 3.463251670378619e-07, "logits/chosen": -2.545441150665283, "logits/rejected": -2.5451228618621826, "logps/chosen": -119.28828430175781, "logps/rejected": -136.03775024414062, "loss": 121444.1875, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.023958856239914894, "rewards/margins": 0.016655322164297104, "rewards/rejected": -0.04061417654156685, "step": 940 }, { "epoch": 0.3807615230460922, "grad_norm": 7905647.567752224, "learning_rate": 3.44097995545657e-07, "logits/chosen": -2.489795446395874, "logits/rejected": -2.474541664123535, "logps/chosen": -105.68360900878906, "logps/rejected": -144.86727905273438, "loss": 123972.2625, "rewards/accuracies": 0.625, "rewards/chosen": -0.02857508696615696, "rewards/margins": 0.035942137241363525, "rewards/rejected": -0.06451722234487534, "step": 950 }, { "epoch": 0.3847695390781563, "grad_norm": 7517249.753109076, "learning_rate": 3.4187082405345207e-07, "logits/chosen": -2.5300402641296387, "logits/rejected": -2.551455020904541, "logps/chosen": -144.90447998046875, "logps/rejected": -181.8575897216797, "loss": 129581.9875, "rewards/accuracies": 0.6875, "rewards/chosen": -0.032712481915950775, "rewards/margins": 0.023180881515145302, "rewards/rejected": -0.055893369019031525, "step": 960 }, { "epoch": 0.38877755511022044, "grad_norm": 21884930.38904731, "learning_rate": 3.396436525612472e-07, "logits/chosen": -2.5614724159240723, "logits/rejected": -2.5662589073181152, "logps/chosen": -138.80459594726562, "logps/rejected": -176.45445251464844, "loss": 129077.6875, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.03729068487882614, "rewards/margins": 0.023995213210582733, "rewards/rejected": -0.06128590181469917, "step": 970 }, { "epoch": 0.3927855711422846, "grad_norm": 6397072.682301449, "learning_rate": 3.374164810690423e-07, "logits/chosen": -2.483768939971924, "logits/rejected": -2.497523069381714, "logps/chosen": -107.89897155761719, "logps/rejected": -130.04559326171875, "loss": 126968.175, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.025696447119116783, "rewards/margins": 0.019706759601831436, "rewards/rejected": -0.04540320485830307, "step": 980 }, { "epoch": 0.3967935871743487, "grad_norm": 6076169.314948489, "learning_rate": 3.351893095768374e-07, "logits/chosen": -2.5927734375, "logits/rejected": -2.589218854904175, "logps/chosen": -125.49295806884766, "logps/rejected": -146.27127075195312, "loss": 130093.2, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.029010172933340073, "rewards/margins": 0.014808593317866325, "rewards/rejected": -0.04381876438856125, "step": 990 }, { "epoch": 0.40080160320641284, "grad_norm": 8944436.503605708, "learning_rate": 3.329621380846325e-07, "logits/chosen": -2.4737820625305176, "logits/rejected": -2.4782590866088867, "logps/chosen": -115.3978500366211, "logps/rejected": -139.7859344482422, "loss": 125983.45, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.03420211002230644, "rewards/margins": 0.018732238560914993, "rewards/rejected": -0.052934348583221436, "step": 1000 }, { "epoch": 0.40480961923847697, "grad_norm": 6834249.220142525, "learning_rate": 3.307349665924276e-07, "logits/chosen": -2.4906249046325684, "logits/rejected": -2.478178024291992, "logps/chosen": -130.49822998046875, "logps/rejected": -145.3443145751953, "loss": 125360.8375, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.02826479636132717, "rewards/margins": 0.01623808778822422, "rewards/rejected": -0.04450288414955139, "step": 1010 }, { "epoch": 0.4088176352705411, "grad_norm": 6383495.447630203, "learning_rate": 3.285077951002227e-07, "logits/chosen": -2.5950496196746826, "logits/rejected": -2.5853590965270996, "logps/chosen": -109.3490219116211, "logps/rejected": -135.37911987304688, "loss": 125451.1625, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.02749260701239109, "rewards/margins": 0.01786484755575657, "rewards/rejected": -0.04535745456814766, "step": 1020 }, { "epoch": 0.41282565130260523, "grad_norm": 6800961.848727982, "learning_rate": 3.262806236080178e-07, "logits/chosen": -2.527940273284912, "logits/rejected": -2.538398265838623, "logps/chosen": -116.2686538696289, "logps/rejected": -166.8658905029297, "loss": 121559.9375, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.026556119322776794, "rewards/margins": 0.028276193886995316, "rewards/rejected": -0.05483230948448181, "step": 1030 }, { "epoch": 0.4168336673346693, "grad_norm": 6862016.969047391, "learning_rate": 3.2405345211581293e-07, "logits/chosen": -2.429194211959839, "logits/rejected": -2.442253589630127, "logps/chosen": -129.10696411132812, "logps/rejected": -149.61363220214844, "loss": 127840.825, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.03121834062039852, "rewards/margins": 0.017258524894714355, "rewards/rejected": -0.048476867377758026, "step": 1040 }, { "epoch": 0.42084168336673344, "grad_norm": 7173695.345872832, "learning_rate": 3.21826280623608e-07, "logits/chosen": -2.4863028526306152, "logits/rejected": -2.498213291168213, "logps/chosen": -127.5420913696289, "logps/rejected": -162.05979919433594, "loss": 122006.5375, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.020269382745027542, "rewards/margins": 0.02875341847538948, "rewards/rejected": -0.04902280122041702, "step": 1050 }, { "epoch": 0.4248496993987976, "grad_norm": 6915379.934117418, "learning_rate": 3.195991091314031e-07, "logits/chosen": -2.5418026447296143, "logits/rejected": -2.5308048725128174, "logps/chosen": -115.8625717163086, "logps/rejected": -144.80714416503906, "loss": 121081.8, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.019701775163412094, "rewards/margins": 0.03037576749920845, "rewards/rejected": -0.05007754638791084, "step": 1060 }, { "epoch": 0.4288577154308617, "grad_norm": 7460651.8397620395, "learning_rate": 3.1737193763919825e-07, "logits/chosen": -2.5827364921569824, "logits/rejected": -2.5804123878479004, "logps/chosen": -108.82574462890625, "logps/rejected": -138.85072326660156, "loss": 120443.35, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.019836189225316048, "rewards/margins": 0.029924744740128517, "rewards/rejected": -0.049760930240154266, "step": 1070 }, { "epoch": 0.43286573146292584, "grad_norm": 8373274.336152132, "learning_rate": 3.1514476614699334e-07, "logits/chosen": -2.516143321990967, "logits/rejected": -2.4642560482025146, "logps/chosen": -123.48294830322266, "logps/rejected": -164.36378479003906, "loss": 121728.3875, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.03256348520517349, "rewards/margins": 0.03419329971075058, "rewards/rejected": -0.06675679236650467, "step": 1080 }, { "epoch": 0.43687374749499, "grad_norm": 8184337.333100434, "learning_rate": 3.129175946547884e-07, "logits/chosen": -2.560567855834961, "logits/rejected": -2.524467945098877, "logps/chosen": -121.753662109375, "logps/rejected": -148.0128173828125, "loss": 127275.2125, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.037222061306238174, "rewards/margins": 0.01743399165570736, "rewards/rejected": -0.054656051099300385, "step": 1090 }, { "epoch": 0.4408817635270541, "grad_norm": 6811268.460877864, "learning_rate": 3.1069042316258346e-07, "logits/chosen": -2.541689395904541, "logits/rejected": -2.5355188846588135, "logps/chosen": -118.40226745605469, "logps/rejected": -146.7948455810547, "loss": 129991.525, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.029409324750304222, "rewards/margins": 0.021463513374328613, "rewards/rejected": -0.050872839987277985, "step": 1100 }, { "epoch": 0.44488977955911824, "grad_norm": 6657005.1585574625, "learning_rate": 3.084632516703786e-07, "logits/chosen": -2.6857261657714844, "logits/rejected": -2.672269344329834, "logps/chosen": -101.30345153808594, "logps/rejected": -136.15231323242188, "loss": 122391.7, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.027355913072824478, "rewards/margins": 0.026283621788024902, "rewards/rejected": -0.05363954231142998, "step": 1110 }, { "epoch": 0.44889779559118237, "grad_norm": 7161516.5646296, "learning_rate": 3.062360801781737e-07, "logits/chosen": -2.492475748062134, "logits/rejected": -2.4887194633483887, "logps/chosen": -104.93692779541016, "logps/rejected": -122.68449401855469, "loss": 128814.6, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.02373102679848671, "rewards/margins": 0.012768360786139965, "rewards/rejected": -0.03649938851594925, "step": 1120 }, { "epoch": 0.4529058116232465, "grad_norm": 7008942.925795235, "learning_rate": 3.040089086859688e-07, "logits/chosen": -2.5698654651641846, "logits/rejected": -2.572453498840332, "logps/chosen": -116.10699462890625, "logps/rejected": -147.9447479248047, "loss": 122371.8, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.03228304535150528, "rewards/margins": 0.02537880465388298, "rewards/rejected": -0.05766185000538826, "step": 1130 }, { "epoch": 0.45691382765531063, "grad_norm": 6146790.319547828, "learning_rate": 3.017817371937639e-07, "logits/chosen": -2.652864933013916, "logits/rejected": -2.6149935722351074, "logps/chosen": -122.29400634765625, "logps/rejected": -152.6931915283203, "loss": 119369.45, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.030378807336091995, "rewards/margins": 0.018168287351727486, "rewards/rejected": -0.04854709282517433, "step": 1140 }, { "epoch": 0.46092184368737477, "grad_norm": 10274811.215051277, "learning_rate": 2.99554565701559e-07, "logits/chosen": -2.521822690963745, "logits/rejected": -2.530947685241699, "logps/chosen": -130.49484252929688, "logps/rejected": -161.52488708496094, "loss": 126010.9375, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.02795564755797386, "rewards/margins": 0.026814181357622147, "rewards/rejected": -0.05476983264088631, "step": 1150 }, { "epoch": 0.4649298597194389, "grad_norm": 5529395.079244611, "learning_rate": 2.973273942093541e-07, "logits/chosen": -2.5171029567718506, "logits/rejected": -2.4763035774230957, "logps/chosen": -125.2338638305664, "logps/rejected": -178.67019653320312, "loss": 125064.3875, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.02554917335510254, "rewards/margins": 0.03890024498105049, "rewards/rejected": -0.06444941461086273, "step": 1160 }, { "epoch": 0.46893787575150303, "grad_norm": 6908649.776157191, "learning_rate": 2.951002227171492e-07, "logits/chosen": -2.5064730644226074, "logits/rejected": -2.4823849201202393, "logps/chosen": -138.76272583007812, "logps/rejected": -171.69248962402344, "loss": 121075.625, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.03167058899998665, "rewards/margins": 0.03182462602853775, "rewards/rejected": -0.0634952187538147, "step": 1170 }, { "epoch": 0.4729458917835671, "grad_norm": 7591675.76560546, "learning_rate": 2.928730512249443e-07, "logits/chosen": -2.518094301223755, "logits/rejected": -2.5124518871307373, "logps/chosen": -106.98667907714844, "logps/rejected": -129.47740173339844, "loss": 124332.4125, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.028386671096086502, "rewards/margins": 0.013656134717166424, "rewards/rejected": -0.0420428030192852, "step": 1180 }, { "epoch": 0.47695390781563124, "grad_norm": 5281912.838243102, "learning_rate": 2.906458797327394e-07, "logits/chosen": -2.453968048095703, "logits/rejected": -2.478548526763916, "logps/chosen": -109.14216613769531, "logps/rejected": -177.39352416992188, "loss": 117646.8375, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.026093561202287674, "rewards/margins": 0.055483561009168625, "rewards/rejected": -0.0815771296620369, "step": 1190 }, { "epoch": 0.48096192384769537, "grad_norm": 6902598.262857252, "learning_rate": 2.884187082405345e-07, "logits/chosen": -2.65455961227417, "logits/rejected": -2.6624550819396973, "logps/chosen": -114.522216796875, "logps/rejected": -130.91445922851562, "loss": 120296.1625, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.0260526891797781, "rewards/margins": 0.011702237650752068, "rewards/rejected": -0.03775492683053017, "step": 1200 }, { "epoch": 0.4849699398797595, "grad_norm": 9529297.592613008, "learning_rate": 2.8619153674832964e-07, "logits/chosen": -2.431119918823242, "logits/rejected": -2.4216580390930176, "logps/chosen": -133.5395965576172, "logps/rejected": -160.01345825195312, "loss": 123218.5875, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.03321906179189682, "rewards/margins": 0.023233687505126, "rewards/rejected": -0.05645275115966797, "step": 1210 }, { "epoch": 0.48897795591182364, "grad_norm": 7763735.694070514, "learning_rate": 2.839643652561247e-07, "logits/chosen": -2.512303590774536, "logits/rejected": -2.511324644088745, "logps/chosen": -126.4686050415039, "logps/rejected": -170.10006713867188, "loss": 119174.075, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.028899723663926125, "rewards/margins": 0.0382329560816288, "rewards/rejected": -0.06713266670703888, "step": 1220 }, { "epoch": 0.49298597194388777, "grad_norm": 9045824.328324866, "learning_rate": 2.817371937639198e-07, "logits/chosen": -2.5191609859466553, "logits/rejected": -2.52032732963562, "logps/chosen": -134.53079223632812, "logps/rejected": -167.50125122070312, "loss": 126851.1375, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.03347449749708176, "rewards/margins": 0.019832942634820938, "rewards/rejected": -0.053307436406612396, "step": 1230 }, { "epoch": 0.4969939879759519, "grad_norm": 10448175.950927077, "learning_rate": 2.795100222717149e-07, "logits/chosen": -2.5834548473358154, "logits/rejected": -2.5978755950927734, "logps/chosen": -118.474365234375, "logps/rejected": -161.09974670410156, "loss": 123254.1375, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.029140984639525414, "rewards/margins": 0.03318404406309128, "rewards/rejected": -0.062325023114681244, "step": 1240 }, { "epoch": 0.501002004008016, "grad_norm": 8500418.118135955, "learning_rate": 2.7728285077951004e-07, "logits/chosen": -2.5313282012939453, "logits/rejected": -2.5623884201049805, "logps/chosen": -125.45368957519531, "logps/rejected": -148.5553741455078, "loss": 125882.25, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.028084218502044678, "rewards/margins": 0.027083024382591248, "rewards/rejected": -0.05516723915934563, "step": 1250 }, { "epoch": 0.5050100200400801, "grad_norm": 10978867.823274264, "learning_rate": 2.7505567928730513e-07, "logits/chosen": -2.6042990684509277, "logits/rejected": -2.60687255859375, "logps/chosen": -121.44742584228516, "logps/rejected": -154.75619506835938, "loss": 121126.55, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.02883894369006157, "rewards/margins": 0.022157009690999985, "rewards/rejected": -0.05099595710635185, "step": 1260 }, { "epoch": 0.5090180360721442, "grad_norm": 6172018.078167409, "learning_rate": 2.728285077951002e-07, "logits/chosen": -2.5717759132385254, "logits/rejected": -2.5282435417175293, "logps/chosen": -109.5957260131836, "logps/rejected": -140.98318481445312, "loss": 120695.4875, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.022453511133790016, "rewards/margins": 0.03208887577056885, "rewards/rejected": -0.054542385041713715, "step": 1270 }, { "epoch": 0.5130260521042084, "grad_norm": 5911426.409929097, "learning_rate": 2.7060133630289536e-07, "logits/chosen": -2.6769793033599854, "logits/rejected": -2.690333366394043, "logps/chosen": -110.48323059082031, "logps/rejected": -141.57073974609375, "loss": 129327.425, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.025987869128584862, "rewards/margins": 0.01646535098552704, "rewards/rejected": -0.04245322197675705, "step": 1280 }, { "epoch": 0.5170340681362725, "grad_norm": 6979377.80404185, "learning_rate": 2.683741648106904e-07, "logits/chosen": -2.5594074726104736, "logits/rejected": -2.513986110687256, "logps/chosen": -119.63285827636719, "logps/rejected": -139.09017944335938, "loss": 127555.7375, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.024876989424228668, "rewards/margins": 0.013020751997828484, "rewards/rejected": -0.0378977432847023, "step": 1290 }, { "epoch": 0.5210420841683366, "grad_norm": 5292590.790815719, "learning_rate": 2.661469933184855e-07, "logits/chosen": -2.586153030395508, "logits/rejected": -2.620682716369629, "logps/chosen": -110.29219055175781, "logps/rejected": -155.91311645507812, "loss": 123184.1125, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.022849615663290024, "rewards/margins": 0.03136241436004639, "rewards/rejected": -0.05421202629804611, "step": 1300 }, { "epoch": 0.5250501002004008, "grad_norm": 6798885.42996808, "learning_rate": 2.6391982182628057e-07, "logits/chosen": -2.536839246749878, "logits/rejected": -2.488548517227173, "logps/chosen": -114.6366195678711, "logps/rejected": -137.06741333007812, "loss": 123873.05, "rewards/accuracies": 0.6875, "rewards/chosen": -0.028652016073465347, "rewards/margins": 0.020913179963827133, "rewards/rejected": -0.04956519976258278, "step": 1310 }, { "epoch": 0.5290581162324649, "grad_norm": 6144525.590699139, "learning_rate": 2.616926503340757e-07, "logits/chosen": -2.530562162399292, "logits/rejected": -2.5206761360168457, "logps/chosen": -130.77320861816406, "logps/rejected": -152.3673858642578, "loss": 123906.025, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.027242273092269897, "rewards/margins": 0.016565924510359764, "rewards/rejected": -0.04380819946527481, "step": 1320 }, { "epoch": 0.533066132264529, "grad_norm": 8324367.487045379, "learning_rate": 2.594654788418708e-07, "logits/chosen": -2.4228427410125732, "logits/rejected": -2.416536808013916, "logps/chosen": -113.18867492675781, "logps/rejected": -136.65492248535156, "loss": 127855.825, "rewards/accuracies": 0.6875, "rewards/chosen": -0.028934326022863388, "rewards/margins": 0.015154870226979256, "rewards/rejected": -0.04408919811248779, "step": 1330 }, { "epoch": 0.5370741482965932, "grad_norm": 7499303.936485078, "learning_rate": 2.572383073496659e-07, "logits/chosen": -2.514617443084717, "logits/rejected": -2.522400140762329, "logps/chosen": -133.8329315185547, "logps/rejected": -168.91912841796875, "loss": 123154.5875, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.03134072571992874, "rewards/margins": 0.031129617244005203, "rewards/rejected": -0.06247033551335335, "step": 1340 }, { "epoch": 0.5410821643286573, "grad_norm": 7001866.440455517, "learning_rate": 2.5501113585746103e-07, "logits/chosen": -2.424100160598755, "logits/rejected": -2.391080856323242, "logps/chosen": -129.46676635742188, "logps/rejected": -170.42715454101562, "loss": 121434.3125, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.02975108101963997, "rewards/margins": 0.0271878931671381, "rewards/rejected": -0.05693897604942322, "step": 1350 }, { "epoch": 0.5450901803607214, "grad_norm": 8973879.608996509, "learning_rate": 2.527839643652561e-07, "logits/chosen": -2.455371618270874, "logits/rejected": -2.448552131652832, "logps/chosen": -107.38471984863281, "logps/rejected": -153.23446655273438, "loss": 122286.9875, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.02663249336183071, "rewards/margins": 0.0314641147851944, "rewards/rejected": -0.05809660628437996, "step": 1360 }, { "epoch": 0.5490981963927856, "grad_norm": 6355878.402179637, "learning_rate": 2.505567928730512e-07, "logits/chosen": -2.506803035736084, "logits/rejected": -2.485286235809326, "logps/chosen": -105.56050109863281, "logps/rejected": -143.86761474609375, "loss": 123893.0875, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.026928072795271873, "rewards/margins": 0.030396688729524612, "rewards/rejected": -0.05732475593686104, "step": 1370 }, { "epoch": 0.5531062124248497, "grad_norm": 6945767.3333855895, "learning_rate": 2.483296213808463e-07, "logits/chosen": -2.563617467880249, "logits/rejected": -2.5720462799072266, "logps/chosen": -120.37355041503906, "logps/rejected": -134.89710998535156, "loss": 126817.7625, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.029632825404405594, "rewards/margins": 0.01115390844643116, "rewards/rejected": -0.040786728262901306, "step": 1380 }, { "epoch": 0.5571142284569138, "grad_norm": 6523846.353792737, "learning_rate": 2.4610244988864143e-07, "logits/chosen": -2.3972599506378174, "logits/rejected": -2.3907814025878906, "logps/chosen": -137.69276428222656, "logps/rejected": -148.29226684570312, "loss": 121494.9875, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.0320330411195755, "rewards/margins": 0.012286066077649593, "rewards/rejected": -0.04431910812854767, "step": 1390 }, { "epoch": 0.561122244488978, "grad_norm": 7675269.679056767, "learning_rate": 2.438752783964365e-07, "logits/chosen": -2.477886199951172, "logits/rejected": -2.5009713172912598, "logps/chosen": -127.43409729003906, "logps/rejected": -148.87342834472656, "loss": 128807.55, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.025569623336195946, "rewards/margins": 0.016508014872670174, "rewards/rejected": -0.04207763820886612, "step": 1400 }, { "epoch": 0.5651302605210421, "grad_norm": 7431700.440385598, "learning_rate": 2.416481069042316e-07, "logits/chosen": -2.475954532623291, "logits/rejected": -2.468047857284546, "logps/chosen": -125.47874450683594, "logps/rejected": -153.934326171875, "loss": 128646.025, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.027879610657691956, "rewards/margins": 0.025334885343909264, "rewards/rejected": -0.05321450158953667, "step": 1410 }, { "epoch": 0.5691382765531062, "grad_norm": 6899812.52820539, "learning_rate": 2.394209354120267e-07, "logits/chosen": -2.522841691970825, "logits/rejected": -2.486010789871216, "logps/chosen": -122.23579406738281, "logps/rejected": -163.36984252929688, "loss": 122211.775, "rewards/accuracies": 0.6875, "rewards/chosen": -0.02656758762896061, "rewards/margins": 0.03301847726106644, "rewards/rejected": -0.059586066752672195, "step": 1420 }, { "epoch": 0.5731462925851704, "grad_norm": 6173557.341510408, "learning_rate": 2.371937639198218e-07, "logits/chosen": -2.423492193222046, "logits/rejected": -2.4019968509674072, "logps/chosen": -130.19174194335938, "logps/rejected": -160.64743041992188, "loss": 120977.9625, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.031222287565469742, "rewards/margins": 0.02653447352349758, "rewards/rejected": -0.057756759226322174, "step": 1430 }, { "epoch": 0.5771543086172345, "grad_norm": 7643405.223271913, "learning_rate": 2.349665924276169e-07, "logits/chosen": -2.5014212131500244, "logits/rejected": -2.527346134185791, "logps/chosen": -119.39387512207031, "logps/rejected": -138.7740478515625, "loss": 124317.4, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.02439231611788273, "rewards/margins": 0.021347712725400925, "rewards/rejected": -0.0457400307059288, "step": 1440 }, { "epoch": 0.5811623246492986, "grad_norm": 6692302.873722134, "learning_rate": 2.32739420935412e-07, "logits/chosen": -2.5462992191314697, "logits/rejected": -2.5556600093841553, "logps/chosen": -124.85557556152344, "logps/rejected": -163.96566772460938, "loss": 123862.875, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.024991046637296677, "rewards/margins": 0.02179926075041294, "rewards/rejected": -0.04679030552506447, "step": 1450 }, { "epoch": 0.5851703406813628, "grad_norm": 7345120.15663842, "learning_rate": 2.3051224944320713e-07, "logits/chosen": -2.5199873447418213, "logits/rejected": -2.4958107471466064, "logps/chosen": -126.18660736083984, "logps/rejected": -143.85592651367188, "loss": 126067.1625, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.02991018258035183, "rewards/margins": 0.019812356680631638, "rewards/rejected": -0.049722544848918915, "step": 1460 }, { "epoch": 0.5891783567134269, "grad_norm": 6853174.117141145, "learning_rate": 2.2828507795100222e-07, "logits/chosen": -2.4920763969421387, "logits/rejected": -2.4885401725769043, "logps/chosen": -121.91841125488281, "logps/rejected": -166.50521850585938, "loss": 126326.975, "rewards/accuracies": 0.6875, "rewards/chosen": -0.030897056683897972, "rewards/margins": 0.032330263406038284, "rewards/rejected": -0.06322731822729111, "step": 1470 }, { "epoch": 0.593186372745491, "grad_norm": 7779562.538080393, "learning_rate": 2.2605790645879733e-07, "logits/chosen": -2.382236957550049, "logits/rejected": -2.3837084770202637, "logps/chosen": -142.3158721923828, "logps/rejected": -155.12033081054688, "loss": 130777.2875, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.03417587652802467, "rewards/margins": 0.012108733877539635, "rewards/rejected": -0.04628460854291916, "step": 1480 }, { "epoch": 0.5971943887775552, "grad_norm": 6279182.617938158, "learning_rate": 2.2383073496659242e-07, "logits/chosen": -2.432779312133789, "logits/rejected": -2.4290943145751953, "logps/chosen": -128.83761596679688, "logps/rejected": -149.1236114501953, "loss": 124363.65, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.032311566174030304, "rewards/margins": 0.015544983558356762, "rewards/rejected": -0.04785655066370964, "step": 1490 }, { "epoch": 0.6012024048096193, "grad_norm": 8182544.255090159, "learning_rate": 2.2160356347438753e-07, "logits/chosen": -2.491617441177368, "logits/rejected": -2.482922077178955, "logps/chosen": -134.8472137451172, "logps/rejected": -160.60122680664062, "loss": 128824.125, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.03733091801404953, "rewards/margins": 0.017995206639170647, "rewards/rejected": -0.055326126515865326, "step": 1500 }, { "epoch": 0.6052104208416834, "grad_norm": 7016110.496399047, "learning_rate": 2.1937639198218262e-07, "logits/chosen": -2.466414451599121, "logits/rejected": -2.479168176651001, "logps/chosen": -133.4922332763672, "logps/rejected": -155.50863647460938, "loss": 123948.275, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.0342363640666008, "rewards/margins": 0.013794171623885632, "rewards/rejected": -0.04803053289651871, "step": 1510 }, { "epoch": 0.6092184368737475, "grad_norm": 6302317.5145249935, "learning_rate": 2.171492204899777e-07, "logits/chosen": -2.4898009300231934, "logits/rejected": -2.528862237930298, "logps/chosen": -122.98271179199219, "logps/rejected": -146.36402893066406, "loss": 123657.8875, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.02533816173672676, "rewards/margins": 0.019133783876895905, "rewards/rejected": -0.044471945613622665, "step": 1520 }, { "epoch": 0.6132264529058116, "grad_norm": 6357254.947381178, "learning_rate": 2.1492204899777282e-07, "logits/chosen": -2.3794853687286377, "logits/rejected": -2.387608528137207, "logps/chosen": -117.59773254394531, "logps/rejected": -149.55462646484375, "loss": 121729.5375, "rewards/accuracies": 0.6875, "rewards/chosen": -0.027486393228173256, "rewards/margins": 0.026237377896904945, "rewards/rejected": -0.053723763674497604, "step": 1530 }, { "epoch": 0.6172344689378757, "grad_norm": 5337503.057957096, "learning_rate": 2.126948775055679e-07, "logits/chosen": -2.4973270893096924, "logits/rejected": -2.479884386062622, "logps/chosen": -108.48951721191406, "logps/rejected": -132.74342346191406, "loss": 128555.15, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.028234243392944336, "rewards/margins": 0.017721932381391525, "rewards/rejected": -0.04595617204904556, "step": 1540 }, { "epoch": 0.6212424849699398, "grad_norm": 6436275.76492097, "learning_rate": 2.1046770601336302e-07, "logits/chosen": -2.472238779067993, "logits/rejected": -2.4871106147766113, "logps/chosen": -138.1797637939453, "logps/rejected": -165.5472869873047, "loss": 128596.475, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.02902204915881157, "rewards/margins": 0.022927356883883476, "rewards/rejected": -0.051949404180049896, "step": 1550 }, { "epoch": 0.625250501002004, "grad_norm": 7874140.319482003, "learning_rate": 2.082405345211581e-07, "logits/chosen": -2.4984288215637207, "logits/rejected": -2.5187220573425293, "logps/chosen": -112.08372497558594, "logps/rejected": -143.27493286132812, "loss": 123719.8125, "rewards/accuracies": 0.625, "rewards/chosen": -0.029770880937576294, "rewards/margins": 0.023414723575115204, "rewards/rejected": -0.0531856045126915, "step": 1560 }, { "epoch": 0.6292585170340681, "grad_norm": 7803857.792752564, "learning_rate": 2.0601336302895323e-07, "logits/chosen": -2.521970510482788, "logits/rejected": -2.5433990955352783, "logps/chosen": -156.52963256835938, "logps/rejected": -190.4028778076172, "loss": 120760.1375, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.029155146330595016, "rewards/margins": 0.03910597413778305, "rewards/rejected": -0.06826111674308777, "step": 1570 }, { "epoch": 0.6332665330661322, "grad_norm": 5933142.237627983, "learning_rate": 2.0378619153674831e-07, "logits/chosen": -2.5751547813415527, "logits/rejected": -2.537017345428467, "logps/chosen": -112.1880111694336, "logps/rejected": -142.88565063476562, "loss": 121429.8375, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.026801547035574913, "rewards/margins": 0.02268964797258377, "rewards/rejected": -0.049491189420223236, "step": 1580 }, { "epoch": 0.6372745490981964, "grad_norm": 6625290.260166941, "learning_rate": 2.0155902004454343e-07, "logits/chosen": -2.4771504402160645, "logits/rejected": -2.490891933441162, "logps/chosen": -114.3309326171875, "logps/rejected": -150.93031311035156, "loss": 120093.325, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.03233719617128372, "rewards/margins": 0.031473204493522644, "rewards/rejected": -0.06381039321422577, "step": 1590 }, { "epoch": 0.6412825651302605, "grad_norm": 8931578.977132296, "learning_rate": 1.9933184855233854e-07, "logits/chosen": -2.410708427429199, "logits/rejected": -2.393162250518799, "logps/chosen": -128.2466583251953, "logps/rejected": -157.00393676757812, "loss": 129014.2, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.031892385333776474, "rewards/margins": 0.03138261288404465, "rewards/rejected": -0.06327499449253082, "step": 1600 }, { "epoch": 0.6452905811623246, "grad_norm": 7621241.075310516, "learning_rate": 1.971046770601336e-07, "logits/chosen": -2.4275999069213867, "logits/rejected": -2.436274290084839, "logps/chosen": -135.43801879882812, "logps/rejected": -174.9510955810547, "loss": 121219.5125, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.025171224027872086, "rewards/margins": 0.024888776242733, "rewards/rejected": -0.050060003995895386, "step": 1610 }, { "epoch": 0.6492985971943888, "grad_norm": 6822723.418880638, "learning_rate": 1.9487750556792872e-07, "logits/chosen": -2.488560199737549, "logits/rejected": -2.467193126678467, "logps/chosen": -116.24674987792969, "logps/rejected": -135.80844116210938, "loss": 122399.8625, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.02936776913702488, "rewards/margins": 0.018525371327996254, "rewards/rejected": -0.04789314419031143, "step": 1620 }, { "epoch": 0.6533066132264529, "grad_norm": 7659606.2078440925, "learning_rate": 1.926503340757238e-07, "logits/chosen": -2.4457767009735107, "logits/rejected": -2.443535327911377, "logps/chosen": -120.93404388427734, "logps/rejected": -173.1234130859375, "loss": 117188.175, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.02749781683087349, "rewards/margins": 0.040742214769124985, "rewards/rejected": -0.06824003159999847, "step": 1630 }, { "epoch": 0.657314629258517, "grad_norm": 10173287.988396857, "learning_rate": 1.9042316258351892e-07, "logits/chosen": -2.5128328800201416, "logits/rejected": -2.513092041015625, "logps/chosen": -140.1425018310547, "logps/rejected": -157.23348999023438, "loss": 129493.4375, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.03822711110115051, "rewards/margins": 0.012729940004646778, "rewards/rejected": -0.05095704644918442, "step": 1640 }, { "epoch": 0.6613226452905812, "grad_norm": 6527946.473628513, "learning_rate": 1.88195991091314e-07, "logits/chosen": -2.5390524864196777, "logits/rejected": -2.547598361968994, "logps/chosen": -117.64924621582031, "logps/rejected": -163.60293579101562, "loss": 118917.6, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.025879234075546265, "rewards/margins": 0.04024052247405052, "rewards/rejected": -0.06611974537372589, "step": 1650 }, { "epoch": 0.6653306613226453, "grad_norm": 4891201.175696377, "learning_rate": 1.8596881959910912e-07, "logits/chosen": -2.3803093433380127, "logits/rejected": -2.349740743637085, "logps/chosen": -128.08169555664062, "logps/rejected": -154.24667358398438, "loss": 127419.9875, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.029361028224229813, "rewards/margins": 0.01856454275548458, "rewards/rejected": -0.047925569117069244, "step": 1660 }, { "epoch": 0.6693386773547094, "grad_norm": 6543519.27422337, "learning_rate": 1.8374164810690424e-07, "logits/chosen": -2.527883529663086, "logits/rejected": -2.515263319015503, "logps/chosen": -118.89616394042969, "logps/rejected": -147.4104461669922, "loss": 118822.1375, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.01884353719651699, "rewards/margins": 0.0279966089874506, "rewards/rejected": -0.04684014618396759, "step": 1670 }, { "epoch": 0.6733466933867736, "grad_norm": 7806417.669748601, "learning_rate": 1.8151447661469933e-07, "logits/chosen": -2.4772043228149414, "logits/rejected": -2.515587329864502, "logps/chosen": -127.4335708618164, "logps/rejected": -167.4239044189453, "loss": 124433.9625, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.029232731088995934, "rewards/margins": 0.029919158667325974, "rewards/rejected": -0.059151895344257355, "step": 1680 }, { "epoch": 0.6773547094188377, "grad_norm": 7678118.878557649, "learning_rate": 1.7928730512249444e-07, "logits/chosen": -2.3632655143737793, "logits/rejected": -2.3506579399108887, "logps/chosen": -131.25975036621094, "logps/rejected": -157.8623046875, "loss": 122209.875, "rewards/accuracies": 0.5625, "rewards/chosen": -0.031646568328142166, "rewards/margins": 0.020655754953622818, "rewards/rejected": -0.052302323281764984, "step": 1690 }, { "epoch": 0.6813627254509018, "grad_norm": 8699839.812013036, "learning_rate": 1.770601336302895e-07, "logits/chosen": -2.3938724994659424, "logits/rejected": -2.3754312992095947, "logps/chosen": -115.58536529541016, "logps/rejected": -166.11984252929688, "loss": 116746.4, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.027456630021333694, "rewards/margins": 0.03820016235113144, "rewards/rejected": -0.06565678864717484, "step": 1700 }, { "epoch": 0.685370741482966, "grad_norm": 5919208.729707667, "learning_rate": 1.7483296213808462e-07, "logits/chosen": -2.4200167655944824, "logits/rejected": -2.427748680114746, "logps/chosen": -126.75040435791016, "logps/rejected": -168.77432250976562, "loss": 121633.425, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.029950793832540512, "rewards/margins": 0.03923628851771355, "rewards/rejected": -0.06918708235025406, "step": 1710 }, { "epoch": 0.6893787575150301, "grad_norm": 8189290.652671266, "learning_rate": 1.726057906458797e-07, "logits/chosen": -2.436396598815918, "logits/rejected": -2.3985111713409424, "logps/chosen": -133.26527404785156, "logps/rejected": -165.04611206054688, "loss": 124233.6125, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.033830661326646805, "rewards/margins": 0.033877044916152954, "rewards/rejected": -0.06770770251750946, "step": 1720 }, { "epoch": 0.6933867735470942, "grad_norm": 6543680.531937181, "learning_rate": 1.7037861915367482e-07, "logits/chosen": -2.3416950702667236, "logits/rejected": -2.358785629272461, "logps/chosen": -121.58663177490234, "logps/rejected": -172.85147094726562, "loss": 121040.9, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.024335870519280434, "rewards/margins": 0.03769830986857414, "rewards/rejected": -0.06203417852520943, "step": 1730 }, { "epoch": 0.6973947895791583, "grad_norm": 8413451.882571388, "learning_rate": 1.6815144766146993e-07, "logits/chosen": -2.5148143768310547, "logits/rejected": -2.5122790336608887, "logps/chosen": -131.95590209960938, "logps/rejected": -170.07913208007812, "loss": 122086.9875, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.02519279159605503, "rewards/margins": 0.03293418884277344, "rewards/rejected": -0.05812697485089302, "step": 1740 }, { "epoch": 0.7014028056112225, "grad_norm": 7990729.285338638, "learning_rate": 1.6592427616926502e-07, "logits/chosen": -2.424561023712158, "logits/rejected": -2.411344051361084, "logps/chosen": -112.82745361328125, "logps/rejected": -140.15585327148438, "loss": 122354.5875, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.024268418550491333, "rewards/margins": 0.021955247968435287, "rewards/rejected": -0.04622367024421692, "step": 1750 }, { "epoch": 0.7054108216432866, "grad_norm": 8082374.5587068405, "learning_rate": 1.6369710467706014e-07, "logits/chosen": -2.3887767791748047, "logits/rejected": -2.399949312210083, "logps/chosen": -113.41932678222656, "logps/rejected": -150.90052795410156, "loss": 123895.825, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.02654331922531128, "rewards/margins": 0.031898465007543564, "rewards/rejected": -0.058441780507564545, "step": 1760 }, { "epoch": 0.7094188376753507, "grad_norm": 10182582.529576585, "learning_rate": 1.6146993318485522e-07, "logits/chosen": -2.471140146255493, "logits/rejected": -2.464400291442871, "logps/chosen": -134.3041534423828, "logps/rejected": -168.62957763671875, "loss": 123414.175, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.028919730335474014, "rewards/margins": 0.032348960638046265, "rewards/rejected": -0.06126868724822998, "step": 1770 }, { "epoch": 0.7134268537074149, "grad_norm": 7012588.447398562, "learning_rate": 1.5924276169265034e-07, "logits/chosen": -2.4046647548675537, "logits/rejected": -2.436089277267456, "logps/chosen": -140.215087890625, "logps/rejected": -177.3277587890625, "loss": 116220.825, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.030313868075609207, "rewards/margins": 0.03892980143427849, "rewards/rejected": -0.0692436695098877, "step": 1780 }, { "epoch": 0.717434869739479, "grad_norm": 5846594.429433788, "learning_rate": 1.5701559020044543e-07, "logits/chosen": -2.5337796211242676, "logits/rejected": -2.4943432807922363, "logps/chosen": -124.1761474609375, "logps/rejected": -172.1324920654297, "loss": 123137.75, "rewards/accuracies": 0.6875, "rewards/chosen": -0.03101753070950508, "rewards/margins": 0.0418589822947979, "rewards/rejected": -0.07287651300430298, "step": 1790 }, { "epoch": 0.7214428857715431, "grad_norm": 7028764.493993101, "learning_rate": 1.5478841870824051e-07, "logits/chosen": -2.426300525665283, "logits/rejected": -2.387045383453369, "logps/chosen": -115.95497131347656, "logps/rejected": -157.61618041992188, "loss": 124143.675, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.024013713002204895, "rewards/margins": 0.037388551980257034, "rewards/rejected": -0.06140226125717163, "step": 1800 }, { "epoch": 0.7254509018036072, "grad_norm": 8234396.37962489, "learning_rate": 1.5256124721603563e-07, "logits/chosen": -2.3112475872039795, "logits/rejected": -2.310009479522705, "logps/chosen": -114.36012268066406, "logps/rejected": -153.5015411376953, "loss": 118239.1875, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.018304970115423203, "rewards/margins": 0.04038548097014427, "rewards/rejected": -0.058690451085567474, "step": 1810 }, { "epoch": 0.7294589178356713, "grad_norm": 8651551.474852078, "learning_rate": 1.5033407572383072e-07, "logits/chosen": -2.3074092864990234, "logits/rejected": -2.2610440254211426, "logps/chosen": -126.58909606933594, "logps/rejected": -180.2452392578125, "loss": 115241.575, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.027572233229875565, "rewards/margins": 0.04374260455369949, "rewards/rejected": -0.07131483405828476, "step": 1820 }, { "epoch": 0.7334669338677354, "grad_norm": 7273151.817120667, "learning_rate": 1.4810690423162583e-07, "logits/chosen": -2.2907938957214355, "logits/rejected": -2.241560459136963, "logps/chosen": -121.88932800292969, "logps/rejected": -165.8365020751953, "loss": 118192.675, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.030897384509444237, "rewards/margins": 0.042338501662015915, "rewards/rejected": -0.073235884308815, "step": 1830 }, { "epoch": 0.7374749498997996, "grad_norm": 8416170.338029677, "learning_rate": 1.4587973273942092e-07, "logits/chosen": -2.2940685749053955, "logits/rejected": -2.2601191997528076, "logps/chosen": -124.39451599121094, "logps/rejected": -161.70626831054688, "loss": 117542.4, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.029775938019156456, "rewards/margins": 0.03537831827998161, "rewards/rejected": -0.06515425443649292, "step": 1840 }, { "epoch": 0.7414829659318637, "grad_norm": 8984144.824878268, "learning_rate": 1.4365256124721603e-07, "logits/chosen": -2.4095590114593506, "logits/rejected": -2.382366418838501, "logps/chosen": -131.84091186523438, "logps/rejected": -171.8034210205078, "loss": 123931.3875, "rewards/accuracies": 0.6875, "rewards/chosen": -0.03394445404410362, "rewards/margins": 0.0342072993516922, "rewards/rejected": -0.06815175712108612, "step": 1850 }, { "epoch": 0.7454909819639278, "grad_norm": 9132360.1760025, "learning_rate": 1.4142538975501115e-07, "logits/chosen": -2.3694911003112793, "logits/rejected": -2.3512394428253174, "logps/chosen": -123.51580810546875, "logps/rejected": -167.21522521972656, "loss": 127071.8625, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.031071608886122704, "rewards/margins": 0.03928913176059723, "rewards/rejected": -0.07036073505878448, "step": 1860 }, { "epoch": 0.749498997995992, "grad_norm": 7139680.939654487, "learning_rate": 1.3919821826280624e-07, "logits/chosen": -2.462883472442627, "logits/rejected": -2.4619345664978027, "logps/chosen": -109.59526062011719, "logps/rejected": -158.25265502929688, "loss": 119525.8125, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.02276531606912613, "rewards/margins": 0.03841192647814751, "rewards/rejected": -0.06117723509669304, "step": 1870 }, { "epoch": 0.7535070140280561, "grad_norm": 9748149.33911386, "learning_rate": 1.3697104677060135e-07, "logits/chosen": -2.395458459854126, "logits/rejected": -2.3749117851257324, "logps/chosen": -123.3841552734375, "logps/rejected": -154.816650390625, "loss": 122715.85, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.03459615260362625, "rewards/margins": 0.02642343007028103, "rewards/rejected": -0.06101958826184273, "step": 1880 }, { "epoch": 0.7575150300601202, "grad_norm": 7062510.21247513, "learning_rate": 1.347438752783964e-07, "logits/chosen": -2.466752290725708, "logits/rejected": -2.4664688110351562, "logps/chosen": -117.7652587890625, "logps/rejected": -168.4532928466797, "loss": 122685.4375, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.026580441743135452, "rewards/margins": 0.039577435702085495, "rewards/rejected": -0.06615787744522095, "step": 1890 }, { "epoch": 0.7615230460921844, "grad_norm": 9095349.65659939, "learning_rate": 1.3251670378619153e-07, "logits/chosen": -2.340463161468506, "logits/rejected": -2.343043088912964, "logps/chosen": -135.51889038085938, "logps/rejected": -194.67893981933594, "loss": 124649.7875, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.03404935449361801, "rewards/margins": 0.039114292711019516, "rewards/rejected": -0.07316364347934723, "step": 1900 }, { "epoch": 0.7655310621242485, "grad_norm": 8798241.718444504, "learning_rate": 1.3028953229398661e-07, "logits/chosen": -2.417386054992676, "logits/rejected": -2.35686993598938, "logps/chosen": -126.17924499511719, "logps/rejected": -156.60858154296875, "loss": 126678.975, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.03539714962244034, "rewards/margins": 0.023838359862565994, "rewards/rejected": -0.05923551321029663, "step": 1910 }, { "epoch": 0.7695390781563126, "grad_norm": 8685556.241747925, "learning_rate": 1.2806236080178173e-07, "logits/chosen": -2.2988665103912354, "logits/rejected": -2.2867467403411865, "logps/chosen": -102.80122375488281, "logps/rejected": -136.88668823242188, "loss": 124837.6625, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.02582702599465847, "rewards/margins": 0.02396995946764946, "rewards/rejected": -0.04979699105024338, "step": 1920 }, { "epoch": 0.7735470941883767, "grad_norm": 7757034.4029932795, "learning_rate": 1.2583518930957684e-07, "logits/chosen": -2.3887832164764404, "logits/rejected": -2.388990640640259, "logps/chosen": -121.30000305175781, "logps/rejected": -180.59637451171875, "loss": 117815.775, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.025620033964514732, "rewards/margins": 0.048595868051052094, "rewards/rejected": -0.07421590387821198, "step": 1930 }, { "epoch": 0.7775551102204409, "grad_norm": 8307958.766272747, "learning_rate": 1.2360801781737193e-07, "logits/chosen": -2.38761568069458, "logits/rejected": -2.408409357070923, "logps/chosen": -121.0750732421875, "logps/rejected": -156.45553588867188, "loss": 127248.7375, "rewards/accuracies": 0.6875, "rewards/chosen": -0.02739100717008114, "rewards/margins": 0.03653334453701973, "rewards/rejected": -0.06392434984445572, "step": 1940 }, { "epoch": 0.781563126252505, "grad_norm": 7961844.607771009, "learning_rate": 1.2138084632516702e-07, "logits/chosen": -2.2147622108459473, "logits/rejected": -2.216984272003174, "logps/chosen": -137.97161865234375, "logps/rejected": -177.88926696777344, "loss": 117088.8875, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.027384892106056213, "rewards/margins": 0.04585758596658707, "rewards/rejected": -0.07324248552322388, "step": 1950 }, { "epoch": 0.7855711422845691, "grad_norm": 7970074.398916679, "learning_rate": 1.1915367483296213e-07, "logits/chosen": -2.431124210357666, "logits/rejected": -2.408353567123413, "logps/chosen": -131.7977752685547, "logps/rejected": -183.93661499023438, "loss": 118722.2375, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.0299037154763937, "rewards/margins": 0.045822691172361374, "rewards/rejected": -0.07572640478610992, "step": 1960 }, { "epoch": 0.7895791583166333, "grad_norm": 8486402.500741018, "learning_rate": 1.1692650334075723e-07, "logits/chosen": -2.373565196990967, "logits/rejected": -2.3279855251312256, "logps/chosen": -127.22279357910156, "logps/rejected": -165.14312744140625, "loss": 125666.025, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.036864422261714935, "rewards/margins": 0.034248046576976776, "rewards/rejected": -0.07111246883869171, "step": 1970 }, { "epoch": 0.7935871743486974, "grad_norm": 7781551.329745824, "learning_rate": 1.1469933184855234e-07, "logits/chosen": -2.3267343044281006, "logits/rejected": -2.3000550270080566, "logps/chosen": -114.78788757324219, "logps/rejected": -160.58119201660156, "loss": 117669.975, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.02355727180838585, "rewards/margins": 0.04111555963754654, "rewards/rejected": -0.06467284262180328, "step": 1980 }, { "epoch": 0.7975951903807615, "grad_norm": 7705308.394496826, "learning_rate": 1.1247216035634744e-07, "logits/chosen": -2.384936571121216, "logits/rejected": -2.3428092002868652, "logps/chosen": -118.7773208618164, "logps/rejected": -167.65505981445312, "loss": 119291.925, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.03314110264182091, "rewards/margins": 0.03820453956723213, "rewards/rejected": -0.07134564965963364, "step": 1990 }, { "epoch": 0.8016032064128257, "grad_norm": 9748026.950946445, "learning_rate": 1.1024498886414254e-07, "logits/chosen": -2.3572840690612793, "logits/rejected": -2.372260570526123, "logps/chosen": -141.3115997314453, "logps/rejected": -194.31414794921875, "loss": 121163.925, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.03314133733510971, "rewards/margins": 0.04883214458823204, "rewards/rejected": -0.08197349309921265, "step": 2000 }, { "epoch": 0.8056112224448898, "grad_norm": 11371440.513765983, "learning_rate": 1.0801781737193763e-07, "logits/chosen": -2.407268524169922, "logits/rejected": -2.329662799835205, "logps/chosen": -109.80204010009766, "logps/rejected": -165.47967529296875, "loss": 119808.175, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.029940223321318626, "rewards/margins": 0.042202599346637726, "rewards/rejected": -0.0721428170800209, "step": 2010 }, { "epoch": 0.8096192384769539, "grad_norm": 11530967.267240841, "learning_rate": 1.0579064587973273e-07, "logits/chosen": -2.4649455547332764, "logits/rejected": -2.4669435024261475, "logps/chosen": -153.43203735351562, "logps/rejected": -207.6525421142578, "loss": 120230.175, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.033086683601140976, "rewards/margins": 0.044571831822395325, "rewards/rejected": -0.077658511698246, "step": 2020 }, { "epoch": 0.8136272545090181, "grad_norm": 9145691.093056194, "learning_rate": 1.0356347438752784e-07, "logits/chosen": -2.3883135318756104, "logits/rejected": -2.393925666809082, "logps/chosen": -121.42464447021484, "logps/rejected": -175.5780029296875, "loss": 118819.75, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.025960665196180344, "rewards/margins": 0.038529325276613235, "rewards/rejected": -0.06448998302221298, "step": 2030 }, { "epoch": 0.8176352705410822, "grad_norm": 9873094.955285586, "learning_rate": 1.0133630289532294e-07, "logits/chosen": -2.3716039657592773, "logits/rejected": -2.3830373287200928, "logps/chosen": -113.73304748535156, "logps/rejected": -154.63546752929688, "loss": 126263.6, "rewards/accuracies": 0.75, "rewards/chosen": -0.027685221284627914, "rewards/margins": 0.03748108074069023, "rewards/rejected": -0.06516630947589874, "step": 2040 }, { "epoch": 0.8216432865731463, "grad_norm": 9115873.458954994, "learning_rate": 9.910913140311804e-08, "logits/chosen": -2.479027032852173, "logits/rejected": -2.490036964416504, "logps/chosen": -132.74807739257812, "logps/rejected": -184.6726531982422, "loss": 118780.575, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.02712252177298069, "rewards/margins": 0.04764852300286293, "rewards/rejected": -0.07477104663848877, "step": 2050 }, { "epoch": 0.8256513026052105, "grad_norm": 8953461.219512891, "learning_rate": 9.688195991091313e-08, "logits/chosen": -2.465446949005127, "logits/rejected": -2.4427990913391113, "logps/chosen": -119.0552978515625, "logps/rejected": -161.92874145507812, "loss": 119633.0625, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.025605756789445877, "rewards/margins": 0.040582504123449326, "rewards/rejected": -0.0661882609128952, "step": 2060 }, { "epoch": 0.8296593186372746, "grad_norm": 7851765.489091888, "learning_rate": 9.465478841870823e-08, "logits/chosen": -2.3986716270446777, "logits/rejected": -2.411012887954712, "logps/chosen": -118.712646484375, "logps/rejected": -173.01612854003906, "loss": 119445.7875, "rewards/accuracies": 0.6875, "rewards/chosen": -0.024571005254983902, "rewards/margins": 0.04814226180315018, "rewards/rejected": -0.07271327078342438, "step": 2070 }, { "epoch": 0.8336673346693386, "grad_norm": 7783572.975882292, "learning_rate": 9.242761692650333e-08, "logits/chosen": -2.4087226390838623, "logits/rejected": -2.4295570850372314, "logps/chosen": -107.88726806640625, "logps/rejected": -147.86387634277344, "loss": 121158.875, "rewards/accuracies": 0.625, "rewards/chosen": -0.028421640396118164, "rewards/margins": 0.03798586130142212, "rewards/rejected": -0.06640749424695969, "step": 2080 }, { "epoch": 0.8376753507014028, "grad_norm": 13856515.926379297, "learning_rate": 9.020044543429844e-08, "logits/chosen": -2.3862414360046387, "logits/rejected": -2.3929479122161865, "logps/chosen": -128.18722534179688, "logps/rejected": -179.44863891601562, "loss": 118360.075, "rewards/accuracies": 0.625, "rewards/chosen": -0.02913135662674904, "rewards/margins": 0.039204858243465424, "rewards/rejected": -0.06833621114492416, "step": 2090 }, { "epoch": 0.8416833667334669, "grad_norm": 7032800.434205612, "learning_rate": 8.797327394209354e-08, "logits/chosen": -2.3820009231567383, "logits/rejected": -2.3427934646606445, "logps/chosen": -117.68209075927734, "logps/rejected": -162.79541015625, "loss": 124612.8375, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.024324778467416763, "rewards/margins": 0.03295541927218437, "rewards/rejected": -0.05728019401431084, "step": 2100 }, { "epoch": 0.845691382765531, "grad_norm": 6171252.074137294, "learning_rate": 8.574610244988864e-08, "logits/chosen": -2.397584915161133, "logits/rejected": -2.371406078338623, "logps/chosen": -109.43270111083984, "logps/rejected": -151.5046844482422, "loss": 119086.7625, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.02608482912182808, "rewards/margins": 0.03447579964995384, "rewards/rejected": -0.06056063249707222, "step": 2110 }, { "epoch": 0.8496993987975952, "grad_norm": 10404482.46317683, "learning_rate": 8.351893095768374e-08, "logits/chosen": -2.363708019256592, "logits/rejected": -2.3750388622283936, "logps/chosen": -124.8462905883789, "logps/rejected": -160.50111389160156, "loss": 120726.7, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.025790056213736534, "rewards/margins": 0.0353570319712162, "rewards/rejected": -0.06114708259701729, "step": 2120 }, { "epoch": 0.8537074148296593, "grad_norm": 10159471.548415452, "learning_rate": 8.129175946547884e-08, "logits/chosen": -2.40871262550354, "logits/rejected": -2.385307788848877, "logps/chosen": -123.74516296386719, "logps/rejected": -174.9013671875, "loss": 117929.3, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.02134835720062256, "rewards/margins": 0.053267043083906174, "rewards/rejected": -0.07461539655923843, "step": 2130 }, { "epoch": 0.8577154308617234, "grad_norm": 9748215.648259088, "learning_rate": 7.906458797327394e-08, "logits/chosen": -2.3864364624023438, "logits/rejected": -2.384763479232788, "logps/chosen": -124.77516174316406, "logps/rejected": -187.81178283691406, "loss": 114719.2875, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.028009647503495216, "rewards/margins": 0.057968758046627045, "rewards/rejected": -0.08597840368747711, "step": 2140 }, { "epoch": 0.8617234468937875, "grad_norm": 8214050.893974697, "learning_rate": 7.683741648106903e-08, "logits/chosen": -2.339771270751953, "logits/rejected": -2.3055481910705566, "logps/chosen": -122.35482025146484, "logps/rejected": -162.99026489257812, "loss": 123583.925, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.03496576473116875, "rewards/margins": 0.031746573746204376, "rewards/rejected": -0.06671233475208282, "step": 2150 }, { "epoch": 0.8657314629258517, "grad_norm": 7550929.657836777, "learning_rate": 7.461024498886414e-08, "logits/chosen": -2.3592019081115723, "logits/rejected": -2.404470443725586, "logps/chosen": -107.17295837402344, "logps/rejected": -150.08053588867188, "loss": 119529.6875, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.024070553481578827, "rewards/margins": 0.035172443836927414, "rewards/rejected": -0.05924300104379654, "step": 2160 }, { "epoch": 0.8697394789579158, "grad_norm": 9141394.945114018, "learning_rate": 7.238307349665924e-08, "logits/chosen": -2.3597967624664307, "logits/rejected": -2.3456203937530518, "logps/chosen": -132.2399139404297, "logps/rejected": -190.35842895507812, "loss": 117310.2625, "rewards/accuracies": 0.75, "rewards/chosen": -0.030771303921937943, "rewards/margins": 0.04350388050079346, "rewards/rejected": -0.0742751806974411, "step": 2170 }, { "epoch": 0.87374749498998, "grad_norm": 9640801.577136654, "learning_rate": 7.015590200445435e-08, "logits/chosen": -2.315046548843384, "logits/rejected": -2.2832789421081543, "logps/chosen": -123.76663970947266, "logps/rejected": -180.56610107421875, "loss": 122200.8125, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.02636173740029335, "rewards/margins": 0.05408860370516777, "rewards/rejected": -0.08045034110546112, "step": 2180 }, { "epoch": 0.8777555110220441, "grad_norm": 9115013.555540964, "learning_rate": 6.792873051224945e-08, "logits/chosen": -2.341780662536621, "logits/rejected": -2.303041934967041, "logps/chosen": -120.33302307128906, "logps/rejected": -179.38143920898438, "loss": 119841.1375, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.028819095343351364, "rewards/margins": 0.05842950940132141, "rewards/rejected": -0.08724860846996307, "step": 2190 }, { "epoch": 0.8817635270541082, "grad_norm": 9287669.565738013, "learning_rate": 6.570155902004454e-08, "logits/chosen": -2.3970108032226562, "logits/rejected": -2.3846933841705322, "logps/chosen": -134.4124755859375, "logps/rejected": -194.52996826171875, "loss": 113952.85, "rewards/accuracies": 0.75, "rewards/chosen": -0.030755961313843727, "rewards/margins": 0.059903584420681, "rewards/rejected": -0.09065954387187958, "step": 2200 }, { "epoch": 0.8857715430861723, "grad_norm": 8780636.26287871, "learning_rate": 6.347438752783964e-08, "logits/chosen": -2.3586363792419434, "logits/rejected": -2.341787815093994, "logps/chosen": -121.4832763671875, "logps/rejected": -174.41322326660156, "loss": 125072.6125, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.034476399421691895, "rewards/margins": 0.04450554400682449, "rewards/rejected": -0.07898194342851639, "step": 2210 }, { "epoch": 0.8897795591182365, "grad_norm": 11734592.621806614, "learning_rate": 6.124721603563474e-08, "logits/chosen": -2.3850929737091064, "logits/rejected": -2.3612587451934814, "logps/chosen": -127.06380462646484, "logps/rejected": -176.50833129882812, "loss": 127903.1, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.03346968814730644, "rewards/margins": 0.045090578496456146, "rewards/rejected": -0.07856027781963348, "step": 2220 }, { "epoch": 0.8937875751503006, "grad_norm": 9158318.910190664, "learning_rate": 5.902004454342984e-08, "logits/chosen": -2.3465323448181152, "logits/rejected": -2.3347389698028564, "logps/chosen": -114.64030456542969, "logps/rejected": -158.58358764648438, "loss": 119119.75, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.03357522934675217, "rewards/margins": 0.03692323714494705, "rewards/rejected": -0.07049846649169922, "step": 2230 }, { "epoch": 0.8977955911823647, "grad_norm": 9433469.034726756, "learning_rate": 5.679287305122494e-08, "logits/chosen": -2.3580026626586914, "logits/rejected": -2.3229854106903076, "logps/chosen": -145.0024871826172, "logps/rejected": -185.17398071289062, "loss": 123208.3625, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.0344650074839592, "rewards/margins": 0.03345141187310219, "rewards/rejected": -0.06791641563177109, "step": 2240 }, { "epoch": 0.9018036072144289, "grad_norm": 10459560.929278648, "learning_rate": 5.456570155902004e-08, "logits/chosen": -2.2590279579162598, "logits/rejected": -2.2098376750946045, "logps/chosen": -125.89430236816406, "logps/rejected": -180.64013671875, "loss": 119874.9125, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.030151482671499252, "rewards/margins": 0.0487741082906723, "rewards/rejected": -0.07892559468746185, "step": 2250 }, { "epoch": 0.905811623246493, "grad_norm": 9239966.29275951, "learning_rate": 5.233853006681514e-08, "logits/chosen": -2.219543933868408, "logits/rejected": -2.1673264503479004, "logps/chosen": -115.83148193359375, "logps/rejected": -181.96237182617188, "loss": 116310.4625, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.030166417360305786, "rewards/margins": 0.05468549206852913, "rewards/rejected": -0.08485190570354462, "step": 2260 }, { "epoch": 0.9098196392785571, "grad_norm": 7137384.451143832, "learning_rate": 5.0111358574610243e-08, "logits/chosen": -2.3517508506774902, "logits/rejected": -2.328963279724121, "logps/chosen": -124.4570541381836, "logps/rejected": -155.75506591796875, "loss": 125401.625, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.03177972882986069, "rewards/margins": 0.028021136298775673, "rewards/rejected": -0.05980087071657181, "step": 2270 }, { "epoch": 0.9138276553106213, "grad_norm": 9374102.140787963, "learning_rate": 4.7884187082405345e-08, "logits/chosen": -2.36897611618042, "logits/rejected": -2.351210832595825, "logps/chosen": -121.159423828125, "logps/rejected": -162.73583984375, "loss": 122636.2125, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.027210911735892296, "rewards/margins": 0.037821024656295776, "rewards/rejected": -0.06503193080425262, "step": 2280 }, { "epoch": 0.9178356713426854, "grad_norm": 10634549.480236543, "learning_rate": 4.5657015590200446e-08, "logits/chosen": -2.3483309745788574, "logits/rejected": -2.3582046031951904, "logps/chosen": -129.0653839111328, "logps/rejected": -160.6099853515625, "loss": 120660.725, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.02793022058904171, "rewards/margins": 0.028678078204393387, "rewards/rejected": -0.056608300656080246, "step": 2290 }, { "epoch": 0.9218436873747495, "grad_norm": 9025337.740427457, "learning_rate": 4.342984409799554e-08, "logits/chosen": -2.3706603050231934, "logits/rejected": -2.369664192199707, "logps/chosen": -127.57081604003906, "logps/rejected": -188.29193115234375, "loss": 115251.525, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.02242148667573929, "rewards/margins": 0.0556727834045887, "rewards/rejected": -0.07809427380561829, "step": 2300 }, { "epoch": 0.9258517034068137, "grad_norm": 7848051.255311885, "learning_rate": 4.120267260579064e-08, "logits/chosen": -2.3117566108703613, "logits/rejected": -2.3098156452178955, "logps/chosen": -108.4315185546875, "logps/rejected": -150.93917846679688, "loss": 118349.0, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.029692724347114563, "rewards/margins": 0.03289476037025452, "rewards/rejected": -0.06258748471736908, "step": 2310 }, { "epoch": 0.9298597194388778, "grad_norm": 8752220.280733073, "learning_rate": 3.897550111358574e-08, "logits/chosen": -2.440371036529541, "logits/rejected": -2.396395444869995, "logps/chosen": -118.261962890625, "logps/rejected": -177.1492462158203, "loss": 118122.25, "rewards/accuracies": 0.75, "rewards/chosen": -0.02859870158135891, "rewards/margins": 0.04976039007306099, "rewards/rejected": -0.07835908979177475, "step": 2320 }, { "epoch": 0.9338677354709419, "grad_norm": 8570641.436970409, "learning_rate": 3.6748329621380844e-08, "logits/chosen": -2.3024837970733643, "logits/rejected": -2.2734124660491943, "logps/chosen": -100.15013122558594, "logps/rejected": -150.4217987060547, "loss": 121363.625, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.017847072333097458, "rewards/margins": 0.041332632303237915, "rewards/rejected": -0.05917970463633537, "step": 2330 }, { "epoch": 0.9378757515030061, "grad_norm": 10125338.403382758, "learning_rate": 3.4521158129175945e-08, "logits/chosen": -2.3241610527038574, "logits/rejected": -2.288438081741333, "logps/chosen": -140.2849578857422, "logps/rejected": -185.48062133789062, "loss": 116049.775, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.023854615166783333, "rewards/margins": 0.04641326889395714, "rewards/rejected": -0.07026788592338562, "step": 2340 }, { "epoch": 0.9418837675350702, "grad_norm": 8270337.589395198, "learning_rate": 3.2293986636971046e-08, "logits/chosen": -2.3693883419036865, "logits/rejected": -2.329385995864868, "logps/chosen": -129.80262756347656, "logps/rejected": -171.2855224609375, "loss": 126379.775, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.03462721034884453, "rewards/margins": 0.032999541610479355, "rewards/rejected": -0.06762675940990448, "step": 2350 }, { "epoch": 0.9458917835671342, "grad_norm": 9891605.681991456, "learning_rate": 3.006681514476615e-08, "logits/chosen": -2.339646339416504, "logits/rejected": -2.3495984077453613, "logps/chosen": -112.51835632324219, "logps/rejected": -153.22071838378906, "loss": 119129.5875, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.029641568660736084, "rewards/margins": 0.036299534142017365, "rewards/rejected": -0.06594111025333405, "step": 2360 }, { "epoch": 0.9498997995991983, "grad_norm": 8002327.473093823, "learning_rate": 2.783964365256125e-08, "logits/chosen": -2.2044544219970703, "logits/rejected": -2.1942667961120605, "logps/chosen": -125.50931549072266, "logps/rejected": -182.1253204345703, "loss": 120835.975, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.030479159206151962, "rewards/margins": 0.04504828527569771, "rewards/rejected": -0.07552744448184967, "step": 2370 }, { "epoch": 0.9539078156312625, "grad_norm": 9473260.065683817, "learning_rate": 2.5612472160356346e-08, "logits/chosen": -2.3253302574157715, "logits/rejected": -2.30558443069458, "logps/chosen": -120.88480377197266, "logps/rejected": -174.08871459960938, "loss": 116823.8375, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.030502652749419212, "rewards/margins": 0.04683176428079605, "rewards/rejected": -0.07733441144227982, "step": 2380 }, { "epoch": 0.9579158316633266, "grad_norm": 12154341.360154865, "learning_rate": 2.3385300668151448e-08, "logits/chosen": -2.308772563934326, "logits/rejected": -2.3092708587646484, "logps/chosen": -114.74493408203125, "logps/rejected": -176.45138549804688, "loss": 115003.975, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.03354664891958237, "rewards/margins": 0.0526542142033577, "rewards/rejected": -0.08620086312294006, "step": 2390 }, { "epoch": 0.9619238476953907, "grad_norm": 7519811.285504107, "learning_rate": 2.1158129175946545e-08, "logits/chosen": -2.312025785446167, "logits/rejected": -2.3464319705963135, "logps/chosen": -119.38179779052734, "logps/rejected": -166.91592407226562, "loss": 122030.7125, "rewards/accuracies": 0.6875, "rewards/chosen": -0.02894437685608864, "rewards/margins": 0.045145101845264435, "rewards/rejected": -0.07408948242664337, "step": 2400 }, { "epoch": 0.9659318637274549, "grad_norm": 8883024.450495958, "learning_rate": 1.8930957683741647e-08, "logits/chosen": -2.344897747039795, "logits/rejected": -2.285876512527466, "logps/chosen": -111.0980224609375, "logps/rejected": -164.24978637695312, "loss": 121476.0125, "rewards/accuracies": 0.6875, "rewards/chosen": -0.026049071922898293, "rewards/margins": 0.048600487411022186, "rewards/rejected": -0.07464955747127533, "step": 2410 }, { "epoch": 0.969939879759519, "grad_norm": 10097197.568430113, "learning_rate": 1.6703786191536748e-08, "logits/chosen": -2.388754367828369, "logits/rejected": -2.3738179206848145, "logps/chosen": -125.5340576171875, "logps/rejected": -173.31170654296875, "loss": 121610.525, "rewards/accuracies": 0.6875, "rewards/chosen": -0.02290264144539833, "rewards/margins": 0.04468691721558571, "rewards/rejected": -0.06758955866098404, "step": 2420 }, { "epoch": 0.9739478957915831, "grad_norm": 11378129.987507869, "learning_rate": 1.4476614699331847e-08, "logits/chosen": -2.3605947494506836, "logits/rejected": -2.3231589794158936, "logps/chosen": -117.72758483886719, "logps/rejected": -183.6721649169922, "loss": 119195.675, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.027614343911409378, "rewards/margins": 0.05597345903515816, "rewards/rejected": -0.08358780294656754, "step": 2430 }, { "epoch": 0.9779559118236473, "grad_norm": 8462760.728667326, "learning_rate": 1.2249443207126947e-08, "logits/chosen": -2.2801132202148438, "logits/rejected": -2.292315721511841, "logps/chosen": -127.86152648925781, "logps/rejected": -175.5904083251953, "loss": 124485.325, "rewards/accuracies": 0.6875, "rewards/chosen": -0.03231586143374443, "rewards/margins": 0.03901532292366028, "rewards/rejected": -0.07133118063211441, "step": 2440 }, { "epoch": 0.9819639278557114, "grad_norm": 9243740.732393652, "learning_rate": 1.002227171492205e-08, "logits/chosen": -2.277127981185913, "logits/rejected": -2.2810211181640625, "logps/chosen": -117.18321228027344, "logps/rejected": -148.31886291503906, "loss": 127954.8875, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.025695014744997025, "rewards/margins": 0.026529842987656593, "rewards/rejected": -0.052224863320589066, "step": 2450 }, { "epoch": 0.9859719438877755, "grad_norm": 9218086.60683555, "learning_rate": 7.79510022271715e-09, "logits/chosen": -2.361515998840332, "logits/rejected": -2.3449158668518066, "logps/chosen": -115.47230529785156, "logps/rejected": -134.2152099609375, "loss": 121513.8, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.02808554843068123, "rewards/margins": 0.022291336208581924, "rewards/rejected": -0.05037688463926315, "step": 2460 }, { "epoch": 0.9899799599198397, "grad_norm": 8767440.113062855, "learning_rate": 5.5679287305122495e-09, "logits/chosen": -2.3940796852111816, "logits/rejected": -2.373922824859619, "logps/chosen": -146.56610107421875, "logps/rejected": -181.4602508544922, "loss": 121378.2375, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.030033543705940247, "rewards/margins": 0.033586207777261734, "rewards/rejected": -0.06361975520849228, "step": 2470 }, { "epoch": 0.9939879759519038, "grad_norm": 10436515.838738332, "learning_rate": 3.3407572383073495e-09, "logits/chosen": -2.4080631732940674, "logits/rejected": -2.3511130809783936, "logps/chosen": -110.88321685791016, "logps/rejected": -157.0704803466797, "loss": 123335.75, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.02597566321492195, "rewards/margins": 0.04258622229099274, "rewards/rejected": -0.06856188923120499, "step": 2480 }, { "epoch": 0.9979959919839679, "grad_norm": 12722707.579726782, "learning_rate": 1.1135857461024498e-09, "logits/chosen": -2.2917561531066895, "logits/rejected": -2.238448143005371, "logps/chosen": -122.59525299072266, "logps/rejected": -173.32418823242188, "loss": 120410.5375, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.02627197466790676, "rewards/margins": 0.046644873917102814, "rewards/rejected": -0.07291685789823532, "step": 2490 } ], "logging_steps": 10, "max_steps": 2495, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }