{ "best_metric": null, "best_model_checkpoint": null, "epoch": 19.753086419753085, "eval_steps": 500, "global_step": 600, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.3292181069958848, "grad_norm": 0.5561477541923523, "learning_rate": 8.333333333333333e-07, "logits/chosen": 1.7485754489898682, "logits/rejected": 1.8832639455795288, "logps/chosen": -70.18267059326172, "logps/rejected": -77.9986343383789, "loss": 0.6938, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": 0.005453853867948055, "rewards/margins": 0.013218576088547707, "rewards/rejected": -0.007764720823615789, "step": 10 }, { "epoch": 0.6584362139917695, "grad_norm": 0.48141908645629883, "learning_rate": 1.6666666666666667e-06, "logits/chosen": 1.9016907215118408, "logits/rejected": 1.9251121282577515, "logps/chosen": -96.5027847290039, "logps/rejected": -87.00735473632812, "loss": 0.6937, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.005797500256448984, "rewards/margins": -0.0004409264656715095, "rewards/rejected": -0.005356573965400457, "step": 20 }, { "epoch": 0.9876543209876543, "grad_norm": 0.4639015197753906, "learning_rate": 2.5e-06, "logits/chosen": 1.7938541173934937, "logits/rejected": 1.6962993144989014, "logps/chosen": -71.47590637207031, "logps/rejected": -66.45989227294922, "loss": 0.6942, "rewards/accuracies": 0.4124999940395355, "rewards/chosen": -0.004830303601920605, "rewards/margins": -0.00935445912182331, "rewards/rejected": 0.004524155054241419, "step": 30 }, { "epoch": 1.316872427983539, "grad_norm": 0.44931092858314514, "learning_rate": 3.3333333333333333e-06, "logits/chosen": 1.8256213665008545, "logits/rejected": 1.8677200078964233, "logps/chosen": -75.90711975097656, "logps/rejected": -76.26548767089844, "loss": 0.6935, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.004859285429120064, "rewards/margins": -0.007447429001331329, "rewards/rejected": 0.0025881435722112656, "step": 40 }, { "epoch": 1.646090534979424, "grad_norm": 0.512350857257843, "learning_rate": 4.166666666666667e-06, "logits/chosen": 1.7572576999664307, "logits/rejected": 1.7408854961395264, "logps/chosen": -80.90664672851562, "logps/rejected": -85.82096862792969, "loss": 0.6937, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.00120059447363019, "rewards/margins": -0.0018891148502007127, "rewards/rejected": 0.0006885197362862527, "step": 50 }, { "epoch": 1.9753086419753085, "grad_norm": 0.5260242819786072, "learning_rate": 5e-06, "logits/chosen": 1.827275037765503, "logits/rejected": 1.8168131113052368, "logps/chosen": -86.74467468261719, "logps/rejected": -79.20576477050781, "loss": 0.6937, "rewards/accuracies": 0.4124999940395355, "rewards/chosen": 0.0014693590346723795, "rewards/margins": -0.0025326632894575596, "rewards/rejected": 0.00400202302262187, "step": 60 }, { "epoch": 2.3045267489711936, "grad_norm": 0.5231289267539978, "learning_rate": 4.995770395678171e-06, "logits/chosen": 1.7851394414901733, "logits/rejected": 1.8952877521514893, "logps/chosen": -81.03253173828125, "logps/rejected": -88.5263442993164, "loss": 0.6932, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.003017458599060774, "rewards/margins": -0.002731734188273549, "rewards/rejected": -0.0002857256622519344, "step": 70 }, { "epoch": 2.633744855967078, "grad_norm": 0.5552361011505127, "learning_rate": 4.983095894354858e-06, "logits/chosen": 1.8227930068969727, "logits/rejected": 1.7752052545547485, "logps/chosen": -89.98479461669922, "logps/rejected": -72.01054382324219, "loss": 0.6928, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": 0.0013810636010020971, "rewards/margins": -0.0008344938978552818, "rewards/rejected": 0.0022155570331960917, "step": 80 }, { "epoch": 2.962962962962963, "grad_norm": 0.603434681892395, "learning_rate": 4.962019382530521e-06, "logits/chosen": 1.829049825668335, "logits/rejected": 1.78665030002594, "logps/chosen": -81.95549011230469, "logps/rejected": -76.07003021240234, "loss": 0.6909, "rewards/accuracies": 0.5625, "rewards/chosen": 0.0062006814405322075, "rewards/margins": 0.003528360743075609, "rewards/rejected": 0.00267231953330338, "step": 90 }, { "epoch": 3.292181069958848, "grad_norm": 0.6200206279754639, "learning_rate": 4.93261217644956e-06, "logits/chosen": 1.7920262813568115, "logits/rejected": 1.7817165851593018, "logps/chosen": -84.39167022705078, "logps/rejected": -85.04205322265625, "loss": 0.6907, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.0063859038054943085, "rewards/margins": 0.007152262143790722, "rewards/rejected": -0.0007663581636734307, "step": 100 }, { "epoch": 3.6213991769547325, "grad_norm": 0.5937617421150208, "learning_rate": 4.894973780788722e-06, "logits/chosen": 1.8367125988006592, "logits/rejected": 1.8536920547485352, "logps/chosen": -67.41716003417969, "logps/rejected": -72.06455993652344, "loss": 0.6891, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.00015740413800813258, "rewards/margins": 0.013882984407246113, "rewards/rejected": -0.01404038816690445, "step": 110 }, { "epoch": 3.950617283950617, "grad_norm": 0.8695696592330933, "learning_rate": 4.849231551964771e-06, "logits/chosen": 1.8238048553466797, "logits/rejected": 1.7972408533096313, "logps/chosen": -104.34183502197266, "logps/rejected": -81.703369140625, "loss": 0.6864, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.00020323302305769175, "rewards/margins": 0.011230283416807652, "rewards/rejected": -0.011027050204575062, "step": 120 }, { "epoch": 4.279835390946502, "grad_norm": 0.7833828926086426, "learning_rate": 4.7955402672006855e-06, "logits/chosen": 1.7440074682235718, "logits/rejected": 1.7824723720550537, "logps/chosen": -92.3200912475586, "logps/rejected": -85.88248443603516, "loss": 0.6861, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.0005388978170230985, "rewards/margins": 0.011814715340733528, "rewards/rejected": -0.011275815777480602, "step": 130 }, { "epoch": 4.609053497942387, "grad_norm": 0.7381640672683716, "learning_rate": 4.734081600808531e-06, "logits/chosen": 1.6734364032745361, "logits/rejected": 1.7311270236968994, "logps/chosen": -88.11524963378906, "logps/rejected": -97.87281799316406, "loss": 0.682, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.006451706402003765, "rewards/margins": 0.021939774975180626, "rewards/rejected": -0.028391480445861816, "step": 140 }, { "epoch": 4.938271604938271, "grad_norm": 0.7793841361999512, "learning_rate": 4.665063509461098e-06, "logits/chosen": 1.8139768838882446, "logits/rejected": 1.766761064529419, "logps/chosen": -77.28334045410156, "logps/rejected": -78.78504943847656, "loss": 0.6777, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.011237703263759613, "rewards/margins": 0.03930521756410599, "rewards/rejected": -0.050542913377285004, "step": 150 }, { "epoch": 5.267489711934156, "grad_norm": 0.7763285040855408, "learning_rate": 4.588719528532342e-06, "logits/chosen": 1.7640300989151, "logits/rejected": 1.6494834423065186, "logps/chosen": -91.71728515625, "logps/rejected": -85.06828308105469, "loss": 0.6714, "rewards/accuracies": 0.625, "rewards/chosen": -0.034448202699422836, "rewards/margins": 0.02681097947061062, "rewards/rejected": -0.0612591877579689, "step": 160 }, { "epoch": 5.596707818930041, "grad_norm": 0.9756498336791992, "learning_rate": 4.50530798188761e-06, "logits/chosen": 1.8123546838760376, "logits/rejected": 1.7651164531707764, "logps/chosen": -95.57429504394531, "logps/rejected": -83.24696350097656, "loss": 0.6667, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.02284305915236473, "rewards/margins": 0.04916313290596008, "rewards/rejected": -0.07200618833303452, "step": 170 }, { "epoch": 5.925925925925926, "grad_norm": 0.8213481903076172, "learning_rate": 4.415111107797445e-06, "logits/chosen": 1.7565195560455322, "logits/rejected": 1.8321483135223389, "logps/chosen": -90.69538879394531, "logps/rejected": -77.74930572509766, "loss": 0.6651, "rewards/accuracies": 0.75, "rewards/chosen": -0.030663728713989258, "rewards/margins": 0.07017870247364044, "rewards/rejected": -0.1008424386382103, "step": 180 }, { "epoch": 6.255144032921811, "grad_norm": 1.067845344543457, "learning_rate": 4.318434103932622e-06, "logits/chosen": 1.7998554706573486, "logits/rejected": 1.7774969339370728, "logps/chosen": -86.76210021972656, "logps/rejected": -86.25608825683594, "loss": 0.66, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.07478730380535126, "rewards/margins": 0.07410295307636261, "rewards/rejected": -0.14889024198055267, "step": 190 }, { "epoch": 6.584362139917696, "grad_norm": 0.905776858329773, "learning_rate": 4.215604094671835e-06, "logits/chosen": 1.8654216527938843, "logits/rejected": 1.8751890659332275, "logps/chosen": -74.4819107055664, "logps/rejected": -85.82235717773438, "loss": 0.654, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.08698664605617523, "rewards/margins": 0.06426878273487091, "rewards/rejected": -0.15125542879104614, "step": 200 }, { "epoch": 6.91358024691358, "grad_norm": 0.9491797685623169, "learning_rate": 4.106969024216348e-06, "logits/chosen": 1.7801555395126343, "logits/rejected": 1.8760993480682373, "logps/chosen": -88.22879028320312, "logps/rejected": -91.16268157958984, "loss": 0.6381, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -0.05148346349596977, "rewards/margins": 0.13421732187271118, "rewards/rejected": -0.18570080399513245, "step": 210 }, { "epoch": 7.242798353909465, "grad_norm": 1.0432075262069702, "learning_rate": 3.992896479256966e-06, "logits/chosen": 1.761461853981018, "logits/rejected": 1.7890026569366455, "logps/chosen": -76.50201416015625, "logps/rejected": -83.002685546875, "loss": 0.6387, "rewards/accuracies": 0.6875, "rewards/chosen": -0.1096138209104538, "rewards/margins": 0.13470278680324554, "rewards/rejected": -0.24431662261486053, "step": 220 }, { "epoch": 7.57201646090535, "grad_norm": 0.9662573933601379, "learning_rate": 3.8737724451770155e-06, "logits/chosen": 1.7268747091293335, "logits/rejected": 1.8781719207763672, "logps/chosen": -85.2467269897461, "logps/rejected": -88.09255981445312, "loss": 0.6253, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.09576521068811417, "rewards/margins": 0.19639071822166443, "rewards/rejected": -0.292155921459198, "step": 230 }, { "epoch": 7.901234567901234, "grad_norm": 1.0413649082183838, "learning_rate": 3.7500000000000005e-06, "logits/chosen": 1.8745641708374023, "logits/rejected": 1.9019807577133179, "logps/chosen": -82.0667724609375, "logps/rejected": -70.83448791503906, "loss": 0.627, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.1119164451956749, "rewards/margins": 0.1671919822692871, "rewards/rejected": -0.2791084349155426, "step": 240 }, { "epoch": 8.23045267489712, "grad_norm": 1.1005665063858032, "learning_rate": 3.621997950501156e-06, "logits/chosen": 1.8319737911224365, "logits/rejected": 1.8594881296157837, "logps/chosen": -82.2616195678711, "logps/rejected": -94.71448516845703, "loss": 0.6196, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.21282243728637695, "rewards/margins": 0.11685125529766083, "rewards/rejected": -0.3296736776828766, "step": 250 }, { "epoch": 8.559670781893004, "grad_norm": 1.0339275598526, "learning_rate": 3.4901994150978926e-06, "logits/chosen": 1.8168909549713135, "logits/rejected": 1.8122689723968506, "logps/chosen": -86.70075988769531, "logps/rejected": -87.91035461425781, "loss": 0.6097, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.21122360229492188, "rewards/margins": 0.1889980137348175, "rewards/rejected": -0.40022164583206177, "step": 260 }, { "epoch": 8.88888888888889, "grad_norm": 1.0026566982269287, "learning_rate": 3.3550503583141726e-06, "logits/chosen": 1.7882308959960938, "logits/rejected": 1.7757456302642822, "logps/chosen": -98.28022766113281, "logps/rejected": -99.22371673583984, "loss": 0.6042, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.23864057660102844, "rewards/margins": 0.17245283722877502, "rewards/rejected": -0.4110933840274811, "step": 270 }, { "epoch": 9.218106995884774, "grad_norm": 0.9841915965080261, "learning_rate": 3.217008081777726e-06, "logits/chosen": 1.8728599548339844, "logits/rejected": 1.7537968158721924, "logps/chosen": -110.29930114746094, "logps/rejected": -76.87739562988281, "loss": 0.5986, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.16357959806919098, "rewards/margins": 0.30878886580467224, "rewards/rejected": -0.47236841917037964, "step": 280 }, { "epoch": 9.547325102880658, "grad_norm": 1.0610634088516235, "learning_rate": 3.0765396768561005e-06, "logits/chosen": 1.8760344982147217, "logits/rejected": 1.8611949682235718, "logps/chosen": -83.32715606689453, "logps/rejected": -91.76313781738281, "loss": 0.5953, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.24952685832977295, "rewards/margins": 0.23791106045246124, "rewards/rejected": -0.4874378740787506, "step": 290 }, { "epoch": 9.876543209876543, "grad_norm": 1.0584696531295776, "learning_rate": 2.9341204441673267e-06, "logits/chosen": 1.713905692100525, "logits/rejected": 1.761605978012085, "logps/chosen": -84.4806900024414, "logps/rejected": -84.85515594482422, "loss": 0.5853, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.3275205194950104, "rewards/margins": 0.20603664219379425, "rewards/rejected": -0.5335571765899658, "step": 300 }, { "epoch": 10.205761316872428, "grad_norm": 1.2500941753387451, "learning_rate": 2.7902322853130758e-06, "logits/chosen": 1.70700204372406, "logits/rejected": 1.7708561420440674, "logps/chosen": -83.38414001464844, "logps/rejected": -84.97639465332031, "loss": 0.5889, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.2745892405509949, "rewards/margins": 0.3200107514858246, "rewards/rejected": -0.5945999622344971, "step": 310 }, { "epoch": 10.534979423868313, "grad_norm": 1.0091391801834106, "learning_rate": 2.6453620722761897e-06, "logits/chosen": 1.656237006187439, "logits/rejected": 1.7875343561172485, "logps/chosen": -76.54080200195312, "logps/rejected": -99.77791595458984, "loss": 0.5805, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.2655033469200134, "rewards/margins": 0.23930945992469788, "rewards/rejected": -0.5048128366470337, "step": 320 }, { "epoch": 10.864197530864198, "grad_norm": 1.280364990234375, "learning_rate": 2.5e-06, "logits/chosen": 1.7501579523086548, "logits/rejected": 1.8710143566131592, "logps/chosen": -95.80509948730469, "logps/rejected": -98.95433044433594, "loss": 0.5681, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.25459492206573486, "rewards/margins": 0.4196421504020691, "rewards/rejected": -0.674237072467804, "step": 330 }, { "epoch": 11.193415637860083, "grad_norm": 1.0981961488723755, "learning_rate": 2.3546379277238107e-06, "logits/chosen": 1.781877875328064, "logits/rejected": 1.8056217432022095, "logps/chosen": -80.20987701416016, "logps/rejected": -94.3116226196289, "loss": 0.5684, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.39583852887153625, "rewards/margins": 0.2972935140132904, "rewards/rejected": -0.6931320428848267, "step": 340 }, { "epoch": 11.522633744855966, "grad_norm": 1.1398284435272217, "learning_rate": 2.2097677146869242e-06, "logits/chosen": 1.7265570163726807, "logits/rejected": 1.7091245651245117, "logps/chosen": -75.80311584472656, "logps/rejected": -79.35089111328125, "loss": 0.5656, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.3116758465766907, "rewards/margins": 0.42105427384376526, "rewards/rejected": -0.7327300906181335, "step": 350 }, { "epoch": 11.851851851851851, "grad_norm": 1.1577041149139404, "learning_rate": 2.0658795558326745e-06, "logits/chosen": 1.692983865737915, "logits/rejected": 1.7239530086517334, "logps/chosen": -91.54319763183594, "logps/rejected": -93.06980895996094, "loss": 0.5699, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.38909879326820374, "rewards/margins": 0.34535473585128784, "rewards/rejected": -0.734453558921814, "step": 360 }, { "epoch": 12.181069958847736, "grad_norm": 1.405211329460144, "learning_rate": 1.9234603231439e-06, "logits/chosen": 1.7344582080841064, "logits/rejected": 1.7111711502075195, "logps/chosen": -88.46275329589844, "logps/rejected": -82.77845001220703, "loss": 0.5577, "rewards/accuracies": 0.8125, "rewards/chosen": -0.3398984372615814, "rewards/margins": 0.4276389181613922, "rewards/rejected": -0.7675372958183289, "step": 370 }, { "epoch": 12.510288065843621, "grad_norm": 1.4324347972869873, "learning_rate": 1.7829919182222752e-06, "logits/chosen": 1.7103168964385986, "logits/rejected": 1.7571923732757568, "logps/chosen": -83.42256927490234, "logps/rejected": -95.48295593261719, "loss": 0.5493, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.39600270986557007, "rewards/margins": 0.447258323431015, "rewards/rejected": -0.8432610630989075, "step": 380 }, { "epoch": 12.839506172839506, "grad_norm": 1.1979031562805176, "learning_rate": 1.6449496416858285e-06, "logits/chosen": 1.6545474529266357, "logits/rejected": 1.7649863958358765, "logps/chosen": -78.60997772216797, "logps/rejected": -91.09608459472656, "loss": 0.5618, "rewards/accuracies": 0.875, "rewards/chosen": -0.3745049834251404, "rewards/margins": 0.4601859450340271, "rewards/rejected": -0.8346909284591675, "step": 390 }, { "epoch": 13.168724279835391, "grad_norm": 1.164794683456421, "learning_rate": 1.509800584902108e-06, "logits/chosen": 1.7000430822372437, "logits/rejected": 1.7276875972747803, "logps/chosen": -90.08226013183594, "logps/rejected": -82.01007843017578, "loss": 0.5517, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.3675037622451782, "rewards/margins": 0.44133225083351135, "rewards/rejected": -0.8088359832763672, "step": 400 }, { "epoch": 13.497942386831276, "grad_norm": 1.111011028289795, "learning_rate": 1.3780020494988447e-06, "logits/chosen": 1.5840178728103638, "logits/rejected": 1.7387195825576782, "logps/chosen": -88.83552551269531, "logps/rejected": -125.14837646484375, "loss": 0.5549, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.4572484493255615, "rewards/margins": 0.3457504212856293, "rewards/rejected": -0.8029988408088684, "step": 410 }, { "epoch": 13.82716049382716, "grad_norm": 1.257519006729126, "learning_rate": 1.2500000000000007e-06, "logits/chosen": 1.6768709421157837, "logits/rejected": 1.6739391088485718, "logps/chosen": -88.93721008300781, "logps/rejected": -79.91072082519531, "loss": 0.5387, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.3928380012512207, "rewards/margins": 0.37974029779434204, "rewards/rejected": -0.7725783586502075, "step": 420 }, { "epoch": 14.156378600823045, "grad_norm": 1.1822686195373535, "learning_rate": 1.1262275548229852e-06, "logits/chosen": 1.664902925491333, "logits/rejected": 1.75554621219635, "logps/chosen": -86.20747375488281, "logps/rejected": -88.08210754394531, "loss": 0.5424, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.4149021506309509, "rewards/margins": 0.4194130003452301, "rewards/rejected": -0.8343151211738586, "step": 430 }, { "epoch": 14.48559670781893, "grad_norm": 1.1221873760223389, "learning_rate": 1.0071035207430352e-06, "logits/chosen": 1.6271368265151978, "logits/rejected": 1.7387148141860962, "logps/chosen": -73.69468688964844, "logps/rejected": -94.08692932128906, "loss": 0.5539, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.38762181997299194, "rewards/margins": 0.32877033948898315, "rewards/rejected": -0.7163921594619751, "step": 440 }, { "epoch": 14.814814814814815, "grad_norm": 1.1013416051864624, "learning_rate": 8.930309757836517e-07, "logits/chosen": 1.711073637008667, "logits/rejected": 1.7389122247695923, "logps/chosen": -95.46138763427734, "logps/rejected": -89.34103393554688, "loss": 0.5277, "rewards/accuracies": 0.8125, "rewards/chosen": -0.37107351422309875, "rewards/margins": 0.5281612277030945, "rewards/rejected": -0.8992347717285156, "step": 450 }, { "epoch": 15.1440329218107, "grad_norm": 1.1581361293792725, "learning_rate": 7.843959053281663e-07, "logits/chosen": 1.5964223146438599, "logits/rejected": 1.6815162897109985, "logps/chosen": -84.13243103027344, "logps/rejected": -86.10100555419922, "loss": 0.5482, "rewards/accuracies": 0.8125, "rewards/chosen": -0.48493900895118713, "rewards/margins": 0.46702641248703003, "rewards/rejected": -0.9519654512405396, "step": 460 }, { "epoch": 15.473251028806585, "grad_norm": 1.1499443054199219, "learning_rate": 6.815658960673782e-07, "logits/chosen": 1.761904001235962, "logits/rejected": 1.7722113132476807, "logps/chosen": -83.77279663085938, "logps/rejected": -81.67822265625, "loss": 0.5428, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.439403772354126, "rewards/margins": 0.3410206437110901, "rewards/rejected": -0.7804244160652161, "step": 470 }, { "epoch": 15.802469135802468, "grad_norm": 1.3699110746383667, "learning_rate": 5.848888922025553e-07, "logits/chosen": 1.7152798175811768, "logits/rejected": 1.7187302112579346, "logps/chosen": -91.2159652709961, "logps/rejected": -82.41645812988281, "loss": 0.5314, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.41314107179641724, "rewards/margins": 0.44486457109451294, "rewards/rejected": -0.858005702495575, "step": 480 }, { "epoch": 16.131687242798353, "grad_norm": 1.2674903869628906, "learning_rate": 4.946920181123904e-07, "logits/chosen": 1.7741254568099976, "logits/rejected": 1.7767646312713623, "logps/chosen": -97.18695831298828, "logps/rejected": -101.12361907958984, "loss": 0.5384, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.4852083623409271, "rewards/margins": 0.3798518776893616, "rewards/rejected": -0.8650602102279663, "step": 490 }, { "epoch": 16.46090534979424, "grad_norm": 1.2736115455627441, "learning_rate": 4.1128047146765936e-07, "logits/chosen": 1.7331253290176392, "logits/rejected": 1.8541399240493774, "logps/chosen": -89.29019165039062, "logps/rejected": -92.32213592529297, "loss": 0.5441, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.5169966816902161, "rewards/margins": 0.4245742857456207, "rewards/rejected": -0.9415708780288696, "step": 500 }, { "epoch": 16.790123456790123, "grad_norm": 1.735156536102295, "learning_rate": 3.3493649053890325e-07, "logits/chosen": 1.6830623149871826, "logits/rejected": 1.6669334173202515, "logps/chosen": -106.5848388671875, "logps/rejected": -88.74864196777344, "loss": 0.5233, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.43169134855270386, "rewards/margins": 0.38342052698135376, "rewards/rejected": -0.8151118159294128, "step": 510 }, { "epoch": 17.11934156378601, "grad_norm": 1.1315010786056519, "learning_rate": 2.6591839919146963e-07, "logits/chosen": 1.6711467504501343, "logits/rejected": 1.7918866872787476, "logps/chosen": -79.41645050048828, "logps/rejected": -94.12269592285156, "loss": 0.5301, "rewards/accuracies": 0.8125, "rewards/chosen": -0.4840959906578064, "rewards/margins": 0.4427928030490875, "rewards/rejected": -0.9268887639045715, "step": 520 }, { "epoch": 17.448559670781894, "grad_norm": 1.1289525032043457, "learning_rate": 2.044597327993153e-07, "logits/chosen": 1.6908950805664062, "logits/rejected": 1.7840102910995483, "logps/chosen": -81.63914489746094, "logps/rejected": -87.42573547363281, "loss": 0.5401, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.4083719849586487, "rewards/margins": 0.46273595094680786, "rewards/rejected": -0.8711079359054565, "step": 530 }, { "epoch": 17.77777777777778, "grad_norm": 1.6032687425613403, "learning_rate": 1.507684480352292e-07, "logits/chosen": 1.5787999629974365, "logits/rejected": 1.7288663387298584, "logps/chosen": -79.30626678466797, "logps/rejected": -110.68818664550781, "loss": 0.5326, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.449043333530426, "rewards/margins": 0.5290273427963257, "rewards/rejected": -0.9780707359313965, "step": 540 }, { "epoch": 18.106995884773664, "grad_norm": 1.1730103492736816, "learning_rate": 1.0502621921127776e-07, "logits/chosen": 1.7219009399414062, "logits/rejected": 1.7316343784332275, "logps/chosen": -89.55079650878906, "logps/rejected": -91.233642578125, "loss": 0.5312, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.42370209097862244, "rewards/margins": 0.44324246048927307, "rewards/rejected": -0.8669446110725403, "step": 550 }, { "epoch": 18.43621399176955, "grad_norm": 1.4936864376068115, "learning_rate": 6.738782355044048e-08, "logits/chosen": 1.7580926418304443, "logits/rejected": 1.8186404705047607, "logps/chosen": -70.40943908691406, "logps/rejected": -91.91678619384766, "loss": 0.5362, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.42562800645828247, "rewards/margins": 0.34529608488082886, "rewards/rejected": -0.7709239721298218, "step": 560 }, { "epoch": 18.765432098765434, "grad_norm": 1.1581302881240845, "learning_rate": 3.798061746947995e-08, "logits/chosen": 1.625957727432251, "logits/rejected": 1.6400096416473389, "logps/chosen": -95.5112533569336, "logps/rejected": -83.62013244628906, "loss": 0.5241, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -0.3977498412132263, "rewards/margins": 0.4945148527622223, "rewards/rejected": -0.8922646641731262, "step": 570 }, { "epoch": 19.094650205761315, "grad_norm": 1.129463791847229, "learning_rate": 1.6904105645142443e-08, "logits/chosen": 1.6559406518936157, "logits/rejected": 1.744818091392517, "logps/chosen": -83.44347381591797, "logps/rejected": -98.82951354980469, "loss": 0.5302, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -0.4388834834098816, "rewards/margins": 0.4895119071006775, "rewards/rejected": -0.9283954501152039, "step": 580 }, { "epoch": 19.4238683127572, "grad_norm": 1.193711519241333, "learning_rate": 4.229604321829561e-09, "logits/chosen": 1.818708062171936, "logits/rejected": 1.714835524559021, "logps/chosen": -118.84761047363281, "logps/rejected": -82.6863784790039, "loss": 0.5319, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.3866303563117981, "rewards/margins": 0.44671958684921265, "rewards/rejected": -0.8333500027656555, "step": 590 }, { "epoch": 19.753086419753085, "grad_norm": 1.4505993127822876, "learning_rate": 0.0, "logits/chosen": 1.70465087890625, "logits/rejected": 1.680711030960083, "logps/chosen": -107.31657409667969, "logps/rejected": -92.33575439453125, "loss": 0.5352, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -0.4541262984275818, "rewards/margins": 0.4393085539340973, "rewards/rejected": -0.8934348821640015, "step": 600 }, { "epoch": 19.753086419753085, "step": 600, "total_flos": 2.0386871270503875e+18, "train_loss": 0.6034941236178081, "train_runtime": 4431.3984, "train_samples_per_second": 8.747, "train_steps_per_second": 0.135 } ], "logging_steps": 10, "max_steps": 600, "num_input_tokens_seen": 0, "num_train_epochs": 20, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.0386871270503875e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }