{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9996685449121644, "eval_steps": 200, "global_step": 754, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.013258203513423931, "grad_norm": 27.282764434814453, "learning_rate": 5.000000000000001e-07, "logits/chosen": -0.5551050901412964, "logits/rejected": -0.5903115272521973, "logps/chosen": -123.05072021484375, "logps/rejected": -128.62611389160156, "loss": 1.9744, "nll_loss": 2.560427188873291, "rewards/accuracies": 0.38749998807907104, "rewards/chosen": -0.002333300421014428, "rewards/margins": -0.002295339945703745, "rewards/rejected": -3.796028977376409e-05, "step": 10 }, { "epoch": 0.026516407026847863, "grad_norm": 17.47486114501953, "learning_rate": 1.0000000000000002e-06, "logits/chosen": -0.5378572940826416, "logits/rejected": -0.5796166658401489, "logps/chosen": -125.56513977050781, "logps/rejected": -122.72200012207031, "loss": 1.9287, "nll_loss": 2.4739668369293213, "rewards/accuracies": 0.534375011920929, "rewards/chosen": 0.0028954462613910437, "rewards/margins": 0.004081044811755419, "rewards/rejected": -0.0011855984339490533, "step": 20 }, { "epoch": 0.039774610540271794, "grad_norm": 28.93717384338379, "learning_rate": 1.5e-06, "logits/chosen": -0.5573912262916565, "logits/rejected": -0.6301255226135254, "logps/chosen": -120.21688079833984, "logps/rejected": -120.69698333740234, "loss": 1.9568, "nll_loss": 2.5327491760253906, "rewards/accuracies": 0.628125011920929, "rewards/chosen": 0.006910824682563543, "rewards/margins": 0.007299685385078192, "rewards/rejected": -0.0003888603823725134, "step": 30 }, { "epoch": 0.053032814053695726, "grad_norm": 26.4592227935791, "learning_rate": 2.0000000000000003e-06, "logits/chosen": -0.5415462255477905, "logits/rejected": -0.5897966623306274, "logps/chosen": -123.216064453125, "logps/rejected": -116.96390533447266, "loss": 1.945, "nll_loss": 2.5195693969726562, "rewards/accuracies": 0.6968749761581421, "rewards/chosen": 0.02091406285762787, "rewards/margins": 0.020817000418901443, "rewards/rejected": 9.70602995948866e-05, "step": 40 }, { "epoch": 0.06629101756711965, "grad_norm": 17.95655059814453, "learning_rate": 2.5e-06, "logits/chosen": -0.5369003415107727, "logits/rejected": -0.5648149251937866, "logps/chosen": -112.5962905883789, "logps/rejected": -106.9513931274414, "loss": 1.93, "nll_loss": 2.506865978240967, "rewards/accuracies": 0.71875, "rewards/chosen": 0.04093035310506821, "rewards/margins": 0.0435122512280941, "rewards/rejected": -0.0025818957947194576, "step": 50 }, { "epoch": 0.07954922108054359, "grad_norm": 18.845481872558594, "learning_rate": 3e-06, "logits/chosen": -0.5306503176689148, "logits/rejected": -0.5870386958122253, "logps/chosen": -115.35811614990234, "logps/rejected": -119.94677734375, "loss": 1.9014, "nll_loss": 2.470397472381592, "rewards/accuracies": 0.75, "rewards/chosen": 0.0722598135471344, "rewards/margins": 0.07325105369091034, "rewards/rejected": -0.000991240842267871, "step": 60 }, { "epoch": 0.09280742459396751, "grad_norm": 22.062213897705078, "learning_rate": 3.5e-06, "logits/chosen": -0.5302293300628662, "logits/rejected": -0.5745421648025513, "logps/chosen": -107.54048156738281, "logps/rejected": -108.28858947753906, "loss": 1.9362, "nll_loss": 2.587956428527832, "rewards/accuracies": 0.7593749761581421, "rewards/chosen": 0.1419026404619217, "rewards/margins": 0.144887775182724, "rewards/rejected": -0.0029851621948182583, "step": 70 }, { "epoch": 0.10606562810739145, "grad_norm": 22.929567337036133, "learning_rate": 4.000000000000001e-06, "logits/chosen": -0.5311123132705688, "logits/rejected": -0.601387083530426, "logps/chosen": -106.30036926269531, "logps/rejected": -114.9739761352539, "loss": 1.8807, "nll_loss": 2.5304553508758545, "rewards/accuracies": 0.815625011920929, "rewards/chosen": 0.22294898331165314, "rewards/margins": 0.23250994086265564, "rewards/rejected": -0.009560950100421906, "step": 80 }, { "epoch": 0.11932383162081538, "grad_norm": 15.968583106994629, "learning_rate": 4.5e-06, "logits/chosen": -0.5320655703544617, "logits/rejected": -0.558965802192688, "logps/chosen": -113.92137145996094, "logps/rejected": -106.32939147949219, "loss": 1.8047, "nll_loss": 2.424870014190674, "rewards/accuracies": 0.8125, "rewards/chosen": 0.31752246618270874, "rewards/margins": 0.33052030205726624, "rewards/rejected": -0.012997796759009361, "step": 90 }, { "epoch": 0.1325820351342393, "grad_norm": 16.777925491333008, "learning_rate": 5e-06, "logits/chosen": -0.5369315147399902, "logits/rejected": -0.550090491771698, "logps/chosen": -115.67036437988281, "logps/rejected": -113.78245544433594, "loss": 1.7842, "nll_loss": 2.421271800994873, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.4060588777065277, "rewards/margins": 0.42890095710754395, "rewards/rejected": -0.02284209243953228, "step": 100 }, { "epoch": 0.14584023864766324, "grad_norm": 12.77545166015625, "learning_rate": 4.923547400611622e-06, "logits/chosen": -0.4923822283744812, "logits/rejected": -0.550975501537323, "logps/chosen": -104.06398010253906, "logps/rejected": -105.51200103759766, "loss": 1.7203, "nll_loss": 2.3695566654205322, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.5946463942527771, "rewards/margins": 0.6244661211967468, "rewards/rejected": -0.02981976605951786, "step": 110 }, { "epoch": 0.15909844216108718, "grad_norm": 14.160531997680664, "learning_rate": 4.847094801223242e-06, "logits/chosen": -0.5261751413345337, "logits/rejected": -0.593400239944458, "logps/chosen": -109.50382995605469, "logps/rejected": -117.4461669921875, "loss": 1.6824, "nll_loss": 2.298811674118042, "rewards/accuracies": 0.831250011920929, "rewards/chosen": 0.646867573261261, "rewards/margins": 0.6841082572937012, "rewards/rejected": -0.037240687757730484, "step": 120 }, { "epoch": 0.17235664567451112, "grad_norm": 11.232987403869629, "learning_rate": 4.770642201834863e-06, "logits/chosen": -0.5387733578681946, "logits/rejected": -0.5604445338249207, "logps/chosen": -105.5321044921875, "logps/rejected": -108.08251953125, "loss": 1.6447, "nll_loss": 2.28352689743042, "rewards/accuracies": 0.846875011920929, "rewards/chosen": 0.8856587409973145, "rewards/margins": 0.9316526651382446, "rewards/rejected": -0.04599405825138092, "step": 130 }, { "epoch": 0.18561484918793503, "grad_norm": 13.136269569396973, "learning_rate": 4.694189602446483e-06, "logits/chosen": -0.4762907028198242, "logits/rejected": -0.5645761489868164, "logps/chosen": -104.01419830322266, "logps/rejected": -105.42718505859375, "loss": 1.5795, "nll_loss": 2.1962618827819824, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": 1.1001434326171875, "rewards/margins": 1.1506679058074951, "rewards/rejected": -0.0505245216190815, "step": 140 }, { "epoch": 0.19887305270135897, "grad_norm": 11.437210083007812, "learning_rate": 4.617737003058104e-06, "logits/chosen": -0.45225948095321655, "logits/rejected": -0.5816742181777954, "logps/chosen": -95.95039367675781, "logps/rejected": -116.07032775878906, "loss": 1.5695, "nll_loss": 2.187495708465576, "rewards/accuracies": 0.875, "rewards/chosen": 1.3856970071792603, "rewards/margins": 1.4338386058807373, "rewards/rejected": -0.04814162850379944, "step": 150 }, { "epoch": 0.2121312562147829, "grad_norm": 11.217316627502441, "learning_rate": 4.541284403669725e-06, "logits/chosen": -0.4223412573337555, "logits/rejected": -0.5557634234428406, "logps/chosen": -99.70536804199219, "logps/rejected": -108.40791320800781, "loss": 1.5248, "nll_loss": 2.0865731239318848, "rewards/accuracies": 0.856249988079071, "rewards/chosen": 1.4318909645080566, "rewards/margins": 1.4701259136199951, "rewards/rejected": -0.03823506087064743, "step": 160 }, { "epoch": 0.22538945972820681, "grad_norm": 11.582230567932129, "learning_rate": 4.464831804281346e-06, "logits/chosen": -0.4045068323612213, "logits/rejected": -0.5808693170547485, "logps/chosen": -101.94197845458984, "logps/rejected": -115.78692626953125, "loss": 1.5259, "nll_loss": 2.059906482696533, "rewards/accuracies": 0.871874988079071, "rewards/chosen": 1.6565885543823242, "rewards/margins": 1.685520887374878, "rewards/rejected": -0.028932059183716774, "step": 170 }, { "epoch": 0.23864766324163075, "grad_norm": 9.85542106628418, "learning_rate": 4.388379204892967e-06, "logits/chosen": -0.40881863236427307, "logits/rejected": -0.5515257120132446, "logps/chosen": -94.77958679199219, "logps/rejected": -109.2522201538086, "loss": 1.4906, "nll_loss": 2.006005048751831, "rewards/accuracies": 0.8968750238418579, "rewards/chosen": 1.7907886505126953, "rewards/margins": 1.8029606342315674, "rewards/rejected": -0.012172091752290726, "step": 180 }, { "epoch": 0.25190586675505466, "grad_norm": 11.775798797607422, "learning_rate": 4.311926605504588e-06, "logits/chosen": -0.40728694200515747, "logits/rejected": -0.5780837535858154, "logps/chosen": -102.8525619506836, "logps/rejected": -113.8287353515625, "loss": 1.5071, "nll_loss": 2.015195846557617, "rewards/accuracies": 0.903124988079071, "rewards/chosen": 1.94949209690094, "rewards/margins": 1.9533236026763916, "rewards/rejected": -0.003831386100500822, "step": 190 }, { "epoch": 0.2651640702684786, "grad_norm": 9.207958221435547, "learning_rate": 4.235474006116208e-06, "logits/chosen": -0.3382512331008911, "logits/rejected": -0.5608124136924744, "logps/chosen": -95.6505355834961, "logps/rejected": -115.0374755859375, "loss": 1.4625, "nll_loss": 1.900460958480835, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": 1.9269275665283203, "rewards/margins": 1.9018806219100952, "rewards/rejected": 0.0250468198210001, "step": 200 }, { "epoch": 0.2651640702684786, "eval_logits/chosen": 0.10900751501321793, "eval_logits/rejected": -0.6459429860115051, "eval_logps/chosen": -26.658893585205078, "eval_logps/rejected": -23.843191146850586, "eval_loss": 1.61775803565979, "eval_nll_loss": 2.3195407390594482, "eval_rewards/accuracies": 0.995156466960907, "eval_rewards/chosen": 1.6962153911590576, "eval_rewards/margins": 1.591001272201538, "eval_rewards/rejected": 0.10521402209997177, "eval_runtime": 126.4924, "eval_samples_per_second": 21.203, "eval_steps_per_second": 5.305, "step": 200 }, { "epoch": 0.27842227378190254, "grad_norm": 11.874744415283203, "learning_rate": 4.1590214067278286e-06, "logits/chosen": -0.3222394287586212, "logits/rejected": -0.5524163246154785, "logps/chosen": -91.2304916381836, "logps/rejected": -112.20948791503906, "loss": 1.4373, "nll_loss": 1.863207221031189, "rewards/accuracies": 0.8843749761581421, "rewards/chosen": 2.0715649127960205, "rewards/margins": 2.029603958129883, "rewards/rejected": 0.04196098819375038, "step": 210 }, { "epoch": 0.2916804772953265, "grad_norm": 12.305173873901367, "learning_rate": 4.08256880733945e-06, "logits/chosen": -0.24730145931243896, "logits/rejected": -0.5199188590049744, "logps/chosen": -87.4412841796875, "logps/rejected": -108.43525695800781, "loss": 1.4032, "nll_loss": 1.8019367456436157, "rewards/accuracies": 0.8968750238418579, "rewards/chosen": 2.422152280807495, "rewards/margins": 2.348961114883423, "rewards/rejected": 0.07319097220897675, "step": 220 }, { "epoch": 0.3049386808087504, "grad_norm": 10.129953384399414, "learning_rate": 4.00611620795107e-06, "logits/chosen": -0.31045737862586975, "logits/rejected": -0.5804657340049744, "logps/chosen": -91.82988739013672, "logps/rejected": -122.0359878540039, "loss": 1.4386, "nll_loss": 1.850035309791565, "rewards/accuracies": 0.90625, "rewards/chosen": 2.229877233505249, "rewards/margins": 2.152693748474121, "rewards/rejected": 0.07718367874622345, "step": 230 }, { "epoch": 0.31819688432217436, "grad_norm": 13.068564414978027, "learning_rate": 3.9296636085626916e-06, "logits/chosen": -0.2514588534832001, "logits/rejected": -0.5521794557571411, "logps/chosen": -87.9677963256836, "logps/rejected": -109.2548828125, "loss": 1.4216, "nll_loss": 1.8036372661590576, "rewards/accuracies": 0.909375011920929, "rewards/chosen": 2.569032907485962, "rewards/margins": 2.454697370529175, "rewards/rejected": 0.11433545500040054, "step": 240 }, { "epoch": 0.3314550878355983, "grad_norm": 9.738222122192383, "learning_rate": 3.853211009174313e-06, "logits/chosen": -0.22102048993110657, "logits/rejected": -0.5118038654327393, "logps/chosen": -81.05973815917969, "logps/rejected": -107.3470458984375, "loss": 1.3802, "nll_loss": 1.738581895828247, "rewards/accuracies": 0.890625, "rewards/chosen": 2.435701847076416, "rewards/margins": 2.2943472862243652, "rewards/rejected": 0.14135441184043884, "step": 250 }, { "epoch": 0.34471329134902223, "grad_norm": 11.227466583251953, "learning_rate": 3.776758409785933e-06, "logits/chosen": -0.24970126152038574, "logits/rejected": -0.5423383116722107, "logps/chosen": -90.58589172363281, "logps/rejected": -124.6546859741211, "loss": 1.4231, "nll_loss": 1.796224594116211, "rewards/accuracies": 0.8843749761581421, "rewards/chosen": 2.3060898780822754, "rewards/margins": 2.1725523471832275, "rewards/rejected": 0.13353754580020905, "step": 260 }, { "epoch": 0.3579714948624461, "grad_norm": 10.950806617736816, "learning_rate": 3.7003058103975537e-06, "logits/chosen": -0.22132663428783417, "logits/rejected": -0.5066910982131958, "logps/chosen": -83.74676513671875, "logps/rejected": -106.14500427246094, "loss": 1.3775, "nll_loss": 1.7145506143569946, "rewards/accuracies": 0.921875, "rewards/chosen": 2.4925014972686768, "rewards/margins": 2.321049690246582, "rewards/rejected": 0.17145180702209473, "step": 270 }, { "epoch": 0.37122969837587005, "grad_norm": 10.97486686706543, "learning_rate": 3.6238532110091746e-06, "logits/chosen": -0.258320152759552, "logits/rejected": -0.5529795289039612, "logps/chosen": -86.88526916503906, "logps/rejected": -111.84498596191406, "loss": 1.3977, "nll_loss": 1.7480520009994507, "rewards/accuracies": 0.8968750238418579, "rewards/chosen": 2.6251580715179443, "rewards/margins": 2.444127321243286, "rewards/rejected": 0.1810309737920761, "step": 280 }, { "epoch": 0.384487901889294, "grad_norm": 9.71605110168457, "learning_rate": 3.5474006116207954e-06, "logits/chosen": -0.2799197733402252, "logits/rejected": -0.5588380098342896, "logps/chosen": -95.93229675292969, "logps/rejected": -128.71484375, "loss": 1.4329, "nll_loss": 1.8220994472503662, "rewards/accuracies": 0.940625011920929, "rewards/chosen": 2.5101046562194824, "rewards/margins": 2.3628010749816895, "rewards/rejected": 0.14730362594127655, "step": 290 }, { "epoch": 0.39774610540271793, "grad_norm": 21.213260650634766, "learning_rate": 3.4709480122324163e-06, "logits/chosen": -0.18812108039855957, "logits/rejected": -0.50641930103302, "logps/chosen": -90.42585754394531, "logps/rejected": -107.69468688964844, "loss": 1.39, "nll_loss": 1.7294094562530518, "rewards/accuracies": 0.940625011920929, "rewards/chosen": 2.7429447174072266, "rewards/margins": 2.536832094192505, "rewards/rejected": 0.20611290633678436, "step": 300 }, { "epoch": 0.41100430891614187, "grad_norm": 11.334510803222656, "learning_rate": 3.394495412844037e-06, "logits/chosen": -0.2053213119506836, "logits/rejected": -0.5050525665283203, "logps/chosen": -84.91264343261719, "logps/rejected": -112.988037109375, "loss": 1.375, "nll_loss": 1.7265828847885132, "rewards/accuracies": 0.921875, "rewards/chosen": 2.71644926071167, "rewards/margins": 2.498016834259033, "rewards/rejected": 0.21843275427818298, "step": 310 }, { "epoch": 0.4242625124295658, "grad_norm": 10.023775100708008, "learning_rate": 3.318042813455658e-06, "logits/chosen": -0.20158584415912628, "logits/rejected": -0.5022256970405579, "logps/chosen": -90.71162414550781, "logps/rejected": -111.33503723144531, "loss": 1.381, "nll_loss": 1.7314481735229492, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 2.7540974617004395, "rewards/margins": 2.4751474857330322, "rewards/rejected": 0.2789500057697296, "step": 320 }, { "epoch": 0.43752071594298975, "grad_norm": 10.706851959228516, "learning_rate": 3.2415902140672784e-06, "logits/chosen": -0.1562536209821701, "logits/rejected": -0.5007289052009583, "logps/chosen": -89.73895263671875, "logps/rejected": -106.5843276977539, "loss": 1.3693, "nll_loss": 1.6994127035140991, "rewards/accuracies": 0.903124988079071, "rewards/chosen": 2.805201292037964, "rewards/margins": 2.5105228424072266, "rewards/rejected": 0.29467862844467163, "step": 330 }, { "epoch": 0.45077891945641363, "grad_norm": 25.567277908325195, "learning_rate": 3.1651376146788993e-06, "logits/chosen": -0.23418506979942322, "logits/rejected": -0.5170575976371765, "logps/chosen": -102.624267578125, "logps/rejected": -112.3178939819336, "loss": 1.4032, "nll_loss": 1.7835899591445923, "rewards/accuracies": 0.918749988079071, "rewards/chosen": 2.494511842727661, "rewards/margins": 2.1974825859069824, "rewards/rejected": 0.2970294654369354, "step": 340 }, { "epoch": 0.46403712296983757, "grad_norm": 12.165983200073242, "learning_rate": 3.08868501529052e-06, "logits/chosen": -0.20166996121406555, "logits/rejected": -0.4993151128292084, "logps/chosen": -99.72298431396484, "logps/rejected": -117.60599517822266, "loss": 1.387, "nll_loss": 1.7688575983047485, "rewards/accuracies": 0.9156249761581421, "rewards/chosen": 2.6475043296813965, "rewards/margins": 2.348958969116211, "rewards/rejected": 0.2985452711582184, "step": 350 }, { "epoch": 0.4772953264832615, "grad_norm": 9.041903495788574, "learning_rate": 3.012232415902141e-06, "logits/chosen": -0.07940540462732315, "logits/rejected": -0.4662111699581146, "logps/chosen": -75.92832946777344, "logps/rejected": -104.47607421875, "loss": 1.3136, "nll_loss": 1.6304452419281006, "rewards/accuracies": 0.921875, "rewards/chosen": 2.911555051803589, "rewards/margins": 2.5380759239196777, "rewards/rejected": 0.37347906827926636, "step": 360 }, { "epoch": 0.49055352999668544, "grad_norm": 10.296601295471191, "learning_rate": 2.935779816513762e-06, "logits/chosen": -0.08891765028238297, "logits/rejected": -0.4381803572177887, "logps/chosen": -83.01859283447266, "logps/rejected": -97.76497650146484, "loss": 1.3309, "nll_loss": 1.6442056894302368, "rewards/accuracies": 0.9156249761581421, "rewards/chosen": 2.8263065814971924, "rewards/margins": 2.439760208129883, "rewards/rejected": 0.3865460455417633, "step": 370 }, { "epoch": 0.5038117335101093, "grad_norm": 11.621145248413086, "learning_rate": 2.8593272171253827e-06, "logits/chosen": -0.13201047480106354, "logits/rejected": -0.4527694582939148, "logps/chosen": -87.77113342285156, "logps/rejected": -122.87522888183594, "loss": 1.3474, "nll_loss": 1.6941699981689453, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 2.6734607219696045, "rewards/margins": 2.329221248626709, "rewards/rejected": 0.3442399501800537, "step": 380 }, { "epoch": 0.5170699370235333, "grad_norm": 9.234843254089355, "learning_rate": 2.782874617737003e-06, "logits/chosen": -0.13576461374759674, "logits/rejected": -0.5005991458892822, "logps/chosen": -98.45845031738281, "logps/rejected": -119.65950012207031, "loss": 1.3579, "nll_loss": 1.7100152969360352, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 2.792757034301758, "rewards/margins": 2.4400179386138916, "rewards/rejected": 0.35273903608322144, "step": 390 }, { "epoch": 0.5303281405369572, "grad_norm": 10.394464492797852, "learning_rate": 2.706422018348624e-06, "logits/chosen": -0.14902424812316895, "logits/rejected": -0.49045664072036743, "logps/chosen": -96.21540069580078, "logps/rejected": -113.64534759521484, "loss": 1.3661, "nll_loss": 1.73909592628479, "rewards/accuracies": 0.9156249761581421, "rewards/chosen": 2.7949509620666504, "rewards/margins": 2.4181408882141113, "rewards/rejected": 0.3768100440502167, "step": 400 }, { "epoch": 0.5303281405369572, "eval_logits/chosen": 0.7696120142936707, "eval_logits/rejected": -0.5085250735282898, "eval_logps/chosen": -23.760295867919922, "eval_logps/rejected": -19.921955108642578, "eval_loss": 1.6759577989578247, "eval_nll_loss": 2.218850612640381, "eval_rewards/accuracies": 0.8729507923126221, "eval_rewards/chosen": 1.9860752820968628, "eval_rewards/margins": 1.4887374639511108, "eval_rewards/rejected": 0.4973376393318176, "eval_runtime": 126.408, "eval_samples_per_second": 21.217, "eval_steps_per_second": 5.308, "step": 400 }, { "epoch": 0.5435863440503812, "grad_norm": 12.399572372436523, "learning_rate": 2.629969418960245e-06, "logits/chosen": -0.05847010016441345, "logits/rejected": -0.42683249711990356, "logps/chosen": -82.53068542480469, "logps/rejected": -105.50111389160156, "loss": 1.3133, "nll_loss": 1.6254644393920898, "rewards/accuracies": 0.9156249761581421, "rewards/chosen": 3.054617166519165, "rewards/margins": 2.591078042984009, "rewards/rejected": 0.4635390341281891, "step": 410 }, { "epoch": 0.5568445475638051, "grad_norm": 11.11414909362793, "learning_rate": 2.5535168195718657e-06, "logits/chosen": -0.10004544258117676, "logits/rejected": -0.45182594656944275, "logps/chosen": -85.72924041748047, "logps/rejected": -116.79786682128906, "loss": 1.3335, "nll_loss": 1.6914409399032593, "rewards/accuracies": 0.934374988079071, "rewards/chosen": 2.7389349937438965, "rewards/margins": 2.347107410430908, "rewards/rejected": 0.39182740449905396, "step": 420 }, { "epoch": 0.5701027510772291, "grad_norm": 11.188493728637695, "learning_rate": 2.4770642201834866e-06, "logits/chosen": -0.0028325587045401335, "logits/rejected": -0.40763336420059204, "logps/chosen": -83.37824249267578, "logps/rejected": -90.0846176147461, "loss": 1.2739, "nll_loss": 1.597538709640503, "rewards/accuracies": 0.9593750238418579, "rewards/chosen": 3.1972365379333496, "rewards/margins": 2.69289493560791, "rewards/rejected": 0.5043416619300842, "step": 430 }, { "epoch": 0.583360954590653, "grad_norm": 8.546418190002441, "learning_rate": 2.400611620795107e-06, "logits/chosen": -0.017235688865184784, "logits/rejected": -0.4104672372341156, "logps/chosen": -79.5479507446289, "logps/rejected": -100.47693634033203, "loss": 1.3056, "nll_loss": 1.6203285455703735, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 3.0248606204986572, "rewards/margins": 2.5377180576324463, "rewards/rejected": 0.4871426224708557, "step": 440 }, { "epoch": 0.596619158104077, "grad_norm": 12.532038688659668, "learning_rate": 2.324159021406728e-06, "logits/chosen": -0.08448103815317154, "logits/rejected": -0.41612687706947327, "logps/chosen": -89.61862182617188, "logps/rejected": -112.32474517822266, "loss": 1.3386, "nll_loss": 1.6953102350234985, "rewards/accuracies": 0.921875, "rewards/chosen": 2.803964376449585, "rewards/margins": 2.3399243354797363, "rewards/rejected": 0.4640396535396576, "step": 450 }, { "epoch": 0.6098773616175008, "grad_norm": 13.633370399475098, "learning_rate": 2.2477064220183487e-06, "logits/chosen": -0.013189451768994331, "logits/rejected": -0.41869059205055237, "logps/chosen": -90.39549255371094, "logps/rejected": -107.79378509521484, "loss": 1.289, "nll_loss": 1.6027923822402954, "rewards/accuracies": 0.918749988079071, "rewards/chosen": 2.938788652420044, "rewards/margins": 2.443359851837158, "rewards/rejected": 0.49542921781539917, "step": 460 }, { "epoch": 0.6231355651309247, "grad_norm": 11.236469268798828, "learning_rate": 2.1712538226299696e-06, "logits/chosen": -0.09504064172506332, "logits/rejected": NaN, "logps/chosen": -106.80879974365234, "logps/rejected": -117.15657806396484, "loss": 1.3584, "nll_loss": 1.7296257019042969, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 2.964348316192627, "rewards/margins": 2.4477875232696533, "rewards/rejected": 0.516560971736908, "step": 470 }, { "epoch": 0.6363937686443487, "grad_norm": 10.896163940429688, "learning_rate": 2.0948012232415905e-06, "logits/chosen": -0.0007806614157743752, "logits/rejected": -0.39822936058044434, "logps/chosen": -95.16539001464844, "logps/rejected": -116.85235595703125, "loss": 1.3068, "nll_loss": 1.6314979791641235, "rewards/accuracies": 0.903124988079071, "rewards/chosen": 2.8457746505737305, "rewards/margins": 2.327517509460449, "rewards/rejected": 0.518257200717926, "step": 480 }, { "epoch": 0.6496519721577726, "grad_norm": 7.040874004364014, "learning_rate": 2.0183486238532113e-06, "logits/chosen": 0.059870027005672455, "logits/rejected": -0.3692580461502075, "logps/chosen": -71.55367279052734, "logps/rejected": -92.91752624511719, "loss": 1.2388, "nll_loss": 1.5524179935455322, "rewards/accuracies": 0.953125, "rewards/chosen": 3.224459171295166, "rewards/margins": 2.605776309967041, "rewards/rejected": 0.6186825037002563, "step": 490 }, { "epoch": 0.6629101756711966, "grad_norm": 12.266393661499023, "learning_rate": 1.9418960244648317e-06, "logits/chosen": -0.09728819876909256, "logits/rejected": -0.4279538094997406, "logps/chosen": -98.3723373413086, "logps/rejected": -116.90461730957031, "loss": 1.3497, "nll_loss": 1.7364962100982666, "rewards/accuracies": 0.9281250238418579, "rewards/chosen": 2.8716821670532227, "rewards/margins": 2.3796398639678955, "rewards/rejected": 0.4920427203178406, "step": 500 }, { "epoch": 0.6761683791846205, "grad_norm": 11.75126838684082, "learning_rate": 1.8654434250764528e-06, "logits/chosen": -0.06470651179552078, "logits/rejected": -0.40114492177963257, "logps/chosen": -93.51762390136719, "logps/rejected": -114.25923156738281, "loss": 1.325, "nll_loss": 1.6921663284301758, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 3.008223056793213, "rewards/margins": 2.433964490890503, "rewards/rejected": 0.5742586851119995, "step": 510 }, { "epoch": 0.6894265826980445, "grad_norm": 9.93482494354248, "learning_rate": 1.7889908256880737e-06, "logits/chosen": 0.08653802424669266, "logits/rejected": -0.34983566403388977, "logps/chosen": -74.48589324951172, "logps/rejected": -97.51747131347656, "loss": 1.2279, "nll_loss": 1.5161999464035034, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 3.1022284030914307, "rewards/margins": 2.469442367553711, "rewards/rejected": 0.6327860951423645, "step": 520 }, { "epoch": 0.7026847862114683, "grad_norm": 10.692046165466309, "learning_rate": 1.7125382262996943e-06, "logits/chosen": 0.032983891665935516, "logits/rejected": -0.3582807779312134, "logps/chosen": -82.62593078613281, "logps/rejected": -102.52696228027344, "loss": 1.2671, "nll_loss": 1.5854206085205078, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 3.2378318309783936, "rewards/margins": 2.618978977203369, "rewards/rejected": 0.618852436542511, "step": 530 }, { "epoch": 0.7159429897248922, "grad_norm": 16.475303649902344, "learning_rate": 1.6360856269113152e-06, "logits/chosen": -0.015640150755643845, "logits/rejected": -0.3781605362892151, "logps/chosen": -90.1484146118164, "logps/rejected": -112.01336669921875, "loss": 1.2988, "nll_loss": 1.6478378772735596, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 2.8458220958709717, "rewards/margins": 2.3063721656799316, "rewards/rejected": 0.53944993019104, "step": 540 }, { "epoch": 0.7292011932383162, "grad_norm": 9.720301628112793, "learning_rate": 1.559633027522936e-06, "logits/chosen": -0.07955951988697052, "logits/rejected": -0.4525434374809265, "logps/chosen": -93.29837799072266, "logps/rejected": -139.53305053710938, "loss": 1.3062, "nll_loss": 1.6710792779922485, "rewards/accuracies": 0.934374988079071, "rewards/chosen": 2.821993827819824, "rewards/margins": 2.3934006690979004, "rewards/rejected": 0.42859315872192383, "step": 550 }, { "epoch": 0.7424593967517401, "grad_norm": 12.09350872039795, "learning_rate": 1.4831804281345567e-06, "logits/chosen": 0.014041140675544739, "logits/rejected": -0.37960466742515564, "logps/chosen": -93.00960540771484, "logps/rejected": -117.96089935302734, "loss": 1.2977, "nll_loss": 1.6683666706085205, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 3.0185599327087402, "rewards/margins": 2.459902286529541, "rewards/rejected": 0.5586578249931335, "step": 560 }, { "epoch": 0.7557176002651641, "grad_norm": 9.332964897155762, "learning_rate": 1.4067278287461775e-06, "logits/chosen": -0.0421045646071434, "logits/rejected": -0.3469962775707245, "logps/chosen": -93.99418640136719, "logps/rejected": -120.124267578125, "loss": 1.3144, "nll_loss": 1.673651099205017, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 2.913119077682495, "rewards/margins": 2.3597278594970703, "rewards/rejected": 0.5533913373947144, "step": 570 }, { "epoch": 0.768975803778588, "grad_norm": 8.982401847839355, "learning_rate": 1.3302752293577984e-06, "logits/chosen": 0.049627698957920074, "logits/rejected": -0.3409837484359741, "logps/chosen": -80.563232421875, "logps/rejected": -110.6832504272461, "loss": 1.2686, "nll_loss": 1.6071580648422241, "rewards/accuracies": 0.9468749761581421, "rewards/chosen": 3.0636610984802246, "rewards/margins": 2.460207223892212, "rewards/rejected": 0.6034537553787231, "step": 580 }, { "epoch": 0.782234007292012, "grad_norm": 11.456655502319336, "learning_rate": 1.253822629969419e-06, "logits/chosen": 0.01245723944157362, "logits/rejected": -0.34452953934669495, "logps/chosen": -83.06973266601562, "logps/rejected": -112.3355712890625, "loss": 1.2596, "nll_loss": 1.579742193222046, "rewards/accuracies": 0.934374988079071, "rewards/chosen": 3.0834407806396484, "rewards/margins": 2.512244701385498, "rewards/rejected": 0.5711959600448608, "step": 590 }, { "epoch": 0.7954922108054359, "grad_norm": 14.906444549560547, "learning_rate": 1.17737003058104e-06, "logits/chosen": 0.10093510150909424, "logits/rejected": -0.3423386812210083, "logps/chosen": -84.59368896484375, "logps/rejected": -118.52232360839844, "loss": 1.2424, "nll_loss": 1.5443143844604492, "rewards/accuracies": 0.934374988079071, "rewards/chosen": 3.1625304222106934, "rewards/margins": 2.5714595317840576, "rewards/rejected": 0.5910708904266357, "step": 600 }, { "epoch": 0.7954922108054359, "eval_logits/chosen": 1.064488410949707, "eval_logits/rejected": -0.3661547303199768, "eval_logps/chosen": -22.925987243652344, "eval_logps/rejected": -17.408470153808594, "eval_loss": 1.7094613313674927, "eval_nll_loss": 2.171783685684204, "eval_rewards/accuracies": 0.7257823944091797, "eval_rewards/chosen": 2.0695061683654785, "eval_rewards/margins": 1.3208197355270386, "eval_rewards/rejected": 0.7486862540245056, "eval_runtime": 126.8733, "eval_samples_per_second": 21.139, "eval_steps_per_second": 5.289, "step": 600 }, { "epoch": 0.8087504143188597, "grad_norm": 11.739520072937012, "learning_rate": 1.1009174311926608e-06, "logits/chosen": 0.1604587882757187, "logits/rejected": -0.30457383394241333, "logps/chosen": -77.71000671386719, "logps/rejected": -100.43540954589844, "loss": 1.2066, "nll_loss": 1.4790329933166504, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 3.3722426891326904, "rewards/margins": 2.629152297973633, "rewards/rejected": 0.743090808391571, "step": 610 }, { "epoch": 0.8220086178322837, "grad_norm": 10.975104331970215, "learning_rate": 1.0244648318042814e-06, "logits/chosen": 0.08241738379001617, "logits/rejected": -0.30928146839141846, "logps/chosen": -87.46654510498047, "logps/rejected": -106.96031188964844, "loss": 1.2453, "nll_loss": 1.555537223815918, "rewards/accuracies": 0.953125, "rewards/chosen": 3.170712947845459, "rewards/margins": 2.561098575592041, "rewards/rejected": 0.6096144318580627, "step": 620 }, { "epoch": 0.8352668213457076, "grad_norm": 13.56505298614502, "learning_rate": 9.480122324159022e-07, "logits/chosen": 0.023757517337799072, "logits/rejected": -0.33807113766670227, "logps/chosen": -94.47450256347656, "logps/rejected": -128.47509765625, "loss": 1.2845, "nll_loss": 1.631945013999939, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 2.836199998855591, "rewards/margins": 2.2965328693389893, "rewards/rejected": 0.5396672487258911, "step": 630 }, { "epoch": 0.8485250248591316, "grad_norm": 16.084320068359375, "learning_rate": 8.71559633027523e-07, "logits/chosen": 0.0821368619799614, "logits/rejected": -0.33640867471694946, "logps/chosen": -78.98149108886719, "logps/rejected": -111.6216049194336, "loss": 1.2382, "nll_loss": 1.5553481578826904, "rewards/accuracies": 0.9468749761581421, "rewards/chosen": 3.1928162574768066, "rewards/margins": 2.547853708267212, "rewards/rejected": 0.6449624300003052, "step": 640 }, { "epoch": 0.8617832283725555, "grad_norm": 9.197409629821777, "learning_rate": 7.951070336391438e-07, "logits/chosen": 0.11906716972589493, "logits/rejected": -0.3259919583797455, "logps/chosen": -74.3010482788086, "logps/rejected": -108.84968566894531, "loss": 1.2331, "nll_loss": 1.558100938796997, "rewards/accuracies": 0.965624988079071, "rewards/chosen": 3.2675209045410156, "rewards/margins": 2.6362545490264893, "rewards/rejected": 0.6312668323516846, "step": 650 }, { "epoch": 0.8750414318859795, "grad_norm": 9.966322898864746, "learning_rate": 7.186544342507645e-07, "logits/chosen": 0.03254573419690132, "logits/rejected": -0.3160571753978729, "logps/chosen": -85.23878479003906, "logps/rejected": -104.35621643066406, "loss": 1.2927, "nll_loss": 1.662453293800354, "rewards/accuracies": 0.953125, "rewards/chosen": 3.136343002319336, "rewards/margins": 2.490025043487549, "rewards/rejected": 0.6463181376457214, "step": 660 }, { "epoch": 0.8882996353994034, "grad_norm": 9.719200134277344, "learning_rate": 6.422018348623854e-07, "logits/chosen": 0.060719866305589676, "logits/rejected": -0.37117859721183777, "logps/chosen": -82.31736755371094, "logps/rejected": -121.16090393066406, "loss": 1.2738, "nll_loss": 1.6195169687271118, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 3.1395468711853027, "rewards/margins": 2.5326294898986816, "rewards/rejected": 0.6069172620773315, "step": 670 }, { "epoch": 0.9015578389128273, "grad_norm": 10.185211181640625, "learning_rate": 5.657492354740061e-07, "logits/chosen": 0.16828341782093048, "logits/rejected": -0.28492841124534607, "logps/chosen": -74.0738754272461, "logps/rejected": -89.85967254638672, "loss": 1.2257, "nll_loss": 1.5249927043914795, "rewards/accuracies": 0.934374988079071, "rewards/chosen": 3.2941131591796875, "rewards/margins": 2.549973249435425, "rewards/rejected": 0.744140088558197, "step": 680 }, { "epoch": 0.9148160424262513, "grad_norm": 11.983025550842285, "learning_rate": 4.89296636085627e-07, "logits/chosen": 0.035757843405008316, "logits/rejected": -0.3307420015335083, "logps/chosen": -98.5334701538086, "logps/rejected": -112.04931640625, "loss": 1.2842, "nll_loss": 1.63511061668396, "rewards/accuracies": 0.9375, "rewards/chosen": 3.1852855682373047, "rewards/margins": 2.5147435665130615, "rewards/rejected": 0.6705416440963745, "step": 690 }, { "epoch": 0.9280742459396751, "grad_norm": 9.73780632019043, "learning_rate": 4.128440366972478e-07, "logits/chosen": 0.06930799782276154, "logits/rejected": -0.33910712599754333, "logps/chosen": -86.6443862915039, "logps/rejected": -112.4611587524414, "loss": 1.2466, "nll_loss": 1.5855239629745483, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 3.226916551589966, "rewards/margins": 2.535816192626953, "rewards/rejected": 0.6910998225212097, "step": 700 }, { "epoch": 0.9413324494530991, "grad_norm": 9.678099632263184, "learning_rate": 3.363914373088685e-07, "logits/chosen": 0.12796175479888916, "logits/rejected": -0.29829975962638855, "logps/chosen": -83.7620620727539, "logps/rejected": -99.74095153808594, "loss": 1.2307, "nll_loss": 1.524804711341858, "rewards/accuracies": 0.9468749761581421, "rewards/chosen": 3.5030083656311035, "rewards/margins": 2.73651385307312, "rewards/rejected": 0.7664941549301147, "step": 710 }, { "epoch": 0.954590652966523, "grad_norm": 9.837422370910645, "learning_rate": 2.599388379204893e-07, "logits/chosen": 0.06604432314634323, "logits/rejected": -0.326642245054245, "logps/chosen": -87.40840911865234, "logps/rejected": -112.01222229003906, "loss": 1.2683, "nll_loss": 1.62287175655365, "rewards/accuracies": 0.9375, "rewards/chosen": 3.202043056488037, "rewards/margins": 2.5358872413635254, "rewards/rejected": 0.6661559343338013, "step": 720 }, { "epoch": 0.967848856479947, "grad_norm": 14.159199714660645, "learning_rate": 1.8348623853211012e-07, "logits/chosen": 0.02280101552605629, "logits/rejected": -0.3272295594215393, "logps/chosen": -84.34774017333984, "logps/rejected": -106.10346984863281, "loss": 1.2922, "nll_loss": 1.6527671813964844, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 3.2514991760253906, "rewards/margins": 2.529658555984497, "rewards/rejected": 0.7218402624130249, "step": 730 }, { "epoch": 0.9811070599933709, "grad_norm": 9.912482261657715, "learning_rate": 1.070336391437309e-07, "logits/chosen": 0.09289325773715973, "logits/rejected": -0.31532809138298035, "logps/chosen": -82.03899383544922, "logps/rejected": -116.87910461425781, "loss": 1.2127, "nll_loss": 1.5196421146392822, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 3.268120527267456, "rewards/margins": 2.592498302459717, "rewards/rejected": 0.6756229996681213, "step": 740 }, { "epoch": 0.9943652635067949, "grad_norm": 10.23357105255127, "learning_rate": 3.0581039755351686e-08, "logits/chosen": 0.015320442616939545, "logits/rejected": -0.3034003674983978, "logps/chosen": -99.24415588378906, "logps/rejected": -107.158447265625, "loss": 1.3115, "nll_loss": 1.6806989908218384, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 3.2072606086730957, "rewards/margins": 2.46455717086792, "rewards/rejected": 0.7427036166191101, "step": 750 } ], "logging_steps": 10, "max_steps": 754, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }