{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.2809913726867605, "eval_steps": 100, "global_step": 3200, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0008780980396461265, "grad_norm": 6.670812129974365, "learning_rate": 1.0000000000000001e-07, "logits/chosen": 4.2431488037109375, "logits/rejected": 4.231738567352295, "logps/chosen": -9.991304397583008, "logps/rejected": -10.524327278137207, "loss": 3.0309, "nll_loss": 2.3641905784606934, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -2.99739146232605, "rewards/margins": 0.15990665555000305, "rewards/rejected": -3.1572983264923096, "step": 10 }, { "epoch": 0.001756196079292253, "grad_norm": 10.682082176208496, "learning_rate": 2.1111111111111113e-07, "logits/chosen": 4.309535980224609, "logits/rejected": 4.361606597900391, "logps/chosen": -9.786101341247559, "logps/rejected": -10.518722534179688, "loss": 3.1598, "nll_loss": 2.5160868167877197, "rewards/accuracies": 0.625, "rewards/chosen": -2.9358303546905518, "rewards/margins": 0.2197863757610321, "rewards/rejected": -3.1556167602539062, "step": 20 }, { "epoch": 0.0026342941189383797, "grad_norm": 15.04028034210205, "learning_rate": 3.2222222222222227e-07, "logits/chosen": 4.225083351135254, "logits/rejected": 4.215059757232666, "logps/chosen": -9.968446731567383, "logps/rejected": -10.634344100952148, "loss": 2.9674, "nll_loss": 2.318664073944092, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -2.9905343055725098, "rewards/margins": 0.19976934790611267, "rewards/rejected": -3.1903038024902344, "step": 30 }, { "epoch": 0.003512392158584506, "grad_norm": 10.793362617492676, "learning_rate": 4.333333333333334e-07, "logits/chosen": 4.328730583190918, "logits/rejected": 4.321128845214844, "logps/chosen": -9.762226104736328, "logps/rejected": -10.319639205932617, "loss": 2.7628, "nll_loss": 2.110645294189453, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.928668260574341, "rewards/margins": 0.1672237515449524, "rewards/rejected": -3.0958914756774902, "step": 40 }, { "epoch": 0.004390490198230633, "grad_norm": 6.863041400909424, "learning_rate": 5.444444444444444e-07, "logits/chosen": 4.535063743591309, "logits/rejected": 4.555140495300293, "logps/chosen": -10.173809051513672, "logps/rejected": -10.436089515686035, "loss": 2.7587, "nll_loss": 2.0572845935821533, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -3.052142858505249, "rewards/margins": 0.07868396490812302, "rewards/rejected": -3.130826711654663, "step": 50 }, { "epoch": 0.005268588237876759, "grad_norm": 11.63379955291748, "learning_rate": 6.555555555555556e-07, "logits/chosen": 4.210951805114746, "logits/rejected": 4.193416595458984, "logps/chosen": -9.710775375366211, "logps/rejected": -10.588180541992188, "loss": 3.023, "nll_loss": 2.409853458404541, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.9132332801818848, "rewards/margins": 0.2632210850715637, "rewards/rejected": -3.1764540672302246, "step": 60 }, { "epoch": 0.006146686277522885, "grad_norm": 4.952016830444336, "learning_rate": 7.666666666666667e-07, "logits/chosen": 4.56010627746582, "logits/rejected": 4.571717739105225, "logps/chosen": -9.788490295410156, "logps/rejected": -10.214717864990234, "loss": 2.6203, "nll_loss": 1.9469432830810547, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -2.936546802520752, "rewards/margins": 0.12786847352981567, "rewards/rejected": -3.064415454864502, "step": 70 }, { "epoch": 0.007024784317169012, "grad_norm": 10.809489250183105, "learning_rate": 8.777777777777778e-07, "logits/chosen": 4.2120256423950195, "logits/rejected": 4.222228050231934, "logps/chosen": -9.806170463562012, "logps/rejected": -10.296957969665527, "loss": 3.3042, "nll_loss": 2.644357681274414, "rewards/accuracies": 0.5625, "rewards/chosen": -2.9418509006500244, "rewards/margins": 0.1472366452217102, "rewards/rejected": -3.08908748626709, "step": 80 }, { "epoch": 0.007902882356815138, "grad_norm": 9.27618408203125, "learning_rate": 9.88888888888889e-07, "logits/chosen": 4.336479187011719, "logits/rejected": 4.3232035636901855, "logps/chosen": -9.828582763671875, "logps/rejected": -10.485132217407227, "loss": 3.1223, "nll_loss": 2.4759140014648438, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -2.948575496673584, "rewards/margins": 0.1969645768404007, "rewards/rejected": -3.1455399990081787, "step": 90 }, { "epoch": 0.008780980396461266, "grad_norm": 19.710798263549805, "learning_rate": 1.1e-06, "logits/chosen": 4.149864673614502, "logits/rejected": 4.147267818450928, "logps/chosen": -9.967988014221191, "logps/rejected": -10.663251876831055, "loss": 2.9038, "nll_loss": 2.2608590126037598, "rewards/accuracies": 0.6875, "rewards/chosen": -2.990396499633789, "rewards/margins": 0.20857906341552734, "rewards/rejected": -3.1989755630493164, "step": 100 }, { "epoch": 0.009659078436107391, "grad_norm": 10.258548736572266, "learning_rate": 1.2111111111111111e-06, "logits/chosen": 4.241273880004883, "logits/rejected": 4.249630928039551, "logps/chosen": -9.708308219909668, "logps/rejected": -10.221686363220215, "loss": 3.2698, "nll_loss": 2.603203535079956, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -2.912492513656616, "rewards/margins": 0.15401321649551392, "rewards/rejected": -3.0665059089660645, "step": 110 }, { "epoch": 0.010537176475753519, "grad_norm": 3.6609084606170654, "learning_rate": 1.3222222222222222e-06, "logits/chosen": 4.420263290405273, "logits/rejected": 4.413485527038574, "logps/chosen": -9.733232498168945, "logps/rejected": -10.436403274536133, "loss": 2.5418, "nll_loss": 1.9018337726593018, "rewards/accuracies": 0.6875, "rewards/chosen": -2.9199700355529785, "rewards/margins": 0.21095120906829834, "rewards/rejected": -3.1309211254119873, "step": 120 }, { "epoch": 0.011415274515399644, "grad_norm": 10.068683624267578, "learning_rate": 1.4333333333333335e-06, "logits/chosen": 4.42335319519043, "logits/rejected": 4.493862152099609, "logps/chosen": -9.73039722442627, "logps/rejected": -10.323850631713867, "loss": 2.5285, "nll_loss": 1.8774116039276123, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -2.919119358062744, "rewards/margins": 0.17803625762462616, "rewards/rejected": -3.0971553325653076, "step": 130 }, { "epoch": 0.01229337255504577, "grad_norm": 11.362488746643066, "learning_rate": 1.5444444444444446e-06, "logits/chosen": 4.332414150238037, "logits/rejected": 4.329668045043945, "logps/chosen": -9.279766082763672, "logps/rejected": -9.99905014038086, "loss": 2.7181, "nll_loss": 2.0923380851745605, "rewards/accuracies": 0.625, "rewards/chosen": -2.7839295864105225, "rewards/margins": 0.21578574180603027, "rewards/rejected": -2.9997153282165527, "step": 140 }, { "epoch": 0.013171470594691898, "grad_norm": 14.254817008972168, "learning_rate": 1.6555555555555559e-06, "logits/chosen": 4.319321155548096, "logits/rejected": 4.321578502655029, "logps/chosen": -9.059564590454102, "logps/rejected": -9.602411270141602, "loss": 2.8514, "nll_loss": 2.1910147666931152, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -2.717869758605957, "rewards/margins": 0.16285373270511627, "rewards/rejected": -2.880723476409912, "step": 150 }, { "epoch": 0.014049568634338023, "grad_norm": 4.008646488189697, "learning_rate": 1.7666666666666668e-06, "logits/chosen": 4.2461042404174805, "logits/rejected": 4.25100040435791, "logps/chosen": -8.857942581176758, "logps/rejected": -9.37680435180664, "loss": 2.5006, "nll_loss": 1.839271903038025, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -2.6573829650878906, "rewards/margins": 0.15565846860408783, "rewards/rejected": -2.8130412101745605, "step": 160 }, { "epoch": 0.01492766667398415, "grad_norm": 10.644233703613281, "learning_rate": 1.8777777777777778e-06, "logits/chosen": 4.304837226867676, "logits/rejected": 4.3417558670043945, "logps/chosen": -8.620405197143555, "logps/rejected": -9.385229110717773, "loss": 2.5802, "nll_loss": 1.932861328125, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -2.5861220359802246, "rewards/margins": 0.22944733500480652, "rewards/rejected": -2.8155694007873535, "step": 170 }, { "epoch": 0.015805764713630276, "grad_norm": 9.301039695739746, "learning_rate": 1.988888888888889e-06, "logits/chosen": 4.26754903793335, "logits/rejected": 4.271862030029297, "logps/chosen": -8.370404243469238, "logps/rejected": -8.689592361450195, "loss": 2.4808, "nll_loss": 1.7635695934295654, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -2.5111212730407715, "rewards/margins": 0.09575649350881577, "rewards/rejected": -2.6068778038024902, "step": 180 }, { "epoch": 0.016683862753276404, "grad_norm": 6.643105506896973, "learning_rate": 2.1000000000000002e-06, "logits/chosen": 4.255660533905029, "logits/rejected": 4.268857955932617, "logps/chosen": -7.950819492340088, "logps/rejected": -8.541925430297852, "loss": 2.2629, "nll_loss": 1.6152054071426392, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -2.3852455615997314, "rewards/margins": 0.17733201384544373, "rewards/rejected": -2.562577724456787, "step": 190 }, { "epoch": 0.01756196079292253, "grad_norm": 9.348048210144043, "learning_rate": 2.2111111111111113e-06, "logits/chosen": 4.450976848602295, "logits/rejected": 4.4492387771606445, "logps/chosen": -7.3747453689575195, "logps/rejected": -8.266874313354492, "loss": 2.0158, "nll_loss": 1.3951395750045776, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -2.212423801422119, "rewards/margins": 0.2676388621330261, "rewards/rejected": -2.480062484741211, "step": 200 }, { "epoch": 0.018440058832568655, "grad_norm": 5.35267972946167, "learning_rate": 2.3222222222222224e-06, "logits/chosen": 4.387726783752441, "logits/rejected": 4.407253265380859, "logps/chosen": -6.376626491546631, "logps/rejected": -7.535942077636719, "loss": 1.7794, "nll_loss": 1.1858270168304443, "rewards/accuracies": 0.6875, "rewards/chosen": -1.9129879474639893, "rewards/margins": 0.3477945923805237, "rewards/rejected": -2.2607827186584473, "step": 210 }, { "epoch": 0.019318156872214783, "grad_norm": 8.461145401000977, "learning_rate": 2.4333333333333335e-06, "logits/chosen": 4.57470703125, "logits/rejected": 4.573002815246582, "logps/chosen": -5.478797912597656, "logps/rejected": -6.914730072021484, "loss": 1.4017, "nll_loss": 0.8511344194412231, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.643639326095581, "rewards/margins": 0.4307795464992523, "rewards/rejected": -2.0744190216064453, "step": 220 }, { "epoch": 0.02019625491186091, "grad_norm": 5.639391899108887, "learning_rate": 2.5444444444444446e-06, "logits/chosen": 4.498848915100098, "logits/rejected": 4.52827262878418, "logps/chosen": -4.859742164611816, "logps/rejected": -5.578665733337402, "loss": 1.4217, "nll_loss": 0.754524827003479, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.4579226970672607, "rewards/margins": 0.21567705273628235, "rewards/rejected": -1.6735999584197998, "step": 230 }, { "epoch": 0.021074352951507037, "grad_norm": 5.855747699737549, "learning_rate": 2.6555555555555556e-06, "logits/chosen": 4.4441728591918945, "logits/rejected": 4.421013832092285, "logps/chosen": -3.958739757537842, "logps/rejected": -4.544581413269043, "loss": 1.3529, "nll_loss": 0.6709738969802856, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.1876219511032104, "rewards/margins": 0.17575237154960632, "rewards/rejected": -1.3633743524551392, "step": 240 }, { "epoch": 0.02195245099115316, "grad_norm": 5.383121490478516, "learning_rate": 2.766666666666667e-06, "logits/chosen": 4.187775611877441, "logits/rejected": 4.246646404266357, "logps/chosen": -2.965056896209717, "logps/rejected": -3.8153586387634277, "loss": 1.0879, "nll_loss": 0.45148134231567383, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.8895170092582703, "rewards/margins": 0.255090594291687, "rewards/rejected": -1.144607663154602, "step": 250 }, { "epoch": 0.02283054903079929, "grad_norm": 4.042531967163086, "learning_rate": 2.8777777777777782e-06, "logits/chosen": 4.055316925048828, "logits/rejected": 4.1006879806518555, "logps/chosen": -2.189060688018799, "logps/rejected": -2.7986176013946533, "loss": 0.9665, "nll_loss": 0.3139139711856842, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.6567181348800659, "rewards/margins": 0.1828671246767044, "rewards/rejected": -0.8395851850509644, "step": 260 }, { "epoch": 0.023708647070445416, "grad_norm": 3.6113719940185547, "learning_rate": 2.988888888888889e-06, "logits/chosen": 4.224070072174072, "logits/rejected": 4.2020182609558105, "logps/chosen": -1.5525212287902832, "logps/rejected": -2.3060789108276367, "loss": 0.7876, "nll_loss": 0.17524096369743347, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.465756356716156, "rewards/margins": 0.22606734931468964, "rewards/rejected": -0.6918237805366516, "step": 270 }, { "epoch": 0.02458674511009154, "grad_norm": 2.20628023147583, "learning_rate": 3.1000000000000004e-06, "logits/chosen": 3.9985511302948, "logits/rejected": 3.9572558403015137, "logps/chosen": -1.0324242115020752, "logps/rejected": -1.434692621231079, "loss": 0.8117, "nll_loss": 0.1578795313835144, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.309727281332016, "rewards/margins": 0.12068048864603043, "rewards/rejected": -0.4304077625274658, "step": 280 }, { "epoch": 0.025464843149737668, "grad_norm": 2.605509042739868, "learning_rate": 3.2111111111111115e-06, "logits/chosen": 3.9703261852264404, "logits/rejected": 3.978794574737549, "logps/chosen": -1.302750587463379, "logps/rejected": -1.4635370969772339, "loss": 0.8424, "nll_loss": 0.15038228034973145, "rewards/accuracies": 0.625, "rewards/chosen": -0.39082518219947815, "rewards/margins": 0.048235934227705, "rewards/rejected": -0.43906116485595703, "step": 290 }, { "epoch": 0.026342941189383795, "grad_norm": 4.225305080413818, "learning_rate": 3.322222222222222e-06, "logits/chosen": 3.872316360473633, "logits/rejected": 3.909350872039795, "logps/chosen": -0.7655197381973267, "logps/rejected": -1.3384641408920288, "loss": 0.7091, "nll_loss": 0.07751598209142685, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.2296559065580368, "rewards/margins": 0.17188331484794617, "rewards/rejected": -0.40153923630714417, "step": 300 }, { "epoch": 0.027221039229029922, "grad_norm": 2.167863607406616, "learning_rate": 3.4333333333333336e-06, "logits/chosen": 3.8423914909362793, "logits/rejected": 3.8470401763916016, "logps/chosen": -0.781024158000946, "logps/rejected": -1.1648194789886475, "loss": 0.7324, "nll_loss": 0.07609061896800995, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.23430728912353516, "rewards/margins": 0.11513856798410416, "rewards/rejected": -0.3494458794593811, "step": 310 }, { "epoch": 0.028099137268676046, "grad_norm": 2.662125587463379, "learning_rate": 3.5444444444444447e-06, "logits/chosen": 3.844832181930542, "logits/rejected": 3.897855758666992, "logps/chosen": -0.8712307214736938, "logps/rejected": -1.2118942737579346, "loss": 0.754, "nll_loss": 0.09477487206459045, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.2613692581653595, "rewards/margins": 0.10219905525445938, "rewards/rejected": -0.3635682463645935, "step": 320 }, { "epoch": 0.028977235308322174, "grad_norm": 3.6492230892181396, "learning_rate": 3.6555555555555562e-06, "logits/chosen": 3.791374683380127, "logits/rejected": 3.823376417160034, "logps/chosen": -0.8908417820930481, "logps/rejected": -1.1247615814208984, "loss": 0.7571, "nll_loss": 0.07855098694562912, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.26725253462791443, "rewards/margins": 0.07017592340707779, "rewards/rejected": -0.337428480386734, "step": 330 }, { "epoch": 0.0298553333479683, "grad_norm": 4.195188999176025, "learning_rate": 3.766666666666667e-06, "logits/chosen": 3.7930731773376465, "logits/rejected": 3.771570920944214, "logps/chosen": -0.5707345008850098, "logps/rejected": -1.0306470394134521, "loss": 0.7241, "nll_loss": 0.07002006471157074, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.1712203472852707, "rewards/margins": 0.13797374069690704, "rewards/rejected": -0.30919408798217773, "step": 340 }, { "epoch": 0.03073343138761443, "grad_norm": 6.730047225952148, "learning_rate": 3.877777777777778e-06, "logits/chosen": 3.7914137840270996, "logits/rejected": 3.830873966217041, "logps/chosen": -0.48989325761795044, "logps/rejected": -1.329006552696228, "loss": 0.6549, "nll_loss": 0.04915159195661545, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.14696797728538513, "rewards/margins": 0.2517339587211609, "rewards/rejected": -0.3987019658088684, "step": 350 }, { "epoch": 0.03161152942726055, "grad_norm": 5.190159320831299, "learning_rate": 3.9888888888888895e-06, "logits/chosen": 3.5701937675476074, "logits/rejected": 3.623018264770508, "logps/chosen": -1.0031934976577759, "logps/rejected": -1.3071503639221191, "loss": 0.7632, "nll_loss": 0.07658834755420685, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.3009580075740814, "rewards/margins": 0.09118713438510895, "rewards/rejected": -0.39214515686035156, "step": 360 }, { "epoch": 0.03248962746690668, "grad_norm": 6.694168567657471, "learning_rate": 4.1e-06, "logits/chosen": 3.823359727859497, "logits/rejected": 3.852163314819336, "logps/chosen": -0.6341744661331177, "logps/rejected": -1.3019847869873047, "loss": 0.7057, "nll_loss": 0.07171504944562912, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.19025234878063202, "rewards/margins": 0.20034310221672058, "rewards/rejected": -0.3905954957008362, "step": 370 }, { "epoch": 0.03336772550655281, "grad_norm": 7.252128601074219, "learning_rate": 4.211111111111112e-06, "logits/chosen": 3.7956886291503906, "logits/rejected": 3.795431137084961, "logps/chosen": -0.7523115277290344, "logps/rejected": -1.489225149154663, "loss": 0.6923, "nll_loss": 0.07404422760009766, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.2256934642791748, "rewards/margins": 0.22107413411140442, "rewards/rejected": -0.4467676281929016, "step": 380 }, { "epoch": 0.034245823546198935, "grad_norm": 2.945425271987915, "learning_rate": 4.322222222222223e-06, "logits/chosen": 3.5031909942626953, "logits/rejected": 3.5364387035369873, "logps/chosen": -0.6282280087471008, "logps/rejected": -1.3442871570587158, "loss": 0.7103, "nll_loss": 0.06170845031738281, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.18846839666366577, "rewards/margins": 0.21481776237487793, "rewards/rejected": -0.4032861292362213, "step": 390 }, { "epoch": 0.03512392158584506, "grad_norm": 6.988142490386963, "learning_rate": 4.433333333333334e-06, "logits/chosen": 3.9105467796325684, "logits/rejected": 3.931438446044922, "logps/chosen": -0.9674631357192993, "logps/rejected": -1.5149281024932861, "loss": 0.7768, "nll_loss": 0.11233203113079071, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.2902389466762543, "rewards/margins": 0.1642395257949829, "rewards/rejected": -0.45447850227355957, "step": 400 }, { "epoch": 0.03600201962549118, "grad_norm": 0.2611992359161377, "learning_rate": 4.544444444444445e-06, "logits/chosen": 3.6991469860076904, "logits/rejected": 3.726545810699463, "logps/chosen": -0.5881733894348145, "logps/rejected": -1.2820765972137451, "loss": 0.6929, "nll_loss": 0.06608637422323227, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.17645201086997986, "rewards/margins": 0.20817098021507263, "rewards/rejected": -0.38462305068969727, "step": 410 }, { "epoch": 0.03688011766513731, "grad_norm": 3.019066095352173, "learning_rate": 4.655555555555556e-06, "logits/chosen": 3.4919254779815674, "logits/rejected": 3.5151939392089844, "logps/chosen": -0.5710722804069519, "logps/rejected": -1.1787471771240234, "loss": 0.712, "nll_loss": 0.05282425880432129, "rewards/accuracies": 0.75, "rewards/chosen": -0.17132170498371124, "rewards/margins": 0.18230250477790833, "rewards/rejected": -0.353624165058136, "step": 420 }, { "epoch": 0.03775821570478344, "grad_norm": 2.8214099407196045, "learning_rate": 4.766666666666667e-06, "logits/chosen": 3.7750792503356934, "logits/rejected": 3.7456068992614746, "logps/chosen": -0.7161394357681274, "logps/rejected": -1.8331083059310913, "loss": 0.6897, "nll_loss": 0.09056379646062851, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.2148418426513672, "rewards/margins": 0.335090696811676, "rewards/rejected": -0.5499325394630432, "step": 430 }, { "epoch": 0.038636313744429565, "grad_norm": 5.72930383682251, "learning_rate": 4.877777777777778e-06, "logits/chosen": 3.5365283489227295, "logits/rejected": 3.5479636192321777, "logps/chosen": -0.7414464950561523, "logps/rejected": -1.5791943073272705, "loss": 0.6933, "nll_loss": 0.0765593945980072, "rewards/accuracies": 0.6875, "rewards/chosen": -0.22243395447731018, "rewards/margins": 0.2513243556022644, "rewards/rejected": -0.4737583100795746, "step": 440 }, { "epoch": 0.03951441178407569, "grad_norm": 2.2949461936950684, "learning_rate": 4.988888888888889e-06, "logits/chosen": 3.2915852069854736, "logits/rejected": 3.309730052947998, "logps/chosen": -0.5450859069824219, "logps/rejected": -0.948569118976593, "loss": 0.7334, "nll_loss": 0.0632125660777092, "rewards/accuracies": 0.625, "rewards/chosen": -0.1635257601737976, "rewards/margins": 0.12104494869709015, "rewards/rejected": -0.28457072377204895, "step": 450 }, { "epoch": 0.04039250982372182, "grad_norm": 7.028234481811523, "learning_rate": 5.1e-06, "logits/chosen": 3.5347137451171875, "logits/rejected": 3.542628049850464, "logps/chosen": -0.6212174296379089, "logps/rejected": -1.3661630153656006, "loss": 0.7226, "nll_loss": 0.08348599821329117, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.18636523187160492, "rewards/margins": 0.2234836369752884, "rewards/rejected": -0.4098488688468933, "step": 460 }, { "epoch": 0.04127060786336795, "grad_norm": 5.250203609466553, "learning_rate": 5.211111111111111e-06, "logits/chosen": 3.496631622314453, "logits/rejected": 3.5331413745880127, "logps/chosen": -0.6912875175476074, "logps/rejected": -1.6311848163604736, "loss": 0.6796, "nll_loss": 0.062180064618587494, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.20738628506660461, "rewards/margins": 0.2819691598415375, "rewards/rejected": -0.4893553853034973, "step": 470 }, { "epoch": 0.042148705903014075, "grad_norm": 2.3006033897399902, "learning_rate": 5.322222222222223e-06, "logits/chosen": 3.731518268585205, "logits/rejected": 3.775359630584717, "logps/chosen": -0.8064204454421997, "logps/rejected": -1.6423746347427368, "loss": 0.7282, "nll_loss": 0.09897418320178986, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.2419261485338211, "rewards/margins": 0.25078627467155457, "rewards/rejected": -0.49271243810653687, "step": 480 }, { "epoch": 0.043026803942660195, "grad_norm": 6.259355545043945, "learning_rate": 5.4333333333333335e-06, "logits/chosen": 3.3239219188690186, "logits/rejected": 3.3287899494171143, "logps/chosen": -0.623904824256897, "logps/rejected": -1.5192670822143555, "loss": 0.7155, "nll_loss": 0.07001027464866638, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.18717142939567566, "rewards/margins": 0.2686087191104889, "rewards/rejected": -0.4557802081108093, "step": 490 }, { "epoch": 0.04390490198230632, "grad_norm": 4.574368000030518, "learning_rate": 5.544444444444445e-06, "logits/chosen": 3.667168378829956, "logits/rejected": 3.707645893096924, "logps/chosen": -0.7253153920173645, "logps/rejected": -1.8622252941131592, "loss": 0.6561, "nll_loss": 0.06381665915250778, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.21759465336799622, "rewards/margins": 0.34107303619384766, "rewards/rejected": -0.5586676597595215, "step": 500 }, { "epoch": 0.04478300002195245, "grad_norm": 36.264381408691406, "learning_rate": 5.6555555555555566e-06, "logits/chosen": 3.398568630218506, "logits/rejected": 3.468022108078003, "logps/chosen": -0.5565214157104492, "logps/rejected": -1.1638367176055908, "loss": 0.7078, "nll_loss": 0.06747711449861526, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.16695642471313477, "rewards/margins": 0.18219462037086487, "rewards/rejected": -0.34915101528167725, "step": 510 }, { "epoch": 0.04566109806159858, "grad_norm": 4.252997398376465, "learning_rate": 5.766666666666667e-06, "logits/chosen": 3.5346503257751465, "logits/rejected": 3.526895046234131, "logps/chosen": -0.9578359723091125, "logps/rejected": -1.617163896560669, "loss": 0.7567, "nll_loss": 0.11074657738208771, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.2873508036136627, "rewards/margins": 0.19779837131500244, "rewards/rejected": -0.4851491451263428, "step": 520 }, { "epoch": 0.046539196101244705, "grad_norm": 10.661053657531738, "learning_rate": 5.877777777777778e-06, "logits/chosen": 3.5897374153137207, "logits/rejected": 3.599020481109619, "logps/chosen": -0.8656774759292603, "logps/rejected": -1.2782808542251587, "loss": 0.7624, "nll_loss": 0.09282848984003067, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.25970324873924255, "rewards/margins": 0.12378102540969849, "rewards/rejected": -0.38348424434661865, "step": 530 }, { "epoch": 0.04741729414089083, "grad_norm": 2.3466947078704834, "learning_rate": 5.98888888888889e-06, "logits/chosen": 3.41766619682312, "logits/rejected": 3.4891743659973145, "logps/chosen": -0.69093918800354, "logps/rejected": -2.0744166374206543, "loss": 0.6747, "nll_loss": 0.0662418007850647, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.20728178322315216, "rewards/margins": 0.41504326462745667, "rewards/rejected": -0.6223250031471252, "step": 540 }, { "epoch": 0.04829539218053696, "grad_norm": 4.315151214599609, "learning_rate": 6.1e-06, "logits/chosen": 3.4109902381896973, "logits/rejected": 3.5040442943573, "logps/chosen": -0.5460541248321533, "logps/rejected": -1.9011294841766357, "loss": 0.653, "nll_loss": 0.06273610144853592, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.16381624341011047, "rewards/margins": 0.40652260184288025, "rewards/rejected": -0.5703388452529907, "step": 550 }, { "epoch": 0.04917349022018308, "grad_norm": 4.520711898803711, "learning_rate": 6.211111111111111e-06, "logits/chosen": 3.562473773956299, "logits/rejected": 3.6386642456054688, "logps/chosen": -0.7312324047088623, "logps/rejected": -1.6271352767944336, "loss": 0.6966, "nll_loss": 0.06128234788775444, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.21936972439289093, "rewards/margins": 0.26877090334892273, "rewards/rejected": -0.48814067244529724, "step": 560 }, { "epoch": 0.05005158825982921, "grad_norm": 7.277171611785889, "learning_rate": 6.322222222222223e-06, "logits/chosen": 3.472989559173584, "logits/rejected": 3.443589687347412, "logps/chosen": -0.7062110900878906, "logps/rejected": -2.1304116249084473, "loss": 0.6869, "nll_loss": 0.09942348301410675, "rewards/accuracies": 0.625, "rewards/chosen": -0.21186332404613495, "rewards/margins": 0.42726022005081177, "rewards/rejected": -0.6391235589981079, "step": 570 }, { "epoch": 0.050929686299475335, "grad_norm": 7.543278217315674, "learning_rate": 6.433333333333333e-06, "logits/chosen": 3.464301347732544, "logits/rejected": 3.444230556488037, "logps/chosen": -0.7348255515098572, "logps/rejected": -1.602164626121521, "loss": 0.7316, "nll_loss": 0.08772562444210052, "rewards/accuracies": 0.625, "rewards/chosen": -0.22044768929481506, "rewards/margins": 0.26020172238349915, "rewards/rejected": -0.4806493818759918, "step": 580 }, { "epoch": 0.05180778433912146, "grad_norm": 3.7779767513275146, "learning_rate": 6.544444444444445e-06, "logits/chosen": 3.633018970489502, "logits/rejected": 3.709826946258545, "logps/chosen": -0.9298914074897766, "logps/rejected": -1.606702446937561, "loss": 0.7953, "nll_loss": 0.1196284145116806, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.2789674401283264, "rewards/margins": 0.20304329693317413, "rewards/rejected": -0.48201069235801697, "step": 590 }, { "epoch": 0.05268588237876759, "grad_norm": 7.008880615234375, "learning_rate": 6.655555555555556e-06, "logits/chosen": 3.381080150604248, "logits/rejected": 3.444829225540161, "logps/chosen": -0.7211336493492126, "logps/rejected": -1.423513650894165, "loss": 0.7429, "nll_loss": 0.06740613281726837, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.216340109705925, "rewards/margins": 0.21071402728557587, "rewards/rejected": -0.42705410718917847, "step": 600 }, { "epoch": 0.05356398041841372, "grad_norm": 5.524548530578613, "learning_rate": 6.7666666666666665e-06, "logits/chosen": 3.4077486991882324, "logits/rejected": 3.377532958984375, "logps/chosen": -0.5095429420471191, "logps/rejected": -1.0147814750671387, "loss": 0.7157, "nll_loss": 0.0589555986225605, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.15286290645599365, "rewards/margins": 0.1515716016292572, "rewards/rejected": -0.30443447828292847, "step": 610 }, { "epoch": 0.054442078458059845, "grad_norm": 4.042827606201172, "learning_rate": 6.8777777777777785e-06, "logits/chosen": 3.716031551361084, "logits/rejected": 3.713074207305908, "logps/chosen": -0.8152651786804199, "logps/rejected": -1.5531432628631592, "loss": 0.7318, "nll_loss": 0.08608300983905792, "rewards/accuracies": 0.5625, "rewards/chosen": -0.24457958340644836, "rewards/margins": 0.22136345505714417, "rewards/rejected": -0.46594300866127014, "step": 620 }, { "epoch": 0.05532017649770597, "grad_norm": 4.254418849945068, "learning_rate": 6.9888888888888895e-06, "logits/chosen": 3.4341864585876465, "logits/rejected": 3.458519458770752, "logps/chosen": -0.6925168037414551, "logps/rejected": -1.5713815689086914, "loss": 0.6898, "nll_loss": 0.07025544345378876, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.20775504410266876, "rewards/margins": 0.26365941762924194, "rewards/rejected": -0.4714145064353943, "step": 630 }, { "epoch": 0.05619827453735209, "grad_norm": 2.973627805709839, "learning_rate": 7.100000000000001e-06, "logits/chosen": 3.4850242137908936, "logits/rejected": 3.5192267894744873, "logps/chosen": -1.1339752674102783, "logps/rejected": -1.5586907863616943, "loss": 0.8059, "nll_loss": 0.10102218389511108, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.34019264578819275, "rewards/margins": 0.1274145543575287, "rewards/rejected": -0.46760720014572144, "step": 640 }, { "epoch": 0.05707637257699822, "grad_norm": 3.3906142711639404, "learning_rate": 7.211111111111112e-06, "logits/chosen": 3.6287121772766113, "logits/rejected": 3.5881965160369873, "logps/chosen": -0.7608178853988647, "logps/rejected": -1.2269501686096191, "loss": 0.7394, "nll_loss": 0.0750693827867508, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.228245347738266, "rewards/margins": 0.13983972370624542, "rewards/rejected": -0.3680850863456726, "step": 650 }, { "epoch": 0.05795447061664435, "grad_norm": 0.03068475052714348, "learning_rate": 7.322222222222223e-06, "logits/chosen": 3.743140697479248, "logits/rejected": 3.7635676860809326, "logps/chosen": -0.5632290840148926, "logps/rejected": -1.5820039510726929, "loss": 0.6807, "nll_loss": 0.07005371153354645, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.16896870732307434, "rewards/margins": 0.30563241243362427, "rewards/rejected": -0.47460120916366577, "step": 660 }, { "epoch": 0.058832568656290475, "grad_norm": 2.2352776527404785, "learning_rate": 7.433333333333334e-06, "logits/chosen": 3.5528149604797363, "logits/rejected": 3.5446677207946777, "logps/chosen": -0.5461568832397461, "logps/rejected": -1.2885067462921143, "loss": 0.6956, "nll_loss": 0.06532245129346848, "rewards/accuracies": 0.625, "rewards/chosen": -0.16384705901145935, "rewards/margins": 0.22270497679710388, "rewards/rejected": -0.3865520656108856, "step": 670 }, { "epoch": 0.0597106666959366, "grad_norm": 2.6797573566436768, "learning_rate": 7.544444444444445e-06, "logits/chosen": 3.3902244567871094, "logits/rejected": 3.4180169105529785, "logps/chosen": -0.6432263255119324, "logps/rejected": -1.1323941946029663, "loss": 0.7574, "nll_loss": 0.07451333105564117, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.19296793639659882, "rewards/margins": 0.1467503160238266, "rewards/rejected": -0.3397182822227478, "step": 680 }, { "epoch": 0.06058876473558273, "grad_norm": 5.877313137054443, "learning_rate": 7.655555555555556e-06, "logits/chosen": 3.4120171070098877, "logits/rejected": 3.4442646503448486, "logps/chosen": -0.7911199331283569, "logps/rejected": -1.4578664302825928, "loss": 0.7751, "nll_loss": 0.1019410640001297, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.23733600974082947, "rewards/margins": 0.20002400875091553, "rewards/rejected": -0.4373599886894226, "step": 690 }, { "epoch": 0.06146686277522886, "grad_norm": 3.2915701866149902, "learning_rate": 7.766666666666666e-06, "logits/chosen": 3.5103302001953125, "logits/rejected": 3.4995181560516357, "logps/chosen": -0.6160825490951538, "logps/rejected": -1.1019346714019775, "loss": 0.7363, "nll_loss": 0.06433330476284027, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.18482479453086853, "rewards/margins": 0.14575564861297607, "rewards/rejected": -0.3305804133415222, "step": 700 }, { "epoch": 0.06234496081487498, "grad_norm": 3.7106151580810547, "learning_rate": 7.877777777777778e-06, "logits/chosen": 3.474386692047119, "logits/rejected": 3.4553630352020264, "logps/chosen": -0.6972242593765259, "logps/rejected": -1.5925921201705933, "loss": 0.7109, "nll_loss": 0.07773466408252716, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.20916728675365448, "rewards/margins": 0.2686103284358978, "rewards/rejected": -0.4777776300907135, "step": 710 }, { "epoch": 0.0632230588545211, "grad_norm": 1.7685184478759766, "learning_rate": 7.98888888888889e-06, "logits/chosen": 3.6326797008514404, "logits/rejected": 3.625549793243408, "logps/chosen": -0.5006519556045532, "logps/rejected": -1.8575595617294312, "loss": 0.6416, "nll_loss": 0.05640099197626114, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.15019558370113373, "rewards/margins": 0.4070723056793213, "rewards/rejected": -0.5572679042816162, "step": 720 }, { "epoch": 0.06410115689416723, "grad_norm": 4.595531463623047, "learning_rate": 8.1e-06, "logits/chosen": 3.6036553382873535, "logits/rejected": 3.6813888549804688, "logps/chosen": -1.0952359437942505, "logps/rejected": -2.0565478801727295, "loss": 0.7318, "nll_loss": 0.06606093794107437, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.32857078313827515, "rewards/margins": 0.28839364647865295, "rewards/rejected": -0.6169643998146057, "step": 730 }, { "epoch": 0.06497925493381336, "grad_norm": 2.7277488708496094, "learning_rate": 8.211111111111112e-06, "logits/chosen": 3.4828929901123047, "logits/rejected": 3.536961317062378, "logps/chosen": -0.6164706945419312, "logps/rejected": -2.1122829914093018, "loss": 0.6062, "nll_loss": 0.040962688624858856, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.18494121730327606, "rewards/margins": 0.44874364137649536, "rewards/rejected": -0.6336848735809326, "step": 740 }, { "epoch": 0.06585735297345949, "grad_norm": 3.6677157878875732, "learning_rate": 8.322222222222223e-06, "logits/chosen": 3.361325740814209, "logits/rejected": 3.362967014312744, "logps/chosen": -0.9366201162338257, "logps/rejected": -1.7140756845474243, "loss": 0.7507, "nll_loss": 0.10440067946910858, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.28098607063293457, "rewards/margins": 0.23323671519756317, "rewards/rejected": -0.5142227411270142, "step": 750 }, { "epoch": 0.06673545101310562, "grad_norm": 3.729750633239746, "learning_rate": 8.433333333333334e-06, "logits/chosen": 3.343749523162842, "logits/rejected": 3.401230573654175, "logps/chosen": -0.7000004649162292, "logps/rejected": -1.9269546270370483, "loss": 0.6931, "nll_loss": 0.07972760498523712, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.2100001573562622, "rewards/margins": 0.3680862486362457, "rewards/rejected": -0.5780864357948303, "step": 760 }, { "epoch": 0.06761354905275174, "grad_norm": 7.166464328765869, "learning_rate": 8.544444444444445e-06, "logits/chosen": 3.4859509468078613, "logits/rejected": 3.587602138519287, "logps/chosen": -0.7638369798660278, "logps/rejected": -2.2576065063476562, "loss": 0.6832, "nll_loss": 0.0738845020532608, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.22915109992027283, "rewards/margins": 0.4481307864189148, "rewards/rejected": -0.6772819757461548, "step": 770 }, { "epoch": 0.06849164709239787, "grad_norm": 2.416330575942993, "learning_rate": 8.655555555555557e-06, "logits/chosen": 3.408698558807373, "logits/rejected": 3.4245800971984863, "logps/chosen": -0.8418534398078918, "logps/rejected": -1.4894107580184937, "loss": 0.757, "nll_loss": 0.0903228372335434, "rewards/accuracies": 0.5625, "rewards/chosen": -0.25255605578422546, "rewards/margins": 0.1942671835422516, "rewards/rejected": -0.44682326912879944, "step": 780 }, { "epoch": 0.069369745132044, "grad_norm": 4.281314373016357, "learning_rate": 8.766666666666669e-06, "logits/chosen": 3.2489428520202637, "logits/rejected": 3.2514851093292236, "logps/chosen": -0.8190711736679077, "logps/rejected": -1.3487728834152222, "loss": 0.7399, "nll_loss": 0.07894166558980942, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.24572138488292694, "rewards/margins": 0.15891052782535553, "rewards/rejected": -0.40463191270828247, "step": 790 }, { "epoch": 0.07024784317169012, "grad_norm": 1.3228946924209595, "learning_rate": 8.877777777777779e-06, "logits/chosen": 3.2964024543762207, "logits/rejected": 3.3055152893066406, "logps/chosen": -0.8143989443778992, "logps/rejected": -1.2753360271453857, "loss": 0.75, "nll_loss": 0.07704529166221619, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.24431967735290527, "rewards/margins": 0.13828110694885254, "rewards/rejected": -0.3826007843017578, "step": 800 }, { "epoch": 0.07112594121133625, "grad_norm": 3.7781331539154053, "learning_rate": 8.988888888888889e-06, "logits/chosen": 3.68397855758667, "logits/rejected": 3.6694297790527344, "logps/chosen": -0.9365280866622925, "logps/rejected": -1.7311958074569702, "loss": 0.7448, "nll_loss": 0.08974708616733551, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.28095847368240356, "rewards/margins": 0.23840029537677765, "rewards/rejected": -0.51935875415802, "step": 810 }, { "epoch": 0.07200403925098237, "grad_norm": 2.7228267192840576, "learning_rate": 9.100000000000001e-06, "logits/chosen": 3.6748175621032715, "logits/rejected": 3.7467494010925293, "logps/chosen": -0.5383256077766418, "logps/rejected": -2.2832655906677246, "loss": 0.6081, "nll_loss": 0.06415946036577225, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.16149768233299255, "rewards/margins": 0.523482084274292, "rewards/rejected": -0.6849797964096069, "step": 820 }, { "epoch": 0.0728821372906285, "grad_norm": 1.3846280574798584, "learning_rate": 9.211111111111111e-06, "logits/chosen": 3.2965283393859863, "logits/rejected": 3.329348087310791, "logps/chosen": -0.5321189761161804, "logps/rejected": -2.2239809036254883, "loss": 0.6186, "nll_loss": 0.04651743918657303, "rewards/accuracies": 0.625, "rewards/chosen": -0.15963570773601532, "rewards/margins": 0.5075585842132568, "rewards/rejected": -0.6671942472457886, "step": 830 }, { "epoch": 0.07376023533027462, "grad_norm": 4.5536675453186035, "learning_rate": 9.322222222222223e-06, "logits/chosen": 3.347224473953247, "logits/rejected": 3.3436226844787598, "logps/chosen": -0.7508156895637512, "logps/rejected": -1.867352843284607, "loss": 0.707, "nll_loss": 0.0711875781416893, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.2252446860074997, "rewards/margins": 0.3349612355232239, "rewards/rejected": -0.56020587682724, "step": 840 }, { "epoch": 0.07463833336992075, "grad_norm": 4.691596508026123, "learning_rate": 9.433333333333335e-06, "logits/chosen": 3.253293991088867, "logits/rejected": 3.2947421073913574, "logps/chosen": -0.9249482154846191, "logps/rejected": -1.812909483909607, "loss": 0.7568, "nll_loss": 0.09252104163169861, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.2774844765663147, "rewards/margins": 0.2663884162902832, "rewards/rejected": -0.5438728928565979, "step": 850 }, { "epoch": 0.07551643140956688, "grad_norm": 4.119868278503418, "learning_rate": 9.544444444444445e-06, "logits/chosen": 3.3257553577423096, "logits/rejected": 3.3328521251678467, "logps/chosen": -0.8130607604980469, "logps/rejected": -2.2179336547851562, "loss": 0.7061, "nll_loss": 0.09296734631061554, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.2439182549715042, "rewards/margins": 0.42146188020706177, "rewards/rejected": -0.6653801202774048, "step": 860 }, { "epoch": 0.076394529449213, "grad_norm": 4.941491603851318, "learning_rate": 9.655555555555556e-06, "logits/chosen": 3.2392685413360596, "logits/rejected": 3.2623963356018066, "logps/chosen": -0.7361981272697449, "logps/rejected": -1.6700445413589478, "loss": 0.7312, "nll_loss": 0.08274148404598236, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.22085945308208466, "rewards/margins": 0.28015393018722534, "rewards/rejected": -0.5010133385658264, "step": 870 }, { "epoch": 0.07727262748885913, "grad_norm": 1.4032851457595825, "learning_rate": 9.766666666666667e-06, "logits/chosen": 3.2956814765930176, "logits/rejected": 3.318169355392456, "logps/chosen": -0.7056166529655457, "logps/rejected": -1.8036190271377563, "loss": 0.7136, "nll_loss": 0.09712977707386017, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.21168498694896698, "rewards/margins": 0.3294007182121277, "rewards/rejected": -0.5410857200622559, "step": 880 }, { "epoch": 0.07815072552850526, "grad_norm": 3.9460878372192383, "learning_rate": 9.877777777777778e-06, "logits/chosen": 3.4804458618164062, "logits/rejected": 3.5131962299346924, "logps/chosen": -0.9113373756408691, "logps/rejected": -1.8960120677947998, "loss": 0.7272, "nll_loss": 0.08563139289617538, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.2734012007713318, "rewards/margins": 0.295402467250824, "rewards/rejected": -0.5688036680221558, "step": 890 }, { "epoch": 0.07902882356815139, "grad_norm": 5.001852989196777, "learning_rate": 9.98888888888889e-06, "logits/chosen": 3.5693771839141846, "logits/rejected": 3.613219738006592, "logps/chosen": -0.7482819557189941, "logps/rejected": -1.611090898513794, "loss": 0.7488, "nll_loss": 0.10075131803750992, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.2244846075773239, "rewards/margins": 0.25884273648262024, "rewards/rejected": -0.48332732915878296, "step": 900 }, { "epoch": 0.07990692160779751, "grad_norm": 3.9788780212402344, "learning_rate": 9.98888888888889e-06, "logits/chosen": 3.418684720993042, "logits/rejected": 3.4473800659179688, "logps/chosen": -0.5167075991630554, "logps/rejected": -1.0832570791244507, "loss": 0.7187, "nll_loss": 0.053985703736543655, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.155012309551239, "rewards/margins": 0.16996484994888306, "rewards/rejected": -0.3249771296977997, "step": 910 }, { "epoch": 0.08078501964744364, "grad_norm": 1.5054136514663696, "learning_rate": 9.976543209876544e-06, "logits/chosen": 3.389498233795166, "logits/rejected": 3.4252688884735107, "logps/chosen": -0.6502631902694702, "logps/rejected": -1.8272225856781006, "loss": 0.6756, "nll_loss": 0.07306591421365738, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.1950789839029312, "rewards/margins": 0.35308781266212463, "rewards/rejected": -0.5481668710708618, "step": 920 }, { "epoch": 0.08166311768708977, "grad_norm": 2.810540199279785, "learning_rate": 9.964197530864198e-06, "logits/chosen": 3.4912326335906982, "logits/rejected": 3.503628969192505, "logps/chosen": -0.4884684681892395, "logps/rejected": -1.4588502645492554, "loss": 0.6613, "nll_loss": 0.04402286559343338, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.14654052257537842, "rewards/margins": 0.29111456871032715, "rewards/rejected": -0.4376550614833832, "step": 930 }, { "epoch": 0.0825412157267359, "grad_norm": 5.386466979980469, "learning_rate": 9.951851851851853e-06, "logits/chosen": 3.386685848236084, "logits/rejected": 3.4002914428710938, "logps/chosen": -0.5482162237167358, "logps/rejected": -1.572486162185669, "loss": 0.6616, "nll_loss": 0.053943734616041183, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.16446486115455627, "rewards/margins": 0.307280957698822, "rewards/rejected": -0.4717458188533783, "step": 940 }, { "epoch": 0.08341931376638202, "grad_norm": 1.676483154296875, "learning_rate": 9.939506172839507e-06, "logits/chosen": 3.3065590858459473, "logits/rejected": 3.3416500091552734, "logps/chosen": -0.5273550748825073, "logps/rejected": -1.707932472229004, "loss": 0.6729, "nll_loss": 0.06074246019124985, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.1582065373659134, "rewards/margins": 0.3541732430458069, "rewards/rejected": -0.5123798251152039, "step": 950 }, { "epoch": 0.08429741180602815, "grad_norm": 2.689913272857666, "learning_rate": 9.927160493827162e-06, "logits/chosen": 3.2740864753723145, "logits/rejected": 3.335360050201416, "logps/chosen": -0.7466616630554199, "logps/rejected": -2.0391454696655273, "loss": 0.7055, "nll_loss": 0.09172563254833221, "rewards/accuracies": 0.5, "rewards/chosen": -0.2239985167980194, "rewards/margins": 0.3877451419830322, "rewards/rejected": -0.6117436289787292, "step": 960 }, { "epoch": 0.08517550984567426, "grad_norm": 3.589853286743164, "learning_rate": 9.914814814814816e-06, "logits/chosen": 3.0097994804382324, "logits/rejected": 3.0853917598724365, "logps/chosen": -0.5722948312759399, "logps/rejected": -1.9204362630844116, "loss": 0.6501, "nll_loss": 0.051983099430799484, "rewards/accuracies": 0.5625, "rewards/chosen": -0.17168846726417542, "rewards/margins": 0.4044424593448639, "rewards/rejected": -0.5761309266090393, "step": 970 }, { "epoch": 0.08605360788532039, "grad_norm": 1.7398462295532227, "learning_rate": 9.90246913580247e-06, "logits/chosen": 3.3110098838806152, "logits/rejected": 3.4121768474578857, "logps/chosen": -0.6699460744857788, "logps/rejected": -2.0870394706726074, "loss": 0.6724, "nll_loss": 0.053984154015779495, "rewards/accuracies": 0.625, "rewards/chosen": -0.20098385214805603, "rewards/margins": 0.4251279830932617, "rewards/rejected": -0.6261118054389954, "step": 980 }, { "epoch": 0.08693170592496652, "grad_norm": 5.3601861000061035, "learning_rate": 9.890123456790123e-06, "logits/chosen": 3.1401820182800293, "logits/rejected": 3.127436399459839, "logps/chosen": -0.7694223523139954, "logps/rejected": -1.6481168270111084, "loss": 0.737, "nll_loss": 0.09365083277225494, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.230826735496521, "rewards/margins": 0.2636083662509918, "rewards/rejected": -0.49443507194519043, "step": 990 }, { "epoch": 0.08780980396461265, "grad_norm": 2.6405630111694336, "learning_rate": 9.877777777777778e-06, "logits/chosen": 3.1894805431365967, "logits/rejected": 3.2360007762908936, "logps/chosen": -0.5739088654518127, "logps/rejected": -2.0212159156799316, "loss": 0.6616, "nll_loss": 0.05945644527673721, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.1721726506948471, "rewards/margins": 0.4341921806335449, "rewards/rejected": -0.606364905834198, "step": 1000 }, { "epoch": 0.08868790200425877, "grad_norm": 2.2846076488494873, "learning_rate": 9.865432098765432e-06, "logits/chosen": 3.2782859802246094, "logits/rejected": 3.2814033031463623, "logps/chosen": -0.5302027463912964, "logps/rejected": -1.8145701885223389, "loss": 0.6543, "nll_loss": 0.05663750320672989, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.15906082093715668, "rewards/margins": 0.3853102922439575, "rewards/rejected": -0.5443711280822754, "step": 1010 }, { "epoch": 0.0895660000439049, "grad_norm": 13.04986572265625, "learning_rate": 9.853086419753087e-06, "logits/chosen": 2.8904287815093994, "logits/rejected": 2.907032012939453, "logps/chosen": -1.1254570484161377, "logps/rejected": -2.3068795204162598, "loss": 0.7832, "nll_loss": 0.1419171392917633, "rewards/accuracies": 0.5625, "rewards/chosen": -0.33763712644577026, "rewards/margins": 0.354426771402359, "rewards/rejected": -0.6920639276504517, "step": 1020 }, { "epoch": 0.09044409808355103, "grad_norm": 2.556560754776001, "learning_rate": 9.840740740740743e-06, "logits/chosen": 3.151669502258301, "logits/rejected": 3.1690382957458496, "logps/chosen": -0.7756383419036865, "logps/rejected": -1.6224708557128906, "loss": 0.7551, "nll_loss": 0.09495635330677032, "rewards/accuracies": 0.5, "rewards/chosen": -0.23269149661064148, "rewards/margins": 0.25404977798461914, "rewards/rejected": -0.486741304397583, "step": 1030 }, { "epoch": 0.09132219612319716, "grad_norm": 1.3183997869491577, "learning_rate": 9.828395061728397e-06, "logits/chosen": 3.195861339569092, "logits/rejected": 3.2810165882110596, "logps/chosen": -0.5329464077949524, "logps/rejected": -1.2915513515472412, "loss": 0.7152, "nll_loss": 0.054315369576215744, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.15988394618034363, "rewards/margins": 0.22758150100708008, "rewards/rejected": -0.3874654471874237, "step": 1040 }, { "epoch": 0.09220029416284328, "grad_norm": 1.674479603767395, "learning_rate": 9.81604938271605e-06, "logits/chosen": 3.139688491821289, "logits/rejected": 3.2257683277130127, "logps/chosen": -0.7113819122314453, "logps/rejected": -2.1473631858825684, "loss": 0.679, "nll_loss": 0.04762103408575058, "rewards/accuracies": 0.625, "rewards/chosen": -0.21341457962989807, "rewards/margins": 0.4307943284511566, "rewards/rejected": -0.6442088484764099, "step": 1050 }, { "epoch": 0.09307839220248941, "grad_norm": 1.9272900819778442, "learning_rate": 9.803703703703704e-06, "logits/chosen": 3.403465986251831, "logits/rejected": 3.385577440261841, "logps/chosen": -0.9791383743286133, "logps/rejected": -2.0091757774353027, "loss": 0.7499, "nll_loss": 0.11029829829931259, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.29374146461486816, "rewards/margins": 0.309011310338974, "rewards/rejected": -0.6027528047561646, "step": 1060 }, { "epoch": 0.09395649024213554, "grad_norm": 3.530672073364258, "learning_rate": 9.791358024691359e-06, "logits/chosen": 2.975001573562622, "logits/rejected": 3.063398838043213, "logps/chosen": -0.9404687881469727, "logps/rejected": -1.803180456161499, "loss": 0.7547, "nll_loss": 0.079728864133358, "rewards/accuracies": 0.5, "rewards/chosen": -0.28214067220687866, "rewards/margins": 0.2588135600090027, "rewards/rejected": -0.5409542322158813, "step": 1070 }, { "epoch": 0.09483458828178166, "grad_norm": 7.528195381164551, "learning_rate": 9.779012345679013e-06, "logits/chosen": 3.1206297874450684, "logits/rejected": 3.173870086669922, "logps/chosen": -0.6394155621528625, "logps/rejected": -1.1264150142669678, "loss": 0.7336, "nll_loss": 0.06591827422380447, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.19182467460632324, "rewards/margins": 0.14609983563423157, "rewards/rejected": -0.3379245400428772, "step": 1080 }, { "epoch": 0.09571268632142779, "grad_norm": 2.3711354732513428, "learning_rate": 9.766666666666667e-06, "logits/chosen": 3.2346444129943848, "logits/rejected": 3.3024227619171143, "logps/chosen": -0.7532138824462891, "logps/rejected": -1.9977819919586182, "loss": 0.6926, "nll_loss": 0.0733107179403305, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.22596418857574463, "rewards/margins": 0.3733704090118408, "rewards/rejected": -0.5993345975875854, "step": 1090 }, { "epoch": 0.09659078436107392, "grad_norm": 2.3100759983062744, "learning_rate": 9.754320987654322e-06, "logits/chosen": 3.0819344520568848, "logits/rejected": 3.081664562225342, "logps/chosen": -0.39054492115974426, "logps/rejected": -1.3339643478393555, "loss": 0.6571, "nll_loss": 0.03647618740797043, "rewards/accuracies": 0.5625, "rewards/chosen": -0.11716349422931671, "rewards/margins": 0.2830258309841156, "rewards/rejected": -0.4001893401145935, "step": 1100 }, { "epoch": 0.09746888240072005, "grad_norm": 4.387854099273682, "learning_rate": 9.741975308641976e-06, "logits/chosen": 3.0971732139587402, "logits/rejected": 3.105783224105835, "logps/chosen": -0.6461865305900574, "logps/rejected": -1.5687153339385986, "loss": 0.7209, "nll_loss": 0.08219017088413239, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.19385597109794617, "rewards/margins": 0.2767586410045624, "rewards/rejected": -0.47061461210250854, "step": 1110 }, { "epoch": 0.09834698044036616, "grad_norm": 1.8574669361114502, "learning_rate": 9.72962962962963e-06, "logits/chosen": 3.2028121948242188, "logits/rejected": 3.188230037689209, "logps/chosen": -0.6974012851715088, "logps/rejected": -2.043318033218384, "loss": 0.6628, "nll_loss": 0.06341539323329926, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.20922040939331055, "rewards/margins": 0.40377503633499146, "rewards/rejected": -0.612995445728302, "step": 1120 }, { "epoch": 0.09922507848001229, "grad_norm": 1.4223850965499878, "learning_rate": 9.717283950617285e-06, "logits/chosen": 3.1057040691375732, "logits/rejected": 3.1665711402893066, "logps/chosen": -0.4518910348415375, "logps/rejected": -1.782041311264038, "loss": 0.6529, "nll_loss": 0.04685738682746887, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.1355672925710678, "rewards/margins": 0.39904507994651794, "rewards/rejected": -0.5346124172210693, "step": 1130 }, { "epoch": 0.10010317651965842, "grad_norm": 0.6512376666069031, "learning_rate": 9.70493827160494e-06, "logits/chosen": 3.062418222427368, "logits/rejected": 3.1085588932037354, "logps/chosen": -0.7759238481521606, "logps/rejected": -2.237229585647583, "loss": 0.6829, "nll_loss": 0.08105801045894623, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.2327771931886673, "rewards/margins": 0.43839168548583984, "rewards/rejected": -0.6711689233779907, "step": 1140 }, { "epoch": 0.10098127455930454, "grad_norm": 2.3270082473754883, "learning_rate": 9.692592592592594e-06, "logits/chosen": 2.8819382190704346, "logits/rejected": 2.9401650428771973, "logps/chosen": -0.5062090754508972, "logps/rejected": -1.868971824645996, "loss": 0.6794, "nll_loss": 0.05651511624455452, "rewards/accuracies": 0.5625, "rewards/chosen": -0.1518627107143402, "rewards/margins": 0.40882882475852966, "rewards/rejected": -0.5606915354728699, "step": 1150 }, { "epoch": 0.10185937259895067, "grad_norm": 1.2098430395126343, "learning_rate": 9.680246913580248e-06, "logits/chosen": 2.7454497814178467, "logits/rejected": 2.780897617340088, "logps/chosen": -0.9837905168533325, "logps/rejected": -2.0501391887664795, "loss": 0.7538, "nll_loss": 0.10150198638439178, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.2951371669769287, "rewards/margins": 0.319904625415802, "rewards/rejected": -0.6150418519973755, "step": 1160 }, { "epoch": 0.1027374706385968, "grad_norm": 2.469829559326172, "learning_rate": 9.667901234567903e-06, "logits/chosen": 2.8060965538024902, "logits/rejected": 2.8710038661956787, "logps/chosen": -0.6813799738883972, "logps/rejected": -2.1527724266052246, "loss": 0.6474, "nll_loss": 0.06949031352996826, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.2044139802455902, "rewards/margins": 0.44141775369644165, "rewards/rejected": -0.6458317637443542, "step": 1170 }, { "epoch": 0.10361556867824293, "grad_norm": 3.5877394676208496, "learning_rate": 9.655555555555556e-06, "logits/chosen": 3.1048672199249268, "logits/rejected": 3.1411147117614746, "logps/chosen": -0.4382111132144928, "logps/rejected": -2.12353253364563, "loss": 0.624, "nll_loss": 0.048050910234451294, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.13146333396434784, "rewards/margins": 0.5055964589118958, "rewards/rejected": -0.6370598077774048, "step": 1180 }, { "epoch": 0.10449366671788905, "grad_norm": 13.380763053894043, "learning_rate": 9.64320987654321e-06, "logits/chosen": 2.6727805137634277, "logits/rejected": 2.7236101627349854, "logps/chosen": -0.7069253325462341, "logps/rejected": -2.2335355281829834, "loss": 0.6386, "nll_loss": 0.05090578272938728, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.21207761764526367, "rewards/margins": 0.45798301696777344, "rewards/rejected": -0.6700606346130371, "step": 1190 }, { "epoch": 0.10537176475753518, "grad_norm": 7.120882034301758, "learning_rate": 9.630864197530864e-06, "logits/chosen": 2.8762526512145996, "logits/rejected": 2.901745319366455, "logps/chosen": -0.7335812449455261, "logps/rejected": -2.8476223945617676, "loss": 0.6497, "nll_loss": 0.07983629405498505, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.2200743705034256, "rewards/margins": 0.6342123746871948, "rewards/rejected": -0.854286789894104, "step": 1200 }, { "epoch": 0.10624986279718131, "grad_norm": 5.2290263175964355, "learning_rate": 9.618518518518519e-06, "logits/chosen": 3.0229249000549316, "logits/rejected": 2.959900379180908, "logps/chosen": -1.3016353845596313, "logps/rejected": -1.7729085683822632, "loss": 0.8886, "nll_loss": 0.14201593399047852, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.3904905915260315, "rewards/margins": 0.14138197898864746, "rewards/rejected": -0.5318726301193237, "step": 1210 }, { "epoch": 0.10712796083682743, "grad_norm": 1.350420355796814, "learning_rate": 9.606172839506173e-06, "logits/chosen": 2.695782423019409, "logits/rejected": 2.6982076168060303, "logps/chosen": -0.5528481602668762, "logps/rejected": -1.7437477111816406, "loss": 0.7039, "nll_loss": 0.06110968068242073, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.16585442423820496, "rewards/margins": 0.3572699725627899, "rewards/rejected": -0.5231243371963501, "step": 1220 }, { "epoch": 0.10800605887647356, "grad_norm": 4.9116291999816895, "learning_rate": 9.593827160493828e-06, "logits/chosen": 2.8747756481170654, "logits/rejected": 2.797008514404297, "logps/chosen": -0.7696909308433533, "logps/rejected": -1.8576171398162842, "loss": 0.7164, "nll_loss": 0.08876083791255951, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.23090729117393494, "rewards/margins": 0.3263779282569885, "rewards/rejected": -0.5572851896286011, "step": 1230 }, { "epoch": 0.10888415691611969, "grad_norm": 3.6960697174072266, "learning_rate": 9.581481481481482e-06, "logits/chosen": 3.121314287185669, "logits/rejected": 3.1768624782562256, "logps/chosen": -0.597256064414978, "logps/rejected": -2.1361727714538574, "loss": 0.6718, "nll_loss": 0.06965653598308563, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.17917683720588684, "rewards/margins": 0.4616750180721283, "rewards/rejected": -0.6408518552780151, "step": 1240 }, { "epoch": 0.10976225495576582, "grad_norm": 1.3148126602172852, "learning_rate": 9.569135802469136e-06, "logits/chosen": 2.9931716918945312, "logits/rejected": 3.0196568965911865, "logps/chosen": -0.8401254415512085, "logps/rejected": -2.2920265197753906, "loss": 0.7173, "nll_loss": 0.08757736533880234, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.2520376443862915, "rewards/margins": 0.4355703294277191, "rewards/rejected": -0.6876079440116882, "step": 1250 }, { "epoch": 0.11064035299541194, "grad_norm": 3.798100471496582, "learning_rate": 9.556790123456791e-06, "logits/chosen": 2.9449126720428467, "logits/rejected": 2.9711549282073975, "logps/chosen": -0.6954627633094788, "logps/rejected": -1.5479028224945068, "loss": 0.7229, "nll_loss": 0.06693422794342041, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.20863886177539825, "rewards/margins": 0.2557320296764374, "rewards/rejected": -0.4643709063529968, "step": 1260 }, { "epoch": 0.11151845103505806, "grad_norm": 2.8658065795898438, "learning_rate": 9.544444444444445e-06, "logits/chosen": 3.015810966491699, "logits/rejected": 3.0757055282592773, "logps/chosen": -1.2240221500396729, "logps/rejected": -1.8749526739120483, "loss": 0.8117, "nll_loss": 0.10025894641876221, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": -0.3672066628932953, "rewards/margins": 0.1952790915966034, "rewards/rejected": -0.5624858140945435, "step": 1270 }, { "epoch": 0.11239654907470419, "grad_norm": 2.584207534790039, "learning_rate": 9.5320987654321e-06, "logits/chosen": 3.0405101776123047, "logits/rejected": 3.0836472511291504, "logps/chosen": -0.5894891619682312, "logps/rejected": -1.7727954387664795, "loss": 0.6706, "nll_loss": 0.057862233370542526, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.17684674263000488, "rewards/margins": 0.35499197244644165, "rewards/rejected": -0.5318387150764465, "step": 1280 }, { "epoch": 0.11327464711435031, "grad_norm": 0.8380700945854187, "learning_rate": 9.519753086419754e-06, "logits/chosen": 3.015899896621704, "logits/rejected": 2.9938347339630127, "logps/chosen": -0.48675793409347534, "logps/rejected": -1.6840702295303345, "loss": 0.6618, "nll_loss": 0.06300728023052216, "rewards/accuracies": 0.6875, "rewards/chosen": -0.14602738618850708, "rewards/margins": 0.35919371247291565, "rewards/rejected": -0.5052211880683899, "step": 1290 }, { "epoch": 0.11415274515399644, "grad_norm": 1.4230750799179077, "learning_rate": 9.507407407407409e-06, "logits/chosen": 3.2599899768829346, "logits/rejected": 3.2479500770568848, "logps/chosen": -0.5513351559638977, "logps/rejected": -1.5504658222198486, "loss": 0.7042, "nll_loss": 0.05995137244462967, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.16540054976940155, "rewards/margins": 0.299739271402359, "rewards/rejected": -0.46513980627059937, "step": 1300 }, { "epoch": 0.11503084319364257, "grad_norm": 1.0455182790756226, "learning_rate": 9.495061728395063e-06, "logits/chosen": 3.0312843322753906, "logits/rejected": 3.068418025970459, "logps/chosen": -0.5078593492507935, "logps/rejected": -2.3526813983917236, "loss": 0.6117, "nll_loss": 0.04707217961549759, "rewards/accuracies": 0.625, "rewards/chosen": -0.152357816696167, "rewards/margins": 0.5534465909004211, "rewards/rejected": -0.7058044672012329, "step": 1310 }, { "epoch": 0.1159089412332887, "grad_norm": 3.6651833057403564, "learning_rate": 9.482716049382716e-06, "logits/chosen": 3.071345806121826, "logits/rejected": 3.0848429203033447, "logps/chosen": -0.6071802973747253, "logps/rejected": -1.6902376413345337, "loss": 0.6877, "nll_loss": 0.06448554247617722, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.1821540892124176, "rewards/margins": 0.324917197227478, "rewards/rejected": -0.507071316242218, "step": 1320 }, { "epoch": 0.11678703927293482, "grad_norm": 5.458474159240723, "learning_rate": 9.47037037037037e-06, "logits/chosen": 2.9955544471740723, "logits/rejected": 2.9541006088256836, "logps/chosen": -1.1582845449447632, "logps/rejected": -2.687746047973633, "loss": 0.8277, "nll_loss": 0.14388300478458405, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.34748542308807373, "rewards/margins": 0.45883846282958984, "rewards/rejected": -0.8063238859176636, "step": 1330 }, { "epoch": 0.11766513731258095, "grad_norm": 1.8496476411819458, "learning_rate": 9.458024691358025e-06, "logits/chosen": 2.8806967735290527, "logits/rejected": 2.922480344772339, "logps/chosen": -0.3559941351413727, "logps/rejected": -2.068563938140869, "loss": 0.5969, "nll_loss": 0.03910910710692406, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.10679824650287628, "rewards/margins": 0.5137708783149719, "rewards/rejected": -0.6205691695213318, "step": 1340 }, { "epoch": 0.11854323535222708, "grad_norm": 8.205083847045898, "learning_rate": 9.44567901234568e-06, "logits/chosen": 2.9301178455352783, "logits/rejected": 2.972548246383667, "logps/chosen": -0.6576313972473145, "logps/rejected": -1.3083826303482056, "loss": 0.7648, "nll_loss": 0.0889785960316658, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.19728945195674896, "rewards/margins": 0.19522538781166077, "rewards/rejected": -0.39251479506492615, "step": 1350 }, { "epoch": 0.1194213333918732, "grad_norm": 3.6549758911132812, "learning_rate": 9.433333333333335e-06, "logits/chosen": 3.0893311500549316, "logits/rejected": 3.068948745727539, "logps/chosen": -0.6783910989761353, "logps/rejected": -2.4969258308410645, "loss": 0.6481, "nll_loss": 0.06912653148174286, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.203517347574234, "rewards/margins": 0.545560359954834, "rewards/rejected": -0.7490777969360352, "step": 1360 }, { "epoch": 0.12029943143151933, "grad_norm": 1.5456334352493286, "learning_rate": 9.42098765432099e-06, "logits/chosen": 2.9339780807495117, "logits/rejected": 2.9962122440338135, "logps/chosen": -0.46310439705848694, "logps/rejected": -2.436547040939331, "loss": 0.6248, "nll_loss": 0.06015176698565483, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.13893131911754608, "rewards/margins": 0.5920329093933105, "rewards/rejected": -0.730964183807373, "step": 1370 }, { "epoch": 0.12117752947116546, "grad_norm": 2.95572829246521, "learning_rate": 9.408641975308642e-06, "logits/chosen": 2.7811484336853027, "logits/rejected": 2.875540018081665, "logps/chosen": -0.6040056347846985, "logps/rejected": -2.7113006114959717, "loss": 0.6342, "nll_loss": 0.042366378009319305, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.18120168149471283, "rewards/margins": 0.6321884393692017, "rewards/rejected": -0.8133901357650757, "step": 1380 }, { "epoch": 0.12205562751081159, "grad_norm": 5.596220970153809, "learning_rate": 9.396296296296297e-06, "logits/chosen": 2.9219090938568115, "logits/rejected": 2.9703197479248047, "logps/chosen": -1.1313722133636475, "logps/rejected": -3.376211166381836, "loss": 0.6689, "nll_loss": 0.11231324821710587, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.3394116759300232, "rewards/margins": 0.6734517216682434, "rewards/rejected": -1.0128633975982666, "step": 1390 }, { "epoch": 0.12293372555045771, "grad_norm": 4.154329776763916, "learning_rate": 9.383950617283951e-06, "logits/chosen": 2.875544548034668, "logits/rejected": 2.909510374069214, "logps/chosen": -0.7795349955558777, "logps/rejected": -3.0624213218688965, "loss": 0.6413, "nll_loss": 0.09788934886455536, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.23386052250862122, "rewards/margins": 0.6848658919334412, "rewards/rejected": -0.9187263250350952, "step": 1400 }, { "epoch": 0.12381182359010383, "grad_norm": 0.9190550446510315, "learning_rate": 9.371604938271605e-06, "logits/chosen": 2.800968885421753, "logits/rejected": 2.7952873706817627, "logps/chosen": -0.5842548608779907, "logps/rejected": -2.0285487174987793, "loss": 0.6672, "nll_loss": 0.06469441950321198, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.17527645826339722, "rewards/margins": 0.43328824639320374, "rewards/rejected": -0.6085646748542786, "step": 1410 }, { "epoch": 0.12468992162974996, "grad_norm": 0.4985824525356293, "learning_rate": 9.35925925925926e-06, "logits/chosen": 2.738670825958252, "logits/rejected": 2.7031972408294678, "logps/chosen": -0.714606761932373, "logps/rejected": -1.1444988250732422, "loss": 0.7813, "nll_loss": 0.07031063735485077, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.21438205242156982, "rewards/margins": 0.12896756827831268, "rewards/rejected": -0.3433496356010437, "step": 1420 }, { "epoch": 0.1255680196693961, "grad_norm": 3.7305266857147217, "learning_rate": 9.346913580246914e-06, "logits/chosen": 2.893129825592041, "logits/rejected": 2.9149386882781982, "logps/chosen": -0.46482163667678833, "logps/rejected": -2.2275373935699463, "loss": 0.6239, "nll_loss": 0.051440030336380005, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.13944648206233978, "rewards/margins": 0.5288147926330566, "rewards/rejected": -0.6682612299919128, "step": 1430 }, { "epoch": 0.1264461177090422, "grad_norm": 1.8036631345748901, "learning_rate": 9.334567901234569e-06, "logits/chosen": 2.547828197479248, "logits/rejected": 2.5787394046783447, "logps/chosen": -0.6464110612869263, "logps/rejected": -2.2243905067443848, "loss": 0.6899, "nll_loss": 0.06981117278337479, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.19392332434654236, "rewards/margins": 0.47339382767677307, "rewards/rejected": -0.6673170924186707, "step": 1440 }, { "epoch": 0.12732421574868835, "grad_norm": 1.5003271102905273, "learning_rate": 9.322222222222223e-06, "logits/chosen": 2.7942347526550293, "logits/rejected": 2.790160655975342, "logps/chosen": -0.45981842279434204, "logps/rejected": -1.9567676782608032, "loss": 0.6536, "nll_loss": 0.0419035442173481, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.1379455327987671, "rewards/margins": 0.4490847587585449, "rewards/rejected": -0.587030291557312, "step": 1450 }, { "epoch": 0.12820231378833447, "grad_norm": 3.1813344955444336, "learning_rate": 9.309876543209878e-06, "logits/chosen": 2.6398301124572754, "logits/rejected": 2.6550662517547607, "logps/chosen": -0.6792389154434204, "logps/rejected": -2.9893813133239746, "loss": 0.6317, "nll_loss": 0.08011049032211304, "rewards/accuracies": 0.6875, "rewards/chosen": -0.2037716805934906, "rewards/margins": 0.6930428147315979, "rewards/rejected": -0.8968144655227661, "step": 1460 }, { "epoch": 0.1290804118279806, "grad_norm": 6.025292873382568, "learning_rate": 9.297530864197532e-06, "logits/chosen": 2.5487821102142334, "logits/rejected": 2.6483190059661865, "logps/chosen": -0.3606962561607361, "logps/rejected": -2.5816264152526855, "loss": 0.6086, "nll_loss": 0.04057370498776436, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.10820887982845306, "rewards/margins": 0.6662790179252625, "rewards/rejected": -0.7744879126548767, "step": 1470 }, { "epoch": 0.12995850986762672, "grad_norm": 31.56429100036621, "learning_rate": 9.285185185185186e-06, "logits/chosen": 2.4623022079467773, "logits/rejected": 2.409700393676758, "logps/chosen": -0.9133744239807129, "logps/rejected": -3.7820403575897217, "loss": 0.8332, "nll_loss": 0.23980839550495148, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.27401235699653625, "rewards/margins": 0.8605998158454895, "rewards/rejected": -1.1346122026443481, "step": 1480 }, { "epoch": 0.13083660790727283, "grad_norm": 0.035230621695518494, "learning_rate": 9.27283950617284e-06, "logits/chosen": 2.52907133102417, "logits/rejected": 2.530890464782715, "logps/chosen": -1.2194797992706299, "logps/rejected": -2.8537631034851074, "loss": 0.7781, "nll_loss": 0.14254291355609894, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.36584392189979553, "rewards/margins": 0.49028509855270386, "rewards/rejected": -0.856128990650177, "step": 1490 }, { "epoch": 0.13171470594691898, "grad_norm": 6.077812671661377, "learning_rate": 9.260493827160495e-06, "logits/chosen": 2.5622482299804688, "logits/rejected": 2.607653856277466, "logps/chosen": -0.8888137936592102, "logps/rejected": -2.8897366523742676, "loss": 0.6866, "nll_loss": 0.07926015555858612, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.2666441798210144, "rewards/margins": 0.6002769470214844, "rewards/rejected": -0.8669211268424988, "step": 1500 }, { "epoch": 0.1325928039865651, "grad_norm": 3.651186466217041, "learning_rate": 9.24814814814815e-06, "logits/chosen": 2.693009853363037, "logits/rejected": 2.6718087196350098, "logps/chosen": -0.8477522134780884, "logps/rejected": -1.7480132579803467, "loss": 0.7543, "nll_loss": 0.09234277904033661, "rewards/accuracies": 0.5, "rewards/chosen": -0.25432562828063965, "rewards/margins": 0.2700783610343933, "rewards/rejected": -0.5244040489196777, "step": 1510 }, { "epoch": 0.13347090202621123, "grad_norm": 2.8306994438171387, "learning_rate": 9.235802469135802e-06, "logits/chosen": 2.8524880409240723, "logits/rejected": 2.8818418979644775, "logps/chosen": -0.49177321791648865, "logps/rejected": -1.9612071514129639, "loss": 0.6686, "nll_loss": 0.08659791201353073, "rewards/accuracies": 0.625, "rewards/chosen": -0.14753195643424988, "rewards/margins": 0.4408302307128906, "rewards/rejected": -0.5883622169494629, "step": 1520 }, { "epoch": 0.13434900006585734, "grad_norm": 3.763047456741333, "learning_rate": 9.223456790123457e-06, "logits/chosen": 2.698072671890259, "logits/rejected": 2.754232883453369, "logps/chosen": -0.798735499382019, "logps/rejected": -2.2809195518493652, "loss": 0.6913, "nll_loss": 0.07482419162988663, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.2396206557750702, "rewards/margins": 0.44465526938438416, "rewards/rejected": -0.6842759251594543, "step": 1530 }, { "epoch": 0.13522709810550348, "grad_norm": 3.579263925552368, "learning_rate": 9.211111111111111e-06, "logits/chosen": 3.0055601596832275, "logits/rejected": 3.015162706375122, "logps/chosen": -0.6990305185317993, "logps/rejected": -1.879294991493225, "loss": 0.6897, "nll_loss": 0.06048674136400223, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.20970916748046875, "rewards/margins": 0.35407939553260803, "rewards/rejected": -0.5637885332107544, "step": 1540 }, { "epoch": 0.1361051961451496, "grad_norm": 1.4635862112045288, "learning_rate": 9.198765432098766e-06, "logits/chosen": 2.665912628173828, "logits/rejected": 2.7371814250946045, "logps/chosen": -0.644806981086731, "logps/rejected": -2.0770316123962402, "loss": 0.6949, "nll_loss": 0.07454844564199448, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.19344215095043182, "rewards/margins": 0.4296673834323883, "rewards/rejected": -0.6231095194816589, "step": 1550 }, { "epoch": 0.13698329418479574, "grad_norm": 3.593688726425171, "learning_rate": 9.18641975308642e-06, "logits/chosen": 3.0721404552459717, "logits/rejected": 3.028421401977539, "logps/chosen": -0.8886274099349976, "logps/rejected": -2.1320443153381348, "loss": 0.7047, "nll_loss": 0.07659469544887543, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.2665882408618927, "rewards/margins": 0.3730250298976898, "rewards/rejected": -0.6396132707595825, "step": 1560 }, { "epoch": 0.13786139222444185, "grad_norm": 6.7611470222473145, "learning_rate": 9.174074074074074e-06, "logits/chosen": 2.952538013458252, "logits/rejected": 2.9351909160614014, "logps/chosen": -0.5501828193664551, "logps/rejected": -2.4539520740509033, "loss": 0.6445, "nll_loss": 0.07492227852344513, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.16505484282970428, "rewards/margins": 0.5711307525634766, "rewards/rejected": -0.7361854910850525, "step": 1570 }, { "epoch": 0.138739490264088, "grad_norm": 0.00917895883321762, "learning_rate": 9.161728395061729e-06, "logits/chosen": 2.942192792892456, "logits/rejected": 2.9979898929595947, "logps/chosen": -0.4612821638584137, "logps/rejected": -1.4370297193527222, "loss": 0.6585, "nll_loss": 0.04147753119468689, "rewards/accuracies": 0.5625, "rewards/chosen": -0.13838467001914978, "rewards/margins": 0.2927243113517761, "rewards/rejected": -0.43110889196395874, "step": 1580 }, { "epoch": 0.1396175883037341, "grad_norm": 1.7900971174240112, "learning_rate": 9.149382716049383e-06, "logits/chosen": 2.7242748737335205, "logits/rejected": 2.7570395469665527, "logps/chosen": -0.5272113084793091, "logps/rejected": -1.8239033222198486, "loss": 0.6752, "nll_loss": 0.07194850593805313, "rewards/accuracies": 0.5625, "rewards/chosen": -0.1581634134054184, "rewards/margins": 0.3890075981616974, "rewards/rejected": -0.5471709966659546, "step": 1590 }, { "epoch": 0.14049568634338025, "grad_norm": 7.255580902099609, "learning_rate": 9.137037037037038e-06, "logits/chosen": 2.852332592010498, "logits/rejected": 2.8654465675354004, "logps/chosen": -0.8652445077896118, "logps/rejected": -2.7340779304504395, "loss": 0.7026, "nll_loss": 0.08957532793283463, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.25957340002059937, "rewards/margins": 0.5606500506401062, "rewards/rejected": -0.8202234506607056, "step": 1600 }, { "epoch": 0.14137378438302636, "grad_norm": 0.27785807847976685, "learning_rate": 9.124691358024692e-06, "logits/chosen": 2.8517799377441406, "logits/rejected": 2.871009111404419, "logps/chosen": -0.45925140380859375, "logps/rejected": -1.38749098777771, "loss": 0.6784, "nll_loss": 0.046599697321653366, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.13777543604373932, "rewards/margins": 0.2784718871116638, "rewards/rejected": -0.41624727845191956, "step": 1610 }, { "epoch": 0.1422518824226725, "grad_norm": 2.3490424156188965, "learning_rate": 9.112345679012347e-06, "logits/chosen": 3.001574754714966, "logits/rejected": 2.9703142642974854, "logps/chosen": -0.4932584762573242, "logps/rejected": -2.1252918243408203, "loss": 0.6813, "nll_loss": 0.08429791033267975, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.1479775607585907, "rewards/margins": 0.48961010575294495, "rewards/rejected": -0.6375876665115356, "step": 1620 }, { "epoch": 0.14312998046231862, "grad_norm": 3.299020767211914, "learning_rate": 9.100000000000001e-06, "logits/chosen": 2.8275907039642334, "logits/rejected": 2.9383082389831543, "logps/chosen": -0.6793197989463806, "logps/rejected": -2.526850700378418, "loss": 0.6513, "nll_loss": 0.05579754710197449, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.20379595458507538, "rewards/margins": 0.5542593002319336, "rewards/rejected": -0.7580552101135254, "step": 1630 }, { "epoch": 0.14400807850196473, "grad_norm": 0.914357602596283, "learning_rate": 9.087654320987655e-06, "logits/chosen": 2.987334966659546, "logits/rejected": 2.9897654056549072, "logps/chosen": -0.665000319480896, "logps/rejected": -2.4969944953918457, "loss": 0.6926, "nll_loss": 0.07977604120969772, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.19950011372566223, "rewards/margins": 0.5495983362197876, "rewards/rejected": -0.7490984797477722, "step": 1640 }, { "epoch": 0.14488617654161087, "grad_norm": 2.353787422180176, "learning_rate": 9.075308641975308e-06, "logits/chosen": 2.7523000240325928, "logits/rejected": 2.805290937423706, "logps/chosen": -0.34468549489974976, "logps/rejected": -2.1994361877441406, "loss": 0.6206, "nll_loss": 0.04046661779284477, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.1034056693315506, "rewards/margins": 0.5564252138137817, "rewards/rejected": -0.6598309278488159, "step": 1650 }, { "epoch": 0.145764274581257, "grad_norm": 9.683286666870117, "learning_rate": 9.062962962962964e-06, "logits/chosen": 2.8048713207244873, "logits/rejected": 2.8911221027374268, "logps/chosen": -0.8389989137649536, "logps/rejected": -1.6681379079818726, "loss": 0.7894, "nll_loss": 0.09216944873332977, "rewards/accuracies": 0.625, "rewards/chosen": -0.25169968605041504, "rewards/margins": 0.24874171614646912, "rewards/rejected": -0.5004413723945618, "step": 1660 }, { "epoch": 0.14664237262090313, "grad_norm": 9.369816780090332, "learning_rate": 9.050617283950619e-06, "logits/chosen": 2.8822600841522217, "logits/rejected": 2.904628038406372, "logps/chosen": -0.8616389036178589, "logps/rejected": -2.4774065017700195, "loss": 0.6733, "nll_loss": 0.052507419139146805, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.2584916949272156, "rewards/margins": 0.48473024368286133, "rewards/rejected": -0.7432219386100769, "step": 1670 }, { "epoch": 0.14752047066054924, "grad_norm": 1.7723337411880493, "learning_rate": 9.038271604938273e-06, "logits/chosen": 3.0452260971069336, "logits/rejected": 3.092968225479126, "logps/chosen": -0.5382004380226135, "logps/rejected": -1.853811264038086, "loss": 0.6863, "nll_loss": 0.07374037802219391, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.16146014630794525, "rewards/margins": 0.39468324184417725, "rewards/rejected": -0.5561434030532837, "step": 1680 }, { "epoch": 0.14839856870019538, "grad_norm": 4.466056823730469, "learning_rate": 9.025925925925927e-06, "logits/chosen": 2.8592844009399414, "logits/rejected": 2.896420955657959, "logps/chosen": -0.5004099011421204, "logps/rejected": -1.6472933292388916, "loss": 0.6771, "nll_loss": 0.05312333256006241, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.1501229852437973, "rewards/margins": 0.34406501054763794, "rewards/rejected": -0.49418801069259644, "step": 1690 }, { "epoch": 0.1492766667398415, "grad_norm": 2.437678098678589, "learning_rate": 9.013580246913582e-06, "logits/chosen": 2.8448596000671387, "logits/rejected": 2.8915913105010986, "logps/chosen": -0.6149319410324097, "logps/rejected": -1.89218008518219, "loss": 0.673, "nll_loss": 0.05624104663729668, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.18447960913181305, "rewards/margins": 0.3831743597984314, "rewards/rejected": -0.567654013633728, "step": 1700 }, { "epoch": 0.15015476477948764, "grad_norm": 3.493530750274658, "learning_rate": 9.001234567901236e-06, "logits/chosen": 2.781919479370117, "logits/rejected": 2.7699952125549316, "logps/chosen": -0.6766859889030457, "logps/rejected": -2.136669635772705, "loss": 0.6503, "nll_loss": 0.06224127486348152, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.2030058205127716, "rewards/margins": 0.43799519538879395, "rewards/rejected": -0.6410009264945984, "step": 1710 }, { "epoch": 0.15103286281913375, "grad_norm": 3.9590651988983154, "learning_rate": 8.988888888888889e-06, "logits/chosen": 2.562682628631592, "logits/rejected": 2.5419199466705322, "logps/chosen": -0.965559184551239, "logps/rejected": -2.73313570022583, "loss": 0.7286, "nll_loss": 0.10021784156560898, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.2896677553653717, "rewards/margins": 0.5302730798721313, "rewards/rejected": -0.8199408650398254, "step": 1720 }, { "epoch": 0.1519109608587799, "grad_norm": 6.649216175079346, "learning_rate": 8.976543209876543e-06, "logits/chosen": 2.754970073699951, "logits/rejected": 2.800556182861328, "logps/chosen": -0.7695094347000122, "logps/rejected": -2.4826478958129883, "loss": 0.7055, "nll_loss": 0.05943988636136055, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.23085281252861023, "rewards/margins": 0.5139415264129639, "rewards/rejected": -0.7447944283485413, "step": 1730 }, { "epoch": 0.152789058898426, "grad_norm": 3.0816867351531982, "learning_rate": 8.964197530864198e-06, "logits/chosen": 2.7005701065063477, "logits/rejected": 2.8037772178649902, "logps/chosen": -0.7151850461959839, "logps/rejected": -2.9252617359161377, "loss": 0.6407, "nll_loss": 0.06549613177776337, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.2145555019378662, "rewards/margins": 0.6630231142044067, "rewards/rejected": -0.8775785565376282, "step": 1740 }, { "epoch": 0.15366715693807215, "grad_norm": 3.4558446407318115, "learning_rate": 8.951851851851852e-06, "logits/chosen": 2.783221483230591, "logits/rejected": 2.7478537559509277, "logps/chosen": -0.3045424818992615, "logps/rejected": -1.9958875179290771, "loss": 0.5977, "nll_loss": 0.030114714056253433, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.09136275202035904, "rewards/margins": 0.507403552532196, "rewards/rejected": -0.5987662672996521, "step": 1750 }, { "epoch": 0.15454525497771826, "grad_norm": 2.812927722930908, "learning_rate": 8.939506172839507e-06, "logits/chosen": 2.567533016204834, "logits/rejected": 2.5942509174346924, "logps/chosen": -0.7922319173812866, "logps/rejected": -1.9200432300567627, "loss": 0.7427, "nll_loss": 0.07532784342765808, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.23766958713531494, "rewards/margins": 0.33834341168403625, "rewards/rejected": -0.5760129690170288, "step": 1760 }, { "epoch": 0.1554233530173644, "grad_norm": 4.2586750984191895, "learning_rate": 8.927160493827161e-06, "logits/chosen": 2.8497743606567383, "logits/rejected": 2.857257604598999, "logps/chosen": -1.0099961757659912, "logps/rejected": -3.4478023052215576, "loss": 0.6624, "nll_loss": 0.1071515902876854, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.302998811006546, "rewards/margins": 0.7313419580459595, "rewards/rejected": -1.034340739250183, "step": 1770 }, { "epoch": 0.15630145105701052, "grad_norm": 1.7801121473312378, "learning_rate": 8.914814814814816e-06, "logits/chosen": 2.7116096019744873, "logits/rejected": 2.7383835315704346, "logps/chosen": -0.6959985494613647, "logps/rejected": -2.842719554901123, "loss": 0.626, "nll_loss": 0.07402163743972778, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.2087995707988739, "rewards/margins": 0.6440162658691406, "rewards/rejected": -0.8528158068656921, "step": 1780 }, { "epoch": 0.15717954909665663, "grad_norm": 2.949233055114746, "learning_rate": 8.90246913580247e-06, "logits/chosen": 2.5691332817077637, "logits/rejected": 2.6201071739196777, "logps/chosen": -0.7052744626998901, "logps/rejected": -2.180938720703125, "loss": 0.6607, "nll_loss": 0.058830149471759796, "rewards/accuracies": 0.625, "rewards/chosen": -0.21158234775066376, "rewards/margins": 0.4426993429660797, "rewards/rejected": -0.6542816758155823, "step": 1790 }, { "epoch": 0.15805764713630277, "grad_norm": 0.5553939938545227, "learning_rate": 8.890123456790124e-06, "logits/chosen": 2.677717685699463, "logits/rejected": 2.782273292541504, "logps/chosen": -0.4674352705478668, "logps/rejected": -3.086191177368164, "loss": 0.5701, "nll_loss": 0.04577519744634628, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.14023058116436005, "rewards/margins": 0.7856268286705017, "rewards/rejected": -0.925857424736023, "step": 1800 }, { "epoch": 0.15893574517594888, "grad_norm": 2.8835935592651367, "learning_rate": 8.877777777777779e-06, "logits/chosen": 2.745272636413574, "logits/rejected": 2.806513547897339, "logps/chosen": -0.5373865962028503, "logps/rejected": -3.408268451690674, "loss": 0.6131, "nll_loss": 0.053614210337400436, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.16121599078178406, "rewards/margins": 0.8612645864486694, "rewards/rejected": -1.0224807262420654, "step": 1810 }, { "epoch": 0.15981384321559503, "grad_norm": 6.393036365509033, "learning_rate": 8.865432098765433e-06, "logits/chosen": 2.6081528663635254, "logits/rejected": 2.663269281387329, "logps/chosen": -0.6596813201904297, "logps/rejected": -3.0106234550476074, "loss": 0.6523, "nll_loss": 0.07575313746929169, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.19790442287921906, "rewards/margins": 0.7052826881408691, "rewards/rejected": -0.903187096118927, "step": 1820 }, { "epoch": 0.16069194125524114, "grad_norm": 8.32457160949707, "learning_rate": 8.853086419753088e-06, "logits/chosen": 2.1919102668762207, "logits/rejected": 2.2420222759246826, "logps/chosen": -0.6085657477378845, "logps/rejected": -3.387340545654297, "loss": 0.6396, "nll_loss": 0.05589609593153, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.1825697273015976, "rewards/margins": 0.8336323499679565, "rewards/rejected": -1.0162022113800049, "step": 1830 }, { "epoch": 0.16157003929488728, "grad_norm": 8.0199556350708, "learning_rate": 8.840740740740742e-06, "logits/chosen": 2.2459845542907715, "logits/rejected": 2.2368826866149902, "logps/chosen": -0.904313862323761, "logps/rejected": -3.4034416675567627, "loss": 0.7623, "nll_loss": 0.13848955929279327, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.27129411697387695, "rewards/margins": 0.7497383952140808, "rewards/rejected": -1.0210325717926025, "step": 1840 }, { "epoch": 0.1624481373345334, "grad_norm": 3.155104398727417, "learning_rate": 8.828395061728395e-06, "logits/chosen": 2.594691514968872, "logits/rejected": 2.571620464324951, "logps/chosen": -1.037233591079712, "logps/rejected": -2.723869800567627, "loss": 0.7723, "nll_loss": 0.12755393981933594, "rewards/accuracies": 0.5625, "rewards/chosen": -0.3111700415611267, "rewards/margins": 0.5059908628463745, "rewards/rejected": -0.817160964012146, "step": 1850 }, { "epoch": 0.16332623537417953, "grad_norm": 3.548657178878784, "learning_rate": 8.81604938271605e-06, "logits/chosen": 2.7853002548217773, "logits/rejected": 2.796757221221924, "logps/chosen": -0.5054196119308472, "logps/rejected": -1.9281543493270874, "loss": 0.6821, "nll_loss": 0.06374648213386536, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.15162590146064758, "rewards/margins": 0.42682045698165894, "rewards/rejected": -0.5784463286399841, "step": 1860 }, { "epoch": 0.16420433341382565, "grad_norm": 5.429587364196777, "learning_rate": 8.803703703703704e-06, "logits/chosen": 2.6975064277648926, "logits/rejected": 2.7702746391296387, "logps/chosen": -0.7951546907424927, "logps/rejected": -2.666987895965576, "loss": 0.6656, "nll_loss": 0.06336641311645508, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.2385464459657669, "rewards/margins": 0.5615500211715698, "rewards/rejected": -0.8000965118408203, "step": 1870 }, { "epoch": 0.1650824314534718, "grad_norm": 1.150895357131958, "learning_rate": 8.791358024691358e-06, "logits/chosen": 2.687932252883911, "logits/rejected": 2.754683017730713, "logps/chosen": -0.7847083806991577, "logps/rejected": -2.204124927520752, "loss": 0.7186, "nll_loss": 0.08406446129083633, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.23541252315044403, "rewards/margins": 0.42582497000694275, "rewards/rejected": -0.6612375378608704, "step": 1880 }, { "epoch": 0.1659605294931179, "grad_norm": 2.4268455505371094, "learning_rate": 8.779012345679012e-06, "logits/chosen": 2.5794925689697266, "logits/rejected": 2.6048686504364014, "logps/chosen": -0.6307061314582825, "logps/rejected": -2.8818671703338623, "loss": 0.619, "nll_loss": 0.07149704545736313, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.1892118602991104, "rewards/margins": 0.6753484010696411, "rewards/rejected": -0.8645601272583008, "step": 1890 }, { "epoch": 0.16683862753276404, "grad_norm": 5.715259075164795, "learning_rate": 8.766666666666669e-06, "logits/chosen": 2.707252025604248, "logits/rejected": 2.6988537311553955, "logps/chosen": -0.6423169374465942, "logps/rejected": -1.875507116317749, "loss": 0.6782, "nll_loss": 0.04863595962524414, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.19269508123397827, "rewards/margins": 0.3699570894241333, "rewards/rejected": -0.5626521706581116, "step": 1900 }, { "epoch": 0.16771672557241016, "grad_norm": 1.6416656970977783, "learning_rate": 8.754320987654323e-06, "logits/chosen": 2.792642593383789, "logits/rejected": 2.8566908836364746, "logps/chosen": -0.6888980269432068, "logps/rejected": -2.5319087505340576, "loss": 0.653, "nll_loss": 0.06277020275592804, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.206669420003891, "rewards/margins": 0.5529031753540039, "rewards/rejected": -0.7595726251602173, "step": 1910 }, { "epoch": 0.1685948236120563, "grad_norm": 1.5920031070709229, "learning_rate": 8.741975308641976e-06, "logits/chosen": 2.6872434616088867, "logits/rejected": 2.6933059692382812, "logps/chosen": -0.5184003710746765, "logps/rejected": -1.9327194690704346, "loss": 0.6515, "nll_loss": 0.06715475022792816, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.15552011132240295, "rewards/margins": 0.42429572343826294, "rewards/rejected": -0.5798158049583435, "step": 1920 }, { "epoch": 0.1694729216517024, "grad_norm": 4.420103073120117, "learning_rate": 8.72962962962963e-06, "logits/chosen": 2.835470676422119, "logits/rejected": 2.8618292808532715, "logps/chosen": -0.7457289695739746, "logps/rejected": -2.064795970916748, "loss": 0.7263, "nll_loss": 0.10260520875453949, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.22371868789196014, "rewards/margins": 0.39572006464004517, "rewards/rejected": -0.6194387674331665, "step": 1930 }, { "epoch": 0.17035101969134853, "grad_norm": 2.6691431999206543, "learning_rate": 8.717283950617285e-06, "logits/chosen": 2.6595866680145264, "logits/rejected": 2.683384656906128, "logps/chosen": -0.5285651087760925, "logps/rejected": -1.9386451244354248, "loss": 0.6966, "nll_loss": 0.07593102753162384, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.1585695445537567, "rewards/margins": 0.4230240285396576, "rewards/rejected": -0.5815936326980591, "step": 1940 }, { "epoch": 0.17122911773099467, "grad_norm": 1.895266056060791, "learning_rate": 8.704938271604939e-06, "logits/chosen": 2.801657199859619, "logits/rejected": 2.762845516204834, "logps/chosen": -0.6032005548477173, "logps/rejected": -1.6262376308441162, "loss": 0.6888, "nll_loss": 0.060188956558704376, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.18096016347408295, "rewards/margins": 0.30691108107566833, "rewards/rejected": -0.4878712594509125, "step": 1950 }, { "epoch": 0.17210721577064078, "grad_norm": 1.7787566184997559, "learning_rate": 8.692592592592593e-06, "logits/chosen": 2.8991875648498535, "logits/rejected": 2.936009645462036, "logps/chosen": -0.5252435803413391, "logps/rejected": -1.9678659439086914, "loss": 0.6254, "nll_loss": 0.047948211431503296, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.15757307410240173, "rewards/margins": 0.432786762714386, "rewards/rejected": -0.5903598070144653, "step": 1960 }, { "epoch": 0.17298531381028692, "grad_norm": 1.326920747756958, "learning_rate": 8.680246913580248e-06, "logits/chosen": 2.5659279823303223, "logits/rejected": 2.590271472930908, "logps/chosen": -0.5297742486000061, "logps/rejected": -2.028113842010498, "loss": 0.6545, "nll_loss": 0.05374212935566902, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.15893225371837616, "rewards/margins": 0.4495018422603607, "rewards/rejected": -0.6084341406822205, "step": 1970 }, { "epoch": 0.17386341184993304, "grad_norm": 4.611727714538574, "learning_rate": 8.667901234567902e-06, "logits/chosen": 2.6264684200286865, "logits/rejected": 2.632913589477539, "logps/chosen": -0.693698525428772, "logps/rejected": -3.1472320556640625, "loss": 0.6069, "nll_loss": 0.05521649122238159, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.20810957252979279, "rewards/margins": 0.7360601425170898, "rewards/rejected": -0.9441697001457214, "step": 1980 }, { "epoch": 0.17474150988957918, "grad_norm": 2.2870845794677734, "learning_rate": 8.655555555555557e-06, "logits/chosen": 2.3779568672180176, "logits/rejected": 2.4282214641571045, "logps/chosen": -0.24996769428253174, "logps/rejected": -2.317931890487671, "loss": 0.5852, "nll_loss": 0.045966412872076035, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.07499031722545624, "rewards/margins": 0.6203892230987549, "rewards/rejected": -0.6953796148300171, "step": 1990 }, { "epoch": 0.1756196079292253, "grad_norm": 1.1437376737594604, "learning_rate": 8.643209876543211e-06, "logits/chosen": 2.380004644393921, "logits/rejected": 2.3685264587402344, "logps/chosen": -0.5816354751586914, "logps/rejected": -2.3618433475494385, "loss": 0.6945, "nll_loss": 0.07123078405857086, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.17449061572551727, "rewards/margins": 0.5340624451637268, "rewards/rejected": -0.70855313539505, "step": 2000 }, { "epoch": 0.17649770596887143, "grad_norm": 9.37255573272705, "learning_rate": 8.630864197530865e-06, "logits/chosen": 2.211916446685791, "logits/rejected": 2.220418930053711, "logps/chosen": -0.5349146723747253, "logps/rejected": -2.8715481758117676, "loss": 0.6366, "nll_loss": 0.06667280942201614, "rewards/accuracies": 0.6875, "rewards/chosen": -0.16047440469264984, "rewards/margins": 0.7009900808334351, "rewards/rejected": -0.8614645004272461, "step": 2010 }, { "epoch": 0.17737580400851755, "grad_norm": 2.2170488834381104, "learning_rate": 8.61851851851852e-06, "logits/chosen": 1.94232177734375, "logits/rejected": 1.9853718280792236, "logps/chosen": -0.2896527945995331, "logps/rejected": -3.063877820968628, "loss": 0.6189, "nll_loss": 0.046251922845840454, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.08689583837985992, "rewards/margins": 0.8322674632072449, "rewards/rejected": -0.9191633462905884, "step": 2020 }, { "epoch": 0.1782539020481637, "grad_norm": 8.343827247619629, "learning_rate": 8.606172839506174e-06, "logits/chosen": 1.9204469919204712, "logits/rejected": 1.948312759399414, "logps/chosen": -1.1184568405151367, "logps/rejected": -3.0377535820007324, "loss": 0.8225, "nll_loss": 0.12297489494085312, "rewards/accuracies": 0.625, "rewards/chosen": -0.3355370759963989, "rewards/margins": 0.5757889747619629, "rewards/rejected": -0.9113261103630066, "step": 2030 }, { "epoch": 0.1791320000878098, "grad_norm": 13.199078559875488, "learning_rate": 8.593827160493829e-06, "logits/chosen": 2.0205349922180176, "logits/rejected": 2.0275635719299316, "logps/chosen": -1.428043007850647, "logps/rejected": -2.872222661972046, "loss": 0.9056, "nll_loss": 0.1807326227426529, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.4284129738807678, "rewards/margins": 0.4332540035247803, "rewards/rejected": -0.8616668581962585, "step": 2040 }, { "epoch": 0.18001009812745594, "grad_norm": 2.260270357131958, "learning_rate": 8.581481481481481e-06, "logits/chosen": 2.2045745849609375, "logits/rejected": 2.187514543533325, "logps/chosen": -0.4845626950263977, "logps/rejected": -2.2464981079101562, "loss": 0.6956, "nll_loss": 0.06010964512825012, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.1453687846660614, "rewards/margins": 0.5285806059837341, "rewards/rejected": -0.6739493608474731, "step": 2050 }, { "epoch": 0.18088819616710206, "grad_norm": 2.032146453857422, "learning_rate": 8.569135802469136e-06, "logits/chosen": 2.2264397144317627, "logits/rejected": 2.265310764312744, "logps/chosen": -1.0286977291107178, "logps/rejected": -3.1123714447021484, "loss": 0.7175, "nll_loss": 0.11090108007192612, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.3086093068122864, "rewards/margins": 0.6251022815704346, "rewards/rejected": -0.9337115287780762, "step": 2060 }, { "epoch": 0.1817662942067482, "grad_norm": 0.9217659831047058, "learning_rate": 8.55679012345679e-06, "logits/chosen": 2.2858176231384277, "logits/rejected": 2.315831422805786, "logps/chosen": -0.6487756967544556, "logps/rejected": -2.1730990409851074, "loss": 0.6862, "nll_loss": 0.07243213802576065, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.19463272392749786, "rewards/margins": 0.4572969377040863, "rewards/rejected": -0.6519297361373901, "step": 2070 }, { "epoch": 0.1826443922463943, "grad_norm": 4.933953285217285, "learning_rate": 8.544444444444445e-06, "logits/chosen": 2.4036874771118164, "logits/rejected": 2.3770546913146973, "logps/chosen": -0.9166983366012573, "logps/rejected": -2.5144124031066895, "loss": 0.7355, "nll_loss": 0.12400822341442108, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.27500951290130615, "rewards/margins": 0.47931423783302307, "rewards/rejected": -0.7543236613273621, "step": 2080 }, { "epoch": 0.18352249028604042, "grad_norm": 5.920201778411865, "learning_rate": 8.532098765432099e-06, "logits/chosen": 2.504974603652954, "logits/rejected": 2.541961193084717, "logps/chosen": -0.8963934779167175, "logps/rejected": -2.740062713623047, "loss": 0.7162, "nll_loss": 0.1004381999373436, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.2689180374145508, "rewards/margins": 0.5531007051467896, "rewards/rejected": -0.8220188021659851, "step": 2090 }, { "epoch": 0.18440058832568657, "grad_norm": 0.10835571587085724, "learning_rate": 8.519753086419754e-06, "logits/chosen": 2.54685640335083, "logits/rejected": 2.627911329269409, "logps/chosen": -0.33035722374916077, "logps/rejected": -2.806840419769287, "loss": 0.561, "nll_loss": 0.03572739288210869, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.09910716861486435, "rewards/margins": 0.7429450154304504, "rewards/rejected": -0.8420522809028625, "step": 2100 }, { "epoch": 0.18527868636533268, "grad_norm": 5.237948417663574, "learning_rate": 8.507407407407408e-06, "logits/chosen": 2.4657981395721436, "logits/rejected": 2.541999101638794, "logps/chosen": -0.5284551382064819, "logps/rejected": -2.2460741996765137, "loss": 0.6753, "nll_loss": 0.055809833109378815, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.15853655338287354, "rewards/margins": 0.515285849571228, "rewards/rejected": -0.6738223433494568, "step": 2110 }, { "epoch": 0.18615678440497882, "grad_norm": 6.183863162994385, "learning_rate": 8.495061728395062e-06, "logits/chosen": 2.370246171951294, "logits/rejected": 2.388896942138672, "logps/chosen": -0.5437101125717163, "logps/rejected": -2.8997700214385986, "loss": 0.6217, "nll_loss": 0.07455585151910782, "rewards/accuracies": 0.6875, "rewards/chosen": -0.1631130427122116, "rewards/margins": 0.7068179249763489, "rewards/rejected": -0.8699310421943665, "step": 2120 }, { "epoch": 0.18703488244462493, "grad_norm": 3.2471072673797607, "learning_rate": 8.482716049382717e-06, "logits/chosen": 2.301064968109131, "logits/rejected": 2.3306994438171387, "logps/chosen": -1.0358989238739014, "logps/rejected": -3.3612143993377686, "loss": 0.6543, "nll_loss": 0.08921568840742111, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.3107697069644928, "rewards/margins": 0.6975947618484497, "rewards/rejected": -1.0083644390106201, "step": 2130 }, { "epoch": 0.18791298048427107, "grad_norm": 2.2561800479888916, "learning_rate": 8.470370370370371e-06, "logits/chosen": 2.35896635055542, "logits/rejected": 2.424318790435791, "logps/chosen": -0.42497625946998596, "logps/rejected": -3.6937179565429688, "loss": 0.5684, "nll_loss": 0.049736388027668, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.12749287486076355, "rewards/margins": 0.9806225895881653, "rewards/rejected": -1.1081154346466064, "step": 2140 }, { "epoch": 0.1887910785239172, "grad_norm": 3.721003293991089, "learning_rate": 8.458024691358026e-06, "logits/chosen": 2.294174909591675, "logits/rejected": 2.332521915435791, "logps/chosen": -0.3734773099422455, "logps/rejected": -2.520681858062744, "loss": 0.6118, "nll_loss": 0.03998088836669922, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.11204320192337036, "rewards/margins": 0.6441613435745239, "rewards/rejected": -0.7562046051025391, "step": 2150 }, { "epoch": 0.18966917656356333, "grad_norm": 0.09796835482120514, "learning_rate": 8.44567901234568e-06, "logits/chosen": 2.144991397857666, "logits/rejected": 2.220353364944458, "logps/chosen": -0.5873435139656067, "logps/rejected": -3.0226333141326904, "loss": 0.6662, "nll_loss": 0.05847520753741264, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.17620307207107544, "rewards/margins": 0.7305869460105896, "rewards/rejected": -0.9067900776863098, "step": 2160 }, { "epoch": 0.19054727460320944, "grad_norm": 5.563838005065918, "learning_rate": 8.433333333333334e-06, "logits/chosen": 2.4212334156036377, "logits/rejected": 2.4930455684661865, "logps/chosen": -1.0709176063537598, "logps/rejected": -3.6808440685272217, "loss": 0.705, "nll_loss": 0.1328948587179184, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.3212752938270569, "rewards/margins": 0.7829779386520386, "rewards/rejected": -1.1042531728744507, "step": 2170 }, { "epoch": 0.19142537264285558, "grad_norm": 2.7498562335968018, "learning_rate": 8.420987654320987e-06, "logits/chosen": 2.3793070316314697, "logits/rejected": 2.4112510681152344, "logps/chosen": -0.8520647883415222, "logps/rejected": -3.673933506011963, "loss": 0.6735, "nll_loss": 0.11274605989456177, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.25561946630477905, "rewards/margins": 0.8465606570243835, "rewards/rejected": -1.102180004119873, "step": 2180 }, { "epoch": 0.1923034706825017, "grad_norm": 2.2553093433380127, "learning_rate": 8.408641975308642e-06, "logits/chosen": 2.5548908710479736, "logits/rejected": 2.6078009605407715, "logps/chosen": -0.457774817943573, "logps/rejected": -3.5358822345733643, "loss": 0.6123, "nll_loss": 0.05915825441479683, "rewards/accuracies": 0.625, "rewards/chosen": -0.13733243942260742, "rewards/margins": 0.9234321713447571, "rewards/rejected": -1.0607647895812988, "step": 2190 }, { "epoch": 0.19318156872214784, "grad_norm": 2.100526809692383, "learning_rate": 8.396296296296296e-06, "logits/chosen": 2.438559055328369, "logits/rejected": 2.4799530506134033, "logps/chosen": -0.5425572395324707, "logps/rejected": -2.516019582748413, "loss": 0.6859, "nll_loss": 0.06712070107460022, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.1627671718597412, "rewards/margins": 0.5920388698577881, "rewards/rejected": -0.7548059821128845, "step": 2200 }, { "epoch": 0.19405966676179395, "grad_norm": 0.8824114203453064, "learning_rate": 8.383950617283952e-06, "logits/chosen": 2.6323628425598145, "logits/rejected": 2.694988489151001, "logps/chosen": -0.5362733006477356, "logps/rejected": -3.8703293800354004, "loss": 0.5774, "nll_loss": 0.0370684489607811, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.16088199615478516, "rewards/margins": 1.0002167224884033, "rewards/rejected": -1.1610987186431885, "step": 2210 }, { "epoch": 0.1949377648014401, "grad_norm": 7.0314717292785645, "learning_rate": 8.371604938271607e-06, "logits/chosen": 2.701634407043457, "logits/rejected": 2.749321222305298, "logps/chosen": -0.914169430732727, "logps/rejected": -2.8599061965942383, "loss": 0.7608, "nll_loss": 0.10523217916488647, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.2742508053779602, "rewards/margins": 0.5837210416793823, "rewards/rejected": -0.8579719662666321, "step": 2220 }, { "epoch": 0.1958158628410862, "grad_norm": 0.8926441669464111, "learning_rate": 8.359259259259261e-06, "logits/chosen": 2.576303720474243, "logits/rejected": 2.606104850769043, "logps/chosen": -0.8748048543930054, "logps/rejected": -2.003169536590576, "loss": 0.7253, "nll_loss": 0.07492824643850327, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.2624414563179016, "rewards/margins": 0.33850938081741333, "rewards/rejected": -0.6009508371353149, "step": 2230 }, { "epoch": 0.19669396088073232, "grad_norm": 1.0316241979599, "learning_rate": 8.346913580246915e-06, "logits/chosen": 2.5414836406707764, "logits/rejected": 2.6227262020111084, "logps/chosen": -0.6043969988822937, "logps/rejected": -3.0870282649993896, "loss": 0.5888, "nll_loss": 0.04015268385410309, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.18131910264492035, "rewards/margins": 0.7447894215583801, "rewards/rejected": -0.9261085391044617, "step": 2240 }, { "epoch": 0.19757205892037846, "grad_norm": 7.171932220458984, "learning_rate": 8.334567901234568e-06, "logits/chosen": 2.5014796257019043, "logits/rejected": 2.4662632942199707, "logps/chosen": -0.5610159039497375, "logps/rejected": -2.8177947998046875, "loss": 0.6317, "nll_loss": 0.06384368985891342, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.16830478608608246, "rewards/margins": 0.6770337224006653, "rewards/rejected": -0.8453385233879089, "step": 2250 }, { "epoch": 0.19845015696002458, "grad_norm": 3.9543685913085938, "learning_rate": 8.322222222222223e-06, "logits/chosen": 2.628187656402588, "logits/rejected": 2.585869312286377, "logps/chosen": -0.62028568983078, "logps/rejected": -2.7624683380126953, "loss": 0.6441, "nll_loss": 0.06617014110088348, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.18608573079109192, "rewards/margins": 0.6426547765731812, "rewards/rejected": -0.8287404775619507, "step": 2260 }, { "epoch": 0.19932825499967072, "grad_norm": 4.6166815757751465, "learning_rate": 8.309876543209877e-06, "logits/chosen": 2.4966917037963867, "logits/rejected": 2.537562847137451, "logps/chosen": -0.963221549987793, "logps/rejected": -3.5489554405212402, "loss": 0.663, "nll_loss": 0.06778384000062943, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.28896647691726685, "rewards/margins": 0.7757201790809631, "rewards/rejected": -1.06468665599823, "step": 2270 }, { "epoch": 0.20020635303931683, "grad_norm": 11.345588684082031, "learning_rate": 8.297530864197531e-06, "logits/chosen": 2.5486679077148438, "logits/rejected": 2.4989380836486816, "logps/chosen": -0.6900479197502136, "logps/rejected": -2.254317045211792, "loss": 0.7252, "nll_loss": 0.07977007329463959, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.20701436698436737, "rewards/margins": 0.4692806601524353, "rewards/rejected": -0.6762951016426086, "step": 2280 }, { "epoch": 0.20108445107896297, "grad_norm": 0.1733788251876831, "learning_rate": 8.285185185185186e-06, "logits/chosen": 2.6525139808654785, "logits/rejected": 2.702258586883545, "logps/chosen": -0.6712150573730469, "logps/rejected": -3.3791470527648926, "loss": 0.6285, "nll_loss": 0.06629864871501923, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.20136454701423645, "rewards/margins": 0.8123796582221985, "rewards/rejected": -1.0137441158294678, "step": 2290 }, { "epoch": 0.20196254911860909, "grad_norm": 2.2842109203338623, "learning_rate": 8.27283950617284e-06, "logits/chosen": 2.430759906768799, "logits/rejected": 2.509899616241455, "logps/chosen": -0.612421989440918, "logps/rejected": -2.978832960128784, "loss": 0.6656, "nll_loss": 0.0659157857298851, "rewards/accuracies": 0.625, "rewards/chosen": -0.18372659385204315, "rewards/margins": 0.7099233865737915, "rewards/rejected": -0.8936498761177063, "step": 2300 }, { "epoch": 0.20284064715825523, "grad_norm": 6.801368713378906, "learning_rate": 8.260493827160495e-06, "logits/chosen": 2.424044609069824, "logits/rejected": 2.4345576763153076, "logps/chosen": -0.4499019682407379, "logps/rejected": -3.497096300125122, "loss": 0.5669, "nll_loss": 0.03968087583780289, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.13497060537338257, "rewards/margins": 0.9141584634780884, "rewards/rejected": -1.0491290092468262, "step": 2310 }, { "epoch": 0.20371874519790134, "grad_norm": 4.946938514709473, "learning_rate": 8.248148148148149e-06, "logits/chosen": 2.310943841934204, "logits/rejected": 2.327831983566284, "logps/chosen": -0.6226638555526733, "logps/rejected": -5.598433971405029, "loss": 0.4657, "nll_loss": 0.053391944617033005, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.18679918348789215, "rewards/margins": 1.4927312135696411, "rewards/rejected": -1.679530382156372, "step": 2320 }, { "epoch": 0.20459684323754748, "grad_norm": 24.650245666503906, "learning_rate": 8.235802469135803e-06, "logits/chosen": 2.0664007663726807, "logits/rejected": 2.158742904663086, "logps/chosen": -2.7977170944213867, "logps/rejected": -6.907550811767578, "loss": 1.5409, "nll_loss": 0.6891128420829773, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.8393152356147766, "rewards/margins": 1.2329500913619995, "rewards/rejected": -2.072265148162842, "step": 2330 }, { "epoch": 0.2054749412771936, "grad_norm": 2.8847906589508057, "learning_rate": 8.223456790123458e-06, "logits/chosen": 2.0425117015838623, "logits/rejected": 2.046208381652832, "logps/chosen": -0.8856005668640137, "logps/rejected": -3.2776896953582764, "loss": 0.7465, "nll_loss": 0.10851933062076569, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.2656802237033844, "rewards/margins": 0.717626690864563, "rewards/rejected": -0.983306884765625, "step": 2340 }, { "epoch": 0.20635303931683974, "grad_norm": 6.361351013183594, "learning_rate": 8.211111111111112e-06, "logits/chosen": 2.2434327602386475, "logits/rejected": 2.3048999309539795, "logps/chosen": -0.753734290599823, "logps/rejected": -3.09961199760437, "loss": 0.6477, "nll_loss": 0.05653975531458855, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.22612027823925018, "rewards/margins": 0.7037633657455444, "rewards/rejected": -0.929883599281311, "step": 2350 }, { "epoch": 0.20723113735648585, "grad_norm": 4.796877384185791, "learning_rate": 8.198765432098767e-06, "logits/chosen": 2.420621156692505, "logits/rejected": 2.454742670059204, "logps/chosen": -0.5336098670959473, "logps/rejected": -2.5250916481018066, "loss": 0.6861, "nll_loss": 0.07172581553459167, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.16008298099040985, "rewards/margins": 0.5974445343017578, "rewards/rejected": -0.7575275897979736, "step": 2360 }, { "epoch": 0.208109235396132, "grad_norm": 6.713689804077148, "learning_rate": 8.186419753086421e-06, "logits/chosen": 2.700634241104126, "logits/rejected": 2.761862277984619, "logps/chosen": -0.9955413937568665, "logps/rejected": -3.3551056385040283, "loss": 0.7475, "nll_loss": 0.11970362812280655, "rewards/accuracies": 0.6875, "rewards/chosen": -0.2986624538898468, "rewards/margins": 0.7078693509101868, "rewards/rejected": -1.0065317153930664, "step": 2370 }, { "epoch": 0.2089873334357781, "grad_norm": 5.509604454040527, "learning_rate": 8.174074074074074e-06, "logits/chosen": 2.601839780807495, "logits/rejected": 2.6365180015563965, "logps/chosen": -0.7382031679153442, "logps/rejected": -2.511854648590088, "loss": 0.6877, "nll_loss": 0.058238618075847626, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.2214609682559967, "rewards/margins": 0.5320954918861389, "rewards/rejected": -0.7535563707351685, "step": 2380 }, { "epoch": 0.20986543147542422, "grad_norm": 0.9899409413337708, "learning_rate": 8.161728395061728e-06, "logits/chosen": 2.59370493888855, "logits/rejected": 2.653318166732788, "logps/chosen": -0.43372973799705505, "logps/rejected": -2.1165356636047363, "loss": 0.6498, "nll_loss": 0.055299948900938034, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.1301189363002777, "rewards/margins": 0.5048418045043945, "rewards/rejected": -0.6349607706069946, "step": 2390 }, { "epoch": 0.21074352951507036, "grad_norm": 2.4443204402923584, "learning_rate": 8.149382716049383e-06, "logits/chosen": 2.621575355529785, "logits/rejected": 2.6399495601654053, "logps/chosen": -0.6197006702423096, "logps/rejected": -2.7705483436584473, "loss": 0.6775, "nll_loss": 0.09681596606969833, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.1859102100133896, "rewards/margins": 0.6452543139457703, "rewards/rejected": -0.831164538860321, "step": 2400 }, { "epoch": 0.21162162755471647, "grad_norm": 3.302237033843994, "learning_rate": 8.137037037037037e-06, "logits/chosen": 2.6685478687286377, "logits/rejected": 2.645535469055176, "logps/chosen": -0.5314685106277466, "logps/rejected": -2.784475803375244, "loss": 0.6021, "nll_loss": 0.05336238071322441, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.1594405472278595, "rewards/margins": 0.675902247428894, "rewards/rejected": -0.8353427052497864, "step": 2410 }, { "epoch": 0.21249972559436262, "grad_norm": 1.3908727169036865, "learning_rate": 8.124691358024692e-06, "logits/chosen": 2.372706413269043, "logits/rejected": 2.4324469566345215, "logps/chosen": -0.5971062779426575, "logps/rejected": -2.159764051437378, "loss": 0.6943, "nll_loss": 0.05865710228681564, "rewards/accuracies": 0.6875, "rewards/chosen": -0.179131880402565, "rewards/margins": 0.4687972664833069, "rewards/rejected": -0.6479291915893555, "step": 2420 }, { "epoch": 0.21337782363400873, "grad_norm": 2.2388875484466553, "learning_rate": 8.112345679012346e-06, "logits/chosen": 2.378962993621826, "logits/rejected": 2.4710259437561035, "logps/chosen": -0.6657778024673462, "logps/rejected": -2.7295007705688477, "loss": 0.6557, "nll_loss": 0.07417738437652588, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.19973333179950714, "rewards/margins": 0.6191169619560242, "rewards/rejected": -0.8188502192497253, "step": 2430 }, { "epoch": 0.21425592167365487, "grad_norm": 11.22681713104248, "learning_rate": 8.1e-06, "logits/chosen": 2.5654773712158203, "logits/rejected": 2.5966382026672363, "logps/chosen": -0.5373214483261108, "logps/rejected": -2.4600396156311035, "loss": 0.6436, "nll_loss": 0.07008077204227448, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.16119642555713654, "rewards/margins": 0.5768154263496399, "rewards/rejected": -0.7380119562149048, "step": 2440 }, { "epoch": 0.21513401971330098, "grad_norm": 3.76454496383667, "learning_rate": 8.087654320987655e-06, "logits/chosen": 2.541652202606201, "logits/rejected": 2.5319390296936035, "logps/chosen": -0.7174406051635742, "logps/rejected": -2.493823528289795, "loss": 0.694, "nll_loss": 0.08293718844652176, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.21523217856884003, "rewards/margins": 0.5329148173332214, "rewards/rejected": -0.7481471300125122, "step": 2450 }, { "epoch": 0.21601211775294712, "grad_norm": 6.552890777587891, "learning_rate": 8.07530864197531e-06, "logits/chosen": 2.4907002449035645, "logits/rejected": 2.483581066131592, "logps/chosen": -0.5098174214363098, "logps/rejected": -2.033862829208374, "loss": 0.6884, "nll_loss": 0.06638985127210617, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.15294523537158966, "rewards/margins": 0.4572136402130127, "rewards/rejected": -0.6101589202880859, "step": 2460 }, { "epoch": 0.21689021579259324, "grad_norm": 1.994023323059082, "learning_rate": 8.062962962962964e-06, "logits/chosen": 2.6207573413848877, "logits/rejected": 2.5812675952911377, "logps/chosen": -0.7166529297828674, "logps/rejected": -2.042680263519287, "loss": 0.673, "nll_loss": 0.04922042042016983, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.21499589085578918, "rewards/margins": 0.3978081941604614, "rewards/rejected": -0.6128040552139282, "step": 2470 }, { "epoch": 0.21776831383223938, "grad_norm": 1.6430639028549194, "learning_rate": 8.050617283950618e-06, "logits/chosen": 2.2188925743103027, "logits/rejected": 2.256579875946045, "logps/chosen": -0.4708133637905121, "logps/rejected": -1.9442718029022217, "loss": 0.643, "nll_loss": 0.043398790061473846, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.14124402403831482, "rewards/margins": 0.44203758239746094, "rewards/rejected": -0.5832816362380981, "step": 2480 }, { "epoch": 0.2186464118718855, "grad_norm": 2.5793867111206055, "learning_rate": 8.038271604938272e-06, "logits/chosen": 2.5893726348876953, "logits/rejected": 2.5955262184143066, "logps/chosen": -0.9521909952163696, "logps/rejected": -2.2524361610412598, "loss": 0.7949, "nll_loss": 0.09755026549100876, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.28565728664398193, "rewards/margins": 0.3900734782218933, "rewards/rejected": -0.6757307648658752, "step": 2490 }, { "epoch": 0.21952450991153163, "grad_norm": 3.0991997718811035, "learning_rate": 8.025925925925927e-06, "logits/chosen": 2.4973714351654053, "logits/rejected": 2.5837717056274414, "logps/chosen": -0.7727764248847961, "logps/rejected": -1.6810334920883179, "loss": 0.7202, "nll_loss": 0.08514519035816193, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.23183290660381317, "rewards/margins": 0.2724771797657013, "rewards/rejected": -0.5043100714683533, "step": 2500 }, { "epoch": 0.22040260795117775, "grad_norm": 0.889065682888031, "learning_rate": 8.01358024691358e-06, "logits/chosen": 2.623380422592163, "logits/rejected": 2.6181862354278564, "logps/chosen": -0.6605237126350403, "logps/rejected": -2.3106017112731934, "loss": 0.6626, "nll_loss": 0.056938063353300095, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.19815710186958313, "rewards/margins": 0.4950234889984131, "rewards/rejected": -0.6931806206703186, "step": 2510 }, { "epoch": 0.2212807059908239, "grad_norm": 1.9629164934158325, "learning_rate": 8.001234567901234e-06, "logits/chosen": 2.725886583328247, "logits/rejected": 2.8111538887023926, "logps/chosen": -0.9530594944953918, "logps/rejected": -2.9670989513397217, "loss": 0.6788, "nll_loss": 0.08854852616786957, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.28591784834861755, "rewards/margins": 0.6042118072509766, "rewards/rejected": -0.8901296854019165, "step": 2520 }, { "epoch": 0.22215880403047, "grad_norm": 2.8490424156188965, "learning_rate": 7.98888888888889e-06, "logits/chosen": 2.6199440956115723, "logits/rejected": 2.5510783195495605, "logps/chosen": -0.7600888013839722, "logps/rejected": -2.3557190895080566, "loss": 0.6933, "nll_loss": 0.07890000194311142, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.22802665829658508, "rewards/margins": 0.47868919372558594, "rewards/rejected": -0.7067158222198486, "step": 2530 }, { "epoch": 0.22303690207011612, "grad_norm": 2.2429914474487305, "learning_rate": 7.976543209876545e-06, "logits/chosen": 2.657155752182007, "logits/rejected": 2.730384349822998, "logps/chosen": -0.6944109201431274, "logps/rejected": -2.3378891944885254, "loss": 0.6597, "nll_loss": 0.07231010496616364, "rewards/accuracies": 0.625, "rewards/chosen": -0.20832328498363495, "rewards/margins": 0.4930434226989746, "rewards/rejected": -0.7013667821884155, "step": 2540 }, { "epoch": 0.22391500010976226, "grad_norm": 8.76103401184082, "learning_rate": 7.964197530864199e-06, "logits/chosen": 2.425436019897461, "logits/rejected": 2.432227849960327, "logps/chosen": -0.8007850646972656, "logps/rejected": -1.2312095165252686, "loss": 0.8001, "nll_loss": 0.08954410254955292, "rewards/accuracies": 0.6875, "rewards/chosen": -0.24023552238941193, "rewards/margins": 0.1291273534297943, "rewards/rejected": -0.36936289072036743, "step": 2550 }, { "epoch": 0.22479309814940837, "grad_norm": 0.9743072986602783, "learning_rate": 7.951851851851853e-06, "logits/chosen": 2.5851752758026123, "logits/rejected": 2.6295394897460938, "logps/chosen": -0.5336810946464539, "logps/rejected": -1.8357422351837158, "loss": 0.6645, "nll_loss": 0.0451740063726902, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.16010431945323944, "rewards/margins": 0.39061832427978516, "rewards/rejected": -0.5507226586341858, "step": 2560 }, { "epoch": 0.2256711961890545, "grad_norm": 16.18039321899414, "learning_rate": 7.939506172839508e-06, "logits/chosen": 2.548182487487793, "logits/rejected": 2.511976718902588, "logps/chosen": -0.34628647565841675, "logps/rejected": -1.9476335048675537, "loss": 0.6362, "nll_loss": 0.043379928916692734, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.1038859486579895, "rewards/margins": 0.4804041385650635, "rewards/rejected": -0.584290087223053, "step": 2570 }, { "epoch": 0.22654929422870063, "grad_norm": 0.8062020540237427, "learning_rate": 7.92716049382716e-06, "logits/chosen": 2.4778008460998535, "logits/rejected": 2.540804147720337, "logps/chosen": -0.5255548357963562, "logps/rejected": -2.09405779838562, "loss": 0.6816, "nll_loss": 0.07233087718486786, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.15766644477844238, "rewards/margins": 0.4705510139465332, "rewards/rejected": -0.6282175183296204, "step": 2580 }, { "epoch": 0.22742739226834677, "grad_norm": 0.9400202631950378, "learning_rate": 7.914814814814815e-06, "logits/chosen": 2.3705923557281494, "logits/rejected": 2.3822696208953857, "logps/chosen": -0.29244324564933777, "logps/rejected": -1.8687235116958618, "loss": 0.643, "nll_loss": 0.0392463319003582, "rewards/accuracies": 0.75, "rewards/chosen": -0.08773298561573029, "rewards/margins": 0.47288402915000916, "rewards/rejected": -0.5606169700622559, "step": 2590 }, { "epoch": 0.22830549030799288, "grad_norm": 2.667971134185791, "learning_rate": 7.90246913580247e-06, "logits/chosen": 2.3539376258850098, "logits/rejected": 2.370870351791382, "logps/chosen": -0.8493935465812683, "logps/rejected": -2.638115882873535, "loss": 0.7404, "nll_loss": 0.10912100225687027, "rewards/accuracies": 0.6875, "rewards/chosen": -0.2548181116580963, "rewards/margins": 0.5366166830062866, "rewards/rejected": -0.7914347648620605, "step": 2600 }, { "epoch": 0.22918358834763902, "grad_norm": 2.8281736373901367, "learning_rate": 7.890123456790124e-06, "logits/chosen": 2.146486759185791, "logits/rejected": 2.068389415740967, "logps/chosen": -0.5109766125679016, "logps/rejected": -1.839223861694336, "loss": 0.6687, "nll_loss": 0.056281328201293945, "rewards/accuracies": 0.625, "rewards/chosen": -0.15329298377037048, "rewards/margins": 0.39847415685653687, "rewards/rejected": -0.5517671704292297, "step": 2610 }, { "epoch": 0.23006168638728514, "grad_norm": 0.8495587706565857, "learning_rate": 7.877777777777778e-06, "logits/chosen": 2.6396515369415283, "logits/rejected": 2.7024216651916504, "logps/chosen": -0.6976840496063232, "logps/rejected": -2.1378865242004395, "loss": 0.6814, "nll_loss": 0.056956302374601364, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.20930524170398712, "rewards/margins": 0.43206077814102173, "rewards/rejected": -0.6413660049438477, "step": 2620 }, { "epoch": 0.23093978442693128, "grad_norm": 3.5757088661193848, "learning_rate": 7.865432098765433e-06, "logits/chosen": 2.367114305496216, "logits/rejected": 2.4596433639526367, "logps/chosen": -0.8233789205551147, "logps/rejected": -3.531553268432617, "loss": 0.6048, "nll_loss": 0.07218165695667267, "rewards/accuracies": 0.6875, "rewards/chosen": -0.24701371788978577, "rewards/margins": 0.8124523162841797, "rewards/rejected": -1.059466004371643, "step": 2630 }, { "epoch": 0.2318178824665774, "grad_norm": 1.7588326930999756, "learning_rate": 7.853086419753087e-06, "logits/chosen": 2.2890639305114746, "logits/rejected": 2.2860817909240723, "logps/chosen": -0.31422901153564453, "logps/rejected": -2.4267849922180176, "loss": 0.584, "nll_loss": 0.03038870170712471, "rewards/accuracies": 0.6875, "rewards/chosen": -0.09426870942115784, "rewards/margins": 0.6337667107582092, "rewards/rejected": -0.7280355095863342, "step": 2640 }, { "epoch": 0.23269598050622353, "grad_norm": 4.534848213195801, "learning_rate": 7.840740740740741e-06, "logits/chosen": 2.2893667221069336, "logits/rejected": 2.3961310386657715, "logps/chosen": -1.1099700927734375, "logps/rejected": -3.46467924118042, "loss": 0.7172, "nll_loss": 0.1096486896276474, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.3329910337924957, "rewards/margins": 0.7064129114151001, "rewards/rejected": -1.0394039154052734, "step": 2650 }, { "epoch": 0.23357407854586965, "grad_norm": 4.673385143280029, "learning_rate": 7.828395061728396e-06, "logits/chosen": 2.296675205230713, "logits/rejected": 2.2739694118499756, "logps/chosen": -0.5930169820785522, "logps/rejected": -2.1158299446105957, "loss": 0.6747, "nll_loss": 0.06488887220621109, "rewards/accuracies": 0.75, "rewards/chosen": -0.1779050976037979, "rewards/margins": 0.4568440020084381, "rewards/rejected": -0.6347490549087524, "step": 2660 }, { "epoch": 0.23445217658551576, "grad_norm": 4.619322776794434, "learning_rate": 7.81604938271605e-06, "logits/chosen": 2.316483974456787, "logits/rejected": 2.2958438396453857, "logps/chosen": -0.8736156225204468, "logps/rejected": -2.8213837146759033, "loss": 0.7145, "nll_loss": 0.0816427692770958, "rewards/accuracies": 0.6875, "rewards/chosen": -0.2620847225189209, "rewards/margins": 0.5843304395675659, "rewards/rejected": -0.8464152216911316, "step": 2670 }, { "epoch": 0.2353302746251619, "grad_norm": 2.9810938835144043, "learning_rate": 7.803703703703705e-06, "logits/chosen": 2.459575891494751, "logits/rejected": 2.5659358501434326, "logps/chosen": -0.7047882080078125, "logps/rejected": -2.6978650093078613, "loss": 0.6257, "nll_loss": 0.048231981694698334, "rewards/accuracies": 0.75, "rewards/chosen": -0.21143648028373718, "rewards/margins": 0.5979229807853699, "rewards/rejected": -0.8093594312667847, "step": 2680 }, { "epoch": 0.23620837266480801, "grad_norm": 1.5284343957901, "learning_rate": 7.791358024691359e-06, "logits/chosen": 2.327259063720703, "logits/rejected": 2.342414379119873, "logps/chosen": -0.630928635597229, "logps/rejected": -1.5021655559539795, "loss": 0.749, "nll_loss": 0.08067157119512558, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.18927858769893646, "rewards/margins": 0.26137107610702515, "rewards/rejected": -0.4506497383117676, "step": 2690 }, { "epoch": 0.23708647070445416, "grad_norm": 2.211660385131836, "learning_rate": 7.779012345679014e-06, "logits/chosen": 2.6317856311798096, "logits/rejected": 2.615809679031372, "logps/chosen": -0.5954752564430237, "logps/rejected": -2.3459115028381348, "loss": 0.6778, "nll_loss": 0.08433017879724503, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.17864257097244263, "rewards/margins": 0.5251308679580688, "rewards/rejected": -0.7037734389305115, "step": 2700 }, { "epoch": 0.23796456874410027, "grad_norm": 1.088346004486084, "learning_rate": 7.766666666666666e-06, "logits/chosen": 2.3896658420562744, "logits/rejected": 2.4483752250671387, "logps/chosen": -0.9959812164306641, "logps/rejected": -2.1024794578552246, "loss": 0.7702, "nll_loss": 0.07456602156162262, "rewards/accuracies": 0.625, "rewards/chosen": -0.29879438877105713, "rewards/margins": 0.33194953203201294, "rewards/rejected": -0.6307438611984253, "step": 2710 }, { "epoch": 0.2388426667837464, "grad_norm": 0.024926647543907166, "learning_rate": 7.75432098765432e-06, "logits/chosen": 2.4478306770324707, "logits/rejected": 2.464536190032959, "logps/chosen": -0.5027719736099243, "logps/rejected": -2.434457540512085, "loss": 0.6162, "nll_loss": 0.037077441811561584, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.15083159506320953, "rewards/margins": 0.5795056819915771, "rewards/rejected": -0.730337381362915, "step": 2720 }, { "epoch": 0.23972076482339252, "grad_norm": 2.2997021675109863, "learning_rate": 7.741975308641975e-06, "logits/chosen": 2.381772518157959, "logits/rejected": 2.4063820838928223, "logps/chosen": -0.2956869602203369, "logps/rejected": -3.0764455795288086, "loss": 0.5462, "nll_loss": 0.0379708856344223, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.08870609104633331, "rewards/margins": 0.834227442741394, "rewards/rejected": -0.9229336977005005, "step": 2730 }, { "epoch": 0.24059886286303866, "grad_norm": 0.1084788367152214, "learning_rate": 7.72962962962963e-06, "logits/chosen": 2.582411766052246, "logits/rejected": 2.6022956371307373, "logps/chosen": -0.8210613131523132, "logps/rejected": -1.958987832069397, "loss": 0.7403, "nll_loss": 0.07586108148097992, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.24631838500499725, "rewards/margins": 0.34137797355651855, "rewards/rejected": -0.5876964330673218, "step": 2740 }, { "epoch": 0.24147696090268478, "grad_norm": 3.7977356910705566, "learning_rate": 7.717283950617284e-06, "logits/chosen": 2.4403786659240723, "logits/rejected": 2.398824453353882, "logps/chosen": -0.5754778981208801, "logps/rejected": -2.9741971492767334, "loss": 0.631, "nll_loss": 0.06641169637441635, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.17264336347579956, "rewards/margins": 0.7196158170700073, "rewards/rejected": -0.8922592401504517, "step": 2750 }, { "epoch": 0.24235505894233092, "grad_norm": 1.7483222484588623, "learning_rate": 7.70493827160494e-06, "logits/chosen": 2.11322021484375, "logits/rejected": 2.1725521087646484, "logps/chosen": -0.5592783689498901, "logps/rejected": -1.9913737773895264, "loss": 0.671, "nll_loss": 0.07330699265003204, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.16778354346752167, "rewards/margins": 0.4296286106109619, "rewards/rejected": -0.5974121689796448, "step": 2760 }, { "epoch": 0.24323315698197703, "grad_norm": 1.0311236381530762, "learning_rate": 7.692592592592594e-06, "logits/chosen": 2.309854507446289, "logits/rejected": 2.313572406768799, "logps/chosen": -0.6930335760116577, "logps/rejected": -2.353086233139038, "loss": 0.7228, "nll_loss": 0.0806727483868599, "rewards/accuracies": 0.625, "rewards/chosen": -0.20791009068489075, "rewards/margins": 0.498015820980072, "rewards/rejected": -0.7059258818626404, "step": 2770 }, { "epoch": 0.24411125502162317, "grad_norm": 4.491925239562988, "learning_rate": 7.680246913580247e-06, "logits/chosen": 2.534341335296631, "logits/rejected": 2.560044050216675, "logps/chosen": -0.536239743232727, "logps/rejected": -2.622220993041992, "loss": 0.6372, "nll_loss": 0.05687220022082329, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.1608719378709793, "rewards/margins": 0.625794529914856, "rewards/rejected": -0.7866664528846741, "step": 2780 }, { "epoch": 0.2449893530612693, "grad_norm": 0.7264915108680725, "learning_rate": 7.667901234567902e-06, "logits/chosen": 2.2637343406677246, "logits/rejected": 2.312181234359741, "logps/chosen": -0.7516659498214722, "logps/rejected": -1.9516077041625977, "loss": 0.7264, "nll_loss": 0.07927460223436356, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.22549979388713837, "rewards/margins": 0.3599824607372284, "rewards/rejected": -0.5854822993278503, "step": 2790 }, { "epoch": 0.24586745110091543, "grad_norm": 3.0847902297973633, "learning_rate": 7.655555555555556e-06, "logits/chosen": 2.367601156234741, "logits/rejected": 2.403787612915039, "logps/chosen": -0.41557034850120544, "logps/rejected": -2.6111361980438232, "loss": 0.6177, "nll_loss": 0.04188116267323494, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.12467111647129059, "rewards/margins": 0.6586698889732361, "rewards/rejected": -0.7833409905433655, "step": 2800 }, { "epoch": 0.24674554914056154, "grad_norm": 4.381664752960205, "learning_rate": 7.64320987654321e-06, "logits/chosen": 2.159615993499756, "logits/rejected": 2.1967501640319824, "logps/chosen": -0.8560575246810913, "logps/rejected": -2.5341227054595947, "loss": 0.6988, "nll_loss": 0.0791650265455246, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.2568172812461853, "rewards/margins": 0.5034195780754089, "rewards/rejected": -0.7602368593215942, "step": 2810 }, { "epoch": 0.24762364718020766, "grad_norm": 1.5694470405578613, "learning_rate": 7.630864197530865e-06, "logits/chosen": 2.573787212371826, "logits/rejected": 2.6199076175689697, "logps/chosen": -0.3929263949394226, "logps/rejected": -2.310509443283081, "loss": 0.5938, "nll_loss": 0.04064936563372612, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.11787792295217514, "rewards/margins": 0.5752750039100647, "rewards/rejected": -0.6931529641151428, "step": 2820 }, { "epoch": 0.2485017452198538, "grad_norm": 0.7673569321632385, "learning_rate": 7.618518518518519e-06, "logits/chosen": 2.1453781127929688, "logits/rejected": 2.2463371753692627, "logps/chosen": -0.40726566314697266, "logps/rejected": -2.0054023265838623, "loss": 0.658, "nll_loss": 0.048675037920475006, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.12217970192432404, "rewards/margins": 0.4794410765171051, "rewards/rejected": -0.6016206741333008, "step": 2830 }, { "epoch": 0.2493798432594999, "grad_norm": 0.3016711473464966, "learning_rate": 7.606172839506173e-06, "logits/chosen": 2.3802428245544434, "logits/rejected": 2.4284090995788574, "logps/chosen": -0.2703506350517273, "logps/rejected": -2.1822762489318848, "loss": 0.6137, "nll_loss": 0.03810378909111023, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.08110519498586655, "rewards/margins": 0.5735777020454407, "rewards/rejected": -0.6546828746795654, "step": 2840 }, { "epoch": 0.250257941299146, "grad_norm": 7.489548683166504, "learning_rate": 7.593827160493827e-06, "logits/chosen": 2.248429298400879, "logits/rejected": 2.308821678161621, "logps/chosen": -0.8789108991622925, "logps/rejected": -3.1246941089630127, "loss": 0.664, "nll_loss": 0.07486443221569061, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.2636732757091522, "rewards/margins": 0.6737349033355713, "rewards/rejected": -0.9374082684516907, "step": 2850 }, { "epoch": 0.2511360393387922, "grad_norm": 1.4026418924331665, "learning_rate": 7.581481481481482e-06, "logits/chosen": 2.4665749073028564, "logits/rejected": 2.4932830333709717, "logps/chosen": -0.6312376260757446, "logps/rejected": -2.414559841156006, "loss": 0.6769, "nll_loss": 0.043715715408325195, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.18937130272388458, "rewards/margins": 0.5349966883659363, "rewards/rejected": -0.7243679761886597, "step": 2860 }, { "epoch": 0.2520141373784383, "grad_norm": 6.41206169128418, "learning_rate": 7.569135802469136e-06, "logits/chosen": 1.9791500568389893, "logits/rejected": 2.065732479095459, "logps/chosen": -0.3990306854248047, "logps/rejected": -2.8924102783203125, "loss": 0.5879, "nll_loss": 0.03257184475660324, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.11970920860767365, "rewards/margins": 0.7480138540267944, "rewards/rejected": -0.8677231073379517, "step": 2870 }, { "epoch": 0.2528922354180844, "grad_norm": 0.007237335667014122, "learning_rate": 7.5567901234567905e-06, "logits/chosen": 2.199582099914551, "logits/rejected": 2.2399208545684814, "logps/chosen": -0.5396376848220825, "logps/rejected": -2.956026554107666, "loss": 0.6215, "nll_loss": 0.05001888796687126, "rewards/accuracies": 0.75, "rewards/chosen": -0.16189131140708923, "rewards/margins": 0.7249167561531067, "rewards/rejected": -0.8868080377578735, "step": 2880 }, { "epoch": 0.25377033345773053, "grad_norm": 1.490009069442749, "learning_rate": 7.544444444444445e-06, "logits/chosen": 2.2380213737487793, "logits/rejected": 2.329550266265869, "logps/chosen": -0.5914583206176758, "logps/rejected": -3.31396484375, "loss": 0.6415, "nll_loss": 0.07680721580982208, "rewards/accuracies": 0.75, "rewards/chosen": -0.17743751406669617, "rewards/margins": 0.8167519569396973, "rewards/rejected": -0.9941895604133606, "step": 2890 }, { "epoch": 0.2546484314973767, "grad_norm": 4.014424800872803, "learning_rate": 7.5320987654321e-06, "logits/chosen": 1.9832671880722046, "logits/rejected": 2.1117234230041504, "logps/chosen": -0.7208765745162964, "logps/rejected": -2.1712794303894043, "loss": 0.7748, "nll_loss": 0.10638797283172607, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.21626298129558563, "rewards/margins": 0.43512091040611267, "rewards/rejected": -0.6513839364051819, "step": 2900 }, { "epoch": 0.2555265295370228, "grad_norm": 0.9837947487831116, "learning_rate": 7.519753086419753e-06, "logits/chosen": 2.266629695892334, "logits/rejected": 2.3368468284606934, "logps/chosen": -0.5821598172187805, "logps/rejected": -3.631063938140869, "loss": 0.622, "nll_loss": 0.04266344755887985, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.1746479570865631, "rewards/margins": 0.9146712422370911, "rewards/rejected": -1.0893189907073975, "step": 2910 }, { "epoch": 0.25640462757666893, "grad_norm": 7.946182727813721, "learning_rate": 7.507407407407407e-06, "logits/chosen": 2.081207275390625, "logits/rejected": 2.123415946960449, "logps/chosen": -0.6722933053970337, "logps/rejected": -3.213026762008667, "loss": 0.6752, "nll_loss": 0.09485231339931488, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.2016880065202713, "rewards/margins": 0.762220025062561, "rewards/rejected": -0.9639080762863159, "step": 2920 }, { "epoch": 0.25728272561631504, "grad_norm": 3.1857895851135254, "learning_rate": 7.495061728395062e-06, "logits/chosen": 2.0671546459198, "logits/rejected": 2.1429953575134277, "logps/chosen": -1.1341451406478882, "logps/rejected": -3.0532522201538086, "loss": 0.7432, "nll_loss": 0.07530729472637177, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.34024354815483093, "rewards/margins": 0.5757321119308472, "rewards/rejected": -0.9159756898880005, "step": 2930 }, { "epoch": 0.2581608236559612, "grad_norm": 5.178243637084961, "learning_rate": 7.482716049382717e-06, "logits/chosen": 2.082383394241333, "logits/rejected": 2.076977252960205, "logps/chosen": -0.3990221321582794, "logps/rejected": -1.7076250314712524, "loss": 0.6855, "nll_loss": 0.06153715401887894, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.1197066530585289, "rewards/margins": 0.39258089661598206, "rewards/rejected": -0.5122874975204468, "step": 2940 }, { "epoch": 0.2590389216956073, "grad_norm": 1.9402194023132324, "learning_rate": 7.4703703703703715e-06, "logits/chosen": 2.0955326557159424, "logits/rejected": 2.119300603866577, "logps/chosen": -0.48326557874679565, "logps/rejected": -2.141892194747925, "loss": 0.6575, "nll_loss": 0.05360071733593941, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.14497968554496765, "rewards/margins": 0.49758806824684143, "rewards/rejected": -0.6425677537918091, "step": 2950 }, { "epoch": 0.25991701973525344, "grad_norm": 2.156696081161499, "learning_rate": 7.458024691358026e-06, "logits/chosen": 2.2110159397125244, "logits/rejected": 2.210599422454834, "logps/chosen": -0.7016893625259399, "logps/rejected": -1.9251207113265991, "loss": 0.7172, "nll_loss": 0.06567586958408356, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.21050682663917542, "rewards/margins": 0.36702945828437805, "rewards/rejected": -0.5775362253189087, "step": 2960 }, { "epoch": 0.26079511777489955, "grad_norm": 4.643383026123047, "learning_rate": 7.44567901234568e-06, "logits/chosen": 2.3343098163604736, "logits/rejected": 2.3111183643341064, "logps/chosen": -0.546606183052063, "logps/rejected": -1.9335724115371704, "loss": 0.6684, "nll_loss": 0.04829854518175125, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.1639818698167801, "rewards/margins": 0.4160899519920349, "rewards/rejected": -0.5800718069076538, "step": 2970 }, { "epoch": 0.26167321581454567, "grad_norm": 3.739795684814453, "learning_rate": 7.433333333333334e-06, "logits/chosen": 2.259247303009033, "logits/rejected": 2.3139965534210205, "logps/chosen": -0.6394690871238708, "logps/rejected": -3.2223987579345703, "loss": 0.6667, "nll_loss": 0.07620217651128769, "rewards/accuracies": 0.6875, "rewards/chosen": -0.1918407380580902, "rewards/margins": 0.774878978729248, "rewards/rejected": -0.9667198061943054, "step": 2980 }, { "epoch": 0.26255131385419184, "grad_norm": 3.2304673194885254, "learning_rate": 7.420987654320988e-06, "logits/chosen": 2.221369504928589, "logits/rejected": 2.2490549087524414, "logps/chosen": -0.3039132356643677, "logps/rejected": -3.022956371307373, "loss": 0.5563, "nll_loss": 0.030767951160669327, "rewards/accuracies": 0.6875, "rewards/chosen": -0.09117396920919418, "rewards/margins": 0.8157129287719727, "rewards/rejected": -0.9068870544433594, "step": 2990 }, { "epoch": 0.26342941189383795, "grad_norm": 1.1860368251800537, "learning_rate": 7.408641975308643e-06, "logits/chosen": 2.0464189052581787, "logits/rejected": 2.0965323448181152, "logps/chosen": -0.32864516973495483, "logps/rejected": -2.536147356033325, "loss": 0.617, "nll_loss": 0.044617362320423126, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.09859354794025421, "rewards/margins": 0.6622506380081177, "rewards/rejected": -0.7608442306518555, "step": 3000 }, { "epoch": 0.26430750993348406, "grad_norm": 1.1873282194137573, "learning_rate": 7.396296296296297e-06, "logits/chosen": 2.152050256729126, "logits/rejected": 2.2446908950805664, "logps/chosen": -0.4739529490470886, "logps/rejected": -3.3722312450408936, "loss": 0.6149, "nll_loss": 0.06702496111392975, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.14218589663505554, "rewards/margins": 0.8694835901260376, "rewards/rejected": -1.011669397354126, "step": 3010 }, { "epoch": 0.2651856079731302, "grad_norm": 4.482595920562744, "learning_rate": 7.3839506172839516e-06, "logits/chosen": 2.094207286834717, "logits/rejected": 2.200845241546631, "logps/chosen": -1.2236783504486084, "logps/rejected": -4.37764835357666, "loss": 0.6561, "nll_loss": 0.09786146134138107, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.3671035170555115, "rewards/margins": 0.9461910128593445, "rewards/rejected": -1.3132946491241455, "step": 3020 }, { "epoch": 0.26606370601277635, "grad_norm": 0.8290062546730042, "learning_rate": 7.371604938271606e-06, "logits/chosen": 1.9782556295394897, "logits/rejected": 2.009742021560669, "logps/chosen": -0.425149142742157, "logps/rejected": -3.8111705780029297, "loss": 0.5745, "nll_loss": 0.042985234409570694, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.12754476070404053, "rewards/margins": 1.0158064365386963, "rewards/rejected": -1.1433511972427368, "step": 3030 }, { "epoch": 0.26694180405242246, "grad_norm": 7.664605140686035, "learning_rate": 7.3592592592592595e-06, "logits/chosen": 2.083406448364258, "logits/rejected": 2.1995325088500977, "logps/chosen": -1.0140199661254883, "logps/rejected": -3.976250410079956, "loss": 0.759, "nll_loss": 0.07868107408285141, "rewards/accuracies": 0.6875, "rewards/chosen": -0.3042060136795044, "rewards/margins": 0.8886691927909851, "rewards/rejected": -1.1928752660751343, "step": 3040 }, { "epoch": 0.2678199020920686, "grad_norm": 5.215522766113281, "learning_rate": 7.346913580246914e-06, "logits/chosen": 2.006986141204834, "logits/rejected": 2.1079816818237305, "logps/chosen": -0.6254408955574036, "logps/rejected": -2.870253801345825, "loss": 0.6843, "nll_loss": 0.07333754748106003, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.1876322478055954, "rewards/margins": 0.6734437346458435, "rewards/rejected": -0.8610760569572449, "step": 3050 }, { "epoch": 0.2686980001317147, "grad_norm": 1.4749705791473389, "learning_rate": 7.334567901234568e-06, "logits/chosen": 2.1230270862579346, "logits/rejected": 2.2760751247406006, "logps/chosen": -0.9829801321029663, "logps/rejected": -2.8155903816223145, "loss": 0.7342, "nll_loss": 0.10188277065753937, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.2948940396308899, "rewards/margins": 0.5497831702232361, "rewards/rejected": -0.8446771502494812, "step": 3060 }, { "epoch": 0.26957609817136086, "grad_norm": 9.571310043334961, "learning_rate": 7.322222222222223e-06, "logits/chosen": 2.225337028503418, "logits/rejected": 2.2470784187316895, "logps/chosen": -0.7389670014381409, "logps/rejected": -2.8324790000915527, "loss": 0.6649, "nll_loss": 0.0714489072561264, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.2216901034116745, "rewards/margins": 0.628053605556488, "rewards/rejected": -0.8497437238693237, "step": 3070 }, { "epoch": 0.27045419621100697, "grad_norm": 1.4797980785369873, "learning_rate": 7.309876543209877e-06, "logits/chosen": 2.0399069786071777, "logits/rejected": 2.1479554176330566, "logps/chosen": -0.7341340780258179, "logps/rejected": -2.218003749847412, "loss": 0.7062, "nll_loss": 0.057169754058122635, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.22024023532867432, "rewards/margins": 0.4451608657836914, "rewards/rejected": -0.6654011011123657, "step": 3080 }, { "epoch": 0.2713322942506531, "grad_norm": 11.532062530517578, "learning_rate": 7.297530864197532e-06, "logits/chosen": 2.1539671421051025, "logits/rejected": 2.135166645050049, "logps/chosen": -0.9680719375610352, "logps/rejected": -2.85876202583313, "loss": 0.7423, "nll_loss": 0.0898386538028717, "rewards/accuracies": 0.6875, "rewards/chosen": -0.2904215455055237, "rewards/margins": 0.5672070980072021, "rewards/rejected": -0.8576286435127258, "step": 3090 }, { "epoch": 0.2722103922902992, "grad_norm": 0.5979923605918884, "learning_rate": 7.285185185185186e-06, "logits/chosen": 2.2366061210632324, "logits/rejected": 2.297550916671753, "logps/chosen": -0.9880453944206238, "logps/rejected": -3.887558698654175, "loss": 0.6517, "nll_loss": 0.08710993081331253, "rewards/accuracies": 0.6875, "rewards/chosen": -0.2964136004447937, "rewards/margins": 0.8698541522026062, "rewards/rejected": -1.1662677526474, "step": 3100 }, { "epoch": 0.27308849032994537, "grad_norm": 6.401426792144775, "learning_rate": 7.27283950617284e-06, "logits/chosen": 2.1483216285705566, "logits/rejected": 2.219287395477295, "logps/chosen": -0.8323481678962708, "logps/rejected": -1.9748185873031616, "loss": 0.7784, "nll_loss": 0.11523783206939697, "rewards/accuracies": 0.625, "rewards/chosen": -0.2497044801712036, "rewards/margins": 0.34274110198020935, "rewards/rejected": -0.5924455523490906, "step": 3110 }, { "epoch": 0.2739665883695915, "grad_norm": 2.1834819316864014, "learning_rate": 7.260493827160494e-06, "logits/chosen": 2.21921968460083, "logits/rejected": 2.247816801071167, "logps/chosen": -0.8979324102401733, "logps/rejected": -2.4821817874908447, "loss": 0.753, "nll_loss": 0.09049418568611145, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.26937970519065857, "rewards/margins": 0.47527486085891724, "rewards/rejected": -0.7446545362472534, "step": 3120 }, { "epoch": 0.2748446864092376, "grad_norm": 2.105536937713623, "learning_rate": 7.2481481481481485e-06, "logits/chosen": 2.3172378540039062, "logits/rejected": 2.3811116218566895, "logps/chosen": -0.33308374881744385, "logps/rejected": -1.8823559284210205, "loss": 0.6691, "nll_loss": 0.04073493555188179, "rewards/accuracies": 0.75, "rewards/chosen": -0.09992513060569763, "rewards/margins": 0.4647817015647888, "rewards/rejected": -0.5647068023681641, "step": 3130 }, { "epoch": 0.2757227844488837, "grad_norm": 5.6695556640625, "learning_rate": 7.235802469135803e-06, "logits/chosen": 2.484741687774658, "logits/rejected": 2.524019718170166, "logps/chosen": -0.6708775758743286, "logps/rejected": -2.733649730682373, "loss": 0.675, "nll_loss": 0.06934002041816711, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.20126327872276306, "rewards/margins": 0.6188317537307739, "rewards/rejected": -0.8200949430465698, "step": 3140 }, { "epoch": 0.2766008824885298, "grad_norm": 0.11841005086898804, "learning_rate": 7.223456790123457e-06, "logits/chosen": 2.3487932682037354, "logits/rejected": 2.4092326164245605, "logps/chosen": -0.5150425434112549, "logps/rejected": -1.9013131856918335, "loss": 0.7062, "nll_loss": 0.05403406545519829, "rewards/accuracies": 0.5, "rewards/chosen": -0.15451276302337646, "rewards/margins": 0.4158812463283539, "rewards/rejected": -0.570393979549408, "step": 3150 }, { "epoch": 0.277478980528176, "grad_norm": 2.9380528926849365, "learning_rate": 7.211111111111112e-06, "logits/chosen": 2.3656086921691895, "logits/rejected": 2.4282584190368652, "logps/chosen": -0.6937441825866699, "logps/rejected": -2.480012893676758, "loss": 0.701, "nll_loss": 0.07696821540594101, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.20812325179576874, "rewards/margins": 0.5358806848526001, "rewards/rejected": -0.74400395154953, "step": 3160 }, { "epoch": 0.2783570785678221, "grad_norm": 5.344911575317383, "learning_rate": 7.198765432098766e-06, "logits/chosen": 2.2785983085632324, "logits/rejected": 2.3243610858917236, "logps/chosen": -0.7472286224365234, "logps/rejected": -1.1668407917022705, "loss": 0.7779, "nll_loss": 0.07855083793401718, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.22416862845420837, "rewards/margins": 0.12588365375995636, "rewards/rejected": -0.35005226731300354, "step": 3170 }, { "epoch": 0.2792351766074682, "grad_norm": 0.625324547290802, "learning_rate": 7.18641975308642e-06, "logits/chosen": 2.2220301628112793, "logits/rejected": 2.2504611015319824, "logps/chosen": -0.7789251208305359, "logps/rejected": -2.3314409255981445, "loss": 0.7352, "nll_loss": 0.06989389657974243, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.23367755115032196, "rewards/margins": 0.4657546877861023, "rewards/rejected": -0.6994322538375854, "step": 3180 }, { "epoch": 0.28011327464711433, "grad_norm": 2.6372480392456055, "learning_rate": 7.174074074074074e-06, "logits/chosen": 2.337601900100708, "logits/rejected": 2.3855247497558594, "logps/chosen": -0.4849260747432709, "logps/rejected": -2.1613705158233643, "loss": 0.6273, "nll_loss": 0.052817367017269135, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.14547783136367798, "rewards/margins": 0.5029333829879761, "rewards/rejected": -0.6484112739562988, "step": 3190 }, { "epoch": 0.2809913726867605, "grad_norm": 2.161203145980835, "learning_rate": 7.1617283950617285e-06, "logits/chosen": 2.1061933040618896, "logits/rejected": 2.1634392738342285, "logps/chosen": -0.5152336955070496, "logps/rejected": -2.2968502044677734, "loss": 0.6311, "nll_loss": 0.04893555864691734, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.15457013249397278, "rewards/margins": 0.5344849824905396, "rewards/rejected": -0.6890550851821899, "step": 3200 } ], "logging_steps": 10, "max_steps": 9000, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }