{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.997458513978173, "eval_steps": 200, "global_step": 2508, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.02989983555090447, "grad_norm": 78.5, "learning_rate": 4.780876494023904e-07, "logits/chosen": -0.8346603512763977, "logits/rejected": -0.5625396966934204, "logps/chosen": -311.11248779296875, "logps/rejected": -290.71624755859375, "loss": 0.6974, "rewards/accuracies": 0.3199999928474426, "rewards/chosen": -0.005879516713321209, "rewards/margins": -0.0028140258509665728, "rewards/rejected": -0.003072815015912056, "step": 25 }, { "epoch": 0.05979967110180894, "grad_norm": 112.5, "learning_rate": 9.760956175298805e-07, "logits/chosen": -0.8477816581726074, "logits/rejected": -0.5839244723320007, "logps/chosen": -341.1449890136719, "logps/rejected": -303.2749938964844, "loss": 0.6939, "rewards/accuracies": 0.33500000834465027, "rewards/chosen": -0.01889648474752903, "rewards/margins": 0.0013772583333775401, "rewards/rejected": -0.020271606743335724, "step": 50 }, { "epoch": 0.08969950665271341, "grad_norm": 89.5, "learning_rate": 1.4741035856573708e-06, "logits/chosen": -0.7348077893257141, "logits/rejected": -0.419241338968277, "logps/chosen": -311.4237365722656, "logps/rejected": -284.5274963378906, "loss": 0.7, "rewards/accuracies": 0.28999999165534973, "rewards/chosen": -0.020579833537340164, "rewards/margins": -0.008827819488942623, "rewards/rejected": -0.011761474423110485, "step": 75 }, { "epoch": 0.11959934220361788, "grad_norm": 89.5, "learning_rate": 1.9721115537848607e-06, "logits/chosen": -0.9120362997055054, "logits/rejected": -0.566675066947937, "logps/chosen": -322.989990234375, "logps/rejected": -276.8037414550781, "loss": 0.6868, "rewards/accuracies": 0.3675000071525574, "rewards/chosen": -0.027477417141199112, "rewards/margins": 0.018669739365577698, "rewards/rejected": -0.04612060636281967, "step": 100 }, { "epoch": 0.14949917775452234, "grad_norm": 83.0, "learning_rate": 2.470119521912351e-06, "logits/chosen": -0.8410671353340149, "logits/rejected": -0.43034911155700684, "logps/chosen": -297.4024963378906, "logps/rejected": -304.4224853515625, "loss": 0.6832, "rewards/accuracies": 0.36000001430511475, "rewards/chosen": -0.05832824856042862, "rewards/margins": 0.02584075927734375, "rewards/rejected": -0.08419036865234375, "step": 125 }, { "epoch": 0.17939901330542682, "grad_norm": 106.5, "learning_rate": 2.968127490039841e-06, "logits/chosen": -0.9279866814613342, "logits/rejected": -0.6811022758483887, "logps/chosen": -312.67498779296875, "logps/rejected": -285.7799987792969, "loss": 0.6709, "rewards/accuracies": 0.49000000953674316, "rewards/chosen": -0.07547790557146072, "rewards/margins": 0.056133728474378586, "rewards/rejected": -0.1316046118736267, "step": 150 }, { "epoch": 0.2092988488563313, "grad_norm": 96.0, "learning_rate": 3.466135458167331e-06, "logits/chosen": -0.8703573346138, "logits/rejected": -0.5601403713226318, "logps/chosen": -323.947509765625, "logps/rejected": -292.8074951171875, "loss": 0.6696, "rewards/accuracies": 0.4950000047683716, "rewards/chosen": -0.11684814095497131, "rewards/margins": 0.06319641321897507, "rewards/rejected": -0.1800549328327179, "step": 175 }, { "epoch": 0.23919868440723577, "grad_norm": 99.0, "learning_rate": 3.9641434262948205e-06, "logits/chosen": -0.9258654713630676, "logits/rejected": -0.5686477422714233, "logps/chosen": -328.7449951171875, "logps/rejected": -316.5574951171875, "loss": 0.6579, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.1883123815059662, "rewards/margins": 0.09867187589406967, "rewards/rejected": -0.28693297505378723, "step": 200 }, { "epoch": 0.23919868440723577, "eval_logits/chosen": -0.9216321706771851, "eval_logits/rejected": -0.7277408838272095, "eval_logps/chosen": -320.7849426269531, "eval_logps/rejected": -293.8709716796875, "eval_loss": 0.6465986371040344, "eval_rewards/accuracies": 0.560387909412384, "eval_rewards/chosen": -0.19119606912136078, "eval_rewards/margins": 0.1261032223701477, "eval_rewards/rejected": -0.31729716062545776, "eval_runtime": 877.9315, "eval_samples_per_second": 1.694, "eval_steps_per_second": 0.212, "step": 200 }, { "epoch": 0.2690985199581402, "grad_norm": 87.0, "learning_rate": 4.462151394422311e-06, "logits/chosen": -0.8007558584213257, "logits/rejected": -0.505867600440979, "logps/chosen": -320.7512512207031, "logps/rejected": -311.8299865722656, "loss": 0.6444, "rewards/accuracies": 0.5649999976158142, "rewards/chosen": -0.2540551722049713, "rewards/margins": 0.14147095382213593, "rewards/rejected": -0.3954962193965912, "step": 225 }, { "epoch": 0.2989983555090447, "grad_norm": 96.5, "learning_rate": 4.960159362549802e-06, "logits/chosen": -0.9090196490287781, "logits/rejected": -0.6456773281097412, "logps/chosen": -323.7200012207031, "logps/rejected": -295.2149963378906, "loss": 0.6255, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.2805468738079071, "rewards/margins": 0.19930054247379303, "rewards/rejected": -0.47991272807121277, "step": 250 }, { "epoch": 0.32889819105994916, "grad_norm": 91.0, "learning_rate": 4.9490474080638015e-06, "logits/chosen": -0.9534767270088196, "logits/rejected": -0.6329247951507568, "logps/chosen": -319.1549987792969, "logps/rejected": -283.88751220703125, "loss": 0.6192, "rewards/accuracies": 0.5924999713897705, "rewards/chosen": -0.29086607694625854, "rewards/margins": 0.23339904844760895, "rewards/rejected": -0.5240704417228699, "step": 275 }, { "epoch": 0.35879802661085364, "grad_norm": 70.5, "learning_rate": 4.8936641559592385e-06, "logits/chosen": -0.9436456561088562, "logits/rejected": -0.7789434790611267, "logps/chosen": -349.5050048828125, "logps/rejected": -310.48748779296875, "loss": 0.627, "rewards/accuracies": 0.6349999904632568, "rewards/chosen": -0.30020782351493835, "rewards/margins": 0.23243407905101776, "rewards/rejected": -0.532727062702179, "step": 300 }, { "epoch": 0.3886978621617581, "grad_norm": 101.0, "learning_rate": 4.838280903854675e-06, "logits/chosen": -0.9607565402984619, "logits/rejected": -0.7166936993598938, "logps/chosen": -317.0874938964844, "logps/rejected": -289.0824890136719, "loss": 0.5906, "rewards/accuracies": 0.6524999737739563, "rewards/chosen": -0.4176098704338074, "rewards/margins": 0.3300067186355591, "rewards/rejected": -0.7473974823951721, "step": 325 }, { "epoch": 0.4185976977126626, "grad_norm": 94.0, "learning_rate": 4.782897651750112e-06, "logits/chosen": -0.9818115234375, "logits/rejected": -0.6833120584487915, "logps/chosen": -321.1875, "logps/rejected": -316.58624267578125, "loss": 0.577, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.4978076219558716, "rewards/margins": 0.39054566621780396, "rewards/rejected": -0.8884375095367432, "step": 350 }, { "epoch": 0.44849753326356706, "grad_norm": 83.5, "learning_rate": 4.727514399645548e-06, "logits/chosen": -1.0211011171340942, "logits/rejected": -0.7218142747879028, "logps/chosen": -307.9674987792969, "logps/rejected": -288.7850036621094, "loss": 0.5544, "rewards/accuracies": 0.6974999904632568, "rewards/chosen": -0.4097009301185608, "rewards/margins": 0.4377111792564392, "rewards/rejected": -0.8475390672683716, "step": 375 }, { "epoch": 0.47839736881447154, "grad_norm": 77.5, "learning_rate": 4.672131147540984e-06, "logits/chosen": -0.9680676460266113, "logits/rejected": -0.7582107782363892, "logps/chosen": -337.9375, "logps/rejected": -313.7749938964844, "loss": 0.5977, "rewards/accuracies": 0.6549999713897705, "rewards/chosen": -0.5489477515220642, "rewards/margins": 0.35999757051467896, "rewards/rejected": -0.9089636206626892, "step": 400 }, { "epoch": 0.47839736881447154, "eval_logits/chosen": -1.041106939315796, "eval_logits/rejected": -0.8698605895042419, "eval_logps/chosen": -323.7284851074219, "eval_logps/rejected": -299.6156005859375, "eval_loss": 0.5722406506538391, "eval_rewards/accuracies": 0.6610022783279419, "eval_rewards/chosen": -0.4932539761066437, "eval_rewards/margins": 0.40423059463500977, "eval_rewards/rejected": -0.8973480463027954, "eval_runtime": 876.344, "eval_samples_per_second": 1.697, "eval_steps_per_second": 0.212, "step": 400 }, { "epoch": 0.508297204365376, "grad_norm": 84.0, "learning_rate": 4.61674789543642e-06, "logits/chosen": -1.2390661239624023, "logits/rejected": -0.9836773872375488, "logps/chosen": -328.1875, "logps/rejected": -317.32501220703125, "loss": 0.5527, "rewards/accuracies": 0.6675000190734863, "rewards/chosen": -0.6254773139953613, "rewards/margins": 0.5287072658538818, "rewards/rejected": -1.153835415840149, "step": 425 }, { "epoch": 0.5381970399162804, "grad_norm": 93.0, "learning_rate": 4.561364643331857e-06, "logits/chosen": -1.0737494230270386, "logits/rejected": -0.8683199882507324, "logps/chosen": -316.00250244140625, "logps/rejected": -295.9649963378906, "loss": 0.5736, "rewards/accuracies": 0.6600000262260437, "rewards/chosen": -0.539447009563446, "rewards/margins": 0.46495360136032104, "rewards/rejected": -1.0048657655715942, "step": 450 }, { "epoch": 0.5680968754671849, "grad_norm": 73.5, "learning_rate": 4.505981391227293e-06, "logits/chosen": -1.052968144416809, "logits/rejected": -0.7523078322410583, "logps/chosen": -318.50250244140625, "logps/rejected": -313.8175048828125, "loss": 0.5422, "rewards/accuracies": 0.7149999737739563, "rewards/chosen": -0.5196704268455505, "rewards/margins": 0.5570727586746216, "rewards/rejected": -1.0764819383621216, "step": 475 }, { "epoch": 0.5979967110180894, "grad_norm": 70.0, "learning_rate": 4.4505981391227295e-06, "logits/chosen": -1.1461485624313354, "logits/rejected": -0.9354357719421387, "logps/chosen": -324.4750061035156, "logps/rejected": -294.0775146484375, "loss": 0.5415, "rewards/accuracies": 0.7074999809265137, "rewards/chosen": -0.518980085849762, "rewards/margins": 0.5734081864356995, "rewards/rejected": -1.092441439628601, "step": 500 }, { "epoch": 0.6278965465689939, "grad_norm": 84.0, "learning_rate": 4.395214887018166e-06, "logits/chosen": -1.091801404953003, "logits/rejected": -0.8006445169448853, "logps/chosen": -323.1724853515625, "logps/rejected": -294.4674987792969, "loss": 0.5646, "rewards/accuracies": 0.6700000166893005, "rewards/chosen": -0.672253429889679, "rewards/margins": 0.5069983005523682, "rewards/rejected": -1.1792798042297363, "step": 525 }, { "epoch": 0.6577963821198983, "grad_norm": 95.0, "learning_rate": 4.339831634913603e-06, "logits/chosen": -1.220596194267273, "logits/rejected": -0.9236291646957397, "logps/chosen": -316.7950134277344, "logps/rejected": -302.0824890136719, "loss": 0.5178, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.7468109130859375, "rewards/margins": 0.6105853319168091, "rewards/rejected": -1.3566796779632568, "step": 550 }, { "epoch": 0.6876962176708028, "grad_norm": 100.0, "learning_rate": 4.284448382809039e-06, "logits/chosen": -1.0421770811080933, "logits/rejected": -0.7285050749778748, "logps/chosen": -308.42498779296875, "logps/rejected": -269.7037353515625, "loss": 0.5448, "rewards/accuracies": 0.6850000023841858, "rewards/chosen": -0.7317401170730591, "rewards/margins": 0.5794018507003784, "rewards/rejected": -1.3115381002426147, "step": 575 }, { "epoch": 0.7175960532217073, "grad_norm": 97.5, "learning_rate": 4.229065130704476e-06, "logits/chosen": -1.1298235654830933, "logits/rejected": -0.7811802625656128, "logps/chosen": -322.0574951171875, "logps/rejected": -309.9750061035156, "loss": 0.5292, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.590954601764679, "rewards/margins": 0.6085253953933716, "rewards/rejected": -1.1989331245422363, "step": 600 }, { "epoch": 0.7175960532217073, "eval_logits/chosen": -1.078187346458435, "eval_logits/rejected": -0.9206746220588684, "eval_logps/chosen": -324.5967712402344, "eval_logps/rejected": -301.7204284667969, "eval_loss": 0.5492891669273376, "eval_rewards/accuracies": 0.6757872104644775, "eval_rewards/chosen": -0.5633505582809448, "eval_rewards/margins": 0.5408346652984619, "eval_rewards/rejected": -1.1038333177566528, "eval_runtime": 876.4047, "eval_samples_per_second": 1.697, "eval_steps_per_second": 0.212, "step": 600 }, { "epoch": 0.7474958887726117, "grad_norm": 87.5, "learning_rate": 4.173681878599912e-06, "logits/chosen": -1.1809699535369873, "logits/rejected": -0.8887664675712585, "logps/chosen": -303.6575012207031, "logps/rejected": -294.7774963378906, "loss": 0.5261, "rewards/accuracies": 0.7275000214576721, "rewards/chosen": -0.5871319770812988, "rewards/margins": 0.6293676495552063, "rewards/rejected": -1.2162939310073853, "step": 625 }, { "epoch": 0.7773957243235162, "grad_norm": 99.5, "learning_rate": 4.118298626495348e-06, "logits/chosen": -1.1009465456008911, "logits/rejected": -0.9342904686927795, "logps/chosen": -338.12750244140625, "logps/rejected": -318.96624755859375, "loss": 0.5603, "rewards/accuracies": 0.6850000023841858, "rewards/chosen": -0.714611828327179, "rewards/margins": 0.6232568621635437, "rewards/rejected": -1.3377538919448853, "step": 650 }, { "epoch": 0.8072955598744207, "grad_norm": 72.5, "learning_rate": 4.062915374390784e-06, "logits/chosen": -1.2523653507232666, "logits/rejected": -1.0046355724334717, "logps/chosen": -310.9049987792969, "logps/rejected": -297.67498779296875, "loss": 0.5135, "rewards/accuracies": 0.7099999785423279, "rewards/chosen": -0.7437072992324829, "rewards/margins": 0.6859521269798279, "rewards/rejected": -1.4290771484375, "step": 675 }, { "epoch": 0.8371953954253252, "grad_norm": 89.0, "learning_rate": 4.007532122286221e-06, "logits/chosen": -1.2401965856552124, "logits/rejected": -0.8460285663604736, "logps/chosen": -336.927490234375, "logps/rejected": -318.7799987792969, "loss": 0.5186, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.7741259932518005, "rewards/margins": 0.7083032131195068, "rewards/rejected": -1.4823095798492432, "step": 700 }, { "epoch": 0.8670952309762296, "grad_norm": 78.0, "learning_rate": 3.9521488701816575e-06, "logits/chosen": -1.1703033447265625, "logits/rejected": -0.9548498392105103, "logps/chosen": -287.87249755859375, "logps/rejected": -300.864990234375, "loss": 0.5476, "rewards/accuracies": 0.6825000047683716, "rewards/chosen": -0.8389843702316284, "rewards/margins": 0.608197033405304, "rewards/rejected": -1.447534203529358, "step": 725 }, { "epoch": 0.8969950665271341, "grad_norm": 100.5, "learning_rate": 3.896765618077094e-06, "logits/chosen": -1.1477763652801514, "logits/rejected": -0.9038227796554565, "logps/chosen": -338.31500244140625, "logps/rejected": -319.9649963378906, "loss": 0.5148, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.8131677508354187, "rewards/margins": 0.7464379668235779, "rewards/rejected": -1.559140682220459, "step": 750 }, { "epoch": 0.9268949020780386, "grad_norm": 92.0, "learning_rate": 3.84138236597253e-06, "logits/chosen": -1.2342950105667114, "logits/rejected": -0.946718156337738, "logps/chosen": -331.1512451171875, "logps/rejected": -304.0249938964844, "loss": 0.528, "rewards/accuracies": 0.7149999737739563, "rewards/chosen": -0.9154602289199829, "rewards/margins": 0.6957080364227295, "rewards/rejected": -1.6108520030975342, "step": 775 }, { "epoch": 0.9567947376289431, "grad_norm": 102.0, "learning_rate": 3.7859991138679664e-06, "logits/chosen": -1.0906939506530762, "logits/rejected": -0.9649511575698853, "logps/chosen": -338.5637512207031, "logps/rejected": -338.4674987792969, "loss": 0.5151, "rewards/accuracies": 0.7200000286102295, "rewards/chosen": -0.859545886516571, "rewards/margins": 0.7704944014549255, "rewards/rejected": -1.630163550376892, "step": 800 }, { "epoch": 0.9567947376289431, "eval_logits/chosen": -1.1360965967178345, "eval_logits/rejected": -0.9822049736976624, "eval_logps/chosen": -326.69891357421875, "eval_logps/rejected": -305.0, "eval_loss": 0.5390191674232483, "eval_rewards/accuracies": 0.687980055809021, "eval_rewards/chosen": -0.7810032367706299, "eval_rewards/margins": 0.6442182064056396, "eval_rewards/rejected": -1.4252588748931885, "eval_runtime": 876.4063, "eval_samples_per_second": 1.697, "eval_steps_per_second": 0.212, "step": 800 }, { "epoch": 0.9866945731798475, "grad_norm": 84.5, "learning_rate": 3.730615861763403e-06, "logits/chosen": -1.2244549989700317, "logits/rejected": NaN, "logps/chosen": -334.5425109863281, "logps/rejected": -339.23748779296875, "loss": 0.5275, "rewards/accuracies": 0.7149999737739563, "rewards/chosen": -0.8379321098327637, "rewards/margins": 0.715624988079071, "rewards/rejected": -1.554010033607483, "step": 825 }, { "epoch": 1.0155479144864703, "grad_norm": 57.25, "learning_rate": 3.675232609658839e-06, "logits/chosen": -1.2397924661636353, "logits/rejected": -1.030158281326294, "logps/chosen": -320.9093322753906, "logps/rejected": -305.8393859863281, "loss": 0.4669, "rewards/accuracies": 0.7487046718597412, "rewards/chosen": -0.7694060206413269, "rewards/margins": 0.8478080630302429, "rewards/rejected": -1.6172634363174438, "step": 850 }, { "epoch": 1.045447750037375, "grad_norm": 67.5, "learning_rate": 3.6198493575542758e-06, "logits/chosen": -1.2220094203948975, "logits/rejected": -0.9582018852233887, "logps/chosen": -318.0262451171875, "logps/rejected": -297.5799865722656, "loss": 0.4691, "rewards/accuracies": 0.7724999785423279, "rewards/chosen": -0.7301892042160034, "rewards/margins": 0.9199609160423279, "rewards/rejected": -1.6502331495285034, "step": 875 }, { "epoch": 1.0753475855882793, "grad_norm": 73.5, "learning_rate": 3.564466105449712e-06, "logits/chosen": -1.089396357536316, "logits/rejected": -0.8958370685577393, "logps/chosen": -317.61749267578125, "logps/rejected": -295.4825134277344, "loss": 0.4746, "rewards/accuracies": 0.7574999928474426, "rewards/chosen": -0.8305737376213074, "rewards/margins": 0.8526538014411926, "rewards/rejected": -1.6829102039337158, "step": 900 }, { "epoch": 1.1052474211391838, "grad_norm": 64.5, "learning_rate": 3.509082853345149e-06, "logits/chosen": -1.1403405666351318, "logits/rejected": -0.8662219047546387, "logps/chosen": -322.0574951171875, "logps/rejected": -323.2074890136719, "loss": 0.4641, "rewards/accuracies": 0.7649999856948853, "rewards/chosen": -0.6764746308326721, "rewards/margins": 0.8836804032325745, "rewards/rejected": -1.5600537061691284, "step": 925 }, { "epoch": 1.1351472566900882, "grad_norm": 66.0, "learning_rate": 3.453699601240585e-06, "logits/chosen": -1.2375200986862183, "logits/rejected": -0.9549773931503296, "logps/chosen": -321.0874938964844, "logps/rejected": -306.6000061035156, "loss": 0.4201, "rewards/accuracies": 0.8224999904632568, "rewards/chosen": -0.7068628072738647, "rewards/margins": 1.0075805187225342, "rewards/rejected": -1.7146776914596558, "step": 950 }, { "epoch": 1.1650470922409926, "grad_norm": 64.0, "learning_rate": 3.3983163491360217e-06, "logits/chosen": -1.1668496131896973, "logits/rejected": -0.8835460543632507, "logps/chosen": -320.69000244140625, "logps/rejected": -323.0425109863281, "loss": 0.459, "rewards/accuracies": 0.7825000286102295, "rewards/chosen": -0.7173047065734863, "rewards/margins": 0.9243432879447937, "rewards/rejected": -1.6417040824890137, "step": 975 }, { "epoch": 1.1949469277918972, "grad_norm": 62.75, "learning_rate": 3.342933097031458e-06, "logits/chosen": -1.2166632413864136, "logits/rejected": -0.9624554514884949, "logps/chosen": -301.0849914550781, "logps/rejected": -304.3475036621094, "loss": 0.4656, "rewards/accuracies": 0.7850000262260437, "rewards/chosen": -0.7919347882270813, "rewards/margins": 0.9388867020606995, "rewards/rejected": -1.73046875, "step": 1000 }, { "epoch": 1.1949469277918972, "eval_logits/chosen": -1.160080075263977, "eval_logits/rejected": -1.0079379081726074, "eval_logps/chosen": -326.43280029296875, "eval_logps/rejected": -305.1102294921875, "eval_loss": 0.527574896812439, "eval_rewards/accuracies": 0.6892281174659729, "eval_rewards/chosen": -0.7565616369247437, "eval_rewards/margins": 0.6851438879966736, "eval_rewards/rejected": -1.4416320323944092, "eval_runtime": 876.3772, "eval_samples_per_second": 1.697, "eval_steps_per_second": 0.212, "step": 1000 }, { "epoch": 1.2248467633428017, "grad_norm": 84.0, "learning_rate": 3.2875498449268944e-06, "logits/chosen": -1.1776912212371826, "logits/rejected": -1.050445556640625, "logps/chosen": -343.0050048828125, "logps/rejected": -331.1875, "loss": 0.4213, "rewards/accuracies": 0.8050000071525574, "rewards/chosen": -0.6588146686553955, "rewards/margins": 1.0112402439117432, "rewards/rejected": -1.670253872871399, "step": 1025 }, { "epoch": 1.254746598893706, "grad_norm": 66.0, "learning_rate": 3.2321665928223306e-06, "logits/chosen": -1.2721245288848877, "logits/rejected": -0.9186769127845764, "logps/chosen": -316.4549865722656, "logps/rejected": -315.2925109863281, "loss": 0.4838, "rewards/accuracies": 0.7825000286102295, "rewards/chosen": -0.8342553973197937, "rewards/margins": 0.83197021484375, "rewards/rejected": -1.665708065032959, "step": 1050 }, { "epoch": 1.2846464344446105, "grad_norm": 62.75, "learning_rate": 3.176783340717767e-06, "logits/chosen": -1.1176886558532715, "logits/rejected": -0.9960334300994873, "logps/chosen": -328.32501220703125, "logps/rejected": -328.3450012207031, "loss": 0.4538, "rewards/accuracies": 0.7850000262260437, "rewards/chosen": -0.7273278832435608, "rewards/margins": 0.9573754668235779, "rewards/rejected": -1.684999942779541, "step": 1075 }, { "epoch": 1.314546269995515, "grad_norm": 84.5, "learning_rate": 3.1214000886132033e-06, "logits/chosen": -1.1655590534210205, "logits/rejected": -0.8922329545021057, "logps/chosen": -314.9700012207031, "logps/rejected": -301.5050048828125, "loss": 0.4483, "rewards/accuracies": 0.7850000262260437, "rewards/chosen": -0.6278771758079529, "rewards/margins": 0.9427502155303955, "rewards/rejected": -1.5707299709320068, "step": 1100 }, { "epoch": 1.3444461055464194, "grad_norm": 69.5, "learning_rate": 3.06601683650864e-06, "logits/chosen": -1.2217812538146973, "logits/rejected": -0.976731538772583, "logps/chosen": -324.7850036621094, "logps/rejected": -316.4599914550781, "loss": 0.4368, "rewards/accuracies": 0.8149999976158142, "rewards/chosen": -0.7704944014549255, "rewards/margins": 0.9598730206489563, "rewards/rejected": -1.7300487756729126, "step": 1125 }, { "epoch": 1.374345941097324, "grad_norm": 81.0, "learning_rate": 3.010633584404076e-06, "logits/chosen": -1.203802466392517, "logits/rejected": -0.9061872959136963, "logps/chosen": -330.4175109863281, "logps/rejected": -312.9987487792969, "loss": 0.4787, "rewards/accuracies": 0.75, "rewards/chosen": -0.7830480933189392, "rewards/margins": 0.9129126071929932, "rewards/rejected": -1.6956127882003784, "step": 1150 }, { "epoch": 1.4042457766482284, "grad_norm": 118.0, "learning_rate": 2.955250332299513e-06, "logits/chosen": -1.1928298473358154, "logits/rejected": -0.8999917507171631, "logps/chosen": -320.2650146484375, "logps/rejected": -301.5299987792969, "loss": 0.4698, "rewards/accuracies": 0.7549999952316284, "rewards/chosen": -0.8731860518455505, "rewards/margins": 0.9074377417564392, "rewards/rejected": -1.7800854444503784, "step": 1175 }, { "epoch": 1.434145612199133, "grad_norm": 65.0, "learning_rate": 2.8998670801949493e-06, "logits/chosen": -1.1984894275665283, "logits/rejected": -0.9353277683258057, "logps/chosen": -317.625, "logps/rejected": -325.4075012207031, "loss": 0.4502, "rewards/accuracies": 0.7674999833106995, "rewards/chosen": -0.9375879168510437, "rewards/margins": 0.9699438214302063, "rewards/rejected": -1.9072656631469727, "step": 1200 }, { "epoch": 1.434145612199133, "eval_logits/chosen": -1.156473159790039, "eval_logits/rejected": -1.006028413772583, "eval_logps/chosen": -327.82794189453125, "eval_logps/rejected": -306.8521423339844, "eval_loss": 0.5231196284294128, "eval_rewards/accuracies": 0.6926843523979187, "eval_rewards/chosen": -0.8996713161468506, "eval_rewards/margins": 0.7130159735679626, "eval_rewards/rejected": -1.6129347085952759, "eval_runtime": 876.3506, "eval_samples_per_second": 1.697, "eval_steps_per_second": 0.212, "step": 1200 }, { "epoch": 1.4640454477500373, "grad_norm": 99.5, "learning_rate": 2.844483828090386e-06, "logits/chosen": -1.339633822441101, "logits/rejected": -1.035129427909851, "logps/chosen": -332.54998779296875, "logps/rejected": -319.13751220703125, "loss": 0.4421, "rewards/accuracies": 0.7799999713897705, "rewards/chosen": -0.8549670577049255, "rewards/margins": 1.0162646770477295, "rewards/rejected": -1.8712304830551147, "step": 1225 }, { "epoch": 1.493945283300942, "grad_norm": 83.5, "learning_rate": 2.789100575985822e-06, "logits/chosen": -1.1476205587387085, "logits/rejected": -0.9250108599662781, "logps/chosen": -322.0050048828125, "logps/rejected": -309.3500061035156, "loss": 0.4555, "rewards/accuracies": 0.7549999952316284, "rewards/chosen": -0.8130224347114563, "rewards/margins": 0.9434008598327637, "rewards/rejected": -1.7563867568969727, "step": 1250 }, { "epoch": 1.5238451188518463, "grad_norm": 63.75, "learning_rate": 2.7337173238812586e-06, "logits/chosen": -1.2015457153320312, "logits/rejected": -0.8530246019363403, "logps/chosen": -309.01251220703125, "logps/rejected": -297.7825012207031, "loss": 0.4501, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.836810290813446, "rewards/margins": 0.9292749166488647, "rewards/rejected": -1.7654907703399658, "step": 1275 }, { "epoch": 1.5537449544027506, "grad_norm": 67.0, "learning_rate": 2.6783340717766948e-06, "logits/chosen": -1.2457306385040283, "logits/rejected": -1.0591107606887817, "logps/chosen": -337.9775085449219, "logps/rejected": -308.5375061035156, "loss": 0.4248, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.7735278606414795, "rewards/margins": 1.035646915435791, "rewards/rejected": -1.8087304830551147, "step": 1300 }, { "epoch": 1.5836447899536552, "grad_norm": 51.0, "learning_rate": 2.6229508196721314e-06, "logits/chosen": -1.216982126235962, "logits/rejected": -0.8925817608833313, "logps/chosen": -333.2349853515625, "logps/rejected": -316.62249755859375, "loss": 0.4568, "rewards/accuracies": 0.7850000262260437, "rewards/chosen": -0.8274877667427063, "rewards/margins": 0.9530566334724426, "rewards/rejected": -1.7805664539337158, "step": 1325 }, { "epoch": 1.6135446255045598, "grad_norm": 82.0, "learning_rate": 2.5675675675675675e-06, "logits/chosen": -1.3132140636444092, "logits/rejected": -1.004296898841858, "logps/chosen": -342.4949951171875, "logps/rejected": -317.69500732421875, "loss": 0.429, "rewards/accuracies": 0.8050000071525574, "rewards/chosen": -0.9008423089981079, "rewards/margins": 1.0281542539596558, "rewards/rejected": -1.9285448789596558, "step": 1350 }, { "epoch": 1.6434444610554642, "grad_norm": 116.5, "learning_rate": 2.5121843154630045e-06, "logits/chosen": -1.1408294439315796, "logits/rejected": -0.9321377277374268, "logps/chosen": -335.291259765625, "logps/rejected": -321.29376220703125, "loss": 0.453, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.8236993551254272, "rewards/margins": 0.9510498046875, "rewards/rejected": -1.77447509765625, "step": 1375 }, { "epoch": 1.6733442966063685, "grad_norm": 91.0, "learning_rate": 2.4568010633584403e-06, "logits/chosen": -1.1858936548233032, "logits/rejected": -0.9579010009765625, "logps/chosen": -320.9949951171875, "logps/rejected": -296.3374938964844, "loss": 0.4699, "rewards/accuracies": 0.7425000071525574, "rewards/chosen": -0.8678625226020813, "rewards/margins": 0.9215136766433716, "rewards/rejected": -1.7896509170532227, "step": 1400 }, { "epoch": 1.6733442966063685, "eval_logits/chosen": -1.1674253940582275, "eval_logits/rejected": -1.0171688795089722, "eval_logps/chosen": -327.3978576660156, "eval_logps/rejected": -306.6209716796875, "eval_loss": 0.5191056728363037, "eval_rewards/accuracies": 0.6933563947677612, "eval_rewards/chosen": -0.8476693630218506, "eval_rewards/margins": 0.7431673407554626, "eval_rewards/rejected": -1.5906811952590942, "eval_runtime": 876.3262, "eval_samples_per_second": 1.697, "eval_steps_per_second": 0.212, "step": 1400 }, { "epoch": 1.703244132157273, "grad_norm": 82.0, "learning_rate": 2.401417811253877e-06, "logits/chosen": -1.1833282709121704, "logits/rejected": -0.9263910055160522, "logps/chosen": -324.5150146484375, "logps/rejected": -316.1650085449219, "loss": 0.451, "rewards/accuracies": 0.7799999713897705, "rewards/chosen": -0.8199084401130676, "rewards/margins": 0.9980810284614563, "rewards/rejected": -1.8175097703933716, "step": 1425 }, { "epoch": 1.7331439677081777, "grad_norm": 99.0, "learning_rate": 2.3460345591493135e-06, "logits/chosen": -1.1936352252960205, "logits/rejected": -1.0041576623916626, "logps/chosen": -350.885009765625, "logps/rejected": -327.0450134277344, "loss": 0.4702, "rewards/accuracies": 0.75, "rewards/chosen": -0.9122155904769897, "rewards/margins": 0.9335852265357971, "rewards/rejected": -1.8462109565734863, "step": 1450 }, { "epoch": 1.763043803259082, "grad_norm": 59.5, "learning_rate": 2.2906513070447496e-06, "logits/chosen": -1.3379946947097778, "logits/rejected": -1.0853075981140137, "logps/chosen": -299.1099853515625, "logps/rejected": -299.9725036621094, "loss": 0.4607, "rewards/accuracies": 0.7850000262260437, "rewards/chosen": -0.905989408493042, "rewards/margins": 1.0363476276397705, "rewards/rejected": -1.942041039466858, "step": 1475 }, { "epoch": 1.7929436388099864, "grad_norm": 102.0, "learning_rate": 2.235268054940186e-06, "logits/chosen": -1.1545830965042114, "logits/rejected": -0.8675525188446045, "logps/chosen": -321.79998779296875, "logps/rejected": -300.4262390136719, "loss": 0.4854, "rewards/accuracies": 0.7425000071525574, "rewards/chosen": -0.8690832257270813, "rewards/margins": 0.9056127667427063, "rewards/rejected": -1.7749096155166626, "step": 1500 }, { "epoch": 1.822843474360891, "grad_norm": 60.0, "learning_rate": 2.179884802835623e-06, "logits/chosen": -1.2606717348098755, "logits/rejected": -1.0567920207977295, "logps/chosen": -328.82501220703125, "logps/rejected": -304.1050109863281, "loss": 0.4552, "rewards/accuracies": 0.7850000262260437, "rewards/chosen": -0.743670642375946, "rewards/margins": 1.0134960412979126, "rewards/rejected": -1.7573193311691284, "step": 1525 }, { "epoch": 1.8527433099117956, "grad_norm": 59.5, "learning_rate": 2.124501550731059e-06, "logits/chosen": -1.2121707201004028, "logits/rejected": -1.002629041671753, "logps/chosen": -323.5950012207031, "logps/rejected": -317.5299987792969, "loss": 0.4645, "rewards/accuracies": 0.7674999833106995, "rewards/chosen": -0.9758337140083313, "rewards/margins": 0.9835278391838074, "rewards/rejected": -1.959287166595459, "step": 1550 }, { "epoch": 1.8826431454627, "grad_norm": 71.0, "learning_rate": 2.0691182986264955e-06, "logits/chosen": -1.296298861503601, "logits/rejected": NaN, "logps/chosen": -325.7699890136719, "logps/rejected": -299.322509765625, "loss": 0.4515, "rewards/accuracies": 0.7599999904632568, "rewards/chosen": -0.8331592082977295, "rewards/margins": 0.9821679592132568, "rewards/rejected": -1.8158252239227295, "step": 1575 }, { "epoch": 1.9125429810136043, "grad_norm": 70.0, "learning_rate": 2.0137350465219317e-06, "logits/chosen": -1.2260925769805908, "logits/rejected": -0.9426334500312805, "logps/chosen": -330.06500244140625, "logps/rejected": -309.68499755859375, "loss": 0.4436, "rewards/accuracies": 0.7649999856948853, "rewards/chosen": -0.830242931842804, "rewards/margins": 0.9743407964706421, "rewards/rejected": -1.804931640625, "step": 1600 }, { "epoch": 1.9125429810136043, "eval_logits/chosen": -1.1829742193222046, "eval_logits/rejected": -1.033914566040039, "eval_logps/chosen": -327.43011474609375, "eval_logps/rejected": -306.69085693359375, "eval_loss": 0.5206477046012878, "eval_rewards/accuracies": 0.6974846720695496, "eval_rewards/chosen": -0.8544062376022339, "eval_rewards/margins": 0.7440763115882874, "eval_rewards/rejected": -1.598265290260315, "eval_runtime": 876.3416, "eval_samples_per_second": 1.697, "eval_steps_per_second": 0.212, "step": 1600 }, { "epoch": 1.942442816564509, "grad_norm": 73.5, "learning_rate": 1.9583517944173683e-06, "logits/chosen": -1.246303677558899, "logits/rejected": -0.9357275366783142, "logps/chosen": -332.3599853515625, "logps/rejected": -309.1700134277344, "loss": 0.4702, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.8381909132003784, "rewards/margins": 0.9997217059135437, "rewards/rejected": -1.837497591972351, "step": 1625 }, { "epoch": 1.9723426521154135, "grad_norm": 68.5, "learning_rate": 1.9029685423128047e-06, "logits/chosen": -1.2618129253387451, "logits/rejected": -1.0779250860214233, "logps/chosen": -339.9324951171875, "logps/rejected": -318.04998779296875, "loss": 0.4583, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.8390514850616455, "rewards/margins": 1.0396826267242432, "rewards/rejected": -1.878564476966858, "step": 1650 }, { "epoch": 2.0011959934220362, "grad_norm": 97.0, "learning_rate": 1.847585290208241e-06, "logits/chosen": -1.2342288494110107, "logits/rejected": -0.9683116674423218, "logps/chosen": -332.2409362792969, "logps/rejected": -321.0531005859375, "loss": 0.424, "rewards/accuracies": 0.7642487287521362, "rewards/chosen": -0.7630558013916016, "rewards/margins": 1.0779491662979126, "rewards/rejected": -1.8409063816070557, "step": 1675 }, { "epoch": 2.0310958289729406, "grad_norm": 76.0, "learning_rate": 1.7922020381036776e-06, "logits/chosen": -1.318371295928955, "logits/rejected": -1.0083489418029785, "logps/chosen": -327.114990234375, "logps/rejected": -336.697509765625, "loss": 0.3965, "rewards/accuracies": 0.8475000262260437, "rewards/chosen": -0.7496582269668579, "rewards/margins": 1.0661474466323853, "rewards/rejected": -1.8159960508346558, "step": 1700 }, { "epoch": 2.060995664523845, "grad_norm": 102.5, "learning_rate": 1.736818785999114e-06, "logits/chosen": -1.2396435737609863, "logits/rejected": -0.9828730225563049, "logps/chosen": -332.7074890136719, "logps/rejected": -333.37249755859375, "loss": 0.4101, "rewards/accuracies": 0.8149999976158142, "rewards/chosen": -0.7449682354927063, "rewards/margins": 1.1290674209594727, "rewards/rejected": -1.8738598823547363, "step": 1725 }, { "epoch": 2.09089550007475, "grad_norm": 62.25, "learning_rate": 1.6814355338945504e-06, "logits/chosen": -1.2273823022842407, "logits/rejected": -0.88829505443573, "logps/chosen": -322.93499755859375, "logps/rejected": -300.385009765625, "loss": 0.4221, "rewards/accuracies": 0.8050000071525574, "rewards/chosen": -0.903369128704071, "rewards/margins": 1.0416357517242432, "rewards/rejected": -1.9447948932647705, "step": 1750 }, { "epoch": 2.120795335625654, "grad_norm": 86.5, "learning_rate": 1.6260522817899868e-06, "logits/chosen": -1.2524548768997192, "logits/rejected": -1.0671484470367432, "logps/chosen": -333.92999267578125, "logps/rejected": -318.6400146484375, "loss": 0.4119, "rewards/accuracies": 0.8149999976158142, "rewards/chosen": -0.7944982647895813, "rewards/margins": 1.1625818014144897, "rewards/rejected": -1.9566112756729126, "step": 1775 }, { "epoch": 2.1506951711765585, "grad_norm": 90.0, "learning_rate": 1.5706690296854231e-06, "logits/chosen": -1.2237915992736816, "logits/rejected": -0.956585705280304, "logps/chosen": -320.30999755859375, "logps/rejected": -302.2674865722656, "loss": 0.4528, "rewards/accuracies": 0.7674999833106995, "rewards/chosen": -0.9091894626617432, "rewards/margins": 1.0250316858291626, "rewards/rejected": -1.9344677925109863, "step": 1800 }, { "epoch": 2.1506951711765585, "eval_logits/chosen": -1.191327452659607, "eval_logits/rejected": -1.0433924198150635, "eval_logps/chosen": -327.741943359375, "eval_logps/rejected": -307.1559143066406, "eval_loss": 0.5188325047492981, "eval_rewards/accuracies": 0.6941244602203369, "eval_rewards/chosen": -0.8884723782539368, "eval_rewards/margins": 0.7567348480224609, "eval_rewards/rejected": -1.6454237699508667, "eval_runtime": 876.3236, "eval_samples_per_second": 1.697, "eval_steps_per_second": 0.212, "step": 1800 }, { "epoch": 2.180595006727463, "grad_norm": 74.5, "learning_rate": 1.5152857775808597e-06, "logits/chosen": -1.2849377393722534, "logits/rejected": -0.9589782953262329, "logps/chosen": -321.9987487792969, "logps/rejected": -307.2149963378906, "loss": 0.4031, "rewards/accuracies": 0.8349999785423279, "rewards/chosen": -0.7700170874595642, "rewards/margins": 1.1218103170394897, "rewards/rejected": -1.8917040824890137, "step": 1825 }, { "epoch": 2.2104948422783677, "grad_norm": 73.5, "learning_rate": 1.459902525476296e-06, "logits/chosen": -1.136842131614685, "logits/rejected": -0.9383144974708557, "logps/chosen": -319.8525085449219, "logps/rejected": -333.6600036621094, "loss": 0.424, "rewards/accuracies": 0.8075000047683716, "rewards/chosen": -0.8708154559135437, "rewards/margins": 1.0324267148971558, "rewards/rejected": -1.903378963470459, "step": 1850 }, { "epoch": 2.240394677829272, "grad_norm": 72.5, "learning_rate": 1.4045192733717325e-06, "logits/chosen": -1.1802786588668823, "logits/rejected": -0.9680548310279846, "logps/chosen": -317.48748779296875, "logps/rejected": -299.19000244140625, "loss": 0.4262, "rewards/accuracies": 0.8274999856948853, "rewards/chosen": -0.8513085842132568, "rewards/margins": 1.0704809427261353, "rewards/rejected": -1.9216357469558716, "step": 1875 }, { "epoch": 2.2702945133801764, "grad_norm": 84.0, "learning_rate": 1.3491360212671688e-06, "logits/chosen": -1.2559946775436401, "logits/rejected": -0.9639026522636414, "logps/chosen": -336.9750061035156, "logps/rejected": -323.49249267578125, "loss": 0.4294, "rewards/accuracies": 0.8025000095367432, "rewards/chosen": -0.8724609613418579, "rewards/margins": 1.0881787538528442, "rewards/rejected": -1.960756778717041, "step": 1900 }, { "epoch": 2.3001943489310808, "grad_norm": 71.0, "learning_rate": 1.2937527691626054e-06, "logits/chosen": -1.3266677856445312, "logits/rejected": -1.0626074075698853, "logps/chosen": -305.86749267578125, "logps/rejected": -291.93499755859375, "loss": 0.4471, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.9192346334457397, "rewards/margins": 1.0141992568969727, "rewards/rejected": -1.9337549209594727, "step": 1925 }, { "epoch": 2.330094184481985, "grad_norm": 109.5, "learning_rate": 1.2383695170580418e-06, "logits/chosen": -1.1726070642471313, "logits/rejected": -1.0060466527938843, "logps/chosen": -309.7799987792969, "logps/rejected": -311.13751220703125, "loss": 0.4333, "rewards/accuracies": 0.7724999785423279, "rewards/chosen": -0.8455395698547363, "rewards/margins": 1.0642285346984863, "rewards/rejected": -1.9100537300109863, "step": 1950 }, { "epoch": 2.35999402003289, "grad_norm": 43.0, "learning_rate": 1.1829862649534782e-06, "logits/chosen": -1.189868450164795, "logits/rejected": -1.0110809803009033, "logps/chosen": -343.5849914550781, "logps/rejected": -329.1675109863281, "loss": 0.4071, "rewards/accuracies": 0.8224999904632568, "rewards/chosen": -0.8902783393859863, "rewards/margins": 1.0464379787445068, "rewards/rejected": -1.9371508359909058, "step": 1975 }, { "epoch": 2.3898938555837943, "grad_norm": 86.5, "learning_rate": 1.1276030128489146e-06, "logits/chosen": -1.3213348388671875, "logits/rejected": -1.0948954820632935, "logps/chosen": -331.0174865722656, "logps/rejected": -307.2900085449219, "loss": 0.4075, "rewards/accuracies": 0.8349999785423279, "rewards/chosen": -0.8052575588226318, "rewards/margins": 1.1002050638198853, "rewards/rejected": -1.9058740139007568, "step": 2000 }, { "epoch": 2.3898938555837943, "eval_logits/chosen": -1.1904795169830322, "eval_logits/rejected": -1.042686104774475, "eval_logps/chosen": -327.67205810546875, "eval_logps/rejected": -307.0806579589844, "eval_loss": 0.5186262726783752, "eval_rewards/accuracies": 0.6967166662216187, "eval_rewards/chosen": -0.8813358545303345, "eval_rewards/margins": 0.7553303837776184, "eval_rewards/rejected": -1.6366767883300781, "eval_runtime": 876.3711, "eval_samples_per_second": 1.697, "eval_steps_per_second": 0.212, "step": 2000 }, { "epoch": 2.4197936911346987, "grad_norm": 67.0, "learning_rate": 1.072219760744351e-06, "logits/chosen": -1.2627320289611816, "logits/rejected": -1.0026310682296753, "logps/chosen": -335.5675048828125, "logps/rejected": -301.01251220703125, "loss": 0.4202, "rewards/accuracies": 0.7774999737739563, "rewards/chosen": -0.8969201445579529, "rewards/margins": 1.085205078125, "rewards/rejected": -1.9821679592132568, "step": 2025 }, { "epoch": 2.4496935266856035, "grad_norm": 86.0, "learning_rate": 1.0168365086397875e-06, "logits/chosen": -1.2463324069976807, "logits/rejected": -0.9855798482894897, "logps/chosen": -332.5849914550781, "logps/rejected": -324.9624938964844, "loss": 0.4193, "rewards/accuracies": 0.7925000190734863, "rewards/chosen": -0.8326050043106079, "rewards/margins": 1.0910131931304932, "rewards/rejected": -1.9229882955551147, "step": 2050 }, { "epoch": 2.479593362236508, "grad_norm": 53.75, "learning_rate": 9.61453256535224e-07, "logits/chosen": -1.2372454404830933, "logits/rejected": -0.9461462497711182, "logps/chosen": -328.4750061035156, "logps/rejected": -300.5224914550781, "loss": 0.4611, "rewards/accuracies": 0.7524999976158142, "rewards/chosen": -0.8591150045394897, "rewards/margins": 0.9913061261177063, "rewards/rejected": -1.8506054878234863, "step": 2075 }, { "epoch": 2.509493197787412, "grad_norm": 68.0, "learning_rate": 9.060700044306603e-07, "logits/chosen": -1.2847473621368408, "logits/rejected": -1.0720292329788208, "logps/chosen": -337.26251220703125, "logps/rejected": -307.17498779296875, "loss": 0.4101, "rewards/accuracies": 0.7799999713897705, "rewards/chosen": -0.8909338116645813, "rewards/margins": 1.1306884288787842, "rewards/rejected": -2.021728515625, "step": 2100 }, { "epoch": 2.5393930333383166, "grad_norm": 101.0, "learning_rate": 8.506867523260968e-07, "logits/chosen": -1.1994116306304932, "logits/rejected": -0.9730746746063232, "logps/chosen": -338.3999938964844, "logps/rejected": -304.99749755859375, "loss": 0.4387, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.7841222882270813, "rewards/margins": 1.0449267625808716, "rewards/rejected": -1.829746127128601, "step": 2125 }, { "epoch": 2.569292868889221, "grad_norm": 68.5, "learning_rate": 7.953035002215331e-07, "logits/chosen": -1.3298254013061523, "logits/rejected": -1.118627667427063, "logps/chosen": -309.739990234375, "logps/rejected": -308.24749755859375, "loss": 0.4449, "rewards/accuracies": 0.7774999737739563, "rewards/chosen": -0.8520336747169495, "rewards/margins": 0.9700658917427063, "rewards/rejected": -1.8218945264816284, "step": 2150 }, { "epoch": 2.5991927044401257, "grad_norm": 70.5, "learning_rate": 7.399202481169695e-07, "logits/chosen": -1.1831958293914795, "logits/rejected": NaN, "logps/chosen": -327.49249267578125, "logps/rejected": -289.5924987792969, "loss": 0.4473, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.8408032059669495, "rewards/margins": 0.9420214891433716, "rewards/rejected": -1.7829101085662842, "step": 2175 }, { "epoch": 2.62909253999103, "grad_norm": 54.0, "learning_rate": 6.845369960124059e-07, "logits/chosen": -1.2656641006469727, "logits/rejected": -0.9782373309135437, "logps/chosen": -324.4200134277344, "logps/rejected": -290.0675048828125, "loss": 0.4419, "rewards/accuracies": 0.7825000286102295, "rewards/chosen": -0.9666149616241455, "rewards/margins": 1.0030114650726318, "rewards/rejected": -1.9694628715515137, "step": 2200 }, { "epoch": 2.62909253999103, "eval_logits/chosen": -1.1868830919265747, "eval_logits/rejected": -1.0399714708328247, "eval_logps/chosen": -327.6585998535156, "eval_logps/rejected": -306.9704284667969, "eval_loss": 0.5178263783454895, "eval_rewards/accuracies": 0.6993087530136108, "eval_rewards/chosen": -0.8778404593467712, "eval_rewards/margins": 0.7548588514328003, "eval_rewards/rejected": -1.6324502229690552, "eval_runtime": 876.3727, "eval_samples_per_second": 1.697, "eval_steps_per_second": 0.212, "step": 2200 }, { "epoch": 2.6589923755419345, "grad_norm": 67.5, "learning_rate": 6.291537439078423e-07, "logits/chosen": -1.2253618240356445, "logits/rejected": -1.0349105596542358, "logps/chosen": -336.12249755859375, "logps/rejected": -311.8275146484375, "loss": 0.4574, "rewards/accuracies": 0.7724999785423279, "rewards/chosen": -0.8752642869949341, "rewards/margins": 0.9961340427398682, "rewards/rejected": -1.8713818788528442, "step": 2225 }, { "epoch": 2.688892211092839, "grad_norm": 100.0, "learning_rate": 5.737704918032787e-07, "logits/chosen": -1.2597771883010864, "logits/rejected": -0.9909564256668091, "logps/chosen": -326.6600036621094, "logps/rejected": -316.19000244140625, "loss": 0.4751, "rewards/accuracies": 0.7674999833106995, "rewards/chosen": -0.9248193502426147, "rewards/margins": 0.9592040777206421, "rewards/rejected": -1.8837096691131592, "step": 2250 }, { "epoch": 2.7187920466437436, "grad_norm": 76.0, "learning_rate": 5.183872396987152e-07, "logits/chosen": -1.2072705030441284, "logits/rejected": -0.9592925906181335, "logps/chosen": -322.36248779296875, "logps/rejected": -315.8374938964844, "loss": 0.391, "rewards/accuracies": 0.8274999856948853, "rewards/chosen": -0.7576141357421875, "rewards/margins": 1.160730004310608, "rewards/rejected": -1.9182031154632568, "step": 2275 }, { "epoch": 2.748691882194648, "grad_norm": 53.0, "learning_rate": 4.630039875941516e-07, "logits/chosen": -1.287199854850769, "logits/rejected": -0.9606054425239563, "logps/chosen": -344.7650146484375, "logps/rejected": -331.24749755859375, "loss": 0.4177, "rewards/accuracies": 0.8149999976158142, "rewards/chosen": -0.7748047113418579, "rewards/margins": 1.1645703315734863, "rewards/rejected": -1.9394140243530273, "step": 2300 }, { "epoch": 2.7785917177455524, "grad_norm": 87.0, "learning_rate": 4.07620735489588e-07, "logits/chosen": -1.2260528802871704, "logits/rejected": -1.0005972385406494, "logps/chosen": -312.9624938964844, "logps/rejected": -323.0400085449219, "loss": 0.3917, "rewards/accuracies": 0.8349999785423279, "rewards/chosen": -0.7925238013267517, "rewards/margins": 1.185449242591858, "rewards/rejected": -1.9780443906784058, "step": 2325 }, { "epoch": 2.8084915532964567, "grad_norm": 56.5, "learning_rate": 3.5223748338502434e-07, "logits/chosen": -1.2027392387390137, "logits/rejected": -0.989107608795166, "logps/chosen": -321.3762512207031, "logps/rejected": -318.11749267578125, "loss": 0.4052, "rewards/accuracies": 0.8174999952316284, "rewards/chosen": -0.8751891851425171, "rewards/margins": 1.1021533012390137, "rewards/rejected": -1.976718783378601, "step": 2350 }, { "epoch": 2.838391388847361, "grad_norm": 54.5, "learning_rate": 2.968542312804608e-07, "logits/chosen": -1.2425882816314697, "logits/rejected": -0.9340093731880188, "logps/chosen": -335.12249755859375, "logps/rejected": -320.2049865722656, "loss": 0.4115, "rewards/accuracies": 0.8224999904632568, "rewards/chosen": -0.8292675614356995, "rewards/margins": 1.1182934045791626, "rewards/rejected": -1.9483104944229126, "step": 2375 }, { "epoch": 2.868291224398266, "grad_norm": 87.0, "learning_rate": 2.4147097917589725e-07, "logits/chosen": -1.3012477159500122, "logits/rejected": -1.0664279460906982, "logps/chosen": -293.489990234375, "logps/rejected": -285.197509765625, "loss": 0.4277, "rewards/accuracies": 0.8025000095367432, "rewards/chosen": -0.8684576153755188, "rewards/margins": 1.069272518157959, "rewards/rejected": -1.9371191263198853, "step": 2400 }, { "epoch": 2.868291224398266, "eval_logits/chosen": -1.1853525638580322, "eval_logits/rejected": -1.0373817682266235, "eval_logps/chosen": -327.3817138671875, "eval_logps/rejected": -306.81451416015625, "eval_loss": 0.5165102481842041, "eval_rewards/accuracies": 0.7006528377532959, "eval_rewards/chosen": -0.8549529314041138, "eval_rewards/margins": 0.7583125829696655, "eval_rewards/rejected": -1.6133127212524414, "eval_runtime": 876.3322, "eval_samples_per_second": 1.697, "eval_steps_per_second": 0.212, "step": 2400 }, { "epoch": 2.8981910599491703, "grad_norm": 46.0, "learning_rate": 1.8608772707133363e-07, "logits/chosen": -1.356745958328247, "logits/rejected": -1.0496530532836914, "logps/chosen": -319.9649963378906, "logps/rejected": -309.7025146484375, "loss": 0.4037, "rewards/accuracies": 0.8025000095367432, "rewards/chosen": -0.8254479765892029, "rewards/margins": 1.1192578077316284, "rewards/rejected": -1.9445117712020874, "step": 2425 }, { "epoch": 2.9280908955000746, "grad_norm": 70.5, "learning_rate": 1.3070447496677006e-07, "logits/chosen": -1.2751880884170532, "logits/rejected": -1.0796799659729004, "logps/chosen": -316.9425048828125, "logps/rejected": -325.7550048828125, "loss": 0.4306, "rewards/accuracies": 0.7724999785423279, "rewards/chosen": -0.8079773187637329, "rewards/margins": 1.000207543373108, "rewards/rejected": -1.8083984851837158, "step": 2450 }, { "epoch": 2.9579907310509794, "grad_norm": 74.0, "learning_rate": 7.532122286220647e-08, "logits/chosen": -1.2595221996307373, "logits/rejected": -1.0140166282653809, "logps/chosen": -320.6000061035156, "logps/rejected": -318.6600036621094, "loss": 0.4808, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.000390648841858, "rewards/margins": 0.931530773639679, "rewards/rejected": -1.9319677352905273, "step": 2475 }, { "epoch": 2.987890566601884, "grad_norm": 78.5, "learning_rate": 1.993797075764289e-08, "logits/chosen": -1.2403491735458374, "logits/rejected": -0.9544309973716736, "logps/chosen": -343.76251220703125, "logps/rejected": -336.38250732421875, "loss": 0.4225, "rewards/accuracies": 0.8149999976158142, "rewards/chosen": -0.7856341600418091, "rewards/margins": 1.0573632717132568, "rewards/rejected": -1.8428466320037842, "step": 2500 } ], "logging_steps": 25, "max_steps": 2508, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }