{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9996283909327388, "eval_steps": 500, "global_step": 1345, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0505388331475288, "grad_norm": 4.774968147277832, "learning_rate": 5.037037037037037e-07, "logits/chosen": -0.535763680934906, "logits/rejected": -0.45358335971832275, "logps/chosen": -72.03621673583984, "logps/rejected": -10.970436096191406, "loss": 0.6914, "rewards/accuracies": 0.5533088445663452, "rewards/chosen": 0.003953414969146252, "rewards/margins": 0.003940199967473745, "rewards/rejected": 1.321489253314212e-05, "step": 68 }, { "epoch": 0.1010776662950576, "grad_norm": 7.106871128082275, "learning_rate": 9.991735537190081e-07, "logits/chosen": -0.5598275661468506, "logits/rejected": -0.48031094670295715, "logps/chosen": -67.91443634033203, "logps/rejected": -10.759711265563965, "loss": 0.6763, "rewards/accuracies": 0.7647058963775635, "rewards/chosen": 0.030388537794351578, "rewards/margins": 0.03460656851530075, "rewards/rejected": -0.004218032583594322, "step": 136 }, { "epoch": 0.1516164994425864, "grad_norm": 6.630987167358398, "learning_rate": 9.429752066115701e-07, "logits/chosen": -0.5454370379447937, "logits/rejected": -0.4578668177127838, "logps/chosen": -65.27257537841797, "logps/rejected": -11.141642570495605, "loss": 0.5834, "rewards/accuracies": 0.966911792755127, "rewards/chosen": 0.20791733264923096, "rewards/margins": 0.25071945786476135, "rewards/rejected": -0.04280214384198189, "step": 204 }, { "epoch": 0.2021553325901152, "grad_norm": 2.813685417175293, "learning_rate": 8.867768595041321e-07, "logits/chosen": -0.5535087585449219, "logits/rejected": -0.48542195558547974, "logps/chosen": -59.47464370727539, "logps/rejected": -12.498270988464355, "loss": 0.4096, "rewards/accuracies": 0.9761029481887817, "rewards/chosen": 0.5964955687522888, "rewards/margins": 0.8128367066383362, "rewards/rejected": -0.21634113788604736, "step": 272 }, { "epoch": 0.25269416573764397, "grad_norm": 1.8204927444458008, "learning_rate": 8.305785123966941e-07, "logits/chosen": -0.5383437871932983, "logits/rejected": -0.4763253629207611, "logps/chosen": -59.065216064453125, "logps/rejected": -15.607768058776855, "loss": 0.2626, "rewards/accuracies": 0.9816176295280457, "rewards/chosen": 1.058206558227539, "rewards/margins": 1.5873197317123413, "rewards/rejected": -0.529113233089447, "step": 340 }, { "epoch": 0.3032329988851728, "grad_norm": 2.656168222427368, "learning_rate": 7.743801652892562e-07, "logits/chosen": -0.5623682737350464, "logits/rejected": -0.5051128268241882, "logps/chosen": -57.070865631103516, "logps/rejected": -18.777294158935547, "loss": 0.1687, "rewards/accuracies": 0.9871323704719543, "rewards/chosen": 1.3186638355255127, "rewards/margins": 2.2629363536834717, "rewards/rejected": -0.9442727565765381, "step": 408 }, { "epoch": 0.35377183203270157, "grad_norm": 1.3962138891220093, "learning_rate": 7.181818181818181e-07, "logits/chosen": -0.5668503046035767, "logits/rejected": -0.5197104215621948, "logps/chosen": -58.299644470214844, "logps/rejected": -24.42568016052246, "loss": 0.1139, "rewards/accuracies": 0.9889705777168274, "rewards/chosen": 1.4811238050460815, "rewards/margins": 2.8274738788604736, "rewards/rejected": -1.346349835395813, "step": 476 }, { "epoch": 0.4043106651802304, "grad_norm": 4.664539337158203, "learning_rate": 6.619834710743801e-07, "logits/chosen": -0.5390637516975403, "logits/rejected": -0.5045632719993591, "logps/chosen": -49.41936492919922, "logps/rejected": -27.90851593017578, "loss": 0.077, "rewards/accuracies": 0.9889705777168274, "rewards/chosen": 1.4117194414138794, "rewards/margins": 3.2265889644622803, "rewards/rejected": -1.8148694038391113, "step": 544 }, { "epoch": 0.45484949832775917, "grad_norm": 0.970925509929657, "learning_rate": 6.057851239669421e-07, "logits/chosen": -0.5126790404319763, "logits/rejected": -0.46291089057922363, "logps/chosen": -58.05442810058594, "logps/rejected": -35.03934097290039, "loss": 0.0496, "rewards/accuracies": 0.9908088445663452, "rewards/chosen": 1.6365827322006226, "rewards/margins": 3.8830738067626953, "rewards/rejected": -2.246490955352783, "step": 612 }, { "epoch": 0.5053883314752879, "grad_norm": 1.9165253639221191, "learning_rate": 5.49586776859504e-07, "logits/chosen": -0.5267462730407715, "logits/rejected": -0.45349758863449097, "logps/chosen": -53.660301208496094, "logps/rejected": -34.66725158691406, "loss": 0.0412, "rewards/accuracies": 0.9834558963775635, "rewards/chosen": 1.7377012968063354, "rewards/margins": 4.283178329467773, "rewards/rejected": -2.5454771518707275, "step": 680 }, { "epoch": 0.5559271646228168, "grad_norm": 0.5560820698738098, "learning_rate": 4.933884297520661e-07, "logits/chosen": -0.5102059841156006, "logits/rejected": -0.44081857800483704, "logps/chosen": -48.95499801635742, "logps/rejected": -38.01549530029297, "loss": 0.0338, "rewards/accuracies": 0.9908088445663452, "rewards/chosen": 1.5988441705703735, "rewards/margins": 4.466281890869141, "rewards/rejected": -2.8674376010894775, "step": 748 }, { "epoch": 0.6064659977703456, "grad_norm": 0.654052734375, "learning_rate": 4.3719008264462806e-07, "logits/chosen": -0.5107941627502441, "logits/rejected": -0.424625426530838, "logps/chosen": -52.267120361328125, "logps/rejected": -39.51735305786133, "loss": 0.0403, "rewards/accuracies": 0.9797794222831726, "rewards/chosen": 1.5989586114883423, "rewards/margins": 4.681400775909424, "rewards/rejected": -3.0824427604675293, "step": 816 }, { "epoch": 0.6570048309178744, "grad_norm": 0.2799667716026306, "learning_rate": 3.8099173553719006e-07, "logits/chosen": -0.48151201009750366, "logits/rejected": -0.3872612416744232, "logps/chosen": -51.13566970825195, "logps/rejected": -41.436946868896484, "loss": 0.0386, "rewards/accuracies": 0.9852941036224365, "rewards/chosen": 1.6831274032592773, "rewards/margins": 4.950973033905029, "rewards/rejected": -3.26784610748291, "step": 884 }, { "epoch": 0.7075436640654031, "grad_norm": 3.401352882385254, "learning_rate": 3.2479338842975206e-07, "logits/chosen": -0.4963739216327667, "logits/rejected": -0.397490918636322, "logps/chosen": -51.522117614746094, "logps/rejected": -42.51453399658203, "loss": 0.0242, "rewards/accuracies": 0.9908088445663452, "rewards/chosen": 1.671505331993103, "rewards/margins": 4.983243465423584, "rewards/rejected": -3.3117384910583496, "step": 952 }, { "epoch": 0.758082497212932, "grad_norm": 0.16179317235946655, "learning_rate": 2.6859504132231406e-07, "logits/chosen": -0.4788703918457031, "logits/rejected": -0.3747369050979614, "logps/chosen": -54.24635314941406, "logps/rejected": -42.791847229003906, "loss": 0.037, "rewards/accuracies": 0.9852941036224365, "rewards/chosen": 1.7351138591766357, "rewards/margins": 5.101663112640381, "rewards/rejected": -3.366548776626587, "step": 1020 }, { "epoch": 0.8086213303604608, "grad_norm": 0.2579549252986908, "learning_rate": 2.1239669421487603e-07, "logits/chosen": -0.48607107996940613, "logits/rejected": -0.3732473850250244, "logps/chosen": -53.21406555175781, "logps/rejected": -45.13155746459961, "loss": 0.0328, "rewards/accuracies": 0.9852941036224365, "rewards/chosen": 1.671941876411438, "rewards/margins": 5.195909023284912, "rewards/rejected": -3.5239670276641846, "step": 1088 }, { "epoch": 0.8591601635079896, "grad_norm": 0.4580838680267334, "learning_rate": 1.56198347107438e-07, "logits/chosen": -0.4816429018974304, "logits/rejected": -0.36846473813056946, "logps/chosen": -53.959617614746094, "logps/rejected": -46.597286224365234, "loss": 0.0252, "rewards/accuracies": 0.9889705777168274, "rewards/chosen": 1.7494579553604126, "rewards/margins": 5.300571441650391, "rewards/rejected": -3.5511131286621094, "step": 1156 }, { "epoch": 0.9096989966555183, "grad_norm": 0.2617437243461609, "learning_rate": 1e-07, "logits/chosen": -0.48045316338539124, "logits/rejected": -0.3751773536205292, "logps/chosen": -53.45015335083008, "logps/rejected": -45.071617126464844, "loss": 0.0358, "rewards/accuracies": 0.9834558963775635, "rewards/chosen": 1.7436227798461914, "rewards/margins": 5.314986705780029, "rewards/rejected": -3.571363925933838, "step": 1224 }, { "epoch": 0.9602378298030472, "grad_norm": 0.671177864074707, "learning_rate": 4.3801652892561986e-08, "logits/chosen": -0.47449061274528503, "logits/rejected": -0.35607320070266724, "logps/chosen": -55.56103515625, "logps/rejected": -45.930023193359375, "loss": 0.0315, "rewards/accuracies": 0.9834558963775635, "rewards/chosen": 1.7040444612503052, "rewards/margins": 5.217987060546875, "rewards/rejected": -3.5139424800872803, "step": 1292 }, { "epoch": 0.9996283909327388, "step": 1345, "total_flos": 7.57814156543656e+17, "train_loss": 0.17186723207452484, "train_runtime": 19211.7112, "train_samples_per_second": 0.56, "train_steps_per_second": 0.07 } ], "logging_steps": 68, "max_steps": 1345, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 7.57814156543656e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }