{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.995910949568378, "eval_steps": 400, "global_step": 137, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.03634711494775102, "grad_norm": 32.58869576990809, "learning_rate": 2.857142857142857e-07, "logits/chosen": -11.543076515197754, "logits/rejected": -11.31420612335205, "logps/chosen": -0.5522164106369019, "logps/rejected": -0.5735588669776917, "loss": 4.805, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -5.522164344787598, "rewards/margins": 0.21342459321022034, "rewards/rejected": -5.735588550567627, "step": 5 }, { "epoch": 0.07269422989550205, "grad_norm": 34.3946259201971, "learning_rate": 5.714285714285714e-07, "logits/chosen": -10.83501148223877, "logits/rejected": -10.637781143188477, "logps/chosen": -0.6140329241752625, "logps/rejected": -0.6750808954238892, "loss": 4.7363, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -6.140329837799072, "rewards/margins": 0.6104797124862671, "rewards/rejected": -6.7508087158203125, "step": 10 }, { "epoch": 0.10904134484325306, "grad_norm": 41.73056692404051, "learning_rate": 7.998695344323425e-07, "logits/chosen": -10.648634910583496, "logits/rejected": -10.217233657836914, "logps/chosen": -0.5939905047416687, "logps/rejected": -0.6135314702987671, "loss": 4.8553, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -5.939904689788818, "rewards/margins": 0.1954106092453003, "rewards/rejected": -6.135315418243408, "step": 15 }, { "epoch": 0.1453884597910041, "grad_norm": 36.02519008680044, "learning_rate": 7.953121695121394e-07, "logits/chosen": -10.66395378112793, "logits/rejected": -10.348527908325195, "logps/chosen": -0.5527507066726685, "logps/rejected": -0.5806955099105835, "loss": 4.6386, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -5.527507781982422, "rewards/margins": 0.27944836020469666, "rewards/rejected": -5.806955337524414, "step": 20 }, { "epoch": 0.18173557473875512, "grad_norm": 40.07097906112277, "learning_rate": 7.843163833184991e-07, "logits/chosen": -10.465521812438965, "logits/rejected": -10.2030668258667, "logps/chosen": -0.5359245538711548, "logps/rejected": -0.5927993655204773, "loss": 4.5869, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -5.359244346618652, "rewards/margins": 0.5687500834465027, "rewards/rejected": -5.927994728088379, "step": 25 }, { "epoch": 0.21808268968650613, "grad_norm": 34.259554554601195, "learning_rate": 7.670612634414511e-07, "logits/chosen": -10.547082901000977, "logits/rejected": -10.052377700805664, "logps/chosen": -0.5360420346260071, "logps/rejected": -0.5889440774917603, "loss": 4.5463, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -5.360420227050781, "rewards/margins": 0.5290209054946899, "rewards/rejected": -5.889441013336182, "step": 30 }, { "epoch": 0.25442980463425713, "grad_norm": 30.666733170155783, "learning_rate": 7.438278427948805e-07, "logits/chosen": -10.336666107177734, "logits/rejected": -10.034899711608887, "logps/chosen": -0.6049242615699768, "logps/rejected": -0.6828798055648804, "loss": 4.3807, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -6.0492424964904785, "rewards/margins": 0.7795557379722595, "rewards/rejected": -6.828797817230225, "step": 35 }, { "epoch": 0.2907769195820082, "grad_norm": 37.89264733124544, "learning_rate": 7.149945224533862e-07, "logits/chosen": -10.575347900390625, "logits/rejected": -10.193809509277344, "logps/chosen": -0.6202062964439392, "logps/rejected": -0.6880999803543091, "loss": 4.4737, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -6.20206356048584, "rewards/margins": 0.678936779499054, "rewards/rejected": -6.880999565124512, "step": 40 }, { "epoch": 0.3271240345297592, "grad_norm": 36.08127980591517, "learning_rate": 6.810309086608129e-07, "logits/chosen": -11.216996192932129, "logits/rejected": -10.75574016571045, "logps/chosen": -0.6237494349479675, "logps/rejected": -0.687574028968811, "loss": 4.4138, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -6.237493991851807, "rewards/margins": 0.6382461190223694, "rewards/rejected": -6.875740051269531, "step": 45 }, { "epoch": 0.36347114947751025, "grad_norm": 34.2641539929766, "learning_rate": 6.424901643866552e-07, "logits/chosen": -11.69914722442627, "logits/rejected": -11.246435165405273, "logps/chosen": -0.6562625169754028, "logps/rejected": -0.7224141359329224, "loss": 4.3367, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -6.562624931335449, "rewards/margins": 0.6615163087844849, "rewards/rejected": -7.2241411209106445, "step": 50 }, { "epoch": 0.39981826442526125, "grad_norm": 36.53529303407606, "learning_rate": 6e-07, "logits/chosen": -11.810195922851562, "logits/rejected": -11.363134384155273, "logps/chosen": -0.7002917528152466, "logps/rejected": -0.778505802154541, "loss": 4.3706, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -7.002917289733887, "rewards/margins": 0.7821400761604309, "rewards/rejected": -7.785058498382568, "step": 55 }, { "epoch": 0.43616537937301225, "grad_norm": 35.761908817299414, "learning_rate": 5.542524497952543e-07, "logits/chosen": -11.559731483459473, "logits/rejected": -11.183916091918945, "logps/chosen": -0.7286056280136108, "logps/rejected": -0.8559492230415344, "loss": 4.2217, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -7.2860565185546875, "rewards/margins": 1.273435354232788, "rewards/rejected": -8.559491157531738, "step": 60 }, { "epoch": 0.4725124943207633, "grad_norm": 231.37525001147708, "learning_rate": 5.059926008786647e-07, "logits/chosen": -11.819048881530762, "logits/rejected": -11.407302856445312, "logps/chosen": -0.734784722328186, "logps/rejected": -0.8366082906723022, "loss": 4.1901, "rewards/accuracies": 0.75, "rewards/chosen": -7.347847938537598, "rewards/margins": 1.0182350873947144, "rewards/rejected": -8.366083145141602, "step": 65 }, { "epoch": 0.5088596092685143, "grad_norm": 51.233160095021326, "learning_rate": 4.5600645798745166e-07, "logits/chosen": -11.959329605102539, "logits/rejected": -11.445878028869629, "logps/chosen": -0.7511547803878784, "logps/rejected": -0.8529723882675171, "loss": 4.2912, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -7.511547088623047, "rewards/margins": 1.0181769132614136, "rewards/rejected": -8.52972412109375, "step": 70 }, { "epoch": 0.5452067242162654, "grad_norm": 59.733803732439824, "learning_rate": 4.051081418863895e-07, "logits/chosen": -12.409022331237793, "logits/rejected": -11.8023042678833, "logps/chosen": -0.8404130935668945, "logps/rejected": -0.9404484629631042, "loss": 4.0174, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -8.404130935668945, "rewards/margins": 1.0003544092178345, "rewards/rejected": -9.404484748840332, "step": 75 }, { "epoch": 0.5815538391640164, "grad_norm": 60.192585986207376, "learning_rate": 3.541266298406398e-07, "logits/chosen": -12.791674613952637, "logits/rejected": -12.519620895385742, "logps/chosen": -0.8719121217727661, "logps/rejected": -1.016026258468628, "loss": 4.1563, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -8.719121932983398, "rewards/margins": 1.4411423206329346, "rewards/rejected": -10.160263061523438, "step": 80 }, { "epoch": 0.6179009541117674, "grad_norm": 63.01387669368681, "learning_rate": 3.0389225412181565e-07, "logits/chosen": -13.761955261230469, "logits/rejected": -13.322006225585938, "logps/chosen": -0.8667750358581543, "logps/rejected": -1.0224027633666992, "loss": 4.0181, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -8.667750358581543, "rewards/margins": 1.5562773942947388, "rewards/rejected": -10.224027633666992, "step": 85 }, { "epoch": 0.6542480690595184, "grad_norm": 58.79093252453101, "learning_rate": 2.5522317844515273e-07, "logits/chosen": -13.937658309936523, "logits/rejected": -13.547172546386719, "logps/chosen": -0.9299012422561646, "logps/rejected": -1.087494134902954, "loss": 3.8751, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -9.29901123046875, "rewards/margins": 1.5759302377700806, "rewards/rejected": -10.874940872192383, "step": 90 }, { "epoch": 0.6905951840072694, "grad_norm": 90.56773515906556, "learning_rate": 2.0891207259509476e-07, "logits/chosen": -14.276998519897461, "logits/rejected": -13.726056098937988, "logps/chosen": -0.9327268600463867, "logps/rejected": -1.1215860843658447, "loss": 3.8327, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -9.327268600463867, "rewards/margins": 1.8885917663574219, "rewards/rejected": -11.215860366821289, "step": 95 }, { "epoch": 0.7269422989550205, "grad_norm": 68.83137467247752, "learning_rate": 1.6571320226872206e-07, "logits/chosen": -15.408729553222656, "logits/rejected": -14.846705436706543, "logps/chosen": -0.994869589805603, "logps/rejected": -1.1596581935882568, "loss": 3.8063, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -9.948695182800293, "rewards/margins": 1.64788818359375, "rewards/rejected": -11.596583366394043, "step": 100 }, { "epoch": 0.7632894139027715, "grad_norm": 81.97650507265567, "learning_rate": 1.2633014440382787e-07, "logits/chosen": -15.801300048828125, "logits/rejected": -15.365028381347656, "logps/chosen": -1.0137146711349487, "logps/rejected": -1.173123836517334, "loss": 3.7038, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -10.137146949768066, "rewards/margins": 1.5940909385681152, "rewards/rejected": -11.731237411499023, "step": 105 }, { "epoch": 0.7996365288505225, "grad_norm": 76.7946168478378, "learning_rate": 9.14043280712228e-08, "logits/chosen": -16.20096206665039, "logits/rejected": -15.696843147277832, "logps/chosen": -1.1373329162597656, "logps/rejected": -1.354536771774292, "loss": 3.6719, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -11.373329162597656, "rewards/margins": 2.1720378398895264, "rewards/rejected": -13.545367240905762, "step": 110 }, { "epoch": 0.8359836437982735, "grad_norm": 85.18089495737438, "learning_rate": 6.150458756494239e-08, "logits/chosen": -16.4575138092041, "logits/rejected": -16.271873474121094, "logps/chosen": -1.0763649940490723, "logps/rejected": -1.265742540359497, "loss": 3.621, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -10.763648986816406, "rewards/margins": 1.8937755823135376, "rewards/rejected": -12.657424926757812, "step": 115 }, { "epoch": 0.8723307587460245, "grad_norm": 80.36432752317246, "learning_rate": 3.711789783843522e-08, "logits/chosen": -16.593286514282227, "logits/rejected": -16.234304428100586, "logps/chosen": -1.041198492050171, "logps/rejected": -1.273951768875122, "loss": 3.481, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -10.411985397338867, "rewards/margins": 2.327531337738037, "rewards/rejected": -12.739517211914062, "step": 120 }, { "epoch": 0.9086778736937755, "grad_norm": 81.72943922815173, "learning_rate": 1.8641443178027784e-08, "logits/chosen": -16.43305206298828, "logits/rejected": -16.25759506225586, "logps/chosen": -1.142892599105835, "logps/rejected": -1.3480346202850342, "loss": 3.6377, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -11.428924560546875, "rewards/margins": 2.051419734954834, "rewards/rejected": -13.480344772338867, "step": 125 }, { "epoch": 0.9450249886415266, "grad_norm": 74.32925754994847, "learning_rate": 6.376148290617145e-09, "logits/chosen": -16.8632869720459, "logits/rejected": -16.60898208618164, "logps/chosen": -1.1211764812469482, "logps/rejected": -1.3164139986038208, "loss": 3.4601, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -11.21176528930664, "rewards/margins": 1.9523746967315674, "rewards/rejected": -13.164140701293945, "step": 130 }, { "epoch": 0.9813721035892776, "grad_norm": 82.1235499731324, "learning_rate": 5.217771643080127e-10, "logits/chosen": -16.644031524658203, "logits/rejected": -16.64177131652832, "logps/chosen": -1.1249492168426514, "logps/rejected": -1.297629714012146, "loss": 3.5925, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -11.249493598937988, "rewards/margins": 1.726805329322815, "rewards/rejected": -12.976297378540039, "step": 135 }, { "epoch": 0.995910949568378, "step": 137, "total_flos": 0.0, "train_loss": 4.128190910729178, "train_runtime": 2318.2745, "train_samples_per_second": 7.594, "train_steps_per_second": 0.059 } ], "logging_steps": 5, "max_steps": 137, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 1000000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }