{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9997303855486654, "eval_steps": 500, "global_step": 1854, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.050148287948234026, "grad_norm": 3.929126501083374, "learning_rate": 5e-07, "logits/chosen": -0.5141991972923279, "logits/rejected": -0.47028154134750366, "logps/chosen": -69.07454681396484, "logps/rejected": -14.869856834411621, "loss": 0.6929, "rewards/accuracies": 0.48521506786346436, "rewards/chosen": -1.161982163466746e-05, "rewards/margins": 0.0009046773775480688, "rewards/rejected": -0.0009162970818579197, "step": 93 }, { "epoch": 0.10029657589646805, "grad_norm": 7.2451324462890625, "learning_rate": 1e-06, "logits/chosen": -0.5093461871147156, "logits/rejected": -0.45741066336631775, "logps/chosen": -76.25039672851562, "logps/rejected": -15.682782173156738, "loss": 0.6682, "rewards/accuracies": 0.7553763389587402, "rewards/chosen": 0.043871019035577774, "rewards/margins": 0.052321143448352814, "rewards/rejected": -0.00845012441277504, "step": 186 }, { "epoch": 0.15044486384470207, "grad_norm": 3.729768991470337, "learning_rate": 9.442446043165467e-07, "logits/chosen": -0.49496081471443176, "logits/rejected": -0.4462580382823944, "logps/chosen": -73.93294525146484, "logps/rejected": -17.590185165405273, "loss": 0.5259, "rewards/accuracies": 0.8602150678634644, "rewards/chosen": 0.3595849275588989, "rewards/margins": 0.43593457341194153, "rewards/rejected": -0.07634969055652618, "step": 279 }, { "epoch": 0.2005931517929361, "grad_norm": 2.1467108726501465, "learning_rate": 8.884892086330935e-07, "logits/chosen": -0.5146396160125732, "logits/rejected": -0.4725695848464966, "logps/chosen": -67.33499145507812, "logps/rejected": -21.152362823486328, "loss": 0.3566, "rewards/accuracies": 0.9408602118492126, "rewards/chosen": 0.8917596340179443, "rewards/margins": 1.2377275228500366, "rewards/rejected": -0.3459678590297699, "step": 372 }, { "epoch": 0.25074143974117014, "grad_norm": 2.8724427223205566, "learning_rate": 8.327338129496403e-07, "logits/chosen": -0.5053039193153381, "logits/rejected": -0.46124857664108276, "logps/chosen": -60.683773040771484, "logps/rejected": -23.27366828918457, "loss": 0.2328, "rewards/accuracies": 0.9731183052062988, "rewards/chosen": 1.1787759065628052, "rewards/margins": 1.961389183998108, "rewards/rejected": -0.7826132774353027, "step": 465 }, { "epoch": 0.30088972768940414, "grad_norm": 4.906219959259033, "learning_rate": 7.769784172661871e-07, "logits/chosen": -0.5105525851249695, "logits/rejected": -0.46868896484375, "logps/chosen": -57.557701110839844, "logps/rejected": -29.869394302368164, "loss": 0.1128, "rewards/accuracies": 0.9798387289047241, "rewards/chosen": 1.250539779663086, "rewards/margins": 2.861812114715576, "rewards/rejected": -1.6112723350524902, "step": 558 }, { "epoch": 0.3510380156376382, "grad_norm": 0.3975902497768402, "learning_rate": 7.212230215827337e-07, "logits/chosen": -0.51771479845047, "logits/rejected": -0.46667957305908203, "logps/chosen": -61.55564880371094, "logps/rejected": -39.249454498291016, "loss": 0.0582, "rewards/accuracies": 0.9865591526031494, "rewards/chosen": 1.4144244194030762, "rewards/margins": 3.826444149017334, "rewards/rejected": -2.412019729614258, "step": 651 }, { "epoch": 0.4011863035858722, "grad_norm": 2.5660245418548584, "learning_rate": 6.654676258992805e-07, "logits/chosen": -0.46265119314193726, "logits/rejected": -0.426276832818985, "logps/chosen": -56.20552062988281, "logps/rejected": -43.43349838256836, "loss": 0.0577, "rewards/accuracies": 0.9811828136444092, "rewards/chosen": 1.415700912475586, "rewards/margins": 4.192880153656006, "rewards/rejected": -2.77717924118042, "step": 744 }, { "epoch": 0.4513345915341062, "grad_norm": 0.4916980564594269, "learning_rate": 6.097122302158273e-07, "logits/chosen": -0.5033692121505737, "logits/rejected": -0.4418078064918518, "logps/chosen": -59.61499786376953, "logps/rejected": -45.10437774658203, "loss": 0.0413, "rewards/accuracies": 0.9852150678634644, "rewards/chosen": 1.577943205833435, "rewards/margins": 4.589221477508545, "rewards/rejected": -3.0112783908843994, "step": 837 }, { "epoch": 0.5014828794823403, "grad_norm": 0.3903834819793701, "learning_rate": 5.539568345323741e-07, "logits/chosen": -0.44615066051483154, "logits/rejected": -0.3858674168586731, "logps/chosen": -59.962955474853516, "logps/rejected": -47.591575622558594, "loss": 0.0306, "rewards/accuracies": 0.9879032373428345, "rewards/chosen": 1.6375161409378052, "rewards/margins": 4.831850528717041, "rewards/rejected": -3.1943342685699463, "step": 930 }, { "epoch": 0.5516311674305743, "grad_norm": 0.27728018164634705, "learning_rate": 4.982014388489209e-07, "logits/chosen": -0.42113569378852844, "logits/rejected": -0.37468260526657104, "logps/chosen": -58.571006774902344, "logps/rejected": -50.136695861816406, "loss": 0.0334, "rewards/accuracies": 0.9865591526031494, "rewards/chosen": 1.5126001834869385, "rewards/margins": 4.904977798461914, "rewards/rejected": -3.3923778533935547, "step": 1023 }, { "epoch": 0.6017794553788083, "grad_norm": 0.25181448459625244, "learning_rate": 4.424460431654676e-07, "logits/chosen": -0.41716840863227844, "logits/rejected": -0.35924020409584045, "logps/chosen": -56.86894226074219, "logps/rejected": -51.47317123413086, "loss": 0.0338, "rewards/accuracies": 0.9838709831237793, "rewards/chosen": 1.5967961549758911, "rewards/margins": 5.1519880294799805, "rewards/rejected": -3.5551917552948, "step": 1116 }, { "epoch": 0.6519277433270423, "grad_norm": 20.254880905151367, "learning_rate": 3.8669064748201436e-07, "logits/chosen": -0.4112766981124878, "logits/rejected": -0.33467066287994385, "logps/chosen": -59.09377670288086, "logps/rejected": -51.40492630004883, "loss": 0.0345, "rewards/accuracies": 0.9879032373428345, "rewards/chosen": 1.6269216537475586, "rewards/margins": 5.300109386444092, "rewards/rejected": -3.6731879711151123, "step": 1209 }, { "epoch": 0.7020760312752764, "grad_norm": 0.0816323384642601, "learning_rate": 3.309352517985611e-07, "logits/chosen": -0.4078998267650604, "logits/rejected": -0.3409072160720825, "logps/chosen": -54.6790771484375, "logps/rejected": -53.65495300292969, "loss": 0.0335, "rewards/accuracies": 0.9865591526031494, "rewards/chosen": 1.5546735525131226, "rewards/margins": 5.438999652862549, "rewards/rejected": -3.8843259811401367, "step": 1302 }, { "epoch": 0.7522243192235104, "grad_norm": 10.449593544006348, "learning_rate": 2.751798561151079e-07, "logits/chosen": -0.41149967908859253, "logits/rejected": -0.3351740837097168, "logps/chosen": -58.309329986572266, "logps/rejected": -52.98766326904297, "loss": 0.0298, "rewards/accuracies": 0.9879032373428345, "rewards/chosen": 1.7052394151687622, "rewards/margins": 5.557784557342529, "rewards/rejected": -3.8525450229644775, "step": 1395 }, { "epoch": 0.8023726071717444, "grad_norm": 1.828351378440857, "learning_rate": 2.1942446043165465e-07, "logits/chosen": -0.41116863489151, "logits/rejected": -0.3420298993587494, "logps/chosen": -57.310306549072266, "logps/rejected": -56.159385681152344, "loss": 0.0199, "rewards/accuracies": 0.9932795763015747, "rewards/chosen": 1.6173115968704224, "rewards/margins": 5.636691093444824, "rewards/rejected": -4.019379615783691, "step": 1488 }, { "epoch": 0.8525208951199784, "grad_norm": 0.3853701651096344, "learning_rate": 1.6366906474820144e-07, "logits/chosen": -0.386165589094162, "logits/rejected": -0.314485102891922, "logps/chosen": -59.73184585571289, "logps/rejected": -56.95379638671875, "loss": 0.0239, "rewards/accuracies": 0.9905914068222046, "rewards/chosen": 1.7348586320877075, "rewards/margins": 5.7297468185424805, "rewards/rejected": -3.9948880672454834, "step": 1581 }, { "epoch": 0.9026691830682124, "grad_norm": 0.15693414211273193, "learning_rate": 1.0791366906474819e-07, "logits/chosen": -0.4054364860057831, "logits/rejected": -0.33498746156692505, "logps/chosen": -59.102989196777344, "logps/rejected": -57.07588195800781, "loss": 0.0154, "rewards/accuracies": 0.9946236610412598, "rewards/chosen": 1.6817787885665894, "rewards/margins": 5.79107141494751, "rewards/rejected": -4.109292984008789, "step": 1674 }, { "epoch": 0.9528174710164464, "grad_norm": 0.12521210312843323, "learning_rate": 5.2158273381294966e-08, "logits/chosen": -0.40175333619117737, "logits/rejected": -0.32109692692756653, "logps/chosen": -56.866294860839844, "logps/rejected": -57.35883331298828, "loss": 0.0239, "rewards/accuracies": 0.9905914068222046, "rewards/chosen": 1.6757546663284302, "rewards/margins": 5.896215438842773, "rewards/rejected": -4.220460414886475, "step": 1767 }, { "epoch": 0.9997303855486654, "step": 1854, "total_flos": 1.1904720283191214e+18, "train_loss": 0.15248539275464632, "train_runtime": 30932.1025, "train_samples_per_second": 0.48, "train_steps_per_second": 0.06 } ], "logging_steps": 93, "max_steps": 1854, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.1904720283191214e+18, "train_batch_size": 2, "trial_name": null, "trial_params": null }