{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9942196531791907, "eval_steps": 500, "global_step": 43, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.06936416184971098, "grad_norm": 2.391996383666992, "learning_rate": 6e-07, "logits/chosen": -0.5742005109786987, "logits/rejected": -0.6095317602157593, "logps/chosen": -2.351984739303589, "logps/rejected": -7.036094665527344, "loss": 0.6932, "rewards/accuracies": 0.1666666716337204, "rewards/chosen": -0.000525117851793766, "rewards/margins": -0.00012056056584697217, "rewards/rejected": -0.0004045573004987091, "step": 3 }, { "epoch": 0.13872832369942195, "grad_norm": 2.0094051361083984, "learning_rate": 9.736842105263158e-07, "logits/chosen": -0.4133426547050476, "logits/rejected": -0.44120118021965027, "logps/chosen": -3.0542521476745605, "logps/rejected": -4.8423871994018555, "loss": 0.6962, "rewards/accuracies": 0.375, "rewards/chosen": -0.006540569476783276, "rewards/margins": -0.0059411413967609406, "rewards/rejected": -0.0005994289531372488, "step": 6 }, { "epoch": 0.20809248554913296, "grad_norm": 1.878108263015747, "learning_rate": 8.947368421052631e-07, "logits/chosen": -0.3521636426448822, "logits/rejected": -0.37608397006988525, "logps/chosen": -3.7146034240722656, "logps/rejected": -4.189270973205566, "loss": 0.6968, "rewards/accuracies": 0.4583333432674408, "rewards/chosen": -0.007832100614905357, "rewards/margins": -0.007222745567560196, "rewards/rejected": -0.0006093545234762132, "step": 9 }, { "epoch": 0.2774566473988439, "grad_norm": 2.917581796646118, "learning_rate": 8.157894736842105e-07, "logits/chosen": -0.4040801525115967, "logits/rejected": -0.43401893973350525, "logps/chosen": -3.188246488571167, "logps/rejected": -5.506511688232422, "loss": 0.6937, "rewards/accuracies": 0.5, "rewards/chosen": 0.00289311446249485, "rewards/margins": -0.0010533285094425082, "rewards/rejected": 0.003946444019675255, "step": 12 }, { "epoch": 0.3468208092485549, "grad_norm": 3.311497688293457, "learning_rate": 7.368421052631578e-07, "logits/chosen": -0.3706355690956116, "logits/rejected": -0.4231971502304077, "logps/chosen": -2.4997897148132324, "logps/rejected": -5.469825744628906, "loss": 0.693, "rewards/accuracies": 0.5, "rewards/chosen": 0.0005652035470120609, "rewards/margins": 0.0003093027917202562, "rewards/rejected": 0.00025590075529180467, "step": 15 }, { "epoch": 0.4161849710982659, "grad_norm": 1.4025365114212036, "learning_rate": 6.578947368421053e-07, "logits/chosen": -0.21497611701488495, "logits/rejected": -0.24890606105327606, "logps/chosen": -2.1334173679351807, "logps/rejected": -4.718679428100586, "loss": 0.6913, "rewards/accuracies": 0.4583333432674408, "rewards/chosen": -0.0012112647527828813, "rewards/margins": 0.00380739476531744, "rewards/rejected": -0.005018658936023712, "step": 18 }, { "epoch": 0.48554913294797686, "grad_norm": 2.704420328140259, "learning_rate": 5.789473684210526e-07, "logits/chosen": -0.3268830180168152, "logits/rejected": -0.36747950315475464, "logps/chosen": -3.465270519256592, "logps/rejected": -5.636588096618652, "loss": 0.6992, "rewards/accuracies": 0.2916666865348816, "rewards/chosen": -0.009067912586033344, "rewards/margins": -0.01196976751089096, "rewards/rejected": 0.0029018563218414783, "step": 21 }, { "epoch": 0.5549132947976878, "grad_norm": 2.6874353885650635, "learning_rate": 5e-07, "logits/chosen": -0.34339720010757446, "logits/rejected": -0.3832819163799286, "logps/chosen": -2.2344460487365723, "logps/rejected": -4.726414680480957, "loss": 0.6943, "rewards/accuracies": 0.4583333432674408, "rewards/chosen": 0.004240160807967186, "rewards/margins": -0.002244190312922001, "rewards/rejected": 0.006484351586550474, "step": 24 }, { "epoch": 0.6242774566473989, "grad_norm": 4.569401741027832, "learning_rate": 4.2105263157894733e-07, "logits/chosen": -0.5243338346481323, "logits/rejected": -0.5355302691459656, "logps/chosen": -2.862309217453003, "logps/rejected": -4.5991034507751465, "loss": 0.689, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": 0.004931977950036526, "rewards/margins": 0.008542876690626144, "rewards/rejected": -0.003610898507758975, "step": 27 }, { "epoch": 0.6936416184971098, "grad_norm": 2.8011696338653564, "learning_rate": 3.4210526315789473e-07, "logits/chosen": -0.44834446907043457, "logits/rejected": -0.4534645080566406, "logps/chosen": -4.190229415893555, "logps/rejected": -5.262238502502441, "loss": 0.6922, "rewards/accuracies": 0.5416666865348816, "rewards/chosen": 0.009879859164357185, "rewards/margins": 0.0019751894287765026, "rewards/rejected": 0.007904671132564545, "step": 30 }, { "epoch": 0.7630057803468208, "grad_norm": 3.9597043991088867, "learning_rate": 2.631578947368421e-07, "logits/chosen": -0.3376007080078125, "logits/rejected": -0.3533641993999481, "logps/chosen": -3.724114179611206, "logps/rejected": -5.412773132324219, "loss": 0.6925, "rewards/accuracies": 0.5833333730697632, "rewards/chosen": -0.005668423604220152, "rewards/margins": 0.0014709922252222896, "rewards/rejected": -0.007139415945857763, "step": 33 }, { "epoch": 0.8323699421965318, "grad_norm": 2.256436586380005, "learning_rate": 1.8421052631578946e-07, "logits/chosen": -0.30911755561828613, "logits/rejected": -0.31694161891937256, "logps/chosen": -2.6621932983398438, "logps/rejected": -4.157691478729248, "loss": 0.6959, "rewards/accuracies": 0.375, "rewards/chosen": -0.00074323161970824, "rewards/margins": -0.005398334003984928, "rewards/rejected": 0.004655101802200079, "step": 36 }, { "epoch": 0.9017341040462428, "grad_norm": 2.0652520656585693, "learning_rate": 1.0526315789473683e-07, "logits/chosen": -0.31367242336273193, "logits/rejected": -0.33622536063194275, "logps/chosen": -3.3133456707000732, "logps/rejected": -6.765153408050537, "loss": 0.6929, "rewards/accuracies": 0.375, "rewards/chosen": -0.01026303879916668, "rewards/margins": 0.0005472122575156391, "rewards/rejected": -0.010810251347720623, "step": 39 }, { "epoch": 0.9710982658959537, "grad_norm": 2.498948335647583, "learning_rate": 2.6315789473684208e-08, "logits/chosen": -0.3583389222621918, "logits/rejected": -0.3821001946926117, "logps/chosen": -3.387718677520752, "logps/rejected": -5.769498348236084, "loss": 0.6922, "rewards/accuracies": 0.5416666865348816, "rewards/chosen": 0.0014158273115754128, "rewards/margins": 0.0019499189220368862, "rewards/rejected": -0.0005340920761227608, "step": 42 }, { "epoch": 0.9942196531791907, "step": 43, "total_flos": 8626483251118080.0, "train_loss": 0.6937350367390832, "train_runtime": 231.1609, "train_samples_per_second": 1.492, "train_steps_per_second": 0.186 } ], "logging_steps": 3, "max_steps": 43, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 8626483251118080.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }