{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9984, "eval_steps": 500, "global_step": 156, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.064, "grad_norm": 46.9622688293457, "learning_rate": 6.249999999999999e-07, "logits/chosen": -1.849869966506958, "logits/rejected": -0.29363900423049927, "logps/chosen": -214.13339233398438, "logps/rejected": -737.3911743164062, "loss": 0.7092, "rewards/accuracies": 0.46875, "rewards/chosen": 0.004342720843851566, "rewards/margins": 0.025443650782108307, "rewards/rejected": -0.021100929006934166, "step": 10 }, { "epoch": 0.128, "grad_norm": 13.521223068237305, "learning_rate": 9.979871469976195e-07, "logits/chosen": -1.8478429317474365, "logits/rejected": -0.2751621603965759, "logps/chosen": -240.21755981445312, "logps/rejected": -844.638427734375, "loss": 0.486, "rewards/accuracies": 0.9468750357627869, "rewards/chosen": 0.0402272529900074, "rewards/margins": 0.6919995546340942, "rewards/rejected": -0.6517722606658936, "step": 20 }, { "epoch": 0.192, "grad_norm": 0.762617290019989, "learning_rate": 9.755282581475767e-07, "logits/chosen": -2.1506738662719727, "logits/rejected": -0.8184519410133362, "logps/chosen": -241.9251251220703, "logps/rejected": -829.9989624023438, "loss": 0.1228, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -0.33827781677246094, "rewards/margins": 4.653146266937256, "rewards/rejected": -4.991424083709717, "step": 30 }, { "epoch": 0.256, "grad_norm": 1.0136756896972656, "learning_rate": 9.29224396800933e-07, "logits/chosen": -2.6077799797058105, "logits/rejected": -1.6289113759994507, "logps/chosen": -261.08819580078125, "logps/rejected": -949.7811889648438, "loss": 0.0522, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.5243332386016846, "rewards/margins": 14.23307991027832, "rewards/rejected": -16.75741195678711, "step": 40 }, { "epoch": 0.32, "grad_norm": 0.15248946845531464, "learning_rate": 8.613974319136957e-07, "logits/chosen": -2.809011459350586, "logits/rejected": -1.9814908504486084, "logps/chosen": -262.7459411621094, "logps/rejected": -1068.808349609375, "loss": 0.0401, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -4.401086807250977, "rewards/margins": 23.641544342041016, "rewards/rejected": -28.04262924194336, "step": 50 }, { "epoch": 0.384, "grad_norm": 0.12552589178085327, "learning_rate": 7.754484907260512e-07, "logits/chosen": -2.7779171466827393, "logits/rejected": -2.0284547805786133, "logps/chosen": -282.9354248046875, "logps/rejected": -1090.5594482421875, "loss": 0.036, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -5.416067123413086, "rewards/margins": 25.74221420288086, "rewards/rejected": -31.158283233642578, "step": 60 }, { "epoch": 0.448, "grad_norm": 0.17482662200927734, "learning_rate": 6.756874120406714e-07, "logits/chosen": -2.7689507007598877, "logits/rejected": -1.9573109149932861, "logps/chosen": -263.86279296875, "logps/rejected": -1074.0966796875, "loss": 0.0455, "rewards/accuracies": 0.984375, "rewards/chosen": -4.39678955078125, "rewards/margins": 24.96966552734375, "rewards/rejected": -29.366456985473633, "step": 70 }, { "epoch": 0.512, "grad_norm": 0.11299557238817215, "learning_rate": 5.671166329088277e-07, "logits/chosen": -2.643343448638916, "logits/rejected": -1.8395075798034668, "logps/chosen": -271.737060546875, "logps/rejected": -1051.107421875, "loss": 0.0341, "rewards/accuracies": 1.0, "rewards/chosen": -3.8055081367492676, "rewards/margins": 22.575603485107422, "rewards/rejected": -26.381113052368164, "step": 80 }, { "epoch": 0.576, "grad_norm": 0.12716512382030487, "learning_rate": 4.5518034554828327e-07, "logits/chosen": -2.627561092376709, "logits/rejected": -1.8044594526290894, "logps/chosen": -267.9153747558594, "logps/rejected": -1012.906494140625, "loss": 0.046, "rewards/accuracies": 0.984375, "rewards/chosen": -3.2899169921875, "rewards/margins": 20.98080825805664, "rewards/rejected": -24.270723342895508, "step": 90 }, { "epoch": 0.64, "grad_norm": 0.14720195531845093, "learning_rate": 3.454915028125263e-07, "logits/chosen": -2.617551803588867, "logits/rejected": -1.7380447387695312, "logps/chosen": -254.84658813476562, "logps/rejected": -996.9198608398438, "loss": 0.0315, "rewards/accuracies": 1.0, "rewards/chosen": -2.6352035999298096, "rewards/margins": 19.858867645263672, "rewards/rejected": -22.49407196044922, "step": 100 }, { "epoch": 0.704, "grad_norm": 0.1784060150384903, "learning_rate": 2.4355036129704696e-07, "logits/chosen": -2.598905324935913, "logits/rejected": -1.7216695547103882, "logps/chosen": -248.74453735351562, "logps/rejected": -993.6848754882812, "loss": 0.036, "rewards/accuracies": 1.0, "rewards/chosen": -2.5882961750030518, "rewards/margins": 18.992816925048828, "rewards/rejected": -21.58111572265625, "step": 110 }, { "epoch": 0.768, "grad_norm": 0.11419311910867691, "learning_rate": 1.5446867550656767e-07, "logits/chosen": -2.5767769813537598, "logits/rejected": -1.681131362915039, "logps/chosen": -256.5839538574219, "logps/rejected": -994.3280029296875, "loss": 0.0745, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.4366414546966553, "rewards/margins": 18.98566436767578, "rewards/rejected": -21.422306060791016, "step": 120 }, { "epoch": 0.832, "grad_norm": 0.10517225414514542, "learning_rate": 8.271337313934867e-08, "logits/chosen": -2.5168538093566895, "logits/rejected": -1.6293193101882935, "logps/chosen": -269.16278076171875, "logps/rejected": -979.6190795898438, "loss": 0.0499, "rewards/accuracies": 0.984375, "rewards/chosen": -2.368694543838501, "rewards/margins": 18.073989868164062, "rewards/rejected": -20.442684173583984, "step": 130 }, { "epoch": 0.896, "grad_norm": 0.1312648206949234, "learning_rate": 3.188256468013139e-08, "logits/chosen": -2.5698208808898926, "logits/rejected": -1.6306824684143066, "logps/chosen": -250.3894805908203, "logps/rejected": -1000.3046875, "loss": 0.0339, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -2.0379717350006104, "rewards/margins": 18.880815505981445, "rewards/rejected": -20.918787002563477, "step": 140 }, { "epoch": 0.96, "grad_norm": 0.11260352283716202, "learning_rate": 4.5251191160326495e-09, "logits/chosen": -2.5329906940460205, "logits/rejected": -1.5752902030944824, "logps/chosen": -271.48443603515625, "logps/rejected": -1053.84765625, "loss": 0.0735, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.3061203956604004, "rewards/margins": 19.025531768798828, "rewards/rejected": -21.331653594970703, "step": 150 }, { "epoch": 0.9984, "step": 156, "total_flos": 1.1115841451898962e+18, "train_loss": 0.1217762088546386, "train_runtime": 6099.7709, "train_samples_per_second": 0.82, "train_steps_per_second": 0.026 } ], "logging_steps": 10, "max_steps": 156, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.1115841451898962e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }