{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9945, "eval_steps": 500, "global_step": 153, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01, "eta": 0.0010000000474974513, "grad_norm": 0.20925004399681482, "learning_rate": 3.125e-08, "logits/chosen": -1.6728180646896362, "logits/rejected": -1.6728180646896362, "logps/chosen": -139.26568603515625, "logps/pi_response": -223.70187377929688, "logps/ref_response": -223.70187377929688, "logps/rejected": -139.26568603515625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.07, "eta": 0.0009999999310821295, "grad_norm": 0.2664449122199502, "learning_rate": 3.1249999999999997e-07, "logits/chosen": -1.9198987483978271, "logits/rejected": -1.9198987483978271, "logps/chosen": -185.7984161376953, "logps/pi_response": -284.7489929199219, "logps/ref_response": -274.8498229980469, "logps/rejected": -185.7984161376953, "loss": 0.693, "rewards/accuracies": 0.09829059988260269, "rewards/chosen": -0.05739467218518257, "rewards/margins": -8.19944467878031e-09, "rewards/rejected": -0.057394664734601974, "step": 10 }, { "epoch": 0.13, "eta": 0.0010000000474974513, "grad_norm": 0.17607425906899, "learning_rate": 4.989490450759331e-07, "logits/chosen": -1.2133342027664185, "logits/rejected": -1.2133342027664185, "logps/chosen": -344.1175231933594, "logps/pi_response": -420.7164001464844, "logps/ref_response": -268.8954772949219, "logps/rejected": -344.1175231933594, "loss": 0.6916, "rewards/accuracies": 0.17307692766189575, "rewards/chosen": -1.5597572326660156, "rewards/margins": 2.3523059056174134e-08, "rewards/rejected": -1.5597573518753052, "step": 20 }, { "epoch": 0.2, "eta": 0.0010000000474974513, "grad_norm": 0.9959359173659126, "learning_rate": 4.872270441827174e-07, "logits/chosen": 0.10036426037549973, "logits/rejected": 0.10036426037549973, "logps/chosen": -764.1023559570312, "logps/pi_response": -850.1804809570312, "logps/ref_response": -272.0489807128906, "logps/rejected": -764.1023559570312, "loss": 0.6873, "rewards/accuracies": 0.11153846234083176, "rewards/chosen": -5.858819007873535, "rewards/margins": -3.943076620771535e-08, "rewards/rejected": -5.858819484710693, "step": 30 }, { "epoch": 0.26, "eta": 0.0010000000474974513, "grad_norm": 1.9030553381279938, "learning_rate": 4.6308512113530063e-07, "logits/chosen": 2.6020889282226562, "logits/rejected": 2.6020889282226562, "logps/chosen": -15368.376953125, "logps/pi_response": -9728.86328125, "logps/ref_response": -275.9498596191406, "logps/rejected": -15368.376953125, "loss": 0.6, "rewards/accuracies": 0.042307693511247635, "rewards/chosen": -151.69932556152344, "rewards/margins": 1.8339891028062993e-07, "rewards/rejected": -151.69932556152344, "step": 40 }, { "epoch": 0.33, "eta": 0.0010000000474974513, "grad_norm": 0.9022013856679871, "learning_rate": 4.277872161641681e-07, "logits/chosen": 5.100710391998291, "logits/rejected": 5.100710391998291, "logps/chosen": -25452.970703125, "logps/pi_response": -16005.287109375, "logps/ref_response": -266.91033935546875, "logps/rejected": -25452.970703125, "loss": 0.5304, "rewards/accuracies": 0.023076923564076424, "rewards/chosen": -252.7429656982422, "rewards/margins": -4.69501202360334e-07, "rewards/rejected": -252.7429656982422, "step": 50 }, { "epoch": 0.39, "eta": 0.0010000000474974513, "grad_norm": 0.9078413895498612, "learning_rate": 3.8318133624280046e-07, "logits/chosen": 4.621513843536377, "logits/rejected": 4.621513843536377, "logps/chosen": -29423.666015625, "logps/pi_response": -17608.6015625, "logps/ref_response": -265.94757080078125, "logps/rejected": -29423.666015625, "loss": 0.519, "rewards/accuracies": 0.03076923079788685, "rewards/chosen": -292.3346862792969, "rewards/margins": 6.455641710090276e-07, "rewards/rejected": -292.3346862792969, "step": 60 }, { "epoch": 0.46, "eta": 0.0010000000474974513, "grad_norm": 0.7756777568132545, "learning_rate": 3.316028034595861e-07, "logits/chosen": 4.356642723083496, "logits/rejected": 4.356642723083496, "logps/chosen": -29171.62890625, "logps/pi_response": -18937.943359375, "logps/ref_response": -276.5423278808594, "logps/rejected": -29171.62890625, "loss": 0.5212, "rewards/accuracies": 0.015384615398943424, "rewards/chosen": -289.7953796386719, "rewards/margins": 0.0, "rewards/rejected": -289.7953796386719, "step": 70 }, { "epoch": 0.52, "eta": 0.0010000000474974513, "grad_norm": 0.9174650913563618, "learning_rate": 2.7575199021178855e-07, "logits/chosen": 4.428848743438721, "logits/rejected": 4.428848743438721, "logps/chosen": -27704.791015625, "logps/pi_response": -18176.86328125, "logps/ref_response": -271.93060302734375, "logps/rejected": -27704.791015625, "loss": 0.5132, "rewards/accuracies": 0.01923076994717121, "rewards/chosen": -275.2251281738281, "rewards/margins": -9.97690108306415e-07, "rewards/rejected": -275.2251281738281, "step": 80 }, { "epoch": 0.58, "eta": 0.0010000000474974513, "grad_norm": 0.9019855937057101, "learning_rate": 2.1855294234408068e-07, "logits/chosen": 4.3772735595703125, "logits/rejected": 4.3772735595703125, "logps/chosen": -29193.177734375, "logps/pi_response": -17440.9921875, "logps/ref_response": -266.1241455078125, "logps/rejected": -29193.177734375, "loss": 0.5231, "rewards/accuracies": 0.03076923079788685, "rewards/chosen": -290.08026123046875, "rewards/margins": -2.9343825147520874e-08, "rewards/rejected": -290.0802307128906, "step": 90 }, { "epoch": 0.65, "eta": 0.0010000000474974513, "grad_norm": 0.8179873354250534, "learning_rate": 1.6300029195778453e-07, "logits/chosen": 4.375646114349365, "logits/rejected": 4.375646114349365, "logps/chosen": -27328.33203125, "logps/pi_response": -17462.0546875, "logps/ref_response": -267.173828125, "logps/rejected": -27328.33203125, "loss": 0.5171, "rewards/accuracies": 0.026923077180981636, "rewards/chosen": -271.51708984375, "rewards/margins": 1.613910427522569e-07, "rewards/rejected": -271.51708984375, "step": 100 }, { "epoch": 0.71, "eta": 0.0010000000474974513, "grad_norm": 0.9895798769556228, "learning_rate": 1.1200247470632392e-07, "logits/chosen": 4.529591083526611, "logits/rejected": 4.529591083526611, "logps/chosen": -28326.220703125, "logps/pi_response": -18734.5078125, "logps/ref_response": -291.685791015625, "logps/rejected": -28326.220703125, "loss": 0.5103, "rewards/accuracies": 0.04615384712815285, "rewards/chosen": -281.44744873046875, "rewards/margins": 1.540550869094659e-07, "rewards/rejected": -281.44744873046875, "step": 110 }, { "epoch": 0.78, "eta": 0.0010000000474974513, "grad_norm": 0.9157342554933109, "learning_rate": 6.822945986946385e-08, "logits/chosen": 4.574192523956299, "logits/rejected": 4.574192523956299, "logps/chosen": -29500.658203125, "logps/pi_response": -18083.599609375, "logps/ref_response": -266.3084716796875, "logps/rejected": -29500.658203125, "loss": 0.5148, "rewards/accuracies": 0.05384615436196327, "rewards/chosen": -293.0876159667969, "rewards/margins": 1.4085036355027114e-06, "rewards/rejected": -293.0876159667969, "step": 120 }, { "epoch": 0.84, "eta": 0.0010000000474974513, "grad_norm": 0.932891201307216, "learning_rate": 3.397296523427806e-08, "logits/chosen": 4.706723213195801, "logits/rejected": 4.706723213195801, "logps/chosen": -26521.8828125, "logps/pi_response": -15752.3818359375, "logps/ref_response": -260.1748046875, "logps/rejected": -26521.884765625, "loss": 0.5218, "rewards/accuracies": 0.04615384712815285, "rewards/chosen": -263.5008544921875, "rewards/margins": 8.876506853994215e-07, "rewards/rejected": -263.5008544921875, "step": 130 }, { "epoch": 0.91, "eta": 0.0010000000474974513, "grad_norm": 0.7630353244282155, "learning_rate": 1.1026475173977978e-08, "logits/chosen": 4.662195205688477, "logits/rejected": 4.662195205688477, "logps/chosen": -29146.107421875, "logps/pi_response": -18217.185546875, "logps/ref_response": -266.84295654296875, "logps/rejected": -29146.107421875, "loss": 0.5144, "rewards/accuracies": 0.03846153989434242, "rewards/chosen": -289.6330261230469, "rewards/margins": 5.868765029504175e-08, "rewards/rejected": -289.6330261230469, "step": 140 }, { "epoch": 0.97, "eta": 0.0010000000474974513, "grad_norm": 0.7944892762549526, "learning_rate": 5.913435276374834e-10, "logits/chosen": 4.59439754486084, "logits/rejected": 4.59439754486084, "logps/chosen": -28846.330078125, "logps/pi_response": -18351.314453125, "logps/ref_response": -276.857666015625, "logps/rejected": -28846.330078125, "loss": 0.5191, "rewards/accuracies": 0.03846153989434242, "rewards/chosen": -286.6873779296875, "rewards/margins": -8.803147579783399e-07, "rewards/rejected": -286.6873779296875, "step": 150 }, { "epoch": 0.99, "step": 153, "total_flos": 0.0, "train_loss": 0.0, "train_runtime": 0.0094, "train_samples_per_second": 2122892.066, "train_steps_per_second": 16240.124 } ], "logging_steps": 10, "max_steps": 153, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 200, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }