{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9945, "eval_steps": 500, "global_step": 153, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01, "eta": 0.0010000000474974513, "grad_norm": 8.605272045068777, "learning_rate": 3.125e-08, "logits/chosen": -2.8784992694854736, "logits/rejected": -2.8769874572753906, "logps/chosen": -263.9749755859375, "logps/pi_response": -246.19029235839844, "logps/ref_response": -246.19029235839844, "logps/rejected": -308.2843322753906, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.07, "eta": 0.0010000000474974513, "grad_norm": 8.688961504116353, "learning_rate": 3.1249999999999997e-07, "logits/chosen": -2.936194896697998, "logits/rejected": -2.808932304382324, "logps/chosen": -315.6687927246094, "logps/pi_response": -209.20472717285156, "logps/ref_response": -209.1347198486328, "logps/rejected": -260.7985534667969, "loss": 0.6928, "rewards/accuracies": 0.4829059839248657, "rewards/chosen": 0.00021380360703915358, "rewards/margins": 0.0008379952632822096, "rewards/rejected": -0.0006241916562430561, "step": 10 }, { "epoch": 0.13, "eta": 0.0010000000474974513, "grad_norm": 8.833821950128216, "learning_rate": 4.989490450759331e-07, "logits/chosen": -2.852677583694458, "logits/rejected": -2.8317201137542725, "logps/chosen": -274.80267333984375, "logps/pi_response": -189.35801696777344, "logps/ref_response": -187.89822387695312, "logps/rejected": -261.1772766113281, "loss": 0.6876, "rewards/accuracies": 0.6846153736114502, "rewards/chosen": -0.006662360858172178, "rewards/margins": 0.012640128843486309, "rewards/rejected": -0.0193024892359972, "step": 20 }, { "epoch": 0.2, "eta": 0.0010000000474974513, "grad_norm": 9.337463091325768, "learning_rate": 4.872270441827174e-07, "logits/chosen": -2.8088669776916504, "logits/rejected": -2.791938304901123, "logps/chosen": -275.8798828125, "logps/pi_response": -215.20196533203125, "logps/ref_response": -213.5146484375, "logps/rejected": -255.837890625, "loss": 0.6691, "rewards/accuracies": 0.7153846025466919, "rewards/chosen": 0.021489957347512245, "rewards/margins": 0.061311714351177216, "rewards/rejected": -0.03982176259160042, "step": 30 }, { "epoch": 0.26, "eta": 0.0010000000474974513, "grad_norm": 12.278601194362231, "learning_rate": 4.6308512113530063e-07, "logits/chosen": -2.7728219032287598, "logits/rejected": -2.690376043319702, "logps/chosen": -278.7479553222656, "logps/pi_response": -190.5654296875, "logps/ref_response": -177.33053588867188, "logps/rejected": -246.11264038085938, "loss": 0.6443, "rewards/accuracies": 0.6730769276618958, "rewards/chosen": -0.05301598832011223, "rewards/margins": 0.10135015100240707, "rewards/rejected": -0.154366135597229, "step": 40 }, { "epoch": 0.33, "eta": 0.0010000000474974513, "grad_norm": 15.472310357744927, "learning_rate": 4.277872161641681e-07, "logits/chosen": -2.792513608932495, "logits/rejected": -2.7258100509643555, "logps/chosen": -291.08642578125, "logps/pi_response": -217.5293426513672, "logps/ref_response": -194.07823181152344, "logps/rejected": -272.1592102050781, "loss": 0.619, "rewards/accuracies": 0.6692307591438293, "rewards/chosen": -0.05969160422682762, "rewards/margins": 0.2206883430480957, "rewards/rejected": -0.2803799510002136, "step": 50 }, { "epoch": 0.39, "eta": 0.0010000000474974513, "grad_norm": 12.018657879137255, "learning_rate": 3.8318133624280046e-07, "logits/chosen": -2.7461166381835938, "logits/rejected": -2.6338207721710205, "logps/chosen": -268.39324951171875, "logps/pi_response": -218.18861389160156, "logps/ref_response": -193.3256072998047, "logps/rejected": -277.92572021484375, "loss": 0.611, "rewards/accuracies": 0.7153846025466919, "rewards/chosen": -0.13407574594020844, "rewards/margins": 0.21902315318584442, "rewards/rejected": -0.35309889912605286, "step": 60 }, { "epoch": 0.46, "eta": 0.0010000000474974513, "grad_norm": 12.586943156361984, "learning_rate": 3.316028034595861e-07, "logits/chosen": -2.702092170715332, "logits/rejected": -2.648845672607422, "logps/chosen": -275.9683532714844, "logps/pi_response": -199.25994873046875, "logps/ref_response": -183.3825225830078, "logps/rejected": -281.8118896484375, "loss": 0.6125, "rewards/accuracies": 0.6692307591438293, "rewards/chosen": -0.07608187198638916, "rewards/margins": 0.22089019417762756, "rewards/rejected": -0.2969720661640167, "step": 70 }, { "epoch": 0.52, "eta": 0.0010000000474974513, "grad_norm": 16.37677218967305, "learning_rate": 2.7575199021178855e-07, "logits/chosen": -2.654991388320923, "logits/rejected": -2.581737756729126, "logps/chosen": -314.16900634765625, "logps/pi_response": -250.20211791992188, "logps/ref_response": -203.31488037109375, "logps/rejected": -308.69287109375, "loss": 0.5967, "rewards/accuracies": 0.6730769276618958, "rewards/chosen": -0.329426109790802, "rewards/margins": 0.27185821533203125, "rewards/rejected": -0.6012843251228333, "step": 80 }, { "epoch": 0.58, "eta": 0.0010000000474974513, "grad_norm": 22.869287847796812, "learning_rate": 2.1855294234408068e-07, "logits/chosen": -2.4663641452789307, "logits/rejected": -2.1920886039733887, "logps/chosen": -374.7882385253906, "logps/pi_response": -300.48028564453125, "logps/ref_response": -229.24087524414062, "logps/rejected": -370.0035400390625, "loss": 0.573, "rewards/accuracies": 0.7153846025466919, "rewards/chosen": -0.42087164521217346, "rewards/margins": 0.46370625495910645, "rewards/rejected": -0.8845779299736023, "step": 90 }, { "epoch": 0.65, "eta": 0.0010000000474974513, "grad_norm": 22.10929439369714, "learning_rate": 1.6300029195778453e-07, "logits/chosen": -2.2815823554992676, "logits/rejected": -1.9420466423034668, "logps/chosen": -328.23388671875, "logps/pi_response": -285.6993408203125, "logps/ref_response": -202.154541015625, "logps/rejected": -346.80718994140625, "loss": 0.5648, "rewards/accuracies": 0.6653845906257629, "rewards/chosen": -0.5251672863960266, "rewards/margins": 0.47274622321128845, "rewards/rejected": -0.9979135394096375, "step": 100 }, { "epoch": 0.71, "eta": 0.0010000000474974513, "grad_norm": 20.95321740793465, "learning_rate": 1.1200247470632392e-07, "logits/chosen": -2.211641311645508, "logits/rejected": -1.855459451675415, "logps/chosen": -360.8876953125, "logps/pi_response": -303.0977783203125, "logps/ref_response": -215.0885009765625, "logps/rejected": -370.98193359375, "loss": 0.563, "rewards/accuracies": 0.7423076629638672, "rewards/chosen": -0.47680747509002686, "rewards/margins": 0.5583351850509644, "rewards/rejected": -1.0351426601409912, "step": 110 }, { "epoch": 0.78, "eta": 0.0010000000474974513, "grad_norm": 21.53917118957897, "learning_rate": 6.822945986946385e-08, "logits/chosen": -1.8491864204406738, "logits/rejected": -1.6956101655960083, "logps/chosen": -344.0650939941406, "logps/pi_response": -307.6352844238281, "logps/ref_response": -204.07801818847656, "logps/rejected": -390.5289001464844, "loss": 0.5501, "rewards/accuracies": 0.6884615421295166, "rewards/chosen": -0.6317132711410522, "rewards/margins": 0.5379453301429749, "rewards/rejected": -1.1696586608886719, "step": 120 }, { "epoch": 0.84, "eta": 0.0010000000474974513, "grad_norm": 27.579199953433918, "learning_rate": 3.397296523427806e-08, "logits/chosen": -1.9508247375488281, "logits/rejected": -1.6159588098526, "logps/chosen": -333.64599609375, "logps/pi_response": -301.04547119140625, "logps/ref_response": -194.1094207763672, "logps/rejected": -385.6200256347656, "loss": 0.5332, "rewards/accuracies": 0.7038461565971375, "rewards/chosen": -0.6621810793876648, "rewards/margins": 0.6021292805671692, "rewards/rejected": -1.264310359954834, "step": 130 }, { "epoch": 0.91, "eta": 0.0010000000474974513, "grad_norm": 26.751074987142594, "learning_rate": 1.1026475173977978e-08, "logits/chosen": -1.9854283332824707, "logits/rejected": -1.720418930053711, "logps/chosen": -328.14459228515625, "logps/pi_response": -306.83367919921875, "logps/ref_response": -197.67745971679688, "logps/rejected": -386.437255859375, "loss": 0.5515, "rewards/accuracies": 0.7038461565971375, "rewards/chosen": -0.6694343686103821, "rewards/margins": 0.5615480542182922, "rewards/rejected": -1.2309825420379639, "step": 140 }, { "epoch": 0.97, "eta": 0.0010000000474974513, "grad_norm": 27.667206944684594, "learning_rate": 5.913435276374834e-10, "logits/chosen": -1.9676626920700073, "logits/rejected": -1.6368684768676758, "logps/chosen": -339.4317321777344, "logps/pi_response": -303.0911560058594, "logps/ref_response": -192.59991455078125, "logps/rejected": -375.9079895019531, "loss": 0.5599, "rewards/accuracies": 0.7192307710647583, "rewards/chosen": -0.6414641737937927, "rewards/margins": 0.5702866315841675, "rewards/rejected": -1.2117507457733154, "step": 150 }, { "epoch": 0.99, "step": 153, "total_flos": 0.0, "train_loss": 0.5998621676482406, "train_runtime": 41019.2972, "train_samples_per_second": 0.488, "train_steps_per_second": 0.004 } ], "logging_steps": 10, "max_steps": 153, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }