{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9959925193694897, "eval_steps": 400, "global_step": 233, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.02137323002938819, "grad_norm": 7.331315052451733, "learning_rate": 2.0833333333333333e-07, "logits/chosen": -0.94921875, "logits/rejected": -0.90625, "logps/chosen": -0.28515625, "logps/rejected": -0.2890625, "loss": 1.5981, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -0.71484375, "rewards/margins": 0.0072021484375, "rewards/rejected": -0.72265625, "step": 5 }, { "epoch": 0.04274646005877638, "grad_norm": 16.131056100838176, "learning_rate": 4.1666666666666667e-07, "logits/chosen": -0.8984375, "logits/rejected": -0.921875, "logps/chosen": -0.3046875, "logps/rejected": -0.330078125, "loss": 1.5742, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.76171875, "rewards/margins": 0.06494140625, "rewards/rejected": -0.828125, "step": 10 }, { "epoch": 0.06411969008816458, "grad_norm": 9.687597283790334, "learning_rate": 6.249999999999999e-07, "logits/chosen": -1.0, "logits/rejected": -0.9296875, "logps/chosen": -0.296875, "logps/rejected": -0.31640625, "loss": 1.5803, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -0.7421875, "rewards/margins": 0.048828125, "rewards/rejected": -0.7890625, "step": 15 }, { "epoch": 0.08549292011755276, "grad_norm": 7.802540958166764, "learning_rate": 8.333333333333333e-07, "logits/chosen": -0.99609375, "logits/rejected": -0.9375, "logps/chosen": -0.283203125, "logps/rejected": -0.302734375, "loss": 1.5668, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.7109375, "rewards/margins": 0.04443359375, "rewards/rejected": -0.75390625, "step": 20 }, { "epoch": 0.10686615014694095, "grad_norm": 11.510345597750495, "learning_rate": 9.999435142363483e-07, "logits/chosen": -0.95703125, "logits/rejected": -0.91796875, "logps/chosen": -0.306640625, "logps/rejected": -0.326171875, "loss": 1.5727, "rewards/accuracies": 0.46875, "rewards/chosen": -0.765625, "rewards/margins": 0.048583984375, "rewards/rejected": -0.81640625, "step": 25 }, { "epoch": 0.12823938017632916, "grad_norm": 13.270905394176456, "learning_rate": 9.97967852255038e-07, "logits/chosen": -0.98828125, "logits/rejected": -0.97265625, "logps/chosen": -0.3046875, "logps/rejected": -0.34765625, "loss": 1.5622, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.76171875, "rewards/margins": 0.10791015625, "rewards/rejected": -0.87109375, "step": 30 }, { "epoch": 0.14961261020571734, "grad_norm": 7.551741305333013, "learning_rate": 9.931806517013612e-07, "logits/chosen": -0.93359375, "logits/rejected": -0.9140625, "logps/chosen": -0.3671875, "logps/rejected": -0.447265625, "loss": 1.5496, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.91796875, "rewards/margins": 0.2001953125, "rewards/rejected": -1.1171875, "step": 35 }, { "epoch": 0.17098584023510552, "grad_norm": 9.836362551399633, "learning_rate": 9.856089412257604e-07, "logits/chosen": -0.9453125, "logits/rejected": -0.91796875, "logps/chosen": -0.330078125, "logps/rejected": -0.37109375, "loss": 1.5409, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.82421875, "rewards/margins": 0.1044921875, "rewards/rejected": -0.9296875, "step": 40 }, { "epoch": 0.19235907026449373, "grad_norm": 8.767581425134471, "learning_rate": 9.752954708892377e-07, "logits/chosen": -1.015625, "logits/rejected": -0.9609375, "logps/chosen": -0.38671875, "logps/rejected": -0.42578125, "loss": 1.5454, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.96875, "rewards/margins": 0.09423828125, "rewards/rejected": -1.0625, "step": 45 }, { "epoch": 0.2137323002938819, "grad_norm": 10.8544145228066, "learning_rate": 9.62298470795473e-07, "logits/chosen": -1.0078125, "logits/rejected": -0.984375, "logps/chosen": -0.3984375, "logps/rejected": -0.421875, "loss": 1.5737, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.9921875, "rewards/margins": 0.061767578125, "rewards/rejected": -1.0546875, "step": 50 }, { "epoch": 0.2351055303232701, "grad_norm": 10.212862305253141, "learning_rate": 9.466913223222465e-07, "logits/chosen": -0.97265625, "logits/rejected": -0.92578125, "logps/chosen": -0.35546875, "logps/rejected": -0.4609375, "loss": 1.5398, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.890625, "rewards/margins": 0.263671875, "rewards/rejected": -1.1484375, "step": 55 }, { "epoch": 0.2564787603526583, "grad_norm": 12.631724574842627, "learning_rate": 9.285621438083997e-07, "logits/chosen": -1.015625, "logits/rejected": -0.92578125, "logps/chosen": -0.38671875, "logps/rejected": -0.48828125, "loss": 1.504, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.96484375, "rewards/margins": 0.255859375, "rewards/rejected": -1.21875, "step": 60 }, { "epoch": 0.2778519903820465, "grad_norm": 10.0800410336527, "learning_rate": 9.080132930355566e-07, "logits/chosen": -1.046875, "logits/rejected": -1.0234375, "logps/chosen": -0.423828125, "logps/rejected": -0.5546875, "loss": 1.4833, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.0625, "rewards/margins": 0.326171875, "rewards/rejected": -1.3828125, "step": 65 }, { "epoch": 0.2992252204114347, "grad_norm": 11.90187451004937, "learning_rate": 8.851607893136064e-07, "logits/chosen": -1.0625, "logits/rejected": -1.0, "logps/chosen": -0.47265625, "logps/rejected": -0.5390625, "loss": 1.5116, "rewards/accuracies": 0.625, "rewards/chosen": -1.1796875, "rewards/margins": 0.1669921875, "rewards/rejected": -1.3515625, "step": 70 }, { "epoch": 0.32059845044082286, "grad_norm": 10.51424506148012, "learning_rate": 8.601336584328658e-07, "logits/chosen": -0.9296875, "logits/rejected": -0.91015625, "logps/chosen": -0.46484375, "logps/rejected": -0.6015625, "loss": 1.4666, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.1640625, "rewards/margins": 0.34375, "rewards/rejected": -1.5078125, "step": 75 }, { "epoch": 0.34197168047021104, "grad_norm": 11.719501574810968, "learning_rate": 8.330732041813366e-07, "logits/chosen": -0.984375, "logits/rejected": -0.96484375, "logps/chosen": -0.486328125, "logps/rejected": -0.625, "loss": 1.4657, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.2109375, "rewards/margins": 0.345703125, "rewards/rejected": -1.5625, "step": 80 }, { "epoch": 0.36334491049959927, "grad_norm": 15.388364264984391, "learning_rate": 8.041322105400921e-07, "logits/chosen": -0.9765625, "logits/rejected": -0.921875, "logps/chosen": -0.55078125, "logps/rejected": -0.77734375, "loss": 1.4418, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.3828125, "rewards/margins": 0.5625, "rewards/rejected": -1.9453125, "step": 85 }, { "epoch": 0.38471814052898745, "grad_norm": 16.233063949950242, "learning_rate": 7.734740790612136e-07, "logits/chosen": -0.953125, "logits/rejected": -0.90625, "logps/chosen": -0.6484375, "logps/rejected": -0.79296875, "loss": 1.395, "rewards/accuracies": 0.6875, "rewards/chosen": -1.625, "rewards/margins": 0.359375, "rewards/rejected": -1.984375, "step": 90 }, { "epoch": 0.40609137055837563, "grad_norm": 11.872631923458862, "learning_rate": 7.412719062986631e-07, "logits/chosen": -1.0078125, "logits/rejected": -0.96875, "logps/chosen": -0.68359375, "logps/rejected": -0.90234375, "loss": 1.3857, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.7109375, "rewards/margins": 0.54296875, "rewards/rejected": -2.25, "step": 95 }, { "epoch": 0.4274646005877638, "grad_norm": 22.45134823888932, "learning_rate": 7.077075065009433e-07, "logits/chosen": -0.86328125, "logits/rejected": -0.86328125, "logps/chosen": -0.75, "logps/rejected": -0.984375, "loss": 1.3902, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.875, "rewards/margins": 0.5859375, "rewards/rejected": -2.453125, "step": 100 }, { "epoch": 0.448837830617152, "grad_norm": 18.006604817685755, "learning_rate": 6.72970385083438e-07, "logits/chosen": -0.9296875, "logits/rejected": -0.890625, "logps/chosen": -0.8046875, "logps/rejected": -0.99609375, "loss": 1.3847, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.015625, "rewards/margins": 0.4765625, "rewards/rejected": -2.484375, "step": 105 }, { "epoch": 0.4702110606465402, "grad_norm": 16.753267675250456, "learning_rate": 6.372566686762426e-07, "logits/chosen": -0.8515625, "logits/rejected": -0.81640625, "logps/chosen": -0.9375, "logps/rejected": -1.1953125, "loss": 1.2934, "rewards/accuracies": 0.71875, "rewards/chosen": -2.34375, "rewards/margins": 0.64453125, "rewards/rejected": -2.984375, "step": 110 }, { "epoch": 0.4915842906759284, "grad_norm": 18.88632199685095, "learning_rate": 6.00767997788451e-07, "logits/chosen": -0.91015625, "logits/rejected": -0.875, "logps/chosen": -1.09375, "logps/rejected": -1.3359375, "loss": 1.2884, "rewards/accuracies": 0.6875, "rewards/chosen": -2.734375, "rewards/margins": 0.59765625, "rewards/rejected": -3.34375, "step": 115 }, { "epoch": 0.5129575207053166, "grad_norm": 20.87844659691919, "learning_rate": 5.637103883409525e-07, "logits/chosen": -0.89453125, "logits/rejected": -0.90625, "logps/chosen": -1.265625, "logps/rejected": -1.609375, "loss": 1.2416, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -3.15625, "rewards/margins": 0.8671875, "rewards/rejected": -4.03125, "step": 120 }, { "epoch": 0.5343307507347048, "grad_norm": 19.2002205664862, "learning_rate": 5.262930684955438e-07, "logits/chosen": -0.890625, "logits/rejected": -0.84375, "logps/chosen": -1.3046875, "logps/rejected": -1.7578125, "loss": 1.179, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -3.25, "rewards/margins": 1.1328125, "rewards/rejected": -4.40625, "step": 125 }, { "epoch": 0.555703980764093, "grad_norm": 21.938475092817775, "learning_rate": 4.88727297347654e-07, "logits/chosen": -0.8203125, "logits/rejected": -0.7890625, "logps/chosen": -1.484375, "logps/rejected": -1.9921875, "loss": 1.2199, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -3.703125, "rewards/margins": 1.2734375, "rewards/rejected": -4.96875, "step": 130 }, { "epoch": 0.5770772107934812, "grad_norm": 35.15980683434337, "learning_rate": 4.512251721523659e-07, "logits/chosen": -0.80078125, "logits/rejected": -0.7734375, "logps/chosen": -1.703125, "logps/rejected": -2.140625, "loss": 1.1432, "rewards/accuracies": 0.75, "rewards/chosen": -4.25, "rewards/margins": 1.1015625, "rewards/rejected": -5.375, "step": 135 }, { "epoch": 0.5984504408228694, "grad_norm": 31.78956774132237, "learning_rate": 4.139984308181708e-07, "logits/chosen": -0.68359375, "logits/rejected": -0.6328125, "logps/chosen": -1.765625, "logps/rejected": -2.140625, "loss": 1.1473, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -4.40625, "rewards/margins": 0.94140625, "rewards/rejected": -5.34375, "step": 140 }, { "epoch": 0.6198236708522575, "grad_norm": 24.493239330333395, "learning_rate": 3.772572564296004e-07, "logits/chosen": -0.671875, "logits/rejected": -0.6015625, "logps/chosen": -2.09375, "logps/rejected": -2.578125, "loss": 1.113, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -5.25, "rewards/margins": 1.1953125, "rewards/rejected": -6.4375, "step": 145 }, { "epoch": 0.6411969008816457, "grad_norm": 41.1151110363947, "learning_rate": 3.412090905484337e-07, "logits/chosen": -0.5234375, "logits/rejected": -0.48046875, "logps/chosen": -2.234375, "logps/rejected": -2.65625, "loss": 1.1182, "rewards/accuracies": 0.71875, "rewards/chosen": -5.59375, "rewards/margins": 1.0625, "rewards/rejected": -6.65625, "step": 150 }, { "epoch": 0.6625701309110339, "grad_norm": 34.96661272078944, "learning_rate": 3.060574619936075e-07, "logits/chosen": -0.57421875, "logits/rejected": -0.53125, "logps/chosen": -2.1875, "logps/rejected": -2.578125, "loss": 1.1116, "rewards/accuracies": 0.71875, "rewards/chosen": -5.46875, "rewards/margins": 0.984375, "rewards/rejected": -6.46875, "step": 155 }, { "epoch": 0.6839433609404221, "grad_norm": 30.64693471267594, "learning_rate": 2.720008377125682e-07, "logits/chosen": -0.65234375, "logits/rejected": -0.6171875, "logps/chosen": -2.3125, "logps/rejected": -2.859375, "loss": 1.0848, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -5.78125, "rewards/margins": 1.375, "rewards/rejected": -7.15625, "step": 160 }, { "epoch": 0.7053165909698104, "grad_norm": 30.547513130259837, "learning_rate": 2.3923150223207173e-07, "logits/chosen": -0.478515625, "logits/rejected": -0.427734375, "logps/chosen": -2.390625, "logps/rejected": -2.984375, "loss": 1.0943, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -5.96875, "rewards/margins": 1.484375, "rewards/rejected": -7.46875, "step": 165 }, { "epoch": 0.7266898209991985, "grad_norm": 34.11282517597963, "learning_rate": 2.0793447201508286e-07, "logits/chosen": -0.57421875, "logits/rejected": -0.51953125, "logps/chosen": -2.359375, "logps/rejected": -3.125, "loss": 1.0601, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -5.90625, "rewards/margins": 1.921875, "rewards/rejected": -7.8125, "step": 170 }, { "epoch": 0.7480630510285867, "grad_norm": 40.72462060392372, "learning_rate": 1.7828645085333644e-07, "logits/chosen": -0.55078125, "logits/rejected": -0.5625, "logps/chosen": -2.59375, "logps/rejected": -3.21875, "loss": 1.0617, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -6.46875, "rewards/margins": 1.5546875, "rewards/rejected": -8.0625, "step": 175 }, { "epoch": 0.7694362810579749, "grad_norm": 35.91258416715483, "learning_rate": 1.5045483219344385e-07, "logits/chosen": -0.5859375, "logits/rejected": -0.62109375, "logps/chosen": -2.515625, "logps/rejected": -3.234375, "loss": 1.0314, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -6.28125, "rewards/margins": 1.7890625, "rewards/rejected": -8.0625, "step": 180 }, { "epoch": 0.7908095110873631, "grad_norm": 40.36681084696311, "learning_rate": 1.2459675402943288e-07, "logits/chosen": -0.57421875, "logits/rejected": -0.5078125, "logps/chosen": -2.65625, "logps/rejected": -3.171875, "loss": 1.0645, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -6.65625, "rewards/margins": 1.296875, "rewards/rejected": -7.9375, "step": 185 }, { "epoch": 0.8121827411167513, "grad_norm": 34.60643699666399, "learning_rate": 1.0085821169782199e-07, "logits/chosen": -0.55078125, "logits/rejected": -0.55078125, "logps/chosen": -2.6875, "logps/rejected": -3.34375, "loss": 1.0308, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -6.75, "rewards/margins": 1.640625, "rewards/rejected": -8.375, "step": 190 }, { "epoch": 0.8335559711461394, "grad_norm": 38.94834120560286, "learning_rate": 7.937323358440934e-08, "logits/chosen": -0.515625, "logits/rejected": -0.53125, "logps/chosen": -2.59375, "logps/rejected": -3.359375, "loss": 1.018, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -6.46875, "rewards/margins": 1.9296875, "rewards/rejected": -8.375, "step": 195 }, { "epoch": 0.8549292011755276, "grad_norm": 33.86005508761931, "learning_rate": 6.026312439675551e-08, "logits/chosen": -0.5625, "logits/rejected": -0.5234375, "logps/chosen": -2.671875, "logps/rejected": -3.3125, "loss": 1.031, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -6.65625, "rewards/margins": 1.59375, "rewards/rejected": -8.25, "step": 200 }, { "epoch": 0.8763024312049158, "grad_norm": 35.363713916853904, "learning_rate": 4.3635780274861864e-08, "logits/chosen": -0.4765625, "logits/rejected": -0.4765625, "logps/chosen": -2.734375, "logps/rejected": -3.328125, "loss": 1.0528, "rewards/accuracies": 0.75, "rewards/chosen": -6.84375, "rewards/margins": 1.484375, "rewards/rejected": -8.3125, "step": 205 }, { "epoch": 0.897675661234304, "grad_norm": 35.71370500438308, "learning_rate": 2.958507960694784e-08, "logits/chosen": -0.50390625, "logits/rejected": -0.470703125, "logps/chosen": -2.484375, "logps/rejected": -3.296875, "loss": 1.0286, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -6.1875, "rewards/margins": 2.03125, "rewards/rejected": -8.25, "step": 210 }, { "epoch": 0.9190488912636923, "grad_norm": 33.989939318779676, "learning_rate": 1.8190352989793322e-08, "logits/chosen": -0.5390625, "logits/rejected": -0.52734375, "logps/chosen": -2.6875, "logps/rejected": -3.421875, "loss": 1.0714, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -6.71875, "rewards/margins": 1.859375, "rewards/rejected": -8.5625, "step": 215 }, { "epoch": 0.9404221212930804, "grad_norm": 30.604992224663434, "learning_rate": 9.515935326265378e-09, "logits/chosen": -0.51953125, "logits/rejected": -0.462890625, "logps/chosen": -2.625, "logps/rejected": -3.203125, "loss": 1.0149, "rewards/accuracies": 0.78125, "rewards/chosen": -6.59375, "rewards/margins": 1.421875, "rewards/rejected": -8.0, "step": 220 }, { "epoch": 0.9617953513224686, "grad_norm": 32.491153001452076, "learning_rate": 3.6108025888958447e-09, "logits/chosen": -0.54296875, "logits/rejected": -0.515625, "logps/chosen": -2.625, "logps/rejected": -3.296875, "loss": 1.0096, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -6.5625, "rewards/margins": 1.6796875, "rewards/rejected": -8.25, "step": 225 }, { "epoch": 0.9831685813518568, "grad_norm": 33.176623047011205, "learning_rate": 5.082953003528456e-10, "logits/chosen": -0.5546875, "logits/rejected": -0.54296875, "logps/chosen": -2.625, "logps/rejected": -3.3125, "loss": 0.9971, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -6.5625, "rewards/margins": 1.7265625, "rewards/rejected": -8.3125, "step": 230 }, { "epoch": 0.9959925193694897, "step": 233, "total_flos": 0.0, "train_loss": 1.2805403189597724, "train_runtime": 5384.1773, "train_samples_per_second": 11.121, "train_steps_per_second": 0.043 } ], "logging_steps": 5, "max_steps": 233, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 1000000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }