{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 225, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.1342281879194631, "grad_norm": 36.80321502685547, "kl": 0.20292969048023224, "learning_rate": 9e-08, "logits/chosen": -3776512.0, "logits/rejected": 1030144.0, "logps/chosen": -345.825, "logps/rejected": -421.15, "loss": 0.5017, "num_unsafe": 0.5, "rewards/chosen": -0.005364990234375, "rewards/margins": -0.013800048828125, "rewards/rejected": 0.00843505859375, "step": 10 }, { "epoch": 0.2684563758389262, "grad_norm": 39.34665298461914, "kl": 0.10800781100988388, "learning_rate": 1.8999999999999998e-07, "logits/chosen": 8857600.0, "logits/rejected": 8883404.8, "logps/chosen": -342.925, "logps/rejected": -384.2, "loss": 0.4963, "num_unsafe": 0.699999988079071, "rewards/chosen": 0.0006647109985351562, "rewards/margins": 0.030388832092285156, "rewards/rejected": -0.02972412109375, "step": 20 }, { "epoch": 0.40268456375838924, "grad_norm": 30.005146026611328, "kl": 0.03125, "learning_rate": 2.9e-07, "logits/chosen": 11644723.2, "logits/rejected": 16304537.6, "logps/chosen": -346.1, "logps/rejected": -386.15, "loss": 0.4873, "num_unsafe": 0.6499999761581421, "rewards/chosen": 0.016363525390625, "rewards/margins": 0.10656433105468749, "rewards/rejected": -0.0902008056640625, "step": 30 }, { "epoch": 0.5369127516778524, "grad_norm": 26.460351943969727, "kl": 0.0, "learning_rate": 3.8999999999999997e-07, "logits/chosen": 2756300.8, "logits/rejected": 3816652.8, "logps/chosen": -378.05, "logps/rejected": -397.2, "loss": 0.4541, "num_unsafe": 0.6000000238418579, "rewards/chosen": -0.070062255859375, "rewards/margins": 0.396392822265625, "rewards/rejected": -0.466455078125, "step": 40 }, { "epoch": 0.6711409395973155, "grad_norm": 20.090633392333984, "kl": 0.0, "learning_rate": 4.9e-07, "logits/chosen": 23447142.4, "logits/rejected": 26068582.4, "logps/chosen": -372.375, "logps/rejected": -427.3, "loss": 0.4115, "num_unsafe": 0.25, "rewards/chosen": -0.18045654296875, "rewards/margins": 0.9100219726562501, "rewards/rejected": -1.090478515625, "step": 50 }, { "epoch": 0.8053691275167785, "grad_norm": 29.12238121032715, "kl": 0.0, "learning_rate": 5.9e-07, "logits/chosen": 5931827.2, "logits/rejected": 13681459.2, "logps/chosen": -318.65, "logps/rejected": -371.35, "loss": 0.3601, "num_unsafe": 0.4000000059604645, "rewards/chosen": 0.175537109375, "rewards/margins": 1.456591796875, "rewards/rejected": -1.2810546875, "step": 60 }, { "epoch": 0.9395973154362416, "grad_norm": 13.03518009185791, "kl": 0.0, "learning_rate": 6.9e-07, "logits/chosen": 5972787.2, "logits/rejected": 2752512.0, "logps/chosen": -301.875, "logps/rejected": -392.0, "loss": 0.2818, "num_unsafe": 0.6000000238418579, "rewards/chosen": 0.56591796875, "rewards/margins": 2.639208984375, "rewards/rejected": -2.073291015625, "step": 70 }, { "epoch": 1.0671140939597314, "grad_norm": 19.125137329101562, "kl": 0.00657894741743803, "learning_rate": 7.9e-07, "logits/chosen": 4748773.052631579, "logits/rejected": 10441135.157894736, "logps/chosen": -334.7631578947368, "logps/rejected": -413.6842105263158, "loss": 0.2538, "num_unsafe": 0.5789473652839661, "rewards/chosen": 0.8779296875, "rewards/margins": 3.0099198190789473, "rewards/rejected": -2.1319901315789473, "step": 80 }, { "epoch": 1.2013422818791946, "grad_norm": 21.394241333007812, "kl": 0.18906250596046448, "learning_rate": 8.9e-07, "logits/chosen": 8029593.6, "logits/rejected": 4953088.0, "logps/chosen": -333.6625, "logps/rejected": -412.95, "loss": 0.209, "num_unsafe": 0.699999988079071, "rewards/chosen": 1.444970703125, "rewards/margins": 3.844580078125, "rewards/rejected": -2.399609375, "step": 90 }, { "epoch": 1.3355704697986577, "grad_norm": 8.02198314666748, "kl": 0.20468750596046448, "learning_rate": 9.9e-07, "logits/chosen": 8033792.0, "logits/rejected": 13608140.8, "logps/chosen": -271.025, "logps/rejected": -416.6, "loss": 0.1474, "num_unsafe": 0.6000000238418579, "rewards/chosen": 2.274462890625, "rewards/margins": 5.532470703125, "rewards/rejected": -3.2580078125, "step": 100 }, { "epoch": 1.4697986577181208, "grad_norm": 8.773207664489746, "kl": 0.3812499940395355, "learning_rate": 9.872634363932886e-07, "logits/chosen": 5904793.6, "logits/rejected": 10031923.2, "logps/chosen": -401.7, "logps/rejected": -434.35, "loss": 0.1696, "num_unsafe": 0.550000011920929, "rewards/chosen": 2.0365234375, "rewards/margins": 5.1421875, "rewards/rejected": -3.1056640625, "step": 110 }, { "epoch": 1.604026845637584, "grad_norm": 2.1352667808532715, "kl": 0.725781261920929, "learning_rate": 9.440682244067722e-07, "logits/chosen": 19757875.2, "logits/rejected": 22788505.6, "logps/chosen": -301.825, "logps/rejected": -432.2, "loss": 0.1174, "num_unsafe": 0.5, "rewards/chosen": 2.513671875, "rewards/margins": 6.514453125, "rewards/rejected": -4.00078125, "step": 120 }, { "epoch": 1.738255033557047, "grad_norm": 3.652601480484009, "kl": 0.5015624761581421, "learning_rate": 8.729705727120911e-07, "logits/chosen": 17581260.8, "logits/rejected": 17930649.6, "logps/chosen": -337.55, "logps/rejected": -427.85, "loss": 0.1357, "num_unsafe": 0.25, "rewards/chosen": 2.494921875, "rewards/margins": 6.15390625, "rewards/rejected": -3.658984375, "step": 130 }, { "epoch": 1.87248322147651, "grad_norm": 2.743739604949951, "kl": 1.01171875, "learning_rate": 7.78437808244094e-07, "logits/chosen": 2695168.0, "logits/rejected": -2059059.2, "logps/chosen": -287.05, "logps/rejected": -410.7, "loss": 0.1098, "num_unsafe": 0.550000011920929, "rewards/chosen": 2.70234375, "rewards/margins": 6.565234374999999, "rewards/rejected": -3.862890625, "step": 140 }, { "epoch": 2.0, "grad_norm": 1.1540242433547974, "kl": 0.6759868264198303, "learning_rate": 6.664097722614933e-07, "logits/chosen": 9126534.736842105, "logits/rejected": 11134652.631578946, "logps/chosen": -306.94736842105266, "logps/rejected": -413.3157894736842, "loss": 0.0889, "num_unsafe": 0.6315789222717285, "rewards/chosen": 3.2284128289473686, "rewards/margins": 7.489103618421053, "rewards/rejected": -4.260690789473684, "step": 150 }, { "epoch": 2.134228187919463, "grad_norm": 3.1307098865509033, "kl": 1.0421874523162842, "learning_rate": 5.439255982753717e-07, "logits/chosen": -1861222.4, "logits/rejected": 2562252.8, "logps/chosen": -312.7125, "logps/rejected": -468.6, "loss": 0.0885, "num_unsafe": 0.5, "rewards/chosen": 3.3125, "rewards/margins": 8.0171875, "rewards/rejected": -4.7046875, "step": 160 }, { "epoch": 2.2684563758389262, "grad_norm": 1.1788336038589478, "kl": 0.06875000149011612, "learning_rate": 4.1868141740255817e-07, "logits/chosen": 9242316.8, "logits/rejected": 9269657.6, "logps/chosen": -309.7, "logps/rejected": -428.95, "loss": 0.0865, "num_unsafe": 0.699999988079071, "rewards/chosen": 3.3330078125, "rewards/margins": 7.8982421875, "rewards/rejected": -4.565234375, "step": 170 }, { "epoch": 2.402684563758389, "grad_norm": 1.8654648065567017, "kl": 0.515625, "learning_rate": 2.985467821431687e-07, "logits/chosen": 11762073.6, "logits/rejected": 16133324.8, "logps/chosen": -313.075, "logps/rejected": -435.2, "loss": 0.0958, "num_unsafe": 0.6499999761581421, "rewards/chosen": 3.34765625, "rewards/margins": 8.335546875, "rewards/rejected": -4.987890625, "step": 180 }, { "epoch": 2.5369127516778525, "grad_norm": 2.7622787952423096, "kl": 0.6656249761581421, "learning_rate": 1.9107019345483288e-07, "logits/chosen": 3596492.8, "logits/rejected": 4794982.4, "logps/chosen": -345.0, "logps/rejected": -441.7, "loss": 0.0803, "num_unsafe": 0.6000000238418579, "rewards/chosen": 3.247265625, "rewards/margins": 8.13984375, "rewards/rejected": -4.892578125, "step": 190 }, { "epoch": 2.6711409395973154, "grad_norm": 1.9276018142700195, "kl": 0.22812500596046448, "learning_rate": 1.030048006760823e-07, "logits/chosen": 22071296.0, "logits/rejected": 22795059.2, "logps/chosen": -341.225, "logps/rejected": -464.8, "loss": 0.0984, "num_unsafe": 0.25, "rewards/chosen": 2.9384765625, "rewards/margins": 7.8150390625, "rewards/rejected": -4.8765625, "step": 200 }, { "epoch": 2.8053691275167782, "grad_norm": 1.9921404123306274, "kl": 0.30156248807907104, "learning_rate": 3.9884076317064807e-08, "logits/chosen": 4773068.8, "logits/rejected": 10355916.8, "logps/chosen": -291.175, "logps/rejected": -402.8, "loss": 0.1062, "num_unsafe": 0.4000000059604645, "rewards/chosen": 2.92861328125, "rewards/margins": 7.34970703125, "rewards/rejected": -4.42109375, "step": 210 }, { "epoch": 2.9395973154362416, "grad_norm": 2.2024121284484863, "kl": 0.7749999761581421, "learning_rate": 5.674127631043024e-09, "logits/chosen": 4405657.6, "logits/rejected": -535833.6, "logps/chosen": -272.775, "logps/rejected": -421.8, "loss": 0.0643, "num_unsafe": 0.6000000238418579, "rewards/chosen": 3.49609375, "rewards/margins": 8.56484375, "rewards/rejected": -5.06875, "step": 220 } ], "logging_steps": 10, "max_steps": 225, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 300.0, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }