{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.255639097744361, "eval_steps": 200, "global_step": 600, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.07518796992481203, "grad_norm": 0.00031717625653774825, "learning_rate": 4.906015037593986e-06, "logits/chosen": -3.08984375, "logits/rejected": -3.359375, "logps/chosen": -759.5999755859375, "logps/rejected": -601.7999877929688, "loss": 0.0505, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 3.9212889671325684, "rewards/margins": 19.23046875, "rewards/rejected": -15.315332412719727, "step": 20 }, { "epoch": 0.15037593984962405, "grad_norm": 2.5674383538843824e-13, "learning_rate": 4.81203007518797e-06, "logits/chosen": -3.37890625, "logits/rejected": -3.901562452316284, "logps/chosen": -797.4000244140625, "logps/rejected": -834.2000122070312, "loss": 0.0018, "rewards/accuracies": 0.9984375238418579, "rewards/chosen": 0.5521484613418579, "rewards/margins": 38.98125076293945, "rewards/rejected": -38.412498474121094, "step": 40 }, { "epoch": 0.22556390977443608, "grad_norm": 4.184555188916154e-12, "learning_rate": 4.718045112781955e-06, "logits/chosen": -3.237499952316284, "logits/rejected": -4.165625095367432, "logps/chosen": -774.5999755859375, "logps/rejected": -944.7999877929688, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 3.255664110183716, "rewards/margins": 53.625, "rewards/rejected": -50.36249923706055, "step": 60 }, { "epoch": 0.3007518796992481, "grad_norm": 3.674299650107461e-07, "learning_rate": 4.62406015037594e-06, "logits/chosen": -3.0601563453674316, "logits/rejected": -4.301562309265137, "logps/chosen": -744.2000122070312, "logps/rejected": -968.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 6.635937690734863, "rewards/margins": 58.375, "rewards/rejected": -51.724998474121094, "step": 80 }, { "epoch": 0.37593984962406013, "grad_norm": 2.9469468290533765e-13, "learning_rate": 4.530075187969925e-06, "logits/chosen": -3.035937547683716, "logits/rejected": -4.3359375, "logps/chosen": -754.0, "logps/rejected": -981.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 6.212500095367432, "rewards/margins": 58.912498474121094, "rewards/rejected": -52.724998474121094, "step": 100 }, { "epoch": 0.45112781954887216, "grad_norm": 2.450421719466682e-10, "learning_rate": 4.43609022556391e-06, "logits/chosen": -3.0367188453674316, "logits/rejected": -4.3203125, "logps/chosen": -740.0, "logps/rejected": -961.7999877929688, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 6.285937309265137, "rewards/margins": 57.474998474121094, "rewards/rejected": -51.1875, "step": 120 }, { "epoch": 0.5263157894736842, "grad_norm": 1.6351303999698253e-10, "learning_rate": 4.342105263157895e-06, "logits/chosen": -3.0234375, "logits/rejected": -4.3046875, "logps/chosen": -754.0, "logps/rejected": -969.4000244140625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 6.207812309265137, "rewards/margins": 57.662498474121094, "rewards/rejected": -51.412498474121094, "step": 140 }, { "epoch": 0.6015037593984962, "grad_norm": 1.5915083665281199e-09, "learning_rate": 4.24812030075188e-06, "logits/chosen": -3.03125, "logits/rejected": -4.318749904632568, "logps/chosen": -748.2000122070312, "logps/rejected": -958.7999877929688, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 6.248437404632568, "rewards/margins": 57.525001525878906, "rewards/rejected": -51.25, "step": 160 }, { "epoch": 0.6766917293233082, "grad_norm": 4.297014302694241e-12, "learning_rate": 4.1541353383458646e-06, "logits/chosen": -3.0406250953674316, "logits/rejected": -4.279687404632568, "logps/chosen": -741.4000244140625, "logps/rejected": -961.4000244140625, "loss": 0.0132, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": 6.785937309265137, "rewards/margins": 57.4375, "rewards/rejected": -50.625, "step": 180 }, { "epoch": 0.7518796992481203, "grad_norm": 9.095973454791659e-12, "learning_rate": 4.06015037593985e-06, "logits/chosen": -3.055468797683716, "logits/rejected": -4.318749904632568, "logps/chosen": -733.0, "logps/rejected": -968.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 7.356249809265137, "rewards/margins": 59.5625, "rewards/rejected": -52.224998474121094, "step": 200 }, { "epoch": 0.7518796992481203, "eval_logits/chosen": -3.058178186416626, "eval_logits/rejected": -4.299867153167725, "eval_logps/chosen": -741.7021484375, "eval_logps/rejected": -972.0850830078125, "eval_loss": 2.180002622864663e-09, "eval_rewards/accuracies": 1.0, "eval_rewards/chosen": 7.262632846832275, "eval_rewards/margins": 58.87765884399414, "eval_rewards/rejected": -51.62765884399414, "eval_runtime": 8.4937, "eval_samples_per_second": 176.601, "eval_score": -0.6606304049491882, "eval_steps_per_second": 5.533, "step": 200 }, { "epoch": 0.8270676691729323, "grad_norm": 2.3694597465927517e-14, "learning_rate": 3.966165413533835e-06, "logits/chosen": -3.057812452316284, "logits/rejected": -4.2890625, "logps/chosen": -739.0, "logps/rejected": -973.4000244140625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 7.339062690734863, "rewards/margins": 58.67499923706055, "rewards/rejected": -51.32500076293945, "step": 220 }, { "epoch": 0.9022556390977443, "grad_norm": 4.7239515256796625e-09, "learning_rate": 3.87218045112782e-06, "logits/chosen": -3.0648436546325684, "logits/rejected": -4.301562309265137, "logps/chosen": -739.4000244140625, "logps/rejected": -980.4000244140625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 7.296875, "rewards/margins": 59.57500076293945, "rewards/rejected": -52.25, "step": 240 }, { "epoch": 0.9774436090225563, "grad_norm": 1.1010824468107971e-09, "learning_rate": 3.778195488721805e-06, "logits/chosen": -3.059375047683716, "logits/rejected": -4.317187309265137, "logps/chosen": -730.0, "logps/rejected": -975.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 7.485937595367432, "rewards/margins": 60.162498474121094, "rewards/rejected": -52.67499923706055, "step": 260 }, { "epoch": 1.0526315789473684, "grad_norm": 5.510844973332184e-09, "learning_rate": 3.6842105263157896e-06, "logits/chosen": -3.06640625, "logits/rejected": -4.3046875, "logps/chosen": -736.4000244140625, "logps/rejected": -961.5999755859375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 7.443749904632568, "rewards/margins": 58.42499923706055, "rewards/rejected": -51.025001525878906, "step": 280 }, { "epoch": 1.1278195488721805, "grad_norm": 1.0805968862929627e-08, "learning_rate": 3.590225563909775e-06, "logits/chosen": -3.067187547683716, "logits/rejected": -4.329687595367432, "logps/chosen": -737.0, "logps/rejected": -978.2000122070312, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 7.373437404632568, "rewards/margins": 59.849998474121094, "rewards/rejected": -52.474998474121094, "step": 300 }, { "epoch": 1.2030075187969924, "grad_norm": 1.8280615029499892e-13, "learning_rate": 3.4962406015037596e-06, "logits/chosen": -3.063281297683716, "logits/rejected": -4.317187309265137, "logps/chosen": -739.2000122070312, "logps/rejected": -977.2000122070312, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 7.349999904632568, "rewards/margins": 59.67499923706055, "rewards/rejected": -52.337501525878906, "step": 320 }, { "epoch": 1.2781954887218046, "grad_norm": 8.685091807936255e-12, "learning_rate": 3.4022556390977448e-06, "logits/chosen": -3.057812452316284, "logits/rejected": -4.293749809265137, "logps/chosen": -740.4000244140625, "logps/rejected": -961.2000122070312, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 7.3203125, "rewards/margins": 58.587501525878906, "rewards/rejected": -51.25, "step": 340 }, { "epoch": 1.3533834586466165, "grad_norm": 1.6244164681709312e-07, "learning_rate": 3.3082706766917295e-06, "logits/chosen": -3.059375047683716, "logits/rejected": -4.295312404632568, "logps/chosen": -738.2000122070312, "logps/rejected": -958.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 7.278124809265137, "rewards/margins": 58.337501525878906, "rewards/rejected": -51.04999923706055, "step": 360 }, { "epoch": 1.4285714285714286, "grad_norm": 3.8272192733826815e-13, "learning_rate": 3.2142857142857147e-06, "logits/chosen": -3.0648436546325684, "logits/rejected": -4.318749904632568, "logps/chosen": -731.0, "logps/rejected": -984.4000244140625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 7.357812404632568, "rewards/margins": 60.38750076293945, "rewards/rejected": -53.04999923706055, "step": 380 }, { "epoch": 1.5037593984962405, "grad_norm": 6.450130800887369e-09, "learning_rate": 3.1203007518796995e-06, "logits/chosen": -3.0570311546325684, "logits/rejected": -4.317187309265137, "logps/chosen": -735.5999755859375, "logps/rejected": -965.5999755859375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 7.471875190734863, "rewards/margins": 59.13750076293945, "rewards/rejected": -51.650001525878906, "step": 400 }, { "epoch": 1.5037593984962405, "eval_logits/chosen": -3.0611701011657715, "eval_logits/rejected": -4.303191661834717, "eval_logps/chosen": -741.872314453125, "eval_logps/rejected": -972.85107421875, "eval_loss": 1.5668466524232372e-09, "eval_rewards/accuracies": 1.0, "eval_rewards/chosen": 7.266622543334961, "eval_rewards/margins": 58.98404312133789, "eval_rewards/rejected": -51.70744705200195, "eval_runtime": 8.4851, "eval_samples_per_second": 176.78, "eval_score": -0.6956531405448914, "eval_steps_per_second": 5.539, "step": 400 }, { "epoch": 1.5789473684210527, "grad_norm": 1.5739247094468196e-10, "learning_rate": 3.0263157894736843e-06, "logits/chosen": -3.063281297683716, "logits/rejected": -4.301562309265137, "logps/chosen": -732.5999755859375, "logps/rejected": -961.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 7.067187309265137, "rewards/margins": 58.025001525878906, "rewards/rejected": -50.95000076293945, "step": 420 }, { "epoch": 1.6541353383458648, "grad_norm": 5.037244548596992e-09, "learning_rate": 2.9323308270676694e-06, "logits/chosen": -3.0679688453674316, "logits/rejected": -4.328125, "logps/chosen": -735.5999755859375, "logps/rejected": -983.5999755859375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 7.542187690734863, "rewards/margins": 60.11249923706055, "rewards/rejected": -52.54999923706055, "step": 440 }, { "epoch": 1.7293233082706767, "grad_norm": 1.907719763871417e-13, "learning_rate": 2.8383458646616546e-06, "logits/chosen": -3.065624952316284, "logits/rejected": -4.318749904632568, "logps/chosen": -732.0, "logps/rejected": -971.4000244140625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 7.348437309265137, "rewards/margins": 59.599998474121094, "rewards/rejected": -52.275001525878906, "step": 460 }, { "epoch": 1.8045112781954886, "grad_norm": 2.413237608760496e-12, "learning_rate": 2.7443609022556394e-06, "logits/chosen": -3.06640625, "logits/rejected": -4.303124904632568, "logps/chosen": -734.0, "logps/rejected": -962.2000122070312, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 7.337500095367432, "rewards/margins": 58.900001525878906, "rewards/rejected": -51.54999923706055, "step": 480 }, { "epoch": 1.8796992481203008, "grad_norm": 3.8499016698807266e-05, "learning_rate": 2.650375939849624e-06, "logits/chosen": -3.063281297683716, "logits/rejected": -4.34375, "logps/chosen": -741.0, "logps/rejected": -975.4000244140625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 7.368750095367432, "rewards/margins": 60.42499923706055, "rewards/rejected": -53.0625, "step": 500 }, { "epoch": 1.954887218045113, "grad_norm": 1.6477178899318827e-11, "learning_rate": 2.556390977443609e-06, "logits/chosen": -3.051562547683716, "logits/rejected": -4.314062595367432, "logps/chosen": -734.2000122070312, "logps/rejected": -970.4000244140625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 7.464062690734863, "rewards/margins": 59.70000076293945, "rewards/rejected": -52.224998474121094, "step": 520 }, { "epoch": 2.030075187969925, "grad_norm": 2.0829162250893226e-08, "learning_rate": 2.462406015037594e-06, "logits/chosen": -3.022656202316284, "logits/rejected": -4.27734375, "logps/chosen": -713.4000244140625, "logps/rejected": -979.4000244140625, "loss": 0.0045, "rewards/accuracies": 0.995312511920929, "rewards/chosen": 8.104687690734863, "rewards/margins": 60.17499923706055, "rewards/rejected": -52.0625, "step": 540 }, { "epoch": 2.1052631578947367, "grad_norm": 1.0373247696775237e-09, "learning_rate": 2.368421052631579e-06, "logits/chosen": -2.9625000953674316, "logits/rejected": -4.248437404632568, "logps/chosen": -729.2000122070312, "logps/rejected": -967.4000244140625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 8.295312881469727, "rewards/margins": 59.724998474121094, "rewards/rejected": -51.42499923706055, "step": 560 }, { "epoch": 2.180451127819549, "grad_norm": 1.0771024319146517e-10, "learning_rate": 2.274436090225564e-06, "logits/chosen": -2.964062452316284, "logits/rejected": -4.268750190734863, "logps/chosen": -727.2000122070312, "logps/rejected": -964.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 7.871874809265137, "rewards/margins": 59.3125, "rewards/rejected": -51.42499923706055, "step": 580 }, { "epoch": 2.255639097744361, "grad_norm": 3.252682975051824e-07, "learning_rate": 2.180451127819549e-06, "logits/chosen": -2.9429688453674316, "logits/rejected": -4.2734375, "logps/chosen": -725.0, "logps/rejected": -963.5999755859375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 7.995312690734863, "rewards/margins": 59.537498474121094, "rewards/rejected": -51.537498474121094, "step": 600 }, { "epoch": 2.255639097744361, "eval_logits/chosen": -2.953125, "eval_logits/rejected": -4.254654407501221, "eval_logps/chosen": -734.9786987304688, "eval_logps/rejected": -968.7659301757812, "eval_loss": 3.946915239083637e-09, "eval_rewards/accuracies": 1.0, "eval_rewards/chosen": 7.918882846832275, "eval_rewards/margins": 59.25, "eval_rewards/rejected": -51.32978820800781, "eval_runtime": 8.4849, "eval_samples_per_second": 176.784, "eval_score": -0.8367462158203125, "eval_steps_per_second": 5.539, "step": 600 } ], "logging_steps": 20, "max_steps": 1064, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 300, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }