{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.05, "eval_steps": 1000, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 32.324275970458984, "learning_rate": 1.0000000000000002e-06, "loss": 28.2372, "step": 10 }, { "epoch": 0.0, "grad_norm": null, "learning_rate": 3.666666666666667e-06, "loss": 26.9696, "step": 20 }, { "epoch": 0.0, "grad_norm": 54.26140594482422, "learning_rate": 6.333333333333334e-06, "loss": 25.5608, "step": 30 }, { "epoch": 0.0, "grad_norm": 88.16826629638672, "learning_rate": 9.666666666666667e-06, "loss": 22.9594, "step": 40 }, { "epoch": 0.01, "grad_norm": 71.05668640136719, "learning_rate": 1.3000000000000001e-05, "loss": 15.5922, "step": 50 }, { "epoch": 0.01, "grad_norm": 23.444883346557617, "learning_rate": 1.6333333333333335e-05, "loss": 8.6897, "step": 60 }, { "epoch": 0.01, "grad_norm": 16.86968421936035, "learning_rate": 1.9666666666666666e-05, "loss": 5.6481, "step": 70 }, { "epoch": 0.01, "grad_norm": 10.194473266601562, "learning_rate": 2.3000000000000003e-05, "loss": 4.7916, "step": 80 }, { "epoch": 0.01, "grad_norm": 12.411303520202637, "learning_rate": 2.633333333333333e-05, "loss": 4.6363, "step": 90 }, { "epoch": 0.01, "grad_norm": 10.147621154785156, "learning_rate": 2.9666666666666672e-05, "loss": 4.3492, "step": 100 }, { "epoch": 0.01, "grad_norm": 5.482204914093018, "learning_rate": 3.3e-05, "loss": 4.2149, "step": 110 }, { "epoch": 0.01, "grad_norm": 8.201030731201172, "learning_rate": 3.633333333333333e-05, "loss": 4.0633, "step": 120 }, { "epoch": 0.01, "grad_norm": 6.775381565093994, "learning_rate": 3.966666666666667e-05, "loss": 4.0435, "step": 130 }, { "epoch": 0.01, "grad_norm": 4.881369113922119, "learning_rate": 4.3e-05, "loss": 3.8217, "step": 140 }, { "epoch": 0.01, "grad_norm": 2.2867329120635986, "learning_rate": 4.633333333333333e-05, "loss": 3.7584, "step": 150 }, { "epoch": 0.02, 
"grad_norm": 1.5364590883255005, "learning_rate": 4.966666666666667e-05, "loss": 3.6955, "step": 160 }, { "epoch": 0.02, "grad_norm": 1.0147122144699097, "learning_rate": 5.300000000000001e-05, "loss": 3.6443, "step": 170 }, { "epoch": 0.02, "grad_norm": 3.4737942218780518, "learning_rate": 5.633333333333334e-05, "loss": 3.6058, "step": 180 }, { "epoch": 0.02, "grad_norm": 1.3374801874160767, "learning_rate": 5.966666666666667e-05, "loss": 3.5659, "step": 190 }, { "epoch": 0.02, "grad_norm": 1.2648118734359741, "learning_rate": 6.3e-05, "loss": 3.5651, "step": 200 }, { "epoch": 0.02, "grad_norm": 1.9674913883209229, "learning_rate": 6.633333333333334e-05, "loss": 3.5555, "step": 210 }, { "epoch": 0.02, "grad_norm": 3.2699999809265137, "learning_rate": 6.966666666666668e-05, "loss": 3.5686, "step": 220 }, { "epoch": 0.02, "grad_norm": 3.4021201133728027, "learning_rate": 7.3e-05, "loss": 3.5226, "step": 230 }, { "epoch": 0.02, "grad_norm": 0.5968145132064819, "learning_rate": 7.633333333333334e-05, "loss": 3.5262, "step": 240 }, { "epoch": 0.03, "grad_norm": 7.528881549835205, "learning_rate": 7.966666666666666e-05, "loss": 3.5658, "step": 250 }, { "epoch": 0.03, "grad_norm": 3.5421671867370605, "learning_rate": 8.3e-05, "loss": 3.5355, "step": 260 }, { "epoch": 0.03, "grad_norm": 0.5664910674095154, "learning_rate": 8.633333333333334e-05, "loss": 3.5288, "step": 270 }, { "epoch": 0.03, "grad_norm": 1.6976518630981445, "learning_rate": 8.966666666666666e-05, "loss": 3.51, "step": 280 }, { "epoch": 0.03, "grad_norm": 1.1861720085144043, "learning_rate": 9.300000000000001e-05, "loss": 3.5045, "step": 290 }, { "epoch": 0.03, "grad_norm": 0.5708307027816772, "learning_rate": 9.633333333333335e-05, "loss": 3.5049, "step": 300 }, { "epoch": 0.03, "grad_norm": 0.5384214520454407, "learning_rate": 9.966666666666667e-05, "loss": 3.5043, "step": 310 }, { "epoch": 0.03, "grad_norm": 1.889636754989624, "learning_rate": 9.990721649484537e-05, "loss": 3.4986, "step": 320 }, { 
"epoch": 0.03, "grad_norm": 0.9980954527854919, "learning_rate": 9.980412371134021e-05, "loss": 3.4995, "step": 330 }, { "epoch": 0.03, "grad_norm": 0.7468856573104858, "learning_rate": 9.970103092783505e-05, "loss": 3.4866, "step": 340 }, { "epoch": 0.04, "grad_norm": 1.9658762216567993, "learning_rate": 9.959793814432991e-05, "loss": 3.5048, "step": 350 }, { "epoch": 0.04, "grad_norm": 1.008418321609497, "learning_rate": 9.949484536082475e-05, "loss": 3.495, "step": 360 }, { "epoch": 0.04, "grad_norm": 0.26405736804008484, "learning_rate": 9.939175257731959e-05, "loss": 3.5073, "step": 370 }, { "epoch": 0.04, "grad_norm": 0.4936525523662567, "learning_rate": 9.928865979381445e-05, "loss": 3.4906, "step": 380 }, { "epoch": 0.04, "grad_norm": 0.947067379951477, "learning_rate": 9.918556701030929e-05, "loss": 3.491, "step": 390 }, { "epoch": 0.04, "grad_norm": 0.7933781147003174, "learning_rate": 9.908247422680413e-05, "loss": 3.483, "step": 400 }, { "epoch": 0.04, "grad_norm": 0.9841229915618896, "learning_rate": 9.897938144329897e-05, "loss": 3.4933, "step": 410 }, { "epoch": 0.04, "grad_norm": 1.510961651802063, "learning_rate": 9.887628865979381e-05, "loss": 3.4771, "step": 420 }, { "epoch": 0.04, "grad_norm": 0.3279583156108856, "learning_rate": 9.877319587628866e-05, "loss": 3.4808, "step": 430 }, { "epoch": 0.04, "grad_norm": 1.3795489072799683, "learning_rate": 9.867010309278351e-05, "loss": 3.4847, "step": 440 }, { "epoch": 0.04, "grad_norm": 1.20867121219635, "learning_rate": 9.856701030927835e-05, "loss": 3.4858, "step": 450 }, { "epoch": 0.05, "grad_norm": 2.4353444576263428, "learning_rate": 9.84639175257732e-05, "loss": 3.4889, "step": 460 }, { "epoch": 0.05, "grad_norm": 0.6255859732627869, "learning_rate": 9.836082474226804e-05, "loss": 3.4883, "step": 470 }, { "epoch": 0.05, "grad_norm": 0.7791628837585449, "learning_rate": 9.825773195876289e-05, "loss": 3.4749, "step": 480 }, { "epoch": 0.05, "grad_norm": 0.6471466422080994, "learning_rate": 
9.815463917525773e-05, "loss": 3.4701, "step": 490 }, { "epoch": 0.05, "grad_norm": 0.7829887270927429, "learning_rate": 9.805154639175258e-05, "loss": 3.4874, "step": 500 } ], "logging_steps": 10, "max_steps": 10000, "num_input_tokens_seen": 0, "num_train_epochs": 9223372036854775807, "save_steps": 500, "total_flos": 1.268040826294931e+18, "train_batch_size": 16, "trial_name": null, "trial_params": null }