{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.4, "eval_steps": 500, "global_step": 1000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.004, "grad_norm": 133.32803344726562, "learning_rate": 1.5e-05, "loss": 3.6752, "step": 10 }, { "epoch": 0.008, "grad_norm": 39.905460357666016, "learning_rate": 4e-05, "loss": 1.9794, "step": 20 }, { "epoch": 0.012, "grad_norm": 51.313323974609375, "learning_rate": 4.987903225806452e-05, "loss": 2.1903, "step": 30 }, { "epoch": 0.016, "grad_norm": 46.85289001464844, "learning_rate": 4.967741935483871e-05, "loss": 2.0147, "step": 40 }, { "epoch": 0.02, "grad_norm": 80.95632934570312, "learning_rate": 4.94758064516129e-05, "loss": 2.3853, "step": 50 }, { "epoch": 0.024, "grad_norm": 37.81920623779297, "learning_rate": 4.92741935483871e-05, "loss": 2.7306, "step": 60 }, { "epoch": 0.028, "grad_norm": 23.879749298095703, "learning_rate": 4.907258064516129e-05, "loss": 1.6362, "step": 70 }, { "epoch": 0.032, "grad_norm": 26.93802261352539, "learning_rate": 4.887096774193549e-05, "loss": 1.6176, "step": 80 }, { "epoch": 0.036, "grad_norm": 32.77498245239258, "learning_rate": 4.866935483870968e-05, "loss": 1.6534, "step": 90 }, { "epoch": 0.04, "grad_norm": 36.66754913330078, "learning_rate": 4.846774193548387e-05, "loss": 1.6348, "step": 100 }, { "epoch": 0.044, "grad_norm": 32.059288024902344, "learning_rate": 4.8266129032258065e-05, "loss": 1.6225, "step": 110 }, { "epoch": 0.048, "grad_norm": 30.694385528564453, "learning_rate": 4.806451612903226e-05, "loss": 1.945, "step": 120 }, { "epoch": 0.052, "grad_norm": 39.73151397705078, "learning_rate": 4.7862903225806455e-05, "loss": 1.5514, "step": 130 }, { "epoch": 0.056, "grad_norm": 32.25121307373047, "learning_rate": 4.766129032258065e-05, "loss": 1.6446, "step": 140 }, { "epoch": 0.06, "grad_norm": 40.28260803222656, "learning_rate": 4.7459677419354844e-05, "loss": 1.4913, "step": 150 }, { "epoch": 0.064, "grad_norm": 34.746864318847656, "learning_rate": 4.725806451612904e-05, "loss": 1.1556, "step": 160 }, { "epoch": 0.068, "grad_norm": 47.414241790771484, "learning_rate": 4.705645161290323e-05, "loss": 1.5342, "step": 170 }, { "epoch": 0.072, "grad_norm": 61.64772033691406, "learning_rate": 4.685483870967742e-05, "loss": 1.5292, "step": 180 }, { "epoch": 0.076, "grad_norm": 24.82655906677246, "learning_rate": 4.6653225806451617e-05, "loss": 1.2651, "step": 190 }, { "epoch": 0.08, "grad_norm": 28.4489688873291, "learning_rate": 4.645161290322581e-05, "loss": 1.3369, "step": 200 }, { "epoch": 0.084, "grad_norm": 28.591188430786133, "learning_rate": 4.6250000000000006e-05, "loss": 1.5081, "step": 210 }, { "epoch": 0.088, "grad_norm": 28.529451370239258, "learning_rate": 4.60483870967742e-05, "loss": 1.3677, "step": 220 }, { "epoch": 0.092, "grad_norm": 28.43418312072754, "learning_rate": 4.584677419354839e-05, "loss": 1.8053, "step": 230 }, { "epoch": 0.096, "grad_norm": 25.22826385498047, "learning_rate": 4.5645161290322584e-05, "loss": 1.359, "step": 240 }, { "epoch": 0.1, "grad_norm": 26.05068016052246, "learning_rate": 4.544354838709677e-05, "loss": 1.2184, "step": 250 }, { "epoch": 0.104, "grad_norm": 40.48981857299805, "learning_rate": 4.5241935483870966e-05, "loss": 1.3307, "step": 260 }, { "epoch": 0.108, "grad_norm": 22.968761444091797, "learning_rate": 4.504032258064516e-05, "loss": 1.3088, "step": 270 }, { "epoch": 0.112, "grad_norm": 32.09326934814453, "learning_rate": 4.4838709677419356e-05, "loss": 1.3478, "step": 280 }, { "epoch": 0.116, "grad_norm": 40.11349105834961, "learning_rate": 4.463709677419355e-05, "loss": 1.322, "step": 290 }, { "epoch": 0.12, "grad_norm": 31.492155075073242, "learning_rate": 4.4435483870967745e-05, "loss": 1.515, "step": 300 }, { "epoch": 0.124, "grad_norm": 49.823089599609375, "learning_rate": 4.4233870967741933e-05, "loss": 1.291, "step": 310 }, { "epoch": 0.128, "grad_norm": 24.21888542175293, "learning_rate": 4.403225806451613e-05, "loss": 1.3147, "step": 320 }, { "epoch": 0.132, "grad_norm": 19.33460235595703, "learning_rate": 4.383064516129032e-05, "loss": 1.1186, "step": 330 }, { "epoch": 0.136, "grad_norm": 21.692705154418945, "learning_rate": 4.362903225806452e-05, "loss": 1.0384, "step": 340 }, { "epoch": 0.14, "grad_norm": 27.751327514648438, "learning_rate": 4.342741935483871e-05, "loss": 1.0266, "step": 350 }, { "epoch": 0.144, "grad_norm": 19.070554733276367, "learning_rate": 4.322580645161291e-05, "loss": 1.1643, "step": 360 }, { "epoch": 0.148, "grad_norm": 18.97333335876465, "learning_rate": 4.3024193548387095e-05, "loss": 1.4645, "step": 370 }, { "epoch": 0.152, "grad_norm": 36.90088653564453, "learning_rate": 4.282258064516129e-05, "loss": 1.0192, "step": 380 }, { "epoch": 0.156, "grad_norm": 18.172786712646484, "learning_rate": 4.2620967741935485e-05, "loss": 1.0456, "step": 390 }, { "epoch": 0.16, "grad_norm": 19.461885452270508, "learning_rate": 4.241935483870968e-05, "loss": 1.0407, "step": 400 }, { "epoch": 0.164, "grad_norm": 18.21559715270996, "learning_rate": 4.2217741935483874e-05, "loss": 1.1422, "step": 410 }, { "epoch": 0.168, "grad_norm": 18.820493698120117, "learning_rate": 4.201612903225807e-05, "loss": 1.1066, "step": 420 }, { "epoch": 0.172, "grad_norm": 14.057687759399414, "learning_rate": 4.1814516129032264e-05, "loss": 1.0544, "step": 430 }, { "epoch": 0.176, "grad_norm": 25.20148468017578, "learning_rate": 4.161290322580645e-05, "loss": 0.8301, "step": 440 }, { "epoch": 0.18, "grad_norm": 14.65149211883545, "learning_rate": 4.141129032258065e-05, "loss": 0.903, "step": 450 }, { "epoch": 0.184, "grad_norm": 33.052406311035156, "learning_rate": 4.120967741935484e-05, "loss": 1.2327, "step": 460 }, { "epoch": 0.188, "grad_norm": 23.308265686035156, "learning_rate": 4.1008064516129036e-05, "loss": 0.9603, "step": 470 }, { "epoch": 0.192, "grad_norm": 28.700511932373047, "learning_rate": 4.080645161290323e-05, "loss": 1.0459, "step": 480 }, { "epoch": 0.196, "grad_norm": 65.68643951416016, "learning_rate": 4.0604838709677426e-05, "loss": 1.1013, "step": 490 }, { "epoch": 0.2, "grad_norm": 18.58645248413086, "learning_rate": 4.0403225806451614e-05, "loss": 1.112, "step": 500 }, { "epoch": 0.204, "grad_norm": 17.082433700561523, "learning_rate": 4.020161290322581e-05, "loss": 0.9832, "step": 510 }, { "epoch": 0.208, "grad_norm": 15.006577491760254, "learning_rate": 4e-05, "loss": 0.8085, "step": 520 }, { "epoch": 0.212, "grad_norm": 23.60434913635254, "learning_rate": 3.97983870967742e-05, "loss": 1.2125, "step": 530 }, { "epoch": 0.216, "grad_norm": 20.58735466003418, "learning_rate": 3.959677419354839e-05, "loss": 0.9544, "step": 540 }, { "epoch": 0.22, "grad_norm": 24.197471618652344, "learning_rate": 3.939516129032259e-05, "loss": 0.762, "step": 550 }, { "epoch": 0.224, "grad_norm": 20.651039123535156, "learning_rate": 3.9193548387096776e-05, "loss": 0.9309, "step": 560 }, { "epoch": 0.228, "grad_norm": 20.045127868652344, "learning_rate": 3.901209677419355e-05, "loss": 0.8862, "step": 570 }, { "epoch": 0.232, "grad_norm": 18.766826629638672, "learning_rate": 3.8810483870967744e-05, "loss": 0.9722, "step": 580 }, { "epoch": 0.236, "grad_norm": 28.41654396057129, "learning_rate": 3.860887096774194e-05, "loss": 1.0501, "step": 590 }, { "epoch": 0.24, "grad_norm": 14.174357414245605, "learning_rate": 3.8407258064516134e-05, "loss": 0.9859, "step": 600 }, { "epoch": 0.244, "grad_norm": 18.710617065429688, "learning_rate": 3.820564516129033e-05, "loss": 1.0233, "step": 610 }, { "epoch": 0.248, "grad_norm": 22.399860382080078, "learning_rate": 3.800403225806452e-05, "loss": 1.1186, "step": 620 }, { "epoch": 0.252, "grad_norm": 18.62351417541504, "learning_rate": 3.780241935483871e-05, "loss": 1.1073, "step": 630 }, { "epoch": 0.256, "grad_norm": 16.706384658813477, "learning_rate": 3.7600806451612906e-05, "loss": 0.9013, "step": 640 }, { "epoch": 0.26, "grad_norm": 14.72787857055664, "learning_rate": 3.7399193548387094e-05, "loss": 0.8391, "step": 650 }, { "epoch": 0.264, "grad_norm": 17.56301498413086, "learning_rate": 3.719758064516129e-05, "loss": 0.9974, "step": 660 }, { "epoch": 0.268, "grad_norm": 18.445892333984375, "learning_rate": 3.6995967741935484e-05, "loss": 0.9481, "step": 670 }, { "epoch": 0.272, "grad_norm": 15.60034465789795, "learning_rate": 3.679435483870968e-05, "loss": 0.8022, "step": 680 }, { "epoch": 0.276, "grad_norm": 23.985820770263672, "learning_rate": 3.659274193548387e-05, "loss": 0.8189, "step": 690 }, { "epoch": 0.28, "grad_norm": 26.464324951171875, "learning_rate": 3.639112903225806e-05, "loss": 0.8447, "step": 700 }, { "epoch": 0.284, "grad_norm": 24.74170684814453, "learning_rate": 3.6189516129032256e-05, "loss": 0.8458, "step": 710 }, { "epoch": 0.288, "grad_norm": 15.681292533874512, "learning_rate": 3.598790322580645e-05, "loss": 0.9417, "step": 720 }, { "epoch": 0.292, "grad_norm": 18.264453887939453, "learning_rate": 3.5786290322580645e-05, "loss": 0.8536, "step": 730 }, { "epoch": 0.296, "grad_norm": 15.522205352783203, "learning_rate": 3.558467741935484e-05, "loss": 0.6493, "step": 740 }, { "epoch": 0.3, "grad_norm": 36.796165466308594, "learning_rate": 3.5383064516129035e-05, "loss": 0.8446, "step": 750 }, { "epoch": 0.304, "grad_norm": 21.616487503051758, "learning_rate": 3.518145161290323e-05, "loss": 0.8152, "step": 760 }, { "epoch": 0.308, "grad_norm": 13.02557373046875, "learning_rate": 3.497983870967742e-05, "loss": 0.6836, "step": 770 }, { "epoch": 0.312, "grad_norm": 22.531129837036133, "learning_rate": 3.477822580645161e-05, "loss": 0.8337, "step": 780 }, { "epoch": 0.316, "grad_norm": 24.401342391967773, "learning_rate": 3.457661290322581e-05, "loss": 0.7016, "step": 790 }, { "epoch": 0.32, "grad_norm": 15.145552635192871, "learning_rate": 3.4375e-05, "loss": 0.7273, "step": 800 }, { "epoch": 0.324, "grad_norm": 20.092849731445312, "learning_rate": 3.41733870967742e-05, "loss": 0.7287, "step": 810 }, { "epoch": 0.328, "grad_norm": 15.03227424621582, "learning_rate": 3.397177419354839e-05, "loss": 0.6846, "step": 820 }, { "epoch": 0.332, "grad_norm": 13.607186317443848, "learning_rate": 3.377016129032258e-05, "loss": 0.724, "step": 830 }, { "epoch": 0.336, "grad_norm": 24.089006423950195, "learning_rate": 3.3568548387096774e-05, "loss": 0.7993, "step": 840 }, { "epoch": 0.34, "grad_norm": 18.13868522644043, "learning_rate": 3.336693548387097e-05, "loss": 0.6757, "step": 850 }, { "epoch": 0.344, "grad_norm": 17.819578170776367, "learning_rate": 3.3165322580645164e-05, "loss": 0.6762, "step": 860 }, { "epoch": 0.348, "grad_norm": 29.358142852783203, "learning_rate": 3.296370967741936e-05, "loss": 0.6936, "step": 870 }, { "epoch": 0.352, "grad_norm": 27.773387908935547, "learning_rate": 3.2762096774193553e-05, "loss": 0.6531, "step": 880 }, { "epoch": 0.356, "grad_norm": 10.760952949523926, "learning_rate": 3.256048387096775e-05, "loss": 0.7669, "step": 890 }, { "epoch": 0.36, "grad_norm": 20.802019119262695, "learning_rate": 3.2358870967741936e-05, "loss": 0.6365, "step": 900 }, { "epoch": 0.364, "grad_norm": 18.4460391998291, "learning_rate": 3.215725806451613e-05, "loss": 0.9778, "step": 910 }, { "epoch": 0.368, "grad_norm": 23.085039138793945, "learning_rate": 3.1955645161290326e-05, "loss": 0.7247, "step": 920 }, { "epoch": 0.372, "grad_norm": 13.907185554504395, "learning_rate": 3.175403225806452e-05, "loss": 0.6822, "step": 930 }, { "epoch": 0.376, "grad_norm": 13.967331886291504, "learning_rate": 3.1552419354838715e-05, "loss": 0.7839, "step": 940 }, { "epoch": 0.38, "grad_norm": 14.392730712890625, "learning_rate": 3.135080645161291e-05, "loss": 0.7518, "step": 950 }, { "epoch": 0.384, "grad_norm": 12.910331726074219, "learning_rate": 3.11491935483871e-05, "loss": 0.6257, "step": 960 }, { "epoch": 0.388, "grad_norm": 17.412134170532227, "learning_rate": 3.0947580645161286e-05, "loss": 0.8162, "step": 970 }, { "epoch": 0.392, "grad_norm": 16.036808013916016, "learning_rate": 3.074596774193548e-05, "loss": 0.8296, "step": 980 }, { "epoch": 0.396, "grad_norm": 14.738393783569336, "learning_rate": 3.0544354838709676e-05, "loss": 0.5135, "step": 990 }, { "epoch": 0.4, "grad_norm": 13.25367546081543, "learning_rate": 3.034274193548387e-05, "loss": 0.7414, "step": 1000 } ], "logging_steps": 10, "max_steps": 2500, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 250, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.7984652389369856e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null }