{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.4,
  "eval_steps": 500,
  "global_step": 1000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.004,
      "grad_norm": 133.32803344726562,
      "learning_rate": 1.5e-05,
      "loss": 3.6752,
      "step": 10
    },
    {
      "epoch": 0.008,
      "grad_norm": 39.905460357666016,
      "learning_rate": 4e-05,
      "loss": 1.9794,
      "step": 20
    },
    {
      "epoch": 0.012,
      "grad_norm": 51.313323974609375,
      "learning_rate": 4.987903225806452e-05,
      "loss": 2.1903,
      "step": 30
    },
    {
      "epoch": 0.016,
      "grad_norm": 46.85289001464844,
      "learning_rate": 4.967741935483871e-05,
      "loss": 2.0147,
      "step": 40
    },
    {
      "epoch": 0.02,
      "grad_norm": 80.95632934570312,
      "learning_rate": 4.94758064516129e-05,
      "loss": 2.3853,
      "step": 50
    },
    {
      "epoch": 0.024,
      "grad_norm": 37.81920623779297,
      "learning_rate": 4.92741935483871e-05,
      "loss": 2.7306,
      "step": 60
    },
    {
      "epoch": 0.028,
      "grad_norm": 23.879749298095703,
      "learning_rate": 4.907258064516129e-05,
      "loss": 1.6362,
      "step": 70
    },
    {
      "epoch": 0.032,
      "grad_norm": 26.93802261352539,
      "learning_rate": 4.887096774193549e-05,
      "loss": 1.6176,
      "step": 80
    },
    {
      "epoch": 0.036,
      "grad_norm": 32.77498245239258,
      "learning_rate": 4.866935483870968e-05,
      "loss": 1.6534,
      "step": 90
    },
    {
      "epoch": 0.04,
      "grad_norm": 36.66754913330078,
      "learning_rate": 4.846774193548387e-05,
      "loss": 1.6348,
      "step": 100
    },
    {
      "epoch": 0.044,
      "grad_norm": 32.059288024902344,
      "learning_rate": 4.8266129032258065e-05,
      "loss": 1.6225,
      "step": 110
    },
    {
      "epoch": 0.048,
      "grad_norm": 30.694385528564453,
      "learning_rate": 4.806451612903226e-05,
      "loss": 1.945,
      "step": 120
    },
    {
      "epoch": 0.052,
      "grad_norm": 39.73151397705078,
      "learning_rate": 4.7862903225806455e-05,
      "loss": 1.5514,
      "step": 130
    },
    {
      "epoch": 0.056,
      "grad_norm": 32.25121307373047,
      "learning_rate": 4.766129032258065e-05,
      "loss": 1.6446,
      "step": 140
    },
    {
      "epoch": 0.06,
      "grad_norm": 40.28260803222656,
      "learning_rate": 4.7459677419354844e-05,
      "loss": 1.4913,
      "step": 150
    },
    {
      "epoch": 0.064,
      "grad_norm": 34.746864318847656,
      "learning_rate": 4.725806451612904e-05,
      "loss": 1.1556,
      "step": 160
    },
    {
      "epoch": 0.068,
      "grad_norm": 47.414241790771484,
      "learning_rate": 4.705645161290323e-05,
      "loss": 1.5342,
      "step": 170
    },
    {
      "epoch": 0.072,
      "grad_norm": 61.64772033691406,
      "learning_rate": 4.685483870967742e-05,
      "loss": 1.5292,
      "step": 180
    },
    {
      "epoch": 0.076,
      "grad_norm": 24.82655906677246,
      "learning_rate": 4.6653225806451617e-05,
      "loss": 1.2651,
      "step": 190
    },
    {
      "epoch": 0.08,
      "grad_norm": 28.4489688873291,
      "learning_rate": 4.645161290322581e-05,
      "loss": 1.3369,
      "step": 200
    },
    {
      "epoch": 0.084,
      "grad_norm": 28.591188430786133,
      "learning_rate": 4.6250000000000006e-05,
      "loss": 1.5081,
      "step": 210
    },
    {
      "epoch": 0.088,
      "grad_norm": 28.529451370239258,
      "learning_rate": 4.60483870967742e-05,
      "loss": 1.3677,
      "step": 220
    },
    {
      "epoch": 0.092,
      "grad_norm": 28.43418312072754,
      "learning_rate": 4.584677419354839e-05,
      "loss": 1.8053,
      "step": 230
    },
    {
      "epoch": 0.096,
      "grad_norm": 25.22826385498047,
      "learning_rate": 4.5645161290322584e-05,
      "loss": 1.359,
      "step": 240
    },
    {
      "epoch": 0.1,
      "grad_norm": 26.05068016052246,
      "learning_rate": 4.544354838709677e-05,
      "loss": 1.2184,
      "step": 250
    },
    {
      "epoch": 0.104,
      "grad_norm": 40.48981857299805,
      "learning_rate": 4.5241935483870966e-05,
      "loss": 1.3307,
      "step": 260
    },
    {
      "epoch": 0.108,
      "grad_norm": 22.968761444091797,
      "learning_rate": 4.504032258064516e-05,
      "loss": 1.3088,
      "step": 270
    },
    {
      "epoch": 0.112,
      "grad_norm": 32.09326934814453,
      "learning_rate": 4.4838709677419356e-05,
      "loss": 1.3478,
      "step": 280
    },
    {
      "epoch": 0.116,
      "grad_norm": 40.11349105834961,
      "learning_rate": 4.463709677419355e-05,
      "loss": 1.322,
      "step": 290
    },
    {
      "epoch": 0.12,
      "grad_norm": 31.492155075073242,
      "learning_rate": 4.4435483870967745e-05,
      "loss": 1.515,
      "step": 300
    },
    {
      "epoch": 0.124,
      "grad_norm": 49.823089599609375,
      "learning_rate": 4.4233870967741933e-05,
      "loss": 1.291,
      "step": 310
    },
    {
      "epoch": 0.128,
      "grad_norm": 24.21888542175293,
      "learning_rate": 4.403225806451613e-05,
      "loss": 1.3147,
      "step": 320
    },
    {
      "epoch": 0.132,
      "grad_norm": 19.33460235595703,
      "learning_rate": 4.383064516129032e-05,
      "loss": 1.1186,
      "step": 330
    },
    {
      "epoch": 0.136,
      "grad_norm": 21.692705154418945,
      "learning_rate": 4.362903225806452e-05,
      "loss": 1.0384,
      "step": 340
    },
    {
      "epoch": 0.14,
      "grad_norm": 27.751327514648438,
      "learning_rate": 4.342741935483871e-05,
      "loss": 1.0266,
      "step": 350
    },
    {
      "epoch": 0.144,
      "grad_norm": 19.070554733276367,
      "learning_rate": 4.322580645161291e-05,
      "loss": 1.1643,
      "step": 360
    },
    {
      "epoch": 0.148,
      "grad_norm": 18.97333335876465,
      "learning_rate": 4.3024193548387095e-05,
      "loss": 1.4645,
      "step": 370
    },
    {
      "epoch": 0.152,
      "grad_norm": 36.90088653564453,
      "learning_rate": 4.282258064516129e-05,
      "loss": 1.0192,
      "step": 380
    },
    {
      "epoch": 0.156,
      "grad_norm": 18.172786712646484,
      "learning_rate": 4.2620967741935485e-05,
      "loss": 1.0456,
      "step": 390
    },
    {
      "epoch": 0.16,
      "grad_norm": 19.461885452270508,
      "learning_rate": 4.241935483870968e-05,
      "loss": 1.0407,
      "step": 400
    },
    {
      "epoch": 0.164,
      "grad_norm": 18.21559715270996,
      "learning_rate": 4.2217741935483874e-05,
      "loss": 1.1422,
      "step": 410
    },
    {
      "epoch": 0.168,
      "grad_norm": 18.820493698120117,
      "learning_rate": 4.201612903225807e-05,
      "loss": 1.1066,
      "step": 420
    },
    {
      "epoch": 0.172,
      "grad_norm": 14.057687759399414,
      "learning_rate": 4.1814516129032264e-05,
      "loss": 1.0544,
      "step": 430
    },
    {
      "epoch": 0.176,
      "grad_norm": 25.20148468017578,
      "learning_rate": 4.161290322580645e-05,
      "loss": 0.8301,
      "step": 440
    },
    {
      "epoch": 0.18,
      "grad_norm": 14.65149211883545,
      "learning_rate": 4.141129032258065e-05,
      "loss": 0.903,
      "step": 450
    },
    {
      "epoch": 0.184,
      "grad_norm": 33.052406311035156,
      "learning_rate": 4.120967741935484e-05,
      "loss": 1.2327,
      "step": 460
    },
    {
      "epoch": 0.188,
      "grad_norm": 23.308265686035156,
      "learning_rate": 4.1008064516129036e-05,
      "loss": 0.9603,
      "step": 470
    },
    {
      "epoch": 0.192,
      "grad_norm": 28.700511932373047,
      "learning_rate": 4.080645161290323e-05,
      "loss": 1.0459,
      "step": 480
    },
    {
      "epoch": 0.196,
      "grad_norm": 65.68643951416016,
      "learning_rate": 4.0604838709677426e-05,
      "loss": 1.1013,
      "step": 490
    },
    {
      "epoch": 0.2,
      "grad_norm": 18.58645248413086,
      "learning_rate": 4.0403225806451614e-05,
      "loss": 1.112,
      "step": 500
    },
    {
      "epoch": 0.204,
      "grad_norm": 17.082433700561523,
      "learning_rate": 4.020161290322581e-05,
      "loss": 0.9832,
      "step": 510
    },
    {
      "epoch": 0.208,
      "grad_norm": 15.006577491760254,
      "learning_rate": 4e-05,
      "loss": 0.8085,
      "step": 520
    },
    {
      "epoch": 0.212,
      "grad_norm": 23.60434913635254,
      "learning_rate": 3.97983870967742e-05,
      "loss": 1.2125,
      "step": 530
    },
    {
      "epoch": 0.216,
      "grad_norm": 20.58735466003418,
      "learning_rate": 3.959677419354839e-05,
      "loss": 0.9544,
      "step": 540
    },
    {
      "epoch": 0.22,
      "grad_norm": 24.197471618652344,
      "learning_rate": 3.939516129032259e-05,
      "loss": 0.762,
      "step": 550
    },
    {
      "epoch": 0.224,
      "grad_norm": 20.651039123535156,
      "learning_rate": 3.9193548387096776e-05,
      "loss": 0.9309,
      "step": 560
    },
    {
      "epoch": 0.228,
      "grad_norm": 20.045127868652344,
      "learning_rate": 3.901209677419355e-05,
      "loss": 0.8862,
      "step": 570
    },
    {
      "epoch": 0.232,
      "grad_norm": 18.766826629638672,
      "learning_rate": 3.8810483870967744e-05,
      "loss": 0.9722,
      "step": 580
    },
    {
      "epoch": 0.236,
      "grad_norm": 28.41654396057129,
      "learning_rate": 3.860887096774194e-05,
      "loss": 1.0501,
      "step": 590
    },
    {
      "epoch": 0.24,
      "grad_norm": 14.174357414245605,
      "learning_rate": 3.8407258064516134e-05,
      "loss": 0.9859,
      "step": 600
    },
    {
      "epoch": 0.244,
      "grad_norm": 18.710617065429688,
      "learning_rate": 3.820564516129033e-05,
      "loss": 1.0233,
      "step": 610
    },
    {
      "epoch": 0.248,
      "grad_norm": 22.399860382080078,
      "learning_rate": 3.800403225806452e-05,
      "loss": 1.1186,
      "step": 620
    },
    {
      "epoch": 0.252,
      "grad_norm": 18.62351417541504,
      "learning_rate": 3.780241935483871e-05,
      "loss": 1.1073,
      "step": 630
    },
    {
      "epoch": 0.256,
      "grad_norm": 16.706384658813477,
      "learning_rate": 3.7600806451612906e-05,
      "loss": 0.9013,
      "step": 640
    },
    {
      "epoch": 0.26,
      "grad_norm": 14.72787857055664,
      "learning_rate": 3.7399193548387094e-05,
      "loss": 0.8391,
      "step": 650
    },
    {
      "epoch": 0.264,
      "grad_norm": 17.56301498413086,
      "learning_rate": 3.719758064516129e-05,
      "loss": 0.9974,
      "step": 660
    },
    {
      "epoch": 0.268,
      "grad_norm": 18.445892333984375,
      "learning_rate": 3.6995967741935484e-05,
      "loss": 0.9481,
      "step": 670
    },
    {
      "epoch": 0.272,
      "grad_norm": 15.60034465789795,
      "learning_rate": 3.679435483870968e-05,
      "loss": 0.8022,
      "step": 680
    },
    {
      "epoch": 0.276,
      "grad_norm": 23.985820770263672,
      "learning_rate": 3.659274193548387e-05,
      "loss": 0.8189,
      "step": 690
    },
    {
      "epoch": 0.28,
      "grad_norm": 26.464324951171875,
      "learning_rate": 3.639112903225806e-05,
      "loss": 0.8447,
      "step": 700
    },
    {
      "epoch": 0.284,
      "grad_norm": 24.74170684814453,
      "learning_rate": 3.6189516129032256e-05,
      "loss": 0.8458,
      "step": 710
    },
    {
      "epoch": 0.288,
      "grad_norm": 15.681292533874512,
      "learning_rate": 3.598790322580645e-05,
      "loss": 0.9417,
      "step": 720
    },
    {
      "epoch": 0.292,
      "grad_norm": 18.264453887939453,
      "learning_rate": 3.5786290322580645e-05,
      "loss": 0.8536,
      "step": 730
    },
    {
      "epoch": 0.296,
      "grad_norm": 15.522205352783203,
      "learning_rate": 3.558467741935484e-05,
      "loss": 0.6493,
      "step": 740
    },
    {
      "epoch": 0.3,
      "grad_norm": 36.796165466308594,
      "learning_rate": 3.5383064516129035e-05,
      "loss": 0.8446,
      "step": 750
    },
    {
      "epoch": 0.304,
      "grad_norm": 21.616487503051758,
      "learning_rate": 3.518145161290323e-05,
      "loss": 0.8152,
      "step": 760
    },
    {
      "epoch": 0.308,
      "grad_norm": 13.02557373046875,
      "learning_rate": 3.497983870967742e-05,
      "loss": 0.6836,
      "step": 770
    },
    {
      "epoch": 0.312,
      "grad_norm": 22.531129837036133,
      "learning_rate": 3.477822580645161e-05,
      "loss": 0.8337,
      "step": 780
    },
    {
      "epoch": 0.316,
      "grad_norm": 24.401342391967773,
      "learning_rate": 3.457661290322581e-05,
      "loss": 0.7016,
      "step": 790
    },
    {
      "epoch": 0.32,
      "grad_norm": 15.145552635192871,
      "learning_rate": 3.4375e-05,
      "loss": 0.7273,
      "step": 800
    },
    {
      "epoch": 0.324,
      "grad_norm": 20.092849731445312,
      "learning_rate": 3.41733870967742e-05,
      "loss": 0.7287,
      "step": 810
    },
    {
      "epoch": 0.328,
      "grad_norm": 15.03227424621582,
      "learning_rate": 3.397177419354839e-05,
      "loss": 0.6846,
      "step": 820
    },
    {
      "epoch": 0.332,
      "grad_norm": 13.607186317443848,
      "learning_rate": 3.377016129032258e-05,
      "loss": 0.724,
      "step": 830
    },
    {
      "epoch": 0.336,
      "grad_norm": 24.089006423950195,
      "learning_rate": 3.3568548387096774e-05,
      "loss": 0.7993,
      "step": 840
    },
    {
      "epoch": 0.34,
      "grad_norm": 18.13868522644043,
      "learning_rate": 3.336693548387097e-05,
      "loss": 0.6757,
      "step": 850
    },
    {
      "epoch": 0.344,
      "grad_norm": 17.819578170776367,
      "learning_rate": 3.3165322580645164e-05,
      "loss": 0.6762,
      "step": 860
    },
    {
      "epoch": 0.348,
      "grad_norm": 29.358142852783203,
      "learning_rate": 3.296370967741936e-05,
      "loss": 0.6936,
      "step": 870
    },
    {
      "epoch": 0.352,
      "grad_norm": 27.773387908935547,
      "learning_rate": 3.2762096774193553e-05,
      "loss": 0.6531,
      "step": 880
    },
    {
      "epoch": 0.356,
      "grad_norm": 10.760952949523926,
      "learning_rate": 3.256048387096775e-05,
      "loss": 0.7669,
      "step": 890
    },
    {
      "epoch": 0.36,
      "grad_norm": 20.802019119262695,
      "learning_rate": 3.2358870967741936e-05,
      "loss": 0.6365,
      "step": 900
    },
    {
      "epoch": 0.364,
      "grad_norm": 18.4460391998291,
      "learning_rate": 3.215725806451613e-05,
      "loss": 0.9778,
      "step": 910
    },
    {
      "epoch": 0.368,
      "grad_norm": 23.085039138793945,
      "learning_rate": 3.1955645161290326e-05,
      "loss": 0.7247,
      "step": 920
    },
    {
      "epoch": 0.372,
      "grad_norm": 13.907185554504395,
      "learning_rate": 3.175403225806452e-05,
      "loss": 0.6822,
      "step": 930
    },
    {
      "epoch": 0.376,
      "grad_norm": 13.967331886291504,
      "learning_rate": 3.1552419354838715e-05,
      "loss": 0.7839,
      "step": 940
    },
    {
      "epoch": 0.38,
      "grad_norm": 14.392730712890625,
      "learning_rate": 3.135080645161291e-05,
      "loss": 0.7518,
      "step": 950
    },
    {
      "epoch": 0.384,
      "grad_norm": 12.910331726074219,
      "learning_rate": 3.11491935483871e-05,
      "loss": 0.6257,
      "step": 960
    },
    {
      "epoch": 0.388,
      "grad_norm": 17.412134170532227,
      "learning_rate": 3.0947580645161286e-05,
      "loss": 0.8162,
      "step": 970
    },
    {
      "epoch": 0.392,
      "grad_norm": 16.036808013916016,
      "learning_rate": 3.074596774193548e-05,
      "loss": 0.8296,
      "step": 980
    },
    {
      "epoch": 0.396,
      "grad_norm": 14.738393783569336,
      "learning_rate": 3.0544354838709676e-05,
      "loss": 0.5135,
      "step": 990
    },
    {
      "epoch": 0.4,
      "grad_norm": 13.25367546081543,
      "learning_rate": 3.034274193548387e-05,
      "loss": 0.7414,
      "step": 1000
    }
  ],
  "logging_steps": 10,
  "max_steps": 2500,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 250,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 1.7984652389369856e+16,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}