| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 3.0, |
| "eval_steps": 500, |
| "global_step": 288, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0, |
| "eval_loss": 4.371264934539795, |
| "eval_num_tokens": 0.0, |
| "eval_runtime": 29.4614, |
| "eval_samples_per_second": 75.115, |
| "eval_steps_per_second": 2.376, |
| "step": 0 |
| }, |
| { |
| "epoch": 0.010416666666666666, |
| "grad_norm": 52.149383544921875, |
| "learning_rate": 0.0, |
| "loss": 15.0648, |
| "num_tokens": 1835008.0, |
| "step": 1 |
| }, |
| { |
| "epoch": 0.020833333333333332, |
| "grad_norm": 51.116416931152344, |
| "learning_rate": 6.896551724137932e-06, |
| "loss": 14.6347, |
| "num_tokens": 3669884.0, |
| "step": 2 |
| }, |
| { |
| "epoch": 0.03125, |
| "grad_norm": 42.77839660644531, |
| "learning_rate": 1.3793103448275863e-05, |
| "loss": 12.7417, |
| "num_tokens": 5504456.0, |
| "step": 3 |
| }, |
| { |
| "epoch": 0.041666666666666664, |
| "grad_norm": 21.081525802612305, |
| "learning_rate": 2.0689655172413793e-05, |
| "loss": 11.2837, |
| "num_tokens": 7338518.0, |
| "step": 4 |
| }, |
| { |
| "epoch": 0.052083333333333336, |
| "grad_norm": 12.225955963134766, |
| "learning_rate": 2.7586206896551727e-05, |
| "loss": 9.6522, |
| "num_tokens": 9171273.0, |
| "step": 5 |
| }, |
| { |
| "epoch": 0.0625, |
| "grad_norm": 22.207218170166016, |
| "learning_rate": 3.4482758620689657e-05, |
| "loss": 13.6798, |
| "num_tokens": 10989860.0, |
| "step": 6 |
| }, |
| { |
| "epoch": 0.07291666666666667, |
| "grad_norm": 14.794663429260254, |
| "learning_rate": 4.1379310344827587e-05, |
| "loss": 11.2003, |
| "num_tokens": 12762151.0, |
| "step": 7 |
| }, |
| { |
| "epoch": 0.08333333333333333, |
| "grad_norm": 12.77396297454834, |
| "learning_rate": 4.827586206896552e-05, |
| "loss": 9.6115, |
| "num_tokens": 14597063.0, |
| "step": 8 |
| }, |
| { |
| "epoch": 0.09375, |
| "grad_norm": 10.279570579528809, |
| "learning_rate": 5.517241379310345e-05, |
| "loss": 8.8284, |
| "num_tokens": 16431725.0, |
| "step": 9 |
| }, |
| { |
| "epoch": 0.10416666666666667, |
| "grad_norm": 8.187755584716797, |
| "learning_rate": 6.206896551724138e-05, |
| "loss": 8.3131, |
| "num_tokens": 18265899.0, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.11458333333333333, |
| "grad_norm": 5.445209503173828, |
| "learning_rate": 6.896551724137931e-05, |
| "loss": 8.0208, |
| "num_tokens": 20099095.0, |
| "step": 11 |
| }, |
| { |
| "epoch": 0.125, |
| "grad_norm": 6.53316068649292, |
| "learning_rate": 7.586206896551724e-05, |
| "loss": 8.0089, |
| "num_tokens": 21927060.0, |
| "step": 12 |
| }, |
| { |
| "epoch": 0.13541666666666666, |
| "grad_norm": 9.994010925292969, |
| "learning_rate": 8.275862068965517e-05, |
| "loss": 8.1674, |
| "num_tokens": 23686539.0, |
| "step": 13 |
| }, |
| { |
| "epoch": 0.14583333333333334, |
| "grad_norm": 5.84063196182251, |
| "learning_rate": 8.96551724137931e-05, |
| "loss": 7.9024, |
| "num_tokens": 25521502.0, |
| "step": 14 |
| }, |
| { |
| "epoch": 0.15625, |
| "grad_norm": 4.530292987823486, |
| "learning_rate": 9.655172413793105e-05, |
| "loss": 7.7778, |
| "num_tokens": 27356195.0, |
| "step": 15 |
| }, |
| { |
| "epoch": 0.16666666666666666, |
| "grad_norm": 3.1736268997192383, |
| "learning_rate": 0.00010344827586206898, |
| "loss": 7.5457, |
| "num_tokens": 29190486.0, |
| "step": 16 |
| }, |
| { |
| "epoch": 0.17708333333333334, |
| "grad_norm": 3.9896769523620605, |
| "learning_rate": 0.0001103448275862069, |
| "loss": 7.4311, |
| "num_tokens": 31024087.0, |
| "step": 17 |
| }, |
| { |
| "epoch": 0.1875, |
| "grad_norm": 3.630253553390503, |
| "learning_rate": 0.00011724137931034482, |
| "loss": 7.227, |
| "num_tokens": 32855321.0, |
| "step": 18 |
| }, |
| { |
| "epoch": 0.19791666666666666, |
| "grad_norm": 3.98490047454834, |
| "learning_rate": 0.00012413793103448277, |
| "loss": 5.8909, |
| "num_tokens": 34641448.0, |
| "step": 19 |
| }, |
| { |
| "epoch": 0.20833333333333334, |
| "grad_norm": 4.179741859436035, |
| "learning_rate": 0.00013103448275862068, |
| "loss": 7.1386, |
| "num_tokens": 36476456.0, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.21875, |
| "grad_norm": 3.653348684310913, |
| "learning_rate": 0.00013793103448275863, |
| "loss": 7.3049, |
| "num_tokens": 38311233.0, |
| "step": 21 |
| }, |
| { |
| "epoch": 0.22916666666666666, |
| "grad_norm": 1.8011902570724487, |
| "learning_rate": 0.00014482758620689657, |
| "loss": 7.3865, |
| "num_tokens": 40145678.0, |
| "step": 22 |
| }, |
| { |
| "epoch": 0.23958333333333334, |
| "grad_norm": 2.962880849838257, |
| "learning_rate": 0.00015172413793103449, |
| "loss": 7.1742, |
| "num_tokens": 41979519.0, |
| "step": 23 |
| }, |
| { |
| "epoch": 0.25, |
| "grad_norm": 2.545250177383423, |
| "learning_rate": 0.00015862068965517243, |
| "loss": 6.9464, |
| "num_tokens": 43811766.0, |
| "step": 24 |
| }, |
| { |
| "epoch": 0.2604166666666667, |
| "grad_norm": 3.547168254852295, |
| "learning_rate": 0.00016551724137931035, |
| "loss": 5.0189, |
| "num_tokens": 45557051.0, |
| "step": 25 |
| }, |
| { |
| "epoch": 0.2708333333333333, |
| "grad_norm": 3.0412774085998535, |
| "learning_rate": 0.00017241379310344826, |
| "loss": 6.9082, |
| "num_tokens": 47392059.0, |
| "step": 26 |
| }, |
| { |
| "epoch": 0.28125, |
| "grad_norm": 2.2950937747955322, |
| "learning_rate": 0.0001793103448275862, |
| "loss": 7.2282, |
| "num_tokens": 49226914.0, |
| "step": 27 |
| }, |
| { |
| "epoch": 0.2916666666666667, |
| "grad_norm": 2.410656452178955, |
| "learning_rate": 0.00018620689655172415, |
| "loss": 7.1382, |
| "num_tokens": 51061477.0, |
| "step": 28 |
| }, |
| { |
| "epoch": 0.3020833333333333, |
| "grad_norm": 2.726811170578003, |
| "learning_rate": 0.0001931034482758621, |
| "loss": 7.2778, |
| "num_tokens": 52895540.0, |
| "step": 29 |
| }, |
| { |
| "epoch": 0.3125, |
| "grad_norm": 1.3566060066223145, |
| "learning_rate": 0.0002, |
| "loss": 6.8058, |
| "num_tokens": 54728495.0, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.3229166666666667, |
| "grad_norm": 2.3010756969451904, |
| "learning_rate": 0.0001999926436074355, |
| "loss": 5.614, |
| "num_tokens": 56547945.0, |
| "step": 31 |
| }, |
| { |
| "epoch": 0.3333333333333333, |
| "grad_norm": 2.086775302886963, |
| "learning_rate": 0.00019997057551207221, |
| "loss": 6.1411, |
| "num_tokens": 58335361.0, |
| "step": 32 |
| }, |
| { |
| "epoch": 0.34375, |
| "grad_norm": 2.037832498550415, |
| "learning_rate": 0.0001999337989607416, |
| "loss": 7.1009, |
| "num_tokens": 60170309.0, |
| "step": 33 |
| }, |
| { |
| "epoch": 0.3541666666666667, |
| "grad_norm": 1.5545268058776855, |
| "learning_rate": 0.00019988231936429865, |
| "loss": 7.1493, |
| "num_tokens": 62004989.0, |
| "step": 34 |
| }, |
| { |
| "epoch": 0.3645833333333333, |
| "grad_norm": 2.550959825515747, |
| "learning_rate": 0.00019981614429682575, |
| "loss": 6.8761, |
| "num_tokens": 63839187.0, |
| "step": 35 |
| }, |
| { |
| "epoch": 0.375, |
| "grad_norm": 2.005693197250366, |
| "learning_rate": 0.00019973528349451837, |
| "loss": 6.9338, |
| "num_tokens": 65672410.0, |
| "step": 36 |
| }, |
| { |
| "epoch": 0.3854166666666667, |
| "grad_norm": 1.62908935546875, |
| "learning_rate": 0.00019963974885425266, |
| "loss": 6.2837, |
| "num_tokens": 67500466.0, |
| "step": 37 |
| }, |
| { |
| "epoch": 0.3958333333333333, |
| "grad_norm": 1.622885823249817, |
| "learning_rate": 0.0001995295544318349, |
| "loss": 5.4648, |
| "num_tokens": 69256656.0, |
| "step": 38 |
| }, |
| { |
| "epoch": 0.40625, |
| "grad_norm": 1.3703619241714478, |
| "learning_rate": 0.0001994047164399338, |
| "loss": 6.7164, |
| "num_tokens": 71091624.0, |
| "step": 39 |
| }, |
| { |
| "epoch": 0.4166666666666667, |
| "grad_norm": 1.7858023643493652, |
| "learning_rate": 0.00019926525324569472, |
| "loss": 7.1825, |
| "num_tokens": 72926315.0, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.4270833333333333, |
| "grad_norm": 1.419968843460083, |
| "learning_rate": 0.00019911118536803787, |
| "loss": 6.9133, |
| "num_tokens": 74760616.0, |
| "step": 41 |
| }, |
| { |
| "epoch": 0.4375, |
| "grad_norm": 1.3873683214187622, |
| "learning_rate": 0.00019894253547463896, |
| "loss": 6.8253, |
| "num_tokens": 76594149.0, |
| "step": 42 |
| }, |
| { |
| "epoch": 0.4479166666666667, |
| "grad_norm": 0.913640022277832, |
| "learning_rate": 0.0001987593283785945, |
| "loss": 6.6054, |
| "num_tokens": 78424329.0, |
| "step": 43 |
| }, |
| { |
| "epoch": 0.4583333333333333, |
| "grad_norm": 1.8171018362045288, |
| "learning_rate": 0.00019856159103477086, |
| "loss": 4.7958, |
| "num_tokens": 80185993.0, |
| "step": 44 |
| }, |
| { |
| "epoch": 0.46875, |
| "grad_norm": 1.4733974933624268, |
| "learning_rate": 0.0001983493525358385, |
| "loss": 6.7481, |
| "num_tokens": 82020985.0, |
| "step": 45 |
| }, |
| { |
| "epoch": 0.4791666666666667, |
| "grad_norm": 1.0637065172195435, |
| "learning_rate": 0.0001981226441079918, |
| "loss": 6.7655, |
| "num_tokens": 83855703.0, |
| "step": 46 |
| }, |
| { |
| "epoch": 0.4895833333333333, |
| "grad_norm": 1.1668665409088135, |
| "learning_rate": 0.0001978814991063546, |
| "loss": 6.8615, |
| "num_tokens": 85690104.0, |
| "step": 47 |
| }, |
| { |
| "epoch": 0.5, |
| "grad_norm": 1.1763768196105957, |
| "learning_rate": 0.00019762595301007281, |
| "loss": 6.8688, |
| "num_tokens": 87523856.0, |
| "step": 48 |
| }, |
| { |
| "epoch": 0.5104166666666666, |
| "grad_norm": 1.0596013069152832, |
| "learning_rate": 0.00019735604341709448, |
| "loss": 6.5309, |
| "num_tokens": 89355335.0, |
| "step": 49 |
| }, |
| { |
| "epoch": 0.5208333333333334, |
| "grad_norm": 1.4968317747116089, |
| "learning_rate": 0.00019707181003863808, |
| "loss": 4.2569, |
| "num_tokens": 91098058.0, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.53125, |
| "grad_norm": 1.4468708038330078, |
| "learning_rate": 0.0001967732946933499, |
| "loss": 6.4713, |
| "num_tokens": 92933066.0, |
| "step": 51 |
| }, |
| { |
| "epoch": 0.5416666666666666, |
| "grad_norm": 1.1204187870025635, |
| "learning_rate": 0.0001964605413011512, |
| "loss": 6.8482, |
| "num_tokens": 94767926.0, |
| "step": 52 |
| }, |
| { |
| "epoch": 0.5520833333333334, |
| "grad_norm": 1.1414028406143188, |
| "learning_rate": 0.00019613359587677658, |
| "loss": 6.9067, |
| "num_tokens": 96602477.0, |
| "step": 53 |
| }, |
| { |
| "epoch": 0.5625, |
| "grad_norm": 1.1222317218780518, |
| "learning_rate": 0.0001957925065230038, |
| "loss": 6.8611, |
| "num_tokens": 98436510.0, |
| "step": 54 |
| }, |
| { |
| "epoch": 0.5729166666666666, |
| "grad_norm": 1.2287341356277466, |
| "learning_rate": 0.00019543732342357662, |
| "loss": 6.6331, |
| "num_tokens": 100269204.0, |
| "step": 55 |
| }, |
| { |
| "epoch": 0.5833333333333334, |
| "grad_norm": 1.4391974210739136, |
| "learning_rate": 0.00019506809883582124, |
| "loss": 4.9841, |
| "num_tokens": 102087564.0, |
| "step": 56 |
| }, |
| { |
| "epoch": 0.59375, |
| "grad_norm": 0.9550106525421143, |
| "learning_rate": 0.0001946848870829579, |
| "loss": 5.7976, |
| "num_tokens": 103853432.0, |
| "step": 57 |
| }, |
| { |
| "epoch": 0.6041666666666666, |
| "grad_norm": 0.9806034564971924, |
| "learning_rate": 0.00019428774454610843, |
| "loss": 6.7797, |
| "num_tokens": 105688316.0, |
| "step": 58 |
| }, |
| { |
| "epoch": 0.6145833333333334, |
| "grad_norm": 1.075658917427063, |
| "learning_rate": 0.00019387672965600087, |
| "loss": 6.8395, |
| "num_tokens": 107522938.0, |
| "step": 59 |
| }, |
| { |
| "epoch": 0.625, |
| "grad_norm": 1.0181782245635986, |
| "learning_rate": 0.00019345190288437293, |
| "loss": 6.775, |
| "num_tokens": 109357089.0, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.6354166666666666, |
| "grad_norm": 1.1025474071502686, |
| "learning_rate": 0.00019301332673507456, |
| "loss": 6.5375, |
| "num_tokens": 111190261.0, |
| "step": 61 |
| }, |
| { |
| "epoch": 0.6458333333333334, |
| "grad_norm": 1.0409873723983765, |
| "learning_rate": 0.00019256106573487238, |
| "loss": 5.8817, |
| "num_tokens": 113017434.0, |
| "step": 62 |
| }, |
| { |
| "epoch": 0.65625, |
| "grad_norm": 1.040127158164978, |
| "learning_rate": 0.00019209518642395547, |
| "loss": 4.9992, |
| "num_tokens": 114773984.0, |
| "step": 63 |
| }, |
| { |
| "epoch": 0.6666666666666666, |
| "grad_norm": 1.0943304300308228, |
| "learning_rate": 0.00019161575734614585, |
| "loss": 6.3306, |
| "num_tokens": 116608963.0, |
| "step": 64 |
| }, |
| { |
| "epoch": 0.6770833333333334, |
| "grad_norm": 0.9448878765106201, |
| "learning_rate": 0.0001911228490388136, |
| "loss": 6.7798, |
| "num_tokens": 118443674.0, |
| "step": 65 |
| }, |
| { |
| "epoch": 0.6875, |
| "grad_norm": 1.0497307777404785, |
| "learning_rate": 0.00019061653402249878, |
| "loss": 6.7631, |
| "num_tokens": 120278001.0, |
| "step": 66 |
| }, |
| { |
| "epoch": 0.6979166666666666, |
| "grad_norm": 0.9516660571098328, |
| "learning_rate": 0.0001900968867902419, |
| "loss": 6.7123, |
| "num_tokens": 122111518.0, |
| "step": 67 |
| }, |
| { |
| "epoch": 0.7083333333333334, |
| "grad_norm": 0.8681999444961548, |
| "learning_rate": 0.00018956398379662366, |
| "loss": 6.2591, |
| "num_tokens": 123941919.0, |
| "step": 68 |
| }, |
| { |
| "epoch": 0.71875, |
| "grad_norm": 2.1472890377044678, |
| "learning_rate": 0.00018901790344651645, |
| "loss": 4.4264, |
| "num_tokens": 125698383.0, |
| "step": 69 |
| }, |
| { |
| "epoch": 0.7291666666666666, |
| "grad_norm": 1.3623411655426025, |
| "learning_rate": 0.00018845872608354877, |
| "loss": 6.4687, |
| "num_tokens": 127533391.0, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.7395833333333334, |
| "grad_norm": 1.0571578741073608, |
| "learning_rate": 0.0001878865339782846, |
| "loss": 6.6055, |
| "num_tokens": 129368145.0, |
| "step": 71 |
| }, |
| { |
| "epoch": 0.75, |
| "grad_norm": 1.1096223592758179, |
| "learning_rate": 0.00018730141131611882, |
| "loss": 6.9156, |
| "num_tokens": 131202603.0, |
| "step": 72 |
| }, |
| { |
| "epoch": 0.7604166666666666, |
| "grad_norm": 1.0648548603057861, |
| "learning_rate": 0.0001867034441848915, |
| "loss": 6.7572, |
| "num_tokens": 133036394.0, |
| "step": 73 |
| }, |
| { |
| "epoch": 0.7708333333333334, |
| "grad_norm": 0.9367074370384216, |
| "learning_rate": 0.00018609272056222188, |
| "loss": 6.5532, |
| "num_tokens": 134868491.0, |
| "step": 74 |
| }, |
| { |
| "epoch": 0.78125, |
| "grad_norm": 1.4335384368896484, |
| "learning_rate": 0.00018546933030256417, |
| "loss": 4.2937, |
| "num_tokens": 136608131.0, |
| "step": 75 |
| }, |
| { |
| "epoch": 0.7916666666666666, |
| "grad_norm": 4.660006046295166, |
| "learning_rate": 0.00018483336512398784, |
| "loss": 6.397, |
| "num_tokens": 138443139.0, |
| "step": 76 |
| }, |
| { |
| "epoch": 0.8020833333333334, |
| "grad_norm": 1.2509715557098389, |
| "learning_rate": 0.00018418491859468312, |
| "loss": 6.8787, |
| "num_tokens": 140278014.0, |
| "step": 77 |
| }, |
| { |
| "epoch": 0.8125, |
| "grad_norm": 0.9726094007492065, |
| "learning_rate": 0.00018352408611919453, |
| "loss": 6.8013, |
| "num_tokens": 142112594.0, |
| "step": 78 |
| }, |
| { |
| "epoch": 0.8229166666666666, |
| "grad_norm": 1.0140933990478516, |
| "learning_rate": 0.00018285096492438424, |
| "loss": 6.7311, |
| "num_tokens": 143946705.0, |
| "step": 79 |
| }, |
| { |
| "epoch": 0.8333333333333334, |
| "grad_norm": 1.0336406230926514, |
| "learning_rate": 0.0001821656540451273, |
| "loss": 6.6231, |
| "num_tokens": 145779640.0, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.84375, |
| "grad_norm": 1.1661409139633179, |
| "learning_rate": 0.0001814682543097409, |
| "loss": 5.2183, |
| "num_tokens": 147600011.0, |
| "step": 81 |
| }, |
| { |
| "epoch": 0.8541666666666666, |
| "grad_norm": 0.9476063251495361, |
| "learning_rate": 0.0001807588683251495, |
| "loss": 5.6845, |
| "num_tokens": 149377100.0, |
| "step": 82 |
| }, |
| { |
| "epoch": 0.8645833333333334, |
| "grad_norm": 0.9452285170555115, |
| "learning_rate": 0.00018003760046178882, |
| "loss": 6.5403, |
| "num_tokens": 151211990.0, |
| "step": 83 |
| }, |
| { |
| "epoch": 0.875, |
| "grad_norm": 0.958979606628418, |
| "learning_rate": 0.00017930455683824978, |
| "loss": 6.6053, |
| "num_tokens": 153046584.0, |
| "step": 84 |
| }, |
| { |
| "epoch": 0.8854166666666666, |
| "grad_norm": 1.163456678390503, |
| "learning_rate": 0.00017855984530566564, |
| "loss": 6.6077, |
| "num_tokens": 154880688.0, |
| "step": 85 |
| }, |
| { |
| "epoch": 0.8958333333333334, |
| "grad_norm": 1.1934983730316162, |
| "learning_rate": 0.00017780357543184397, |
| "loss": 6.5929, |
| "num_tokens": 156713762.0, |
| "step": 86 |
| }, |
| { |
| "epoch": 0.90625, |
| "grad_norm": 8.590930938720703, |
| "learning_rate": 0.00017703585848514634, |
| "loss": 5.8754, |
| "num_tokens": 158541215.0, |
| "step": 87 |
| }, |
| { |
| "epoch": 0.9166666666666666, |
| "grad_norm": 13.481514930725098, |
| "learning_rate": 0.00017625680741811746, |
| "loss": 4.9299, |
| "num_tokens": 160317578.0, |
| "step": 88 |
| }, |
| { |
| "epoch": 0.9270833333333334, |
| "grad_norm": 3.5045320987701416, |
| "learning_rate": 0.00017546653685086695, |
| "loss": 6.3777, |
| "num_tokens": 162152538.0, |
| "step": 89 |
| }, |
| { |
| "epoch": 0.9375, |
| "grad_norm": 2.1936583518981934, |
| "learning_rate": 0.00017466516305420524, |
| "loss": 6.7691, |
| "num_tokens": 163987214.0, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.9479166666666666, |
| "grad_norm": 1.0100579261779785, |
| "learning_rate": 0.00017385280393253716, |
| "loss": 6.6249, |
| "num_tokens": 165821456.0, |
| "step": 91 |
| }, |
| { |
| "epoch": 0.9583333333333334, |
| "grad_norm": 1.6229281425476074, |
| "learning_rate": 0.00017302957900651474, |
| "loss": 6.4187, |
| "num_tokens": 167654831.0, |
| "step": 92 |
| }, |
| { |
| "epoch": 0.96875, |
| "grad_norm": 1.4464794397354126, |
| "learning_rate": 0.00017219560939545246, |
| "loss": 6.1919, |
| "num_tokens": 169484982.0, |
| "step": 93 |
| }, |
| { |
| "epoch": 0.9791666666666666, |
| "grad_norm": 1.393432855606079, |
| "learning_rate": 0.00017135101779950724, |
| "loss": 4.3906, |
| "num_tokens": 171229135.0, |
| "step": 94 |
| }, |
| { |
| "epoch": 0.9895833333333334, |
| "grad_norm": 1.427167534828186, |
| "learning_rate": 0.00017049592848162584, |
| "loss": 6.5857, |
| "num_tokens": 173063228.0, |
| "step": 95 |
| }, |
| { |
| "epoch": 1.0, |
| "grad_norm": 0.7913920879364014, |
| "learning_rate": 0.00016963046724926222, |
| "loss": 5.8988, |
| "num_tokens": 174885743.0, |
| "step": 96 |
| }, |
| { |
| "epoch": 1.0, |
| "eval_loss": 0.40726137161254883, |
| "eval_num_tokens": 174885743.0, |
| "eval_runtime": 29.6224, |
| "eval_samples_per_second": 74.707, |
| "eval_steps_per_second": 2.363, |
| "step": 96 |
| }, |
| { |
| "epoch": 1.0104166666666667, |
| "grad_norm": 1.2719930410385132, |
| "learning_rate": 0.00016875476143586788, |
| "loss": 5.9922, |
| "num_tokens": 176720751.0, |
| "step": 97 |
| }, |
| { |
| "epoch": 1.0208333333333333, |
| "grad_norm": 1.0774774551391602, |
| "learning_rate": 0.00016786893988215753, |
| "loss": 6.0907, |
| "num_tokens": 178555589.0, |
| "step": 98 |
| }, |
| { |
| "epoch": 1.03125, |
| "grad_norm": 1.2304658889770508, |
| "learning_rate": 0.00016697313291715298, |
| "loss": 6.1506, |
| "num_tokens": 180390095.0, |
| "step": 99 |
| }, |
| { |
| "epoch": 1.0416666666666667, |
| "grad_norm": 1.3053410053253174, |
| "learning_rate": 0.00016606747233900815, |
| "loss": 5.9837, |
| "num_tokens": 182224036.0, |
| "step": 100 |
| }, |
| { |
| "epoch": 1.0520833333333333, |
| "grad_norm": 0.9655668139457703, |
| "learning_rate": 0.00016515209139561794, |
| "loss": 5.8686, |
| "num_tokens": 184056473.0, |
| "step": 101 |
| }, |
| { |
| "epoch": 1.0625, |
| "grad_norm": 1.163251280784607, |
| "learning_rate": 0.0001642271247650136, |
| "loss": 4.2894, |
| "num_tokens": 185872860.0, |
| "step": 102 |
| }, |
| { |
| "epoch": 1.0729166666666667, |
| "grad_norm": 0.9375080466270447, |
| "learning_rate": 0.00016329270853554807, |
| "loss": 5.1262, |
| "num_tokens": 187644701.0, |
| "step": 103 |
| }, |
| { |
| "epoch": 1.0833333333333333, |
| "grad_norm": 0.836040198802948, |
| "learning_rate": 0.00016234898018587337, |
| "loss": 5.9917, |
| "num_tokens": 189479639.0, |
| "step": 104 |
| }, |
| { |
| "epoch": 1.09375, |
| "grad_norm": 1.0972402095794678, |
| "learning_rate": 0.00016139607856471377, |
| "loss": 6.1988, |
| "num_tokens": 191314308.0, |
| "step": 105 |
| }, |
| { |
| "epoch": 1.1041666666666667, |
| "grad_norm": 0.8589965105056763, |
| "learning_rate": 0.0001604341438704373, |
| "loss": 6.2019, |
| "num_tokens": 193148505.0, |
| "step": 106 |
| }, |
| { |
| "epoch": 1.1145833333333333, |
| "grad_norm": 0.8513528108596802, |
| "learning_rate": 0.00015946331763042867, |
| "loss": 5.855, |
| "num_tokens": 194981706.0, |
| "step": 107 |
| }, |
| { |
| "epoch": 1.125, |
| "grad_norm": 1.2804700136184692, |
| "learning_rate": 0.00015848374268026647, |
| "loss": 5.0832, |
| "num_tokens": 196807954.0, |
| "step": 108 |
| }, |
| { |
| "epoch": 1.1354166666666667, |
| "grad_norm": 0.9044701457023621, |
| "learning_rate": 0.0001574955631427083, |
| "loss": 4.5894, |
| "num_tokens": 198547334.0, |
| "step": 109 |
| }, |
| { |
| "epoch": 1.1458333333333333, |
| "grad_norm": 0.9213070273399353, |
| "learning_rate": 0.00015649892440648623, |
| "loss": 5.8609, |
| "num_tokens": 200382304.0, |
| "step": 110 |
| }, |
| { |
| "epoch": 1.15625, |
| "grad_norm": 1.2462490797042847, |
| "learning_rate": 0.00015549397310491605, |
| "loss": 6.0755, |
| "num_tokens": 202216995.0, |
| "step": 111 |
| }, |
| { |
| "epoch": 1.1666666666666667, |
| "grad_norm": 0.9984328746795654, |
| "learning_rate": 0.00015448085709432338, |
| "loss": 6.1775, |
| "num_tokens": 204051331.0, |
| "step": 112 |
| }, |
| { |
| "epoch": 1.1770833333333333, |
| "grad_norm": 0.8022240400314331, |
| "learning_rate": 0.00015345972543229, |
| "loss": 6.0557, |
| "num_tokens": 205884933.0, |
| "step": 113 |
| }, |
| { |
| "epoch": 1.1875, |
| "grad_norm": 0.8404216170310974, |
| "learning_rate": 0.00015243072835572318, |
| "loss": 5.6417, |
| "num_tokens": 207715221.0, |
| "step": 114 |
| }, |
| { |
| "epoch": 1.1979166666666667, |
| "grad_norm": 1.0233253240585327, |
| "learning_rate": 0.0001513940172587518, |
| "loss": 3.9534, |
| "num_tokens": 209463400.0, |
| "step": 115 |
| }, |
| { |
| "epoch": 1.2083333333333333, |
| "grad_norm": 0.8012278079986572, |
| "learning_rate": 0.000150349744670452, |
| "loss": 5.8775, |
| "num_tokens": 211298403.0, |
| "step": 116 |
| }, |
| { |
| "epoch": 1.21875, |
| "grad_norm": 0.9274885058403015, |
| "learning_rate": 0.00014929806423240582, |
| "loss": 6.1698, |
| "num_tokens": 213133134.0, |
| "step": 117 |
| }, |
| { |
| "epoch": 1.2291666666666667, |
| "grad_norm": 0.8778573870658875, |
| "learning_rate": 0.00014823913067609637, |
| "loss": 6.0613, |
| "num_tokens": 214967541.0, |
| "step": 118 |
| }, |
| { |
| "epoch": 1.2395833333333333, |
| "grad_norm": 0.9037396907806396, |
| "learning_rate": 0.00014717309980014244, |
| "loss": 6.0915, |
| "num_tokens": 216801312.0, |
| "step": 119 |
| }, |
| { |
| "epoch": 1.25, |
| "grad_norm": 0.9712461233139038, |
| "learning_rate": 0.00014610012844737622, |
| "loss": 5.7472, |
| "num_tokens": 218633029.0, |
| "step": 120 |
| }, |
| { |
| "epoch": 1.2604166666666667, |
| "grad_norm": 1.3214646577835083, |
| "learning_rate": 0.00014502037448176734, |
| "loss": 3.4601, |
| "num_tokens": 220392276.0, |
| "step": 121 |
| }, |
| { |
| "epoch": 1.2708333333333333, |
| "grad_norm": 0.9007311463356018, |
| "learning_rate": 0.00014393399676519667, |
| "loss": 5.8749, |
| "num_tokens": 222227284.0, |
| "step": 122 |
| }, |
| { |
| "epoch": 1.28125, |
| "grad_norm": 0.9098818898200989, |
| "learning_rate": 0.00014284115513408336, |
| "loss": 5.9967, |
| "num_tokens": 224062147.0, |
| "step": 123 |
| }, |
| { |
| "epoch": 1.2916666666666667, |
| "grad_norm": 0.8330245018005371, |
| "learning_rate": 0.00014174201037586842, |
| "loss": 5.9866, |
| "num_tokens": 225896682.0, |
| "step": 124 |
| }, |
| { |
| "epoch": 1.3020833333333333, |
| "grad_norm": 0.8968133330345154, |
| "learning_rate": 0.0001406367242053583, |
| "loss": 5.9487, |
| "num_tokens": 227730675.0, |
| "step": 125 |
| }, |
| { |
| "epoch": 1.3125, |
| "grad_norm": 0.8355301022529602, |
| "learning_rate": 0.00013952545924093238, |
| "loss": 5.8684, |
| "num_tokens": 229563354.0, |
| "step": 126 |
| }, |
| { |
| "epoch": 1.3229166666666667, |
| "grad_norm": 0.9524968862533569, |
| "learning_rate": 0.00013840837898061712, |
| "loss": 4.6631, |
| "num_tokens": 231382349.0, |
| "step": 127 |
| }, |
| { |
| "epoch": 1.3333333333333333, |
| "grad_norm": 0.7690907716751099, |
| "learning_rate": 0.00013728564777803088, |
| "loss": 5.1172, |
| "num_tokens": 233157485.0, |
| "step": 128 |
| }, |
| { |
| "epoch": 1.34375, |
| "grad_norm": 0.8060216903686523, |
| "learning_rate": 0.00013615743081820308, |
| "loss": 5.999, |
| "num_tokens": 234992398.0, |
| "step": 129 |
| }, |
| { |
| "epoch": 1.3541666666666667, |
| "grad_norm": 0.8314852118492126, |
| "learning_rate": 0.00013502389409327087, |
| "loss": 6.0317, |
| "num_tokens": 236827040.0, |
| "step": 130 |
| }, |
| { |
| "epoch": 1.3645833333333333, |
| "grad_norm": 0.8006500601768494, |
| "learning_rate": 0.0001338852043780569, |
| "loss": 5.7848, |
| "num_tokens": 238661202.0, |
| "step": 131 |
| }, |
| { |
| "epoch": 1.375, |
| "grad_norm": 0.8115113973617554, |
| "learning_rate": 0.00013274152920553226, |
| "loss": 5.9345, |
| "num_tokens": 240494474.0, |
| "step": 132 |
| }, |
| { |
| "epoch": 1.3854166666666667, |
| "grad_norm": 0.7442668676376343, |
| "learning_rate": 0.0001315930368421676, |
| "loss": 5.234, |
| "num_tokens": 242322367.0, |
| "step": 133 |
| }, |
| { |
| "epoch": 1.3958333333333333, |
| "grad_norm": 1.0141338109970093, |
| "learning_rate": 0.00013043989626317667, |
| "loss": 4.1387, |
| "num_tokens": 244114841.0, |
| "step": 134 |
| }, |
| { |
| "epoch": 1.40625, |
| "grad_norm": 0.8511059284210205, |
| "learning_rate": 0.00012928227712765504, |
| "loss": 5.9175, |
| "num_tokens": 245949813.0, |
| "step": 135 |
| }, |
| { |
| "epoch": 1.4166666666666667, |
| "grad_norm": 0.8228452801704407, |
| "learning_rate": 0.00012812034975361874, |
| "loss": 5.9637, |
| "num_tokens": 247784505.0, |
| "step": 136 |
| }, |
| { |
| "epoch": 1.4270833333333333, |
| "grad_norm": 0.8691542148590088, |
| "learning_rate": 0.00012695428509294567, |
| "loss": 5.9107, |
| "num_tokens": 249618830.0, |
| "step": 137 |
| }, |
| { |
| "epoch": 1.4375, |
| "grad_norm": 0.7453424334526062, |
| "learning_rate": 0.0001257842547062238, |
| "loss": 5.7804, |
| "num_tokens": 251452377.0, |
| "step": 138 |
| }, |
| { |
| "epoch": 1.4479166666666667, |
| "grad_norm": 0.7112407088279724, |
| "learning_rate": 0.00012461043073750988, |
| "loss": 5.5699, |
| "num_tokens": 253283287.0, |
| "step": 139 |
| }, |
| { |
| "epoch": 1.4583333333333333, |
| "grad_norm": 0.9047596454620361, |
| "learning_rate": 0.00012343298588900225, |
| "loss": 3.9271, |
| "num_tokens": 255026046.0, |
| "step": 140 |
| }, |
| { |
| "epoch": 1.46875, |
| "grad_norm": 0.7571535110473633, |
| "learning_rate": 0.00012225209339563145, |
| "loss": 5.9484, |
| "num_tokens": 256861054.0, |
| "step": 141 |
| }, |
| { |
| "epoch": 1.4791666666666667, |
| "grad_norm": 0.8223585486412048, |
| "learning_rate": 0.00012106792699957263, |
| "loss": 5.8375, |
| "num_tokens": 258695847.0, |
| "step": 142 |
| }, |
| { |
| "epoch": 1.4895833333333333, |
| "grad_norm": 0.8138189315795898, |
| "learning_rate": 0.00011988066092468324, |
| "loss": 6.041, |
| "num_tokens": 260530258.0, |
| "step": 143 |
| }, |
| { |
| "epoch": 1.5, |
| "grad_norm": 0.8064430356025696, |
| "learning_rate": 0.00011869046985086978, |
| "loss": 6.0705, |
| "num_tokens": 262364015.0, |
| "step": 144 |
| }, |
| { |
| "epoch": 1.5104166666666665, |
| "grad_norm": 0.7332646250724792, |
| "learning_rate": 0.00011749752888838754, |
| "loss": 5.7084, |
| "num_tokens": 264195805.0, |
| "step": 145 |
| }, |
| { |
| "epoch": 1.5208333333333335, |
| "grad_norm": 0.9756503701210022, |
| "learning_rate": 0.00011630201355207709, |
| "loss": 3.9937, |
| "num_tokens": 265976881.0, |
| "step": 146 |
| }, |
| { |
| "epoch": 1.53125, |
| "grad_norm": 0.8575289845466614, |
| "learning_rate": 0.000115104099735541, |
| "loss": 5.6886, |
| "num_tokens": 267811889.0, |
| "step": 147 |
| }, |
| { |
| "epoch": 1.5416666666666665, |
| "grad_norm": 0.8274205923080444, |
| "learning_rate": 0.00011390396368526517, |
| "loss": 5.9956, |
| "num_tokens": 269646750.0, |
| "step": 148 |
| }, |
| { |
| "epoch": 1.5520833333333335, |
| "grad_norm": 0.8247790336608887, |
| "learning_rate": 0.00011270178197468789, |
| "loss": 6.0649, |
| "num_tokens": 271481363.0, |
| "step": 149 |
| }, |
| { |
| "epoch": 1.5625, |
| "grad_norm": 1.015745759010315, |
| "learning_rate": 0.00011149773147822111, |
| "loss": 6.079, |
| "num_tokens": 273315476.0, |
| "step": 150 |
| }, |
| { |
| "epoch": 1.5729166666666665, |
| "grad_norm": 1.3161146640777588, |
| "learning_rate": 0.00011029198934522725, |
| "loss": 5.8144, |
| "num_tokens": 275148482.0, |
| "step": 151 |
| }, |
| { |
| "epoch": 1.5833333333333335, |
| "grad_norm": 0.7493459582328796, |
| "learning_rate": 0.00010908473297395551, |
| "loss": 4.7308, |
| "num_tokens": 276969990.0, |
| "step": 152 |
| }, |
| { |
| "epoch": 1.59375, |
| "grad_norm": 0.7081484794616699, |
| "learning_rate": 0.0001078761399854418, |
| "loss": 4.8451, |
| "num_tokens": 278741308.0, |
| "step": 153 |
| }, |
| { |
| "epoch": 1.6041666666666665, |
| "grad_norm": 0.8730968236923218, |
| "learning_rate": 0.00010666638819737553, |
| "loss": 5.9697, |
| "num_tokens": 280576202.0, |
| "step": 154 |
| }, |
| { |
| "epoch": 1.6145833333333335, |
| "grad_norm": 0.8091002106666565, |
| "learning_rate": 0.00010545565559793796, |
| "loss": 6.1964, |
| "num_tokens": 282410853.0, |
| "step": 155 |
| }, |
| { |
| "epoch": 1.625, |
| "grad_norm": 0.7971783876419067, |
| "learning_rate": 0.00010424412031961484, |
| "loss": 6.0415, |
| "num_tokens": 284245022.0, |
| "step": 156 |
| }, |
| { |
| "epoch": 1.6354166666666665, |
| "grad_norm": 0.8752973675727844, |
| "learning_rate": 0.0001030319606129885, |
| "loss": 5.8945, |
| "num_tokens": 286078219.0, |
| "step": 157 |
| }, |
| { |
| "epoch": 1.6458333333333335, |
| "grad_norm": 0.7812337279319763, |
| "learning_rate": 0.00010181935482051197, |
| "loss": 5.2035, |
| "num_tokens": 287905531.0, |
| "step": 158 |
| }, |
| { |
| "epoch": 1.65625, |
| "grad_norm": 0.8713350296020508, |
| "learning_rate": 0.00010060648135026998, |
| "loss": 4.39, |
| "num_tokens": 289671225.0, |
| "step": 159 |
| }, |
| { |
| "epoch": 1.6666666666666665, |
| "grad_norm": 0.8351245522499084, |
| "learning_rate": 9.939351864973006e-05, |
| "loss": 5.74, |
| "num_tokens": 291506205.0, |
| "step": 160 |
| }, |
| { |
| "epoch": 1.6770833333333335, |
| "grad_norm": 0.8304809927940369, |
| "learning_rate": 9.818064517948805e-05, |
| "loss": 5.6296, |
| "num_tokens": 293340903.0, |
| "step": 161 |
| }, |
| { |
| "epoch": 1.6875, |
| "grad_norm": 0.7580217123031616, |
| "learning_rate": 9.696803938701154e-05, |
| "loss": 5.9524, |
| "num_tokens": 295175185.0, |
| "step": 162 |
| }, |
| { |
| "epoch": 1.6979166666666665, |
| "grad_norm": 0.9276612997055054, |
| "learning_rate": 9.57558796803852e-05, |
| "loss": 5.9102, |
| "num_tokens": 297008730.0, |
| "step": 163 |
| }, |
| { |
| "epoch": 1.7083333333333335, |
| "grad_norm": 0.7644199132919312, |
| "learning_rate": 9.454434440206211e-05, |
| "loss": 5.7445, |
| "num_tokens": 298839757.0, |
| "step": 164 |
| }, |
| { |
| "epoch": 1.71875, |
| "grad_norm": 0.8255704045295715, |
| "learning_rate": 9.33336118026245e-05, |
| "loss": 4.0577, |
| "num_tokens": 300606809.0, |
| "step": 165 |
| }, |
| { |
| "epoch": 1.7291666666666665, |
| "grad_norm": 0.8408218622207642, |
| "learning_rate": 9.212386001455826e-05, |
| "loss": 5.7455, |
| "num_tokens": 302441808.0, |
| "step": 166 |
| }, |
| { |
| "epoch": 1.7395833333333335, |
| "grad_norm": 0.791253387928009, |
| "learning_rate": 9.091526702604448e-05, |
| "loss": 6.0161, |
| "num_tokens": 304276550.0, |
| "step": 167 |
| }, |
| { |
| "epoch": 1.75, |
| "grad_norm": 0.7565672397613525, |
| "learning_rate": 8.970801065477276e-05, |
| "loss": 6.0298, |
| "num_tokens": 306110958.0, |
| "step": 168 |
| }, |
| { |
| "epoch": 1.7604166666666665, |
| "grad_norm": 0.8409264087677002, |
| "learning_rate": 8.85022685217789e-05, |
| "loss": 5.8963, |
| "num_tokens": 307944693.0, |
| "step": 169 |
| }, |
| { |
| "epoch": 1.7708333333333335, |
| "grad_norm": 0.7787932753562927, |
| "learning_rate": 8.729821802531212e-05, |
| "loss": 5.7313, |
| "num_tokens": 309776172.0, |
| "step": 170 |
| }, |
| { |
| "epoch": 1.78125, |
| "grad_norm": 1.0043314695358276, |
| "learning_rate": 8.609603631473487e-05, |
| "loss": 3.5194, |
| "num_tokens": 311489802.0, |
| "step": 171 |
| }, |
| { |
| "epoch": 1.7916666666666665, |
| "grad_norm": 0.7282080054283142, |
| "learning_rate": 8.489590026445902e-05, |
| "loss": 5.7639, |
| "num_tokens": 313324810.0, |
| "step": 172 |
| }, |
| { |
| "epoch": 1.8020833333333335, |
| "grad_norm": 0.7764191627502441, |
| "learning_rate": 8.369798644792293e-05, |
| "loss": 5.8578, |
| "num_tokens": 315159645.0, |
| "step": 173 |
| }, |
| { |
| "epoch": 1.8125, |
| "grad_norm": 0.8042583465576172, |
| "learning_rate": 8.250247111161248e-05, |
| "loss": 5.9218, |
| "num_tokens": 316994149.0, |
| "step": 174 |
| }, |
| { |
| "epoch": 1.8229166666666665, |
| "grad_norm": 0.7392557263374329, |
| "learning_rate": 8.130953014913025e-05, |
| "loss": 5.7039, |
| "num_tokens": 318828139.0, |
| "step": 175 |
| }, |
| { |
| "epoch": 1.8333333333333335, |
| "grad_norm": 0.742917001247406, |
| "learning_rate": 8.011933907531678e-05, |
| "loss": 5.6819, |
| "num_tokens": 320660783.0, |
| "step": 176 |
| }, |
| { |
| "epoch": 1.84375, |
| "grad_norm": 0.8126767873764038, |
| "learning_rate": 7.89320730004274e-05, |
| "loss": 4.2975, |
| "num_tokens": 322478386.0, |
| "step": 177 |
| }, |
| { |
| "epoch": 1.8541666666666665, |
| "grad_norm": 0.6863731741905212, |
| "learning_rate": 7.774790660436858e-05, |
| "loss": 5.0231, |
| "num_tokens": 324250009.0, |
| "step": 178 |
| }, |
| { |
| "epoch": 1.8645833333333335, |
| "grad_norm": 0.8107653856277466, |
| "learning_rate": 7.656701411099777e-05, |
| "loss": 5.9952, |
| "num_tokens": 326084914.0, |
| "step": 179 |
| }, |
| { |
| "epoch": 1.875, |
| "grad_norm": 0.7769672274589539, |
| "learning_rate": 7.538956926249014e-05, |
| "loss": 5.9752, |
| "num_tokens": 327919576.0, |
| "step": 180 |
| }, |
| { |
| "epoch": 1.8854166666666665, |
| "grad_norm": 0.8014964461326599, |
| "learning_rate": 7.421574529377623e-05, |
| "loss": 6.0391, |
| "num_tokens": 329753817.0, |
| "step": 181 |
| }, |
| { |
| "epoch": 1.8958333333333335, |
| "grad_norm": 0.7131144404411316, |
| "learning_rate": 7.304571490705433e-05, |
| "loss": 5.7906, |
| "num_tokens": 331587167.0, |
| "step": 182 |
| }, |
| { |
| "epoch": 1.90625, |
| "grad_norm": 0.6183276176452637, |
| "learning_rate": 7.187965024638127e-05, |
| "loss": 5.2981, |
| "num_tokens": 333416340.0, |
| "step": 183 |
| }, |
| { |
| "epoch": 1.9166666666666665, |
| "grad_norm": 0.7822322249412537, |
| "learning_rate": 7.071772287234497e-05, |
| "loss": 4.1759, |
| "num_tokens": 335184043.0, |
| "step": 184 |
| }, |
| { |
| "epoch": 1.9270833333333335, |
| "grad_norm": 0.9038224220275879, |
| "learning_rate": 6.956010373682335e-05, |
| "loss": 5.8676, |
| "num_tokens": 337019011.0, |
| "step": 185 |
| }, |
| { |
| "epoch": 1.9375, |
| "grad_norm": 0.7321260571479797, |
| "learning_rate": 6.840696315783239e-05, |
| "loss": 5.9956, |
| "num_tokens": 338853731.0, |
| "step": 186 |
| }, |
| { |
| "epoch": 1.9479166666666665, |
| "grad_norm": 0.6627479791641235, |
| "learning_rate": 6.725847079446778e-05, |
| "loss": 5.9686, |
| "num_tokens": 340688113.0, |
| "step": 187 |
| }, |
| { |
| "epoch": 1.9583333333333335, |
| "grad_norm": 0.8157824277877808, |
| "learning_rate": 6.611479562194314e-05, |
| "loss": 5.8535, |
| "num_tokens": 342521820.0, |
| "step": 188 |
| }, |
| { |
| "epoch": 1.96875, |
| "grad_norm": 0.7909327745437622, |
| "learning_rate": 6.497610590672916e-05, |
| "loss": 5.6085, |
| "num_tokens": 344353269.0, |
| "step": 189 |
| }, |
| { |
| "epoch": 1.9791666666666665, |
| "grad_norm": 0.9760119318962097, |
| "learning_rate": 6.384256918179691e-05, |
| "loss": 3.755, |
| "num_tokens": 346123614.0, |
| "step": 190 |
| }, |
| { |
| "epoch": 1.9895833333333335, |
| "grad_norm": 0.6910778880119324, |
| "learning_rate": 6.271435222196916e-05, |
| "loss": 5.5981, |
| "num_tokens": 347957631.0, |
| "step": 191 |
| }, |
| { |
| "epoch": 2.0, |
| "grad_norm": 0.6510729193687439, |
| "learning_rate": 6.159162101938292e-05, |
| "loss": 4.9271, |
| "num_tokens": 349771486.0, |
| "step": 192 |
| }, |
| { |
| "epoch": 2.0, |
| "eval_loss": 0.3423305153846741, |
| "eval_num_tokens": 349771486.0, |
| "eval_runtime": 29.6637, |
| "eval_samples_per_second": 74.603, |
| "eval_steps_per_second": 2.36, |
| "step": 192 |
| }, |
| { |
| "epoch": 2.0104166666666665, |
| "grad_norm": 0.6948514580726624, |
| "learning_rate": 6.047454075906764e-05, |
| "loss": 5.0611, |
| "num_tokens": 351606494.0, |
| "step": 193 |
| }, |
| { |
| "epoch": 2.0208333333333335, |
| "grad_norm": 0.7176052331924438, |
| "learning_rate": 5.9363275794641736e-05, |
| "loss": 5.2794, |
| "num_tokens": 353441326.0, |
| "step": 194 |
| }, |
| { |
| "epoch": 2.03125, |
| "grad_norm": 0.7443753480911255, |
| "learning_rate": 5.825798962413164e-05, |
| "loss": 5.4079, |
| "num_tokens": 355275836.0, |
| "step": 195 |
| }, |
| { |
| "epoch": 2.0416666666666665, |
| "grad_norm": 0.7725483775138855, |
| "learning_rate": 5.7158844865916625e-05, |
| "loss": 5.4036, |
| "num_tokens": 357109789.0, |
| "step": 196 |
| }, |
| { |
| "epoch": 2.0520833333333335, |
| "grad_norm": 0.7521345615386963, |
| "learning_rate": 5.606600323480332e-05, |
| "loss": 5.2468, |
| "num_tokens": 358942348.0, |
| "step": 197 |
| }, |
| { |
| "epoch": 2.0625, |
| "grad_norm": 0.7622354626655579, |
| "learning_rate": 5.497962551823266e-05, |
| "loss": 3.8429, |
| "num_tokens": 360757610.0, |
| "step": 198 |
| }, |
| { |
| "epoch": 2.0729166666666665, |
| "grad_norm": 0.630013644695282, |
| "learning_rate": 5.389987155262379e-05, |
| "loss": 4.3363, |
| "num_tokens": 362511064.0, |
| "step": 199 |
| }, |
| { |
| "epoch": 2.0833333333333335, |
| "grad_norm": 0.7594055533409119, |
| "learning_rate": 5.282690019985757e-05, |
| "loss": 5.4325, |
| "num_tokens": 364345973.0, |
| "step": 200 |
| }, |
| { |
| "epoch": 2.09375, |
| "grad_norm": 0.6902291774749756, |
| "learning_rate": 5.176086932390365e-05, |
| "loss": 5.3965, |
| "num_tokens": 366180610.0, |
| "step": 201 |
| }, |
| { |
| "epoch": 2.1041666666666665, |
| "grad_norm": 0.7697541117668152, |
| "learning_rate": 5.070193576759419e-05, |
| "loss": 5.37, |
| "num_tokens": 368014780.0, |
| "step": 202 |
| }, |
| { |
| "epoch": 2.1145833333333335, |
| "grad_norm": 0.7714757323265076, |
| "learning_rate": 4.965025532954801e-05, |
| "loss": 5.2093, |
| "num_tokens": 369847958.0, |
| "step": 203 |
| }, |
| { |
| "epoch": 2.125, |
| "grad_norm": 0.6553754210472107, |
| "learning_rate": 4.8605982741248215e-05, |
| "loss": 4.6465, |
| "num_tokens": 371674362.0, |
| "step": 204 |
| }, |
| { |
| "epoch": 2.1354166666666665, |
| "grad_norm": 0.7760845422744751, |
| "learning_rate": 4.756927164427685e-05, |
| "loss": 3.8276, |
| "num_tokens": 373456236.0, |
| "step": 205 |
| }, |
| { |
| "epoch": 2.1458333333333335, |
| "grad_norm": 0.8509770035743713, |
| "learning_rate": 4.654027456771004e-05, |
| "loss": 5.3333, |
| "num_tokens": 375291204.0, |
| "step": 206 |
| }, |
| { |
| "epoch": 2.15625, |
| "grad_norm": 0.7249243259429932, |
| "learning_rate": 4.551914290567665e-05, |
| "loss": 5.328, |
| "num_tokens": 377125884.0, |
| "step": 207 |
| }, |
| { |
| "epoch": 2.1666666666666665, |
| "grad_norm": 0.727703869342804, |
| "learning_rate": 4.450602689508398e-05, |
| "loss": 5.4402, |
| "num_tokens": 378960200.0, |
| "step": 208 |
| }, |
| { |
| "epoch": 2.1770833333333335, |
| "grad_norm": 0.7168691158294678, |
| "learning_rate": 4.35010755935138e-05, |
| "loss": 5.3316, |
| "num_tokens": 380793712.0, |
| "step": 209 |
| }, |
| { |
| "epoch": 2.1875, |
| "grad_norm": 0.6583669185638428, |
| "learning_rate": 4.250443685729169e-05, |
| "loss": 5.0108, |
| "num_tokens": 382624695.0, |
| "step": 210 |
| }, |
| { |
| "epoch": 2.1979166666666665, |
| "grad_norm": 0.9036791324615479, |
| "learning_rate": 4.151625731973354e-05, |
| "loss": 3.5982, |
| "num_tokens": 384387840.0, |
| "step": 211 |
| }, |
| { |
| "epoch": 2.2083333333333335, |
| "grad_norm": 0.7037631869316101, |
| "learning_rate": 4.053668236957134e-05, |
| "loss": 5.0738, |
| "num_tokens": 386222848.0, |
| "step": 212 |
| }, |
| { |
| "epoch": 2.21875, |
| "grad_norm": 0.7382966876029968, |
| "learning_rate": 3.956585612956268e-05, |
| "loss": 5.4672, |
| "num_tokens": 388057618.0, |
| "step": 213 |
| }, |
| { |
| "epoch": 2.2291666666666665, |
| "grad_norm": 0.7034947872161865, |
| "learning_rate": 3.8603921435286236e-05, |
| "loss": 5.3153, |
| "num_tokens": 389891998.0, |
| "step": 214 |
| }, |
| { |
| "epoch": 2.2395833333333335, |
| "grad_norm": 0.700252115726471, |
| "learning_rate": 3.7651019814126654e-05, |
| "loss": 5.1981, |
| "num_tokens": 391725631.0, |
| "step": 215 |
| }, |
| { |
| "epoch": 2.25, |
| "grad_norm": 0.702680766582489, |
| "learning_rate": 3.670729146445195e-05, |
| "loss": 4.976, |
| "num_tokens": 393557040.0, |
| "step": 216 |
| }, |
| { |
| "epoch": 2.2604166666666665, |
| "grad_norm": 0.8597504496574402, |
| "learning_rate": 3.577287523498641e-05, |
| "loss": 3.1902, |
| "num_tokens": 395290416.0, |
| "step": 217 |
| }, |
| { |
| "epoch": 2.2708333333333335, |
| "grad_norm": 0.6755448579788208, |
| "learning_rate": 3.4847908604382095e-05, |
| "loss": 5.2388, |
| "num_tokens": 397125424.0, |
| "step": 218 |
| }, |
| { |
| "epoch": 2.28125, |
| "grad_norm": 0.7342623472213745, |
| "learning_rate": 3.393252766099187e-05, |
| "loss": 5.322, |
| "num_tokens": 398960262.0, |
| "step": 219 |
| }, |
| { |
| "epoch": 2.2916666666666665, |
| "grad_norm": 0.6767341494560242, |
| "learning_rate": 3.3026867082847056e-05, |
| "loss": 5.4798, |
| "num_tokens": 400794825.0, |
| "step": 220 |
| }, |
| { |
| "epoch": 2.3020833333333335, |
| "grad_norm": 1.052095651626587, |
| "learning_rate": 3.21310601178425e-05, |
| "loss": 5.5205, |
| "num_tokens": 402628868.0, |
| "step": 221 |
| }, |
| { |
| "epoch": 2.3125, |
| "grad_norm": 0.6284305453300476, |
| "learning_rate": 3.1245238564132163e-05, |
| "loss": 5.2478, |
| "num_tokens": 404461483.0, |
| "step": 222 |
| }, |
| { |
| "epoch": 2.3229166666666665, |
| "grad_norm": 2.5975844860076904, |
| "learning_rate": 3.036953275073783e-05, |
| "loss": 4.1041, |
| "num_tokens": 406281705.0, |
| "step": 223 |
| }, |
| { |
| "epoch": 2.3333333333333335, |
| "grad_norm": 0.608096718788147, |
| "learning_rate": 2.950407151837421e-05, |
| "loss": 4.6334, |
| "num_tokens": 408058365.0, |
| "step": 224 |
| }, |
| { |
| "epoch": 2.34375, |
| "grad_norm": 0.6512030363082886, |
| "learning_rate": 2.864898220049277e-05, |
| "loss": 5.4096, |
| "num_tokens": 409893288.0, |
| "step": 225 |
| }, |
| { |
| "epoch": 2.3541666666666665, |
| "grad_norm": 0.6984207034111023, |
| "learning_rate": 2.7804390604547557e-05, |
| "loss": 5.5544, |
| "num_tokens": 411727962.0, |
| "step": 226 |
| }, |
| { |
| "epoch": 2.3645833333333335, |
| "grad_norm": 0.645203173160553, |
| "learning_rate": 2.697042099348528e-05, |
| "loss": 5.5268, |
| "num_tokens": 413562214.0, |
| "step": 227 |
| }, |
| { |
| "epoch": 2.375, |
| "grad_norm": 0.7451352477073669, |
| "learning_rate": 2.6147196067462852e-05, |
| "loss": 5.1853, |
| "num_tokens": 415395615.0, |
| "step": 228 |
| }, |
| { |
| "epoch": 2.3854166666666665, |
| "grad_norm": 0.6436541676521301, |
| "learning_rate": 2.533483694579477e-05, |
| "loss": 4.9048, |
| "num_tokens": 417224307.0, |
| "step": 229 |
| }, |
| { |
| "epoch": 2.3958333333333335, |
| "grad_norm": 0.7218170762062073, |
| "learning_rate": 2.4533463149133073e-05, |
| "loss": 3.8916, |
| "num_tokens": 419005795.0, |
| "step": 230 |
| }, |
| { |
| "epoch": 2.40625, |
| "grad_norm": 0.6904891729354858, |
| "learning_rate": 2.3743192581882556e-05, |
| "loss": 5.2894, |
| "num_tokens": 420840789.0, |
| "step": 231 |
| }, |
| { |
| "epoch": 2.4166666666666665, |
| "grad_norm": 0.6729670763015747, |
| "learning_rate": 2.296414151485371e-05, |
| "loss": 5.5056, |
| "num_tokens": 422675503.0, |
| "step": 232 |
| }, |
| { |
| "epoch": 2.4270833333333335, |
| "grad_norm": 0.6402373909950256, |
| "learning_rate": 2.2196424568156073e-05, |
| "loss": 5.3354, |
| "num_tokens": 424509862.0, |
| "step": 233 |
| }, |
| { |
| "epoch": 2.4375, |
| "grad_norm": 0.6556631922721863, |
| "learning_rate": 2.1440154694334404e-05, |
| "loss": 5.2873, |
| "num_tokens": 426343538.0, |
| "step": 234 |
| }, |
| { |
| "epoch": 2.4479166666666665, |
| "grad_norm": 0.6008151173591614, |
| "learning_rate": 2.069544316175025e-05, |
| "loss": 5.0402, |
| "num_tokens": 428174442.0, |
| "step": 235 |
| }, |
| { |
| "epoch": 2.4583333333333335, |
| "grad_norm": 0.851319432258606, |
| "learning_rate": 1.9962399538211207e-05, |
| "loss": 3.3889, |
| "num_tokens": 429919669.0, |
| "step": 236 |
| }, |
| { |
| "epoch": 2.46875, |
| "grad_norm": 0.6194161176681519, |
| "learning_rate": 1.9241131674850542e-05, |
| "loss": 5.3071, |
| "num_tokens": 431754677.0, |
| "step": 237 |
| }, |
| { |
| "epoch": 2.4791666666666665, |
| "grad_norm": 0.738338828086853, |
| "learning_rate": 1.853174569025914e-05, |
| "loss": 5.4458, |
| "num_tokens": 433589432.0, |
| "step": 238 |
| }, |
| { |
| "epoch": 2.4895833333333335, |
| "grad_norm": 0.6430217623710632, |
| "learning_rate": 1.7834345954872713e-05, |
| "loss": 5.4256, |
| "num_tokens": 435423869.0, |
| "step": 239 |
| }, |
| { |
| "epoch": 2.5, |
| "grad_norm": 0.6353598237037659, |
| "learning_rate": 1.7149035075615794e-05, |
| "loss": 5.3607, |
| "num_tokens": 437257708.0, |
| "step": 240 |
| }, |
| { |
| "epoch": 2.5104166666666665, |
| "grad_norm": 0.6141358017921448, |
| "learning_rate": 1.6475913880805514e-05, |
| "loss": 5.1188, |
| "num_tokens": 439089899.0, |
| "step": 241 |
| }, |
| { |
| "epoch": 2.5208333333333335, |
| "grad_norm": 0.946632444858551, |
| "learning_rate": 1.5815081405316912e-05, |
| "loss": 3.4461, |
| "num_tokens": 440851746.0, |
| "step": 242 |
| }, |
| { |
| "epoch": 2.53125, |
| "grad_norm": 0.5994837880134583, |
| "learning_rate": 1.5166634876012187e-05, |
| "loss": 5.2468, |
| "num_tokens": 442686754.0, |
| "step": 243 |
| }, |
| { |
| "epoch": 2.5416666666666665, |
| "grad_norm": 0.6093682646751404, |
| "learning_rate": 1.4530669697435861e-05, |
| "loss": 5.3245, |
| "num_tokens": 444521618.0, |
| "step": 244 |
| }, |
| { |
| "epoch": 2.5520833333333335, |
| "grad_norm": 0.6194096803665161, |
| "learning_rate": 1.3907279437778153e-05, |
| "loss": 5.3153, |
| "num_tokens": 446356145.0, |
| "step": 245 |
| }, |
| { |
| "epoch": 2.5625, |
| "grad_norm": 0.7306890487670898, |
| "learning_rate": 1.329655581510847e-05, |
| "loss": 5.3427, |
| "num_tokens": 448190084.0, |
| "step": 246 |
| }, |
| { |
| "epoch": 2.5729166666666665, |
| "grad_norm": 0.6183792352676392, |
| "learning_rate": 1.2698588683881186e-05, |
| "loss": 5.1223, |
| "num_tokens": 450022704.0, |
| "step": 247 |
| }, |
| { |
| "epoch": 2.5833333333333335, |
| "grad_norm": 0.7473891973495483, |
| "learning_rate": 1.2113466021715425e-05, |
| "loss": 3.8863, |
| "num_tokens": 451841614.0, |
| "step": 248 |
| }, |
| { |
| "epoch": 2.59375, |
| "grad_norm": 0.5600215196609497, |
| "learning_rate": 1.1541273916451235e-05, |
| "loss": 4.5054, |
| "num_tokens": 453629048.0, |
| "step": 249 |
| }, |
| { |
| "epoch": 2.6041666666666665, |
| "grad_norm": 0.5968042612075806, |
| "learning_rate": 1.0982096553483568e-05, |
| "loss": 5.156, |
| "num_tokens": 455463961.0, |
| "step": 250 |
| }, |
| { |
| "epoch": 2.6145833333333335, |
| "grad_norm": 0.6372103691101074, |
| "learning_rate": 1.0436016203376343e-05, |
| "loss": 5.3602, |
| "num_tokens": 457298596.0, |
| "step": 251 |
| }, |
| { |
| "epoch": 2.625, |
| "grad_norm": 0.6640581488609314, |
| "learning_rate": 9.903113209758096e-06, |
| "loss": 5.1987, |
| "num_tokens": 459132752.0, |
| "step": 252 |
| }, |
| { |
| "epoch": 2.6354166666666665, |
| "grad_norm": 0.6306663155555725, |
| "learning_rate": 9.383465977501227e-06, |
| "loss": 5.3803, |
| "num_tokens": 460966021.0, |
| "step": 253 |
| }, |
| { |
| "epoch": 2.6458333333333335, |
| "grad_norm": 0.5750814080238342, |
| "learning_rate": 8.87715096118642e-06, |
| "loss": 4.7525, |
| "num_tokens": 462794863.0, |
| "step": 254 |
| }, |
| { |
| "epoch": 2.65625, |
| "grad_norm": 0.7967090606689453, |
| "learning_rate": 8.384242653854146e-06, |
| "loss": 3.8948, |
| "num_tokens": 464568212.0, |
| "step": 255 |
| }, |
| { |
| "epoch": 2.6666666666666665, |
| "grad_norm": 0.6060699820518494, |
| "learning_rate": 7.904813576044534e-06, |
| "loss": 5.129, |
| "num_tokens": 466403182.0, |
| "step": 256 |
| }, |
| { |
| "epoch": 2.6770833333333335, |
| "grad_norm": 1.2050302028656006, |
| "learning_rate": 7.4389342651276395e-06, |
| "loss": 5.295, |
| "num_tokens": 468237885.0, |
| "step": 257 |
| }, |
| { |
| "epoch": 2.6875, |
| "grad_norm": 0.6205068826675415, |
| "learning_rate": 6.986673264925436e-06, |
| "loss": 5.3837, |
| "num_tokens": 470072187.0, |
| "step": 258 |
| }, |
| { |
| "epoch": 2.6979166666666665, |
| "grad_norm": 0.6179949641227722, |
| "learning_rate": 6.548097115627106e-06, |
| "loss": 5.3384, |
| "num_tokens": 471905720.0, |
| "step": 259 |
| }, |
| { |
| "epoch": 2.7083333333333335, |
| "grad_norm": 1.1975480318069458, |
| "learning_rate": 6.123270343999132e-06, |
| "loss": 4.995, |
| "num_tokens": 473735960.0, |
| "step": 260 |
| }, |
| { |
| "epoch": 2.71875, |
| "grad_norm": 0.9503664970397949, |
| "learning_rate": 5.71225545389158e-06, |
| "loss": 3.3257, |
| "num_tokens": 475491069.0, |
| "step": 261 |
| }, |
| { |
| "epoch": 2.7291666666666665, |
| "grad_norm": 0.5758056044578552, |
| "learning_rate": 5.315112917042098e-06, |
| "loss": 5.086, |
| "num_tokens": 477326077.0, |
| "step": 262 |
| }, |
| { |
| "epoch": 2.7395833333333335, |
| "grad_norm": 0.6178621649742126, |
| "learning_rate": 4.931901164178765e-06, |
| "loss": 5.4641, |
| "num_tokens": 479160897.0, |
| "step": 263 |
| }, |
| { |
| "epoch": 2.75, |
| "grad_norm": 0.6436989903450012, |
| "learning_rate": 4.562676576423397e-06, |
| "loss": 5.4971, |
| "num_tokens": 480995368.0, |
| "step": 264 |
| }, |
| { |
| "epoch": 2.7604166666666665, |
| "grad_norm": 0.626816987991333, |
| "learning_rate": 4.207493476996205e-06, |
| "loss": 5.3907, |
| "num_tokens": 482829284.0, |
| "step": 265 |
| }, |
| { |
| "epoch": 2.7708333333333335, |
| "grad_norm": 0.6234866976737976, |
| "learning_rate": 3.866404123223444e-06, |
| "loss": 5.1555, |
| "num_tokens": 484661533.0, |
| "step": 266 |
| }, |
| { |
| "epoch": 2.78125, |
| "grad_norm": 0.7500293254852295, |
| "learning_rate": 3.53945869884883e-06, |
| "loss": 3.6557, |
| "num_tokens": 486428903.0, |
| "step": 267 |
| }, |
| { |
| "epoch": 2.7916666666666665, |
| "grad_norm": 0.6020712852478027, |
| "learning_rate": 3.226705306650113e-06, |
| "loss": 5.2937, |
| "num_tokens": 488263911.0, |
| "step": 268 |
| }, |
| { |
| "epoch": 2.8020833333333335, |
| "grad_norm": 0.6582825183868408, |
| "learning_rate": 2.9281899613619047e-06, |
| "loss": 5.3827, |
| "num_tokens": 490098734.0, |
| "step": 269 |
| }, |
| { |
| "epoch": 2.8125, |
| "grad_norm": 0.6350494623184204, |
| "learning_rate": 2.6439565829055268e-06, |
| "loss": 5.4643, |
| "num_tokens": 491933221.0, |
| "step": 270 |
| }, |
| { |
| "epoch": 2.8229166666666665, |
| "grad_norm": 0.6131789684295654, |
| "learning_rate": 2.3740469899272145e-06, |
| "loss": 5.309, |
| "num_tokens": 493767156.0, |
| "step": 271 |
| }, |
| { |
| "epoch": 2.8333333333333335, |
| "grad_norm": 0.6128297448158264, |
| "learning_rate": 2.1185008936454254e-06, |
| "loss": 5.0592, |
| "num_tokens": 495599700.0, |
| "step": 272 |
| }, |
| { |
| "epoch": 2.84375, |
| "grad_norm": 0.7139255404472351, |
| "learning_rate": 1.8773558920082034e-06, |
| "loss": 3.9382, |
| "num_tokens": 497418917.0, |
| "step": 273 |
| }, |
| { |
| "epoch": 2.8541666666666665, |
| "grad_norm": 0.5505620241165161, |
| "learning_rate": 1.6506474641614923e-06, |
| "loss": 4.6059, |
| "num_tokens": 499174914.0, |
| "step": 274 |
| }, |
| { |
| "epoch": 2.8645833333333335, |
| "grad_norm": 0.6600484848022461, |
| "learning_rate": 1.4384089652291543e-06, |
| "loss": 5.108, |
| "num_tokens": 501009815.0, |
| "step": 275 |
| }, |
| { |
| "epoch": 2.875, |
| "grad_norm": 0.6675175428390503, |
| "learning_rate": 1.240671621405498e-06, |
| "loss": 5.4172, |
| "num_tokens": 502844441.0, |
| "step": 276 |
| }, |
| { |
| "epoch": 2.8854166666666665, |
| "grad_norm": 0.6115691661834717, |
| "learning_rate": 1.0574645253610404e-06, |
| "loss": 5.3482, |
| "num_tokens": 504678606.0, |
| "step": 277 |
| }, |
| { |
| "epoch": 2.8958333333333335, |
| "grad_norm": 0.5985777974128723, |
| "learning_rate": 8.888146319621537e-07, |
| "loss": 5.2838, |
| "num_tokens": 506511804.0, |
| "step": 278 |
| }, |
| { |
| "epoch": 2.90625, |
| "grad_norm": 0.583017885684967, |
| "learning_rate": 7.347467543052932e-07, |
| "loss": 4.6211, |
| "num_tokens": 508338446.0, |
| "step": 279 |
| }, |
| { |
| "epoch": 2.9166666666666665, |
| "grad_norm": 0.6637594103813171, |
| "learning_rate": 5.952835600662288e-07, |
| "loss": 3.8064, |
| "num_tokens": 510108388.0, |
| "step": 280 |
| }, |
| { |
| "epoch": 2.9270833333333335, |
| "grad_norm": 0.5726197957992554, |
| "learning_rate": 4.704455681650788e-07, |
| "loss": 4.9473, |
| "num_tokens": 511943368.0, |
| "step": 281 |
| }, |
| { |
| "epoch": 2.9375, |
| "grad_norm": 0.630939781665802, |
| "learning_rate": 3.6025114574734785e-07, |
| "loss": 5.369, |
| "num_tokens": 513778092.0, |
| "step": 282 |
| }, |
| { |
| "epoch": 2.9479166666666665, |
| "grad_norm": 0.7909092307090759, |
| "learning_rate": 2.647165054816325e-07, |
| "loss": 5.3277, |
| "num_tokens": 515612457.0, |
| "step": 283 |
| }, |
| { |
| "epoch": 2.9583333333333335, |
| "grad_norm": 0.7272047400474548, |
| "learning_rate": 1.838557031742738e-07, |
| "loss": 5.2173, |
| "num_tokens": 517446170.0, |
| "step": 284 |
| }, |
| { |
| "epoch": 2.96875, |
| "grad_norm": 0.6117963790893555, |
| "learning_rate": 1.1768063570136711e-07, |
| "loss": 5.0083, |
| "num_tokens": 519277051.0, |
| "step": 285 |
| }, |
| { |
| "epoch": 2.9791666666666665, |
| "grad_norm": 0.7872759699821472, |
| "learning_rate": 6.62010392584067e-08, |
| "loss": 3.5749, |
| "num_tokens": 521007127.0, |
| "step": 286 |
| }, |
| { |
| "epoch": 2.9895833333333335, |
| "grad_norm": 0.6111645698547363, |
| "learning_rate": 2.942448792778718e-08, |
| "loss": 5.3112, |
| "num_tokens": 522841066.0, |
| "step": 287 |
| }, |
| { |
| "epoch": 3.0, |
| "grad_norm": 0.5639166235923767, |
| "learning_rate": 7.3563925645059315e-09, |
| "loss": 4.265, |
| "num_tokens": 524657229.0, |
| "step": 288 |
| }, |
| { |
| "epoch": 3.0, |
| "eval_loss": 0.3267965018749237, |
| "eval_num_tokens": 524657229.0, |
| "eval_runtime": 29.6695, |
| "eval_samples_per_second": 74.588, |
| "eval_steps_per_second": 2.359, |
| "step": 288 |
| } |
| ], |
| "logging_steps": 1, |
| "max_steps": 288, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 3, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 1.57731504726653e+19, |
| "train_batch_size": 28, |
| "trial_name": null, |
| "trial_params": null |
| } |