| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 5.0, |
| "eval_steps": 500, |
| "global_step": 1385, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "entropy": 1.5288876295089722, |
| "epoch": 0.0036199095022624436, |
| "grad_norm": 7.694402694702148, |
| "learning_rate": 0.0, |
| "loss": 1.3638800382614136, |
| "mean_token_accuracy": 0.8390376716852188, |
| "num_tokens": 9306.0, |
| "step": 1 |
| }, |
| { |
| "entropy": 1.4891301393508911, |
| "epoch": 0.007239819004524887, |
| "grad_norm": 10.345961570739746, |
| "learning_rate": 2.9850746268656716e-06, |
| "loss": 1.404976725578308, |
| "mean_token_accuracy": 0.8416248112916946, |
| "num_tokens": 18426.0, |
| "step": 2 |
| }, |
| { |
| "entropy": 1.5589642226696014, |
| "epoch": 0.01085972850678733, |
| "grad_norm": 7.015953063964844, |
| "learning_rate": 5.970149253731343e-06, |
| "loss": 1.5137574672698975, |
| "mean_token_accuracy": 0.844389408826828, |
| "num_tokens": 27017.0, |
| "step": 3 |
| }, |
| { |
| "entropy": 1.5128743052482605, |
| "epoch": 0.014479638009049774, |
| "grad_norm": 8.688935279846191, |
| "learning_rate": 8.955223880597016e-06, |
| "loss": 1.429337501525879, |
| "mean_token_accuracy": 0.8436507284641266, |
| "num_tokens": 36186.0, |
| "step": 4 |
| }, |
| { |
| "entropy": 1.5162458419799805, |
| "epoch": 0.01809954751131222, |
| "grad_norm": 18.12025260925293, |
| "learning_rate": 1.1940298507462686e-05, |
| "loss": 1.5170090198516846, |
| "mean_token_accuracy": 0.8127347379922867, |
| "num_tokens": 45259.0, |
| "step": 5 |
| }, |
| { |
| "entropy": 1.607073962688446, |
| "epoch": 0.02171945701357466, |
| "grad_norm": 11.967021942138672, |
| "learning_rate": 1.4925373134328357e-05, |
| "loss": 1.7809343338012695, |
| "mean_token_accuracy": 0.7834140509366989, |
| "num_tokens": 53870.0, |
| "step": 6 |
| }, |
| { |
| "entropy": 1.5723404288291931, |
| "epoch": 0.025339366515837104, |
| "grad_norm": 7.197022438049316, |
| "learning_rate": 1.791044776119403e-05, |
| "loss": 1.355630874633789, |
| "mean_token_accuracy": 0.8707718253135681, |
| "num_tokens": 62422.0, |
| "step": 7 |
| }, |
| { |
| "entropy": 1.6577945351600647, |
| "epoch": 0.02895927601809955, |
| "grad_norm": 9.124281883239746, |
| "learning_rate": 2.0895522388059702e-05, |
| "loss": 1.5860857963562012, |
| "mean_token_accuracy": 0.8311486840248108, |
| "num_tokens": 70836.0, |
| "step": 8 |
| }, |
| { |
| "entropy": 1.5083436369895935, |
| "epoch": 0.03257918552036199, |
| "grad_norm": 9.471440315246582, |
| "learning_rate": 2.3880597014925373e-05, |
| "loss": 1.4798086881637573, |
| "mean_token_accuracy": 0.8188015669584274, |
| "num_tokens": 79489.0, |
| "step": 9 |
| }, |
| { |
| "entropy": 1.5417097806930542, |
| "epoch": 0.03619909502262444, |
| "grad_norm": 6.9740309715271, |
| "learning_rate": 2.6865671641791047e-05, |
| "loss": 1.4000660181045532, |
| "mean_token_accuracy": 0.8296933174133301, |
| "num_tokens": 88400.0, |
| "step": 10 |
| }, |
| { |
| "entropy": 1.6839916408061981, |
| "epoch": 0.039819004524886875, |
| "grad_norm": 8.314177513122559, |
| "learning_rate": 2.9850746268656714e-05, |
| "loss": 1.3732950687408447, |
| "mean_token_accuracy": 0.8450545966625214, |
| "num_tokens": 97018.0, |
| "step": 11 |
| }, |
| { |
| "entropy": 1.7210606038570404, |
| "epoch": 0.04343891402714932, |
| "grad_norm": 6.364591598510742, |
| "learning_rate": 3.283582089552239e-05, |
| "loss": 1.2142231464385986, |
| "mean_token_accuracy": 0.8527437746524811, |
| "num_tokens": 105866.0, |
| "step": 12 |
| }, |
| { |
| "entropy": 1.6527923345565796, |
| "epoch": 0.047058823529411764, |
| "grad_norm": 4.993825912475586, |
| "learning_rate": 3.582089552238806e-05, |
| "loss": 0.9318434000015259, |
| "mean_token_accuracy": 0.8724203705787659, |
| "num_tokens": 114999.0, |
| "step": 13 |
| }, |
| { |
| "entropy": 1.7282630801200867, |
| "epoch": 0.05067873303167421, |
| "grad_norm": 4.304642677307129, |
| "learning_rate": 3.8805970149253736e-05, |
| "loss": 0.9089325070381165, |
| "mean_token_accuracy": 0.8798395097255707, |
| "num_tokens": 124184.0, |
| "step": 14 |
| }, |
| { |
| "entropy": 1.8163867890834808, |
| "epoch": 0.05429864253393665, |
| "grad_norm": 3.7051150798797607, |
| "learning_rate": 4.1791044776119404e-05, |
| "loss": 0.7500128746032715, |
| "mean_token_accuracy": 0.8938957899808884, |
| "num_tokens": 132985.0, |
| "step": 15 |
| }, |
| { |
| "entropy": 2.0243027210235596, |
| "epoch": 0.0579185520361991, |
| "grad_norm": 4.971452236175537, |
| "learning_rate": 4.477611940298508e-05, |
| "loss": 1.0864768028259277, |
| "mean_token_accuracy": 0.8368343859910965, |
| "num_tokens": 141599.0, |
| "step": 16 |
| }, |
| { |
| "entropy": 2.125125467777252, |
| "epoch": 0.06153846153846154, |
| "grad_norm": 4.845816612243652, |
| "learning_rate": 4.7761194029850745e-05, |
| "loss": 0.7839944958686829, |
| "mean_token_accuracy": 0.8883605301380157, |
| "num_tokens": 149961.0, |
| "step": 17 |
| }, |
| { |
| "entropy": 2.167099416255951, |
| "epoch": 0.06515837104072399, |
| "grad_norm": 4.479213237762451, |
| "learning_rate": 5.074626865671642e-05, |
| "loss": 0.6522338390350342, |
| "mean_token_accuracy": 0.8985295295715332, |
| "num_tokens": 158394.0, |
| "step": 18 |
| }, |
| { |
| "entropy": 2.3476614952087402, |
| "epoch": 0.06877828054298643, |
| "grad_norm": 4.596512794494629, |
| "learning_rate": 5.373134328358209e-05, |
| "loss": 0.5884965062141418, |
| "mean_token_accuracy": 0.8780558109283447, |
| "num_tokens": 167295.0, |
| "step": 19 |
| }, |
| { |
| "entropy": 2.620903968811035, |
| "epoch": 0.07239819004524888, |
| "grad_norm": 3.99661922454834, |
| "learning_rate": 5.671641791044776e-05, |
| "loss": 0.6179074645042419, |
| "mean_token_accuracy": 0.875989705324173, |
| "num_tokens": 176255.0, |
| "step": 20 |
| }, |
| { |
| "entropy": 2.663840651512146, |
| "epoch": 0.0760180995475113, |
| "grad_norm": 2.395817518234253, |
| "learning_rate": 5.970149253731343e-05, |
| "loss": 0.5167301893234253, |
| "mean_token_accuracy": 0.8798592388629913, |
| "num_tokens": 185601.0, |
| "step": 21 |
| }, |
| { |
| "entropy": 3.007373869419098, |
| "epoch": 0.07963800904977375, |
| "grad_norm": 1.9023845195770264, |
| "learning_rate": 6.268656716417911e-05, |
| "loss": 0.4969954788684845, |
| "mean_token_accuracy": 0.8838344216346741, |
| "num_tokens": 194036.0, |
| "step": 22 |
| }, |
| { |
| "entropy": 3.014187455177307, |
| "epoch": 0.0832579185520362, |
| "grad_norm": 1.0483063459396362, |
| "learning_rate": 6.567164179104478e-05, |
| "loss": 0.4313647150993347, |
| "mean_token_accuracy": 0.8821887522935867, |
| "num_tokens": 203167.0, |
| "step": 23 |
| }, |
| { |
| "entropy": 3.5317789912223816, |
| "epoch": 0.08687782805429864, |
| "grad_norm": 1.9082902669906616, |
| "learning_rate": 6.865671641791044e-05, |
| "loss": 0.6452760100364685, |
| "mean_token_accuracy": 0.8487882167100906, |
| "num_tokens": 211791.0, |
| "step": 24 |
| }, |
| { |
| "entropy": 3.470491588115692, |
| "epoch": 0.09049773755656108, |
| "grad_norm": 1.3330037593841553, |
| "learning_rate": 7.164179104477612e-05, |
| "loss": 0.609681248664856, |
| "mean_token_accuracy": 0.8559600114822388, |
| "num_tokens": 220855.0, |
| "step": 25 |
| }, |
| { |
| "entropy": 3.8751351833343506, |
| "epoch": 0.09411764705882353, |
| "grad_norm": 1.864700436592102, |
| "learning_rate": 7.46268656716418e-05, |
| "loss": 0.5049571394920349, |
| "mean_token_accuracy": 0.8841463923454285, |
| "num_tokens": 229389.0, |
| "step": 26 |
| }, |
| { |
| "entropy": 3.706156551837921, |
| "epoch": 0.09773755656108597, |
| "grad_norm": 1.7854461669921875, |
| "learning_rate": 7.761194029850747e-05, |
| "loss": 0.39277932047843933, |
| "mean_token_accuracy": 0.9083494395017624, |
| "num_tokens": 238523.0, |
| "step": 27 |
| }, |
| { |
| "entropy": 3.8404606580734253, |
| "epoch": 0.10135746606334842, |
| "grad_norm": 2.3090603351593018, |
| "learning_rate": 8.059701492537314e-05, |
| "loss": 0.35173487663269043, |
| "mean_token_accuracy": 0.9240767657756805, |
| "num_tokens": 247228.0, |
| "step": 28 |
| }, |
| { |
| "entropy": 4.031954348087311, |
| "epoch": 0.10497737556561086, |
| "grad_norm": 1.6039751768112183, |
| "learning_rate": 8.358208955223881e-05, |
| "loss": 0.5736312866210938, |
| "mean_token_accuracy": 0.8665709495544434, |
| "num_tokens": 255766.0, |
| "step": 29 |
| }, |
| { |
| "entropy": 4.0528751611709595, |
| "epoch": 0.1085972850678733, |
| "grad_norm": 1.5278459787368774, |
| "learning_rate": 8.656716417910447e-05, |
| "loss": 0.46521249413490295, |
| "mean_token_accuracy": 0.8837475925683975, |
| "num_tokens": 264515.0, |
| "step": 30 |
| }, |
| { |
| "entropy": 3.7500529289245605, |
| "epoch": 0.11221719457013575, |
| "grad_norm": 1.2679299116134644, |
| "learning_rate": 8.955223880597016e-05, |
| "loss": 0.6649113297462463, |
| "mean_token_accuracy": 0.8520247489213943, |
| "num_tokens": 273421.0, |
| "step": 31 |
| }, |
| { |
| "entropy": 3.870599329471588, |
| "epoch": 0.1158371040723982, |
| "grad_norm": 1.4248121976852417, |
| "learning_rate": 9.253731343283582e-05, |
| "loss": 0.2973020672798157, |
| "mean_token_accuracy": 0.9219554513692856, |
| "num_tokens": 281883.0, |
| "step": 32 |
| }, |
| { |
| "entropy": 3.5264278650283813, |
| "epoch": 0.11945701357466064, |
| "grad_norm": 0.7345032691955566, |
| "learning_rate": 9.552238805970149e-05, |
| "loss": 0.504949688911438, |
| "mean_token_accuracy": 0.9012798517942429, |
| "num_tokens": 290801.0, |
| "step": 33 |
| }, |
| { |
| "entropy": 3.6929972171783447, |
| "epoch": 0.12307692307692308, |
| "grad_norm": 1.0366969108581543, |
| "learning_rate": 9.850746268656717e-05, |
| "loss": 0.35403263568878174, |
| "mean_token_accuracy": 0.9262522161006927, |
| "num_tokens": 299165.0, |
| "step": 34 |
| }, |
| { |
| "entropy": 3.458252251148224, |
| "epoch": 0.12669683257918551, |
| "grad_norm": 0.8196644186973572, |
| "learning_rate": 0.00010149253731343284, |
| "loss": 0.46754205226898193, |
| "mean_token_accuracy": 0.8908957839012146, |
| "num_tokens": 307618.0, |
| "step": 35 |
| }, |
| { |
| "entropy": 3.502661347389221, |
| "epoch": 0.13031674208144797, |
| "grad_norm": 0.8469979166984558, |
| "learning_rate": 0.0001044776119402985, |
| "loss": 0.377693772315979, |
| "mean_token_accuracy": 0.9080320745706558, |
| "num_tokens": 316466.0, |
| "step": 36 |
| }, |
| { |
| "entropy": 3.3003416061401367, |
| "epoch": 0.1339366515837104, |
| "grad_norm": 0.7982582449913025, |
| "learning_rate": 0.00010746268656716419, |
| "loss": 0.3522840142250061, |
| "mean_token_accuracy": 0.9149231612682343, |
| "num_tokens": 325197.0, |
| "step": 37 |
| }, |
| { |
| "entropy": 3.265174388885498, |
| "epoch": 0.13755656108597286, |
| "grad_norm": 0.7812036275863647, |
| "learning_rate": 0.00011044776119402987, |
| "loss": 0.33715564012527466, |
| "mean_token_accuracy": 0.9169812202453613, |
| "num_tokens": 334246.0, |
| "step": 38 |
| }, |
| { |
| "entropy": 3.2711930871009827, |
| "epoch": 0.1411764705882353, |
| "grad_norm": 0.6879589557647705, |
| "learning_rate": 0.00011343283582089552, |
| "loss": 0.22535352408885956, |
| "mean_token_accuracy": 0.9371069073677063, |
| "num_tokens": 342753.0, |
| "step": 39 |
| }, |
| { |
| "entropy": 3.039187252521515, |
| "epoch": 0.14479638009049775, |
| "grad_norm": 0.7044833898544312, |
| "learning_rate": 0.0001164179104477612, |
| "loss": 0.269231379032135, |
| "mean_token_accuracy": 0.9326638281345367, |
| "num_tokens": 351703.0, |
| "step": 40 |
| }, |
| { |
| "entropy": 3.1341193318367004, |
| "epoch": 0.14841628959276018, |
| "grad_norm": 0.7598081231117249, |
| "learning_rate": 0.00011940298507462686, |
| "loss": 0.23056557774543762, |
| "mean_token_accuracy": 0.9387440532445908, |
| "num_tokens": 360201.0, |
| "step": 41 |
| }, |
| { |
| "entropy": 3.1592912673950195, |
| "epoch": 0.1520361990950226, |
| "grad_norm": 0.9636098146438599, |
| "learning_rate": 0.00012238805970149255, |
| "loss": 0.4810163974761963, |
| "mean_token_accuracy": 0.8872013241052628, |
| "num_tokens": 368978.0, |
| "step": 42 |
| }, |
| { |
| "entropy": 2.778893828392029, |
| "epoch": 0.15565610859728507, |
| "grad_norm": 0.5792455077171326, |
| "learning_rate": 0.00012537313432835822, |
| "loss": 0.21380706131458282, |
| "mean_token_accuracy": 0.9375593662261963, |
| "num_tokens": 378445.0, |
| "step": 43 |
| }, |
| { |
| "entropy": 2.7576374411582947, |
| "epoch": 0.1592760180995475, |
| "grad_norm": 0.9693785905838013, |
| "learning_rate": 0.00012835820895522389, |
| "loss": 0.2909581959247589, |
| "mean_token_accuracy": 0.9316168874502182, |
| "num_tokens": 387672.0, |
| "step": 44 |
| }, |
| { |
| "entropy": 2.5791444182395935, |
| "epoch": 0.16289592760180996, |
| "grad_norm": 0.6936656832695007, |
| "learning_rate": 0.00013134328358208955, |
| "loss": 0.32536542415618896, |
| "mean_token_accuracy": 0.9312366247177124, |
| "num_tokens": 396735.0, |
| "step": 45 |
| }, |
| { |
| "entropy": 2.483812391757965, |
| "epoch": 0.1665158371040724, |
| "grad_norm": 0.8131234049797058, |
| "learning_rate": 0.00013432835820895525, |
| "loss": 0.4375811219215393, |
| "mean_token_accuracy": 0.8777281790971756, |
| "num_tokens": 405449.0, |
| "step": 46 |
| }, |
| { |
| "entropy": 1.9735961556434631, |
| "epoch": 0.17013574660633485, |
| "grad_norm": 0.7072490453720093, |
| "learning_rate": 0.0001373134328358209, |
| "loss": 0.3115054965019226, |
| "mean_token_accuracy": 0.9156161844730377, |
| "num_tokens": 414791.0, |
| "step": 47 |
| }, |
| { |
| "entropy": 1.7938538491725922, |
| "epoch": 0.17375565610859728, |
| "grad_norm": 0.8033711314201355, |
| "learning_rate": 0.00014029850746268658, |
| "loss": 0.4930950999259949, |
| "mean_token_accuracy": 0.8819727599620819, |
| "num_tokens": 423449.0, |
| "step": 48 |
| }, |
| { |
| "entropy": 1.6752981543540955, |
| "epoch": 0.17737556561085974, |
| "grad_norm": 0.4999159574508667, |
| "learning_rate": 0.00014328358208955225, |
| "loss": 0.2053433656692505, |
| "mean_token_accuracy": 0.952075183391571, |
| "num_tokens": 432347.0, |
| "step": 49 |
| }, |
| { |
| "entropy": 1.6547318398952484, |
| "epoch": 0.18099547511312217, |
| "grad_norm": 5.030092716217041, |
| "learning_rate": 0.00014626865671641792, |
| "loss": 0.24254727363586426, |
| "mean_token_accuracy": 0.9326749891042709, |
| "num_tokens": 441197.0, |
| "step": 50 |
| }, |
| { |
| "entropy": 1.6684256792068481, |
| "epoch": 0.18461538461538463, |
| "grad_norm": 0.5943530201911926, |
| "learning_rate": 0.0001492537313432836, |
| "loss": 0.3166338801383972, |
| "mean_token_accuracy": 0.9284558445215225, |
| "num_tokens": 450481.0, |
| "step": 51 |
| }, |
| { |
| "entropy": 1.6687548756599426, |
| "epoch": 0.18823529411764706, |
| "grad_norm": 0.6136987805366516, |
| "learning_rate": 0.00015223880597014925, |
| "loss": 0.31150421500205994, |
| "mean_token_accuracy": 0.9149410724639893, |
| "num_tokens": 459376.0, |
| "step": 52 |
| }, |
| { |
| "entropy": 1.6935299038887024, |
| "epoch": 0.19185520361990951, |
| "grad_norm": 0.6838834285736084, |
| "learning_rate": 0.00015522388059701495, |
| "loss": 0.39871394634246826, |
| "mean_token_accuracy": 0.9073809385299683, |
| "num_tokens": 468010.0, |
| "step": 53 |
| }, |
| { |
| "entropy": 1.5816974937915802, |
| "epoch": 0.19547511312217195, |
| "grad_norm": 0.5880348086357117, |
| "learning_rate": 0.00015820895522388059, |
| "loss": 0.25584831833839417, |
| "mean_token_accuracy": 0.9261536300182343, |
| "num_tokens": 476753.0, |
| "step": 54 |
| }, |
| { |
| "entropy": 1.5977029204368591, |
| "epoch": 0.19909502262443438, |
| "grad_norm": 0.5524119734764099, |
| "learning_rate": 0.00016119402985074628, |
| "loss": 0.27022480964660645, |
| "mean_token_accuracy": 0.9257165640592575, |
| "num_tokens": 485781.0, |
| "step": 55 |
| }, |
| { |
| "entropy": 1.6373226642608643, |
| "epoch": 0.20271493212669683, |
| "grad_norm": 0.6135741472244263, |
| "learning_rate": 0.00016417910447761195, |
| "loss": 0.263553261756897, |
| "mean_token_accuracy": 0.93193618953228, |
| "num_tokens": 494688.0, |
| "step": 56 |
| }, |
| { |
| "entropy": 1.5185258090496063, |
| "epoch": 0.20633484162895926, |
| "grad_norm": 0.5645662546157837, |
| "learning_rate": 0.00016716417910447761, |
| "loss": 0.297269344329834, |
| "mean_token_accuracy": 0.9199682921171188, |
| "num_tokens": 503958.0, |
| "step": 57 |
| }, |
| { |
| "entropy": 1.530813604593277, |
| "epoch": 0.20995475113122172, |
| "grad_norm": 0.5128429532051086, |
| "learning_rate": 0.00017014925373134328, |
| "loss": 0.3574504554271698, |
| "mean_token_accuracy": 0.9292190074920654, |
| "num_tokens": 513417.0, |
| "step": 58 |
| }, |
| { |
| "entropy": 1.5204592645168304, |
| "epoch": 0.21357466063348415, |
| "grad_norm": 0.5484449863433838, |
| "learning_rate": 0.00017313432835820895, |
| "loss": 0.2696574032306671, |
| "mean_token_accuracy": 0.9359241276979446, |
| "num_tokens": 522380.0, |
| "step": 59 |
| }, |
| { |
| "entropy": 1.5577877461910248, |
| "epoch": 0.2171945701357466, |
| "grad_norm": 0.6176109313964844, |
| "learning_rate": 0.00017611940298507464, |
| "loss": 0.3171631097793579, |
| "mean_token_accuracy": 0.9242918938398361, |
| "num_tokens": 531241.0, |
| "step": 60 |
| }, |
| { |
| "entropy": 1.5508787035942078, |
| "epoch": 0.22081447963800904, |
| "grad_norm": 0.5896923542022705, |
| "learning_rate": 0.0001791044776119403, |
| "loss": 0.18115177750587463, |
| "mean_token_accuracy": 0.9481829404830933, |
| "num_tokens": 540149.0, |
| "step": 61 |
| }, |
| { |
| "entropy": 1.5782250761985779, |
| "epoch": 0.2244343891402715, |
| "grad_norm": 0.7554022669792175, |
| "learning_rate": 0.00018208955223880598, |
| "loss": 0.3728199303150177, |
| "mean_token_accuracy": 0.9055676311254501, |
| "num_tokens": 548927.0, |
| "step": 62 |
| }, |
| { |
| "entropy": 1.6092694103717804, |
| "epoch": 0.22805429864253393, |
| "grad_norm": 0.696146547794342, |
| "learning_rate": 0.00018507462686567165, |
| "loss": 0.49058157205581665, |
| "mean_token_accuracy": 0.8916458785533905, |
| "num_tokens": 557895.0, |
| "step": 63 |
| }, |
| { |
| "entropy": 1.5317207276821136, |
| "epoch": 0.2316742081447964, |
| "grad_norm": 0.6891468167304993, |
| "learning_rate": 0.00018805970149253734, |
| "loss": 0.36224162578582764, |
| "mean_token_accuracy": 0.9079867750406265, |
| "num_tokens": 567080.0, |
| "step": 64 |
| }, |
| { |
| "entropy": 1.5777660310268402, |
| "epoch": 0.23529411764705882, |
| "grad_norm": 0.6747457981109619, |
| "learning_rate": 0.00019104477611940298, |
| "loss": 0.34208086133003235, |
| "mean_token_accuracy": 0.9191138446331024, |
| "num_tokens": 576156.0, |
| "step": 65 |
| }, |
| { |
| "entropy": 1.579883337020874, |
| "epoch": 0.23891402714932128, |
| "grad_norm": 0.6088735461235046, |
| "learning_rate": 0.00019402985074626867, |
| "loss": 0.2968827188014984, |
| "mean_token_accuracy": 0.9238106161355972, |
| "num_tokens": 585109.0, |
| "step": 66 |
| }, |
| { |
| "entropy": 1.6180895566940308, |
| "epoch": 0.2425339366515837, |
| "grad_norm": 0.5592202544212341, |
| "learning_rate": 0.00019701492537313434, |
| "loss": 0.17197401821613312, |
| "mean_token_accuracy": 0.954246997833252, |
| "num_tokens": 593744.0, |
| "step": 67 |
| }, |
| { |
| "entropy": 1.5819954574108124, |
| "epoch": 0.24615384615384617, |
| "grad_norm": 0.6158381700515747, |
| "learning_rate": 0.0002, |
| "loss": 0.32391372323036194, |
| "mean_token_accuracy": 0.9132590889930725, |
| "num_tokens": 602817.0, |
| "step": 68 |
| }, |
| { |
| "entropy": 1.6124244332313538, |
| "epoch": 0.2497737556561086, |
| "grad_norm": 0.6058560013771057, |
| "learning_rate": 0.00019999990383005872, |
| "loss": 0.398790568113327, |
| "mean_token_accuracy": 0.9038740396499634, |
| "num_tokens": 611500.0, |
| "step": 69 |
| }, |
| { |
| "entropy": 1.5589013695716858, |
| "epoch": 0.25339366515837103, |
| "grad_norm": 0.5041698813438416, |
| "learning_rate": 0.00019999961532044045, |
| "loss": 0.24661695957183838, |
| "mean_token_accuracy": 0.9379201531410217, |
| "num_tokens": 620497.0, |
| "step": 70 |
| }, |
| { |
| "entropy": 1.539864867925644, |
| "epoch": 0.25701357466063346, |
| "grad_norm": 0.5611797571182251, |
| "learning_rate": 0.00019999913447176174, |
| "loss": 0.2731279730796814, |
| "mean_token_accuracy": 0.9369927495718002, |
| "num_tokens": 629538.0, |
| "step": 71 |
| }, |
| { |
| "entropy": 1.5526191294193268, |
| "epoch": 0.26063348416289595, |
| "grad_norm": 0.49492526054382324, |
| "learning_rate": 0.00019999846128505015, |
| "loss": 0.22361817955970764, |
| "mean_token_accuracy": 0.9443113952875137, |
| "num_tokens": 638389.0, |
| "step": 72 |
| }, |
| { |
| "entropy": 1.6069320440292358, |
| "epoch": 0.2642533936651584, |
| "grad_norm": 0.6465152502059937, |
| "learning_rate": 0.00019999759576174448, |
| "loss": 0.2783147692680359, |
| "mean_token_accuracy": 0.9260384887456894, |
| "num_tokens": 647129.0, |
| "step": 73 |
| }, |
| { |
| "entropy": 1.5199538469314575, |
| "epoch": 0.2678733031674208, |
| "grad_norm": 0.541709840297699, |
| "learning_rate": 0.00019999653790369438, |
| "loss": 0.23969395458698273, |
| "mean_token_accuracy": 0.9355704188346863, |
| "num_tokens": 655779.0, |
| "step": 74 |
| }, |
| { |
| "entropy": 1.4944458305835724, |
| "epoch": 0.27149321266968324, |
| "grad_norm": 0.5421354174613953, |
| "learning_rate": 0.00019999528771316057, |
| "loss": 0.23178081214427948, |
| "mean_token_accuracy": 0.9418981224298477, |
| "num_tokens": 664953.0, |
| "step": 75 |
| }, |
| { |
| "entropy": 1.4615215063095093, |
| "epoch": 0.2751131221719457, |
| "grad_norm": 0.5120413899421692, |
| "learning_rate": 0.00019999384519281494, |
| "loss": 0.21126341819763184, |
| "mean_token_accuracy": 0.9391632974147797, |
| "num_tokens": 674258.0, |
| "step": 76 |
| }, |
| { |
| "entropy": 1.5696524381637573, |
| "epoch": 0.27873303167420815, |
| "grad_norm": 0.6865569949150085, |
| "learning_rate": 0.00019999221034574028, |
| "loss": 0.31426694989204407, |
| "mean_token_accuracy": 0.9186421036720276, |
| "num_tokens": 682844.0, |
| "step": 77 |
| }, |
| { |
| "entropy": 1.4924792051315308, |
| "epoch": 0.2823529411764706, |
| "grad_norm": 0.6203171610832214, |
| "learning_rate": 0.00019999038317543036, |
| "loss": 0.22723156213760376, |
| "mean_token_accuracy": 0.9455482661724091, |
| "num_tokens": 692032.0, |
| "step": 78 |
| }, |
| { |
| "entropy": 1.5375191569328308, |
| "epoch": 0.285972850678733, |
| "grad_norm": 0.5844593048095703, |
| "learning_rate": 0.00019998836368579013, |
| "loss": 0.3107585310935974, |
| "mean_token_accuracy": 0.9372987002134323, |
| "num_tokens": 700940.0, |
| "step": 79 |
| }, |
| { |
| "entropy": 1.542263776063919, |
| "epoch": 0.2895927601809955, |
| "grad_norm": 0.6159414649009705, |
| "learning_rate": 0.00019998615188113547, |
| "loss": 0.20322281122207642, |
| "mean_token_accuracy": 0.9484172016382217, |
| "num_tokens": 709374.0, |
| "step": 80 |
| }, |
| { |
| "entropy": 1.566244751214981, |
| "epoch": 0.29321266968325793, |
| "grad_norm": 0.7167544960975647, |
| "learning_rate": 0.00019998374776619316, |
| "loss": 0.289408415555954, |
| "mean_token_accuracy": 0.9302106499671936, |
| "num_tokens": 718254.0, |
| "step": 81 |
| }, |
| { |
| "entropy": 1.5220647156238556, |
| "epoch": 0.29683257918552036, |
| "grad_norm": 0.5164801478385925, |
| "learning_rate": 0.0001999811513461012, |
| "loss": 0.16729353368282318, |
| "mean_token_accuracy": 0.961905837059021, |
| "num_tokens": 726892.0, |
| "step": 82 |
| }, |
| { |
| "entropy": 1.4655065834522247, |
| "epoch": 0.3004524886877828, |
| "grad_norm": 0.7785144448280334, |
| "learning_rate": 0.00019997836262640825, |
| "loss": 0.2753179669380188, |
| "mean_token_accuracy": 0.9345895648002625, |
| "num_tokens": 736144.0, |
| "step": 83 |
| }, |
| { |
| "entropy": 1.5420578122138977, |
| "epoch": 0.3040723981900452, |
| "grad_norm": 0.7086989879608154, |
| "learning_rate": 0.00019997538161307425, |
| "loss": 0.19912396371364594, |
| "mean_token_accuracy": 0.948458805680275, |
| "num_tokens": 744947.0, |
| "step": 84 |
| }, |
| { |
| "entropy": 1.5529279112815857, |
| "epoch": 0.3076923076923077, |
| "grad_norm": 0.6447221040725708, |
| "learning_rate": 0.00019997220831246987, |
| "loss": 0.19798173010349274, |
| "mean_token_accuracy": 0.9535399079322815, |
| "num_tokens": 754040.0, |
| "step": 85 |
| }, |
| { |
| "entropy": 1.5264079570770264, |
| "epoch": 0.31131221719457014, |
| "grad_norm": 0.6189771890640259, |
| "learning_rate": 0.00019996884273137686, |
| "loss": 0.20467980206012726, |
| "mean_token_accuracy": 0.9415531605482101, |
| "num_tokens": 763365.0, |
| "step": 86 |
| }, |
| { |
| "entropy": 1.751158595085144, |
| "epoch": 0.31493212669683257, |
| "grad_norm": 0.7291711568832397, |
| "learning_rate": 0.0001999652848769878, |
| "loss": 0.24249613285064697, |
| "mean_token_accuracy": 0.9340829253196716, |
| "num_tokens": 771408.0, |
| "step": 87 |
| }, |
| { |
| "entropy": 1.6374999582767487, |
| "epoch": 0.318552036199095, |
| "grad_norm": 0.6003106236457825, |
| "learning_rate": 0.00019996153475690623, |
| "loss": 0.3658824861049652, |
| "mean_token_accuracy": 0.918518453836441, |
| "num_tokens": 780637.0, |
| "step": 88 |
| }, |
| { |
| "entropy": 1.5862501561641693, |
| "epoch": 0.3221719457013575, |
| "grad_norm": 0.6139584183692932, |
| "learning_rate": 0.00019995759237914656, |
| "loss": 0.29145702719688416, |
| "mean_token_accuracy": 0.9236557334661484, |
| "num_tokens": 789741.0, |
| "step": 89 |
| }, |
| { |
| "entropy": 1.67588272690773, |
| "epoch": 0.3257918552036199, |
| "grad_norm": 0.5298960208892822, |
| "learning_rate": 0.0001999534577521341, |
| "loss": 0.19036075472831726, |
| "mean_token_accuracy": 0.9556817710399628, |
| "num_tokens": 798373.0, |
| "step": 90 |
| }, |
| { |
| "entropy": 1.6315190196037292, |
| "epoch": 0.32941176470588235, |
| "grad_norm": 0.5074024796485901, |
| "learning_rate": 0.00019994913088470498, |
| "loss": 0.22543872892856598, |
| "mean_token_accuracy": 0.9336265623569489, |
| "num_tokens": 807267.0, |
| "step": 91 |
| }, |
| { |
| "entropy": 1.5913198590278625, |
| "epoch": 0.3330316742081448, |
| "grad_norm": 0.7284563779830933, |
| "learning_rate": 0.00019994461178610617, |
| "loss": 0.24927280843257904, |
| "mean_token_accuracy": 0.9343689233064651, |
| "num_tokens": 816046.0, |
| "step": 92 |
| }, |
| { |
| "entropy": 1.6069969236850739, |
| "epoch": 0.33665158371040727, |
| "grad_norm": 0.7105880379676819, |
| "learning_rate": 0.00019993990046599555, |
| "loss": 0.295354962348938, |
| "mean_token_accuracy": 0.9300834238529205, |
| "num_tokens": 824773.0, |
| "step": 93 |
| }, |
| { |
| "entropy": 1.6023050248622894, |
| "epoch": 0.3402714932126697, |
| "grad_norm": 0.5128775238990784, |
| "learning_rate": 0.00019993499693444168, |
| "loss": 0.13714508712291718, |
| "mean_token_accuracy": 0.9659775942564011, |
| "num_tokens": 833307.0, |
| "step": 94 |
| }, |
| { |
| "entropy": 1.47949880361557, |
| "epoch": 0.3438914027149321, |
| "grad_norm": 0.4838848114013672, |
| "learning_rate": 0.00019992990120192393, |
| "loss": 0.1908133178949356, |
| "mean_token_accuracy": 0.9466118365526199, |
| "num_tokens": 842176.0, |
| "step": 95 |
| }, |
| { |
| "entropy": 1.5162540972232819, |
| "epoch": 0.34751131221719456, |
| "grad_norm": 0.4756297767162323, |
| "learning_rate": 0.00019992461327933252, |
| "loss": 0.14188416302204132, |
| "mean_token_accuracy": 0.9609939008951187, |
| "num_tokens": 851079.0, |
| "step": 96 |
| }, |
| { |
| "entropy": 1.5233789086341858, |
| "epoch": 0.351131221719457, |
| "grad_norm": 0.5846927762031555, |
| "learning_rate": 0.00019991913317796825, |
| "loss": 0.35901975631713867, |
| "mean_token_accuracy": 0.9176534414291382, |
| "num_tokens": 860190.0, |
| "step": 97 |
| }, |
| { |
| "entropy": 1.488583117723465, |
| "epoch": 0.3547511312217195, |
| "grad_norm": 0.5157149434089661, |
| "learning_rate": 0.00019991346090954268, |
| "loss": 0.1544593721628189, |
| "mean_token_accuracy": 0.9559026509523392, |
| "num_tokens": 869107.0, |
| "step": 98 |
| }, |
| { |
| "entropy": 1.4912174940109253, |
| "epoch": 0.3583710407239819, |
| "grad_norm": 0.6246358156204224, |
| "learning_rate": 0.00019990759648617814, |
| "loss": 0.2815958857536316, |
| "mean_token_accuracy": 0.9374883323907852, |
| "num_tokens": 877732.0, |
| "step": 99 |
| }, |
| { |
| "entropy": 1.4868848025798798, |
| "epoch": 0.36199095022624433, |
| "grad_norm": 0.502863883972168, |
| "learning_rate": 0.0001999015399204075, |
| "loss": 0.15397809445858002, |
| "mean_token_accuracy": 0.9577479511499405, |
| "num_tokens": 886494.0, |
| "step": 100 |
| }, |
| { |
| "entropy": 1.4532659351825714, |
| "epoch": 0.36561085972850677, |
| "grad_norm": 0.4709261655807495, |
| "learning_rate": 0.0001998952912251743, |
| "loss": 0.16087600588798523, |
| "mean_token_accuracy": 0.9519130736589432, |
| "num_tokens": 895455.0, |
| "step": 101 |
| }, |
| { |
| "entropy": 1.491580843925476, |
| "epoch": 0.36923076923076925, |
| "grad_norm": 0.5536823868751526, |
| "learning_rate": 0.0001998888504138327, |
| "loss": 0.16084736585617065, |
| "mean_token_accuracy": 0.960470125079155, |
| "num_tokens": 904392.0, |
| "step": 102 |
| }, |
| { |
| "entropy": 1.4420756101608276, |
| "epoch": 0.3728506787330317, |
| "grad_norm": 0.5427432060241699, |
| "learning_rate": 0.00019988221750014747, |
| "loss": 0.2450316995382309, |
| "mean_token_accuracy": 0.9366131573915482, |
| "num_tokens": 914065.0, |
| "step": 103 |
| }, |
| { |
| "entropy": 1.5350265502929688, |
| "epoch": 0.3764705882352941, |
| "grad_norm": 0.4882596731185913, |
| "learning_rate": 0.00019987539249829381, |
| "loss": 0.21313555538654327, |
| "mean_token_accuracy": 0.9590773284435272, |
| "num_tokens": 922687.0, |
| "step": 104 |
| }, |
| { |
| "entropy": 1.5769560635089874, |
| "epoch": 0.38009049773755654, |
| "grad_norm": 0.7525632977485657, |
| "learning_rate": 0.0001998683754228575, |
| "loss": 0.2643919289112091, |
| "mean_token_accuracy": 0.9193268120288849, |
| "num_tokens": 930932.0, |
| "step": 105 |
| }, |
| { |
| "entropy": 1.5345399081707, |
| "epoch": 0.38371040723981903, |
| "grad_norm": 0.6206967234611511, |
| "learning_rate": 0.00019986116628883485, |
| "loss": 0.2925248146057129, |
| "mean_token_accuracy": 0.931127279996872, |
| "num_tokens": 939846.0, |
| "step": 106 |
| }, |
| { |
| "entropy": 1.5490939021110535, |
| "epoch": 0.38733031674208146, |
| "grad_norm": 0.5089172124862671, |
| "learning_rate": 0.00019985376511163255, |
| "loss": 0.14840808510780334, |
| "mean_token_accuracy": 0.9644817113876343, |
| "num_tokens": 948368.0, |
| "step": 107 |
| }, |
| { |
| "entropy": 1.5281821191310883, |
| "epoch": 0.3909502262443439, |
| "grad_norm": 0.4952821135520935, |
| "learning_rate": 0.00019984617190706768, |
| "loss": 0.21237969398498535, |
| "mean_token_accuracy": 0.9380226731300354, |
| "num_tokens": 957250.0, |
| "step": 108 |
| }, |
| { |
| "entropy": 1.5303342044353485, |
| "epoch": 0.3945701357466063, |
| "grad_norm": 0.5427054166793823, |
| "learning_rate": 0.00019983838669136782, |
| "loss": 0.16821178793907166, |
| "mean_token_accuracy": 0.9463026076555252, |
| "num_tokens": 965749.0, |
| "step": 109 |
| }, |
| { |
| "entropy": 1.5286442935466766, |
| "epoch": 0.39819004524886875, |
| "grad_norm": 0.4814460873603821, |
| "learning_rate": 0.00019983040948117078, |
| "loss": 0.21669423580169678, |
| "mean_token_accuracy": 0.9485635608434677, |
| "num_tokens": 973995.0, |
| "step": 110 |
| }, |
| { |
| "entropy": 1.5193851590156555, |
| "epoch": 0.40180995475113124, |
| "grad_norm": 0.4849177300930023, |
| "learning_rate": 0.00019982224029352477, |
| "loss": 0.1973811537027359, |
| "mean_token_accuracy": 0.9545295536518097, |
| "num_tokens": 982514.0, |
| "step": 111 |
| }, |
| { |
| "entropy": 1.5222062468528748, |
| "epoch": 0.40542986425339367, |
| "grad_norm": 0.5772113800048828, |
| "learning_rate": 0.00019981387914588822, |
| "loss": 0.357006311416626, |
| "mean_token_accuracy": 0.9294342249631882, |
| "num_tokens": 991115.0, |
| "step": 112 |
| }, |
| { |
| "entropy": 1.4591252207756042, |
| "epoch": 0.4090497737556561, |
| "grad_norm": 0.40475624799728394, |
| "learning_rate": 0.00019980532605612985, |
| "loss": 0.17911836504936218, |
| "mean_token_accuracy": 0.958268016576767, |
| "num_tokens": 1000407.0, |
| "step": 113 |
| }, |
| { |
| "entropy": 1.5297361612319946, |
| "epoch": 0.41266968325791853, |
| "grad_norm": 0.5671453475952148, |
| "learning_rate": 0.0001997965810425285, |
| "loss": 0.14860941469669342, |
| "mean_token_accuracy": 0.9657443910837173, |
| "num_tokens": 1008486.0, |
| "step": 114 |
| }, |
| { |
| "entropy": 1.4873208403587341, |
| "epoch": 0.416289592760181, |
| "grad_norm": 0.47243061661720276, |
| "learning_rate": 0.0001997876441237733, |
| "loss": 0.27150753140449524, |
| "mean_token_accuracy": 0.9356586933135986, |
| "num_tokens": 1017408.0, |
| "step": 115 |
| }, |
| { |
| "entropy": 1.5046488046646118, |
| "epoch": 0.41990950226244345, |
| "grad_norm": 0.5545538067817688, |
| "learning_rate": 0.00019977851531896335, |
| "loss": 0.27598193287849426, |
| "mean_token_accuracy": 0.9293323308229446, |
| "num_tokens": 1026279.0, |
| "step": 116 |
| }, |
| { |
| "entropy": 1.5389924347400665, |
| "epoch": 0.4235294117647059, |
| "grad_norm": 0.5456140637397766, |
| "learning_rate": 0.00019976919464760793, |
| "loss": 0.20166276395320892, |
| "mean_token_accuracy": 0.9474701136350632, |
| "num_tokens": 1034896.0, |
| "step": 117 |
| }, |
| { |
| "entropy": 1.5261160135269165, |
| "epoch": 0.4271493212669683, |
| "grad_norm": 0.4349948763847351, |
| "learning_rate": 0.00019975968212962637, |
| "loss": 0.11712481081485748, |
| "mean_token_accuracy": 0.9618890285491943, |
| "num_tokens": 1043357.0, |
| "step": 118 |
| }, |
| { |
| "entropy": 1.4879018366336823, |
| "epoch": 0.4307692307692308, |
| "grad_norm": 0.6200428009033203, |
| "learning_rate": 0.00019974997778534793, |
| "loss": 0.2552901804447174, |
| "mean_token_accuracy": 0.931525394320488, |
| "num_tokens": 1051617.0, |
| "step": 119 |
| }, |
| { |
| "entropy": 1.495672345161438, |
| "epoch": 0.4343891402714932, |
| "grad_norm": 0.5531431436538696, |
| "learning_rate": 0.0001997400816355119, |
| "loss": 0.18440331518650055, |
| "mean_token_accuracy": 0.9566466957330704, |
| "num_tokens": 1060297.0, |
| "step": 120 |
| }, |
| { |
| "entropy": 1.468797743320465, |
| "epoch": 0.43800904977375565, |
| "grad_norm": 0.8418586254119873, |
| "learning_rate": 0.00019972999370126737, |
| "loss": 0.4476836621761322, |
| "mean_token_accuracy": 0.9020169228315353, |
| "num_tokens": 1069625.0, |
| "step": 121 |
| }, |
| { |
| "entropy": 1.5040291845798492, |
| "epoch": 0.4416289592760181, |
| "grad_norm": 0.5432603359222412, |
| "learning_rate": 0.00019971971400417342, |
| "loss": 0.23491275310516357, |
| "mean_token_accuracy": 0.938463494181633, |
| "num_tokens": 1078541.0, |
| "step": 122 |
| }, |
| { |
| "entropy": 1.507046639919281, |
| "epoch": 0.4452488687782805, |
| "grad_norm": 0.515389621257782, |
| "learning_rate": 0.00019970924256619888, |
| "loss": 0.19066768884658813, |
| "mean_token_accuracy": 0.9394121021032333, |
| "num_tokens": 1087015.0, |
| "step": 123 |
| }, |
| { |
| "entropy": 1.5334805250167847, |
| "epoch": 0.448868778280543, |
| "grad_norm": 0.6119932532310486, |
| "learning_rate": 0.00019969857940972235, |
| "loss": 0.2432323843240738, |
| "mean_token_accuracy": 0.9420388340950012, |
| "num_tokens": 1095644.0, |
| "step": 124 |
| }, |
| { |
| "entropy": 1.451358139514923, |
| "epoch": 0.45248868778280543, |
| "grad_norm": 0.513521134853363, |
| "learning_rate": 0.00019968772455753218, |
| "loss": 0.21337461471557617, |
| "mean_token_accuracy": 0.9473854750394821, |
| "num_tokens": 1104652.0, |
| "step": 125 |
| }, |
| { |
| "entropy": 1.403743416070938, |
| "epoch": 0.45610859728506786, |
| "grad_norm": 0.44188353419303894, |
| "learning_rate": 0.00019967667803282637, |
| "loss": 0.21201607584953308, |
| "mean_token_accuracy": 0.9484260380268097, |
| "num_tokens": 1113648.0, |
| "step": 126 |
| }, |
| { |
| "entropy": 1.4469349682331085, |
| "epoch": 0.4597285067873303, |
| "grad_norm": 0.5705162286758423, |
| "learning_rate": 0.00019966543985921258, |
| "loss": 0.18666209280490875, |
| "mean_token_accuracy": 0.9515744149684906, |
| "num_tokens": 1122645.0, |
| "step": 127 |
| }, |
| { |
| "entropy": 1.3682746887207031, |
| "epoch": 0.4633484162895928, |
| "grad_norm": 0.49191927909851074, |
| "learning_rate": 0.000199654010060708, |
| "loss": 0.29679813981056213, |
| "mean_token_accuracy": 0.9304109215736389, |
| "num_tokens": 1132041.0, |
| "step": 128 |
| }, |
| { |
| "entropy": 1.4161922633647919, |
| "epoch": 0.4669683257918552, |
| "grad_norm": 0.47388190031051636, |
| "learning_rate": 0.00019964238866173933, |
| "loss": 0.1531015932559967, |
| "mean_token_accuracy": 0.9585694819688797, |
| "num_tokens": 1140871.0, |
| "step": 129 |
| }, |
| { |
| "entropy": 1.3751187920570374, |
| "epoch": 0.47058823529411764, |
| "grad_norm": 0.46824195981025696, |
| "learning_rate": 0.00019963057568714288, |
| "loss": 0.17030593752861023, |
| "mean_token_accuracy": 0.9534722566604614, |
| "num_tokens": 1149706.0, |
| "step": 130 |
| }, |
| { |
| "entropy": 1.3917218148708344, |
| "epoch": 0.47420814479638007, |
| "grad_norm": 0.4428865909576416, |
| "learning_rate": 0.00019961857116216415, |
| "loss": 0.15017299354076385, |
| "mean_token_accuracy": 0.9592516124248505, |
| "num_tokens": 1158749.0, |
| "step": 131 |
| }, |
| { |
| "entropy": 1.3426124155521393, |
| "epoch": 0.47782805429864256, |
| "grad_norm": 0.49072983860969543, |
| "learning_rate": 0.00019960637511245823, |
| "loss": 0.22365210950374603, |
| "mean_token_accuracy": 0.946484237909317, |
| "num_tokens": 1167772.0, |
| "step": 132 |
| }, |
| { |
| "entropy": 1.421400249004364, |
| "epoch": 0.481447963800905, |
| "grad_norm": 0.5675694346427917, |
| "learning_rate": 0.00019959398756408937, |
| "loss": 0.18156108260154724, |
| "mean_token_accuracy": 0.9530180990695953, |
| "num_tokens": 1176196.0, |
| "step": 133 |
| }, |
| { |
| "entropy": 1.3424357175827026, |
| "epoch": 0.4850678733031674, |
| "grad_norm": 0.5297194123268127, |
| "learning_rate": 0.0001995814085435311, |
| "loss": 0.18270625174045563, |
| "mean_token_accuracy": 0.9406759738922119, |
| "num_tokens": 1185613.0, |
| "step": 134 |
| }, |
| { |
| "entropy": 1.4340205192565918, |
| "epoch": 0.48868778280542985, |
| "grad_norm": 0.5901919603347778, |
| "learning_rate": 0.00019956863807766618, |
| "loss": 0.1788717657327652, |
| "mean_token_accuracy": 0.9499698132276535, |
| "num_tokens": 1193890.0, |
| "step": 135 |
| }, |
| { |
| "entropy": 1.4506348073482513, |
| "epoch": 0.49230769230769234, |
| "grad_norm": 0.5080121159553528, |
| "learning_rate": 0.00019955567619378653, |
| "loss": 0.12078559398651123, |
| "mean_token_accuracy": 0.9667535722255707, |
| "num_tokens": 1202125.0, |
| "step": 136 |
| }, |
| { |
| "entropy": 1.382477194070816, |
| "epoch": 0.49592760180995477, |
| "grad_norm": 0.659888744354248, |
| "learning_rate": 0.00019954252291959313, |
| "loss": 0.2572862505912781, |
| "mean_token_accuracy": 0.9361128658056259, |
| "num_tokens": 1210985.0, |
| "step": 137 |
| }, |
| { |
| "entropy": 1.4523886442184448, |
| "epoch": 0.4995475113122172, |
| "grad_norm": 0.6076639890670776, |
| "learning_rate": 0.00019952917828319587, |
| "loss": 0.1389123499393463, |
| "mean_token_accuracy": 0.9607612937688828, |
| "num_tokens": 1219414.0, |
| "step": 138 |
| }, |
| { |
| "entropy": 1.4372033476829529, |
| "epoch": 0.5031674208144796, |
| "grad_norm": 0.6114508509635925, |
| "learning_rate": 0.00019951564231311382, |
| "loss": 0.16256970167160034, |
| "mean_token_accuracy": 0.9580988585948944, |
| "num_tokens": 1227819.0, |
| "step": 139 |
| }, |
| { |
| "entropy": 1.4240647554397583, |
| "epoch": 0.5067873303167421, |
| "grad_norm": 0.5607258677482605, |
| "learning_rate": 0.00019950191503827477, |
| "loss": 0.1742154359817505, |
| "mean_token_accuracy": 0.9604121297597885, |
| "num_tokens": 1236815.0, |
| "step": 140 |
| }, |
| { |
| "entropy": 1.3567279279232025, |
| "epoch": 0.5104072398190045, |
| "grad_norm": 0.49631235003471375, |
| "learning_rate": 0.00019948799648801546, |
| "loss": 0.1200169026851654, |
| "mean_token_accuracy": 0.9611728638410568, |
| "num_tokens": 1246290.0, |
| "step": 141 |
| }, |
| { |
| "entropy": 1.506151258945465, |
| "epoch": 0.5140271493212669, |
| "grad_norm": 0.9381592273712158, |
| "learning_rate": 0.0001994738866920813, |
| "loss": 0.24524807929992676, |
| "mean_token_accuracy": 0.9364699125289917, |
| "num_tokens": 1254437.0, |
| "step": 142 |
| }, |
| { |
| "entropy": 1.4939132928848267, |
| "epoch": 0.5176470588235295, |
| "grad_norm": 0.4674718976020813, |
| "learning_rate": 0.00019945958568062656, |
| "loss": 0.22789186239242554, |
| "mean_token_accuracy": 0.9441357254981995, |
| "num_tokens": 1263451.0, |
| "step": 143 |
| }, |
| { |
| "entropy": 1.501807302236557, |
| "epoch": 0.5212669683257919, |
| "grad_norm": 0.5704030394554138, |
| "learning_rate": 0.00019944509348421394, |
| "loss": 0.18432818353176117, |
| "mean_token_accuracy": 0.953639954328537, |
| "num_tokens": 1272204.0, |
| "step": 144 |
| }, |
| { |
| "entropy": 1.4492950141429901, |
| "epoch": 0.5248868778280543, |
| "grad_norm": 0.47559797763824463, |
| "learning_rate": 0.000199430410133815, |
| "loss": 0.147740438580513, |
| "mean_token_accuracy": 0.9607143253087997, |
| "num_tokens": 1281403.0, |
| "step": 145 |
| }, |
| { |
| "entropy": 1.4073392152786255, |
| "epoch": 0.5285067873303168, |
| "grad_norm": 0.6094549298286438, |
| "learning_rate": 0.00019941553566080956, |
| "loss": 0.2326284945011139, |
| "mean_token_accuracy": 0.9408996403217316, |
| "num_tokens": 1290379.0, |
| "step": 146 |
| }, |
| { |
| "entropy": 1.507258117198944, |
| "epoch": 0.5321266968325792, |
| "grad_norm": 0.6971456408500671, |
| "learning_rate": 0.00019940047009698605, |
| "loss": 0.2862337529659271, |
| "mean_token_accuracy": 0.9211678206920624, |
| "num_tokens": 1298939.0, |
| "step": 147 |
| }, |
| { |
| "entropy": 1.4335385262966156, |
| "epoch": 0.5357466063348416, |
| "grad_norm": 0.5851684808731079, |
| "learning_rate": 0.00019938521347454127, |
| "loss": 0.29994359612464905, |
| "mean_token_accuracy": 0.9208105802536011, |
| "num_tokens": 1307833.0, |
| "step": 148 |
| }, |
| { |
| "entropy": 1.489009529352188, |
| "epoch": 0.539366515837104, |
| "grad_norm": 0.7157009840011597, |
| "learning_rate": 0.00019936976582608023, |
| "loss": 0.33578288555145264, |
| "mean_token_accuracy": 0.9373409301042557, |
| "num_tokens": 1316684.0, |
| "step": 149 |
| }, |
| { |
| "entropy": 1.4718311429023743, |
| "epoch": 0.5429864253393665, |
| "grad_norm": 0.5213670134544373, |
| "learning_rate": 0.00019935412718461625, |
| "loss": 0.13324445486068726, |
| "mean_token_accuracy": 0.9562166035175323, |
| "num_tokens": 1325251.0, |
| "step": 150 |
| }, |
| { |
| "entropy": 1.4213023483753204, |
| "epoch": 0.5466063348416289, |
| "grad_norm": 0.5317531824111938, |
| "learning_rate": 0.0001993382975835709, |
| "loss": 0.15537622570991516, |
| "mean_token_accuracy": 0.964701920747757, |
| "num_tokens": 1333742.0, |
| "step": 151 |
| }, |
| { |
| "entropy": 1.3925495147705078, |
| "epoch": 0.5502262443438914, |
| "grad_norm": 0.4998919665813446, |
| "learning_rate": 0.00019932227705677372, |
| "loss": 0.1906745433807373, |
| "mean_token_accuracy": 0.9361033141613007, |
| "num_tokens": 1342670.0, |
| "step": 152 |
| }, |
| { |
| "entropy": 1.3675826787948608, |
| "epoch": 0.5538461538461539, |
| "grad_norm": 0.3895116448402405, |
| "learning_rate": 0.00019930606563846234, |
| "loss": 0.14962267875671387, |
| "mean_token_accuracy": 0.9600752294063568, |
| "num_tokens": 1351651.0, |
| "step": 153 |
| }, |
| { |
| "entropy": 1.2704340815544128, |
| "epoch": 0.5574660633484163, |
| "grad_norm": 0.43442079424858093, |
| "learning_rate": 0.0001992896633632823, |
| "loss": 0.1176116019487381, |
| "mean_token_accuracy": 0.9697213470935822, |
| "num_tokens": 1360957.0, |
| "step": 154 |
| }, |
| { |
| "entropy": 1.349033147096634, |
| "epoch": 0.5610859728506787, |
| "grad_norm": 0.6015726923942566, |
| "learning_rate": 0.00019927307026628715, |
| "loss": 0.17969676852226257, |
| "mean_token_accuracy": 0.9535713642835617, |
| "num_tokens": 1369448.0, |
| "step": 155 |
| }, |
| { |
| "entropy": 1.3353968262672424, |
| "epoch": 0.5647058823529412, |
| "grad_norm": 0.5272573828697205, |
| "learning_rate": 0.00019925628638293815, |
| "loss": 0.21488747000694275, |
| "mean_token_accuracy": 0.942707359790802, |
| "num_tokens": 1378342.0, |
| "step": 156 |
| }, |
| { |
| "entropy": 1.3764784634113312, |
| "epoch": 0.5683257918552036, |
| "grad_norm": 0.7961441278457642, |
| "learning_rate": 0.00019923931174910421, |
| "loss": 0.30012083053588867, |
| "mean_token_accuracy": 0.9248047173023224, |
| "num_tokens": 1386635.0, |
| "step": 157 |
| }, |
| { |
| "entropy": 1.4050418436527252, |
| "epoch": 0.571945701357466, |
| "grad_norm": 0.6271554231643677, |
| "learning_rate": 0.00019922214640106207, |
| "loss": 0.184654101729393, |
| "mean_token_accuracy": 0.9567630439996719, |
| "num_tokens": 1394801.0, |
| "step": 158 |
| }, |
| { |
| "entropy": 1.3368881344795227, |
| "epoch": 0.5755656108597285, |
| "grad_norm": 0.5737248063087463, |
| "learning_rate": 0.00019920479037549595, |
| "loss": 0.2726176977157593, |
| "mean_token_accuracy": 0.9449877142906189, |
| "num_tokens": 1403634.0, |
| "step": 159 |
| }, |
| { |
| "entropy": 1.3764293193817139, |
| "epoch": 0.579185520361991, |
| "grad_norm": 0.6995347142219543, |
| "learning_rate": 0.00019918724370949754, |
| "loss": 0.2793852686882019, |
| "mean_token_accuracy": 0.9334268867969513, |
| "num_tokens": 1412184.0, |
| "step": 160 |
| }, |
| { |
| "entropy": 1.3516182601451874, |
| "epoch": 0.5828054298642534, |
| "grad_norm": 0.6867108941078186, |
| "learning_rate": 0.00019916950644056607, |
| "loss": 0.23619085550308228, |
| "mean_token_accuracy": 0.9341517090797424, |
| "num_tokens": 1421258.0, |
| "step": 161 |
| }, |
| { |
| "entropy": 1.3733141720294952, |
| "epoch": 0.5864253393665159, |
| "grad_norm": 0.6309821009635925, |
| "learning_rate": 0.00019915157860660797, |
| "loss": 0.27165842056274414, |
| "mean_token_accuracy": 0.9349307119846344, |
| "num_tokens": 1430376.0, |
| "step": 162 |
| }, |
| { |
| "entropy": 1.421423226594925, |
| "epoch": 0.5900452488687783, |
| "grad_norm": 0.512278139591217, |
| "learning_rate": 0.000199133460245937, |
| "loss": 0.09708991646766663, |
| "mean_token_accuracy": 0.9740542620420456, |
| "num_tokens": 1439210.0, |
| "step": 163 |
| }, |
| { |
| "entropy": 1.4290501475334167, |
| "epoch": 0.5936651583710407, |
| "grad_norm": 0.54677814245224, |
| "learning_rate": 0.0001991151513972741, |
| "loss": 0.1681460738182068, |
| "mean_token_accuracy": 0.9526286870241165, |
| "num_tokens": 1447909.0, |
| "step": 164 |
| }, |
| { |
| "entropy": 1.4586975574493408, |
| "epoch": 0.5972850678733032, |
| "grad_norm": 0.5319063067436218, |
| "learning_rate": 0.00019909665209974723, |
| "loss": 0.27126502990722656, |
| "mean_token_accuracy": 0.9449646919965744, |
| "num_tokens": 1456865.0, |
| "step": 165 |
| }, |
| { |
| "entropy": 1.291123479604721, |
| "epoch": 0.6009049773755656, |
| "grad_norm": 0.41675442457199097, |
| "learning_rate": 0.00019907796239289154, |
| "loss": 0.14705216884613037, |
| "mean_token_accuracy": 0.957685187458992, |
| "num_tokens": 1466501.0, |
| "step": 166 |
| }, |
| { |
| "entropy": 1.4526861608028412, |
| "epoch": 0.604524886877828, |
| "grad_norm": 0.6517218351364136, |
| "learning_rate": 0.0001990590823166489, |
| "loss": 0.18804579973220825, |
| "mean_token_accuracy": 0.95452880859375, |
| "num_tokens": 1475217.0, |
| "step": 167 |
| }, |
| { |
| "entropy": 1.4182425439357758, |
| "epoch": 0.6081447963800904, |
| "grad_norm": 0.44628605246543884, |
| "learning_rate": 0.0001990400119113681, |
| "loss": 0.16652102768421173, |
| "mean_token_accuracy": 0.9495290070772171, |
| "num_tokens": 1484272.0, |
| "step": 168 |
| }, |
| { |
| "entropy": 1.395858347415924, |
| "epoch": 0.611764705882353, |
| "grad_norm": 0.5252248048782349, |
| "learning_rate": 0.00019902075121780473, |
| "loss": 0.204986572265625, |
| "mean_token_accuracy": 0.9546073526144028, |
| "num_tokens": 1493197.0, |
| "step": 169 |
| }, |
| { |
| "entropy": 1.437942624092102, |
| "epoch": 0.6153846153846154, |
| "grad_norm": 0.574350893497467, |
| "learning_rate": 0.00019900130027712099, |
| "loss": 0.2608497142791748, |
| "mean_token_accuracy": 0.9362770020961761, |
| "num_tokens": 1502113.0, |
| "step": 170 |
| }, |
| { |
| "entropy": 1.4258457124233246, |
| "epoch": 0.6190045248868778, |
| "grad_norm": 0.5768272876739502, |
| "learning_rate": 0.00019898165913088568, |
| "loss": 0.2692157030105591, |
| "mean_token_accuracy": 0.9251071363687515, |
| "num_tokens": 1510830.0, |
| "step": 171 |
| }, |
| { |
| "entropy": 1.4638590514659882, |
| "epoch": 0.6226244343891403, |
| "grad_norm": 0.5094544291496277, |
| "learning_rate": 0.00019896182782107408, |
| "loss": 0.11549721658229828, |
| "mean_token_accuracy": 0.969596341252327, |
| "num_tokens": 1519258.0, |
| "step": 172 |
| }, |
| { |
| "entropy": 1.4395278096199036, |
| "epoch": 0.6262443438914027, |
| "grad_norm": 0.5421784520149231, |
| "learning_rate": 0.00019894180639006787, |
| "loss": 0.20283782482147217, |
| "mean_token_accuracy": 0.9391360431909561, |
| "num_tokens": 1527863.0, |
| "step": 173 |
| }, |
| { |
| "entropy": 1.3791865408420563, |
| "epoch": 0.6298642533936651, |
| "grad_norm": 0.5409896373748779, |
| "learning_rate": 0.00019892159488065506, |
| "loss": 0.1997358500957489, |
| "mean_token_accuracy": 0.9525162279605865, |
| "num_tokens": 1536957.0, |
| "step": 174 |
| }, |
| { |
| "entropy": 1.4018727838993073, |
| "epoch": 0.6334841628959276, |
| "grad_norm": 0.6673674583435059, |
| "learning_rate": 0.00019890119333602988, |
| "loss": 0.1960916519165039, |
| "mean_token_accuracy": 0.9513887315988541, |
| "num_tokens": 1545655.0, |
| "step": 175 |
| }, |
| { |
| "entropy": 1.45829838514328, |
| "epoch": 0.63710407239819, |
| "grad_norm": 0.4443865716457367, |
| "learning_rate": 0.00019888060179979266, |
| "loss": 0.19763894379138947, |
| "mean_token_accuracy": 0.9526630192995071, |
| "num_tokens": 1554210.0, |
| "step": 176 |
| }, |
| { |
| "entropy": 1.5121627151966095, |
| "epoch": 0.6407239819004525, |
| "grad_norm": 0.5975261926651001, |
| "learning_rate": 0.00019885982031594973, |
| "loss": 0.21303991973400116, |
| "mean_token_accuracy": 0.9420748949050903, |
| "num_tokens": 1562730.0, |
| "step": 177 |
| }, |
| { |
| "entropy": 1.375084936618805, |
| "epoch": 0.644343891402715, |
| "grad_norm": 0.457962304353714, |
| "learning_rate": 0.00019883884892891348, |
| "loss": 0.1408441662788391, |
| "mean_token_accuracy": 0.967100590467453, |
| "num_tokens": 1571700.0, |
| "step": 178 |
| }, |
| { |
| "entropy": 1.541441649198532, |
| "epoch": 0.6479638009049774, |
| "grad_norm": 0.6157453060150146, |
| "learning_rate": 0.000198817687683502, |
| "loss": 0.0904114842414856, |
| "mean_token_accuracy": 0.9715709984302521, |
| "num_tokens": 1579817.0, |
| "step": 179 |
| }, |
| { |
| "entropy": 1.5142415463924408, |
| "epoch": 0.6515837104072398, |
| "grad_norm": 0.8100699186325073, |
| "learning_rate": 0.0001987963366249392, |
| "loss": 0.398171067237854, |
| "mean_token_accuracy": 0.9328930824995041, |
| "num_tokens": 1588732.0, |
| "step": 180 |
| }, |
| { |
| "entropy": 1.4462586343288422, |
| "epoch": 0.6552036199095023, |
| "grad_norm": 0.44163593649864197, |
| "learning_rate": 0.0001987747957988547, |
| "loss": 0.12847575545310974, |
| "mean_token_accuracy": 0.961458757519722, |
| "num_tokens": 1598066.0, |
| "step": 181 |
| }, |
| { |
| "entropy": 1.4784432351589203, |
| "epoch": 0.6588235294117647, |
| "grad_norm": 0.6655211448669434, |
| "learning_rate": 0.00019875306525128354, |
| "loss": 0.22915330529212952, |
| "mean_token_accuracy": 0.9353652149438858, |
| "num_tokens": 1606976.0, |
| "step": 182 |
| }, |
| { |
| "entropy": 1.5425707399845123, |
| "epoch": 0.6624434389140271, |
| "grad_norm": 0.537034273147583, |
| "learning_rate": 0.00019873114502866633, |
| "loss": 0.13760660588741302, |
| "mean_token_accuracy": 0.9617000967264175, |
| "num_tokens": 1615552.0, |
| "step": 183 |
| }, |
| { |
| "entropy": 1.317031353712082, |
| "epoch": 0.6660633484162896, |
| "grad_norm": 0.4945172667503357, |
| "learning_rate": 0.00019870903517784898, |
| "loss": 0.22668133676052094, |
| "mean_token_accuracy": 0.935705840587616, |
| "num_tokens": 1625671.0, |
| "step": 184 |
| }, |
| { |
| "entropy": 1.4837210774421692, |
| "epoch": 0.669683257918552, |
| "grad_norm": 0.4989132285118103, |
| "learning_rate": 0.00019868673574608266, |
| "loss": 0.1490660309791565, |
| "mean_token_accuracy": 0.9522654563188553, |
| "num_tokens": 1634216.0, |
| "step": 185 |
| }, |
| { |
| "entropy": 1.5495317876338959, |
| "epoch": 0.6733031674208145, |
| "grad_norm": 0.8123180270195007, |
| "learning_rate": 0.0001986642467810237, |
| "loss": 0.17341700196266174, |
| "mean_token_accuracy": 0.9526006877422333, |
| "num_tokens": 1642342.0, |
| "step": 186 |
| }, |
| { |
| "entropy": 1.4706730842590332, |
| "epoch": 0.676923076923077, |
| "grad_norm": 0.7836541533470154, |
| "learning_rate": 0.00019864156833073352, |
| "loss": 0.2447955161333084, |
| "mean_token_accuracy": 0.9472227543592453, |
| "num_tokens": 1650719.0, |
| "step": 187 |
| }, |
| { |
| "entropy": 1.436569631099701, |
| "epoch": 0.6805429864253394, |
| "grad_norm": 0.4677812159061432, |
| "learning_rate": 0.00019861870044367844, |
| "loss": 0.11487612128257751, |
| "mean_token_accuracy": 0.9670159816741943, |
| "num_tokens": 1659875.0, |
| "step": 188 |
| }, |
| { |
| "entropy": 1.5288162529468536, |
| "epoch": 0.6841628959276018, |
| "grad_norm": 0.5686696171760559, |
| "learning_rate": 0.0001985956431687296, |
| "loss": 0.16770240664482117, |
| "mean_token_accuracy": 0.9642325341701508, |
| "num_tokens": 1668729.0, |
| "step": 189 |
| }, |
| { |
| "entropy": 1.4523391425609589, |
| "epoch": 0.6877828054298643, |
| "grad_norm": 0.7280884981155396, |
| "learning_rate": 0.00019857239655516302, |
| "loss": 0.3095542788505554, |
| "mean_token_accuracy": 0.9423863142728806, |
| "num_tokens": 1677528.0, |
| "step": 190 |
| }, |
| { |
| "entropy": 1.454990178346634, |
| "epoch": 0.6914027149321267, |
| "grad_norm": 0.4571734070777893, |
| "learning_rate": 0.0001985489606526592, |
| "loss": 0.09302312880754471, |
| "mean_token_accuracy": 0.9739308208227158, |
| "num_tokens": 1686546.0, |
| "step": 191 |
| }, |
| { |
| "entropy": 1.3993627727031708, |
| "epoch": 0.6950226244343891, |
| "grad_norm": 0.42863190174102783, |
| "learning_rate": 0.00019852533551130324, |
| "loss": 0.17399480938911438, |
| "mean_token_accuracy": 0.9535701423883438, |
| "num_tokens": 1695918.0, |
| "step": 192 |
| }, |
| { |
| "entropy": 1.4340824484825134, |
| "epoch": 0.6986425339366515, |
| "grad_norm": 0.4958471357822418, |
| "learning_rate": 0.00019850152118158472, |
| "loss": 0.3563914895057678, |
| "mean_token_accuracy": 0.9297028332948685, |
| "num_tokens": 1705739.0, |
| "step": 193 |
| }, |
| { |
| "entropy": 1.5259247124195099, |
| "epoch": 0.702262443438914, |
| "grad_norm": 0.6084112524986267, |
| "learning_rate": 0.00019847751771439738, |
| "loss": 0.1985940933227539, |
| "mean_token_accuracy": 0.9563599675893784, |
| "num_tokens": 1714371.0, |
| "step": 194 |
| }, |
| { |
| "entropy": 1.5039476454257965, |
| "epoch": 0.7058823529411765, |
| "grad_norm": 0.49471357464790344, |
| "learning_rate": 0.00019845332516103933, |
| "loss": 0.10389607399702072, |
| "mean_token_accuracy": 0.9768806099891663, |
| "num_tokens": 1722649.0, |
| "step": 195 |
| }, |
| { |
| "entropy": 1.4451472461223602, |
| "epoch": 0.709502262443439, |
| "grad_norm": 0.46219033002853394, |
| "learning_rate": 0.0001984289435732127, |
| "loss": 0.13712066411972046, |
| "mean_token_accuracy": 0.9715917259454727, |
| "num_tokens": 1731453.0, |
| "step": 196 |
| }, |
| { |
| "entropy": 1.4479835629463196, |
| "epoch": 0.7131221719457014, |
| "grad_norm": 0.4487977921962738, |
| "learning_rate": 0.00019840437300302366, |
| "loss": 0.08234203606843948, |
| "mean_token_accuracy": 0.9709072560071945, |
| "num_tokens": 1740339.0, |
| "step": 197 |
| }, |
| { |
| "entropy": 1.4245754480361938, |
| "epoch": 0.7167420814479638, |
| "grad_norm": 0.5051844120025635, |
| "learning_rate": 0.00019837961350298213, |
| "loss": 0.12205886840820312, |
| "mean_token_accuracy": 0.9654085338115692, |
| "num_tokens": 1748827.0, |
| "step": 198 |
| }, |
| { |
| "entropy": 1.5265796482563019, |
| "epoch": 0.7203619909502262, |
| "grad_norm": 0.7977287173271179, |
| "learning_rate": 0.00019835466512600197, |
| "loss": 0.23942159116268158, |
| "mean_token_accuracy": 0.9452069103717804, |
| "num_tokens": 1757208.0, |
| "step": 199 |
| }, |
| { |
| "entropy": 1.4634148180484772, |
| "epoch": 0.7239819004524887, |
| "grad_norm": 0.4664369523525238, |
| "learning_rate": 0.00019832952792540054, |
| "loss": 0.15900103747844696, |
| "mean_token_accuracy": 0.9659069031476974, |
| "num_tokens": 1765951.0, |
| "step": 200 |
| }, |
| { |
| "entropy": 1.4476028680801392, |
| "epoch": 0.7276018099547511, |
| "grad_norm": 0.6050535440444946, |
| "learning_rate": 0.00019830420195489877, |
| "loss": 0.17117048799991608, |
| "mean_token_accuracy": 0.9614047408103943, |
| "num_tokens": 1775046.0, |
| "step": 201 |
| }, |
| { |
| "entropy": 1.5342890918254852, |
| "epoch": 0.7312217194570135, |
| "grad_norm": 0.6432852745056152, |
| "learning_rate": 0.00019827868726862117, |
| "loss": 0.31473690271377563, |
| "mean_token_accuracy": 0.9222957193851471, |
| "num_tokens": 1783798.0, |
| "step": 202 |
| }, |
| { |
| "entropy": 1.533927708864212, |
| "epoch": 0.7348416289592761, |
| "grad_norm": 0.4874444305896759, |
| "learning_rate": 0.00019825298392109529, |
| "loss": 0.11646515130996704, |
| "mean_token_accuracy": 0.970250278711319, |
| "num_tokens": 1792421.0, |
| "step": 203 |
| }, |
| { |
| "entropy": 1.4147436618804932, |
| "epoch": 0.7384615384615385, |
| "grad_norm": 0.4096081256866455, |
| "learning_rate": 0.00019822709196725208, |
| "loss": 0.15057605504989624, |
| "mean_token_accuracy": 0.9578173905611038, |
| "num_tokens": 1801310.0, |
| "step": 204 |
| }, |
| { |
| "entropy": 1.447014480829239, |
| "epoch": 0.7420814479638009, |
| "grad_norm": 0.4614224135875702, |
| "learning_rate": 0.00019820101146242547, |
| "loss": 0.08990304172039032, |
| "mean_token_accuracy": 0.9700856953859329, |
| "num_tokens": 1810156.0, |
| "step": 205 |
| }, |
| { |
| "entropy": 1.4257702827453613, |
| "epoch": 0.7457013574660634, |
| "grad_norm": 0.7065203189849854, |
| "learning_rate": 0.00019817474246235233, |
| "loss": 0.23199987411499023, |
| "mean_token_accuracy": 0.9359505921602249, |
| "num_tokens": 1819254.0, |
| "step": 206 |
| }, |
| { |
| "entropy": 1.5035441517829895, |
| "epoch": 0.7493212669683258, |
| "grad_norm": 0.6417229771614075, |
| "learning_rate": 0.00019814828502317245, |
| "loss": 0.2971632480621338, |
| "mean_token_accuracy": 0.9346631169319153, |
| "num_tokens": 1828186.0, |
| "step": 207 |
| }, |
| { |
| "entropy": 1.451407551765442, |
| "epoch": 0.7529411764705882, |
| "grad_norm": 0.3759056031703949, |
| "learning_rate": 0.00019812163920142827, |
| "loss": 0.09616382420063019, |
| "mean_token_accuracy": 0.9760568290948868, |
| "num_tokens": 1837148.0, |
| "step": 208 |
| }, |
| { |
| "entropy": 1.5474788844585419, |
| "epoch": 0.7565610859728507, |
| "grad_norm": 0.5765470266342163, |
| "learning_rate": 0.0001980948050540648, |
| "loss": 0.17177847027778625, |
| "mean_token_accuracy": 0.9533937126398087, |
| "num_tokens": 1845935.0, |
| "step": 209 |
| }, |
| { |
| "entropy": 1.513023853302002, |
| "epoch": 0.7601809954751131, |
| "grad_norm": 0.42471620440483093, |
| "learning_rate": 0.00019806778263842964, |
| "loss": 0.13085290789604187, |
| "mean_token_accuracy": 0.9693024903535843, |
| "num_tokens": 1854982.0, |
| "step": 210 |
| }, |
| { |
| "entropy": 1.611119568347931, |
| "epoch": 0.7638009049773755, |
| "grad_norm": 0.4796192944049835, |
| "learning_rate": 0.00019804057201227259, |
| "loss": 0.10031422972679138, |
| "mean_token_accuracy": 0.9697081595659256, |
| "num_tokens": 1863315.0, |
| "step": 211 |
| }, |
| { |
| "entropy": 1.6045884191989899, |
| "epoch": 0.7674208144796381, |
| "grad_norm": 0.6628240346908569, |
| "learning_rate": 0.00019801317323374574, |
| "loss": 0.2653411030769348, |
| "mean_token_accuracy": 0.9474365711212158, |
| "num_tokens": 1871544.0, |
| "step": 212 |
| }, |
| { |
| "entropy": 1.4857927858829498, |
| "epoch": 0.7710407239819005, |
| "grad_norm": 0.5168606638908386, |
| "learning_rate": 0.00019798558636140333, |
| "loss": 0.11389698088169098, |
| "mean_token_accuracy": 0.9681493788957596, |
| "num_tokens": 1880335.0, |
| "step": 213 |
| }, |
| { |
| "entropy": 1.5443885922431946, |
| "epoch": 0.7746606334841629, |
| "grad_norm": 0.5839455723762512, |
| "learning_rate": 0.00019795781145420148, |
| "loss": 0.1918099969625473, |
| "mean_token_accuracy": 0.9581391960382462, |
| "num_tokens": 1889345.0, |
| "step": 214 |
| }, |
| { |
| "entropy": 1.456317961215973, |
| "epoch": 0.7782805429864253, |
| "grad_norm": 0.44960126280784607, |
| "learning_rate": 0.00019792984857149826, |
| "loss": 0.12002551555633545, |
| "mean_token_accuracy": 0.9649894386529922, |
| "num_tokens": 1898319.0, |
| "step": 215 |
| }, |
| { |
| "entropy": 1.4415479898452759, |
| "epoch": 0.7819004524886878, |
| "grad_norm": 0.4050678014755249, |
| "learning_rate": 0.00019790169777305345, |
| "loss": 0.11734722554683685, |
| "mean_token_accuracy": 0.9726845920085907, |
| "num_tokens": 1907445.0, |
| "step": 216 |
| }, |
| { |
| "entropy": 1.469126135110855, |
| "epoch": 0.7855203619909502, |
| "grad_norm": 0.6080633997917175, |
| "learning_rate": 0.00019787335911902835, |
| "loss": 0.23930203914642334, |
| "mean_token_accuracy": 0.9382640719413757, |
| "num_tokens": 1916139.0, |
| "step": 217 |
| }, |
| { |
| "entropy": 1.3760231733322144, |
| "epoch": 0.7891402714932126, |
| "grad_norm": 0.4775363802909851, |
| "learning_rate": 0.00019784483266998575, |
| "loss": 0.2287708967924118, |
| "mean_token_accuracy": 0.9554623812437057, |
| "num_tokens": 1925201.0, |
| "step": 218 |
| }, |
| { |
| "entropy": 1.4956690967082977, |
| "epoch": 0.7927601809954751, |
| "grad_norm": 0.7189075946807861, |
| "learning_rate": 0.0001978161184868899, |
| "loss": 0.29633429646492004, |
| "mean_token_accuracy": 0.9378493428230286, |
| "num_tokens": 1933751.0, |
| "step": 219 |
| }, |
| { |
| "entropy": 1.4381499290466309, |
| "epoch": 0.7963800904977375, |
| "grad_norm": 0.6960969567298889, |
| "learning_rate": 0.00019778721663110603, |
| "loss": 0.257027268409729, |
| "mean_token_accuracy": 0.9326806962490082, |
| "num_tokens": 1942630.0, |
| "step": 220 |
| }, |
| { |
| "entropy": 1.419518768787384, |
| "epoch": 0.8, |
| "grad_norm": 0.5286183953285217, |
| "learning_rate": 0.00019775812716440073, |
| "loss": 0.13132816553115845, |
| "mean_token_accuracy": 0.9626937806606293, |
| "num_tokens": 1951779.0, |
| "step": 221 |
| }, |
| { |
| "entropy": 1.4531921744346619, |
| "epoch": 0.8036199095022625, |
| "grad_norm": 0.5929961204528809, |
| "learning_rate": 0.00019772885014894125, |
| "loss": 0.2502361536026001, |
| "mean_token_accuracy": 0.9397906512022018, |
| "num_tokens": 1960796.0, |
| "step": 222 |
| }, |
| { |
| "entropy": 1.5002046525478363, |
| "epoch": 0.8072398190045249, |
| "grad_norm": 0.5530325770378113, |
| "learning_rate": 0.00019769938564729585, |
| "loss": 0.1540164351463318, |
| "mean_token_accuracy": 0.9645767956972122, |
| "num_tokens": 1969357.0, |
| "step": 223 |
| }, |
| { |
| "entropy": 1.4433454275131226, |
| "epoch": 0.8108597285067873, |
| "grad_norm": 0.598328709602356, |
| "learning_rate": 0.00019766973372243343, |
| "loss": 0.25215908885002136, |
| "mean_token_accuracy": 0.9557305723428726, |
| "num_tokens": 1978387.0, |
| "step": 224 |
| }, |
| { |
| "entropy": 1.4278393983840942, |
| "epoch": 0.8144796380090498, |
| "grad_norm": 0.7414979934692383, |
| "learning_rate": 0.00019763989443772337, |
| "loss": 0.2587546408176422, |
| "mean_token_accuracy": 0.9320106357336044, |
| "num_tokens": 1987317.0, |
| "step": 225 |
| }, |
| { |
| "entropy": 1.4586597084999084, |
| "epoch": 0.8180995475113122, |
| "grad_norm": 0.623528778553009, |
| "learning_rate": 0.0001976098678569355, |
| "loss": 0.21999062597751617, |
| "mean_token_accuracy": 0.9393687099218369, |
| "num_tokens": 1996024.0, |
| "step": 226 |
| }, |
| { |
| "entropy": 1.487585186958313, |
| "epoch": 0.8217194570135746, |
| "grad_norm": 0.6269425749778748, |
| "learning_rate": 0.00019757965404423994, |
| "loss": 0.2157050371170044, |
| "mean_token_accuracy": 0.9460557103157043, |
| "num_tokens": 2005075.0, |
| "step": 227 |
| }, |
| { |
| "entropy": 1.4444681107997894, |
| "epoch": 0.8253393665158371, |
| "grad_norm": 0.4378167390823364, |
| "learning_rate": 0.0001975492530642069, |
| "loss": 0.20458024740219116, |
| "mean_token_accuracy": 0.9438146203756332, |
| "num_tokens": 2014220.0, |
| "step": 228 |
| }, |
| { |
| "entropy": 1.4960754811763763, |
| "epoch": 0.8289592760180996, |
| "grad_norm": 0.4860404133796692, |
| "learning_rate": 0.0001975186649818066, |
| "loss": 0.28230106830596924, |
| "mean_token_accuracy": 0.9395165890455246, |
| "num_tokens": 2023338.0, |
| "step": 229 |
| }, |
| { |
| "entropy": 1.5679333209991455, |
| "epoch": 0.832579185520362, |
| "grad_norm": 0.655274510383606, |
| "learning_rate": 0.00019748788986240917, |
| "loss": 0.1854293942451477, |
| "mean_token_accuracy": 0.9481187909841537, |
| "num_tokens": 2031605.0, |
| "step": 230 |
| }, |
| { |
| "entropy": 1.4240575730800629, |
| "epoch": 0.8361990950226245, |
| "grad_norm": 0.3618476986885071, |
| "learning_rate": 0.0001974569277717844, |
| "loss": 0.09264940023422241, |
| "mean_token_accuracy": 0.975529670715332, |
| "num_tokens": 2041057.0, |
| "step": 231 |
| }, |
| { |
| "entropy": 1.5365903079509735, |
| "epoch": 0.8398190045248869, |
| "grad_norm": 0.4857717454433441, |
| "learning_rate": 0.00019742577877610173, |
| "loss": 0.3051563799381256, |
| "mean_token_accuracy": 0.9376842081546783, |
| "num_tokens": 2049959.0, |
| "step": 232 |
| }, |
| { |
| "entropy": 1.383328765630722, |
| "epoch": 0.8434389140271493, |
| "grad_norm": 0.45292651653289795, |
| "learning_rate": 0.0001973944429419299, |
| "loss": 0.2122434377670288, |
| "mean_token_accuracy": 0.9505620300769806, |
| "num_tokens": 2059255.0, |
| "step": 233 |
| }, |
| { |
| "entropy": 1.445828378200531, |
| "epoch": 0.8470588235294118, |
| "grad_norm": 0.41845881938934326, |
| "learning_rate": 0.00019736292033623704, |
| "loss": 0.11589747667312622, |
| "mean_token_accuracy": 0.9683210551738739, |
| "num_tokens": 2068297.0, |
| "step": 234 |
| }, |
| { |
| "entropy": 1.4795459508895874, |
| "epoch": 0.8506787330316742, |
| "grad_norm": 0.4432578384876251, |
| "learning_rate": 0.00019733121102639048, |
| "loss": 0.07866068929433823, |
| "mean_token_accuracy": 0.9755797684192657, |
| "num_tokens": 2076844.0, |
| "step": 235 |
| }, |
| { |
| "entropy": 1.5486867129802704, |
| "epoch": 0.8542986425339366, |
| "grad_norm": 0.5824456810951233, |
| "learning_rate": 0.00019729931508015647, |
| "loss": 0.23570555448532104, |
| "mean_token_accuracy": 0.943296879529953, |
| "num_tokens": 2085155.0, |
| "step": 236 |
| }, |
| { |
| "entropy": 1.4806998372077942, |
| "epoch": 0.857918552036199, |
| "grad_norm": 0.5646211504936218, |
| "learning_rate": 0.0001972672325657001, |
| "loss": 0.20387002825737, |
| "mean_token_accuracy": 0.9514354765415192, |
| "num_tokens": 2093967.0, |
| "step": 237 |
| }, |
| { |
| "entropy": 1.4301677942276, |
| "epoch": 0.8615384615384616, |
| "grad_norm": 0.5802773833274841, |
| "learning_rate": 0.0001972349635515853, |
| "loss": 0.2552819550037384, |
| "mean_token_accuracy": 0.922490268945694, |
| "num_tokens": 2103028.0, |
| "step": 238 |
| }, |
| { |
| "entropy": 1.324641764163971, |
| "epoch": 0.865158371040724, |
| "grad_norm": 0.4743529260158539, |
| "learning_rate": 0.00019720250810677446, |
| "loss": 0.12536188960075378, |
| "mean_token_accuracy": 0.9665613174438477, |
| "num_tokens": 2111936.0, |
| "step": 239 |
| }, |
| { |
| "entropy": 1.397893875837326, |
| "epoch": 0.8687782805429864, |
| "grad_norm": 0.555899441242218, |
| "learning_rate": 0.00019716986630062842, |
| "loss": 0.12725675106048584, |
| "mean_token_accuracy": 0.9541808664798737, |
| "num_tokens": 2120622.0, |
| "step": 240 |
| }, |
| { |
| "entropy": 1.3342379927635193, |
| "epoch": 0.8723981900452489, |
| "grad_norm": 0.552416205406189, |
| "learning_rate": 0.00019713703820290634, |
| "loss": 0.16678127646446228, |
| "mean_token_accuracy": 0.9486428052186966, |
| "num_tokens": 2129761.0, |
| "step": 241 |
| }, |
| { |
| "entropy": 1.3923524022102356, |
| "epoch": 0.8760180995475113, |
| "grad_norm": 0.5822156667709351, |
| "learning_rate": 0.00019710402388376544, |
| "loss": 0.1675427258014679, |
| "mean_token_accuracy": 0.957242414355278, |
| "num_tokens": 2138328.0, |
| "step": 242 |
| }, |
| { |
| "entropy": 1.3071589767932892, |
| "epoch": 0.8796380090497737, |
| "grad_norm": 0.4842599928379059, |
| "learning_rate": 0.00019707082341376093, |
| "loss": 0.10306215286254883, |
| "mean_token_accuracy": 0.9641513824462891, |
| "num_tokens": 2147288.0, |
| "step": 243 |
| }, |
| { |
| "entropy": 1.327076405286789, |
| "epoch": 0.8832579185520362, |
| "grad_norm": 0.5377116799354553, |
| "learning_rate": 0.0001970374368638459, |
| "loss": 0.11235228180885315, |
| "mean_token_accuracy": 0.9719918966293335, |
| "num_tokens": 2156064.0, |
| "step": 244 |
| }, |
| { |
| "entropy": 1.3401291370391846, |
| "epoch": 0.8868778280542986, |
| "grad_norm": 0.5172064900398254, |
| "learning_rate": 0.00019700386430537105, |
| "loss": 0.1520514041185379, |
| "mean_token_accuracy": 0.9551143050193787, |
| "num_tokens": 2165166.0, |
| "step": 245 |
| }, |
| { |
| "entropy": 1.335503101348877, |
| "epoch": 0.890497737556561, |
| "grad_norm": 0.551672101020813, |
| "learning_rate": 0.00019697010581008463, |
| "loss": 0.1884230524301529, |
| "mean_token_accuracy": 0.9497657865285873, |
| "num_tokens": 2174115.0, |
| "step": 246 |
| }, |
| { |
| "entropy": 1.3267870545387268, |
| "epoch": 0.8941176470588236, |
| "grad_norm": 0.4992049038410187, |
| "learning_rate": 0.00019693616145013227, |
| "loss": 0.18525123596191406, |
| "mean_token_accuracy": 0.9568396955728531, |
| "num_tokens": 2183228.0, |
| "step": 247 |
| }, |
| { |
| "entropy": 1.405649095773697, |
| "epoch": 0.897737556561086, |
| "grad_norm": 1.0682220458984375, |
| "learning_rate": 0.00019690203129805672, |
| "loss": 0.379827618598938, |
| "mean_token_accuracy": 0.8877183198928833, |
| "num_tokens": 2192055.0, |
| "step": 248 |
| }, |
| { |
| "entropy": 1.373319834470749, |
| "epoch": 0.9013574660633484, |
| "grad_norm": 0.44303223490715027, |
| "learning_rate": 0.00019686771542679797, |
| "loss": 0.09994952380657196, |
| "mean_token_accuracy": 0.9754335582256317, |
| "num_tokens": 2201010.0, |
| "step": 249 |
| }, |
| { |
| "entropy": 1.4375985264778137, |
| "epoch": 0.9049773755656109, |
| "grad_norm": 0.5711826682090759, |
| "learning_rate": 0.0001968332139096927, |
| "loss": 0.1751662790775299, |
| "mean_token_accuracy": 0.9558616876602173, |
| "num_tokens": 2209782.0, |
| "step": 250 |
| }, |
| { |
| "entropy": 1.393396943807602, |
| "epoch": 0.9085972850678733, |
| "grad_norm": 0.4692879915237427, |
| "learning_rate": 0.00019679852682047457, |
| "loss": 0.1593962460756302, |
| "mean_token_accuracy": 0.9541475772857666, |
| "num_tokens": 2218923.0, |
| "step": 251 |
| }, |
| { |
| "entropy": 1.3871179819107056, |
| "epoch": 0.9122171945701357, |
| "grad_norm": 0.4700246751308441, |
| "learning_rate": 0.0001967636542332736, |
| "loss": 0.11389590799808502, |
| "mean_token_accuracy": 0.971915066242218, |
| "num_tokens": 2227564.0, |
| "step": 252 |
| }, |
| { |
| "entropy": 1.3677232265472412, |
| "epoch": 0.9158371040723982, |
| "grad_norm": 0.3912360668182373, |
| "learning_rate": 0.00019672859622261633, |
| "loss": 0.09452933073043823, |
| "mean_token_accuracy": 0.9733647406101227, |
| "num_tokens": 2236836.0, |
| "step": 253 |
| }, |
| { |
| "entropy": 1.3464795053005219, |
| "epoch": 0.9194570135746606, |
| "grad_norm": 0.44805341958999634, |
| "learning_rate": 0.0001966933528634256, |
| "loss": 0.15307973325252533, |
| "mean_token_accuracy": 0.9646724760532379, |
| "num_tokens": 2245881.0, |
| "step": 254 |
| }, |
| { |
| "entropy": 1.3869231641292572, |
| "epoch": 0.9230769230769231, |
| "grad_norm": 0.4598497450351715, |
| "learning_rate": 0.00019665792423102037, |
| "loss": 0.13864350318908691, |
| "mean_token_accuracy": 0.963247537612915, |
| "num_tokens": 2254736.0, |
| "step": 255 |
| }, |
| { |
| "entropy": 1.385448396205902, |
| "epoch": 0.9266968325791856, |
| "grad_norm": 0.5225904583930969, |
| "learning_rate": 0.0001966223104011155, |
| "loss": 0.1266992688179016, |
| "mean_token_accuracy": 0.9631596356630325, |
| "num_tokens": 2263315.0, |
| "step": 256 |
| }, |
| { |
| "entropy": 1.4815338253974915, |
| "epoch": 0.930316742081448, |
| "grad_norm": 0.6528983116149902, |
| "learning_rate": 0.00019658651144982163, |
| "loss": 0.2216826230287552, |
| "mean_token_accuracy": 0.9490031599998474, |
| "num_tokens": 2271832.0, |
| "step": 257 |
| }, |
| { |
| "entropy": 1.4158953726291656, |
| "epoch": 0.9339366515837104, |
| "grad_norm": 0.45583924651145935, |
| "learning_rate": 0.00019655052745364509, |
| "loss": 0.13053885102272034, |
| "mean_token_accuracy": 0.9635764211416245, |
| "num_tokens": 2280582.0, |
| "step": 258 |
| }, |
| { |
| "entropy": 1.3917952477931976, |
| "epoch": 0.9375565610859729, |
| "grad_norm": 0.4816400110721588, |
| "learning_rate": 0.00019651435848948762, |
| "loss": 0.13310927152633667, |
| "mean_token_accuracy": 0.9665548801422119, |
| "num_tokens": 2289239.0, |
| "step": 259 |
| }, |
| { |
| "entropy": 1.335536628961563, |
| "epoch": 0.9411764705882353, |
| "grad_norm": 0.46578988432884216, |
| "learning_rate": 0.00019647800463464622, |
| "loss": 0.17411091923713684, |
| "mean_token_accuracy": 0.9487465471029282, |
| "num_tokens": 2298569.0, |
| "step": 260 |
| }, |
| { |
| "entropy": 1.3734354674816132, |
| "epoch": 0.9447963800904977, |
| "grad_norm": 0.4758208990097046, |
| "learning_rate": 0.00019644146596681312, |
| "loss": 0.13356219232082367, |
| "mean_token_accuracy": 0.966251790523529, |
| "num_tokens": 2307333.0, |
| "step": 261 |
| }, |
| { |
| "entropy": 1.3474647402763367, |
| "epoch": 0.9484162895927601, |
| "grad_norm": 0.30391982197761536, |
| "learning_rate": 0.00019640474256407545, |
| "loss": 0.07434239238500595, |
| "mean_token_accuracy": 0.9785039573907852, |
| "num_tokens": 2315917.0, |
| "step": 262 |
| }, |
| { |
| "entropy": 1.327022761106491, |
| "epoch": 0.9520361990950226, |
| "grad_norm": 0.6123433113098145, |
| "learning_rate": 0.00019636783450491517, |
| "loss": 0.37289372086524963, |
| "mean_token_accuracy": 0.9232099652290344, |
| "num_tokens": 2325041.0, |
| "step": 263 |
| }, |
| { |
| "entropy": 1.2676138877868652, |
| "epoch": 0.9556561085972851, |
| "grad_norm": 0.6251657605171204, |
| "learning_rate": 0.00019633074186820886, |
| "loss": 0.2077867090702057, |
| "mean_token_accuracy": 0.9404798150062561, |
| "num_tokens": 2334552.0, |
| "step": 264 |
| }, |
| { |
| "entropy": 1.3420342803001404, |
| "epoch": 0.9592760180995475, |
| "grad_norm": 0.44629064202308655, |
| "learning_rate": 0.0001962934647332275, |
| "loss": 0.10463929176330566, |
| "mean_token_accuracy": 0.9691900908946991, |
| "num_tokens": 2343361.0, |
| "step": 265 |
| }, |
| { |
| "entropy": 1.449256420135498, |
| "epoch": 0.96289592760181, |
| "grad_norm": 0.9838017821311951, |
| "learning_rate": 0.0001962560031796365, |
| "loss": 0.2925741970539093, |
| "mean_token_accuracy": 0.9219896346330643, |
| "num_tokens": 2351556.0, |
| "step": 266 |
| }, |
| { |
| "entropy": 1.2965570390224457, |
| "epoch": 0.9665158371040724, |
| "grad_norm": 0.5719688534736633, |
| "learning_rate": 0.00019621835728749525, |
| "loss": 0.23820579051971436, |
| "mean_token_accuracy": 0.9379686415195465, |
| "num_tokens": 2361033.0, |
| "step": 267 |
| }, |
| { |
| "entropy": 1.3180812895298004, |
| "epoch": 0.9701357466063348, |
| "grad_norm": 0.4512738883495331, |
| "learning_rate": 0.0001961805271372572, |
| "loss": 0.15501753985881805, |
| "mean_token_accuracy": 0.9587176293134689, |
| "num_tokens": 2369972.0, |
| "step": 268 |
| }, |
| { |
| "entropy": 1.360179752111435, |
| "epoch": 0.9737556561085973, |
| "grad_norm": 0.39895451068878174, |
| "learning_rate": 0.00019614251280976948, |
| "loss": 0.08832871168851852, |
| "mean_token_accuracy": 0.9742900878190994, |
| "num_tokens": 2378872.0, |
| "step": 269 |
| }, |
| { |
| "entropy": 1.3346007466316223, |
| "epoch": 0.9773755656108597, |
| "grad_norm": 0.4796980619430542, |
| "learning_rate": 0.00019610431438627296, |
| "loss": 0.1033824160695076, |
| "mean_token_accuracy": 0.9715064615011215, |
| "num_tokens": 2387688.0, |
| "step": 270 |
| }, |
| { |
| "entropy": 1.301027923822403, |
| "epoch": 0.9809954751131221, |
| "grad_norm": 0.4513569176197052, |
| "learning_rate": 0.00019606593194840177, |
| "loss": 0.18686418235301971, |
| "mean_token_accuracy": 0.9559367001056671, |
| "num_tokens": 2397053.0, |
| "step": 271 |
| }, |
| { |
| "entropy": 1.3664647042751312, |
| "epoch": 0.9846153846153847, |
| "grad_norm": 0.5319916009902954, |
| "learning_rate": 0.0001960273655781835, |
| "loss": 0.24958105385303497, |
| "mean_token_accuracy": 0.9426600635051727, |
| "num_tokens": 2405763.0, |
| "step": 272 |
| }, |
| { |
| "entropy": 1.3043864369392395, |
| "epoch": 0.9882352941176471, |
| "grad_norm": 0.44749879837036133, |
| "learning_rate": 0.00019598861535803863, |
| "loss": 0.15809005498886108, |
| "mean_token_accuracy": 0.9583301842212677, |
| "num_tokens": 2414884.0, |
| "step": 273 |
| }, |
| { |
| "entropy": 1.3713374137878418, |
| "epoch": 0.9918552036199095, |
| "grad_norm": 0.5084534883499146, |
| "learning_rate": 0.00019594968137078068, |
| "loss": 0.17388306558132172, |
| "mean_token_accuracy": 0.9520856887102127, |
| "num_tokens": 2423715.0, |
| "step": 274 |
| }, |
| { |
| "entropy": 1.365702897310257, |
| "epoch": 0.995475113122172, |
| "grad_norm": 0.459522008895874, |
| "learning_rate": 0.00019591056369961586, |
| "loss": 0.33452439308166504, |
| "mean_token_accuracy": 0.9534527063369751, |
| "num_tokens": 2432458.0, |
| "step": 275 |
| }, |
| { |
| "entropy": 1.3280319273471832, |
| "epoch": 0.9990950226244344, |
| "grad_norm": 0.4350273311138153, |
| "learning_rate": 0.00019587126242814288, |
| "loss": 0.12349647283554077, |
| "mean_token_accuracy": 0.9627684652805328, |
| "num_tokens": 2441126.0, |
| "step": 276 |
| }, |
| { |
| "entropy": 1.6572558879852295, |
| "epoch": 1.0, |
| "grad_norm": 0.9549095630645752, |
| "learning_rate": 0.00019583177764035295, |
| "loss": 0.028879065066576004, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 2441725.0, |
| "step": 277 |
| }, |
| { |
| "epoch": 1.0, |
| "eval_entropy": 1.3299247259046973, |
| "eval_loss": 0.16865810751914978, |
| "eval_mean_token_accuracy": 0.9547278832613937, |
| "eval_num_tokens": 2441725.0, |
| "eval_runtime": 31.7656, |
| "eval_samples_per_second": 11.616, |
| "eval_steps_per_second": 3.872, |
| "step": 277 |
| }, |
| { |
| "entropy": 1.3716817200183868, |
| "epoch": 1.0036199095022624, |
| "grad_norm": 0.6422934532165527, |
| "learning_rate": 0.00019579210942062932, |
| "loss": 0.1597217321395874, |
| "mean_token_accuracy": 0.955962136387825, |
| "num_tokens": 2450437.0, |
| "step": 278 |
| }, |
| { |
| "entropy": 1.2969892621040344, |
| "epoch": 1.0072398190045249, |
| "grad_norm": 0.44608697295188904, |
| "learning_rate": 0.0001957522578537474, |
| "loss": 0.09895047545433044, |
| "mean_token_accuracy": 0.9709436744451523, |
| "num_tokens": 2459078.0, |
| "step": 279 |
| }, |
| { |
| "entropy": 1.2229366302490234, |
| "epoch": 1.0108597285067873, |
| "grad_norm": 0.4623957872390747, |
| "learning_rate": 0.0001957122230248743, |
| "loss": 0.16620564460754395, |
| "mean_token_accuracy": 0.9499163031578064, |
| "num_tokens": 2468073.0, |
| "step": 280 |
| }, |
| { |
| "entropy": 1.33988156914711, |
| "epoch": 1.0144796380090497, |
| "grad_norm": 0.6455283761024475, |
| "learning_rate": 0.0001956720050195689, |
| "loss": 0.1705092489719391, |
| "mean_token_accuracy": 0.9635123610496521, |
| "num_tokens": 2476886.0, |
| "step": 281 |
| }, |
| { |
| "entropy": 1.2605039477348328, |
| "epoch": 1.0180995475113122, |
| "grad_norm": 0.4512750804424286, |
| "learning_rate": 0.00019563160392378144, |
| "loss": 0.30783483386039734, |
| "mean_token_accuracy": 0.932560071349144, |
| "num_tokens": 2486249.0, |
| "step": 282 |
| }, |
| { |
| "entropy": 1.2216798067092896, |
| "epoch": 1.0217194570135746, |
| "grad_norm": 0.45792317390441895, |
| "learning_rate": 0.00019559101982385356, |
| "loss": 0.1141764372587204, |
| "mean_token_accuracy": 0.9699619710445404, |
| "num_tokens": 2495163.0, |
| "step": 283 |
| }, |
| { |
| "entropy": 1.258674830198288, |
| "epoch": 1.025339366515837, |
| "grad_norm": 0.48922327160835266, |
| "learning_rate": 0.00019555025280651786, |
| "loss": 0.11167445778846741, |
| "mean_token_accuracy": 0.9687153398990631, |
| "num_tokens": 2504066.0, |
| "step": 284 |
| }, |
| { |
| "entropy": 1.3179872334003448, |
| "epoch": 1.0289592760180994, |
| "grad_norm": 0.6329558491706848, |
| "learning_rate": 0.00019550930295889803, |
| "loss": 0.17174683511257172, |
| "mean_token_accuracy": 0.9596489369869232, |
| "num_tokens": 2512418.0, |
| "step": 285 |
| }, |
| { |
| "entropy": 1.3070985078811646, |
| "epoch": 1.032579185520362, |
| "grad_norm": 0.6164458990097046, |
| "learning_rate": 0.00019546817036850827, |
| "loss": 0.15053772926330566, |
| "mean_token_accuracy": 0.9639337956905365, |
| "num_tokens": 2520822.0, |
| "step": 286 |
| }, |
| { |
| "entropy": 1.1846771538257599, |
| "epoch": 1.0361990950226245, |
| "grad_norm": 0.49578550457954407, |
| "learning_rate": 0.00019542685512325357, |
| "loss": 0.1119435578584671, |
| "mean_token_accuracy": 0.9712934345006943, |
| "num_tokens": 2530019.0, |
| "step": 287 |
| }, |
| { |
| "entropy": 1.1750589311122894, |
| "epoch": 1.039819004524887, |
| "grad_norm": 0.3983454406261444, |
| "learning_rate": 0.00019538535731142907, |
| "loss": 0.09416541457176208, |
| "mean_token_accuracy": 0.9704293310642242, |
| "num_tokens": 2539456.0, |
| "step": 288 |
| }, |
| { |
| "entropy": 1.3071076273918152, |
| "epoch": 1.0434389140271494, |
| "grad_norm": 0.6677536368370056, |
| "learning_rate": 0.00019534367702172016, |
| "loss": 0.19954326748847961, |
| "mean_token_accuracy": 0.9511717855930328, |
| "num_tokens": 2548133.0, |
| "step": 289 |
| }, |
| { |
| "entropy": 1.3091352581977844, |
| "epoch": 1.0470588235294118, |
| "grad_norm": 0.5290597081184387, |
| "learning_rate": 0.00019530181434320224, |
| "loss": 0.09285785257816315, |
| "mean_token_accuracy": 0.9735703021287918, |
| "num_tokens": 2556747.0, |
| "step": 290 |
| }, |
| { |
| "entropy": 1.1955563724040985, |
| "epoch": 1.0506787330316743, |
| "grad_norm": 0.5170581340789795, |
| "learning_rate": 0.00019525976936534035, |
| "loss": 0.1392112374305725, |
| "mean_token_accuracy": 0.9660876840353012, |
| "num_tokens": 2566078.0, |
| "step": 291 |
| }, |
| { |
| "entropy": 1.2591860592365265, |
| "epoch": 1.0542986425339367, |
| "grad_norm": 0.6322119832038879, |
| "learning_rate": 0.00019521754217798935, |
| "loss": 0.2537181079387665, |
| "mean_token_accuracy": 0.9380859881639481, |
| "num_tokens": 2575652.0, |
| "step": 292 |
| }, |
| { |
| "entropy": 1.3299800157546997, |
| "epoch": 1.0579185520361991, |
| "grad_norm": 0.6024647355079651, |
| "learning_rate": 0.00019517513287139326, |
| "loss": 0.1248047798871994, |
| "mean_token_accuracy": 0.9594511985778809, |
| "num_tokens": 2584405.0, |
| "step": 293 |
| }, |
| { |
| "entropy": 1.3162773251533508, |
| "epoch": 1.0615384615384615, |
| "grad_norm": 0.6070277690887451, |
| "learning_rate": 0.0001951325415361855, |
| "loss": 0.14759968221187592, |
| "mean_token_accuracy": 0.9557203203439713, |
| "num_tokens": 2593314.0, |
| "step": 294 |
| }, |
| { |
| "entropy": 1.2592947483062744, |
| "epoch": 1.065158371040724, |
| "grad_norm": 0.44067755341529846, |
| "learning_rate": 0.00019508976826338844, |
| "loss": 0.131802037358284, |
| "mean_token_accuracy": 0.9598903208971024, |
| "num_tokens": 2602597.0, |
| "step": 295 |
| }, |
| { |
| "entropy": 1.2933553457260132, |
| "epoch": 1.0687782805429864, |
| "grad_norm": 0.5667601227760315, |
| "learning_rate": 0.00019504681314441323, |
| "loss": 0.15577419102191925, |
| "mean_token_accuracy": 0.9549229890108109, |
| "num_tokens": 2611891.0, |
| "step": 296 |
| }, |
| { |
| "entropy": 1.3254594206809998, |
| "epoch": 1.0723981900452488, |
| "grad_norm": 0.519059956073761, |
| "learning_rate": 0.00019500367627105965, |
| "loss": 0.14545762538909912, |
| "mean_token_accuracy": 0.9637808352708817, |
| "num_tokens": 2620889.0, |
| "step": 297 |
| }, |
| { |
| "entropy": 1.277051329612732, |
| "epoch": 1.0760180995475113, |
| "grad_norm": 0.4660187065601349, |
| "learning_rate": 0.00019496035773551592, |
| "loss": 0.14717882871627808, |
| "mean_token_accuracy": 0.9652319550514221, |
| "num_tokens": 2629885.0, |
| "step": 298 |
| }, |
| { |
| "entropy": 1.3058906197547913, |
| "epoch": 1.0796380090497737, |
| "grad_norm": 0.8524317145347595, |
| "learning_rate": 0.0001949168576303586, |
| "loss": 0.10511646419763565, |
| "mean_token_accuracy": 0.9672305583953857, |
| "num_tokens": 2638639.0, |
| "step": 299 |
| }, |
| { |
| "entropy": 1.3437564969062805, |
| "epoch": 1.0832579185520361, |
| "grad_norm": 0.9568637609481812, |
| "learning_rate": 0.00019487317604855212, |
| "loss": 0.12141256034374237, |
| "mean_token_accuracy": 0.9637749493122101, |
| "num_tokens": 2647779.0, |
| "step": 300 |
| }, |
| { |
| "entropy": 1.3359644711017609, |
| "epoch": 1.0868778280542986, |
| "grad_norm": 0.6103286147117615, |
| "learning_rate": 0.00019482931308344888, |
| "loss": 0.14975383877754211, |
| "mean_token_accuracy": 0.9517766237258911, |
| "num_tokens": 2656565.0, |
| "step": 301 |
| }, |
| { |
| "entropy": 1.2883181869983673, |
| "epoch": 1.090497737556561, |
| "grad_norm": 0.4035210609436035, |
| "learning_rate": 0.00019478526882878876, |
| "loss": 0.10880422592163086, |
| "mean_token_accuracy": 0.9710263162851334, |
| "num_tokens": 2665569.0, |
| "step": 302 |
| }, |
| { |
| "entropy": 1.357493907213211, |
| "epoch": 1.0941176470588236, |
| "grad_norm": 0.5569011569023132, |
| "learning_rate": 0.00019474104337869924, |
| "loss": 0.13409318029880524, |
| "mean_token_accuracy": 0.9593861550092697, |
| "num_tokens": 2674397.0, |
| "step": 303 |
| }, |
| { |
| "entropy": 1.3459482192993164, |
| "epoch": 1.097737556561086, |
| "grad_norm": 0.6249450445175171, |
| "learning_rate": 0.00019469663682769491, |
| "loss": 0.19079425930976868, |
| "mean_token_accuracy": 0.9542694389820099, |
| "num_tokens": 2683208.0, |
| "step": 304 |
| }, |
| { |
| "entropy": 1.3437992334365845, |
| "epoch": 1.1013574660633485, |
| "grad_norm": 0.5010721683502197, |
| "learning_rate": 0.00019465204927067754, |
| "loss": 0.14577272534370422, |
| "mean_token_accuracy": 0.9553558230400085, |
| "num_tokens": 2691945.0, |
| "step": 305 |
| }, |
| { |
| "entropy": 1.42454132437706, |
| "epoch": 1.104977375565611, |
| "grad_norm": 0.5698776841163635, |
| "learning_rate": 0.0001946072808029355, |
| "loss": 0.15152013301849365, |
| "mean_token_accuracy": 0.9586130678653717, |
| "num_tokens": 2700595.0, |
| "step": 306 |
| }, |
| { |
| "entropy": 1.3169154226779938, |
| "epoch": 1.1085972850678734, |
| "grad_norm": 0.5620861649513245, |
| "learning_rate": 0.00019456233152014406, |
| "loss": 0.20957821607589722, |
| "mean_token_accuracy": 0.9405840635299683, |
| "num_tokens": 2709771.0, |
| "step": 307 |
| }, |
| { |
| "entropy": 1.358991116285324, |
| "epoch": 1.1122171945701358, |
| "grad_norm": 0.46596547961235046, |
| "learning_rate": 0.00019451720151836467, |
| "loss": 0.12322796881198883, |
| "mean_token_accuracy": 0.9694350957870483, |
| "num_tokens": 2718386.0, |
| "step": 308 |
| }, |
| { |
| "entropy": 1.3823304772377014, |
| "epoch": 1.1158371040723982, |
| "grad_norm": 0.4822905659675598, |
| "learning_rate": 0.00019447189089404513, |
| "loss": 0.08683277666568756, |
| "mean_token_accuracy": 0.9738442450761795, |
| "num_tokens": 2726918.0, |
| "step": 309 |
| }, |
| { |
| "entropy": 1.2822128236293793, |
| "epoch": 1.1194570135746607, |
| "grad_norm": 0.5359209775924683, |
| "learning_rate": 0.00019442639974401923, |
| "loss": 0.14018404483795166, |
| "mean_token_accuracy": 0.9630445092916489, |
| "num_tokens": 2736277.0, |
| "step": 310 |
| }, |
| { |
| "entropy": 1.2816834151744843, |
| "epoch": 1.123076923076923, |
| "grad_norm": 0.4495919644832611, |
| "learning_rate": 0.00019438072816550654, |
| "loss": 0.13849881291389465, |
| "mean_token_accuracy": 0.9659072607755661, |
| "num_tokens": 2745710.0, |
| "step": 311 |
| }, |
| { |
| "entropy": 1.308707356452942, |
| "epoch": 1.1266968325791855, |
| "grad_norm": 0.404366135597229, |
| "learning_rate": 0.0001943348762561123, |
| "loss": 0.09485723078250885, |
| "mean_token_accuracy": 0.9704456329345703, |
| "num_tokens": 2754571.0, |
| "step": 312 |
| }, |
| { |
| "entropy": 1.2276718020439148, |
| "epoch": 1.130316742081448, |
| "grad_norm": 0.524721086025238, |
| "learning_rate": 0.00019428884411382694, |
| "loss": 0.15793752670288086, |
| "mean_token_accuracy": 0.9559158235788345, |
| "num_tokens": 2763888.0, |
| "step": 313 |
| }, |
| { |
| "entropy": 1.1902028918266296, |
| "epoch": 1.1339366515837104, |
| "grad_norm": 0.41495031118392944, |
| "learning_rate": 0.00019424263183702634, |
| "loss": 0.08333931863307953, |
| "mean_token_accuracy": 0.9717884361743927, |
| "num_tokens": 2773451.0, |
| "step": 314 |
| }, |
| { |
| "entropy": 1.3228943943977356, |
| "epoch": 1.1375565610859728, |
| "grad_norm": 0.5118943452835083, |
| "learning_rate": 0.00019419623952447113, |
| "loss": 0.13395828008651733, |
| "mean_token_accuracy": 0.9631195217370987, |
| "num_tokens": 2782201.0, |
| "step": 315 |
| }, |
| { |
| "entropy": 1.2930387556552887, |
| "epoch": 1.1411764705882352, |
| "grad_norm": 0.5813370943069458, |
| "learning_rate": 0.0001941496672753068, |
| "loss": 0.20286405086517334, |
| "mean_token_accuracy": 0.9531850218772888, |
| "num_tokens": 2791064.0, |
| "step": 316 |
| }, |
| { |
| "entropy": 1.2918854355812073, |
| "epoch": 1.1447963800904977, |
| "grad_norm": 0.5122251510620117, |
| "learning_rate": 0.00019410291518906337, |
| "loss": 0.13441404700279236, |
| "mean_token_accuracy": 0.9620678126811981, |
| "num_tokens": 2799899.0, |
| "step": 317 |
| }, |
| { |
| "entropy": 1.2272875010967255, |
| "epoch": 1.14841628959276, |
| "grad_norm": 0.4150262475013733, |
| "learning_rate": 0.00019405598336565518, |
| "loss": 0.11041846871376038, |
| "mean_token_accuracy": 0.9675246626138687, |
| "num_tokens": 2809282.0, |
| "step": 318 |
| }, |
| { |
| "entropy": 1.2951529920101166, |
| "epoch": 1.1520361990950225, |
| "grad_norm": 0.6175352334976196, |
| "learning_rate": 0.00019400887190538068, |
| "loss": 0.23986774682998657, |
| "mean_token_accuracy": 0.9405841678380966, |
| "num_tokens": 2818065.0, |
| "step": 319 |
| }, |
| { |
| "entropy": 1.279006689786911, |
| "epoch": 1.155656108597285, |
| "grad_norm": 0.5972070693969727, |
| "learning_rate": 0.00019396158090892224, |
| "loss": 0.11880911141633987, |
| "mean_token_accuracy": 0.9688185602426529, |
| "num_tokens": 2827164.0, |
| "step": 320 |
| }, |
| { |
| "entropy": 1.3515214622020721, |
| "epoch": 1.1592760180995474, |
| "grad_norm": 0.6211175322532654, |
| "learning_rate": 0.00019391411047734589, |
| "loss": 0.20557641983032227, |
| "mean_token_accuracy": 0.9422554075717926, |
| "num_tokens": 2835820.0, |
| "step": 321 |
| }, |
| { |
| "entropy": 1.3550400137901306, |
| "epoch": 1.16289592760181, |
| "grad_norm": 0.5280090570449829, |
| "learning_rate": 0.00019386646071210118, |
| "loss": 0.10726026445627213, |
| "mean_token_accuracy": 0.9667237550020218, |
| "num_tokens": 2844372.0, |
| "step": 322 |
| }, |
| { |
| "entropy": 1.2510974407196045, |
| "epoch": 1.1665158371040725, |
| "grad_norm": 0.5300607085227966, |
| "learning_rate": 0.00019381863171502088, |
| "loss": 0.11015061289072037, |
| "mean_token_accuracy": 0.9701625555753708, |
| "num_tokens": 2853765.0, |
| "step": 323 |
| }, |
| { |
| "entropy": 1.3709427416324615, |
| "epoch": 1.170135746606335, |
| "grad_norm": 0.5143060088157654, |
| "learning_rate": 0.00019377062358832083, |
| "loss": 0.1023775190114975, |
| "mean_token_accuracy": 0.9671410173177719, |
| "num_tokens": 2862380.0, |
| "step": 324 |
| }, |
| { |
| "entropy": 1.3767890334129333, |
| "epoch": 1.1737556561085973, |
| "grad_norm": 0.5431631207466125, |
| "learning_rate": 0.00019372243643459963, |
| "loss": 0.10558684170246124, |
| "mean_token_accuracy": 0.9684659391641617, |
| "num_tokens": 2870789.0, |
| "step": 325 |
| }, |
| { |
| "entropy": 1.2412969470024109, |
| "epoch": 1.1773755656108598, |
| "grad_norm": 0.4398118257522583, |
| "learning_rate": 0.0001936740703568386, |
| "loss": 0.08340749889612198, |
| "mean_token_accuracy": 0.9579537361860275, |
| "num_tokens": 2880113.0, |
| "step": 326 |
| }, |
| { |
| "entropy": 1.3048627376556396, |
| "epoch": 1.1809954751131222, |
| "grad_norm": 0.5652127861976624, |
| "learning_rate": 0.00019362552545840121, |
| "loss": 0.10193085670471191, |
| "mean_token_accuracy": 0.9732934236526489, |
| "num_tokens": 2888677.0, |
| "step": 327 |
| }, |
| { |
| "entropy": 1.251385748386383, |
| "epoch": 1.1846153846153846, |
| "grad_norm": 1.2118943929672241, |
| "learning_rate": 0.00019357680184303334, |
| "loss": 0.11773515492677689, |
| "mean_token_accuracy": 0.9713618904352188, |
| "num_tokens": 2897670.0, |
| "step": 328 |
| }, |
| { |
| "entropy": 1.2210056483745575, |
| "epoch": 1.188235294117647, |
| "grad_norm": 0.566527247428894, |
| "learning_rate": 0.00019352789961486273, |
| "loss": 0.13454154133796692, |
| "mean_token_accuracy": 0.9604692161083221, |
| "num_tokens": 2906641.0, |
| "step": 329 |
| }, |
| { |
| "entropy": 1.2078846395015717, |
| "epoch": 1.1918552036199095, |
| "grad_norm": 0.5194998979568481, |
| "learning_rate": 0.00019347881887839878, |
| "loss": 0.12406279891729355, |
| "mean_token_accuracy": 0.9622378796339035, |
| "num_tokens": 2915822.0, |
| "step": 330 |
| }, |
| { |
| "entropy": 1.1988820135593414, |
| "epoch": 1.195475113122172, |
| "grad_norm": 0.6065554618835449, |
| "learning_rate": 0.00019342955973853236, |
| "loss": 0.13872164487838745, |
| "mean_token_accuracy": 0.9626883864402771, |
| "num_tokens": 2924800.0, |
| "step": 331 |
| }, |
| { |
| "entropy": 1.2309212684631348, |
| "epoch": 1.1990950226244343, |
| "grad_norm": 0.4229549765586853, |
| "learning_rate": 0.00019338012230053574, |
| "loss": 0.0660879835486412, |
| "mean_token_accuracy": 0.9759407639503479, |
| "num_tokens": 2933258.0, |
| "step": 332 |
| }, |
| { |
| "entropy": 1.2222792506217957, |
| "epoch": 1.2027149321266968, |
| "grad_norm": 0.3530382215976715, |
| "learning_rate": 0.00019333050667006213, |
| "loss": 0.049285903573036194, |
| "mean_token_accuracy": 0.9843859821557999, |
| "num_tokens": 2942300.0, |
| "step": 333 |
| }, |
| { |
| "entropy": 1.2958929240703583, |
| "epoch": 1.2063348416289592, |
| "grad_norm": 0.7826951742172241, |
| "learning_rate": 0.00019328071295314557, |
| "loss": 0.1304435431957245, |
| "mean_token_accuracy": 0.9591120481491089, |
| "num_tokens": 2950767.0, |
| "step": 334 |
| }, |
| { |
| "entropy": 1.117813378572464, |
| "epoch": 1.2099547511312216, |
| "grad_norm": 0.4954948425292969, |
| "learning_rate": 0.0001932307412562007, |
| "loss": 0.13508693873882294, |
| "mean_token_accuracy": 0.9585950672626495, |
| "num_tokens": 2960646.0, |
| "step": 335 |
| }, |
| { |
| "entropy": 1.1835031807422638, |
| "epoch": 1.213574660633484, |
| "grad_norm": 0.5961520671844482, |
| "learning_rate": 0.00019318059168602251, |
| "loss": 0.13114790618419647, |
| "mean_token_accuracy": 0.9647198617458344, |
| "num_tokens": 2969505.0, |
| "step": 336 |
| }, |
| { |
| "entropy": 1.2350926995277405, |
| "epoch": 1.2171945701357467, |
| "grad_norm": 0.639654815196991, |
| "learning_rate": 0.0001931302643497862, |
| "loss": 0.14303456246852875, |
| "mean_token_accuracy": 0.9530725330114365, |
| "num_tokens": 2978152.0, |
| "step": 337 |
| }, |
| { |
| "entropy": 1.2022943198680878, |
| "epoch": 1.2208144796380092, |
| "grad_norm": 0.481646865606308, |
| "learning_rate": 0.00019307975935504672, |
| "loss": 0.08106391131877899, |
| "mean_token_accuracy": 0.9747956246137619, |
| "num_tokens": 2987130.0, |
| "step": 338 |
| }, |
| { |
| "entropy": 1.1898784339427948, |
| "epoch": 1.2244343891402716, |
| "grad_norm": 0.6450216770172119, |
| "learning_rate": 0.00019302907680973888, |
| "loss": 0.13768108189105988, |
| "mean_token_accuracy": 0.9617143720388412, |
| "num_tokens": 2996486.0, |
| "step": 339 |
| }, |
| { |
| "entropy": 1.2522628903388977, |
| "epoch": 1.228054298642534, |
| "grad_norm": 0.6536288857460022, |
| "learning_rate": 0.00019297821682217676, |
| "loss": 0.10431766510009766, |
| "mean_token_accuracy": 0.9742023795843124, |
| "num_tokens": 3005405.0, |
| "step": 340 |
| }, |
| { |
| "entropy": 1.227498173713684, |
| "epoch": 1.2316742081447964, |
| "grad_norm": 0.43904492259025574, |
| "learning_rate": 0.00019292717950105382, |
| "loss": 0.08097510784864426, |
| "mean_token_accuracy": 0.9721501767635345, |
| "num_tokens": 3014074.0, |
| "step": 341 |
| }, |
| { |
| "entropy": 1.2638218104839325, |
| "epoch": 1.2352941176470589, |
| "grad_norm": 0.43028151988983154, |
| "learning_rate": 0.00019287596495544233, |
| "loss": 0.08344768732786179, |
| "mean_token_accuracy": 0.9769876450300217, |
| "num_tokens": 3022652.0, |
| "step": 342 |
| }, |
| { |
| "entropy": 1.3337022960186005, |
| "epoch": 1.2389140271493213, |
| "grad_norm": 0.5973348021507263, |
| "learning_rate": 0.0001928245732947935, |
| "loss": 0.15496382117271423, |
| "mean_token_accuracy": 0.9576286971569061, |
| "num_tokens": 3030878.0, |
| "step": 343 |
| }, |
| { |
| "entropy": 1.3223533630371094, |
| "epoch": 1.2425339366515837, |
| "grad_norm": 0.5969900488853455, |
| "learning_rate": 0.0001927730046289369, |
| "loss": 0.15188942849636078, |
| "mean_token_accuracy": 0.9633741676807404, |
| "num_tokens": 3039311.0, |
| "step": 344 |
| }, |
| { |
| "entropy": 1.2445839643478394, |
| "epoch": 1.2461538461538462, |
| "grad_norm": 0.528199315071106, |
| "learning_rate": 0.00019272125906808038, |
| "loss": 0.17896166443824768, |
| "mean_token_accuracy": 0.9567457586526871, |
| "num_tokens": 3048358.0, |
| "step": 345 |
| }, |
| { |
| "entropy": 1.2038472890853882, |
| "epoch": 1.2497737556561086, |
| "grad_norm": 0.5293557643890381, |
| "learning_rate": 0.00019266933672280998, |
| "loss": 0.09390994906425476, |
| "mean_token_accuracy": 0.9739362895488739, |
| "num_tokens": 3057158.0, |
| "step": 346 |
| }, |
| { |
| "entropy": 1.3120388686656952, |
| "epoch": 1.253393665158371, |
| "grad_norm": 0.6997618675231934, |
| "learning_rate": 0.00019261723770408942, |
| "loss": 0.10516057163476944, |
| "mean_token_accuracy": 0.9721816033124924, |
| "num_tokens": 3065552.0, |
| "step": 347 |
| }, |
| { |
| "entropy": 1.2618502080440521, |
| "epoch": 1.2570135746606335, |
| "grad_norm": 0.9361847043037415, |
| "learning_rate": 0.00019256496212326, |
| "loss": 0.20228593051433563, |
| "mean_token_accuracy": 0.952130600810051, |
| "num_tokens": 3074114.0, |
| "step": 348 |
| }, |
| { |
| "entropy": 1.1915834546089172, |
| "epoch": 1.260633484162896, |
| "grad_norm": 0.5289909243583679, |
| "learning_rate": 0.00019251251009204037, |
| "loss": 0.190103217959404, |
| "mean_token_accuracy": 0.9520878791809082, |
| "num_tokens": 3083265.0, |
| "step": 349 |
| }, |
| { |
| "entropy": 1.2389221489429474, |
| "epoch": 1.2642533936651583, |
| "grad_norm": 0.5938348770141602, |
| "learning_rate": 0.0001924598817225263, |
| "loss": 0.14038029313087463, |
| "mean_token_accuracy": 0.9594598710536957, |
| "num_tokens": 3092345.0, |
| "step": 350 |
| }, |
| { |
| "entropy": 1.2747001945972443, |
| "epoch": 1.2678733031674208, |
| "grad_norm": 0.49824684858322144, |
| "learning_rate": 0.00019240707712719042, |
| "loss": 0.09080217778682709, |
| "mean_token_accuracy": 0.9752634167671204, |
| "num_tokens": 3100948.0, |
| "step": 351 |
| }, |
| { |
| "entropy": 1.2456386089324951, |
| "epoch": 1.2714932126696832, |
| "grad_norm": 0.4126341938972473, |
| "learning_rate": 0.0001923540964188819, |
| "loss": 0.07552246004343033, |
| "mean_token_accuracy": 0.973249226808548, |
| "num_tokens": 3109527.0, |
| "step": 352 |
| }, |
| { |
| "entropy": 1.2375999987125397, |
| "epoch": 1.2751131221719456, |
| "grad_norm": 0.5416721701622009, |
| "learning_rate": 0.0001923009397108264, |
| "loss": 0.11064227670431137, |
| "mean_token_accuracy": 0.9698382914066315, |
| "num_tokens": 3118529.0, |
| "step": 353 |
| }, |
| { |
| "entropy": 1.323571503162384, |
| "epoch": 1.278733031674208, |
| "grad_norm": 0.539566159248352, |
| "learning_rate": 0.00019224760711662555, |
| "loss": 0.06606832891702652, |
| "mean_token_accuracy": 0.9804727733135223, |
| "num_tokens": 3127119.0, |
| "step": 354 |
| }, |
| { |
| "entropy": 1.2750852406024933, |
| "epoch": 1.2823529411764705, |
| "grad_norm": 0.466310054063797, |
| "learning_rate": 0.00019219409875025705, |
| "loss": 0.09571981430053711, |
| "mean_token_accuracy": 0.9744787514209747, |
| "num_tokens": 3135490.0, |
| "step": 355 |
| }, |
| { |
| "entropy": 1.2746248841285706, |
| "epoch": 1.285972850678733, |
| "grad_norm": 0.5672609210014343, |
| "learning_rate": 0.00019214041472607408, |
| "loss": 0.11834894865751266, |
| "mean_token_accuracy": 0.9640013575553894, |
| "num_tokens": 3144069.0, |
| "step": 356 |
| }, |
| { |
| "entropy": 1.2627168595790863, |
| "epoch": 1.2895927601809956, |
| "grad_norm": 0.6581894755363464, |
| "learning_rate": 0.00019208655515880532, |
| "loss": 0.09647037833929062, |
| "mean_token_accuracy": 0.9689393192529678, |
| "num_tokens": 3152615.0, |
| "step": 357 |
| }, |
| { |
| "entropy": 1.1794680655002594, |
| "epoch": 1.293212669683258, |
| "grad_norm": 0.5053655505180359, |
| "learning_rate": 0.00019203252016355458, |
| "loss": 0.11060067266225815, |
| "mean_token_accuracy": 0.9651738703250885, |
| "num_tokens": 3161401.0, |
| "step": 358 |
| }, |
| { |
| "entropy": 1.212617665529251, |
| "epoch": 1.2968325791855204, |
| "grad_norm": 0.8028094172477722, |
| "learning_rate": 0.00019197830985580064, |
| "loss": 0.2338217943906784, |
| "mean_token_accuracy": 0.9495572596788406, |
| "num_tokens": 3170012.0, |
| "step": 359 |
| }, |
| { |
| "entropy": 1.1559679508209229, |
| "epoch": 1.3004524886877828, |
| "grad_norm": 0.7312418818473816, |
| "learning_rate": 0.00019192392435139676, |
| "loss": 0.20356854796409607, |
| "mean_token_accuracy": 0.948117196559906, |
| "num_tokens": 3179571.0, |
| "step": 360 |
| }, |
| { |
| "entropy": 1.23529851436615, |
| "epoch": 1.3040723981900453, |
| "grad_norm": 0.5860849022865295, |
| "learning_rate": 0.00019186936376657085, |
| "loss": 0.09007853269577026, |
| "mean_token_accuracy": 0.9779597371816635, |
| "num_tokens": 3188355.0, |
| "step": 361 |
| }, |
| { |
| "entropy": 1.1607768833637238, |
| "epoch": 1.3076923076923077, |
| "grad_norm": 0.5011453032493591, |
| "learning_rate": 0.0001918146282179248, |
| "loss": 0.10823806375265121, |
| "mean_token_accuracy": 0.9690431505441666, |
| "num_tokens": 3197273.0, |
| "step": 362 |
| }, |
| { |
| "entropy": 1.3031161725521088, |
| "epoch": 1.3113122171945701, |
| "grad_norm": 0.7570735812187195, |
| "learning_rate": 0.0001917597178224345, |
| "loss": 0.12617093324661255, |
| "mean_token_accuracy": 0.9726371467113495, |
| "num_tokens": 3205523.0, |
| "step": 363 |
| }, |
| { |
| "entropy": 1.2163672745227814, |
| "epoch": 1.3149321266968326, |
| "grad_norm": 0.79036545753479, |
| "learning_rate": 0.0001917046326974495, |
| "loss": 0.1774558573961258, |
| "mean_token_accuracy": 0.9524620473384857, |
| "num_tokens": 3214337.0, |
| "step": 364 |
| }, |
| { |
| "entropy": 1.2065471410751343, |
| "epoch": 1.318552036199095, |
| "grad_norm": 0.4970189332962036, |
| "learning_rate": 0.00019164937296069275, |
| "loss": 0.11900650709867477, |
| "mean_token_accuracy": 0.9698603302240372, |
| "num_tokens": 3223272.0, |
| "step": 365 |
| }, |
| { |
| "entropy": 1.1421026289463043, |
| "epoch": 1.3221719457013574, |
| "grad_norm": 0.4852188527584076, |
| "learning_rate": 0.0001915939387302604, |
| "loss": 0.13389186561107635, |
| "mean_token_accuracy": 0.9624761492013931, |
| "num_tokens": 3232805.0, |
| "step": 366 |
| }, |
| { |
| "entropy": 1.2605140805244446, |
| "epoch": 1.3257918552036199, |
| "grad_norm": 0.4731639623641968, |
| "learning_rate": 0.00019153833012462148, |
| "loss": 0.13248053193092346, |
| "mean_token_accuracy": 0.9626576453447342, |
| "num_tokens": 3242004.0, |
| "step": 367 |
| }, |
| { |
| "entropy": 1.2730947136878967, |
| "epoch": 1.3294117647058823, |
| "grad_norm": 0.5051520466804504, |
| "learning_rate": 0.0001914825472626177, |
| "loss": 0.13219039142131805, |
| "mean_token_accuracy": 0.9609686881303787, |
| "num_tokens": 3250883.0, |
| "step": 368 |
| }, |
| { |
| "entropy": 1.2701692283153534, |
| "epoch": 1.3330316742081447, |
| "grad_norm": 0.4948354959487915, |
| "learning_rate": 0.00019142659026346315, |
| "loss": 0.11131806671619415, |
| "mean_token_accuracy": 0.9648500084877014, |
| "num_tokens": 3259677.0, |
| "step": 369 |
| }, |
| { |
| "entropy": 1.1959101557731628, |
| "epoch": 1.3366515837104074, |
| "grad_norm": 0.4939797520637512, |
| "learning_rate": 0.00019137045924674402, |
| "loss": 0.23960143327713013, |
| "mean_token_accuracy": 0.9330800324678421, |
| "num_tokens": 3268905.0, |
| "step": 370 |
| }, |
| { |
| "entropy": 1.233262300491333, |
| "epoch": 1.3402714932126698, |
| "grad_norm": 0.5234730839729309, |
| "learning_rate": 0.00019131415433241855, |
| "loss": 0.1446327269077301, |
| "mean_token_accuracy": 0.960878998041153, |
| "num_tokens": 3278287.0, |
| "step": 371 |
| }, |
| { |
| "entropy": 1.273567020893097, |
| "epoch": 1.3438914027149322, |
| "grad_norm": 0.8267992734909058, |
| "learning_rate": 0.0001912576756408165, |
| "loss": 0.18108022212982178, |
| "mean_token_accuracy": 0.957704022526741, |
| "num_tokens": 3287174.0, |
| "step": 372 |
| }, |
| { |
| "entropy": 1.222804993391037, |
| "epoch": 1.3475113122171947, |
| "grad_norm": 0.45400527119636536, |
| "learning_rate": 0.000191201023292639, |
| "loss": 0.0790121927857399, |
| "mean_token_accuracy": 0.9772313088178635, |
| "num_tokens": 3296209.0, |
| "step": 373 |
| }, |
| { |
| "entropy": 1.2407637536525726, |
| "epoch": 1.351131221719457, |
| "grad_norm": 0.4726577401161194, |
| "learning_rate": 0.00019114419740895837, |
| "loss": 0.09999781101942062, |
| "mean_token_accuracy": 0.9702717959880829, |
| "num_tokens": 3304805.0, |
| "step": 374 |
| }, |
| { |
| "entropy": 1.287319839000702, |
| "epoch": 1.3547511312217195, |
| "grad_norm": 0.7152829766273499, |
| "learning_rate": 0.00019108719811121772, |
| "loss": 0.24104124307632446, |
| "mean_token_accuracy": 0.9389240592718124, |
| "num_tokens": 3313481.0, |
| "step": 375 |
| }, |
| { |
| "entropy": 1.3423154652118683, |
| "epoch": 1.358371040723982, |
| "grad_norm": 0.643339991569519, |
| "learning_rate": 0.00019103002552123087, |
| "loss": 0.12666553258895874, |
| "mean_token_accuracy": 0.9654455035924911, |
| "num_tokens": 3321945.0, |
| "step": 376 |
| }, |
| { |
| "entropy": 1.291712373495102, |
| "epoch": 1.3619909502262444, |
| "grad_norm": 0.6353316307067871, |
| "learning_rate": 0.0001909726797611819, |
| "loss": 0.19853462278842926, |
| "mean_token_accuracy": 0.9522408545017242, |
| "num_tokens": 3330280.0, |
| "step": 377 |
| }, |
| { |
| "entropy": 1.2879594564437866, |
| "epoch": 1.3656108597285068, |
| "grad_norm": 0.5303124785423279, |
| "learning_rate": 0.000190915160953625, |
| "loss": 0.07663790136575699, |
| "mean_token_accuracy": 0.976201668381691, |
| "num_tokens": 3338655.0, |
| "step": 378 |
| }, |
| { |
| "entropy": 1.2824178040027618, |
| "epoch": 1.3692307692307693, |
| "grad_norm": 0.7815021872520447, |
| "learning_rate": 0.00019085746922148413, |
| "loss": 0.29046493768692017, |
| "mean_token_accuracy": 0.9358613342046738, |
| "num_tokens": 3347371.0, |
| "step": 379 |
| }, |
| { |
| "entropy": 1.3061807453632355, |
| "epoch": 1.3728506787330317, |
| "grad_norm": 0.6244264841079712, |
| "learning_rate": 0.00019079960468805293, |
| "loss": 0.14304828643798828, |
| "mean_token_accuracy": 0.9528721123933792, |
| "num_tokens": 3355742.0, |
| "step": 380 |
| }, |
| { |
| "entropy": 1.2946366965770721, |
| "epoch": 1.3764705882352941, |
| "grad_norm": 0.5752382278442383, |
| "learning_rate": 0.0001907415674769942, |
| "loss": 0.10442131012678146, |
| "mean_token_accuracy": 0.9699090272188187, |
| "num_tokens": 3364520.0, |
| "step": 381 |
| }, |
| { |
| "entropy": 1.341704785823822, |
| "epoch": 1.3800904977375565, |
| "grad_norm": 0.5594130754470825, |
| "learning_rate": 0.00019068335771233987, |
| "loss": 0.15637581050395966, |
| "mean_token_accuracy": 0.957904726266861, |
| "num_tokens": 3373052.0, |
| "step": 382 |
| }, |
| { |
| "entropy": 1.2504234313964844, |
| "epoch": 1.383710407239819, |
| "grad_norm": 0.5094558596611023, |
| "learning_rate": 0.0001906249755184906, |
| "loss": 0.1411437839269638, |
| "mean_token_accuracy": 0.9628296792507172, |
| "num_tokens": 3382086.0, |
| "step": 383 |
| }, |
| { |
| "entropy": 1.2977957129478455, |
| "epoch": 1.3873303167420814, |
| "grad_norm": 0.745290994644165, |
| "learning_rate": 0.00019056642102021555, |
| "loss": 0.16349725425243378, |
| "mean_token_accuracy": 0.9588443785905838, |
| "num_tokens": 3391079.0, |
| "step": 384 |
| }, |
| { |
| "entropy": 1.3416646420955658, |
| "epoch": 1.3909502262443438, |
| "grad_norm": 0.6071876287460327, |
| "learning_rate": 0.00019050769434265206, |
| "loss": 0.09388062357902527, |
| "mean_token_accuracy": 0.9732513576745987, |
| "num_tokens": 3399955.0, |
| "step": 385 |
| }, |
| { |
| "entropy": 1.2769374251365662, |
| "epoch": 1.3945701357466063, |
| "grad_norm": 0.458711713552475, |
| "learning_rate": 0.00019044879561130553, |
| "loss": 0.0859433189034462, |
| "mean_token_accuracy": 0.9764412939548492, |
| "num_tokens": 3408410.0, |
| "step": 386 |
| }, |
| { |
| "entropy": 1.2753216326236725, |
| "epoch": 1.3981900452488687, |
| "grad_norm": 0.5320125818252563, |
| "learning_rate": 0.00019038972495204906, |
| "loss": 0.12986987829208374, |
| "mean_token_accuracy": 0.9634640216827393, |
| "num_tokens": 3416972.0, |
| "step": 387 |
| }, |
| { |
| "entropy": 1.3119005858898163, |
| "epoch": 1.4018099547511311, |
| "grad_norm": 0.5521829128265381, |
| "learning_rate": 0.00019033048249112304, |
| "loss": 0.12304998189210892, |
| "mean_token_accuracy": 0.9719865322113037, |
| "num_tokens": 3425822.0, |
| "step": 388 |
| }, |
| { |
| "entropy": 1.246106207370758, |
| "epoch": 1.4054298642533936, |
| "grad_norm": 0.8556618690490723, |
| "learning_rate": 0.00019027106835513519, |
| "loss": 0.09790987521409988, |
| "mean_token_accuracy": 0.9762465804815292, |
| "num_tokens": 3434913.0, |
| "step": 389 |
| }, |
| { |
| "entropy": 1.3027318120002747, |
| "epoch": 1.409049773755656, |
| "grad_norm": 0.5831668972969055, |
| "learning_rate": 0.00019021148267106002, |
| "loss": 0.12975762784481049, |
| "mean_token_accuracy": 0.9633228182792664, |
| "num_tokens": 3443968.0, |
| "step": 390 |
| }, |
| { |
| "entropy": 1.2715855538845062, |
| "epoch": 1.4126696832579184, |
| "grad_norm": 0.5838220715522766, |
| "learning_rate": 0.00019015172556623863, |
| "loss": 0.24836933612823486, |
| "mean_token_accuracy": 0.9483881741762161, |
| "num_tokens": 3453112.0, |
| "step": 391 |
| }, |
| { |
| "entropy": 1.282654047012329, |
| "epoch": 1.416289592760181, |
| "grad_norm": 0.5957282185554504, |
| "learning_rate": 0.00019009179716837865, |
| "loss": 0.15488135814666748, |
| "mean_token_accuracy": 0.9558688700199127, |
| "num_tokens": 3461543.0, |
| "step": 392 |
| }, |
| { |
| "entropy": 1.2500621974468231, |
| "epoch": 1.4199095022624435, |
| "grad_norm": 0.5969480872154236, |
| "learning_rate": 0.0001900316976055535, |
| "loss": 0.118685781955719, |
| "mean_token_accuracy": 0.9681677222251892, |
| "num_tokens": 3470434.0, |
| "step": 393 |
| }, |
| { |
| "entropy": 1.1552152931690216, |
| "epoch": 1.423529411764706, |
| "grad_norm": 0.9155212044715881, |
| "learning_rate": 0.00018997142700620257, |
| "loss": 0.07897457480430603, |
| "mean_token_accuracy": 0.9746341109275818, |
| "num_tokens": 3479766.0, |
| "step": 394 |
| }, |
| { |
| "entropy": 1.2394072115421295, |
| "epoch": 1.4271493212669684, |
| "grad_norm": 0.5433281064033508, |
| "learning_rate": 0.00018991098549913084, |
| "loss": 0.10004748404026031, |
| "mean_token_accuracy": 0.970876082777977, |
| "num_tokens": 3487864.0, |
| "step": 395 |
| }, |
| { |
| "entropy": 1.194315493106842, |
| "epoch": 1.4307692307692308, |
| "grad_norm": 0.5625967383384705, |
| "learning_rate": 0.00018985037321350836, |
| "loss": 0.15023575723171234, |
| "mean_token_accuracy": 0.9571869820356369, |
| "num_tokens": 3496909.0, |
| "step": 396 |
| }, |
| { |
| "entropy": 1.1606856882572174, |
| "epoch": 1.4343891402714932, |
| "grad_norm": 0.41998282074928284, |
| "learning_rate": 0.0001897895902788703, |
| "loss": 0.06819174438714981, |
| "mean_token_accuracy": 0.9810739904642105, |
| "num_tokens": 3505970.0, |
| "step": 397 |
| }, |
| { |
| "entropy": 1.2330349385738373, |
| "epoch": 1.4380090497737557, |
| "grad_norm": 0.46300387382507324, |
| "learning_rate": 0.00018972863682511639, |
| "loss": 0.11061134934425354, |
| "mean_token_accuracy": 0.9722975939512253, |
| "num_tokens": 3514464.0, |
| "step": 398 |
| }, |
| { |
| "entropy": 1.2248220443725586, |
| "epoch": 1.441628959276018, |
| "grad_norm": 0.6571424007415771, |
| "learning_rate": 0.00018966751298251093, |
| "loss": 0.17451299726963043, |
| "mean_token_accuracy": 0.9536366164684296, |
| "num_tokens": 3523449.0, |
| "step": 399 |
| }, |
| { |
| "entropy": 1.2604781985282898, |
| "epoch": 1.4452488687782805, |
| "grad_norm": 0.7056911587715149, |
| "learning_rate": 0.00018960621888168224, |
| "loss": 0.1714896708726883, |
| "mean_token_accuracy": 0.9471757411956787, |
| "num_tokens": 3532403.0, |
| "step": 400 |
| }, |
| { |
| "entropy": 1.2510155141353607, |
| "epoch": 1.448868778280543, |
| "grad_norm": 0.582774817943573, |
| "learning_rate": 0.00018954475465362256, |
| "loss": 0.1749400794506073, |
| "mean_token_accuracy": 0.9718905985355377, |
| "num_tokens": 3541154.0, |
| "step": 401 |
| }, |
| { |
| "entropy": 1.3139209747314453, |
| "epoch": 1.4524886877828054, |
| "grad_norm": 0.5442278981208801, |
| "learning_rate": 0.00018948312042968768, |
| "loss": 0.14033550024032593, |
| "mean_token_accuracy": 0.9652996808290482, |
| "num_tokens": 3549472.0, |
| "step": 402 |
| }, |
| { |
| "entropy": 1.2959212362766266, |
| "epoch": 1.4561085972850678, |
| "grad_norm": 0.5408539175987244, |
| "learning_rate": 0.00018942131634159672, |
| "loss": 0.10644003748893738, |
| "mean_token_accuracy": 0.9660961031913757, |
| "num_tokens": 3557890.0, |
| "step": 403 |
| }, |
| { |
| "entropy": 1.285716027021408, |
| "epoch": 1.4597285067873302, |
| "grad_norm": 0.49533000588417053, |
| "learning_rate": 0.00018935934252143182, |
| "loss": 0.12249112129211426, |
| "mean_token_accuracy": 0.9659420847892761, |
| "num_tokens": 3566859.0, |
| "step": 404 |
| }, |
| { |
| "entropy": 1.270496904850006, |
| "epoch": 1.463348416289593, |
| "grad_norm": 0.4820156991481781, |
| "learning_rate": 0.0001892971991016378, |
| "loss": 0.11467836797237396, |
| "mean_token_accuracy": 0.9734348058700562, |
| "num_tokens": 3575782.0, |
| "step": 405 |
| }, |
| { |
| "entropy": 1.2943720519542694, |
| "epoch": 1.4669683257918553, |
| "grad_norm": 0.34380054473876953, |
| "learning_rate": 0.00018923488621502198, |
| "loss": 0.06653222441673279, |
| "mean_token_accuracy": 0.9807761162519455, |
| "num_tokens": 3584468.0, |
| "step": 406 |
| }, |
| { |
| "entropy": 1.3197762072086334, |
| "epoch": 1.4705882352941178, |
| "grad_norm": 0.5286886692047119, |
| "learning_rate": 0.00018917240399475387, |
| "loss": 0.1562788039445877, |
| "mean_token_accuracy": 0.9582570791244507, |
| "num_tokens": 3593198.0, |
| "step": 407 |
| }, |
| { |
| "entropy": 1.2224310040473938, |
| "epoch": 1.4742081447963802, |
| "grad_norm": 0.47084492444992065, |
| "learning_rate": 0.00018910975257436477, |
| "loss": 0.08669901639223099, |
| "mean_token_accuracy": 0.9738520681858063, |
| "num_tokens": 3602529.0, |
| "step": 408 |
| }, |
| { |
| "entropy": 1.2566787004470825, |
| "epoch": 1.4778280542986426, |
| "grad_norm": 0.5421281456947327, |
| "learning_rate": 0.00018904693208774773, |
| "loss": 0.09741362929344177, |
| "mean_token_accuracy": 0.9702080637216568, |
| "num_tokens": 3611438.0, |
| "step": 409 |
| }, |
| { |
| "entropy": 1.2232867777347565, |
| "epoch": 1.481447963800905, |
| "grad_norm": 0.591533899307251, |
| "learning_rate": 0.000188983942669157, |
| "loss": 0.1027684137225151, |
| "mean_token_accuracy": 0.9754174500703812, |
| "num_tokens": 3620209.0, |
| "step": 410 |
| }, |
| { |
| "entropy": 1.172331839799881, |
| "epoch": 1.4850678733031675, |
| "grad_norm": 0.6023557782173157, |
| "learning_rate": 0.00018892078445320785, |
| "loss": 0.19547313451766968, |
| "mean_token_accuracy": 0.9482788294553757, |
| "num_tokens": 3629483.0, |
| "step": 411 |
| }, |
| { |
| "entropy": 1.1692214012145996, |
| "epoch": 1.48868778280543, |
| "grad_norm": 0.4271499216556549, |
| "learning_rate": 0.00018885745757487633, |
| "loss": 0.06866015493869781, |
| "mean_token_accuracy": 0.9770233035087585, |
| "num_tokens": 3638167.0, |
| "step": 412 |
| }, |
| { |
| "entropy": 1.218559056520462, |
| "epoch": 1.4923076923076923, |
| "grad_norm": 0.6048818826675415, |
| "learning_rate": 0.00018879396216949895, |
| "loss": 0.1184379905462265, |
| "mean_token_accuracy": 0.9668450653553009, |
| "num_tokens": 3646941.0, |
| "step": 413 |
| }, |
| { |
| "entropy": 1.1115884184837341, |
| "epoch": 1.4959276018099548, |
| "grad_norm": 0.5207604765892029, |
| "learning_rate": 0.00018873029837277236, |
| "loss": 0.09107951819896698, |
| "mean_token_accuracy": 0.970865860581398, |
| "num_tokens": 3656408.0, |
| "step": 414 |
| }, |
| { |
| "entropy": 1.201383799314499, |
| "epoch": 1.4995475113122172, |
| "grad_norm": 0.6223848462104797, |
| "learning_rate": 0.0001886664663207531, |
| "loss": 0.14010068774223328, |
| "mean_token_accuracy": 0.9613067805767059, |
| "num_tokens": 3665223.0, |
| "step": 415 |
| }, |
| { |
| "entropy": 1.2117818593978882, |
| "epoch": 1.5031674208144796, |
| "grad_norm": 0.5893263220787048, |
| "learning_rate": 0.00018860246614985725, |
| "loss": 0.09732384979724884, |
| "mean_token_accuracy": 0.9756953567266464, |
| "num_tokens": 3673568.0, |
| "step": 416 |
| }, |
| { |
| "entropy": 1.068862423300743, |
| "epoch": 1.506787330316742, |
| "grad_norm": 0.6032145023345947, |
| "learning_rate": 0.0001885382979968602, |
| "loss": 0.16733071208000183, |
| "mean_token_accuracy": 0.9616134315729141, |
| "num_tokens": 3683588.0, |
| "step": 417 |
| }, |
| { |
| "entropy": 1.164185881614685, |
| "epoch": 1.5104072398190045, |
| "grad_norm": 0.8030836582183838, |
| "learning_rate": 0.00018847396199889638, |
| "loss": 0.191024512052536, |
| "mean_token_accuracy": 0.9492377042770386, |
| "num_tokens": 3692221.0, |
| "step": 418 |
| }, |
| { |
| "entropy": 1.2202486097812653, |
| "epoch": 1.514027149321267, |
| "grad_norm": 0.45743128657341003, |
| "learning_rate": 0.00018840945829345885, |
| "loss": 0.0803522914648056, |
| "mean_token_accuracy": 0.9740827530622482, |
| "num_tokens": 3700599.0, |
| "step": 419 |
| }, |
| { |
| "entropy": 1.2234172523021698, |
| "epoch": 1.5176470588235293, |
| "grad_norm": 0.6399345397949219, |
| "learning_rate": 0.0001883447870183991, |
| "loss": 0.13413016498088837, |
| "mean_token_accuracy": 0.9694488942623138, |
| "num_tokens": 3709316.0, |
| "step": 420 |
| }, |
| { |
| "entropy": 1.219748318195343, |
| "epoch": 1.5212669683257918, |
| "grad_norm": 0.8917973041534424, |
| "learning_rate": 0.00018827994831192675, |
| "loss": 0.14400342106819153, |
| "mean_token_accuracy": 0.9627624005079269, |
| "num_tokens": 3717808.0, |
| "step": 421 |
| }, |
| { |
| "entropy": 1.257588416337967, |
| "epoch": 1.5248868778280542, |
| "grad_norm": 0.61835116147995, |
| "learning_rate": 0.0001882149423126093, |
| "loss": 0.12378428876399994, |
| "mean_token_accuracy": 0.9704181104898453, |
| "num_tokens": 3726278.0, |
| "step": 422 |
| }, |
| { |
| "entropy": 1.2053538858890533, |
| "epoch": 1.5285067873303166, |
| "grad_norm": 0.5542409420013428, |
| "learning_rate": 0.0001881497691593716, |
| "loss": 0.0971193015575409, |
| "mean_token_accuracy": 0.9743164777755737, |
| "num_tokens": 3734935.0, |
| "step": 423 |
| }, |
| { |
| "entropy": 1.1871007978916168, |
| "epoch": 1.532126696832579, |
| "grad_norm": 0.3344699740409851, |
| "learning_rate": 0.0001880844289914959, |
| "loss": 0.03764911741018295, |
| "mean_token_accuracy": 0.9889417439699173, |
| "num_tokens": 3743868.0, |
| "step": 424 |
| }, |
| { |
| "entropy": 1.266874372959137, |
| "epoch": 1.5357466063348415, |
| "grad_norm": 0.5863392949104309, |
| "learning_rate": 0.0001880189219486213, |
| "loss": 0.12176309525966644, |
| "mean_token_accuracy": 0.9698539972305298, |
| "num_tokens": 3752414.0, |
| "step": 425 |
| }, |
| { |
| "entropy": 1.1402153968811035, |
| "epoch": 1.539366515837104, |
| "grad_norm": 0.4934159517288208, |
| "learning_rate": 0.00018795324817074354, |
| "loss": 0.1012497991323471, |
| "mean_token_accuracy": 0.9715209603309631, |
| "num_tokens": 3761525.0, |
| "step": 426 |
| }, |
| { |
| "entropy": 1.2725040912628174, |
| "epoch": 1.5429864253393664, |
| "grad_norm": 0.6184947490692139, |
| "learning_rate": 0.0001878874077982147, |
| "loss": 0.09586258977651596, |
| "mean_token_accuracy": 0.9663965255022049, |
| "num_tokens": 3769460.0, |
| "step": 427 |
| }, |
| { |
| "entropy": 1.1935326755046844, |
| "epoch": 1.5466063348416288, |
| "grad_norm": 0.711137056350708, |
| "learning_rate": 0.0001878214009717429, |
| "loss": 0.203983873128891, |
| "mean_token_accuracy": 0.9529214203357697, |
| "num_tokens": 3778214.0, |
| "step": 428 |
| }, |
| { |
| "entropy": 1.2496784329414368, |
| "epoch": 1.5502262443438914, |
| "grad_norm": 0.5968933701515198, |
| "learning_rate": 0.00018775522783239198, |
| "loss": 0.14222870767116547, |
| "mean_token_accuracy": 0.963072806596756, |
| "num_tokens": 3786842.0, |
| "step": 429 |
| }, |
| { |
| "entropy": 1.1680949032306671, |
| "epoch": 1.5538461538461539, |
| "grad_norm": 0.5371803641319275, |
| "learning_rate": 0.0001876888885215812, |
| "loss": 0.1057431548833847, |
| "mean_token_accuracy": 0.9662029445171356, |
| "num_tokens": 3795718.0, |
| "step": 430 |
| }, |
| { |
| "entropy": 1.1462930738925934, |
| "epoch": 1.5574660633484163, |
| "grad_norm": 0.4879329800605774, |
| "learning_rate": 0.0001876223831810849, |
| "loss": 0.0943731814622879, |
| "mean_token_accuracy": 0.9766229838132858, |
| "num_tokens": 3804833.0, |
| "step": 431 |
| }, |
| { |
| "entropy": 1.2148894369602203, |
| "epoch": 1.5610859728506787, |
| "grad_norm": 0.8211036324501038, |
| "learning_rate": 0.00018755571195303234, |
| "loss": 0.354468435049057, |
| "mean_token_accuracy": 0.9370880573987961, |
| "num_tokens": 3813804.0, |
| "step": 432 |
| }, |
| { |
| "entropy": 1.2123090624809265, |
| "epoch": 1.5647058823529412, |
| "grad_norm": 0.7378460168838501, |
| "learning_rate": 0.00018748887497990727, |
| "loss": 0.10816670954227448, |
| "mean_token_accuracy": 0.9683575332164764, |
| "num_tokens": 3822405.0, |
| "step": 433 |
| }, |
| { |
| "entropy": 1.224818378686905, |
| "epoch": 1.5683257918552036, |
| "grad_norm": 0.6279307007789612, |
| "learning_rate": 0.00018742187240454762, |
| "loss": 0.15239953994750977, |
| "mean_token_accuracy": 0.9667632728815079, |
| "num_tokens": 3830990.0, |
| "step": 434 |
| }, |
| { |
| "entropy": 1.2601450085639954, |
| "epoch": 1.571945701357466, |
| "grad_norm": 0.705889105796814, |
| "learning_rate": 0.00018735470437014523, |
| "loss": 0.07424315810203552, |
| "mean_token_accuracy": 0.9760804325342178, |
| "num_tokens": 3839416.0, |
| "step": 435 |
| }, |
| { |
| "entropy": 1.190734475851059, |
| "epoch": 1.5755656108597285, |
| "grad_norm": 0.3725827634334564, |
| "learning_rate": 0.00018728737102024557, |
| "loss": 0.06711282581090927, |
| "mean_token_accuracy": 0.9825543165206909, |
| "num_tokens": 3848851.0, |
| "step": 436 |
| }, |
| { |
| "entropy": 1.22418212890625, |
| "epoch": 1.5791855203619911, |
| "grad_norm": 0.46165597438812256, |
| "learning_rate": 0.00018721987249874746, |
| "loss": 0.0854751318693161, |
| "mean_token_accuracy": 0.9737638980150223, |
| "num_tokens": 3857922.0, |
| "step": 437 |
| }, |
| { |
| "entropy": 1.2956913709640503, |
| "epoch": 1.5828054298642535, |
| "grad_norm": 0.5615050196647644, |
| "learning_rate": 0.0001871522089499026, |
| "loss": 0.11327139288187027, |
| "mean_token_accuracy": 0.9696285277605057, |
| "num_tokens": 3866613.0, |
| "step": 438 |
| }, |
| { |
| "entropy": 1.2908932268619537, |
| "epoch": 1.586425339366516, |
| "grad_norm": 0.5302788615226746, |
| "learning_rate": 0.00018708438051831544, |
| "loss": 0.1113893985748291, |
| "mean_token_accuracy": 0.9665745049715042, |
| "num_tokens": 3875118.0, |
| "step": 439 |
| }, |
| { |
| "entropy": 1.217937409877777, |
| "epoch": 1.5900452488687784, |
| "grad_norm": 0.3633266091346741, |
| "learning_rate": 0.00018701638734894277, |
| "loss": 0.06300318986177444, |
| "mean_token_accuracy": 0.9804518818855286, |
| "num_tokens": 3884140.0, |
| "step": 440 |
| }, |
| { |
| "entropy": 1.246230572462082, |
| "epoch": 1.5936651583710408, |
| "grad_norm": 0.474832147359848, |
| "learning_rate": 0.00018694822958709346, |
| "loss": 0.07810146361589432, |
| "mean_token_accuracy": 0.9765773564577103, |
| "num_tokens": 3892875.0, |
| "step": 441 |
| }, |
| { |
| "entropy": 1.2694453299045563, |
| "epoch": 1.5972850678733033, |
| "grad_norm": 0.3883070647716522, |
| "learning_rate": 0.00018687990737842818, |
| "loss": 0.062109194695949554, |
| "mean_token_accuracy": 0.9821051061153412, |
| "num_tokens": 3901322.0, |
| "step": 442 |
| }, |
| { |
| "entropy": 1.2491609454154968, |
| "epoch": 1.6009049773755657, |
| "grad_norm": 0.6458966732025146, |
| "learning_rate": 0.0001868114208689589, |
| "loss": 0.16670571267604828, |
| "mean_token_accuracy": 0.9681277722120285, |
| "num_tokens": 3910066.0, |
| "step": 443 |
| }, |
| { |
| "entropy": 1.289526790380478, |
| "epoch": 1.6045248868778281, |
| "grad_norm": 0.498731404542923, |
| "learning_rate": 0.0001867427702050489, |
| "loss": 0.1024109423160553, |
| "mean_token_accuracy": 0.9770669341087341, |
| "num_tokens": 3918561.0, |
| "step": 444 |
| }, |
| { |
| "entropy": 1.3065584897994995, |
| "epoch": 1.6081447963800906, |
| "grad_norm": 0.5213361382484436, |
| "learning_rate": 0.00018667395553341213, |
| "loss": 0.10743463039398193, |
| "mean_token_accuracy": 0.968547523021698, |
| "num_tokens": 3927277.0, |
| "step": 445 |
| }, |
| { |
| "entropy": 1.2784769237041473, |
| "epoch": 1.611764705882353, |
| "grad_norm": 0.46484726667404175, |
| "learning_rate": 0.00018660497700111317, |
| "loss": 0.1371474266052246, |
| "mean_token_accuracy": 0.958917498588562, |
| "num_tokens": 3936569.0, |
| "step": 446 |
| }, |
| { |
| "entropy": 1.2938779890537262, |
| "epoch": 1.6153846153846154, |
| "grad_norm": 0.496535986661911, |
| "learning_rate": 0.00018653583475556663, |
| "loss": 0.07164958864450455, |
| "mean_token_accuracy": 0.9746866375207901, |
| "num_tokens": 3945175.0, |
| "step": 447 |
| }, |
| { |
| "entropy": 1.2572109401226044, |
| "epoch": 1.6190045248868778, |
| "grad_norm": 1.1173293590545654, |
| "learning_rate": 0.00018646652894453714, |
| "loss": 0.17376887798309326, |
| "mean_token_accuracy": 0.9627924859523773, |
| "num_tokens": 3953978.0, |
| "step": 448 |
| }, |
| { |
| "entropy": 1.2627245783805847, |
| "epoch": 1.6226244343891403, |
| "grad_norm": 0.7195766568183899, |
| "learning_rate": 0.00018639705971613878, |
| "loss": 0.1997443288564682, |
| "mean_token_accuracy": 0.9475196748971939, |
| "num_tokens": 3962897.0, |
| "step": 449 |
| }, |
| { |
| "entropy": 1.2522348463535309, |
| "epoch": 1.6262443438914027, |
| "grad_norm": 0.47655388712882996, |
| "learning_rate": 0.00018632742721883495, |
| "loss": 0.08064761012792587, |
| "mean_token_accuracy": 0.967804342508316, |
| "num_tokens": 3971740.0, |
| "step": 450 |
| }, |
| { |
| "entropy": 1.2210132479667664, |
| "epoch": 1.6298642533936651, |
| "grad_norm": 0.5680899620056152, |
| "learning_rate": 0.00018625763160143796, |
| "loss": 0.1746440976858139, |
| "mean_token_accuracy": 0.9542393088340759, |
| "num_tokens": 3980791.0, |
| "step": 451 |
| }, |
| { |
| "entropy": 1.276931256055832, |
| "epoch": 1.6334841628959276, |
| "grad_norm": 0.5334168076515198, |
| "learning_rate": 0.0001861876730131087, |
| "loss": 0.10449045896530151, |
| "mean_token_accuracy": 0.9746640473604202, |
| "num_tokens": 3990061.0, |
| "step": 452 |
| }, |
| { |
| "entropy": 1.3530596196651459, |
| "epoch": 1.63710407239819, |
| "grad_norm": 0.654348611831665, |
| "learning_rate": 0.00018611755160335633, |
| "loss": 0.13995029032230377, |
| "mean_token_accuracy": 0.9673926830291748, |
| "num_tokens": 3998360.0, |
| "step": 453 |
| }, |
| { |
| "entropy": 1.356580764055252, |
| "epoch": 1.6407239819004524, |
| "grad_norm": 0.7779679298400879, |
| "learning_rate": 0.000186047267522038, |
| "loss": 0.3456251621246338, |
| "mean_token_accuracy": 0.9260966181755066, |
| "num_tokens": 4007104.0, |
| "step": 454 |
| }, |
| { |
| "entropy": 1.2545486092567444, |
| "epoch": 1.6443438914027149, |
| "grad_norm": 0.49409937858581543, |
| "learning_rate": 0.00018597682091935856, |
| "loss": 0.13179728388786316, |
| "mean_token_accuracy": 0.9634029120206833, |
| "num_tokens": 4016114.0, |
| "step": 455 |
| }, |
| { |
| "entropy": 1.2673145532608032, |
| "epoch": 1.6479638009049773, |
| "grad_norm": 0.4457748532295227, |
| "learning_rate": 0.00018590621194587007, |
| "loss": 0.12703005969524384, |
| "mean_token_accuracy": 0.9695380330085754, |
| "num_tokens": 4025108.0, |
| "step": 456 |
| }, |
| { |
| "entropy": 1.2589110732078552, |
| "epoch": 1.6515837104072397, |
| "grad_norm": 0.5149383544921875, |
| "learning_rate": 0.0001858354407524717, |
| "loss": 0.14232893288135529, |
| "mean_token_accuracy": 0.9638843387365341, |
| "num_tokens": 4034230.0, |
| "step": 457 |
| }, |
| { |
| "entropy": 1.3335690796375275, |
| "epoch": 1.6552036199095022, |
| "grad_norm": 0.4933992326259613, |
| "learning_rate": 0.00018576450749040925, |
| "loss": 0.09372726082801819, |
| "mean_token_accuracy": 0.9729661494493484, |
| "num_tokens": 4043163.0, |
| "step": 458 |
| }, |
| { |
| "entropy": 1.2553574740886688, |
| "epoch": 1.6588235294117646, |
| "grad_norm": 0.426299124956131, |
| "learning_rate": 0.0001856934123112749, |
| "loss": 0.06914810836315155, |
| "mean_token_accuracy": 0.9822471588850021, |
| "num_tokens": 4051766.0, |
| "step": 459 |
| }, |
| { |
| "entropy": 1.2507081627845764, |
| "epoch": 1.662443438914027, |
| "grad_norm": 0.5942637324333191, |
| "learning_rate": 0.00018562215536700684, |
| "loss": 0.09914569556713104, |
| "mean_token_accuracy": 0.9764914512634277, |
| "num_tokens": 4061075.0, |
| "step": 460 |
| }, |
| { |
| "entropy": 1.2598736882209778, |
| "epoch": 1.6660633484162894, |
| "grad_norm": 0.43723762035369873, |
| "learning_rate": 0.000185550736809889, |
| "loss": 0.10020029544830322, |
| "mean_token_accuracy": 0.9784475862979889, |
| "num_tokens": 4069722.0, |
| "step": 461 |
| }, |
| { |
| "entropy": 1.25190868973732, |
| "epoch": 1.6696832579185519, |
| "grad_norm": 0.42693883180618286, |
| "learning_rate": 0.00018547915679255063, |
| "loss": 0.05682477727532387, |
| "mean_token_accuracy": 0.9844070225954056, |
| "num_tokens": 4078483.0, |
| "step": 462 |
| }, |
| { |
| "entropy": 1.231699526309967, |
| "epoch": 1.6733031674208145, |
| "grad_norm": 0.526006817817688, |
| "learning_rate": 0.00018540741546796616, |
| "loss": 0.0700770914554596, |
| "mean_token_accuracy": 0.9812077730894089, |
| "num_tokens": 4087298.0, |
| "step": 463 |
| }, |
| { |
| "entropy": 1.1822182536125183, |
| "epoch": 1.676923076923077, |
| "grad_norm": 0.5364736914634705, |
| "learning_rate": 0.00018533551298945467, |
| "loss": 0.11393093317747116, |
| "mean_token_accuracy": 0.9698948115110397, |
| "num_tokens": 4096459.0, |
| "step": 464 |
| }, |
| { |
| "entropy": 1.1950626969337463, |
| "epoch": 1.6805429864253394, |
| "grad_norm": 0.6466018557548523, |
| "learning_rate": 0.00018526344951067957, |
| "loss": 0.17861007153987885, |
| "mean_token_accuracy": 0.9639775156974792, |
| "num_tokens": 4105457.0, |
| "step": 465 |
| }, |
| { |
| "entropy": 1.1978155076503754, |
| "epoch": 1.6841628959276018, |
| "grad_norm": 0.4935831129550934, |
| "learning_rate": 0.00018519122518564853, |
| "loss": 0.08525866270065308, |
| "mean_token_accuracy": 0.9758468270301819, |
| "num_tokens": 4114038.0, |
| "step": 466 |
| }, |
| { |
| "entropy": 1.1613716185092926, |
| "epoch": 1.6877828054298643, |
| "grad_norm": 0.41199934482574463, |
| "learning_rate": 0.0001851188401687128, |
| "loss": 0.07017679512500763, |
| "mean_token_accuracy": 0.9776453971862793, |
| "num_tokens": 4123007.0, |
| "step": 467 |
| }, |
| { |
| "entropy": 1.1513382196426392, |
| "epoch": 1.6914027149321267, |
| "grad_norm": 0.5673828721046448, |
| "learning_rate": 0.00018504629461456716, |
| "loss": 0.09683945775032043, |
| "mean_token_accuracy": 0.9727943688631058, |
| "num_tokens": 4131925.0, |
| "step": 468 |
| }, |
| { |
| "entropy": 1.210301250219345, |
| "epoch": 1.6950226244343891, |
| "grad_norm": 0.5047227144241333, |
| "learning_rate": 0.00018497358867824933, |
| "loss": 0.08428950607776642, |
| "mean_token_accuracy": 0.9737090021371841, |
| "num_tokens": 4140851.0, |
| "step": 469 |
| }, |
| { |
| "entropy": 1.3032833933830261, |
| "epoch": 1.6986425339366515, |
| "grad_norm": 0.6038223505020142, |
| "learning_rate": 0.00018490072251513997, |
| "loss": 0.06815248727798462, |
| "mean_token_accuracy": 0.9779106676578522, |
| "num_tokens": 4148989.0, |
| "step": 470 |
| }, |
| { |
| "entropy": 1.23912513256073, |
| "epoch": 1.702262443438914, |
| "grad_norm": 0.6200883388519287, |
| "learning_rate": 0.00018482769628096207, |
| "loss": 0.08937155455350876, |
| "mean_token_accuracy": 0.9763010591268539, |
| "num_tokens": 4157517.0, |
| "step": 471 |
| }, |
| { |
| "entropy": 1.186729907989502, |
| "epoch": 1.7058823529411766, |
| "grad_norm": 0.5706359148025513, |
| "learning_rate": 0.00018475451013178062, |
| "loss": 0.1340227574110031, |
| "mean_token_accuracy": 0.9582885801792145, |
| "num_tokens": 4166422.0, |
| "step": 472 |
| }, |
| { |
| "entropy": 1.2165184915065765, |
| "epoch": 1.709502262443439, |
| "grad_norm": 0.4458298981189728, |
| "learning_rate": 0.00018468116422400258, |
| "loss": 0.06072482466697693, |
| "mean_token_accuracy": 0.9787209331989288, |
| "num_tokens": 4174473.0, |
| "step": 473 |
| }, |
| { |
| "entropy": 1.253999799489975, |
| "epoch": 1.7131221719457015, |
| "grad_norm": 0.5249391794204712, |
| "learning_rate": 0.00018460765871437614, |
| "loss": 0.13922284543514252, |
| "mean_token_accuracy": 0.9605212956666946, |
| "num_tokens": 4183121.0, |
| "step": 474 |
| }, |
| { |
| "entropy": 1.2191531360149384, |
| "epoch": 1.716742081447964, |
| "grad_norm": 0.6076857447624207, |
| "learning_rate": 0.0001845339937599906, |
| "loss": 0.12412364035844803, |
| "mean_token_accuracy": 0.9641828685998917, |
| "num_tokens": 4192043.0, |
| "step": 475 |
| }, |
| { |
| "entropy": 1.1414133310317993, |
| "epoch": 1.7203619909502263, |
| "grad_norm": 0.5084185004234314, |
| "learning_rate": 0.00018446016951827619, |
| "loss": 0.08176974952220917, |
| "mean_token_accuracy": 0.9742649793624878, |
| "num_tokens": 4201387.0, |
| "step": 476 |
| }, |
| { |
| "entropy": 1.2488122284412384, |
| "epoch": 1.7239819004524888, |
| "grad_norm": 0.5439050793647766, |
| "learning_rate": 0.0001843861861470033, |
| "loss": 0.12782233953475952, |
| "mean_token_accuracy": 0.9623425304889679, |
| "num_tokens": 4210216.0, |
| "step": 477 |
| }, |
| { |
| "entropy": 1.1921941936016083, |
| "epoch": 1.7276018099547512, |
| "grad_norm": 0.6951958537101746, |
| "learning_rate": 0.00018431204380428258, |
| "loss": 0.1784716546535492, |
| "mean_token_accuracy": 0.9505428522825241, |
| "num_tokens": 4219386.0, |
| "step": 478 |
| }, |
| { |
| "entropy": 1.2704735100269318, |
| "epoch": 1.7312217194570136, |
| "grad_norm": 0.4298340678215027, |
| "learning_rate": 0.00018423774264856433, |
| "loss": 0.07889077067375183, |
| "mean_token_accuracy": 0.9754424393177032, |
| "num_tokens": 4228012.0, |
| "step": 479 |
| }, |
| { |
| "entropy": 1.2802889347076416, |
| "epoch": 1.734841628959276, |
| "grad_norm": 0.6962547898292542, |
| "learning_rate": 0.00018416328283863827, |
| "loss": 0.10208003222942352, |
| "mean_token_accuracy": 0.9654501229524612, |
| "num_tokens": 4236361.0, |
| "step": 480 |
| }, |
| { |
| "entropy": 1.2991151809692383, |
| "epoch": 1.7384615384615385, |
| "grad_norm": 0.5058232545852661, |
| "learning_rate": 0.00018408866453363326, |
| "loss": 0.09221667051315308, |
| "mean_token_accuracy": 0.9807891100645065, |
| "num_tokens": 4244839.0, |
| "step": 481 |
| }, |
| { |
| "entropy": 1.2336131632328033, |
| "epoch": 1.742081447963801, |
| "grad_norm": 0.43272894620895386, |
| "learning_rate": 0.0001840138878930167, |
| "loss": 0.06068682670593262, |
| "mean_token_accuracy": 0.9835522323846817, |
| "num_tokens": 4253448.0, |
| "step": 482 |
| }, |
| { |
| "entropy": 1.1949119865894318, |
| "epoch": 1.7457013574660634, |
| "grad_norm": 0.49109941720962524, |
| "learning_rate": 0.00018393895307659456, |
| "loss": 0.11378560215234756, |
| "mean_token_accuracy": 0.9686966389417648, |
| "num_tokens": 4262859.0, |
| "step": 483 |
| }, |
| { |
| "entropy": 1.3476176857948303, |
| "epoch": 1.7493212669683258, |
| "grad_norm": 0.8294044733047485, |
| "learning_rate": 0.00018386386024451076, |
| "loss": 0.2184215486049652, |
| "mean_token_accuracy": 0.9302650094032288, |
| "num_tokens": 4271081.0, |
| "step": 484 |
| }, |
| { |
| "entropy": 1.2542327046394348, |
| "epoch": 1.7529411764705882, |
| "grad_norm": 0.5723432898521423, |
| "learning_rate": 0.0001837886095572469, |
| "loss": 0.12125033140182495, |
| "mean_token_accuracy": 0.9635651111602783, |
| "num_tokens": 4280186.0, |
| "step": 485 |
| }, |
| { |
| "entropy": 1.3042361438274384, |
| "epoch": 1.7565610859728507, |
| "grad_norm": 0.7513923645019531, |
| "learning_rate": 0.00018371320117562199, |
| "loss": 0.22429926693439484, |
| "mean_token_accuracy": 0.9487362802028656, |
| "num_tokens": 4288944.0, |
| "step": 486 |
| }, |
| { |
| "entropy": 1.2472188770771027, |
| "epoch": 1.760180995475113, |
| "grad_norm": 0.45380401611328125, |
| "learning_rate": 0.000183637635260792, |
| "loss": 0.09528672695159912, |
| "mean_token_accuracy": 0.972507655620575, |
| "num_tokens": 4297837.0, |
| "step": 487 |
| }, |
| { |
| "entropy": 1.2608753442764282, |
| "epoch": 1.7638009049773755, |
| "grad_norm": 0.4613839089870453, |
| "learning_rate": 0.00018356191197424964, |
| "loss": 0.12640029191970825, |
| "mean_token_accuracy": 0.9590530246496201, |
| "num_tokens": 4306763.0, |
| "step": 488 |
| }, |
| { |
| "entropy": 1.3038392961025238, |
| "epoch": 1.767420814479638, |
| "grad_norm": 0.484052836894989, |
| "learning_rate": 0.0001834860314778238, |
| "loss": 0.11759282648563385, |
| "mean_token_accuracy": 0.9666974991559982, |
| "num_tokens": 4315329.0, |
| "step": 489 |
| }, |
| { |
| "entropy": 1.2559982240200043, |
| "epoch": 1.7710407239819004, |
| "grad_norm": 0.3966261148452759, |
| "learning_rate": 0.00018340999393367952, |
| "loss": 0.060240764170885086, |
| "mean_token_accuracy": 0.9834811985492706, |
| "num_tokens": 4324074.0, |
| "step": 490 |
| }, |
| { |
| "entropy": 1.2651242315769196, |
| "epoch": 1.7746606334841628, |
| "grad_norm": 0.5276649594306946, |
| "learning_rate": 0.00018333379950431734, |
| "loss": 0.12336815893650055, |
| "mean_token_accuracy": 0.963963121175766, |
| "num_tokens": 4332893.0, |
| "step": 491 |
| }, |
| { |
| "entropy": 1.340283215045929, |
| "epoch": 1.7782805429864252, |
| "grad_norm": 0.7897153496742249, |
| "learning_rate": 0.0001832574483525731, |
| "loss": 0.47839581966400146, |
| "mean_token_accuracy": 0.9122047275304794, |
| "num_tokens": 4342135.0, |
| "step": 492 |
| }, |
| { |
| "entropy": 1.275327444076538, |
| "epoch": 1.7819004524886877, |
| "grad_norm": 0.5885195732116699, |
| "learning_rate": 0.00018318094064161765, |
| "loss": 0.15483446419239044, |
| "mean_token_accuracy": 0.962849572300911, |
| "num_tokens": 4351087.0, |
| "step": 493 |
| }, |
| { |
| "entropy": 1.3376893401145935, |
| "epoch": 1.78552036199095, |
| "grad_norm": 0.3955666124820709, |
| "learning_rate": 0.00018310427653495632, |
| "loss": 0.057708803564310074, |
| "mean_token_accuracy": 0.9868191480636597, |
| "num_tokens": 4359453.0, |
| "step": 494 |
| }, |
| { |
| "entropy": 1.2282173037528992, |
| "epoch": 1.7891402714932125, |
| "grad_norm": 0.4878976047039032, |
| "learning_rate": 0.00018302745619642874, |
| "loss": 0.09314609318971634, |
| "mean_token_accuracy": 0.9711224585771561, |
| "num_tokens": 4368135.0, |
| "step": 495 |
| }, |
| { |
| "entropy": 1.215164691209793, |
| "epoch": 1.792760180995475, |
| "grad_norm": 0.5175027847290039, |
| "learning_rate": 0.00018295047979020843, |
| "loss": 0.09527207911014557, |
| "mean_token_accuracy": 0.970024898648262, |
| "num_tokens": 4377547.0, |
| "step": 496 |
| }, |
| { |
| "entropy": 1.2793397009372711, |
| "epoch": 1.7963800904977374, |
| "grad_norm": 0.39276501536369324, |
| "learning_rate": 0.00018287334748080236, |
| "loss": 0.051972195506095886, |
| "mean_token_accuracy": 0.9798386096954346, |
| "num_tokens": 4386400.0, |
| "step": 497 |
| }, |
| { |
| "entropy": 1.2163749635219574, |
| "epoch": 1.8, |
| "grad_norm": 0.679814875125885, |
| "learning_rate": 0.00018279605943305084, |
| "loss": 0.17564961314201355, |
| "mean_token_accuracy": 0.9553549140691757, |
| "num_tokens": 4395216.0, |
| "step": 498 |
| }, |
| { |
| "entropy": 1.1838999092578888, |
| "epoch": 1.8036199095022625, |
| "grad_norm": 0.4638931155204773, |
| "learning_rate": 0.00018271861581212686, |
| "loss": 0.14034540951251984, |
| "mean_token_accuracy": 0.9645161777734756, |
| "num_tokens": 4404951.0, |
| "step": 499 |
| }, |
| { |
| "entropy": 1.3320802450180054, |
| "epoch": 1.807239819004525, |
| "grad_norm": 0.5198440551757812, |
| "learning_rate": 0.00018264101678353592, |
| "loss": 0.10376127064228058, |
| "mean_token_accuracy": 0.9734873324632645, |
| "num_tokens": 4413295.0, |
| "step": 500 |
| }, |
| { |
| "entropy": 1.3872873187065125, |
| "epoch": 1.8108597285067873, |
| "grad_norm": 0.6447911858558655, |
| "learning_rate": 0.00018256326251311572, |
| "loss": 0.14052145183086395, |
| "mean_token_accuracy": 0.9612767547369003, |
| "num_tokens": 4421685.0, |
| "step": 501 |
| }, |
| { |
| "entropy": 1.3084447383880615, |
| "epoch": 1.8144796380090498, |
| "grad_norm": 0.40712690353393555, |
| "learning_rate": 0.0001824853531670356, |
| "loss": 0.06982739269733429, |
| "mean_token_accuracy": 0.9787922501564026, |
| "num_tokens": 4430505.0, |
| "step": 502 |
| }, |
| { |
| "entropy": 1.2643664479255676, |
| "epoch": 1.8180995475113122, |
| "grad_norm": 0.5247007012367249, |
| "learning_rate": 0.00018240728891179647, |
| "loss": 0.13731859624385834, |
| "mean_token_accuracy": 0.9635806679725647, |
| "num_tokens": 4439489.0, |
| "step": 503 |
| }, |
| { |
| "entropy": 1.290818691253662, |
| "epoch": 1.8217194570135746, |
| "grad_norm": 0.5838543772697449, |
| "learning_rate": 0.00018232906991423015, |
| "loss": 0.14157697558403015, |
| "mean_token_accuracy": 0.9528656303882599, |
| "num_tokens": 4448345.0, |
| "step": 504 |
| }, |
| { |
| "entropy": 1.268191635608673, |
| "epoch": 1.825339366515837, |
| "grad_norm": 0.5802812576293945, |
| "learning_rate": 0.00018225069634149929, |
| "loss": 0.09528884291648865, |
| "mean_token_accuracy": 0.9706045240163803, |
| "num_tokens": 4456917.0, |
| "step": 505 |
| }, |
| { |
| "entropy": 1.3473228812217712, |
| "epoch": 1.8289592760180997, |
| "grad_norm": 0.4340367913246155, |
| "learning_rate": 0.0001821721683610968, |
| "loss": 0.10973142832517624, |
| "mean_token_accuracy": 0.9727098494768143, |
| "num_tokens": 4465709.0, |
| "step": 506 |
| }, |
| { |
| "entropy": 1.3878930509090424, |
| "epoch": 1.8325791855203621, |
| "grad_norm": 0.6490364074707031, |
| "learning_rate": 0.00018209348614084552, |
| "loss": 0.16322913765907288, |
| "mean_token_accuracy": 0.9474020302295685, |
| "num_tokens": 4474226.0, |
| "step": 507 |
| }, |
| { |
| "entropy": 1.2699617445468903, |
| "epoch": 1.8361990950226246, |
| "grad_norm": 0.3948211371898651, |
| "learning_rate": 0.0001820146498488981, |
| "loss": 0.06969399005174637, |
| "mean_token_accuracy": 0.9818782657384872, |
| "num_tokens": 4483339.0, |
| "step": 508 |
| }, |
| { |
| "entropy": 1.30024915933609, |
| "epoch": 1.839819004524887, |
| "grad_norm": 0.5742026567459106, |
| "learning_rate": 0.0001819356596537363, |
| "loss": 0.17156578600406647, |
| "mean_token_accuracy": 0.9574418365955353, |
| "num_tokens": 4491595.0, |
| "step": 509 |
| }, |
| { |
| "entropy": 1.1884125173091888, |
| "epoch": 1.8434389140271494, |
| "grad_norm": 0.9810559749603271, |
| "learning_rate": 0.00018185651572417082, |
| "loss": 0.10991943627595901, |
| "mean_token_accuracy": 0.9672488421201706, |
| "num_tokens": 4500876.0, |
| "step": 510 |
| }, |
| { |
| "entropy": 1.2843712866306305, |
| "epoch": 1.8470588235294119, |
| "grad_norm": 0.6281305551528931, |
| "learning_rate": 0.00018177721822934097, |
| "loss": 0.2122631072998047, |
| "mean_token_accuracy": 0.951048418879509, |
| "num_tokens": 4509761.0, |
| "step": 511 |
| }, |
| { |
| "entropy": 1.3060316145420074, |
| "epoch": 1.8506787330316743, |
| "grad_norm": 0.41631197929382324, |
| "learning_rate": 0.00018169776733871422, |
| "loss": 0.06948904693126678, |
| "mean_token_accuracy": 0.9814287573099136, |
| "num_tokens": 4518518.0, |
| "step": 512 |
| }, |
| { |
| "entropy": 1.3802857398986816, |
| "epoch": 1.8542986425339367, |
| "grad_norm": 0.6457367539405823, |
| "learning_rate": 0.0001816181632220858, |
| "loss": 0.13644376397132874, |
| "mean_token_accuracy": 0.9505161345005035, |
| "num_tokens": 4527031.0, |
| "step": 513 |
| }, |
| { |
| "entropy": 1.2905277013778687, |
| "epoch": 1.8579185520361992, |
| "grad_norm": 0.4269099831581116, |
| "learning_rate": 0.00018153840604957845, |
| "loss": 0.09844273328781128, |
| "mean_token_accuracy": 0.976461797952652, |
| "num_tokens": 4535711.0, |
| "step": 514 |
| }, |
| { |
| "entropy": 1.261955976486206, |
| "epoch": 1.8615384615384616, |
| "grad_norm": 0.4700486660003662, |
| "learning_rate": 0.00018145849599164205, |
| "loss": 0.09094507992267609, |
| "mean_token_accuracy": 0.9761765152215958, |
| "num_tokens": 4544533.0, |
| "step": 515 |
| }, |
| { |
| "entropy": 1.31758314371109, |
| "epoch": 1.865158371040724, |
| "grad_norm": 0.6493532061576843, |
| "learning_rate": 0.00018137843321905316, |
| "loss": 0.1887962967157364, |
| "mean_token_accuracy": 0.952616810798645, |
| "num_tokens": 4553242.0, |
| "step": 516 |
| }, |
| { |
| "entropy": 1.2938291728496552, |
| "epoch": 1.8687782805429864, |
| "grad_norm": 0.37741488218307495, |
| "learning_rate": 0.00018129821790291464, |
| "loss": 0.08777758479118347, |
| "mean_token_accuracy": 0.9782676845788956, |
| "num_tokens": 4561885.0, |
| "step": 517 |
| }, |
| { |
| "entropy": 1.3216747641563416, |
| "epoch": 1.8723981900452489, |
| "grad_norm": 0.632188618183136, |
| "learning_rate": 0.00018121785021465552, |
| "loss": 0.1349005103111267, |
| "mean_token_accuracy": 0.9619114547967911, |
| "num_tokens": 4570513.0, |
| "step": 518 |
| }, |
| { |
| "entropy": 1.342225968837738, |
| "epoch": 1.8760180995475113, |
| "grad_norm": 0.6463941335678101, |
| "learning_rate": 0.00018113733032603036, |
| "loss": 0.3203854560852051, |
| "mean_token_accuracy": 0.9548316597938538, |
| "num_tokens": 4579275.0, |
| "step": 519 |
| }, |
| { |
| "entropy": 1.2357032895088196, |
| "epoch": 1.8796380090497737, |
| "grad_norm": 0.43039470911026, |
| "learning_rate": 0.00018105665840911894, |
| "loss": 0.04711674526333809, |
| "mean_token_accuracy": 0.983293205499649, |
| "num_tokens": 4588329.0, |
| "step": 520 |
| }, |
| { |
| "entropy": 1.2061834037303925, |
| "epoch": 1.8832579185520362, |
| "grad_norm": 0.4798518419265747, |
| "learning_rate": 0.00018097583463632606, |
| "loss": 0.09780099987983704, |
| "mean_token_accuracy": 0.9670642167329788, |
| "num_tokens": 4597226.0, |
| "step": 521 |
| }, |
| { |
| "entropy": 1.267607867717743, |
| "epoch": 1.8868778280542986, |
| "grad_norm": 0.9377403855323792, |
| "learning_rate": 0.000180894859180381, |
| "loss": 0.19476808607578278, |
| "mean_token_accuracy": 0.95573590695858, |
| "num_tokens": 4605908.0, |
| "step": 522 |
| }, |
| { |
| "entropy": 1.140032321214676, |
| "epoch": 1.890497737556561, |
| "grad_norm": 0.4788985848426819, |
| "learning_rate": 0.00018081373221433717, |
| "loss": 0.1614486277103424, |
| "mean_token_accuracy": 0.9491380751132965, |
| "num_tokens": 4615514.0, |
| "step": 523 |
| }, |
| { |
| "entropy": 1.2712955176830292, |
| "epoch": 1.8941176470588235, |
| "grad_norm": 0.6569368243217468, |
| "learning_rate": 0.00018073245391157184, |
| "loss": 0.18930307030677795, |
| "mean_token_accuracy": 0.9584528356790543, |
| "num_tokens": 4624496.0, |
| "step": 524 |
| }, |
| { |
| "entropy": 1.2802537977695465, |
| "epoch": 1.897737556561086, |
| "grad_norm": 0.646899402141571, |
| "learning_rate": 0.00018065102444578566, |
| "loss": 0.292553186416626, |
| "mean_token_accuracy": 0.9424069970846176, |
| "num_tokens": 4633712.0, |
| "step": 525 |
| }, |
| { |
| "entropy": 1.2484091222286224, |
| "epoch": 1.9013574660633483, |
| "grad_norm": 0.5471976399421692, |
| "learning_rate": 0.0001805694439910023, |
| "loss": 0.07856341451406479, |
| "mean_token_accuracy": 0.9713937491178513, |
| "num_tokens": 4642625.0, |
| "step": 526 |
| }, |
| { |
| "entropy": 1.3004970252513885, |
| "epoch": 1.9049773755656108, |
| "grad_norm": 0.6762946248054504, |
| "learning_rate": 0.00018048771272156821, |
| "loss": 0.20788748562335968, |
| "mean_token_accuracy": 0.9558361321687698, |
| "num_tokens": 4651212.0, |
| "step": 527 |
| }, |
| { |
| "entropy": 1.2837709486484528, |
| "epoch": 1.9085972850678732, |
| "grad_norm": 0.5465518236160278, |
| "learning_rate": 0.00018040583081215206, |
| "loss": 0.06877493858337402, |
| "mean_token_accuracy": 0.9773479402065277, |
| "num_tokens": 4659999.0, |
| "step": 528 |
| }, |
| { |
| "entropy": 1.1701121926307678, |
| "epoch": 1.9122171945701356, |
| "grad_norm": 0.48923078179359436, |
| "learning_rate": 0.00018032379843774442, |
| "loss": 0.10492908954620361, |
| "mean_token_accuracy": 0.9667482525110245, |
| "num_tokens": 4669539.0, |
| "step": 529 |
| }, |
| { |
| "entropy": 1.265353113412857, |
| "epoch": 1.915837104072398, |
| "grad_norm": 0.5817871689796448, |
| "learning_rate": 0.0001802416157736576, |
| "loss": 0.1619410663843155, |
| "mean_token_accuracy": 0.9500089585781097, |
| "num_tokens": 4678597.0, |
| "step": 530 |
| }, |
| { |
| "entropy": 1.3536691665649414, |
| "epoch": 1.9194570135746605, |
| "grad_norm": 0.420358806848526, |
| "learning_rate": 0.0001801592829955249, |
| "loss": 0.05708397924900055, |
| "mean_token_accuracy": 0.9855902940034866, |
| "num_tokens": 4686961.0, |
| "step": 531 |
| }, |
| { |
| "entropy": 1.211147278547287, |
| "epoch": 1.9230769230769231, |
| "grad_norm": 0.6017248034477234, |
| "learning_rate": 0.00018007680027930053, |
| "loss": 0.14558392763137817, |
| "mean_token_accuracy": 0.957173079252243, |
| "num_tokens": 4696021.0, |
| "step": 532 |
| }, |
| { |
| "entropy": 1.2624102234840393, |
| "epoch": 1.9266968325791856, |
| "grad_norm": 0.409318208694458, |
| "learning_rate": 0.00017999416780125908, |
| "loss": 0.0590839758515358, |
| "mean_token_accuracy": 0.9783227145671844, |
| "num_tokens": 4704751.0, |
| "step": 533 |
| }, |
| { |
| "entropy": 1.2459297180175781, |
| "epoch": 1.930316742081448, |
| "grad_norm": 0.4089992046356201, |
| "learning_rate": 0.0001799113857379953, |
| "loss": 0.09553386270999908, |
| "mean_token_accuracy": 0.9749322384595871, |
| "num_tokens": 4713896.0, |
| "step": 534 |
| }, |
| { |
| "entropy": 1.23102468252182, |
| "epoch": 1.9339366515837104, |
| "grad_norm": 0.3723222315311432, |
| "learning_rate": 0.00017982845426642348, |
| "loss": 0.09167241305112839, |
| "mean_token_accuracy": 0.9732823669910431, |
| "num_tokens": 4723046.0, |
| "step": 535 |
| }, |
| { |
| "entropy": 1.3357162177562714, |
| "epoch": 1.9375565610859729, |
| "grad_norm": 0.48041215538978577, |
| "learning_rate": 0.00017974537356377733, |
| "loss": 0.09750382602214813, |
| "mean_token_accuracy": 0.9764254987239838, |
| "num_tokens": 4731993.0, |
| "step": 536 |
| }, |
| { |
| "entropy": 1.1988002955913544, |
| "epoch": 1.9411764705882353, |
| "grad_norm": 0.5414320826530457, |
| "learning_rate": 0.00017966214380760938, |
| "loss": 0.14568619430065155, |
| "mean_token_accuracy": 0.9629835188388824, |
| "num_tokens": 4741245.0, |
| "step": 537 |
| }, |
| { |
| "entropy": 1.2370244562625885, |
| "epoch": 1.9447963800904977, |
| "grad_norm": 0.45901384949684143, |
| "learning_rate": 0.00017957876517579076, |
| "loss": 0.0786663368344307, |
| "mean_token_accuracy": 0.9746132791042328, |
| "num_tokens": 4750000.0, |
| "step": 538 |
| }, |
| { |
| "entropy": 1.2960913479328156, |
| "epoch": 1.9484162895927601, |
| "grad_norm": 0.5607526898384094, |
| "learning_rate": 0.00017949523784651085, |
| "loss": 0.19930651783943176, |
| "mean_token_accuracy": 0.9484933167695999, |
| "num_tokens": 4758980.0, |
| "step": 539 |
| }, |
| { |
| "entropy": 1.3163366615772247, |
| "epoch": 1.9520361990950226, |
| "grad_norm": 0.4517749845981598, |
| "learning_rate": 0.00017941156199827664, |
| "loss": 0.05434092879295349, |
| "mean_token_accuracy": 0.9835511595010757, |
| "num_tokens": 4767348.0, |
| "step": 540 |
| }, |
| { |
| "entropy": 1.2590799927711487, |
| "epoch": 1.9556561085972852, |
| "grad_norm": 0.7361325025558472, |
| "learning_rate": 0.00017932773780991262, |
| "loss": 0.15642693638801575, |
| "mean_token_accuracy": 0.9532411098480225, |
| "num_tokens": 4776468.0, |
| "step": 541 |
| }, |
| { |
| "entropy": 1.2585523426532745, |
| "epoch": 1.9592760180995477, |
| "grad_norm": 0.7297177314758301, |
| "learning_rate": 0.00017924376546056035, |
| "loss": 0.18437303602695465, |
| "mean_token_accuracy": 0.9587585777044296, |
| "num_tokens": 4785492.0, |
| "step": 542 |
| }, |
| { |
| "entropy": 1.238587647676468, |
| "epoch": 1.96289592760181, |
| "grad_norm": 0.7112489342689514, |
| "learning_rate": 0.00017915964512967784, |
| "loss": 0.1504441499710083, |
| "mean_token_accuracy": 0.9600091278553009, |
| "num_tokens": 4794274.0, |
| "step": 543 |
| }, |
| { |
| "entropy": 1.2908321619033813, |
| "epoch": 1.9665158371040725, |
| "grad_norm": 0.6113592386245728, |
| "learning_rate": 0.00017907537699703955, |
| "loss": 0.11070753633975983, |
| "mean_token_accuracy": 0.9691345393657684, |
| "num_tokens": 4802874.0, |
| "step": 544 |
| }, |
| { |
| "entropy": 1.1804803311824799, |
| "epoch": 1.970135746606335, |
| "grad_norm": 0.5324094295501709, |
| "learning_rate": 0.00017899096124273576, |
| "loss": 0.10199148207902908, |
| "mean_token_accuracy": 0.9748995751142502, |
| "num_tokens": 4812257.0, |
| "step": 545 |
| }, |
| { |
| "entropy": 1.2372592389583588, |
| "epoch": 1.9737556561085974, |
| "grad_norm": 0.34164050221443176, |
| "learning_rate": 0.00017890639804717215, |
| "loss": 0.055172618478536606, |
| "mean_token_accuracy": 0.9877361357212067, |
| "num_tokens": 4821231.0, |
| "step": 546 |
| }, |
| { |
| "entropy": 1.2872498035430908, |
| "epoch": 1.9773755656108598, |
| "grad_norm": 0.5182923674583435, |
| "learning_rate": 0.00017882168759106957, |
| "loss": 0.10375868529081345, |
| "mean_token_accuracy": 0.9739436358213425, |
| "num_tokens": 4829899.0, |
| "step": 547 |
| }, |
| { |
| "entropy": 1.1443316638469696, |
| "epoch": 1.9809954751131222, |
| "grad_norm": 0.5763622522354126, |
| "learning_rate": 0.00017873683005546358, |
| "loss": 0.13787207007408142, |
| "mean_token_accuracy": 0.9625149518251419, |
| "num_tokens": 4839358.0, |
| "step": 548 |
| }, |
| { |
| "entropy": 1.1885304749011993, |
| "epoch": 1.9846153846153847, |
| "grad_norm": 0.734394907951355, |
| "learning_rate": 0.00017865182562170403, |
| "loss": 0.14687997102737427, |
| "mean_token_accuracy": 0.9644728451967239, |
| "num_tokens": 4848291.0, |
| "step": 549 |
| }, |
| { |
| "entropy": 1.25540229678154, |
| "epoch": 1.988235294117647, |
| "grad_norm": 0.609951913356781, |
| "learning_rate": 0.00017856667447145475, |
| "loss": 0.10373552143573761, |
| "mean_token_accuracy": 0.9681924432516098, |
| "num_tokens": 4856839.0, |
| "step": 550 |
| }, |
| { |
| "entropy": 1.2945852875709534, |
| "epoch": 1.9918552036199095, |
| "grad_norm": 0.9161359071731567, |
| "learning_rate": 0.00017848137678669307, |
| "loss": 0.1577455848455429, |
| "mean_token_accuracy": 0.963507279753685, |
| "num_tokens": 4865115.0, |
| "step": 551 |
| }, |
| { |
| "entropy": 1.208385318517685, |
| "epoch": 1.995475113122172, |
| "grad_norm": 0.5070799589157104, |
| "learning_rate": 0.00017839593274970953, |
| "loss": 0.14434640109539032, |
| "mean_token_accuracy": 0.9649564027786255, |
| "num_tokens": 4874000.0, |
| "step": 552 |
| }, |
| { |
| "entropy": 1.2376223504543304, |
| "epoch": 1.9990950226244344, |
| "grad_norm": 0.539681077003479, |
| "learning_rate": 0.00017831034254310748, |
| "loss": 0.1051136925816536, |
| "mean_token_accuracy": 0.9742441177368164, |
| "num_tokens": 4882778.0, |
| "step": 553 |
| }, |
| { |
| "entropy": 1.4333502054214478, |
| "epoch": 2.0, |
| "grad_norm": 4.310206890106201, |
| "learning_rate": 0.00017822460634980245, |
| "loss": 0.2112778276205063, |
| "mean_token_accuracy": 0.9484536051750183, |
| "num_tokens": 4883450.0, |
| "step": 554 |
| }, |
| { |
| "epoch": 2.0, |
| "eval_entropy": 1.2949430923151777, |
| "eval_loss": 0.1359202116727829, |
| "eval_mean_token_accuracy": 0.9644398606889616, |
| "eval_num_tokens": 4883450.0, |
| "eval_runtime": 31.7669, |
| "eval_samples_per_second": 11.616, |
| "eval_steps_per_second": 3.872, |
| "step": 554 |
| }, |
| { |
| "entropy": 1.267829716205597, |
| "epoch": 2.0036199095022624, |
| "grad_norm": 0.5096814632415771, |
| "learning_rate": 0.00017813872435302222, |
| "loss": 0.0520174577832222, |
| "mean_token_accuracy": 0.9827230721712112, |
| "num_tokens": 4892530.0, |
| "step": 555 |
| }, |
| { |
| "entropy": 1.336664855480194, |
| "epoch": 2.007239819004525, |
| "grad_norm": 0.648607075214386, |
| "learning_rate": 0.000178052696736306, |
| "loss": 0.12705332040786743, |
| "mean_token_accuracy": 0.9673191756010056, |
| "num_tokens": 4900989.0, |
| "step": 556 |
| }, |
| { |
| "entropy": 1.323121964931488, |
| "epoch": 2.0108597285067873, |
| "grad_norm": 0.5979547500610352, |
| "learning_rate": 0.00017796652368350422, |
| "loss": 0.13934022188186646, |
| "mean_token_accuracy": 0.9531149715185165, |
| "num_tokens": 4909811.0, |
| "step": 557 |
| }, |
| { |
| "entropy": 1.27756667137146, |
| "epoch": 2.0144796380090497, |
| "grad_norm": 0.40110835433006287, |
| "learning_rate": 0.00017788020537877822, |
| "loss": 0.06965979188680649, |
| "mean_token_accuracy": 0.9806021302938461, |
| "num_tokens": 4918656.0, |
| "step": 558 |
| }, |
| { |
| "entropy": 1.2799800336360931, |
| "epoch": 2.018099547511312, |
| "grad_norm": 0.4222742021083832, |
| "learning_rate": 0.00017779374200659967, |
| "loss": 0.06322097778320312, |
| "mean_token_accuracy": 0.9810404479503632, |
| "num_tokens": 4927468.0, |
| "step": 559 |
| }, |
| { |
| "entropy": 1.2439637184143066, |
| "epoch": 2.0217194570135746, |
| "grad_norm": 0.4971936047077179, |
| "learning_rate": 0.00017770713375175027, |
| "loss": 0.09368139505386353, |
| "mean_token_accuracy": 0.9739051014184952, |
| "num_tokens": 4936501.0, |
| "step": 560 |
| }, |
| { |
| "entropy": 1.2581075429916382, |
| "epoch": 2.025339366515837, |
| "grad_norm": 0.5043046474456787, |
| "learning_rate": 0.00017762038079932143, |
| "loss": 0.08156520873308182, |
| "mean_token_accuracy": 0.9788997769355774, |
| "num_tokens": 4945377.0, |
| "step": 561 |
| }, |
| { |
| "entropy": 1.2525224387645721, |
| "epoch": 2.0289592760180994, |
| "grad_norm": 0.47259730100631714, |
| "learning_rate": 0.00017753348333471368, |
| "loss": 0.06477752327919006, |
| "mean_token_accuracy": 0.9779137223958969, |
| "num_tokens": 4954230.0, |
| "step": 562 |
| }, |
| { |
| "entropy": 1.1896491348743439, |
| "epoch": 2.032579185520362, |
| "grad_norm": 0.68027663230896, |
| "learning_rate": 0.00017744644154363642, |
| "loss": 0.13440944254398346, |
| "mean_token_accuracy": 0.961474671959877, |
| "num_tokens": 4963326.0, |
| "step": 563 |
| }, |
| { |
| "entropy": 1.2167057394981384, |
| "epoch": 2.0361990950226243, |
| "grad_norm": 0.541343092918396, |
| "learning_rate": 0.0001773592556121076, |
| "loss": 0.11538562178611755, |
| "mean_token_accuracy": 0.9634369909763336, |
| "num_tokens": 4972140.0, |
| "step": 564 |
| }, |
| { |
| "entropy": 1.2127482295036316, |
| "epoch": 2.0398190045248867, |
| "grad_norm": 0.6436272263526917, |
| "learning_rate": 0.00017727192572645307, |
| "loss": 0.12803298234939575, |
| "mean_token_accuracy": 0.9709215462207794, |
| "num_tokens": 4981091.0, |
| "step": 565 |
| }, |
| { |
| "entropy": 1.2022112607955933, |
| "epoch": 2.043438914027149, |
| "grad_norm": 0.5350651144981384, |
| "learning_rate": 0.0001771844520733064, |
| "loss": 0.06252528727054596, |
| "mean_token_accuracy": 0.9814591854810715, |
| "num_tokens": 4989755.0, |
| "step": 566 |
| }, |
| { |
| "entropy": 1.1322139203548431, |
| "epoch": 2.0470588235294116, |
| "grad_norm": 0.4417763650417328, |
| "learning_rate": 0.00017709683483960837, |
| "loss": 0.057584479451179504, |
| "mean_token_accuracy": 0.9763814359903336, |
| "num_tokens": 4998071.0, |
| "step": 567 |
| }, |
| { |
| "entropy": 1.1674880683422089, |
| "epoch": 2.050678733031674, |
| "grad_norm": 0.8559631109237671, |
| "learning_rate": 0.00017700907421260668, |
| "loss": 0.14195309579372406, |
| "mean_token_accuracy": 0.9630620330572128, |
| "num_tokens": 5006275.0, |
| "step": 568 |
| }, |
| { |
| "entropy": 1.0746576488018036, |
| "epoch": 2.0542986425339365, |
| "grad_norm": 0.5514897108078003, |
| "learning_rate": 0.00017692117037985538, |
| "loss": 0.10994663834571838, |
| "mean_token_accuracy": 0.9645248502492905, |
| "num_tokens": 5015941.0, |
| "step": 569 |
| }, |
| { |
| "entropy": 1.0583149194717407, |
| "epoch": 2.057918552036199, |
| "grad_norm": 0.8742787837982178, |
| "learning_rate": 0.00017683312352921463, |
| "loss": 0.060930777341127396, |
| "mean_token_accuracy": 0.9797597676515579, |
| "num_tokens": 5025002.0, |
| "step": 570 |
| }, |
| { |
| "entropy": 1.0571471750736237, |
| "epoch": 2.0615384615384613, |
| "grad_norm": 0.5724367499351501, |
| "learning_rate": 0.00017674493384885022, |
| "loss": 0.09305945783853531, |
| "mean_token_accuracy": 0.9714621901512146, |
| "num_tokens": 5034508.0, |
| "step": 571 |
| }, |
| { |
| "entropy": 1.0819552540779114, |
| "epoch": 2.065158371040724, |
| "grad_norm": 0.4609355032444, |
| "learning_rate": 0.00017665660152723319, |
| "loss": 0.06915001571178436, |
| "mean_token_accuracy": 0.9803351610898972, |
| "num_tokens": 5043545.0, |
| "step": 572 |
| }, |
| { |
| "entropy": 1.1555139124393463, |
| "epoch": 2.0687782805429866, |
| "grad_norm": 0.6659719347953796, |
| "learning_rate": 0.00017656812675313936, |
| "loss": 0.0947500616312027, |
| "mean_token_accuracy": 0.9700139313936234, |
| "num_tokens": 5052212.0, |
| "step": 573 |
| }, |
| { |
| "entropy": 1.0778053104877472, |
| "epoch": 2.072398190045249, |
| "grad_norm": 0.4302602708339691, |
| "learning_rate": 0.00017647950971564914, |
| "loss": 0.11282960325479507, |
| "mean_token_accuracy": 0.9660980701446533, |
| "num_tokens": 5061672.0, |
| "step": 574 |
| }, |
| { |
| "entropy": 1.111078679561615, |
| "epoch": 2.0760180995475115, |
| "grad_norm": 0.4093020260334015, |
| "learning_rate": 0.00017639075060414675, |
| "loss": 0.07460740208625793, |
| "mean_token_accuracy": 0.9758316427469254, |
| "num_tokens": 5071120.0, |
| "step": 575 |
| }, |
| { |
| "entropy": 1.1735278069972992, |
| "epoch": 2.079638009049774, |
| "grad_norm": 0.726333737373352, |
| "learning_rate": 0.0001763018496083202, |
| "loss": 0.31053513288497925, |
| "mean_token_accuracy": 0.9477875083684921, |
| "num_tokens": 5079991.0, |
| "step": 576 |
| }, |
| { |
| "entropy": 1.214000791311264, |
| "epoch": 2.0832579185520363, |
| "grad_norm": 0.7157718539237976, |
| "learning_rate": 0.00017621280691816076, |
| "loss": 0.14849108457565308, |
| "mean_token_accuracy": 0.952953040599823, |
| "num_tokens": 5088629.0, |
| "step": 577 |
| }, |
| { |
| "entropy": 1.1892287135124207, |
| "epoch": 2.086877828054299, |
| "grad_norm": 0.8626441359519958, |
| "learning_rate": 0.00017612362272396233, |
| "loss": 0.11797386407852173, |
| "mean_token_accuracy": 0.9675682783126831, |
| "num_tokens": 5097362.0, |
| "step": 578 |
| }, |
| { |
| "entropy": 1.2235265970230103, |
| "epoch": 2.090497737556561, |
| "grad_norm": 0.6362661123275757, |
| "learning_rate": 0.00017603429721632134, |
| "loss": 0.09463687241077423, |
| "mean_token_accuracy": 0.9706145972013474, |
| "num_tokens": 5105848.0, |
| "step": 579 |
| }, |
| { |
| "entropy": 1.1600496768951416, |
| "epoch": 2.0941176470588236, |
| "grad_norm": 0.3596010208129883, |
| "learning_rate": 0.00017594483058613625, |
| "loss": 0.06019435077905655, |
| "mean_token_accuracy": 0.9838933050632477, |
| "num_tokens": 5115154.0, |
| "step": 580 |
| }, |
| { |
| "entropy": 1.202838271856308, |
| "epoch": 2.097737556561086, |
| "grad_norm": 0.5307016372680664, |
| "learning_rate": 0.00017585522302460707, |
| "loss": 0.14253605902194977, |
| "mean_token_accuracy": 0.9725675880908966, |
| "num_tokens": 5124126.0, |
| "step": 581 |
| }, |
| { |
| "entropy": 1.2034749388694763, |
| "epoch": 2.1013574660633485, |
| "grad_norm": 0.5540820956230164, |
| "learning_rate": 0.00017576547472323501, |
| "loss": 0.07838793098926544, |
| "mean_token_accuracy": 0.9791185557842255, |
| "num_tokens": 5132742.0, |
| "step": 582 |
| }, |
| { |
| "entropy": 1.1183785498142242, |
| "epoch": 2.104977375565611, |
| "grad_norm": 0.613678514957428, |
| "learning_rate": 0.00017567558587382198, |
| "loss": 0.12506228685379028, |
| "mean_token_accuracy": 0.9711886942386627, |
| "num_tokens": 5142111.0, |
| "step": 583 |
| }, |
| { |
| "entropy": 1.1404469907283783, |
| "epoch": 2.1085972850678734, |
| "grad_norm": 0.5098339915275574, |
| "learning_rate": 0.00017558555666847037, |
| "loss": 0.0471160002052784, |
| "mean_token_accuracy": 0.9831248819828033, |
| "num_tokens": 5151127.0, |
| "step": 584 |
| }, |
| { |
| "entropy": 1.1900439262390137, |
| "epoch": 2.112217194570136, |
| "grad_norm": 0.5168299674987793, |
| "learning_rate": 0.00017549538729958247, |
| "loss": 0.1095990240573883, |
| "mean_token_accuracy": 0.9677491337060928, |
| "num_tokens": 5159888.0, |
| "step": 585 |
| }, |
| { |
| "entropy": 1.1944100558757782, |
| "epoch": 2.1158371040723982, |
| "grad_norm": 0.7810997366905212, |
| "learning_rate": 0.00017540507795986014, |
| "loss": 0.19430337846279144, |
| "mean_token_accuracy": 0.9515476375818253, |
| "num_tokens": 5168845.0, |
| "step": 586 |
| }, |
| { |
| "entropy": 1.1583697199821472, |
| "epoch": 2.1194570135746607, |
| "grad_norm": 0.45587292313575745, |
| "learning_rate": 0.0001753146288423043, |
| "loss": 0.05915957689285278, |
| "mean_token_accuracy": 0.9794735610485077, |
| "num_tokens": 5177533.0, |
| "step": 587 |
| }, |
| { |
| "entropy": 1.1053299605846405, |
| "epoch": 2.123076923076923, |
| "grad_norm": 0.5066711902618408, |
| "learning_rate": 0.00017522404014021472, |
| "loss": 0.07206124812364578, |
| "mean_token_accuracy": 0.9787262827157974, |
| "num_tokens": 5186820.0, |
| "step": 588 |
| }, |
| { |
| "entropy": 1.211129367351532, |
| "epoch": 2.1266968325791855, |
| "grad_norm": 0.6494729518890381, |
| "learning_rate": 0.00017513331204718934, |
| "loss": 0.12303437292575836, |
| "mean_token_accuracy": 0.9687947034835815, |
| "num_tokens": 5195554.0, |
| "step": 589 |
| }, |
| { |
| "entropy": 1.217880368232727, |
| "epoch": 2.130316742081448, |
| "grad_norm": 0.6168936491012573, |
| "learning_rate": 0.0001750424447571241, |
| "loss": 0.08860359340906143, |
| "mean_token_accuracy": 0.9771160632371902, |
| "num_tokens": 5204399.0, |
| "step": 590 |
| }, |
| { |
| "entropy": 1.152166485786438, |
| "epoch": 2.1339366515837104, |
| "grad_norm": 0.4032968580722809, |
| "learning_rate": 0.00017495143846421235, |
| "loss": 0.07568950951099396, |
| "mean_token_accuracy": 0.9729534089565277, |
| "num_tokens": 5213389.0, |
| "step": 591 |
| }, |
| { |
| "entropy": 1.15593022108078, |
| "epoch": 2.137556561085973, |
| "grad_norm": 0.4182475805282593, |
| "learning_rate": 0.00017486029336294455, |
| "loss": 0.04834774509072304, |
| "mean_token_accuracy": 0.9854657202959061, |
| "num_tokens": 5222138.0, |
| "step": 592 |
| }, |
| { |
| "entropy": 1.1080695390701294, |
| "epoch": 2.1411764705882352, |
| "grad_norm": 0.5492544174194336, |
| "learning_rate": 0.00017476900964810777, |
| "loss": 0.06573710590600967, |
| "mean_token_accuracy": 0.9790465831756592, |
| "num_tokens": 5231123.0, |
| "step": 593 |
| }, |
| { |
| "entropy": 1.0967484414577484, |
| "epoch": 2.1447963800904977, |
| "grad_norm": 0.5121078491210938, |
| "learning_rate": 0.00017467758751478537, |
| "loss": 0.061242155730724335, |
| "mean_token_accuracy": 0.9807155132293701, |
| "num_tokens": 5239859.0, |
| "step": 594 |
| }, |
| { |
| "entropy": 1.1613673865795135, |
| "epoch": 2.14841628959276, |
| "grad_norm": 0.5526803731918335, |
| "learning_rate": 0.00017458602715835644, |
| "loss": 0.06058318912982941, |
| "mean_token_accuracy": 0.978880986571312, |
| "num_tokens": 5248469.0, |
| "step": 595 |
| }, |
| { |
| "entropy": 1.141857922077179, |
| "epoch": 2.1520361990950225, |
| "grad_norm": 0.4709974229335785, |
| "learning_rate": 0.00017449432877449553, |
| "loss": 0.06889600306749344, |
| "mean_token_accuracy": 0.9787448197603226, |
| "num_tokens": 5257459.0, |
| "step": 596 |
| }, |
| { |
| "entropy": 1.1636947095394135, |
| "epoch": 2.155656108597285, |
| "grad_norm": 0.5113511085510254, |
| "learning_rate": 0.00017440249255917218, |
| "loss": 0.060567114502191544, |
| "mean_token_accuracy": 0.9818770885467529, |
| "num_tokens": 5266237.0, |
| "step": 597 |
| }, |
| { |
| "entropy": 1.1132238507270813, |
| "epoch": 2.1592760180995474, |
| "grad_norm": 0.6330764889717102, |
| "learning_rate": 0.00017431051870865044, |
| "loss": 0.07782386243343353, |
| "mean_token_accuracy": 0.9758347868919373, |
| "num_tokens": 5275498.0, |
| "step": 598 |
| }, |
| { |
| "entropy": 1.1733905673027039, |
| "epoch": 2.16289592760181, |
| "grad_norm": 0.6248951554298401, |
| "learning_rate": 0.00017421840741948852, |
| "loss": 0.10059335827827454, |
| "mean_token_accuracy": 0.9720931500196457, |
| "num_tokens": 5284031.0, |
| "step": 599 |
| }, |
| { |
| "entropy": 1.1515400111675262, |
| "epoch": 2.1665158371040723, |
| "grad_norm": 0.6329942941665649, |
| "learning_rate": 0.00017412615888853837, |
| "loss": 0.08151932805776596, |
| "mean_token_accuracy": 0.9802259802818298, |
| "num_tokens": 5292855.0, |
| "step": 600 |
| }, |
| { |
| "entropy": 1.141521841287613, |
| "epoch": 2.1701357466063347, |
| "grad_norm": 0.6487274765968323, |
| "learning_rate": 0.0001740337733129453, |
| "loss": 0.08144231140613556, |
| "mean_token_accuracy": 0.9778266698122025, |
| "num_tokens": 5301800.0, |
| "step": 601 |
| }, |
| { |
| "entropy": 1.1145538836717606, |
| "epoch": 2.173755656108597, |
| "grad_norm": 0.5885897278785706, |
| "learning_rate": 0.0001739412508901473, |
| "loss": 0.08007162064313889, |
| "mean_token_accuracy": 0.9757736772298813, |
| "num_tokens": 5310489.0, |
| "step": 602 |
| }, |
| { |
| "entropy": 1.0910236835479736, |
| "epoch": 2.1773755656108595, |
| "grad_norm": 0.6552841067314148, |
| "learning_rate": 0.00017384859181787503, |
| "loss": 0.08375010639429092, |
| "mean_token_accuracy": 0.9668237864971161, |
| "num_tokens": 5319658.0, |
| "step": 603 |
| }, |
| { |
| "entropy": 1.1157051026821136, |
| "epoch": 2.180995475113122, |
| "grad_norm": 0.510016679763794, |
| "learning_rate": 0.00017375579629415105, |
| "loss": 0.05919726938009262, |
| "mean_token_accuracy": 0.9795113205909729, |
| "num_tokens": 5328730.0, |
| "step": 604 |
| }, |
| { |
| "entropy": 1.1671057641506195, |
| "epoch": 2.184615384615385, |
| "grad_norm": 0.688136100769043, |
| "learning_rate": 0.00017366286451728967, |
| "loss": 0.12569661438465118, |
| "mean_token_accuracy": 0.9674014300107956, |
| "num_tokens": 5337128.0, |
| "step": 605 |
| }, |
| { |
| "entropy": 1.1464425325393677, |
| "epoch": 2.1882352941176473, |
| "grad_norm": 0.61946040391922, |
| "learning_rate": 0.00017356979668589625, |
| "loss": 0.08147244155406952, |
| "mean_token_accuracy": 0.9739107489585876, |
| "num_tokens": 5345906.0, |
| "step": 606 |
| }, |
| { |
| "entropy": 1.1685318052768707, |
| "epoch": 2.1918552036199097, |
| "grad_norm": 0.6955191493034363, |
| "learning_rate": 0.00017347659299886693, |
| "loss": 0.15282778441905975, |
| "mean_token_accuracy": 0.9649801999330521, |
| "num_tokens": 5354505.0, |
| "step": 607 |
| }, |
| { |
| "entropy": 1.1361754834651947, |
| "epoch": 2.195475113122172, |
| "grad_norm": 0.5646288990974426, |
| "learning_rate": 0.00017338325365538827, |
| "loss": 0.06721736490726471, |
| "mean_token_accuracy": 0.9789784252643585, |
| "num_tokens": 5363098.0, |
| "step": 608 |
| }, |
| { |
| "entropy": 1.1541111469268799, |
| "epoch": 2.1990950226244346, |
| "grad_norm": 0.4780377745628357, |
| "learning_rate": 0.0001732897788549367, |
| "loss": 0.06245143711566925, |
| "mean_token_accuracy": 0.9757596999406815, |
| "num_tokens": 5371853.0, |
| "step": 609 |
| }, |
| { |
| "entropy": 1.113056868314743, |
| "epoch": 2.202714932126697, |
| "grad_norm": 0.5598439574241638, |
| "learning_rate": 0.0001731961687972781, |
| "loss": 0.09727580845355988, |
| "mean_token_accuracy": 0.9673656225204468, |
| "num_tokens": 5380346.0, |
| "step": 610 |
| }, |
| { |
| "entropy": 1.1484054028987885, |
| "epoch": 2.2063348416289594, |
| "grad_norm": 0.6644022464752197, |
| "learning_rate": 0.00017310242368246746, |
| "loss": 0.1005115807056427, |
| "mean_token_accuracy": 0.9728731662034988, |
| "num_tokens": 5389310.0, |
| "step": 611 |
| }, |
| { |
| "entropy": 1.0054269880056381, |
| "epoch": 2.209954751131222, |
| "grad_norm": 0.954916775226593, |
| "learning_rate": 0.0001730085437108484, |
| "loss": 0.07220233231782913, |
| "mean_token_accuracy": 0.9803072810173035, |
| "num_tokens": 5398815.0, |
| "step": 612 |
| }, |
| { |
| "entropy": 1.0945743322372437, |
| "epoch": 2.2135746606334843, |
| "grad_norm": 0.4745389223098755, |
| "learning_rate": 0.00017291452908305268, |
| "loss": 0.05182403326034546, |
| "mean_token_accuracy": 0.9830630421638489, |
| "num_tokens": 5407661.0, |
| "step": 613 |
| }, |
| { |
| "entropy": 1.1867251992225647, |
| "epoch": 2.2171945701357467, |
| "grad_norm": 0.7837973237037659, |
| "learning_rate": 0.00017282037999999996, |
| "loss": 0.14014287292957306, |
| "mean_token_accuracy": 0.9501726627349854, |
| "num_tokens": 5415977.0, |
| "step": 614 |
| }, |
| { |
| "entropy": 1.0867418944835663, |
| "epoch": 2.220814479638009, |
| "grad_norm": 0.4593866765499115, |
| "learning_rate": 0.0001727260966628971, |
| "loss": 0.08343060314655304, |
| "mean_token_accuracy": 0.977153941988945, |
| "num_tokens": 5425295.0, |
| "step": 615 |
| }, |
| { |
| "entropy": 1.094106286764145, |
| "epoch": 2.2244343891402716, |
| "grad_norm": 0.31321874260902405, |
| "learning_rate": 0.00017263167927323794, |
| "loss": 0.02154790610074997, |
| "mean_token_accuracy": 0.9943936318159103, |
| "num_tokens": 5433892.0, |
| "step": 616 |
| }, |
| { |
| "entropy": 1.1765957176685333, |
| "epoch": 2.228054298642534, |
| "grad_norm": 0.5103018879890442, |
| "learning_rate": 0.00017253712803280284, |
| "loss": 0.07026584446430206, |
| "mean_token_accuracy": 0.9814851880073547, |
| "num_tokens": 5442650.0, |
| "step": 617 |
| }, |
| { |
| "entropy": 1.15475395321846, |
| "epoch": 2.2316742081447964, |
| "grad_norm": 0.46093472838401794, |
| "learning_rate": 0.00017244244314365822, |
| "loss": 0.052377212792634964, |
| "mean_token_accuracy": 0.9835858196020126, |
| "num_tokens": 5451484.0, |
| "step": 618 |
| }, |
| { |
| "entropy": 1.084006518125534, |
| "epoch": 2.235294117647059, |
| "grad_norm": 0.43632498383522034, |
| "learning_rate": 0.000172347624808156, |
| "loss": 0.04849786311388016, |
| "mean_token_accuracy": 0.9841872155666351, |
| "num_tokens": 5460401.0, |
| "step": 619 |
| }, |
| { |
| "entropy": 1.1410707533359528, |
| "epoch": 2.2389140271493213, |
| "grad_norm": 0.6720604300498962, |
| "learning_rate": 0.00017225267322893345, |
| "loss": 0.13110417127609253, |
| "mean_token_accuracy": 0.9632755517959595, |
| "num_tokens": 5469141.0, |
| "step": 620 |
| }, |
| { |
| "entropy": 1.132197231054306, |
| "epoch": 2.2425339366515837, |
| "grad_norm": 0.6811819672584534, |
| "learning_rate": 0.00017215758860891246, |
| "loss": 0.05728556215763092, |
| "mean_token_accuracy": 0.9824267625808716, |
| "num_tokens": 5477947.0, |
| "step": 621 |
| }, |
| { |
| "entropy": 1.0879136621952057, |
| "epoch": 2.246153846153846, |
| "grad_norm": 0.8642657399177551, |
| "learning_rate": 0.00017206237115129937, |
| "loss": 0.16126996278762817, |
| "mean_token_accuracy": 0.9610114395618439, |
| "num_tokens": 5486895.0, |
| "step": 622 |
| }, |
| { |
| "entropy": 1.074364259839058, |
| "epoch": 2.2497737556561086, |
| "grad_norm": 0.5326434373855591, |
| "learning_rate": 0.00017196702105958428, |
| "loss": 0.04988791421055794, |
| "mean_token_accuracy": 0.9847123473882675, |
| "num_tokens": 5495604.0, |
| "step": 623 |
| }, |
| { |
| "entropy": 1.108335942029953, |
| "epoch": 2.253393665158371, |
| "grad_norm": 0.4806241989135742, |
| "learning_rate": 0.00017187153853754082, |
| "loss": 0.08469416946172714, |
| "mean_token_accuracy": 0.9805634319782257, |
| "num_tokens": 5504355.0, |
| "step": 624 |
| }, |
| { |
| "entropy": 1.0444909036159515, |
| "epoch": 2.2570135746606335, |
| "grad_norm": 0.48259949684143066, |
| "learning_rate": 0.00017177592378922566, |
| "loss": 0.08314619958400726, |
| "mean_token_accuracy": 0.9782232642173767, |
| "num_tokens": 5513744.0, |
| "step": 625 |
| }, |
| { |
| "entropy": 1.1189175844192505, |
| "epoch": 2.260633484162896, |
| "grad_norm": 0.5330878496170044, |
| "learning_rate": 0.00017168017701897802, |
| "loss": 0.11519207060337067, |
| "mean_token_accuracy": 0.9642496109008789, |
| "num_tokens": 5522549.0, |
| "step": 626 |
| }, |
| { |
| "entropy": 1.0698913782835007, |
| "epoch": 2.2642533936651583, |
| "grad_norm": 0.6421942710876465, |
| "learning_rate": 0.0001715842984314192, |
| "loss": 0.08139073848724365, |
| "mean_token_accuracy": 0.9752504527568817, |
| "num_tokens": 5531666.0, |
| "step": 627 |
| }, |
| { |
| "entropy": 1.0454991310834885, |
| "epoch": 2.2678733031674208, |
| "grad_norm": 0.5831551551818848, |
| "learning_rate": 0.0001714882882314523, |
| "loss": 0.10700318962335587, |
| "mean_token_accuracy": 0.9666198194026947, |
| "num_tokens": 5541331.0, |
| "step": 628 |
| }, |
| { |
| "entropy": 1.1714680790901184, |
| "epoch": 2.271493212669683, |
| "grad_norm": 0.5227723717689514, |
| "learning_rate": 0.00017139214662426167, |
| "loss": 0.08072106540203094, |
| "mean_token_accuracy": 0.9822585135698318, |
| "num_tokens": 5549828.0, |
| "step": 629 |
| }, |
| { |
| "entropy": 1.094650149345398, |
| "epoch": 2.2751131221719456, |
| "grad_norm": 0.5172683596611023, |
| "learning_rate": 0.00017129587381531247, |
| "loss": 0.05595247447490692, |
| "mean_token_accuracy": 0.977413684129715, |
| "num_tokens": 5558456.0, |
| "step": 630 |
| }, |
| { |
| "entropy": 1.0995945632457733, |
| "epoch": 2.278733031674208, |
| "grad_norm": 0.3441588878631592, |
| "learning_rate": 0.00017119947001035027, |
| "loss": 0.040461085736751556, |
| "mean_token_accuracy": 0.9847785383462906, |
| "num_tokens": 5567051.0, |
| "step": 631 |
| }, |
| { |
| "entropy": 1.0973523557186127, |
| "epoch": 2.2823529411764705, |
| "grad_norm": 0.49515512585639954, |
| "learning_rate": 0.0001711029354154006, |
| "loss": 0.08333179354667664, |
| "mean_token_accuracy": 0.9789102673530579, |
| "num_tokens": 5576195.0, |
| "step": 632 |
| }, |
| { |
| "entropy": 1.0842012166976929, |
| "epoch": 2.285972850678733, |
| "grad_norm": 0.36962664127349854, |
| "learning_rate": 0.00017100627023676848, |
| "loss": 0.04567601531744003, |
| "mean_token_accuracy": 0.9858750253915787, |
| "num_tokens": 5585076.0, |
| "step": 633 |
| }, |
| { |
| "entropy": 1.0841620564460754, |
| "epoch": 2.2895927601809953, |
| "grad_norm": 0.4125482738018036, |
| "learning_rate": 0.000170909474681038, |
| "loss": 0.04856008291244507, |
| "mean_token_accuracy": 0.9842440634965897, |
| "num_tokens": 5594132.0, |
| "step": 634 |
| }, |
| { |
| "entropy": 1.108697071671486, |
| "epoch": 2.2932126696832578, |
| "grad_norm": 0.4849383533000946, |
| "learning_rate": 0.0001708125489550719, |
| "loss": 0.05033132806420326, |
| "mean_token_accuracy": 0.9845256805419922, |
| "num_tokens": 5602901.0, |
| "step": 635 |
| }, |
| { |
| "entropy": 1.1090115308761597, |
| "epoch": 2.29683257918552, |
| "grad_norm": 0.5984789729118347, |
| "learning_rate": 0.00017071549326601107, |
| "loss": 0.09391042590141296, |
| "mean_token_accuracy": 0.9781784564256668, |
| "num_tokens": 5611778.0, |
| "step": 636 |
| }, |
| { |
| "entropy": 1.1380729377269745, |
| "epoch": 2.3004524886877826, |
| "grad_norm": 0.688556969165802, |
| "learning_rate": 0.0001706183078212742, |
| "loss": 0.07684573531150818, |
| "mean_token_accuracy": 0.9796445071697235, |
| "num_tokens": 5620319.0, |
| "step": 637 |
| }, |
| { |
| "entropy": 1.0685915052890778, |
| "epoch": 2.304072398190045, |
| "grad_norm": 0.43945908546447754, |
| "learning_rate": 0.00017052099282855728, |
| "loss": 0.06292518973350525, |
| "mean_token_accuracy": 0.9775279760360718, |
| "num_tokens": 5629494.0, |
| "step": 638 |
| }, |
| { |
| "entropy": 1.0556871443986893, |
| "epoch": 2.3076923076923075, |
| "grad_norm": 0.6430138349533081, |
| "learning_rate": 0.00017042354849583312, |
| "loss": 0.10716582834720612, |
| "mean_token_accuracy": 0.9718299359083176, |
| "num_tokens": 5638360.0, |
| "step": 639 |
| }, |
| { |
| "entropy": 1.0811335742473602, |
| "epoch": 2.31131221719457, |
| "grad_norm": 0.5156210660934448, |
| "learning_rate": 0.00017032597503135097, |
| "loss": 0.06649160385131836, |
| "mean_token_accuracy": 0.9791721701622009, |
| "num_tokens": 5647294.0, |
| "step": 640 |
| }, |
| { |
| "entropy": 1.0688489079475403, |
| "epoch": 2.3149321266968323, |
| "grad_norm": 0.6636553406715393, |
| "learning_rate": 0.000170228272643636, |
| "loss": 0.1042657196521759, |
| "mean_token_accuracy": 0.9654766619205475, |
| "num_tokens": 5656288.0, |
| "step": 641 |
| }, |
| { |
| "entropy": 1.0857466459274292, |
| "epoch": 2.318552036199095, |
| "grad_norm": 0.7920498847961426, |
| "learning_rate": 0.00017013044154148894, |
| "loss": 0.12234874814748764, |
| "mean_token_accuracy": 0.9632948487997055, |
| "num_tokens": 5665003.0, |
| "step": 642 |
| }, |
| { |
| "entropy": 1.0862885862588882, |
| "epoch": 2.3221719457013577, |
| "grad_norm": 0.6242631673812866, |
| "learning_rate": 0.00017003248193398564, |
| "loss": 0.0991068035364151, |
| "mean_token_accuracy": 0.9662000685930252, |
| "num_tokens": 5674000.0, |
| "step": 643 |
| }, |
| { |
| "entropy": 1.1462857723236084, |
| "epoch": 2.32579185520362, |
| "grad_norm": 0.596567690372467, |
| "learning_rate": 0.00016993439403047652, |
| "loss": 0.09591332077980042, |
| "mean_token_accuracy": 0.9745471328496933, |
| "num_tokens": 5682989.0, |
| "step": 644 |
| }, |
| { |
| "entropy": 1.1377845704555511, |
| "epoch": 2.3294117647058825, |
| "grad_norm": 0.5160103440284729, |
| "learning_rate": 0.0001698361780405862, |
| "loss": 0.08644409477710724, |
| "mean_token_accuracy": 0.9708640724420547, |
| "num_tokens": 5691812.0, |
| "step": 645 |
| }, |
| { |
| "entropy": 1.1150462329387665, |
| "epoch": 2.333031674208145, |
| "grad_norm": 0.45699501037597656, |
| "learning_rate": 0.00016973783417421304, |
| "loss": 0.05352473631501198, |
| "mean_token_accuracy": 0.984683021903038, |
| "num_tokens": 5700825.0, |
| "step": 646 |
| }, |
| { |
| "entropy": 1.2092556655406952, |
| "epoch": 2.3366515837104074, |
| "grad_norm": 0.8381008505821228, |
| "learning_rate": 0.00016963936264152867, |
| "loss": 0.08669580519199371, |
| "mean_token_accuracy": 0.980311319231987, |
| "num_tokens": 5709406.0, |
| "step": 647 |
| }, |
| { |
| "entropy": 1.1319242715835571, |
| "epoch": 2.34027149321267, |
| "grad_norm": 0.53270024061203, |
| "learning_rate": 0.00016954076365297758, |
| "loss": 0.05547341704368591, |
| "mean_token_accuracy": 0.9790041297674179, |
| "num_tokens": 5718379.0, |
| "step": 648 |
| }, |
| { |
| "entropy": 1.1626895368099213, |
| "epoch": 2.3438914027149322, |
| "grad_norm": 0.5577008128166199, |
| "learning_rate": 0.00016944203741927662, |
| "loss": 0.08301683515310287, |
| "mean_token_accuracy": 0.9779217094182968, |
| "num_tokens": 5726870.0, |
| "step": 649 |
| }, |
| { |
| "entropy": 1.1811838150024414, |
| "epoch": 2.3475113122171947, |
| "grad_norm": 1.1514310836791992, |
| "learning_rate": 0.00016934318415141457, |
| "loss": 0.22124511003494263, |
| "mean_token_accuracy": 0.9386986643075943, |
| "num_tokens": 5735262.0, |
| "step": 650 |
| }, |
| { |
| "entropy": 1.1047629415988922, |
| "epoch": 2.351131221719457, |
| "grad_norm": 0.42921116948127747, |
| "learning_rate": 0.00016924420406065177, |
| "loss": 0.07803885638713837, |
| "mean_token_accuracy": 0.977928563952446, |
| "num_tokens": 5744320.0, |
| "step": 651 |
| }, |
| { |
| "entropy": 1.187497317790985, |
| "epoch": 2.3547511312217195, |
| "grad_norm": 0.458602637052536, |
| "learning_rate": 0.00016914509735851954, |
| "loss": 0.05676557496190071, |
| "mean_token_accuracy": 0.983381986618042, |
| "num_tokens": 5752768.0, |
| "step": 652 |
| }, |
| { |
| "entropy": 1.1605049073696136, |
| "epoch": 2.358371040723982, |
| "grad_norm": 0.700816810131073, |
| "learning_rate": 0.00016904586425681975, |
| "loss": 0.1257372498512268, |
| "mean_token_accuracy": 0.9657402634620667, |
| "num_tokens": 5761355.0, |
| "step": 653 |
| }, |
| { |
| "entropy": 1.1560527682304382, |
| "epoch": 2.3619909502262444, |
| "grad_norm": 0.8418037295341492, |
| "learning_rate": 0.00016894650496762444, |
| "loss": 0.18459515273571014, |
| "mean_token_accuracy": 0.9515699297189713, |
| "num_tokens": 5770257.0, |
| "step": 654 |
| }, |
| { |
| "entropy": 1.1892625987529755, |
| "epoch": 2.365610859728507, |
| "grad_norm": 0.478832483291626, |
| "learning_rate": 0.00016884701970327538, |
| "loss": 0.052141569554805756, |
| "mean_token_accuracy": 0.9864660948514938, |
| "num_tokens": 5778834.0, |
| "step": 655 |
| }, |
| { |
| "entropy": 1.1170645952224731, |
| "epoch": 2.3692307692307693, |
| "grad_norm": 0.5372560024261475, |
| "learning_rate": 0.00016874740867638339, |
| "loss": 0.10951077938079834, |
| "mean_token_accuracy": 0.9724908918142319, |
| "num_tokens": 5788066.0, |
| "step": 656 |
| }, |
| { |
| "entropy": 1.1744292080402374, |
| "epoch": 2.3728506787330317, |
| "grad_norm": 0.6120523810386658, |
| "learning_rate": 0.00016864767209982825, |
| "loss": 0.09085190296173096, |
| "mean_token_accuracy": 0.9735588431358337, |
| "num_tokens": 5796324.0, |
| "step": 657 |
| }, |
| { |
| "entropy": 1.1730450987815857, |
| "epoch": 2.376470588235294, |
| "grad_norm": 0.5754011273384094, |
| "learning_rate": 0.00016854781018675797, |
| "loss": 0.08808492124080658, |
| "mean_token_accuracy": 0.9751751571893692, |
| "num_tokens": 5805217.0, |
| "step": 658 |
| }, |
| { |
| "entropy": 1.194241851568222, |
| "epoch": 2.3800904977375565, |
| "grad_norm": 0.4330058991909027, |
| "learning_rate": 0.00016844782315058847, |
| "loss": 0.0617540068924427, |
| "mean_token_accuracy": 0.982567548751831, |
| "num_tokens": 5813640.0, |
| "step": 659 |
| }, |
| { |
| "entropy": 1.1041472554206848, |
| "epoch": 2.383710407239819, |
| "grad_norm": 0.44740787148475647, |
| "learning_rate": 0.0001683477112050029, |
| "loss": 0.04697256535291672, |
| "mean_token_accuracy": 0.9860917925834656, |
| "num_tokens": 5822218.0, |
| "step": 660 |
| }, |
| { |
| "entropy": 1.1061788201332092, |
| "epoch": 2.3873303167420814, |
| "grad_norm": 0.7774989604949951, |
| "learning_rate": 0.0001682474745639516, |
| "loss": 0.09556365013122559, |
| "mean_token_accuracy": 0.975138321518898, |
| "num_tokens": 5831553.0, |
| "step": 661 |
| }, |
| { |
| "entropy": 1.1203438639640808, |
| "epoch": 2.390950226244344, |
| "grad_norm": 0.6026033759117126, |
| "learning_rate": 0.0001681471134416512, |
| "loss": 0.09611985087394714, |
| "mean_token_accuracy": 0.9710993468761444, |
| "num_tokens": 5840305.0, |
| "step": 662 |
| }, |
| { |
| "entropy": 1.109214961528778, |
| "epoch": 2.3945701357466063, |
| "grad_norm": 0.4015118479728699, |
| "learning_rate": 0.00016804662805258448, |
| "loss": 0.06578999012708664, |
| "mean_token_accuracy": 0.9815486073493958, |
| "num_tokens": 5848976.0, |
| "step": 663 |
| }, |
| { |
| "entropy": 1.1184172928333282, |
| "epoch": 2.3981900452488687, |
| "grad_norm": 0.7624635100364685, |
| "learning_rate": 0.00016794601861149977, |
| "loss": 0.15326440334320068, |
| "mean_token_accuracy": 0.9595926702022552, |
| "num_tokens": 5857668.0, |
| "step": 664 |
| }, |
| { |
| "entropy": 1.096958041191101, |
| "epoch": 2.401809954751131, |
| "grad_norm": 0.5075679421424866, |
| "learning_rate": 0.00016784528533341045, |
| "loss": 0.08701962232589722, |
| "mean_token_accuracy": 0.974994570016861, |
| "num_tokens": 5866959.0, |
| "step": 665 |
| }, |
| { |
| "entropy": 1.1943987607955933, |
| "epoch": 2.4054298642533936, |
| "grad_norm": 0.6924847960472107, |
| "learning_rate": 0.0001677444284335946, |
| "loss": 0.07678639143705368, |
| "mean_token_accuracy": 0.9794516116380692, |
| "num_tokens": 5875275.0, |
| "step": 666 |
| }, |
| { |
| "entropy": 1.1881066262722015, |
| "epoch": 2.409049773755656, |
| "grad_norm": 0.5888217091560364, |
| "learning_rate": 0.0001676434481275945, |
| "loss": 0.0933694839477539, |
| "mean_token_accuracy": 0.9765624105930328, |
| "num_tokens": 5883715.0, |
| "step": 667 |
| }, |
| { |
| "entropy": 1.0600565671920776, |
| "epoch": 2.4126696832579184, |
| "grad_norm": 0.846140444278717, |
| "learning_rate": 0.00016754234463121613, |
| "loss": 0.14402684569358826, |
| "mean_token_accuracy": 0.9649781733751297, |
| "num_tokens": 5892799.0, |
| "step": 668 |
| }, |
| { |
| "entropy": 1.1473102271556854, |
| "epoch": 2.416289592760181, |
| "grad_norm": 0.6479727029800415, |
| "learning_rate": 0.0001674411181605288, |
| "loss": 0.1251659095287323, |
| "mean_token_accuracy": 0.9626735299825668, |
| "num_tokens": 5901619.0, |
| "step": 669 |
| }, |
| { |
| "entropy": 1.1313286423683167, |
| "epoch": 2.4199095022624433, |
| "grad_norm": 0.418414443731308, |
| "learning_rate": 0.0001673397689318646, |
| "loss": 0.06655631214380264, |
| "mean_token_accuracy": 0.9822564423084259, |
| "num_tokens": 5910486.0, |
| "step": 670 |
| }, |
| { |
| "entropy": 1.1435776054859161, |
| "epoch": 2.4235294117647057, |
| "grad_norm": 0.5648186802864075, |
| "learning_rate": 0.00016723829716181797, |
| "loss": 0.05666976422071457, |
| "mean_token_accuracy": 0.982610896229744, |
| "num_tokens": 5918909.0, |
| "step": 671 |
| }, |
| { |
| "entropy": 1.07819664478302, |
| "epoch": 2.427149321266968, |
| "grad_norm": 0.500468909740448, |
| "learning_rate": 0.00016713670306724512, |
| "loss": 0.0956311747431755, |
| "mean_token_accuracy": 0.9763072431087494, |
| "num_tokens": 5927967.0, |
| "step": 672 |
| }, |
| { |
| "entropy": 1.1843810379505157, |
| "epoch": 2.430769230769231, |
| "grad_norm": 0.7661120891571045, |
| "learning_rate": 0.0001670349868652639, |
| "loss": 0.21462547779083252, |
| "mean_token_accuracy": 0.93952676653862, |
| "num_tokens": 5936451.0, |
| "step": 673 |
| }, |
| { |
| "entropy": 1.0327152162790298, |
| "epoch": 2.4343891402714934, |
| "grad_norm": 0.45229560136795044, |
| "learning_rate": 0.00016693314877325294, |
| "loss": 0.04901588708162308, |
| "mean_token_accuracy": 0.9859603494405746, |
| "num_tokens": 5945612.0, |
| "step": 674 |
| }, |
| { |
| "entropy": 1.0337743610143661, |
| "epoch": 2.438009049773756, |
| "grad_norm": 0.44453921914100647, |
| "learning_rate": 0.00016683118900885147, |
| "loss": 0.04502352327108383, |
| "mean_token_accuracy": 0.9859343767166138, |
| "num_tokens": 5954911.0, |
| "step": 675 |
| }, |
| { |
| "entropy": 1.1681455671787262, |
| "epoch": 2.4416289592760183, |
| "grad_norm": 0.7814058661460876, |
| "learning_rate": 0.00016672910778995866, |
| "loss": 0.15076348185539246, |
| "mean_token_accuracy": 0.9624718725681305, |
| "num_tokens": 5963327.0, |
| "step": 676 |
| }, |
| { |
| "entropy": 1.065615475177765, |
| "epoch": 2.4452488687782807, |
| "grad_norm": 0.6926926374435425, |
| "learning_rate": 0.00016662690533473333, |
| "loss": 0.1291479766368866, |
| "mean_token_accuracy": 0.9634927213191986, |
| "num_tokens": 5972300.0, |
| "step": 677 |
| }, |
| { |
| "entropy": 1.0294676572084427, |
| "epoch": 2.448868778280543, |
| "grad_norm": 0.3955880403518677, |
| "learning_rate": 0.0001665245818615933, |
| "loss": 0.07051679491996765, |
| "mean_token_accuracy": 0.9777989983558655, |
| "num_tokens": 5981742.0, |
| "step": 678 |
| }, |
| { |
| "entropy": 1.0838841944932938, |
| "epoch": 2.4524886877828056, |
| "grad_norm": 0.5345229506492615, |
| "learning_rate": 0.0001664221375892151, |
| "loss": 0.14341828227043152, |
| "mean_token_accuracy": 0.9667878448963165, |
| "num_tokens": 5990635.0, |
| "step": 679 |
| }, |
| { |
| "entropy": 1.1758331060409546, |
| "epoch": 2.456108597285068, |
| "grad_norm": 0.6868032813072205, |
| "learning_rate": 0.0001663195727365334, |
| "loss": 0.08597201108932495, |
| "mean_token_accuracy": 0.9740214198827744, |
| "num_tokens": 5999095.0, |
| "step": 680 |
| }, |
| { |
| "entropy": 1.0949173271656036, |
| "epoch": 2.4597285067873305, |
| "grad_norm": 0.589972198009491, |
| "learning_rate": 0.0001662168875227405, |
| "loss": 0.08794485032558441, |
| "mean_token_accuracy": 0.97372767329216, |
| "num_tokens": 6008205.0, |
| "step": 681 |
| }, |
| { |
| "entropy": 1.1521604359149933, |
| "epoch": 2.463348416289593, |
| "grad_norm": 0.6321046948432922, |
| "learning_rate": 0.00016611408216728603, |
| "loss": 0.11494572460651398, |
| "mean_token_accuracy": 0.9587929248809814, |
| "num_tokens": 6016678.0, |
| "step": 682 |
| }, |
| { |
| "entropy": 1.1596512496471405, |
| "epoch": 2.4669683257918553, |
| "grad_norm": 0.6953479647636414, |
| "learning_rate": 0.0001660111568898763, |
| "loss": 0.07764746993780136, |
| "mean_token_accuracy": 0.9718074202537537, |
| "num_tokens": 6025013.0, |
| "step": 683 |
| }, |
| { |
| "entropy": 1.0262929499149323, |
| "epoch": 2.4705882352941178, |
| "grad_norm": 0.5953262448310852, |
| "learning_rate": 0.00016590811191047393, |
| "loss": 0.07723493129014969, |
| "mean_token_accuracy": 0.9786953032016754, |
| "num_tokens": 6034590.0, |
| "step": 684 |
| }, |
| { |
| "entropy": 1.0716820657253265, |
| "epoch": 2.47420814479638, |
| "grad_norm": 0.516004204750061, |
| "learning_rate": 0.00016580494744929735, |
| "loss": 0.07066604495048523, |
| "mean_token_accuracy": 0.9768574684858322, |
| "num_tokens": 6043734.0, |
| "step": 685 |
| }, |
| { |
| "entropy": 1.1649815142154694, |
| "epoch": 2.4778280542986426, |
| "grad_norm": 0.5916234254837036, |
| "learning_rate": 0.00016570166372682034, |
| "loss": 0.07808145135641098, |
| "mean_token_accuracy": 0.9774973541498184, |
| "num_tokens": 6052374.0, |
| "step": 686 |
| }, |
| { |
| "entropy": 1.100892648100853, |
| "epoch": 2.481447963800905, |
| "grad_norm": 0.5512471795082092, |
| "learning_rate": 0.0001655982609637716, |
| "loss": 0.15753786265850067, |
| "mean_token_accuracy": 0.9573144018650055, |
| "num_tokens": 6061366.0, |
| "step": 687 |
| }, |
| { |
| "entropy": 1.0837741196155548, |
| "epoch": 2.4850678733031675, |
| "grad_norm": 0.47572416067123413, |
| "learning_rate": 0.00016549473938113414, |
| "loss": 0.07854226976633072, |
| "mean_token_accuracy": 0.9767839014530182, |
| "num_tokens": 6070370.0, |
| "step": 688 |
| }, |
| { |
| "entropy": 1.1012869477272034, |
| "epoch": 2.48868778280543, |
| "grad_norm": 0.545578122138977, |
| "learning_rate": 0.00016539109920014498, |
| "loss": 0.10669726133346558, |
| "mean_token_accuracy": 0.9735602736473083, |
| "num_tokens": 6079236.0, |
| "step": 689 |
| }, |
| { |
| "entropy": 1.1038524806499481, |
| "epoch": 2.4923076923076923, |
| "grad_norm": 0.6199021339416504, |
| "learning_rate": 0.0001652873406422945, |
| "loss": 0.09693003445863724, |
| "mean_token_accuracy": 0.9718763530254364, |
| "num_tokens": 6087610.0, |
| "step": 690 |
| }, |
| { |
| "entropy": 1.0678670406341553, |
| "epoch": 2.4959276018099548, |
| "grad_norm": 0.5688477754592896, |
| "learning_rate": 0.00016518346392932625, |
| "loss": 0.0637565553188324, |
| "mean_token_accuracy": 0.9771751910448074, |
| "num_tokens": 6096248.0, |
| "step": 691 |
| }, |
| { |
| "entropy": 1.0639970898628235, |
| "epoch": 2.499547511312217, |
| "grad_norm": 0.6916640400886536, |
| "learning_rate": 0.00016507946928323607, |
| "loss": 0.13879074156284332, |
| "mean_token_accuracy": 0.9657151252031326, |
| "num_tokens": 6105076.0, |
| "step": 692 |
| }, |
| { |
| "entropy": 0.9907592087984085, |
| "epoch": 2.5031674208144796, |
| "grad_norm": 0.6389366984367371, |
| "learning_rate": 0.000164975356926272, |
| "loss": 0.1363474577665329, |
| "mean_token_accuracy": 0.9586378186941147, |
| "num_tokens": 6114625.0, |
| "step": 693 |
| }, |
| { |
| "entropy": 1.131547451019287, |
| "epoch": 2.506787330316742, |
| "grad_norm": 1.4108179807662964, |
| "learning_rate": 0.0001648711270809335, |
| "loss": 0.09669815748929977, |
| "mean_token_accuracy": 0.9737755954265594, |
| "num_tokens": 6123053.0, |
| "step": 694 |
| }, |
| { |
| "entropy": 1.0598485469818115, |
| "epoch": 2.5104072398190045, |
| "grad_norm": 0.4932820796966553, |
| "learning_rate": 0.0001647667799699713, |
| "loss": 0.05943776294589043, |
| "mean_token_accuracy": 0.9825068265199661, |
| "num_tokens": 6131892.0, |
| "step": 695 |
| }, |
| { |
| "entropy": 1.1157889068126678, |
| "epoch": 2.514027149321267, |
| "grad_norm": 0.513300895690918, |
| "learning_rate": 0.00016466231581638654, |
| "loss": 0.0392400287091732, |
| "mean_token_accuracy": 0.9910160303115845, |
| "num_tokens": 6140337.0, |
| "step": 696 |
| }, |
| { |
| "entropy": 1.0774150341749191, |
| "epoch": 2.5176470588235293, |
| "grad_norm": 0.48255565762519836, |
| "learning_rate": 0.00016455773484343062, |
| "loss": 0.06849890947341919, |
| "mean_token_accuracy": 0.9777995198965073, |
| "num_tokens": 6149473.0, |
| "step": 697 |
| }, |
| { |
| "entropy": 1.0931665301322937, |
| "epoch": 2.521266968325792, |
| "grad_norm": 0.4410548806190491, |
| "learning_rate": 0.0001644530372746046, |
| "loss": 0.04295572638511658, |
| "mean_token_accuracy": 0.9881383031606674, |
| "num_tokens": 6158108.0, |
| "step": 698 |
| }, |
| { |
| "entropy": 1.0721315890550613, |
| "epoch": 2.524886877828054, |
| "grad_norm": 0.6796205639839172, |
| "learning_rate": 0.00016434822333365867, |
| "loss": 0.08365722000598907, |
| "mean_token_accuracy": 0.9753518998622894, |
| "num_tokens": 6167169.0, |
| "step": 699 |
| }, |
| { |
| "entropy": 1.0268315523862839, |
| "epoch": 2.5285067873303166, |
| "grad_norm": 0.4017026722431183, |
| "learning_rate": 0.00016424329324459167, |
| "loss": 0.04044954478740692, |
| "mean_token_accuracy": 0.985278993844986, |
| "num_tokens": 6176461.0, |
| "step": 700 |
| }, |
| { |
| "entropy": 1.1094456613063812, |
| "epoch": 2.532126696832579, |
| "grad_norm": 0.3827606439590454, |
| "learning_rate": 0.0001641382472316508, |
| "loss": 0.047150127589702606, |
| "mean_token_accuracy": 0.9888079762458801, |
| "num_tokens": 6185277.0, |
| "step": 701 |
| }, |
| { |
| "entropy": 1.0462537854909897, |
| "epoch": 2.5357466063348415, |
| "grad_norm": 0.5289177894592285, |
| "learning_rate": 0.00016403308551933085, |
| "loss": 0.0867241695523262, |
| "mean_token_accuracy": 0.9772706627845764, |
| "num_tokens": 6194094.0, |
| "step": 702 |
| }, |
| { |
| "entropy": 1.1250386536121368, |
| "epoch": 2.539366515837104, |
| "grad_norm": 0.586585521697998, |
| "learning_rate": 0.000163927808332374, |
| "loss": 0.04973389208316803, |
| "mean_token_accuracy": 0.9858016967773438, |
| "num_tokens": 6202301.0, |
| "step": 703 |
| }, |
| { |
| "entropy": 1.1375506520271301, |
| "epoch": 2.5429864253393664, |
| "grad_norm": 0.745067298412323, |
| "learning_rate": 0.00016382241589576918, |
| "loss": 0.0903111919760704, |
| "mean_token_accuracy": 0.9715902656316757, |
| "num_tokens": 6210728.0, |
| "step": 704 |
| }, |
| { |
| "entropy": 1.0829818844795227, |
| "epoch": 2.546606334841629, |
| "grad_norm": 0.5703446865081787, |
| "learning_rate": 0.00016371690843475153, |
| "loss": 0.08242139220237732, |
| "mean_token_accuracy": 0.9692940562963486, |
| "num_tokens": 6219287.0, |
| "step": 705 |
| }, |
| { |
| "entropy": 1.0283890515565872, |
| "epoch": 2.5502262443438912, |
| "grad_norm": 0.4306364357471466, |
| "learning_rate": 0.00016361128617480212, |
| "loss": 0.053607720881700516, |
| "mean_token_accuracy": 0.9828784912824631, |
| "num_tokens": 6227933.0, |
| "step": 706 |
| }, |
| { |
| "entropy": 1.0512140542268753, |
| "epoch": 2.5538461538461537, |
| "grad_norm": 0.6051420569419861, |
| "learning_rate": 0.0001635055493416473, |
| "loss": 0.09999658167362213, |
| "mean_token_accuracy": 0.9734647274017334, |
| "num_tokens": 6237019.0, |
| "step": 707 |
| }, |
| { |
| "entropy": 1.1175464391708374, |
| "epoch": 2.557466063348416, |
| "grad_norm": 0.5659928321838379, |
| "learning_rate": 0.00016339969816125832, |
| "loss": 0.0763474628329277, |
| "mean_token_accuracy": 0.9784427285194397, |
| "num_tokens": 6245720.0, |
| "step": 708 |
| }, |
| { |
| "entropy": 0.9877434521913528, |
| "epoch": 2.5610859728506785, |
| "grad_norm": 0.6605835556983948, |
| "learning_rate": 0.00016329373285985078, |
| "loss": 0.14345091581344604, |
| "mean_token_accuracy": 0.9613773822784424, |
| "num_tokens": 6255493.0, |
| "step": 709 |
| }, |
| { |
| "entropy": 1.0417379438877106, |
| "epoch": 2.564705882352941, |
| "grad_norm": 0.5875850915908813, |
| "learning_rate": 0.0001631876536638841, |
| "loss": 0.11363250762224197, |
| "mean_token_accuracy": 0.9701417088508606, |
| "num_tokens": 6264663.0, |
| "step": 710 |
| }, |
| { |
| "entropy": 1.1045846343040466, |
| "epoch": 2.5683257918552034, |
| "grad_norm": 0.543202817440033, |
| "learning_rate": 0.00016308146080006123, |
| "loss": 0.062342025339603424, |
| "mean_token_accuracy": 0.9822514802217484, |
| "num_tokens": 6272943.0, |
| "step": 711 |
| }, |
| { |
| "entropy": 1.1131125092506409, |
| "epoch": 2.571945701357466, |
| "grad_norm": 0.5200212001800537, |
| "learning_rate": 0.00016297515449532795, |
| "loss": 0.0743773877620697, |
| "mean_token_accuracy": 0.9790451228618622, |
| "num_tokens": 6282017.0, |
| "step": 712 |
| }, |
| { |
| "entropy": 1.0882102400064468, |
| "epoch": 2.5755656108597282, |
| "grad_norm": 0.2667261064052582, |
| "learning_rate": 0.0001628687349768726, |
| "loss": 0.02705930732190609, |
| "mean_token_accuracy": 0.9914503395557404, |
| "num_tokens": 6291228.0, |
| "step": 713 |
| }, |
| { |
| "entropy": 1.1583672761917114, |
| "epoch": 2.579185520361991, |
| "grad_norm": 0.3874809443950653, |
| "learning_rate": 0.00016276220247212522, |
| "loss": 0.027443181723356247, |
| "mean_token_accuracy": 0.9916303902864456, |
| "num_tokens": 6299957.0, |
| "step": 714 |
| }, |
| { |
| "entropy": 1.1209280490875244, |
| "epoch": 2.5828054298642535, |
| "grad_norm": 0.3878846764564514, |
| "learning_rate": 0.00016265555720875756, |
| "loss": 0.04895198345184326, |
| "mean_token_accuracy": 0.9888665974140167, |
| "num_tokens": 6309045.0, |
| "step": 715 |
| }, |
| { |
| "entropy": 1.0764774233102798, |
| "epoch": 2.586425339366516, |
| "grad_norm": 0.43458428978919983, |
| "learning_rate": 0.00016254879941468223, |
| "loss": 0.043831516057252884, |
| "mean_token_accuracy": 0.9901125580072403, |
| "num_tokens": 6317957.0, |
| "step": 716 |
| }, |
| { |
| "entropy": 1.0688991844654083, |
| "epoch": 2.5900452488687784, |
| "grad_norm": 0.4384344220161438, |
| "learning_rate": 0.0001624419293180524, |
| "loss": 0.0616069957613945, |
| "mean_token_accuracy": 0.983244925737381, |
| "num_tokens": 6327135.0, |
| "step": 717 |
| }, |
| { |
| "entropy": 1.1092505156993866, |
| "epoch": 2.593665158371041, |
| "grad_norm": 0.5844867825508118, |
| "learning_rate": 0.00016233494714726118, |
| "loss": 0.04842917621135712, |
| "mean_token_accuracy": 0.9868716895580292, |
| "num_tokens": 6336285.0, |
| "step": 718 |
| }, |
| { |
| "entropy": 1.0994287133216858, |
| "epoch": 2.5972850678733033, |
| "grad_norm": 0.6489019393920898, |
| "learning_rate": 0.0001622278531309412, |
| "loss": 0.14275093376636505, |
| "mean_token_accuracy": 0.9666349589824677, |
| "num_tokens": 6345608.0, |
| "step": 719 |
| }, |
| { |
| "entropy": 1.1014614403247833, |
| "epoch": 2.6009049773755657, |
| "grad_norm": 0.8203519582748413, |
| "learning_rate": 0.00016212064749796418, |
| "loss": 0.12135060131549835, |
| "mean_token_accuracy": 0.9609973132610321, |
| "num_tokens": 6354345.0, |
| "step": 720 |
| }, |
| { |
| "entropy": 1.1402380466461182, |
| "epoch": 2.604524886877828, |
| "grad_norm": 0.3748965263366699, |
| "learning_rate": 0.00016201333047744025, |
| "loss": 0.0338396318256855, |
| "mean_token_accuracy": 0.9894662201404572, |
| "num_tokens": 6362603.0, |
| "step": 721 |
| }, |
| { |
| "entropy": 1.1779527068138123, |
| "epoch": 2.6081447963800906, |
| "grad_norm": 0.5257209539413452, |
| "learning_rate": 0.00016190590229871773, |
| "loss": 0.08586816489696503, |
| "mean_token_accuracy": 0.9766449332237244, |
| "num_tokens": 6370579.0, |
| "step": 722 |
| }, |
| { |
| "entropy": 1.13621586561203, |
| "epoch": 2.611764705882353, |
| "grad_norm": 0.7259751558303833, |
| "learning_rate": 0.00016179836319138243, |
| "loss": 0.13181325793266296, |
| "mean_token_accuracy": 0.9618227183818817, |
| "num_tokens": 6378869.0, |
| "step": 723 |
| }, |
| { |
| "entropy": 1.113575041294098, |
| "epoch": 2.6153846153846154, |
| "grad_norm": 0.49848487973213196, |
| "learning_rate": 0.00016169071338525718, |
| "loss": 0.09116464853286743, |
| "mean_token_accuracy": 0.9724429994821548, |
| "num_tokens": 6388111.0, |
| "step": 724 |
| }, |
| { |
| "entropy": 1.07393079996109, |
| "epoch": 2.619004524886878, |
| "grad_norm": 0.742392897605896, |
| "learning_rate": 0.0001615829531104015, |
| "loss": 0.16890200972557068, |
| "mean_token_accuracy": 0.9505817890167236, |
| "num_tokens": 6397238.0, |
| "step": 725 |
| }, |
| { |
| "entropy": 1.0620396435260773, |
| "epoch": 2.6226244343891403, |
| "grad_norm": 0.5371675491333008, |
| "learning_rate": 0.00016147508259711088, |
| "loss": 0.11606789380311966, |
| "mean_token_accuracy": 0.9657173007726669, |
| "num_tokens": 6406558.0, |
| "step": 726 |
| }, |
| { |
| "entropy": 1.0380123108625412, |
| "epoch": 2.6262443438914027, |
| "grad_norm": 0.4158170521259308, |
| "learning_rate": 0.00016136710207591653, |
| "loss": 0.06112787127494812, |
| "mean_token_accuracy": 0.9832183867692947, |
| "num_tokens": 6415614.0, |
| "step": 727 |
| }, |
| { |
| "entropy": 1.1722862720489502, |
| "epoch": 2.629864253393665, |
| "grad_norm": 0.6638741493225098, |
| "learning_rate": 0.00016125901177758457, |
| "loss": 0.06863709539175034, |
| "mean_token_accuracy": 0.9769297689199448, |
| "num_tokens": 6423840.0, |
| "step": 728 |
| }, |
| { |
| "entropy": 1.1042883694171906, |
| "epoch": 2.6334841628959276, |
| "grad_norm": 0.6980579495429993, |
| "learning_rate": 0.00016115081193311592, |
| "loss": 0.3179304003715515, |
| "mean_token_accuracy": 0.9388009011745453, |
| "num_tokens": 6433056.0, |
| "step": 729 |
| }, |
| { |
| "entropy": 1.0506968647241592, |
| "epoch": 2.63710407239819, |
| "grad_norm": 0.4494501054286957, |
| "learning_rate": 0.00016104250277374548, |
| "loss": 0.06316471844911575, |
| "mean_token_accuracy": 0.9813105314970016, |
| "num_tokens": 6442272.0, |
| "step": 730 |
| }, |
| { |
| "entropy": 1.0719667375087738, |
| "epoch": 2.6407239819004524, |
| "grad_norm": 0.49762746691703796, |
| "learning_rate": 0.00016093408453094182, |
| "loss": 0.07798872143030167, |
| "mean_token_accuracy": 0.9814620614051819, |
| "num_tokens": 6451215.0, |
| "step": 731 |
| }, |
| { |
| "entropy": 1.0985628068447113, |
| "epoch": 2.644343891402715, |
| "grad_norm": 0.5135414600372314, |
| "learning_rate": 0.00016082555743640668, |
| "loss": 0.05784185230731964, |
| "mean_token_accuracy": 0.981594517827034, |
| "num_tokens": 6460140.0, |
| "step": 732 |
| }, |
| { |
| "entropy": 1.1157509833574295, |
| "epoch": 2.6479638009049773, |
| "grad_norm": 0.9371573328971863, |
| "learning_rate": 0.00016071692172207435, |
| "loss": 0.19296394288539886, |
| "mean_token_accuracy": 0.9558616280555725, |
| "num_tokens": 6469215.0, |
| "step": 733 |
| }, |
| { |
| "entropy": 1.0665288716554642, |
| "epoch": 2.6515837104072397, |
| "grad_norm": 0.4882467985153198, |
| "learning_rate": 0.00016060817762011126, |
| "loss": 0.06225307658314705, |
| "mean_token_accuracy": 0.9782555550336838, |
| "num_tokens": 6477975.0, |
| "step": 734 |
| }, |
| { |
| "entropy": 1.058951660990715, |
| "epoch": 2.655203619909502, |
| "grad_norm": 0.44127899408340454, |
| "learning_rate": 0.00016049932536291552, |
| "loss": 0.054334647953510284, |
| "mean_token_accuracy": 0.9814004898071289, |
| "num_tokens": 6486967.0, |
| "step": 735 |
| }, |
| { |
| "entropy": 1.0905370116233826, |
| "epoch": 2.6588235294117646, |
| "grad_norm": 0.43531718850135803, |
| "learning_rate": 0.00016039036518311633, |
| "loss": 0.047857142984867096, |
| "mean_token_accuracy": 0.9830509722232819, |
| "num_tokens": 6495636.0, |
| "step": 736 |
| }, |
| { |
| "entropy": 1.0584595501422882, |
| "epoch": 2.662443438914027, |
| "grad_norm": 0.5107179284095764, |
| "learning_rate": 0.00016028129731357366, |
| "loss": 0.069837786257267, |
| "mean_token_accuracy": 0.9805668294429779, |
| "num_tokens": 6504726.0, |
| "step": 737 |
| }, |
| { |
| "entropy": 1.0202111154794693, |
| "epoch": 2.6660633484162894, |
| "grad_norm": 0.4959315359592438, |
| "learning_rate": 0.00016017212198737732, |
| "loss": 0.10300014168024063, |
| "mean_token_accuracy": 0.9703309834003448, |
| "num_tokens": 6514345.0, |
| "step": 738 |
| }, |
| { |
| "entropy": 1.127130776643753, |
| "epoch": 2.669683257918552, |
| "grad_norm": 0.6754599213600159, |
| "learning_rate": 0.00016006283943784715, |
| "loss": 0.10128761827945709, |
| "mean_token_accuracy": 0.9754067361354828, |
| "num_tokens": 6523052.0, |
| "step": 739 |
| }, |
| { |
| "entropy": 1.1106500625610352, |
| "epoch": 2.6733031674208148, |
| "grad_norm": 0.7177493572235107, |
| "learning_rate": 0.00015995344989853193, |
| "loss": 0.19780850410461426, |
| "mean_token_accuracy": 0.9586081951856613, |
| "num_tokens": 6532090.0, |
| "step": 740 |
| }, |
| { |
| "entropy": 1.0810956358909607, |
| "epoch": 2.676923076923077, |
| "grad_norm": 0.5800163149833679, |
| "learning_rate": 0.00015984395360320902, |
| "loss": 0.07419327646493912, |
| "mean_token_accuracy": 0.9828365594148636, |
| "num_tokens": 6541418.0, |
| "step": 741 |
| }, |
| { |
| "entropy": 1.1289043724536896, |
| "epoch": 2.6805429864253396, |
| "grad_norm": 0.41637271642684937, |
| "learning_rate": 0.0001597343507858841, |
| "loss": 0.07509341835975647, |
| "mean_token_accuracy": 0.9790188372135162, |
| "num_tokens": 6550296.0, |
| "step": 742 |
| }, |
| { |
| "entropy": 1.150795191526413, |
| "epoch": 2.684162895927602, |
| "grad_norm": 0.6585742831230164, |
| "learning_rate": 0.00015962464168079045, |
| "loss": 0.10690723359584808, |
| "mean_token_accuracy": 0.9658952206373215, |
| "num_tokens": 6558723.0, |
| "step": 743 |
| }, |
| { |
| "entropy": 1.1867717504501343, |
| "epoch": 2.6877828054298645, |
| "grad_norm": 0.6974299550056458, |
| "learning_rate": 0.00015951482652238843, |
| "loss": 0.1916104406118393, |
| "mean_token_accuracy": 0.9603984951972961, |
| "num_tokens": 6567642.0, |
| "step": 744 |
| }, |
| { |
| "entropy": 1.1048817336559296, |
| "epoch": 2.691402714932127, |
| "grad_norm": 0.4655424952507019, |
| "learning_rate": 0.0001594049055453651, |
| "loss": 0.12130922079086304, |
| "mean_token_accuracy": 0.9698162972927094, |
| "num_tokens": 6577045.0, |
| "step": 745 |
| }, |
| { |
| "entropy": 1.162132978439331, |
| "epoch": 2.6950226244343893, |
| "grad_norm": 0.6110576391220093, |
| "learning_rate": 0.00015929487898463368, |
| "loss": 0.07808665931224823, |
| "mean_token_accuracy": 0.9791989624500275, |
| "num_tokens": 6585933.0, |
| "step": 746 |
| }, |
| { |
| "entropy": 1.1560872793197632, |
| "epoch": 2.6986425339366518, |
| "grad_norm": 0.6684656739234924, |
| "learning_rate": 0.00015918474707533298, |
| "loss": 0.08515496551990509, |
| "mean_token_accuracy": 0.9754028916358948, |
| "num_tokens": 6594832.0, |
| "step": 747 |
| }, |
| { |
| "entropy": 1.2077683806419373, |
| "epoch": 2.702262443438914, |
| "grad_norm": 0.624879002571106, |
| "learning_rate": 0.00015907451005282698, |
| "loss": 0.1127995178103447, |
| "mean_token_accuracy": 0.9747825711965561, |
| "num_tokens": 6603677.0, |
| "step": 748 |
| }, |
| { |
| "entropy": 1.1817778050899506, |
| "epoch": 2.7058823529411766, |
| "grad_norm": 0.4592488408088684, |
| "learning_rate": 0.00015896416815270437, |
| "loss": 0.0558847077190876, |
| "mean_token_accuracy": 0.9848069846630096, |
| "num_tokens": 6612569.0, |
| "step": 749 |
| }, |
| { |
| "entropy": 1.1611264646053314, |
| "epoch": 2.709502262443439, |
| "grad_norm": 0.4479089677333832, |
| "learning_rate": 0.0001588537216107778, |
| "loss": 0.07405956834554672, |
| "mean_token_accuracy": 0.9793966263532639, |
| "num_tokens": 6621219.0, |
| "step": 750 |
| }, |
| { |
| "entropy": 1.1612786650657654, |
| "epoch": 2.7131221719457015, |
| "grad_norm": 0.42668989300727844, |
| "learning_rate": 0.00015874317066308372, |
| "loss": 0.06808929890394211, |
| "mean_token_accuracy": 0.985272541642189, |
| "num_tokens": 6630187.0, |
| "step": 751 |
| }, |
| { |
| "entropy": 1.2155989408493042, |
| "epoch": 2.716742081447964, |
| "grad_norm": 0.73838210105896, |
| "learning_rate": 0.00015863251554588167, |
| "loss": 0.13851481676101685, |
| "mean_token_accuracy": 0.963520884513855, |
| "num_tokens": 6638579.0, |
| "step": 752 |
| }, |
| { |
| "entropy": 1.1978608965873718, |
| "epoch": 2.7203619909502263, |
| "grad_norm": 0.5378918647766113, |
| "learning_rate": 0.00015852175649565375, |
| "loss": 0.08914665132761002, |
| "mean_token_accuracy": 0.9813847839832306, |
| "num_tokens": 6646986.0, |
| "step": 753 |
| }, |
| { |
| "entropy": 1.203873634338379, |
| "epoch": 2.723981900452489, |
| "grad_norm": 0.518156111240387, |
| "learning_rate": 0.0001584108937491042, |
| "loss": 0.07944593578577042, |
| "mean_token_accuracy": 0.9775935709476471, |
| "num_tokens": 6655534.0, |
| "step": 754 |
| }, |
| { |
| "entropy": 1.1785966455936432, |
| "epoch": 2.727601809954751, |
| "grad_norm": 0.6289830803871155, |
| "learning_rate": 0.00015829992754315893, |
| "loss": 0.08792146295309067, |
| "mean_token_accuracy": 0.9775703102350235, |
| "num_tokens": 6664173.0, |
| "step": 755 |
| }, |
| { |
| "entropy": 1.0918410420417786, |
| "epoch": 2.7312217194570136, |
| "grad_norm": 0.553460955619812, |
| "learning_rate": 0.00015818885811496485, |
| "loss": 0.10784870386123657, |
| "mean_token_accuracy": 0.9730519503355026, |
| "num_tokens": 6673280.0, |
| "step": 756 |
| }, |
| { |
| "entropy": 1.141640067100525, |
| "epoch": 2.734841628959276, |
| "grad_norm": 0.5629732012748718, |
| "learning_rate": 0.0001580776857018895, |
| "loss": 0.10254421085119247, |
| "mean_token_accuracy": 0.967141255736351, |
| "num_tokens": 6682098.0, |
| "step": 757 |
| }, |
| { |
| "entropy": 1.093793347477913, |
| "epoch": 2.7384615384615385, |
| "grad_norm": 0.4974505603313446, |
| "learning_rate": 0.00015796641054152067, |
| "loss": 0.08042000234127045, |
| "mean_token_accuracy": 0.9785896241664886, |
| "num_tokens": 6690846.0, |
| "step": 758 |
| }, |
| { |
| "entropy": 1.1073049008846283, |
| "epoch": 2.742081447963801, |
| "grad_norm": 0.4363830089569092, |
| "learning_rate": 0.00015785503287166547, |
| "loss": 0.07343566417694092, |
| "mean_token_accuracy": 0.974511981010437, |
| "num_tokens": 6699819.0, |
| "step": 759 |
| }, |
| { |
| "entropy": 1.0879010558128357, |
| "epoch": 2.7457013574660634, |
| "grad_norm": 0.38586366176605225, |
| "learning_rate": 0.00015774355293035025, |
| "loss": 0.04004283994436264, |
| "mean_token_accuracy": 0.9880010634660721, |
| "num_tokens": 6708498.0, |
| "step": 760 |
| }, |
| { |
| "entropy": 1.078858882188797, |
| "epoch": 2.749321266968326, |
| "grad_norm": 0.6422889232635498, |
| "learning_rate": 0.0001576319709558199, |
| "loss": 0.10385136306285858, |
| "mean_token_accuracy": 0.974150076508522, |
| "num_tokens": 6717185.0, |
| "step": 761 |
| }, |
| { |
| "entropy": 1.077678918838501, |
| "epoch": 2.7529411764705882, |
| "grad_norm": 0.7637373805046082, |
| "learning_rate": 0.00015752028718653735, |
| "loss": 0.06696485728025436, |
| "mean_token_accuracy": 0.9813330173492432, |
| "num_tokens": 6725915.0, |
| "step": 762 |
| }, |
| { |
| "entropy": 1.155141294002533, |
| "epoch": 2.7565610859728507, |
| "grad_norm": 0.6038472056388855, |
| "learning_rate": 0.00015740850186118306, |
| "loss": 0.06791059672832489, |
| "mean_token_accuracy": 0.9810560345649719, |
| "num_tokens": 6734292.0, |
| "step": 763 |
| }, |
| { |
| "entropy": 1.161244884133339, |
| "epoch": 2.760180995475113, |
| "grad_norm": 0.5682697892189026, |
| "learning_rate": 0.00015729661521865452, |
| "loss": 0.06183531507849693, |
| "mean_token_accuracy": 0.9830152094364166, |
| "num_tokens": 6742663.0, |
| "step": 764 |
| }, |
| { |
| "entropy": 1.0876408368349075, |
| "epoch": 2.7638009049773755, |
| "grad_norm": 0.4295364320278168, |
| "learning_rate": 0.00015718462749806587, |
| "loss": 0.0469270683825016, |
| "mean_token_accuracy": 0.9854940623044968, |
| "num_tokens": 6751726.0, |
| "step": 765 |
| }, |
| { |
| "entropy": 1.1176190674304962, |
| "epoch": 2.767420814479638, |
| "grad_norm": 0.5521655678749084, |
| "learning_rate": 0.00015707253893874705, |
| "loss": 0.10531380772590637, |
| "mean_token_accuracy": 0.9681271910667419, |
| "num_tokens": 6760728.0, |
| "step": 766 |
| }, |
| { |
| "entropy": 1.1246620118618011, |
| "epoch": 2.7710407239819004, |
| "grad_norm": 0.5280638337135315, |
| "learning_rate": 0.00015696034978024368, |
| "loss": 0.05504276230931282, |
| "mean_token_accuracy": 0.9822255373001099, |
| "num_tokens": 6769862.0, |
| "step": 767 |
| }, |
| { |
| "entropy": 1.167225182056427, |
| "epoch": 2.774660633484163, |
| "grad_norm": 0.7716050148010254, |
| "learning_rate": 0.0001568480602623163, |
| "loss": 0.08834150433540344, |
| "mean_token_accuracy": 0.9732990860939026, |
| "num_tokens": 6778769.0, |
| "step": 768 |
| }, |
| { |
| "entropy": 1.1058853566646576, |
| "epoch": 2.7782805429864252, |
| "grad_norm": 0.5791372060775757, |
| "learning_rate": 0.00015673567062493993, |
| "loss": 0.06552623212337494, |
| "mean_token_accuracy": 0.9831362217664719, |
| "num_tokens": 6787847.0, |
| "step": 769 |
| }, |
| { |
| "entropy": 1.1691460013389587, |
| "epoch": 2.7819004524886877, |
| "grad_norm": 0.5165508389472961, |
| "learning_rate": 0.00015662318110830356, |
| "loss": 0.10017089545726776, |
| "mean_token_accuracy": 0.9761621057987213, |
| "num_tokens": 6796419.0, |
| "step": 770 |
| }, |
| { |
| "entropy": 1.1316891312599182, |
| "epoch": 2.78552036199095, |
| "grad_norm": 0.5280610918998718, |
| "learning_rate": 0.00015651059195280972, |
| "loss": 0.04987496882677078, |
| "mean_token_accuracy": 0.984514445066452, |
| "num_tokens": 6805123.0, |
| "step": 771 |
| }, |
| { |
| "entropy": 1.104190319776535, |
| "epoch": 2.7891402714932125, |
| "grad_norm": 0.4536793828010559, |
| "learning_rate": 0.0001563979033990737, |
| "loss": 0.06398440897464752, |
| "mean_token_accuracy": 0.9832892119884491, |
| "num_tokens": 6814257.0, |
| "step": 772 |
| }, |
| { |
| "entropy": 1.0896694660186768, |
| "epoch": 2.792760180995475, |
| "grad_norm": 0.45330801606178284, |
| "learning_rate": 0.0001562851156879233, |
| "loss": 0.06826596707105637, |
| "mean_token_accuracy": 0.9796571433544159, |
| "num_tokens": 6823932.0, |
| "step": 773 |
| }, |
| { |
| "entropy": 1.067289099097252, |
| "epoch": 2.7963800904977374, |
| "grad_norm": 0.3646707236766815, |
| "learning_rate": 0.0001561722290603983, |
| "loss": 0.056463152170181274, |
| "mean_token_accuracy": 0.9811888188123703, |
| "num_tokens": 6833045.0, |
| "step": 774 |
| }, |
| { |
| "entropy": 1.1854591965675354, |
| "epoch": 2.8, |
| "grad_norm": 0.7943421602249146, |
| "learning_rate": 0.00015605924375774986, |
| "loss": 0.22466346621513367, |
| "mean_token_accuracy": 0.9558106809854507, |
| "num_tokens": 6841858.0, |
| "step": 775 |
| }, |
| { |
| "entropy": 1.2166031897068024, |
| "epoch": 2.8036199095022623, |
| "grad_norm": 0.5690715909004211, |
| "learning_rate": 0.0001559461600214399, |
| "loss": 0.07189326733350754, |
| "mean_token_accuracy": 0.9798267781734467, |
| "num_tokens": 6849972.0, |
| "step": 776 |
| }, |
| { |
| "entropy": 1.0468733608722687, |
| "epoch": 2.8072398190045247, |
| "grad_norm": 0.39525169134140015, |
| "learning_rate": 0.0001558329780931408, |
| "loss": 0.07288673520088196, |
| "mean_token_accuracy": 0.9749600887298584, |
| "num_tokens": 6859355.0, |
| "step": 777 |
| }, |
| { |
| "entropy": 1.176623821258545, |
| "epoch": 2.810859728506787, |
| "grad_norm": 0.5643757581710815, |
| "learning_rate": 0.0001557196982147348, |
| "loss": 0.06095210090279579, |
| "mean_token_accuracy": 0.9794624149799347, |
| "num_tokens": 6867749.0, |
| "step": 778 |
| }, |
| { |
| "entropy": 1.1942293345928192, |
| "epoch": 2.8144796380090495, |
| "grad_norm": 0.5817746520042419, |
| "learning_rate": 0.00015560632062831337, |
| "loss": 0.06656618416309357, |
| "mean_token_accuracy": 0.9780125468969345, |
| "num_tokens": 6876216.0, |
| "step": 779 |
| }, |
| { |
| "entropy": 1.0938580930233002, |
| "epoch": 2.818099547511312, |
| "grad_norm": 0.3751153349876404, |
| "learning_rate": 0.00015549284557617697, |
| "loss": 0.06173507869243622, |
| "mean_token_accuracy": 0.9801245480775833, |
| "num_tokens": 6885009.0, |
| "step": 780 |
| }, |
| { |
| "entropy": 1.147508054971695, |
| "epoch": 2.8217194570135744, |
| "grad_norm": 0.5696470141410828, |
| "learning_rate": 0.00015537927330083412, |
| "loss": 0.11268901824951172, |
| "mean_token_accuracy": 0.9691483378410339, |
| "num_tokens": 6893473.0, |
| "step": 781 |
| }, |
| { |
| "entropy": 1.1333416998386383, |
| "epoch": 2.825339366515837, |
| "grad_norm": 0.5513712167739868, |
| "learning_rate": 0.00015526560404500138, |
| "loss": 0.07699988782405853, |
| "mean_token_accuracy": 0.9764687865972519, |
| "num_tokens": 6901878.0, |
| "step": 782 |
| }, |
| { |
| "entropy": 1.1387991607189178, |
| "epoch": 2.8289592760180997, |
| "grad_norm": 0.6534690260887146, |
| "learning_rate": 0.00015515183805160228, |
| "loss": 0.10432648658752441, |
| "mean_token_accuracy": 0.9748246222734451, |
| "num_tokens": 6910566.0, |
| "step": 783 |
| }, |
| { |
| "entropy": 1.0964215993881226, |
| "epoch": 2.832579185520362, |
| "grad_norm": 0.5456584095954895, |
| "learning_rate": 0.00015503797556376737, |
| "loss": 0.09832392632961273, |
| "mean_token_accuracy": 0.9682436734437943, |
| "num_tokens": 6919451.0, |
| "step": 784 |
| }, |
| { |
| "entropy": 1.100774735212326, |
| "epoch": 2.8361990950226246, |
| "grad_norm": 0.499659925699234, |
| "learning_rate": 0.00015492401682483324, |
| "loss": 0.08655116707086563, |
| "mean_token_accuracy": 0.9732284247875214, |
| "num_tokens": 6928533.0, |
| "step": 785 |
| }, |
| { |
| "entropy": 1.0831056982278824, |
| "epoch": 2.839819004524887, |
| "grad_norm": 0.5561793446540833, |
| "learning_rate": 0.0001548099620783422, |
| "loss": 0.06871826946735382, |
| "mean_token_accuracy": 0.9839567542076111, |
| "num_tokens": 6937781.0, |
| "step": 786 |
| }, |
| { |
| "entropy": 1.0972970724105835, |
| "epoch": 2.8434389140271494, |
| "grad_norm": 0.46581363677978516, |
| "learning_rate": 0.0001546958115680418, |
| "loss": 0.06161344051361084, |
| "mean_token_accuracy": 0.9849719405174255, |
| "num_tokens": 6946751.0, |
| "step": 787 |
| }, |
| { |
| "entropy": 1.1013003289699554, |
| "epoch": 2.847058823529412, |
| "grad_norm": 0.5828675031661987, |
| "learning_rate": 0.00015458156553788423, |
| "loss": 0.2391076385974884, |
| "mean_token_accuracy": 0.9569680541753769, |
| "num_tokens": 6956178.0, |
| "step": 788 |
| }, |
| { |
| "entropy": 1.129268318414688, |
| "epoch": 2.8506787330316743, |
| "grad_norm": 0.5637741088867188, |
| "learning_rate": 0.00015446722423202575, |
| "loss": 0.1284228265285492, |
| "mean_token_accuracy": 0.9719979614019394, |
| "num_tokens": 6965201.0, |
| "step": 789 |
| }, |
| { |
| "entropy": 1.1032065749168396, |
| "epoch": 2.8542986425339367, |
| "grad_norm": 0.3668762147426605, |
| "learning_rate": 0.00015435278789482636, |
| "loss": 0.04885539412498474, |
| "mean_token_accuracy": 0.9844310134649277, |
| "num_tokens": 6974144.0, |
| "step": 790 |
| }, |
| { |
| "entropy": 1.10554239153862, |
| "epoch": 2.857918552036199, |
| "grad_norm": 0.5248369574546814, |
| "learning_rate": 0.00015423825677084895, |
| "loss": 0.04060475528240204, |
| "mean_token_accuracy": 0.988097608089447, |
| "num_tokens": 6982816.0, |
| "step": 791 |
| }, |
| { |
| "entropy": 1.1331679821014404, |
| "epoch": 2.8615384615384616, |
| "grad_norm": 0.7115232348442078, |
| "learning_rate": 0.00015412363110485928, |
| "loss": 0.10422416776418686, |
| "mean_token_accuracy": 0.9733791649341583, |
| "num_tokens": 6991605.0, |
| "step": 792 |
| }, |
| { |
| "entropy": 1.0728564262390137, |
| "epoch": 2.865158371040724, |
| "grad_norm": 0.7061241269111633, |
| "learning_rate": 0.00015400891114182488, |
| "loss": 0.10765457153320312, |
| "mean_token_accuracy": 0.9669128805398941, |
| "num_tokens": 7000489.0, |
| "step": 793 |
| }, |
| { |
| "entropy": 1.1403984129428864, |
| "epoch": 2.8687782805429864, |
| "grad_norm": 0.6950392127037048, |
| "learning_rate": 0.0001538940971269149, |
| "loss": 0.0980193242430687, |
| "mean_token_accuracy": 0.9763042032718658, |
| "num_tokens": 7008774.0, |
| "step": 794 |
| }, |
| { |
| "entropy": 1.069732904434204, |
| "epoch": 2.872398190045249, |
| "grad_norm": 0.3522324860095978, |
| "learning_rate": 0.00015377918930549952, |
| "loss": 0.0528433583676815, |
| "mean_token_accuracy": 0.9891404360532761, |
| "num_tokens": 7017656.0, |
| "step": 795 |
| }, |
| { |
| "entropy": 1.1608846485614777, |
| "epoch": 2.8760180995475113, |
| "grad_norm": 0.6504537463188171, |
| "learning_rate": 0.00015366418792314937, |
| "loss": 0.053374141454696655, |
| "mean_token_accuracy": 0.982115313410759, |
| "num_tokens": 7025980.0, |
| "step": 796 |
| }, |
| { |
| "entropy": 1.0937940925359726, |
| "epoch": 2.8796380090497737, |
| "grad_norm": 0.6381659507751465, |
| "learning_rate": 0.0001535490932256351, |
| "loss": 0.07902165502309799, |
| "mean_token_accuracy": 0.9712257385253906, |
| "num_tokens": 7034682.0, |
| "step": 797 |
| }, |
| { |
| "entropy": 1.1293490529060364, |
| "epoch": 2.883257918552036, |
| "grad_norm": 0.6040161848068237, |
| "learning_rate": 0.00015343390545892658, |
| "loss": 0.059771500527858734, |
| "mean_token_accuracy": 0.9815134108066559, |
| "num_tokens": 7043427.0, |
| "step": 798 |
| }, |
| { |
| "entropy": 1.0549205243587494, |
| "epoch": 2.8868778280542986, |
| "grad_norm": 0.5692182779312134, |
| "learning_rate": 0.00015331862486919282, |
| "loss": 0.09364207834005356, |
| "mean_token_accuracy": 0.9750859886407852, |
| "num_tokens": 7052551.0, |
| "step": 799 |
| }, |
| { |
| "entropy": 1.097869873046875, |
| "epoch": 2.890497737556561, |
| "grad_norm": 0.42280399799346924, |
| "learning_rate": 0.00015320325170280107, |
| "loss": 0.04443611204624176, |
| "mean_token_accuracy": 0.9883531183004379, |
| "num_tokens": 7061476.0, |
| "step": 800 |
| }, |
| { |
| "entropy": 1.082559585571289, |
| "epoch": 2.8941176470588235, |
| "grad_norm": 0.263492614030838, |
| "learning_rate": 0.00015308778620631643, |
| "loss": 0.02307066321372986, |
| "mean_token_accuracy": 0.9891369044780731, |
| "num_tokens": 7070227.0, |
| "step": 801 |
| }, |
| { |
| "entropy": 1.131628304719925, |
| "epoch": 2.897737556561086, |
| "grad_norm": 0.5446733236312866, |
| "learning_rate": 0.0001529722286265014, |
| "loss": 0.059960536658763885, |
| "mean_token_accuracy": 0.979179710149765, |
| "num_tokens": 7079092.0, |
| "step": 802 |
| }, |
| { |
| "entropy": 1.0937869250774384, |
| "epoch": 2.9013574660633483, |
| "grad_norm": 0.5581566691398621, |
| "learning_rate": 0.00015285657921031514, |
| "loss": 0.08178013563156128, |
| "mean_token_accuracy": 0.9795632362365723, |
| "num_tokens": 7088409.0, |
| "step": 803 |
| }, |
| { |
| "entropy": 1.185722142457962, |
| "epoch": 2.9049773755656108, |
| "grad_norm": 0.9806889295578003, |
| "learning_rate": 0.00015274083820491325, |
| "loss": 0.14850257337093353, |
| "mean_token_accuracy": 0.9594536870718002, |
| "num_tokens": 7096631.0, |
| "step": 804 |
| }, |
| { |
| "entropy": 1.0202390849590302, |
| "epoch": 2.908597285067873, |
| "grad_norm": 0.5378230214118958, |
| "learning_rate": 0.00015262500585764687, |
| "loss": 0.08723378926515579, |
| "mean_token_accuracy": 0.9726832509040833, |
| "num_tokens": 7106014.0, |
| "step": 805 |
| }, |
| { |
| "entropy": 1.1566883027553558, |
| "epoch": 2.9122171945701356, |
| "grad_norm": 0.4863153100013733, |
| "learning_rate": 0.00015250908241606253, |
| "loss": 0.04234391450881958, |
| "mean_token_accuracy": 0.9871827512979507, |
| "num_tokens": 7114443.0, |
| "step": 806 |
| }, |
| { |
| "entropy": 1.1635094583034515, |
| "epoch": 2.915837104072398, |
| "grad_norm": 0.7457255125045776, |
| "learning_rate": 0.00015239306812790129, |
| "loss": 0.11422393471002579, |
| "mean_token_accuracy": 0.969372570514679, |
| "num_tokens": 7123016.0, |
| "step": 807 |
| }, |
| { |
| "entropy": 1.1335663199424744, |
| "epoch": 2.9194570135746605, |
| "grad_norm": 0.7970643043518066, |
| "learning_rate": 0.00015227696324109845, |
| "loss": 0.18115636706352234, |
| "mean_token_accuracy": 0.9552556723356247, |
| "num_tokens": 7131986.0, |
| "step": 808 |
| }, |
| { |
| "entropy": 1.0883623361587524, |
| "epoch": 2.9230769230769234, |
| "grad_norm": 0.3898361623287201, |
| "learning_rate": 0.00015216076800378286, |
| "loss": 0.042640797793865204, |
| "mean_token_accuracy": 0.9852175265550613, |
| "num_tokens": 7140496.0, |
| "step": 809 |
| }, |
| { |
| "entropy": 1.1370824575424194, |
| "epoch": 2.926696832579186, |
| "grad_norm": 0.6726946234703064, |
| "learning_rate": 0.0001520444826642766, |
| "loss": 0.08559117466211319, |
| "mean_token_accuracy": 0.9699132144451141, |
| "num_tokens": 7149329.0, |
| "step": 810 |
| }, |
| { |
| "entropy": 1.1126140654087067, |
| "epoch": 2.930316742081448, |
| "grad_norm": 0.6819010972976685, |
| "learning_rate": 0.00015192810747109413, |
| "loss": 0.07975786924362183, |
| "mean_token_accuracy": 0.9801275730133057, |
| "num_tokens": 7157890.0, |
| "step": 811 |
| }, |
| { |
| "entropy": 1.0616440922021866, |
| "epoch": 2.9339366515837106, |
| "grad_norm": 0.5790680050849915, |
| "learning_rate": 0.000151811642672942, |
| "loss": 0.09595339745283127, |
| "mean_token_accuracy": 0.9808698296546936, |
| "num_tokens": 7166656.0, |
| "step": 812 |
| }, |
| { |
| "entropy": 1.0721414238214493, |
| "epoch": 2.937556561085973, |
| "grad_norm": 0.44250181317329407, |
| "learning_rate": 0.00015169508851871835, |
| "loss": 0.05376347899436951, |
| "mean_token_accuracy": 0.9847523421049118, |
| "num_tokens": 7175308.0, |
| "step": 813 |
| }, |
| { |
| "entropy": 1.0640469789505005, |
| "epoch": 2.9411764705882355, |
| "grad_norm": 0.4585307240486145, |
| "learning_rate": 0.00015157844525751213, |
| "loss": 0.05569484457373619, |
| "mean_token_accuracy": 0.9836412966251373, |
| "num_tokens": 7184404.0, |
| "step": 814 |
| }, |
| { |
| "entropy": 1.1266240775585175, |
| "epoch": 2.944796380090498, |
| "grad_norm": 0.5807170271873474, |
| "learning_rate": 0.00015146171313860284, |
| "loss": 0.04534313082695007, |
| "mean_token_accuracy": 0.9861873239278793, |
| "num_tokens": 7192704.0, |
| "step": 815 |
| }, |
| { |
| "entropy": 1.04159614443779, |
| "epoch": 2.9484162895927604, |
| "grad_norm": 0.6643418073654175, |
| "learning_rate": 0.00015134489241145984, |
| "loss": 0.08871917426586151, |
| "mean_token_accuracy": 0.9774074703454971, |
| "num_tokens": 7201775.0, |
| "step": 816 |
| }, |
| { |
| "entropy": 1.059510976076126, |
| "epoch": 2.952036199095023, |
| "grad_norm": 0.6120842695236206, |
| "learning_rate": 0.00015122798332574183, |
| "loss": 0.10533600300550461, |
| "mean_token_accuracy": 0.9733265042304993, |
| "num_tokens": 7210515.0, |
| "step": 817 |
| }, |
| { |
| "entropy": 1.126617282629013, |
| "epoch": 2.9556561085972852, |
| "grad_norm": 0.605033814907074, |
| "learning_rate": 0.00015111098613129637, |
| "loss": 0.07193339616060257, |
| "mean_token_accuracy": 0.9838567227125168, |
| "num_tokens": 7218761.0, |
| "step": 818 |
| }, |
| { |
| "entropy": 1.1339809894561768, |
| "epoch": 2.9592760180995477, |
| "grad_norm": 0.7737305164337158, |
| "learning_rate": 0.0001509939010781593, |
| "loss": 0.1462799310684204, |
| "mean_token_accuracy": 0.9620816707611084, |
| "num_tokens": 7227422.0, |
| "step": 819 |
| }, |
| { |
| "entropy": 1.0626371949911118, |
| "epoch": 2.96289592760181, |
| "grad_norm": 0.4753362834453583, |
| "learning_rate": 0.0001508767284165543, |
| "loss": 0.06507248431444168, |
| "mean_token_accuracy": 0.9830497950315475, |
| "num_tokens": 7236243.0, |
| "step": 820 |
| }, |
| { |
| "entropy": 1.0648977309465408, |
| "epoch": 2.9665158371040725, |
| "grad_norm": 0.6420553922653198, |
| "learning_rate": 0.0001507594683968921, |
| "loss": 0.0898992195725441, |
| "mean_token_accuracy": 0.9699191451072693, |
| "num_tokens": 7245278.0, |
| "step": 821 |
| }, |
| { |
| "entropy": 1.1461690664291382, |
| "epoch": 2.970135746606335, |
| "grad_norm": 0.738278865814209, |
| "learning_rate": 0.0001506421212697703, |
| "loss": 0.10916579514741898, |
| "mean_token_accuracy": 0.9676524996757507, |
| "num_tokens": 7254069.0, |
| "step": 822 |
| }, |
| { |
| "entropy": 1.1318964213132858, |
| "epoch": 2.9737556561085974, |
| "grad_norm": 0.5524135231971741, |
| "learning_rate": 0.00015052468728597265, |
| "loss": 0.0710684061050415, |
| "mean_token_accuracy": 0.9867733418941498, |
| "num_tokens": 7262846.0, |
| "step": 823 |
| }, |
| { |
| "entropy": 1.190606713294983, |
| "epoch": 2.97737556561086, |
| "grad_norm": 0.5717188119888306, |
| "learning_rate": 0.00015040716669646837, |
| "loss": 0.09186422824859619, |
| "mean_token_accuracy": 0.9754447788000107, |
| "num_tokens": 7271169.0, |
| "step": 824 |
| }, |
| { |
| "entropy": 1.147791177034378, |
| "epoch": 2.9809954751131222, |
| "grad_norm": 0.5148852467536926, |
| "learning_rate": 0.0001502895597524119, |
| "loss": 0.05435680225491524, |
| "mean_token_accuracy": 0.9840258657932281, |
| "num_tokens": 7280020.0, |
| "step": 825 |
| }, |
| { |
| "entropy": 1.1764528155326843, |
| "epoch": 2.9846153846153847, |
| "grad_norm": 0.7775517702102661, |
| "learning_rate": 0.00015017186670514225, |
| "loss": 0.09064161032438278, |
| "mean_token_accuracy": 0.9730498492717743, |
| "num_tokens": 7288874.0, |
| "step": 826 |
| }, |
| { |
| "entropy": 1.1749920845031738, |
| "epoch": 2.988235294117647, |
| "grad_norm": 0.40038371086120605, |
| "learning_rate": 0.0001500540878061823, |
| "loss": 0.07684603333473206, |
| "mean_token_accuracy": 0.9770887941122055, |
| "num_tokens": 7298183.0, |
| "step": 827 |
| }, |
| { |
| "entropy": 1.1883585751056671, |
| "epoch": 2.9918552036199095, |
| "grad_norm": 0.34793826937675476, |
| "learning_rate": 0.00014993622330723857, |
| "loss": 0.046813275665044785, |
| "mean_token_accuracy": 0.9880518466234207, |
| "num_tokens": 7306947.0, |
| "step": 828 |
| }, |
| { |
| "entropy": 1.204830139875412, |
| "epoch": 2.995475113122172, |
| "grad_norm": 0.6548672318458557, |
| "learning_rate": 0.00014981827346020033, |
| "loss": 0.1149185448884964, |
| "mean_token_accuracy": 0.9740531295537949, |
| "num_tokens": 7315487.0, |
| "step": 829 |
| }, |
| { |
| "entropy": 1.1531266868114471, |
| "epoch": 2.9990950226244344, |
| "grad_norm": 0.40315133333206177, |
| "learning_rate": 0.00014970023851713945, |
| "loss": 0.07841235399246216, |
| "mean_token_accuracy": 0.9789654463529587, |
| "num_tokens": 7324480.0, |
| "step": 830 |
| }, |
| { |
| "entropy": 1.1628870964050293, |
| "epoch": 3.0, |
| "grad_norm": 1.4110596179962158, |
| "learning_rate": 0.0001495821187303095, |
| "loss": 0.05664234235882759, |
| "mean_token_accuracy": 0.981249988079071, |
| "num_tokens": 7325175.0, |
| "step": 831 |
| }, |
| { |
| "epoch": 3.0, |
| "eval_entropy": 1.1410768734730357, |
| "eval_loss": 0.12047121673822403, |
| "eval_mean_token_accuracy": 0.9686469343619618, |
| "eval_num_tokens": 7325175.0, |
| "eval_runtime": 31.7876, |
| "eval_samples_per_second": 11.608, |
| "eval_steps_per_second": 3.869, |
| "step": 831 |
| }, |
| { |
| "entropy": 1.1343726515769958, |
| "epoch": 3.0036199095022624, |
| "grad_norm": 0.49150392413139343, |
| "learning_rate": 0.00014946391435214555, |
| "loss": 0.06470951437950134, |
| "mean_token_accuracy": 0.9820976257324219, |
| "num_tokens": 7334031.0, |
| "step": 832 |
| }, |
| { |
| "entropy": 1.0768441557884216, |
| "epoch": 3.007239819004525, |
| "grad_norm": 0.37447497248649597, |
| "learning_rate": 0.0001493456256352632, |
| "loss": 0.05001193284988403, |
| "mean_token_accuracy": 0.9848204553127289, |
| "num_tokens": 7342799.0, |
| "step": 833 |
| }, |
| { |
| "entropy": 1.1359966397285461, |
| "epoch": 3.0108597285067873, |
| "grad_norm": 0.4405897557735443, |
| "learning_rate": 0.00014922725283245846, |
| "loss": 0.05152616277337074, |
| "mean_token_accuracy": 0.9811764508485794, |
| "num_tokens": 7351473.0, |
| "step": 834 |
| }, |
| { |
| "entropy": 1.0891236364841461, |
| "epoch": 3.0144796380090497, |
| "grad_norm": 0.5045367479324341, |
| "learning_rate": 0.00014910879619670704, |
| "loss": 0.0627930611371994, |
| "mean_token_accuracy": 0.981208473443985, |
| "num_tokens": 7360361.0, |
| "step": 835 |
| }, |
| { |
| "entropy": 1.0107998549938202, |
| "epoch": 3.018099547511312, |
| "grad_norm": 0.574455976486206, |
| "learning_rate": 0.00014899025598116378, |
| "loss": 0.05342878773808479, |
| "mean_token_accuracy": 0.9806361496448517, |
| "num_tokens": 7369519.0, |
| "step": 836 |
| }, |
| { |
| "entropy": 1.087983787059784, |
| "epoch": 3.0217194570135746, |
| "grad_norm": 0.49305710196495056, |
| "learning_rate": 0.00014887163243916212, |
| "loss": 0.04466189816594124, |
| "mean_token_accuracy": 0.9900805652141571, |
| "num_tokens": 7378108.0, |
| "step": 837 |
| }, |
| { |
| "entropy": 0.9926144480705261, |
| "epoch": 3.025339366515837, |
| "grad_norm": 0.5207587480545044, |
| "learning_rate": 0.00014875292582421361, |
| "loss": 0.07304289937019348, |
| "mean_token_accuracy": 0.9777405709028244, |
| "num_tokens": 7387315.0, |
| "step": 838 |
| }, |
| { |
| "entropy": 1.00730961561203, |
| "epoch": 3.0289592760180994, |
| "grad_norm": 0.7438811659812927, |
| "learning_rate": 0.00014863413639000728, |
| "loss": 0.05742796137928963, |
| "mean_token_accuracy": 0.9818450957536697, |
| "num_tokens": 7396562.0, |
| "step": 839 |
| }, |
| { |
| "entropy": 0.9555287957191467, |
| "epoch": 3.032579185520362, |
| "grad_norm": 0.3822678029537201, |
| "learning_rate": 0.00014851526439040922, |
| "loss": 0.03751285746693611, |
| "mean_token_accuracy": 0.9855490773916245, |
| "num_tokens": 7405764.0, |
| "step": 840 |
| }, |
| { |
| "entropy": 0.974100261926651, |
| "epoch": 3.0361990950226243, |
| "grad_norm": 0.6839661598205566, |
| "learning_rate": 0.000148396310079462, |
| "loss": 0.08734611421823502, |
| "mean_token_accuracy": 0.9757889807224274, |
| "num_tokens": 7414847.0, |
| "step": 841 |
| }, |
| { |
| "entropy": 0.9459673017263412, |
| "epoch": 3.0398190045248867, |
| "grad_norm": 0.5624176263809204, |
| "learning_rate": 0.00014827727371138392, |
| "loss": 0.07012228667736053, |
| "mean_token_accuracy": 0.9756064116954803, |
| "num_tokens": 7424403.0, |
| "step": 842 |
| }, |
| { |
| "entropy": 0.9462547451257706, |
| "epoch": 3.043438914027149, |
| "grad_norm": 0.6111705899238586, |
| "learning_rate": 0.00014815815554056888, |
| "loss": 0.046397194266319275, |
| "mean_token_accuracy": 0.9845404624938965, |
| "num_tokens": 7433163.0, |
| "step": 843 |
| }, |
| { |
| "entropy": 0.9449738264083862, |
| "epoch": 3.0470588235294116, |
| "grad_norm": 0.48121562600135803, |
| "learning_rate": 0.0001480389558215855, |
| "loss": 0.06078047305345535, |
| "mean_token_accuracy": 0.9810367077589035, |
| "num_tokens": 7441782.0, |
| "step": 844 |
| }, |
| { |
| "entropy": 1.069021388888359, |
| "epoch": 3.050678733031674, |
| "grad_norm": 0.7874725461006165, |
| "learning_rate": 0.00014791967480917657, |
| "loss": 0.08393625169992447, |
| "mean_token_accuracy": 0.9736702889204025, |
| "num_tokens": 7449859.0, |
| "step": 845 |
| }, |
| { |
| "entropy": 0.9736010432243347, |
| "epoch": 3.0542986425339365, |
| "grad_norm": 0.6149342656135559, |
| "learning_rate": 0.00014780031275825873, |
| "loss": 0.11650148779153824, |
| "mean_token_accuracy": 0.9748559445142746, |
| "num_tokens": 7458763.0, |
| "step": 846 |
| }, |
| { |
| "entropy": 0.9672216325998306, |
| "epoch": 3.057918552036199, |
| "grad_norm": 0.7002614140510559, |
| "learning_rate": 0.00014768086992392187, |
| "loss": 0.06093838810920715, |
| "mean_token_accuracy": 0.9850212782621384, |
| "num_tokens": 7467393.0, |
| "step": 847 |
| }, |
| { |
| "entropy": 1.0159880816936493, |
| "epoch": 3.0615384615384613, |
| "grad_norm": 0.5158461332321167, |
| "learning_rate": 0.00014756134656142842, |
| "loss": 0.03521204739809036, |
| "mean_token_accuracy": 0.9884093105792999, |
| "num_tokens": 7475902.0, |
| "step": 848 |
| }, |
| { |
| "entropy": 0.9773909002542496, |
| "epoch": 3.065158371040724, |
| "grad_norm": 0.6164082288742065, |
| "learning_rate": 0.00014744174292621284, |
| "loss": 0.09098848700523376, |
| "mean_token_accuracy": 0.9765210151672363, |
| "num_tokens": 7484956.0, |
| "step": 849 |
| }, |
| { |
| "entropy": 0.9457692354917526, |
| "epoch": 3.0687782805429866, |
| "grad_norm": 0.49424877762794495, |
| "learning_rate": 0.00014732205927388135, |
| "loss": 0.07613382488489151, |
| "mean_token_accuracy": 0.9721274673938751, |
| "num_tokens": 7494091.0, |
| "step": 850 |
| }, |
| { |
| "entropy": 1.0335682481527328, |
| "epoch": 3.072398190045249, |
| "grad_norm": 0.5534892082214355, |
| "learning_rate": 0.00014720229586021098, |
| "loss": 0.06013672798871994, |
| "mean_token_accuracy": 0.9849141836166382, |
| "num_tokens": 7502371.0, |
| "step": 851 |
| }, |
| { |
| "entropy": 1.015167698264122, |
| "epoch": 3.0760180995475115, |
| "grad_norm": 0.6271040439605713, |
| "learning_rate": 0.00014708245294114933, |
| "loss": 0.08082443475723267, |
| "mean_token_accuracy": 0.9719363301992416, |
| "num_tokens": 7511051.0, |
| "step": 852 |
| }, |
| { |
| "entropy": 0.8989782184362411, |
| "epoch": 3.079638009049774, |
| "grad_norm": 0.4136255979537964, |
| "learning_rate": 0.00014696253077281385, |
| "loss": 0.046568505465984344, |
| "mean_token_accuracy": 0.9837394803762436, |
| "num_tokens": 7520417.0, |
| "step": 853 |
| }, |
| { |
| "entropy": 1.008704587817192, |
| "epoch": 3.0832579185520363, |
| "grad_norm": 0.664092481136322, |
| "learning_rate": 0.00014684252961149144, |
| "loss": 0.08081446588039398, |
| "mean_token_accuracy": 0.9798438847064972, |
| "num_tokens": 7529050.0, |
| "step": 854 |
| }, |
| { |
| "entropy": 0.9838613867759705, |
| "epoch": 3.086877828054299, |
| "grad_norm": 0.6071294546127319, |
| "learning_rate": 0.00014672244971363768, |
| "loss": 0.05976680666208267, |
| "mean_token_accuracy": 0.9791270047426224, |
| "num_tokens": 7537868.0, |
| "step": 855 |
| }, |
| { |
| "entropy": 1.026266947388649, |
| "epoch": 3.090497737556561, |
| "grad_norm": 0.8460846543312073, |
| "learning_rate": 0.00014660229133587653, |
| "loss": 0.0821947306394577, |
| "mean_token_accuracy": 0.9793416410684586, |
| "num_tokens": 7546543.0, |
| "step": 856 |
| }, |
| { |
| "entropy": 0.9946709871292114, |
| "epoch": 3.0941176470588236, |
| "grad_norm": 0.5959231853485107, |
| "learning_rate": 0.00014648205473499963, |
| "loss": 0.13812144100666046, |
| "mean_token_accuracy": 0.9636797457933426, |
| "num_tokens": 7556035.0, |
| "step": 857 |
| }, |
| { |
| "entropy": 0.9716462343931198, |
| "epoch": 3.097737556561086, |
| "grad_norm": 0.30362147092819214, |
| "learning_rate": 0.00014636174016796583, |
| "loss": 0.020999953150749207, |
| "mean_token_accuracy": 0.9924765825271606, |
| "num_tokens": 7564738.0, |
| "step": 858 |
| }, |
| { |
| "entropy": 1.0234932005405426, |
| "epoch": 3.1013574660633485, |
| "grad_norm": 0.6290194988250732, |
| "learning_rate": 0.0001462413478919006, |
| "loss": 0.056703150272369385, |
| "mean_token_accuracy": 0.9818439483642578, |
| "num_tokens": 7573699.0, |
| "step": 859 |
| }, |
| { |
| "entropy": 1.0116705000400543, |
| "epoch": 3.104977375565611, |
| "grad_norm": 0.5160185098648071, |
| "learning_rate": 0.00014612087816409533, |
| "loss": 0.058757197111845016, |
| "mean_token_accuracy": 0.9875614196062088, |
| "num_tokens": 7582626.0, |
| "step": 860 |
| }, |
| { |
| "entropy": 1.0897808521986008, |
| "epoch": 3.1085972850678734, |
| "grad_norm": 0.5133566856384277, |
| "learning_rate": 0.00014600033124200718, |
| "loss": 0.08064649254083633, |
| "mean_token_accuracy": 0.9845390319824219, |
| "num_tokens": 7591273.0, |
| "step": 861 |
| }, |
| { |
| "entropy": 1.081803947687149, |
| "epoch": 3.112217194570136, |
| "grad_norm": 1.003861904144287, |
| "learning_rate": 0.00014587970738325808, |
| "loss": 0.1418905109167099, |
| "mean_token_accuracy": 0.9711157530546188, |
| "num_tokens": 7600159.0, |
| "step": 862 |
| }, |
| { |
| "entropy": 1.0890114307403564, |
| "epoch": 3.1158371040723982, |
| "grad_norm": 0.441112220287323, |
| "learning_rate": 0.00014575900684563452, |
| "loss": 0.05492641404271126, |
| "mean_token_accuracy": 0.9856597781181335, |
| "num_tokens": 7608829.0, |
| "step": 863 |
| }, |
| { |
| "entropy": 1.0998935103416443, |
| "epoch": 3.1194570135746607, |
| "grad_norm": 0.749271035194397, |
| "learning_rate": 0.0001456382298870868, |
| "loss": 0.13943280279636383, |
| "mean_token_accuracy": 0.9622896611690521, |
| "num_tokens": 7617472.0, |
| "step": 864 |
| }, |
| { |
| "entropy": 1.0612820237874985, |
| "epoch": 3.123076923076923, |
| "grad_norm": 0.4013805091381073, |
| "learning_rate": 0.00014551737676572846, |
| "loss": 0.037314120680093765, |
| "mean_token_accuracy": 0.9867268055677414, |
| "num_tokens": 7626380.0, |
| "step": 865 |
| }, |
| { |
| "entropy": 1.0484765768051147, |
| "epoch": 3.1266968325791855, |
| "grad_norm": 0.4148269593715668, |
| "learning_rate": 0.00014539644773983599, |
| "loss": 0.0693223774433136, |
| "mean_token_accuracy": 0.9800146818161011, |
| "num_tokens": 7635156.0, |
| "step": 866 |
| }, |
| { |
| "entropy": 1.0329798758029938, |
| "epoch": 3.130316742081448, |
| "grad_norm": 0.33298373222351074, |
| "learning_rate": 0.00014527544306784792, |
| "loss": 0.038426462560892105, |
| "mean_token_accuracy": 0.9857963621616364, |
| "num_tokens": 7644301.0, |
| "step": 867 |
| }, |
| { |
| "entropy": 1.0733687579631805, |
| "epoch": 3.1339366515837104, |
| "grad_norm": 0.5466052889823914, |
| "learning_rate": 0.0001451543630083646, |
| "loss": 0.03809817135334015, |
| "mean_token_accuracy": 0.9909420758485794, |
| "num_tokens": 7652600.0, |
| "step": 868 |
| }, |
| { |
| "entropy": 1.0370618999004364, |
| "epoch": 3.137556561085973, |
| "grad_norm": 0.5771836042404175, |
| "learning_rate": 0.00014503320782014735, |
| "loss": 0.07035155594348907, |
| "mean_token_accuracy": 0.9797674417495728, |
| "num_tokens": 7661663.0, |
| "step": 869 |
| }, |
| { |
| "entropy": 1.0356185287237167, |
| "epoch": 3.1411764705882352, |
| "grad_norm": 0.4902818202972412, |
| "learning_rate": 0.0001449119777621181, |
| "loss": 0.03925105184316635, |
| "mean_token_accuracy": 0.9836414456367493, |
| "num_tokens": 7670373.0, |
| "step": 870 |
| }, |
| { |
| "entropy": 1.0262203961610794, |
| "epoch": 3.1447963800904977, |
| "grad_norm": 0.6004799604415894, |
| "learning_rate": 0.00014479067309335888, |
| "loss": 0.055851537734270096, |
| "mean_token_accuracy": 0.9868045449256897, |
| "num_tokens": 7679173.0, |
| "step": 871 |
| }, |
| { |
| "entropy": 1.0273671448230743, |
| "epoch": 3.14841628959276, |
| "grad_norm": 0.4942399263381958, |
| "learning_rate": 0.00014466929407311102, |
| "loss": 0.07361710071563721, |
| "mean_token_accuracy": 0.9777687937021255, |
| "num_tokens": 7687847.0, |
| "step": 872 |
| }, |
| { |
| "entropy": 1.0208537876605988, |
| "epoch": 3.1520361990950225, |
| "grad_norm": 0.47028565406799316, |
| "learning_rate": 0.0001445478409607748, |
| "loss": 0.06834034621715546, |
| "mean_token_accuracy": 0.9840661883354187, |
| "num_tokens": 7696853.0, |
| "step": 873 |
| }, |
| { |
| "entropy": 0.9666854441165924, |
| "epoch": 3.155656108597285, |
| "grad_norm": 0.4551008939743042, |
| "learning_rate": 0.00014442631401590889, |
| "loss": 0.05758052319288254, |
| "mean_token_accuracy": 0.9800277948379517, |
| "num_tokens": 7706188.0, |
| "step": 874 |
| }, |
| { |
| "entropy": 1.0102877765893936, |
| "epoch": 3.1592760180995474, |
| "grad_norm": 0.4480641186237335, |
| "learning_rate": 0.00014430471349822973, |
| "loss": 0.04771336168050766, |
| "mean_token_accuracy": 0.9888424724340439, |
| "num_tokens": 7714582.0, |
| "step": 875 |
| }, |
| { |
| "entropy": 1.0120342820882797, |
| "epoch": 3.16289592760181, |
| "grad_norm": 0.4889662563800812, |
| "learning_rate": 0.00014418303966761095, |
| "loss": 0.04983747750520706, |
| "mean_token_accuracy": 0.984560638666153, |
| "num_tokens": 7723527.0, |
| "step": 876 |
| }, |
| { |
| "entropy": 1.0261083096265793, |
| "epoch": 3.1665158371040723, |
| "grad_norm": 0.5745841860771179, |
| "learning_rate": 0.0001440612927840829, |
| "loss": 0.03039124421775341, |
| "mean_token_accuracy": 0.99087293446064, |
| "num_tokens": 7731911.0, |
| "step": 877 |
| }, |
| { |
| "entropy": 0.9984003603458405, |
| "epoch": 3.1701357466063347, |
| "grad_norm": 0.6179273724555969, |
| "learning_rate": 0.00014393947310783204, |
| "loss": 0.06010914221405983, |
| "mean_token_accuracy": 0.9845625460147858, |
| "num_tokens": 7740814.0, |
| "step": 878 |
| }, |
| { |
| "entropy": 0.9657698571681976, |
| "epoch": 3.173755656108597, |
| "grad_norm": 0.6100254654884338, |
| "learning_rate": 0.00014381758089920037, |
| "loss": 0.07441750913858414, |
| "mean_token_accuracy": 0.9785300642251968, |
| "num_tokens": 7750064.0, |
| "step": 879 |
| }, |
| { |
| "entropy": 0.9885086268186569, |
| "epoch": 3.1773755656108595, |
| "grad_norm": 0.5612260103225708, |
| "learning_rate": 0.00014369561641868497, |
| "loss": 0.04010923579335213, |
| "mean_token_accuracy": 0.9847719967365265, |
| "num_tokens": 7758761.0, |
| "step": 880 |
| }, |
| { |
| "entropy": 0.9970448017120361, |
| "epoch": 3.180995475113122, |
| "grad_norm": 0.741584062576294, |
| "learning_rate": 0.00014357357992693726, |
| "loss": 0.22217880189418793, |
| "mean_token_accuracy": 0.9596049934625626, |
| "num_tokens": 7768137.0, |
| "step": 881 |
| }, |
| { |
| "entropy": 1.0056591778993607, |
| "epoch": 3.184615384615385, |
| "grad_norm": 0.6077441573143005, |
| "learning_rate": 0.0001434514716847627, |
| "loss": 0.06942348182201385, |
| "mean_token_accuracy": 0.9832926243543625, |
| "num_tokens": 7776809.0, |
| "step": 882 |
| }, |
| { |
| "entropy": 0.9155485332012177, |
| "epoch": 3.1882352941176473, |
| "grad_norm": 0.49946412444114685, |
| "learning_rate": 0.00014332929195311997, |
| "loss": 0.05635019764304161, |
| "mean_token_accuracy": 0.9822289496660233, |
| "num_tokens": 7786610.0, |
| "step": 883 |
| }, |
| { |
| "entropy": 1.0032794624567032, |
| "epoch": 3.1918552036199097, |
| "grad_norm": 0.508366584777832, |
| "learning_rate": 0.00014320704099312053, |
| "loss": 0.0505295991897583, |
| "mean_token_accuracy": 0.9848868101835251, |
| "num_tokens": 7795865.0, |
| "step": 884 |
| }, |
| { |
| "entropy": 1.02556973695755, |
| "epoch": 3.195475113122172, |
| "grad_norm": 0.5340198278427124, |
| "learning_rate": 0.00014308471906602824, |
| "loss": 0.04925612732768059, |
| "mean_token_accuracy": 0.9854798167943954, |
| "num_tokens": 7804601.0, |
| "step": 885 |
| }, |
| { |
| "entropy": 0.9870988875627518, |
| "epoch": 3.1990950226244346, |
| "grad_norm": 0.5888339877128601, |
| "learning_rate": 0.00014296232643325836, |
| "loss": 0.0761546716094017, |
| "mean_token_accuracy": 0.9741184711456299, |
| "num_tokens": 7813665.0, |
| "step": 886 |
| }, |
| { |
| "entropy": 1.003835067152977, |
| "epoch": 3.202714932126697, |
| "grad_norm": 0.5220237970352173, |
| "learning_rate": 0.00014283986335637743, |
| "loss": 0.07831569761037827, |
| "mean_token_accuracy": 0.9766727834939957, |
| "num_tokens": 7822827.0, |
| "step": 887 |
| }, |
| { |
| "entropy": 0.994221106171608, |
| "epoch": 3.2063348416289594, |
| "grad_norm": 0.38538306951522827, |
| "learning_rate": 0.00014271733009710245, |
| "loss": 0.04108966886997223, |
| "mean_token_accuracy": 0.9864413440227509, |
| "num_tokens": 7832062.0, |
| "step": 888 |
| }, |
| { |
| "entropy": 1.0374914705753326, |
| "epoch": 3.209954751131222, |
| "grad_norm": 0.49359604716300964, |
| "learning_rate": 0.0001425947269173006, |
| "loss": 0.07969976961612701, |
| "mean_token_accuracy": 0.9739227145910263, |
| "num_tokens": 7840683.0, |
| "step": 889 |
| }, |
| { |
| "entropy": 1.0366221368312836, |
| "epoch": 3.2135746606334843, |
| "grad_norm": 0.6392092704772949, |
| "learning_rate": 0.00014247205407898813, |
| "loss": 0.07518687844276428, |
| "mean_token_accuracy": 0.9690025746822357, |
| "num_tokens": 7849354.0, |
| "step": 890 |
| }, |
| { |
| "entropy": 1.0584574043750763, |
| "epoch": 3.2171945701357467, |
| "grad_norm": 0.5655986666679382, |
| "learning_rate": 0.0001423493118443305, |
| "loss": 0.05813714116811752, |
| "mean_token_accuracy": 0.9855156242847443, |
| "num_tokens": 7857837.0, |
| "step": 891 |
| }, |
| { |
| "entropy": 1.0073893517255783, |
| "epoch": 3.220814479638009, |
| "grad_norm": 0.5855907201766968, |
| "learning_rate": 0.00014222650047564128, |
| "loss": 0.060413941740989685, |
| "mean_token_accuracy": 0.983014851808548, |
| "num_tokens": 7866804.0, |
| "step": 892 |
| }, |
| { |
| "entropy": 1.0234777629375458, |
| "epoch": 3.2244343891402716, |
| "grad_norm": 0.5941819548606873, |
| "learning_rate": 0.00014210362023538194, |
| "loss": 0.07761058211326599, |
| "mean_token_accuracy": 0.977308601140976, |
| "num_tokens": 7875824.0, |
| "step": 893 |
| }, |
| { |
| "entropy": 0.985499694943428, |
| "epoch": 3.228054298642534, |
| "grad_norm": 0.3823990523815155, |
| "learning_rate": 0.00014198067138616096, |
| "loss": 0.03702878952026367, |
| "mean_token_accuracy": 0.9904143661260605, |
| "num_tokens": 7884781.0, |
| "step": 894 |
| }, |
| { |
| "entropy": 1.0092906057834625, |
| "epoch": 3.2316742081447964, |
| "grad_norm": 0.6509421467781067, |
| "learning_rate": 0.00014185765419073352, |
| "loss": 0.07195041328668594, |
| "mean_token_accuracy": 0.9753110408782959, |
| "num_tokens": 7894018.0, |
| "step": 895 |
| }, |
| { |
| "entropy": 1.0174790173768997, |
| "epoch": 3.235294117647059, |
| "grad_norm": 0.5773764848709106, |
| "learning_rate": 0.00014173456891200097, |
| "loss": 0.0730120837688446, |
| "mean_token_accuracy": 0.9764125943183899, |
| "num_tokens": 7903077.0, |
| "step": 896 |
| }, |
| { |
| "entropy": 1.0282800495624542, |
| "epoch": 3.2389140271493213, |
| "grad_norm": 0.46587687730789185, |
| "learning_rate": 0.00014161141581300993, |
| "loss": 0.022020503878593445, |
| "mean_token_accuracy": 0.9938695281744003, |
| "num_tokens": 7911650.0, |
| "step": 897 |
| }, |
| { |
| "entropy": 0.9804821610450745, |
| "epoch": 3.2425339366515837, |
| "grad_norm": 0.571721076965332, |
| "learning_rate": 0.00014148819515695226, |
| "loss": 0.07927205413579941, |
| "mean_token_accuracy": 0.9796456694602966, |
| "num_tokens": 7921082.0, |
| "step": 898 |
| }, |
| { |
| "entropy": 1.031940370798111, |
| "epoch": 3.246153846153846, |
| "grad_norm": 0.49029776453971863, |
| "learning_rate": 0.0001413649072071639, |
| "loss": 0.02996247261762619, |
| "mean_token_accuracy": 0.9895152151584625, |
| "num_tokens": 7929219.0, |
| "step": 899 |
| }, |
| { |
| "entropy": 0.9698035717010498, |
| "epoch": 3.2497737556561086, |
| "grad_norm": 0.7558808922767639, |
| "learning_rate": 0.00014124155222712477, |
| "loss": 0.07464667409658432, |
| "mean_token_accuracy": 0.9750736951828003, |
| "num_tokens": 7938458.0, |
| "step": 900 |
| }, |
| { |
| "entropy": 1.0520759522914886, |
| "epoch": 3.253393665158371, |
| "grad_norm": 0.6176110506057739, |
| "learning_rate": 0.00014111813048045804, |
| "loss": 0.08047696202993393, |
| "mean_token_accuracy": 0.9749610275030136, |
| "num_tokens": 7947000.0, |
| "step": 901 |
| }, |
| { |
| "entropy": 1.073697492480278, |
| "epoch": 3.2570135746606335, |
| "grad_norm": 0.9040740132331848, |
| "learning_rate": 0.00014099464223092951, |
| "loss": 0.09678997844457626, |
| "mean_token_accuracy": 0.9677305668592453, |
| "num_tokens": 7955392.0, |
| "step": 902 |
| }, |
| { |
| "entropy": 0.9997714757919312, |
| "epoch": 3.260633484162896, |
| "grad_norm": 0.4019973576068878, |
| "learning_rate": 0.00014087108774244714, |
| "loss": 0.05037511885166168, |
| "mean_token_accuracy": 0.9856746792793274, |
| "num_tokens": 7963894.0, |
| "step": 903 |
| }, |
| { |
| "entropy": 1.0553741455078125, |
| "epoch": 3.2642533936651583, |
| "grad_norm": 0.49038296937942505, |
| "learning_rate": 0.00014074746727906046, |
| "loss": 0.05171579495072365, |
| "mean_token_accuracy": 0.9860897809267044, |
| "num_tokens": 7972712.0, |
| "step": 904 |
| }, |
| { |
| "entropy": 0.9860815703868866, |
| "epoch": 3.2678733031674208, |
| "grad_norm": 0.4992769658565521, |
| "learning_rate": 0.00014062378110495989, |
| "loss": 0.05135425552725792, |
| "mean_token_accuracy": 0.9861588776111603, |
| "num_tokens": 7981964.0, |
| "step": 905 |
| }, |
| { |
| "entropy": 1.0063879191875458, |
| "epoch": 3.271493212669683, |
| "grad_norm": 0.5779732465744019, |
| "learning_rate": 0.00014050002948447644, |
| "loss": 0.07176823914051056, |
| "mean_token_accuracy": 0.9783128350973129, |
| "num_tokens": 7990478.0, |
| "step": 906 |
| }, |
| { |
| "entropy": 1.0536756813526154, |
| "epoch": 3.2751131221719456, |
| "grad_norm": 0.35298392176628113, |
| "learning_rate": 0.00014037621268208093, |
| "loss": 0.0257625300437212, |
| "mean_token_accuracy": 0.9923644959926605, |
| "num_tokens": 7999213.0, |
| "step": 907 |
| }, |
| { |
| "entropy": 1.0033773183822632, |
| "epoch": 3.278733031674208, |
| "grad_norm": 0.5744296312332153, |
| "learning_rate": 0.00014025233096238337, |
| "loss": 0.05902718007564545, |
| "mean_token_accuracy": 0.9848368018865585, |
| "num_tokens": 8008151.0, |
| "step": 908 |
| }, |
| { |
| "entropy": 1.0164627432823181, |
| "epoch": 3.2823529411764705, |
| "grad_norm": 0.625365138053894, |
| "learning_rate": 0.0001401283845901327, |
| "loss": 0.05885080248117447, |
| "mean_token_accuracy": 0.9817720204591751, |
| "num_tokens": 8016336.0, |
| "step": 909 |
| }, |
| { |
| "entropy": 0.9507493227720261, |
| "epoch": 3.285972850678733, |
| "grad_norm": 0.5023903250694275, |
| "learning_rate": 0.00014000437383021586, |
| "loss": 0.04318719357252121, |
| "mean_token_accuracy": 0.9899467080831528, |
| "num_tokens": 8025425.0, |
| "step": 910 |
| }, |
| { |
| "entropy": 0.9879760593175888, |
| "epoch": 3.2895927601809953, |
| "grad_norm": 0.629055380821228, |
| "learning_rate": 0.00013988029894765748, |
| "loss": 0.08269868046045303, |
| "mean_token_accuracy": 0.9814896881580353, |
| "num_tokens": 8034580.0, |
| "step": 911 |
| }, |
| { |
| "entropy": 1.0588575601577759, |
| "epoch": 3.2932126696832578, |
| "grad_norm": 0.6117694973945618, |
| "learning_rate": 0.00013975616020761922, |
| "loss": 0.06348450481891632, |
| "mean_token_accuracy": 0.9816050231456757, |
| "num_tokens": 8042841.0, |
| "step": 912 |
| }, |
| { |
| "entropy": 1.0400816798210144, |
| "epoch": 3.29683257918552, |
| "grad_norm": 0.5118631720542908, |
| "learning_rate": 0.0001396319578753992, |
| "loss": 0.040266815572977066, |
| "mean_token_accuracy": 0.9846054464578629, |
| "num_tokens": 8051443.0, |
| "step": 913 |
| }, |
| { |
| "entropy": 1.1316498517990112, |
| "epoch": 3.3004524886877826, |
| "grad_norm": 0.7367774248123169, |
| "learning_rate": 0.0001395076922164314, |
| "loss": 0.08550389111042023, |
| "mean_token_accuracy": 0.9712842255830765, |
| "num_tokens": 8059492.0, |
| "step": 914 |
| }, |
| { |
| "entropy": 1.084427922964096, |
| "epoch": 3.304072398190045, |
| "grad_norm": 0.7586991786956787, |
| "learning_rate": 0.00013938336349628524, |
| "loss": 0.06349455565214157, |
| "mean_token_accuracy": 0.9829322099685669, |
| "num_tokens": 8067797.0, |
| "step": 915 |
| }, |
| { |
| "entropy": 1.0688078999519348, |
| "epoch": 3.3076923076923075, |
| "grad_norm": 0.5953513383865356, |
| "learning_rate": 0.0001392589719806648, |
| "loss": 0.0380900502204895, |
| "mean_token_accuracy": 0.9876203536987305, |
| "num_tokens": 8075988.0, |
| "step": 916 |
| }, |
| { |
| "entropy": 1.0175382196903229, |
| "epoch": 3.31131221719457, |
| "grad_norm": 0.5219187140464783, |
| "learning_rate": 0.00013913451793540844, |
| "loss": 0.026124723255634308, |
| "mean_token_accuracy": 0.9945862740278244, |
| "num_tokens": 8084620.0, |
| "step": 917 |
| }, |
| { |
| "entropy": 1.0590780079364777, |
| "epoch": 3.3149321266968323, |
| "grad_norm": 0.4491603672504425, |
| "learning_rate": 0.0001390100016264881, |
| "loss": 0.051901232451200485, |
| "mean_token_accuracy": 0.9856987297534943, |
| "num_tokens": 8093124.0, |
| "step": 918 |
| }, |
| { |
| "entropy": 0.9357586652040482, |
| "epoch": 3.318552036199095, |
| "grad_norm": 0.5711967945098877, |
| "learning_rate": 0.00013888542332000882, |
| "loss": 0.05333176627755165, |
| "mean_token_accuracy": 0.9833570569753647, |
| "num_tokens": 8102615.0, |
| "step": 919 |
| }, |
| { |
| "entropy": 1.0148942023515701, |
| "epoch": 3.3221719457013577, |
| "grad_norm": 0.6172460913658142, |
| "learning_rate": 0.0001387607832822081, |
| "loss": 0.07704110443592072, |
| "mean_token_accuracy": 0.9769087731838226, |
| "num_tokens": 8111712.0, |
| "step": 920 |
| }, |
| { |
| "entropy": 1.005068227648735, |
| "epoch": 3.32579185520362, |
| "grad_norm": 0.48708808422088623, |
| "learning_rate": 0.0001386360817794554, |
| "loss": 0.057994648814201355, |
| "mean_token_accuracy": 0.9857281446456909, |
| "num_tokens": 8120661.0, |
| "step": 921 |
| }, |
| { |
| "entropy": 1.0857775509357452, |
| "epoch": 3.3294117647058825, |
| "grad_norm": 0.521118700504303, |
| "learning_rate": 0.00013851131907825152, |
| "loss": 0.0432361476123333, |
| "mean_token_accuracy": 0.9871759116649628, |
| "num_tokens": 8128962.0, |
| "step": 922 |
| }, |
| { |
| "entropy": 0.9878742694854736, |
| "epoch": 3.333031674208145, |
| "grad_norm": 0.5457652807235718, |
| "learning_rate": 0.00013838649544522803, |
| "loss": 0.057160355150699615, |
| "mean_token_accuracy": 0.9802255481481552, |
| "num_tokens": 8138079.0, |
| "step": 923 |
| }, |
| { |
| "entropy": 0.9153740406036377, |
| "epoch": 3.3366515837104074, |
| "grad_norm": 0.6113324761390686, |
| "learning_rate": 0.00013826161114714682, |
| "loss": 0.05363360047340393, |
| "mean_token_accuracy": 0.9866549521684647, |
| "num_tokens": 8147296.0, |
| "step": 924 |
| }, |
| { |
| "entropy": 0.9832455366849899, |
| "epoch": 3.34027149321267, |
| "grad_norm": 0.7847388386726379, |
| "learning_rate": 0.00013813666645089926, |
| "loss": 0.10442715883255005, |
| "mean_token_accuracy": 0.9688615947961807, |
| "num_tokens": 8156139.0, |
| "step": 925 |
| }, |
| { |
| "entropy": 1.0013352185487747, |
| "epoch": 3.3438914027149322, |
| "grad_norm": 0.4540573060512543, |
| "learning_rate": 0.0001380116616235059, |
| "loss": 0.04038259759545326, |
| "mean_token_accuracy": 0.9876029193401337, |
| "num_tokens": 8164879.0, |
| "step": 926 |
| }, |
| { |
| "entropy": 0.9603820294141769, |
| "epoch": 3.3475113122171947, |
| "grad_norm": 0.5983673930168152, |
| "learning_rate": 0.00013788659693211584, |
| "loss": 0.08187659829854965, |
| "mean_token_accuracy": 0.9774649292230606, |
| "num_tokens": 8174389.0, |
| "step": 927 |
| }, |
| { |
| "entropy": 1.0243980884552002, |
| "epoch": 3.351131221719457, |
| "grad_norm": 0.6228734850883484, |
| "learning_rate": 0.000137761472644006, |
| "loss": 0.048151057213544846, |
| "mean_token_accuracy": 0.9839256554841995, |
| "num_tokens": 8182842.0, |
| "step": 928 |
| }, |
| { |
| "entropy": 0.9616681337356567, |
| "epoch": 3.3547511312217195, |
| "grad_norm": 0.5719891786575317, |
| "learning_rate": 0.00013763628902658075, |
| "loss": 0.03624703735113144, |
| "mean_token_accuracy": 0.9900500476360321, |
| "num_tokens": 8192132.0, |
| "step": 929 |
| }, |
| { |
| "entropy": 1.0459087640047073, |
| "epoch": 3.358371040723982, |
| "grad_norm": 0.6618694067001343, |
| "learning_rate": 0.0001375110463473712, |
| "loss": 0.06819941848516464, |
| "mean_token_accuracy": 0.9793660789728165, |
| "num_tokens": 8200462.0, |
| "step": 930 |
| }, |
| { |
| "entropy": 0.9984315633773804, |
| "epoch": 3.3619909502262444, |
| "grad_norm": 0.4620678722858429, |
| "learning_rate": 0.00013738574487403475, |
| "loss": 0.034341007471084595, |
| "mean_token_accuracy": 0.9875331819057465, |
| "num_tokens": 8209284.0, |
| "step": 931 |
| }, |
| { |
| "entropy": 1.0406230092048645, |
| "epoch": 3.365610859728507, |
| "grad_norm": 0.5373427867889404, |
| "learning_rate": 0.00013726038487435436, |
| "loss": 0.043672651052474976, |
| "mean_token_accuracy": 0.9890551418066025, |
| "num_tokens": 8217959.0, |
| "step": 932 |
| }, |
| { |
| "entropy": 0.9317538440227509, |
| "epoch": 3.3692307692307693, |
| "grad_norm": 0.48770636320114136, |
| "learning_rate": 0.00013713496661623816, |
| "loss": 0.0499286986887455, |
| "mean_token_accuracy": 0.9822133630514145, |
| "num_tokens": 8227583.0, |
| "step": 933 |
| }, |
| { |
| "entropy": 0.9775789231061935, |
| "epoch": 3.3728506787330317, |
| "grad_norm": 0.5296841263771057, |
| "learning_rate": 0.00013700949036771874, |
| "loss": 0.056918881833553314, |
| "mean_token_accuracy": 0.9848699420690536, |
| "num_tokens": 8237155.0, |
| "step": 934 |
| }, |
| { |
| "entropy": 1.0716052204370499, |
| "epoch": 3.376470588235294, |
| "grad_norm": 0.6076134443283081, |
| "learning_rate": 0.00013688395639695252, |
| "loss": 0.05968927592039108, |
| "mean_token_accuracy": 0.9806047230958939, |
| "num_tokens": 8245583.0, |
| "step": 935 |
| }, |
| { |
| "entropy": 1.0416816025972366, |
| "epoch": 3.3800904977375565, |
| "grad_norm": 0.45776641368865967, |
| "learning_rate": 0.00013675836497221953, |
| "loss": 0.04919914901256561, |
| "mean_token_accuracy": 0.9881732165813446, |
| "num_tokens": 8254310.0, |
| "step": 936 |
| }, |
| { |
| "entropy": 0.9982000887393951, |
| "epoch": 3.383710407239819, |
| "grad_norm": 0.43318697810173035, |
| "learning_rate": 0.00013663271636192234, |
| "loss": 0.034604959189891815, |
| "mean_token_accuracy": 0.9873620271682739, |
| "num_tokens": 8263199.0, |
| "step": 937 |
| }, |
| { |
| "entropy": 1.0052271336317062, |
| "epoch": 3.3873303167420814, |
| "grad_norm": 0.44714435935020447, |
| "learning_rate": 0.00013650701083458585, |
| "loss": 0.08507149666547775, |
| "mean_token_accuracy": 0.9845927953720093, |
| "num_tokens": 8272493.0, |
| "step": 938 |
| }, |
| { |
| "entropy": 1.0300282388925552, |
| "epoch": 3.390950226244344, |
| "grad_norm": 0.3920552730560303, |
| "learning_rate": 0.0001363812486588566, |
| "loss": 0.0559050627052784, |
| "mean_token_accuracy": 0.9840603917837143, |
| "num_tokens": 8281216.0, |
| "step": 939 |
| }, |
| { |
| "entropy": 1.0108753889799118, |
| "epoch": 3.3945701357466063, |
| "grad_norm": 0.5175376534461975, |
| "learning_rate": 0.0001362554301035021, |
| "loss": 0.056878913193941116, |
| "mean_token_accuracy": 0.9844557046890259, |
| "num_tokens": 8290001.0, |
| "step": 940 |
| }, |
| { |
| "entropy": 1.0616537183523178, |
| "epoch": 3.3981900452488687, |
| "grad_norm": 0.5512950420379639, |
| "learning_rate": 0.0001361295554374105, |
| "loss": 0.05054660886526108, |
| "mean_token_accuracy": 0.9832082837820053, |
| "num_tokens": 8298553.0, |
| "step": 941 |
| }, |
| { |
| "entropy": 1.1058834791183472, |
| "epoch": 3.401809954751131, |
| "grad_norm": 0.4575974941253662, |
| "learning_rate": 0.00013600362492958976, |
| "loss": 0.0616161935031414, |
| "mean_token_accuracy": 0.9793710261583328, |
| "num_tokens": 8307283.0, |
| "step": 942 |
| }, |
| { |
| "entropy": 1.0524688065052032, |
| "epoch": 3.4054298642533936, |
| "grad_norm": 0.4527650773525238, |
| "learning_rate": 0.00013587763884916716, |
| "loss": 0.045646894723176956, |
| "mean_token_accuracy": 0.9848221987485886, |
| "num_tokens": 8316116.0, |
| "step": 943 |
| }, |
| { |
| "entropy": 1.0609974563121796, |
| "epoch": 3.409049773755656, |
| "grad_norm": 0.6199139952659607, |
| "learning_rate": 0.0001357515974653888, |
| "loss": 0.10065983235836029, |
| "mean_token_accuracy": 0.9717418551445007, |
| "num_tokens": 8324716.0, |
| "step": 944 |
| }, |
| { |
| "entropy": 0.958422839641571, |
| "epoch": 3.4126696832579184, |
| "grad_norm": 0.5813620090484619, |
| "learning_rate": 0.000135625501047619, |
| "loss": 0.07302285730838776, |
| "mean_token_accuracy": 0.9771098643541336, |
| "num_tokens": 8334091.0, |
| "step": 945 |
| }, |
| { |
| "entropy": 1.0653769075870514, |
| "epoch": 3.416289592760181, |
| "grad_norm": 0.5696032047271729, |
| "learning_rate": 0.00013549934986533966, |
| "loss": 0.06528802961111069, |
| "mean_token_accuracy": 0.9849737137556076, |
| "num_tokens": 8343075.0, |
| "step": 946 |
| }, |
| { |
| "entropy": 1.0103659331798553, |
| "epoch": 3.4199095022624433, |
| "grad_norm": 0.36156147718429565, |
| "learning_rate": 0.0001353731441881496, |
| "loss": 0.041538454592227936, |
| "mean_token_accuracy": 0.9860167354345322, |
| "num_tokens": 8352196.0, |
| "step": 947 |
| }, |
| { |
| "entropy": 0.9813169538974762, |
| "epoch": 3.4235294117647057, |
| "grad_norm": 0.465168833732605, |
| "learning_rate": 0.00013524688428576435, |
| "loss": 0.0380837507545948, |
| "mean_token_accuracy": 0.9855641424655914, |
| "num_tokens": 8361142.0, |
| "step": 948 |
| }, |
| { |
| "entropy": 0.9958767145872116, |
| "epoch": 3.427149321266968, |
| "grad_norm": 0.3552955090999603, |
| "learning_rate": 0.0001351205704280151, |
| "loss": 0.026597965508699417, |
| "mean_token_accuracy": 0.9929200410842896, |
| "num_tokens": 8370017.0, |
| "step": 949 |
| }, |
| { |
| "entropy": 0.970636785030365, |
| "epoch": 3.430769230769231, |
| "grad_norm": 0.5098865032196045, |
| "learning_rate": 0.00013499420288484842, |
| "loss": 0.03246006369590759, |
| "mean_token_accuracy": 0.9889640510082245, |
| "num_tokens": 8379008.0, |
| "step": 950 |
| }, |
| { |
| "entropy": 1.012205347418785, |
| "epoch": 3.4343891402714934, |
| "grad_norm": 0.7393912672996521, |
| "learning_rate": 0.00013486778192632574, |
| "loss": 0.07227301597595215, |
| "mean_token_accuracy": 0.977458193898201, |
| "num_tokens": 8387896.0, |
| "step": 951 |
| }, |
| { |
| "entropy": 0.9507156610488892, |
| "epoch": 3.438009049773756, |
| "grad_norm": 0.6013923287391663, |
| "learning_rate": 0.0001347413078226224, |
| "loss": 0.0642247200012207, |
| "mean_token_accuracy": 0.9781456738710403, |
| "num_tokens": 8396889.0, |
| "step": 952 |
| }, |
| { |
| "entropy": 0.9904819428920746, |
| "epoch": 3.4416289592760183, |
| "grad_norm": 0.5121738314628601, |
| "learning_rate": 0.00013461478084402745, |
| "loss": 0.04460640996694565, |
| "mean_token_accuracy": 0.9875061810016632, |
| "num_tokens": 8405690.0, |
| "step": 953 |
| }, |
| { |
| "entropy": 0.9784403592348099, |
| "epoch": 3.4452488687782807, |
| "grad_norm": 0.43736669421195984, |
| "learning_rate": 0.00013448820126094307, |
| "loss": 0.038092780858278275, |
| "mean_token_accuracy": 0.9895127415657043, |
| "num_tokens": 8414500.0, |
| "step": 954 |
| }, |
| { |
| "entropy": 0.9568644464015961, |
| "epoch": 3.448868778280543, |
| "grad_norm": 0.3654727637767792, |
| "learning_rate": 0.0001343615693438836, |
| "loss": 0.029216358438134193, |
| "mean_token_accuracy": 0.9893685132265091, |
| "num_tokens": 8423326.0, |
| "step": 955 |
| }, |
| { |
| "entropy": 0.927936390042305, |
| "epoch": 3.4524886877828056, |
| "grad_norm": 0.4584848880767822, |
| "learning_rate": 0.0001342348853634754, |
| "loss": 0.04364006593823433, |
| "mean_token_accuracy": 0.9881165325641632, |
| "num_tokens": 8432485.0, |
| "step": 956 |
| }, |
| { |
| "entropy": 0.9923946708440781, |
| "epoch": 3.456108597285068, |
| "grad_norm": 0.7116801142692566, |
| "learning_rate": 0.00013410814959045607, |
| "loss": 0.10070855170488358, |
| "mean_token_accuracy": 0.9667374640703201, |
| "num_tokens": 8441344.0, |
| "step": 957 |
| }, |
| { |
| "entropy": 0.9566802680492401, |
| "epoch": 3.4597285067873305, |
| "grad_norm": 0.5868293046951294, |
| "learning_rate": 0.00013398136229567383, |
| "loss": 0.04937519505620003, |
| "mean_token_accuracy": 0.982713058590889, |
| "num_tokens": 8450202.0, |
| "step": 958 |
| }, |
| { |
| "entropy": 0.9900821000337601, |
| "epoch": 3.463348416289593, |
| "grad_norm": 0.4907667338848114, |
| "learning_rate": 0.00013385452375008704, |
| "loss": 0.052656762301921844, |
| "mean_token_accuracy": 0.9915094673633575, |
| "num_tokens": 8459117.0, |
| "step": 959 |
| }, |
| { |
| "entropy": 0.926659345626831, |
| "epoch": 3.4669683257918553, |
| "grad_norm": 0.5979401469230652, |
| "learning_rate": 0.00013372763422476365, |
| "loss": 0.06121910735964775, |
| "mean_token_accuracy": 0.9823627024888992, |
| "num_tokens": 8468190.0, |
| "step": 960 |
| }, |
| { |
| "entropy": 0.9869116097688675, |
| "epoch": 3.4705882352941178, |
| "grad_norm": 0.47893184423446655, |
| "learning_rate": 0.00013360069399088044, |
| "loss": 0.06325532495975494, |
| "mean_token_accuracy": 0.982891395688057, |
| "num_tokens": 8477414.0, |
| "step": 961 |
| }, |
| { |
| "entropy": 0.9895572513341904, |
| "epoch": 3.47420814479638, |
| "grad_norm": 0.6157852411270142, |
| "learning_rate": 0.00013347370331972272, |
| "loss": 0.06221451610326767, |
| "mean_token_accuracy": 0.9826388210058212, |
| "num_tokens": 8486453.0, |
| "step": 962 |
| }, |
| { |
| "entropy": 0.9911331236362457, |
| "epoch": 3.4778280542986426, |
| "grad_norm": 0.540799617767334, |
| "learning_rate": 0.0001333466624826834, |
| "loss": 0.09207938611507416, |
| "mean_token_accuracy": 0.9749413877725601, |
| "num_tokens": 8495741.0, |
| "step": 963 |
| }, |
| { |
| "entropy": 0.9546624422073364, |
| "epoch": 3.481447963800905, |
| "grad_norm": 0.7074460387229919, |
| "learning_rate": 0.0001332195717512628, |
| "loss": 0.05557447671890259, |
| "mean_token_accuracy": 0.9830146133899689, |
| "num_tokens": 8505116.0, |
| "step": 964 |
| }, |
| { |
| "entropy": 0.980869710445404, |
| "epoch": 3.4850678733031675, |
| "grad_norm": 0.49514511227607727, |
| "learning_rate": 0.00013309243139706772, |
| "loss": 0.04955790191888809, |
| "mean_token_accuracy": 0.9866456240415573, |
| "num_tokens": 8514060.0, |
| "step": 965 |
| }, |
| { |
| "entropy": 1.0564606338739395, |
| "epoch": 3.48868778280543, |
| "grad_norm": 0.6254670023918152, |
| "learning_rate": 0.00013296524169181107, |
| "loss": 0.060992419719696045, |
| "mean_token_accuracy": 0.9791516810655594, |
| "num_tokens": 8522527.0, |
| "step": 966 |
| }, |
| { |
| "entropy": 1.0110660940408707, |
| "epoch": 3.4923076923076923, |
| "grad_norm": 0.49232155084609985, |
| "learning_rate": 0.00013283800290731114, |
| "loss": 0.065431147813797, |
| "mean_token_accuracy": 0.9852920174598694, |
| "num_tokens": 8531494.0, |
| "step": 967 |
| }, |
| { |
| "entropy": 1.02008418738842, |
| "epoch": 3.4959276018099548, |
| "grad_norm": 0.5096694827079773, |
| "learning_rate": 0.0001327107153154913, |
| "loss": 0.06423597782850266, |
| "mean_token_accuracy": 0.9796927273273468, |
| "num_tokens": 8540545.0, |
| "step": 968 |
| }, |
| { |
| "entropy": 1.0431571304798126, |
| "epoch": 3.499547511312217, |
| "grad_norm": 0.5744512677192688, |
| "learning_rate": 0.00013258337918837905, |
| "loss": 0.07912938296794891, |
| "mean_token_accuracy": 0.980916902422905, |
| "num_tokens": 8549127.0, |
| "step": 969 |
| }, |
| { |
| "entropy": 0.9832931458950043, |
| "epoch": 3.5031674208144796, |
| "grad_norm": 0.5376607775688171, |
| "learning_rate": 0.00013245599479810564, |
| "loss": 0.05653414875268936, |
| "mean_token_accuracy": 0.9854852706193924, |
| "num_tokens": 8558086.0, |
| "step": 970 |
| }, |
| { |
| "entropy": 1.053595095872879, |
| "epoch": 3.506787330316742, |
| "grad_norm": 0.6880918741226196, |
| "learning_rate": 0.00013232856241690555, |
| "loss": 0.10130049288272858, |
| "mean_token_accuracy": 0.9788567274808884, |
| "num_tokens": 8566974.0, |
| "step": 971 |
| }, |
| { |
| "entropy": 1.090879499912262, |
| "epoch": 3.5104072398190045, |
| "grad_norm": 0.5793285965919495, |
| "learning_rate": 0.0001322010823171158, |
| "loss": 0.11286645382642746, |
| "mean_token_accuracy": 0.9682568907737732, |
| "num_tokens": 8575648.0, |
| "step": 972 |
| }, |
| { |
| "entropy": 1.0895802229642868, |
| "epoch": 3.514027149321267, |
| "grad_norm": 0.45659247040748596, |
| "learning_rate": 0.00013207355477117534, |
| "loss": 0.04402415081858635, |
| "mean_token_accuracy": 0.9873204827308655, |
| "num_tokens": 8584145.0, |
| "step": 973 |
| }, |
| { |
| "entropy": 1.0633756816387177, |
| "epoch": 3.5176470588235293, |
| "grad_norm": 0.35166093707084656, |
| "learning_rate": 0.00013194598005162447, |
| "loss": 0.028780082240700722, |
| "mean_token_accuracy": 0.9902430325746536, |
| "num_tokens": 8592757.0, |
| "step": 974 |
| }, |
| { |
| "entropy": 0.9252981543540955, |
| "epoch": 3.521266968325792, |
| "grad_norm": 0.3973873257637024, |
| "learning_rate": 0.00013181835843110448, |
| "loss": 0.04340490698814392, |
| "mean_token_accuracy": 0.9855764210224152, |
| "num_tokens": 8602304.0, |
| "step": 975 |
| }, |
| { |
| "entropy": 0.9904424697160721, |
| "epoch": 3.524886877828054, |
| "grad_norm": 0.5329234600067139, |
| "learning_rate": 0.0001316906901823567, |
| "loss": 0.07568858563899994, |
| "mean_token_accuracy": 0.9819598495960236, |
| "num_tokens": 8611730.0, |
| "step": 976 |
| }, |
| { |
| "entropy": 0.9927194565534592, |
| "epoch": 3.5285067873303166, |
| "grad_norm": 0.5585746169090271, |
| "learning_rate": 0.00013156297557822224, |
| "loss": 0.06918197870254517, |
| "mean_token_accuracy": 0.9821109473705292, |
| "num_tokens": 8620786.0, |
| "step": 977 |
| }, |
| { |
| "entropy": 0.9288990050554276, |
| "epoch": 3.532126696832579, |
| "grad_norm": 0.530066728591919, |
| "learning_rate": 0.00013143521489164124, |
| "loss": 0.06363017857074738, |
| "mean_token_accuracy": 0.9863996803760529, |
| "num_tokens": 8630041.0, |
| "step": 978 |
| }, |
| { |
| "entropy": 0.9934940189123154, |
| "epoch": 3.5357466063348415, |
| "grad_norm": 0.6356056928634644, |
| "learning_rate": 0.00013130740839565228, |
| "loss": 0.11892964690923691, |
| "mean_token_accuracy": 0.9646638482809067, |
| "num_tokens": 8639083.0, |
| "step": 979 |
| }, |
| { |
| "entropy": 1.0539715886116028, |
| "epoch": 3.539366515837104, |
| "grad_norm": 0.6019116044044495, |
| "learning_rate": 0.0001311795563633919, |
| "loss": 0.05613447353243828, |
| "mean_token_accuracy": 0.981610581278801, |
| "num_tokens": 8647585.0, |
| "step": 980 |
| }, |
| { |
| "entropy": 0.9783657044172287, |
| "epoch": 3.5429864253393664, |
| "grad_norm": 0.5022520422935486, |
| "learning_rate": 0.00013105165906809394, |
| "loss": 0.06036650761961937, |
| "mean_token_accuracy": 0.9770310521125793, |
| "num_tokens": 8656253.0, |
| "step": 981 |
| }, |
| { |
| "entropy": 0.9798354804515839, |
| "epoch": 3.546606334841629, |
| "grad_norm": 0.45951077342033386, |
| "learning_rate": 0.00013092371678308896, |
| "loss": 0.04008646309375763, |
| "mean_token_accuracy": 0.9884042888879776, |
| "num_tokens": 8664814.0, |
| "step": 982 |
| }, |
| { |
| "entropy": 1.0052898228168488, |
| "epoch": 3.5502262443438912, |
| "grad_norm": 0.9089862108230591, |
| "learning_rate": 0.0001307957297818036, |
| "loss": 0.07838708907365799, |
| "mean_token_accuracy": 0.971778929233551, |
| "num_tokens": 8672936.0, |
| "step": 983 |
| }, |
| { |
| "entropy": 0.9371594786643982, |
| "epoch": 3.5538461538461537, |
| "grad_norm": 0.517698347568512, |
| "learning_rate": 0.00013066769833776026, |
| "loss": 0.052979908883571625, |
| "mean_token_accuracy": 0.9832516461610794, |
| "num_tokens": 8682346.0, |
| "step": 984 |
| }, |
| { |
| "entropy": 0.9779858440160751, |
| "epoch": 3.557466063348416, |
| "grad_norm": 0.47408440709114075, |
| "learning_rate": 0.00013053962272457613, |
| "loss": 0.048960305750370026, |
| "mean_token_accuracy": 0.9822594523429871, |
| "num_tokens": 8690870.0, |
| "step": 985 |
| }, |
| { |
| "entropy": 0.915733277797699, |
| "epoch": 3.5610859728506785, |
| "grad_norm": 0.6086940765380859, |
| "learning_rate": 0.00013041150321596286, |
| "loss": 0.07182019203901291, |
| "mean_token_accuracy": 0.9770045280456543, |
| "num_tokens": 8699907.0, |
| "step": 986 |
| }, |
| { |
| "entropy": 0.9649381190538406, |
| "epoch": 3.564705882352941, |
| "grad_norm": 0.5936797261238098, |
| "learning_rate": 0.00013028334008572588, |
| "loss": 0.06510870158672333, |
| "mean_token_accuracy": 0.9784034192562103, |
| "num_tokens": 8708771.0, |
| "step": 987 |
| }, |
| { |
| "entropy": 0.9736950397491455, |
| "epoch": 3.5683257918552034, |
| "grad_norm": 0.6395031809806824, |
| "learning_rate": 0.00013015513360776392, |
| "loss": 0.09703779220581055, |
| "mean_token_accuracy": 0.9763300269842148, |
| "num_tokens": 8717543.0, |
| "step": 988 |
| }, |
| { |
| "entropy": 0.9976915121078491, |
| "epoch": 3.571945701357466, |
| "grad_norm": 0.725874662399292, |
| "learning_rate": 0.00013002688405606828, |
| "loss": 0.09122011065483093, |
| "mean_token_accuracy": 0.9732316583395004, |
| "num_tokens": 8726064.0, |
| "step": 989 |
| }, |
| { |
| "entropy": 0.998377114534378, |
| "epoch": 3.5755656108597282, |
| "grad_norm": 0.6217262148857117, |
| "learning_rate": 0.0001298985917047224, |
| "loss": 0.06388919055461884, |
| "mean_token_accuracy": 0.9831690788269043, |
| "num_tokens": 8734777.0, |
| "step": 990 |
| }, |
| { |
| "entropy": 0.9755560904741287, |
| "epoch": 3.579185520361991, |
| "grad_norm": 0.5802934169769287, |
| "learning_rate": 0.000129770256827901, |
| "loss": 0.0830259919166565, |
| "mean_token_accuracy": 0.9810828566551208, |
| "num_tokens": 8743671.0, |
| "step": 991 |
| }, |
| { |
| "entropy": 0.9394370019435883, |
| "epoch": 3.5828054298642535, |
| "grad_norm": 0.4319647252559662, |
| "learning_rate": 0.00012964187969986986, |
| "loss": 0.029521089047193527, |
| "mean_token_accuracy": 0.9876839071512222, |
| "num_tokens": 8752588.0, |
| "step": 992 |
| }, |
| { |
| "entropy": 1.0086267590522766, |
| "epoch": 3.586425339366516, |
| "grad_norm": 0.44192439317703247, |
| "learning_rate": 0.00012951346059498505, |
| "loss": 0.044906750321388245, |
| "mean_token_accuracy": 0.9855604767799377, |
| "num_tokens": 8761348.0, |
| "step": 993 |
| }, |
| { |
| "entropy": 1.0500630140304565, |
| "epoch": 3.5900452488687784, |
| "grad_norm": 0.7281487584114075, |
| "learning_rate": 0.00012938499978769222, |
| "loss": 0.0938921645283699, |
| "mean_token_accuracy": 0.9799060225486755, |
| "num_tokens": 8770331.0, |
| "step": 994 |
| }, |
| { |
| "entropy": 1.1058216989040375, |
| "epoch": 3.593665158371041, |
| "grad_norm": 0.6030604243278503, |
| "learning_rate": 0.00012925649755252624, |
| "loss": 0.07762658596038818, |
| "mean_token_accuracy": 0.9789818972349167, |
| "num_tokens": 8778983.0, |
| "step": 995 |
| }, |
| { |
| "entropy": 1.0310122072696686, |
| "epoch": 3.5972850678733033, |
| "grad_norm": 0.44901198148727417, |
| "learning_rate": 0.00012912795416411056, |
| "loss": 0.04441501572728157, |
| "mean_token_accuracy": 0.9828421622514725, |
| "num_tokens": 8787524.0, |
| "step": 996 |
| }, |
| { |
| "entropy": 1.0446508675813675, |
| "epoch": 3.6009049773755657, |
| "grad_norm": 0.5148265957832336, |
| "learning_rate": 0.0001289993698971564, |
| "loss": 0.046136148273944855, |
| "mean_token_accuracy": 0.9823300242424011, |
| "num_tokens": 8796366.0, |
| "step": 997 |
| }, |
| { |
| "entropy": 1.0721295475959778, |
| "epoch": 3.604524886877828, |
| "grad_norm": 0.5323315858840942, |
| "learning_rate": 0.00012887074502646257, |
| "loss": 0.037365175783634186, |
| "mean_token_accuracy": 0.988229975104332, |
| "num_tokens": 8804941.0, |
| "step": 998 |
| }, |
| { |
| "entropy": 1.097947746515274, |
| "epoch": 3.6081447963800906, |
| "grad_norm": 0.7002111077308655, |
| "learning_rate": 0.00012874207982691447, |
| "loss": 0.07946470379829407, |
| "mean_token_accuracy": 0.9763154089450836, |
| "num_tokens": 8813676.0, |
| "step": 999 |
| }, |
| { |
| "entropy": 1.0364596843719482, |
| "epoch": 3.611764705882353, |
| "grad_norm": 0.620373547077179, |
| "learning_rate": 0.00012861337457348383, |
| "loss": 0.08317571133375168, |
| "mean_token_accuracy": 0.9791288524866104, |
| "num_tokens": 8822412.0, |
| "step": 1000 |
| }, |
| { |
| "entropy": 1.0231077075004578, |
| "epoch": 3.6153846153846154, |
| "grad_norm": 0.36507073044776917, |
| "learning_rate": 0.0001284846295412278, |
| "loss": 0.039691824465990067, |
| "mean_token_accuracy": 0.9890045672655106, |
| "num_tokens": 8831109.0, |
| "step": 1001 |
| }, |
| { |
| "entropy": 1.0747565776109695, |
| "epoch": 3.619004524886878, |
| "grad_norm": 0.5191677808761597, |
| "learning_rate": 0.00012835584500528875, |
| "loss": 0.05060931667685509, |
| "mean_token_accuracy": 0.9855721145868301, |
| "num_tokens": 8839568.0, |
| "step": 1002 |
| }, |
| { |
| "entropy": 1.0877159237861633, |
| "epoch": 3.6226244343891403, |
| "grad_norm": 0.5678452849388123, |
| "learning_rate": 0.00012822702124089337, |
| "loss": 0.05626006796956062, |
| "mean_token_accuracy": 0.97838294506073, |
| "num_tokens": 8848116.0, |
| "step": 1003 |
| }, |
| { |
| "entropy": 1.0909876823425293, |
| "epoch": 3.6262443438914027, |
| "grad_norm": 0.5667127966880798, |
| "learning_rate": 0.00012809815852335213, |
| "loss": 0.061909645795822144, |
| "mean_token_accuracy": 0.9774574041366577, |
| "num_tokens": 8856312.0, |
| "step": 1004 |
| }, |
| { |
| "entropy": 1.0426032990217209, |
| "epoch": 3.629864253393665, |
| "grad_norm": 0.4420037269592285, |
| "learning_rate": 0.00012796925712805883, |
| "loss": 0.05646451935172081, |
| "mean_token_accuracy": 0.9806726723909378, |
| "num_tokens": 8864934.0, |
| "step": 1005 |
| }, |
| { |
| "entropy": 0.9995952397584915, |
| "epoch": 3.6334841628959276, |
| "grad_norm": 0.4093227684497833, |
| "learning_rate": 0.00012784031733048992, |
| "loss": 0.04071081057190895, |
| "mean_token_accuracy": 0.9907932132482529, |
| "num_tokens": 8873915.0, |
| "step": 1006 |
| }, |
| { |
| "entropy": 0.9970235526561737, |
| "epoch": 3.63710407239819, |
| "grad_norm": 0.5725744962692261, |
| "learning_rate": 0.0001277113394062039, |
| "loss": 0.06767860054969788, |
| "mean_token_accuracy": 0.9846918284893036, |
| "num_tokens": 8882883.0, |
| "step": 1007 |
| }, |
| { |
| "entropy": 0.9805562347173691, |
| "epoch": 3.6407239819004524, |
| "grad_norm": 0.4164693057537079, |
| "learning_rate": 0.0001275823236308408, |
| "loss": 0.03698574751615524, |
| "mean_token_accuracy": 0.9875341504812241, |
| "num_tokens": 8891980.0, |
| "step": 1008 |
| }, |
| { |
| "entropy": 1.0244318395853043, |
| "epoch": 3.644343891402715, |
| "grad_norm": 0.608647882938385, |
| "learning_rate": 0.0001274532702801214, |
| "loss": 0.06296362727880478, |
| "mean_token_accuracy": 0.9802230894565582, |
| "num_tokens": 8900866.0, |
| "step": 1009 |
| }, |
| { |
| "entropy": 0.9997225403785706, |
| "epoch": 3.6479638009049773, |
| "grad_norm": 0.5184244513511658, |
| "learning_rate": 0.00012732417962984697, |
| "loss": 0.07126037776470184, |
| "mean_token_accuracy": 0.982243612408638, |
| "num_tokens": 8909823.0, |
| "step": 1010 |
| }, |
| { |
| "entropy": 1.006606712937355, |
| "epoch": 3.6515837104072397, |
| "grad_norm": 0.4548531770706177, |
| "learning_rate": 0.00012719505195589833, |
| "loss": 0.04628659039735794, |
| "mean_token_accuracy": 0.9868118911981583, |
| "num_tokens": 8918370.0, |
| "step": 1011 |
| }, |
| { |
| "entropy": 0.9653205871582031, |
| "epoch": 3.655203619909502, |
| "grad_norm": 0.6441347599029541, |
| "learning_rate": 0.0001270658875342356, |
| "loss": 0.07759731262922287, |
| "mean_token_accuracy": 0.9780310839414597, |
| "num_tokens": 8927424.0, |
| "step": 1012 |
| }, |
| { |
| "entropy": 1.0112672001123428, |
| "epoch": 3.6588235294117646, |
| "grad_norm": 0.5434349775314331, |
| "learning_rate": 0.00012693668664089724, |
| "loss": 0.06205587089061737, |
| "mean_token_accuracy": 0.9839424788951874, |
| "num_tokens": 8935733.0, |
| "step": 1013 |
| }, |
| { |
| "entropy": 0.9845980405807495, |
| "epoch": 3.662443438914027, |
| "grad_norm": 0.39817336201667786, |
| "learning_rate": 0.00012680744955199976, |
| "loss": 0.048281848430633545, |
| "mean_token_accuracy": 0.9853204637765884, |
| "num_tokens": 8944307.0, |
| "step": 1014 |
| }, |
| { |
| "entropy": 1.0732997953891754, |
| "epoch": 3.6660633484162894, |
| "grad_norm": 0.5867156982421875, |
| "learning_rate": 0.00012667817654373704, |
| "loss": 0.0651404857635498, |
| "mean_token_accuracy": 0.9803767651319504, |
| "num_tokens": 8952575.0, |
| "step": 1015 |
| }, |
| { |
| "entropy": 1.0137926042079926, |
| "epoch": 3.669683257918552, |
| "grad_norm": 0.5951899290084839, |
| "learning_rate": 0.0001265488678923797, |
| "loss": 0.09196449816226959, |
| "mean_token_accuracy": 0.9773082733154297, |
| "num_tokens": 8961217.0, |
| "step": 1016 |
| }, |
| { |
| "entropy": 1.0144722759723663, |
| "epoch": 3.6733031674208148, |
| "grad_norm": 0.599597692489624, |
| "learning_rate": 0.00012641952387427448, |
| "loss": 0.07395292818546295, |
| "mean_token_accuracy": 0.9723586440086365, |
| "num_tokens": 8969507.0, |
| "step": 1017 |
| }, |
| { |
| "entropy": 0.9949919432401657, |
| "epoch": 3.676923076923077, |
| "grad_norm": 0.5899074673652649, |
| "learning_rate": 0.0001262901447658438, |
| "loss": 0.057064566761255264, |
| "mean_token_accuracy": 0.9813538044691086, |
| "num_tokens": 8978229.0, |
| "step": 1018 |
| }, |
| { |
| "entropy": 0.9935206919908524, |
| "epoch": 3.6805429864253396, |
| "grad_norm": 0.5249025225639343, |
| "learning_rate": 0.000126160730843585, |
| "loss": 0.03331971541047096, |
| "mean_token_accuracy": 0.9901821464300156, |
| "num_tokens": 8987107.0, |
| "step": 1019 |
| }, |
| { |
| "entropy": 0.9692054241895676, |
| "epoch": 3.684162895927602, |
| "grad_norm": 0.7538760900497437, |
| "learning_rate": 0.00012603128238406985, |
| "loss": 0.09024970233440399, |
| "mean_token_accuracy": 0.9682414084672928, |
| "num_tokens": 8996471.0, |
| "step": 1020 |
| }, |
| { |
| "entropy": 0.9907859861850739, |
| "epoch": 3.6877828054298645, |
| "grad_norm": 0.541009783744812, |
| "learning_rate": 0.00012590179966394388, |
| "loss": 0.04950612783432007, |
| "mean_token_accuracy": 0.9824161380529404, |
| "num_tokens": 9005068.0, |
| "step": 1021 |
| }, |
| { |
| "entropy": 1.0388163626194, |
| "epoch": 3.691402714932127, |
| "grad_norm": 0.5293216705322266, |
| "learning_rate": 0.0001257722829599259, |
| "loss": 0.04979352653026581, |
| "mean_token_accuracy": 0.9849574863910675, |
| "num_tokens": 9013415.0, |
| "step": 1022 |
| }, |
| { |
| "entropy": 0.9590071588754654, |
| "epoch": 3.6950226244343893, |
| "grad_norm": 0.5384345054626465, |
| "learning_rate": 0.0001256427325488074, |
| "loss": 0.05108953267335892, |
| "mean_token_accuracy": 0.9861488491296768, |
| "num_tokens": 9022908.0, |
| "step": 1023 |
| }, |
| { |
| "entropy": 1.0496671795845032, |
| "epoch": 3.6986425339366518, |
| "grad_norm": 0.598530113697052, |
| "learning_rate": 0.00012551314870745174, |
| "loss": 0.04889511317014694, |
| "mean_token_accuracy": 0.9841330647468567, |
| "num_tokens": 9031377.0, |
| "step": 1024 |
| }, |
| { |
| "entropy": 1.06731316447258, |
| "epoch": 3.702262443438914, |
| "grad_norm": 0.7431966066360474, |
| "learning_rate": 0.00012538353171279387, |
| "loss": 0.09418823570013046, |
| "mean_token_accuracy": 0.9767041355371475, |
| "num_tokens": 9039580.0, |
| "step": 1025 |
| }, |
| { |
| "entropy": 1.0664094239473343, |
| "epoch": 3.7058823529411766, |
| "grad_norm": 0.5317267775535583, |
| "learning_rate": 0.00012525388184183952, |
| "loss": 0.05456814914941788, |
| "mean_token_accuracy": 0.9801364839076996, |
| "num_tokens": 9047994.0, |
| "step": 1026 |
| }, |
| { |
| "entropy": 0.9594382792711258, |
| "epoch": 3.709502262443439, |
| "grad_norm": 0.32988351583480835, |
| "learning_rate": 0.00012512419937166474, |
| "loss": 0.03360046446323395, |
| "mean_token_accuracy": 0.9874261766672134, |
| "num_tokens": 9057249.0, |
| "step": 1027 |
| }, |
| { |
| "entropy": 1.0023006796836853, |
| "epoch": 3.7131221719457015, |
| "grad_norm": 0.47979024052619934, |
| "learning_rate": 0.0001249944845794151, |
| "loss": 0.060380980372428894, |
| "mean_token_accuracy": 0.98314069211483, |
| "num_tokens": 9065777.0, |
| "step": 1028 |
| }, |
| { |
| "entropy": 0.9864871054887772, |
| "epoch": 3.716742081447964, |
| "grad_norm": 0.5115134716033936, |
| "learning_rate": 0.00012486473774230548, |
| "loss": 0.05596606805920601, |
| "mean_token_accuracy": 0.9851608425378799, |
| "num_tokens": 9075135.0, |
| "step": 1029 |
| }, |
| { |
| "entropy": 1.0391730964183807, |
| "epoch": 3.7203619909502263, |
| "grad_norm": 0.867382287979126, |
| "learning_rate": 0.00012473495913761906, |
| "loss": 0.15137547254562378, |
| "mean_token_accuracy": 0.9702001363039017, |
| "num_tokens": 9083899.0, |
| "step": 1030 |
| }, |
| { |
| "entropy": 1.0278279185295105, |
| "epoch": 3.723981900452489, |
| "grad_norm": 0.6137107014656067, |
| "learning_rate": 0.00012460514904270696, |
| "loss": 0.04550578072667122, |
| "mean_token_accuracy": 0.9855063706636429, |
| "num_tokens": 9092262.0, |
| "step": 1031 |
| }, |
| { |
| "entropy": 0.9615179747343063, |
| "epoch": 3.727601809954751, |
| "grad_norm": 0.5605074763298035, |
| "learning_rate": 0.00012447530773498764, |
| "loss": 0.05297612026333809, |
| "mean_token_accuracy": 0.9862687736749649, |
| "num_tokens": 9101021.0, |
| "step": 1032 |
| }, |
| { |
| "entropy": 1.0526714771986008, |
| "epoch": 3.7312217194570136, |
| "grad_norm": 0.5493430495262146, |
| "learning_rate": 0.0001243454354919462, |
| "loss": 0.048210758715867996, |
| "mean_token_accuracy": 0.9809322506189346, |
| "num_tokens": 9109829.0, |
| "step": 1033 |
| }, |
| { |
| "entropy": 1.0210031270980835, |
| "epoch": 3.734841628959276, |
| "grad_norm": 0.6273694634437561, |
| "learning_rate": 0.00012421553259113393, |
| "loss": 0.07619710266590118, |
| "mean_token_accuracy": 0.9756554067134857, |
| "num_tokens": 9118711.0, |
| "step": 1034 |
| }, |
| { |
| "entropy": 0.999250665307045, |
| "epoch": 3.7384615384615385, |
| "grad_norm": 0.5953567028045654, |
| "learning_rate": 0.00012408559931016753, |
| "loss": 0.03722090646624565, |
| "mean_token_accuracy": 0.9891637414693832, |
| "num_tokens": 9127649.0, |
| "step": 1035 |
| }, |
| { |
| "entropy": 1.0046712160110474, |
| "epoch": 3.742081447963801, |
| "grad_norm": 0.5995718836784363, |
| "learning_rate": 0.0001239556359267287, |
| "loss": 0.07123475521802902, |
| "mean_token_accuracy": 0.9800655543804169, |
| "num_tokens": 9136631.0, |
| "step": 1036 |
| }, |
| { |
| "entropy": 0.9638098627328873, |
| "epoch": 3.7457013574660634, |
| "grad_norm": 0.35988542437553406, |
| "learning_rate": 0.0001238256427185635, |
| "loss": 0.028186630457639694, |
| "mean_token_accuracy": 0.9930652678012848, |
| "num_tokens": 9145889.0, |
| "step": 1037 |
| }, |
| { |
| "entropy": 0.9974884688854218, |
| "epoch": 3.749321266968326, |
| "grad_norm": 0.6581419110298157, |
| "learning_rate": 0.0001236956199634817, |
| "loss": 0.07928231358528137, |
| "mean_token_accuracy": 0.977179154753685, |
| "num_tokens": 9155071.0, |
| "step": 1038 |
| }, |
| { |
| "entropy": 0.9659110605716705, |
| "epoch": 3.7529411764705882, |
| "grad_norm": 0.5204628705978394, |
| "learning_rate": 0.00012356556793935615, |
| "loss": 0.07529903948307037, |
| "mean_token_accuracy": 0.9794208407402039, |
| "num_tokens": 9164329.0, |
| "step": 1039 |
| }, |
| { |
| "entropy": 0.9618893265724182, |
| "epoch": 3.7565610859728507, |
| "grad_norm": 0.43765532970428467, |
| "learning_rate": 0.00012343548692412233, |
| "loss": 0.04690020531415939, |
| "mean_token_accuracy": 0.986578032374382, |
| "num_tokens": 9173806.0, |
| "step": 1040 |
| }, |
| { |
| "entropy": 1.0338812470436096, |
| "epoch": 3.760180995475113, |
| "grad_norm": 0.4304349422454834, |
| "learning_rate": 0.00012330537719577766, |
| "loss": 0.03645741939544678, |
| "mean_token_accuracy": 0.9854725003242493, |
| "num_tokens": 9182477.0, |
| "step": 1041 |
| }, |
| { |
| "entropy": 1.0175089985132217, |
| "epoch": 3.7638009049773755, |
| "grad_norm": 0.4084160029888153, |
| "learning_rate": 0.00012317523903238094, |
| "loss": 0.03105458803474903, |
| "mean_token_accuracy": 0.9912384748458862, |
| "num_tokens": 9191539.0, |
| "step": 1042 |
| }, |
| { |
| "entropy": 1.0527340471744537, |
| "epoch": 3.767420814479638, |
| "grad_norm": 0.5655209422111511, |
| "learning_rate": 0.00012304507271205167, |
| "loss": 0.05545002967119217, |
| "mean_token_accuracy": 0.9856874346733093, |
| "num_tokens": 9200288.0, |
| "step": 1043 |
| }, |
| { |
| "entropy": 1.0532978475093842, |
| "epoch": 3.7710407239819004, |
| "grad_norm": 0.4896293878555298, |
| "learning_rate": 0.00012291487851296955, |
| "loss": 0.03723525255918503, |
| "mean_token_accuracy": 0.9878070503473282, |
| "num_tokens": 9208798.0, |
| "step": 1044 |
| }, |
| { |
| "entropy": 0.9996777772903442, |
| "epoch": 3.774660633484163, |
| "grad_norm": 0.2994062900543213, |
| "learning_rate": 0.00012278465671337394, |
| "loss": 0.021810417994856834, |
| "mean_token_accuracy": 0.9923857599496841, |
| "num_tokens": 9217755.0, |
| "step": 1045 |
| }, |
| { |
| "entropy": 1.0102759897708893, |
| "epoch": 3.7782805429864252, |
| "grad_norm": 0.7357338070869446, |
| "learning_rate": 0.0001226544075915631, |
| "loss": 0.08538160473108292, |
| "mean_token_accuracy": 0.9771738797426224, |
| "num_tokens": 9226530.0, |
| "step": 1046 |
| }, |
| { |
| "entropy": 1.0522404909133911, |
| "epoch": 3.7819004524886877, |
| "grad_norm": 0.6169455647468567, |
| "learning_rate": 0.0001225241314258937, |
| "loss": 0.054077863693237305, |
| "mean_token_accuracy": 0.9810755997896194, |
| "num_tokens": 9235491.0, |
| "step": 1047 |
| }, |
| { |
| "entropy": 0.9845469892024994, |
| "epoch": 3.78552036199095, |
| "grad_norm": 0.6194866895675659, |
| "learning_rate": 0.00012239382849478026, |
| "loss": 0.09550972282886505, |
| "mean_token_accuracy": 0.9748479872941971, |
| "num_tokens": 9244320.0, |
| "step": 1048 |
| }, |
| { |
| "entropy": 0.9689999371767044, |
| "epoch": 3.7891402714932125, |
| "grad_norm": 0.599198043346405, |
| "learning_rate": 0.0001222634990766944, |
| "loss": 0.06929125636816025, |
| "mean_token_accuracy": 0.9795756936073303, |
| "num_tokens": 9253358.0, |
| "step": 1049 |
| }, |
| { |
| "entropy": 0.9611286520957947, |
| "epoch": 3.792760180995475, |
| "grad_norm": 0.5120612978935242, |
| "learning_rate": 0.00012213314345016434, |
| "loss": 0.04920945689082146, |
| "mean_token_accuracy": 0.987091675400734, |
| "num_tokens": 9262305.0, |
| "step": 1050 |
| }, |
| { |
| "entropy": 0.9123319238424301, |
| "epoch": 3.7963800904977374, |
| "grad_norm": 0.47379791736602783, |
| "learning_rate": 0.00012200276189377449, |
| "loss": 0.04485338181257248, |
| "mean_token_accuracy": 0.9874735176563263, |
| "num_tokens": 9271327.0, |
| "step": 1051 |
| }, |
| { |
| "entropy": 0.9740808606147766, |
| "epoch": 3.8, |
| "grad_norm": 0.6042353510856628, |
| "learning_rate": 0.00012187235468616449, |
| "loss": 0.06215674430131912, |
| "mean_token_accuracy": 0.9854099005460739, |
| "num_tokens": 9279720.0, |
| "step": 1052 |
| }, |
| { |
| "entropy": 1.0416322499513626, |
| "epoch": 3.8036199095022623, |
| "grad_norm": 0.5319172739982605, |
| "learning_rate": 0.00012174192210602886, |
| "loss": 0.03299910947680473, |
| "mean_token_accuracy": 0.990405261516571, |
| "num_tokens": 9287790.0, |
| "step": 1053 |
| }, |
| { |
| "entropy": 0.9842112064361572, |
| "epoch": 3.8072398190045247, |
| "grad_norm": 0.4761062264442444, |
| "learning_rate": 0.00012161146443211635, |
| "loss": 0.04322975501418114, |
| "mean_token_accuracy": 0.9890616089105606, |
| "num_tokens": 9296546.0, |
| "step": 1054 |
| }, |
| { |
| "entropy": 0.9370725750923157, |
| "epoch": 3.810859728506787, |
| "grad_norm": 0.6730228662490845, |
| "learning_rate": 0.00012148098194322936, |
| "loss": 0.13308462500572205, |
| "mean_token_accuracy": 0.9631438553333282, |
| "num_tokens": 9305702.0, |
| "step": 1055 |
| }, |
| { |
| "entropy": 0.9374968558549881, |
| "epoch": 3.8144796380090495, |
| "grad_norm": 0.48976925015449524, |
| "learning_rate": 0.00012135047491822329, |
| "loss": 0.04608523100614548, |
| "mean_token_accuracy": 0.9850935637950897, |
| "num_tokens": 9314649.0, |
| "step": 1056 |
| }, |
| { |
| "entropy": 1.0022027492523193, |
| "epoch": 3.818099547511312, |
| "grad_norm": 0.5045378804206848, |
| "learning_rate": 0.00012121994363600593, |
| "loss": 0.053614161908626556, |
| "mean_token_accuracy": 0.9847323000431061, |
| "num_tokens": 9323366.0, |
| "step": 1057 |
| }, |
| { |
| "entropy": 1.0107389986515045, |
| "epoch": 3.8217194570135744, |
| "grad_norm": 0.8898943066596985, |
| "learning_rate": 0.00012108938837553703, |
| "loss": 0.23248156905174255, |
| "mean_token_accuracy": 0.9641059786081314, |
| "num_tokens": 9332078.0, |
| "step": 1058 |
| }, |
| { |
| "entropy": 1.0229197144508362, |
| "epoch": 3.825339366515837, |
| "grad_norm": 0.6454104781150818, |
| "learning_rate": 0.00012095880941582744, |
| "loss": 0.07357359677553177, |
| "mean_token_accuracy": 0.9791290163993835, |
| "num_tokens": 9340736.0, |
| "step": 1059 |
| }, |
| { |
| "entropy": 1.0136512219905853, |
| "epoch": 3.8289592760180997, |
| "grad_norm": 0.4896777272224426, |
| "learning_rate": 0.00012082820703593885, |
| "loss": 0.056676387786865234, |
| "mean_token_accuracy": 0.9834884107112885, |
| "num_tokens": 9349618.0, |
| "step": 1060 |
| }, |
| { |
| "entropy": 0.9840548485517502, |
| "epoch": 3.832579185520362, |
| "grad_norm": 0.7807687520980835, |
| "learning_rate": 0.00012069758151498279, |
| "loss": 0.08798709511756897, |
| "mean_token_accuracy": 0.9721374362707138, |
| "num_tokens": 9358452.0, |
| "step": 1061 |
| }, |
| { |
| "entropy": 1.0566797703504562, |
| "epoch": 3.8361990950226246, |
| "grad_norm": 0.9508253335952759, |
| "learning_rate": 0.0001205669331321204, |
| "loss": 0.08302375674247742, |
| "mean_token_accuracy": 0.9773988276720047, |
| "num_tokens": 9367052.0, |
| "step": 1062 |
| }, |
| { |
| "entropy": 1.0889401733875275, |
| "epoch": 3.839819004524887, |
| "grad_norm": 0.3949955999851227, |
| "learning_rate": 0.00012043626216656154, |
| "loss": 0.023118160665035248, |
| "mean_token_accuracy": 0.9940158426761627, |
| "num_tokens": 9375475.0, |
| "step": 1063 |
| }, |
| { |
| "entropy": 1.0559107959270477, |
| "epoch": 3.8434389140271494, |
| "grad_norm": 0.9540901780128479, |
| "learning_rate": 0.00012030556889756451, |
| "loss": 0.09784621000289917, |
| "mean_token_accuracy": 0.9844755232334137, |
| "num_tokens": 9384455.0, |
| "step": 1064 |
| }, |
| { |
| "entropy": 1.0997906029224396, |
| "epoch": 3.847058823529412, |
| "grad_norm": 0.7329124212265015, |
| "learning_rate": 0.00012017485360443512, |
| "loss": 0.05017324537038803, |
| "mean_token_accuracy": 0.9844279885292053, |
| "num_tokens": 9392971.0, |
| "step": 1065 |
| }, |
| { |
| "entropy": 1.0585920810699463, |
| "epoch": 3.8506787330316743, |
| "grad_norm": 0.35517770051956177, |
| "learning_rate": 0.00012004411656652629, |
| "loss": 0.033513545989990234, |
| "mean_token_accuracy": 0.9896632134914398, |
| "num_tokens": 9401493.0, |
| "step": 1066 |
| }, |
| { |
| "entropy": 0.9919628500938416, |
| "epoch": 3.8542986425339367, |
| "grad_norm": 0.4272564649581909, |
| "learning_rate": 0.00011991335806323751, |
| "loss": 0.05677922070026398, |
| "mean_token_accuracy": 0.9791074395179749, |
| "num_tokens": 9410389.0, |
| "step": 1067 |
| }, |
| { |
| "entropy": 1.0546822249889374, |
| "epoch": 3.857918552036199, |
| "grad_norm": 0.5334388017654419, |
| "learning_rate": 0.00011978257837401396, |
| "loss": 0.05315824970602989, |
| "mean_token_accuracy": 0.9821736663579941, |
| "num_tokens": 9419227.0, |
| "step": 1068 |
| }, |
| { |
| "entropy": 1.041745737195015, |
| "epoch": 3.8615384615384616, |
| "grad_norm": 0.534310519695282, |
| "learning_rate": 0.00011965177777834627, |
| "loss": 0.07340014725923538, |
| "mean_token_accuracy": 0.9823083132505417, |
| "num_tokens": 9428116.0, |
| "step": 1069 |
| }, |
| { |
| "entropy": 0.9816903918981552, |
| "epoch": 3.865158371040724, |
| "grad_norm": 0.5933845639228821, |
| "learning_rate": 0.0001195209565557696, |
| "loss": 0.07548420131206512, |
| "mean_token_accuracy": 0.9744589179754257, |
| "num_tokens": 9437254.0, |
| "step": 1070 |
| }, |
| { |
| "entropy": 1.0700978338718414, |
| "epoch": 3.8687782805429864, |
| "grad_norm": 0.606353223323822, |
| "learning_rate": 0.00011939011498586333, |
| "loss": 0.039710596203804016, |
| "mean_token_accuracy": 0.9877680093050003, |
| "num_tokens": 9445784.0, |
| "step": 1071 |
| }, |
| { |
| "entropy": 1.0325657278299332, |
| "epoch": 3.872398190045249, |
| "grad_norm": 0.8289651870727539, |
| "learning_rate": 0.00011925925334825026, |
| "loss": 0.05568333715200424, |
| "mean_token_accuracy": 0.9832515269517899, |
| "num_tokens": 9454906.0, |
| "step": 1072 |
| }, |
| { |
| "entropy": 0.9403438866138458, |
| "epoch": 3.8760180995475113, |
| "grad_norm": 0.5779698491096497, |
| "learning_rate": 0.00011912837192259605, |
| "loss": 0.06453107297420502, |
| "mean_token_accuracy": 0.9814022779464722, |
| "num_tokens": 9463771.0, |
| "step": 1073 |
| }, |
| { |
| "entropy": 1.06302210688591, |
| "epoch": 3.8796380090497737, |
| "grad_norm": 0.6681597232818604, |
| "learning_rate": 0.0001189974709886087, |
| "loss": 0.07519081979990005, |
| "mean_token_accuracy": 0.9822188168764114, |
| "num_tokens": 9472487.0, |
| "step": 1074 |
| }, |
| { |
| "entropy": 0.9794299155473709, |
| "epoch": 3.883257918552036, |
| "grad_norm": 0.6864436864852905, |
| "learning_rate": 0.00011886655082603784, |
| "loss": 0.06897931545972824, |
| "mean_token_accuracy": 0.9769734293222427, |
| "num_tokens": 9481545.0, |
| "step": 1075 |
| }, |
| { |
| "entropy": 0.9836923331022263, |
| "epoch": 3.8868778280542986, |
| "grad_norm": 0.470438688993454, |
| "learning_rate": 0.00011873561171467428, |
| "loss": 0.05008576065301895, |
| "mean_token_accuracy": 0.9836294203996658, |
| "num_tokens": 9491031.0, |
| "step": 1076 |
| }, |
| { |
| "entropy": 1.048751562833786, |
| "epoch": 3.890497737556561, |
| "grad_norm": 0.36586275696754456, |
| "learning_rate": 0.0001186046539343493, |
| "loss": 0.032275643199682236, |
| "mean_token_accuracy": 0.9896851181983948, |
| "num_tokens": 9499811.0, |
| "step": 1077 |
| }, |
| { |
| "entropy": 1.0775998830795288, |
| "epoch": 3.8941176470588235, |
| "grad_norm": 0.4112566113471985, |
| "learning_rate": 0.00011847367776493398, |
| "loss": 0.04865328222513199, |
| "mean_token_accuracy": 0.9793375581502914, |
| "num_tokens": 9508692.0, |
| "step": 1078 |
| }, |
| { |
| "entropy": 1.0528927445411682, |
| "epoch": 3.897737556561086, |
| "grad_norm": 0.5616442561149597, |
| "learning_rate": 0.00011834268348633883, |
| "loss": 0.07484862208366394, |
| "mean_token_accuracy": 0.9722632467746735, |
| "num_tokens": 9517646.0, |
| "step": 1079 |
| }, |
| { |
| "entropy": 1.1230517029762268, |
| "epoch": 3.9013574660633483, |
| "grad_norm": 0.5908562541007996, |
| "learning_rate": 0.00011821167137851299, |
| "loss": 0.07327218353748322, |
| "mean_token_accuracy": 0.9811713099479675, |
| "num_tokens": 9526143.0, |
| "step": 1080 |
| }, |
| { |
| "entropy": 1.0355236679315567, |
| "epoch": 3.9049773755656108, |
| "grad_norm": 0.3904285728931427, |
| "learning_rate": 0.0001180806417214437, |
| "loss": 0.03152093291282654, |
| "mean_token_accuracy": 0.9888577461242676, |
| "num_tokens": 9535160.0, |
| "step": 1081 |
| }, |
| { |
| "entropy": 1.0083343833684921, |
| "epoch": 3.908597285067873, |
| "grad_norm": 0.5973058938980103, |
| "learning_rate": 0.00011794959479515577, |
| "loss": 0.05609864741563797, |
| "mean_token_accuracy": 0.9796222299337387, |
| "num_tokens": 9544285.0, |
| "step": 1082 |
| }, |
| { |
| "entropy": 1.0868518203496933, |
| "epoch": 3.9122171945701356, |
| "grad_norm": 0.5905822515487671, |
| "learning_rate": 0.00011781853087971087, |
| "loss": 0.09653709828853607, |
| "mean_token_accuracy": 0.97333624958992, |
| "num_tokens": 9553206.0, |
| "step": 1083 |
| }, |
| { |
| "entropy": 1.0268581211566925, |
| "epoch": 3.915837104072398, |
| "grad_norm": 0.5431144833564758, |
| "learning_rate": 0.00011768745025520694, |
| "loss": 0.03400646895170212, |
| "mean_token_accuracy": 0.9925975948572159, |
| "num_tokens": 9561982.0, |
| "step": 1084 |
| }, |
| { |
| "entropy": 0.9988400340080261, |
| "epoch": 3.9194570135746605, |
| "grad_norm": 0.5526976585388184, |
| "learning_rate": 0.00011755635320177765, |
| "loss": 0.06047695130109787, |
| "mean_token_accuracy": 0.9864342510700226, |
| "num_tokens": 9570908.0, |
| "step": 1085 |
| }, |
| { |
| "entropy": 1.0497360080480576, |
| "epoch": 3.9230769230769234, |
| "grad_norm": 0.48714399337768555, |
| "learning_rate": 0.00011742523999959189, |
| "loss": 0.05991292744874954, |
| "mean_token_accuracy": 0.9843471348285675, |
| "num_tokens": 9579709.0, |
| "step": 1086 |
| }, |
| { |
| "entropy": 0.9997294843196869, |
| "epoch": 3.926696832579186, |
| "grad_norm": 0.40273338556289673, |
| "learning_rate": 0.0001172941109288529, |
| "loss": 0.0249432772397995, |
| "mean_token_accuracy": 0.9916731268167496, |
| "num_tokens": 9587982.0, |
| "step": 1087 |
| }, |
| { |
| "entropy": 1.071354240179062, |
| "epoch": 3.930316742081448, |
| "grad_norm": 0.5073276162147522, |
| "learning_rate": 0.00011716296626979789, |
| "loss": 0.052621759474277496, |
| "mean_token_accuracy": 0.9854983687400818, |
| "num_tokens": 9596660.0, |
| "step": 1088 |
| }, |
| { |
| "entropy": 1.0524671822786331, |
| "epoch": 3.9339366515837106, |
| "grad_norm": 0.44608592987060547, |
| "learning_rate": 0.00011703180630269743, |
| "loss": 0.0402885302901268, |
| "mean_token_accuracy": 0.9871297627687454, |
| "num_tokens": 9605415.0, |
| "step": 1089 |
| }, |
| { |
| "entropy": 0.9869890660047531, |
| "epoch": 3.937556561085973, |
| "grad_norm": 0.5331403613090515, |
| "learning_rate": 0.00011690063130785478, |
| "loss": 0.05267741531133652, |
| "mean_token_accuracy": 0.9868313521146774, |
| "num_tokens": 9614049.0, |
| "step": 1090 |
| }, |
| { |
| "entropy": 1.0029538869857788, |
| "epoch": 3.9411764705882355, |
| "grad_norm": 0.40601280331611633, |
| "learning_rate": 0.00011676944156560532, |
| "loss": 0.03916563838720322, |
| "mean_token_accuracy": 0.9879664182662964, |
| "num_tokens": 9623099.0, |
| "step": 1091 |
| }, |
| { |
| "entropy": 1.0587478280067444, |
| "epoch": 3.944796380090498, |
| "grad_norm": 0.6124465465545654, |
| "learning_rate": 0.00011663823735631585, |
| "loss": 0.06757491081953049, |
| "mean_token_accuracy": 0.9832141548395157, |
| "num_tokens": 9631984.0, |
| "step": 1092 |
| }, |
| { |
| "entropy": 1.0624504685401917, |
| "epoch": 3.9484162895927604, |
| "grad_norm": 0.5510776042938232, |
| "learning_rate": 0.00011650701896038428, |
| "loss": 0.04322856664657593, |
| "mean_token_accuracy": 0.9892711043357849, |
| "num_tokens": 9640701.0, |
| "step": 1093 |
| }, |
| { |
| "entropy": 1.0204610973596573, |
| "epoch": 3.952036199095023, |
| "grad_norm": 0.5441774725914001, |
| "learning_rate": 0.00011637578665823865, |
| "loss": 0.2228085845708847, |
| "mean_token_accuracy": 0.9644595384597778, |
| "num_tokens": 9649828.0, |
| "step": 1094 |
| }, |
| { |
| "entropy": 0.9337645173072815, |
| "epoch": 3.9556561085972852, |
| "grad_norm": 0.5637805461883545, |
| "learning_rate": 0.00011624454073033686, |
| "loss": 0.06843124330043793, |
| "mean_token_accuracy": 0.9804229438304901, |
| "num_tokens": 9659279.0, |
| "step": 1095 |
| }, |
| { |
| "entropy": 0.9486549347639084, |
| "epoch": 3.9592760180995477, |
| "grad_norm": 0.5789593458175659, |
| "learning_rate": 0.00011611328145716582, |
| "loss": 0.05753622204065323, |
| "mean_token_accuracy": 0.9850260466337204, |
| "num_tokens": 9668201.0, |
| "step": 1096 |
| }, |
| { |
| "entropy": 0.9881956428289413, |
| "epoch": 3.96289592760181, |
| "grad_norm": 0.4987047612667084, |
| "learning_rate": 0.00011598200911924104, |
| "loss": 0.029422685503959656, |
| "mean_token_accuracy": 0.989275798201561, |
| "num_tokens": 9677075.0, |
| "step": 1097 |
| }, |
| { |
| "entropy": 0.9573028832674026, |
| "epoch": 3.9665158371040725, |
| "grad_norm": 0.7386200428009033, |
| "learning_rate": 0.00011585072399710588, |
| "loss": 0.03801329806447029, |
| "mean_token_accuracy": 0.9843745678663254, |
| "num_tokens": 9686251.0, |
| "step": 1098 |
| }, |
| { |
| "entropy": 0.9610146731138229, |
| "epoch": 3.970135746606335, |
| "grad_norm": 0.4404725134372711, |
| "learning_rate": 0.00011571942637133115, |
| "loss": 0.057003386318683624, |
| "mean_token_accuracy": 0.9834477603435516, |
| "num_tokens": 9695626.0, |
| "step": 1099 |
| }, |
| { |
| "entropy": 0.9653383642435074, |
| "epoch": 3.9737556561085974, |
| "grad_norm": 0.8406949043273926, |
| "learning_rate": 0.00011558811652251422, |
| "loss": 0.11011351644992828, |
| "mean_token_accuracy": 0.9752728492021561, |
| "num_tokens": 9704667.0, |
| "step": 1100 |
| }, |
| { |
| "entropy": 1.060383379459381, |
| "epoch": 3.97737556561086, |
| "grad_norm": 0.4521214962005615, |
| "learning_rate": 0.00011545679473127864, |
| "loss": 0.050808414816856384, |
| "mean_token_accuracy": 0.9801691472530365, |
| "num_tokens": 9713336.0, |
| "step": 1101 |
| }, |
| { |
| "entropy": 0.9874586910009384, |
| "epoch": 3.9809954751131222, |
| "grad_norm": 0.6479194760322571, |
| "learning_rate": 0.00011532546127827355, |
| "loss": 0.05640149861574173, |
| "mean_token_accuracy": 0.978395476937294, |
| "num_tokens": 9722165.0, |
| "step": 1102 |
| }, |
| { |
| "entropy": 1.0500174909830093, |
| "epoch": 3.9846153846153847, |
| "grad_norm": 0.4811345934867859, |
| "learning_rate": 0.00011519411644417296, |
| "loss": 0.03901681676506996, |
| "mean_token_accuracy": 0.9876944124698639, |
| "num_tokens": 9730697.0, |
| "step": 1103 |
| }, |
| { |
| "entropy": 0.9794940203428268, |
| "epoch": 3.988235294117647, |
| "grad_norm": 0.5242920517921448, |
| "learning_rate": 0.00011506276050967518, |
| "loss": 0.08005990833044052, |
| "mean_token_accuracy": 0.9715511202812195, |
| "num_tokens": 9740258.0, |
| "step": 1104 |
| }, |
| { |
| "entropy": 1.0328656435012817, |
| "epoch": 3.9918552036199095, |
| "grad_norm": 0.4445230960845947, |
| "learning_rate": 0.00011493139375550222, |
| "loss": 0.0437643937766552, |
| "mean_token_accuracy": 0.9823936223983765, |
| "num_tokens": 9748823.0, |
| "step": 1105 |
| }, |
| { |
| "entropy": 1.0510858297348022, |
| "epoch": 3.995475113122172, |
| "grad_norm": 0.848091185092926, |
| "learning_rate": 0.00011480001646239935, |
| "loss": 0.10476335883140564, |
| "mean_token_accuracy": 0.9740482717752457, |
| "num_tokens": 9757407.0, |
| "step": 1106 |
| }, |
| { |
| "entropy": 1.046503335237503, |
| "epoch": 3.9990950226244344, |
| "grad_norm": 0.5196622014045715, |
| "learning_rate": 0.00011466862891113424, |
| "loss": 0.03807297721505165, |
| "mean_token_accuracy": 0.9873451888561249, |
| "num_tokens": 9766046.0, |
| "step": 1107 |
| }, |
| { |
| "entropy": 0.8666110038757324, |
| "epoch": 4.0, |
| "grad_norm": 0.6595801115036011, |
| "learning_rate": 0.00011453723138249647, |
| "loss": 0.02082827128469944, |
| "mean_token_accuracy": 0.9935275316238403, |
| "num_tokens": 9766900.0, |
| "step": 1108 |
| }, |
| { |
| "epoch": 4.0, |
| "eval_entropy": 1.0280046855531089, |
| "eval_loss": 0.11439266055822372, |
| "eval_mean_token_accuracy": 0.9721979295335165, |
| "eval_num_tokens": 9766900.0, |
| "eval_runtime": 31.7938, |
| "eval_samples_per_second": 11.606, |
| "eval_steps_per_second": 3.869, |
| "step": 1108 |
| }, |
| { |
| "entropy": 1.0500973612070084, |
| "epoch": 4.003619909502262, |
| "grad_norm": 0.5530663132667542, |
| "learning_rate": 0.00011440582415729704, |
| "loss": 0.04492343217134476, |
| "mean_token_accuracy": 0.985475093126297, |
| "num_tokens": 9775335.0, |
| "step": 1109 |
| }, |
| { |
| "entropy": 0.9695511311292648, |
| "epoch": 4.007239819004525, |
| "grad_norm": 0.41322094202041626, |
| "learning_rate": 0.0001142744075163676, |
| "loss": 0.03918066620826721, |
| "mean_token_accuracy": 0.9863087087869644, |
| "num_tokens": 9784958.0, |
| "step": 1110 |
| }, |
| { |
| "entropy": 1.0054823160171509, |
| "epoch": 4.010859728506787, |
| "grad_norm": 0.29584264755249023, |
| "learning_rate": 0.0001141429817405599, |
| "loss": 0.024759791791439056, |
| "mean_token_accuracy": 0.9934576153755188, |
| "num_tokens": 9793859.0, |
| "step": 1111 |
| }, |
| { |
| "entropy": 0.9745224118232727, |
| "epoch": 4.01447963800905, |
| "grad_norm": 0.4821024239063263, |
| "learning_rate": 0.00011401154711074536, |
| "loss": 0.03338223323225975, |
| "mean_token_accuracy": 0.9919255524873734, |
| "num_tokens": 9803033.0, |
| "step": 1112 |
| }, |
| { |
| "entropy": 0.9960066974163055, |
| "epoch": 4.018099547511312, |
| "grad_norm": 0.5458202958106995, |
| "learning_rate": 0.00011388010390781412, |
| "loss": 0.046578116714954376, |
| "mean_token_accuracy": 0.9831000417470932, |
| "num_tokens": 9812187.0, |
| "step": 1113 |
| }, |
| { |
| "entropy": 1.008404940366745, |
| "epoch": 4.021719457013575, |
| "grad_norm": 0.45678576827049255, |
| "learning_rate": 0.00011374865241267478, |
| "loss": 0.024580247700214386, |
| "mean_token_accuracy": 0.9908605217933655, |
| "num_tokens": 9820761.0, |
| "step": 1114 |
| }, |
| { |
| "entropy": 0.9565981924533844, |
| "epoch": 4.025339366515837, |
| "grad_norm": 0.5080620050430298, |
| "learning_rate": 0.00011361719290625359, |
| "loss": 0.06319691240787506, |
| "mean_token_accuracy": 0.9872463345527649, |
| "num_tokens": 9829985.0, |
| "step": 1115 |
| }, |
| { |
| "entropy": 0.9249087870121002, |
| "epoch": 4.0289592760180994, |
| "grad_norm": 0.47282564640045166, |
| "learning_rate": 0.000113485725669494, |
| "loss": 0.034620530903339386, |
| "mean_token_accuracy": 0.9901333600282669, |
| "num_tokens": 9839520.0, |
| "step": 1116 |
| }, |
| { |
| "entropy": 0.9898876994848251, |
| "epoch": 4.032579185520362, |
| "grad_norm": 0.35816535353660583, |
| "learning_rate": 0.0001133542509833559, |
| "loss": 0.0199007298797369, |
| "mean_token_accuracy": 0.994959831237793, |
| "num_tokens": 9848010.0, |
| "step": 1117 |
| }, |
| { |
| "entropy": 0.9201329201459885, |
| "epoch": 4.036199095022624, |
| "grad_norm": 0.36862438917160034, |
| "learning_rate": 0.00011322276912881509, |
| "loss": 0.033990710973739624, |
| "mean_token_accuracy": 0.987580731511116, |
| "num_tokens": 9857231.0, |
| "step": 1118 |
| }, |
| { |
| "entropy": 0.8616881370544434, |
| "epoch": 4.039819004524887, |
| "grad_norm": 0.4026683568954468, |
| "learning_rate": 0.00011309128038686278, |
| "loss": 0.02125810645520687, |
| "mean_token_accuracy": 0.990529865026474, |
| "num_tokens": 9866285.0, |
| "step": 1119 |
| }, |
| { |
| "entropy": 0.9119723290205002, |
| "epoch": 4.043438914027149, |
| "grad_norm": 0.5761182904243469, |
| "learning_rate": 0.00011295978503850487, |
| "loss": 0.04496622830629349, |
| "mean_token_accuracy": 0.9852714240550995, |
| "num_tokens": 9874724.0, |
| "step": 1120 |
| }, |
| { |
| "entropy": 0.8586753159761429, |
| "epoch": 4.047058823529412, |
| "grad_norm": 0.5358923673629761, |
| "learning_rate": 0.00011282828336476134, |
| "loss": 0.04419999569654465, |
| "mean_token_accuracy": 0.9864743649959564, |
| "num_tokens": 9884023.0, |
| "step": 1121 |
| }, |
| { |
| "entropy": 0.8618190735578537, |
| "epoch": 4.050678733031674, |
| "grad_norm": 0.6738160848617554, |
| "learning_rate": 0.00011269677564666565, |
| "loss": 0.051054857671260834, |
| "mean_token_accuracy": 0.9872888177633286, |
| "num_tokens": 9892904.0, |
| "step": 1122 |
| }, |
| { |
| "entropy": 0.8987855166196823, |
| "epoch": 4.0542986425339365, |
| "grad_norm": 0.828499972820282, |
| "learning_rate": 0.00011256526216526433, |
| "loss": 0.03101392835378647, |
| "mean_token_accuracy": 0.9909973591566086, |
| "num_tokens": 9901345.0, |
| "step": 1123 |
| }, |
| { |
| "entropy": 0.8801412582397461, |
| "epoch": 4.057918552036199, |
| "grad_norm": 0.6591492891311646, |
| "learning_rate": 0.00011243374320161607, |
| "loss": 0.04960804432630539, |
| "mean_token_accuracy": 0.9818640649318695, |
| "num_tokens": 9909853.0, |
| "step": 1124 |
| }, |
| { |
| "entropy": 0.8607528507709503, |
| "epoch": 4.061538461538461, |
| "grad_norm": 0.884670615196228, |
| "learning_rate": 0.0001123022190367913, |
| "loss": 0.11029860377311707, |
| "mean_token_accuracy": 0.9751772731542587, |
| "num_tokens": 9918983.0, |
| "step": 1125 |
| }, |
| { |
| "entropy": 0.864204928278923, |
| "epoch": 4.065158371040724, |
| "grad_norm": 0.6966027021408081, |
| "learning_rate": 0.00011217068995187172, |
| "loss": 0.054947629570961, |
| "mean_token_accuracy": 0.9846376031637192, |
| "num_tokens": 9927814.0, |
| "step": 1126 |
| }, |
| { |
| "entropy": 0.8841895759105682, |
| "epoch": 4.068778280542986, |
| "grad_norm": 0.49447473883628845, |
| "learning_rate": 0.00011203915622794934, |
| "loss": 0.04461928457021713, |
| "mean_token_accuracy": 0.9833347350358963, |
| "num_tokens": 9936735.0, |
| "step": 1127 |
| }, |
| { |
| "entropy": 0.8424310237169266, |
| "epoch": 4.072398190045249, |
| "grad_norm": 0.3998212516307831, |
| "learning_rate": 0.00011190761814612616, |
| "loss": 0.022949082776904106, |
| "mean_token_accuracy": 0.9910575300455093, |
| "num_tokens": 9945746.0, |
| "step": 1128 |
| }, |
| { |
| "entropy": 0.9149628132581711, |
| "epoch": 4.076018099547511, |
| "grad_norm": 0.48935163021087646, |
| "learning_rate": 0.00011177607598751354, |
| "loss": 0.0449216291308403, |
| "mean_token_accuracy": 0.9876349568367004, |
| "num_tokens": 9954786.0, |
| "step": 1129 |
| }, |
| { |
| "entropy": 0.9436136782169342, |
| "epoch": 4.0796380090497735, |
| "grad_norm": 0.6684723496437073, |
| "learning_rate": 0.00011164453003323152, |
| "loss": 0.02876024693250656, |
| "mean_token_accuracy": 0.9874544590711594, |
| "num_tokens": 9962930.0, |
| "step": 1130 |
| }, |
| { |
| "entropy": 0.9349515736103058, |
| "epoch": 4.083257918552036, |
| "grad_norm": 0.3718799352645874, |
| "learning_rate": 0.00011151298056440825, |
| "loss": 0.022371353581547737, |
| "mean_token_accuracy": 0.9954051822423935, |
| "num_tokens": 9971056.0, |
| "step": 1131 |
| }, |
| { |
| "entropy": 0.8744916170835495, |
| "epoch": 4.086877828054298, |
| "grad_norm": 0.5116303563117981, |
| "learning_rate": 0.00011138142786217936, |
| "loss": 0.021263940259814262, |
| "mean_token_accuracy": 0.9941851943731308, |
| "num_tokens": 9980049.0, |
| "step": 1132 |
| }, |
| { |
| "entropy": 0.8928166627883911, |
| "epoch": 4.090497737556561, |
| "grad_norm": 0.5083812475204468, |
| "learning_rate": 0.00011124987220768743, |
| "loss": 0.050615012645721436, |
| "mean_token_accuracy": 0.9832881093025208, |
| "num_tokens": 9989156.0, |
| "step": 1133 |
| }, |
| { |
| "entropy": 0.9249687194824219, |
| "epoch": 4.094117647058823, |
| "grad_norm": 0.39031726121902466, |
| "learning_rate": 0.00011111831388208138, |
| "loss": 0.04024628549814224, |
| "mean_token_accuracy": 0.9921209067106247, |
| "num_tokens": 9998097.0, |
| "step": 1134 |
| }, |
| { |
| "entropy": 0.922693282365799, |
| "epoch": 4.097737556561086, |
| "grad_norm": 0.692787766456604, |
| "learning_rate": 0.00011098675316651576, |
| "loss": 0.03948502615094185, |
| "mean_token_accuracy": 0.9865765869617462, |
| "num_tokens": 10006658.0, |
| "step": 1135 |
| }, |
| { |
| "entropy": 0.8548087179660797, |
| "epoch": 4.101357466063348, |
| "grad_norm": 0.6769980788230896, |
| "learning_rate": 0.00011085519034215027, |
| "loss": 0.04489884525537491, |
| "mean_token_accuracy": 0.9869794398546219, |
| "num_tokens": 10016035.0, |
| "step": 1136 |
| }, |
| { |
| "entropy": 0.9093173742294312, |
| "epoch": 4.1049773755656105, |
| "grad_norm": 0.44418567419052124, |
| "learning_rate": 0.0001107236256901491, |
| "loss": 0.04798293486237526, |
| "mean_token_accuracy": 0.9824085086584091, |
| "num_tokens": 10024945.0, |
| "step": 1137 |
| }, |
| { |
| "entropy": 0.9510129541158676, |
| "epoch": 4.108597285067873, |
| "grad_norm": 0.5759614706039429, |
| "learning_rate": 0.00011059205949168037, |
| "loss": 0.027157757431268692, |
| "mean_token_accuracy": 0.9911233633756638, |
| "num_tokens": 10033483.0, |
| "step": 1138 |
| }, |
| { |
| "entropy": 0.9277460277080536, |
| "epoch": 4.112217194570135, |
| "grad_norm": 0.5908122658729553, |
| "learning_rate": 0.00011046049202791553, |
| "loss": 0.06614906340837479, |
| "mean_token_accuracy": 0.9815282225608826, |
| "num_tokens": 10042459.0, |
| "step": 1139 |
| }, |
| { |
| "entropy": 0.887006476521492, |
| "epoch": 4.115837104072398, |
| "grad_norm": 0.35706913471221924, |
| "learning_rate": 0.00011032892358002862, |
| "loss": 0.018396304920315742, |
| "mean_token_accuracy": 0.9945356100797653, |
| "num_tokens": 10051689.0, |
| "step": 1140 |
| }, |
| { |
| "entropy": 0.9180538654327393, |
| "epoch": 4.11945701357466, |
| "grad_norm": 0.5394801497459412, |
| "learning_rate": 0.00011019735442919594, |
| "loss": 0.07490956783294678, |
| "mean_token_accuracy": 0.9806875884532928, |
| "num_tokens": 10060478.0, |
| "step": 1141 |
| }, |
| { |
| "entropy": 0.9495862722396851, |
| "epoch": 4.123076923076923, |
| "grad_norm": 0.44942763447761536, |
| "learning_rate": 0.00011006578485659513, |
| "loss": 0.044375285506248474, |
| "mean_token_accuracy": 0.9899759888648987, |
| "num_tokens": 10069076.0, |
| "step": 1142 |
| }, |
| { |
| "entropy": 0.925600215792656, |
| "epoch": 4.126696832579185, |
| "grad_norm": 0.42881152033805847, |
| "learning_rate": 0.00010993421514340489, |
| "loss": 0.031960733234882355, |
| "mean_token_accuracy": 0.9871046096086502, |
| "num_tokens": 10078094.0, |
| "step": 1143 |
| }, |
| { |
| "entropy": 0.9153067022562027, |
| "epoch": 4.130316742081448, |
| "grad_norm": 0.38486459851264954, |
| "learning_rate": 0.0001098026455708041, |
| "loss": 0.026029404252767563, |
| "mean_token_accuracy": 0.9911711812019348, |
| "num_tokens": 10086945.0, |
| "step": 1144 |
| }, |
| { |
| "entropy": 0.9598971456289291, |
| "epoch": 4.133936651583711, |
| "grad_norm": 0.43179431557655334, |
| "learning_rate": 0.00010967107641997141, |
| "loss": 0.03247709199786186, |
| "mean_token_accuracy": 0.9906739294528961, |
| "num_tokens": 10095794.0, |
| "step": 1145 |
| }, |
| { |
| "entropy": 0.934249758720398, |
| "epoch": 4.137556561085973, |
| "grad_norm": 0.6291822195053101, |
| "learning_rate": 0.0001095395079720845, |
| "loss": 0.056933820247650146, |
| "mean_token_accuracy": 0.986266016960144, |
| "num_tokens": 10104405.0, |
| "step": 1146 |
| }, |
| { |
| "entropy": 0.9094070792198181, |
| "epoch": 4.141176470588236, |
| "grad_norm": 0.5577878952026367, |
| "learning_rate": 0.00010940794050831964, |
| "loss": 0.057201556861400604, |
| "mean_token_accuracy": 0.9824305176734924, |
| "num_tokens": 10113779.0, |
| "step": 1147 |
| }, |
| { |
| "entropy": 0.8751973658800125, |
| "epoch": 4.144796380090498, |
| "grad_norm": 0.40218469500541687, |
| "learning_rate": 0.00010927637430985091, |
| "loss": 0.026242714375257492, |
| "mean_token_accuracy": 0.9944493323564529, |
| "num_tokens": 10122831.0, |
| "step": 1148 |
| }, |
| { |
| "entropy": 0.8808578848838806, |
| "epoch": 4.1484162895927605, |
| "grad_norm": 0.32130417227745056, |
| "learning_rate": 0.00010914480965784974, |
| "loss": 0.022959765046834946, |
| "mean_token_accuracy": 0.9945127964019775, |
| "num_tokens": 10132207.0, |
| "step": 1149 |
| }, |
| { |
| "entropy": 0.8300377726554871, |
| "epoch": 4.152036199095023, |
| "grad_norm": 0.4078406095504761, |
| "learning_rate": 0.00010901324683348428, |
| "loss": 0.026690851897001266, |
| "mean_token_accuracy": 0.9955218881368637, |
| "num_tokens": 10141567.0, |
| "step": 1150 |
| }, |
| { |
| "entropy": 0.8891526609659195, |
| "epoch": 4.155656108597285, |
| "grad_norm": 0.47017818689346313, |
| "learning_rate": 0.00010888168611791864, |
| "loss": 0.022021235898137093, |
| "mean_token_accuracy": 0.98915895819664, |
| "num_tokens": 10150452.0, |
| "step": 1151 |
| }, |
| { |
| "entropy": 0.860780343413353, |
| "epoch": 4.159276018099548, |
| "grad_norm": 0.6516409516334534, |
| "learning_rate": 0.0001087501277923126, |
| "loss": 0.04971027001738548, |
| "mean_token_accuracy": 0.9832734167575836, |
| "num_tokens": 10159499.0, |
| "step": 1152 |
| }, |
| { |
| "entropy": 0.8994930535554886, |
| "epoch": 4.16289592760181, |
| "grad_norm": 0.5181270837783813, |
| "learning_rate": 0.00010861857213782068, |
| "loss": 0.03901517018675804, |
| "mean_token_accuracy": 0.9868257790803909, |
| "num_tokens": 10168426.0, |
| "step": 1153 |
| }, |
| { |
| "entropy": 0.8511227667331696, |
| "epoch": 4.166515837104073, |
| "grad_norm": 0.4299660921096802, |
| "learning_rate": 0.00010848701943559176, |
| "loss": 0.02564258500933647, |
| "mean_token_accuracy": 0.9918291866779327, |
| "num_tokens": 10177416.0, |
| "step": 1154 |
| }, |
| { |
| "entropy": 0.8692123293876648, |
| "epoch": 4.170135746606335, |
| "grad_norm": 0.7815737128257751, |
| "learning_rate": 0.00010835546996676848, |
| "loss": 0.05228308588266373, |
| "mean_token_accuracy": 0.9848518073558807, |
| "num_tokens": 10186241.0, |
| "step": 1155 |
| }, |
| { |
| "entropy": 0.8547987043857574, |
| "epoch": 4.173755656108598, |
| "grad_norm": 0.6081969738006592, |
| "learning_rate": 0.00010822392401248649, |
| "loss": 0.044268108904361725, |
| "mean_token_accuracy": 0.9888571053743362, |
| "num_tokens": 10195462.0, |
| "step": 1156 |
| }, |
| { |
| "entropy": 0.8557311743497849, |
| "epoch": 4.17737556561086, |
| "grad_norm": 0.4504448175430298, |
| "learning_rate": 0.00010809238185387389, |
| "loss": 0.047527655959129333, |
| "mean_token_accuracy": 0.984959602355957, |
| "num_tokens": 10204724.0, |
| "step": 1157 |
| }, |
| { |
| "entropy": 0.8731201887130737, |
| "epoch": 4.180995475113122, |
| "grad_norm": 0.6010822653770447, |
| "learning_rate": 0.00010796084377205071, |
| "loss": 0.05222795158624649, |
| "mean_token_accuracy": 0.9812868386507034, |
| "num_tokens": 10213426.0, |
| "step": 1158 |
| }, |
| { |
| "entropy": 0.8757798075675964, |
| "epoch": 4.184615384615385, |
| "grad_norm": 0.655741274356842, |
| "learning_rate": 0.00010782931004812831, |
| "loss": 0.03696342930197716, |
| "mean_token_accuracy": 0.988820269703865, |
| "num_tokens": 10221951.0, |
| "step": 1159 |
| }, |
| { |
| "entropy": 0.8456969112157822, |
| "epoch": 4.188235294117647, |
| "grad_norm": 0.7178743481636047, |
| "learning_rate": 0.00010769778096320873, |
| "loss": 0.03984824940562248, |
| "mean_token_accuracy": 0.9859680682420731, |
| "num_tokens": 10231000.0, |
| "step": 1160 |
| }, |
| { |
| "entropy": 0.8414007723331451, |
| "epoch": 4.19185520361991, |
| "grad_norm": 0.4650433361530304, |
| "learning_rate": 0.00010756625679838397, |
| "loss": 0.027776649221777916, |
| "mean_token_accuracy": 0.9927218854427338, |
| "num_tokens": 10240349.0, |
| "step": 1161 |
| }, |
| { |
| "entropy": 0.8852977901697159, |
| "epoch": 4.195475113122172, |
| "grad_norm": 0.7676152586936951, |
| "learning_rate": 0.0001074347378347357, |
| "loss": 0.04361552372574806, |
| "mean_token_accuracy": 0.9907232075929642, |
| "num_tokens": 10248879.0, |
| "step": 1162 |
| }, |
| { |
| "entropy": 0.8889047503471375, |
| "epoch": 4.199095022624435, |
| "grad_norm": 0.4866149127483368, |
| "learning_rate": 0.00010730322435333433, |
| "loss": 0.033081792294979095, |
| "mean_token_accuracy": 0.9893742352724075, |
| "num_tokens": 10257389.0, |
| "step": 1163 |
| }, |
| { |
| "entropy": 0.8337071388959885, |
| "epoch": 4.202714932126697, |
| "grad_norm": 0.3786505460739136, |
| "learning_rate": 0.00010717171663523871, |
| "loss": 0.032933011651039124, |
| "mean_token_accuracy": 0.989232674241066, |
| "num_tokens": 10266161.0, |
| "step": 1164 |
| }, |
| { |
| "entropy": 0.856635645031929, |
| "epoch": 4.206334841628959, |
| "grad_norm": 0.42627274990081787, |
| "learning_rate": 0.00010704021496149517, |
| "loss": 0.03942575678229332, |
| "mean_token_accuracy": 0.9888184368610382, |
| "num_tokens": 10275036.0, |
| "step": 1165 |
| }, |
| { |
| "entropy": 0.8601881414651871, |
| "epoch": 4.209954751131222, |
| "grad_norm": 0.44472524523735046, |
| "learning_rate": 0.00010690871961313724, |
| "loss": 0.03517714887857437, |
| "mean_token_accuracy": 0.9896524995565414, |
| "num_tokens": 10283967.0, |
| "step": 1166 |
| }, |
| { |
| "entropy": 0.8576315343379974, |
| "epoch": 4.213574660633484, |
| "grad_norm": 0.641677975654602, |
| "learning_rate": 0.00010677723087118495, |
| "loss": 0.055644311010837555, |
| "mean_token_accuracy": 0.9859830141067505, |
| "num_tokens": 10293047.0, |
| "step": 1167 |
| }, |
| { |
| "entropy": 0.8803037852048874, |
| "epoch": 4.217194570135747, |
| "grad_norm": 0.45178845524787903, |
| "learning_rate": 0.00010664574901664415, |
| "loss": 0.02904437854886055, |
| "mean_token_accuracy": 0.9915114343166351, |
| "num_tokens": 10301802.0, |
| "step": 1168 |
| }, |
| { |
| "entropy": 0.8520867824554443, |
| "epoch": 4.220814479638009, |
| "grad_norm": 0.5569225549697876, |
| "learning_rate": 0.00010651427433050603, |
| "loss": 0.037966180592775345, |
| "mean_token_accuracy": 0.9883747845888138, |
| "num_tokens": 10310649.0, |
| "step": 1169 |
| }, |
| { |
| "entropy": 0.9215613752603531, |
| "epoch": 4.224434389140272, |
| "grad_norm": 0.34931108355522156, |
| "learning_rate": 0.00010638280709374642, |
| "loss": 0.0191643126308918, |
| "mean_token_accuracy": 0.9933111071586609, |
| "num_tokens": 10319077.0, |
| "step": 1170 |
| }, |
| { |
| "entropy": 0.8855588883161545, |
| "epoch": 4.228054298642534, |
| "grad_norm": 0.4663868844509125, |
| "learning_rate": 0.00010625134758732527, |
| "loss": 0.019490336999297142, |
| "mean_token_accuracy": 0.9924804866313934, |
| "num_tokens": 10327433.0, |
| "step": 1171 |
| }, |
| { |
| "entropy": 0.8989104330539703, |
| "epoch": 4.2316742081447964, |
| "grad_norm": 0.32792529463768005, |
| "learning_rate": 0.0001061198960921859, |
| "loss": 0.023120509460568428, |
| "mean_token_accuracy": 0.9922090172767639, |
| "num_tokens": 10335992.0, |
| "step": 1172 |
| }, |
| { |
| "entropy": 0.9072947949171066, |
| "epoch": 4.235294117647059, |
| "grad_norm": 0.6342188715934753, |
| "learning_rate": 0.00010598845288925465, |
| "loss": 0.057832568883895874, |
| "mean_token_accuracy": 0.9827272593975067, |
| "num_tokens": 10344173.0, |
| "step": 1173 |
| }, |
| { |
| "entropy": 0.8618551194667816, |
| "epoch": 4.238914027149321, |
| "grad_norm": 0.6048193573951721, |
| "learning_rate": 0.0001058570182594401, |
| "loss": 0.045300260186195374, |
| "mean_token_accuracy": 0.9866802990436554, |
| "num_tokens": 10353441.0, |
| "step": 1174 |
| }, |
| { |
| "entropy": 0.8884298205375671, |
| "epoch": 4.242533936651584, |
| "grad_norm": 0.6075233221054077, |
| "learning_rate": 0.00010572559248363244, |
| "loss": 0.043454039841890335, |
| "mean_token_accuracy": 0.983539417386055, |
| "num_tokens": 10362718.0, |
| "step": 1175 |
| }, |
| { |
| "entropy": 0.8603847771883011, |
| "epoch": 4.246153846153846, |
| "grad_norm": 0.4278615117073059, |
| "learning_rate": 0.00010559417584270297, |
| "loss": 0.03194679319858551, |
| "mean_token_accuracy": 0.9888970702886581, |
| "num_tokens": 10371727.0, |
| "step": 1176 |
| }, |
| { |
| "entropy": 0.8933188319206238, |
| "epoch": 4.249773755656109, |
| "grad_norm": 0.7898560166358948, |
| "learning_rate": 0.00010546276861750355, |
| "loss": 0.041754692792892456, |
| "mean_token_accuracy": 0.9864661991596222, |
| "num_tokens": 10380718.0, |
| "step": 1177 |
| }, |
| { |
| "entropy": 0.8819432407617569, |
| "epoch": 4.253393665158371, |
| "grad_norm": 0.6854934096336365, |
| "learning_rate": 0.0001053313710888658, |
| "loss": 0.032602183520793915, |
| "mean_token_accuracy": 0.9920495897531509, |
| "num_tokens": 10389630.0, |
| "step": 1178 |
| }, |
| { |
| "entropy": 0.9068233966827393, |
| "epoch": 4.2570135746606335, |
| "grad_norm": 0.5194956064224243, |
| "learning_rate": 0.00010519998353760068, |
| "loss": 0.04578553885221481, |
| "mean_token_accuracy": 0.9855708330869675, |
| "num_tokens": 10398459.0, |
| "step": 1179 |
| }, |
| { |
| "entropy": 0.8744179904460907, |
| "epoch": 4.260633484162896, |
| "grad_norm": 0.402622789144516, |
| "learning_rate": 0.00010506860624449779, |
| "loss": 0.028466589748859406, |
| "mean_token_accuracy": 0.9951285868883133, |
| "num_tokens": 10407731.0, |
| "step": 1180 |
| }, |
| { |
| "entropy": 0.8742282688617706, |
| "epoch": 4.264253393665158, |
| "grad_norm": 0.3886919319629669, |
| "learning_rate": 0.00010493723949032486, |
| "loss": 0.026117544621229172, |
| "mean_token_accuracy": 0.9926921725273132, |
| "num_tokens": 10416677.0, |
| "step": 1181 |
| }, |
| { |
| "entropy": 0.8899593651294708, |
| "epoch": 4.267873303167421, |
| "grad_norm": 0.4906803369522095, |
| "learning_rate": 0.00010480588355582708, |
| "loss": 0.040936604142189026, |
| "mean_token_accuracy": 0.9897563010454178, |
| "num_tokens": 10425954.0, |
| "step": 1182 |
| }, |
| { |
| "entropy": 0.9250012636184692, |
| "epoch": 4.271493212669683, |
| "grad_norm": 0.5353180170059204, |
| "learning_rate": 0.00010467453872172646, |
| "loss": 0.04668327420949936, |
| "mean_token_accuracy": 0.9911708980798721, |
| "num_tokens": 10434599.0, |
| "step": 1183 |
| }, |
| { |
| "entropy": 0.9150962978601456, |
| "epoch": 4.275113122171946, |
| "grad_norm": 0.9419394731521606, |
| "learning_rate": 0.00010454320526872139, |
| "loss": 0.04198862612247467, |
| "mean_token_accuracy": 0.9864400774240494, |
| "num_tokens": 10443597.0, |
| "step": 1184 |
| }, |
| { |
| "entropy": 0.8760367184877396, |
| "epoch": 4.278733031674208, |
| "grad_norm": 0.5297932624816895, |
| "learning_rate": 0.00010441188347748583, |
| "loss": 0.044471338391304016, |
| "mean_token_accuracy": 0.9881382882595062, |
| "num_tokens": 10452692.0, |
| "step": 1185 |
| }, |
| { |
| "entropy": 0.8790275156497955, |
| "epoch": 4.2823529411764705, |
| "grad_norm": 0.46820127964019775, |
| "learning_rate": 0.00010428057362866888, |
| "loss": 0.040998734533786774, |
| "mean_token_accuracy": 0.9851799458265305, |
| "num_tokens": 10461733.0, |
| "step": 1186 |
| }, |
| { |
| "entropy": 0.897658497095108, |
| "epoch": 4.285972850678733, |
| "grad_norm": 0.42992568016052246, |
| "learning_rate": 0.00010414927600289412, |
| "loss": 0.027044817805290222, |
| "mean_token_accuracy": 0.9944102466106415, |
| "num_tokens": 10470816.0, |
| "step": 1187 |
| }, |
| { |
| "entropy": 0.9087939113378525, |
| "epoch": 4.289592760180995, |
| "grad_norm": 0.32280153036117554, |
| "learning_rate": 0.00010401799088075899, |
| "loss": 0.012480968609452248, |
| "mean_token_accuracy": 0.995869055390358, |
| "num_tokens": 10479089.0, |
| "step": 1188 |
| }, |
| { |
| "entropy": 0.8632544577121735, |
| "epoch": 4.293212669683258, |
| "grad_norm": 0.540331244468689, |
| "learning_rate": 0.0001038867185428342, |
| "loss": 0.023278292268514633, |
| "mean_token_accuracy": 0.9912919998168945, |
| "num_tokens": 10487988.0, |
| "step": 1189 |
| }, |
| { |
| "entropy": 0.9213965982198715, |
| "epoch": 4.29683257918552, |
| "grad_norm": 0.6128994226455688, |
| "learning_rate": 0.00010375545926966316, |
| "loss": 0.029522467404603958, |
| "mean_token_accuracy": 0.9888563305139542, |
| "num_tokens": 10496518.0, |
| "step": 1190 |
| }, |
| { |
| "entropy": 0.9032467007637024, |
| "epoch": 4.300452488687783, |
| "grad_norm": 0.4757763147354126, |
| "learning_rate": 0.00010362421334176138, |
| "loss": 0.01943778805434704, |
| "mean_token_accuracy": 0.9960848391056061, |
| "num_tokens": 10504869.0, |
| "step": 1191 |
| }, |
| { |
| "entropy": 0.8697908222675323, |
| "epoch": 4.304072398190045, |
| "grad_norm": 0.6480269432067871, |
| "learning_rate": 0.00010349298103961577, |
| "loss": 0.06841661036014557, |
| "mean_token_accuracy": 0.9833658635616302, |
| "num_tokens": 10513783.0, |
| "step": 1192 |
| }, |
| { |
| "entropy": 0.9298106580972672, |
| "epoch": 4.3076923076923075, |
| "grad_norm": 0.8510603904724121, |
| "learning_rate": 0.00010336176264368418, |
| "loss": 0.07194619625806808, |
| "mean_token_accuracy": 0.9774241000413895, |
| "num_tokens": 10522312.0, |
| "step": 1193 |
| }, |
| { |
| "entropy": 0.8464011400938034, |
| "epoch": 4.31131221719457, |
| "grad_norm": 0.634172260761261, |
| "learning_rate": 0.00010323055843439473, |
| "loss": 0.07087381184101105, |
| "mean_token_accuracy": 0.9855861663818359, |
| "num_tokens": 10531488.0, |
| "step": 1194 |
| }, |
| { |
| "entropy": 0.8965292721986771, |
| "epoch": 4.314932126696832, |
| "grad_norm": 0.654795229434967, |
| "learning_rate": 0.00010309936869214525, |
| "loss": 0.048451248556375504, |
| "mean_token_accuracy": 0.9868658185005188, |
| "num_tokens": 10539668.0, |
| "step": 1195 |
| }, |
| { |
| "entropy": 0.8605607151985168, |
| "epoch": 4.318552036199095, |
| "grad_norm": 0.5905312299728394, |
| "learning_rate": 0.00010296819369730258, |
| "loss": 0.05599949508905411, |
| "mean_token_accuracy": 0.981212854385376, |
| "num_tokens": 10548226.0, |
| "step": 1196 |
| }, |
| { |
| "entropy": 0.8900521546602249, |
| "epoch": 4.322171945701357, |
| "grad_norm": 0.7872176766395569, |
| "learning_rate": 0.00010283703373020212, |
| "loss": 0.04480139538645744, |
| "mean_token_accuracy": 0.9872453063726425, |
| "num_tokens": 10557015.0, |
| "step": 1197 |
| }, |
| { |
| "entropy": 0.8230308294296265, |
| "epoch": 4.32579185520362, |
| "grad_norm": 0.94143146276474, |
| "learning_rate": 0.00010270588907114716, |
| "loss": 0.04959043487906456, |
| "mean_token_accuracy": 0.9870394170284271, |
| "num_tokens": 10566241.0, |
| "step": 1198 |
| }, |
| { |
| "entropy": 0.8108661025762558, |
| "epoch": 4.329411764705882, |
| "grad_norm": 0.5073636770248413, |
| "learning_rate": 0.00010257476000040816, |
| "loss": 0.039761241525411606, |
| "mean_token_accuracy": 0.9890642464160919, |
| "num_tokens": 10575524.0, |
| "step": 1199 |
| }, |
| { |
| "entropy": 0.8538337796926498, |
| "epoch": 4.3330316742081445, |
| "grad_norm": 0.7470809817314148, |
| "learning_rate": 0.00010244364679822238, |
| "loss": 0.07473456114530563, |
| "mean_token_accuracy": 0.9846348017454147, |
| "num_tokens": 10584153.0, |
| "step": 1200 |
| }, |
| { |
| "entropy": 0.8373128026723862, |
| "epoch": 4.336651583710407, |
| "grad_norm": 0.3188524544239044, |
| "learning_rate": 0.00010231254974479312, |
| "loss": 0.011771505698561668, |
| "mean_token_accuracy": 0.9948063492774963, |
| "num_tokens": 10593069.0, |
| "step": 1201 |
| }, |
| { |
| "entropy": 0.8809863477945328, |
| "epoch": 4.340271493212669, |
| "grad_norm": 0.805793821811676, |
| "learning_rate": 0.00010218146912028917, |
| "loss": 0.23618434369564056, |
| "mean_token_accuracy": 0.9603950381278992, |
| "num_tokens": 10602057.0, |
| "step": 1202 |
| }, |
| { |
| "entropy": 0.8320661336183548, |
| "epoch": 4.343891402714932, |
| "grad_norm": 0.5631644129753113, |
| "learning_rate": 0.00010205040520484423, |
| "loss": 0.0406966507434845, |
| "mean_token_accuracy": 0.9823737889528275, |
| "num_tokens": 10611082.0, |
| "step": 1203 |
| }, |
| { |
| "entropy": 0.9149579256772995, |
| "epoch": 4.347511312217194, |
| "grad_norm": 0.6743116974830627, |
| "learning_rate": 0.0001019193582785563, |
| "loss": 0.07820634543895721, |
| "mean_token_accuracy": 0.9759713411331177, |
| "num_tokens": 10619854.0, |
| "step": 1204 |
| }, |
| { |
| "entropy": 0.859554186463356, |
| "epoch": 4.351131221719457, |
| "grad_norm": 0.6592840552330017, |
| "learning_rate": 0.00010178832862148706, |
| "loss": 0.05549190193414688, |
| "mean_token_accuracy": 0.986658051609993, |
| "num_tokens": 10628992.0, |
| "step": 1205 |
| }, |
| { |
| "entropy": 0.9278796315193176, |
| "epoch": 4.354751131221719, |
| "grad_norm": 0.7901132106781006, |
| "learning_rate": 0.00010165731651366122, |
| "loss": 0.057204730808734894, |
| "mean_token_accuracy": 0.9778600037097931, |
| "num_tokens": 10637257.0, |
| "step": 1206 |
| }, |
| { |
| "entropy": 0.907976433634758, |
| "epoch": 4.3583710407239815, |
| "grad_norm": 0.6733710169792175, |
| "learning_rate": 0.00010152632223506604, |
| "loss": 0.02807607874274254, |
| "mean_token_accuracy": 0.9882861822843552, |
| "num_tokens": 10646088.0, |
| "step": 1207 |
| }, |
| { |
| "entropy": 0.8498242944478989, |
| "epoch": 4.361990950226244, |
| "grad_norm": 0.44677144289016724, |
| "learning_rate": 0.00010139534606565073, |
| "loss": 0.042324379086494446, |
| "mean_token_accuracy": 0.9891783744096756, |
| "num_tokens": 10655709.0, |
| "step": 1208 |
| }, |
| { |
| "entropy": 0.9032723009586334, |
| "epoch": 4.365610859728506, |
| "grad_norm": 0.5025045871734619, |
| "learning_rate": 0.00010126438828532571, |
| "loss": 0.03960299491882324, |
| "mean_token_accuracy": 0.9883267283439636, |
| "num_tokens": 10664403.0, |
| "step": 1209 |
| }, |
| { |
| "entropy": 0.9668902158737183, |
| "epoch": 4.36923076923077, |
| "grad_norm": 0.39445021748542786, |
| "learning_rate": 0.00010113344917396215, |
| "loss": 0.02176782116293907, |
| "mean_token_accuracy": 0.9942312687635422, |
| "num_tokens": 10672986.0, |
| "step": 1210 |
| }, |
| { |
| "entropy": 0.9814326167106628, |
| "epoch": 4.372850678733032, |
| "grad_norm": 0.6776084303855896, |
| "learning_rate": 0.00010100252901139131, |
| "loss": 0.04824589565396309, |
| "mean_token_accuracy": 0.9848027527332306, |
| "num_tokens": 10681148.0, |
| "step": 1211 |
| }, |
| { |
| "entropy": 0.9044711589813232, |
| "epoch": 4.376470588235295, |
| "grad_norm": 0.42135122418403625, |
| "learning_rate": 0.00010087162807740397, |
| "loss": 0.024921081960201263, |
| "mean_token_accuracy": 0.9918465316295624, |
| "num_tokens": 10690147.0, |
| "step": 1212 |
| }, |
| { |
| "entropy": 0.9244499206542969, |
| "epoch": 4.380090497737557, |
| "grad_norm": 0.49353837966918945, |
| "learning_rate": 0.00010074074665174977, |
| "loss": 0.042720977216959, |
| "mean_token_accuracy": 0.9891730397939682, |
| "num_tokens": 10698852.0, |
| "step": 1213 |
| }, |
| { |
| "entropy": 0.847070038318634, |
| "epoch": 4.383710407239819, |
| "grad_norm": 0.3503934144973755, |
| "learning_rate": 0.00010060988501413668, |
| "loss": 0.019927293062210083, |
| "mean_token_accuracy": 0.9944724142551422, |
| "num_tokens": 10708150.0, |
| "step": 1214 |
| }, |
| { |
| "entropy": 0.8821483105421066, |
| "epoch": 4.387330316742082, |
| "grad_norm": 0.5833554267883301, |
| "learning_rate": 0.00010047904344423043, |
| "loss": 0.05499357730150223, |
| "mean_token_accuracy": 0.9791678488254547, |
| "num_tokens": 10716975.0, |
| "step": 1215 |
| }, |
| { |
| "entropy": 0.8406474888324738, |
| "epoch": 4.390950226244344, |
| "grad_norm": 0.5742695927619934, |
| "learning_rate": 0.00010034822222165377, |
| "loss": 0.038775935769081116, |
| "mean_token_accuracy": 0.986319050192833, |
| "num_tokens": 10725858.0, |
| "step": 1216 |
| }, |
| { |
| "entropy": 0.857618436217308, |
| "epoch": 4.394570135746607, |
| "grad_norm": 0.5021587014198303, |
| "learning_rate": 0.00010021742162598606, |
| "loss": 0.024894721806049347, |
| "mean_token_accuracy": 0.99241703748703, |
| "num_tokens": 10734716.0, |
| "step": 1217 |
| }, |
| { |
| "entropy": 0.9438434839248657, |
| "epoch": 4.398190045248869, |
| "grad_norm": 0.5450843572616577, |
| "learning_rate": 0.00010008664193676251, |
| "loss": 0.0339815691113472, |
| "mean_token_accuracy": 0.9880622923374176, |
| "num_tokens": 10742902.0, |
| "step": 1218 |
| }, |
| { |
| "entropy": 0.9149988889694214, |
| "epoch": 4.401809954751132, |
| "grad_norm": 0.5017070770263672, |
| "learning_rate": 9.995588343347373e-05, |
| "loss": 0.03535865992307663, |
| "mean_token_accuracy": 0.9894069284200668, |
| "num_tokens": 10751684.0, |
| "step": 1219 |
| }, |
| { |
| "entropy": 0.8433951437473297, |
| "epoch": 4.405429864253394, |
| "grad_norm": 0.4098859131336212, |
| "learning_rate": 9.98251463955649e-05, |
| "loss": 0.03204619884490967, |
| "mean_token_accuracy": 0.9892712533473969, |
| "num_tokens": 10760889.0, |
| "step": 1220 |
| }, |
| { |
| "entropy": 0.9143005758523941, |
| "epoch": 4.409049773755656, |
| "grad_norm": 0.8866249918937683, |
| "learning_rate": 9.96944311024355e-05, |
| "loss": 0.08796840161085129, |
| "mean_token_accuracy": 0.9761585295200348, |
| "num_tokens": 10769436.0, |
| "step": 1221 |
| }, |
| { |
| "entropy": 0.8927360326051712, |
| "epoch": 4.412669683257919, |
| "grad_norm": 0.5494005680084229, |
| "learning_rate": 9.956373783343847e-05, |
| "loss": 0.03506309166550636, |
| "mean_token_accuracy": 0.9899781793355942, |
| "num_tokens": 10778203.0, |
| "step": 1222 |
| }, |
| { |
| "entropy": 0.8397981226444244, |
| "epoch": 4.416289592760181, |
| "grad_norm": 0.47664108872413635, |
| "learning_rate": 9.943306686787964e-05, |
| "loss": 0.04049497842788696, |
| "mean_token_accuracy": 0.9860450029373169, |
| "num_tokens": 10787495.0, |
| "step": 1223 |
| }, |
| { |
| "entropy": 0.87860107421875, |
| "epoch": 4.419909502262444, |
| "grad_norm": 0.860580563545227, |
| "learning_rate": 9.930241848501722e-05, |
| "loss": 0.17442655563354492, |
| "mean_token_accuracy": 0.9721402823925018, |
| "num_tokens": 10796694.0, |
| "step": 1224 |
| }, |
| { |
| "entropy": 0.8964706212282181, |
| "epoch": 4.423529411764706, |
| "grad_norm": 0.5525467395782471, |
| "learning_rate": 9.917179296406116e-05, |
| "loss": 0.045697472989559174, |
| "mean_token_accuracy": 0.9850039780139923, |
| "num_tokens": 10805314.0, |
| "step": 1225 |
| }, |
| { |
| "entropy": 0.9075468629598618, |
| "epoch": 4.427149321266969, |
| "grad_norm": 1.0463852882385254, |
| "learning_rate": 9.904119058417256e-05, |
| "loss": 0.19723130762577057, |
| "mean_token_accuracy": 0.964007630944252, |
| "num_tokens": 10814188.0, |
| "step": 1226 |
| }, |
| { |
| "entropy": 0.892121896147728, |
| "epoch": 4.430769230769231, |
| "grad_norm": 0.8913745880126953, |
| "learning_rate": 9.891061162446302e-05, |
| "loss": 0.05357573926448822, |
| "mean_token_accuracy": 0.9836710542440414, |
| "num_tokens": 10822890.0, |
| "step": 1227 |
| }, |
| { |
| "entropy": 0.8689533025026321, |
| "epoch": 4.4343891402714934, |
| "grad_norm": 0.4191250503063202, |
| "learning_rate": 9.87800563639941e-05, |
| "loss": 0.027562851086258888, |
| "mean_token_accuracy": 0.9921068102121353, |
| "num_tokens": 10832375.0, |
| "step": 1228 |
| }, |
| { |
| "entropy": 0.9156316965818405, |
| "epoch": 4.438009049773756, |
| "grad_norm": 0.40549466013908386, |
| "learning_rate": 9.864952508177673e-05, |
| "loss": 0.026554280892014503, |
| "mean_token_accuracy": 0.990155890583992, |
| "num_tokens": 10840600.0, |
| "step": 1229 |
| }, |
| { |
| "entropy": 0.8978532254695892, |
| "epoch": 4.441628959276018, |
| "grad_norm": 0.6001290678977966, |
| "learning_rate": 9.851901805677066e-05, |
| "loss": 0.044659726321697235, |
| "mean_token_accuracy": 0.9852373898029327, |
| "num_tokens": 10849243.0, |
| "step": 1230 |
| }, |
| { |
| "entropy": 0.8671956062316895, |
| "epoch": 4.445248868778281, |
| "grad_norm": 0.46831727027893066, |
| "learning_rate": 9.838853556788366e-05, |
| "loss": 0.024291303008794785, |
| "mean_token_accuracy": 0.9942016154527664, |
| "num_tokens": 10858006.0, |
| "step": 1231 |
| }, |
| { |
| "entropy": 0.8727803975343704, |
| "epoch": 4.448868778280543, |
| "grad_norm": 0.5648714303970337, |
| "learning_rate": 9.825807789397115e-05, |
| "loss": 0.03640315309166908, |
| "mean_token_accuracy": 0.9874684065580368, |
| "num_tokens": 10867043.0, |
| "step": 1232 |
| }, |
| { |
| "entropy": 0.889243483543396, |
| "epoch": 4.452488687782806, |
| "grad_norm": 0.5327542424201965, |
| "learning_rate": 9.812764531383556e-05, |
| "loss": 0.0371791273355484, |
| "mean_token_accuracy": 0.9900572001934052, |
| "num_tokens": 10875412.0, |
| "step": 1233 |
| }, |
| { |
| "entropy": 0.9031431376934052, |
| "epoch": 4.456108597285068, |
| "grad_norm": 0.4150716960430145, |
| "learning_rate": 9.799723810622552e-05, |
| "loss": 0.023472465574741364, |
| "mean_token_accuracy": 0.9918784946203232, |
| "num_tokens": 10883763.0, |
| "step": 1234 |
| }, |
| { |
| "entropy": 0.85833740234375, |
| "epoch": 4.4597285067873305, |
| "grad_norm": 0.7519460916519165, |
| "learning_rate": 9.786685654983567e-05, |
| "loss": 0.04321755841374397, |
| "mean_token_accuracy": 0.9835457056760788, |
| "num_tokens": 10892993.0, |
| "step": 1235 |
| }, |
| { |
| "entropy": 0.8594792932271957, |
| "epoch": 4.463348416289593, |
| "grad_norm": 0.4252811670303345, |
| "learning_rate": 9.773650092330566e-05, |
| "loss": 0.027196653187274933, |
| "mean_token_accuracy": 0.9954462945461273, |
| "num_tokens": 10901708.0, |
| "step": 1236 |
| }, |
| { |
| "entropy": 0.9015035331249237, |
| "epoch": 4.466968325791855, |
| "grad_norm": 0.5715810656547546, |
| "learning_rate": 9.760617150521976e-05, |
| "loss": 0.07570506632328033, |
| "mean_token_accuracy": 0.9812192022800446, |
| "num_tokens": 10910501.0, |
| "step": 1237 |
| }, |
| { |
| "entropy": 0.8576754629611969, |
| "epoch": 4.470588235294118, |
| "grad_norm": 0.6568597555160522, |
| "learning_rate": 9.747586857410629e-05, |
| "loss": 0.04479028284549713, |
| "mean_token_accuracy": 0.9829374700784683, |
| "num_tokens": 10919679.0, |
| "step": 1238 |
| }, |
| { |
| "entropy": 0.9022691249847412, |
| "epoch": 4.47420814479638, |
| "grad_norm": 0.8238912224769592, |
| "learning_rate": 9.73455924084369e-05, |
| "loss": 0.05337506905198097, |
| "mean_token_accuracy": 0.9812590926885605, |
| "num_tokens": 10928585.0, |
| "step": 1239 |
| }, |
| { |
| "entropy": 0.926213338971138, |
| "epoch": 4.477828054298643, |
| "grad_norm": 0.632087767124176, |
| "learning_rate": 9.721534328662609e-05, |
| "loss": 0.0570443719625473, |
| "mean_token_accuracy": 0.9859164953231812, |
| "num_tokens": 10937371.0, |
| "step": 1240 |
| }, |
| { |
| "entropy": 0.8500941395759583, |
| "epoch": 4.481447963800905, |
| "grad_norm": 0.43256041407585144, |
| "learning_rate": 9.708512148703049e-05, |
| "loss": 0.03233124688267708, |
| "mean_token_accuracy": 0.9893523305654526, |
| "num_tokens": 10946660.0, |
| "step": 1241 |
| }, |
| { |
| "entropy": 0.9078550934791565, |
| "epoch": 4.4850678733031675, |
| "grad_norm": 0.4536542296409607, |
| "learning_rate": 9.695492728794837e-05, |
| "loss": 0.028730809688568115, |
| "mean_token_accuracy": 0.9890875816345215, |
| "num_tokens": 10955396.0, |
| "step": 1242 |
| }, |
| { |
| "entropy": 0.9219816625118256, |
| "epoch": 4.48868778280543, |
| "grad_norm": 0.6403781771659851, |
| "learning_rate": 9.682476096761907e-05, |
| "loss": 0.05681582912802696, |
| "mean_token_accuracy": 0.9841127395629883, |
| "num_tokens": 10964137.0, |
| "step": 1243 |
| }, |
| { |
| "entropy": 0.8394899666309357, |
| "epoch": 4.492307692307692, |
| "grad_norm": 0.4959585666656494, |
| "learning_rate": 9.669462280422234e-05, |
| "loss": 0.04324223846197128, |
| "mean_token_accuracy": 0.9881971031427383, |
| "num_tokens": 10973576.0, |
| "step": 1244 |
| }, |
| { |
| "entropy": 0.9000880867242813, |
| "epoch": 4.495927601809955, |
| "grad_norm": 0.5821248292922974, |
| "learning_rate": 9.656451307587769e-05, |
| "loss": 0.034078195691108704, |
| "mean_token_accuracy": 0.9908082634210587, |
| "num_tokens": 10982332.0, |
| "step": 1245 |
| }, |
| { |
| "entropy": 0.9280275702476501, |
| "epoch": 4.499547511312217, |
| "grad_norm": 0.3630794286727905, |
| "learning_rate": 9.643443206064386e-05, |
| "loss": 0.020034782588481903, |
| "mean_token_accuracy": 0.9927627146244049, |
| "num_tokens": 10990693.0, |
| "step": 1246 |
| }, |
| { |
| "entropy": 0.8500865697860718, |
| "epoch": 4.50316742081448, |
| "grad_norm": 0.6605433225631714, |
| "learning_rate": 9.630438003651833e-05, |
| "loss": 0.049847669899463654, |
| "mean_token_accuracy": 0.9830765873193741, |
| "num_tokens": 10999984.0, |
| "step": 1247 |
| }, |
| { |
| "entropy": 0.8788967728614807, |
| "epoch": 4.506787330316742, |
| "grad_norm": 0.49596118927001953, |
| "learning_rate": 9.617435728143654e-05, |
| "loss": 0.040307819843292236, |
| "mean_token_accuracy": 0.9881236255168915, |
| "num_tokens": 11008928.0, |
| "step": 1248 |
| }, |
| { |
| "entropy": 0.8518171012401581, |
| "epoch": 4.5104072398190045, |
| "grad_norm": 0.48991096019744873, |
| "learning_rate": 9.60443640732713e-05, |
| "loss": 0.03139914199709892, |
| "mean_token_accuracy": 0.9922763705253601, |
| "num_tokens": 11018014.0, |
| "step": 1249 |
| }, |
| { |
| "entropy": 0.8158316314220428, |
| "epoch": 4.514027149321267, |
| "grad_norm": 0.5165399312973022, |
| "learning_rate": 9.59144006898325e-05, |
| "loss": 0.048440560698509216, |
| "mean_token_accuracy": 0.9869976490736008, |
| "num_tokens": 11027583.0, |
| "step": 1250 |
| }, |
| { |
| "entropy": 0.9327925741672516, |
| "epoch": 4.517647058823529, |
| "grad_norm": 0.667120635509491, |
| "learning_rate": 9.57844674088661e-05, |
| "loss": 0.03646295145153999, |
| "mean_token_accuracy": 0.9886159151792526, |
| "num_tokens": 11035928.0, |
| "step": 1251 |
| }, |
| { |
| "entropy": 0.8431118130683899, |
| "epoch": 4.521266968325792, |
| "grad_norm": 0.5622652173042297, |
| "learning_rate": 9.565456450805382e-05, |
| "loss": 0.029620612040162086, |
| "mean_token_accuracy": 0.9908718019723892, |
| "num_tokens": 11045121.0, |
| "step": 1252 |
| }, |
| { |
| "entropy": 0.9268823117017746, |
| "epoch": 4.524886877828054, |
| "grad_norm": 0.46910595893859863, |
| "learning_rate": 9.552469226501237e-05, |
| "loss": 0.026674820110201836, |
| "mean_token_accuracy": 0.9921789020299911, |
| "num_tokens": 11053733.0, |
| "step": 1253 |
| }, |
| { |
| "entropy": 0.8750782012939453, |
| "epoch": 4.528506787330317, |
| "grad_norm": 0.5680973529815674, |
| "learning_rate": 9.539485095729308e-05, |
| "loss": 0.050825320184230804, |
| "mean_token_accuracy": 0.985125944018364, |
| "num_tokens": 11062542.0, |
| "step": 1254 |
| }, |
| { |
| "entropy": 0.889702245593071, |
| "epoch": 4.532126696832579, |
| "grad_norm": 0.5325557589530945, |
| "learning_rate": 9.526504086238097e-05, |
| "loss": 0.04129321873188019, |
| "mean_token_accuracy": 0.9894774854183197, |
| "num_tokens": 11071184.0, |
| "step": 1255 |
| }, |
| { |
| "entropy": 0.8572105765342712, |
| "epoch": 4.5357466063348415, |
| "grad_norm": 0.5291478037834167, |
| "learning_rate": 9.513526225769454e-05, |
| "loss": 0.028147444128990173, |
| "mean_token_accuracy": 0.9894004464149475, |
| "num_tokens": 11080303.0, |
| "step": 1256 |
| }, |
| { |
| "entropy": 0.9068948477506638, |
| "epoch": 4.539366515837104, |
| "grad_norm": 0.5786257982254028, |
| "learning_rate": 9.500551542058492e-05, |
| "loss": 0.03714841976761818, |
| "mean_token_accuracy": 0.9861379265785217, |
| "num_tokens": 11088917.0, |
| "step": 1257 |
| }, |
| { |
| "entropy": 0.8949933052062988, |
| "epoch": 4.542986425339366, |
| "grad_norm": 0.5275850892066956, |
| "learning_rate": 9.487580062833532e-05, |
| "loss": 0.04033740609884262, |
| "mean_token_accuracy": 0.9893816113471985, |
| "num_tokens": 11097661.0, |
| "step": 1258 |
| }, |
| { |
| "entropy": 0.9183568209409714, |
| "epoch": 4.546606334841629, |
| "grad_norm": 0.6421916484832764, |
| "learning_rate": 9.474611815816048e-05, |
| "loss": 0.06847433000802994, |
| "mean_token_accuracy": 0.9772931635379791, |
| "num_tokens": 11106229.0, |
| "step": 1259 |
| }, |
| { |
| "entropy": 0.9245471358299255, |
| "epoch": 4.550226244343891, |
| "grad_norm": 0.5704196691513062, |
| "learning_rate": 9.461646828720616e-05, |
| "loss": 0.0404348149895668, |
| "mean_token_accuracy": 0.9885217696428299, |
| "num_tokens": 11115179.0, |
| "step": 1260 |
| }, |
| { |
| "entropy": 0.9546661376953125, |
| "epoch": 4.553846153846154, |
| "grad_norm": 0.8367648720741272, |
| "learning_rate": 9.448685129254828e-05, |
| "loss": 0.07936983555555344, |
| "mean_token_accuracy": 0.9769112765789032, |
| "num_tokens": 11123860.0, |
| "step": 1261 |
| }, |
| { |
| "entropy": 0.8901357203722, |
| "epoch": 4.557466063348416, |
| "grad_norm": 0.48454558849334717, |
| "learning_rate": 9.435726745119264e-05, |
| "loss": 0.029340846464037895, |
| "mean_token_accuracy": 0.9888796657323837, |
| "num_tokens": 11132821.0, |
| "step": 1262 |
| }, |
| { |
| "entropy": 0.8747525364160538, |
| "epoch": 4.5610859728506785, |
| "grad_norm": 0.47339335083961487, |
| "learning_rate": 9.422771704007409e-05, |
| "loss": 0.03331891447305679, |
| "mean_token_accuracy": 0.9901967644691467, |
| "num_tokens": 11141742.0, |
| "step": 1263 |
| }, |
| { |
| "entropy": 0.8973748683929443, |
| "epoch": 4.564705882352941, |
| "grad_norm": 0.5341124534606934, |
| "learning_rate": 9.409820033605614e-05, |
| "loss": 0.03468211740255356, |
| "mean_token_accuracy": 0.990693673491478, |
| "num_tokens": 11150411.0, |
| "step": 1264 |
| }, |
| { |
| "entropy": 0.8835297226905823, |
| "epoch": 4.568325791855203, |
| "grad_norm": 0.5115938186645508, |
| "learning_rate": 9.39687176159302e-05, |
| "loss": 0.037047356367111206, |
| "mean_token_accuracy": 0.9864104092121124, |
| "num_tokens": 11159328.0, |
| "step": 1265 |
| }, |
| { |
| "entropy": 0.8623639047145844, |
| "epoch": 4.571945701357466, |
| "grad_norm": 0.5846521258354187, |
| "learning_rate": 9.3839269156415e-05, |
| "loss": 0.04693792760372162, |
| "mean_token_accuracy": 0.9854484647512436, |
| "num_tokens": 11168251.0, |
| "step": 1266 |
| }, |
| { |
| "entropy": 0.9362972676753998, |
| "epoch": 4.575565610859728, |
| "grad_norm": 0.26710987091064453, |
| "learning_rate": 9.370985523415623e-05, |
| "loss": 0.014222146943211555, |
| "mean_token_accuracy": 0.9938410818576813, |
| "num_tokens": 11176539.0, |
| "step": 1267 |
| }, |
| { |
| "entropy": 0.9218808859586716, |
| "epoch": 4.579185520361991, |
| "grad_norm": 0.594578742980957, |
| "learning_rate": 9.358047612572554e-05, |
| "loss": 0.04387912154197693, |
| "mean_token_accuracy": 0.9871724396944046, |
| "num_tokens": 11184908.0, |
| "step": 1268 |
| }, |
| { |
| "entropy": 0.9472506046295166, |
| "epoch": 4.582805429864253, |
| "grad_norm": 0.7246670722961426, |
| "learning_rate": 9.345113210762033e-05, |
| "loss": 0.060271404683589935, |
| "mean_token_accuracy": 0.9830828458070755, |
| "num_tokens": 11193370.0, |
| "step": 1269 |
| }, |
| { |
| "entropy": 0.8755769431591034, |
| "epoch": 4.5864253393665155, |
| "grad_norm": 0.4499802887439728, |
| "learning_rate": 9.332182345626297e-05, |
| "loss": 0.019561700522899628, |
| "mean_token_accuracy": 0.9953616410493851, |
| "num_tokens": 11202005.0, |
| "step": 1270 |
| }, |
| { |
| "entropy": 0.9018907248973846, |
| "epoch": 4.590045248868778, |
| "grad_norm": 0.4629576504230499, |
| "learning_rate": 9.319255044800026e-05, |
| "loss": 0.018598034977912903, |
| "mean_token_accuracy": 0.9917204529047012, |
| "num_tokens": 11210617.0, |
| "step": 1271 |
| }, |
| { |
| "entropy": 0.9013173431158066, |
| "epoch": 4.59366515837104, |
| "grad_norm": 0.5423282980918884, |
| "learning_rate": 9.306331335910279e-05, |
| "loss": 0.03331900015473366, |
| "mean_token_accuracy": 0.9889495521783829, |
| "num_tokens": 11219357.0, |
| "step": 1272 |
| }, |
| { |
| "entropy": 0.9359396398067474, |
| "epoch": 4.597285067873303, |
| "grad_norm": 0.37270912528038025, |
| "learning_rate": 9.293411246576442e-05, |
| "loss": 0.01753406971693039, |
| "mean_token_accuracy": 0.995793953537941, |
| "num_tokens": 11227697.0, |
| "step": 1273 |
| }, |
| { |
| "entropy": 0.8669909536838531, |
| "epoch": 4.600904977375565, |
| "grad_norm": 0.7340614795684814, |
| "learning_rate": 9.280494804410167e-05, |
| "loss": 0.067268967628479, |
| "mean_token_accuracy": 0.9793154299259186, |
| "num_tokens": 11236232.0, |
| "step": 1274 |
| }, |
| { |
| "entropy": 0.923119992017746, |
| "epoch": 4.604524886877828, |
| "grad_norm": 1.0877807140350342, |
| "learning_rate": 9.267582037015308e-05, |
| "loss": 0.05310777574777603, |
| "mean_token_accuracy": 0.9892044812440872, |
| "num_tokens": 11244814.0, |
| "step": 1275 |
| }, |
| { |
| "entropy": 0.8926683962345123, |
| "epoch": 4.60814479638009, |
| "grad_norm": 0.4921732246875763, |
| "learning_rate": 9.254672971987863e-05, |
| "loss": 0.027801400050520897, |
| "mean_token_accuracy": 0.9915051609277725, |
| "num_tokens": 11253577.0, |
| "step": 1276 |
| }, |
| { |
| "entropy": 0.9701418429613113, |
| "epoch": 4.6117647058823525, |
| "grad_norm": 0.7067182064056396, |
| "learning_rate": 9.241767636915923e-05, |
| "loss": 0.04861636832356453, |
| "mean_token_accuracy": 0.9905703663825989, |
| "num_tokens": 11261550.0, |
| "step": 1277 |
| }, |
| { |
| "entropy": 0.9237975776195526, |
| "epoch": 4.615384615384615, |
| "grad_norm": 0.45785027742385864, |
| "learning_rate": 9.22886605937961e-05, |
| "loss": 0.031087510287761688, |
| "mean_token_accuracy": 0.9908934086561203, |
| "num_tokens": 11270148.0, |
| "step": 1278 |
| }, |
| { |
| "entropy": 0.9537828266620636, |
| "epoch": 4.619004524886877, |
| "grad_norm": 1.2928509712219238, |
| "learning_rate": 9.21596826695101e-05, |
| "loss": 0.03970994055271149, |
| "mean_token_accuracy": 0.9891022890806198, |
| "num_tokens": 11278610.0, |
| "step": 1279 |
| }, |
| { |
| "entropy": 0.8938749879598618, |
| "epoch": 4.62262443438914, |
| "grad_norm": 0.3910481929779053, |
| "learning_rate": 9.203074287194118e-05, |
| "loss": 0.03463595360517502, |
| "mean_token_accuracy": 0.9896118193864822, |
| "num_tokens": 11287694.0, |
| "step": 1280 |
| }, |
| { |
| "entropy": 0.9483572989702225, |
| "epoch": 4.626244343891402, |
| "grad_norm": 0.48045435547828674, |
| "learning_rate": 9.190184147664791e-05, |
| "loss": 0.0231014396995306, |
| "mean_token_accuracy": 0.9937128722667694, |
| "num_tokens": 11296155.0, |
| "step": 1281 |
| }, |
| { |
| "entropy": 0.9255064278841019, |
| "epoch": 4.629864253393665, |
| "grad_norm": 0.39855605363845825, |
| "learning_rate": 9.177297875910667e-05, |
| "loss": 0.021291583776474, |
| "mean_token_accuracy": 0.9956228137016296, |
| "num_tokens": 11304810.0, |
| "step": 1282 |
| }, |
| { |
| "entropy": 0.9462402909994125, |
| "epoch": 4.633484162895927, |
| "grad_norm": 0.4094499349594116, |
| "learning_rate": 9.164415499471126e-05, |
| "loss": 0.032552555203437805, |
| "mean_token_accuracy": 0.9914036691188812, |
| "num_tokens": 11313680.0, |
| "step": 1283 |
| }, |
| { |
| "entropy": 0.951962873339653, |
| "epoch": 4.63710407239819, |
| "grad_norm": 0.7086257934570312, |
| "learning_rate": 9.151537045877221e-05, |
| "loss": 0.04020824283361435, |
| "mean_token_accuracy": 0.9873267412185669, |
| "num_tokens": 11322162.0, |
| "step": 1284 |
| }, |
| { |
| "entropy": 0.9462797939777374, |
| "epoch": 4.640723981900453, |
| "grad_norm": 0.6588366627693176, |
| "learning_rate": 9.138662542651621e-05, |
| "loss": 0.047957099974155426, |
| "mean_token_accuracy": 0.9874707162380219, |
| "num_tokens": 11331228.0, |
| "step": 1285 |
| }, |
| { |
| "entropy": 0.9177692830562592, |
| "epoch": 4.644343891402715, |
| "grad_norm": 0.39822492003440857, |
| "learning_rate": 9.125792017308553e-05, |
| "loss": 0.03488968685269356, |
| "mean_token_accuracy": 0.9879388362169266, |
| "num_tokens": 11340479.0, |
| "step": 1286 |
| }, |
| { |
| "entropy": 1.017251044511795, |
| "epoch": 4.647963800904978, |
| "grad_norm": 0.6734071373939514, |
| "learning_rate": 9.112925497353746e-05, |
| "loss": 0.0549258328974247, |
| "mean_token_accuracy": 0.9842551499605179, |
| "num_tokens": 11349185.0, |
| "step": 1287 |
| }, |
| { |
| "entropy": 0.9144134521484375, |
| "epoch": 4.65158371040724, |
| "grad_norm": 0.6358972787857056, |
| "learning_rate": 9.100063010284366e-05, |
| "loss": 0.0614742636680603, |
| "mean_token_accuracy": 0.9769433587789536, |
| "num_tokens": 11358288.0, |
| "step": 1288 |
| }, |
| { |
| "entropy": 0.9544530212879181, |
| "epoch": 4.655203619909503, |
| "grad_norm": 0.6581546068191528, |
| "learning_rate": 9.087204583588951e-05, |
| "loss": 0.05966397002339363, |
| "mean_token_accuracy": 0.9798359274864197, |
| "num_tokens": 11367016.0, |
| "step": 1289 |
| }, |
| { |
| "entropy": 0.9357064366340637, |
| "epoch": 4.658823529411765, |
| "grad_norm": 0.7771779298782349, |
| "learning_rate": 9.074350244747379e-05, |
| "loss": 0.0511971078813076, |
| "mean_token_accuracy": 0.981827974319458, |
| "num_tokens": 11376242.0, |
| "step": 1290 |
| }, |
| { |
| "entropy": 0.9112809002399445, |
| "epoch": 4.6624434389140275, |
| "grad_norm": 0.5351385474205017, |
| "learning_rate": 9.061500021230782e-05, |
| "loss": 0.033410944044589996, |
| "mean_token_accuracy": 0.9908107221126556, |
| "num_tokens": 11385110.0, |
| "step": 1291 |
| }, |
| { |
| "entropy": 0.9920302629470825, |
| "epoch": 4.66606334841629, |
| "grad_norm": 0.7133076190948486, |
| "learning_rate": 9.048653940501499e-05, |
| "loss": 0.07016268372535706, |
| "mean_token_accuracy": 0.9680485278367996, |
| "num_tokens": 11393510.0, |
| "step": 1292 |
| }, |
| { |
| "entropy": 0.9926072955131531, |
| "epoch": 4.669683257918552, |
| "grad_norm": 0.39922699332237244, |
| "learning_rate": 9.035812030013013e-05, |
| "loss": 0.028499318286776543, |
| "mean_token_accuracy": 0.9919241964817047, |
| "num_tokens": 11402001.0, |
| "step": 1293 |
| }, |
| { |
| "entropy": 0.9475626051425934, |
| "epoch": 4.673303167420815, |
| "grad_norm": 0.29215705394744873, |
| "learning_rate": 9.022974317209902e-05, |
| "loss": 0.021906405687332153, |
| "mean_token_accuracy": 0.9935397207736969, |
| "num_tokens": 11410775.0, |
| "step": 1294 |
| }, |
| { |
| "entropy": 0.9720835387706757, |
| "epoch": 4.676923076923077, |
| "grad_norm": 0.5957951545715332, |
| "learning_rate": 9.010140829527767e-05, |
| "loss": 0.03795255348086357, |
| "mean_token_accuracy": 0.9885151833295822, |
| "num_tokens": 11419599.0, |
| "step": 1295 |
| }, |
| { |
| "entropy": 0.9771716296672821, |
| "epoch": 4.68054298642534, |
| "grad_norm": 0.340427041053772, |
| "learning_rate": 8.997311594393172e-05, |
| "loss": 0.027071382850408554, |
| "mean_token_accuracy": 0.9911787509918213, |
| "num_tokens": 11428528.0, |
| "step": 1296 |
| }, |
| { |
| "entropy": 0.9611377120018005, |
| "epoch": 4.684162895927602, |
| "grad_norm": 0.5193417072296143, |
| "learning_rate": 8.98448663922361e-05, |
| "loss": 0.05718943476676941, |
| "mean_token_accuracy": 0.9795946478843689, |
| "num_tokens": 11437338.0, |
| "step": 1297 |
| }, |
| { |
| "entropy": 0.9352683573961258, |
| "epoch": 4.6877828054298645, |
| "grad_norm": 0.5943431854248047, |
| "learning_rate": 8.971665991427414e-05, |
| "loss": 0.10998387634754181, |
| "mean_token_accuracy": 0.9796771854162216, |
| "num_tokens": 11446290.0, |
| "step": 1298 |
| }, |
| { |
| "entropy": 0.9648092687129974, |
| "epoch": 4.691402714932127, |
| "grad_norm": 0.49834129214286804, |
| "learning_rate": 8.958849678403716e-05, |
| "loss": 0.023775417357683182, |
| "mean_token_accuracy": 0.9925644546747208, |
| "num_tokens": 11454628.0, |
| "step": 1299 |
| }, |
| { |
| "entropy": 1.0155327767133713, |
| "epoch": 4.695022624434389, |
| "grad_norm": 0.7852084636688232, |
| "learning_rate": 8.946037727542389e-05, |
| "loss": 0.044226959347724915, |
| "mean_token_accuracy": 0.9850529134273529, |
| "num_tokens": 11462704.0, |
| "step": 1300 |
| }, |
| { |
| "entropy": 0.926691085100174, |
| "epoch": 4.698642533936652, |
| "grad_norm": 0.4994286298751831, |
| "learning_rate": 8.933230166223973e-05, |
| "loss": 0.039819322526454926, |
| "mean_token_accuracy": 0.9850803911685944, |
| "num_tokens": 11471404.0, |
| "step": 1301 |
| }, |
| { |
| "entropy": 0.9265129566192627, |
| "epoch": 4.702262443438914, |
| "grad_norm": 0.42576301097869873, |
| "learning_rate": 8.920427021819642e-05, |
| "loss": 0.02052636444568634, |
| "mean_token_accuracy": 0.9918248355388641, |
| "num_tokens": 11480092.0, |
| "step": 1302 |
| }, |
| { |
| "entropy": 0.8638421446084976, |
| "epoch": 4.705882352941177, |
| "grad_norm": 0.48810967803001404, |
| "learning_rate": 8.90762832169111e-05, |
| "loss": 0.025864070281386375, |
| "mean_token_accuracy": 0.9912404119968414, |
| "num_tokens": 11489596.0, |
| "step": 1303 |
| }, |
| { |
| "entropy": 0.8895996809005737, |
| "epoch": 4.709502262443439, |
| "grad_norm": 0.6966197490692139, |
| "learning_rate": 8.89483409319061e-05, |
| "loss": 0.05462773144245148, |
| "mean_token_accuracy": 0.9865398108959198, |
| "num_tokens": 11499041.0, |
| "step": 1304 |
| }, |
| { |
| "entropy": 0.9096980839967728, |
| "epoch": 4.7131221719457015, |
| "grad_norm": 0.5785058736801147, |
| "learning_rate": 8.882044363660813e-05, |
| "loss": 0.041293296962976456, |
| "mean_token_accuracy": 0.9886120110750198, |
| "num_tokens": 11508044.0, |
| "step": 1305 |
| }, |
| { |
| "entropy": 0.9218699336051941, |
| "epoch": 4.716742081447964, |
| "grad_norm": 0.7000371217727661, |
| "learning_rate": 8.869259160434776e-05, |
| "loss": 0.04127487540245056, |
| "mean_token_accuracy": 0.9850119203329086, |
| "num_tokens": 11516859.0, |
| "step": 1306 |
| }, |
| { |
| "entropy": 0.8685460537672043, |
| "epoch": 4.720361990950226, |
| "grad_norm": 0.6293456554412842, |
| "learning_rate": 8.856478510835878e-05, |
| "loss": 0.0625356063246727, |
| "mean_token_accuracy": 0.9840066283941269, |
| "num_tokens": 11526405.0, |
| "step": 1307 |
| }, |
| { |
| "entropy": 0.9357750713825226, |
| "epoch": 4.723981900452489, |
| "grad_norm": 0.5446988344192505, |
| "learning_rate": 8.843702442177777e-05, |
| "loss": 0.06257246434688568, |
| "mean_token_accuracy": 0.982016310095787, |
| "num_tokens": 11535367.0, |
| "step": 1308 |
| }, |
| { |
| "entropy": 0.9088067710399628, |
| "epoch": 4.727601809954751, |
| "grad_norm": 0.8511834740638733, |
| "learning_rate": 8.830930981764331e-05, |
| "loss": 0.07266386598348618, |
| "mean_token_accuracy": 0.9781930446624756, |
| "num_tokens": 11544182.0, |
| "step": 1309 |
| }, |
| { |
| "entropy": 0.8685061484575272, |
| "epoch": 4.731221719457014, |
| "grad_norm": 0.41793859004974365, |
| "learning_rate": 8.818164156889557e-05, |
| "loss": 0.01888015680015087, |
| "mean_token_accuracy": 0.9903381317853928, |
| "num_tokens": 11553094.0, |
| "step": 1310 |
| }, |
| { |
| "entropy": 0.9195333868265152, |
| "epoch": 4.734841628959276, |
| "grad_norm": 0.45836082100868225, |
| "learning_rate": 8.805401994837552e-05, |
| "loss": 0.04215538874268532, |
| "mean_token_accuracy": 0.988913893699646, |
| "num_tokens": 11561681.0, |
| "step": 1311 |
| }, |
| { |
| "entropy": 0.8929650038480759, |
| "epoch": 4.7384615384615385, |
| "grad_norm": 0.5067728757858276, |
| "learning_rate": 8.79264452288247e-05, |
| "loss": 0.05104423314332962, |
| "mean_token_accuracy": 0.9887937307357788, |
| "num_tokens": 11570799.0, |
| "step": 1312 |
| }, |
| { |
| "entropy": 0.9310666769742966, |
| "epoch": 4.742081447963801, |
| "grad_norm": 0.5647673606872559, |
| "learning_rate": 8.77989176828842e-05, |
| "loss": 0.044227782636880875, |
| "mean_token_accuracy": 0.9847547262907028, |
| "num_tokens": 11579550.0, |
| "step": 1313 |
| }, |
| { |
| "entropy": 0.866066038608551, |
| "epoch": 4.745701357466063, |
| "grad_norm": 0.7207821607589722, |
| "learning_rate": 8.767143758309441e-05, |
| "loss": 0.05543539673089981, |
| "mean_token_accuracy": 0.9844647198915482, |
| "num_tokens": 11589395.0, |
| "step": 1314 |
| }, |
| { |
| "entropy": 0.8902018964290619, |
| "epoch": 4.749321266968326, |
| "grad_norm": 0.6174284219741821, |
| "learning_rate": 8.754400520189434e-05, |
| "loss": 0.05488777905702591, |
| "mean_token_accuracy": 0.9817168563604355, |
| "num_tokens": 11598532.0, |
| "step": 1315 |
| }, |
| { |
| "entropy": 0.9263211190700531, |
| "epoch": 4.752941176470588, |
| "grad_norm": 0.5040092468261719, |
| "learning_rate": 8.741662081162101e-05, |
| "loss": 0.03482822701334953, |
| "mean_token_accuracy": 0.9891321510076523, |
| "num_tokens": 11607186.0, |
| "step": 1316 |
| }, |
| { |
| "entropy": 0.8952204138040543, |
| "epoch": 4.756561085972851, |
| "grad_norm": 0.581134021282196, |
| "learning_rate": 8.728928468450872e-05, |
| "loss": 0.039117198437452316, |
| "mean_token_accuracy": 0.9872443825006485, |
| "num_tokens": 11616135.0, |
| "step": 1317 |
| }, |
| { |
| "entropy": 0.903697595000267, |
| "epoch": 4.760180995475113, |
| "grad_norm": 0.5381007790565491, |
| "learning_rate": 8.716199709268888e-05, |
| "loss": 0.044438499957323074, |
| "mean_token_accuracy": 0.9856991767883301, |
| "num_tokens": 11624827.0, |
| "step": 1318 |
| }, |
| { |
| "entropy": 0.8752525150775909, |
| "epoch": 4.7638009049773755, |
| "grad_norm": 0.3217228353023529, |
| "learning_rate": 8.703475830818897e-05, |
| "loss": 0.02365241013467312, |
| "mean_token_accuracy": 0.9894928485155106, |
| "num_tokens": 11633905.0, |
| "step": 1319 |
| }, |
| { |
| "entropy": 0.8906976282596588, |
| "epoch": 4.767420814479638, |
| "grad_norm": 0.49503424763679504, |
| "learning_rate": 8.690756860293228e-05, |
| "loss": 0.05426553636789322, |
| "mean_token_accuracy": 0.9833587259054184, |
| "num_tokens": 11642732.0, |
| "step": 1320 |
| }, |
| { |
| "entropy": 0.8395461142063141, |
| "epoch": 4.7710407239819, |
| "grad_norm": 0.5068437457084656, |
| "learning_rate": 8.678042824873718e-05, |
| "loss": 0.037847042083740234, |
| "mean_token_accuracy": 0.9877204298973083, |
| "num_tokens": 11651835.0, |
| "step": 1321 |
| }, |
| { |
| "entropy": 0.833812415599823, |
| "epoch": 4.774660633484163, |
| "grad_norm": 0.33258169889450073, |
| "learning_rate": 8.665333751731657e-05, |
| "loss": 0.023194529116153717, |
| "mean_token_accuracy": 0.9938502311706543, |
| "num_tokens": 11661353.0, |
| "step": 1322 |
| }, |
| { |
| "entropy": 0.8494678735733032, |
| "epoch": 4.778280542986425, |
| "grad_norm": 0.41458430886268616, |
| "learning_rate": 8.652629668027731e-05, |
| "loss": 0.02174549549818039, |
| "mean_token_accuracy": 0.9927671700716019, |
| "num_tokens": 11670581.0, |
| "step": 1323 |
| }, |
| { |
| "entropy": 0.9297997653484344, |
| "epoch": 4.781900452488688, |
| "grad_norm": 0.6218132972717285, |
| "learning_rate": 8.639930600911958e-05, |
| "loss": 0.03217285871505737, |
| "mean_token_accuracy": 0.9903489202260971, |
| "num_tokens": 11679163.0, |
| "step": 1324 |
| }, |
| { |
| "entropy": 0.9243068993091583, |
| "epoch": 4.78552036199095, |
| "grad_norm": 0.6559213399887085, |
| "learning_rate": 8.627236577523638e-05, |
| "loss": 0.0579834058880806, |
| "mean_token_accuracy": 0.9795654565095901, |
| "num_tokens": 11687686.0, |
| "step": 1325 |
| }, |
| { |
| "entropy": 0.8774077147245407, |
| "epoch": 4.7891402714932125, |
| "grad_norm": 0.3600568175315857, |
| "learning_rate": 8.614547624991298e-05, |
| "loss": 0.03028794750571251, |
| "mean_token_accuracy": 0.9934851080179214, |
| "num_tokens": 11696611.0, |
| "step": 1326 |
| }, |
| { |
| "entropy": 0.892438679933548, |
| "epoch": 4.792760180995475, |
| "grad_norm": 0.4619959592819214, |
| "learning_rate": 8.601863770432621e-05, |
| "loss": 0.025009188801050186, |
| "mean_token_accuracy": 0.9917362928390503, |
| "num_tokens": 11705104.0, |
| "step": 1327 |
| }, |
| { |
| "entropy": 0.8198121190071106, |
| "epoch": 4.796380090497737, |
| "grad_norm": 0.3183024227619171, |
| "learning_rate": 8.589185040954397e-05, |
| "loss": 0.01881255768239498, |
| "mean_token_accuracy": 0.9933484643697739, |
| "num_tokens": 11714363.0, |
| "step": 1328 |
| }, |
| { |
| "entropy": 0.8623606115579605, |
| "epoch": 4.8, |
| "grad_norm": 0.4741048216819763, |
| "learning_rate": 8.576511463652459e-05, |
| "loss": 0.03326078876852989, |
| "mean_token_accuracy": 0.9923247247934341, |
| "num_tokens": 11723196.0, |
| "step": 1329 |
| }, |
| { |
| "entropy": 0.8758032768964767, |
| "epoch": 4.803619909502262, |
| "grad_norm": 0.4637037515640259, |
| "learning_rate": 8.563843065611644e-05, |
| "loss": 0.016431959345936775, |
| "mean_token_accuracy": 0.9943708181381226, |
| "num_tokens": 11731543.0, |
| "step": 1330 |
| }, |
| { |
| "entropy": 0.8611963838338852, |
| "epoch": 4.807239819004525, |
| "grad_norm": 0.32876336574554443, |
| "learning_rate": 8.551179873905695e-05, |
| "loss": 0.011545347981154919, |
| "mean_token_accuracy": 0.9968069344758987, |
| "num_tokens": 11740082.0, |
| "step": 1331 |
| }, |
| { |
| "entropy": 0.878911018371582, |
| "epoch": 4.810859728506787, |
| "grad_norm": 0.5621935129165649, |
| "learning_rate": 8.538521915597255e-05, |
| "loss": 0.03741168975830078, |
| "mean_token_accuracy": 0.9872355908155441, |
| "num_tokens": 11748731.0, |
| "step": 1332 |
| }, |
| { |
| "entropy": 0.8687689304351807, |
| "epoch": 4.8144796380090495, |
| "grad_norm": 0.6842139959335327, |
| "learning_rate": 8.525869217737765e-05, |
| "loss": 0.04332878440618515, |
| "mean_token_accuracy": 0.9868139326572418, |
| "num_tokens": 11757407.0, |
| "step": 1333 |
| }, |
| { |
| "entropy": 0.8257188647985458, |
| "epoch": 4.818099547511312, |
| "grad_norm": 0.6337531805038452, |
| "learning_rate": 8.513221807367431e-05, |
| "loss": 0.05870246887207031, |
| "mean_token_accuracy": 0.9833265393972397, |
| "num_tokens": 11766726.0, |
| "step": 1334 |
| }, |
| { |
| "entropy": 0.8069233894348145, |
| "epoch": 4.821719457013574, |
| "grad_norm": 0.5104876160621643, |
| "learning_rate": 8.500579711515157e-05, |
| "loss": 0.060887325555086136, |
| "mean_token_accuracy": 0.9869293421506882, |
| "num_tokens": 11776135.0, |
| "step": 1335 |
| }, |
| { |
| "entropy": 0.8201400488615036, |
| "epoch": 4.825339366515837, |
| "grad_norm": 0.5308789610862732, |
| "learning_rate": 8.487942957198494e-05, |
| "loss": 0.0418221578001976, |
| "mean_token_accuracy": 0.9881918877363205, |
| "num_tokens": 11785483.0, |
| "step": 1336 |
| }, |
| { |
| "entropy": 0.7880836576223373, |
| "epoch": 4.828959276018099, |
| "grad_norm": 0.7861600518226624, |
| "learning_rate": 8.47531157142357e-05, |
| "loss": 0.05443998798727989, |
| "mean_token_accuracy": 0.9877728223800659, |
| "num_tokens": 11794707.0, |
| "step": 1337 |
| }, |
| { |
| "entropy": 0.8260870426893234, |
| "epoch": 4.832579185520362, |
| "grad_norm": 0.487807959318161, |
| "learning_rate": 8.462685581185041e-05, |
| "loss": 0.048258986324071884, |
| "mean_token_accuracy": 0.9844470620155334, |
| "num_tokens": 11803756.0, |
| "step": 1338 |
| }, |
| { |
| "entropy": 0.8445771187543869, |
| "epoch": 4.836199095022624, |
| "grad_norm": 0.6421375274658203, |
| "learning_rate": 8.450065013466038e-05, |
| "loss": 0.06861534714698792, |
| "mean_token_accuracy": 0.980402871966362, |
| "num_tokens": 11812898.0, |
| "step": 1339 |
| }, |
| { |
| "entropy": 0.8794773668050766, |
| "epoch": 4.839819004524887, |
| "grad_norm": 0.6781204342842102, |
| "learning_rate": 8.437449895238103e-05, |
| "loss": 0.03358523175120354, |
| "mean_token_accuracy": 0.9903728663921356, |
| "num_tokens": 11821573.0, |
| "step": 1340 |
| }, |
| { |
| "entropy": 0.8457778543233871, |
| "epoch": 4.843438914027149, |
| "grad_norm": 0.44379106163978577, |
| "learning_rate": 8.424840253461122e-05, |
| "loss": 0.029661916196346283, |
| "mean_token_accuracy": 0.9889777451753616, |
| "num_tokens": 11830502.0, |
| "step": 1341 |
| }, |
| { |
| "entropy": 0.8524041026830673, |
| "epoch": 4.847058823529411, |
| "grad_norm": 0.5640258193016052, |
| "learning_rate": 8.412236115083285e-05, |
| "loss": 0.038459956645965576, |
| "mean_token_accuracy": 0.9902327507734299, |
| "num_tokens": 11839647.0, |
| "step": 1342 |
| }, |
| { |
| "entropy": 0.8478478789329529, |
| "epoch": 4.850678733031674, |
| "grad_norm": 0.5391163229942322, |
| "learning_rate": 8.399637507041029e-05, |
| "loss": 0.044210609048604965, |
| "mean_token_accuracy": 0.982999712228775, |
| "num_tokens": 11848488.0, |
| "step": 1343 |
| }, |
| { |
| "entropy": 0.8328516036272049, |
| "epoch": 4.854298642533936, |
| "grad_norm": 0.635456383228302, |
| "learning_rate": 8.387044456258952e-05, |
| "loss": 0.030223991721868515, |
| "mean_token_accuracy": 0.9904019236564636, |
| "num_tokens": 11857067.0, |
| "step": 1344 |
| }, |
| { |
| "entropy": 0.869571715593338, |
| "epoch": 4.857918552036199, |
| "grad_norm": 0.6396393775939941, |
| "learning_rate": 8.37445698964979e-05, |
| "loss": 0.04591453820466995, |
| "mean_token_accuracy": 0.9870909750461578, |
| "num_tokens": 11865616.0, |
| "step": 1345 |
| }, |
| { |
| "entropy": 0.8614618182182312, |
| "epoch": 4.861538461538462, |
| "grad_norm": 0.4452335238456726, |
| "learning_rate": 8.361875134114343e-05, |
| "loss": 0.033750128000974655, |
| "mean_token_accuracy": 0.9905308783054352, |
| "num_tokens": 11874492.0, |
| "step": 1346 |
| }, |
| { |
| "entropy": 0.8712886422872543, |
| "epoch": 4.8651583710407245, |
| "grad_norm": 0.698449969291687, |
| "learning_rate": 8.349298916541415e-05, |
| "loss": 0.048250485211610794, |
| "mean_token_accuracy": 0.9835482239723206, |
| "num_tokens": 11883017.0, |
| "step": 1347 |
| }, |
| { |
| "entropy": 0.8200095295906067, |
| "epoch": 4.868778280542987, |
| "grad_norm": 0.47245922684669495, |
| "learning_rate": 8.336728363807767e-05, |
| "loss": 0.036576300859451294, |
| "mean_token_accuracy": 0.9883123934268951, |
| "num_tokens": 11891712.0, |
| "step": 1348 |
| }, |
| { |
| "entropy": 0.9147795736789703, |
| "epoch": 4.872398190045249, |
| "grad_norm": 0.5961781144142151, |
| "learning_rate": 8.324163502778048e-05, |
| "loss": 0.02409369871020317, |
| "mean_token_accuracy": 0.9936271756887436, |
| "num_tokens": 11899916.0, |
| "step": 1349 |
| }, |
| { |
| "entropy": 0.8785306960344315, |
| "epoch": 4.876018099547512, |
| "grad_norm": 0.5444777607917786, |
| "learning_rate": 8.31160436030475e-05, |
| "loss": 0.09001006931066513, |
| "mean_token_accuracy": 0.9791886657476425, |
| "num_tokens": 11908640.0, |
| "step": 1350 |
| }, |
| { |
| "entropy": 0.8604621440172195, |
| "epoch": 4.879638009049774, |
| "grad_norm": 0.58026123046875, |
| "learning_rate": 8.299050963228133e-05, |
| "loss": 0.034584179520606995, |
| "mean_token_accuracy": 0.9857950210571289, |
| "num_tokens": 11917480.0, |
| "step": 1351 |
| }, |
| { |
| "entropy": 0.8894483149051666, |
| "epoch": 4.883257918552037, |
| "grad_norm": 0.6459085941314697, |
| "learning_rate": 8.286503338376186e-05, |
| "loss": 0.05245271325111389, |
| "mean_token_accuracy": 0.9862827807664871, |
| "num_tokens": 11925818.0, |
| "step": 1352 |
| }, |
| { |
| "entropy": 0.8818574398756027, |
| "epoch": 4.886877828054299, |
| "grad_norm": 0.36797812581062317, |
| "learning_rate": 8.273961512564566e-05, |
| "loss": 0.026651078835129738, |
| "mean_token_accuracy": 0.9902130663394928, |
| "num_tokens": 11934907.0, |
| "step": 1353 |
| }, |
| { |
| "entropy": 0.8988694995641708, |
| "epoch": 4.8904977375565615, |
| "grad_norm": 0.43273940682411194, |
| "learning_rate": 8.261425512596525e-05, |
| "loss": 0.03141481429338455, |
| "mean_token_accuracy": 0.9927571415901184, |
| "num_tokens": 11943693.0, |
| "step": 1354 |
| }, |
| { |
| "entropy": 0.9035229533910751, |
| "epoch": 4.894117647058824, |
| "grad_norm": 0.5842182636260986, |
| "learning_rate": 8.24889536526288e-05, |
| "loss": 0.06542620062828064, |
| "mean_token_accuracy": 0.9820240437984467, |
| "num_tokens": 11952674.0, |
| "step": 1355 |
| }, |
| { |
| "entropy": 0.882953867316246, |
| "epoch": 4.897737556561086, |
| "grad_norm": 0.6475455164909363, |
| "learning_rate": 8.236371097341925e-05, |
| "loss": 0.03121659904718399, |
| "mean_token_accuracy": 0.9919475317001343, |
| "num_tokens": 11961391.0, |
| "step": 1356 |
| }, |
| { |
| "entropy": 0.8515945225954056, |
| "epoch": 4.901357466063349, |
| "grad_norm": 0.4392819106578827, |
| "learning_rate": 8.223852735599402e-05, |
| "loss": 0.029448354616761208, |
| "mean_token_accuracy": 0.9892574101686478, |
| "num_tokens": 11970576.0, |
| "step": 1357 |
| }, |
| { |
| "entropy": 0.967382162809372, |
| "epoch": 4.904977375565611, |
| "grad_norm": 0.6789311170578003, |
| "learning_rate": 8.21134030678842e-05, |
| "loss": 0.03611960634589195, |
| "mean_token_accuracy": 0.9896086901426315, |
| "num_tokens": 11978784.0, |
| "step": 1358 |
| }, |
| { |
| "entropy": 0.9994252473115921, |
| "epoch": 4.908597285067874, |
| "grad_norm": 0.5801035761833191, |
| "learning_rate": 8.198833837649412e-05, |
| "loss": 0.03757525607943535, |
| "mean_token_accuracy": 0.9898461997509003, |
| "num_tokens": 11986702.0, |
| "step": 1359 |
| }, |
| { |
| "entropy": 0.951177105307579, |
| "epoch": 4.912217194570136, |
| "grad_norm": 0.5682723522186279, |
| "learning_rate": 8.186333354910076e-05, |
| "loss": 0.030282404273748398, |
| "mean_token_accuracy": 0.9920378625392914, |
| "num_tokens": 11995195.0, |
| "step": 1360 |
| }, |
| { |
| "entropy": 0.9082824736833572, |
| "epoch": 4.9158371040723985, |
| "grad_norm": 0.4542546272277832, |
| "learning_rate": 8.17383888528532e-05, |
| "loss": 0.03241301700472832, |
| "mean_token_accuracy": 0.9934439212083817, |
| "num_tokens": 12003988.0, |
| "step": 1361 |
| }, |
| { |
| "entropy": 0.8864389061927795, |
| "epoch": 4.919457013574661, |
| "grad_norm": 0.5183593034744263, |
| "learning_rate": 8.161350455477197e-05, |
| "loss": 0.03903008624911308, |
| "mean_token_accuracy": 0.9880173355340958, |
| "num_tokens": 12012979.0, |
| "step": 1362 |
| }, |
| { |
| "entropy": 0.9383509159088135, |
| "epoch": 4.923076923076923, |
| "grad_norm": 0.4837839603424072, |
| "learning_rate": 8.14886809217485e-05, |
| "loss": 0.041375696659088135, |
| "mean_token_accuracy": 0.9865945130586624, |
| "num_tokens": 12021717.0, |
| "step": 1363 |
| }, |
| { |
| "entropy": 0.9083155989646912, |
| "epoch": 4.926696832579186, |
| "grad_norm": 0.3800016939640045, |
| "learning_rate": 8.136391822054466e-05, |
| "loss": 0.019948210567235947, |
| "mean_token_accuracy": 0.991382360458374, |
| "num_tokens": 12030384.0, |
| "step": 1364 |
| }, |
| { |
| "entropy": 0.8765196651220322, |
| "epoch": 4.930316742081448, |
| "grad_norm": 0.5050578713417053, |
| "learning_rate": 8.123921671779193e-05, |
| "loss": 0.029966101050376892, |
| "mean_token_accuracy": 0.9913382828235626, |
| "num_tokens": 12039517.0, |
| "step": 1365 |
| }, |
| { |
| "entropy": 0.8959863632917404, |
| "epoch": 4.933936651583711, |
| "grad_norm": 0.5598185658454895, |
| "learning_rate": 8.111457667999123e-05, |
| "loss": 0.041590698063373566, |
| "mean_token_accuracy": 0.9848756641149521, |
| "num_tokens": 12048401.0, |
| "step": 1366 |
| }, |
| { |
| "entropy": 0.9154993742704391, |
| "epoch": 4.937556561085973, |
| "grad_norm": 0.7497878670692444, |
| "learning_rate": 8.098999837351193e-05, |
| "loss": 0.060688458383083344, |
| "mean_token_accuracy": 0.9851639270782471, |
| "num_tokens": 12057061.0, |
| "step": 1367 |
| }, |
| { |
| "entropy": 0.8572700619697571, |
| "epoch": 4.9411764705882355, |
| "grad_norm": 0.39258310198783875, |
| "learning_rate": 8.086548206459157e-05, |
| "loss": 0.03950543329119682, |
| "mean_token_accuracy": 0.9885773807764053, |
| "num_tokens": 12066074.0, |
| "step": 1368 |
| }, |
| { |
| "entropy": 0.9203635454177856, |
| "epoch": 4.944796380090498, |
| "grad_norm": 0.4106742739677429, |
| "learning_rate": 8.07410280193352e-05, |
| "loss": 0.028075195848941803, |
| "mean_token_accuracy": 0.9931609779596329, |
| "num_tokens": 12074134.0, |
| "step": 1369 |
| }, |
| { |
| "entropy": 0.9020792841911316, |
| "epoch": 4.94841628959276, |
| "grad_norm": 0.627805233001709, |
| "learning_rate": 8.061663650371478e-05, |
| "loss": 0.06281647086143494, |
| "mean_token_accuracy": 0.9771750569343567, |
| "num_tokens": 12083039.0, |
| "step": 1370 |
| }, |
| { |
| "entropy": 0.8852731287479401, |
| "epoch": 4.952036199095023, |
| "grad_norm": 0.657303512096405, |
| "learning_rate": 8.049230778356864e-05, |
| "loss": 0.029552282765507698, |
| "mean_token_accuracy": 0.991500198841095, |
| "num_tokens": 12091688.0, |
| "step": 1371 |
| }, |
| { |
| "entropy": 0.8907477855682373, |
| "epoch": 4.955656108597285, |
| "grad_norm": 0.5756560564041138, |
| "learning_rate": 8.036804212460085e-05, |
| "loss": 0.06554236263036728, |
| "mean_token_accuracy": 0.9861236363649368, |
| "num_tokens": 12101040.0, |
| "step": 1372 |
| }, |
| { |
| "entropy": 0.901018038392067, |
| "epoch": 4.959276018099548, |
| "grad_norm": 0.6850928664207458, |
| "learning_rate": 8.024383979238082e-05, |
| "loss": 0.028915666043758392, |
| "mean_token_accuracy": 0.9911025762557983, |
| "num_tokens": 12109748.0, |
| "step": 1373 |
| }, |
| { |
| "entropy": 0.8686677515506744, |
| "epoch": 4.96289592760181, |
| "grad_norm": 0.4919429123401642, |
| "learning_rate": 8.011970105234254e-05, |
| "loss": 0.04680994898080826, |
| "mean_token_accuracy": 0.9891356080770493, |
| "num_tokens": 12118627.0, |
| "step": 1374 |
| }, |
| { |
| "entropy": 0.9104600250720978, |
| "epoch": 4.9665158371040725, |
| "grad_norm": 0.4508710205554962, |
| "learning_rate": 7.999562616978418e-05, |
| "loss": 0.022277040407061577, |
| "mean_token_accuracy": 0.9955233782529831, |
| "num_tokens": 12126969.0, |
| "step": 1375 |
| }, |
| { |
| "entropy": 0.9048454165458679, |
| "epoch": 4.970135746606335, |
| "grad_norm": 0.5170649290084839, |
| "learning_rate": 7.987161540986733e-05, |
| "loss": 0.02180103212594986, |
| "mean_token_accuracy": 0.9945137053728104, |
| "num_tokens": 12135468.0, |
| "step": 1376 |
| }, |
| { |
| "entropy": 0.8686342835426331, |
| "epoch": 4.973755656108597, |
| "grad_norm": 0.8190595507621765, |
| "learning_rate": 7.974766903761663e-05, |
| "loss": 0.05652451515197754, |
| "mean_token_accuracy": 0.9849314540624619, |
| "num_tokens": 12144295.0, |
| "step": 1377 |
| }, |
| { |
| "entropy": 0.862045481801033, |
| "epoch": 4.97737556561086, |
| "grad_norm": 0.6442601084709167, |
| "learning_rate": 7.962378731791913e-05, |
| "loss": 0.0695543959736824, |
| "mean_token_accuracy": 0.9838263392448425, |
| "num_tokens": 12153149.0, |
| "step": 1378 |
| }, |
| { |
| "entropy": 0.8685291558504105, |
| "epoch": 4.980995475113122, |
| "grad_norm": 0.4403861463069916, |
| "learning_rate": 7.949997051552358e-05, |
| "loss": 0.02654072642326355, |
| "mean_token_accuracy": 0.9921073764562607, |
| "num_tokens": 12162005.0, |
| "step": 1379 |
| }, |
| { |
| "entropy": 0.8518020212650299, |
| "epoch": 4.984615384615385, |
| "grad_norm": 0.5914137959480286, |
| "learning_rate": 7.937621889504015e-05, |
| "loss": 0.028411295264959335, |
| "mean_token_accuracy": 0.9918225407600403, |
| "num_tokens": 12171200.0, |
| "step": 1380 |
| }, |
| { |
| "entropy": 0.8667004555463791, |
| "epoch": 4.988235294117647, |
| "grad_norm": 0.4217661917209625, |
| "learning_rate": 7.925253272093959e-05, |
| "loss": 0.03126171976327896, |
| "mean_token_accuracy": 0.990281954407692, |
| "num_tokens": 12180238.0, |
| "step": 1381 |
| }, |
| { |
| "entropy": 0.8675567209720612, |
| "epoch": 4.9918552036199095, |
| "grad_norm": 0.5172854065895081, |
| "learning_rate": 7.912891225755288e-05, |
| "loss": 0.04897421598434448, |
| "mean_token_accuracy": 0.9878519624471664, |
| "num_tokens": 12189686.0, |
| "step": 1382 |
| }, |
| { |
| "entropy": 0.8342727571725845, |
| "epoch": 4.995475113122172, |
| "grad_norm": 0.6539183259010315, |
| "learning_rate": 7.900535776907049e-05, |
| "loss": 0.11599453538656235, |
| "mean_token_accuracy": 0.9718467444181442, |
| "num_tokens": 12199160.0, |
| "step": 1383 |
| }, |
| { |
| "entropy": 0.8954996764659882, |
| "epoch": 4.999095022624434, |
| "grad_norm": 0.43926727771759033, |
| "learning_rate": 7.888186951954197e-05, |
| "loss": 0.022596832364797592, |
| "mean_token_accuracy": 0.9941199272871017, |
| "num_tokens": 12207819.0, |
| "step": 1384 |
| }, |
| { |
| "entropy": 0.7847009301185608, |
| "epoch": 5.0, |
| "grad_norm": 3.054668426513672, |
| "learning_rate": 7.875844777287526e-05, |
| "loss": 0.07187020778656006, |
| "mean_token_accuracy": 0.9810426831245422, |
| "num_tokens": 12208625.0, |
| "step": 1385 |
| }, |
| { |
| "epoch": 5.0, |
| "eval_entropy": 0.8913698419322812, |
| "eval_loss": 0.11407072842121124, |
| "eval_mean_token_accuracy": 0.9732460108229785, |
| "eval_num_tokens": 12208625.0, |
| "eval_runtime": 31.7957, |
| "eval_samples_per_second": 11.605, |
| "eval_steps_per_second": 3.868, |
| "step": 1385 |
| } |
| ], |
| "logging_steps": 1, |
| "max_steps": 2216, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 8, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 9.473572203031882e+18, |
| "train_batch_size": 3, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|