{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 5.0, "eval_steps": 220, "global_step": 2195, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0022811519817507843, "grad_norm": 36.5, "learning_rate": 0.0, "loss": 0.5696, "num_tokens": 101285.0, "step": 1 }, { "epoch": 0.004562303963501569, "grad_norm": 40.0, "learning_rate": 2.272727272727273e-08, "loss": 0.5528, "num_tokens": 201607.0, "step": 2 }, { "epoch": 0.006843455945252352, "grad_norm": 40.0, "learning_rate": 4.545454545454546e-08, "loss": 0.5961, "num_tokens": 302114.0, "step": 3 }, { "epoch": 0.009124607927003137, "grad_norm": 36.5, "learning_rate": 6.818181818181819e-08, "loss": 0.5565, "num_tokens": 402612.0, "step": 4 }, { "epoch": 0.01140575990875392, "grad_norm": 37.25, "learning_rate": 9.090909090909091e-08, "loss": 0.5805, "num_tokens": 503563.0, "step": 5 }, { "epoch": 0.013686911890504704, "grad_norm": 37.25, "learning_rate": 1.1363636363636364e-07, "loss": 0.5586, "num_tokens": 604014.0, "step": 6 }, { "epoch": 0.015968063872255488, "grad_norm": 37.75, "learning_rate": 1.3636363636363637e-07, "loss": 0.5878, "num_tokens": 704845.0, "step": 7 }, { "epoch": 0.018249215854006275, "grad_norm": 41.0, "learning_rate": 1.590909090909091e-07, "loss": 0.5713, "num_tokens": 806049.0, "step": 8 }, { "epoch": 0.020530367835757058, "grad_norm": 37.0, "learning_rate": 1.8181818181818183e-07, "loss": 0.5469, "num_tokens": 906816.0, "step": 9 }, { "epoch": 0.02281151981750784, "grad_norm": 37.5, "learning_rate": 2.0454545454545456e-07, "loss": 0.5636, "num_tokens": 1007886.0, "step": 10 }, { "epoch": 0.025092671799258625, "grad_norm": 38.25, "learning_rate": 2.2727272727272729e-07, "loss": 0.5809, "num_tokens": 1108733.0, "step": 11 }, { "epoch": 0.02737382378100941, "grad_norm": 37.5, "learning_rate": 2.5000000000000004e-07, "loss": 0.5589, "num_tokens": 1209136.0, "step": 12 }, { "epoch": 0.029654975762760195, "grad_norm": 36.5, "learning_rate": 2.7272727272727274e-07, "loss": 0.5435, "num_tokens": 1309751.0, "step": 13 }, { "epoch": 0.031936127744510975, "grad_norm": 36.5, "learning_rate": 2.954545454545455e-07, "loss": 0.5601, "num_tokens": 1410751.0, "step": 14 }, { "epoch": 0.03421727972626176, "grad_norm": 37.25, "learning_rate": 3.181818181818182e-07, "loss": 0.5769, "num_tokens": 1511131.0, "step": 15 }, { "epoch": 0.03649843170801255, "grad_norm": 38.75, "learning_rate": 3.409090909090909e-07, "loss": 0.589, "num_tokens": 1612153.0, "step": 16 }, { "epoch": 0.03877958368976333, "grad_norm": 36.75, "learning_rate": 3.6363636363636366e-07, "loss": 0.5731, "num_tokens": 1713177.0, "step": 17 }, { "epoch": 0.041060735671514116, "grad_norm": 38.25, "learning_rate": 3.8636363636363636e-07, "loss": 0.5888, "num_tokens": 1814504.0, "step": 18 }, { "epoch": 0.043341887653264896, "grad_norm": 37.5, "learning_rate": 4.090909090909091e-07, "loss": 0.578, "num_tokens": 1915997.0, "step": 19 }, { "epoch": 0.04562303963501568, "grad_norm": 39.25, "learning_rate": 4.3181818181818187e-07, "loss": 0.5852, "num_tokens": 2016362.0, "step": 20 }, { "epoch": 0.04790419161676647, "grad_norm": 36.5, "learning_rate": 4.5454545454545457e-07, "loss": 0.5353, "num_tokens": 2117417.0, "step": 21 }, { "epoch": 0.05018534359851725, "grad_norm": 37.75, "learning_rate": 4.772727272727274e-07, "loss": 0.536, "num_tokens": 2218119.0, "step": 22 }, { "epoch": 0.05246649558026804, "grad_norm": 37.5, "learning_rate": 5.000000000000001e-07, "loss": 0.5641, "num_tokens": 2318572.0, "step": 23 }, { "epoch": 0.05474764756201882, "grad_norm": 35.5, "learning_rate": 5.227272727272728e-07, "loss": 0.539, "num_tokens": 2418620.0, "step": 24 }, { "epoch": 0.057028799543769604, "grad_norm": 34.5, "learning_rate": 5.454545454545455e-07, "loss": 0.5561, "num_tokens": 2519614.0, "step": 25 }, { "epoch": 0.05930995152552039, "grad_norm": 34.75, "learning_rate": 5.681818181818182e-07, "loss": 0.5575, "num_tokens": 2620909.0, "step": 26 }, { "epoch": 0.06159110350727117, "grad_norm": 37.75, "learning_rate": 5.90909090909091e-07, "loss": 0.5742, "num_tokens": 2721111.0, "step": 27 }, { "epoch": 0.06387225548902195, "grad_norm": 36.0, "learning_rate": 6.136363636363637e-07, "loss": 0.5474, "num_tokens": 2821868.0, "step": 28 }, { "epoch": 0.06615340747077274, "grad_norm": 36.5, "learning_rate": 6.363636363636364e-07, "loss": 0.5386, "num_tokens": 2922514.0, "step": 29 }, { "epoch": 0.06843455945252352, "grad_norm": 36.25, "learning_rate": 6.590909090909091e-07, "loss": 0.5148, "num_tokens": 3022695.0, "step": 30 }, { "epoch": 0.07071571143427431, "grad_norm": 34.25, "learning_rate": 6.818181818181818e-07, "loss": 0.5032, "num_tokens": 3123488.0, "step": 31 }, { "epoch": 0.0729968634160251, "grad_norm": 35.25, "learning_rate": 7.045454545454545e-07, "loss": 0.4977, "num_tokens": 3224466.0, "step": 32 }, { "epoch": 0.07527801539777587, "grad_norm": 33.5, "learning_rate": 7.272727272727273e-07, "loss": 0.5188, "num_tokens": 3324991.0, "step": 33 }, { "epoch": 0.07755916737952666, "grad_norm": 33.75, "learning_rate": 7.5e-07, "loss": 0.5255, "num_tokens": 3426067.0, "step": 34 }, { "epoch": 0.07984031936127745, "grad_norm": 34.5, "learning_rate": 7.727272727272727e-07, "loss": 0.4756, "num_tokens": 3526369.0, "step": 35 }, { "epoch": 0.08212147134302823, "grad_norm": 32.75, "learning_rate": 7.954545454545455e-07, "loss": 0.5036, "num_tokens": 3626858.0, "step": 36 }, { "epoch": 0.08440262332477902, "grad_norm": 32.25, "learning_rate": 8.181818181818182e-07, "loss": 0.4714, "num_tokens": 3728224.0, "step": 37 }, { "epoch": 0.08668377530652979, "grad_norm": 34.0, "learning_rate": 8.409090909090909e-07, "loss": 0.4578, "num_tokens": 3828658.0, "step": 38 }, { "epoch": 0.08896492728828058, "grad_norm": 33.5, "learning_rate": 8.636363636363637e-07, "loss": 0.4786, "num_tokens": 3929692.0, "step": 39 }, { "epoch": 0.09124607927003137, "grad_norm": 31.125, "learning_rate": 8.863636363636364e-07, "loss": 0.4703, "num_tokens": 4030289.0, "step": 40 }, { "epoch": 0.09352723125178215, "grad_norm": 31.25, "learning_rate": 9.090909090909091e-07, "loss": 0.4342, "num_tokens": 4131800.0, "step": 41 }, { "epoch": 0.09580838323353294, "grad_norm": 33.0, "learning_rate": 9.31818181818182e-07, "loss": 0.4676, "num_tokens": 4231753.0, "step": 42 }, { "epoch": 0.09808953521528371, "grad_norm": 31.125, "learning_rate": 9.545454545454548e-07, "loss": 0.4731, "num_tokens": 4332691.0, "step": 43 }, { "epoch": 0.1003706871970345, "grad_norm": 30.125, "learning_rate": 9.772727272727275e-07, "loss": 0.4749, "num_tokens": 4433285.0, "step": 44 }, { "epoch": 0.10265183917878529, "grad_norm": 31.125, "learning_rate": 1.0000000000000002e-06, "loss": 0.4189, "num_tokens": 4533564.0, "step": 45 }, { "epoch": 0.10493299116053607, "grad_norm": 30.75, "learning_rate": 1.0227272727272729e-06, "loss": 0.4255, "num_tokens": 4634467.0, "step": 46 }, { "epoch": 0.10721414314228686, "grad_norm": 27.875, "learning_rate": 1.0454545454545456e-06, "loss": 0.4048, "num_tokens": 4735946.0, "step": 47 }, { "epoch": 0.10949529512403763, "grad_norm": 26.875, "learning_rate": 1.0681818181818183e-06, "loss": 0.4004, "num_tokens": 4836926.0, "step": 48 }, { "epoch": 0.11177644710578842, "grad_norm": 26.125, "learning_rate": 1.090909090909091e-06, "loss": 0.395, "num_tokens": 4937817.0, "step": 49 }, { "epoch": 0.11405759908753921, "grad_norm": 25.0, "learning_rate": 1.1136363636363637e-06, "loss": 0.3823, "num_tokens": 5038087.0, "step": 50 }, { "epoch": 0.11633875106929, "grad_norm": 25.625, "learning_rate": 1.1363636363636364e-06, "loss": 0.3824, "num_tokens": 5138715.0, "step": 51 }, { "epoch": 0.11861990305104078, "grad_norm": 21.75, "learning_rate": 1.159090909090909e-06, "loss": 0.3809, "num_tokens": 5239127.0, "step": 52 }, { "epoch": 0.12090105503279155, "grad_norm": 21.0, "learning_rate": 1.181818181818182e-06, "loss": 0.3506, "num_tokens": 5339970.0, "step": 53 }, { "epoch": 0.12318220701454234, "grad_norm": 21.125, "learning_rate": 1.2045454545454547e-06, "loss": 0.3289, "num_tokens": 5440685.0, "step": 54 }, { "epoch": 0.12546335899629313, "grad_norm": 19.875, "learning_rate": 1.2272727272727274e-06, "loss": 0.3043, "num_tokens": 5541092.0, "step": 55 }, { "epoch": 0.1277445109780439, "grad_norm": 18.125, "learning_rate": 1.25e-06, "loss": 0.3187, "num_tokens": 5641780.0, "step": 56 }, { "epoch": 0.1300256629597947, "grad_norm": 16.625, "learning_rate": 1.2727272727272728e-06, "loss": 0.3034, "num_tokens": 5743248.0, "step": 57 }, { "epoch": 0.13230681494154548, "grad_norm": 17.0, "learning_rate": 1.2954545454545455e-06, "loss": 0.3044, "num_tokens": 5843948.0, "step": 58 }, { "epoch": 0.13458796692329628, "grad_norm": 17.0, "learning_rate": 1.3181818181818182e-06, "loss": 0.2897, "num_tokens": 5944742.0, "step": 59 }, { "epoch": 0.13686911890504705, "grad_norm": 14.875, "learning_rate": 1.3409090909090911e-06, "loss": 0.3103, "num_tokens": 6045762.0, "step": 60 }, { "epoch": 0.13915027088679782, "grad_norm": 13.75, "learning_rate": 1.3636363636363636e-06, "loss": 0.2776, "num_tokens": 6146455.0, "step": 61 }, { "epoch": 0.14143142286854862, "grad_norm": 15.4375, "learning_rate": 1.3863636363636365e-06, "loss": 0.2703, "num_tokens": 6246888.0, "step": 62 }, { "epoch": 0.1437125748502994, "grad_norm": 13.0625, "learning_rate": 1.409090909090909e-06, "loss": 0.275, "num_tokens": 6347911.0, "step": 63 }, { "epoch": 0.1459937268320502, "grad_norm": 12.4375, "learning_rate": 1.431818181818182e-06, "loss": 0.2508, "num_tokens": 6448581.0, "step": 64 }, { "epoch": 0.14827487881380097, "grad_norm": 12.125, "learning_rate": 1.4545454545454546e-06, "loss": 0.2309, "num_tokens": 6549199.0, "step": 65 }, { "epoch": 0.15055603079555174, "grad_norm": 10.375, "learning_rate": 1.4772727272727275e-06, "loss": 0.1979, "num_tokens": 6649530.0, "step": 66 }, { "epoch": 0.15283718277730254, "grad_norm": 10.125, "learning_rate": 1.5e-06, "loss": 0.2, "num_tokens": 6749951.0, "step": 67 }, { "epoch": 0.15511833475905332, "grad_norm": 9.625, "learning_rate": 1.522727272727273e-06, "loss": 0.1818, "num_tokens": 6850634.0, "step": 68 }, { "epoch": 0.15739948674080412, "grad_norm": 10.0625, "learning_rate": 1.5454545454545454e-06, "loss": 0.2014, "num_tokens": 6951594.0, "step": 69 }, { "epoch": 0.1596806387225549, "grad_norm": 9.25, "learning_rate": 1.5681818181818184e-06, "loss": 0.2041, "num_tokens": 7053422.0, "step": 70 }, { "epoch": 0.16196179070430566, "grad_norm": 9.375, "learning_rate": 1.590909090909091e-06, "loss": 0.2102, "num_tokens": 7153876.0, "step": 71 }, { "epoch": 0.16424294268605646, "grad_norm": 7.75, "learning_rate": 1.613636363636364e-06, "loss": 0.1694, "num_tokens": 7254206.0, "step": 72 }, { "epoch": 0.16652409466780724, "grad_norm": 7.28125, "learning_rate": 1.6363636363636365e-06, "loss": 0.1702, "num_tokens": 7354662.0, "step": 73 }, { "epoch": 0.16880524664955804, "grad_norm": 7.25, "learning_rate": 1.6590909090909094e-06, "loss": 0.1685, "num_tokens": 7455451.0, "step": 74 }, { "epoch": 0.1710863986313088, "grad_norm": 6.6875, "learning_rate": 1.6818181818181819e-06, "loss": 0.1586, "num_tokens": 7555720.0, "step": 75 }, { "epoch": 0.17336755061305958, "grad_norm": 7.0625, "learning_rate": 1.7045454545454546e-06, "loss": 0.1686, "num_tokens": 7656908.0, "step": 76 }, { "epoch": 0.17564870259481039, "grad_norm": 7.25, "learning_rate": 1.7272727272727275e-06, "loss": 0.1425, "num_tokens": 7757340.0, "step": 77 }, { "epoch": 0.17792985457656116, "grad_norm": 5.90625, "learning_rate": 1.75e-06, "loss": 0.1398, "num_tokens": 7857928.0, "step": 78 }, { "epoch": 0.18021100655831196, "grad_norm": 5.53125, "learning_rate": 1.7727272727272729e-06, "loss": 0.1514, "num_tokens": 7958721.0, "step": 79 }, { "epoch": 0.18249215854006273, "grad_norm": 5.5625, "learning_rate": 1.7954545454545456e-06, "loss": 0.148, "num_tokens": 8059145.0, "step": 80 }, { "epoch": 0.1847733105218135, "grad_norm": 5.15625, "learning_rate": 1.8181818181818183e-06, "loss": 0.1366, "num_tokens": 8159854.0, "step": 81 }, { "epoch": 0.1870544625035643, "grad_norm": 4.5625, "learning_rate": 1.840909090909091e-06, "loss": 0.1364, "num_tokens": 8260872.0, "step": 82 }, { "epoch": 0.18933561448531508, "grad_norm": 4.90625, "learning_rate": 1.863636363636364e-06, "loss": 0.1278, "num_tokens": 8361165.0, "step": 83 }, { "epoch": 0.19161676646706588, "grad_norm": 4.59375, "learning_rate": 1.8863636363636364e-06, "loss": 0.1322, "num_tokens": 8461370.0, "step": 84 }, { "epoch": 0.19389791844881665, "grad_norm": 4.84375, "learning_rate": 1.9090909090909095e-06, "loss": 0.1265, "num_tokens": 8562027.0, "step": 85 }, { "epoch": 0.19617907043056743, "grad_norm": 4.75, "learning_rate": 1.931818181818182e-06, "loss": 0.1315, "num_tokens": 8663335.0, "step": 86 }, { "epoch": 0.19846022241231823, "grad_norm": 4.71875, "learning_rate": 1.954545454545455e-06, "loss": 0.116, "num_tokens": 8763558.0, "step": 87 }, { "epoch": 0.200741374394069, "grad_norm": 4.0625, "learning_rate": 1.977272727272727e-06, "loss": 0.1052, "num_tokens": 8863890.0, "step": 88 }, { "epoch": 0.2030225263758198, "grad_norm": 4.5625, "learning_rate": 2.0000000000000003e-06, "loss": 0.1082, "num_tokens": 8964149.0, "step": 89 }, { "epoch": 0.20530367835757057, "grad_norm": 4.5625, "learning_rate": 2.022727272727273e-06, "loss": 0.1153, "num_tokens": 9064854.0, "step": 90 }, { "epoch": 0.20758483033932135, "grad_norm": 4.1875, "learning_rate": 2.0454545454545457e-06, "loss": 0.1068, "num_tokens": 9164765.0, "step": 91 }, { "epoch": 0.20986598232107215, "grad_norm": 4.53125, "learning_rate": 2.0681818181818184e-06, "loss": 0.1079, "num_tokens": 9264748.0, "step": 92 }, { "epoch": 0.21214713430282292, "grad_norm": 3.75, "learning_rate": 2.090909090909091e-06, "loss": 0.1012, "num_tokens": 9365264.0, "step": 93 }, { "epoch": 0.21442828628457372, "grad_norm": 4.1875, "learning_rate": 2.113636363636364e-06, "loss": 0.1022, "num_tokens": 9465763.0, "step": 94 }, { "epoch": 0.2167094382663245, "grad_norm": 3.859375, "learning_rate": 2.1363636363636365e-06, "loss": 0.111, "num_tokens": 9566820.0, "step": 95 }, { "epoch": 0.21899059024807527, "grad_norm": 4.03125, "learning_rate": 2.1590909090909092e-06, "loss": 0.1038, "num_tokens": 9667052.0, "step": 96 }, { "epoch": 0.22127174222982607, "grad_norm": 4.5625, "learning_rate": 2.181818181818182e-06, "loss": 0.1119, "num_tokens": 9767445.0, "step": 97 }, { "epoch": 0.22355289421157684, "grad_norm": 3.90625, "learning_rate": 2.2045454545454547e-06, "loss": 0.1048, "num_tokens": 9868516.0, "step": 98 }, { "epoch": 0.22583404619332764, "grad_norm": 4.0625, "learning_rate": 2.2272727272727274e-06, "loss": 0.1046, "num_tokens": 9968971.0, "step": 99 }, { "epoch": 0.22811519817507842, "grad_norm": 4.21875, "learning_rate": 2.25e-06, "loss": 0.1088, "num_tokens": 10069985.0, "step": 100 }, { "epoch": 0.2303963501568292, "grad_norm": 3.96875, "learning_rate": 2.2727272727272728e-06, "loss": 0.1052, "num_tokens": 10170353.0, "step": 101 }, { "epoch": 0.23267750213858, "grad_norm": 3.765625, "learning_rate": 2.295454545454546e-06, "loss": 0.0935, "num_tokens": 10270557.0, "step": 102 }, { "epoch": 0.23495865412033076, "grad_norm": 4.0, "learning_rate": 2.318181818181818e-06, "loss": 0.0969, "num_tokens": 10371528.0, "step": 103 }, { "epoch": 0.23723980610208156, "grad_norm": 3.90625, "learning_rate": 2.3409090909090913e-06, "loss": 0.1073, "num_tokens": 10471809.0, "step": 104 }, { "epoch": 0.23952095808383234, "grad_norm": 3.5, "learning_rate": 2.363636363636364e-06, "loss": 0.0763, "num_tokens": 10572042.0, "step": 105 }, { "epoch": 0.2418021100655831, "grad_norm": 3.484375, "learning_rate": 2.3863636363636367e-06, "loss": 0.0983, "num_tokens": 10672650.0, "step": 106 }, { "epoch": 0.2440832620473339, "grad_norm": 3.0625, "learning_rate": 2.4090909090909094e-06, "loss": 0.0782, "num_tokens": 10773323.0, "step": 107 }, { "epoch": 0.24636441402908468, "grad_norm": 3.453125, "learning_rate": 2.431818181818182e-06, "loss": 0.0991, "num_tokens": 10873762.0, "step": 108 }, { "epoch": 0.24864556601083548, "grad_norm": 3.453125, "learning_rate": 2.454545454545455e-06, "loss": 0.0767, "num_tokens": 10974769.0, "step": 109 }, { "epoch": 0.25092671799258626, "grad_norm": 3.328125, "learning_rate": 2.4772727272727275e-06, "loss": 0.0783, "num_tokens": 11074557.0, "step": 110 }, { "epoch": 0.25320786997433703, "grad_norm": 3.109375, "learning_rate": 2.5e-06, "loss": 0.0804, "num_tokens": 11175194.0, "step": 111 }, { "epoch": 0.2554890219560878, "grad_norm": 3.71875, "learning_rate": 2.522727272727273e-06, "loss": 0.0807, "num_tokens": 11275590.0, "step": 112 }, { "epoch": 0.25777017393783863, "grad_norm": 2.671875, "learning_rate": 2.5454545454545456e-06, "loss": 0.0853, "num_tokens": 11376268.0, "step": 113 }, { "epoch": 0.2600513259195894, "grad_norm": 3.3125, "learning_rate": 2.5681818181818187e-06, "loss": 0.0888, "num_tokens": 11476858.0, "step": 114 }, { "epoch": 0.2623324779013402, "grad_norm": 2.90625, "learning_rate": 2.590909090909091e-06, "loss": 0.0811, "num_tokens": 11577687.0, "step": 115 }, { "epoch": 0.26461362988309095, "grad_norm": 2.921875, "learning_rate": 2.6136363636363637e-06, "loss": 0.0759, "num_tokens": 11677999.0, "step": 116 }, { "epoch": 0.2668947818648417, "grad_norm": 3.28125, "learning_rate": 2.6363636363636364e-06, "loss": 0.0853, "num_tokens": 11778318.0, "step": 117 }, { "epoch": 0.26917593384659255, "grad_norm": 2.84375, "learning_rate": 2.6590909090909095e-06, "loss": 0.0767, "num_tokens": 11879277.0, "step": 118 }, { "epoch": 0.2714570858283433, "grad_norm": 4.0625, "learning_rate": 2.6818181818181822e-06, "loss": 0.1015, "num_tokens": 11979762.0, "step": 119 }, { "epoch": 0.2737382378100941, "grad_norm": 3.578125, "learning_rate": 2.7045454545454545e-06, "loss": 0.1023, "num_tokens": 12081051.0, "step": 120 }, { "epoch": 0.27601938979184487, "grad_norm": 3.5625, "learning_rate": 2.7272727272727272e-06, "loss": 0.0864, "num_tokens": 12181662.0, "step": 121 }, { "epoch": 0.27830054177359564, "grad_norm": 2.96875, "learning_rate": 2.7500000000000004e-06, "loss": 0.0781, "num_tokens": 12282155.0, "step": 122 }, { "epoch": 0.2805816937553465, "grad_norm": 3.078125, "learning_rate": 2.772727272727273e-06, "loss": 0.0728, "num_tokens": 12382366.0, "step": 123 }, { "epoch": 0.28286284573709725, "grad_norm": 3.578125, "learning_rate": 2.7954545454545458e-06, "loss": 0.091, "num_tokens": 12483342.0, "step": 124 }, { "epoch": 0.285143997718848, "grad_norm": 3.453125, "learning_rate": 2.818181818181818e-06, "loss": 0.085, "num_tokens": 12583773.0, "step": 125 }, { "epoch": 0.2874251497005988, "grad_norm": 2.90625, "learning_rate": 2.8409090909090916e-06, "loss": 0.0819, "num_tokens": 12684522.0, "step": 126 }, { "epoch": 0.28970630168234957, "grad_norm": 3.171875, "learning_rate": 2.863636363636364e-06, "loss": 0.0775, "num_tokens": 12785187.0, "step": 127 }, { "epoch": 0.2919874536641004, "grad_norm": 3.40625, "learning_rate": 2.8863636363636366e-06, "loss": 0.0768, "num_tokens": 12885525.0, "step": 128 }, { "epoch": 0.29426860564585117, "grad_norm": 3.359375, "learning_rate": 2.9090909090909093e-06, "loss": 0.0827, "num_tokens": 12985890.0, "step": 129 }, { "epoch": 0.29654975762760194, "grad_norm": 2.96875, "learning_rate": 2.931818181818182e-06, "loss": 0.066, "num_tokens": 13086727.0, "step": 130 }, { "epoch": 0.2988309096093527, "grad_norm": 3.140625, "learning_rate": 2.954545454545455e-06, "loss": 0.0627, "num_tokens": 13187554.0, "step": 131 }, { "epoch": 0.3011120615911035, "grad_norm": 3.125, "learning_rate": 2.9772727272727274e-06, "loss": 0.0727, "num_tokens": 13288465.0, "step": 132 }, { "epoch": 0.3033932135728543, "grad_norm": 3.25, "learning_rate": 3e-06, "loss": 0.0716, "num_tokens": 13389136.0, "step": 133 }, { "epoch": 0.3056743655546051, "grad_norm": 2.796875, "learning_rate": 3.0227272727272728e-06, "loss": 0.0614, "num_tokens": 13489966.0, "step": 134 }, { "epoch": 0.30795551753635586, "grad_norm": 3.09375, "learning_rate": 3.045454545454546e-06, "loss": 0.0767, "num_tokens": 13590958.0, "step": 135 }, { "epoch": 0.31023666951810663, "grad_norm": 2.71875, "learning_rate": 3.0681818181818186e-06, "loss": 0.0642, "num_tokens": 13692055.0, "step": 136 }, { "epoch": 0.3125178214998574, "grad_norm": 2.953125, "learning_rate": 3.090909090909091e-06, "loss": 0.077, "num_tokens": 13792289.0, "step": 137 }, { "epoch": 0.31479897348160824, "grad_norm": 2.703125, "learning_rate": 3.1136363636363636e-06, "loss": 0.0631, "num_tokens": 13892642.0, "step": 138 }, { "epoch": 0.317080125463359, "grad_norm": 3.46875, "learning_rate": 3.1363636363636367e-06, "loss": 0.0757, "num_tokens": 13993149.0, "step": 139 }, { "epoch": 0.3193612774451098, "grad_norm": 2.796875, "learning_rate": 3.1590909090909094e-06, "loss": 0.0703, "num_tokens": 14093903.0, "step": 140 }, { "epoch": 0.32164242942686055, "grad_norm": 3.203125, "learning_rate": 3.181818181818182e-06, "loss": 0.08, "num_tokens": 14195177.0, "step": 141 }, { "epoch": 0.3239235814086113, "grad_norm": 3.0, "learning_rate": 3.204545454545455e-06, "loss": 0.0702, "num_tokens": 14296256.0, "step": 142 }, { "epoch": 0.32620473339036216, "grad_norm": 2.515625, "learning_rate": 3.227272727272728e-06, "loss": 0.0535, "num_tokens": 14396712.0, "step": 143 }, { "epoch": 0.32848588537211293, "grad_norm": 3.0, "learning_rate": 3.2500000000000002e-06, "loss": 0.0774, "num_tokens": 14496957.0, "step": 144 }, { "epoch": 0.3307670373538637, "grad_norm": 2.703125, "learning_rate": 3.272727272727273e-06, "loss": 0.0668, "num_tokens": 14597333.0, "step": 145 }, { "epoch": 0.3330481893356145, "grad_norm": 3.125, "learning_rate": 3.2954545454545456e-06, "loss": 0.074, "num_tokens": 14698290.0, "step": 146 }, { "epoch": 0.33532934131736525, "grad_norm": 2.609375, "learning_rate": 3.3181818181818188e-06, "loss": 0.0559, "num_tokens": 14798990.0, "step": 147 }, { "epoch": 0.3376104932991161, "grad_norm": 3.265625, "learning_rate": 3.3409090909090915e-06, "loss": 0.0709, "num_tokens": 14899843.0, "step": 148 }, { "epoch": 0.33989164528086685, "grad_norm": 2.890625, "learning_rate": 3.3636363636363637e-06, "loss": 0.0654, "num_tokens": 15000120.0, "step": 149 }, { "epoch": 0.3421727972626176, "grad_norm": 2.875, "learning_rate": 3.3863636363636364e-06, "loss": 0.0682, "num_tokens": 15100693.0, "step": 150 }, { "epoch": 0.3444539492443684, "grad_norm": 3.046875, "learning_rate": 3.409090909090909e-06, "loss": 0.0556, "num_tokens": 15200919.0, "step": 151 }, { "epoch": 0.34673510122611917, "grad_norm": 2.953125, "learning_rate": 3.4318181818181823e-06, "loss": 0.06, "num_tokens": 15300891.0, "step": 152 }, { "epoch": 0.34901625320787, "grad_norm": 2.96875, "learning_rate": 3.454545454545455e-06, "loss": 0.0558, "num_tokens": 15401421.0, "step": 153 }, { "epoch": 0.35129740518962077, "grad_norm": 3.359375, "learning_rate": 3.4772727272727277e-06, "loss": 0.0747, "num_tokens": 15501984.0, "step": 154 }, { "epoch": 0.35357855717137154, "grad_norm": 2.453125, "learning_rate": 3.5e-06, "loss": 0.0601, "num_tokens": 15602882.0, "step": 155 }, { "epoch": 0.3558597091531223, "grad_norm": 2.109375, "learning_rate": 3.522727272727273e-06, "loss": 0.0531, "num_tokens": 15703475.0, "step": 156 }, { "epoch": 0.3581408611348731, "grad_norm": 3.078125, "learning_rate": 3.5454545454545458e-06, "loss": 0.073, "num_tokens": 15804266.0, "step": 157 }, { "epoch": 0.3604220131166239, "grad_norm": 2.453125, "learning_rate": 3.5681818181818185e-06, "loss": 0.0574, "num_tokens": 15905447.0, "step": 158 }, { "epoch": 0.3627031650983747, "grad_norm": 2.59375, "learning_rate": 3.590909090909091e-06, "loss": 0.0653, "num_tokens": 16005738.0, "step": 159 }, { "epoch": 0.36498431708012546, "grad_norm": 2.921875, "learning_rate": 3.6136363636363643e-06, "loss": 0.0646, "num_tokens": 16106174.0, "step": 160 }, { "epoch": 0.36726546906187624, "grad_norm": 2.828125, "learning_rate": 3.6363636363636366e-06, "loss": 0.0694, "num_tokens": 16207059.0, "step": 161 }, { "epoch": 0.369546621043627, "grad_norm": 3.265625, "learning_rate": 3.6590909090909093e-06, "loss": 0.0579, "num_tokens": 16307597.0, "step": 162 }, { "epoch": 0.37182777302537784, "grad_norm": 2.609375, "learning_rate": 3.681818181818182e-06, "loss": 0.0616, "num_tokens": 16408647.0, "step": 163 }, { "epoch": 0.3741089250071286, "grad_norm": 2.875, "learning_rate": 3.704545454545455e-06, "loss": 0.0551, "num_tokens": 16509259.0, "step": 164 }, { "epoch": 0.3763900769888794, "grad_norm": 2.90625, "learning_rate": 3.727272727272728e-06, "loss": 0.0624, "num_tokens": 16609653.0, "step": 165 }, { "epoch": 0.37867122897063016, "grad_norm": 2.453125, "learning_rate": 3.7500000000000005e-06, "loss": 0.051, "num_tokens": 16710340.0, "step": 166 }, { "epoch": 0.38095238095238093, "grad_norm": 2.796875, "learning_rate": 3.772727272727273e-06, "loss": 0.0721, "num_tokens": 16811513.0, "step": 167 }, { "epoch": 0.38323353293413176, "grad_norm": 2.578125, "learning_rate": 3.7954545454545455e-06, "loss": 0.0612, "num_tokens": 16912152.0, "step": 168 }, { "epoch": 0.38551468491588253, "grad_norm": 2.484375, "learning_rate": 3.818181818181819e-06, "loss": 0.0574, "num_tokens": 17012767.0, "step": 169 }, { "epoch": 0.3877958368976333, "grad_norm": 2.171875, "learning_rate": 3.840909090909091e-06, "loss": 0.0492, "num_tokens": 17113183.0, "step": 170 }, { "epoch": 0.3900769888793841, "grad_norm": 2.734375, "learning_rate": 3.863636363636364e-06, "loss": 0.0682, "num_tokens": 17213491.0, "step": 171 }, { "epoch": 0.39235814086113485, "grad_norm": 3.078125, "learning_rate": 3.886363636363637e-06, "loss": 0.0689, "num_tokens": 17313729.0, "step": 172 }, { "epoch": 0.3946392928428857, "grad_norm": 2.375, "learning_rate": 3.90909090909091e-06, "loss": 0.0535, "num_tokens": 17414739.0, "step": 173 }, { "epoch": 0.39692044482463645, "grad_norm": 2.375, "learning_rate": 3.931818181818182e-06, "loss": 0.0488, "num_tokens": 17515315.0, "step": 174 }, { "epoch": 0.3992015968063872, "grad_norm": 2.328125, "learning_rate": 3.954545454545454e-06, "loss": 0.0565, "num_tokens": 17616297.0, "step": 175 }, { "epoch": 0.401482748788138, "grad_norm": 2.21875, "learning_rate": 3.9772727272727275e-06, "loss": 0.0449, "num_tokens": 17717098.0, "step": 176 }, { "epoch": 0.4037639007698888, "grad_norm": 2.125, "learning_rate": 4.000000000000001e-06, "loss": 0.0477, "num_tokens": 17817610.0, "step": 177 }, { "epoch": 0.4060450527516396, "grad_norm": 2.046875, "learning_rate": 4.022727272727273e-06, "loss": 0.055, "num_tokens": 17917749.0, "step": 178 }, { "epoch": 0.4083262047333904, "grad_norm": 2.625, "learning_rate": 4.045454545454546e-06, "loss": 0.0562, "num_tokens": 18018110.0, "step": 179 }, { "epoch": 0.41060735671514115, "grad_norm": 2.40625, "learning_rate": 4.068181818181818e-06, "loss": 0.0698, "num_tokens": 18119213.0, "step": 180 }, { "epoch": 0.4128885086968919, "grad_norm": 2.8125, "learning_rate": 4.0909090909090915e-06, "loss": 0.0628, "num_tokens": 18220203.0, "step": 181 }, { "epoch": 0.4151696606786427, "grad_norm": 2.28125, "learning_rate": 4.113636363636364e-06, "loss": 0.0554, "num_tokens": 18321312.0, "step": 182 }, { "epoch": 0.4174508126603935, "grad_norm": 2.8125, "learning_rate": 4.136363636363637e-06, "loss": 0.0594, "num_tokens": 18422143.0, "step": 183 }, { "epoch": 0.4197319646421443, "grad_norm": 1.921875, "learning_rate": 4.159090909090909e-06, "loss": 0.036, "num_tokens": 18522377.0, "step": 184 }, { "epoch": 0.42201311662389507, "grad_norm": 2.5625, "learning_rate": 4.181818181818182e-06, "loss": 0.0588, "num_tokens": 18623069.0, "step": 185 }, { "epoch": 0.42429426860564584, "grad_norm": 2.6875, "learning_rate": 4.204545454545455e-06, "loss": 0.0549, "num_tokens": 18724063.0, "step": 186 }, { "epoch": 0.4265754205873966, "grad_norm": 2.234375, "learning_rate": 4.227272727272728e-06, "loss": 0.0479, "num_tokens": 18824902.0, "step": 187 }, { "epoch": 0.42885657256914744, "grad_norm": 2.578125, "learning_rate": 4.25e-06, "loss": 0.0628, "num_tokens": 18924943.0, "step": 188 }, { "epoch": 0.4311377245508982, "grad_norm": 2.953125, "learning_rate": 4.272727272727273e-06, "loss": 0.0576, "num_tokens": 19025564.0, "step": 189 }, { "epoch": 0.433418876532649, "grad_norm": 2.59375, "learning_rate": 4.295454545454546e-06, "loss": 0.0565, "num_tokens": 19125719.0, "step": 190 }, { "epoch": 0.43570002851439976, "grad_norm": 2.296875, "learning_rate": 4.3181818181818185e-06, "loss": 0.0493, "num_tokens": 19226416.0, "step": 191 }, { "epoch": 0.43798118049615054, "grad_norm": 2.921875, "learning_rate": 4.340909090909091e-06, "loss": 0.0592, "num_tokens": 19327315.0, "step": 192 }, { "epoch": 0.44026233247790136, "grad_norm": 2.75, "learning_rate": 4.363636363636364e-06, "loss": 0.0492, "num_tokens": 19428135.0, "step": 193 }, { "epoch": 0.44254348445965214, "grad_norm": 2.53125, "learning_rate": 4.386363636363637e-06, "loss": 0.0578, "num_tokens": 19529079.0, "step": 194 }, { "epoch": 0.4448246364414029, "grad_norm": 2.046875, "learning_rate": 4.409090909090909e-06, "loss": 0.0367, "num_tokens": 19629954.0, "step": 195 }, { "epoch": 0.4471057884231537, "grad_norm": 2.40625, "learning_rate": 4.4318181818181824e-06, "loss": 0.0425, "num_tokens": 19730544.0, "step": 196 }, { "epoch": 0.44938694040490446, "grad_norm": 1.96875, "learning_rate": 4.454545454545455e-06, "loss": 0.0401, "num_tokens": 19830807.0, "step": 197 }, { "epoch": 0.4516680923866553, "grad_norm": 2.46875, "learning_rate": 4.477272727272728e-06, "loss": 0.0467, "num_tokens": 19931552.0, "step": 198 }, { "epoch": 0.45394924436840606, "grad_norm": 2.578125, "learning_rate": 4.5e-06, "loss": 0.0511, "num_tokens": 20032278.0, "step": 199 }, { "epoch": 0.45623039635015683, "grad_norm": 2.515625, "learning_rate": 4.522727272727273e-06, "loss": 0.0447, "num_tokens": 20133256.0, "step": 200 }, { "epoch": 0.4585115483319076, "grad_norm": 2.34375, "learning_rate": 4.5454545454545455e-06, "loss": 0.0428, "num_tokens": 20234472.0, "step": 201 }, { "epoch": 0.4607927003136584, "grad_norm": 2.84375, "learning_rate": 4.568181818181819e-06, "loss": 0.0639, "num_tokens": 20335382.0, "step": 202 }, { "epoch": 0.4630738522954092, "grad_norm": 2.40625, "learning_rate": 4.590909090909092e-06, "loss": 0.05, "num_tokens": 20435895.0, "step": 203 }, { "epoch": 0.46535500427716, "grad_norm": 1.640625, "learning_rate": 4.613636363636364e-06, "loss": 0.0313, "num_tokens": 20537050.0, "step": 204 }, { "epoch": 0.46763615625891075, "grad_norm": 2.375, "learning_rate": 4.636363636363636e-06, "loss": 0.0568, "num_tokens": 20637786.0, "step": 205 }, { "epoch": 0.4699173082406615, "grad_norm": 2.421875, "learning_rate": 4.6590909090909095e-06, "loss": 0.0453, "num_tokens": 20738950.0, "step": 206 }, { "epoch": 0.4721984602224123, "grad_norm": 2.46875, "learning_rate": 4.681818181818183e-06, "loss": 0.0431, "num_tokens": 20840335.0, "step": 207 }, { "epoch": 0.4744796122041631, "grad_norm": 3.546875, "learning_rate": 4.704545454545455e-06, "loss": 0.0723, "num_tokens": 20941421.0, "step": 208 }, { "epoch": 0.4767607641859139, "grad_norm": 2.109375, "learning_rate": 4.727272727272728e-06, "loss": 0.0515, "num_tokens": 21041614.0, "step": 209 }, { "epoch": 0.47904191616766467, "grad_norm": 2.65625, "learning_rate": 4.75e-06, "loss": 0.0496, "num_tokens": 21141997.0, "step": 210 }, { "epoch": 0.48132306814941545, "grad_norm": 2.171875, "learning_rate": 4.772727272727273e-06, "loss": 0.0422, "num_tokens": 21242234.0, "step": 211 }, { "epoch": 0.4836042201311662, "grad_norm": 2.171875, "learning_rate": 4.795454545454546e-06, "loss": 0.0508, "num_tokens": 21343302.0, "step": 212 }, { "epoch": 0.48588537211291705, "grad_norm": 2.390625, "learning_rate": 4.818181818181819e-06, "loss": 0.0516, "num_tokens": 21443574.0, "step": 213 }, { "epoch": 0.4881665240946678, "grad_norm": 2.25, "learning_rate": 4.840909090909091e-06, "loss": 0.0476, "num_tokens": 21543910.0, "step": 214 }, { "epoch": 0.4904476760764186, "grad_norm": 2.6875, "learning_rate": 4.863636363636364e-06, "loss": 0.0609, "num_tokens": 21644799.0, "step": 215 }, { "epoch": 0.49272882805816937, "grad_norm": 2.546875, "learning_rate": 4.8863636363636365e-06, "loss": 0.0545, "num_tokens": 21745692.0, "step": 216 }, { "epoch": 0.49500998003992014, "grad_norm": 2.328125, "learning_rate": 4.90909090909091e-06, "loss": 0.0447, "num_tokens": 21846361.0, "step": 217 }, { "epoch": 0.49729113202167097, "grad_norm": 2.5625, "learning_rate": 4.931818181818182e-06, "loss": 0.0515, "num_tokens": 21947004.0, "step": 218 }, { "epoch": 0.49957228400342174, "grad_norm": 1.796875, "learning_rate": 4.954545454545455e-06, "loss": 0.0373, "num_tokens": 22048460.0, "step": 219 }, { "epoch": 0.5018534359851725, "grad_norm": 2.421875, "learning_rate": 4.977272727272728e-06, "loss": 0.0379, "num_tokens": 22148542.0, "step": 220 }, { "epoch": 0.5018534359851725, "eval_loss": 0.048636823892593384, "eval_num_tokens": 22148542.0, "eval_runtime": 53.9984, "eval_samples_per_second": 155.282, "eval_steps_per_second": 4.871, "step": 220 }, { "epoch": 0.5041345879669233, "grad_norm": 2.25, "learning_rate": 5e-06, "loss": 0.0496, "num_tokens": 22249449.0, "step": 221 }, { "epoch": 0.5064157399486741, "grad_norm": 1.609375, "learning_rate": 4.999999290524132e-06, "loss": 0.0396, "num_tokens": 22350029.0, "step": 222 }, { "epoch": 0.5086968919304249, "grad_norm": 3.0, "learning_rate": 4.999997162096932e-06, "loss": 0.0593, "num_tokens": 22450986.0, "step": 223 }, { "epoch": 0.5109780439121756, "grad_norm": 1.9375, "learning_rate": 4.999993614719606e-06, "loss": 0.0404, "num_tokens": 22551784.0, "step": 224 }, { "epoch": 0.5132591958939264, "grad_norm": 2.359375, "learning_rate": 4.999988648394169e-06, "loss": 0.0516, "num_tokens": 22652952.0, "step": 225 }, { "epoch": 0.5155403478756773, "grad_norm": 2.4375, "learning_rate": 4.99998226312344e-06, "loss": 0.0504, "num_tokens": 22753234.0, "step": 226 }, { "epoch": 0.517821499857428, "grad_norm": 2.15625, "learning_rate": 4.999974458911041e-06, "loss": 0.0506, "num_tokens": 22853703.0, "step": 227 }, { "epoch": 0.5201026518391788, "grad_norm": 2.609375, "learning_rate": 4.999965235761404e-06, "loss": 0.0583, "num_tokens": 22954187.0, "step": 228 }, { "epoch": 0.5223838038209295, "grad_norm": 2.234375, "learning_rate": 4.999954593679762e-06, "loss": 0.0467, "num_tokens": 23054807.0, "step": 229 }, { "epoch": 0.5246649558026804, "grad_norm": 2.28125, "learning_rate": 4.999942532672157e-06, "loss": 0.0476, "num_tokens": 23154881.0, "step": 230 }, { "epoch": 0.5269461077844312, "grad_norm": 1.8203125, "learning_rate": 4.999929052745434e-06, "loss": 0.0416, "num_tokens": 23255765.0, "step": 231 }, { "epoch": 0.5292272597661819, "grad_norm": 2.375, "learning_rate": 4.999914153907243e-06, "loss": 0.0567, "num_tokens": 23356437.0, "step": 232 }, { "epoch": 0.5315084117479327, "grad_norm": 2.46875, "learning_rate": 4.999897836166041e-06, "loss": 0.0434, "num_tokens": 23456721.0, "step": 233 }, { "epoch": 0.5337895637296834, "grad_norm": 1.9375, "learning_rate": 4.999880099531089e-06, "loss": 0.0451, "num_tokens": 23556486.0, "step": 234 }, { "epoch": 0.5360707157114343, "grad_norm": 2.09375, "learning_rate": 4.999860944012455e-06, "loss": 0.0435, "num_tokens": 23657126.0, "step": 235 }, { "epoch": 0.5383518676931851, "grad_norm": 2.140625, "learning_rate": 4.999840369621011e-06, "loss": 0.0514, "num_tokens": 23757915.0, "step": 236 }, { "epoch": 0.5406330196749358, "grad_norm": 2.4375, "learning_rate": 4.999818376368435e-06, "loss": 0.0429, "num_tokens": 23858335.0, "step": 237 }, { "epoch": 0.5429141716566867, "grad_norm": 2.09375, "learning_rate": 4.999794964267208e-06, "loss": 0.0457, "num_tokens": 23959227.0, "step": 238 }, { "epoch": 0.5451953236384374, "grad_norm": 2.109375, "learning_rate": 4.9997701333306215e-06, "loss": 0.0376, "num_tokens": 24060350.0, "step": 239 }, { "epoch": 0.5474764756201882, "grad_norm": 2.296875, "learning_rate": 4.999743883572766e-06, "loss": 0.0505, "num_tokens": 24161180.0, "step": 240 }, { "epoch": 0.549757627601939, "grad_norm": 1.8984375, "learning_rate": 4.999716215008542e-06, "loss": 0.0422, "num_tokens": 24261362.0, "step": 241 }, { "epoch": 0.5520387795836897, "grad_norm": 2.0625, "learning_rate": 4.999687127653654e-06, "loss": 0.0523, "num_tokens": 24362031.0, "step": 242 }, { "epoch": 0.5543199315654406, "grad_norm": 2.5625, "learning_rate": 4.99965662152461e-06, "loss": 0.0449, "num_tokens": 24463224.0, "step": 243 }, { "epoch": 0.5566010835471913, "grad_norm": 2.359375, "learning_rate": 4.999624696638725e-06, "loss": 0.05, "num_tokens": 24563880.0, "step": 244 }, { "epoch": 0.5588822355289421, "grad_norm": 2.171875, "learning_rate": 4.999591353014119e-06, "loss": 0.0382, "num_tokens": 24664511.0, "step": 245 }, { "epoch": 0.561163387510693, "grad_norm": 2.359375, "learning_rate": 4.999556590669718e-06, "loss": 0.0517, "num_tokens": 24765536.0, "step": 246 }, { "epoch": 0.5634445394924437, "grad_norm": 2.3125, "learning_rate": 4.999520409625253e-06, "loss": 0.0564, "num_tokens": 24866073.0, "step": 247 }, { "epoch": 0.5657256914741945, "grad_norm": 2.046875, "learning_rate": 4.999482809901257e-06, "loss": 0.0428, "num_tokens": 24966387.0, "step": 248 }, { "epoch": 0.5680068434559452, "grad_norm": 1.828125, "learning_rate": 4.999443791519074e-06, "loss": 0.0424, "num_tokens": 25066985.0, "step": 249 }, { "epoch": 0.570287995437696, "grad_norm": 2.65625, "learning_rate": 4.999403354500847e-06, "loss": 0.0368, "num_tokens": 25167553.0, "step": 250 }, { "epoch": 0.5725691474194469, "grad_norm": 1.9140625, "learning_rate": 4.99936149886953e-06, "loss": 0.0446, "num_tokens": 25268297.0, "step": 251 }, { "epoch": 0.5748502994011976, "grad_norm": 2.34375, "learning_rate": 4.999318224648878e-06, "loss": 0.0488, "num_tokens": 25369145.0, "step": 252 }, { "epoch": 0.5771314513829484, "grad_norm": 1.546875, "learning_rate": 4.999273531863453e-06, "loss": 0.0304, "num_tokens": 25470087.0, "step": 253 }, { "epoch": 0.5794126033646991, "grad_norm": 3.125, "learning_rate": 4.999227420538622e-06, "loss": 0.0568, "num_tokens": 25570962.0, "step": 254 }, { "epoch": 0.58169375534645, "grad_norm": 2.640625, "learning_rate": 4.999179890700555e-06, "loss": 0.0477, "num_tokens": 25671229.0, "step": 255 }, { "epoch": 0.5839749073282008, "grad_norm": 2.078125, "learning_rate": 4.999130942376232e-06, "loss": 0.0344, "num_tokens": 25771644.0, "step": 256 }, { "epoch": 0.5862560593099515, "grad_norm": 1.609375, "learning_rate": 4.999080575593433e-06, "loss": 0.0329, "num_tokens": 25871852.0, "step": 257 }, { "epoch": 0.5885372112917023, "grad_norm": 2.34375, "learning_rate": 4.999028790380746e-06, "loss": 0.0441, "num_tokens": 25972347.0, "step": 258 }, { "epoch": 0.590818363273453, "grad_norm": 1.6953125, "learning_rate": 4.9989755867675635e-06, "loss": 0.044, "num_tokens": 26072976.0, "step": 259 }, { "epoch": 0.5930995152552039, "grad_norm": 2.40625, "learning_rate": 4.998920964784082e-06, "loss": 0.0485, "num_tokens": 26173562.0, "step": 260 }, { "epoch": 0.5953806672369547, "grad_norm": 2.09375, "learning_rate": 4.998864924461305e-06, "loss": 0.0378, "num_tokens": 26273891.0, "step": 261 }, { "epoch": 0.5976618192187054, "grad_norm": 1.9921875, "learning_rate": 4.998807465831039e-06, "loss": 0.0439, "num_tokens": 26374548.0, "step": 262 }, { "epoch": 0.5999429712004563, "grad_norm": 2.265625, "learning_rate": 4.998748588925897e-06, "loss": 0.0361, "num_tokens": 26475115.0, "step": 263 }, { "epoch": 0.602224123182207, "grad_norm": 2.15625, "learning_rate": 4.998688293779297e-06, "loss": 0.0363, "num_tokens": 26576116.0, "step": 264 }, { "epoch": 0.6045052751639578, "grad_norm": 2.421875, "learning_rate": 4.998626580425459e-06, "loss": 0.0456, "num_tokens": 26676748.0, "step": 265 }, { "epoch": 0.6067864271457086, "grad_norm": 2.390625, "learning_rate": 4.998563448899413e-06, "loss": 0.0371, "num_tokens": 26777577.0, "step": 266 }, { "epoch": 0.6090675791274593, "grad_norm": 2.453125, "learning_rate": 4.998498899236989e-06, "loss": 0.0451, "num_tokens": 26878493.0, "step": 267 }, { "epoch": 0.6113487311092102, "grad_norm": 2.09375, "learning_rate": 4.998432931474825e-06, "loss": 0.037, "num_tokens": 26979212.0, "step": 268 }, { "epoch": 0.6136298830909609, "grad_norm": 2.1875, "learning_rate": 4.998365545650365e-06, "loss": 0.0411, "num_tokens": 27080688.0, "step": 269 }, { "epoch": 0.6159110350727117, "grad_norm": 2.390625, "learning_rate": 4.998296741801852e-06, "loss": 0.0443, "num_tokens": 27180818.0, "step": 270 }, { "epoch": 0.6181921870544625, "grad_norm": 2.1875, "learning_rate": 4.998226519968341e-06, "loss": 0.0402, "num_tokens": 27281292.0, "step": 271 }, { "epoch": 0.6204733390362133, "grad_norm": 2.53125, "learning_rate": 4.998154880189688e-06, "loss": 0.0373, "num_tokens": 27381864.0, "step": 272 }, { "epoch": 0.6227544910179641, "grad_norm": 1.8671875, "learning_rate": 4.998081822506552e-06, "loss": 0.0364, "num_tokens": 27482742.0, "step": 273 }, { "epoch": 0.6250356429997148, "grad_norm": 2.109375, "learning_rate": 4.998007346960402e-06, "loss": 0.0377, "num_tokens": 27583853.0, "step": 274 }, { "epoch": 0.6273167949814656, "grad_norm": 2.25, "learning_rate": 4.997931453593507e-06, "loss": 0.049, "num_tokens": 27684599.0, "step": 275 }, { "epoch": 0.6295979469632165, "grad_norm": 2.484375, "learning_rate": 4.997854142448944e-06, "loss": 0.0365, "num_tokens": 27785238.0, "step": 276 }, { "epoch": 0.6318790989449672, "grad_norm": 1.6875, "learning_rate": 4.997775413570593e-06, "loss": 0.0356, "num_tokens": 27886337.0, "step": 277 }, { "epoch": 0.634160250926718, "grad_norm": 1.78125, "learning_rate": 4.997695267003139e-06, "loss": 0.0303, "num_tokens": 27986804.0, "step": 278 }, { "epoch": 0.6364414029084687, "grad_norm": 2.125, "learning_rate": 4.99761370279207e-06, "loss": 0.047, "num_tokens": 28086747.0, "step": 279 }, { "epoch": 0.6387225548902196, "grad_norm": 2.375, "learning_rate": 4.997530720983682e-06, "loss": 0.0453, "num_tokens": 28187671.0, "step": 280 }, { "epoch": 0.6410037068719704, "grad_norm": 2.4375, "learning_rate": 4.9974463216250735e-06, "loss": 0.0399, "num_tokens": 28288228.0, "step": 281 }, { "epoch": 0.6432848588537211, "grad_norm": 1.9296875, "learning_rate": 4.997360504764148e-06, "loss": 0.038, "num_tokens": 28388657.0, "step": 282 }, { "epoch": 0.6455660108354719, "grad_norm": 1.9921875, "learning_rate": 4.997273270449614e-06, "loss": 0.0418, "num_tokens": 28489391.0, "step": 283 }, { "epoch": 0.6478471628172227, "grad_norm": 2.6875, "learning_rate": 4.997184618730983e-06, "loss": 0.0475, "num_tokens": 28590533.0, "step": 284 }, { "epoch": 0.6501283147989735, "grad_norm": 2.25, "learning_rate": 4.997094549658572e-06, "loss": 0.0468, "num_tokens": 28692156.0, "step": 285 }, { "epoch": 0.6524094667807243, "grad_norm": 2.3125, "learning_rate": 4.997003063283503e-06, "loss": 0.0618, "num_tokens": 28792475.0, "step": 286 }, { "epoch": 0.654690618762475, "grad_norm": 2.1875, "learning_rate": 4.996910159657703e-06, "loss": 0.0446, "num_tokens": 28893561.0, "step": 287 }, { "epoch": 0.6569717707442259, "grad_norm": 2.21875, "learning_rate": 4.996815838833899e-06, "loss": 0.0492, "num_tokens": 28994905.0, "step": 288 }, { "epoch": 0.6592529227259766, "grad_norm": 1.984375, "learning_rate": 4.99672010086563e-06, "loss": 0.045, "num_tokens": 29096265.0, "step": 289 }, { "epoch": 0.6615340747077274, "grad_norm": 2.171875, "learning_rate": 4.996622945807231e-06, "loss": 0.0371, "num_tokens": 29197130.0, "step": 290 }, { "epoch": 0.6638152266894782, "grad_norm": 2.296875, "learning_rate": 4.996524373713848e-06, "loss": 0.0445, "num_tokens": 29298773.0, "step": 291 }, { "epoch": 0.666096378671229, "grad_norm": 2.015625, "learning_rate": 4.996424384641428e-06, "loss": 0.0467, "num_tokens": 29399291.0, "step": 292 }, { "epoch": 0.6683775306529798, "grad_norm": 1.953125, "learning_rate": 4.996322978646722e-06, "loss": 0.0422, "num_tokens": 29499902.0, "step": 293 }, { "epoch": 0.6706586826347305, "grad_norm": 1.5859375, "learning_rate": 4.996220155787287e-06, "loss": 0.0425, "num_tokens": 29600583.0, "step": 294 }, { "epoch": 0.6729398346164813, "grad_norm": 2.265625, "learning_rate": 4.996115916121483e-06, "loss": 0.0518, "num_tokens": 29701414.0, "step": 295 }, { "epoch": 0.6752209865982322, "grad_norm": 2.296875, "learning_rate": 4.996010259708475e-06, "loss": 0.046, "num_tokens": 29802022.0, "step": 296 }, { "epoch": 0.6775021385799829, "grad_norm": 1.8671875, "learning_rate": 4.99590318660823e-06, "loss": 0.0396, "num_tokens": 29903038.0, "step": 297 }, { "epoch": 0.6797832905617337, "grad_norm": 2.09375, "learning_rate": 4.9957946968815215e-06, "loss": 0.0482, "num_tokens": 30003598.0, "step": 298 }, { "epoch": 0.6820644425434844, "grad_norm": 3.015625, "learning_rate": 4.995684790589927e-06, "loss": 0.048, "num_tokens": 30104284.0, "step": 299 }, { "epoch": 0.6843455945252352, "grad_norm": 2.015625, "learning_rate": 4.995573467795825e-06, "loss": 0.0461, "num_tokens": 30205280.0, "step": 300 }, { "epoch": 0.6866267465069861, "grad_norm": 2.125, "learning_rate": 4.995460728562403e-06, "loss": 0.0417, "num_tokens": 30305852.0, "step": 301 }, { "epoch": 0.6889078984887368, "grad_norm": 2.1875, "learning_rate": 4.9953465729536475e-06, "loss": 0.0372, "num_tokens": 30406530.0, "step": 302 }, { "epoch": 0.6911890504704876, "grad_norm": 2.0625, "learning_rate": 4.995231001034352e-06, "loss": 0.0387, "num_tokens": 30506704.0, "step": 303 }, { "epoch": 0.6934702024522383, "grad_norm": 2.03125, "learning_rate": 4.995114012870112e-06, "loss": 0.0421, "num_tokens": 30607290.0, "step": 304 }, { "epoch": 0.6957513544339892, "grad_norm": 1.8359375, "learning_rate": 4.99499560852733e-06, "loss": 0.0403, "num_tokens": 30708125.0, "step": 305 }, { "epoch": 0.69803250641574, "grad_norm": 1.9140625, "learning_rate": 4.994875788073207e-06, "loss": 0.0464, "num_tokens": 30809463.0, "step": 306 }, { "epoch": 0.7003136583974907, "grad_norm": 1.8828125, "learning_rate": 4.994754551575752e-06, "loss": 0.043, "num_tokens": 30909656.0, "step": 307 }, { "epoch": 0.7025948103792415, "grad_norm": 2.21875, "learning_rate": 4.994631899103777e-06, "loss": 0.0418, "num_tokens": 31010341.0, "step": 308 }, { "epoch": 0.7048759623609923, "grad_norm": 2.015625, "learning_rate": 4.9945078307268974e-06, "loss": 0.0448, "num_tokens": 31110616.0, "step": 309 }, { "epoch": 0.7071571143427431, "grad_norm": 2.15625, "learning_rate": 4.994382346515531e-06, "loss": 0.0376, "num_tokens": 31211733.0, "step": 310 }, { "epoch": 0.7094382663244939, "grad_norm": 1.6953125, "learning_rate": 4.9942554465409e-06, "loss": 0.0363, "num_tokens": 31311720.0, "step": 311 }, { "epoch": 0.7117194183062446, "grad_norm": 1.7421875, "learning_rate": 4.994127130875032e-06, "loss": 0.0321, "num_tokens": 31412447.0, "step": 312 }, { "epoch": 0.7140005702879955, "grad_norm": 1.6953125, "learning_rate": 4.993997399590755e-06, "loss": 0.0387, "num_tokens": 31513043.0, "step": 313 }, { "epoch": 0.7162817222697462, "grad_norm": 1.9140625, "learning_rate": 4.993866252761702e-06, "loss": 0.0367, "num_tokens": 31613631.0, "step": 314 }, { "epoch": 0.718562874251497, "grad_norm": 1.6328125, "learning_rate": 4.993733690462311e-06, "loss": 0.0312, "num_tokens": 31714267.0, "step": 315 }, { "epoch": 0.7208440262332478, "grad_norm": 1.71875, "learning_rate": 4.99359971276782e-06, "loss": 0.0306, "num_tokens": 31814913.0, "step": 316 }, { "epoch": 0.7231251782149986, "grad_norm": 1.7421875, "learning_rate": 4.993464319754273e-06, "loss": 0.0355, "num_tokens": 31915309.0, "step": 317 }, { "epoch": 0.7254063301967494, "grad_norm": 1.71875, "learning_rate": 4.993327511498516e-06, "loss": 0.0336, "num_tokens": 32016080.0, "step": 318 }, { "epoch": 0.7276874821785001, "grad_norm": 2.203125, "learning_rate": 4.9931892880782e-06, "loss": 0.045, "num_tokens": 32116768.0, "step": 319 }, { "epoch": 0.7299686341602509, "grad_norm": 1.8984375, "learning_rate": 4.993049649571775e-06, "loss": 0.0378, "num_tokens": 32217414.0, "step": 320 }, { "epoch": 0.7322497861420018, "grad_norm": 2.65625, "learning_rate": 4.992908596058501e-06, "loss": 0.0438, "num_tokens": 32318344.0, "step": 321 }, { "epoch": 0.7345309381237525, "grad_norm": 2.5, "learning_rate": 4.992766127618434e-06, "loss": 0.0541, "num_tokens": 32418834.0, "step": 322 }, { "epoch": 0.7368120901055033, "grad_norm": 2.09375, "learning_rate": 4.992622244332439e-06, "loss": 0.0399, "num_tokens": 32519078.0, "step": 323 }, { "epoch": 0.739093242087254, "grad_norm": 1.7109375, "learning_rate": 4.992476946282179e-06, "loss": 0.0342, "num_tokens": 32619956.0, "step": 324 }, { "epoch": 0.7413743940690048, "grad_norm": 2.4375, "learning_rate": 4.992330233550124e-06, "loss": 0.0396, "num_tokens": 32720692.0, "step": 325 }, { "epoch": 0.7436555460507557, "grad_norm": 2.34375, "learning_rate": 4.9921821062195445e-06, "loss": 0.0441, "num_tokens": 32820791.0, "step": 326 }, { "epoch": 0.7459366980325064, "grad_norm": 2.296875, "learning_rate": 4.9920325643745145e-06, "loss": 0.0417, "num_tokens": 32921456.0, "step": 327 }, { "epoch": 0.7482178500142572, "grad_norm": 2.03125, "learning_rate": 4.991881608099912e-06, "loss": 0.0437, "num_tokens": 33022463.0, "step": 328 }, { "epoch": 0.7504990019960079, "grad_norm": 1.9765625, "learning_rate": 4.991729237481417e-06, "loss": 0.0399, "num_tokens": 33123059.0, "step": 329 }, { "epoch": 0.7527801539777588, "grad_norm": 2.125, "learning_rate": 4.991575452605511e-06, "loss": 0.0416, "num_tokens": 33223853.0, "step": 330 }, { "epoch": 0.7550613059595096, "grad_norm": 1.9609375, "learning_rate": 4.9914202535594795e-06, "loss": 0.0373, "num_tokens": 33324317.0, "step": 331 }, { "epoch": 0.7573424579412603, "grad_norm": 1.9375, "learning_rate": 4.991263640431411e-06, "loss": 0.044, "num_tokens": 33425147.0, "step": 332 }, { "epoch": 0.7596236099230111, "grad_norm": 1.953125, "learning_rate": 4.9911056133101965e-06, "loss": 0.0395, "num_tokens": 33525598.0, "step": 333 }, { "epoch": 0.7619047619047619, "grad_norm": 1.8984375, "learning_rate": 4.990946172285528e-06, "loss": 0.0407, "num_tokens": 33625990.0, "step": 334 }, { "epoch": 0.7641859138865127, "grad_norm": 1.7578125, "learning_rate": 4.990785317447901e-06, "loss": 0.0353, "num_tokens": 33726776.0, "step": 335 }, { "epoch": 0.7664670658682635, "grad_norm": 1.84375, "learning_rate": 4.990623048888615e-06, "loss": 0.0453, "num_tokens": 33827105.0, "step": 336 }, { "epoch": 0.7687482178500142, "grad_norm": 2.328125, "learning_rate": 4.9904593666997704e-06, "loss": 0.0528, "num_tokens": 33927484.0, "step": 337 }, { "epoch": 0.7710293698317651, "grad_norm": 1.515625, "learning_rate": 4.990294270974268e-06, "loss": 0.0354, "num_tokens": 34028104.0, "step": 338 }, { "epoch": 0.7733105218135158, "grad_norm": 1.765625, "learning_rate": 4.990127761805816e-06, "loss": 0.043, "num_tokens": 34128011.0, "step": 339 }, { "epoch": 0.7755916737952666, "grad_norm": 1.90625, "learning_rate": 4.989959839288919e-06, "loss": 0.0449, "num_tokens": 34229369.0, "step": 340 }, { "epoch": 0.7778728257770174, "grad_norm": 1.71875, "learning_rate": 4.989790503518888e-06, "loss": 0.0432, "num_tokens": 34329983.0, "step": 341 }, { "epoch": 0.7801539777587682, "grad_norm": 1.921875, "learning_rate": 4.9896197545918345e-06, "loss": 0.0426, "num_tokens": 34431063.0, "step": 342 }, { "epoch": 0.782435129740519, "grad_norm": 1.859375, "learning_rate": 4.989447592604673e-06, "loss": 0.0432, "num_tokens": 34532191.0, "step": 343 }, { "epoch": 0.7847162817222697, "grad_norm": 1.546875, "learning_rate": 4.989274017655117e-06, "loss": 0.0285, "num_tokens": 34632916.0, "step": 344 }, { "epoch": 0.7869974337040205, "grad_norm": 1.5859375, "learning_rate": 4.989099029841687e-06, "loss": 0.0324, "num_tokens": 34733165.0, "step": 345 }, { "epoch": 0.7892785856857714, "grad_norm": 1.9375, "learning_rate": 4.988922629263701e-06, "loss": 0.0465, "num_tokens": 34833504.0, "step": 346 }, { "epoch": 0.7915597376675221, "grad_norm": 1.84375, "learning_rate": 4.988744816021283e-06, "loss": 0.0476, "num_tokens": 34934694.0, "step": 347 }, { "epoch": 0.7938408896492729, "grad_norm": 1.75, "learning_rate": 4.988565590215352e-06, "loss": 0.0377, "num_tokens": 35035290.0, "step": 348 }, { "epoch": 0.7961220416310236, "grad_norm": 1.8828125, "learning_rate": 4.9883849519476364e-06, "loss": 0.0386, "num_tokens": 35136223.0, "step": 349 }, { "epoch": 0.7984031936127745, "grad_norm": 1.859375, "learning_rate": 4.988202901320663e-06, "loss": 0.0373, "num_tokens": 35237077.0, "step": 350 }, { "epoch": 0.8006843455945253, "grad_norm": 2.046875, "learning_rate": 4.988019438437759e-06, "loss": 0.039, "num_tokens": 35337193.0, "step": 351 }, { "epoch": 0.802965497576276, "grad_norm": 1.9609375, "learning_rate": 4.987834563403055e-06, "loss": 0.0355, "num_tokens": 35438594.0, "step": 352 }, { "epoch": 0.8052466495580268, "grad_norm": 1.9921875, "learning_rate": 4.987648276321482e-06, "loss": 0.0396, "num_tokens": 35538727.0, "step": 353 }, { "epoch": 0.8075278015397775, "grad_norm": 2.46875, "learning_rate": 4.987460577298774e-06, "loss": 0.0377, "num_tokens": 35639702.0, "step": 354 }, { "epoch": 0.8098089535215284, "grad_norm": 2.015625, "learning_rate": 4.9872714664414635e-06, "loss": 0.0445, "num_tokens": 35739556.0, "step": 355 }, { "epoch": 0.8120901055032792, "grad_norm": 2.140625, "learning_rate": 4.987080943856887e-06, "loss": 0.0469, "num_tokens": 35839739.0, "step": 356 }, { "epoch": 0.8143712574850299, "grad_norm": 1.828125, "learning_rate": 4.986889009653183e-06, "loss": 0.0404, "num_tokens": 35940710.0, "step": 357 }, { "epoch": 0.8166524094667807, "grad_norm": 1.6640625, "learning_rate": 4.986695663939288e-06, "loss": 0.0407, "num_tokens": 36041476.0, "step": 358 }, { "epoch": 0.8189335614485315, "grad_norm": 1.796875, "learning_rate": 4.986500906824942e-06, "loss": 0.0347, "num_tokens": 36141871.0, "step": 359 }, { "epoch": 0.8212147134302823, "grad_norm": 1.78125, "learning_rate": 4.986304738420684e-06, "loss": 0.0367, "num_tokens": 36242522.0, "step": 360 }, { "epoch": 0.8234958654120331, "grad_norm": 1.5859375, "learning_rate": 4.9861071588378565e-06, "loss": 0.0343, "num_tokens": 36342957.0, "step": 361 }, { "epoch": 0.8257770173937838, "grad_norm": 1.9453125, "learning_rate": 4.985908168188602e-06, "loss": 0.04, "num_tokens": 36443352.0, "step": 362 }, { "epoch": 0.8280581693755347, "grad_norm": 1.84375, "learning_rate": 4.985707766585865e-06, "loss": 0.0384, "num_tokens": 36543626.0, "step": 363 }, { "epoch": 0.8303393213572854, "grad_norm": 1.9296875, "learning_rate": 4.985505954143387e-06, "loss": 0.0387, "num_tokens": 36644323.0, "step": 364 }, { "epoch": 0.8326204733390362, "grad_norm": 1.859375, "learning_rate": 4.985302730975713e-06, "loss": 0.0363, "num_tokens": 36745065.0, "step": 365 }, { "epoch": 0.834901625320787, "grad_norm": 2.109375, "learning_rate": 4.9850980971981914e-06, "loss": 0.0426, "num_tokens": 36845536.0, "step": 366 }, { "epoch": 0.8371827773025378, "grad_norm": 1.9453125, "learning_rate": 4.984892052926965e-06, "loss": 0.0439, "num_tokens": 36946067.0, "step": 367 }, { "epoch": 0.8394639292842886, "grad_norm": 1.796875, "learning_rate": 4.984684598278982e-06, "loss": 0.0339, "num_tokens": 37046261.0, "step": 368 }, { "epoch": 0.8417450812660393, "grad_norm": 2.375, "learning_rate": 4.984475733371991e-06, "loss": 0.0431, "num_tokens": 37147194.0, "step": 369 }, { "epoch": 0.8440262332477901, "grad_norm": 2.28125, "learning_rate": 4.984265458324538e-06, "loss": 0.0427, "num_tokens": 37247448.0, "step": 370 }, { "epoch": 0.846307385229541, "grad_norm": 1.8125, "learning_rate": 4.984053773255971e-06, "loss": 0.0353, "num_tokens": 37348461.0, "step": 371 }, { "epoch": 0.8485885372112917, "grad_norm": 1.9375, "learning_rate": 4.9838406782864394e-06, "loss": 0.0358, "num_tokens": 37449188.0, "step": 372 }, { "epoch": 0.8508696891930425, "grad_norm": 1.6875, "learning_rate": 4.983626173536891e-06, "loss": 0.0339, "num_tokens": 37549907.0, "step": 373 }, { "epoch": 0.8531508411747932, "grad_norm": 2.015625, "learning_rate": 4.983410259129075e-06, "loss": 0.0395, "num_tokens": 37650454.0, "step": 374 }, { "epoch": 0.8554319931565441, "grad_norm": 1.609375, "learning_rate": 4.983192935185539e-06, "loss": 0.033, "num_tokens": 37750727.0, "step": 375 }, { "epoch": 0.8577131451382949, "grad_norm": 1.8203125, "learning_rate": 4.9829742018296335e-06, "loss": 0.0436, "num_tokens": 37851450.0, "step": 376 }, { "epoch": 0.8599942971200456, "grad_norm": 1.796875, "learning_rate": 4.9827540591855064e-06, "loss": 0.0356, "num_tokens": 37951862.0, "step": 377 }, { "epoch": 0.8622754491017964, "grad_norm": 2.453125, "learning_rate": 4.9825325073781075e-06, "loss": 0.0453, "num_tokens": 38052370.0, "step": 378 }, { "epoch": 0.8645566010835471, "grad_norm": 1.4921875, "learning_rate": 4.982309546533184e-06, "loss": 0.0336, "num_tokens": 38152621.0, "step": 379 }, { "epoch": 0.866837753065298, "grad_norm": 2.421875, "learning_rate": 4.982085176777285e-06, "loss": 0.0483, "num_tokens": 38253356.0, "step": 380 }, { "epoch": 0.8691189050470488, "grad_norm": 2.28125, "learning_rate": 4.981859398237758e-06, "loss": 0.045, "num_tokens": 38354079.0, "step": 381 }, { "epoch": 0.8714000570287995, "grad_norm": 2.09375, "learning_rate": 4.9816322110427505e-06, "loss": 0.0438, "num_tokens": 38455123.0, "step": 382 }, { "epoch": 0.8736812090105504, "grad_norm": 2.140625, "learning_rate": 4.98140361532121e-06, "loss": 0.0416, "num_tokens": 38555501.0, "step": 383 }, { "epoch": 0.8759623609923011, "grad_norm": 2.421875, "learning_rate": 4.981173611202883e-06, "loss": 0.0376, "num_tokens": 38655899.0, "step": 384 }, { "epoch": 0.8782435129740519, "grad_norm": 2.015625, "learning_rate": 4.980942198818315e-06, "loss": 0.0417, "num_tokens": 38756807.0, "step": 385 }, { "epoch": 0.8805246649558027, "grad_norm": 1.828125, "learning_rate": 4.980709378298851e-06, "loss": 0.0371, "num_tokens": 38857410.0, "step": 386 }, { "epoch": 0.8828058169375534, "grad_norm": 1.96875, "learning_rate": 4.980475149776636e-06, "loss": 0.0418, "num_tokens": 38957860.0, "step": 387 }, { "epoch": 0.8850869689193043, "grad_norm": 1.953125, "learning_rate": 4.980239513384614e-06, "loss": 0.0388, "num_tokens": 39058508.0, "step": 388 }, { "epoch": 0.887368120901055, "grad_norm": 1.5859375, "learning_rate": 4.980002469256527e-06, "loss": 0.0313, "num_tokens": 39159073.0, "step": 389 }, { "epoch": 0.8896492728828058, "grad_norm": 1.5390625, "learning_rate": 4.979764017526916e-06, "loss": 0.0363, "num_tokens": 39259771.0, "step": 390 }, { "epoch": 0.8919304248645566, "grad_norm": 1.6015625, "learning_rate": 4.979524158331123e-06, "loss": 0.0325, "num_tokens": 39360106.0, "step": 391 }, { "epoch": 0.8942115768463074, "grad_norm": 2.0625, "learning_rate": 4.979282891805287e-06, "loss": 0.0476, "num_tokens": 39461052.0, "step": 392 }, { "epoch": 0.8964927288280582, "grad_norm": 1.9921875, "learning_rate": 4.979040218086345e-06, "loss": 0.0389, "num_tokens": 39561366.0, "step": 393 }, { "epoch": 0.8987738808098089, "grad_norm": 1.703125, "learning_rate": 4.978796137312036e-06, "loss": 0.0315, "num_tokens": 39661903.0, "step": 394 }, { "epoch": 0.9010550327915597, "grad_norm": 2.203125, "learning_rate": 4.978550649620894e-06, "loss": 0.0438, "num_tokens": 39762806.0, "step": 395 }, { "epoch": 0.9033361847733106, "grad_norm": 1.84375, "learning_rate": 4.978303755152254e-06, "loss": 0.0396, "num_tokens": 39863922.0, "step": 396 }, { "epoch": 0.9056173367550613, "grad_norm": 1.3125, "learning_rate": 4.978055454046247e-06, "loss": 0.0299, "num_tokens": 39964466.0, "step": 397 }, { "epoch": 0.9078984887368121, "grad_norm": 2.375, "learning_rate": 4.977805746443807e-06, "loss": 0.0437, "num_tokens": 40064992.0, "step": 398 }, { "epoch": 0.9101796407185628, "grad_norm": 1.5, "learning_rate": 4.9775546324866596e-06, "loss": 0.0384, "num_tokens": 40165562.0, "step": 399 }, { "epoch": 0.9124607927003137, "grad_norm": 2.390625, "learning_rate": 4.977302112317334e-06, "loss": 0.042, "num_tokens": 40266007.0, "step": 400 }, { "epoch": 0.9147419446820645, "grad_norm": 2.046875, "learning_rate": 4.977048186079155e-06, "loss": 0.0401, "num_tokens": 40366433.0, "step": 401 }, { "epoch": 0.9170230966638152, "grad_norm": 2.421875, "learning_rate": 4.976792853916248e-06, "loss": 0.0373, "num_tokens": 40466592.0, "step": 402 }, { "epoch": 0.919304248645566, "grad_norm": 2.171875, "learning_rate": 4.9765361159735335e-06, "loss": 0.0425, "num_tokens": 40567577.0, "step": 403 }, { "epoch": 0.9215854006273168, "grad_norm": 1.828125, "learning_rate": 4.97627797239673e-06, "loss": 0.038, "num_tokens": 40667938.0, "step": 404 }, { "epoch": 0.9238665526090676, "grad_norm": 2.03125, "learning_rate": 4.976018423332357e-06, "loss": 0.0485, "num_tokens": 40768433.0, "step": 405 }, { "epoch": 0.9261477045908184, "grad_norm": 2.03125, "learning_rate": 4.975757468927727e-06, "loss": 0.0312, "num_tokens": 40869308.0, "step": 406 }, { "epoch": 0.9284288565725691, "grad_norm": 1.75, "learning_rate": 4.975495109330954e-06, "loss": 0.0398, "num_tokens": 40969710.0, "step": 407 }, { "epoch": 0.93071000855432, "grad_norm": 1.484375, "learning_rate": 4.97523134469095e-06, "loss": 0.0398, "num_tokens": 41070326.0, "step": 408 }, { "epoch": 0.9329911605360707, "grad_norm": 1.6171875, "learning_rate": 4.97496617515742e-06, "loss": 0.0374, "num_tokens": 41171174.0, "step": 409 }, { "epoch": 0.9352723125178215, "grad_norm": 2.375, "learning_rate": 4.974699600880869e-06, "loss": 0.0404, "num_tokens": 41272451.0, "step": 410 }, { "epoch": 0.9375534644995723, "grad_norm": 1.796875, "learning_rate": 4.974431622012601e-06, "loss": 0.0448, "num_tokens": 41372799.0, "step": 411 }, { "epoch": 0.939834616481323, "grad_norm": 1.5859375, "learning_rate": 4.974162238704716e-06, "loss": 0.0312, "num_tokens": 41473489.0, "step": 412 }, { "epoch": 0.9421157684630739, "grad_norm": 1.703125, "learning_rate": 4.973891451110109e-06, "loss": 0.0362, "num_tokens": 41574224.0, "step": 413 }, { "epoch": 0.9443969204448246, "grad_norm": 2.390625, "learning_rate": 4.973619259382475e-06, "loss": 0.0368, "num_tokens": 41675157.0, "step": 414 }, { "epoch": 0.9466780724265754, "grad_norm": 1.7578125, "learning_rate": 4.973345663676305e-06, "loss": 0.0328, "num_tokens": 41775412.0, "step": 415 }, { "epoch": 0.9489592244083263, "grad_norm": 1.59375, "learning_rate": 4.973070664146885e-06, "loss": 0.0388, "num_tokens": 41876148.0, "step": 416 }, { "epoch": 0.951240376390077, "grad_norm": 1.5625, "learning_rate": 4.972794260950301e-06, "loss": 0.0331, "num_tokens": 41976805.0, "step": 417 }, { "epoch": 0.9535215283718278, "grad_norm": 1.75, "learning_rate": 4.972516454243433e-06, "loss": 0.0375, "num_tokens": 42078040.0, "step": 418 }, { "epoch": 0.9558026803535785, "grad_norm": 1.6875, "learning_rate": 4.972237244183961e-06, "loss": 0.0307, "num_tokens": 42178989.0, "step": 419 }, { "epoch": 0.9580838323353293, "grad_norm": 1.5, "learning_rate": 4.971956630930356e-06, "loss": 0.0322, "num_tokens": 42280024.0, "step": 420 }, { "epoch": 0.9603649843170802, "grad_norm": 1.875, "learning_rate": 4.971674614641891e-06, "loss": 0.0365, "num_tokens": 42380838.0, "step": 421 }, { "epoch": 0.9626461362988309, "grad_norm": 2.0625, "learning_rate": 4.971391195478632e-06, "loss": 0.0393, "num_tokens": 42481410.0, "step": 422 }, { "epoch": 0.9649272882805817, "grad_norm": 2.140625, "learning_rate": 4.971106373601443e-06, "loss": 0.0483, "num_tokens": 42581616.0, "step": 423 }, { "epoch": 0.9672084402623324, "grad_norm": 1.8046875, "learning_rate": 4.9708201491719825e-06, "loss": 0.0382, "num_tokens": 42682033.0, "step": 424 }, { "epoch": 0.9694895922440833, "grad_norm": 1.84375, "learning_rate": 4.9705325223527055e-06, "loss": 0.0356, "num_tokens": 42782771.0, "step": 425 }, { "epoch": 0.9717707442258341, "grad_norm": 2.015625, "learning_rate": 4.970243493306865e-06, "loss": 0.038, "num_tokens": 42883341.0, "step": 426 }, { "epoch": 0.9740518962075848, "grad_norm": 1.9765625, "learning_rate": 4.969953062198508e-06, "loss": 0.0393, "num_tokens": 42984042.0, "step": 427 }, { "epoch": 0.9763330481893356, "grad_norm": 1.9453125, "learning_rate": 4.969661229192477e-06, "loss": 0.0456, "num_tokens": 43084650.0, "step": 428 }, { "epoch": 0.9786142001710864, "grad_norm": 1.59375, "learning_rate": 4.969367994454412e-06, "loss": 0.0359, "num_tokens": 43185739.0, "step": 429 }, { "epoch": 0.9808953521528372, "grad_norm": 1.8359375, "learning_rate": 4.9690733581507445e-06, "loss": 0.0406, "num_tokens": 43286225.0, "step": 430 }, { "epoch": 0.983176504134588, "grad_norm": 2.109375, "learning_rate": 4.968777320448707e-06, "loss": 0.0373, "num_tokens": 43387106.0, "step": 431 }, { "epoch": 0.9854576561163387, "grad_norm": 1.65625, "learning_rate": 4.9684798815163235e-06, "loss": 0.0361, "num_tokens": 43487524.0, "step": 432 }, { "epoch": 0.9877388080980896, "grad_norm": 1.9609375, "learning_rate": 4.968181041522416e-06, "loss": 0.0419, "num_tokens": 43588282.0, "step": 433 }, { "epoch": 0.9900199600798403, "grad_norm": 2.28125, "learning_rate": 4.967880800636599e-06, "loss": 0.0355, "num_tokens": 43688801.0, "step": 434 }, { "epoch": 0.9923011120615911, "grad_norm": 1.9140625, "learning_rate": 4.967579159029284e-06, "loss": 0.042, "num_tokens": 43789244.0, "step": 435 }, { "epoch": 0.9945822640433419, "grad_norm": 1.6640625, "learning_rate": 4.9672761168716766e-06, "loss": 0.0332, "num_tokens": 43889771.0, "step": 436 }, { "epoch": 0.9968634160250927, "grad_norm": 1.828125, "learning_rate": 4.966971674335778e-06, "loss": 0.0328, "num_tokens": 43990196.0, "step": 437 }, { "epoch": 0.9991445680068435, "grad_norm": 1.421875, "learning_rate": 4.966665831594383e-06, "loss": 0.033, "num_tokens": 44091025.0, "step": 438 }, { "epoch": 1.0, "grad_norm": 2.34375, "learning_rate": 4.966358588821084e-06, "loss": 0.0288, "num_tokens": 44127872.0, "step": 439 }, { "epoch": 1.0022811519817507, "grad_norm": 1.515625, "learning_rate": 4.966049946190265e-06, "loss": 0.026, "num_tokens": 44228467.0, "step": 440 }, { "epoch": 1.0022811519817507, "eval_loss": 0.03784199804067612, "eval_num_tokens": 44228467.0, "eval_runtime": 52.6965, "eval_samples_per_second": 159.119, "eval_steps_per_second": 4.991, "step": 440 }, { "epoch": 1.0045623039635017, "grad_norm": 1.9921875, "learning_rate": 4.9657399038771045e-06, "loss": 0.0253, "num_tokens": 44328960.0, "step": 441 }, { "epoch": 1.0068434559452524, "grad_norm": 2.046875, "learning_rate": 4.965428462057578e-06, "loss": 0.0374, "num_tokens": 44429568.0, "step": 442 }, { "epoch": 1.009124607927003, "grad_norm": 1.53125, "learning_rate": 4.965115620908453e-06, "loss": 0.0311, "num_tokens": 44530318.0, "step": 443 }, { "epoch": 1.011405759908754, "grad_norm": 1.7421875, "learning_rate": 4.964801380607293e-06, "loss": 0.0354, "num_tokens": 44630923.0, "step": 444 }, { "epoch": 1.0136869118905047, "grad_norm": 1.828125, "learning_rate": 4.964485741332453e-06, "loss": 0.034, "num_tokens": 44732494.0, "step": 445 }, { "epoch": 1.0159680638722555, "grad_norm": 1.5234375, "learning_rate": 4.964168703263086e-06, "loss": 0.0338, "num_tokens": 44833725.0, "step": 446 }, { "epoch": 1.0182492158540062, "grad_norm": 2.34375, "learning_rate": 4.963850266579136e-06, "loss": 0.0443, "num_tokens": 44934784.0, "step": 447 }, { "epoch": 1.0205303678357571, "grad_norm": 1.6875, "learning_rate": 4.963530431461341e-06, "loss": 0.0343, "num_tokens": 45035167.0, "step": 448 }, { "epoch": 1.0228115198175078, "grad_norm": 1.7890625, "learning_rate": 4.963209198091232e-06, "loss": 0.0361, "num_tokens": 45136334.0, "step": 449 }, { "epoch": 1.0250926717992586, "grad_norm": 1.46875, "learning_rate": 4.962886566651138e-06, "loss": 0.0274, "num_tokens": 45237506.0, "step": 450 }, { "epoch": 1.0273738237810095, "grad_norm": 2.171875, "learning_rate": 4.962562537324176e-06, "loss": 0.0298, "num_tokens": 45338558.0, "step": 451 }, { "epoch": 1.0296549757627602, "grad_norm": 1.9453125, "learning_rate": 4.96223711029426e-06, "loss": 0.0381, "num_tokens": 45438984.0, "step": 452 }, { "epoch": 1.031936127744511, "grad_norm": 1.8984375, "learning_rate": 4.961910285746094e-06, "loss": 0.0317, "num_tokens": 45539439.0, "step": 453 }, { "epoch": 1.0342172797262617, "grad_norm": 1.578125, "learning_rate": 4.9615820638651805e-06, "loss": 0.0297, "num_tokens": 45639687.0, "step": 454 }, { "epoch": 1.0364984317080126, "grad_norm": 2.125, "learning_rate": 4.961252444837809e-06, "loss": 0.0342, "num_tokens": 45739933.0, "step": 455 }, { "epoch": 1.0387795836897633, "grad_norm": 2.140625, "learning_rate": 4.960921428851066e-06, "loss": 0.0325, "num_tokens": 45840109.0, "step": 456 }, { "epoch": 1.041060735671514, "grad_norm": 1.7578125, "learning_rate": 4.960589016092832e-06, "loss": 0.0316, "num_tokens": 45941478.0, "step": 457 }, { "epoch": 1.043341887653265, "grad_norm": 2.015625, "learning_rate": 4.960255206751774e-06, "loss": 0.0417, "num_tokens": 46042010.0, "step": 458 }, { "epoch": 1.0456230396350157, "grad_norm": 1.796875, "learning_rate": 4.959920001017358e-06, "loss": 0.0326, "num_tokens": 46142753.0, "step": 459 }, { "epoch": 1.0479041916167664, "grad_norm": 1.7578125, "learning_rate": 4.95958339907984e-06, "loss": 0.0377, "num_tokens": 46243640.0, "step": 460 }, { "epoch": 1.0501853435985173, "grad_norm": 2.265625, "learning_rate": 4.959245401130269e-06, "loss": 0.0355, "num_tokens": 46343774.0, "step": 461 }, { "epoch": 1.052466495580268, "grad_norm": 1.515625, "learning_rate": 4.958906007360487e-06, "loss": 0.0279, "num_tokens": 46444438.0, "step": 462 }, { "epoch": 1.0547476475620188, "grad_norm": 1.9921875, "learning_rate": 4.958565217963125e-06, "loss": 0.0359, "num_tokens": 46545284.0, "step": 463 }, { "epoch": 1.0570287995437697, "grad_norm": 2.46875, "learning_rate": 4.95822303313161e-06, "loss": 0.0361, "num_tokens": 46645880.0, "step": 464 }, { "epoch": 1.0593099515255204, "grad_norm": 2.53125, "learning_rate": 4.957879453060159e-06, "loss": 0.0349, "num_tokens": 46747153.0, "step": 465 }, { "epoch": 1.0615911035072711, "grad_norm": 1.9453125, "learning_rate": 4.957534477943782e-06, "loss": 0.0381, "num_tokens": 46848211.0, "step": 466 }, { "epoch": 1.0638722554890219, "grad_norm": 1.625, "learning_rate": 4.957188107978279e-06, "loss": 0.029, "num_tokens": 46949218.0, "step": 467 }, { "epoch": 1.0661534074707728, "grad_norm": 1.796875, "learning_rate": 4.956840343360245e-06, "loss": 0.0361, "num_tokens": 47050303.0, "step": 468 }, { "epoch": 1.0684345594525235, "grad_norm": 1.8828125, "learning_rate": 4.956491184287062e-06, "loss": 0.0312, "num_tokens": 47151182.0, "step": 469 }, { "epoch": 1.0707157114342742, "grad_norm": 1.8515625, "learning_rate": 4.9561406309569084e-06, "loss": 0.0314, "num_tokens": 47251998.0, "step": 470 }, { "epoch": 1.0729968634160252, "grad_norm": 1.7265625, "learning_rate": 4.955788683568749e-06, "loss": 0.0334, "num_tokens": 47352502.0, "step": 471 }, { "epoch": 1.075278015397776, "grad_norm": 1.8359375, "learning_rate": 4.955435342322345e-06, "loss": 0.0292, "num_tokens": 47452898.0, "step": 472 }, { "epoch": 1.0775591673795266, "grad_norm": 1.515625, "learning_rate": 4.955080607418244e-06, "loss": 0.0336, "num_tokens": 47553583.0, "step": 473 }, { "epoch": 1.0798403193612773, "grad_norm": 1.5625, "learning_rate": 4.954724479057788e-06, "loss": 0.0295, "num_tokens": 47653471.0, "step": 474 }, { "epoch": 1.0821214713430283, "grad_norm": 1.9765625, "learning_rate": 4.954366957443107e-06, "loss": 0.0415, "num_tokens": 47753553.0, "step": 475 }, { "epoch": 1.084402623324779, "grad_norm": 1.7421875, "learning_rate": 4.954008042777125e-06, "loss": 0.0287, "num_tokens": 47854154.0, "step": 476 }, { "epoch": 1.0866837753065297, "grad_norm": 1.9453125, "learning_rate": 4.953647735263555e-06, "loss": 0.0356, "num_tokens": 47954833.0, "step": 477 }, { "epoch": 1.0889649272882806, "grad_norm": 1.6640625, "learning_rate": 4.953286035106898e-06, "loss": 0.0317, "num_tokens": 48055658.0, "step": 478 }, { "epoch": 1.0912460792700314, "grad_norm": 1.8671875, "learning_rate": 4.952922942512452e-06, "loss": 0.0419, "num_tokens": 48156154.0, "step": 479 }, { "epoch": 1.093527231251782, "grad_norm": 1.6640625, "learning_rate": 4.9525584576862985e-06, "loss": 0.0319, "num_tokens": 48256581.0, "step": 480 }, { "epoch": 1.095808383233533, "grad_norm": 1.8203125, "learning_rate": 4.952192580835313e-06, "loss": 0.036, "num_tokens": 48357380.0, "step": 481 }, { "epoch": 1.0980895352152837, "grad_norm": 1.9140625, "learning_rate": 4.9518253121671595e-06, "loss": 0.0381, "num_tokens": 48458503.0, "step": 482 }, { "epoch": 1.1003706871970345, "grad_norm": 1.609375, "learning_rate": 4.951456651890294e-06, "loss": 0.0257, "num_tokens": 48559756.0, "step": 483 }, { "epoch": 1.1026518391787854, "grad_norm": 1.90625, "learning_rate": 4.951086600213959e-06, "loss": 0.0416, "num_tokens": 48660333.0, "step": 484 }, { "epoch": 1.1049329911605361, "grad_norm": 1.8125, "learning_rate": 4.950715157348191e-06, "loss": 0.0284, "num_tokens": 48760735.0, "step": 485 }, { "epoch": 1.1072141431422868, "grad_norm": 2.109375, "learning_rate": 4.950342323503812e-06, "loss": 0.0423, "num_tokens": 48861180.0, "step": 486 }, { "epoch": 1.1094952951240376, "grad_norm": 2.1875, "learning_rate": 4.949968098892436e-06, "loss": 0.0481, "num_tokens": 48961184.0, "step": 487 }, { "epoch": 1.1117764471057885, "grad_norm": 1.6484375, "learning_rate": 4.949592483726465e-06, "loss": 0.0319, "num_tokens": 49062062.0, "step": 488 }, { "epoch": 1.1140575990875392, "grad_norm": 1.8984375, "learning_rate": 4.949215478219092e-06, "loss": 0.0388, "num_tokens": 49162142.0, "step": 489 }, { "epoch": 1.11633875106929, "grad_norm": 1.671875, "learning_rate": 4.948837082584298e-06, "loss": 0.0368, "num_tokens": 49262954.0, "step": 490 }, { "epoch": 1.1186199030510409, "grad_norm": 1.8125, "learning_rate": 4.9484572970368516e-06, "loss": 0.0325, "num_tokens": 49363161.0, "step": 491 }, { "epoch": 1.1209010550327916, "grad_norm": 1.8359375, "learning_rate": 4.948076121792313e-06, "loss": 0.0312, "num_tokens": 49463112.0, "step": 492 }, { "epoch": 1.1231822070145423, "grad_norm": 1.6875, "learning_rate": 4.9476935570670294e-06, "loss": 0.032, "num_tokens": 49564093.0, "step": 493 }, { "epoch": 1.125463358996293, "grad_norm": 1.6328125, "learning_rate": 4.947309603078138e-06, "loss": 0.0393, "num_tokens": 49665064.0, "step": 494 }, { "epoch": 1.127744510978044, "grad_norm": 1.7890625, "learning_rate": 4.946924260043563e-06, "loss": 0.03, "num_tokens": 49765547.0, "step": 495 }, { "epoch": 1.1300256629597947, "grad_norm": 1.4453125, "learning_rate": 4.946537528182017e-06, "loss": 0.0328, "num_tokens": 49865847.0, "step": 496 }, { "epoch": 1.1323068149415454, "grad_norm": 1.921875, "learning_rate": 4.946149407713002e-06, "loss": 0.0376, "num_tokens": 49966612.0, "step": 497 }, { "epoch": 1.1345879669232963, "grad_norm": 1.546875, "learning_rate": 4.945759898856809e-06, "loss": 0.0352, "num_tokens": 50067196.0, "step": 498 }, { "epoch": 1.136869118905047, "grad_norm": 1.8125, "learning_rate": 4.9453690018345144e-06, "loss": 0.0406, "num_tokens": 50167238.0, "step": 499 }, { "epoch": 1.1391502708867978, "grad_norm": 1.7578125, "learning_rate": 4.944976716867984e-06, "loss": 0.0347, "num_tokens": 50267261.0, "step": 500 }, { "epoch": 1.1414314228685487, "grad_norm": 1.859375, "learning_rate": 4.944583044179871e-06, "loss": 0.0295, "num_tokens": 50368125.0, "step": 501 }, { "epoch": 1.1437125748502994, "grad_norm": 1.6328125, "learning_rate": 4.944187983993617e-06, "loss": 0.03, "num_tokens": 50469087.0, "step": 502 }, { "epoch": 1.1459937268320501, "grad_norm": 1.9921875, "learning_rate": 4.94379153653345e-06, "loss": 0.0341, "num_tokens": 50569130.0, "step": 503 }, { "epoch": 1.148274878813801, "grad_norm": 1.953125, "learning_rate": 4.9433937020243854e-06, "loss": 0.0469, "num_tokens": 50669687.0, "step": 504 }, { "epoch": 1.1505560307955518, "grad_norm": 1.890625, "learning_rate": 4.942994480692228e-06, "loss": 0.0342, "num_tokens": 50769889.0, "step": 505 }, { "epoch": 1.1528371827773025, "grad_norm": 1.2109375, "learning_rate": 4.942593872763566e-06, "loss": 0.0256, "num_tokens": 50870344.0, "step": 506 }, { "epoch": 1.1551183347590532, "grad_norm": 1.7734375, "learning_rate": 4.9421918784657795e-06, "loss": 0.0318, "num_tokens": 50970940.0, "step": 507 }, { "epoch": 1.1573994867408042, "grad_norm": 1.328125, "learning_rate": 4.94178849802703e-06, "loss": 0.0326, "num_tokens": 51071211.0, "step": 508 }, { "epoch": 1.159680638722555, "grad_norm": 1.8828125, "learning_rate": 4.9413837316762705e-06, "loss": 0.0416, "num_tokens": 51171430.0, "step": 509 }, { "epoch": 1.1619617907043056, "grad_norm": 1.4765625, "learning_rate": 4.940977579643237e-06, "loss": 0.0292, "num_tokens": 51271857.0, "step": 510 }, { "epoch": 1.1642429426860565, "grad_norm": 1.5703125, "learning_rate": 4.940570042158454e-06, "loss": 0.0349, "num_tokens": 51372580.0, "step": 511 }, { "epoch": 1.1665240946678073, "grad_norm": 1.59375, "learning_rate": 4.940161119453232e-06, "loss": 0.0389, "num_tokens": 51473369.0, "step": 512 }, { "epoch": 1.168805246649558, "grad_norm": 1.625, "learning_rate": 4.939750811759668e-06, "loss": 0.0341, "num_tokens": 51574013.0, "step": 513 }, { "epoch": 1.1710863986313087, "grad_norm": 1.8671875, "learning_rate": 4.939339119310645e-06, "loss": 0.0402, "num_tokens": 51674950.0, "step": 514 }, { "epoch": 1.1733675506130596, "grad_norm": 1.875, "learning_rate": 4.93892604233983e-06, "loss": 0.0437, "num_tokens": 51775521.0, "step": 515 }, { "epoch": 1.1756487025948104, "grad_norm": 2.046875, "learning_rate": 4.93851158108168e-06, "loss": 0.0421, "num_tokens": 51876638.0, "step": 516 }, { "epoch": 1.177929854576561, "grad_norm": 1.40625, "learning_rate": 4.938095735771433e-06, "loss": 0.0247, "num_tokens": 51977365.0, "step": 517 }, { "epoch": 1.180211006558312, "grad_norm": 1.796875, "learning_rate": 4.937678506645116e-06, "loss": 0.0432, "num_tokens": 52078113.0, "step": 518 }, { "epoch": 1.1824921585400627, "grad_norm": 1.453125, "learning_rate": 4.937259893939539e-06, "loss": 0.0325, "num_tokens": 52179140.0, "step": 519 }, { "epoch": 1.1847733105218134, "grad_norm": 1.6484375, "learning_rate": 4.9368398978923e-06, "loss": 0.034, "num_tokens": 52279817.0, "step": 520 }, { "epoch": 1.1870544625035644, "grad_norm": 1.5703125, "learning_rate": 4.93641851874178e-06, "loss": 0.0338, "num_tokens": 52380155.0, "step": 521 }, { "epoch": 1.189335614485315, "grad_norm": 1.5625, "learning_rate": 4.935995756727146e-06, "loss": 0.0321, "num_tokens": 52480924.0, "step": 522 }, { "epoch": 1.1916167664670658, "grad_norm": 1.4140625, "learning_rate": 4.935571612088349e-06, "loss": 0.0209, "num_tokens": 52581868.0, "step": 523 }, { "epoch": 1.1938979184488168, "grad_norm": 2.0625, "learning_rate": 4.935146085066125e-06, "loss": 0.0365, "num_tokens": 52682218.0, "step": 524 }, { "epoch": 1.1961790704305675, "grad_norm": 1.78125, "learning_rate": 4.934719175901996e-06, "loss": 0.0375, "num_tokens": 52782734.0, "step": 525 }, { "epoch": 1.1984602224123182, "grad_norm": 1.5078125, "learning_rate": 4.934290884838266e-06, "loss": 0.0241, "num_tokens": 52883557.0, "step": 526 }, { "epoch": 1.200741374394069, "grad_norm": 2.40625, "learning_rate": 4.933861212118027e-06, "loss": 0.0287, "num_tokens": 52984218.0, "step": 527 }, { "epoch": 1.2030225263758199, "grad_norm": 1.5078125, "learning_rate": 4.933430157985151e-06, "loss": 0.0333, "num_tokens": 53084598.0, "step": 528 }, { "epoch": 1.2053036783575706, "grad_norm": 1.4296875, "learning_rate": 4.932997722684296e-06, "loss": 0.025, "num_tokens": 53185097.0, "step": 529 }, { "epoch": 1.2075848303393213, "grad_norm": 1.7265625, "learning_rate": 4.932563906460905e-06, "loss": 0.0236, "num_tokens": 53285440.0, "step": 530 }, { "epoch": 1.2098659823210722, "grad_norm": 1.78125, "learning_rate": 4.932128709561202e-06, "loss": 0.0361, "num_tokens": 53386683.0, "step": 531 }, { "epoch": 1.212147134302823, "grad_norm": 1.734375, "learning_rate": 4.931692132232198e-06, "loss": 0.0274, "num_tokens": 53487677.0, "step": 532 }, { "epoch": 1.2144282862845737, "grad_norm": 1.875, "learning_rate": 4.931254174721687e-06, "loss": 0.04, "num_tokens": 53588195.0, "step": 533 }, { "epoch": 1.2167094382663244, "grad_norm": 1.6171875, "learning_rate": 4.930814837278242e-06, "loss": 0.0285, "num_tokens": 53689526.0, "step": 534 }, { "epoch": 1.2189905902480753, "grad_norm": 1.7265625, "learning_rate": 4.930374120151225e-06, "loss": 0.0341, "num_tokens": 53790195.0, "step": 535 }, { "epoch": 1.221271742229826, "grad_norm": 2.046875, "learning_rate": 4.929932023590776e-06, "loss": 0.0375, "num_tokens": 53890252.0, "step": 536 }, { "epoch": 1.2235528942115768, "grad_norm": 2.046875, "learning_rate": 4.929488547847823e-06, "loss": 0.0412, "num_tokens": 53990610.0, "step": 537 }, { "epoch": 1.2258340461933277, "grad_norm": 1.34375, "learning_rate": 4.9290436931740735e-06, "loss": 0.0306, "num_tokens": 54091537.0, "step": 538 }, { "epoch": 1.2281151981750784, "grad_norm": 1.984375, "learning_rate": 4.928597459822018e-06, "loss": 0.0396, "num_tokens": 54192056.0, "step": 539 }, { "epoch": 1.2303963501568291, "grad_norm": 1.8515625, "learning_rate": 4.928149848044931e-06, "loss": 0.0394, "num_tokens": 54292679.0, "step": 540 }, { "epoch": 1.23267750213858, "grad_norm": 1.625, "learning_rate": 4.9277008580968665e-06, "loss": 0.0289, "num_tokens": 54393402.0, "step": 541 }, { "epoch": 1.2349586541203308, "grad_norm": 1.4375, "learning_rate": 4.927250490232664e-06, "loss": 0.0253, "num_tokens": 54493916.0, "step": 542 }, { "epoch": 1.2372398061020815, "grad_norm": 1.53125, "learning_rate": 4.926798744707943e-06, "loss": 0.0287, "num_tokens": 54594984.0, "step": 543 }, { "epoch": 1.2395209580838324, "grad_norm": 1.546875, "learning_rate": 4.926345621779106e-06, "loss": 0.0344, "num_tokens": 54695926.0, "step": 544 }, { "epoch": 1.2418021100655832, "grad_norm": 1.3984375, "learning_rate": 4.9258911217033355e-06, "loss": 0.0247, "num_tokens": 54796424.0, "step": 545 }, { "epoch": 1.2440832620473339, "grad_norm": 1.6796875, "learning_rate": 4.925435244738599e-06, "loss": 0.0298, "num_tokens": 54897028.0, "step": 546 }, { "epoch": 1.2463644140290846, "grad_norm": 1.4921875, "learning_rate": 4.924977991143642e-06, "loss": 0.0226, "num_tokens": 54998559.0, "step": 547 }, { "epoch": 1.2486455660108355, "grad_norm": 1.359375, "learning_rate": 4.924519361177993e-06, "loss": 0.0273, "num_tokens": 55099535.0, "step": 548 }, { "epoch": 1.2509267179925863, "grad_norm": 1.609375, "learning_rate": 4.9240593551019625e-06, "loss": 0.0366, "num_tokens": 55200809.0, "step": 549 }, { "epoch": 1.253207869974337, "grad_norm": 1.8984375, "learning_rate": 4.92359797317664e-06, "loss": 0.0338, "num_tokens": 55301400.0, "step": 550 }, { "epoch": 1.255489021956088, "grad_norm": 1.40625, "learning_rate": 4.923135215663897e-06, "loss": 0.0245, "num_tokens": 55401976.0, "step": 551 }, { "epoch": 1.2577701739378386, "grad_norm": 1.7421875, "learning_rate": 4.922671082826386e-06, "loss": 0.0353, "num_tokens": 55502303.0, "step": 552 }, { "epoch": 1.2600513259195893, "grad_norm": 1.890625, "learning_rate": 4.92220557492754e-06, "loss": 0.0335, "num_tokens": 55603011.0, "step": 553 }, { "epoch": 1.26233247790134, "grad_norm": 1.2578125, "learning_rate": 4.921738692231572e-06, "loss": 0.0189, "num_tokens": 55703477.0, "step": 554 }, { "epoch": 1.264613629883091, "grad_norm": 1.53125, "learning_rate": 4.9212704350034764e-06, "loss": 0.0286, "num_tokens": 55803994.0, "step": 555 }, { "epoch": 1.2668947818648417, "grad_norm": 1.5, "learning_rate": 4.920800803509026e-06, "loss": 0.0242, "num_tokens": 55904250.0, "step": 556 }, { "epoch": 1.2691759338465927, "grad_norm": 1.6640625, "learning_rate": 4.920329798014775e-06, "loss": 0.0247, "num_tokens": 56005051.0, "step": 557 }, { "epoch": 1.2714570858283434, "grad_norm": 2.0, "learning_rate": 4.919857418788056e-06, "loss": 0.0328, "num_tokens": 56105287.0, "step": 558 }, { "epoch": 1.273738237810094, "grad_norm": 1.9765625, "learning_rate": 4.919383666096985e-06, "loss": 0.0345, "num_tokens": 56205784.0, "step": 559 }, { "epoch": 1.2760193897918448, "grad_norm": 1.7109375, "learning_rate": 4.918908540210452e-06, "loss": 0.0292, "num_tokens": 56306810.0, "step": 560 }, { "epoch": 1.2783005417735955, "grad_norm": 2.0, "learning_rate": 4.91843204139813e-06, "loss": 0.0313, "num_tokens": 56407542.0, "step": 561 }, { "epoch": 1.2805816937553465, "grad_norm": 1.8828125, "learning_rate": 4.917954169930472e-06, "loss": 0.0324, "num_tokens": 56508291.0, "step": 562 }, { "epoch": 1.2828628457370972, "grad_norm": 2.5, "learning_rate": 4.917474926078707e-06, "loss": 0.0511, "num_tokens": 56609829.0, "step": 563 }, { "epoch": 1.2851439977188481, "grad_norm": 1.9140625, "learning_rate": 4.916994310114845e-06, "loss": 0.0329, "num_tokens": 56711108.0, "step": 564 }, { "epoch": 1.2874251497005988, "grad_norm": 2.046875, "learning_rate": 4.916512322311675e-06, "loss": 0.0377, "num_tokens": 56811844.0, "step": 565 }, { "epoch": 1.2897063016823496, "grad_norm": 1.3203125, "learning_rate": 4.916028962942763e-06, "loss": 0.0226, "num_tokens": 56912797.0, "step": 566 }, { "epoch": 1.2919874536641003, "grad_norm": 1.8359375, "learning_rate": 4.915544232282455e-06, "loss": 0.032, "num_tokens": 57013694.0, "step": 567 }, { "epoch": 1.2942686056458512, "grad_norm": 1.6171875, "learning_rate": 4.915058130605874e-06, "loss": 0.0301, "num_tokens": 57114187.0, "step": 568 }, { "epoch": 1.296549757627602, "grad_norm": 1.4765625, "learning_rate": 4.9145706581889235e-06, "loss": 0.0374, "num_tokens": 57214364.0, "step": 569 }, { "epoch": 1.2988309096093527, "grad_norm": 1.515625, "learning_rate": 4.914081815308283e-06, "loss": 0.0281, "num_tokens": 57314867.0, "step": 570 }, { "epoch": 1.3011120615911036, "grad_norm": 1.71875, "learning_rate": 4.913591602241409e-06, "loss": 0.0379, "num_tokens": 57415510.0, "step": 571 }, { "epoch": 1.3033932135728543, "grad_norm": 1.8203125, "learning_rate": 4.9131000192665365e-06, "loss": 0.0406, "num_tokens": 57515984.0, "step": 572 }, { "epoch": 1.305674365554605, "grad_norm": 1.71875, "learning_rate": 4.9126070666626815e-06, "loss": 0.033, "num_tokens": 57616638.0, "step": 573 }, { "epoch": 1.3079555175363557, "grad_norm": 1.1171875, "learning_rate": 4.912112744709632e-06, "loss": 0.0205, "num_tokens": 57717023.0, "step": 574 }, { "epoch": 1.3102366695181067, "grad_norm": 1.734375, "learning_rate": 4.911617053687957e-06, "loss": 0.0313, "num_tokens": 57817759.0, "step": 575 }, { "epoch": 1.3125178214998574, "grad_norm": 1.6875, "learning_rate": 4.911119993878999e-06, "loss": 0.0359, "num_tokens": 57918307.0, "step": 576 }, { "epoch": 1.3147989734816083, "grad_norm": 1.6953125, "learning_rate": 4.910621565564882e-06, "loss": 0.0371, "num_tokens": 58019213.0, "step": 577 }, { "epoch": 1.317080125463359, "grad_norm": 1.7109375, "learning_rate": 4.910121769028503e-06, "loss": 0.0385, "num_tokens": 58119850.0, "step": 578 }, { "epoch": 1.3193612774451098, "grad_norm": 1.921875, "learning_rate": 4.909620604553537e-06, "loss": 0.036, "num_tokens": 58220255.0, "step": 579 }, { "epoch": 1.3216424294268605, "grad_norm": 1.4375, "learning_rate": 4.909118072424436e-06, "loss": 0.0256, "num_tokens": 58320691.0, "step": 580 }, { "epoch": 1.3239235814086112, "grad_norm": 1.453125, "learning_rate": 4.908614172926426e-06, "loss": 0.0245, "num_tokens": 58421373.0, "step": 581 }, { "epoch": 1.3262047333903622, "grad_norm": 1.546875, "learning_rate": 4.908108906345512e-06, "loss": 0.028, "num_tokens": 58522380.0, "step": 582 }, { "epoch": 1.3284858853721129, "grad_norm": 1.484375, "learning_rate": 4.907602272968473e-06, "loss": 0.0231, "num_tokens": 58622662.0, "step": 583 }, { "epoch": 1.3307670373538638, "grad_norm": 1.5625, "learning_rate": 4.907094273082865e-06, "loss": 0.0262, "num_tokens": 58722901.0, "step": 584 }, { "epoch": 1.3330481893356145, "grad_norm": 1.6171875, "learning_rate": 4.906584906977018e-06, "loss": 0.0303, "num_tokens": 58824084.0, "step": 585 }, { "epoch": 1.3353293413173652, "grad_norm": 1.9296875, "learning_rate": 4.906074174940038e-06, "loss": 0.0339, "num_tokens": 58924727.0, "step": 586 }, { "epoch": 1.337610493299116, "grad_norm": 1.7734375, "learning_rate": 4.905562077261808e-06, "loss": 0.0372, "num_tokens": 59024995.0, "step": 587 }, { "epoch": 1.339891645280867, "grad_norm": 1.78125, "learning_rate": 4.905048614232984e-06, "loss": 0.0295, "num_tokens": 59125611.0, "step": 588 }, { "epoch": 1.3421727972626176, "grad_norm": 1.8046875, "learning_rate": 4.904533786144998e-06, "loss": 0.0386, "num_tokens": 59226488.0, "step": 589 }, { "epoch": 1.3444539492443683, "grad_norm": 1.40625, "learning_rate": 4.904017593290056e-06, "loss": 0.0249, "num_tokens": 59326746.0, "step": 590 }, { "epoch": 1.3467351012261193, "grad_norm": 1.4921875, "learning_rate": 4.903500035961139e-06, "loss": 0.0309, "num_tokens": 59427771.0, "step": 591 }, { "epoch": 1.34901625320787, "grad_norm": 1.546875, "learning_rate": 4.902981114452005e-06, "loss": 0.0315, "num_tokens": 59528298.0, "step": 592 }, { "epoch": 1.3512974051896207, "grad_norm": 1.8046875, "learning_rate": 4.90246082905718e-06, "loss": 0.0311, "num_tokens": 59628861.0, "step": 593 }, { "epoch": 1.3535785571713714, "grad_norm": 1.9140625, "learning_rate": 4.90193918007197e-06, "loss": 0.03, "num_tokens": 59729471.0, "step": 594 }, { "epoch": 1.3558597091531224, "grad_norm": 2.125, "learning_rate": 4.901416167792452e-06, "loss": 0.0349, "num_tokens": 59830204.0, "step": 595 }, { "epoch": 1.358140861134873, "grad_norm": 1.328125, "learning_rate": 4.9008917925154795e-06, "loss": 0.02, "num_tokens": 59931096.0, "step": 596 }, { "epoch": 1.360422013116624, "grad_norm": 1.2734375, "learning_rate": 4.900366054538675e-06, "loss": 0.0213, "num_tokens": 60031261.0, "step": 597 }, { "epoch": 1.3627031650983747, "grad_norm": 1.3984375, "learning_rate": 4.8998389541604405e-06, "loss": 0.0269, "num_tokens": 60131831.0, "step": 598 }, { "epoch": 1.3649843170801255, "grad_norm": 1.7578125, "learning_rate": 4.899310491679945e-06, "loss": 0.0274, "num_tokens": 60232535.0, "step": 599 }, { "epoch": 1.3672654690618762, "grad_norm": 1.828125, "learning_rate": 4.898780667397136e-06, "loss": 0.036, "num_tokens": 60333282.0, "step": 600 }, { "epoch": 1.369546621043627, "grad_norm": 1.484375, "learning_rate": 4.89824948161273e-06, "loss": 0.0347, "num_tokens": 60433865.0, "step": 601 }, { "epoch": 1.3718277730253778, "grad_norm": 1.4453125, "learning_rate": 4.8977169346282184e-06, "loss": 0.0304, "num_tokens": 60534303.0, "step": 602 }, { "epoch": 1.3741089250071286, "grad_norm": 1.6796875, "learning_rate": 4.8971830267458645e-06, "loss": 0.028, "num_tokens": 60635191.0, "step": 603 }, { "epoch": 1.3763900769888795, "grad_norm": 1.53125, "learning_rate": 4.896647758268703e-06, "loss": 0.028, "num_tokens": 60735946.0, "step": 604 }, { "epoch": 1.3786712289706302, "grad_norm": 1.5, "learning_rate": 4.8961111295005444e-06, "loss": 0.0288, "num_tokens": 60836281.0, "step": 605 }, { "epoch": 1.380952380952381, "grad_norm": 1.9140625, "learning_rate": 4.895573140745967e-06, "loss": 0.0352, "num_tokens": 60936777.0, "step": 606 }, { "epoch": 1.3832335329341316, "grad_norm": 1.6953125, "learning_rate": 4.895033792310323e-06, "loss": 0.0362, "num_tokens": 61037492.0, "step": 607 }, { "epoch": 1.3855146849158826, "grad_norm": 1.6640625, "learning_rate": 4.894493084499736e-06, "loss": 0.0334, "num_tokens": 61138324.0, "step": 608 }, { "epoch": 1.3877958368976333, "grad_norm": 1.3828125, "learning_rate": 4.893951017621103e-06, "loss": 0.0288, "num_tokens": 61239593.0, "step": 609 }, { "epoch": 1.390076988879384, "grad_norm": 1.5703125, "learning_rate": 4.893407591982088e-06, "loss": 0.0317, "num_tokens": 61339659.0, "step": 610 }, { "epoch": 1.392358140861135, "grad_norm": 1.90625, "learning_rate": 4.892862807891131e-06, "loss": 0.0371, "num_tokens": 61440386.0, "step": 611 }, { "epoch": 1.3946392928428857, "grad_norm": 1.84375, "learning_rate": 4.89231666565744e-06, "loss": 0.0318, "num_tokens": 61541175.0, "step": 612 }, { "epoch": 1.3969204448246364, "grad_norm": 1.8359375, "learning_rate": 4.891769165590995e-06, "loss": 0.0366, "num_tokens": 61642101.0, "step": 613 }, { "epoch": 1.3992015968063871, "grad_norm": 1.9921875, "learning_rate": 4.891220308002547e-06, "loss": 0.0336, "num_tokens": 61742232.0, "step": 614 }, { "epoch": 1.401482748788138, "grad_norm": 1.65625, "learning_rate": 4.890670093203617e-06, "loss": 0.0292, "num_tokens": 61842373.0, "step": 615 }, { "epoch": 1.4037639007698888, "grad_norm": 1.8046875, "learning_rate": 4.890118521506494e-06, "loss": 0.031, "num_tokens": 61942641.0, "step": 616 }, { "epoch": 1.4060450527516397, "grad_norm": 1.4453125, "learning_rate": 4.889565593224242e-06, "loss": 0.0335, "num_tokens": 62042870.0, "step": 617 }, { "epoch": 1.4083262047333904, "grad_norm": 1.6796875, "learning_rate": 4.889011308670693e-06, "loss": 0.0295, "num_tokens": 62143185.0, "step": 618 }, { "epoch": 1.4106073567151411, "grad_norm": 1.7578125, "learning_rate": 4.8884556681604445e-06, "loss": 0.0332, "num_tokens": 62244179.0, "step": 619 }, { "epoch": 1.4128885086968919, "grad_norm": 1.8046875, "learning_rate": 4.8878986720088715e-06, "loss": 0.033, "num_tokens": 62344570.0, "step": 620 }, { "epoch": 1.4151696606786426, "grad_norm": 1.5390625, "learning_rate": 4.8873403205321115e-06, "loss": 0.0343, "num_tokens": 62445576.0, "step": 621 }, { "epoch": 1.4174508126603935, "grad_norm": 1.578125, "learning_rate": 4.886780614047075e-06, "loss": 0.0328, "num_tokens": 62546411.0, "step": 622 }, { "epoch": 1.4197319646421442, "grad_norm": 2.125, "learning_rate": 4.886219552871441e-06, "loss": 0.0385, "num_tokens": 62647589.0, "step": 623 }, { "epoch": 1.4220131166238952, "grad_norm": 1.859375, "learning_rate": 4.885657137323656e-06, "loss": 0.0285, "num_tokens": 62748484.0, "step": 624 }, { "epoch": 1.424294268605646, "grad_norm": 1.453125, "learning_rate": 4.885093367722937e-06, "loss": 0.024, "num_tokens": 62849371.0, "step": 625 }, { "epoch": 1.4265754205873966, "grad_norm": 1.453125, "learning_rate": 4.884528244389269e-06, "loss": 0.0258, "num_tokens": 62949661.0, "step": 626 }, { "epoch": 1.4288565725691473, "grad_norm": 1.625, "learning_rate": 4.883961767643404e-06, "loss": 0.0382, "num_tokens": 63050272.0, "step": 627 }, { "epoch": 1.4311377245508983, "grad_norm": 1.984375, "learning_rate": 4.883393937806864e-06, "loss": 0.0372, "num_tokens": 63151170.0, "step": 628 }, { "epoch": 1.433418876532649, "grad_norm": 1.578125, "learning_rate": 4.882824755201938e-06, "loss": 0.0402, "num_tokens": 63252256.0, "step": 629 }, { "epoch": 1.4357000285143997, "grad_norm": 1.75, "learning_rate": 4.8822542201516835e-06, "loss": 0.0309, "num_tokens": 63353047.0, "step": 630 }, { "epoch": 1.4379811804961506, "grad_norm": 1.6015625, "learning_rate": 4.881682332979925e-06, "loss": 0.0394, "num_tokens": 63454380.0, "step": 631 }, { "epoch": 1.4402623324779014, "grad_norm": 1.7109375, "learning_rate": 4.881109094011254e-06, "loss": 0.036, "num_tokens": 63555715.0, "step": 632 }, { "epoch": 1.442543484459652, "grad_norm": 1.5625, "learning_rate": 4.88053450357103e-06, "loss": 0.0295, "num_tokens": 63656671.0, "step": 633 }, { "epoch": 1.4448246364414028, "grad_norm": 1.9296875, "learning_rate": 4.87995856198538e-06, "loss": 0.0397, "num_tokens": 63756927.0, "step": 634 }, { "epoch": 1.4471057884231537, "grad_norm": 1.8828125, "learning_rate": 4.879381269581197e-06, "loss": 0.0304, "num_tokens": 63857312.0, "step": 635 }, { "epoch": 1.4493869404049045, "grad_norm": 1.6796875, "learning_rate": 4.878802626686141e-06, "loss": 0.0298, "num_tokens": 63957984.0, "step": 636 }, { "epoch": 1.4516680923866554, "grad_norm": 1.4921875, "learning_rate": 4.8782226336286395e-06, "loss": 0.0296, "num_tokens": 64058673.0, "step": 637 }, { "epoch": 1.4539492443684061, "grad_norm": 1.453125, "learning_rate": 4.8776412907378845e-06, "loss": 0.0265, "num_tokens": 64159355.0, "step": 638 }, { "epoch": 1.4562303963501568, "grad_norm": 1.921875, "learning_rate": 4.877058598343835e-06, "loss": 0.0383, "num_tokens": 64260636.0, "step": 639 }, { "epoch": 1.4585115483319075, "grad_norm": 1.53125, "learning_rate": 4.876474556777216e-06, "loss": 0.0299, "num_tokens": 64361690.0, "step": 640 }, { "epoch": 1.4607927003136583, "grad_norm": 1.3203125, "learning_rate": 4.8758891663695165e-06, "loss": 0.021, "num_tokens": 64462311.0, "step": 641 }, { "epoch": 1.4630738522954092, "grad_norm": 1.65625, "learning_rate": 4.875302427452996e-06, "loss": 0.0374, "num_tokens": 64562576.0, "step": 642 }, { "epoch": 1.46535500427716, "grad_norm": 1.78125, "learning_rate": 4.874714340360674e-06, "loss": 0.0368, "num_tokens": 64663714.0, "step": 643 }, { "epoch": 1.4676361562589109, "grad_norm": 1.671875, "learning_rate": 4.874124905426339e-06, "loss": 0.024, "num_tokens": 64764590.0, "step": 644 }, { "epoch": 1.4699173082406616, "grad_norm": 1.4921875, "learning_rate": 4.873534122984541e-06, "loss": 0.0298, "num_tokens": 64864810.0, "step": 645 }, { "epoch": 1.4721984602224123, "grad_norm": 1.3359375, "learning_rate": 4.872941993370598e-06, "loss": 0.0262, "num_tokens": 64965402.0, "step": 646 }, { "epoch": 1.474479612204163, "grad_norm": 1.7890625, "learning_rate": 4.872348516920591e-06, "loss": 0.0398, "num_tokens": 65065775.0, "step": 647 }, { "epoch": 1.476760764185914, "grad_norm": 1.421875, "learning_rate": 4.8717536939713665e-06, "loss": 0.0238, "num_tokens": 65166876.0, "step": 648 }, { "epoch": 1.4790419161676647, "grad_norm": 1.46875, "learning_rate": 4.871157524860533e-06, "loss": 0.0269, "num_tokens": 65267659.0, "step": 649 }, { "epoch": 1.4813230681494154, "grad_norm": 1.640625, "learning_rate": 4.870560009926465e-06, "loss": 0.0307, "num_tokens": 65368835.0, "step": 650 }, { "epoch": 1.4836042201311663, "grad_norm": 1.9609375, "learning_rate": 4.869961149508301e-06, "loss": 0.0339, "num_tokens": 65468949.0, "step": 651 }, { "epoch": 1.485885372112917, "grad_norm": 1.4765625, "learning_rate": 4.869360943945943e-06, "loss": 0.0322, "num_tokens": 65569418.0, "step": 652 }, { "epoch": 1.4881665240946678, "grad_norm": 1.7734375, "learning_rate": 4.868759393580054e-06, "loss": 0.0316, "num_tokens": 65669870.0, "step": 653 }, { "epoch": 1.4904476760764185, "grad_norm": 1.6875, "learning_rate": 4.868156498752066e-06, "loss": 0.0354, "num_tokens": 65770607.0, "step": 654 }, { "epoch": 1.4927288280581694, "grad_norm": 1.6875, "learning_rate": 4.8675522598041675e-06, "loss": 0.0257, "num_tokens": 65870913.0, "step": 655 }, { "epoch": 1.4950099800399201, "grad_norm": 1.5, "learning_rate": 4.866946677079314e-06, "loss": 0.0305, "num_tokens": 65971846.0, "step": 656 }, { "epoch": 1.497291132021671, "grad_norm": 1.9765625, "learning_rate": 4.866339750921222e-06, "loss": 0.045, "num_tokens": 66072296.0, "step": 657 }, { "epoch": 1.4995722840034218, "grad_norm": 2.078125, "learning_rate": 4.86573148167437e-06, "loss": 0.0414, "num_tokens": 66173257.0, "step": 658 }, { "epoch": 1.5018534359851725, "grad_norm": 1.6015625, "learning_rate": 4.865121869684003e-06, "loss": 0.0318, "num_tokens": 66274396.0, "step": 659 }, { "epoch": 1.5041345879669232, "grad_norm": 1.71875, "learning_rate": 4.864510915296122e-06, "loss": 0.0353, "num_tokens": 66375732.0, "step": 660 }, { "epoch": 1.5041345879669232, "eval_loss": 0.03508564829826355, "eval_num_tokens": 66375732.0, "eval_runtime": 52.6824, "eval_samples_per_second": 159.161, "eval_steps_per_second": 4.992, "step": 660 }, { "epoch": 1.506415739948674, "grad_norm": 1.484375, "learning_rate": 4.8638986188574955e-06, "loss": 0.0312, "num_tokens": 66476327.0, "step": 661 }, { "epoch": 1.5086968919304249, "grad_norm": 2.328125, "learning_rate": 4.863284980715649e-06, "loss": 0.0405, "num_tokens": 66576804.0, "step": 662 }, { "epoch": 1.5109780439121756, "grad_norm": 1.4765625, "learning_rate": 4.8626700012188724e-06, "loss": 0.0334, "num_tokens": 66677381.0, "step": 663 }, { "epoch": 1.5132591958939265, "grad_norm": 1.484375, "learning_rate": 4.8620536807162164e-06, "loss": 0.027, "num_tokens": 66777706.0, "step": 664 }, { "epoch": 1.5155403478756773, "grad_norm": 2.546875, "learning_rate": 4.861436019557492e-06, "loss": 0.0488, "num_tokens": 66877979.0, "step": 665 }, { "epoch": 1.517821499857428, "grad_norm": 1.40625, "learning_rate": 4.8608170180932725e-06, "loss": 0.0289, "num_tokens": 66978366.0, "step": 666 }, { "epoch": 1.5201026518391787, "grad_norm": 1.34375, "learning_rate": 4.860196676674891e-06, "loss": 0.03, "num_tokens": 67078759.0, "step": 667 }, { "epoch": 1.5223838038209294, "grad_norm": 1.2421875, "learning_rate": 4.8595749956544414e-06, "loss": 0.0232, "num_tokens": 67179283.0, "step": 668 }, { "epoch": 1.5246649558026804, "grad_norm": 1.6171875, "learning_rate": 4.858951975384777e-06, "loss": 0.032, "num_tokens": 67279703.0, "step": 669 }, { "epoch": 1.5269461077844313, "grad_norm": 1.5859375, "learning_rate": 4.858327616219513e-06, "loss": 0.0357, "num_tokens": 67380360.0, "step": 670 }, { "epoch": 1.529227259766182, "grad_norm": 1.765625, "learning_rate": 4.857701918513023e-06, "loss": 0.0346, "num_tokens": 67480560.0, "step": 671 }, { "epoch": 1.5315084117479327, "grad_norm": 1.8125, "learning_rate": 4.857074882620442e-06, "loss": 0.0343, "num_tokens": 67581723.0, "step": 672 }, { "epoch": 1.5337895637296834, "grad_norm": 1.703125, "learning_rate": 4.856446508897662e-06, "loss": 0.0327, "num_tokens": 67682967.0, "step": 673 }, { "epoch": 1.5360707157114342, "grad_norm": 1.515625, "learning_rate": 4.8558167977013365e-06, "loss": 0.0287, "num_tokens": 67784183.0, "step": 674 }, { "epoch": 1.538351867693185, "grad_norm": 1.4296875, "learning_rate": 4.8551857493888775e-06, "loss": 0.029, "num_tokens": 67884431.0, "step": 675 }, { "epoch": 1.5406330196749358, "grad_norm": 1.796875, "learning_rate": 4.854553364318456e-06, "loss": 0.0353, "num_tokens": 67985435.0, "step": 676 }, { "epoch": 1.5429141716566868, "grad_norm": 1.609375, "learning_rate": 4.8539196428490016e-06, "loss": 0.0287, "num_tokens": 68085679.0, "step": 677 }, { "epoch": 1.5451953236384375, "grad_norm": 1.6953125, "learning_rate": 4.8532845853402015e-06, "loss": 0.0307, "num_tokens": 68186008.0, "step": 678 }, { "epoch": 1.5474764756201882, "grad_norm": 1.96875, "learning_rate": 4.8526481921525035e-06, "loss": 0.0393, "num_tokens": 68286663.0, "step": 679 }, { "epoch": 1.549757627601939, "grad_norm": 1.7265625, "learning_rate": 4.85201046364711e-06, "loss": 0.0305, "num_tokens": 68387477.0, "step": 680 }, { "epoch": 1.5520387795836896, "grad_norm": 1.6640625, "learning_rate": 4.851371400185986e-06, "loss": 0.034, "num_tokens": 68488152.0, "step": 681 }, { "epoch": 1.5543199315654406, "grad_norm": 1.9453125, "learning_rate": 4.85073100213185e-06, "loss": 0.0313, "num_tokens": 68588667.0, "step": 682 }, { "epoch": 1.5566010835471913, "grad_norm": 1.8203125, "learning_rate": 4.8500892698481784e-06, "loss": 0.0386, "num_tokens": 68689317.0, "step": 683 }, { "epoch": 1.5588822355289422, "grad_norm": 2.15625, "learning_rate": 4.849446203699209e-06, "loss": 0.0382, "num_tokens": 68789963.0, "step": 684 }, { "epoch": 1.561163387510693, "grad_norm": 1.6875, "learning_rate": 4.848801804049932e-06, "loss": 0.0361, "num_tokens": 68890718.0, "step": 685 }, { "epoch": 1.5634445394924437, "grad_norm": 1.59375, "learning_rate": 4.848156071266095e-06, "loss": 0.0328, "num_tokens": 68991177.0, "step": 686 }, { "epoch": 1.5657256914741944, "grad_norm": 1.984375, "learning_rate": 4.847509005714207e-06, "loss": 0.0385, "num_tokens": 69091478.0, "step": 687 }, { "epoch": 1.568006843455945, "grad_norm": 1.2890625, "learning_rate": 4.846860607761527e-06, "loss": 0.0276, "num_tokens": 69191869.0, "step": 688 }, { "epoch": 1.570287995437696, "grad_norm": 1.78125, "learning_rate": 4.8462108777760734e-06, "loss": 0.0375, "num_tokens": 69293083.0, "step": 689 }, { "epoch": 1.572569147419447, "grad_norm": 1.4453125, "learning_rate": 4.845559816126622e-06, "loss": 0.0337, "num_tokens": 69393688.0, "step": 690 }, { "epoch": 1.5748502994011977, "grad_norm": 1.3984375, "learning_rate": 4.844907423182699e-06, "loss": 0.0253, "num_tokens": 69494066.0, "step": 691 }, { "epoch": 1.5771314513829484, "grad_norm": 1.9140625, "learning_rate": 4.844253699314596e-06, "loss": 0.0413, "num_tokens": 69595338.0, "step": 692 }, { "epoch": 1.5794126033646991, "grad_norm": 1.3203125, "learning_rate": 4.843598644893349e-06, "loss": 0.028, "num_tokens": 69695797.0, "step": 693 }, { "epoch": 1.5816937553464498, "grad_norm": 1.484375, "learning_rate": 4.842942260290757e-06, "loss": 0.0271, "num_tokens": 69796651.0, "step": 694 }, { "epoch": 1.5839749073282008, "grad_norm": 1.8359375, "learning_rate": 4.84228454587937e-06, "loss": 0.0413, "num_tokens": 69897115.0, "step": 695 }, { "epoch": 1.5862560593099515, "grad_norm": 1.46875, "learning_rate": 4.841625502032495e-06, "loss": 0.0283, "num_tokens": 69997494.0, "step": 696 }, { "epoch": 1.5885372112917024, "grad_norm": 1.734375, "learning_rate": 4.84096512912419e-06, "loss": 0.0302, "num_tokens": 70097736.0, "step": 697 }, { "epoch": 1.5908183632734532, "grad_norm": 1.6953125, "learning_rate": 4.8403034275292735e-06, "loss": 0.0321, "num_tokens": 70198122.0, "step": 698 }, { "epoch": 1.5930995152552039, "grad_norm": 1.7421875, "learning_rate": 4.839640397623312e-06, "loss": 0.0239, "num_tokens": 70298673.0, "step": 699 }, { "epoch": 1.5953806672369546, "grad_norm": 1.53125, "learning_rate": 4.83897603978263e-06, "loss": 0.0293, "num_tokens": 70399279.0, "step": 700 }, { "epoch": 1.5976618192187053, "grad_norm": 1.578125, "learning_rate": 4.838310354384304e-06, "loss": 0.0319, "num_tokens": 70499713.0, "step": 701 }, { "epoch": 1.5999429712004563, "grad_norm": 1.2734375, "learning_rate": 4.8376433418061615e-06, "loss": 0.0263, "num_tokens": 70600345.0, "step": 702 }, { "epoch": 1.602224123182207, "grad_norm": 1.578125, "learning_rate": 4.8369750024267904e-06, "loss": 0.0282, "num_tokens": 70701327.0, "step": 703 }, { "epoch": 1.604505275163958, "grad_norm": 1.6953125, "learning_rate": 4.836305336625523e-06, "loss": 0.0319, "num_tokens": 70801875.0, "step": 704 }, { "epoch": 1.6067864271457086, "grad_norm": 1.578125, "learning_rate": 4.835634344782453e-06, "loss": 0.0294, "num_tokens": 70902502.0, "step": 705 }, { "epoch": 1.6090675791274593, "grad_norm": 1.90625, "learning_rate": 4.834962027278418e-06, "loss": 0.0329, "num_tokens": 71003205.0, "step": 706 }, { "epoch": 1.61134873110921, "grad_norm": 1.578125, "learning_rate": 4.834288384495015e-06, "loss": 0.0296, "num_tokens": 71103580.0, "step": 707 }, { "epoch": 1.6136298830909608, "grad_norm": 1.671875, "learning_rate": 4.833613416814591e-06, "loss": 0.0359, "num_tokens": 71204192.0, "step": 708 }, { "epoch": 1.6159110350727117, "grad_norm": 1.8046875, "learning_rate": 4.832937124620243e-06, "loss": 0.0321, "num_tokens": 71305264.0, "step": 709 }, { "epoch": 1.6181921870544627, "grad_norm": 1.6484375, "learning_rate": 4.832259508295822e-06, "loss": 0.0268, "num_tokens": 71405843.0, "step": 710 }, { "epoch": 1.6204733390362134, "grad_norm": 2.015625, "learning_rate": 4.831580568225931e-06, "loss": 0.0324, "num_tokens": 71507017.0, "step": 711 }, { "epoch": 1.622754491017964, "grad_norm": 1.3359375, "learning_rate": 4.830900304795921e-06, "loss": 0.0209, "num_tokens": 71608058.0, "step": 712 }, { "epoch": 1.6250356429997148, "grad_norm": 1.5703125, "learning_rate": 4.8302187183918996e-06, "loss": 0.0338, "num_tokens": 71708763.0, "step": 713 }, { "epoch": 1.6273167949814655, "grad_norm": 1.4375, "learning_rate": 4.8295358094007184e-06, "loss": 0.0324, "num_tokens": 71808597.0, "step": 714 }, { "epoch": 1.6295979469632165, "grad_norm": 1.9296875, "learning_rate": 4.828851578209986e-06, "loss": 0.0392, "num_tokens": 71909720.0, "step": 715 }, { "epoch": 1.6318790989449672, "grad_norm": 1.5, "learning_rate": 4.828166025208059e-06, "loss": 0.0257, "num_tokens": 72010432.0, "step": 716 }, { "epoch": 1.6341602509267181, "grad_norm": 1.9765625, "learning_rate": 4.8274791507840416e-06, "loss": 0.0358, "num_tokens": 72111071.0, "step": 717 }, { "epoch": 1.6364414029084688, "grad_norm": 1.6328125, "learning_rate": 4.826790955327793e-06, "loss": 0.0323, "num_tokens": 72212255.0, "step": 718 }, { "epoch": 1.6387225548902196, "grad_norm": 1.9296875, "learning_rate": 4.826101439229918e-06, "loss": 0.0354, "num_tokens": 72312436.0, "step": 719 }, { "epoch": 1.6410037068719703, "grad_norm": 1.8671875, "learning_rate": 4.825410602881774e-06, "loss": 0.0365, "num_tokens": 72412883.0, "step": 720 }, { "epoch": 1.643284858853721, "grad_norm": 2.0, "learning_rate": 4.824718446675465e-06, "loss": 0.0415, "num_tokens": 72513238.0, "step": 721 }, { "epoch": 1.645566010835472, "grad_norm": 1.625, "learning_rate": 4.8240249710038455e-06, "loss": 0.034, "num_tokens": 72613903.0, "step": 722 }, { "epoch": 1.6478471628172227, "grad_norm": 1.6640625, "learning_rate": 4.82333017626052e-06, "loss": 0.0255, "num_tokens": 72714138.0, "step": 723 }, { "epoch": 1.6501283147989736, "grad_norm": 1.9375, "learning_rate": 4.82263406283984e-06, "loss": 0.0366, "num_tokens": 72814583.0, "step": 724 }, { "epoch": 1.6524094667807243, "grad_norm": 1.5546875, "learning_rate": 4.821936631136907e-06, "loss": 0.0241, "num_tokens": 72915246.0, "step": 725 }, { "epoch": 1.654690618762475, "grad_norm": 1.640625, "learning_rate": 4.821237881547567e-06, "loss": 0.0314, "num_tokens": 73016753.0, "step": 726 }, { "epoch": 1.6569717707442257, "grad_norm": 1.359375, "learning_rate": 4.82053781446842e-06, "loss": 0.0269, "num_tokens": 73117307.0, "step": 727 }, { "epoch": 1.6592529227259765, "grad_norm": 1.5625, "learning_rate": 4.819836430296809e-06, "loss": 0.03, "num_tokens": 73217703.0, "step": 728 }, { "epoch": 1.6615340747077274, "grad_norm": 1.921875, "learning_rate": 4.819133729430826e-06, "loss": 0.045, "num_tokens": 73318423.0, "step": 729 }, { "epoch": 1.6638152266894783, "grad_norm": 1.5078125, "learning_rate": 4.818429712269312e-06, "loss": 0.025, "num_tokens": 73419546.0, "step": 730 }, { "epoch": 1.666096378671229, "grad_norm": 1.734375, "learning_rate": 4.8177243792118515e-06, "loss": 0.0267, "num_tokens": 73520101.0, "step": 731 }, { "epoch": 1.6683775306529798, "grad_norm": 1.78125, "learning_rate": 4.8170177306587785e-06, "loss": 0.041, "num_tokens": 73621939.0, "step": 732 }, { "epoch": 1.6706586826347305, "grad_norm": 2.28125, "learning_rate": 4.8163097670111735e-06, "loss": 0.0443, "num_tokens": 73723343.0, "step": 733 }, { "epoch": 1.6729398346164812, "grad_norm": 1.5625, "learning_rate": 4.815600488670863e-06, "loss": 0.0384, "num_tokens": 73823997.0, "step": 734 }, { "epoch": 1.6752209865982322, "grad_norm": 1.4140625, "learning_rate": 4.81488989604042e-06, "loss": 0.0237, "num_tokens": 73924275.0, "step": 735 }, { "epoch": 1.6775021385799829, "grad_norm": 1.796875, "learning_rate": 4.814177989523162e-06, "loss": 0.0398, "num_tokens": 74025591.0, "step": 736 }, { "epoch": 1.6797832905617338, "grad_norm": 1.515625, "learning_rate": 4.813464769523154e-06, "loss": 0.0311, "num_tokens": 74126372.0, "step": 737 }, { "epoch": 1.6820644425434845, "grad_norm": 1.578125, "learning_rate": 4.812750236445206e-06, "loss": 0.0368, "num_tokens": 74227496.0, "step": 738 }, { "epoch": 1.6843455945252352, "grad_norm": 1.734375, "learning_rate": 4.812034390694874e-06, "loss": 0.0429, "num_tokens": 74328393.0, "step": 739 }, { "epoch": 1.686626746506986, "grad_norm": 1.3046875, "learning_rate": 4.811317232678456e-06, "loss": 0.023, "num_tokens": 74428963.0, "step": 740 }, { "epoch": 1.6889078984887367, "grad_norm": 1.5546875, "learning_rate": 4.810598762803e-06, "loss": 0.0352, "num_tokens": 74530067.0, "step": 741 }, { "epoch": 1.6911890504704876, "grad_norm": 1.4609375, "learning_rate": 4.809878981476293e-06, "loss": 0.0285, "num_tokens": 74630398.0, "step": 742 }, { "epoch": 1.6934702024522383, "grad_norm": 1.6015625, "learning_rate": 4.80915788910687e-06, "loss": 0.0305, "num_tokens": 74730657.0, "step": 743 }, { "epoch": 1.6957513544339893, "grad_norm": 1.203125, "learning_rate": 4.80843548610401e-06, "loss": 0.0279, "num_tokens": 74831369.0, "step": 744 }, { "epoch": 1.69803250641574, "grad_norm": 1.4609375, "learning_rate": 4.807711772877733e-06, "loss": 0.0244, "num_tokens": 74931566.0, "step": 745 }, { "epoch": 1.7003136583974907, "grad_norm": 1.765625, "learning_rate": 4.8069867498388066e-06, "loss": 0.0365, "num_tokens": 75031699.0, "step": 746 }, { "epoch": 1.7025948103792414, "grad_norm": 1.625, "learning_rate": 4.806260417398739e-06, "loss": 0.0321, "num_tokens": 75131869.0, "step": 747 }, { "epoch": 1.7048759623609921, "grad_norm": 1.6875, "learning_rate": 4.805532775969783e-06, "loss": 0.0219, "num_tokens": 75232446.0, "step": 748 }, { "epoch": 1.707157114342743, "grad_norm": 1.3203125, "learning_rate": 4.804803825964933e-06, "loss": 0.0261, "num_tokens": 75332934.0, "step": 749 }, { "epoch": 1.709438266324494, "grad_norm": 1.5390625, "learning_rate": 4.804073567797928e-06, "loss": 0.0313, "num_tokens": 75433669.0, "step": 750 }, { "epoch": 1.7117194183062447, "grad_norm": 1.1875, "learning_rate": 4.803342001883247e-06, "loss": 0.0209, "num_tokens": 75534526.0, "step": 751 }, { "epoch": 1.7140005702879955, "grad_norm": 1.6640625, "learning_rate": 4.802609128636113e-06, "loss": 0.0305, "num_tokens": 75635047.0, "step": 752 }, { "epoch": 1.7162817222697462, "grad_norm": 1.8359375, "learning_rate": 4.801874948472492e-06, "loss": 0.0383, "num_tokens": 75735583.0, "step": 753 }, { "epoch": 1.718562874251497, "grad_norm": 1.7421875, "learning_rate": 4.801139461809089e-06, "loss": 0.0402, "num_tokens": 75836542.0, "step": 754 }, { "epoch": 1.7208440262332478, "grad_norm": 1.59375, "learning_rate": 4.800402669063353e-06, "loss": 0.0268, "num_tokens": 75936596.0, "step": 755 }, { "epoch": 1.7231251782149986, "grad_norm": 1.6171875, "learning_rate": 4.799664570653473e-06, "loss": 0.0327, "num_tokens": 76037303.0, "step": 756 }, { "epoch": 1.7254063301967495, "grad_norm": 1.4375, "learning_rate": 4.79892516699838e-06, "loss": 0.027, "num_tokens": 76138037.0, "step": 757 }, { "epoch": 1.7276874821785002, "grad_norm": 1.3359375, "learning_rate": 4.798184458517745e-06, "loss": 0.0307, "num_tokens": 76239298.0, "step": 758 }, { "epoch": 1.729968634160251, "grad_norm": 1.6875, "learning_rate": 4.797442445631978e-06, "loss": 0.0383, "num_tokens": 76339943.0, "step": 759 }, { "epoch": 1.7322497861420016, "grad_norm": 1.4296875, "learning_rate": 4.7966991287622335e-06, "loss": 0.0297, "num_tokens": 76440430.0, "step": 760 }, { "epoch": 1.7345309381237524, "grad_norm": 1.578125, "learning_rate": 4.795954508330403e-06, "loss": 0.0293, "num_tokens": 76540648.0, "step": 761 }, { "epoch": 1.7368120901055033, "grad_norm": 1.390625, "learning_rate": 4.795208584759119e-06, "loss": 0.0223, "num_tokens": 76641530.0, "step": 762 }, { "epoch": 1.739093242087254, "grad_norm": 1.0546875, "learning_rate": 4.794461358471753e-06, "loss": 0.0219, "num_tokens": 76742430.0, "step": 763 }, { "epoch": 1.741374394069005, "grad_norm": 1.5546875, "learning_rate": 4.7937128298924155e-06, "loss": 0.024, "num_tokens": 76843393.0, "step": 764 }, { "epoch": 1.7436555460507557, "grad_norm": 1.1796875, "learning_rate": 4.7929629994459584e-06, "loss": 0.0251, "num_tokens": 76944379.0, "step": 765 }, { "epoch": 1.7459366980325064, "grad_norm": 1.5, "learning_rate": 4.792211867557969e-06, "loss": 0.0257, "num_tokens": 77045334.0, "step": 766 }, { "epoch": 1.7482178500142571, "grad_norm": 1.28125, "learning_rate": 4.7914594346547774e-06, "loss": 0.0213, "num_tokens": 77145737.0, "step": 767 }, { "epoch": 1.7504990019960078, "grad_norm": 1.4375, "learning_rate": 4.790705701163449e-06, "loss": 0.0238, "num_tokens": 77246145.0, "step": 768 }, { "epoch": 1.7527801539777588, "grad_norm": 1.6796875, "learning_rate": 4.789950667511789e-06, "loss": 0.0406, "num_tokens": 77346957.0, "step": 769 }, { "epoch": 1.7550613059595097, "grad_norm": 1.921875, "learning_rate": 4.789194334128338e-06, "loss": 0.0283, "num_tokens": 77447568.0, "step": 770 }, { "epoch": 1.7573424579412604, "grad_norm": 1.28125, "learning_rate": 4.788436701442378e-06, "loss": 0.027, "num_tokens": 77548590.0, "step": 771 }, { "epoch": 1.7596236099230111, "grad_norm": 1.6953125, "learning_rate": 4.787677769883926e-06, "loss": 0.0403, "num_tokens": 77649308.0, "step": 772 }, { "epoch": 1.7619047619047619, "grad_norm": 1.46875, "learning_rate": 4.786917539883738e-06, "loss": 0.0281, "num_tokens": 77750094.0, "step": 773 }, { "epoch": 1.7641859138865126, "grad_norm": 1.59375, "learning_rate": 4.786156011873304e-06, "loss": 0.0361, "num_tokens": 77850628.0, "step": 774 }, { "epoch": 1.7664670658682635, "grad_norm": 1.359375, "learning_rate": 4.785393186284854e-06, "loss": 0.0227, "num_tokens": 77950887.0, "step": 775 }, { "epoch": 1.7687482178500142, "grad_norm": 1.9453125, "learning_rate": 4.784629063551354e-06, "loss": 0.0281, "num_tokens": 78051621.0, "step": 776 }, { "epoch": 1.7710293698317652, "grad_norm": 2.25, "learning_rate": 4.783863644106502e-06, "loss": 0.0355, "num_tokens": 78152468.0, "step": 777 }, { "epoch": 1.773310521813516, "grad_norm": 1.8828125, "learning_rate": 4.783096928384739e-06, "loss": 0.0344, "num_tokens": 78253503.0, "step": 778 }, { "epoch": 1.7755916737952666, "grad_norm": 1.7734375, "learning_rate": 4.782328916821235e-06, "loss": 0.041, "num_tokens": 78353868.0, "step": 779 }, { "epoch": 1.7778728257770173, "grad_norm": 1.75, "learning_rate": 4.7815596098519004e-06, "loss": 0.0305, "num_tokens": 78454483.0, "step": 780 }, { "epoch": 1.780153977758768, "grad_norm": 2.109375, "learning_rate": 4.780789007913379e-06, "loss": 0.0414, "num_tokens": 78555352.0, "step": 781 }, { "epoch": 1.782435129740519, "grad_norm": 1.8671875, "learning_rate": 4.780017111443048e-06, "loss": 0.0416, "num_tokens": 78655886.0, "step": 782 }, { "epoch": 1.7847162817222697, "grad_norm": 1.90625, "learning_rate": 4.779243920879023e-06, "loss": 0.0371, "num_tokens": 78756017.0, "step": 783 }, { "epoch": 1.7869974337040206, "grad_norm": 2.140625, "learning_rate": 4.77846943666015e-06, "loss": 0.0388, "num_tokens": 78857034.0, "step": 784 }, { "epoch": 1.7892785856857714, "grad_norm": 1.6875, "learning_rate": 4.777693659226013e-06, "loss": 0.0333, "num_tokens": 78957884.0, "step": 785 }, { "epoch": 1.791559737667522, "grad_norm": 1.453125, "learning_rate": 4.776916589016928e-06, "loss": 0.0266, "num_tokens": 79058288.0, "step": 786 }, { "epoch": 1.7938408896492728, "grad_norm": 1.984375, "learning_rate": 4.776138226473944e-06, "loss": 0.0363, "num_tokens": 79158505.0, "step": 787 }, { "epoch": 1.7961220416310235, "grad_norm": 1.90625, "learning_rate": 4.775358572038845e-06, "loss": 0.0362, "num_tokens": 79259308.0, "step": 788 }, { "epoch": 1.7984031936127745, "grad_norm": 1.640625, "learning_rate": 4.774577626154148e-06, "loss": 0.033, "num_tokens": 79360125.0, "step": 789 }, { "epoch": 1.8006843455945254, "grad_norm": 1.640625, "learning_rate": 4.773795389263104e-06, "loss": 0.0394, "num_tokens": 79460920.0, "step": 790 }, { "epoch": 1.802965497576276, "grad_norm": 1.4609375, "learning_rate": 4.773011861809694e-06, "loss": 0.039, "num_tokens": 79561592.0, "step": 791 }, { "epoch": 1.8052466495580268, "grad_norm": 1.65625, "learning_rate": 4.772227044238632e-06, "loss": 0.0331, "num_tokens": 79662392.0, "step": 792 }, { "epoch": 1.8075278015397775, "grad_norm": 1.609375, "learning_rate": 4.771440936995367e-06, "loss": 0.0362, "num_tokens": 79762870.0, "step": 793 }, { "epoch": 1.8098089535215283, "grad_norm": 1.484375, "learning_rate": 4.770653540526079e-06, "loss": 0.0294, "num_tokens": 79863603.0, "step": 794 }, { "epoch": 1.8120901055032792, "grad_norm": 1.5078125, "learning_rate": 4.7698648552776785e-06, "loss": 0.0295, "num_tokens": 79963903.0, "step": 795 }, { "epoch": 1.81437125748503, "grad_norm": 1.6875, "learning_rate": 4.769074881697806e-06, "loss": 0.0255, "num_tokens": 80064771.0, "step": 796 }, { "epoch": 1.8166524094667809, "grad_norm": 1.5859375, "learning_rate": 4.768283620234838e-06, "loss": 0.0327, "num_tokens": 80165415.0, "step": 797 }, { "epoch": 1.8189335614485316, "grad_norm": 1.5546875, "learning_rate": 4.767491071337877e-06, "loss": 0.0287, "num_tokens": 80266273.0, "step": 798 }, { "epoch": 1.8212147134302823, "grad_norm": 1.578125, "learning_rate": 4.766697235456761e-06, "loss": 0.0331, "num_tokens": 80367852.0, "step": 799 }, { "epoch": 1.823495865412033, "grad_norm": 1.6171875, "learning_rate": 4.765902113042053e-06, "loss": 0.0353, "num_tokens": 80467939.0, "step": 800 }, { "epoch": 1.8257770173937837, "grad_norm": 1.6328125, "learning_rate": 4.765105704545052e-06, "loss": 0.0332, "num_tokens": 80568601.0, "step": 801 }, { "epoch": 1.8280581693755347, "grad_norm": 1.65625, "learning_rate": 4.7643080104177815e-06, "loss": 0.0344, "num_tokens": 80669212.0, "step": 802 }, { "epoch": 1.8303393213572854, "grad_norm": 1.6875, "learning_rate": 4.763509031113e-06, "loss": 0.0364, "num_tokens": 80770184.0, "step": 803 }, { "epoch": 1.8326204733390363, "grad_norm": 1.7109375, "learning_rate": 4.7627087670841894e-06, "loss": 0.0274, "num_tokens": 80870763.0, "step": 804 }, { "epoch": 1.834901625320787, "grad_norm": 1.546875, "learning_rate": 4.761907218785566e-06, "loss": 0.0238, "num_tokens": 80971062.0, "step": 805 }, { "epoch": 1.8371827773025378, "grad_norm": 1.6875, "learning_rate": 4.761104386672074e-06, "loss": 0.0269, "num_tokens": 81072136.0, "step": 806 }, { "epoch": 1.8394639292842885, "grad_norm": 1.875, "learning_rate": 4.760300271199384e-06, "loss": 0.0412, "num_tokens": 81172925.0, "step": 807 }, { "epoch": 1.8417450812660392, "grad_norm": 1.6953125, "learning_rate": 4.759494872823896e-06, "loss": 0.0372, "num_tokens": 81273373.0, "step": 808 }, { "epoch": 1.8440262332477901, "grad_norm": 1.453125, "learning_rate": 4.758688192002741e-06, "loss": 0.0285, "num_tokens": 81374238.0, "step": 809 }, { "epoch": 1.846307385229541, "grad_norm": 1.7578125, "learning_rate": 4.757880229193773e-06, "loss": 0.0357, "num_tokens": 81474638.0, "step": 810 }, { "epoch": 1.8485885372112918, "grad_norm": 1.6484375, "learning_rate": 4.757070984855577e-06, "loss": 0.0294, "num_tokens": 81575209.0, "step": 811 }, { "epoch": 1.8508696891930425, "grad_norm": 1.84375, "learning_rate": 4.756260459447465e-06, "loss": 0.0387, "num_tokens": 81675601.0, "step": 812 }, { "epoch": 1.8531508411747932, "grad_norm": 1.3671875, "learning_rate": 4.755448653429475e-06, "loss": 0.0237, "num_tokens": 81776578.0, "step": 813 }, { "epoch": 1.855431993156544, "grad_norm": 2.0, "learning_rate": 4.754635567262372e-06, "loss": 0.032, "num_tokens": 81877258.0, "step": 814 }, { "epoch": 1.8577131451382949, "grad_norm": 1.3671875, "learning_rate": 4.753821201407648e-06, "loss": 0.0277, "num_tokens": 81977861.0, "step": 815 }, { "epoch": 1.8599942971200456, "grad_norm": 1.5390625, "learning_rate": 4.7530055563275225e-06, "loss": 0.0335, "num_tokens": 82078945.0, "step": 816 }, { "epoch": 1.8622754491017965, "grad_norm": 1.640625, "learning_rate": 4.7521886324849385e-06, "loss": 0.039, "num_tokens": 82179249.0, "step": 817 }, { "epoch": 1.8645566010835473, "grad_norm": 1.3359375, "learning_rate": 4.751370430343568e-06, "loss": 0.0293, "num_tokens": 82280049.0, "step": 818 }, { "epoch": 1.866837753065298, "grad_norm": 1.3671875, "learning_rate": 4.750550950367805e-06, "loss": 0.0341, "num_tokens": 82380730.0, "step": 819 }, { "epoch": 1.8691189050470487, "grad_norm": 1.65625, "learning_rate": 4.749730193022771e-06, "loss": 0.0361, "num_tokens": 82481134.0, "step": 820 }, { "epoch": 1.8714000570287994, "grad_norm": 1.7734375, "learning_rate": 4.748908158774312e-06, "loss": 0.0306, "num_tokens": 82582056.0, "step": 821 }, { "epoch": 1.8736812090105504, "grad_norm": 1.4453125, "learning_rate": 4.748084848089e-06, "loss": 0.0317, "num_tokens": 82682479.0, "step": 822 }, { "epoch": 1.875962360992301, "grad_norm": 1.6484375, "learning_rate": 4.747260261434128e-06, "loss": 0.0293, "num_tokens": 82783445.0, "step": 823 }, { "epoch": 1.878243512974052, "grad_norm": 1.484375, "learning_rate": 4.7464343992777175e-06, "loss": 0.024, "num_tokens": 82883862.0, "step": 824 }, { "epoch": 1.8805246649558027, "grad_norm": 1.3359375, "learning_rate": 4.74560726208851e-06, "loss": 0.0258, "num_tokens": 82984449.0, "step": 825 }, { "epoch": 1.8828058169375534, "grad_norm": 1.5546875, "learning_rate": 4.744778850335974e-06, "loss": 0.0288, "num_tokens": 83085008.0, "step": 826 }, { "epoch": 1.8850869689193042, "grad_norm": 1.2578125, "learning_rate": 4.7439491644903e-06, "loss": 0.0252, "num_tokens": 83185756.0, "step": 827 }, { "epoch": 1.8873681209010549, "grad_norm": 1.453125, "learning_rate": 4.743118205022402e-06, "loss": 0.0289, "num_tokens": 83286414.0, "step": 828 }, { "epoch": 1.8896492728828058, "grad_norm": 1.6640625, "learning_rate": 4.742285972403915e-06, "loss": 0.0318, "num_tokens": 83387029.0, "step": 829 }, { "epoch": 1.8919304248645568, "grad_norm": 1.5703125, "learning_rate": 4.7414524671071995e-06, "loss": 0.0262, "num_tokens": 83487973.0, "step": 830 }, { "epoch": 1.8942115768463075, "grad_norm": 1.53125, "learning_rate": 4.7406176896053356e-06, "loss": 0.0214, "num_tokens": 83588579.0, "step": 831 }, { "epoch": 1.8964927288280582, "grad_norm": 1.6953125, "learning_rate": 4.739781640372129e-06, "loss": 0.032, "num_tokens": 83689373.0, "step": 832 }, { "epoch": 1.898773880809809, "grad_norm": 1.640625, "learning_rate": 4.7389443198821035e-06, "loss": 0.0295, "num_tokens": 83790086.0, "step": 833 }, { "epoch": 1.9010550327915596, "grad_norm": 1.78125, "learning_rate": 4.738105728610507e-06, "loss": 0.0329, "num_tokens": 83890691.0, "step": 834 }, { "epoch": 1.9033361847733106, "grad_norm": 1.375, "learning_rate": 4.737265867033307e-06, "loss": 0.0314, "num_tokens": 83991138.0, "step": 835 }, { "epoch": 1.9056173367550613, "grad_norm": 1.5625, "learning_rate": 4.736424735627193e-06, "loss": 0.0301, "num_tokens": 84091137.0, "step": 836 }, { "epoch": 1.9078984887368122, "grad_norm": 1.9140625, "learning_rate": 4.735582334869575e-06, "loss": 0.0395, "num_tokens": 84191778.0, "step": 837 }, { "epoch": 1.910179640718563, "grad_norm": 1.578125, "learning_rate": 4.734738665238583e-06, "loss": 0.0241, "num_tokens": 84292672.0, "step": 838 }, { "epoch": 1.9124607927003137, "grad_norm": 1.828125, "learning_rate": 4.733893727213068e-06, "loss": 0.0293, "num_tokens": 84393353.0, "step": 839 }, { "epoch": 1.9147419446820644, "grad_norm": 1.5390625, "learning_rate": 4.7330475212726e-06, "loss": 0.0312, "num_tokens": 84494679.0, "step": 840 }, { "epoch": 1.917023096663815, "grad_norm": 1.59375, "learning_rate": 4.73220004789747e-06, "loss": 0.0346, "num_tokens": 84594986.0, "step": 841 }, { "epoch": 1.919304248645566, "grad_norm": 1.375, "learning_rate": 4.7313513075686875e-06, "loss": 0.0295, "num_tokens": 84695649.0, "step": 842 }, { "epoch": 1.9215854006273168, "grad_norm": 1.6015625, "learning_rate": 4.73050130076798e-06, "loss": 0.026, "num_tokens": 84796608.0, "step": 843 }, { "epoch": 1.9238665526090677, "grad_norm": 1.5546875, "learning_rate": 4.729650027977797e-06, "loss": 0.0294, "num_tokens": 84897517.0, "step": 844 }, { "epoch": 1.9261477045908184, "grad_norm": 1.484375, "learning_rate": 4.728797489681302e-06, "loss": 0.0362, "num_tokens": 84998006.0, "step": 845 }, { "epoch": 1.9284288565725691, "grad_norm": 1.5, "learning_rate": 4.7279436863623805e-06, "loss": 0.0222, "num_tokens": 85099229.0, "step": 846 }, { "epoch": 1.9307100085543198, "grad_norm": 1.5859375, "learning_rate": 4.7270886185056355e-06, "loss": 0.0259, "num_tokens": 85200220.0, "step": 847 }, { "epoch": 1.9329911605360706, "grad_norm": 1.9453125, "learning_rate": 4.726232286596385e-06, "loss": 0.0303, "num_tokens": 85301053.0, "step": 848 }, { "epoch": 1.9352723125178215, "grad_norm": 1.9140625, "learning_rate": 4.725374691120669e-06, "loss": 0.0441, "num_tokens": 85401439.0, "step": 849 }, { "epoch": 1.9375534644995724, "grad_norm": 1.6875, "learning_rate": 4.7245158325652396e-06, "loss": 0.033, "num_tokens": 85502220.0, "step": 850 }, { "epoch": 1.9398346164813232, "grad_norm": 1.75, "learning_rate": 4.7236557114175705e-06, "loss": 0.0408, "num_tokens": 85602723.0, "step": 851 }, { "epoch": 1.9421157684630739, "grad_norm": 1.703125, "learning_rate": 4.722794328165849e-06, "loss": 0.0298, "num_tokens": 85703336.0, "step": 852 }, { "epoch": 1.9443969204448246, "grad_norm": 1.6015625, "learning_rate": 4.721931683298979e-06, "loss": 0.025, "num_tokens": 85803687.0, "step": 853 }, { "epoch": 1.9466780724265753, "grad_norm": 1.25, "learning_rate": 4.721067777306582e-06, "loss": 0.0263, "num_tokens": 85904075.0, "step": 854 }, { "epoch": 1.9489592244083263, "grad_norm": 1.5234375, "learning_rate": 4.7202026106789935e-06, "loss": 0.0296, "num_tokens": 86004533.0, "step": 855 }, { "epoch": 1.951240376390077, "grad_norm": 1.5078125, "learning_rate": 4.719336183907266e-06, "loss": 0.0263, "num_tokens": 86105320.0, "step": 856 }, { "epoch": 1.953521528371828, "grad_norm": 1.4375, "learning_rate": 4.718468497483166e-06, "loss": 0.0276, "num_tokens": 86206074.0, "step": 857 }, { "epoch": 1.9558026803535786, "grad_norm": 2.46875, "learning_rate": 4.717599551899177e-06, "loss": 0.0365, "num_tokens": 86306660.0, "step": 858 }, { "epoch": 1.9580838323353293, "grad_norm": 1.5546875, "learning_rate": 4.716729347648494e-06, "loss": 0.0298, "num_tokens": 86407389.0, "step": 859 }, { "epoch": 1.96036498431708, "grad_norm": 1.390625, "learning_rate": 4.71585788522503e-06, "loss": 0.0269, "num_tokens": 86508342.0, "step": 860 }, { "epoch": 1.9626461362988308, "grad_norm": 1.4765625, "learning_rate": 4.7149851651234085e-06, "loss": 0.0251, "num_tokens": 86609005.0, "step": 861 }, { "epoch": 1.9649272882805817, "grad_norm": 2.296875, "learning_rate": 4.714111187838969e-06, "loss": 0.043, "num_tokens": 86710498.0, "step": 862 }, { "epoch": 1.9672084402623324, "grad_norm": 1.828125, "learning_rate": 4.713235953867764e-06, "loss": 0.0368, "num_tokens": 86810252.0, "step": 863 }, { "epoch": 1.9694895922440834, "grad_norm": 1.2265625, "learning_rate": 4.712359463706561e-06, "loss": 0.0235, "num_tokens": 86910623.0, "step": 864 }, { "epoch": 1.971770744225834, "grad_norm": 2.1875, "learning_rate": 4.711481717852837e-06, "loss": 0.0399, "num_tokens": 87011707.0, "step": 865 }, { "epoch": 1.9740518962075848, "grad_norm": 1.7265625, "learning_rate": 4.710602716804784e-06, "loss": 0.033, "num_tokens": 87112324.0, "step": 866 }, { "epoch": 1.9763330481893355, "grad_norm": 1.609375, "learning_rate": 4.709722461061307e-06, "loss": 0.0333, "num_tokens": 87212782.0, "step": 867 }, { "epoch": 1.9786142001710862, "grad_norm": 1.5078125, "learning_rate": 4.70884095112202e-06, "loss": 0.0301, "num_tokens": 87313465.0, "step": 868 }, { "epoch": 1.9808953521528372, "grad_norm": 1.6875, "learning_rate": 4.707958187487254e-06, "loss": 0.0304, "num_tokens": 87414325.0, "step": 869 }, { "epoch": 1.9831765041345881, "grad_norm": 1.4296875, "learning_rate": 4.707074170658046e-06, "loss": 0.0267, "num_tokens": 87514094.0, "step": 870 }, { "epoch": 1.9854576561163388, "grad_norm": 1.3125, "learning_rate": 4.706188901136148e-06, "loss": 0.0237, "num_tokens": 87615129.0, "step": 871 }, { "epoch": 1.9877388080980896, "grad_norm": 1.8046875, "learning_rate": 4.705302379424023e-06, "loss": 0.0396, "num_tokens": 87716093.0, "step": 872 }, { "epoch": 1.9900199600798403, "grad_norm": 1.4375, "learning_rate": 4.704414606024842e-06, "loss": 0.0289, "num_tokens": 87816117.0, "step": 873 }, { "epoch": 1.992301112061591, "grad_norm": 1.3671875, "learning_rate": 4.703525581442488e-06, "loss": 0.0243, "num_tokens": 87916689.0, "step": 874 }, { "epoch": 1.994582264043342, "grad_norm": 1.4375, "learning_rate": 4.702635306181554e-06, "loss": 0.029, "num_tokens": 88017203.0, "step": 875 }, { "epoch": 1.9968634160250927, "grad_norm": 1.5859375, "learning_rate": 4.701743780747345e-06, "loss": 0.0363, "num_tokens": 88118311.0, "step": 876 }, { "epoch": 1.9991445680068436, "grad_norm": 1.765625, "learning_rate": 4.700851005645872e-06, "loss": 0.0279, "num_tokens": 88218961.0, "step": 877 }, { "epoch": 2.0, "grad_norm": 2.1875, "learning_rate": 4.699956981383857e-06, "loss": 0.0232, "num_tokens": 88255744.0, "step": 878 }, { "epoch": 2.0022811519817507, "grad_norm": 1.3125, "learning_rate": 4.699061708468732e-06, "loss": 0.0281, "num_tokens": 88356076.0, "step": 879 }, { "epoch": 2.0045623039635014, "grad_norm": 1.2734375, "learning_rate": 4.698165187408635e-06, "loss": 0.026, "num_tokens": 88457293.0, "step": 880 }, { "epoch": 2.0045623039635014, "eval_loss": 0.03313034400343895, "eval_num_tokens": 88457293.0, "eval_runtime": 52.7309, "eval_samples_per_second": 159.015, "eval_steps_per_second": 4.988, "step": 880 }, { "epoch": 2.006843455945252, "grad_norm": 1.265625, "learning_rate": 4.697267418712415e-06, "loss": 0.0237, "num_tokens": 88557949.0, "step": 881 }, { "epoch": 2.0091246079270033, "grad_norm": 1.4609375, "learning_rate": 4.6963684028896285e-06, "loss": 0.03, "num_tokens": 88658734.0, "step": 882 }, { "epoch": 2.011405759908754, "grad_norm": 1.2265625, "learning_rate": 4.695468140450539e-06, "loss": 0.0228, "num_tokens": 88759481.0, "step": 883 }, { "epoch": 2.0136869118905047, "grad_norm": 1.8046875, "learning_rate": 4.6945666319061166e-06, "loss": 0.0238, "num_tokens": 88860223.0, "step": 884 }, { "epoch": 2.0159680638722555, "grad_norm": 1.6015625, "learning_rate": 4.6936638777680435e-06, "loss": 0.0386, "num_tokens": 88960628.0, "step": 885 }, { "epoch": 2.018249215854006, "grad_norm": 1.234375, "learning_rate": 4.6927598785487026e-06, "loss": 0.0212, "num_tokens": 89060788.0, "step": 886 }, { "epoch": 2.020530367835757, "grad_norm": 1.703125, "learning_rate": 4.691854634761188e-06, "loss": 0.0325, "num_tokens": 89162141.0, "step": 887 }, { "epoch": 2.022811519817508, "grad_norm": 1.34375, "learning_rate": 4.690948146919299e-06, "loss": 0.0245, "num_tokens": 89262834.0, "step": 888 }, { "epoch": 2.025092671799259, "grad_norm": 1.3203125, "learning_rate": 4.690040415537538e-06, "loss": 0.0234, "num_tokens": 89363581.0, "step": 889 }, { "epoch": 2.0273738237810095, "grad_norm": 1.75, "learning_rate": 4.689131441131119e-06, "loss": 0.0384, "num_tokens": 89464851.0, "step": 890 }, { "epoch": 2.02965497576276, "grad_norm": 1.46875, "learning_rate": 4.6882212242159555e-06, "loss": 0.0271, "num_tokens": 89565482.0, "step": 891 }, { "epoch": 2.031936127744511, "grad_norm": 1.1171875, "learning_rate": 4.687309765308671e-06, "loss": 0.0215, "num_tokens": 89666549.0, "step": 892 }, { "epoch": 2.0342172797262617, "grad_norm": 1.5, "learning_rate": 4.6863970649265914e-06, "loss": 0.0323, "num_tokens": 89767566.0, "step": 893 }, { "epoch": 2.0364984317080124, "grad_norm": 1.234375, "learning_rate": 4.685483123587748e-06, "loss": 0.0206, "num_tokens": 89868238.0, "step": 894 }, { "epoch": 2.0387795836897635, "grad_norm": 1.3515625, "learning_rate": 4.684567941810876e-06, "loss": 0.0226, "num_tokens": 89969093.0, "step": 895 }, { "epoch": 2.0410607356715142, "grad_norm": 1.375, "learning_rate": 4.683651520115414e-06, "loss": 0.0231, "num_tokens": 90070283.0, "step": 896 }, { "epoch": 2.043341887653265, "grad_norm": 1.265625, "learning_rate": 4.682733859021508e-06, "loss": 0.0216, "num_tokens": 90171029.0, "step": 897 }, { "epoch": 2.0456230396350157, "grad_norm": 1.2578125, "learning_rate": 4.681814959050002e-06, "loss": 0.0193, "num_tokens": 90272195.0, "step": 898 }, { "epoch": 2.0479041916167664, "grad_norm": 2.015625, "learning_rate": 4.680894820722446e-06, "loss": 0.0378, "num_tokens": 90373278.0, "step": 899 }, { "epoch": 2.050185343598517, "grad_norm": 1.5, "learning_rate": 4.679973444561095e-06, "loss": 0.027, "num_tokens": 90473725.0, "step": 900 }, { "epoch": 2.052466495580268, "grad_norm": 1.859375, "learning_rate": 4.679050831088902e-06, "loss": 0.0274, "num_tokens": 90574081.0, "step": 901 }, { "epoch": 2.054747647562019, "grad_norm": 1.34375, "learning_rate": 4.678126980829525e-06, "loss": 0.0229, "num_tokens": 90675051.0, "step": 902 }, { "epoch": 2.0570287995437697, "grad_norm": 1.6328125, "learning_rate": 4.677201894307325e-06, "loss": 0.0311, "num_tokens": 90775446.0, "step": 903 }, { "epoch": 2.0593099515255204, "grad_norm": 1.6328125, "learning_rate": 4.676275572047362e-06, "loss": 0.0319, "num_tokens": 90876064.0, "step": 904 }, { "epoch": 2.061591103507271, "grad_norm": 1.3125, "learning_rate": 4.675348014575399e-06, "loss": 0.0236, "num_tokens": 90976661.0, "step": 905 }, { "epoch": 2.063872255489022, "grad_norm": 1.7421875, "learning_rate": 4.674419222417899e-06, "loss": 0.0315, "num_tokens": 91077122.0, "step": 906 }, { "epoch": 2.0661534074707726, "grad_norm": 1.1015625, "learning_rate": 4.673489196102028e-06, "loss": 0.0179, "num_tokens": 91178366.0, "step": 907 }, { "epoch": 2.0684345594525233, "grad_norm": 1.3828125, "learning_rate": 4.67255793615565e-06, "loss": 0.0188, "num_tokens": 91278836.0, "step": 908 }, { "epoch": 2.0707157114342745, "grad_norm": 1.46875, "learning_rate": 4.67162544310733e-06, "loss": 0.0232, "num_tokens": 91379633.0, "step": 909 }, { "epoch": 2.072996863416025, "grad_norm": 1.2109375, "learning_rate": 4.670691717486333e-06, "loss": 0.022, "num_tokens": 91480695.0, "step": 910 }, { "epoch": 2.075278015397776, "grad_norm": 2.328125, "learning_rate": 4.669756759822625e-06, "loss": 0.0323, "num_tokens": 91581468.0, "step": 911 }, { "epoch": 2.0775591673795266, "grad_norm": 1.203125, "learning_rate": 4.668820570646868e-06, "loss": 0.0229, "num_tokens": 91681785.0, "step": 912 }, { "epoch": 2.0798403193612773, "grad_norm": 1.921875, "learning_rate": 4.667883150490427e-06, "loss": 0.0261, "num_tokens": 91781934.0, "step": 913 }, { "epoch": 2.082121471343028, "grad_norm": 1.84375, "learning_rate": 4.666944499885361e-06, "loss": 0.0363, "num_tokens": 91882561.0, "step": 914 }, { "epoch": 2.084402623324779, "grad_norm": 1.6484375, "learning_rate": 4.6660046193644315e-06, "loss": 0.0248, "num_tokens": 91983196.0, "step": 915 }, { "epoch": 2.08668377530653, "grad_norm": 1.3984375, "learning_rate": 4.665063509461098e-06, "loss": 0.031, "num_tokens": 92084162.0, "step": 916 }, { "epoch": 2.0889649272882806, "grad_norm": 1.703125, "learning_rate": 4.664121170709512e-06, "loss": 0.029, "num_tokens": 92185004.0, "step": 917 }, { "epoch": 2.0912460792700314, "grad_norm": 1.6015625, "learning_rate": 4.663177603644532e-06, "loss": 0.0247, "num_tokens": 92285586.0, "step": 918 }, { "epoch": 2.093527231251782, "grad_norm": 2.328125, "learning_rate": 4.662232808801704e-06, "loss": 0.0307, "num_tokens": 92386298.0, "step": 919 }, { "epoch": 2.095808383233533, "grad_norm": 1.3515625, "learning_rate": 4.661286786717278e-06, "loss": 0.0289, "num_tokens": 92487299.0, "step": 920 }, { "epoch": 2.0980895352152835, "grad_norm": 1.3125, "learning_rate": 4.660339537928198e-06, "loss": 0.0249, "num_tokens": 92587712.0, "step": 921 }, { "epoch": 2.1003706871970347, "grad_norm": 1.4140625, "learning_rate": 4.659391062972102e-06, "loss": 0.0295, "num_tokens": 92688748.0, "step": 922 }, { "epoch": 2.1026518391787854, "grad_norm": 1.4296875, "learning_rate": 4.658441362387328e-06, "loss": 0.0186, "num_tokens": 92789136.0, "step": 923 }, { "epoch": 2.104932991160536, "grad_norm": 1.390625, "learning_rate": 4.657490436712907e-06, "loss": 0.0265, "num_tokens": 92889285.0, "step": 924 }, { "epoch": 2.107214143142287, "grad_norm": 1.1796875, "learning_rate": 4.6565382864885665e-06, "loss": 0.0168, "num_tokens": 92990172.0, "step": 925 }, { "epoch": 2.1094952951240376, "grad_norm": 1.421875, "learning_rate": 4.655584912254727e-06, "loss": 0.0264, "num_tokens": 93090956.0, "step": 926 }, { "epoch": 2.1117764471057883, "grad_norm": 1.703125, "learning_rate": 4.654630314552508e-06, "loss": 0.0319, "num_tokens": 93191698.0, "step": 927 }, { "epoch": 2.1140575990875394, "grad_norm": 1.6875, "learning_rate": 4.653674493923718e-06, "loss": 0.0279, "num_tokens": 93292582.0, "step": 928 }, { "epoch": 2.11633875106929, "grad_norm": 1.5859375, "learning_rate": 4.652717450910864e-06, "loss": 0.0324, "num_tokens": 93393400.0, "step": 929 }, { "epoch": 2.118619903051041, "grad_norm": 1.484375, "learning_rate": 4.651759186057144e-06, "loss": 0.0293, "num_tokens": 93493862.0, "step": 930 }, { "epoch": 2.1209010550327916, "grad_norm": 1.5078125, "learning_rate": 4.650799699906452e-06, "loss": 0.0265, "num_tokens": 93594575.0, "step": 931 }, { "epoch": 2.1231822070145423, "grad_norm": 1.5390625, "learning_rate": 4.649838993003373e-06, "loss": 0.0278, "num_tokens": 93695443.0, "step": 932 }, { "epoch": 2.125463358996293, "grad_norm": 1.3359375, "learning_rate": 4.648877065893186e-06, "loss": 0.0216, "num_tokens": 93796155.0, "step": 933 }, { "epoch": 2.1277445109780437, "grad_norm": 1.1484375, "learning_rate": 4.647913919121861e-06, "loss": 0.0161, "num_tokens": 93897154.0, "step": 934 }, { "epoch": 2.130025662959795, "grad_norm": 1.234375, "learning_rate": 4.646949553236064e-06, "loss": 0.0236, "num_tokens": 93998020.0, "step": 935 }, { "epoch": 2.1323068149415456, "grad_norm": 2.171875, "learning_rate": 4.645983968783148e-06, "loss": 0.0304, "num_tokens": 94098528.0, "step": 936 }, { "epoch": 2.1345879669232963, "grad_norm": 1.890625, "learning_rate": 4.645017166311163e-06, "loss": 0.0367, "num_tokens": 94199923.0, "step": 937 }, { "epoch": 2.136869118905047, "grad_norm": 1.6640625, "learning_rate": 4.644049146368844e-06, "loss": 0.0335, "num_tokens": 94300060.0, "step": 938 }, { "epoch": 2.1391502708867978, "grad_norm": 1.484375, "learning_rate": 4.643079909505622e-06, "loss": 0.027, "num_tokens": 94400883.0, "step": 939 }, { "epoch": 2.1414314228685485, "grad_norm": 1.3125, "learning_rate": 4.642109456271618e-06, "loss": 0.023, "num_tokens": 94501635.0, "step": 940 }, { "epoch": 2.143712574850299, "grad_norm": 1.359375, "learning_rate": 4.64113778721764e-06, "loss": 0.0254, "num_tokens": 94602077.0, "step": 941 }, { "epoch": 2.1459937268320504, "grad_norm": 1.5625, "learning_rate": 4.640164902895192e-06, "loss": 0.0298, "num_tokens": 94703309.0, "step": 942 }, { "epoch": 2.148274878813801, "grad_norm": 1.40625, "learning_rate": 4.6391908038564615e-06, "loss": 0.0213, "num_tokens": 94804032.0, "step": 943 }, { "epoch": 2.150556030795552, "grad_norm": 1.8125, "learning_rate": 4.6382154906543295e-06, "loss": 0.0287, "num_tokens": 94904869.0, "step": 944 }, { "epoch": 2.1528371827773025, "grad_norm": 1.5390625, "learning_rate": 4.637238963842365e-06, "loss": 0.0293, "num_tokens": 95004903.0, "step": 945 }, { "epoch": 2.1551183347590532, "grad_norm": 1.28125, "learning_rate": 4.636261223974826e-06, "loss": 0.019, "num_tokens": 95105205.0, "step": 946 }, { "epoch": 2.157399486740804, "grad_norm": 1.3984375, "learning_rate": 4.635282271606658e-06, "loss": 0.0272, "num_tokens": 95205817.0, "step": 947 }, { "epoch": 2.1596806387225547, "grad_norm": 1.453125, "learning_rate": 4.634302107293497e-06, "loss": 0.0312, "num_tokens": 95305951.0, "step": 948 }, { "epoch": 2.161961790704306, "grad_norm": 1.4453125, "learning_rate": 4.633320731591663e-06, "loss": 0.0185, "num_tokens": 95406993.0, "step": 949 }, { "epoch": 2.1642429426860565, "grad_norm": 1.6015625, "learning_rate": 4.632338145058167e-06, "loss": 0.0284, "num_tokens": 95507813.0, "step": 950 }, { "epoch": 2.1665240946678073, "grad_norm": 1.7578125, "learning_rate": 4.631354348250706e-06, "loss": 0.0325, "num_tokens": 95608742.0, "step": 951 }, { "epoch": 2.168805246649558, "grad_norm": 1.3671875, "learning_rate": 4.630369341727665e-06, "loss": 0.0217, "num_tokens": 95709021.0, "step": 952 }, { "epoch": 2.1710863986313087, "grad_norm": 1.6328125, "learning_rate": 4.629383126048114e-06, "loss": 0.0249, "num_tokens": 95809722.0, "step": 953 }, { "epoch": 2.1733675506130594, "grad_norm": 2.078125, "learning_rate": 4.6283957017718105e-06, "loss": 0.026, "num_tokens": 95910656.0, "step": 954 }, { "epoch": 2.1756487025948106, "grad_norm": 1.46875, "learning_rate": 4.627407069459196e-06, "loss": 0.0227, "num_tokens": 96011123.0, "step": 955 }, { "epoch": 2.1779298545765613, "grad_norm": 1.65625, "learning_rate": 4.626417229671401e-06, "loss": 0.0297, "num_tokens": 96112484.0, "step": 956 }, { "epoch": 2.180211006558312, "grad_norm": 1.7421875, "learning_rate": 4.625426182970237e-06, "loss": 0.0299, "num_tokens": 96212933.0, "step": 957 }, { "epoch": 2.1824921585400627, "grad_norm": 1.8828125, "learning_rate": 4.6244339299182065e-06, "loss": 0.0299, "num_tokens": 96313383.0, "step": 958 }, { "epoch": 2.1847733105218134, "grad_norm": 1.7578125, "learning_rate": 4.62344047107849e-06, "loss": 0.0239, "num_tokens": 96414100.0, "step": 959 }, { "epoch": 2.187054462503564, "grad_norm": 1.4453125, "learning_rate": 4.622445807014956e-06, "loss": 0.031, "num_tokens": 96515335.0, "step": 960 }, { "epoch": 2.189335614485315, "grad_norm": 1.9453125, "learning_rate": 4.621449938292159e-06, "loss": 0.0413, "num_tokens": 96616661.0, "step": 961 }, { "epoch": 2.191616766467066, "grad_norm": 1.3984375, "learning_rate": 4.620452865475331e-06, "loss": 0.021, "num_tokens": 96717778.0, "step": 962 }, { "epoch": 2.1938979184488168, "grad_norm": 1.90625, "learning_rate": 4.6194545891303955e-06, "loss": 0.0347, "num_tokens": 96818363.0, "step": 963 }, { "epoch": 2.1961790704305675, "grad_norm": 1.765625, "learning_rate": 4.618455109823952e-06, "loss": 0.0293, "num_tokens": 96918838.0, "step": 964 }, { "epoch": 2.198460222412318, "grad_norm": 1.2421875, "learning_rate": 4.617454428123287e-06, "loss": 0.0209, "num_tokens": 97018981.0, "step": 965 }, { "epoch": 2.200741374394069, "grad_norm": 1.5390625, "learning_rate": 4.616452544596367e-06, "loss": 0.0245, "num_tokens": 97119948.0, "step": 966 }, { "epoch": 2.2030225263758196, "grad_norm": 1.5078125, "learning_rate": 4.615449459811843e-06, "loss": 0.0248, "num_tokens": 97220531.0, "step": 967 }, { "epoch": 2.205303678357571, "grad_norm": 1.65625, "learning_rate": 4.614445174339045e-06, "loss": 0.0248, "num_tokens": 97321117.0, "step": 968 }, { "epoch": 2.2075848303393215, "grad_norm": 1.59375, "learning_rate": 4.613439688747988e-06, "loss": 0.0296, "num_tokens": 97422108.0, "step": 969 }, { "epoch": 2.2098659823210722, "grad_norm": 1.78125, "learning_rate": 4.612433003609365e-06, "loss": 0.0262, "num_tokens": 97522376.0, "step": 970 }, { "epoch": 2.212147134302823, "grad_norm": 1.8203125, "learning_rate": 4.611425119494552e-06, "loss": 0.0316, "num_tokens": 97623327.0, "step": 971 }, { "epoch": 2.2144282862845737, "grad_norm": 1.7578125, "learning_rate": 4.6104160369756025e-06, "loss": 0.0346, "num_tokens": 97724685.0, "step": 972 }, { "epoch": 2.2167094382663244, "grad_norm": 1.2265625, "learning_rate": 4.609405756625254e-06, "loss": 0.0196, "num_tokens": 97825509.0, "step": 973 }, { "epoch": 2.218990590248075, "grad_norm": 2.25, "learning_rate": 4.608394279016921e-06, "loss": 0.0286, "num_tokens": 97926260.0, "step": 974 }, { "epoch": 2.2212717422298263, "grad_norm": 1.6484375, "learning_rate": 4.6073816047247e-06, "loss": 0.0285, "num_tokens": 98027406.0, "step": 975 }, { "epoch": 2.223552894211577, "grad_norm": 2.21875, "learning_rate": 4.606367734323365e-06, "loss": 0.0238, "num_tokens": 98128224.0, "step": 976 }, { "epoch": 2.2258340461933277, "grad_norm": 1.703125, "learning_rate": 4.605352668388369e-06, "loss": 0.0243, "num_tokens": 98229513.0, "step": 977 }, { "epoch": 2.2281151981750784, "grad_norm": 1.8125, "learning_rate": 4.6043364074958435e-06, "loss": 0.0286, "num_tokens": 98330458.0, "step": 978 }, { "epoch": 2.230396350156829, "grad_norm": 1.875, "learning_rate": 4.6033189522226e-06, "loss": 0.0325, "num_tokens": 98430919.0, "step": 979 }, { "epoch": 2.23267750213858, "grad_norm": 1.4296875, "learning_rate": 4.602300303146123e-06, "loss": 0.0244, "num_tokens": 98532044.0, "step": 980 }, { "epoch": 2.2349586541203306, "grad_norm": 1.4296875, "learning_rate": 4.601280460844583e-06, "loss": 0.021, "num_tokens": 98632374.0, "step": 981 }, { "epoch": 2.2372398061020817, "grad_norm": 1.6171875, "learning_rate": 4.6002594258968185e-06, "loss": 0.0293, "num_tokens": 98732785.0, "step": 982 }, { "epoch": 2.2395209580838324, "grad_norm": 1.3671875, "learning_rate": 4.599237198882351e-06, "loss": 0.026, "num_tokens": 98833228.0, "step": 983 }, { "epoch": 2.241802110065583, "grad_norm": 1.4296875, "learning_rate": 4.598213780381377e-06, "loss": 0.0213, "num_tokens": 98933533.0, "step": 984 }, { "epoch": 2.244083262047334, "grad_norm": 1.375, "learning_rate": 4.59718917097477e-06, "loss": 0.0217, "num_tokens": 99034100.0, "step": 985 }, { "epoch": 2.2463644140290846, "grad_norm": 1.609375, "learning_rate": 4.596163371244076e-06, "loss": 0.0339, "num_tokens": 99134632.0, "step": 986 }, { "epoch": 2.2486455660108353, "grad_norm": 1.4140625, "learning_rate": 4.595136381771521e-06, "loss": 0.0199, "num_tokens": 99234743.0, "step": 987 }, { "epoch": 2.250926717992586, "grad_norm": 1.4921875, "learning_rate": 4.594108203140004e-06, "loss": 0.0228, "num_tokens": 99335647.0, "step": 988 }, { "epoch": 2.253207869974337, "grad_norm": 1.3515625, "learning_rate": 4.593078835933099e-06, "loss": 0.0265, "num_tokens": 99436541.0, "step": 989 }, { "epoch": 2.255489021956088, "grad_norm": 1.625, "learning_rate": 4.592048280735055e-06, "loss": 0.0272, "num_tokens": 99537358.0, "step": 990 }, { "epoch": 2.2577701739378386, "grad_norm": 1.203125, "learning_rate": 4.591016538130796e-06, "loss": 0.0245, "num_tokens": 99637414.0, "step": 991 }, { "epoch": 2.2600513259195893, "grad_norm": 1.484375, "learning_rate": 4.589983608705918e-06, "loss": 0.0232, "num_tokens": 99737721.0, "step": 992 }, { "epoch": 2.26233247790134, "grad_norm": 1.9375, "learning_rate": 4.588949493046693e-06, "loss": 0.0348, "num_tokens": 99838401.0, "step": 993 }, { "epoch": 2.264613629883091, "grad_norm": 1.7109375, "learning_rate": 4.587914191740064e-06, "loss": 0.0268, "num_tokens": 99938543.0, "step": 994 }, { "epoch": 2.266894781864842, "grad_norm": 1.3828125, "learning_rate": 4.586877705373648e-06, "loss": 0.0258, "num_tokens": 100039235.0, "step": 995 }, { "epoch": 2.2691759338465927, "grad_norm": 1.40625, "learning_rate": 4.585840034535736e-06, "loss": 0.0271, "num_tokens": 100140755.0, "step": 996 }, { "epoch": 2.2714570858283434, "grad_norm": 1.5390625, "learning_rate": 4.584801179815289e-06, "loss": 0.025, "num_tokens": 100241485.0, "step": 997 }, { "epoch": 2.273738237810094, "grad_norm": 1.5390625, "learning_rate": 4.583761141801941e-06, "loss": 0.0236, "num_tokens": 100341940.0, "step": 998 }, { "epoch": 2.276019389791845, "grad_norm": 1.421875, "learning_rate": 4.5827199210859975e-06, "loss": 0.0277, "num_tokens": 100442650.0, "step": 999 }, { "epoch": 2.2783005417735955, "grad_norm": 1.8046875, "learning_rate": 4.581677518258435e-06, "loss": 0.0382, "num_tokens": 100543359.0, "step": 1000 }, { "epoch": 2.2805816937553463, "grad_norm": 1.265625, "learning_rate": 4.580633933910901e-06, "loss": 0.0197, "num_tokens": 100644088.0, "step": 1001 }, { "epoch": 2.2828628457370974, "grad_norm": 1.375, "learning_rate": 4.579589168635715e-06, "loss": 0.03, "num_tokens": 100744174.0, "step": 1002 }, { "epoch": 2.285143997718848, "grad_norm": 1.9609375, "learning_rate": 4.578543223025865e-06, "loss": 0.0308, "num_tokens": 100844469.0, "step": 1003 }, { "epoch": 2.287425149700599, "grad_norm": 1.5625, "learning_rate": 4.577496097675009e-06, "loss": 0.0271, "num_tokens": 100944907.0, "step": 1004 }, { "epoch": 2.2897063016823496, "grad_norm": 1.5703125, "learning_rate": 4.576447793177476e-06, "loss": 0.0239, "num_tokens": 101045173.0, "step": 1005 }, { "epoch": 2.2919874536641003, "grad_norm": 1.6875, "learning_rate": 4.575398310128263e-06, "loss": 0.0257, "num_tokens": 101146144.0, "step": 1006 }, { "epoch": 2.294268605645851, "grad_norm": 1.859375, "learning_rate": 4.574347649123036e-06, "loss": 0.0447, "num_tokens": 101246777.0, "step": 1007 }, { "epoch": 2.296549757627602, "grad_norm": 1.7265625, "learning_rate": 4.57329581075813e-06, "loss": 0.0328, "num_tokens": 101347455.0, "step": 1008 }, { "epoch": 2.298830909609353, "grad_norm": 1.21875, "learning_rate": 4.572242795630549e-06, "loss": 0.0226, "num_tokens": 101448340.0, "step": 1009 }, { "epoch": 2.3011120615911036, "grad_norm": 1.46875, "learning_rate": 4.571188604337963e-06, "loss": 0.0277, "num_tokens": 101549264.0, "step": 1010 }, { "epoch": 2.3033932135728543, "grad_norm": 1.46875, "learning_rate": 4.570133237478711e-06, "loss": 0.024, "num_tokens": 101649676.0, "step": 1011 }, { "epoch": 2.305674365554605, "grad_norm": 1.4296875, "learning_rate": 4.5690766956517985e-06, "loss": 0.0258, "num_tokens": 101750088.0, "step": 1012 }, { "epoch": 2.3079555175363557, "grad_norm": 1.484375, "learning_rate": 4.568018979456899e-06, "loss": 0.0273, "num_tokens": 101850891.0, "step": 1013 }, { "epoch": 2.3102366695181065, "grad_norm": 1.625, "learning_rate": 4.566960089494351e-06, "loss": 0.0269, "num_tokens": 101951750.0, "step": 1014 }, { "epoch": 2.312517821499857, "grad_norm": 1.328125, "learning_rate": 4.5659000263651615e-06, "loss": 0.0224, "num_tokens": 102052419.0, "step": 1015 }, { "epoch": 2.3147989734816083, "grad_norm": 1.6953125, "learning_rate": 4.564838790671e-06, "loss": 0.026, "num_tokens": 102153382.0, "step": 1016 }, { "epoch": 2.317080125463359, "grad_norm": 1.3671875, "learning_rate": 4.5637763830142046e-06, "loss": 0.025, "num_tokens": 102254125.0, "step": 1017 }, { "epoch": 2.31936127744511, "grad_norm": 1.5, "learning_rate": 4.562712803997776e-06, "loss": 0.0251, "num_tokens": 102354617.0, "step": 1018 }, { "epoch": 2.3216424294268605, "grad_norm": 1.2890625, "learning_rate": 4.5616480542253825e-06, "loss": 0.0169, "num_tokens": 102454799.0, "step": 1019 }, { "epoch": 2.323923581408611, "grad_norm": 1.3203125, "learning_rate": 4.5605821343013555e-06, "loss": 0.0274, "num_tokens": 102555777.0, "step": 1020 }, { "epoch": 2.3262047333903624, "grad_norm": 1.4140625, "learning_rate": 4.55951504483069e-06, "loss": 0.0258, "num_tokens": 102656645.0, "step": 1021 }, { "epoch": 2.328485885372113, "grad_norm": 1.640625, "learning_rate": 4.558446786419045e-06, "loss": 0.0271, "num_tokens": 102756788.0, "step": 1022 }, { "epoch": 2.330767037353864, "grad_norm": 1.25, "learning_rate": 4.557377359672745e-06, "loss": 0.0201, "num_tokens": 102856924.0, "step": 1023 }, { "epoch": 2.3330481893356145, "grad_norm": 1.6953125, "learning_rate": 4.556306765198775e-06, "loss": 0.0251, "num_tokens": 102957675.0, "step": 1024 }, { "epoch": 2.3353293413173652, "grad_norm": 1.46875, "learning_rate": 4.555235003604782e-06, "loss": 0.0303, "num_tokens": 103058121.0, "step": 1025 }, { "epoch": 2.337610493299116, "grad_norm": 1.5703125, "learning_rate": 4.55416207549908e-06, "loss": 0.0235, "num_tokens": 103158647.0, "step": 1026 }, { "epoch": 2.3398916452808667, "grad_norm": 1.65625, "learning_rate": 4.5530879814906404e-06, "loss": 0.0284, "num_tokens": 103259100.0, "step": 1027 }, { "epoch": 2.3421727972626174, "grad_norm": 1.671875, "learning_rate": 4.5520127221891e-06, "loss": 0.0302, "num_tokens": 103360018.0, "step": 1028 }, { "epoch": 2.3444539492443686, "grad_norm": 1.4375, "learning_rate": 4.5509362982047525e-06, "loss": 0.0265, "num_tokens": 103460374.0, "step": 1029 }, { "epoch": 2.3467351012261193, "grad_norm": 1.90625, "learning_rate": 4.549858710148558e-06, "loss": 0.0342, "num_tokens": 103561496.0, "step": 1030 }, { "epoch": 2.34901625320787, "grad_norm": 1.4765625, "learning_rate": 4.548779958632134e-06, "loss": 0.0242, "num_tokens": 103662286.0, "step": 1031 }, { "epoch": 2.3512974051896207, "grad_norm": 1.671875, "learning_rate": 4.5477000442677575e-06, "loss": 0.0208, "num_tokens": 103762798.0, "step": 1032 }, { "epoch": 2.3535785571713714, "grad_norm": 1.046875, "learning_rate": 4.546618967668369e-06, "loss": 0.0194, "num_tokens": 103863459.0, "step": 1033 }, { "epoch": 2.355859709153122, "grad_norm": 1.515625, "learning_rate": 4.545536729447566e-06, "loss": 0.0283, "num_tokens": 103964505.0, "step": 1034 }, { "epoch": 2.3581408611348733, "grad_norm": 1.625, "learning_rate": 4.544453330219606e-06, "loss": 0.0266, "num_tokens": 104065283.0, "step": 1035 }, { "epoch": 2.360422013116624, "grad_norm": 1.5625, "learning_rate": 4.543368770599406e-06, "loss": 0.0284, "num_tokens": 104166187.0, "step": 1036 }, { "epoch": 2.3627031650983747, "grad_norm": 1.4609375, "learning_rate": 4.542283051202539e-06, "loss": 0.024, "num_tokens": 104266610.0, "step": 1037 }, { "epoch": 2.3649843170801255, "grad_norm": 1.609375, "learning_rate": 4.541196172645242e-06, "loss": 0.0229, "num_tokens": 104367521.0, "step": 1038 }, { "epoch": 2.367265469061876, "grad_norm": 2.0, "learning_rate": 4.540108135544403e-06, "loss": 0.0335, "num_tokens": 104468307.0, "step": 1039 }, { "epoch": 2.369546621043627, "grad_norm": 1.640625, "learning_rate": 4.5390189405175725e-06, "loss": 0.0237, "num_tokens": 104568652.0, "step": 1040 }, { "epoch": 2.3718277730253776, "grad_norm": 1.953125, "learning_rate": 4.537928588182955e-06, "loss": 0.0324, "num_tokens": 104669285.0, "step": 1041 }, { "epoch": 2.374108925007129, "grad_norm": 1.4921875, "learning_rate": 4.536837079159416e-06, "loss": 0.0246, "num_tokens": 104769766.0, "step": 1042 }, { "epoch": 2.3763900769888795, "grad_norm": 1.71875, "learning_rate": 4.535744414066473e-06, "loss": 0.0274, "num_tokens": 104870436.0, "step": 1043 }, { "epoch": 2.37867122897063, "grad_norm": 1.6171875, "learning_rate": 4.534650593524302e-06, "loss": 0.0283, "num_tokens": 104970763.0, "step": 1044 }, { "epoch": 2.380952380952381, "grad_norm": 1.4296875, "learning_rate": 4.533555618153735e-06, "loss": 0.0222, "num_tokens": 105071311.0, "step": 1045 }, { "epoch": 2.3832335329341316, "grad_norm": 1.375, "learning_rate": 4.532459488576258e-06, "loss": 0.024, "num_tokens": 105171551.0, "step": 1046 }, { "epoch": 2.3855146849158824, "grad_norm": 1.25, "learning_rate": 4.531362205414013e-06, "loss": 0.0194, "num_tokens": 105271773.0, "step": 1047 }, { "epoch": 2.3877958368976335, "grad_norm": 1.8828125, "learning_rate": 4.530263769289798e-06, "loss": 0.0265, "num_tokens": 105372914.0, "step": 1048 }, { "epoch": 2.3900769888793842, "grad_norm": 1.921875, "learning_rate": 4.529164180827063e-06, "loss": 0.031, "num_tokens": 105473011.0, "step": 1049 }, { "epoch": 2.392358140861135, "grad_norm": 1.3515625, "learning_rate": 4.528063440649913e-06, "loss": 0.0238, "num_tokens": 105573807.0, "step": 1050 }, { "epoch": 2.3946392928428857, "grad_norm": 1.53125, "learning_rate": 4.526961549383109e-06, "loss": 0.0264, "num_tokens": 105674360.0, "step": 1051 }, { "epoch": 2.3969204448246364, "grad_norm": 1.5234375, "learning_rate": 4.52585850765206e-06, "loss": 0.0303, "num_tokens": 105774899.0, "step": 1052 }, { "epoch": 2.399201596806387, "grad_norm": 1.6015625, "learning_rate": 4.524754316082833e-06, "loss": 0.0221, "num_tokens": 105875376.0, "step": 1053 }, { "epoch": 2.401482748788138, "grad_norm": 1.40625, "learning_rate": 4.5236489753021465e-06, "loss": 0.0256, "num_tokens": 105975809.0, "step": 1054 }, { "epoch": 2.4037639007698886, "grad_norm": 1.703125, "learning_rate": 4.522542485937369e-06, "loss": 0.0222, "num_tokens": 106076419.0, "step": 1055 }, { "epoch": 2.4060450527516397, "grad_norm": 1.3359375, "learning_rate": 4.521434848616523e-06, "loss": 0.0177, "num_tokens": 106177326.0, "step": 1056 }, { "epoch": 2.4083262047333904, "grad_norm": 1.6953125, "learning_rate": 4.520326063968283e-06, "loss": 0.0321, "num_tokens": 106277534.0, "step": 1057 }, { "epoch": 2.410607356715141, "grad_norm": 1.65625, "learning_rate": 4.5192161326219716e-06, "loss": 0.0213, "num_tokens": 106378309.0, "step": 1058 }, { "epoch": 2.412888508696892, "grad_norm": 1.4296875, "learning_rate": 4.5181050552075665e-06, "loss": 0.0287, "num_tokens": 106478903.0, "step": 1059 }, { "epoch": 2.4151696606786426, "grad_norm": 1.484375, "learning_rate": 4.516992832355694e-06, "loss": 0.0237, "num_tokens": 106578912.0, "step": 1060 }, { "epoch": 2.4174508126603937, "grad_norm": 1.59375, "learning_rate": 4.515879464697629e-06, "loss": 0.0233, "num_tokens": 106679974.0, "step": 1061 }, { "epoch": 2.4197319646421445, "grad_norm": 1.84375, "learning_rate": 4.514764952865297e-06, "loss": 0.0358, "num_tokens": 106780331.0, "step": 1062 }, { "epoch": 2.422013116623895, "grad_norm": 2.234375, "learning_rate": 4.513649297491275e-06, "loss": 0.0382, "num_tokens": 106880598.0, "step": 1063 }, { "epoch": 2.424294268605646, "grad_norm": 1.671875, "learning_rate": 4.512532499208787e-06, "loss": 0.0282, "num_tokens": 106981578.0, "step": 1064 }, { "epoch": 2.4265754205873966, "grad_norm": 1.25, "learning_rate": 4.511414558651706e-06, "loss": 0.0189, "num_tokens": 107082875.0, "step": 1065 }, { "epoch": 2.4288565725691473, "grad_norm": 1.3046875, "learning_rate": 4.5102954764545525e-06, "loss": 0.0211, "num_tokens": 107183482.0, "step": 1066 }, { "epoch": 2.431137724550898, "grad_norm": 1.3359375, "learning_rate": 4.509175253252497e-06, "loss": 0.0217, "num_tokens": 107283843.0, "step": 1067 }, { "epoch": 2.4334188765326488, "grad_norm": 1.34375, "learning_rate": 4.508053889681357e-06, "loss": 0.0202, "num_tokens": 107384381.0, "step": 1068 }, { "epoch": 2.4357000285144, "grad_norm": 1.6171875, "learning_rate": 4.5069313863775956e-06, "loss": 0.0228, "num_tokens": 107484715.0, "step": 1069 }, { "epoch": 2.4379811804961506, "grad_norm": 1.484375, "learning_rate": 4.505807743978325e-06, "loss": 0.0232, "num_tokens": 107585825.0, "step": 1070 }, { "epoch": 2.4402623324779014, "grad_norm": 1.84375, "learning_rate": 4.5046829631213014e-06, "loss": 0.0341, "num_tokens": 107686227.0, "step": 1071 }, { "epoch": 2.442543484459652, "grad_norm": 1.609375, "learning_rate": 4.503557044444931e-06, "loss": 0.0317, "num_tokens": 107786424.0, "step": 1072 }, { "epoch": 2.444824636441403, "grad_norm": 1.5703125, "learning_rate": 4.502429988588263e-06, "loss": 0.0254, "num_tokens": 107887665.0, "step": 1073 }, { "epoch": 2.4471057884231535, "grad_norm": 1.546875, "learning_rate": 4.50130179619099e-06, "loss": 0.0272, "num_tokens": 107988251.0, "step": 1074 }, { "epoch": 2.4493869404049047, "grad_norm": 1.3125, "learning_rate": 4.500172467893455e-06, "loss": 0.0229, "num_tokens": 108088584.0, "step": 1075 }, { "epoch": 2.4516680923866554, "grad_norm": 1.5625, "learning_rate": 4.499042004336642e-06, "loss": 0.029, "num_tokens": 108189105.0, "step": 1076 }, { "epoch": 2.453949244368406, "grad_norm": 1.546875, "learning_rate": 4.497910406162182e-06, "loss": 0.0275, "num_tokens": 108289486.0, "step": 1077 }, { "epoch": 2.456230396350157, "grad_norm": 1.7890625, "learning_rate": 4.496777674012345e-06, "loss": 0.0298, "num_tokens": 108389820.0, "step": 1078 }, { "epoch": 2.4585115483319075, "grad_norm": 1.484375, "learning_rate": 4.495643808530049e-06, "loss": 0.027, "num_tokens": 108490380.0, "step": 1079 }, { "epoch": 2.4607927003136583, "grad_norm": 1.4921875, "learning_rate": 4.494508810358855e-06, "loss": 0.0219, "num_tokens": 108591077.0, "step": 1080 }, { "epoch": 2.463073852295409, "grad_norm": 1.765625, "learning_rate": 4.4933726801429665e-06, "loss": 0.0328, "num_tokens": 108692033.0, "step": 1081 }, { "epoch": 2.46535500427716, "grad_norm": 1.5078125, "learning_rate": 4.4922354185272275e-06, "loss": 0.0174, "num_tokens": 108792360.0, "step": 1082 }, { "epoch": 2.467636156258911, "grad_norm": 1.6328125, "learning_rate": 4.491097026157127e-06, "loss": 0.0239, "num_tokens": 108892954.0, "step": 1083 }, { "epoch": 2.4699173082406616, "grad_norm": 1.921875, "learning_rate": 4.489957503678794e-06, "loss": 0.0283, "num_tokens": 108993961.0, "step": 1084 }, { "epoch": 2.4721984602224123, "grad_norm": 1.25, "learning_rate": 4.488816851738999e-06, "loss": 0.0154, "num_tokens": 109094058.0, "step": 1085 }, { "epoch": 2.474479612204163, "grad_norm": 1.7265625, "learning_rate": 4.487675070985156e-06, "loss": 0.035, "num_tokens": 109194681.0, "step": 1086 }, { "epoch": 2.4767607641859137, "grad_norm": 1.5234375, "learning_rate": 4.4865321620653144e-06, "loss": 0.0324, "num_tokens": 109295180.0, "step": 1087 }, { "epoch": 2.479041916167665, "grad_norm": 1.7109375, "learning_rate": 4.485388125628171e-06, "loss": 0.0309, "num_tokens": 109395808.0, "step": 1088 }, { "epoch": 2.4813230681494156, "grad_norm": 1.40625, "learning_rate": 4.484242962323056e-06, "loss": 0.0235, "num_tokens": 109496080.0, "step": 1089 }, { "epoch": 2.4836042201311663, "grad_norm": 1.6015625, "learning_rate": 4.483096672799942e-06, "loss": 0.0286, "num_tokens": 109596653.0, "step": 1090 }, { "epoch": 2.485885372112917, "grad_norm": 1.0859375, "learning_rate": 4.481949257709442e-06, "loss": 0.0222, "num_tokens": 109697388.0, "step": 1091 }, { "epoch": 2.4881665240946678, "grad_norm": 1.875, "learning_rate": 4.480800717702807e-06, "loss": 0.0292, "num_tokens": 109798803.0, "step": 1092 }, { "epoch": 2.4904476760764185, "grad_norm": 1.28125, "learning_rate": 4.479651053431926e-06, "loss": 0.0207, "num_tokens": 109899332.0, "step": 1093 }, { "epoch": 2.492728828058169, "grad_norm": 1.375, "learning_rate": 4.4785002655493246e-06, "loss": 0.021, "num_tokens": 110000090.0, "step": 1094 }, { "epoch": 2.49500998003992, "grad_norm": 1.359375, "learning_rate": 4.477348354708169e-06, "loss": 0.0258, "num_tokens": 110100934.0, "step": 1095 }, { "epoch": 2.497291132021671, "grad_norm": 1.484375, "learning_rate": 4.476195321562262e-06, "loss": 0.0233, "num_tokens": 110201248.0, "step": 1096 }, { "epoch": 2.499572284003422, "grad_norm": 1.7421875, "learning_rate": 4.475041166766042e-06, "loss": 0.032, "num_tokens": 110301751.0, "step": 1097 }, { "epoch": 2.5018534359851725, "grad_norm": 1.6015625, "learning_rate": 4.473885890974586e-06, "loss": 0.033, "num_tokens": 110402936.0, "step": 1098 }, { "epoch": 2.5041345879669232, "grad_norm": 1.5078125, "learning_rate": 4.472729494843605e-06, "loss": 0.0232, "num_tokens": 110503550.0, "step": 1099 }, { "epoch": 2.506415739948674, "grad_norm": 1.296875, "learning_rate": 4.471571979029448e-06, "loss": 0.0287, "num_tokens": 110604451.0, "step": 1100 }, { "epoch": 2.506415739948674, "eval_loss": 0.03224305808544159, "eval_num_tokens": 110604451.0, "eval_runtime": 52.6563, "eval_samples_per_second": 159.24, "eval_steps_per_second": 4.995, "step": 1100 }, { "epoch": 2.508696891930425, "grad_norm": 1.140625, "learning_rate": 4.470413344189098e-06, "loss": 0.0203, "num_tokens": 110704593.0, "step": 1101 }, { "epoch": 2.510978043912176, "grad_norm": 1.7890625, "learning_rate": 4.469253590980175e-06, "loss": 0.0305, "num_tokens": 110805161.0, "step": 1102 }, { "epoch": 2.5132591958939265, "grad_norm": 1.5078125, "learning_rate": 4.46809272006093e-06, "loss": 0.0199, "num_tokens": 110905656.0, "step": 1103 }, { "epoch": 2.5155403478756773, "grad_norm": 1.1796875, "learning_rate": 4.466930732090254e-06, "loss": 0.0162, "num_tokens": 111005908.0, "step": 1104 }, { "epoch": 2.517821499857428, "grad_norm": 1.4765625, "learning_rate": 4.465767627727668e-06, "loss": 0.0231, "num_tokens": 111107122.0, "step": 1105 }, { "epoch": 2.5201026518391787, "grad_norm": 1.8515625, "learning_rate": 4.464603407633326e-06, "loss": 0.0306, "num_tokens": 111207650.0, "step": 1106 }, { "epoch": 2.5223838038209294, "grad_norm": 2.046875, "learning_rate": 4.463438072468018e-06, "loss": 0.0335, "num_tokens": 111307762.0, "step": 1107 }, { "epoch": 2.52466495580268, "grad_norm": 1.546875, "learning_rate": 4.462271622893166e-06, "loss": 0.025, "num_tokens": 111408286.0, "step": 1108 }, { "epoch": 2.5269461077844313, "grad_norm": 1.6640625, "learning_rate": 4.461104059570825e-06, "loss": 0.0317, "num_tokens": 111508858.0, "step": 1109 }, { "epoch": 2.529227259766182, "grad_norm": 1.6171875, "learning_rate": 4.4599353831636785e-06, "loss": 0.0312, "num_tokens": 111609849.0, "step": 1110 }, { "epoch": 2.5315084117479327, "grad_norm": 1.75, "learning_rate": 4.458765594335048e-06, "loss": 0.0327, "num_tokens": 111710330.0, "step": 1111 }, { "epoch": 2.5337895637296834, "grad_norm": 1.7265625, "learning_rate": 4.457594693748881e-06, "loss": 0.0243, "num_tokens": 111810799.0, "step": 1112 }, { "epoch": 2.536070715711434, "grad_norm": 1.4140625, "learning_rate": 4.456422682069758e-06, "loss": 0.0231, "num_tokens": 111911416.0, "step": 1113 }, { "epoch": 2.5383518676931853, "grad_norm": 1.9453125, "learning_rate": 4.455249559962892e-06, "loss": 0.0324, "num_tokens": 112011846.0, "step": 1114 }, { "epoch": 2.540633019674936, "grad_norm": 1.3671875, "learning_rate": 4.454075328094123e-06, "loss": 0.025, "num_tokens": 112113095.0, "step": 1115 }, { "epoch": 2.5429141716566868, "grad_norm": 1.5, "learning_rate": 4.452899987129922e-06, "loss": 0.0314, "num_tokens": 112214450.0, "step": 1116 }, { "epoch": 2.5451953236384375, "grad_norm": 1.796875, "learning_rate": 4.4517235377373915e-06, "loss": 0.0319, "num_tokens": 112315146.0, "step": 1117 }, { "epoch": 2.547476475620188, "grad_norm": 1.453125, "learning_rate": 4.45054598058426e-06, "loss": 0.0296, "num_tokens": 112415716.0, "step": 1118 }, { "epoch": 2.549757627601939, "grad_norm": 1.5078125, "learning_rate": 4.449367316338887e-06, "loss": 0.0343, "num_tokens": 112516415.0, "step": 1119 }, { "epoch": 2.5520387795836896, "grad_norm": 1.1328125, "learning_rate": 4.448187545670258e-06, "loss": 0.0196, "num_tokens": 112617286.0, "step": 1120 }, { "epoch": 2.5543199315654403, "grad_norm": 1.375, "learning_rate": 4.44700666924799e-06, "loss": 0.0253, "num_tokens": 112717493.0, "step": 1121 }, { "epoch": 2.556601083547191, "grad_norm": 1.2421875, "learning_rate": 4.4458246877423254e-06, "loss": 0.0184, "num_tokens": 112817874.0, "step": 1122 }, { "epoch": 2.5588822355289422, "grad_norm": 1.7578125, "learning_rate": 4.444641601824134e-06, "loss": 0.0286, "num_tokens": 112918148.0, "step": 1123 }, { "epoch": 2.561163387510693, "grad_norm": 1.6796875, "learning_rate": 4.443457412164911e-06, "loss": 0.0254, "num_tokens": 113018711.0, "step": 1124 }, { "epoch": 2.5634445394924437, "grad_norm": 1.484375, "learning_rate": 4.442272119436781e-06, "loss": 0.0292, "num_tokens": 113119958.0, "step": 1125 }, { "epoch": 2.5657256914741944, "grad_norm": 1.65625, "learning_rate": 4.441085724312494e-06, "loss": 0.0332, "num_tokens": 113220717.0, "step": 1126 }, { "epoch": 2.568006843455945, "grad_norm": 1.7890625, "learning_rate": 4.4398982274654235e-06, "loss": 0.0298, "num_tokens": 113322120.0, "step": 1127 }, { "epoch": 2.5702879954376963, "grad_norm": 1.34375, "learning_rate": 4.43870962956957e-06, "loss": 0.0177, "num_tokens": 113422930.0, "step": 1128 }, { "epoch": 2.572569147419447, "grad_norm": 1.6953125, "learning_rate": 4.437519931299559e-06, "loss": 0.0255, "num_tokens": 113524627.0, "step": 1129 }, { "epoch": 2.5748502994011977, "grad_norm": 1.421875, "learning_rate": 4.43632913333064e-06, "loss": 0.0226, "num_tokens": 113624991.0, "step": 1130 }, { "epoch": 2.5771314513829484, "grad_norm": 1.3828125, "learning_rate": 4.435137236338688e-06, "loss": 0.0214, "num_tokens": 113725934.0, "step": 1131 }, { "epoch": 2.579412603364699, "grad_norm": 1.25, "learning_rate": 4.433944241000199e-06, "loss": 0.0219, "num_tokens": 113826603.0, "step": 1132 }, { "epoch": 2.58169375534645, "grad_norm": 1.078125, "learning_rate": 4.4327501479922955e-06, "loss": 0.0174, "num_tokens": 113927573.0, "step": 1133 }, { "epoch": 2.5839749073282006, "grad_norm": 1.2890625, "learning_rate": 4.431554957992722e-06, "loss": 0.0269, "num_tokens": 114027914.0, "step": 1134 }, { "epoch": 2.5862560593099513, "grad_norm": 1.2890625, "learning_rate": 4.430358671679843e-06, "loss": 0.0244, "num_tokens": 114128136.0, "step": 1135 }, { "epoch": 2.5885372112917024, "grad_norm": 1.75, "learning_rate": 4.42916128973265e-06, "loss": 0.0267, "num_tokens": 114228643.0, "step": 1136 }, { "epoch": 2.590818363273453, "grad_norm": 1.5703125, "learning_rate": 4.427962812830753e-06, "loss": 0.0274, "num_tokens": 114329416.0, "step": 1137 }, { "epoch": 2.593099515255204, "grad_norm": 1.96875, "learning_rate": 4.426763241654383e-06, "loss": 0.0305, "num_tokens": 114430177.0, "step": 1138 }, { "epoch": 2.5953806672369546, "grad_norm": 1.890625, "learning_rate": 4.425562576884396e-06, "loss": 0.029, "num_tokens": 114530689.0, "step": 1139 }, { "epoch": 2.5976618192187053, "grad_norm": 1.25, "learning_rate": 4.424360819202264e-06, "loss": 0.0209, "num_tokens": 114631294.0, "step": 1140 }, { "epoch": 2.5999429712004565, "grad_norm": 1.421875, "learning_rate": 4.423157969290081e-06, "loss": 0.0263, "num_tokens": 114731574.0, "step": 1141 }, { "epoch": 2.602224123182207, "grad_norm": 1.6640625, "learning_rate": 4.421954027830565e-06, "loss": 0.0252, "num_tokens": 114832515.0, "step": 1142 }, { "epoch": 2.604505275163958, "grad_norm": 1.7265625, "learning_rate": 4.4207489955070465e-06, "loss": 0.0246, "num_tokens": 114932832.0, "step": 1143 }, { "epoch": 2.6067864271457086, "grad_norm": 1.9296875, "learning_rate": 4.419542873003479e-06, "loss": 0.029, "num_tokens": 115033886.0, "step": 1144 }, { "epoch": 2.6090675791274593, "grad_norm": 1.796875, "learning_rate": 4.418335661004436e-06, "loss": 0.0272, "num_tokens": 115134400.0, "step": 1145 }, { "epoch": 2.61134873110921, "grad_norm": 1.578125, "learning_rate": 4.417127360195107e-06, "loss": 0.0265, "num_tokens": 115235105.0, "step": 1146 }, { "epoch": 2.613629883090961, "grad_norm": 1.6015625, "learning_rate": 4.415917971261299e-06, "loss": 0.0312, "num_tokens": 115336102.0, "step": 1147 }, { "epoch": 2.6159110350727115, "grad_norm": 1.484375, "learning_rate": 4.414707494889439e-06, "loss": 0.0219, "num_tokens": 115436786.0, "step": 1148 }, { "epoch": 2.6181921870544627, "grad_norm": 2.203125, "learning_rate": 4.413495931766571e-06, "loss": 0.0267, "num_tokens": 115537853.0, "step": 1149 }, { "epoch": 2.6204733390362134, "grad_norm": 1.234375, "learning_rate": 4.412283282580352e-06, "loss": 0.0165, "num_tokens": 115638076.0, "step": 1150 }, { "epoch": 2.622754491017964, "grad_norm": 1.53125, "learning_rate": 4.41106954801906e-06, "loss": 0.0268, "num_tokens": 115738891.0, "step": 1151 }, { "epoch": 2.625035642999715, "grad_norm": 1.3671875, "learning_rate": 4.409854728771588e-06, "loss": 0.0257, "num_tokens": 115839711.0, "step": 1152 }, { "epoch": 2.6273167949814655, "grad_norm": 1.78125, "learning_rate": 4.4086388255274425e-06, "loss": 0.0301, "num_tokens": 115939989.0, "step": 1153 }, { "epoch": 2.6295979469632167, "grad_norm": 1.296875, "learning_rate": 4.407421838976747e-06, "loss": 0.0207, "num_tokens": 116040841.0, "step": 1154 }, { "epoch": 2.6318790989449674, "grad_norm": 1.59375, "learning_rate": 4.40620376981024e-06, "loss": 0.0212, "num_tokens": 116141562.0, "step": 1155 }, { "epoch": 2.634160250926718, "grad_norm": 2.1875, "learning_rate": 4.404984618719275e-06, "loss": 0.0385, "num_tokens": 116241770.0, "step": 1156 }, { "epoch": 2.636441402908469, "grad_norm": 1.15625, "learning_rate": 4.403764386395817e-06, "loss": 0.023, "num_tokens": 116342942.0, "step": 1157 }, { "epoch": 2.6387225548902196, "grad_norm": 1.3828125, "learning_rate": 4.402543073532446e-06, "loss": 0.0206, "num_tokens": 116444058.0, "step": 1158 }, { "epoch": 2.6410037068719703, "grad_norm": 1.34375, "learning_rate": 4.401320680822357e-06, "loss": 0.0221, "num_tokens": 116545285.0, "step": 1159 }, { "epoch": 2.643284858853721, "grad_norm": 1.7890625, "learning_rate": 4.400097208959357e-06, "loss": 0.0301, "num_tokens": 116646789.0, "step": 1160 }, { "epoch": 2.6455660108354717, "grad_norm": 1.328125, "learning_rate": 4.398872658637863e-06, "loss": 0.0235, "num_tokens": 116747276.0, "step": 1161 }, { "epoch": 2.6478471628172224, "grad_norm": 1.828125, "learning_rate": 4.397647030552907e-06, "loss": 0.0345, "num_tokens": 116847573.0, "step": 1162 }, { "epoch": 2.6501283147989736, "grad_norm": 2.046875, "learning_rate": 4.396420325400132e-06, "loss": 0.033, "num_tokens": 116948228.0, "step": 1163 }, { "epoch": 2.6524094667807243, "grad_norm": 1.953125, "learning_rate": 4.3951925438757936e-06, "loss": 0.0396, "num_tokens": 117049276.0, "step": 1164 }, { "epoch": 2.654690618762475, "grad_norm": 1.65625, "learning_rate": 4.3939636866767535e-06, "loss": 0.0262, "num_tokens": 117150264.0, "step": 1165 }, { "epoch": 2.6569717707442257, "grad_norm": 1.5546875, "learning_rate": 4.39273375450049e-06, "loss": 0.0263, "num_tokens": 117250603.0, "step": 1166 }, { "epoch": 2.6592529227259765, "grad_norm": 1.5703125, "learning_rate": 4.391502748045088e-06, "loss": 0.0319, "num_tokens": 117350889.0, "step": 1167 }, { "epoch": 2.6615340747077276, "grad_norm": 1.46875, "learning_rate": 4.390270668009244e-06, "loss": 0.0304, "num_tokens": 117451396.0, "step": 1168 }, { "epoch": 2.6638152266894783, "grad_norm": 1.4296875, "learning_rate": 4.38903751509226e-06, "loss": 0.0249, "num_tokens": 117552373.0, "step": 1169 }, { "epoch": 2.666096378671229, "grad_norm": 1.5859375, "learning_rate": 4.3878032899940534e-06, "loss": 0.0298, "num_tokens": 117653044.0, "step": 1170 }, { "epoch": 2.66837753065298, "grad_norm": 1.625, "learning_rate": 4.386567993415144e-06, "loss": 0.0233, "num_tokens": 117753936.0, "step": 1171 }, { "epoch": 2.6706586826347305, "grad_norm": 2.03125, "learning_rate": 4.3853316260566635e-06, "loss": 0.0327, "num_tokens": 117854474.0, "step": 1172 }, { "epoch": 2.672939834616481, "grad_norm": 1.6328125, "learning_rate": 4.384094188620349e-06, "loss": 0.0243, "num_tokens": 117954631.0, "step": 1173 }, { "epoch": 2.675220986598232, "grad_norm": 1.515625, "learning_rate": 4.3828556818085485e-06, "loss": 0.0244, "num_tokens": 118055181.0, "step": 1174 }, { "epoch": 2.6775021385799826, "grad_norm": 1.515625, "learning_rate": 4.3816161063242115e-06, "loss": 0.0285, "num_tokens": 118155373.0, "step": 1175 }, { "epoch": 2.679783290561734, "grad_norm": 1.0703125, "learning_rate": 4.3803754628708995e-06, "loss": 0.0172, "num_tokens": 118256551.0, "step": 1176 }, { "epoch": 2.6820644425434845, "grad_norm": 1.203125, "learning_rate": 4.379133752152776e-06, "loss": 0.0169, "num_tokens": 118357452.0, "step": 1177 }, { "epoch": 2.6843455945252352, "grad_norm": 1.6484375, "learning_rate": 4.377890974874614e-06, "loss": 0.0267, "num_tokens": 118458863.0, "step": 1178 }, { "epoch": 2.686626746506986, "grad_norm": 1.3671875, "learning_rate": 4.376647131741787e-06, "loss": 0.0261, "num_tokens": 118559564.0, "step": 1179 }, { "epoch": 2.6889078984887367, "grad_norm": 1.6875, "learning_rate": 4.375402223460279e-06, "loss": 0.0308, "num_tokens": 118660061.0, "step": 1180 }, { "epoch": 2.691189050470488, "grad_norm": 1.3515625, "learning_rate": 4.3741562507366754e-06, "loss": 0.0267, "num_tokens": 118760739.0, "step": 1181 }, { "epoch": 2.6934702024522386, "grad_norm": 1.375, "learning_rate": 4.3729092142781655e-06, "loss": 0.0239, "num_tokens": 118861459.0, "step": 1182 }, { "epoch": 2.6957513544339893, "grad_norm": 1.359375, "learning_rate": 4.3716611147925435e-06, "loss": 0.0199, "num_tokens": 118961728.0, "step": 1183 }, { "epoch": 2.69803250641574, "grad_norm": 1.5625, "learning_rate": 4.370411952988207e-06, "loss": 0.0285, "num_tokens": 119062578.0, "step": 1184 }, { "epoch": 2.7003136583974907, "grad_norm": 1.6171875, "learning_rate": 4.369161729574155e-06, "loss": 0.0259, "num_tokens": 119162949.0, "step": 1185 }, { "epoch": 2.7025948103792414, "grad_norm": 1.1953125, "learning_rate": 4.367910445259991e-06, "loss": 0.0187, "num_tokens": 119263140.0, "step": 1186 }, { "epoch": 2.704875962360992, "grad_norm": 1.4375, "learning_rate": 4.36665810075592e-06, "loss": 0.0252, "num_tokens": 119363603.0, "step": 1187 }, { "epoch": 2.707157114342743, "grad_norm": 1.2734375, "learning_rate": 4.365404696772748e-06, "loss": 0.0231, "num_tokens": 119464478.0, "step": 1188 }, { "epoch": 2.709438266324494, "grad_norm": 1.3359375, "learning_rate": 4.364150234021883e-06, "loss": 0.0234, "num_tokens": 119565500.0, "step": 1189 }, { "epoch": 2.7117194183062447, "grad_norm": 1.59375, "learning_rate": 4.362894713215334e-06, "loss": 0.0338, "num_tokens": 119667059.0, "step": 1190 }, { "epoch": 2.7140005702879955, "grad_norm": 1.25, "learning_rate": 4.361638135065711e-06, "loss": 0.0231, "num_tokens": 119768909.0, "step": 1191 }, { "epoch": 2.716281722269746, "grad_norm": 1.484375, "learning_rate": 4.360380500286222e-06, "loss": 0.0265, "num_tokens": 119869725.0, "step": 1192 }, { "epoch": 2.718562874251497, "grad_norm": 1.5625, "learning_rate": 4.359121809590678e-06, "loss": 0.0253, "num_tokens": 119970530.0, "step": 1193 }, { "epoch": 2.720844026233248, "grad_norm": 1.4375, "learning_rate": 4.357862063693486e-06, "loss": 0.0226, "num_tokens": 120070577.0, "step": 1194 }, { "epoch": 2.7231251782149988, "grad_norm": 1.5078125, "learning_rate": 4.356601263309654e-06, "loss": 0.0268, "num_tokens": 120170896.0, "step": 1195 }, { "epoch": 2.7254063301967495, "grad_norm": 1.2265625, "learning_rate": 4.355339409154788e-06, "loss": 0.019, "num_tokens": 120271724.0, "step": 1196 }, { "epoch": 2.7276874821785, "grad_norm": 1.5859375, "learning_rate": 4.354076501945093e-06, "loss": 0.033, "num_tokens": 120372409.0, "step": 1197 }, { "epoch": 2.729968634160251, "grad_norm": 1.6796875, "learning_rate": 4.352812542397369e-06, "loss": 0.0265, "num_tokens": 120472589.0, "step": 1198 }, { "epoch": 2.7322497861420016, "grad_norm": 1.6796875, "learning_rate": 4.351547531229016e-06, "loss": 0.0327, "num_tokens": 120573130.0, "step": 1199 }, { "epoch": 2.7345309381237524, "grad_norm": 1.5078125, "learning_rate": 4.350281469158029e-06, "loss": 0.0278, "num_tokens": 120673485.0, "step": 1200 }, { "epoch": 2.736812090105503, "grad_norm": 1.59375, "learning_rate": 4.3490143569030025e-06, "loss": 0.0202, "num_tokens": 120773991.0, "step": 1201 }, { "epoch": 2.739093242087254, "grad_norm": 1.9765625, "learning_rate": 4.347746195183123e-06, "loss": 0.0264, "num_tokens": 120874805.0, "step": 1202 }, { "epoch": 2.741374394069005, "grad_norm": 1.6953125, "learning_rate": 4.346476984718176e-06, "loss": 0.0266, "num_tokens": 120975776.0, "step": 1203 }, { "epoch": 2.7436555460507557, "grad_norm": 1.2109375, "learning_rate": 4.345206726228538e-06, "loss": 0.0186, "num_tokens": 121076494.0, "step": 1204 }, { "epoch": 2.7459366980325064, "grad_norm": 1.7890625, "learning_rate": 4.343935420435187e-06, "loss": 0.028, "num_tokens": 121177149.0, "step": 1205 }, { "epoch": 2.748217850014257, "grad_norm": 1.359375, "learning_rate": 4.34266306805969e-06, "loss": 0.0272, "num_tokens": 121277402.0, "step": 1206 }, { "epoch": 2.750499001996008, "grad_norm": 1.78125, "learning_rate": 4.341389669824209e-06, "loss": 0.0268, "num_tokens": 121377689.0, "step": 1207 }, { "epoch": 2.752780153977759, "grad_norm": 2.328125, "learning_rate": 4.340115226451501e-06, "loss": 0.0211, "num_tokens": 121478508.0, "step": 1208 }, { "epoch": 2.7550613059595097, "grad_norm": 1.71875, "learning_rate": 4.338839738664915e-06, "loss": 0.0324, "num_tokens": 121579240.0, "step": 1209 }, { "epoch": 2.7573424579412604, "grad_norm": 1.7109375, "learning_rate": 4.3375632071883935e-06, "loss": 0.029, "num_tokens": 121679781.0, "step": 1210 }, { "epoch": 2.759623609923011, "grad_norm": 1.671875, "learning_rate": 4.336285632746472e-06, "loss": 0.0321, "num_tokens": 121780377.0, "step": 1211 }, { "epoch": 2.761904761904762, "grad_norm": 1.1953125, "learning_rate": 4.3350070160642754e-06, "loss": 0.0206, "num_tokens": 121880674.0, "step": 1212 }, { "epoch": 2.7641859138865126, "grad_norm": 1.3828125, "learning_rate": 4.333727357867523e-06, "loss": 0.0217, "num_tokens": 121981765.0, "step": 1213 }, { "epoch": 2.7664670658682633, "grad_norm": 1.3515625, "learning_rate": 4.3324466588825235e-06, "loss": 0.0233, "num_tokens": 122082362.0, "step": 1214 }, { "epoch": 2.768748217850014, "grad_norm": 1.421875, "learning_rate": 4.331164919836177e-06, "loss": 0.0239, "num_tokens": 122183279.0, "step": 1215 }, { "epoch": 2.771029369831765, "grad_norm": 1.46875, "learning_rate": 4.329882141455974e-06, "loss": 0.0277, "num_tokens": 122283437.0, "step": 1216 }, { "epoch": 2.773310521813516, "grad_norm": 1.7109375, "learning_rate": 4.3285983244699955e-06, "loss": 0.0276, "num_tokens": 122384268.0, "step": 1217 }, { "epoch": 2.7755916737952666, "grad_norm": 1.9375, "learning_rate": 4.327313469606911e-06, "loss": 0.0235, "num_tokens": 122485079.0, "step": 1218 }, { "epoch": 2.7778728257770173, "grad_norm": 1.6015625, "learning_rate": 4.326027577595977e-06, "loss": 0.0211, "num_tokens": 122585530.0, "step": 1219 }, { "epoch": 2.780153977758768, "grad_norm": 1.703125, "learning_rate": 4.324740649167044e-06, "loss": 0.0288, "num_tokens": 122686134.0, "step": 1220 }, { "epoch": 2.782435129740519, "grad_norm": 1.1796875, "learning_rate": 4.323452685050545e-06, "loss": 0.0201, "num_tokens": 122787146.0, "step": 1221 }, { "epoch": 2.78471628172227, "grad_norm": 1.3046875, "learning_rate": 4.3221636859775075e-06, "loss": 0.0249, "num_tokens": 122887769.0, "step": 1222 }, { "epoch": 2.7869974337040206, "grad_norm": 1.421875, "learning_rate": 4.320873652679538e-06, "loss": 0.0275, "num_tokens": 122988305.0, "step": 1223 }, { "epoch": 2.7892785856857714, "grad_norm": 1.2578125, "learning_rate": 4.319582585888838e-06, "loss": 0.018, "num_tokens": 123088954.0, "step": 1224 }, { "epoch": 2.791559737667522, "grad_norm": 1.4609375, "learning_rate": 4.31829048633819e-06, "loss": 0.0225, "num_tokens": 123189908.0, "step": 1225 }, { "epoch": 2.793840889649273, "grad_norm": 1.6484375, "learning_rate": 4.316997354760965e-06, "loss": 0.032, "num_tokens": 123290741.0, "step": 1226 }, { "epoch": 2.7961220416310235, "grad_norm": 1.7265625, "learning_rate": 4.3157031918911204e-06, "loss": 0.0259, "num_tokens": 123390929.0, "step": 1227 }, { "epoch": 2.7984031936127742, "grad_norm": 1.25, "learning_rate": 4.314407998463198e-06, "loss": 0.0271, "num_tokens": 123491547.0, "step": 1228 }, { "epoch": 2.8006843455945254, "grad_norm": 1.703125, "learning_rate": 4.3131117752123235e-06, "loss": 0.0256, "num_tokens": 123591866.0, "step": 1229 }, { "epoch": 2.802965497576276, "grad_norm": 1.9765625, "learning_rate": 4.311814522874209e-06, "loss": 0.0336, "num_tokens": 123692234.0, "step": 1230 }, { "epoch": 2.805246649558027, "grad_norm": 1.890625, "learning_rate": 4.3105162421851494e-06, "loss": 0.0256, "num_tokens": 123792665.0, "step": 1231 }, { "epoch": 2.8075278015397775, "grad_norm": 1.6953125, "learning_rate": 4.309216933882025e-06, "loss": 0.0252, "num_tokens": 123893070.0, "step": 1232 }, { "epoch": 2.8098089535215283, "grad_norm": 1.703125, "learning_rate": 4.307916598702296e-06, "loss": 0.0303, "num_tokens": 123994160.0, "step": 1233 }, { "epoch": 2.8120901055032794, "grad_norm": 1.40625, "learning_rate": 4.3066152373840105e-06, "loss": 0.0319, "num_tokens": 124095320.0, "step": 1234 }, { "epoch": 2.81437125748503, "grad_norm": 1.765625, "learning_rate": 4.305312850665794e-06, "loss": 0.0324, "num_tokens": 124196034.0, "step": 1235 }, { "epoch": 2.816652409466781, "grad_norm": 1.390625, "learning_rate": 4.304009439286855e-06, "loss": 0.0247, "num_tokens": 124296112.0, "step": 1236 }, { "epoch": 2.8189335614485316, "grad_norm": 1.5703125, "learning_rate": 4.3027050039869865e-06, "loss": 0.0303, "num_tokens": 124396947.0, "step": 1237 }, { "epoch": 2.8212147134302823, "grad_norm": 1.25, "learning_rate": 4.301399545506561e-06, "loss": 0.022, "num_tokens": 124497783.0, "step": 1238 }, { "epoch": 2.823495865412033, "grad_norm": 1.375, "learning_rate": 4.3000930645865305e-06, "loss": 0.0207, "num_tokens": 124598441.0, "step": 1239 }, { "epoch": 2.8257770173937837, "grad_norm": 1.8828125, "learning_rate": 4.298785561968428e-06, "loss": 0.0248, "num_tokens": 124698668.0, "step": 1240 }, { "epoch": 2.8280581693755344, "grad_norm": 1.671875, "learning_rate": 4.297477038394368e-06, "loss": 0.0287, "num_tokens": 124799290.0, "step": 1241 }, { "epoch": 2.830339321357285, "grad_norm": 2.125, "learning_rate": 4.296167494607043e-06, "loss": 0.0257, "num_tokens": 124900530.0, "step": 1242 }, { "epoch": 2.8326204733390363, "grad_norm": 1.6953125, "learning_rate": 4.294856931349724e-06, "loss": 0.0292, "num_tokens": 125001206.0, "step": 1243 }, { "epoch": 2.834901625320787, "grad_norm": 1.40625, "learning_rate": 4.293545349366262e-06, "loss": 0.0216, "num_tokens": 125101264.0, "step": 1244 }, { "epoch": 2.8371827773025378, "grad_norm": 1.5625, "learning_rate": 4.292232749401085e-06, "loss": 0.0189, "num_tokens": 125201785.0, "step": 1245 }, { "epoch": 2.8394639292842885, "grad_norm": 1.2578125, "learning_rate": 4.2909191321992e-06, "loss": 0.025, "num_tokens": 125302711.0, "step": 1246 }, { "epoch": 2.841745081266039, "grad_norm": 1.8828125, "learning_rate": 4.2896044985061915e-06, "loss": 0.03, "num_tokens": 125403307.0, "step": 1247 }, { "epoch": 2.8440262332477904, "grad_norm": 1.171875, "learning_rate": 4.288288849068218e-06, "loss": 0.0208, "num_tokens": 125503981.0, "step": 1248 }, { "epoch": 2.846307385229541, "grad_norm": 1.703125, "learning_rate": 4.286972184632019e-06, "loss": 0.027, "num_tokens": 125604887.0, "step": 1249 }, { "epoch": 2.848588537211292, "grad_norm": 1.2890625, "learning_rate": 4.285654505944906e-06, "loss": 0.0191, "num_tokens": 125705430.0, "step": 1250 }, { "epoch": 2.8508696891930425, "grad_norm": 1.484375, "learning_rate": 4.28433581375477e-06, "loss": 0.0195, "num_tokens": 125806771.0, "step": 1251 }, { "epoch": 2.8531508411747932, "grad_norm": 1.671875, "learning_rate": 4.283016108810073e-06, "loss": 0.0331, "num_tokens": 125907549.0, "step": 1252 }, { "epoch": 2.855431993156544, "grad_norm": 1.34375, "learning_rate": 4.281695391859854e-06, "loss": 0.0277, "num_tokens": 126008323.0, "step": 1253 }, { "epoch": 2.8577131451382947, "grad_norm": 1.1484375, "learning_rate": 4.28037366365373e-06, "loss": 0.0169, "num_tokens": 126108937.0, "step": 1254 }, { "epoch": 2.8599942971200454, "grad_norm": 1.515625, "learning_rate": 4.279050924941885e-06, "loss": 0.0186, "num_tokens": 126209735.0, "step": 1255 }, { "epoch": 2.8622754491017965, "grad_norm": 1.6796875, "learning_rate": 4.2777271764750805e-06, "loss": 0.0247, "num_tokens": 126310558.0, "step": 1256 }, { "epoch": 2.8645566010835473, "grad_norm": 1.421875, "learning_rate": 4.276402419004652e-06, "loss": 0.0259, "num_tokens": 126410977.0, "step": 1257 }, { "epoch": 2.866837753065298, "grad_norm": 1.390625, "learning_rate": 4.275076653282504e-06, "loss": 0.0218, "num_tokens": 126511301.0, "step": 1258 }, { "epoch": 2.8691189050470487, "grad_norm": 1.625, "learning_rate": 4.273749880061118e-06, "loss": 0.0347, "num_tokens": 126611616.0, "step": 1259 }, { "epoch": 2.8714000570287994, "grad_norm": 1.390625, "learning_rate": 4.272422100093542e-06, "loss": 0.0162, "num_tokens": 126712096.0, "step": 1260 }, { "epoch": 2.8736812090105506, "grad_norm": 1.3359375, "learning_rate": 4.271093314133401e-06, "loss": 0.0218, "num_tokens": 126812812.0, "step": 1261 }, { "epoch": 2.8759623609923013, "grad_norm": 1.640625, "learning_rate": 4.269763522934888e-06, "loss": 0.0307, "num_tokens": 126913588.0, "step": 1262 }, { "epoch": 2.878243512974052, "grad_norm": 2.0, "learning_rate": 4.268432727252765e-06, "loss": 0.0325, "num_tokens": 127014815.0, "step": 1263 }, { "epoch": 2.8805246649558027, "grad_norm": 1.453125, "learning_rate": 4.2671009278423665e-06, "loss": 0.0241, "num_tokens": 127115164.0, "step": 1264 }, { "epoch": 2.8828058169375534, "grad_norm": 1.4296875, "learning_rate": 4.265768125459597e-06, "loss": 0.0203, "num_tokens": 127215872.0, "step": 1265 }, { "epoch": 2.885086968919304, "grad_norm": 1.5703125, "learning_rate": 4.264434320860929e-06, "loss": 0.0228, "num_tokens": 127316485.0, "step": 1266 }, { "epoch": 2.887368120901055, "grad_norm": 1.578125, "learning_rate": 4.2630995148034044e-06, "loss": 0.0291, "num_tokens": 127416642.0, "step": 1267 }, { "epoch": 2.8896492728828056, "grad_norm": 1.3984375, "learning_rate": 4.261763708044633e-06, "loss": 0.0177, "num_tokens": 127517223.0, "step": 1268 }, { "epoch": 2.8919304248645568, "grad_norm": 1.0859375, "learning_rate": 4.2604269013427925e-06, "loss": 0.0182, "num_tokens": 127618095.0, "step": 1269 }, { "epoch": 2.8942115768463075, "grad_norm": 1.625, "learning_rate": 4.25908909545663e-06, "loss": 0.0252, "num_tokens": 127718603.0, "step": 1270 }, { "epoch": 2.896492728828058, "grad_norm": 1.5625, "learning_rate": 4.257750291145457e-06, "loss": 0.0277, "num_tokens": 127818700.0, "step": 1271 }, { "epoch": 2.898773880809809, "grad_norm": 1.5, "learning_rate": 4.256410489169154e-06, "loss": 0.0267, "num_tokens": 127919386.0, "step": 1272 }, { "epoch": 2.9010550327915596, "grad_norm": 1.3828125, "learning_rate": 4.255069690288166e-06, "loss": 0.0192, "num_tokens": 128020171.0, "step": 1273 }, { "epoch": 2.903336184773311, "grad_norm": 1.7421875, "learning_rate": 4.253727895263504e-06, "loss": 0.0255, "num_tokens": 128121085.0, "step": 1274 }, { "epoch": 2.9056173367550615, "grad_norm": 1.8515625, "learning_rate": 4.252385104856746e-06, "loss": 0.0244, "num_tokens": 128221555.0, "step": 1275 }, { "epoch": 2.9078984887368122, "grad_norm": 1.4140625, "learning_rate": 4.251041319830034e-06, "loss": 0.0213, "num_tokens": 128322016.0, "step": 1276 }, { "epoch": 2.910179640718563, "grad_norm": 1.5234375, "learning_rate": 4.249696540946074e-06, "loss": 0.0278, "num_tokens": 128423361.0, "step": 1277 }, { "epoch": 2.9124607927003137, "grad_norm": 2.265625, "learning_rate": 4.248350768968136e-06, "loss": 0.0415, "num_tokens": 128523368.0, "step": 1278 }, { "epoch": 2.9147419446820644, "grad_norm": 1.5234375, "learning_rate": 4.247004004660055e-06, "loss": 0.0265, "num_tokens": 128623793.0, "step": 1279 }, { "epoch": 2.917023096663815, "grad_norm": 1.453125, "learning_rate": 4.245656248786228e-06, "loss": 0.0264, "num_tokens": 128724247.0, "step": 1280 }, { "epoch": 2.919304248645566, "grad_norm": 1.40625, "learning_rate": 4.2443075021116166e-06, "loss": 0.0256, "num_tokens": 128824434.0, "step": 1281 }, { "epoch": 2.9215854006273165, "grad_norm": 1.109375, "learning_rate": 4.242957765401741e-06, "loss": 0.0171, "num_tokens": 128925142.0, "step": 1282 }, { "epoch": 2.9238665526090677, "grad_norm": 1.15625, "learning_rate": 4.241607039422687e-06, "loss": 0.0169, "num_tokens": 129025658.0, "step": 1283 }, { "epoch": 2.9261477045908184, "grad_norm": 1.4453125, "learning_rate": 4.2402553249411e-06, "loss": 0.0256, "num_tokens": 129126378.0, "step": 1284 }, { "epoch": 2.928428856572569, "grad_norm": 1.296875, "learning_rate": 4.238902622724188e-06, "loss": 0.026, "num_tokens": 129227425.0, "step": 1285 }, { "epoch": 2.93071000855432, "grad_norm": 1.53125, "learning_rate": 4.237548933539718e-06, "loss": 0.0262, "num_tokens": 129327783.0, "step": 1286 }, { "epoch": 2.9329911605360706, "grad_norm": 1.7578125, "learning_rate": 4.236194258156019e-06, "loss": 0.0316, "num_tokens": 129428870.0, "step": 1287 }, { "epoch": 2.9352723125178217, "grad_norm": 1.6484375, "learning_rate": 4.234838597341977e-06, "loss": 0.031, "num_tokens": 129529774.0, "step": 1288 }, { "epoch": 2.9375534644995724, "grad_norm": 1.6171875, "learning_rate": 4.233481951867039e-06, "loss": 0.0247, "num_tokens": 129630104.0, "step": 1289 }, { "epoch": 2.939834616481323, "grad_norm": 1.5625, "learning_rate": 4.232124322501212e-06, "loss": 0.0285, "num_tokens": 129730616.0, "step": 1290 }, { "epoch": 2.942115768463074, "grad_norm": 1.453125, "learning_rate": 4.230765710015058e-06, "loss": 0.0264, "num_tokens": 129830714.0, "step": 1291 }, { "epoch": 2.9443969204448246, "grad_norm": 1.484375, "learning_rate": 4.229406115179703e-06, "loss": 0.0226, "num_tokens": 129931646.0, "step": 1292 }, { "epoch": 2.9466780724265753, "grad_norm": 1.9453125, "learning_rate": 4.228045538766823e-06, "loss": 0.0267, "num_tokens": 130031841.0, "step": 1293 }, { "epoch": 2.948959224408326, "grad_norm": 2.015625, "learning_rate": 4.226683981548656e-06, "loss": 0.0338, "num_tokens": 130132597.0, "step": 1294 }, { "epoch": 2.9512403763900767, "grad_norm": 1.625, "learning_rate": 4.2253214442979975e-06, "loss": 0.0294, "num_tokens": 130233426.0, "step": 1295 }, { "epoch": 2.953521528371828, "grad_norm": 1.6328125, "learning_rate": 4.223957927788195e-06, "loss": 0.0247, "num_tokens": 130334117.0, "step": 1296 }, { "epoch": 2.9558026803535786, "grad_norm": 1.3359375, "learning_rate": 4.222593432793155e-06, "loss": 0.021, "num_tokens": 130434568.0, "step": 1297 }, { "epoch": 2.9580838323353293, "grad_norm": 1.34375, "learning_rate": 4.2212279600873385e-06, "loss": 0.0223, "num_tokens": 130535600.0, "step": 1298 }, { "epoch": 2.96036498431708, "grad_norm": 1.6328125, "learning_rate": 4.219861510445762e-06, "loss": 0.0324, "num_tokens": 130636089.0, "step": 1299 }, { "epoch": 2.962646136298831, "grad_norm": 1.7578125, "learning_rate": 4.2184940846439946e-06, "loss": 0.029, "num_tokens": 130737018.0, "step": 1300 }, { "epoch": 2.964927288280582, "grad_norm": 1.2265625, "learning_rate": 4.217125683458162e-06, "loss": 0.0232, "num_tokens": 130837674.0, "step": 1301 }, { "epoch": 2.9672084402623327, "grad_norm": 1.828125, "learning_rate": 4.215756307664941e-06, "loss": 0.0267, "num_tokens": 130938461.0, "step": 1302 }, { "epoch": 2.9694895922440834, "grad_norm": 1.453125, "learning_rate": 4.214385958041565e-06, "loss": 0.0356, "num_tokens": 131039300.0, "step": 1303 }, { "epoch": 2.971770744225834, "grad_norm": 1.59375, "learning_rate": 4.213014635365816e-06, "loss": 0.0288, "num_tokens": 131139476.0, "step": 1304 }, { "epoch": 2.974051896207585, "grad_norm": 1.640625, "learning_rate": 4.2116423404160316e-06, "loss": 0.028, "num_tokens": 131240506.0, "step": 1305 }, { "epoch": 2.9763330481893355, "grad_norm": 1.90625, "learning_rate": 4.210269073971098e-06, "loss": 0.0293, "num_tokens": 131341796.0, "step": 1306 }, { "epoch": 2.9786142001710862, "grad_norm": 1.34375, "learning_rate": 4.208894836810457e-06, "loss": 0.022, "num_tokens": 131442518.0, "step": 1307 }, { "epoch": 2.980895352152837, "grad_norm": 1.8125, "learning_rate": 4.207519629714099e-06, "loss": 0.0318, "num_tokens": 131542885.0, "step": 1308 }, { "epoch": 2.983176504134588, "grad_norm": 1.609375, "learning_rate": 4.206143453462562e-06, "loss": 0.0258, "num_tokens": 131643568.0, "step": 1309 }, { "epoch": 2.985457656116339, "grad_norm": 1.375, "learning_rate": 4.204766308836941e-06, "loss": 0.025, "num_tokens": 131744589.0, "step": 1310 }, { "epoch": 2.9877388080980896, "grad_norm": 1.71875, "learning_rate": 4.203388196618874e-06, "loss": 0.027, "num_tokens": 131845013.0, "step": 1311 }, { "epoch": 2.9900199600798403, "grad_norm": 1.1953125, "learning_rate": 4.202009117590552e-06, "loss": 0.0185, "num_tokens": 131945359.0, "step": 1312 }, { "epoch": 2.992301112061591, "grad_norm": 1.5546875, "learning_rate": 4.200629072534713e-06, "loss": 0.0225, "num_tokens": 132046074.0, "step": 1313 }, { "epoch": 2.994582264043342, "grad_norm": 1.59375, "learning_rate": 4.1992480622346455e-06, "loss": 0.0198, "num_tokens": 132146188.0, "step": 1314 }, { "epoch": 2.996863416025093, "grad_norm": 1.8046875, "learning_rate": 4.197866087474181e-06, "loss": 0.0302, "num_tokens": 132246490.0, "step": 1315 }, { "epoch": 2.9991445680068436, "grad_norm": 1.953125, "learning_rate": 4.196483149037707e-06, "loss": 0.0306, "num_tokens": 132346844.0, "step": 1316 }, { "epoch": 3.0, "grad_norm": 2.359375, "learning_rate": 4.195099247710147e-06, "loss": 0.023, "num_tokens": 132383616.0, "step": 1317 }, { "epoch": 3.0022811519817507, "grad_norm": 1.3828125, "learning_rate": 4.1937143842769805e-06, "loss": 0.0232, "num_tokens": 132483887.0, "step": 1318 }, { "epoch": 3.0045623039635014, "grad_norm": 1.1171875, "learning_rate": 4.192328559524227e-06, "loss": 0.0205, "num_tokens": 132585044.0, "step": 1319 }, { "epoch": 3.006843455945252, "grad_norm": 1.3125, "learning_rate": 4.190941774238454e-06, "loss": 0.0207, "num_tokens": 132686129.0, "step": 1320 }, { "epoch": 3.006843455945252, "eval_loss": 0.031444501131772995, "eval_num_tokens": 132686129.0, "eval_runtime": 52.7502, "eval_samples_per_second": 158.957, "eval_steps_per_second": 4.986, "step": 1320 }, { "epoch": 3.0091246079270033, "grad_norm": 1.1796875, "learning_rate": 4.1895540292067765e-06, "loss": 0.0206, "num_tokens": 132786644.0, "step": 1321 }, { "epoch": 3.011405759908754, "grad_norm": 1.46875, "learning_rate": 4.18816532521685e-06, "loss": 0.0253, "num_tokens": 132887688.0, "step": 1322 }, { "epoch": 3.0136869118905047, "grad_norm": 1.28125, "learning_rate": 4.1867756630568755e-06, "loss": 0.022, "num_tokens": 132989103.0, "step": 1323 }, { "epoch": 3.0159680638722555, "grad_norm": 1.2734375, "learning_rate": 4.1853850435156e-06, "loss": 0.0224, "num_tokens": 133089948.0, "step": 1324 }, { "epoch": 3.018249215854006, "grad_norm": 1.3828125, "learning_rate": 4.18399346738231e-06, "loss": 0.0239, "num_tokens": 133189912.0, "step": 1325 }, { "epoch": 3.020530367835757, "grad_norm": 1.1796875, "learning_rate": 4.18260093544684e-06, "loss": 0.0175, "num_tokens": 133290027.0, "step": 1326 }, { "epoch": 3.022811519817508, "grad_norm": 1.1328125, "learning_rate": 4.181207448499562e-06, "loss": 0.0177, "num_tokens": 133390902.0, "step": 1327 }, { "epoch": 3.025092671799259, "grad_norm": 1.5546875, "learning_rate": 4.179813007331394e-06, "loss": 0.0264, "num_tokens": 133491420.0, "step": 1328 }, { "epoch": 3.0273738237810095, "grad_norm": 1.2734375, "learning_rate": 4.178417612733792e-06, "loss": 0.0229, "num_tokens": 133591714.0, "step": 1329 }, { "epoch": 3.02965497576276, "grad_norm": 1.609375, "learning_rate": 4.177021265498757e-06, "loss": 0.0207, "num_tokens": 133691897.0, "step": 1330 }, { "epoch": 3.031936127744511, "grad_norm": 1.4453125, "learning_rate": 4.1756239664188275e-06, "loss": 0.0191, "num_tokens": 133792865.0, "step": 1331 }, { "epoch": 3.0342172797262617, "grad_norm": 1.8359375, "learning_rate": 4.1742257162870835e-06, "loss": 0.0257, "num_tokens": 133893599.0, "step": 1332 }, { "epoch": 3.0364984317080124, "grad_norm": 1.484375, "learning_rate": 4.172826515897146e-06, "loss": 0.0233, "num_tokens": 133994119.0, "step": 1333 }, { "epoch": 3.0387795836897635, "grad_norm": 1.4453125, "learning_rate": 4.171426366043172e-06, "loss": 0.0232, "num_tokens": 134095002.0, "step": 1334 }, { "epoch": 3.0410607356715142, "grad_norm": 1.203125, "learning_rate": 4.170025267519862e-06, "loss": 0.0189, "num_tokens": 134195472.0, "step": 1335 }, { "epoch": 3.043341887653265, "grad_norm": 1.4140625, "learning_rate": 4.168623221122451e-06, "loss": 0.0276, "num_tokens": 134296466.0, "step": 1336 }, { "epoch": 3.0456230396350157, "grad_norm": 1.25, "learning_rate": 4.167220227646713e-06, "loss": 0.0215, "num_tokens": 134397352.0, "step": 1337 }, { "epoch": 3.0479041916167664, "grad_norm": 1.6875, "learning_rate": 4.165816287888962e-06, "loss": 0.025, "num_tokens": 134498478.0, "step": 1338 }, { "epoch": 3.050185343598517, "grad_norm": 1.3046875, "learning_rate": 4.164411402646045e-06, "loss": 0.0174, "num_tokens": 134598983.0, "step": 1339 }, { "epoch": 3.052466495580268, "grad_norm": 1.8046875, "learning_rate": 4.163005572715348e-06, "loss": 0.0317, "num_tokens": 134699444.0, "step": 1340 }, { "epoch": 3.054747647562019, "grad_norm": 1.6015625, "learning_rate": 4.161598798894795e-06, "loss": 0.0236, "num_tokens": 134799806.0, "step": 1341 }, { "epoch": 3.0570287995437697, "grad_norm": 1.640625, "learning_rate": 4.160191081982841e-06, "loss": 0.028, "num_tokens": 134900492.0, "step": 1342 }, { "epoch": 3.0593099515255204, "grad_norm": 1.6015625, "learning_rate": 4.15878242277848e-06, "loss": 0.0283, "num_tokens": 135001922.0, "step": 1343 }, { "epoch": 3.061591103507271, "grad_norm": 1.3359375, "learning_rate": 4.157372822081241e-06, "loss": 0.0204, "num_tokens": 135102860.0, "step": 1344 }, { "epoch": 3.063872255489022, "grad_norm": 1.484375, "learning_rate": 4.155962280691184e-06, "loss": 0.0238, "num_tokens": 135203332.0, "step": 1345 }, { "epoch": 3.0661534074707726, "grad_norm": 1.3046875, "learning_rate": 4.154550799408906e-06, "loss": 0.0221, "num_tokens": 135303776.0, "step": 1346 }, { "epoch": 3.0684345594525233, "grad_norm": 1.703125, "learning_rate": 4.153138379035537e-06, "loss": 0.0201, "num_tokens": 135404352.0, "step": 1347 }, { "epoch": 3.0707157114342745, "grad_norm": 1.96875, "learning_rate": 4.1517250203727395e-06, "loss": 0.0316, "num_tokens": 135505220.0, "step": 1348 }, { "epoch": 3.072996863416025, "grad_norm": 1.3046875, "learning_rate": 4.150310724222708e-06, "loss": 0.0218, "num_tokens": 135605800.0, "step": 1349 }, { "epoch": 3.075278015397776, "grad_norm": 1.578125, "learning_rate": 4.14889549138817e-06, "loss": 0.0261, "num_tokens": 135706935.0, "step": 1350 }, { "epoch": 3.0775591673795266, "grad_norm": 1.640625, "learning_rate": 4.147479322672383e-06, "loss": 0.0268, "num_tokens": 135807579.0, "step": 1351 }, { "epoch": 3.0798403193612773, "grad_norm": 1.609375, "learning_rate": 4.14606221887914e-06, "loss": 0.0214, "num_tokens": 135908374.0, "step": 1352 }, { "epoch": 3.082121471343028, "grad_norm": 1.3984375, "learning_rate": 4.144644180812759e-06, "loss": 0.0245, "num_tokens": 136009343.0, "step": 1353 }, { "epoch": 3.084402623324779, "grad_norm": 2.03125, "learning_rate": 4.143225209278093e-06, "loss": 0.0343, "num_tokens": 136110006.0, "step": 1354 }, { "epoch": 3.08668377530653, "grad_norm": 1.5546875, "learning_rate": 4.141805305080521e-06, "loss": 0.0214, "num_tokens": 136210211.0, "step": 1355 }, { "epoch": 3.0889649272882806, "grad_norm": 1.421875, "learning_rate": 4.1403844690259544e-06, "loss": 0.0192, "num_tokens": 136310736.0, "step": 1356 }, { "epoch": 3.0912460792700314, "grad_norm": 1.34375, "learning_rate": 4.138962701920831e-06, "loss": 0.0222, "num_tokens": 136411786.0, "step": 1357 }, { "epoch": 3.093527231251782, "grad_norm": 1.1796875, "learning_rate": 4.13754000457212e-06, "loss": 0.0223, "num_tokens": 136512209.0, "step": 1358 }, { "epoch": 3.095808383233533, "grad_norm": 1.671875, "learning_rate": 4.136116377787317e-06, "loss": 0.0253, "num_tokens": 136612661.0, "step": 1359 }, { "epoch": 3.0980895352152835, "grad_norm": 1.671875, "learning_rate": 4.134691822374445e-06, "loss": 0.0281, "num_tokens": 136713185.0, "step": 1360 }, { "epoch": 3.1003706871970347, "grad_norm": 1.40625, "learning_rate": 4.1332663391420515e-06, "loss": 0.0226, "num_tokens": 136813553.0, "step": 1361 }, { "epoch": 3.1026518391787854, "grad_norm": 1.375, "learning_rate": 4.131839928899217e-06, "loss": 0.0218, "num_tokens": 136914897.0, "step": 1362 }, { "epoch": 3.104932991160536, "grad_norm": 1.484375, "learning_rate": 4.130412592455542e-06, "loss": 0.0209, "num_tokens": 137015690.0, "step": 1363 }, { "epoch": 3.107214143142287, "grad_norm": 1.6171875, "learning_rate": 4.128984330621157e-06, "loss": 0.0206, "num_tokens": 137116355.0, "step": 1364 }, { "epoch": 3.1094952951240376, "grad_norm": 1.625, "learning_rate": 4.127555144206713e-06, "loss": 0.0195, "num_tokens": 137217183.0, "step": 1365 }, { "epoch": 3.1117764471057883, "grad_norm": 1.265625, "learning_rate": 4.126125034023392e-06, "loss": 0.0198, "num_tokens": 137317684.0, "step": 1366 }, { "epoch": 3.1140575990875394, "grad_norm": 1.6796875, "learning_rate": 4.124694000882894e-06, "loss": 0.0248, "num_tokens": 137419030.0, "step": 1367 }, { "epoch": 3.11633875106929, "grad_norm": 1.5234375, "learning_rate": 4.123262045597447e-06, "loss": 0.0234, "num_tokens": 137519802.0, "step": 1368 }, { "epoch": 3.118619903051041, "grad_norm": 1.5859375, "learning_rate": 4.121829168979802e-06, "loss": 0.0242, "num_tokens": 137620479.0, "step": 1369 }, { "epoch": 3.1209010550327916, "grad_norm": 1.65625, "learning_rate": 4.120395371843231e-06, "loss": 0.0236, "num_tokens": 137720857.0, "step": 1370 }, { "epoch": 3.1231822070145423, "grad_norm": 1.421875, "learning_rate": 4.11896065500153e-06, "loss": 0.0218, "num_tokens": 137821131.0, "step": 1371 }, { "epoch": 3.125463358996293, "grad_norm": 1.3671875, "learning_rate": 4.117525019269016e-06, "loss": 0.0234, "num_tokens": 137921629.0, "step": 1372 }, { "epoch": 3.1277445109780437, "grad_norm": 1.75, "learning_rate": 4.116088465460529e-06, "loss": 0.0246, "num_tokens": 138022105.0, "step": 1373 }, { "epoch": 3.130025662959795, "grad_norm": 1.6953125, "learning_rate": 4.114650994391428e-06, "loss": 0.0279, "num_tokens": 138123067.0, "step": 1374 }, { "epoch": 3.1323068149415456, "grad_norm": 1.5390625, "learning_rate": 4.113212606877596e-06, "loss": 0.0223, "num_tokens": 138223389.0, "step": 1375 }, { "epoch": 3.1345879669232963, "grad_norm": 1.515625, "learning_rate": 4.111773303735432e-06, "loss": 0.0216, "num_tokens": 138324219.0, "step": 1376 }, { "epoch": 3.136869118905047, "grad_norm": 1.2734375, "learning_rate": 4.110333085781857e-06, "loss": 0.0202, "num_tokens": 138424612.0, "step": 1377 }, { "epoch": 3.1391502708867978, "grad_norm": 1.375, "learning_rate": 4.108891953834312e-06, "loss": 0.017, "num_tokens": 138524782.0, "step": 1378 }, { "epoch": 3.1414314228685485, "grad_norm": 1.609375, "learning_rate": 4.107449908710753e-06, "loss": 0.0232, "num_tokens": 138626000.0, "step": 1379 }, { "epoch": 3.143712574850299, "grad_norm": 1.6953125, "learning_rate": 4.106006951229661e-06, "loss": 0.0277, "num_tokens": 138726592.0, "step": 1380 }, { "epoch": 3.1459937268320504, "grad_norm": 1.4921875, "learning_rate": 4.104563082210028e-06, "loss": 0.0252, "num_tokens": 138827457.0, "step": 1381 }, { "epoch": 3.148274878813801, "grad_norm": 1.140625, "learning_rate": 4.1031183024713665e-06, "loss": 0.0179, "num_tokens": 138928032.0, "step": 1382 }, { "epoch": 3.150556030795552, "grad_norm": 0.9140625, "learning_rate": 4.101672612833706e-06, "loss": 0.0145, "num_tokens": 139028370.0, "step": 1383 }, { "epoch": 3.1528371827773025, "grad_norm": 1.46875, "learning_rate": 4.100226014117592e-06, "loss": 0.0166, "num_tokens": 139128743.0, "step": 1384 }, { "epoch": 3.1551183347590532, "grad_norm": 1.2421875, "learning_rate": 4.098778507144086e-06, "loss": 0.0188, "num_tokens": 139229397.0, "step": 1385 }, { "epoch": 3.157399486740804, "grad_norm": 1.46875, "learning_rate": 4.097330092734765e-06, "loss": 0.0223, "num_tokens": 139329448.0, "step": 1386 }, { "epoch": 3.1596806387225547, "grad_norm": 1.3359375, "learning_rate": 4.09588077171172e-06, "loss": 0.0198, "num_tokens": 139430520.0, "step": 1387 }, { "epoch": 3.161961790704306, "grad_norm": 1.3125, "learning_rate": 4.094430544897559e-06, "loss": 0.015, "num_tokens": 139530836.0, "step": 1388 }, { "epoch": 3.1642429426860565, "grad_norm": 1.75, "learning_rate": 4.092979413115404e-06, "loss": 0.0256, "num_tokens": 139631461.0, "step": 1389 }, { "epoch": 3.1665240946678073, "grad_norm": 1.59375, "learning_rate": 4.091527377188886e-06, "loss": 0.0265, "num_tokens": 139732602.0, "step": 1390 }, { "epoch": 3.168805246649558, "grad_norm": 1.8203125, "learning_rate": 4.090074437942155e-06, "loss": 0.0271, "num_tokens": 139833825.0, "step": 1391 }, { "epoch": 3.1710863986313087, "grad_norm": 1.28125, "learning_rate": 4.088620596199872e-06, "loss": 0.0162, "num_tokens": 139934670.0, "step": 1392 }, { "epoch": 3.1733675506130594, "grad_norm": 1.859375, "learning_rate": 4.087165852787206e-06, "loss": 0.0287, "num_tokens": 140035482.0, "step": 1393 }, { "epoch": 3.1756487025948106, "grad_norm": 1.9375, "learning_rate": 4.085710208529844e-06, "loss": 0.0322, "num_tokens": 140136806.0, "step": 1394 }, { "epoch": 3.1779298545765613, "grad_norm": 1.453125, "learning_rate": 4.084253664253981e-06, "loss": 0.0237, "num_tokens": 140237471.0, "step": 1395 }, { "epoch": 3.180211006558312, "grad_norm": 1.75, "learning_rate": 4.082796220786324e-06, "loss": 0.0267, "num_tokens": 140338561.0, "step": 1396 }, { "epoch": 3.1824921585400627, "grad_norm": 1.59375, "learning_rate": 4.081337878954088e-06, "loss": 0.0226, "num_tokens": 140439214.0, "step": 1397 }, { "epoch": 3.1847733105218134, "grad_norm": 1.3671875, "learning_rate": 4.079878639585002e-06, "loss": 0.0172, "num_tokens": 140540113.0, "step": 1398 }, { "epoch": 3.187054462503564, "grad_norm": 1.703125, "learning_rate": 4.0784185035072996e-06, "loss": 0.0242, "num_tokens": 140640175.0, "step": 1399 }, { "epoch": 3.189335614485315, "grad_norm": 1.203125, "learning_rate": 4.076957471549728e-06, "loss": 0.0209, "num_tokens": 140740472.0, "step": 1400 }, { "epoch": 3.191616766467066, "grad_norm": 1.8203125, "learning_rate": 4.0754955445415405e-06, "loss": 0.0268, "num_tokens": 140840930.0, "step": 1401 }, { "epoch": 3.1938979184488168, "grad_norm": 1.3984375, "learning_rate": 4.074032723312497e-06, "loss": 0.0193, "num_tokens": 140941443.0, "step": 1402 }, { "epoch": 3.1961790704305675, "grad_norm": 2.125, "learning_rate": 4.072569008692868e-06, "loss": 0.0306, "num_tokens": 141041879.0, "step": 1403 }, { "epoch": 3.198460222412318, "grad_norm": 1.7265625, "learning_rate": 4.071104401513429e-06, "loss": 0.028, "num_tokens": 141142699.0, "step": 1404 }, { "epoch": 3.200741374394069, "grad_norm": 1.6796875, "learning_rate": 4.069638902605464e-06, "loss": 0.0212, "num_tokens": 141244300.0, "step": 1405 }, { "epoch": 3.2030225263758196, "grad_norm": 1.3359375, "learning_rate": 4.06817251280076e-06, "loss": 0.0217, "num_tokens": 141344882.0, "step": 1406 }, { "epoch": 3.205303678357571, "grad_norm": 1.640625, "learning_rate": 4.0667052329316125e-06, "loss": 0.031, "num_tokens": 141445784.0, "step": 1407 }, { "epoch": 3.2075848303393215, "grad_norm": 1.5625, "learning_rate": 4.0652370638308215e-06, "loss": 0.024, "num_tokens": 141545937.0, "step": 1408 }, { "epoch": 3.2098659823210722, "grad_norm": 1.2421875, "learning_rate": 4.063768006331691e-06, "loss": 0.0204, "num_tokens": 141646523.0, "step": 1409 }, { "epoch": 3.212147134302823, "grad_norm": 1.3671875, "learning_rate": 4.06229806126803e-06, "loss": 0.0223, "num_tokens": 141747370.0, "step": 1410 }, { "epoch": 3.2144282862845737, "grad_norm": 1.1484375, "learning_rate": 4.06082722947415e-06, "loss": 0.0132, "num_tokens": 141848416.0, "step": 1411 }, { "epoch": 3.2167094382663244, "grad_norm": 1.6875, "learning_rate": 4.059355511784868e-06, "loss": 0.0273, "num_tokens": 141948761.0, "step": 1412 }, { "epoch": 3.218990590248075, "grad_norm": 1.5859375, "learning_rate": 4.057882909035503e-06, "loss": 0.0291, "num_tokens": 142049670.0, "step": 1413 }, { "epoch": 3.2212717422298263, "grad_norm": 1.4921875, "learning_rate": 4.0564094220618735e-06, "loss": 0.0231, "num_tokens": 142150151.0, "step": 1414 }, { "epoch": 3.223552894211577, "grad_norm": 1.2421875, "learning_rate": 4.054935051700305e-06, "loss": 0.0215, "num_tokens": 142250731.0, "step": 1415 }, { "epoch": 3.2258340461933277, "grad_norm": 1.5625, "learning_rate": 4.053459798787619e-06, "loss": 0.0255, "num_tokens": 142351511.0, "step": 1416 }, { "epoch": 3.2281151981750784, "grad_norm": 1.375, "learning_rate": 4.0519836641611425e-06, "loss": 0.0197, "num_tokens": 142452096.0, "step": 1417 }, { "epoch": 3.230396350156829, "grad_norm": 1.578125, "learning_rate": 4.050506648658701e-06, "loss": 0.0291, "num_tokens": 142552669.0, "step": 1418 }, { "epoch": 3.23267750213858, "grad_norm": 1.625, "learning_rate": 4.049028753118619e-06, "loss": 0.023, "num_tokens": 142653519.0, "step": 1419 }, { "epoch": 3.2349586541203306, "grad_norm": 1.7578125, "learning_rate": 4.047549978379721e-06, "loss": 0.0213, "num_tokens": 142753954.0, "step": 1420 }, { "epoch": 3.2372398061020817, "grad_norm": 1.8828125, "learning_rate": 4.046070325281333e-06, "loss": 0.024, "num_tokens": 142854462.0, "step": 1421 }, { "epoch": 3.2395209580838324, "grad_norm": 1.9609375, "learning_rate": 4.044589794663275e-06, "loss": 0.028, "num_tokens": 142955431.0, "step": 1422 }, { "epoch": 3.241802110065583, "grad_norm": 2.421875, "learning_rate": 4.04310838736587e-06, "loss": 0.0327, "num_tokens": 143056796.0, "step": 1423 }, { "epoch": 3.244083262047334, "grad_norm": 2.109375, "learning_rate": 4.041626104229937e-06, "loss": 0.0307, "num_tokens": 143157220.0, "step": 1424 }, { "epoch": 3.2463644140290846, "grad_norm": 1.4296875, "learning_rate": 4.0401429460967864e-06, "loss": 0.0251, "num_tokens": 143257026.0, "step": 1425 }, { "epoch": 3.2486455660108353, "grad_norm": 1.4453125, "learning_rate": 4.038658913808235e-06, "loss": 0.0252, "num_tokens": 143357931.0, "step": 1426 }, { "epoch": 3.250926717992586, "grad_norm": 1.6171875, "learning_rate": 4.037174008206589e-06, "loss": 0.0245, "num_tokens": 143457822.0, "step": 1427 }, { "epoch": 3.253207869974337, "grad_norm": 1.484375, "learning_rate": 4.035688230134651e-06, "loss": 0.0249, "num_tokens": 143558495.0, "step": 1428 }, { "epoch": 3.255489021956088, "grad_norm": 1.5546875, "learning_rate": 4.034201580435723e-06, "loss": 0.0223, "num_tokens": 143658973.0, "step": 1429 }, { "epoch": 3.2577701739378386, "grad_norm": 1.5546875, "learning_rate": 4.0327140599535954e-06, "loss": 0.0251, "num_tokens": 143759365.0, "step": 1430 }, { "epoch": 3.2600513259195893, "grad_norm": 1.8828125, "learning_rate": 4.031225669532558e-06, "loss": 0.0322, "num_tokens": 143859560.0, "step": 1431 }, { "epoch": 3.26233247790134, "grad_norm": 1.3828125, "learning_rate": 4.029736410017392e-06, "loss": 0.0209, "num_tokens": 143960209.0, "step": 1432 }, { "epoch": 3.264613629883091, "grad_norm": 1.4296875, "learning_rate": 4.028246282253373e-06, "loss": 0.0209, "num_tokens": 144060501.0, "step": 1433 }, { "epoch": 3.266894781864842, "grad_norm": 1.625, "learning_rate": 4.026755287086267e-06, "loss": 0.0284, "num_tokens": 144161419.0, "step": 1434 }, { "epoch": 3.2691759338465927, "grad_norm": 1.078125, "learning_rate": 4.025263425362335e-06, "loss": 0.019, "num_tokens": 144261642.0, "step": 1435 }, { "epoch": 3.2714570858283434, "grad_norm": 1.4375, "learning_rate": 4.0237706979283306e-06, "loss": 0.0245, "num_tokens": 144361990.0, "step": 1436 }, { "epoch": 3.273738237810094, "grad_norm": 1.1328125, "learning_rate": 4.022277105631495e-06, "loss": 0.0172, "num_tokens": 144462659.0, "step": 1437 }, { "epoch": 3.276019389791845, "grad_norm": 1.390625, "learning_rate": 4.020782649319563e-06, "loss": 0.0241, "num_tokens": 144563459.0, "step": 1438 }, { "epoch": 3.2783005417735955, "grad_norm": 1.265625, "learning_rate": 4.019287329840759e-06, "loss": 0.0196, "num_tokens": 144663810.0, "step": 1439 }, { "epoch": 3.2805816937553463, "grad_norm": 1.5, "learning_rate": 4.017791148043797e-06, "loss": 0.0212, "num_tokens": 144764424.0, "step": 1440 }, { "epoch": 3.2828628457370974, "grad_norm": 1.375, "learning_rate": 4.016294104777883e-06, "loss": 0.0213, "num_tokens": 144864969.0, "step": 1441 }, { "epoch": 3.285143997718848, "grad_norm": 1.7578125, "learning_rate": 4.0147962008927065e-06, "loss": 0.0203, "num_tokens": 144966555.0, "step": 1442 }, { "epoch": 3.287425149700599, "grad_norm": 1.4765625, "learning_rate": 4.013297437238452e-06, "loss": 0.0176, "num_tokens": 145066791.0, "step": 1443 }, { "epoch": 3.2897063016823496, "grad_norm": 1.5703125, "learning_rate": 4.011797814665787e-06, "loss": 0.0289, "num_tokens": 145167347.0, "step": 1444 }, { "epoch": 3.2919874536641003, "grad_norm": 1.7421875, "learning_rate": 4.010297334025869e-06, "loss": 0.0242, "num_tokens": 145268137.0, "step": 1445 }, { "epoch": 3.294268605645851, "grad_norm": 1.5390625, "learning_rate": 4.008795996170341e-06, "loss": 0.0177, "num_tokens": 145368721.0, "step": 1446 }, { "epoch": 3.296549757627602, "grad_norm": 1.5078125, "learning_rate": 4.0072938019513345e-06, "loss": 0.0248, "num_tokens": 145469836.0, "step": 1447 }, { "epoch": 3.298830909609353, "grad_norm": 1.65625, "learning_rate": 4.0057907522214646e-06, "loss": 0.0252, "num_tokens": 145571300.0, "step": 1448 }, { "epoch": 3.3011120615911036, "grad_norm": 1.421875, "learning_rate": 4.004286847833835e-06, "loss": 0.0181, "num_tokens": 145672270.0, "step": 1449 }, { "epoch": 3.3033932135728543, "grad_norm": 1.9921875, "learning_rate": 4.002782089642031e-06, "loss": 0.0248, "num_tokens": 145772690.0, "step": 1450 }, { "epoch": 3.305674365554605, "grad_norm": 1.296875, "learning_rate": 4.001276478500127e-06, "loss": 0.0241, "num_tokens": 145874241.0, "step": 1451 }, { "epoch": 3.3079555175363557, "grad_norm": 1.5078125, "learning_rate": 3.9997700152626755e-06, "loss": 0.0204, "num_tokens": 145974664.0, "step": 1452 }, { "epoch": 3.3102366695181065, "grad_norm": 1.3203125, "learning_rate": 3.9982627007847186e-06, "loss": 0.0217, "num_tokens": 146076331.0, "step": 1453 }, { "epoch": 3.312517821499857, "grad_norm": 1.546875, "learning_rate": 3.996754535921777e-06, "loss": 0.0227, "num_tokens": 146177170.0, "step": 1454 }, { "epoch": 3.3147989734816083, "grad_norm": 1.4609375, "learning_rate": 3.995245521529857e-06, "loss": 0.0192, "num_tokens": 146277670.0, "step": 1455 }, { "epoch": 3.317080125463359, "grad_norm": 1.5546875, "learning_rate": 3.993735658465446e-06, "loss": 0.0193, "num_tokens": 146377589.0, "step": 1456 }, { "epoch": 3.31936127744511, "grad_norm": 1.4765625, "learning_rate": 3.992224947585513e-06, "loss": 0.0217, "num_tokens": 146478403.0, "step": 1457 }, { "epoch": 3.3216424294268605, "grad_norm": 1.2265625, "learning_rate": 3.990713389747508e-06, "loss": 0.0159, "num_tokens": 146578895.0, "step": 1458 }, { "epoch": 3.323923581408611, "grad_norm": 1.2421875, "learning_rate": 3.989200985809362e-06, "loss": 0.0213, "num_tokens": 146679504.0, "step": 1459 }, { "epoch": 3.3262047333903624, "grad_norm": 1.25, "learning_rate": 3.987687736629487e-06, "loss": 0.0175, "num_tokens": 146779810.0, "step": 1460 }, { "epoch": 3.328485885372113, "grad_norm": 1.7421875, "learning_rate": 3.986173643066774e-06, "loss": 0.0257, "num_tokens": 146880118.0, "step": 1461 }, { "epoch": 3.330767037353864, "grad_norm": 1.5859375, "learning_rate": 3.984658705980593e-06, "loss": 0.0271, "num_tokens": 146981443.0, "step": 1462 }, { "epoch": 3.3330481893356145, "grad_norm": 1.640625, "learning_rate": 3.983142926230792e-06, "loss": 0.0273, "num_tokens": 147082424.0, "step": 1463 }, { "epoch": 3.3353293413173652, "grad_norm": 1.6015625, "learning_rate": 3.981626304677701e-06, "loss": 0.0233, "num_tokens": 147183335.0, "step": 1464 }, { "epoch": 3.337610493299116, "grad_norm": 1.8671875, "learning_rate": 3.980108842182121e-06, "loss": 0.0248, "num_tokens": 147284344.0, "step": 1465 }, { "epoch": 3.3398916452808667, "grad_norm": 1.5390625, "learning_rate": 3.978590539605338e-06, "loss": 0.0265, "num_tokens": 147385858.0, "step": 1466 }, { "epoch": 3.3421727972626174, "grad_norm": 1.40625, "learning_rate": 3.97707139780911e-06, "loss": 0.0206, "num_tokens": 147487344.0, "step": 1467 }, { "epoch": 3.3444539492443686, "grad_norm": 1.40625, "learning_rate": 3.975551417655673e-06, "loss": 0.0214, "num_tokens": 147587662.0, "step": 1468 }, { "epoch": 3.3467351012261193, "grad_norm": 1.3046875, "learning_rate": 3.974030600007737e-06, "loss": 0.0184, "num_tokens": 147688090.0, "step": 1469 }, { "epoch": 3.34901625320787, "grad_norm": 1.4140625, "learning_rate": 3.97250894572849e-06, "loss": 0.0222, "num_tokens": 147788638.0, "step": 1470 }, { "epoch": 3.3512974051896207, "grad_norm": 1.7421875, "learning_rate": 3.970986455681593e-06, "loss": 0.0272, "num_tokens": 147889712.0, "step": 1471 }, { "epoch": 3.3535785571713714, "grad_norm": 1.6875, "learning_rate": 3.969463130731183e-06, "loss": 0.0248, "num_tokens": 147989729.0, "step": 1472 }, { "epoch": 3.355859709153122, "grad_norm": 1.359375, "learning_rate": 3.967938971741869e-06, "loss": 0.0188, "num_tokens": 148090935.0, "step": 1473 }, { "epoch": 3.3581408611348733, "grad_norm": 1.6484375, "learning_rate": 3.966413979578734e-06, "loss": 0.0264, "num_tokens": 148191677.0, "step": 1474 }, { "epoch": 3.360422013116624, "grad_norm": 1.765625, "learning_rate": 3.964888155107335e-06, "loss": 0.0238, "num_tokens": 148292185.0, "step": 1475 }, { "epoch": 3.3627031650983747, "grad_norm": 1.46875, "learning_rate": 3.963361499193699e-06, "loss": 0.0217, "num_tokens": 148393248.0, "step": 1476 }, { "epoch": 3.3649843170801255, "grad_norm": 1.7578125, "learning_rate": 3.9618340127043274e-06, "loss": 0.0289, "num_tokens": 148493697.0, "step": 1477 }, { "epoch": 3.367265469061876, "grad_norm": 1.3828125, "learning_rate": 3.960305696506192e-06, "loss": 0.0169, "num_tokens": 148594314.0, "step": 1478 }, { "epoch": 3.369546621043627, "grad_norm": 1.3671875, "learning_rate": 3.958776551466737e-06, "loss": 0.0181, "num_tokens": 148694747.0, "step": 1479 }, { "epoch": 3.3718277730253776, "grad_norm": 1.609375, "learning_rate": 3.957246578453873e-06, "loss": 0.0265, "num_tokens": 148794569.0, "step": 1480 }, { "epoch": 3.374108925007129, "grad_norm": 1.65625, "learning_rate": 3.955715778335984e-06, "loss": 0.0289, "num_tokens": 148894837.0, "step": 1481 }, { "epoch": 3.3763900769888795, "grad_norm": 1.296875, "learning_rate": 3.954184151981924e-06, "loss": 0.0177, "num_tokens": 148995244.0, "step": 1482 }, { "epoch": 3.37867122897063, "grad_norm": 1.3203125, "learning_rate": 3.952651700261012e-06, "loss": 0.0194, "num_tokens": 149095755.0, "step": 1483 }, { "epoch": 3.380952380952381, "grad_norm": 1.7890625, "learning_rate": 3.95111842404304e-06, "loss": 0.023, "num_tokens": 149197276.0, "step": 1484 }, { "epoch": 3.3832335329341316, "grad_norm": 1.328125, "learning_rate": 3.949584324198266e-06, "loss": 0.0202, "num_tokens": 149298139.0, "step": 1485 }, { "epoch": 3.3855146849158824, "grad_norm": 1.3203125, "learning_rate": 3.948049401597414e-06, "loss": 0.0218, "num_tokens": 149398844.0, "step": 1486 }, { "epoch": 3.3877958368976335, "grad_norm": 1.2734375, "learning_rate": 3.946513657111678e-06, "loss": 0.0174, "num_tokens": 149499930.0, "step": 1487 }, { "epoch": 3.3900769888793842, "grad_norm": 1.5234375, "learning_rate": 3.944977091612716e-06, "loss": 0.0239, "num_tokens": 149600586.0, "step": 1488 }, { "epoch": 3.392358140861135, "grad_norm": 1.4921875, "learning_rate": 3.943439705972654e-06, "loss": 0.026, "num_tokens": 149701192.0, "step": 1489 }, { "epoch": 3.3946392928428857, "grad_norm": 1.7890625, "learning_rate": 3.94190150106408e-06, "loss": 0.0225, "num_tokens": 149801831.0, "step": 1490 }, { "epoch": 3.3969204448246364, "grad_norm": 1.34375, "learning_rate": 3.9403624777600526e-06, "loss": 0.0198, "num_tokens": 149902335.0, "step": 1491 }, { "epoch": 3.399201596806387, "grad_norm": 1.6015625, "learning_rate": 3.938822636934089e-06, "loss": 0.021, "num_tokens": 150002785.0, "step": 1492 }, { "epoch": 3.401482748788138, "grad_norm": 1.375, "learning_rate": 3.937281979460175e-06, "loss": 0.0179, "num_tokens": 150103431.0, "step": 1493 }, { "epoch": 3.4037639007698886, "grad_norm": 1.7109375, "learning_rate": 3.9357405062127565e-06, "loss": 0.0229, "num_tokens": 150203923.0, "step": 1494 }, { "epoch": 3.4060450527516397, "grad_norm": 2.234375, "learning_rate": 3.934198218066745e-06, "loss": 0.027, "num_tokens": 150305013.0, "step": 1495 }, { "epoch": 3.4083262047333904, "grad_norm": 1.671875, "learning_rate": 3.932655115897513e-06, "loss": 0.0239, "num_tokens": 150405459.0, "step": 1496 }, { "epoch": 3.410607356715141, "grad_norm": 1.546875, "learning_rate": 3.9311112005808955e-06, "loss": 0.0242, "num_tokens": 150506140.0, "step": 1497 }, { "epoch": 3.412888508696892, "grad_norm": 1.3515625, "learning_rate": 3.92956647299319e-06, "loss": 0.0243, "num_tokens": 150606800.0, "step": 1498 }, { "epoch": 3.4151696606786426, "grad_norm": 3.03125, "learning_rate": 3.928020934011153e-06, "loss": 0.0248, "num_tokens": 150707467.0, "step": 1499 }, { "epoch": 3.4174508126603937, "grad_norm": 1.40625, "learning_rate": 3.926474584512002e-06, "loss": 0.0177, "num_tokens": 150807896.0, "step": 1500 }, { "epoch": 3.4197319646421445, "grad_norm": 1.953125, "learning_rate": 3.924927425373417e-06, "loss": 0.0302, "num_tokens": 150908764.0, "step": 1501 }, { "epoch": 3.422013116623895, "grad_norm": 1.953125, "learning_rate": 3.9233794574735345e-06, "loss": 0.0298, "num_tokens": 151009714.0, "step": 1502 }, { "epoch": 3.424294268605646, "grad_norm": 1.375, "learning_rate": 3.921830681690951e-06, "loss": 0.0192, "num_tokens": 151109505.0, "step": 1503 }, { "epoch": 3.4265754205873966, "grad_norm": 1.5390625, "learning_rate": 3.920281098904722e-06, "loss": 0.0212, "num_tokens": 151210446.0, "step": 1504 }, { "epoch": 3.4288565725691473, "grad_norm": 1.7265625, "learning_rate": 3.918730709994361e-06, "loss": 0.0291, "num_tokens": 151311178.0, "step": 1505 }, { "epoch": 3.431137724550898, "grad_norm": 1.828125, "learning_rate": 3.91717951583984e-06, "loss": 0.0227, "num_tokens": 151411601.0, "step": 1506 }, { "epoch": 3.4334188765326488, "grad_norm": 1.4140625, "learning_rate": 3.915627517321584e-06, "loss": 0.0216, "num_tokens": 151512545.0, "step": 1507 }, { "epoch": 3.4357000285144, "grad_norm": 1.3515625, "learning_rate": 3.914074715320479e-06, "loss": 0.0194, "num_tokens": 151613317.0, "step": 1508 }, { "epoch": 3.4379811804961506, "grad_norm": 1.5, "learning_rate": 3.912521110717866e-06, "loss": 0.0257, "num_tokens": 151714245.0, "step": 1509 }, { "epoch": 3.4402623324779014, "grad_norm": 1.3515625, "learning_rate": 3.9109667043955405e-06, "loss": 0.0225, "num_tokens": 151814790.0, "step": 1510 }, { "epoch": 3.442543484459652, "grad_norm": 1.0234375, "learning_rate": 3.909411497235752e-06, "loss": 0.0118, "num_tokens": 151915245.0, "step": 1511 }, { "epoch": 3.444824636441403, "grad_norm": 1.6640625, "learning_rate": 3.907855490121208e-06, "loss": 0.0245, "num_tokens": 152015877.0, "step": 1512 }, { "epoch": 3.4471057884231535, "grad_norm": 1.5234375, "learning_rate": 3.906298683935068e-06, "loss": 0.0284, "num_tokens": 152116496.0, "step": 1513 }, { "epoch": 3.4493869404049047, "grad_norm": 1.9453125, "learning_rate": 3.904741079560944e-06, "loss": 0.0299, "num_tokens": 152217492.0, "step": 1514 }, { "epoch": 3.4516680923866554, "grad_norm": 1.46875, "learning_rate": 3.903182677882904e-06, "loss": 0.0204, "num_tokens": 152318355.0, "step": 1515 }, { "epoch": 3.453949244368406, "grad_norm": 1.9375, "learning_rate": 3.901623479785465e-06, "loss": 0.0274, "num_tokens": 152418635.0, "step": 1516 }, { "epoch": 3.456230396350157, "grad_norm": 1.4453125, "learning_rate": 3.900063486153598e-06, "loss": 0.0208, "num_tokens": 152519374.0, "step": 1517 }, { "epoch": 3.4585115483319075, "grad_norm": 1.671875, "learning_rate": 3.898502697872725e-06, "loss": 0.0216, "num_tokens": 152620117.0, "step": 1518 }, { "epoch": 3.4607927003136583, "grad_norm": 1.3984375, "learning_rate": 3.896941115828721e-06, "loss": 0.0235, "num_tokens": 152720647.0, "step": 1519 }, { "epoch": 3.463073852295409, "grad_norm": 1.484375, "learning_rate": 3.895378740907908e-06, "loss": 0.0177, "num_tokens": 152820724.0, "step": 1520 }, { "epoch": 3.46535500427716, "grad_norm": 1.484375, "learning_rate": 3.89381557399706e-06, "loss": 0.0208, "num_tokens": 152921415.0, "step": 1521 }, { "epoch": 3.467636156258911, "grad_norm": 1.5703125, "learning_rate": 3.892251615983401e-06, "loss": 0.0235, "num_tokens": 153022274.0, "step": 1522 }, { "epoch": 3.4699173082406616, "grad_norm": 1.578125, "learning_rate": 3.890686867754604e-06, "loss": 0.0229, "num_tokens": 153122753.0, "step": 1523 }, { "epoch": 3.4721984602224123, "grad_norm": 1.4296875, "learning_rate": 3.889121330198788e-06, "loss": 0.0229, "num_tokens": 153223504.0, "step": 1524 }, { "epoch": 3.474479612204163, "grad_norm": 1.125, "learning_rate": 3.887555004204524e-06, "loss": 0.0171, "num_tokens": 153324537.0, "step": 1525 }, { "epoch": 3.4767607641859137, "grad_norm": 1.5859375, "learning_rate": 3.885987890660828e-06, "loss": 0.0231, "num_tokens": 153425491.0, "step": 1526 }, { "epoch": 3.479041916167665, "grad_norm": 1.7578125, "learning_rate": 3.884419990457161e-06, "loss": 0.0335, "num_tokens": 153526119.0, "step": 1527 }, { "epoch": 3.4813230681494156, "grad_norm": 1.59375, "learning_rate": 3.882851304483436e-06, "loss": 0.0244, "num_tokens": 153626671.0, "step": 1528 }, { "epoch": 3.4836042201311663, "grad_norm": 1.4296875, "learning_rate": 3.881281833630007e-06, "loss": 0.0189, "num_tokens": 153727998.0, "step": 1529 }, { "epoch": 3.485885372112917, "grad_norm": 1.4765625, "learning_rate": 3.879711578787676e-06, "loss": 0.0277, "num_tokens": 153829147.0, "step": 1530 }, { "epoch": 3.4881665240946678, "grad_norm": 1.5703125, "learning_rate": 3.87814054084769e-06, "loss": 0.0166, "num_tokens": 153930060.0, "step": 1531 }, { "epoch": 3.4904476760764185, "grad_norm": 1.296875, "learning_rate": 3.8765687207017375e-06, "loss": 0.0178, "num_tokens": 154030787.0, "step": 1532 }, { "epoch": 3.492728828058169, "grad_norm": 1.328125, "learning_rate": 3.874996119241956e-06, "loss": 0.018, "num_tokens": 154131611.0, "step": 1533 }, { "epoch": 3.49500998003992, "grad_norm": 1.21875, "learning_rate": 3.873422737360922e-06, "loss": 0.0194, "num_tokens": 154232193.0, "step": 1534 }, { "epoch": 3.497291132021671, "grad_norm": 1.5625, "learning_rate": 3.871848575951658e-06, "loss": 0.0268, "num_tokens": 154332805.0, "step": 1535 }, { "epoch": 3.499572284003422, "grad_norm": 1.4375, "learning_rate": 3.8702736359076265e-06, "loss": 0.0219, "num_tokens": 154433711.0, "step": 1536 }, { "epoch": 3.5018534359851725, "grad_norm": 2.1875, "learning_rate": 3.868697918122733e-06, "loss": 0.0305, "num_tokens": 154534217.0, "step": 1537 }, { "epoch": 3.5041345879669232, "grad_norm": 1.40625, "learning_rate": 3.867121423491325e-06, "loss": 0.017, "num_tokens": 154635225.0, "step": 1538 }, { "epoch": 3.506415739948674, "grad_norm": 1.5625, "learning_rate": 3.86554415290819e-06, "loss": 0.0263, "num_tokens": 154736007.0, "step": 1539 }, { "epoch": 3.508696891930425, "grad_norm": 1.4921875, "learning_rate": 3.8639661072685575e-06, "loss": 0.0254, "num_tokens": 154836505.0, "step": 1540 }, { "epoch": 3.508696891930425, "eval_loss": 0.03198961541056633, "eval_num_tokens": 154836505.0, "eval_runtime": 52.6964, "eval_samples_per_second": 159.119, "eval_steps_per_second": 4.991, "step": 1540 }, { "epoch": 3.510978043912176, "grad_norm": 1.34375, "learning_rate": 3.862387287468095e-06, "loss": 0.0219, "num_tokens": 154937389.0, "step": 1541 }, { "epoch": 3.5132591958939265, "grad_norm": 2.0, "learning_rate": 3.860807694402909e-06, "loss": 0.0291, "num_tokens": 155037683.0, "step": 1542 }, { "epoch": 3.5155403478756773, "grad_norm": 1.59375, "learning_rate": 3.859227328969547e-06, "loss": 0.0213, "num_tokens": 155138857.0, "step": 1543 }, { "epoch": 3.517821499857428, "grad_norm": 1.6875, "learning_rate": 3.857646192064995e-06, "loss": 0.0191, "num_tokens": 155239664.0, "step": 1544 }, { "epoch": 3.5201026518391787, "grad_norm": 1.5859375, "learning_rate": 3.856064284586674e-06, "loss": 0.0194, "num_tokens": 155340176.0, "step": 1545 }, { "epoch": 3.5223838038209294, "grad_norm": 1.2890625, "learning_rate": 3.854481607432445e-06, "loss": 0.0141, "num_tokens": 155440566.0, "step": 1546 }, { "epoch": 3.52466495580268, "grad_norm": 1.4921875, "learning_rate": 3.852898161500605e-06, "loss": 0.0216, "num_tokens": 155541423.0, "step": 1547 }, { "epoch": 3.5269461077844313, "grad_norm": 1.8046875, "learning_rate": 3.851313947689888e-06, "loss": 0.029, "num_tokens": 155641809.0, "step": 1548 }, { "epoch": 3.529227259766182, "grad_norm": 2.171875, "learning_rate": 3.849728966899462e-06, "loss": 0.0258, "num_tokens": 155742640.0, "step": 1549 }, { "epoch": 3.5315084117479327, "grad_norm": 1.3046875, "learning_rate": 3.848143220028931e-06, "loss": 0.0184, "num_tokens": 155842428.0, "step": 1550 }, { "epoch": 3.5337895637296834, "grad_norm": 0.921875, "learning_rate": 3.846556707978337e-06, "loss": 0.0127, "num_tokens": 155942983.0, "step": 1551 }, { "epoch": 3.536070715711434, "grad_norm": 1.234375, "learning_rate": 3.844969431648151e-06, "loss": 0.0162, "num_tokens": 156043222.0, "step": 1552 }, { "epoch": 3.5383518676931853, "grad_norm": 1.5078125, "learning_rate": 3.843381391939281e-06, "loss": 0.0228, "num_tokens": 156143963.0, "step": 1553 }, { "epoch": 3.540633019674936, "grad_norm": 1.4296875, "learning_rate": 3.841792589753067e-06, "loss": 0.0193, "num_tokens": 156244810.0, "step": 1554 }, { "epoch": 3.5429141716566868, "grad_norm": 1.2890625, "learning_rate": 3.840203025991285e-06, "loss": 0.0161, "num_tokens": 156345580.0, "step": 1555 }, { "epoch": 3.5451953236384375, "grad_norm": 1.3125, "learning_rate": 3.838612701556138e-06, "loss": 0.0168, "num_tokens": 156446497.0, "step": 1556 }, { "epoch": 3.547476475620188, "grad_norm": 1.4375, "learning_rate": 3.837021617350266e-06, "loss": 0.0154, "num_tokens": 156547059.0, "step": 1557 }, { "epoch": 3.549757627601939, "grad_norm": 1.453125, "learning_rate": 3.8354297742767345e-06, "loss": 0.0207, "num_tokens": 156647882.0, "step": 1558 }, { "epoch": 3.5520387795836896, "grad_norm": 1.578125, "learning_rate": 3.833837173239044e-06, "loss": 0.0248, "num_tokens": 156748666.0, "step": 1559 }, { "epoch": 3.5543199315654403, "grad_norm": 1.640625, "learning_rate": 3.832243815141126e-06, "loss": 0.0265, "num_tokens": 156848966.0, "step": 1560 }, { "epoch": 3.556601083547191, "grad_norm": 2.015625, "learning_rate": 3.830649700887339e-06, "loss": 0.0252, "num_tokens": 156949306.0, "step": 1561 }, { "epoch": 3.5588822355289422, "grad_norm": 1.7421875, "learning_rate": 3.829054831382471e-06, "loss": 0.0237, "num_tokens": 157050181.0, "step": 1562 }, { "epoch": 3.561163387510693, "grad_norm": 1.7890625, "learning_rate": 3.827459207531739e-06, "loss": 0.029, "num_tokens": 157151050.0, "step": 1563 }, { "epoch": 3.5634445394924437, "grad_norm": 1.484375, "learning_rate": 3.825862830240787e-06, "loss": 0.02, "num_tokens": 157251787.0, "step": 1564 }, { "epoch": 3.5657256914741944, "grad_norm": 1.578125, "learning_rate": 3.82426570041569e-06, "loss": 0.0203, "num_tokens": 157351789.0, "step": 1565 }, { "epoch": 3.568006843455945, "grad_norm": 1.546875, "learning_rate": 3.822667818962948e-06, "loss": 0.0216, "num_tokens": 157452324.0, "step": 1566 }, { "epoch": 3.5702879954376963, "grad_norm": 1.6640625, "learning_rate": 3.821069186789486e-06, "loss": 0.0225, "num_tokens": 157552728.0, "step": 1567 }, { "epoch": 3.572569147419447, "grad_norm": 1.4375, "learning_rate": 3.819469804802659e-06, "loss": 0.0247, "num_tokens": 157653441.0, "step": 1568 }, { "epoch": 3.5748502994011977, "grad_norm": 1.828125, "learning_rate": 3.8178696739102435e-06, "loss": 0.0298, "num_tokens": 157754125.0, "step": 1569 }, { "epoch": 3.5771314513829484, "grad_norm": 1.234375, "learning_rate": 3.816268795020443e-06, "loss": 0.0163, "num_tokens": 157855142.0, "step": 1570 }, { "epoch": 3.579412603364699, "grad_norm": 1.4453125, "learning_rate": 3.814667169041887e-06, "loss": 0.0225, "num_tokens": 157955277.0, "step": 1571 }, { "epoch": 3.58169375534645, "grad_norm": 1.859375, "learning_rate": 3.8130647968836254e-06, "loss": 0.0239, "num_tokens": 158055856.0, "step": 1572 }, { "epoch": 3.5839749073282006, "grad_norm": 1.1640625, "learning_rate": 3.811461679455136e-06, "loss": 0.0162, "num_tokens": 158156097.0, "step": 1573 }, { "epoch": 3.5862560593099513, "grad_norm": 1.46875, "learning_rate": 3.809857817666316e-06, "loss": 0.0233, "num_tokens": 158257375.0, "step": 1574 }, { "epoch": 3.5885372112917024, "grad_norm": 1.53125, "learning_rate": 3.808253212427486e-06, "loss": 0.0261, "num_tokens": 158358103.0, "step": 1575 }, { "epoch": 3.590818363273453, "grad_norm": 1.1875, "learning_rate": 3.8066478646493898e-06, "loss": 0.0173, "num_tokens": 158458739.0, "step": 1576 }, { "epoch": 3.593099515255204, "grad_norm": 1.6015625, "learning_rate": 3.805041775243191e-06, "loss": 0.0258, "num_tokens": 158559505.0, "step": 1577 }, { "epoch": 3.5953806672369546, "grad_norm": 1.5546875, "learning_rate": 3.803434945120475e-06, "loss": 0.0222, "num_tokens": 158660319.0, "step": 1578 }, { "epoch": 3.5976618192187053, "grad_norm": 1.4765625, "learning_rate": 3.801827375193249e-06, "loss": 0.0225, "num_tokens": 158760806.0, "step": 1579 }, { "epoch": 3.5999429712004565, "grad_norm": 1.5703125, "learning_rate": 3.8002190663739362e-06, "loss": 0.0218, "num_tokens": 158861377.0, "step": 1580 }, { "epoch": 3.602224123182207, "grad_norm": 1.3125, "learning_rate": 3.798610019575384e-06, "loss": 0.0168, "num_tokens": 158961593.0, "step": 1581 }, { "epoch": 3.604505275163958, "grad_norm": 2.203125, "learning_rate": 3.7970002357108554e-06, "loss": 0.0257, "num_tokens": 159061992.0, "step": 1582 }, { "epoch": 3.6067864271457086, "grad_norm": 1.6484375, "learning_rate": 3.7953897156940323e-06, "loss": 0.0287, "num_tokens": 159162360.0, "step": 1583 }, { "epoch": 3.6090675791274593, "grad_norm": 1.703125, "learning_rate": 3.793778460439015e-06, "loss": 0.0267, "num_tokens": 159262711.0, "step": 1584 }, { "epoch": 3.61134873110921, "grad_norm": 1.7109375, "learning_rate": 3.792166470860321e-06, "loss": 0.0309, "num_tokens": 159363109.0, "step": 1585 }, { "epoch": 3.613629883090961, "grad_norm": 1.71875, "learning_rate": 3.790553747872885e-06, "loss": 0.0264, "num_tokens": 159463496.0, "step": 1586 }, { "epoch": 3.6159110350727115, "grad_norm": 1.625, "learning_rate": 3.788940292392056e-06, "loss": 0.0227, "num_tokens": 159564636.0, "step": 1587 }, { "epoch": 3.6181921870544627, "grad_norm": 1.484375, "learning_rate": 3.787326105333601e-06, "loss": 0.0262, "num_tokens": 159665788.0, "step": 1588 }, { "epoch": 3.6204733390362134, "grad_norm": 1.671875, "learning_rate": 3.7857111876137017e-06, "loss": 0.0261, "num_tokens": 159766222.0, "step": 1589 }, { "epoch": 3.622754491017964, "grad_norm": 1.5234375, "learning_rate": 3.784095540148954e-06, "loss": 0.0288, "num_tokens": 159866629.0, "step": 1590 }, { "epoch": 3.625035642999715, "grad_norm": 1.5234375, "learning_rate": 3.7824791638563674e-06, "loss": 0.0262, "num_tokens": 159967057.0, "step": 1591 }, { "epoch": 3.6273167949814655, "grad_norm": 1.390625, "learning_rate": 3.7808620596533675e-06, "loss": 0.0164, "num_tokens": 160067818.0, "step": 1592 }, { "epoch": 3.6295979469632167, "grad_norm": 1.6953125, "learning_rate": 3.77924422845779e-06, "loss": 0.0332, "num_tokens": 160168074.0, "step": 1593 }, { "epoch": 3.6318790989449674, "grad_norm": 1.578125, "learning_rate": 3.7776256711878856e-06, "loss": 0.0284, "num_tokens": 160268605.0, "step": 1594 }, { "epoch": 3.634160250926718, "grad_norm": 1.6484375, "learning_rate": 3.7760063887623155e-06, "loss": 0.0244, "num_tokens": 160368996.0, "step": 1595 }, { "epoch": 3.636441402908469, "grad_norm": 1.03125, "learning_rate": 3.7743863821001538e-06, "loss": 0.0217, "num_tokens": 160470358.0, "step": 1596 }, { "epoch": 3.6387225548902196, "grad_norm": 1.484375, "learning_rate": 3.7727656521208843e-06, "loss": 0.019, "num_tokens": 160570701.0, "step": 1597 }, { "epoch": 3.6410037068719703, "grad_norm": 1.546875, "learning_rate": 3.771144199744402e-06, "loss": 0.0257, "num_tokens": 160670855.0, "step": 1598 }, { "epoch": 3.643284858853721, "grad_norm": 1.234375, "learning_rate": 3.7695220258910124e-06, "loss": 0.0223, "num_tokens": 160771321.0, "step": 1599 }, { "epoch": 3.6455660108354717, "grad_norm": 1.78125, "learning_rate": 3.7678991314814305e-06, "loss": 0.0241, "num_tokens": 160871550.0, "step": 1600 }, { "epoch": 3.6478471628172224, "grad_norm": 1.3359375, "learning_rate": 3.766275517436779e-06, "loss": 0.0178, "num_tokens": 160972097.0, "step": 1601 }, { "epoch": 3.6501283147989736, "grad_norm": 1.4765625, "learning_rate": 3.7646511846785904e-06, "loss": 0.0253, "num_tokens": 161073034.0, "step": 1602 }, { "epoch": 3.6524094667807243, "grad_norm": 1.46875, "learning_rate": 3.7630261341288044e-06, "loss": 0.0195, "num_tokens": 161173544.0, "step": 1603 }, { "epoch": 3.654690618762475, "grad_norm": 1.7265625, "learning_rate": 3.7614003667097674e-06, "loss": 0.0295, "num_tokens": 161274116.0, "step": 1604 }, { "epoch": 3.6569717707442257, "grad_norm": 1.796875, "learning_rate": 3.759773883344236e-06, "loss": 0.0246, "num_tokens": 161374240.0, "step": 1605 }, { "epoch": 3.6592529227259765, "grad_norm": 1.421875, "learning_rate": 3.7581466849553685e-06, "loss": 0.0176, "num_tokens": 161475713.0, "step": 1606 }, { "epoch": 3.6615340747077276, "grad_norm": 1.2734375, "learning_rate": 3.7565187724667324e-06, "loss": 0.0206, "num_tokens": 161576701.0, "step": 1607 }, { "epoch": 3.6638152266894783, "grad_norm": 1.4375, "learning_rate": 3.7548901468022993e-06, "loss": 0.0184, "num_tokens": 161677241.0, "step": 1608 }, { "epoch": 3.666096378671229, "grad_norm": 1.3984375, "learning_rate": 3.7532608088864444e-06, "loss": 0.0183, "num_tokens": 161777705.0, "step": 1609 }, { "epoch": 3.66837753065298, "grad_norm": 1.6328125, "learning_rate": 3.75163075964395e-06, "loss": 0.0247, "num_tokens": 161878306.0, "step": 1610 }, { "epoch": 3.6706586826347305, "grad_norm": 1.453125, "learning_rate": 3.7500000000000005e-06, "loss": 0.0285, "num_tokens": 161979295.0, "step": 1611 }, { "epoch": 3.672939834616481, "grad_norm": 1.3046875, "learning_rate": 3.748368530880183e-06, "loss": 0.0166, "num_tokens": 162079274.0, "step": 1612 }, { "epoch": 3.675220986598232, "grad_norm": 1.109375, "learning_rate": 3.7467363532104874e-06, "loss": 0.018, "num_tokens": 162180263.0, "step": 1613 }, { "epoch": 3.6775021385799826, "grad_norm": 1.390625, "learning_rate": 3.7451034679173082e-06, "loss": 0.0244, "num_tokens": 162280757.0, "step": 1614 }, { "epoch": 3.679783290561734, "grad_norm": 1.4453125, "learning_rate": 3.7434698759274366e-06, "loss": 0.0226, "num_tokens": 162382000.0, "step": 1615 }, { "epoch": 3.6820644425434845, "grad_norm": 1.5234375, "learning_rate": 3.741835578168071e-06, "loss": 0.0203, "num_tokens": 162483023.0, "step": 1616 }, { "epoch": 3.6843455945252352, "grad_norm": 1.703125, "learning_rate": 3.740200575566806e-06, "loss": 0.0233, "num_tokens": 162583462.0, "step": 1617 }, { "epoch": 3.686626746506986, "grad_norm": 1.6640625, "learning_rate": 3.7385648690516364e-06, "loss": 0.0218, "num_tokens": 162684149.0, "step": 1618 }, { "epoch": 3.6889078984887367, "grad_norm": 1.5859375, "learning_rate": 3.7369284595509587e-06, "loss": 0.0224, "num_tokens": 162785084.0, "step": 1619 }, { "epoch": 3.691189050470488, "grad_norm": 1.3984375, "learning_rate": 3.7352913479935672e-06, "loss": 0.02, "num_tokens": 162886112.0, "step": 1620 }, { "epoch": 3.6934702024522386, "grad_norm": 1.953125, "learning_rate": 3.7336535353086546e-06, "loss": 0.0267, "num_tokens": 162987123.0, "step": 1621 }, { "epoch": 3.6957513544339893, "grad_norm": 1.859375, "learning_rate": 3.7320150224258124e-06, "loss": 0.0228, "num_tokens": 163087799.0, "step": 1622 }, { "epoch": 3.69803250641574, "grad_norm": 1.3515625, "learning_rate": 3.7303758102750274e-06, "loss": 0.0212, "num_tokens": 163188627.0, "step": 1623 }, { "epoch": 3.7003136583974907, "grad_norm": 1.8359375, "learning_rate": 3.7287358997866872e-06, "loss": 0.0228, "num_tokens": 163289045.0, "step": 1624 }, { "epoch": 3.7025948103792414, "grad_norm": 1.625, "learning_rate": 3.7270952918915715e-06, "loss": 0.0232, "num_tokens": 163389305.0, "step": 1625 }, { "epoch": 3.704875962360992, "grad_norm": 1.65625, "learning_rate": 3.7254539875208577e-06, "loss": 0.0193, "num_tokens": 163489911.0, "step": 1626 }, { "epoch": 3.707157114342743, "grad_norm": 1.546875, "learning_rate": 3.7238119876061196e-06, "loss": 0.0174, "num_tokens": 163590374.0, "step": 1627 }, { "epoch": 3.709438266324494, "grad_norm": 1.28125, "learning_rate": 3.7221692930793234e-06, "loss": 0.0199, "num_tokens": 163690951.0, "step": 1628 }, { "epoch": 3.7117194183062447, "grad_norm": 1.734375, "learning_rate": 3.7205259048728316e-06, "loss": 0.028, "num_tokens": 163792185.0, "step": 1629 }, { "epoch": 3.7140005702879955, "grad_norm": 1.7734375, "learning_rate": 3.718881823919399e-06, "loss": 0.0231, "num_tokens": 163893288.0, "step": 1630 }, { "epoch": 3.716281722269746, "grad_norm": 1.1640625, "learning_rate": 3.717237051152175e-06, "loss": 0.0114, "num_tokens": 163994076.0, "step": 1631 }, { "epoch": 3.718562874251497, "grad_norm": 1.4453125, "learning_rate": 3.7155915875047005e-06, "loss": 0.0221, "num_tokens": 164094784.0, "step": 1632 }, { "epoch": 3.720844026233248, "grad_norm": 1.96875, "learning_rate": 3.7139454339109082e-06, "loss": 0.0229, "num_tokens": 164195178.0, "step": 1633 }, { "epoch": 3.7231251782149988, "grad_norm": 1.3828125, "learning_rate": 3.7122985913051242e-06, "loss": 0.02, "num_tokens": 164296063.0, "step": 1634 }, { "epoch": 3.7254063301967495, "grad_norm": 1.3203125, "learning_rate": 3.710651060622064e-06, "loss": 0.0171, "num_tokens": 164396606.0, "step": 1635 }, { "epoch": 3.7276874821785, "grad_norm": 1.9375, "learning_rate": 3.7090028427968343e-06, "loss": 0.026, "num_tokens": 164496603.0, "step": 1636 }, { "epoch": 3.729968634160251, "grad_norm": 1.265625, "learning_rate": 3.7073539387649316e-06, "loss": 0.0176, "num_tokens": 164596999.0, "step": 1637 }, { "epoch": 3.7322497861420016, "grad_norm": 1.4765625, "learning_rate": 3.7057043494622423e-06, "loss": 0.0242, "num_tokens": 164697635.0, "step": 1638 }, { "epoch": 3.7345309381237524, "grad_norm": 1.8203125, "learning_rate": 3.704054075825042e-06, "loss": 0.0229, "num_tokens": 164798033.0, "step": 1639 }, { "epoch": 3.736812090105503, "grad_norm": 1.171875, "learning_rate": 3.702403118789992e-06, "loss": 0.0181, "num_tokens": 164898790.0, "step": 1640 }, { "epoch": 3.739093242087254, "grad_norm": 1.1328125, "learning_rate": 3.7007514792941462e-06, "loss": 0.0182, "num_tokens": 164999564.0, "step": 1641 }, { "epoch": 3.741374394069005, "grad_norm": 1.6015625, "learning_rate": 3.6990991582749414e-06, "loss": 0.0184, "num_tokens": 165100309.0, "step": 1642 }, { "epoch": 3.7436555460507557, "grad_norm": 2.0625, "learning_rate": 3.6974461566702048e-06, "loss": 0.0269, "num_tokens": 165201196.0, "step": 1643 }, { "epoch": 3.7459366980325064, "grad_norm": 1.953125, "learning_rate": 3.695792475418146e-06, "loss": 0.0258, "num_tokens": 165301938.0, "step": 1644 }, { "epoch": 3.748217850014257, "grad_norm": 1.5546875, "learning_rate": 3.6941381154573646e-06, "loss": 0.0186, "num_tokens": 165402446.0, "step": 1645 }, { "epoch": 3.750499001996008, "grad_norm": 1.5703125, "learning_rate": 3.692483077726843e-06, "loss": 0.027, "num_tokens": 165503175.0, "step": 1646 }, { "epoch": 3.752780153977759, "grad_norm": 1.4140625, "learning_rate": 3.6908273631659475e-06, "loss": 0.0245, "num_tokens": 165603173.0, "step": 1647 }, { "epoch": 3.7550613059595097, "grad_norm": 1.6171875, "learning_rate": 3.689170972714431e-06, "loss": 0.0251, "num_tokens": 165704052.0, "step": 1648 }, { "epoch": 3.7573424579412604, "grad_norm": 1.34375, "learning_rate": 3.6875139073124277e-06, "loss": 0.0176, "num_tokens": 165804625.0, "step": 1649 }, { "epoch": 3.759623609923011, "grad_norm": 1.609375, "learning_rate": 3.6858561679004567e-06, "loss": 0.0278, "num_tokens": 165905253.0, "step": 1650 }, { "epoch": 3.761904761904762, "grad_norm": 1.375, "learning_rate": 3.684197755419419e-06, "loss": 0.0218, "num_tokens": 166006012.0, "step": 1651 }, { "epoch": 3.7641859138865126, "grad_norm": 1.3125, "learning_rate": 3.6825386708105963e-06, "loss": 0.0222, "num_tokens": 166106587.0, "step": 1652 }, { "epoch": 3.7664670658682633, "grad_norm": 1.5625, "learning_rate": 3.6808789150156545e-06, "loss": 0.0172, "num_tokens": 166207293.0, "step": 1653 }, { "epoch": 3.768748217850014, "grad_norm": 1.6640625, "learning_rate": 3.679218488976638e-06, "loss": 0.0324, "num_tokens": 166308156.0, "step": 1654 }, { "epoch": 3.771029369831765, "grad_norm": 1.4296875, "learning_rate": 3.677557393635973e-06, "loss": 0.0211, "num_tokens": 166408866.0, "step": 1655 }, { "epoch": 3.773310521813516, "grad_norm": 1.234375, "learning_rate": 3.6758956299364643e-06, "loss": 0.0193, "num_tokens": 166509135.0, "step": 1656 }, { "epoch": 3.7755916737952666, "grad_norm": 1.5390625, "learning_rate": 3.674233198821299e-06, "loss": 0.0233, "num_tokens": 166610505.0, "step": 1657 }, { "epoch": 3.7778728257770173, "grad_norm": 1.125, "learning_rate": 3.6725701012340387e-06, "loss": 0.0119, "num_tokens": 166711107.0, "step": 1658 }, { "epoch": 3.780153977758768, "grad_norm": 1.6875, "learning_rate": 3.6709063381186267e-06, "loss": 0.0292, "num_tokens": 166812115.0, "step": 1659 }, { "epoch": 3.782435129740519, "grad_norm": 1.7734375, "learning_rate": 3.6692419104193823e-06, "loss": 0.0289, "num_tokens": 166912712.0, "step": 1660 }, { "epoch": 3.78471628172227, "grad_norm": 1.4453125, "learning_rate": 3.6675768190810023e-06, "loss": 0.0225, "num_tokens": 167013291.0, "step": 1661 }, { "epoch": 3.7869974337040206, "grad_norm": 1.5546875, "learning_rate": 3.665911065048561e-06, "loss": 0.0227, "num_tokens": 167114519.0, "step": 1662 }, { "epoch": 3.7892785856857714, "grad_norm": 1.3046875, "learning_rate": 3.6642446492675075e-06, "loss": 0.0188, "num_tokens": 167214978.0, "step": 1663 }, { "epoch": 3.791559737667522, "grad_norm": 1.4140625, "learning_rate": 3.6625775726836677e-06, "loss": 0.0196, "num_tokens": 167315545.0, "step": 1664 }, { "epoch": 3.793840889649273, "grad_norm": 1.5546875, "learning_rate": 3.6609098362432425e-06, "loss": 0.0223, "num_tokens": 167415952.0, "step": 1665 }, { "epoch": 3.7961220416310235, "grad_norm": 1.859375, "learning_rate": 3.659241440892806e-06, "loss": 0.0276, "num_tokens": 167516673.0, "step": 1666 }, { "epoch": 3.7984031936127742, "grad_norm": 1.234375, "learning_rate": 3.6575723875793085e-06, "loss": 0.0205, "num_tokens": 167617217.0, "step": 1667 }, { "epoch": 3.8006843455945254, "grad_norm": 1.9921875, "learning_rate": 3.655902677250071e-06, "loss": 0.0247, "num_tokens": 167718007.0, "step": 1668 }, { "epoch": 3.802965497576276, "grad_norm": 1.53125, "learning_rate": 3.6542323108527896e-06, "loss": 0.0237, "num_tokens": 167818689.0, "step": 1669 }, { "epoch": 3.805246649558027, "grad_norm": 1.6484375, "learning_rate": 3.652561289335532e-06, "loss": 0.0269, "num_tokens": 167918579.0, "step": 1670 }, { "epoch": 3.8075278015397775, "grad_norm": 2.140625, "learning_rate": 3.6508896136467376e-06, "loss": 0.0207, "num_tokens": 168019498.0, "step": 1671 }, { "epoch": 3.8098089535215283, "grad_norm": 1.578125, "learning_rate": 3.649217284735217e-06, "loss": 0.0251, "num_tokens": 168120393.0, "step": 1672 }, { "epoch": 3.8120901055032794, "grad_norm": 1.546875, "learning_rate": 3.6475443035501522e-06, "loss": 0.0264, "num_tokens": 168220656.0, "step": 1673 }, { "epoch": 3.81437125748503, "grad_norm": 1.1875, "learning_rate": 3.645870671041095e-06, "loss": 0.0195, "num_tokens": 168321237.0, "step": 1674 }, { "epoch": 3.816652409466781, "grad_norm": 1.328125, "learning_rate": 3.6441963881579668e-06, "loss": 0.026, "num_tokens": 168422395.0, "step": 1675 }, { "epoch": 3.8189335614485316, "grad_norm": 1.46875, "learning_rate": 3.642521455851058e-06, "loss": 0.0292, "num_tokens": 168522876.0, "step": 1676 }, { "epoch": 3.8212147134302823, "grad_norm": 1.640625, "learning_rate": 3.6408458750710284e-06, "loss": 0.0204, "num_tokens": 168623769.0, "step": 1677 }, { "epoch": 3.823495865412033, "grad_norm": 1.578125, "learning_rate": 3.639169646768905e-06, "loss": 0.0298, "num_tokens": 168724776.0, "step": 1678 }, { "epoch": 3.8257770173937837, "grad_norm": 1.609375, "learning_rate": 3.637492771896082e-06, "loss": 0.0198, "num_tokens": 168825158.0, "step": 1679 }, { "epoch": 3.8280581693755344, "grad_norm": 1.15625, "learning_rate": 3.6358152514043226e-06, "loss": 0.0164, "num_tokens": 168925809.0, "step": 1680 }, { "epoch": 3.830339321357285, "grad_norm": 1.3203125, "learning_rate": 3.634137086245754e-06, "loss": 0.0184, "num_tokens": 169026623.0, "step": 1681 }, { "epoch": 3.8326204733390363, "grad_norm": 1.5703125, "learning_rate": 3.6324582773728712e-06, "loss": 0.0241, "num_tokens": 169127549.0, "step": 1682 }, { "epoch": 3.834901625320787, "grad_norm": 1.75, "learning_rate": 3.6307788257385325e-06, "loss": 0.0275, "num_tokens": 169228044.0, "step": 1683 }, { "epoch": 3.8371827773025378, "grad_norm": 1.5546875, "learning_rate": 3.6290987322959624e-06, "loss": 0.0253, "num_tokens": 169329431.0, "step": 1684 }, { "epoch": 3.8394639292842885, "grad_norm": 1.1328125, "learning_rate": 3.6274179979987507e-06, "loss": 0.0172, "num_tokens": 169429815.0, "step": 1685 }, { "epoch": 3.841745081266039, "grad_norm": 1.9921875, "learning_rate": 3.625736623800849e-06, "loss": 0.0237, "num_tokens": 169530449.0, "step": 1686 }, { "epoch": 3.8440262332477904, "grad_norm": 1.6328125, "learning_rate": 3.624054610656572e-06, "loss": 0.0257, "num_tokens": 169631536.0, "step": 1687 }, { "epoch": 3.846307385229541, "grad_norm": 1.765625, "learning_rate": 3.622371959520599e-06, "loss": 0.0207, "num_tokens": 169732114.0, "step": 1688 }, { "epoch": 3.848588537211292, "grad_norm": 1.5390625, "learning_rate": 3.6206886713479705e-06, "loss": 0.0248, "num_tokens": 169832743.0, "step": 1689 }, { "epoch": 3.8508696891930425, "grad_norm": 1.6640625, "learning_rate": 3.6190047470940875e-06, "loss": 0.0205, "num_tokens": 169933177.0, "step": 1690 }, { "epoch": 3.8531508411747932, "grad_norm": 1.6953125, "learning_rate": 3.6173201877147134e-06, "loss": 0.0226, "num_tokens": 170033359.0, "step": 1691 }, { "epoch": 3.855431993156544, "grad_norm": 1.4921875, "learning_rate": 3.6156349941659717e-06, "loss": 0.0193, "num_tokens": 170134266.0, "step": 1692 }, { "epoch": 3.8577131451382947, "grad_norm": 1.421875, "learning_rate": 3.613949167404345e-06, "loss": 0.0209, "num_tokens": 170234458.0, "step": 1693 }, { "epoch": 3.8599942971200454, "grad_norm": 1.7265625, "learning_rate": 3.6122627083866773e-06, "loss": 0.0196, "num_tokens": 170334729.0, "step": 1694 }, { "epoch": 3.8622754491017965, "grad_norm": 1.4296875, "learning_rate": 3.610575618070169e-06, "loss": 0.0187, "num_tokens": 170435242.0, "step": 1695 }, { "epoch": 3.8645566010835473, "grad_norm": 1.4921875, "learning_rate": 3.6088878974123796e-06, "loss": 0.0216, "num_tokens": 170536318.0, "step": 1696 }, { "epoch": 3.866837753065298, "grad_norm": 1.3125, "learning_rate": 3.6071995473712284e-06, "loss": 0.0164, "num_tokens": 170636666.0, "step": 1697 }, { "epoch": 3.8691189050470487, "grad_norm": 1.3671875, "learning_rate": 3.605510568904989e-06, "loss": 0.0207, "num_tokens": 170737335.0, "step": 1698 }, { "epoch": 3.8714000570287994, "grad_norm": 1.2265625, "learning_rate": 3.6038209629722936e-06, "loss": 0.0185, "num_tokens": 170838202.0, "step": 1699 }, { "epoch": 3.8736812090105506, "grad_norm": 1.609375, "learning_rate": 3.6021307305321295e-06, "loss": 0.0193, "num_tokens": 170939144.0, "step": 1700 }, { "epoch": 3.8759623609923013, "grad_norm": 1.359375, "learning_rate": 3.6004398725438406e-06, "loss": 0.0208, "num_tokens": 171039967.0, "step": 1701 }, { "epoch": 3.878243512974052, "grad_norm": 1.5, "learning_rate": 3.5987483899671245e-06, "loss": 0.0228, "num_tokens": 171140891.0, "step": 1702 }, { "epoch": 3.8805246649558027, "grad_norm": 1.3125, "learning_rate": 3.597056283762034e-06, "loss": 0.0175, "num_tokens": 171242275.0, "step": 1703 }, { "epoch": 3.8828058169375534, "grad_norm": 1.359375, "learning_rate": 3.5953635548889777e-06, "loss": 0.0188, "num_tokens": 171342497.0, "step": 1704 }, { "epoch": 3.885086968919304, "grad_norm": 1.46875, "learning_rate": 3.5936702043087134e-06, "loss": 0.0249, "num_tokens": 171442803.0, "step": 1705 }, { "epoch": 3.887368120901055, "grad_norm": 1.9375, "learning_rate": 3.5919762329823556e-06, "loss": 0.026, "num_tokens": 171543826.0, "step": 1706 }, { "epoch": 3.8896492728828056, "grad_norm": 1.4296875, "learning_rate": 3.5902816418713694e-06, "loss": 0.0166, "num_tokens": 171644486.0, "step": 1707 }, { "epoch": 3.8919304248645568, "grad_norm": 1.34375, "learning_rate": 3.5885864319375717e-06, "loss": 0.0182, "num_tokens": 171745621.0, "step": 1708 }, { "epoch": 3.8942115768463075, "grad_norm": 1.71875, "learning_rate": 3.5868906041431313e-06, "loss": 0.0249, "num_tokens": 171845868.0, "step": 1709 }, { "epoch": 3.896492728828058, "grad_norm": 2.046875, "learning_rate": 3.5851941594505674e-06, "loss": 0.0325, "num_tokens": 171946175.0, "step": 1710 }, { "epoch": 3.898773880809809, "grad_norm": 1.109375, "learning_rate": 3.5834970988227484e-06, "loss": 0.0193, "num_tokens": 172046881.0, "step": 1711 }, { "epoch": 3.9010550327915596, "grad_norm": 1.890625, "learning_rate": 3.581799423222895e-06, "loss": 0.0226, "num_tokens": 172147697.0, "step": 1712 }, { "epoch": 3.903336184773311, "grad_norm": 1.734375, "learning_rate": 3.580101133614573e-06, "loss": 0.0267, "num_tokens": 172249246.0, "step": 1713 }, { "epoch": 3.9056173367550615, "grad_norm": 2.09375, "learning_rate": 3.5784022309617006e-06, "loss": 0.0325, "num_tokens": 172350244.0, "step": 1714 }, { "epoch": 3.9078984887368122, "grad_norm": 1.6015625, "learning_rate": 3.57670271622854e-06, "loss": 0.0244, "num_tokens": 172450549.0, "step": 1715 }, { "epoch": 3.910179640718563, "grad_norm": 1.4375, "learning_rate": 3.5750025903797053e-06, "loss": 0.0226, "num_tokens": 172550534.0, "step": 1716 }, { "epoch": 3.9124607927003137, "grad_norm": 1.7890625, "learning_rate": 3.5733018543801534e-06, "loss": 0.032, "num_tokens": 172650691.0, "step": 1717 }, { "epoch": 3.9147419446820644, "grad_norm": 1.8125, "learning_rate": 3.5716005091951906e-06, "loss": 0.0258, "num_tokens": 172750977.0, "step": 1718 }, { "epoch": 3.917023096663815, "grad_norm": 1.4140625, "learning_rate": 3.569898555790466e-06, "loss": 0.0246, "num_tokens": 172851156.0, "step": 1719 }, { "epoch": 3.919304248645566, "grad_norm": 1.6640625, "learning_rate": 3.5681959951319766e-06, "loss": 0.0255, "num_tokens": 172952210.0, "step": 1720 }, { "epoch": 3.9215854006273165, "grad_norm": 1.359375, "learning_rate": 3.566492828186063e-06, "loss": 0.0159, "num_tokens": 173052608.0, "step": 1721 }, { "epoch": 3.9238665526090677, "grad_norm": 1.1015625, "learning_rate": 3.564789055919409e-06, "loss": 0.0165, "num_tokens": 173153454.0, "step": 1722 }, { "epoch": 3.9261477045908184, "grad_norm": 1.59375, "learning_rate": 3.5630846792990435e-06, "loss": 0.0236, "num_tokens": 173254016.0, "step": 1723 }, { "epoch": 3.928428856572569, "grad_norm": 1.421875, "learning_rate": 3.5613796992923382e-06, "loss": 0.0189, "num_tokens": 173354797.0, "step": 1724 }, { "epoch": 3.93071000855432, "grad_norm": 1.1796875, "learning_rate": 3.559674116867006e-06, "loss": 0.0168, "num_tokens": 173455607.0, "step": 1725 }, { "epoch": 3.9329911605360706, "grad_norm": 1.4765625, "learning_rate": 3.5579679329911025e-06, "loss": 0.0184, "num_tokens": 173556378.0, "step": 1726 }, { "epoch": 3.9352723125178217, "grad_norm": 1.40625, "learning_rate": 3.556261148633026e-06, "loss": 0.0226, "num_tokens": 173657183.0, "step": 1727 }, { "epoch": 3.9375534644995724, "grad_norm": 1.28125, "learning_rate": 3.5545537647615125e-06, "loss": 0.0221, "num_tokens": 173757705.0, "step": 1728 }, { "epoch": 3.939834616481323, "grad_norm": 2.046875, "learning_rate": 3.552845782345642e-06, "loss": 0.0374, "num_tokens": 173858079.0, "step": 1729 }, { "epoch": 3.942115768463074, "grad_norm": 1.5546875, "learning_rate": 3.551137202354831e-06, "loss": 0.0226, "num_tokens": 173958461.0, "step": 1730 }, { "epoch": 3.9443969204448246, "grad_norm": 1.4375, "learning_rate": 3.5494280257588367e-06, "loss": 0.0266, "num_tokens": 174059215.0, "step": 1731 }, { "epoch": 3.9466780724265753, "grad_norm": 1.0859375, "learning_rate": 3.547718253527755e-06, "loss": 0.0174, "num_tokens": 174160085.0, "step": 1732 }, { "epoch": 3.948959224408326, "grad_norm": 1.5078125, "learning_rate": 3.546007886632019e-06, "loss": 0.0245, "num_tokens": 174260112.0, "step": 1733 }, { "epoch": 3.9512403763900767, "grad_norm": 1.78125, "learning_rate": 3.5442969260424022e-06, "loss": 0.0219, "num_tokens": 174360866.0, "step": 1734 }, { "epoch": 3.953521528371828, "grad_norm": 1.234375, "learning_rate": 3.5425853727300095e-06, "loss": 0.0176, "num_tokens": 174461812.0, "step": 1735 }, { "epoch": 3.9558026803535786, "grad_norm": 1.5390625, "learning_rate": 3.5408732276662882e-06, "loss": 0.0166, "num_tokens": 174562137.0, "step": 1736 }, { "epoch": 3.9580838323353293, "grad_norm": 1.6640625, "learning_rate": 3.5391604918230173e-06, "loss": 0.0242, "num_tokens": 174663444.0, "step": 1737 }, { "epoch": 3.96036498431708, "grad_norm": 1.421875, "learning_rate": 3.537447166172313e-06, "loss": 0.0224, "num_tokens": 174763989.0, "step": 1738 }, { "epoch": 3.962646136298831, "grad_norm": 1.21875, "learning_rate": 3.5357332516866256e-06, "loss": 0.0146, "num_tokens": 174864353.0, "step": 1739 }, { "epoch": 3.964927288280582, "grad_norm": 1.125, "learning_rate": 3.534018749338741e-06, "loss": 0.0195, "num_tokens": 174965291.0, "step": 1740 }, { "epoch": 3.9672084402623327, "grad_norm": 2.21875, "learning_rate": 3.532303660101776e-06, "loss": 0.03, "num_tokens": 175065807.0, "step": 1741 }, { "epoch": 3.9694895922440834, "grad_norm": 1.4609375, "learning_rate": 3.530587984949183e-06, "loss": 0.0216, "num_tokens": 175166193.0, "step": 1742 }, { "epoch": 3.971770744225834, "grad_norm": 1.3359375, "learning_rate": 3.5288717248547453e-06, "loss": 0.0188, "num_tokens": 175267651.0, "step": 1743 }, { "epoch": 3.974051896207585, "grad_norm": 1.515625, "learning_rate": 3.5271548807925803e-06, "loss": 0.0169, "num_tokens": 175368367.0, "step": 1744 }, { "epoch": 3.9763330481893355, "grad_norm": 1.59375, "learning_rate": 3.525437453737136e-06, "loss": 0.0238, "num_tokens": 175469019.0, "step": 1745 }, { "epoch": 3.9786142001710862, "grad_norm": 1.265625, "learning_rate": 3.5237194446631883e-06, "loss": 0.019, "num_tokens": 175568921.0, "step": 1746 }, { "epoch": 3.980895352152837, "grad_norm": 1.734375, "learning_rate": 3.522000854545849e-06, "loss": 0.0217, "num_tokens": 175669217.0, "step": 1747 }, { "epoch": 3.983176504134588, "grad_norm": 1.765625, "learning_rate": 3.520281684360554e-06, "loss": 0.03, "num_tokens": 175769789.0, "step": 1748 }, { "epoch": 3.985457656116339, "grad_norm": 1.6796875, "learning_rate": 3.5185619350830725e-06, "loss": 0.0236, "num_tokens": 175870126.0, "step": 1749 }, { "epoch": 3.9877388080980896, "grad_norm": 1.3203125, "learning_rate": 3.516841607689501e-06, "loss": 0.0176, "num_tokens": 175971205.0, "step": 1750 }, { "epoch": 3.9900199600798403, "grad_norm": 1.453125, "learning_rate": 3.515120703156264e-06, "loss": 0.0249, "num_tokens": 176071118.0, "step": 1751 }, { "epoch": 3.992301112061591, "grad_norm": 1.5078125, "learning_rate": 3.5133992224601126e-06, "loss": 0.0211, "num_tokens": 176171590.0, "step": 1752 }, { "epoch": 3.994582264043342, "grad_norm": 1.5546875, "learning_rate": 3.511677166578128e-06, "loss": 0.0293, "num_tokens": 176272942.0, "step": 1753 }, { "epoch": 3.996863416025093, "grad_norm": 1.40625, "learning_rate": 3.509954536487714e-06, "loss": 0.0223, "num_tokens": 176373455.0, "step": 1754 }, { "epoch": 3.9991445680068436, "grad_norm": 1.8984375, "learning_rate": 3.5082313331666035e-06, "loss": 0.0281, "num_tokens": 176474638.0, "step": 1755 }, { "epoch": 4.0, "grad_norm": 2.25, "learning_rate": 3.506507557592853e-06, "loss": 0.0213, "num_tokens": 176511488.0, "step": 1756 }, { "epoch": 4.002281151981751, "grad_norm": 1.53125, "learning_rate": 3.5047832107448437e-06, "loss": 0.0205, "num_tokens": 176611646.0, "step": 1757 }, { "epoch": 4.004562303963501, "grad_norm": 1.4296875, "learning_rate": 3.503058293601283e-06, "loss": 0.0183, "num_tokens": 176712591.0, "step": 1758 }, { "epoch": 4.006843455945252, "grad_norm": 1.09375, "learning_rate": 3.5013328071411995e-06, "loss": 0.0174, "num_tokens": 176813492.0, "step": 1759 }, { "epoch": 4.009124607927003, "grad_norm": 1.65625, "learning_rate": 3.499606752343945e-06, "loss": 0.0223, "num_tokens": 176914804.0, "step": 1760 }, { "epoch": 4.009124607927003, "eval_loss": 0.031710922718048096, "eval_num_tokens": 176914804.0, "eval_runtime": 52.7235, "eval_samples_per_second": 159.037, "eval_steps_per_second": 4.988, "step": 1760 }, { "epoch": 4.011405759908754, "grad_norm": 1.421875, "learning_rate": 3.4978801301891972e-06, "loss": 0.0231, "num_tokens": 177015595.0, "step": 1761 }, { "epoch": 4.013686911890504, "grad_norm": 1.0859375, "learning_rate": 3.496152941656952e-06, "loss": 0.0161, "num_tokens": 177115892.0, "step": 1762 }, { "epoch": 4.015968063872256, "grad_norm": 1.4375, "learning_rate": 3.494425187727528e-06, "loss": 0.0219, "num_tokens": 177216537.0, "step": 1763 }, { "epoch": 4.018249215854007, "grad_norm": 1.46875, "learning_rate": 3.4926968693815667e-06, "loss": 0.0149, "num_tokens": 177317055.0, "step": 1764 }, { "epoch": 4.020530367835757, "grad_norm": 1.546875, "learning_rate": 3.4909679876000256e-06, "loss": 0.0187, "num_tokens": 177417914.0, "step": 1765 }, { "epoch": 4.022811519817508, "grad_norm": 1.6640625, "learning_rate": 3.4892385433641875e-06, "loss": 0.0161, "num_tokens": 177518487.0, "step": 1766 }, { "epoch": 4.025092671799259, "grad_norm": 1.3125, "learning_rate": 3.4875085376556493e-06, "loss": 0.0222, "num_tokens": 177619471.0, "step": 1767 }, { "epoch": 4.0273738237810095, "grad_norm": 1.5625, "learning_rate": 3.4857779714563305e-06, "loss": 0.022, "num_tokens": 177719840.0, "step": 1768 }, { "epoch": 4.02965497576276, "grad_norm": 1.390625, "learning_rate": 3.4840468457484654e-06, "loss": 0.0193, "num_tokens": 177820799.0, "step": 1769 }, { "epoch": 4.031936127744511, "grad_norm": 1.3046875, "learning_rate": 3.4823151615146093e-06, "loss": 0.021, "num_tokens": 177921534.0, "step": 1770 }, { "epoch": 4.034217279726262, "grad_norm": 1.2265625, "learning_rate": 3.480582919737631e-06, "loss": 0.0185, "num_tokens": 178022013.0, "step": 1771 }, { "epoch": 4.036498431708012, "grad_norm": 1.421875, "learning_rate": 3.478850121400719e-06, "loss": 0.0174, "num_tokens": 178123070.0, "step": 1772 }, { "epoch": 4.038779583689763, "grad_norm": 1.28125, "learning_rate": 3.477116767487375e-06, "loss": 0.0166, "num_tokens": 178223750.0, "step": 1773 }, { "epoch": 4.041060735671514, "grad_norm": 1.2109375, "learning_rate": 3.475382858981418e-06, "loss": 0.0198, "num_tokens": 178324359.0, "step": 1774 }, { "epoch": 4.0433418876532645, "grad_norm": 1.328125, "learning_rate": 3.473648396866981e-06, "loss": 0.0224, "num_tokens": 178424794.0, "step": 1775 }, { "epoch": 4.045623039635016, "grad_norm": 1.6875, "learning_rate": 3.4719133821285108e-06, "loss": 0.0242, "num_tokens": 178525621.0, "step": 1776 }, { "epoch": 4.047904191616767, "grad_norm": 1.5234375, "learning_rate": 3.470177815750769e-06, "loss": 0.0193, "num_tokens": 178627108.0, "step": 1777 }, { "epoch": 4.050185343598518, "grad_norm": 1.359375, "learning_rate": 3.4684416987188273e-06, "loss": 0.0216, "num_tokens": 178728230.0, "step": 1778 }, { "epoch": 4.052466495580268, "grad_norm": 1.5546875, "learning_rate": 3.4667050320180755e-06, "loss": 0.0244, "num_tokens": 178828759.0, "step": 1779 }, { "epoch": 4.054747647562019, "grad_norm": 1.4375, "learning_rate": 3.4649678166342104e-06, "loss": 0.0173, "num_tokens": 178929186.0, "step": 1780 }, { "epoch": 4.05702879954377, "grad_norm": 1.390625, "learning_rate": 3.4632300535532415e-06, "loss": 0.0215, "num_tokens": 179030139.0, "step": 1781 }, { "epoch": 4.05930995152552, "grad_norm": 1.5625, "learning_rate": 3.46149174376149e-06, "loss": 0.0182, "num_tokens": 179130397.0, "step": 1782 }, { "epoch": 4.061591103507271, "grad_norm": 1.515625, "learning_rate": 3.459752888245587e-06, "loss": 0.0154, "num_tokens": 179231295.0, "step": 1783 }, { "epoch": 4.063872255489022, "grad_norm": 1.9296875, "learning_rate": 3.4580134879924732e-06, "loss": 0.0252, "num_tokens": 179331884.0, "step": 1784 }, { "epoch": 4.066153407470773, "grad_norm": 1.4296875, "learning_rate": 3.4562735439894e-06, "loss": 0.0175, "num_tokens": 179432908.0, "step": 1785 }, { "epoch": 4.068434559452523, "grad_norm": 1.578125, "learning_rate": 3.4545330572239234e-06, "loss": 0.0188, "num_tokens": 179533224.0, "step": 1786 }, { "epoch": 4.070715711434274, "grad_norm": 1.3203125, "learning_rate": 3.452792028683912e-06, "loss": 0.0196, "num_tokens": 179633819.0, "step": 1787 }, { "epoch": 4.072996863416025, "grad_norm": 1.46875, "learning_rate": 3.4510504593575396e-06, "loss": 0.0155, "num_tokens": 179734421.0, "step": 1788 }, { "epoch": 4.0752780153977755, "grad_norm": 1.8203125, "learning_rate": 3.449308350233287e-06, "loss": 0.0247, "num_tokens": 179835539.0, "step": 1789 }, { "epoch": 4.077559167379527, "grad_norm": 1.6953125, "learning_rate": 3.447565702299942e-06, "loss": 0.0243, "num_tokens": 179936177.0, "step": 1790 }, { "epoch": 4.079840319361278, "grad_norm": 1.5390625, "learning_rate": 3.445822516546598e-06, "loss": 0.0182, "num_tokens": 180037161.0, "step": 1791 }, { "epoch": 4.0821214713430285, "grad_norm": 1.6953125, "learning_rate": 3.444078793962653e-06, "loss": 0.0215, "num_tokens": 180137517.0, "step": 1792 }, { "epoch": 4.084402623324779, "grad_norm": 1.4765625, "learning_rate": 3.4423345355378114e-06, "loss": 0.0201, "num_tokens": 180238292.0, "step": 1793 }, { "epoch": 4.08668377530653, "grad_norm": 1.5390625, "learning_rate": 3.440589742262079e-06, "loss": 0.0168, "num_tokens": 180338687.0, "step": 1794 }, { "epoch": 4.088964927288281, "grad_norm": 1.8359375, "learning_rate": 3.438844415125768e-06, "loss": 0.027, "num_tokens": 180439440.0, "step": 1795 }, { "epoch": 4.091246079270031, "grad_norm": 1.1484375, "learning_rate": 3.437098555119493e-06, "loss": 0.0138, "num_tokens": 180540121.0, "step": 1796 }, { "epoch": 4.093527231251782, "grad_norm": 1.6171875, "learning_rate": 3.4353521632341686e-06, "loss": 0.0185, "num_tokens": 180640644.0, "step": 1797 }, { "epoch": 4.095808383233533, "grad_norm": 1.2890625, "learning_rate": 3.4336052404610138e-06, "loss": 0.0186, "num_tokens": 180741371.0, "step": 1798 }, { "epoch": 4.0980895352152835, "grad_norm": 1.6640625, "learning_rate": 3.431857787791549e-06, "loss": 0.0191, "num_tokens": 180842034.0, "step": 1799 }, { "epoch": 4.100370687197034, "grad_norm": 1.1953125, "learning_rate": 3.4301098062175936e-06, "loss": 0.0154, "num_tokens": 180943038.0, "step": 1800 }, { "epoch": 4.102651839178785, "grad_norm": 1.8046875, "learning_rate": 3.4283612967312692e-06, "loss": 0.0308, "num_tokens": 181043099.0, "step": 1801 }, { "epoch": 4.104932991160536, "grad_norm": 1.4140625, "learning_rate": 3.426612260324996e-06, "loss": 0.0167, "num_tokens": 181143439.0, "step": 1802 }, { "epoch": 4.107214143142287, "grad_norm": 1.546875, "learning_rate": 3.424862697991491e-06, "loss": 0.0215, "num_tokens": 181244180.0, "step": 1803 }, { "epoch": 4.109495295124038, "grad_norm": 1.6015625, "learning_rate": 3.4231126107237754e-06, "loss": 0.0198, "num_tokens": 181346208.0, "step": 1804 }, { "epoch": 4.111776447105789, "grad_norm": 1.3125, "learning_rate": 3.4213619995151628e-06, "loss": 0.0205, "num_tokens": 181446616.0, "step": 1805 }, { "epoch": 4.114057599087539, "grad_norm": 1.0859375, "learning_rate": 3.4196108653592662e-06, "loss": 0.0137, "num_tokens": 181547367.0, "step": 1806 }, { "epoch": 4.11633875106929, "grad_norm": 1.03125, "learning_rate": 3.417859209249997e-06, "loss": 0.0172, "num_tokens": 181647709.0, "step": 1807 }, { "epoch": 4.118619903051041, "grad_norm": 1.578125, "learning_rate": 3.4161070321815605e-06, "loss": 0.0178, "num_tokens": 181748124.0, "step": 1808 }, { "epoch": 4.120901055032792, "grad_norm": 1.65625, "learning_rate": 3.4143543351484585e-06, "loss": 0.0206, "num_tokens": 181848681.0, "step": 1809 }, { "epoch": 4.123182207014542, "grad_norm": 1.3671875, "learning_rate": 3.4126011191454877e-06, "loss": 0.0184, "num_tokens": 181948875.0, "step": 1810 }, { "epoch": 4.125463358996293, "grad_norm": 1.8515625, "learning_rate": 3.4108473851677408e-06, "loss": 0.0254, "num_tokens": 182049246.0, "step": 1811 }, { "epoch": 4.127744510978044, "grad_norm": 1.75, "learning_rate": 3.4090931342106024e-06, "loss": 0.0218, "num_tokens": 182150464.0, "step": 1812 }, { "epoch": 4.1300256629597945, "grad_norm": 1.625, "learning_rate": 3.4073383672697524e-06, "loss": 0.0177, "num_tokens": 182251163.0, "step": 1813 }, { "epoch": 4.132306814941545, "grad_norm": 1.8046875, "learning_rate": 3.4055830853411616e-06, "loss": 0.0245, "num_tokens": 182351491.0, "step": 1814 }, { "epoch": 4.134587966923296, "grad_norm": 1.3515625, "learning_rate": 3.4038272894210945e-06, "loss": 0.0178, "num_tokens": 182451753.0, "step": 1815 }, { "epoch": 4.136869118905047, "grad_norm": 1.3671875, "learning_rate": 3.4020709805061066e-06, "loss": 0.0206, "num_tokens": 182552817.0, "step": 1816 }, { "epoch": 4.139150270886798, "grad_norm": 1.296875, "learning_rate": 3.4003141595930456e-06, "loss": 0.0164, "num_tokens": 182653456.0, "step": 1817 }, { "epoch": 4.141431422868549, "grad_norm": 1.59375, "learning_rate": 3.3985568276790487e-06, "loss": 0.0231, "num_tokens": 182754831.0, "step": 1818 }, { "epoch": 4.1437125748503, "grad_norm": 1.8984375, "learning_rate": 3.3967989857615434e-06, "loss": 0.0283, "num_tokens": 182855584.0, "step": 1819 }, { "epoch": 4.14599372683205, "grad_norm": 1.265625, "learning_rate": 3.3950406348382483e-06, "loss": 0.0157, "num_tokens": 182955945.0, "step": 1820 }, { "epoch": 4.148274878813801, "grad_norm": 1.3828125, "learning_rate": 3.3932817759071666e-06, "loss": 0.0221, "num_tokens": 183056462.0, "step": 1821 }, { "epoch": 4.150556030795552, "grad_norm": 1.6015625, "learning_rate": 3.3915224099665962e-06, "loss": 0.0238, "num_tokens": 183157333.0, "step": 1822 }, { "epoch": 4.1528371827773025, "grad_norm": 1.71875, "learning_rate": 3.389762538015116e-06, "loss": 0.0238, "num_tokens": 183257826.0, "step": 1823 }, { "epoch": 4.155118334759053, "grad_norm": 1.453125, "learning_rate": 3.388002161051598e-06, "loss": 0.0205, "num_tokens": 183358280.0, "step": 1824 }, { "epoch": 4.157399486740804, "grad_norm": 1.5, "learning_rate": 3.3862412800751963e-06, "loss": 0.0232, "num_tokens": 183458448.0, "step": 1825 }, { "epoch": 4.159680638722555, "grad_norm": 1.859375, "learning_rate": 3.3844798960853533e-06, "loss": 0.0293, "num_tokens": 183559092.0, "step": 1826 }, { "epoch": 4.161961790704305, "grad_norm": 2.125, "learning_rate": 3.382718010081797e-06, "loss": 0.0256, "num_tokens": 183659757.0, "step": 1827 }, { "epoch": 4.164242942686056, "grad_norm": 1.546875, "learning_rate": 3.38095562306454e-06, "loss": 0.0234, "num_tokens": 183760250.0, "step": 1828 }, { "epoch": 4.166524094667807, "grad_norm": 1.21875, "learning_rate": 3.3791927360338785e-06, "loss": 0.0153, "num_tokens": 183861221.0, "step": 1829 }, { "epoch": 4.168805246649558, "grad_norm": 1.34375, "learning_rate": 3.3774293499903934e-06, "loss": 0.0226, "num_tokens": 183961374.0, "step": 1830 }, { "epoch": 4.171086398631309, "grad_norm": 2.0625, "learning_rate": 3.3756654659349487e-06, "loss": 0.0305, "num_tokens": 184061598.0, "step": 1831 }, { "epoch": 4.17336755061306, "grad_norm": 1.453125, "learning_rate": 3.373901084868691e-06, "loss": 0.0169, "num_tokens": 184162308.0, "step": 1832 }, { "epoch": 4.175648702594811, "grad_norm": 1.5703125, "learning_rate": 3.372136207793049e-06, "loss": 0.0206, "num_tokens": 184262867.0, "step": 1833 }, { "epoch": 4.177929854576561, "grad_norm": 1.4453125, "learning_rate": 3.3703708357097333e-06, "loss": 0.0152, "num_tokens": 184363536.0, "step": 1834 }, { "epoch": 4.180211006558312, "grad_norm": 1.140625, "learning_rate": 3.3686049696207336e-06, "loss": 0.0146, "num_tokens": 184464042.0, "step": 1835 }, { "epoch": 4.182492158540063, "grad_norm": 1.0546875, "learning_rate": 3.3668386105283226e-06, "loss": 0.0091, "num_tokens": 184564480.0, "step": 1836 }, { "epoch": 4.1847733105218134, "grad_norm": 1.375, "learning_rate": 3.365071759435051e-06, "loss": 0.0227, "num_tokens": 184665186.0, "step": 1837 }, { "epoch": 4.187054462503564, "grad_norm": 1.2734375, "learning_rate": 3.363304417343749e-06, "loss": 0.0202, "num_tokens": 184766172.0, "step": 1838 }, { "epoch": 4.189335614485315, "grad_norm": 1.6328125, "learning_rate": 3.3615365852575276e-06, "loss": 0.0192, "num_tokens": 184867562.0, "step": 1839 }, { "epoch": 4.191616766467066, "grad_norm": 1.3828125, "learning_rate": 3.359768264179772e-06, "loss": 0.0166, "num_tokens": 184968676.0, "step": 1840 }, { "epoch": 4.193897918448816, "grad_norm": 1.2265625, "learning_rate": 3.357999455114148e-06, "loss": 0.0166, "num_tokens": 185069221.0, "step": 1841 }, { "epoch": 4.196179070430567, "grad_norm": 1.3359375, "learning_rate": 3.356230159064599e-06, "loss": 0.019, "num_tokens": 185169635.0, "step": 1842 }, { "epoch": 4.198460222412319, "grad_norm": 1.375, "learning_rate": 3.3544603770353407e-06, "loss": 0.0169, "num_tokens": 185270330.0, "step": 1843 }, { "epoch": 4.200741374394069, "grad_norm": 1.7890625, "learning_rate": 3.352690110030869e-06, "loss": 0.0203, "num_tokens": 185371110.0, "step": 1844 }, { "epoch": 4.20302252637582, "grad_norm": 1.5859375, "learning_rate": 3.350919359055953e-06, "loss": 0.0231, "num_tokens": 185472055.0, "step": 1845 }, { "epoch": 4.205303678357571, "grad_norm": 1.4375, "learning_rate": 3.3491481251156355e-06, "loss": 0.0165, "num_tokens": 185572937.0, "step": 1846 }, { "epoch": 4.2075848303393215, "grad_norm": 1.546875, "learning_rate": 3.347376409215236e-06, "loss": 0.0225, "num_tokens": 185673827.0, "step": 1847 }, { "epoch": 4.209865982321072, "grad_norm": 1.7421875, "learning_rate": 3.345604212360346e-06, "loss": 0.0255, "num_tokens": 185774094.0, "step": 1848 }, { "epoch": 4.212147134302823, "grad_norm": 1.2578125, "learning_rate": 3.3438315355568295e-06, "loss": 0.017, "num_tokens": 185874277.0, "step": 1849 }, { "epoch": 4.214428286284574, "grad_norm": 1.5625, "learning_rate": 3.3420583798108253e-06, "loss": 0.019, "num_tokens": 185974654.0, "step": 1850 }, { "epoch": 4.216709438266324, "grad_norm": 1.2890625, "learning_rate": 3.34028474612874e-06, "loss": 0.0187, "num_tokens": 186075129.0, "step": 1851 }, { "epoch": 4.218990590248075, "grad_norm": 1.6875, "learning_rate": 3.338510635517256e-06, "loss": 0.0239, "num_tokens": 186176095.0, "step": 1852 }, { "epoch": 4.221271742229826, "grad_norm": 1.5625, "learning_rate": 3.3367360489833236e-06, "loss": 0.0175, "num_tokens": 186277379.0, "step": 1853 }, { "epoch": 4.2235528942115765, "grad_norm": 1.4140625, "learning_rate": 3.3349609875341626e-06, "loss": 0.0178, "num_tokens": 186378044.0, "step": 1854 }, { "epoch": 4.225834046193327, "grad_norm": 1.5703125, "learning_rate": 3.3331854521772656e-06, "loss": 0.0171, "num_tokens": 186478414.0, "step": 1855 }, { "epoch": 4.228115198175079, "grad_norm": 1.4375, "learning_rate": 3.3314094439203903e-06, "loss": 0.0187, "num_tokens": 186578762.0, "step": 1856 }, { "epoch": 4.23039635015683, "grad_norm": 1.4375, "learning_rate": 3.3296329637715662e-06, "loss": 0.0169, "num_tokens": 186678850.0, "step": 1857 }, { "epoch": 4.23267750213858, "grad_norm": 1.328125, "learning_rate": 3.3278560127390892e-06, "loss": 0.0152, "num_tokens": 186779646.0, "step": 1858 }, { "epoch": 4.234958654120331, "grad_norm": 1.4453125, "learning_rate": 3.32607859183152e-06, "loss": 0.0164, "num_tokens": 186879563.0, "step": 1859 }, { "epoch": 4.237239806102082, "grad_norm": 1.7265625, "learning_rate": 3.3243007020576917e-06, "loss": 0.022, "num_tokens": 186980044.0, "step": 1860 }, { "epoch": 4.2395209580838324, "grad_norm": 1.484375, "learning_rate": 3.322522344426698e-06, "loss": 0.016, "num_tokens": 187080359.0, "step": 1861 }, { "epoch": 4.241802110065583, "grad_norm": 1.46875, "learning_rate": 3.320743519947901e-06, "loss": 0.0195, "num_tokens": 187181247.0, "step": 1862 }, { "epoch": 4.244083262047334, "grad_norm": 1.71875, "learning_rate": 3.318964229630927e-06, "loss": 0.0267, "num_tokens": 187281790.0, "step": 1863 }, { "epoch": 4.246364414029085, "grad_norm": 1.5625, "learning_rate": 3.3171844744856675e-06, "loss": 0.0162, "num_tokens": 187382245.0, "step": 1864 }, { "epoch": 4.248645566010835, "grad_norm": 1.65625, "learning_rate": 3.3154042555222758e-06, "loss": 0.0239, "num_tokens": 187483305.0, "step": 1865 }, { "epoch": 4.250926717992586, "grad_norm": 1.625, "learning_rate": 3.3136235737511715e-06, "loss": 0.0221, "num_tokens": 187583618.0, "step": 1866 }, { "epoch": 4.253207869974337, "grad_norm": 1.7109375, "learning_rate": 3.3118424301830343e-06, "loss": 0.0304, "num_tokens": 187684138.0, "step": 1867 }, { "epoch": 4.2554890219560875, "grad_norm": 1.671875, "learning_rate": 3.310060825828807e-06, "loss": 0.0251, "num_tokens": 187784565.0, "step": 1868 }, { "epoch": 4.257770173937839, "grad_norm": 1.4609375, "learning_rate": 3.3082787616996938e-06, "loss": 0.0173, "num_tokens": 187885811.0, "step": 1869 }, { "epoch": 4.26005132591959, "grad_norm": 1.6015625, "learning_rate": 3.3064962388071586e-06, "loss": 0.0271, "num_tokens": 187986394.0, "step": 1870 }, { "epoch": 4.2623324779013405, "grad_norm": 1.890625, "learning_rate": 3.3047132581629297e-06, "loss": 0.0273, "num_tokens": 188087631.0, "step": 1871 }, { "epoch": 4.264613629883091, "grad_norm": 1.5625, "learning_rate": 3.3029298207789907e-06, "loss": 0.0249, "num_tokens": 188188968.0, "step": 1872 }, { "epoch": 4.266894781864842, "grad_norm": 1.3203125, "learning_rate": 3.301145927667586e-06, "loss": 0.018, "num_tokens": 188289645.0, "step": 1873 }, { "epoch": 4.269175933846593, "grad_norm": 1.8515625, "learning_rate": 3.2993615798412204e-06, "loss": 0.0119, "num_tokens": 188390262.0, "step": 1874 }, { "epoch": 4.271457085828343, "grad_norm": 1.5078125, "learning_rate": 3.297576778312654e-06, "loss": 0.0205, "num_tokens": 188491073.0, "step": 1875 }, { "epoch": 4.273738237810094, "grad_norm": 1.609375, "learning_rate": 3.295791524094906e-06, "loss": 0.0239, "num_tokens": 188591908.0, "step": 1876 }, { "epoch": 4.276019389791845, "grad_norm": 1.28125, "learning_rate": 3.294005818201252e-06, "loss": 0.0127, "num_tokens": 188692103.0, "step": 1877 }, { "epoch": 4.2783005417735955, "grad_norm": 1.5625, "learning_rate": 3.2922196616452253e-06, "loss": 0.0217, "num_tokens": 188792588.0, "step": 1878 }, { "epoch": 4.280581693755346, "grad_norm": 1.578125, "learning_rate": 3.2904330554406126e-06, "loss": 0.0179, "num_tokens": 188893647.0, "step": 1879 }, { "epoch": 4.282862845737097, "grad_norm": 1.4453125, "learning_rate": 3.288646000601457e-06, "loss": 0.0231, "num_tokens": 188994304.0, "step": 1880 }, { "epoch": 4.285143997718848, "grad_norm": 1.4296875, "learning_rate": 3.286858498142057e-06, "loss": 0.018, "num_tokens": 189094978.0, "step": 1881 }, { "epoch": 4.287425149700598, "grad_norm": 1.4765625, "learning_rate": 3.285070549076965e-06, "loss": 0.0222, "num_tokens": 189195676.0, "step": 1882 }, { "epoch": 4.289706301682349, "grad_norm": 1.4609375, "learning_rate": 3.283282154420985e-06, "loss": 0.0192, "num_tokens": 189296599.0, "step": 1883 }, { "epoch": 4.291987453664101, "grad_norm": 1.875, "learning_rate": 3.2814933151891766e-06, "loss": 0.0275, "num_tokens": 189397225.0, "step": 1884 }, { "epoch": 4.2942686056458514, "grad_norm": 1.6015625, "learning_rate": 3.2797040323968493e-06, "loss": 0.0237, "num_tokens": 189497855.0, "step": 1885 }, { "epoch": 4.296549757627602, "grad_norm": 1.515625, "learning_rate": 3.277914307059566e-06, "loss": 0.0207, "num_tokens": 189598590.0, "step": 1886 }, { "epoch": 4.298830909609353, "grad_norm": 1.515625, "learning_rate": 3.276124140193141e-06, "loss": 0.0206, "num_tokens": 189699980.0, "step": 1887 }, { "epoch": 4.301112061591104, "grad_norm": 1.515625, "learning_rate": 3.274333532813637e-06, "loss": 0.0201, "num_tokens": 189800435.0, "step": 1888 }, { "epoch": 4.303393213572854, "grad_norm": 1.703125, "learning_rate": 3.272542485937369e-06, "loss": 0.0202, "num_tokens": 189901113.0, "step": 1889 }, { "epoch": 4.305674365554605, "grad_norm": 1.515625, "learning_rate": 3.2707510005809005e-06, "loss": 0.0253, "num_tokens": 190002289.0, "step": 1890 }, { "epoch": 4.307955517536356, "grad_norm": 1.46875, "learning_rate": 3.2689590777610443e-06, "loss": 0.0197, "num_tokens": 190103074.0, "step": 1891 }, { "epoch": 4.3102366695181065, "grad_norm": 1.5703125, "learning_rate": 3.267166718494861e-06, "loss": 0.021, "num_tokens": 190203347.0, "step": 1892 }, { "epoch": 4.312517821499857, "grad_norm": 1.359375, "learning_rate": 3.265373923799658e-06, "loss": 0.021, "num_tokens": 190304223.0, "step": 1893 }, { "epoch": 4.314798973481608, "grad_norm": 1.40625, "learning_rate": 3.263580694692992e-06, "loss": 0.0196, "num_tokens": 190405250.0, "step": 1894 }, { "epoch": 4.317080125463359, "grad_norm": 1.7265625, "learning_rate": 3.261787032192666e-06, "loss": 0.0231, "num_tokens": 190505535.0, "step": 1895 }, { "epoch": 4.319361277445109, "grad_norm": 1.75, "learning_rate": 3.259992937316727e-06, "loss": 0.0201, "num_tokens": 190605167.0, "step": 1896 }, { "epoch": 4.321642429426861, "grad_norm": 1.4765625, "learning_rate": 3.258198411083469e-06, "loss": 0.0202, "num_tokens": 190705271.0, "step": 1897 }, { "epoch": 4.323923581408612, "grad_norm": 1.6953125, "learning_rate": 3.2564034545114308e-06, "loss": 0.0233, "num_tokens": 190806314.0, "step": 1898 }, { "epoch": 4.326204733390362, "grad_norm": 1.6640625, "learning_rate": 3.2546080686193947e-06, "loss": 0.0182, "num_tokens": 190907140.0, "step": 1899 }, { "epoch": 4.328485885372113, "grad_norm": 1.4140625, "learning_rate": 3.2528122544263873e-06, "loss": 0.0198, "num_tokens": 191007697.0, "step": 1900 }, { "epoch": 4.330767037353864, "grad_norm": 1.1328125, "learning_rate": 3.251016012951678e-06, "loss": 0.0143, "num_tokens": 191108319.0, "step": 1901 }, { "epoch": 4.3330481893356145, "grad_norm": 1.6015625, "learning_rate": 3.2492193452147774e-06, "loss": 0.0219, "num_tokens": 191208341.0, "step": 1902 }, { "epoch": 4.335329341317365, "grad_norm": 1.640625, "learning_rate": 3.247422252235442e-06, "loss": 0.0183, "num_tokens": 191309228.0, "step": 1903 }, { "epoch": 4.337610493299116, "grad_norm": 1.4921875, "learning_rate": 3.245624735033665e-06, "loss": 0.0158, "num_tokens": 191409828.0, "step": 1904 }, { "epoch": 4.339891645280867, "grad_norm": 1.6484375, "learning_rate": 3.2438267946296836e-06, "loss": 0.0233, "num_tokens": 191510562.0, "step": 1905 }, { "epoch": 4.342172797262617, "grad_norm": 1.296875, "learning_rate": 3.242028432043974e-06, "loss": 0.0166, "num_tokens": 191611645.0, "step": 1906 }, { "epoch": 4.344453949244368, "grad_norm": 1.40625, "learning_rate": 3.2402296482972513e-06, "loss": 0.0181, "num_tokens": 191713043.0, "step": 1907 }, { "epoch": 4.346735101226119, "grad_norm": 1.359375, "learning_rate": 3.238430444410471e-06, "loss": 0.0201, "num_tokens": 191814181.0, "step": 1908 }, { "epoch": 4.3490162532078696, "grad_norm": 1.7421875, "learning_rate": 3.2366308214048262e-06, "loss": 0.0313, "num_tokens": 191914776.0, "step": 1909 }, { "epoch": 4.351297405189621, "grad_norm": 1.6484375, "learning_rate": 3.2348307803017493e-06, "loss": 0.0219, "num_tokens": 192015276.0, "step": 1910 }, { "epoch": 4.353578557171372, "grad_norm": 1.4921875, "learning_rate": 3.2330303221229078e-06, "loss": 0.0222, "num_tokens": 192115687.0, "step": 1911 }, { "epoch": 4.355859709153123, "grad_norm": 1.2421875, "learning_rate": 3.231229447890206e-06, "loss": 0.0122, "num_tokens": 192216197.0, "step": 1912 }, { "epoch": 4.358140861134873, "grad_norm": 1.1875, "learning_rate": 3.229428158625787e-06, "loss": 0.0165, "num_tokens": 192317092.0, "step": 1913 }, { "epoch": 4.360422013116624, "grad_norm": 1.1484375, "learning_rate": 3.2276264553520275e-06, "loss": 0.016, "num_tokens": 192417304.0, "step": 1914 }, { "epoch": 4.362703165098375, "grad_norm": 1.78125, "learning_rate": 3.2258243390915397e-06, "loss": 0.0291, "num_tokens": 192518270.0, "step": 1915 }, { "epoch": 4.3649843170801255, "grad_norm": 1.3828125, "learning_rate": 3.2240218108671683e-06, "loss": 0.0204, "num_tokens": 192618820.0, "step": 1916 }, { "epoch": 4.367265469061876, "grad_norm": 1.78125, "learning_rate": 3.2222188717019965e-06, "loss": 0.0239, "num_tokens": 192719140.0, "step": 1917 }, { "epoch": 4.369546621043627, "grad_norm": 1.9765625, "learning_rate": 3.220415522619335e-06, "loss": 0.022, "num_tokens": 192819961.0, "step": 1918 }, { "epoch": 4.371827773025378, "grad_norm": 1.6875, "learning_rate": 3.218611764642732e-06, "loss": 0.0235, "num_tokens": 192920635.0, "step": 1919 }, { "epoch": 4.374108925007128, "grad_norm": 1.671875, "learning_rate": 3.2168075987959633e-06, "loss": 0.0212, "num_tokens": 193020930.0, "step": 1920 }, { "epoch": 4.376390076988879, "grad_norm": 1.453125, "learning_rate": 3.2150030261030414e-06, "loss": 0.0179, "num_tokens": 193121431.0, "step": 1921 }, { "epoch": 4.37867122897063, "grad_norm": 1.53125, "learning_rate": 3.2131980475882053e-06, "loss": 0.0169, "num_tokens": 193221866.0, "step": 1922 }, { "epoch": 4.380952380952381, "grad_norm": 2.171875, "learning_rate": 3.2113926642759256e-06, "loss": 0.03, "num_tokens": 193321836.0, "step": 1923 }, { "epoch": 4.383233532934132, "grad_norm": 1.46875, "learning_rate": 3.2095868771909037e-06, "loss": 0.0256, "num_tokens": 193422170.0, "step": 1924 }, { "epoch": 4.385514684915883, "grad_norm": 1.3203125, "learning_rate": 3.2077806873580696e-06, "loss": 0.0197, "num_tokens": 193522694.0, "step": 1925 }, { "epoch": 4.3877958368976335, "grad_norm": 1.6015625, "learning_rate": 3.205974095802582e-06, "loss": 0.018, "num_tokens": 193622695.0, "step": 1926 }, { "epoch": 4.390076988879384, "grad_norm": 1.2890625, "learning_rate": 3.204167103549827e-06, "loss": 0.0166, "num_tokens": 193723360.0, "step": 1927 }, { "epoch": 4.392358140861135, "grad_norm": 1.53125, "learning_rate": 3.2023597116254175e-06, "loss": 0.0197, "num_tokens": 193823682.0, "step": 1928 }, { "epoch": 4.394639292842886, "grad_norm": 1.75, "learning_rate": 3.2005519210551955e-06, "loss": 0.0204, "num_tokens": 193924320.0, "step": 1929 }, { "epoch": 4.396920444824636, "grad_norm": 1.984375, "learning_rate": 3.1987437328652287e-06, "loss": 0.0251, "num_tokens": 194024915.0, "step": 1930 }, { "epoch": 4.399201596806387, "grad_norm": 1.8203125, "learning_rate": 3.196935148081808e-06, "loss": 0.0245, "num_tokens": 194125658.0, "step": 1931 }, { "epoch": 4.401482748788138, "grad_norm": 1.6953125, "learning_rate": 3.1951261677314526e-06, "loss": 0.0228, "num_tokens": 194226663.0, "step": 1932 }, { "epoch": 4.4037639007698886, "grad_norm": 1.5546875, "learning_rate": 3.1933167928409046e-06, "loss": 0.0206, "num_tokens": 194327902.0, "step": 1933 }, { "epoch": 4.406045052751639, "grad_norm": 1.359375, "learning_rate": 3.1915070244371295e-06, "loss": 0.019, "num_tokens": 194428882.0, "step": 1934 }, { "epoch": 4.40832620473339, "grad_norm": 1.3203125, "learning_rate": 3.1896968635473174e-06, "loss": 0.0161, "num_tokens": 194529827.0, "step": 1935 }, { "epoch": 4.410607356715142, "grad_norm": 1.3828125, "learning_rate": 3.187886311198881e-06, "loss": 0.0175, "num_tokens": 194630723.0, "step": 1936 }, { "epoch": 4.412888508696892, "grad_norm": 1.5703125, "learning_rate": 3.1860753684194536e-06, "loss": 0.0259, "num_tokens": 194731677.0, "step": 1937 }, { "epoch": 4.415169660678643, "grad_norm": 1.6875, "learning_rate": 3.1842640362368932e-06, "loss": 0.0205, "num_tokens": 194832854.0, "step": 1938 }, { "epoch": 4.417450812660394, "grad_norm": 1.8671875, "learning_rate": 3.182452315679276e-06, "loss": 0.0232, "num_tokens": 194933979.0, "step": 1939 }, { "epoch": 4.4197319646421445, "grad_norm": 1.53125, "learning_rate": 3.1806402077748987e-06, "loss": 0.0181, "num_tokens": 195034788.0, "step": 1940 }, { "epoch": 4.422013116623895, "grad_norm": 1.921875, "learning_rate": 3.178827713552281e-06, "loss": 0.0203, "num_tokens": 195135497.0, "step": 1941 }, { "epoch": 4.424294268605646, "grad_norm": 1.7265625, "learning_rate": 3.177014834040158e-06, "loss": 0.022, "num_tokens": 195236195.0, "step": 1942 }, { "epoch": 4.426575420587397, "grad_norm": 1.53125, "learning_rate": 3.1752015702674855e-06, "loss": 0.0226, "num_tokens": 195336753.0, "step": 1943 }, { "epoch": 4.428856572569147, "grad_norm": 1.609375, "learning_rate": 3.173387923263437e-06, "loss": 0.0217, "num_tokens": 195437239.0, "step": 1944 }, { "epoch": 4.431137724550898, "grad_norm": 1.28125, "learning_rate": 3.1715738940574032e-06, "loss": 0.0145, "num_tokens": 195537887.0, "step": 1945 }, { "epoch": 4.433418876532649, "grad_norm": 1.4375, "learning_rate": 3.1697594836789924e-06, "loss": 0.0211, "num_tokens": 195638791.0, "step": 1946 }, { "epoch": 4.4357000285143995, "grad_norm": 1.390625, "learning_rate": 3.167944693158029e-06, "loss": 0.0173, "num_tokens": 195739093.0, "step": 1947 }, { "epoch": 4.43798118049615, "grad_norm": 1.75, "learning_rate": 3.166129523524553e-06, "loss": 0.0242, "num_tokens": 195839441.0, "step": 1948 }, { "epoch": 4.440262332477902, "grad_norm": 1.6171875, "learning_rate": 3.1643139758088194e-06, "loss": 0.0184, "num_tokens": 195940470.0, "step": 1949 }, { "epoch": 4.4425434844596525, "grad_norm": 1.390625, "learning_rate": 3.1624980510412984e-06, "loss": 0.0163, "num_tokens": 196041130.0, "step": 1950 }, { "epoch": 4.444824636441403, "grad_norm": 1.640625, "learning_rate": 3.160681750252674e-06, "loss": 0.0241, "num_tokens": 196141598.0, "step": 1951 }, { "epoch": 4.447105788423154, "grad_norm": 1.4375, "learning_rate": 3.1588650744738418e-06, "loss": 0.0171, "num_tokens": 196242126.0, "step": 1952 }, { "epoch": 4.449386940404905, "grad_norm": 1.71875, "learning_rate": 3.1570480247359147e-06, "loss": 0.021, "num_tokens": 196342845.0, "step": 1953 }, { "epoch": 4.451668092386655, "grad_norm": 1.546875, "learning_rate": 3.155230602070213e-06, "loss": 0.0241, "num_tokens": 196443077.0, "step": 1954 }, { "epoch": 4.453949244368406, "grad_norm": 1.7734375, "learning_rate": 3.153412807508271e-06, "loss": 0.0175, "num_tokens": 196542675.0, "step": 1955 }, { "epoch": 4.456230396350157, "grad_norm": 1.265625, "learning_rate": 3.1515946420818343e-06, "loss": 0.0204, "num_tokens": 196643363.0, "step": 1956 }, { "epoch": 4.4585115483319075, "grad_norm": 1.3671875, "learning_rate": 3.1497761068228585e-06, "loss": 0.0136, "num_tokens": 196744075.0, "step": 1957 }, { "epoch": 4.460792700313658, "grad_norm": 1.5625, "learning_rate": 3.1479572027635085e-06, "loss": 0.0211, "num_tokens": 196844308.0, "step": 1958 }, { "epoch": 4.463073852295409, "grad_norm": 1.4140625, "learning_rate": 3.1461379309361594e-06, "loss": 0.023, "num_tokens": 196944987.0, "step": 1959 }, { "epoch": 4.46535500427716, "grad_norm": 1.3828125, "learning_rate": 3.144318292373395e-06, "loss": 0.0209, "num_tokens": 197045741.0, "step": 1960 }, { "epoch": 4.46763615625891, "grad_norm": 1.921875, "learning_rate": 3.142498288108007e-06, "loss": 0.0171, "num_tokens": 197146733.0, "step": 1961 }, { "epoch": 4.469917308240661, "grad_norm": 1.21875, "learning_rate": 3.1406779191729954e-06, "loss": 0.0122, "num_tokens": 197247090.0, "step": 1962 }, { "epoch": 4.472198460222412, "grad_norm": 1.8984375, "learning_rate": 3.1388571866015645e-06, "loss": 0.0267, "num_tokens": 197347656.0, "step": 1963 }, { "epoch": 4.4744796122041635, "grad_norm": 1.65625, "learning_rate": 3.1370360914271286e-06, "loss": 0.0219, "num_tokens": 197448211.0, "step": 1964 }, { "epoch": 4.476760764185914, "grad_norm": 2.0625, "learning_rate": 3.1352146346833057e-06, "loss": 0.0261, "num_tokens": 197549093.0, "step": 1965 }, { "epoch": 4.479041916167665, "grad_norm": 1.578125, "learning_rate": 3.133392817403919e-06, "loss": 0.0187, "num_tokens": 197649895.0, "step": 1966 }, { "epoch": 4.481323068149416, "grad_norm": 1.59375, "learning_rate": 3.131570640622998e-06, "loss": 0.0234, "num_tokens": 197750383.0, "step": 1967 }, { "epoch": 4.483604220131166, "grad_norm": 1.859375, "learning_rate": 3.1297481053747737e-06, "loss": 0.024, "num_tokens": 197851778.0, "step": 1968 }, { "epoch": 4.485885372112917, "grad_norm": 1.546875, "learning_rate": 3.127925212693682e-06, "loss": 0.0178, "num_tokens": 197952032.0, "step": 1969 }, { "epoch": 4.488166524094668, "grad_norm": 1.84375, "learning_rate": 3.1261019636143636e-06, "loss": 0.0235, "num_tokens": 198052462.0, "step": 1970 }, { "epoch": 4.4904476760764185, "grad_norm": 1.3046875, "learning_rate": 3.124278359171657e-06, "loss": 0.0193, "num_tokens": 198153239.0, "step": 1971 }, { "epoch": 4.492728828058169, "grad_norm": 1.5234375, "learning_rate": 3.122454400400606e-06, "loss": 0.0156, "num_tokens": 198253557.0, "step": 1972 }, { "epoch": 4.49500998003992, "grad_norm": 1.3984375, "learning_rate": 3.1206300883364547e-06, "loss": 0.017, "num_tokens": 198354709.0, "step": 1973 }, { "epoch": 4.497291132021671, "grad_norm": 1.703125, "learning_rate": 3.1188054240146463e-06, "loss": 0.0234, "num_tokens": 198455605.0, "step": 1974 }, { "epoch": 4.499572284003421, "grad_norm": 1.46875, "learning_rate": 3.1169804084708267e-06, "loss": 0.0205, "num_tokens": 198556316.0, "step": 1975 }, { "epoch": 4.501853435985172, "grad_norm": 1.6328125, "learning_rate": 3.1151550427408383e-06, "loss": 0.0216, "num_tokens": 198657078.0, "step": 1976 }, { "epoch": 4.504134587966924, "grad_norm": 1.53125, "learning_rate": 3.1133293278607228e-06, "loss": 0.0202, "num_tokens": 198757746.0, "step": 1977 }, { "epoch": 4.506415739948674, "grad_norm": 1.3828125, "learning_rate": 3.1115032648667224e-06, "loss": 0.0209, "num_tokens": 198858920.0, "step": 1978 }, { "epoch": 4.508696891930425, "grad_norm": 1.296875, "learning_rate": 3.1096768547952743e-06, "loss": 0.0163, "num_tokens": 198959541.0, "step": 1979 }, { "epoch": 4.510978043912176, "grad_norm": 1.953125, "learning_rate": 3.1078500986830134e-06, "loss": 0.0195, "num_tokens": 199060301.0, "step": 1980 }, { "epoch": 4.510978043912176, "eval_loss": 0.03198155388236046, "eval_num_tokens": 199060301.0, "eval_runtime": 52.6696, "eval_samples_per_second": 159.2, "eval_steps_per_second": 4.993, "step": 1980 }, { "epoch": 4.5132591958939265, "grad_norm": 1.2890625, "learning_rate": 3.1060229975667716e-06, "loss": 0.0183, "num_tokens": 199160974.0, "step": 1981 }, { "epoch": 4.515540347875677, "grad_norm": 1.546875, "learning_rate": 3.104195552483576e-06, "loss": 0.0205, "num_tokens": 199261490.0, "step": 1982 }, { "epoch": 4.517821499857428, "grad_norm": 1.3125, "learning_rate": 3.102367764470649e-06, "loss": 0.0184, "num_tokens": 199362207.0, "step": 1983 }, { "epoch": 4.520102651839179, "grad_norm": 1.4453125, "learning_rate": 3.1005396345654087e-06, "loss": 0.0192, "num_tokens": 199463550.0, "step": 1984 }, { "epoch": 4.522383803820929, "grad_norm": 1.4921875, "learning_rate": 3.0987111638054657e-06, "loss": 0.0233, "num_tokens": 199563973.0, "step": 1985 }, { "epoch": 4.52466495580268, "grad_norm": 1.109375, "learning_rate": 3.0968823532286246e-06, "loss": 0.0172, "num_tokens": 199664796.0, "step": 1986 }, { "epoch": 4.526946107784431, "grad_norm": 1.25, "learning_rate": 3.095053203872883e-06, "loss": 0.0172, "num_tokens": 199765776.0, "step": 1987 }, { "epoch": 4.529227259766182, "grad_norm": 1.546875, "learning_rate": 3.0932237167764306e-06, "loss": 0.0228, "num_tokens": 199866412.0, "step": 1988 }, { "epoch": 4.531508411747932, "grad_norm": 1.7265625, "learning_rate": 3.0913938929776493e-06, "loss": 0.0218, "num_tokens": 199966823.0, "step": 1989 }, { "epoch": 4.533789563729684, "grad_norm": 1.6640625, "learning_rate": 3.0895637335151117e-06, "loss": 0.0203, "num_tokens": 200067560.0, "step": 1990 }, { "epoch": 4.536070715711435, "grad_norm": 1.71875, "learning_rate": 3.0877332394275806e-06, "loss": 0.0207, "num_tokens": 200167552.0, "step": 1991 }, { "epoch": 4.538351867693185, "grad_norm": 1.6875, "learning_rate": 3.08590241175401e-06, "loss": 0.0233, "num_tokens": 200268024.0, "step": 1992 }, { "epoch": 4.540633019674936, "grad_norm": 1.5625, "learning_rate": 3.0840712515335412e-06, "loss": 0.0164, "num_tokens": 200368281.0, "step": 1993 }, { "epoch": 4.542914171656687, "grad_norm": 1.296875, "learning_rate": 3.0822397598055065e-06, "loss": 0.0161, "num_tokens": 200469447.0, "step": 1994 }, { "epoch": 4.5451953236384375, "grad_norm": 1.859375, "learning_rate": 3.080407937609424e-06, "loss": 0.0222, "num_tokens": 200570475.0, "step": 1995 }, { "epoch": 4.547476475620188, "grad_norm": 1.5, "learning_rate": 3.0785757859850025e-06, "loss": 0.0204, "num_tokens": 200671154.0, "step": 1996 }, { "epoch": 4.549757627601939, "grad_norm": 1.671875, "learning_rate": 3.0767433059721338e-06, "loss": 0.0188, "num_tokens": 200771583.0, "step": 1997 }, { "epoch": 4.55203877958369, "grad_norm": 1.4609375, "learning_rate": 3.074910498610899e-06, "loss": 0.0195, "num_tokens": 200872498.0, "step": 1998 }, { "epoch": 4.55431993156544, "grad_norm": 1.515625, "learning_rate": 3.0730773649415647e-06, "loss": 0.0228, "num_tokens": 200972804.0, "step": 1999 }, { "epoch": 4.556601083547191, "grad_norm": 1.265625, "learning_rate": 3.0712439060045818e-06, "loss": 0.0196, "num_tokens": 201073126.0, "step": 2000 }, { "epoch": 4.558882235528942, "grad_norm": 1.453125, "learning_rate": 3.069410122840585e-06, "loss": 0.0179, "num_tokens": 201173856.0, "step": 2001 }, { "epoch": 4.5611633875106925, "grad_norm": 1.390625, "learning_rate": 3.0675760164903972e-06, "loss": 0.0204, "num_tokens": 201274465.0, "step": 2002 }, { "epoch": 4.563444539492444, "grad_norm": 1.1015625, "learning_rate": 3.065741587995019e-06, "loss": 0.014, "num_tokens": 201375034.0, "step": 2003 }, { "epoch": 4.565725691474195, "grad_norm": 1.390625, "learning_rate": 3.0639068383956373e-06, "loss": 0.0183, "num_tokens": 201475421.0, "step": 2004 }, { "epoch": 4.5680068434559455, "grad_norm": 1.734375, "learning_rate": 3.062071768733621e-06, "loss": 0.027, "num_tokens": 201576260.0, "step": 2005 }, { "epoch": 4.570287995437696, "grad_norm": 1.5078125, "learning_rate": 3.0602363800505198e-06, "loss": 0.0151, "num_tokens": 201677295.0, "step": 2006 }, { "epoch": 4.572569147419447, "grad_norm": 1.6640625, "learning_rate": 3.0584006733880656e-06, "loss": 0.0183, "num_tokens": 201777762.0, "step": 2007 }, { "epoch": 4.574850299401198, "grad_norm": 1.328125, "learning_rate": 3.0565646497881697e-06, "loss": 0.0153, "num_tokens": 201878178.0, "step": 2008 }, { "epoch": 4.577131451382948, "grad_norm": 1.4140625, "learning_rate": 3.0547283102929228e-06, "loss": 0.0188, "num_tokens": 201978719.0, "step": 2009 }, { "epoch": 4.579412603364699, "grad_norm": 1.5625, "learning_rate": 3.0528916559445967e-06, "loss": 0.0161, "num_tokens": 202079484.0, "step": 2010 }, { "epoch": 4.58169375534645, "grad_norm": 1.265625, "learning_rate": 3.05105468778564e-06, "loss": 0.0129, "num_tokens": 202180666.0, "step": 2011 }, { "epoch": 4.583974907328201, "grad_norm": 1.4140625, "learning_rate": 3.049217406858681e-06, "loss": 0.0196, "num_tokens": 202280992.0, "step": 2012 }, { "epoch": 4.586256059309951, "grad_norm": 1.7265625, "learning_rate": 3.047379814206526e-06, "loss": 0.0249, "num_tokens": 202381595.0, "step": 2013 }, { "epoch": 4.588537211291702, "grad_norm": 1.4609375, "learning_rate": 3.0455419108721556e-06, "loss": 0.0216, "num_tokens": 202482174.0, "step": 2014 }, { "epoch": 4.590818363273453, "grad_norm": 1.65625, "learning_rate": 3.043703697898728e-06, "loss": 0.0158, "num_tokens": 202583172.0, "step": 2015 }, { "epoch": 4.593099515255204, "grad_norm": 1.28125, "learning_rate": 3.041865176329579e-06, "loss": 0.0148, "num_tokens": 202684044.0, "step": 2016 }, { "epoch": 4.595380667236955, "grad_norm": 1.6953125, "learning_rate": 3.040026347208217e-06, "loss": 0.0239, "num_tokens": 202785110.0, "step": 2017 }, { "epoch": 4.597661819218706, "grad_norm": 1.3203125, "learning_rate": 3.0381872115783256e-06, "loss": 0.0193, "num_tokens": 202886145.0, "step": 2018 }, { "epoch": 4.5999429712004565, "grad_norm": 1.3984375, "learning_rate": 3.0363477704837633e-06, "loss": 0.0171, "num_tokens": 202986639.0, "step": 2019 }, { "epoch": 4.602224123182207, "grad_norm": 1.8671875, "learning_rate": 3.034508024968561e-06, "loss": 0.0284, "num_tokens": 203087712.0, "step": 2020 }, { "epoch": 4.604505275163958, "grad_norm": 1.265625, "learning_rate": 3.032667976076923e-06, "loss": 0.0166, "num_tokens": 203188421.0, "step": 2021 }, { "epoch": 4.606786427145709, "grad_norm": 1.6171875, "learning_rate": 3.0308276248532244e-06, "loss": 0.02, "num_tokens": 203288800.0, "step": 2022 }, { "epoch": 4.609067579127459, "grad_norm": 1.296875, "learning_rate": 3.0289869723420144e-06, "loss": 0.0134, "num_tokens": 203389500.0, "step": 2023 }, { "epoch": 4.61134873110921, "grad_norm": 1.6171875, "learning_rate": 3.027146019588012e-06, "loss": 0.0184, "num_tokens": 203490394.0, "step": 2024 }, { "epoch": 4.613629883090961, "grad_norm": 1.4765625, "learning_rate": 3.025304767636105e-06, "loss": 0.0146, "num_tokens": 203591623.0, "step": 2025 }, { "epoch": 4.6159110350727115, "grad_norm": 1.265625, "learning_rate": 3.0234632175313537e-06, "loss": 0.0199, "num_tokens": 203692020.0, "step": 2026 }, { "epoch": 4.618192187054462, "grad_norm": 1.9609375, "learning_rate": 3.0216213703189856e-06, "loss": 0.0198, "num_tokens": 203792628.0, "step": 2027 }, { "epoch": 4.620473339036213, "grad_norm": 1.484375, "learning_rate": 3.019779227044398e-06, "loss": 0.0205, "num_tokens": 203893879.0, "step": 2028 }, { "epoch": 4.6227544910179645, "grad_norm": 1.3125, "learning_rate": 3.0179367887531567e-06, "loss": 0.0188, "num_tokens": 203994141.0, "step": 2029 }, { "epoch": 4.625035642999714, "grad_norm": 1.3984375, "learning_rate": 3.016094056490993e-06, "loss": 0.0162, "num_tokens": 204094897.0, "step": 2030 }, { "epoch": 4.627316794981466, "grad_norm": 1.8359375, "learning_rate": 3.0142510313038057e-06, "loss": 0.0185, "num_tokens": 204196022.0, "step": 2031 }, { "epoch": 4.629597946963217, "grad_norm": 1.8515625, "learning_rate": 3.012407714237662e-06, "loss": 0.0213, "num_tokens": 204296686.0, "step": 2032 }, { "epoch": 4.631879098944967, "grad_norm": 1.8671875, "learning_rate": 3.010564106338791e-06, "loss": 0.0226, "num_tokens": 204397728.0, "step": 2033 }, { "epoch": 4.634160250926718, "grad_norm": 1.5078125, "learning_rate": 3.0087202086535915e-06, "loss": 0.0249, "num_tokens": 204499106.0, "step": 2034 }, { "epoch": 4.636441402908469, "grad_norm": 1.5078125, "learning_rate": 3.006876022228622e-06, "loss": 0.0169, "num_tokens": 204599855.0, "step": 2035 }, { "epoch": 4.63872255489022, "grad_norm": 1.578125, "learning_rate": 3.0050315481106074e-06, "loss": 0.0206, "num_tokens": 204700187.0, "step": 2036 }, { "epoch": 4.64100370687197, "grad_norm": 1.859375, "learning_rate": 3.0031867873464372e-06, "loss": 0.0255, "num_tokens": 204800638.0, "step": 2037 }, { "epoch": 4.643284858853721, "grad_norm": 1.453125, "learning_rate": 3.00134174098316e-06, "loss": 0.0201, "num_tokens": 204900152.0, "step": 2038 }, { "epoch": 4.645566010835472, "grad_norm": 1.2265625, "learning_rate": 2.999496410067989e-06, "loss": 0.016, "num_tokens": 205001615.0, "step": 2039 }, { "epoch": 4.647847162817222, "grad_norm": 1.640625, "learning_rate": 2.9976507956482996e-06, "loss": 0.0229, "num_tokens": 205102424.0, "step": 2040 }, { "epoch": 4.650128314798973, "grad_norm": 1.953125, "learning_rate": 2.9958048987716266e-06, "loss": 0.026, "num_tokens": 205202671.0, "step": 2041 }, { "epoch": 4.652409466780725, "grad_norm": 1.859375, "learning_rate": 2.993958720485664e-06, "loss": 0.0274, "num_tokens": 205303229.0, "step": 2042 }, { "epoch": 4.654690618762475, "grad_norm": 1.2421875, "learning_rate": 2.9921122618382687e-06, "loss": 0.0161, "num_tokens": 205404204.0, "step": 2043 }, { "epoch": 4.656971770744226, "grad_norm": 1.375, "learning_rate": 2.9902655238774537e-06, "loss": 0.0187, "num_tokens": 205504718.0, "step": 2044 }, { "epoch": 4.659252922725977, "grad_norm": 2.109375, "learning_rate": 2.988418507651392e-06, "loss": 0.0287, "num_tokens": 205605570.0, "step": 2045 }, { "epoch": 4.661534074707728, "grad_norm": 1.640625, "learning_rate": 2.9865712142084145e-06, "loss": 0.0177, "num_tokens": 205706721.0, "step": 2046 }, { "epoch": 4.663815226689478, "grad_norm": 1.15625, "learning_rate": 2.98472364459701e-06, "loss": 0.0161, "num_tokens": 205807604.0, "step": 2047 }, { "epoch": 4.666096378671229, "grad_norm": 1.609375, "learning_rate": 2.982875799865823e-06, "loss": 0.0255, "num_tokens": 205908218.0, "step": 2048 }, { "epoch": 4.66837753065298, "grad_norm": 1.71875, "learning_rate": 2.9810276810636535e-06, "loss": 0.0202, "num_tokens": 206008645.0, "step": 2049 }, { "epoch": 4.6706586826347305, "grad_norm": 1.7265625, "learning_rate": 2.97917928923946e-06, "loss": 0.0232, "num_tokens": 206109640.0, "step": 2050 }, { "epoch": 4.672939834616481, "grad_norm": 1.46875, "learning_rate": 2.977330625442352e-06, "loss": 0.0212, "num_tokens": 206210663.0, "step": 2051 }, { "epoch": 4.675220986598232, "grad_norm": 1.28125, "learning_rate": 2.9754816907215963e-06, "loss": 0.0134, "num_tokens": 206311185.0, "step": 2052 }, { "epoch": 4.677502138579983, "grad_norm": 1.7578125, "learning_rate": 2.9736324861266125e-06, "loss": 0.0236, "num_tokens": 206411853.0, "step": 2053 }, { "epoch": 4.679783290561733, "grad_norm": 1.8515625, "learning_rate": 2.9717830127069734e-06, "loss": 0.0228, "num_tokens": 206512176.0, "step": 2054 }, { "epoch": 4.682064442543484, "grad_norm": 1.328125, "learning_rate": 2.969933271512404e-06, "loss": 0.0178, "num_tokens": 206612717.0, "step": 2055 }, { "epoch": 4.684345594525235, "grad_norm": 1.9375, "learning_rate": 2.9680832635927824e-06, "loss": 0.0232, "num_tokens": 206713057.0, "step": 2056 }, { "epoch": 4.686626746506986, "grad_norm": 1.6015625, "learning_rate": 2.9662329899981375e-06, "loss": 0.0211, "num_tokens": 206813918.0, "step": 2057 }, { "epoch": 4.688907898488737, "grad_norm": 1.1953125, "learning_rate": 2.964382451778648e-06, "loss": 0.0123, "num_tokens": 206914618.0, "step": 2058 }, { "epoch": 4.691189050470488, "grad_norm": 1.375, "learning_rate": 2.9625316499846444e-06, "loss": 0.0156, "num_tokens": 207015027.0, "step": 2059 }, { "epoch": 4.693470202452239, "grad_norm": 1.6015625, "learning_rate": 2.9606805856666053e-06, "loss": 0.0258, "num_tokens": 207115868.0, "step": 2060 }, { "epoch": 4.695751354433989, "grad_norm": 1.546875, "learning_rate": 2.95882925987516e-06, "loss": 0.0225, "num_tokens": 207216587.0, "step": 2061 }, { "epoch": 4.69803250641574, "grad_norm": 1.421875, "learning_rate": 2.9569776736610855e-06, "loss": 0.0229, "num_tokens": 207317270.0, "step": 2062 }, { "epoch": 4.700313658397491, "grad_norm": 1.453125, "learning_rate": 2.9551258280753046e-06, "loss": 0.025, "num_tokens": 207418043.0, "step": 2063 }, { "epoch": 4.702594810379241, "grad_norm": 1.2578125, "learning_rate": 2.953273724168891e-06, "loss": 0.0169, "num_tokens": 207518719.0, "step": 2064 }, { "epoch": 4.704875962360992, "grad_norm": 1.875, "learning_rate": 2.9514213629930614e-06, "loss": 0.0341, "num_tokens": 207618902.0, "step": 2065 }, { "epoch": 4.707157114342743, "grad_norm": 1.1015625, "learning_rate": 2.949568745599182e-06, "loss": 0.0141, "num_tokens": 207719245.0, "step": 2066 }, { "epoch": 4.709438266324494, "grad_norm": 1.5546875, "learning_rate": 2.9477158730387615e-06, "loss": 0.0223, "num_tokens": 207819643.0, "step": 2067 }, { "epoch": 4.711719418306244, "grad_norm": 1.5234375, "learning_rate": 2.945862746363455e-06, "loss": 0.0185, "num_tokens": 207920641.0, "step": 2068 }, { "epoch": 4.714000570287995, "grad_norm": 1.53125, "learning_rate": 2.944009366625061e-06, "loss": 0.0237, "num_tokens": 208021126.0, "step": 2069 }, { "epoch": 4.716281722269747, "grad_norm": 1.671875, "learning_rate": 2.942155734875523e-06, "loss": 0.0244, "num_tokens": 208121645.0, "step": 2070 }, { "epoch": 4.718562874251497, "grad_norm": 1.546875, "learning_rate": 2.9403018521669256e-06, "loss": 0.0188, "num_tokens": 208222265.0, "step": 2071 }, { "epoch": 4.720844026233248, "grad_norm": 1.2890625, "learning_rate": 2.938447719551498e-06, "loss": 0.016, "num_tokens": 208322538.0, "step": 2072 }, { "epoch": 4.723125178214999, "grad_norm": 1.6796875, "learning_rate": 2.9365933380816092e-06, "loss": 0.0222, "num_tokens": 208423336.0, "step": 2073 }, { "epoch": 4.7254063301967495, "grad_norm": 1.8671875, "learning_rate": 2.93473870880977e-06, "loss": 0.0216, "num_tokens": 208524254.0, "step": 2074 }, { "epoch": 4.7276874821785, "grad_norm": 1.671875, "learning_rate": 2.932883832788633e-06, "loss": 0.0215, "num_tokens": 208624965.0, "step": 2075 }, { "epoch": 4.729968634160251, "grad_norm": 1.9609375, "learning_rate": 2.9310287110709895e-06, "loss": 0.0327, "num_tokens": 208725765.0, "step": 2076 }, { "epoch": 4.732249786142002, "grad_norm": 1.3671875, "learning_rate": 2.9291733447097714e-06, "loss": 0.0181, "num_tokens": 208826325.0, "step": 2077 }, { "epoch": 4.734530938123752, "grad_norm": 1.703125, "learning_rate": 2.927317734758047e-06, "loss": 0.0195, "num_tokens": 208927026.0, "step": 2078 }, { "epoch": 4.736812090105503, "grad_norm": 1.671875, "learning_rate": 2.925461882269027e-06, "loss": 0.0229, "num_tokens": 209028009.0, "step": 2079 }, { "epoch": 4.739093242087254, "grad_norm": 1.859375, "learning_rate": 2.9236057882960567e-06, "loss": 0.0255, "num_tokens": 209129051.0, "step": 2080 }, { "epoch": 4.7413743940690045, "grad_norm": 1.5234375, "learning_rate": 2.921749453892618e-06, "loss": 0.0196, "num_tokens": 209229855.0, "step": 2081 }, { "epoch": 4.743655546050755, "grad_norm": 1.1484375, "learning_rate": 2.919892880112332e-06, "loss": 0.0149, "num_tokens": 209330351.0, "step": 2082 }, { "epoch": 4.745936698032507, "grad_norm": 1.7265625, "learning_rate": 2.9180360680089542e-06, "loss": 0.0221, "num_tokens": 209430754.0, "step": 2083 }, { "epoch": 4.748217850014258, "grad_norm": 1.5, "learning_rate": 2.9161790186363746e-06, "loss": 0.0171, "num_tokens": 209531346.0, "step": 2084 }, { "epoch": 4.750499001996008, "grad_norm": 1.4140625, "learning_rate": 2.9143217330486186e-06, "loss": 0.0206, "num_tokens": 209631609.0, "step": 2085 }, { "epoch": 4.752780153977759, "grad_norm": 1.4921875, "learning_rate": 2.9124642122998453e-06, "loss": 0.0186, "num_tokens": 209732728.0, "step": 2086 }, { "epoch": 4.75506130595951, "grad_norm": 1.3671875, "learning_rate": 2.9106064574443477e-06, "loss": 0.0147, "num_tokens": 209833161.0, "step": 2087 }, { "epoch": 4.75734245794126, "grad_norm": 1.4375, "learning_rate": 2.9087484695365523e-06, "loss": 0.021, "num_tokens": 209933681.0, "step": 2088 }, { "epoch": 4.759623609923011, "grad_norm": 1.546875, "learning_rate": 2.906890249631017e-06, "loss": 0.022, "num_tokens": 210034156.0, "step": 2089 }, { "epoch": 4.761904761904762, "grad_norm": 1.2265625, "learning_rate": 2.905031798782431e-06, "loss": 0.016, "num_tokens": 210134866.0, "step": 2090 }, { "epoch": 4.764185913886513, "grad_norm": 1.7734375, "learning_rate": 2.903173118045616e-06, "loss": 0.0238, "num_tokens": 210235483.0, "step": 2091 }, { "epoch": 4.766467065868263, "grad_norm": 1.6875, "learning_rate": 2.901314208475522e-06, "loss": 0.0213, "num_tokens": 210336097.0, "step": 2092 }, { "epoch": 4.768748217850014, "grad_norm": 1.7578125, "learning_rate": 2.8994550711272317e-06, "loss": 0.0229, "num_tokens": 210436724.0, "step": 2093 }, { "epoch": 4.771029369831765, "grad_norm": 1.5390625, "learning_rate": 2.897595707055954e-06, "loss": 0.0172, "num_tokens": 210537306.0, "step": 2094 }, { "epoch": 4.7733105218135155, "grad_norm": 1.6484375, "learning_rate": 2.8957361173170297e-06, "loss": 0.0137, "num_tokens": 210637564.0, "step": 2095 }, { "epoch": 4.775591673795267, "grad_norm": 1.5859375, "learning_rate": 2.893876302965925e-06, "loss": 0.022, "num_tokens": 210738221.0, "step": 2096 }, { "epoch": 4.777872825777018, "grad_norm": 1.2734375, "learning_rate": 2.8920162650582344e-06, "loss": 0.018, "num_tokens": 210840026.0, "step": 2097 }, { "epoch": 4.7801539777587685, "grad_norm": 1.7109375, "learning_rate": 2.8901560046496797e-06, "loss": 0.0237, "num_tokens": 210940611.0, "step": 2098 }, { "epoch": 4.782435129740519, "grad_norm": 1.6875, "learning_rate": 2.8882955227961098e-06, "loss": 0.0284, "num_tokens": 211041190.0, "step": 2099 }, { "epoch": 4.78471628172227, "grad_norm": 1.453125, "learning_rate": 2.886434820553497e-06, "loss": 0.0153, "num_tokens": 211142127.0, "step": 2100 }, { "epoch": 4.786997433704021, "grad_norm": 1.359375, "learning_rate": 2.884573898977941e-06, "loss": 0.0173, "num_tokens": 211242505.0, "step": 2101 }, { "epoch": 4.789278585685771, "grad_norm": 1.546875, "learning_rate": 2.882712759125664e-06, "loss": 0.0195, "num_tokens": 211343452.0, "step": 2102 }, { "epoch": 4.791559737667522, "grad_norm": 1.671875, "learning_rate": 2.8808514020530127e-06, "loss": 0.0247, "num_tokens": 211443905.0, "step": 2103 }, { "epoch": 4.793840889649273, "grad_norm": 1.3359375, "learning_rate": 2.8789898288164595e-06, "loss": 0.0158, "num_tokens": 211544217.0, "step": 2104 }, { "epoch": 4.7961220416310235, "grad_norm": 1.5859375, "learning_rate": 2.8771280404725953e-06, "loss": 0.0215, "num_tokens": 211644295.0, "step": 2105 }, { "epoch": 4.798403193612774, "grad_norm": 1.71875, "learning_rate": 2.8752660380781367e-06, "loss": 0.0238, "num_tokens": 211745070.0, "step": 2106 }, { "epoch": 4.800684345594525, "grad_norm": 1.5390625, "learning_rate": 2.8734038226899198e-06, "loss": 0.0169, "num_tokens": 211845182.0, "step": 2107 }, { "epoch": 4.802965497576276, "grad_norm": 1.53125, "learning_rate": 2.8715413953649012e-06, "loss": 0.0228, "num_tokens": 211946047.0, "step": 2108 }, { "epoch": 4.805246649558027, "grad_norm": 1.6640625, "learning_rate": 2.8696787571601597e-06, "loss": 0.0235, "num_tokens": 212046684.0, "step": 2109 }, { "epoch": 4.807527801539777, "grad_norm": 1.4296875, "learning_rate": 2.8678159091328926e-06, "loss": 0.0158, "num_tokens": 212147456.0, "step": 2110 }, { "epoch": 4.809808953521529, "grad_norm": 1.5625, "learning_rate": 2.865952852340417e-06, "loss": 0.0245, "num_tokens": 212248098.0, "step": 2111 }, { "epoch": 4.812090105503279, "grad_norm": 1.7421875, "learning_rate": 2.864089587840167e-06, "loss": 0.0206, "num_tokens": 212348097.0, "step": 2112 }, { "epoch": 4.81437125748503, "grad_norm": 2.25, "learning_rate": 2.862226116689696e-06, "loss": 0.0216, "num_tokens": 212448562.0, "step": 2113 }, { "epoch": 4.816652409466781, "grad_norm": 1.578125, "learning_rate": 2.8603624399466732e-06, "loss": 0.0174, "num_tokens": 212549938.0, "step": 2114 }, { "epoch": 4.818933561448532, "grad_norm": 1.5, "learning_rate": 2.858498558668888e-06, "loss": 0.0195, "num_tokens": 212650705.0, "step": 2115 }, { "epoch": 4.821214713430282, "grad_norm": 1.859375, "learning_rate": 2.856634473914242e-06, "loss": 0.0266, "num_tokens": 212752091.0, "step": 2116 }, { "epoch": 4.823495865412033, "grad_norm": 1.4140625, "learning_rate": 2.854770186740753e-06, "loss": 0.0191, "num_tokens": 212853316.0, "step": 2117 }, { "epoch": 4.825777017393784, "grad_norm": 1.78125, "learning_rate": 2.8529056982065557e-06, "loss": 0.0231, "num_tokens": 212953823.0, "step": 2118 }, { "epoch": 4.8280581693755344, "grad_norm": 1.46875, "learning_rate": 2.8510410093698966e-06, "loss": 0.0188, "num_tokens": 213053935.0, "step": 2119 }, { "epoch": 4.830339321357285, "grad_norm": 1.34375, "learning_rate": 2.849176121289138e-06, "loss": 0.0178, "num_tokens": 213155310.0, "step": 2120 }, { "epoch": 4.832620473339036, "grad_norm": 1.4609375, "learning_rate": 2.8473110350227536e-06, "loss": 0.0177, "num_tokens": 213256172.0, "step": 2121 }, { "epoch": 4.8349016253207875, "grad_norm": 1.375, "learning_rate": 2.845445751629331e-06, "loss": 0.0188, "num_tokens": 213357504.0, "step": 2122 }, { "epoch": 4.837182777302537, "grad_norm": 1.4453125, "learning_rate": 2.843580272167569e-06, "loss": 0.0182, "num_tokens": 213458319.0, "step": 2123 }, { "epoch": 4.839463929284289, "grad_norm": 1.3203125, "learning_rate": 2.8417145976962773e-06, "loss": 0.0137, "num_tokens": 213558601.0, "step": 2124 }, { "epoch": 4.84174508126604, "grad_norm": 1.4765625, "learning_rate": 2.8398487292743772e-06, "loss": 0.0216, "num_tokens": 213658916.0, "step": 2125 }, { "epoch": 4.84402623324779, "grad_norm": 1.546875, "learning_rate": 2.8379826679609e-06, "loss": 0.0243, "num_tokens": 213759140.0, "step": 2126 }, { "epoch": 4.846307385229541, "grad_norm": 1.7734375, "learning_rate": 2.836116414814985e-06, "loss": 0.0225, "num_tokens": 213859379.0, "step": 2127 }, { "epoch": 4.848588537211292, "grad_norm": 1.6015625, "learning_rate": 2.8342499708958827e-06, "loss": 0.0219, "num_tokens": 213959952.0, "step": 2128 }, { "epoch": 4.8508696891930425, "grad_norm": 1.53125, "learning_rate": 2.8323833372629485e-06, "loss": 0.0204, "num_tokens": 214061055.0, "step": 2129 }, { "epoch": 4.853150841174793, "grad_norm": 1.1875, "learning_rate": 2.8305165149756496e-06, "loss": 0.0139, "num_tokens": 214161787.0, "step": 2130 }, { "epoch": 4.855431993156544, "grad_norm": 1.8125, "learning_rate": 2.828649505093558e-06, "loss": 0.0207, "num_tokens": 214262464.0, "step": 2131 }, { "epoch": 4.857713145138295, "grad_norm": 1.734375, "learning_rate": 2.826782308676351e-06, "loss": 0.022, "num_tokens": 214362689.0, "step": 2132 }, { "epoch": 4.859994297120045, "grad_norm": 1.4609375, "learning_rate": 2.824914926783815e-06, "loss": 0.012, "num_tokens": 214463597.0, "step": 2133 }, { "epoch": 4.862275449101796, "grad_norm": 1.421875, "learning_rate": 2.82304736047584e-06, "loss": 0.0191, "num_tokens": 214564351.0, "step": 2134 }, { "epoch": 4.864556601083547, "grad_norm": 1.6015625, "learning_rate": 2.821179610812419e-06, "loss": 0.0218, "num_tokens": 214665405.0, "step": 2135 }, { "epoch": 4.8668377530652975, "grad_norm": 1.7109375, "learning_rate": 2.819311678853652e-06, "loss": 0.0221, "num_tokens": 214766690.0, "step": 2136 }, { "epoch": 4.869118905047049, "grad_norm": 1.3046875, "learning_rate": 2.8174435656597403e-06, "loss": 0.0144, "num_tokens": 214867391.0, "step": 2137 }, { "epoch": 4.8714000570288, "grad_norm": 1.5078125, "learning_rate": 2.8155752722909896e-06, "loss": 0.0205, "num_tokens": 214968056.0, "step": 2138 }, { "epoch": 4.873681209010551, "grad_norm": 1.3828125, "learning_rate": 2.8137067998078073e-06, "loss": 0.0146, "num_tokens": 215068250.0, "step": 2139 }, { "epoch": 4.875962360992301, "grad_norm": 1.59375, "learning_rate": 2.8118381492707004e-06, "loss": 0.025, "num_tokens": 215168516.0, "step": 2140 }, { "epoch": 4.878243512974052, "grad_norm": 1.203125, "learning_rate": 2.8099693217402807e-06, "loss": 0.0178, "num_tokens": 215269704.0, "step": 2141 }, { "epoch": 4.880524664955803, "grad_norm": 1.5390625, "learning_rate": 2.808100318277258e-06, "loss": 0.0298, "num_tokens": 215370482.0, "step": 2142 }, { "epoch": 4.8828058169375534, "grad_norm": 1.6015625, "learning_rate": 2.806231139942443e-06, "loss": 0.0195, "num_tokens": 215471817.0, "step": 2143 }, { "epoch": 4.885086968919304, "grad_norm": 2.390625, "learning_rate": 2.8043617877967456e-06, "loss": 0.0179, "num_tokens": 215572184.0, "step": 2144 }, { "epoch": 4.887368120901055, "grad_norm": 1.5078125, "learning_rate": 2.8024922629011727e-06, "loss": 0.0225, "num_tokens": 215673187.0, "step": 2145 }, { "epoch": 4.889649272882806, "grad_norm": 1.515625, "learning_rate": 2.800622566316831e-06, "loss": 0.0181, "num_tokens": 215773665.0, "step": 2146 }, { "epoch": 4.891930424864556, "grad_norm": 1.078125, "learning_rate": 2.798752699104925e-06, "loss": 0.0102, "num_tokens": 215873855.0, "step": 2147 }, { "epoch": 4.894211576846307, "grad_norm": 2.296875, "learning_rate": 2.7968826623267542e-06, "loss": 0.0224, "num_tokens": 215974571.0, "step": 2148 }, { "epoch": 4.896492728828058, "grad_norm": 1.546875, "learning_rate": 2.7950124570437163e-06, "loss": 0.017, "num_tokens": 216075263.0, "step": 2149 }, { "epoch": 4.898773880809809, "grad_norm": 1.7265625, "learning_rate": 2.793142084317303e-06, "loss": 0.0263, "num_tokens": 216175605.0, "step": 2150 }, { "epoch": 4.90105503279156, "grad_norm": 1.5625, "learning_rate": 2.7912715452091014e-06, "loss": 0.0162, "num_tokens": 216276543.0, "step": 2151 }, { "epoch": 4.903336184773311, "grad_norm": 1.7890625, "learning_rate": 2.789400840780795e-06, "loss": 0.0243, "num_tokens": 216376880.0, "step": 2152 }, { "epoch": 4.9056173367550615, "grad_norm": 1.671875, "learning_rate": 2.7875299720941577e-06, "loss": 0.0193, "num_tokens": 216477927.0, "step": 2153 }, { "epoch": 4.907898488736812, "grad_norm": 1.6796875, "learning_rate": 2.785658940211059e-06, "loss": 0.0207, "num_tokens": 216578334.0, "step": 2154 }, { "epoch": 4.910179640718563, "grad_norm": 1.8125, "learning_rate": 2.7837877461934616e-06, "loss": 0.0223, "num_tokens": 216678400.0, "step": 2155 }, { "epoch": 4.912460792700314, "grad_norm": 2.140625, "learning_rate": 2.7819163911034175e-06, "loss": 0.0251, "num_tokens": 216779031.0, "step": 2156 }, { "epoch": 4.914741944682064, "grad_norm": 1.8203125, "learning_rate": 2.7800448760030724e-06, "loss": 0.0243, "num_tokens": 216879866.0, "step": 2157 }, { "epoch": 4.917023096663815, "grad_norm": 1.6953125, "learning_rate": 2.7781732019546625e-06, "loss": 0.0165, "num_tokens": 216980419.0, "step": 2158 }, { "epoch": 4.919304248645566, "grad_norm": 1.7265625, "learning_rate": 2.776301370020513e-06, "loss": 0.0209, "num_tokens": 217080334.0, "step": 2159 }, { "epoch": 4.9215854006273165, "grad_norm": 1.828125, "learning_rate": 2.7744293812630412e-06, "loss": 0.0254, "num_tokens": 217180983.0, "step": 2160 }, { "epoch": 4.923866552609067, "grad_norm": 2.0625, "learning_rate": 2.77255723674475e-06, "loss": 0.0303, "num_tokens": 217281822.0, "step": 2161 }, { "epoch": 4.926147704590818, "grad_norm": 1.84375, "learning_rate": 2.770684937528233e-06, "loss": 0.026, "num_tokens": 217382428.0, "step": 2162 }, { "epoch": 4.92842885657257, "grad_norm": 1.625, "learning_rate": 2.7688124846761716e-06, "loss": 0.019, "num_tokens": 217482837.0, "step": 2163 }, { "epoch": 4.93071000855432, "grad_norm": 1.8671875, "learning_rate": 2.766939879251333e-06, "loss": 0.0287, "num_tokens": 217584003.0, "step": 2164 }, { "epoch": 4.932991160536071, "grad_norm": 1.6875, "learning_rate": 2.7650671223165726e-06, "loss": 0.0179, "num_tokens": 217684435.0, "step": 2165 }, { "epoch": 4.935272312517822, "grad_norm": 1.46875, "learning_rate": 2.7631942149348313e-06, "loss": 0.0217, "num_tokens": 217785071.0, "step": 2166 }, { "epoch": 4.937553464499572, "grad_norm": 1.546875, "learning_rate": 2.761321158169134e-06, "loss": 0.0199, "num_tokens": 217885886.0, "step": 2167 }, { "epoch": 4.939834616481323, "grad_norm": 1.484375, "learning_rate": 2.759447953082593e-06, "loss": 0.02, "num_tokens": 217986161.0, "step": 2168 }, { "epoch": 4.942115768463074, "grad_norm": 1.2421875, "learning_rate": 2.757574600738402e-06, "loss": 0.0214, "num_tokens": 218086247.0, "step": 2169 }, { "epoch": 4.944396920444825, "grad_norm": 1.359375, "learning_rate": 2.755701102199841e-06, "loss": 0.0204, "num_tokens": 218186869.0, "step": 2170 }, { "epoch": 4.946678072426575, "grad_norm": 1.4140625, "learning_rate": 2.7538274585302707e-06, "loss": 0.0137, "num_tokens": 218287590.0, "step": 2171 }, { "epoch": 4.948959224408326, "grad_norm": 1.28125, "learning_rate": 2.751953670793135e-06, "loss": 0.0177, "num_tokens": 218388072.0, "step": 2172 }, { "epoch": 4.951240376390077, "grad_norm": 1.2578125, "learning_rate": 2.7500797400519595e-06, "loss": 0.0201, "num_tokens": 218488684.0, "step": 2173 }, { "epoch": 4.9535215283718275, "grad_norm": 1.296875, "learning_rate": 2.7482056673703526e-06, "loss": 0.0235, "num_tokens": 218589468.0, "step": 2174 }, { "epoch": 4.955802680353578, "grad_norm": 1.765625, "learning_rate": 2.746331453812e-06, "loss": 0.0187, "num_tokens": 218690120.0, "step": 2175 }, { "epoch": 4.95808383233533, "grad_norm": 1.21875, "learning_rate": 2.74445710044067e-06, "loss": 0.0188, "num_tokens": 218791159.0, "step": 2176 }, { "epoch": 4.9603649843170805, "grad_norm": 1.25, "learning_rate": 2.7425826083202096e-06, "loss": 0.0152, "num_tokens": 218891617.0, "step": 2177 }, { "epoch": 4.962646136298831, "grad_norm": 1.5703125, "learning_rate": 2.740707978514543e-06, "loss": 0.0224, "num_tokens": 218992617.0, "step": 2178 }, { "epoch": 4.964927288280582, "grad_norm": 1.71875, "learning_rate": 2.738833212087676e-06, "loss": 0.0178, "num_tokens": 219093525.0, "step": 2179 }, { "epoch": 4.967208440262333, "grad_norm": 1.171875, "learning_rate": 2.736958310103688e-06, "loss": 0.0164, "num_tokens": 219194123.0, "step": 2180 }, { "epoch": 4.969489592244083, "grad_norm": 2.015625, "learning_rate": 2.735083273626738e-06, "loss": 0.03, "num_tokens": 219294324.0, "step": 2181 }, { "epoch": 4.971770744225834, "grad_norm": 1.2734375, "learning_rate": 2.7332081037210607e-06, "loss": 0.0202, "num_tokens": 219395518.0, "step": 2182 }, { "epoch": 4.974051896207585, "grad_norm": 1.6171875, "learning_rate": 2.7313328014509653e-06, "loss": 0.0236, "num_tokens": 219495995.0, "step": 2183 }, { "epoch": 4.9763330481893355, "grad_norm": 1.328125, "learning_rate": 2.729457367880838e-06, "loss": 0.0193, "num_tokens": 219596708.0, "step": 2184 }, { "epoch": 4.978614200171086, "grad_norm": 1.546875, "learning_rate": 2.727581804075139e-06, "loss": 0.021, "num_tokens": 219696797.0, "step": 2185 }, { "epoch": 4.980895352152837, "grad_norm": 1.609375, "learning_rate": 2.7257061110984005e-06, "loss": 0.023, "num_tokens": 219797291.0, "step": 2186 }, { "epoch": 4.983176504134588, "grad_norm": 1.796875, "learning_rate": 2.7238302900152327e-06, "loss": 0.022, "num_tokens": 219897745.0, "step": 2187 }, { "epoch": 4.985457656116338, "grad_norm": 1.5625, "learning_rate": 2.7219543418903115e-06, "loss": 0.022, "num_tokens": 219997864.0, "step": 2188 }, { "epoch": 4.98773880809809, "grad_norm": 1.7109375, "learning_rate": 2.720078267788392e-06, "loss": 0.0253, "num_tokens": 220098171.0, "step": 2189 }, { "epoch": 4.99001996007984, "grad_norm": 1.703125, "learning_rate": 2.718202068774296e-06, "loss": 0.0266, "num_tokens": 220199670.0, "step": 2190 }, { "epoch": 4.992301112061591, "grad_norm": 1.578125, "learning_rate": 2.7163257459129184e-06, "loss": 0.0186, "num_tokens": 220300427.0, "step": 2191 }, { "epoch": 4.994582264043342, "grad_norm": 1.2734375, "learning_rate": 2.7144493002692242e-06, "loss": 0.0167, "num_tokens": 220401122.0, "step": 2192 }, { "epoch": 4.996863416025093, "grad_norm": 1.953125, "learning_rate": 2.7125727329082474e-06, "loss": 0.0294, "num_tokens": 220501625.0, "step": 2193 }, { "epoch": 4.999144568006844, "grad_norm": 1.4453125, "learning_rate": 2.7106960448950904e-06, "loss": 0.0169, "num_tokens": 220602216.0, "step": 2194 }, { "epoch": 5.0, "grad_norm": 1.7109375, "learning_rate": 2.7088192372949267e-06, "loss": 0.0169, "num_tokens": 220639360.0, "step": 2195 } ], "logging_steps": 1, "max_steps": 4390, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.4007296275814646e+18, "train_batch_size": 16, "trial_name": null, "trial_params": null }