{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 3315, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0009049773755656109, "grad_norm": 0.47269991076396356, "learning_rate": 9.999997754708895e-06, "loss": 0.0008, "step": 1 }, { "epoch": 0.0018099547511312218, "grad_norm": 5.58908466397515, "learning_rate": 9.999991018837592e-06, "loss": 0.0579, "step": 2 }, { "epoch": 0.0027149321266968325, "grad_norm": 8.192018248534273, "learning_rate": 9.999979792392144e-06, "loss": 0.1689, "step": 3 }, { "epoch": 0.0036199095022624436, "grad_norm": 4.7854739083366, "learning_rate": 9.999964075382633e-06, "loss": 0.0821, "step": 4 }, { "epoch": 0.004524886877828055, "grad_norm": 4.1698402564905965, "learning_rate": 9.999943867823174e-06, "loss": 0.1015, "step": 5 }, { "epoch": 0.005429864253393665, "grad_norm": 3.6865182443158404, "learning_rate": 9.999919169731915e-06, "loss": 0.0961, "step": 6 }, { "epoch": 0.006334841628959276, "grad_norm": 3.825809035713377, "learning_rate": 9.999889981131038e-06, "loss": 0.1216, "step": 7 }, { "epoch": 0.007239819004524887, "grad_norm": 3.3720878948755124, "learning_rate": 9.99985630204676e-06, "loss": 0.0667, "step": 8 }, { "epoch": 0.008144796380090498, "grad_norm": 5.1194010399230265, "learning_rate": 9.999818132509327e-06, "loss": 0.1423, "step": 9 }, { "epoch": 0.00904977375565611, "grad_norm": 6.919741103077917, "learning_rate": 9.999775472553019e-06, "loss": 0.1084, "step": 10 }, { "epoch": 0.009954751131221719, "grad_norm": 3.4181205844959095, "learning_rate": 9.999728322216153e-06, "loss": 0.0922, "step": 11 }, { "epoch": 0.01085972850678733, "grad_norm": 3.045258433749866, "learning_rate": 9.99967668154107e-06, "loss": 0.0781, "step": 12 }, { "epoch": 0.011764705882352941, "grad_norm": 6.545815123392674, "learning_rate": 9.999620550574155e-06, "loss": 0.1184, "step": 13 }, { "epoch": 0.012669683257918552, "grad_norm": 4.267191144668087, "learning_rate": 9.999559929365815e-06, "loss": 0.1139, "step": 14 }, { "epoch": 0.013574660633484163, "grad_norm": 5.607594105060613, "learning_rate": 9.999494817970498e-06, "loss": 0.1599, "step": 15 }, { "epoch": 0.014479638009049774, "grad_norm": 4.608136763428088, "learning_rate": 9.99942521644668e-06, "loss": 0.1571, "step": 16 }, { "epoch": 0.015384615384615385, "grad_norm": 3.731719940688652, "learning_rate": 9.999351124856873e-06, "loss": 0.1346, "step": 17 }, { "epoch": 0.016289592760180997, "grad_norm": 3.5131724902760304, "learning_rate": 9.999272543267621e-06, "loss": 0.1164, "step": 18 }, { "epoch": 0.017194570135746608, "grad_norm": 5.183134745409451, "learning_rate": 9.999189471749495e-06, "loss": 0.1398, "step": 19 }, { "epoch": 0.01809954751131222, "grad_norm": 2.893941660096166, "learning_rate": 9.999101910377107e-06, "loss": 0.0947, "step": 20 }, { "epoch": 0.019004524886877826, "grad_norm": 2.8419229469148717, "learning_rate": 9.999009859229097e-06, "loss": 0.1319, "step": 21 }, { "epoch": 0.019909502262443438, "grad_norm": 4.654569754527284, "learning_rate": 9.998913318388134e-06, "loss": 0.1829, "step": 22 }, { "epoch": 0.02081447963800905, "grad_norm": 3.3092299130797986, "learning_rate": 9.998812287940925e-06, "loss": 0.1395, "step": 23 }, { "epoch": 0.02171945701357466, "grad_norm": 3.503225285786879, "learning_rate": 9.99870676797821e-06, "loss": 0.1275, "step": 24 }, { "epoch": 0.02262443438914027, "grad_norm": 2.938363326710556, "learning_rate": 9.998596758594752e-06, "loss": 0.1267, "step": 25 }, { "epoch": 0.023529411764705882, "grad_norm": 2.7318128258286123, "learning_rate": 9.99848225988936e-06, "loss": 0.1182, "step": 26 }, { "epoch": 0.024434389140271493, "grad_norm": 3.3197263172738167, "learning_rate": 9.998363271964859e-06, "loss": 0.1238, "step": 27 }, { "epoch": 0.025339366515837104, "grad_norm": 3.2146683567329957, "learning_rate": 9.99823979492812e-06, "loss": 0.1228, "step": 28 }, { "epoch": 0.026244343891402715, "grad_norm": 3.454226853437581, "learning_rate": 9.998111828890039e-06, "loss": 0.1277, "step": 29 }, { "epoch": 0.027149321266968326, "grad_norm": 4.818327511634349, "learning_rate": 9.997979373965542e-06, "loss": 0.1978, "step": 30 }, { "epoch": 0.028054298642533938, "grad_norm": 2.4367976421622286, "learning_rate": 9.99784243027359e-06, "loss": 0.0992, "step": 31 }, { "epoch": 0.02895927601809955, "grad_norm": 4.883109432829794, "learning_rate": 9.997700997937173e-06, "loss": 0.1297, "step": 32 }, { "epoch": 0.02986425339366516, "grad_norm": 4.41468793014304, "learning_rate": 9.997555077083318e-06, "loss": 0.1671, "step": 33 }, { "epoch": 0.03076923076923077, "grad_norm": 3.3652719994320033, "learning_rate": 9.997404667843076e-06, "loss": 0.1265, "step": 34 }, { "epoch": 0.03167420814479638, "grad_norm": 4.80482934897459, "learning_rate": 9.997249770351531e-06, "loss": 0.1667, "step": 35 }, { "epoch": 0.03257918552036199, "grad_norm": 3.5646186437318366, "learning_rate": 9.9970903847478e-06, "loss": 0.1397, "step": 36 }, { "epoch": 0.0334841628959276, "grad_norm": 3.641799499057348, "learning_rate": 9.996926511175031e-06, "loss": 0.1305, "step": 37 }, { "epoch": 0.034389140271493215, "grad_norm": 3.249670859629268, "learning_rate": 9.9967581497804e-06, "loss": 0.1073, "step": 38 }, { "epoch": 0.03529411764705882, "grad_norm": 6.768125241208062, "learning_rate": 9.996585300715117e-06, "loss": 0.172, "step": 39 }, { "epoch": 0.03619909502262444, "grad_norm": 4.05121535599894, "learning_rate": 9.996407964134416e-06, "loss": 0.1453, "step": 40 }, { "epoch": 0.037104072398190045, "grad_norm": 4.739381485140201, "learning_rate": 9.996226140197572e-06, "loss": 0.18, "step": 41 }, { "epoch": 0.03800904977375565, "grad_norm": 3.804310526570808, "learning_rate": 9.996039829067879e-06, "loss": 0.1533, "step": 42 }, { "epoch": 0.03891402714932127, "grad_norm": 4.21024758747627, "learning_rate": 9.99584903091267e-06, "loss": 0.1642, "step": 43 }, { "epoch": 0.039819004524886875, "grad_norm": 6.757244686775144, "learning_rate": 9.995653745903301e-06, "loss": 0.1112, "step": 44 }, { "epoch": 0.04072398190045249, "grad_norm": 3.5575104177180896, "learning_rate": 9.995453974215164e-06, "loss": 0.14, "step": 45 }, { "epoch": 0.0416289592760181, "grad_norm": 5.2989876325877745, "learning_rate": 9.995249716027673e-06, "loss": 0.2003, "step": 46 }, { "epoch": 0.04253393665158371, "grad_norm": 4.295626046392908, "learning_rate": 9.99504097152428e-06, "loss": 0.1322, "step": 47 }, { "epoch": 0.04343891402714932, "grad_norm": 4.609685844247391, "learning_rate": 9.994827740892458e-06, "loss": 0.1695, "step": 48 }, { "epoch": 0.044343891402714934, "grad_norm": 5.1028619306010885, "learning_rate": 9.994610024323715e-06, "loss": 0.1968, "step": 49 }, { "epoch": 0.04524886877828054, "grad_norm": 3.266685403781502, "learning_rate": 9.994387822013586e-06, "loss": 0.1502, "step": 50 }, { "epoch": 0.046153846153846156, "grad_norm": 5.998633114915114, "learning_rate": 9.994161134161635e-06, "loss": 0.2021, "step": 51 }, { "epoch": 0.047058823529411764, "grad_norm": 2.951057051540106, "learning_rate": 9.99392996097145e-06, "loss": 0.1196, "step": 52 }, { "epoch": 0.04796380090497738, "grad_norm": 4.247979494562004, "learning_rate": 9.993694302650656e-06, "loss": 0.1282, "step": 53 }, { "epoch": 0.048868778280542986, "grad_norm": 3.1985814899859872, "learning_rate": 9.993454159410902e-06, "loss": 0.1415, "step": 54 }, { "epoch": 0.049773755656108594, "grad_norm": 3.138565842478053, "learning_rate": 9.99320953146786e-06, "loss": 0.1325, "step": 55 }, { "epoch": 0.05067873303167421, "grad_norm": 2.8556010047642886, "learning_rate": 9.992960419041239e-06, "loss": 0.1219, "step": 56 }, { "epoch": 0.051583710407239816, "grad_norm": 4.346358501839163, "learning_rate": 9.99270682235477e-06, "loss": 0.1715, "step": 57 }, { "epoch": 0.05248868778280543, "grad_norm": 3.3749292271304148, "learning_rate": 9.992448741636208e-06, "loss": 0.1716, "step": 58 }, { "epoch": 0.05339366515837104, "grad_norm": 3.120311601007822, "learning_rate": 9.992186177117345e-06, "loss": 0.1363, "step": 59 }, { "epoch": 0.05429864253393665, "grad_norm": 3.9350937945206934, "learning_rate": 9.991919129033994e-06, "loss": 0.1436, "step": 60 }, { "epoch": 0.05520361990950226, "grad_norm": 4.634005541940868, "learning_rate": 9.991647597625991e-06, "loss": 0.2246, "step": 61 }, { "epoch": 0.056108597285067875, "grad_norm": 4.073134299222118, "learning_rate": 9.991371583137206e-06, "loss": 0.1677, "step": 62 }, { "epoch": 0.05701357466063348, "grad_norm": 4.137546485146638, "learning_rate": 9.991091085815532e-06, "loss": 0.1406, "step": 63 }, { "epoch": 0.0579185520361991, "grad_norm": 2.9272745639045703, "learning_rate": 9.990806105912889e-06, "loss": 0.1106, "step": 64 }, { "epoch": 0.058823529411764705, "grad_norm": 3.523354867615051, "learning_rate": 9.990516643685222e-06, "loss": 0.1322, "step": 65 }, { "epoch": 0.05972850678733032, "grad_norm": 5.101610870882501, "learning_rate": 9.990222699392499e-06, "loss": 0.1552, "step": 66 }, { "epoch": 0.06063348416289593, "grad_norm": 7.8657889984539775, "learning_rate": 9.989924273298719e-06, "loss": 0.1391, "step": 67 }, { "epoch": 0.06153846153846154, "grad_norm": 3.8667917772907106, "learning_rate": 9.989621365671902e-06, "loss": 0.1345, "step": 68 }, { "epoch": 0.06244343891402715, "grad_norm": 4.304766172576919, "learning_rate": 9.989313976784093e-06, "loss": 0.1429, "step": 69 }, { "epoch": 0.06334841628959276, "grad_norm": 2.9095593083404157, "learning_rate": 9.989002106911368e-06, "loss": 0.1189, "step": 70 }, { "epoch": 0.06425339366515836, "grad_norm": 3.4940023946027283, "learning_rate": 9.988685756333818e-06, "loss": 0.1358, "step": 71 }, { "epoch": 0.06515837104072399, "grad_norm": 4.38787993716091, "learning_rate": 9.988364925335565e-06, "loss": 0.1421, "step": 72 }, { "epoch": 0.0660633484162896, "grad_norm": 2.7384505209362473, "learning_rate": 9.98803961420475e-06, "loss": 0.1045, "step": 73 }, { "epoch": 0.0669683257918552, "grad_norm": 3.0385506327441094, "learning_rate": 9.987709823233543e-06, "loss": 0.126, "step": 74 }, { "epoch": 0.06787330316742081, "grad_norm": 4.6745313540672075, "learning_rate": 9.987375552718133e-06, "loss": 0.2122, "step": 75 }, { "epoch": 0.06877828054298643, "grad_norm": 4.144632053888554, "learning_rate": 9.987036802958734e-06, "loss": 0.1386, "step": 76 }, { "epoch": 0.06968325791855204, "grad_norm": 5.191868998934833, "learning_rate": 9.986693574259584e-06, "loss": 0.2513, "step": 77 }, { "epoch": 0.07058823529411765, "grad_norm": 3.733548639226825, "learning_rate": 9.98634586692894e-06, "loss": 0.1736, "step": 78 }, { "epoch": 0.07149321266968325, "grad_norm": 4.335044285084542, "learning_rate": 9.985993681279087e-06, "loss": 0.1411, "step": 79 }, { "epoch": 0.07239819004524888, "grad_norm": 4.38886494893894, "learning_rate": 9.985637017626326e-06, "loss": 0.1599, "step": 80 }, { "epoch": 0.07330316742081448, "grad_norm": 4.068551124667603, "learning_rate": 9.985275876290982e-06, "loss": 0.1982, "step": 81 }, { "epoch": 0.07420814479638009, "grad_norm": 9.751303321149006, "learning_rate": 9.984910257597406e-06, "loss": 0.1899, "step": 82 }, { "epoch": 0.0751131221719457, "grad_norm": 2.8228663720375713, "learning_rate": 9.984540161873961e-06, "loss": 0.117, "step": 83 }, { "epoch": 0.0760180995475113, "grad_norm": 3.477078743250181, "learning_rate": 9.984165589453041e-06, "loss": 0.1454, "step": 84 }, { "epoch": 0.07692307692307693, "grad_norm": 4.911407636725847, "learning_rate": 9.983786540671052e-06, "loss": 0.2021, "step": 85 }, { "epoch": 0.07782805429864253, "grad_norm": 3.6581220722919214, "learning_rate": 9.983403015868424e-06, "loss": 0.1513, "step": 86 }, { "epoch": 0.07873303167420814, "grad_norm": 4.08384032257526, "learning_rate": 9.983015015389608e-06, "loss": 0.1864, "step": 87 }, { "epoch": 0.07963800904977375, "grad_norm": 3.469002501809984, "learning_rate": 9.982622539583074e-06, "loss": 0.1457, "step": 88 }, { "epoch": 0.08054298642533937, "grad_norm": 3.2154356301977853, "learning_rate": 9.98222558880131e-06, "loss": 0.1316, "step": 89 }, { "epoch": 0.08144796380090498, "grad_norm": 5.445599519607536, "learning_rate": 9.981824163400827e-06, "loss": 0.1427, "step": 90 }, { "epoch": 0.08235294117647059, "grad_norm": 5.426938375489811, "learning_rate": 9.981418263742148e-06, "loss": 0.1955, "step": 91 }, { "epoch": 0.0832579185520362, "grad_norm": 3.2870546259709505, "learning_rate": 9.98100789018982e-06, "loss": 0.1151, "step": 92 }, { "epoch": 0.08416289592760182, "grad_norm": 3.9153844378676315, "learning_rate": 9.980593043112405e-06, "loss": 0.1527, "step": 93 }, { "epoch": 0.08506787330316742, "grad_norm": 4.768583263006547, "learning_rate": 9.980173722882485e-06, "loss": 0.1314, "step": 94 }, { "epoch": 0.08597285067873303, "grad_norm": 3.287304573978618, "learning_rate": 9.979749929876658e-06, "loss": 0.1539, "step": 95 }, { "epoch": 0.08687782805429864, "grad_norm": 5.590238985709891, "learning_rate": 9.979321664475541e-06, "loss": 0.2117, "step": 96 }, { "epoch": 0.08778280542986425, "grad_norm": 3.3454425477029024, "learning_rate": 9.978888927063763e-06, "loss": 0.0964, "step": 97 }, { "epoch": 0.08868778280542987, "grad_norm": 4.492978037803346, "learning_rate": 9.978451718029975e-06, "loss": 0.1972, "step": 98 }, { "epoch": 0.08959276018099548, "grad_norm": 3.4680183002399376, "learning_rate": 9.978010037766842e-06, "loss": 0.1339, "step": 99 }, { "epoch": 0.09049773755656108, "grad_norm": 7.535024738087428, "learning_rate": 9.977563886671043e-06, "loss": 0.2962, "step": 100 }, { "epoch": 0.09140271493212669, "grad_norm": 4.25341117877344, "learning_rate": 9.977113265143273e-06, "loss": 0.1638, "step": 101 }, { "epoch": 0.09230769230769231, "grad_norm": 2.81115539760575, "learning_rate": 9.976658173588244e-06, "loss": 0.1035, "step": 102 }, { "epoch": 0.09321266968325792, "grad_norm": 3.2497018203012287, "learning_rate": 9.976198612414681e-06, "loss": 0.1329, "step": 103 }, { "epoch": 0.09411764705882353, "grad_norm": 3.7393434244153085, "learning_rate": 9.975734582035323e-06, "loss": 0.1426, "step": 104 }, { "epoch": 0.09502262443438914, "grad_norm": 3.4601957493918363, "learning_rate": 9.975266082866923e-06, "loss": 0.1518, "step": 105 }, { "epoch": 0.09592760180995476, "grad_norm": 3.8493230670014302, "learning_rate": 9.97479311533025e-06, "loss": 0.1701, "step": 106 }, { "epoch": 0.09683257918552036, "grad_norm": 6.136951456089172, "learning_rate": 9.97431567985008e-06, "loss": 0.2177, "step": 107 }, { "epoch": 0.09773755656108597, "grad_norm": 4.876331768558669, "learning_rate": 9.97383377685521e-06, "loss": 0.2383, "step": 108 }, { "epoch": 0.09864253393665158, "grad_norm": 3.100107729077273, "learning_rate": 9.973347406778442e-06, "loss": 0.1589, "step": 109 }, { "epoch": 0.09954751131221719, "grad_norm": 4.508698417783094, "learning_rate": 9.972856570056594e-06, "loss": 0.1825, "step": 110 }, { "epoch": 0.10045248868778281, "grad_norm": 2.9041171849986256, "learning_rate": 9.972361267130495e-06, "loss": 0.1286, "step": 111 }, { "epoch": 0.10135746606334842, "grad_norm": 2.744628605490882, "learning_rate": 9.971861498444983e-06, "loss": 0.112, "step": 112 }, { "epoch": 0.10226244343891402, "grad_norm": 2.6199217798951713, "learning_rate": 9.97135726444891e-06, "loss": 0.1225, "step": 113 }, { "epoch": 0.10316742081447963, "grad_norm": 4.007704580772402, "learning_rate": 9.970848565595137e-06, "loss": 0.1467, "step": 114 }, { "epoch": 0.10407239819004525, "grad_norm": 4.535634176272924, "learning_rate": 9.970335402340534e-06, "loss": 0.1868, "step": 115 }, { "epoch": 0.10497737556561086, "grad_norm": 3.716946132669637, "learning_rate": 9.969817775145983e-06, "loss": 0.1587, "step": 116 }, { "epoch": 0.10588235294117647, "grad_norm": 3.126583025742419, "learning_rate": 9.96929568447637e-06, "loss": 0.1344, "step": 117 }, { "epoch": 0.10678733031674208, "grad_norm": 3.754872324750814, "learning_rate": 9.968769130800595e-06, "loss": 0.1559, "step": 118 }, { "epoch": 0.1076923076923077, "grad_norm": 4.448920024284253, "learning_rate": 9.968238114591567e-06, "loss": 0.1169, "step": 119 }, { "epoch": 0.1085972850678733, "grad_norm": 4.836947968479832, "learning_rate": 9.967702636326195e-06, "loss": 0.1575, "step": 120 }, { "epoch": 0.10950226244343891, "grad_norm": 3.6876302740668545, "learning_rate": 9.967162696485407e-06, "loss": 0.1414, "step": 121 }, { "epoch": 0.11040723981900452, "grad_norm": 3.422088264233701, "learning_rate": 9.966618295554127e-06, "loss": 0.1458, "step": 122 }, { "epoch": 0.11131221719457013, "grad_norm": 4.443004430343489, "learning_rate": 9.966069434021294e-06, "loss": 0.1677, "step": 123 }, { "epoch": 0.11221719457013575, "grad_norm": 2.780812020049164, "learning_rate": 9.965516112379847e-06, "loss": 0.1299, "step": 124 }, { "epoch": 0.11312217194570136, "grad_norm": 3.0436026095851347, "learning_rate": 9.964958331126735e-06, "loss": 0.1204, "step": 125 }, { "epoch": 0.11402714932126697, "grad_norm": 5.371626306307826, "learning_rate": 9.964396090762909e-06, "loss": 0.1768, "step": 126 }, { "epoch": 0.11493212669683257, "grad_norm": 3.3765821527833015, "learning_rate": 9.963829391793327e-06, "loss": 0.1436, "step": 127 }, { "epoch": 0.1158371040723982, "grad_norm": 2.9069381429971903, "learning_rate": 9.96325823472695e-06, "loss": 0.1085, "step": 128 }, { "epoch": 0.1167420814479638, "grad_norm": 2.751697227477261, "learning_rate": 9.962682620076745e-06, "loss": 0.1253, "step": 129 }, { "epoch": 0.11764705882352941, "grad_norm": 5.007423383553132, "learning_rate": 9.96210254835968e-06, "loss": 0.2303, "step": 130 }, { "epoch": 0.11855203619909502, "grad_norm": 2.950216943940045, "learning_rate": 9.961518020096726e-06, "loss": 0.1021, "step": 131 }, { "epoch": 0.11945701357466064, "grad_norm": 5.9474651498437066, "learning_rate": 9.96092903581286e-06, "loss": 0.256, "step": 132 }, { "epoch": 0.12036199095022625, "grad_norm": 2.5136060889350884, "learning_rate": 9.960335596037057e-06, "loss": 0.1285, "step": 133 }, { "epoch": 0.12126696832579185, "grad_norm": 4.810184581070253, "learning_rate": 9.959737701302294e-06, "loss": 0.2322, "step": 134 }, { "epoch": 0.12217194570135746, "grad_norm": 3.7501271502184106, "learning_rate": 9.959135352145552e-06, "loss": 0.2301, "step": 135 }, { "epoch": 0.12307692307692308, "grad_norm": 3.302485296231545, "learning_rate": 9.958528549107812e-06, "loss": 0.1528, "step": 136 }, { "epoch": 0.12398190045248869, "grad_norm": 2.8344487643269263, "learning_rate": 9.957917292734048e-06, "loss": 0.1235, "step": 137 }, { "epoch": 0.1248868778280543, "grad_norm": 17.546266647159825, "learning_rate": 9.957301583573244e-06, "loss": 0.2319, "step": 138 }, { "epoch": 0.1257918552036199, "grad_norm": 4.672782860995035, "learning_rate": 9.956681422178379e-06, "loss": 0.1206, "step": 139 }, { "epoch": 0.12669683257918551, "grad_norm": 3.205865644634478, "learning_rate": 9.956056809106426e-06, "loss": 0.1403, "step": 140 }, { "epoch": 0.12760180995475112, "grad_norm": 3.3010644177963266, "learning_rate": 9.955427744918367e-06, "loss": 0.132, "step": 141 }, { "epoch": 0.12850678733031673, "grad_norm": 3.617839206370422, "learning_rate": 9.954794230179167e-06, "loss": 0.1685, "step": 142 }, { "epoch": 0.12941176470588237, "grad_norm": 3.484040382820782, "learning_rate": 9.954156265457801e-06, "loss": 0.1554, "step": 143 }, { "epoch": 0.13031674208144797, "grad_norm": 4.848498443706656, "learning_rate": 9.953513851327236e-06, "loss": 0.1886, "step": 144 }, { "epoch": 0.13122171945701358, "grad_norm": 3.4925723606282095, "learning_rate": 9.952866988364431e-06, "loss": 0.1772, "step": 145 }, { "epoch": 0.1321266968325792, "grad_norm": 2.0684345916514086, "learning_rate": 9.95221567715035e-06, "loss": 0.0889, "step": 146 }, { "epoch": 0.1330316742081448, "grad_norm": 4.78594384499052, "learning_rate": 9.951559918269939e-06, "loss": 0.244, "step": 147 }, { "epoch": 0.1339366515837104, "grad_norm": 3.5074658276597486, "learning_rate": 9.950899712312149e-06, "loss": 0.1796, "step": 148 }, { "epoch": 0.134841628959276, "grad_norm": 3.215184062083378, "learning_rate": 9.950235059869926e-06, "loss": 0.125, "step": 149 }, { "epoch": 0.13574660633484162, "grad_norm": 3.775248876255966, "learning_rate": 9.9495659615402e-06, "loss": 0.1492, "step": 150 }, { "epoch": 0.13665158371040723, "grad_norm": 4.0782705500889636, "learning_rate": 9.9488924179239e-06, "loss": 0.2163, "step": 151 }, { "epoch": 0.13755656108597286, "grad_norm": 3.0102987280837383, "learning_rate": 9.948214429625948e-06, "loss": 0.1434, "step": 152 }, { "epoch": 0.13846153846153847, "grad_norm": 3.8164382586934424, "learning_rate": 9.947531997255256e-06, "loss": 0.183, "step": 153 }, { "epoch": 0.13936651583710408, "grad_norm": 4.368245898723328, "learning_rate": 9.946845121424728e-06, "loss": 0.126, "step": 154 }, { "epoch": 0.14027149321266968, "grad_norm": 17.915033921374775, "learning_rate": 9.946153802751257e-06, "loss": 0.2337, "step": 155 }, { "epoch": 0.1411764705882353, "grad_norm": 3.4233775392196693, "learning_rate": 9.945458041855732e-06, "loss": 0.1593, "step": 156 }, { "epoch": 0.1420814479638009, "grad_norm": 3.125349450094617, "learning_rate": 9.94475783936302e-06, "loss": 0.1798, "step": 157 }, { "epoch": 0.1429864253393665, "grad_norm": 2.804228992378659, "learning_rate": 9.94405319590199e-06, "loss": 0.1457, "step": 158 }, { "epoch": 0.14389140271493212, "grad_norm": 2.941767068288394, "learning_rate": 9.943344112105494e-06, "loss": 0.1463, "step": 159 }, { "epoch": 0.14479638009049775, "grad_norm": 3.0094648271857176, "learning_rate": 9.942630588610368e-06, "loss": 0.1383, "step": 160 }, { "epoch": 0.14570135746606336, "grad_norm": 3.9930991190537033, "learning_rate": 9.941912626057442e-06, "loss": 0.1944, "step": 161 }, { "epoch": 0.14660633484162897, "grad_norm": 3.140142598330223, "learning_rate": 9.94119022509153e-06, "loss": 0.1471, "step": 162 }, { "epoch": 0.14751131221719457, "grad_norm": 2.8172771396727296, "learning_rate": 9.94046338636143e-06, "loss": 0.1449, "step": 163 }, { "epoch": 0.14841628959276018, "grad_norm": 5.0585584023590044, "learning_rate": 9.93973211051993e-06, "loss": 0.1816, "step": 164 }, { "epoch": 0.1493212669683258, "grad_norm": 4.473187710777589, "learning_rate": 9.938996398223802e-06, "loss": 0.2101, "step": 165 }, { "epoch": 0.1502262443438914, "grad_norm": 4.7359187886767655, "learning_rate": 9.938256250133797e-06, "loss": 0.1889, "step": 166 }, { "epoch": 0.151131221719457, "grad_norm": 3.2784821794851076, "learning_rate": 9.937511666914658e-06, "loss": 0.1411, "step": 167 }, { "epoch": 0.1520361990950226, "grad_norm": 3.886912636321786, "learning_rate": 9.936762649235105e-06, "loss": 0.1303, "step": 168 }, { "epoch": 0.15294117647058825, "grad_norm": 2.8309389695309872, "learning_rate": 9.936009197767847e-06, "loss": 0.1417, "step": 169 }, { "epoch": 0.15384615384615385, "grad_norm": 3.364620976715102, "learning_rate": 9.935251313189564e-06, "loss": 0.1421, "step": 170 }, { "epoch": 0.15475113122171946, "grad_norm": 3.5520143517751577, "learning_rate": 9.934488996180933e-06, "loss": 0.1567, "step": 171 }, { "epoch": 0.15565610859728507, "grad_norm": 2.629835110710642, "learning_rate": 9.933722247426597e-06, "loss": 0.1037, "step": 172 }, { "epoch": 0.15656108597285068, "grad_norm": 4.58641097949774, "learning_rate": 9.932951067615188e-06, "loss": 0.1566, "step": 173 }, { "epoch": 0.15746606334841629, "grad_norm": 2.966424716797116, "learning_rate": 9.932175457439318e-06, "loss": 0.1506, "step": 174 }, { "epoch": 0.1583710407239819, "grad_norm": 3.092917977494139, "learning_rate": 9.931395417595568e-06, "loss": 0.1419, "step": 175 }, { "epoch": 0.1592760180995475, "grad_norm": 3.0163912289750003, "learning_rate": 9.930610948784513e-06, "loss": 0.1839, "step": 176 }, { "epoch": 0.16018099547511314, "grad_norm": 4.299427241197647, "learning_rate": 9.929822051710692e-06, "loss": 0.1573, "step": 177 }, { "epoch": 0.16108597285067874, "grad_norm": 4.874134042397829, "learning_rate": 9.929028727082629e-06, "loss": 0.2599, "step": 178 }, { "epoch": 0.16199095022624435, "grad_norm": 3.947396085358746, "learning_rate": 9.92823097561282e-06, "loss": 0.1609, "step": 179 }, { "epoch": 0.16289592760180996, "grad_norm": 2.9539558390997636, "learning_rate": 9.927428798017738e-06, "loss": 0.1312, "step": 180 }, { "epoch": 0.16380090497737557, "grad_norm": 4.8598920212602055, "learning_rate": 9.926622195017836e-06, "loss": 0.1902, "step": 181 }, { "epoch": 0.16470588235294117, "grad_norm": 2.8420224473786893, "learning_rate": 9.925811167337533e-06, "loss": 0.139, "step": 182 }, { "epoch": 0.16561085972850678, "grad_norm": 2.2252010707147822, "learning_rate": 9.92499571570523e-06, "loss": 0.1092, "step": 183 }, { "epoch": 0.1665158371040724, "grad_norm": 12.853005239609908, "learning_rate": 9.924175840853294e-06, "loss": 0.3702, "step": 184 }, { "epoch": 0.167420814479638, "grad_norm": 4.064881485878235, "learning_rate": 9.92335154351807e-06, "loss": 0.1636, "step": 185 }, { "epoch": 0.16832579185520363, "grad_norm": 2.7677255561910146, "learning_rate": 9.922522824439874e-06, "loss": 0.1421, "step": 186 }, { "epoch": 0.16923076923076924, "grad_norm": 4.124502410494096, "learning_rate": 9.921689684362989e-06, "loss": 0.1387, "step": 187 }, { "epoch": 0.17013574660633485, "grad_norm": 3.0961960517304177, "learning_rate": 9.920852124035675e-06, "loss": 0.1754, "step": 188 }, { "epoch": 0.17104072398190046, "grad_norm": 2.9156121828313553, "learning_rate": 9.920010144210157e-06, "loss": 0.1285, "step": 189 }, { "epoch": 0.17194570135746606, "grad_norm": 5.375025739338563, "learning_rate": 9.919163745642633e-06, "loss": 0.191, "step": 190 }, { "epoch": 0.17285067873303167, "grad_norm": 3.4769332398936297, "learning_rate": 9.918312929093264e-06, "loss": 0.1885, "step": 191 }, { "epoch": 0.17375565610859728, "grad_norm": 5.6444431133143915, "learning_rate": 9.917457695326185e-06, "loss": 0.1615, "step": 192 }, { "epoch": 0.17466063348416289, "grad_norm": 3.1941797519579858, "learning_rate": 9.916598045109494e-06, "loss": 0.1498, "step": 193 }, { "epoch": 0.1755656108597285, "grad_norm": 3.304330844660922, "learning_rate": 9.915733979215259e-06, "loss": 0.1434, "step": 194 }, { "epoch": 0.17647058823529413, "grad_norm": 4.660649797781891, "learning_rate": 9.91486549841951e-06, "loss": 0.1963, "step": 195 }, { "epoch": 0.17737556561085974, "grad_norm": 2.9660121995140085, "learning_rate": 9.913992603502244e-06, "loss": 0.1516, "step": 196 }, { "epoch": 0.17828054298642534, "grad_norm": 3.8941832439218933, "learning_rate": 9.913115295247423e-06, "loss": 0.1494, "step": 197 }, { "epoch": 0.17918552036199095, "grad_norm": 3.424366486005219, "learning_rate": 9.912233574442971e-06, "loss": 0.1607, "step": 198 }, { "epoch": 0.18009049773755656, "grad_norm": 3.938489455882468, "learning_rate": 9.911347441880776e-06, "loss": 0.2246, "step": 199 }, { "epoch": 0.18099547511312217, "grad_norm": 2.5539992074558504, "learning_rate": 9.91045689835669e-06, "loss": 0.1167, "step": 200 }, { "epoch": 0.18190045248868777, "grad_norm": 3.5711225945333345, "learning_rate": 9.909561944670526e-06, "loss": 0.2188, "step": 201 }, { "epoch": 0.18280542986425338, "grad_norm": 7.03783349682091, "learning_rate": 9.908662581626049e-06, "loss": 0.186, "step": 202 }, { "epoch": 0.18371040723981902, "grad_norm": 3.0173929684347103, "learning_rate": 9.907758810031001e-06, "loss": 0.1338, "step": 203 }, { "epoch": 0.18461538461538463, "grad_norm": 3.225226643743285, "learning_rate": 9.906850630697068e-06, "loss": 0.1459, "step": 204 }, { "epoch": 0.18552036199095023, "grad_norm": 2.8867061901293707, "learning_rate": 9.905938044439904e-06, "loss": 0.1401, "step": 205 }, { "epoch": 0.18642533936651584, "grad_norm": 2.8362554693845325, "learning_rate": 9.905021052079116e-06, "loss": 0.1519, "step": 206 }, { "epoch": 0.18733031674208145, "grad_norm": 2.704556280547443, "learning_rate": 9.904099654438271e-06, "loss": 0.1453, "step": 207 }, { "epoch": 0.18823529411764706, "grad_norm": 3.304004068794862, "learning_rate": 9.903173852344889e-06, "loss": 0.1547, "step": 208 }, { "epoch": 0.18914027149321266, "grad_norm": 4.19516158385984, "learning_rate": 9.902243646630452e-06, "loss": 0.1778, "step": 209 }, { "epoch": 0.19004524886877827, "grad_norm": 2.786805676862999, "learning_rate": 9.901309038130392e-06, "loss": 0.1164, "step": 210 }, { "epoch": 0.19095022624434388, "grad_norm": 3.0419940125135656, "learning_rate": 9.900370027684092e-06, "loss": 0.1454, "step": 211 }, { "epoch": 0.19185520361990951, "grad_norm": 3.9846806293589427, "learning_rate": 9.899426616134898e-06, "loss": 0.1648, "step": 212 }, { "epoch": 0.19276018099547512, "grad_norm": 3.0209054104755557, "learning_rate": 9.898478804330101e-06, "loss": 0.1408, "step": 213 }, { "epoch": 0.19366515837104073, "grad_norm": 3.3097746870467706, "learning_rate": 9.897526593120946e-06, "loss": 0.1496, "step": 214 }, { "epoch": 0.19457013574660634, "grad_norm": 2.9252383721996744, "learning_rate": 9.896569983362632e-06, "loss": 0.1381, "step": 215 }, { "epoch": 0.19547511312217195, "grad_norm": 2.9163189016307323, "learning_rate": 9.895608975914303e-06, "loss": 0.1382, "step": 216 }, { "epoch": 0.19638009049773755, "grad_norm": 3.077385138808738, "learning_rate": 9.894643571639057e-06, "loss": 0.1266, "step": 217 }, { "epoch": 0.19728506787330316, "grad_norm": 3.19872570343343, "learning_rate": 9.89367377140394e-06, "loss": 0.1455, "step": 218 }, { "epoch": 0.19819004524886877, "grad_norm": 2.8539956954251733, "learning_rate": 9.892699576079945e-06, "loss": 0.1372, "step": 219 }, { "epoch": 0.19909502262443438, "grad_norm": 2.4975166882646263, "learning_rate": 9.891720986542011e-06, "loss": 0.1219, "step": 220 }, { "epoch": 0.2, "grad_norm": 4.485313124348238, "learning_rate": 9.890738003669029e-06, "loss": 0.1528, "step": 221 }, { "epoch": 0.20090497737556562, "grad_norm": 3.1679681334691714, "learning_rate": 9.889750628343829e-06, "loss": 0.1398, "step": 222 }, { "epoch": 0.20180995475113123, "grad_norm": 3.485629257235678, "learning_rate": 9.888758861453191e-06, "loss": 0.1747, "step": 223 }, { "epoch": 0.20271493212669683, "grad_norm": 5.25876773360685, "learning_rate": 9.887762703887835e-06, "loss": 0.1648, "step": 224 }, { "epoch": 0.20361990950226244, "grad_norm": 3.6557264243776495, "learning_rate": 9.886762156542428e-06, "loss": 0.1703, "step": 225 }, { "epoch": 0.20452488687782805, "grad_norm": 3.7275599900815535, "learning_rate": 9.885757220315579e-06, "loss": 0.1541, "step": 226 }, { "epoch": 0.20542986425339366, "grad_norm": 3.2687933184005784, "learning_rate": 9.884747896109837e-06, "loss": 0.1851, "step": 227 }, { "epoch": 0.20633484162895926, "grad_norm": 2.882358947460851, "learning_rate": 9.883734184831691e-06, "loss": 0.1301, "step": 228 }, { "epoch": 0.2072398190045249, "grad_norm": 4.372551506969895, "learning_rate": 9.882716087391572e-06, "loss": 0.209, "step": 229 }, { "epoch": 0.2081447963800905, "grad_norm": 4.403166645770634, "learning_rate": 9.881693604703853e-06, "loss": 0.1535, "step": 230 }, { "epoch": 0.20904977375565612, "grad_norm": 3.335935045789655, "learning_rate": 9.880666737686839e-06, "loss": 0.1444, "step": 231 }, { "epoch": 0.20995475113122172, "grad_norm": 3.2063304920092928, "learning_rate": 9.87963548726278e-06, "loss": 0.1359, "step": 232 }, { "epoch": 0.21085972850678733, "grad_norm": 2.9799560529300155, "learning_rate": 9.878599854357854e-06, "loss": 0.1504, "step": 233 }, { "epoch": 0.21176470588235294, "grad_norm": 3.776700928295496, "learning_rate": 9.877559839902185e-06, "loss": 0.1904, "step": 234 }, { "epoch": 0.21266968325791855, "grad_norm": 2.785966785293352, "learning_rate": 9.876515444829822e-06, "loss": 0.1323, "step": 235 }, { "epoch": 0.21357466063348415, "grad_norm": 3.811250382184527, "learning_rate": 9.875466670078756e-06, "loss": 0.1473, "step": 236 }, { "epoch": 0.21447963800904976, "grad_norm": 2.965651719677829, "learning_rate": 9.874413516590912e-06, "loss": 0.1439, "step": 237 }, { "epoch": 0.2153846153846154, "grad_norm": 3.1092880018100795, "learning_rate": 9.873355985312141e-06, "loss": 0.1444, "step": 238 }, { "epoch": 0.216289592760181, "grad_norm": 3.772659513826838, "learning_rate": 9.872294077192229e-06, "loss": 0.209, "step": 239 }, { "epoch": 0.2171945701357466, "grad_norm": 2.9295714531002317, "learning_rate": 9.871227793184893e-06, "loss": 0.1333, "step": 240 }, { "epoch": 0.21809954751131222, "grad_norm": 2.3369742857312894, "learning_rate": 9.87015713424778e-06, "loss": 0.0954, "step": 241 }, { "epoch": 0.21900452488687783, "grad_norm": 3.1573459619538244, "learning_rate": 9.869082101342468e-06, "loss": 0.1438, "step": 242 }, { "epoch": 0.21990950226244343, "grad_norm": 2.8642962185092227, "learning_rate": 9.868002695434461e-06, "loss": 0.1305, "step": 243 }, { "epoch": 0.22081447963800904, "grad_norm": 2.666820556624314, "learning_rate": 9.866918917493193e-06, "loss": 0.1413, "step": 244 }, { "epoch": 0.22171945701357465, "grad_norm": 3.577905631486273, "learning_rate": 9.865830768492019e-06, "loss": 0.2035, "step": 245 }, { "epoch": 0.22262443438914026, "grad_norm": 2.215626113110654, "learning_rate": 9.864738249408227e-06, "loss": 0.1131, "step": 246 }, { "epoch": 0.2235294117647059, "grad_norm": 2.928904142753711, "learning_rate": 9.863641361223025e-06, "loss": 0.1391, "step": 247 }, { "epoch": 0.2244343891402715, "grad_norm": 4.414760795017388, "learning_rate": 9.862540104921545e-06, "loss": 0.1585, "step": 248 }, { "epoch": 0.2253393665158371, "grad_norm": 5.633097738817657, "learning_rate": 9.861434481492846e-06, "loss": 0.1601, "step": 249 }, { "epoch": 0.22624434389140272, "grad_norm": 2.5206453248811944, "learning_rate": 9.860324491929905e-06, "loss": 0.108, "step": 250 }, { "epoch": 0.22714932126696832, "grad_norm": 10.535221643738181, "learning_rate": 9.85921013722962e-06, "loss": 0.1606, "step": 251 }, { "epoch": 0.22805429864253393, "grad_norm": 3.2856257321874858, "learning_rate": 9.858091418392815e-06, "loss": 0.1652, "step": 252 }, { "epoch": 0.22895927601809954, "grad_norm": 2.2864552233287507, "learning_rate": 9.856968336424229e-06, "loss": 0.11, "step": 253 }, { "epoch": 0.22986425339366515, "grad_norm": 2.8019677909437197, "learning_rate": 9.855840892332519e-06, "loss": 0.1373, "step": 254 }, { "epoch": 0.23076923076923078, "grad_norm": 3.1401072754458212, "learning_rate": 9.854709087130261e-06, "loss": 0.1687, "step": 255 }, { "epoch": 0.2316742081447964, "grad_norm": 2.6051491289522057, "learning_rate": 9.85357292183395e-06, "loss": 0.1316, "step": 256 }, { "epoch": 0.232579185520362, "grad_norm": 3.4673530996103152, "learning_rate": 9.852432397463992e-06, "loss": 0.1856, "step": 257 }, { "epoch": 0.2334841628959276, "grad_norm": 2.368871467543675, "learning_rate": 9.85128751504471e-06, "loss": 0.125, "step": 258 }, { "epoch": 0.2343891402714932, "grad_norm": 2.4277326786647824, "learning_rate": 9.850138275604346e-06, "loss": 0.1141, "step": 259 }, { "epoch": 0.23529411764705882, "grad_norm": 3.0280873639448864, "learning_rate": 9.848984680175049e-06, "loss": 0.1558, "step": 260 }, { "epoch": 0.23619909502262443, "grad_norm": 3.986747246347848, "learning_rate": 9.84782672979288e-06, "loss": 0.19, "step": 261 }, { "epoch": 0.23710407239819004, "grad_norm": 3.055601187256445, "learning_rate": 9.846664425497816e-06, "loss": 0.1631, "step": 262 }, { "epoch": 0.23800904977375564, "grad_norm": 3.1000069503760694, "learning_rate": 9.845497768333738e-06, "loss": 0.1436, "step": 263 }, { "epoch": 0.23891402714932128, "grad_norm": 2.903218867324638, "learning_rate": 9.844326759348443e-06, "loss": 0.1584, "step": 264 }, { "epoch": 0.2398190045248869, "grad_norm": 2.994689218922668, "learning_rate": 9.843151399593636e-06, "loss": 0.1274, "step": 265 }, { "epoch": 0.2407239819004525, "grad_norm": 3.2773990072563177, "learning_rate": 9.84197169012492e-06, "loss": 0.1855, "step": 266 }, { "epoch": 0.2416289592760181, "grad_norm": 2.7129531740161696, "learning_rate": 9.840787632001818e-06, "loss": 0.1355, "step": 267 }, { "epoch": 0.2425339366515837, "grad_norm": 2.641020778109807, "learning_rate": 9.839599226287747e-06, "loss": 0.1398, "step": 268 }, { "epoch": 0.24343891402714932, "grad_norm": 3.679076753858812, "learning_rate": 9.838406474050038e-06, "loss": 0.2231, "step": 269 }, { "epoch": 0.24434389140271492, "grad_norm": 3.4597538740877027, "learning_rate": 9.837209376359918e-06, "loss": 0.189, "step": 270 }, { "epoch": 0.24524886877828053, "grad_norm": 4.979338140758549, "learning_rate": 9.836007934292519e-06, "loss": 0.1775, "step": 271 }, { "epoch": 0.24615384615384617, "grad_norm": 3.7721190138815524, "learning_rate": 9.834802148926883e-06, "loss": 0.1915, "step": 272 }, { "epoch": 0.24705882352941178, "grad_norm": 3.2198537448455227, "learning_rate": 9.833592021345938e-06, "loss": 0.19, "step": 273 }, { "epoch": 0.24796380090497738, "grad_norm": 3.9702765130339372, "learning_rate": 9.832377552636522e-06, "loss": 0.166, "step": 274 }, { "epoch": 0.248868778280543, "grad_norm": 4.004175157579648, "learning_rate": 9.831158743889373e-06, "loss": 0.209, "step": 275 }, { "epoch": 0.2497737556561086, "grad_norm": 3.5386128302816897, "learning_rate": 9.829935596199118e-06, "loss": 0.2272, "step": 276 }, { "epoch": 0.2506787330316742, "grad_norm": 3.3692160245913074, "learning_rate": 9.828708110664289e-06, "loss": 0.1655, "step": 277 }, { "epoch": 0.2515837104072398, "grad_norm": 2.659383780357718, "learning_rate": 9.827476288387308e-06, "loss": 0.1174, "step": 278 }, { "epoch": 0.2524886877828054, "grad_norm": 2.97007613631081, "learning_rate": 9.826240130474497e-06, "loss": 0.1513, "step": 279 }, { "epoch": 0.25339366515837103, "grad_norm": 3.360794578223047, "learning_rate": 9.82499963803607e-06, "loss": 0.1917, "step": 280 }, { "epoch": 0.25429864253393664, "grad_norm": 24.890550802546958, "learning_rate": 9.823754812186135e-06, "loss": 0.3229, "step": 281 }, { "epoch": 0.25520361990950224, "grad_norm": 3.109727206035918, "learning_rate": 9.822505654042687e-06, "loss": 0.1441, "step": 282 }, { "epoch": 0.25610859728506785, "grad_norm": 4.69085641612878, "learning_rate": 9.821252164727617e-06, "loss": 0.3019, "step": 283 }, { "epoch": 0.25701357466063346, "grad_norm": 3.167992827718007, "learning_rate": 9.819994345366706e-06, "loss": 0.1574, "step": 284 }, { "epoch": 0.2579185520361991, "grad_norm": 3.309006919882036, "learning_rate": 9.81873219708962e-06, "loss": 0.229, "step": 285 }, { "epoch": 0.25882352941176473, "grad_norm": 3.4558503033352053, "learning_rate": 9.817465721029916e-06, "loss": 0.1742, "step": 286 }, { "epoch": 0.25972850678733034, "grad_norm": 3.397712705656808, "learning_rate": 9.816194918325037e-06, "loss": 0.1687, "step": 287 }, { "epoch": 0.26063348416289595, "grad_norm": 4.002345066756905, "learning_rate": 9.814919790116313e-06, "loss": 0.1939, "step": 288 }, { "epoch": 0.26153846153846155, "grad_norm": 2.842260393382833, "learning_rate": 9.813640337548955e-06, "loss": 0.1592, "step": 289 }, { "epoch": 0.26244343891402716, "grad_norm": 4.019493745308691, "learning_rate": 9.81235656177206e-06, "loss": 0.1571, "step": 290 }, { "epoch": 0.26334841628959277, "grad_norm": 2.6868500406675992, "learning_rate": 9.811068463938613e-06, "loss": 0.116, "step": 291 }, { "epoch": 0.2642533936651584, "grad_norm": 2.9000119480958886, "learning_rate": 9.80977604520547e-06, "loss": 0.162, "step": 292 }, { "epoch": 0.265158371040724, "grad_norm": 3.3313577605700493, "learning_rate": 9.808479306733378e-06, "loss": 0.1146, "step": 293 }, { "epoch": 0.2660633484162896, "grad_norm": 3.975614906428808, "learning_rate": 9.807178249686959e-06, "loss": 0.2299, "step": 294 }, { "epoch": 0.2669683257918552, "grad_norm": 3.1164107026501773, "learning_rate": 9.80587287523471e-06, "loss": 0.1544, "step": 295 }, { "epoch": 0.2678733031674208, "grad_norm": 6.024990021908801, "learning_rate": 9.80456318454901e-06, "loss": 0.1567, "step": 296 }, { "epoch": 0.2687782805429864, "grad_norm": 4.39575437579821, "learning_rate": 9.803249178806118e-06, "loss": 0.2751, "step": 297 }, { "epoch": 0.269683257918552, "grad_norm": 3.538434676677128, "learning_rate": 9.80193085918616e-06, "loss": 0.1539, "step": 298 }, { "epoch": 0.27058823529411763, "grad_norm": 2.8726628186468277, "learning_rate": 9.800608226873143e-06, "loss": 0.1425, "step": 299 }, { "epoch": 0.27149321266968324, "grad_norm": 3.9206692863211905, "learning_rate": 9.79928128305494e-06, "loss": 0.1682, "step": 300 }, { "epoch": 0.27239819004524884, "grad_norm": 2.4932201316239784, "learning_rate": 9.797950028923309e-06, "loss": 0.1056, "step": 301 }, { "epoch": 0.27330316742081445, "grad_norm": 3.0232305837299287, "learning_rate": 9.796614465673864e-06, "loss": 0.1383, "step": 302 }, { "epoch": 0.2742081447963801, "grad_norm": 5.146756809859809, "learning_rate": 9.7952745945061e-06, "loss": 0.1567, "step": 303 }, { "epoch": 0.2751131221719457, "grad_norm": 3.246930957918113, "learning_rate": 9.793930416623377e-06, "loss": 0.186, "step": 304 }, { "epoch": 0.27601809954751133, "grad_norm": 2.7176676662526544, "learning_rate": 9.792581933232924e-06, "loss": 0.1166, "step": 305 }, { "epoch": 0.27692307692307694, "grad_norm": 2.5893210629751238, "learning_rate": 9.791229145545832e-06, "loss": 0.1332, "step": 306 }, { "epoch": 0.27782805429864255, "grad_norm": 4.585530689246288, "learning_rate": 9.789872054777066e-06, "loss": 0.2304, "step": 307 }, { "epoch": 0.27873303167420815, "grad_norm": 2.4869910943578057, "learning_rate": 9.78851066214545e-06, "loss": 0.1257, "step": 308 }, { "epoch": 0.27963800904977376, "grad_norm": 2.5841122972650914, "learning_rate": 9.787144968873673e-06, "loss": 0.1621, "step": 309 }, { "epoch": 0.28054298642533937, "grad_norm": 2.971869095943368, "learning_rate": 9.78577497618829e-06, "loss": 0.1619, "step": 310 }, { "epoch": 0.281447963800905, "grad_norm": 3.341834337339794, "learning_rate": 9.784400685319708e-06, "loss": 0.1386, "step": 311 }, { "epoch": 0.2823529411764706, "grad_norm": 2.9706249275569228, "learning_rate": 9.783022097502204e-06, "loss": 0.1279, "step": 312 }, { "epoch": 0.2832579185520362, "grad_norm": 3.1735436166742232, "learning_rate": 9.78163921397391e-06, "loss": 0.1494, "step": 313 }, { "epoch": 0.2841628959276018, "grad_norm": 2.443854994693101, "learning_rate": 9.780252035976815e-06, "loss": 0.1092, "step": 314 }, { "epoch": 0.2850678733031674, "grad_norm": 2.422678686639888, "learning_rate": 9.778860564756769e-06, "loss": 0.1056, "step": 315 }, { "epoch": 0.285972850678733, "grad_norm": 3.0261208822233394, "learning_rate": 9.777464801563474e-06, "loss": 0.1382, "step": 316 }, { "epoch": 0.2868778280542986, "grad_norm": 2.697233583903949, "learning_rate": 9.776064747650484e-06, "loss": 0.1524, "step": 317 }, { "epoch": 0.28778280542986423, "grad_norm": 3.819784685592441, "learning_rate": 9.774660404275218e-06, "loss": 0.179, "step": 318 }, { "epoch": 0.28868778280542984, "grad_norm": 2.9279529681287735, "learning_rate": 9.773251772698933e-06, "loss": 0.1449, "step": 319 }, { "epoch": 0.2895927601809955, "grad_norm": 8.537125701210558, "learning_rate": 9.771838854186748e-06, "loss": 0.3746, "step": 320 }, { "epoch": 0.2904977375565611, "grad_norm": 4.0348922931380296, "learning_rate": 9.770421650007627e-06, "loss": 0.1599, "step": 321 }, { "epoch": 0.2914027149321267, "grad_norm": 3.979172301421955, "learning_rate": 9.769000161434384e-06, "loss": 0.218, "step": 322 }, { "epoch": 0.2923076923076923, "grad_norm": 3.9017772378986013, "learning_rate": 9.767574389743683e-06, "loss": 0.2768, "step": 323 }, { "epoch": 0.29321266968325793, "grad_norm": 3.198512573174419, "learning_rate": 9.76614433621603e-06, "loss": 0.179, "step": 324 }, { "epoch": 0.29411764705882354, "grad_norm": 4.390636869284099, "learning_rate": 9.764710002135784e-06, "loss": 0.1739, "step": 325 }, { "epoch": 0.29502262443438915, "grad_norm": 3.9120501759783965, "learning_rate": 9.76327138879114e-06, "loss": 0.2118, "step": 326 }, { "epoch": 0.29592760180995475, "grad_norm": 3.814113658986286, "learning_rate": 9.761828497474143e-06, "loss": 0.2356, "step": 327 }, { "epoch": 0.29683257918552036, "grad_norm": 3.234733859297368, "learning_rate": 9.760381329480675e-06, "loss": 0.1393, "step": 328 }, { "epoch": 0.29773755656108597, "grad_norm": 2.738507531547146, "learning_rate": 9.758929886110462e-06, "loss": 0.1703, "step": 329 }, { "epoch": 0.2986425339366516, "grad_norm": 2.5255959410299464, "learning_rate": 9.757474168667072e-06, "loss": 0.1333, "step": 330 }, { "epoch": 0.2995475113122172, "grad_norm": 2.840634817607307, "learning_rate": 9.756014178457905e-06, "loss": 0.1386, "step": 331 }, { "epoch": 0.3004524886877828, "grad_norm": 2.2840412789163684, "learning_rate": 9.754549916794203e-06, "loss": 0.0962, "step": 332 }, { "epoch": 0.3013574660633484, "grad_norm": 17.6414093953133, "learning_rate": 9.753081384991045e-06, "loss": 0.3221, "step": 333 }, { "epoch": 0.302262443438914, "grad_norm": 1.924446983259794, "learning_rate": 9.751608584367344e-06, "loss": 0.1008, "step": 334 }, { "epoch": 0.3031674208144796, "grad_norm": 3.0805918809915047, "learning_rate": 9.750131516245844e-06, "loss": 0.1705, "step": 335 }, { "epoch": 0.3040723981900452, "grad_norm": 6.067978341950807, "learning_rate": 9.748650181953126e-06, "loss": 0.2051, "step": 336 }, { "epoch": 0.3049773755656109, "grad_norm": 4.2543515361061015, "learning_rate": 9.747164582819598e-06, "loss": 0.1666, "step": 337 }, { "epoch": 0.3058823529411765, "grad_norm": 3.5827593459771716, "learning_rate": 9.745674720179507e-06, "loss": 0.1704, "step": 338 }, { "epoch": 0.3067873303167421, "grad_norm": 2.374612419468961, "learning_rate": 9.744180595370918e-06, "loss": 0.1286, "step": 339 }, { "epoch": 0.3076923076923077, "grad_norm": 3.363836040799564, "learning_rate": 9.742682209735727e-06, "loss": 0.1711, "step": 340 }, { "epoch": 0.3085972850678733, "grad_norm": 3.0249621097197696, "learning_rate": 9.741179564619666e-06, "loss": 0.157, "step": 341 }, { "epoch": 0.3095022624434389, "grad_norm": 3.31500014295259, "learning_rate": 9.73967266137228e-06, "loss": 0.1456, "step": 342 }, { "epoch": 0.31040723981900453, "grad_norm": 2.558860128885958, "learning_rate": 9.738161501346944e-06, "loss": 0.159, "step": 343 }, { "epoch": 0.31131221719457014, "grad_norm": 3.3864336810903013, "learning_rate": 9.736646085900859e-06, "loss": 0.1644, "step": 344 }, { "epoch": 0.31221719457013575, "grad_norm": 2.5935126349954807, "learning_rate": 9.73512641639504e-06, "loss": 0.1473, "step": 345 }, { "epoch": 0.31312217194570136, "grad_norm": 2.7031707269539145, "learning_rate": 9.733602494194329e-06, "loss": 0.1389, "step": 346 }, { "epoch": 0.31402714932126696, "grad_norm": 2.910571083652294, "learning_rate": 9.732074320667387e-06, "loss": 0.1657, "step": 347 }, { "epoch": 0.31493212669683257, "grad_norm": 3.146164596508397, "learning_rate": 9.73054189718669e-06, "loss": 0.153, "step": 348 }, { "epoch": 0.3158371040723982, "grad_norm": 6.5104157717358415, "learning_rate": 9.729005225128533e-06, "loss": 0.1716, "step": 349 }, { "epoch": 0.3167420814479638, "grad_norm": 2.268972980203384, "learning_rate": 9.72746430587303e-06, "loss": 0.1273, "step": 350 }, { "epoch": 0.3176470588235294, "grad_norm": 2.8987375801254185, "learning_rate": 9.7259191408041e-06, "loss": 0.1667, "step": 351 }, { "epoch": 0.318552036199095, "grad_norm": 4.424360603578729, "learning_rate": 9.724369731309481e-06, "loss": 0.3181, "step": 352 }, { "epoch": 0.3194570135746606, "grad_norm": 3.10210317269789, "learning_rate": 9.72281607878073e-06, "loss": 0.1866, "step": 353 }, { "epoch": 0.32036199095022627, "grad_norm": 3.821973750063843, "learning_rate": 9.721258184613204e-06, "loss": 0.2226, "step": 354 }, { "epoch": 0.3212669683257919, "grad_norm": 2.3106824328263884, "learning_rate": 9.719696050206072e-06, "loss": 0.1353, "step": 355 }, { "epoch": 0.3221719457013575, "grad_norm": 3.286446770099135, "learning_rate": 9.718129676962315e-06, "loss": 0.2042, "step": 356 }, { "epoch": 0.3230769230769231, "grad_norm": 3.14474914864614, "learning_rate": 9.716559066288716e-06, "loss": 0.1903, "step": 357 }, { "epoch": 0.3239819004524887, "grad_norm": 3.761315841787902, "learning_rate": 9.714984219595869e-06, "loss": 0.1782, "step": 358 }, { "epoch": 0.3248868778280543, "grad_norm": 4.020885509348991, "learning_rate": 9.713405138298167e-06, "loss": 0.2054, "step": 359 }, { "epoch": 0.3257918552036199, "grad_norm": 2.464048200689045, "learning_rate": 9.711821823813812e-06, "loss": 0.1396, "step": 360 }, { "epoch": 0.3266968325791855, "grad_norm": 4.336858839989107, "learning_rate": 9.710234277564803e-06, "loss": 0.1975, "step": 361 }, { "epoch": 0.32760180995475113, "grad_norm": 2.8055124315588276, "learning_rate": 9.708642500976939e-06, "loss": 0.1343, "step": 362 }, { "epoch": 0.32850678733031674, "grad_norm": 3.2250754807260824, "learning_rate": 9.707046495479827e-06, "loss": 0.154, "step": 363 }, { "epoch": 0.32941176470588235, "grad_norm": 3.212570802969483, "learning_rate": 9.705446262506858e-06, "loss": 0.1991, "step": 364 }, { "epoch": 0.33031674208144796, "grad_norm": 3.862875288124874, "learning_rate": 9.703841803495234e-06, "loss": 0.2112, "step": 365 }, { "epoch": 0.33122171945701356, "grad_norm": 2.9499977610736257, "learning_rate": 9.702233119885944e-06, "loss": 0.1913, "step": 366 }, { "epoch": 0.33212669683257917, "grad_norm": 3.0998099769966743, "learning_rate": 9.70062021312377e-06, "loss": 0.1938, "step": 367 }, { "epoch": 0.3330316742081448, "grad_norm": 2.4638018805859896, "learning_rate": 9.699003084657295e-06, "loss": 0.1254, "step": 368 }, { "epoch": 0.3339366515837104, "grad_norm": 2.9384892781981526, "learning_rate": 9.697381735938887e-06, "loss": 0.1688, "step": 369 }, { "epoch": 0.334841628959276, "grad_norm": 2.496915541376654, "learning_rate": 9.695756168424703e-06, "loss": 0.1103, "step": 370 }, { "epoch": 0.3357466063348416, "grad_norm": 3.2451342137903176, "learning_rate": 9.694126383574696e-06, "loss": 0.1656, "step": 371 }, { "epoch": 0.33665158371040727, "grad_norm": 2.9963587445601485, "learning_rate": 9.692492382852601e-06, "loss": 0.1579, "step": 372 }, { "epoch": 0.3375565610859729, "grad_norm": 2.8252512600290065, "learning_rate": 9.69085416772594e-06, "loss": 0.1384, "step": 373 }, { "epoch": 0.3384615384615385, "grad_norm": 6.741979136335651, "learning_rate": 9.689211739666023e-06, "loss": 0.2558, "step": 374 }, { "epoch": 0.3393665158371041, "grad_norm": 2.9132975059729356, "learning_rate": 9.68756510014794e-06, "loss": 0.138, "step": 375 }, { "epoch": 0.3402714932126697, "grad_norm": 4.716109001846332, "learning_rate": 9.685914250650566e-06, "loss": 0.1365, "step": 376 }, { "epoch": 0.3411764705882353, "grad_norm": 2.4198868944670617, "learning_rate": 9.684259192656554e-06, "loss": 0.1146, "step": 377 }, { "epoch": 0.3420814479638009, "grad_norm": 2.4565426068688185, "learning_rate": 9.68259992765234e-06, "loss": 0.1507, "step": 378 }, { "epoch": 0.3429864253393665, "grad_norm": 2.839653848354654, "learning_rate": 9.68093645712814e-06, "loss": 0.1659, "step": 379 }, { "epoch": 0.3438914027149321, "grad_norm": 3.347557256372121, "learning_rate": 9.67926878257794e-06, "loss": 0.2309, "step": 380 }, { "epoch": 0.34479638009049773, "grad_norm": 3.293960097392764, "learning_rate": 9.677596905499508e-06, "loss": 0.1515, "step": 381 }, { "epoch": 0.34570135746606334, "grad_norm": 2.657237823468197, "learning_rate": 9.675920827394383e-06, "loss": 0.1579, "step": 382 }, { "epoch": 0.34660633484162895, "grad_norm": 2.5548088427083435, "learning_rate": 9.67424054976788e-06, "loss": 0.1522, "step": 383 }, { "epoch": 0.34751131221719456, "grad_norm": 2.802440751161209, "learning_rate": 9.672556074129085e-06, "loss": 0.1534, "step": 384 }, { "epoch": 0.34841628959276016, "grad_norm": 5.464483158232452, "learning_rate": 9.67086740199085e-06, "loss": 0.1759, "step": 385 }, { "epoch": 0.34932126696832577, "grad_norm": 3.3562695677665735, "learning_rate": 9.6691745348698e-06, "loss": 0.1283, "step": 386 }, { "epoch": 0.3502262443438914, "grad_norm": 2.474975229232364, "learning_rate": 9.667477474286328e-06, "loss": 0.1548, "step": 387 }, { "epoch": 0.351131221719457, "grad_norm": 4.92920343283297, "learning_rate": 9.665776221764592e-06, "loss": 0.211, "step": 388 }, { "epoch": 0.35203619909502265, "grad_norm": 2.7777792830752013, "learning_rate": 9.664070778832513e-06, "loss": 0.1544, "step": 389 }, { "epoch": 0.35294117647058826, "grad_norm": 2.4763842799586016, "learning_rate": 9.66236114702178e-06, "loss": 0.1397, "step": 390 }, { "epoch": 0.35384615384615387, "grad_norm": 2.185023850272238, "learning_rate": 9.66064732786784e-06, "loss": 0.1055, "step": 391 }, { "epoch": 0.3547511312217195, "grad_norm": 3.1900116065606308, "learning_rate": 9.658929322909903e-06, "loss": 0.1481, "step": 392 }, { "epoch": 0.3556561085972851, "grad_norm": 3.158573293837953, "learning_rate": 9.657207133690936e-06, "loss": 0.1691, "step": 393 }, { "epoch": 0.3565610859728507, "grad_norm": 4.564728272889586, "learning_rate": 9.655480761757664e-06, "loss": 0.1766, "step": 394 }, { "epoch": 0.3574660633484163, "grad_norm": 2.871260372971337, "learning_rate": 9.653750208660577e-06, "loss": 0.1439, "step": 395 }, { "epoch": 0.3583710407239819, "grad_norm": 3.4566531359297326, "learning_rate": 9.652015475953904e-06, "loss": 0.1619, "step": 396 }, { "epoch": 0.3592760180995475, "grad_norm": 2.799944654115598, "learning_rate": 9.650276565195644e-06, "loss": 0.1273, "step": 397 }, { "epoch": 0.3601809954751131, "grad_norm": 4.012991838845053, "learning_rate": 9.648533477947539e-06, "loss": 0.1678, "step": 398 }, { "epoch": 0.3610859728506787, "grad_norm": 2.68886242007211, "learning_rate": 9.646786215775083e-06, "loss": 0.1262, "step": 399 }, { "epoch": 0.36199095022624433, "grad_norm": 3.744011902172636, "learning_rate": 9.645034780247521e-06, "loss": 0.1735, "step": 400 }, { "epoch": 0.36289592760180994, "grad_norm": 2.4185572874587327, "learning_rate": 9.64327917293785e-06, "loss": 0.126, "step": 401 }, { "epoch": 0.36380090497737555, "grad_norm": 3.235033470393472, "learning_rate": 9.641519395422806e-06, "loss": 0.1821, "step": 402 }, { "epoch": 0.36470588235294116, "grad_norm": 2.51102368687468, "learning_rate": 9.639755449282874e-06, "loss": 0.1306, "step": 403 }, { "epoch": 0.36561085972850677, "grad_norm": 3.267906342111658, "learning_rate": 9.637987336102285e-06, "loss": 0.1783, "step": 404 }, { "epoch": 0.3665158371040724, "grad_norm": 3.6066664353981386, "learning_rate": 9.636215057469009e-06, "loss": 0.1757, "step": 405 }, { "epoch": 0.36742081447963804, "grad_norm": 2.8298622304443093, "learning_rate": 9.634438614974759e-06, "loss": 0.1466, "step": 406 }, { "epoch": 0.36832579185520364, "grad_norm": 3.4643657841161617, "learning_rate": 9.632658010214985e-06, "loss": 0.1763, "step": 407 }, { "epoch": 0.36923076923076925, "grad_norm": 15.149853746939558, "learning_rate": 9.630873244788884e-06, "loss": 0.3415, "step": 408 }, { "epoch": 0.37013574660633486, "grad_norm": 2.3617929137103677, "learning_rate": 9.629084320299375e-06, "loss": 0.1239, "step": 409 }, { "epoch": 0.37104072398190047, "grad_norm": 3.3940837165451274, "learning_rate": 9.627291238353127e-06, "loss": 0.163, "step": 410 }, { "epoch": 0.3719457013574661, "grad_norm": 3.211445126220133, "learning_rate": 9.625494000560534e-06, "loss": 0.1777, "step": 411 }, { "epoch": 0.3728506787330317, "grad_norm": 2.303470058563585, "learning_rate": 9.623692608535722e-06, "loss": 0.1148, "step": 412 }, { "epoch": 0.3737556561085973, "grad_norm": 3.1329521177961097, "learning_rate": 9.621887063896555e-06, "loss": 0.1735, "step": 413 }, { "epoch": 0.3746606334841629, "grad_norm": 2.369132676404496, "learning_rate": 9.620077368264621e-06, "loss": 0.1528, "step": 414 }, { "epoch": 0.3755656108597285, "grad_norm": 5.107379275650216, "learning_rate": 9.618263523265238e-06, "loss": 0.2303, "step": 415 }, { "epoch": 0.3764705882352941, "grad_norm": 2.094892731993931, "learning_rate": 9.616445530527448e-06, "loss": 0.101, "step": 416 }, { "epoch": 0.3773755656108597, "grad_norm": 2.5312998066641055, "learning_rate": 9.61462339168402e-06, "loss": 0.1363, "step": 417 }, { "epoch": 0.3782805429864253, "grad_norm": 2.8922781816612786, "learning_rate": 9.612797108371449e-06, "loss": 0.1696, "step": 418 }, { "epoch": 0.37918552036199094, "grad_norm": 2.617641291041387, "learning_rate": 9.61096668222995e-06, "loss": 0.1422, "step": 419 }, { "epoch": 0.38009049773755654, "grad_norm": 2.25830989225017, "learning_rate": 9.609132114903458e-06, "loss": 0.126, "step": 420 }, { "epoch": 0.38099547511312215, "grad_norm": 4.1111443925809885, "learning_rate": 9.607293408039628e-06, "loss": 0.1304, "step": 421 }, { "epoch": 0.38190045248868776, "grad_norm": 2.5036397504772467, "learning_rate": 9.605450563289834e-06, "loss": 0.1483, "step": 422 }, { "epoch": 0.38280542986425337, "grad_norm": 3.382814441465967, "learning_rate": 9.603603582309161e-06, "loss": 0.1865, "step": 423 }, { "epoch": 0.38371040723981903, "grad_norm": 4.850170001864585, "learning_rate": 9.60175246675642e-06, "loss": 0.245, "step": 424 }, { "epoch": 0.38461538461538464, "grad_norm": 3.005405982522235, "learning_rate": 9.599897218294122e-06, "loss": 0.1485, "step": 425 }, { "epoch": 0.38552036199095024, "grad_norm": 3.897465899865831, "learning_rate": 9.5980378385885e-06, "loss": 0.23, "step": 426 }, { "epoch": 0.38642533936651585, "grad_norm": 2.803679913864033, "learning_rate": 9.596174329309492e-06, "loss": 0.1483, "step": 427 }, { "epoch": 0.38733031674208146, "grad_norm": 2.4802848714796375, "learning_rate": 9.594306692130744e-06, "loss": 0.1278, "step": 428 }, { "epoch": 0.38823529411764707, "grad_norm": 3.152404184944791, "learning_rate": 9.592434928729617e-06, "loss": 0.1436, "step": 429 }, { "epoch": 0.3891402714932127, "grad_norm": 3.0599861032624416, "learning_rate": 9.590559040787168e-06, "loss": 0.1474, "step": 430 }, { "epoch": 0.3900452488687783, "grad_norm": 5.874854755674749, "learning_rate": 9.588679029988164e-06, "loss": 0.4235, "step": 431 }, { "epoch": 0.3909502262443439, "grad_norm": 3.4560768371244635, "learning_rate": 9.586794898021074e-06, "loss": 0.1961, "step": 432 }, { "epoch": 0.3918552036199095, "grad_norm": 3.403149469992086, "learning_rate": 9.584906646578069e-06, "loss": 0.1492, "step": 433 }, { "epoch": 0.3927601809954751, "grad_norm": 2.9559209636187256, "learning_rate": 9.583014277355017e-06, "loss": 0.1617, "step": 434 }, { "epoch": 0.3936651583710407, "grad_norm": 3.9588361877767806, "learning_rate": 9.581117792051487e-06, "loss": 0.1814, "step": 435 }, { "epoch": 0.3945701357466063, "grad_norm": 2.674670330302031, "learning_rate": 9.579217192370741e-06, "loss": 0.1649, "step": 436 }, { "epoch": 0.39547511312217193, "grad_norm": 3.4294225412466575, "learning_rate": 9.577312480019744e-06, "loss": 0.2331, "step": 437 }, { "epoch": 0.39638009049773754, "grad_norm": 2.1394657229395344, "learning_rate": 9.575403656709147e-06, "loss": 0.0885, "step": 438 }, { "epoch": 0.39728506787330314, "grad_norm": 2.897608035819603, "learning_rate": 9.573490724153292e-06, "loss": 0.1975, "step": 439 }, { "epoch": 0.39819004524886875, "grad_norm": 3.3326831692791083, "learning_rate": 9.57157368407022e-06, "loss": 0.1885, "step": 440 }, { "epoch": 0.3990950226244344, "grad_norm": 4.317556342329746, "learning_rate": 9.569652538181653e-06, "loss": 0.202, "step": 441 }, { "epoch": 0.4, "grad_norm": 4.65507289499758, "learning_rate": 9.567727288213005e-06, "loss": 0.1901, "step": 442 }, { "epoch": 0.40090497737556563, "grad_norm": 2.8730644330677864, "learning_rate": 9.565797935893376e-06, "loss": 0.2157, "step": 443 }, { "epoch": 0.40180995475113124, "grad_norm": 3.524836797386128, "learning_rate": 9.563864482955547e-06, "loss": 0.194, "step": 444 }, { "epoch": 0.40271493212669685, "grad_norm": 2.7982753223949177, "learning_rate": 9.561926931135985e-06, "loss": 0.16, "step": 445 }, { "epoch": 0.40361990950226245, "grad_norm": 3.483577506141529, "learning_rate": 9.559985282174837e-06, "loss": 0.1485, "step": 446 }, { "epoch": 0.40452488687782806, "grad_norm": 3.4511455949453556, "learning_rate": 9.55803953781593e-06, "loss": 0.1618, "step": 447 }, { "epoch": 0.40542986425339367, "grad_norm": 3.026223814036544, "learning_rate": 9.556089699806767e-06, "loss": 0.1439, "step": 448 }, { "epoch": 0.4063348416289593, "grad_norm": 2.8325255401714986, "learning_rate": 9.554135769898534e-06, "loss": 0.1702, "step": 449 }, { "epoch": 0.4072398190045249, "grad_norm": 2.549324902681385, "learning_rate": 9.552177749846083e-06, "loss": 0.1443, "step": 450 }, { "epoch": 0.4081447963800905, "grad_norm": 3.289017347772091, "learning_rate": 9.550215641407947e-06, "loss": 0.1573, "step": 451 }, { "epoch": 0.4090497737556561, "grad_norm": 2.3728337992797313, "learning_rate": 9.548249446346324e-06, "loss": 0.1431, "step": 452 }, { "epoch": 0.4099547511312217, "grad_norm": 3.0816003675898926, "learning_rate": 9.546279166427093e-06, "loss": 0.1743, "step": 453 }, { "epoch": 0.4108597285067873, "grad_norm": 2.8404744580376464, "learning_rate": 9.544304803419787e-06, "loss": 0.1566, "step": 454 }, { "epoch": 0.4117647058823529, "grad_norm": 3.2663219589668193, "learning_rate": 9.542326359097619e-06, "loss": 0.1664, "step": 455 }, { "epoch": 0.41266968325791853, "grad_norm": 3.636230644617084, "learning_rate": 9.540343835237461e-06, "loss": 0.209, "step": 456 }, { "epoch": 0.41357466063348414, "grad_norm": 4.3399436546874615, "learning_rate": 9.53835723361985e-06, "loss": 0.1826, "step": 457 }, { "epoch": 0.4144796380090498, "grad_norm": 2.513282513963796, "learning_rate": 9.536366556028986e-06, "loss": 0.1141, "step": 458 }, { "epoch": 0.4153846153846154, "grad_norm": 2.269475137885105, "learning_rate": 9.534371804252727e-06, "loss": 0.1411, "step": 459 }, { "epoch": 0.416289592760181, "grad_norm": 2.851365205760618, "learning_rate": 9.532372980082598e-06, "loss": 0.1338, "step": 460 }, { "epoch": 0.4171945701357466, "grad_norm": 3.09411739716065, "learning_rate": 9.530370085313769e-06, "loss": 0.1549, "step": 461 }, { "epoch": 0.41809954751131223, "grad_norm": 2.844278345238442, "learning_rate": 9.528363121745076e-06, "loss": 0.1511, "step": 462 }, { "epoch": 0.41900452488687784, "grad_norm": 3.203637884953108, "learning_rate": 9.526352091179007e-06, "loss": 0.1635, "step": 463 }, { "epoch": 0.41990950226244345, "grad_norm": 1.8887010539013787, "learning_rate": 9.524336995421701e-06, "loss": 0.1002, "step": 464 }, { "epoch": 0.42081447963800905, "grad_norm": 2.5838315479164597, "learning_rate": 9.522317836282949e-06, "loss": 0.1345, "step": 465 }, { "epoch": 0.42171945701357466, "grad_norm": 4.882291211214287, "learning_rate": 9.520294615576188e-06, "loss": 0.298, "step": 466 }, { "epoch": 0.42262443438914027, "grad_norm": 2.749704742855777, "learning_rate": 9.518267335118509e-06, "loss": 0.1243, "step": 467 }, { "epoch": 0.4235294117647059, "grad_norm": 8.189875592528267, "learning_rate": 9.516235996730645e-06, "loss": 0.1241, "step": 468 }, { "epoch": 0.4244343891402715, "grad_norm": 3.8210374344054414, "learning_rate": 9.514200602236974e-06, "loss": 0.162, "step": 469 }, { "epoch": 0.4253393665158371, "grad_norm": 2.6968869241892457, "learning_rate": 9.512161153465518e-06, "loss": 0.1219, "step": 470 }, { "epoch": 0.4262443438914027, "grad_norm": 3.005496134734956, "learning_rate": 9.510117652247938e-06, "loss": 0.1424, "step": 471 }, { "epoch": 0.4271493212669683, "grad_norm": 2.0490933501956783, "learning_rate": 9.508070100419538e-06, "loss": 0.1289, "step": 472 }, { "epoch": 0.4280542986425339, "grad_norm": 2.7520306632973317, "learning_rate": 9.506018499819257e-06, "loss": 0.1959, "step": 473 }, { "epoch": 0.4289592760180995, "grad_norm": 2.039030355669391, "learning_rate": 9.503962852289673e-06, "loss": 0.1074, "step": 474 }, { "epoch": 0.4298642533936652, "grad_norm": 2.2279918179498925, "learning_rate": 9.501903159676993e-06, "loss": 0.1371, "step": 475 }, { "epoch": 0.4307692307692308, "grad_norm": 3.460620218632478, "learning_rate": 9.499839423831062e-06, "loss": 0.1622, "step": 476 }, { "epoch": 0.4316742081447964, "grad_norm": 2.128182490224455, "learning_rate": 9.497771646605356e-06, "loss": 0.1409, "step": 477 }, { "epoch": 0.432579185520362, "grad_norm": 2.902069342295019, "learning_rate": 9.495699829856981e-06, "loss": 0.1521, "step": 478 }, { "epoch": 0.4334841628959276, "grad_norm": 9.033877217855405, "learning_rate": 9.493623975446669e-06, "loss": 0.2716, "step": 479 }, { "epoch": 0.4343891402714932, "grad_norm": 3.084384630754352, "learning_rate": 9.491544085238778e-06, "loss": 0.1345, "step": 480 }, { "epoch": 0.43529411764705883, "grad_norm": 6.178642162772755, "learning_rate": 9.489460161101291e-06, "loss": 0.314, "step": 481 }, { "epoch": 0.43619909502262444, "grad_norm": 3.8568443735221787, "learning_rate": 9.487372204905817e-06, "loss": 0.1639, "step": 482 }, { "epoch": 0.43710407239819005, "grad_norm": 3.319973252018457, "learning_rate": 9.485280218527581e-06, "loss": 0.1408, "step": 483 }, { "epoch": 0.43800904977375565, "grad_norm": 2.2654330042075475, "learning_rate": 9.483184203845432e-06, "loss": 0.1122, "step": 484 }, { "epoch": 0.43891402714932126, "grad_norm": 4.489992041296296, "learning_rate": 9.481084162741835e-06, "loss": 0.1917, "step": 485 }, { "epoch": 0.43981900452488687, "grad_norm": 2.7431005009835108, "learning_rate": 9.478980097102872e-06, "loss": 0.142, "step": 486 }, { "epoch": 0.4407239819004525, "grad_norm": 2.756002001135366, "learning_rate": 9.476872008818239e-06, "loss": 0.1377, "step": 487 }, { "epoch": 0.4416289592760181, "grad_norm": 3.0085249875731312, "learning_rate": 9.474759899781244e-06, "loss": 0.1774, "step": 488 }, { "epoch": 0.4425339366515837, "grad_norm": 3.2967052479860315, "learning_rate": 9.472643771888805e-06, "loss": 0.168, "step": 489 }, { "epoch": 0.4434389140271493, "grad_norm": 4.934244643918891, "learning_rate": 9.470523627041452e-06, "loss": 0.2209, "step": 490 }, { "epoch": 0.4443438914027149, "grad_norm": 4.122933594161965, "learning_rate": 9.468399467143327e-06, "loss": 0.2576, "step": 491 }, { "epoch": 0.4452488687782805, "grad_norm": 2.7749583369857933, "learning_rate": 9.466271294102167e-06, "loss": 0.1431, "step": 492 }, { "epoch": 0.4461538461538462, "grad_norm": 2.8582946507673745, "learning_rate": 9.46413910982932e-06, "loss": 0.1574, "step": 493 }, { "epoch": 0.4470588235294118, "grad_norm": 2.88936827951968, "learning_rate": 9.46200291623974e-06, "loss": 0.1607, "step": 494 }, { "epoch": 0.4479638009049774, "grad_norm": 3.000557579952457, "learning_rate": 9.459862715251973e-06, "loss": 0.1726, "step": 495 }, { "epoch": 0.448868778280543, "grad_norm": 3.2791851289387384, "learning_rate": 9.45771850878817e-06, "loss": 0.1548, "step": 496 }, { "epoch": 0.4497737556561086, "grad_norm": 3.2814567115244397, "learning_rate": 9.45557029877408e-06, "loss": 0.1879, "step": 497 }, { "epoch": 0.4506787330316742, "grad_norm": 3.9655427651101025, "learning_rate": 9.453418087139044e-06, "loss": 0.2269, "step": 498 }, { "epoch": 0.4515837104072398, "grad_norm": 2.638537603988149, "learning_rate": 9.451261875815996e-06, "loss": 0.1336, "step": 499 }, { "epoch": 0.45248868778280543, "grad_norm": 2.8885316393485874, "learning_rate": 9.44910166674147e-06, "loss": 0.1412, "step": 500 }, { "epoch": 0.45339366515837104, "grad_norm": 3.4569958353857735, "learning_rate": 9.446937461855582e-06, "loss": 0.1722, "step": 501 }, { "epoch": 0.45429864253393665, "grad_norm": 13.849309518229196, "learning_rate": 9.444769263102042e-06, "loss": 0.2234, "step": 502 }, { "epoch": 0.45520361990950226, "grad_norm": 3.5704367417239125, "learning_rate": 9.442597072428144e-06, "loss": 0.2576, "step": 503 }, { "epoch": 0.45610859728506786, "grad_norm": 2.9372558520652103, "learning_rate": 9.440420891784766e-06, "loss": 0.1541, "step": 504 }, { "epoch": 0.45701357466063347, "grad_norm": 2.2336736280518315, "learning_rate": 9.438240723126376e-06, "loss": 0.1241, "step": 505 }, { "epoch": 0.4579185520361991, "grad_norm": 2.7911181551583453, "learning_rate": 9.436056568411014e-06, "loss": 0.1405, "step": 506 }, { "epoch": 0.4588235294117647, "grad_norm": 2.3151837062905005, "learning_rate": 9.43386842960031e-06, "loss": 0.1246, "step": 507 }, { "epoch": 0.4597285067873303, "grad_norm": 2.3571018219082753, "learning_rate": 9.431676308659466e-06, "loss": 0.1136, "step": 508 }, { "epoch": 0.4606334841628959, "grad_norm": 8.59188472354185, "learning_rate": 9.42948020755726e-06, "loss": 0.2774, "step": 509 }, { "epoch": 0.46153846153846156, "grad_norm": 2.447634559491362, "learning_rate": 9.427280128266049e-06, "loss": 0.1187, "step": 510 }, { "epoch": 0.46244343891402717, "grad_norm": 2.4299793928684914, "learning_rate": 9.425076072761762e-06, "loss": 0.1343, "step": 511 }, { "epoch": 0.4633484162895928, "grad_norm": 3.284743775158164, "learning_rate": 9.422868043023892e-06, "loss": 0.1492, "step": 512 }, { "epoch": 0.4642533936651584, "grad_norm": 3.855700350876412, "learning_rate": 9.42065604103551e-06, "loss": 0.2378, "step": 513 }, { "epoch": 0.465158371040724, "grad_norm": 2.395400771441453, "learning_rate": 9.41844006878325e-06, "loss": 0.1226, "step": 514 }, { "epoch": 0.4660633484162896, "grad_norm": 2.782544226510058, "learning_rate": 9.416220128257317e-06, "loss": 0.1343, "step": 515 }, { "epoch": 0.4669683257918552, "grad_norm": 2.681548196271278, "learning_rate": 9.41399622145147e-06, "loss": 0.134, "step": 516 }, { "epoch": 0.4678733031674208, "grad_norm": 2.8779356349327214, "learning_rate": 9.411768350363042e-06, "loss": 0.1453, "step": 517 }, { "epoch": 0.4687782805429864, "grad_norm": 2.5319965695787614, "learning_rate": 9.40953651699292e-06, "loss": 0.1321, "step": 518 }, { "epoch": 0.46968325791855203, "grad_norm": 2.3440312459206294, "learning_rate": 9.407300723345544e-06, "loss": 0.1257, "step": 519 }, { "epoch": 0.47058823529411764, "grad_norm": 3.4760783258337407, "learning_rate": 9.405060971428924e-06, "loss": 0.1001, "step": 520 }, { "epoch": 0.47149321266968325, "grad_norm": 3.592933822546608, "learning_rate": 9.402817263254615e-06, "loss": 0.1631, "step": 521 }, { "epoch": 0.47239819004524886, "grad_norm": 3.676531006392057, "learning_rate": 9.400569600837728e-06, "loss": 0.154, "step": 522 }, { "epoch": 0.47330316742081446, "grad_norm": 2.735262045716061, "learning_rate": 9.398317986196928e-06, "loss": 0.1615, "step": 523 }, { "epoch": 0.47420814479638007, "grad_norm": 2.5817213788283686, "learning_rate": 9.396062421354424e-06, "loss": 0.1391, "step": 524 }, { "epoch": 0.4751131221719457, "grad_norm": 2.7924317530294425, "learning_rate": 9.393802908335978e-06, "loss": 0.1315, "step": 525 }, { "epoch": 0.4760180995475113, "grad_norm": 2.674252364881151, "learning_rate": 9.391539449170894e-06, "loss": 0.1375, "step": 526 }, { "epoch": 0.47692307692307695, "grad_norm": 2.935392332411267, "learning_rate": 9.389272045892023e-06, "loss": 0.1868, "step": 527 }, { "epoch": 0.47782805429864256, "grad_norm": 2.2747824982154383, "learning_rate": 9.387000700535758e-06, "loss": 0.1429, "step": 528 }, { "epoch": 0.47873303167420816, "grad_norm": 2.4196296521019045, "learning_rate": 9.38472541514203e-06, "loss": 0.1559, "step": 529 }, { "epoch": 0.4796380090497738, "grad_norm": 2.531892443580504, "learning_rate": 9.382446191754313e-06, "loss": 0.1392, "step": 530 }, { "epoch": 0.4805429864253394, "grad_norm": 2.4797064823060815, "learning_rate": 9.380163032419611e-06, "loss": 0.1235, "step": 531 }, { "epoch": 0.481447963800905, "grad_norm": 3.7794077416965086, "learning_rate": 9.37787593918847e-06, "loss": 0.2187, "step": 532 }, { "epoch": 0.4823529411764706, "grad_norm": 2.679310736074672, "learning_rate": 9.375584914114963e-06, "loss": 0.1466, "step": 533 }, { "epoch": 0.4832579185520362, "grad_norm": 3.922469898529439, "learning_rate": 9.373289959256702e-06, "loss": 0.196, "step": 534 }, { "epoch": 0.4841628959276018, "grad_norm": 2.929253783386853, "learning_rate": 9.370991076674821e-06, "loss": 0.1702, "step": 535 }, { "epoch": 0.4850678733031674, "grad_norm": 1.9090547731325358, "learning_rate": 9.368688268433982e-06, "loss": 0.1002, "step": 536 }, { "epoch": 0.485972850678733, "grad_norm": 3.8633460073349157, "learning_rate": 9.366381536602377e-06, "loss": 0.2739, "step": 537 }, { "epoch": 0.48687782805429863, "grad_norm": 2.84345494567985, "learning_rate": 9.36407088325172e-06, "loss": 0.1606, "step": 538 }, { "epoch": 0.48778280542986424, "grad_norm": 3.753855482603751, "learning_rate": 9.361756310457249e-06, "loss": 0.1495, "step": 539 }, { "epoch": 0.48868778280542985, "grad_norm": 3.789457022782253, "learning_rate": 9.359437820297716e-06, "loss": 0.1369, "step": 540 }, { "epoch": 0.48959276018099546, "grad_norm": 3.323725228766967, "learning_rate": 9.357115414855397e-06, "loss": 0.1212, "step": 541 }, { "epoch": 0.49049773755656106, "grad_norm": 2.8825671169645086, "learning_rate": 9.35478909621608e-06, "loss": 0.1555, "step": 542 }, { "epoch": 0.49140271493212667, "grad_norm": 2.276773822311198, "learning_rate": 9.352458866469076e-06, "loss": 0.1259, "step": 543 }, { "epoch": 0.49230769230769234, "grad_norm": 3.0117926411125056, "learning_rate": 9.350124727707197e-06, "loss": 0.1677, "step": 544 }, { "epoch": 0.49321266968325794, "grad_norm": 2.828834820711535, "learning_rate": 9.347786682026774e-06, "loss": 0.1797, "step": 545 }, { "epoch": 0.49411764705882355, "grad_norm": 3.3959688621469692, "learning_rate": 9.345444731527642e-06, "loss": 0.1694, "step": 546 }, { "epoch": 0.49502262443438916, "grad_norm": 2.883907582286255, "learning_rate": 9.343098878313149e-06, "loss": 0.1295, "step": 547 }, { "epoch": 0.49592760180995477, "grad_norm": 2.503907740275003, "learning_rate": 9.34074912449014e-06, "loss": 0.1287, "step": 548 }, { "epoch": 0.4968325791855204, "grad_norm": 2.439561077751108, "learning_rate": 9.33839547216897e-06, "loss": 0.1471, "step": 549 }, { "epoch": 0.497737556561086, "grad_norm": 3.740875676378895, "learning_rate": 9.336037923463494e-06, "loss": 0.2224, "step": 550 }, { "epoch": 0.4986425339366516, "grad_norm": 3.215710205690654, "learning_rate": 9.333676480491063e-06, "loss": 0.1569, "step": 551 }, { "epoch": 0.4995475113122172, "grad_norm": 2.755235943269231, "learning_rate": 9.331311145372528e-06, "loss": 0.1285, "step": 552 }, { "epoch": 0.5004524886877828, "grad_norm": 2.1647211159432946, "learning_rate": 9.328941920232237e-06, "loss": 0.1317, "step": 553 }, { "epoch": 0.5013574660633484, "grad_norm": 2.7303210190917713, "learning_rate": 9.326568807198028e-06, "loss": 0.1554, "step": 554 }, { "epoch": 0.502262443438914, "grad_norm": 2.182150056307695, "learning_rate": 9.324191808401235e-06, "loss": 0.108, "step": 555 }, { "epoch": 0.5031674208144796, "grad_norm": 3.0865560788411592, "learning_rate": 9.321810925976677e-06, "loss": 0.1592, "step": 556 }, { "epoch": 0.5040723981900452, "grad_norm": 3.0063515318288925, "learning_rate": 9.319426162062665e-06, "loss": 0.1912, "step": 557 }, { "epoch": 0.5049773755656108, "grad_norm": 2.595504879662294, "learning_rate": 9.317037518800997e-06, "loss": 0.1598, "step": 558 }, { "epoch": 0.5058823529411764, "grad_norm": 4.970306226707205, "learning_rate": 9.31464499833695e-06, "loss": 0.2448, "step": 559 }, { "epoch": 0.5067873303167421, "grad_norm": 3.4198337840090702, "learning_rate": 9.312248602819284e-06, "loss": 0.1271, "step": 560 }, { "epoch": 0.5076923076923077, "grad_norm": 2.305503937960965, "learning_rate": 9.309848334400247e-06, "loss": 0.1184, "step": 561 }, { "epoch": 0.5085972850678733, "grad_norm": 2.4686232549615403, "learning_rate": 9.307444195235555e-06, "loss": 0.1352, "step": 562 }, { "epoch": 0.5095022624434389, "grad_norm": 2.780922298739625, "learning_rate": 9.305036187484407e-06, "loss": 0.1588, "step": 563 }, { "epoch": 0.5104072398190045, "grad_norm": 3.0676405380428697, "learning_rate": 9.302624313309471e-06, "loss": 0.1642, "step": 564 }, { "epoch": 0.5113122171945701, "grad_norm": 2.8198451942305303, "learning_rate": 9.300208574876897e-06, "loss": 0.1603, "step": 565 }, { "epoch": 0.5122171945701357, "grad_norm": 2.4336950542880276, "learning_rate": 9.297788974356294e-06, "loss": 0.1712, "step": 566 }, { "epoch": 0.5131221719457013, "grad_norm": 4.465230619829966, "learning_rate": 9.295365513920749e-06, "loss": 0.2117, "step": 567 }, { "epoch": 0.5140271493212669, "grad_norm": 2.9656010720317556, "learning_rate": 9.292938195746806e-06, "loss": 0.1532, "step": 568 }, { "epoch": 0.5149321266968325, "grad_norm": 2.3375223794917352, "learning_rate": 9.290507022014486e-06, "loss": 0.1415, "step": 569 }, { "epoch": 0.5158371040723982, "grad_norm": 2.611356021230858, "learning_rate": 9.288071994907262e-06, "loss": 0.1274, "step": 570 }, { "epoch": 0.5167420814479639, "grad_norm": 2.248349675566894, "learning_rate": 9.285633116612071e-06, "loss": 0.1194, "step": 571 }, { "epoch": 0.5176470588235295, "grad_norm": 1.9408557085463654, "learning_rate": 9.283190389319315e-06, "loss": 0.1169, "step": 572 }, { "epoch": 0.5185520361990951, "grad_norm": 3.281804953327162, "learning_rate": 9.280743815222842e-06, "loss": 0.1667, "step": 573 }, { "epoch": 0.5194570135746607, "grad_norm": 3.5388468836291955, "learning_rate": 9.278293396519965e-06, "loss": 0.1616, "step": 574 }, { "epoch": 0.5203619909502263, "grad_norm": 2.7949093802500955, "learning_rate": 9.275839135411439e-06, "loss": 0.1543, "step": 575 }, { "epoch": 0.5212669683257919, "grad_norm": 3.636672691080532, "learning_rate": 9.273381034101483e-06, "loss": 0.2173, "step": 576 }, { "epoch": 0.5221719457013575, "grad_norm": 3.598288741254789, "learning_rate": 9.270919094797754e-06, "loss": 0.2088, "step": 577 }, { "epoch": 0.5230769230769231, "grad_norm": 2.4371756766083563, "learning_rate": 9.268453319711362e-06, "loss": 0.1494, "step": 578 }, { "epoch": 0.5239819004524887, "grad_norm": 2.682766960552868, "learning_rate": 9.26598371105686e-06, "loss": 0.1303, "step": 579 }, { "epoch": 0.5248868778280543, "grad_norm": 1.969423008624484, "learning_rate": 9.263510271052243e-06, "loss": 0.1177, "step": 580 }, { "epoch": 0.5257918552036199, "grad_norm": 3.3895147424317704, "learning_rate": 9.26103300191895e-06, "loss": 0.1693, "step": 581 }, { "epoch": 0.5266968325791855, "grad_norm": 4.283053456886952, "learning_rate": 9.258551905881856e-06, "loss": 0.22, "step": 582 }, { "epoch": 0.5276018099547511, "grad_norm": 2.901734841424762, "learning_rate": 9.256066985169274e-06, "loss": 0.1396, "step": 583 }, { "epoch": 0.5285067873303168, "grad_norm": 2.021271148003746, "learning_rate": 9.253578242012951e-06, "loss": 0.1206, "step": 584 }, { "epoch": 0.5294117647058824, "grad_norm": 3.10331321671179, "learning_rate": 9.251085678648072e-06, "loss": 0.1397, "step": 585 }, { "epoch": 0.530316742081448, "grad_norm": 2.5535782822464594, "learning_rate": 9.248589297313245e-06, "loss": 0.1694, "step": 586 }, { "epoch": 0.5312217194570136, "grad_norm": 2.6532548103330615, "learning_rate": 9.246089100250513e-06, "loss": 0.1337, "step": 587 }, { "epoch": 0.5321266968325792, "grad_norm": 3.9476630547771028, "learning_rate": 9.243585089705344e-06, "loss": 0.2685, "step": 588 }, { "epoch": 0.5330316742081448, "grad_norm": 2.947340551365793, "learning_rate": 9.241077267926632e-06, "loss": 0.1606, "step": 589 }, { "epoch": 0.5339366515837104, "grad_norm": 2.6039630350837037, "learning_rate": 9.238565637166692e-06, "loss": 0.1479, "step": 590 }, { "epoch": 0.534841628959276, "grad_norm": 2.0093587106363477, "learning_rate": 9.236050199681259e-06, "loss": 0.1005, "step": 591 }, { "epoch": 0.5357466063348416, "grad_norm": 3.1717971113145413, "learning_rate": 9.23353095772949e-06, "loss": 0.1832, "step": 592 }, { "epoch": 0.5366515837104072, "grad_norm": 2.508722848468128, "learning_rate": 9.231007913573961e-06, "loss": 0.1199, "step": 593 }, { "epoch": 0.5375565610859728, "grad_norm": 3.0528186692086483, "learning_rate": 9.228481069480655e-06, "loss": 0.211, "step": 594 }, { "epoch": 0.5384615384615384, "grad_norm": 3.4397495148909787, "learning_rate": 9.225950427718974e-06, "loss": 0.1559, "step": 595 }, { "epoch": 0.539366515837104, "grad_norm": 2.6591457008164405, "learning_rate": 9.22341599056173e-06, "loss": 0.142, "step": 596 }, { "epoch": 0.5402714932126697, "grad_norm": 2.4513815138849613, "learning_rate": 9.22087776028514e-06, "loss": 0.1503, "step": 597 }, { "epoch": 0.5411764705882353, "grad_norm": 2.8870092333204487, "learning_rate": 9.218335739168833e-06, "loss": 0.1562, "step": 598 }, { "epoch": 0.5420814479638009, "grad_norm": 2.056427444951522, "learning_rate": 9.215789929495837e-06, "loss": 0.1176, "step": 599 }, { "epoch": 0.5429864253393665, "grad_norm": 2.343956312336827, "learning_rate": 9.213240333552589e-06, "loss": 0.1303, "step": 600 }, { "epoch": 0.5438914027149321, "grad_norm": 2.5599305823729726, "learning_rate": 9.210686953628921e-06, "loss": 0.1151, "step": 601 }, { "epoch": 0.5447963800904977, "grad_norm": 2.6275329983843707, "learning_rate": 9.208129792018066e-06, "loss": 0.1639, "step": 602 }, { "epoch": 0.5457013574660633, "grad_norm": 2.865673707539603, "learning_rate": 9.205568851016653e-06, "loss": 0.1463, "step": 603 }, { "epoch": 0.5466063348416289, "grad_norm": 2.803136027455164, "learning_rate": 9.203004132924705e-06, "loss": 0.1309, "step": 604 }, { "epoch": 0.5475113122171946, "grad_norm": 2.2265333503286806, "learning_rate": 9.200435640045637e-06, "loss": 0.1168, "step": 605 }, { "epoch": 0.5484162895927602, "grad_norm": 2.0784632636451965, "learning_rate": 9.197863374686255e-06, "loss": 0.1144, "step": 606 }, { "epoch": 0.5493212669683258, "grad_norm": 3.295754558065845, "learning_rate": 9.195287339156753e-06, "loss": 0.2181, "step": 607 }, { "epoch": 0.5502262443438914, "grad_norm": 2.9274199342925113, "learning_rate": 9.192707535770711e-06, "loss": 0.1467, "step": 608 }, { "epoch": 0.551131221719457, "grad_norm": 3.810981921332449, "learning_rate": 9.190123966845092e-06, "loss": 0.1275, "step": 609 }, { "epoch": 0.5520361990950227, "grad_norm": 2.74957147266702, "learning_rate": 9.187536634700244e-06, "loss": 0.1396, "step": 610 }, { "epoch": 0.5529411764705883, "grad_norm": 2.679205905481507, "learning_rate": 9.18494554165989e-06, "loss": 0.1443, "step": 611 }, { "epoch": 0.5538461538461539, "grad_norm": 3.1686823919988503, "learning_rate": 9.182350690051134e-06, "loss": 0.2068, "step": 612 }, { "epoch": 0.5547511312217195, "grad_norm": 2.663830676313956, "learning_rate": 9.179752082204456e-06, "loss": 0.132, "step": 613 }, { "epoch": 0.5556561085972851, "grad_norm": 3.170717490148902, "learning_rate": 9.177149720453707e-06, "loss": 0.2415, "step": 614 }, { "epoch": 0.5565610859728507, "grad_norm": 2.3171101623708292, "learning_rate": 9.174543607136111e-06, "loss": 0.139, "step": 615 }, { "epoch": 0.5574660633484163, "grad_norm": 3.2814478664077744, "learning_rate": 9.171933744592262e-06, "loss": 0.2057, "step": 616 }, { "epoch": 0.5583710407239819, "grad_norm": 3.4358004720889075, "learning_rate": 9.169320135166121e-06, "loss": 0.1581, "step": 617 }, { "epoch": 0.5592760180995475, "grad_norm": 3.752685774846717, "learning_rate": 9.166702781205013e-06, "loss": 0.1829, "step": 618 }, { "epoch": 0.5601809954751131, "grad_norm": 3.991858626726494, "learning_rate": 9.164081685059625e-06, "loss": 0.2271, "step": 619 }, { "epoch": 0.5610859728506787, "grad_norm": 3.3528294028901695, "learning_rate": 9.161456849084007e-06, "loss": 0.1519, "step": 620 }, { "epoch": 0.5619909502262443, "grad_norm": 2.537319162127424, "learning_rate": 9.158828275635569e-06, "loss": 0.137, "step": 621 }, { "epoch": 0.56289592760181, "grad_norm": 2.569918363402158, "learning_rate": 9.156195967075077e-06, "loss": 0.1567, "step": 622 }, { "epoch": 0.5638009049773756, "grad_norm": 2.287381675825459, "learning_rate": 9.153559925766648e-06, "loss": 0.1118, "step": 623 }, { "epoch": 0.5647058823529412, "grad_norm": 2.3913481902896168, "learning_rate": 9.150920154077753e-06, "loss": 0.1319, "step": 624 }, { "epoch": 0.5656108597285068, "grad_norm": 4.707871108621468, "learning_rate": 9.14827665437922e-06, "loss": 0.1613, "step": 625 }, { "epoch": 0.5665158371040724, "grad_norm": 3.249673424564298, "learning_rate": 9.145629429045214e-06, "loss": 0.2763, "step": 626 }, { "epoch": 0.567420814479638, "grad_norm": 2.067111809707517, "learning_rate": 9.142978480453251e-06, "loss": 0.1113, "step": 627 }, { "epoch": 0.5683257918552036, "grad_norm": 2.1646721331988896, "learning_rate": 9.140323810984197e-06, "loss": 0.1119, "step": 628 }, { "epoch": 0.5692307692307692, "grad_norm": 2.4745824698415717, "learning_rate": 9.13766542302225e-06, "loss": 0.1307, "step": 629 }, { "epoch": 0.5701357466063348, "grad_norm": 3.2308888347579208, "learning_rate": 9.135003318954954e-06, "loss": 0.1431, "step": 630 }, { "epoch": 0.5710407239819004, "grad_norm": 2.293770637208164, "learning_rate": 9.132337501173186e-06, "loss": 0.1559, "step": 631 }, { "epoch": 0.571945701357466, "grad_norm": 2.5461414083300444, "learning_rate": 9.129667972071163e-06, "loss": 0.1637, "step": 632 }, { "epoch": 0.5728506787330316, "grad_norm": 2.9016123137015506, "learning_rate": 9.126994734046432e-06, "loss": 0.159, "step": 633 }, { "epoch": 0.5737556561085972, "grad_norm": 3.3341352036861682, "learning_rate": 9.124317789499874e-06, "loss": 0.1952, "step": 634 }, { "epoch": 0.5746606334841629, "grad_norm": 3.105182531498295, "learning_rate": 9.121637140835696e-06, "loss": 0.2949, "step": 635 }, { "epoch": 0.5755656108597285, "grad_norm": 2.3056411637243968, "learning_rate": 9.11895279046143e-06, "loss": 0.1421, "step": 636 }, { "epoch": 0.5764705882352941, "grad_norm": 3.905424539905286, "learning_rate": 9.116264740787937e-06, "loss": 0.2174, "step": 637 }, { "epoch": 0.5773755656108597, "grad_norm": 2.432243999601415, "learning_rate": 9.1135729942294e-06, "loss": 0.1287, "step": 638 }, { "epoch": 0.5782805429864254, "grad_norm": 3.792296564286366, "learning_rate": 9.110877553203318e-06, "loss": 0.2289, "step": 639 }, { "epoch": 0.579185520361991, "grad_norm": 3.596857978376343, "learning_rate": 9.108178420130514e-06, "loss": 0.1338, "step": 640 }, { "epoch": 0.5800904977375566, "grad_norm": 2.2291635801254803, "learning_rate": 9.105475597435122e-06, "loss": 0.134, "step": 641 }, { "epoch": 0.5809954751131222, "grad_norm": 2.6811091565901775, "learning_rate": 9.102769087544591e-06, "loss": 0.2155, "step": 642 }, { "epoch": 0.5819004524886878, "grad_norm": 2.48004129322547, "learning_rate": 9.100058892889684e-06, "loss": 0.1383, "step": 643 }, { "epoch": 0.5828054298642534, "grad_norm": 2.267545267745005, "learning_rate": 9.09734501590447e-06, "loss": 0.1167, "step": 644 }, { "epoch": 0.583710407239819, "grad_norm": 2.4374854050028127, "learning_rate": 9.094627459026326e-06, "loss": 0.1397, "step": 645 }, { "epoch": 0.5846153846153846, "grad_norm": 2.3915086566734236, "learning_rate": 9.091906224695935e-06, "loss": 0.1138, "step": 646 }, { "epoch": 0.5855203619909503, "grad_norm": 2.356957420891488, "learning_rate": 9.089181315357286e-06, "loss": 0.1381, "step": 647 }, { "epoch": 0.5864253393665159, "grad_norm": 3.483791277207042, "learning_rate": 9.086452733457658e-06, "loss": 0.2065, "step": 648 }, { "epoch": 0.5873303167420815, "grad_norm": 2.602615716662256, "learning_rate": 9.083720481447639e-06, "loss": 0.1339, "step": 649 }, { "epoch": 0.5882352941176471, "grad_norm": 2.7592307158368476, "learning_rate": 9.08098456178111e-06, "loss": 0.1687, "step": 650 }, { "epoch": 0.5891402714932127, "grad_norm": 2.5151082164378833, "learning_rate": 9.078244976915244e-06, "loss": 0.1392, "step": 651 }, { "epoch": 0.5900452488687783, "grad_norm": 2.9525179013037275, "learning_rate": 9.075501729310507e-06, "loss": 0.1589, "step": 652 }, { "epoch": 0.5909502262443439, "grad_norm": 3.1631368337290557, "learning_rate": 9.072754821430658e-06, "loss": 0.1217, "step": 653 }, { "epoch": 0.5918552036199095, "grad_norm": 2.9787582547326728, "learning_rate": 9.070004255742737e-06, "loss": 0.1885, "step": 654 }, { "epoch": 0.5927601809954751, "grad_norm": 2.195649180911512, "learning_rate": 9.067250034717072e-06, "loss": 0.1415, "step": 655 }, { "epoch": 0.5936651583710407, "grad_norm": 2.9859067845348646, "learning_rate": 9.064492160827276e-06, "loss": 0.149, "step": 656 }, { "epoch": 0.5945701357466063, "grad_norm": 2.7514586096607134, "learning_rate": 9.06173063655024e-06, "loss": 0.1134, "step": 657 }, { "epoch": 0.5954751131221719, "grad_norm": 3.012817728228047, "learning_rate": 9.058965464366133e-06, "loss": 0.1832, "step": 658 }, { "epoch": 0.5963800904977375, "grad_norm": 2.349966666196632, "learning_rate": 9.056196646758406e-06, "loss": 0.1296, "step": 659 }, { "epoch": 0.5972850678733032, "grad_norm": 2.7014006958152894, "learning_rate": 9.053424186213776e-06, "loss": 0.1362, "step": 660 }, { "epoch": 0.5981900452488688, "grad_norm": 3.9892617395789793, "learning_rate": 9.050648085222234e-06, "loss": 0.1776, "step": 661 }, { "epoch": 0.5990950226244344, "grad_norm": 2.908990841263601, "learning_rate": 9.047868346277045e-06, "loss": 0.1696, "step": 662 }, { "epoch": 0.6, "grad_norm": 3.8700538834827776, "learning_rate": 9.045084971874738e-06, "loss": 0.247, "step": 663 }, { "epoch": 0.6009049773755656, "grad_norm": 2.4352392048533305, "learning_rate": 9.042297964515106e-06, "loss": 0.1497, "step": 664 }, { "epoch": 0.6018099547511312, "grad_norm": 2.8264863822175372, "learning_rate": 9.039507326701207e-06, "loss": 0.1415, "step": 665 }, { "epoch": 0.6027149321266968, "grad_norm": 3.069444361817013, "learning_rate": 9.036713060939358e-06, "loss": 0.1903, "step": 666 }, { "epoch": 0.6036199095022624, "grad_norm": 2.042111609334182, "learning_rate": 9.033915169739136e-06, "loss": 0.1313, "step": 667 }, { "epoch": 0.604524886877828, "grad_norm": 3.6229608846206625, "learning_rate": 9.031113655613372e-06, "loss": 0.1974, "step": 668 }, { "epoch": 0.6054298642533936, "grad_norm": 2.6632493014862986, "learning_rate": 9.028308521078154e-06, "loss": 0.1234, "step": 669 }, { "epoch": 0.6063348416289592, "grad_norm": 1.9926283571756347, "learning_rate": 9.025499768652817e-06, "loss": 0.1013, "step": 670 }, { "epoch": 0.6072398190045248, "grad_norm": 3.1375396346313336, "learning_rate": 9.022687400859947e-06, "loss": 0.1676, "step": 671 }, { "epoch": 0.6081447963800904, "grad_norm": 4.135056517500281, "learning_rate": 9.019871420225382e-06, "loss": 0.271, "step": 672 }, { "epoch": 0.609049773755656, "grad_norm": 2.6734431689570224, "learning_rate": 9.017051829278196e-06, "loss": 0.1408, "step": 673 }, { "epoch": 0.6099547511312218, "grad_norm": 2.6126240978089035, "learning_rate": 9.014228630550712e-06, "loss": 0.1543, "step": 674 }, { "epoch": 0.6108597285067874, "grad_norm": 2.7130159003789966, "learning_rate": 9.011401826578492e-06, "loss": 0.121, "step": 675 }, { "epoch": 0.611764705882353, "grad_norm": 2.5561402795817734, "learning_rate": 9.008571419900334e-06, "loss": 0.1433, "step": 676 }, { "epoch": 0.6126696832579186, "grad_norm": 2.7319350782860816, "learning_rate": 9.005737413058273e-06, "loss": 0.1336, "step": 677 }, { "epoch": 0.6135746606334842, "grad_norm": 3.2588544828679393, "learning_rate": 9.002899808597577e-06, "loss": 0.1765, "step": 678 }, { "epoch": 0.6144796380090498, "grad_norm": 2.5245482954413268, "learning_rate": 9.000058609066746e-06, "loss": 0.1374, "step": 679 }, { "epoch": 0.6153846153846154, "grad_norm": 2.953551154446445, "learning_rate": 8.997213817017508e-06, "loss": 0.1255, "step": 680 }, { "epoch": 0.616289592760181, "grad_norm": 2.7660884495017966, "learning_rate": 8.994365435004814e-06, "loss": 0.1188, "step": 681 }, { "epoch": 0.6171945701357466, "grad_norm": 2.0149984333911823, "learning_rate": 8.991513465586848e-06, "loss": 0.1107, "step": 682 }, { "epoch": 0.6180995475113122, "grad_norm": 2.6730291425693458, "learning_rate": 8.988657911325008e-06, "loss": 0.1827, "step": 683 }, { "epoch": 0.6190045248868778, "grad_norm": 2.8302865429471784, "learning_rate": 8.985798774783914e-06, "loss": 0.1549, "step": 684 }, { "epoch": 0.6199095022624435, "grad_norm": 3.053926146373984, "learning_rate": 8.982936058531403e-06, "loss": 0.116, "step": 685 }, { "epoch": 0.6208144796380091, "grad_norm": 3.4680358862141754, "learning_rate": 8.98006976513853e-06, "loss": 0.1974, "step": 686 }, { "epoch": 0.6217194570135747, "grad_norm": 1.9944831150782294, "learning_rate": 8.977199897179558e-06, "loss": 0.1316, "step": 687 }, { "epoch": 0.6226244343891403, "grad_norm": 3.9230417394804036, "learning_rate": 8.974326457231964e-06, "loss": 0.2087, "step": 688 }, { "epoch": 0.6235294117647059, "grad_norm": 2.215194521712967, "learning_rate": 8.97144944787643e-06, "loss": 0.1081, "step": 689 }, { "epoch": 0.6244343891402715, "grad_norm": 2.3578597644187482, "learning_rate": 8.968568871696847e-06, "loss": 0.1233, "step": 690 }, { "epoch": 0.6253393665158371, "grad_norm": 2.750797038735155, "learning_rate": 8.965684731280309e-06, "loss": 0.1894, "step": 691 }, { "epoch": 0.6262443438914027, "grad_norm": 3.078969179298111, "learning_rate": 8.962797029217107e-06, "loss": 0.1912, "step": 692 }, { "epoch": 0.6271493212669683, "grad_norm": 2.9651393222196676, "learning_rate": 8.959905768100734e-06, "loss": 0.1441, "step": 693 }, { "epoch": 0.6280542986425339, "grad_norm": 2.6487934421041137, "learning_rate": 8.957010950527881e-06, "loss": 0.1267, "step": 694 }, { "epoch": 0.6289592760180995, "grad_norm": 1.8815989309396086, "learning_rate": 8.95411257909843e-06, "loss": 0.1129, "step": 695 }, { "epoch": 0.6298642533936651, "grad_norm": 2.657012714284515, "learning_rate": 8.951210656415457e-06, "loss": 0.155, "step": 696 }, { "epoch": 0.6307692307692307, "grad_norm": 2.7632606457884523, "learning_rate": 8.948305185085226e-06, "loss": 0.1251, "step": 697 }, { "epoch": 0.6316742081447964, "grad_norm": 3.157693567764918, "learning_rate": 8.945396167717187e-06, "loss": 0.1765, "step": 698 }, { "epoch": 0.632579185520362, "grad_norm": 4.390916890959404, "learning_rate": 8.94248360692398e-06, "loss": 0.1553, "step": 699 }, { "epoch": 0.6334841628959276, "grad_norm": 1.9706401458362335, "learning_rate": 8.939567505321418e-06, "loss": 0.0966, "step": 700 }, { "epoch": 0.6343891402714932, "grad_norm": 3.155641372126822, "learning_rate": 8.936647865528507e-06, "loss": 0.2151, "step": 701 }, { "epoch": 0.6352941176470588, "grad_norm": 2.8473977370431327, "learning_rate": 8.933724690167417e-06, "loss": 0.1703, "step": 702 }, { "epoch": 0.6361990950226244, "grad_norm": 2.818532094728658, "learning_rate": 8.930797981863504e-06, "loss": 0.1594, "step": 703 }, { "epoch": 0.63710407239819, "grad_norm": 4.937306672700299, "learning_rate": 8.927867743245292e-06, "loss": 0.2431, "step": 704 }, { "epoch": 0.6380090497737556, "grad_norm": 3.401257791479914, "learning_rate": 8.924933976944474e-06, "loss": 0.1546, "step": 705 }, { "epoch": 0.6389140271493212, "grad_norm": 2.9365601012171005, "learning_rate": 8.921996685595917e-06, "loss": 0.1359, "step": 706 }, { "epoch": 0.6398190045248868, "grad_norm": 2.5639604155847495, "learning_rate": 8.919055871837649e-06, "loss": 0.2185, "step": 707 }, { "epoch": 0.6407239819004525, "grad_norm": 2.2517907378316475, "learning_rate": 8.916111538310864e-06, "loss": 0.1214, "step": 708 }, { "epoch": 0.6416289592760182, "grad_norm": 3.978260758402524, "learning_rate": 8.913163687659915e-06, "loss": 0.2814, "step": 709 }, { "epoch": 0.6425339366515838, "grad_norm": 2.638976521538929, "learning_rate": 8.910212322532317e-06, "loss": 0.1641, "step": 710 }, { "epoch": 0.6434389140271494, "grad_norm": 2.02066886631697, "learning_rate": 8.907257445578739e-06, "loss": 0.1019, "step": 711 }, { "epoch": 0.644343891402715, "grad_norm": 8.276965893921547, "learning_rate": 8.904299059453003e-06, "loss": 0.2576, "step": 712 }, { "epoch": 0.6452488687782806, "grad_norm": 2.694319474805787, "learning_rate": 8.901337166812087e-06, "loss": 0.1628, "step": 713 }, { "epoch": 0.6461538461538462, "grad_norm": 2.653213645252289, "learning_rate": 8.898371770316113e-06, "loss": 0.1573, "step": 714 }, { "epoch": 0.6470588235294118, "grad_norm": 2.5650614625355983, "learning_rate": 8.895402872628352e-06, "loss": 0.1733, "step": 715 }, { "epoch": 0.6479638009049774, "grad_norm": 2.791359524567814, "learning_rate": 8.892430476415223e-06, "loss": 0.1838, "step": 716 }, { "epoch": 0.648868778280543, "grad_norm": 2.6959291988097047, "learning_rate": 8.889454584346281e-06, "loss": 0.1452, "step": 717 }, { "epoch": 0.6497737556561086, "grad_norm": 2.480334771771893, "learning_rate": 8.886475199094226e-06, "loss": 0.135, "step": 718 }, { "epoch": 0.6506787330316742, "grad_norm": 3.090255790699134, "learning_rate": 8.88349232333489e-06, "loss": 0.136, "step": 719 }, { "epoch": 0.6515837104072398, "grad_norm": 3.7778160217799015, "learning_rate": 8.880505959747245e-06, "loss": 0.2363, "step": 720 }, { "epoch": 0.6524886877828054, "grad_norm": 2.198750850511883, "learning_rate": 8.877516111013391e-06, "loss": 0.1353, "step": 721 }, { "epoch": 0.653393665158371, "grad_norm": 3.042202065195582, "learning_rate": 8.874522779818563e-06, "loss": 0.1841, "step": 722 }, { "epoch": 0.6542986425339367, "grad_norm": 2.5025941358558867, "learning_rate": 8.87152596885112e-06, "loss": 0.165, "step": 723 }, { "epoch": 0.6552036199095023, "grad_norm": 2.5031218202677445, "learning_rate": 8.868525680802547e-06, "loss": 0.1291, "step": 724 }, { "epoch": 0.6561085972850679, "grad_norm": 3.9924770883310714, "learning_rate": 8.86552191836745e-06, "loss": 0.2105, "step": 725 }, { "epoch": 0.6570135746606335, "grad_norm": 2.5820133053792302, "learning_rate": 8.862514684243562e-06, "loss": 0.1499, "step": 726 }, { "epoch": 0.6579185520361991, "grad_norm": 2.7335023521258353, "learning_rate": 8.859503981131726e-06, "loss": 0.1586, "step": 727 }, { "epoch": 0.6588235294117647, "grad_norm": 3.597883358481034, "learning_rate": 8.856489811735904e-06, "loss": 0.24, "step": 728 }, { "epoch": 0.6597285067873303, "grad_norm": 3.0368654137396924, "learning_rate": 8.853472178763172e-06, "loss": 0.1924, "step": 729 }, { "epoch": 0.6606334841628959, "grad_norm": 2.383689168717609, "learning_rate": 8.850451084923717e-06, "loss": 0.1284, "step": 730 }, { "epoch": 0.6615384615384615, "grad_norm": 3.033908592753215, "learning_rate": 8.84742653293083e-06, "loss": 0.1877, "step": 731 }, { "epoch": 0.6624434389140271, "grad_norm": 2.2565572135016327, "learning_rate": 8.844398525500915e-06, "loss": 0.1097, "step": 732 }, { "epoch": 0.6633484162895927, "grad_norm": 2.158062839580616, "learning_rate": 8.84136706535347e-06, "loss": 0.1084, "step": 733 }, { "epoch": 0.6642533936651583, "grad_norm": 2.354673320258265, "learning_rate": 8.838332155211104e-06, "loss": 0.1565, "step": 734 }, { "epoch": 0.665158371040724, "grad_norm": 2.3388704663917266, "learning_rate": 8.835293797799517e-06, "loss": 0.132, "step": 735 }, { "epoch": 0.6660633484162896, "grad_norm": 3.154382523229033, "learning_rate": 8.83225199584751e-06, "loss": 0.1761, "step": 736 }, { "epoch": 0.6669683257918552, "grad_norm": 2.6651742189381906, "learning_rate": 8.829206752086973e-06, "loss": 0.1571, "step": 737 }, { "epoch": 0.6678733031674208, "grad_norm": 2.446519593226046, "learning_rate": 8.826158069252887e-06, "loss": 0.1385, "step": 738 }, { "epoch": 0.6687782805429864, "grad_norm": 3.2043842136048193, "learning_rate": 8.823105950083332e-06, "loss": 0.1902, "step": 739 }, { "epoch": 0.669683257918552, "grad_norm": 2.793162274977135, "learning_rate": 8.82005039731946e-06, "loss": 0.1503, "step": 740 }, { "epoch": 0.6705882352941176, "grad_norm": 3.219404999624288, "learning_rate": 8.816991413705515e-06, "loss": 0.1856, "step": 741 }, { "epoch": 0.6714932126696832, "grad_norm": 4.8178388498054625, "learning_rate": 8.813929001988821e-06, "loss": 0.2442, "step": 742 }, { "epoch": 0.6723981900452489, "grad_norm": 2.2591121420708973, "learning_rate": 8.81086316491978e-06, "loss": 0.1012, "step": 743 }, { "epoch": 0.6733031674208145, "grad_norm": 2.1904570139928476, "learning_rate": 8.807793905251871e-06, "loss": 0.1256, "step": 744 }, { "epoch": 0.6742081447963801, "grad_norm": 2.057062464842664, "learning_rate": 8.804721225741646e-06, "loss": 0.1204, "step": 745 }, { "epoch": 0.6751131221719457, "grad_norm": 2.449871058412831, "learning_rate": 8.80164512914873e-06, "loss": 0.1109, "step": 746 }, { "epoch": 0.6760180995475114, "grad_norm": 4.442089830980623, "learning_rate": 8.798565618235814e-06, "loss": 0.2433, "step": 747 }, { "epoch": 0.676923076923077, "grad_norm": 2.5760097552617647, "learning_rate": 8.795482695768658e-06, "loss": 0.1348, "step": 748 }, { "epoch": 0.6778280542986426, "grad_norm": 2.254171294156851, "learning_rate": 8.792396364516087e-06, "loss": 0.1094, "step": 749 }, { "epoch": 0.6787330316742082, "grad_norm": 2.136091579064456, "learning_rate": 8.789306627249985e-06, "loss": 0.0985, "step": 750 }, { "epoch": 0.6796380090497738, "grad_norm": 2.695254217941918, "learning_rate": 8.786213486745294e-06, "loss": 0.2005, "step": 751 }, { "epoch": 0.6805429864253394, "grad_norm": 2.209795835510196, "learning_rate": 8.783116945780017e-06, "loss": 0.1204, "step": 752 }, { "epoch": 0.681447963800905, "grad_norm": 2.082628887936816, "learning_rate": 8.780017007135206e-06, "loss": 0.1208, "step": 753 }, { "epoch": 0.6823529411764706, "grad_norm": 2.5360276921615688, "learning_rate": 8.776913673594968e-06, "loss": 0.1221, "step": 754 }, { "epoch": 0.6832579185520362, "grad_norm": 2.0534433240511687, "learning_rate": 8.77380694794646e-06, "loss": 0.1126, "step": 755 }, { "epoch": 0.6841628959276018, "grad_norm": 4.944531642031324, "learning_rate": 8.77069683297988e-06, "loss": 0.1856, "step": 756 }, { "epoch": 0.6850678733031674, "grad_norm": 3.765327019453328, "learning_rate": 8.767583331488476e-06, "loss": 0.119, "step": 757 }, { "epoch": 0.685972850678733, "grad_norm": 3.9462227698131387, "learning_rate": 8.764466446268533e-06, "loss": 0.228, "step": 758 }, { "epoch": 0.6868778280542986, "grad_norm": 2.6576773492990986, "learning_rate": 8.761346180119377e-06, "loss": 0.1786, "step": 759 }, { "epoch": 0.6877828054298643, "grad_norm": 2.749578908436458, "learning_rate": 8.75822253584337e-06, "loss": 0.1543, "step": 760 }, { "epoch": 0.6886877828054299, "grad_norm": 2.8495077336041033, "learning_rate": 8.755095516245912e-06, "loss": 0.143, "step": 761 }, { "epoch": 0.6895927601809955, "grad_norm": 2.3544987654960146, "learning_rate": 8.751965124135426e-06, "loss": 0.1533, "step": 762 }, { "epoch": 0.6904977375565611, "grad_norm": 4.341084434993484, "learning_rate": 8.74883136232337e-06, "loss": 0.2737, "step": 763 }, { "epoch": 0.6914027149321267, "grad_norm": 3.198786611954631, "learning_rate": 8.745694233624228e-06, "loss": 0.1831, "step": 764 }, { "epoch": 0.6923076923076923, "grad_norm": 2.432925156439658, "learning_rate": 8.742553740855507e-06, "loss": 0.1308, "step": 765 }, { "epoch": 0.6932126696832579, "grad_norm": 3.1407659851949807, "learning_rate": 8.739409886837734e-06, "loss": 0.2115, "step": 766 }, { "epoch": 0.6941176470588235, "grad_norm": 2.581337199348178, "learning_rate": 8.736262674394455e-06, "loss": 0.1599, "step": 767 }, { "epoch": 0.6950226244343891, "grad_norm": 3.316861481223149, "learning_rate": 8.733112106352238e-06, "loss": 0.2049, "step": 768 }, { "epoch": 0.6959276018099547, "grad_norm": 7.540134697946811, "learning_rate": 8.729958185540654e-06, "loss": 0.261, "step": 769 }, { "epoch": 0.6968325791855203, "grad_norm": 2.307329734502126, "learning_rate": 8.726800914792296e-06, "loss": 0.1406, "step": 770 }, { "epoch": 0.6977375565610859, "grad_norm": 2.3887782769185892, "learning_rate": 8.723640296942756e-06, "loss": 0.1537, "step": 771 }, { "epoch": 0.6986425339366515, "grad_norm": 2.9173391370603854, "learning_rate": 8.720476334830642e-06, "loss": 0.1504, "step": 772 }, { "epoch": 0.6995475113122172, "grad_norm": 3.445565082663965, "learning_rate": 8.717309031297556e-06, "loss": 0.2011, "step": 773 }, { "epoch": 0.7004524886877828, "grad_norm": 2.6959843711785, "learning_rate": 8.714138389188108e-06, "loss": 0.1264, "step": 774 }, { "epoch": 0.7013574660633484, "grad_norm": 2.20432598904184, "learning_rate": 8.710964411349902e-06, "loss": 0.133, "step": 775 }, { "epoch": 0.702262443438914, "grad_norm": 2.2908734675368745, "learning_rate": 8.707787100633543e-06, "loss": 0.1312, "step": 776 }, { "epoch": 0.7031674208144797, "grad_norm": 1.9318111048492586, "learning_rate": 8.704606459892623e-06, "loss": 0.098, "step": 777 }, { "epoch": 0.7040723981900453, "grad_norm": 2.5601768811499928, "learning_rate": 8.701422491983729e-06, "loss": 0.1488, "step": 778 }, { "epoch": 0.7049773755656109, "grad_norm": 2.636043555546599, "learning_rate": 8.698235199766432e-06, "loss": 0.1339, "step": 779 }, { "epoch": 0.7058823529411765, "grad_norm": 2.5563572716390865, "learning_rate": 8.695044586103297e-06, "loss": 0.1327, "step": 780 }, { "epoch": 0.7067873303167421, "grad_norm": 3.1095903037697536, "learning_rate": 8.69185065385986e-06, "loss": 0.2535, "step": 781 }, { "epoch": 0.7076923076923077, "grad_norm": 2.6258502129774786, "learning_rate": 8.688653405904653e-06, "loss": 0.1461, "step": 782 }, { "epoch": 0.7085972850678733, "grad_norm": 2.3626465097219302, "learning_rate": 8.685452845109168e-06, "loss": 0.1364, "step": 783 }, { "epoch": 0.709502262443439, "grad_norm": 5.225473092438738, "learning_rate": 8.682248974347884e-06, "loss": 0.3259, "step": 784 }, { "epoch": 0.7104072398190046, "grad_norm": 2.7751223998022123, "learning_rate": 8.679041796498253e-06, "loss": 0.1977, "step": 785 }, { "epoch": 0.7113122171945702, "grad_norm": 3.1899018680216376, "learning_rate": 8.675831314440692e-06, "loss": 0.1683, "step": 786 }, { "epoch": 0.7122171945701358, "grad_norm": 2.7880748451962374, "learning_rate": 8.672617531058588e-06, "loss": 0.1993, "step": 787 }, { "epoch": 0.7131221719457014, "grad_norm": 3.8374682603675128, "learning_rate": 8.66940044923829e-06, "loss": 0.2102, "step": 788 }, { "epoch": 0.714027149321267, "grad_norm": 2.919637542064198, "learning_rate": 8.666180071869116e-06, "loss": 0.2056, "step": 789 }, { "epoch": 0.7149321266968326, "grad_norm": 2.879346423599716, "learning_rate": 8.66295640184334e-06, "loss": 0.1952, "step": 790 }, { "epoch": 0.7158371040723982, "grad_norm": 2.4448246879061, "learning_rate": 8.659729442056189e-06, "loss": 0.1228, "step": 791 }, { "epoch": 0.7167420814479638, "grad_norm": 2.440825734086771, "learning_rate": 8.656499195405852e-06, "loss": 0.1711, "step": 792 }, { "epoch": 0.7176470588235294, "grad_norm": 2.8375025918165417, "learning_rate": 8.653265664793466e-06, "loss": 0.1395, "step": 793 }, { "epoch": 0.718552036199095, "grad_norm": 2.452960653652663, "learning_rate": 8.650028853123118e-06, "loss": 0.181, "step": 794 }, { "epoch": 0.7194570135746606, "grad_norm": 2.2177160121288786, "learning_rate": 8.646788763301842e-06, "loss": 0.1413, "step": 795 }, { "epoch": 0.7203619909502262, "grad_norm": 3.08804054955271, "learning_rate": 8.643545398239614e-06, "loss": 0.1399, "step": 796 }, { "epoch": 0.7212669683257918, "grad_norm": 3.657211221213087, "learning_rate": 8.640298760849358e-06, "loss": 0.1795, "step": 797 }, { "epoch": 0.7221719457013575, "grad_norm": 2.8743263358296023, "learning_rate": 8.637048854046926e-06, "loss": 0.1725, "step": 798 }, { "epoch": 0.7230769230769231, "grad_norm": 2.6604518084846207, "learning_rate": 8.633795680751116e-06, "loss": 0.1574, "step": 799 }, { "epoch": 0.7239819004524887, "grad_norm": 2.7251538180399093, "learning_rate": 8.630539243883659e-06, "loss": 0.1984, "step": 800 }, { "epoch": 0.7248868778280543, "grad_norm": 2.405947678859239, "learning_rate": 8.627279546369211e-06, "loss": 0.1326, "step": 801 }, { "epoch": 0.7257918552036199, "grad_norm": 2.358591843350857, "learning_rate": 8.62401659113536e-06, "loss": 0.1308, "step": 802 }, { "epoch": 0.7266968325791855, "grad_norm": 2.211490162835877, "learning_rate": 8.620750381112622e-06, "loss": 0.1101, "step": 803 }, { "epoch": 0.7276018099547511, "grad_norm": 2.175863585815348, "learning_rate": 8.617480919234432e-06, "loss": 0.1258, "step": 804 }, { "epoch": 0.7285067873303167, "grad_norm": 2.301915308478046, "learning_rate": 8.61420820843715e-06, "loss": 0.0991, "step": 805 }, { "epoch": 0.7294117647058823, "grad_norm": 1.6964907101118736, "learning_rate": 8.610932251660046e-06, "loss": 0.1028, "step": 806 }, { "epoch": 0.7303167420814479, "grad_norm": 2.435549834349859, "learning_rate": 8.607653051845317e-06, "loss": 0.1769, "step": 807 }, { "epoch": 0.7312217194570135, "grad_norm": 3.517793616889787, "learning_rate": 8.604370611938065e-06, "loss": 0.1651, "step": 808 }, { "epoch": 0.7321266968325791, "grad_norm": 2.7419101492089557, "learning_rate": 8.601084934886297e-06, "loss": 0.158, "step": 809 }, { "epoch": 0.7330316742081447, "grad_norm": 2.2007999639783855, "learning_rate": 8.59779602364094e-06, "loss": 0.1153, "step": 810 }, { "epoch": 0.7339366515837104, "grad_norm": 2.142126592108819, "learning_rate": 8.59450388115582e-06, "loss": 0.1071, "step": 811 }, { "epoch": 0.7348416289592761, "grad_norm": 3.4355588877922116, "learning_rate": 8.59120851038766e-06, "loss": 0.1633, "step": 812 }, { "epoch": 0.7357466063348417, "grad_norm": 2.6300015515820774, "learning_rate": 8.587909914296089e-06, "loss": 0.1479, "step": 813 }, { "epoch": 0.7366515837104073, "grad_norm": 3.0730885071717333, "learning_rate": 8.584608095843629e-06, "loss": 0.1653, "step": 814 }, { "epoch": 0.7375565610859729, "grad_norm": 2.7513614183046315, "learning_rate": 8.581303057995697e-06, "loss": 0.1305, "step": 815 }, { "epoch": 0.7384615384615385, "grad_norm": 2.956924702776653, "learning_rate": 8.577994803720605e-06, "loss": 0.1662, "step": 816 }, { "epoch": 0.7393665158371041, "grad_norm": 2.3889540657649615, "learning_rate": 8.57468333598955e-06, "loss": 0.1266, "step": 817 }, { "epoch": 0.7402714932126697, "grad_norm": 4.255622121688202, "learning_rate": 8.571368657776613e-06, "loss": 0.2128, "step": 818 }, { "epoch": 0.7411764705882353, "grad_norm": 2.8746262458596124, "learning_rate": 8.568050772058763e-06, "loss": 0.1801, "step": 819 }, { "epoch": 0.7420814479638009, "grad_norm": 2.6536227246164623, "learning_rate": 8.564729681815846e-06, "loss": 0.138, "step": 820 }, { "epoch": 0.7429864253393665, "grad_norm": 2.9073305013064537, "learning_rate": 8.561405390030589e-06, "loss": 0.1974, "step": 821 }, { "epoch": 0.7438914027149321, "grad_norm": 2.3095862678494017, "learning_rate": 8.558077899688592e-06, "loss": 0.1428, "step": 822 }, { "epoch": 0.7447963800904978, "grad_norm": 2.2162766585033666, "learning_rate": 8.554747213778332e-06, "loss": 0.169, "step": 823 }, { "epoch": 0.7457013574660634, "grad_norm": 2.390363482058944, "learning_rate": 8.55141333529115e-06, "loss": 0.133, "step": 824 }, { "epoch": 0.746606334841629, "grad_norm": 2.119593633224951, "learning_rate": 8.548076267221258e-06, "loss": 0.1017, "step": 825 }, { "epoch": 0.7475113122171946, "grad_norm": 3.3763846154439174, "learning_rate": 8.544736012565729e-06, "loss": 0.2341, "step": 826 }, { "epoch": 0.7484162895927602, "grad_norm": 2.5480346322626755, "learning_rate": 8.541392574324504e-06, "loss": 0.1293, "step": 827 }, { "epoch": 0.7493212669683258, "grad_norm": 1.6702675659656585, "learning_rate": 8.53804595550038e-06, "loss": 0.0913, "step": 828 }, { "epoch": 0.7502262443438914, "grad_norm": 2.206197520358442, "learning_rate": 8.534696159099007e-06, "loss": 0.1241, "step": 829 }, { "epoch": 0.751131221719457, "grad_norm": 10.200425470567255, "learning_rate": 8.531343188128896e-06, "loss": 0.2904, "step": 830 }, { "epoch": 0.7520361990950226, "grad_norm": 2.4717486468769434, "learning_rate": 8.527987045601404e-06, "loss": 0.1437, "step": 831 }, { "epoch": 0.7529411764705882, "grad_norm": 2.014705669864348, "learning_rate": 8.524627734530738e-06, "loss": 0.1173, "step": 832 }, { "epoch": 0.7538461538461538, "grad_norm": 2.6961516831372387, "learning_rate": 8.521265257933948e-06, "loss": 0.1329, "step": 833 }, { "epoch": 0.7547511312217194, "grad_norm": 2.729944916107847, "learning_rate": 8.517899618830932e-06, "loss": 0.1903, "step": 834 }, { "epoch": 0.755656108597285, "grad_norm": 2.6319444320063488, "learning_rate": 8.514530820244427e-06, "loss": 0.1468, "step": 835 }, { "epoch": 0.7565610859728507, "grad_norm": 2.7261481779338768, "learning_rate": 8.511158865200004e-06, "loss": 0.146, "step": 836 }, { "epoch": 0.7574660633484163, "grad_norm": 2.292486048660837, "learning_rate": 8.50778375672607e-06, "loss": 0.1345, "step": 837 }, { "epoch": 0.7583710407239819, "grad_norm": 3.5860560684298823, "learning_rate": 8.50440549785387e-06, "loss": 0.2517, "step": 838 }, { "epoch": 0.7592760180995475, "grad_norm": 1.9156023470768113, "learning_rate": 8.501024091617472e-06, "loss": 0.1187, "step": 839 }, { "epoch": 0.7601809954751131, "grad_norm": 2.735460618625039, "learning_rate": 8.497639541053769e-06, "loss": 0.2123, "step": 840 }, { "epoch": 0.7610859728506787, "grad_norm": 2.4015363165137047, "learning_rate": 8.494251849202487e-06, "loss": 0.141, "step": 841 }, { "epoch": 0.7619909502262443, "grad_norm": 2.7386036403035914, "learning_rate": 8.490861019106162e-06, "loss": 0.145, "step": 842 }, { "epoch": 0.7628959276018099, "grad_norm": 2.403440901938142, "learning_rate": 8.487467053810161e-06, "loss": 0.1404, "step": 843 }, { "epoch": 0.7638009049773755, "grad_norm": 2.0308292021578747, "learning_rate": 8.484069956362655e-06, "loss": 0.1221, "step": 844 }, { "epoch": 0.7647058823529411, "grad_norm": 2.0443774014456904, "learning_rate": 8.480669729814635e-06, "loss": 0.1, "step": 845 }, { "epoch": 0.7656108597285067, "grad_norm": 2.7079097322808825, "learning_rate": 8.477266377219899e-06, "loss": 0.1515, "step": 846 }, { "epoch": 0.7665158371040725, "grad_norm": 2.7581221251389807, "learning_rate": 8.473859901635054e-06, "loss": 0.1799, "step": 847 }, { "epoch": 0.7674208144796381, "grad_norm": 2.730366005186231, "learning_rate": 8.470450306119513e-06, "loss": 0.1371, "step": 848 }, { "epoch": 0.7683257918552037, "grad_norm": 2.454759890989941, "learning_rate": 8.46703759373549e-06, "loss": 0.1447, "step": 849 }, { "epoch": 0.7692307692307693, "grad_norm": 2.4557478327775044, "learning_rate": 8.463621767547998e-06, "loss": 0.1282, "step": 850 }, { "epoch": 0.7701357466063349, "grad_norm": 2.59299494646766, "learning_rate": 8.460202830624845e-06, "loss": 0.1299, "step": 851 }, { "epoch": 0.7710407239819005, "grad_norm": 3.515887435847182, "learning_rate": 8.456780786036636e-06, "loss": 0.2434, "step": 852 }, { "epoch": 0.7719457013574661, "grad_norm": 4.959654332444487, "learning_rate": 8.453355636856766e-06, "loss": 0.2329, "step": 853 }, { "epoch": 0.7728506787330317, "grad_norm": 2.5687760758797835, "learning_rate": 8.449927386161417e-06, "loss": 0.1434, "step": 854 }, { "epoch": 0.7737556561085973, "grad_norm": 2.3504735465137574, "learning_rate": 8.446496037029555e-06, "loss": 0.1379, "step": 855 }, { "epoch": 0.7746606334841629, "grad_norm": 2.2060399065190657, "learning_rate": 8.443061592542934e-06, "loss": 0.1351, "step": 856 }, { "epoch": 0.7755656108597285, "grad_norm": 2.3934096821777624, "learning_rate": 8.439624055786085e-06, "loss": 0.1255, "step": 857 }, { "epoch": 0.7764705882352941, "grad_norm": 2.3837760907163736, "learning_rate": 8.436183429846314e-06, "loss": 0.1546, "step": 858 }, { "epoch": 0.7773755656108597, "grad_norm": 2.330777683740034, "learning_rate": 8.432739717813707e-06, "loss": 0.1619, "step": 859 }, { "epoch": 0.7782805429864253, "grad_norm": 2.563921691110909, "learning_rate": 8.429292922781115e-06, "loss": 0.1815, "step": 860 }, { "epoch": 0.779185520361991, "grad_norm": 2.614331967463575, "learning_rate": 8.425843047844166e-06, "loss": 0.151, "step": 861 }, { "epoch": 0.7800904977375566, "grad_norm": 3.5319754250592155, "learning_rate": 8.422390096101245e-06, "loss": 0.1945, "step": 862 }, { "epoch": 0.7809954751131222, "grad_norm": 2.056545705757779, "learning_rate": 8.418934070653505e-06, "loss": 0.1323, "step": 863 }, { "epoch": 0.7819004524886878, "grad_norm": 2.3726670773497314, "learning_rate": 8.415474974604862e-06, "loss": 0.123, "step": 864 }, { "epoch": 0.7828054298642534, "grad_norm": 2.5794240512791062, "learning_rate": 8.412012811061985e-06, "loss": 0.1365, "step": 865 }, { "epoch": 0.783710407239819, "grad_norm": 2.0801546691568813, "learning_rate": 8.4085475831343e-06, "loss": 0.1054, "step": 866 }, { "epoch": 0.7846153846153846, "grad_norm": 2.8187276646023367, "learning_rate": 8.405079293933986e-06, "loss": 0.1442, "step": 867 }, { "epoch": 0.7855203619909502, "grad_norm": 2.7217064465423157, "learning_rate": 8.401607946575971e-06, "loss": 0.1269, "step": 868 }, { "epoch": 0.7864253393665158, "grad_norm": 2.2815708375440815, "learning_rate": 8.398133544177928e-06, "loss": 0.1168, "step": 869 }, { "epoch": 0.7873303167420814, "grad_norm": 2.7905194854552398, "learning_rate": 8.394656089860274e-06, "loss": 0.1842, "step": 870 }, { "epoch": 0.788235294117647, "grad_norm": 3.4232655903904696, "learning_rate": 8.39117558674617e-06, "loss": 0.1887, "step": 871 }, { "epoch": 0.7891402714932126, "grad_norm": 6.22085450713435, "learning_rate": 8.387692037961512e-06, "loss": 0.278, "step": 872 }, { "epoch": 0.7900452488687782, "grad_norm": 3.3503310784242037, "learning_rate": 8.384205446634934e-06, "loss": 0.1368, "step": 873 }, { "epoch": 0.7909502262443439, "grad_norm": 2.8126105189437665, "learning_rate": 8.3807158158978e-06, "loss": 0.1605, "step": 874 }, { "epoch": 0.7918552036199095, "grad_norm": 3.8653349943749027, "learning_rate": 8.377223148884202e-06, "loss": 0.2816, "step": 875 }, { "epoch": 0.7927601809954751, "grad_norm": 2.6612658412199277, "learning_rate": 8.373727448730965e-06, "loss": 0.133, "step": 876 }, { "epoch": 0.7936651583710407, "grad_norm": 2.552652607811253, "learning_rate": 8.370228718577636e-06, "loss": 0.1337, "step": 877 }, { "epoch": 0.7945701357466063, "grad_norm": 2.346828304513426, "learning_rate": 8.366726961566478e-06, "loss": 0.1223, "step": 878 }, { "epoch": 0.7954751131221719, "grad_norm": 2.523538685390395, "learning_rate": 8.363222180842478e-06, "loss": 0.1412, "step": 879 }, { "epoch": 0.7963800904977375, "grad_norm": 2.5366119438454744, "learning_rate": 8.359714379553338e-06, "loss": 0.1506, "step": 880 }, { "epoch": 0.7972850678733032, "grad_norm": 2.409616223864393, "learning_rate": 8.356203560849474e-06, "loss": 0.1427, "step": 881 }, { "epoch": 0.7981900452488688, "grad_norm": 2.978823091396137, "learning_rate": 8.352689727884006e-06, "loss": 0.1493, "step": 882 }, { "epoch": 0.7990950226244344, "grad_norm": 2.5686906609081963, "learning_rate": 8.349172883812766e-06, "loss": 0.1552, "step": 883 }, { "epoch": 0.8, "grad_norm": 2.666003426578484, "learning_rate": 8.345653031794292e-06, "loss": 0.1424, "step": 884 }, { "epoch": 0.8009049773755657, "grad_norm": 2.4764329580402045, "learning_rate": 8.342130174989819e-06, "loss": 0.1265, "step": 885 }, { "epoch": 0.8018099547511313, "grad_norm": 2.5367183070021966, "learning_rate": 8.338604316563283e-06, "loss": 0.1411, "step": 886 }, { "epoch": 0.8027149321266969, "grad_norm": 2.1236161207013398, "learning_rate": 8.335075459681314e-06, "loss": 0.1265, "step": 887 }, { "epoch": 0.8036199095022625, "grad_norm": 2.116357980926334, "learning_rate": 8.33154360751324e-06, "loss": 0.1201, "step": 888 }, { "epoch": 0.8045248868778281, "grad_norm": 3.2108542196234615, "learning_rate": 8.328008763231073e-06, "loss": 0.1705, "step": 889 }, { "epoch": 0.8054298642533937, "grad_norm": 3.314086942685075, "learning_rate": 8.324470930009514e-06, "loss": 0.2017, "step": 890 }, { "epoch": 0.8063348416289593, "grad_norm": 3.1234546919606645, "learning_rate": 8.32093011102595e-06, "loss": 0.1245, "step": 891 }, { "epoch": 0.8072398190045249, "grad_norm": 2.2399919697905895, "learning_rate": 8.31738630946045e-06, "loss": 0.1107, "step": 892 }, { "epoch": 0.8081447963800905, "grad_norm": 2.177698516226183, "learning_rate": 8.31383952849576e-06, "loss": 0.1369, "step": 893 }, { "epoch": 0.8090497737556561, "grad_norm": 2.5570074089034485, "learning_rate": 8.3102897713173e-06, "loss": 0.1682, "step": 894 }, { "epoch": 0.8099547511312217, "grad_norm": 2.6324304442272983, "learning_rate": 8.306737041113169e-06, "loss": 0.1421, "step": 895 }, { "epoch": 0.8108597285067873, "grad_norm": 2.4700284634224348, "learning_rate": 8.303181341074128e-06, "loss": 0.149, "step": 896 }, { "epoch": 0.8117647058823529, "grad_norm": 3.447394252752322, "learning_rate": 8.299622674393615e-06, "loss": 0.2257, "step": 897 }, { "epoch": 0.8126696832579186, "grad_norm": 2.173782590512821, "learning_rate": 8.29606104426772e-06, "loss": 0.1663, "step": 898 }, { "epoch": 0.8135746606334842, "grad_norm": 2.391161482501662, "learning_rate": 8.292496453895212e-06, "loss": 0.1634, "step": 899 }, { "epoch": 0.8144796380090498, "grad_norm": 2.4824893330967903, "learning_rate": 8.288928906477497e-06, "loss": 0.1216, "step": 900 }, { "epoch": 0.8153846153846154, "grad_norm": 2.592840762496073, "learning_rate": 8.285358405218655e-06, "loss": 0.1391, "step": 901 }, { "epoch": 0.816289592760181, "grad_norm": 2.465552469772489, "learning_rate": 8.28178495332541e-06, "loss": 0.134, "step": 902 }, { "epoch": 0.8171945701357466, "grad_norm": 3.3944519893415346, "learning_rate": 8.278208554007137e-06, "loss": 0.1757, "step": 903 }, { "epoch": 0.8180995475113122, "grad_norm": 2.8639546123987922, "learning_rate": 8.274629210475859e-06, "loss": 0.2146, "step": 904 }, { "epoch": 0.8190045248868778, "grad_norm": 2.450416443441049, "learning_rate": 8.271046925946247e-06, "loss": 0.1702, "step": 905 }, { "epoch": 0.8199095022624434, "grad_norm": 2.871298042163001, "learning_rate": 8.267461703635604e-06, "loss": 0.1675, "step": 906 }, { "epoch": 0.820814479638009, "grad_norm": 2.855852536123597, "learning_rate": 8.26387354676388e-06, "loss": 0.2032, "step": 907 }, { "epoch": 0.8217194570135746, "grad_norm": 1.9229881166504925, "learning_rate": 8.26028245855366e-06, "loss": 0.1177, "step": 908 }, { "epoch": 0.8226244343891402, "grad_norm": 2.6667930877553956, "learning_rate": 8.256688442230154e-06, "loss": 0.1233, "step": 909 }, { "epoch": 0.8235294117647058, "grad_norm": 3.3501426681558115, "learning_rate": 8.25309150102121e-06, "loss": 0.142, "step": 910 }, { "epoch": 0.8244343891402715, "grad_norm": 2.604612025146735, "learning_rate": 8.249491638157302e-06, "loss": 0.1436, "step": 911 }, { "epoch": 0.8253393665158371, "grad_norm": 3.351738837015411, "learning_rate": 8.245888856871525e-06, "loss": 0.1848, "step": 912 }, { "epoch": 0.8262443438914027, "grad_norm": 3.6307593174463104, "learning_rate": 8.242283160399593e-06, "loss": 0.219, "step": 913 }, { "epoch": 0.8271493212669683, "grad_norm": 2.3507223496128202, "learning_rate": 8.238674551979844e-06, "loss": 0.1289, "step": 914 }, { "epoch": 0.8280542986425339, "grad_norm": 3.3575911127282523, "learning_rate": 8.235063034853228e-06, "loss": 0.2503, "step": 915 }, { "epoch": 0.8289592760180996, "grad_norm": 1.6156864942987101, "learning_rate": 8.231448612263309e-06, "loss": 0.0974, "step": 916 }, { "epoch": 0.8298642533936652, "grad_norm": 1.9787795777460282, "learning_rate": 8.227831287456258e-06, "loss": 0.1162, "step": 917 }, { "epoch": 0.8307692307692308, "grad_norm": 2.126972260614509, "learning_rate": 8.224211063680854e-06, "loss": 0.1331, "step": 918 }, { "epoch": 0.8316742081447964, "grad_norm": 3.218438590100773, "learning_rate": 8.22058794418848e-06, "loss": 0.1789, "step": 919 }, { "epoch": 0.832579185520362, "grad_norm": 3.145769554760846, "learning_rate": 8.216961932233118e-06, "loss": 0.1635, "step": 920 }, { "epoch": 0.8334841628959276, "grad_norm": 2.9482834359929244, "learning_rate": 8.21333303107135e-06, "loss": 0.1382, "step": 921 }, { "epoch": 0.8343891402714932, "grad_norm": 2.961812830716214, "learning_rate": 8.209701243962353e-06, "loss": 0.1768, "step": 922 }, { "epoch": 0.8352941176470589, "grad_norm": 2.013193792309923, "learning_rate": 8.206066574167893e-06, "loss": 0.1102, "step": 923 }, { "epoch": 0.8361990950226245, "grad_norm": 2.923765440770551, "learning_rate": 8.202429024952326e-06, "loss": 0.1797, "step": 924 }, { "epoch": 0.8371040723981901, "grad_norm": 1.9422230333603063, "learning_rate": 8.198788599582596e-06, "loss": 0.0981, "step": 925 }, { "epoch": 0.8380090497737557, "grad_norm": 3.411111620651561, "learning_rate": 8.19514530132823e-06, "loss": 0.26, "step": 926 }, { "epoch": 0.8389140271493213, "grad_norm": 2.4600373715656128, "learning_rate": 8.191499133461332e-06, "loss": 0.1421, "step": 927 }, { "epoch": 0.8398190045248869, "grad_norm": 2.0017850686746295, "learning_rate": 8.187850099256586e-06, "loss": 0.1233, "step": 928 }, { "epoch": 0.8407239819004525, "grad_norm": 1.8375044796002489, "learning_rate": 8.18419820199125e-06, "loss": 0.0916, "step": 929 }, { "epoch": 0.8416289592760181, "grad_norm": 1.8603559141746167, "learning_rate": 8.180543444945154e-06, "loss": 0.116, "step": 930 }, { "epoch": 0.8425339366515837, "grad_norm": 2.7272276956936676, "learning_rate": 8.176885831400692e-06, "loss": 0.1307, "step": 931 }, { "epoch": 0.8434389140271493, "grad_norm": 2.232427369481157, "learning_rate": 8.17322536464283e-06, "loss": 0.1303, "step": 932 }, { "epoch": 0.8443438914027149, "grad_norm": 3.3249997724592766, "learning_rate": 8.169562047959093e-06, "loss": 0.172, "step": 933 }, { "epoch": 0.8452488687782805, "grad_norm": 2.3788139153518943, "learning_rate": 8.165895884639564e-06, "loss": 0.1146, "step": 934 }, { "epoch": 0.8461538461538461, "grad_norm": 2.1530627777654905, "learning_rate": 8.162226877976886e-06, "loss": 0.1227, "step": 935 }, { "epoch": 0.8470588235294118, "grad_norm": 2.56573933658478, "learning_rate": 8.158555031266255e-06, "loss": 0.1395, "step": 936 }, { "epoch": 0.8479638009049774, "grad_norm": 2.021283521776496, "learning_rate": 8.154880347805417e-06, "loss": 0.119, "step": 937 }, { "epoch": 0.848868778280543, "grad_norm": 2.899911146217443, "learning_rate": 8.151202830894662e-06, "loss": 0.1844, "step": 938 }, { "epoch": 0.8497737556561086, "grad_norm": 2.710870029426209, "learning_rate": 8.147522483836832e-06, "loss": 0.1801, "step": 939 }, { "epoch": 0.8506787330316742, "grad_norm": 3.0132900127025612, "learning_rate": 8.143839309937307e-06, "loss": 0.1629, "step": 940 }, { "epoch": 0.8515837104072398, "grad_norm": 3.7702267736002, "learning_rate": 8.140153312504004e-06, "loss": 0.2423, "step": 941 }, { "epoch": 0.8524886877828054, "grad_norm": 3.879430834922516, "learning_rate": 8.136464494847382e-06, "loss": 0.1808, "step": 942 }, { "epoch": 0.853393665158371, "grad_norm": 2.325009222072507, "learning_rate": 8.132772860280422e-06, "loss": 0.1156, "step": 943 }, { "epoch": 0.8542986425339366, "grad_norm": 2.526820471994312, "learning_rate": 8.129078412118649e-06, "loss": 0.1078, "step": 944 }, { "epoch": 0.8552036199095022, "grad_norm": 2.17498718824531, "learning_rate": 8.125381153680103e-06, "loss": 0.1041, "step": 945 }, { "epoch": 0.8561085972850678, "grad_norm": 2.787780386924259, "learning_rate": 8.121681088285353e-06, "loss": 0.1447, "step": 946 }, { "epoch": 0.8570135746606334, "grad_norm": 2.18017295123774, "learning_rate": 8.117978219257491e-06, "loss": 0.1448, "step": 947 }, { "epoch": 0.857918552036199, "grad_norm": 2.606638267514815, "learning_rate": 8.114272549922122e-06, "loss": 0.1706, "step": 948 }, { "epoch": 0.8588235294117647, "grad_norm": 1.9142895757257528, "learning_rate": 8.110564083607371e-06, "loss": 0.0987, "step": 949 }, { "epoch": 0.8597285067873304, "grad_norm": 1.8162127557376495, "learning_rate": 8.10685282364387e-06, "loss": 0.093, "step": 950 }, { "epoch": 0.860633484162896, "grad_norm": 1.9777779413057226, "learning_rate": 8.103138773364763e-06, "loss": 0.1021, "step": 951 }, { "epoch": 0.8615384615384616, "grad_norm": 3.3454572769139586, "learning_rate": 8.099421936105702e-06, "loss": 0.1866, "step": 952 }, { "epoch": 0.8624434389140272, "grad_norm": 3.113356330823321, "learning_rate": 8.095702315204837e-06, "loss": 0.1717, "step": 953 }, { "epoch": 0.8633484162895928, "grad_norm": 2.4120290658212262, "learning_rate": 8.091979914002824e-06, "loss": 0.1472, "step": 954 }, { "epoch": 0.8642533936651584, "grad_norm": 3.158195345440952, "learning_rate": 8.088254735842808e-06, "loss": 0.2208, "step": 955 }, { "epoch": 0.865158371040724, "grad_norm": 2.1408668620377482, "learning_rate": 8.084526784070435e-06, "loss": 0.1142, "step": 956 }, { "epoch": 0.8660633484162896, "grad_norm": 3.499266010637949, "learning_rate": 8.080796062033842e-06, "loss": 0.2179, "step": 957 }, { "epoch": 0.8669683257918552, "grad_norm": 2.133043065231571, "learning_rate": 8.077062573083648e-06, "loss": 0.1034, "step": 958 }, { "epoch": 0.8678733031674208, "grad_norm": 2.6980562549474243, "learning_rate": 8.073326320572964e-06, "loss": 0.1921, "step": 959 }, { "epoch": 0.8687782805429864, "grad_norm": 2.460025655601907, "learning_rate": 8.069587307857377e-06, "loss": 0.1578, "step": 960 }, { "epoch": 0.869683257918552, "grad_norm": 3.2010500627200806, "learning_rate": 8.065845538294957e-06, "loss": 0.1352, "step": 961 }, { "epoch": 0.8705882352941177, "grad_norm": 2.5410074349996568, "learning_rate": 8.06210101524625e-06, "loss": 0.1457, "step": 962 }, { "epoch": 0.8714932126696833, "grad_norm": 2.198868338526924, "learning_rate": 8.058353742074275e-06, "loss": 0.1312, "step": 963 }, { "epoch": 0.8723981900452489, "grad_norm": 2.1766721590423157, "learning_rate": 8.054603722144514e-06, "loss": 0.1172, "step": 964 }, { "epoch": 0.8733031674208145, "grad_norm": 2.5038318405389823, "learning_rate": 8.050850958824926e-06, "loss": 0.1673, "step": 965 }, { "epoch": 0.8742081447963801, "grad_norm": 2.5770067494632225, "learning_rate": 8.047095455485927e-06, "loss": 0.16, "step": 966 }, { "epoch": 0.8751131221719457, "grad_norm": 2.2636997335645908, "learning_rate": 8.0433372155004e-06, "loss": 0.104, "step": 967 }, { "epoch": 0.8760180995475113, "grad_norm": 3.2191264988866832, "learning_rate": 8.039576242243679e-06, "loss": 0.2318, "step": 968 }, { "epoch": 0.8769230769230769, "grad_norm": 2.465840666065054, "learning_rate": 8.035812539093557e-06, "loss": 0.1524, "step": 969 }, { "epoch": 0.8778280542986425, "grad_norm": 2.676368494623283, "learning_rate": 8.032046109430276e-06, "loss": 0.1848, "step": 970 }, { "epoch": 0.8787330316742081, "grad_norm": 2.199174951334976, "learning_rate": 8.028276956636532e-06, "loss": 0.1214, "step": 971 }, { "epoch": 0.8796380090497737, "grad_norm": 2.264297919208008, "learning_rate": 8.02450508409746e-06, "loss": 0.141, "step": 972 }, { "epoch": 0.8805429864253393, "grad_norm": 2.920053143671988, "learning_rate": 8.020730495200641e-06, "loss": 0.1404, "step": 973 }, { "epoch": 0.881447963800905, "grad_norm": 3.022056601784027, "learning_rate": 8.016953193336096e-06, "loss": 0.2244, "step": 974 }, { "epoch": 0.8823529411764706, "grad_norm": 2.309800569674523, "learning_rate": 8.013173181896283e-06, "loss": 0.1196, "step": 975 }, { "epoch": 0.8832579185520362, "grad_norm": 3.1602228280380937, "learning_rate": 8.00939046427609e-06, "loss": 0.1463, "step": 976 }, { "epoch": 0.8841628959276018, "grad_norm": 2.5277652596649416, "learning_rate": 8.00560504387284e-06, "loss": 0.1315, "step": 977 }, { "epoch": 0.8850678733031674, "grad_norm": 2.1564157602938185, "learning_rate": 8.001816924086281e-06, "loss": 0.1093, "step": 978 }, { "epoch": 0.885972850678733, "grad_norm": 3.090782506065567, "learning_rate": 7.998026108318583e-06, "loss": 0.1651, "step": 979 }, { "epoch": 0.8868778280542986, "grad_norm": 2.0345926592428554, "learning_rate": 7.994232599974346e-06, "loss": 0.1179, "step": 980 }, { "epoch": 0.8877828054298642, "grad_norm": 2.888979046271271, "learning_rate": 7.990436402460575e-06, "loss": 0.1828, "step": 981 }, { "epoch": 0.8886877828054298, "grad_norm": 2.573790217098038, "learning_rate": 7.986637519186702e-06, "loss": 0.1755, "step": 982 }, { "epoch": 0.8895927601809954, "grad_norm": 2.379329889260902, "learning_rate": 7.982835953564567e-06, "loss": 0.1477, "step": 983 }, { "epoch": 0.890497737556561, "grad_norm": 2.935176875822916, "learning_rate": 7.979031709008416e-06, "loss": 0.159, "step": 984 }, { "epoch": 0.8914027149321267, "grad_norm": 3.5206752473456233, "learning_rate": 7.975224788934903e-06, "loss": 0.1288, "step": 985 }, { "epoch": 0.8923076923076924, "grad_norm": 2.5887680363955266, "learning_rate": 7.971415196763088e-06, "loss": 0.1492, "step": 986 }, { "epoch": 0.893212669683258, "grad_norm": 2.6508790366416304, "learning_rate": 7.967602935914427e-06, "loss": 0.1811, "step": 987 }, { "epoch": 0.8941176470588236, "grad_norm": 2.9304046938884474, "learning_rate": 7.963788009812775e-06, "loss": 0.1657, "step": 988 }, { "epoch": 0.8950226244343892, "grad_norm": 2.479361336159627, "learning_rate": 7.95997042188438e-06, "loss": 0.1225, "step": 989 }, { "epoch": 0.8959276018099548, "grad_norm": 2.9359786981890283, "learning_rate": 7.95615017555788e-06, "loss": 0.1465, "step": 990 }, { "epoch": 0.8968325791855204, "grad_norm": 4.3084103613673355, "learning_rate": 7.9523272742643e-06, "loss": 0.2446, "step": 991 }, { "epoch": 0.897737556561086, "grad_norm": 2.2839963348552628, "learning_rate": 7.948501721437051e-06, "loss": 0.1346, "step": 992 }, { "epoch": 0.8986425339366516, "grad_norm": 2.5190747529232347, "learning_rate": 7.944673520511926e-06, "loss": 0.1477, "step": 993 }, { "epoch": 0.8995475113122172, "grad_norm": 3.080866633615977, "learning_rate": 7.940842674927094e-06, "loss": 0.1637, "step": 994 }, { "epoch": 0.9004524886877828, "grad_norm": 2.5123783357479774, "learning_rate": 7.937009188123102e-06, "loss": 0.1774, "step": 995 }, { "epoch": 0.9013574660633484, "grad_norm": 2.088481259167524, "learning_rate": 7.933173063542867e-06, "loss": 0.1287, "step": 996 }, { "epoch": 0.902262443438914, "grad_norm": 2.2543068981798178, "learning_rate": 7.929334304631673e-06, "loss": 0.126, "step": 997 }, { "epoch": 0.9031674208144796, "grad_norm": 2.14576262935664, "learning_rate": 7.925492914837176e-06, "loss": 0.1315, "step": 998 }, { "epoch": 0.9040723981900453, "grad_norm": 2.6965814827740346, "learning_rate": 7.921648897609388e-06, "loss": 0.1747, "step": 999 }, { "epoch": 0.9049773755656109, "grad_norm": 2.6833644671613577, "learning_rate": 7.917802256400688e-06, "loss": 0.1768, "step": 1000 }, { "epoch": 0.9058823529411765, "grad_norm": 2.777424127662366, "learning_rate": 7.913952994665805e-06, "loss": 0.1062, "step": 1001 }, { "epoch": 0.9067873303167421, "grad_norm": 2.5425033983140657, "learning_rate": 7.910101115861825e-06, "loss": 0.1711, "step": 1002 }, { "epoch": 0.9076923076923077, "grad_norm": 3.7388531952066755, "learning_rate": 7.906246623448184e-06, "loss": 0.2294, "step": 1003 }, { "epoch": 0.9085972850678733, "grad_norm": 2.143823677234635, "learning_rate": 7.902389520886664e-06, "loss": 0.1479, "step": 1004 }, { "epoch": 0.9095022624434389, "grad_norm": 2.4191398935247936, "learning_rate": 7.898529811641393e-06, "loss": 0.139, "step": 1005 }, { "epoch": 0.9104072398190045, "grad_norm": 3.226533514025441, "learning_rate": 7.89466749917884e-06, "loss": 0.2411, "step": 1006 }, { "epoch": 0.9113122171945701, "grad_norm": 1.4644914570332404, "learning_rate": 7.89080258696781e-06, "loss": 0.0908, "step": 1007 }, { "epoch": 0.9122171945701357, "grad_norm": 2.3597812329360166, "learning_rate": 7.886935078479445e-06, "loss": 0.123, "step": 1008 }, { "epoch": 0.9131221719457013, "grad_norm": 3.173985516136146, "learning_rate": 7.883064977187217e-06, "loss": 0.2148, "step": 1009 }, { "epoch": 0.9140271493212669, "grad_norm": 2.6051434496722106, "learning_rate": 7.879192286566929e-06, "loss": 0.1539, "step": 1010 }, { "epoch": 0.9149321266968325, "grad_norm": 2.6551080063540153, "learning_rate": 7.875317010096706e-06, "loss": 0.177, "step": 1011 }, { "epoch": 0.9158371040723982, "grad_norm": 3.1450290527006373, "learning_rate": 7.871439151257e-06, "loss": 0.2016, "step": 1012 }, { "epoch": 0.9167420814479638, "grad_norm": 2.81108117159095, "learning_rate": 7.86755871353058e-06, "loss": 0.1621, "step": 1013 }, { "epoch": 0.9176470588235294, "grad_norm": 2.24820738613363, "learning_rate": 7.863675700402527e-06, "loss": 0.1167, "step": 1014 }, { "epoch": 0.918552036199095, "grad_norm": 2.2059435433112515, "learning_rate": 7.859790115360243e-06, "loss": 0.1163, "step": 1015 }, { "epoch": 0.9194570135746606, "grad_norm": 2.2138831495399143, "learning_rate": 7.855901961893434e-06, "loss": 0.1216, "step": 1016 }, { "epoch": 0.9203619909502262, "grad_norm": 2.5523426614351297, "learning_rate": 7.852011243494116e-06, "loss": 0.1515, "step": 1017 }, { "epoch": 0.9212669683257918, "grad_norm": 3.6199425707797057, "learning_rate": 7.848117963656606e-06, "loss": 0.1333, "step": 1018 }, { "epoch": 0.9221719457013575, "grad_norm": 3.875863697460923, "learning_rate": 7.844222125877521e-06, "loss": 0.1969, "step": 1019 }, { "epoch": 0.9230769230769231, "grad_norm": 2.718785733106217, "learning_rate": 7.84032373365578e-06, "loss": 0.1403, "step": 1020 }, { "epoch": 0.9239819004524887, "grad_norm": 2.2238172094221467, "learning_rate": 7.836422790492592e-06, "loss": 0.1421, "step": 1021 }, { "epoch": 0.9248868778280543, "grad_norm": 2.7540414543158036, "learning_rate": 7.832519299891456e-06, "loss": 0.203, "step": 1022 }, { "epoch": 0.92579185520362, "grad_norm": 2.363327644148614, "learning_rate": 7.828613265358167e-06, "loss": 0.1245, "step": 1023 }, { "epoch": 0.9266968325791856, "grad_norm": 2.0683993897210775, "learning_rate": 7.824704690400793e-06, "loss": 0.1477, "step": 1024 }, { "epoch": 0.9276018099547512, "grad_norm": 2.2034987918749054, "learning_rate": 7.82079357852969e-06, "loss": 0.1388, "step": 1025 }, { "epoch": 0.9285067873303168, "grad_norm": 2.646367085837996, "learning_rate": 7.816879933257495e-06, "loss": 0.176, "step": 1026 }, { "epoch": 0.9294117647058824, "grad_norm": 2.322686915254514, "learning_rate": 7.812963758099118e-06, "loss": 0.1333, "step": 1027 }, { "epoch": 0.930316742081448, "grad_norm": 1.8691363839169952, "learning_rate": 7.809045056571734e-06, "loss": 0.1023, "step": 1028 }, { "epoch": 0.9312217194570136, "grad_norm": 2.5491265204447093, "learning_rate": 7.805123832194797e-06, "loss": 0.1325, "step": 1029 }, { "epoch": 0.9321266968325792, "grad_norm": 1.6709410101661162, "learning_rate": 7.801200088490026e-06, "loss": 0.1002, "step": 1030 }, { "epoch": 0.9330316742081448, "grad_norm": 2.8641775440180974, "learning_rate": 7.797273828981395e-06, "loss": 0.1695, "step": 1031 }, { "epoch": 0.9339366515837104, "grad_norm": 2.297731438324608, "learning_rate": 7.793345057195143e-06, "loss": 0.1009, "step": 1032 }, { "epoch": 0.934841628959276, "grad_norm": 2.1151715883394426, "learning_rate": 7.789413776659767e-06, "loss": 0.1442, "step": 1033 }, { "epoch": 0.9357466063348416, "grad_norm": 2.006828630842273, "learning_rate": 7.785479990906014e-06, "loss": 0.0962, "step": 1034 }, { "epoch": 0.9366515837104072, "grad_norm": 2.143698827706719, "learning_rate": 7.781543703466881e-06, "loss": 0.0941, "step": 1035 }, { "epoch": 0.9375565610859729, "grad_norm": 2.392004861198157, "learning_rate": 7.777604917877611e-06, "loss": 0.1254, "step": 1036 }, { "epoch": 0.9384615384615385, "grad_norm": 3.3296033201816155, "learning_rate": 7.773663637675695e-06, "loss": 0.2009, "step": 1037 }, { "epoch": 0.9393665158371041, "grad_norm": 2.09751743193068, "learning_rate": 7.76971986640086e-06, "loss": 0.1009, "step": 1038 }, { "epoch": 0.9402714932126697, "grad_norm": 2.133681723957763, "learning_rate": 7.76577360759507e-06, "loss": 0.1422, "step": 1039 }, { "epoch": 0.9411764705882353, "grad_norm": 2.548475767931052, "learning_rate": 7.76182486480253e-06, "loss": 0.1239, "step": 1040 }, { "epoch": 0.9420814479638009, "grad_norm": 2.4305168130044508, "learning_rate": 7.757873641569666e-06, "loss": 0.1382, "step": 1041 }, { "epoch": 0.9429864253393665, "grad_norm": 2.046516956333466, "learning_rate": 7.753919941445138e-06, "loss": 0.1153, "step": 1042 }, { "epoch": 0.9438914027149321, "grad_norm": 2.5389200440198723, "learning_rate": 7.74996376797983e-06, "loss": 0.1733, "step": 1043 }, { "epoch": 0.9447963800904977, "grad_norm": 3.6144766751792723, "learning_rate": 7.746005124726847e-06, "loss": 0.163, "step": 1044 }, { "epoch": 0.9457013574660633, "grad_norm": 2.556649284684414, "learning_rate": 7.742044015241508e-06, "loss": 0.1375, "step": 1045 }, { "epoch": 0.9466063348416289, "grad_norm": 2.603276387385148, "learning_rate": 7.738080443081356e-06, "loss": 0.1475, "step": 1046 }, { "epoch": 0.9475113122171945, "grad_norm": 3.084299357325151, "learning_rate": 7.734114411806134e-06, "loss": 0.1442, "step": 1047 }, { "epoch": 0.9484162895927601, "grad_norm": 2.634879851113872, "learning_rate": 7.730145924977804e-06, "loss": 0.1374, "step": 1048 }, { "epoch": 0.9493212669683257, "grad_norm": 2.9024500422774846, "learning_rate": 7.72617498616053e-06, "loss": 0.1449, "step": 1049 }, { "epoch": 0.9502262443438914, "grad_norm": 2.592245307533752, "learning_rate": 7.722201598920673e-06, "loss": 0.1309, "step": 1050 }, { "epoch": 0.951131221719457, "grad_norm": 2.0636175682577322, "learning_rate": 7.718225766826803e-06, "loss": 0.1202, "step": 1051 }, { "epoch": 0.9520361990950226, "grad_norm": 3.6135878744990335, "learning_rate": 7.714247493449677e-06, "loss": 0.2228, "step": 1052 }, { "epoch": 0.9529411764705882, "grad_norm": 2.5786692055838913, "learning_rate": 7.710266782362248e-06, "loss": 0.1136, "step": 1053 }, { "epoch": 0.9538461538461539, "grad_norm": 2.3950640838913606, "learning_rate": 7.706283637139658e-06, "loss": 0.1214, "step": 1054 }, { "epoch": 0.9547511312217195, "grad_norm": 2.2042865184971734, "learning_rate": 7.702298061359236e-06, "loss": 0.1196, "step": 1055 }, { "epoch": 0.9556561085972851, "grad_norm": 2.1097671747159112, "learning_rate": 7.698310058600492e-06, "loss": 0.1401, "step": 1056 }, { "epoch": 0.9565610859728507, "grad_norm": 2.396754075753726, "learning_rate": 7.694319632445118e-06, "loss": 0.1171, "step": 1057 }, { "epoch": 0.9574660633484163, "grad_norm": 2.288318014816315, "learning_rate": 7.69032678647698e-06, "loss": 0.1238, "step": 1058 }, { "epoch": 0.9583710407239819, "grad_norm": 1.9361752848491243, "learning_rate": 7.68633152428212e-06, "loss": 0.1165, "step": 1059 }, { "epoch": 0.9592760180995475, "grad_norm": 2.0880584333279684, "learning_rate": 7.682333849448749e-06, "loss": 0.1036, "step": 1060 }, { "epoch": 0.9601809954751132, "grad_norm": 2.41470577970656, "learning_rate": 7.678333765567244e-06, "loss": 0.1217, "step": 1061 }, { "epoch": 0.9610859728506788, "grad_norm": 2.4495393419736105, "learning_rate": 7.674331276230143e-06, "loss": 0.1178, "step": 1062 }, { "epoch": 0.9619909502262444, "grad_norm": 3.6502848466205178, "learning_rate": 7.670326385032151e-06, "loss": 0.1342, "step": 1063 }, { "epoch": 0.96289592760181, "grad_norm": 2.9471646718822115, "learning_rate": 7.666319095570128e-06, "loss": 0.1542, "step": 1064 }, { "epoch": 0.9638009049773756, "grad_norm": 2.0736497448424123, "learning_rate": 7.662309411443084e-06, "loss": 0.1279, "step": 1065 }, { "epoch": 0.9647058823529412, "grad_norm": 2.4835810933064977, "learning_rate": 7.658297336252181e-06, "loss": 0.1204, "step": 1066 }, { "epoch": 0.9656108597285068, "grad_norm": 2.3695100853076205, "learning_rate": 7.654282873600732e-06, "loss": 0.1193, "step": 1067 }, { "epoch": 0.9665158371040724, "grad_norm": 3.0514237267346433, "learning_rate": 7.650266027094191e-06, "loss": 0.2021, "step": 1068 }, { "epoch": 0.967420814479638, "grad_norm": 2.397406472873698, "learning_rate": 7.646246800340155e-06, "loss": 0.1402, "step": 1069 }, { "epoch": 0.9683257918552036, "grad_norm": 3.280091941229361, "learning_rate": 7.642225196948357e-06, "loss": 0.2331, "step": 1070 }, { "epoch": 0.9692307692307692, "grad_norm": 2.9252144197632415, "learning_rate": 7.638201220530664e-06, "loss": 0.1322, "step": 1071 }, { "epoch": 0.9701357466063348, "grad_norm": 2.067612950485954, "learning_rate": 7.634174874701076e-06, "loss": 0.1071, "step": 1072 }, { "epoch": 0.9710407239819004, "grad_norm": 2.6580416129454476, "learning_rate": 7.630146163075722e-06, "loss": 0.1471, "step": 1073 }, { "epoch": 0.971945701357466, "grad_norm": 2.0988839877937218, "learning_rate": 7.626115089272852e-06, "loss": 0.103, "step": 1074 }, { "epoch": 0.9728506787330317, "grad_norm": 2.3973172623023737, "learning_rate": 7.622081656912842e-06, "loss": 0.1282, "step": 1075 }, { "epoch": 0.9737556561085973, "grad_norm": 2.457295314687229, "learning_rate": 7.618045869618182e-06, "loss": 0.1349, "step": 1076 }, { "epoch": 0.9746606334841629, "grad_norm": 1.8257257751630962, "learning_rate": 7.614007731013478e-06, "loss": 0.0831, "step": 1077 }, { "epoch": 0.9755656108597285, "grad_norm": 2.4025534945079774, "learning_rate": 7.60996724472545e-06, "loss": 0.1165, "step": 1078 }, { "epoch": 0.9764705882352941, "grad_norm": 2.779786635928083, "learning_rate": 7.605924414382926e-06, "loss": 0.148, "step": 1079 }, { "epoch": 0.9773755656108597, "grad_norm": 2.9248672684743147, "learning_rate": 7.601879243616838e-06, "loss": 0.166, "step": 1080 }, { "epoch": 0.9782805429864253, "grad_norm": 3.0669346867165963, "learning_rate": 7.597831736060219e-06, "loss": 0.1747, "step": 1081 }, { "epoch": 0.9791855203619909, "grad_norm": 2.465642129613748, "learning_rate": 7.5937818953482035e-06, "loss": 0.1612, "step": 1082 }, { "epoch": 0.9800904977375565, "grad_norm": 2.764723617181479, "learning_rate": 7.58972972511802e-06, "loss": 0.1829, "step": 1083 }, { "epoch": 0.9809954751131221, "grad_norm": 2.1710299246798703, "learning_rate": 7.585675229008989e-06, "loss": 0.1117, "step": 1084 }, { "epoch": 0.9819004524886877, "grad_norm": 2.6905717695567057, "learning_rate": 7.581618410662519e-06, "loss": 0.1357, "step": 1085 }, { "epoch": 0.9828054298642533, "grad_norm": 1.6903676602950768, "learning_rate": 7.5775592737221075e-06, "loss": 0.0998, "step": 1086 }, { "epoch": 0.983710407239819, "grad_norm": 1.6496467062545457, "learning_rate": 7.57349782183333e-06, "loss": 0.0774, "step": 1087 }, { "epoch": 0.9846153846153847, "grad_norm": 2.0251888333707107, "learning_rate": 7.5694340586438446e-06, "loss": 0.1007, "step": 1088 }, { "epoch": 0.9855203619909503, "grad_norm": 2.8153494305744013, "learning_rate": 7.565367987803382e-06, "loss": 0.1339, "step": 1089 }, { "epoch": 0.9864253393665159, "grad_norm": 2.5840918960104386, "learning_rate": 7.56129961296375e-06, "loss": 0.1666, "step": 1090 }, { "epoch": 0.9873303167420815, "grad_norm": 2.913231622131756, "learning_rate": 7.55722893777882e-06, "loss": 0.2066, "step": 1091 }, { "epoch": 0.9882352941176471, "grad_norm": 1.9742616431763331, "learning_rate": 7.553155965904535e-06, "loss": 0.1237, "step": 1092 }, { "epoch": 0.9891402714932127, "grad_norm": 3.185409699206425, "learning_rate": 7.549080700998898e-06, "loss": 0.1794, "step": 1093 }, { "epoch": 0.9900452488687783, "grad_norm": 2.4665525708682354, "learning_rate": 7.545003146721968e-06, "loss": 0.1262, "step": 1094 }, { "epoch": 0.9909502262443439, "grad_norm": 2.515088154888766, "learning_rate": 7.540923306735868e-06, "loss": 0.1155, "step": 1095 }, { "epoch": 0.9918552036199095, "grad_norm": 2.4946335182368986, "learning_rate": 7.536841184704765e-06, "loss": 0.1687, "step": 1096 }, { "epoch": 0.9927601809954751, "grad_norm": 3.1646117334500783, "learning_rate": 7.5327567842948836e-06, "loss": 0.1547, "step": 1097 }, { "epoch": 0.9936651583710407, "grad_norm": 2.2266583440119265, "learning_rate": 7.52867010917449e-06, "loss": 0.1383, "step": 1098 }, { "epoch": 0.9945701357466064, "grad_norm": 1.518821028032265, "learning_rate": 7.524581163013891e-06, "loss": 0.0795, "step": 1099 }, { "epoch": 0.995475113122172, "grad_norm": 2.171226662616825, "learning_rate": 7.5204899494854415e-06, "loss": 0.1088, "step": 1100 }, { "epoch": 0.9963800904977376, "grad_norm": 3.4793161356958717, "learning_rate": 7.516396472263525e-06, "loss": 0.179, "step": 1101 }, { "epoch": 0.9972850678733032, "grad_norm": 2.752371623137058, "learning_rate": 7.5123007350245604e-06, "loss": 0.1351, "step": 1102 }, { "epoch": 0.9981900452488688, "grad_norm": 3.4149268524218117, "learning_rate": 7.508202741446997e-06, "loss": 0.1644, "step": 1103 }, { "epoch": 0.9990950226244344, "grad_norm": 3.6076376789228624, "learning_rate": 7.504102495211312e-06, "loss": 0.1723, "step": 1104 }, { "epoch": 1.0, "grad_norm": 3.5331575415866205, "learning_rate": 7.500000000000001e-06, "loss": 0.2338, "step": 1105 }, { "epoch": 1.0009049773755656, "grad_norm": 2.1922641346029845, "learning_rate": 7.495895259497586e-06, "loss": 0.1335, "step": 1106 }, { "epoch": 1.0018099547511312, "grad_norm": 2.2016553302552015, "learning_rate": 7.491788277390596e-06, "loss": 0.1617, "step": 1107 }, { "epoch": 1.0027149321266968, "grad_norm": 3.082344494823835, "learning_rate": 7.487679057367585e-06, "loss": 0.1996, "step": 1108 }, { "epoch": 1.0036199095022624, "grad_norm": 1.5771399455454036, "learning_rate": 7.48356760311911e-06, "loss": 0.0925, "step": 1109 }, { "epoch": 1.004524886877828, "grad_norm": 2.8601674238876953, "learning_rate": 7.479453918337733e-06, "loss": 0.2128, "step": 1110 }, { "epoch": 1.0054298642533936, "grad_norm": 1.805011127712613, "learning_rate": 7.475338006718025e-06, "loss": 0.0819, "step": 1111 }, { "epoch": 1.0063348416289593, "grad_norm": 1.8776499824364477, "learning_rate": 7.471219871956551e-06, "loss": 0.0877, "step": 1112 }, { "epoch": 1.0072398190045249, "grad_norm": 1.5652799024473654, "learning_rate": 7.467099517751879e-06, "loss": 0.0797, "step": 1113 }, { "epoch": 1.0081447963800905, "grad_norm": 2.579884484620537, "learning_rate": 7.4629769478045634e-06, "loss": 0.1761, "step": 1114 }, { "epoch": 1.009049773755656, "grad_norm": 1.9328709999539533, "learning_rate": 7.458852165817153e-06, "loss": 0.1099, "step": 1115 }, { "epoch": 1.0099547511312217, "grad_norm": 2.1682505293667536, "learning_rate": 7.454725175494184e-06, "loss": 0.141, "step": 1116 }, { "epoch": 1.0108597285067873, "grad_norm": 2.0774741456179493, "learning_rate": 7.450595980542173e-06, "loss": 0.1093, "step": 1117 }, { "epoch": 1.011764705882353, "grad_norm": 2.3977606585358973, "learning_rate": 7.4464645846696186e-06, "loss": 0.1109, "step": 1118 }, { "epoch": 1.0126696832579185, "grad_norm": 1.7383542025365062, "learning_rate": 7.442330991586995e-06, "loss": 0.1099, "step": 1119 }, { "epoch": 1.0135746606334841, "grad_norm": 1.9172582001008367, "learning_rate": 7.438195205006749e-06, "loss": 0.0945, "step": 1120 }, { "epoch": 1.0144796380090497, "grad_norm": 2.125579334935607, "learning_rate": 7.4340572286433e-06, "loss": 0.1453, "step": 1121 }, { "epoch": 1.0153846153846153, "grad_norm": 2.0819138785436007, "learning_rate": 7.42991706621303e-06, "loss": 0.1295, "step": 1122 }, { "epoch": 1.016289592760181, "grad_norm": 1.5944010580517736, "learning_rate": 7.425774721434291e-06, "loss": 0.0871, "step": 1123 }, { "epoch": 1.0171945701357465, "grad_norm": 1.693786380164809, "learning_rate": 7.4216301980273895e-06, "loss": 0.082, "step": 1124 }, { "epoch": 1.0180995475113122, "grad_norm": 1.8039502221323722, "learning_rate": 7.417483499714589e-06, "loss": 0.1061, "step": 1125 }, { "epoch": 1.0190045248868778, "grad_norm": 2.0556497180802054, "learning_rate": 7.413334630220107e-06, "loss": 0.1137, "step": 1126 }, { "epoch": 1.0199095022624434, "grad_norm": 1.8491634632754557, "learning_rate": 7.409183593270114e-06, "loss": 0.0919, "step": 1127 }, { "epoch": 1.020814479638009, "grad_norm": 2.6763025621401155, "learning_rate": 7.405030392592723e-06, "loss": 0.1703, "step": 1128 }, { "epoch": 1.0217194570135746, "grad_norm": 1.799467303479428, "learning_rate": 7.400875031917991e-06, "loss": 0.1079, "step": 1129 }, { "epoch": 1.0226244343891402, "grad_norm": 1.3209886921393268, "learning_rate": 7.396717514977916e-06, "loss": 0.08, "step": 1130 }, { "epoch": 1.0235294117647058, "grad_norm": 1.520108327012023, "learning_rate": 7.392557845506433e-06, "loss": 0.1145, "step": 1131 }, { "epoch": 1.0244343891402714, "grad_norm": 1.5566221435089125, "learning_rate": 7.388396027239411e-06, "loss": 0.1132, "step": 1132 }, { "epoch": 1.025339366515837, "grad_norm": 2.0987271097171694, "learning_rate": 7.384232063914645e-06, "loss": 0.1019, "step": 1133 }, { "epoch": 1.0262443438914026, "grad_norm": 1.8226435573827628, "learning_rate": 7.380065959271858e-06, "loss": 0.098, "step": 1134 }, { "epoch": 1.0271493212669682, "grad_norm": 1.907518497764111, "learning_rate": 7.3758977170527e-06, "loss": 0.1213, "step": 1135 }, { "epoch": 1.0280542986425338, "grad_norm": 1.755796956426844, "learning_rate": 7.371727341000737e-06, "loss": 0.1073, "step": 1136 }, { "epoch": 1.0289592760180994, "grad_norm": 2.272793941827577, "learning_rate": 7.367554834861453e-06, "loss": 0.1234, "step": 1137 }, { "epoch": 1.029864253393665, "grad_norm": 2.5990485345628085, "learning_rate": 7.363380202382242e-06, "loss": 0.1602, "step": 1138 }, { "epoch": 1.0307692307692307, "grad_norm": 2.0714376107280623, "learning_rate": 7.35920344731241e-06, "loss": 0.141, "step": 1139 }, { "epoch": 1.0316742081447963, "grad_norm": 1.7734458481040223, "learning_rate": 7.355024573403174e-06, "loss": 0.1202, "step": 1140 }, { "epoch": 1.032579185520362, "grad_norm": 2.2801899970175175, "learning_rate": 7.350843584407645e-06, "loss": 0.1042, "step": 1141 }, { "epoch": 1.0334841628959277, "grad_norm": 1.7379812121986757, "learning_rate": 7.34666048408084e-06, "loss": 0.1163, "step": 1142 }, { "epoch": 1.0343891402714933, "grad_norm": 1.7094951534642764, "learning_rate": 7.342475276179668e-06, "loss": 0.1004, "step": 1143 }, { "epoch": 1.035294117647059, "grad_norm": 1.7604150713670628, "learning_rate": 7.3382879644629345e-06, "loss": 0.1077, "step": 1144 }, { "epoch": 1.0361990950226245, "grad_norm": 1.8044378612282506, "learning_rate": 7.3340985526913335e-06, "loss": 0.102, "step": 1145 }, { "epoch": 1.0371040723981901, "grad_norm": 1.7882839680579186, "learning_rate": 7.329907044627444e-06, "loss": 0.133, "step": 1146 }, { "epoch": 1.0380090497737557, "grad_norm": 1.451963207998006, "learning_rate": 7.325713444035728e-06, "loss": 0.0854, "step": 1147 }, { "epoch": 1.0389140271493214, "grad_norm": 1.321479642027481, "learning_rate": 7.321517754682528e-06, "loss": 0.0728, "step": 1148 }, { "epoch": 1.039819004524887, "grad_norm": 2.03911677231095, "learning_rate": 7.31731998033606e-06, "loss": 0.1033, "step": 1149 }, { "epoch": 1.0407239819004526, "grad_norm": 2.3471915026927896, "learning_rate": 7.313120124766417e-06, "loss": 0.1342, "step": 1150 }, { "epoch": 1.0416289592760182, "grad_norm": 1.7956122861088146, "learning_rate": 7.308918191745554e-06, "loss": 0.108, "step": 1151 }, { "epoch": 1.0425339366515838, "grad_norm": 2.488782092775104, "learning_rate": 7.3047141850473e-06, "loss": 0.1415, "step": 1152 }, { "epoch": 1.0434389140271494, "grad_norm": 2.0677839500676964, "learning_rate": 7.300508108447342e-06, "loss": 0.1497, "step": 1153 }, { "epoch": 1.044343891402715, "grad_norm": 1.6558132651677724, "learning_rate": 7.296299965723224e-06, "loss": 0.0954, "step": 1154 }, { "epoch": 1.0452488687782806, "grad_norm": 1.7757224442955932, "learning_rate": 7.292089760654352e-06, "loss": 0.1149, "step": 1155 }, { "epoch": 1.0461538461538462, "grad_norm": 1.3488517425375925, "learning_rate": 7.287877497021978e-06, "loss": 0.0715, "step": 1156 }, { "epoch": 1.0470588235294118, "grad_norm": 1.8855789440867983, "learning_rate": 7.283663178609204e-06, "loss": 0.1396, "step": 1157 }, { "epoch": 1.0479638009049774, "grad_norm": 2.2870706311056295, "learning_rate": 7.279446809200981e-06, "loss": 0.1804, "step": 1158 }, { "epoch": 1.048868778280543, "grad_norm": 1.4666213422308183, "learning_rate": 7.275228392584099e-06, "loss": 0.0917, "step": 1159 }, { "epoch": 1.0497737556561086, "grad_norm": 2.1676255926413504, "learning_rate": 7.271007932547188e-06, "loss": 0.1118, "step": 1160 }, { "epoch": 1.0506787330316743, "grad_norm": 2.178599450421768, "learning_rate": 7.266785432880711e-06, "loss": 0.13, "step": 1161 }, { "epoch": 1.0515837104072399, "grad_norm": 1.6351216492951335, "learning_rate": 7.262560897376966e-06, "loss": 0.1156, "step": 1162 }, { "epoch": 1.0524886877828055, "grad_norm": 1.7784425562352972, "learning_rate": 7.258334329830075e-06, "loss": 0.0966, "step": 1163 }, { "epoch": 1.053393665158371, "grad_norm": 1.8001539181408508, "learning_rate": 7.2541057340359925e-06, "loss": 0.0957, "step": 1164 }, { "epoch": 1.0542986425339367, "grad_norm": 1.8033936219025621, "learning_rate": 7.249875113792485e-06, "loss": 0.1023, "step": 1165 }, { "epoch": 1.0552036199095023, "grad_norm": 1.7231350233290634, "learning_rate": 7.2456424728991446e-06, "loss": 0.0925, "step": 1166 }, { "epoch": 1.056108597285068, "grad_norm": 2.2030907241153534, "learning_rate": 7.241407815157376e-06, "loss": 0.121, "step": 1167 }, { "epoch": 1.0570135746606335, "grad_norm": 1.8593340798603428, "learning_rate": 7.237171144370395e-06, "loss": 0.1143, "step": 1168 }, { "epoch": 1.0579185520361991, "grad_norm": 1.6286028255588605, "learning_rate": 7.232932464343224e-06, "loss": 0.0857, "step": 1169 }, { "epoch": 1.0588235294117647, "grad_norm": 1.895481993990696, "learning_rate": 7.2286917788826926e-06, "loss": 0.134, "step": 1170 }, { "epoch": 1.0597285067873303, "grad_norm": 1.6788522029199244, "learning_rate": 7.2244490917974274e-06, "loss": 0.0947, "step": 1171 }, { "epoch": 1.060633484162896, "grad_norm": 1.468803942579001, "learning_rate": 7.220204406897859e-06, "loss": 0.1022, "step": 1172 }, { "epoch": 1.0615384615384615, "grad_norm": 3.8993186550569474, "learning_rate": 7.215957727996208e-06, "loss": 0.1862, "step": 1173 }, { "epoch": 1.0624434389140271, "grad_norm": 1.4271094362375814, "learning_rate": 7.211709058906484e-06, "loss": 0.114, "step": 1174 }, { "epoch": 1.0633484162895928, "grad_norm": 1.6621068796164937, "learning_rate": 7.207458403444488e-06, "loss": 0.1152, "step": 1175 }, { "epoch": 1.0642533936651584, "grad_norm": 1.6118410956178248, "learning_rate": 7.203205765427805e-06, "loss": 0.1003, "step": 1176 }, { "epoch": 1.065158371040724, "grad_norm": 1.6291989767257526, "learning_rate": 7.198951148675798e-06, "loss": 0.1209, "step": 1177 }, { "epoch": 1.0660633484162896, "grad_norm": 2.3222582690499602, "learning_rate": 7.1946945570096074e-06, "loss": 0.1335, "step": 1178 }, { "epoch": 1.0669683257918552, "grad_norm": 1.5273055591456077, "learning_rate": 7.1904359942521496e-06, "loss": 0.113, "step": 1179 }, { "epoch": 1.0678733031674208, "grad_norm": 2.175336754500709, "learning_rate": 7.186175464228109e-06, "loss": 0.1124, "step": 1180 }, { "epoch": 1.0687782805429864, "grad_norm": 1.5777273962406695, "learning_rate": 7.181912970763938e-06, "loss": 0.1118, "step": 1181 }, { "epoch": 1.069683257918552, "grad_norm": 2.36606795407044, "learning_rate": 7.177648517687852e-06, "loss": 0.1516, "step": 1182 }, { "epoch": 1.0705882352941176, "grad_norm": 1.755403095945363, "learning_rate": 7.173382108829826e-06, "loss": 0.1073, "step": 1183 }, { "epoch": 1.0714932126696832, "grad_norm": 1.65807149096368, "learning_rate": 7.169113748021591e-06, "loss": 0.1244, "step": 1184 }, { "epoch": 1.0723981900452488, "grad_norm": 1.395870495147806, "learning_rate": 7.1648434390966356e-06, "loss": 0.0799, "step": 1185 }, { "epoch": 1.0733031674208144, "grad_norm": 3.8073988569658463, "learning_rate": 7.1605711858901915e-06, "loss": 0.1268, "step": 1186 }, { "epoch": 1.07420814479638, "grad_norm": 1.7602017277924364, "learning_rate": 7.15629699223924e-06, "loss": 0.1188, "step": 1187 }, { "epoch": 1.0751131221719457, "grad_norm": 1.8602187245547142, "learning_rate": 7.1520208619825045e-06, "loss": 0.1668, "step": 1188 }, { "epoch": 1.0760180995475113, "grad_norm": 1.285970409676873, "learning_rate": 7.147742798960449e-06, "loss": 0.0735, "step": 1189 }, { "epoch": 1.0769230769230769, "grad_norm": 1.8209299742222496, "learning_rate": 7.143462807015271e-06, "loss": 0.1282, "step": 1190 }, { "epoch": 1.0778280542986425, "grad_norm": 1.9967645885189462, "learning_rate": 7.139180889990903e-06, "loss": 0.1204, "step": 1191 }, { "epoch": 1.078733031674208, "grad_norm": 1.6455432490129729, "learning_rate": 7.134897051733005e-06, "loss": 0.0928, "step": 1192 }, { "epoch": 1.0796380090497737, "grad_norm": 2.095534337641482, "learning_rate": 7.13061129608896e-06, "loss": 0.13, "step": 1193 }, { "epoch": 1.0805429864253393, "grad_norm": 1.8568078677235873, "learning_rate": 7.1263236269078785e-06, "loss": 0.1211, "step": 1194 }, { "epoch": 1.081447963800905, "grad_norm": 1.7631781091234453, "learning_rate": 7.122034048040586e-06, "loss": 0.1343, "step": 1195 }, { "epoch": 1.0823529411764705, "grad_norm": 1.7644280723022452, "learning_rate": 7.117742563339622e-06, "loss": 0.1038, "step": 1196 }, { "epoch": 1.0832579185520361, "grad_norm": 1.9887944335745658, "learning_rate": 7.1134491766592415e-06, "loss": 0.1139, "step": 1197 }, { "epoch": 1.0841628959276017, "grad_norm": 2.0530631977093448, "learning_rate": 7.109153891855406e-06, "loss": 0.1629, "step": 1198 }, { "epoch": 1.0850678733031673, "grad_norm": 1.5126602268916487, "learning_rate": 7.104856712785781e-06, "loss": 0.0896, "step": 1199 }, { "epoch": 1.085972850678733, "grad_norm": 1.9753147136104434, "learning_rate": 7.100557643309732e-06, "loss": 0.138, "step": 1200 }, { "epoch": 1.0868778280542986, "grad_norm": 1.6942285097947596, "learning_rate": 7.096256687288325e-06, "loss": 0.109, "step": 1201 }, { "epoch": 1.0877828054298642, "grad_norm": 1.5659315132836802, "learning_rate": 7.09195384858432e-06, "loss": 0.0823, "step": 1202 }, { "epoch": 1.0886877828054298, "grad_norm": 1.7532089737945953, "learning_rate": 7.0876491310621664e-06, "loss": 0.1387, "step": 1203 }, { "epoch": 1.0895927601809954, "grad_norm": 1.9812107782828874, "learning_rate": 7.083342538588003e-06, "loss": 0.1125, "step": 1204 }, { "epoch": 1.090497737556561, "grad_norm": 1.5376849075110535, "learning_rate": 7.079034075029651e-06, "loss": 0.0868, "step": 1205 }, { "epoch": 1.0914027149321266, "grad_norm": 1.9204746975889349, "learning_rate": 7.074723744256609e-06, "loss": 0.1038, "step": 1206 }, { "epoch": 1.0923076923076924, "grad_norm": 1.3754730144639642, "learning_rate": 7.07041155014006e-06, "loss": 0.0994, "step": 1207 }, { "epoch": 1.093212669683258, "grad_norm": 1.8395346014369933, "learning_rate": 7.066097496552856e-06, "loss": 0.1113, "step": 1208 }, { "epoch": 1.0941176470588236, "grad_norm": 1.8912130001727967, "learning_rate": 7.061781587369518e-06, "loss": 0.1316, "step": 1209 }, { "epoch": 1.0950226244343892, "grad_norm": 3.6551752323836384, "learning_rate": 7.057463826466235e-06, "loss": 0.2977, "step": 1210 }, { "epoch": 1.0959276018099549, "grad_norm": 3.146550592744216, "learning_rate": 7.053144217720862e-06, "loss": 0.1841, "step": 1211 }, { "epoch": 1.0968325791855205, "grad_norm": 2.227154692747032, "learning_rate": 7.048822765012906e-06, "loss": 0.1718, "step": 1212 }, { "epoch": 1.097737556561086, "grad_norm": 1.9311488231665879, "learning_rate": 7.044499472223539e-06, "loss": 0.1233, "step": 1213 }, { "epoch": 1.0986425339366517, "grad_norm": 2.394171734282369, "learning_rate": 7.040174343235577e-06, "loss": 0.1235, "step": 1214 }, { "epoch": 1.0995475113122173, "grad_norm": 1.397010589736092, "learning_rate": 7.035847381933494e-06, "loss": 0.0909, "step": 1215 }, { "epoch": 1.100452488687783, "grad_norm": 1.40583623862521, "learning_rate": 7.0315185922034014e-06, "loss": 0.1126, "step": 1216 }, { "epoch": 1.1013574660633485, "grad_norm": 2.091072739539456, "learning_rate": 7.027187977933059e-06, "loss": 0.1214, "step": 1217 }, { "epoch": 1.102262443438914, "grad_norm": 2.1321202534677512, "learning_rate": 7.0228555430118605e-06, "loss": 0.1186, "step": 1218 }, { "epoch": 1.1031674208144797, "grad_norm": 1.8516436690846876, "learning_rate": 7.0185212913308384e-06, "loss": 0.1059, "step": 1219 }, { "epoch": 1.1040723981900453, "grad_norm": 1.935917736309234, "learning_rate": 7.014185226782655e-06, "loss": 0.1152, "step": 1220 }, { "epoch": 1.104977375565611, "grad_norm": 1.6663963186128126, "learning_rate": 7.009847353261601e-06, "loss": 0.1207, "step": 1221 }, { "epoch": 1.1058823529411765, "grad_norm": 1.5681553286161334, "learning_rate": 7.005507674663594e-06, "loss": 0.1244, "step": 1222 }, { "epoch": 1.1067873303167421, "grad_norm": 2.1486025952814223, "learning_rate": 7.001166194886168e-06, "loss": 0.1184, "step": 1223 }, { "epoch": 1.1076923076923078, "grad_norm": 2.0065627405860713, "learning_rate": 6.9968229178284775e-06, "loss": 0.125, "step": 1224 }, { "epoch": 1.1085972850678734, "grad_norm": 1.9793099781472892, "learning_rate": 6.992477847391292e-06, "loss": 0.1327, "step": 1225 }, { "epoch": 1.109502262443439, "grad_norm": 1.8620812371691264, "learning_rate": 6.988130987476992e-06, "loss": 0.1157, "step": 1226 }, { "epoch": 1.1104072398190046, "grad_norm": 1.758142938937307, "learning_rate": 6.9837823419895625e-06, "loss": 0.1452, "step": 1227 }, { "epoch": 1.1113122171945702, "grad_norm": 1.2784395790928247, "learning_rate": 6.979431914834591e-06, "loss": 0.0805, "step": 1228 }, { "epoch": 1.1122171945701358, "grad_norm": 1.546692266139137, "learning_rate": 6.975079709919272e-06, "loss": 0.0971, "step": 1229 }, { "epoch": 1.1131221719457014, "grad_norm": 1.7174011586904154, "learning_rate": 6.970725731152389e-06, "loss": 0.1191, "step": 1230 }, { "epoch": 1.114027149321267, "grad_norm": 2.0575424216591576, "learning_rate": 6.966369982444324e-06, "loss": 0.1272, "step": 1231 }, { "epoch": 1.1149321266968326, "grad_norm": 1.2587754606674457, "learning_rate": 6.962012467707046e-06, "loss": 0.0741, "step": 1232 }, { "epoch": 1.1158371040723982, "grad_norm": 1.3563693049688588, "learning_rate": 6.95765319085411e-06, "loss": 0.0796, "step": 1233 }, { "epoch": 1.1167420814479638, "grad_norm": 1.5340522271847958, "learning_rate": 6.953292155800655e-06, "loss": 0.1, "step": 1234 }, { "epoch": 1.1176470588235294, "grad_norm": 1.4929582341821928, "learning_rate": 6.948929366463397e-06, "loss": 0.0758, "step": 1235 }, { "epoch": 1.118552036199095, "grad_norm": 1.7933687669874876, "learning_rate": 6.944564826760631e-06, "loss": 0.1173, "step": 1236 }, { "epoch": 1.1194570135746607, "grad_norm": 1.849178704422051, "learning_rate": 6.9401985406122204e-06, "loss": 0.1088, "step": 1237 }, { "epoch": 1.1203619909502263, "grad_norm": 1.8071507026987752, "learning_rate": 6.935830511939598e-06, "loss": 0.1164, "step": 1238 }, { "epoch": 1.1212669683257919, "grad_norm": 2.6304793079504623, "learning_rate": 6.931460744665763e-06, "loss": 0.1491, "step": 1239 }, { "epoch": 1.1221719457013575, "grad_norm": 2.0894783560715777, "learning_rate": 6.927089242715277e-06, "loss": 0.161, "step": 1240 }, { "epoch": 1.123076923076923, "grad_norm": 2.184942881867564, "learning_rate": 6.922716010014256e-06, "loss": 0.1406, "step": 1241 }, { "epoch": 1.1239819004524887, "grad_norm": 1.6587246109682738, "learning_rate": 6.918341050490369e-06, "loss": 0.0944, "step": 1242 }, { "epoch": 1.1248868778280543, "grad_norm": 1.7162602131429607, "learning_rate": 6.913964368072845e-06, "loss": 0.1033, "step": 1243 }, { "epoch": 1.12579185520362, "grad_norm": 1.6965702314209157, "learning_rate": 6.909585966692451e-06, "loss": 0.1041, "step": 1244 }, { "epoch": 1.1266968325791855, "grad_norm": 1.5862367046840284, "learning_rate": 6.905205850281502e-06, "loss": 0.0933, "step": 1245 }, { "epoch": 1.1276018099547511, "grad_norm": 1.5338487526116236, "learning_rate": 6.900824022773853e-06, "loss": 0.0984, "step": 1246 }, { "epoch": 1.1285067873303167, "grad_norm": 1.660606368969695, "learning_rate": 6.896440488104895e-06, "loss": 0.1081, "step": 1247 }, { "epoch": 1.1294117647058823, "grad_norm": 1.4030636885062906, "learning_rate": 6.892055250211552e-06, "loss": 0.0833, "step": 1248 }, { "epoch": 1.130316742081448, "grad_norm": 2.109572021010495, "learning_rate": 6.8876683130322794e-06, "loss": 0.1075, "step": 1249 }, { "epoch": 1.1312217194570136, "grad_norm": 1.4479003341534908, "learning_rate": 6.883279680507057e-06, "loss": 0.0991, "step": 1250 }, { "epoch": 1.1321266968325792, "grad_norm": 1.3773921942941627, "learning_rate": 6.878889356577386e-06, "loss": 0.0821, "step": 1251 }, { "epoch": 1.1330316742081448, "grad_norm": 2.0480871101377116, "learning_rate": 6.874497345186291e-06, "loss": 0.1466, "step": 1252 }, { "epoch": 1.1339366515837104, "grad_norm": 1.4757682782348929, "learning_rate": 6.8701036502783105e-06, "loss": 0.0835, "step": 1253 }, { "epoch": 1.134841628959276, "grad_norm": 1.6972269890083205, "learning_rate": 6.865708275799492e-06, "loss": 0.1084, "step": 1254 }, { "epoch": 1.1357466063348416, "grad_norm": 1.531624701968786, "learning_rate": 6.861311225697392e-06, "loss": 0.0938, "step": 1255 }, { "epoch": 1.1366515837104072, "grad_norm": 1.4513196985441814, "learning_rate": 6.8569125039210785e-06, "loss": 0.0794, "step": 1256 }, { "epoch": 1.1375565610859728, "grad_norm": 1.8133464085388948, "learning_rate": 6.8525121144211115e-06, "loss": 0.1109, "step": 1257 }, { "epoch": 1.1384615384615384, "grad_norm": 1.9451944275306225, "learning_rate": 6.848110061149555e-06, "loss": 0.0872, "step": 1258 }, { "epoch": 1.139366515837104, "grad_norm": 1.5165461922089194, "learning_rate": 6.8437063480599665e-06, "loss": 0.0902, "step": 1259 }, { "epoch": 1.1402714932126696, "grad_norm": 1.7676978787258588, "learning_rate": 6.8393009791073895e-06, "loss": 0.1329, "step": 1260 }, { "epoch": 1.1411764705882352, "grad_norm": 1.5853114111293194, "learning_rate": 6.834893958248361e-06, "loss": 0.1091, "step": 1261 }, { "epoch": 1.1420814479638008, "grad_norm": 1.612388829665663, "learning_rate": 6.830485289440899e-06, "loss": 0.0963, "step": 1262 }, { "epoch": 1.1429864253393665, "grad_norm": 1.8698828317576908, "learning_rate": 6.826074976644501e-06, "loss": 0.1314, "step": 1263 }, { "epoch": 1.143891402714932, "grad_norm": 1.290211985962354, "learning_rate": 6.821663023820141e-06, "loss": 0.0772, "step": 1264 }, { "epoch": 1.1447963800904977, "grad_norm": 2.434935996300668, "learning_rate": 6.817249434930267e-06, "loss": 0.1243, "step": 1265 }, { "epoch": 1.1457013574660633, "grad_norm": 1.894709942743616, "learning_rate": 6.812834213938795e-06, "loss": 0.1395, "step": 1266 }, { "epoch": 1.1466063348416289, "grad_norm": 1.2972583096174484, "learning_rate": 6.808417364811108e-06, "loss": 0.0784, "step": 1267 }, { "epoch": 1.1475113122171945, "grad_norm": 3.6553976148969136, "learning_rate": 6.80399889151405e-06, "loss": 0.1808, "step": 1268 }, { "epoch": 1.14841628959276, "grad_norm": 1.8751991051779473, "learning_rate": 6.799578798015926e-06, "loss": 0.088, "step": 1269 }, { "epoch": 1.1493212669683257, "grad_norm": 1.6411418385493588, "learning_rate": 6.7951570882864944e-06, "loss": 0.1257, "step": 1270 }, { "epoch": 1.1502262443438913, "grad_norm": 1.5120978541320969, "learning_rate": 6.7907337662969654e-06, "loss": 0.0926, "step": 1271 }, { "epoch": 1.151131221719457, "grad_norm": 1.7962042578820314, "learning_rate": 6.786308836019997e-06, "loss": 0.1312, "step": 1272 }, { "epoch": 1.1520361990950225, "grad_norm": 1.5236591379560571, "learning_rate": 6.781882301429691e-06, "loss": 0.1017, "step": 1273 }, { "epoch": 1.1529411764705881, "grad_norm": 1.6776668766787584, "learning_rate": 6.77745416650159e-06, "loss": 0.1, "step": 1274 }, { "epoch": 1.1538461538461537, "grad_norm": 1.6955960899966918, "learning_rate": 6.773024435212678e-06, "loss": 0.0975, "step": 1275 }, { "epoch": 1.1547511312217194, "grad_norm": 1.7087605825812089, "learning_rate": 6.768593111541368e-06, "loss": 0.0965, "step": 1276 }, { "epoch": 1.155656108597285, "grad_norm": 1.6106849035561603, "learning_rate": 6.764160199467504e-06, "loss": 0.0867, "step": 1277 }, { "epoch": 1.1565610859728506, "grad_norm": 2.1500059513136103, "learning_rate": 6.759725702972358e-06, "loss": 0.1148, "step": 1278 }, { "epoch": 1.1574660633484162, "grad_norm": 1.5784765165382237, "learning_rate": 6.755289626038624e-06, "loss": 0.083, "step": 1279 }, { "epoch": 1.1583710407239818, "grad_norm": 2.2318745372118745, "learning_rate": 6.750851972650416e-06, "loss": 0.1729, "step": 1280 }, { "epoch": 1.1592760180995474, "grad_norm": 1.9746836355313646, "learning_rate": 6.746412746793263e-06, "loss": 0.1185, "step": 1281 }, { "epoch": 1.1601809954751132, "grad_norm": 2.6136231444788787, "learning_rate": 6.741971952454105e-06, "loss": 0.1389, "step": 1282 }, { "epoch": 1.1610859728506788, "grad_norm": 1.7897993981740363, "learning_rate": 6.737529593621296e-06, "loss": 0.1142, "step": 1283 }, { "epoch": 1.1619909502262444, "grad_norm": 1.6605297886662247, "learning_rate": 6.73308567428459e-06, "loss": 0.1193, "step": 1284 }, { "epoch": 1.16289592760181, "grad_norm": 1.1908906643296517, "learning_rate": 6.728640198435143e-06, "loss": 0.0735, "step": 1285 }, { "epoch": 1.1638009049773756, "grad_norm": 1.4611485796272017, "learning_rate": 6.724193170065511e-06, "loss": 0.1097, "step": 1286 }, { "epoch": 1.1647058823529413, "grad_norm": 3.0617048847439112, "learning_rate": 6.719744593169642e-06, "loss": 0.2659, "step": 1287 }, { "epoch": 1.1656108597285069, "grad_norm": 3.1745271174002134, "learning_rate": 6.7152944717428765e-06, "loss": 0.2217, "step": 1288 }, { "epoch": 1.1665158371040725, "grad_norm": 1.5970142736077402, "learning_rate": 6.7108428097819435e-06, "loss": 0.0952, "step": 1289 }, { "epoch": 1.167420814479638, "grad_norm": 2.111385069847254, "learning_rate": 6.706389611284953e-06, "loss": 0.1724, "step": 1290 }, { "epoch": 1.1683257918552037, "grad_norm": 1.7367683143056902, "learning_rate": 6.701934880251394e-06, "loss": 0.1127, "step": 1291 }, { "epoch": 1.1692307692307693, "grad_norm": 1.3944810924643092, "learning_rate": 6.697478620682137e-06, "loss": 0.0828, "step": 1292 }, { "epoch": 1.170135746606335, "grad_norm": 1.7270786444374504, "learning_rate": 6.693020836579418e-06, "loss": 0.1299, "step": 1293 }, { "epoch": 1.1710407239819005, "grad_norm": 1.3235014665461586, "learning_rate": 6.68856153194685e-06, "loss": 0.0913, "step": 1294 }, { "epoch": 1.1719457013574661, "grad_norm": 2.043254264453605, "learning_rate": 6.684100710789405e-06, "loss": 0.1304, "step": 1295 }, { "epoch": 1.1728506787330317, "grad_norm": 2.2320140489045337, "learning_rate": 6.6796383771134196e-06, "loss": 0.139, "step": 1296 }, { "epoch": 1.1737556561085973, "grad_norm": 1.8232401945672165, "learning_rate": 6.6751745349265924e-06, "loss": 0.1494, "step": 1297 }, { "epoch": 1.174660633484163, "grad_norm": 1.540364562353793, "learning_rate": 6.670709188237972e-06, "loss": 0.0971, "step": 1298 }, { "epoch": 1.1755656108597285, "grad_norm": 2.166076294167987, "learning_rate": 6.666242341057958e-06, "loss": 0.1135, "step": 1299 }, { "epoch": 1.1764705882352942, "grad_norm": 1.4580264325786396, "learning_rate": 6.6617739973982985e-06, "loss": 0.0768, "step": 1300 }, { "epoch": 1.1773755656108598, "grad_norm": 1.7863034886112796, "learning_rate": 6.65730416127209e-06, "loss": 0.1039, "step": 1301 }, { "epoch": 1.1782805429864254, "grad_norm": 1.8585585947900747, "learning_rate": 6.652832836693764e-06, "loss": 0.1431, "step": 1302 }, { "epoch": 1.179185520361991, "grad_norm": 1.6048520330480291, "learning_rate": 6.648360027679091e-06, "loss": 0.0982, "step": 1303 }, { "epoch": 1.1800904977375566, "grad_norm": 2.06272047444573, "learning_rate": 6.6438857382451734e-06, "loss": 0.1178, "step": 1304 }, { "epoch": 1.1809954751131222, "grad_norm": 11.130136406575488, "learning_rate": 6.639409972410446e-06, "loss": 0.2381, "step": 1305 }, { "epoch": 1.1819004524886878, "grad_norm": 1.7254463730389378, "learning_rate": 6.634932734194665e-06, "loss": 0.1011, "step": 1306 }, { "epoch": 1.1828054298642534, "grad_norm": 1.675281377123123, "learning_rate": 6.630454027618915e-06, "loss": 0.0866, "step": 1307 }, { "epoch": 1.183710407239819, "grad_norm": 1.5408125594577984, "learning_rate": 6.6259738567055935e-06, "loss": 0.0991, "step": 1308 }, { "epoch": 1.1846153846153846, "grad_norm": 1.9696793886142214, "learning_rate": 6.6214922254784145e-06, "loss": 0.1244, "step": 1309 }, { "epoch": 1.1855203619909502, "grad_norm": 1.5391120471495372, "learning_rate": 6.617009137962407e-06, "loss": 0.1068, "step": 1310 }, { "epoch": 1.1864253393665158, "grad_norm": 2.291869376314531, "learning_rate": 6.612524598183907e-06, "loss": 0.1417, "step": 1311 }, { "epoch": 1.1873303167420814, "grad_norm": 1.6481486195907131, "learning_rate": 6.608038610170549e-06, "loss": 0.1001, "step": 1312 }, { "epoch": 1.188235294117647, "grad_norm": 1.2827455325697144, "learning_rate": 6.6035511779512764e-06, "loss": 0.0815, "step": 1313 }, { "epoch": 1.1891402714932127, "grad_norm": 1.349323812332708, "learning_rate": 6.599062305556325e-06, "loss": 0.0765, "step": 1314 }, { "epoch": 1.1900452488687783, "grad_norm": 1.949973685384092, "learning_rate": 6.594571997017224e-06, "loss": 0.1191, "step": 1315 }, { "epoch": 1.1909502262443439, "grad_norm": 1.6756958958491852, "learning_rate": 6.590080256366793e-06, "loss": 0.1049, "step": 1316 }, { "epoch": 1.1918552036199095, "grad_norm": 1.9980270474928676, "learning_rate": 6.58558708763914e-06, "loss": 0.1099, "step": 1317 }, { "epoch": 1.192760180995475, "grad_norm": 1.4299035223827186, "learning_rate": 6.581092494869652e-06, "loss": 0.0891, "step": 1318 }, { "epoch": 1.1936651583710407, "grad_norm": 1.4887236549825786, "learning_rate": 6.576596482094998e-06, "loss": 0.0875, "step": 1319 }, { "epoch": 1.1945701357466063, "grad_norm": 1.713004674224898, "learning_rate": 6.57209905335312e-06, "loss": 0.1126, "step": 1320 }, { "epoch": 1.195475113122172, "grad_norm": 1.7856101010926178, "learning_rate": 6.567600212683232e-06, "loss": 0.1087, "step": 1321 }, { "epoch": 1.1963800904977375, "grad_norm": 1.7466842533335738, "learning_rate": 6.5630999641258185e-06, "loss": 0.0977, "step": 1322 }, { "epoch": 1.1972850678733031, "grad_norm": 1.5906034839378955, "learning_rate": 6.558598311722626e-06, "loss": 0.0979, "step": 1323 }, { "epoch": 1.1981900452488687, "grad_norm": 1.802523921018589, "learning_rate": 6.554095259516662e-06, "loss": 0.1303, "step": 1324 }, { "epoch": 1.1990950226244343, "grad_norm": 1.663023380850814, "learning_rate": 6.549590811552193e-06, "loss": 0.1044, "step": 1325 }, { "epoch": 1.2, "grad_norm": 1.798972202928568, "learning_rate": 6.545084971874738e-06, "loss": 0.0991, "step": 1326 }, { "epoch": 1.2009049773755656, "grad_norm": 1.5380736489315683, "learning_rate": 6.540577744531063e-06, "loss": 0.1017, "step": 1327 }, { "epoch": 1.2018099547511312, "grad_norm": 2.079136562146054, "learning_rate": 6.536069133569185e-06, "loss": 0.137, "step": 1328 }, { "epoch": 1.2027149321266968, "grad_norm": 1.6533301118065262, "learning_rate": 6.531559143038363e-06, "loss": 0.1265, "step": 1329 }, { "epoch": 1.2036199095022624, "grad_norm": 1.5942071873353278, "learning_rate": 6.5270477769890906e-06, "loss": 0.0913, "step": 1330 }, { "epoch": 1.204524886877828, "grad_norm": 1.787267676594585, "learning_rate": 6.522535039473102e-06, "loss": 0.1158, "step": 1331 }, { "epoch": 1.2054298642533936, "grad_norm": 1.2214649844464507, "learning_rate": 6.518020934543359e-06, "loss": 0.0757, "step": 1332 }, { "epoch": 1.2063348416289592, "grad_norm": 1.731816088825364, "learning_rate": 6.513505466254055e-06, "loss": 0.0838, "step": 1333 }, { "epoch": 1.2072398190045248, "grad_norm": 1.794373948910979, "learning_rate": 6.508988638660607e-06, "loss": 0.098, "step": 1334 }, { "epoch": 1.2081447963800904, "grad_norm": 1.6565596235595816, "learning_rate": 6.504470455819651e-06, "loss": 0.1195, "step": 1335 }, { "epoch": 1.209049773755656, "grad_norm": 1.3705039107185246, "learning_rate": 6.4999509217890414e-06, "loss": 0.0865, "step": 1336 }, { "epoch": 1.2099547511312216, "grad_norm": 1.8095910564406879, "learning_rate": 6.495430040627846e-06, "loss": 0.1377, "step": 1337 }, { "epoch": 1.2108597285067872, "grad_norm": 1.4281513645892001, "learning_rate": 6.4909078163963415e-06, "loss": 0.1075, "step": 1338 }, { "epoch": 1.2117647058823529, "grad_norm": 1.708462262919853, "learning_rate": 6.486384253156014e-06, "loss": 0.0959, "step": 1339 }, { "epoch": 1.2126696832579185, "grad_norm": 1.4880342081419347, "learning_rate": 6.481859354969549e-06, "loss": 0.0873, "step": 1340 }, { "epoch": 1.213574660633484, "grad_norm": 1.870428254278432, "learning_rate": 6.477333125900831e-06, "loss": 0.1337, "step": 1341 }, { "epoch": 1.2144796380090497, "grad_norm": 1.42245979501637, "learning_rate": 6.472805570014941e-06, "loss": 0.0907, "step": 1342 }, { "epoch": 1.2153846153846155, "grad_norm": 1.9919055573420639, "learning_rate": 6.468276691378155e-06, "loss": 0.1166, "step": 1343 }, { "epoch": 1.2162895927601811, "grad_norm": 1.9495193540925522, "learning_rate": 6.4637464940579285e-06, "loss": 0.1375, "step": 1344 }, { "epoch": 1.2171945701357467, "grad_norm": 3.5655330254185698, "learning_rate": 6.4592149821229064e-06, "loss": 0.1729, "step": 1345 }, { "epoch": 1.2180995475113123, "grad_norm": 1.7119415513883947, "learning_rate": 6.454682159642919e-06, "loss": 0.1022, "step": 1346 }, { "epoch": 1.219004524886878, "grad_norm": 1.7006786241294176, "learning_rate": 6.450148030688963e-06, "loss": 0.0943, "step": 1347 }, { "epoch": 1.2199095022624435, "grad_norm": 1.6033501507884274, "learning_rate": 6.445612599333219e-06, "loss": 0.1218, "step": 1348 }, { "epoch": 1.2208144796380092, "grad_norm": 1.1608212479994315, "learning_rate": 6.441075869649027e-06, "loss": 0.0579, "step": 1349 }, { "epoch": 1.2217194570135748, "grad_norm": 1.550738942730819, "learning_rate": 6.436537845710904e-06, "loss": 0.0919, "step": 1350 }, { "epoch": 1.2226244343891404, "grad_norm": 1.5858619172916424, "learning_rate": 6.431998531594521e-06, "loss": 0.0974, "step": 1351 }, { "epoch": 1.223529411764706, "grad_norm": 1.7016154261901792, "learning_rate": 6.427457931376712e-06, "loss": 0.1035, "step": 1352 }, { "epoch": 1.2244343891402716, "grad_norm": 1.4643883539260385, "learning_rate": 6.422916049135463e-06, "loss": 0.0805, "step": 1353 }, { "epoch": 1.2253393665158372, "grad_norm": 1.4610725165176515, "learning_rate": 6.418372888949913e-06, "loss": 0.0945, "step": 1354 }, { "epoch": 1.2262443438914028, "grad_norm": 1.8519866938460297, "learning_rate": 6.413828454900351e-06, "loss": 0.1345, "step": 1355 }, { "epoch": 1.2271493212669684, "grad_norm": 2.064623877761793, "learning_rate": 6.409282751068207e-06, "loss": 0.1491, "step": 1356 }, { "epoch": 1.228054298642534, "grad_norm": 1.8826978066198974, "learning_rate": 6.404735781536052e-06, "loss": 0.1057, "step": 1357 }, { "epoch": 1.2289592760180996, "grad_norm": 1.5792095318367145, "learning_rate": 6.400187550387593e-06, "loss": 0.0734, "step": 1358 }, { "epoch": 1.2298642533936652, "grad_norm": 2.057017159614157, "learning_rate": 6.395638061707674e-06, "loss": 0.1522, "step": 1359 }, { "epoch": 1.2307692307692308, "grad_norm": 1.7799843096140149, "learning_rate": 6.391087319582264e-06, "loss": 0.127, "step": 1360 }, { "epoch": 1.2316742081447964, "grad_norm": 1.8725875095420657, "learning_rate": 6.38653532809846e-06, "loss": 0.1208, "step": 1361 }, { "epoch": 1.232579185520362, "grad_norm": 2.053846929892985, "learning_rate": 6.381982091344478e-06, "loss": 0.1199, "step": 1362 }, { "epoch": 1.2334841628959277, "grad_norm": 1.4470818481526995, "learning_rate": 6.377427613409657e-06, "loss": 0.102, "step": 1363 }, { "epoch": 1.2343891402714933, "grad_norm": 1.4098337039554554, "learning_rate": 6.3728718983844495e-06, "loss": 0.0835, "step": 1364 }, { "epoch": 1.2352941176470589, "grad_norm": 1.93510670043282, "learning_rate": 6.368314950360416e-06, "loss": 0.1146, "step": 1365 }, { "epoch": 1.2361990950226245, "grad_norm": 1.7558561242912172, "learning_rate": 6.363756773430226e-06, "loss": 0.1289, "step": 1366 }, { "epoch": 1.23710407239819, "grad_norm": 2.290633913202078, "learning_rate": 6.359197371687655e-06, "loss": 0.1725, "step": 1367 }, { "epoch": 1.2380090497737557, "grad_norm": 2.5423357103480786, "learning_rate": 6.354636749227576e-06, "loss": 0.1626, "step": 1368 }, { "epoch": 1.2389140271493213, "grad_norm": 2.1818392416539876, "learning_rate": 6.350074910145959e-06, "loss": 0.1408, "step": 1369 }, { "epoch": 1.239819004524887, "grad_norm": 1.9517818771051936, "learning_rate": 6.3455118585398676e-06, "loss": 0.1458, "step": 1370 }, { "epoch": 1.2407239819004525, "grad_norm": 1.3522415352793242, "learning_rate": 6.34094759850745e-06, "loss": 0.0825, "step": 1371 }, { "epoch": 1.2416289592760181, "grad_norm": 1.3919494536365127, "learning_rate": 6.3363821341479485e-06, "loss": 0.101, "step": 1372 }, { "epoch": 1.2425339366515837, "grad_norm": 1.9096773121926989, "learning_rate": 6.331815469561676e-06, "loss": 0.161, "step": 1373 }, { "epoch": 1.2434389140271493, "grad_norm": 1.552087322329305, "learning_rate": 6.327247608850035e-06, "loss": 0.0902, "step": 1374 }, { "epoch": 1.244343891402715, "grad_norm": 1.4334295304691287, "learning_rate": 6.3226785561154914e-06, "loss": 0.0942, "step": 1375 }, { "epoch": 1.2452488687782806, "grad_norm": 1.6899338261835237, "learning_rate": 6.318108315461588e-06, "loss": 0.1171, "step": 1376 }, { "epoch": 1.2461538461538462, "grad_norm": 2.848420389704957, "learning_rate": 6.313536890992935e-06, "loss": 0.2049, "step": 1377 }, { "epoch": 1.2470588235294118, "grad_norm": 1.6256091173152563, "learning_rate": 6.308964286815203e-06, "loss": 0.1126, "step": 1378 }, { "epoch": 1.2479638009049774, "grad_norm": 1.372133248522217, "learning_rate": 6.304390507035121e-06, "loss": 0.0828, "step": 1379 }, { "epoch": 1.248868778280543, "grad_norm": 1.7592600848926607, "learning_rate": 6.299815555760478e-06, "loss": 0.0992, "step": 1380 }, { "epoch": 1.2497737556561086, "grad_norm": 2.264173849012822, "learning_rate": 6.29523943710011e-06, "loss": 0.1673, "step": 1381 }, { "epoch": 1.2506787330316742, "grad_norm": 1.5524181592214903, "learning_rate": 6.290662155163909e-06, "loss": 0.0717, "step": 1382 }, { "epoch": 1.2515837104072398, "grad_norm": 1.856670441849533, "learning_rate": 6.286083714062804e-06, "loss": 0.1254, "step": 1383 }, { "epoch": 1.2524886877828054, "grad_norm": 1.7388737024466439, "learning_rate": 6.28150411790877e-06, "loss": 0.105, "step": 1384 }, { "epoch": 1.253393665158371, "grad_norm": 1.5094920695271357, "learning_rate": 6.276923370814815e-06, "loss": 0.0881, "step": 1385 }, { "epoch": 1.2542986425339366, "grad_norm": 1.8717995389515691, "learning_rate": 6.272341476894985e-06, "loss": 0.1578, "step": 1386 }, { "epoch": 1.2552036199095022, "grad_norm": 1.375409256926493, "learning_rate": 6.267758440264355e-06, "loss": 0.0872, "step": 1387 }, { "epoch": 1.2561085972850679, "grad_norm": 2.6675765599942642, "learning_rate": 6.263174265039026e-06, "loss": 0.1258, "step": 1388 }, { "epoch": 1.2570135746606335, "grad_norm": 1.8825235636419784, "learning_rate": 6.258588955336118e-06, "loss": 0.101, "step": 1389 }, { "epoch": 1.257918552036199, "grad_norm": 1.6275134138048, "learning_rate": 6.254002515273775e-06, "loss": 0.1094, "step": 1390 }, { "epoch": 1.2588235294117647, "grad_norm": 1.7031108583715802, "learning_rate": 6.249414948971154e-06, "loss": 0.0867, "step": 1391 }, { "epoch": 1.2597285067873303, "grad_norm": 1.8984152167888722, "learning_rate": 6.244826260548426e-06, "loss": 0.1182, "step": 1392 }, { "epoch": 1.260633484162896, "grad_norm": 1.459058683519767, "learning_rate": 6.240236454126764e-06, "loss": 0.0759, "step": 1393 }, { "epoch": 1.2615384615384615, "grad_norm": 1.7061964419221696, "learning_rate": 6.235645533828348e-06, "loss": 0.1197, "step": 1394 }, { "epoch": 1.262443438914027, "grad_norm": 1.6721867059054658, "learning_rate": 6.231053503776363e-06, "loss": 0.0985, "step": 1395 }, { "epoch": 1.2633484162895927, "grad_norm": 1.2170723866524704, "learning_rate": 6.226460368094985e-06, "loss": 0.0663, "step": 1396 }, { "epoch": 1.2642533936651583, "grad_norm": 2.1898544741046244, "learning_rate": 6.221866130909384e-06, "loss": 0.1621, "step": 1397 }, { "epoch": 1.265158371040724, "grad_norm": 1.7306839682289545, "learning_rate": 6.217270796345722e-06, "loss": 0.1229, "step": 1398 }, { "epoch": 1.2660633484162895, "grad_norm": 2.1188281517272087, "learning_rate": 6.21267436853114e-06, "loss": 0.1527, "step": 1399 }, { "epoch": 1.2669683257918551, "grad_norm": 2.17986162615389, "learning_rate": 6.208076851593768e-06, "loss": 0.1156, "step": 1400 }, { "epoch": 1.2678733031674208, "grad_norm": 2.6815824889146787, "learning_rate": 6.2034782496627145e-06, "loss": 0.1332, "step": 1401 }, { "epoch": 1.2687782805429864, "grad_norm": 1.7525270793636882, "learning_rate": 6.198878566868055e-06, "loss": 0.0745, "step": 1402 }, { "epoch": 1.269683257918552, "grad_norm": 1.9631803492886448, "learning_rate": 6.1942778073408425e-06, "loss": 0.1071, "step": 1403 }, { "epoch": 1.2705882352941176, "grad_norm": 3.379464315186274, "learning_rate": 6.189675975213094e-06, "loss": 0.1054, "step": 1404 }, { "epoch": 1.2714932126696832, "grad_norm": 1.7065624451346977, "learning_rate": 6.185073074617793e-06, "loss": 0.118, "step": 1405 }, { "epoch": 1.2723981900452488, "grad_norm": 1.6251408653875818, "learning_rate": 6.180469109688876e-06, "loss": 0.0988, "step": 1406 }, { "epoch": 1.2733031674208144, "grad_norm": 1.7477043292432204, "learning_rate": 6.175864084561242e-06, "loss": 0.0935, "step": 1407 }, { "epoch": 1.27420814479638, "grad_norm": 2.1545945221176033, "learning_rate": 6.17125800337074e-06, "loss": 0.1271, "step": 1408 }, { "epoch": 1.2751131221719456, "grad_norm": 2.174836870731984, "learning_rate": 6.166650870254167e-06, "loss": 0.1058, "step": 1409 }, { "epoch": 1.2760180995475112, "grad_norm": 1.6733189007803289, "learning_rate": 6.1620426893492645e-06, "loss": 0.1256, "step": 1410 }, { "epoch": 1.2769230769230768, "grad_norm": 1.6058856060154407, "learning_rate": 6.157433464794717e-06, "loss": 0.113, "step": 1411 }, { "epoch": 1.2778280542986424, "grad_norm": 1.3567765857568501, "learning_rate": 6.152823200730142e-06, "loss": 0.0851, "step": 1412 }, { "epoch": 1.278733031674208, "grad_norm": 1.5082400048044666, "learning_rate": 6.148211901296095e-06, "loss": 0.0876, "step": 1413 }, { "epoch": 1.2796380090497737, "grad_norm": 2.094560776805565, "learning_rate": 6.143599570634062e-06, "loss": 0.1349, "step": 1414 }, { "epoch": 1.2805429864253393, "grad_norm": 1.7439937750171681, "learning_rate": 6.13898621288645e-06, "loss": 0.1342, "step": 1415 }, { "epoch": 1.2814479638009049, "grad_norm": 1.681407245362687, "learning_rate": 6.134371832196591e-06, "loss": 0.101, "step": 1416 }, { "epoch": 1.2823529411764705, "grad_norm": 1.6247771256806354, "learning_rate": 6.129756432708739e-06, "loss": 0.0957, "step": 1417 }, { "epoch": 1.283257918552036, "grad_norm": 1.5741277137957554, "learning_rate": 6.1251400185680585e-06, "loss": 0.1232, "step": 1418 }, { "epoch": 1.2841628959276017, "grad_norm": 1.591886240738505, "learning_rate": 6.1205225939206285e-06, "loss": 0.1085, "step": 1419 }, { "epoch": 1.2850678733031673, "grad_norm": 1.597664759791562, "learning_rate": 6.115904162913431e-06, "loss": 0.0968, "step": 1420 }, { "epoch": 1.285972850678733, "grad_norm": 2.635648785199977, "learning_rate": 6.111284729694358e-06, "loss": 0.1325, "step": 1421 }, { "epoch": 1.2868778280542985, "grad_norm": 1.4037465801175466, "learning_rate": 6.106664298412196e-06, "loss": 0.0843, "step": 1422 }, { "epoch": 1.2877828054298641, "grad_norm": 1.2383407627640504, "learning_rate": 6.102042873216631e-06, "loss": 0.0846, "step": 1423 }, { "epoch": 1.2886877828054297, "grad_norm": 1.4758820683238936, "learning_rate": 6.097420458258243e-06, "loss": 0.0988, "step": 1424 }, { "epoch": 1.2895927601809956, "grad_norm": 1.6558218298076197, "learning_rate": 6.092797057688496e-06, "loss": 0.1038, "step": 1425 }, { "epoch": 1.2904977375565612, "grad_norm": 1.4676807307129012, "learning_rate": 6.088172675659742e-06, "loss": 0.0802, "step": 1426 }, { "epoch": 1.2914027149321268, "grad_norm": 1.500534714215747, "learning_rate": 6.083547316325217e-06, "loss": 0.1085, "step": 1427 }, { "epoch": 1.2923076923076924, "grad_norm": 1.8461017345541169, "learning_rate": 6.078920983839032e-06, "loss": 0.151, "step": 1428 }, { "epoch": 1.293212669683258, "grad_norm": 2.452786935918913, "learning_rate": 6.074293682356169e-06, "loss": 0.1729, "step": 1429 }, { "epoch": 1.2941176470588236, "grad_norm": 2.154565875268813, "learning_rate": 6.0696654160324875e-06, "loss": 0.1156, "step": 1430 }, { "epoch": 1.2950226244343892, "grad_norm": 1.93813720203821, "learning_rate": 6.065036189024708e-06, "loss": 0.1308, "step": 1431 }, { "epoch": 1.2959276018099548, "grad_norm": 1.540479234751041, "learning_rate": 6.060406005490414e-06, "loss": 0.108, "step": 1432 }, { "epoch": 1.2968325791855204, "grad_norm": 2.7662731885912293, "learning_rate": 6.0557748695880525e-06, "loss": 0.1316, "step": 1433 }, { "epoch": 1.297737556561086, "grad_norm": 1.4939830404858991, "learning_rate": 6.051142785476921e-06, "loss": 0.0852, "step": 1434 }, { "epoch": 1.2986425339366516, "grad_norm": 1.4009568478976977, "learning_rate": 6.046509757317168e-06, "loss": 0.0934, "step": 1435 }, { "epoch": 1.2995475113122172, "grad_norm": 1.6678399148520835, "learning_rate": 6.041875789269798e-06, "loss": 0.118, "step": 1436 }, { "epoch": 1.3004524886877828, "grad_norm": 1.636982879180795, "learning_rate": 6.037240885496649e-06, "loss": 0.0906, "step": 1437 }, { "epoch": 1.3013574660633485, "grad_norm": 1.1668673932728155, "learning_rate": 6.032605050160406e-06, "loss": 0.071, "step": 1438 }, { "epoch": 1.302262443438914, "grad_norm": 1.8863387823878928, "learning_rate": 6.027968287424588e-06, "loss": 0.0858, "step": 1439 }, { "epoch": 1.3031674208144797, "grad_norm": 1.77284257570628, "learning_rate": 6.0233306014535505e-06, "loss": 0.1331, "step": 1440 }, { "epoch": 1.3040723981900453, "grad_norm": 1.7634790323485885, "learning_rate": 6.0186919964124735e-06, "loss": 0.1004, "step": 1441 }, { "epoch": 1.3049773755656109, "grad_norm": 1.6162656964694497, "learning_rate": 6.0140524764673645e-06, "loss": 0.0906, "step": 1442 }, { "epoch": 1.3058823529411765, "grad_norm": 1.4307919209766087, "learning_rate": 6.009412045785051e-06, "loss": 0.0853, "step": 1443 }, { "epoch": 1.306787330316742, "grad_norm": 2.3701020834691993, "learning_rate": 6.004770708533184e-06, "loss": 0.1055, "step": 1444 }, { "epoch": 1.3076923076923077, "grad_norm": 1.9967339150242125, "learning_rate": 6.000128468880223e-06, "loss": 0.102, "step": 1445 }, { "epoch": 1.3085972850678733, "grad_norm": 1.969346496245966, "learning_rate": 5.995485330995439e-06, "loss": 0.1552, "step": 1446 }, { "epoch": 1.309502262443439, "grad_norm": 1.6336823578533193, "learning_rate": 5.9908412990489105e-06, "loss": 0.0851, "step": 1447 }, { "epoch": 1.3104072398190045, "grad_norm": 1.290829147726599, "learning_rate": 5.98619637721152e-06, "loss": 0.0727, "step": 1448 }, { "epoch": 1.3113122171945701, "grad_norm": 2.2997071935769307, "learning_rate": 5.981550569654947e-06, "loss": 0.1518, "step": 1449 }, { "epoch": 1.3122171945701357, "grad_norm": 1.403883354479397, "learning_rate": 5.976903880551669e-06, "loss": 0.0748, "step": 1450 }, { "epoch": 1.3131221719457014, "grad_norm": 1.8166668287909498, "learning_rate": 5.972256314074953e-06, "loss": 0.0968, "step": 1451 }, { "epoch": 1.314027149321267, "grad_norm": 2.4101010544163075, "learning_rate": 5.967607874398854e-06, "loss": 0.1057, "step": 1452 }, { "epoch": 1.3149321266968326, "grad_norm": 1.8237813185616427, "learning_rate": 5.962958565698215e-06, "loss": 0.1206, "step": 1453 }, { "epoch": 1.3158371040723982, "grad_norm": 1.6191742077023883, "learning_rate": 5.958308392148654e-06, "loss": 0.0942, "step": 1454 }, { "epoch": 1.3167420814479638, "grad_norm": 1.875329561974573, "learning_rate": 5.953657357926569e-06, "loss": 0.12, "step": 1455 }, { "epoch": 1.3176470588235294, "grad_norm": 1.5231124401895535, "learning_rate": 5.9490054672091305e-06, "loss": 0.1012, "step": 1456 }, { "epoch": 1.318552036199095, "grad_norm": 1.7254224781836414, "learning_rate": 5.944352724174276e-06, "loss": 0.1136, "step": 1457 }, { "epoch": 1.3194570135746606, "grad_norm": 1.9973057202934266, "learning_rate": 5.939699133000715e-06, "loss": 0.1032, "step": 1458 }, { "epoch": 1.3203619909502262, "grad_norm": 2.7189536492287116, "learning_rate": 5.935044697867911e-06, "loss": 0.17, "step": 1459 }, { "epoch": 1.3212669683257918, "grad_norm": 2.013830977183228, "learning_rate": 5.930389422956088e-06, "loss": 0.1432, "step": 1460 }, { "epoch": 1.3221719457013574, "grad_norm": 1.512820840589491, "learning_rate": 5.9257333124462275e-06, "loss": 0.0926, "step": 1461 }, { "epoch": 1.323076923076923, "grad_norm": 1.358841940187811, "learning_rate": 5.921076370520058e-06, "loss": 0.0861, "step": 1462 }, { "epoch": 1.3239819004524886, "grad_norm": 1.7346588531257423, "learning_rate": 5.916418601360056e-06, "loss": 0.1066, "step": 1463 }, { "epoch": 1.3248868778280543, "grad_norm": 1.2307330373920504, "learning_rate": 5.911760009149439e-06, "loss": 0.0691, "step": 1464 }, { "epoch": 1.3257918552036199, "grad_norm": 1.4970396656401879, "learning_rate": 5.907100598072166e-06, "loss": 0.0878, "step": 1465 }, { "epoch": 1.3266968325791855, "grad_norm": 1.5494162718219278, "learning_rate": 5.902440372312931e-06, "loss": 0.1109, "step": 1466 }, { "epoch": 1.327601809954751, "grad_norm": 1.3323504660831742, "learning_rate": 5.8977793360571604e-06, "loss": 0.0687, "step": 1467 }, { "epoch": 1.3285067873303167, "grad_norm": 1.915127870477593, "learning_rate": 5.893117493491005e-06, "loss": 0.1172, "step": 1468 }, { "epoch": 1.3294117647058823, "grad_norm": 1.4686749464817186, "learning_rate": 5.888454848801345e-06, "loss": 0.0819, "step": 1469 }, { "epoch": 1.330316742081448, "grad_norm": 2.6746154728225395, "learning_rate": 5.883791406175775e-06, "loss": 0.1779, "step": 1470 }, { "epoch": 1.3312217194570135, "grad_norm": 2.5178183594576042, "learning_rate": 5.879127169802612e-06, "loss": 0.0941, "step": 1471 }, { "epoch": 1.3321266968325791, "grad_norm": 1.214302892669453, "learning_rate": 5.8744621438708835e-06, "loss": 0.0709, "step": 1472 }, { "epoch": 1.3330316742081447, "grad_norm": 1.453673628247997, "learning_rate": 5.869796332570326e-06, "loss": 0.0709, "step": 1473 }, { "epoch": 1.3339366515837103, "grad_norm": 1.7184325102586313, "learning_rate": 5.865129740091379e-06, "loss": 0.1254, "step": 1474 }, { "epoch": 1.334841628959276, "grad_norm": 1.791939166428392, "learning_rate": 5.860462370625189e-06, "loss": 0.134, "step": 1475 }, { "epoch": 1.3357466063348415, "grad_norm": 1.4963427047473514, "learning_rate": 5.855794228363594e-06, "loss": 0.0907, "step": 1476 }, { "epoch": 1.3366515837104074, "grad_norm": 2.127401905600227, "learning_rate": 5.851125317499131e-06, "loss": 0.1214, "step": 1477 }, { "epoch": 1.337556561085973, "grad_norm": 1.5410975174017945, "learning_rate": 5.8464556422250275e-06, "loss": 0.0826, "step": 1478 }, { "epoch": 1.3384615384615386, "grad_norm": 1.6971596215983256, "learning_rate": 5.841785206735192e-06, "loss": 0.117, "step": 1479 }, { "epoch": 1.3393665158371042, "grad_norm": 3.579649089957919, "learning_rate": 5.837114015224223e-06, "loss": 0.1028, "step": 1480 }, { "epoch": 1.3402714932126698, "grad_norm": 1.7258573501252552, "learning_rate": 5.832442071887393e-06, "loss": 0.1372, "step": 1481 }, { "epoch": 1.3411764705882354, "grad_norm": 1.603490447709255, "learning_rate": 5.82776938092065e-06, "loss": 0.1044, "step": 1482 }, { "epoch": 1.342081447963801, "grad_norm": 1.9710352784925858, "learning_rate": 5.823095946520616e-06, "loss": 0.1227, "step": 1483 }, { "epoch": 1.3429864253393666, "grad_norm": 1.529542271484244, "learning_rate": 5.818421772884578e-06, "loss": 0.0885, "step": 1484 }, { "epoch": 1.3438914027149322, "grad_norm": 1.9929879205362964, "learning_rate": 5.813746864210489e-06, "loss": 0.1303, "step": 1485 }, { "epoch": 1.3447963800904978, "grad_norm": 2.3299235758737407, "learning_rate": 5.809071224696961e-06, "loss": 0.1374, "step": 1486 }, { "epoch": 1.3457013574660635, "grad_norm": 1.3855720739745034, "learning_rate": 5.8043948585432645e-06, "loss": 0.0814, "step": 1487 }, { "epoch": 1.346606334841629, "grad_norm": 1.532014871498716, "learning_rate": 5.799717769949318e-06, "loss": 0.0986, "step": 1488 }, { "epoch": 1.3475113122171947, "grad_norm": 1.3462521347766496, "learning_rate": 5.795039963115693e-06, "loss": 0.0932, "step": 1489 }, { "epoch": 1.3484162895927603, "grad_norm": 1.7754127862843083, "learning_rate": 5.790361442243605e-06, "loss": 0.0871, "step": 1490 }, { "epoch": 1.3493212669683259, "grad_norm": 2.60902646165849, "learning_rate": 5.785682211534911e-06, "loss": 0.1748, "step": 1491 }, { "epoch": 1.3502262443438915, "grad_norm": 1.9960742273445187, "learning_rate": 5.781002275192104e-06, "loss": 0.1366, "step": 1492 }, { "epoch": 1.351131221719457, "grad_norm": 1.7084537363611976, "learning_rate": 5.7763216374183094e-06, "loss": 0.1163, "step": 1493 }, { "epoch": 1.3520361990950227, "grad_norm": 1.8746867720127762, "learning_rate": 5.771640302417291e-06, "loss": 0.1357, "step": 1494 }, { "epoch": 1.3529411764705883, "grad_norm": 1.9089936399956946, "learning_rate": 5.766958274393428e-06, "loss": 0.1101, "step": 1495 }, { "epoch": 1.353846153846154, "grad_norm": 1.8720778887617429, "learning_rate": 5.762275557551728e-06, "loss": 0.0994, "step": 1496 }, { "epoch": 1.3547511312217195, "grad_norm": 1.6902237387970753, "learning_rate": 5.757592156097815e-06, "loss": 0.1482, "step": 1497 }, { "epoch": 1.3556561085972851, "grad_norm": 1.604298419591375, "learning_rate": 5.752908074237931e-06, "loss": 0.1176, "step": 1498 }, { "epoch": 1.3565610859728507, "grad_norm": 1.4522491549528458, "learning_rate": 5.748223316178927e-06, "loss": 0.0998, "step": 1499 }, { "epoch": 1.3574660633484164, "grad_norm": 1.3049344186321137, "learning_rate": 5.743537886128258e-06, "loss": 0.0701, "step": 1500 }, { "epoch": 1.358371040723982, "grad_norm": 1.4393384784914582, "learning_rate": 5.7388517882939884e-06, "loss": 0.0824, "step": 1501 }, { "epoch": 1.3592760180995476, "grad_norm": 1.5575101321182427, "learning_rate": 5.734165026884782e-06, "loss": 0.1181, "step": 1502 }, { "epoch": 1.3601809954751132, "grad_norm": 1.344456860140244, "learning_rate": 5.72947760610989e-06, "loss": 0.072, "step": 1503 }, { "epoch": 1.3610859728506788, "grad_norm": 1.6279260288415862, "learning_rate": 5.7247895301791675e-06, "loss": 0.0868, "step": 1504 }, { "epoch": 1.3619909502262444, "grad_norm": 1.7471543771453542, "learning_rate": 5.72010080330305e-06, "loss": 0.1142, "step": 1505 }, { "epoch": 1.36289592760181, "grad_norm": 1.717009211761528, "learning_rate": 5.71541142969256e-06, "loss": 0.1091, "step": 1506 }, { "epoch": 1.3638009049773756, "grad_norm": 1.8256563026460528, "learning_rate": 5.7107214135593025e-06, "loss": 0.1166, "step": 1507 }, { "epoch": 1.3647058823529412, "grad_norm": 1.4350648446581205, "learning_rate": 5.706030759115458e-06, "loss": 0.0779, "step": 1508 }, { "epoch": 1.3656108597285068, "grad_norm": 1.590883659501213, "learning_rate": 5.701339470573779e-06, "loss": 0.0898, "step": 1509 }, { "epoch": 1.3665158371040724, "grad_norm": 1.376349624733568, "learning_rate": 5.696647552147589e-06, "loss": 0.1116, "step": 1510 }, { "epoch": 1.367420814479638, "grad_norm": 1.953394660913433, "learning_rate": 5.69195500805078e-06, "loss": 0.1098, "step": 1511 }, { "epoch": 1.3683257918552036, "grad_norm": 1.5080204326399718, "learning_rate": 5.6872618424978e-06, "loss": 0.086, "step": 1512 }, { "epoch": 1.3692307692307693, "grad_norm": 1.5251895723497495, "learning_rate": 5.682568059703659e-06, "loss": 0.1037, "step": 1513 }, { "epoch": 1.3701357466063349, "grad_norm": 1.918105414459449, "learning_rate": 5.6778736638839206e-06, "loss": 0.1416, "step": 1514 }, { "epoch": 1.3710407239819005, "grad_norm": 1.5410162829010856, "learning_rate": 5.673178659254698e-06, "loss": 0.0941, "step": 1515 }, { "epoch": 1.371945701357466, "grad_norm": 2.2540474244348694, "learning_rate": 5.6684830500326524e-06, "loss": 0.1324, "step": 1516 }, { "epoch": 1.3728506787330317, "grad_norm": 1.6120866620778445, "learning_rate": 5.663786840434989e-06, "loss": 0.0717, "step": 1517 }, { "epoch": 1.3737556561085973, "grad_norm": 2.6405881910094076, "learning_rate": 5.659090034679451e-06, "loss": 0.1414, "step": 1518 }, { "epoch": 1.374660633484163, "grad_norm": 1.7277220499807677, "learning_rate": 5.654392636984314e-06, "loss": 0.1214, "step": 1519 }, { "epoch": 1.3755656108597285, "grad_norm": 1.4169721565029363, "learning_rate": 5.64969465156839e-06, "loss": 0.0885, "step": 1520 }, { "epoch": 1.3764705882352941, "grad_norm": 1.8737896908147893, "learning_rate": 5.644996082651018e-06, "loss": 0.1179, "step": 1521 }, { "epoch": 1.3773755656108597, "grad_norm": 1.5823525793460216, "learning_rate": 5.640296934452057e-06, "loss": 0.0862, "step": 1522 }, { "epoch": 1.3782805429864253, "grad_norm": 1.8089108315400693, "learning_rate": 5.635597211191892e-06, "loss": 0.1152, "step": 1523 }, { "epoch": 1.379185520361991, "grad_norm": 1.4352602253199287, "learning_rate": 5.630896917091421e-06, "loss": 0.108, "step": 1524 }, { "epoch": 1.3800904977375565, "grad_norm": 2.2177124503129395, "learning_rate": 5.626196056372056e-06, "loss": 0.1538, "step": 1525 }, { "epoch": 1.3809954751131222, "grad_norm": 1.3308514074231528, "learning_rate": 5.6214946332557165e-06, "loss": 0.0684, "step": 1526 }, { "epoch": 1.3819004524886878, "grad_norm": 1.5160494210675046, "learning_rate": 5.616792651964827e-06, "loss": 0.1028, "step": 1527 }, { "epoch": 1.3828054298642534, "grad_norm": 1.5529899959351547, "learning_rate": 5.612090116722315e-06, "loss": 0.0854, "step": 1528 }, { "epoch": 1.383710407239819, "grad_norm": 1.219454931133637, "learning_rate": 5.607387031751605e-06, "loss": 0.0633, "step": 1529 }, { "epoch": 1.3846153846153846, "grad_norm": 1.5698334970202938, "learning_rate": 5.6026834012766155e-06, "loss": 0.0936, "step": 1530 }, { "epoch": 1.3855203619909502, "grad_norm": 2.2057696075849536, "learning_rate": 5.597979229521754e-06, "loss": 0.0948, "step": 1531 }, { "epoch": 1.3864253393665158, "grad_norm": 1.2584853338157567, "learning_rate": 5.593274520711914e-06, "loss": 0.0913, "step": 1532 }, { "epoch": 1.3873303167420814, "grad_norm": 2.4092689510352225, "learning_rate": 5.588569279072471e-06, "loss": 0.19, "step": 1533 }, { "epoch": 1.388235294117647, "grad_norm": 1.2039287828758718, "learning_rate": 5.583863508829281e-06, "loss": 0.0626, "step": 1534 }, { "epoch": 1.3891402714932126, "grad_norm": 2.0699522056028865, "learning_rate": 5.579157214208675e-06, "loss": 0.1524, "step": 1535 }, { "epoch": 1.3900452488687782, "grad_norm": 1.9624420695204696, "learning_rate": 5.574450399437452e-06, "loss": 0.1111, "step": 1536 }, { "epoch": 1.3909502262443438, "grad_norm": 2.3232033860295744, "learning_rate": 5.56974306874288e-06, "loss": 0.1527, "step": 1537 }, { "epoch": 1.3918552036199094, "grad_norm": 1.7543510487794312, "learning_rate": 5.565035226352688e-06, "loss": 0.1142, "step": 1538 }, { "epoch": 1.392760180995475, "grad_norm": 3.0758757001553496, "learning_rate": 5.56032687649507e-06, "loss": 0.1769, "step": 1539 }, { "epoch": 1.3936651583710407, "grad_norm": 1.8697025327241696, "learning_rate": 5.555618023398671e-06, "loss": 0.1572, "step": 1540 }, { "epoch": 1.3945701357466063, "grad_norm": 1.9808205565000498, "learning_rate": 5.550908671292591e-06, "loss": 0.1374, "step": 1541 }, { "epoch": 1.3954751131221719, "grad_norm": 1.8316068295762395, "learning_rate": 5.546198824406373e-06, "loss": 0.1159, "step": 1542 }, { "epoch": 1.3963800904977375, "grad_norm": 1.1679638132544228, "learning_rate": 5.5414884869700104e-06, "loss": 0.0707, "step": 1543 }, { "epoch": 1.397285067873303, "grad_norm": 1.775568504929065, "learning_rate": 5.5367776632139335e-06, "loss": 0.1091, "step": 1544 }, { "epoch": 1.3981900452488687, "grad_norm": 1.7666008964032531, "learning_rate": 5.532066357369012e-06, "loss": 0.1018, "step": 1545 }, { "epoch": 1.3990950226244343, "grad_norm": 1.390716012010575, "learning_rate": 5.527354573666546e-06, "loss": 0.1069, "step": 1546 }, { "epoch": 1.4, "grad_norm": 1.3107938994249062, "learning_rate": 5.522642316338268e-06, "loss": 0.0698, "step": 1547 }, { "epoch": 1.4009049773755655, "grad_norm": 1.9918844307773487, "learning_rate": 5.517929589616331e-06, "loss": 0.1134, "step": 1548 }, { "epoch": 1.4018099547511311, "grad_norm": 1.5003608567634896, "learning_rate": 5.513216397733314e-06, "loss": 0.107, "step": 1549 }, { "epoch": 1.4027149321266967, "grad_norm": 1.695414025390821, "learning_rate": 5.508502744922212e-06, "loss": 0.1143, "step": 1550 }, { "epoch": 1.4036199095022623, "grad_norm": 1.322008834697294, "learning_rate": 5.503788635416432e-06, "loss": 0.1023, "step": 1551 }, { "epoch": 1.404524886877828, "grad_norm": 1.7324007541167545, "learning_rate": 5.499074073449796e-06, "loss": 0.1036, "step": 1552 }, { "epoch": 1.4054298642533936, "grad_norm": 1.2561996635203372, "learning_rate": 5.494359063256529e-06, "loss": 0.0773, "step": 1553 }, { "epoch": 1.4063348416289592, "grad_norm": 32.69795788540405, "learning_rate": 5.489643609071259e-06, "loss": 0.2805, "step": 1554 }, { "epoch": 1.4072398190045248, "grad_norm": 1.967268730610222, "learning_rate": 5.484927715129011e-06, "loss": 0.1372, "step": 1555 }, { "epoch": 1.4081447963800904, "grad_norm": 1.5813595879606228, "learning_rate": 5.48021138566521e-06, "loss": 0.0933, "step": 1556 }, { "epoch": 1.409049773755656, "grad_norm": 3.8848726528255897, "learning_rate": 5.475494624915668e-06, "loss": 0.1575, "step": 1557 }, { "epoch": 1.4099547511312216, "grad_norm": 1.282830950153243, "learning_rate": 5.470777437116585e-06, "loss": 0.0766, "step": 1558 }, { "epoch": 1.4108597285067872, "grad_norm": 1.2144456031310586, "learning_rate": 5.466059826504544e-06, "loss": 0.0608, "step": 1559 }, { "epoch": 1.4117647058823528, "grad_norm": 1.3550795605435935, "learning_rate": 5.46134179731651e-06, "loss": 0.0715, "step": 1560 }, { "epoch": 1.4126696832579184, "grad_norm": 3.3401895695139823, "learning_rate": 5.4566233537898225e-06, "loss": 0.1827, "step": 1561 }, { "epoch": 1.413574660633484, "grad_norm": 1.1841446427806928, "learning_rate": 5.451904500162194e-06, "loss": 0.0669, "step": 1562 }, { "epoch": 1.4144796380090499, "grad_norm": 1.245836095796603, "learning_rate": 5.447185240671703e-06, "loss": 0.0842, "step": 1563 }, { "epoch": 1.4153846153846155, "grad_norm": 1.8046622057267887, "learning_rate": 5.442465579556793e-06, "loss": 0.0905, "step": 1564 }, { "epoch": 1.416289592760181, "grad_norm": 1.6522476733835365, "learning_rate": 5.437745521056272e-06, "loss": 0.0912, "step": 1565 }, { "epoch": 1.4171945701357467, "grad_norm": 1.3590658264412125, "learning_rate": 5.433025069409301e-06, "loss": 0.0926, "step": 1566 }, { "epoch": 1.4180995475113123, "grad_norm": 1.4037753683993075, "learning_rate": 5.428304228855395e-06, "loss": 0.0818, "step": 1567 }, { "epoch": 1.419004524886878, "grad_norm": 1.7867304273648141, "learning_rate": 5.423583003634417e-06, "loss": 0.1031, "step": 1568 }, { "epoch": 1.4199095022624435, "grad_norm": 1.3962174679770947, "learning_rate": 5.418861397986581e-06, "loss": 0.1078, "step": 1569 }, { "epoch": 1.420814479638009, "grad_norm": 2.4407891416068677, "learning_rate": 5.414139416152435e-06, "loss": 0.1588, "step": 1570 }, { "epoch": 1.4217194570135747, "grad_norm": 1.8268142209315694, "learning_rate": 5.409417062372871e-06, "loss": 0.1021, "step": 1571 }, { "epoch": 1.4226244343891403, "grad_norm": 2.391006539715981, "learning_rate": 5.404694340889111e-06, "loss": 0.1744, "step": 1572 }, { "epoch": 1.423529411764706, "grad_norm": 1.5320956250460507, "learning_rate": 5.399971255942708e-06, "loss": 0.0876, "step": 1573 }, { "epoch": 1.4244343891402715, "grad_norm": 1.294456367918225, "learning_rate": 5.395247811775545e-06, "loss": 0.0823, "step": 1574 }, { "epoch": 1.4253393665158371, "grad_norm": 1.3300284738373755, "learning_rate": 5.390524012629824e-06, "loss": 0.0765, "step": 1575 }, { "epoch": 1.4262443438914028, "grad_norm": 1.295273312477959, "learning_rate": 5.3857998627480655e-06, "loss": 0.0767, "step": 1576 }, { "epoch": 1.4271493212669684, "grad_norm": 1.7443310949806565, "learning_rate": 5.381075366373105e-06, "loss": 0.0972, "step": 1577 }, { "epoch": 1.428054298642534, "grad_norm": 1.5181534547163391, "learning_rate": 5.376350527748094e-06, "loss": 0.0884, "step": 1578 }, { "epoch": 1.4289592760180996, "grad_norm": 1.1031072414172505, "learning_rate": 5.371625351116486e-06, "loss": 0.0614, "step": 1579 }, { "epoch": 1.4298642533936652, "grad_norm": 1.5351727950918794, "learning_rate": 5.3668998407220385e-06, "loss": 0.1109, "step": 1580 }, { "epoch": 1.4307692307692308, "grad_norm": 1.7058653448331842, "learning_rate": 5.362174000808813e-06, "loss": 0.0937, "step": 1581 }, { "epoch": 1.4316742081447964, "grad_norm": 2.1147094164918125, "learning_rate": 5.35744783562116e-06, "loss": 0.1432, "step": 1582 }, { "epoch": 1.432579185520362, "grad_norm": 1.7147789862712488, "learning_rate": 5.35272134940373e-06, "loss": 0.109, "step": 1583 }, { "epoch": 1.4334841628959276, "grad_norm": 1.4959593305659629, "learning_rate": 5.347994546401457e-06, "loss": 0.1011, "step": 1584 }, { "epoch": 1.4343891402714932, "grad_norm": 2.0577829254849305, "learning_rate": 5.343267430859559e-06, "loss": 0.1309, "step": 1585 }, { "epoch": 1.4352941176470588, "grad_norm": 1.9518740598566202, "learning_rate": 5.338540007023538e-06, "loss": 0.1353, "step": 1586 }, { "epoch": 1.4361990950226244, "grad_norm": 1.4262368785194892, "learning_rate": 5.333812279139169e-06, "loss": 0.0864, "step": 1587 }, { "epoch": 1.43710407239819, "grad_norm": 1.820036612100041, "learning_rate": 5.3290842514525046e-06, "loss": 0.0906, "step": 1588 }, { "epoch": 1.4380090497737557, "grad_norm": 1.6876643676097072, "learning_rate": 5.324355928209863e-06, "loss": 0.0987, "step": 1589 }, { "epoch": 1.4389140271493213, "grad_norm": 1.312550577069888, "learning_rate": 5.319627313657829e-06, "loss": 0.0881, "step": 1590 }, { "epoch": 1.4398190045248869, "grad_norm": 1.3023285327771046, "learning_rate": 5.314898412043248e-06, "loss": 0.0596, "step": 1591 }, { "epoch": 1.4407239819004525, "grad_norm": 1.509010967569994, "learning_rate": 5.310169227613226e-06, "loss": 0.1037, "step": 1592 }, { "epoch": 1.441628959276018, "grad_norm": 1.6663060048904017, "learning_rate": 5.305439764615121e-06, "loss": 0.0937, "step": 1593 }, { "epoch": 1.4425339366515837, "grad_norm": 2.2024298364938946, "learning_rate": 5.300710027296542e-06, "loss": 0.0885, "step": 1594 }, { "epoch": 1.4434389140271493, "grad_norm": 2.5445992950378886, "learning_rate": 5.295980019905342e-06, "loss": 0.1524, "step": 1595 }, { "epoch": 1.444343891402715, "grad_norm": 1.7957914977402523, "learning_rate": 5.29124974668962e-06, "loss": 0.0979, "step": 1596 }, { "epoch": 1.4452488687782805, "grad_norm": 1.4023064007768271, "learning_rate": 5.286519211897711e-06, "loss": 0.0981, "step": 1597 }, { "epoch": 1.4461538461538461, "grad_norm": 1.4044902028298643, "learning_rate": 5.281788419778187e-06, "loss": 0.0783, "step": 1598 }, { "epoch": 1.4470588235294117, "grad_norm": 1.4195315948798346, "learning_rate": 5.27705737457985e-06, "loss": 0.079, "step": 1599 }, { "epoch": 1.4479638009049773, "grad_norm": 1.4755756016085562, "learning_rate": 5.272326080551729e-06, "loss": 0.0791, "step": 1600 }, { "epoch": 1.448868778280543, "grad_norm": 1.8515975548496038, "learning_rate": 5.267594541943079e-06, "loss": 0.0945, "step": 1601 }, { "epoch": 1.4497737556561086, "grad_norm": 2.6025968699051703, "learning_rate": 5.262862763003369e-06, "loss": 0.1662, "step": 1602 }, { "epoch": 1.4506787330316742, "grad_norm": 2.1002138321302097, "learning_rate": 5.258130747982291e-06, "loss": 0.1177, "step": 1603 }, { "epoch": 1.4515837104072398, "grad_norm": 1.8192743329244498, "learning_rate": 5.253398501129742e-06, "loss": 0.0848, "step": 1604 }, { "epoch": 1.4524886877828054, "grad_norm": 1.6340248320265534, "learning_rate": 5.248666026695835e-06, "loss": 0.0954, "step": 1605 }, { "epoch": 1.453393665158371, "grad_norm": 2.4153841755881666, "learning_rate": 5.2439333289308795e-06, "loss": 0.1306, "step": 1606 }, { "epoch": 1.4542986425339366, "grad_norm": 1.6188668600791163, "learning_rate": 5.23920041208539e-06, "loss": 0.0988, "step": 1607 }, { "epoch": 1.4552036199095022, "grad_norm": 2.1662383526671274, "learning_rate": 5.234467280410079e-06, "loss": 0.0953, "step": 1608 }, { "epoch": 1.4561085972850678, "grad_norm": 2.102680087755978, "learning_rate": 5.229733938155846e-06, "loss": 0.0886, "step": 1609 }, { "epoch": 1.4570135746606334, "grad_norm": 1.8364184725003787, "learning_rate": 5.2250003895737865e-06, "loss": 0.08, "step": 1610 }, { "epoch": 1.457918552036199, "grad_norm": 1.2145936917616533, "learning_rate": 5.220266638915178e-06, "loss": 0.0758, "step": 1611 }, { "epoch": 1.4588235294117646, "grad_norm": 1.5926556389826558, "learning_rate": 5.2155326904314795e-06, "loss": 0.0984, "step": 1612 }, { "epoch": 1.4597285067873302, "grad_norm": 1.1802887248286138, "learning_rate": 5.210798548374326e-06, "loss": 0.0813, "step": 1613 }, { "epoch": 1.4606334841628958, "grad_norm": 1.4340072911903086, "learning_rate": 5.206064216995532e-06, "loss": 0.0798, "step": 1614 }, { "epoch": 1.4615384615384617, "grad_norm": 9.433344348852527, "learning_rate": 5.201329700547077e-06, "loss": 0.3263, "step": 1615 }, { "epoch": 1.4624434389140273, "grad_norm": 1.5310470327644823, "learning_rate": 5.196595003281107e-06, "loss": 0.1181, "step": 1616 }, { "epoch": 1.463348416289593, "grad_norm": 2.7173635498204054, "learning_rate": 5.191860129449932e-06, "loss": 0.1416, "step": 1617 }, { "epoch": 1.4642533936651585, "grad_norm": 2.054064311321922, "learning_rate": 5.187125083306018e-06, "loss": 0.0909, "step": 1618 }, { "epoch": 1.465158371040724, "grad_norm": 2.07216327697606, "learning_rate": 5.182389869101992e-06, "loss": 0.1711, "step": 1619 }, { "epoch": 1.4660633484162897, "grad_norm": 1.3840064338785383, "learning_rate": 5.177654491090627e-06, "loss": 0.0752, "step": 1620 }, { "epoch": 1.4669683257918553, "grad_norm": 1.6621321309171488, "learning_rate": 5.17291895352484e-06, "loss": 0.1001, "step": 1621 }, { "epoch": 1.467873303167421, "grad_norm": 1.464514941030754, "learning_rate": 5.168183260657699e-06, "loss": 0.0813, "step": 1622 }, { "epoch": 1.4687782805429865, "grad_norm": 2.256951482426178, "learning_rate": 5.163447416742405e-06, "loss": 0.1302, "step": 1623 }, { "epoch": 1.4696832579185521, "grad_norm": 1.584471747843378, "learning_rate": 5.1587114260323e-06, "loss": 0.0957, "step": 1624 }, { "epoch": 1.4705882352941178, "grad_norm": 1.3719067046688354, "learning_rate": 5.153975292780852e-06, "loss": 0.0806, "step": 1625 }, { "epoch": 1.4714932126696834, "grad_norm": 4.549882450198334, "learning_rate": 5.149239021241663e-06, "loss": 0.1791, "step": 1626 }, { "epoch": 1.472398190045249, "grad_norm": 1.8260029088688592, "learning_rate": 5.144502615668455e-06, "loss": 0.1536, "step": 1627 }, { "epoch": 1.4733031674208146, "grad_norm": 2.0413937255851393, "learning_rate": 5.139766080315073e-06, "loss": 0.1469, "step": 1628 }, { "epoch": 1.4742081447963802, "grad_norm": 1.443695046412948, "learning_rate": 5.135029419435474e-06, "loss": 0.0711, "step": 1629 }, { "epoch": 1.4751131221719458, "grad_norm": 1.5778346141585031, "learning_rate": 5.130292637283735e-06, "loss": 0.0739, "step": 1630 }, { "epoch": 1.4760180995475114, "grad_norm": 1.654033651231704, "learning_rate": 5.125555738114035e-06, "loss": 0.1132, "step": 1631 }, { "epoch": 1.476923076923077, "grad_norm": 1.3769906699063068, "learning_rate": 5.120818726180662e-06, "loss": 0.0805, "step": 1632 }, { "epoch": 1.4778280542986426, "grad_norm": 2.1620672753357897, "learning_rate": 5.116081605738005e-06, "loss": 0.1292, "step": 1633 }, { "epoch": 1.4787330316742082, "grad_norm": 1.5653661214379433, "learning_rate": 5.11134438104055e-06, "loss": 0.0857, "step": 1634 }, { "epoch": 1.4796380090497738, "grad_norm": 1.8380566882529024, "learning_rate": 5.1066070563428736e-06, "loss": 0.1101, "step": 1635 }, { "epoch": 1.4805429864253394, "grad_norm": 1.7756981141867398, "learning_rate": 5.1018696358996475e-06, "loss": 0.0956, "step": 1636 }, { "epoch": 1.481447963800905, "grad_norm": 1.1833374704802906, "learning_rate": 5.097132123965628e-06, "loss": 0.0621, "step": 1637 }, { "epoch": 1.4823529411764707, "grad_norm": 3.76993401327604, "learning_rate": 5.09239452479565e-06, "loss": 0.2252, "step": 1638 }, { "epoch": 1.4832579185520363, "grad_norm": 1.7235115611377918, "learning_rate": 5.0876568426446306e-06, "loss": 0.0997, "step": 1639 }, { "epoch": 1.4841628959276019, "grad_norm": 1.2549938732236012, "learning_rate": 5.082919081767558e-06, "loss": 0.0773, "step": 1640 }, { "epoch": 1.4850678733031675, "grad_norm": 2.1587495454617702, "learning_rate": 5.0781812464194955e-06, "loss": 0.106, "step": 1641 }, { "epoch": 1.485972850678733, "grad_norm": 2.3726670438648227, "learning_rate": 5.073443340855571e-06, "loss": 0.1385, "step": 1642 }, { "epoch": 1.4868778280542987, "grad_norm": 1.8396084561537023, "learning_rate": 5.068705369330974e-06, "loss": 0.1058, "step": 1643 }, { "epoch": 1.4877828054298643, "grad_norm": 1.6055365804688548, "learning_rate": 5.063967336100955e-06, "loss": 0.0966, "step": 1644 }, { "epoch": 1.48868778280543, "grad_norm": 1.572106533318075, "learning_rate": 5.059229245420819e-06, "loss": 0.1175, "step": 1645 }, { "epoch": 1.4895927601809955, "grad_norm": 1.2615364264769493, "learning_rate": 5.054491101545924e-06, "loss": 0.0849, "step": 1646 }, { "epoch": 1.4904977375565611, "grad_norm": 1.5294288725754515, "learning_rate": 5.049752908731676e-06, "loss": 0.0889, "step": 1647 }, { "epoch": 1.4914027149321267, "grad_norm": 2.04912328235146, "learning_rate": 5.045014671233521e-06, "loss": 0.1341, "step": 1648 }, { "epoch": 1.4923076923076923, "grad_norm": 1.7048461594596955, "learning_rate": 5.0402763933069496e-06, "loss": 0.0833, "step": 1649 }, { "epoch": 1.493212669683258, "grad_norm": 1.4847731988875996, "learning_rate": 5.035538079207488e-06, "loss": 0.0807, "step": 1650 }, { "epoch": 1.4941176470588236, "grad_norm": 2.127553645597984, "learning_rate": 5.030799733190694e-06, "loss": 0.148, "step": 1651 }, { "epoch": 1.4950226244343892, "grad_norm": 1.1841580823245648, "learning_rate": 5.026061359512152e-06, "loss": 0.0837, "step": 1652 }, { "epoch": 1.4959276018099548, "grad_norm": 1.577718326536623, "learning_rate": 5.021322962427475e-06, "loss": 0.0814, "step": 1653 }, { "epoch": 1.4968325791855204, "grad_norm": 1.474559078869548, "learning_rate": 5.0165845461922934e-06, "loss": 0.0952, "step": 1654 }, { "epoch": 1.497737556561086, "grad_norm": 1.2610598105667654, "learning_rate": 5.01184611506226e-06, "loss": 0.0788, "step": 1655 }, { "epoch": 1.4986425339366516, "grad_norm": 1.4339098757435058, "learning_rate": 5.007107673293035e-06, "loss": 0.0854, "step": 1656 }, { "epoch": 1.4995475113122172, "grad_norm": 1.9217783610133155, "learning_rate": 5.002369225140293e-06, "loss": 0.1463, "step": 1657 }, { "epoch": 1.5004524886877828, "grad_norm": 1.9449316922279258, "learning_rate": 4.997630774859708e-06, "loss": 0.1152, "step": 1658 }, { "epoch": 1.5013574660633484, "grad_norm": 1.5520349721460054, "learning_rate": 4.9928923267069655e-06, "loss": 0.1182, "step": 1659 }, { "epoch": 1.502262443438914, "grad_norm": 1.7011548478155767, "learning_rate": 4.988153884937742e-06, "loss": 0.1119, "step": 1660 }, { "epoch": 1.5031674208144796, "grad_norm": 1.644855134437417, "learning_rate": 4.983415453807707e-06, "loss": 0.102, "step": 1661 }, { "epoch": 1.5040723981900452, "grad_norm": 1.399449141276884, "learning_rate": 4.978677037572526e-06, "loss": 0.0868, "step": 1662 }, { "epoch": 1.5049773755656108, "grad_norm": 2.3319208922519805, "learning_rate": 4.973938640487849e-06, "loss": 0.0976, "step": 1663 }, { "epoch": 1.5058823529411764, "grad_norm": 1.5261342834755043, "learning_rate": 4.9692002668093075e-06, "loss": 0.0821, "step": 1664 }, { "epoch": 1.506787330316742, "grad_norm": 1.3381991373090063, "learning_rate": 4.964461920792512e-06, "loss": 0.0831, "step": 1665 }, { "epoch": 1.5076923076923077, "grad_norm": 1.4948686915982217, "learning_rate": 4.959723606693051e-06, "loss": 0.0835, "step": 1666 }, { "epoch": 1.5085972850678733, "grad_norm": 1.5119247042029265, "learning_rate": 4.954985328766479e-06, "loss": 0.0838, "step": 1667 }, { "epoch": 1.5095022624434389, "grad_norm": 2.390716064621844, "learning_rate": 4.950247091268326e-06, "loss": 0.117, "step": 1668 }, { "epoch": 1.5104072398190045, "grad_norm": 1.2207647534900592, "learning_rate": 4.945508898454078e-06, "loss": 0.0821, "step": 1669 }, { "epoch": 1.51131221719457, "grad_norm": 1.621549637145539, "learning_rate": 4.940770754579183e-06, "loss": 0.0992, "step": 1670 }, { "epoch": 1.5122171945701357, "grad_norm": 1.5368806514587756, "learning_rate": 4.936032663899046e-06, "loss": 0.096, "step": 1671 }, { "epoch": 1.5131221719457013, "grad_norm": 1.2625578516156628, "learning_rate": 4.931294630669027e-06, "loss": 0.0837, "step": 1672 }, { "epoch": 1.514027149321267, "grad_norm": 1.623174521166372, "learning_rate": 4.926556659144431e-06, "loss": 0.1057, "step": 1673 }, { "epoch": 1.5149321266968325, "grad_norm": 1.5312395671024743, "learning_rate": 4.921818753580505e-06, "loss": 0.0865, "step": 1674 }, { "epoch": 1.5158371040723981, "grad_norm": 1.3241028070449834, "learning_rate": 4.917080918232444e-06, "loss": 0.0776, "step": 1675 }, { "epoch": 1.5167420814479637, "grad_norm": 1.5327042758605487, "learning_rate": 4.912343157355371e-06, "loss": 0.083, "step": 1676 }, { "epoch": 1.5176470588235293, "grad_norm": 1.5479331814581498, "learning_rate": 4.907605475204352e-06, "loss": 0.0964, "step": 1677 }, { "epoch": 1.518552036199095, "grad_norm": 1.5000722989435682, "learning_rate": 4.902867876034374e-06, "loss": 0.0823, "step": 1678 }, { "epoch": 1.5194570135746606, "grad_norm": 1.6824700551964489, "learning_rate": 4.898130364100353e-06, "loss": 0.1193, "step": 1679 }, { "epoch": 1.5203619909502262, "grad_norm": 1.7004262502542973, "learning_rate": 4.893392943657127e-06, "loss": 0.1174, "step": 1680 }, { "epoch": 1.5212669683257918, "grad_norm": 1.2689139631740343, "learning_rate": 4.888655618959453e-06, "loss": 0.07, "step": 1681 }, { "epoch": 1.5221719457013574, "grad_norm": 1.7283884654957742, "learning_rate": 4.883918394261997e-06, "loss": 0.1167, "step": 1682 }, { "epoch": 1.523076923076923, "grad_norm": 1.3824922079707083, "learning_rate": 4.87918127381934e-06, "loss": 0.0765, "step": 1683 }, { "epoch": 1.5239819004524886, "grad_norm": 1.5974275240088145, "learning_rate": 4.874444261885967e-06, "loss": 0.1165, "step": 1684 }, { "epoch": 1.5248868778280542, "grad_norm": 1.473876944805113, "learning_rate": 4.8697073627162675e-06, "loss": 0.0801, "step": 1685 }, { "epoch": 1.5257918552036198, "grad_norm": 1.5637131819378314, "learning_rate": 4.8649705805645276e-06, "loss": 0.0831, "step": 1686 }, { "epoch": 1.5266968325791854, "grad_norm": 1.7900601211103742, "learning_rate": 4.86023391968493e-06, "loss": 0.1332, "step": 1687 }, { "epoch": 1.527601809954751, "grad_norm": 1.4317727514027143, "learning_rate": 4.8554973843315475e-06, "loss": 0.0994, "step": 1688 }, { "epoch": 1.5285067873303166, "grad_norm": 1.4816758136418906, "learning_rate": 4.850760978758338e-06, "loss": 0.0845, "step": 1689 }, { "epoch": 1.5294117647058822, "grad_norm": 1.0523802726294447, "learning_rate": 4.846024707219149e-06, "loss": 0.0625, "step": 1690 }, { "epoch": 1.5303167420814479, "grad_norm": 1.7637694736890777, "learning_rate": 4.841288573967703e-06, "loss": 0.1255, "step": 1691 }, { "epoch": 1.5312217194570135, "grad_norm": 2.220474373012984, "learning_rate": 4.836552583257597e-06, "loss": 0.1404, "step": 1692 }, { "epoch": 1.532126696832579, "grad_norm": 1.4167344847028724, "learning_rate": 4.831816739342303e-06, "loss": 0.0848, "step": 1693 }, { "epoch": 1.5330316742081447, "grad_norm": 1.5908020124706828, "learning_rate": 4.827081046475162e-06, "loss": 0.1025, "step": 1694 }, { "epoch": 1.5339366515837103, "grad_norm": 1.8986675636937735, "learning_rate": 4.822345508909376e-06, "loss": 0.1074, "step": 1695 }, { "epoch": 1.534841628959276, "grad_norm": 1.7256648950962326, "learning_rate": 4.8176101308980095e-06, "loss": 0.0711, "step": 1696 }, { "epoch": 1.5357466063348415, "grad_norm": 1.4878093957552372, "learning_rate": 4.812874916693984e-06, "loss": 0.0837, "step": 1697 }, { "epoch": 1.536651583710407, "grad_norm": 1.3408100524213644, "learning_rate": 4.8081398705500706e-06, "loss": 0.0925, "step": 1698 }, { "epoch": 1.5375565610859727, "grad_norm": 1.6551934458799866, "learning_rate": 4.803404996718896e-06, "loss": 0.0892, "step": 1699 }, { "epoch": 1.5384615384615383, "grad_norm": 1.470334430328486, "learning_rate": 4.798670299452926e-06, "loss": 0.0798, "step": 1700 }, { "epoch": 1.539366515837104, "grad_norm": 1.0009865711095576, "learning_rate": 4.7939357830044705e-06, "loss": 0.0601, "step": 1701 }, { "epoch": 1.5402714932126695, "grad_norm": 1.5851008695919169, "learning_rate": 4.789201451625675e-06, "loss": 0.1098, "step": 1702 }, { "epoch": 1.5411764705882351, "grad_norm": 1.3118693305255968, "learning_rate": 4.784467309568524e-06, "loss": 0.0801, "step": 1703 }, { "epoch": 1.5420814479638008, "grad_norm": 1.4041669536852508, "learning_rate": 4.779733361084825e-06, "loss": 0.1009, "step": 1704 }, { "epoch": 1.5429864253393664, "grad_norm": 2.6308210754726797, "learning_rate": 4.774999610426216e-06, "loss": 0.1338, "step": 1705 }, { "epoch": 1.543891402714932, "grad_norm": 1.7379260894597146, "learning_rate": 4.770266061844157e-06, "loss": 0.1286, "step": 1706 }, { "epoch": 1.5447963800904976, "grad_norm": 1.3692195589055736, "learning_rate": 4.765532719589925e-06, "loss": 0.1007, "step": 1707 }, { "epoch": 1.5457013574660632, "grad_norm": 1.7787020668153601, "learning_rate": 4.760799587914612e-06, "loss": 0.0945, "step": 1708 }, { "epoch": 1.5466063348416288, "grad_norm": 1.8910946255544865, "learning_rate": 4.756066671069124e-06, "loss": 0.1193, "step": 1709 }, { "epoch": 1.5475113122171946, "grad_norm": 2.508450705156525, "learning_rate": 4.751333973304166e-06, "loss": 0.1948, "step": 1710 }, { "epoch": 1.5484162895927602, "grad_norm": 1.6765273326562742, "learning_rate": 4.746601498870258e-06, "loss": 0.0838, "step": 1711 }, { "epoch": 1.5493212669683258, "grad_norm": 1.545383648151973, "learning_rate": 4.741869252017711e-06, "loss": 0.088, "step": 1712 }, { "epoch": 1.5502262443438914, "grad_norm": 1.5367650774102437, "learning_rate": 4.7371372369966316e-06, "loss": 0.1042, "step": 1713 }, { "epoch": 1.551131221719457, "grad_norm": 1.9656006545345168, "learning_rate": 4.732405458056921e-06, "loss": 0.1213, "step": 1714 }, { "epoch": 1.5520361990950227, "grad_norm": 1.237306681494891, "learning_rate": 4.727673919448271e-06, "loss": 0.0659, "step": 1715 }, { "epoch": 1.5529411764705883, "grad_norm": 1.1564310354086886, "learning_rate": 4.7229426254201504e-06, "loss": 0.0788, "step": 1716 }, { "epoch": 1.5538461538461539, "grad_norm": 1.8124059295014197, "learning_rate": 4.718211580221813e-06, "loss": 0.1053, "step": 1717 }, { "epoch": 1.5547511312217195, "grad_norm": 2.354175595803461, "learning_rate": 4.7134807881022894e-06, "loss": 0.1136, "step": 1718 }, { "epoch": 1.555656108597285, "grad_norm": 1.7132426039686635, "learning_rate": 4.70875025331038e-06, "loss": 0.1284, "step": 1719 }, { "epoch": 1.5565610859728507, "grad_norm": 1.2502910732430998, "learning_rate": 4.704019980094659e-06, "loss": 0.098, "step": 1720 }, { "epoch": 1.5574660633484163, "grad_norm": 2.2584545224003088, "learning_rate": 4.699289972703459e-06, "loss": 0.0816, "step": 1721 }, { "epoch": 1.558371040723982, "grad_norm": 1.8619325905644104, "learning_rate": 4.694560235384879e-06, "loss": 0.1028, "step": 1722 }, { "epoch": 1.5592760180995475, "grad_norm": 1.3539346165395978, "learning_rate": 4.6898307723867745e-06, "loss": 0.0744, "step": 1723 }, { "epoch": 1.5601809954751131, "grad_norm": 1.4058567309286387, "learning_rate": 4.685101587956753e-06, "loss": 0.0827, "step": 1724 }, { "epoch": 1.5610859728506787, "grad_norm": 1.4097267047597115, "learning_rate": 4.680372686342173e-06, "loss": 0.0799, "step": 1725 }, { "epoch": 1.5619909502262443, "grad_norm": 1.1920200587501226, "learning_rate": 4.675644071790138e-06, "loss": 0.0575, "step": 1726 }, { "epoch": 1.56289592760181, "grad_norm": 1.471628701809891, "learning_rate": 4.670915748547496e-06, "loss": 0.0778, "step": 1727 }, { "epoch": 1.5638009049773756, "grad_norm": 1.6008236099132445, "learning_rate": 4.666187720860832e-06, "loss": 0.093, "step": 1728 }, { "epoch": 1.5647058823529412, "grad_norm": 1.7929287092670774, "learning_rate": 4.661459992976463e-06, "loss": 0.1339, "step": 1729 }, { "epoch": 1.5656108597285068, "grad_norm": 1.5489148644665036, "learning_rate": 4.656732569140441e-06, "loss": 0.0944, "step": 1730 }, { "epoch": 1.5665158371040724, "grad_norm": 1.5990956811521226, "learning_rate": 4.652005453598544e-06, "loss": 0.1023, "step": 1731 }, { "epoch": 1.567420814479638, "grad_norm": 1.4196827311970137, "learning_rate": 4.64727865059627e-06, "loss": 0.0762, "step": 1732 }, { "epoch": 1.5683257918552036, "grad_norm": 1.3722354768338103, "learning_rate": 4.6425521643788405e-06, "loss": 0.0757, "step": 1733 }, { "epoch": 1.5692307692307692, "grad_norm": 2.7004384429407144, "learning_rate": 4.637825999191189e-06, "loss": 0.1078, "step": 1734 }, { "epoch": 1.5701357466063348, "grad_norm": 1.3052960844499584, "learning_rate": 4.6331001592779615e-06, "loss": 0.0703, "step": 1735 }, { "epoch": 1.5710407239819004, "grad_norm": 2.099070982661598, "learning_rate": 4.6283746488835155e-06, "loss": 0.1288, "step": 1736 }, { "epoch": 1.571945701357466, "grad_norm": 1.2802279249300015, "learning_rate": 4.623649472251907e-06, "loss": 0.0821, "step": 1737 }, { "epoch": 1.5728506787330316, "grad_norm": 1.527199469278847, "learning_rate": 4.618924633626896e-06, "loss": 0.096, "step": 1738 }, { "epoch": 1.5737556561085972, "grad_norm": 1.2771888074602873, "learning_rate": 4.614200137251935e-06, "loss": 0.0843, "step": 1739 }, { "epoch": 1.5746606334841629, "grad_norm": 1.3926040443004102, "learning_rate": 4.609475987370177e-06, "loss": 0.0679, "step": 1740 }, { "epoch": 1.5755656108597285, "grad_norm": 1.4707779113903008, "learning_rate": 4.604752188224455e-06, "loss": 0.0945, "step": 1741 }, { "epoch": 1.576470588235294, "grad_norm": 1.4223192585976479, "learning_rate": 4.6000287440572925e-06, "loss": 0.104, "step": 1742 }, { "epoch": 1.5773755656108597, "grad_norm": 1.4370648198508178, "learning_rate": 4.5953056591108895e-06, "loss": 0.0921, "step": 1743 }, { "epoch": 1.5782805429864255, "grad_norm": 1.6890605618927357, "learning_rate": 4.59058293762713e-06, "loss": 0.0931, "step": 1744 }, { "epoch": 1.5791855203619911, "grad_norm": 1.9321017384238108, "learning_rate": 4.585860583847566e-06, "loss": 0.1125, "step": 1745 }, { "epoch": 1.5800904977375567, "grad_norm": 1.8510976810224875, "learning_rate": 4.5811386020134205e-06, "loss": 0.107, "step": 1746 }, { "epoch": 1.5809954751131223, "grad_norm": 1.0385060500565562, "learning_rate": 4.576416996365584e-06, "loss": 0.0567, "step": 1747 }, { "epoch": 1.581900452488688, "grad_norm": 1.5801105449784154, "learning_rate": 4.571695771144606e-06, "loss": 0.1225, "step": 1748 }, { "epoch": 1.5828054298642535, "grad_norm": 1.5475125886785388, "learning_rate": 4.5669749305907e-06, "loss": 0.0962, "step": 1749 }, { "epoch": 1.5837104072398192, "grad_norm": 1.6602160841513578, "learning_rate": 4.562254478943729e-06, "loss": 0.0808, "step": 1750 }, { "epoch": 1.5846153846153848, "grad_norm": 1.249464489102752, "learning_rate": 4.557534420443209e-06, "loss": 0.0655, "step": 1751 }, { "epoch": 1.5855203619909504, "grad_norm": 1.3595201033612074, "learning_rate": 4.552814759328299e-06, "loss": 0.0674, "step": 1752 }, { "epoch": 1.586425339366516, "grad_norm": 1.1167825299396126, "learning_rate": 4.5480954998378075e-06, "loss": 0.0704, "step": 1753 }, { "epoch": 1.5873303167420816, "grad_norm": 1.0425324996313272, "learning_rate": 4.543376646210178e-06, "loss": 0.0643, "step": 1754 }, { "epoch": 1.5882352941176472, "grad_norm": 1.673145034421986, "learning_rate": 4.53865820268349e-06, "loss": 0.1141, "step": 1755 }, { "epoch": 1.5891402714932128, "grad_norm": 3.850346946103571, "learning_rate": 4.533940173495457e-06, "loss": 0.2311, "step": 1756 }, { "epoch": 1.5900452488687784, "grad_norm": 1.5834977968468233, "learning_rate": 4.5292225628834165e-06, "loss": 0.0816, "step": 1757 }, { "epoch": 1.590950226244344, "grad_norm": 1.5322842870186826, "learning_rate": 4.524505375084333e-06, "loss": 0.0814, "step": 1758 }, { "epoch": 1.5918552036199096, "grad_norm": 1.375808552568626, "learning_rate": 4.5197886143347905e-06, "loss": 0.0761, "step": 1759 }, { "epoch": 1.5927601809954752, "grad_norm": 1.5236028853017465, "learning_rate": 4.51507228487099e-06, "loss": 0.0888, "step": 1760 }, { "epoch": 1.5936651583710408, "grad_norm": 1.0832736357801944, "learning_rate": 4.5103563909287424e-06, "loss": 0.0654, "step": 1761 }, { "epoch": 1.5945701357466064, "grad_norm": 1.4244624985148349, "learning_rate": 4.505640936743472e-06, "loss": 0.0947, "step": 1762 }, { "epoch": 1.595475113122172, "grad_norm": 1.3456097834182006, "learning_rate": 4.500925926550205e-06, "loss": 0.0723, "step": 1763 }, { "epoch": 1.5963800904977377, "grad_norm": 1.4763765119209764, "learning_rate": 4.49621136458357e-06, "loss": 0.0797, "step": 1764 }, { "epoch": 1.5972850678733033, "grad_norm": 1.997429561479053, "learning_rate": 4.49149725507779e-06, "loss": 0.0942, "step": 1765 }, { "epoch": 1.5981900452488689, "grad_norm": 1.4387231826616114, "learning_rate": 4.486783602266687e-06, "loss": 0.0945, "step": 1766 }, { "epoch": 1.5990950226244345, "grad_norm": 1.1343086253414731, "learning_rate": 4.48207041038367e-06, "loss": 0.0621, "step": 1767 }, { "epoch": 1.6, "grad_norm": 1.7330718283656037, "learning_rate": 4.477357683661734e-06, "loss": 0.1009, "step": 1768 }, { "epoch": 1.6009049773755657, "grad_norm": 1.9601430699657223, "learning_rate": 4.472645426333455e-06, "loss": 0.1625, "step": 1769 }, { "epoch": 1.6018099547511313, "grad_norm": 1.340332612987568, "learning_rate": 4.467933642630989e-06, "loss": 0.0681, "step": 1770 }, { "epoch": 1.602714932126697, "grad_norm": 1.876481140009925, "learning_rate": 4.463222336786067e-06, "loss": 0.1009, "step": 1771 }, { "epoch": 1.6036199095022625, "grad_norm": 1.5373375003989744, "learning_rate": 4.458511513029991e-06, "loss": 0.0794, "step": 1772 }, { "epoch": 1.6045248868778281, "grad_norm": 1.4881864843623696, "learning_rate": 4.45380117559363e-06, "loss": 0.111, "step": 1773 }, { "epoch": 1.6054298642533937, "grad_norm": 1.5579259914279224, "learning_rate": 4.449091328707411e-06, "loss": 0.0841, "step": 1774 }, { "epoch": 1.6063348416289593, "grad_norm": 1.4371062680317737, "learning_rate": 4.44438197660133e-06, "loss": 0.0865, "step": 1775 }, { "epoch": 1.607239819004525, "grad_norm": 2.159069549962715, "learning_rate": 4.439673123504931e-06, "loss": 0.1777, "step": 1776 }, { "epoch": 1.6081447963800906, "grad_norm": 1.2202653320494183, "learning_rate": 4.4349647736473135e-06, "loss": 0.0803, "step": 1777 }, { "epoch": 1.6090497737556562, "grad_norm": 2.3774196599233828, "learning_rate": 4.430256931257122e-06, "loss": 0.119, "step": 1778 }, { "epoch": 1.6099547511312218, "grad_norm": 1.5700704949712025, "learning_rate": 4.425549600562549e-06, "loss": 0.0935, "step": 1779 }, { "epoch": 1.6108597285067874, "grad_norm": 1.565236062847299, "learning_rate": 4.420842785791326e-06, "loss": 0.1117, "step": 1780 }, { "epoch": 1.611764705882353, "grad_norm": 1.6558904530206415, "learning_rate": 4.41613649117072e-06, "loss": 0.0888, "step": 1781 }, { "epoch": 1.6126696832579186, "grad_norm": 1.6367560033121005, "learning_rate": 4.411430720927531e-06, "loss": 0.0991, "step": 1782 }, { "epoch": 1.6135746606334842, "grad_norm": 2.585509737163265, "learning_rate": 4.406725479288087e-06, "loss": 0.1839, "step": 1783 }, { "epoch": 1.6144796380090498, "grad_norm": 1.6301144770828926, "learning_rate": 4.402020770478248e-06, "loss": 0.0915, "step": 1784 }, { "epoch": 1.6153846153846154, "grad_norm": 1.7727555535528643, "learning_rate": 4.397316598723385e-06, "loss": 0.1323, "step": 1785 }, { "epoch": 1.616289592760181, "grad_norm": 1.2560671424332388, "learning_rate": 4.392612968248396e-06, "loss": 0.0582, "step": 1786 }, { "epoch": 1.6171945701357466, "grad_norm": 1.647610988123561, "learning_rate": 4.3879098832776865e-06, "loss": 0.0784, "step": 1787 }, { "epoch": 1.6180995475113122, "grad_norm": 1.3834065542392802, "learning_rate": 4.383207348035175e-06, "loss": 0.0913, "step": 1788 }, { "epoch": 1.6190045248868778, "grad_norm": 1.624261294889478, "learning_rate": 4.378505366744286e-06, "loss": 0.1187, "step": 1789 }, { "epoch": 1.6199095022624435, "grad_norm": 2.0650536245527493, "learning_rate": 4.373803943627946e-06, "loss": 0.1151, "step": 1790 }, { "epoch": 1.620814479638009, "grad_norm": 1.800288008942424, "learning_rate": 4.369103082908581e-06, "loss": 0.1212, "step": 1791 }, { "epoch": 1.6217194570135747, "grad_norm": 1.7202972437167798, "learning_rate": 4.364402788808109e-06, "loss": 0.1038, "step": 1792 }, { "epoch": 1.6226244343891403, "grad_norm": 1.3587099413410626, "learning_rate": 4.359703065547944e-06, "loss": 0.0794, "step": 1793 }, { "epoch": 1.6235294117647059, "grad_norm": 1.6401457963612107, "learning_rate": 4.355003917348985e-06, "loss": 0.0932, "step": 1794 }, { "epoch": 1.6244343891402715, "grad_norm": 1.2062667232189015, "learning_rate": 4.350305348431612e-06, "loss": 0.0683, "step": 1795 }, { "epoch": 1.625339366515837, "grad_norm": 1.6483115568291349, "learning_rate": 4.345607363015688e-06, "loss": 0.0753, "step": 1796 }, { "epoch": 1.6262443438914027, "grad_norm": 1.2862940743135423, "learning_rate": 4.340909965320552e-06, "loss": 0.0718, "step": 1797 }, { "epoch": 1.6271493212669683, "grad_norm": 1.4795507395491703, "learning_rate": 4.336213159565013e-06, "loss": 0.0692, "step": 1798 }, { "epoch": 1.628054298642534, "grad_norm": 1.6069438156292684, "learning_rate": 4.33151694996735e-06, "loss": 0.1365, "step": 1799 }, { "epoch": 1.6289592760180995, "grad_norm": 1.3962804917238467, "learning_rate": 4.326821340745304e-06, "loss": 0.0807, "step": 1800 }, { "epoch": 1.6298642533936651, "grad_norm": 1.345546330746348, "learning_rate": 4.322126336116082e-06, "loss": 0.101, "step": 1801 }, { "epoch": 1.6307692307692307, "grad_norm": 1.5775368873443723, "learning_rate": 4.3174319402963436e-06, "loss": 0.0772, "step": 1802 }, { "epoch": 1.6316742081447964, "grad_norm": 1.6069434900506872, "learning_rate": 4.312738157502203e-06, "loss": 0.089, "step": 1803 }, { "epoch": 1.632579185520362, "grad_norm": 1.2416745435488084, "learning_rate": 4.308044991949223e-06, "loss": 0.0677, "step": 1804 }, { "epoch": 1.6334841628959276, "grad_norm": 1.5505927851598165, "learning_rate": 4.303352447852412e-06, "loss": 0.0695, "step": 1805 }, { "epoch": 1.6343891402714932, "grad_norm": 1.280841651322394, "learning_rate": 4.298660529426223e-06, "loss": 0.0771, "step": 1806 }, { "epoch": 1.6352941176470588, "grad_norm": 1.2649495242443964, "learning_rate": 4.293969240884545e-06, "loss": 0.0653, "step": 1807 }, { "epoch": 1.6361990950226244, "grad_norm": 1.6257279970228797, "learning_rate": 4.2892785864407e-06, "loss": 0.0983, "step": 1808 }, { "epoch": 1.63710407239819, "grad_norm": 1.3175255726008444, "learning_rate": 4.284588570307442e-06, "loss": 0.0687, "step": 1809 }, { "epoch": 1.6380090497737556, "grad_norm": 1.2394168799984386, "learning_rate": 4.279899196696953e-06, "loss": 0.0696, "step": 1810 }, { "epoch": 1.6389140271493212, "grad_norm": 1.3552747212113905, "learning_rate": 4.275210469820835e-06, "loss": 0.0699, "step": 1811 }, { "epoch": 1.6398190045248868, "grad_norm": 1.7554730254274875, "learning_rate": 4.270522393890112e-06, "loss": 0.1073, "step": 1812 }, { "epoch": 1.6407239819004524, "grad_norm": 1.548768629321513, "learning_rate": 4.265834973115219e-06, "loss": 0.1087, "step": 1813 }, { "epoch": 1.641628959276018, "grad_norm": 1.3030258120741023, "learning_rate": 4.261148211706011e-06, "loss": 0.088, "step": 1814 }, { "epoch": 1.6425339366515836, "grad_norm": 1.581513298880004, "learning_rate": 4.256462113871741e-06, "loss": 0.0946, "step": 1815 }, { "epoch": 1.6434389140271493, "grad_norm": 2.1729412107485198, "learning_rate": 4.251776683821073e-06, "loss": 0.1572, "step": 1816 }, { "epoch": 1.6443438914027149, "grad_norm": 1.6128095190559597, "learning_rate": 4.2470919257620686e-06, "loss": 0.096, "step": 1817 }, { "epoch": 1.6452488687782805, "grad_norm": 2.0227883867112757, "learning_rate": 4.242407843902185e-06, "loss": 0.1501, "step": 1818 }, { "epoch": 1.646153846153846, "grad_norm": 1.9277512206421903, "learning_rate": 4.237724442448273e-06, "loss": 0.1164, "step": 1819 }, { "epoch": 1.6470588235294117, "grad_norm": 1.4206317656145246, "learning_rate": 4.233041725606573e-06, "loss": 0.0787, "step": 1820 }, { "epoch": 1.6479638009049773, "grad_norm": 2.037884641217024, "learning_rate": 4.22835969758271e-06, "loss": 0.1359, "step": 1821 }, { "epoch": 1.648868778280543, "grad_norm": 1.777164164173728, "learning_rate": 4.2236783625816905e-06, "loss": 0.1113, "step": 1822 }, { "epoch": 1.6497737556561085, "grad_norm": 1.2151353606818325, "learning_rate": 4.218997724807899e-06, "loss": 0.0845, "step": 1823 }, { "epoch": 1.6506787330316741, "grad_norm": 1.3535918760025882, "learning_rate": 4.21431778846509e-06, "loss": 0.0896, "step": 1824 }, { "epoch": 1.6515837104072397, "grad_norm": 1.324478875093495, "learning_rate": 4.209638557756396e-06, "loss": 0.086, "step": 1825 }, { "epoch": 1.6524886877828053, "grad_norm": 1.6809515850732655, "learning_rate": 4.204960036884307e-06, "loss": 0.1123, "step": 1826 }, { "epoch": 1.653393665158371, "grad_norm": 1.775096043192863, "learning_rate": 4.200282230050683e-06, "loss": 0.1022, "step": 1827 }, { "epoch": 1.6542986425339365, "grad_norm": 1.5155850834796383, "learning_rate": 4.195605141456736e-06, "loss": 0.0845, "step": 1828 }, { "epoch": 1.6552036199095022, "grad_norm": 1.943826239817483, "learning_rate": 4.190928775303038e-06, "loss": 0.1019, "step": 1829 }, { "epoch": 1.6561085972850678, "grad_norm": 1.9273628777989928, "learning_rate": 4.186253135789511e-06, "loss": 0.1038, "step": 1830 }, { "epoch": 1.6570135746606334, "grad_norm": 1.500103947574699, "learning_rate": 4.1815782271154224e-06, "loss": 0.0802, "step": 1831 }, { "epoch": 1.657918552036199, "grad_norm": 1.5728521533543132, "learning_rate": 4.176904053479385e-06, "loss": 0.0958, "step": 1832 }, { "epoch": 1.6588235294117646, "grad_norm": 1.356747588264187, "learning_rate": 4.17223061907935e-06, "loss": 0.0777, "step": 1833 }, { "epoch": 1.6597285067873302, "grad_norm": 1.5285960312989482, "learning_rate": 4.1675579281126075e-06, "loss": 0.0893, "step": 1834 }, { "epoch": 1.6606334841628958, "grad_norm": 1.650656583600986, "learning_rate": 4.162885984775777e-06, "loss": 0.0985, "step": 1835 }, { "epoch": 1.6615384615384614, "grad_norm": 1.225928306769829, "learning_rate": 4.158214793264808e-06, "loss": 0.0629, "step": 1836 }, { "epoch": 1.662443438914027, "grad_norm": 1.2465546657008306, "learning_rate": 4.153544357774973e-06, "loss": 0.0672, "step": 1837 }, { "epoch": 1.6633484162895926, "grad_norm": 1.6585985224308202, "learning_rate": 4.148874682500869e-06, "loss": 0.0848, "step": 1838 }, { "epoch": 1.6642533936651582, "grad_norm": 1.5174503515312605, "learning_rate": 4.144205771636407e-06, "loss": 0.0867, "step": 1839 }, { "epoch": 1.6651583710407238, "grad_norm": 1.2737616020384055, "learning_rate": 4.139537629374814e-06, "loss": 0.0731, "step": 1840 }, { "epoch": 1.6660633484162894, "grad_norm": 1.354218393179032, "learning_rate": 4.134870259908623e-06, "loss": 0.0842, "step": 1841 }, { "epoch": 1.666968325791855, "grad_norm": 1.3595778990550529, "learning_rate": 4.130203667429676e-06, "loss": 0.0787, "step": 1842 }, { "epoch": 1.6678733031674207, "grad_norm": 2.0499223477556883, "learning_rate": 4.125537856129117e-06, "loss": 0.1006, "step": 1843 }, { "epoch": 1.6687782805429863, "grad_norm": 1.6738803682340462, "learning_rate": 4.120872830197389e-06, "loss": 0.076, "step": 1844 }, { "epoch": 1.6696832579185519, "grad_norm": 1.8032672629415174, "learning_rate": 4.116208593824227e-06, "loss": 0.1227, "step": 1845 }, { "epoch": 1.6705882352941175, "grad_norm": 1.164560649416901, "learning_rate": 4.111545151198657e-06, "loss": 0.0642, "step": 1846 }, { "epoch": 1.671493212669683, "grad_norm": 1.7548551158648333, "learning_rate": 4.106882506508996e-06, "loss": 0.1265, "step": 1847 }, { "epoch": 1.672398190045249, "grad_norm": 1.6044837552523123, "learning_rate": 4.102220663942841e-06, "loss": 0.081, "step": 1848 }, { "epoch": 1.6733031674208145, "grad_norm": 1.6749289156749576, "learning_rate": 4.09755962768707e-06, "loss": 0.0936, "step": 1849 }, { "epoch": 1.6742081447963801, "grad_norm": 1.3338926267933795, "learning_rate": 4.092899401927836e-06, "loss": 0.0774, "step": 1850 }, { "epoch": 1.6751131221719457, "grad_norm": 1.4739642586344024, "learning_rate": 4.088239990850562e-06, "loss": 0.0981, "step": 1851 }, { "epoch": 1.6760180995475114, "grad_norm": 1.0040126427215723, "learning_rate": 4.083581398639945e-06, "loss": 0.0507, "step": 1852 }, { "epoch": 1.676923076923077, "grad_norm": 1.6730577629103696, "learning_rate": 4.0789236294799425e-06, "loss": 0.1093, "step": 1853 }, { "epoch": 1.6778280542986426, "grad_norm": 1.4923450903705746, "learning_rate": 4.074266687553773e-06, "loss": 0.078, "step": 1854 }, { "epoch": 1.6787330316742082, "grad_norm": 1.525762646206919, "learning_rate": 4.069610577043912e-06, "loss": 0.0788, "step": 1855 }, { "epoch": 1.6796380090497738, "grad_norm": 1.6717273843242995, "learning_rate": 4.0649553021320904e-06, "loss": 0.1128, "step": 1856 }, { "epoch": 1.6805429864253394, "grad_norm": 1.3521921185663697, "learning_rate": 4.0603008669992866e-06, "loss": 0.0719, "step": 1857 }, { "epoch": 1.681447963800905, "grad_norm": 1.4130050063565651, "learning_rate": 4.055647275825724e-06, "loss": 0.0639, "step": 1858 }, { "epoch": 1.6823529411764706, "grad_norm": 1.65425664555407, "learning_rate": 4.050994532790871e-06, "loss": 0.1066, "step": 1859 }, { "epoch": 1.6832579185520362, "grad_norm": 1.7895564916562698, "learning_rate": 4.046342642073433e-06, "loss": 0.1003, "step": 1860 }, { "epoch": 1.6841628959276018, "grad_norm": 1.2702623300994342, "learning_rate": 4.041691607851348e-06, "loss": 0.0848, "step": 1861 }, { "epoch": 1.6850678733031674, "grad_norm": 1.906334601953978, "learning_rate": 4.0370414343017875e-06, "loss": 0.165, "step": 1862 }, { "epoch": 1.685972850678733, "grad_norm": 1.5280126798407467, "learning_rate": 4.032392125601148e-06, "loss": 0.1159, "step": 1863 }, { "epoch": 1.6868778280542986, "grad_norm": 1.29669107578385, "learning_rate": 4.0277436859250485e-06, "loss": 0.0637, "step": 1864 }, { "epoch": 1.6877828054298643, "grad_norm": 1.6840234456242036, "learning_rate": 4.0230961194483325e-06, "loss": 0.1066, "step": 1865 }, { "epoch": 1.6886877828054299, "grad_norm": 1.744054141841458, "learning_rate": 4.018449430345054e-06, "loss": 0.0972, "step": 1866 }, { "epoch": 1.6895927601809955, "grad_norm": 1.3571503829286367, "learning_rate": 4.013803622788482e-06, "loss": 0.0724, "step": 1867 }, { "epoch": 1.690497737556561, "grad_norm": 4.277764138796462, "learning_rate": 4.00915870095109e-06, "loss": 0.1127, "step": 1868 }, { "epoch": 1.6914027149321267, "grad_norm": 1.2362335041615835, "learning_rate": 4.004514669004562e-06, "loss": 0.082, "step": 1869 }, { "epoch": 1.6923076923076923, "grad_norm": 1.257923654130206, "learning_rate": 3.999871531119779e-06, "loss": 0.0763, "step": 1870 }, { "epoch": 1.693212669683258, "grad_norm": 1.6416201929658616, "learning_rate": 3.995229291466818e-06, "loss": 0.1047, "step": 1871 }, { "epoch": 1.6941176470588235, "grad_norm": 1.697109508474552, "learning_rate": 3.99058795421495e-06, "loss": 0.1392, "step": 1872 }, { "epoch": 1.6950226244343891, "grad_norm": 1.2130201248649801, "learning_rate": 3.985947523532637e-06, "loss": 0.0806, "step": 1873 }, { "epoch": 1.6959276018099547, "grad_norm": 1.3573004954598398, "learning_rate": 3.981308003587528e-06, "loss": 0.0692, "step": 1874 }, { "epoch": 1.6968325791855203, "grad_norm": 1.8721819867151308, "learning_rate": 3.976669398546451e-06, "loss": 0.1093, "step": 1875 }, { "epoch": 1.697737556561086, "grad_norm": 1.5448307601985796, "learning_rate": 3.9720317125754124e-06, "loss": 0.1175, "step": 1876 }, { "epoch": 1.6986425339366515, "grad_norm": 1.4954306511889126, "learning_rate": 3.967394949839596e-06, "loss": 0.074, "step": 1877 }, { "epoch": 1.6995475113122172, "grad_norm": 1.7588709816459893, "learning_rate": 3.962759114503353e-06, "loss": 0.0993, "step": 1878 }, { "epoch": 1.7004524886877828, "grad_norm": 2.232489343485164, "learning_rate": 3.958124210730204e-06, "loss": 0.0938, "step": 1879 }, { "epoch": 1.7013574660633484, "grad_norm": 1.3354251458038755, "learning_rate": 3.9534902426828325e-06, "loss": 0.0849, "step": 1880 }, { "epoch": 1.702262443438914, "grad_norm": 1.4758856756227428, "learning_rate": 3.9488572145230806e-06, "loss": 0.0844, "step": 1881 }, { "epoch": 1.7031674208144798, "grad_norm": 1.5287232590761313, "learning_rate": 3.944225130411949e-06, "loss": 0.088, "step": 1882 }, { "epoch": 1.7040723981900454, "grad_norm": 1.1792573503461194, "learning_rate": 3.939593994509586e-06, "loss": 0.0537, "step": 1883 }, { "epoch": 1.704977375565611, "grad_norm": 1.4878744064243166, "learning_rate": 3.934963810975294e-06, "loss": 0.0885, "step": 1884 }, { "epoch": 1.7058823529411766, "grad_norm": 1.1972864378838253, "learning_rate": 3.930334583967514e-06, "loss": 0.0776, "step": 1885 }, { "epoch": 1.7067873303167422, "grad_norm": 1.2973194625012605, "learning_rate": 3.925706317643832e-06, "loss": 0.0703, "step": 1886 }, { "epoch": 1.7076923076923078, "grad_norm": 1.6891085401186356, "learning_rate": 3.92107901616097e-06, "loss": 0.0714, "step": 1887 }, { "epoch": 1.7085972850678735, "grad_norm": 1.9100699851511203, "learning_rate": 3.916452683674785e-06, "loss": 0.1409, "step": 1888 }, { "epoch": 1.709502262443439, "grad_norm": 1.3179332994087676, "learning_rate": 3.91182732434026e-06, "loss": 0.0583, "step": 1889 }, { "epoch": 1.7104072398190047, "grad_norm": 1.8457453675521087, "learning_rate": 3.907202942311506e-06, "loss": 0.118, "step": 1890 }, { "epoch": 1.7113122171945703, "grad_norm": 1.4636576662853384, "learning_rate": 3.902579541741759e-06, "loss": 0.081, "step": 1891 }, { "epoch": 1.7122171945701359, "grad_norm": 1.6952917605028006, "learning_rate": 3.89795712678337e-06, "loss": 0.0979, "step": 1892 }, { "epoch": 1.7131221719457015, "grad_norm": 0.9867050545040247, "learning_rate": 3.8933357015878064e-06, "loss": 0.0522, "step": 1893 }, { "epoch": 1.714027149321267, "grad_norm": 1.2827604014266742, "learning_rate": 3.888715270305645e-06, "loss": 0.068, "step": 1894 }, { "epoch": 1.7149321266968327, "grad_norm": 2.0023667489560606, "learning_rate": 3.884095837086571e-06, "loss": 0.1534, "step": 1895 }, { "epoch": 1.7158371040723983, "grad_norm": 1.6197750861222422, "learning_rate": 3.879477406079374e-06, "loss": 0.0996, "step": 1896 }, { "epoch": 1.716742081447964, "grad_norm": 1.4625043402828333, "learning_rate": 3.874859981431943e-06, "loss": 0.0869, "step": 1897 }, { "epoch": 1.7176470588235295, "grad_norm": 1.4564258372965293, "learning_rate": 3.870243567291263e-06, "loss": 0.1068, "step": 1898 }, { "epoch": 1.7185520361990951, "grad_norm": 1.5838075564949303, "learning_rate": 3.86562816780341e-06, "loss": 0.0881, "step": 1899 }, { "epoch": 1.7194570135746607, "grad_norm": 1.6700959770762853, "learning_rate": 3.861013787113553e-06, "loss": 0.0903, "step": 1900 }, { "epoch": 1.7203619909502263, "grad_norm": 1.8782694838436451, "learning_rate": 3.856400429365941e-06, "loss": 0.1139, "step": 1901 }, { "epoch": 1.721266968325792, "grad_norm": 1.3163702009817828, "learning_rate": 3.851788098703907e-06, "loss": 0.0795, "step": 1902 }, { "epoch": 1.7221719457013576, "grad_norm": 1.6520932402015864, "learning_rate": 3.84717679926986e-06, "loss": 0.1284, "step": 1903 }, { "epoch": 1.7230769230769232, "grad_norm": 1.2765861606632447, "learning_rate": 3.842566535205286e-06, "loss": 0.0735, "step": 1904 }, { "epoch": 1.7239819004524888, "grad_norm": 1.7595859951774686, "learning_rate": 3.837957310650738e-06, "loss": 0.1426, "step": 1905 }, { "epoch": 1.7248868778280544, "grad_norm": 2.224513061499205, "learning_rate": 3.833349129745836e-06, "loss": 0.174, "step": 1906 }, { "epoch": 1.72579185520362, "grad_norm": 1.4886580259690083, "learning_rate": 3.828741996629263e-06, "loss": 0.0867, "step": 1907 }, { "epoch": 1.7266968325791856, "grad_norm": 1.3830091368029558, "learning_rate": 3.82413591543876e-06, "loss": 0.0868, "step": 1908 }, { "epoch": 1.7276018099547512, "grad_norm": 1.8411027679270224, "learning_rate": 3.819530890311126e-06, "loss": 0.1022, "step": 1909 }, { "epoch": 1.7285067873303168, "grad_norm": 1.3486495942655534, "learning_rate": 3.81492692538221e-06, "loss": 0.0876, "step": 1910 }, { "epoch": 1.7294117647058824, "grad_norm": 1.289448140724135, "learning_rate": 3.8103240247869077e-06, "loss": 0.0661, "step": 1911 }, { "epoch": 1.730316742081448, "grad_norm": 1.3474349572792352, "learning_rate": 3.805722192659159e-06, "loss": 0.0696, "step": 1912 }, { "epoch": 1.7312217194570136, "grad_norm": 1.9164057261582488, "learning_rate": 3.8011214331319466e-06, "loss": 0.1049, "step": 1913 }, { "epoch": 1.7321266968325792, "grad_norm": 1.4284241399359645, "learning_rate": 3.796521750337288e-06, "loss": 0.0802, "step": 1914 }, { "epoch": 1.7330316742081449, "grad_norm": 1.2249799850500267, "learning_rate": 3.7919231484062334e-06, "loss": 0.0652, "step": 1915 }, { "epoch": 1.7339366515837105, "grad_norm": 1.8511030016235264, "learning_rate": 3.7873256314688633e-06, "loss": 0.1312, "step": 1916 }, { "epoch": 1.734841628959276, "grad_norm": 1.143925827021224, "learning_rate": 3.782729203654281e-06, "loss": 0.0796, "step": 1917 }, { "epoch": 1.7357466063348417, "grad_norm": 1.682604857299741, "learning_rate": 3.7781338690906162e-06, "loss": 0.099, "step": 1918 }, { "epoch": 1.7366515837104073, "grad_norm": 2.2338095129262374, "learning_rate": 3.7735396319050156e-06, "loss": 0.1537, "step": 1919 }, { "epoch": 1.737556561085973, "grad_norm": 1.3439433481974743, "learning_rate": 3.7689464962236367e-06, "loss": 0.0585, "step": 1920 }, { "epoch": 1.7384615384615385, "grad_norm": 1.7725028544804404, "learning_rate": 3.7643544661716518e-06, "loss": 0.0792, "step": 1921 }, { "epoch": 1.739366515837104, "grad_norm": 1.9401447028080598, "learning_rate": 3.759763545873238e-06, "loss": 0.0768, "step": 1922 }, { "epoch": 1.7402714932126697, "grad_norm": 1.6834725083089404, "learning_rate": 3.755173739451575e-06, "loss": 0.1056, "step": 1923 }, { "epoch": 1.7411764705882353, "grad_norm": 1.2811479054000092, "learning_rate": 3.7505850510288455e-06, "loss": 0.0837, "step": 1924 }, { "epoch": 1.742081447963801, "grad_norm": 1.557282609855081, "learning_rate": 3.7459974847262253e-06, "loss": 0.0804, "step": 1925 }, { "epoch": 1.7429864253393665, "grad_norm": 3.723516861674599, "learning_rate": 3.741411044663883e-06, "loss": 0.1748, "step": 1926 }, { "epoch": 1.7438914027149321, "grad_norm": 1.2069131270856523, "learning_rate": 3.736825734960975e-06, "loss": 0.0685, "step": 1927 }, { "epoch": 1.7447963800904978, "grad_norm": 1.49173155473936, "learning_rate": 3.7322415597356444e-06, "loss": 0.0875, "step": 1928 }, { "epoch": 1.7457013574660634, "grad_norm": 1.1708576498028593, "learning_rate": 3.7276585231050146e-06, "loss": 0.0674, "step": 1929 }, { "epoch": 1.746606334841629, "grad_norm": 1.9120008991755466, "learning_rate": 3.723076629185186e-06, "loss": 0.1005, "step": 1930 }, { "epoch": 1.7475113122171946, "grad_norm": 2.846858669442348, "learning_rate": 3.718495882091232e-06, "loss": 0.1733, "step": 1931 }, { "epoch": 1.7484162895927602, "grad_norm": 1.38322801870114, "learning_rate": 3.7139162859371956e-06, "loss": 0.0809, "step": 1932 }, { "epoch": 1.7493212669683258, "grad_norm": 1.296047842374742, "learning_rate": 3.7093378448360917e-06, "loss": 0.071, "step": 1933 }, { "epoch": 1.7502262443438914, "grad_norm": 1.614537540952569, "learning_rate": 3.70476056289989e-06, "loss": 0.1383, "step": 1934 }, { "epoch": 1.751131221719457, "grad_norm": 1.3133453715488679, "learning_rate": 3.700184444239524e-06, "loss": 0.0892, "step": 1935 }, { "epoch": 1.7520361990950226, "grad_norm": 1.174236904558815, "learning_rate": 3.69560949296488e-06, "loss": 0.0615, "step": 1936 }, { "epoch": 1.7529411764705882, "grad_norm": 1.8209806231690355, "learning_rate": 3.6910357131847986e-06, "loss": 0.077, "step": 1937 }, { "epoch": 1.7538461538461538, "grad_norm": 1.5698593428732064, "learning_rate": 3.6864631090070656e-06, "loss": 0.093, "step": 1938 }, { "epoch": 1.7547511312217194, "grad_norm": 1.7475752365868553, "learning_rate": 3.6818916845384124e-06, "loss": 0.1346, "step": 1939 }, { "epoch": 1.755656108597285, "grad_norm": 1.251987071545434, "learning_rate": 3.677321443884509e-06, "loss": 0.0789, "step": 1940 }, { "epoch": 1.7565610859728507, "grad_norm": 2.1541614798440576, "learning_rate": 3.6727523911499663e-06, "loss": 0.1296, "step": 1941 }, { "epoch": 1.7574660633484163, "grad_norm": 1.1899717337343916, "learning_rate": 3.668184530438324e-06, "loss": 0.0633, "step": 1942 }, { "epoch": 1.7583710407239819, "grad_norm": 1.4131026374029785, "learning_rate": 3.663617865852054e-06, "loss": 0.0966, "step": 1943 }, { "epoch": 1.7592760180995475, "grad_norm": 1.4718351318698115, "learning_rate": 3.659052401492551e-06, "loss": 0.0903, "step": 1944 }, { "epoch": 1.760180995475113, "grad_norm": 2.301519317262477, "learning_rate": 3.654488141460134e-06, "loss": 0.131, "step": 1945 }, { "epoch": 1.7610859728506787, "grad_norm": 1.401827433105591, "learning_rate": 3.6499250898540416e-06, "loss": 0.0576, "step": 1946 }, { "epoch": 1.7619909502262443, "grad_norm": 1.0899229251328513, "learning_rate": 3.645363250772425e-06, "loss": 0.0591, "step": 1947 }, { "epoch": 1.76289592760181, "grad_norm": 1.1565107746083452, "learning_rate": 3.6408026283123465e-06, "loss": 0.0672, "step": 1948 }, { "epoch": 1.7638009049773755, "grad_norm": 1.7435405833662738, "learning_rate": 3.6362432265697744e-06, "loss": 0.1192, "step": 1949 }, { "epoch": 1.7647058823529411, "grad_norm": 1.8809601837426888, "learning_rate": 3.6316850496395863e-06, "loss": 0.0923, "step": 1950 }, { "epoch": 1.7656108597285067, "grad_norm": 1.404921407809477, "learning_rate": 3.6271281016155525e-06, "loss": 0.0826, "step": 1951 }, { "epoch": 1.7665158371040723, "grad_norm": 1.352149404202887, "learning_rate": 3.622572386590344e-06, "loss": 0.064, "step": 1952 }, { "epoch": 1.767420814479638, "grad_norm": 1.2264396611970831, "learning_rate": 3.6180179086555235e-06, "loss": 0.0601, "step": 1953 }, { "epoch": 1.7683257918552036, "grad_norm": 1.8135364533223342, "learning_rate": 3.613464671901542e-06, "loss": 0.087, "step": 1954 }, { "epoch": 1.7692307692307692, "grad_norm": 1.6519335927432173, "learning_rate": 3.6089126804177373e-06, "loss": 0.1061, "step": 1955 }, { "epoch": 1.7701357466063348, "grad_norm": 1.0334048867919348, "learning_rate": 3.604361938292327e-06, "loss": 0.0519, "step": 1956 }, { "epoch": 1.7710407239819004, "grad_norm": 1.6872176449423724, "learning_rate": 3.599812449612408e-06, "loss": 0.0774, "step": 1957 }, { "epoch": 1.771945701357466, "grad_norm": 1.125939025605856, "learning_rate": 3.5952642184639497e-06, "loss": 0.0705, "step": 1958 }, { "epoch": 1.7728506787330316, "grad_norm": 1.780397922864517, "learning_rate": 3.5907172489317943e-06, "loss": 0.1331, "step": 1959 }, { "epoch": 1.7737556561085972, "grad_norm": 1.0491907126456208, "learning_rate": 3.5861715450996505e-06, "loss": 0.0565, "step": 1960 }, { "epoch": 1.7746606334841628, "grad_norm": 1.2735892279665557, "learning_rate": 3.581627111050089e-06, "loss": 0.0914, "step": 1961 }, { "epoch": 1.7755656108597284, "grad_norm": 1.2415997215912253, "learning_rate": 3.577083950864539e-06, "loss": 0.0672, "step": 1962 }, { "epoch": 1.776470588235294, "grad_norm": 1.8391825933125956, "learning_rate": 3.5725420686232903e-06, "loss": 0.1406, "step": 1963 }, { "epoch": 1.7773755656108596, "grad_norm": 1.481306743690505, "learning_rate": 3.5680014684054804e-06, "loss": 0.0974, "step": 1964 }, { "epoch": 1.7782805429864252, "grad_norm": 1.2624355624615313, "learning_rate": 3.563462154289098e-06, "loss": 0.0738, "step": 1965 }, { "epoch": 1.7791855203619908, "grad_norm": 1.1968297645433181, "learning_rate": 3.558924130350974e-06, "loss": 0.0612, "step": 1966 }, { "epoch": 1.7800904977375565, "grad_norm": 1.3691520444017713, "learning_rate": 3.5543874006667833e-06, "loss": 0.0786, "step": 1967 }, { "epoch": 1.780995475113122, "grad_norm": 1.2354422351516687, "learning_rate": 3.549851969311038e-06, "loss": 0.0758, "step": 1968 }, { "epoch": 1.7819004524886877, "grad_norm": 1.2570886705384954, "learning_rate": 3.5453178403570833e-06, "loss": 0.0747, "step": 1969 }, { "epoch": 1.7828054298642533, "grad_norm": 1.2184558707481536, "learning_rate": 3.5407850178770944e-06, "loss": 0.0686, "step": 1970 }, { "epoch": 1.7837104072398189, "grad_norm": 1.4213383284679058, "learning_rate": 3.536253505942073e-06, "loss": 0.0702, "step": 1971 }, { "epoch": 1.7846153846153845, "grad_norm": 1.251072692836287, "learning_rate": 3.5317233086218474e-06, "loss": 0.0684, "step": 1972 }, { "epoch": 1.78552036199095, "grad_norm": 1.3488339378060115, "learning_rate": 3.527194429985059e-06, "loss": 0.0795, "step": 1973 }, { "epoch": 1.7864253393665157, "grad_norm": 1.1446787088925978, "learning_rate": 3.522666874099171e-06, "loss": 0.0744, "step": 1974 }, { "epoch": 1.7873303167420813, "grad_norm": 1.2506112318565996, "learning_rate": 3.5181406450304536e-06, "loss": 0.0727, "step": 1975 }, { "epoch": 1.788235294117647, "grad_norm": 1.6480515594683218, "learning_rate": 3.513615746843987e-06, "loss": 0.1325, "step": 1976 }, { "epoch": 1.7891402714932125, "grad_norm": 1.3041220625448446, "learning_rate": 3.5090921836036598e-06, "loss": 0.0749, "step": 1977 }, { "epoch": 1.7900452488687781, "grad_norm": 1.3887012544736188, "learning_rate": 3.504569959372156e-06, "loss": 0.0837, "step": 1978 }, { "epoch": 1.7909502262443437, "grad_norm": 1.1643671788420429, "learning_rate": 3.5000490782109606e-06, "loss": 0.0672, "step": 1979 }, { "epoch": 1.7918552036199094, "grad_norm": 1.331101212984055, "learning_rate": 3.49552954418035e-06, "loss": 0.0801, "step": 1980 }, { "epoch": 1.792760180995475, "grad_norm": 1.863337785954134, "learning_rate": 3.4910113613393935e-06, "loss": 0.0965, "step": 1981 }, { "epoch": 1.7936651583710406, "grad_norm": 1.3958254166288364, "learning_rate": 3.4864945337459455e-06, "loss": 0.0848, "step": 1982 }, { "epoch": 1.7945701357466062, "grad_norm": 1.5883381911760532, "learning_rate": 3.4819790654566423e-06, "loss": 0.0905, "step": 1983 }, { "epoch": 1.7954751131221718, "grad_norm": 1.4263898896816252, "learning_rate": 3.4774649605268995e-06, "loss": 0.0773, "step": 1984 }, { "epoch": 1.7963800904977374, "grad_norm": 1.4692376758495511, "learning_rate": 3.4729522230109103e-06, "loss": 0.1016, "step": 1985 }, { "epoch": 1.7972850678733032, "grad_norm": 1.68469034468746, "learning_rate": 3.468440856961639e-06, "loss": 0.0968, "step": 1986 }, { "epoch": 1.7981900452488688, "grad_norm": 1.9453232407326406, "learning_rate": 3.4639308664308157e-06, "loss": 0.122, "step": 1987 }, { "epoch": 1.7990950226244344, "grad_norm": 1.4951619032835135, "learning_rate": 3.4594222554689384e-06, "loss": 0.095, "step": 1988 }, { "epoch": 1.8, "grad_norm": 1.8944487381872368, "learning_rate": 3.4549150281252635e-06, "loss": 0.1268, "step": 1989 }, { "epoch": 1.8009049773755657, "grad_norm": 1.4356477298271946, "learning_rate": 3.4504091884478076e-06, "loss": 0.089, "step": 1990 }, { "epoch": 1.8018099547511313, "grad_norm": 1.6100988762596586, "learning_rate": 3.445904740483339e-06, "loss": 0.0951, "step": 1991 }, { "epoch": 1.8027149321266969, "grad_norm": 1.127716016957017, "learning_rate": 3.441401688277376e-06, "loss": 0.0641, "step": 1992 }, { "epoch": 1.8036199095022625, "grad_norm": 1.8272523660584064, "learning_rate": 3.4369000358741828e-06, "loss": 0.1349, "step": 1993 }, { "epoch": 1.804524886877828, "grad_norm": 1.2402510670057727, "learning_rate": 3.4323997873167697e-06, "loss": 0.062, "step": 1994 }, { "epoch": 1.8054298642533937, "grad_norm": 1.7885803342924598, "learning_rate": 3.4279009466468825e-06, "loss": 0.1131, "step": 1995 }, { "epoch": 1.8063348416289593, "grad_norm": 1.6317003750051635, "learning_rate": 3.4234035179050052e-06, "loss": 0.0863, "step": 1996 }, { "epoch": 1.807239819004525, "grad_norm": 1.172525434238513, "learning_rate": 3.4189075051303495e-06, "loss": 0.0632, "step": 1997 }, { "epoch": 1.8081447963800905, "grad_norm": 1.464724395284757, "learning_rate": 3.4144129123608616e-06, "loss": 0.0868, "step": 1998 }, { "epoch": 1.8090497737556561, "grad_norm": 1.281668910304494, "learning_rate": 3.4099197436332087e-06, "loss": 0.0646, "step": 1999 }, { "epoch": 1.8099547511312217, "grad_norm": 1.215229122528799, "learning_rate": 3.405428002982779e-06, "loss": 0.0607, "step": 2000 }, { "epoch": 1.8108597285067873, "grad_norm": 1.6690254857649067, "learning_rate": 3.400937694443678e-06, "loss": 0.0888, "step": 2001 }, { "epoch": 1.811764705882353, "grad_norm": 1.5856953415422297, "learning_rate": 3.3964488220487252e-06, "loss": 0.1117, "step": 2002 }, { "epoch": 1.8126696832579186, "grad_norm": 1.5270617016332626, "learning_rate": 3.3919613898294523e-06, "loss": 0.0982, "step": 2003 }, { "epoch": 1.8135746606334842, "grad_norm": 1.2585546527099634, "learning_rate": 3.387475401816096e-06, "loss": 0.0643, "step": 2004 }, { "epoch": 1.8144796380090498, "grad_norm": 1.235345740125394, "learning_rate": 3.3829908620375953e-06, "loss": 0.071, "step": 2005 }, { "epoch": 1.8153846153846154, "grad_norm": 1.4108937561625021, "learning_rate": 3.378507774521587e-06, "loss": 0.0931, "step": 2006 }, { "epoch": 1.816289592760181, "grad_norm": 1.143297055797399, "learning_rate": 3.37402614329441e-06, "loss": 0.062, "step": 2007 }, { "epoch": 1.8171945701357466, "grad_norm": 1.767251640457276, "learning_rate": 3.3695459723810873e-06, "loss": 0.0848, "step": 2008 }, { "epoch": 1.8180995475113122, "grad_norm": 1.3190752764175515, "learning_rate": 3.3650672658053364e-06, "loss": 0.0756, "step": 2009 }, { "epoch": 1.8190045248868778, "grad_norm": 1.5398173317005988, "learning_rate": 3.3605900275895565e-06, "loss": 0.098, "step": 2010 }, { "epoch": 1.8199095022624434, "grad_norm": 1.8979983680857015, "learning_rate": 3.3561142617548274e-06, "loss": 0.1185, "step": 2011 }, { "epoch": 1.820814479638009, "grad_norm": 1.6058497912193674, "learning_rate": 3.3516399723209103e-06, "loss": 0.0846, "step": 2012 }, { "epoch": 1.8217194570135746, "grad_norm": 2.002376852794014, "learning_rate": 3.3471671633062375e-06, "loss": 0.0928, "step": 2013 }, { "epoch": 1.8226244343891402, "grad_norm": 2.6158853996801823, "learning_rate": 3.342695838727912e-06, "loss": 0.1439, "step": 2014 }, { "epoch": 1.8235294117647058, "grad_norm": 1.5539345957998751, "learning_rate": 3.3382260026017027e-06, "loss": 0.0962, "step": 2015 }, { "epoch": 1.8244343891402715, "grad_norm": 1.5709525773320878, "learning_rate": 3.333757658942045e-06, "loss": 0.0965, "step": 2016 }, { "epoch": 1.825339366515837, "grad_norm": 1.5541279858482029, "learning_rate": 3.3292908117620313e-06, "loss": 0.112, "step": 2017 }, { "epoch": 1.8262443438914027, "grad_norm": 1.124408401061992, "learning_rate": 3.3248254650734097e-06, "loss": 0.0614, "step": 2018 }, { "epoch": 1.8271493212669683, "grad_norm": 1.4212680149025947, "learning_rate": 3.320361622886581e-06, "loss": 0.0685, "step": 2019 }, { "epoch": 1.8280542986425339, "grad_norm": 1.340447660000901, "learning_rate": 3.3158992892105975e-06, "loss": 0.089, "step": 2020 }, { "epoch": 1.8289592760180997, "grad_norm": 1.139512859711715, "learning_rate": 3.311438468053151e-06, "loss": 0.064, "step": 2021 }, { "epoch": 1.8298642533936653, "grad_norm": 1.5169406803746415, "learning_rate": 3.3069791634205817e-06, "loss": 0.1082, "step": 2022 }, { "epoch": 1.830769230769231, "grad_norm": 1.6701128282176054, "learning_rate": 3.3025213793178647e-06, "loss": 0.1128, "step": 2023 }, { "epoch": 1.8316742081447965, "grad_norm": 1.3771360790957667, "learning_rate": 3.2980651197486065e-06, "loss": 0.0802, "step": 2024 }, { "epoch": 1.8325791855203621, "grad_norm": 1.836749494560819, "learning_rate": 3.2936103887150484e-06, "loss": 0.1039, "step": 2025 }, { "epoch": 1.8334841628959277, "grad_norm": 2.1390695294463873, "learning_rate": 3.2891571902180565e-06, "loss": 0.0837, "step": 2026 }, { "epoch": 1.8343891402714934, "grad_norm": 1.5848343825378122, "learning_rate": 3.284705528257124e-06, "loss": 0.0781, "step": 2027 }, { "epoch": 1.835294117647059, "grad_norm": 1.375914743795714, "learning_rate": 3.2802554068303595e-06, "loss": 0.0816, "step": 2028 }, { "epoch": 1.8361990950226246, "grad_norm": 1.082209394795314, "learning_rate": 3.2758068299344914e-06, "loss": 0.0494, "step": 2029 }, { "epoch": 1.8371040723981902, "grad_norm": 1.5493687241134924, "learning_rate": 3.271359801564858e-06, "loss": 0.1257, "step": 2030 }, { "epoch": 1.8380090497737558, "grad_norm": 1.361229706692232, "learning_rate": 3.2669143257154114e-06, "loss": 0.0794, "step": 2031 }, { "epoch": 1.8389140271493214, "grad_norm": 2.215244134498231, "learning_rate": 3.2624704063787047e-06, "loss": 0.1127, "step": 2032 }, { "epoch": 1.839819004524887, "grad_norm": 1.4262659999102716, "learning_rate": 3.258028047545895e-06, "loss": 0.0693, "step": 2033 }, { "epoch": 1.8407239819004526, "grad_norm": 1.6029239650835214, "learning_rate": 3.253587253206738e-06, "loss": 0.11, "step": 2034 }, { "epoch": 1.8416289592760182, "grad_norm": 1.6876385458580911, "learning_rate": 3.2491480273495847e-06, "loss": 0.1089, "step": 2035 }, { "epoch": 1.8425339366515838, "grad_norm": 1.6637002958305596, "learning_rate": 3.244710373961376e-06, "loss": 0.1228, "step": 2036 }, { "epoch": 1.8434389140271494, "grad_norm": 1.542640250852168, "learning_rate": 3.2402742970276425e-06, "loss": 0.0762, "step": 2037 }, { "epoch": 1.844343891402715, "grad_norm": 1.3450685433404026, "learning_rate": 3.2358398005324965e-06, "loss": 0.0904, "step": 2038 }, { "epoch": 1.8452488687782806, "grad_norm": 1.3568614347823444, "learning_rate": 3.231406888458632e-06, "loss": 0.0791, "step": 2039 }, { "epoch": 1.8461538461538463, "grad_norm": 1.1254432479203453, "learning_rate": 3.226975564787322e-06, "loss": 0.0627, "step": 2040 }, { "epoch": 1.8470588235294119, "grad_norm": 1.8692943965019257, "learning_rate": 3.22254583349841e-06, "loss": 0.1136, "step": 2041 }, { "epoch": 1.8479638009049775, "grad_norm": 1.1877598633089645, "learning_rate": 3.2181176985703113e-06, "loss": 0.075, "step": 2042 }, { "epoch": 1.848868778280543, "grad_norm": 1.536367074497494, "learning_rate": 3.213691163980004e-06, "loss": 0.0801, "step": 2043 }, { "epoch": 1.8497737556561087, "grad_norm": 1.6482426700443003, "learning_rate": 3.209266233703035e-06, "loss": 0.0961, "step": 2044 }, { "epoch": 1.8506787330316743, "grad_norm": 1.2847957879167315, "learning_rate": 3.204842911713506e-06, "loss": 0.0917, "step": 2045 }, { "epoch": 1.85158371040724, "grad_norm": 1.5001828750781578, "learning_rate": 3.200421201984074e-06, "loss": 0.0857, "step": 2046 }, { "epoch": 1.8524886877828055, "grad_norm": 1.4500206525091583, "learning_rate": 3.196001108485951e-06, "loss": 0.0758, "step": 2047 }, { "epoch": 1.8533936651583711, "grad_norm": 1.5098837634514808, "learning_rate": 3.1915826351888933e-06, "loss": 0.0919, "step": 2048 }, { "epoch": 1.8542986425339367, "grad_norm": 3.9292785395175023, "learning_rate": 3.1871657860612065e-06, "loss": 0.2271, "step": 2049 }, { "epoch": 1.8552036199095023, "grad_norm": 1.476964874923321, "learning_rate": 3.182750565069735e-06, "loss": 0.0902, "step": 2050 }, { "epoch": 1.856108597285068, "grad_norm": 1.7765209853852713, "learning_rate": 3.178336976179861e-06, "loss": 0.0805, "step": 2051 }, { "epoch": 1.8570135746606335, "grad_norm": 1.2976354323789536, "learning_rate": 3.1739250233554998e-06, "loss": 0.0708, "step": 2052 }, { "epoch": 1.8579185520361992, "grad_norm": 1.698380367781338, "learning_rate": 3.1695147105591016e-06, "loss": 0.0819, "step": 2053 }, { "epoch": 1.8588235294117648, "grad_norm": 1.3933642256805194, "learning_rate": 3.16510604175164e-06, "loss": 0.0719, "step": 2054 }, { "epoch": 1.8597285067873304, "grad_norm": 1.7511017800009316, "learning_rate": 3.1606990208926125e-06, "loss": 0.1072, "step": 2055 }, { "epoch": 1.860633484162896, "grad_norm": 1.622025118780001, "learning_rate": 3.1562936519400356e-06, "loss": 0.1036, "step": 2056 }, { "epoch": 1.8615384615384616, "grad_norm": 1.364515737925582, "learning_rate": 3.151889938850445e-06, "loss": 0.0675, "step": 2057 }, { "epoch": 1.8624434389140272, "grad_norm": 1.9665257596993595, "learning_rate": 3.1474878855788893e-06, "loss": 0.1009, "step": 2058 }, { "epoch": 1.8633484162895928, "grad_norm": 1.3472187252379122, "learning_rate": 3.143087496078923e-06, "loss": 0.0938, "step": 2059 }, { "epoch": 1.8642533936651584, "grad_norm": 2.0056799945025987, "learning_rate": 3.1386887743026083e-06, "loss": 0.1274, "step": 2060 }, { "epoch": 1.865158371040724, "grad_norm": 1.4722199010724881, "learning_rate": 3.13429172420051e-06, "loss": 0.0885, "step": 2061 }, { "epoch": 1.8660633484162896, "grad_norm": 1.2558510650443648, "learning_rate": 3.1298963497216904e-06, "loss": 0.0731, "step": 2062 }, { "epoch": 1.8669683257918552, "grad_norm": 1.1985741743352558, "learning_rate": 3.1255026548137095e-06, "loss": 0.0724, "step": 2063 }, { "epoch": 1.8678733031674208, "grad_norm": 1.478793717532513, "learning_rate": 3.121110643422615e-06, "loss": 0.0886, "step": 2064 }, { "epoch": 1.8687782805429864, "grad_norm": 1.2651382161916211, "learning_rate": 3.1167203194929447e-06, "loss": 0.0626, "step": 2065 }, { "epoch": 1.869683257918552, "grad_norm": 1.4770671125142387, "learning_rate": 3.1123316869677222e-06, "loss": 0.0934, "step": 2066 }, { "epoch": 1.8705882352941177, "grad_norm": 1.3383487797829507, "learning_rate": 3.107944749788449e-06, "loss": 0.0616, "step": 2067 }, { "epoch": 1.8714932126696833, "grad_norm": 1.7996411699038337, "learning_rate": 3.1035595118951067e-06, "loss": 0.1315, "step": 2068 }, { "epoch": 1.8723981900452489, "grad_norm": 1.6613712551266646, "learning_rate": 3.099175977226149e-06, "loss": 0.0982, "step": 2069 }, { "epoch": 1.8733031674208145, "grad_norm": 1.4367284518540904, "learning_rate": 3.0947941497184985e-06, "loss": 0.0872, "step": 2070 }, { "epoch": 1.87420814479638, "grad_norm": 1.8312730790115619, "learning_rate": 3.09041403330755e-06, "loss": 0.1182, "step": 2071 }, { "epoch": 1.8751131221719457, "grad_norm": 1.511787735711287, "learning_rate": 3.0860356319271563e-06, "loss": 0.0906, "step": 2072 }, { "epoch": 1.8760180995475113, "grad_norm": 1.3214396353318876, "learning_rate": 3.0816589495096315e-06, "loss": 0.0677, "step": 2073 }, { "epoch": 1.876923076923077, "grad_norm": 1.2310422647949475, "learning_rate": 3.0772839899857465e-06, "loss": 0.0621, "step": 2074 }, { "epoch": 1.8778280542986425, "grad_norm": 2.3501770937385973, "learning_rate": 3.0729107572847244e-06, "loss": 0.0816, "step": 2075 }, { "epoch": 1.8787330316742081, "grad_norm": 1.2822442008074548, "learning_rate": 3.0685392553342376e-06, "loss": 0.0602, "step": 2076 }, { "epoch": 1.8796380090497737, "grad_norm": 1.269467994481317, "learning_rate": 3.0641694880604033e-06, "loss": 0.0585, "step": 2077 }, { "epoch": 1.8805429864253393, "grad_norm": 1.3907042296039689, "learning_rate": 3.059801459387781e-06, "loss": 0.0691, "step": 2078 }, { "epoch": 1.881447963800905, "grad_norm": 1.438006227193471, "learning_rate": 3.05543517323937e-06, "loss": 0.0709, "step": 2079 }, { "epoch": 1.8823529411764706, "grad_norm": 1.2831845296115711, "learning_rate": 3.0510706335366034e-06, "loss": 0.0891, "step": 2080 }, { "epoch": 1.8832579185520362, "grad_norm": 1.264721468090393, "learning_rate": 3.046707844199347e-06, "loss": 0.073, "step": 2081 }, { "epoch": 1.8841628959276018, "grad_norm": 1.3043303510327562, "learning_rate": 3.042346809145892e-06, "loss": 0.0739, "step": 2082 }, { "epoch": 1.8850678733031674, "grad_norm": 1.4792247185998466, "learning_rate": 3.0379875322929553e-06, "loss": 0.0891, "step": 2083 }, { "epoch": 1.885972850678733, "grad_norm": 1.041404911108274, "learning_rate": 3.033630017555677e-06, "loss": 0.0647, "step": 2084 }, { "epoch": 1.8868778280542986, "grad_norm": 1.2465101192190848, "learning_rate": 3.0292742688476125e-06, "loss": 0.0736, "step": 2085 }, { "epoch": 1.8877828054298642, "grad_norm": 2.0156292795467525, "learning_rate": 3.02492029008073e-06, "loss": 0.0816, "step": 2086 }, { "epoch": 1.8886877828054298, "grad_norm": 1.310371739292188, "learning_rate": 3.0205680851654095e-06, "loss": 0.079, "step": 2087 }, { "epoch": 1.8895927601809954, "grad_norm": 1.2210244844480969, "learning_rate": 3.0162176580104396e-06, "loss": 0.0647, "step": 2088 }, { "epoch": 1.890497737556561, "grad_norm": 1.4554289583750768, "learning_rate": 3.0118690125230095e-06, "loss": 0.0787, "step": 2089 }, { "epoch": 1.8914027149321266, "grad_norm": 1.2149551484936392, "learning_rate": 3.0075221526087083e-06, "loss": 0.0636, "step": 2090 }, { "epoch": 1.8923076923076922, "grad_norm": 1.1422531819426196, "learning_rate": 3.0031770821715233e-06, "loss": 0.0677, "step": 2091 }, { "epoch": 1.8932126696832579, "grad_norm": 1.1915629988943433, "learning_rate": 2.9988338051138333e-06, "loss": 0.0662, "step": 2092 }, { "epoch": 1.8941176470588235, "grad_norm": 2.061784987406666, "learning_rate": 2.9944923253364066e-06, "loss": 0.1027, "step": 2093 }, { "epoch": 1.895022624434389, "grad_norm": 1.363378796212542, "learning_rate": 2.990152646738399e-06, "loss": 0.0743, "step": 2094 }, { "epoch": 1.8959276018099547, "grad_norm": 1.1738403442057235, "learning_rate": 2.985814773217346e-06, "loss": 0.066, "step": 2095 }, { "epoch": 1.8968325791855203, "grad_norm": 1.3509513777940168, "learning_rate": 2.981478708669163e-06, "loss": 0.0662, "step": 2096 }, { "epoch": 1.897737556561086, "grad_norm": 1.0933026945666182, "learning_rate": 2.977144456988141e-06, "loss": 0.0477, "step": 2097 }, { "epoch": 1.8986425339366515, "grad_norm": 1.4292806643647544, "learning_rate": 2.972812022066943e-06, "loss": 0.0946, "step": 2098 }, { "epoch": 1.899547511312217, "grad_norm": 0.8512388954219322, "learning_rate": 2.9684814077966006e-06, "loss": 0.0485, "step": 2099 }, { "epoch": 1.9004524886877827, "grad_norm": 1.0761069755093904, "learning_rate": 2.964152618066508e-06, "loss": 0.0575, "step": 2100 }, { "epoch": 1.9013574660633483, "grad_norm": 1.5511816475045077, "learning_rate": 2.9598256567644247e-06, "loss": 0.0934, "step": 2101 }, { "epoch": 1.902262443438914, "grad_norm": 1.0855777110833726, "learning_rate": 2.9555005277764635e-06, "loss": 0.0668, "step": 2102 }, { "epoch": 1.9031674208144795, "grad_norm": 1.488878911062376, "learning_rate": 2.9511772349870958e-06, "loss": 0.1154, "step": 2103 }, { "epoch": 1.9040723981900451, "grad_norm": 1.3778978097822074, "learning_rate": 2.946855782279141e-06, "loss": 0.0663, "step": 2104 }, { "epoch": 1.9049773755656108, "grad_norm": 1.6718609934447173, "learning_rate": 2.9425361735337655e-06, "loss": 0.0931, "step": 2105 }, { "epoch": 1.9058823529411764, "grad_norm": 1.4217857347279574, "learning_rate": 2.9382184126304834e-06, "loss": 0.0772, "step": 2106 }, { "epoch": 1.906787330316742, "grad_norm": 1.6915052078015294, "learning_rate": 2.9339025034471456e-06, "loss": 0.1068, "step": 2107 }, { "epoch": 1.9076923076923076, "grad_norm": 1.4259327507309025, "learning_rate": 2.9295884498599415e-06, "loss": 0.0836, "step": 2108 }, { "epoch": 1.9085972850678732, "grad_norm": 1.189865425648556, "learning_rate": 2.9252762557433922e-06, "loss": 0.0643, "step": 2109 }, { "epoch": 1.9095022624434388, "grad_norm": 1.6793815559016372, "learning_rate": 2.920965924970352e-06, "loss": 0.1073, "step": 2110 }, { "epoch": 1.9104072398190044, "grad_norm": 1.2476097502865013, "learning_rate": 2.9166574614119982e-06, "loss": 0.0729, "step": 2111 }, { "epoch": 1.91131221719457, "grad_norm": 1.5812393528684239, "learning_rate": 2.9123508689378356e-06, "loss": 0.0931, "step": 2112 }, { "epoch": 1.9122171945701356, "grad_norm": 1.4396302218781116, "learning_rate": 2.908046151415681e-06, "loss": 0.0812, "step": 2113 }, { "epoch": 1.9131221719457012, "grad_norm": 1.7742064337035093, "learning_rate": 2.9037433127116777e-06, "loss": 0.1239, "step": 2114 }, { "epoch": 1.9140271493212668, "grad_norm": 1.3359616566960362, "learning_rate": 2.899442356690271e-06, "loss": 0.0834, "step": 2115 }, { "epoch": 1.9149321266968324, "grad_norm": 1.024041458098199, "learning_rate": 2.895143287214221e-06, "loss": 0.0459, "step": 2116 }, { "epoch": 1.915837104072398, "grad_norm": 1.2690516905374611, "learning_rate": 2.890846108144596e-06, "loss": 0.0704, "step": 2117 }, { "epoch": 1.9167420814479637, "grad_norm": 1.3262209000596794, "learning_rate": 2.8865508233407597e-06, "loss": 0.061, "step": 2118 }, { "epoch": 1.9176470588235293, "grad_norm": 1.8049962103145774, "learning_rate": 2.8822574366603804e-06, "loss": 0.0848, "step": 2119 }, { "epoch": 1.9185520361990949, "grad_norm": 1.5497616883117575, "learning_rate": 2.8779659519594173e-06, "loss": 0.0761, "step": 2120 }, { "epoch": 1.9194570135746605, "grad_norm": 1.5658264586794877, "learning_rate": 2.873676373092123e-06, "loss": 0.1123, "step": 2121 }, { "epoch": 1.920361990950226, "grad_norm": 1.2921012079858722, "learning_rate": 2.8693887039110426e-06, "loss": 0.0757, "step": 2122 }, { "epoch": 1.9212669683257917, "grad_norm": 1.1246570111123302, "learning_rate": 2.865102948266998e-06, "loss": 0.0542, "step": 2123 }, { "epoch": 1.9221719457013575, "grad_norm": 1.5248243047431291, "learning_rate": 2.8608191100090974e-06, "loss": 0.0784, "step": 2124 }, { "epoch": 1.9230769230769231, "grad_norm": 2.8955055872086124, "learning_rate": 2.8565371929847286e-06, "loss": 0.0875, "step": 2125 }, { "epoch": 1.9239819004524887, "grad_norm": 1.2370919675864132, "learning_rate": 2.85225720103955e-06, "loss": 0.0661, "step": 2126 }, { "epoch": 1.9248868778280543, "grad_norm": 1.49422105564383, "learning_rate": 2.847979138017496e-06, "loss": 0.082, "step": 2127 }, { "epoch": 1.92579185520362, "grad_norm": 1.8093695777678633, "learning_rate": 2.84370300776076e-06, "loss": 0.1014, "step": 2128 }, { "epoch": 1.9266968325791856, "grad_norm": 1.6863792679151224, "learning_rate": 2.83942881410981e-06, "loss": 0.0863, "step": 2129 }, { "epoch": 1.9276018099547512, "grad_norm": 1.3602886941933352, "learning_rate": 2.835156560903365e-06, "loss": 0.0628, "step": 2130 }, { "epoch": 1.9285067873303168, "grad_norm": 1.907407331956616, "learning_rate": 2.8308862519784074e-06, "loss": 0.1295, "step": 2131 }, { "epoch": 1.9294117647058824, "grad_norm": 1.4815959200195215, "learning_rate": 2.8266178911701757e-06, "loss": 0.0819, "step": 2132 }, { "epoch": 1.930316742081448, "grad_norm": 1.2820801165774367, "learning_rate": 2.822351482312149e-06, "loss": 0.0828, "step": 2133 }, { "epoch": 1.9312217194570136, "grad_norm": 1.4288916395669642, "learning_rate": 2.818087029236064e-06, "loss": 0.0695, "step": 2134 }, { "epoch": 1.9321266968325792, "grad_norm": 1.9336957529581966, "learning_rate": 2.813824535771892e-06, "loss": 0.1124, "step": 2135 }, { "epoch": 1.9330316742081448, "grad_norm": 1.307167324240644, "learning_rate": 2.80956400574785e-06, "loss": 0.0945, "step": 2136 }, { "epoch": 1.9339366515837104, "grad_norm": 2.135508411069363, "learning_rate": 2.805305442990394e-06, "loss": 0.0841, "step": 2137 }, { "epoch": 1.934841628959276, "grad_norm": 1.1466020426382857, "learning_rate": 2.801048851324203e-06, "loss": 0.0598, "step": 2138 }, { "epoch": 1.9357466063348416, "grad_norm": 1.2401936559419573, "learning_rate": 2.7967942345721967e-06, "loss": 0.0693, "step": 2139 }, { "epoch": 1.9366515837104072, "grad_norm": 1.3864314862909022, "learning_rate": 2.7925415965555126e-06, "loss": 0.09, "step": 2140 }, { "epoch": 1.9375565610859729, "grad_norm": 1.2354941630303458, "learning_rate": 2.788290941093517e-06, "loss": 0.0633, "step": 2141 }, { "epoch": 1.9384615384615385, "grad_norm": 1.2464783624819658, "learning_rate": 2.7840422720037943e-06, "loss": 0.0789, "step": 2142 }, { "epoch": 1.939366515837104, "grad_norm": 1.2507945601903365, "learning_rate": 2.7797955931021412e-06, "loss": 0.0665, "step": 2143 }, { "epoch": 1.9402714932126697, "grad_norm": 1.5320295549116454, "learning_rate": 2.7755509082025717e-06, "loss": 0.0846, "step": 2144 }, { "epoch": 1.9411764705882353, "grad_norm": 1.4125822975977576, "learning_rate": 2.771308221117309e-06, "loss": 0.0715, "step": 2145 }, { "epoch": 1.9420814479638009, "grad_norm": 1.587598300976593, "learning_rate": 2.7670675356567764e-06, "loss": 0.0791, "step": 2146 }, { "epoch": 1.9429864253393665, "grad_norm": 1.3989208055476121, "learning_rate": 2.7628288556296066e-06, "loss": 0.0906, "step": 2147 }, { "epoch": 1.943891402714932, "grad_norm": 2.0267019127501804, "learning_rate": 2.7585921848426243e-06, "loss": 0.1368, "step": 2148 }, { "epoch": 1.9447963800904977, "grad_norm": 1.3428254572930314, "learning_rate": 2.754357527100855e-06, "loss": 0.0847, "step": 2149 }, { "epoch": 1.9457013574660633, "grad_norm": 1.3466198497072657, "learning_rate": 2.7501248862075163e-06, "loss": 0.0843, "step": 2150 }, { "epoch": 1.946606334841629, "grad_norm": 1.370319975551412, "learning_rate": 2.7458942659640087e-06, "loss": 0.0962, "step": 2151 }, { "epoch": 1.9475113122171945, "grad_norm": 1.2329316559128707, "learning_rate": 2.7416656701699264e-06, "loss": 0.0701, "step": 2152 }, { "epoch": 1.9484162895927601, "grad_norm": 1.576993710929066, "learning_rate": 2.7374391026230364e-06, "loss": 0.1182, "step": 2153 }, { "epoch": 1.9493212669683257, "grad_norm": 1.4034357306775953, "learning_rate": 2.73321456711929e-06, "loss": 0.0844, "step": 2154 }, { "epoch": 1.9502262443438914, "grad_norm": 1.5206042787172596, "learning_rate": 2.7289920674528142e-06, "loss": 0.0978, "step": 2155 }, { "epoch": 1.951131221719457, "grad_norm": 1.3355233132916506, "learning_rate": 2.7247716074159014e-06, "loss": 0.0832, "step": 2156 }, { "epoch": 1.9520361990950226, "grad_norm": 1.1109394008819384, "learning_rate": 2.720553190799019e-06, "loss": 0.0648, "step": 2157 }, { "epoch": 1.9529411764705882, "grad_norm": 1.1714986301507466, "learning_rate": 2.7163368213907975e-06, "loss": 0.0597, "step": 2158 }, { "epoch": 1.953846153846154, "grad_norm": 0.9463455349317355, "learning_rate": 2.712122502978024e-06, "loss": 0.0468, "step": 2159 }, { "epoch": 1.9547511312217196, "grad_norm": 1.3173758995280447, "learning_rate": 2.7079102393456503e-06, "loss": 0.0693, "step": 2160 }, { "epoch": 1.9556561085972852, "grad_norm": 1.6269738386970256, "learning_rate": 2.7037000342767764e-06, "loss": 0.0695, "step": 2161 }, { "epoch": 1.9565610859728508, "grad_norm": 1.5077944429810568, "learning_rate": 2.699491891552659e-06, "loss": 0.0943, "step": 2162 }, { "epoch": 1.9574660633484164, "grad_norm": 1.2161758958967512, "learning_rate": 2.695285814952702e-06, "loss": 0.0667, "step": 2163 }, { "epoch": 1.958371040723982, "grad_norm": 1.4264534539980693, "learning_rate": 2.6910818082544466e-06, "loss": 0.0936, "step": 2164 }, { "epoch": 1.9592760180995477, "grad_norm": 1.1165352759346907, "learning_rate": 2.6868798752335867e-06, "loss": 0.0648, "step": 2165 }, { "epoch": 1.9601809954751133, "grad_norm": 1.1452931600428766, "learning_rate": 2.6826800196639415e-06, "loss": 0.0625, "step": 2166 }, { "epoch": 1.9610859728506789, "grad_norm": 0.947287165139779, "learning_rate": 2.678482245317473e-06, "loss": 0.0427, "step": 2167 }, { "epoch": 1.9619909502262445, "grad_norm": 1.5717041619152496, "learning_rate": 2.6742865559642737e-06, "loss": 0.1188, "step": 2168 }, { "epoch": 1.96289592760181, "grad_norm": 1.610366325354885, "learning_rate": 2.6700929553725573e-06, "loss": 0.1046, "step": 2169 }, { "epoch": 1.9638009049773757, "grad_norm": 1.5545251628949806, "learning_rate": 2.6659014473086665e-06, "loss": 0.0751, "step": 2170 }, { "epoch": 1.9647058823529413, "grad_norm": 1.4683524919078894, "learning_rate": 2.6617120355370667e-06, "loss": 0.0954, "step": 2171 }, { "epoch": 1.965610859728507, "grad_norm": 1.2017069737501727, "learning_rate": 2.6575247238203327e-06, "loss": 0.0628, "step": 2172 }, { "epoch": 1.9665158371040725, "grad_norm": 3.4310386766462586, "learning_rate": 2.653339515919162e-06, "loss": 0.1393, "step": 2173 }, { "epoch": 1.9674208144796381, "grad_norm": 1.7224960123264432, "learning_rate": 2.6491564155923555e-06, "loss": 0.1194, "step": 2174 }, { "epoch": 1.9683257918552037, "grad_norm": 1.4516370377673735, "learning_rate": 2.6449754265968263e-06, "loss": 0.0838, "step": 2175 }, { "epoch": 1.9692307692307693, "grad_norm": 1.2005295632148294, "learning_rate": 2.64079655268759e-06, "loss": 0.0602, "step": 2176 }, { "epoch": 1.970135746606335, "grad_norm": 1.5341641342296441, "learning_rate": 2.636619797617759e-06, "loss": 0.0716, "step": 2177 }, { "epoch": 1.9710407239819006, "grad_norm": 1.8163974066364703, "learning_rate": 2.63244516513855e-06, "loss": 0.0949, "step": 2178 }, { "epoch": 1.9719457013574662, "grad_norm": 1.5092757791264835, "learning_rate": 2.6282726589992634e-06, "loss": 0.0769, "step": 2179 }, { "epoch": 1.9728506787330318, "grad_norm": 1.3181301458765813, "learning_rate": 2.6241022829473e-06, "loss": 0.107, "step": 2180 }, { "epoch": 1.9737556561085974, "grad_norm": 1.3562839261258368, "learning_rate": 2.6199340407281437e-06, "loss": 0.0725, "step": 2181 }, { "epoch": 1.974660633484163, "grad_norm": 1.6877793984280691, "learning_rate": 2.6157679360853574e-06, "loss": 0.0904, "step": 2182 }, { "epoch": 1.9755656108597286, "grad_norm": 1.830441788048403, "learning_rate": 2.6116039727605925e-06, "loss": 0.0979, "step": 2183 }, { "epoch": 1.9764705882352942, "grad_norm": 1.7486444710670528, "learning_rate": 2.607442154493568e-06, "loss": 0.112, "step": 2184 }, { "epoch": 1.9773755656108598, "grad_norm": 1.3858554836409827, "learning_rate": 2.603282485022085e-06, "loss": 0.0839, "step": 2185 }, { "epoch": 1.9782805429864254, "grad_norm": 1.164731212442636, "learning_rate": 2.599124968082012e-06, "loss": 0.0614, "step": 2186 }, { "epoch": 1.979185520361991, "grad_norm": 1.017797613014104, "learning_rate": 2.594969607407279e-06, "loss": 0.0541, "step": 2187 }, { "epoch": 1.9800904977375566, "grad_norm": 1.1598484407806444, "learning_rate": 2.590816406729887e-06, "loss": 0.0693, "step": 2188 }, { "epoch": 1.9809954751131222, "grad_norm": 1.73543316761775, "learning_rate": 2.586665369779894e-06, "loss": 0.1118, "step": 2189 }, { "epoch": 1.9819004524886878, "grad_norm": 1.1583665260694327, "learning_rate": 2.5825165002854124e-06, "loss": 0.056, "step": 2190 }, { "epoch": 1.9828054298642535, "grad_norm": 1.249757456934596, "learning_rate": 2.578369801972613e-06, "loss": 0.0912, "step": 2191 }, { "epoch": 1.983710407239819, "grad_norm": 1.877667481511001, "learning_rate": 2.5742252785657094e-06, "loss": 0.1245, "step": 2192 }, { "epoch": 1.9846153846153847, "grad_norm": 1.3224342456755285, "learning_rate": 2.57008293378697e-06, "loss": 0.0864, "step": 2193 }, { "epoch": 1.9855203619909503, "grad_norm": 1.2480073723412364, "learning_rate": 2.5659427713567026e-06, "loss": 0.0625, "step": 2194 }, { "epoch": 1.9864253393665159, "grad_norm": 1.28559016159755, "learning_rate": 2.5618047949932524e-06, "loss": 0.0662, "step": 2195 }, { "epoch": 1.9873303167420815, "grad_norm": 1.4689017586790123, "learning_rate": 2.5576690084130085e-06, "loss": 0.0757, "step": 2196 }, { "epoch": 1.988235294117647, "grad_norm": 3.035933336742508, "learning_rate": 2.5535354153303827e-06, "loss": 0.1878, "step": 2197 }, { "epoch": 1.9891402714932127, "grad_norm": 1.0482283512166854, "learning_rate": 2.549404019457827e-06, "loss": 0.0601, "step": 2198 }, { "epoch": 1.9900452488687783, "grad_norm": 1.23488323992723, "learning_rate": 2.5452748245058177e-06, "loss": 0.0816, "step": 2199 }, { "epoch": 1.990950226244344, "grad_norm": 1.234743359080253, "learning_rate": 2.5411478341828475e-06, "loss": 0.0681, "step": 2200 }, { "epoch": 1.9918552036199095, "grad_norm": 1.484040798387322, "learning_rate": 2.5370230521954374e-06, "loss": 0.0892, "step": 2201 }, { "epoch": 1.9927601809954751, "grad_norm": 1.3026981860834812, "learning_rate": 2.532900482248124e-06, "loss": 0.0786, "step": 2202 }, { "epoch": 1.9936651583710407, "grad_norm": 1.0880345994539362, "learning_rate": 2.52878012804345e-06, "loss": 0.0563, "step": 2203 }, { "epoch": 1.9945701357466064, "grad_norm": 1.4756864167730288, "learning_rate": 2.5246619932819784e-06, "loss": 0.0746, "step": 2204 }, { "epoch": 1.995475113122172, "grad_norm": 1.584095345840437, "learning_rate": 2.5205460816622684e-06, "loss": 0.094, "step": 2205 }, { "epoch": 1.9963800904977376, "grad_norm": 1.2025370468863426, "learning_rate": 2.5164323968808918e-06, "loss": 0.0777, "step": 2206 }, { "epoch": 1.9972850678733032, "grad_norm": 1.293046553304415, "learning_rate": 2.512320942632417e-06, "loss": 0.0649, "step": 2207 }, { "epoch": 1.9981900452488688, "grad_norm": 1.7777063587082067, "learning_rate": 2.508211722609405e-06, "loss": 0.114, "step": 2208 }, { "epoch": 1.9990950226244344, "grad_norm": 1.7926064083803739, "learning_rate": 2.504104740502419e-06, "loss": 0.0863, "step": 2209 }, { "epoch": 2.0, "grad_norm": 1.1880609430145956, "learning_rate": 2.5000000000000015e-06, "loss": 0.0808, "step": 2210 }, { "epoch": 2.0009049773755656, "grad_norm": 0.9912250499553991, "learning_rate": 2.49589750478869e-06, "loss": 0.0396, "step": 2211 }, { "epoch": 2.001809954751131, "grad_norm": 1.0953433208564833, "learning_rate": 2.4917972585530054e-06, "loss": 0.0536, "step": 2212 }, { "epoch": 2.002714932126697, "grad_norm": 0.8816904569285002, "learning_rate": 2.4876992649754417e-06, "loss": 0.0393, "step": 2213 }, { "epoch": 2.0036199095022624, "grad_norm": 0.7685361059204949, "learning_rate": 2.4836035277364765e-06, "loss": 0.0414, "step": 2214 }, { "epoch": 2.004524886877828, "grad_norm": 0.8364202429692399, "learning_rate": 2.479510050514561e-06, "loss": 0.0303, "step": 2215 }, { "epoch": 2.0054298642533936, "grad_norm": 0.7729467101102588, "learning_rate": 2.4754188369861104e-06, "loss": 0.0374, "step": 2216 }, { "epoch": 2.0063348416289593, "grad_norm": 1.223356569217661, "learning_rate": 2.471329890825514e-06, "loss": 0.0725, "step": 2217 }, { "epoch": 2.007239819004525, "grad_norm": 0.8929012787944102, "learning_rate": 2.4672432157051185e-06, "loss": 0.0358, "step": 2218 }, { "epoch": 2.0081447963800905, "grad_norm": 0.9468871797619344, "learning_rate": 2.463158815295236e-06, "loss": 0.0401, "step": 2219 }, { "epoch": 2.009049773755656, "grad_norm": 0.9592117419392461, "learning_rate": 2.4590766932641353e-06, "loss": 0.0445, "step": 2220 }, { "epoch": 2.0099547511312217, "grad_norm": 1.1085241145328442, "learning_rate": 2.454996853278033e-06, "loss": 0.0583, "step": 2221 }, { "epoch": 2.0108597285067873, "grad_norm": 0.9260010166175255, "learning_rate": 2.4509192990011053e-06, "loss": 0.0506, "step": 2222 }, { "epoch": 2.011764705882353, "grad_norm": 0.8669246516693493, "learning_rate": 2.4468440340954664e-06, "loss": 0.0431, "step": 2223 }, { "epoch": 2.0126696832579185, "grad_norm": 0.9992095090085016, "learning_rate": 2.442771062221181e-06, "loss": 0.0561, "step": 2224 }, { "epoch": 2.013574660633484, "grad_norm": 1.3979459287217566, "learning_rate": 2.438700387036253e-06, "loss": 0.0878, "step": 2225 }, { "epoch": 2.0144796380090497, "grad_norm": 1.102321245943693, "learning_rate": 2.4346320121966192e-06, "loss": 0.0461, "step": 2226 }, { "epoch": 2.0153846153846153, "grad_norm": 2.056876694165583, "learning_rate": 2.430565941356157e-06, "loss": 0.0706, "step": 2227 }, { "epoch": 2.016289592760181, "grad_norm": 1.5514959104135901, "learning_rate": 2.4265021781666707e-06, "loss": 0.0756, "step": 2228 }, { "epoch": 2.0171945701357465, "grad_norm": 1.2264225262116255, "learning_rate": 2.4224407262778925e-06, "loss": 0.0727, "step": 2229 }, { "epoch": 2.018099547511312, "grad_norm": 0.786618706333402, "learning_rate": 2.4183815893374817e-06, "loss": 0.0374, "step": 2230 }, { "epoch": 2.0190045248868778, "grad_norm": 2.0325699516130573, "learning_rate": 2.414324770991011e-06, "loss": 0.0576, "step": 2231 }, { "epoch": 2.0199095022624434, "grad_norm": 0.9776350383362603, "learning_rate": 2.410270274881981e-06, "loss": 0.0499, "step": 2232 }, { "epoch": 2.020814479638009, "grad_norm": 1.0459104203751095, "learning_rate": 2.406218104651797e-06, "loss": 0.0504, "step": 2233 }, { "epoch": 2.0217194570135746, "grad_norm": 0.8460033076228194, "learning_rate": 2.4021682639397807e-06, "loss": 0.0391, "step": 2234 }, { "epoch": 2.02262443438914, "grad_norm": 1.005587364080754, "learning_rate": 2.3981207563831633e-06, "loss": 0.0314, "step": 2235 }, { "epoch": 2.023529411764706, "grad_norm": 1.1965060289290228, "learning_rate": 2.3940755856170744e-06, "loss": 0.0477, "step": 2236 }, { "epoch": 2.0244343891402714, "grad_norm": 0.9234961578272607, "learning_rate": 2.3900327552745512e-06, "loss": 0.0484, "step": 2237 }, { "epoch": 2.025339366515837, "grad_norm": 1.0832169825367228, "learning_rate": 2.385992268986523e-06, "loss": 0.0479, "step": 2238 }, { "epoch": 2.0262443438914026, "grad_norm": 0.7573829994869705, "learning_rate": 2.3819541303818192e-06, "loss": 0.0344, "step": 2239 }, { "epoch": 2.0271493212669682, "grad_norm": 1.3895962228635796, "learning_rate": 2.3779183430871596e-06, "loss": 0.0667, "step": 2240 }, { "epoch": 2.028054298642534, "grad_norm": 1.261433287400531, "learning_rate": 2.3738849107271477e-06, "loss": 0.0576, "step": 2241 }, { "epoch": 2.0289592760180994, "grad_norm": 0.9832196739914639, "learning_rate": 2.369853836924279e-06, "loss": 0.0558, "step": 2242 }, { "epoch": 2.029864253393665, "grad_norm": 1.3024143819271154, "learning_rate": 2.365825125298924e-06, "loss": 0.0617, "step": 2243 }, { "epoch": 2.0307692307692307, "grad_norm": 0.6375750855222112, "learning_rate": 2.3617987794693358e-06, "loss": 0.0327, "step": 2244 }, { "epoch": 2.0316742081447963, "grad_norm": 1.2590690166900378, "learning_rate": 2.3577748030516443e-06, "loss": 0.0626, "step": 2245 }, { "epoch": 2.032579185520362, "grad_norm": 0.8603640007007963, "learning_rate": 2.353753199659845e-06, "loss": 0.0468, "step": 2246 }, { "epoch": 2.0334841628959275, "grad_norm": 0.8978847418897999, "learning_rate": 2.3497339729058083e-06, "loss": 0.0486, "step": 2247 }, { "epoch": 2.034389140271493, "grad_norm": 1.081140115424151, "learning_rate": 2.345717126399269e-06, "loss": 0.0854, "step": 2248 }, { "epoch": 2.0352941176470587, "grad_norm": 1.4279320460035625, "learning_rate": 2.341702663747819e-06, "loss": 0.0912, "step": 2249 }, { "epoch": 2.0361990950226243, "grad_norm": 0.8987384011214158, "learning_rate": 2.3376905885569185e-06, "loss": 0.0445, "step": 2250 }, { "epoch": 2.03710407239819, "grad_norm": 0.9792189592749779, "learning_rate": 2.333680904429873e-06, "loss": 0.0523, "step": 2251 }, { "epoch": 2.0380090497737555, "grad_norm": 0.8165895103908756, "learning_rate": 2.329673614967848e-06, "loss": 0.0359, "step": 2252 }, { "epoch": 2.038914027149321, "grad_norm": 1.0881468674145935, "learning_rate": 2.325668723769858e-06, "loss": 0.0505, "step": 2253 }, { "epoch": 2.0398190045248867, "grad_norm": 0.968157235688072, "learning_rate": 2.321666234432758e-06, "loss": 0.0525, "step": 2254 }, { "epoch": 2.0407239819004523, "grad_norm": 1.0128606727995364, "learning_rate": 2.3176661505512534e-06, "loss": 0.0434, "step": 2255 }, { "epoch": 2.041628959276018, "grad_norm": 1.2330613789480591, "learning_rate": 2.313668475717881e-06, "loss": 0.0629, "step": 2256 }, { "epoch": 2.0425339366515836, "grad_norm": 1.2207587591143456, "learning_rate": 2.3096732135230206e-06, "loss": 0.0452, "step": 2257 }, { "epoch": 2.043438914027149, "grad_norm": 1.1386395348792469, "learning_rate": 2.305680367554884e-06, "loss": 0.0489, "step": 2258 }, { "epoch": 2.0443438914027148, "grad_norm": 1.3480759865235092, "learning_rate": 2.301689941399509e-06, "loss": 0.059, "step": 2259 }, { "epoch": 2.0452488687782804, "grad_norm": 0.8146846948028001, "learning_rate": 2.2977019386407653e-06, "loss": 0.0411, "step": 2260 }, { "epoch": 2.046153846153846, "grad_norm": 0.9484479459968158, "learning_rate": 2.2937163628603437e-06, "loss": 0.0434, "step": 2261 }, { "epoch": 2.0470588235294116, "grad_norm": 1.0414903248118579, "learning_rate": 2.289733217637753e-06, "loss": 0.0511, "step": 2262 }, { "epoch": 2.047963800904977, "grad_norm": 0.8502961930279713, "learning_rate": 2.285752506550325e-06, "loss": 0.0395, "step": 2263 }, { "epoch": 2.048868778280543, "grad_norm": 1.0245478399829056, "learning_rate": 2.281774233173198e-06, "loss": 0.0531, "step": 2264 }, { "epoch": 2.0497737556561084, "grad_norm": 0.9893308944248003, "learning_rate": 2.2777984010793264e-06, "loss": 0.0477, "step": 2265 }, { "epoch": 2.050678733031674, "grad_norm": 1.1759327927456467, "learning_rate": 2.2738250138394725e-06, "loss": 0.056, "step": 2266 }, { "epoch": 2.0515837104072396, "grad_norm": 0.7242124584249288, "learning_rate": 2.269854075022197e-06, "loss": 0.0322, "step": 2267 }, { "epoch": 2.0524886877828052, "grad_norm": 1.2377244070331592, "learning_rate": 2.2658855881938685e-06, "loss": 0.059, "step": 2268 }, { "epoch": 2.053393665158371, "grad_norm": 0.7370276138772038, "learning_rate": 2.261919556918647e-06, "loss": 0.0322, "step": 2269 }, { "epoch": 2.0542986425339365, "grad_norm": 0.8782637632647999, "learning_rate": 2.2579559847584924e-06, "loss": 0.0484, "step": 2270 }, { "epoch": 2.055203619909502, "grad_norm": 0.9172890391367426, "learning_rate": 2.2539948752731555e-06, "loss": 0.0373, "step": 2271 }, { "epoch": 2.0561085972850677, "grad_norm": 0.9702829886188283, "learning_rate": 2.2500362320201706e-06, "loss": 0.0552, "step": 2272 }, { "epoch": 2.0570135746606333, "grad_norm": 0.7792250413276972, "learning_rate": 2.246080058554862e-06, "loss": 0.035, "step": 2273 }, { "epoch": 2.057918552036199, "grad_norm": 0.9342458203062465, "learning_rate": 2.2421263584303353e-06, "loss": 0.0477, "step": 2274 }, { "epoch": 2.0588235294117645, "grad_norm": 1.3620425780202923, "learning_rate": 2.238175135197471e-06, "loss": 0.0685, "step": 2275 }, { "epoch": 2.05972850678733, "grad_norm": 0.8216242096520002, "learning_rate": 2.2342263924049306e-06, "loss": 0.0459, "step": 2276 }, { "epoch": 2.0606334841628957, "grad_norm": 0.9313231212089851, "learning_rate": 2.2302801335991414e-06, "loss": 0.0386, "step": 2277 }, { "epoch": 2.0615384615384613, "grad_norm": 0.8092985715643326, "learning_rate": 2.2263363623243058e-06, "loss": 0.0493, "step": 2278 }, { "epoch": 2.062443438914027, "grad_norm": 3.808678274339399, "learning_rate": 2.22239508212239e-06, "loss": 0.2407, "step": 2279 }, { "epoch": 2.0633484162895925, "grad_norm": 0.9856661353738887, "learning_rate": 2.2184562965331203e-06, "loss": 0.0469, "step": 2280 }, { "epoch": 2.064253393665158, "grad_norm": 0.908194887175138, "learning_rate": 2.214520009093988e-06, "loss": 0.0336, "step": 2281 }, { "epoch": 2.065158371040724, "grad_norm": 1.1314556971227905, "learning_rate": 2.210586223340234e-06, "loss": 0.0631, "step": 2282 }, { "epoch": 2.06606334841629, "grad_norm": 0.9365159414156122, "learning_rate": 2.2066549428048573e-06, "loss": 0.0553, "step": 2283 }, { "epoch": 2.0669683257918554, "grad_norm": 0.7837035513950583, "learning_rate": 2.2027261710186075e-06, "loss": 0.055, "step": 2284 }, { "epoch": 2.067873303167421, "grad_norm": 0.936112238495506, "learning_rate": 2.1987999115099763e-06, "loss": 0.0425, "step": 2285 }, { "epoch": 2.0687782805429866, "grad_norm": 0.9055490888836492, "learning_rate": 2.1948761678052026e-06, "loss": 0.042, "step": 2286 }, { "epoch": 2.0696832579185522, "grad_norm": 0.8429426794418236, "learning_rate": 2.1909549434282683e-06, "loss": 0.0418, "step": 2287 }, { "epoch": 2.070588235294118, "grad_norm": 1.9484321281629942, "learning_rate": 2.1870362419008844e-06, "loss": 0.1039, "step": 2288 }, { "epoch": 2.0714932126696834, "grad_norm": 0.8305570216030792, "learning_rate": 2.183120066742506e-06, "loss": 0.0344, "step": 2289 }, { "epoch": 2.072398190045249, "grad_norm": 0.9945213682432923, "learning_rate": 2.17920642147031e-06, "loss": 0.0518, "step": 2290 }, { "epoch": 2.0733031674208147, "grad_norm": 0.8288331727946717, "learning_rate": 2.175295309599208e-06, "loss": 0.0596, "step": 2291 }, { "epoch": 2.0742081447963803, "grad_norm": 1.201107584834119, "learning_rate": 2.1713867346418354e-06, "loss": 0.0579, "step": 2292 }, { "epoch": 2.075113122171946, "grad_norm": 0.7649459389928531, "learning_rate": 2.1674807001085433e-06, "loss": 0.0367, "step": 2293 }, { "epoch": 2.0760180995475115, "grad_norm": 0.9121445814273598, "learning_rate": 2.163577209507411e-06, "loss": 0.0381, "step": 2294 }, { "epoch": 2.076923076923077, "grad_norm": 0.8970347654941218, "learning_rate": 2.159676266344222e-06, "loss": 0.0419, "step": 2295 }, { "epoch": 2.0778280542986427, "grad_norm": 0.8875772976191906, "learning_rate": 2.15577787412248e-06, "loss": 0.0502, "step": 2296 }, { "epoch": 2.0787330316742083, "grad_norm": 0.6708989348635835, "learning_rate": 2.1518820363433975e-06, "loss": 0.0349, "step": 2297 }, { "epoch": 2.079638009049774, "grad_norm": 1.0240700277752606, "learning_rate": 2.147988756505886e-06, "loss": 0.0517, "step": 2298 }, { "epoch": 2.0805429864253395, "grad_norm": 0.8718895271429873, "learning_rate": 2.1440980381065684e-06, "loss": 0.0408, "step": 2299 }, { "epoch": 2.081447963800905, "grad_norm": 0.9018918365277958, "learning_rate": 2.140209884639759e-06, "loss": 0.0425, "step": 2300 }, { "epoch": 2.0823529411764707, "grad_norm": 0.7484410244296416, "learning_rate": 2.136324299597474e-06, "loss": 0.0369, "step": 2301 }, { "epoch": 2.0832579185520363, "grad_norm": 0.7096795837079026, "learning_rate": 2.1324412864694237e-06, "loss": 0.0317, "step": 2302 }, { "epoch": 2.084162895927602, "grad_norm": 0.7783332749088212, "learning_rate": 2.1285608487430013e-06, "loss": 0.0366, "step": 2303 }, { "epoch": 2.0850678733031676, "grad_norm": 0.8681537378882966, "learning_rate": 2.124682989903295e-06, "loss": 0.0401, "step": 2304 }, { "epoch": 2.085972850678733, "grad_norm": 1.4415017106639592, "learning_rate": 2.120807713433074e-06, "loss": 0.0721, "step": 2305 }, { "epoch": 2.086877828054299, "grad_norm": 1.0780768118145132, "learning_rate": 2.116935022812785e-06, "loss": 0.0571, "step": 2306 }, { "epoch": 2.0877828054298644, "grad_norm": 1.0054217242666277, "learning_rate": 2.1130649215205583e-06, "loss": 0.0406, "step": 2307 }, { "epoch": 2.08868778280543, "grad_norm": 0.7413889347699699, "learning_rate": 2.1091974130321922e-06, "loss": 0.0379, "step": 2308 }, { "epoch": 2.0895927601809956, "grad_norm": 0.7221698819016884, "learning_rate": 2.1053325008211613e-06, "loss": 0.0297, "step": 2309 }, { "epoch": 2.090497737556561, "grad_norm": 0.8765836899551135, "learning_rate": 2.1014701883586087e-06, "loss": 0.0486, "step": 2310 }, { "epoch": 2.091402714932127, "grad_norm": 0.8706312624066331, "learning_rate": 2.0976104791133373e-06, "loss": 0.0397, "step": 2311 }, { "epoch": 2.0923076923076924, "grad_norm": 0.8399204171897292, "learning_rate": 2.0937533765518187e-06, "loss": 0.0343, "step": 2312 }, { "epoch": 2.093212669683258, "grad_norm": 1.2675717479130881, "learning_rate": 2.0898988841381768e-06, "loss": 0.0768, "step": 2313 }, { "epoch": 2.0941176470588236, "grad_norm": 0.8810269141609063, "learning_rate": 2.0860470053341957e-06, "loss": 0.0459, "step": 2314 }, { "epoch": 2.0950226244343892, "grad_norm": 0.7634595727786504, "learning_rate": 2.082197743599314e-06, "loss": 0.0373, "step": 2315 }, { "epoch": 2.095927601809955, "grad_norm": 1.0788596462807534, "learning_rate": 2.078351102390613e-06, "loss": 0.0507, "step": 2316 }, { "epoch": 2.0968325791855205, "grad_norm": 0.6799295958266247, "learning_rate": 2.0745070851628255e-06, "loss": 0.0304, "step": 2317 }, { "epoch": 2.097737556561086, "grad_norm": 1.2286424284375774, "learning_rate": 2.070665695368329e-06, "loss": 0.0607, "step": 2318 }, { "epoch": 2.0986425339366517, "grad_norm": 1.4117434893673892, "learning_rate": 2.0668269364571358e-06, "loss": 0.0671, "step": 2319 }, { "epoch": 2.0995475113122173, "grad_norm": 0.9210377541761785, "learning_rate": 2.0629908118769004e-06, "loss": 0.0569, "step": 2320 }, { "epoch": 2.100452488687783, "grad_norm": 0.9374241910253764, "learning_rate": 2.0591573250729073e-06, "loss": 0.0491, "step": 2321 }, { "epoch": 2.1013574660633485, "grad_norm": 1.7094382149745269, "learning_rate": 2.0553264794880757e-06, "loss": 0.082, "step": 2322 }, { "epoch": 2.102262443438914, "grad_norm": 0.7474124237461709, "learning_rate": 2.0514982785629517e-06, "loss": 0.0307, "step": 2323 }, { "epoch": 2.1031674208144797, "grad_norm": 1.0450653252701478, "learning_rate": 2.0476727257357027e-06, "loss": 0.0481, "step": 2324 }, { "epoch": 2.1040723981900453, "grad_norm": 1.0643315158278455, "learning_rate": 2.043849824442124e-06, "loss": 0.0561, "step": 2325 }, { "epoch": 2.104977375565611, "grad_norm": 1.7820000650400851, "learning_rate": 2.040029578115623e-06, "loss": 0.0698, "step": 2326 }, { "epoch": 2.1058823529411765, "grad_norm": 1.39730739915926, "learning_rate": 2.0362119901872262e-06, "loss": 0.09, "step": 2327 }, { "epoch": 2.106787330316742, "grad_norm": 0.9217243959257071, "learning_rate": 2.032397064085575e-06, "loss": 0.048, "step": 2328 }, { "epoch": 2.1076923076923078, "grad_norm": 0.9804442345975442, "learning_rate": 2.028584803236914e-06, "loss": 0.0524, "step": 2329 }, { "epoch": 2.1085972850678734, "grad_norm": 0.6744851853167925, "learning_rate": 2.024775211065098e-06, "loss": 0.0262, "step": 2330 }, { "epoch": 2.109502262443439, "grad_norm": 0.99213580872258, "learning_rate": 2.0209682909915856e-06, "loss": 0.0529, "step": 2331 }, { "epoch": 2.1104072398190046, "grad_norm": 1.2633231942037075, "learning_rate": 2.017164046435433e-06, "loss": 0.0429, "step": 2332 }, { "epoch": 2.11131221719457, "grad_norm": 0.7327415684772112, "learning_rate": 2.013362480813298e-06, "loss": 0.0295, "step": 2333 }, { "epoch": 2.112217194570136, "grad_norm": 0.8882818085542539, "learning_rate": 2.009563597539424e-06, "loss": 0.0337, "step": 2334 }, { "epoch": 2.1131221719457014, "grad_norm": 1.1076301548496412, "learning_rate": 2.0057674000256556e-06, "loss": 0.0552, "step": 2335 }, { "epoch": 2.114027149321267, "grad_norm": 0.7483612050787772, "learning_rate": 2.001973891681416e-06, "loss": 0.042, "step": 2336 }, { "epoch": 2.1149321266968326, "grad_norm": 0.8818130350253378, "learning_rate": 1.998183075913719e-06, "loss": 0.0362, "step": 2337 }, { "epoch": 2.1158371040723982, "grad_norm": 0.6857028908230189, "learning_rate": 1.9943949561271606e-06, "loss": 0.0362, "step": 2338 }, { "epoch": 2.116742081447964, "grad_norm": 0.9454311064826166, "learning_rate": 1.9906095357239095e-06, "loss": 0.04, "step": 2339 }, { "epoch": 2.1176470588235294, "grad_norm": 0.9468427777264223, "learning_rate": 1.9868268181037186e-06, "loss": 0.0351, "step": 2340 }, { "epoch": 2.118552036199095, "grad_norm": 5.143549546820368, "learning_rate": 1.9830468066639045e-06, "loss": 0.2332, "step": 2341 }, { "epoch": 2.1194570135746607, "grad_norm": 1.1664615716295876, "learning_rate": 1.979269504799359e-06, "loss": 0.0583, "step": 2342 }, { "epoch": 2.1203619909502263, "grad_norm": 0.8879259658805669, "learning_rate": 1.9754949159025414e-06, "loss": 0.0353, "step": 2343 }, { "epoch": 2.121266968325792, "grad_norm": 1.0376456480260756, "learning_rate": 1.9717230433634686e-06, "loss": 0.0496, "step": 2344 }, { "epoch": 2.1221719457013575, "grad_norm": 0.9342769323364928, "learning_rate": 1.967953890569723e-06, "loss": 0.0457, "step": 2345 }, { "epoch": 2.123076923076923, "grad_norm": 1.638791143792905, "learning_rate": 1.9641874609064443e-06, "loss": 0.0723, "step": 2346 }, { "epoch": 2.1239819004524887, "grad_norm": 1.024531269674049, "learning_rate": 1.9604237577563212e-06, "loss": 0.0565, "step": 2347 }, { "epoch": 2.1248868778280543, "grad_norm": 0.7075735359898796, "learning_rate": 1.956662784499601e-06, "loss": 0.0305, "step": 2348 }, { "epoch": 2.12579185520362, "grad_norm": 0.9116939772218543, "learning_rate": 1.9529045445140727e-06, "loss": 0.0494, "step": 2349 }, { "epoch": 2.1266968325791855, "grad_norm": 0.7305236228483667, "learning_rate": 1.9491490411750745e-06, "loss": 0.036, "step": 2350 }, { "epoch": 2.127601809954751, "grad_norm": 0.6818129748995472, "learning_rate": 1.9453962778554876e-06, "loss": 0.0298, "step": 2351 }, { "epoch": 2.1285067873303167, "grad_norm": 0.6577284015657147, "learning_rate": 1.941646257925727e-06, "loss": 0.0375, "step": 2352 }, { "epoch": 2.1294117647058823, "grad_norm": 0.9200929902778257, "learning_rate": 1.937898984753751e-06, "loss": 0.0455, "step": 2353 }, { "epoch": 2.130316742081448, "grad_norm": 0.8991117129353328, "learning_rate": 1.9341544617050435e-06, "loss": 0.0491, "step": 2354 }, { "epoch": 2.1312217194570136, "grad_norm": 0.9430973312636298, "learning_rate": 1.9304126921426235e-06, "loss": 0.0477, "step": 2355 }, { "epoch": 2.132126696832579, "grad_norm": 1.1847446613880255, "learning_rate": 1.9266736794270385e-06, "loss": 0.0555, "step": 2356 }, { "epoch": 2.1330316742081448, "grad_norm": 1.0682595995783288, "learning_rate": 1.922937426916353e-06, "loss": 0.043, "step": 2357 }, { "epoch": 2.1339366515837104, "grad_norm": 1.1177123388179457, "learning_rate": 1.919203937966161e-06, "loss": 0.0542, "step": 2358 }, { "epoch": 2.134841628959276, "grad_norm": 0.8586798994157264, "learning_rate": 1.915473215929566e-06, "loss": 0.0379, "step": 2359 }, { "epoch": 2.1357466063348416, "grad_norm": 0.8099993431265294, "learning_rate": 1.9117452641571934e-06, "loss": 0.0424, "step": 2360 }, { "epoch": 2.136651583710407, "grad_norm": 0.7791273048581713, "learning_rate": 1.9080200859971794e-06, "loss": 0.0378, "step": 2361 }, { "epoch": 2.137556561085973, "grad_norm": 1.0832988595330342, "learning_rate": 1.9042976847951638e-06, "loss": 0.057, "step": 2362 }, { "epoch": 2.1384615384615384, "grad_norm": 1.0233934939616278, "learning_rate": 1.9005780638942982e-06, "loss": 0.0592, "step": 2363 }, { "epoch": 2.139366515837104, "grad_norm": 0.9808518797628787, "learning_rate": 1.8968612266352376e-06, "loss": 0.0441, "step": 2364 }, { "epoch": 2.1402714932126696, "grad_norm": 0.8380097484269137, "learning_rate": 1.893147176356131e-06, "loss": 0.0506, "step": 2365 }, { "epoch": 2.1411764705882352, "grad_norm": 0.9840607158973618, "learning_rate": 1.8894359163926312e-06, "loss": 0.0527, "step": 2366 }, { "epoch": 2.142081447963801, "grad_norm": 0.8080216546401838, "learning_rate": 1.8857274500778788e-06, "loss": 0.0411, "step": 2367 }, { "epoch": 2.1429864253393665, "grad_norm": 0.7042593776203878, "learning_rate": 1.8820217807425095e-06, "loss": 0.0348, "step": 2368 }, { "epoch": 2.143891402714932, "grad_norm": 1.0452579044823118, "learning_rate": 1.8783189117146482e-06, "loss": 0.0484, "step": 2369 }, { "epoch": 2.1447963800904977, "grad_norm": 0.7083337362492458, "learning_rate": 1.8746188463198983e-06, "loss": 0.0306, "step": 2370 }, { "epoch": 2.1457013574660633, "grad_norm": 0.8138811176477975, "learning_rate": 1.8709215878813537e-06, "loss": 0.0342, "step": 2371 }, { "epoch": 2.146606334841629, "grad_norm": 3.3018689827437404, "learning_rate": 1.8672271397195784e-06, "loss": 0.122, "step": 2372 }, { "epoch": 2.1475113122171945, "grad_norm": 0.8686840529681125, "learning_rate": 1.8635355051526194e-06, "loss": 0.0354, "step": 2373 }, { "epoch": 2.14841628959276, "grad_norm": 1.2973318021974545, "learning_rate": 1.8598466874959969e-06, "loss": 0.0623, "step": 2374 }, { "epoch": 2.1493212669683257, "grad_norm": 1.4745766070264368, "learning_rate": 1.8561606900626938e-06, "loss": 0.0778, "step": 2375 }, { "epoch": 2.1502262443438913, "grad_norm": 0.7019555230456804, "learning_rate": 1.8524775161631676e-06, "loss": 0.0309, "step": 2376 }, { "epoch": 2.151131221719457, "grad_norm": 0.8892681340111187, "learning_rate": 1.8487971691053391e-06, "loss": 0.0555, "step": 2377 }, { "epoch": 2.1520361990950225, "grad_norm": 1.0647043469652089, "learning_rate": 1.8451196521945847e-06, "loss": 0.0387, "step": 2378 }, { "epoch": 2.152941176470588, "grad_norm": 2.120853952728789, "learning_rate": 1.8414449687337467e-06, "loss": 0.0906, "step": 2379 }, { "epoch": 2.1538461538461537, "grad_norm": 0.7298683634151624, "learning_rate": 1.8377731220231144e-06, "loss": 0.0279, "step": 2380 }, { "epoch": 2.1547511312217194, "grad_norm": 1.128466795656349, "learning_rate": 1.8341041153604362e-06, "loss": 0.0696, "step": 2381 }, { "epoch": 2.155656108597285, "grad_norm": 1.1301806510928134, "learning_rate": 1.8304379520409088e-06, "loss": 0.058, "step": 2382 }, { "epoch": 2.1565610859728506, "grad_norm": 0.7850935460450701, "learning_rate": 1.8267746353571703e-06, "loss": 0.0377, "step": 2383 }, { "epoch": 2.157466063348416, "grad_norm": 0.6743991484609266, "learning_rate": 1.8231141685993092e-06, "loss": 0.0292, "step": 2384 }, { "epoch": 2.158371040723982, "grad_norm": 0.7292523122793214, "learning_rate": 1.8194565550548477e-06, "loss": 0.0337, "step": 2385 }, { "epoch": 2.1592760180995474, "grad_norm": 0.8267529949022357, "learning_rate": 1.81580179800875e-06, "loss": 0.0346, "step": 2386 }, { "epoch": 2.160180995475113, "grad_norm": 0.8292845671339328, "learning_rate": 1.8121499007434151e-06, "loss": 0.0453, "step": 2387 }, { "epoch": 2.1610859728506786, "grad_norm": 1.2810013026358151, "learning_rate": 1.8085008665386688e-06, "loss": 0.0597, "step": 2388 }, { "epoch": 2.161990950226244, "grad_norm": 1.3155760691321028, "learning_rate": 1.8048546986717703e-06, "loss": 0.0529, "step": 2389 }, { "epoch": 2.16289592760181, "grad_norm": 1.0185145123364223, "learning_rate": 1.8012114004174048e-06, "loss": 0.0588, "step": 2390 }, { "epoch": 2.1638009049773754, "grad_norm": 0.964113426346921, "learning_rate": 1.7975709750476744e-06, "loss": 0.0525, "step": 2391 }, { "epoch": 2.164705882352941, "grad_norm": 0.8970497187118045, "learning_rate": 1.7939334258321094e-06, "loss": 0.0518, "step": 2392 }, { "epoch": 2.1656108597285066, "grad_norm": 0.9324363285106755, "learning_rate": 1.7902987560376483e-06, "loss": 0.0514, "step": 2393 }, { "epoch": 2.1665158371040723, "grad_norm": 0.7722556430598915, "learning_rate": 1.7866669689286499e-06, "loss": 0.0407, "step": 2394 }, { "epoch": 2.167420814479638, "grad_norm": 0.962974088049048, "learning_rate": 1.7830380677668836e-06, "loss": 0.0487, "step": 2395 }, { "epoch": 2.1683257918552035, "grad_norm": 0.7282483828966129, "learning_rate": 1.7794120558115214e-06, "loss": 0.0337, "step": 2396 }, { "epoch": 2.169230769230769, "grad_norm": 1.5037465306840858, "learning_rate": 1.7757889363191484e-06, "loss": 0.0403, "step": 2397 }, { "epoch": 2.1701357466063347, "grad_norm": 0.8292152973265101, "learning_rate": 1.7721687125437436e-06, "loss": 0.0409, "step": 2398 }, { "epoch": 2.1710407239819003, "grad_norm": 1.8126890725259959, "learning_rate": 1.7685513877366917e-06, "loss": 0.0558, "step": 2399 }, { "epoch": 2.171945701357466, "grad_norm": 0.6912520669024398, "learning_rate": 1.764936965146773e-06, "loss": 0.0377, "step": 2400 }, { "epoch": 2.1728506787330315, "grad_norm": 0.7153647732077454, "learning_rate": 1.761325448020157e-06, "loss": 0.0339, "step": 2401 }, { "epoch": 2.173755656108597, "grad_norm": 0.9032455681125224, "learning_rate": 1.757716839600409e-06, "loss": 0.0423, "step": 2402 }, { "epoch": 2.1746606334841627, "grad_norm": 0.57370054500016, "learning_rate": 1.7541111431284775e-06, "loss": 0.0254, "step": 2403 }, { "epoch": 2.1755656108597283, "grad_norm": 1.2938421541399208, "learning_rate": 1.7505083618426982e-06, "loss": 0.0689, "step": 2404 }, { "epoch": 2.176470588235294, "grad_norm": 1.7558886095930764, "learning_rate": 1.746908498978791e-06, "loss": 0.0883, "step": 2405 }, { "epoch": 2.1773755656108595, "grad_norm": 0.9491367618888851, "learning_rate": 1.7433115577698473e-06, "loss": 0.0433, "step": 2406 }, { "epoch": 2.178280542986425, "grad_norm": 0.9725271683336718, "learning_rate": 1.739717541446342e-06, "loss": 0.0534, "step": 2407 }, { "epoch": 2.1791855203619908, "grad_norm": 0.7498841893943593, "learning_rate": 1.7361264532361216e-06, "loss": 0.0404, "step": 2408 }, { "epoch": 2.1800904977375564, "grad_norm": 0.6744321882104648, "learning_rate": 1.7325382963643977e-06, "loss": 0.0311, "step": 2409 }, { "epoch": 2.180995475113122, "grad_norm": 0.8482389711369979, "learning_rate": 1.7289530740537569e-06, "loss": 0.0341, "step": 2410 }, { "epoch": 2.1819004524886876, "grad_norm": 0.9716825602292766, "learning_rate": 1.7253707895241423e-06, "loss": 0.0489, "step": 2411 }, { "epoch": 2.182805429864253, "grad_norm": 0.7720116986931215, "learning_rate": 1.7217914459928647e-06, "loss": 0.0423, "step": 2412 }, { "epoch": 2.1837104072398192, "grad_norm": 0.9240857994735179, "learning_rate": 1.7182150466745934e-06, "loss": 0.0538, "step": 2413 }, { "epoch": 2.184615384615385, "grad_norm": 0.7883962627972492, "learning_rate": 1.7146415947813472e-06, "loss": 0.0383, "step": 2414 }, { "epoch": 2.1855203619909505, "grad_norm": 0.6366336933351108, "learning_rate": 1.7110710935225055e-06, "loss": 0.0299, "step": 2415 }, { "epoch": 2.186425339366516, "grad_norm": 0.9714691463286033, "learning_rate": 1.7075035461047916e-06, "loss": 0.0528, "step": 2416 }, { "epoch": 2.1873303167420817, "grad_norm": 1.0240647390545219, "learning_rate": 1.7039389557322793e-06, "loss": 0.0479, "step": 2417 }, { "epoch": 2.1882352941176473, "grad_norm": 0.9573116038072697, "learning_rate": 1.7003773256063882e-06, "loss": 0.0451, "step": 2418 }, { "epoch": 2.189140271493213, "grad_norm": 0.7164685355206943, "learning_rate": 1.6968186589258734e-06, "loss": 0.0267, "step": 2419 }, { "epoch": 2.1900452488687785, "grad_norm": 0.7209297722944881, "learning_rate": 1.6932629588868332e-06, "loss": 0.0384, "step": 2420 }, { "epoch": 2.190950226244344, "grad_norm": 0.8621046980446793, "learning_rate": 1.6897102286827021e-06, "loss": 0.0392, "step": 2421 }, { "epoch": 2.1918552036199097, "grad_norm": 0.9771529153479585, "learning_rate": 1.6861604715042424e-06, "loss": 0.046, "step": 2422 }, { "epoch": 2.1927601809954753, "grad_norm": 1.0178236393269304, "learning_rate": 1.6826136905395529e-06, "loss": 0.0478, "step": 2423 }, { "epoch": 2.193665158371041, "grad_norm": 0.7768939270332926, "learning_rate": 1.679069888974052e-06, "loss": 0.0351, "step": 2424 }, { "epoch": 2.1945701357466065, "grad_norm": 0.7870415913884594, "learning_rate": 1.6755290699904881e-06, "loss": 0.0352, "step": 2425 }, { "epoch": 2.195475113122172, "grad_norm": 0.8449146060573636, "learning_rate": 1.67199123676893e-06, "loss": 0.0361, "step": 2426 }, { "epoch": 2.1963800904977377, "grad_norm": 0.8031995957129023, "learning_rate": 1.6684563924867619e-06, "loss": 0.0342, "step": 2427 }, { "epoch": 2.1972850678733034, "grad_norm": 0.9781589225089253, "learning_rate": 1.6649245403186881e-06, "loss": 0.058, "step": 2428 }, { "epoch": 2.198190045248869, "grad_norm": 0.8185341535541832, "learning_rate": 1.6613956834367196e-06, "loss": 0.031, "step": 2429 }, { "epoch": 2.1990950226244346, "grad_norm": 0.9696106474086303, "learning_rate": 1.6578698250101828e-06, "loss": 0.0555, "step": 2430 }, { "epoch": 2.2, "grad_norm": 1.0926491182036657, "learning_rate": 1.6543469682057105e-06, "loss": 0.0742, "step": 2431 }, { "epoch": 2.200904977375566, "grad_norm": 1.1492445640554672, "learning_rate": 1.6508271161872352e-06, "loss": 0.0596, "step": 2432 }, { "epoch": 2.2018099547511314, "grad_norm": 1.2403148940269366, "learning_rate": 1.6473102721159955e-06, "loss": 0.0617, "step": 2433 }, { "epoch": 2.202714932126697, "grad_norm": 0.7495225885932016, "learning_rate": 1.643796439150529e-06, "loss": 0.0346, "step": 2434 }, { "epoch": 2.2036199095022626, "grad_norm": 0.7341443402548741, "learning_rate": 1.6402856204466611e-06, "loss": 0.0341, "step": 2435 }, { "epoch": 2.204524886877828, "grad_norm": 0.9641196491786104, "learning_rate": 1.6367778191575223e-06, "loss": 0.0428, "step": 2436 }, { "epoch": 2.205429864253394, "grad_norm": 1.0455138881590667, "learning_rate": 1.6332730384335226e-06, "loss": 0.055, "step": 2437 }, { "epoch": 2.2063348416289594, "grad_norm": 0.9256969147171166, "learning_rate": 1.629771281422366e-06, "loss": 0.0403, "step": 2438 }, { "epoch": 2.207239819004525, "grad_norm": 0.6202449275748878, "learning_rate": 1.6262725512690347e-06, "loss": 0.0255, "step": 2439 }, { "epoch": 2.2081447963800906, "grad_norm": 0.7865649002085825, "learning_rate": 1.6227768511157976e-06, "loss": 0.0329, "step": 2440 }, { "epoch": 2.2090497737556563, "grad_norm": 0.6617288697839336, "learning_rate": 1.6192841841022016e-06, "loss": 0.0303, "step": 2441 }, { "epoch": 2.209954751131222, "grad_norm": 1.1811183505383276, "learning_rate": 1.615794553365066e-06, "loss": 0.068, "step": 2442 }, { "epoch": 2.2108597285067875, "grad_norm": 0.6856470947652591, "learning_rate": 1.6123079620384884e-06, "loss": 0.0297, "step": 2443 }, { "epoch": 2.211764705882353, "grad_norm": 0.8062713914282235, "learning_rate": 1.60882441325383e-06, "loss": 0.0383, "step": 2444 }, { "epoch": 2.2126696832579187, "grad_norm": 0.8738846160536903, "learning_rate": 1.6053439101397257e-06, "loss": 0.0474, "step": 2445 }, { "epoch": 2.2135746606334843, "grad_norm": 0.960404517841308, "learning_rate": 1.6018664558220737e-06, "loss": 0.0467, "step": 2446 }, { "epoch": 2.21447963800905, "grad_norm": 0.8761218057872954, "learning_rate": 1.5983920534240299e-06, "loss": 0.0423, "step": 2447 }, { "epoch": 2.2153846153846155, "grad_norm": 0.6494200788033256, "learning_rate": 1.5949207060660138e-06, "loss": 0.0328, "step": 2448 }, { "epoch": 2.216289592760181, "grad_norm": 0.784448372159251, "learning_rate": 1.5914524168657008e-06, "loss": 0.036, "step": 2449 }, { "epoch": 2.2171945701357467, "grad_norm": 0.9664262809600711, "learning_rate": 1.5879871889380155e-06, "loss": 0.0455, "step": 2450 }, { "epoch": 2.2180995475113123, "grad_norm": 1.5573401223696313, "learning_rate": 1.5845250253951395e-06, "loss": 0.0917, "step": 2451 }, { "epoch": 2.219004524886878, "grad_norm": 0.6643566757979478, "learning_rate": 1.5810659293464953e-06, "loss": 0.0307, "step": 2452 }, { "epoch": 2.2199095022624435, "grad_norm": 1.0583376476250193, "learning_rate": 1.5776099038987557e-06, "loss": 0.0482, "step": 2453 }, { "epoch": 2.220814479638009, "grad_norm": 0.7353800544476959, "learning_rate": 1.5741569521558352e-06, "loss": 0.044, "step": 2454 }, { "epoch": 2.2217194570135748, "grad_norm": 0.7908591382370382, "learning_rate": 1.5707070772188843e-06, "loss": 0.0382, "step": 2455 }, { "epoch": 2.2226244343891404, "grad_norm": 0.6604618243835711, "learning_rate": 1.5672602821862949e-06, "loss": 0.0301, "step": 2456 }, { "epoch": 2.223529411764706, "grad_norm": 1.0161986453399088, "learning_rate": 1.5638165701536866e-06, "loss": 0.0436, "step": 2457 }, { "epoch": 2.2244343891402716, "grad_norm": 0.787045641637352, "learning_rate": 1.560375944213916e-06, "loss": 0.036, "step": 2458 }, { "epoch": 2.225339366515837, "grad_norm": 0.9163019736469256, "learning_rate": 1.5569384074570683e-06, "loss": 0.0389, "step": 2459 }, { "epoch": 2.226244343891403, "grad_norm": 0.9605777774125946, "learning_rate": 1.5535039629704467e-06, "loss": 0.0444, "step": 2460 }, { "epoch": 2.2271493212669684, "grad_norm": 0.9766260606855056, "learning_rate": 1.5500726138385851e-06, "loss": 0.051, "step": 2461 }, { "epoch": 2.228054298642534, "grad_norm": 0.8113118118413968, "learning_rate": 1.546644363143236e-06, "loss": 0.0398, "step": 2462 }, { "epoch": 2.2289592760180996, "grad_norm": 0.7575241330441174, "learning_rate": 1.5432192139633645e-06, "loss": 0.0411, "step": 2463 }, { "epoch": 2.2298642533936652, "grad_norm": 0.799314870530451, "learning_rate": 1.5397971693751563e-06, "loss": 0.0316, "step": 2464 }, { "epoch": 2.230769230769231, "grad_norm": 0.8874700162318425, "learning_rate": 1.5363782324520033e-06, "loss": 0.0507, "step": 2465 }, { "epoch": 2.2316742081447964, "grad_norm": 0.7014544929342107, "learning_rate": 1.53296240626451e-06, "loss": 0.0335, "step": 2466 }, { "epoch": 2.232579185520362, "grad_norm": 0.8933509052171541, "learning_rate": 1.5295496938804877e-06, "loss": 0.0373, "step": 2467 }, { "epoch": 2.2334841628959277, "grad_norm": 0.857381312469773, "learning_rate": 1.5261400983649465e-06, "loss": 0.0387, "step": 2468 }, { "epoch": 2.2343891402714933, "grad_norm": 0.8632678333138551, "learning_rate": 1.5227336227801038e-06, "loss": 0.0455, "step": 2469 }, { "epoch": 2.235294117647059, "grad_norm": 0.9341612484749755, "learning_rate": 1.5193302701853674e-06, "loss": 0.0582, "step": 2470 }, { "epoch": 2.2361990950226245, "grad_norm": 0.6752191634059339, "learning_rate": 1.515930043637346e-06, "loss": 0.0318, "step": 2471 }, { "epoch": 2.23710407239819, "grad_norm": 0.8638401042172076, "learning_rate": 1.5125329461898408e-06, "loss": 0.038, "step": 2472 }, { "epoch": 2.2380090497737557, "grad_norm": 0.9319964150807272, "learning_rate": 1.509138980893838e-06, "loss": 0.0475, "step": 2473 }, { "epoch": 2.2389140271493213, "grad_norm": 1.019471336173116, "learning_rate": 1.5057481507975158e-06, "loss": 0.0338, "step": 2474 }, { "epoch": 2.239819004524887, "grad_norm": 0.7816104114598976, "learning_rate": 1.502360458946232e-06, "loss": 0.0293, "step": 2475 }, { "epoch": 2.2407239819004525, "grad_norm": 0.7359281980199075, "learning_rate": 1.49897590838253e-06, "loss": 0.0317, "step": 2476 }, { "epoch": 2.241628959276018, "grad_norm": 0.8100408598247035, "learning_rate": 1.4955945021461315e-06, "loss": 0.0404, "step": 2477 }, { "epoch": 2.2425339366515837, "grad_norm": 0.7057624179264087, "learning_rate": 1.49221624327393e-06, "loss": 0.0309, "step": 2478 }, { "epoch": 2.2434389140271493, "grad_norm": 0.9271814488534939, "learning_rate": 1.4888411347999976e-06, "loss": 0.0464, "step": 2479 }, { "epoch": 2.244343891402715, "grad_norm": 0.9507183270586785, "learning_rate": 1.4854691797555753e-06, "loss": 0.0373, "step": 2480 }, { "epoch": 2.2452488687782806, "grad_norm": 0.6679317125100059, "learning_rate": 1.4821003811690688e-06, "loss": 0.0298, "step": 2481 }, { "epoch": 2.246153846153846, "grad_norm": 0.6363559920370621, "learning_rate": 1.4787347420660541e-06, "loss": 0.0244, "step": 2482 }, { "epoch": 2.2470588235294118, "grad_norm": 0.886368803103128, "learning_rate": 1.475372265469265e-06, "loss": 0.0418, "step": 2483 }, { "epoch": 2.2479638009049774, "grad_norm": 0.8031458959432866, "learning_rate": 1.4720129543985972e-06, "loss": 0.0474, "step": 2484 }, { "epoch": 2.248868778280543, "grad_norm": 0.6270377301445172, "learning_rate": 1.4686568118711054e-06, "loss": 0.0257, "step": 2485 }, { "epoch": 2.2497737556561086, "grad_norm": 0.7242149867486656, "learning_rate": 1.4653038409009933e-06, "loss": 0.0345, "step": 2486 }, { "epoch": 2.250678733031674, "grad_norm": 0.9701573012671614, "learning_rate": 1.4619540444996227e-06, "loss": 0.0637, "step": 2487 }, { "epoch": 2.25158371040724, "grad_norm": 0.7524711496841208, "learning_rate": 1.4586074256754974e-06, "loss": 0.0318, "step": 2488 }, { "epoch": 2.2524886877828054, "grad_norm": 0.7907852204336028, "learning_rate": 1.4552639874342717e-06, "loss": 0.0336, "step": 2489 }, { "epoch": 2.253393665158371, "grad_norm": 1.056459552381597, "learning_rate": 1.451923732778745e-06, "loss": 0.0494, "step": 2490 }, { "epoch": 2.2542986425339366, "grad_norm": 0.8930747989477008, "learning_rate": 1.4485866647088515e-06, "loss": 0.0443, "step": 2491 }, { "epoch": 2.2552036199095022, "grad_norm": 1.3958845797552097, "learning_rate": 1.4452527862216687e-06, "loss": 0.0571, "step": 2492 }, { "epoch": 2.256108597285068, "grad_norm": 0.9832708715633575, "learning_rate": 1.441922100311408e-06, "loss": 0.07, "step": 2493 }, { "epoch": 2.2570135746606335, "grad_norm": 1.8346243606633144, "learning_rate": 1.438594609969412e-06, "loss": 0.0612, "step": 2494 }, { "epoch": 2.257918552036199, "grad_norm": 0.7355532491536344, "learning_rate": 1.435270318184156e-06, "loss": 0.0375, "step": 2495 }, { "epoch": 2.2588235294117647, "grad_norm": 0.9215158887822487, "learning_rate": 1.4319492279412388e-06, "loss": 0.0598, "step": 2496 }, { "epoch": 2.2597285067873303, "grad_norm": 0.6575679308921293, "learning_rate": 1.4286313422233877e-06, "loss": 0.0248, "step": 2497 }, { "epoch": 2.260633484162896, "grad_norm": 1.95523530639185, "learning_rate": 1.4253166640104522e-06, "loss": 0.0923, "step": 2498 }, { "epoch": 2.2615384615384615, "grad_norm": 1.0261553398178644, "learning_rate": 1.4220051962793952e-06, "loss": 0.0463, "step": 2499 }, { "epoch": 2.262443438914027, "grad_norm": 0.6944493619747503, "learning_rate": 1.418696942004304e-06, "loss": 0.0266, "step": 2500 }, { "epoch": 2.2633484162895927, "grad_norm": 1.1099891911542301, "learning_rate": 1.415391904156373e-06, "loss": 0.0475, "step": 2501 }, { "epoch": 2.2642533936651583, "grad_norm": 0.785779881121854, "learning_rate": 1.4120900857039127e-06, "loss": 0.0378, "step": 2502 }, { "epoch": 2.265158371040724, "grad_norm": 0.7130443053313417, "learning_rate": 1.4087914896123423e-06, "loss": 0.0274, "step": 2503 }, { "epoch": 2.2660633484162895, "grad_norm": 1.2710317716939696, "learning_rate": 1.4054961188441819e-06, "loss": 0.0594, "step": 2504 }, { "epoch": 2.266968325791855, "grad_norm": 0.8524348932608476, "learning_rate": 1.4022039763590595e-06, "loss": 0.0374, "step": 2505 }, { "epoch": 2.2678733031674208, "grad_norm": 0.8528055907485237, "learning_rate": 1.398915065113704e-06, "loss": 0.0425, "step": 2506 }, { "epoch": 2.2687782805429864, "grad_norm": 0.7352775486236619, "learning_rate": 1.3956293880619375e-06, "loss": 0.0407, "step": 2507 }, { "epoch": 2.269683257918552, "grad_norm": 0.8626184946136709, "learning_rate": 1.3923469481546841e-06, "loss": 0.0436, "step": 2508 }, { "epoch": 2.2705882352941176, "grad_norm": 0.6476300770998238, "learning_rate": 1.389067748339954e-06, "loss": 0.0297, "step": 2509 }, { "epoch": 2.271493212669683, "grad_norm": 0.988876226654389, "learning_rate": 1.3857917915628516e-06, "loss": 0.0459, "step": 2510 }, { "epoch": 2.272398190045249, "grad_norm": 0.6157715442262321, "learning_rate": 1.3825190807655692e-06, "loss": 0.0301, "step": 2511 }, { "epoch": 2.2733031674208144, "grad_norm": 0.8324261217519804, "learning_rate": 1.3792496188873789e-06, "loss": 0.0374, "step": 2512 }, { "epoch": 2.27420814479638, "grad_norm": 0.8002062288010153, "learning_rate": 1.3759834088646413e-06, "loss": 0.0306, "step": 2513 }, { "epoch": 2.2751131221719456, "grad_norm": 1.1903546618144165, "learning_rate": 1.372720453630791e-06, "loss": 0.0603, "step": 2514 }, { "epoch": 2.276018099547511, "grad_norm": 0.8121803623597421, "learning_rate": 1.369460756116342e-06, "loss": 0.034, "step": 2515 }, { "epoch": 2.276923076923077, "grad_norm": 0.6489345310639646, "learning_rate": 1.366204319248885e-06, "loss": 0.0332, "step": 2516 }, { "epoch": 2.2778280542986424, "grad_norm": 0.7473456877527381, "learning_rate": 1.3629511459530758e-06, "loss": 0.0365, "step": 2517 }, { "epoch": 2.278733031674208, "grad_norm": 0.9225186667395353, "learning_rate": 1.359701239150646e-06, "loss": 0.0508, "step": 2518 }, { "epoch": 2.2796380090497737, "grad_norm": 0.9392059896380985, "learning_rate": 1.3564546017603874e-06, "loss": 0.0392, "step": 2519 }, { "epoch": 2.2805429864253393, "grad_norm": 0.7526828466737088, "learning_rate": 1.3532112366981598e-06, "loss": 0.0347, "step": 2520 }, { "epoch": 2.281447963800905, "grad_norm": 0.8191343592959013, "learning_rate": 1.3499711468768838e-06, "loss": 0.0357, "step": 2521 }, { "epoch": 2.2823529411764705, "grad_norm": 0.6643619347071402, "learning_rate": 1.3467343352065349e-06, "loss": 0.0315, "step": 2522 }, { "epoch": 2.283257918552036, "grad_norm": 1.0904822558417528, "learning_rate": 1.3435008045941484e-06, "loss": 0.0709, "step": 2523 }, { "epoch": 2.2841628959276017, "grad_norm": 0.6491593452249732, "learning_rate": 1.3402705579438125e-06, "loss": 0.0238, "step": 2524 }, { "epoch": 2.2850678733031673, "grad_norm": 0.9569887558925444, "learning_rate": 1.3370435981566622e-06, "loss": 0.0559, "step": 2525 }, { "epoch": 2.285972850678733, "grad_norm": 1.1335716785096597, "learning_rate": 1.3338199281308856e-06, "loss": 0.0557, "step": 2526 }, { "epoch": 2.2868778280542985, "grad_norm": 0.7344599611952336, "learning_rate": 1.3305995507617114e-06, "loss": 0.031, "step": 2527 }, { "epoch": 2.287782805429864, "grad_norm": 0.9231394333089422, "learning_rate": 1.3273824689414144e-06, "loss": 0.0504, "step": 2528 }, { "epoch": 2.2886877828054297, "grad_norm": 0.5891535447983963, "learning_rate": 1.3241686855593107e-06, "loss": 0.0269, "step": 2529 }, { "epoch": 2.2895927601809953, "grad_norm": 0.9458377847993373, "learning_rate": 1.3209582035017487e-06, "loss": 0.051, "step": 2530 }, { "epoch": 2.290497737556561, "grad_norm": 0.9039872054963027, "learning_rate": 1.3177510256521176e-06, "loss": 0.0472, "step": 2531 }, { "epoch": 2.2914027149321265, "grad_norm": 1.154081789668472, "learning_rate": 1.3145471548908345e-06, "loss": 0.0556, "step": 2532 }, { "epoch": 2.292307692307692, "grad_norm": 0.9143230462070668, "learning_rate": 1.3113465940953495e-06, "loss": 0.0405, "step": 2533 }, { "epoch": 2.2932126696832578, "grad_norm": 0.7808008485286221, "learning_rate": 1.3081493461401406e-06, "loss": 0.0334, "step": 2534 }, { "epoch": 2.2941176470588234, "grad_norm": 0.9316328141713702, "learning_rate": 1.3049554138967052e-06, "loss": 0.0444, "step": 2535 }, { "epoch": 2.295022624434389, "grad_norm": 0.7074244787538267, "learning_rate": 1.3017648002335687e-06, "loss": 0.0367, "step": 2536 }, { "epoch": 2.2959276018099546, "grad_norm": 0.8687945281357811, "learning_rate": 1.2985775080162737e-06, "loss": 0.0378, "step": 2537 }, { "epoch": 2.29683257918552, "grad_norm": 0.7702343021700191, "learning_rate": 1.2953935401073786e-06, "loss": 0.0353, "step": 2538 }, { "epoch": 2.297737556561086, "grad_norm": 0.7685575675981646, "learning_rate": 1.292212899366458e-06, "loss": 0.0351, "step": 2539 }, { "epoch": 2.2986425339366514, "grad_norm": 0.8037726606213045, "learning_rate": 1.2890355886500971e-06, "loss": 0.0352, "step": 2540 }, { "epoch": 2.299547511312217, "grad_norm": 0.859575242669245, "learning_rate": 1.2858616108118933e-06, "loss": 0.039, "step": 2541 }, { "epoch": 2.3004524886877826, "grad_norm": 1.2476377194651833, "learning_rate": 1.2826909687024453e-06, "loss": 0.0646, "step": 2542 }, { "epoch": 2.3013574660633482, "grad_norm": 0.922075316934062, "learning_rate": 1.2795236651693593e-06, "loss": 0.0424, "step": 2543 }, { "epoch": 2.302262443438914, "grad_norm": 0.8381403427794504, "learning_rate": 1.276359703057245e-06, "loss": 0.0605, "step": 2544 }, { "epoch": 2.3031674208144794, "grad_norm": 0.7311041280553966, "learning_rate": 1.273199085207706e-06, "loss": 0.042, "step": 2545 }, { "epoch": 2.304072398190045, "grad_norm": 0.9209955371747572, "learning_rate": 1.2700418144593478e-06, "loss": 0.0528, "step": 2546 }, { "epoch": 2.3049773755656107, "grad_norm": 0.6403580442559484, "learning_rate": 1.2668878936477641e-06, "loss": 0.0246, "step": 2547 }, { "epoch": 2.3058823529411763, "grad_norm": 1.1482619982028717, "learning_rate": 1.2637373256055445e-06, "loss": 0.0621, "step": 2548 }, { "epoch": 2.306787330316742, "grad_norm": 0.7932206014694385, "learning_rate": 1.2605901131622677e-06, "loss": 0.0398, "step": 2549 }, { "epoch": 2.3076923076923075, "grad_norm": 1.936476867908876, "learning_rate": 1.257446259144494e-06, "loss": 0.0717, "step": 2550 }, { "epoch": 2.308597285067873, "grad_norm": 0.6852013826211945, "learning_rate": 1.2543057663757724e-06, "loss": 0.0264, "step": 2551 }, { "epoch": 2.3095022624434387, "grad_norm": 0.7372645571722535, "learning_rate": 1.2511686376766313e-06, "loss": 0.0293, "step": 2552 }, { "epoch": 2.3104072398190043, "grad_norm": 1.277661963190135, "learning_rate": 1.248034875864575e-06, "loss": 0.0778, "step": 2553 }, { "epoch": 2.31131221719457, "grad_norm": 0.698082630968149, "learning_rate": 1.2449044837540901e-06, "loss": 0.0282, "step": 2554 }, { "epoch": 2.3122171945701355, "grad_norm": 0.7797917874536524, "learning_rate": 1.2417774641566298e-06, "loss": 0.0357, "step": 2555 }, { "epoch": 2.313122171945701, "grad_norm": 0.856686694543815, "learning_rate": 1.238653819880623e-06, "loss": 0.0381, "step": 2556 }, { "epoch": 2.3140271493212667, "grad_norm": 0.9466748524148197, "learning_rate": 1.2355335537314683e-06, "loss": 0.0488, "step": 2557 }, { "epoch": 2.3149321266968323, "grad_norm": 1.5535769726210984, "learning_rate": 1.2324166685115246e-06, "loss": 0.0765, "step": 2558 }, { "epoch": 2.315837104072398, "grad_norm": 1.1197608587889198, "learning_rate": 1.2293031670201206e-06, "loss": 0.0663, "step": 2559 }, { "epoch": 2.3167420814479636, "grad_norm": 0.9922621831401446, "learning_rate": 1.2261930520535403e-06, "loss": 0.0467, "step": 2560 }, { "epoch": 2.317647058823529, "grad_norm": 0.9725179071382128, "learning_rate": 1.2230863264050308e-06, "loss": 0.0502, "step": 2561 }, { "epoch": 2.318552036199095, "grad_norm": 0.7510314014632642, "learning_rate": 1.219982992864795e-06, "loss": 0.0331, "step": 2562 }, { "epoch": 2.3194570135746604, "grad_norm": 0.7868622122906311, "learning_rate": 1.2168830542199839e-06, "loss": 0.0336, "step": 2563 }, { "epoch": 2.3203619909502264, "grad_norm": 0.7979275710716222, "learning_rate": 1.213786513254706e-06, "loss": 0.0399, "step": 2564 }, { "epoch": 2.321266968325792, "grad_norm": 0.9655603911000464, "learning_rate": 1.210693372750017e-06, "loss": 0.0367, "step": 2565 }, { "epoch": 2.3221719457013577, "grad_norm": 0.962185311338082, "learning_rate": 1.2076036354839131e-06, "loss": 0.0461, "step": 2566 }, { "epoch": 2.3230769230769233, "grad_norm": 1.0031787043281364, "learning_rate": 1.2045173042313429e-06, "loss": 0.0444, "step": 2567 }, { "epoch": 2.323981900452489, "grad_norm": 0.7385929666266231, "learning_rate": 1.2014343817641871e-06, "loss": 0.0375, "step": 2568 }, { "epoch": 2.3248868778280545, "grad_norm": 0.7261802892742593, "learning_rate": 1.1983548708512705e-06, "loss": 0.0324, "step": 2569 }, { "epoch": 2.32579185520362, "grad_norm": 0.7951114922078603, "learning_rate": 1.1952787742583549e-06, "loss": 0.0382, "step": 2570 }, { "epoch": 2.3266968325791857, "grad_norm": 0.8610452574192168, "learning_rate": 1.192206094748129e-06, "loss": 0.0374, "step": 2571 }, { "epoch": 2.3276018099547513, "grad_norm": 0.6271535398371755, "learning_rate": 1.1891368350802207e-06, "loss": 0.0284, "step": 2572 }, { "epoch": 2.328506787330317, "grad_norm": 0.6379141534246953, "learning_rate": 1.1860709980111796e-06, "loss": 0.0248, "step": 2573 }, { "epoch": 2.3294117647058825, "grad_norm": 1.207988856808245, "learning_rate": 1.1830085862944851e-06, "loss": 0.0927, "step": 2574 }, { "epoch": 2.330316742081448, "grad_norm": 0.7521327634132245, "learning_rate": 1.1799496026805413e-06, "loss": 0.0465, "step": 2575 }, { "epoch": 2.3312217194570137, "grad_norm": 0.6277191711207167, "learning_rate": 1.1768940499166692e-06, "loss": 0.0317, "step": 2576 }, { "epoch": 2.3321266968325793, "grad_norm": 0.8246713612702228, "learning_rate": 1.173841930747114e-06, "loss": 0.038, "step": 2577 }, { "epoch": 2.333031674208145, "grad_norm": 0.7383565535958059, "learning_rate": 1.1707932479130302e-06, "loss": 0.0351, "step": 2578 }, { "epoch": 2.3339366515837106, "grad_norm": 1.05561384546989, "learning_rate": 1.1677480041524919e-06, "loss": 0.042, "step": 2579 }, { "epoch": 2.334841628959276, "grad_norm": 0.6792092697603586, "learning_rate": 1.1647062022004845e-06, "loss": 0.0232, "step": 2580 }, { "epoch": 2.3357466063348418, "grad_norm": 0.9826763937336815, "learning_rate": 1.1616678447888973e-06, "loss": 0.0534, "step": 2581 }, { "epoch": 2.3366515837104074, "grad_norm": 0.8743993421380839, "learning_rate": 1.1586329346465303e-06, "loss": 0.0414, "step": 2582 }, { "epoch": 2.337556561085973, "grad_norm": 1.064303591093418, "learning_rate": 1.1556014744990873e-06, "loss": 0.0647, "step": 2583 }, { "epoch": 2.3384615384615386, "grad_norm": 1.3280695952309112, "learning_rate": 1.1525734670691702e-06, "loss": 0.0384, "step": 2584 }, { "epoch": 2.339366515837104, "grad_norm": 0.6359372653791865, "learning_rate": 1.1495489150762851e-06, "loss": 0.0244, "step": 2585 }, { "epoch": 2.34027149321267, "grad_norm": 0.8537991644435163, "learning_rate": 1.1465278212368287e-06, "loss": 0.0393, "step": 2586 }, { "epoch": 2.3411764705882354, "grad_norm": 0.8908538444486502, "learning_rate": 1.1435101882640964e-06, "loss": 0.0327, "step": 2587 }, { "epoch": 2.342081447963801, "grad_norm": 0.8228098040798479, "learning_rate": 1.1404960188682762e-06, "loss": 0.0344, "step": 2588 }, { "epoch": 2.3429864253393666, "grad_norm": 0.9649542348288105, "learning_rate": 1.137485315756439e-06, "loss": 0.0487, "step": 2589 }, { "epoch": 2.3438914027149322, "grad_norm": 0.7971523270159585, "learning_rate": 1.1344780816325512e-06, "loss": 0.0332, "step": 2590 }, { "epoch": 2.344796380090498, "grad_norm": 0.7449062196334657, "learning_rate": 1.1314743191974552e-06, "loss": 0.0359, "step": 2591 }, { "epoch": 2.3457013574660635, "grad_norm": 0.7804699069872082, "learning_rate": 1.1284740311488813e-06, "loss": 0.0391, "step": 2592 }, { "epoch": 2.346606334841629, "grad_norm": 0.8269317640815249, "learning_rate": 1.1254772201814385e-06, "loss": 0.0432, "step": 2593 }, { "epoch": 2.3475113122171947, "grad_norm": 1.0727241247406636, "learning_rate": 1.1224838889866096e-06, "loss": 0.052, "step": 2594 }, { "epoch": 2.3484162895927603, "grad_norm": 0.6168155730199024, "learning_rate": 1.1194940402527566e-06, "loss": 0.0275, "step": 2595 }, { "epoch": 2.349321266968326, "grad_norm": 1.0779104924714764, "learning_rate": 1.1165076766651118e-06, "loss": 0.0478, "step": 2596 }, { "epoch": 2.3502262443438915, "grad_norm": 0.9103962874468873, "learning_rate": 1.1135248009057753e-06, "loss": 0.037, "step": 2597 }, { "epoch": 2.351131221719457, "grad_norm": 0.7000543016603261, "learning_rate": 1.1105454156537204e-06, "loss": 0.0268, "step": 2598 }, { "epoch": 2.3520361990950227, "grad_norm": 1.140103405765445, "learning_rate": 1.1075695235847783e-06, "loss": 0.0407, "step": 2599 }, { "epoch": 2.3529411764705883, "grad_norm": 1.6547991189169198, "learning_rate": 1.1045971273716476e-06, "loss": 0.0534, "step": 2600 }, { "epoch": 2.353846153846154, "grad_norm": 0.7384560671054595, "learning_rate": 1.1016282296838887e-06, "loss": 0.0371, "step": 2601 }, { "epoch": 2.3547511312217195, "grad_norm": 0.928905776467423, "learning_rate": 1.0986628331879146e-06, "loss": 0.0452, "step": 2602 }, { "epoch": 2.355656108597285, "grad_norm": 1.226762828734155, "learning_rate": 1.0957009405469982e-06, "loss": 0.0536, "step": 2603 }, { "epoch": 2.3565610859728507, "grad_norm": 0.9172840233109727, "learning_rate": 1.0927425544212622e-06, "loss": 0.0426, "step": 2604 }, { "epoch": 2.3574660633484164, "grad_norm": 0.6211216592803511, "learning_rate": 1.089787677467683e-06, "loss": 0.0225, "step": 2605 }, { "epoch": 2.358371040723982, "grad_norm": 1.4690577958687072, "learning_rate": 1.086836312340086e-06, "loss": 0.0969, "step": 2606 }, { "epoch": 2.3592760180995476, "grad_norm": 1.2154110985508093, "learning_rate": 1.083888461689137e-06, "loss": 0.0474, "step": 2607 }, { "epoch": 2.360180995475113, "grad_norm": 1.8499229525226473, "learning_rate": 1.0809441281623517e-06, "loss": 0.0603, "step": 2608 }, { "epoch": 2.361085972850679, "grad_norm": 1.2311661867734651, "learning_rate": 1.0780033144040847e-06, "loss": 0.0682, "step": 2609 }, { "epoch": 2.3619909502262444, "grad_norm": 0.9461762087171546, "learning_rate": 1.075066023055527e-06, "loss": 0.0472, "step": 2610 }, { "epoch": 2.36289592760181, "grad_norm": 0.8110088120058028, "learning_rate": 1.072132256754711e-06, "loss": 0.0388, "step": 2611 }, { "epoch": 2.3638009049773756, "grad_norm": 1.0199710873505767, "learning_rate": 1.0692020181364976e-06, "loss": 0.0481, "step": 2612 }, { "epoch": 2.364705882352941, "grad_norm": 0.9679603937563467, "learning_rate": 1.066275309832584e-06, "loss": 0.0583, "step": 2613 }, { "epoch": 2.365610859728507, "grad_norm": 0.7232323540361846, "learning_rate": 1.0633521344714959e-06, "loss": 0.0315, "step": 2614 }, { "epoch": 2.3665158371040724, "grad_norm": 0.9776196548274518, "learning_rate": 1.0604324946785826e-06, "loss": 0.0479, "step": 2615 }, { "epoch": 2.367420814479638, "grad_norm": 0.8057071040162692, "learning_rate": 1.0575163930760234e-06, "loss": 0.0383, "step": 2616 }, { "epoch": 2.3683257918552036, "grad_norm": 0.9780582039528894, "learning_rate": 1.0546038322828145e-06, "loss": 0.0684, "step": 2617 }, { "epoch": 2.3692307692307693, "grad_norm": 0.9990250679077866, "learning_rate": 1.0516948149147755e-06, "loss": 0.0454, "step": 2618 }, { "epoch": 2.370135746606335, "grad_norm": 1.0180586128601705, "learning_rate": 1.0487893435845449e-06, "loss": 0.0581, "step": 2619 }, { "epoch": 2.3710407239819005, "grad_norm": 0.9216771531949585, "learning_rate": 1.0458874209015708e-06, "loss": 0.0451, "step": 2620 }, { "epoch": 2.371945701357466, "grad_norm": 1.0301513154836799, "learning_rate": 1.0429890494721207e-06, "loss": 0.0571, "step": 2621 }, { "epoch": 2.3728506787330317, "grad_norm": 0.8305572472553229, "learning_rate": 1.0400942318992669e-06, "loss": 0.0398, "step": 2622 }, { "epoch": 2.3737556561085973, "grad_norm": 1.0129957662920013, "learning_rate": 1.037202970782894e-06, "loss": 0.046, "step": 2623 }, { "epoch": 2.374660633484163, "grad_norm": 0.6385402099529031, "learning_rate": 1.0343152687196929e-06, "loss": 0.0302, "step": 2624 }, { "epoch": 2.3755656108597285, "grad_norm": 1.009705325861011, "learning_rate": 1.0314311283031531e-06, "loss": 0.0504, "step": 2625 }, { "epoch": 2.376470588235294, "grad_norm": 0.974891305325137, "learning_rate": 1.02855055212357e-06, "loss": 0.0452, "step": 2626 }, { "epoch": 2.3773755656108597, "grad_norm": 0.8825939709488355, "learning_rate": 1.0256735427680375e-06, "loss": 0.0304, "step": 2627 }, { "epoch": 2.3782805429864253, "grad_norm": 0.8161615316084225, "learning_rate": 1.0228001028204426e-06, "loss": 0.0347, "step": 2628 }, { "epoch": 2.379185520361991, "grad_norm": 0.5938326752880472, "learning_rate": 1.019930234861472e-06, "loss": 0.0255, "step": 2629 }, { "epoch": 2.3800904977375565, "grad_norm": 2.245580038320779, "learning_rate": 1.0170639414685985e-06, "loss": 0.0453, "step": 2630 }, { "epoch": 2.380995475113122, "grad_norm": 0.9297992814800051, "learning_rate": 1.0142012252160877e-06, "loss": 0.0504, "step": 2631 }, { "epoch": 2.3819004524886878, "grad_norm": 0.923344156697915, "learning_rate": 1.0113420886749948e-06, "loss": 0.0509, "step": 2632 }, { "epoch": 2.3828054298642534, "grad_norm": 0.7477164143178139, "learning_rate": 1.008486534413154e-06, "loss": 0.05, "step": 2633 }, { "epoch": 2.383710407239819, "grad_norm": 0.8288204045356838, "learning_rate": 1.005634564995188e-06, "loss": 0.0424, "step": 2634 }, { "epoch": 2.3846153846153846, "grad_norm": 0.675811458117462, "learning_rate": 1.0027861829824953e-06, "loss": 0.0327, "step": 2635 }, { "epoch": 2.38552036199095, "grad_norm": 0.8579898154818211, "learning_rate": 9.999413909332556e-07, "loss": 0.0374, "step": 2636 }, { "epoch": 2.386425339366516, "grad_norm": 0.9664137843815112, "learning_rate": 9.971001914024248e-07, "loss": 0.0559, "step": 2637 }, { "epoch": 2.3873303167420814, "grad_norm": 0.7951913034122884, "learning_rate": 9.942625869417282e-07, "loss": 0.037, "step": 2638 }, { "epoch": 2.388235294117647, "grad_norm": 0.9974978915770845, "learning_rate": 9.91428580099667e-07, "loss": 0.0449, "step": 2639 }, { "epoch": 2.3891402714932126, "grad_norm": 0.6593742187711864, "learning_rate": 9.885981734215094e-07, "loss": 0.0303, "step": 2640 }, { "epoch": 2.3900452488687782, "grad_norm": 2.7128765286591827, "learning_rate": 9.857713694492887e-07, "loss": 0.1194, "step": 2641 }, { "epoch": 2.390950226244344, "grad_norm": 0.9082431472217841, "learning_rate": 9.829481707218046e-07, "loss": 0.0513, "step": 2642 }, { "epoch": 2.3918552036199094, "grad_norm": 0.7988671271521771, "learning_rate": 9.80128579774619e-07, "loss": 0.0349, "step": 2643 }, { "epoch": 2.392760180995475, "grad_norm": 0.6991367229146327, "learning_rate": 9.77312599140054e-07, "loss": 0.0312, "step": 2644 }, { "epoch": 2.3936651583710407, "grad_norm": 0.6318808523779031, "learning_rate": 9.745002313471847e-07, "loss": 0.0267, "step": 2645 }, { "epoch": 2.3945701357466063, "grad_norm": 1.110708743387884, "learning_rate": 9.716914789218469e-07, "loss": 0.0471, "step": 2646 }, { "epoch": 2.395475113122172, "grad_norm": 0.7117816923249081, "learning_rate": 9.688863443866286e-07, "loss": 0.0316, "step": 2647 }, { "epoch": 2.3963800904977375, "grad_norm": 0.7811554279125077, "learning_rate": 9.660848302608643e-07, "loss": 0.0343, "step": 2648 }, { "epoch": 2.397285067873303, "grad_norm": 0.8552165505954159, "learning_rate": 9.63286939060643e-07, "loss": 0.0364, "step": 2649 }, { "epoch": 2.3981900452488687, "grad_norm": 0.7701572705877309, "learning_rate": 9.60492673298794e-07, "loss": 0.0414, "step": 2650 }, { "epoch": 2.3990950226244343, "grad_norm": 1.5007250330523978, "learning_rate": 9.577020354848942e-07, "loss": 0.0489, "step": 2651 }, { "epoch": 2.4, "grad_norm": 1.1086279715687422, "learning_rate": 9.549150281252633e-07, "loss": 0.0498, "step": 2652 }, { "epoch": 2.4009049773755655, "grad_norm": 0.9747506097415293, "learning_rate": 9.521316537229552e-07, "loss": 0.0439, "step": 2653 }, { "epoch": 2.401809954751131, "grad_norm": 1.173386975567951, "learning_rate": 9.493519147777663e-07, "loss": 0.0449, "step": 2654 }, { "epoch": 2.4027149321266967, "grad_norm": 0.8197206163982492, "learning_rate": 9.465758137862264e-07, "loss": 0.0434, "step": 2655 }, { "epoch": 2.4036199095022623, "grad_norm": 0.681803933538416, "learning_rate": 9.438033532415947e-07, "loss": 0.0292, "step": 2656 }, { "epoch": 2.404524886877828, "grad_norm": 0.9344040505333044, "learning_rate": 9.410345356338674e-07, "loss": 0.0484, "step": 2657 }, { "epoch": 2.4054298642533936, "grad_norm": 0.9758687257026147, "learning_rate": 9.382693634497609e-07, "loss": 0.06, "step": 2658 }, { "epoch": 2.406334841628959, "grad_norm": 0.8834579959184313, "learning_rate": 9.355078391727246e-07, "loss": 0.038, "step": 2659 }, { "epoch": 2.4072398190045248, "grad_norm": 0.9057341932791929, "learning_rate": 9.327499652829292e-07, "loss": 0.0486, "step": 2660 }, { "epoch": 2.4081447963800904, "grad_norm": 0.8076526341014191, "learning_rate": 9.299957442572644e-07, "loss": 0.0336, "step": 2661 }, { "epoch": 2.409049773755656, "grad_norm": 1.0214423082845376, "learning_rate": 9.272451785693438e-07, "loss": 0.0528, "step": 2662 }, { "epoch": 2.4099547511312216, "grad_norm": 0.9553041910586839, "learning_rate": 9.244982706894928e-07, "loss": 0.0515, "step": 2663 }, { "epoch": 2.410859728506787, "grad_norm": 0.9144519416685438, "learning_rate": 9.217550230847566e-07, "loss": 0.0691, "step": 2664 }, { "epoch": 2.411764705882353, "grad_norm": 0.7897708909880325, "learning_rate": 9.190154382188921e-07, "loss": 0.0381, "step": 2665 }, { "epoch": 2.4126696832579184, "grad_norm": 1.0994281187283934, "learning_rate": 9.162795185523621e-07, "loss": 0.0692, "step": 2666 }, { "epoch": 2.413574660633484, "grad_norm": 0.826855477503333, "learning_rate": 9.135472665423434e-07, "loss": 0.0347, "step": 2667 }, { "epoch": 2.4144796380090496, "grad_norm": 0.6766821789200015, "learning_rate": 9.10818684642717e-07, "loss": 0.0286, "step": 2668 }, { "epoch": 2.4153846153846152, "grad_norm": 0.73284826621376, "learning_rate": 9.080937753040647e-07, "loss": 0.0276, "step": 2669 }, { "epoch": 2.416289592760181, "grad_norm": 1.0784535044298005, "learning_rate": 9.053725409736752e-07, "loss": 0.0646, "step": 2670 }, { "epoch": 2.4171945701357465, "grad_norm": 1.012644612227263, "learning_rate": 9.026549840955312e-07, "loss": 0.0493, "step": 2671 }, { "epoch": 2.418099547511312, "grad_norm": 0.7059967916383321, "learning_rate": 8.999411071103165e-07, "loss": 0.0274, "step": 2672 }, { "epoch": 2.4190045248868777, "grad_norm": 0.8119711085606054, "learning_rate": 8.972309124554102e-07, "loss": 0.042, "step": 2673 }, { "epoch": 2.4199095022624433, "grad_norm": 0.7063903128011623, "learning_rate": 8.945244025648792e-07, "loss": 0.0312, "step": 2674 }, { "epoch": 2.420814479638009, "grad_norm": 0.8449639040490272, "learning_rate": 8.918215798694879e-07, "loss": 0.0359, "step": 2675 }, { "epoch": 2.4217194570135745, "grad_norm": 1.1369852221249508, "learning_rate": 8.891224467966824e-07, "loss": 0.0456, "step": 2676 }, { "epoch": 2.42262443438914, "grad_norm": 0.8756784866346076, "learning_rate": 8.86427005770601e-07, "loss": 0.0405, "step": 2677 }, { "epoch": 2.4235294117647057, "grad_norm": 0.8140288417762599, "learning_rate": 8.837352592120646e-07, "loss": 0.0373, "step": 2678 }, { "epoch": 2.4244343891402713, "grad_norm": 0.6509710777028266, "learning_rate": 8.810472095385713e-07, "loss": 0.0267, "step": 2679 }, { "epoch": 2.425339366515837, "grad_norm": 0.8987787854022495, "learning_rate": 8.783628591643056e-07, "loss": 0.0348, "step": 2680 }, { "epoch": 2.4262443438914025, "grad_norm": 0.7472394661797312, "learning_rate": 8.756822105001267e-07, "loss": 0.0334, "step": 2681 }, { "epoch": 2.427149321266968, "grad_norm": 0.7458989379509303, "learning_rate": 8.730052659535677e-07, "loss": 0.0273, "step": 2682 }, { "epoch": 2.4280542986425337, "grad_norm": 0.703186053544088, "learning_rate": 8.70332027928838e-07, "loss": 0.0378, "step": 2683 }, { "epoch": 2.4289592760180994, "grad_norm": 0.7047884295507241, "learning_rate": 8.676624988268151e-07, "loss": 0.0307, "step": 2684 }, { "epoch": 2.4298642533936654, "grad_norm": 0.7410163360615714, "learning_rate": 8.649966810450472e-07, "loss": 0.0377, "step": 2685 }, { "epoch": 2.430769230769231, "grad_norm": 0.7355439568506587, "learning_rate": 8.623345769777514e-07, "loss": 0.0366, "step": 2686 }, { "epoch": 2.4316742081447966, "grad_norm": 0.8098328328738347, "learning_rate": 8.596761890158045e-07, "loss": 0.0378, "step": 2687 }, { "epoch": 2.4325791855203622, "grad_norm": 1.1703124216412213, "learning_rate": 8.570215195467502e-07, "loss": 0.0546, "step": 2688 }, { "epoch": 2.433484162895928, "grad_norm": 0.7541901286952175, "learning_rate": 8.543705709547889e-07, "loss": 0.0292, "step": 2689 }, { "epoch": 2.4343891402714934, "grad_norm": 0.5955756724446686, "learning_rate": 8.517233456207819e-07, "loss": 0.0291, "step": 2690 }, { "epoch": 2.435294117647059, "grad_norm": 0.8158834845448169, "learning_rate": 8.490798459222477e-07, "loss": 0.0391, "step": 2691 }, { "epoch": 2.4361990950226247, "grad_norm": 0.9372686990049449, "learning_rate": 8.464400742333534e-07, "loss": 0.0409, "step": 2692 }, { "epoch": 2.4371040723981903, "grad_norm": 0.7298729079596984, "learning_rate": 8.438040329249247e-07, "loss": 0.0336, "step": 2693 }, { "epoch": 2.438009049773756, "grad_norm": 0.932377939957442, "learning_rate": 8.411717243644313e-07, "loss": 0.0337, "step": 2694 }, { "epoch": 2.4389140271493215, "grad_norm": 0.7167184013870387, "learning_rate": 8.38543150915993e-07, "loss": 0.0448, "step": 2695 }, { "epoch": 2.439819004524887, "grad_norm": 0.7887849738309405, "learning_rate": 8.359183149403771e-07, "loss": 0.0407, "step": 2696 }, { "epoch": 2.4407239819004527, "grad_norm": 0.8089810820144382, "learning_rate": 8.332972187949889e-07, "loss": 0.0282, "step": 2697 }, { "epoch": 2.4416289592760183, "grad_norm": 1.0813970376475863, "learning_rate": 8.306798648338793e-07, "loss": 0.0429, "step": 2698 }, { "epoch": 2.442533936651584, "grad_norm": 0.6221166150332357, "learning_rate": 8.280662554077384e-07, "loss": 0.0275, "step": 2699 }, { "epoch": 2.4434389140271495, "grad_norm": 1.0198586092761088, "learning_rate": 8.254563928638892e-07, "loss": 0.0554, "step": 2700 }, { "epoch": 2.444343891402715, "grad_norm": 0.6797885628092073, "learning_rate": 8.228502795462945e-07, "loss": 0.0337, "step": 2701 }, { "epoch": 2.4452488687782807, "grad_norm": 0.9262815737822404, "learning_rate": 8.202479177955453e-07, "loss": 0.0336, "step": 2702 }, { "epoch": 2.4461538461538463, "grad_norm": 0.7743928595499517, "learning_rate": 8.176493099488664e-07, "loss": 0.0336, "step": 2703 }, { "epoch": 2.447058823529412, "grad_norm": 0.78095482967438, "learning_rate": 8.150544583401116e-07, "loss": 0.0427, "step": 2704 }, { "epoch": 2.4479638009049776, "grad_norm": 0.7556295984311291, "learning_rate": 8.124633652997571e-07, "loss": 0.04, "step": 2705 }, { "epoch": 2.448868778280543, "grad_norm": 0.7229507399813149, "learning_rate": 8.098760331549088e-07, "loss": 0.0413, "step": 2706 }, { "epoch": 2.4497737556561088, "grad_norm": 0.7110673930719824, "learning_rate": 8.072924642292906e-07, "loss": 0.0334, "step": 2707 }, { "epoch": 2.4506787330316744, "grad_norm": 0.7815565371766832, "learning_rate": 8.047126608432482e-07, "loss": 0.0339, "step": 2708 }, { "epoch": 2.45158371040724, "grad_norm": 0.743440110439971, "learning_rate": 8.021366253137469e-07, "loss": 0.0319, "step": 2709 }, { "epoch": 2.4524886877828056, "grad_norm": 0.7514409561616919, "learning_rate": 7.995643599543645e-07, "loss": 0.0403, "step": 2710 }, { "epoch": 2.453393665158371, "grad_norm": 0.6726277411846796, "learning_rate": 7.969958670752965e-07, "loss": 0.0254, "step": 2711 }, { "epoch": 2.454298642533937, "grad_norm": 0.9129066600154353, "learning_rate": 7.944311489833489e-07, "loss": 0.0393, "step": 2712 }, { "epoch": 2.4552036199095024, "grad_norm": 0.7419193655891949, "learning_rate": 7.918702079819351e-07, "loss": 0.0329, "step": 2713 }, { "epoch": 2.456108597285068, "grad_norm": 0.7580637522780499, "learning_rate": 7.893130463710808e-07, "loss": 0.0384, "step": 2714 }, { "epoch": 2.4570135746606336, "grad_norm": 0.7768235174224403, "learning_rate": 7.86759666447412e-07, "loss": 0.0341, "step": 2715 }, { "epoch": 2.4579185520361992, "grad_norm": 0.7673983422838445, "learning_rate": 7.842100705041633e-07, "loss": 0.0479, "step": 2716 }, { "epoch": 2.458823529411765, "grad_norm": 0.8287367659391337, "learning_rate": 7.816642608311692e-07, "loss": 0.0369, "step": 2717 }, { "epoch": 2.4597285067873305, "grad_norm": 1.2921347678236443, "learning_rate": 7.791222397148613e-07, "loss": 0.0724, "step": 2718 }, { "epoch": 2.460633484162896, "grad_norm": 0.7069352559379001, "learning_rate": 7.76584009438272e-07, "loss": 0.0292, "step": 2719 }, { "epoch": 2.4615384615384617, "grad_norm": 0.8009416626514949, "learning_rate": 7.740495722810271e-07, "loss": 0.0369, "step": 2720 }, { "epoch": 2.4624434389140273, "grad_norm": 0.6864491397179993, "learning_rate": 7.715189305193454e-07, "loss": 0.0318, "step": 2721 }, { "epoch": 2.463348416289593, "grad_norm": 1.714673775292681, "learning_rate": 7.689920864260408e-07, "loss": 0.0429, "step": 2722 }, { "epoch": 2.4642533936651585, "grad_norm": 1.0381882283048198, "learning_rate": 7.664690422705102e-07, "loss": 0.0404, "step": 2723 }, { "epoch": 2.465158371040724, "grad_norm": 1.054003976266776, "learning_rate": 7.639498003187418e-07, "loss": 0.0499, "step": 2724 }, { "epoch": 2.4660633484162897, "grad_norm": 0.6401473442786483, "learning_rate": 7.614343628333104e-07, "loss": 0.0228, "step": 2725 }, { "epoch": 2.4669683257918553, "grad_norm": 0.5892821873918196, "learning_rate": 7.589227320733694e-07, "loss": 0.0264, "step": 2726 }, { "epoch": 2.467873303167421, "grad_norm": 0.8008034683004936, "learning_rate": 7.564149102946572e-07, "loss": 0.0375, "step": 2727 }, { "epoch": 2.4687782805429865, "grad_norm": 0.7750938260682356, "learning_rate": 7.539108997494881e-07, "loss": 0.0321, "step": 2728 }, { "epoch": 2.469683257918552, "grad_norm": 0.9815653232691922, "learning_rate": 7.514107026867557e-07, "loss": 0.045, "step": 2729 }, { "epoch": 2.4705882352941178, "grad_norm": 0.6555031509052804, "learning_rate": 7.489143213519301e-07, "loss": 0.0291, "step": 2730 }, { "epoch": 2.4714932126696834, "grad_norm": 0.9295429616305909, "learning_rate": 7.464217579870497e-07, "loss": 0.0472, "step": 2731 }, { "epoch": 2.472398190045249, "grad_norm": 1.1668376337492756, "learning_rate": 7.439330148307283e-07, "loss": 0.0345, "step": 2732 }, { "epoch": 2.4733031674208146, "grad_norm": 0.8741228793599057, "learning_rate": 7.414480941181463e-07, "loss": 0.0462, "step": 2733 }, { "epoch": 2.47420814479638, "grad_norm": 0.7574479834233521, "learning_rate": 7.38966998081051e-07, "loss": 0.0379, "step": 2734 }, { "epoch": 2.475113122171946, "grad_norm": 0.6477011722775576, "learning_rate": 7.364897289477585e-07, "loss": 0.0295, "step": 2735 }, { "epoch": 2.4760180995475114, "grad_norm": 0.6090280196016717, "learning_rate": 7.340162889431418e-07, "loss": 0.0266, "step": 2736 }, { "epoch": 2.476923076923077, "grad_norm": 1.0590037779388906, "learning_rate": 7.315466802886401e-07, "loss": 0.0524, "step": 2737 }, { "epoch": 2.4778280542986426, "grad_norm": 1.1074412149830848, "learning_rate": 7.29080905202248e-07, "loss": 0.0505, "step": 2738 }, { "epoch": 2.478733031674208, "grad_norm": 0.7076277983219426, "learning_rate": 7.26618965898519e-07, "loss": 0.0276, "step": 2739 }, { "epoch": 2.479638009049774, "grad_norm": 0.7749443593302434, "learning_rate": 7.241608645885629e-07, "loss": 0.0355, "step": 2740 }, { "epoch": 2.4805429864253394, "grad_norm": 0.7870707468778443, "learning_rate": 7.217066034800385e-07, "loss": 0.0345, "step": 2741 }, { "epoch": 2.481447963800905, "grad_norm": 1.0404420006414459, "learning_rate": 7.19256184777159e-07, "loss": 0.0606, "step": 2742 }, { "epoch": 2.4823529411764707, "grad_norm": 0.6412809925123755, "learning_rate": 7.168096106806871e-07, "loss": 0.0251, "step": 2743 }, { "epoch": 2.4832579185520363, "grad_norm": 0.6921949234747162, "learning_rate": 7.143668833879292e-07, "loss": 0.0274, "step": 2744 }, { "epoch": 2.484162895927602, "grad_norm": 0.8629715989963025, "learning_rate": 7.119280050927407e-07, "loss": 0.043, "step": 2745 }, { "epoch": 2.4850678733031675, "grad_norm": 0.9624871836772174, "learning_rate": 7.094929779855148e-07, "loss": 0.0538, "step": 2746 }, { "epoch": 2.485972850678733, "grad_norm": 1.0338232523030735, "learning_rate": 7.070618042531946e-07, "loss": 0.0539, "step": 2747 }, { "epoch": 2.4868778280542987, "grad_norm": 0.9632173767854249, "learning_rate": 7.046344860792526e-07, "loss": 0.04, "step": 2748 }, { "epoch": 2.4877828054298643, "grad_norm": 1.0170075891140045, "learning_rate": 7.022110256437059e-07, "loss": 0.0408, "step": 2749 }, { "epoch": 2.48868778280543, "grad_norm": 0.7340682735125633, "learning_rate": 6.997914251231036e-07, "loss": 0.0363, "step": 2750 }, { "epoch": 2.4895927601809955, "grad_norm": 1.0452115818096708, "learning_rate": 6.973756866905279e-07, "loss": 0.0539, "step": 2751 }, { "epoch": 2.490497737556561, "grad_norm": 0.7403985029762598, "learning_rate": 6.949638125155949e-07, "loss": 0.0253, "step": 2752 }, { "epoch": 2.4914027149321267, "grad_norm": 1.1132190057313949, "learning_rate": 6.92555804764446e-07, "loss": 0.0481, "step": 2753 }, { "epoch": 2.4923076923076923, "grad_norm": 0.6238874530883983, "learning_rate": 6.901516655997536e-07, "loss": 0.0281, "step": 2754 }, { "epoch": 2.493212669683258, "grad_norm": 1.0661831968884503, "learning_rate": 6.87751397180716e-07, "loss": 0.0524, "step": 2755 }, { "epoch": 2.4941176470588236, "grad_norm": 0.8457592011740179, "learning_rate": 6.853550016630517e-07, "loss": 0.0498, "step": 2756 }, { "epoch": 2.495022624434389, "grad_norm": 0.7777874047184854, "learning_rate": 6.829624811990037e-07, "loss": 0.0399, "step": 2757 }, { "epoch": 2.4959276018099548, "grad_norm": 0.8787891015411515, "learning_rate": 6.80573837937335e-07, "loss": 0.0318, "step": 2758 }, { "epoch": 2.4968325791855204, "grad_norm": 1.696492579900664, "learning_rate": 6.781890740233233e-07, "loss": 0.055, "step": 2759 }, { "epoch": 2.497737556561086, "grad_norm": 0.9337818639325328, "learning_rate": 6.758081915987669e-07, "loss": 0.0536, "step": 2760 }, { "epoch": 2.4986425339366516, "grad_norm": 1.2486237941717513, "learning_rate": 6.734311928019726e-07, "loss": 0.0407, "step": 2761 }, { "epoch": 2.499547511312217, "grad_norm": 0.7148091025679155, "learning_rate": 6.710580797677635e-07, "loss": 0.0359, "step": 2762 }, { "epoch": 2.500452488687783, "grad_norm": 0.8773178985548671, "learning_rate": 6.686888546274728e-07, "loss": 0.0325, "step": 2763 }, { "epoch": 2.5013574660633484, "grad_norm": 0.7970136801568087, "learning_rate": 6.663235195089379e-07, "loss": 0.0359, "step": 2764 }, { "epoch": 2.502262443438914, "grad_norm": 0.8148534944837704, "learning_rate": 6.639620765365074e-07, "loss": 0.0403, "step": 2765 }, { "epoch": 2.5031674208144796, "grad_norm": 0.857477743432816, "learning_rate": 6.616045278310301e-07, "loss": 0.0361, "step": 2766 }, { "epoch": 2.5040723981900452, "grad_norm": 0.7088498582269318, "learning_rate": 6.592508755098603e-07, "loss": 0.0252, "step": 2767 }, { "epoch": 2.504977375565611, "grad_norm": 0.7574709210670998, "learning_rate": 6.569011216868531e-07, "loss": 0.03, "step": 2768 }, { "epoch": 2.5058823529411764, "grad_norm": 0.7819588763603988, "learning_rate": 6.545552684723583e-07, "loss": 0.0421, "step": 2769 }, { "epoch": 2.506787330316742, "grad_norm": 0.8176369577369139, "learning_rate": 6.522133179732271e-07, "loss": 0.0401, "step": 2770 }, { "epoch": 2.5076923076923077, "grad_norm": 0.8478245266463836, "learning_rate": 6.498752722928042e-07, "loss": 0.0458, "step": 2771 }, { "epoch": 2.5085972850678733, "grad_norm": 1.0748502604834715, "learning_rate": 6.475411335309246e-07, "loss": 0.0443, "step": 2772 }, { "epoch": 2.509502262443439, "grad_norm": 0.7446868463629692, "learning_rate": 6.452109037839199e-07, "loss": 0.0377, "step": 2773 }, { "epoch": 2.5104072398190045, "grad_norm": 1.1364068312883662, "learning_rate": 6.428845851446042e-07, "loss": 0.0344, "step": 2774 }, { "epoch": 2.51131221719457, "grad_norm": 0.6884076499497777, "learning_rate": 6.405621797022848e-07, "loss": 0.0312, "step": 2775 }, { "epoch": 2.5122171945701357, "grad_norm": 1.185907394947825, "learning_rate": 6.382436895427524e-07, "loss": 0.0468, "step": 2776 }, { "epoch": 2.5131221719457013, "grad_norm": 0.7631514698297553, "learning_rate": 6.359291167482795e-07, "loss": 0.0368, "step": 2777 }, { "epoch": 2.514027149321267, "grad_norm": 0.9134931491779233, "learning_rate": 6.336184633976239e-07, "loss": 0.0303, "step": 2778 }, { "epoch": 2.5149321266968325, "grad_norm": 0.8853935689180916, "learning_rate": 6.313117315660195e-07, "loss": 0.0409, "step": 2779 }, { "epoch": 2.515837104072398, "grad_norm": 0.6309720005972017, "learning_rate": 6.290089233251811e-07, "loss": 0.0258, "step": 2780 }, { "epoch": 2.5167420814479637, "grad_norm": 0.7312698404725453, "learning_rate": 6.267100407432996e-07, "loss": 0.0337, "step": 2781 }, { "epoch": 2.5176470588235293, "grad_norm": 0.709458899492248, "learning_rate": 6.244150858850368e-07, "loss": 0.0304, "step": 2782 }, { "epoch": 2.518552036199095, "grad_norm": 0.6937871300267655, "learning_rate": 6.221240608115309e-07, "loss": 0.0272, "step": 2783 }, { "epoch": 2.5194570135746606, "grad_norm": 0.8856581101793736, "learning_rate": 6.1983696758039e-07, "loss": 0.0402, "step": 2784 }, { "epoch": 2.520361990950226, "grad_norm": 1.0328601331587697, "learning_rate": 6.175538082456883e-07, "loss": 0.04, "step": 2785 }, { "epoch": 2.521266968325792, "grad_norm": 0.9378268033209113, "learning_rate": 6.152745848579706e-07, "loss": 0.0484, "step": 2786 }, { "epoch": 2.5221719457013574, "grad_norm": 0.8333256638539068, "learning_rate": 6.129992994642426e-07, "loss": 0.0375, "step": 2787 }, { "epoch": 2.523076923076923, "grad_norm": 0.6353844763559957, "learning_rate": 6.107279541079769e-07, "loss": 0.0255, "step": 2788 }, { "epoch": 2.5239819004524886, "grad_norm": 0.8600483434768252, "learning_rate": 6.084605508291075e-07, "loss": 0.0425, "step": 2789 }, { "epoch": 2.524886877828054, "grad_norm": 0.6048348715576986, "learning_rate": 6.061970916640236e-07, "loss": 0.0315, "step": 2790 }, { "epoch": 2.52579185520362, "grad_norm": 0.7360280479772784, "learning_rate": 6.039375786455781e-07, "loss": 0.0325, "step": 2791 }, { "epoch": 2.5266968325791854, "grad_norm": 0.7501996886985817, "learning_rate": 6.016820138030743e-07, "loss": 0.0339, "step": 2792 }, { "epoch": 2.527601809954751, "grad_norm": 0.6605565984782014, "learning_rate": 5.99430399162273e-07, "loss": 0.0278, "step": 2793 }, { "epoch": 2.5285067873303166, "grad_norm": 1.1416808189777683, "learning_rate": 5.97182736745387e-07, "loss": 0.0524, "step": 2794 }, { "epoch": 2.5294117647058822, "grad_norm": 0.6278746396225946, "learning_rate": 5.949390285710777e-07, "loss": 0.0267, "step": 2795 }, { "epoch": 2.530316742081448, "grad_norm": 0.8008625542505672, "learning_rate": 5.926992766544575e-07, "loss": 0.0461, "step": 2796 }, { "epoch": 2.5312217194570135, "grad_norm": 0.9384962110975563, "learning_rate": 5.904634830070832e-07, "loss": 0.0506, "step": 2797 }, { "epoch": 2.532126696832579, "grad_norm": 0.9883526156713017, "learning_rate": 5.882316496369584e-07, "loss": 0.053, "step": 2798 }, { "epoch": 2.5330316742081447, "grad_norm": 0.8174692426864809, "learning_rate": 5.860037785485301e-07, "loss": 0.0319, "step": 2799 }, { "epoch": 2.5339366515837103, "grad_norm": 1.3638045774813523, "learning_rate": 5.837798717426846e-07, "loss": 0.046, "step": 2800 }, { "epoch": 2.534841628959276, "grad_norm": 0.807586524776894, "learning_rate": 5.815599312167497e-07, "loss": 0.0341, "step": 2801 }, { "epoch": 2.5357466063348415, "grad_norm": 0.9510227015533964, "learning_rate": 5.793439589644917e-07, "loss": 0.0463, "step": 2802 }, { "epoch": 2.536651583710407, "grad_norm": 0.7487879855140138, "learning_rate": 5.771319569761097e-07, "loss": 0.0296, "step": 2803 }, { "epoch": 2.5375565610859727, "grad_norm": 0.8704129793111497, "learning_rate": 5.749239272382407e-07, "loss": 0.0434, "step": 2804 }, { "epoch": 2.5384615384615383, "grad_norm": 0.8031970626412861, "learning_rate": 5.727198717339511e-07, "loss": 0.0414, "step": 2805 }, { "epoch": 2.539366515837104, "grad_norm": 1.148181789926356, "learning_rate": 5.705197924427397e-07, "loss": 0.0479, "step": 2806 }, { "epoch": 2.5402714932126695, "grad_norm": 0.5055810144376174, "learning_rate": 5.683236913405354e-07, "loss": 0.0235, "step": 2807 }, { "epoch": 2.541176470588235, "grad_norm": 0.7891616644049865, "learning_rate": 5.661315703996905e-07, "loss": 0.0474, "step": 2808 }, { "epoch": 2.5420814479638008, "grad_norm": 0.6512885052722237, "learning_rate": 5.639434315889874e-07, "loss": 0.0332, "step": 2809 }, { "epoch": 2.5429864253393664, "grad_norm": 0.8777269601548613, "learning_rate": 5.617592768736269e-07, "loss": 0.0369, "step": 2810 }, { "epoch": 2.543891402714932, "grad_norm": 0.8552868398696908, "learning_rate": 5.595791082152352e-07, "loss": 0.0523, "step": 2811 }, { "epoch": 2.5447963800904976, "grad_norm": 0.9596655336717829, "learning_rate": 5.57402927571859e-07, "loss": 0.0572, "step": 2812 }, { "epoch": 2.545701357466063, "grad_norm": 1.0992127083141479, "learning_rate": 5.552307368979598e-07, "loss": 0.0512, "step": 2813 }, { "epoch": 2.546606334841629, "grad_norm": 0.62283907805787, "learning_rate": 5.530625381444183e-07, "loss": 0.026, "step": 2814 }, { "epoch": 2.5475113122171944, "grad_norm": 0.9506925336578668, "learning_rate": 5.508983332585316e-07, "loss": 0.0495, "step": 2815 }, { "epoch": 2.54841628959276, "grad_norm": 0.6213280327490673, "learning_rate": 5.487381241840045e-07, "loss": 0.0271, "step": 2816 }, { "epoch": 2.5493212669683256, "grad_norm": 0.8767628662936813, "learning_rate": 5.465819128609589e-07, "loss": 0.0394, "step": 2817 }, { "epoch": 2.5502262443438912, "grad_norm": 0.7454577271973711, "learning_rate": 5.444297012259215e-07, "loss": 0.0421, "step": 2818 }, { "epoch": 2.551131221719457, "grad_norm": 0.9441523062828199, "learning_rate": 5.422814912118302e-07, "loss": 0.046, "step": 2819 }, { "epoch": 2.5520361990950224, "grad_norm": 0.6599661833462379, "learning_rate": 5.401372847480285e-07, "loss": 0.0304, "step": 2820 }, { "epoch": 2.552941176470588, "grad_norm": 0.8258362066274993, "learning_rate": 5.379970837602611e-07, "loss": 0.0373, "step": 2821 }, { "epoch": 2.5538461538461537, "grad_norm": 0.853386715775507, "learning_rate": 5.358608901706802e-07, "loss": 0.0415, "step": 2822 }, { "epoch": 2.5547511312217193, "grad_norm": 0.5556701874261966, "learning_rate": 5.337287058978346e-07, "loss": 0.0261, "step": 2823 }, { "epoch": 2.555656108597285, "grad_norm": 0.8188893898039067, "learning_rate": 5.316005328566748e-07, "loss": 0.0298, "step": 2824 }, { "epoch": 2.5565610859728505, "grad_norm": 0.9072173312770728, "learning_rate": 5.294763729585484e-07, "loss": 0.0369, "step": 2825 }, { "epoch": 2.557466063348416, "grad_norm": 1.0071729479729161, "learning_rate": 5.273562281111972e-07, "loss": 0.0544, "step": 2826 }, { "epoch": 2.5583710407239817, "grad_norm": 0.8222686758481971, "learning_rate": 5.25240100218759e-07, "loss": 0.036, "step": 2827 }, { "epoch": 2.5592760180995473, "grad_norm": 0.978112693533028, "learning_rate": 5.231279911817632e-07, "loss": 0.0436, "step": 2828 }, { "epoch": 2.560180995475113, "grad_norm": 0.8392013949575247, "learning_rate": 5.210199028971291e-07, "loss": 0.0385, "step": 2829 }, { "epoch": 2.5610859728506785, "grad_norm": 0.7010574770758436, "learning_rate": 5.18915837258166e-07, "loss": 0.0284, "step": 2830 }, { "epoch": 2.561990950226244, "grad_norm": 0.8778938213964833, "learning_rate": 5.168157961545689e-07, "loss": 0.042, "step": 2831 }, { "epoch": 2.5628959276018097, "grad_norm": 0.6060442973996022, "learning_rate": 5.147197814724197e-07, "loss": 0.028, "step": 2832 }, { "epoch": 2.5638009049773753, "grad_norm": 0.7572998461966527, "learning_rate": 5.126277950941849e-07, "loss": 0.0364, "step": 2833 }, { "epoch": 2.564705882352941, "grad_norm": 0.5874140243476635, "learning_rate": 5.105398388987098e-07, "loss": 0.0276, "step": 2834 }, { "epoch": 2.5656108597285066, "grad_norm": 0.8671111524928647, "learning_rate": 5.084559147612244e-07, "loss": 0.0294, "step": 2835 }, { "epoch": 2.566515837104072, "grad_norm": 1.0080684823491075, "learning_rate": 5.063760245533328e-07, "loss": 0.0534, "step": 2836 }, { "epoch": 2.5674208144796378, "grad_norm": 0.7679074513512577, "learning_rate": 5.043001701430195e-07, "loss": 0.0288, "step": 2837 }, { "epoch": 2.5683257918552034, "grad_norm": 0.8465637186619729, "learning_rate": 5.022283533946448e-07, "loss": 0.0399, "step": 2838 }, { "epoch": 2.569230769230769, "grad_norm": 0.7831850001451033, "learning_rate": 5.001605761689399e-07, "loss": 0.0346, "step": 2839 }, { "epoch": 2.5701357466063346, "grad_norm": 0.7312456277956487, "learning_rate": 4.980968403230097e-07, "loss": 0.0241, "step": 2840 }, { "epoch": 2.5710407239819, "grad_norm": 0.9586934011142283, "learning_rate": 4.960371477103304e-07, "loss": 0.0446, "step": 2841 }, { "epoch": 2.571945701357466, "grad_norm": 0.8182313733646589, "learning_rate": 4.939815001807441e-07, "loss": 0.0285, "step": 2842 }, { "epoch": 2.5728506787330314, "grad_norm": 0.8956103362854947, "learning_rate": 4.919298995804634e-07, "loss": 0.0467, "step": 2843 }, { "epoch": 2.573755656108597, "grad_norm": 0.9457418002759556, "learning_rate": 4.898823477520625e-07, "loss": 0.0437, "step": 2844 }, { "epoch": 2.5746606334841626, "grad_norm": 0.8677464399822743, "learning_rate": 4.87838846534483e-07, "loss": 0.0469, "step": 2845 }, { "epoch": 2.5755656108597282, "grad_norm": 0.9655752265091393, "learning_rate": 4.857993977630271e-07, "loss": 0.0445, "step": 2846 }, { "epoch": 2.576470588235294, "grad_norm": 0.7724729190573112, "learning_rate": 4.837640032693558e-07, "loss": 0.0306, "step": 2847 }, { "epoch": 2.5773755656108595, "grad_norm": 0.8820281449357115, "learning_rate": 4.817326648814924e-07, "loss": 0.0433, "step": 2848 }, { "epoch": 2.5782805429864255, "grad_norm": 0.9136762574698185, "learning_rate": 4.79705384423812e-07, "loss": 0.0433, "step": 2849 }, { "epoch": 2.579185520361991, "grad_norm": 0.9446383679839098, "learning_rate": 4.776821637170525e-07, "loss": 0.0407, "step": 2850 }, { "epoch": 2.5800904977375567, "grad_norm": 1.3079568448243146, "learning_rate": 4.7566300457829894e-07, "loss": 0.0834, "step": 2851 }, { "epoch": 2.5809954751131223, "grad_norm": 0.878089828296684, "learning_rate": 4.7364790882099223e-07, "loss": 0.0375, "step": 2852 }, { "epoch": 2.581900452488688, "grad_norm": 0.9803491716663776, "learning_rate": 4.716368782549235e-07, "loss": 0.0361, "step": 2853 }, { "epoch": 2.5828054298642535, "grad_norm": 0.6621880398931814, "learning_rate": 4.6962991468623153e-07, "loss": 0.0313, "step": 2854 }, { "epoch": 2.583710407239819, "grad_norm": 0.9329502738839909, "learning_rate": 4.6762701991740434e-07, "loss": 0.0589, "step": 2855 }, { "epoch": 2.5846153846153848, "grad_norm": 1.127661192996403, "learning_rate": 4.6562819574727304e-07, "loss": 0.0651, "step": 2856 }, { "epoch": 2.5855203619909504, "grad_norm": 1.0123185560715633, "learning_rate": 4.6363344397101493e-07, "loss": 0.0465, "step": 2857 }, { "epoch": 2.586425339366516, "grad_norm": 0.96081473321691, "learning_rate": 4.616427663801515e-07, "loss": 0.0396, "step": 2858 }, { "epoch": 2.5873303167420816, "grad_norm": 0.7657502760340723, "learning_rate": 4.596561647625397e-07, "loss": 0.0348, "step": 2859 }, { "epoch": 2.588235294117647, "grad_norm": 0.8519597356576727, "learning_rate": 4.576736409023813e-07, "loss": 0.0314, "step": 2860 }, { "epoch": 2.589140271493213, "grad_norm": 0.5390455609717775, "learning_rate": 4.556951965802137e-07, "loss": 0.0236, "step": 2861 }, { "epoch": 2.5900452488687784, "grad_norm": 0.8176382709020114, "learning_rate": 4.5372083357290875e-07, "loss": 0.0385, "step": 2862 }, { "epoch": 2.590950226244344, "grad_norm": 6.761016920071442, "learning_rate": 4.517505536536759e-07, "loss": 0.1138, "step": 2863 }, { "epoch": 2.5918552036199096, "grad_norm": 0.8779820470774448, "learning_rate": 4.497843585920547e-07, "loss": 0.0322, "step": 2864 }, { "epoch": 2.5927601809954752, "grad_norm": 1.0527346336961774, "learning_rate": 4.4782225015391754e-07, "loss": 0.0424, "step": 2865 }, { "epoch": 2.593665158371041, "grad_norm": 0.6463881900326071, "learning_rate": 4.458642301014676e-07, "loss": 0.0288, "step": 2866 }, { "epoch": 2.5945701357466064, "grad_norm": 0.7798094129147664, "learning_rate": 4.4391030019323266e-07, "loss": 0.0363, "step": 2867 }, { "epoch": 2.595475113122172, "grad_norm": 0.5864174758984141, "learning_rate": 4.4196046218407175e-07, "loss": 0.0229, "step": 2868 }, { "epoch": 2.5963800904977377, "grad_norm": 0.7907549530649035, "learning_rate": 4.4001471782516345e-07, "loss": 0.0475, "step": 2869 }, { "epoch": 2.5972850678733033, "grad_norm": 0.7718303499273337, "learning_rate": 4.3807306886401555e-07, "loss": 0.0367, "step": 2870 }, { "epoch": 2.598190045248869, "grad_norm": 0.9685985977129347, "learning_rate": 4.361355170444537e-07, "loss": 0.0395, "step": 2871 }, { "epoch": 2.5990950226244345, "grad_norm": 0.8147098112452584, "learning_rate": 4.342020641066247e-07, "loss": 0.0324, "step": 2872 }, { "epoch": 2.6, "grad_norm": 0.8325147098820587, "learning_rate": 4.322727117869951e-07, "loss": 0.0343, "step": 2873 }, { "epoch": 2.6009049773755657, "grad_norm": 0.8610595970660633, "learning_rate": 4.3034746181834843e-07, "loss": 0.0443, "step": 2874 }, { "epoch": 2.6018099547511313, "grad_norm": 0.52386133192153, "learning_rate": 4.284263159297819e-07, "loss": 0.0257, "step": 2875 }, { "epoch": 2.602714932126697, "grad_norm": 0.922188422713026, "learning_rate": 4.2650927584670985e-07, "loss": 0.0432, "step": 2876 }, { "epoch": 2.6036199095022625, "grad_norm": 0.8999756867909663, "learning_rate": 4.2459634329085566e-07, "loss": 0.0522, "step": 2877 }, { "epoch": 2.604524886877828, "grad_norm": 0.8997560542018134, "learning_rate": 4.22687519980256e-07, "loss": 0.0422, "step": 2878 }, { "epoch": 2.6054298642533937, "grad_norm": 0.6851393699666478, "learning_rate": 4.207828076292586e-07, "loss": 0.0269, "step": 2879 }, { "epoch": 2.6063348416289593, "grad_norm": 0.9504084310707824, "learning_rate": 4.1888220794851386e-07, "loss": 0.0491, "step": 2880 }, { "epoch": 2.607239819004525, "grad_norm": 0.9815582864178244, "learning_rate": 4.1698572264498417e-07, "loss": 0.0617, "step": 2881 }, { "epoch": 2.6081447963800906, "grad_norm": 0.7193347966279237, "learning_rate": 4.1509335342193144e-07, "loss": 0.0353, "step": 2882 }, { "epoch": 2.609049773755656, "grad_norm": 0.654387445666886, "learning_rate": 4.132051019789263e-07, "loss": 0.0322, "step": 2883 }, { "epoch": 2.6099547511312218, "grad_norm": 0.6770702077461647, "learning_rate": 4.11320970011837e-07, "loss": 0.0286, "step": 2884 }, { "epoch": 2.6108597285067874, "grad_norm": 0.8778056197312223, "learning_rate": 4.0944095921283347e-07, "loss": 0.045, "step": 2885 }, { "epoch": 2.611764705882353, "grad_norm": 0.9857415517670977, "learning_rate": 4.0756507127038494e-07, "loss": 0.0466, "step": 2886 }, { "epoch": 2.6126696832579186, "grad_norm": 0.9102001751045841, "learning_rate": 4.056933078692571e-07, "loss": 0.0577, "step": 2887 }, { "epoch": 2.613574660633484, "grad_norm": 1.1207641101816639, "learning_rate": 4.0382567069051026e-07, "loss": 0.0553, "step": 2888 }, { "epoch": 2.61447963800905, "grad_norm": 0.6649720384175961, "learning_rate": 4.0196216141150214e-07, "loss": 0.0306, "step": 2889 }, { "epoch": 2.6153846153846154, "grad_norm": 0.7492262344111136, "learning_rate": 4.001027817058789e-07, "loss": 0.0313, "step": 2890 }, { "epoch": 2.616289592760181, "grad_norm": 1.2616822768038856, "learning_rate": 3.9824753324358143e-07, "loss": 0.0618, "step": 2891 }, { "epoch": 2.6171945701357466, "grad_norm": 0.9990483596359474, "learning_rate": 3.963964176908391e-07, "loss": 0.0462, "step": 2892 }, { "epoch": 2.6180995475113122, "grad_norm": 0.8708632403214249, "learning_rate": 3.9454943671016763e-07, "loss": 0.0389, "step": 2893 }, { "epoch": 2.619004524886878, "grad_norm": 2.1220556005474744, "learning_rate": 3.927065919603729e-07, "loss": 0.0506, "step": 2894 }, { "epoch": 2.6199095022624435, "grad_norm": 0.8493746746769308, "learning_rate": 3.908678850965425e-07, "loss": 0.0385, "step": 2895 }, { "epoch": 2.620814479638009, "grad_norm": 1.150379303889801, "learning_rate": 3.8903331777005004e-07, "loss": 0.0714, "step": 2896 }, { "epoch": 2.6217194570135747, "grad_norm": 0.6556786315656268, "learning_rate": 3.8720289162855084e-07, "loss": 0.0239, "step": 2897 }, { "epoch": 2.6226244343891403, "grad_norm": 0.8456077671699287, "learning_rate": 3.853766083159799e-07, "loss": 0.0426, "step": 2898 }, { "epoch": 2.623529411764706, "grad_norm": 1.0002534012415192, "learning_rate": 3.8355446947255293e-07, "loss": 0.0443, "step": 2899 }, { "epoch": 2.6244343891402715, "grad_norm": 0.6791744025603745, "learning_rate": 3.8173647673476366e-07, "loss": 0.0341, "step": 2900 }, { "epoch": 2.625339366515837, "grad_norm": 0.6923592350421514, "learning_rate": 3.799226317353788e-07, "loss": 0.0249, "step": 2901 }, { "epoch": 2.6262443438914027, "grad_norm": 0.9402799776722883, "learning_rate": 3.781129361034452e-07, "loss": 0.0398, "step": 2902 }, { "epoch": 2.6271493212669683, "grad_norm": 1.0357216270921785, "learning_rate": 3.7630739146427784e-07, "loss": 0.0624, "step": 2903 }, { "epoch": 2.628054298642534, "grad_norm": 0.6944016027435207, "learning_rate": 3.745059994394673e-07, "loss": 0.0299, "step": 2904 }, { "epoch": 2.6289592760180995, "grad_norm": 0.6269306136238832, "learning_rate": 3.727087616468739e-07, "loss": 0.0311, "step": 2905 }, { "epoch": 2.629864253393665, "grad_norm": 1.1135603928711555, "learning_rate": 3.709156797006247e-07, "loss": 0.0589, "step": 2906 }, { "epoch": 2.6307692307692307, "grad_norm": 1.0102490497194743, "learning_rate": 3.691267552111183e-07, "loss": 0.0496, "step": 2907 }, { "epoch": 2.6316742081447964, "grad_norm": 0.8007050025332535, "learning_rate": 3.67341989785015e-07, "loss": 0.0412, "step": 2908 }, { "epoch": 2.632579185520362, "grad_norm": 0.7842054538623121, "learning_rate": 3.6556138502524263e-07, "loss": 0.0432, "step": 2909 }, { "epoch": 2.6334841628959276, "grad_norm": 0.4901392727322221, "learning_rate": 3.6378494253099307e-07, "loss": 0.0179, "step": 2910 }, { "epoch": 2.634389140271493, "grad_norm": 0.9402192423818209, "learning_rate": 3.620126638977167e-07, "loss": 0.0463, "step": 2911 }, { "epoch": 2.635294117647059, "grad_norm": 0.8523313667547433, "learning_rate": 3.602445507171276e-07, "loss": 0.0474, "step": 2912 }, { "epoch": 2.6361990950226244, "grad_norm": 0.8183821265211729, "learning_rate": 3.5848060457719545e-07, "loss": 0.0397, "step": 2913 }, { "epoch": 2.63710407239819, "grad_norm": 0.7275599562233309, "learning_rate": 3.5672082706215085e-07, "loss": 0.034, "step": 2914 }, { "epoch": 2.6380090497737556, "grad_norm": 0.8262545581806183, "learning_rate": 3.549652197524783e-07, "loss": 0.0341, "step": 2915 }, { "epoch": 2.638914027149321, "grad_norm": 0.7400058509961716, "learning_rate": 3.5321378422491783e-07, "loss": 0.0336, "step": 2916 }, { "epoch": 2.639819004524887, "grad_norm": 0.9457497367293509, "learning_rate": 3.5146652205246214e-07, "loss": 0.0467, "step": 2917 }, { "epoch": 2.6407239819004524, "grad_norm": 0.8965759214604637, "learning_rate": 3.4972343480435657e-07, "loss": 0.0407, "step": 2918 }, { "epoch": 2.641628959276018, "grad_norm": 0.8976756625601975, "learning_rate": 3.479845240460961e-07, "loss": 0.0282, "step": 2919 }, { "epoch": 2.6425339366515836, "grad_norm": 0.7596339004755028, "learning_rate": 3.462497913394258e-07, "loss": 0.0342, "step": 2920 }, { "epoch": 2.6434389140271493, "grad_norm": 1.4639503076417617, "learning_rate": 3.445192382423357e-07, "loss": 0.0513, "step": 2921 }, { "epoch": 2.644343891402715, "grad_norm": 0.8537798348744534, "learning_rate": 3.4279286630906573e-07, "loss": 0.038, "step": 2922 }, { "epoch": 2.6452488687782805, "grad_norm": 0.7240810990541549, "learning_rate": 3.4107067709009957e-07, "loss": 0.0391, "step": 2923 }, { "epoch": 2.646153846153846, "grad_norm": 0.7459489490785816, "learning_rate": 3.3935267213216163e-07, "loss": 0.0358, "step": 2924 }, { "epoch": 2.6470588235294117, "grad_norm": 0.6339300814791389, "learning_rate": 3.3763885297822153e-07, "loss": 0.0306, "step": 2925 }, { "epoch": 2.6479638009049773, "grad_norm": 0.8119621064713185, "learning_rate": 3.3592922116748806e-07, "loss": 0.0413, "step": 2926 }, { "epoch": 2.648868778280543, "grad_norm": 0.7104013680395282, "learning_rate": 3.3422377823540965e-07, "loss": 0.0391, "step": 2927 }, { "epoch": 2.6497737556561085, "grad_norm": 0.9561259118515107, "learning_rate": 3.325225257136738e-07, "loss": 0.0329, "step": 2928 }, { "epoch": 2.650678733031674, "grad_norm": 0.777868847543646, "learning_rate": 3.3082546513020174e-07, "loss": 0.0329, "step": 2929 }, { "epoch": 2.6515837104072397, "grad_norm": 0.7141469042508116, "learning_rate": 3.2913259800915196e-07, "loss": 0.0336, "step": 2930 }, { "epoch": 2.6524886877828053, "grad_norm": 0.8206964677888978, "learning_rate": 3.2744392587091723e-07, "loss": 0.0462, "step": 2931 }, { "epoch": 2.653393665158371, "grad_norm": 0.9046005720338017, "learning_rate": 3.257594502321204e-07, "loss": 0.0489, "step": 2932 }, { "epoch": 2.6542986425339365, "grad_norm": 0.8332736176586649, "learning_rate": 3.2407917260561806e-07, "loss": 0.037, "step": 2933 }, { "epoch": 2.655203619909502, "grad_norm": 0.82382920118471, "learning_rate": 3.224030945004936e-07, "loss": 0.0381, "step": 2934 }, { "epoch": 2.6561085972850678, "grad_norm": 0.9185861635751621, "learning_rate": 3.2073121742206117e-07, "loss": 0.0395, "step": 2935 }, { "epoch": 2.6570135746606334, "grad_norm": 0.5907877570690759, "learning_rate": 3.190635428718619e-07, "loss": 0.0279, "step": 2936 }, { "epoch": 2.657918552036199, "grad_norm": 0.8052826390417739, "learning_rate": 3.1740007234766003e-07, "loss": 0.0366, "step": 2937 }, { "epoch": 2.6588235294117646, "grad_norm": 0.6111952022156668, "learning_rate": 3.1574080734344757e-07, "loss": 0.0261, "step": 2938 }, { "epoch": 2.65972850678733, "grad_norm": 0.7678463145085238, "learning_rate": 3.1408574934943594e-07, "loss": 0.0286, "step": 2939 }, { "epoch": 2.660633484162896, "grad_norm": 0.9603099434760001, "learning_rate": 3.1243489985206097e-07, "loss": 0.0415, "step": 2940 }, { "epoch": 2.6615384615384614, "grad_norm": 0.7377803392602184, "learning_rate": 3.1078826033397845e-07, "loss": 0.0292, "step": 2941 }, { "epoch": 2.662443438914027, "grad_norm": 0.8700783419365025, "learning_rate": 3.091458322740604e-07, "loss": 0.0375, "step": 2942 }, { "epoch": 2.6633484162895926, "grad_norm": 0.7467301501229242, "learning_rate": 3.075076171473995e-07, "loss": 0.0409, "step": 2943 }, { "epoch": 2.6642533936651582, "grad_norm": 0.7894162713856404, "learning_rate": 3.0587361642530457e-07, "loss": 0.0391, "step": 2944 }, { "epoch": 2.665158371040724, "grad_norm": 0.7412818355799966, "learning_rate": 3.0424383157529716e-07, "loss": 0.043, "step": 2945 }, { "epoch": 2.6660633484162894, "grad_norm": 0.7558910296886152, "learning_rate": 3.026182640611153e-07, "loss": 0.0289, "step": 2946 }, { "epoch": 2.666968325791855, "grad_norm": 0.5396670435080645, "learning_rate": 3.0099691534270613e-07, "loss": 0.0215, "step": 2947 }, { "epoch": 2.6678733031674207, "grad_norm": 1.1668399948525237, "learning_rate": 2.9937978687622995e-07, "loss": 0.0522, "step": 2948 }, { "epoch": 2.6687782805429863, "grad_norm": 0.651133168519965, "learning_rate": 2.977668801140582e-07, "loss": 0.025, "step": 2949 }, { "epoch": 2.669683257918552, "grad_norm": 0.6979132778337807, "learning_rate": 2.961581965047672e-07, "loss": 0.034, "step": 2950 }, { "epoch": 2.6705882352941175, "grad_norm": 0.8852144003221406, "learning_rate": 2.9455373749314285e-07, "loss": 0.044, "step": 2951 }, { "epoch": 2.671493212669683, "grad_norm": 0.5281413321975541, "learning_rate": 2.9295350452017537e-07, "loss": 0.0205, "step": 2952 }, { "epoch": 2.672398190045249, "grad_norm": 1.0579113562366935, "learning_rate": 2.9135749902306107e-07, "loss": 0.0506, "step": 2953 }, { "epoch": 2.6733031674208148, "grad_norm": 0.6768956921148874, "learning_rate": 2.897657224351985e-07, "loss": 0.0272, "step": 2954 }, { "epoch": 2.6742081447963804, "grad_norm": 0.8262844068485521, "learning_rate": 2.8817817618618846e-07, "loss": 0.0464, "step": 2955 }, { "epoch": 2.675113122171946, "grad_norm": 0.7465423995336021, "learning_rate": 2.8659486170183327e-07, "loss": 0.031, "step": 2956 }, { "epoch": 2.6760180995475116, "grad_norm": 0.7530532189795534, "learning_rate": 2.85015780404132e-07, "loss": 0.0377, "step": 2957 }, { "epoch": 2.676923076923077, "grad_norm": 0.5939871159514409, "learning_rate": 2.834409337112842e-07, "loss": 0.0282, "step": 2958 }, { "epoch": 2.677828054298643, "grad_norm": 1.1403469628324963, "learning_rate": 2.818703230376862e-07, "loss": 0.0585, "step": 2959 }, { "epoch": 2.6787330316742084, "grad_norm": 0.8737057920834232, "learning_rate": 2.803039497939281e-07, "loss": 0.0419, "step": 2960 }, { "epoch": 2.679638009049774, "grad_norm": 0.6578314419164244, "learning_rate": 2.787418153867971e-07, "loss": 0.0297, "step": 2961 }, { "epoch": 2.6805429864253396, "grad_norm": 0.8524641068407285, "learning_rate": 2.771839212192701e-07, "loss": 0.0328, "step": 2962 }, { "epoch": 2.681447963800905, "grad_norm": 0.6803141113731712, "learning_rate": 2.756302686905177e-07, "loss": 0.0304, "step": 2963 }, { "epoch": 2.682352941176471, "grad_norm": 0.9029010621525011, "learning_rate": 2.7408085919590265e-07, "loss": 0.0449, "step": 2964 }, { "epoch": 2.6832579185520364, "grad_norm": 1.0307087680614984, "learning_rate": 2.7253569412697244e-07, "loss": 0.0502, "step": 2965 }, { "epoch": 2.684162895927602, "grad_norm": 0.8445589141418787, "learning_rate": 2.7099477487146754e-07, "loss": 0.0474, "step": 2966 }, { "epoch": 2.6850678733031677, "grad_norm": 0.777854962655268, "learning_rate": 2.6945810281331085e-07, "loss": 0.033, "step": 2967 }, { "epoch": 2.6859728506787333, "grad_norm": 0.716588325215143, "learning_rate": 2.6792567933261405e-07, "loss": 0.036, "step": 2968 }, { "epoch": 2.686877828054299, "grad_norm": 0.6648298488477664, "learning_rate": 2.663975058056717e-07, "loss": 0.024, "step": 2969 }, { "epoch": 2.6877828054298645, "grad_norm": 0.7240005919678763, "learning_rate": 2.648735836049615e-07, "loss": 0.0319, "step": 2970 }, { "epoch": 2.68868778280543, "grad_norm": 1.0928824972579572, "learning_rate": 2.6335391409914314e-07, "loss": 0.0531, "step": 2971 }, { "epoch": 2.6895927601809957, "grad_norm": 1.025020899681285, "learning_rate": 2.6183849865305645e-07, "loss": 0.0487, "step": 2972 }, { "epoch": 2.6904977375565613, "grad_norm": 0.6698452232380842, "learning_rate": 2.6032733862772106e-07, "loss": 0.0321, "step": 2973 }, { "epoch": 2.691402714932127, "grad_norm": 0.8581806835076109, "learning_rate": 2.588204353803353e-07, "loss": 0.0308, "step": 2974 }, { "epoch": 2.6923076923076925, "grad_norm": 1.3927540167559378, "learning_rate": 2.573177902642726e-07, "loss": 0.0398, "step": 2975 }, { "epoch": 2.693212669683258, "grad_norm": 1.1791231702508618, "learning_rate": 2.558194046290835e-07, "loss": 0.0581, "step": 2976 }, { "epoch": 2.6941176470588237, "grad_norm": 1.0715180618152524, "learning_rate": 2.5432527982049424e-07, "loss": 0.0496, "step": 2977 }, { "epoch": 2.6950226244343893, "grad_norm": 0.7452561780676891, "learning_rate": 2.5283541718040095e-07, "loss": 0.033, "step": 2978 }, { "epoch": 2.695927601809955, "grad_norm": 0.723288405774461, "learning_rate": 2.5134981804687487e-07, "loss": 0.0481, "step": 2979 }, { "epoch": 2.6968325791855206, "grad_norm": 0.8189342964016437, "learning_rate": 2.4986848375415653e-07, "loss": 0.0279, "step": 2980 }, { "epoch": 2.697737556561086, "grad_norm": 0.7293349101282012, "learning_rate": 2.483914156326561e-07, "loss": 0.0296, "step": 2981 }, { "epoch": 2.6986425339366518, "grad_norm": 1.0132717076269706, "learning_rate": 2.469186150089548e-07, "loss": 0.0486, "step": 2982 }, { "epoch": 2.6995475113122174, "grad_norm": 0.6879674112301807, "learning_rate": 2.454500832057971e-07, "loss": 0.0334, "step": 2983 }, { "epoch": 2.700452488687783, "grad_norm": 0.6933546410480212, "learning_rate": 2.4398582154209646e-07, "loss": 0.0317, "step": 2984 }, { "epoch": 2.7013574660633486, "grad_norm": 1.1378533665413524, "learning_rate": 2.4252583133292927e-07, "loss": 0.0585, "step": 2985 }, { "epoch": 2.702262443438914, "grad_norm": 0.9710705230400719, "learning_rate": 2.410701138895383e-07, "loss": 0.0435, "step": 2986 }, { "epoch": 2.70316742081448, "grad_norm": 0.7534184032014971, "learning_rate": 2.396186705193265e-07, "loss": 0.0432, "step": 2987 }, { "epoch": 2.7040723981900454, "grad_norm": 0.8111861398653997, "learning_rate": 2.3817150252585853e-07, "loss": 0.0353, "step": 2988 }, { "epoch": 2.704977375565611, "grad_norm": 0.7970732513004493, "learning_rate": 2.3672861120886072e-07, "loss": 0.0413, "step": 2989 }, { "epoch": 2.7058823529411766, "grad_norm": 1.7367684706105304, "learning_rate": 2.3528999786421758e-07, "loss": 0.1009, "step": 2990 }, { "epoch": 2.7067873303167422, "grad_norm": 0.8263412841573439, "learning_rate": 2.3385566378397007e-07, "loss": 0.0349, "step": 2991 }, { "epoch": 2.707692307692308, "grad_norm": 0.8217023535556328, "learning_rate": 2.3242561025631882e-07, "loss": 0.0383, "step": 2992 }, { "epoch": 2.7085972850678735, "grad_norm": 0.7624140118068965, "learning_rate": 2.3099983856561704e-07, "loss": 0.0302, "step": 2993 }, { "epoch": 2.709502262443439, "grad_norm": 0.9437506652668349, "learning_rate": 2.2957834999237426e-07, "loss": 0.0436, "step": 2994 }, { "epoch": 2.7104072398190047, "grad_norm": 0.850763119897463, "learning_rate": 2.2816114581325377e-07, "loss": 0.0497, "step": 2995 }, { "epoch": 2.7113122171945703, "grad_norm": 0.7189937745534006, "learning_rate": 2.2674822730106792e-07, "loss": 0.0307, "step": 2996 }, { "epoch": 2.712217194570136, "grad_norm": 0.6310800454609364, "learning_rate": 2.2533959572478393e-07, "loss": 0.0248, "step": 2997 }, { "epoch": 2.7131221719457015, "grad_norm": 0.6445347682716394, "learning_rate": 2.2393525234951585e-07, "loss": 0.0316, "step": 2998 }, { "epoch": 2.714027149321267, "grad_norm": 1.0738365298565293, "learning_rate": 2.225351984365276e-07, "loss": 0.0451, "step": 2999 }, { "epoch": 2.7149321266968327, "grad_norm": 0.7183471302091506, "learning_rate": 2.2113943524323167e-07, "loss": 0.0315, "step": 3000 }, { "epoch": 2.7158371040723983, "grad_norm": 0.8882948637078791, "learning_rate": 2.1974796402318477e-07, "loss": 0.044, "step": 3001 }, { "epoch": 2.716742081447964, "grad_norm": 1.0149572630583168, "learning_rate": 2.1836078602609001e-07, "loss": 0.0426, "step": 3002 }, { "epoch": 2.7176470588235295, "grad_norm": 0.9002483353942873, "learning_rate": 2.1697790249779638e-07, "loss": 0.0304, "step": 3003 }, { "epoch": 2.718552036199095, "grad_norm": 1.0163890298886349, "learning_rate": 2.1559931468029205e-07, "loss": 0.054, "step": 3004 }, { "epoch": 2.7194570135746607, "grad_norm": 0.883325550304858, "learning_rate": 2.1422502381171163e-07, "loss": 0.0437, "step": 3005 }, { "epoch": 2.7203619909502263, "grad_norm": 0.8108895654094237, "learning_rate": 2.128550311263261e-07, "loss": 0.0517, "step": 3006 }, { "epoch": 2.721266968325792, "grad_norm": 1.4021701473911858, "learning_rate": 2.1148933785455018e-07, "loss": 0.1097, "step": 3007 }, { "epoch": 2.7221719457013576, "grad_norm": 0.9945525764597718, "learning_rate": 2.101279452229349e-07, "loss": 0.0565, "step": 3008 }, { "epoch": 2.723076923076923, "grad_norm": 0.8134302289819094, "learning_rate": 2.0877085445416889e-07, "loss": 0.0416, "step": 3009 }, { "epoch": 2.723981900452489, "grad_norm": 0.9427800866379048, "learning_rate": 2.0741806676707887e-07, "loss": 0.0451, "step": 3010 }, { "epoch": 2.7248868778280544, "grad_norm": 0.8371519089684442, "learning_rate": 2.0606958337662408e-07, "loss": 0.0315, "step": 3011 }, { "epoch": 2.72579185520362, "grad_norm": 1.1855308302610257, "learning_rate": 2.0472540549390074e-07, "loss": 0.0956, "step": 3012 }, { "epoch": 2.7266968325791856, "grad_norm": 1.0777650402230938, "learning_rate": 2.0338553432613706e-07, "loss": 0.0505, "step": 3013 }, { "epoch": 2.727601809954751, "grad_norm": 0.7906371391564205, "learning_rate": 2.020499710766932e-07, "loss": 0.0354, "step": 3014 }, { "epoch": 2.728506787330317, "grad_norm": 0.7684911198119202, "learning_rate": 2.007187169450603e-07, "loss": 0.026, "step": 3015 }, { "epoch": 2.7294117647058824, "grad_norm": 0.7129277966176025, "learning_rate": 1.9939177312685963e-07, "loss": 0.038, "step": 3016 }, { "epoch": 2.730316742081448, "grad_norm": 1.0056405015966188, "learning_rate": 1.9806914081384133e-07, "loss": 0.0448, "step": 3017 }, { "epoch": 2.7312217194570136, "grad_norm": 0.9488638203785268, "learning_rate": 1.9675082119388346e-07, "loss": 0.0425, "step": 3018 }, { "epoch": 2.7321266968325792, "grad_norm": 0.8981434768764692, "learning_rate": 1.9543681545099004e-07, "loss": 0.0468, "step": 3019 }, { "epoch": 2.733031674208145, "grad_norm": 1.2958084616203687, "learning_rate": 1.941271247652915e-07, "loss": 0.0614, "step": 3020 }, { "epoch": 2.7339366515837105, "grad_norm": 1.003783641681347, "learning_rate": 1.9282175031304307e-07, "loss": 0.0524, "step": 3021 }, { "epoch": 2.734841628959276, "grad_norm": 0.8904845255947712, "learning_rate": 1.9152069326662192e-07, "loss": 0.0541, "step": 3022 }, { "epoch": 2.7357466063348417, "grad_norm": 0.87740462699958, "learning_rate": 1.9022395479453005e-07, "loss": 0.0376, "step": 3023 }, { "epoch": 2.7366515837104073, "grad_norm": 0.7533895399319646, "learning_rate": 1.8893153606138802e-07, "loss": 0.032, "step": 3024 }, { "epoch": 2.737556561085973, "grad_norm": 0.9663964895069556, "learning_rate": 1.8764343822793962e-07, "loss": 0.0644, "step": 3025 }, { "epoch": 2.7384615384615385, "grad_norm": 0.6915730154417901, "learning_rate": 1.8635966245104663e-07, "loss": 0.0325, "step": 3026 }, { "epoch": 2.739366515837104, "grad_norm": 0.8086192488018025, "learning_rate": 1.8508020988368902e-07, "loss": 0.0381, "step": 3027 }, { "epoch": 2.7402714932126697, "grad_norm": 1.1312438198384283, "learning_rate": 1.8380508167496368e-07, "loss": 0.0643, "step": 3028 }, { "epoch": 2.7411764705882353, "grad_norm": 0.9117206074105033, "learning_rate": 1.825342789700846e-07, "loss": 0.0418, "step": 3029 }, { "epoch": 2.742081447963801, "grad_norm": 1.0321575884215888, "learning_rate": 1.8126780291038037e-07, "loss": 0.056, "step": 3030 }, { "epoch": 2.7429864253393665, "grad_norm": 1.086384402930287, "learning_rate": 1.800056546332951e-07, "loss": 0.0574, "step": 3031 }, { "epoch": 2.743891402714932, "grad_norm": 0.8705727278440942, "learning_rate": 1.7874783527238315e-07, "loss": 0.043, "step": 3032 }, { "epoch": 2.7447963800904978, "grad_norm": 0.7196187353011336, "learning_rate": 1.7749434595731364e-07, "loss": 0.0377, "step": 3033 }, { "epoch": 2.7457013574660634, "grad_norm": 0.928356806700628, "learning_rate": 1.7624518781386601e-07, "loss": 0.0566, "step": 3034 }, { "epoch": 2.746606334841629, "grad_norm": 0.9982802271930581, "learning_rate": 1.7500036196392956e-07, "loss": 0.0576, "step": 3035 }, { "epoch": 2.7475113122171946, "grad_norm": 0.908950691140068, "learning_rate": 1.7375986952550328e-07, "loss": 0.042, "step": 3036 }, { "epoch": 2.74841628959276, "grad_norm": 0.9520238760735512, "learning_rate": 1.725237116126932e-07, "loss": 0.0419, "step": 3037 }, { "epoch": 2.749321266968326, "grad_norm": 0.762511086992885, "learning_rate": 1.7129188933571295e-07, "loss": 0.0371, "step": 3038 }, { "epoch": 2.7502262443438914, "grad_norm": 0.8799880054745145, "learning_rate": 1.7006440380088361e-07, "loss": 0.0482, "step": 3039 }, { "epoch": 2.751131221719457, "grad_norm": 0.6759710006361493, "learning_rate": 1.688412561106284e-07, "loss": 0.0359, "step": 3040 }, { "epoch": 2.7520361990950226, "grad_norm": 1.008206101231249, "learning_rate": 1.6762244736347798e-07, "loss": 0.0463, "step": 3041 }, { "epoch": 2.7529411764705882, "grad_norm": 0.7847977573031666, "learning_rate": 1.664079786540629e-07, "loss": 0.05, "step": 3042 }, { "epoch": 2.753846153846154, "grad_norm": 0.9065228448047745, "learning_rate": 1.651978510731189e-07, "loss": 0.0331, "step": 3043 }, { "epoch": 2.7547511312217194, "grad_norm": 0.5670755068079508, "learning_rate": 1.6399206570748117e-07, "loss": 0.0212, "step": 3044 }, { "epoch": 2.755656108597285, "grad_norm": 0.6192751891988074, "learning_rate": 1.6279062364008446e-07, "loss": 0.0256, "step": 3045 }, { "epoch": 2.7565610859728507, "grad_norm": 0.9474139282736715, "learning_rate": 1.6159352594996403e-07, "loss": 0.0491, "step": 3046 }, { "epoch": 2.7574660633484163, "grad_norm": 0.7938621159842667, "learning_rate": 1.6040077371225426e-07, "loss": 0.0435, "step": 3047 }, { "epoch": 2.758371040723982, "grad_norm": 0.6742208629783519, "learning_rate": 1.5921236799818385e-07, "loss": 0.0355, "step": 3048 }, { "epoch": 2.7592760180995475, "grad_norm": 0.8298626576275716, "learning_rate": 1.5802830987508067e-07, "loss": 0.0377, "step": 3049 }, { "epoch": 2.760180995475113, "grad_norm": 0.8050805432810808, "learning_rate": 1.5684860040636573e-07, "loss": 0.0515, "step": 3050 }, { "epoch": 2.7610859728506787, "grad_norm": 0.8802216088780123, "learning_rate": 1.5567324065155653e-07, "loss": 0.0415, "step": 3051 }, { "epoch": 2.7619909502262443, "grad_norm": 0.8392528991655681, "learning_rate": 1.5450223166626254e-07, "loss": 0.0346, "step": 3052 }, { "epoch": 2.76289592760181, "grad_norm": 0.5929549426051067, "learning_rate": 1.5333557450218639e-07, "loss": 0.0276, "step": 3053 }, { "epoch": 2.7638009049773755, "grad_norm": 0.7239669531633957, "learning_rate": 1.5217327020712157e-07, "loss": 0.0328, "step": 3054 }, { "epoch": 2.764705882352941, "grad_norm": 0.8714652234375803, "learning_rate": 1.510153198249531e-07, "loss": 0.0388, "step": 3055 }, { "epoch": 2.7656108597285067, "grad_norm": 0.8785164926886225, "learning_rate": 1.498617243956546e-07, "loss": 0.0361, "step": 3056 }, { "epoch": 2.7665158371040723, "grad_norm": 0.8120672389212615, "learning_rate": 1.487124849552901e-07, "loss": 0.041, "step": 3057 }, { "epoch": 2.767420814479638, "grad_norm": 0.9599362933094197, "learning_rate": 1.475676025360101e-07, "loss": 0.0438, "step": 3058 }, { "epoch": 2.7683257918552036, "grad_norm": 0.6456664137637933, "learning_rate": 1.4642707816605206e-07, "loss": 0.0312, "step": 3059 }, { "epoch": 2.769230769230769, "grad_norm": 0.713776490896217, "learning_rate": 1.4529091286973994e-07, "loss": 0.0324, "step": 3060 }, { "epoch": 2.7701357466063348, "grad_norm": 0.6746481328380771, "learning_rate": 1.4415910766748198e-07, "loss": 0.034, "step": 3061 }, { "epoch": 2.7710407239819004, "grad_norm": 0.7555222925013636, "learning_rate": 1.430316635757717e-07, "loss": 0.0308, "step": 3062 }, { "epoch": 2.771945701357466, "grad_norm": 0.7794007025708242, "learning_rate": 1.4190858160718468e-07, "loss": 0.0451, "step": 3063 }, { "epoch": 2.7728506787330316, "grad_norm": 0.7940820140658598, "learning_rate": 1.407898627703802e-07, "loss": 0.0395, "step": 3064 }, { "epoch": 2.773755656108597, "grad_norm": 0.9554675933834362, "learning_rate": 1.3967550807009677e-07, "loss": 0.0478, "step": 3065 }, { "epoch": 2.774660633484163, "grad_norm": 0.9758180710533387, "learning_rate": 1.3856551850715494e-07, "loss": 0.0517, "step": 3066 }, { "epoch": 2.7755656108597284, "grad_norm": 1.0234374943261404, "learning_rate": 1.3745989507845558e-07, "loss": 0.0403, "step": 3067 }, { "epoch": 2.776470588235294, "grad_norm": 0.6771977114775679, "learning_rate": 1.363586387769761e-07, "loss": 0.0316, "step": 3068 }, { "epoch": 2.7773755656108596, "grad_norm": 0.7711597164359577, "learning_rate": 1.352617505917736e-07, "loss": 0.0394, "step": 3069 }, { "epoch": 2.7782805429864252, "grad_norm": 0.8865977882700403, "learning_rate": 1.3416923150798123e-07, "loss": 0.0376, "step": 3070 }, { "epoch": 2.779185520361991, "grad_norm": 0.7846165199180828, "learning_rate": 1.3308108250680796e-07, "loss": 0.0409, "step": 3071 }, { "epoch": 2.7800904977375565, "grad_norm": 0.8251053087352314, "learning_rate": 1.319973045655393e-07, "loss": 0.0412, "step": 3072 }, { "epoch": 2.780995475113122, "grad_norm": 0.731055045145988, "learning_rate": 1.3091789865753268e-07, "loss": 0.0317, "step": 3073 }, { "epoch": 2.7819004524886877, "grad_norm": 0.7481749572045017, "learning_rate": 1.2984286575222105e-07, "loss": 0.0357, "step": 3074 }, { "epoch": 2.7828054298642533, "grad_norm": 0.5073516621947262, "learning_rate": 1.2877220681510927e-07, "loss": 0.0191, "step": 3075 }, { "epoch": 2.783710407239819, "grad_norm": 0.8015118025005862, "learning_rate": 1.2770592280777316e-07, "loss": 0.0424, "step": 3076 }, { "epoch": 2.7846153846153845, "grad_norm": 0.6796074888602505, "learning_rate": 1.2664401468786114e-07, "loss": 0.0366, "step": 3077 }, { "epoch": 2.78552036199095, "grad_norm": 0.7109963721818252, "learning_rate": 1.2558648340908865e-07, "loss": 0.0272, "step": 3078 }, { "epoch": 2.7864253393665157, "grad_norm": 0.5591564463249132, "learning_rate": 1.2453332992124256e-07, "loss": 0.0227, "step": 3079 }, { "epoch": 2.7873303167420813, "grad_norm": 0.7548017254252788, "learning_rate": 1.2348455517017855e-07, "loss": 0.0304, "step": 3080 }, { "epoch": 2.788235294117647, "grad_norm": 0.7700421405373505, "learning_rate": 1.22440160097817e-07, "loss": 0.0362, "step": 3081 }, { "epoch": 2.7891402714932125, "grad_norm": 0.8481542991196741, "learning_rate": 1.2140014564214652e-07, "loss": 0.0423, "step": 3082 }, { "epoch": 2.790045248868778, "grad_norm": 1.587114376827527, "learning_rate": 1.203645127372216e-07, "loss": 0.0702, "step": 3083 }, { "epoch": 2.7909502262443437, "grad_norm": 0.8059462106785703, "learning_rate": 1.19333262313161e-07, "loss": 0.0416, "step": 3084 }, { "epoch": 2.7918552036199094, "grad_norm": 0.6178267369340678, "learning_rate": 1.1830639529614774e-07, "loss": 0.0265, "step": 3085 }, { "epoch": 2.792760180995475, "grad_norm": 0.832021929787715, "learning_rate": 1.1728391260842798e-07, "loss": 0.0448, "step": 3086 }, { "epoch": 2.7936651583710406, "grad_norm": 0.7073229725915556, "learning_rate": 1.1626581516831048e-07, "loss": 0.03, "step": 3087 }, { "epoch": 2.794570135746606, "grad_norm": 1.170533781536664, "learning_rate": 1.1525210389016439e-07, "loss": 0.033, "step": 3088 }, { "epoch": 2.795475113122172, "grad_norm": 0.7693807186439109, "learning_rate": 1.1424277968442088e-07, "loss": 0.03, "step": 3089 }, { "epoch": 2.7963800904977374, "grad_norm": 0.647096199258872, "learning_rate": 1.1323784345757205e-07, "loss": 0.0322, "step": 3090 }, { "epoch": 2.797285067873303, "grad_norm": 0.6754148864961371, "learning_rate": 1.1223729611216539e-07, "loss": 0.0325, "step": 3091 }, { "epoch": 2.7981900452488686, "grad_norm": 0.6989637390262156, "learning_rate": 1.1124113854680984e-07, "loss": 0.0314, "step": 3092 }, { "epoch": 2.799095022624434, "grad_norm": 0.7849085270677302, "learning_rate": 1.1024937165617144e-07, "loss": 0.0405, "step": 3093 }, { "epoch": 2.8, "grad_norm": 0.611969298467452, "learning_rate": 1.0926199633097156e-07, "loss": 0.0307, "step": 3094 }, { "epoch": 2.8009049773755654, "grad_norm": 1.5662910684902445, "learning_rate": 1.0827901345798919e-07, "loss": 0.0877, "step": 3095 }, { "epoch": 2.801809954751131, "grad_norm": 0.823163958645072, "learning_rate": 1.0730042392005647e-07, "loss": 0.042, "step": 3096 }, { "epoch": 2.8027149321266966, "grad_norm": 0.8788690736876312, "learning_rate": 1.0632622859606034e-07, "loss": 0.0463, "step": 3097 }, { "epoch": 2.8036199095022623, "grad_norm": 0.9065585511417897, "learning_rate": 1.0535642836094373e-07, "loss": 0.0506, "step": 3098 }, { "epoch": 2.804524886877828, "grad_norm": 0.9055203573216608, "learning_rate": 1.0439102408569768e-07, "loss": 0.0286, "step": 3099 }, { "epoch": 2.8054298642533935, "grad_norm": 0.555673416768716, "learning_rate": 1.0343001663736918e-07, "loss": 0.0234, "step": 3100 }, { "epoch": 2.806334841628959, "grad_norm": 0.7680223477745549, "learning_rate": 1.024734068790545e-07, "loss": 0.0401, "step": 3101 }, { "epoch": 2.8072398190045247, "grad_norm": 1.0417451263870738, "learning_rate": 1.0152119566990026e-07, "loss": 0.05, "step": 3102 }, { "epoch": 2.8081447963800903, "grad_norm": 0.9098460454377238, "learning_rate": 1.0057338386510352e-07, "loss": 0.038, "step": 3103 }, { "epoch": 2.809049773755656, "grad_norm": 0.9243411196352124, "learning_rate": 9.962997231590887e-08, "loss": 0.0366, "step": 3104 }, { "epoch": 2.8099547511312215, "grad_norm": 0.8023571659024885, "learning_rate": 9.869096186961025e-08, "loss": 0.0308, "step": 3105 }, { "epoch": 2.810859728506787, "grad_norm": 0.8135835605603456, "learning_rate": 9.775635336954913e-08, "loss": 0.0307, "step": 3106 }, { "epoch": 2.8117647058823527, "grad_norm": 0.6717091678953264, "learning_rate": 9.682614765511134e-08, "loss": 0.0343, "step": 3107 }, { "epoch": 2.8126696832579183, "grad_norm": 1.05268793722646, "learning_rate": 9.590034556173078e-08, "loss": 0.0497, "step": 3108 }, { "epoch": 2.813574660633484, "grad_norm": 0.8727052165763626, "learning_rate": 9.497894792088514e-08, "loss": 0.0379, "step": 3109 }, { "epoch": 2.8144796380090495, "grad_norm": 1.194537529685351, "learning_rate": 9.406195556009745e-08, "loss": 0.0481, "step": 3110 }, { "epoch": 2.815384615384615, "grad_norm": 1.1784507157281694, "learning_rate": 9.314936930293283e-08, "loss": 0.0521, "step": 3111 }, { "epoch": 2.8162895927601808, "grad_norm": 0.7968989364570704, "learning_rate": 9.224118996900067e-08, "loss": 0.034, "step": 3112 }, { "epoch": 2.8171945701357464, "grad_norm": 0.9256424056658804, "learning_rate": 9.133741837395127e-08, "loss": 0.052, "step": 3113 }, { "epoch": 2.818099547511312, "grad_norm": 0.6366613809239065, "learning_rate": 9.043805532947647e-08, "loss": 0.0246, "step": 3114 }, { "epoch": 2.8190045248868776, "grad_norm": 0.9329364511021364, "learning_rate": 8.954310164331015e-08, "loss": 0.0443, "step": 3115 }, { "epoch": 2.819909502262443, "grad_norm": 0.7191643204792627, "learning_rate": 8.86525581192238e-08, "loss": 0.0334, "step": 3116 }, { "epoch": 2.820814479638009, "grad_norm": 0.7052446498157873, "learning_rate": 8.776642555702986e-08, "loss": 0.033, "step": 3117 }, { "epoch": 2.8217194570135744, "grad_norm": 1.1457755578513893, "learning_rate": 8.688470475257837e-08, "loss": 0.0349, "step": 3118 }, { "epoch": 2.82262443438914, "grad_norm": 0.7569606613831292, "learning_rate": 8.600739649775702e-08, "loss": 0.0385, "step": 3119 }, { "epoch": 2.8235294117647056, "grad_norm": 0.7876833885202672, "learning_rate": 8.513450158049109e-08, "loss": 0.0328, "step": 3120 }, { "epoch": 2.8244343891402712, "grad_norm": 0.6895998202222309, "learning_rate": 8.426602078474233e-08, "loss": 0.0363, "step": 3121 }, { "epoch": 2.825339366515837, "grad_norm": 0.9655863144300147, "learning_rate": 8.340195489050629e-08, "loss": 0.0393, "step": 3122 }, { "epoch": 2.8262443438914024, "grad_norm": 0.6429825832376328, "learning_rate": 8.254230467381552e-08, "loss": 0.0325, "step": 3123 }, { "epoch": 2.827149321266968, "grad_norm": 0.7388062369220354, "learning_rate": 8.168707090673688e-08, "loss": 0.0332, "step": 3124 }, { "epoch": 2.8280542986425337, "grad_norm": 0.5448658037200568, "learning_rate": 8.08362543573682e-08, "loss": 0.0284, "step": 3125 }, { "epoch": 2.8289592760180997, "grad_norm": 0.6852743950056802, "learning_rate": 7.99898557898432e-08, "loss": 0.0272, "step": 3126 }, { "epoch": 2.8298642533936653, "grad_norm": 0.8031290854118033, "learning_rate": 7.914787596432549e-08, "loss": 0.0373, "step": 3127 }, { "epoch": 2.830769230769231, "grad_norm": 0.7556698079263456, "learning_rate": 7.831031563701131e-08, "loss": 0.0356, "step": 3128 }, { "epoch": 2.8316742081447965, "grad_norm": 0.6176442326395905, "learning_rate": 7.74771755601278e-08, "loss": 0.0211, "step": 3129 }, { "epoch": 2.832579185520362, "grad_norm": 2.0329942975299335, "learning_rate": 7.664845648193087e-08, "loss": 0.054, "step": 3130 }, { "epoch": 2.8334841628959277, "grad_norm": 0.5721168997749405, "learning_rate": 7.582415914670738e-08, "loss": 0.0275, "step": 3131 }, { "epoch": 2.8343891402714934, "grad_norm": 0.7509523505738033, "learning_rate": 7.50042842947718e-08, "loss": 0.0356, "step": 3132 }, { "epoch": 2.835294117647059, "grad_norm": 0.6404613989204495, "learning_rate": 7.418883266246734e-08, "loss": 0.0288, "step": 3133 }, { "epoch": 2.8361990950226246, "grad_norm": 0.8990716215816196, "learning_rate": 7.337780498216541e-08, "loss": 0.0327, "step": 3134 }, { "epoch": 2.83710407239819, "grad_norm": 0.6104670321198926, "learning_rate": 7.257120198226219e-08, "loss": 0.0242, "step": 3135 }, { "epoch": 2.838009049773756, "grad_norm": 0.7118540810130862, "learning_rate": 7.176902438718159e-08, "loss": 0.0366, "step": 3136 }, { "epoch": 2.8389140271493214, "grad_norm": 0.811705886844816, "learning_rate": 7.097127291737283e-08, "loss": 0.0374, "step": 3137 }, { "epoch": 2.839819004524887, "grad_norm": 0.7829429199123514, "learning_rate": 7.017794828930891e-08, "loss": 0.042, "step": 3138 }, { "epoch": 2.8407239819004526, "grad_norm": 0.8212166738806442, "learning_rate": 6.938905121548823e-08, "loss": 0.0297, "step": 3139 }, { "epoch": 2.841628959276018, "grad_norm": 0.7685156197282735, "learning_rate": 6.860458240443179e-08, "loss": 0.039, "step": 3140 }, { "epoch": 2.842533936651584, "grad_norm": 0.8648726826222265, "learning_rate": 6.782454256068439e-08, "loss": 0.0371, "step": 3141 }, { "epoch": 2.8434389140271494, "grad_norm": 0.9943818437409878, "learning_rate": 6.70489323848128e-08, "loss": 0.0453, "step": 3142 }, { "epoch": 2.844343891402715, "grad_norm": 0.6766549095421549, "learning_rate": 6.62777525734043e-08, "loss": 0.0293, "step": 3143 }, { "epoch": 2.8452488687782806, "grad_norm": 0.6949139377569827, "learning_rate": 6.551100381906872e-08, "loss": 0.0327, "step": 3144 }, { "epoch": 2.8461538461538463, "grad_norm": 0.8006621499024716, "learning_rate": 6.474868681043578e-08, "loss": 0.0355, "step": 3145 }, { "epoch": 2.847058823529412, "grad_norm": 0.8411757698086274, "learning_rate": 6.399080223215503e-08, "loss": 0.0488, "step": 3146 }, { "epoch": 2.8479638009049775, "grad_norm": 0.7058437817339792, "learning_rate": 6.323735076489534e-08, "loss": 0.0314, "step": 3147 }, { "epoch": 2.848868778280543, "grad_norm": 0.7371513655046624, "learning_rate": 6.248833308534263e-08, "loss": 0.034, "step": 3148 }, { "epoch": 2.8497737556561087, "grad_norm": 1.1552021764863942, "learning_rate": 6.174374986620324e-08, "loss": 0.0307, "step": 3149 }, { "epoch": 2.8506787330316743, "grad_norm": 1.1481086970047496, "learning_rate": 6.100360177619946e-08, "loss": 0.0586, "step": 3150 }, { "epoch": 2.85158371040724, "grad_norm": 1.0129981170226592, "learning_rate": 6.026788948007011e-08, "loss": 0.0375, "step": 3151 }, { "epoch": 2.8524886877828055, "grad_norm": 0.6813506988789841, "learning_rate": 5.9536613638570553e-08, "loss": 0.0307, "step": 3152 }, { "epoch": 2.853393665158371, "grad_norm": 0.5675126915369519, "learning_rate": 5.8809774908471505e-08, "loss": 0.0252, "step": 3153 }, { "epoch": 2.8542986425339367, "grad_norm": 0.7128197803713955, "learning_rate": 5.808737394255859e-08, "loss": 0.0423, "step": 3154 }, { "epoch": 2.8552036199095023, "grad_norm": 0.6089976172274052, "learning_rate": 5.736941138963281e-08, "loss": 0.0236, "step": 3155 }, { "epoch": 2.856108597285068, "grad_norm": 0.731365707148675, "learning_rate": 5.665588789450782e-08, "loss": 0.0292, "step": 3156 }, { "epoch": 2.8570135746606335, "grad_norm": 1.2105098436809312, "learning_rate": 5.5946804098010455e-08, "loss": 0.0355, "step": 3157 }, { "epoch": 2.857918552036199, "grad_norm": 0.6960189699586964, "learning_rate": 5.524216063698074e-08, "loss": 0.0339, "step": 3158 }, { "epoch": 2.8588235294117648, "grad_norm": 0.9146107194882618, "learning_rate": 5.454195814427021e-08, "loss": 0.048, "step": 3159 }, { "epoch": 2.8597285067873304, "grad_norm": 0.9209805345864592, "learning_rate": 5.384619724874307e-08, "loss": 0.0378, "step": 3160 }, { "epoch": 2.860633484162896, "grad_norm": 0.8080240991964212, "learning_rate": 5.315487857527279e-08, "loss": 0.0335, "step": 3161 }, { "epoch": 2.8615384615384616, "grad_norm": 0.8549345852896387, "learning_rate": 5.246800274474439e-08, "loss": 0.0494, "step": 3162 }, { "epoch": 2.862443438914027, "grad_norm": 0.7481143783541836, "learning_rate": 5.178557037405274e-08, "loss": 0.0395, "step": 3163 }, { "epoch": 2.863348416289593, "grad_norm": 0.7832587566703945, "learning_rate": 5.1107582076100334e-08, "loss": 0.041, "step": 3164 }, { "epoch": 2.8642533936651584, "grad_norm": 0.7604712120567836, "learning_rate": 5.0434038459801213e-08, "loss": 0.0353, "step": 3165 }, { "epoch": 2.865158371040724, "grad_norm": 0.6366132605014381, "learning_rate": 4.976494013007538e-08, "loss": 0.0306, "step": 3166 }, { "epoch": 2.8660633484162896, "grad_norm": 0.6545775037365474, "learning_rate": 4.910028768785047e-08, "loss": 0.0276, "step": 3167 }, { "epoch": 2.8669683257918552, "grad_norm": 0.9186195278627522, "learning_rate": 4.8440081730062315e-08, "loss": 0.0372, "step": 3168 }, { "epoch": 2.867873303167421, "grad_norm": 0.7554653964331492, "learning_rate": 4.7784322849652175e-08, "loss": 0.0421, "step": 3169 }, { "epoch": 2.8687782805429864, "grad_norm": 0.8436287461291376, "learning_rate": 4.713301163556894e-08, "loss": 0.0468, "step": 3170 }, { "epoch": 2.869683257918552, "grad_norm": 0.8845643710671129, "learning_rate": 4.6486148672764684e-08, "loss": 0.0413, "step": 3171 }, { "epoch": 2.8705882352941177, "grad_norm": 0.7178005110482265, "learning_rate": 4.584373454219859e-08, "loss": 0.0298, "step": 3172 }, { "epoch": 2.8714932126696833, "grad_norm": 0.7324294317136776, "learning_rate": 4.5205769820833024e-08, "loss": 0.0393, "step": 3173 }, { "epoch": 2.872398190045249, "grad_norm": 0.8496555555822194, "learning_rate": 4.4572255081634655e-08, "loss": 0.0385, "step": 3174 }, { "epoch": 2.8733031674208145, "grad_norm": 1.2837499730619681, "learning_rate": 4.394319089357335e-08, "loss": 0.0443, "step": 3175 }, { "epoch": 2.87420814479638, "grad_norm": 0.728129601524244, "learning_rate": 4.331857782162219e-08, "loss": 0.0312, "step": 3176 }, { "epoch": 2.8751131221719457, "grad_norm": 0.6115239101622115, "learning_rate": 4.269841642675576e-08, "loss": 0.025, "step": 3177 }, { "epoch": 2.8760180995475113, "grad_norm": 0.7039496734512185, "learning_rate": 4.208270726595243e-08, "loss": 0.0254, "step": 3178 }, { "epoch": 2.876923076923077, "grad_norm": 0.7490276864711076, "learning_rate": 4.147145089218985e-08, "loss": 0.0357, "step": 3179 }, { "epoch": 2.8778280542986425, "grad_norm": 0.6347688013912419, "learning_rate": 4.086464785444777e-08, "loss": 0.0309, "step": 3180 }, { "epoch": 2.878733031674208, "grad_norm": 0.8588621380137473, "learning_rate": 4.026229869770581e-08, "loss": 0.0458, "step": 3181 }, { "epoch": 2.8796380090497737, "grad_norm": 0.5590946145420937, "learning_rate": 3.966440396294347e-08, "loss": 0.0202, "step": 3182 }, { "epoch": 2.8805429864253393, "grad_norm": 0.5296870437223781, "learning_rate": 3.907096418714063e-08, "loss": 0.0234, "step": 3183 }, { "epoch": 2.881447963800905, "grad_norm": 0.7180613954632119, "learning_rate": 3.848197990327429e-08, "loss": 0.0354, "step": 3184 }, { "epoch": 2.8823529411764706, "grad_norm": 0.8383418948183886, "learning_rate": 3.7897451640321326e-08, "loss": 0.043, "step": 3185 }, { "epoch": 2.883257918552036, "grad_norm": 0.654770570516088, "learning_rate": 3.731737992325624e-08, "loss": 0.0221, "step": 3186 }, { "epoch": 2.8841628959276018, "grad_norm": 0.6693432650920953, "learning_rate": 3.6741765273051176e-08, "loss": 0.0303, "step": 3187 }, { "epoch": 2.8850678733031674, "grad_norm": 0.5301832218075485, "learning_rate": 3.617060820667484e-08, "loss": 0.0236, "step": 3188 }, { "epoch": 2.885972850678733, "grad_norm": 0.6927502329231183, "learning_rate": 3.560390923709245e-08, "loss": 0.029, "step": 3189 }, { "epoch": 2.8868778280542986, "grad_norm": 0.8685061468188198, "learning_rate": 3.504166887326688e-08, "loss": 0.0527, "step": 3190 }, { "epoch": 2.887782805429864, "grad_norm": 0.9562188541481642, "learning_rate": 3.4483887620153644e-08, "loss": 0.0415, "step": 3191 }, { "epoch": 2.88868778280543, "grad_norm": 0.7482994563996461, "learning_rate": 3.393056597870703e-08, "loss": 0.0395, "step": 3192 }, { "epoch": 2.8895927601809954, "grad_norm": 0.77315372502387, "learning_rate": 3.338170444587341e-08, "loss": 0.0367, "step": 3193 }, { "epoch": 2.890497737556561, "grad_norm": 1.315262880120884, "learning_rate": 3.283730351459402e-08, "loss": 0.1, "step": 3194 }, { "epoch": 2.8914027149321266, "grad_norm": 0.8700894751477838, "learning_rate": 3.229736367380498e-08, "loss": 0.0391, "step": 3195 }, { "epoch": 2.8923076923076922, "grad_norm": 0.8385445245518364, "learning_rate": 3.1761885408435055e-08, "loss": 0.0421, "step": 3196 }, { "epoch": 2.893212669683258, "grad_norm": 0.6455939310523194, "learning_rate": 3.1230869199405656e-08, "loss": 0.0257, "step": 3197 }, { "epoch": 2.8941176470588235, "grad_norm": 0.8235031362770999, "learning_rate": 3.0704315523631956e-08, "loss": 0.0433, "step": 3198 }, { "epoch": 2.895022624434389, "grad_norm": 0.9681823977457733, "learning_rate": 3.018222485401956e-08, "loss": 0.0568, "step": 3199 }, { "epoch": 2.8959276018099547, "grad_norm": 0.6693685762149273, "learning_rate": 2.966459765946672e-08, "loss": 0.0332, "step": 3200 }, { "epoch": 2.8968325791855203, "grad_norm": 0.6358154993185421, "learning_rate": 2.915143440486379e-08, "loss": 0.0293, "step": 3201 }, { "epoch": 2.897737556561086, "grad_norm": 0.8277499505818272, "learning_rate": 2.8642735551090427e-08, "loss": 0.0256, "step": 3202 }, { "epoch": 2.8986425339366515, "grad_norm": 1.3318771162762828, "learning_rate": 2.813850155501785e-08, "loss": 0.0688, "step": 3203 }, { "epoch": 2.899547511312217, "grad_norm": 1.0821956650015914, "learning_rate": 2.7638732869506023e-08, "loss": 0.0475, "step": 3204 }, { "epoch": 2.9004524886877827, "grad_norm": 0.6774089600372997, "learning_rate": 2.714342994340646e-08, "loss": 0.0353, "step": 3205 }, { "epoch": 2.9013574660633483, "grad_norm": 0.5958195204095359, "learning_rate": 2.6652593221558888e-08, "loss": 0.0305, "step": 3206 }, { "epoch": 2.902262443438914, "grad_norm": 1.321828459881946, "learning_rate": 2.616622314479067e-08, "loss": 0.0478, "step": 3207 }, { "epoch": 2.9031674208144795, "grad_norm": 0.747184136825469, "learning_rate": 2.5684320149919617e-08, "loss": 0.0386, "step": 3208 }, { "epoch": 2.904072398190045, "grad_norm": 0.7842426123883246, "learning_rate": 2.5206884669751186e-08, "loss": 0.0394, "step": 3209 }, { "epoch": 2.9049773755656108, "grad_norm": 0.9010148857958354, "learning_rate": 2.4733917133077378e-08, "loss": 0.048, "step": 3210 }, { "epoch": 2.9058823529411764, "grad_norm": 2.479778534148413, "learning_rate": 2.426541796467785e-08, "loss": 0.1266, "step": 3211 }, { "epoch": 2.906787330316742, "grad_norm": 1.070471684492352, "learning_rate": 2.3801387585319913e-08, "loss": 0.0642, "step": 3212 }, { "epoch": 2.9076923076923076, "grad_norm": 1.034467373079982, "learning_rate": 2.3341826411756863e-08, "loss": 0.0538, "step": 3213 }, { "epoch": 2.908597285067873, "grad_norm": 0.6629959003456427, "learning_rate": 2.2886734856727987e-08, "loss": 0.0302, "step": 3214 }, { "epoch": 2.909502262443439, "grad_norm": 1.0975343502966572, "learning_rate": 2.2436113328958565e-08, "loss": 0.0656, "step": 3215 }, { "epoch": 2.9104072398190044, "grad_norm": 0.7838121248519655, "learning_rate": 2.19899622331593e-08, "loss": 0.0324, "step": 3216 }, { "epoch": 2.91131221719457, "grad_norm": 0.7188109298226211, "learning_rate": 2.154828197002523e-08, "loss": 0.033, "step": 3217 }, { "epoch": 2.9122171945701356, "grad_norm": 0.6407226408293722, "learning_rate": 2.1111072936237374e-08, "loss": 0.028, "step": 3218 }, { "epoch": 2.913122171945701, "grad_norm": 0.940887786214616, "learning_rate": 2.0678335524460526e-08, "loss": 0.0429, "step": 3219 }, { "epoch": 2.914027149321267, "grad_norm": 0.8461758106875216, "learning_rate": 2.0250070123342124e-08, "loss": 0.0333, "step": 3220 }, { "epoch": 2.9149321266968324, "grad_norm": 0.7883138994624612, "learning_rate": 1.9826277117515615e-08, "loss": 0.0286, "step": 3221 }, { "epoch": 2.915837104072398, "grad_norm": 0.7866018056930718, "learning_rate": 1.940695688759542e-08, "loss": 0.0443, "step": 3222 }, { "epoch": 2.9167420814479637, "grad_norm": 1.3527988974610679, "learning_rate": 1.8992109810180847e-08, "loss": 0.054, "step": 3223 }, { "epoch": 2.9176470588235293, "grad_norm": 0.583902144268083, "learning_rate": 1.8581736257852756e-08, "loss": 0.0287, "step": 3224 }, { "epoch": 2.918552036199095, "grad_norm": 0.6122961047816468, "learning_rate": 1.8175836599173545e-08, "loss": 0.0258, "step": 3225 }, { "epoch": 2.9194570135746605, "grad_norm": 0.8232568583210034, "learning_rate": 1.7774411198689388e-08, "loss": 0.0394, "step": 3226 }, { "epoch": 2.920361990950226, "grad_norm": 0.7758537474157885, "learning_rate": 1.737746041692634e-08, "loss": 0.0339, "step": 3227 }, { "epoch": 2.9212669683257917, "grad_norm": 0.9541998995908781, "learning_rate": 1.6984984610392553e-08, "loss": 0.0464, "step": 3228 }, { "epoch": 2.9221719457013577, "grad_norm": 0.6880754110293973, "learning_rate": 1.6596984131577732e-08, "loss": 0.0412, "step": 3229 }, { "epoch": 2.9230769230769234, "grad_norm": 0.842261716023834, "learning_rate": 1.6213459328950355e-08, "loss": 0.0516, "step": 3230 }, { "epoch": 2.923981900452489, "grad_norm": 1.220737720449956, "learning_rate": 1.5834410546960444e-08, "loss": 0.0478, "step": 3231 }, { "epoch": 2.9248868778280546, "grad_norm": 0.67557415093044, "learning_rate": 1.5459838126039017e-08, "loss": 0.0325, "step": 3232 }, { "epoch": 2.92579185520362, "grad_norm": 0.8135588801193293, "learning_rate": 1.5089742402595308e-08, "loss": 0.0382, "step": 3233 }, { "epoch": 2.926696832579186, "grad_norm": 1.0323955858861646, "learning_rate": 1.472412370901788e-08, "loss": 0.0497, "step": 3234 }, { "epoch": 2.9276018099547514, "grad_norm": 0.7006577779995979, "learning_rate": 1.4362982373675171e-08, "loss": 0.0378, "step": 3235 }, { "epoch": 2.928506787330317, "grad_norm": 0.7484896459876342, "learning_rate": 1.4006318720913848e-08, "loss": 0.037, "step": 3236 }, { "epoch": 2.9294117647058826, "grad_norm": 0.714711593338945, "learning_rate": 1.3654133071059894e-08, "loss": 0.0317, "step": 3237 }, { "epoch": 2.930316742081448, "grad_norm": 0.7452407138868669, "learning_rate": 1.3306425740416961e-08, "loss": 0.029, "step": 3238 }, { "epoch": 2.931221719457014, "grad_norm": 0.6478656762154883, "learning_rate": 1.2963197041266362e-08, "loss": 0.0269, "step": 3239 }, { "epoch": 2.9321266968325794, "grad_norm": 1.2249418698090044, "learning_rate": 1.2624447281867625e-08, "loss": 0.0585, "step": 3240 }, { "epoch": 2.933031674208145, "grad_norm": 0.8477720755381488, "learning_rate": 1.229017676645794e-08, "loss": 0.0366, "step": 3241 }, { "epoch": 2.9339366515837106, "grad_norm": 0.9990373913006311, "learning_rate": 1.1960385795250496e-08, "loss": 0.0502, "step": 3242 }, { "epoch": 2.9348416289592762, "grad_norm": 0.8109368857671516, "learning_rate": 1.163507466443614e-08, "loss": 0.0303, "step": 3243 }, { "epoch": 2.935746606334842, "grad_norm": 1.4755825710041333, "learning_rate": 1.1314243666182279e-08, "loss": 0.0838, "step": 3244 }, { "epoch": 2.9366515837104075, "grad_norm": 0.7412100339446903, "learning_rate": 1.0997893088632306e-08, "loss": 0.0391, "step": 3245 }, { "epoch": 2.937556561085973, "grad_norm": 1.123800701537486, "learning_rate": 1.0686023215906172e-08, "loss": 0.0674, "step": 3246 }, { "epoch": 2.9384615384615387, "grad_norm": 0.9042810928809615, "learning_rate": 1.0378634328099268e-08, "loss": 0.0384, "step": 3247 }, { "epoch": 2.9393665158371043, "grad_norm": 0.8010732073569782, "learning_rate": 1.0075726701282429e-08, "loss": 0.0401, "step": 3248 }, { "epoch": 2.94027149321267, "grad_norm": 0.7931682658499842, "learning_rate": 9.777300607501928e-09, "loss": 0.0326, "step": 3249 }, { "epoch": 2.9411764705882355, "grad_norm": 0.9504102013996716, "learning_rate": 9.48335631477948e-09, "loss": 0.0352, "step": 3250 }, { "epoch": 2.942081447963801, "grad_norm": 0.8931228695696115, "learning_rate": 9.193894087111132e-09, "loss": 0.0438, "step": 3251 }, { "epoch": 2.9429864253393667, "grad_norm": 1.033444481486722, "learning_rate": 8.908914184467821e-09, "loss": 0.0497, "step": 3252 }, { "epoch": 2.9438914027149323, "grad_norm": 1.2432687819162807, "learning_rate": 8.628416862794253e-09, "loss": 0.0752, "step": 3253 }, { "epoch": 2.944796380090498, "grad_norm": 0.8285703865672056, "learning_rate": 8.352402374010027e-09, "loss": 0.037, "step": 3254 }, { "epoch": 2.9457013574660635, "grad_norm": 1.191781341312269, "learning_rate": 8.080870966008513e-09, "loss": 0.0661, "step": 3255 }, { "epoch": 2.946606334841629, "grad_norm": 0.7749463296759261, "learning_rate": 7.813822882655753e-09, "loss": 0.0304, "step": 3256 }, { "epoch": 2.9475113122171948, "grad_norm": 0.7905264373854993, "learning_rate": 7.551258363792669e-09, "loss": 0.0451, "step": 3257 }, { "epoch": 2.9484162895927604, "grad_norm": 0.8007915442435302, "learning_rate": 7.293177645232296e-09, "loss": 0.0353, "step": 3258 }, { "epoch": 2.949321266968326, "grad_norm": 0.8018057795293111, "learning_rate": 7.039580958762004e-09, "loss": 0.0319, "step": 3259 }, { "epoch": 2.9502262443438916, "grad_norm": 1.2358269210467427, "learning_rate": 6.79046853214016e-09, "loss": 0.0762, "step": 3260 }, { "epoch": 2.951131221719457, "grad_norm": 0.8115070814503702, "learning_rate": 6.545840589099461e-09, "loss": 0.0461, "step": 3261 }, { "epoch": 2.952036199095023, "grad_norm": 0.9311586811922313, "learning_rate": 6.305697349344164e-09, "loss": 0.0371, "step": 3262 }, { "epoch": 2.9529411764705884, "grad_norm": 0.8379150886598652, "learning_rate": 6.070039028550634e-09, "loss": 0.0396, "step": 3263 }, { "epoch": 2.953846153846154, "grad_norm": 0.5918504435646987, "learning_rate": 5.838865838366792e-09, "loss": 0.0302, "step": 3264 }, { "epoch": 2.9547511312217196, "grad_norm": 0.7790904424047679, "learning_rate": 5.612177986414891e-09, "loss": 0.0429, "step": 3265 }, { "epoch": 2.9556561085972852, "grad_norm": 0.7732709028722244, "learning_rate": 5.389975676285408e-09, "loss": 0.036, "step": 3266 }, { "epoch": 2.956561085972851, "grad_norm": 0.6515000785642422, "learning_rate": 5.172259107542599e-09, "loss": 0.0335, "step": 3267 }, { "epoch": 2.9574660633484164, "grad_norm": 1.0723427757830237, "learning_rate": 4.959028475721161e-09, "loss": 0.063, "step": 3268 }, { "epoch": 2.958371040723982, "grad_norm": 0.6587532953792505, "learning_rate": 4.7502839723267966e-09, "loss": 0.0281, "step": 3269 }, { "epoch": 2.9592760180995477, "grad_norm": 0.7325525690077453, "learning_rate": 4.546025784837316e-09, "loss": 0.0342, "step": 3270 }, { "epoch": 2.9601809954751133, "grad_norm": 0.8924726819116777, "learning_rate": 4.346254096698754e-09, "loss": 0.0399, "step": 3271 }, { "epoch": 2.961085972850679, "grad_norm": 0.6228633049126818, "learning_rate": 4.150969087330925e-09, "loss": 0.0294, "step": 3272 }, { "epoch": 2.9619909502262445, "grad_norm": 0.9474896016482157, "learning_rate": 3.96017093212131e-09, "loss": 0.0507, "step": 3273 }, { "epoch": 2.96289592760181, "grad_norm": 0.862539123822899, "learning_rate": 3.773859802429503e-09, "loss": 0.0393, "step": 3274 }, { "epoch": 2.9638009049773757, "grad_norm": 0.8228857059144223, "learning_rate": 3.5920358655844312e-09, "loss": 0.0389, "step": 3275 }, { "epoch": 2.9647058823529413, "grad_norm": 0.876409454601174, "learning_rate": 3.41469928488547e-09, "loss": 0.0447, "step": 3276 }, { "epoch": 2.965610859728507, "grad_norm": 0.7507496441257262, "learning_rate": 3.241850219601328e-09, "loss": 0.0277, "step": 3277 }, { "epoch": 2.9665158371040725, "grad_norm": 0.7574332683776498, "learning_rate": 3.0734888249700502e-09, "loss": 0.0301, "step": 3278 }, { "epoch": 2.967420814479638, "grad_norm": 0.7000647799250475, "learning_rate": 2.9096152522006816e-09, "loss": 0.03, "step": 3279 }, { "epoch": 2.9683257918552037, "grad_norm": 0.9662813235392312, "learning_rate": 2.7502296484699374e-09, "loss": 0.0507, "step": 3280 }, { "epoch": 2.9692307692307693, "grad_norm": 0.9588123832720401, "learning_rate": 2.595332156925534e-09, "loss": 0.0432, "step": 3281 }, { "epoch": 2.970135746606335, "grad_norm": 0.6995158091429858, "learning_rate": 2.444922916682302e-09, "loss": 0.0348, "step": 3282 }, { "epoch": 2.9710407239819006, "grad_norm": 0.7337876458009699, "learning_rate": 2.299002062826072e-09, "loss": 0.0382, "step": 3283 }, { "epoch": 2.971945701357466, "grad_norm": 0.8788466239740423, "learning_rate": 2.157569726410902e-09, "loss": 0.0509, "step": 3284 }, { "epoch": 2.9728506787330318, "grad_norm": 0.7988862410948818, "learning_rate": 2.0206260344590724e-09, "loss": 0.0386, "step": 3285 }, { "epoch": 2.9737556561085974, "grad_norm": 0.6654614556737743, "learning_rate": 1.8881711099622002e-09, "loss": 0.0278, "step": 3286 }, { "epoch": 2.974660633484163, "grad_norm": 1.091119965759106, "learning_rate": 1.7602050718801278e-09, "loss": 0.064, "step": 3287 }, { "epoch": 2.9755656108597286, "grad_norm": 1.0076848953913116, "learning_rate": 1.636728035140922e-09, "loss": 0.0427, "step": 3288 }, { "epoch": 2.976470588235294, "grad_norm": 0.7624886748407835, "learning_rate": 1.5177401106419853e-09, "loss": 0.0307, "step": 3289 }, { "epoch": 2.97737556561086, "grad_norm": 1.15347828810676, "learning_rate": 1.4032414052478348e-09, "loss": 0.0565, "step": 3290 }, { "epoch": 2.9782805429864254, "grad_norm": 0.6370289977483845, "learning_rate": 1.2932320217917681e-09, "loss": 0.0278, "step": 3291 }, { "epoch": 2.979185520361991, "grad_norm": 1.0672322084630557, "learning_rate": 1.1877120590753077e-09, "loss": 0.0588, "step": 3292 }, { "epoch": 2.9800904977375566, "grad_norm": 0.9272947230607421, "learning_rate": 1.086681611867091e-09, "loss": 0.0495, "step": 3293 }, { "epoch": 2.9809954751131222, "grad_norm": 0.6611983356926571, "learning_rate": 9.901407709050902e-10, "loss": 0.0361, "step": 3294 }, { "epoch": 2.981900452488688, "grad_norm": 0.8508000186319571, "learning_rate": 8.980896228932834e-10, "loss": 0.04, "step": 3295 }, { "epoch": 2.9828054298642535, "grad_norm": 0.6477528533248964, "learning_rate": 8.105282505049828e-10, "loss": 0.0237, "step": 3296 }, { "epoch": 2.983710407239819, "grad_norm": 0.7004803848363645, "learning_rate": 7.274567323800607e-10, "loss": 0.0312, "step": 3297 }, { "epoch": 2.9846153846153847, "grad_norm": 0.7524685329063047, "learning_rate": 6.488751431266149e-10, "loss": 0.036, "step": 3298 }, { "epoch": 2.9855203619909503, "grad_norm": 0.7494158393875915, "learning_rate": 5.747835533198576e-10, "loss": 0.0281, "step": 3299 }, { "epoch": 2.986425339366516, "grad_norm": 0.6540091942681845, "learning_rate": 5.051820295032262e-10, "loss": 0.0313, "step": 3300 }, { "epoch": 2.9873303167420815, "grad_norm": 0.9814981989145577, "learning_rate": 4.400706341861627e-10, "loss": 0.0626, "step": 3301 }, { "epoch": 2.988235294117647, "grad_norm": 0.8623147221864974, "learning_rate": 3.7944942584688947e-10, "loss": 0.0471, "step": 3302 }, { "epoch": 2.9891402714932127, "grad_norm": 0.9151857759647057, "learning_rate": 3.2331845893074363e-10, "loss": 0.0481, "step": 3303 }, { "epoch": 2.9900452488687783, "grad_norm": 0.6568144881075099, "learning_rate": 2.7167778384851185e-10, "loss": 0.029, "step": 3304 }, { "epoch": 2.990950226244344, "grad_norm": 1.1910818280208544, "learning_rate": 2.2452744698087114e-10, "loss": 0.0575, "step": 3305 }, { "epoch": 2.9918552036199095, "grad_norm": 1.14408842177795, "learning_rate": 1.8186749067339305e-10, "loss": 0.0591, "step": 3306 }, { "epoch": 2.992760180995475, "grad_norm": 0.7157867072737293, "learning_rate": 1.4369795324042923e-10, "loss": 0.0347, "step": 3307 }, { "epoch": 2.9936651583710407, "grad_norm": 0.7594307119657838, "learning_rate": 1.1001886896178093e-10, "loss": 0.0339, "step": 3308 }, { "epoch": 2.9945701357466064, "grad_norm": 0.6719909756361517, "learning_rate": 8.083026808602956e-11, "loss": 0.0356, "step": 3309 }, { "epoch": 2.995475113122172, "grad_norm": 0.7818004076367445, "learning_rate": 5.613217682720606e-11, "loss": 0.0368, "step": 3310 }, { "epoch": 2.9963800904977376, "grad_norm": 0.7826337994178894, "learning_rate": 3.5924617368121584e-11, "loss": 0.0322, "step": 3311 }, { "epoch": 2.997285067873303, "grad_norm": 1.2769190584423342, "learning_rate": 2.020760785648168e-11, "loss": 0.0537, "step": 3312 }, { "epoch": 2.998190045248869, "grad_norm": 0.8337410198811518, "learning_rate": 8.981162408217004e-12, "loss": 0.0354, "step": 3313 }, { "epoch": 2.9990950226244344, "grad_norm": 0.6986470216529161, "learning_rate": 2.2452911063730597e-12, "loss": 0.0316, "step": 3314 }, { "epoch": 3.0, "grad_norm": 0.5206874543007064, "learning_rate": 0.0, "loss": 0.0234, "step": 3315 } ], "logging_steps": 1, "max_steps": 3315, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 19249332510720.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }