{ "best_metric": null, "best_model_checkpoint": null, "epoch": 4.9977228936766505, "eval_steps": 500, "global_step": 890, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.005605184795936241, "grad_norm": 5.90385013851782, "learning_rate": 8.98876404494382e-07, "loss": 0.807, "step": 1 }, { "epoch": 0.011210369591872483, "grad_norm": 6.006366750773737, "learning_rate": 1.797752808988764e-06, "loss": 0.8236, "step": 2 }, { "epoch": 0.016815554387808723, "grad_norm": 5.925343792163714, "learning_rate": 2.696629213483146e-06, "loss": 0.8109, "step": 3 }, { "epoch": 0.022420739183744966, "grad_norm": 5.527952797665692, "learning_rate": 3.595505617977528e-06, "loss": 0.7994, "step": 4 }, { "epoch": 0.028025923979681205, "grad_norm": 4.373958309383871, "learning_rate": 4.494382022471911e-06, "loss": 0.7563, "step": 5 }, { "epoch": 0.033631108775617445, "grad_norm": 2.4658304323036786, "learning_rate": 5.393258426966292e-06, "loss": 0.713, "step": 6 }, { "epoch": 0.039236293571553685, "grad_norm": 2.047561880721445, "learning_rate": 6.292134831460674e-06, "loss": 0.7011, "step": 7 }, { "epoch": 0.04484147836748993, "grad_norm": 4.026532746780989, "learning_rate": 7.191011235955056e-06, "loss": 0.7074, "step": 8 }, { "epoch": 0.05044666316342617, "grad_norm": 4.325745201316262, "learning_rate": 8.08988764044944e-06, "loss": 0.6906, "step": 9 }, { "epoch": 0.05605184795936241, "grad_norm": 3.8759783016193707, "learning_rate": 8.988764044943822e-06, "loss": 0.6496, "step": 10 }, { "epoch": 0.06165703275529865, "grad_norm": 4.428668216367442, "learning_rate": 9.887640449438202e-06, "loss": 0.6512, "step": 11 }, { "epoch": 0.06726221755123489, "grad_norm": 3.492900535526824, "learning_rate": 1.0786516853932584e-05, "loss": 0.6363, "step": 12 }, { "epoch": 0.07286740234717114, "grad_norm": 2.1273856381054657, "learning_rate": 1.1685393258426966e-05, "loss": 0.6132, "step": 13 }, { "epoch": 0.07847258714310737, "grad_norm": 2.1122501992949747, "learning_rate": 1.2584269662921348e-05, "loss": 0.5844, "step": 14 }, { "epoch": 0.08407777193904362, "grad_norm": 2.5622529515656667, "learning_rate": 1.348314606741573e-05, "loss": 0.5948, "step": 15 }, { "epoch": 0.08968295673497986, "grad_norm": 1.8958213967675108, "learning_rate": 1.4382022471910113e-05, "loss": 0.5727, "step": 16 }, { "epoch": 0.0952881415309161, "grad_norm": 0.9617068848658947, "learning_rate": 1.5280898876404495e-05, "loss": 0.5603, "step": 17 }, { "epoch": 0.10089332632685234, "grad_norm": 1.2698908465747545, "learning_rate": 1.617977528089888e-05, "loss": 0.5606, "step": 18 }, { "epoch": 0.10649851112278858, "grad_norm": 1.0006685793638639, "learning_rate": 1.707865168539326e-05, "loss": 0.5489, "step": 19 }, { "epoch": 0.11210369591872482, "grad_norm": 0.7031106693279928, "learning_rate": 1.7977528089887643e-05, "loss": 0.5355, "step": 20 }, { "epoch": 0.11770888071466107, "grad_norm": 0.9009903971329004, "learning_rate": 1.8876404494382024e-05, "loss": 0.5292, "step": 21 }, { "epoch": 0.1233140655105973, "grad_norm": 0.6766379367642102, "learning_rate": 1.9775280898876404e-05, "loss": 0.5199, "step": 22 }, { "epoch": 0.12891925030653353, "grad_norm": 0.6738985046630845, "learning_rate": 2.067415730337079e-05, "loss": 0.5161, "step": 23 }, { "epoch": 0.13452443510246978, "grad_norm": 0.6180587250422095, "learning_rate": 2.1573033707865168e-05, "loss": 0.5061, "step": 24 }, { "epoch": 0.14012961989840603, "grad_norm": 0.4834626230192744, "learning_rate": 2.2471910112359556e-05, "loss": 0.5139, "step": 25 }, { "epoch": 0.14573480469434227, "grad_norm": 0.5878637275212983, "learning_rate": 2.3370786516853933e-05, "loss": 0.5027, "step": 26 }, { "epoch": 0.15133998949027852, "grad_norm": 0.5004842186398177, "learning_rate": 2.426966292134832e-05, "loss": 0.5011, "step": 27 }, { "epoch": 0.15694517428621474, "grad_norm": 0.5254669056909826, "learning_rate": 2.5168539325842697e-05, "loss": 0.4909, "step": 28 }, { "epoch": 0.16255035908215099, "grad_norm": 0.4765392239552605, "learning_rate": 2.606741573033708e-05, "loss": 0.4984, "step": 29 }, { "epoch": 0.16815554387808723, "grad_norm": 0.46499584686266865, "learning_rate": 2.696629213483146e-05, "loss": 0.4831, "step": 30 }, { "epoch": 0.17376072867402348, "grad_norm": 0.4681038310213469, "learning_rate": 2.7865168539325845e-05, "loss": 0.4868, "step": 31 }, { "epoch": 0.17936591346995973, "grad_norm": 0.5306865190068607, "learning_rate": 2.8764044943820226e-05, "loss": 0.4799, "step": 32 }, { "epoch": 0.18497109826589594, "grad_norm": 0.5310145854714095, "learning_rate": 2.966292134831461e-05, "loss": 0.4773, "step": 33 }, { "epoch": 0.1905762830618322, "grad_norm": 0.761548939341114, "learning_rate": 3.056179775280899e-05, "loss": 0.4812, "step": 34 }, { "epoch": 0.19618146785776844, "grad_norm": 1.200005749726904, "learning_rate": 3.1460674157303374e-05, "loss": 0.4911, "step": 35 }, { "epoch": 0.20178665265370468, "grad_norm": 0.8174168347705099, "learning_rate": 3.235955056179776e-05, "loss": 0.467, "step": 36 }, { "epoch": 0.20739183744964093, "grad_norm": 0.4081677644510946, "learning_rate": 3.325842696629214e-05, "loss": 0.4635, "step": 37 }, { "epoch": 0.21299702224557715, "grad_norm": 0.6329076432258968, "learning_rate": 3.415730337078652e-05, "loss": 0.4709, "step": 38 }, { "epoch": 0.2186022070415134, "grad_norm": 1.0112197078534875, "learning_rate": 3.50561797752809e-05, "loss": 0.4696, "step": 39 }, { "epoch": 0.22420739183744964, "grad_norm": 0.9221867209391528, "learning_rate": 3.5955056179775286e-05, "loss": 0.4611, "step": 40 }, { "epoch": 0.2298125766333859, "grad_norm": 0.84837092144476, "learning_rate": 3.685393258426967e-05, "loss": 0.474, "step": 41 }, { "epoch": 0.23541776142932214, "grad_norm": 1.1853308215342242, "learning_rate": 3.775280898876405e-05, "loss": 0.4678, "step": 42 }, { "epoch": 0.24102294622525836, "grad_norm": 0.5210063275074145, "learning_rate": 3.865168539325843e-05, "loss": 0.4562, "step": 43 }, { "epoch": 0.2466281310211946, "grad_norm": 0.908397332328363, "learning_rate": 3.955056179775281e-05, "loss": 0.4674, "step": 44 }, { "epoch": 0.2522333158171308, "grad_norm": 1.1080166716443598, "learning_rate": 4.04494382022472e-05, "loss": 0.4508, "step": 45 }, { "epoch": 0.25783850061306707, "grad_norm": 0.45360566647654976, "learning_rate": 4.134831460674158e-05, "loss": 0.4509, "step": 46 }, { "epoch": 0.2634436854090033, "grad_norm": 0.9861729575846025, "learning_rate": 4.224719101123595e-05, "loss": 0.4551, "step": 47 }, { "epoch": 0.26904887020493956, "grad_norm": 0.46277631804738467, "learning_rate": 4.3146067415730337e-05, "loss": 0.446, "step": 48 }, { "epoch": 0.2746540550008758, "grad_norm": 0.7678134928965062, "learning_rate": 4.404494382022472e-05, "loss": 0.4518, "step": 49 }, { "epoch": 0.28025923979681205, "grad_norm": 0.6470876222514552, "learning_rate": 4.494382022471911e-05, "loss": 0.4545, "step": 50 }, { "epoch": 0.2858644245927483, "grad_norm": 0.6677697699594126, "learning_rate": 4.584269662921348e-05, "loss": 0.4542, "step": 51 }, { "epoch": 0.29146960938868455, "grad_norm": 0.9005318621831405, "learning_rate": 4.6741573033707865e-05, "loss": 0.4472, "step": 52 }, { "epoch": 0.2970747941846208, "grad_norm": 1.0135754268066979, "learning_rate": 4.764044943820225e-05, "loss": 0.4449, "step": 53 }, { "epoch": 0.30267997898055704, "grad_norm": 1.3120984053765912, "learning_rate": 4.853932584269664e-05, "loss": 0.4514, "step": 54 }, { "epoch": 0.30828516377649323, "grad_norm": 0.9125344012381023, "learning_rate": 4.943820224719101e-05, "loss": 0.448, "step": 55 }, { "epoch": 0.3138903485724295, "grad_norm": 1.5065061920138354, "learning_rate": 5.0337078651685394e-05, "loss": 0.4513, "step": 56 }, { "epoch": 0.3194955333683657, "grad_norm": 0.7122548779299299, "learning_rate": 5.123595505617978e-05, "loss": 0.4447, "step": 57 }, { "epoch": 0.32510071816430197, "grad_norm": 1.3540801647141738, "learning_rate": 5.213483146067416e-05, "loss": 0.4556, "step": 58 }, { "epoch": 0.3307059029602382, "grad_norm": 0.8323407754065357, "learning_rate": 5.303370786516854e-05, "loss": 0.4345, "step": 59 }, { "epoch": 0.33631108775617446, "grad_norm": 1.116821907282998, "learning_rate": 5.393258426966292e-05, "loss": 0.4491, "step": 60 }, { "epoch": 0.3419162725521107, "grad_norm": 1.0824014046607489, "learning_rate": 5.4831460674157306e-05, "loss": 0.4472, "step": 61 }, { "epoch": 0.34752145734804696, "grad_norm": 0.9265407958585422, "learning_rate": 5.573033707865169e-05, "loss": 0.4362, "step": 62 }, { "epoch": 0.3531266421439832, "grad_norm": 0.8981510798852507, "learning_rate": 5.662921348314607e-05, "loss": 0.453, "step": 63 }, { "epoch": 0.35873182693991945, "grad_norm": 0.9858658036066037, "learning_rate": 5.752808988764045e-05, "loss": 0.4428, "step": 64 }, { "epoch": 0.36433701173585564, "grad_norm": 1.2513865798978006, "learning_rate": 5.8426966292134835e-05, "loss": 0.4428, "step": 65 }, { "epoch": 0.3699421965317919, "grad_norm": 0.9397639573210418, "learning_rate": 5.932584269662922e-05, "loss": 0.4324, "step": 66 }, { "epoch": 0.37554738132772814, "grad_norm": 1.4017380301394493, "learning_rate": 6.0224719101123596e-05, "loss": 0.45, "step": 67 }, { "epoch": 0.3811525661236644, "grad_norm": 1.0777658678202968, "learning_rate": 6.112359550561798e-05, "loss": 0.4401, "step": 68 }, { "epoch": 0.38675775091960063, "grad_norm": 0.912558239818085, "learning_rate": 6.202247191011237e-05, "loss": 0.4351, "step": 69 }, { "epoch": 0.3923629357155369, "grad_norm": 1.2273797098955324, "learning_rate": 6.292134831460675e-05, "loss": 0.4516, "step": 70 }, { "epoch": 0.3979681205114731, "grad_norm": 1.1806852980385731, "learning_rate": 6.382022471910112e-05, "loss": 0.4442, "step": 71 }, { "epoch": 0.40357330530740937, "grad_norm": 1.1616577982500746, "learning_rate": 6.471910112359552e-05, "loss": 0.4342, "step": 72 }, { "epoch": 0.4091784901033456, "grad_norm": 0.8390636237723854, "learning_rate": 6.561797752808989e-05, "loss": 0.427, "step": 73 }, { "epoch": 0.41478367489928186, "grad_norm": 0.8499496288682458, "learning_rate": 6.651685393258428e-05, "loss": 0.4379, "step": 74 }, { "epoch": 0.42038885969521805, "grad_norm": 1.17937338360059, "learning_rate": 6.741573033707866e-05, "loss": 0.4291, "step": 75 }, { "epoch": 0.4259940444911543, "grad_norm": 0.7821897701029938, "learning_rate": 6.831460674157304e-05, "loss": 0.4351, "step": 76 }, { "epoch": 0.43159922928709055, "grad_norm": 0.8440055305481178, "learning_rate": 6.921348314606743e-05, "loss": 0.4304, "step": 77 }, { "epoch": 0.4372044140830268, "grad_norm": 0.7526712919502756, "learning_rate": 7.01123595505618e-05, "loss": 0.4366, "step": 78 }, { "epoch": 0.44280959887896304, "grad_norm": 1.1114429881926073, "learning_rate": 7.101123595505618e-05, "loss": 0.4326, "step": 79 }, { "epoch": 0.4484147836748993, "grad_norm": 0.7746430179490161, "learning_rate": 7.191011235955057e-05, "loss": 0.4336, "step": 80 }, { "epoch": 0.45401996847083553, "grad_norm": 0.8505120829834041, "learning_rate": 7.280898876404495e-05, "loss": 0.4352, "step": 81 }, { "epoch": 0.4596251532667718, "grad_norm": 1.4415361142111385, "learning_rate": 7.370786516853934e-05, "loss": 0.4385, "step": 82 }, { "epoch": 0.465230338062708, "grad_norm": 0.7475789995240804, "learning_rate": 7.46067415730337e-05, "loss": 0.431, "step": 83 }, { "epoch": 0.4708355228586443, "grad_norm": 0.9854738368310488, "learning_rate": 7.55056179775281e-05, "loss": 0.4327, "step": 84 }, { "epoch": 0.47644070765458046, "grad_norm": 1.4375947776402878, "learning_rate": 7.640449438202248e-05, "loss": 0.4355, "step": 85 }, { "epoch": 0.4820458924505167, "grad_norm": 0.7290469112827799, "learning_rate": 7.730337078651686e-05, "loss": 0.4335, "step": 86 }, { "epoch": 0.48765107724645296, "grad_norm": 1.1019633173104773, "learning_rate": 7.820224719101124e-05, "loss": 0.4303, "step": 87 }, { "epoch": 0.4932562620423892, "grad_norm": 0.9810209491585931, "learning_rate": 7.910112359550562e-05, "loss": 0.4296, "step": 88 }, { "epoch": 0.49886144683832545, "grad_norm": 0.8966074291671375, "learning_rate": 8e-05, "loss": 0.4338, "step": 89 }, { "epoch": 0.5044666316342616, "grad_norm": 1.0809659461252454, "learning_rate": 7.999969234487637e-05, "loss": 0.4323, "step": 90 }, { "epoch": 0.5100718164301979, "grad_norm": 1.44533907238761, "learning_rate": 7.999876938423802e-05, "loss": 0.4436, "step": 91 }, { "epoch": 0.5156770012261341, "grad_norm": 0.7701781299751447, "learning_rate": 7.999723113228264e-05, "loss": 0.4406, "step": 92 }, { "epoch": 0.5212821860220704, "grad_norm": 0.9804243571239605, "learning_rate": 7.999507761267278e-05, "loss": 0.4245, "step": 93 }, { "epoch": 0.5268873708180066, "grad_norm": 1.218580909562173, "learning_rate": 7.999230885853554e-05, "loss": 0.444, "step": 94 }, { "epoch": 0.5324925556139429, "grad_norm": 0.6868435592682877, "learning_rate": 7.998892491246195e-05, "loss": 0.4316, "step": 95 }, { "epoch": 0.5380977404098791, "grad_norm": 0.7857929091209908, "learning_rate": 7.998492582650644e-05, "loss": 0.4292, "step": 96 }, { "epoch": 0.5437029252058154, "grad_norm": 0.960028166874925, "learning_rate": 7.998031166218598e-05, "loss": 0.434, "step": 97 }, { "epoch": 0.5493081100017516, "grad_norm": 0.7041102432235921, "learning_rate": 7.997508249047913e-05, "loss": 0.4215, "step": 98 }, { "epoch": 0.5549132947976878, "grad_norm": 0.7044014441304309, "learning_rate": 7.996923839182498e-05, "loss": 0.425, "step": 99 }, { "epoch": 0.5605184795936241, "grad_norm": 0.6385459114746951, "learning_rate": 7.996277945612184e-05, "loss": 0.42, "step": 100 }, { "epoch": 0.5661236643895603, "grad_norm": 0.7731522061106563, "learning_rate": 7.995570578272598e-05, "loss": 0.4253, "step": 101 }, { "epoch": 0.5717288491854966, "grad_norm": 0.7332716664705065, "learning_rate": 7.994801748044995e-05, "loss": 0.4313, "step": 102 }, { "epoch": 0.5773340339814328, "grad_norm": 0.6776804568843835, "learning_rate": 7.993971466756107e-05, "loss": 0.4188, "step": 103 }, { "epoch": 0.5829392187773691, "grad_norm": 0.5922393199265042, "learning_rate": 7.993079747177948e-05, "loss": 0.4184, "step": 104 }, { "epoch": 0.5885444035733053, "grad_norm": 0.398394656224325, "learning_rate": 7.99212660302762e-05, "loss": 0.4155, "step": 105 }, { "epoch": 0.5941495883692416, "grad_norm": 0.5350176373169183, "learning_rate": 7.991112048967111e-05, "loss": 0.4157, "step": 106 }, { "epoch": 0.5997547731651778, "grad_norm": 0.38641904379334474, "learning_rate": 7.990036100603055e-05, "loss": 0.4119, "step": 107 }, { "epoch": 0.6053599579611141, "grad_norm": 0.5283945082650043, "learning_rate": 7.988898774486507e-05, "loss": 0.4129, "step": 108 }, { "epoch": 0.6109651427570503, "grad_norm": 0.5488840852639991, "learning_rate": 7.987700088112675e-05, "loss": 0.4224, "step": 109 }, { "epoch": 0.6165703275529865, "grad_norm": 0.5785726461047852, "learning_rate": 7.986440059920659e-05, "loss": 0.4077, "step": 110 }, { "epoch": 0.6221755123489228, "grad_norm": 0.5777052205377616, "learning_rate": 7.985118709293167e-05, "loss": 0.4166, "step": 111 }, { "epoch": 0.627780697144859, "grad_norm": 0.596491662412661, "learning_rate": 7.983736056556212e-05, "loss": 0.4132, "step": 112 }, { "epoch": 0.6333858819407953, "grad_norm": 0.6957057882031554, "learning_rate": 7.982292122978806e-05, "loss": 0.4178, "step": 113 }, { "epoch": 0.6389910667367315, "grad_norm": 0.6847230422083609, "learning_rate": 7.980786930772624e-05, "loss": 0.4118, "step": 114 }, { "epoch": 0.6445962515326678, "grad_norm": 0.6837460304512333, "learning_rate": 7.979220503091673e-05, "loss": 0.4147, "step": 115 }, { "epoch": 0.6502014363286039, "grad_norm": 0.6798597739557275, "learning_rate": 7.977592864031929e-05, "loss": 0.4171, "step": 116 }, { "epoch": 0.6558066211245402, "grad_norm": 0.548015114928552, "learning_rate": 7.975904038630963e-05, "loss": 0.4117, "step": 117 }, { "epoch": 0.6614118059204764, "grad_norm": 0.5642042657981582, "learning_rate": 7.974154052867569e-05, "loss": 0.4126, "step": 118 }, { "epoch": 0.6670169907164126, "grad_norm": 0.7655295571497013, "learning_rate": 7.97234293366135e-05, "loss": 0.4154, "step": 119 }, { "epoch": 0.6726221755123489, "grad_norm": 0.874608237347998, "learning_rate": 7.970470708872308e-05, "loss": 0.4236, "step": 120 }, { "epoch": 0.6782273603082851, "grad_norm": 0.8310771349721764, "learning_rate": 7.968537407300423e-05, "loss": 0.421, "step": 121 }, { "epoch": 0.6838325451042214, "grad_norm": 0.6553657946996332, "learning_rate": 7.966543058685203e-05, "loss": 0.4035, "step": 122 }, { "epoch": 0.6894377299001576, "grad_norm": 0.5023716536308365, "learning_rate": 7.964487693705224e-05, "loss": 0.416, "step": 123 }, { "epoch": 0.6950429146960939, "grad_norm": 0.4876124028877786, "learning_rate": 7.962371343977664e-05, "loss": 0.4116, "step": 124 }, { "epoch": 0.7006480994920301, "grad_norm": 0.4811788654442628, "learning_rate": 7.960194042057817e-05, "loss": 0.4181, "step": 125 }, { "epoch": 0.7062532842879664, "grad_norm": 0.7564883843961022, "learning_rate": 7.957955821438588e-05, "loss": 0.4061, "step": 126 }, { "epoch": 0.7118584690839026, "grad_norm": 0.6430241009529823, "learning_rate": 7.955656716549977e-05, "loss": 0.4099, "step": 127 }, { "epoch": 0.7174636538798389, "grad_norm": 0.5578593223115135, "learning_rate": 7.953296762758556e-05, "loss": 0.422, "step": 128 }, { "epoch": 0.7230688386757751, "grad_norm": 0.6294882800949052, "learning_rate": 7.950875996366916e-05, "loss": 0.4195, "step": 129 }, { "epoch": 0.7286740234717113, "grad_norm": 0.5769470393949646, "learning_rate": 7.948394454613117e-05, "loss": 0.4057, "step": 130 }, { "epoch": 0.7342792082676476, "grad_norm": 0.6608310787451351, "learning_rate": 7.945852175670113e-05, "loss": 0.4117, "step": 131 }, { "epoch": 0.7398843930635838, "grad_norm": 0.6476166241559258, "learning_rate": 7.943249198645159e-05, "loss": 0.4115, "step": 132 }, { "epoch": 0.7454895778595201, "grad_norm": 0.41750277808352504, "learning_rate": 7.940585563579216e-05, "loss": 0.4187, "step": 133 }, { "epoch": 0.7510947626554563, "grad_norm": 0.5395404061131682, "learning_rate": 7.937861311446334e-05, "loss": 0.4097, "step": 134 }, { "epoch": 0.7566999474513926, "grad_norm": 0.5858318930064079, "learning_rate": 7.935076484153019e-05, "loss": 0.4003, "step": 135 }, { "epoch": 0.7623051322473288, "grad_norm": 0.5173078099977891, "learning_rate": 7.932231124537589e-05, "loss": 0.4056, "step": 136 }, { "epoch": 0.7679103170432651, "grad_norm": 0.4498691054115375, "learning_rate": 7.929325276369519e-05, "loss": 0.4066, "step": 137 }, { "epoch": 0.7735155018392013, "grad_norm": 0.42467263261017896, "learning_rate": 7.92635898434876e-05, "loss": 0.4064, "step": 138 }, { "epoch": 0.7791206866351374, "grad_norm": 0.37958937321762776, "learning_rate": 7.923332294105063e-05, "loss": 0.4034, "step": 139 }, { "epoch": 0.7847258714310738, "grad_norm": 0.3468484137692954, "learning_rate": 7.920245252197263e-05, "loss": 0.4039, "step": 140 }, { "epoch": 0.7903310562270099, "grad_norm": 0.4169469149627637, "learning_rate": 7.917097906112574e-05, "loss": 0.4087, "step": 141 }, { "epoch": 0.7959362410229462, "grad_norm": 0.41834364109362276, "learning_rate": 7.913890304265853e-05, "loss": 0.405, "step": 142 }, { "epoch": 0.8015414258188824, "grad_norm": 0.3513985390988816, "learning_rate": 7.910622495998858e-05, "loss": 0.4006, "step": 143 }, { "epoch": 0.8071466106148187, "grad_norm": 0.30562923751430504, "learning_rate": 7.907294531579487e-05, "loss": 0.399, "step": 144 }, { "epoch": 0.8127517954107549, "grad_norm": 0.25016597293101506, "learning_rate": 7.903906462201004e-05, "loss": 0.404, "step": 145 }, { "epoch": 0.8183569802066912, "grad_norm": 0.330583944503076, "learning_rate": 7.900458339981254e-05, "loss": 0.4001, "step": 146 }, { "epoch": 0.8239621650026274, "grad_norm": 0.4232095940752348, "learning_rate": 7.896950217961862e-05, "loss": 0.4058, "step": 147 }, { "epoch": 0.8295673497985637, "grad_norm": 0.4726435967002529, "learning_rate": 7.893382150107413e-05, "loss": 0.3979, "step": 148 }, { "epoch": 0.8351725345944999, "grad_norm": 0.5224683089251781, "learning_rate": 7.889754191304624e-05, "loss": 0.4016, "step": 149 }, { "epoch": 0.8407777193904361, "grad_norm": 0.6703932689764993, "learning_rate": 7.886066397361502e-05, "loss": 0.4019, "step": 150 }, { "epoch": 0.8463829041863724, "grad_norm": 0.7976450116603597, "learning_rate": 7.882318825006482e-05, "loss": 0.4042, "step": 151 }, { "epoch": 0.8519880889823086, "grad_norm": 0.8440592541875934, "learning_rate": 7.878511531887553e-05, "loss": 0.405, "step": 152 }, { "epoch": 0.8575932737782449, "grad_norm": 0.7574577755866619, "learning_rate": 7.874644576571382e-05, "loss": 0.4141, "step": 153 }, { "epoch": 0.8631984585741811, "grad_norm": 0.6868823675156073, "learning_rate": 7.870718018542394e-05, "loss": 0.4085, "step": 154 }, { "epoch": 0.8688036433701174, "grad_norm": 0.7949981417283204, "learning_rate": 7.866731918201877e-05, "loss": 0.4123, "step": 155 }, { "epoch": 0.8744088281660536, "grad_norm": 0.7652258936220558, "learning_rate": 7.862686336867042e-05, "loss": 0.4074, "step": 156 }, { "epoch": 0.8800140129619899, "grad_norm": 0.5492331474373146, "learning_rate": 7.858581336770078e-05, "loss": 0.412, "step": 157 }, { "epoch": 0.8856191977579261, "grad_norm": 0.4985373864867304, "learning_rate": 7.854416981057202e-05, "loss": 0.4001, "step": 158 }, { "epoch": 0.8912243825538623, "grad_norm": 0.6843980488570995, "learning_rate": 7.850193333787679e-05, "loss": 0.3962, "step": 159 }, { "epoch": 0.8968295673497986, "grad_norm": 0.3471378020904331, "learning_rate": 7.845910459932851e-05, "loss": 0.3988, "step": 160 }, { "epoch": 0.9024347521457348, "grad_norm": 0.5626872736911909, "learning_rate": 7.841568425375118e-05, "loss": 0.3996, "step": 161 }, { "epoch": 0.9080399369416711, "grad_norm": 0.7412359084209332, "learning_rate": 7.83716729690694e-05, "loss": 0.3996, "step": 162 }, { "epoch": 0.9136451217376073, "grad_norm": 0.3955248842527265, "learning_rate": 7.832707142229803e-05, "loss": 0.4003, "step": 163 }, { "epoch": 0.9192503065335436, "grad_norm": 0.47855528089224836, "learning_rate": 7.828188029953179e-05, "loss": 0.4002, "step": 164 }, { "epoch": 0.9248554913294798, "grad_norm": 0.5159440067301492, "learning_rate": 7.823610029593471e-05, "loss": 0.3962, "step": 165 }, { "epoch": 0.930460676125416, "grad_norm": 0.3576812605070119, "learning_rate": 7.818973211572943e-05, "loss": 0.393, "step": 166 }, { "epoch": 0.9360658609213522, "grad_norm": 0.35348537654761086, "learning_rate": 7.814277647218634e-05, "loss": 0.4037, "step": 167 }, { "epoch": 0.9416710457172885, "grad_norm": 0.39413342377875193, "learning_rate": 7.809523408761266e-05, "loss": 0.3942, "step": 168 }, { "epoch": 0.9472762305132247, "grad_norm": 0.3951438373093825, "learning_rate": 7.80471056933413e-05, "loss": 0.4012, "step": 169 }, { "epoch": 0.9528814153091609, "grad_norm": 0.45433061259053, "learning_rate": 7.799839202971963e-05, "loss": 0.3982, "step": 170 }, { "epoch": 0.9584866001050972, "grad_norm": 0.519370105848961, "learning_rate": 7.794909384609807e-05, "loss": 0.3994, "step": 171 }, { "epoch": 0.9640917849010334, "grad_norm": 0.5495567414849357, "learning_rate": 7.789921190081851e-05, "loss": 0.3979, "step": 172 }, { "epoch": 0.9696969696969697, "grad_norm": 0.6394859589748585, "learning_rate": 7.784874696120279e-05, "loss": 0.3959, "step": 173 }, { "epoch": 0.9753021544929059, "grad_norm": 0.7704242402793997, "learning_rate": 7.779769980354077e-05, "loss": 0.4027, "step": 174 }, { "epoch": 0.9809073392888422, "grad_norm": 0.9985269827974932, "learning_rate": 7.774607121307841e-05, "loss": 0.405, "step": 175 }, { "epoch": 0.9865125240847784, "grad_norm": 0.868487642699029, "learning_rate": 7.769386198400576e-05, "loss": 0.3957, "step": 176 }, { "epoch": 0.9921177088807147, "grad_norm": 0.40924003925560676, "learning_rate": 7.764107291944464e-05, "loss": 0.3905, "step": 177 }, { "epoch": 0.9977228936766509, "grad_norm": 0.4918744419515202, "learning_rate": 7.758770483143634e-05, "loss": 0.389, "step": 178 }, { "epoch": 1.0052548607461902, "grad_norm": 1.487265905041594, "learning_rate": 7.753375854092918e-05, "loss": 0.7519, "step": 179 }, { "epoch": 1.0108600455421264, "grad_norm": 1.0599518409785367, "learning_rate": 7.747923487776579e-05, "loss": 0.3924, "step": 180 }, { "epoch": 1.0164652303380628, "grad_norm": 0.6685215482157915, "learning_rate": 7.742413468067038e-05, "loss": 0.3886, "step": 181 }, { "epoch": 1.022070415133999, "grad_norm": 0.6798972273258548, "learning_rate": 7.736845879723585e-05, "loss": 0.3925, "step": 182 }, { "epoch": 1.0276755999299352, "grad_norm": 0.5898524978289069, "learning_rate": 7.731220808391072e-05, "loss": 0.3799, "step": 183 }, { "epoch": 1.0332807847258714, "grad_norm": 0.5305286635410624, "learning_rate": 7.725538340598603e-05, "loss": 0.3858, "step": 184 }, { "epoch": 1.0388859695218078, "grad_norm": 0.5290381001929488, "learning_rate": 7.719798563758193e-05, "loss": 0.3792, "step": 185 }, { "epoch": 1.044491154317744, "grad_norm": 0.42976752812289126, "learning_rate": 7.71400156616343e-05, "loss": 0.378, "step": 186 }, { "epoch": 1.0500963391136802, "grad_norm": 0.4255688619941376, "learning_rate": 7.708147436988112e-05, "loss": 0.3838, "step": 187 }, { "epoch": 1.0557015239096164, "grad_norm": 0.3844923007802862, "learning_rate": 7.702236266284886e-05, "loss": 0.3838, "step": 188 }, { "epoch": 1.0613067087055525, "grad_norm": 0.4493904358687705, "learning_rate": 7.696268144983844e-05, "loss": 0.3773, "step": 189 }, { "epoch": 1.066911893501489, "grad_norm": 0.43703395699798525, "learning_rate": 7.690243164891146e-05, "loss": 0.3789, "step": 190 }, { "epoch": 1.0725170782974252, "grad_norm": 0.3129550094380894, "learning_rate": 7.684161418687588e-05, "loss": 0.3841, "step": 191 }, { "epoch": 1.0781222630933613, "grad_norm": 0.3196436601574946, "learning_rate": 7.678022999927191e-05, "loss": 0.375, "step": 192 }, { "epoch": 1.0837274478892975, "grad_norm": 0.32439557065583957, "learning_rate": 7.671828003035754e-05, "loss": 0.3808, "step": 193 }, { "epoch": 1.0893326326852337, "grad_norm": 0.3551997178098723, "learning_rate": 7.665576523309402e-05, "loss": 0.3808, "step": 194 }, { "epoch": 1.0949378174811701, "grad_norm": 0.28222901059062117, "learning_rate": 7.659268656913125e-05, "loss": 0.3755, "step": 195 }, { "epoch": 1.1005430022771063, "grad_norm": 0.24413884185384668, "learning_rate": 7.652904500879294e-05, "loss": 0.3771, "step": 196 }, { "epoch": 1.1061481870730425, "grad_norm": 0.3258543915444851, "learning_rate": 7.646484153106168e-05, "loss": 0.3819, "step": 197 }, { "epoch": 1.1117533718689787, "grad_norm": 0.30494792365590756, "learning_rate": 7.640007712356394e-05, "loss": 0.3739, "step": 198 }, { "epoch": 1.1173585566649151, "grad_norm": 0.3416322895952249, "learning_rate": 7.633475278255477e-05, "loss": 0.3729, "step": 199 }, { "epoch": 1.1229637414608513, "grad_norm": 0.33058200454972514, "learning_rate": 7.626886951290262e-05, "loss": 0.3778, "step": 200 }, { "epoch": 1.1285689262567875, "grad_norm": 0.24553286849423914, "learning_rate": 7.620242832807375e-05, "loss": 0.3815, "step": 201 }, { "epoch": 1.1341741110527237, "grad_norm": 0.3094016267852568, "learning_rate": 7.61354302501167e-05, "loss": 0.3739, "step": 202 }, { "epoch": 1.1397792958486601, "grad_norm": 0.3110189115438822, "learning_rate": 7.606787630964658e-05, "loss": 0.3744, "step": 203 }, { "epoch": 1.1453844806445963, "grad_norm": 0.3105648524438776, "learning_rate": 7.599976754582917e-05, "loss": 0.3733, "step": 204 }, { "epoch": 1.1509896654405325, "grad_norm": 0.38377775896842775, "learning_rate": 7.593110500636499e-05, "loss": 0.3777, "step": 205 }, { "epoch": 1.1565948502364687, "grad_norm": 0.5289840818037539, "learning_rate": 7.586188974747315e-05, "loss": 0.3748, "step": 206 }, { "epoch": 1.1622000350324049, "grad_norm": 0.6002663848970253, "learning_rate": 7.579212283387508e-05, "loss": 0.376, "step": 207 }, { "epoch": 1.1678052198283413, "grad_norm": 0.543546869431326, "learning_rate": 7.57218053387782e-05, "loss": 0.3818, "step": 208 }, { "epoch": 1.1734104046242775, "grad_norm": 0.38254861936914103, "learning_rate": 7.565093834385944e-05, "loss": 0.3733, "step": 209 }, { "epoch": 1.1790155894202137, "grad_norm": 0.25789361293808194, "learning_rate": 7.557952293924843e-05, "loss": 0.3741, "step": 210 }, { "epoch": 1.1846207742161499, "grad_norm": 0.3509813225234253, "learning_rate": 7.550756022351098e-05, "loss": 0.3766, "step": 211 }, { "epoch": 1.1902259590120863, "grad_norm": 0.41381875355849096, "learning_rate": 7.5435051303632e-05, "loss": 0.3771, "step": 212 }, { "epoch": 1.1958311438080225, "grad_norm": 0.31160966900292714, "learning_rate": 7.53619972949985e-05, "loss": 0.37, "step": 213 }, { "epoch": 1.2014363286039587, "grad_norm": 0.25377869542538456, "learning_rate": 7.528839932138248e-05, "loss": 0.3742, "step": 214 }, { "epoch": 1.2070415133998948, "grad_norm": 0.3670686486051864, "learning_rate": 7.521425851492366e-05, "loss": 0.3741, "step": 215 }, { "epoch": 1.2126466981958313, "grad_norm": 0.35278618781458415, "learning_rate": 7.513957601611196e-05, "loss": 0.3689, "step": 216 }, { "epoch": 1.2182518829917675, "grad_norm": 0.27734720069224306, "learning_rate": 7.506435297377006e-05, "loss": 0.3709, "step": 217 }, { "epoch": 1.2238570677877036, "grad_norm": 0.3278146817806966, "learning_rate": 7.498859054503568e-05, "loss": 0.3758, "step": 218 }, { "epoch": 1.2294622525836398, "grad_norm": 0.4148552525962653, "learning_rate": 7.491228989534378e-05, "loss": 0.3722, "step": 219 }, { "epoch": 1.235067437379576, "grad_norm": 0.4473202149669945, "learning_rate": 7.483545219840865e-05, "loss": 0.3754, "step": 220 }, { "epoch": 1.2406726221755124, "grad_norm": 0.44427273810379947, "learning_rate": 7.475807863620587e-05, "loss": 0.3762, "step": 221 }, { "epoch": 1.2462778069714486, "grad_norm": 0.49016712568602555, "learning_rate": 7.468017039895404e-05, "loss": 0.3761, "step": 222 }, { "epoch": 1.2518829917673848, "grad_norm": 0.47873415753545473, "learning_rate": 7.460172868509664e-05, "loss": 0.3734, "step": 223 }, { "epoch": 1.257488176563321, "grad_norm": 0.43672364622786203, "learning_rate": 7.452275470128338e-05, "loss": 0.3721, "step": 224 }, { "epoch": 1.2630933613592572, "grad_norm": 0.43990010868929574, "learning_rate": 7.444324966235179e-05, "loss": 0.374, "step": 225 }, { "epoch": 1.2686985461551936, "grad_norm": 0.4713815408143108, "learning_rate": 7.436321479130855e-05, "loss": 0.3713, "step": 226 }, { "epoch": 1.2743037309511298, "grad_norm": 0.45464514691112873, "learning_rate": 7.428265131931053e-05, "loss": 0.3706, "step": 227 }, { "epoch": 1.279908915747066, "grad_norm": 0.510024464970406, "learning_rate": 7.420156048564599e-05, "loss": 0.3741, "step": 228 }, { "epoch": 1.2855141005430024, "grad_norm": 0.5372593621720837, "learning_rate": 7.411994353771542e-05, "loss": 0.3696, "step": 229 }, { "epoch": 1.2911192853389384, "grad_norm": 0.4219487710133918, "learning_rate": 7.40378017310125e-05, "loss": 0.3711, "step": 230 }, { "epoch": 1.2967244701348748, "grad_norm": 0.2523356442192507, "learning_rate": 7.395513632910455e-05, "loss": 0.371, "step": 231 }, { "epoch": 1.302329654930811, "grad_norm": 0.3232824061176527, "learning_rate": 7.38719486036133e-05, "loss": 0.3755, "step": 232 }, { "epoch": 1.3079348397267472, "grad_norm": 0.3600749335990331, "learning_rate": 7.378823983419529e-05, "loss": 0.373, "step": 233 }, { "epoch": 1.3135400245226836, "grad_norm": 0.31629844859755163, "learning_rate": 7.370401130852207e-05, "loss": 0.3734, "step": 234 }, { "epoch": 1.3191452093186198, "grad_norm": 0.30569567935552483, "learning_rate": 7.361926432226053e-05, "loss": 0.377, "step": 235 }, { "epoch": 1.324750394114556, "grad_norm": 0.2484814268194845, "learning_rate": 7.35340001790529e-05, "loss": 0.3711, "step": 236 }, { "epoch": 1.3303555789104922, "grad_norm": 0.2481897952153626, "learning_rate": 7.34482201904967e-05, "loss": 0.3769, "step": 237 }, { "epoch": 1.3359607637064284, "grad_norm": 0.301422303142377, "learning_rate": 7.336192567612458e-05, "loss": 0.3746, "step": 238 }, { "epoch": 1.3415659485023648, "grad_norm": 0.23838306194654293, "learning_rate": 7.327511796338402e-05, "loss": 0.3776, "step": 239 }, { "epoch": 1.347171133298301, "grad_norm": 0.24847852807759896, "learning_rate": 7.318779838761688e-05, "loss": 0.3673, "step": 240 }, { "epoch": 1.3527763180942372, "grad_norm": 0.32043115894332014, "learning_rate": 7.309996829203894e-05, "loss": 0.3706, "step": 241 }, { "epoch": 1.3583815028901733, "grad_norm": 0.27995723770731035, "learning_rate": 7.301162902771911e-05, "loss": 0.3698, "step": 242 }, { "epoch": 1.3639866876861095, "grad_norm": 0.23369201897150527, "learning_rate": 7.292278195355875e-05, "loss": 0.3765, "step": 243 }, { "epoch": 1.369591872482046, "grad_norm": 0.2782336040131324, "learning_rate": 7.28334284362708e-05, "loss": 0.3698, "step": 244 }, { "epoch": 1.3751970572779821, "grad_norm": 0.32558286225893, "learning_rate": 7.274356985035856e-05, "loss": 0.363, "step": 245 }, { "epoch": 1.3808022420739183, "grad_norm": 0.3651496945923943, "learning_rate": 7.265320757809478e-05, "loss": 0.3708, "step": 246 }, { "epoch": 1.3864074268698547, "grad_norm": 0.37775778360725737, "learning_rate": 7.256234300950025e-05, "loss": 0.3739, "step": 247 }, { "epoch": 1.392012611665791, "grad_norm": 0.3865569735032884, "learning_rate": 7.247097754232251e-05, "loss": 0.3663, "step": 248 }, { "epoch": 1.3976177964617271, "grad_norm": 0.4600474923978821, "learning_rate": 7.237911258201422e-05, "loss": 0.3725, "step": 249 }, { "epoch": 1.4032229812576633, "grad_norm": 0.578230623305736, "learning_rate": 7.228674954171169e-05, "loss": 0.3717, "step": 250 }, { "epoch": 1.4088281660535995, "grad_norm": 0.5579950344731053, "learning_rate": 7.219388984221304e-05, "loss": 0.375, "step": 251 }, { "epoch": 1.414433350849536, "grad_norm": 0.42369290331551335, "learning_rate": 7.210053491195638e-05, "loss": 0.3673, "step": 252 }, { "epoch": 1.420038535645472, "grad_norm": 0.3852634055274567, "learning_rate": 7.200668618699786e-05, "loss": 0.3669, "step": 253 }, { "epoch": 1.4256437204414083, "grad_norm": 0.3342986128914958, "learning_rate": 7.191234511098952e-05, "loss": 0.3675, "step": 254 }, { "epoch": 1.4312489052373445, "grad_norm": 0.28417998997516397, "learning_rate": 7.181751313515716e-05, "loss": 0.3736, "step": 255 }, { "epoch": 1.4368540900332807, "grad_norm": 0.316021315356994, "learning_rate": 7.172219171827788e-05, "loss": 0.3652, "step": 256 }, { "epoch": 1.442459274829217, "grad_norm": 0.3330261543122812, "learning_rate": 7.162638232665785e-05, "loss": 0.3781, "step": 257 }, { "epoch": 1.4480644596251533, "grad_norm": 0.41235945578908095, "learning_rate": 7.153008643410957e-05, "loss": 0.3676, "step": 258 }, { "epoch": 1.4536696444210895, "grad_norm": 0.5120467154192809, "learning_rate": 7.143330552192925e-05, "loss": 0.3688, "step": 259 }, { "epoch": 1.4592748292170257, "grad_norm": 0.566273828052107, "learning_rate": 7.13360410788741e-05, "loss": 0.3728, "step": 260 }, { "epoch": 1.4648800140129619, "grad_norm": 0.48257393278128896, "learning_rate": 7.123829460113933e-05, "loss": 0.3698, "step": 261 }, { "epoch": 1.4704851988088983, "grad_norm": 0.3962978665568913, "learning_rate": 7.114006759233514e-05, "loss": 0.3708, "step": 262 }, { "epoch": 1.4760903836048345, "grad_norm": 0.43632437470514573, "learning_rate": 7.104136156346368e-05, "loss": 0.3776, "step": 263 }, { "epoch": 1.4816955684007707, "grad_norm": 0.5027739581146445, "learning_rate": 7.094217803289573e-05, "loss": 0.377, "step": 264 }, { "epoch": 1.487300753196707, "grad_norm": 0.4748778836922837, "learning_rate": 7.084251852634736e-05, "loss": 0.374, "step": 265 }, { "epoch": 1.4929059379926433, "grad_norm": 0.31244286402393573, "learning_rate": 7.074238457685644e-05, "loss": 0.3656, "step": 266 }, { "epoch": 1.4985111227885795, "grad_norm": 0.2969138202613333, "learning_rate": 7.064177772475912e-05, "loss": 0.377, "step": 267 }, { "epoch": 1.5041163075845156, "grad_norm": 0.41051969660231596, "learning_rate": 7.054069951766608e-05, "loss": 0.3763, "step": 268 }, { "epoch": 1.5097214923804518, "grad_norm": 0.38457550467039503, "learning_rate": 7.043915151043871e-05, "loss": 0.3714, "step": 269 }, { "epoch": 1.5153266771763882, "grad_norm": 0.29650502362650927, "learning_rate": 7.033713526516528e-05, "loss": 0.3708, "step": 270 }, { "epoch": 1.5209318619723244, "grad_norm": 0.4069481640243356, "learning_rate": 7.023465235113678e-05, "loss": 0.3734, "step": 271 }, { "epoch": 1.5265370467682606, "grad_norm": 0.4592931917273877, "learning_rate": 7.013170434482291e-05, "loss": 0.3697, "step": 272 }, { "epoch": 1.532142231564197, "grad_norm": 0.34558335688526987, "learning_rate": 7.002829282984776e-05, "loss": 0.3601, "step": 273 }, { "epoch": 1.537747416360133, "grad_norm": 0.32785340771208665, "learning_rate": 6.992441939696543e-05, "loss": 0.3708, "step": 274 }, { "epoch": 1.5433526011560694, "grad_norm": 0.42401801041386833, "learning_rate": 6.982008564403562e-05, "loss": 0.3709, "step": 275 }, { "epoch": 1.5489577859520056, "grad_norm": 0.3554711523304497, "learning_rate": 6.971529317599903e-05, "loss": 0.3625, "step": 276 }, { "epoch": 1.5545629707479418, "grad_norm": 0.33335189910758606, "learning_rate": 6.961004360485263e-05, "loss": 0.3723, "step": 277 }, { "epoch": 1.5601681555438782, "grad_norm": 0.3198930394708329, "learning_rate": 6.950433854962489e-05, "loss": 0.3601, "step": 278 }, { "epoch": 1.5657733403398142, "grad_norm": 0.27989792618134535, "learning_rate": 6.939817963635095e-05, "loss": 0.3703, "step": 279 }, { "epoch": 1.5713785251357506, "grad_norm": 0.29366802755384774, "learning_rate": 6.929156849804745e-05, "loss": 0.3714, "step": 280 }, { "epoch": 1.5769837099316868, "grad_norm": 0.2785219347149019, "learning_rate": 6.918450677468754e-05, "loss": 0.3763, "step": 281 }, { "epoch": 1.582588894727623, "grad_norm": 0.2578554063108834, "learning_rate": 6.907699611317563e-05, "loss": 0.3708, "step": 282 }, { "epoch": 1.5881940795235594, "grad_norm": 0.24508422370288088, "learning_rate": 6.896903816732199e-05, "loss": 0.3808, "step": 283 }, { "epoch": 1.5937992643194954, "grad_norm": 0.3084468014981416, "learning_rate": 6.88606345978174e-05, "loss": 0.3668, "step": 284 }, { "epoch": 1.5994044491154318, "grad_norm": 0.3064359422764903, "learning_rate": 6.875178707220752e-05, "loss": 0.3703, "step": 285 }, { "epoch": 1.605009633911368, "grad_norm": 0.259368905565046, "learning_rate": 6.86424972648673e-05, "loss": 0.3682, "step": 286 }, { "epoch": 1.6106148187073042, "grad_norm": 0.3138681158378778, "learning_rate": 6.853276685697522e-05, "loss": 0.361, "step": 287 }, { "epoch": 1.6162200035032406, "grad_norm": 0.28723735473752765, "learning_rate": 6.842259753648736e-05, "loss": 0.3691, "step": 288 }, { "epoch": 1.6218251882991768, "grad_norm": 0.22917475477913027, "learning_rate": 6.831199099811154e-05, "loss": 0.3738, "step": 289 }, { "epoch": 1.627430373095113, "grad_norm": 0.22158438127508895, "learning_rate": 6.820094894328115e-05, "loss": 0.3673, "step": 290 }, { "epoch": 1.6330355578910494, "grad_norm": 0.24914259580358247, "learning_rate": 6.808947308012907e-05, "loss": 0.3623, "step": 291 }, { "epoch": 1.6386407426869853, "grad_norm": 0.27721435112401327, "learning_rate": 6.797756512346131e-05, "loss": 0.371, "step": 292 }, { "epoch": 1.6442459274829218, "grad_norm": 0.28690822410974365, "learning_rate": 6.786522679473069e-05, "loss": 0.3704, "step": 293 }, { "epoch": 1.649851112278858, "grad_norm": 0.29175870389773917, "learning_rate": 6.775245982201031e-05, "loss": 0.3705, "step": 294 }, { "epoch": 1.6554562970747941, "grad_norm": 0.31996465044367267, "learning_rate": 6.763926593996704e-05, "loss": 0.3621, "step": 295 }, { "epoch": 1.6610614818707305, "grad_norm": 0.28536850062056507, "learning_rate": 6.752564688983475e-05, "loss": 0.3678, "step": 296 }, { "epoch": 1.6666666666666665, "grad_norm": 0.25078819883082826, "learning_rate": 6.741160441938761e-05, "loss": 0.3633, "step": 297 }, { "epoch": 1.672271851462603, "grad_norm": 0.18918507286548947, "learning_rate": 6.729714028291311e-05, "loss": 0.3641, "step": 298 }, { "epoch": 1.6778770362585391, "grad_norm": 0.1854770882692491, "learning_rate": 6.718225624118518e-05, "loss": 0.3736, "step": 299 }, { "epoch": 1.6834822210544753, "grad_norm": 0.2398880873671139, "learning_rate": 6.7066954061437e-05, "loss": 0.3627, "step": 300 }, { "epoch": 1.6890874058504117, "grad_norm": 0.2708404661115209, "learning_rate": 6.695123551733391e-05, "loss": 0.3615, "step": 301 }, { "epoch": 1.694692590646348, "grad_norm": 0.22472657962922266, "learning_rate": 6.683510238894603e-05, "loss": 0.3601, "step": 302 }, { "epoch": 1.700297775442284, "grad_norm": 0.1879478767675107, "learning_rate": 6.671855646272099e-05, "loss": 0.3704, "step": 303 }, { "epoch": 1.7059029602382203, "grad_norm": 0.1898981801511567, "learning_rate": 6.660159953145632e-05, "loss": 0.3741, "step": 304 }, { "epoch": 1.7115081450341565, "grad_norm": 0.23990986850106974, "learning_rate": 6.648423339427203e-05, "loss": 0.365, "step": 305 }, { "epoch": 1.717113329830093, "grad_norm": 0.2961808669969359, "learning_rate": 6.636645985658274e-05, "loss": 0.3703, "step": 306 }, { "epoch": 1.722718514626029, "grad_norm": 0.28018628616841723, "learning_rate": 6.62482807300701e-05, "loss": 0.3725, "step": 307 }, { "epoch": 1.7283236994219653, "grad_norm": 0.23407750459599536, "learning_rate": 6.612969783265477e-05, "loss": 0.3774, "step": 308 }, { "epoch": 1.7339288842179017, "grad_norm": 0.2348613025637216, "learning_rate": 6.601071298846859e-05, "loss": 0.3678, "step": 309 }, { "epoch": 1.7395340690138377, "grad_norm": 0.27859996434994966, "learning_rate": 6.589132802782636e-05, "loss": 0.3681, "step": 310 }, { "epoch": 1.745139253809774, "grad_norm": 0.30291442596447254, "learning_rate": 6.577154478719786e-05, "loss": 0.3626, "step": 311 }, { "epoch": 1.7507444386057103, "grad_norm": 0.34755080893012474, "learning_rate": 6.565136510917946e-05, "loss": 0.3618, "step": 312 }, { "epoch": 1.7563496234016465, "grad_norm": 0.4116767959803346, "learning_rate": 6.553079084246583e-05, "loss": 0.3681, "step": 313 }, { "epoch": 1.7619548081975829, "grad_norm": 0.46137945732023167, "learning_rate": 6.540982384182154e-05, "loss": 0.3724, "step": 314 }, { "epoch": 1.7675599929935188, "grad_norm": 0.5111615976410253, "learning_rate": 6.528846596805246e-05, "loss": 0.3656, "step": 315 }, { "epoch": 1.7731651777894553, "grad_norm": 0.5142382717546382, "learning_rate": 6.516671908797717e-05, "loss": 0.3652, "step": 316 }, { "epoch": 1.7787703625853915, "grad_norm": 0.45272084053581846, "learning_rate": 6.504458507439825e-05, "loss": 0.3708, "step": 317 }, { "epoch": 1.7843755473813276, "grad_norm": 0.3546131780130616, "learning_rate": 6.492206580607344e-05, "loss": 0.372, "step": 318 }, { "epoch": 1.789980732177264, "grad_norm": 0.35982588012048017, "learning_rate": 6.479916316768677e-05, "loss": 0.368, "step": 319 }, { "epoch": 1.7955859169732002, "grad_norm": 0.42694941448039553, "learning_rate": 6.467587904981959e-05, "loss": 0.3724, "step": 320 }, { "epoch": 1.8011911017691364, "grad_norm": 0.38051976086581385, "learning_rate": 6.455221534892138e-05, "loss": 0.3714, "step": 321 }, { "epoch": 1.8067962865650729, "grad_norm": 0.23803744685632255, "learning_rate": 6.442817396728073e-05, "loss": 0.363, "step": 322 }, { "epoch": 1.8124014713610088, "grad_norm": 0.19779044986555921, "learning_rate": 6.430375681299596e-05, "loss": 0.3652, "step": 323 }, { "epoch": 1.8180066561569452, "grad_norm": 0.2848062424619444, "learning_rate": 6.417896579994583e-05, "loss": 0.3701, "step": 324 }, { "epoch": 1.8236118409528814, "grad_norm": 0.3241743893145074, "learning_rate": 6.405380284776007e-05, "loss": 0.3631, "step": 325 }, { "epoch": 1.8292170257488176, "grad_norm": 0.23348066280971422, "learning_rate": 6.392826988178984e-05, "loss": 0.3655, "step": 326 }, { "epoch": 1.834822210544754, "grad_norm": 0.22485128049462547, "learning_rate": 6.380236883307814e-05, "loss": 0.3649, "step": 327 }, { "epoch": 1.84042739534069, "grad_norm": 0.26646687828106147, "learning_rate": 6.367610163833015e-05, "loss": 0.3704, "step": 328 }, { "epoch": 1.8460325801366264, "grad_norm": 0.22618321758644827, "learning_rate": 6.35494702398833e-05, "loss": 0.3622, "step": 329 }, { "epoch": 1.8516377649325626, "grad_norm": 0.22154925411761456, "learning_rate": 6.342247658567753e-05, "loss": 0.366, "step": 330 }, { "epoch": 1.8572429497284988, "grad_norm": 0.29666284481675853, "learning_rate": 6.329512262922525e-05, "loss": 0.3689, "step": 331 }, { "epoch": 1.8628481345244352, "grad_norm": 0.2939745343835554, "learning_rate": 6.316741032958133e-05, "loss": 0.3592, "step": 332 }, { "epoch": 1.8684533193203714, "grad_norm": 0.2738063356122982, "learning_rate": 6.303934165131296e-05, "loss": 0.3632, "step": 333 }, { "epoch": 1.8740585041163076, "grad_norm": 0.25344755125569035, "learning_rate": 6.291091856446935e-05, "loss": 0.3682, "step": 334 }, { "epoch": 1.8796636889122438, "grad_norm": 0.22436307153393606, "learning_rate": 6.278214304455156e-05, "loss": 0.3657, "step": 335 }, { "epoch": 1.88526887370818, "grad_norm": 0.25070997700969644, "learning_rate": 6.265301707248199e-05, "loss": 0.3699, "step": 336 }, { "epoch": 1.8908740585041164, "grad_norm": 0.2733139068534298, "learning_rate": 6.252354263457403e-05, "loss": 0.3695, "step": 337 }, { "epoch": 1.8964792433000526, "grad_norm": 0.2866080779686849, "learning_rate": 6.239372172250134e-05, "loss": 0.3714, "step": 338 }, { "epoch": 1.9020844280959888, "grad_norm": 0.21011808838464177, "learning_rate": 6.226355633326739e-05, "loss": 0.3664, "step": 339 }, { "epoch": 1.9076896128919252, "grad_norm": 0.1975979876067148, "learning_rate": 6.21330484691746e-05, "loss": 0.3669, "step": 340 }, { "epoch": 1.9132947976878611, "grad_norm": 0.21416216664449675, "learning_rate": 6.200220013779366e-05, "loss": 0.3668, "step": 341 }, { "epoch": 1.9188999824837976, "grad_norm": 0.21995877340660605, "learning_rate": 6.187101335193252e-05, "loss": 0.3602, "step": 342 }, { "epoch": 1.9245051672797338, "grad_norm": 0.19030470602968677, "learning_rate": 6.173949012960552e-05, "loss": 0.3617, "step": 343 }, { "epoch": 1.93011035207567, "grad_norm": 0.2128195145941448, "learning_rate": 6.160763249400236e-05, "loss": 0.3624, "step": 344 }, { "epoch": 1.9357155368716064, "grad_norm": 0.245421223589709, "learning_rate": 6.147544247345684e-05, "loss": 0.3603, "step": 345 }, { "epoch": 1.9413207216675423, "grad_norm": 0.21883215784819807, "learning_rate": 6.134292210141585e-05, "loss": 0.3594, "step": 346 }, { "epoch": 1.9469259064634787, "grad_norm": 0.1967876169444006, "learning_rate": 6.121007341640797e-05, "loss": 0.368, "step": 347 }, { "epoch": 1.952531091259415, "grad_norm": 0.20005898323834861, "learning_rate": 6.10768984620121e-05, "loss": 0.3735, "step": 348 }, { "epoch": 1.9581362760553511, "grad_norm": 0.23622231826107862, "learning_rate": 6.0943399286826126e-05, "loss": 0.3621, "step": 349 }, { "epoch": 1.9637414608512875, "grad_norm": 0.27814473844495363, "learning_rate": 6.080957794443529e-05, "loss": 0.3583, "step": 350 }, { "epoch": 1.9693466456472237, "grad_norm": 0.302395060856694, "learning_rate": 6.067543649338069e-05, "loss": 0.3626, "step": 351 }, { "epoch": 1.97495183044316, "grad_norm": 0.2330957117550289, "learning_rate": 6.0540976997127534e-05, "loss": 0.3626, "step": 352 }, { "epoch": 1.9805570152390963, "grad_norm": 0.17919274741898725, "learning_rate": 6.040620152403351e-05, "loss": 0.3699, "step": 353 }, { "epoch": 1.9861622000350323, "grad_norm": 0.19040666115368088, "learning_rate": 6.0271112147316816e-05, "loss": 0.362, "step": 354 }, { "epoch": 1.9917673848309687, "grad_norm": 0.19757884670083642, "learning_rate": 6.013571094502443e-05, "loss": 0.3609, "step": 355 }, { "epoch": 1.997372569626905, "grad_norm": 0.2205769315300188, "learning_rate": 6.000000000000001e-05, "loss": 0.3747, "step": 356 }, { "epoch": 2.0056051847959364, "grad_norm": 0.2741039461067367, "learning_rate": 5.986398139985195e-05, "loss": 0.339, "step": 357 }, { "epoch": 2.0112103695918724, "grad_norm": 0.31588543514752565, "learning_rate": 5.97276572369212e-05, "loss": 0.3439, "step": 358 }, { "epoch": 2.016815554387809, "grad_norm": 0.46336824661725023, "learning_rate": 5.959102960824914e-05, "loss": 0.3396, "step": 359 }, { "epoch": 2.0224207391837448, "grad_norm": 0.6543572364352943, "learning_rate": 5.945410061554526e-05, "loss": 0.3462, "step": 360 }, { "epoch": 2.028025923979681, "grad_norm": 0.7224099179065516, "learning_rate": 5.931687236515485e-05, "loss": 0.3452, "step": 361 }, { "epoch": 2.0336311087756176, "grad_norm": 0.5577072940081442, "learning_rate": 5.917934696802667e-05, "loss": 0.3393, "step": 362 }, { "epoch": 2.0392362935715536, "grad_norm": 0.33124598923207127, "learning_rate": 5.904152653968032e-05, "loss": 0.3403, "step": 363 }, { "epoch": 2.04484147836749, "grad_norm": 0.4930639395918798, "learning_rate": 5.890341320017389e-05, "loss": 0.3404, "step": 364 }, { "epoch": 2.0504466631634264, "grad_norm": 0.4584696038181033, "learning_rate": 5.8765009074071176e-05, "loss": 0.3413, "step": 365 }, { "epoch": 2.0560518479593624, "grad_norm": 0.3093299417205952, "learning_rate": 5.8626316290409124e-05, "loss": 0.3414, "step": 366 }, { "epoch": 2.0616570327552988, "grad_norm": 0.39704392594198545, "learning_rate": 5.8487336982665016e-05, "loss": 0.337, "step": 367 }, { "epoch": 2.0672622175512347, "grad_norm": 0.30243949060200587, "learning_rate": 5.8348073288723625e-05, "loss": 0.342, "step": 368 }, { "epoch": 2.072867402347171, "grad_norm": 0.2815416454530177, "learning_rate": 5.820852735084443e-05, "loss": 0.3382, "step": 369 }, { "epoch": 2.0784725871431076, "grad_norm": 0.37397854680211123, "learning_rate": 5.8068701315628564e-05, "loss": 0.338, "step": 370 }, { "epoch": 2.0840777719390435, "grad_norm": 0.2659776331075204, "learning_rate": 5.792859733398582e-05, "loss": 0.338, "step": 371 }, { "epoch": 2.08968295673498, "grad_norm": 0.263065032898508, "learning_rate": 5.7788217561101604e-05, "loss": 0.3399, "step": 372 }, { "epoch": 2.095288141530916, "grad_norm": 0.25395323107827705, "learning_rate": 5.7647564156403734e-05, "loss": 0.3436, "step": 373 }, { "epoch": 2.1008933263268523, "grad_norm": 0.26293260303616806, "learning_rate": 5.750663928352923e-05, "loss": 0.3335, "step": 374 }, { "epoch": 2.1064985111227887, "grad_norm": 0.3165184344301909, "learning_rate": 5.7365445110291063e-05, "loss": 0.3308, "step": 375 }, { "epoch": 2.1121036959187247, "grad_norm": 0.2209241133233396, "learning_rate": 5.7223983808644757e-05, "loss": 0.3384, "step": 376 }, { "epoch": 2.117708880714661, "grad_norm": 0.22848826893513474, "learning_rate": 5.7082257554655046e-05, "loss": 0.3302, "step": 377 }, { "epoch": 2.123314065510597, "grad_norm": 0.19866524720209594, "learning_rate": 5.6940268528462324e-05, "loss": 0.3325, "step": 378 }, { "epoch": 2.1289192503065335, "grad_norm": 0.18924595566511337, "learning_rate": 5.6798018914249176e-05, "loss": 0.3409, "step": 379 }, { "epoch": 2.13452443510247, "grad_norm": 0.2094808002199248, "learning_rate": 5.665551090020671e-05, "loss": 0.3368, "step": 380 }, { "epoch": 2.140129619898406, "grad_norm": 0.1896298642995286, "learning_rate": 5.651274667850099e-05, "loss": 0.3382, "step": 381 }, { "epoch": 2.1457348046943423, "grad_norm": 0.22305481312184167, "learning_rate": 5.6369728445239216e-05, "loss": 0.3365, "step": 382 }, { "epoch": 2.1513399894902787, "grad_norm": 0.18402697711670618, "learning_rate": 5.622645840043599e-05, "loss": 0.3327, "step": 383 }, { "epoch": 2.1569451742862147, "grad_norm": 0.18539919040240732, "learning_rate": 5.60829387479795e-05, "loss": 0.3367, "step": 384 }, { "epoch": 2.162550359082151, "grad_norm": 0.21582225739629068, "learning_rate": 5.5939171695597546e-05, "loss": 0.3395, "step": 385 }, { "epoch": 2.168155543878087, "grad_norm": 0.16294915710384228, "learning_rate": 5.579515945482366e-05, "loss": 0.3356, "step": 386 }, { "epoch": 2.1737607286740235, "grad_norm": 0.17506193706528747, "learning_rate": 5.5650904240963015e-05, "loss": 0.3389, "step": 387 }, { "epoch": 2.17936591346996, "grad_norm": 0.17986589577627823, "learning_rate": 5.55064082730584e-05, "loss": 0.3376, "step": 388 }, { "epoch": 2.184971098265896, "grad_norm": 0.20225696584185415, "learning_rate": 5.536167377385606e-05, "loss": 0.3352, "step": 389 }, { "epoch": 2.1905762830618323, "grad_norm": 0.20452311461397996, "learning_rate": 5.521670296977151e-05, "loss": 0.3427, "step": 390 }, { "epoch": 2.1961814678577682, "grad_norm": 0.20853709114695754, "learning_rate": 5.507149809085528e-05, "loss": 0.3414, "step": 391 }, { "epoch": 2.2017866526537047, "grad_norm": 0.19806872698081288, "learning_rate": 5.4926061370758616e-05, "loss": 0.3382, "step": 392 }, { "epoch": 2.207391837449641, "grad_norm": 0.18154490506390078, "learning_rate": 5.4780395046699116e-05, "loss": 0.3334, "step": 393 }, { "epoch": 2.212997022245577, "grad_norm": 0.20341890410491417, "learning_rate": 5.4634501359426345e-05, "loss": 0.3404, "step": 394 }, { "epoch": 2.2186022070415135, "grad_norm": 0.22993218917327035, "learning_rate": 5.4488382553187307e-05, "loss": 0.3443, "step": 395 }, { "epoch": 2.2242073918374494, "grad_norm": 0.26354208951183833, "learning_rate": 5.434204087569199e-05, "loss": 0.3377, "step": 396 }, { "epoch": 2.229812576633386, "grad_norm": 0.26935013286659065, "learning_rate": 5.419547857807871e-05, "loss": 0.3383, "step": 397 }, { "epoch": 2.2354177614293222, "grad_norm": 0.216246753815593, "learning_rate": 5.404869791487958e-05, "loss": 0.3354, "step": 398 }, { "epoch": 2.241022946225258, "grad_norm": 0.18048698989035397, "learning_rate": 5.390170114398575e-05, "loss": 0.3425, "step": 399 }, { "epoch": 2.2466281310211946, "grad_norm": 0.14602735362790997, "learning_rate": 5.375449052661271e-05, "loss": 0.3395, "step": 400 }, { "epoch": 2.252233315817131, "grad_norm": 0.21006582382336506, "learning_rate": 5.360706832726548e-05, "loss": 0.3364, "step": 401 }, { "epoch": 2.257838500613067, "grad_norm": 0.23551991763406468, "learning_rate": 5.345943681370381e-05, "loss": 0.3411, "step": 402 }, { "epoch": 2.2634436854090034, "grad_norm": 0.23856885482494433, "learning_rate": 5.33115982569073e-05, "loss": 0.3418, "step": 403 }, { "epoch": 2.2690488702049394, "grad_norm": 0.2300140164186402, "learning_rate": 5.31635549310404e-05, "loss": 0.3378, "step": 404 }, { "epoch": 2.274654055000876, "grad_norm": 0.195987902529524, "learning_rate": 5.3015309113417513e-05, "loss": 0.3311, "step": 405 }, { "epoch": 2.280259239796812, "grad_norm": 0.23463070349016596, "learning_rate": 5.286686308446788e-05, "loss": 0.3451, "step": 406 }, { "epoch": 2.285864424592748, "grad_norm": 0.2288982821893169, "learning_rate": 5.27182191277006e-05, "loss": 0.3377, "step": 407 }, { "epoch": 2.2914696093886846, "grad_norm": 0.2546834665154465, "learning_rate": 5.256937952966942e-05, "loss": 0.3377, "step": 408 }, { "epoch": 2.2970747941846206, "grad_norm": 0.2278705546240396, "learning_rate": 5.242034657993756e-05, "loss": 0.3327, "step": 409 }, { "epoch": 2.302679978980557, "grad_norm": 0.2120066354222386, "learning_rate": 5.227112257104256e-05, "loss": 0.3367, "step": 410 }, { "epoch": 2.3082851637764934, "grad_norm": 0.21472567166361733, "learning_rate": 5.2121709798460965e-05, "loss": 0.3313, "step": 411 }, { "epoch": 2.3138903485724294, "grad_norm": 0.1362367689167016, "learning_rate": 5.197211056057304e-05, "loss": 0.3351, "step": 412 }, { "epoch": 2.319495533368366, "grad_norm": 0.15862071124581603, "learning_rate": 5.182232715862738e-05, "loss": 0.3338, "step": 413 }, { "epoch": 2.325100718164302, "grad_norm": 0.18326499258519496, "learning_rate": 5.167236189670551e-05, "loss": 0.3404, "step": 414 }, { "epoch": 2.330705902960238, "grad_norm": 0.1581861572597703, "learning_rate": 5.152221708168652e-05, "loss": 0.3375, "step": 415 }, { "epoch": 2.3363110877561746, "grad_norm": 0.14207676687336432, "learning_rate": 5.137189502321149e-05, "loss": 0.3433, "step": 416 }, { "epoch": 2.3419162725521105, "grad_norm": 0.1688755529335156, "learning_rate": 5.122139803364798e-05, "loss": 0.337, "step": 417 }, { "epoch": 2.347521457348047, "grad_norm": 0.17084826149933108, "learning_rate": 5.1070728428054506e-05, "loss": 0.3337, "step": 418 }, { "epoch": 2.3531266421439834, "grad_norm": 0.17086756093968097, "learning_rate": 5.091988852414485e-05, "loss": 0.3379, "step": 419 }, { "epoch": 2.3587318269399193, "grad_norm": 0.17512853477255008, "learning_rate": 5.07688806422525e-05, "loss": 0.3346, "step": 420 }, { "epoch": 2.3643370117358558, "grad_norm": 0.13968477165183565, "learning_rate": 5.0617707105294876e-05, "loss": 0.337, "step": 421 }, { "epoch": 2.3699421965317917, "grad_norm": 0.1551281956472879, "learning_rate": 5.046637023873763e-05, "loss": 0.3414, "step": 422 }, { "epoch": 2.375547381327728, "grad_norm": 0.13800979999513777, "learning_rate": 5.0314872370558895e-05, "loss": 0.332, "step": 423 }, { "epoch": 2.3811525661236645, "grad_norm": 0.1367384754270518, "learning_rate": 5.016321583121342e-05, "loss": 0.3402, "step": 424 }, { "epoch": 2.3867577509196005, "grad_norm": 0.1638479532373916, "learning_rate": 5.00114029535968e-05, "loss": 0.3325, "step": 425 }, { "epoch": 2.392362935715537, "grad_norm": 0.16001330446175308, "learning_rate": 4.985943607300951e-05, "loss": 0.3378, "step": 426 }, { "epoch": 2.3979681205114733, "grad_norm": 0.18621075203491558, "learning_rate": 4.9707317527121e-05, "loss": 0.3395, "step": 427 }, { "epoch": 2.4035733053074093, "grad_norm": 0.18597047109928305, "learning_rate": 4.9555049655933786e-05, "loss": 0.3383, "step": 428 }, { "epoch": 2.4091784901033457, "grad_norm": 0.16909692251131547, "learning_rate": 4.940263480174741e-05, "loss": 0.3336, "step": 429 }, { "epoch": 2.4147836748992817, "grad_norm": 0.12955190193212013, "learning_rate": 4.9250075309122414e-05, "loss": 0.336, "step": 430 }, { "epoch": 2.420388859695218, "grad_norm": 0.15190807156866015, "learning_rate": 4.909737352484427e-05, "loss": 0.3399, "step": 431 }, { "epoch": 2.425994044491154, "grad_norm": 0.1573962735046912, "learning_rate": 4.894453179788728e-05, "loss": 0.3408, "step": 432 }, { "epoch": 2.4315992292870905, "grad_norm": 0.1370853516111357, "learning_rate": 4.879155247937849e-05, "loss": 0.3318, "step": 433 }, { "epoch": 2.437204414083027, "grad_norm": 0.15508060558426304, "learning_rate": 4.8638437922561445e-05, "loss": 0.3435, "step": 434 }, { "epoch": 2.442809598878963, "grad_norm": 0.14664188616583732, "learning_rate": 4.8485190482760046e-05, "loss": 0.3303, "step": 435 }, { "epoch": 2.4484147836748993, "grad_norm": 0.13239676829377417, "learning_rate": 4.833181251734228e-05, "loss": 0.3358, "step": 436 }, { "epoch": 2.4540199684708357, "grad_norm": 0.1689890101652452, "learning_rate": 4.8178306385684014e-05, "loss": 0.3379, "step": 437 }, { "epoch": 2.4596251532667717, "grad_norm": 0.19600685835243647, "learning_rate": 4.802467444913263e-05, "loss": 0.3375, "step": 438 }, { "epoch": 2.465230338062708, "grad_norm": 0.1614065765277386, "learning_rate": 4.787091907097075e-05, "loss": 0.3353, "step": 439 }, { "epoch": 2.4708355228586445, "grad_norm": 0.1508189987286625, "learning_rate": 4.771704261637988e-05, "loss": 0.3349, "step": 440 }, { "epoch": 2.4764407076545805, "grad_norm": 0.18101253119293248, "learning_rate": 4.756304745240398e-05, "loss": 0.3408, "step": 441 }, { "epoch": 2.482045892450517, "grad_norm": 0.1826261585945622, "learning_rate": 4.740893594791314e-05, "loss": 0.3351, "step": 442 }, { "epoch": 2.487651077246453, "grad_norm": 0.14052710829053464, "learning_rate": 4.7254710473567035e-05, "loss": 0.3357, "step": 443 }, { "epoch": 2.4932562620423893, "grad_norm": 0.14437812699305436, "learning_rate": 4.710037340177855e-05, "loss": 0.3323, "step": 444 }, { "epoch": 2.4988614468383252, "grad_norm": 0.15602474948645395, "learning_rate": 4.694592710667723e-05, "loss": 0.3315, "step": 445 }, { "epoch": 2.5044666316342616, "grad_norm": 0.14055328691934155, "learning_rate": 4.6791373964072755e-05, "loss": 0.3417, "step": 446 }, { "epoch": 2.510071816430198, "grad_norm": 0.1388879025795148, "learning_rate": 4.663671635141844e-05, "loss": 0.3334, "step": 447 }, { "epoch": 2.515677001226134, "grad_norm": 0.14203432091623683, "learning_rate": 4.648195664777466e-05, "loss": 0.3293, "step": 448 }, { "epoch": 2.5212821860220704, "grad_norm": 0.1362951045485, "learning_rate": 4.6327097233772167e-05, "loss": 0.3398, "step": 449 }, { "epoch": 2.526887370818007, "grad_norm": 0.13873000403640978, "learning_rate": 4.617214049157559e-05, "loss": 0.3447, "step": 450 }, { "epoch": 2.532492555613943, "grad_norm": 0.13932412475931288, "learning_rate": 4.601708880484672e-05, "loss": 0.3378, "step": 451 }, { "epoch": 2.5380977404098792, "grad_norm": 0.13616797667697145, "learning_rate": 4.586194455870782e-05, "loss": 0.3357, "step": 452 }, { "epoch": 2.5437029252058156, "grad_norm": 0.12388008240947325, "learning_rate": 4.5706710139705035e-05, "loss": 0.3367, "step": 453 }, { "epoch": 2.5493081100017516, "grad_norm": 0.13428327954219868, "learning_rate": 4.555138793577156e-05, "loss": 0.3372, "step": 454 }, { "epoch": 2.5549132947976876, "grad_norm": 0.12462707556187247, "learning_rate": 4.5395980336191e-05, "loss": 0.3386, "step": 455 }, { "epoch": 2.560518479593624, "grad_norm": 0.1235771755300071, "learning_rate": 4.524048973156056e-05, "loss": 0.3381, "step": 456 }, { "epoch": 2.5661236643895604, "grad_norm": 0.14711243405086183, "learning_rate": 4.508491851375431e-05, "loss": 0.3316, "step": 457 }, { "epoch": 2.5717288491854964, "grad_norm": 0.14459114492388414, "learning_rate": 4.4929269075886345e-05, "loss": 0.3298, "step": 458 }, { "epoch": 2.577334033981433, "grad_norm": 0.12808061163839768, "learning_rate": 4.477354381227405e-05, "loss": 0.3365, "step": 459 }, { "epoch": 2.582939218777369, "grad_norm": 0.1605215774840903, "learning_rate": 4.4617745118401146e-05, "loss": 0.3436, "step": 460 }, { "epoch": 2.588544403573305, "grad_norm": 0.1686937530955673, "learning_rate": 4.446187539088098e-05, "loss": 0.3401, "step": 461 }, { "epoch": 2.5941495883692416, "grad_norm": 0.1527872446582201, "learning_rate": 4.4305937027419554e-05, "loss": 0.336, "step": 462 }, { "epoch": 2.599754773165178, "grad_norm": 0.15128554865510688, "learning_rate": 4.4149932426778726e-05, "loss": 0.3344, "step": 463 }, { "epoch": 2.605359957961114, "grad_norm": 0.14920667419147088, "learning_rate": 4.399386398873919e-05, "loss": 0.337, "step": 464 }, { "epoch": 2.6109651427570504, "grad_norm": 0.1744457398926105, "learning_rate": 4.383773411406369e-05, "loss": 0.3315, "step": 465 }, { "epoch": 2.6165703275529864, "grad_norm": 0.17078707601958157, "learning_rate": 4.368154520446e-05, "loss": 0.3381, "step": 466 }, { "epoch": 2.6221755123489228, "grad_norm": 0.13633202612822465, "learning_rate": 4.352529966254408e-05, "loss": 0.3356, "step": 467 }, { "epoch": 2.6277806971448587, "grad_norm": 0.15243033406951811, "learning_rate": 4.336899989180297e-05, "loss": 0.336, "step": 468 }, { "epoch": 2.633385881940795, "grad_norm": 0.14747243716476988, "learning_rate": 4.3212648296557956e-05, "loss": 0.3404, "step": 469 }, { "epoch": 2.6389910667367316, "grad_norm": 0.14313810836029056, "learning_rate": 4.305624728192749e-05, "loss": 0.3383, "step": 470 }, { "epoch": 2.6445962515326675, "grad_norm": 0.15292894547047348, "learning_rate": 4.289979925379025e-05, "loss": 0.3347, "step": 471 }, { "epoch": 2.650201436328604, "grad_norm": 0.16666900752167832, "learning_rate": 4.274330661874812e-05, "loss": 0.3389, "step": 472 }, { "epoch": 2.6558066211245404, "grad_norm": 0.14101399886632246, "learning_rate": 4.258677178408914e-05, "loss": 0.3472, "step": 473 }, { "epoch": 2.6614118059204763, "grad_norm": 0.14283842141759293, "learning_rate": 4.2430197157750506e-05, "loss": 0.3288, "step": 474 }, { "epoch": 2.6670169907164127, "grad_norm": 0.18182542514094624, "learning_rate": 4.227358514828151e-05, "loss": 0.3344, "step": 475 }, { "epoch": 2.672622175512349, "grad_norm": 0.15313956411935128, "learning_rate": 4.2116938164806523e-05, "loss": 0.3448, "step": 476 }, { "epoch": 2.678227360308285, "grad_norm": 0.15589616508314744, "learning_rate": 4.19602586169879e-05, "loss": 0.3429, "step": 477 }, { "epoch": 2.6838325451042215, "grad_norm": 0.18115048482512122, "learning_rate": 4.1803548914988915e-05, "loss": 0.3341, "step": 478 }, { "epoch": 2.6894377299001575, "grad_norm": 0.15383530203792303, "learning_rate": 4.164681146943672e-05, "loss": 0.3369, "step": 479 }, { "epoch": 2.695042914696094, "grad_norm": 0.18132657455214243, "learning_rate": 4.1490048691385184e-05, "loss": 0.3387, "step": 480 }, { "epoch": 2.70064809949203, "grad_norm": 0.1663579198570477, "learning_rate": 4.133326299227796e-05, "loss": 0.3426, "step": 481 }, { "epoch": 2.7062532842879663, "grad_norm": 0.15912216978968627, "learning_rate": 4.1176456783911186e-05, "loss": 0.3391, "step": 482 }, { "epoch": 2.7118584690839027, "grad_norm": 0.14961940121164838, "learning_rate": 4.1019632478396535e-05, "loss": 0.3346, "step": 483 }, { "epoch": 2.7174636538798387, "grad_norm": 0.13821373229896533, "learning_rate": 4.0862792488124084e-05, "loss": 0.3444, "step": 484 }, { "epoch": 2.723068838675775, "grad_norm": 0.1446248191283951, "learning_rate": 4.070593922572515e-05, "loss": 0.3397, "step": 485 }, { "epoch": 2.7286740234717115, "grad_norm": 0.14611396725110462, "learning_rate": 4.0549075104035235e-05, "loss": 0.3381, "step": 486 }, { "epoch": 2.7342792082676475, "grad_norm": 0.15021839121813585, "learning_rate": 4.0392202536056864e-05, "loss": 0.3376, "step": 487 }, { "epoch": 2.739884393063584, "grad_norm": 0.1222365954508722, "learning_rate": 4.023532393492249e-05, "loss": 0.3418, "step": 488 }, { "epoch": 2.7454895778595203, "grad_norm": 0.13385222223998502, "learning_rate": 4.007844171385742e-05, "loss": 0.3375, "step": 489 }, { "epoch": 2.7510947626554563, "grad_norm": 0.16084769560491935, "learning_rate": 3.992155828614259e-05, "loss": 0.3383, "step": 490 }, { "epoch": 2.7566999474513927, "grad_norm": 0.11593149953710691, "learning_rate": 3.976467606507752e-05, "loss": 0.334, "step": 491 }, { "epoch": 2.7623051322473287, "grad_norm": 0.12135253379499054, "learning_rate": 3.960779746394315e-05, "loss": 0.3369, "step": 492 }, { "epoch": 2.767910317043265, "grad_norm": 0.15157050861544064, "learning_rate": 3.9450924895964785e-05, "loss": 0.3378, "step": 493 }, { "epoch": 2.773515501839201, "grad_norm": 0.13097668206944849, "learning_rate": 3.929406077427486e-05, "loss": 0.3378, "step": 494 }, { "epoch": 2.7791206866351374, "grad_norm": 0.16603204878535843, "learning_rate": 3.913720751187593e-05, "loss": 0.335, "step": 495 }, { "epoch": 2.784725871431074, "grad_norm": 0.14752638359057793, "learning_rate": 3.898036752160348e-05, "loss": 0.3333, "step": 496 }, { "epoch": 2.79033105622701, "grad_norm": 0.11483015755025572, "learning_rate": 3.882354321608883e-05, "loss": 0.3324, "step": 497 }, { "epoch": 2.7959362410229462, "grad_norm": 0.15227040786502544, "learning_rate": 3.8666737007722055e-05, "loss": 0.3334, "step": 498 }, { "epoch": 2.8015414258188827, "grad_norm": 0.15767384723901806, "learning_rate": 3.8509951308614816e-05, "loss": 0.3346, "step": 499 }, { "epoch": 2.8071466106148186, "grad_norm": 0.12626963411483594, "learning_rate": 3.8353188530563296e-05, "loss": 0.3433, "step": 500 }, { "epoch": 2.812751795410755, "grad_norm": 0.13733431001138763, "learning_rate": 3.8196451085011085e-05, "loss": 0.3327, "step": 501 }, { "epoch": 2.8183569802066915, "grad_norm": 0.14680623002425597, "learning_rate": 3.80397413830121e-05, "loss": 0.3362, "step": 502 }, { "epoch": 2.8239621650026274, "grad_norm": 0.13113991590813315, "learning_rate": 3.7883061835193476e-05, "loss": 0.3316, "step": 503 }, { "epoch": 2.829567349798564, "grad_norm": 0.14765888572997615, "learning_rate": 3.772641485171849e-05, "loss": 0.3354, "step": 504 }, { "epoch": 2.8351725345945, "grad_norm": 0.13209441385102483, "learning_rate": 3.756980284224951e-05, "loss": 0.3387, "step": 505 }, { "epoch": 2.840777719390436, "grad_norm": 0.14879932604775253, "learning_rate": 3.7413228215910866e-05, "loss": 0.3369, "step": 506 }, { "epoch": 2.846382904186372, "grad_norm": 0.13679808581693134, "learning_rate": 3.725669338125189e-05, "loss": 0.3316, "step": 507 }, { "epoch": 2.8519880889823086, "grad_norm": 0.14870932780889795, "learning_rate": 3.710020074620976e-05, "loss": 0.3411, "step": 508 }, { "epoch": 2.857593273778245, "grad_norm": 0.15465220506717361, "learning_rate": 3.6943752718072526e-05, "loss": 0.3431, "step": 509 }, { "epoch": 2.863198458574181, "grad_norm": 0.13086189881896804, "learning_rate": 3.6787351703442064e-05, "loss": 0.3361, "step": 510 }, { "epoch": 2.8688036433701174, "grad_norm": 0.15533250172898358, "learning_rate": 3.663100010819704e-05, "loss": 0.3409, "step": 511 }, { "epoch": 2.874408828166054, "grad_norm": 0.15368379331778698, "learning_rate": 3.6474700337455946e-05, "loss": 0.3366, "step": 512 }, { "epoch": 2.8800140129619898, "grad_norm": 0.13213250938368978, "learning_rate": 3.631845479554001e-05, "loss": 0.3404, "step": 513 }, { "epoch": 2.885619197757926, "grad_norm": 0.12630288060204878, "learning_rate": 3.616226588593634e-05, "loss": 0.3364, "step": 514 }, { "epoch": 2.891224382553862, "grad_norm": 0.13593521293454738, "learning_rate": 3.6006136011260835e-05, "loss": 0.3381, "step": 515 }, { "epoch": 2.8968295673497986, "grad_norm": 0.11554742949705811, "learning_rate": 3.5850067573221294e-05, "loss": 0.331, "step": 516 }, { "epoch": 2.9024347521457345, "grad_norm": 0.1198528477058993, "learning_rate": 3.569406297258045e-05, "loss": 0.3382, "step": 517 }, { "epoch": 2.908039936941671, "grad_norm": 0.12987613409396606, "learning_rate": 3.553812460911903e-05, "loss": 0.332, "step": 518 }, { "epoch": 2.9136451217376074, "grad_norm": 0.10360442006807373, "learning_rate": 3.538225488159886e-05, "loss": 0.3345, "step": 519 }, { "epoch": 2.9192503065335433, "grad_norm": 0.11999841966016728, "learning_rate": 3.5226456187725966e-05, "loss": 0.3356, "step": 520 }, { "epoch": 2.9248554913294798, "grad_norm": 0.12157002192921504, "learning_rate": 3.507073092411366e-05, "loss": 0.331, "step": 521 }, { "epoch": 2.930460676125416, "grad_norm": 0.11962352346017135, "learning_rate": 3.4915081486245696e-05, "loss": 0.3221, "step": 522 }, { "epoch": 2.936065860921352, "grad_norm": 0.10590981284086302, "learning_rate": 3.4759510268439444e-05, "loss": 0.3271, "step": 523 }, { "epoch": 2.9416710457172885, "grad_norm": 0.12508588575966806, "learning_rate": 3.460401966380901e-05, "loss": 0.334, "step": 524 }, { "epoch": 2.947276230513225, "grad_norm": 0.12411992558068433, "learning_rate": 3.4448612064228455e-05, "loss": 0.3342, "step": 525 }, { "epoch": 2.952881415309161, "grad_norm": 0.12323545333378516, "learning_rate": 3.4293289860294985e-05, "loss": 0.3397, "step": 526 }, { "epoch": 2.9584866001050973, "grad_norm": 0.12211808120842779, "learning_rate": 3.4138055441292186e-05, "loss": 0.333, "step": 527 }, { "epoch": 2.9640917849010333, "grad_norm": 0.11492860031416643, "learning_rate": 3.3982911195153294e-05, "loss": 0.3329, "step": 528 }, { "epoch": 2.9696969696969697, "grad_norm": 0.13508535922750942, "learning_rate": 3.3827859508424415e-05, "loss": 0.3398, "step": 529 }, { "epoch": 2.9753021544929057, "grad_norm": 0.11565425526423825, "learning_rate": 3.367290276622785e-05, "loss": 0.3365, "step": 530 }, { "epoch": 2.980907339288842, "grad_norm": 0.13139009508968535, "learning_rate": 3.3518043352225354e-05, "loss": 0.3312, "step": 531 }, { "epoch": 2.9865125240847785, "grad_norm": 0.1362299448189819, "learning_rate": 3.3363283648581564e-05, "loss": 0.3292, "step": 532 }, { "epoch": 2.9921177088807145, "grad_norm": 0.12683117549730533, "learning_rate": 3.3208626035927265e-05, "loss": 0.3306, "step": 533 }, { "epoch": 2.997722893676651, "grad_norm": 0.10969694374220536, "learning_rate": 3.305407289332279e-05, "loss": 0.3331, "step": 534 }, { "epoch": 3.00525486074619, "grad_norm": 0.3592894801843749, "learning_rate": 3.289962659822146e-05, "loss": 0.612, "step": 535 }, { "epoch": 3.0108600455421266, "grad_norm": 0.2839378100457828, "learning_rate": 3.274528952643296e-05, "loss": 0.309, "step": 536 }, { "epoch": 3.0164652303380626, "grad_norm": 0.2432146356050859, "learning_rate": 3.259106405208686e-05, "loss": 0.3106, "step": 537 }, { "epoch": 3.022070415133999, "grad_norm": 0.2777016538919042, "learning_rate": 3.2436952547596016e-05, "loss": 0.3208, "step": 538 }, { "epoch": 3.0276755999299354, "grad_norm": 0.29446096248561565, "learning_rate": 3.228295738362013e-05, "loss": 0.3128, "step": 539 }, { "epoch": 3.0332807847258714, "grad_norm": 0.2134321257824934, "learning_rate": 3.212908092902925e-05, "loss": 0.3123, "step": 540 }, { "epoch": 3.038885969521808, "grad_norm": 0.2929761811829302, "learning_rate": 3.1975325550867376e-05, "loss": 0.3149, "step": 541 }, { "epoch": 3.0444911543177438, "grad_norm": 0.23083582350400958, "learning_rate": 3.182169361431599e-05, "loss": 0.3181, "step": 542 }, { "epoch": 3.05009633911368, "grad_norm": 0.22809559973140262, "learning_rate": 3.1668187482657724e-05, "loss": 0.3196, "step": 543 }, { "epoch": 3.0557015239096166, "grad_norm": 0.2825820642402612, "learning_rate": 3.151480951723997e-05, "loss": 0.3146, "step": 544 }, { "epoch": 3.0613067087055525, "grad_norm": 0.2033350745344508, "learning_rate": 3.1361562077438575e-05, "loss": 0.3122, "step": 545 }, { "epoch": 3.066911893501489, "grad_norm": 0.21932897795785233, "learning_rate": 3.120844752062153e-05, "loss": 0.3124, "step": 546 }, { "epoch": 3.072517078297425, "grad_norm": 0.21168302798367983, "learning_rate": 3.1055468202112734e-05, "loss": 0.3166, "step": 547 }, { "epoch": 3.0781222630933613, "grad_norm": 0.17772176835113396, "learning_rate": 3.090262647515575e-05, "loss": 0.3077, "step": 548 }, { "epoch": 3.0837274478892978, "grad_norm": 0.17841507645772034, "learning_rate": 3.0749924690877606e-05, "loss": 0.314, "step": 549 }, { "epoch": 3.0893326326852337, "grad_norm": 0.1771081138385722, "learning_rate": 3.0597365198252605e-05, "loss": 0.3145, "step": 550 }, { "epoch": 3.09493781748117, "grad_norm": 0.17292167901780775, "learning_rate": 3.044495034406623e-05, "loss": 0.3141, "step": 551 }, { "epoch": 3.100543002277106, "grad_norm": 0.15958589623983585, "learning_rate": 3.0292682472879016e-05, "loss": 0.309, "step": 552 }, { "epoch": 3.1061481870730425, "grad_norm": 0.17823919421432835, "learning_rate": 3.014056392699051e-05, "loss": 0.3107, "step": 553 }, { "epoch": 3.111753371868979, "grad_norm": 0.14347972248545185, "learning_rate": 2.998859704640321e-05, "loss": 0.3122, "step": 554 }, { "epoch": 3.117358556664915, "grad_norm": 0.15502920693257438, "learning_rate": 2.9836784168786587e-05, "loss": 0.3079, "step": 555 }, { "epoch": 3.1229637414608513, "grad_norm": 0.14905838975522384, "learning_rate": 2.968512762944112e-05, "loss": 0.3085, "step": 556 }, { "epoch": 3.1285689262567877, "grad_norm": 0.15067799733981543, "learning_rate": 2.953362976126238e-05, "loss": 0.3173, "step": 557 }, { "epoch": 3.1341741110527237, "grad_norm": 0.1352871810582152, "learning_rate": 2.9382292894705137e-05, "loss": 0.3168, "step": 558 }, { "epoch": 3.13977929584866, "grad_norm": 0.13712672403520984, "learning_rate": 2.9231119357747514e-05, "loss": 0.3096, "step": 559 }, { "epoch": 3.145384480644596, "grad_norm": 0.13040410642011227, "learning_rate": 2.908011147585516e-05, "loss": 0.313, "step": 560 }, { "epoch": 3.1509896654405325, "grad_norm": 0.11413659831512099, "learning_rate": 2.8929271571945504e-05, "loss": 0.3173, "step": 561 }, { "epoch": 3.156594850236469, "grad_norm": 0.13404522891478707, "learning_rate": 2.8778601966352028e-05, "loss": 0.3129, "step": 562 }, { "epoch": 3.162200035032405, "grad_norm": 0.12169309736493167, "learning_rate": 2.8628104976788527e-05, "loss": 0.3144, "step": 563 }, { "epoch": 3.1678052198283413, "grad_norm": 0.12136151456869423, "learning_rate": 2.8477782918313495e-05, "loss": 0.3101, "step": 564 }, { "epoch": 3.1734104046242773, "grad_norm": 0.12720103445578, "learning_rate": 2.83276381032945e-05, "loss": 0.3064, "step": 565 }, { "epoch": 3.1790155894202137, "grad_norm": 0.11658578152683795, "learning_rate": 2.8177672841372642e-05, "loss": 0.3104, "step": 566 }, { "epoch": 3.18462077421615, "grad_norm": 0.1297562392547841, "learning_rate": 2.802788943942697e-05, "loss": 0.3123, "step": 567 }, { "epoch": 3.190225959012086, "grad_norm": 0.11985456340120923, "learning_rate": 2.787829020153904e-05, "loss": 0.3146, "step": 568 }, { "epoch": 3.1958311438080225, "grad_norm": 0.1262651870939076, "learning_rate": 2.772887742895745e-05, "loss": 0.3075, "step": 569 }, { "epoch": 3.2014363286039584, "grad_norm": 0.12464097287272473, "learning_rate": 2.7579653420062444e-05, "loss": 0.3045, "step": 570 }, { "epoch": 3.207041513399895, "grad_norm": 0.12341347110151742, "learning_rate": 2.7430620470330588e-05, "loss": 0.3052, "step": 571 }, { "epoch": 3.2126466981958313, "grad_norm": 0.11689537607575821, "learning_rate": 2.7281780872299397e-05, "loss": 0.3092, "step": 572 }, { "epoch": 3.2182518829917672, "grad_norm": 0.12733447393829486, "learning_rate": 2.7133136915532117e-05, "loss": 0.3126, "step": 573 }, { "epoch": 3.2238570677877036, "grad_norm": 0.1123962660816928, "learning_rate": 2.69846908865825e-05, "loss": 0.3125, "step": 574 }, { "epoch": 3.22946225258364, "grad_norm": 0.13480471605245697, "learning_rate": 2.68364450689596e-05, "loss": 0.3186, "step": 575 }, { "epoch": 3.235067437379576, "grad_norm": 0.11275478327505037, "learning_rate": 2.6688401743092704e-05, "loss": 0.3172, "step": 576 }, { "epoch": 3.2406726221755124, "grad_norm": 0.12484323278884911, "learning_rate": 2.6540563186296186e-05, "loss": 0.3102, "step": 577 }, { "epoch": 3.2462778069714484, "grad_norm": 0.11197370815895867, "learning_rate": 2.639293167273453e-05, "loss": 0.3031, "step": 578 }, { "epoch": 3.251882991767385, "grad_norm": 0.11454623610347744, "learning_rate": 2.6245509473387296e-05, "loss": 0.3065, "step": 579 }, { "epoch": 3.2574881765633212, "grad_norm": 0.12304422921255855, "learning_rate": 2.609829885601425e-05, "loss": 0.3089, "step": 580 }, { "epoch": 3.263093361359257, "grad_norm": 0.11259423111356101, "learning_rate": 2.5951302085120437e-05, "loss": 0.3105, "step": 581 }, { "epoch": 3.2686985461551936, "grad_norm": 0.11271478685448443, "learning_rate": 2.5804521421921305e-05, "loss": 0.3068, "step": 582 }, { "epoch": 3.2743037309511296, "grad_norm": 0.11647961636111298, "learning_rate": 2.5657959124308036e-05, "loss": 0.316, "step": 583 }, { "epoch": 3.279908915747066, "grad_norm": 0.12470089799540612, "learning_rate": 2.551161744681271e-05, "loss": 0.3122, "step": 584 }, { "epoch": 3.2855141005430024, "grad_norm": 0.09459288821151796, "learning_rate": 2.5365498640573675e-05, "loss": 0.3082, "step": 585 }, { "epoch": 3.2911192853389384, "grad_norm": 0.12027588781638404, "learning_rate": 2.5219604953300897e-05, "loss": 0.3104, "step": 586 }, { "epoch": 3.296724470134875, "grad_norm": 0.09693184441523361, "learning_rate": 2.5073938629241404e-05, "loss": 0.3113, "step": 587 }, { "epoch": 3.302329654930811, "grad_norm": 0.11431466893797337, "learning_rate": 2.4928501909144735e-05, "loss": 0.3122, "step": 588 }, { "epoch": 3.307934839726747, "grad_norm": 0.10947655679887262, "learning_rate": 2.4783297030228504e-05, "loss": 0.3059, "step": 589 }, { "epoch": 3.3135400245226836, "grad_norm": 0.10724694744795145, "learning_rate": 2.4638326226143955e-05, "loss": 0.3059, "step": 590 }, { "epoch": 3.3191452093186196, "grad_norm": 0.10607719365259763, "learning_rate": 2.449359172694161e-05, "loss": 0.3126, "step": 591 }, { "epoch": 3.324750394114556, "grad_norm": 0.10720345042194546, "learning_rate": 2.4349095759037e-05, "loss": 0.3089, "step": 592 }, { "epoch": 3.3303555789104924, "grad_norm": 0.10273817164660135, "learning_rate": 2.4204840545176356e-05, "loss": 0.3108, "step": 593 }, { "epoch": 3.3359607637064284, "grad_norm": 0.09975537559192163, "learning_rate": 2.406082830440247e-05, "loss": 0.3124, "step": 594 }, { "epoch": 3.3415659485023648, "grad_norm": 0.09406955914926102, "learning_rate": 2.3917061252020513e-05, "loss": 0.316, "step": 595 }, { "epoch": 3.3471711332983007, "grad_norm": 0.09940842953248977, "learning_rate": 2.3773541599564016e-05, "loss": 0.3127, "step": 596 }, { "epoch": 3.352776318094237, "grad_norm": 0.10553838056133437, "learning_rate": 2.36302715547608e-05, "loss": 0.3057, "step": 597 }, { "epoch": 3.3583815028901736, "grad_norm": 0.10680543765646766, "learning_rate": 2.3487253321499025e-05, "loss": 0.3064, "step": 598 }, { "epoch": 3.3639866876861095, "grad_norm": 0.10212325660144811, "learning_rate": 2.3344489099793298e-05, "loss": 0.3054, "step": 599 }, { "epoch": 3.369591872482046, "grad_norm": 0.09345936279910022, "learning_rate": 2.3201981085750848e-05, "loss": 0.3091, "step": 600 }, { "epoch": 3.3751970572779824, "grad_norm": 0.10692872809212166, "learning_rate": 2.3059731471537692e-05, "loss": 0.3057, "step": 601 }, { "epoch": 3.3808022420739183, "grad_norm": 0.10292193319438646, "learning_rate": 2.2917742445344957e-05, "loss": 0.3039, "step": 602 }, { "epoch": 3.3864074268698547, "grad_norm": 0.1044728564683231, "learning_rate": 2.2776016191355247e-05, "loss": 0.307, "step": 603 }, { "epoch": 3.3920126116657907, "grad_norm": 0.11121222286242763, "learning_rate": 2.2634554889708946e-05, "loss": 0.3146, "step": 604 }, { "epoch": 3.397617796461727, "grad_norm": 0.0953250187025364, "learning_rate": 2.2493360716470778e-05, "loss": 0.3088, "step": 605 }, { "epoch": 3.403222981257663, "grad_norm": 0.10907994857009719, "learning_rate": 2.2352435843596276e-05, "loss": 0.3122, "step": 606 }, { "epoch": 3.4088281660535995, "grad_norm": 0.09794843878191668, "learning_rate": 2.2211782438898403e-05, "loss": 0.3072, "step": 607 }, { "epoch": 3.414433350849536, "grad_norm": 0.1106427273961921, "learning_rate": 2.207140266601419e-05, "loss": 0.3173, "step": 608 }, { "epoch": 3.420038535645472, "grad_norm": 0.09676869359844141, "learning_rate": 2.193129868437145e-05, "loss": 0.3097, "step": 609 }, { "epoch": 3.4256437204414083, "grad_norm": 0.1127315347743384, "learning_rate": 2.179147264915558e-05, "loss": 0.3087, "step": 610 }, { "epoch": 3.4312489052373447, "grad_norm": 0.10860744939106713, "learning_rate": 2.1651926711276374e-05, "loss": 0.3064, "step": 611 }, { "epoch": 3.4368540900332807, "grad_norm": 0.10854328769381996, "learning_rate": 2.1512663017334994e-05, "loss": 0.3098, "step": 612 }, { "epoch": 3.442459274829217, "grad_norm": 0.11762452038097213, "learning_rate": 2.1373683709590873e-05, "loss": 0.3115, "step": 613 }, { "epoch": 3.4480644596251535, "grad_norm": 0.10805927729310495, "learning_rate": 2.1234990925928827e-05, "loss": 0.3078, "step": 614 }, { "epoch": 3.4536696444210895, "grad_norm": 0.1210574300692429, "learning_rate": 2.1096586799826123e-05, "loss": 0.3131, "step": 615 }, { "epoch": 3.459274829217026, "grad_norm": 0.11322408230361472, "learning_rate": 2.0958473460319685e-05, "loss": 0.3045, "step": 616 }, { "epoch": 3.464880014012962, "grad_norm": 0.12288017952430454, "learning_rate": 2.0820653031973363e-05, "loss": 0.3004, "step": 617 }, { "epoch": 3.4704851988088983, "grad_norm": 0.11467508943459158, "learning_rate": 2.0683127634845155e-05, "loss": 0.3118, "step": 618 }, { "epoch": 3.4760903836048342, "grad_norm": 0.10089539729462121, "learning_rate": 2.0545899384454753e-05, "loss": 0.3115, "step": 619 }, { "epoch": 3.4816955684007707, "grad_norm": 0.10903963497738692, "learning_rate": 2.040897039175087e-05, "loss": 0.3183, "step": 620 }, { "epoch": 3.487300753196707, "grad_norm": 0.0960280103883697, "learning_rate": 2.0272342763078806e-05, "loss": 0.3168, "step": 621 }, { "epoch": 3.492905937992643, "grad_norm": 0.1019907001338773, "learning_rate": 2.0136018600148065e-05, "loss": 0.314, "step": 622 }, { "epoch": 3.4985111227885795, "grad_norm": 0.09703216086704151, "learning_rate": 2.0000000000000012e-05, "loss": 0.3114, "step": 623 }, { "epoch": 3.504116307584516, "grad_norm": 0.09927974827651981, "learning_rate": 1.9864289054975595e-05, "loss": 0.3066, "step": 624 }, { "epoch": 3.509721492380452, "grad_norm": 0.10115048078252437, "learning_rate": 1.9728887852683204e-05, "loss": 0.3063, "step": 625 }, { "epoch": 3.5153266771763882, "grad_norm": 0.09617916095602932, "learning_rate": 1.959379847596652e-05, "loss": 0.3078, "step": 626 }, { "epoch": 3.5209318619723247, "grad_norm": 0.10018831202713846, "learning_rate": 1.9459023002872466e-05, "loss": 0.306, "step": 627 }, { "epoch": 3.5265370467682606, "grad_norm": 0.09046907662038928, "learning_rate": 1.9324563506619323e-05, "loss": 0.3093, "step": 628 }, { "epoch": 3.532142231564197, "grad_norm": 0.09707258005221424, "learning_rate": 1.9190422055564716e-05, "loss": 0.3068, "step": 629 }, { "epoch": 3.537747416360133, "grad_norm": 0.08790868545557047, "learning_rate": 1.9056600713173884e-05, "loss": 0.3063, "step": 630 }, { "epoch": 3.5433526011560694, "grad_norm": 0.09522724255822446, "learning_rate": 1.8923101537987906e-05, "loss": 0.3071, "step": 631 }, { "epoch": 3.5489577859520054, "grad_norm": 0.09275069555664794, "learning_rate": 1.878992658359205e-05, "loss": 0.3082, "step": 632 }, { "epoch": 3.554562970747942, "grad_norm": 0.09662779046438905, "learning_rate": 1.865707789858416e-05, "loss": 0.317, "step": 633 }, { "epoch": 3.560168155543878, "grad_norm": 0.09038439246839879, "learning_rate": 1.852455752654318e-05, "loss": 0.3095, "step": 634 }, { "epoch": 3.565773340339814, "grad_norm": 0.0892351312327684, "learning_rate": 1.839236750599767e-05, "loss": 0.3099, "step": 635 }, { "epoch": 3.5713785251357506, "grad_norm": 0.0901544987475721, "learning_rate": 1.8260509870394475e-05, "loss": 0.3145, "step": 636 }, { "epoch": 3.576983709931687, "grad_norm": 0.08859989195760687, "learning_rate": 1.8128986648067487e-05, "loss": 0.3054, "step": 637 }, { "epoch": 3.582588894727623, "grad_norm": 0.09128402171545218, "learning_rate": 1.7997799862206346e-05, "loss": 0.3121, "step": 638 }, { "epoch": 3.5881940795235594, "grad_norm": 0.09185126057088099, "learning_rate": 1.78669515308254e-05, "loss": 0.3103, "step": 639 }, { "epoch": 3.5937992643194954, "grad_norm": 0.0893997681385203, "learning_rate": 1.7736443666732626e-05, "loss": 0.3099, "step": 640 }, { "epoch": 3.599404449115432, "grad_norm": 0.0927092479814224, "learning_rate": 1.7606278277498674e-05, "loss": 0.3096, "step": 641 }, { "epoch": 3.6050096339113678, "grad_norm": 0.09322798178482147, "learning_rate": 1.747645736542599e-05, "loss": 0.312, "step": 642 }, { "epoch": 3.610614818707304, "grad_norm": 0.09305885545146318, "learning_rate": 1.7346982927518014e-05, "loss": 0.3121, "step": 643 }, { "epoch": 3.6162200035032406, "grad_norm": 0.09354862097700213, "learning_rate": 1.721785695544846e-05, "loss": 0.3084, "step": 644 }, { "epoch": 3.6218251882991765, "grad_norm": 0.08879212707112467, "learning_rate": 1.7089081435530667e-05, "loss": 0.3103, "step": 645 }, { "epoch": 3.627430373095113, "grad_norm": 0.08777213540858403, "learning_rate": 1.6960658348687046e-05, "loss": 0.3094, "step": 646 }, { "epoch": 3.6330355578910494, "grad_norm": 0.09446494446048785, "learning_rate": 1.683258967041866e-05, "loss": 0.3099, "step": 647 }, { "epoch": 3.6386407426869853, "grad_norm": 0.0856866484739207, "learning_rate": 1.6704877370774748e-05, "loss": 0.3046, "step": 648 }, { "epoch": 3.6442459274829218, "grad_norm": 0.08653060673663804, "learning_rate": 1.6577523414322478e-05, "loss": 0.3039, "step": 649 }, { "epoch": 3.649851112278858, "grad_norm": 0.0914995782178045, "learning_rate": 1.6450529760116705e-05, "loss": 0.3115, "step": 650 }, { "epoch": 3.655456297074794, "grad_norm": 0.08842063602461302, "learning_rate": 1.6323898361669857e-05, "loss": 0.3099, "step": 651 }, { "epoch": 3.6610614818707305, "grad_norm": 0.09196505054199886, "learning_rate": 1.6197631166921856e-05, "loss": 0.3059, "step": 652 }, { "epoch": 3.6666666666666665, "grad_norm": 0.08826582262682107, "learning_rate": 1.6071730118210173e-05, "loss": 0.3065, "step": 653 }, { "epoch": 3.672271851462603, "grad_norm": 0.08601459200435725, "learning_rate": 1.594619715223994e-05, "loss": 0.3083, "step": 654 }, { "epoch": 3.677877036258539, "grad_norm": 0.09329501275849174, "learning_rate": 1.5821034200054176e-05, "loss": 0.3116, "step": 655 }, { "epoch": 3.6834822210544753, "grad_norm": 0.08578241057447392, "learning_rate": 1.569624318700405e-05, "loss": 0.3111, "step": 656 }, { "epoch": 3.6890874058504117, "grad_norm": 0.08212635956097417, "learning_rate": 1.5571826032719287e-05, "loss": 0.3103, "step": 657 }, { "epoch": 3.6946925906463477, "grad_norm": 0.0920585030532975, "learning_rate": 1.5447784651078642e-05, "loss": 0.3119, "step": 658 }, { "epoch": 3.700297775442284, "grad_norm": 0.08381481949938734, "learning_rate": 1.532412095018044e-05, "loss": 0.3053, "step": 659 }, { "epoch": 3.7059029602382205, "grad_norm": 0.08430489721177843, "learning_rate": 1.5200836832313246e-05, "loss": 0.308, "step": 660 }, { "epoch": 3.7115081450341565, "grad_norm": 0.09318737393858478, "learning_rate": 1.5077934193926584e-05, "loss": 0.3125, "step": 661 }, { "epoch": 3.717113329830093, "grad_norm": 0.09089782801509312, "learning_rate": 1.4955414925601757e-05, "loss": 0.316, "step": 662 }, { "epoch": 3.7227185146260293, "grad_norm": 0.08583258468324842, "learning_rate": 1.4833280912022834e-05, "loss": 0.3064, "step": 663 }, { "epoch": 3.7283236994219653, "grad_norm": 0.08591690878371737, "learning_rate": 1.4711534031947543e-05, "loss": 0.3194, "step": 664 }, { "epoch": 3.7339288842179017, "grad_norm": 0.08916277837335898, "learning_rate": 1.459017615817846e-05, "loss": 0.3096, "step": 665 }, { "epoch": 3.7395340690138377, "grad_norm": 0.08697008778870499, "learning_rate": 1.4469209157534172e-05, "loss": 0.3042, "step": 666 }, { "epoch": 3.745139253809774, "grad_norm": 0.08513292108575704, "learning_rate": 1.4348634890820554e-05, "loss": 0.3042, "step": 667 }, { "epoch": 3.75074443860571, "grad_norm": 0.09258652498297838, "learning_rate": 1.4228455212802149e-05, "loss": 0.3081, "step": 668 }, { "epoch": 3.7563496234016465, "grad_norm": 0.08525936977928056, "learning_rate": 1.4108671972173644e-05, "loss": 0.3109, "step": 669 }, { "epoch": 3.761954808197583, "grad_norm": 0.08470431515893076, "learning_rate": 1.3989287011531425e-05, "loss": 0.312, "step": 670 }, { "epoch": 3.767559992993519, "grad_norm": 0.09004964227688511, "learning_rate": 1.3870302167345222e-05, "loss": 0.3079, "step": 671 }, { "epoch": 3.7731651777894553, "grad_norm": 0.08825735283568481, "learning_rate": 1.3751719269929908e-05, "loss": 0.3049, "step": 672 }, { "epoch": 3.7787703625853917, "grad_norm": 0.08533456832494438, "learning_rate": 1.3633540143417268e-05, "loss": 0.3046, "step": 673 }, { "epoch": 3.7843755473813276, "grad_norm": 0.09333040420463685, "learning_rate": 1.3515766605727984e-05, "loss": 0.3056, "step": 674 }, { "epoch": 3.789980732177264, "grad_norm": 0.09109816447279726, "learning_rate": 1.3398400468543682e-05, "loss": 0.3138, "step": 675 }, { "epoch": 3.7955859169732005, "grad_norm": 0.08953063350846481, "learning_rate": 1.328144353727903e-05, "loss": 0.3095, "step": 676 }, { "epoch": 3.8011911017691364, "grad_norm": 0.08970232410085, "learning_rate": 1.3164897611053981e-05, "loss": 0.3092, "step": 677 }, { "epoch": 3.806796286565073, "grad_norm": 0.08560717524353414, "learning_rate": 1.3048764482666112e-05, "loss": 0.3068, "step": 678 }, { "epoch": 3.812401471361009, "grad_norm": 0.0870128302022409, "learning_rate": 1.2933045938563012e-05, "loss": 0.3103, "step": 679 }, { "epoch": 3.8180066561569452, "grad_norm": 0.08488653192608872, "learning_rate": 1.281774375881482e-05, "loss": 0.3092, "step": 680 }, { "epoch": 3.823611840952881, "grad_norm": 0.08327605535332809, "learning_rate": 1.2702859717086886e-05, "loss": 0.3077, "step": 681 }, { "epoch": 3.8292170257488176, "grad_norm": 0.082880372307135, "learning_rate": 1.2588395580612392e-05, "loss": 0.3115, "step": 682 }, { "epoch": 3.834822210544754, "grad_norm": 0.08662923231375559, "learning_rate": 1.247435311016525e-05, "loss": 0.3066, "step": 683 }, { "epoch": 3.84042739534069, "grad_norm": 0.08229450378457145, "learning_rate": 1.2360734060032967e-05, "loss": 0.3053, "step": 684 }, { "epoch": 3.8460325801366264, "grad_norm": 0.08665772393719716, "learning_rate": 1.2247540177989695e-05, "loss": 0.3047, "step": 685 }, { "epoch": 3.851637764932563, "grad_norm": 0.08865321374818363, "learning_rate": 1.2134773205269323e-05, "loss": 0.3132, "step": 686 }, { "epoch": 3.857242949728499, "grad_norm": 0.09256990438194068, "learning_rate": 1.2022434876538696e-05, "loss": 0.3098, "step": 687 }, { "epoch": 3.862848134524435, "grad_norm": 0.08775878742002668, "learning_rate": 1.191052691987094e-05, "loss": 0.308, "step": 688 }, { "epoch": 3.8684533193203716, "grad_norm": 0.09049516342589015, "learning_rate": 1.1799051056718844e-05, "loss": 0.308, "step": 689 }, { "epoch": 3.8740585041163076, "grad_norm": 0.0850218500678186, "learning_rate": 1.1688009001888475e-05, "loss": 0.3082, "step": 690 }, { "epoch": 3.8796636889122436, "grad_norm": 0.09015284370086712, "learning_rate": 1.1577402463512652e-05, "loss": 0.3125, "step": 691 }, { "epoch": 3.88526887370818, "grad_norm": 0.08936579824727202, "learning_rate": 1.1467233143024803e-05, "loss": 0.2996, "step": 692 }, { "epoch": 3.8908740585041164, "grad_norm": 0.08641714653740455, "learning_rate": 1.1357502735132715e-05, "loss": 0.3085, "step": 693 }, { "epoch": 3.8964792433000524, "grad_norm": 0.08002468185657431, "learning_rate": 1.1248212927792502e-05, "loss": 0.3074, "step": 694 }, { "epoch": 3.9020844280959888, "grad_norm": 0.08225416168538756, "learning_rate": 1.1139365402182625e-05, "loss": 0.3056, "step": 695 }, { "epoch": 3.907689612891925, "grad_norm": 0.08346850523992327, "learning_rate": 1.1030961832678014e-05, "loss": 0.309, "step": 696 }, { "epoch": 3.913294797687861, "grad_norm": 0.08024780002044651, "learning_rate": 1.0923003886824382e-05, "loss": 0.3063, "step": 697 }, { "epoch": 3.9188999824837976, "grad_norm": 0.09339560984140326, "learning_rate": 1.081549322531247e-05, "loss": 0.3029, "step": 698 }, { "epoch": 3.924505167279734, "grad_norm": 0.08375958744991581, "learning_rate": 1.0708431501952567e-05, "loss": 0.3181, "step": 699 }, { "epoch": 3.93011035207567, "grad_norm": 0.07819955008853643, "learning_rate": 1.060182036364907e-05, "loss": 0.3072, "step": 700 }, { "epoch": 3.9357155368716064, "grad_norm": 0.07835982764160071, "learning_rate": 1.0495661450375114e-05, "loss": 0.302, "step": 701 }, { "epoch": 3.9413207216675423, "grad_norm": 0.08325758169200392, "learning_rate": 1.0389956395147389e-05, "loss": 0.3062, "step": 702 }, { "epoch": 3.9469259064634787, "grad_norm": 0.08333976071773525, "learning_rate": 1.0284706824000983e-05, "loss": 0.3146, "step": 703 }, { "epoch": 3.9525310912594147, "grad_norm": 0.0786546369775888, "learning_rate": 1.0179914355964384e-05, "loss": 0.3099, "step": 704 }, { "epoch": 3.958136276055351, "grad_norm": 0.0826606587187586, "learning_rate": 1.0075580603034569e-05, "loss": 0.3078, "step": 705 }, { "epoch": 3.9637414608512875, "grad_norm": 0.08157885632921875, "learning_rate": 9.971707170152243e-06, "loss": 0.3025, "step": 706 }, { "epoch": 3.9693466456472235, "grad_norm": 0.07569811929437366, "learning_rate": 9.86829565517709e-06, "loss": 0.3052, "step": 707 }, { "epoch": 3.97495183044316, "grad_norm": 0.08433452418842283, "learning_rate": 9.765347648863228e-06, "loss": 0.3103, "step": 708 }, { "epoch": 3.9805570152390963, "grad_norm": 0.08116174303332119, "learning_rate": 9.662864734834736e-06, "loss": 0.3162, "step": 709 }, { "epoch": 3.9861622000350323, "grad_norm": 0.07742272052051664, "learning_rate": 9.560848489561292e-06, "loss": 0.3088, "step": 710 }, { "epoch": 3.9917673848309687, "grad_norm": 0.08299995390154452, "learning_rate": 9.459300482333931e-06, "loss": 0.3133, "step": 711 }, { "epoch": 3.997372569626905, "grad_norm": 0.0834553917977672, "learning_rate": 9.358222275240884e-06, "loss": 0.3094, "step": 712 }, { "epoch": 4.005605184795936, "grad_norm": 0.13513807332008165, "learning_rate": 9.257615423143566e-06, "loss": 0.2945, "step": 713 }, { "epoch": 4.011210369591873, "grad_norm": 0.10621089229052157, "learning_rate": 9.157481473652643e-06, "loss": 0.2885, "step": 714 }, { "epoch": 4.016815554387809, "grad_norm": 0.08714199459949697, "learning_rate": 9.05782196710427e-06, "loss": 0.2958, "step": 715 }, { "epoch": 4.022420739183745, "grad_norm": 0.10547298624299982, "learning_rate": 8.958638436536322e-06, "loss": 0.2904, "step": 716 }, { "epoch": 4.028025923979682, "grad_norm": 0.11824374552084228, "learning_rate": 8.85993240766487e-06, "loss": 0.2939, "step": 717 }, { "epoch": 4.033631108775618, "grad_norm": 0.11166981880398426, "learning_rate": 8.761705398860684e-06, "loss": 0.2879, "step": 718 }, { "epoch": 4.039236293571554, "grad_norm": 0.10093675260999213, "learning_rate": 8.6639589211259e-06, "loss": 0.292, "step": 719 }, { "epoch": 4.0448414783674895, "grad_norm": 0.10073956157160133, "learning_rate": 8.566694478070748e-06, "loss": 0.2849, "step": 720 }, { "epoch": 4.050446663163426, "grad_norm": 0.10476897069362466, "learning_rate": 8.469913565890443e-06, "loss": 0.2929, "step": 721 }, { "epoch": 4.056051847959362, "grad_norm": 0.10098301015704451, "learning_rate": 8.373617673342154e-06, "loss": 0.2915, "step": 722 }, { "epoch": 4.061657032755298, "grad_norm": 0.09703917635173606, "learning_rate": 8.277808281722116e-06, "loss": 0.2899, "step": 723 }, { "epoch": 4.067262217551235, "grad_norm": 0.10034460587837235, "learning_rate": 8.182486864842852e-06, "loss": 0.2917, "step": 724 }, { "epoch": 4.072867402347171, "grad_norm": 0.09179532694160371, "learning_rate": 8.087654889010475e-06, "loss": 0.2892, "step": 725 }, { "epoch": 4.078472587143107, "grad_norm": 0.10753395912690455, "learning_rate": 7.993313813002137e-06, "loss": 0.2905, "step": 726 }, { "epoch": 4.084077771939044, "grad_norm": 0.08822865040425992, "learning_rate": 7.899465088043632e-06, "loss": 0.2861, "step": 727 }, { "epoch": 4.08968295673498, "grad_norm": 0.08713560322111398, "learning_rate": 7.806110157786978e-06, "loss": 0.2903, "step": 728 }, { "epoch": 4.095288141530916, "grad_norm": 0.09170567396266732, "learning_rate": 7.713250458288333e-06, "loss": 0.2901, "step": 729 }, { "epoch": 4.100893326326853, "grad_norm": 0.09585484257486802, "learning_rate": 7.620887417985789e-06, "loss": 0.2887, "step": 730 }, { "epoch": 4.106498511122789, "grad_norm": 0.08818209427141882, "learning_rate": 7.529022457677504e-06, "loss": 0.2924, "step": 731 }, { "epoch": 4.112103695918725, "grad_norm": 0.08443201723287494, "learning_rate": 7.437656990499746e-06, "loss": 0.2863, "step": 732 }, { "epoch": 4.117708880714661, "grad_norm": 0.08873488187304306, "learning_rate": 7.346792421905231e-06, "loss": 0.2938, "step": 733 }, { "epoch": 4.1233140655105975, "grad_norm": 0.08694222494262985, "learning_rate": 7.2564301496414535e-06, "loss": 0.2924, "step": 734 }, { "epoch": 4.1289192503065335, "grad_norm": 0.08256227612309179, "learning_rate": 7.166571563729223e-06, "loss": 0.2917, "step": 735 }, { "epoch": 4.1345244351024695, "grad_norm": 0.08001663820565025, "learning_rate": 7.07721804644125e-06, "loss": 0.285, "step": 736 }, { "epoch": 4.140129619898406, "grad_norm": 0.08149206652526432, "learning_rate": 6.988370972280911e-06, "loss": 0.287, "step": 737 }, { "epoch": 4.145734804694342, "grad_norm": 0.0830237803672675, "learning_rate": 6.900031707961083e-06, "loss": 0.2929, "step": 738 }, { "epoch": 4.151339989490278, "grad_norm": 0.07889881500439108, "learning_rate": 6.812201612383132e-06, "loss": 0.2908, "step": 739 }, { "epoch": 4.156945174286215, "grad_norm": 0.07965629951868874, "learning_rate": 6.724882036615991e-06, "loss": 0.2911, "step": 740 }, { "epoch": 4.162550359082151, "grad_norm": 0.08142470834051524, "learning_rate": 6.638074323875426e-06, "loss": 0.2845, "step": 741 }, { "epoch": 4.168155543878087, "grad_norm": 0.0790024403007617, "learning_rate": 6.551779809503305e-06, "loss": 0.2912, "step": 742 }, { "epoch": 4.173760728674024, "grad_norm": 0.08162794031253676, "learning_rate": 6.465999820947107e-06, "loss": 0.287, "step": 743 }, { "epoch": 4.17936591346996, "grad_norm": 0.08060613968699802, "learning_rate": 6.380735677739474e-06, "loss": 0.2917, "step": 744 }, { "epoch": 4.184971098265896, "grad_norm": 0.0763576924622887, "learning_rate": 6.295988691477939e-06, "loss": 0.2895, "step": 745 }, { "epoch": 4.190576283061832, "grad_norm": 0.07704795721026231, "learning_rate": 6.2117601658047234e-06, "loss": 0.2914, "step": 746 }, { "epoch": 4.196181467857769, "grad_norm": 0.08636531460875918, "learning_rate": 6.128051396386707e-06, "loss": 0.2908, "step": 747 }, { "epoch": 4.201786652653705, "grad_norm": 0.08428531970028809, "learning_rate": 6.044863670895473e-06, "loss": 0.292, "step": 748 }, { "epoch": 4.207391837449641, "grad_norm": 0.07541521932519012, "learning_rate": 5.962198268987514e-06, "loss": 0.2956, "step": 749 }, { "epoch": 4.2129970222455775, "grad_norm": 0.08036165670495596, "learning_rate": 5.880056462284573e-06, "loss": 0.2955, "step": 750 }, { "epoch": 4.2186022070415135, "grad_norm": 0.07962458343055896, "learning_rate": 5.798439514354024e-06, "loss": 0.2904, "step": 751 }, { "epoch": 4.224207391837449, "grad_norm": 0.07557676462990631, "learning_rate": 5.7173486806894804e-06, "loss": 0.2943, "step": 752 }, { "epoch": 4.229812576633386, "grad_norm": 0.07611814912098987, "learning_rate": 5.6367852086914555e-06, "loss": 0.2921, "step": 753 }, { "epoch": 4.235417761429322, "grad_norm": 0.07591915780682665, "learning_rate": 5.556750337648207e-06, "loss": 0.2966, "step": 754 }, { "epoch": 4.241022946225258, "grad_norm": 0.07451638627135487, "learning_rate": 5.477245298716636e-06, "loss": 0.2916, "step": 755 }, { "epoch": 4.246628131021194, "grad_norm": 0.07408998865978028, "learning_rate": 5.398271314903376e-06, "loss": 0.2922, "step": 756 }, { "epoch": 4.252233315817131, "grad_norm": 0.07545427591965101, "learning_rate": 5.3198296010459604e-06, "loss": 0.2894, "step": 757 }, { "epoch": 4.257838500613067, "grad_norm": 0.07784079160936691, "learning_rate": 5.241921363794143e-06, "loss": 0.2899, "step": 758 }, { "epoch": 4.263443685409003, "grad_norm": 0.07434860497601323, "learning_rate": 5.1645478015913556e-06, "loss": 0.2938, "step": 759 }, { "epoch": 4.26904887020494, "grad_norm": 0.07524905441026238, "learning_rate": 5.0877101046562335e-06, "loss": 0.2925, "step": 760 }, { "epoch": 4.274654055000876, "grad_norm": 0.07534375562369973, "learning_rate": 5.011409454964336e-06, "loss": 0.2956, "step": 761 }, { "epoch": 4.280259239796812, "grad_norm": 0.07564823418534802, "learning_rate": 4.935647026229951e-06, "loss": 0.2897, "step": 762 }, { "epoch": 4.285864424592749, "grad_norm": 0.07430360134148153, "learning_rate": 4.860423983888054e-06, "loss": 0.2909, "step": 763 }, { "epoch": 4.291469609388685, "grad_norm": 0.07690448982555907, "learning_rate": 4.785741485076356e-06, "loss": 0.2921, "step": 764 }, { "epoch": 4.297074794184621, "grad_norm": 0.07333540337330373, "learning_rate": 4.711600678617521e-06, "loss": 0.2924, "step": 765 }, { "epoch": 4.302679978980557, "grad_norm": 0.0707515459460907, "learning_rate": 4.6380027050015165e-06, "loss": 0.2897, "step": 766 }, { "epoch": 4.308285163776493, "grad_norm": 0.07535393854931703, "learning_rate": 4.564948696368014e-06, "loss": 0.2941, "step": 767 }, { "epoch": 4.313890348572429, "grad_norm": 0.07963087040482336, "learning_rate": 4.492439776489024e-06, "loss": 0.2928, "step": 768 }, { "epoch": 4.319495533368365, "grad_norm": 0.07387271710468704, "learning_rate": 4.420477060751575e-06, "loss": 0.292, "step": 769 }, { "epoch": 4.325100718164302, "grad_norm": 0.07736925416730672, "learning_rate": 4.349061656140583e-06, "loss": 0.2944, "step": 770 }, { "epoch": 4.330705902960238, "grad_norm": 0.07529231788326105, "learning_rate": 4.278194661221804e-06, "loss": 0.2879, "step": 771 }, { "epoch": 4.336311087756174, "grad_norm": 0.08026949246594235, "learning_rate": 4.207877166124936e-06, "loss": 0.2917, "step": 772 }, { "epoch": 4.341916272552111, "grad_norm": 0.07620416762621339, "learning_rate": 4.138110252526866e-06, "loss": 0.291, "step": 773 }, { "epoch": 4.347521457348047, "grad_norm": 0.07422185414389502, "learning_rate": 4.068894993635009e-06, "loss": 0.2907, "step": 774 }, { "epoch": 4.353126642143983, "grad_norm": 0.07226310970169152, "learning_rate": 4.000232454170827e-06, "loss": 0.2875, "step": 775 }, { "epoch": 4.35873182693992, "grad_norm": 0.07432275041631858, "learning_rate": 3.932123690353425e-06, "loss": 0.2885, "step": 776 }, { "epoch": 4.364337011735856, "grad_norm": 0.0732145587763894, "learning_rate": 3.8645697498833e-06, "loss": 0.298, "step": 777 }, { "epoch": 4.369942196531792, "grad_norm": 0.07336339349758086, "learning_rate": 3.7975716719262522e-06, "loss": 0.2892, "step": 778 }, { "epoch": 4.375547381327728, "grad_norm": 0.07442021970479001, "learning_rate": 3.7311304870973807e-06, "loss": 0.2899, "step": 779 }, { "epoch": 4.3811525661236645, "grad_norm": 0.07353220474247239, "learning_rate": 3.6652472174452337e-06, "loss": 0.2877, "step": 780 }, { "epoch": 4.3867577509196005, "grad_norm": 0.0715587497335768, "learning_rate": 3.599922876436077e-06, "loss": 0.289, "step": 781 }, { "epoch": 4.3923629357155365, "grad_norm": 0.06874108127193815, "learning_rate": 3.535158468938331e-06, "loss": 0.2923, "step": 782 }, { "epoch": 4.397968120511473, "grad_norm": 0.0731981866264841, "learning_rate": 3.4709549912070693e-06, "loss": 0.2895, "step": 783 }, { "epoch": 4.403573305307409, "grad_norm": 0.07134091663650302, "learning_rate": 3.4073134308687574e-06, "loss": 0.2946, "step": 784 }, { "epoch": 4.409178490103345, "grad_norm": 0.07158473138919103, "learning_rate": 3.3442347669059917e-06, "loss": 0.2888, "step": 785 }, { "epoch": 4.414783674899282, "grad_norm": 0.0705765283738734, "learning_rate": 3.2817199696424785e-06, "loss": 0.2921, "step": 786 }, { "epoch": 4.420388859695218, "grad_norm": 0.0713673125862577, "learning_rate": 3.219770000728102e-06, "loss": 0.2897, "step": 787 }, { "epoch": 4.425994044491154, "grad_norm": 0.07163002908771758, "learning_rate": 3.1583858131241274e-06, "loss": 0.2931, "step": 788 }, { "epoch": 4.431599229287091, "grad_norm": 0.07068386839729027, "learning_rate": 3.0975683510885512e-06, "loss": 0.2915, "step": 789 }, { "epoch": 4.437204414083027, "grad_norm": 0.06828223728970015, "learning_rate": 3.0373185501615655e-06, "loss": 0.2863, "step": 790 }, { "epoch": 4.442809598878963, "grad_norm": 0.06925305783885854, "learning_rate": 2.97763733715116e-06, "loss": 0.286, "step": 791 }, { "epoch": 4.448414783674899, "grad_norm": 0.07093825781423818, "learning_rate": 2.9185256301188782e-06, "loss": 0.2918, "step": 792 }, { "epoch": 4.454019968470836, "grad_norm": 0.07292024935590449, "learning_rate": 2.8599843383657178e-06, "loss": 0.2849, "step": 793 }, { "epoch": 4.459625153266772, "grad_norm": 0.07136882672592504, "learning_rate": 2.8020143624180796e-06, "loss": 0.2941, "step": 794 }, { "epoch": 4.465230338062708, "grad_norm": 0.07262486143198338, "learning_rate": 2.744616594013976e-06, "loss": 0.29, "step": 795 }, { "epoch": 4.4708355228586445, "grad_norm": 0.07044266557562419, "learning_rate": 2.6877919160892817e-06, "loss": 0.286, "step": 796 }, { "epoch": 4.4764407076545805, "grad_norm": 0.07076171900766302, "learning_rate": 2.631541202764161e-06, "loss": 0.2913, "step": 797 }, { "epoch": 4.482045892450516, "grad_norm": 0.07266466034845562, "learning_rate": 2.5758653193296244e-06, "loss": 0.2966, "step": 798 }, { "epoch": 4.487651077246453, "grad_norm": 0.06891412203515149, "learning_rate": 2.520765122234212e-06, "loss": 0.2938, "step": 799 }, { "epoch": 4.493256262042389, "grad_norm": 0.06927383792607966, "learning_rate": 2.4662414590708216e-06, "loss": 0.2886, "step": 800 }, { "epoch": 4.498861446838325, "grad_norm": 0.06986747799982956, "learning_rate": 2.4122951685636674e-06, "loss": 0.2895, "step": 801 }, { "epoch": 4.504466631634262, "grad_norm": 0.07085917182268424, "learning_rate": 2.3589270805553842e-06, "loss": 0.293, "step": 802 }, { "epoch": 4.510071816430198, "grad_norm": 0.07134101177324086, "learning_rate": 2.3061380159942593e-06, "loss": 0.2919, "step": 803 }, { "epoch": 4.515677001226134, "grad_norm": 0.07152411549915325, "learning_rate": 2.2539287869215974e-06, "loss": 0.2909, "step": 804 }, { "epoch": 4.52128218602207, "grad_norm": 0.0709029110171059, "learning_rate": 2.2023001964592485e-06, "loss": 0.2909, "step": 805 }, { "epoch": 4.526887370818007, "grad_norm": 0.07123396088624603, "learning_rate": 2.1512530387972187e-06, "loss": 0.2894, "step": 806 }, { "epoch": 4.532492555613943, "grad_norm": 0.07087391238341717, "learning_rate": 2.100788099181501e-06, "loss": 0.292, "step": 807 }, { "epoch": 4.538097740409879, "grad_norm": 0.07244705960873961, "learning_rate": 2.050906153901946e-06, "loss": 0.2886, "step": 808 }, { "epoch": 4.543702925205816, "grad_norm": 0.0702625208879413, "learning_rate": 2.0016079702803683e-06, "loss": 0.2912, "step": 809 }, { "epoch": 4.549308110001752, "grad_norm": 0.07025143510925803, "learning_rate": 1.9528943066586993e-06, "loss": 0.2859, "step": 810 }, { "epoch": 4.554913294797688, "grad_norm": 0.06797065457217236, "learning_rate": 1.9047659123873475e-06, "loss": 0.2897, "step": 811 }, { "epoch": 4.560518479593624, "grad_norm": 0.06967626315727225, "learning_rate": 1.8572235278136741e-06, "loss": 0.2896, "step": 812 }, { "epoch": 4.56612366438956, "grad_norm": 0.06946061969739742, "learning_rate": 1.81026788427058e-06, "loss": 0.2919, "step": 813 }, { "epoch": 4.571728849185496, "grad_norm": 0.06956148278872504, "learning_rate": 1.7638997040652928e-06, "loss": 0.2865, "step": 814 }, { "epoch": 4.577334033981433, "grad_norm": 0.06903042275149272, "learning_rate": 1.7181197004682148e-06, "loss": 0.2987, "step": 815 }, { "epoch": 4.582939218777369, "grad_norm": 0.07027061783424711, "learning_rate": 1.6729285777019776e-06, "loss": 0.2943, "step": 816 }, { "epoch": 4.588544403573305, "grad_norm": 0.07497684616923145, "learning_rate": 1.6283270309306098e-06, "loss": 0.2954, "step": 817 }, { "epoch": 4.594149588369241, "grad_norm": 0.06928091037284238, "learning_rate": 1.58431574624883e-06, "loss": 0.2978, "step": 818 }, { "epoch": 4.599754773165178, "grad_norm": 0.06987516073546424, "learning_rate": 1.5408954006715004e-06, "loss": 0.2906, "step": 819 }, { "epoch": 4.605359957961114, "grad_norm": 0.0679280004765429, "learning_rate": 1.4980666621232076e-06, "loss": 0.2913, "step": 820 }, { "epoch": 4.61096514275705, "grad_norm": 0.06872535211929137, "learning_rate": 1.4558301894279958e-06, "loss": 0.2929, "step": 821 }, { "epoch": 4.616570327552987, "grad_norm": 0.07416106638882912, "learning_rate": 1.4141866322992237e-06, "loss": 0.2965, "step": 822 }, { "epoch": 4.622175512348923, "grad_norm": 0.06811896916091317, "learning_rate": 1.3731366313295858e-06, "loss": 0.2885, "step": 823 }, { "epoch": 4.627780697144859, "grad_norm": 0.0695579136671645, "learning_rate": 1.3326808179812266e-06, "loss": 0.291, "step": 824 }, { "epoch": 4.633385881940796, "grad_norm": 0.06863496179473963, "learning_rate": 1.292819814576065e-06, "loss": 0.294, "step": 825 }, { "epoch": 4.638991066736732, "grad_norm": 0.0705807081382788, "learning_rate": 1.253554234286196e-06, "loss": 0.2904, "step": 826 }, { "epoch": 4.6445962515326675, "grad_norm": 0.06663794364232753, "learning_rate": 1.214884681124473e-06, "loss": 0.2925, "step": 827 }, { "epoch": 4.650201436328604, "grad_norm": 0.06891427767384369, "learning_rate": 1.1768117499351983e-06, "loss": 0.2947, "step": 828 }, { "epoch": 4.65580662112454, "grad_norm": 0.06804227143364437, "learning_rate": 1.1393360263849895e-06, "loss": 0.296, "step": 829 }, { "epoch": 4.661411805920476, "grad_norm": 0.06729466113720775, "learning_rate": 1.1024580869537682e-06, "loss": 0.2895, "step": 830 }, { "epoch": 4.667016990716412, "grad_norm": 0.06981525289632151, "learning_rate": 1.0661784989258784e-06, "loss": 0.2952, "step": 831 }, { "epoch": 4.672622175512349, "grad_norm": 0.06887481964028351, "learning_rate": 1.0304978203813864e-06, "loss": 0.2942, "step": 832 }, { "epoch": 4.678227360308285, "grad_norm": 0.06799466764149216, "learning_rate": 9.954166001874665e-07, "loss": 0.2911, "step": 833 }, { "epoch": 4.683832545104221, "grad_norm": 0.06713373354236589, "learning_rate": 9.609353779899711e-07, "loss": 0.2937, "step": 834 }, { "epoch": 4.689437729900158, "grad_norm": 0.06801377589396243, "learning_rate": 9.270546842051398e-07, "loss": 0.2917, "step": 835 }, { "epoch": 4.695042914696094, "grad_norm": 0.06877475417334285, "learning_rate": 8.937750400114243e-07, "loss": 0.2951, "step": 836 }, { "epoch": 4.70064809949203, "grad_norm": 0.0682963203858977, "learning_rate": 8.610969573414762e-07, "loss": 0.2851, "step": 837 }, { "epoch": 4.706253284287967, "grad_norm": 0.06874191377961358, "learning_rate": 8.290209388742698e-07, "loss": 0.2923, "step": 838 }, { "epoch": 4.711858469083903, "grad_norm": 0.06818766825685828, "learning_rate": 7.975474780273828e-07, "loss": 0.2903, "step": 839 }, { "epoch": 4.717463653879839, "grad_norm": 0.06850750083385908, "learning_rate": 7.666770589493854e-07, "loss": 0.2912, "step": 840 }, { "epoch": 4.7230688386757755, "grad_norm": 0.06669019358923452, "learning_rate": 7.36410156512406e-07, "loss": 0.2886, "step": 841 }, { "epoch": 4.7286740234717115, "grad_norm": 0.0685257652819839, "learning_rate": 7.0674723630483e-07, "loss": 0.2955, "step": 842 }, { "epoch": 4.7342792082676475, "grad_norm": 0.06869894321372189, "learning_rate": 6.776887546241196e-07, "loss": 0.2894, "step": 843 }, { "epoch": 4.7398843930635834, "grad_norm": 0.0658977357878887, "learning_rate": 6.492351584698231e-07, "loss": 0.29, "step": 844 }, { "epoch": 4.74548957785952, "grad_norm": 0.06766778529150386, "learning_rate": 6.213868855366656e-07, "loss": 0.2919, "step": 845 }, { "epoch": 4.751094762655456, "grad_norm": 0.06789970654207798, "learning_rate": 5.94144364207847e-07, "loss": 0.2848, "step": 846 }, { "epoch": 4.756699947451392, "grad_norm": 0.06763905592382854, "learning_rate": 5.675080135484212e-07, "loss": 0.2919, "step": 847 }, { "epoch": 4.762305132247329, "grad_norm": 0.0673637188944257, "learning_rate": 5.41478243298883e-07, "loss": 0.2895, "step": 848 }, { "epoch": 4.767910317043265, "grad_norm": 0.06599718582014574, "learning_rate": 5.160554538688356e-07, "loss": 0.2866, "step": 849 }, { "epoch": 4.773515501839201, "grad_norm": 0.06680161975531686, "learning_rate": 4.912400363308534e-07, "loss": 0.2905, "step": 850 }, { "epoch": 4.779120686635137, "grad_norm": 0.06774042383141317, "learning_rate": 4.670323724144599e-07, "loss": 0.29, "step": 851 }, { "epoch": 4.784725871431074, "grad_norm": 0.06766365842045775, "learning_rate": 4.434328345002348e-07, "loss": 0.2893, "step": 852 }, { "epoch": 4.79033105622701, "grad_norm": 0.0669757827435882, "learning_rate": 4.204417856141252e-07, "loss": 0.2934, "step": 853 }, { "epoch": 4.795936241022947, "grad_norm": 0.06783350979091353, "learning_rate": 3.980595794218278e-07, "loss": 0.2933, "step": 854 }, { "epoch": 4.801541425818883, "grad_norm": 0.06756472881490112, "learning_rate": 3.762865602233623e-07, "loss": 0.2938, "step": 855 }, { "epoch": 4.807146610614819, "grad_norm": 0.06573406645601809, "learning_rate": 3.551230629477731e-07, "loss": 0.2838, "step": 856 }, { "epoch": 4.812751795410755, "grad_norm": 0.06500971296841233, "learning_rate": 3.3456941314798264e-07, "loss": 0.2858, "step": 857 }, { "epoch": 4.8183569802066915, "grad_norm": 0.06666567495190365, "learning_rate": 3.14625926995773e-07, "loss": 0.2928, "step": 858 }, { "epoch": 4.823962165002627, "grad_norm": 0.06842156859320352, "learning_rate": 2.9529291127693204e-07, "loss": 0.2978, "step": 859 }, { "epoch": 4.829567349798563, "grad_norm": 0.06770694825005188, "learning_rate": 2.765706633865195e-07, "loss": 0.2937, "step": 860 }, { "epoch": 4.8351725345945, "grad_norm": 0.06561834235318093, "learning_rate": 2.584594713243105e-07, "loss": 0.2885, "step": 861 }, { "epoch": 4.840777719390436, "grad_norm": 0.06847439054641216, "learning_rate": 2.409596136903636e-07, "loss": 0.2934, "step": 862 }, { "epoch": 4.846382904186372, "grad_norm": 0.06925202861994563, "learning_rate": 2.2407135968072203e-07, "loss": 0.2912, "step": 863 }, { "epoch": 4.851988088982308, "grad_norm": 0.06568658183290636, "learning_rate": 2.0779496908327034e-07, "loss": 0.2865, "step": 864 }, { "epoch": 4.857593273778245, "grad_norm": 0.06527850349185818, "learning_rate": 1.9213069227376423e-07, "loss": 0.285, "step": 865 }, { "epoch": 4.863198458574181, "grad_norm": 0.06537692688098144, "learning_rate": 1.7707877021195364e-07, "loss": 0.2893, "step": 866 }, { "epoch": 4.868803643370118, "grad_norm": 0.0673736030444187, "learning_rate": 1.6263943443788344e-07, "loss": 0.2929, "step": 867 }, { "epoch": 4.874408828166054, "grad_norm": 0.06944665125489001, "learning_rate": 1.488129070683364e-07, "loss": 0.2891, "step": 868 }, { "epoch": 4.88001401296199, "grad_norm": 0.06656997632932934, "learning_rate": 1.355994007934136e-07, "loss": 0.2897, "step": 869 }, { "epoch": 4.885619197757926, "grad_norm": 0.06760741888344543, "learning_rate": 1.229991188732571e-07, "loss": 0.2976, "step": 870 }, { "epoch": 4.891224382553863, "grad_norm": 0.06541550926939739, "learning_rate": 1.1101225513493685e-07, "loss": 0.2867, "step": 871 }, { "epoch": 4.896829567349799, "grad_norm": 0.06563519047490654, "learning_rate": 9.963899396944865e-08, "loss": 0.2908, "step": 872 }, { "epoch": 4.9024347521457345, "grad_norm": 0.0667333344033991, "learning_rate": 8.887951032889863e-08, "loss": 0.2927, "step": 873 }, { "epoch": 4.908039936941671, "grad_norm": 0.06534766168074065, "learning_rate": 7.873396972379876e-08, "loss": 0.287, "step": 874 }, { "epoch": 4.913645121737607, "grad_norm": 0.06573554633063639, "learning_rate": 6.920252822053109e-08, "loss": 0.2935, "step": 875 }, { "epoch": 4.919250306533543, "grad_norm": 0.06692957578454926, "learning_rate": 6.028533243893186e-08, "loss": 0.297, "step": 876 }, { "epoch": 4.924855491329479, "grad_norm": 0.06696022392708174, "learning_rate": 5.19825195500534e-08, "loss": 0.2933, "step": 877 }, { "epoch": 4.930460676125416, "grad_norm": 0.06551574218774454, "learning_rate": 4.429421727403682e-08, "loss": 0.2934, "step": 878 }, { "epoch": 4.936065860921352, "grad_norm": 0.06734924333505671, "learning_rate": 3.722054387816698e-08, "loss": 0.2887, "step": 879 }, { "epoch": 4.941671045717289, "grad_norm": 0.06685929779911977, "learning_rate": 3.076160817503393e-08, "loss": 0.2919, "step": 880 }, { "epoch": 4.947276230513225, "grad_norm": 0.0657094812782885, "learning_rate": 2.491750952087202e-08, "loss": 0.2899, "step": 881 }, { "epoch": 4.952881415309161, "grad_norm": 0.06618212935511, "learning_rate": 1.968833781402335e-08, "loss": 0.2934, "step": 882 }, { "epoch": 4.958486600105097, "grad_norm": 0.06522788950148393, "learning_rate": 1.5074173493565548e-08, "loss": 0.2889, "step": 883 }, { "epoch": 4.964091784901034, "grad_norm": 0.06617327426574475, "learning_rate": 1.1075087538059415e-08, "loss": 0.2869, "step": 884 }, { "epoch": 4.96969696969697, "grad_norm": 0.06616131705312475, "learning_rate": 7.69114146446981e-09, "loss": 0.2862, "step": 885 }, { "epoch": 4.975302154492906, "grad_norm": 0.06628519866860673, "learning_rate": 4.922387327219724e-09, "loss": 0.2946, "step": 886 }, { "epoch": 4.9809073392888426, "grad_norm": 0.06516030532915469, "learning_rate": 2.7688677173687285e-09, "loss": 0.2916, "step": 887 }, { "epoch": 4.9865125240847785, "grad_norm": 0.06490676692933119, "learning_rate": 1.2306157619956793e-09, "loss": 0.2929, "step": 888 }, { "epoch": 4.9921177088807145, "grad_norm": 0.06796682705221838, "learning_rate": 3.0765512364361317e-10, "loss": 0.2958, "step": 889 }, { "epoch": 4.9977228936766505, "grad_norm": 0.06629486855855717, "learning_rate": 0.0, "loss": 0.2924, "step": 890 }, { "epoch": 4.9977228936766505, "step": 890, "total_flos": 2.367212535366969e+19, "train_loss": 0.0, "train_runtime": 1.7461, "train_samples_per_second": 261530.086, "train_steps_per_second": 509.716 } ], "logging_steps": 1, "max_steps": 890, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.367212535366969e+19, "train_batch_size": 1, "trial_name": null, "trial_params": null }