{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 4.9977228936766505,
  "eval_steps": 500,
  "global_step": 890,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.005605184795936241,
      "grad_norm": 5.90385013851782,
      "learning_rate": 8.98876404494382e-07,
      "loss": 0.807,
      "step": 1
    },
    {
      "epoch": 0.011210369591872483,
      "grad_norm": 6.006366750773737,
      "learning_rate": 1.797752808988764e-06,
      "loss": 0.8236,
      "step": 2
    },
    {
      "epoch": 0.016815554387808723,
      "grad_norm": 5.925343792163714,
      "learning_rate": 2.696629213483146e-06,
      "loss": 0.8109,
      "step": 3
    },
    {
      "epoch": 0.022420739183744966,
      "grad_norm": 5.527952797665692,
      "learning_rate": 3.595505617977528e-06,
      "loss": 0.7994,
      "step": 4
    },
    {
      "epoch": 0.028025923979681205,
      "grad_norm": 4.373958309383871,
      "learning_rate": 4.494382022471911e-06,
      "loss": 0.7563,
      "step": 5
    },
    {
      "epoch": 0.033631108775617445,
      "grad_norm": 2.4658304323036786,
      "learning_rate": 5.393258426966292e-06,
      "loss": 0.713,
      "step": 6
    },
    {
      "epoch": 0.039236293571553685,
      "grad_norm": 2.047561880721445,
      "learning_rate": 6.292134831460674e-06,
      "loss": 0.7011,
      "step": 7
    },
    {
      "epoch": 0.04484147836748993,
      "grad_norm": 4.026532746780989,
      "learning_rate": 7.191011235955056e-06,
      "loss": 0.7074,
      "step": 8
    },
    {
      "epoch": 0.05044666316342617,
      "grad_norm": 4.325745201316262,
      "learning_rate": 8.08988764044944e-06,
      "loss": 0.6906,
      "step": 9
    },
    {
      "epoch": 0.05605184795936241,
      "grad_norm": 3.8759783016193707,
      "learning_rate": 8.988764044943822e-06,
      "loss": 0.6496,
      "step": 10
    },
    {
      "epoch": 0.06165703275529865,
      "grad_norm": 4.428668216367442,
      "learning_rate": 9.887640449438202e-06,
      "loss": 0.6512,
      "step": 11
    },
    {
      "epoch": 0.06726221755123489,
      "grad_norm": 3.492900535526824,
      "learning_rate": 1.0786516853932584e-05,
      "loss": 0.6363,
      "step": 12
    },
    {
      "epoch": 0.07286740234717114,
      "grad_norm": 2.1273856381054657,
      "learning_rate": 1.1685393258426966e-05,
      "loss": 0.6132,
      "step": 13
    },
    {
      "epoch": 0.07847258714310737,
      "grad_norm": 2.1122501992949747,
      "learning_rate": 1.2584269662921348e-05,
      "loss": 0.5844,
      "step": 14
    },
    {
      "epoch": 0.08407777193904362,
      "grad_norm": 2.5622529515656667,
      "learning_rate": 1.348314606741573e-05,
      "loss": 0.5948,
      "step": 15
    },
    {
      "epoch": 0.08968295673497986,
      "grad_norm": 1.8958213967675108,
      "learning_rate": 1.4382022471910113e-05,
      "loss": 0.5727,
      "step": 16
    },
    {
      "epoch": 0.0952881415309161,
      "grad_norm": 0.9617068848658947,
      "learning_rate": 1.5280898876404495e-05,
      "loss": 0.5603,
      "step": 17
    },
    {
      "epoch": 0.10089332632685234,
      "grad_norm": 1.2698908465747545,
      "learning_rate": 1.617977528089888e-05,
      "loss": 0.5606,
      "step": 18
    },
    {
      "epoch": 0.10649851112278858,
      "grad_norm": 1.0006685793638639,
      "learning_rate": 1.707865168539326e-05,
      "loss": 0.5489,
      "step": 19
    },
    {
      "epoch": 0.11210369591872482,
      "grad_norm": 0.7031106693279928,
      "learning_rate": 1.7977528089887643e-05,
      "loss": 0.5355,
      "step": 20
    },
    {
      "epoch": 0.11770888071466107,
      "grad_norm": 0.9009903971329004,
      "learning_rate": 1.8876404494382024e-05,
      "loss": 0.5292,
      "step": 21
    },
    {
      "epoch": 0.1233140655105973,
      "grad_norm": 0.6766379367642102,
      "learning_rate": 1.9775280898876404e-05,
      "loss": 0.5199,
      "step": 22
    },
    {
      "epoch": 0.12891925030653353,
      "grad_norm": 0.6738985046630845,
      "learning_rate": 2.067415730337079e-05,
      "loss": 0.5161,
      "step": 23
    },
    {
      "epoch": 0.13452443510246978,
      "grad_norm": 0.6180587250422095,
      "learning_rate": 2.1573033707865168e-05,
      "loss": 0.5061,
      "step": 24
    },
    {
      "epoch": 0.14012961989840603,
      "grad_norm": 0.4834626230192744,
      "learning_rate": 2.2471910112359556e-05,
      "loss": 0.5139,
      "step": 25
    },
    {
      "epoch": 0.14573480469434227,
      "grad_norm": 0.5878637275212983,
      "learning_rate": 2.3370786516853933e-05,
      "loss": 0.5027,
      "step": 26
    },
    {
      "epoch": 0.15133998949027852,
      "grad_norm": 0.5004842186398177,
      "learning_rate": 2.426966292134832e-05,
      "loss": 0.5011,
      "step": 27
    },
    {
      "epoch": 0.15694517428621474,
      "grad_norm": 0.5254669056909826,
      "learning_rate": 2.5168539325842697e-05,
      "loss": 0.4909,
      "step": 28
    },
    {
      "epoch": 0.16255035908215099,
      "grad_norm": 0.4765392239552605,
      "learning_rate": 2.606741573033708e-05,
      "loss": 0.4984,
      "step": 29
    },
    {
      "epoch": 0.16815554387808723,
      "grad_norm": 0.46499584686266865,
      "learning_rate": 2.696629213483146e-05,
      "loss": 0.4831,
      "step": 30
    },
    {
      "epoch": 0.17376072867402348,
      "grad_norm": 0.4681038310213469,
      "learning_rate": 2.7865168539325845e-05,
      "loss": 0.4868,
      "step": 31
    },
    {
      "epoch": 0.17936591346995973,
      "grad_norm": 0.5306865190068607,
      "learning_rate": 2.8764044943820226e-05,
      "loss": 0.4799,
      "step": 32
    },
    {
      "epoch": 0.18497109826589594,
      "grad_norm": 0.5310145854714095,
      "learning_rate": 2.966292134831461e-05,
      "loss": 0.4773,
      "step": 33
    },
    {
      "epoch": 0.1905762830618322,
      "grad_norm": 0.761548939341114,
      "learning_rate": 3.056179775280899e-05,
      "loss": 0.4812,
      "step": 34
    },
    {
      "epoch": 0.19618146785776844,
      "grad_norm": 1.200005749726904,
      "learning_rate": 3.1460674157303374e-05,
      "loss": 0.4911,
      "step": 35
    },
    {
      "epoch": 0.20178665265370468,
      "grad_norm": 0.8174168347705099,
      "learning_rate": 3.235955056179776e-05,
      "loss": 0.467,
      "step": 36
    },
    {
      "epoch": 0.20739183744964093,
      "grad_norm": 0.4081677644510946,
      "learning_rate": 3.325842696629214e-05,
      "loss": 0.4635,
      "step": 37
    },
    {
      "epoch": 0.21299702224557715,
      "grad_norm": 0.6329076432258968,
      "learning_rate": 3.415730337078652e-05,
      "loss": 0.4709,
      "step": 38
    },
    {
      "epoch": 0.2186022070415134,
      "grad_norm": 1.0112197078534875,
      "learning_rate": 3.50561797752809e-05,
      "loss": 0.4696,
      "step": 39
    },
    {
      "epoch": 0.22420739183744964,
      "grad_norm": 0.9221867209391528,
      "learning_rate": 3.5955056179775286e-05,
      "loss": 0.4611,
      "step": 40
    },
    {
      "epoch": 0.2298125766333859,
      "grad_norm": 0.84837092144476,
      "learning_rate": 3.685393258426967e-05,
      "loss": 0.474,
      "step": 41
    },
    {
      "epoch": 0.23541776142932214,
      "grad_norm": 1.1853308215342242,
      "learning_rate": 3.775280898876405e-05,
      "loss": 0.4678,
      "step": 42
    },
    {
      "epoch": 0.24102294622525836,
      "grad_norm": 0.5210063275074145,
      "learning_rate": 3.865168539325843e-05,
      "loss": 0.4562,
      "step": 43
    },
    {
      "epoch": 0.2466281310211946,
      "grad_norm": 0.908397332328363,
      "learning_rate": 3.955056179775281e-05,
      "loss": 0.4674,
      "step": 44
    },
    {
      "epoch": 0.2522333158171308,
      "grad_norm": 1.1080166716443598,
      "learning_rate": 4.04494382022472e-05,
      "loss": 0.4508,
      "step": 45
    },
    {
      "epoch": 0.25783850061306707,
      "grad_norm": 0.45360566647654976,
      "learning_rate": 4.134831460674158e-05,
      "loss": 0.4509,
      "step": 46
    },
    {
      "epoch": 0.2634436854090033,
      "grad_norm": 0.9861729575846025,
      "learning_rate": 4.224719101123595e-05,
      "loss": 0.4551,
      "step": 47
    },
    {
      "epoch": 0.26904887020493956,
      "grad_norm": 0.46277631804738467,
      "learning_rate": 4.3146067415730337e-05,
      "loss": 0.446,
      "step": 48
    },
    {
      "epoch": 0.2746540550008758,
      "grad_norm": 0.7678134928965062,
      "learning_rate": 4.404494382022472e-05,
      "loss": 0.4518,
      "step": 49
    },
    {
      "epoch": 0.28025923979681205,
      "grad_norm": 0.6470876222514552,
      "learning_rate": 4.494382022471911e-05,
      "loss": 0.4545,
      "step": 50
    },
    {
      "epoch": 0.2858644245927483,
      "grad_norm": 0.6677697699594126,
      "learning_rate": 4.584269662921348e-05,
      "loss": 0.4542,
      "step": 51
    },
    {
      "epoch": 0.29146960938868455,
      "grad_norm": 0.9005318621831405,
      "learning_rate": 4.6741573033707865e-05,
      "loss": 0.4472,
      "step": 52
    },
    {
      "epoch": 0.2970747941846208,
      "grad_norm": 1.0135754268066979,
      "learning_rate": 4.764044943820225e-05,
      "loss": 0.4449,
      "step": 53
    },
    {
      "epoch": 0.30267997898055704,
      "grad_norm": 1.3120984053765912,
      "learning_rate": 4.853932584269664e-05,
      "loss": 0.4514,
      "step": 54
    },
    {
      "epoch": 0.30828516377649323,
      "grad_norm": 0.9125344012381023,
      "learning_rate": 4.943820224719101e-05,
      "loss": 0.448,
      "step": 55
    },
    {
      "epoch": 0.3138903485724295,
      "grad_norm": 1.5065061920138354,
      "learning_rate": 5.0337078651685394e-05,
      "loss": 0.4513,
      "step": 56
    },
    {
      "epoch": 0.3194955333683657,
      "grad_norm": 0.7122548779299299,
      "learning_rate": 5.123595505617978e-05,
      "loss": 0.4447,
      "step": 57
    },
    {
      "epoch": 0.32510071816430197,
      "grad_norm": 1.3540801647141738,
      "learning_rate": 5.213483146067416e-05,
      "loss": 0.4556,
      "step": 58
    },
    {
      "epoch": 0.3307059029602382,
      "grad_norm": 0.8323407754065357,
      "learning_rate": 5.303370786516854e-05,
      "loss": 0.4345,
      "step": 59
    },
    {
      "epoch": 0.33631108775617446,
      "grad_norm": 1.116821907282998,
      "learning_rate": 5.393258426966292e-05,
      "loss": 0.4491,
      "step": 60
    },
    {
      "epoch": 0.3419162725521107,
      "grad_norm": 1.0824014046607489,
      "learning_rate": 5.4831460674157306e-05,
      "loss": 0.4472,
      "step": 61
    },
    {
      "epoch": 0.34752145734804696,
      "grad_norm": 0.9265407958585422,
      "learning_rate": 5.573033707865169e-05,
      "loss": 0.4362,
      "step": 62
    },
    {
      "epoch": 0.3531266421439832,
      "grad_norm": 0.8981510798852507,
      "learning_rate": 5.662921348314607e-05,
      "loss": 0.453,
      "step": 63
    },
    {
      "epoch": 0.35873182693991945,
      "grad_norm": 0.9858658036066037,
      "learning_rate": 5.752808988764045e-05,
      "loss": 0.4428,
      "step": 64
    },
    {
      "epoch": 0.36433701173585564,
      "grad_norm": 1.2513865798978006,
      "learning_rate": 5.8426966292134835e-05,
      "loss": 0.4428,
      "step": 65
    },
    {
      "epoch": 0.3699421965317919,
      "grad_norm": 0.9397639573210418,
      "learning_rate": 5.932584269662922e-05,
      "loss": 0.4324,
      "step": 66
    },
    {
      "epoch": 0.37554738132772814,
      "grad_norm": 1.4017380301394493,
      "learning_rate": 6.0224719101123596e-05,
      "loss": 0.45,
      "step": 67
    },
    {
      "epoch": 0.3811525661236644,
      "grad_norm": 1.0777658678202968,
      "learning_rate": 6.112359550561798e-05,
      "loss": 0.4401,
      "step": 68
    },
    {
      "epoch": 0.38675775091960063,
      "grad_norm": 0.912558239818085,
      "learning_rate": 6.202247191011237e-05,
      "loss": 0.4351,
      "step": 69
    },
    {
      "epoch": 0.3923629357155369,
      "grad_norm": 1.2273797098955324,
      "learning_rate": 6.292134831460675e-05,
      "loss": 0.4516,
      "step": 70
    },
    {
      "epoch": 0.3979681205114731,
      "grad_norm": 1.1806852980385731,
      "learning_rate": 6.382022471910112e-05,
      "loss": 0.4442,
      "step": 71
    },
    {
      "epoch": 0.40357330530740937,
      "grad_norm": 1.1616577982500746,
      "learning_rate": 6.471910112359552e-05,
      "loss": 0.4342,
      "step": 72
    },
    {
      "epoch": 0.4091784901033456,
      "grad_norm": 0.8390636237723854,
      "learning_rate": 6.561797752808989e-05,
      "loss": 0.427,
      "step": 73
    },
    {
      "epoch": 0.41478367489928186,
      "grad_norm": 0.8499496288682458,
      "learning_rate": 6.651685393258428e-05,
      "loss": 0.4379,
      "step": 74
    },
    {
      "epoch": 0.42038885969521805,
      "grad_norm": 1.17937338360059,
      "learning_rate": 6.741573033707866e-05,
      "loss": 0.4291,
      "step": 75
    },
    {
      "epoch": 0.4259940444911543,
      "grad_norm": 0.7821897701029938,
      "learning_rate": 6.831460674157304e-05,
      "loss": 0.4351,
      "step": 76
    },
    {
      "epoch": 0.43159922928709055,
      "grad_norm": 0.8440055305481178,
      "learning_rate": 6.921348314606743e-05,
      "loss": 0.4304,
      "step": 77
    },
    {
      "epoch": 0.4372044140830268,
      "grad_norm": 0.7526712919502756,
      "learning_rate": 7.01123595505618e-05,
      "loss": 0.4366,
      "step": 78
    },
    {
      "epoch": 0.44280959887896304,
      "grad_norm": 1.1114429881926073,
      "learning_rate": 7.101123595505618e-05,
      "loss": 0.4326,
      "step": 79
    },
    {
      "epoch": 0.4484147836748993,
      "grad_norm": 0.7746430179490161,
      "learning_rate": 7.191011235955057e-05,
      "loss": 0.4336,
      "step": 80
    },
    {
      "epoch": 0.45401996847083553,
      "grad_norm": 0.8505120829834041,
      "learning_rate": 7.280898876404495e-05,
      "loss": 0.4352,
      "step": 81
    },
    {
      "epoch": 0.4596251532667718,
      "grad_norm": 1.4415361142111385,
      "learning_rate": 7.370786516853934e-05,
      "loss": 0.4385,
      "step": 82
    },
    {
      "epoch": 0.465230338062708,
      "grad_norm": 0.7475789995240804,
      "learning_rate": 7.46067415730337e-05,
      "loss": 0.431,
      "step": 83
    },
    {
      "epoch": 0.4708355228586443,
      "grad_norm": 0.9854738368310488,
      "learning_rate": 7.55056179775281e-05,
      "loss": 0.4327,
      "step": 84
    },
    {
      "epoch": 0.47644070765458046,
      "grad_norm": 1.4375947776402878,
      "learning_rate": 7.640449438202248e-05,
      "loss": 0.4355,
      "step": 85
    },
    {
      "epoch": 0.4820458924505167,
      "grad_norm": 0.7290469112827799,
      "learning_rate": 7.730337078651686e-05,
      "loss": 0.4335,
      "step": 86
    },
    {
      "epoch": 0.48765107724645296,
      "grad_norm": 1.1019633173104773,
      "learning_rate": 7.820224719101124e-05,
      "loss": 0.4303,
      "step": 87
    },
    {
      "epoch": 0.4932562620423892,
      "grad_norm": 0.9810209491585931,
      "learning_rate": 7.910112359550562e-05,
      "loss": 0.4296,
      "step": 88
    },
    {
      "epoch": 0.49886144683832545,
      "grad_norm": 0.8966074291671375,
      "learning_rate": 8e-05,
      "loss": 0.4338,
      "step": 89
    },
    {
      "epoch": 0.5044666316342616,
      "grad_norm": 1.0809659461252454,
      "learning_rate": 7.999969234487637e-05,
      "loss": 0.4323,
      "step": 90
    },
    {
      "epoch": 0.5100718164301979,
      "grad_norm": 1.44533907238761,
      "learning_rate": 7.999876938423802e-05,
      "loss": 0.4436,
      "step": 91
    },
    {
      "epoch": 0.5156770012261341,
      "grad_norm": 0.7701781299751447,
      "learning_rate": 7.999723113228264e-05,
      "loss": 0.4406,
      "step": 92
    },
    {
      "epoch": 0.5212821860220704,
      "grad_norm": 0.9804243571239605,
      "learning_rate": 7.999507761267278e-05,
      "loss": 0.4245,
      "step": 93
    },
    {
      "epoch": 0.5268873708180066,
      "grad_norm": 1.218580909562173,
      "learning_rate": 7.999230885853554e-05,
      "loss": 0.444,
      "step": 94
    },
    {
      "epoch": 0.5324925556139429,
      "grad_norm": 0.6868435592682877,
      "learning_rate": 7.998892491246195e-05,
      "loss": 0.4316,
      "step": 95
    },
    {
      "epoch": 0.5380977404098791,
      "grad_norm": 0.7857929091209908,
      "learning_rate": 7.998492582650644e-05,
      "loss": 0.4292,
      "step": 96
    },
    {
      "epoch": 0.5437029252058154,
      "grad_norm": 0.960028166874925,
      "learning_rate": 7.998031166218598e-05,
      "loss": 0.434,
      "step": 97
    },
    {
      "epoch": 0.5493081100017516,
      "grad_norm": 0.7041102432235921,
      "learning_rate": 7.997508249047913e-05,
      "loss": 0.4215,
      "step": 98
    },
    {
      "epoch": 0.5549132947976878,
      "grad_norm": 0.7044014441304309,
      "learning_rate": 7.996923839182498e-05,
      "loss": 0.425,
      "step": 99
    },
    {
      "epoch": 0.5605184795936241,
      "grad_norm": 0.6385459114746951,
      "learning_rate": 7.996277945612184e-05,
      "loss": 0.42,
      "step": 100
    },
    {
      "epoch": 0.5661236643895603,
      "grad_norm": 0.7731522061106563,
      "learning_rate": 7.995570578272598e-05,
      "loss": 0.4253,
      "step": 101
    },
    {
      "epoch": 0.5717288491854966,
      "grad_norm": 0.7332716664705065,
      "learning_rate": 7.994801748044995e-05,
      "loss": 0.4313,
      "step": 102
    },
    {
      "epoch": 0.5773340339814328,
      "grad_norm": 0.6776804568843835,
      "learning_rate": 7.993971466756107e-05,
      "loss": 0.4188,
      "step": 103
    },
    {
      "epoch": 0.5829392187773691,
      "grad_norm": 0.5922393199265042,
      "learning_rate": 7.993079747177948e-05,
      "loss": 0.4184,
      "step": 104
    },
    {
      "epoch": 0.5885444035733053,
      "grad_norm": 0.398394656224325,
      "learning_rate": 7.99212660302762e-05,
      "loss": 0.4155,
      "step": 105
    },
    {
      "epoch": 0.5941495883692416,
      "grad_norm": 0.5350176373169183,
      "learning_rate": 7.991112048967111e-05,
      "loss": 0.4157,
      "step": 106
    },
    {
      "epoch": 0.5997547731651778,
      "grad_norm": 0.38641904379334474,
      "learning_rate": 7.990036100603055e-05,
      "loss": 0.4119,
      "step": 107
    },
    {
      "epoch": 0.6053599579611141,
      "grad_norm": 0.5283945082650043,
      "learning_rate": 7.988898774486507e-05,
      "loss": 0.4129,
      "step": 108
    },
    {
      "epoch": 0.6109651427570503,
      "grad_norm": 0.5488840852639991,
      "learning_rate": 7.987700088112675e-05,
      "loss": 0.4224,
      "step": 109
    },
    {
      "epoch": 0.6165703275529865,
      "grad_norm": 0.5785726461047852,
      "learning_rate": 7.986440059920659e-05,
      "loss": 0.4077,
      "step": 110
    },
    {
      "epoch": 0.6221755123489228,
      "grad_norm": 0.5777052205377616,
      "learning_rate": 7.985118709293167e-05,
      "loss": 0.4166,
      "step": 111
    },
    {
      "epoch": 0.627780697144859,
      "grad_norm": 0.596491662412661,
      "learning_rate": 7.983736056556212e-05,
      "loss": 0.4132,
      "step": 112
    },
    {
      "epoch": 0.6333858819407953,
      "grad_norm": 0.6957057882031554,
      "learning_rate": 7.982292122978806e-05,
      "loss": 0.4178,
      "step": 113
    },
    {
      "epoch": 0.6389910667367315,
      "grad_norm": 0.6847230422083609,
      "learning_rate": 7.980786930772624e-05,
      "loss": 0.4118,
      "step": 114
    },
    {
      "epoch": 0.6445962515326678,
      "grad_norm": 0.6837460304512333,
      "learning_rate": 7.979220503091673e-05,
      "loss": 0.4147,
      "step": 115
    },
    {
      "epoch": 0.6502014363286039,
      "grad_norm": 0.6798597739557275,
      "learning_rate": 7.977592864031929e-05,
      "loss": 0.4171,
      "step": 116
    },
    {
      "epoch": 0.6558066211245402,
      "grad_norm": 0.548015114928552,
      "learning_rate": 7.975904038630963e-05,
      "loss": 0.4117,
      "step": 117
    },
    {
      "epoch": 0.6614118059204764,
      "grad_norm": 0.5642042657981582,
      "learning_rate": 7.974154052867569e-05,
      "loss": 0.4126,
      "step": 118
    },
    {
      "epoch": 0.6670169907164126,
      "grad_norm": 0.7655295571497013,
      "learning_rate": 7.97234293366135e-05,
      "loss": 0.4154,
      "step": 119
    },
    {
      "epoch": 0.6726221755123489,
      "grad_norm": 0.874608237347998,
      "learning_rate": 7.970470708872308e-05,
      "loss": 0.4236,
      "step": 120
    },
    {
      "epoch": 0.6782273603082851,
      "grad_norm": 0.8310771349721764,
      "learning_rate": 7.968537407300423e-05,
      "loss": 0.421,
      "step": 121
    },
    {
      "epoch": 0.6838325451042214,
      "grad_norm": 0.6553657946996332,
      "learning_rate": 7.966543058685203e-05,
      "loss": 0.4035,
      "step": 122
    },
    {
      "epoch": 0.6894377299001576,
      "grad_norm": 0.5023716536308365,
      "learning_rate": 7.964487693705224e-05,
      "loss": 0.416,
      "step": 123
    },
    {
      "epoch": 0.6950429146960939,
      "grad_norm": 0.4876124028877786,
      "learning_rate": 7.962371343977664e-05,
      "loss": 0.4116,
      "step": 124
    },
    {
      "epoch": 0.7006480994920301,
      "grad_norm": 0.4811788654442628,
      "learning_rate": 7.960194042057817e-05,
      "loss": 0.4181,
      "step": 125
    },
    {
      "epoch": 0.7062532842879664,
      "grad_norm": 0.7564883843961022,
      "learning_rate": 7.957955821438588e-05,
      "loss": 0.4061,
      "step": 126
    },
    {
      "epoch": 0.7118584690839026,
      "grad_norm": 0.6430241009529823,
      "learning_rate": 7.955656716549977e-05,
      "loss": 0.4099,
      "step": 127
    },
    {
      "epoch": 0.7174636538798389,
      "grad_norm": 0.5578593223115135,
      "learning_rate": 7.953296762758556e-05,
      "loss": 0.422,
      "step": 128
    },
    {
      "epoch": 0.7230688386757751,
      "grad_norm": 0.6294882800949052,
      "learning_rate": 7.950875996366916e-05,
      "loss": 0.4195,
      "step": 129
    },
    {
      "epoch": 0.7286740234717113,
      "grad_norm": 0.5769470393949646,
      "learning_rate": 7.948394454613117e-05,
      "loss": 0.4057,
      "step": 130
    },
    {
      "epoch": 0.7342792082676476,
      "grad_norm": 0.6608310787451351,
      "learning_rate": 7.945852175670113e-05,
      "loss": 0.4117,
      "step": 131
    },
    {
      "epoch": 0.7398843930635838,
      "grad_norm": 0.6476166241559258,
      "learning_rate": 7.943249198645159e-05,
      "loss": 0.4115,
      "step": 132
    },
    {
      "epoch": 0.7454895778595201,
      "grad_norm": 0.41750277808352504,
      "learning_rate": 7.940585563579216e-05,
      "loss": 0.4187,
      "step": 133
    },
    {
      "epoch": 0.7510947626554563,
      "grad_norm": 0.5395404061131682,
      "learning_rate": 7.937861311446334e-05,
      "loss": 0.4097,
      "step": 134
    },
    {
      "epoch": 0.7566999474513926,
      "grad_norm": 0.5858318930064079,
      "learning_rate": 7.935076484153019e-05,
      "loss": 0.4003,
      "step": 135
    },
    {
      "epoch": 0.7623051322473288,
      "grad_norm": 0.5173078099977891,
      "learning_rate": 7.932231124537589e-05,
      "loss": 0.4056,
      "step": 136
    },
    {
      "epoch": 0.7679103170432651,
      "grad_norm": 0.4498691054115375,
      "learning_rate": 7.929325276369519e-05,
      "loss": 0.4066,
      "step": 137
    },
    {
      "epoch": 0.7735155018392013,
      "grad_norm": 0.42467263261017896,
      "learning_rate": 7.92635898434876e-05,
      "loss": 0.4064,
      "step": 138
    },
    {
      "epoch": 0.7791206866351374,
      "grad_norm": 0.37958937321762776,
      "learning_rate": 7.923332294105063e-05,
      "loss": 0.4034,
      "step": 139
    },
    {
      "epoch": 0.7847258714310738,
      "grad_norm": 0.3468484137692954,
      "learning_rate": 7.920245252197263e-05,
      "loss": 0.4039,
      "step": 140
    },
    {
      "epoch": 0.7903310562270099,
      "grad_norm": 0.4169469149627637,
      "learning_rate": 7.917097906112574e-05,
      "loss": 0.4087,
      "step": 141
    },
    {
      "epoch": 0.7959362410229462,
      "grad_norm": 0.41834364109362276,
      "learning_rate": 7.913890304265853e-05,
      "loss": 0.405,
      "step": 142
    },
    {
      "epoch": 0.8015414258188824,
      "grad_norm": 0.3513985390988816,
      "learning_rate": 7.910622495998858e-05,
      "loss": 0.4006,
      "step": 143
    },
    {
      "epoch": 0.8071466106148187,
      "grad_norm": 0.30562923751430504,
      "learning_rate": 7.907294531579487e-05,
      "loss": 0.399,
      "step": 144
    },
    {
      "epoch": 0.8127517954107549,
      "grad_norm": 0.25016597293101506,
      "learning_rate": 7.903906462201004e-05,
      "loss": 0.404,
      "step": 145
    },
    {
      "epoch": 0.8183569802066912,
      "grad_norm": 0.330583944503076,
      "learning_rate": 7.900458339981254e-05,
      "loss": 0.4001,
      "step": 146
    },
    {
      "epoch": 0.8239621650026274,
      "grad_norm": 0.4232095940752348,
      "learning_rate": 7.896950217961862e-05,
      "loss": 0.4058,
      "step": 147
    },
    {
      "epoch": 0.8295673497985637,
      "grad_norm": 0.4726435967002529,
      "learning_rate": 7.893382150107413e-05,
      "loss": 0.3979,
      "step": 148
    },
    {
      "epoch": 0.8351725345944999,
      "grad_norm": 0.5224683089251781,
      "learning_rate": 7.889754191304624e-05,
      "loss": 0.4016,
      "step": 149
    },
    {
      "epoch": 0.8407777193904361,
      "grad_norm": 0.6703932689764993,
      "learning_rate": 7.886066397361502e-05,
      "loss": 0.4019,
      "step": 150
    },
    {
      "epoch": 0.8463829041863724,
      "grad_norm": 0.7976450116603597,
      "learning_rate": 7.882318825006482e-05,
      "loss": 0.4042,
      "step": 151
    },
    {
      "epoch": 0.8519880889823086,
      "grad_norm": 0.8440592541875934,
      "learning_rate": 7.878511531887553e-05,
      "loss": 0.405,
      "step": 152
    },
    {
      "epoch": 0.8575932737782449,
      "grad_norm": 0.7574577755866619,
      "learning_rate": 7.874644576571382e-05,
      "loss": 0.4141,
      "step": 153
    },
    {
      "epoch": 0.8631984585741811,
      "grad_norm": 0.6868823675156073,
      "learning_rate": 7.870718018542394e-05,
      "loss": 0.4085,
      "step": 154
    },
    {
      "epoch": 0.8688036433701174,
      "grad_norm": 0.7949981417283204,
      "learning_rate": 7.866731918201877e-05,
      "loss": 0.4123,
      "step": 155
    },
    {
      "epoch": 0.8744088281660536,
      "grad_norm": 0.7652258936220558,
      "learning_rate": 7.862686336867042e-05,
      "loss": 0.4074,
      "step": 156
    },
    {
      "epoch": 0.8800140129619899,
      "grad_norm": 0.5492331474373146,
      "learning_rate": 7.858581336770078e-05,
      "loss": 0.412,
      "step": 157
    },
    {
      "epoch": 0.8856191977579261,
      "grad_norm": 0.4985373864867304,
      "learning_rate": 7.854416981057202e-05,
      "loss": 0.4001,
      "step": 158
    },
    {
      "epoch": 0.8912243825538623,
      "grad_norm": 0.6843980488570995,
      "learning_rate": 7.850193333787679e-05,
      "loss": 0.3962,
      "step": 159
    },
    {
      "epoch": 0.8968295673497986,
      "grad_norm": 0.3471378020904331,
      "learning_rate": 7.845910459932851e-05,
      "loss": 0.3988,
      "step": 160
    },
    {
      "epoch": 0.9024347521457348,
      "grad_norm": 0.5626872736911909,
      "learning_rate": 7.841568425375118e-05,
      "loss": 0.3996,
      "step": 161
    },
    {
      "epoch": 0.9080399369416711,
      "grad_norm": 0.7412359084209332,
      "learning_rate": 7.83716729690694e-05,
      "loss": 0.3996,
      "step": 162
    },
    {
      "epoch": 0.9136451217376073,
      "grad_norm": 0.3955248842527265,
      "learning_rate": 7.832707142229803e-05,
      "loss": 0.4003,
      "step": 163
    },
    {
      "epoch": 0.9192503065335436,
      "grad_norm": 0.47855528089224836,
      "learning_rate": 7.828188029953179e-05,
      "loss": 0.4002,
      "step": 164
    },
    {
      "epoch": 0.9248554913294798,
      "grad_norm": 0.5159440067301492,
      "learning_rate": 7.823610029593471e-05,
      "loss": 0.3962,
      "step": 165
    },
    {
      "epoch": 0.930460676125416,
      "grad_norm": 0.3576812605070119,
      "learning_rate": 7.818973211572943e-05,
      "loss": 0.393,
      "step": 166
    },
    {
      "epoch": 0.9360658609213522,
      "grad_norm": 0.35348537654761086,
      "learning_rate": 7.814277647218634e-05,
      "loss": 0.4037,
      "step": 167
    },
    {
      "epoch": 0.9416710457172885,
      "grad_norm": 0.39413342377875193,
      "learning_rate": 7.809523408761266e-05,
      "loss": 0.3942,
      "step": 168
    },
    {
      "epoch": 0.9472762305132247,
      "grad_norm": 0.3951438373093825,
      "learning_rate": 7.80471056933413e-05,
      "loss": 0.4012,
      "step": 169
    },
    {
      "epoch": 0.9528814153091609,
      "grad_norm": 0.45433061259053,
      "learning_rate": 7.799839202971963e-05,
      "loss": 0.3982,
      "step": 170
    },
    {
      "epoch": 0.9584866001050972,
      "grad_norm": 0.519370105848961,
      "learning_rate": 7.794909384609807e-05,
      "loss": 0.3994,
      "step": 171
    },
    {
      "epoch": 0.9640917849010334,
      "grad_norm": 0.5495567414849357,
      "learning_rate": 7.789921190081851e-05,
      "loss": 0.3979,
      "step": 172
    },
    {
      "epoch": 0.9696969696969697,
      "grad_norm": 0.6394859589748585,
      "learning_rate": 7.784874696120279e-05,
      "loss": 0.3959,
      "step": 173
    },
    {
      "epoch": 0.9753021544929059,
      "grad_norm": 0.7704242402793997,
      "learning_rate": 7.779769980354077e-05,
      "loss": 0.4027,
      "step": 174
    },
    {
      "epoch": 0.9809073392888422,
      "grad_norm": 0.9985269827974932,
      "learning_rate": 7.774607121307841e-05,
      "loss": 0.405,
      "step": 175
    },
    {
      "epoch": 0.9865125240847784,
      "grad_norm": 0.868487642699029,
      "learning_rate": 7.769386198400576e-05,
      "loss": 0.3957,
      "step": 176
    },
    {
      "epoch": 0.9921177088807147,
      "grad_norm": 0.40924003925560676,
      "learning_rate": 7.764107291944464e-05,
      "loss": 0.3905,
      "step": 177
    },
    {
      "epoch": 0.9977228936766509,
      "grad_norm": 0.4918744419515202,
      "learning_rate": 7.758770483143634e-05,
      "loss": 0.389,
      "step": 178
    },
    {
      "epoch": 1.0052548607461902,
      "grad_norm": 1.487265905041594,
      "learning_rate": 7.753375854092918e-05,
      "loss": 0.7519,
      "step": 179
    },
    {
      "epoch": 1.0108600455421264,
      "grad_norm": 1.0599518409785367,
      "learning_rate": 7.747923487776579e-05,
      "loss": 0.3924,
      "step": 180
    },
    {
      "epoch": 1.0164652303380628,
      "grad_norm": 0.6685215482157915,
      "learning_rate": 7.742413468067038e-05,
      "loss": 0.3886,
      "step": 181
    },
    {
      "epoch": 1.022070415133999,
      "grad_norm": 0.6798972273258548,
      "learning_rate": 7.736845879723585e-05,
      "loss": 0.3925,
      "step": 182
    },
    {
      "epoch": 1.0276755999299352,
      "grad_norm": 0.5898524978289069,
      "learning_rate": 7.731220808391072e-05,
      "loss": 0.3799,
      "step": 183
    },
    {
      "epoch": 1.0332807847258714,
      "grad_norm": 0.5305286635410624,
      "learning_rate": 7.725538340598603e-05,
      "loss": 0.3858,
      "step": 184
    },
    {
      "epoch": 1.0388859695218078,
      "grad_norm": 0.5290381001929488,
      "learning_rate": 7.719798563758193e-05,
      "loss": 0.3792,
      "step": 185
    },
    {
      "epoch": 1.044491154317744,
      "grad_norm": 0.42976752812289126,
      "learning_rate": 7.71400156616343e-05,
      "loss": 0.378,
      "step": 186
    },
    {
      "epoch": 1.0500963391136802,
      "grad_norm": 0.4255688619941376,
      "learning_rate": 7.708147436988112e-05,
      "loss": 0.3838,
      "step": 187
    },
    {
      "epoch": 1.0557015239096164,
      "grad_norm": 0.3844923007802862,
      "learning_rate": 7.702236266284886e-05,
      "loss": 0.3838,
      "step": 188
    },
    {
      "epoch": 1.0613067087055525,
      "grad_norm": 0.4493904358687705,
      "learning_rate": 7.696268144983844e-05,
      "loss": 0.3773,
      "step": 189
    },
    {
      "epoch": 1.066911893501489,
      "grad_norm": 0.43703395699798525,
      "learning_rate": 7.690243164891146e-05,
      "loss": 0.3789,
      "step": 190
    },
    {
      "epoch": 1.0725170782974252,
      "grad_norm": 0.3129550094380894,
      "learning_rate": 7.684161418687588e-05,
      "loss": 0.3841,
      "step": 191
    },
    {
      "epoch": 1.0781222630933613,
      "grad_norm": 0.3196436601574946,
      "learning_rate": 7.678022999927191e-05,
      "loss": 0.375,
      "step": 192
    },
    {
      "epoch": 1.0837274478892975,
      "grad_norm": 0.32439557065583957,
      "learning_rate": 7.671828003035754e-05,
      "loss": 0.3808,
      "step": 193
    },
    {
      "epoch": 1.0893326326852337,
      "grad_norm": 0.3551997178098723,
      "learning_rate": 7.665576523309402e-05,
      "loss": 0.3808,
      "step": 194
    },
    {
      "epoch": 1.0949378174811701,
      "grad_norm": 0.28222901059062117,
      "learning_rate": 7.659268656913125e-05,
      "loss": 0.3755,
      "step": 195
    },
    {
      "epoch": 1.1005430022771063,
      "grad_norm": 0.24413884185384668,
      "learning_rate": 7.652904500879294e-05,
      "loss": 0.3771,
      "step": 196
    },
    {
      "epoch": 1.1061481870730425,
      "grad_norm": 0.3258543915444851,
      "learning_rate": 7.646484153106168e-05,
      "loss": 0.3819,
      "step": 197
    },
    {
      "epoch": 1.1117533718689787,
      "grad_norm": 0.30494792365590756,
      "learning_rate": 7.640007712356394e-05,
      "loss": 0.3739,
      "step": 198
    },
    {
      "epoch": 1.1173585566649151,
      "grad_norm": 0.3416322895952249,
      "learning_rate": 7.633475278255477e-05,
      "loss": 0.3729,
      "step": 199
    },
    {
      "epoch": 1.1229637414608513,
      "grad_norm": 0.33058200454972514,
      "learning_rate": 7.626886951290262e-05,
      "loss": 0.3778,
      "step": 200
    },
    {
      "epoch": 1.1285689262567875,
      "grad_norm": 0.24553286849423914,
      "learning_rate": 7.620242832807375e-05,
      "loss": 0.3815,
      "step": 201
    },
    {
      "epoch": 1.1341741110527237,
      "grad_norm": 0.3094016267852568,
      "learning_rate": 7.61354302501167e-05,
      "loss": 0.3739,
      "step": 202
    },
    {
      "epoch": 1.1397792958486601,
      "grad_norm": 0.3110189115438822,
      "learning_rate": 7.606787630964658e-05,
      "loss": 0.3744,
      "step": 203
    },
    {
      "epoch": 1.1453844806445963,
      "grad_norm": 0.3105648524438776,
      "learning_rate": 7.599976754582917e-05,
      "loss": 0.3733,
      "step": 204
    },
    {
      "epoch": 1.1509896654405325,
      "grad_norm": 0.38377775896842775,
      "learning_rate": 7.593110500636499e-05,
      "loss": 0.3777,
      "step": 205
    },
    {
      "epoch": 1.1565948502364687,
      "grad_norm": 0.5289840818037539,
      "learning_rate": 7.586188974747315e-05,
      "loss": 0.3748,
      "step": 206
    },
    {
      "epoch": 1.1622000350324049,
      "grad_norm": 0.6002663848970253,
      "learning_rate": 7.579212283387508e-05,
      "loss": 0.376,
      "step": 207
    },
    {
      "epoch": 1.1678052198283413,
      "grad_norm": 0.543546869431326,
      "learning_rate": 7.57218053387782e-05,
      "loss": 0.3818,
      "step": 208
    },
    {
      "epoch": 1.1734104046242775,
      "grad_norm": 0.38254861936914103,
      "learning_rate": 7.565093834385944e-05,
      "loss": 0.3733,
      "step": 209
    },
    {
      "epoch": 1.1790155894202137,
      "grad_norm": 0.25789361293808194,
      "learning_rate": 7.557952293924843e-05,
      "loss": 0.3741,
      "step": 210
    },
    {
      "epoch": 1.1846207742161499,
      "grad_norm": 0.3509813225234253,
      "learning_rate": 7.550756022351098e-05,
      "loss": 0.3766,
      "step": 211
    },
    {
      "epoch": 1.1902259590120863,
      "grad_norm": 0.41381875355849096,
      "learning_rate": 7.5435051303632e-05,
      "loss": 0.3771,
      "step": 212
    },
    {
      "epoch": 1.1958311438080225,
      "grad_norm": 0.31160966900292714,
      "learning_rate": 7.53619972949985e-05,
      "loss": 0.37,
      "step": 213
    },
    {
      "epoch": 1.2014363286039587,
      "grad_norm": 0.25377869542538456,
      "learning_rate": 7.528839932138248e-05,
      "loss": 0.3742,
      "step": 214
    },
    {
      "epoch": 1.2070415133998948,
      "grad_norm": 0.3670686486051864,
      "learning_rate": 7.521425851492366e-05,
      "loss": 0.3741,
      "step": 215
    },
    {
      "epoch": 1.2126466981958313,
      "grad_norm": 0.35278618781458415,
      "learning_rate": 7.513957601611196e-05,
      "loss": 0.3689,
      "step": 216
    },
    {
      "epoch": 1.2182518829917675,
      "grad_norm": 0.27734720069224306,
      "learning_rate": 7.506435297377006e-05,
      "loss": 0.3709,
      "step": 217
    },
    {
      "epoch": 1.2238570677877036,
      "grad_norm": 0.3278146817806966,
      "learning_rate": 7.498859054503568e-05,
      "loss": 0.3758,
      "step": 218
    },
    {
      "epoch": 1.2294622525836398,
      "grad_norm": 0.4148552525962653,
      "learning_rate": 7.491228989534378e-05,
      "loss": 0.3722,
      "step": 219
    },
    {
      "epoch": 1.235067437379576,
      "grad_norm": 0.4473202149669945,
      "learning_rate": 7.483545219840865e-05,
      "loss": 0.3754,
      "step": 220
    },
    {
      "epoch": 1.2406726221755124,
      "grad_norm": 0.44427273810379947,
      "learning_rate": 7.475807863620587e-05,
      "loss": 0.3762,
      "step": 221
    },
    {
      "epoch": 1.2462778069714486,
      "grad_norm": 0.49016712568602555,
      "learning_rate": 7.468017039895404e-05,
      "loss": 0.3761,
      "step": 222
    },
    {
      "epoch": 1.2518829917673848,
      "grad_norm": 0.47873415753545473,
      "learning_rate": 7.460172868509664e-05,
      "loss": 0.3734,
      "step": 223
    },
    {
      "epoch": 1.257488176563321,
      "grad_norm": 0.43672364622786203,
      "learning_rate": 7.452275470128338e-05,
      "loss": 0.3721,
      "step": 224
    },
    {
      "epoch": 1.2630933613592572,
      "grad_norm": 0.43990010868929574,
      "learning_rate": 7.444324966235179e-05,
      "loss": 0.374,
      "step": 225
    },
    {
      "epoch": 1.2686985461551936,
      "grad_norm": 0.4713815408143108,
      "learning_rate": 7.436321479130855e-05,
      "loss": 0.3713,
      "step": 226
    },
    {
      "epoch": 1.2743037309511298,
      "grad_norm": 0.45464514691112873,
      "learning_rate": 7.428265131931053e-05,
      "loss": 0.3706,
      "step": 227
    },
    {
      "epoch": 1.279908915747066,
      "grad_norm": 0.510024464970406,
      "learning_rate": 7.420156048564599e-05,
      "loss": 0.3741,
      "step": 228
    },
    {
      "epoch": 1.2855141005430024,
      "grad_norm": 0.5372593621720837,
      "learning_rate": 7.411994353771542e-05,
      "loss": 0.3696,
      "step": 229
    },
    {
      "epoch": 1.2911192853389384,
      "grad_norm": 0.4219487710133918,
      "learning_rate": 7.40378017310125e-05,
      "loss": 0.3711,
      "step": 230
    },
    {
      "epoch": 1.2967244701348748,
      "grad_norm": 0.2523356442192507,
      "learning_rate": 7.395513632910455e-05,
      "loss": 0.371,
      "step": 231
    },
    {
      "epoch": 1.302329654930811,
      "grad_norm": 0.3232824061176527,
      "learning_rate": 7.38719486036133e-05,
      "loss": 0.3755,
      "step": 232
    },
    {
      "epoch": 1.3079348397267472,
      "grad_norm": 0.3600749335990331,
      "learning_rate": 7.378823983419529e-05,
      "loss": 0.373,
      "step": 233
    },
    {
      "epoch": 1.3135400245226836,
      "grad_norm": 0.31629844859755163,
      "learning_rate": 7.370401130852207e-05,
      "loss": 0.3734,
      "step": 234
    },
    {
      "epoch": 1.3191452093186198,
      "grad_norm": 0.30569567935552483,
      "learning_rate": 7.361926432226053e-05,
      "loss": 0.377,
      "step": 235
    },
    {
      "epoch": 1.324750394114556,
      "grad_norm": 0.2484814268194845,
      "learning_rate": 7.35340001790529e-05,
      "loss": 0.3711,
      "step": 236
    },
    {
      "epoch": 1.3303555789104922,
      "grad_norm": 0.2481897952153626,
      "learning_rate": 7.34482201904967e-05,
      "loss": 0.3769,
      "step": 237
    },
    {
      "epoch": 1.3359607637064284,
      "grad_norm": 0.301422303142377,
      "learning_rate": 7.336192567612458e-05,
      "loss": 0.3746,
      "step": 238
    },
    {
      "epoch": 1.3415659485023648,
      "grad_norm": 0.23838306194654293,
      "learning_rate": 7.327511796338402e-05,
      "loss": 0.3776,
      "step": 239
    },
    {
      "epoch": 1.347171133298301,
      "grad_norm": 0.24847852807759896,
      "learning_rate": 7.318779838761688e-05,
      "loss": 0.3673,
      "step": 240
    },
    {
      "epoch": 1.3527763180942372,
      "grad_norm": 0.32043115894332014,
      "learning_rate": 7.309996829203894e-05,
      "loss": 0.3706,
      "step": 241
    },
    {
      "epoch": 1.3583815028901733,
      "grad_norm": 0.27995723770731035,
      "learning_rate": 7.301162902771911e-05,
      "loss": 0.3698,
      "step": 242
    },
    {
      "epoch": 1.3639866876861095,
      "grad_norm": 0.23369201897150527,
      "learning_rate": 7.292278195355875e-05,
      "loss": 0.3765,
      "step": 243
    },
    {
      "epoch": 1.369591872482046,
      "grad_norm": 0.2782336040131324,
      "learning_rate": 7.28334284362708e-05,
      "loss": 0.3698,
      "step": 244
    },
    {
      "epoch": 1.3751970572779821,
      "grad_norm": 0.32558286225893,
      "learning_rate": 7.274356985035856e-05,
      "loss": 0.363,
      "step": 245
    },
    {
      "epoch": 1.3808022420739183,
      "grad_norm": 0.3651496945923943,
      "learning_rate": 7.265320757809478e-05,
      "loss": 0.3708,
      "step": 246
    },
    {
      "epoch": 1.3864074268698547,
      "grad_norm": 0.37775778360725737,
      "learning_rate": 7.256234300950025e-05,
      "loss": 0.3739,
      "step": 247
    },
    {
      "epoch": 1.392012611665791,
      "grad_norm": 0.3865569735032884,
      "learning_rate": 7.247097754232251e-05,
      "loss": 0.3663,
      "step": 248
    },
    {
      "epoch": 1.3976177964617271,
      "grad_norm": 0.4600474923978821,
      "learning_rate": 7.237911258201422e-05,
      "loss": 0.3725,
      "step": 249
    },
    {
      "epoch": 1.4032229812576633,
      "grad_norm": 0.578230623305736,
      "learning_rate": 7.228674954171169e-05,
      "loss": 0.3717,
      "step": 250
    },
    {
      "epoch": 1.4088281660535995,
      "grad_norm": 0.5579950344731053,
      "learning_rate": 7.219388984221304e-05,
      "loss": 0.375,
      "step": 251
    },
    {
      "epoch": 1.414433350849536,
      "grad_norm": 0.42369290331551335,
      "learning_rate": 7.210053491195638e-05,
      "loss": 0.3673,
      "step": 252
    },
    {
      "epoch": 1.420038535645472,
      "grad_norm": 0.3852634055274567,
      "learning_rate": 7.200668618699786e-05,
      "loss": 0.3669,
      "step": 253
    },
    {
      "epoch": 1.4256437204414083,
      "grad_norm": 0.3342986128914958,
      "learning_rate": 7.191234511098952e-05,
      "loss": 0.3675,
      "step": 254
    },
    {
      "epoch": 1.4312489052373445,
      "grad_norm": 0.28417998997516397,
      "learning_rate": 7.181751313515716e-05,
      "loss": 0.3736,
      "step": 255
    },
    {
      "epoch": 1.4368540900332807,
      "grad_norm": 0.316021315356994,
      "learning_rate": 7.172219171827788e-05,
      "loss": 0.3652,
      "step": 256
    },
    {
      "epoch": 1.442459274829217,
      "grad_norm": 0.3330261543122812,
      "learning_rate": 7.162638232665785e-05,
      "loss": 0.3781,
      "step": 257
    },
    {
      "epoch": 1.4480644596251533,
      "grad_norm": 0.41235945578908095,
      "learning_rate": 7.153008643410957e-05,
      "loss": 0.3676,
      "step": 258
    },
    {
      "epoch": 1.4536696444210895,
      "grad_norm": 0.5120467154192809,
      "learning_rate": 7.143330552192925e-05,
      "loss": 0.3688,
      "step": 259
    },
    {
      "epoch": 1.4592748292170257,
      "grad_norm": 0.566273828052107,
      "learning_rate": 7.13360410788741e-05,
      "loss": 0.3728,
      "step": 260
    },
    {
      "epoch": 1.4648800140129619,
      "grad_norm": 0.48257393278128896,
      "learning_rate": 7.123829460113933e-05,
      "loss": 0.3698,
      "step": 261
    },
    {
      "epoch": 1.4704851988088983,
      "grad_norm": 0.3962978665568913,
      "learning_rate": 7.114006759233514e-05,
      "loss": 0.3708,
      "step": 262
    },
    {
      "epoch": 1.4760903836048345,
      "grad_norm": 0.43632437470514573,
      "learning_rate": 7.104136156346368e-05,
      "loss": 0.3776,
      "step": 263
    },
    {
      "epoch": 1.4816955684007707,
      "grad_norm": 0.5027739581146445,
      "learning_rate": 7.094217803289573e-05,
      "loss": 0.377,
      "step": 264
    },
    {
      "epoch": 1.487300753196707,
      "grad_norm": 0.4748778836922837,
      "learning_rate": 7.084251852634736e-05,
      "loss": 0.374,
      "step": 265
    },
    {
      "epoch": 1.4929059379926433,
      "grad_norm": 0.31244286402393573,
      "learning_rate": 7.074238457685644e-05,
      "loss": 0.3656,
      "step": 266
    },
    {
      "epoch": 1.4985111227885795,
      "grad_norm": 0.2969138202613333,
      "learning_rate": 7.064177772475912e-05,
      "loss": 0.377,
      "step": 267
    },
    {
      "epoch": 1.5041163075845156,
      "grad_norm": 0.41051969660231596,
      "learning_rate": 7.054069951766608e-05,
      "loss": 0.3763,
      "step": 268
    },
    {
      "epoch": 1.5097214923804518,
      "grad_norm": 0.38457550467039503,
      "learning_rate": 7.043915151043871e-05,
      "loss": 0.3714,
      "step": 269
    },
    {
      "epoch": 1.5153266771763882,
      "grad_norm": 0.29650502362650927,
      "learning_rate": 7.033713526516528e-05,
      "loss": 0.3708,
      "step": 270
    },
    {
      "epoch": 1.5209318619723244,
      "grad_norm": 0.4069481640243356,
      "learning_rate": 7.023465235113678e-05,
      "loss": 0.3734,
      "step": 271
    },
    {
      "epoch": 1.5265370467682606,
      "grad_norm": 0.4592931917273877,
      "learning_rate": 7.013170434482291e-05,
      "loss": 0.3697,
      "step": 272
    },
    {
      "epoch": 1.532142231564197,
      "grad_norm": 0.34558335688526987,
      "learning_rate": 7.002829282984776e-05,
      "loss": 0.3601,
      "step": 273
    },
    {
      "epoch": 1.537747416360133,
      "grad_norm": 0.32785340771208665,
      "learning_rate": 6.992441939696543e-05,
      "loss": 0.3708,
      "step": 274
    },
    {
      "epoch": 1.5433526011560694,
      "grad_norm": 0.42401801041386833,
      "learning_rate": 6.982008564403562e-05,
      "loss": 0.3709,
      "step": 275
    },
    {
      "epoch": 1.5489577859520056,
      "grad_norm": 0.3554711523304497,
      "learning_rate": 6.971529317599903e-05,
      "loss": 0.3625,
      "step": 276
    },
    {
      "epoch": 1.5545629707479418,
      "grad_norm": 0.33335189910758606,
      "learning_rate": 6.961004360485263e-05,
      "loss": 0.3723,
      "step": 277
    },
    {
      "epoch": 1.5601681555438782,
      "grad_norm": 0.3198930394708329,
      "learning_rate": 6.950433854962489e-05,
      "loss": 0.3601,
      "step": 278
    },
    {
      "epoch": 1.5657733403398142,
      "grad_norm": 0.27989792618134535,
      "learning_rate": 6.939817963635095e-05,
      "loss": 0.3703,
      "step": 279
    },
    {
      "epoch": 1.5713785251357506,
      "grad_norm": 0.29366802755384774,
      "learning_rate": 6.929156849804745e-05,
      "loss": 0.3714,
      "step": 280
    },
    {
      "epoch": 1.5769837099316868,
      "grad_norm": 0.2785219347149019,
      "learning_rate": 6.918450677468754e-05,
      "loss": 0.3763,
      "step": 281
    },
    {
      "epoch": 1.582588894727623,
      "grad_norm": 0.2578554063108834,
      "learning_rate": 6.907699611317563e-05,
      "loss": 0.3708,
      "step": 282
    },
    {
      "epoch": 1.5881940795235594,
      "grad_norm": 0.24508422370288088,
      "learning_rate": 6.896903816732199e-05,
      "loss": 0.3808,
      "step": 283
    },
    {
      "epoch": 1.5937992643194954,
      "grad_norm": 0.3084468014981416,
      "learning_rate": 6.88606345978174e-05,
      "loss": 0.3668,
      "step": 284
    },
    {
      "epoch": 1.5994044491154318,
      "grad_norm": 0.3064359422764903,
      "learning_rate": 6.875178707220752e-05,
      "loss": 0.3703,
      "step": 285
    },
    {
      "epoch": 1.605009633911368,
      "grad_norm": 0.259368905565046,
      "learning_rate": 6.86424972648673e-05,
      "loss": 0.3682,
      "step": 286
    },
    {
      "epoch": 1.6106148187073042,
      "grad_norm": 0.3138681158378778,
      "learning_rate": 6.853276685697522e-05,
      "loss": 0.361,
      "step": 287
    },
    {
      "epoch": 1.6162200035032406,
      "grad_norm": 0.28723735473752765,
      "learning_rate": 6.842259753648736e-05,
      "loss": 0.3691,
      "step": 288
    },
    {
      "epoch": 1.6218251882991768,
      "grad_norm": 0.22917475477913027,
      "learning_rate": 6.831199099811154e-05,
      "loss": 0.3738,
      "step": 289
    },
    {
      "epoch": 1.627430373095113,
      "grad_norm": 0.22158438127508895,
      "learning_rate": 6.820094894328115e-05,
      "loss": 0.3673,
      "step": 290
    },
    {
      "epoch": 1.6330355578910494,
      "grad_norm": 0.24914259580358247,
      "learning_rate": 6.808947308012907e-05,
      "loss": 0.3623,
      "step": 291
    },
    {
      "epoch": 1.6386407426869853,
      "grad_norm": 0.27721435112401327,
      "learning_rate": 6.797756512346131e-05,
      "loss": 0.371,
      "step": 292
    },
    {
      "epoch": 1.6442459274829218,
      "grad_norm": 0.28690822410974365,
      "learning_rate": 6.786522679473069e-05,
      "loss": 0.3704,
      "step": 293
    },
    {
      "epoch": 1.649851112278858,
      "grad_norm": 0.29175870389773917,
      "learning_rate": 6.775245982201031e-05,
      "loss": 0.3705,
      "step": 294
    },
    {
      "epoch": 1.6554562970747941,
      "grad_norm": 0.31996465044367267,
      "learning_rate": 6.763926593996704e-05,
      "loss": 0.3621,
      "step": 295
    },
    {
      "epoch": 1.6610614818707305,
      "grad_norm": 0.28536850062056507,
      "learning_rate": 6.752564688983475e-05,
      "loss": 0.3678,
      "step": 296
    },
    {
      "epoch": 1.6666666666666665,
      "grad_norm": 0.25078819883082826,
      "learning_rate": 6.741160441938761e-05,
      "loss": 0.3633,
      "step": 297
    },
    {
      "epoch": 1.672271851462603,
      "grad_norm": 0.18918507286548947,
      "learning_rate": 6.729714028291311e-05,
      "loss": 0.3641,
      "step": 298
    },
    {
      "epoch": 1.6778770362585391,
      "grad_norm": 0.1854770882692491,
      "learning_rate": 6.718225624118518e-05,
      "loss": 0.3736,
      "step": 299
    },
    {
      "epoch": 1.6834822210544753,
      "grad_norm": 0.2398880873671139,
      "learning_rate": 6.7066954061437e-05,
      "loss": 0.3627,
      "step": 300
    },
    {
      "epoch": 1.6890874058504117,
      "grad_norm": 0.2708404661115209,
      "learning_rate": 6.695123551733391e-05,
      "loss": 0.3615,
      "step": 301
    },
    {
      "epoch": 1.694692590646348,
      "grad_norm": 0.22472657962922266,
      "learning_rate": 6.683510238894603e-05,
      "loss": 0.3601,
      "step": 302
    },
    {
      "epoch": 1.700297775442284,
      "grad_norm": 0.1879478767675107,
      "learning_rate": 6.671855646272099e-05,
      "loss": 0.3704,
      "step": 303
    },
    {
      "epoch": 1.7059029602382203,
      "grad_norm": 0.1898981801511567,
      "learning_rate": 6.660159953145632e-05,
      "loss": 0.3741,
      "step": 304
    },
    {
      "epoch": 1.7115081450341565,
      "grad_norm": 0.23990986850106974,
      "learning_rate": 6.648423339427203e-05,
      "loss": 0.365,
      "step": 305
    },
    {
      "epoch": 1.717113329830093,
      "grad_norm": 0.2961808669969359,
      "learning_rate": 6.636645985658274e-05,
      "loss": 0.3703,
      "step": 306
    },
    {
      "epoch": 1.722718514626029,
      "grad_norm": 0.28018628616841723,
      "learning_rate": 6.62482807300701e-05,
      "loss": 0.3725,
      "step": 307
    },
    {
      "epoch": 1.7283236994219653,
      "grad_norm": 0.23407750459599536,
      "learning_rate": 6.612969783265477e-05,
      "loss": 0.3774,
      "step": 308
    },
    {
      "epoch": 1.7339288842179017,
      "grad_norm": 0.2348613025637216,
      "learning_rate": 6.601071298846859e-05,
      "loss": 0.3678,
      "step": 309
    },
    {
      "epoch": 1.7395340690138377,
      "grad_norm": 0.27859996434994966,
      "learning_rate": 6.589132802782636e-05,
      "loss": 0.3681,
      "step": 310
    },
    {
      "epoch": 1.745139253809774,
      "grad_norm": 0.30291442596447254,
      "learning_rate": 6.577154478719786e-05,
      "loss": 0.3626,
      "step": 311
    },
    {
      "epoch": 1.7507444386057103,
      "grad_norm": 0.34755080893012474,
      "learning_rate": 6.565136510917946e-05,
      "loss": 0.3618,
      "step": 312
    },
    {
      "epoch": 1.7563496234016465,
      "grad_norm": 0.4116767959803346,
      "learning_rate": 6.553079084246583e-05,
      "loss": 0.3681,
      "step": 313
    },
    {
      "epoch": 1.7619548081975829,
      "grad_norm": 0.46137945732023167,
      "learning_rate": 6.540982384182154e-05,
      "loss": 0.3724,
      "step": 314
    },
    {
      "epoch": 1.7675599929935188,
      "grad_norm": 0.5111615976410253,
      "learning_rate": 6.528846596805246e-05,
      "loss": 0.3656,
      "step": 315
    },
    {
      "epoch": 1.7731651777894553,
      "grad_norm": 0.5142382717546382,
      "learning_rate": 6.516671908797717e-05,
      "loss": 0.3652,
      "step": 316
    },
    {
      "epoch": 1.7787703625853915,
      "grad_norm": 0.45272084053581846,
      "learning_rate": 6.504458507439825e-05,
      "loss": 0.3708,
      "step": 317
    },
    {
      "epoch": 1.7843755473813276,
      "grad_norm": 0.3546131780130616,
      "learning_rate": 6.492206580607344e-05,
      "loss": 0.372,
      "step": 318
    },
    {
      "epoch": 1.789980732177264,
      "grad_norm": 0.35982588012048017,
      "learning_rate": 6.479916316768677e-05,
      "loss": 0.368,
      "step": 319
    },
    {
      "epoch": 1.7955859169732002,
      "grad_norm": 0.42694941448039553,
      "learning_rate": 6.467587904981959e-05,
      "loss": 0.3724,
      "step": 320
    },
    {
      "epoch": 1.8011911017691364,
      "grad_norm": 0.38051976086581385,
      "learning_rate": 6.455221534892138e-05,
      "loss": 0.3714,
      "step": 321
    },
    {
      "epoch": 1.8067962865650729,
      "grad_norm": 0.23803744685632255,
      "learning_rate": 6.442817396728073e-05,
      "loss": 0.363,
      "step": 322
    },
    {
      "epoch": 1.8124014713610088,
      "grad_norm": 0.19779044986555921,
      "learning_rate": 6.430375681299596e-05,
      "loss": 0.3652,
      "step": 323
    },
    {
      "epoch": 1.8180066561569452,
      "grad_norm": 0.2848062424619444,
      "learning_rate": 6.417896579994583e-05,
      "loss": 0.3701,
      "step": 324
    },
    {
      "epoch": 1.8236118409528814,
      "grad_norm": 0.3241743893145074,
      "learning_rate": 6.405380284776007e-05,
      "loss": 0.3631,
      "step": 325
    },
    {
      "epoch": 1.8292170257488176,
      "grad_norm": 0.23348066280971422,
      "learning_rate": 6.392826988178984e-05,
      "loss": 0.3655,
      "step": 326
    },
    {
      "epoch": 1.834822210544754,
      "grad_norm": 0.22485128049462547,
      "learning_rate": 6.380236883307814e-05,
      "loss": 0.3649,
      "step": 327
    },
    {
      "epoch": 1.84042739534069,
      "grad_norm": 0.26646687828106147,
      "learning_rate": 6.367610163833015e-05,
      "loss": 0.3704,
      "step": 328
    },
    {
      "epoch": 1.8460325801366264,
      "grad_norm": 0.22618321758644827,
      "learning_rate": 6.35494702398833e-05,
      "loss": 0.3622,
      "step": 329
    },
    {
      "epoch": 1.8516377649325626,
      "grad_norm": 0.22154925411761456,
      "learning_rate": 6.342247658567753e-05,
      "loss": 0.366,
      "step": 330
    },
    {
      "epoch": 1.8572429497284988,
      "grad_norm": 0.29666284481675853,
      "learning_rate": 6.329512262922525e-05,
      "loss": 0.3689,
      "step": 331
    },
    {
      "epoch": 1.8628481345244352,
      "grad_norm": 0.2939745343835554,
      "learning_rate": 6.316741032958133e-05,
      "loss": 0.3592,
      "step": 332
    },
    {
      "epoch": 1.8684533193203714,
      "grad_norm": 0.2738063356122982,
      "learning_rate": 6.303934165131296e-05,
      "loss": 0.3632,
      "step": 333
    },
    {
      "epoch": 1.8740585041163076,
      "grad_norm": 0.25344755125569035,
      "learning_rate": 6.291091856446935e-05,
      "loss": 0.3682,
      "step": 334
    },
    {
      "epoch": 1.8796636889122438,
      "grad_norm": 0.22436307153393606,
      "learning_rate": 6.278214304455156e-05,
      "loss": 0.3657,
      "step": 335
    },
    {
      "epoch": 1.88526887370818,
      "grad_norm": 0.25070997700969644,
      "learning_rate": 6.265301707248199e-05,
      "loss": 0.3699,
      "step": 336
    },
    {
      "epoch": 1.8908740585041164,
      "grad_norm": 0.2733139068534298,
      "learning_rate": 6.252354263457403e-05,
      "loss": 0.3695,
      "step": 337
    },
    {
      "epoch": 1.8964792433000526,
      "grad_norm": 0.2866080779686849,
      "learning_rate": 6.239372172250134e-05,
      "loss": 0.3714,
      "step": 338
    },
    {
      "epoch": 1.9020844280959888,
      "grad_norm": 0.21011808838464177,
      "learning_rate": 6.226355633326739e-05,
      "loss": 0.3664,
      "step": 339
    },
    {
      "epoch": 1.9076896128919252,
      "grad_norm": 0.1975979876067148,
      "learning_rate": 6.21330484691746e-05,
      "loss": 0.3669,
      "step": 340
    },
    {
      "epoch": 1.9132947976878611,
      "grad_norm": 0.21416216664449675,
      "learning_rate": 6.200220013779366e-05,
      "loss": 0.3668,
      "step": 341
    },
    {
      "epoch": 1.9188999824837976,
      "grad_norm": 0.21995877340660605,
      "learning_rate": 6.187101335193252e-05,
      "loss": 0.3602,
      "step": 342
    },
    {
      "epoch": 1.9245051672797338,
      "grad_norm": 0.19030470602968677,
      "learning_rate": 6.173949012960552e-05,
      "loss": 0.3617,
      "step": 343
    },
    {
      "epoch": 1.93011035207567,
      "grad_norm": 0.2128195145941448,
      "learning_rate": 6.160763249400236e-05,
      "loss": 0.3624,
      "step": 344
    },
    {
      "epoch": 1.9357155368716064,
      "grad_norm": 0.245421223589709,
      "learning_rate": 6.147544247345684e-05,
      "loss": 0.3603,
      "step": 345
    },
    {
      "epoch": 1.9413207216675423,
      "grad_norm": 0.21883215784819807,
      "learning_rate": 6.134292210141585e-05,
      "loss": 0.3594,
      "step": 346
    },
    {
      "epoch": 1.9469259064634787,
      "grad_norm": 0.1967876169444006,
      "learning_rate": 6.121007341640797e-05,
      "loss": 0.368,
      "step": 347
    },
    {
      "epoch": 1.952531091259415,
      "grad_norm": 0.20005898323834861,
      "learning_rate": 6.10768984620121e-05,
      "loss": 0.3735,
      "step": 348
    },
    {
      "epoch": 1.9581362760553511,
      "grad_norm": 0.23622231826107862,
      "learning_rate": 6.0943399286826126e-05,
      "loss": 0.3621,
      "step": 349
    },
    {
      "epoch": 1.9637414608512875,
      "grad_norm": 0.27814473844495363,
      "learning_rate": 6.080957794443529e-05,
      "loss": 0.3583,
      "step": 350
    },
    {
      "epoch": 1.9693466456472237,
      "grad_norm": 0.302395060856694,
      "learning_rate": 6.067543649338069e-05,
      "loss": 0.3626,
      "step": 351
    },
    {
      "epoch": 1.97495183044316,
      "grad_norm": 0.2330957117550289,
      "learning_rate": 6.0540976997127534e-05,
      "loss": 0.3626,
      "step": 352
    },
    {
      "epoch": 1.9805570152390963,
      "grad_norm": 0.17919274741898725,
      "learning_rate": 6.040620152403351e-05,
      "loss": 0.3699,
      "step": 353
    },
    {
      "epoch": 1.9861622000350323,
      "grad_norm": 0.19040666115368088,
      "learning_rate": 6.0271112147316816e-05,
      "loss": 0.362,
      "step": 354
    },
    {
      "epoch": 1.9917673848309687,
      "grad_norm": 0.19757884670083642,
      "learning_rate": 6.013571094502443e-05,
      "loss": 0.3609,
      "step": 355
    },
    {
      "epoch": 1.997372569626905,
      "grad_norm": 0.2205769315300188,
      "learning_rate": 6.000000000000001e-05,
      "loss": 0.3747,
      "step": 356
    },
    {
      "epoch": 2.0056051847959364,
      "grad_norm": 0.2741039461067367,
      "learning_rate": 5.986398139985195e-05,
      "loss": 0.339,
      "step": 357
    },
    {
      "epoch": 2.0112103695918724,
      "grad_norm": 0.31588543514752565,
      "learning_rate": 5.97276572369212e-05,
      "loss": 0.3439,
      "step": 358
    },
    {
      "epoch": 2.016815554387809,
      "grad_norm": 0.46336824661725023,
      "learning_rate": 5.959102960824914e-05,
      "loss": 0.3396,
      "step": 359
    },
    {
      "epoch": 2.0224207391837448,
      "grad_norm": 0.6543572364352943,
      "learning_rate": 5.945410061554526e-05,
      "loss": 0.3462,
      "step": 360
    },
    {
      "epoch": 2.028025923979681,
      "grad_norm": 0.7224099179065516,
      "learning_rate": 5.931687236515485e-05,
      "loss": 0.3452,
      "step": 361
    },
    {
      "epoch": 2.0336311087756176,
      "grad_norm": 0.5577072940081442,
      "learning_rate": 5.917934696802667e-05,
      "loss": 0.3393,
      "step": 362
    },
    {
      "epoch": 2.0392362935715536,
      "grad_norm": 0.33124598923207127,
      "learning_rate": 5.904152653968032e-05,
      "loss": 0.3403,
      "step": 363
    },
    {
      "epoch": 2.04484147836749,
      "grad_norm": 0.4930639395918798,
      "learning_rate": 5.890341320017389e-05,
      "loss": 0.3404,
      "step": 364
    },
    {
      "epoch": 2.0504466631634264,
      "grad_norm": 0.4584696038181033,
      "learning_rate": 5.8765009074071176e-05,
      "loss": 0.3413,
      "step": 365
    },
    {
      "epoch": 2.0560518479593624,
      "grad_norm": 0.3093299417205952,
      "learning_rate": 5.8626316290409124e-05,
      "loss": 0.3414,
      "step": 366
    },
    {
      "epoch": 2.0616570327552988,
      "grad_norm": 0.39704392594198545,
      "learning_rate": 5.8487336982665016e-05,
      "loss": 0.337,
      "step": 367
    },
    {
      "epoch": 2.0672622175512347,
      "grad_norm": 0.30243949060200587,
      "learning_rate": 5.8348073288723625e-05,
      "loss": 0.342,
      "step": 368
    },
    {
      "epoch": 2.072867402347171,
      "grad_norm": 0.2815416454530177,
      "learning_rate": 5.820852735084443e-05,
      "loss": 0.3382,
      "step": 369
    },
    {
      "epoch": 2.0784725871431076,
      "grad_norm": 0.37397854680211123,
      "learning_rate": 5.8068701315628564e-0
| "loss": 0.338, |
| "step": 370 |
| }, |
| { |
| "epoch": 2.0840777719390435, |
| "grad_norm": 0.2659776331075204, |
| "learning_rate": 5.792859733398582e-05, |
| "loss": 0.338, |
| "step": 371 |
| }, |
| { |
| "epoch": 2.08968295673498, |
| "grad_norm": 0.263065032898508, |
| "learning_rate": 5.7788217561101604e-05, |
| "loss": 0.3399, |
| "step": 372 |
| }, |
| { |
| "epoch": 2.095288141530916, |
| "grad_norm": 0.25395323107827705, |
| "learning_rate": 5.7647564156403734e-05, |
| "loss": 0.3436, |
| "step": 373 |
| }, |
| { |
| "epoch": 2.1008933263268523, |
| "grad_norm": 0.26293260303616806, |
| "learning_rate": 5.750663928352923e-05, |
| "loss": 0.3335, |
| "step": 374 |
| }, |
| { |
| "epoch": 2.1064985111227887, |
| "grad_norm": 0.3165184344301909, |
| "learning_rate": 5.7365445110291063e-05, |
| "loss": 0.3308, |
| "step": 375 |
| }, |
| { |
| "epoch": 2.1121036959187247, |
| "grad_norm": 0.2209241133233396, |
| "learning_rate": 5.7223983808644757e-05, |
| "loss": 0.3384, |
| "step": 376 |
| }, |
| { |
| "epoch": 2.117708880714661, |
| "grad_norm": 0.22848826893513474, |
| "learning_rate": 5.7082257554655046e-05, |
| "loss": 0.3302, |
| "step": 377 |
| }, |
| { |
| "epoch": 2.123314065510597, |
| "grad_norm": 0.19866524720209594, |
| "learning_rate": 5.6940268528462324e-05, |
| "loss": 0.3325, |
| "step": 378 |
| }, |
| { |
| "epoch": 2.1289192503065335, |
| "grad_norm": 0.18924595566511337, |
| "learning_rate": 5.6798018914249176e-05, |
| "loss": 0.3409, |
| "step": 379 |
| }, |
| { |
| "epoch": 2.13452443510247, |
| "grad_norm": 0.2094808002199248, |
| "learning_rate": 5.665551090020671e-05, |
| "loss": 0.3368, |
| "step": 380 |
| }, |
| { |
| "epoch": 2.140129619898406, |
| "grad_norm": 0.1896298642995286, |
| "learning_rate": 5.651274667850099e-05, |
| "loss": 0.3382, |
| "step": 381 |
| }, |
| { |
| "epoch": 2.1457348046943423, |
| "grad_norm": 0.22305481312184167, |
| "learning_rate": 5.6369728445239216e-05, |
| "loss": 0.3365, |
| "step": 382 |
| }, |
| { |
| "epoch": 2.1513399894902787, |
| "grad_norm": 0.18402697711670618, |
| "learning_rate": 5.622645840043599e-05, |
| "loss": 0.3327, |
| "step": 383 |
| }, |
| { |
| "epoch": 2.1569451742862147, |
| "grad_norm": 0.18539919040240732, |
| "learning_rate": 5.60829387479795e-05, |
| "loss": 0.3367, |
| "step": 384 |
| }, |
| { |
| "epoch": 2.162550359082151, |
| "grad_norm": 0.21582225739629068, |
| "learning_rate": 5.5939171695597546e-05, |
| "loss": 0.3395, |
| "step": 385 |
| }, |
| { |
| "epoch": 2.168155543878087, |
| "grad_norm": 0.16294915710384228, |
| "learning_rate": 5.579515945482366e-05, |
| "loss": 0.3356, |
| "step": 386 |
| }, |
| { |
| "epoch": 2.1737607286740235, |
| "grad_norm": 0.17506193706528747, |
| "learning_rate": 5.5650904240963015e-05, |
| "loss": 0.3389, |
| "step": 387 |
| }, |
| { |
| "epoch": 2.17936591346996, |
| "grad_norm": 0.17986589577627823, |
| "learning_rate": 5.55064082730584e-05, |
| "loss": 0.3376, |
| "step": 388 |
| }, |
| { |
| "epoch": 2.184971098265896, |
| "grad_norm": 0.20225696584185415, |
| "learning_rate": 5.536167377385606e-05, |
| "loss": 0.3352, |
| "step": 389 |
| }, |
| { |
| "epoch": 2.1905762830618323, |
| "grad_norm": 0.20452311461397996, |
| "learning_rate": 5.521670296977151e-05, |
| "loss": 0.3427, |
| "step": 390 |
| }, |
| { |
| "epoch": 2.1961814678577682, |
| "grad_norm": 0.20853709114695754, |
| "learning_rate": 5.507149809085528e-05, |
| "loss": 0.3414, |
| "step": 391 |
| }, |
| { |
| "epoch": 2.2017866526537047, |
| "grad_norm": 0.19806872698081288, |
| "learning_rate": 5.4926061370758616e-05, |
| "loss": 0.3382, |
| "step": 392 |
| }, |
| { |
| "epoch": 2.207391837449641, |
| "grad_norm": 0.18154490506390078, |
| "learning_rate": 5.4780395046699116e-05, |
| "loss": 0.3334, |
| "step": 393 |
| }, |
| { |
| "epoch": 2.212997022245577, |
| "grad_norm": 0.20341890410491417, |
| "learning_rate": 5.4634501359426345e-05, |
| "loss": 0.3404, |
| "step": 394 |
| }, |
| { |
| "epoch": 2.2186022070415135, |
| "grad_norm": 0.22993218917327035, |
| "learning_rate": 5.4488382553187307e-05, |
| "loss": 0.3443, |
| "step": 395 |
| }, |
| { |
| "epoch": 2.2242073918374494, |
| "grad_norm": 0.26354208951183833, |
| "learning_rate": 5.434204087569199e-05, |
| "loss": 0.3377, |
| "step": 396 |
| }, |
| { |
| "epoch": 2.229812576633386, |
| "grad_norm": 0.26935013286659065, |
| "learning_rate": 5.419547857807871e-05, |
| "loss": 0.3383, |
| "step": 397 |
| }, |
| { |
| "epoch": 2.2354177614293222, |
| "grad_norm": 0.216246753815593, |
| "learning_rate": 5.404869791487958e-05, |
| "loss": 0.3354, |
| "step": 398 |
| }, |
| { |
| "epoch": 2.241022946225258, |
| "grad_norm": 0.18048698989035397, |
| "learning_rate": 5.390170114398575e-05, |
| "loss": 0.3425, |
| "step": 399 |
| }, |
| { |
| "epoch": 2.2466281310211946, |
| "grad_norm": 0.14602735362790997, |
| "learning_rate": 5.375449052661271e-05, |
| "loss": 0.3395, |
| "step": 400 |
| }, |
| { |
| "epoch": 2.252233315817131, |
| "grad_norm": 0.21006582382336506, |
| "learning_rate": 5.360706832726548e-05, |
| "loss": 0.3364, |
| "step": 401 |
| }, |
| { |
| "epoch": 2.257838500613067, |
| "grad_norm": 0.23551991763406468, |
| "learning_rate": 5.345943681370381e-05, |
| "loss": 0.3411, |
| "step": 402 |
| }, |
| { |
| "epoch": 2.2634436854090034, |
| "grad_norm": 0.23856885482494433, |
| "learning_rate": 5.33115982569073e-05, |
| "loss": 0.3418, |
| "step": 403 |
| }, |
| { |
| "epoch": 2.2690488702049394, |
| "grad_norm": 0.2300140164186402, |
| "learning_rate": 5.31635549310404e-05, |
| "loss": 0.3378, |
| "step": 404 |
| }, |
| { |
| "epoch": 2.274654055000876, |
| "grad_norm": 0.195987902529524, |
| "learning_rate": 5.3015309113417513e-05, |
| "loss": 0.3311, |
| "step": 405 |
| }, |
| { |
| "epoch": 2.280259239796812, |
| "grad_norm": 0.23463070349016596, |
| "learning_rate": 5.286686308446788e-05, |
| "loss": 0.3451, |
| "step": 406 |
| }, |
| { |
| "epoch": 2.285864424592748, |
| "grad_norm": 0.2288982821893169, |
| "learning_rate": 5.27182191277006e-05, |
| "loss": 0.3377, |
| "step": 407 |
| }, |
| { |
| "epoch": 2.2914696093886846, |
| "grad_norm": 0.2546834665154465, |
| "learning_rate": 5.256937952966942e-05, |
| "loss": 0.3377, |
| "step": 408 |
| }, |
| { |
| "epoch": 2.2970747941846206, |
| "grad_norm": 0.2278705546240396, |
| "learning_rate": 5.242034657993756e-05, |
| "loss": 0.3327, |
| "step": 409 |
| }, |
| { |
| "epoch": 2.302679978980557, |
| "grad_norm": 0.2120066354222386, |
| "learning_rate": 5.227112257104256e-05, |
| "loss": 0.3367, |
| "step": 410 |
| }, |
| { |
| "epoch": 2.3082851637764934, |
| "grad_norm": 0.21472567166361733, |
| "learning_rate": 5.2121709798460965e-05, |
| "loss": 0.3313, |
| "step": 411 |
| }, |
| { |
| "epoch": 2.3138903485724294, |
| "grad_norm": 0.1362367689167016, |
| "learning_rate": 5.197211056057304e-05, |
| "loss": 0.3351, |
| "step": 412 |
| }, |
| { |
| "epoch": 2.319495533368366, |
| "grad_norm": 0.15862071124581603, |
| "learning_rate": 5.182232715862738e-05, |
| "loss": 0.3338, |
| "step": 413 |
| }, |
| { |
| "epoch": 2.325100718164302, |
| "grad_norm": 0.18326499258519496, |
| "learning_rate": 5.167236189670551e-05, |
| "loss": 0.3404, |
| "step": 414 |
| }, |
| { |
| "epoch": 2.330705902960238, |
| "grad_norm": 0.1581861572597703, |
| "learning_rate": 5.152221708168652e-05, |
| "loss": 0.3375, |
| "step": 415 |
| }, |
| { |
| "epoch": 2.3363110877561746, |
| "grad_norm": 0.14207676687336432, |
| "learning_rate": 5.137189502321149e-05, |
| "loss": 0.3433, |
| "step": 416 |
| }, |
| { |
| "epoch": 2.3419162725521105, |
| "grad_norm": 0.1688755529335156, |
| "learning_rate": 5.122139803364798e-05, |
| "loss": 0.337, |
| "step": 417 |
| }, |
| { |
| "epoch": 2.347521457348047, |
| "grad_norm": 0.17084826149933108, |
| "learning_rate": 5.1070728428054506e-05, |
| "loss": 0.3337, |
| "step": 418 |
| }, |
| { |
| "epoch": 2.3531266421439834, |
| "grad_norm": 0.17086756093968097, |
| "learning_rate": 5.091988852414485e-05, |
| "loss": 0.3379, |
| "step": 419 |
| }, |
| { |
| "epoch": 2.3587318269399193, |
| "grad_norm": 0.17512853477255008, |
| "learning_rate": 5.07688806422525e-05, |
| "loss": 0.3346, |
| "step": 420 |
| }, |
| { |
| "epoch": 2.3643370117358558, |
| "grad_norm": 0.13968477165183565, |
| "learning_rate": 5.0617707105294876e-05, |
| "loss": 0.337, |
| "step": 421 |
| }, |
| { |
| "epoch": 2.3699421965317917, |
| "grad_norm": 0.1551281956472879, |
| "learning_rate": 5.046637023873763e-05, |
| "loss": 0.3414, |
| "step": 422 |
| }, |
| { |
| "epoch": 2.375547381327728, |
| "grad_norm": 0.13800979999513777, |
| "learning_rate": 5.0314872370558895e-05, |
| "loss": 0.332, |
| "step": 423 |
| }, |
| { |
| "epoch": 2.3811525661236645, |
| "grad_norm": 0.1367384754270518, |
| "learning_rate": 5.016321583121342e-05, |
| "loss": 0.3402, |
| "step": 424 |
| }, |
| { |
| "epoch": 2.3867577509196005, |
| "grad_norm": 0.1638479532373916, |
| "learning_rate": 5.00114029535968e-05, |
| "loss": 0.3325, |
| "step": 425 |
| }, |
| { |
| "epoch": 2.392362935715537, |
| "grad_norm": 0.16001330446175308, |
| "learning_rate": 4.985943607300951e-05, |
| "loss": 0.3378, |
| "step": 426 |
| }, |
| { |
| "epoch": 2.3979681205114733, |
| "grad_norm": 0.18621075203491558, |
| "learning_rate": 4.9707317527121e-05, |
| "loss": 0.3395, |
| "step": 427 |
| }, |
| { |
| "epoch": 2.4035733053074093, |
| "grad_norm": 0.18597047109928305, |
| "learning_rate": 4.9555049655933786e-05, |
| "loss": 0.3383, |
| "step": 428 |
| }, |
| { |
| "epoch": 2.4091784901033457, |
| "grad_norm": 0.16909692251131547, |
| "learning_rate": 4.940263480174741e-05, |
| "loss": 0.3336, |
| "step": 429 |
| }, |
| { |
| "epoch": 2.4147836748992817, |
| "grad_norm": 0.12955190193212013, |
| "learning_rate": 4.9250075309122414e-05, |
| "loss": 0.336, |
| "step": 430 |
| }, |
| { |
| "epoch": 2.420388859695218, |
| "grad_norm": 0.15190807156866015, |
| "learning_rate": 4.909737352484427e-05, |
| "loss": 0.3399, |
| "step": 431 |
| }, |
| { |
| "epoch": 2.425994044491154, |
| "grad_norm": 0.1573962735046912, |
| "learning_rate": 4.894453179788728e-05, |
| "loss": 0.3408, |
| "step": 432 |
| }, |
| { |
| "epoch": 2.4315992292870905, |
| "grad_norm": 0.1370853516111357, |
| "learning_rate": 4.879155247937849e-05, |
| "loss": 0.3318, |
| "step": 433 |
| }, |
| { |
| "epoch": 2.437204414083027, |
| "grad_norm": 0.15508060558426304, |
| "learning_rate": 4.8638437922561445e-05, |
| "loss": 0.3435, |
| "step": 434 |
| }, |
| { |
| "epoch": 2.442809598878963, |
| "grad_norm": 0.14664188616583732, |
| "learning_rate": 4.8485190482760046e-05, |
| "loss": 0.3303, |
| "step": 435 |
| }, |
| { |
| "epoch": 2.4484147836748993, |
| "grad_norm": 0.13239676829377417, |
| "learning_rate": 4.833181251734228e-05, |
| "loss": 0.3358, |
| "step": 436 |
| }, |
| { |
| "epoch": 2.4540199684708357, |
| "grad_norm": 0.1689890101652452, |
| "learning_rate": 4.8178306385684014e-05, |
| "loss": 0.3379, |
| "step": 437 |
| }, |
| { |
| "epoch": 2.4596251532667717, |
| "grad_norm": 0.19600685835243647, |
| "learning_rate": 4.802467444913263e-05, |
| "loss": 0.3375, |
| "step": 438 |
| }, |
| { |
| "epoch": 2.465230338062708, |
| "grad_norm": 0.1614065765277386, |
| "learning_rate": 4.787091907097075e-05, |
| "loss": 0.3353, |
| "step": 439 |
| }, |
| { |
| "epoch": 2.4708355228586445, |
| "grad_norm": 0.1508189987286625, |
| "learning_rate": 4.771704261637988e-05, |
| "loss": 0.3349, |
| "step": 440 |
| }, |
| { |
| "epoch": 2.4764407076545805, |
| "grad_norm": 0.18101253119293248, |
| "learning_rate": 4.756304745240398e-05, |
| "loss": 0.3408, |
| "step": 441 |
| }, |
| { |
| "epoch": 2.482045892450517, |
| "grad_norm": 0.1826261585945622, |
| "learning_rate": 4.740893594791314e-05, |
| "loss": 0.3351, |
| "step": 442 |
| }, |
| { |
| "epoch": 2.487651077246453, |
| "grad_norm": 0.14052710829053464, |
| "learning_rate": 4.7254710473567035e-05, |
| "loss": 0.3357, |
| "step": 443 |
| }, |
| { |
| "epoch": 2.4932562620423893, |
| "grad_norm": 0.14437812699305436, |
| "learning_rate": 4.710037340177855e-05, |
| "loss": 0.3323, |
| "step": 444 |
| }, |
| { |
| "epoch": 2.4988614468383252, |
| "grad_norm": 0.15602474948645395, |
| "learning_rate": 4.694592710667723e-05, |
| "loss": 0.3315, |
| "step": 445 |
| }, |
| { |
| "epoch": 2.5044666316342616, |
| "grad_norm": 0.14055328691934155, |
| "learning_rate": 4.6791373964072755e-05, |
| "loss": 0.3417, |
| "step": 446 |
| }, |
| { |
| "epoch": 2.510071816430198, |
| "grad_norm": 0.1388879025795148, |
| "learning_rate": 4.663671635141844e-05, |
| "loss": 0.3334, |
| "step": 447 |
| }, |
| { |
| "epoch": 2.515677001226134, |
| "grad_norm": 0.14203432091623683, |
| "learning_rate": 4.648195664777466e-05, |
| "loss": 0.3293, |
| "step": 448 |
| }, |
| { |
| "epoch": 2.5212821860220704, |
| "grad_norm": 0.1362951045485, |
| "learning_rate": 4.6327097233772167e-05, |
| "loss": 0.3398, |
| "step": 449 |
| }, |
| { |
| "epoch": 2.526887370818007, |
| "grad_norm": 0.13873000403640978, |
| "learning_rate": 4.617214049157559e-05, |
| "loss": 0.3447, |
| "step": 450 |
| }, |
| { |
| "epoch": 2.532492555613943, |
| "grad_norm": 0.13932412475931288, |
| "learning_rate": 4.601708880484672e-05, |
| "loss": 0.3378, |
| "step": 451 |
| }, |
| { |
| "epoch": 2.5380977404098792, |
| "grad_norm": 0.13616797667697145, |
| "learning_rate": 4.586194455870782e-05, |
| "loss": 0.3357, |
| "step": 452 |
| }, |
| { |
| "epoch": 2.5437029252058156, |
| "grad_norm": 0.12388008240947325, |
| "learning_rate": 4.5706710139705035e-05, |
| "loss": 0.3367, |
| "step": 453 |
| }, |
| { |
| "epoch": 2.5493081100017516, |
| "grad_norm": 0.13428327954219868, |
| "learning_rate": 4.555138793577156e-05, |
| "loss": 0.3372, |
| "step": 454 |
| }, |
| { |
| "epoch": 2.5549132947976876, |
| "grad_norm": 0.12462707556187247, |
| "learning_rate": 4.5395980336191e-05, |
| "loss": 0.3386, |
| "step": 455 |
| }, |
| { |
| "epoch": 2.560518479593624, |
| "grad_norm": 0.1235771755300071, |
| "learning_rate": 4.524048973156056e-05, |
| "loss": 0.3381, |
| "step": 456 |
| }, |
| { |
| "epoch": 2.5661236643895604, |
| "grad_norm": 0.14711243405086183, |
| "learning_rate": 4.508491851375431e-05, |
| "loss": 0.3316, |
| "step": 457 |
| }, |
| { |
| "epoch": 2.5717288491854964, |
| "grad_norm": 0.14459114492388414, |
| "learning_rate": 4.4929269075886345e-05, |
| "loss": 0.3298, |
| "step": 458 |
| }, |
| { |
| "epoch": 2.577334033981433, |
| "grad_norm": 0.12808061163839768, |
| "learning_rate": 4.477354381227405e-05, |
| "loss": 0.3365, |
| "step": 459 |
| }, |
| { |
| "epoch": 2.582939218777369, |
| "grad_norm": 0.1605215774840903, |
| "learning_rate": 4.4617745118401146e-05, |
| "loss": 0.3436, |
| "step": 460 |
| }, |
| { |
| "epoch": 2.588544403573305, |
| "grad_norm": 0.1686937530955673, |
| "learning_rate": 4.446187539088098e-05, |
| "loss": 0.3401, |
| "step": 461 |
| }, |
| { |
| "epoch": 2.5941495883692416, |
| "grad_norm": 0.1527872446582201, |
| "learning_rate": 4.4305937027419554e-05, |
| "loss": 0.336, |
| "step": 462 |
| }, |
| { |
| "epoch": 2.599754773165178, |
| "grad_norm": 0.15128554865510688, |
| "learning_rate": 4.4149932426778726e-05, |
| "loss": 0.3344, |
| "step": 463 |
| }, |
| { |
| "epoch": 2.605359957961114, |
| "grad_norm": 0.14920667419147088, |
| "learning_rate": 4.399386398873919e-05, |
| "loss": 0.337, |
| "step": 464 |
| }, |
| { |
| "epoch": 2.6109651427570504, |
| "grad_norm": 0.1744457398926105, |
| "learning_rate": 4.383773411406369e-05, |
| "loss": 0.3315, |
| "step": 465 |
| }, |
| { |
| "epoch": 2.6165703275529864, |
| "grad_norm": 0.17078707601958157, |
| "learning_rate": 4.368154520446e-05, |
| "loss": 0.3381, |
| "step": 466 |
| }, |
| { |
| "epoch": 2.6221755123489228, |
| "grad_norm": 0.13633202612822465, |
| "learning_rate": 4.352529966254408e-05, |
| "loss": 0.3356, |
| "step": 467 |
| }, |
| { |
| "epoch": 2.6277806971448587, |
| "grad_norm": 0.15243033406951811, |
| "learning_rate": 4.336899989180297e-05, |
| "loss": 0.336, |
| "step": 468 |
| }, |
| { |
| "epoch": 2.633385881940795, |
| "grad_norm": 0.14747243716476988, |
| "learning_rate": 4.3212648296557956e-05, |
| "loss": 0.3404, |
| "step": 469 |
| }, |
| { |
| "epoch": 2.6389910667367316, |
| "grad_norm": 0.14313810836029056, |
| "learning_rate": 4.305624728192749e-05, |
| "loss": 0.3383, |
| "step": 470 |
| }, |
| { |
| "epoch": 2.6445962515326675, |
| "grad_norm": 0.15292894547047348, |
| "learning_rate": 4.289979925379025e-05, |
| "loss": 0.3347, |
| "step": 471 |
| }, |
| { |
| "epoch": 2.650201436328604, |
| "grad_norm": 0.16666900752167832, |
| "learning_rate": 4.274330661874812e-05, |
| "loss": 0.3389, |
| "step": 472 |
| }, |
| { |
| "epoch": 2.6558066211245404, |
| "grad_norm": 0.14101399886632246, |
| "learning_rate": 4.258677178408914e-05, |
| "loss": 0.3472, |
| "step": 473 |
| }, |
| { |
| "epoch": 2.6614118059204763, |
| "grad_norm": 0.14283842141759293, |
| "learning_rate": 4.2430197157750506e-05, |
| "loss": 0.3288, |
| "step": 474 |
| }, |
| { |
| "epoch": 2.6670169907164127, |
| "grad_norm": 0.18182542514094624, |
| "learning_rate": 4.227358514828151e-05, |
| "loss": 0.3344, |
| "step": 475 |
| }, |
| { |
| "epoch": 2.672622175512349, |
| "grad_norm": 0.15313956411935128, |
| "learning_rate": 4.2116938164806523e-05, |
| "loss": 0.3448, |
| "step": 476 |
| }, |
| { |
| "epoch": 2.678227360308285, |
| "grad_norm": 0.15589616508314744, |
| "learning_rate": 4.19602586169879e-05, |
| "loss": 0.3429, |
| "step": 477 |
| }, |
| { |
| "epoch": 2.6838325451042215, |
| "grad_norm": 0.18115048482512122, |
| "learning_rate": 4.1803548914988915e-05, |
| "loss": 0.3341, |
| "step": 478 |
| }, |
| { |
| "epoch": 2.6894377299001575, |
| "grad_norm": 0.15383530203792303, |
| "learning_rate": 4.164681146943672e-05, |
| "loss": 0.3369, |
| "step": 479 |
| }, |
| { |
| "epoch": 2.695042914696094, |
| "grad_norm": 0.18132657455214243, |
| "learning_rate": 4.1490048691385184e-05, |
| "loss": 0.3387, |
| "step": 480 |
| }, |
| { |
| "epoch": 2.70064809949203, |
| "grad_norm": 0.1663579198570477, |
| "learning_rate": 4.133326299227796e-05, |
| "loss": 0.3426, |
| "step": 481 |
| }, |
| { |
| "epoch": 2.7062532842879663, |
| "grad_norm": 0.15912216978968627, |
| "learning_rate": 4.1176456783911186e-05, |
| "loss": 0.3391, |
| "step": 482 |
| }, |
| { |
| "epoch": 2.7118584690839027, |
| "grad_norm": 0.14961940121164838, |
| "learning_rate": 4.1019632478396535e-05, |
| "loss": 0.3346, |
| "step": 483 |
| }, |
| { |
| "epoch": 2.7174636538798387, |
| "grad_norm": 0.13821373229896533, |
| "learning_rate": 4.0862792488124084e-05, |
| "loss": 0.3444, |
| "step": 484 |
| }, |
| { |
| "epoch": 2.723068838675775, |
| "grad_norm": 0.1446248191283951, |
| "learning_rate": 4.070593922572515e-05, |
| "loss": 0.3397, |
| "step": 485 |
| }, |
| { |
| "epoch": 2.7286740234717115, |
| "grad_norm": 0.14611396725110462, |
| "learning_rate": 4.0549075104035235e-05, |
| "loss": 0.3381, |
| "step": 486 |
| }, |
| { |
| "epoch": 2.7342792082676475, |
| "grad_norm": 0.15021839121813585, |
| "learning_rate": 4.0392202536056864e-05, |
| "loss": 0.3376, |
| "step": 487 |
| }, |
| { |
| "epoch": 2.739884393063584, |
| "grad_norm": 0.1222365954508722, |
| "learning_rate": 4.023532393492249e-05, |
| "loss": 0.3418, |
| "step": 488 |
| }, |
| { |
| "epoch": 2.7454895778595203, |
| "grad_norm": 0.13385222223998502, |
| "learning_rate": 4.007844171385742e-05, |
| "loss": 0.3375, |
| "step": 489 |
| }, |
| { |
| "epoch": 2.7510947626554563, |
| "grad_norm": 0.16084769560491935, |
| "learning_rate": 3.992155828614259e-05, |
| "loss": 0.3383, |
| "step": 490 |
| }, |
| { |
| "epoch": 2.7566999474513927, |
| "grad_norm": 0.11593149953710691, |
| "learning_rate": 3.976467606507752e-05, |
| "loss": 0.334, |
| "step": 491 |
| }, |
| { |
| "epoch": 2.7623051322473287, |
| "grad_norm": 0.12135253379499054, |
| "learning_rate": 3.960779746394315e-05, |
| "loss": 0.3369, |
| "step": 492 |
| }, |
| { |
| "epoch": 2.767910317043265, |
| "grad_norm": 0.15157050861544064, |
| "learning_rate": 3.9450924895964785e-05, |
| "loss": 0.3378, |
| "step": 493 |
| }, |
| { |
| "epoch": 2.773515501839201, |
| "grad_norm": 0.13097668206944849, |
| "learning_rate": 3.929406077427486e-05, |
| "loss": 0.3378, |
| "step": 494 |
| }, |
| { |
| "epoch": 2.7791206866351374, |
| "grad_norm": 0.16603204878535843, |
| "learning_rate": 3.913720751187593e-05, |
| "loss": 0.335, |
| "step": 495 |
| }, |
| { |
| "epoch": 2.784725871431074, |
| "grad_norm": 0.14752638359057793, |
| "learning_rate": 3.898036752160348e-05, |
| "loss": 0.3333, |
| "step": 496 |
| }, |
| { |
| "epoch": 2.79033105622701, |
| "grad_norm": 0.11483015755025572, |
| "learning_rate": 3.882354321608883e-05, |
| "loss": 0.3324, |
| "step": 497 |
| }, |
| { |
| "epoch": 2.7959362410229462, |
| "grad_norm": 0.15227040786502544, |
| "learning_rate": 3.8666737007722055e-05, |
| "loss": 0.3334, |
| "step": 498 |
| }, |
| { |
| "epoch": 2.8015414258188827, |
| "grad_norm": 0.15767384723901806, |
| "learning_rate": 3.8509951308614816e-05, |
| "loss": 0.3346, |
| "step": 499 |
| }, |
| { |
| "epoch": 2.8071466106148186, |
| "grad_norm": 0.12626963411483594, |
| "learning_rate": 3.8353188530563296e-05, |
| "loss": 0.3433, |
| "step": 500 |
| }, |
| { |
| "epoch": 2.812751795410755, |
| "grad_norm": 0.13733431001138763, |
| "learning_rate": 3.8196451085011085e-05, |
| "loss": 0.3327, |
| "step": 501 |
| }, |
| { |
| "epoch": 2.8183569802066915, |
| "grad_norm": 0.14680623002425597, |
| "learning_rate": 3.80397413830121e-05, |
| "loss": 0.3362, |
| "step": 502 |
| }, |
| { |
| "epoch": 2.8239621650026274, |
| "grad_norm": 0.13113991590813315, |
| "learning_rate": 3.7883061835193476e-05, |
| "loss": 0.3316, |
| "step": 503 |
| }, |
| { |
| "epoch": 2.829567349798564, |
| "grad_norm": 0.14765888572997615, |
| "learning_rate": 3.772641485171849e-05, |
| "loss": 0.3354, |
| "step": 504 |
| }, |
| { |
| "epoch": 2.8351725345945, |
| "grad_norm": 0.13209441385102483, |
| "learning_rate": 3.756980284224951e-05, |
| "loss": 0.3387, |
| "step": 505 |
| }, |
| { |
| "epoch": 2.840777719390436, |
| "grad_norm": 0.14879932604775253, |
| "learning_rate": 3.7413228215910866e-05, |
| "loss": 0.3369, |
| "step": 506 |
| }, |
| { |
| "epoch": 2.846382904186372, |
| "grad_norm": 0.13679808581693134, |
| "learning_rate": 3.725669338125189e-05, |
| "loss": 0.3316, |
| "step": 507 |
| }, |
| { |
| "epoch": 2.8519880889823086, |
| "grad_norm": 0.14870932780889795, |
| "learning_rate": 3.710020074620976e-05, |
| "loss": 0.3411, |
| "step": 508 |
| }, |
| { |
| "epoch": 2.857593273778245, |
| "grad_norm": 0.15465220506717361, |
| "learning_rate": 3.6943752718072526e-05, |
| "loss": 0.3431, |
| "step": 509 |
| }, |
| { |
| "epoch": 2.863198458574181, |
| "grad_norm": 0.13086189881896804, |
| "learning_rate": 3.6787351703442064e-05, |
| "loss": 0.3361, |
| "step": 510 |
| }, |
| { |
| "epoch": 2.8688036433701174, |
| "grad_norm": 0.15533250172898358, |
| "learning_rate": 3.663100010819704e-05, |
| "loss": 0.3409, |
| "step": 511 |
| }, |
| { |
| "epoch": 2.874408828166054, |
| "grad_norm": 0.15368379331778698, |
| "learning_rate": 3.6474700337455946e-05, |
| "loss": 0.3366, |
| "step": 512 |
| }, |
| { |
| "epoch": 2.8800140129619898, |
| "grad_norm": 0.13213250938368978, |
| "learning_rate": 3.631845479554001e-05, |
| "loss": 0.3404, |
| "step": 513 |
| }, |
| { |
| "epoch": 2.885619197757926, |
| "grad_norm": 0.12630288060204878, |
| "learning_rate": 3.616226588593634e-05, |
| "loss": 0.3364, |
| "step": 514 |
| }, |
| { |
| "epoch": 2.891224382553862, |
| "grad_norm": 0.13593521293454738, |
| "learning_rate": 3.6006136011260835e-05, |
| "loss": 0.3381, |
| "step": 515 |
| }, |
| { |
| "epoch": 2.8968295673497986, |
| "grad_norm": 0.11554742949705811, |
| "learning_rate": 3.5850067573221294e-05, |
| "loss": 0.331, |
| "step": 516 |
| }, |
| { |
| "epoch": 2.9024347521457345, |
| "grad_norm": 0.1198528477058993, |
| "learning_rate": 3.569406297258045e-05, |
| "loss": 0.3382, |
| "step": 517 |
| }, |
| { |
| "epoch": 2.908039936941671, |
| "grad_norm": 0.12987613409396606, |
| "learning_rate": 3.553812460911903e-05, |
| "loss": 0.332, |
| "step": 518 |
| }, |
| { |
| "epoch": 2.9136451217376074, |
| "grad_norm": 0.10360442006807373, |
| "learning_rate": 3.538225488159886e-05, |
| "loss": 0.3345, |
| "step": 519 |
| }, |
| { |
| "epoch": 2.9192503065335433, |
| "grad_norm": 0.11999841966016728, |
| "learning_rate": 3.5226456187725966e-05, |
| "loss": 0.3356, |
| "step": 520 |
| }, |
| { |
| "epoch": 2.9248554913294798, |
| "grad_norm": 0.12157002192921504, |
| "learning_rate": 3.507073092411366e-05, |
| "loss": 0.331, |
| "step": 521 |
| }, |
| { |
| "epoch": 2.930460676125416, |
| "grad_norm": 0.11962352346017135, |
| "learning_rate": 3.4915081486245696e-05, |
| "loss": 0.3221, |
| "step": 522 |
| }, |
| { |
| "epoch": 2.936065860921352, |
| "grad_norm": 0.10590981284086302, |
| "learning_rate": 3.4759510268439444e-05, |
| "loss": 0.3271, |
| "step": 523 |
| }, |
| { |
| "epoch": 2.9416710457172885, |
| "grad_norm": 0.12508588575966806, |
| "learning_rate": 3.460401966380901e-05, |
| "loss": 0.334, |
| "step": 524 |
| }, |
| { |
| "epoch": 2.947276230513225, |
| "grad_norm": 0.12411992558068433, |
| "learning_rate": 3.4448612064228455e-05, |
| "loss": 0.3342, |
| "step": 525 |
| }, |
| { |
| "epoch": 2.952881415309161, |
| "grad_norm": 0.12323545333378516, |
| "learning_rate": 3.4293289860294985e-05, |
| "loss": 0.3397, |
| "step": 526 |
| }, |
| { |
| "epoch": 2.9584866001050973, |
| "grad_norm": 0.12211808120842779, |
| "learning_rate": 3.4138055441292186e-05, |
| "loss": 0.333, |
| "step": 527 |
| }, |
| { |
| "epoch": 2.9640917849010333, |
| "grad_norm": 0.11492860031416643, |
| "learning_rate": 3.3982911195153294e-05, |
| "loss": 0.3329, |
| "step": 528 |
| }, |
| { |
| "epoch": 2.9696969696969697, |
| "grad_norm": 0.13508535922750942, |
| "learning_rate": 3.3827859508424415e-05, |
| "loss": 0.3398, |
| "step": 529 |
| }, |
| { |
| "epoch": 2.9753021544929057, |
| "grad_norm": 0.11565425526423825, |
| "learning_rate": 3.367290276622785e-05, |
| "loss": 0.3365, |
| "step": 530 |
| }, |
| { |
| "epoch": 2.980907339288842, |
| "grad_norm": 0.13139009508968535, |
| "learning_rate": 3.3518043352225354e-05, |
| "loss": 0.3312, |
| "step": 531 |
| }, |
| { |
| "epoch": 2.9865125240847785, |
| "grad_norm": 0.1362299448189819, |
| "learning_rate": 3.3363283648581564e-05, |
| "loss": 0.3292, |
| "step": 532 |
| }, |
| { |
| "epoch": 2.9921177088807145, |
| "grad_norm": 0.12683117549730533, |
| "learning_rate": 3.3208626035927265e-05, |
| "loss": 0.3306, |
| "step": 533 |
| }, |
| { |
| "epoch": 2.997722893676651, |
| "grad_norm": 0.10969694374220536, |
| "learning_rate": 3.305407289332279e-05, |
| "loss": 0.3331, |
| "step": 534 |
| }, |
| { |
| "epoch": 3.00525486074619, |
| "grad_norm": 0.3592894801843749, |
| "learning_rate": 3.289962659822146e-05, |
| "loss": 0.612, |
| "step": 535 |
| }, |
| { |
| "epoch": 3.0108600455421266, |
| "grad_norm": 0.2839378100457828, |
| "learning_rate": 3.274528952643296e-05, |
| "loss": 0.309, |
| "step": 536 |
| }, |
| { |
| "epoch": 3.0164652303380626, |
| "grad_norm": 0.2432146356050859, |
| "learning_rate": 3.259106405208686e-05, |
| "loss": 0.3106, |
| "step": 537 |
| }, |
| { |
| "epoch": 3.022070415133999, |
| "grad_norm": 0.2777016538919042, |
| "learning_rate": 3.2436952547596016e-05, |
| "loss": 0.3208, |
| "step": 538 |
| }, |
| { |
| "epoch": 3.0276755999299354, |
| "grad_norm": 0.29446096248561565, |
| "learning_rate": 3.228295738362013e-05, |
| "loss": 0.3128, |
| "step": 539 |
| }, |
| { |
| "epoch": 3.0332807847258714, |
| "grad_norm": 0.2134321257824934, |
| "learning_rate": 3.212908092902925e-05, |
| "loss": 0.3123, |
| "step": 540 |
| }, |
| { |
| "epoch": 3.038885969521808, |
| "grad_norm": 0.2929761811829302, |
| "learning_rate": 3.1975325550867376e-05, |
| "loss": 0.3149, |
| "step": 541 |
| }, |
| { |
| "epoch": 3.0444911543177438, |
| "grad_norm": 0.23083582350400958, |
| "learning_rate": 3.182169361431599e-05, |
| "loss": 0.3181, |
| "step": 542 |
| }, |
| { |
| "epoch": 3.05009633911368, |
| "grad_norm": 0.22809559973140262, |
| "learning_rate": 3.1668187482657724e-05, |
| "loss": 0.3196, |
| "step": 543 |
| }, |
| { |
| "epoch": 3.0557015239096166, |
| "grad_norm": 0.2825820642402612, |
| "learning_rate": 3.151480951723997e-05, |
| "loss": 0.3146, |
| "step": 544 |
| }, |
| { |
| "epoch": 3.0613067087055525, |
| "grad_norm": 0.2033350745344508, |
| "learning_rate": 3.1361562077438575e-05, |
| "loss": 0.3122, |
| "step": 545 |
| }, |
| { |
| "epoch": 3.066911893501489, |
| "grad_norm": 0.21932897795785233, |
| "learning_rate": 3.120844752062153e-05, |
| "loss": 0.3124, |
| "step": 546 |
| }, |
| { |
| "epoch": 3.072517078297425, |
| "grad_norm": 0.21168302798367983, |
| "learning_rate": 3.1055468202112734e-05, |
| "loss": 0.3166, |
| "step": 547 |
| }, |
| { |
| "epoch": 3.0781222630933613, |
| "grad_norm": 0.17772176835113396, |
| "learning_rate": 3.090262647515575e-05, |
| "loss": 0.3077, |
| "step": 548 |
| }, |
| { |
| "epoch": 3.0837274478892978, |
| "grad_norm": 0.17841507645772034, |
| "learning_rate": 3.0749924690877606e-05, |
| "loss": 0.314, |
| "step": 549 |
| }, |
| { |
| "epoch": 3.0893326326852337, |
| "grad_norm": 0.1771081138385722, |
| "learning_rate": 3.0597365198252605e-05, |
| "loss": 0.3145, |
| "step": 550 |
| }, |
| { |
| "epoch": 3.09493781748117, |
| "grad_norm": 0.17292167901780775, |
| "learning_rate": 3.044495034406623e-05, |
| "loss": 0.3141, |
| "step": 551 |
| }, |
| { |
| "epoch": 3.100543002277106, |
| "grad_norm": 0.15958589623983585, |
| "learning_rate": 3.0292682472879016e-05, |
| "loss": 0.309, |
| "step": 552 |
| }, |
| { |
| "epoch": 3.1061481870730425, |
| "grad_norm": 0.17823919421432835, |
| "learning_rate": 3.014056392699051e-05, |
| "loss": 0.3107, |
| "step": 553 |
| }, |
| { |
| "epoch": 3.111753371868979, |
| "grad_norm": 0.14347972248545185, |
| "learning_rate": 2.998859704640321e-05, |
| "loss": 0.3122, |
| "step": 554 |
| }, |
| { |
| "epoch": 3.117358556664915, |
| "grad_norm": 0.15502920693257438, |
| "learning_rate": 2.9836784168786587e-05, |
| "loss": 0.3079, |
| "step": 555 |
| }, |
| { |
| "epoch": 3.1229637414608513, |
| "grad_norm": 0.14905838975522384, |
| "learning_rate": 2.968512762944112e-05, |
| "loss": 0.3085, |
| "step": 556 |
| }, |
| { |
| "epoch": 3.1285689262567877, |
| "grad_norm": 0.15067799733981543, |
| "learning_rate": 2.953362976126238e-05, |
| "loss": 0.3173, |
| "step": 557 |
| }, |
| { |
| "epoch": 3.1341741110527237, |
| "grad_norm": 0.1352871810582152, |
| "learning_rate": 2.9382292894705137e-05, |
| "loss": 0.3168, |
| "step": 558 |
| }, |
| { |
| "epoch": 3.13977929584866, |
| "grad_norm": 0.13712672403520984, |
| "learning_rate": 2.9231119357747514e-05, |
| "loss": 0.3096, |
| "step": 559 |
| }, |
| { |
| "epoch": 3.145384480644596, |
| "grad_norm": 0.13040410642011227, |
| "learning_rate": 2.908011147585516e-05, |
| "loss": 0.313, |
| "step": 560 |
| }, |
| { |
| "epoch": 3.1509896654405325, |
| "grad_norm": 0.11413659831512099, |
| "learning_rate": 2.8929271571945504e-05, |
| "loss": 0.3173, |
| "step": 561 |
| }, |
| { |
| "epoch": 3.156594850236469, |
| "grad_norm": 0.13404522891478707, |
| "learning_rate": 2.8778601966352028e-05, |
| "loss": 0.3129, |
| "step": 562 |
| }, |
| { |
| "epoch": 3.162200035032405, |
| "grad_norm": 0.12169309736493167, |
| "learning_rate": 2.8628104976788527e-05, |
| "loss": 0.3144, |
| "step": 563 |
| }, |
| { |
| "epoch": 3.1678052198283413, |
| "grad_norm": 0.12136151456869423, |
| "learning_rate": 2.8477782918313495e-05, |
| "loss": 0.3101, |
| "step": 564 |
| }, |
| { |
| "epoch": 3.1734104046242773, |
| "grad_norm": 0.12720103445578, |
| "learning_rate": 2.83276381032945e-05, |
| "loss": 0.3064, |
| "step": 565 |
| }, |
| { |
| "epoch": 3.1790155894202137, |
| "grad_norm": 0.11658578152683795, |
| "learning_rate": 2.8177672841372642e-05, |
| "loss": 0.3104, |
| "step": 566 |
| }, |
| { |
| "epoch": 3.18462077421615, |
| "grad_norm": 0.1297562392547841, |
| "learning_rate": 2.802788943942697e-05, |
| "loss": 0.3123, |
| "step": 567 |
| }, |
| { |
| "epoch": 3.190225959012086, |
| "grad_norm": 0.11985456340120923, |
| "learning_rate": 2.787829020153904e-05, |
| "loss": 0.3146, |
| "step": 568 |
| }, |
| { |
| "epoch": 3.1958311438080225, |
| "grad_norm": 0.1262651870939076, |
| "learning_rate": 2.772887742895745e-05, |
| "loss": 0.3075, |
| "step": 569 |
| }, |
| { |
| "epoch": 3.2014363286039584, |
| "grad_norm": 0.12464097287272473, |
| "learning_rate": 2.7579653420062444e-05, |
| "loss": 0.3045, |
| "step": 570 |
| }, |
| { |
| "epoch": 3.207041513399895, |
| "grad_norm": 0.12341347110151742, |
| "learning_rate": 2.7430620470330588e-05, |
| "loss": 0.3052, |
| "step": 571 |
| }, |
| { |
| "epoch": 3.2126466981958313, |
| "grad_norm": 0.11689537607575821, |
| "learning_rate": 2.7281780872299397e-05, |
| "loss": 0.3092, |
| "step": 572 |
| }, |
| { |
| "epoch": 3.2182518829917672, |
| "grad_norm": 0.12733447393829486, |
| "learning_rate": 2.7133136915532117e-05, |
| "loss": 0.3126, |
| "step": 573 |
| }, |
| { |
| "epoch": 3.2238570677877036, |
| "grad_norm": 0.1123962660816928, |
| "learning_rate": 2.69846908865825e-05, |
| "loss": 0.3125, |
| "step": 574 |
| }, |
| { |
| "epoch": 3.22946225258364, |
| "grad_norm": 0.13480471605245697, |
| "learning_rate": 2.68364450689596e-05, |
| "loss": 0.3186, |
| "step": 575 |
| }, |
| { |
| "epoch": 3.235067437379576, |
| "grad_norm": 0.11275478327505037, |
| "learning_rate": 2.6688401743092704e-05, |
| "loss": 0.3172, |
| "step": 576 |
| }, |
| { |
| "epoch": 3.2406726221755124, |
| "grad_norm": 0.12484323278884911, |
| "learning_rate": 2.6540563186296186e-05, |
| "loss": 0.3102, |
| "step": 577 |
| }, |
| { |
| "epoch": 3.2462778069714484, |
| "grad_norm": 0.11197370815895867, |
| "learning_rate": 2.639293167273453e-05, |
| "loss": 0.3031, |
| "step": 578 |
| }, |
| { |
| "epoch": 3.251882991767385, |
| "grad_norm": 0.11454623610347744, |
| "learning_rate": 2.6245509473387296e-05, |
| "loss": 0.3065, |
| "step": 579 |
| }, |
| { |
| "epoch": 3.2574881765633212, |
| "grad_norm": 0.12304422921255855, |
| "learning_rate": 2.609829885601425e-05, |
| "loss": 0.3089, |
| "step": 580 |
| }, |
| { |
| "epoch": 3.263093361359257, |
| "grad_norm": 0.11259423111356101, |
| "learning_rate": 2.5951302085120437e-05, |
| "loss": 0.3105, |
| "step": 581 |
| }, |
| { |
| "epoch": 3.2686985461551936, |
| "grad_norm": 0.11271478685448443, |
| "learning_rate": 2.5804521421921305e-05, |
| "loss": 0.3068, |
| "step": 582 |
| }, |
| { |
| "epoch": 3.2743037309511296, |
| "grad_norm": 0.11647961636111298, |
| "learning_rate": 2.5657959124308036e-05, |
| "loss": 0.316, |
| "step": 583 |
| }, |
| { |
| "epoch": 3.279908915747066, |
| "grad_norm": 0.12470089799540612, |
| "learning_rate": 2.551161744681271e-05, |
| "loss": 0.3122, |
| "step": 584 |
| }, |
| { |
| "epoch": 3.2855141005430024, |
| "grad_norm": 0.09459288821151796, |
| "learning_rate": 2.5365498640573675e-05, |
| "loss": 0.3082, |
| "step": 585 |
| }, |
| { |
| "epoch": 3.2911192853389384, |
| "grad_norm": 0.12027588781638404, |
| "learning_rate": 2.5219604953300897e-05, |
| "loss": 0.3104, |
| "step": 586 |
| }, |
| { |
| "epoch": 3.296724470134875, |
| "grad_norm": 0.09693184441523361, |
| "learning_rate": 2.5073938629241404e-05, |
| "loss": 0.3113, |
| "step": 587 |
| }, |
| { |
| "epoch": 3.302329654930811, |
| "grad_norm": 0.11431466893797337, |
| "learning_rate": 2.4928501909144735e-05, |
| "loss": 0.3122, |
| "step": 588 |
| }, |
| { |
| "epoch": 3.307934839726747, |
| "grad_norm": 0.10947655679887262, |
| "learning_rate": 2.4783297030228504e-05, |
| "loss": 0.3059, |
| "step": 589 |
| }, |
| { |
| "epoch": 3.3135400245226836, |
| "grad_norm": 0.10724694744795145, |
| "learning_rate": 2.4638326226143955e-05, |
| "loss": 0.3059, |
| "step": 590 |
| }, |
| { |
| "epoch": 3.3191452093186196, |
| "grad_norm": 0.10607719365259763, |
| "learning_rate": 2.449359172694161e-05, |
| "loss": 0.3126, |
| "step": 591 |
| }, |
| { |
| "epoch": 3.324750394114556, |
| "grad_norm": 0.10720345042194546, |
| "learning_rate": 2.4349095759037e-05, |
| "loss": 0.3089, |
| "step": 592 |
| }, |
| { |
| "epoch": 3.3303555789104924, |
| "grad_norm": 0.10273817164660135, |
| "learning_rate": 2.4204840545176356e-05, |
| "loss": 0.3108, |
| "step": 593 |
| }, |
| { |
| "epoch": 3.3359607637064284, |
| "grad_norm": 0.09975537559192163, |
| "learning_rate": 2.406082830440247e-05, |
| "loss": 0.3124, |
| "step": 594 |
| }, |
| { |
| "epoch": 3.3415659485023648, |
| "grad_norm": 0.09406955914926102, |
| "learning_rate": 2.3917061252020513e-05, |
| "loss": 0.316, |
| "step": 595 |
| }, |
| { |
| "epoch": 3.3471711332983007, |
| "grad_norm": 0.09940842953248977, |
| "learning_rate": 2.3773541599564016e-05, |
| "loss": 0.3127, |
| "step": 596 |
| }, |
| { |
| "epoch": 3.352776318094237, |
| "grad_norm": 0.10553838056133437, |
| "learning_rate": 2.36302715547608e-05, |
| "loss": 0.3057, |
| "step": 597 |
| }, |
| { |
| "epoch": 3.3583815028901736, |
| "grad_norm": 0.10680543765646766, |
| "learning_rate": 2.3487253321499025e-05, |
| "loss": 0.3064, |
| "step": 598 |
| }, |
| { |
| "epoch": 3.3639866876861095, |
| "grad_norm": 0.10212325660144811, |
| "learning_rate": 2.3344489099793298e-05, |
| "loss": 0.3054, |
| "step": 599 |
| }, |
| { |
| "epoch": 3.369591872482046, |
| "grad_norm": 0.09345936279910022, |
| "learning_rate": 2.3201981085750848e-05, |
| "loss": 0.3091, |
| "step": 600 |
| }, |
| { |
| "epoch": 3.3751970572779824, |
| "grad_norm": 0.10692872809212166, |
| "learning_rate": 2.3059731471537692e-05, |
| "loss": 0.3057, |
| "step": 601 |
| }, |
| { |
| "epoch": 3.3808022420739183, |
| "grad_norm": 0.10292193319438646, |
| "learning_rate": 2.2917742445344957e-05, |
| "loss": 0.3039, |
| "step": 602 |
| }, |
| { |
| "epoch": 3.3864074268698547, |
| "grad_norm": 0.1044728564683231, |
| "learning_rate": 2.2776016191355247e-05, |
| "loss": 0.307, |
| "step": 603 |
| }, |
| { |
| "epoch": 3.3920126116657907, |
| "grad_norm": 0.11121222286242763, |
| "learning_rate": 2.2634554889708946e-05, |
| "loss": 0.3146, |
| "step": 604 |
| }, |
| { |
| "epoch": 3.397617796461727, |
| "grad_norm": 0.0953250187025364, |
| "learning_rate": 2.2493360716470778e-05, |
| "loss": 0.3088, |
| "step": 605 |
| }, |
| { |
| "epoch": 3.403222981257663, |
| "grad_norm": 0.10907994857009719, |
| "learning_rate": 2.2352435843596276e-05, |
| "loss": 0.3122, |
| "step": 606 |
| }, |
| { |
| "epoch": 3.4088281660535995, |
| "grad_norm": 0.09794843878191668, |
| "learning_rate": 2.2211782438898403e-05, |
| "loss": 0.3072, |
| "step": 607 |
| }, |
| { |
| "epoch": 3.414433350849536, |
| "grad_norm": 0.1106427273961921, |
| "learning_rate": 2.207140266601419e-05, |
| "loss": 0.3173, |
| "step": 608 |
| }, |
| { |
| "epoch": 3.420038535645472, |
| "grad_norm": 0.09676869359844141, |
| "learning_rate": 2.193129868437145e-05, |
| "loss": 0.3097, |
| "step": 609 |
| }, |
| { |
| "epoch": 3.4256437204414083, |
| "grad_norm": 0.1127315347743384, |
| "learning_rate": 2.179147264915558e-05, |
| "loss": 0.3087, |
| "step": 610 |
| }, |
| { |
| "epoch": 3.4312489052373447, |
| "grad_norm": 0.10860744939106713, |
| "learning_rate": 2.1651926711276374e-05, |
| "loss": 0.3064, |
| "step": 611 |
| }, |
| { |
| "epoch": 3.4368540900332807, |
| "grad_norm": 0.10854328769381996, |
| "learning_rate": 2.1512663017334994e-05, |
| "loss": 0.3098, |
| "step": 612 |
| }, |
| { |
| "epoch": 3.442459274829217, |
| "grad_norm": 0.11762452038097213, |
| "learning_rate": 2.1373683709590873e-05, |
| "loss": 0.3115, |
| "step": 613 |
| }, |
| { |
| "epoch": 3.4480644596251535, |
| "grad_norm": 0.10805927729310495, |
| "learning_rate": 2.1234990925928827e-05, |
| "loss": 0.3078, |
| "step": 614 |
| }, |
| { |
| "epoch": 3.4536696444210895, |
| "grad_norm": 0.1210574300692429, |
| "learning_rate": 2.1096586799826123e-05, |
| "loss": 0.3131, |
| "step": 615 |
| }, |
| { |
| "epoch": 3.459274829217026, |
| "grad_norm": 0.11322408230361472, |
| "learning_rate": 2.0958473460319685e-05, |
| "loss": 0.3045, |
| "step": 616 |
| }, |
| { |
| "epoch": 3.464880014012962, |
| "grad_norm": 0.12288017952430454, |
| "learning_rate": 2.0820653031973363e-05, |
| "loss": 0.3004, |
| "step": 617 |
| }, |
| { |
| "epoch": 3.4704851988088983, |
| "grad_norm": 0.11467508943459158, |
| "learning_rate": 2.0683127634845155e-05, |
| "loss": 0.3118, |
| "step": 618 |
| }, |
| { |
| "epoch": 3.4760903836048342, |
| "grad_norm": 0.10089539729462121, |
| "learning_rate": 2.0545899384454753e-05, |
| "loss": 0.3115, |
| "step": 619 |
| }, |
| { |
| "epoch": 3.4816955684007707, |
| "grad_norm": 0.10903963497738692, |
| "learning_rate": 2.040897039175087e-05, |
| "loss": 0.3183, |
| "step": 620 |
| }, |
| { |
| "epoch": 3.487300753196707, |
| "grad_norm": 0.0960280103883697, |
| "learning_rate": 2.0272342763078806e-05, |
| "loss": 0.3168, |
| "step": 621 |
| }, |
| { |
| "epoch": 3.492905937992643, |
| "grad_norm": 0.1019907001338773, |
| "learning_rate": 2.0136018600148065e-05, |
| "loss": 0.314, |
| "step": 622 |
| }, |
| { |
| "epoch": 3.4985111227885795, |
| "grad_norm": 0.09703216086704151, |
| "learning_rate": 2.0000000000000012e-05, |
| "loss": 0.3114, |
| "step": 623 |
| }, |
| { |
| "epoch": 3.504116307584516, |
| "grad_norm": 0.09927974827651981, |
| "learning_rate": 1.9864289054975595e-05, |
| "loss": 0.3066, |
| "step": 624 |
| }, |
| { |
| "epoch": 3.509721492380452, |
| "grad_norm": 0.10115048078252437, |
| "learning_rate": 1.9728887852683204e-05, |
| "loss": 0.3063, |
| "step": 625 |
| }, |
| { |
| "epoch": 3.5153266771763882, |
| "grad_norm": 0.09617916095602932, |
| "learning_rate": 1.959379847596652e-05, |
| "loss": 0.3078, |
| "step": 626 |
| }, |
| { |
| "epoch": 3.5209318619723247, |
| "grad_norm": 0.10018831202713846, |
| "learning_rate": 1.9459023002872466e-05, |
| "loss": 0.306, |
| "step": 627 |
| }, |
| { |
| "epoch": 3.5265370467682606, |
| "grad_norm": 0.09046907662038928, |
| "learning_rate": 1.9324563506619323e-05, |
| "loss": 0.3093, |
| "step": 628 |
| }, |
| { |
| "epoch": 3.532142231564197, |
| "grad_norm": 0.09707258005221424, |
| "learning_rate": 1.9190422055564716e-05, |
| "loss": 0.3068, |
| "step": 629 |
| }, |
| { |
| "epoch": 3.537747416360133, |
| "grad_norm": 0.08790868545557047, |
| "learning_rate": 1.9056600713173884e-05, |
| "loss": 0.3063, |
| "step": 630 |
| }, |
| { |
| "epoch": 3.5433526011560694, |
| "grad_norm": 0.09522724255822446, |
| "learning_rate": 1.8923101537987906e-05, |
| "loss": 0.3071, |
| "step": 631 |
| }, |
| { |
| "epoch": 3.5489577859520054, |
| "grad_norm": 0.09275069555664794, |
| "learning_rate": 1.878992658359205e-05, |
| "loss": 0.3082, |
| "step": 632 |
| }, |
| { |
| "epoch": 3.554562970747942, |
| "grad_norm": 0.09662779046438905, |
| "learning_rate": 1.865707789858416e-05, |
| "loss": 0.317, |
| "step": 633 |
| }, |
| { |
| "epoch": 3.560168155543878, |
| "grad_norm": 0.09038439246839879, |
| "learning_rate": 1.852455752654318e-05, |
| "loss": 0.3095, |
| "step": 634 |
| }, |
| { |
| "epoch": 3.565773340339814, |
| "grad_norm": 0.0892351312327684, |
| "learning_rate": 1.839236750599767e-05, |
| "loss": 0.3099, |
| "step": 635 |
| }, |
| { |
| "epoch": 3.5713785251357506, |
| "grad_norm": 0.0901544987475721, |
| "learning_rate": 1.8260509870394475e-05, |
| "loss": 0.3145, |
| "step": 636 |
| }, |
| { |
| "epoch": 3.576983709931687, |
| "grad_norm": 0.08859989195760687, |
| "learning_rate": 1.8128986648067487e-05, |
| "loss": 0.3054, |
| "step": 637 |
| }, |
| { |
| "epoch": 3.582588894727623, |
| "grad_norm": 0.09128402171545218, |
| "learning_rate": 1.7997799862206346e-05, |
| "loss": 0.3121, |
| "step": 638 |
| }, |
| { |
| "epoch": 3.5881940795235594, |
| "grad_norm": 0.09185126057088099, |
| "learning_rate": 1.78669515308254e-05, |
| "loss": 0.3103, |
| "step": 639 |
| }, |
| { |
| "epoch": 3.5937992643194954, |
| "grad_norm": 0.0893997681385203, |
| "learning_rate": 1.7736443666732626e-05, |
| "loss": 0.3099, |
| "step": 640 |
| }, |
| { |
| "epoch": 3.599404449115432, |
| "grad_norm": 0.0927092479814224, |
| "learning_rate": 1.7606278277498674e-05, |
| "loss": 0.3096, |
| "step": 641 |
| }, |
| { |
| "epoch": 3.6050096339113678, |
| "grad_norm": 0.09322798178482147, |
| "learning_rate": 1.747645736542599e-05, |
| "loss": 0.312, |
| "step": 642 |
| }, |
| { |
| "epoch": 3.610614818707304, |
| "grad_norm": 0.09305885545146318, |
| "learning_rate": 1.7346982927518014e-05, |
| "loss": 0.3121, |
| "step": 643 |
| }, |
| { |
| "epoch": 3.6162200035032406, |
| "grad_norm": 0.09354862097700213, |
| "learning_rate": 1.721785695544846e-05, |
| "loss": 0.3084, |
| "step": 644 |
| }, |
| { |
| "epoch": 3.6218251882991765, |
| "grad_norm": 0.08879212707112467, |
| "learning_rate": 1.7089081435530667e-05, |
| "loss": 0.3103, |
| "step": 645 |
| }, |
| { |
| "epoch": 3.627430373095113, |
| "grad_norm": 0.08777213540858403, |
| "learning_rate": 1.6960658348687046e-05, |
| "loss": 0.3094, |
| "step": 646 |
| }, |
| { |
| "epoch": 3.6330355578910494, |
| "grad_norm": 0.09446494446048785, |
| "learning_rate": 1.683258967041866e-05, |
| "loss": 0.3099, |
| "step": 647 |
| }, |
| { |
| "epoch": 3.6386407426869853, |
| "grad_norm": 0.0856866484739207, |
| "learning_rate": 1.6704877370774748e-05, |
| "loss": 0.3046, |
| "step": 648 |
| }, |
| { |
| "epoch": 3.6442459274829218, |
| "grad_norm": 0.08653060673663804, |
| "learning_rate": 1.6577523414322478e-05, |
| "loss": 0.3039, |
| "step": 649 |
| }, |
| { |
| "epoch": 3.649851112278858, |
| "grad_norm": 0.0914995782178045, |
| "learning_rate": 1.6450529760116705e-05, |
| "loss": 0.3115, |
| "step": 650 |
| }, |
| { |
| "epoch": 3.655456297074794, |
| "grad_norm": 0.08842063602461302, |
| "learning_rate": 1.6323898361669857e-05, |
| "loss": 0.3099, |
| "step": 651 |
| }, |
| { |
| "epoch": 3.6610614818707305, |
| "grad_norm": 0.09196505054199886, |
| "learning_rate": 1.6197631166921856e-05, |
| "loss": 0.3059, |
| "step": 652 |
| }, |
| { |
| "epoch": 3.6666666666666665, |
| "grad_norm": 0.08826582262682107, |
| "learning_rate": 1.6071730118210173e-05, |
| "loss": 0.3065, |
| "step": 653 |
| }, |
| { |
| "epoch": 3.672271851462603, |
| "grad_norm": 0.08601459200435725, |
| "learning_rate": 1.594619715223994e-05, |
| "loss": 0.3083, |
| "step": 654 |
| }, |
| { |
| "epoch": 3.677877036258539, |
| "grad_norm": 0.09329501275849174, |
| "learning_rate": 1.5821034200054176e-05, |
| "loss": 0.3116, |
| "step": 655 |
| }, |
| { |
| "epoch": 3.6834822210544753, |
| "grad_norm": 0.08578241057447392, |
| "learning_rate": 1.569624318700405e-05, |
| "loss": 0.3111, |
| "step": 656 |
| }, |
| { |
| "epoch": 3.6890874058504117, |
| "grad_norm": 0.08212635956097417, |
| "learning_rate": 1.5571826032719287e-05, |
| "loss": 0.3103, |
| "step": 657 |
| }, |
| { |
| "epoch": 3.6946925906463477, |
| "grad_norm": 0.0920585030532975, |
| "learning_rate": 1.5447784651078642e-05, |
| "loss": 0.3119, |
| "step": 658 |
| }, |
| { |
| "epoch": 3.700297775442284, |
| "grad_norm": 0.08381481949938734, |
| "learning_rate": 1.532412095018044e-05, |
| "loss": 0.3053, |
| "step": 659 |
| }, |
| { |
| "epoch": 3.7059029602382205, |
| "grad_norm": 0.08430489721177843, |
| "learning_rate": 1.5200836832313246e-05, |
| "loss": 0.308, |
| "step": 660 |
| }, |
| { |
| "epoch": 3.7115081450341565, |
| "grad_norm": 0.09318737393858478, |
| "learning_rate": 1.5077934193926584e-05, |
| "loss": 0.3125, |
| "step": 661 |
| }, |
| { |
| "epoch": 3.717113329830093, |
| "grad_norm": 0.09089782801509312, |
| "learning_rate": 1.4955414925601757e-05, |
| "loss": 0.316, |
| "step": 662 |
| }, |
| { |
| "epoch": 3.7227185146260293, |
| "grad_norm": 0.08583258468324842, |
| "learning_rate": 1.4833280912022834e-05, |
| "loss": 0.3064, |
| "step": 663 |
| }, |
| { |
| "epoch": 3.7283236994219653, |
| "grad_norm": 0.08591690878371737, |
| "learning_rate": 1.4711534031947543e-05, |
| "loss": 0.3194, |
| "step": 664 |
| }, |
| { |
| "epoch": 3.7339288842179017, |
| "grad_norm": 0.08916277837335898, |
| "learning_rate": 1.459017615817846e-05, |
| "loss": 0.3096, |
| "step": 665 |
| }, |
| { |
| "epoch": 3.7395340690138377, |
| "grad_norm": 0.08697008778870499, |
| "learning_rate": 1.4469209157534172e-05, |
| "loss": 0.3042, |
| "step": 666 |
| }, |
| { |
| "epoch": 3.745139253809774, |
| "grad_norm": 0.08513292108575704, |
| "learning_rate": 1.4348634890820554e-05, |
| "loss": 0.3042, |
| "step": 667 |
| }, |
| { |
| "epoch": 3.75074443860571, |
| "grad_norm": 0.09258652498297838, |
| "learning_rate": 1.4228455212802149e-05, |
| "loss": 0.3081, |
| "step": 668 |
| }, |
| { |
| "epoch": 3.7563496234016465, |
| "grad_norm": 0.08525936977928056, |
| "learning_rate": 1.4108671972173644e-05, |
| "loss": 0.3109, |
| "step": 669 |
| }, |
| { |
| "epoch": 3.761954808197583, |
| "grad_norm": 0.08470431515893076, |
| "learning_rate": 1.3989287011531425e-05, |
| "loss": 0.312, |
| "step": 670 |
| }, |
| { |
| "epoch": 3.767559992993519, |
| "grad_norm": 0.09004964227688511, |
| "learning_rate": 1.3870302167345222e-05, |
| "loss": 0.3079, |
| "step": 671 |
| }, |
| { |
| "epoch": 3.7731651777894553, |
| "grad_norm": 0.08825735283568481, |
| "learning_rate": 1.3751719269929908e-05, |
| "loss": 0.3049, |
| "step": 672 |
| }, |
| { |
| "epoch": 3.7787703625853917, |
| "grad_norm": 0.08533456832494438, |
| "learning_rate": 1.3633540143417268e-05, |
| "loss": 0.3046, |
| "step": 673 |
| }, |
| { |
| "epoch": 3.7843755473813276, |
| "grad_norm": 0.09333040420463685, |
| "learning_rate": 1.3515766605727984e-05, |
| "loss": 0.3056, |
| "step": 674 |
| }, |
| { |
| "epoch": 3.789980732177264, |
| "grad_norm": 0.09109816447279726, |
| "learning_rate": 1.3398400468543682e-05, |
| "loss": 0.3138, |
| "step": 675 |
| }, |
| { |
| "epoch": 3.7955859169732005, |
| "grad_norm": 0.08953063350846481, |
| "learning_rate": 1.328144353727903e-05, |
| "loss": 0.3095, |
| "step": 676 |
| }, |
| { |
| "epoch": 3.8011911017691364, |
| "grad_norm": 0.08970232410085, |
| "learning_rate": 1.3164897611053981e-05, |
| "loss": 0.3092, |
| "step": 677 |
| }, |
| { |
| "epoch": 3.806796286565073, |
| "grad_norm": 0.08560717524353414, |
| "learning_rate": 1.3048764482666112e-05, |
| "loss": 0.3068, |
| "step": 678 |
| }, |
| { |
| "epoch": 3.812401471361009, |
| "grad_norm": 0.0870128302022409, |
| "learning_rate": 1.2933045938563012e-05, |
| "loss": 0.3103, |
| "step": 679 |
| }, |
| { |
| "epoch": 3.8180066561569452, |
| "grad_norm": 0.08488653192608872, |
| "learning_rate": 1.281774375881482e-05, |
| "loss": 0.3092, |
| "step": 680 |
| }, |
| { |
| "epoch": 3.823611840952881, |
| "grad_norm": 0.08327605535332809, |
| "learning_rate": 1.2702859717086886e-05, |
| "loss": 0.3077, |
| "step": 681 |
| }, |
| { |
| "epoch": 3.8292170257488176, |
| "grad_norm": 0.082880372307135, |
| "learning_rate": 1.2588395580612392e-05, |
| "loss": 0.3115, |
| "step": 682 |
| }, |
| { |
| "epoch": 3.834822210544754, |
| "grad_norm": 0.08662923231375559, |
| "learning_rate": 1.247435311016525e-05, |
| "loss": 0.3066, |
| "step": 683 |
| }, |
| { |
| "epoch": 3.84042739534069, |
| "grad_norm": 0.08229450378457145, |
| "learning_rate": 1.2360734060032967e-05, |
| "loss": 0.3053, |
| "step": 684 |
| }, |
| { |
| "epoch": 3.8460325801366264, |
| "grad_norm": 0.08665772393719716, |
| "learning_rate": 1.2247540177989695e-05, |
| "loss": 0.3047, |
| "step": 685 |
| }, |
| { |
| "epoch": 3.851637764932563, |
| "grad_norm": 0.08865321374818363, |
| "learning_rate": 1.2134773205269323e-05, |
| "loss": 0.3132, |
| "step": 686 |
| }, |
| { |
| "epoch": 3.857242949728499, |
| "grad_norm": 0.09256990438194068, |
| "learning_rate": 1.2022434876538696e-05, |
| "loss": 0.3098, |
| "step": 687 |
| }, |
| { |
| "epoch": 3.862848134524435, |
| "grad_norm": 0.08775878742002668, |
| "learning_rate": 1.191052691987094e-05, |
| "loss": 0.308, |
| "step": 688 |
| }, |
| { |
| "epoch": 3.8684533193203716, |
| "grad_norm": 0.09049516342589015, |
| "learning_rate": 1.1799051056718844e-05, |
| "loss": 0.308, |
| "step": 689 |
| }, |
| { |
| "epoch": 3.8740585041163076, |
| "grad_norm": 0.0850218500678186, |
| "learning_rate": 1.1688009001888475e-05, |
| "loss": 0.3082, |
| "step": 690 |
| }, |
| { |
| "epoch": 3.8796636889122436, |
| "grad_norm": 0.09015284370086712, |
| "learning_rate": 1.1577402463512652e-05, |
| "loss": 0.3125, |
| "step": 691 |
| }, |
| { |
| "epoch": 3.88526887370818, |
| "grad_norm": 0.08936579824727202, |
| "learning_rate": 1.1467233143024803e-05, |
| "loss": 0.2996, |
| "step": 692 |
| }, |
| { |
| "epoch": 3.8908740585041164, |
| "grad_norm": 0.08641714653740455, |
| "learning_rate": 1.1357502735132715e-05, |
| "loss": 0.3085, |
| "step": 693 |
| }, |
| { |
| "epoch": 3.8964792433000524, |
| "grad_norm": 0.08002468185657431, |
| "learning_rate": 1.1248212927792502e-05, |
| "loss": 0.3074, |
| "step": 694 |
| }, |
| { |
| "epoch": 3.9020844280959888, |
| "grad_norm": 0.08225416168538756, |
| "learning_rate": 1.1139365402182625e-05, |
| "loss": 0.3056, |
| "step": 695 |
| }, |
| { |
| "epoch": 3.907689612891925, |
| "grad_norm": 0.08346850523992327, |
| "learning_rate": 1.1030961832678014e-05, |
| "loss": 0.309, |
| "step": 696 |
| }, |
| { |
| "epoch": 3.913294797687861, |
| "grad_norm": 0.08024780002044651, |
| "learning_rate": 1.0923003886824382e-05, |
| "loss": 0.3063, |
| "step": 697 |
| }, |
| { |
| "epoch": 3.9188999824837976, |
| "grad_norm": 0.09339560984140326, |
| "learning_rate": 1.081549322531247e-05, |
| "loss": 0.3029, |
| "step": 698 |
| }, |
| { |
| "epoch": 3.924505167279734, |
| "grad_norm": 0.08375958744991581, |
| "learning_rate": 1.0708431501952567e-05, |
| "loss": 0.3181, |
| "step": 699 |
| }, |
| { |
| "epoch": 3.93011035207567, |
| "grad_norm": 0.07819955008853643, |
| "learning_rate": 1.060182036364907e-05, |
| "loss": 0.3072, |
| "step": 700 |
| }, |
| { |
| "epoch": 3.9357155368716064, |
| "grad_norm": 0.07835982764160071, |
| "learning_rate": 1.0495661450375114e-05, |
| "loss": 0.302, |
| "step": 701 |
| }, |
| { |
| "epoch": 3.9413207216675423, |
| "grad_norm": 0.08325758169200392, |
| "learning_rate": 1.0389956395147389e-05, |
| "loss": 0.3062, |
| "step": 702 |
| }, |
| { |
| "epoch": 3.9469259064634787, |
| "grad_norm": 0.08333976071773525, |
| "learning_rate": 1.0284706824000983e-05, |
| "loss": 0.3146, |
| "step": 703 |
| }, |
| { |
| "epoch": 3.9525310912594147, |
| "grad_norm": 0.0786546369775888, |
| "learning_rate": 1.0179914355964384e-05, |
| "loss": 0.3099, |
| "step": 704 |
| }, |
| { |
| "epoch": 3.958136276055351, |
| "grad_norm": 0.0826606587187586, |
| "learning_rate": 1.0075580603034569e-05, |
| "loss": 0.3078, |
| "step": 705 |
| }, |
| { |
| "epoch": 3.9637414608512875, |
| "grad_norm": 0.08157885632921875, |
| "learning_rate": 9.971707170152243e-06, |
| "loss": 0.3025, |
| "step": 706 |
| }, |
| { |
| "epoch": 3.9693466456472235, |
| "grad_norm": 0.07569811929437366, |
| "learning_rate": 9.86829565517709e-06, |
| "loss": 0.3052, |
| "step": 707 |
| }, |
| { |
| "epoch": 3.97495183044316, |
| "grad_norm": 0.08433452418842283, |
| "learning_rate": 9.765347648863228e-06, |
| "loss": 0.3103, |
| "step": 708 |
| }, |
| { |
| "epoch": 3.9805570152390963, |
| "grad_norm": 0.08116174303332119, |
| "learning_rate": 9.662864734834736e-06, |
| "loss": 0.3162, |
| "step": 709 |
| }, |
| { |
| "epoch": 3.9861622000350323, |
| "grad_norm": 0.07742272052051664, |
| "learning_rate": 9.560848489561292e-06, |
| "loss": 0.3088, |
| "step": 710 |
| }, |
| { |
| "epoch": 3.9917673848309687, |
| "grad_norm": 0.08299995390154452, |
| "learning_rate": 9.459300482333931e-06, |
| "loss": 0.3133, |
| "step": 711 |
| }, |
| { |
| "epoch": 3.997372569626905, |
| "grad_norm": 0.0834553917977672, |
| "learning_rate": 9.358222275240884e-06, |
| "loss": 0.3094, |
| "step": 712 |
| }, |
| { |
| "epoch": 4.005605184795936, |
| "grad_norm": 0.13513807332008165, |
| "learning_rate": 9.257615423143566e-06, |
| "loss": 0.2945, |
| "step": 713 |
| }, |
| { |
| "epoch": 4.011210369591873, |
| "grad_norm": 0.10621089229052157, |
| "learning_rate": 9.157481473652643e-06, |
| "loss": 0.2885, |
| "step": 714 |
| }, |
| { |
| "epoch": 4.016815554387809, |
| "grad_norm": 0.08714199459949697, |
| "learning_rate": 9.05782196710427e-06, |
| "loss": 0.2958, |
| "step": 715 |
| }, |
| { |
| "epoch": 4.022420739183745, |
| "grad_norm": 0.10547298624299982, |
| "learning_rate": 8.958638436536322e-06, |
| "loss": 0.2904, |
| "step": 716 |
| }, |
| { |
| "epoch": 4.028025923979682, |
| "grad_norm": 0.11824374552084228, |
| "learning_rate": 8.85993240766487e-06, |
| "loss": 0.2939, |
| "step": 717 |
| }, |
| { |
| "epoch": 4.033631108775618, |
| "grad_norm": 0.11166981880398426, |
| "learning_rate": 8.761705398860684e-06, |
| "loss": 0.2879, |
| "step": 718 |
| }, |
| { |
| "epoch": 4.039236293571554, |
| "grad_norm": 0.10093675260999213, |
| "learning_rate": 8.6639589211259e-06, |
| "loss": 0.292, |
| "step": 719 |
| }, |
| { |
| "epoch": 4.0448414783674895, |
| "grad_norm": 0.10073956157160133, |
| "learning_rate": 8.566694478070748e-06, |
| "loss": 0.2849, |
| "step": 720 |
| }, |
| { |
| "epoch": 4.050446663163426, |
| "grad_norm": 0.10476897069362466, |
| "learning_rate": 8.469913565890443e-06, |
| "loss": 0.2929, |
| "step": 721 |
| }, |
| { |
| "epoch": 4.056051847959362, |
| "grad_norm": 0.10098301015704451, |
| "learning_rate": 8.373617673342154e-06, |
| "loss": 0.2915, |
| "step": 722 |
| }, |
| { |
| "epoch": 4.061657032755298, |
| "grad_norm": 0.09703917635173606, |
| "learning_rate": 8.277808281722116e-06, |
| "loss": 0.2899, |
| "step": 723 |
| }, |
| { |
| "epoch": 4.067262217551235, |
| "grad_norm": 0.10034460587837235, |
| "learning_rate": 8.182486864842852e-06, |
| "loss": 0.2917, |
| "step": 724 |
| }, |
| { |
| "epoch": 4.072867402347171, |
| "grad_norm": 0.09179532694160371, |
| "learning_rate": 8.087654889010475e-06, |
| "loss": 0.2892, |
| "step": 725 |
| }, |
| { |
| "epoch": 4.078472587143107, |
| "grad_norm": 0.10753395912690455, |
| "learning_rate": 7.993313813002137e-06, |
| "loss": 0.2905, |
| "step": 726 |
| }, |
| { |
| "epoch": 4.084077771939044, |
| "grad_norm": 0.08822865040425992, |
| "learning_rate": 7.899465088043632e-06, |
| "loss": 0.2861, |
| "step": 727 |
| }, |
| { |
| "epoch": 4.08968295673498, |
| "grad_norm": 0.08713560322111398, |
| "learning_rate": 7.806110157786978e-06, |
| "loss": 0.2903, |
| "step": 728 |
| }, |
| { |
| "epoch": 4.095288141530916, |
| "grad_norm": 0.09170567396266732, |
| "learning_rate": 7.713250458288333e-06, |
| "loss": 0.2901, |
| "step": 729 |
| }, |
| { |
| "epoch": 4.100893326326853, |
| "grad_norm": 0.09585484257486802, |
| "learning_rate": 7.620887417985789e-06, |
| "loss": 0.2887, |
| "step": 730 |
| }, |
| { |
| "epoch": 4.106498511122789, |
| "grad_norm": 0.08818209427141882, |
| "learning_rate": 7.529022457677504e-06, |
| "loss": 0.2924, |
| "step": 731 |
| }, |
| { |
| "epoch": 4.112103695918725, |
| "grad_norm": 0.08443201723287494, |
| "learning_rate": 7.437656990499746e-06, |
| "loss": 0.2863, |
| "step": 732 |
| }, |
| { |
| "epoch": 4.117708880714661, |
| "grad_norm": 0.08873488187304306, |
| "learning_rate": 7.346792421905231e-06, |
| "loss": 0.2938, |
| "step": 733 |
| }, |
| { |
| "epoch": 4.1233140655105975, |
| "grad_norm": 0.08694222494262985, |
| "learning_rate": 7.2564301496414535e-06, |
| "loss": 0.2924, |
| "step": 734 |
| }, |
| { |
| "epoch": 4.1289192503065335, |
| "grad_norm": 0.08256227612309179, |
| "learning_rate": 7.166571563729223e-06, |
| "loss": 0.2917, |
| "step": 735 |
| }, |
| { |
| "epoch": 4.1345244351024695, |
| "grad_norm": 0.08001663820565025, |
| "learning_rate": 7.07721804644125e-06, |
| "loss": 0.285, |
| "step": 736 |
| }, |
| { |
| "epoch": 4.140129619898406, |
| "grad_norm": 0.08149206652526432, |
| "learning_rate": 6.988370972280911e-06, |
| "loss": 0.287, |
| "step": 737 |
| }, |
| { |
| "epoch": 4.145734804694342, |
| "grad_norm": 0.0830237803672675, |
| "learning_rate": 6.900031707961083e-06, |
| "loss": 0.2929, |
| "step": 738 |
| }, |
| { |
| "epoch": 4.151339989490278, |
| "grad_norm": 0.07889881500439108, |
| "learning_rate": 6.812201612383132e-06, |
| "loss": 0.2908, |
| "step": 739 |
| }, |
| { |
| "epoch": 4.156945174286215, |
| "grad_norm": 0.07965629951868874, |
| "learning_rate": 6.724882036615991e-06, |
| "loss": 0.2911, |
| "step": 740 |
| }, |
| { |
| "epoch": 4.162550359082151, |
| "grad_norm": 0.08142470834051524, |
| "learning_rate": 6.638074323875426e-06, |
| "loss": 0.2845, |
| "step": 741 |
| }, |
| { |
| "epoch": 4.168155543878087, |
| "grad_norm": 0.0790024403007617, |
| "learning_rate": 6.551779809503305e-06, |
| "loss": 0.2912, |
| "step": 742 |
| }, |
| { |
| "epoch": 4.173760728674024, |
| "grad_norm": 0.08162794031253676, |
| "learning_rate": 6.465999820947107e-06, |
| "loss": 0.287, |
| "step": 743 |
| }, |
| { |
| "epoch": 4.17936591346996, |
| "grad_norm": 0.08060613968699802, |
| "learning_rate": 6.380735677739474e-06, |
| "loss": 0.2917, |
| "step": 744 |
| }, |
| { |
| "epoch": 4.184971098265896, |
| "grad_norm": 0.0763576924622887, |
| "learning_rate": 6.295988691477939e-06, |
| "loss": 0.2895, |
| "step": 745 |
| }, |
| { |
| "epoch": 4.190576283061832, |
| "grad_norm": 0.07704795721026231, |
| "learning_rate": 6.2117601658047234e-06, |
| "loss": 0.2914, |
| "step": 746 |
| }, |
| { |
| "epoch": 4.196181467857769, |
| "grad_norm": 0.08636531460875918, |
| "learning_rate": 6.128051396386707e-06, |
| "loss": 0.2908, |
| "step": 747 |
| }, |
| { |
| "epoch": 4.201786652653705, |
| "grad_norm": 0.08428531970028809, |
| "learning_rate": 6.044863670895473e-06, |
| "loss": 0.292, |
| "step": 748 |
| }, |
| { |
| "epoch": 4.207391837449641, |
| "grad_norm": 0.07541521932519012, |
| "learning_rate": 5.962198268987514e-06, |
| "loss": 0.2956, |
| "step": 749 |
| }, |
| { |
| "epoch": 4.2129970222455775, |
| "grad_norm": 0.08036165670495596, |
| "learning_rate": 5.880056462284573e-06, |
| "loss": 0.2955, |
| "step": 750 |
| }, |
| { |
| "epoch": 4.2186022070415135, |
| "grad_norm": 0.07962458343055896, |
| "learning_rate": 5.798439514354024e-06, |
| "loss": 0.2904, |
| "step": 751 |
| }, |
| { |
| "epoch": 4.224207391837449, |
| "grad_norm": 0.07557676462990631, |
| "learning_rate": 5.7173486806894804e-06, |
| "loss": 0.2943, |
| "step": 752 |
| }, |
| { |
| "epoch": 4.229812576633386, |
| "grad_norm": 0.07611814912098987, |
| "learning_rate": 5.6367852086914555e-06, |
| "loss": 0.2921, |
| "step": 753 |
| }, |
| { |
| "epoch": 4.235417761429322, |
| "grad_norm": 0.07591915780682665, |
| "learning_rate": 5.556750337648207e-06, |
| "loss": 0.2966, |
| "step": 754 |
| }, |
| { |
| "epoch": 4.241022946225258, |
| "grad_norm": 0.07451638627135487, |
| "learning_rate": 5.477245298716636e-06, |
| "loss": 0.2916, |
| "step": 755 |
| }, |
| { |
| "epoch": 4.246628131021194, |
| "grad_norm": 0.07408998865978028, |
| "learning_rate": 5.398271314903376e-06, |
| "loss": 0.2922, |
| "step": 756 |
| }, |
| { |
| "epoch": 4.252233315817131, |
| "grad_norm": 0.07545427591965101, |
| "learning_rate": 5.3198296010459604e-06, |
| "loss": 0.2894, |
| "step": 757 |
| }, |
| { |
| "epoch": 4.257838500613067, |
| "grad_norm": 0.07784079160936691, |
| "learning_rate": 5.241921363794143e-06, |
| "loss": 0.2899, |
| "step": 758 |
| }, |
| { |
| "epoch": 4.263443685409003, |
| "grad_norm": 0.07434860497601323, |
| "learning_rate": 5.1645478015913556e-06, |
| "loss": 0.2938, |
| "step": 759 |
| }, |
| { |
| "epoch": 4.26904887020494, |
| "grad_norm": 0.07524905441026238, |
| "learning_rate": 5.0877101046562335e-06, |
| "loss": 0.2925, |
| "step": 760 |
| }, |
| { |
| "epoch": 4.274654055000876, |
| "grad_norm": 0.07534375562369973, |
| "learning_rate": 5.011409454964336e-06, |
| "loss": 0.2956, |
| "step": 761 |
| }, |
| { |
| "epoch": 4.280259239796812, |
| "grad_norm": 0.07564823418534802, |
| "learning_rate": 4.935647026229951e-06, |
| "loss": 0.2897, |
| "step": 762 |
| }, |
| { |
| "epoch": 4.285864424592749, |
| "grad_norm": 0.07430360134148153, |
| "learning_rate": 4.860423983888054e-06, |
| "loss": 0.2909, |
| "step": 763 |
| }, |
| { |
| "epoch": 4.291469609388685, |
| "grad_norm": 0.07690448982555907, |
| "learning_rate": 4.785741485076356e-06, |
| "loss": 0.2921, |
| "step": 764 |
| }, |
| { |
| "epoch": 4.297074794184621, |
| "grad_norm": 0.07333540337330373, |
| "learning_rate": 4.711600678617521e-06, |
| "loss": 0.2924, |
| "step": 765 |
| }, |
| { |
| "epoch": 4.302679978980557, |
| "grad_norm": 0.0707515459460907, |
| "learning_rate": 4.6380027050015165e-06, |
| "loss": 0.2897, |
| "step": 766 |
| }, |
| { |
| "epoch": 4.308285163776493, |
| "grad_norm": 0.07535393854931703, |
| "learning_rate": 4.564948696368014e-06, |
| "loss": 0.2941, |
| "step": 767 |
| }, |
| { |
| "epoch": 4.313890348572429, |
| "grad_norm": 0.07963087040482336, |
| "learning_rate": 4.492439776489024e-06, |
| "loss": 0.2928, |
| "step": 768 |
| }, |
| { |
| "epoch": 4.319495533368365, |
| "grad_norm": 0.07387271710468704, |
| "learning_rate": 4.420477060751575e-06, |
| "loss": 0.292, |
| "step": 769 |
| }, |
| { |
| "epoch": 4.325100718164302, |
| "grad_norm": 0.07736925416730672, |
| "learning_rate": 4.349061656140583e-06, |
| "loss": 0.2944, |
| "step": 770 |
| }, |
| { |
| "epoch": 4.330705902960238, |
| "grad_norm": 0.07529231788326105, |
| "learning_rate": 4.278194661221804e-06, |
| "loss": 0.2879, |
| "step": 771 |
| }, |
| { |
| "epoch": 4.336311087756174, |
| "grad_norm": 0.08026949246594235, |
| "learning_rate": 4.207877166124936e-06, |
| "loss": 0.2917, |
| "step": 772 |
| }, |
| { |
| "epoch": 4.341916272552111, |
| "grad_norm": 0.07620416762621339, |
| "learning_rate": 4.138110252526866e-06, |
| "loss": 0.291, |
| "step": 773 |
| }, |
| { |
| "epoch": 4.347521457348047, |
| "grad_norm": 0.07422185414389502, |
| "learning_rate": 4.068894993635009e-06, |
| "loss": 0.2907, |
| "step": 774 |
| }, |
| { |
| "epoch": 4.353126642143983, |
| "grad_norm": 0.07226310970169152, |
| "learning_rate": 4.000232454170827e-06, |
| "loss": 0.2875, |
| "step": 775 |
| }, |
| { |
| "epoch": 4.35873182693992, |
| "grad_norm": 0.07432275041631858, |
| "learning_rate": 3.932123690353425e-06, |
| "loss": 0.2885, |
| "step": 776 |
| }, |
| { |
| "epoch": 4.364337011735856, |
| "grad_norm": 0.0732145587763894, |
| "learning_rate": 3.8645697498833e-06, |
| "loss": 0.298, |
| "step": 777 |
| }, |
| { |
| "epoch": 4.369942196531792, |
| "grad_norm": 0.07336339349758086, |
| "learning_rate": 3.7975716719262522e-06, |
| "loss": 0.2892, |
| "step": 778 |
| }, |
| { |
| "epoch": 4.375547381327728, |
| "grad_norm": 0.07442021970479001, |
| "learning_rate": 3.7311304870973807e-06, |
| "loss": 0.2899, |
| "step": 779 |
| }, |
| { |
| "epoch": 4.3811525661236645, |
| "grad_norm": 0.07353220474247239, |
| "learning_rate": 3.6652472174452337e-06, |
| "loss": 0.2877, |
| "step": 780 |
| }, |
| { |
| "epoch": 4.3867577509196005, |
| "grad_norm": 0.0715587497335768, |
| "learning_rate": 3.599922876436077e-06, |
| "loss": 0.289, |
| "step": 781 |
| }, |
| { |
| "epoch": 4.3923629357155365, |
| "grad_norm": 0.06874108127193815, |
| "learning_rate": 3.535158468938331e-06, |
| "loss": 0.2923, |
| "step": 782 |
| }, |
| { |
| "epoch": 4.397968120511473, |
| "grad_norm": 0.0731981866264841, |
| "learning_rate": 3.4709549912070693e-06, |
| "loss": 0.2895, |
| "step": 783 |
| }, |
| { |
| "epoch": 4.403573305307409, |
| "grad_norm": 0.07134091663650302, |
| "learning_rate": 3.4073134308687574e-06, |
| "loss": 0.2946, |
| "step": 784 |
| }, |
| { |
| "epoch": 4.409178490103345, |
| "grad_norm": 0.07158473138919103, |
| "learning_rate": 3.3442347669059917e-06, |
| "loss": 0.2888, |
| "step": 785 |
| }, |
| { |
| "epoch": 4.414783674899282, |
| "grad_norm": 0.0705765283738734, |
| "learning_rate": 3.2817199696424785e-06, |
| "loss": 0.2921, |
| "step": 786 |
| }, |
| { |
| "epoch": 4.420388859695218, |
| "grad_norm": 0.0713673125862577, |
| "learning_rate": 3.219770000728102e-06, |
| "loss": 0.2897, |
| "step": 787 |
| }, |
| { |
| "epoch": 4.425994044491154, |
| "grad_norm": 0.07163002908771758, |
| "learning_rate": 3.1583858131241274e-06, |
| "loss": 0.2931, |
| "step": 788 |
| }, |
| { |
| "epoch": 4.431599229287091, |
| "grad_norm": 0.07068386839729027, |
| "learning_rate": 3.0975683510885512e-06, |
| "loss": 0.2915, |
| "step": 789 |
| }, |
| { |
| "epoch": 4.437204414083027, |
| "grad_norm": 0.06828223728970015, |
| "learning_rate": 3.0373185501615655e-06, |
| "loss": 0.2863, |
| "step": 790 |
| }, |
| { |
| "epoch": 4.442809598878963, |
| "grad_norm": 0.06925305783885854, |
| "learning_rate": 2.97763733715116e-06, |
| "loss": 0.286, |
| "step": 791 |
| }, |
| { |
| "epoch": 4.448414783674899, |
| "grad_norm": 0.07093825781423818, |
| "learning_rate": 2.9185256301188782e-06, |
| "loss": 0.2918, |
| "step": 792 |
| }, |
| { |
| "epoch": 4.454019968470836, |
| "grad_norm": 0.07292024935590449, |
| "learning_rate": 2.8599843383657178e-06, |
| "loss": 0.2849, |
| "step": 793 |
| }, |
| { |
| "epoch": 4.459625153266772, |
| "grad_norm": 0.07136882672592504, |
| "learning_rate": 2.8020143624180796e-06, |
| "loss": 0.2941, |
| "step": 794 |
| }, |
| { |
| "epoch": 4.465230338062708, |
| "grad_norm": 0.07262486143198338, |
| "learning_rate": 2.744616594013976e-06, |
| "loss": 0.29, |
| "step": 795 |
| }, |
| { |
| "epoch": 4.4708355228586445, |
| "grad_norm": 0.07044266557562419, |
| "learning_rate": 2.6877919160892817e-06, |
| "loss": 0.286, |
| "step": 796 |
| }, |
| { |
| "epoch": 4.4764407076545805, |
| "grad_norm": 0.07076171900766302, |
| "learning_rate": 2.631541202764161e-06, |
| "loss": 0.2913, |
| "step": 797 |
| }, |
| { |
| "epoch": 4.482045892450516, |
| "grad_norm": 0.07266466034845562, |
| "learning_rate": 2.5758653193296244e-06, |
| "loss": 0.2966, |
| "step": 798 |
| }, |
| { |
| "epoch": 4.487651077246453, |
| "grad_norm": 0.06891412203515149, |
| "learning_rate": 2.520765122234212e-06, |
| "loss": 0.2938, |
| "step": 799 |
| }, |
| { |
| "epoch": 4.493256262042389, |
| "grad_norm": 0.06927383792607966, |
| "learning_rate": 2.4662414590708216e-06, |
| "loss": 0.2886, |
| "step": 800 |
| }, |
| { |
| "epoch": 4.498861446838325, |
| "grad_norm": 0.06986747799982956, |
| "learning_rate": 2.4122951685636674e-06, |
| "loss": 0.2895, |
| "step": 801 |
| }, |
| { |
| "epoch": 4.504466631634262, |
| "grad_norm": 0.07085917182268424, |
| "learning_rate": 2.3589270805553842e-06, |
| "loss": 0.293, |
| "step": 802 |
| }, |
| { |
| "epoch": 4.510071816430198, |
| "grad_norm": 0.07134101177324086, |
| "learning_rate": 2.3061380159942593e-06, |
| "loss": 0.2919, |
| "step": 803 |
| }, |
| { |
| "epoch": 4.515677001226134, |
| "grad_norm": 0.07152411549915325, |
| "learning_rate": 2.2539287869215974e-06, |
| "loss": 0.2909, |
| "step": 804 |
| }, |
| { |
| "epoch": 4.52128218602207, |
| "grad_norm": 0.0709029110171059, |
| "learning_rate": 2.2023001964592485e-06, |
| "loss": 0.2909, |
| "step": 805 |
| }, |
| { |
| "epoch": 4.526887370818007, |
| "grad_norm": 0.07123396088624603, |
| "learning_rate": 2.1512530387972187e-06, |
| "loss": 0.2894, |
| "step": 806 |
| }, |
| { |
| "epoch": 4.532492555613943, |
| "grad_norm": 0.07087391238341717, |
| "learning_rate": 2.100788099181501e-06, |
| "loss": 0.292, |
| "step": 807 |
| }, |
| { |
| "epoch": 4.538097740409879, |
| "grad_norm": 0.07244705960873961, |
| "learning_rate": 2.050906153901946e-06, |
| "loss": 0.2886, |
| "step": 808 |
| }, |
| { |
| "epoch": 4.543702925205816, |
| "grad_norm": 0.0702625208879413, |
| "learning_rate": 2.0016079702803683e-06, |
| "loss": 0.2912, |
| "step": 809 |
| }, |
| { |
| "epoch": 4.549308110001752, |
| "grad_norm": 0.07025143510925803, |
| "learning_rate": 1.9528943066586993e-06, |
| "loss": 0.2859, |
| "step": 810 |
| }, |
| { |
| "epoch": 4.554913294797688, |
| "grad_norm": 0.06797065457217236, |
| "learning_rate": 1.9047659123873475e-06, |
| "loss": 0.2897, |
| "step": 811 |
| }, |
| { |
| "epoch": 4.560518479593624, |
| "grad_norm": 0.06967626315727225, |
| "learning_rate": 1.8572235278136741e-06, |
| "loss": 0.2896, |
| "step": 812 |
| }, |
| { |
| "epoch": 4.56612366438956, |
| "grad_norm": 0.06946061969739742, |
| "learning_rate": 1.81026788427058e-06, |
| "loss": 0.2919, |
| "step": 813 |
| }, |
| { |
| "epoch": 4.571728849185496, |
| "grad_norm": 0.06956148278872504, |
| "learning_rate": 1.7638997040652928e-06, |
| "loss": 0.2865, |
| "step": 814 |
| }, |
| { |
| "epoch": 4.577334033981433, |
| "grad_norm": 0.06903042275149272, |
| "learning_rate": 1.7181197004682148e-06, |
| "loss": 0.2987, |
| "step": 815 |
| }, |
| { |
| "epoch": 4.582939218777369, |
| "grad_norm": 0.07027061783424711, |
| "learning_rate": 1.6729285777019776e-06, |
| "loss": 0.2943, |
| "step": 816 |
| }, |
| { |
| "epoch": 4.588544403573305, |
| "grad_norm": 0.07497684616923145, |
| "learning_rate": 1.6283270309306098e-06, |
| "loss": 0.2954, |
| "step": 817 |
| }, |
| { |
| "epoch": 4.594149588369241, |
| "grad_norm": 0.06928091037284238, |
| "learning_rate": 1.58431574624883e-06, |
| "loss": 0.2978, |
| "step": 818 |
| }, |
| { |
| "epoch": 4.599754773165178, |
| "grad_norm": 0.06987516073546424, |
| "learning_rate": 1.5408954006715004e-06, |
| "loss": 0.2906, |
| "step": 819 |
| }, |
| { |
| "epoch": 4.605359957961114, |
| "grad_norm": 0.0679280004765429, |
| "learning_rate": 1.4980666621232076e-06, |
| "loss": 0.2913, |
| "step": 820 |
| }, |
| { |
| "epoch": 4.61096514275705, |
| "grad_norm": 0.06872535211929137, |
| "learning_rate": 1.4558301894279958e-06, |
| "loss": 0.2929, |
| "step": 821 |
| }, |
| { |
| "epoch": 4.616570327552987, |
| "grad_norm": 0.07416106638882912, |
| "learning_rate": 1.4141866322992237e-06, |
| "loss": 0.2965, |
| "step": 822 |
| }, |
| { |
| "epoch": 4.622175512348923, |
| "grad_norm": 0.06811896916091317, |
| "learning_rate": 1.3731366313295858e-06, |
| "loss": 0.2885, |
| "step": 823 |
| }, |
| { |
| "epoch": 4.627780697144859, |
| "grad_norm": 0.0695579136671645, |
| "learning_rate": 1.3326808179812266e-06, |
| "loss": 0.291, |
| "step": 824 |
| }, |
| { |
| "epoch": 4.633385881940796, |
| "grad_norm": 0.06863496179473963, |
| "learning_rate": 1.292819814576065e-06, |
| "loss": 0.294, |
| "step": 825 |
| }, |
| { |
| "epoch": 4.638991066736732, |
| "grad_norm": 0.0705807081382788, |
| "learning_rate": 1.253554234286196e-06, |
| "loss": 0.2904, |
| "step": 826 |
| }, |
| { |
| "epoch": 4.6445962515326675, |
| "grad_norm": 0.06663794364232753, |
| "learning_rate": 1.214884681124473e-06, |
| "loss": 0.2925, |
| "step": 827 |
| }, |
| { |
| "epoch": 4.650201436328604, |
| "grad_norm": 0.06891427767384369, |
| "learning_rate": 1.1768117499351983e-06, |
| "loss": 0.2947, |
| "step": 828 |
| }, |
| { |
| "epoch": 4.65580662112454, |
| "grad_norm": 0.06804227143364437, |
| "learning_rate": 1.1393360263849895e-06, |
| "loss": 0.296, |
| "step": 829 |
| }, |
| { |
| "epoch": 4.661411805920476, |
| "grad_norm": 0.06729466113720775, |
| "learning_rate": 1.1024580869537682e-06, |
| "loss": 0.2895, |
| "step": 830 |
| }, |
| { |
| "epoch": 4.667016990716412, |
| "grad_norm": 0.06981525289632151, |
| "learning_rate": 1.0661784989258784e-06, |
| "loss": 0.2952, |
| "step": 831 |
| }, |
| { |
| "epoch": 4.672622175512349, |
| "grad_norm": 0.06887481964028351, |
| "learning_rate": 1.0304978203813864e-06, |
| "loss": 0.2942, |
| "step": 832 |
| }, |
| { |
| "epoch": 4.678227360308285, |
| "grad_norm": 0.06799466764149216, |
| "learning_rate": 9.954166001874665e-07, |
| "loss": 0.2911, |
| "step": 833 |
| }, |
| { |
| "epoch": 4.683832545104221, |
| "grad_norm": 0.06713373354236589, |
| "learning_rate": 9.609353779899711e-07, |
| "loss": 0.2937, |
| "step": 834 |
| }, |
| { |
| "epoch": 4.689437729900158, |
| "grad_norm": 0.06801377589396243, |
| "learning_rate": 9.270546842051398e-07, |
| "loss": 0.2917, |
| "step": 835 |
| }, |
| { |
| "epoch": 4.695042914696094, |
| "grad_norm": 0.06877475417334285, |
| "learning_rate": 8.937750400114243e-07, |
| "loss": 0.2951, |
| "step": 836 |
| }, |
| { |
| "epoch": 4.70064809949203, |
| "grad_norm": 0.0682963203858977, |
| "learning_rate": 8.610969573414762e-07, |
| "loss": 0.2851, |
| "step": 837 |
| }, |
| { |
| "epoch": 4.706253284287967, |
| "grad_norm": 0.06874191377961358, |
| "learning_rate": 8.290209388742698e-07, |
| "loss": 0.2923, |
| "step": 838 |
| }, |
| { |
| "epoch": 4.711858469083903, |
| "grad_norm": 0.06818766825685828, |
| "learning_rate": 7.975474780273828e-07, |
| "loss": 0.2903, |
| "step": 839 |
| }, |
| { |
| "epoch": 4.717463653879839, |
| "grad_norm": 0.06850750083385908, |
| "learning_rate": 7.666770589493854e-07, |
| "loss": 0.2912, |
| "step": 840 |
| }, |
| { |
| "epoch": 4.7230688386757755, |
| "grad_norm": 0.06669019358923452, |
| "learning_rate": 7.36410156512406e-07, |
| "loss": 0.2886, |
| "step": 841 |
| }, |
| { |
| "epoch": 4.7286740234717115, |
| "grad_norm": 0.0685257652819839, |
| "learning_rate": 7.0674723630483e-07, |
| "loss": 0.2955, |
| "step": 842 |
| }, |
| { |
| "epoch": 4.7342792082676475, |
| "grad_norm": 0.06869894321372189, |
| "learning_rate": 6.776887546241196e-07, |
| "loss": 0.2894, |
| "step": 843 |
| }, |
| { |
| "epoch": 4.7398843930635834, |
| "grad_norm": 0.0658977357878887, |
| "learning_rate": 6.492351584698231e-07, |
| "loss": 0.29, |
| "step": 844 |
| }, |
| { |
| "epoch": 4.74548957785952, |
| "grad_norm": 0.06766778529150386, |
| "learning_rate": 6.213868855366656e-07, |
| "loss": 0.2919, |
| "step": 845 |
| }, |
| { |
| "epoch": 4.751094762655456, |
| "grad_norm": 0.06789970654207798, |
| "learning_rate": 5.94144364207847e-07, |
| "loss": 0.2848, |
| "step": 846 |
| }, |
| { |
| "epoch": 4.756699947451392, |
| "grad_norm": 0.06763905592382854, |
| "learning_rate": 5.675080135484212e-07, |
| "loss": 0.2919, |
| "step": 847 |
| }, |
| { |
| "epoch": 4.762305132247329, |
| "grad_norm": 0.0673637188944257, |
| "learning_rate": 5.41478243298883e-07, |
| "loss": 0.2895, |
| "step": 848 |
| }, |
| { |
| "epoch": 4.767910317043265, |
| "grad_norm": 0.06599718582014574, |
| "learning_rate": 5.160554538688356e-07, |
| "loss": 0.2866, |
| "step": 849 |
| }, |
| { |
| "epoch": 4.773515501839201, |
| "grad_norm": 0.06680161975531686, |
| "learning_rate": 4.912400363308534e-07, |
| "loss": 0.2905, |
| "step": 850 |
| }, |
| { |
| "epoch": 4.779120686635137, |
| "grad_norm": 0.06774042383141317, |
| "learning_rate": 4.670323724144599e-07, |
| "loss": 0.29, |
| "step": 851 |
| }, |
| { |
| "epoch": 4.784725871431074, |
| "grad_norm": 0.06766365842045775, |
| "learning_rate": 4.434328345002348e-07, |
| "loss": 0.2893, |
| "step": 852 |
| }, |
| { |
| "epoch": 4.79033105622701, |
| "grad_norm": 0.0669757827435882, |
| "learning_rate": 4.204417856141252e-07, |
| "loss": 0.2934, |
| "step": 853 |
| }, |
| { |
| "epoch": 4.795936241022947, |
| "grad_norm": 0.06783350979091353, |
| "learning_rate": 3.980595794218278e-07, |
| "loss": 0.2933, |
| "step": 854 |
| }, |
| { |
| "epoch": 4.801541425818883, |
| "grad_norm": 0.06756472881490112, |
| "learning_rate": 3.762865602233623e-07, |
| "loss": 0.2938, |
| "step": 855 |
| }, |
| { |
| "epoch": 4.807146610614819, |
| "grad_norm": 0.06573406645601809, |
| "learning_rate": 3.551230629477731e-07, |
| "loss": 0.2838, |
| "step": 856 |
| }, |
| { |
| "epoch": 4.812751795410755, |
| "grad_norm": 0.06500971296841233, |
| "learning_rate": 3.3456941314798264e-07, |
| "loss": 0.2858, |
| "step": 857 |
| }, |
| { |
| "epoch": 4.8183569802066915, |
| "grad_norm": 0.06666567495190365, |
| "learning_rate": 3.14625926995773e-07, |
| "loss": 0.2928, |
| "step": 858 |
| }, |
| { |
| "epoch": 4.823962165002627, |
| "grad_norm": 0.06842156859320352, |
| "learning_rate": 2.9529291127693204e-07, |
| "loss": 0.2978, |
| "step": 859 |
| }, |
| { |
| "epoch": 4.829567349798563, |
| "grad_norm": 0.06770694825005188, |
| "learning_rate": 2.765706633865195e-07, |
| "loss": 0.2937, |
| "step": 860 |
| }, |
| { |
| "epoch": 4.8351725345945, |
| "grad_norm": 0.06561834235318093, |
| "learning_rate": 2.584594713243105e-07, |
| "loss": 0.2885, |
| "step": 861 |
| }, |
| { |
| "epoch": 4.840777719390436, |
| "grad_norm": 0.06847439054641216, |
| "learning_rate": 2.409596136903636e-07, |
| "loss": 0.2934, |
| "step": 862 |
| }, |
| { |
| "epoch": 4.846382904186372, |
| "grad_norm": 0.06925202861994563, |
| "learning_rate": 2.2407135968072203e-07, |
| "loss": 0.2912, |
| "step": 863 |
| }, |
| { |
| "epoch": 4.851988088982308, |
| "grad_norm": 0.06568658183290636, |
| "learning_rate": 2.0779496908327034e-07, |
| "loss": 0.2865, |
| "step": 864 |
| }, |
| { |
| "epoch": 4.857593273778245, |
| "grad_norm": 0.06527850349185818, |
| "learning_rate": 1.9213069227376423e-07, |
| "loss": 0.285, |
| "step": 865 |
| }, |
| { |
| "epoch": 4.863198458574181, |
| "grad_norm": 0.06537692688098144, |
| "learning_rate": 1.7707877021195364e-07, |
| "loss": 0.2893, |
| "step": 866 |
| }, |
| { |
| "epoch": 4.868803643370118, |
| "grad_norm": 0.0673736030444187, |
| "learning_rate": 1.6263943443788344e-07, |
| "loss": 0.2929, |
| "step": 867 |
| }, |
| { |
| "epoch": 4.874408828166054, |
| "grad_norm": 0.06944665125489001, |
| "learning_rate": 1.488129070683364e-07, |
| "loss": 0.2891, |
| "step": 868 |
| }, |
| { |
| "epoch": 4.88001401296199, |
| "grad_norm": 0.06656997632932934, |
| "learning_rate": 1.355994007934136e-07, |
| "loss": 0.2897, |
| "step": 869 |
| }, |
| { |
| "epoch": 4.885619197757926, |
| "grad_norm": 0.06760741888344543, |
| "learning_rate": 1.229991188732571e-07, |
| "loss": 0.2976, |
| "step": 870 |
| }, |
| { |
| "epoch": 4.891224382553863, |
| "grad_norm": 0.06541550926939739, |
| "learning_rate": 1.1101225513493685e-07, |
| "loss": 0.2867, |
| "step": 871 |
| }, |
| { |
| "epoch": 4.896829567349799, |
| "grad_norm": 0.06563519047490654, |
| "learning_rate": 9.963899396944865e-08, |
| "loss": 0.2908, |
| "step": 872 |
| }, |
| { |
| "epoch": 4.9024347521457345, |
| "grad_norm": 0.0667333344033991, |
| "learning_rate": 8.887951032889863e-08, |
| "loss": 0.2927, |
| "step": 873 |
| }, |
| { |
| "epoch": 4.908039936941671, |
| "grad_norm": 0.06534766168074065, |
| "learning_rate": 7.873396972379876e-08, |
| "loss": 0.287, |
| "step": 874 |
| }, |
| { |
| "epoch": 4.913645121737607, |
| "grad_norm": 0.06573554633063639, |
| "learning_rate": 6.920252822053109e-08, |
| "loss": 0.2935, |
| "step": 875 |
| }, |
| { |
| "epoch": 4.919250306533543, |
| "grad_norm": 0.06692957578454926, |
| "learning_rate": 6.028533243893186e-08, |
| "loss": 0.297, |
| "step": 876 |
| }, |
| { |
| "epoch": 4.924855491329479, |
| "grad_norm": 0.06696022392708174, |
| "learning_rate": 5.19825195500534e-08, |
| "loss": 0.2933, |
| "step": 877 |
| }, |
| { |
| "epoch": 4.930460676125416, |
| "grad_norm": 0.06551574218774454, |
| "learning_rate": 4.429421727403682e-08, |
| "loss": 0.2934, |
| "step": 878 |
| }, |
| { |
| "epoch": 4.936065860921352, |
| "grad_norm": 0.06734924333505671, |
| "learning_rate": 3.722054387816698e-08, |
| "loss": 0.2887, |
| "step": 879 |
| }, |
| { |
| "epoch": 4.941671045717289, |
| "grad_norm": 0.06685929779911977, |
| "learning_rate": 3.076160817503393e-08, |
| "loss": 0.2919, |
| "step": 880 |
| }, |
| { |
| "epoch": 4.947276230513225, |
| "grad_norm": 0.0657094812782885, |
| "learning_rate": 2.491750952087202e-08, |
| "loss": 0.2899, |
| "step": 881 |
| }, |
| { |
| "epoch": 4.952881415309161, |
| "grad_norm": 0.06618212935511, |
| "learning_rate": 1.968833781402335e-08, |
| "loss": 0.2934, |
| "step": 882 |
| }, |
| { |
| "epoch": 4.958486600105097, |
| "grad_norm": 0.06522788950148393, |
| "learning_rate": 1.5074173493565548e-08, |
| "loss": 0.2889, |
| "step": 883 |
| }, |
| { |
| "epoch": 4.964091784901034, |
| "grad_norm": 0.06617327426574475, |
| "learning_rate": 1.1075087538059415e-08, |
| "loss": 0.2869, |
| "step": 884 |
| }, |
| { |
| "epoch": 4.96969696969697, |
| "grad_norm": 0.06616131705312475, |
| "learning_rate": 7.69114146446981e-09, |
| "loss": 0.2862, |
| "step": 885 |
| }, |
| { |
| "epoch": 4.975302154492906, |
| "grad_norm": 0.06628519866860673, |
| "learning_rate": 4.922387327219724e-09, |
| "loss": 0.2946, |
| "step": 886 |
| }, |
| { |
| "epoch": 4.9809073392888426, |
| "grad_norm": 0.06516030532915469, |
| "learning_rate": 2.7688677173687285e-09, |
| "loss": 0.2916, |
| "step": 887 |
| }, |
| { |
| "epoch": 4.9865125240847785, |
| "grad_norm": 0.06490676692933119, |
| "learning_rate": 1.2306157619956793e-09, |
| "loss": 0.2929, |
| "step": 888 |
| }, |
| { |
| "epoch": 4.9921177088807145, |
| "grad_norm": 0.06796682705221838, |
| "learning_rate": 3.0765512364361317e-10, |
| "loss": 0.2958, |
| "step": 889 |
| }, |
| { |
| "epoch": 4.9977228936766505, |
| "grad_norm": 0.06629486855855717, |
| "learning_rate": 0.0, |
| "loss": 0.2924, |
| "step": 890 |
| }, |
| { |
| "epoch": 4.9977228936766505, |
| "step": 890, |
| "total_flos": 2.367212535366969e+19, |
| "train_loss": 0.0, |
| "train_runtime": 1.7461, |
| "train_samples_per_second": 261530.086, |
| "train_steps_per_second": 509.716 |
| } |
| ], |
| "logging_steps": 1, |
| "max_steps": 890, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 5, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 2.367212535366969e+19, |
| "train_batch_size": 1, |
| "trial_name": null, |
| "trial_params": null |
| } |