{ "best_metric": null, "best_model_checkpoint": null, "epoch": 4.990403071017274, "eval_steps": 500, "global_step": 1950, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0025591810620601407, "grad_norm": 5.72460191514523, "learning_rate": 2.0512820512820514e-07, "loss": 0.8785, "step": 1 }, { "epoch": 0.005118362124120281, "grad_norm": 5.959578433623624, "learning_rate": 4.102564102564103e-07, "loss": 0.8678, "step": 2 }, { "epoch": 0.007677543186180422, "grad_norm": 5.902957688136316, "learning_rate": 6.153846153846155e-07, "loss": 0.8968, "step": 3 }, { "epoch": 0.010236724248240563, "grad_norm": 5.4723551383557805, "learning_rate": 8.205128205128206e-07, "loss": 0.8418, "step": 4 }, { "epoch": 0.012795905310300703, "grad_norm": 5.787912289462209, "learning_rate": 1.0256410256410257e-06, "loss": 0.886, "step": 5 }, { "epoch": 0.015355086372360844, "grad_norm": 5.557852691465803, "learning_rate": 1.230769230769231e-06, "loss": 0.9033, "step": 6 }, { "epoch": 0.017914267434420986, "grad_norm": 5.362076082832634, "learning_rate": 1.4358974358974359e-06, "loss": 0.8629, "step": 7 }, { "epoch": 0.020473448496481125, "grad_norm": 4.370734339586599, "learning_rate": 1.6410256410256412e-06, "loss": 0.8157, "step": 8 }, { "epoch": 0.023032629558541268, "grad_norm": 4.493266725432695, "learning_rate": 1.8461538461538465e-06, "loss": 0.8392, "step": 9 }, { "epoch": 0.025591810620601407, "grad_norm": 4.121132813839215, "learning_rate": 2.0512820512820513e-06, "loss": 0.8246, "step": 10 }, { "epoch": 0.02815099168266155, "grad_norm": 2.4085810485641095, "learning_rate": 2.2564102564102566e-06, "loss": 0.8009, "step": 11 }, { "epoch": 0.030710172744721688, "grad_norm": 2.3319618373499087, "learning_rate": 2.461538461538462e-06, "loss": 0.8189, "step": 12 }, { "epoch": 0.03326935380678183, "grad_norm": 2.050198471153189, "learning_rate": 2.666666666666667e-06, "loss": 0.7671, "step": 13 }, { "epoch": 0.03582853486884197, "grad_norm": 2.5524552964667926, "learning_rate": 2.8717948717948717e-06, "loss": 0.7587, "step": 14 }, { "epoch": 0.03838771593090211, "grad_norm": 3.44805626463955, "learning_rate": 3.0769230769230774e-06, "loss": 0.779, "step": 15 }, { "epoch": 0.04094689699296225, "grad_norm": 3.559383059115368, "learning_rate": 3.2820512820512823e-06, "loss": 0.758, "step": 16 }, { "epoch": 0.04350607805502239, "grad_norm": 3.33809383465494, "learning_rate": 3.487179487179487e-06, "loss": 0.7136, "step": 17 }, { "epoch": 0.046065259117082535, "grad_norm": 3.2168388473087757, "learning_rate": 3.692307692307693e-06, "loss": 0.7369, "step": 18 }, { "epoch": 0.04862444017914268, "grad_norm": 2.5872597896051728, "learning_rate": 3.897435897435898e-06, "loss": 0.7323, "step": 19 }, { "epoch": 0.05118362124120281, "grad_norm": 2.0282374599316957, "learning_rate": 4.102564102564103e-06, "loss": 0.6876, "step": 20 }, { "epoch": 0.053742802303262956, "grad_norm": 1.5964078137928233, "learning_rate": 4.307692307692308e-06, "loss": 0.6678, "step": 21 }, { "epoch": 0.0563019833653231, "grad_norm": 1.2909627910012984, "learning_rate": 4.512820512820513e-06, "loss": 0.662, "step": 22 }, { "epoch": 0.05886116442738324, "grad_norm": 1.3177057124827334, "learning_rate": 4.717948717948718e-06, "loss": 0.6594, "step": 23 }, { "epoch": 0.061420345489443376, "grad_norm": 1.24942825931957, "learning_rate": 4.923076923076924e-06, "loss": 0.626, "step": 24 }, { "epoch": 0.06397952655150352, "grad_norm": 1.2673077068864744, "learning_rate": 5.128205128205128e-06, "loss": 0.6347, "step": 25 }, { "epoch": 0.06653870761356366, "grad_norm": 1.1563656900829429, "learning_rate": 5.333333333333334e-06, "loss": 0.6329, "step": 26 }, { "epoch": 0.0690978886756238, "grad_norm": 1.044094666100426, "learning_rate": 5.538461538461539e-06, "loss": 0.6254, "step": 27 }, { "epoch": 0.07165706973768395, "grad_norm": 0.9466038321235274, "learning_rate": 5.743589743589743e-06, "loss": 0.6295, "step": 28 }, { "epoch": 0.07421625079974409, "grad_norm": 0.7981163236308523, "learning_rate": 5.948717948717949e-06, "loss": 0.6155, "step": 29 }, { "epoch": 0.07677543186180422, "grad_norm": 0.9968280326546483, "learning_rate": 6.153846153846155e-06, "loss": 0.6113, "step": 30 }, { "epoch": 0.07933461292386436, "grad_norm": 0.9260190035124614, "learning_rate": 6.358974358974359e-06, "loss": 0.636, "step": 31 }, { "epoch": 0.0818937939859245, "grad_norm": 0.8477667711908703, "learning_rate": 6.564102564102565e-06, "loss": 0.63, "step": 32 }, { "epoch": 0.08445297504798464, "grad_norm": 0.6532816121925329, "learning_rate": 6.76923076923077e-06, "loss": 0.6108, "step": 33 }, { "epoch": 0.08701215611004479, "grad_norm": 0.7821169327010173, "learning_rate": 6.974358974358974e-06, "loss": 0.6408, "step": 34 }, { "epoch": 0.08957133717210493, "grad_norm": 0.7393409032705888, "learning_rate": 7.17948717948718e-06, "loss": 0.5915, "step": 35 }, { "epoch": 0.09213051823416507, "grad_norm": 0.6644188521415291, "learning_rate": 7.384615384615386e-06, "loss": 0.5961, "step": 36 }, { "epoch": 0.09468969929622521, "grad_norm": 0.593116184422468, "learning_rate": 7.58974358974359e-06, "loss": 0.5971, "step": 37 }, { "epoch": 0.09724888035828536, "grad_norm": 0.6919315440965734, "learning_rate": 7.794871794871796e-06, "loss": 0.5725, "step": 38 }, { "epoch": 0.09980806142034548, "grad_norm": 0.6959155805001593, "learning_rate": 8.000000000000001e-06, "loss": 0.5685, "step": 39 }, { "epoch": 0.10236724248240563, "grad_norm": 0.6398643666989815, "learning_rate": 8.205128205128205e-06, "loss": 0.6138, "step": 40 }, { "epoch": 0.10492642354446577, "grad_norm": 0.6361698523409376, "learning_rate": 8.410256410256411e-06, "loss": 0.5849, "step": 41 }, { "epoch": 0.10748560460652591, "grad_norm": 0.5303957188864263, "learning_rate": 8.615384615384617e-06, "loss": 0.5533, "step": 42 }, { "epoch": 0.11004478566858605, "grad_norm": 0.5370326395081426, "learning_rate": 8.820512820512821e-06, "loss": 0.5833, "step": 43 }, { "epoch": 0.1126039667306462, "grad_norm": 0.590660924049006, "learning_rate": 9.025641025641027e-06, "loss": 0.5888, "step": 44 }, { "epoch": 0.11516314779270634, "grad_norm": 0.540686200850521, "learning_rate": 9.230769230769232e-06, "loss": 0.5624, "step": 45 }, { "epoch": 0.11772232885476648, "grad_norm": 0.5777571666796689, "learning_rate": 9.435897435897436e-06, "loss": 0.5851, "step": 46 }, { "epoch": 0.12028150991682661, "grad_norm": 0.537318145066867, "learning_rate": 9.641025641025642e-06, "loss": 0.5644, "step": 47 }, { "epoch": 0.12284069097888675, "grad_norm": 0.5211947061981134, "learning_rate": 9.846153846153848e-06, "loss": 0.5591, "step": 48 }, { "epoch": 0.1253998720409469, "grad_norm": 0.5397404148687415, "learning_rate": 1.0051282051282052e-05, "loss": 0.5843, "step": 49 }, { "epoch": 0.12795905310300704, "grad_norm": 0.5297926804183296, "learning_rate": 1.0256410256410256e-05, "loss": 0.5277, "step": 50 }, { "epoch": 0.13051823416506717, "grad_norm": 0.605652388150248, "learning_rate": 1.0461538461538463e-05, "loss": 0.5724, "step": 51 }, { "epoch": 0.13307741522712732, "grad_norm": 0.45885002080359344, "learning_rate": 1.0666666666666667e-05, "loss": 0.5381, "step": 52 }, { "epoch": 0.13563659628918745, "grad_norm": 0.5930020802013372, "learning_rate": 1.0871794871794871e-05, "loss": 0.6045, "step": 53 }, { "epoch": 0.1381957773512476, "grad_norm": 0.5808129528368039, "learning_rate": 1.1076923076923079e-05, "loss": 0.5222, "step": 54 }, { "epoch": 0.14075495841330773, "grad_norm": 0.5154128262574531, "learning_rate": 1.1282051282051283e-05, "loss": 0.5578, "step": 55 }, { "epoch": 0.1433141394753679, "grad_norm": 0.5390655268219918, "learning_rate": 1.1487179487179487e-05, "loss": 0.5436, "step": 56 }, { "epoch": 0.14587332053742802, "grad_norm": 0.5586414645653933, "learning_rate": 1.1692307692307694e-05, "loss": 0.5353, "step": 57 }, { "epoch": 0.14843250159948818, "grad_norm": 0.6534105047151474, "learning_rate": 1.1897435897435898e-05, "loss": 0.5392, "step": 58 }, { "epoch": 0.1509916826615483, "grad_norm": 0.5945111461514314, "learning_rate": 1.2102564102564102e-05, "loss": 0.5542, "step": 59 }, { "epoch": 0.15355086372360843, "grad_norm": 0.6505380466948517, "learning_rate": 1.230769230769231e-05, "loss": 0.5252, "step": 60 }, { "epoch": 0.1561100447856686, "grad_norm": 0.6510336772233184, "learning_rate": 1.2512820512820514e-05, "loss": 0.5683, "step": 61 }, { "epoch": 0.15866922584772872, "grad_norm": 0.5509745528884461, "learning_rate": 1.2717948717948718e-05, "loss": 0.5205, "step": 62 }, { "epoch": 0.16122840690978887, "grad_norm": 0.5917102526960739, "learning_rate": 1.2923076923076925e-05, "loss": 0.5603, "step": 63 }, { "epoch": 0.163787587971849, "grad_norm": 0.560110790537656, "learning_rate": 1.312820512820513e-05, "loss": 0.545, "step": 64 }, { "epoch": 0.16634676903390916, "grad_norm": 0.5678363411921677, "learning_rate": 1.3333333333333333e-05, "loss": 0.5287, "step": 65 }, { "epoch": 0.1689059500959693, "grad_norm": 0.5819806020456917, "learning_rate": 1.353846153846154e-05, "loss": 0.5535, "step": 66 }, { "epoch": 0.17146513115802944, "grad_norm": 0.5558896303005907, "learning_rate": 1.3743589743589745e-05, "loss": 0.5648, "step": 67 }, { "epoch": 0.17402431222008957, "grad_norm": 0.5818297224628268, "learning_rate": 1.3948717948717949e-05, "loss": 0.545, "step": 68 }, { "epoch": 0.1765834932821497, "grad_norm": 0.5604385516101225, "learning_rate": 1.4153846153846156e-05, "loss": 0.5625, "step": 69 }, { "epoch": 0.17914267434420986, "grad_norm": 0.5262255043539227, "learning_rate": 1.435897435897436e-05, "loss": 0.553, "step": 70 }, { "epoch": 0.18170185540626999, "grad_norm": 0.5449392960823104, "learning_rate": 1.4564102564102564e-05, "loss": 0.532, "step": 71 }, { "epoch": 0.18426103646833014, "grad_norm": 0.5757069226256301, "learning_rate": 1.4769230769230772e-05, "loss": 0.538, "step": 72 }, { "epoch": 0.18682021753039027, "grad_norm": 0.5241911795419738, "learning_rate": 1.4974358974358976e-05, "loss": 0.537, "step": 73 }, { "epoch": 0.18937939859245043, "grad_norm": 0.4755835874750557, "learning_rate": 1.517948717948718e-05, "loss": 0.5015, "step": 74 }, { "epoch": 0.19193857965451055, "grad_norm": 0.5816260566225624, "learning_rate": 1.5384615384615387e-05, "loss": 0.5317, "step": 75 }, { "epoch": 0.1944977607165707, "grad_norm": 0.5324895484169254, "learning_rate": 1.558974358974359e-05, "loss": 0.503, "step": 76 }, { "epoch": 0.19705694177863084, "grad_norm": 0.5873638181887759, "learning_rate": 1.5794871794871795e-05, "loss": 0.5571, "step": 77 }, { "epoch": 0.19961612284069097, "grad_norm": 0.5466165214856151, "learning_rate": 1.6000000000000003e-05, "loss": 0.5177, "step": 78 }, { "epoch": 0.20217530390275112, "grad_norm": 0.6209989234883905, "learning_rate": 1.6205128205128207e-05, "loss": 0.5268, "step": 79 }, { "epoch": 0.20473448496481125, "grad_norm": 0.5479973413858492, "learning_rate": 1.641025641025641e-05, "loss": 0.5096, "step": 80 }, { "epoch": 0.2072936660268714, "grad_norm": 0.7512485115452843, "learning_rate": 1.6615384615384618e-05, "loss": 0.5398, "step": 81 }, { "epoch": 0.20985284708893154, "grad_norm": 0.5988795561569944, "learning_rate": 1.6820512820512822e-05, "loss": 0.5054, "step": 82 }, { "epoch": 0.2124120281509917, "grad_norm": 0.6349565283068188, "learning_rate": 1.7025641025641026e-05, "loss": 0.5301, "step": 83 }, { "epoch": 0.21497120921305182, "grad_norm": 0.7482046004578073, "learning_rate": 1.7230769230769234e-05, "loss": 0.5451, "step": 84 }, { "epoch": 0.21753039027511195, "grad_norm": 0.5452089019203088, "learning_rate": 1.7435897435897438e-05, "loss": 0.537, "step": 85 }, { "epoch": 0.2200895713371721, "grad_norm": 0.6062443857702727, "learning_rate": 1.7641025641025642e-05, "loss": 0.5057, "step": 86 }, { "epoch": 0.22264875239923224, "grad_norm": 0.6133096727031904, "learning_rate": 1.784615384615385e-05, "loss": 0.549, "step": 87 }, { "epoch": 0.2252079334612924, "grad_norm": 0.6734829746990577, "learning_rate": 1.8051282051282053e-05, "loss": 0.5549, "step": 88 }, { "epoch": 0.22776711452335252, "grad_norm": 0.5692319701895174, "learning_rate": 1.8256410256410257e-05, "loss": 0.5053, "step": 89 }, { "epoch": 0.23032629558541268, "grad_norm": 0.6551305606242741, "learning_rate": 1.8461538461538465e-05, "loss": 0.5451, "step": 90 }, { "epoch": 0.2328854766474728, "grad_norm": 0.5869184912875696, "learning_rate": 1.866666666666667e-05, "loss": 0.5573, "step": 91 }, { "epoch": 0.23544465770953296, "grad_norm": 0.6175475389576918, "learning_rate": 1.8871794871794873e-05, "loss": 0.5239, "step": 92 }, { "epoch": 0.2380038387715931, "grad_norm": 0.6923526622405791, "learning_rate": 1.907692307692308e-05, "loss": 0.5178, "step": 93 }, { "epoch": 0.24056301983365322, "grad_norm": 0.6169855947769446, "learning_rate": 1.9282051282051284e-05, "loss": 0.5583, "step": 94 }, { "epoch": 0.24312220089571338, "grad_norm": 0.8905504059133514, "learning_rate": 1.9487179487179488e-05, "loss": 0.5577, "step": 95 }, { "epoch": 0.2456813819577735, "grad_norm": 0.7206999368653747, "learning_rate": 1.9692307692307696e-05, "loss": 0.5248, "step": 96 }, { "epoch": 0.24824056301983366, "grad_norm": 0.5689479892809097, "learning_rate": 1.98974358974359e-05, "loss": 0.5143, "step": 97 }, { "epoch": 0.2507997440818938, "grad_norm": 0.7120200186242683, "learning_rate": 2.0102564102564104e-05, "loss": 0.5363, "step": 98 }, { "epoch": 0.2533589251439539, "grad_norm": 0.5649172258139606, "learning_rate": 2.0307692307692308e-05, "loss": 0.5232, "step": 99 }, { "epoch": 0.2559181062060141, "grad_norm": 0.8530948421377756, "learning_rate": 2.0512820512820512e-05, "loss": 0.5127, "step": 100 }, { "epoch": 0.25847728726807423, "grad_norm": 0.7220421419640349, "learning_rate": 2.0717948717948723e-05, "loss": 0.5472, "step": 101 }, { "epoch": 0.26103646833013433, "grad_norm": 0.7179821332235292, "learning_rate": 2.0923076923076927e-05, "loss": 0.4748, "step": 102 }, { "epoch": 0.2635956493921945, "grad_norm": 0.5969399443763903, "learning_rate": 2.112820512820513e-05, "loss": 0.4869, "step": 103 }, { "epoch": 0.26615483045425464, "grad_norm": 0.7652225150209184, "learning_rate": 2.1333333333333335e-05, "loss": 0.5048, "step": 104 }, { "epoch": 0.2687140115163148, "grad_norm": 0.7026938678210959, "learning_rate": 2.153846153846154e-05, "loss": 0.5351, "step": 105 }, { "epoch": 0.2712731925783749, "grad_norm": 0.6625081293533241, "learning_rate": 2.1743589743589743e-05, "loss": 0.5071, "step": 106 }, { "epoch": 0.27383237364043506, "grad_norm": 0.7338103048357757, "learning_rate": 2.1948717948717954e-05, "loss": 0.5214, "step": 107 }, { "epoch": 0.2763915547024952, "grad_norm": 0.7157427738707126, "learning_rate": 2.2153846153846158e-05, "loss": 0.521, "step": 108 }, { "epoch": 0.27895073576455537, "grad_norm": 0.8391389612996835, "learning_rate": 2.235897435897436e-05, "loss": 0.5114, "step": 109 }, { "epoch": 0.28150991682661547, "grad_norm": 0.6739896722237592, "learning_rate": 2.2564102564102566e-05, "loss": 0.4607, "step": 110 }, { "epoch": 0.2840690978886756, "grad_norm": 0.7325968758566463, "learning_rate": 2.276923076923077e-05, "loss": 0.5114, "step": 111 }, { "epoch": 0.2866282789507358, "grad_norm": 0.9192765458484284, "learning_rate": 2.2974358974358974e-05, "loss": 0.5164, "step": 112 }, { "epoch": 0.2891874600127959, "grad_norm": 0.7638911037686114, "learning_rate": 2.3179487179487184e-05, "loss": 0.5056, "step": 113 }, { "epoch": 0.29174664107485604, "grad_norm": 0.7754803757011083, "learning_rate": 2.338461538461539e-05, "loss": 0.5137, "step": 114 }, { "epoch": 0.2943058221369162, "grad_norm": 0.7313220696083259, "learning_rate": 2.3589743589743593e-05, "loss": 0.5171, "step": 115 }, { "epoch": 0.29686500319897635, "grad_norm": 0.8944240636016003, "learning_rate": 2.3794871794871797e-05, "loss": 0.5631, "step": 116 }, { "epoch": 0.29942418426103645, "grad_norm": 0.827344957741263, "learning_rate": 2.4e-05, "loss": 0.5329, "step": 117 }, { "epoch": 0.3019833653230966, "grad_norm": 0.7763009278418379, "learning_rate": 2.4205128205128205e-05, "loss": 0.5341, "step": 118 }, { "epoch": 0.30454254638515676, "grad_norm": 0.7998736408167985, "learning_rate": 2.4410256410256415e-05, "loss": 0.52, "step": 119 }, { "epoch": 0.30710172744721687, "grad_norm": 0.7411952795822903, "learning_rate": 2.461538461538462e-05, "loss": 0.5418, "step": 120 }, { "epoch": 0.309660908509277, "grad_norm": 0.659770656309478, "learning_rate": 2.4820512820512824e-05, "loss": 0.5195, "step": 121 }, { "epoch": 0.3122200895713372, "grad_norm": 0.8056693118680838, "learning_rate": 2.5025641025641028e-05, "loss": 0.5215, "step": 122 }, { "epoch": 0.31477927063339733, "grad_norm": 0.9631898506281213, "learning_rate": 2.523076923076923e-05, "loss": 0.5283, "step": 123 }, { "epoch": 0.31733845169545744, "grad_norm": 0.7096814914325649, "learning_rate": 2.5435897435897436e-05, "loss": 0.5155, "step": 124 }, { "epoch": 0.3198976327575176, "grad_norm": 1.028582043530853, "learning_rate": 2.5641025641025646e-05, "loss": 0.5394, "step": 125 }, { "epoch": 0.32245681381957775, "grad_norm": 0.7239590324246933, "learning_rate": 2.584615384615385e-05, "loss": 0.5446, "step": 126 }, { "epoch": 0.32501599488163785, "grad_norm": 1.0571455117998556, "learning_rate": 2.6051282051282054e-05, "loss": 0.5335, "step": 127 }, { "epoch": 0.327575175943698, "grad_norm": 1.0256105590142106, "learning_rate": 2.625641025641026e-05, "loss": 0.5418, "step": 128 }, { "epoch": 0.33013435700575816, "grad_norm": 1.0959117099820284, "learning_rate": 2.6461538461538463e-05, "loss": 0.5544, "step": 129 }, { "epoch": 0.3326935380678183, "grad_norm": 0.7463871798931493, "learning_rate": 2.6666666666666667e-05, "loss": 0.4965, "step": 130 }, { "epoch": 0.3352527191298784, "grad_norm": 1.0194762534931083, "learning_rate": 2.687179487179487e-05, "loss": 0.4776, "step": 131 }, { "epoch": 0.3378119001919386, "grad_norm": 0.7119748311745303, "learning_rate": 2.707692307692308e-05, "loss": 0.5203, "step": 132 }, { "epoch": 0.34037108125399873, "grad_norm": 0.920481871489979, "learning_rate": 2.7282051282051285e-05, "loss": 0.5142, "step": 133 }, { "epoch": 0.3429302623160589, "grad_norm": 0.7065977141822832, "learning_rate": 2.748717948717949e-05, "loss": 0.5349, "step": 134 }, { "epoch": 0.345489443378119, "grad_norm": 0.9570189898635619, "learning_rate": 2.7692307692307694e-05, "loss": 0.556, "step": 135 }, { "epoch": 0.34804862444017914, "grad_norm": 0.8774520896998024, "learning_rate": 2.7897435897435898e-05, "loss": 0.5221, "step": 136 }, { "epoch": 0.3506078055022393, "grad_norm": 0.9754340258356095, "learning_rate": 2.81025641025641e-05, "loss": 0.5165, "step": 137 }, { "epoch": 0.3531669865642994, "grad_norm": 0.9512013949257682, "learning_rate": 2.8307692307692312e-05, "loss": 0.5094, "step": 138 }, { "epoch": 0.35572616762635956, "grad_norm": 1.0101279645456138, "learning_rate": 2.8512820512820516e-05, "loss": 0.5134, "step": 139 }, { "epoch": 0.3582853486884197, "grad_norm": 0.9392456947082269, "learning_rate": 2.871794871794872e-05, "loss": 0.5154, "step": 140 }, { "epoch": 0.36084452975047987, "grad_norm": 1.064205953704163, "learning_rate": 2.8923076923076925e-05, "loss": 0.5422, "step": 141 }, { "epoch": 0.36340371081253997, "grad_norm": 0.8361885058587943, "learning_rate": 2.912820512820513e-05, "loss": 0.5045, "step": 142 }, { "epoch": 0.3659628918746001, "grad_norm": 1.0678541131176078, "learning_rate": 2.9333333333333333e-05, "loss": 0.4843, "step": 143 }, { "epoch": 0.3685220729366603, "grad_norm": 0.6800509515379447, "learning_rate": 2.9538461538461543e-05, "loss": 0.5256, "step": 144 }, { "epoch": 0.3710812539987204, "grad_norm": 0.7904486157434544, "learning_rate": 2.9743589743589747e-05, "loss": 0.5225, "step": 145 }, { "epoch": 0.37364043506078054, "grad_norm": 0.816468325578623, "learning_rate": 2.994871794871795e-05, "loss": 0.5389, "step": 146 }, { "epoch": 0.3761996161228407, "grad_norm": 0.7918935996515184, "learning_rate": 3.0153846153846155e-05, "loss": 0.5014, "step": 147 }, { "epoch": 0.37875879718490085, "grad_norm": 0.7555828789735101, "learning_rate": 3.035897435897436e-05, "loss": 0.5236, "step": 148 }, { "epoch": 0.38131797824696095, "grad_norm": 1.0603633188732544, "learning_rate": 3.0564102564102564e-05, "loss": 0.5271, "step": 149 }, { "epoch": 0.3838771593090211, "grad_norm": 0.8796724653002846, "learning_rate": 3.0769230769230774e-05, "loss": 0.5214, "step": 150 }, { "epoch": 0.38643634037108127, "grad_norm": 0.6823417266648101, "learning_rate": 3.097435897435898e-05, "loss": 0.492, "step": 151 }, { "epoch": 0.3889955214331414, "grad_norm": 0.9675008798265416, "learning_rate": 3.117948717948718e-05, "loss": 0.5282, "step": 152 }, { "epoch": 0.3915547024952015, "grad_norm": 1.0136226084949147, "learning_rate": 3.1384615384615386e-05, "loss": 0.5044, "step": 153 }, { "epoch": 0.3941138835572617, "grad_norm": 0.9351502869426284, "learning_rate": 3.158974358974359e-05, "loss": 0.5006, "step": 154 }, { "epoch": 0.39667306461932184, "grad_norm": 0.9882679082998469, "learning_rate": 3.1794871794871795e-05, "loss": 0.5013, "step": 155 }, { "epoch": 0.39923224568138194, "grad_norm": 0.9382011251547424, "learning_rate": 3.2000000000000005e-05, "loss": 0.4777, "step": 156 }, { "epoch": 0.4017914267434421, "grad_norm": 0.9879747473370469, "learning_rate": 3.220512820512821e-05, "loss": 0.4718, "step": 157 }, { "epoch": 0.40435060780550225, "grad_norm": 0.9016818417869712, "learning_rate": 3.2410256410256413e-05, "loss": 0.5294, "step": 158 }, { "epoch": 0.4069097888675624, "grad_norm": 0.9938632562876675, "learning_rate": 3.261538461538462e-05, "loss": 0.4991, "step": 159 }, { "epoch": 0.4094689699296225, "grad_norm": 1.3410049525059016, "learning_rate": 3.282051282051282e-05, "loss": 0.5095, "step": 160 }, { "epoch": 0.41202815099168266, "grad_norm": 0.8871685101904818, "learning_rate": 3.3025641025641025e-05, "loss": 0.5483, "step": 161 }, { "epoch": 0.4145873320537428, "grad_norm": 1.2268108807413454, "learning_rate": 3.3230769230769236e-05, "loss": 0.5174, "step": 162 }, { "epoch": 0.4171465131158029, "grad_norm": 0.9220772123108049, "learning_rate": 3.343589743589744e-05, "loss": 0.5181, "step": 163 }, { "epoch": 0.4197056941778631, "grad_norm": 1.0658979698672157, "learning_rate": 3.3641025641025644e-05, "loss": 0.539, "step": 164 }, { "epoch": 0.42226487523992323, "grad_norm": 0.97733215203732, "learning_rate": 3.384615384615385e-05, "loss": 0.5191, "step": 165 }, { "epoch": 0.4248240563019834, "grad_norm": 1.1290947988408833, "learning_rate": 3.405128205128205e-05, "loss": 0.5169, "step": 166 }, { "epoch": 0.4273832373640435, "grad_norm": 0.8361689719032068, "learning_rate": 3.4256410256410256e-05, "loss": 0.5114, "step": 167 }, { "epoch": 0.42994241842610365, "grad_norm": 0.9928527289292229, "learning_rate": 3.446153846153847e-05, "loss": 0.5123, "step": 168 }, { "epoch": 0.4325015994881638, "grad_norm": 0.7704183020179163, "learning_rate": 3.466666666666667e-05, "loss": 0.4828, "step": 169 }, { "epoch": 0.4350607805502239, "grad_norm": 0.9283903900385092, "learning_rate": 3.4871794871794875e-05, "loss": 0.5226, "step": 170 }, { "epoch": 0.43761996161228406, "grad_norm": 0.925519484340918, "learning_rate": 3.507692307692308e-05, "loss": 0.4956, "step": 171 }, { "epoch": 0.4401791426743442, "grad_norm": 0.9993231714335514, "learning_rate": 3.5282051282051283e-05, "loss": 0.5355, "step": 172 }, { "epoch": 0.44273832373640437, "grad_norm": 1.311826250820302, "learning_rate": 3.548717948717949e-05, "loss": 0.5237, "step": 173 }, { "epoch": 0.44529750479846447, "grad_norm": 0.9273485381049265, "learning_rate": 3.56923076923077e-05, "loss": 0.5131, "step": 174 }, { "epoch": 0.44785668586052463, "grad_norm": 1.248607874192031, "learning_rate": 3.58974358974359e-05, "loss": 0.4972, "step": 175 }, { "epoch": 0.4504158669225848, "grad_norm": 0.8222900953155937, "learning_rate": 3.6102564102564106e-05, "loss": 0.5364, "step": 176 }, { "epoch": 0.45297504798464494, "grad_norm": 1.123693838028357, "learning_rate": 3.630769230769231e-05, "loss": 0.5219, "step": 177 }, { "epoch": 0.45553422904670504, "grad_norm": 0.7807365104513995, "learning_rate": 3.6512820512820514e-05, "loss": 0.5182, "step": 178 }, { "epoch": 0.4580934101087652, "grad_norm": 0.9231872939650723, "learning_rate": 3.671794871794872e-05, "loss": 0.5235, "step": 179 }, { "epoch": 0.46065259117082535, "grad_norm": 0.8053927537642723, "learning_rate": 3.692307692307693e-05, "loss": 0.5202, "step": 180 }, { "epoch": 0.46321177223288545, "grad_norm": 1.1551405361338565, "learning_rate": 3.712820512820513e-05, "loss": 0.5173, "step": 181 }, { "epoch": 0.4657709532949456, "grad_norm": 0.9439986501141405, "learning_rate": 3.733333333333334e-05, "loss": 0.5232, "step": 182 }, { "epoch": 0.46833013435700577, "grad_norm": 1.106193836601767, "learning_rate": 3.753846153846154e-05, "loss": 0.5533, "step": 183 }, { "epoch": 0.4708893154190659, "grad_norm": 0.794287368963475, "learning_rate": 3.7743589743589745e-05, "loss": 0.4762, "step": 184 }, { "epoch": 0.473448496481126, "grad_norm": 0.8611752707863242, "learning_rate": 3.794871794871795e-05, "loss": 0.5274, "step": 185 }, { "epoch": 0.4760076775431862, "grad_norm": 0.8817686680939495, "learning_rate": 3.815384615384616e-05, "loss": 0.5188, "step": 186 }, { "epoch": 0.47856685860524634, "grad_norm": 0.8845212101910219, "learning_rate": 3.8358974358974364e-05, "loss": 0.5218, "step": 187 }, { "epoch": 0.48112603966730644, "grad_norm": 0.663678181823649, "learning_rate": 3.856410256410257e-05, "loss": 0.5227, "step": 188 }, { "epoch": 0.4836852207293666, "grad_norm": 0.755313719038617, "learning_rate": 3.876923076923077e-05, "loss": 0.5426, "step": 189 }, { "epoch": 0.48624440179142675, "grad_norm": 0.68323576325022, "learning_rate": 3.8974358974358976e-05, "loss": 0.5254, "step": 190 }, { "epoch": 0.4888035828534869, "grad_norm": 0.7623898782087327, "learning_rate": 3.917948717948718e-05, "loss": 0.5011, "step": 191 }, { "epoch": 0.491362763915547, "grad_norm": 0.6832707042582635, "learning_rate": 3.938461538461539e-05, "loss": 0.5318, "step": 192 }, { "epoch": 0.49392194497760716, "grad_norm": 0.8383249840589115, "learning_rate": 3.9589743589743595e-05, "loss": 0.5244, "step": 193 }, { "epoch": 0.4964811260396673, "grad_norm": 0.6885029349814699, "learning_rate": 3.97948717948718e-05, "loss": 0.5068, "step": 194 }, { "epoch": 0.4990403071017274, "grad_norm": 0.9336320586686789, "learning_rate": 4e-05, "loss": 0.5013, "step": 195 }, { "epoch": 0.5015994881637876, "grad_norm": 1.1273619177371816, "learning_rate": 3.999996795609852e-05, "loss": 0.4924, "step": 196 }, { "epoch": 0.5041586692258477, "grad_norm": 0.8484780646183507, "learning_rate": 3.9999871824496765e-05, "loss": 0.4774, "step": 197 }, { "epoch": 0.5067178502879078, "grad_norm": 0.7566071063905553, "learning_rate": 3.999971160550277e-05, "loss": 0.5328, "step": 198 }, { "epoch": 0.509277031349968, "grad_norm": 0.963017285802582, "learning_rate": 3.999948729962994e-05, "loss": 0.535, "step": 199 }, { "epoch": 0.5118362124120281, "grad_norm": 0.6417865549660366, "learning_rate": 3.9999198907597046e-05, "loss": 0.502, "step": 200 }, { "epoch": 0.5143953934740882, "grad_norm": 0.8877386892780115, "learning_rate": 3.999884643032821e-05, "loss": 0.513, "step": 201 }, { "epoch": 0.5169545745361485, "grad_norm": 0.7952767722764698, "learning_rate": 3.999842986895289e-05, "loss": 0.5121, "step": 202 }, { "epoch": 0.5195137555982086, "grad_norm": 0.7352919644974529, "learning_rate": 3.999794922480593e-05, "loss": 0.4881, "step": 203 }, { "epoch": 0.5220729366602687, "grad_norm": 0.7101035899255743, "learning_rate": 3.9997404499427494e-05, "loss": 0.4941, "step": 204 }, { "epoch": 0.5246321177223289, "grad_norm": 0.7457517555879764, "learning_rate": 3.9996795694563096e-05, "loss": 0.5128, "step": 205 }, { "epoch": 0.527191298784389, "grad_norm": 0.6573802709079243, "learning_rate": 3.999612281216358e-05, "loss": 0.4949, "step": 206 }, { "epoch": 0.5297504798464492, "grad_norm": 0.7445707355838409, "learning_rate": 3.9995385854385124e-05, "loss": 0.5036, "step": 207 }, { "epoch": 0.5323096609085093, "grad_norm": 0.7713581724495707, "learning_rate": 3.999458482358924e-05, "loss": 0.4985, "step": 208 }, { "epoch": 0.5348688419705694, "grad_norm": 0.7752015257631751, "learning_rate": 3.9993719722342726e-05, "loss": 0.508, "step": 209 }, { "epoch": 0.5374280230326296, "grad_norm": 1.133508847111212, "learning_rate": 3.999279055341771e-05, "loss": 0.5261, "step": 210 }, { "epoch": 0.5399872040946897, "grad_norm": 0.7726822279135389, "learning_rate": 3.999179731979162e-05, "loss": 0.4672, "step": 211 }, { "epoch": 0.5425463851567498, "grad_norm": 0.7766807053995074, "learning_rate": 3.9990740024647154e-05, "loss": 0.4984, "step": 212 }, { "epoch": 0.54510556621881, "grad_norm": 0.5303932042183875, "learning_rate": 3.9989618671372304e-05, "loss": 0.5129, "step": 213 }, { "epoch": 0.5476647472808701, "grad_norm": 0.6026485573392468, "learning_rate": 3.998843326356032e-05, "loss": 0.5127, "step": 214 }, { "epoch": 0.5502239283429302, "grad_norm": 0.6118913537653141, "learning_rate": 3.998718380500971e-05, "loss": 0.458, "step": 215 }, { "epoch": 0.5527831094049904, "grad_norm": 0.6445812096951002, "learning_rate": 3.998587029972423e-05, "loss": 0.495, "step": 216 }, { "epoch": 0.5553422904670505, "grad_norm": 0.7741176770417552, "learning_rate": 3.998449275191286e-05, "loss": 0.5095, "step": 217 }, { "epoch": 0.5579014715291107, "grad_norm": 0.6972261092630921, "learning_rate": 3.9983051165989814e-05, "loss": 0.4871, "step": 218 }, { "epoch": 0.5604606525911708, "grad_norm": 0.6664826347703592, "learning_rate": 3.998154554657448e-05, "loss": 0.5137, "step": 219 }, { "epoch": 0.5630198336532309, "grad_norm": 0.6334253599100873, "learning_rate": 3.997997589849145e-05, "loss": 0.5494, "step": 220 }, { "epoch": 0.5655790147152912, "grad_norm": 0.7145250281069672, "learning_rate": 3.99783422267705e-05, "loss": 0.5219, "step": 221 }, { "epoch": 0.5681381957773513, "grad_norm": 0.8804491997042745, "learning_rate": 3.997664453664654e-05, "loss": 0.5305, "step": 222 }, { "epoch": 0.5706973768394114, "grad_norm": 0.5941374342473514, "learning_rate": 3.9974882833559634e-05, "loss": 0.492, "step": 223 }, { "epoch": 0.5732565579014716, "grad_norm": 0.8541812515528515, "learning_rate": 3.997305712315497e-05, "loss": 0.4994, "step": 224 }, { "epoch": 0.5758157389635317, "grad_norm": 0.8858077653575287, "learning_rate": 3.9971167411282835e-05, "loss": 0.5268, "step": 225 }, { "epoch": 0.5783749200255918, "grad_norm": 1.0807623477992674, "learning_rate": 3.9969213703998606e-05, "loss": 0.5047, "step": 226 }, { "epoch": 0.580934101087652, "grad_norm": 0.8213345788017161, "learning_rate": 3.9967196007562725e-05, "loss": 0.5302, "step": 227 }, { "epoch": 0.5834932821497121, "grad_norm": 0.6055767495165976, "learning_rate": 3.996511432844067e-05, "loss": 0.4833, "step": 228 }, { "epoch": 0.5860524632117722, "grad_norm": 0.9265228707425465, "learning_rate": 3.996296867330296e-05, "loss": 0.5146, "step": 229 }, { "epoch": 0.5886116442738324, "grad_norm": 1.228465580948079, "learning_rate": 3.99607590490251e-05, "loss": 0.474, "step": 230 }, { "epoch": 0.5911708253358925, "grad_norm": 0.8130275473999699, "learning_rate": 3.9958485462687606e-05, "loss": 0.4767, "step": 231 }, { "epoch": 0.5937300063979527, "grad_norm": 0.780191780172052, "learning_rate": 3.995614792157592e-05, "loss": 0.5037, "step": 232 }, { "epoch": 0.5962891874600128, "grad_norm": 0.9566872727407667, "learning_rate": 3.995374643318045e-05, "loss": 0.5152, "step": 233 }, { "epoch": 0.5988483685220729, "grad_norm": 1.1646368305147163, "learning_rate": 3.9951281005196486e-05, "loss": 0.5329, "step": 234 }, { "epoch": 0.6014075495841331, "grad_norm": 0.7216132566248876, "learning_rate": 3.9948751645524235e-05, "loss": 0.5285, "step": 235 }, { "epoch": 0.6039667306461932, "grad_norm": 1.0575511476108006, "learning_rate": 3.994615836226874e-05, "loss": 0.5364, "step": 236 }, { "epoch": 0.6065259117082533, "grad_norm": 0.9622069565170775, "learning_rate": 3.994350116373991e-05, "loss": 0.5067, "step": 237 }, { "epoch": 0.6090850927703135, "grad_norm": 0.7168012454794431, "learning_rate": 3.9940780058452416e-05, "loss": 0.5012, "step": 238 }, { "epoch": 0.6116442738323736, "grad_norm": 0.8302983155644609, "learning_rate": 3.9937995055125774e-05, "loss": 0.5282, "step": 239 }, { "epoch": 0.6142034548944337, "grad_norm": 0.7497453674792344, "learning_rate": 3.9935146162684206e-05, "loss": 0.4992, "step": 240 }, { "epoch": 0.6167626359564939, "grad_norm": 0.8217939024893259, "learning_rate": 3.993223339025667e-05, "loss": 0.4895, "step": 241 }, { "epoch": 0.619321817018554, "grad_norm": 0.7219771415337329, "learning_rate": 3.992925674717683e-05, "loss": 0.4636, "step": 242 }, { "epoch": 0.6218809980806143, "grad_norm": 0.9341076990511636, "learning_rate": 3.9926216242983017e-05, "loss": 0.5008, "step": 243 }, { "epoch": 0.6244401791426744, "grad_norm": 0.7304570272364406, "learning_rate": 3.9923111887418185e-05, "loss": 0.4921, "step": 244 }, { "epoch": 0.6269993602047345, "grad_norm": 1.0471475376229655, "learning_rate": 3.9919943690429906e-05, "loss": 0.4768, "step": 245 }, { "epoch": 0.6295585412667947, "grad_norm": 0.693079198064159, "learning_rate": 3.991671166217031e-05, "loss": 0.4786, "step": 246 }, { "epoch": 0.6321177223288548, "grad_norm": 1.1561745746369312, "learning_rate": 3.991341581299609e-05, "loss": 0.5182, "step": 247 }, { "epoch": 0.6346769033909149, "grad_norm": 0.8741051048237501, "learning_rate": 3.991005615346843e-05, "loss": 0.5024, "step": 248 }, { "epoch": 0.6372360844529751, "grad_norm": 0.9086658125226366, "learning_rate": 3.990663269435298e-05, "loss": 0.4974, "step": 249 }, { "epoch": 0.6397952655150352, "grad_norm": 0.9761201164077572, "learning_rate": 3.9903145446619837e-05, "loss": 0.5112, "step": 250 }, { "epoch": 0.6423544465770953, "grad_norm": 0.8136560445992532, "learning_rate": 3.989959442144352e-05, "loss": 0.4971, "step": 251 }, { "epoch": 0.6449136276391555, "grad_norm": 0.6200675715165651, "learning_rate": 3.989597963020289e-05, "loss": 0.506, "step": 252 }, { "epoch": 0.6474728087012156, "grad_norm": 0.9727978732394552, "learning_rate": 3.989230108448115e-05, "loss": 0.5132, "step": 253 }, { "epoch": 0.6500319897632757, "grad_norm": 0.7588308378509555, "learning_rate": 3.9888558796065784e-05, "loss": 0.4948, "step": 254 }, { "epoch": 0.6525911708253359, "grad_norm": 0.8984908598875354, "learning_rate": 3.9884752776948564e-05, "loss": 0.4912, "step": 255 }, { "epoch": 0.655150351887396, "grad_norm": 0.9180268932937014, "learning_rate": 3.988088303932545e-05, "loss": 0.5058, "step": 256 }, { "epoch": 0.6577095329494562, "grad_norm": 0.9874072428401991, "learning_rate": 3.987694959559658e-05, "loss": 0.5265, "step": 257 }, { "epoch": 0.6602687140115163, "grad_norm": 0.7774534286927767, "learning_rate": 3.9872952458366267e-05, "loss": 0.5116, "step": 258 }, { "epoch": 0.6628278950735764, "grad_norm": 0.8189986183875947, "learning_rate": 3.9868891640442874e-05, "loss": 0.507, "step": 259 }, { "epoch": 0.6653870761356366, "grad_norm": 0.6489509247329256, "learning_rate": 3.9864767154838864e-05, "loss": 0.5111, "step": 260 }, { "epoch": 0.6679462571976967, "grad_norm": 0.8761101947097708, "learning_rate": 3.986057901477069e-05, "loss": 0.5127, "step": 261 }, { "epoch": 0.6705054382597568, "grad_norm": 0.9687654727768278, "learning_rate": 3.985632723365878e-05, "loss": 0.547, "step": 262 }, { "epoch": 0.673064619321817, "grad_norm": 0.7867771900738217, "learning_rate": 3.985201182512752e-05, "loss": 0.516, "step": 263 }, { "epoch": 0.6756238003838771, "grad_norm": 0.6453678386295493, "learning_rate": 3.984763280300514e-05, "loss": 0.481, "step": 264 }, { "epoch": 0.6781829814459372, "grad_norm": 0.7765079788191963, "learning_rate": 3.9843190181323744e-05, "loss": 0.4913, "step": 265 }, { "epoch": 0.6807421625079975, "grad_norm": 0.6373936761246601, "learning_rate": 3.983868397431923e-05, "loss": 0.5133, "step": 266 }, { "epoch": 0.6833013435700576, "grad_norm": 0.730921099373597, "learning_rate": 3.983411419643125e-05, "loss": 0.5339, "step": 267 }, { "epoch": 0.6858605246321178, "grad_norm": 0.8152333844378884, "learning_rate": 3.982948086230312e-05, "loss": 0.4865, "step": 268 }, { "epoch": 0.6884197056941779, "grad_norm": 0.7100466271823358, "learning_rate": 3.9824783986781897e-05, "loss": 0.49, "step": 269 }, { "epoch": 0.690978886756238, "grad_norm": 0.8934016406293627, "learning_rate": 3.982002358491817e-05, "loss": 0.5208, "step": 270 }, { "epoch": 0.6935380678182982, "grad_norm": 0.6756901306503084, "learning_rate": 3.981519967196614e-05, "loss": 0.5191, "step": 271 }, { "epoch": 0.6960972488803583, "grad_norm": 0.8009942346681957, "learning_rate": 3.98103122633835e-05, "loss": 0.5067, "step": 272 }, { "epoch": 0.6986564299424184, "grad_norm": 0.6923671981740297, "learning_rate": 3.980536137483141e-05, "loss": 0.4868, "step": 273 }, { "epoch": 0.7012156110044786, "grad_norm": 0.9566550041102404, "learning_rate": 3.980034702217445e-05, "loss": 0.5398, "step": 274 }, { "epoch": 0.7037747920665387, "grad_norm": 0.8440959667430001, "learning_rate": 3.979526922148058e-05, "loss": 0.4658, "step": 275 }, { "epoch": 0.7063339731285988, "grad_norm": 0.8588196985974998, "learning_rate": 3.9790127989021024e-05, "loss": 0.5052, "step": 276 }, { "epoch": 0.708893154190659, "grad_norm": 0.7562049758522937, "learning_rate": 3.978492334127032e-05, "loss": 0.5267, "step": 277 }, { "epoch": 0.7114523352527191, "grad_norm": 0.632945819901848, "learning_rate": 3.977965529490618e-05, "loss": 0.4618, "step": 278 }, { "epoch": 0.7140115163147792, "grad_norm": 0.845801145881355, "learning_rate": 3.9774323866809485e-05, "loss": 0.4683, "step": 279 }, { "epoch": 0.7165706973768394, "grad_norm": 0.7371708961015324, "learning_rate": 3.9768929074064206e-05, "loss": 0.5364, "step": 280 }, { "epoch": 0.7191298784388995, "grad_norm": 0.7371186918676583, "learning_rate": 3.976347093395736e-05, "loss": 0.5061, "step": 281 }, { "epoch": 0.7216890595009597, "grad_norm": 0.7032178872579914, "learning_rate": 3.9757949463978975e-05, "loss": 0.5242, "step": 282 }, { "epoch": 0.7242482405630198, "grad_norm": 0.7617726502548777, "learning_rate": 3.9752364681821973e-05, "loss": 0.4888, "step": 283 }, { "epoch": 0.7268074216250799, "grad_norm": 0.7903937882632156, "learning_rate": 3.9746716605382186e-05, "loss": 0.5203, "step": 284 }, { "epoch": 0.7293666026871402, "grad_norm": 0.7645749463474476, "learning_rate": 3.9741005252758255e-05, "loss": 0.5116, "step": 285 }, { "epoch": 0.7319257837492003, "grad_norm": 0.6220992771519867, "learning_rate": 3.973523064225159e-05, "loss": 0.4671, "step": 286 }, { "epoch": 0.7344849648112604, "grad_norm": 0.5115715364977104, "learning_rate": 3.972939279236627e-05, "loss": 0.4565, "step": 287 }, { "epoch": 0.7370441458733206, "grad_norm": 0.7021026496208411, "learning_rate": 3.9723491721809076e-05, "loss": 0.5379, "step": 288 }, { "epoch": 0.7396033269353807, "grad_norm": 0.5201603136030488, "learning_rate": 3.971752744948932e-05, "loss": 0.4692, "step": 289 }, { "epoch": 0.7421625079974408, "grad_norm": 0.8208102312039668, "learning_rate": 3.971149999451886e-05, "loss": 0.4882, "step": 290 }, { "epoch": 0.744721689059501, "grad_norm": 1.119540735969476, "learning_rate": 3.970540937621201e-05, "loss": 0.5097, "step": 291 }, { "epoch": 0.7472808701215611, "grad_norm": 0.7635743214902218, "learning_rate": 3.9699255614085495e-05, "loss": 0.5101, "step": 292 }, { "epoch": 0.7498400511836213, "grad_norm": 0.8126931797929314, "learning_rate": 3.969303872785837e-05, "loss": 0.4889, "step": 293 }, { "epoch": 0.7523992322456814, "grad_norm": 0.7624486728902423, "learning_rate": 3.9686758737451955e-05, "loss": 0.4966, "step": 294 }, { "epoch": 0.7549584133077415, "grad_norm": 0.5229197667386186, "learning_rate": 3.9680415662989806e-05, "loss": 0.4886, "step": 295 }, { "epoch": 0.7575175943698017, "grad_norm": 0.6766454901060076, "learning_rate": 3.967400952479759e-05, "loss": 0.4661, "step": 296 }, { "epoch": 0.7600767754318618, "grad_norm": 0.6030935943397303, "learning_rate": 3.966754034340308e-05, "loss": 0.5526, "step": 297 }, { "epoch": 0.7626359564939219, "grad_norm": 0.688481808856117, "learning_rate": 3.966100813953607e-05, "loss": 0.5191, "step": 298 }, { "epoch": 0.7651951375559821, "grad_norm": 0.8026340827764397, "learning_rate": 3.965441293412827e-05, "loss": 0.4875, "step": 299 }, { "epoch": 0.7677543186180422, "grad_norm": 0.5877161986618623, "learning_rate": 3.9647754748313294e-05, "loss": 0.5581, "step": 300 }, { "epoch": 0.7703134996801023, "grad_norm": 0.7230843693397704, "learning_rate": 3.964103360342658e-05, "loss": 0.4941, "step": 301 }, { "epoch": 0.7728726807421625, "grad_norm": 0.6543565273084903, "learning_rate": 3.963424952100529e-05, "loss": 0.4749, "step": 302 }, { "epoch": 0.7754318618042226, "grad_norm": 0.6083305140129942, "learning_rate": 3.962740252278827e-05, "loss": 0.506, "step": 303 }, { "epoch": 0.7779910428662828, "grad_norm": 0.6712573537812702, "learning_rate": 3.962049263071598e-05, "loss": 0.4752, "step": 304 }, { "epoch": 0.780550223928343, "grad_norm": 0.8280090720406486, "learning_rate": 3.96135198669304e-05, "loss": 0.4891, "step": 305 }, { "epoch": 0.783109404990403, "grad_norm": 0.6598880067981137, "learning_rate": 3.960648425377499e-05, "loss": 0.4947, "step": 306 }, { "epoch": 0.7856685860524633, "grad_norm": 0.7265475529158774, "learning_rate": 3.95993858137946e-05, "loss": 0.4989, "step": 307 }, { "epoch": 0.7882277671145234, "grad_norm": 0.5888767351942641, "learning_rate": 3.959222456973541e-05, "loss": 0.4829, "step": 308 }, { "epoch": 0.7907869481765835, "grad_norm": 0.6375937306972569, "learning_rate": 3.958500054454482e-05, "loss": 0.4574, "step": 309 }, { "epoch": 0.7933461292386437, "grad_norm": 0.6933840977854485, "learning_rate": 3.957771376137144e-05, "loss": 0.5059, "step": 310 }, { "epoch": 0.7959053103007038, "grad_norm": 0.6896810504909161, "learning_rate": 3.9570364243564966e-05, "loss": 0.4992, "step": 311 }, { "epoch": 0.7984644913627639, "grad_norm": 0.6468420638732157, "learning_rate": 3.9562952014676116e-05, "loss": 0.496, "step": 312 }, { "epoch": 0.8010236724248241, "grad_norm": 0.6274952773967104, "learning_rate": 3.955547709845656e-05, "loss": 0.4874, "step": 313 }, { "epoch": 0.8035828534868842, "grad_norm": 0.5653574700934775, "learning_rate": 3.9547939518858856e-05, "loss": 0.5067, "step": 314 }, { "epoch": 0.8061420345489443, "grad_norm": 0.6550905511901227, "learning_rate": 3.954033930003634e-05, "loss": 0.5212, "step": 315 }, { "epoch": 0.8087012156110045, "grad_norm": 0.7494837380638635, "learning_rate": 3.953267646634309e-05, "loss": 0.505, "step": 316 }, { "epoch": 0.8112603966730646, "grad_norm": 0.635770951888326, "learning_rate": 3.95249510423338e-05, "loss": 0.4967, "step": 317 }, { "epoch": 0.8138195777351248, "grad_norm": 0.7410641008583149, "learning_rate": 3.9517163052763756e-05, "loss": 0.4773, "step": 318 }, { "epoch": 0.8163787587971849, "grad_norm": 0.7149351574867076, "learning_rate": 3.9509312522588704e-05, "loss": 0.4709, "step": 319 }, { "epoch": 0.818937939859245, "grad_norm": 0.823519098221886, "learning_rate": 3.9501399476964806e-05, "loss": 0.4867, "step": 320 }, { "epoch": 0.8214971209213052, "grad_norm": 0.7163722784021596, "learning_rate": 3.9493423941248564e-05, "loss": 0.507, "step": 321 }, { "epoch": 0.8240563019833653, "grad_norm": 0.5792885526249222, "learning_rate": 3.948538594099668e-05, "loss": 0.4863, "step": 322 }, { "epoch": 0.8266154830454254, "grad_norm": 0.8674821812665616, "learning_rate": 3.9477285501966064e-05, "loss": 0.4497, "step": 323 }, { "epoch": 0.8291746641074856, "grad_norm": 0.6695505549743577, "learning_rate": 3.946912265011368e-05, "loss": 0.4853, "step": 324 }, { "epoch": 0.8317338451695457, "grad_norm": 0.6407618610078097, "learning_rate": 3.946089741159648e-05, "loss": 0.4742, "step": 325 }, { "epoch": 0.8342930262316058, "grad_norm": 0.8505071134562596, "learning_rate": 3.9452609812771346e-05, "loss": 0.5346, "step": 326 }, { "epoch": 0.836852207293666, "grad_norm": 0.5413673046728109, "learning_rate": 3.944425988019498e-05, "loss": 0.4677, "step": 327 }, { "epoch": 0.8394113883557262, "grad_norm": 0.9828950428531091, "learning_rate": 3.9435847640623806e-05, "loss": 0.4808, "step": 328 }, { "epoch": 0.8419705694177864, "grad_norm": 0.6480252890065408, "learning_rate": 3.942737312101394e-05, "loss": 0.5019, "step": 329 }, { "epoch": 0.8445297504798465, "grad_norm": 0.9281412616435286, "learning_rate": 3.9418836348521045e-05, "loss": 0.5069, "step": 330 }, { "epoch": 0.8470889315419066, "grad_norm": 0.6511602292237915, "learning_rate": 3.941023735050027e-05, "loss": 0.5135, "step": 331 }, { "epoch": 0.8496481126039668, "grad_norm": 0.649990025588154, "learning_rate": 3.9401576154506155e-05, "loss": 0.4721, "step": 332 }, { "epoch": 0.8522072936660269, "grad_norm": 0.8525830113834602, "learning_rate": 3.9392852788292556e-05, "loss": 0.4747, "step": 333 }, { "epoch": 0.854766474728087, "grad_norm": 0.8122595340814978, "learning_rate": 3.938406727981254e-05, "loss": 0.5036, "step": 334 }, { "epoch": 0.8573256557901472, "grad_norm": 0.6813807690997764, "learning_rate": 3.937521965721831e-05, "loss": 0.4778, "step": 335 }, { "epoch": 0.8598848368522073, "grad_norm": 0.742372369654133, "learning_rate": 3.936630994886109e-05, "loss": 0.4912, "step": 336 }, { "epoch": 0.8624440179142674, "grad_norm": 0.6932498968117697, "learning_rate": 3.9357338183291066e-05, "loss": 0.5033, "step": 337 }, { "epoch": 0.8650031989763276, "grad_norm": 0.7002201428035697, "learning_rate": 3.934830438925728e-05, "loss": 0.4843, "step": 338 }, { "epoch": 0.8675623800383877, "grad_norm": 0.6143454063707157, "learning_rate": 3.933920859570753e-05, "loss": 0.4959, "step": 339 }, { "epoch": 0.8701215611004478, "grad_norm": 0.5609771595796579, "learning_rate": 3.933005083178828e-05, "loss": 0.4778, "step": 340 }, { "epoch": 0.872680742162508, "grad_norm": 0.5872526379907818, "learning_rate": 3.932083112684459e-05, "loss": 0.4736, "step": 341 }, { "epoch": 0.8752399232245681, "grad_norm": 0.5259898431650297, "learning_rate": 3.931154951041998e-05, "loss": 0.5061, "step": 342 }, { "epoch": 0.8777991042866283, "grad_norm": 0.6462230152484912, "learning_rate": 3.930220601225638e-05, "loss": 0.503, "step": 343 }, { "epoch": 0.8803582853486884, "grad_norm": 0.6210066174968442, "learning_rate": 3.9292800662294e-05, "loss": 0.4592, "step": 344 }, { "epoch": 0.8829174664107485, "grad_norm": 0.7033811879806838, "learning_rate": 3.928333349067125e-05, "loss": 0.4839, "step": 345 }, { "epoch": 0.8854766474728087, "grad_norm": 0.7112080589811673, "learning_rate": 3.927380452772464e-05, "loss": 0.4833, "step": 346 }, { "epoch": 0.8880358285348688, "grad_norm": 0.5763115473730898, "learning_rate": 3.926421380398869e-05, "loss": 0.5128, "step": 347 }, { "epoch": 0.8905950095969289, "grad_norm": 0.493333488186968, "learning_rate": 3.925456135019582e-05, "loss": 0.4777, "step": 348 }, { "epoch": 0.8931541906589892, "grad_norm": 0.5969713695047262, "learning_rate": 3.924484719727625e-05, "loss": 0.5548, "step": 349 }, { "epoch": 0.8957133717210493, "grad_norm": 0.5743073508713653, "learning_rate": 3.923507137635792e-05, "loss": 0.4993, "step": 350 }, { "epoch": 0.8982725527831094, "grad_norm": 0.5816084208432016, "learning_rate": 3.922523391876638e-05, "loss": 0.4974, "step": 351 }, { "epoch": 0.9008317338451696, "grad_norm": 0.6913456504659746, "learning_rate": 3.921533485602467e-05, "loss": 0.5038, "step": 352 }, { "epoch": 0.9033909149072297, "grad_norm": 0.5710576834895075, "learning_rate": 3.920537421985327e-05, "loss": 0.469, "step": 353 }, { "epoch": 0.9059500959692899, "grad_norm": 0.831878724155053, "learning_rate": 3.9195352042169924e-05, "loss": 0.5178, "step": 354 }, { "epoch": 0.90850927703135, "grad_norm": 0.5623208840830399, "learning_rate": 3.9185268355089606e-05, "loss": 0.4892, "step": 355 }, { "epoch": 0.9110684580934101, "grad_norm": 0.8167633776332197, "learning_rate": 3.9175123190924384e-05, "loss": 0.5193, "step": 356 }, { "epoch": 0.9136276391554703, "grad_norm": 0.5899572202270219, "learning_rate": 3.916491658218333e-05, "loss": 0.4739, "step": 357 }, { "epoch": 0.9161868202175304, "grad_norm": 0.7206391505617225, "learning_rate": 3.9154648561572386e-05, "loss": 0.4752, "step": 358 }, { "epoch": 0.9187460012795905, "grad_norm": 0.7210244727385913, "learning_rate": 3.91443191619943e-05, "loss": 0.4789, "step": 359 }, { "epoch": 0.9213051823416507, "grad_norm": 0.7320868272015927, "learning_rate": 3.913392841654851e-05, "loss": 0.503, "step": 360 }, { "epoch": 0.9238643634037108, "grad_norm": 0.8003299130587506, "learning_rate": 3.9123476358531e-05, "loss": 0.4917, "step": 361 }, { "epoch": 0.9264235444657709, "grad_norm": 0.7467581757493919, "learning_rate": 3.911296302143426e-05, "loss": 0.4973, "step": 362 }, { "epoch": 0.9289827255278311, "grad_norm": 0.695841862059534, "learning_rate": 3.9102388438947104e-05, "loss": 0.5014, "step": 363 }, { "epoch": 0.9315419065898912, "grad_norm": 0.8452597656759123, "learning_rate": 3.909175264495464e-05, "loss": 0.4528, "step": 364 }, { "epoch": 0.9341010876519513, "grad_norm": 0.8328804446765739, "learning_rate": 3.9081055673538093e-05, "loss": 0.5014, "step": 365 }, { "epoch": 0.9366602687140115, "grad_norm": 0.8976120702595518, "learning_rate": 3.907029755897473e-05, "loss": 0.4767, "step": 366 }, { "epoch": 0.9392194497760716, "grad_norm": 0.6729987700598375, "learning_rate": 3.905947833573775e-05, "loss": 0.4758, "step": 367 }, { "epoch": 0.9417786308381318, "grad_norm": 0.965419537056689, "learning_rate": 3.904859803849617e-05, "loss": 0.4952, "step": 368 }, { "epoch": 0.944337811900192, "grad_norm": 0.8355270729808283, "learning_rate": 3.903765670211469e-05, "loss": 0.5048, "step": 369 }, { "epoch": 0.946896992962252, "grad_norm": 0.7603742900226873, "learning_rate": 3.902665436165364e-05, "loss": 0.491, "step": 370 }, { "epoch": 0.9494561740243123, "grad_norm": 0.7054418825885687, "learning_rate": 3.901559105236881e-05, "loss": 0.4654, "step": 371 }, { "epoch": 0.9520153550863724, "grad_norm": 0.6840867683040738, "learning_rate": 3.9004466809711343e-05, "loss": 0.4789, "step": 372 }, { "epoch": 0.9545745361484325, "grad_norm": 0.7107919372914869, "learning_rate": 3.8993281669327664e-05, "loss": 0.5041, "step": 373 }, { "epoch": 0.9571337172104927, "grad_norm": 0.5705893437651786, "learning_rate": 3.8982035667059327e-05, "loss": 0.4724, "step": 374 }, { "epoch": 0.9596928982725528, "grad_norm": 0.6398479886368891, "learning_rate": 3.897072883894291e-05, "loss": 0.512, "step": 375 }, { "epoch": 0.9622520793346129, "grad_norm": 0.6496978842533705, "learning_rate": 3.895936122120991e-05, "loss": 0.4998, "step": 376 }, { "epoch": 0.9648112603966731, "grad_norm": 0.6636946077689827, "learning_rate": 3.8947932850286585e-05, "loss": 0.5105, "step": 377 }, { "epoch": 0.9673704414587332, "grad_norm": 0.8146680744120419, "learning_rate": 3.893644376279392e-05, "loss": 0.5081, "step": 378 }, { "epoch": 0.9699296225207934, "grad_norm": 0.6043005275708316, "learning_rate": 3.8924893995547427e-05, "loss": 0.465, "step": 379 }, { "epoch": 0.9724888035828535, "grad_norm": 0.7737447625323293, "learning_rate": 3.8913283585557054e-05, "loss": 0.4745, "step": 380 }, { "epoch": 0.9750479846449136, "grad_norm": 0.6584366000203595, "learning_rate": 3.89016125700271e-05, "loss": 0.4646, "step": 381 }, { "epoch": 0.9776071657069738, "grad_norm": 0.7267670466528602, "learning_rate": 3.888988098635604e-05, "loss": 0.5443, "step": 382 }, { "epoch": 0.9801663467690339, "grad_norm": 0.8281518075048059, "learning_rate": 3.8878088872136446e-05, "loss": 0.5175, "step": 383 }, { "epoch": 0.982725527831094, "grad_norm": 0.7524480266101858, "learning_rate": 3.8866236265154864e-05, "loss": 0.4752, "step": 384 }, { "epoch": 0.9852847088931542, "grad_norm": 0.8364410578252669, "learning_rate": 3.885432320339167e-05, "loss": 0.4752, "step": 385 }, { "epoch": 0.9878438899552143, "grad_norm": 0.684899934501505, "learning_rate": 3.884234972502095e-05, "loss": 0.4931, "step": 386 }, { "epoch": 0.9904030710172744, "grad_norm": 0.6158115167694288, "learning_rate": 3.88303158684104e-05, "loss": 0.4861, "step": 387 }, { "epoch": 0.9929622520793346, "grad_norm": 0.7078911086493701, "learning_rate": 3.8818221672121204e-05, "loss": 0.4921, "step": 388 }, { "epoch": 0.9955214331413947, "grad_norm": 0.5372294629344029, "learning_rate": 3.8806067174907876e-05, "loss": 0.513, "step": 389 }, { "epoch": 0.9980806142034548, "grad_norm": 0.7744974417312304, "learning_rate": 3.879385241571817e-05, "loss": 0.4669, "step": 390 }, { "epoch": 1.000639795265515, "grad_norm": 0.649847374858299, "learning_rate": 3.878157743369294e-05, "loss": 0.6153, "step": 391 }, { "epoch": 1.0031989763275753, "grad_norm": 0.7557922196471385, "learning_rate": 3.876924226816602e-05, "loss": 0.3771, "step": 392 }, { "epoch": 1.0057581573896353, "grad_norm": 0.720395789720446, "learning_rate": 3.875684695866409e-05, "loss": 0.4498, "step": 393 }, { "epoch": 1.0083173384516955, "grad_norm": 0.6684707254214445, "learning_rate": 3.874439154490656e-05, "loss": 0.4581, "step": 394 }, { "epoch": 1.0108765195137557, "grad_norm": 0.6845617164901984, "learning_rate": 3.873187606680543e-05, "loss": 0.4382, "step": 395 }, { "epoch": 1.0134357005758157, "grad_norm": 0.6687231254947439, "learning_rate": 3.871930056446518e-05, "loss": 0.3945, "step": 396 }, { "epoch": 1.0159948816378759, "grad_norm": 0.8306778742589912, "learning_rate": 3.870666507818262e-05, "loss": 0.4194, "step": 397 }, { "epoch": 1.018554062699936, "grad_norm": 0.7593090405257436, "learning_rate": 3.869396964844679e-05, "loss": 0.4378, "step": 398 }, { "epoch": 1.021113243761996, "grad_norm": 0.545915139395705, "learning_rate": 3.8681214315938786e-05, "loss": 0.4009, "step": 399 }, { "epoch": 1.0236724248240563, "grad_norm": 0.8728350624043225, "learning_rate": 3.866839912153168e-05, "loss": 0.4239, "step": 400 }, { "epoch": 1.0262316058861165, "grad_norm": 0.7685743303640589, "learning_rate": 3.8655524106290345e-05, "loss": 0.4433, "step": 401 }, { "epoch": 1.0287907869481765, "grad_norm": 0.6436857445045538, "learning_rate": 3.864258931147136e-05, "loss": 0.4135, "step": 402 }, { "epoch": 1.0313499680102367, "grad_norm": 0.6043825799230796, "learning_rate": 3.862959477852285e-05, "loss": 0.4511, "step": 403 }, { "epoch": 1.033909149072297, "grad_norm": 0.864882027042227, "learning_rate": 3.8616540549084366e-05, "loss": 0.4281, "step": 404 }, { "epoch": 1.036468330134357, "grad_norm": 0.5669229687455682, "learning_rate": 3.860342666498677e-05, "loss": 0.4265, "step": 405 }, { "epoch": 1.0390275111964171, "grad_norm": 1.092925757327788, "learning_rate": 3.859025316825204e-05, "loss": 0.4171, "step": 406 }, { "epoch": 1.0415866922584773, "grad_norm": 0.6235272525172618, "learning_rate": 3.8577020101093214e-05, "loss": 0.3889, "step": 407 }, { "epoch": 1.0441458733205373, "grad_norm": 0.7044477218033379, "learning_rate": 3.856372750591419e-05, "loss": 0.4268, "step": 408 }, { "epoch": 1.0467050543825975, "grad_norm": 0.6758623607211073, "learning_rate": 3.8550375425309643e-05, "loss": 0.376, "step": 409 }, { "epoch": 1.0492642354446577, "grad_norm": 0.703616580836483, "learning_rate": 3.853696390206484e-05, "loss": 0.4782, "step": 410 }, { "epoch": 1.051823416506718, "grad_norm": 0.7120884633418646, "learning_rate": 3.8523492979155534e-05, "loss": 0.4156, "step": 411 }, { "epoch": 1.054382597568778, "grad_norm": 0.7074414036356858, "learning_rate": 3.850996269974782e-05, "loss": 0.4044, "step": 412 }, { "epoch": 1.0569417786308382, "grad_norm": 0.6805988413911291, "learning_rate": 3.849637310719799e-05, "loss": 0.4659, "step": 413 }, { "epoch": 1.0595009596928984, "grad_norm": 0.751339193242575, "learning_rate": 3.84827242450524e-05, "loss": 0.4362, "step": 414 }, { "epoch": 1.0620601407549584, "grad_norm": 0.8444583705618753, "learning_rate": 3.846901615704734e-05, "loss": 0.4671, "step": 415 }, { "epoch": 1.0646193218170186, "grad_norm": 0.5494970933209252, "learning_rate": 3.845524888710885e-05, "loss": 0.4192, "step": 416 }, { "epoch": 1.0671785028790788, "grad_norm": 0.6871657555654107, "learning_rate": 3.844142247935265e-05, "loss": 0.4392, "step": 417 }, { "epoch": 1.0697376839411388, "grad_norm": 0.6255544924834299, "learning_rate": 3.842753697808395e-05, "loss": 0.4098, "step": 418 }, { "epoch": 1.072296865003199, "grad_norm": 0.5236343059225219, "learning_rate": 3.84135924277973e-05, "loss": 0.4039, "step": 419 }, { "epoch": 1.0748560460652592, "grad_norm": 0.7081136566843064, "learning_rate": 3.839958887317649e-05, "loss": 0.42, "step": 420 }, { "epoch": 1.0774152271273192, "grad_norm": 0.630614377419562, "learning_rate": 3.838552635909436e-05, "loss": 0.4065, "step": 421 }, { "epoch": 1.0799744081893794, "grad_norm": 0.5615204386321961, "learning_rate": 3.8371404930612704e-05, "loss": 0.4146, "step": 422 }, { "epoch": 1.0825335892514396, "grad_norm": 0.6565436867003143, "learning_rate": 3.835722463298208e-05, "loss": 0.4064, "step": 423 }, { "epoch": 1.0850927703134996, "grad_norm": 0.5737679071890739, "learning_rate": 3.83429855116417e-05, "loss": 0.4552, "step": 424 }, { "epoch": 1.0876519513755598, "grad_norm": 0.6434114879042129, "learning_rate": 3.832868761221926e-05, "loss": 0.4441, "step": 425 }, { "epoch": 1.09021113243762, "grad_norm": 0.6708602456648777, "learning_rate": 3.831433098053082e-05, "loss": 0.4022, "step": 426 }, { "epoch": 1.09277031349968, "grad_norm": 0.623494522486998, "learning_rate": 3.829991566258061e-05, "loss": 0.4043, "step": 427 }, { "epoch": 1.0953294945617402, "grad_norm": 0.698870377709763, "learning_rate": 3.828544170456094e-05, "loss": 0.4559, "step": 428 }, { "epoch": 1.0978886756238004, "grad_norm": 0.6409469981980804, "learning_rate": 3.827090915285202e-05, "loss": 0.423, "step": 429 }, { "epoch": 1.1004478566858604, "grad_norm": 0.7366884864992727, "learning_rate": 3.825631805402182e-05, "loss": 0.4878, "step": 430 }, { "epoch": 1.1030070377479206, "grad_norm": 0.6346910715057278, "learning_rate": 3.824166845482591e-05, "loss": 0.3875, "step": 431 }, { "epoch": 1.1055662188099808, "grad_norm": 0.598625329081273, "learning_rate": 3.8226960402207316e-05, "loss": 0.4201, "step": 432 }, { "epoch": 1.108125399872041, "grad_norm": 0.7955423553177828, "learning_rate": 3.821219394329638e-05, "loss": 0.468, "step": 433 }, { "epoch": 1.110684580934101, "grad_norm": 0.6767598095676679, "learning_rate": 3.81973691254106e-05, "loss": 0.4104, "step": 434 }, { "epoch": 1.1132437619961613, "grad_norm": 0.669121013587649, "learning_rate": 3.818248599605448e-05, "loss": 0.3625, "step": 435 }, { "epoch": 1.1158029430582213, "grad_norm": 0.9836919729491361, "learning_rate": 3.816754460291936e-05, "loss": 0.4852, "step": 436 }, { "epoch": 1.1183621241202815, "grad_norm": 0.9574882546263436, "learning_rate": 3.8152544993883305e-05, "loss": 0.4003, "step": 437 }, { "epoch": 1.1209213051823417, "grad_norm": 0.6647251327930666, "learning_rate": 3.813748721701091e-05, "loss": 0.4202, "step": 438 }, { "epoch": 1.1234804862444019, "grad_norm": 1.05006176773705, "learning_rate": 3.812237132055317e-05, "loss": 0.4341, "step": 439 }, { "epoch": 1.1260396673064619, "grad_norm": 0.7815506825876288, "learning_rate": 3.810719735294731e-05, "loss": 0.4748, "step": 440 }, { "epoch": 1.128598848368522, "grad_norm": 1.0347544390312602, "learning_rate": 3.809196536281665e-05, "loss": 0.4248, "step": 441 }, { "epoch": 1.1311580294305823, "grad_norm": 0.8757539929940961, "learning_rate": 3.807667539897041e-05, "loss": 0.3786, "step": 442 }, { "epoch": 1.1337172104926423, "grad_norm": 0.869699779958049, "learning_rate": 3.8061327510403624e-05, "loss": 0.4397, "step": 443 }, { "epoch": 1.1362763915547025, "grad_norm": 1.0544887595956864, "learning_rate": 3.80459217462969e-05, "loss": 0.4333, "step": 444 }, { "epoch": 1.1388355726167627, "grad_norm": 0.7148540444466959, "learning_rate": 3.8030458156016326e-05, "loss": 0.432, "step": 445 }, { "epoch": 1.1413947536788227, "grad_norm": 0.8417941713288487, "learning_rate": 3.801493678911326e-05, "loss": 0.4414, "step": 446 }, { "epoch": 1.143953934740883, "grad_norm": 0.9182856799978838, "learning_rate": 3.799935769532425e-05, "loss": 0.4318, "step": 447 }, { "epoch": 1.1465131158029431, "grad_norm": 0.6142048538916006, "learning_rate": 3.798372092457076e-05, "loss": 0.3898, "step": 448 }, { "epoch": 1.1490722968650031, "grad_norm": 0.8780972905895119, "learning_rate": 3.796802652695911e-05, "loss": 0.5123, "step": 449 }, { "epoch": 1.1516314779270633, "grad_norm": 0.7702566431359069, "learning_rate": 3.795227455278029e-05, "loss": 0.3752, "step": 450 }, { "epoch": 1.1541906589891235, "grad_norm": 0.6619464755572778, "learning_rate": 3.7936465052509744e-05, "loss": 0.4028, "step": 451 }, { "epoch": 1.1567498400511835, "grad_norm": 0.8748332649924442, "learning_rate": 3.79205980768073e-05, "loss": 0.4178, "step": 452 }, { "epoch": 1.1593090211132437, "grad_norm": 0.5921826549497154, "learning_rate": 3.790467367651694e-05, "loss": 0.4034, "step": 453 }, { "epoch": 1.161868202175304, "grad_norm": 0.8231905572303784, "learning_rate": 3.788869190266664e-05, "loss": 0.4934, "step": 454 }, { "epoch": 1.164427383237364, "grad_norm": 0.7187628790605729, "learning_rate": 3.787265280646825e-05, "loss": 0.4113, "step": 455 }, { "epoch": 1.1669865642994242, "grad_norm": 0.6550020914082988, "learning_rate": 3.785655643931728e-05, "loss": 0.4038, "step": 456 }, { "epoch": 1.1695457453614844, "grad_norm": 0.7857598212931832, "learning_rate": 3.784040285279279e-05, "loss": 0.4083, "step": 457 }, { "epoch": 1.1721049264235446, "grad_norm": 0.6478292689928322, "learning_rate": 3.782419209865716e-05, "loss": 0.387, "step": 458 }, { "epoch": 1.1746641074856046, "grad_norm": 0.6645094823023456, "learning_rate": 3.780792422885597e-05, "loss": 0.3904, "step": 459 }, { "epoch": 1.1772232885476648, "grad_norm": 0.6316724059123985, "learning_rate": 3.7791599295517825e-05, "loss": 0.4225, "step": 460 }, { "epoch": 1.1797824696097248, "grad_norm": 0.5440033363984303, "learning_rate": 3.777521735095418e-05, "loss": 0.4116, "step": 461 }, { "epoch": 1.182341650671785, "grad_norm": 0.7448918620076476, "learning_rate": 3.7758778447659184e-05, "loss": 0.4272, "step": 462 }, { "epoch": 1.1849008317338452, "grad_norm": 0.5835830556541663, "learning_rate": 3.774228263830948e-05, "loss": 0.3958, "step": 463 }, { "epoch": 1.1874600127959054, "grad_norm": 0.5982038873563293, "learning_rate": 3.772572997576409e-05, "loss": 0.4053, "step": 464 }, { "epoch": 1.1900191938579654, "grad_norm": 0.5062712599156436, "learning_rate": 3.7709120513064196e-05, "loss": 0.3874, "step": 465 }, { "epoch": 1.1925783749200256, "grad_norm": 0.606586616558905, "learning_rate": 3.769245430343301e-05, "loss": 0.4528, "step": 466 }, { "epoch": 1.1951375559820858, "grad_norm": 0.7220231131524425, "learning_rate": 3.767573140027556e-05, "loss": 0.433, "step": 467 }, { "epoch": 1.1976967370441458, "grad_norm": 0.6706551361731272, "learning_rate": 3.7658951857178544e-05, "loss": 0.443, "step": 468 }, { "epoch": 1.200255918106206, "grad_norm": 0.5717527361383908, "learning_rate": 3.764211572791017e-05, "loss": 0.4669, "step": 469 }, { "epoch": 1.2028150991682662, "grad_norm": 0.8106231588399798, "learning_rate": 3.762522306641998e-05, "loss": 0.406, "step": 470 }, { "epoch": 1.2053742802303262, "grad_norm": 0.5810325663311328, "learning_rate": 3.760827392683863e-05, "loss": 0.4304, "step": 471 }, { "epoch": 1.2079334612923864, "grad_norm": 0.8071567509837944, "learning_rate": 3.759126836347779e-05, "loss": 0.4044, "step": 472 }, { "epoch": 1.2104926423544466, "grad_norm": 0.700979544013499, "learning_rate": 3.757420643082991e-05, "loss": 0.4397, "step": 473 }, { "epoch": 1.2130518234165066, "grad_norm": 0.5863406678085621, "learning_rate": 3.755708818356809e-05, "loss": 0.4099, "step": 474 }, { "epoch": 1.2156110044785668, "grad_norm": 0.6016554944675842, "learning_rate": 3.7539913676545874e-05, "loss": 0.4107, "step": 475 }, { "epoch": 1.218170185540627, "grad_norm": 0.5831332187240583, "learning_rate": 3.7522682964797066e-05, "loss": 0.4023, "step": 476 }, { "epoch": 1.220729366602687, "grad_norm": 0.6093541377057935, "learning_rate": 3.75053961035356e-05, "loss": 0.4301, "step": 477 }, { "epoch": 1.2232885476647473, "grad_norm": 0.6465778983982824, "learning_rate": 3.748805314815532e-05, "loss": 0.3933, "step": 478 }, { "epoch": 1.2258477287268075, "grad_norm": 0.5808579764017139, "learning_rate": 3.7470654154229834e-05, "loss": 0.4386, "step": 479 }, { "epoch": 1.2284069097888675, "grad_norm": 0.7856607698111114, "learning_rate": 3.745319917751229e-05, "loss": 0.4201, "step": 480 }, { "epoch": 1.2309660908509277, "grad_norm": 0.5050539819501365, "learning_rate": 3.743568827393525e-05, "loss": 0.4773, "step": 481 }, { "epoch": 1.2335252719129879, "grad_norm": 0.8385075664472458, "learning_rate": 3.741812149961049e-05, "loss": 0.4041, "step": 482 }, { "epoch": 1.236084452975048, "grad_norm": 0.624705443326678, "learning_rate": 3.740049891082879e-05, "loss": 0.4157, "step": 483 }, { "epoch": 1.238643634037108, "grad_norm": 0.5436565794572649, "learning_rate": 3.738282056405981e-05, "loss": 0.3959, "step": 484 }, { "epoch": 1.2412028150991683, "grad_norm": 0.785730847073682, "learning_rate": 3.736508651595188e-05, "loss": 0.4413, "step": 485 }, { "epoch": 1.2437619961612283, "grad_norm": 0.5059153650301063, "learning_rate": 3.734729682333179e-05, "loss": 0.4033, "step": 486 }, { "epoch": 1.2463211772232885, "grad_norm": 0.609111900762071, "learning_rate": 3.732945154320467e-05, "loss": 0.4282, "step": 487 }, { "epoch": 1.2488803582853487, "grad_norm": 0.5621990676055562, "learning_rate": 3.731155073275375e-05, "loss": 0.411, "step": 488 }, { "epoch": 1.251439539347409, "grad_norm": 0.5476414987684265, "learning_rate": 3.729359444934022e-05, "loss": 0.4217, "step": 489 }, { "epoch": 1.253998720409469, "grad_norm": 0.5674756992559286, "learning_rate": 3.727558275050301e-05, "loss": 0.461, "step": 490 }, { "epoch": 1.2565579014715291, "grad_norm": 0.6949881690129194, "learning_rate": 3.725751569395863e-05, "loss": 0.4621, "step": 491 }, { "epoch": 1.2591170825335891, "grad_norm": 0.5458610089583471, "learning_rate": 3.723939333760099e-05, "loss": 0.4508, "step": 492 }, { "epoch": 1.2616762635956493, "grad_norm": 0.5906724584503589, "learning_rate": 3.7221215739501176e-05, "loss": 0.4276, "step": 493 }, { "epoch": 1.2642354446577095, "grad_norm": 0.4800178261816677, "learning_rate": 3.720298295790732e-05, "loss": 0.3921, "step": 494 }, { "epoch": 1.2667946257197698, "grad_norm": 0.6636123062743209, "learning_rate": 3.718469505124434e-05, "loss": 0.4584, "step": 495 }, { "epoch": 1.2693538067818297, "grad_norm": 0.5232376990167991, "learning_rate": 3.716635207811385e-05, "loss": 0.4168, "step": 496 }, { "epoch": 1.27191298784389, "grad_norm": 0.6767875708314827, "learning_rate": 3.714795409729388e-05, "loss": 0.4379, "step": 497 }, { "epoch": 1.2744721689059502, "grad_norm": 0.5605992428107962, "learning_rate": 3.712950116773875e-05, "loss": 0.4074, "step": 498 }, { "epoch": 1.2770313499680102, "grad_norm": 0.5932800163478181, "learning_rate": 3.711099334857884e-05, "loss": 0.4194, "step": 499 }, { "epoch": 1.2795905310300704, "grad_norm": 0.5438179500986763, "learning_rate": 3.709243069912041e-05, "loss": 0.3917, "step": 500 }, { "epoch": 1.2821497120921306, "grad_norm": 0.7190898552769511, "learning_rate": 3.707381327884545e-05, "loss": 0.4717, "step": 501 }, { "epoch": 1.2847088931541908, "grad_norm": 0.6816367378411385, "learning_rate": 3.705514114741142e-05, "loss": 0.3782, "step": 502 }, { "epoch": 1.2872680742162508, "grad_norm": 0.5116844048821922, "learning_rate": 3.703641436465114e-05, "loss": 0.4225, "step": 503 }, { "epoch": 1.289827255278311, "grad_norm": 0.6736005325770137, "learning_rate": 3.70176329905725e-05, "loss": 0.4444, "step": 504 }, { "epoch": 1.292386436340371, "grad_norm": 0.5864619464379411, "learning_rate": 3.699879708535838e-05, "loss": 0.4354, "step": 505 }, { "epoch": 1.2949456174024312, "grad_norm": 0.6078408475459052, "learning_rate": 3.6979906709366334e-05, "loss": 0.453, "step": 506 }, { "epoch": 1.2975047984644914, "grad_norm": 0.7409797972431034, "learning_rate": 3.696096192312852e-05, "loss": 0.4365, "step": 507 }, { "epoch": 1.3000639795265516, "grad_norm": 0.4775651025928874, "learning_rate": 3.694196278735142e-05, "loss": 0.4391, "step": 508 }, { "epoch": 1.3026231605886116, "grad_norm": 0.5823632608853917, "learning_rate": 3.692290936291568e-05, "loss": 0.3875, "step": 509 }, { "epoch": 1.3051823416506718, "grad_norm": 0.5307154121726435, "learning_rate": 3.69038017108759e-05, "loss": 0.4059, "step": 510 }, { "epoch": 1.3077415227127318, "grad_norm": 0.5835304311686238, "learning_rate": 3.688463989246045e-05, "loss": 0.4505, "step": 511 }, { "epoch": 1.310300703774792, "grad_norm": 0.5459622175115579, "learning_rate": 3.686542396907128e-05, "loss": 0.3994, "step": 512 }, { "epoch": 1.3128598848368522, "grad_norm": 0.5266102200698167, "learning_rate": 3.6846154002283696e-05, "loss": 0.3954, "step": 513 }, { "epoch": 1.3154190658989124, "grad_norm": 0.49436862141147236, "learning_rate": 3.68268300538462e-05, "loss": 0.4417, "step": 514 }, { "epoch": 1.3179782469609724, "grad_norm": 0.5981532644758176, "learning_rate": 3.680745218568026e-05, "loss": 0.4382, "step": 515 }, { "epoch": 1.3205374280230326, "grad_norm": 0.4757174461989479, "learning_rate": 3.678802045988012e-05, "loss": 0.3686, "step": 516 }, { "epoch": 1.3230966090850926, "grad_norm": 0.6321671394432529, "learning_rate": 3.676853493871262e-05, "loss": 0.4418, "step": 517 }, { "epoch": 1.3256557901471528, "grad_norm": 0.44736313515673864, "learning_rate": 3.674899568461696e-05, "loss": 0.4235, "step": 518 }, { "epoch": 1.328214971209213, "grad_norm": 0.5494531167802622, "learning_rate": 3.6729402760204535e-05, "loss": 0.4069, "step": 519 }, { "epoch": 1.3307741522712733, "grad_norm": 0.5068999873204574, "learning_rate": 3.6709756228258735e-05, "loss": 0.4284, "step": 520 }, { "epoch": 1.3333333333333333, "grad_norm": 0.506062505003235, "learning_rate": 3.669005615173469e-05, "loss": 0.4438, "step": 521 }, { "epoch": 1.3358925143953935, "grad_norm": 0.6379310859811307, "learning_rate": 3.667030259375915e-05, "loss": 0.4142, "step": 522 }, { "epoch": 1.3384516954574537, "grad_norm": 0.42277797644227044, "learning_rate": 3.665049561763021e-05, "loss": 0.3805, "step": 523 }, { "epoch": 1.3410108765195137, "grad_norm": 0.6090026974823245, "learning_rate": 3.663063528681716e-05, "loss": 0.4136, "step": 524 }, { "epoch": 1.3435700575815739, "grad_norm": 0.46562372404588254, "learning_rate": 3.6610721664960236e-05, "loss": 0.4354, "step": 525 }, { "epoch": 1.346129238643634, "grad_norm": 0.6500313998419536, "learning_rate": 3.659075481587046e-05, "loss": 0.4283, "step": 526 }, { "epoch": 1.3486884197056943, "grad_norm": 0.5211253922160387, "learning_rate": 3.65707348035294e-05, "loss": 0.4255, "step": 527 }, { "epoch": 1.3512476007677543, "grad_norm": 0.6220670330163766, "learning_rate": 3.6550661692089e-05, "loss": 0.4191, "step": 528 }, { "epoch": 1.3538067818298145, "grad_norm": 0.544711113768934, "learning_rate": 3.6530535545871326e-05, "loss": 0.436, "step": 529 }, { "epoch": 1.3563659628918745, "grad_norm": 0.7052970063859283, "learning_rate": 3.65103564293684e-05, "loss": 0.4949, "step": 530 }, { "epoch": 1.3589251439539347, "grad_norm": 0.5145642841555808, "learning_rate": 3.6490124407242007e-05, "loss": 0.4131, "step": 531 }, { "epoch": 1.361484325015995, "grad_norm": 0.5857771172798699, "learning_rate": 3.646983954432342e-05, "loss": 0.4146, "step": 532 }, { "epoch": 1.3640435060780551, "grad_norm": 0.4920261736044566, "learning_rate": 3.644950190561325e-05, "loss": 0.4284, "step": 533 }, { "epoch": 1.3666026871401151, "grad_norm": 0.5687057750503046, "learning_rate": 3.642911155628124e-05, "loss": 0.4514, "step": 534 }, { "epoch": 1.3691618682021753, "grad_norm": 0.5724699735691123, "learning_rate": 3.640866856166601e-05, "loss": 0.4539, "step": 535 }, { "epoch": 1.3717210492642353, "grad_norm": 0.5749209613889618, "learning_rate": 3.6388172987274913e-05, "loss": 0.3865, "step": 536 }, { "epoch": 1.3742802303262955, "grad_norm": 0.6228395395499405, "learning_rate": 3.636762489878374e-05, "loss": 0.4075, "step": 537 }, { "epoch": 1.3768394113883557, "grad_norm": 0.6903037733166263, "learning_rate": 3.63470243620366e-05, "loss": 0.4312, "step": 538 }, { "epoch": 1.379398592450416, "grad_norm": 0.678573324042214, "learning_rate": 3.632637144304565e-05, "loss": 0.4806, "step": 539 }, { "epoch": 1.381957773512476, "grad_norm": 0.6951420732428104, "learning_rate": 3.6305666207990886e-05, "loss": 0.439, "step": 540 }, { "epoch": 1.3845169545745362, "grad_norm": 0.5961242888653673, "learning_rate": 3.628490872321998e-05, "loss": 0.4205, "step": 541 }, { "epoch": 1.3870761356365962, "grad_norm": 0.7217418974601812, "learning_rate": 3.626409905524799e-05, "loss": 0.4707, "step": 542 }, { "epoch": 1.3896353166986564, "grad_norm": 0.45054510458685476, "learning_rate": 3.624323727075723e-05, "loss": 0.4145, "step": 543 }, { "epoch": 1.3921944977607166, "grad_norm": 0.7284286885213026, "learning_rate": 3.622232343659698e-05, "loss": 0.4299, "step": 544 }, { "epoch": 1.3947536788227768, "grad_norm": 0.6769810527885796, "learning_rate": 3.6201357619783336e-05, "loss": 0.4163, "step": 545 }, { "epoch": 1.3973128598848368, "grad_norm": 0.5221114603904703, "learning_rate": 3.6180339887498953e-05, "loss": 0.4443, "step": 546 }, { "epoch": 1.399872040946897, "grad_norm": 0.6008896686645977, "learning_rate": 3.615927030709284e-05, "loss": 0.4318, "step": 547 }, { "epoch": 1.4024312220089572, "grad_norm": 0.5343608825415525, "learning_rate": 3.613814894608016e-05, "loss": 0.4623, "step": 548 }, { "epoch": 1.4049904030710172, "grad_norm": 0.691771745080796, "learning_rate": 3.6116975872141984e-05, "loss": 0.4624, "step": 549 }, { "epoch": 1.4075495841330774, "grad_norm": 0.6259414433121822, "learning_rate": 3.609575115312511e-05, "loss": 0.4508, "step": 550 }, { "epoch": 1.4101087651951376, "grad_norm": 0.6649399879213752, "learning_rate": 3.607447485704182e-05, "loss": 0.4143, "step": 551 }, { "epoch": 1.4126679462571978, "grad_norm": 0.47253136462127165, "learning_rate": 3.605314705206966e-05, "loss": 0.4106, "step": 552 }, { "epoch": 1.4152271273192578, "grad_norm": 0.5556693107195737, "learning_rate": 3.603176780655124e-05, "loss": 0.4616, "step": 553 }, { "epoch": 1.417786308381318, "grad_norm": 0.4021083541093729, "learning_rate": 3.601033718899401e-05, "loss": 0.3928, "step": 554 }, { "epoch": 1.420345489443378, "grad_norm": 0.5351580026934937, "learning_rate": 3.598885526807003e-05, "loss": 0.4661, "step": 555 }, { "epoch": 1.4229046705054382, "grad_norm": 0.5247684064361327, "learning_rate": 3.596732211261574e-05, "loss": 0.4303, "step": 556 }, { "epoch": 1.4254638515674984, "grad_norm": 0.49894549324782106, "learning_rate": 3.594573779163179e-05, "loss": 0.3938, "step": 557 }, { "epoch": 1.4280230326295587, "grad_norm": 0.5751022801506976, "learning_rate": 3.5924102374282754e-05, "loss": 0.4401, "step": 558 }, { "epoch": 1.4305822136916186, "grad_norm": 0.4560617808969142, "learning_rate": 3.590241592989696e-05, "loss": 0.4241, "step": 559 }, { "epoch": 1.4331413947536789, "grad_norm": 0.6245655178581035, "learning_rate": 3.5880678527966224e-05, "loss": 0.4138, "step": 560 }, { "epoch": 1.4357005758157388, "grad_norm": 0.5355963534405355, "learning_rate": 3.5858890238145674e-05, "loss": 0.4145, "step": 561 }, { "epoch": 1.438259756877799, "grad_norm": 0.6422997737126074, "learning_rate": 3.583705113025348e-05, "loss": 0.4554, "step": 562 }, { "epoch": 1.4408189379398593, "grad_norm": 0.6761962261433756, "learning_rate": 3.581516127427068e-05, "loss": 0.4176, "step": 563 }, { "epoch": 1.4433781190019195, "grad_norm": 0.6187534229527232, "learning_rate": 3.5793220740340904e-05, "loss": 0.4255, "step": 564 }, { "epoch": 1.4459373000639795, "grad_norm": 0.5255489769173007, "learning_rate": 3.577122959877017e-05, "loss": 0.4147, "step": 565 }, { "epoch": 1.4484964811260397, "grad_norm": 0.5786362132356377, "learning_rate": 3.57491879200267e-05, "loss": 0.4018, "step": 566 }, { "epoch": 1.4510556621880997, "grad_norm": 0.5297316242089579, "learning_rate": 3.572709577474062e-05, "loss": 0.4446, "step": 567 }, { "epoch": 1.4536148432501599, "grad_norm": 0.5131558041266011, "learning_rate": 3.570495323370378e-05, "loss": 0.4475, "step": 568 }, { "epoch": 1.45617402431222, "grad_norm": 0.7521443480114743, "learning_rate": 3.568276036786952e-05, "loss": 0.4091, "step": 569 }, { "epoch": 1.4587332053742803, "grad_norm": 0.5969703958136027, "learning_rate": 3.566051724835245e-05, "loss": 0.4283, "step": 570 }, { "epoch": 1.4612923864363403, "grad_norm": 0.4037709403552394, "learning_rate": 3.5638223946428194e-05, "loss": 0.4271, "step": 571 }, { "epoch": 1.4638515674984005, "grad_norm": 0.5680529949916449, "learning_rate": 3.561588053353319e-05, "loss": 0.4253, "step": 572 }, { "epoch": 1.4664107485604607, "grad_norm": 0.4379512610916079, "learning_rate": 3.559348708126445e-05, "loss": 0.3955, "step": 573 }, { "epoch": 1.4689699296225207, "grad_norm": 0.6425572888305521, "learning_rate": 3.557104366137934e-05, "loss": 0.4208, "step": 574 }, { "epoch": 1.471529110684581, "grad_norm": 0.5147430638168887, "learning_rate": 3.554855034579532e-05, "loss": 0.4206, "step": 575 }, { "epoch": 1.4740882917466411, "grad_norm": 0.5773864382218094, "learning_rate": 3.552600720658976e-05, "loss": 0.3936, "step": 576 }, { "epoch": 1.4766474728087013, "grad_norm": 0.5531560341828542, "learning_rate": 3.550341431599967e-05, "loss": 0.4674, "step": 577 }, { "epoch": 1.4792066538707613, "grad_norm": 0.5447609179432047, "learning_rate": 3.5480771746421494e-05, "loss": 0.4032, "step": 578 }, { "epoch": 1.4817658349328215, "grad_norm": 0.572196110734813, "learning_rate": 3.545807957041084e-05, "loss": 0.4509, "step": 579 }, { "epoch": 1.4843250159948815, "grad_norm": 0.5028398333246705, "learning_rate": 3.5435337860682304e-05, "loss": 0.3334, "step": 580 }, { "epoch": 1.4868841970569417, "grad_norm": 0.7160945031568345, "learning_rate": 3.54125466901092e-05, "loss": 0.4598, "step": 581 }, { "epoch": 1.489443378119002, "grad_norm": 0.6791351765101247, "learning_rate": 3.538970613172332e-05, "loss": 0.4055, "step": 582 }, { "epoch": 1.4920025591810622, "grad_norm": 0.8140085732718708, "learning_rate": 3.536681625871474e-05, "loss": 0.3982, "step": 583 }, { "epoch": 1.4945617402431222, "grad_norm": 0.6600156473699682, "learning_rate": 3.534387714443153e-05, "loss": 0.4283, "step": 584 }, { "epoch": 1.4971209213051824, "grad_norm": 0.9072053140385462, "learning_rate": 3.532088886237956e-05, "loss": 0.461, "step": 585 }, { "epoch": 1.4996801023672424, "grad_norm": 0.5291141400904751, "learning_rate": 3.5297851486222274e-05, "loss": 0.4105, "step": 586 }, { "epoch": 1.5022392834293026, "grad_norm": 0.8711742022509097, "learning_rate": 3.527476508978039e-05, "loss": 0.4266, "step": 587 }, { "epoch": 1.5047984644913628, "grad_norm": 0.5011541715943939, "learning_rate": 3.525162974703174e-05, "loss": 0.4681, "step": 588 }, { "epoch": 1.507357645553423, "grad_norm": 0.7884765548368484, "learning_rate": 3.5228445532110996e-05, "loss": 0.4341, "step": 589 }, { "epoch": 1.5099168266154832, "grad_norm": 0.5740501726759667, "learning_rate": 3.520521251930941e-05, "loss": 0.4128, "step": 590 }, { "epoch": 1.5124760076775432, "grad_norm": 0.5460891296171442, "learning_rate": 3.518193078307463e-05, "loss": 0.4188, "step": 591 }, { "epoch": 1.5150351887396032, "grad_norm": 0.7055940264802735, "learning_rate": 3.515860039801043e-05, "loss": 0.3965, "step": 592 }, { "epoch": 1.5175943698016634, "grad_norm": 0.5874392809254365, "learning_rate": 3.513522143887645e-05, "loss": 0.4918, "step": 593 }, { "epoch": 1.5201535508637236, "grad_norm": 0.6245938510885568, "learning_rate": 3.5111793980588006e-05, "loss": 0.4285, "step": 594 }, { "epoch": 1.5227127319257838, "grad_norm": 0.48774952340541083, "learning_rate": 3.5088318098215805e-05, "loss": 0.4013, "step": 595 }, { "epoch": 1.525271912987844, "grad_norm": 0.4468453631601922, "learning_rate": 3.506479386698575e-05, "loss": 0.3958, "step": 596 }, { "epoch": 1.527831094049904, "grad_norm": 0.5538376365976897, "learning_rate": 3.5041221362278644e-05, "loss": 0.4347, "step": 597 }, { "epoch": 1.530390275111964, "grad_norm": 0.4775499856280007, "learning_rate": 3.5017600659629986e-05, "loss": 0.4484, "step": 598 }, { "epoch": 1.5329494561740242, "grad_norm": 0.5259243816931762, "learning_rate": 3.499393183472973e-05, "loss": 0.4211, "step": 599 }, { "epoch": 1.5355086372360844, "grad_norm": 0.5198109524806964, "learning_rate": 3.497021496342203e-05, "loss": 0.4363, "step": 600 }, { "epoch": 1.5380678182981447, "grad_norm": 0.4696941806509467, "learning_rate": 3.494645012170498e-05, "loss": 0.4295, "step": 601 }, { "epoch": 1.5406269993602049, "grad_norm": 0.6474700305067774, "learning_rate": 3.4922637385730406e-05, "loss": 0.5, "step": 602 }, { "epoch": 1.5431861804222649, "grad_norm": 0.4793948522260617, "learning_rate": 3.489877683180362e-05, "loss": 0.3845, "step": 603 }, { "epoch": 1.545745361484325, "grad_norm": 0.539237627319592, "learning_rate": 3.487486853638314e-05, "loss": 0.4356, "step": 604 }, { "epoch": 1.548304542546385, "grad_norm": 0.5532057074478213, "learning_rate": 3.485091257608047e-05, "loss": 0.3891, "step": 605 }, { "epoch": 1.5508637236084453, "grad_norm": 0.6992089262871057, "learning_rate": 3.482690902765984e-05, "loss": 0.4571, "step": 606 }, { "epoch": 1.5534229046705055, "grad_norm": 0.4638019008983174, "learning_rate": 3.4802857968038e-05, "loss": 0.4188, "step": 607 }, { "epoch": 1.5559820857325657, "grad_norm": 0.7914532265925823, "learning_rate": 3.4778759474283936e-05, "loss": 0.4534, "step": 608 }, { "epoch": 1.5585412667946257, "grad_norm": 0.5295039816146513, "learning_rate": 3.475461362361861e-05, "loss": 0.4001, "step": 609 }, { "epoch": 1.561100447856686, "grad_norm": 0.6132941939025509, "learning_rate": 3.473042049341474e-05, "loss": 0.4225, "step": 610 }, { "epoch": 1.5636596289187459, "grad_norm": 0.714057587477597, "learning_rate": 3.470618016119658e-05, "loss": 0.4136, "step": 611 }, { "epoch": 1.566218809980806, "grad_norm": 0.5320432430474729, "learning_rate": 3.468189270463959e-05, "loss": 0.4004, "step": 612 }, { "epoch": 1.5687779910428663, "grad_norm": 0.66784032505952, "learning_rate": 3.465755820157026e-05, "loss": 0.4065, "step": 613 }, { "epoch": 1.5713371721049265, "grad_norm": 0.49152795733031895, "learning_rate": 3.463317672996583e-05, "loss": 0.3791, "step": 614 }, { "epoch": 1.5738963531669867, "grad_norm": 0.555185505506747, "learning_rate": 3.4608748367954064e-05, "loss": 0.4633, "step": 615 }, { "epoch": 1.5764555342290467, "grad_norm": 0.4980166871980404, "learning_rate": 3.4584273193812956e-05, "loss": 0.4252, "step": 616 }, { "epoch": 1.5790147152911067, "grad_norm": 0.589732498310526, "learning_rate": 3.45597512859705e-05, "loss": 0.4839, "step": 617 }, { "epoch": 1.581573896353167, "grad_norm": 0.5743441085203349, "learning_rate": 3.4535182723004466e-05, "loss": 0.4062, "step": 618 }, { "epoch": 1.5841330774152271, "grad_norm": 0.49126577666625093, "learning_rate": 3.451056758364212e-05, "loss": 0.4135, "step": 619 }, { "epoch": 1.5866922584772873, "grad_norm": 0.5037192947792559, "learning_rate": 3.4485905946759965e-05, "loss": 0.459, "step": 620 }, { "epoch": 1.5892514395393476, "grad_norm": 0.5439325021578344, "learning_rate": 3.446119789138351e-05, "loss": 0.3882, "step": 621 }, { "epoch": 1.5918106206014075, "grad_norm": 0.5020489297705218, "learning_rate": 3.443644349668701e-05, "loss": 0.4053, "step": 622 }, { "epoch": 1.5943698016634675, "grad_norm": 0.5634072314676822, "learning_rate": 3.4411642841993185e-05, "loss": 0.4065, "step": 623 }, { "epoch": 1.5969289827255277, "grad_norm": 0.4911032231149214, "learning_rate": 3.438679600677303e-05, "loss": 0.4207, "step": 624 }, { "epoch": 1.599488163787588, "grad_norm": 0.6463093595410542, "learning_rate": 3.4361903070645484e-05, "loss": 0.4195, "step": 625 }, { "epoch": 1.6020473448496482, "grad_norm": 0.5395555876619189, "learning_rate": 3.433696411337723e-05, "loss": 0.4359, "step": 626 }, { "epoch": 1.6046065259117084, "grad_norm": 0.5456191870776568, "learning_rate": 3.431197921488242e-05, "loss": 0.4325, "step": 627 }, { "epoch": 1.6071657069737684, "grad_norm": 0.5181489907237284, "learning_rate": 3.4286948455222425e-05, "loss": 0.4262, "step": 628 }, { "epoch": 1.6097248880358286, "grad_norm": 0.4542522655783656, "learning_rate": 3.426187191460555e-05, "loss": 0.4008, "step": 629 }, { "epoch": 1.6122840690978886, "grad_norm": 0.5748315523058384, "learning_rate": 3.423674967338681e-05, "loss": 0.4613, "step": 630 }, { "epoch": 1.6148432501599488, "grad_norm": 0.4893291757606303, "learning_rate": 3.421158181206769e-05, "loss": 0.411, "step": 631 }, { "epoch": 1.617402431222009, "grad_norm": 0.580837787265809, "learning_rate": 3.418636841129582e-05, "loss": 0.417, "step": 632 }, { "epoch": 1.6199616122840692, "grad_norm": 0.6495075917348687, "learning_rate": 3.416110955186477e-05, "loss": 0.4817, "step": 633 }, { "epoch": 1.6225207933461292, "grad_norm": 0.45060911216179744, "learning_rate": 3.4135805314713804e-05, "loss": 0.4033, "step": 634 }, { "epoch": 1.6250799744081894, "grad_norm": 0.5998653037164785, "learning_rate": 3.411045578092754e-05, "loss": 0.3912, "step": 635 }, { "epoch": 1.6276391554702494, "grad_norm": 0.5310655122776353, "learning_rate": 3.4085061031735794e-05, "loss": 0.4313, "step": 636 }, { "epoch": 1.6301983365323096, "grad_norm": 0.6388088047336757, "learning_rate": 3.405962114851324e-05, "loss": 0.4433, "step": 637 }, { "epoch": 1.6327575175943698, "grad_norm": 0.5039515005309823, "learning_rate": 3.4034136212779195e-05, "loss": 0.414, "step": 638 }, { "epoch": 1.63531669865643, "grad_norm": 0.6351418548004399, "learning_rate": 3.4008606306197336e-05, "loss": 0.4271, "step": 639 }, { "epoch": 1.6378758797184902, "grad_norm": 0.5382251510311172, "learning_rate": 3.398303151057543e-05, "loss": 0.4223, "step": 640 }, { "epoch": 1.6404350607805502, "grad_norm": 0.5213424824749899, "learning_rate": 3.3957411907865123e-05, "loss": 0.4169, "step": 641 }, { "epoch": 1.6429942418426102, "grad_norm": 0.6114074897511479, "learning_rate": 3.393174758016161e-05, "loss": 0.4141, "step": 642 }, { "epoch": 1.6455534229046704, "grad_norm": 0.5077805851469291, "learning_rate": 3.39060386097034e-05, "loss": 0.4354, "step": 643 }, { "epoch": 1.6481126039667306, "grad_norm": 0.5648032929681525, "learning_rate": 3.3880285078872076e-05, "loss": 0.3944, "step": 644 }, { "epoch": 1.6506717850287909, "grad_norm": 0.5623890506817537, "learning_rate": 3.385448707019199e-05, "loss": 0.463, "step": 645 }, { "epoch": 1.653230966090851, "grad_norm": 0.47253413270859174, "learning_rate": 3.382864466633003e-05, "loss": 0.4179, "step": 646 }, { "epoch": 1.655790147152911, "grad_norm": 0.6667403121893724, "learning_rate": 3.3802757950095346e-05, "loss": 0.4401, "step": 647 }, { "epoch": 1.658349328214971, "grad_norm": 0.48362697457689036, "learning_rate": 3.377682700443907e-05, "loss": 0.4294, "step": 648 }, { "epoch": 1.6609085092770313, "grad_norm": 0.49305745599661777, "learning_rate": 3.375085191245407e-05, "loss": 0.4166, "step": 649 }, { "epoch": 1.6634676903390915, "grad_norm": 0.5645696987568555, "learning_rate": 3.372483275737468e-05, "loss": 0.3922, "step": 650 }, { "epoch": 1.6660268714011517, "grad_norm": 0.6497192793696049, "learning_rate": 3.3698769622576404e-05, "loss": 0.484, "step": 651 }, { "epoch": 1.668586052463212, "grad_norm": 0.6004795627408669, "learning_rate": 3.367266259157572e-05, "loss": 0.4744, "step": 652 }, { "epoch": 1.671145233525272, "grad_norm": 0.6035991357132837, "learning_rate": 3.364651174802974e-05, "loss": 0.4576, "step": 653 }, { "epoch": 1.673704414587332, "grad_norm": 0.5638974284467396, "learning_rate": 3.3620317175735945e-05, "loss": 0.3829, "step": 654 }, { "epoch": 1.676263595649392, "grad_norm": 0.6144746966380764, "learning_rate": 3.359407895863199e-05, "loss": 0.4219, "step": 655 }, { "epoch": 1.6788227767114523, "grad_norm": 0.6330635207423285, "learning_rate": 3.356779718079534e-05, "loss": 0.3939, "step": 656 }, { "epoch": 1.6813819577735125, "grad_norm": 0.6511893943323425, "learning_rate": 3.3541471926443084e-05, "loss": 0.4626, "step": 657 }, { "epoch": 1.6839411388355727, "grad_norm": 0.7313805162264239, "learning_rate": 3.3515103279931584e-05, "loss": 0.4443, "step": 658 }, { "epoch": 1.6865003198976327, "grad_norm": 0.4567231810719478, "learning_rate": 3.3488691325756294e-05, "loss": 0.4072, "step": 659 }, { "epoch": 1.689059500959693, "grad_norm": 0.6498464270828352, "learning_rate": 3.34622361485514e-05, "loss": 0.4532, "step": 660 }, { "epoch": 1.691618682021753, "grad_norm": 0.46427052887792153, "learning_rate": 3.343573783308964e-05, "loss": 0.4266, "step": 661 }, { "epoch": 1.6941778630838131, "grad_norm": 0.5468516675156209, "learning_rate": 3.340919646428193e-05, "loss": 0.4208, "step": 662 }, { "epoch": 1.6967370441458733, "grad_norm": 0.5751528606477143, "learning_rate": 3.3382612127177166e-05, "loss": 0.4146, "step": 663 }, { "epoch": 1.6992962252079336, "grad_norm": 0.5793295201191827, "learning_rate": 3.335598490696196e-05, "loss": 0.4623, "step": 664 }, { "epoch": 1.7018554062699938, "grad_norm": 0.7077864391730174, "learning_rate": 3.332931488896029e-05, "loss": 0.4459, "step": 665 }, { "epoch": 1.7044145873320538, "grad_norm": 0.4786371924890489, "learning_rate": 3.330260215863332e-05, "loss": 0.3967, "step": 666 }, { "epoch": 1.7069737683941137, "grad_norm": 0.71627935746326, "learning_rate": 3.327584680157904e-05, "loss": 0.4466, "step": 667 }, { "epoch": 1.709532949456174, "grad_norm": 0.5672877205591593, "learning_rate": 3.3249048903532075e-05, "loss": 0.4245, "step": 668 }, { "epoch": 1.7120921305182342, "grad_norm": 0.5967392434258936, "learning_rate": 3.322220855036333e-05, "loss": 0.4399, "step": 669 }, { "epoch": 1.7146513115802944, "grad_norm": 0.619642871257619, "learning_rate": 3.319532582807977e-05, "loss": 0.4429, "step": 670 }, { "epoch": 1.7172104926423546, "grad_norm": 0.5718993726856418, "learning_rate": 3.316840082282412e-05, "loss": 0.4049, "step": 671 }, { "epoch": 1.7197696737044146, "grad_norm": 0.7158788807881703, "learning_rate": 3.314143362087462e-05, "loss": 0.465, "step": 672 }, { "epoch": 1.7223288547664746, "grad_norm": 0.49801282221595167, "learning_rate": 3.3114424308644686e-05, "loss": 0.4304, "step": 673 }, { "epoch": 1.7248880358285348, "grad_norm": 0.732000530911472, "learning_rate": 3.3087372972682703e-05, "loss": 0.4496, "step": 674 }, { "epoch": 1.727447216890595, "grad_norm": 0.5016534768936702, "learning_rate": 3.30602796996717e-05, "loss": 0.4196, "step": 675 }, { "epoch": 1.7300063979526552, "grad_norm": 0.6732998458752849, "learning_rate": 3.303314457642911e-05, "loss": 0.4377, "step": 676 }, { "epoch": 1.7325655790147154, "grad_norm": 0.512135853555358, "learning_rate": 3.300596768990644e-05, "loss": 0.4032, "step": 677 }, { "epoch": 1.7351247600767754, "grad_norm": 0.5565097137954512, "learning_rate": 3.297874912718902e-05, "loss": 0.4124, "step": 678 }, { "epoch": 1.7376839411388356, "grad_norm": 0.5579693456602185, "learning_rate": 3.2951488975495785e-05, "loss": 0.4493, "step": 679 }, { "epoch": 1.7402431222008956, "grad_norm": 0.5767956410825538, "learning_rate": 3.2924187322178865e-05, "loss": 0.4701, "step": 680 }, { "epoch": 1.7428023032629558, "grad_norm": 0.474585132199083, "learning_rate": 3.2896844254723414e-05, "loss": 0.4118, "step": 681 }, { "epoch": 1.745361484325016, "grad_norm": 0.44776574599085095, "learning_rate": 3.28694598607473e-05, "loss": 0.3928, "step": 682 }, { "epoch": 1.7479206653870762, "grad_norm": 0.4613485770735262, "learning_rate": 3.28420342280008e-05, "loss": 0.4185, "step": 683 }, { "epoch": 1.7504798464491362, "grad_norm": 0.6343181624558385, "learning_rate": 3.281456744436634e-05, "loss": 0.4133, "step": 684 }, { "epoch": 1.7530390275111964, "grad_norm": 0.5493779464654077, "learning_rate": 3.278705959785821e-05, "loss": 0.4671, "step": 685 }, { "epoch": 1.7555982085732564, "grad_norm": 0.5441386561154584, "learning_rate": 3.2759510776622274e-05, "loss": 0.4453, "step": 686 }, { "epoch": 1.7581573896353166, "grad_norm": 0.629823120577556, "learning_rate": 3.273192106893572e-05, "loss": 0.3839, "step": 687 }, { "epoch": 1.7607165706973769, "grad_norm": 0.549530453190963, "learning_rate": 3.270429056320672e-05, "loss": 0.4502, "step": 688 }, { "epoch": 1.763275751759437, "grad_norm": 0.6031975081301458, "learning_rate": 3.26766193479742e-05, "loss": 0.4738, "step": 689 }, { "epoch": 1.7658349328214973, "grad_norm": 0.48136220411058944, "learning_rate": 3.2648907511907544e-05, "loss": 0.4036, "step": 690 }, { "epoch": 1.7683941138835573, "grad_norm": 0.6253783336704821, "learning_rate": 3.262115514380628e-05, "loss": 0.4081, "step": 691 }, { "epoch": 1.7709532949456173, "grad_norm": 0.47939071537048983, "learning_rate": 3.25933623325998e-05, "loss": 0.4314, "step": 692 }, { "epoch": 1.7735124760076775, "grad_norm": 0.4518873833371369, "learning_rate": 3.256552916734713e-05, "loss": 0.3986, "step": 693 }, { "epoch": 1.7760716570697377, "grad_norm": 0.6074017672602955, "learning_rate": 3.25376557372366e-05, "loss": 0.4324, "step": 694 }, { "epoch": 1.778630838131798, "grad_norm": 0.470674972052956, "learning_rate": 3.250974213158555e-05, "loss": 0.3933, "step": 695 }, { "epoch": 1.781190019193858, "grad_norm": 0.6255159824430874, "learning_rate": 3.248178843984006e-05, "loss": 0.4252, "step": 696 }, { "epoch": 1.783749200255918, "grad_norm": 0.5371760909763971, "learning_rate": 3.245379475157465e-05, "loss": 0.4778, "step": 697 }, { "epoch": 1.786308381317978, "grad_norm": 0.5168010335153898, "learning_rate": 3.242576115649205e-05, "loss": 0.4229, "step": 698 }, { "epoch": 1.7888675623800383, "grad_norm": 0.49166449165933496, "learning_rate": 3.239768774442281e-05, "loss": 0.4005, "step": 699 }, { "epoch": 1.7914267434420985, "grad_norm": 0.4932386148580624, "learning_rate": 3.23695746053251e-05, "loss": 0.4163, "step": 700 }, { "epoch": 1.7939859245041587, "grad_norm": 0.5880360652699835, "learning_rate": 3.2341421829284394e-05, "loss": 0.4413, "step": 701 }, { "epoch": 1.796545105566219, "grad_norm": 0.4625923123089751, "learning_rate": 3.2313229506513167e-05, "loss": 0.426, "step": 702 }, { "epoch": 1.799104286628279, "grad_norm": 0.5109208128171377, "learning_rate": 3.228499772735062e-05, "loss": 0.393, "step": 703 }, { "epoch": 1.8016634676903391, "grad_norm": 0.4806179945017673, "learning_rate": 3.2256726582262384e-05, "loss": 0.4479, "step": 704 }, { "epoch": 1.8042226487523991, "grad_norm": 0.4970665904869278, "learning_rate": 3.222841616184025e-05, "loss": 0.4318, "step": 705 }, { "epoch": 1.8067818298144593, "grad_norm": 0.5666482942373245, "learning_rate": 3.220006655680183e-05, "loss": 0.4245, "step": 706 }, { "epoch": 1.8093410108765196, "grad_norm": 0.4999181748027583, "learning_rate": 3.2171677857990334e-05, "loss": 0.4372, "step": 707 }, { "epoch": 1.8119001919385798, "grad_norm": 0.47380754642052403, "learning_rate": 3.2143250156374226e-05, "loss": 0.3926, "step": 708 }, { "epoch": 1.8144593730006398, "grad_norm": 0.6008131918939661, "learning_rate": 3.211478354304695e-05, "loss": 0.4533, "step": 709 }, { "epoch": 1.8170185540627, "grad_norm": 0.5144973230871912, "learning_rate": 3.208627810922665e-05, "loss": 0.4352, "step": 710 }, { "epoch": 1.81957773512476, "grad_norm": 0.4699623742277227, "learning_rate": 3.2057733946255844e-05, "loss": 0.3852, "step": 711 }, { "epoch": 1.8221369161868202, "grad_norm": 0.4961706904135912, "learning_rate": 3.202915114560118e-05, "loss": 0.4445, "step": 712 }, { "epoch": 1.8246960972488804, "grad_norm": 0.561279443158619, "learning_rate": 3.200052979885309e-05, "loss": 0.4802, "step": 713 }, { "epoch": 1.8272552783109406, "grad_norm": 0.4557933345271347, "learning_rate": 3.197186999772555e-05, "loss": 0.4029, "step": 714 }, { "epoch": 1.8298144593730008, "grad_norm": 0.589112376636283, "learning_rate": 3.194317183405573e-05, "loss": 0.4563, "step": 715 }, { "epoch": 1.8323736404350608, "grad_norm": 0.5161326824368248, "learning_rate": 3.191443539980374e-05, "loss": 0.4556, "step": 716 }, { "epoch": 1.8349328214971208, "grad_norm": 0.464424728984026, "learning_rate": 3.188566078705235e-05, "loss": 0.4044, "step": 717 }, { "epoch": 1.837492002559181, "grad_norm": 0.47067836600914265, "learning_rate": 3.1856848088006636e-05, "loss": 0.4335, "step": 718 }, { "epoch": 1.8400511836212412, "grad_norm": 0.5769491747912345, "learning_rate": 3.182799739499371e-05, "loss": 0.4407, "step": 719 }, { "epoch": 1.8426103646833014, "grad_norm": 0.4932465377591071, "learning_rate": 3.1799108800462466e-05, "loss": 0.4328, "step": 720 }, { "epoch": 1.8451695457453616, "grad_norm": 0.488480860113565, "learning_rate": 3.177018239698322e-05, "loss": 0.4235, "step": 721 }, { "epoch": 1.8477287268074216, "grad_norm": 0.48833589337954714, "learning_rate": 3.1741218277247466e-05, "loss": 0.4132, "step": 722 }, { "epoch": 1.8502879078694816, "grad_norm": 0.39099920314827113, "learning_rate": 3.1712216534067536e-05, "loss": 0.4265, "step": 723 }, { "epoch": 1.8528470889315418, "grad_norm": 0.45542998139885993, "learning_rate": 3.168317726037634e-05, "loss": 0.3971, "step": 724 }, { "epoch": 1.855406269993602, "grad_norm": 0.46738782533250195, "learning_rate": 3.1654100549227024e-05, "loss": 0.4559, "step": 725 }, { "epoch": 1.8579654510556622, "grad_norm": 0.40700257974662957, "learning_rate": 3.1624986493792735e-05, "loss": 0.4135, "step": 726 }, { "epoch": 1.8605246321177225, "grad_norm": 0.4797435460256252, "learning_rate": 3.159583518736625e-05, "loss": 0.4463, "step": 727 }, { "epoch": 1.8630838131797824, "grad_norm": 0.47740901056002083, "learning_rate": 3.156664672335973e-05, "loss": 0.3884, "step": 728 }, { "epoch": 1.8656429942418427, "grad_norm": 0.49631875720308977, "learning_rate": 3.153742119530441e-05, "loss": 0.4162, "step": 729 }, { "epoch": 1.8682021753039026, "grad_norm": 0.49356825084947964, "learning_rate": 3.1508158696850275e-05, "loss": 0.4329, "step": 730 }, { "epoch": 1.8707613563659629, "grad_norm": 0.4599036508089157, "learning_rate": 3.1478859321765796e-05, "loss": 0.428, "step": 731 }, { "epoch": 1.873320537428023, "grad_norm": 0.48294927528675924, "learning_rate": 3.144952316393758e-05, "loss": 0.4058, "step": 732 }, { "epoch": 1.8758797184900833, "grad_norm": 0.5890205448596298, "learning_rate": 3.142015031737016e-05, "loss": 0.4776, "step": 733 }, { "epoch": 1.8784388995521433, "grad_norm": 0.4923421153314979, "learning_rate": 3.139074087618556e-05, "loss": 0.4045, "step": 734 }, { "epoch": 1.8809980806142035, "grad_norm": 0.5087618925492778, "learning_rate": 3.136129493462312e-05, "loss": 0.4275, "step": 735 }, { "epoch": 1.8835572616762635, "grad_norm": 0.5648227631500222, "learning_rate": 3.133181258703912e-05, "loss": 0.4727, "step": 736 }, { "epoch": 1.8861164427383237, "grad_norm": 0.5234994479526746, "learning_rate": 3.1302293927906516e-05, "loss": 0.3967, "step": 737 }, { "epoch": 1.888675623800384, "grad_norm": 0.6807348510101979, "learning_rate": 3.1272739051814594e-05, "loss": 0.4551, "step": 738 }, { "epoch": 1.891234804862444, "grad_norm": 0.4969832272412207, "learning_rate": 3.1243148053468715e-05, "loss": 0.3773, "step": 739 }, { "epoch": 1.8937939859245043, "grad_norm": 0.6317866448620022, "learning_rate": 3.121352102768998e-05, "loss": 0.4389, "step": 740 }, { "epoch": 1.8963531669865643, "grad_norm": 0.5131091797313253, "learning_rate": 3.1183858069414936e-05, "loss": 0.4458, "step": 741 }, { "epoch": 1.8989123480486243, "grad_norm": 0.4972035729160381, "learning_rate": 3.115415927369529e-05, "loss": 0.4451, "step": 742 }, { "epoch": 1.9014715291106845, "grad_norm": 0.47895024578706524, "learning_rate": 3.112442473569754e-05, "loss": 0.4324, "step": 743 }, { "epoch": 1.9040307101727447, "grad_norm": 0.4691676491006599, "learning_rate": 3.109465455070278e-05, "loss": 0.4035, "step": 744 }, { "epoch": 1.906589891234805, "grad_norm": 0.4736286307947326, "learning_rate": 3.106484881410628e-05, "loss": 0.4446, "step": 745 }, { "epoch": 1.9091490722968651, "grad_norm": 0.4359831705290721, "learning_rate": 3.103500762141725e-05, "loss": 0.3829, "step": 746 }, { "epoch": 1.9117082533589251, "grad_norm": 0.462916560551583, "learning_rate": 3.1005131068258506e-05, "loss": 0.4107, "step": 747 }, { "epoch": 1.9142674344209851, "grad_norm": 0.47173985630781595, "learning_rate": 3.09752192503662e-05, "loss": 0.3903, "step": 748 }, { "epoch": 1.9168266154830453, "grad_norm": 0.43246703137114556, "learning_rate": 3.094527226358945e-05, "loss": 0.4091, "step": 749 }, { "epoch": 1.9193857965451055, "grad_norm": 0.5056443186885541, "learning_rate": 3.091529020389009e-05, "loss": 0.4837, "step": 750 }, { "epoch": 1.9219449776071658, "grad_norm": 0.49376735973598973, "learning_rate": 3.088527316734235e-05, "loss": 0.4124, "step": 751 }, { "epoch": 1.924504158669226, "grad_norm": 0.5428642850508197, "learning_rate": 3.08552212501325e-05, "loss": 0.4304, "step": 752 }, { "epoch": 1.927063339731286, "grad_norm": 0.5365719180678239, "learning_rate": 3.082513454855863e-05, "loss": 0.405, "step": 753 }, { "epoch": 1.9296225207933462, "grad_norm": 0.4559502049703374, "learning_rate": 3.079501315903026e-05, "loss": 0.445, "step": 754 }, { "epoch": 1.9321817018554062, "grad_norm": 0.5222902812164878, "learning_rate": 3.076485717806808e-05, "loss": 0.3726, "step": 755 }, { "epoch": 1.9347408829174664, "grad_norm": 0.5737561385558596, "learning_rate": 3.073466670230361e-05, "loss": 0.4588, "step": 756 }, { "epoch": 1.9373000639795266, "grad_norm": 0.43383831135928497, "learning_rate": 3.070444182847891e-05, "loss": 0.4006, "step": 757 }, { "epoch": 1.9398592450415868, "grad_norm": 0.49738529963698463, "learning_rate": 3.067418265344628e-05, "loss": 0.404, "step": 758 }, { "epoch": 1.9424184261036468, "grad_norm": 0.4779872060995513, "learning_rate": 3.0643889274167926e-05, "loss": 0.4642, "step": 759 }, { "epoch": 1.944977607165707, "grad_norm": 0.4501703649941174, "learning_rate": 3.061356178771564e-05, "loss": 0.3845, "step": 760 }, { "epoch": 1.947536788227767, "grad_norm": 0.566851781049989, "learning_rate": 3.058320029127052e-05, "loss": 0.4603, "step": 761 }, { "epoch": 1.9500959692898272, "grad_norm": 0.41716803055724166, "learning_rate": 3.055280488212266e-05, "loss": 0.3988, "step": 762 }, { "epoch": 1.9526551503518874, "grad_norm": 0.5321354765650695, "learning_rate": 3.052237565767079e-05, "loss": 0.4633, "step": 763 }, { "epoch": 1.9552143314139476, "grad_norm": 0.5101148541262678, "learning_rate": 3.0491912715422047e-05, "loss": 0.4154, "step": 764 }, { "epoch": 1.9577735124760078, "grad_norm": 0.44138127982821407, "learning_rate": 3.0461416152991555e-05, "loss": 0.3971, "step": 765 }, { "epoch": 1.9603326935380678, "grad_norm": 0.5119670448118282, "learning_rate": 3.043088606810221e-05, "loss": 0.4344, "step": 766 }, { "epoch": 1.9628918746001278, "grad_norm": 0.5844510240848945, "learning_rate": 3.0400322558584308e-05, "loss": 0.4369, "step": 767 }, { "epoch": 1.965451055662188, "grad_norm": 0.45663704360586077, "learning_rate": 3.0369725722375274e-05, "loss": 0.4666, "step": 768 }, { "epoch": 1.9680102367242482, "grad_norm": 0.539565481061931, "learning_rate": 3.0339095657519292e-05, "loss": 0.4359, "step": 769 }, { "epoch": 1.9705694177863085, "grad_norm": 0.5385392681842599, "learning_rate": 3.0308432462167045e-05, "loss": 0.4264, "step": 770 }, { "epoch": 1.9731285988483687, "grad_norm": 0.432889165826209, "learning_rate": 3.0277736234575378e-05, "loss": 0.3845, "step": 771 }, { "epoch": 1.9756877799104287, "grad_norm": 0.5443873180170078, "learning_rate": 3.0247007073106976e-05, "loss": 0.406, "step": 772 }, { "epoch": 1.9782469609724886, "grad_norm": 0.5012354762450505, "learning_rate": 3.0216245076230062e-05, "loss": 0.4334, "step": 773 }, { "epoch": 1.9808061420345489, "grad_norm": 0.5232208647955975, "learning_rate": 3.0185450342518075e-05, "loss": 0.4268, "step": 774 }, { "epoch": 1.983365323096609, "grad_norm": 0.4513266845951912, "learning_rate": 3.015462297064936e-05, "loss": 0.3783, "step": 775 }, { "epoch": 1.9859245041586693, "grad_norm": 0.5054305167039745, "learning_rate": 3.0123763059406835e-05, "loss": 0.4148, "step": 776 }, { "epoch": 1.9884836852207295, "grad_norm": 0.47543662649122564, "learning_rate": 3.009287070767771e-05, "loss": 0.4083, "step": 777 }, { "epoch": 1.9910428662827895, "grad_norm": 0.5147396246458542, "learning_rate": 3.0061946014453113e-05, "loss": 0.406, "step": 778 }, { "epoch": 1.9936020473448497, "grad_norm": 0.537028842282906, "learning_rate": 3.0030989078827848e-05, "loss": 0.386, "step": 779 }, { "epoch": 1.9961612284069097, "grad_norm": 0.4661586754448457, "learning_rate": 3.0000000000000004e-05, "loss": 0.4218, "step": 780 }, { "epoch": 1.99872040946897, "grad_norm": 0.4261635908664324, "learning_rate": 2.9968978877270672e-05, "loss": 0.4449, "step": 781 }, { "epoch": 2.00127959053103, "grad_norm": 0.6837214787749436, "learning_rate": 2.9937925810043654e-05, "loss": 0.4318, "step": 782 }, { "epoch": 2.0038387715930903, "grad_norm": 0.451386066705997, "learning_rate": 2.990684089782507e-05, "loss": 0.3612, "step": 783 }, { "epoch": 2.0063979526551505, "grad_norm": 0.7078432089850583, "learning_rate": 2.987572424022311e-05, "loss": 0.3505, "step": 784 }, { "epoch": 2.0089571337172103, "grad_norm": 0.5081289643217769, "learning_rate": 2.98445759369477e-05, "loss": 0.3149, "step": 785 }, { "epoch": 2.0115163147792705, "grad_norm": 0.5849221231819475, "learning_rate": 2.9813396087810134e-05, "loss": 0.3514, "step": 786 }, { "epoch": 2.0140754958413307, "grad_norm": 0.48371629559591783, "learning_rate": 2.9782184792722845e-05, "loss": 0.3339, "step": 787 }, { "epoch": 2.016634676903391, "grad_norm": 0.6063864453716905, "learning_rate": 2.9750942151698968e-05, "loss": 0.389, "step": 788 }, { "epoch": 2.019193857965451, "grad_norm": 0.6277898887247666, "learning_rate": 2.971966826485212e-05, "loss": 0.3283, "step": 789 }, { "epoch": 2.0217530390275114, "grad_norm": 0.5884137115714144, "learning_rate": 2.9688363232396056e-05, "loss": 0.3353, "step": 790 }, { "epoch": 2.024312220089571, "grad_norm": 0.5242949584836221, "learning_rate": 2.9657027154644294e-05, "loss": 0.3059, "step": 791 }, { "epoch": 2.0268714011516313, "grad_norm": 0.5023037783914988, "learning_rate": 2.962566013200986e-05, "loss": 0.3433, "step": 792 }, { "epoch": 2.0294305822136915, "grad_norm": 0.6201000972339953, "learning_rate": 2.959426226500493e-05, "loss": 0.318, "step": 793 }, { "epoch": 2.0319897632757518, "grad_norm": 0.44395380528610195, "learning_rate": 2.9562833654240518e-05, "loss": 0.3401, "step": 794 }, { "epoch": 2.034548944337812, "grad_norm": 0.43056205609112264, "learning_rate": 2.9531374400426158e-05, "loss": 0.2937, "step": 795 }, { "epoch": 2.037108125399872, "grad_norm": 0.4803245756648518, "learning_rate": 2.949988460436958e-05, "loss": 0.3396, "step": 796 }, { "epoch": 2.0396673064619324, "grad_norm": 0.41525308273215306, "learning_rate": 2.946836436697636e-05, "loss": 0.3508, "step": 797 }, { "epoch": 2.042226487523992, "grad_norm": 0.4033224019153147, "learning_rate": 2.943681378924964e-05, "loss": 0.307, "step": 798 }, { "epoch": 2.0447856685860524, "grad_norm": 0.4163421073725909, "learning_rate": 2.94052329722898e-05, "loss": 0.3145, "step": 799 }, { "epoch": 2.0473448496481126, "grad_norm": 0.374364306892129, "learning_rate": 2.9373622017294075e-05, "loss": 0.3412, "step": 800 }, { "epoch": 2.049904030710173, "grad_norm": 0.3911100371820488, "learning_rate": 2.934198102555631e-05, "loss": 0.3046, "step": 801 }, { "epoch": 2.052463211772233, "grad_norm": 0.4467542980832922, "learning_rate": 2.9310310098466588e-05, "loss": 0.291, "step": 802 }, { "epoch": 2.055022392834293, "grad_norm": 0.42396484047274274, "learning_rate": 2.92786093375109e-05, "loss": 0.3268, "step": 803 }, { "epoch": 2.057581573896353, "grad_norm": 0.52821272561634, "learning_rate": 2.924687884427087e-05, "loss": 0.3699, "step": 804 }, { "epoch": 2.060140754958413, "grad_norm": 0.4278559612529404, "learning_rate": 2.9215118720423375e-05, "loss": 0.3389, "step": 805 }, { "epoch": 2.0626999360204734, "grad_norm": 0.4868265035371802, "learning_rate": 2.9183329067740235e-05, "loss": 0.2993, "step": 806 }, { "epoch": 2.0652591170825336, "grad_norm": 0.41003428936435155, "learning_rate": 2.9151509988087912e-05, "loss": 0.3138, "step": 807 }, { "epoch": 2.067818298144594, "grad_norm": 0.4548780109348631, "learning_rate": 2.911966158342713e-05, "loss": 0.3298, "step": 808 }, { "epoch": 2.070377479206654, "grad_norm": 0.42982485262636566, "learning_rate": 2.9087783955812628e-05, "loss": 0.3493, "step": 809 }, { "epoch": 2.072936660268714, "grad_norm": 0.37659830424896135, "learning_rate": 2.9055877207392752e-05, "loss": 0.2905, "step": 810 }, { "epoch": 2.075495841330774, "grad_norm": 0.6047986137094586, "learning_rate": 2.9023941440409164e-05, "loss": 0.3921, "step": 811 }, { "epoch": 2.0780550223928342, "grad_norm": 0.4024787680486599, "learning_rate": 2.899197675719653e-05, "loss": 0.3126, "step": 812 }, { "epoch": 2.0806142034548945, "grad_norm": 0.4864391147176377, "learning_rate": 2.8959983260182166e-05, "loss": 0.3259, "step": 813 }, { "epoch": 2.0831733845169547, "grad_norm": 0.4289885322757846, "learning_rate": 2.8927961051885716e-05, "loss": 0.3327, "step": 814 }, { "epoch": 2.085732565579015, "grad_norm": 0.46469758441400516, "learning_rate": 2.8895910234918828e-05, "loss": 0.3566, "step": 815 }, { "epoch": 2.0882917466410746, "grad_norm": 0.47717231725597103, "learning_rate": 2.886383091198483e-05, "loss": 0.3543, "step": 816 }, { "epoch": 2.090850927703135, "grad_norm": 0.40073207808377775, "learning_rate": 2.8831723185878382e-05, "loss": 0.2954, "step": 817 }, { "epoch": 2.093410108765195, "grad_norm": 0.5629907330757088, "learning_rate": 2.8799587159485166e-05, "loss": 0.3302, "step": 818 }, { "epoch": 2.0959692898272553, "grad_norm": 0.5537969468557435, "learning_rate": 2.876742293578155e-05, "loss": 0.3323, "step": 819 }, { "epoch": 2.0985284708893155, "grad_norm": 0.5750723337033808, "learning_rate": 2.873523061783426e-05, "loss": 0.3083, "step": 820 }, { "epoch": 2.1010876519513757, "grad_norm": 0.5333136369374486, "learning_rate": 2.8703010308800034e-05, "loss": 0.3516, "step": 821 }, { "epoch": 2.103646833013436, "grad_norm": 0.5946698263030077, "learning_rate": 2.8670762111925313e-05, "loss": 0.3337, "step": 822 }, { "epoch": 2.1062060140754957, "grad_norm": 0.4783241662438903, "learning_rate": 2.863848613054591e-05, "loss": 0.302, "step": 823 }, { "epoch": 2.108765195137556, "grad_norm": 0.42094897221075406, "learning_rate": 2.8606182468086654e-05, "loss": 0.3739, "step": 824 }, { "epoch": 2.111324376199616, "grad_norm": 0.5112995720272753, "learning_rate": 2.8573851228061084e-05, "loss": 0.3328, "step": 825 }, { "epoch": 2.1138835572616763, "grad_norm": 0.4268589732083703, "learning_rate": 2.8541492514071115e-05, "loss": 0.3199, "step": 826 }, { "epoch": 2.1164427383237365, "grad_norm": 0.40030928394161236, "learning_rate": 2.850910642980668e-05, "loss": 0.3229, "step": 827 }, { "epoch": 2.1190019193857967, "grad_norm": 0.4208663528647155, "learning_rate": 2.8476693079045432e-05, "loss": 0.3475, "step": 828 }, { "epoch": 2.1215611004478565, "grad_norm": 0.43920626817956737, "learning_rate": 2.8444252565652397e-05, "loss": 0.3395, "step": 829 }, { "epoch": 2.1241202815099167, "grad_norm": 0.4129992695563009, "learning_rate": 2.8411784993579633e-05, "loss": 0.2742, "step": 830 }, { "epoch": 2.126679462571977, "grad_norm": 0.4454585391740596, "learning_rate": 2.8379290466865906e-05, "loss": 0.3328, "step": 831 }, { "epoch": 2.129238643634037, "grad_norm": 0.46723501336721224, "learning_rate": 2.834676908963636e-05, "loss": 0.3379, "step": 832 }, { "epoch": 2.1317978246960974, "grad_norm": 0.5238858504216463, "learning_rate": 2.8314220966102177e-05, "loss": 0.3621, "step": 833 }, { "epoch": 2.1343570057581576, "grad_norm": 0.4375600366447412, "learning_rate": 2.828164620056024e-05, "loss": 0.3031, "step": 834 }, { "epoch": 2.1369161868202173, "grad_norm": 0.4011400320445024, "learning_rate": 2.8249044897392814e-05, "loss": 0.3167, "step": 835 }, { "epoch": 2.1394753678822775, "grad_norm": 0.48667210285852947, "learning_rate": 2.8216417161067187e-05, "loss": 0.3517, "step": 836 }, { "epoch": 2.1420345489443378, "grad_norm": 0.540946535276379, "learning_rate": 2.818376309613535e-05, "loss": 0.3276, "step": 837 }, { "epoch": 2.144593730006398, "grad_norm": 0.41449908449590483, "learning_rate": 2.8151082807233684e-05, "loss": 0.3429, "step": 838 }, { "epoch": 2.147152911068458, "grad_norm": 0.4411596533715045, "learning_rate": 2.811837639908257e-05, "loss": 0.3064, "step": 839 }, { "epoch": 2.1497120921305184, "grad_norm": 0.4049429223396906, "learning_rate": 2.80856439764861e-05, "loss": 0.3212, "step": 840 }, { "epoch": 2.1522712731925786, "grad_norm": 0.41603945550088506, "learning_rate": 2.8052885644331742e-05, "loss": 0.3097, "step": 841 }, { "epoch": 2.1548304542546384, "grad_norm": 0.526968199578847, "learning_rate": 2.8020101507589958e-05, "loss": 0.3547, "step": 842 }, { "epoch": 2.1573896353166986, "grad_norm": 0.38305125359444786, "learning_rate": 2.798729167131391e-05, "loss": 0.3027, "step": 843 }, { "epoch": 2.159948816378759, "grad_norm": 0.6336647706261161, "learning_rate": 2.795445624063913e-05, "loss": 0.3806, "step": 844 }, { "epoch": 2.162507997440819, "grad_norm": 0.44531937519080506, "learning_rate": 2.792159532078314e-05, "loss": 0.3323, "step": 845 }, { "epoch": 2.165067178502879, "grad_norm": 0.44290733063906507, "learning_rate": 2.7888709017045146e-05, "loss": 0.3237, "step": 846 }, { "epoch": 2.167626359564939, "grad_norm": 0.4370183293736626, "learning_rate": 2.7855797434805695e-05, "loss": 0.338, "step": 847 }, { "epoch": 2.170185540626999, "grad_norm": 0.4164027353779343, "learning_rate": 2.782286067952634e-05, "loss": 0.3278, "step": 848 }, { "epoch": 2.1727447216890594, "grad_norm": 0.49409168279966853, "learning_rate": 2.7789898856749297e-05, "loss": 0.3568, "step": 849 }, { "epoch": 2.1753039027511196, "grad_norm": 0.43481015195691675, "learning_rate": 2.77569120720971e-05, "loss": 0.3356, "step": 850 }, { "epoch": 2.17786308381318, "grad_norm": 0.4983975940240211, "learning_rate": 2.772390043127228e-05, "loss": 0.3373, "step": 851 }, { "epoch": 2.18042226487524, "grad_norm": 0.43692987588400956, "learning_rate": 2.7690864040057023e-05, "loss": 0.3108, "step": 852 }, { "epoch": 2.1829814459373003, "grad_norm": 0.44866752393409093, "learning_rate": 2.7657803004312797e-05, "loss": 0.3347, "step": 853 }, { "epoch": 2.18554062699936, "grad_norm": 0.5027966160971863, "learning_rate": 2.7624717429980067e-05, "loss": 0.3536, "step": 854 }, { "epoch": 2.1880998080614202, "grad_norm": 0.4093435861731407, "learning_rate": 2.7591607423077932e-05, "loss": 0.2917, "step": 855 }, { "epoch": 2.1906589891234804, "grad_norm": 0.49615217011265117, "learning_rate": 2.755847308970376e-05, "loss": 0.3502, "step": 856 }, { "epoch": 2.1932181701855407, "grad_norm": 0.3967350798943657, "learning_rate": 2.752531453603288e-05, "loss": 0.3177, "step": 857 }, { "epoch": 2.195777351247601, "grad_norm": 0.4774434168760954, "learning_rate": 2.7492131868318247e-05, "loss": 0.3616, "step": 858 }, { "epoch": 2.198336532309661, "grad_norm": 0.4533916271890891, "learning_rate": 2.7458925192890057e-05, "loss": 0.3235, "step": 859 }, { "epoch": 2.200895713371721, "grad_norm": 0.4666533097303878, "learning_rate": 2.7425694616155474e-05, "loss": 0.362, "step": 860 }, { "epoch": 2.203454894433781, "grad_norm": 0.5256588022807218, "learning_rate": 2.739244024459822e-05, "loss": 0.3577, "step": 861 }, { "epoch": 2.2060140754958413, "grad_norm": 0.42484253391437565, "learning_rate": 2.7359162184778276e-05, "loss": 0.3297, "step": 862 }, { "epoch": 2.2085732565579015, "grad_norm": 0.44868873474156656, "learning_rate": 2.7325860543331533e-05, "loss": 0.336, "step": 863 }, { "epoch": 2.2111324376199617, "grad_norm": 0.4732703525530866, "learning_rate": 2.7292535426969436e-05, "loss": 0.3057, "step": 864 }, { "epoch": 2.213691618682022, "grad_norm": 0.477868292020973, "learning_rate": 2.7259186942478656e-05, "loss": 0.308, "step": 865 }, { "epoch": 2.216250799744082, "grad_norm": 0.4842331398481323, "learning_rate": 2.7225815196720767e-05, "loss": 0.3145, "step": 866 }, { "epoch": 2.218809980806142, "grad_norm": 0.5381115021925195, "learning_rate": 2.7192420296631835e-05, "loss": 0.3798, "step": 867 }, { "epoch": 2.221369161868202, "grad_norm": 0.4236836146413328, "learning_rate": 2.7159002349222178e-05, "loss": 0.3237, "step": 868 }, { "epoch": 2.2239283429302623, "grad_norm": 0.5479188963928002, "learning_rate": 2.7125561461575924e-05, "loss": 0.3832, "step": 869 }, { "epoch": 2.2264875239923225, "grad_norm": 0.45518106829461097, "learning_rate": 2.7092097740850712e-05, "loss": 0.3048, "step": 870 }, { "epoch": 2.2290467050543827, "grad_norm": 0.4441896488412185, "learning_rate": 2.7058611294277378e-05, "loss": 0.3141, "step": 871 }, { "epoch": 2.2316058861164425, "grad_norm": 0.43355994332482317, "learning_rate": 2.702510222915956e-05, "loss": 0.3107, "step": 872 }, { "epoch": 2.2341650671785027, "grad_norm": 0.5312269604580118, "learning_rate": 2.6991570652873357e-05, "loss": 0.3404, "step": 873 }, { "epoch": 2.236724248240563, "grad_norm": 0.3995110246728028, "learning_rate": 2.6958016672867048e-05, "loss": 0.3122, "step": 874 }, { "epoch": 2.239283429302623, "grad_norm": 0.4611276323390611, "learning_rate": 2.692444039666066e-05, "loss": 0.317, "step": 875 }, { "epoch": 2.2418426103646834, "grad_norm": 0.5307192324613822, "learning_rate": 2.6890841931845674e-05, "loss": 0.3579, "step": 876 }, { "epoch": 2.2444017914267436, "grad_norm": 0.41152344841446314, "learning_rate": 2.68572213860847e-05, "loss": 0.3278, "step": 877 }, { "epoch": 2.2469609724888038, "grad_norm": 0.439703599513908, "learning_rate": 2.6823578867111072e-05, "loss": 0.3207, "step": 878 }, { "epoch": 2.2495201535508635, "grad_norm": 0.48436216914849156, "learning_rate": 2.6789914482728546e-05, "loss": 0.3923, "step": 879 }, { "epoch": 2.2520793346129238, "grad_norm": 0.4065191432791332, "learning_rate": 2.6756228340810946e-05, "loss": 0.3092, "step": 880 }, { "epoch": 2.254638515674984, "grad_norm": 0.45257368911274026, "learning_rate": 2.6722520549301813e-05, "loss": 0.3201, "step": 881 }, { "epoch": 2.257197696737044, "grad_norm": 0.47676366709922463, "learning_rate": 2.6688791216214064e-05, "loss": 0.3552, "step": 882 }, { "epoch": 2.2597568777991044, "grad_norm": 0.4388935470431639, "learning_rate": 2.6655040449629646e-05, "loss": 0.3117, "step": 883 }, { "epoch": 2.2623160588611646, "grad_norm": 0.49852041258030133, "learning_rate": 2.6621268357699165e-05, "loss": 0.2986, "step": 884 }, { "epoch": 2.2648752399232244, "grad_norm": 0.5574815275071192, "learning_rate": 2.6587475048641596e-05, "loss": 0.3652, "step": 885 }, { "epoch": 2.2674344209852846, "grad_norm": 0.532498078005579, "learning_rate": 2.655366063074388e-05, "loss": 0.3361, "step": 886 }, { "epoch": 2.269993602047345, "grad_norm": 0.4905390491427929, "learning_rate": 2.6519825212360607e-05, "loss": 0.2904, "step": 887 }, { "epoch": 2.272552783109405, "grad_norm": 0.4548356990918924, "learning_rate": 2.6485968901913658e-05, "loss": 0.3383, "step": 888 }, { "epoch": 2.275111964171465, "grad_norm": 0.4527361560109495, "learning_rate": 2.6452091807891855e-05, "loss": 0.3395, "step": 889 }, { "epoch": 2.2776711452335254, "grad_norm": 0.4444282648587205, "learning_rate": 2.6418194038850634e-05, "loss": 0.3155, "step": 890 }, { "epoch": 2.2802303262955856, "grad_norm": 0.37005966275705604, "learning_rate": 2.6384275703411666e-05, "loss": 0.3172, "step": 891 }, { "epoch": 2.2827895073576454, "grad_norm": 0.4280292706382066, "learning_rate": 2.635033691026253e-05, "loss": 0.3643, "step": 892 }, { "epoch": 2.2853486884197056, "grad_norm": 0.46336387808601265, "learning_rate": 2.6316377768156366e-05, "loss": 0.3516, "step": 893 }, { "epoch": 2.287907869481766, "grad_norm": 0.3675246620437907, "learning_rate": 2.6282398385911503e-05, "loss": 0.2782, "step": 894 }, { "epoch": 2.290467050543826, "grad_norm": 0.4188680082069283, "learning_rate": 2.624839887241115e-05, "loss": 0.3521, "step": 895 }, { "epoch": 2.2930262316058863, "grad_norm": 0.42684239051457756, "learning_rate": 2.6214379336603016e-05, "loss": 0.2909, "step": 896 }, { "epoch": 2.295585412667946, "grad_norm": 0.36815581001539444, "learning_rate": 2.618033988749895e-05, "loss": 0.3068, "step": 897 }, { "epoch": 2.2981445937300062, "grad_norm": 0.4355098607462678, "learning_rate": 2.614628063417464e-05, "loss": 0.3561, "step": 898 }, { "epoch": 2.3007037747920664, "grad_norm": 0.4278330160593537, "learning_rate": 2.6112201685769224e-05, "loss": 0.3265, "step": 899 }, { "epoch": 2.3032629558541267, "grad_norm": 0.39014595589180573, "learning_rate": 2.607810315148494e-05, "loss": 0.3569, "step": 900 }, { "epoch": 2.305822136916187, "grad_norm": 0.4667698942028393, "learning_rate": 2.60439851405868e-05, "loss": 0.3631, "step": 901 }, { "epoch": 2.308381317978247, "grad_norm": 0.41965973806882434, "learning_rate": 2.600984776240222e-05, "loss": 0.3248, "step": 902 }, { "epoch": 2.3109404990403073, "grad_norm": 0.44724491924157506, "learning_rate": 2.5975691126320678e-05, "loss": 0.3854, "step": 903 }, { "epoch": 2.313499680102367, "grad_norm": 0.46251594524874595, "learning_rate": 2.5941515341793366e-05, "loss": 0.3503, "step": 904 }, { "epoch": 2.3160588611644273, "grad_norm": 0.35346075618792994, "learning_rate": 2.5907320518332827e-05, "loss": 0.3309, "step": 905 }, { "epoch": 2.3186180422264875, "grad_norm": 0.3826935478844609, "learning_rate": 2.587310676551262e-05, "loss": 0.2894, "step": 906 }, { "epoch": 2.3211772232885477, "grad_norm": 0.416397755264069, "learning_rate": 2.5838874192966953e-05, "loss": 0.3716, "step": 907 }, { "epoch": 2.323736404350608, "grad_norm": 0.3982851139464845, "learning_rate": 2.5804622910390348e-05, "loss": 0.2833, "step": 908 }, { "epoch": 2.326295585412668, "grad_norm": 0.4591104893849011, "learning_rate": 2.5770353027537276e-05, "loss": 0.3277, "step": 909 }, { "epoch": 2.328854766474728, "grad_norm": 0.382725152261151, "learning_rate": 2.5736064654221808e-05, "loss": 0.323, "step": 910 }, { "epoch": 2.331413947536788, "grad_norm": 0.5087928821054611, "learning_rate": 2.5701757900317277e-05, "loss": 0.3314, "step": 911 }, { "epoch": 2.3339731285988483, "grad_norm": 0.3914001278691039, "learning_rate": 2.5667432875755904e-05, "loss": 0.3508, "step": 912 }, { "epoch": 2.3365323096609085, "grad_norm": 0.4510480545622787, "learning_rate": 2.5633089690528455e-05, "loss": 0.3529, "step": 913 }, { "epoch": 2.3390914907229687, "grad_norm": 0.4134813709280613, "learning_rate": 2.559872845468391e-05, "loss": 0.3286, "step": 914 }, { "epoch": 2.341650671785029, "grad_norm": 0.40105410165144934, "learning_rate": 2.5564349278329056e-05, "loss": 0.2852, "step": 915 }, { "epoch": 2.344209852847089, "grad_norm": 0.49998212235296036, "learning_rate": 2.5529952271628192e-05, "loss": 0.2916, "step": 916 }, { "epoch": 2.346769033909149, "grad_norm": 0.46734623561200184, "learning_rate": 2.5495537544802757e-05, "loss": 0.3497, "step": 917 }, { "epoch": 2.349328214971209, "grad_norm": 0.43677669206755015, "learning_rate": 2.5461105208130953e-05, "loss": 0.359, "step": 918 }, { "epoch": 2.3518873960332694, "grad_norm": 0.49010454865810016, "learning_rate": 2.542665537194742e-05, "loss": 0.3368, "step": 919 }, { "epoch": 2.3544465770953296, "grad_norm": 0.370850360816377, "learning_rate": 2.539218814664288e-05, "loss": 0.3222, "step": 920 }, { "epoch": 2.3570057581573898, "grad_norm": 0.46886497417633327, "learning_rate": 2.5357703642663766e-05, "loss": 0.3633, "step": 921 }, { "epoch": 2.3595649392194495, "grad_norm": 0.4227514997155462, "learning_rate": 2.5323201970511883e-05, "loss": 0.3497, "step": 922 }, { "epoch": 2.3621241202815098, "grad_norm": 0.4015401134012503, "learning_rate": 2.528868324074405e-05, "loss": 0.3076, "step": 923 }, { "epoch": 2.36468330134357, "grad_norm": 0.4012146844135177, "learning_rate": 2.525414756397174e-05, "loss": 0.3117, "step": 924 }, { "epoch": 2.36724248240563, "grad_norm": 0.3724913651532696, "learning_rate": 2.521959505086075e-05, "loss": 0.2948, "step": 925 }, { "epoch": 2.3698016634676904, "grad_norm": 0.44515489828057647, "learning_rate": 2.5185025812130794e-05, "loss": 0.3624, "step": 926 }, { "epoch": 2.3723608445297506, "grad_norm": 0.4333489454322345, "learning_rate": 2.5150439958555205e-05, "loss": 0.3254, "step": 927 }, { "epoch": 2.374920025591811, "grad_norm": 0.4284602049159506, "learning_rate": 2.5115837600960564e-05, "loss": 0.3232, "step": 928 }, { "epoch": 2.3774792066538706, "grad_norm": 0.4327763714080197, "learning_rate": 2.5081218850226315e-05, "loss": 0.3213, "step": 929 }, { "epoch": 2.380038387715931, "grad_norm": 0.4092391708238846, "learning_rate": 2.5046583817284437e-05, "loss": 0.3645, "step": 930 }, { "epoch": 2.382597568777991, "grad_norm": 0.3874157650273361, "learning_rate": 2.5011932613119098e-05, "loss": 0.3546, "step": 931 }, { "epoch": 2.385156749840051, "grad_norm": 0.4154060512286611, "learning_rate": 2.497726534876627e-05, "loss": 0.3724, "step": 932 }, { "epoch": 2.3877159309021114, "grad_norm": 0.3520365071455179, "learning_rate": 2.4942582135313393e-05, "loss": 0.3171, "step": 933 }, { "epoch": 2.3902751119641716, "grad_norm": 0.4129362868719995, "learning_rate": 2.490788308389902e-05, "loss": 0.3081, "step": 934 }, { "epoch": 2.3928342930262314, "grad_norm": 0.3826332240853489, "learning_rate": 2.487316830571244e-05, "loss": 0.3167, "step": 935 }, { "epoch": 2.3953934740882916, "grad_norm": 0.41073512437749543, "learning_rate": 2.4838437911993355e-05, "loss": 0.2872, "step": 936 }, { "epoch": 2.397952655150352, "grad_norm": 0.41060249122236425, "learning_rate": 2.48036920140315e-05, "loss": 0.3331, "step": 937 }, { "epoch": 2.400511836212412, "grad_norm": 0.39939347922246277, "learning_rate": 2.4768930723166266e-05, "loss": 0.309, "step": 938 }, { "epoch": 2.4030710172744723, "grad_norm": 0.44800610692896503, "learning_rate": 2.473415415078642e-05, "loss": 0.3301, "step": 939 }, { "epoch": 2.4056301983365325, "grad_norm": 0.45045037484262557, "learning_rate": 2.4699362408329646e-05, "loss": 0.3545, "step": 940 }, { "epoch": 2.4081893793985927, "grad_norm": 0.4516520833709128, "learning_rate": 2.466455560728227e-05, "loss": 0.3219, "step": 941 }, { "epoch": 2.4107485604606524, "grad_norm": 0.4140396756771499, "learning_rate": 2.4629733859178867e-05, "loss": 0.3312, "step": 942 }, { "epoch": 2.4133077415227127, "grad_norm": 0.4242325070475781, "learning_rate": 2.4594897275601887e-05, "loss": 0.3657, "step": 943 }, { "epoch": 2.415866922584773, "grad_norm": 0.3534056791478892, "learning_rate": 2.456004596818135e-05, "loss": 0.2875, "step": 944 }, { "epoch": 2.418426103646833, "grad_norm": 0.4504625178937038, "learning_rate": 2.4525180048594452e-05, "loss": 0.3947, "step": 945 }, { "epoch": 2.4209852847088933, "grad_norm": 0.3713456955088067, "learning_rate": 2.4490299628565168e-05, "loss": 0.3365, "step": 946 }, { "epoch": 2.423544465770953, "grad_norm": 0.4139332630376369, "learning_rate": 2.4455404819864e-05, "loss": 0.3213, "step": 947 }, { "epoch": 2.4261036468330133, "grad_norm": 0.39284365253142334, "learning_rate": 2.4420495734307527e-05, "loss": 0.3707, "step": 948 }, { "epoch": 2.4286628278950735, "grad_norm": 0.44240961049247096, "learning_rate": 2.4385572483758066e-05, "loss": 0.373, "step": 949 }, { "epoch": 2.4312220089571337, "grad_norm": 0.41468110454884644, "learning_rate": 2.435063518012335e-05, "loss": 0.3791, "step": 950 }, { "epoch": 2.433781190019194, "grad_norm": 0.38546461774505014, "learning_rate": 2.4315683935356127e-05, "loss": 0.3092, "step": 951 }, { "epoch": 2.436340371081254, "grad_norm": 0.3973539449011059, "learning_rate": 2.4280718861453814e-05, "loss": 0.3537, "step": 952 }, { "epoch": 2.4388995521433143, "grad_norm": 0.40087880001543535, "learning_rate": 2.424574007045816e-05, "loss": 0.3513, "step": 953 }, { "epoch": 2.441458733205374, "grad_norm": 0.4363352682087938, "learning_rate": 2.421074767445485e-05, "loss": 0.3168, "step": 954 }, { "epoch": 2.4440179142674343, "grad_norm": 0.387588478700538, "learning_rate": 2.4175741785573177e-05, "loss": 0.3156, "step": 955 }, { "epoch": 2.4465770953294945, "grad_norm": 0.43136617250905906, "learning_rate": 2.4140722515985666e-05, "loss": 0.3396, "step": 956 }, { "epoch": 2.4491362763915547, "grad_norm": 0.4356259978949205, "learning_rate": 2.4105689977907722e-05, "loss": 0.3633, "step": 957 }, { "epoch": 2.451695457453615, "grad_norm": 0.3939046327707216, "learning_rate": 2.407064428359726e-05, "loss": 0.3367, "step": 958 }, { "epoch": 2.454254638515675, "grad_norm": 0.44530753935780215, "learning_rate": 2.4035585545354353e-05, "loss": 0.2652, "step": 959 }, { "epoch": 2.456813819577735, "grad_norm": 0.39324736708789354, "learning_rate": 2.4000513875520892e-05, "loss": 0.3497, "step": 960 }, { "epoch": 2.459373000639795, "grad_norm": 0.3745102508002373, "learning_rate": 2.396542938648018e-05, "loss": 0.351, "step": 961 }, { "epoch": 2.4619321817018553, "grad_norm": 0.433148063417755, "learning_rate": 2.3930332190656604e-05, "loss": 0.3226, "step": 962 }, { "epoch": 2.4644913627639156, "grad_norm": 0.5060978327975577, "learning_rate": 2.3895222400515282e-05, "loss": 0.3944, "step": 963 }, { "epoch": 2.4670505438259758, "grad_norm": 0.3831333684566055, "learning_rate": 2.3860100128561677e-05, "loss": 0.303, "step": 964 }, { "epoch": 2.469609724888036, "grad_norm": 0.4834696949807748, "learning_rate": 2.3824965487341247e-05, "loss": 0.36, "step": 965 }, { "epoch": 2.472168905950096, "grad_norm": 0.48539696684918826, "learning_rate": 2.3789818589439094e-05, "loss": 0.3418, "step": 966 }, { "epoch": 2.474728087012156, "grad_norm": 0.3663525699221002, "learning_rate": 2.375465954747959e-05, "loss": 0.2906, "step": 967 }, { "epoch": 2.477287268074216, "grad_norm": 0.4444877062926493, "learning_rate": 2.371948847412602e-05, "loss": 0.3281, "step": 968 }, { "epoch": 2.4798464491362764, "grad_norm": 0.37267946857207057, "learning_rate": 2.3684305482080233e-05, "loss": 0.3214, "step": 969 }, { "epoch": 2.4824056301983366, "grad_norm": 0.422309895496103, "learning_rate": 2.3649110684082258e-05, "loss": 0.3309, "step": 970 }, { "epoch": 2.484964811260397, "grad_norm": 0.366896575024139, "learning_rate": 2.361390419290995e-05, "loss": 0.3359, "step": 971 }, { "epoch": 2.4875239923224566, "grad_norm": 0.4252498965109737, "learning_rate": 2.357868612137866e-05, "loss": 0.3162, "step": 972 }, { "epoch": 2.490083173384517, "grad_norm": 0.4454769676233995, "learning_rate": 2.3543456582340815e-05, "loss": 0.3458, "step": 973 }, { "epoch": 2.492642354446577, "grad_norm": 0.504528012046428, "learning_rate": 2.3508215688685607e-05, "loss": 0.3783, "step": 974 }, { "epoch": 2.495201535508637, "grad_norm": 0.3791982649378316, "learning_rate": 2.3472963553338614e-05, "loss": 0.3439, "step": 975 }, { "epoch": 2.4977607165706974, "grad_norm": 0.3958397655771158, "learning_rate": 2.3437700289261417e-05, "loss": 0.3098, "step": 976 }, { "epoch": 2.5003198976327576, "grad_norm": 0.4716678361651927, "learning_rate": 2.3402426009451288e-05, "loss": 0.3442, "step": 977 }, { "epoch": 2.502879078694818, "grad_norm": 0.41505752875646384, "learning_rate": 2.3367140826940768e-05, "loss": 0.3393, "step": 978 }, { "epoch": 2.505438259756878, "grad_norm": 0.4831411264450984, "learning_rate": 2.333184485479737e-05, "loss": 0.3406, "step": 979 }, { "epoch": 2.507997440818938, "grad_norm": 0.44259478488091053, "learning_rate": 2.3296538206123134e-05, "loss": 0.3498, "step": 980 }, { "epoch": 2.510556621880998, "grad_norm": 0.39650938475151654, "learning_rate": 2.326122099405435e-05, "loss": 0.3218, "step": 981 }, { "epoch": 2.5131158029430583, "grad_norm": 0.44478690078566685, "learning_rate": 2.3225893331761143e-05, "loss": 0.3354, "step": 982 }, { "epoch": 2.5156749840051185, "grad_norm": 0.4617579108787994, "learning_rate": 2.319055533244712e-05, "loss": 0.3689, "step": 983 }, { "epoch": 2.5182341650671782, "grad_norm": 0.36510971786258006, "learning_rate": 2.315520710934903e-05, "loss": 0.3189, "step": 984 }, { "epoch": 2.5207933461292384, "grad_norm": 0.5962978268524062, "learning_rate": 2.311984877573636e-05, "loss": 0.3785, "step": 985 }, { "epoch": 2.5233525271912987, "grad_norm": 0.4286182290118198, "learning_rate": 2.3084480444911006e-05, "loss": 0.2969, "step": 986 }, { "epoch": 2.525911708253359, "grad_norm": 0.406129472684799, "learning_rate": 2.304910223020691e-05, "loss": 0.3622, "step": 987 }, { "epoch": 2.528470889315419, "grad_norm": 0.40352070167371706, "learning_rate": 2.3013714244989665e-05, "loss": 0.3003, "step": 988 }, { "epoch": 2.5310300703774793, "grad_norm": 0.41853745249254193, "learning_rate": 2.2978316602656183e-05, "loss": 0.3545, "step": 989 }, { "epoch": 2.5335892514395395, "grad_norm": 0.3751248697656993, "learning_rate": 2.2942909416634326e-05, "loss": 0.3317, "step": 990 }, { "epoch": 2.5361484325015997, "grad_norm": 0.38457641982344676, "learning_rate": 2.290749280038252e-05, "loss": 0.3186, "step": 991 }, { "epoch": 2.5387076135636595, "grad_norm": 0.43410698680189885, "learning_rate": 2.2872066867389434e-05, "loss": 0.3819, "step": 992 }, { "epoch": 2.5412667946257197, "grad_norm": 0.3827463318912182, "learning_rate": 2.2836631731173577e-05, "loss": 0.3428, "step": 993 }, { "epoch": 2.54382597568778, "grad_norm": 0.3626009354081465, "learning_rate": 2.2801187505282948e-05, "loss": 0.3313, "step": 994 }, { "epoch": 2.54638515674984, "grad_norm": 0.3879708806451702, "learning_rate": 2.2765734303294666e-05, "loss": 0.302, "step": 995 }, { "epoch": 2.5489443378119003, "grad_norm": 0.3615193094932171, "learning_rate": 2.2730272238814636e-05, "loss": 0.3022, "step": 996 }, { "epoch": 2.55150351887396, "grad_norm": 0.38109707078147037, "learning_rate": 2.2694801425477136e-05, "loss": 0.3199, "step": 997 }, { "epoch": 2.5540626999360203, "grad_norm": 0.35624704442372485, "learning_rate": 2.2659321976944507e-05, "loss": 0.3394, "step": 998 }, { "epoch": 2.5566218809980805, "grad_norm": 0.5236454693659701, "learning_rate": 2.2623834006906732e-05, "loss": 0.3254, "step": 999 }, { "epoch": 2.5591810620601407, "grad_norm": 0.4793516094510245, "learning_rate": 2.2588337629081107e-05, "loss": 0.4122, "step": 1000 }, { "epoch": 2.561740243122201, "grad_norm": 0.32519173651998734, "learning_rate": 2.25528329572119e-05, "loss": 0.2782, "step": 1001 }, { "epoch": 2.564299424184261, "grad_norm": 0.47513564073653997, "learning_rate": 2.25173201050699e-05, "loss": 0.4075, "step": 1002 }, { "epoch": 2.5668586052463214, "grad_norm": 0.34327531232567976, "learning_rate": 2.248179918645216e-05, "loss": 0.2602, "step": 1003 }, { "epoch": 2.5694177863083816, "grad_norm": 0.42876499217691605, "learning_rate": 2.2446270315181566e-05, "loss": 0.3538, "step": 1004 }, { "epoch": 2.5719769673704413, "grad_norm": 0.38643082957234787, "learning_rate": 2.2410733605106462e-05, "loss": 0.3331, "step": 1005 }, { "epoch": 2.5745361484325016, "grad_norm": 0.3845952145329833, "learning_rate": 2.237518917010035e-05, "loss": 0.3068, "step": 1006 }, { "epoch": 2.5770953294945618, "grad_norm": 0.398304345128026, "learning_rate": 2.233963712406147e-05, "loss": 0.3455, "step": 1007 }, { "epoch": 2.579654510556622, "grad_norm": 0.4106067027436424, "learning_rate": 2.2304077580912423e-05, "loss": 0.3266, "step": 1008 }, { "epoch": 2.5822136916186818, "grad_norm": 0.3515875497757696, "learning_rate": 2.2268510654599885e-05, "loss": 0.3089, "step": 1009 }, { "epoch": 2.584772872680742, "grad_norm": 0.34230034407291976, "learning_rate": 2.2232936459094158e-05, "loss": 0.37, "step": 1010 }, { "epoch": 2.587332053742802, "grad_norm": 0.3409277384030245, "learning_rate": 2.2197355108388835e-05, "loss": 0.3425, "step": 1011 }, { "epoch": 2.5898912348048624, "grad_norm": 0.3659124451915072, "learning_rate": 2.216176671650045e-05, "loss": 0.3417, "step": 1012 }, { "epoch": 2.5924504158669226, "grad_norm": 0.34644737723549984, "learning_rate": 2.2126171397468105e-05, "loss": 0.3048, "step": 1013 }, { "epoch": 2.595009596928983, "grad_norm": 0.34192043331418503, "learning_rate": 2.209056926535307e-05, "loss": 0.3245, "step": 1014 }, { "epoch": 2.597568777991043, "grad_norm": 0.4024225219500372, "learning_rate": 2.205496043423849e-05, "loss": 0.3501, "step": 1015 }, { "epoch": 2.6001279590531032, "grad_norm": 0.3444122394151278, "learning_rate": 2.2019345018228922e-05, "loss": 0.3403, "step": 1016 }, { "epoch": 2.602687140115163, "grad_norm": 0.41753443691652886, "learning_rate": 2.1983723131450088e-05, "loss": 0.3609, "step": 1017 }, { "epoch": 2.605246321177223, "grad_norm": 0.42275585901863255, "learning_rate": 2.194809488804839e-05, "loss": 0.3427, "step": 1018 }, { "epoch": 2.6078055022392834, "grad_norm": 0.3513925245942965, "learning_rate": 2.1912460402190625e-05, "loss": 0.2984, "step": 1019 }, { "epoch": 2.6103646833013436, "grad_norm": 0.4845404009383636, "learning_rate": 2.1876819788063586e-05, "loss": 0.342, "step": 1020 }, { "epoch": 2.612923864363404, "grad_norm": 0.38632744480954595, "learning_rate": 2.1841173159873718e-05, "loss": 0.3178, "step": 1021 }, { "epoch": 2.6154830454254636, "grad_norm": 0.40341965144316216, "learning_rate": 2.1805520631846705e-05, "loss": 0.3454, "step": 1022 }, { "epoch": 2.618042226487524, "grad_norm": 0.5429183555857332, "learning_rate": 2.176986231822717e-05, "loss": 0.3407, "step": 1023 }, { "epoch": 2.620601407549584, "grad_norm": 0.3693931345744361, "learning_rate": 2.173419833327826e-05, "loss": 0.2931, "step": 1024 }, { "epoch": 2.6231605886116443, "grad_norm": 0.41461427921632693, "learning_rate": 2.16985287912813e-05, "loss": 0.3462, "step": 1025 }, { "epoch": 2.6257197696737045, "grad_norm": 0.35793827637412173, "learning_rate": 2.166285380653541e-05, "loss": 0.2649, "step": 1026 }, { "epoch": 2.6282789507357647, "grad_norm": 0.43455345040805726, "learning_rate": 2.1627173493357167e-05, "loss": 0.3432, "step": 1027 }, { "epoch": 2.630838131797825, "grad_norm": 0.36886933829443885, "learning_rate": 2.1591487966080215e-05, "loss": 0.3106, "step": 1028 }, { "epoch": 2.633397312859885, "grad_norm": 0.3991822617060509, "learning_rate": 2.1555797339054898e-05, "loss": 0.3621, "step": 1029 }, { "epoch": 2.635956493921945, "grad_norm": 0.3967391590295086, "learning_rate": 2.1520101726647922e-05, "loss": 0.3711, "step": 1030 }, { "epoch": 2.638515674984005, "grad_norm": 0.3577916760514241, "learning_rate": 2.1484401243241947e-05, "loss": 0.2945, "step": 1031 }, { "epoch": 2.6410748560460653, "grad_norm": 0.40155422081290365, "learning_rate": 2.1448696003235252e-05, "loss": 0.3366, "step": 1032 }, { "epoch": 2.6436340371081255, "grad_norm": 0.36541079152322986, "learning_rate": 2.1412986121041355e-05, "loss": 0.2932, "step": 1033 }, { "epoch": 2.6461932181701853, "grad_norm": 0.3484241132798254, "learning_rate": 2.1377271711088655e-05, "loss": 0.3339, "step": 1034 }, { "epoch": 2.6487523992322455, "grad_norm": 0.43519634098943255, "learning_rate": 2.1341552887820048e-05, "loss": 0.3762, "step": 1035 }, { "epoch": 2.6513115802943057, "grad_norm": 0.33001353721510546, "learning_rate": 2.1305829765692588e-05, "loss": 0.3277, "step": 1036 }, { "epoch": 2.653870761356366, "grad_norm": 0.3720609129475248, "learning_rate": 2.1270102459177093e-05, "loss": 0.3101, "step": 1037 }, { "epoch": 2.656429942418426, "grad_norm": 0.3837314560637158, "learning_rate": 2.123437108275779e-05, "loss": 0.351, "step": 1038 }, { "epoch": 2.6589891234804863, "grad_norm": 0.40361073664191494, "learning_rate": 2.119863575093195e-05, "loss": 0.3171, "step": 1039 }, { "epoch": 2.6615483045425465, "grad_norm": 0.35854323369583274, "learning_rate": 2.1162896578209517e-05, "loss": 0.3253, "step": 1040 }, { "epoch": 2.6641074856046068, "grad_norm": 0.37248590893696937, "learning_rate": 2.112715367911275e-05, "loss": 0.3511, "step": 1041 }, { "epoch": 2.6666666666666665, "grad_norm": 0.3453384873567075, "learning_rate": 2.1091407168175836e-05, "loss": 0.3414, "step": 1042 }, { "epoch": 2.6692258477287267, "grad_norm": 0.40821485869737884, "learning_rate": 2.1055657159944545e-05, "loss": 0.3326, "step": 1043 }, { "epoch": 2.671785028790787, "grad_norm": 0.36123413084691214, "learning_rate": 2.1019903768975852e-05, "loss": 0.3298, "step": 1044 }, { "epoch": 2.674344209852847, "grad_norm": 0.3404084610072503, "learning_rate": 2.0984147109837564e-05, "loss": 0.3047, "step": 1045 }, { "epoch": 2.6769033909149074, "grad_norm": 0.4454775171788898, "learning_rate": 2.094838729710798e-05, "loss": 0.3679, "step": 1046 }, { "epoch": 2.679462571976967, "grad_norm": 0.3571602753390297, "learning_rate": 2.0912624445375483e-05, "loss": 0.3261, "step": 1047 }, { "epoch": 2.6820217530390273, "grad_norm": 0.332410280409988, "learning_rate": 2.0876858669238206e-05, "loss": 0.3114, "step": 1048 }, { "epoch": 2.6845809341010876, "grad_norm": 0.42377420739915694, "learning_rate": 2.0841090083303643e-05, "loss": 0.342, "step": 1049 }, { "epoch": 2.6871401151631478, "grad_norm": 0.3395618485708725, "learning_rate": 2.0805318802188307e-05, "loss": 0.3157, "step": 1050 }, { "epoch": 2.689699296225208, "grad_norm": 0.3545398294326781, "learning_rate": 2.0769544940517326e-05, "loss": 0.3207, "step": 1051 }, { "epoch": 2.692258477287268, "grad_norm": 0.41494596285417495, "learning_rate": 2.0733768612924137e-05, "loss": 0.3294, "step": 1052 }, { "epoch": 2.6948176583493284, "grad_norm": 0.3439663523935823, "learning_rate": 2.0697989934050025e-05, "loss": 0.2852, "step": 1053 }, { "epoch": 2.6973768394113886, "grad_norm": 0.41679625940370135, "learning_rate": 2.0662209018543836e-05, "loss": 0.3548, "step": 1054 }, { "epoch": 2.6999360204734484, "grad_norm": 0.4309626747124301, "learning_rate": 2.0626425981061608e-05, "loss": 0.326, "step": 1055 }, { "epoch": 2.7024952015355086, "grad_norm": 0.3700268039056913, "learning_rate": 2.0590640936266132e-05, "loss": 0.3346, "step": 1056 }, { "epoch": 2.705054382597569, "grad_norm": 0.4092764582830664, "learning_rate": 2.0554853998826652e-05, "loss": 0.3432, "step": 1057 }, { "epoch": 2.707613563659629, "grad_norm": 0.3003644777646875, "learning_rate": 2.0519065283418494e-05, "loss": 0.257, "step": 1058 }, { "epoch": 2.710172744721689, "grad_norm": 0.36206552407899595, "learning_rate": 2.0483274904722647e-05, "loss": 0.3339, "step": 1059 }, { "epoch": 2.712731925783749, "grad_norm": 0.41404402491302905, "learning_rate": 2.0447482977425465e-05, "loss": 0.3705, "step": 1060 }, { "epoch": 2.715291106845809, "grad_norm": 0.35467256324797414, "learning_rate": 2.0411689616218234e-05, "loss": 0.3173, "step": 1061 }, { "epoch": 2.7178502879078694, "grad_norm": 0.39211100334061155, "learning_rate": 2.037589493579685e-05, "loss": 0.336, "step": 1062 }, { "epoch": 2.7204094689699296, "grad_norm": 0.3752676154141884, "learning_rate": 2.034009905086144e-05, "loss": 0.339, "step": 1063 }, { "epoch": 2.72296865003199, "grad_norm": 0.41294533749554696, "learning_rate": 2.0304302076115987e-05, "loss": 0.3187, "step": 1064 }, { "epoch": 2.72552783109405, "grad_norm": 0.37876928739351945, "learning_rate": 2.0268504126267952e-05, "loss": 0.2895, "step": 1065 }, { "epoch": 2.7280870121561103, "grad_norm": 0.4001922065674535, "learning_rate": 2.0232705316027946e-05, "loss": 0.3153, "step": 1066 }, { "epoch": 2.73064619321817, "grad_norm": 0.8048596079823415, "learning_rate": 2.019690576010931e-05, "loss": 0.3593, "step": 1067 }, { "epoch": 2.7332053742802302, "grad_norm": 0.38282646966391626, "learning_rate": 2.0161105573227798e-05, "loss": 0.3035, "step": 1068 }, { "epoch": 2.7357645553422905, "grad_norm": 0.46315568817195285, "learning_rate": 2.0125304870101184e-05, "loss": 0.3751, "step": 1069 }, { "epoch": 2.7383237364043507, "grad_norm": 0.4109495278872969, "learning_rate": 2.008950376544887e-05, "loss": 0.3346, "step": 1070 }, { "epoch": 2.740882917466411, "grad_norm": 0.4857096813595627, "learning_rate": 2.005370237399157e-05, "loss": 0.328, "step": 1071 }, { "epoch": 2.7434420985284707, "grad_norm": 0.39791730028337013, "learning_rate": 2.0017900810450923e-05, "loss": 0.2865, "step": 1072 }, { "epoch": 2.746001279590531, "grad_norm": 0.4465305517364689, "learning_rate": 1.9982099189549087e-05, "loss": 0.3647, "step": 1073 }, { "epoch": 2.748560460652591, "grad_norm": 0.41005369965327937, "learning_rate": 1.9946297626008432e-05, "loss": 0.3151, "step": 1074 }, { "epoch": 2.7511196417146513, "grad_norm": 0.49665282457932985, "learning_rate": 1.9910496234551132e-05, "loss": 0.3809, "step": 1075 }, { "epoch": 2.7536788227767115, "grad_norm": 0.37361149681320743, "learning_rate": 1.9874695129898826e-05, "loss": 0.3221, "step": 1076 }, { "epoch": 2.7562380038387717, "grad_norm": 0.51447395549755, "learning_rate": 1.9838894426772205e-05, "loss": 0.3677, "step": 1077 }, { "epoch": 2.758797184900832, "grad_norm": 0.4601671007615494, "learning_rate": 1.9803094239890692e-05, "loss": 0.3519, "step": 1078 }, { "epoch": 2.761356365962892, "grad_norm": 0.39680274509707913, "learning_rate": 1.9767294683972064e-05, "loss": 0.3521, "step": 1079 }, { "epoch": 2.763915547024952, "grad_norm": 0.43847108432837295, "learning_rate": 1.9731495873732055e-05, "loss": 0.3346, "step": 1080 }, { "epoch": 2.766474728087012, "grad_norm": 0.4078264869228389, "learning_rate": 1.969569792388402e-05, "loss": 0.3624, "step": 1081 }, { "epoch": 2.7690339091490723, "grad_norm": 0.37340927869032653, "learning_rate": 1.9659900949138562e-05, "loss": 0.3252, "step": 1082 }, { "epoch": 2.7715930902111325, "grad_norm": 0.7622901676621391, "learning_rate": 1.9624105064203157e-05, "loss": 0.3829, "step": 1083 }, { "epoch": 2.7741522712731923, "grad_norm": 0.4249232215662602, "learning_rate": 1.9588310383781773e-05, "loss": 0.342, "step": 1084 }, { "epoch": 2.7767114523352525, "grad_norm": 0.41402606002990694, "learning_rate": 1.9552517022574542e-05, "loss": 0.3756, "step": 1085 }, { "epoch": 2.7792706333973127, "grad_norm": 0.48132928718497536, "learning_rate": 1.951672509527736e-05, "loss": 0.3263, "step": 1086 }, { "epoch": 2.781829814459373, "grad_norm": 0.3954278720969905, "learning_rate": 1.9480934716581513e-05, "loss": 0.3209, "step": 1087 }, { "epoch": 2.784388995521433, "grad_norm": 0.39183072418087983, "learning_rate": 1.944514600117335e-05, "loss": 0.3807, "step": 1088 }, { "epoch": 2.7869481765834934, "grad_norm": 0.4620346036605357, "learning_rate": 1.940935906373388e-05, "loss": 0.3576, "step": 1089 }, { "epoch": 2.7895073576455536, "grad_norm": 0.36599489933850005, "learning_rate": 1.93735740189384e-05, "loss": 0.3239, "step": 1090 }, { "epoch": 2.792066538707614, "grad_norm": 0.3686130358461154, "learning_rate": 1.9337790981456164e-05, "loss": 0.331, "step": 1091 }, { "epoch": 2.7946257197696736, "grad_norm": 0.38425163759635245, "learning_rate": 1.930201006594999e-05, "loss": 0.2916, "step": 1092 }, { "epoch": 2.7971849008317338, "grad_norm": 0.4405563689728695, "learning_rate": 1.926623138707587e-05, "loss": 0.3361, "step": 1093 }, { "epoch": 2.799744081893794, "grad_norm": 0.38706329631394476, "learning_rate": 1.923045505948267e-05, "loss": 0.323, "step": 1094 }, { "epoch": 2.802303262955854, "grad_norm": 0.4708041341171456, "learning_rate": 1.9194681197811703e-05, "loss": 0.3378, "step": 1095 }, { "epoch": 2.8048624440179144, "grad_norm": 0.43844070620974923, "learning_rate": 1.915890991669636e-05, "loss": 0.331, "step": 1096 }, { "epoch": 2.807421625079974, "grad_norm": 0.4532919809550908, "learning_rate": 1.9123141330761804e-05, "loss": 0.3863, "step": 1097 }, { "epoch": 2.8099808061420344, "grad_norm": 0.48356206097375876, "learning_rate": 1.9087375554624527e-05, "loss": 0.3241, "step": 1098 }, { "epoch": 2.8125399872040946, "grad_norm": 0.37016571224063527, "learning_rate": 1.9051612702892028e-05, "loss": 0.3035, "step": 1099 }, { "epoch": 2.815099168266155, "grad_norm": 0.5488710646000331, "learning_rate": 1.901585289016244e-05, "loss": 0.3365, "step": 1100 }, { "epoch": 2.817658349328215, "grad_norm": 0.39186801965858076, "learning_rate": 1.898009623102415e-05, "loss": 0.3171, "step": 1101 }, { "epoch": 2.8202175303902752, "grad_norm": 0.396128192090393, "learning_rate": 1.894434284005546e-05, "loss": 0.2926, "step": 1102 }, { "epoch": 2.8227767114523354, "grad_norm": 0.4498006361647164, "learning_rate": 1.890859283182417e-05, "loss": 0.339, "step": 1103 }, { "epoch": 2.8253358925143957, "grad_norm": 0.4421659448994112, "learning_rate": 1.887284632088725e-05, "loss": 0.3512, "step": 1104 }, { "epoch": 2.8278950735764554, "grad_norm": 0.40391454153472733, "learning_rate": 1.8837103421790486e-05, "loss": 0.3431, "step": 1105 }, { "epoch": 2.8304542546385156, "grad_norm": 0.4029197127830964, "learning_rate": 1.8801364249068053e-05, "loss": 0.3083, "step": 1106 }, { "epoch": 2.833013435700576, "grad_norm": 0.3636262656158161, "learning_rate": 1.8765628917242213e-05, "loss": 0.2695, "step": 1107 }, { "epoch": 2.835572616762636, "grad_norm": 0.41088999848761776, "learning_rate": 1.8729897540822914e-05, "loss": 0.3191, "step": 1108 }, { "epoch": 2.838131797824696, "grad_norm": 0.4141748812616923, "learning_rate": 1.8694170234307415e-05, "loss": 0.3494, "step": 1109 }, { "epoch": 2.840690978886756, "grad_norm": 0.40924209313260995, "learning_rate": 1.8658447112179952e-05, "loss": 0.3424, "step": 1110 }, { "epoch": 2.8432501599488162, "grad_norm": 0.38985761612852116, "learning_rate": 1.8622728288911358e-05, "loss": 0.3367, "step": 1111 }, { "epoch": 2.8458093410108765, "grad_norm": 0.38078814823399454, "learning_rate": 1.858701387895865e-05, "loss": 0.3143, "step": 1112 }, { "epoch": 2.8483685220729367, "grad_norm": 0.34959904397806785, "learning_rate": 1.8551303996764755e-05, "loss": 0.3007, "step": 1113 }, { "epoch": 2.850927703134997, "grad_norm": 0.4183989425563707, "learning_rate": 1.8515598756758064e-05, "loss": 0.359, "step": 1114 }, { "epoch": 2.853486884197057, "grad_norm": 0.37111554641813266, "learning_rate": 1.8479898273352084e-05, "loss": 0.3381, "step": 1115 }, { "epoch": 2.8560460652591173, "grad_norm": 0.36998674648942864, "learning_rate": 1.8444202660945105e-05, "loss": 0.3317, "step": 1116 }, { "epoch": 2.858605246321177, "grad_norm": 0.4020148068654659, "learning_rate": 1.8408512033919798e-05, "loss": 0.3048, "step": 1117 }, { "epoch": 2.8611644273832373, "grad_norm": 0.4005462960969913, "learning_rate": 1.837282650664284e-05, "loss": 0.3603, "step": 1118 }, { "epoch": 2.8637236084452975, "grad_norm": 0.3444120356474249, "learning_rate": 1.8337146193464595e-05, "loss": 0.2898, "step": 1119 }, { "epoch": 2.8662827895073577, "grad_norm": 0.39082290145745685, "learning_rate": 1.83014712087187e-05, "loss": 0.3131, "step": 1120 }, { "epoch": 2.868841970569418, "grad_norm": 0.35821645503904304, "learning_rate": 1.8265801666721744e-05, "loss": 0.3433, "step": 1121 }, { "epoch": 2.8714011516314777, "grad_norm": 0.41668733158509796, "learning_rate": 1.8230137681772836e-05, "loss": 0.3567, "step": 1122 }, { "epoch": 2.873960332693538, "grad_norm": 0.34804389069027475, "learning_rate": 1.8194479368153298e-05, "loss": 0.3136, "step": 1123 }, { "epoch": 2.876519513755598, "grad_norm": 0.3911488779456878, "learning_rate": 1.8158826840126292e-05, "loss": 0.3412, "step": 1124 }, { "epoch": 2.8790786948176583, "grad_norm": 0.44396897741245006, "learning_rate": 1.8123180211936417e-05, "loss": 0.3644, "step": 1125 }, { "epoch": 2.8816378758797185, "grad_norm": 0.3818666952140594, "learning_rate": 1.808753959780938e-05, "loss": 0.2988, "step": 1126 }, { "epoch": 2.8841970569417787, "grad_norm": 0.3819899108018794, "learning_rate": 1.805190511195162e-05, "loss": 0.3304, "step": 1127 }, { "epoch": 2.886756238003839, "grad_norm": 0.3988449634404456, "learning_rate": 1.801627686854992e-05, "loss": 0.3413, "step": 1128 }, { "epoch": 2.889315419065899, "grad_norm": 0.4410132899232368, "learning_rate": 1.7980654981771074e-05, "loss": 0.3725, "step": 1129 }, { "epoch": 2.891874600127959, "grad_norm": 0.31362864495515286, "learning_rate": 1.794503956576152e-05, "loss": 0.2833, "step": 1130 }, { "epoch": 2.894433781190019, "grad_norm": 0.411109247445083, "learning_rate": 1.7909430734646936e-05, "loss": 0.3297, "step": 1131 }, { "epoch": 2.8969929622520794, "grad_norm": 0.3360115333630458, "learning_rate": 1.78738286025319e-05, "loss": 0.2985, "step": 1132 }, { "epoch": 2.8995521433141396, "grad_norm": 0.3728330681966131, "learning_rate": 1.7838233283499554e-05, "loss": 0.378, "step": 1133 }, { "epoch": 2.9021113243761993, "grad_norm": 0.3539275837478167, "learning_rate": 1.780264489161117e-05, "loss": 0.3638, "step": 1134 }, { "epoch": 2.9046705054382596, "grad_norm": 0.33582742300915935, "learning_rate": 1.776706354090585e-05, "loss": 0.3383, "step": 1135 }, { "epoch": 2.9072296865003198, "grad_norm": 0.38396584024437336, "learning_rate": 1.7731489345400118e-05, "loss": 0.3116, "step": 1136 }, { "epoch": 2.90978886756238, "grad_norm": 0.3121417283022767, "learning_rate": 1.769592241908758e-05, "loss": 0.3089, "step": 1137 }, { "epoch": 2.91234804862444, "grad_norm": 0.37871598245894667, "learning_rate": 1.766036287593854e-05, "loss": 0.3504, "step": 1138 }, { "epoch": 2.9149072296865004, "grad_norm": 0.4083761440481677, "learning_rate": 1.762481082989965e-05, "loss": 0.3338, "step": 1139 }, { "epoch": 2.9174664107485606, "grad_norm": 0.3759166218369834, "learning_rate": 1.758926639489354e-05, "loss": 0.3448, "step": 1140 }, { "epoch": 2.920025591810621, "grad_norm": 0.3918883448687237, "learning_rate": 1.755372968481844e-05, "loss": 0.3465, "step": 1141 }, { "epoch": 2.9225847728726806, "grad_norm": 0.45281102397931977, "learning_rate": 1.7518200813547842e-05, "loss": 0.352, "step": 1142 }, { "epoch": 2.925143953934741, "grad_norm": 0.3403187481784673, "learning_rate": 1.748267989493011e-05, "loss": 0.2767, "step": 1143 }, { "epoch": 2.927703134996801, "grad_norm": 0.32374732234103554, "learning_rate": 1.7447167042788108e-05, "loss": 0.3003, "step": 1144 }, { "epoch": 2.9302623160588612, "grad_norm": 0.43821825023278765, "learning_rate": 1.7411662370918893e-05, "loss": 0.3365, "step": 1145 }, { "epoch": 2.9328214971209214, "grad_norm": 0.3735728621894312, "learning_rate": 1.7376165993093278e-05, "loss": 0.3164, "step": 1146 }, { "epoch": 2.935380678182981, "grad_norm": 0.3713647905663265, "learning_rate": 1.7340678023055496e-05, "loss": 0.3237, "step": 1147 }, { "epoch": 2.9379398592450414, "grad_norm": 0.40116903737296333, "learning_rate": 1.7305198574522864e-05, "loss": 0.3614, "step": 1148 }, { "epoch": 2.9404990403071016, "grad_norm": 0.40054289621797295, "learning_rate": 1.7269727761185374e-05, "loss": 0.334, "step": 1149 }, { "epoch": 2.943058221369162, "grad_norm": 0.3925230509455669, "learning_rate": 1.7234265696705344e-05, "loss": 0.2959, "step": 1150 }, { "epoch": 2.945617402431222, "grad_norm": 0.42214888830794545, "learning_rate": 1.7198812494717062e-05, "loss": 0.3776, "step": 1151 }, { "epoch": 2.9481765834932823, "grad_norm": 0.3616779488648713, "learning_rate": 1.7163368268826433e-05, "loss": 0.3016, "step": 1152 }, { "epoch": 2.9507357645553425, "grad_norm": 0.38477907353760216, "learning_rate": 1.7127933132610573e-05, "loss": 0.3073, "step": 1153 }, { "epoch": 2.9532949456174027, "grad_norm": 0.38583388740216534, "learning_rate": 1.7092507199617482e-05, "loss": 0.3303, "step": 1154 }, { "epoch": 2.9558541266794625, "grad_norm": 0.4615746603773426, "learning_rate": 1.7057090583365678e-05, "loss": 0.3944, "step": 1155 }, { "epoch": 2.9584133077415227, "grad_norm": 0.3793974621003137, "learning_rate": 1.7021683397343823e-05, "loss": 0.3298, "step": 1156 }, { "epoch": 2.960972488803583, "grad_norm": 0.4920742763843501, "learning_rate": 1.698628575501034e-05, "loss": 0.3401, "step": 1157 }, { "epoch": 2.963531669865643, "grad_norm": 0.4104914994234224, "learning_rate": 1.6950897769793093e-05, "loss": 0.3268, "step": 1158 }, { "epoch": 2.966090850927703, "grad_norm": 0.41153736245664496, "learning_rate": 1.6915519555089e-05, "loss": 0.3594, "step": 1159 }, { "epoch": 2.968650031989763, "grad_norm": 0.3875070318404897, "learning_rate": 1.6880151224263646e-05, "loss": 0.3398, "step": 1160 }, { "epoch": 2.9712092130518233, "grad_norm": 0.44887352669211456, "learning_rate": 1.6844792890650976e-05, "loss": 0.2813, "step": 1161 }, { "epoch": 2.9737683941138835, "grad_norm": 0.3800558590649599, "learning_rate": 1.680944466755289e-05, "loss": 0.3635, "step": 1162 }, { "epoch": 2.9763275751759437, "grad_norm": 0.3971504175952064, "learning_rate": 1.6774106668238867e-05, "loss": 0.3146, "step": 1163 }, { "epoch": 2.978886756238004, "grad_norm": 0.4715791982640647, "learning_rate": 1.673877900594566e-05, "loss": 0.3553, "step": 1164 }, { "epoch": 2.981445937300064, "grad_norm": 0.30859611389638464, "learning_rate": 1.6703461793876876e-05, "loss": 0.2989, "step": 1165 }, { "epoch": 2.9840051183621243, "grad_norm": 0.3973251808749978, "learning_rate": 1.6668155145202638e-05, "loss": 0.3579, "step": 1166 }, { "epoch": 2.986564299424184, "grad_norm": 0.446835333586439, "learning_rate": 1.6632859173059232e-05, "loss": 0.3258, "step": 1167 }, { "epoch": 2.9891234804862443, "grad_norm": 0.3811823018283798, "learning_rate": 1.6597573990548722e-05, "loss": 0.3201, "step": 1168 }, { "epoch": 2.9916826615483045, "grad_norm": 0.375872795491289, "learning_rate": 1.6562299710738586e-05, "loss": 0.3255, "step": 1169 }, { "epoch": 2.9942418426103647, "grad_norm": 0.4029499625289535, "learning_rate": 1.6527036446661396e-05, "loss": 0.307, "step": 1170 }, { "epoch": 2.996801023672425, "grad_norm": 0.4156370523912452, "learning_rate": 1.6491784311314403e-05, "loss": 0.3797, "step": 1171 }, { "epoch": 2.9993602047344847, "grad_norm": 0.4189485161365368, "learning_rate": 1.6456543417659192e-05, "loss": 0.3488, "step": 1172 }, { "epoch": 3.001919385796545, "grad_norm": 0.5633124123177335, "learning_rate": 1.6421313878621344e-05, "loss": 0.2857, "step": 1173 }, { "epoch": 3.004478566858605, "grad_norm": 0.39083213560124314, "learning_rate": 1.6386095807090047e-05, "loss": 0.2379, "step": 1174 }, { "epoch": 3.0070377479206654, "grad_norm": 0.5240113784249277, "learning_rate": 1.635088931591775e-05, "loss": 0.2245, "step": 1175 }, { "epoch": 3.0095969289827256, "grad_norm": 0.6354763063642779, "learning_rate": 1.631569451791977e-05, "loss": 0.2869, "step": 1176 }, { "epoch": 3.012156110044786, "grad_norm": 0.4893508226750199, "learning_rate": 1.628051152587398e-05, "loss": 0.2703, "step": 1177 }, { "epoch": 3.014715291106846, "grad_norm": 0.4524451646086151, "learning_rate": 1.6245340452520414e-05, "loss": 0.2176, "step": 1178 }, { "epoch": 3.0172744721689058, "grad_norm": 0.4778994330296646, "learning_rate": 1.6210181410560912e-05, "loss": 0.2571, "step": 1179 }, { "epoch": 3.019833653230966, "grad_norm": 0.37599046051577073, "learning_rate": 1.6175034512658753e-05, "loss": 0.2338, "step": 1180 }, { "epoch": 3.022392834293026, "grad_norm": 0.45320853578403025, "learning_rate": 1.613989987143833e-05, "loss": 0.2325, "step": 1181 }, { "epoch": 3.0249520153550864, "grad_norm": 0.43340502469781866, "learning_rate": 1.610477759948472e-05, "loss": 0.2767, "step": 1182 }, { "epoch": 3.0275111964171466, "grad_norm": 0.38361490841329304, "learning_rate": 1.6069667809343396e-05, "loss": 0.2445, "step": 1183 }, { "epoch": 3.030070377479207, "grad_norm": 0.39878208398916914, "learning_rate": 1.603457061351983e-05, "loss": 0.2506, "step": 1184 }, { "epoch": 3.0326295585412666, "grad_norm": 0.44131555640934655, "learning_rate": 1.5999486124479115e-05, "loss": 0.2588, "step": 1185 }, { "epoch": 3.035188739603327, "grad_norm": 0.36228403498534006, "learning_rate": 1.5964414454645647e-05, "loss": 0.2394, "step": 1186 }, { "epoch": 3.037747920665387, "grad_norm": 0.40556573549758734, "learning_rate": 1.5929355716402754e-05, "loss": 0.2422, "step": 1187 }, { "epoch": 3.0403071017274472, "grad_norm": 0.46971240300411676, "learning_rate": 1.5894310022092288e-05, "loss": 0.2536, "step": 1188 }, { "epoch": 3.0428662827895074, "grad_norm": 0.37074278168598435, "learning_rate": 1.5859277484014338e-05, "loss": 0.2262, "step": 1189 }, { "epoch": 3.0454254638515676, "grad_norm": 0.4101230047135583, "learning_rate": 1.5824258214426833e-05, "loss": 0.2501, "step": 1190 }, { "epoch": 3.047984644913628, "grad_norm": 0.4316482731662535, "learning_rate": 1.5789252325545157e-05, "loss": 0.2766, "step": 1191 }, { "epoch": 3.0505438259756876, "grad_norm": 0.36783848268146724, "learning_rate": 1.5754259929541848e-05, "loss": 0.2401, "step": 1192 }, { "epoch": 3.053103007037748, "grad_norm": 0.4061482848745174, "learning_rate": 1.5719281138546186e-05, "loss": 0.2508, "step": 1193 }, { "epoch": 3.055662188099808, "grad_norm": 0.382539152433566, "learning_rate": 1.568431606464388e-05, "loss": 0.2489, "step": 1194 }, { "epoch": 3.0582213691618683, "grad_norm": 0.3652697874982772, "learning_rate": 1.5649364819876655e-05, "loss": 0.2429, "step": 1195 }, { "epoch": 3.0607805502239285, "grad_norm": 0.42737097970366417, "learning_rate": 1.561442751624193e-05, "loss": 0.256, "step": 1196 }, { "epoch": 3.0633397312859887, "grad_norm": 0.3207166801748589, "learning_rate": 1.557950426569248e-05, "loss": 0.1962, "step": 1197 }, { "epoch": 3.0658989123480485, "grad_norm": 0.3943172590265861, "learning_rate": 1.5544595180136003e-05, "loss": 0.2519, "step": 1198 }, { "epoch": 3.0684580934101087, "grad_norm": 0.36274192661719984, "learning_rate": 1.550970037143483e-05, "loss": 0.2279, "step": 1199 }, { "epoch": 3.071017274472169, "grad_norm": 0.3630874200444502, "learning_rate": 1.547481995140556e-05, "loss": 0.2516, "step": 1200 }, { "epoch": 3.073576455534229, "grad_norm": 0.3518754730219404, "learning_rate": 1.5439954031818652e-05, "loss": 0.2329, "step": 1201 }, { "epoch": 3.0761356365962893, "grad_norm": 0.37790098887949486, "learning_rate": 1.5405102724398113e-05, "loss": 0.2677, "step": 1202 }, { "epoch": 3.0786948176583495, "grad_norm": 0.35043382673558215, "learning_rate": 1.5370266140821143e-05, "loss": 0.2294, "step": 1203 }, { "epoch": 3.0812539987204093, "grad_norm": 0.3731184820167596, "learning_rate": 1.5335444392717738e-05, "loss": 0.2319, "step": 1204 }, { "epoch": 3.0838131797824695, "grad_norm": 0.3395760795759123, "learning_rate": 1.5300637591670357e-05, "loss": 0.2333, "step": 1205 }, { "epoch": 3.0863723608445297, "grad_norm": 0.3530521161404101, "learning_rate": 1.5265845849213588e-05, "loss": 0.2458, "step": 1206 }, { "epoch": 3.08893154190659, "grad_norm": 0.2968062718035343, "learning_rate": 1.523106927683374e-05, "loss": 0.1984, "step": 1207 }, { "epoch": 3.09149072296865, "grad_norm": 0.34618157328728927, "learning_rate": 1.5196307985968509e-05, "loss": 0.2338, "step": 1208 }, { "epoch": 3.0940499040307103, "grad_norm": 0.3991365653818135, "learning_rate": 1.5161562088006649e-05, "loss": 0.2639, "step": 1209 }, { "epoch": 3.09660908509277, "grad_norm": 0.3337465568445769, "learning_rate": 1.5126831694287564e-05, "loss": 0.2354, "step": 1210 }, { "epoch": 3.0991682661548303, "grad_norm": 0.39228609678359605, "learning_rate": 1.5092116916100982e-05, "loss": 0.2737, "step": 1211 }, { "epoch": 3.1017274472168905, "grad_norm": 0.3216556126081721, "learning_rate": 1.5057417864686607e-05, "loss": 0.2237, "step": 1212 }, { "epoch": 3.1042866282789507, "grad_norm": 0.3567015015151436, "learning_rate": 1.5022734651233737e-05, "loss": 0.2568, "step": 1213 }, { "epoch": 3.106845809341011, "grad_norm": 0.35178848138592544, "learning_rate": 1.4988067386880904e-05, "loss": 0.2276, "step": 1214 }, { "epoch": 3.109404990403071, "grad_norm": 0.3511504661864566, "learning_rate": 1.4953416182715566e-05, "loss": 0.2699, "step": 1215 }, { "epoch": 3.1119641714651314, "grad_norm": 0.3425398327341164, "learning_rate": 1.4918781149773694e-05, "loss": 0.2677, "step": 1216 }, { "epoch": 3.114523352527191, "grad_norm": 0.36160910451306577, "learning_rate": 1.4884162399039439e-05, "loss": 0.2545, "step": 1217 }, { "epoch": 3.1170825335892514, "grad_norm": 0.37097329916252125, "learning_rate": 1.4849560041444795e-05, "loss": 0.2609, "step": 1218 }, { "epoch": 3.1196417146513116, "grad_norm": 0.36352782561925345, "learning_rate": 1.4814974187869218e-05, "loss": 0.2236, "step": 1219 }, { "epoch": 3.122200895713372, "grad_norm": 0.3551683721423837, "learning_rate": 1.478040494913926e-05, "loss": 0.2244, "step": 1220 }, { "epoch": 3.124760076775432, "grad_norm": 0.3308813359796844, "learning_rate": 1.4745852436028262e-05, "loss": 0.2591, "step": 1221 }, { "epoch": 3.127319257837492, "grad_norm": 0.3507830733548493, "learning_rate": 1.4711316759255963e-05, "loss": 0.2453, "step": 1222 }, { "epoch": 3.129878438899552, "grad_norm": 0.33582461966461585, "learning_rate": 1.4676798029488123e-05, "loss": 0.2593, "step": 1223 }, { "epoch": 3.132437619961612, "grad_norm": 0.3507842455435477, "learning_rate": 1.464229635733624e-05, "loss": 0.2372, "step": 1224 }, { "epoch": 3.1349968010236724, "grad_norm": 0.3318567188084375, "learning_rate": 1.460781185335713e-05, "loss": 0.231, "step": 1225 }, { "epoch": 3.1375559820857326, "grad_norm": 0.3188990523059626, "learning_rate": 1.4573344628052588e-05, "loss": 0.2376, "step": 1226 }, { "epoch": 3.140115163147793, "grad_norm": 0.3664030290111237, "learning_rate": 1.4538894791869052e-05, "loss": 0.2585, "step": 1227 }, { "epoch": 3.142674344209853, "grad_norm": 0.3456582759492691, "learning_rate": 1.4504462455197248e-05, "loss": 0.2295, "step": 1228 }, { "epoch": 3.145233525271913, "grad_norm": 0.3071644333690587, "learning_rate": 1.4470047728371813e-05, "loss": 0.2113, "step": 1229 }, { "epoch": 3.147792706333973, "grad_norm": 0.33706220227684885, "learning_rate": 1.443565072167095e-05, "loss": 0.2286, "step": 1230 }, { "epoch": 3.1503518873960332, "grad_norm": 0.3213473350814528, "learning_rate": 1.4401271545316096e-05, "loss": 0.2333, "step": 1231 }, { "epoch": 3.1529110684580934, "grad_norm": 0.32806923950956013, "learning_rate": 1.436691030947155e-05, "loss": 0.2338, "step": 1232 }, { "epoch": 3.1554702495201536, "grad_norm": 0.33540430311298725, "learning_rate": 1.43325671242441e-05, "loss": 0.209, "step": 1233 }, { "epoch": 3.158029430582214, "grad_norm": 0.3208377254757641, "learning_rate": 1.4298242099682726e-05, "loss": 0.245, "step": 1234 }, { "epoch": 3.1605886116442736, "grad_norm": 0.3427654538840671, "learning_rate": 1.4263935345778202e-05, "loss": 0.2521, "step": 1235 }, { "epoch": 3.163147792706334, "grad_norm": 0.35551971354398254, "learning_rate": 1.4229646972462732e-05, "loss": 0.2338, "step": 1236 }, { "epoch": 3.165706973768394, "grad_norm": 0.33357596367989273, "learning_rate": 1.419537708960966e-05, "loss": 0.2322, "step": 1237 }, { "epoch": 3.1682661548304543, "grad_norm": 0.3416940872307819, "learning_rate": 1.4161125807033059e-05, "loss": 0.24, "step": 1238 }, { "epoch": 3.1708253358925145, "grad_norm": 0.3259027337159305, "learning_rate": 1.412689323448739e-05, "loss": 0.2705, "step": 1239 }, { "epoch": 3.1733845169545747, "grad_norm": 0.33778026989222404, "learning_rate": 1.409267948166718e-05, "loss": 0.2335, "step": 1240 }, { "epoch": 3.175943698016635, "grad_norm": 0.3425941944724759, "learning_rate": 1.4058484658206646e-05, "loss": 0.2684, "step": 1241 }, { "epoch": 3.1785028790786947, "grad_norm": 0.304680436913791, "learning_rate": 1.4024308873679327e-05, "loss": 0.2181, "step": 1242 }, { "epoch": 3.181062060140755, "grad_norm": 0.3522850109826806, "learning_rate": 1.3990152237597787e-05, "loss": 0.2572, "step": 1243 }, { "epoch": 3.183621241202815, "grad_norm": 0.3209721557320742, "learning_rate": 1.3956014859413211e-05, "loss": 0.2337, "step": 1244 }, { "epoch": 3.1861804222648753, "grad_norm": 0.3293098487746776, "learning_rate": 1.3921896848515064e-05, "loss": 0.2411, "step": 1245 }, { "epoch": 3.1887396033269355, "grad_norm": 0.30365057870700035, "learning_rate": 1.388779831423078e-05, "loss": 0.2291, "step": 1246 }, { "epoch": 3.1912987843889957, "grad_norm": 0.3131208709907512, "learning_rate": 1.3853719365825357e-05, "loss": 0.2352, "step": 1247 }, { "epoch": 3.1938579654510555, "grad_norm": 0.3376254463164988, "learning_rate": 1.3819660112501054e-05, "loss": 0.2625, "step": 1248 }, { "epoch": 3.1964171465131157, "grad_norm": 0.3069287683421629, "learning_rate": 1.3785620663396992e-05, "loss": 0.2229, "step": 1249 }, { "epoch": 3.198976327575176, "grad_norm": 0.3316589759980029, "learning_rate": 1.3751601127588849e-05, "loss": 0.245, "step": 1250 }, { "epoch": 3.201535508637236, "grad_norm": 0.30527916893181595, "learning_rate": 1.37176016140885e-05, "loss": 0.2346, "step": 1251 }, { "epoch": 3.2040946896992963, "grad_norm": 0.34900918414936455, "learning_rate": 1.3683622231843644e-05, "loss": 0.2392, "step": 1252 }, { "epoch": 3.2066538707613566, "grad_norm": 0.30641882677939075, "learning_rate": 1.364966308973747e-05, "loss": 0.2218, "step": 1253 }, { "epoch": 3.2092130518234163, "grad_norm": 0.3296381755475144, "learning_rate": 1.3615724296588342e-05, "loss": 0.2566, "step": 1254 }, { "epoch": 3.2117722328854765, "grad_norm": 0.34080590800970306, "learning_rate": 1.3581805961149371e-05, "loss": 0.2518, "step": 1255 }, { "epoch": 3.2143314139475367, "grad_norm": 0.31502390005004344, "learning_rate": 1.3547908192108143e-05, "loss": 0.2288, "step": 1256 }, { "epoch": 3.216890595009597, "grad_norm": 0.30635056034248115, "learning_rate": 1.3514031098086349e-05, "loss": 0.2539, "step": 1257 }, { "epoch": 3.219449776071657, "grad_norm": 0.34066214746897916, "learning_rate": 1.3480174787639397e-05, "loss": 0.2664, "step": 1258 }, { "epoch": 3.2220089571337174, "grad_norm": 0.31764511418435903, "learning_rate": 1.3446339369256121e-05, "loss": 0.2067, "step": 1259 }, { "epoch": 3.224568138195777, "grad_norm": 0.29852956617495935, "learning_rate": 1.341252495135841e-05, "loss": 0.2298, "step": 1260 }, { "epoch": 3.2271273192578374, "grad_norm": 0.34615186747664684, "learning_rate": 1.3378731642300841e-05, "loss": 0.2488, "step": 1261 }, { "epoch": 3.2296865003198976, "grad_norm": 0.31284863193899576, "learning_rate": 1.3344959550370362e-05, "loss": 0.222, "step": 1262 }, { "epoch": 3.232245681381958, "grad_norm": 0.3198015399733083, "learning_rate": 1.3311208783785945e-05, "loss": 0.2561, "step": 1263 }, { "epoch": 3.234804862444018, "grad_norm": 0.33484963926651445, "learning_rate": 1.327747945069819e-05, "loss": 0.2532, "step": 1264 }, { "epoch": 3.237364043506078, "grad_norm": 0.3505508918710989, "learning_rate": 1.324377165918906e-05, "loss": 0.253, "step": 1265 }, { "epoch": 3.2399232245681384, "grad_norm": 0.3317100885612124, "learning_rate": 1.3210085517271459e-05, "loss": 0.2488, "step": 1266 }, { "epoch": 3.242482405630198, "grad_norm": 0.3183951056600632, "learning_rate": 1.3176421132888936e-05, "loss": 0.2206, "step": 1267 }, { "epoch": 3.2450415866922584, "grad_norm": 0.7798174468009574, "learning_rate": 1.3142778613915308e-05, "loss": 0.3465, "step": 1268 }, { "epoch": 3.2476007677543186, "grad_norm": 0.3276263112485529, "learning_rate": 1.3109158068154329e-05, "loss": 0.2206, "step": 1269 }, { "epoch": 3.250159948816379, "grad_norm": 0.35582948619273064, "learning_rate": 1.3075559603339354e-05, "loss": 0.2272, "step": 1270 }, { "epoch": 3.252719129878439, "grad_norm": 0.3425439408964034, "learning_rate": 1.304198332713296e-05, "loss": 0.2587, "step": 1271 }, { "epoch": 3.255278310940499, "grad_norm": 0.35588463150095667, "learning_rate": 1.3008429347126641e-05, "loss": 0.2585, "step": 1272 }, { "epoch": 3.257837492002559, "grad_norm": 0.3354553998723496, "learning_rate": 1.2974897770840448e-05, "loss": 0.2067, "step": 1273 }, { "epoch": 3.260396673064619, "grad_norm": 0.3247184453320128, "learning_rate": 1.2941388705722627e-05, "loss": 0.2449, "step": 1274 }, { "epoch": 3.2629558541266794, "grad_norm": 0.3269101886184072, "learning_rate": 1.2907902259149287e-05, "loss": 0.2454, "step": 1275 }, { "epoch": 3.2655150351887396, "grad_norm": 0.34277751654037186, "learning_rate": 1.2874438538424086e-05, "loss": 0.2267, "step": 1276 }, { "epoch": 3.2680742162508, "grad_norm": 0.3425839528353915, "learning_rate": 1.2840997650777829e-05, "loss": 0.2289, "step": 1277 }, { "epoch": 3.27063339731286, "grad_norm": 0.32496809601312776, "learning_rate": 1.2807579703368162e-05, "loss": 0.2437, "step": 1278 }, { "epoch": 3.27319257837492, "grad_norm": 0.37627417428336984, "learning_rate": 1.2774184803279245e-05, "loss": 0.2196, "step": 1279 }, { "epoch": 3.27575175943698, "grad_norm": 0.30844464297394786, "learning_rate": 1.274081305752135e-05, "loss": 0.2148, "step": 1280 }, { "epoch": 3.2783109404990403, "grad_norm": 0.32041752442694194, "learning_rate": 1.2707464573030572e-05, "loss": 0.2495, "step": 1281 }, { "epoch": 3.2808701215611005, "grad_norm": 0.3247468389566215, "learning_rate": 1.2674139456668479e-05, "loss": 0.2558, "step": 1282 }, { "epoch": 3.2834293026231607, "grad_norm": 0.30941778730595587, "learning_rate": 1.2640837815221731e-05, "loss": 0.2238, "step": 1283 }, { "epoch": 3.285988483685221, "grad_norm": 0.3397180703699647, "learning_rate": 1.260755975540178e-05, "loss": 0.2405, "step": 1284 }, { "epoch": 3.2885476647472807, "grad_norm": 0.31915117073868005, "learning_rate": 1.2574305383844528e-05, "loss": 0.2396, "step": 1285 }, { "epoch": 3.291106845809341, "grad_norm": 0.2977168854647766, "learning_rate": 1.2541074807109945e-05, "loss": 0.2286, "step": 1286 }, { "epoch": 3.293666026871401, "grad_norm": 0.3141534078265832, "learning_rate": 1.250786813168176e-05, "loss": 0.2291, "step": 1287 }, { "epoch": 3.2962252079334613, "grad_norm": 0.3250362176047104, "learning_rate": 1.2474685463967125e-05, "loss": 0.2353, "step": 1288 }, { "epoch": 3.2987843889955215, "grad_norm": 0.5783304188096524, "learning_rate": 1.2441526910296253e-05, "loss": 0.2316, "step": 1289 }, { "epoch": 3.3013435700575817, "grad_norm": 0.31254991008955707, "learning_rate": 1.2408392576922075e-05, "loss": 0.2336, "step": 1290 }, { "epoch": 3.303902751119642, "grad_norm": 0.31041323932723247, "learning_rate": 1.2375282570019933e-05, "loss": 0.2457, "step": 1291 }, { "epoch": 3.3064619321817017, "grad_norm": 0.32993719976229857, "learning_rate": 1.2342196995687212e-05, "loss": 0.2588, "step": 1292 }, { "epoch": 3.309021113243762, "grad_norm": 0.3016426692910031, "learning_rate": 1.2309135959942986e-05, "loss": 0.2577, "step": 1293 }, { "epoch": 3.311580294305822, "grad_norm": 0.34298650349077, "learning_rate": 1.227609956872772e-05, "loss": 0.2386, "step": 1294 }, { "epoch": 3.3141394753678823, "grad_norm": 0.3233823231490881, "learning_rate": 1.2243087927902905e-05, "loss": 0.2203, "step": 1295 }, { "epoch": 3.3166986564299425, "grad_norm": 0.3431401051489707, "learning_rate": 1.2210101143250708e-05, "loss": 0.2369, "step": 1296 }, { "epoch": 3.3192578374920023, "grad_norm": 0.30536095566488874, "learning_rate": 1.2177139320473663e-05, "loss": 0.209, "step": 1297 }, { "epoch": 3.3218170185540625, "grad_norm": 0.3380078591507895, "learning_rate": 1.2144202565194311e-05, "loss": 0.2793, "step": 1298 }, { "epoch": 3.3243761996161227, "grad_norm": 0.3327840853187567, "learning_rate": 1.211129098295486e-05, "loss": 0.2473, "step": 1299 }, { "epoch": 3.326935380678183, "grad_norm": 0.29406663008971645, "learning_rate": 1.2078404679216864e-05, "loss": 0.2056, "step": 1300 }, { "epoch": 3.329494561740243, "grad_norm": 0.31662925879264764, "learning_rate": 1.2045543759360876e-05, "loss": 0.2443, "step": 1301 }, { "epoch": 3.3320537428023034, "grad_norm": 0.322730147707437, "learning_rate": 1.2012708328686093e-05, "loss": 0.2143, "step": 1302 }, { "epoch": 3.3346129238643636, "grad_norm": 0.2975751708993095, "learning_rate": 1.1979898492410049e-05, "loss": 0.2385, "step": 1303 }, { "epoch": 3.3371721049264234, "grad_norm": 0.3636857886009555, "learning_rate": 1.1947114355668265e-05, "loss": 0.2623, "step": 1304 }, { "epoch": 3.3397312859884836, "grad_norm": 0.3199715781542156, "learning_rate": 1.1914356023513904e-05, "loss": 0.2605, "step": 1305 }, { "epoch": 3.342290467050544, "grad_norm": 0.32209675935492127, "learning_rate": 1.1881623600917437e-05, "loss": 0.2474, "step": 1306 }, { "epoch": 3.344849648112604, "grad_norm": 0.3146940770709782, "learning_rate": 1.1848917192766322e-05, "loss": 0.1886, "step": 1307 }, { "epoch": 3.347408829174664, "grad_norm": 0.3515135762125379, "learning_rate": 1.1816236903864656e-05, "loss": 0.2639, "step": 1308 }, { "epoch": 3.3499680102367244, "grad_norm": 0.3273044475438286, "learning_rate": 1.1783582838932821e-05, "loss": 0.2681, "step": 1309 }, { "epoch": 3.352527191298784, "grad_norm": 0.3241556852063347, "learning_rate": 1.1750955102607193e-05, "loss": 0.2148, "step": 1310 }, { "epoch": 3.3550863723608444, "grad_norm": 0.34788552766120195, "learning_rate": 1.1718353799439766e-05, "loss": 0.2328, "step": 1311 }, { "epoch": 3.3576455534229046, "grad_norm": 0.31285926538820524, "learning_rate": 1.1685779033897827e-05, "loss": 0.2139, "step": 1312 }, { "epoch": 3.360204734484965, "grad_norm": 0.34091923953794956, "learning_rate": 1.1653230910363645e-05, "loss": 0.2522, "step": 1313 }, { "epoch": 3.362763915547025, "grad_norm": 0.3093765842790142, "learning_rate": 1.1620709533134104e-05, "loss": 0.25, "step": 1314 }, { "epoch": 3.3653230966090852, "grad_norm": 0.3332035792467053, "learning_rate": 1.1588215006420374e-05, "loss": 0.2729, "step": 1315 }, { "epoch": 3.3678822776711455, "grad_norm": 0.3076095736538766, "learning_rate": 1.1555747434347606e-05, "loss": 0.2076, "step": 1316 }, { "epoch": 3.370441458733205, "grad_norm": 0.30288813913337326, "learning_rate": 1.1523306920954571e-05, "loss": 0.2449, "step": 1317 }, { "epoch": 3.3730006397952654, "grad_norm": 0.3702724394333769, "learning_rate": 1.1490893570193328e-05, "loss": 0.2646, "step": 1318 }, { "epoch": 3.3755598208573256, "grad_norm": 0.35103580998810946, "learning_rate": 1.1458507485928891e-05, "loss": 0.2634, "step": 1319 }, { "epoch": 3.378119001919386, "grad_norm": 0.3037095018272702, "learning_rate": 1.1426148771938915e-05, "loss": 0.2229, "step": 1320 }, { "epoch": 3.380678182981446, "grad_norm": 0.3192579058292554, "learning_rate": 1.139381753191335e-05, "loss": 0.2489, "step": 1321 }, { "epoch": 3.383237364043506, "grad_norm": 0.32501023665860496, "learning_rate": 1.1361513869454092e-05, "loss": 0.2407, "step": 1322 }, { "epoch": 3.385796545105566, "grad_norm": 0.35596917491983554, "learning_rate": 1.1329237888074691e-05, "loss": 0.2437, "step": 1323 }, { "epoch": 3.3883557261676263, "grad_norm": 0.33172031186682566, "learning_rate": 1.129698969119998e-05, "loss": 0.2623, "step": 1324 }, { "epoch": 3.3909149072296865, "grad_norm": 0.2995594606117323, "learning_rate": 1.1264769382165748e-05, "loss": 0.1996, "step": 1325 }, { "epoch": 3.3934740882917467, "grad_norm": 0.3194047630014032, "learning_rate": 1.123257706421845e-05, "loss": 0.236, "step": 1326 }, { "epoch": 3.396033269353807, "grad_norm": 0.33068857427400655, "learning_rate": 1.1200412840514839e-05, "loss": 0.2244, "step": 1327 }, { "epoch": 3.398592450415867, "grad_norm": 0.36837193881845204, "learning_rate": 1.1168276814121621e-05, "loss": 0.2828, "step": 1328 }, { "epoch": 3.401151631477927, "grad_norm": 0.3076965971301543, "learning_rate": 1.1136169088015177e-05, "loss": 0.2241, "step": 1329 }, { "epoch": 3.403710812539987, "grad_norm": 0.3411885790050691, "learning_rate": 1.110408976508118e-05, "loss": 0.2232, "step": 1330 }, { "epoch": 3.4062699936020473, "grad_norm": 0.3263500540450158, "learning_rate": 1.107203894811429e-05, "loss": 0.2572, "step": 1331 }, { "epoch": 3.4088291746641075, "grad_norm": 0.3416605787702754, "learning_rate": 1.1040016739817836e-05, "loss": 0.2433, "step": 1332 }, { "epoch": 3.4113883557261677, "grad_norm": 0.335195547574942, "learning_rate": 1.1008023242803477e-05, "loss": 0.2648, "step": 1333 }, { "epoch": 3.413947536788228, "grad_norm": 0.36538157527653864, "learning_rate": 1.097605855959084e-05, "loss": 0.2286, "step": 1334 }, { "epoch": 3.4165067178502877, "grad_norm": 0.2912108672412734, "learning_rate": 1.094412279260726e-05, "loss": 0.2073, "step": 1335 }, { "epoch": 3.419065898912348, "grad_norm": 0.32352858434290793, "learning_rate": 1.0912216044187382e-05, "loss": 0.2725, "step": 1336 }, { "epoch": 3.421625079974408, "grad_norm": 0.30980460582268804, "learning_rate": 1.0880338416572872e-05, "loss": 0.242, "step": 1337 }, { "epoch": 3.4241842610364683, "grad_norm": 0.3026357282953144, "learning_rate": 1.0848490011912096e-05, "loss": 0.2207, "step": 1338 }, { "epoch": 3.4267434420985285, "grad_norm": 0.3222649336637817, "learning_rate": 1.0816670932259763e-05, "loss": 0.2196, "step": 1339 }, { "epoch": 3.4293026231605888, "grad_norm": 0.3346500533447882, "learning_rate": 1.0784881279576635e-05, "loss": 0.2187, "step": 1340 }, { "epoch": 3.431861804222649, "grad_norm": 0.3122079565048836, "learning_rate": 1.0753121155729133e-05, "loss": 0.2227, "step": 1341 }, { "epoch": 3.4344209852847087, "grad_norm": 0.3240510909707239, "learning_rate": 1.07213906624891e-05, "loss": 0.2231, "step": 1342 }, { "epoch": 3.436980166346769, "grad_norm": 0.3235912618403718, "learning_rate": 1.0689689901533424e-05, "loss": 0.2492, "step": 1343 }, { "epoch": 3.439539347408829, "grad_norm": 0.3040119908970231, "learning_rate": 1.0658018974443692e-05, "loss": 0.1984, "step": 1344 }, { "epoch": 3.4420985284708894, "grad_norm": 0.340863607236755, "learning_rate": 1.0626377982705929e-05, "loss": 0.2349, "step": 1345 }, { "epoch": 3.4446577095329496, "grad_norm": 0.32795701173977326, "learning_rate": 1.059476702771021e-05, "loss": 0.2529, "step": 1346 }, { "epoch": 3.4472168905950094, "grad_norm": 0.31132450713720333, "learning_rate": 1.056318621075036e-05, "loss": 0.2095, "step": 1347 }, { "epoch": 3.4497760716570696, "grad_norm": 0.3254097118432526, "learning_rate": 1.0531635633023644e-05, "loss": 0.2358, "step": 1348 }, { "epoch": 3.4523352527191298, "grad_norm": 0.3017269372689714, "learning_rate": 1.050011539563043e-05, "loss": 0.2247, "step": 1349 }, { "epoch": 3.45489443378119, "grad_norm": 0.3283903326525304, "learning_rate": 1.0468625599573842e-05, "loss": 0.2718, "step": 1350 }, { "epoch": 3.45745361484325, "grad_norm": 0.31738351502037276, "learning_rate": 1.0437166345759489e-05, "loss": 0.2345, "step": 1351 }, { "epoch": 3.4600127959053104, "grad_norm": 0.30402755053632596, "learning_rate": 1.0405737734995083e-05, "loss": 0.2057, "step": 1352 }, { "epoch": 3.4625719769673706, "grad_norm": 0.33488642167297444, "learning_rate": 1.037433986799015e-05, "loss": 0.2439, "step": 1353 }, { "epoch": 3.4651311580294304, "grad_norm": 0.30914851197686366, "learning_rate": 1.034297284535571e-05, "loss": 0.2028, "step": 1354 }, { "epoch": 3.4676903390914906, "grad_norm": 0.32175431813825445, "learning_rate": 1.0311636767603952e-05, "loss": 0.2439, "step": 1355 }, { "epoch": 3.470249520153551, "grad_norm": 0.3395487986448244, "learning_rate": 1.028033173514788e-05, "loss": 0.2502, "step": 1356 }, { "epoch": 3.472808701215611, "grad_norm": 0.3042126709214444, "learning_rate": 1.0249057848301043e-05, "loss": 0.2395, "step": 1357 }, { "epoch": 3.4753678822776712, "grad_norm": 0.3183501854149144, "learning_rate": 1.0217815207277165e-05, "loss": 0.2234, "step": 1358 }, { "epoch": 3.4779270633397315, "grad_norm": 0.3233478623491546, "learning_rate": 1.0186603912189867e-05, "loss": 0.2589, "step": 1359 }, { "epoch": 3.480486244401791, "grad_norm": 0.3018286569760461, "learning_rate": 1.0155424063052306e-05, "loss": 0.2401, "step": 1360 }, { "epoch": 3.4830454254638514, "grad_norm": 0.3174313407841064, "learning_rate": 1.0124275759776889e-05, "loss": 0.2399, "step": 1361 }, { "epoch": 3.4856046065259116, "grad_norm": 0.307048568974569, "learning_rate": 1.0093159102174938e-05, "loss": 0.2291, "step": 1362 }, { "epoch": 3.488163787587972, "grad_norm": 0.3132031600114937, "learning_rate": 1.006207418995636e-05, "loss": 0.2086, "step": 1363 }, { "epoch": 3.490722968650032, "grad_norm": 0.34596427815653313, "learning_rate": 1.0031021122729328e-05, "loss": 0.2497, "step": 1364 }, { "epoch": 3.4932821497120923, "grad_norm": 0.2986896060364163, "learning_rate": 1.0000000000000006e-05, "loss": 0.2379, "step": 1365 }, { "epoch": 3.4958413307741525, "grad_norm": 0.3319066544902576, "learning_rate": 9.969010921172155e-06, "loss": 0.2542, "step": 1366 }, { "epoch": 3.4984005118362123, "grad_norm": 0.3053208810307986, "learning_rate": 9.938053985546883e-06, "loss": 0.2299, "step": 1367 }, { "epoch": 3.5009596928982725, "grad_norm": 0.3736366846122222, "learning_rate": 9.907129292322298e-06, "loss": 0.2676, "step": 1368 }, { "epoch": 3.5035188739603327, "grad_norm": 0.33175316766942814, "learning_rate": 9.876236940593173e-06, "loss": 0.2753, "step": 1369 }, { "epoch": 3.506078055022393, "grad_norm": 0.3276624133983928, "learning_rate": 9.84537702935065e-06, "loss": 0.2745, "step": 1370 }, { "epoch": 3.508637236084453, "grad_norm": 0.3067138284048095, "learning_rate": 9.814549657481935e-06, "loss": 0.2201, "step": 1371 }, { "epoch": 3.511196417146513, "grad_norm": 0.3489342217784152, "learning_rate": 9.783754923769946e-06, "loss": 0.2402, "step": 1372 }, { "epoch": 3.513755598208573, "grad_norm": 0.3103704194146652, "learning_rate": 9.752992926893027e-06, "loss": 0.231, "step": 1373 }, { "epoch": 3.5163147792706333, "grad_norm": 0.29799307682543535, "learning_rate": 9.722263765424628e-06, "loss": 0.2103, "step": 1374 }, { "epoch": 3.5188739603326935, "grad_norm": 0.3245882511730939, "learning_rate": 9.691567537832964e-06, "loss": 0.2547, "step": 1375 }, { "epoch": 3.5214331413947537, "grad_norm": 0.29350433799889125, "learning_rate": 9.660904342480715e-06, "loss": 0.2083, "step": 1376 }, { "epoch": 3.523992322456814, "grad_norm": 0.3333636077305378, "learning_rate": 9.630274277624729e-06, "loss": 0.2837, "step": 1377 }, { "epoch": 3.526551503518874, "grad_norm": 0.33952864266921756, "learning_rate": 9.599677441415694e-06, "loss": 0.2313, "step": 1378 }, { "epoch": 3.5291106845809344, "grad_norm": 0.3012959852140507, "learning_rate": 9.5691139318978e-06, "loss": 0.2171, "step": 1379 }, { "epoch": 3.531669865642994, "grad_norm": 0.3318689398716116, "learning_rate": 9.538583847008452e-06, "loss": 0.2366, "step": 1380 }, { "epoch": 3.5342290467050543, "grad_norm": 0.3358375103639254, "learning_rate": 9.508087284577963e-06, "loss": 0.2402, "step": 1381 }, { "epoch": 3.5367882277671145, "grad_norm": 0.30920378089911293, "learning_rate": 9.477624342329209e-06, "loss": 0.2143, "step": 1382 }, { "epoch": 3.5393474088291748, "grad_norm": 0.3169427298803479, "learning_rate": 9.447195117877343e-06, "loss": 0.2285, "step": 1383 }, { "epoch": 3.541906589891235, "grad_norm": 0.3112954457690554, "learning_rate": 9.416799708729486e-06, "loss": 0.2315, "step": 1384 }, { "epoch": 3.5444657709532947, "grad_norm": 0.3238511016385153, "learning_rate": 9.386438212284372e-06, "loss": 0.2252, "step": 1385 }, { "epoch": 3.547024952015355, "grad_norm": 0.3031398199187957, "learning_rate": 9.356110725832081e-06, "loss": 0.2376, "step": 1386 }, { "epoch": 3.549584133077415, "grad_norm": 0.3105091167975465, "learning_rate": 9.325817346553725e-06, "loss": 0.2689, "step": 1387 }, { "epoch": 3.5521433141394754, "grad_norm": 0.31208594113425225, "learning_rate": 9.295558171521093e-06, "loss": 0.2278, "step": 1388 }, { "epoch": 3.5547024952015356, "grad_norm": 0.31485619105429463, "learning_rate": 9.265333297696395e-06, "loss": 0.242, "step": 1389 }, { "epoch": 3.557261676263596, "grad_norm": 0.31606147283215824, "learning_rate": 9.235142821931928e-06, "loss": 0.2363, "step": 1390 }, { "epoch": 3.559820857325656, "grad_norm": 0.3150525527068536, "learning_rate": 9.204986840969749e-06, "loss": 0.2199, "step": 1391 }, { "epoch": 3.5623800383877158, "grad_norm": 0.31857444893477177, "learning_rate": 9.174865451441375e-06, "loss": 0.2283, "step": 1392 }, { "epoch": 3.564939219449776, "grad_norm": 0.30466028849006704, "learning_rate": 9.1447787498675e-06, "loss": 0.232, "step": 1393 }, { "epoch": 3.567498400511836, "grad_norm": 0.34031824974175295, "learning_rate": 9.114726832657658e-06, "loss": 0.2663, "step": 1394 }, { "epoch": 3.5700575815738964, "grad_norm": 0.3261116373502211, "learning_rate": 9.084709796109907e-06, "loss": 0.2489, "step": 1395 }, { "epoch": 3.5726167626359566, "grad_norm": 0.30217642557332414, "learning_rate": 9.054727736410555e-06, "loss": 0.2613, "step": 1396 }, { "epoch": 3.5751759436980164, "grad_norm": 0.3029175133984261, "learning_rate": 9.02478074963381e-06, "loss": 0.2263, "step": 1397 }, { "epoch": 3.5777351247600766, "grad_norm": 0.3613891446327612, "learning_rate": 8.994868931741499e-06, "loss": 0.2658, "step": 1398 }, { "epoch": 3.580294305822137, "grad_norm": 0.32375645597232505, "learning_rate": 8.964992378582758e-06, "loss": 0.2458, "step": 1399 }, { "epoch": 3.582853486884197, "grad_norm": 0.346977280248451, "learning_rate": 8.93515118589373e-06, "loss": 0.2673, "step": 1400 }, { "epoch": 3.5854126679462572, "grad_norm": 0.3091670769782311, "learning_rate": 8.905345449297223e-06, "loss": 0.2517, "step": 1401 }, { "epoch": 3.5879718490083174, "grad_norm": 0.3033606651844572, "learning_rate": 8.87557526430246e-06, "loss": 0.2012, "step": 1402 }, { "epoch": 3.5905310300703777, "grad_norm": 0.3115678004260849, "learning_rate": 8.845840726304723e-06, "loss": 0.2297, "step": 1403 }, { "epoch": 3.593090211132438, "grad_norm": 0.3441565775702763, "learning_rate": 8.816141930585067e-06, "loss": 0.2542, "step": 1404 }, { "epoch": 3.5956493921944976, "grad_norm": 0.30703493196426435, "learning_rate": 8.786478972310023e-06, "loss": 0.2342, "step": 1405 }, { "epoch": 3.598208573256558, "grad_norm": 0.31549384345069187, "learning_rate": 8.756851946531294e-06, "loss": 0.247, "step": 1406 }, { "epoch": 3.600767754318618, "grad_norm": 0.29102001056426585, "learning_rate": 8.72726094818541e-06, "loss": 0.2074, "step": 1407 }, { "epoch": 3.6033269353806783, "grad_norm": 0.33490062048588876, "learning_rate": 8.697706072093493e-06, "loss": 0.2541, "step": 1408 }, { "epoch": 3.6058861164427385, "grad_norm": 0.3124637481108502, "learning_rate": 8.668187412960887e-06, "loss": 0.2437, "step": 1409 }, { "epoch": 3.6084452975047983, "grad_norm": 0.3113279052119832, "learning_rate": 8.638705065376887e-06, "loss": 0.2389, "step": 1410 }, { "epoch": 3.6110044785668585, "grad_norm": 0.3194418676306725, "learning_rate": 8.609259123814443e-06, "loss": 0.2549, "step": 1411 }, { "epoch": 3.6135636596289187, "grad_norm": 0.28910975717043763, "learning_rate": 8.579849682629844e-06, "loss": 0.2246, "step": 1412 }, { "epoch": 3.616122840690979, "grad_norm": 0.31307324283444393, "learning_rate": 8.550476836062419e-06, "loss": 0.2425, "step": 1413 }, { "epoch": 3.618682021753039, "grad_norm": 0.3040084877742325, "learning_rate": 8.521140678234214e-06, "loss": 0.2361, "step": 1414 }, { "epoch": 3.6212412028150993, "grad_norm": 0.3130765747434785, "learning_rate": 8.491841303149728e-06, "loss": 0.2272, "step": 1415 }, { "epoch": 3.6238003838771595, "grad_norm": 0.30265698644043443, "learning_rate": 8.462578804695595e-06, "loss": 0.2701, "step": 1416 }, { "epoch": 3.6263595649392193, "grad_norm": 0.2987350574519827, "learning_rate": 8.43335327664027e-06, "loss": 0.2177, "step": 1417 }, { "epoch": 3.6289187460012795, "grad_norm": 0.3166676881180338, "learning_rate": 8.404164812633755e-06, "loss": 0.2756, "step": 1418 }, { "epoch": 3.6314779270633397, "grad_norm": 0.3033675615693007, "learning_rate": 8.375013506207275e-06, "loss": 0.2136, "step": 1419 }, { "epoch": 3.6340371081254, "grad_norm": 0.30761103254564687, "learning_rate": 8.345899450772975e-06, "loss": 0.2535, "step": 1420 }, { "epoch": 3.63659628918746, "grad_norm": 0.2982029035930307, "learning_rate": 8.316822739623662e-06, "loss": 0.2165, "step": 1421 }, { "epoch": 3.63915547024952, "grad_norm": 0.31969771781374523, "learning_rate": 8.287783465932466e-06, "loss": 0.257, "step": 1422 }, { "epoch": 3.64171465131158, "grad_norm": 0.2837299669230536, "learning_rate": 8.258781722752535e-06, "loss": 0.224, "step": 1423 }, { "epoch": 3.6442738323736403, "grad_norm": 0.3012916036117272, "learning_rate": 8.229817603016786e-06, "loss": 0.2246, "step": 1424 }, { "epoch": 3.6468330134357005, "grad_norm": 0.31189818144182524, "learning_rate": 8.200891199537549e-06, "loss": 0.2695, "step": 1425 }, { "epoch": 3.6493921944977608, "grad_norm": 0.2997238939361931, "learning_rate": 8.1720026050063e-06, "loss": 0.1862, "step": 1426 }, { "epoch": 3.651951375559821, "grad_norm": 0.34075811134574374, "learning_rate": 8.143151911993374e-06, "loss": 0.2619, "step": 1427 }, { "epoch": 3.654510556621881, "grad_norm": 0.3146837641261533, "learning_rate": 8.114339212947655e-06, "loss": 0.2396, "step": 1428 }, { "epoch": 3.6570697376839414, "grad_norm": 0.325379060971747, "learning_rate": 8.085564600196258e-06, "loss": 0.2435, "step": 1429 }, { "epoch": 3.659628918746001, "grad_norm": 0.33089426856010606, "learning_rate": 8.056828165944282e-06, "loss": 0.2459, "step": 1430 }, { "epoch": 3.6621880998080614, "grad_norm": 0.3111195583478781, "learning_rate": 8.028130002274459e-06, "loss": 0.2328, "step": 1431 }, { "epoch": 3.6647472808701216, "grad_norm": 0.2917405967267679, "learning_rate": 7.999470201146915e-06, "loss": 0.2273, "step": 1432 }, { "epoch": 3.667306461932182, "grad_norm": 0.3072708256616894, "learning_rate": 7.970848854398825e-06, "loss": 0.2616, "step": 1433 }, { "epoch": 3.669865642994242, "grad_norm": 0.3274866801130236, "learning_rate": 7.942266053744155e-06, "loss": 0.2469, "step": 1434 }, { "epoch": 3.6724248240563018, "grad_norm": 0.3206452949252231, "learning_rate": 7.913721890773354e-06, "loss": 0.2265, "step": 1435 }, { "epoch": 3.674984005118362, "grad_norm": 0.3076598617812908, "learning_rate": 7.885216456953053e-06, "loss": 0.2167, "step": 1436 }, { "epoch": 3.677543186180422, "grad_norm": 0.2979466544850749, "learning_rate": 7.856749843625777e-06, "loss": 0.2203, "step": 1437 }, { "epoch": 3.6801023672424824, "grad_norm": 0.3224437385684691, "learning_rate": 7.828322142009672e-06, "loss": 0.2473, "step": 1438 }, { "epoch": 3.6826615483045426, "grad_norm": 0.31485417513081154, "learning_rate": 7.799933443198173e-06, "loss": 0.2606, "step": 1439 }, { "epoch": 3.685220729366603, "grad_norm": 0.3243639601134415, "learning_rate": 7.771583838159756e-06, "loss": 0.2633, "step": 1440 }, { "epoch": 3.687779910428663, "grad_norm": 0.32893510730247094, "learning_rate": 7.743273417737617e-06, "loss": 0.2531, "step": 1441 }, { "epoch": 3.690339091490723, "grad_norm": 0.31189557871340884, "learning_rate": 7.715002272649388e-06, "loss": 0.2403, "step": 1442 }, { "epoch": 3.692898272552783, "grad_norm": 0.2901780629695267, "learning_rate": 7.686770493486835e-06, "loss": 0.2517, "step": 1443 }, { "epoch": 3.6954574536148432, "grad_norm": 0.3159906947616901, "learning_rate": 7.65857817071561e-06, "loss": 0.2492, "step": 1444 }, { "epoch": 3.6980166346769034, "grad_norm": 0.3098159857766409, "learning_rate": 7.630425394674903e-06, "loss": 0.2341, "step": 1445 }, { "epoch": 3.7005758157389637, "grad_norm": 0.31575476226888965, "learning_rate": 7.602312255577193e-06, "loss": 0.2416, "step": 1446 }, { "epoch": 3.7031349968010234, "grad_norm": 0.31087089571752347, "learning_rate": 7.574238843507957e-06, "loss": 0.2673, "step": 1447 }, { "epoch": 3.7056941778630836, "grad_norm": 0.3209375688192084, "learning_rate": 7.546205248425353e-06, "loss": 0.2313, "step": 1448 }, { "epoch": 3.708253358925144, "grad_norm": 0.30680259294814516, "learning_rate": 7.518211560159949e-06, "loss": 0.2187, "step": 1449 }, { "epoch": 3.710812539987204, "grad_norm": 0.3064495121568587, "learning_rate": 7.49025786841445e-06, "loss": 0.2161, "step": 1450 }, { "epoch": 3.7133717210492643, "grad_norm": 0.29214767577744066, "learning_rate": 7.462344262763399e-06, "loss": 0.2339, "step": 1451 }, { "epoch": 3.7159309021113245, "grad_norm": 0.3146911137467561, "learning_rate": 7.434470832652865e-06, "loss": 0.2464, "step": 1452 }, { "epoch": 3.7184900831733847, "grad_norm": 0.3499806012960674, "learning_rate": 7.406637667400205e-06, "loss": 0.2246, "step": 1453 }, { "epoch": 3.721049264235445, "grad_norm": 0.30724891177758956, "learning_rate": 7.378844856193736e-06, "loss": 0.272, "step": 1454 }, { "epoch": 3.7236084452975047, "grad_norm": 0.29711759225447126, "learning_rate": 7.3510924880924575e-06, "loss": 0.2205, "step": 1455 }, { "epoch": 3.726167626359565, "grad_norm": 0.31114069148352147, "learning_rate": 7.323380652025794e-06, "loss": 0.2619, "step": 1456 }, { "epoch": 3.728726807421625, "grad_norm": 0.3019555109712794, "learning_rate": 7.295709436793284e-06, "loss": 0.2526, "step": 1457 }, { "epoch": 3.7312859884836853, "grad_norm": 0.30587326696750855, "learning_rate": 7.268078931064293e-06, "loss": 0.2156, "step": 1458 }, { "epoch": 3.7338451695457455, "grad_norm": 0.3010387774085701, "learning_rate": 7.2404892233777334e-06, "loss": 0.2343, "step": 1459 }, { "epoch": 3.7364043506078053, "grad_norm": 0.30265924975544334, "learning_rate": 7.212940402141808e-06, "loss": 0.2542, "step": 1460 }, { "epoch": 3.7389635316698655, "grad_norm": 0.2899279021405336, "learning_rate": 7.185432555633672e-06, "loss": 0.2263, "step": 1461 }, { "epoch": 3.7415227127319257, "grad_norm": 0.3197841686852777, "learning_rate": 7.1579657719992045e-06, "loss": 0.2665, "step": 1462 }, { "epoch": 3.744081893793986, "grad_norm": 0.27978040220930955, "learning_rate": 7.130540139252704e-06, "loss": 0.2338, "step": 1463 }, { "epoch": 3.746641074856046, "grad_norm": 0.31288929736620646, "learning_rate": 7.1031557452765934e-06, "loss": 0.2372, "step": 1464 }, { "epoch": 3.7492002559181064, "grad_norm": 0.3098001081263227, "learning_rate": 7.075812677821145e-06, "loss": 0.2221, "step": 1465 }, { "epoch": 3.7517594369801666, "grad_norm": 0.29645463690795515, "learning_rate": 7.048511024504223e-06, "loss": 0.2439, "step": 1466 }, { "epoch": 3.7543186180422263, "grad_norm": 0.30305748324499643, "learning_rate": 7.021250872810983e-06, "loss": 0.2447, "step": 1467 }, { "epoch": 3.7568777991042865, "grad_norm": 0.3180001382034292, "learning_rate": 6.9940323100935725e-06, "loss": 0.2455, "step": 1468 }, { "epoch": 3.7594369801663468, "grad_norm": 0.30172748918488546, "learning_rate": 6.966855423570898e-06, "loss": 0.2319, "step": 1469 }, { "epoch": 3.761996161228407, "grad_norm": 0.30342551799145645, "learning_rate": 6.939720300328303e-06, "loss": 0.2283, "step": 1470 }, { "epoch": 3.764555342290467, "grad_norm": 0.31295405822100386, "learning_rate": 6.9126270273173e-06, "loss": 0.2361, "step": 1471 }, { "epoch": 3.767114523352527, "grad_norm": 0.2970422267078146, "learning_rate": 6.885575691355315e-06, "loss": 0.1965, "step": 1472 }, { "epoch": 3.769673704414587, "grad_norm": 0.32398586323455586, "learning_rate": 6.858566379125389e-06, "loss": 0.2661, "step": 1473 }, { "epoch": 3.7722328854766474, "grad_norm": 0.28922597868223626, "learning_rate": 6.831599177175879e-06, "loss": 0.222, "step": 1474 }, { "epoch": 3.7747920665387076, "grad_norm": 0.296927401509587, "learning_rate": 6.8046741719202385e-06, "loss": 0.2262, "step": 1475 }, { "epoch": 3.777351247600768, "grad_norm": 0.318579910031184, "learning_rate": 6.777791449636681e-06, "loss": 0.2455, "step": 1476 }, { "epoch": 3.779910428662828, "grad_norm": 0.30482817177680954, "learning_rate": 6.7509510964679305e-06, "loss": 0.2376, "step": 1477 }, { "epoch": 3.782469609724888, "grad_norm": 0.3043184575100951, "learning_rate": 6.724153198420957e-06, "loss": 0.2508, "step": 1478 }, { "epoch": 3.7850287907869484, "grad_norm": 0.29471146448676344, "learning_rate": 6.697397841366686e-06, "loss": 0.2219, "step": 1479 }, { "epoch": 3.787587971849008, "grad_norm": 0.2763016151280136, "learning_rate": 6.67068511103971e-06, "loss": 0.2092, "step": 1480 }, { "epoch": 3.7901471529110684, "grad_norm": 0.29319933516975266, "learning_rate": 6.644015093038049e-06, "loss": 0.2042, "step": 1481 }, { "epoch": 3.7927063339731286, "grad_norm": 0.3160040922162459, "learning_rate": 6.617387872822842e-06, "loss": 0.2269, "step": 1482 }, { "epoch": 3.795265515035189, "grad_norm": 0.31249964016432036, "learning_rate": 6.590803535718082e-06, "loss": 0.2841, "step": 1483 }, { "epoch": 3.797824696097249, "grad_norm": 0.29901036986186935, "learning_rate": 6.564262166910367e-06, "loss": 0.2096, "step": 1484 }, { "epoch": 3.800383877159309, "grad_norm": 0.3151541438953574, "learning_rate": 6.537763851448593e-06, "loss": 0.2215, "step": 1485 }, { "epoch": 3.802943058221369, "grad_norm": 0.3238104601226396, "learning_rate": 6.511308674243711e-06, "loss": 0.2493, "step": 1486 }, { "epoch": 3.8055022392834292, "grad_norm": 0.30561591970597685, "learning_rate": 6.484896720068421e-06, "loss": 0.238, "step": 1487 }, { "epoch": 3.8080614203454894, "grad_norm": 0.28920704087260063, "learning_rate": 6.458528073556925e-06, "loss": 0.2685, "step": 1488 }, { "epoch": 3.8106206014075497, "grad_norm": 0.3038589031703289, "learning_rate": 6.432202819204667e-06, "loss": 0.248, "step": 1489 }, { "epoch": 3.81317978246961, "grad_norm": 0.3385452502303158, "learning_rate": 6.4059210413680175e-06, "loss": 0.2503, "step": 1490 }, { "epoch": 3.81573896353167, "grad_norm": 0.2768232976810782, "learning_rate": 6.379682824264055e-06, "loss": 0.2164, "step": 1491 }, { "epoch": 3.81829814459373, "grad_norm": 0.3009349939503682, "learning_rate": 6.353488251970275e-06, "loss": 0.2366, "step": 1492 }, { "epoch": 3.82085732565579, "grad_norm": 0.2928819289728828, "learning_rate": 6.327337408424281e-06, "loss": 0.2332, "step": 1493 }, { "epoch": 3.8234165067178503, "grad_norm": 0.2870974660694633, "learning_rate": 6.301230377423595e-06, "loss": 0.2, "step": 1494 }, { "epoch": 3.8259756877799105, "grad_norm": 0.3081786003102801, "learning_rate": 6.275167242625331e-06, "loss": 0.2414, "step": 1495 }, { "epoch": 3.8285348688419707, "grad_norm": 0.2790089490684267, "learning_rate": 6.2491480875459336e-06, "loss": 0.215, "step": 1496 }, { "epoch": 3.8310940499040305, "grad_norm": 0.29306281039648446, "learning_rate": 6.223172995560935e-06, "loss": 0.2679, "step": 1497 }, { "epoch": 3.8336532309660907, "grad_norm": 0.2762204818599091, "learning_rate": 6.1972420499046635e-06, "loss": 0.2192, "step": 1498 }, { "epoch": 3.836212412028151, "grad_norm": 0.29828532304173333, "learning_rate": 6.171355333669973e-06, "loss": 0.2441, "step": 1499 }, { "epoch": 3.838771593090211, "grad_norm": 0.32131992187351277, "learning_rate": 6.145512929808013e-06, "loss": 0.229, "step": 1500 }, { "epoch": 3.8413307741522713, "grad_norm": 0.3043046372632, "learning_rate": 6.119714921127933e-06, "loss": 0.2694, "step": 1501 }, { "epoch": 3.8438899552143315, "grad_norm": 0.2717132569091513, "learning_rate": 6.093961390296603e-06, "loss": 0.2254, "step": 1502 }, { "epoch": 3.8464491362763917, "grad_norm": 0.30029687863634635, "learning_rate": 6.068252419838399e-06, "loss": 0.2326, "step": 1503 }, { "epoch": 3.849008317338452, "grad_norm": 0.27288392014891966, "learning_rate": 6.042588092134878e-06, "loss": 0.2163, "step": 1504 }, { "epoch": 3.8515674984005117, "grad_norm": 0.283307260045485, "learning_rate": 6.016968489424572e-06, "loss": 0.2312, "step": 1505 }, { "epoch": 3.854126679462572, "grad_norm": 0.2919987165904583, "learning_rate": 5.991393693802674e-06, "loss": 0.2533, "step": 1506 }, { "epoch": 3.856685860524632, "grad_norm": 0.2968617574713475, "learning_rate": 5.96586378722081e-06, "loss": 0.2397, "step": 1507 }, { "epoch": 3.8592450415866923, "grad_norm": 0.28294719312430794, "learning_rate": 5.940378851486766e-06, "loss": 0.2302, "step": 1508 }, { "epoch": 3.8618042226487526, "grad_norm": 0.2885647154013107, "learning_rate": 5.9149389682642165e-06, "loss": 0.2429, "step": 1509 }, { "epoch": 3.8643634037108123, "grad_norm": 0.29335993884299777, "learning_rate": 5.889544219072465e-06, "loss": 0.2347, "step": 1510 }, { "epoch": 3.8669225847728725, "grad_norm": 0.30407531909262675, "learning_rate": 5.864194685286206e-06, "loss": 0.2405, "step": 1511 }, { "epoch": 3.8694817658349328, "grad_norm": 0.29889145730228683, "learning_rate": 5.838890448135228e-06, "loss": 0.2373, "step": 1512 }, { "epoch": 3.872040946896993, "grad_norm": 0.3002231215499318, "learning_rate": 5.81363158870418e-06, "loss": 0.2316, "step": 1513 }, { "epoch": 3.874600127959053, "grad_norm": 0.3246106606552585, "learning_rate": 5.788418187932314e-06, "loss": 0.2365, "step": 1514 }, { "epoch": 3.8771593090211134, "grad_norm": 0.29292707103941984, "learning_rate": 5.7632503266131925e-06, "loss": 0.2087, "step": 1515 }, { "epoch": 3.8797184900831736, "grad_norm": 0.3255584037433501, "learning_rate": 5.7381280853944585e-06, "loss": 0.2807, "step": 1516 }, { "epoch": 3.8822776711452334, "grad_norm": 0.2912419112819368, "learning_rate": 5.713051544777584e-06, "loss": 0.2218, "step": 1517 }, { "epoch": 3.8848368522072936, "grad_norm": 0.3048580778970364, "learning_rate": 5.688020785117581e-06, "loss": 0.2753, "step": 1518 }, { "epoch": 3.887396033269354, "grad_norm": 0.29167337976733226, "learning_rate": 5.66303588662277e-06, "loss": 0.2329, "step": 1519 }, { "epoch": 3.889955214331414, "grad_norm": 0.2992142371051243, "learning_rate": 5.638096929354522e-06, "loss": 0.2268, "step": 1520 }, { "epoch": 3.892514395393474, "grad_norm": 0.3027423583297044, "learning_rate": 5.613203993226981e-06, "loss": 0.221, "step": 1521 }, { "epoch": 3.895073576455534, "grad_norm": 0.3021686605260421, "learning_rate": 5.588357158006821e-06, "loss": 0.252, "step": 1522 }, { "epoch": 3.897632757517594, "grad_norm": 0.2996327213886315, "learning_rate": 5.563556503312997e-06, "loss": 0.2318, "step": 1523 }, { "epoch": 3.9001919385796544, "grad_norm": 0.2957035292429405, "learning_rate": 5.538802108616494e-06, "loss": 0.239, "step": 1524 }, { "epoch": 3.9027511196417146, "grad_norm": 0.309492243720712, "learning_rate": 5.514094053240035e-06, "loss": 0.2228, "step": 1525 }, { "epoch": 3.905310300703775, "grad_norm": 0.6416163470606975, "learning_rate": 5.489432416357885e-06, "loss": 0.2326, "step": 1526 }, { "epoch": 3.907869481765835, "grad_norm": 0.297847179245253, "learning_rate": 5.46481727699554e-06, "loss": 0.2346, "step": 1527 }, { "epoch": 3.9104286628278953, "grad_norm": 0.30350301665082174, "learning_rate": 5.440248714029508e-06, "loss": 0.2478, "step": 1528 }, { "epoch": 3.9129878438899555, "grad_norm": 0.29944760683896515, "learning_rate": 5.415726806187052e-06, "loss": 0.2306, "step": 1529 }, { "epoch": 3.9155470249520152, "grad_norm": 0.28720986057591197, "learning_rate": 5.39125163204594e-06, "loss": 0.2057, "step": 1530 }, { "epoch": 3.9181062060140754, "grad_norm": 0.2868190209794002, "learning_rate": 5.3668232700341735e-06, "loss": 0.2278, "step": 1531 }, { "epoch": 3.9206653870761357, "grad_norm": 0.3118363419035033, "learning_rate": 5.342441798429747e-06, "loss": 0.2518, "step": 1532 }, { "epoch": 3.923224568138196, "grad_norm": 0.27782593526187427, "learning_rate": 5.318107295360424e-06, "loss": 0.2334, "step": 1533 }, { "epoch": 3.925783749200256, "grad_norm": 0.29231670514977803, "learning_rate": 5.293819838803429e-06, "loss": 0.2198, "step": 1534 }, { "epoch": 3.928342930262316, "grad_norm": 0.30148858674026124, "learning_rate": 5.269579506585259e-06, "loss": 0.2291, "step": 1535 }, { "epoch": 3.930902111324376, "grad_norm": 0.2942793998750714, "learning_rate": 5.245386376381398e-06, "loss": 0.2235, "step": 1536 }, { "epoch": 3.9334612923864363, "grad_norm": 0.30230762629355007, "learning_rate": 5.221240525716071e-06, "loss": 0.2182, "step": 1537 }, { "epoch": 3.9360204734484965, "grad_norm": 0.2838869405763987, "learning_rate": 5.197142031961999e-06, "loss": 0.2531, "step": 1538 }, { "epoch": 3.9385796545105567, "grad_norm": 0.284590980219594, "learning_rate": 5.17309097234016e-06, "loss": 0.2235, "step": 1539 }, { "epoch": 3.941138835572617, "grad_norm": 0.2805331306681004, "learning_rate": 5.149087423919541e-06, "loss": 0.1941, "step": 1540 }, { "epoch": 3.943698016634677, "grad_norm": 0.3227111819969921, "learning_rate": 5.125131463616863e-06, "loss": 0.2598, "step": 1541 }, { "epoch": 3.946257197696737, "grad_norm": 0.3095815791809101, "learning_rate": 5.101223168196381e-06, "loss": 0.26, "step": 1542 }, { "epoch": 3.948816378758797, "grad_norm": 0.28306994750627523, "learning_rate": 5.077362614269599e-06, "loss": 0.2214, "step": 1543 }, { "epoch": 3.9513755598208573, "grad_norm": 0.30833267171559586, "learning_rate": 5.05354987829503e-06, "loss": 0.2473, "step": 1544 }, { "epoch": 3.9539347408829175, "grad_norm": 0.29154031796148494, "learning_rate": 5.029785036577976e-06, "loss": 0.231, "step": 1545 }, { "epoch": 3.9564939219449777, "grad_norm": 0.3230057370683127, "learning_rate": 5.0060681652702745e-06, "loss": 0.2538, "step": 1546 }, { "epoch": 3.9590531030070375, "grad_norm": 0.28553506909372656, "learning_rate": 4.982399340370017e-06, "loss": 0.231, "step": 1547 }, { "epoch": 3.9616122840690977, "grad_norm": 0.30170947130683173, "learning_rate": 4.958778637721364e-06, "loss": 0.2454, "step": 1548 }, { "epoch": 3.964171465131158, "grad_norm": 0.28267032534811537, "learning_rate": 4.935206133014259e-06, "loss": 0.2417, "step": 1549 }, { "epoch": 3.966730646193218, "grad_norm": 0.2954487998931121, "learning_rate": 4.911681901784198e-06, "loss": 0.2319, "step": 1550 }, { "epoch": 3.9692898272552783, "grad_norm": 0.32228832699706966, "learning_rate": 4.8882060194119985e-06, "loss": 0.2282, "step": 1551 }, { "epoch": 3.9718490083173386, "grad_norm": 0.34818873755980667, "learning_rate": 4.864778561123555e-06, "loss": 0.2718, "step": 1552 }, { "epoch": 3.9744081893793988, "grad_norm": 0.27407731345565545, "learning_rate": 4.841399601989574e-06, "loss": 0.2039, "step": 1553 }, { "epoch": 3.976967370441459, "grad_norm": 0.2749332266585077, "learning_rate": 4.8180692169253714e-06, "loss": 0.2181, "step": 1554 }, { "epoch": 3.9795265515035187, "grad_norm": 0.3010112395338718, "learning_rate": 4.794787480690597e-06, "loss": 0.2232, "step": 1555 }, { "epoch": 3.982085732565579, "grad_norm": 0.29171650447929676, "learning_rate": 4.771554467889012e-06, "loss": 0.2391, "step": 1556 }, { "epoch": 3.984644913627639, "grad_norm": 0.2906873549139878, "learning_rate": 4.74837025296826e-06, "loss": 0.2297, "step": 1557 }, { "epoch": 3.9872040946896994, "grad_norm": 0.3106548346493187, "learning_rate": 4.725234910219609e-06, "loss": 0.2564, "step": 1558 }, { "epoch": 3.9897632757517596, "grad_norm": 0.29178940946369886, "learning_rate": 4.702148513777731e-06, "loss": 0.2457, "step": 1559 }, { "epoch": 3.9923224568138194, "grad_norm": 0.290141222179049, "learning_rate": 4.679111137620442e-06, "loss": 0.2007, "step": 1560 }, { "epoch": 3.9948816378758796, "grad_norm": 0.3078661741665255, "learning_rate": 4.656122855568477e-06, "loss": 0.2416, "step": 1561 }, { "epoch": 3.99744081893794, "grad_norm": 0.28827757162372486, "learning_rate": 4.63318374128527e-06, "loss": 0.2416, "step": 1562 }, { "epoch": 4.0, "grad_norm": 0.36059866962884213, "learning_rate": 4.610293868276681e-06, "loss": 0.286, "step": 1563 }, { "epoch": 4.00255918106206, "grad_norm": 0.4806949756871284, "learning_rate": 4.587453309890804e-06, "loss": 0.1829, "step": 1564 }, { "epoch": 4.00511836212412, "grad_norm": 0.4071198782663298, "learning_rate": 4.5646621393177e-06, "loss": 0.2002, "step": 1565 }, { "epoch": 4.007677543186181, "grad_norm": 0.29904364596653477, "learning_rate": 4.541920429589168e-06, "loss": 0.1689, "step": 1566 }, { "epoch": 4.010236724248241, "grad_norm": 0.319382339625052, "learning_rate": 4.519228253578514e-06, "loss": 0.162, "step": 1567 }, { "epoch": 4.012795905310301, "grad_norm": 0.44482796159027516, "learning_rate": 4.496585684000332e-06, "loss": 0.1905, "step": 1568 }, { "epoch": 4.015355086372361, "grad_norm": 0.4881520882605733, "learning_rate": 4.47399279341024e-06, "loss": 0.1883, "step": 1569 }, { "epoch": 4.017914267434421, "grad_norm": 0.39543877818036155, "learning_rate": 4.451449654204685e-06, "loss": 0.1792, "step": 1570 }, { "epoch": 4.020473448496481, "grad_norm": 0.33428133649360503, "learning_rate": 4.428956338620671e-06, "loss": 0.1549, "step": 1571 }, { "epoch": 4.023032629558541, "grad_norm": 0.39094054957770324, "learning_rate": 4.406512918735555e-06, "loss": 0.168, "step": 1572 }, { "epoch": 4.025591810620601, "grad_norm": 0.41624590579275506, "learning_rate": 4.384119466466816e-06, "loss": 0.1546, "step": 1573 }, { "epoch": 4.028150991682661, "grad_norm": 0.41050595250529065, "learning_rate": 4.361776053571816e-06, "loss": 0.1553, "step": 1574 }, { "epoch": 4.030710172744722, "grad_norm": 0.338525098415926, "learning_rate": 4.339482751647557e-06, "loss": 0.1672, "step": 1575 }, { "epoch": 4.033269353806782, "grad_norm": 0.3227337964991407, "learning_rate": 4.317239632130485e-06, "loss": 0.1694, "step": 1576 }, { "epoch": 4.035828534868842, "grad_norm": 0.3126102999935797, "learning_rate": 4.295046766296224e-06, "loss": 0.1652, "step": 1577 }, { "epoch": 4.038387715930902, "grad_norm": 0.3419323105009614, "learning_rate": 4.272904225259387e-06, "loss": 0.1643, "step": 1578 }, { "epoch": 4.0409468969929625, "grad_norm": 0.35539152039667277, "learning_rate": 4.250812079973301e-06, "loss": 0.1693, "step": 1579 }, { "epoch": 4.043506078055023, "grad_norm": 0.34150875416867105, "learning_rate": 4.228770401229824e-06, "loss": 0.1676, "step": 1580 }, { "epoch": 4.046065259117083, "grad_norm": 0.31333649874909303, "learning_rate": 4.206779259659102e-06, "loss": 0.1837, "step": 1581 }, { "epoch": 4.048624440179142, "grad_norm": 0.29497085250511934, "learning_rate": 4.184838725729326e-06, "loss": 0.1606, "step": 1582 }, { "epoch": 4.0511836212412025, "grad_norm": 0.28865884769293576, "learning_rate": 4.1629488697465195e-06, "loss": 0.1701, "step": 1583 }, { "epoch": 4.053742802303263, "grad_norm": 0.30617690195804087, "learning_rate": 4.141109761854332e-06, "loss": 0.1586, "step": 1584 }, { "epoch": 4.056301983365323, "grad_norm": 0.32345308536745493, "learning_rate": 4.119321472033779e-06, "loss": 0.1787, "step": 1585 }, { "epoch": 4.058861164427383, "grad_norm": 0.29444459546640134, "learning_rate": 4.097584070103042e-06, "loss": 0.153, "step": 1586 }, { "epoch": 4.061420345489443, "grad_norm": 0.28258351174881763, "learning_rate": 4.075897625717249e-06, "loss": 0.1593, "step": 1587 }, { "epoch": 4.0639795265515035, "grad_norm": 0.3000401537763792, "learning_rate": 4.054262208368216e-06, "loss": 0.1805, "step": 1588 }, { "epoch": 4.066538707613564, "grad_norm": 0.31652390372720957, "learning_rate": 4.032677887384262e-06, "loss": 0.1702, "step": 1589 }, { "epoch": 4.069097888675624, "grad_norm": 0.3228892663985873, "learning_rate": 4.011144731929981e-06, "loss": 0.1913, "step": 1590 }, { "epoch": 4.071657069737684, "grad_norm": 0.31373728199242906, "learning_rate": 3.989662811005992e-06, "loss": 0.1727, "step": 1591 }, { "epoch": 4.074216250799744, "grad_norm": 0.30059854075379616, "learning_rate": 3.96823219344876e-06, "loss": 0.2085, "step": 1592 }, { "epoch": 4.076775431861805, "grad_norm": 0.29253073860156936, "learning_rate": 3.9468529479303445e-06, "loss": 0.1746, "step": 1593 }, { "epoch": 4.079334612923865, "grad_norm": 0.2925598790253486, "learning_rate": 3.925525142958189e-06, "loss": 0.1949, "step": 1594 }, { "epoch": 4.081893793985924, "grad_norm": 0.29501284249932824, "learning_rate": 3.904248846874894e-06, "loss": 0.1665, "step": 1595 }, { "epoch": 4.084452975047984, "grad_norm": 0.3128356519747454, "learning_rate": 3.883024127858017e-06, "loss": 0.1725, "step": 1596 }, { "epoch": 4.0870121561100445, "grad_norm": 0.2888374275584094, "learning_rate": 3.861851053919847e-06, "loss": 0.1873, "step": 1597 }, { "epoch": 4.089571337172105, "grad_norm": 0.28170591176204707, "learning_rate": 3.840729692907164e-06, "loss": 0.1789, "step": 1598 }, { "epoch": 4.092130518234165, "grad_norm": 0.2851212955027544, "learning_rate": 3.819660112501053e-06, "loss": 0.1587, "step": 1599 }, { "epoch": 4.094689699296225, "grad_norm": 0.2794461018913548, "learning_rate": 3.7986423802166705e-06, "loss": 0.1564, "step": 1600 }, { "epoch": 4.097248880358285, "grad_norm": 0.29566827038245036, "learning_rate": 3.7776765634030234e-06, "loss": 0.1636, "step": 1601 }, { "epoch": 4.099808061420346, "grad_norm": 0.2933200780850988, "learning_rate": 3.756762729242773e-06, "loss": 0.1991, "step": 1602 }, { "epoch": 4.102367242482406, "grad_norm": 0.2761865132545396, "learning_rate": 3.7359009447520112e-06, "loss": 0.165, "step": 1603 }, { "epoch": 4.104926423544466, "grad_norm": 0.2661987317855668, "learning_rate": 3.715091276780023e-06, "loss": 0.1897, "step": 1604 }, { "epoch": 4.107485604606526, "grad_norm": 0.2898297360177148, "learning_rate": 3.694333792009115e-06, "loss": 0.1967, "step": 1605 }, { "epoch": 4.110044785668586, "grad_norm": 0.30088937431780177, "learning_rate": 3.6736285569543585e-06, "loss": 0.1705, "step": 1606 }, { "epoch": 4.112603966730646, "grad_norm": 0.3076179252763136, "learning_rate": 3.652975637963401e-06, "loss": 0.1865, "step": 1607 }, { "epoch": 4.115163147792706, "grad_norm": 0.2666186074605756, "learning_rate": 3.632375101216259e-06, "loss": 0.1804, "step": 1608 }, { "epoch": 4.117722328854766, "grad_norm": 0.264680141837875, "learning_rate": 3.6118270127250954e-06, "loss": 0.139, "step": 1609 }, { "epoch": 4.120281509916826, "grad_norm": 0.26713666643272443, "learning_rate": 3.5913314383339937e-06, "loss": 0.1533, "step": 1610 }, { "epoch": 4.122840690978887, "grad_norm": 0.28782921605361467, "learning_rate": 3.5708884437187673e-06, "loss": 0.1614, "step": 1611 }, { "epoch": 4.125399872040947, "grad_norm": 0.27318365156538243, "learning_rate": 3.5504980943867538e-06, "loss": 0.1868, "step": 1612 }, { "epoch": 4.127959053103007, "grad_norm": 0.27437741262556425, "learning_rate": 3.53016045567659e-06, "loss": 0.1634, "step": 1613 }, { "epoch": 4.130518234165067, "grad_norm": 0.27443522507309115, "learning_rate": 3.509875592757999e-06, "loss": 0.2041, "step": 1614 }, { "epoch": 4.1330774152271275, "grad_norm": 0.2733625632699625, "learning_rate": 3.4896435706316e-06, "loss": 0.1676, "step": 1615 }, { "epoch": 4.135636596289188, "grad_norm": 0.28425889785798236, "learning_rate": 3.469464454128684e-06, "loss": 0.1714, "step": 1616 }, { "epoch": 4.138195777351248, "grad_norm": 0.27651054747700926, "learning_rate": 3.4493383079110054e-06, "loss": 0.2032, "step": 1617 }, { "epoch": 4.140754958413308, "grad_norm": 0.27095766306492064, "learning_rate": 3.429265196470599e-06, "loss": 0.1654, "step": 1618 }, { "epoch": 4.143314139475368, "grad_norm": 0.27829913330629624, "learning_rate": 3.409245184129546e-06, "loss": 0.1753, "step": 1619 }, { "epoch": 4.145873320537428, "grad_norm": 0.26642028532837686, "learning_rate": 3.3892783350397675e-06, "loss": 0.1605, "step": 1620 }, { "epoch": 4.148432501599488, "grad_norm": 0.2671909078689866, "learning_rate": 3.369364713182848e-06, "loss": 0.1546, "step": 1621 }, { "epoch": 4.150991682661548, "grad_norm": 0.2744629320037609, "learning_rate": 3.349504382369795e-06, "loss": 0.1606, "step": 1622 }, { "epoch": 4.153550863723608, "grad_norm": 0.2655886430602391, "learning_rate": 3.329697406240855e-06, "loss": 0.1802, "step": 1623 }, { "epoch": 4.1561100447856685, "grad_norm": 0.2735670466849154, "learning_rate": 3.309943848265311e-06, "loss": 0.1685, "step": 1624 }, { "epoch": 4.158669225847729, "grad_norm": 0.2733935619603659, "learning_rate": 3.290243771741275e-06, "loss": 0.1712, "step": 1625 }, { "epoch": 4.161228406909789, "grad_norm": 0.28420268431811874, "learning_rate": 3.2705972397954655e-06, "loss": 0.1888, "step": 1626 }, { "epoch": 4.163787587971849, "grad_norm": 0.26985232539202264, "learning_rate": 3.2510043153830486e-06, "loss": 0.1877, "step": 1627 }, { "epoch": 4.166346769033909, "grad_norm": 0.2656409578402704, "learning_rate": 3.231465061287391e-06, "loss": 0.1844, "step": 1628 }, { "epoch": 4.1689059500959695, "grad_norm": 0.2870535364269227, "learning_rate": 3.211979540119883e-06, "loss": 0.1489, "step": 1629 }, { "epoch": 4.17146513115803, "grad_norm": 0.2751438862963445, "learning_rate": 3.1925478143197418e-06, "loss": 0.1651, "step": 1630 }, { "epoch": 4.17402431222009, "grad_norm": 0.28504383266802613, "learning_rate": 3.1731699461537958e-06, "loss": 0.1809, "step": 1631 }, { "epoch": 4.176583493282149, "grad_norm": 0.28160560698590886, "learning_rate": 3.153845997716303e-06, "loss": 0.1608, "step": 1632 }, { "epoch": 4.1791426743442095, "grad_norm": 0.2792655962671141, "learning_rate": 3.1345760309287264e-06, "loss": 0.1486, "step": 1633 }, { "epoch": 4.18170185540627, "grad_norm": 0.2835400810692843, "learning_rate": 3.1153601075395533e-06, "loss": 0.1742, "step": 1634 }, { "epoch": 4.18426103646833, "grad_norm": 0.27354882443074413, "learning_rate": 3.0961982891241083e-06, "loss": 0.1892, "step": 1635 }, { "epoch": 4.18682021753039, "grad_norm": 0.2593652179219975, "learning_rate": 3.0770906370843234e-06, "loss": 0.176, "step": 1636 }, { "epoch": 4.18937939859245, "grad_norm": 0.28067593594699425, "learning_rate": 3.058037212648579e-06, "loss": 0.1942, "step": 1637 }, { "epoch": 4.1919385796545106, "grad_norm": 0.27960007176651136, "learning_rate": 3.039038076871481e-06, "loss": 0.1722, "step": 1638 }, { "epoch": 4.194497760716571, "grad_norm": 0.2784512143122092, "learning_rate": 3.02009329063367e-06, "loss": 0.1819, "step": 1639 }, { "epoch": 4.197056941778631, "grad_norm": 0.291087706558686, "learning_rate": 3.001202914641628e-06, "loss": 0.1855, "step": 1640 }, { "epoch": 4.199616122840691, "grad_norm": 0.2731191212662854, "learning_rate": 2.9823670094275e-06, "loss": 0.1671, "step": 1641 }, { "epoch": 4.202175303902751, "grad_norm": 0.27430309267830155, "learning_rate": 2.9635856353488645e-06, "loss": 0.1731, "step": 1642 }, { "epoch": 4.204734484964812, "grad_norm": 0.26629467870174495, "learning_rate": 2.9448588525885746e-06, "loss": 0.1845, "step": 1643 }, { "epoch": 4.207293666026872, "grad_norm": 0.2742753809709845, "learning_rate": 2.9261867211545603e-06, "loss": 0.1748, "step": 1644 }, { "epoch": 4.209852847088931, "grad_norm": 0.28012849721285554, "learning_rate": 2.907569300879596e-06, "loss": 0.1994, "step": 1645 }, { "epoch": 4.212412028150991, "grad_norm": 0.28016509946069285, "learning_rate": 2.889006651421169e-06, "loss": 0.1788, "step": 1646 }, { "epoch": 4.214971209213052, "grad_norm": 0.2811917062883201, "learning_rate": 2.870498832261257e-06, "loss": 0.1486, "step": 1647 }, { "epoch": 4.217530390275112, "grad_norm": 0.277577622075296, "learning_rate": 2.85204590270612e-06, "loss": 0.1832, "step": 1648 }, { "epoch": 4.220089571337172, "grad_norm": 0.27124773514140205, "learning_rate": 2.8336479218861556e-06, "loss": 0.1626, "step": 1649 }, { "epoch": 4.222648752399232, "grad_norm": 0.26604650947349917, "learning_rate": 2.815304948755664e-06, "loss": 0.1686, "step": 1650 }, { "epoch": 4.225207933461292, "grad_norm": 0.273081026747494, "learning_rate": 2.7970170420926957e-06, "loss": 0.1713, "step": 1651 }, { "epoch": 4.227767114523353, "grad_norm": 0.2906700784230057, "learning_rate": 2.778784260498828e-06, "loss": 0.1681, "step": 1652 }, { "epoch": 4.230326295585413, "grad_norm": 0.29269194422721684, "learning_rate": 2.7606066623990145e-06, "loss": 0.1869, "step": 1653 }, { "epoch": 4.232885476647473, "grad_norm": 0.28591911142148163, "learning_rate": 2.742484306041373e-06, "loss": 0.174, "step": 1654 }, { "epoch": 4.235444657709533, "grad_norm": 0.27682074794775224, "learning_rate": 2.7244172494969978e-06, "loss": 0.1855, "step": 1655 }, { "epoch": 4.2380038387715935, "grad_norm": 0.302986579035124, "learning_rate": 2.7064055506597875e-06, "loss": 0.1641, "step": 1656 }, { "epoch": 4.240563019833653, "grad_norm": 0.2799225480334416, "learning_rate": 2.688449267246258e-06, "loss": 0.1923, "step": 1657 }, { "epoch": 4.243122200895713, "grad_norm": 0.38988709458435794, "learning_rate": 2.6705484567953386e-06, "loss": 0.2104, "step": 1658 }, { "epoch": 4.245681381957773, "grad_norm": 0.26949955966216177, "learning_rate": 2.6527031766682142e-06, "loss": 0.1718, "step": 1659 }, { "epoch": 4.248240563019833, "grad_norm": 0.2788156254648664, "learning_rate": 2.6349134840481294e-06, "loss": 0.1711, "step": 1660 }, { "epoch": 4.250799744081894, "grad_norm": 0.2621763225250691, "learning_rate": 2.6171794359401957e-06, "loss": 0.1532, "step": 1661 }, { "epoch": 4.253358925143954, "grad_norm": 0.2865555062851034, "learning_rate": 2.599501089171217e-06, "loss": 0.1552, "step": 1662 }, { "epoch": 4.255918106206014, "grad_norm": 0.26224793954089864, "learning_rate": 2.581878500389523e-06, "loss": 0.1755, "step": 1663 }, { "epoch": 4.258477287268074, "grad_norm": 0.2721003139035216, "learning_rate": 2.564311726064754e-06, "loss": 0.1898, "step": 1664 }, { "epoch": 4.2610364683301345, "grad_norm": 0.27269538230220364, "learning_rate": 2.546800822487714e-06, "loss": 0.1698, "step": 1665 }, { "epoch": 4.263595649392195, "grad_norm": 0.28827484525986047, "learning_rate": 2.5293458457701726e-06, "loss": 0.2087, "step": 1666 }, { "epoch": 4.266154830454255, "grad_norm": 0.27259819354187614, "learning_rate": 2.5119468518446844e-06, "loss": 0.18, "step": 1667 }, { "epoch": 4.268714011516315, "grad_norm": 0.28656968871426663, "learning_rate": 2.494603896464405e-06, "loss": 0.1818, "step": 1668 }, { "epoch": 4.271273192578375, "grad_norm": 0.28475736704342813, "learning_rate": 2.47731703520294e-06, "loss": 0.1888, "step": 1669 }, { "epoch": 4.273832373640435, "grad_norm": 0.2816641273674954, "learning_rate": 2.4600863234541338e-06, "loss": 0.186, "step": 1670 }, { "epoch": 4.276391554702495, "grad_norm": 0.2836263153168214, "learning_rate": 2.4429118164319076e-06, "loss": 0.1554, "step": 1671 }, { "epoch": 4.278950735764555, "grad_norm": 0.2840636606374706, "learning_rate": 2.4257935691700897e-06, "loss": 0.2089, "step": 1672 }, { "epoch": 4.281509916826615, "grad_norm": 0.2745376280727821, "learning_rate": 2.408731636522217e-06, "loss": 0.1579, "step": 1673 }, { "epoch": 4.2840690978886755, "grad_norm": 0.2605630953509163, "learning_rate": 2.3917260731613733e-06, "loss": 0.1903, "step": 1674 }, { "epoch": 4.286628278950736, "grad_norm": 0.2740285831465967, "learning_rate": 2.374776933580025e-06, "loss": 0.1725, "step": 1675 }, { "epoch": 4.289187460012796, "grad_norm": 0.28095336757761535, "learning_rate": 2.35788427208983e-06, "loss": 0.1867, "step": 1676 }, { "epoch": 4.291746641074856, "grad_norm": 0.27154745065005187, "learning_rate": 2.3410481428214602e-06, "loss": 0.1613, "step": 1677 }, { "epoch": 4.294305822136916, "grad_norm": 0.27345791664879376, "learning_rate": 2.324268599724451e-06, "loss": 0.1667, "step": 1678 }, { "epoch": 4.296865003198977, "grad_norm": 0.2732090679685769, "learning_rate": 2.307545696566997e-06, "loss": 0.1657, "step": 1679 }, { "epoch": 4.299424184261037, "grad_norm": 0.2697521270279199, "learning_rate": 2.2908794869358044e-06, "loss": 0.1897, "step": 1680 }, { "epoch": 4.301983365323097, "grad_norm": 0.27114579546466616, "learning_rate": 2.274270024235912e-06, "loss": 0.188, "step": 1681 }, { "epoch": 4.304542546385157, "grad_norm": 0.2729061433296195, "learning_rate": 2.2577173616905256e-06, "loss": 0.1595, "step": 1682 }, { "epoch": 4.3071017274472165, "grad_norm": 0.2906131303633033, "learning_rate": 2.2412215523408266e-06, "loss": 0.1737, "step": 1683 }, { "epoch": 4.309660908509277, "grad_norm": 0.2768622060092854, "learning_rate": 2.2247826490458223e-06, "loss": 0.1796, "step": 1684 }, { "epoch": 4.312220089571337, "grad_norm": 0.27867740126032275, "learning_rate": 2.2084007044821764e-06, "loss": 0.1565, "step": 1685 }, { "epoch": 4.314779270633397, "grad_norm": 0.2723637032411945, "learning_rate": 2.1920757711440354e-06, "loss": 0.1756, "step": 1686 }, { "epoch": 4.317338451695457, "grad_norm": 0.2676292277176362, "learning_rate": 2.1758079013428435e-06, "loss": 0.1683, "step": 1687 }, { "epoch": 4.319897632757518, "grad_norm": 0.29143070017187106, "learning_rate": 2.159597147207213e-06, "loss": 0.1697, "step": 1688 }, { "epoch": 4.322456813819578, "grad_norm": 0.27260777733690406, "learning_rate": 2.143443560682721e-06, "loss": 0.1788, "step": 1689 }, { "epoch": 4.325015994881638, "grad_norm": 0.27238765087958206, "learning_rate": 2.127347193531757e-06, "loss": 0.1704, "step": 1690 }, { "epoch": 4.327575175943698, "grad_norm": 0.3004112415700906, "learning_rate": 2.1113080973333643e-06, "loss": 0.1684, "step": 1691 }, { "epoch": 4.330134357005758, "grad_norm": 0.27426577172310523, "learning_rate": 2.0953263234830667e-06, "loss": 0.1541, "step": 1692 }, { "epoch": 4.332693538067819, "grad_norm": 0.27454753389749326, "learning_rate": 2.0794019231926986e-06, "loss": 0.1861, "step": 1693 }, { "epoch": 4.335252719129878, "grad_norm": 0.2809263966115645, "learning_rate": 2.0635349474902598e-06, "loss": 0.1785, "step": 1694 }, { "epoch": 4.337811900191938, "grad_norm": 0.2690301934416836, "learning_rate": 2.0477254472197237e-06, "loss": 0.1896, "step": 1695 }, { "epoch": 4.340371081253998, "grad_norm": 0.2682604924926407, "learning_rate": 2.0319734730408935e-06, "loss": 0.1775, "step": 1696 }, { "epoch": 4.342930262316059, "grad_norm": 0.2696807174487812, "learning_rate": 2.016279075429246e-06, "loss": 0.1903, "step": 1697 }, { "epoch": 4.345489443378119, "grad_norm": 0.2665228127768651, "learning_rate": 2.0006423046757596e-06, "loss": 0.1754, "step": 1698 }, { "epoch": 4.348048624440179, "grad_norm": 0.2694146119632947, "learning_rate": 1.985063210886735e-06, "loss": 0.1549, "step": 1699 }, { "epoch": 4.350607805502239, "grad_norm": 0.2822224216458224, "learning_rate": 1.96954184398368e-06, "loss": 0.1362, "step": 1700 }, { "epoch": 4.3531669865642995, "grad_norm": 0.2615035547353888, "learning_rate": 1.9540782537031045e-06, "loss": 0.1586, "step": 1701 }, { "epoch": 4.35572616762636, "grad_norm": 0.269471616538485, "learning_rate": 1.9386724895963805e-06, "loss": 0.1612, "step": 1702 }, { "epoch": 4.35828534868842, "grad_norm": 0.2682599270036803, "learning_rate": 1.9233246010295903e-06, "loss": 0.1822, "step": 1703 }, { "epoch": 4.36084452975048, "grad_norm": 0.2632188336157985, "learning_rate": 1.908034637183356e-06, "loss": 0.1815, "step": 1704 }, { "epoch": 4.36340371081254, "grad_norm": 0.2615148499789861, "learning_rate": 1.8928026470526917e-06, "loss": 0.1545, "step": 1705 }, { "epoch": 4.3659628918746005, "grad_norm": 0.274824297046551, "learning_rate": 1.8776286794468346e-06, "loss": 0.1476, "step": 1706 }, { "epoch": 4.36852207293666, "grad_norm": 0.27527733450113034, "learning_rate": 1.8625127829890922e-06, "loss": 0.2037, "step": 1707 }, { "epoch": 4.37108125399872, "grad_norm": 0.27423935602322536, "learning_rate": 1.8474550061166984e-06, "loss": 0.1719, "step": 1708 }, { "epoch": 4.37364043506078, "grad_norm": 0.27364608091349185, "learning_rate": 1.8324553970806436e-06, "loss": 0.1664, "step": 1709 }, { "epoch": 4.3761996161228405, "grad_norm": 0.2796166648988934, "learning_rate": 1.817514003945524e-06, "loss": 0.1953, "step": 1710 }, { "epoch": 4.378758797184901, "grad_norm": 0.26079246660216127, "learning_rate": 1.802630874589404e-06, "loss": 0.1641, "step": 1711 }, { "epoch": 4.381317978246961, "grad_norm": 0.2999617994892891, "learning_rate": 1.787806056703627e-06, "loss": 0.1718, "step": 1712 }, { "epoch": 4.383877159309021, "grad_norm": 0.2889063824640918, "learning_rate": 1.7730395977926917e-06, "loss": 0.1653, "step": 1713 }, { "epoch": 4.386436340371081, "grad_norm": 0.26748602837639757, "learning_rate": 1.758331545174099e-06, "loss": 0.1842, "step": 1714 }, { "epoch": 4.3889955214331415, "grad_norm": 0.2806929556508416, "learning_rate": 1.743681945978184e-06, "loss": 0.1586, "step": 1715 }, { "epoch": 4.391554702495202, "grad_norm": 0.29287158041254, "learning_rate": 1.7290908471479805e-06, "loss": 0.1761, "step": 1716 }, { "epoch": 4.394113883557262, "grad_norm": 0.28949556938232984, "learning_rate": 1.7145582954390638e-06, "loss": 0.1831, "step": 1717 }, { "epoch": 4.396673064619322, "grad_norm": 0.2707679886069612, "learning_rate": 1.7000843374193987e-06, "loss": 0.1796, "step": 1718 }, { "epoch": 4.399232245681382, "grad_norm": 0.26434041057826485, "learning_rate": 1.6856690194691872e-06, "loss": 0.1812, "step": 1719 }, { "epoch": 4.401791426743442, "grad_norm": 0.2717307275192052, "learning_rate": 1.6713123877807413e-06, "loss": 0.1618, "step": 1720 }, { "epoch": 4.404350607805502, "grad_norm": 0.26427198325914797, "learning_rate": 1.6570144883582994e-06, "loss": 0.1485, "step": 1721 }, { "epoch": 4.406909788867562, "grad_norm": 0.2830739919541902, "learning_rate": 1.6427753670179214e-06, "loss": 0.1628, "step": 1722 }, { "epoch": 4.409468969929622, "grad_norm": 0.2667553896951004, "learning_rate": 1.6285950693872999e-06, "loss": 0.1887, "step": 1723 }, { "epoch": 4.4120281509916826, "grad_norm": 0.2681400499833261, "learning_rate": 1.614473640905645e-06, "loss": 0.1629, "step": 1724 }, { "epoch": 4.414587332053743, "grad_norm": 0.2629743389937269, "learning_rate": 1.6004111268235156e-06, "loss": 0.2008, "step": 1725 }, { "epoch": 4.417146513115803, "grad_norm": 0.285465240532548, "learning_rate": 1.5864075722027017e-06, "loss": 0.191, "step": 1726 }, { "epoch": 4.419705694177863, "grad_norm": 0.2683638382247192, "learning_rate": 1.5724630219160553e-06, "loss": 0.2073, "step": 1727 }, { "epoch": 4.422264875239923, "grad_norm": 0.2829211001794818, "learning_rate": 1.5585775206473508e-06, "loss": 0.1568, "step": 1728 }, { "epoch": 4.424824056301984, "grad_norm": 0.2788212658151338, "learning_rate": 1.5447511128911542e-06, "loss": 0.1728, "step": 1729 }, { "epoch": 4.427383237364044, "grad_norm": 0.2848579162361746, "learning_rate": 1.5309838429526714e-06, "loss": 0.1904, "step": 1730 }, { "epoch": 4.429942418426104, "grad_norm": 0.2654662056800488, "learning_rate": 1.5172757549476024e-06, "loss": 0.166, "step": 1731 }, { "epoch": 4.432501599488164, "grad_norm": 0.285577343916777, "learning_rate": 1.5036268928020125e-06, "loss": 0.195, "step": 1732 }, { "epoch": 4.435060780550224, "grad_norm": 0.25545559192222317, "learning_rate": 1.4900373002521851e-06, "loss": 0.1706, "step": 1733 }, { "epoch": 4.437619961612284, "grad_norm": 0.279484081091582, "learning_rate": 1.4765070208444732e-06, "loss": 0.1909, "step": 1734 }, { "epoch": 4.440179142674344, "grad_norm": 0.27394133244756325, "learning_rate": 1.4630360979351644e-06, "loss": 0.1955, "step": 1735 }, { "epoch": 4.442738323736404, "grad_norm": 0.27730795832891525, "learning_rate": 1.4496245746903626e-06, "loss": 0.1668, "step": 1736 }, { "epoch": 4.445297504798464, "grad_norm": 0.2515943739407271, "learning_rate": 1.4362724940858109e-06, "loss": 0.173, "step": 1737 }, { "epoch": 4.447856685860525, "grad_norm": 0.25367847682125305, "learning_rate": 1.422979898906789e-06, "loss": 0.1639, "step": 1738 }, { "epoch": 4.450415866922585, "grad_norm": 0.27558480853746065, "learning_rate": 1.4097468317479623e-06, "loss": 0.1633, "step": 1739 }, { "epoch": 4.452975047984645, "grad_norm": 0.27695396643812065, "learning_rate": 1.396573335013236e-06, "loss": 0.1808, "step": 1740 }, { "epoch": 4.455534229046705, "grad_norm": 0.2804193028236503, "learning_rate": 1.3834594509156319e-06, "loss": 0.1673, "step": 1741 }, { "epoch": 4.4580934101087655, "grad_norm": 0.2782333366929398, "learning_rate": 1.3704052214771513e-06, "loss": 0.1971, "step": 1742 }, { "epoch": 4.460652591170826, "grad_norm": 0.2777400443098731, "learning_rate": 1.3574106885286465e-06, "loss": 0.1737, "step": 1743 }, { "epoch": 4.463211772232885, "grad_norm": 0.2764493972670724, "learning_rate": 1.344475893709658e-06, "loss": 0.1904, "step": 1744 }, { "epoch": 4.465770953294945, "grad_norm": 0.2752241285220294, "learning_rate": 1.3316008784683265e-06, "loss": 0.1613, "step": 1745 }, { "epoch": 4.468330134357005, "grad_norm": 0.2693372116468191, "learning_rate": 1.3187856840612167e-06, "loss": 0.1627, "step": 1746 }, { "epoch": 4.470889315419066, "grad_norm": 0.26299523604064184, "learning_rate": 1.3060303515532135e-06, "loss": 0.1644, "step": 1747 }, { "epoch": 4.473448496481126, "grad_norm": 0.26861009102213246, "learning_rate": 1.2933349218173774e-06, "loss": 0.1748, "step": 1748 }, { "epoch": 4.476007677543186, "grad_norm": 0.266256378668002, "learning_rate": 1.2806994355348224e-06, "loss": 0.1717, "step": 1749 }, { "epoch": 4.478566858605246, "grad_norm": 0.2783524667972571, "learning_rate": 1.2681239331945695e-06, "loss": 0.1739, "step": 1750 }, { "epoch": 4.4811260396673065, "grad_norm": 0.2807754665043445, "learning_rate": 1.2556084550934423e-06, "loss": 0.163, "step": 1751 }, { "epoch": 4.483685220729367, "grad_norm": 0.2751132941559695, "learning_rate": 1.2431530413359138e-06, "loss": 0.1596, "step": 1752 }, { "epoch": 4.486244401791427, "grad_norm": 0.27965370827809377, "learning_rate": 1.2307577318339825e-06, "loss": 0.1764, "step": 1753 }, { "epoch": 4.488803582853487, "grad_norm": 0.27090435786248723, "learning_rate": 1.2184225663070604e-06, "loss": 0.1904, "step": 1754 }, { "epoch": 4.491362763915547, "grad_norm": 0.2830802635501525, "learning_rate": 1.2061475842818337e-06, "loss": 0.1785, "step": 1755 }, { "epoch": 4.4939219449776076, "grad_norm": 0.2624221347168147, "learning_rate": 1.1939328250921278e-06, "loss": 0.1804, "step": 1756 }, { "epoch": 4.496481126039667, "grad_norm": 0.27670735162368, "learning_rate": 1.1817783278788042e-06, "loss": 0.1534, "step": 1757 }, { "epoch": 4.499040307101727, "grad_norm": 0.2755467325350106, "learning_rate": 1.169684131589608e-06, "loss": 0.1791, "step": 1758 }, { "epoch": 4.501599488163787, "grad_norm": 0.2750192315338786, "learning_rate": 1.1576502749790608e-06, "loss": 0.1721, "step": 1759 }, { "epoch": 4.5041586692258475, "grad_norm": 0.26825366429953873, "learning_rate": 1.1456767966083393e-06, "loss": 0.1739, "step": 1760 }, { "epoch": 4.506717850287908, "grad_norm": 0.26461882189193386, "learning_rate": 1.1337637348451369e-06, "loss": 0.1836, "step": 1761 }, { "epoch": 4.509277031349968, "grad_norm": 0.27170229996613754, "learning_rate": 1.1219111278635575e-06, "loss": 0.1746, "step": 1762 }, { "epoch": 4.511836212412028, "grad_norm": 0.28612289439672206, "learning_rate": 1.1101190136439689e-06, "loss": 0.1664, "step": 1763 }, { "epoch": 4.514395393474088, "grad_norm": 0.2814719237385938, "learning_rate": 1.0983874299729092e-06, "loss": 0.1552, "step": 1764 }, { "epoch": 4.516954574536149, "grad_norm": 0.27224408532725913, "learning_rate": 1.086716414442952e-06, "loss": 0.155, "step": 1765 }, { "epoch": 4.519513755598209, "grad_norm": 0.2683837888920839, "learning_rate": 1.0751060044525797e-06, "loss": 0.1947, "step": 1766 }, { "epoch": 4.522072936660269, "grad_norm": 0.266405093166955, "learning_rate": 1.0635562372060825e-06, "loss": 0.179, "step": 1767 }, { "epoch": 4.524632117722329, "grad_norm": 0.26568191781978007, "learning_rate": 1.052067149713416e-06, "loss": 0.1595, "step": 1768 }, { "epoch": 4.527191298784389, "grad_norm": 0.27613787388854283, "learning_rate": 1.0406387787900974e-06, "loss": 0.2022, "step": 1769 }, { "epoch": 4.529750479846449, "grad_norm": 0.2783446591602134, "learning_rate": 1.0292711610570904e-06, "loss": 0.1965, "step": 1770 }, { "epoch": 4.532309660908509, "grad_norm": 0.2754628182404677, "learning_rate": 1.0179643329406752e-06, "loss": 0.1796, "step": 1771 }, { "epoch": 4.534868841970569, "grad_norm": 0.2717991423747503, "learning_rate": 1.0067183306723384e-06, "loss": 0.1872, "step": 1772 }, { "epoch": 4.537428023032629, "grad_norm": 0.26023938540588254, "learning_rate": 9.955331902886645e-07, "loss": 0.1645, "step": 1773 }, { "epoch": 4.53998720409469, "grad_norm": 0.2697243580148783, "learning_rate": 9.844089476312035e-07, "loss": 0.1736, "step": 1774 }, { "epoch": 4.54254638515675, "grad_norm": 0.27089652411524956, "learning_rate": 9.733456383463658e-07, "loss": 0.156, "step": 1775 }, { "epoch": 4.54510556621881, "grad_norm": 0.2625263168411182, "learning_rate": 9.62343297885313e-07, "loss": 0.1709, "step": 1776 }, { "epoch": 4.54766474728087, "grad_norm": 0.2720147925441457, "learning_rate": 9.514019615038395e-07, "loss": 0.1609, "step": 1777 }, { "epoch": 4.55022392834293, "grad_norm": 0.26862738106885103, "learning_rate": 9.40521664262255e-07, "loss": 0.1823, "step": 1778 }, { "epoch": 4.552783109404991, "grad_norm": 0.2858477259373205, "learning_rate": 9.297024410252753e-07, "loss": 0.1719, "step": 1779 }, { "epoch": 4.555342290467051, "grad_norm": 0.2792231695337476, "learning_rate": 9.189443264619102e-07, "loss": 0.2187, "step": 1780 }, { "epoch": 4.557901471529111, "grad_norm": 0.2722587468079133, "learning_rate": 9.082473550453619e-07, "loss": 0.1581, "step": 1781 }, { "epoch": 4.560460652591171, "grad_norm": 0.25994947891197123, "learning_rate": 8.976115610528957e-07, "loss": 0.1813, "step": 1782 }, { "epoch": 4.563019833653231, "grad_norm": 0.2685596280130304, "learning_rate": 8.870369785657451e-07, "loss": 0.1637, "step": 1783 }, { "epoch": 4.565579014715291, "grad_norm": 0.2624613954232775, "learning_rate": 8.765236414690026e-07, "loss": 0.1867, "step": 1784 }, { "epoch": 4.568138195777351, "grad_norm": 0.26985980601394455, "learning_rate": 8.660715834514977e-07, "loss": 0.1812, "step": 1785 }, { "epoch": 4.570697376839411, "grad_norm": 0.2782580432929674, "learning_rate": 8.556808380057013e-07, "loss": 0.1551, "step": 1786 }, { "epoch": 4.5732565579014715, "grad_norm": 0.3229561788111089, "learning_rate": 8.453514384276196e-07, "loss": 0.1665, "step": 1787 }, { "epoch": 4.575815738963532, "grad_norm": 0.26676739208882927, "learning_rate": 8.350834178166755e-07, "loss": 0.2019, "step": 1788 }, { "epoch": 4.578374920025592, "grad_norm": 0.25638061426053027, "learning_rate": 8.248768090756143e-07, "loss": 0.1623, "step": 1789 }, { "epoch": 4.580934101087652, "grad_norm": 0.2769760353268046, "learning_rate": 8.147316449103959e-07, "loss": 0.193, "step": 1790 }, { "epoch": 4.583493282149712, "grad_norm": 0.2827217753260577, "learning_rate": 8.046479578300803e-07, "loss": 0.1573, "step": 1791 }, { "epoch": 4.5860524632117725, "grad_norm": 0.267728272299288, "learning_rate": 7.946257801467339e-07, "loss": 0.1534, "step": 1792 }, { "epoch": 4.588611644273833, "grad_norm": 0.26899124519431056, "learning_rate": 7.846651439753273e-07, "loss": 0.1785, "step": 1793 }, { "epoch": 4.591170825335892, "grad_norm": 0.2655562652017706, "learning_rate": 7.747660812336221e-07, "loss": 0.1632, "step": 1794 }, { "epoch": 4.593730006397953, "grad_norm": 0.2912418780943663, "learning_rate": 7.649286236420806e-07, "loss": 0.1664, "step": 1795 }, { "epoch": 4.5962891874600125, "grad_norm": 0.2773582855251603, "learning_rate": 7.551528027237553e-07, "loss": 0.1649, "step": 1796 }, { "epoch": 4.598848368522073, "grad_norm": 0.2706350032862212, "learning_rate": 7.454386498041865e-07, "loss": 0.1897, "step": 1797 }, { "epoch": 4.601407549584133, "grad_norm": 0.27987597117336843, "learning_rate": 7.357861960113121e-07, "loss": 0.1806, "step": 1798 }, { "epoch": 4.603966730646193, "grad_norm": 0.2554799929519513, "learning_rate": 7.261954722753595e-07, "loss": 0.1454, "step": 1799 }, { "epoch": 4.606525911708253, "grad_norm": 0.28194077726489003, "learning_rate": 7.166665093287539e-07, "loss": 0.1956, "step": 1800 }, { "epoch": 4.6090850927703135, "grad_norm": 0.27206485970301414, "learning_rate": 7.071993377060038e-07, "loss": 0.1813, "step": 1801 }, { "epoch": 4.611644273832374, "grad_norm": 0.27639368969275124, "learning_rate": 6.977939877436224e-07, "loss": 0.1937, "step": 1802 }, { "epoch": 4.614203454894434, "grad_norm": 0.26700294636297844, "learning_rate": 6.884504895800237e-07, "loss": 0.159, "step": 1803 }, { "epoch": 4.616762635956494, "grad_norm": 0.2715005815453172, "learning_rate": 6.791688731554158e-07, "loss": 0.1608, "step": 1804 }, { "epoch": 4.619321817018554, "grad_norm": 0.27127828240291824, "learning_rate": 6.69949168211721e-07, "loss": 0.1857, "step": 1805 }, { "epoch": 4.621880998080615, "grad_norm": 0.28402081462443657, "learning_rate": 6.607914042924756e-07, "loss": 0.1918, "step": 1806 }, { "epoch": 4.624440179142674, "grad_norm": 0.26263908410916775, "learning_rate": 6.516956107427241e-07, "loss": 0.1569, "step": 1807 }, { "epoch": 4.626999360204734, "grad_norm": 0.27371755225997646, "learning_rate": 6.426618167089338e-07, "loss": 0.1557, "step": 1808 }, { "epoch": 4.629558541266794, "grad_norm": 0.26959266513847036, "learning_rate": 6.336900511389133e-07, "loss": 0.1733, "step": 1809 }, { "epoch": 4.6321177223288545, "grad_norm": 0.27453758652223553, "learning_rate": 6.247803427816945e-07, "loss": 0.1635, "step": 1810 }, { "epoch": 4.634676903390915, "grad_norm": 0.2673151789681698, "learning_rate": 6.159327201874598e-07, "loss": 0.1709, "step": 1811 }, { "epoch": 4.637236084452975, "grad_norm": 0.2702926085830735, "learning_rate": 6.071472117074462e-07, "loss": 0.1815, "step": 1812 }, { "epoch": 4.639795265515035, "grad_norm": 0.2788070786022333, "learning_rate": 5.984238454938496e-07, "loss": 0.1527, "step": 1813 }, { "epoch": 4.642354446577095, "grad_norm": 0.27358568856995236, "learning_rate": 5.897626494997366e-07, "loss": 0.1785, "step": 1814 }, { "epoch": 4.644913627639156, "grad_norm": 0.2718095549716457, "learning_rate": 5.811636514789598e-07, "loss": 0.1853, "step": 1815 }, { "epoch": 4.647472808701216, "grad_norm": 0.27759832517042105, "learning_rate": 5.726268789860645e-07, "loss": 0.1646, "step": 1816 }, { "epoch": 4.650031989763276, "grad_norm": 0.26320625609355736, "learning_rate": 5.641523593761977e-07, "loss": 0.1723, "step": 1817 }, { "epoch": 4.652591170825336, "grad_norm": 0.27780583001556897, "learning_rate": 5.557401198050327e-07, "loss": 0.184, "step": 1818 }, { "epoch": 4.6551503518873965, "grad_norm": 0.27504183175562963, "learning_rate": 5.473901872286602e-07, "loss": 0.1712, "step": 1819 }, { "epoch": 4.657709532949456, "grad_norm": 0.2774680321446144, "learning_rate": 5.391025884035239e-07, "loss": 0.1817, "step": 1820 }, { "epoch": 4.660268714011516, "grad_norm": 0.26555781569772313, "learning_rate": 5.308773498863251e-07, "loss": 0.1576, "step": 1821 }, { "epoch": 4.662827895073576, "grad_norm": 0.33797160433489215, "learning_rate": 5.22714498033936e-07, "loss": 0.1929, "step": 1822 }, { "epoch": 4.665387076135636, "grad_norm": 0.28245555374717063, "learning_rate": 5.146140590033199e-07, "loss": 0.1869, "step": 1823 }, { "epoch": 4.667946257197697, "grad_norm": 0.27464067240369455, "learning_rate": 5.065760587514446e-07, "loss": 0.1902, "step": 1824 }, { "epoch": 4.670505438259757, "grad_norm": 0.2672943490021358, "learning_rate": 4.986005230351954e-07, "loss": 0.188, "step": 1825 }, { "epoch": 4.673064619321817, "grad_norm": 0.2657663358279065, "learning_rate": 4.906874774113024e-07, "loss": 0.184, "step": 1826 }, { "epoch": 4.675623800383877, "grad_norm": 0.2674714454963707, "learning_rate": 4.828369472362493e-07, "loss": 0.1469, "step": 1827 }, { "epoch": 4.6781829814459375, "grad_norm": 0.2882898088898947, "learning_rate": 4.750489576662021e-07, "loss": 0.162, "step": 1828 }, { "epoch": 4.680742162507998, "grad_norm": 0.27727023737142387, "learning_rate": 4.6732353365691374e-07, "loss": 0.1543, "step": 1829 }, { "epoch": 4.683301343570058, "grad_norm": 0.2636650641131126, "learning_rate": 4.5966069996365993e-07, "loss": 0.1561, "step": 1830 }, { "epoch": 4.685860524632118, "grad_norm": 0.2682699322744399, "learning_rate": 4.5206048114114775e-07, "loss": 0.1673, "step": 1831 }, { "epoch": 4.688419705694178, "grad_norm": 0.2743352871936966, "learning_rate": 4.4452290154344046e-07, "loss": 0.1807, "step": 1832 }, { "epoch": 4.690978886756238, "grad_norm": 0.2770317090716807, "learning_rate": 4.3704798532388624e-07, "loss": 0.2129, "step": 1833 }, { "epoch": 4.693538067818298, "grad_norm": 0.280836808159879, "learning_rate": 4.296357564350362e-07, "loss": 0.1604, "step": 1834 }, { "epoch": 4.696097248880358, "grad_norm": 0.26525175245500215, "learning_rate": 4.22286238628562e-07, "loss": 0.1763, "step": 1835 }, { "epoch": 4.698656429942418, "grad_norm": 0.2700214270271814, "learning_rate": 4.1499945545518283e-07, "loss": 0.154, "step": 1836 }, { "epoch": 4.7012156110044785, "grad_norm": 0.24327556501731853, "learning_rate": 4.077754302645964e-07, "loss": 0.1616, "step": 1837 }, { "epoch": 4.703774792066539, "grad_norm": 0.2652109590879505, "learning_rate": 4.006141862054014e-07, "loss": 0.1809, "step": 1838 }, { "epoch": 4.706333973128599, "grad_norm": 0.26799443888146, "learning_rate": 3.935157462250128e-07, "loss": 0.1799, "step": 1839 }, { "epoch": 4.708893154190659, "grad_norm": 0.26417342730564347, "learning_rate": 3.8648013306960664e-07, "loss": 0.1697, "step": 1840 }, { "epoch": 4.711452335252719, "grad_norm": 0.2655166510116645, "learning_rate": 3.7950736928402674e-07, "loss": 0.1354, "step": 1841 }, { "epoch": 4.7140115163147795, "grad_norm": 0.27018720652857536, "learning_rate": 3.7259747721173134e-07, "loss": 0.1568, "step": 1842 }, { "epoch": 4.71657069737684, "grad_norm": 0.26944814432184416, "learning_rate": 3.6575047899471085e-07, "loss": 0.1539, "step": 1843 }, { "epoch": 4.719129878438899, "grad_norm": 0.273820427322093, "learning_rate": 3.5896639657342134e-07, "loss": 0.1566, "step": 1844 }, { "epoch": 4.72168905950096, "grad_norm": 0.25408644087595794, "learning_rate": 3.522452516867048e-07, "loss": 0.1751, "step": 1845 }, { "epoch": 4.7242482405630195, "grad_norm": 0.282638324070005, "learning_rate": 3.455870658717353e-07, "loss": 0.1788, "step": 1846 }, { "epoch": 4.72680742162508, "grad_norm": 0.27645267654347633, "learning_rate": 3.3899186046393526e-07, "loss": 0.1856, "step": 1847 }, { "epoch": 4.72936660268714, "grad_norm": 0.28435322569370036, "learning_rate": 3.324596565969174e-07, "loss": 0.1903, "step": 1848 }, { "epoch": 4.7319257837492, "grad_norm": 0.2665475946312596, "learning_rate": 3.2599047520241123e-07, "loss": 0.1625, "step": 1849 }, { "epoch": 4.73448496481126, "grad_norm": 0.2848927094305906, "learning_rate": 3.1958433701019697e-07, "loss": 0.2058, "step": 1850 }, { "epoch": 4.737044145873321, "grad_norm": 0.2704317535606738, "learning_rate": 3.1324126254804524e-07, "loss": 0.1868, "step": 1851 }, { "epoch": 4.739603326935381, "grad_norm": 0.2800291692162128, "learning_rate": 3.069612721416371e-07, "loss": 0.1794, "step": 1852 }, { "epoch": 4.742162507997441, "grad_norm": 0.27622038521669706, "learning_rate": 3.007443859145087e-07, "loss": 0.1701, "step": 1853 }, { "epoch": 4.744721689059501, "grad_norm": 0.28051196613093177, "learning_rate": 2.9459062378799806e-07, "loss": 0.193, "step": 1854 }, { "epoch": 4.747280870121561, "grad_norm": 0.26937886948023293, "learning_rate": 2.8850000548115155e-07, "loss": 0.1645, "step": 1855 }, { "epoch": 4.749840051183622, "grad_norm": 0.2574745053917364, "learning_rate": 2.8247255051068845e-07, "loss": 0.1711, "step": 1856 }, { "epoch": 4.752399232245681, "grad_norm": 0.2716275327438086, "learning_rate": 2.7650827819093005e-07, "loss": 0.1699, "step": 1857 }, { "epoch": 4.754958413307741, "grad_norm": 0.2590049108849854, "learning_rate": 2.706072076337285e-07, "loss": 0.1648, "step": 1858 }, { "epoch": 4.757517594369801, "grad_norm": 0.27267669634639347, "learning_rate": 2.647693577484156e-07, "loss": 0.1887, "step": 1859 }, { "epoch": 4.760076775431862, "grad_norm": 0.27747570018737217, "learning_rate": 2.5899474724174313e-07, "loss": 0.1822, "step": 1860 }, { "epoch": 4.762635956493922, "grad_norm": 0.27256177644643004, "learning_rate": 2.532833946178137e-07, "loss": 0.1833, "step": 1861 }, { "epoch": 4.765195137555982, "grad_norm": 0.26875210471690963, "learning_rate": 2.4763531817802777e-07, "loss": 0.1634, "step": 1862 }, { "epoch": 4.767754318618042, "grad_norm": 0.2841356724669023, "learning_rate": 2.4205053602103015e-07, "loss": 0.1716, "step": 1863 }, { "epoch": 4.770313499680102, "grad_norm": 0.28261688509298977, "learning_rate": 2.365290660426389e-07, "loss": 0.1804, "step": 1864 }, { "epoch": 4.772872680742163, "grad_norm": 0.2673985339554513, "learning_rate": 2.3107092593579905e-07, "loss": 0.17, "step": 1865 }, { "epoch": 4.775431861804223, "grad_norm": 0.2644854641715479, "learning_rate": 2.2567613319051997e-07, "loss": 0.1624, "step": 1866 }, { "epoch": 4.777991042866283, "grad_norm": 0.2657701185136481, "learning_rate": 2.2034470509382234e-07, "loss": 0.1967, "step": 1867 }, { "epoch": 4.780550223928343, "grad_norm": 0.26830383283496656, "learning_rate": 2.1507665872968264e-07, "loss": 0.1743, "step": 1868 }, { "epoch": 4.7831094049904035, "grad_norm": 0.26825661403886375, "learning_rate": 2.0987201097897757e-07, "loss": 0.1697, "step": 1869 }, { "epoch": 4.785668586052463, "grad_norm": 0.25032287801653486, "learning_rate": 2.0473077851942858e-07, "loss": 0.1692, "step": 1870 }, { "epoch": 4.788227767114523, "grad_norm": 0.2540438269392125, "learning_rate": 1.9965297782554848e-07, "loss": 0.1594, "step": 1871 }, { "epoch": 4.790786948176583, "grad_norm": 0.25648856154725486, "learning_rate": 1.9463862516859277e-07, "loss": 0.1862, "step": 1872 }, { "epoch": 4.7933461292386434, "grad_norm": 0.27520225342246696, "learning_rate": 1.896877366165062e-07, "loss": 0.1625, "step": 1873 }, { "epoch": 4.795905310300704, "grad_norm": 0.2879456654071117, "learning_rate": 1.8480032803386505e-07, "loss": 0.1647, "step": 1874 }, { "epoch": 4.798464491362764, "grad_norm": 0.26113619907647856, "learning_rate": 1.799764150818306e-07, "loss": 0.1556, "step": 1875 }, { "epoch": 4.801023672424824, "grad_norm": 0.2684336534763457, "learning_rate": 1.7521601321810687e-07, "loss": 0.1686, "step": 1876 }, { "epoch": 4.803582853486884, "grad_norm": 0.287932659535979, "learning_rate": 1.7051913769687623e-07, "loss": 0.1549, "step": 1877 }, { "epoch": 4.8061420345489445, "grad_norm": 0.26143215817508003, "learning_rate": 1.658858035687594e-07, "loss": 0.1796, "step": 1878 }, { "epoch": 4.808701215611005, "grad_norm": 0.26742321309706935, "learning_rate": 1.6131602568076887e-07, "loss": 0.1723, "step": 1879 }, { "epoch": 4.811260396673065, "grad_norm": 0.26866517810628654, "learning_rate": 1.5680981867625566e-07, "loss": 0.1631, "step": 1880 }, { "epoch": 4.813819577735125, "grad_norm": 0.2664673668196368, "learning_rate": 1.5236719699486256e-07, "loss": 0.1595, "step": 1881 }, { "epoch": 4.816378758797185, "grad_norm": 0.26624359763267497, "learning_rate": 1.479881748724865e-07, "loss": 0.174, "step": 1882 }, { "epoch": 4.818937939859245, "grad_norm": 0.26586320390850965, "learning_rate": 1.4367276634122073e-07, "loss": 0.1733, "step": 1883 }, { "epoch": 4.821497120921305, "grad_norm": 0.2734079332163835, "learning_rate": 1.3942098522931491e-07, "loss": 0.1524, "step": 1884 }, { "epoch": 4.824056301983365, "grad_norm": 0.26860707446851667, "learning_rate": 1.3523284516113955e-07, "loss": 0.1801, "step": 1885 }, { "epoch": 4.826615483045425, "grad_norm": 0.26293219622877667, "learning_rate": 1.3110835955712831e-07, "loss": 0.1789, "step": 1886 }, { "epoch": 4.8291746641074855, "grad_norm": 0.26548180261766674, "learning_rate": 1.2704754163374022e-07, "loss": 0.1643, "step": 1887 }, { "epoch": 4.831733845169546, "grad_norm": 0.27466550793343814, "learning_rate": 1.2305040440342198e-07, "loss": 0.1417, "step": 1888 }, { "epoch": 4.834293026231606, "grad_norm": 0.27044679875019695, "learning_rate": 1.1911696067455902e-07, "loss": 0.1862, "step": 1889 }, { "epoch": 4.836852207293666, "grad_norm": 0.2667382729572236, "learning_rate": 1.1524722305144231e-07, "loss": 0.1671, "step": 1890 }, { "epoch": 4.839411388355726, "grad_norm": 0.26077082259648754, "learning_rate": 1.114412039342172e-07, "loss": 0.1583, "step": 1891 }, { "epoch": 4.841970569417787, "grad_norm": 0.2683982318957702, "learning_rate": 1.0769891551885903e-07, "loss": 0.1689, "step": 1892 }, { "epoch": 4.844529750479847, "grad_norm": 0.26442562426087834, "learning_rate": 1.0402036979711317e-07, "loss": 0.1901, "step": 1893 }, { "epoch": 4.847088931541906, "grad_norm": 0.2628418022630685, "learning_rate": 1.0040557855648169e-07, "loss": 0.1628, "step": 1894 }, { "epoch": 4.849648112603967, "grad_norm": 0.2643402568604478, "learning_rate": 9.685455338016347e-08, "loss": 0.1769, "step": 1895 }, { "epoch": 4.8522072936660265, "grad_norm": 0.27073662782178587, "learning_rate": 9.336730564702745e-08, "loss": 0.163, "step": 1896 }, { "epoch": 4.854766474728087, "grad_norm": 0.2592523308295433, "learning_rate": 8.994384653157718e-08, "loss": 0.1748, "step": 1897 }, { "epoch": 4.857325655790147, "grad_norm": 0.26364468600135543, "learning_rate": 8.658418700391302e-08, "loss": 0.1658, "step": 1898 }, { "epoch": 4.859884836852207, "grad_norm": 0.26261286147824464, "learning_rate": 8.328833782969003e-08, "loss": 0.166, "step": 1899 }, { "epoch": 4.862444017914267, "grad_norm": 0.2752608130158951, "learning_rate": 8.005630957010014e-08, "loss": 0.1832, "step": 1900 }, { "epoch": 4.865003198976328, "grad_norm": 0.27593480590108715, "learning_rate": 7.688811258181883e-08, "loss": 0.1838, "step": 1901 }, { "epoch": 4.867562380038388, "grad_norm": 0.27236472207879586, "learning_rate": 7.378375701698748e-08, "loss": 0.1898, "step": 1902 }, { "epoch": 4.870121561100448, "grad_norm": 0.25195081103744715, "learning_rate": 7.074325282317329e-08, "loss": 0.1754, "step": 1903 }, { "epoch": 4.872680742162508, "grad_norm": 0.2869783000463503, "learning_rate": 6.776660974333605e-08, "loss": 0.1572, "step": 1904 }, { "epoch": 4.8752399232245685, "grad_norm": 0.27441367536012884, "learning_rate": 6.485383731580142e-08, "loss": 0.1766, "step": 1905 }, { "epoch": 4.877799104286629, "grad_norm": 0.28772532733219014, "learning_rate": 6.200494487422771e-08, "loss": 0.1794, "step": 1906 }, { "epoch": 4.880358285348688, "grad_norm": 0.27889648675155043, "learning_rate": 5.921994154758137e-08, "loss": 0.17, "step": 1907 }, { "epoch": 4.882917466410748, "grad_norm": 0.25762876059244744, "learning_rate": 5.649883626009933e-08, "loss": 0.1415, "step": 1908 }, { "epoch": 4.885476647472808, "grad_norm": 0.27418736611080513, "learning_rate": 5.3841637731260054e-08, "loss": 0.1637, "step": 1909 }, { "epoch": 4.888035828534869, "grad_norm": 0.2644534739010174, "learning_rate": 5.1248354475768034e-08, "loss": 0.1856, "step": 1910 }, { "epoch": 4.890595009596929, "grad_norm": 0.26943083477416585, "learning_rate": 4.871899480351605e-08, "loss": 0.1833, "step": 1911 }, { "epoch": 4.893154190658989, "grad_norm": 0.27096897776649476, "learning_rate": 4.6253566819554066e-08, "loss": 0.1646, "step": 1912 }, { "epoch": 4.895713371721049, "grad_norm": 0.2810986979004101, "learning_rate": 4.385207842407813e-08, "loss": 0.1688, "step": 1913 }, { "epoch": 4.8982725527831095, "grad_norm": 0.2780463267788789, "learning_rate": 4.151453731239707e-08, "loss": 0.1889, "step": 1914 }, { "epoch": 4.90083173384517, "grad_norm": 0.2740470136715985, "learning_rate": 3.924095097489922e-08, "loss": 0.1771, "step": 1915 }, { "epoch": 4.90339091490723, "grad_norm": 0.2670609796470983, "learning_rate": 3.703132669704568e-08, "loss": 0.1767, "step": 1916 }, { "epoch": 4.90595009596929, "grad_norm": 0.276634001409615, "learning_rate": 3.4885671559332645e-08, "loss": 0.1698, "step": 1917 }, { "epoch": 4.90850927703135, "grad_norm": 0.2836253559347624, "learning_rate": 3.280399243727806e-08, "loss": 0.1434, "step": 1918 }, { "epoch": 4.9110684580934105, "grad_norm": 0.2596326694355661, "learning_rate": 3.078629600139271e-08, "loss": 0.1738, "step": 1919 }, { "epoch": 4.91362763915547, "grad_norm": 0.26138854550688545, "learning_rate": 2.8832588717164766e-08, "loss": 0.1698, "step": 1920 }, { "epoch": 4.91618682021753, "grad_norm": 0.2629381551374675, "learning_rate": 2.694287684503083e-08, "loss": 0.1803, "step": 1921 }, { "epoch": 4.91874600127959, "grad_norm": 0.2707296291029451, "learning_rate": 2.511716644036932e-08, "loss": 0.2076, "step": 1922 }, { "epoch": 4.9213051823416505, "grad_norm": 0.2643795979993967, "learning_rate": 2.3355463353467168e-08, "loss": 0.1737, "step": 1923 }, { "epoch": 4.923864363403711, "grad_norm": 0.2687386354716593, "learning_rate": 2.1657773229508684e-08, "loss": 0.1525, "step": 1924 }, { "epoch": 4.926423544465771, "grad_norm": 0.25773983452315097, "learning_rate": 2.0024101508555604e-08, "loss": 0.1611, "step": 1925 }, { "epoch": 4.928982725527831, "grad_norm": 0.2759068990299683, "learning_rate": 1.8454453425527098e-08, "loss": 0.149, "step": 1926 }, { "epoch": 4.931541906589891, "grad_norm": 0.2658729323365574, "learning_rate": 1.6948834010190874e-08, "loss": 0.1928, "step": 1927 }, { "epoch": 4.9341010876519515, "grad_norm": 0.28178214512822436, "learning_rate": 1.550724808713877e-08, "loss": 0.1885, "step": 1928 }, { "epoch": 4.936660268714012, "grad_norm": 0.28860699599060785, "learning_rate": 1.4129700275771208e-08, "loss": 0.1466, "step": 1929 }, { "epoch": 4.939219449776072, "grad_norm": 0.2584677694793919, "learning_rate": 1.281619499029274e-08, "loss": 0.1844, "step": 1930 }, { "epoch": 4.941778630838132, "grad_norm": 0.2704355939225655, "learning_rate": 1.1566736439685422e-08, "loss": 0.1687, "step": 1931 }, { "epoch": 4.944337811900192, "grad_norm": 0.2647721632155255, "learning_rate": 1.0381328627702136e-08, "loss": 0.1398, "step": 1932 }, { "epoch": 4.946896992962252, "grad_norm": 0.26415602194940657, "learning_rate": 9.259975352848838e-09, "loss": 0.1665, "step": 1933 }, { "epoch": 4.949456174024312, "grad_norm": 0.26483516576483956, "learning_rate": 8.20268020838455e-09, "loss": 0.1632, "step": 1934 }, { "epoch": 4.952015355086372, "grad_norm": 0.26643152935588643, "learning_rate": 7.209446582292501e-09, "loss": 0.1563, "step": 1935 }, { "epoch": 4.954574536148432, "grad_norm": 0.25910553240725315, "learning_rate": 6.2802776572779005e-09, "loss": 0.1467, "step": 1936 }, { "epoch": 4.957133717210493, "grad_norm": 0.25618053832139825, "learning_rate": 5.415176410765721e-09, "loss": 0.1586, "step": 1937 }, { "epoch": 4.959692898272553, "grad_norm": 0.2519718295297774, "learning_rate": 4.614145614876275e-09, "loss": 0.175, "step": 1938 }, { "epoch": 4.962252079334613, "grad_norm": 0.266276772335997, "learning_rate": 3.877187836422991e-09, "loss": 0.1893, "step": 1939 }, { "epoch": 4.964811260396673, "grad_norm": 0.27130663744108346, "learning_rate": 3.2043054369057523e-09, "loss": 0.1928, "step": 1940 }, { "epoch": 4.967370441458733, "grad_norm": 0.2716068333936989, "learning_rate": 2.5955005725064597e-09, "loss": 0.1714, "step": 1941 }, { "epoch": 4.969929622520794, "grad_norm": 0.26663386055671573, "learning_rate": 2.0507751940690434e-09, "loss": 0.1648, "step": 1942 }, { "epoch": 4.972488803582854, "grad_norm": 0.26494109005360517, "learning_rate": 1.5701310471083476e-09, "loss": 0.1591, "step": 1943 }, { "epoch": 4.975047984644913, "grad_norm": 0.2655087142061252, "learning_rate": 1.1535696717945855e-09, "loss": 0.1437, "step": 1944 }, { "epoch": 4.977607165706974, "grad_norm": 0.27477087152296875, "learning_rate": 8.010924029533406e-10, "loss": 0.1491, "step": 1945 }, { "epoch": 4.980166346769034, "grad_norm": 0.2590976709734058, "learning_rate": 5.127003700589051e-10, "loss": 0.1584, "step": 1946 }, { "epoch": 4.982725527831094, "grad_norm": 0.267447188689341, "learning_rate": 2.8839449723205847e-10, "loss": 0.1975, "step": 1947 }, { "epoch": 4.985284708893154, "grad_norm": 0.2598735449781636, "learning_rate": 1.2817550323784843e-10, "loss": 0.2008, "step": 1948 }, { "epoch": 4.987843889955214, "grad_norm": 0.26368976010696143, "learning_rate": 3.2043901478928666e-11, "loss": 0.1795, "step": 1949 }, { "epoch": 4.990403071017274, "grad_norm": 0.2645697161648628, "learning_rate": 0.0, "loss": 0.1707, "step": 1950 }, { "epoch": 4.990403071017274, "step": 1950, "total_flos": 3.115960359367213e+18, "train_loss": 0.34092234334120386, "train_runtime": 73926.9587, "train_samples_per_second": 3.382, "train_steps_per_second": 0.026 } ], "logging_steps": 1.0, "max_steps": 1950, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.115960359367213e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }