{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.16478536705940514, "eval_steps": 500, "global_step": 2000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 8.239268352970256e-05, "grad_norm": 370.92846474567483, "learning_rate": 0.0, "loss": 1.9502, "step": 1 }, { "epoch": 0.00016478536705940512, "grad_norm": 478.9745480395951, "learning_rate": 1.3717421124828532e-08, "loss": 1.7099, "step": 2 }, { "epoch": 0.0002471780505891077, "grad_norm": 816.010339911805, "learning_rate": 2.7434842249657065e-08, "loss": 1.9407, "step": 3 }, { "epoch": 0.00032957073411881023, "grad_norm": 472.8469994501748, "learning_rate": 4.1152263374485605e-08, "loss": 2.0365, "step": 4 }, { "epoch": 0.00041196341764851283, "grad_norm": 578.3746838755396, "learning_rate": 5.486968449931413e-08, "loss": 1.9464, "step": 5 }, { "epoch": 0.0004943561011782154, "grad_norm": 404.71316071877214, "learning_rate": 6.858710562414266e-08, "loss": 1.6153, "step": 6 }, { "epoch": 0.000576748784707918, "grad_norm": 336.6982918558558, "learning_rate": 8.230452674897121e-08, "loss": 1.5113, "step": 7 }, { "epoch": 0.0006591414682376205, "grad_norm": 341.52944273145397, "learning_rate": 9.602194787379974e-08, "loss": 1.8882, "step": 8 }, { "epoch": 0.0007415341517673231, "grad_norm": 391.4084771497659, "learning_rate": 1.0973936899862826e-07, "loss": 2.0696, "step": 9 }, { "epoch": 0.0008239268352970257, "grad_norm": 422.9036219570121, "learning_rate": 1.234567901234568e-07, "loss": 2.0605, "step": 10 }, { "epoch": 0.0009063195188267282, "grad_norm": 797.3739771528897, "learning_rate": 1.3717421124828532e-07, "loss": 2.4847, "step": 11 }, { "epoch": 0.0009887122023564308, "grad_norm": 375.3289213197193, "learning_rate": 1.5089163237311387e-07, "loss": 1.9973, "step": 12 }, { "epoch": 0.0010711048858861334, "grad_norm": 392.38417270652195, "learning_rate": 1.6460905349794242e-07, "loss": 1.8486, "step": 13 }, { "epoch": 0.001153497569415836, "grad_norm": 299.91546496064956, "learning_rate": 1.7832647462277092e-07, "loss": 2.0903, "step": 14 }, { "epoch": 0.0012358902529455383, "grad_norm": 323.60559376717487, "learning_rate": 1.9204389574759947e-07, "loss": 2.199, "step": 15 }, { "epoch": 0.001318282936475241, "grad_norm": 448.30929677447216, "learning_rate": 2.05761316872428e-07, "loss": 1.9866, "step": 16 }, { "epoch": 0.0014006756200049435, "grad_norm": 496.01105968376754, "learning_rate": 2.1947873799725652e-07, "loss": 1.7145, "step": 17 }, { "epoch": 0.0014830683035346461, "grad_norm": 254.87258601806334, "learning_rate": 2.3319615912208507e-07, "loss": 1.4035, "step": 18 }, { "epoch": 0.0015654609870643487, "grad_norm": 196.08575158272188, "learning_rate": 2.469135802469136e-07, "loss": 1.5703, "step": 19 }, { "epoch": 0.0016478536705940513, "grad_norm": 181.93540410857392, "learning_rate": 2.606310013717421e-07, "loss": 1.2346, "step": 20 }, { "epoch": 0.0017302463541237537, "grad_norm": 292.3097054026038, "learning_rate": 2.7434842249657064e-07, "loss": 1.6449, "step": 21 }, { "epoch": 0.0018126390376534563, "grad_norm": 189.7490456066442, "learning_rate": 2.880658436213992e-07, "loss": 1.3064, "step": 22 }, { "epoch": 0.001895031721183159, "grad_norm": 154.94758342589185, "learning_rate": 3.0178326474622774e-07, "loss": 0.8725, "step": 23 }, { "epoch": 0.0019774244047128615, "grad_norm": 175.30016139724518, "learning_rate": 3.1550068587105627e-07, "loss": 1.0209, "step": 24 }, { "epoch": 0.002059817088242564, "grad_norm": 138.98861988367486, "learning_rate": 3.2921810699588484e-07, "loss": 0.819, "step": 25 }, { "epoch": 0.0021422097717722667, "grad_norm": 171.4439871730333, "learning_rate": 3.4293552812071337e-07, "loss": 0.9231, "step": 26 }, { "epoch": 0.0022246024553019693, "grad_norm": 198.66126017668074, "learning_rate": 3.5665294924554184e-07, "loss": 1.1098, "step": 27 }, { "epoch": 0.002306995138831672, "grad_norm": 128.99561291431132, "learning_rate": 3.7037037037037036e-07, "loss": 0.6259, "step": 28 }, { "epoch": 0.0023893878223613745, "grad_norm": 139.7562027108659, "learning_rate": 3.8408779149519894e-07, "loss": 0.9026, "step": 29 }, { "epoch": 0.0024717805058910767, "grad_norm": 8493.779228229192, "learning_rate": 3.9780521262002746e-07, "loss": 7.6096, "step": 30 }, { "epoch": 0.0025541731894207793, "grad_norm": 96.52109090080035, "learning_rate": 4.11522633744856e-07, "loss": 0.9848, "step": 31 }, { "epoch": 0.002636565872950482, "grad_norm": 46.277401372799, "learning_rate": 4.252400548696845e-07, "loss": 0.7812, "step": 32 }, { "epoch": 0.0027189585564801845, "grad_norm": 90.33996022715957, "learning_rate": 4.3895747599451304e-07, "loss": 0.9017, "step": 33 }, { "epoch": 0.002801351240009887, "grad_norm": 34.651039695496756, "learning_rate": 4.526748971193416e-07, "loss": 0.5284, "step": 34 }, { "epoch": 0.0028837439235395897, "grad_norm": 57.438589709256526, "learning_rate": 4.6639231824417014e-07, "loss": 0.6959, "step": 35 }, { "epoch": 0.0029661366070692923, "grad_norm": 88.03798435445572, "learning_rate": 4.801097393689986e-07, "loss": 0.6558, "step": 36 }, { "epoch": 0.003048529290598995, "grad_norm": 168.93972734994549, "learning_rate": 4.938271604938272e-07, "loss": 1.0815, "step": 37 }, { "epoch": 0.0031309219741286975, "grad_norm": 46.38654936308973, "learning_rate": 5.075445816186558e-07, "loss": 0.6026, "step": 38 }, { "epoch": 0.0032133146576584, "grad_norm": 44.94814475982433, "learning_rate": 5.212620027434842e-07, "loss": 0.7385, "step": 39 }, { "epoch": 0.0032957073411881027, "grad_norm": 165.7034467276052, "learning_rate": 5.349794238683128e-07, "loss": 0.6596, "step": 40 }, { "epoch": 0.0033781000247178053, "grad_norm": 97.98006118196572, "learning_rate": 5.486968449931413e-07, "loss": 1.02, "step": 41 }, { "epoch": 0.0034604927082475074, "grad_norm": 69.93402365846087, "learning_rate": 5.624142661179699e-07, "loss": 0.674, "step": 42 }, { "epoch": 0.00354288539177721, "grad_norm": 1163.1480162660114, "learning_rate": 5.761316872427984e-07, "loss": 3.2917, "step": 43 }, { "epoch": 0.0036252780753069126, "grad_norm": 74.37426736775474, "learning_rate": 5.898491083676269e-07, "loss": 0.7122, "step": 44 }, { "epoch": 0.0037076707588366152, "grad_norm": 91.55829350532069, "learning_rate": 6.035665294924555e-07, "loss": 0.8284, "step": 45 }, { "epoch": 0.003790063442366318, "grad_norm": 46.12057854425198, "learning_rate": 6.17283950617284e-07, "loss": 0.8129, "step": 46 }, { "epoch": 0.0038724561258960204, "grad_norm": 65.2084871860374, "learning_rate": 6.310013717421125e-07, "loss": 0.7338, "step": 47 }, { "epoch": 0.003954848809425723, "grad_norm": 76.89157130197114, "learning_rate": 6.44718792866941e-07, "loss": 0.3108, "step": 48 }, { "epoch": 0.004037241492955425, "grad_norm": 73.53979766200762, "learning_rate": 6.584362139917697e-07, "loss": 0.508, "step": 49 }, { "epoch": 0.004119634176485128, "grad_norm": 94.68664560631554, "learning_rate": 6.721536351165982e-07, "loss": 0.9084, "step": 50 }, { "epoch": 0.00420202686001483, "grad_norm": 146.08418676025838, "learning_rate": 6.858710562414267e-07, "loss": 0.8076, "step": 51 }, { "epoch": 0.004284419543544533, "grad_norm": 98.79171062611543, "learning_rate": 6.995884773662552e-07, "loss": 0.5388, "step": 52 }, { "epoch": 0.004366812227074236, "grad_norm": 63.9354584989466, "learning_rate": 7.133058984910837e-07, "loss": 0.6614, "step": 53 }, { "epoch": 0.004449204910603939, "grad_norm": 28.560826747239517, "learning_rate": 7.270233196159123e-07, "loss": 0.221, "step": 54 }, { "epoch": 0.004531597594133641, "grad_norm": 119.28245305633594, "learning_rate": 7.407407407407407e-07, "loss": 0.7162, "step": 55 }, { "epoch": 0.004613990277663344, "grad_norm": 106.52974721356492, "learning_rate": 7.544581618655693e-07, "loss": 0.7543, "step": 56 }, { "epoch": 0.004696382961193046, "grad_norm": 216.2323272199254, "learning_rate": 7.681755829903979e-07, "loss": 0.612, "step": 57 }, { "epoch": 0.004778775644722749, "grad_norm": 81.76961781882962, "learning_rate": 7.818930041152265e-07, "loss": 0.9122, "step": 58 }, { "epoch": 0.004861168328252451, "grad_norm": 81.51218859422868, "learning_rate": 7.956104252400549e-07, "loss": 0.4165, "step": 59 }, { "epoch": 0.004943561011782153, "grad_norm": 45.85355666847451, "learning_rate": 8.093278463648835e-07, "loss": 0.3937, "step": 60 }, { "epoch": 0.005025953695311856, "grad_norm": 113.09646005752293, "learning_rate": 8.23045267489712e-07, "loss": 0.7733, "step": 61 }, { "epoch": 0.0051083463788415585, "grad_norm": 42.137339872436065, "learning_rate": 8.367626886145406e-07, "loss": 0.5397, "step": 62 }, { "epoch": 0.005190739062371262, "grad_norm": 99.72516559451445, "learning_rate": 8.50480109739369e-07, "loss": 0.7653, "step": 63 }, { "epoch": 0.005273131745900964, "grad_norm": 37.868334658532945, "learning_rate": 8.641975308641976e-07, "loss": 0.4084, "step": 64 }, { "epoch": 0.005355524429430667, "grad_norm": 253.35802935100432, "learning_rate": 8.779149519890261e-07, "loss": 0.6051, "step": 65 }, { "epoch": 0.005437917112960369, "grad_norm": 29.821485219757093, "learning_rate": 8.916323731138548e-07, "loss": 0.4946, "step": 66 }, { "epoch": 0.005520309796490072, "grad_norm": 26.807855345732474, "learning_rate": 9.053497942386832e-07, "loss": 0.3619, "step": 67 }, { "epoch": 0.005602702480019774, "grad_norm": 45.035127254975706, "learning_rate": 9.190672153635118e-07, "loss": 0.5831, "step": 68 }, { "epoch": 0.005685095163549477, "grad_norm": 44.226686731133306, "learning_rate": 9.327846364883403e-07, "loss": 0.6431, "step": 69 }, { "epoch": 0.005767487847079179, "grad_norm": 60.33898555308833, "learning_rate": 9.465020576131687e-07, "loss": 0.4899, "step": 70 }, { "epoch": 0.005849880530608882, "grad_norm": 35.25027498428163, "learning_rate": 9.602194787379972e-07, "loss": 0.3887, "step": 71 }, { "epoch": 0.0059322732141385845, "grad_norm": 73.25174042894214, "learning_rate": 9.73936899862826e-07, "loss": 0.6252, "step": 72 }, { "epoch": 0.006014665897668287, "grad_norm": 52.590662239348354, "learning_rate": 9.876543209876544e-07, "loss": 0.3872, "step": 73 }, { "epoch": 0.00609705858119799, "grad_norm": 29.726473681600194, "learning_rate": 1.001371742112483e-06, "loss": 0.4349, "step": 74 }, { "epoch": 0.006179451264727692, "grad_norm": 26.072142658169017, "learning_rate": 1.0150891632373115e-06, "loss": 0.6492, "step": 75 }, { "epoch": 0.006261843948257395, "grad_norm": 38.68492891617437, "learning_rate": 1.02880658436214e-06, "loss": 0.8022, "step": 76 }, { "epoch": 0.006344236631787097, "grad_norm": 120.40708886210712, "learning_rate": 1.0425240054869685e-06, "loss": 0.5611, "step": 77 }, { "epoch": 0.0064266293153168, "grad_norm": 20.221000748278993, "learning_rate": 1.0562414266117972e-06, "loss": 0.2969, "step": 78 }, { "epoch": 0.006509021998846502, "grad_norm": 137.6805973085389, "learning_rate": 1.0699588477366256e-06, "loss": 0.648, "step": 79 }, { "epoch": 0.006591414682376205, "grad_norm": 40.1096379523084, "learning_rate": 1.083676268861454e-06, "loss": 0.8103, "step": 80 }, { "epoch": 0.0066738073659059075, "grad_norm": 32.096473511201374, "learning_rate": 1.0973936899862826e-06, "loss": 0.6186, "step": 81 }, { "epoch": 0.0067562000494356105, "grad_norm": 24.075343816904766, "learning_rate": 1.111111111111111e-06, "loss": 0.3877, "step": 82 }, { "epoch": 0.006838592732965313, "grad_norm": 48.42109801664082, "learning_rate": 1.1248285322359397e-06, "loss": 0.447, "step": 83 }, { "epoch": 0.006920985416495015, "grad_norm": 42.65831233770232, "learning_rate": 1.1385459533607684e-06, "loss": 0.7162, "step": 84 }, { "epoch": 0.007003378100024718, "grad_norm": 71.20273416415172, "learning_rate": 1.1522633744855969e-06, "loss": 0.6573, "step": 85 }, { "epoch": 0.00708577078355442, "grad_norm": 70.73981135151499, "learning_rate": 1.1659807956104253e-06, "loss": 0.3774, "step": 86 }, { "epoch": 0.007168163467084123, "grad_norm": 17.02358862648308, "learning_rate": 1.1796982167352538e-06, "loss": 0.4372, "step": 87 }, { "epoch": 0.007250556150613825, "grad_norm": 38.56110621340388, "learning_rate": 1.1934156378600823e-06, "loss": 0.5007, "step": 88 }, { "epoch": 0.007332948834143528, "grad_norm": 21.689880371993823, "learning_rate": 1.207133058984911e-06, "loss": 0.4363, "step": 89 }, { "epoch": 0.0074153415176732304, "grad_norm": 53.876169409804625, "learning_rate": 1.2208504801097394e-06, "loss": 0.4091, "step": 90 }, { "epoch": 0.0074977342012029335, "grad_norm": 48.147837297588566, "learning_rate": 1.234567901234568e-06, "loss": 0.6979, "step": 91 }, { "epoch": 0.007580126884732636, "grad_norm": 20.467449188390766, "learning_rate": 1.2482853223593966e-06, "loss": 0.4081, "step": 92 }, { "epoch": 0.007662519568262339, "grad_norm": 23.825819702066855, "learning_rate": 1.262002743484225e-06, "loss": 0.5095, "step": 93 }, { "epoch": 0.007744912251792041, "grad_norm": 49.54875914048349, "learning_rate": 1.2757201646090535e-06, "loss": 0.8153, "step": 94 }, { "epoch": 0.007827304935321744, "grad_norm": 36.71859670716872, "learning_rate": 1.289437585733882e-06, "loss": 0.4975, "step": 95 }, { "epoch": 0.007909697618851446, "grad_norm": 52.89761869922755, "learning_rate": 1.3031550068587107e-06, "loss": 0.6777, "step": 96 }, { "epoch": 0.007992090302381148, "grad_norm": 262.046184232095, "learning_rate": 1.3168724279835394e-06, "loss": 0.5125, "step": 97 }, { "epoch": 0.00807448298591085, "grad_norm": 23.8518705316023, "learning_rate": 1.3305898491083676e-06, "loss": 0.5802, "step": 98 }, { "epoch": 0.008156875669440554, "grad_norm": 24.43774608417277, "learning_rate": 1.3443072702331963e-06, "loss": 0.4466, "step": 99 }, { "epoch": 0.008239268352970256, "grad_norm": 27.243336976835526, "learning_rate": 1.3580246913580248e-06, "loss": 0.606, "step": 100 }, { "epoch": 0.008321661036499959, "grad_norm": 18.838152665368614, "learning_rate": 1.3717421124828535e-06, "loss": 0.4605, "step": 101 }, { "epoch": 0.00840405372002966, "grad_norm": 26.949888572345216, "learning_rate": 1.3854595336076817e-06, "loss": 0.4532, "step": 102 }, { "epoch": 0.008486446403559365, "grad_norm": 21.572594872057856, "learning_rate": 1.3991769547325104e-06, "loss": 0.4991, "step": 103 }, { "epoch": 0.008568839087089067, "grad_norm": 28.33027763947139, "learning_rate": 1.412894375857339e-06, "loss": 0.5669, "step": 104 }, { "epoch": 0.008651231770618769, "grad_norm": 31.09867407487906, "learning_rate": 1.4266117969821674e-06, "loss": 0.5158, "step": 105 }, { "epoch": 0.008733624454148471, "grad_norm": 47.502117851757255, "learning_rate": 1.440329218106996e-06, "loss": 0.6078, "step": 106 }, { "epoch": 0.008816017137678173, "grad_norm": 30.0902294117928, "learning_rate": 1.4540466392318245e-06, "loss": 0.6867, "step": 107 }, { "epoch": 0.008898409821207877, "grad_norm": 16.049485540251304, "learning_rate": 1.4677640603566532e-06, "loss": 0.5267, "step": 108 }, { "epoch": 0.00898080250473758, "grad_norm": 30.186256751846674, "learning_rate": 1.4814814814814815e-06, "loss": 0.6437, "step": 109 }, { "epoch": 0.009063195188267282, "grad_norm": 23.921754142654017, "learning_rate": 1.4951989026063101e-06, "loss": 0.7187, "step": 110 }, { "epoch": 0.009145587871796984, "grad_norm": 40.13689702977842, "learning_rate": 1.5089163237311386e-06, "loss": 0.4591, "step": 111 }, { "epoch": 0.009227980555326688, "grad_norm": 21.792212279571824, "learning_rate": 1.5226337448559673e-06, "loss": 0.4377, "step": 112 }, { "epoch": 0.00931037323885639, "grad_norm": 12.609083806149128, "learning_rate": 1.5363511659807958e-06, "loss": 0.4975, "step": 113 }, { "epoch": 0.009392765922386092, "grad_norm": 19.801853097766696, "learning_rate": 1.5500685871056242e-06, "loss": 0.4519, "step": 114 }, { "epoch": 0.009475158605915794, "grad_norm": 44.527628785852514, "learning_rate": 1.563786008230453e-06, "loss": 0.5393, "step": 115 }, { "epoch": 0.009557551289445498, "grad_norm": 17.968320630306675, "learning_rate": 1.5775034293552812e-06, "loss": 0.6014, "step": 116 }, { "epoch": 0.0096399439729752, "grad_norm": 23.423995548663576, "learning_rate": 1.5912208504801099e-06, "loss": 0.4331, "step": 117 }, { "epoch": 0.009722336656504902, "grad_norm": 18.98686296805731, "learning_rate": 1.6049382716049383e-06, "loss": 0.5621, "step": 118 }, { "epoch": 0.009804729340034605, "grad_norm": 13.635326129289362, "learning_rate": 1.618655692729767e-06, "loss": 0.2893, "step": 119 }, { "epoch": 0.009887122023564307, "grad_norm": 29.502202435441244, "learning_rate": 1.6323731138545953e-06, "loss": 0.5988, "step": 120 }, { "epoch": 0.00996951470709401, "grad_norm": 26.759044629536252, "learning_rate": 1.646090534979424e-06, "loss": 0.6966, "step": 121 }, { "epoch": 0.010051907390623713, "grad_norm": 16.944673727591262, "learning_rate": 1.6598079561042526e-06, "loss": 0.6288, "step": 122 }, { "epoch": 0.010134300074153415, "grad_norm": 22.18252955446083, "learning_rate": 1.6735253772290811e-06, "loss": 0.6527, "step": 123 }, { "epoch": 0.010216692757683117, "grad_norm": 14.663608441939818, "learning_rate": 1.6872427983539098e-06, "loss": 0.4992, "step": 124 }, { "epoch": 0.010299085441212821, "grad_norm": 27.846664256554586, "learning_rate": 1.700960219478738e-06, "loss": 0.6578, "step": 125 }, { "epoch": 0.010381478124742523, "grad_norm": 48.120411539456136, "learning_rate": 1.7146776406035667e-06, "loss": 0.7731, "step": 126 }, { "epoch": 0.010463870808272225, "grad_norm": 29.505384191045792, "learning_rate": 1.7283950617283952e-06, "loss": 0.4631, "step": 127 }, { "epoch": 0.010546263491801927, "grad_norm": 27.267562026668486, "learning_rate": 1.7421124828532237e-06, "loss": 0.7196, "step": 128 }, { "epoch": 0.010628656175331631, "grad_norm": 16.00289092345597, "learning_rate": 1.7558299039780521e-06, "loss": 0.5238, "step": 129 }, { "epoch": 0.010711048858861334, "grad_norm": 20.034041777867913, "learning_rate": 1.7695473251028808e-06, "loss": 0.48, "step": 130 }, { "epoch": 0.010793441542391036, "grad_norm": 16.125317675567455, "learning_rate": 1.7832647462277095e-06, "loss": 0.6135, "step": 131 }, { "epoch": 0.010875834225920738, "grad_norm": 198.72635885269693, "learning_rate": 1.7969821673525378e-06, "loss": 1.8359, "step": 132 }, { "epoch": 0.01095822690945044, "grad_norm": 11.49829810544229, "learning_rate": 1.8106995884773665e-06, "loss": 0.4496, "step": 133 }, { "epoch": 0.011040619592980144, "grad_norm": 54.65603884199396, "learning_rate": 1.824417009602195e-06, "loss": 0.6657, "step": 134 }, { "epoch": 0.011123012276509846, "grad_norm": 23.069821716903398, "learning_rate": 1.8381344307270236e-06, "loss": 0.5426, "step": 135 }, { "epoch": 0.011205404960039548, "grad_norm": 13.204812144916009, "learning_rate": 1.8518518518518519e-06, "loss": 0.612, "step": 136 }, { "epoch": 0.01128779764356925, "grad_norm": 13.956836795334933, "learning_rate": 1.8655692729766806e-06, "loss": 0.532, "step": 137 }, { "epoch": 0.011370190327098954, "grad_norm": 42.68872796386726, "learning_rate": 1.879286694101509e-06, "loss": 0.7006, "step": 138 }, { "epoch": 0.011452583010628656, "grad_norm": 16.612308273214413, "learning_rate": 1.8930041152263375e-06, "loss": 0.5123, "step": 139 }, { "epoch": 0.011534975694158359, "grad_norm": 18.144654907032912, "learning_rate": 1.9067215363511662e-06, "loss": 0.3141, "step": 140 }, { "epoch": 0.01161736837768806, "grad_norm": 16.814344046499077, "learning_rate": 1.9204389574759944e-06, "loss": 0.6542, "step": 141 }, { "epoch": 0.011699761061217765, "grad_norm": 21.160095993478766, "learning_rate": 1.9341563786008233e-06, "loss": 0.5819, "step": 142 }, { "epoch": 0.011782153744747467, "grad_norm": 21.297656919271585, "learning_rate": 1.947873799725652e-06, "loss": 0.5107, "step": 143 }, { "epoch": 0.011864546428277169, "grad_norm": 15.566851005374614, "learning_rate": 1.9615912208504803e-06, "loss": 0.6187, "step": 144 }, { "epoch": 0.011946939111806871, "grad_norm": 16.02129799647006, "learning_rate": 1.9753086419753087e-06, "loss": 0.4715, "step": 145 }, { "epoch": 0.012029331795336573, "grad_norm": 11.717994264174337, "learning_rate": 1.9890260631001372e-06, "loss": 0.4021, "step": 146 }, { "epoch": 0.012111724478866277, "grad_norm": 21.22813881358679, "learning_rate": 2.002743484224966e-06, "loss": 0.2599, "step": 147 }, { "epoch": 0.01219411716239598, "grad_norm": 15.200100122381537, "learning_rate": 2.0164609053497946e-06, "loss": 0.45, "step": 148 }, { "epoch": 0.012276509845925682, "grad_norm": 24.89750355075059, "learning_rate": 2.030178326474623e-06, "loss": 0.772, "step": 149 }, { "epoch": 0.012358902529455384, "grad_norm": 16.304820858469412, "learning_rate": 2.0438957475994515e-06, "loss": 0.446, "step": 150 }, { "epoch": 0.012441295212985088, "grad_norm": 20.608928374910505, "learning_rate": 2.05761316872428e-06, "loss": 0.5507, "step": 151 }, { "epoch": 0.01252368789651479, "grad_norm": 10.483108607114513, "learning_rate": 2.0713305898491085e-06, "loss": 0.4834, "step": 152 }, { "epoch": 0.012606080580044492, "grad_norm": 12.697803561984879, "learning_rate": 2.085048010973937e-06, "loss": 0.5253, "step": 153 }, { "epoch": 0.012688473263574194, "grad_norm": 24.461540625272452, "learning_rate": 2.0987654320987654e-06, "loss": 0.6982, "step": 154 }, { "epoch": 0.012770865947103896, "grad_norm": 17.323695037238057, "learning_rate": 2.1124828532235943e-06, "loss": 0.6608, "step": 155 }, { "epoch": 0.0128532586306336, "grad_norm": 19.1728467069908, "learning_rate": 2.1262002743484228e-06, "loss": 0.6158, "step": 156 }, { "epoch": 0.012935651314163302, "grad_norm": 14.335840726971144, "learning_rate": 2.1399176954732512e-06, "loss": 0.6844, "step": 157 }, { "epoch": 0.013018043997693005, "grad_norm": 20.095242492343232, "learning_rate": 2.1536351165980797e-06, "loss": 0.497, "step": 158 }, { "epoch": 0.013100436681222707, "grad_norm": 10.114501664370549, "learning_rate": 2.167352537722908e-06, "loss": 0.5262, "step": 159 }, { "epoch": 0.01318282936475241, "grad_norm": 13.305214604549445, "learning_rate": 2.1810699588477367e-06, "loss": 0.5619, "step": 160 }, { "epoch": 0.013265222048282113, "grad_norm": 19.721782800895156, "learning_rate": 2.194787379972565e-06, "loss": 0.5357, "step": 161 }, { "epoch": 0.013347614731811815, "grad_norm": 19.7228102937409, "learning_rate": 2.208504801097394e-06, "loss": 0.4225, "step": 162 }, { "epoch": 0.013430007415341517, "grad_norm": 240.83778830697852, "learning_rate": 2.222222222222222e-06, "loss": 2.4384, "step": 163 }, { "epoch": 0.013512400098871221, "grad_norm": 11.380285250812992, "learning_rate": 2.235939643347051e-06, "loss": 0.6533, "step": 164 }, { "epoch": 0.013594792782400923, "grad_norm": 9.94152540099469, "learning_rate": 2.2496570644718794e-06, "loss": 0.5497, "step": 165 }, { "epoch": 0.013677185465930625, "grad_norm": 12.090836450223387, "learning_rate": 2.263374485596708e-06, "loss": 0.4756, "step": 166 }, { "epoch": 0.013759578149460328, "grad_norm": 14.813308219199923, "learning_rate": 2.277091906721537e-06, "loss": 0.5355, "step": 167 }, { "epoch": 0.01384197083299003, "grad_norm": 13.192872206591804, "learning_rate": 2.290809327846365e-06, "loss": 0.5875, "step": 168 }, { "epoch": 0.013924363516519734, "grad_norm": 16.210695640291387, "learning_rate": 2.3045267489711937e-06, "loss": 0.5253, "step": 169 }, { "epoch": 0.014006756200049436, "grad_norm": 12.039792190252744, "learning_rate": 2.3182441700960222e-06, "loss": 0.4517, "step": 170 }, { "epoch": 0.014089148883579138, "grad_norm": 23.04062666474093, "learning_rate": 2.3319615912208507e-06, "loss": 0.4083, "step": 171 }, { "epoch": 0.01417154156710884, "grad_norm": 19.979153089914988, "learning_rate": 2.345679012345679e-06, "loss": 0.6887, "step": 172 }, { "epoch": 0.014253934250638544, "grad_norm": 21.895537557735427, "learning_rate": 2.3593964334705076e-06, "loss": 0.8094, "step": 173 }, { "epoch": 0.014336326934168246, "grad_norm": 31.47401830070671, "learning_rate": 2.3731138545953365e-06, "loss": 0.7431, "step": 174 }, { "epoch": 0.014418719617697948, "grad_norm": 12.750465460746202, "learning_rate": 2.3868312757201646e-06, "loss": 0.6583, "step": 175 }, { "epoch": 0.01450111230122765, "grad_norm": 13.307184351874149, "learning_rate": 2.4005486968449935e-06, "loss": 0.6077, "step": 176 }, { "epoch": 0.014583504984757354, "grad_norm": 10.435374769314452, "learning_rate": 2.414266117969822e-06, "loss": 0.5739, "step": 177 }, { "epoch": 0.014665897668287057, "grad_norm": 15.566819292000186, "learning_rate": 2.4279835390946504e-06, "loss": 0.644, "step": 178 }, { "epoch": 0.014748290351816759, "grad_norm": 12.814513858300232, "learning_rate": 2.441700960219479e-06, "loss": 0.5858, "step": 179 }, { "epoch": 0.014830683035346461, "grad_norm": 12.12622273494356, "learning_rate": 2.4554183813443074e-06, "loss": 0.5202, "step": 180 }, { "epoch": 0.014913075718876163, "grad_norm": 16.96998648395035, "learning_rate": 2.469135802469136e-06, "loss": 0.3457, "step": 181 }, { "epoch": 0.014995468402405867, "grad_norm": 13.91986946254961, "learning_rate": 2.4828532235939647e-06, "loss": 0.5681, "step": 182 }, { "epoch": 0.015077861085935569, "grad_norm": 12.486810040618805, "learning_rate": 2.496570644718793e-06, "loss": 0.5125, "step": 183 }, { "epoch": 0.015160253769465271, "grad_norm": 10.303008103171251, "learning_rate": 2.5102880658436217e-06, "loss": 0.6385, "step": 184 }, { "epoch": 0.015242646452994973, "grad_norm": 13.183010022460554, "learning_rate": 2.52400548696845e-06, "loss": 0.3552, "step": 185 }, { "epoch": 0.015325039136524677, "grad_norm": 10.107898578134508, "learning_rate": 2.5377229080932786e-06, "loss": 0.4341, "step": 186 }, { "epoch": 0.01540743182005438, "grad_norm": 8.570843302612268, "learning_rate": 2.551440329218107e-06, "loss": 0.4343, "step": 187 }, { "epoch": 0.015489824503584082, "grad_norm": 17.3196847847868, "learning_rate": 2.565157750342936e-06, "loss": 0.6971, "step": 188 }, { "epoch": 0.015572217187113784, "grad_norm": 11.86766768913693, "learning_rate": 2.578875171467764e-06, "loss": 0.5436, "step": 189 }, { "epoch": 0.015654609870643488, "grad_norm": 10.49550664029216, "learning_rate": 2.5925925925925925e-06, "loss": 0.342, "step": 190 }, { "epoch": 0.01573700255417319, "grad_norm": 9.038437970250417, "learning_rate": 2.6063100137174214e-06, "loss": 0.3151, "step": 191 }, { "epoch": 0.015819395237702892, "grad_norm": 15.678199292955869, "learning_rate": 2.62002743484225e-06, "loss": 0.7268, "step": 192 }, { "epoch": 0.015901787921232594, "grad_norm": 13.800404804526247, "learning_rate": 2.6337448559670788e-06, "loss": 0.5118, "step": 193 }, { "epoch": 0.015984180604762296, "grad_norm": 74.10559559217063, "learning_rate": 2.647462277091907e-06, "loss": 0.7444, "step": 194 }, { "epoch": 0.016066573288292, "grad_norm": 12.20315952893777, "learning_rate": 2.6611796982167353e-06, "loss": 0.4277, "step": 195 }, { "epoch": 0.0161489659718217, "grad_norm": 10.05719320789487, "learning_rate": 2.674897119341564e-06, "loss": 0.4664, "step": 196 }, { "epoch": 0.016231358655351406, "grad_norm": 42.082856786319546, "learning_rate": 2.6886145404663926e-06, "loss": 0.3969, "step": 197 }, { "epoch": 0.01631375133888111, "grad_norm": 15.787631693690875, "learning_rate": 2.7023319615912207e-06, "loss": 0.7307, "step": 198 }, { "epoch": 0.01639614402241081, "grad_norm": 8.901740684680457, "learning_rate": 2.7160493827160496e-06, "loss": 0.5109, "step": 199 }, { "epoch": 0.016478536705940513, "grad_norm": 28.934834071007202, "learning_rate": 2.729766803840878e-06, "loss": 0.4942, "step": 200 }, { "epoch": 0.016560929389470215, "grad_norm": 18.793354020178867, "learning_rate": 2.743484224965707e-06, "loss": 0.5592, "step": 201 }, { "epoch": 0.016643322072999917, "grad_norm": 13.60338783501572, "learning_rate": 2.7572016460905354e-06, "loss": 0.6025, "step": 202 }, { "epoch": 0.01672571475652962, "grad_norm": 8.038968073425716, "learning_rate": 2.7709190672153635e-06, "loss": 0.5211, "step": 203 }, { "epoch": 0.01680810744005932, "grad_norm": 11.559001618222288, "learning_rate": 2.7846364883401924e-06, "loss": 0.5185, "step": 204 }, { "epoch": 0.016890500123589024, "grad_norm": 10.70606495183075, "learning_rate": 2.798353909465021e-06, "loss": 0.5378, "step": 205 }, { "epoch": 0.01697289280711873, "grad_norm": 15.724659491801045, "learning_rate": 2.8120713305898493e-06, "loss": 0.3996, "step": 206 }, { "epoch": 0.01705528549064843, "grad_norm": 15.632077558092512, "learning_rate": 2.825788751714678e-06, "loss": 0.5294, "step": 207 }, { "epoch": 0.017137678174178134, "grad_norm": 15.35567010238041, "learning_rate": 2.8395061728395062e-06, "loss": 0.5789, "step": 208 }, { "epoch": 0.017220070857707836, "grad_norm": 12.247079248152177, "learning_rate": 2.8532235939643347e-06, "loss": 0.4783, "step": 209 }, { "epoch": 0.017302463541237538, "grad_norm": 13.787412538148317, "learning_rate": 2.8669410150891636e-06, "loss": 0.6358, "step": 210 }, { "epoch": 0.01738485622476724, "grad_norm": 10.388866874954653, "learning_rate": 2.880658436213992e-06, "loss": 0.4617, "step": 211 }, { "epoch": 0.017467248908296942, "grad_norm": 10.149440548768066, "learning_rate": 2.89437585733882e-06, "loss": 0.372, "step": 212 }, { "epoch": 0.017549641591826644, "grad_norm": 12.782054026030952, "learning_rate": 2.908093278463649e-06, "loss": 0.5502, "step": 213 }, { "epoch": 0.017632034275356347, "grad_norm": 8.980692409189274, "learning_rate": 2.9218106995884775e-06, "loss": 0.5311, "step": 214 }, { "epoch": 0.017714426958886052, "grad_norm": 12.126638458623237, "learning_rate": 2.9355281207133064e-06, "loss": 0.6599, "step": 215 }, { "epoch": 0.017796819642415754, "grad_norm": 10.503433095750024, "learning_rate": 2.949245541838135e-06, "loss": 0.4327, "step": 216 }, { "epoch": 0.017879212325945457, "grad_norm": 12.219841823090144, "learning_rate": 2.962962962962963e-06, "loss": 0.7044, "step": 217 }, { "epoch": 0.01796160500947516, "grad_norm": 18.87320464359166, "learning_rate": 2.976680384087792e-06, "loss": 0.7467, "step": 218 }, { "epoch": 0.01804399769300486, "grad_norm": 289.359254982659, "learning_rate": 2.9903978052126203e-06, "loss": 3.0217, "step": 219 }, { "epoch": 0.018126390376534563, "grad_norm": 7.733672679042532, "learning_rate": 3.004115226337449e-06, "loss": 0.4293, "step": 220 }, { "epoch": 0.018208783060064265, "grad_norm": 16.61269730251294, "learning_rate": 3.0178326474622772e-06, "loss": 0.6614, "step": 221 }, { "epoch": 0.018291175743593967, "grad_norm": 8.31112554516155, "learning_rate": 3.0315500685871057e-06, "loss": 0.5271, "step": 222 }, { "epoch": 0.018373568427123673, "grad_norm": 13.664445288630535, "learning_rate": 3.0452674897119346e-06, "loss": 0.7018, "step": 223 }, { "epoch": 0.018455961110653375, "grad_norm": 10.005927544238816, "learning_rate": 3.058984910836763e-06, "loss": 0.5524, "step": 224 }, { "epoch": 0.018538353794183077, "grad_norm": 15.446861208215383, "learning_rate": 3.0727023319615915e-06, "loss": 0.6341, "step": 225 }, { "epoch": 0.01862074647771278, "grad_norm": 16.079846485759564, "learning_rate": 3.08641975308642e-06, "loss": 0.8794, "step": 226 }, { "epoch": 0.018703139161242482, "grad_norm": 10.175892407022696, "learning_rate": 3.1001371742112485e-06, "loss": 0.6755, "step": 227 }, { "epoch": 0.018785531844772184, "grad_norm": 56.127455072454026, "learning_rate": 3.113854595336077e-06, "loss": 0.3321, "step": 228 }, { "epoch": 0.018867924528301886, "grad_norm": 12.930244631445957, "learning_rate": 3.127572016460906e-06, "loss": 0.4804, "step": 229 }, { "epoch": 0.018950317211831588, "grad_norm": 12.529583269551953, "learning_rate": 3.141289437585734e-06, "loss": 0.5312, "step": 230 }, { "epoch": 0.01903270989536129, "grad_norm": 9.78335819090374, "learning_rate": 3.1550068587105624e-06, "loss": 0.6044, "step": 231 }, { "epoch": 0.019115102578890996, "grad_norm": 9.981952585751747, "learning_rate": 3.1687242798353912e-06, "loss": 0.7049, "step": 232 }, { "epoch": 0.019197495262420698, "grad_norm": 17.065859536580135, "learning_rate": 3.1824417009602197e-06, "loss": 0.6509, "step": 233 }, { "epoch": 0.0192798879459504, "grad_norm": 10.93465261939953, "learning_rate": 3.1961591220850486e-06, "loss": 0.7744, "step": 234 }, { "epoch": 0.019362280629480103, "grad_norm": 9.705094089624701, "learning_rate": 3.2098765432098767e-06, "loss": 0.6289, "step": 235 }, { "epoch": 0.019444673313009805, "grad_norm": 11.03377155515954, "learning_rate": 3.223593964334705e-06, "loss": 0.4788, "step": 236 }, { "epoch": 0.019527065996539507, "grad_norm": 9.129123781076657, "learning_rate": 3.237311385459534e-06, "loss": 0.5489, "step": 237 }, { "epoch": 0.01960945868006921, "grad_norm": 8.697937237915472, "learning_rate": 3.2510288065843625e-06, "loss": 0.5861, "step": 238 }, { "epoch": 0.01969185136359891, "grad_norm": 8.870677568511018, "learning_rate": 3.2647462277091905e-06, "loss": 0.5946, "step": 239 }, { "epoch": 0.019774244047128613, "grad_norm": 23.189212971761012, "learning_rate": 3.2784636488340194e-06, "loss": 0.4848, "step": 240 }, { "epoch": 0.01985663673065832, "grad_norm": 8.803471237780029, "learning_rate": 3.292181069958848e-06, "loss": 0.3981, "step": 241 }, { "epoch": 0.01993902941418802, "grad_norm": 9.276823579497725, "learning_rate": 3.305898491083677e-06, "loss": 0.3854, "step": 242 }, { "epoch": 0.020021422097717723, "grad_norm": 15.048560056515383, "learning_rate": 3.3196159122085053e-06, "loss": 0.737, "step": 243 }, { "epoch": 0.020103814781247425, "grad_norm": 8.848106416589038, "learning_rate": 3.3333333333333333e-06, "loss": 0.2963, "step": 244 }, { "epoch": 0.020186207464777128, "grad_norm": 7.313536463258056, "learning_rate": 3.3470507544581622e-06, "loss": 0.2768, "step": 245 }, { "epoch": 0.02026860014830683, "grad_norm": 12.219856537027805, "learning_rate": 3.3607681755829907e-06, "loss": 0.4306, "step": 246 }, { "epoch": 0.020350992831836532, "grad_norm": 10.00453954068384, "learning_rate": 3.3744855967078196e-06, "loss": 0.2573, "step": 247 }, { "epoch": 0.020433385515366234, "grad_norm": 16.88134336345359, "learning_rate": 3.3882030178326476e-06, "loss": 0.2629, "step": 248 }, { "epoch": 0.020515778198895936, "grad_norm": 12.88958428233626, "learning_rate": 3.401920438957476e-06, "loss": 0.4504, "step": 249 }, { "epoch": 0.020598170882425642, "grad_norm": 19.916078497234878, "learning_rate": 3.415637860082305e-06, "loss": 0.7189, "step": 250 }, { "epoch": 0.020680563565955344, "grad_norm": 12.379454068967135, "learning_rate": 3.4293552812071335e-06, "loss": 0.5047, "step": 251 }, { "epoch": 0.020762956249485046, "grad_norm": 7.363299478552216, "learning_rate": 3.443072702331962e-06, "loss": 0.308, "step": 252 }, { "epoch": 0.02084534893301475, "grad_norm": 14.9221134616295, "learning_rate": 3.4567901234567904e-06, "loss": 0.5358, "step": 253 }, { "epoch": 0.02092774161654445, "grad_norm": 75.3026629510051, "learning_rate": 3.470507544581619e-06, "loss": 1.3456, "step": 254 }, { "epoch": 0.021010134300074153, "grad_norm": 13.144224277254176, "learning_rate": 3.4842249657064474e-06, "loss": 0.7256, "step": 255 }, { "epoch": 0.021092526983603855, "grad_norm": 8.29902926659797, "learning_rate": 3.4979423868312762e-06, "loss": 0.479, "step": 256 }, { "epoch": 0.021174919667133557, "grad_norm": 36.142615394611894, "learning_rate": 3.5116598079561043e-06, "loss": 0.5905, "step": 257 }, { "epoch": 0.021257312350663263, "grad_norm": 18.894771750217856, "learning_rate": 3.5253772290809328e-06, "loss": 0.5372, "step": 258 }, { "epoch": 0.021339705034192965, "grad_norm": 8.876306670885448, "learning_rate": 3.5390946502057617e-06, "loss": 0.507, "step": 259 }, { "epoch": 0.021422097717722667, "grad_norm": 12.322148718207554, "learning_rate": 3.55281207133059e-06, "loss": 0.5313, "step": 260 }, { "epoch": 0.02150449040125237, "grad_norm": 9.429328982690008, "learning_rate": 3.566529492455419e-06, "loss": 0.6338, "step": 261 }, { "epoch": 0.02158688308478207, "grad_norm": 8.439579932933407, "learning_rate": 3.580246913580247e-06, "loss": 0.64, "step": 262 }, { "epoch": 0.021669275768311774, "grad_norm": 7.027927606341166, "learning_rate": 3.5939643347050755e-06, "loss": 0.694, "step": 263 }, { "epoch": 0.021751668451841476, "grad_norm": 18.461422860766792, "learning_rate": 3.6076817558299044e-06, "loss": 0.8644, "step": 264 }, { "epoch": 0.021834061135371178, "grad_norm": 7.312928184572379, "learning_rate": 3.621399176954733e-06, "loss": 0.5441, "step": 265 }, { "epoch": 0.02191645381890088, "grad_norm": 10.896094086625412, "learning_rate": 3.635116598079561e-06, "loss": 0.3631, "step": 266 }, { "epoch": 0.021998846502430586, "grad_norm": 12.46612913960122, "learning_rate": 3.64883401920439e-06, "loss": 0.6946, "step": 267 }, { "epoch": 0.022081239185960288, "grad_norm": 8.87385816825834, "learning_rate": 3.6625514403292183e-06, "loss": 0.6441, "step": 268 }, { "epoch": 0.02216363186948999, "grad_norm": 7.66230536481842, "learning_rate": 3.6762688614540472e-06, "loss": 0.4107, "step": 269 }, { "epoch": 0.022246024553019692, "grad_norm": 20.729872546438557, "learning_rate": 3.6899862825788757e-06, "loss": 0.575, "step": 270 }, { "epoch": 0.022328417236549394, "grad_norm": 8.579634165930928, "learning_rate": 3.7037037037037037e-06, "loss": 0.2851, "step": 271 }, { "epoch": 0.022410809920079097, "grad_norm": 8.32603360510797, "learning_rate": 3.7174211248285326e-06, "loss": 0.549, "step": 272 }, { "epoch": 0.0224932026036088, "grad_norm": 7.119710330257647, "learning_rate": 3.731138545953361e-06, "loss": 0.6911, "step": 273 }, { "epoch": 0.0225755952871385, "grad_norm": 12.334011335320461, "learning_rate": 3.7448559670781896e-06, "loss": 0.2933, "step": 274 }, { "epoch": 0.022657987970668203, "grad_norm": 8.819773878214544, "learning_rate": 3.758573388203018e-06, "loss": 0.5093, "step": 275 }, { "epoch": 0.02274038065419791, "grad_norm": 7.853520271881538, "learning_rate": 3.7722908093278465e-06, "loss": 0.4793, "step": 276 }, { "epoch": 0.02282277333772761, "grad_norm": 12.938012573178968, "learning_rate": 3.786008230452675e-06, "loss": 0.7915, "step": 277 }, { "epoch": 0.022905166021257313, "grad_norm": 7.351505350233861, "learning_rate": 3.799725651577504e-06, "loss": 0.4733, "step": 278 }, { "epoch": 0.022987558704787015, "grad_norm": 7.3673534060195855, "learning_rate": 3.8134430727023324e-06, "loss": 0.41, "step": 279 }, { "epoch": 0.023069951388316717, "grad_norm": 9.289270184893226, "learning_rate": 3.827160493827161e-06, "loss": 0.4778, "step": 280 }, { "epoch": 0.02315234407184642, "grad_norm": 6.28612952057349, "learning_rate": 3.840877914951989e-06, "loss": 0.406, "step": 281 }, { "epoch": 0.02323473675537612, "grad_norm": 5.946903311816241, "learning_rate": 3.854595336076818e-06, "loss": 0.4755, "step": 282 }, { "epoch": 0.023317129438905824, "grad_norm": 6.834284548380493, "learning_rate": 3.868312757201647e-06, "loss": 0.5116, "step": 283 }, { "epoch": 0.02339952212243553, "grad_norm": 11.774766376537002, "learning_rate": 3.882030178326475e-06, "loss": 0.3654, "step": 284 }, { "epoch": 0.02348191480596523, "grad_norm": 10.79007061340619, "learning_rate": 3.895747599451304e-06, "loss": 0.3865, "step": 285 }, { "epoch": 0.023564307489494934, "grad_norm": 8.26845323051975, "learning_rate": 3.909465020576132e-06, "loss": 0.6248, "step": 286 }, { "epoch": 0.023646700173024636, "grad_norm": 49.81368555681276, "learning_rate": 3.9231824417009605e-06, "loss": 0.6136, "step": 287 }, { "epoch": 0.023729092856554338, "grad_norm": 13.035734882105107, "learning_rate": 3.9368998628257894e-06, "loss": 0.7522, "step": 288 }, { "epoch": 0.02381148554008404, "grad_norm": 10.878498977499254, "learning_rate": 3.9506172839506175e-06, "loss": 0.6219, "step": 289 }, { "epoch": 0.023893878223613742, "grad_norm": 11.982100652311225, "learning_rate": 3.964334705075446e-06, "loss": 0.6203, "step": 290 }, { "epoch": 0.023976270907143445, "grad_norm": 9.985093354905656, "learning_rate": 3.9780521262002744e-06, "loss": 0.6091, "step": 291 }, { "epoch": 0.024058663590673147, "grad_norm": 10.79524941182704, "learning_rate": 3.991769547325103e-06, "loss": 0.511, "step": 292 }, { "epoch": 0.024141056274202852, "grad_norm": 8.306861975711865, "learning_rate": 4.005486968449932e-06, "loss": 0.6458, "step": 293 }, { "epoch": 0.024223448957732555, "grad_norm": 8.445514735869802, "learning_rate": 4.01920438957476e-06, "loss": 0.4246, "step": 294 }, { "epoch": 0.024305841641262257, "grad_norm": 10.946877499366206, "learning_rate": 4.032921810699589e-06, "loss": 0.4455, "step": 295 }, { "epoch": 0.02438823432479196, "grad_norm": 9.95334826152276, "learning_rate": 4.046639231824417e-06, "loss": 0.7388, "step": 296 }, { "epoch": 0.02447062700832166, "grad_norm": 6.182291262554031, "learning_rate": 4.060356652949246e-06, "loss": 0.255, "step": 297 }, { "epoch": 0.024553019691851363, "grad_norm": 6.230437793654582, "learning_rate": 4.074074074074074e-06, "loss": 0.317, "step": 298 }, { "epoch": 0.024635412375381065, "grad_norm": 37.925863944916884, "learning_rate": 4.087791495198903e-06, "loss": 0.6706, "step": 299 }, { "epoch": 0.024717805058910768, "grad_norm": 9.275615694704983, "learning_rate": 4.101508916323731e-06, "loss": 0.7037, "step": 300 }, { "epoch": 0.02480019774244047, "grad_norm": 8.011699796447031, "learning_rate": 4.11522633744856e-06, "loss": 0.5876, "step": 301 }, { "epoch": 0.024882590425970175, "grad_norm": 6.150375331005549, "learning_rate": 4.128943758573389e-06, "loss": 0.5513, "step": 302 }, { "epoch": 0.024964983109499878, "grad_norm": 9.66910213632119, "learning_rate": 4.142661179698217e-06, "loss": 0.7654, "step": 303 }, { "epoch": 0.02504737579302958, "grad_norm": 7.492282251093323, "learning_rate": 4.156378600823046e-06, "loss": 0.5467, "step": 304 }, { "epoch": 0.025129768476559282, "grad_norm": 8.84736509671235, "learning_rate": 4.170096021947874e-06, "loss": 0.6802, "step": 305 }, { "epoch": 0.025212161160088984, "grad_norm": 11.147679298984809, "learning_rate": 4.183813443072703e-06, "loss": 0.3183, "step": 306 }, { "epoch": 0.025294553843618686, "grad_norm": 6.052642841781604, "learning_rate": 4.197530864197531e-06, "loss": 0.2843, "step": 307 }, { "epoch": 0.02537694652714839, "grad_norm": 5.853639145291225, "learning_rate": 4.21124828532236e-06, "loss": 0.6171, "step": 308 }, { "epoch": 0.02545933921067809, "grad_norm": 7.664697260789782, "learning_rate": 4.224965706447189e-06, "loss": 0.5526, "step": 309 }, { "epoch": 0.025541731894207793, "grad_norm": 8.309328255797821, "learning_rate": 4.238683127572017e-06, "loss": 0.3675, "step": 310 }, { "epoch": 0.0256241245777375, "grad_norm": 6.048028087236039, "learning_rate": 4.2524005486968456e-06, "loss": 0.3439, "step": 311 }, { "epoch": 0.0257065172612672, "grad_norm": 7.684783842069482, "learning_rate": 4.266117969821674e-06, "loss": 0.671, "step": 312 }, { "epoch": 0.025788909944796903, "grad_norm": 14.534992720713337, "learning_rate": 4.2798353909465025e-06, "loss": 0.7617, "step": 313 }, { "epoch": 0.025871302628326605, "grad_norm": 8.615671182896788, "learning_rate": 4.293552812071331e-06, "loss": 0.6877, "step": 314 }, { "epoch": 0.025953695311856307, "grad_norm": 7.575816304061312, "learning_rate": 4.3072702331961594e-06, "loss": 0.6431, "step": 315 }, { "epoch": 0.02603608799538601, "grad_norm": 8.768345528296296, "learning_rate": 4.3209876543209875e-06, "loss": 0.4367, "step": 316 }, { "epoch": 0.02611848067891571, "grad_norm": 7.718681777042332, "learning_rate": 4.334705075445816e-06, "loss": 0.3518, "step": 317 }, { "epoch": 0.026200873362445413, "grad_norm": 15.692592743928309, "learning_rate": 4.348422496570645e-06, "loss": 0.8163, "step": 318 }, { "epoch": 0.02628326604597512, "grad_norm": 8.46510288305704, "learning_rate": 4.362139917695473e-06, "loss": 0.6321, "step": 319 }, { "epoch": 0.02636565872950482, "grad_norm": 6.317038560782752, "learning_rate": 4.375857338820302e-06, "loss": 0.4448, "step": 320 }, { "epoch": 0.026448051413034523, "grad_norm": 17.22722159022544, "learning_rate": 4.38957475994513e-06, "loss": 0.6319, "step": 321 }, { "epoch": 0.026530444096564226, "grad_norm": 35.34200236087575, "learning_rate": 4.403292181069959e-06, "loss": 0.6193, "step": 322 }, { "epoch": 0.026612836780093928, "grad_norm": 11.559769027360641, "learning_rate": 4.417009602194788e-06, "loss": 0.6904, "step": 323 }, { "epoch": 0.02669522946362363, "grad_norm": 7.249712253160824, "learning_rate": 4.430727023319616e-06, "loss": 0.4605, "step": 324 }, { "epoch": 0.026777622147153332, "grad_norm": 5.9808851831275, "learning_rate": 4.444444444444444e-06, "loss": 0.3127, "step": 325 }, { "epoch": 0.026860014830683034, "grad_norm": 6.4015006508429995, "learning_rate": 4.458161865569273e-06, "loss": 0.6494, "step": 326 }, { "epoch": 0.026942407514212736, "grad_norm": 7.388398333528457, "learning_rate": 4.471879286694102e-06, "loss": 0.4854, "step": 327 }, { "epoch": 0.027024800197742442, "grad_norm": 5.459089110209384, "learning_rate": 4.485596707818931e-06, "loss": 0.463, "step": 328 }, { "epoch": 0.027107192881272144, "grad_norm": 7.377223891634756, "learning_rate": 4.499314128943759e-06, "loss": 0.6919, "step": 329 }, { "epoch": 0.027189585564801846, "grad_norm": 6.3768264626554805, "learning_rate": 4.513031550068587e-06, "loss": 0.6361, "step": 330 }, { "epoch": 0.02727197824833155, "grad_norm": 6.99781725288093, "learning_rate": 4.526748971193416e-06, "loss": 0.6743, "step": 331 }, { "epoch": 0.02735437093186125, "grad_norm": 5.9726845052369075, "learning_rate": 4.540466392318245e-06, "loss": 0.3525, "step": 332 }, { "epoch": 0.027436763615390953, "grad_norm": 4.715507305833575, "learning_rate": 4.554183813443074e-06, "loss": 0.365, "step": 333 }, { "epoch": 0.027519156298920655, "grad_norm": 5.786778668319323, "learning_rate": 4.567901234567902e-06, "loss": 0.3823, "step": 334 }, { "epoch": 0.027601548982450357, "grad_norm": 7.779154962035555, "learning_rate": 4.58161865569273e-06, "loss": 0.5933, "step": 335 }, { "epoch": 0.02768394166598006, "grad_norm": 12.355416669442642, "learning_rate": 4.595336076817559e-06, "loss": 0.6039, "step": 336 }, { "epoch": 0.027766334349509765, "grad_norm": 5.078241648282684, "learning_rate": 4.6090534979423875e-06, "loss": 0.3775, "step": 337 }, { "epoch": 0.027848727033039467, "grad_norm": 5.863453886765012, "learning_rate": 4.622770919067216e-06, "loss": 0.2638, "step": 338 }, { "epoch": 0.02793111971656917, "grad_norm": 12.580103906062597, "learning_rate": 4.6364883401920444e-06, "loss": 0.7607, "step": 339 }, { "epoch": 0.02801351240009887, "grad_norm": 6.6784963620823525, "learning_rate": 4.6502057613168725e-06, "loss": 0.4435, "step": 340 }, { "epoch": 0.028095905083628574, "grad_norm": 7.038657857694269, "learning_rate": 4.663923182441701e-06, "loss": 0.6857, "step": 341 }, { "epoch": 0.028178297767158276, "grad_norm": 14.379372920193825, "learning_rate": 4.67764060356653e-06, "loss": 0.8087, "step": 342 }, { "epoch": 0.028260690450687978, "grad_norm": 9.837898915303215, "learning_rate": 4.691358024691358e-06, "loss": 0.5434, "step": 343 }, { "epoch": 0.02834308313421768, "grad_norm": 6.687023299218438, "learning_rate": 4.705075445816187e-06, "loss": 0.3655, "step": 344 }, { "epoch": 0.028425475817747382, "grad_norm": 5.726375583817776, "learning_rate": 4.718792866941015e-06, "loss": 0.5816, "step": 345 }, { "epoch": 0.028507868501277088, "grad_norm": 6.8852392247210945, "learning_rate": 4.732510288065844e-06, "loss": 0.5028, "step": 346 }, { "epoch": 0.02859026118480679, "grad_norm": 5.798984025320741, "learning_rate": 4.746227709190673e-06, "loss": 0.5701, "step": 347 }, { "epoch": 0.028672653868336492, "grad_norm": 10.24662425737303, "learning_rate": 4.759945130315501e-06, "loss": 0.6865, "step": 348 }, { "epoch": 0.028755046551866194, "grad_norm": 6.089865434146081, "learning_rate": 4.773662551440329e-06, "loss": 0.6887, "step": 349 }, { "epoch": 0.028837439235395897, "grad_norm": 6.365563959115913, "learning_rate": 4.787379972565158e-06, "loss": 0.6076, "step": 350 }, { "epoch": 0.0289198319189256, "grad_norm": 5.916203223471868, "learning_rate": 4.801097393689987e-06, "loss": 0.6597, "step": 351 }, { "epoch": 0.0290022246024553, "grad_norm": 4.599031509093365, "learning_rate": 4.814814814814815e-06, "loss": 0.395, "step": 352 }, { "epoch": 0.029084617285985003, "grad_norm": 6.6869153491637485, "learning_rate": 4.828532235939644e-06, "loss": 0.548, "step": 353 }, { "epoch": 0.02916700996951471, "grad_norm": 4.333465973974785, "learning_rate": 4.842249657064472e-06, "loss": 0.5945, "step": 354 }, { "epoch": 0.02924940265304441, "grad_norm": 6.808396103175679, "learning_rate": 4.855967078189301e-06, "loss": 0.5964, "step": 355 }, { "epoch": 0.029331795336574113, "grad_norm": 9.755100603471288, "learning_rate": 4.86968449931413e-06, "loss": 0.6287, "step": 356 }, { "epoch": 0.029414188020103815, "grad_norm": 5.80905261329336, "learning_rate": 4.883401920438958e-06, "loss": 0.4366, "step": 357 }, { "epoch": 0.029496580703633517, "grad_norm": 5.5904604000702545, "learning_rate": 4.897119341563787e-06, "loss": 0.5093, "step": 358 }, { "epoch": 0.02957897338716322, "grad_norm": 7.090237952729793, "learning_rate": 4.910836762688615e-06, "loss": 0.4749, "step": 359 }, { "epoch": 0.029661366070692922, "grad_norm": 6.9638534563545695, "learning_rate": 4.924554183813444e-06, "loss": 0.6429, "step": 360 }, { "epoch": 0.029743758754222624, "grad_norm": 4.026899519732204, "learning_rate": 4.938271604938272e-06, "loss": 0.4878, "step": 361 }, { "epoch": 0.029826151437752326, "grad_norm": 7.3582470565677065, "learning_rate": 4.9519890260631005e-06, "loss": 0.5546, "step": 362 }, { "epoch": 0.029908544121282032, "grad_norm": 5.356724068201691, "learning_rate": 4.9657064471879294e-06, "loss": 0.345, "step": 363 }, { "epoch": 0.029990936804811734, "grad_norm": 5.364214773411196, "learning_rate": 4.9794238683127575e-06, "loss": 0.3246, "step": 364 }, { "epoch": 0.030073329488341436, "grad_norm": 8.332851292756342, "learning_rate": 4.993141289437586e-06, "loss": 0.6733, "step": 365 }, { "epoch": 0.030155722171871138, "grad_norm": 9.765581947528334, "learning_rate": 5.0068587105624144e-06, "loss": 0.4704, "step": 366 }, { "epoch": 0.03023811485540084, "grad_norm": 5.498161157247211, "learning_rate": 5.020576131687243e-06, "loss": 0.2827, "step": 367 }, { "epoch": 0.030320507538930543, "grad_norm": 43.68184948115611, "learning_rate": 5.034293552812071e-06, "loss": 0.3449, "step": 368 }, { "epoch": 0.030402900222460245, "grad_norm": 6.219439174212411, "learning_rate": 5.0480109739369e-06, "loss": 0.3369, "step": 369 }, { "epoch": 0.030485292905989947, "grad_norm": 5.267942442082339, "learning_rate": 5.061728395061729e-06, "loss": 0.4036, "step": 370 }, { "epoch": 0.03056768558951965, "grad_norm": 4.7582676290183, "learning_rate": 5.075445816186557e-06, "loss": 0.2824, "step": 371 }, { "epoch": 0.030650078273049355, "grad_norm": 8.46553998751592, "learning_rate": 5.089163237311386e-06, "loss": 0.5826, "step": 372 }, { "epoch": 0.030732470956579057, "grad_norm": 9.33378948895317, "learning_rate": 5.102880658436214e-06, "loss": 0.6385, "step": 373 }, { "epoch": 0.03081486364010876, "grad_norm": 10.627769537470304, "learning_rate": 5.116598079561042e-06, "loss": 0.6659, "step": 374 }, { "epoch": 0.03089725632363846, "grad_norm": 6.414932231735032, "learning_rate": 5.130315500685872e-06, "loss": 0.4209, "step": 375 }, { "epoch": 0.030979649007168163, "grad_norm": 5.200756588061871, "learning_rate": 5.1440329218107e-06, "loss": 0.3344, "step": 376 }, { "epoch": 0.031062041690697866, "grad_norm": 8.066096499546177, "learning_rate": 5.157750342935528e-06, "loss": 0.5315, "step": 377 }, { "epoch": 0.031144434374227568, "grad_norm": 7.5104265708571125, "learning_rate": 5.171467764060357e-06, "loss": 0.5785, "step": 378 }, { "epoch": 0.03122682705775727, "grad_norm": 7.326972182415438, "learning_rate": 5.185185185185185e-06, "loss": 0.6173, "step": 379 }, { "epoch": 0.031309219741286975, "grad_norm": 7.064196146503117, "learning_rate": 5.198902606310015e-06, "loss": 0.4073, "step": 380 }, { "epoch": 0.031391612424816674, "grad_norm": 10.55313726796645, "learning_rate": 5.212620027434843e-06, "loss": 0.5923, "step": 381 }, { "epoch": 0.03147400510834638, "grad_norm": 9.355498638304265, "learning_rate": 5.226337448559671e-06, "loss": 0.3621, "step": 382 }, { "epoch": 0.03155639779187608, "grad_norm": 6.985263971534283, "learning_rate": 5.2400548696845e-06, "loss": 0.5704, "step": 383 }, { "epoch": 0.031638790475405784, "grad_norm": 12.680293995262826, "learning_rate": 5.253772290809328e-06, "loss": 0.6467, "step": 384 }, { "epoch": 0.03172118315893549, "grad_norm": 8.65674983418551, "learning_rate": 5.2674897119341575e-06, "loss": 0.5878, "step": 385 }, { "epoch": 0.03180357584246519, "grad_norm": 6.341472071022504, "learning_rate": 5.2812071330589856e-06, "loss": 0.4152, "step": 386 }, { "epoch": 0.031885968525994894, "grad_norm": 6.869168720162001, "learning_rate": 5.294924554183814e-06, "loss": 0.4097, "step": 387 }, { "epoch": 0.03196836120952459, "grad_norm": 14.190249862283896, "learning_rate": 5.3086419753086425e-06, "loss": 0.8292, "step": 388 }, { "epoch": 0.0320507538930543, "grad_norm": 8.275823508063906, "learning_rate": 5.3223593964334705e-06, "loss": 0.5986, "step": 389 }, { "epoch": 0.032133146576584, "grad_norm": 6.358393504534048, "learning_rate": 5.3360768175583e-06, "loss": 0.4381, "step": 390 }, { "epoch": 0.0322155392601137, "grad_norm": 7.516552473352774, "learning_rate": 5.349794238683128e-06, "loss": 0.6122, "step": 391 }, { "epoch": 0.0322979319436434, "grad_norm": 8.856630051592028, "learning_rate": 5.363511659807956e-06, "loss": 0.5578, "step": 392 }, { "epoch": 0.03238032462717311, "grad_norm": 6.892129730394783, "learning_rate": 5.377229080932785e-06, "loss": 0.3553, "step": 393 }, { "epoch": 0.03246271731070281, "grad_norm": 21.05820599044793, "learning_rate": 5.390946502057613e-06, "loss": 0.7038, "step": 394 }, { "epoch": 0.03254510999423251, "grad_norm": 8.434869352368931, "learning_rate": 5.404663923182441e-06, "loss": 0.6087, "step": 395 }, { "epoch": 0.03262750267776222, "grad_norm": 7.561351421213893, "learning_rate": 5.418381344307271e-06, "loss": 0.5764, "step": 396 }, { "epoch": 0.032709895361291916, "grad_norm": 8.211243703767535, "learning_rate": 5.432098765432099e-06, "loss": 0.4147, "step": 397 }, { "epoch": 0.03279228804482162, "grad_norm": 6.985876038283628, "learning_rate": 5.445816186556928e-06, "loss": 0.6712, "step": 398 }, { "epoch": 0.03287468072835132, "grad_norm": 7.796492055840742, "learning_rate": 5.459533607681756e-06, "loss": 0.5813, "step": 399 }, { "epoch": 0.032957073411881026, "grad_norm": 103.54807755066446, "learning_rate": 5.473251028806584e-06, "loss": 2.5632, "step": 400 }, { "epoch": 0.033039466095410724, "grad_norm": 5.515831392953944, "learning_rate": 5.486968449931414e-06, "loss": 0.4869, "step": 401 }, { "epoch": 0.03312185877894043, "grad_norm": 7.030021195326967, "learning_rate": 5.500685871056242e-06, "loss": 0.5261, "step": 402 }, { "epoch": 0.033204251462470136, "grad_norm": 19.87615017638583, "learning_rate": 5.514403292181071e-06, "loss": 0.6917, "step": 403 }, { "epoch": 0.033286644145999834, "grad_norm": 7.081294911924975, "learning_rate": 5.528120713305899e-06, "loss": 0.6978, "step": 404 }, { "epoch": 0.03336903682952954, "grad_norm": 11.979085035070433, "learning_rate": 5.541838134430727e-06, "loss": 0.8872, "step": 405 }, { "epoch": 0.03345142951305924, "grad_norm": 8.195578186353957, "learning_rate": 5.555555555555557e-06, "loss": 0.6859, "step": 406 }, { "epoch": 0.033533822196588944, "grad_norm": 8.47406281800443, "learning_rate": 5.569272976680385e-06, "loss": 0.8004, "step": 407 }, { "epoch": 0.03361621488011864, "grad_norm": 7.986033367143951, "learning_rate": 5.582990397805214e-06, "loss": 0.6459, "step": 408 }, { "epoch": 0.03369860756364835, "grad_norm": 56.85009393490188, "learning_rate": 5.596707818930042e-06, "loss": 0.4587, "step": 409 }, { "epoch": 0.03378100024717805, "grad_norm": 7.34821171599197, "learning_rate": 5.61042524005487e-06, "loss": 0.3975, "step": 410 }, { "epoch": 0.03386339293070775, "grad_norm": 10.977592364670041, "learning_rate": 5.624142661179699e-06, "loss": 0.6514, "step": 411 }, { "epoch": 0.03394578561423746, "grad_norm": 6.281256114995981, "learning_rate": 5.6378600823045275e-06, "loss": 0.5191, "step": 412 }, { "epoch": 0.03402817829776716, "grad_norm": 8.676725239026284, "learning_rate": 5.651577503429356e-06, "loss": 0.6749, "step": 413 }, { "epoch": 0.03411057098129686, "grad_norm": 5.271388347433627, "learning_rate": 5.6652949245541844e-06, "loss": 0.3394, "step": 414 }, { "epoch": 0.03419296366482656, "grad_norm": 8.62394479818176, "learning_rate": 5.6790123456790125e-06, "loss": 0.6579, "step": 415 }, { "epoch": 0.03427535634835627, "grad_norm": 8.725171620314873, "learning_rate": 5.692729766803841e-06, "loss": 0.5777, "step": 416 }, { "epoch": 0.034357749031885966, "grad_norm": 103.08679101276228, "learning_rate": 5.7064471879286694e-06, "loss": 1.5575, "step": 417 }, { "epoch": 0.03444014171541567, "grad_norm": 5.761947609743853, "learning_rate": 5.720164609053498e-06, "loss": 0.5843, "step": 418 }, { "epoch": 0.03452253439894537, "grad_norm": 7.898907683412111, "learning_rate": 5.733882030178327e-06, "loss": 0.4965, "step": 419 }, { "epoch": 0.034604927082475076, "grad_norm": 8.409149079491211, "learning_rate": 5.747599451303155e-06, "loss": 0.5187, "step": 420 }, { "epoch": 0.03468731976600478, "grad_norm": 7.164102449901402, "learning_rate": 5.761316872427984e-06, "loss": 0.6237, "step": 421 }, { "epoch": 0.03476971244953448, "grad_norm": 6.78383472471162, "learning_rate": 5.775034293552812e-06, "loss": 0.47, "step": 422 }, { "epoch": 0.034852105133064186, "grad_norm": 8.352679629190035, "learning_rate": 5.78875171467764e-06, "loss": 0.6486, "step": 423 }, { "epoch": 0.034934497816593885, "grad_norm": 10.944499686428724, "learning_rate": 5.80246913580247e-06, "loss": 0.6787, "step": 424 }, { "epoch": 0.03501689050012359, "grad_norm": 7.169250883656542, "learning_rate": 5.816186556927298e-06, "loss": 0.6341, "step": 425 }, { "epoch": 0.03509928318365329, "grad_norm": 8.444427076931605, "learning_rate": 5.829903978052127e-06, "loss": 0.3875, "step": 426 }, { "epoch": 0.035181675867182995, "grad_norm": 5.943310209215223, "learning_rate": 5.843621399176955e-06, "loss": 0.5543, "step": 427 }, { "epoch": 0.03526406855071269, "grad_norm": 9.00551355023434, "learning_rate": 5.857338820301783e-06, "loss": 0.4304, "step": 428 }, { "epoch": 0.0353464612342424, "grad_norm": 5.450933353978881, "learning_rate": 5.871056241426613e-06, "loss": 0.439, "step": 429 }, { "epoch": 0.035428853917772105, "grad_norm": 5.508996916628157, "learning_rate": 5.884773662551441e-06, "loss": 0.3802, "step": 430 }, { "epoch": 0.0355112466013018, "grad_norm": 5.247685983735031, "learning_rate": 5.89849108367627e-06, "loss": 0.2684, "step": 431 }, { "epoch": 0.03559363928483151, "grad_norm": 5.397506286271877, "learning_rate": 5.912208504801098e-06, "loss": 0.341, "step": 432 }, { "epoch": 0.03567603196836121, "grad_norm": 5.27131135952353, "learning_rate": 5.925925925925926e-06, "loss": 0.2659, "step": 433 }, { "epoch": 0.03575842465189091, "grad_norm": 480.0659176486544, "learning_rate": 5.9396433470507556e-06, "loss": 0.638, "step": 434 }, { "epoch": 0.03584081733542061, "grad_norm": 13.678674149772142, "learning_rate": 5.953360768175584e-06, "loss": 0.7309, "step": 435 }, { "epoch": 0.03592321001895032, "grad_norm": 10.780807692308517, "learning_rate": 5.967078189300412e-06, "loss": 0.7032, "step": 436 }, { "epoch": 0.03600560270248002, "grad_norm": 13.02329032201945, "learning_rate": 5.9807956104252405e-06, "loss": 0.5507, "step": 437 }, { "epoch": 0.03608799538600972, "grad_norm": 6.048832451175678, "learning_rate": 5.994513031550069e-06, "loss": 0.4034, "step": 438 }, { "epoch": 0.03617038806953943, "grad_norm": 6.96057463234594, "learning_rate": 6.008230452674898e-06, "loss": 0.4898, "step": 439 }, { "epoch": 0.036252780753069126, "grad_norm": 9.398283406999298, "learning_rate": 6.021947873799726e-06, "loss": 0.645, "step": 440 }, { "epoch": 0.03633517343659883, "grad_norm": 8.87917346131629, "learning_rate": 6.0356652949245544e-06, "loss": 0.6331, "step": 441 }, { "epoch": 0.03641756612012853, "grad_norm": 7.58770542825193, "learning_rate": 6.049382716049383e-06, "loss": 0.496, "step": 442 }, { "epoch": 0.036499958803658236, "grad_norm": 7.319287387022868, "learning_rate": 6.063100137174211e-06, "loss": 0.4107, "step": 443 }, { "epoch": 0.036582351487187935, "grad_norm": 6.047688078633024, "learning_rate": 6.076817558299041e-06, "loss": 0.5706, "step": 444 }, { "epoch": 0.03666474417071764, "grad_norm": 4.016360698698718, "learning_rate": 6.090534979423869e-06, "loss": 0.4581, "step": 445 }, { "epoch": 0.036747136854247346, "grad_norm": 4.965075908435688, "learning_rate": 6.104252400548697e-06, "loss": 0.4266, "step": 446 }, { "epoch": 0.036829529537777045, "grad_norm": 6.216596173925147, "learning_rate": 6.117969821673526e-06, "loss": 0.4522, "step": 447 }, { "epoch": 0.03691192222130675, "grad_norm": 4.842662632695187, "learning_rate": 6.131687242798354e-06, "loss": 0.4335, "step": 448 }, { "epoch": 0.03699431490483645, "grad_norm": 4.9752196790209515, "learning_rate": 6.145404663923183e-06, "loss": 0.557, "step": 449 }, { "epoch": 0.037076707588366155, "grad_norm": 5.735323380928238, "learning_rate": 6.159122085048012e-06, "loss": 0.4763, "step": 450 }, { "epoch": 0.037159100271895854, "grad_norm": 5.77466848724865, "learning_rate": 6.17283950617284e-06, "loss": 0.4934, "step": 451 }, { "epoch": 0.03724149295542556, "grad_norm": 9.112092371493077, "learning_rate": 6.186556927297669e-06, "loss": 0.7463, "step": 452 }, { "epoch": 0.03732388563895526, "grad_norm": 5.7102176201346, "learning_rate": 6.200274348422497e-06, "loss": 0.5117, "step": 453 }, { "epoch": 0.037406278322484963, "grad_norm": 78.79906885419953, "learning_rate": 6.213991769547325e-06, "loss": 0.4034, "step": 454 }, { "epoch": 0.03748867100601467, "grad_norm": 6.538900045540373, "learning_rate": 6.227709190672154e-06, "loss": 0.5889, "step": 455 }, { "epoch": 0.03757106368954437, "grad_norm": 6.303771338902271, "learning_rate": 6.241426611796983e-06, "loss": 0.4705, "step": 456 }, { "epoch": 0.03765345637307407, "grad_norm": 6.123584679010139, "learning_rate": 6.255144032921812e-06, "loss": 0.4935, "step": 457 }, { "epoch": 0.03773584905660377, "grad_norm": 6.632670832269488, "learning_rate": 6.26886145404664e-06, "loss": 0.4198, "step": 458 }, { "epoch": 0.03781824174013348, "grad_norm": 5.546218761188075, "learning_rate": 6.282578875171468e-06, "loss": 0.4775, "step": 459 }, { "epoch": 0.037900634423663176, "grad_norm": 7.410628500383756, "learning_rate": 6.296296296296297e-06, "loss": 0.5848, "step": 460 }, { "epoch": 0.03798302710719288, "grad_norm": 13.29983237450669, "learning_rate": 6.310013717421125e-06, "loss": 0.5264, "step": 461 }, { "epoch": 0.03806541979072258, "grad_norm": 6.248701868467927, "learning_rate": 6.3237311385459544e-06, "loss": 0.4628, "step": 462 }, { "epoch": 0.038147812474252286, "grad_norm": 5.555557826856918, "learning_rate": 6.3374485596707825e-06, "loss": 0.4553, "step": 463 }, { "epoch": 0.03823020515778199, "grad_norm": 9.184422285800041, "learning_rate": 6.3511659807956105e-06, "loss": 0.8573, "step": 464 }, { "epoch": 0.03831259784131169, "grad_norm": 5.747050476310712, "learning_rate": 6.3648834019204394e-06, "loss": 0.482, "step": 465 }, { "epoch": 0.038394990524841396, "grad_norm": 4.763610826797223, "learning_rate": 6.3786008230452675e-06, "loss": 0.5323, "step": 466 }, { "epoch": 0.038477383208371095, "grad_norm": 5.592179783867718, "learning_rate": 6.392318244170097e-06, "loss": 0.4239, "step": 467 }, { "epoch": 0.0385597758919008, "grad_norm": 6.874653123972646, "learning_rate": 6.406035665294925e-06, "loss": 0.6832, "step": 468 }, { "epoch": 0.0386421685754305, "grad_norm": 8.90284342707074, "learning_rate": 6.419753086419753e-06, "loss": 0.7136, "step": 469 }, { "epoch": 0.038724561258960205, "grad_norm": 4.321479415948776, "learning_rate": 6.433470507544582e-06, "loss": 0.3613, "step": 470 }, { "epoch": 0.038806953942489904, "grad_norm": 6.212842061197888, "learning_rate": 6.44718792866941e-06, "loss": 0.5118, "step": 471 }, { "epoch": 0.03888934662601961, "grad_norm": 6.620819776714046, "learning_rate": 6.460905349794238e-06, "loss": 0.6616, "step": 472 }, { "epoch": 0.038971739309549315, "grad_norm": 4.727885284155705, "learning_rate": 6.474622770919068e-06, "loss": 0.5001, "step": 473 }, { "epoch": 0.039054131993079014, "grad_norm": 5.762977515077748, "learning_rate": 6.488340192043896e-06, "loss": 0.496, "step": 474 }, { "epoch": 0.03913652467660872, "grad_norm": 4.548105976567005, "learning_rate": 6.502057613168725e-06, "loss": 0.4174, "step": 475 }, { "epoch": 0.03921891736013842, "grad_norm": 10.758680715013131, "learning_rate": 6.515775034293553e-06, "loss": 1.0219, "step": 476 }, { "epoch": 0.039301310043668124, "grad_norm": 4.833629523221548, "learning_rate": 6.529492455418381e-06, "loss": 0.4714, "step": 477 }, { "epoch": 0.03938370272719782, "grad_norm": 6.168003173970242, "learning_rate": 6.543209876543211e-06, "loss": 0.5518, "step": 478 }, { "epoch": 0.03946609541072753, "grad_norm": 6.781568533057639, "learning_rate": 6.556927297668039e-06, "loss": 0.6373, "step": 479 }, { "epoch": 0.03954848809425723, "grad_norm": 7.807648679748847, "learning_rate": 6.570644718792868e-06, "loss": 0.6692, "step": 480 }, { "epoch": 0.03963088077778693, "grad_norm": 5.261036851137123, "learning_rate": 6.584362139917696e-06, "loss": 0.4972, "step": 481 }, { "epoch": 0.03971327346131664, "grad_norm": 8.82287432796905, "learning_rate": 6.598079561042524e-06, "loss": 0.5782, "step": 482 }, { "epoch": 0.03979566614484634, "grad_norm": 4.74653534153213, "learning_rate": 6.611796982167354e-06, "loss": 0.4713, "step": 483 }, { "epoch": 0.03987805882837604, "grad_norm": 6.115095781407729, "learning_rate": 6.625514403292182e-06, "loss": 0.4716, "step": 484 }, { "epoch": 0.03996045151190574, "grad_norm": 9.137058349523514, "learning_rate": 6.6392318244170106e-06, "loss": 0.5758, "step": 485 }, { "epoch": 0.04004284419543545, "grad_norm": 6.208461928327539, "learning_rate": 6.652949245541839e-06, "loss": 0.6577, "step": 486 }, { "epoch": 0.040125236878965145, "grad_norm": 7.110169283978004, "learning_rate": 6.666666666666667e-06, "loss": 0.5648, "step": 487 }, { "epoch": 0.04020762956249485, "grad_norm": 5.955335908180957, "learning_rate": 6.680384087791496e-06, "loss": 0.6219, "step": 488 }, { "epoch": 0.04029002224602455, "grad_norm": 11.624360168848643, "learning_rate": 6.6941015089163244e-06, "loss": 0.5879, "step": 489 }, { "epoch": 0.040372414929554255, "grad_norm": 6.045762234406257, "learning_rate": 6.707818930041153e-06, "loss": 0.5048, "step": 490 }, { "epoch": 0.04045480761308396, "grad_norm": 7.274352306633563, "learning_rate": 6.721536351165981e-06, "loss": 0.7685, "step": 491 }, { "epoch": 0.04053720029661366, "grad_norm": 5.5877312296120465, "learning_rate": 6.7352537722908094e-06, "loss": 0.4779, "step": 492 }, { "epoch": 0.040619592980143365, "grad_norm": 190.3033861867491, "learning_rate": 6.748971193415639e-06, "loss": 2.688, "step": 493 }, { "epoch": 0.040701985663673064, "grad_norm": 10.422219949359063, "learning_rate": 6.762688614540467e-06, "loss": 0.6075, "step": 494 }, { "epoch": 0.04078437834720277, "grad_norm": 5.781767462518796, "learning_rate": 6.776406035665295e-06, "loss": 0.4923, "step": 495 }, { "epoch": 0.04086677103073247, "grad_norm": 7.096850817273703, "learning_rate": 6.790123456790124e-06, "loss": 0.3914, "step": 496 }, { "epoch": 0.040949163714262174, "grad_norm": 6.715044383416897, "learning_rate": 6.803840877914952e-06, "loss": 0.6559, "step": 497 }, { "epoch": 0.04103155639779187, "grad_norm": 5.869148124670727, "learning_rate": 6.817558299039781e-06, "loss": 0.6119, "step": 498 }, { "epoch": 0.04111394908132158, "grad_norm": 6.817431946634836, "learning_rate": 6.83127572016461e-06, "loss": 0.7011, "step": 499 }, { "epoch": 0.041196341764851284, "grad_norm": 6.726271933368034, "learning_rate": 6.844993141289438e-06, "loss": 0.5462, "step": 500 }, { "epoch": 0.04127873444838098, "grad_norm": 5.340136233900829, "learning_rate": 6.858710562414267e-06, "loss": 0.4988, "step": 501 }, { "epoch": 0.04136112713191069, "grad_norm": 6.910491082536719, "learning_rate": 6.872427983539095e-06, "loss": 0.3951, "step": 502 }, { "epoch": 0.04144351981544039, "grad_norm": 5.788747410824949, "learning_rate": 6.886145404663924e-06, "loss": 0.5783, "step": 503 }, { "epoch": 0.04152591249897009, "grad_norm": 4.802638214101094, "learning_rate": 6.899862825788752e-06, "loss": 0.2877, "step": 504 }, { "epoch": 0.04160830518249979, "grad_norm": 4.484566887630588, "learning_rate": 6.913580246913581e-06, "loss": 0.3811, "step": 505 }, { "epoch": 0.0416906978660295, "grad_norm": 6.051035690893021, "learning_rate": 6.92729766803841e-06, "loss": 0.4715, "step": 506 }, { "epoch": 0.0417730905495592, "grad_norm": 6.986842742202583, "learning_rate": 6.941015089163238e-06, "loss": 0.3989, "step": 507 }, { "epoch": 0.0418554832330889, "grad_norm": 3.822985062741508, "learning_rate": 6.954732510288067e-06, "loss": 0.3269, "step": 508 }, { "epoch": 0.04193787591661861, "grad_norm": 6.271803341046997, "learning_rate": 6.968449931412895e-06, "loss": 0.5455, "step": 509 }, { "epoch": 0.042020268600148306, "grad_norm": 4.324558170988178, "learning_rate": 6.982167352537723e-06, "loss": 0.3514, "step": 510 }, { "epoch": 0.04210266128367801, "grad_norm": 7.0738043807784265, "learning_rate": 6.9958847736625525e-06, "loss": 0.4582, "step": 511 }, { "epoch": 0.04218505396720771, "grad_norm": 4.303336598762236, "learning_rate": 7.0096021947873805e-06, "loss": 0.2196, "step": 512 }, { "epoch": 0.042267446650737416, "grad_norm": 6.729727246828568, "learning_rate": 7.023319615912209e-06, "loss": 0.5618, "step": 513 }, { "epoch": 0.042349839334267114, "grad_norm": 5.868167847006668, "learning_rate": 7.0370370370370375e-06, "loss": 0.5361, "step": 514 }, { "epoch": 0.04243223201779682, "grad_norm": 8.605825638540875, "learning_rate": 7.0507544581618655e-06, "loss": 0.6892, "step": 515 }, { "epoch": 0.042514624701326525, "grad_norm": 5.431792569281863, "learning_rate": 7.064471879286695e-06, "loss": 0.4384, "step": 516 }, { "epoch": 0.042597017384856224, "grad_norm": 6.984146114234522, "learning_rate": 7.078189300411523e-06, "loss": 0.483, "step": 517 }, { "epoch": 0.04267941006838593, "grad_norm": 9.3422311974361, "learning_rate": 7.091906721536351e-06, "loss": 0.5842, "step": 518 }, { "epoch": 0.04276180275191563, "grad_norm": 5.732842769025313, "learning_rate": 7.10562414266118e-06, "loss": 0.3972, "step": 519 }, { "epoch": 0.042844195435445334, "grad_norm": 8.65862913312267, "learning_rate": 7.119341563786008e-06, "loss": 0.6456, "step": 520 }, { "epoch": 0.04292658811897503, "grad_norm": 6.864043755425524, "learning_rate": 7.133058984910838e-06, "loss": 0.5415, "step": 521 }, { "epoch": 0.04300898080250474, "grad_norm": 7.980923425776543, "learning_rate": 7.146776406035666e-06, "loss": 0.6034, "step": 522 }, { "epoch": 0.04309137348603444, "grad_norm": 6.747289314662945, "learning_rate": 7.160493827160494e-06, "loss": 0.5615, "step": 523 }, { "epoch": 0.04317376616956414, "grad_norm": 5.139243463936733, "learning_rate": 7.174211248285323e-06, "loss": 0.4397, "step": 524 }, { "epoch": 0.04325615885309385, "grad_norm": 4.121695404994115, "learning_rate": 7.187928669410151e-06, "loss": 0.576, "step": 525 }, { "epoch": 0.04333855153662355, "grad_norm": 4.650257020504757, "learning_rate": 7.201646090534981e-06, "loss": 0.3738, "step": 526 }, { "epoch": 0.04342094422015325, "grad_norm": 5.611100034997747, "learning_rate": 7.215363511659809e-06, "loss": 0.5172, "step": 527 }, { "epoch": 0.04350333690368295, "grad_norm": 7.677372417584333, "learning_rate": 7.229080932784637e-06, "loss": 0.6671, "step": 528 }, { "epoch": 0.04358572958721266, "grad_norm": 6.6136776618758875, "learning_rate": 7.242798353909466e-06, "loss": 0.7559, "step": 529 }, { "epoch": 0.043668122270742356, "grad_norm": 4.792630608864752, "learning_rate": 7.256515775034294e-06, "loss": 0.3489, "step": 530 }, { "epoch": 0.04375051495427206, "grad_norm": 6.35062782260829, "learning_rate": 7.270233196159122e-06, "loss": 0.6693, "step": 531 }, { "epoch": 0.04383290763780176, "grad_norm": 5.605832169231131, "learning_rate": 7.283950617283952e-06, "loss": 0.6839, "step": 532 }, { "epoch": 0.043915300321331466, "grad_norm": 7.340777524703994, "learning_rate": 7.29766803840878e-06, "loss": 0.4743, "step": 533 }, { "epoch": 0.04399769300486117, "grad_norm": 5.736671549282368, "learning_rate": 7.311385459533609e-06, "loss": 0.5146, "step": 534 }, { "epoch": 0.04408008568839087, "grad_norm": 5.290429644156163, "learning_rate": 7.325102880658437e-06, "loss": 0.4368, "step": 535 }, { "epoch": 0.044162478371920576, "grad_norm": 5.60282614307985, "learning_rate": 7.338820301783265e-06, "loss": 0.5957, "step": 536 }, { "epoch": 0.044244871055450274, "grad_norm": 5.166334947065374, "learning_rate": 7.3525377229080944e-06, "loss": 0.3518, "step": 537 }, { "epoch": 0.04432726373897998, "grad_norm": 4.03828573750434, "learning_rate": 7.3662551440329225e-06, "loss": 0.4423, "step": 538 }, { "epoch": 0.04440965642250968, "grad_norm": 5.668301363403015, "learning_rate": 7.379972565157751e-06, "loss": 0.5681, "step": 539 }, { "epoch": 0.044492049106039384, "grad_norm": 5.359958652089056, "learning_rate": 7.3936899862825794e-06, "loss": 0.2408, "step": 540 }, { "epoch": 0.04457444178956908, "grad_norm": 7.013014929960838, "learning_rate": 7.4074074074074075e-06, "loss": 0.5804, "step": 541 }, { "epoch": 0.04465683447309879, "grad_norm": 4.355083270145565, "learning_rate": 7.421124828532237e-06, "loss": 0.3215, "step": 542 }, { "epoch": 0.044739227156628494, "grad_norm": 4.950584901024228, "learning_rate": 7.434842249657065e-06, "loss": 0.4059, "step": 543 }, { "epoch": 0.04482161984015819, "grad_norm": 7.881681417974008, "learning_rate": 7.448559670781894e-06, "loss": 0.6569, "step": 544 }, { "epoch": 0.0449040125236879, "grad_norm": 8.797495038662335, "learning_rate": 7.462277091906722e-06, "loss": 0.6289, "step": 545 }, { "epoch": 0.0449864052072176, "grad_norm": 5.34524571582464, "learning_rate": 7.47599451303155e-06, "loss": 0.4069, "step": 546 }, { "epoch": 0.0450687978907473, "grad_norm": 6.158525652435086, "learning_rate": 7.489711934156379e-06, "loss": 0.2462, "step": 547 }, { "epoch": 0.045151190574277, "grad_norm": 4.744502082646233, "learning_rate": 7.503429355281208e-06, "loss": 0.3248, "step": 548 }, { "epoch": 0.04523358325780671, "grad_norm": 5.917171401303492, "learning_rate": 7.517146776406036e-06, "loss": 0.5406, "step": 549 }, { "epoch": 0.045315975941336406, "grad_norm": 12.458906212002898, "learning_rate": 7.530864197530865e-06, "loss": 0.7253, "step": 550 }, { "epoch": 0.04539836862486611, "grad_norm": 7.531914989608172, "learning_rate": 7.544581618655693e-06, "loss": 0.5998, "step": 551 }, { "epoch": 0.04548076130839582, "grad_norm": 4.0280860261800004, "learning_rate": 7.558299039780522e-06, "loss": 0.4643, "step": 552 }, { "epoch": 0.045563153991925516, "grad_norm": 6.015033091452991, "learning_rate": 7.57201646090535e-06, "loss": 0.4256, "step": 553 }, { "epoch": 0.04564554667545522, "grad_norm": 7.13456394416836, "learning_rate": 7.585733882030179e-06, "loss": 0.7066, "step": 554 }, { "epoch": 0.04572793935898492, "grad_norm": 9.287192614752263, "learning_rate": 7.599451303155008e-06, "loss": 0.7301, "step": 555 }, { "epoch": 0.045810332042514626, "grad_norm": 6.284874505694774, "learning_rate": 7.613168724279836e-06, "loss": 0.433, "step": 556 }, { "epoch": 0.045892724726044325, "grad_norm": 5.947339381004487, "learning_rate": 7.626886145404665e-06, "loss": 0.4314, "step": 557 }, { "epoch": 0.04597511740957403, "grad_norm": 5.263524060407473, "learning_rate": 7.640603566529494e-06, "loss": 0.4686, "step": 558 }, { "epoch": 0.04605751009310373, "grad_norm": 6.4191613503614775, "learning_rate": 7.654320987654322e-06, "loss": 0.3771, "step": 559 }, { "epoch": 0.046139902776633435, "grad_norm": 4.786426740476273, "learning_rate": 7.66803840877915e-06, "loss": 0.5297, "step": 560 }, { "epoch": 0.04622229546016314, "grad_norm": 5.969988084382956, "learning_rate": 7.681755829903978e-06, "loss": 0.538, "step": 561 }, { "epoch": 0.04630468814369284, "grad_norm": 7.308885572221812, "learning_rate": 7.695473251028807e-06, "loss": 0.5234, "step": 562 }, { "epoch": 0.046387080827222545, "grad_norm": 5.97558760032448, "learning_rate": 7.709190672153636e-06, "loss": 0.4623, "step": 563 }, { "epoch": 0.04646947351075224, "grad_norm": 6.703370454236969, "learning_rate": 7.722908093278464e-06, "loss": 0.4321, "step": 564 }, { "epoch": 0.04655186619428195, "grad_norm": 5.449290012281466, "learning_rate": 7.736625514403293e-06, "loss": 0.4356, "step": 565 }, { "epoch": 0.04663425887781165, "grad_norm": 7.6918159276005245, "learning_rate": 7.750342935528121e-06, "loss": 0.7377, "step": 566 }, { "epoch": 0.04671665156134135, "grad_norm": 6.70070388013919, "learning_rate": 7.76406035665295e-06, "loss": 0.4832, "step": 567 }, { "epoch": 0.04679904424487106, "grad_norm": 7.054794661079072, "learning_rate": 7.77777777777778e-06, "loss": 0.5, "step": 568 }, { "epoch": 0.04688143692840076, "grad_norm": 6.480167958787211, "learning_rate": 7.791495198902607e-06, "loss": 0.6175, "step": 569 }, { "epoch": 0.04696382961193046, "grad_norm": 7.506915605641678, "learning_rate": 7.805212620027435e-06, "loss": 0.6497, "step": 570 }, { "epoch": 0.04704622229546016, "grad_norm": 5.494983006094659, "learning_rate": 7.818930041152263e-06, "loss": 0.4892, "step": 571 }, { "epoch": 0.04712861497898987, "grad_norm": 4.3290727230186175, "learning_rate": 7.832647462277091e-06, "loss": 0.3603, "step": 572 }, { "epoch": 0.047211007662519566, "grad_norm": 5.4053345835693545, "learning_rate": 7.846364883401921e-06, "loss": 0.4964, "step": 573 }, { "epoch": 0.04729340034604927, "grad_norm": 6.8723464860852275, "learning_rate": 7.860082304526749e-06, "loss": 0.5672, "step": 574 }, { "epoch": 0.04737579302957897, "grad_norm": 6.656820969576108, "learning_rate": 7.873799725651579e-06, "loss": 0.645, "step": 575 }, { "epoch": 0.047458185713108676, "grad_norm": 5.514831433681132, "learning_rate": 7.887517146776407e-06, "loss": 0.4784, "step": 576 }, { "epoch": 0.04754057839663838, "grad_norm": 4.686188017061002, "learning_rate": 7.901234567901235e-06, "loss": 0.3503, "step": 577 }, { "epoch": 0.04762297108016808, "grad_norm": 4.387746808622081, "learning_rate": 7.914951989026065e-06, "loss": 0.4085, "step": 578 }, { "epoch": 0.047705363763697786, "grad_norm": 5.842284742709113, "learning_rate": 7.928669410150893e-06, "loss": 0.6309, "step": 579 }, { "epoch": 0.047787756447227485, "grad_norm": 3.9959223903308883, "learning_rate": 7.94238683127572e-06, "loss": 0.4597, "step": 580 }, { "epoch": 0.04787014913075719, "grad_norm": 5.369023494162627, "learning_rate": 7.956104252400549e-06, "loss": 0.3751, "step": 581 }, { "epoch": 0.04795254181428689, "grad_norm": 5.2283340567591345, "learning_rate": 7.969821673525377e-06, "loss": 0.5132, "step": 582 }, { "epoch": 0.048034934497816595, "grad_norm": 5.349903681875905, "learning_rate": 7.983539094650207e-06, "loss": 0.3902, "step": 583 }, { "epoch": 0.048117327181346294, "grad_norm": 6.025785140359736, "learning_rate": 7.997256515775035e-06, "loss": 0.4025, "step": 584 }, { "epoch": 0.048199719864876, "grad_norm": 11.461654268608273, "learning_rate": 8.010973936899864e-06, "loss": 0.679, "step": 585 }, { "epoch": 0.048282112548405705, "grad_norm": 6.778197127568214, "learning_rate": 8.024691358024692e-06, "loss": 0.6373, "step": 586 }, { "epoch": 0.048364505231935404, "grad_norm": 6.682544812430659, "learning_rate": 8.03840877914952e-06, "loss": 0.6251, "step": 587 }, { "epoch": 0.04844689791546511, "grad_norm": 4.574480748574406, "learning_rate": 8.052126200274349e-06, "loss": 0.3578, "step": 588 }, { "epoch": 0.04852929059899481, "grad_norm": 5.884448857284855, "learning_rate": 8.065843621399178e-06, "loss": 0.4917, "step": 589 }, { "epoch": 0.048611683282524513, "grad_norm": 7.305232846822941, "learning_rate": 8.079561042524006e-06, "loss": 0.606, "step": 590 }, { "epoch": 0.04869407596605421, "grad_norm": 5.674679345815404, "learning_rate": 8.093278463648834e-06, "loss": 0.6098, "step": 591 }, { "epoch": 0.04877646864958392, "grad_norm": 5.8357949188415805, "learning_rate": 8.106995884773662e-06, "loss": 0.4771, "step": 592 }, { "epoch": 0.048858861333113617, "grad_norm": 6.271507937692957, "learning_rate": 8.120713305898492e-06, "loss": 0.4724, "step": 593 }, { "epoch": 0.04894125401664332, "grad_norm": 6.878289052557951, "learning_rate": 8.13443072702332e-06, "loss": 0.6355, "step": 594 }, { "epoch": 0.04902364670017303, "grad_norm": 6.77176191369003, "learning_rate": 8.148148148148148e-06, "loss": 0.5007, "step": 595 }, { "epoch": 0.049106039383702726, "grad_norm": 5.259636242937289, "learning_rate": 8.161865569272978e-06, "loss": 0.4991, "step": 596 }, { "epoch": 0.04918843206723243, "grad_norm": 6.83053596050915, "learning_rate": 8.175582990397806e-06, "loss": 0.4805, "step": 597 }, { "epoch": 0.04927082475076213, "grad_norm": 8.169736862097166, "learning_rate": 8.189300411522634e-06, "loss": 0.5456, "step": 598 }, { "epoch": 0.049353217434291836, "grad_norm": 7.689359450945521, "learning_rate": 8.203017832647462e-06, "loss": 0.6017, "step": 599 }, { "epoch": 0.049435610117821535, "grad_norm": 5.228139769258261, "learning_rate": 8.21673525377229e-06, "loss": 0.5305, "step": 600 }, { "epoch": 0.04951800280135124, "grad_norm": 8.65116366410673, "learning_rate": 8.23045267489712e-06, "loss": 0.5437, "step": 601 }, { "epoch": 0.04960039548488094, "grad_norm": 11.17746374283472, "learning_rate": 8.244170096021948e-06, "loss": 0.7125, "step": 602 }, { "epoch": 0.049682788168410645, "grad_norm": 7.301491118538198, "learning_rate": 8.257887517146778e-06, "loss": 0.5577, "step": 603 }, { "epoch": 0.04976518085194035, "grad_norm": 4.970573337029868, "learning_rate": 8.271604938271606e-06, "loss": 0.5968, "step": 604 }, { "epoch": 0.04984757353547005, "grad_norm": 5.587652792417023, "learning_rate": 8.285322359396434e-06, "loss": 0.6005, "step": 605 }, { "epoch": 0.049929966218999755, "grad_norm": 5.549916285590465, "learning_rate": 8.299039780521264e-06, "loss": 0.3548, "step": 606 }, { "epoch": 0.050012358902529454, "grad_norm": 8.10309244938579, "learning_rate": 8.312757201646092e-06, "loss": 0.5691, "step": 607 }, { "epoch": 0.05009475158605916, "grad_norm": 5.8031246153733935, "learning_rate": 8.32647462277092e-06, "loss": 0.5858, "step": 608 }, { "epoch": 0.05017714426958886, "grad_norm": 4.633845595880233, "learning_rate": 8.340192043895748e-06, "loss": 0.6119, "step": 609 }, { "epoch": 0.050259536953118564, "grad_norm": 4.538100964584221, "learning_rate": 8.353909465020576e-06, "loss": 0.4402, "step": 610 }, { "epoch": 0.05034192963664826, "grad_norm": 5.323060646938032, "learning_rate": 8.367626886145406e-06, "loss": 0.634, "step": 611 }, { "epoch": 0.05042432232017797, "grad_norm": 8.019191719629505, "learning_rate": 8.381344307270234e-06, "loss": 0.5162, "step": 612 }, { "epoch": 0.050506715003707674, "grad_norm": 8.490592176269905, "learning_rate": 8.395061728395062e-06, "loss": 0.7862, "step": 613 }, { "epoch": 0.05058910768723737, "grad_norm": 6.715022235925982, "learning_rate": 8.408779149519891e-06, "loss": 0.693, "step": 614 }, { "epoch": 0.05067150037076708, "grad_norm": 6.662593981470133, "learning_rate": 8.42249657064472e-06, "loss": 0.7293, "step": 615 }, { "epoch": 0.05075389305429678, "grad_norm": 6.601584476220066, "learning_rate": 8.43621399176955e-06, "loss": 0.8338, "step": 616 }, { "epoch": 0.05083628573782648, "grad_norm": 4.392372074940804, "learning_rate": 8.449931412894377e-06, "loss": 0.362, "step": 617 }, { "epoch": 0.05091867842135618, "grad_norm": 67.48725205124786, "learning_rate": 8.463648834019205e-06, "loss": 2.8128, "step": 618 }, { "epoch": 0.05100107110488589, "grad_norm": 7.506512019819455, "learning_rate": 8.477366255144033e-06, "loss": 0.5441, "step": 619 }, { "epoch": 0.051083463788415585, "grad_norm": 6.880593438312231, "learning_rate": 8.491083676268861e-06, "loss": 0.5519, "step": 620 }, { "epoch": 0.05116585647194529, "grad_norm": 5.888795257341883, "learning_rate": 8.504801097393691e-06, "loss": 0.5516, "step": 621 }, { "epoch": 0.051248249155475, "grad_norm": 6.252602124665069, "learning_rate": 8.518518518518519e-06, "loss": 0.6957, "step": 622 }, { "epoch": 0.051330641839004695, "grad_norm": 5.4090002095589975, "learning_rate": 8.532235939643347e-06, "loss": 0.5359, "step": 623 }, { "epoch": 0.0514130345225344, "grad_norm": 10.890995083855032, "learning_rate": 8.545953360768177e-06, "loss": 0.709, "step": 624 }, { "epoch": 0.0514954272060641, "grad_norm": 5.34598899270713, "learning_rate": 8.559670781893005e-06, "loss": 0.7751, "step": 625 }, { "epoch": 0.051577819889593805, "grad_norm": 4.45040160733867, "learning_rate": 8.573388203017833e-06, "loss": 0.2535, "step": 626 }, { "epoch": 0.051660212573123504, "grad_norm": 5.585882232730492, "learning_rate": 8.587105624142663e-06, "loss": 0.6672, "step": 627 }, { "epoch": 0.05174260525665321, "grad_norm": 5.864022504893711, "learning_rate": 8.60082304526749e-06, "loss": 0.7359, "step": 628 }, { "epoch": 0.05182499794018291, "grad_norm": 4.476550092270306, "learning_rate": 8.614540466392319e-06, "loss": 0.4212, "step": 629 }, { "epoch": 0.051907390623712614, "grad_norm": 4.761925495673636, "learning_rate": 8.628257887517147e-06, "loss": 0.5568, "step": 630 }, { "epoch": 0.05198978330724232, "grad_norm": 4.598522719894157, "learning_rate": 8.641975308641975e-06, "loss": 0.2614, "step": 631 }, { "epoch": 0.05207217599077202, "grad_norm": 5.5021749646336175, "learning_rate": 8.655692729766805e-06, "loss": 0.6382, "step": 632 }, { "epoch": 0.052154568674301724, "grad_norm": 6.517012701844157, "learning_rate": 8.669410150891633e-06, "loss": 0.5804, "step": 633 }, { "epoch": 0.05223696135783142, "grad_norm": 6.805425239578879, "learning_rate": 8.683127572016463e-06, "loss": 0.5015, "step": 634 }, { "epoch": 0.05231935404136113, "grad_norm": 4.6738708514525715, "learning_rate": 8.69684499314129e-06, "loss": 0.3992, "step": 635 }, { "epoch": 0.05240174672489083, "grad_norm": 8.234844782748597, "learning_rate": 8.710562414266119e-06, "loss": 0.7507, "step": 636 }, { "epoch": 0.05248413940842053, "grad_norm": 6.698687047110895, "learning_rate": 8.724279835390947e-06, "loss": 0.6197, "step": 637 }, { "epoch": 0.05256653209195024, "grad_norm": 4.1168902182520615, "learning_rate": 8.737997256515776e-06, "loss": 0.3694, "step": 638 }, { "epoch": 0.05264892477547994, "grad_norm": 4.351008788296417, "learning_rate": 8.751714677640604e-06, "loss": 0.4905, "step": 639 }, { "epoch": 0.05273131745900964, "grad_norm": 9.457012453724198, "learning_rate": 8.765432098765432e-06, "loss": 0.7262, "step": 640 }, { "epoch": 0.05281371014253934, "grad_norm": 4.28519533402721, "learning_rate": 8.77914951989026e-06, "loss": 0.4703, "step": 641 }, { "epoch": 0.05289610282606905, "grad_norm": 7.396862648357201, "learning_rate": 8.79286694101509e-06, "loss": 0.4252, "step": 642 }, { "epoch": 0.052978495509598746, "grad_norm": 4.898822144574726, "learning_rate": 8.806584362139918e-06, "loss": 0.4963, "step": 643 }, { "epoch": 0.05306088819312845, "grad_norm": 5.754512361115338, "learning_rate": 8.820301783264746e-06, "loss": 0.7162, "step": 644 }, { "epoch": 0.05314328087665815, "grad_norm": 3.6053519506068605, "learning_rate": 8.834019204389576e-06, "loss": 0.2067, "step": 645 }, { "epoch": 0.053225673560187856, "grad_norm": 13.554388572711437, "learning_rate": 8.847736625514404e-06, "loss": 0.7916, "step": 646 }, { "epoch": 0.05330806624371756, "grad_norm": 9.04741748435677, "learning_rate": 8.861454046639232e-06, "loss": 0.6137, "step": 647 }, { "epoch": 0.05339045892724726, "grad_norm": 5.876495893543201, "learning_rate": 8.87517146776406e-06, "loss": 0.5187, "step": 648 }, { "epoch": 0.053472851610776966, "grad_norm": 16.50292992475323, "learning_rate": 8.888888888888888e-06, "loss": 0.5344, "step": 649 }, { "epoch": 0.053555244294306664, "grad_norm": 16.27884445947484, "learning_rate": 8.902606310013718e-06, "loss": 0.8425, "step": 650 }, { "epoch": 0.05363763697783637, "grad_norm": 6.05812949734961, "learning_rate": 8.916323731138546e-06, "loss": 0.5581, "step": 651 }, { "epoch": 0.05372002966136607, "grad_norm": 4.631703021154219, "learning_rate": 8.930041152263376e-06, "loss": 0.6053, "step": 652 }, { "epoch": 0.053802422344895774, "grad_norm": 5.0840641963520925, "learning_rate": 8.943758573388204e-06, "loss": 0.4591, "step": 653 }, { "epoch": 0.05388481502842547, "grad_norm": 11.145274102530228, "learning_rate": 8.957475994513032e-06, "loss": 0.4096, "step": 654 }, { "epoch": 0.05396720771195518, "grad_norm": 5.150949637450351, "learning_rate": 8.971193415637862e-06, "loss": 0.6682, "step": 655 }, { "epoch": 0.054049600395484884, "grad_norm": 6.053088680153872, "learning_rate": 8.98491083676269e-06, "loss": 0.6298, "step": 656 }, { "epoch": 0.05413199307901458, "grad_norm": 4.576638977362141, "learning_rate": 8.998628257887518e-06, "loss": 0.5385, "step": 657 }, { "epoch": 0.05421438576254429, "grad_norm": 5.279899070079792, "learning_rate": 9.012345679012346e-06, "loss": 0.5517, "step": 658 }, { "epoch": 0.05429677844607399, "grad_norm": 9.816689344331193, "learning_rate": 9.026063100137174e-06, "loss": 0.528, "step": 659 }, { "epoch": 0.05437917112960369, "grad_norm": 5.344268401980664, "learning_rate": 9.039780521262004e-06, "loss": 0.6586, "step": 660 }, { "epoch": 0.05446156381313339, "grad_norm": 4.907761624647467, "learning_rate": 9.053497942386832e-06, "loss": 0.416, "step": 661 }, { "epoch": 0.0545439564966631, "grad_norm": 8.816994387823925, "learning_rate": 9.067215363511661e-06, "loss": 0.8245, "step": 662 }, { "epoch": 0.054626349180192796, "grad_norm": 6.742675433781916, "learning_rate": 9.08093278463649e-06, "loss": 0.4017, "step": 663 }, { "epoch": 0.0547087418637225, "grad_norm": 5.140173007369312, "learning_rate": 9.094650205761317e-06, "loss": 0.5631, "step": 664 }, { "epoch": 0.05479113454725221, "grad_norm": 6.004556037601133, "learning_rate": 9.108367626886147e-06, "loss": 0.5043, "step": 665 }, { "epoch": 0.054873527230781906, "grad_norm": 11.766169776927814, "learning_rate": 9.122085048010975e-06, "loss": 0.5823, "step": 666 }, { "epoch": 0.05495591991431161, "grad_norm": 3.9038912525359484, "learning_rate": 9.135802469135803e-06, "loss": 0.4185, "step": 667 }, { "epoch": 0.05503831259784131, "grad_norm": 5.764415526037728, "learning_rate": 9.149519890260631e-06, "loss": 0.7135, "step": 668 }, { "epoch": 0.055120705281371016, "grad_norm": 5.200914706950143, "learning_rate": 9.16323731138546e-06, "loss": 0.5353, "step": 669 }, { "epoch": 0.055203097964900714, "grad_norm": 5.012014991774245, "learning_rate": 9.17695473251029e-06, "loss": 0.372, "step": 670 }, { "epoch": 0.05528549064843042, "grad_norm": 2.8951621516844677, "learning_rate": 9.190672153635117e-06, "loss": 0.2485, "step": 671 }, { "epoch": 0.05536788333196012, "grad_norm": 5.756434032002608, "learning_rate": 9.204389574759945e-06, "loss": 0.5129, "step": 672 }, { "epoch": 0.055450276015489824, "grad_norm": 5.5513950603318785, "learning_rate": 9.218106995884775e-06, "loss": 0.5475, "step": 673 }, { "epoch": 0.05553266869901953, "grad_norm": 7.155824300287789, "learning_rate": 9.231824417009603e-06, "loss": 0.6587, "step": 674 }, { "epoch": 0.05561506138254923, "grad_norm": 6.693719659190916, "learning_rate": 9.245541838134433e-06, "loss": 0.6653, "step": 675 }, { "epoch": 0.055697454066078934, "grad_norm": 5.899028184857873, "learning_rate": 9.25925925925926e-06, "loss": 0.5912, "step": 676 }, { "epoch": 0.05577984674960863, "grad_norm": 4.996123098753804, "learning_rate": 9.272976680384089e-06, "loss": 0.4981, "step": 677 }, { "epoch": 0.05586223943313834, "grad_norm": 7.59709581355784, "learning_rate": 9.286694101508917e-06, "loss": 0.6263, "step": 678 }, { "epoch": 0.05594463211666804, "grad_norm": 7.995633663002308, "learning_rate": 9.300411522633745e-06, "loss": 0.6424, "step": 679 }, { "epoch": 0.05602702480019774, "grad_norm": 4.669826110736232, "learning_rate": 9.314128943758575e-06, "loss": 0.4041, "step": 680 }, { "epoch": 0.05610941748372744, "grad_norm": 6.343393787810919, "learning_rate": 9.327846364883403e-06, "loss": 0.7479, "step": 681 }, { "epoch": 0.05619181016725715, "grad_norm": 96.70000934708833, "learning_rate": 9.34156378600823e-06, "loss": 2.6674, "step": 682 }, { "epoch": 0.05627420285078685, "grad_norm": 5.214445104499895, "learning_rate": 9.35528120713306e-06, "loss": 0.3759, "step": 683 }, { "epoch": 0.05635659553431655, "grad_norm": 6.426486685956931, "learning_rate": 9.368998628257889e-06, "loss": 0.7181, "step": 684 }, { "epoch": 0.05643898821784626, "grad_norm": 7.141014135224707, "learning_rate": 9.382716049382717e-06, "loss": 0.7562, "step": 685 }, { "epoch": 0.056521380901375956, "grad_norm": 5.988619932398916, "learning_rate": 9.396433470507545e-06, "loss": 0.3709, "step": 686 }, { "epoch": 0.05660377358490566, "grad_norm": 5.891136898704754, "learning_rate": 9.410150891632374e-06, "loss": 0.6162, "step": 687 }, { "epoch": 0.05668616626843536, "grad_norm": 6.170436173120623, "learning_rate": 9.423868312757202e-06, "loss": 0.576, "step": 688 }, { "epoch": 0.056768558951965066, "grad_norm": 6.151013921299717, "learning_rate": 9.43758573388203e-06, "loss": 0.478, "step": 689 }, { "epoch": 0.056850951635494765, "grad_norm": 6.117432660032868, "learning_rate": 9.451303155006859e-06, "loss": 0.5562, "step": 690 }, { "epoch": 0.05693334431902447, "grad_norm": 6.9566546187232206, "learning_rate": 9.465020576131688e-06, "loss": 0.7103, "step": 691 }, { "epoch": 0.057015737002554176, "grad_norm": 7.5413525683464435, "learning_rate": 9.478737997256516e-06, "loss": 0.5024, "step": 692 }, { "epoch": 0.057098129686083875, "grad_norm": 6.783803405617549, "learning_rate": 9.492455418381346e-06, "loss": 0.6286, "step": 693 }, { "epoch": 0.05718052236961358, "grad_norm": 7.745792551245552, "learning_rate": 9.506172839506174e-06, "loss": 0.5794, "step": 694 }, { "epoch": 0.05726291505314328, "grad_norm": 5.774054351127429, "learning_rate": 9.519890260631002e-06, "loss": 0.5072, "step": 695 }, { "epoch": 0.057345307736672985, "grad_norm": 5.0098435672277555, "learning_rate": 9.53360768175583e-06, "loss": 0.5214, "step": 696 }, { "epoch": 0.05742770042020268, "grad_norm": 6.134234294504796, "learning_rate": 9.547325102880658e-06, "loss": 0.6675, "step": 697 }, { "epoch": 0.05751009310373239, "grad_norm": 8.689201856152978, "learning_rate": 9.561042524005488e-06, "loss": 0.5794, "step": 698 }, { "epoch": 0.05759248578726209, "grad_norm": 5.8119206456550145, "learning_rate": 9.574759945130316e-06, "loss": 0.3022, "step": 699 }, { "epoch": 0.05767487847079179, "grad_norm": 32.92612650154318, "learning_rate": 9.588477366255144e-06, "loss": 0.527, "step": 700 }, { "epoch": 0.0577572711543215, "grad_norm": 8.182146639732006, "learning_rate": 9.602194787379974e-06, "loss": 0.6942, "step": 701 }, { "epoch": 0.0578396638378512, "grad_norm": 4.748298256564357, "learning_rate": 9.615912208504802e-06, "loss": 0.3809, "step": 702 }, { "epoch": 0.0579220565213809, "grad_norm": 7.767690253299567, "learning_rate": 9.62962962962963e-06, "loss": 0.7925, "step": 703 }, { "epoch": 0.0580044492049106, "grad_norm": 6.152146994551039, "learning_rate": 9.64334705075446e-06, "loss": 0.6161, "step": 704 }, { "epoch": 0.05808684188844031, "grad_norm": 5.059103423212747, "learning_rate": 9.657064471879288e-06, "loss": 0.32, "step": 705 }, { "epoch": 0.058169234571970006, "grad_norm": 5.104441529492062, "learning_rate": 9.670781893004116e-06, "loss": 0.4551, "step": 706 }, { "epoch": 0.05825162725549971, "grad_norm": 10.994238478560392, "learning_rate": 9.684499314128944e-06, "loss": 0.7538, "step": 707 }, { "epoch": 0.05833401993902942, "grad_norm": 4.968918212244643, "learning_rate": 9.698216735253772e-06, "loss": 0.5932, "step": 708 }, { "epoch": 0.058416412622559116, "grad_norm": 6.423172055805545, "learning_rate": 9.711934156378602e-06, "loss": 0.7786, "step": 709 }, { "epoch": 0.05849880530608882, "grad_norm": 3.880934215636923, "learning_rate": 9.72565157750343e-06, "loss": 0.4232, "step": 710 }, { "epoch": 0.05858119798961852, "grad_norm": 8.269601261803912, "learning_rate": 9.73936899862826e-06, "loss": 0.6481, "step": 711 }, { "epoch": 0.058663590673148226, "grad_norm": 3.8961399197860658, "learning_rate": 9.753086419753087e-06, "loss": 0.3662, "step": 712 }, { "epoch": 0.058745983356677925, "grad_norm": 6.851544898008151, "learning_rate": 9.766803840877916e-06, "loss": 0.6113, "step": 713 }, { "epoch": 0.05882837604020763, "grad_norm": 4.163553582824758, "learning_rate": 9.780521262002745e-06, "loss": 0.4059, "step": 714 }, { "epoch": 0.05891076872373733, "grad_norm": 8.045649533095332, "learning_rate": 9.794238683127573e-06, "loss": 0.5178, "step": 715 }, { "epoch": 0.058993161407267035, "grad_norm": 8.105313201818435, "learning_rate": 9.807956104252401e-06, "loss": 0.6752, "step": 716 }, { "epoch": 0.05907555409079674, "grad_norm": 7.225672961161458, "learning_rate": 9.82167352537723e-06, "loss": 0.557, "step": 717 }, { "epoch": 0.05915794677432644, "grad_norm": 4.4835661046768776, "learning_rate": 9.835390946502057e-06, "loss": 0.4336, "step": 718 }, { "epoch": 0.059240339457856145, "grad_norm": 4.824952188625617, "learning_rate": 9.849108367626887e-06, "loss": 0.4902, "step": 719 }, { "epoch": 0.059322732141385844, "grad_norm": 4.503287721058772, "learning_rate": 9.862825788751715e-06, "loss": 0.4704, "step": 720 }, { "epoch": 0.05940512482491555, "grad_norm": 3.7716661123547413, "learning_rate": 9.876543209876543e-06, "loss": 0.2055, "step": 721 }, { "epoch": 0.05948751750844525, "grad_norm": 5.833818295505862, "learning_rate": 9.890260631001373e-06, "loss": 0.3876, "step": 722 }, { "epoch": 0.059569910191974954, "grad_norm": 8.755791086817371, "learning_rate": 9.903978052126201e-06, "loss": 0.687, "step": 723 }, { "epoch": 0.05965230287550465, "grad_norm": 6.667121450588804, "learning_rate": 9.91769547325103e-06, "loss": 0.4723, "step": 724 }, { "epoch": 0.05973469555903436, "grad_norm": 4.706421774706928, "learning_rate": 9.931412894375859e-06, "loss": 0.5538, "step": 725 }, { "epoch": 0.059817088242564063, "grad_norm": 10.070112083827578, "learning_rate": 9.945130315500687e-06, "loss": 0.8218, "step": 726 }, { "epoch": 0.05989948092609376, "grad_norm": 5.252163320718281, "learning_rate": 9.958847736625515e-06, "loss": 0.4948, "step": 727 }, { "epoch": 0.05998187360962347, "grad_norm": 9.412813237828644, "learning_rate": 9.972565157750343e-06, "loss": 0.81, "step": 728 }, { "epoch": 0.060064266293153167, "grad_norm": 4.587877285973218, "learning_rate": 9.986282578875173e-06, "loss": 0.4304, "step": 729 }, { "epoch": 0.06014665897668287, "grad_norm": 4.75570214128782, "learning_rate": 1e-05, "loss": 0.4285, "step": 730 }, { "epoch": 0.06022905166021257, "grad_norm": 4.53172116436025, "learning_rate": 9.999999955491562e-06, "loss": 0.4013, "step": 731 }, { "epoch": 0.060311444343742276, "grad_norm": 4.01036872409321, "learning_rate": 9.999999821966245e-06, "loss": 0.4827, "step": 732 }, { "epoch": 0.060393837027271975, "grad_norm": 5.750887446275034, "learning_rate": 9.999999599424054e-06, "loss": 0.554, "step": 733 }, { "epoch": 0.06047622971080168, "grad_norm": 11.992520749521988, "learning_rate": 9.99999928786499e-06, "loss": 0.5623, "step": 734 }, { "epoch": 0.060558622394331386, "grad_norm": 8.169748148911982, "learning_rate": 9.999998887289063e-06, "loss": 0.5052, "step": 735 }, { "epoch": 0.060641015077861085, "grad_norm": 4.563084100494972, "learning_rate": 9.999998397696277e-06, "loss": 0.2568, "step": 736 }, { "epoch": 0.06072340776139079, "grad_norm": 5.198391042090528, "learning_rate": 9.999997819086641e-06, "loss": 0.4664, "step": 737 }, { "epoch": 0.06080580044492049, "grad_norm": 5.384798403619332, "learning_rate": 9.999997151460166e-06, "loss": 0.4522, "step": 738 }, { "epoch": 0.060888193128450195, "grad_norm": 4.41206122981705, "learning_rate": 9.999996394816863e-06, "loss": 0.36, "step": 739 }, { "epoch": 0.060970585811979894, "grad_norm": 11.803173169655384, "learning_rate": 9.999995549156746e-06, "loss": 0.4347, "step": 740 }, { "epoch": 0.0610529784955096, "grad_norm": 7.305968155022703, "learning_rate": 9.999994614479829e-06, "loss": 0.6298, "step": 741 }, { "epoch": 0.0611353711790393, "grad_norm": 6.627646463372147, "learning_rate": 9.999993590786133e-06, "loss": 0.4627, "step": 742 }, { "epoch": 0.061217763862569004, "grad_norm": 7.961247066335053, "learning_rate": 9.999992478075669e-06, "loss": 0.7048, "step": 743 }, { "epoch": 0.06130015654609871, "grad_norm": 11.004530222939419, "learning_rate": 9.999991276348463e-06, "loss": 0.3497, "step": 744 }, { "epoch": 0.06138254922962841, "grad_norm": 7.087534685277768, "learning_rate": 9.999989985604533e-06, "loss": 0.7451, "step": 745 }, { "epoch": 0.061464941913158114, "grad_norm": 4.0530498589819155, "learning_rate": 9.999988605843905e-06, "loss": 0.3691, "step": 746 }, { "epoch": 0.06154733459668781, "grad_norm": 3.7550933352714186, "learning_rate": 9.9999871370666e-06, "loss": 0.2569, "step": 747 }, { "epoch": 0.06162972728021752, "grad_norm": 5.5935186429304, "learning_rate": 9.999985579272646e-06, "loss": 0.687, "step": 748 }, { "epoch": 0.06171211996374722, "grad_norm": 5.254856304496337, "learning_rate": 9.99998393246207e-06, "loss": 0.4639, "step": 749 }, { "epoch": 0.06179451264727692, "grad_norm": 7.5048343045294965, "learning_rate": 9.999982196634904e-06, "loss": 0.7888, "step": 750 }, { "epoch": 0.06187690533080662, "grad_norm": 6.480623845749662, "learning_rate": 9.999980371791175e-06, "loss": 0.7256, "step": 751 }, { "epoch": 0.06195929801433633, "grad_norm": 3.675842921010652, "learning_rate": 9.999978457930918e-06, "loss": 0.4443, "step": 752 }, { "epoch": 0.06204169069786603, "grad_norm": 16.26561159606985, "learning_rate": 9.999976455054165e-06, "loss": 0.7932, "step": 753 }, { "epoch": 0.06212408338139573, "grad_norm": 5.493234318965289, "learning_rate": 9.999974363160954e-06, "loss": 0.3184, "step": 754 }, { "epoch": 0.06220647606492544, "grad_norm": 5.1041491695509436, "learning_rate": 9.999972182251323e-06, "loss": 0.6043, "step": 755 }, { "epoch": 0.062288868748455135, "grad_norm": 4.851912554069924, "learning_rate": 9.999969912325307e-06, "loss": 0.5401, "step": 756 }, { "epoch": 0.06237126143198484, "grad_norm": 5.751575373973642, "learning_rate": 9.999967553382947e-06, "loss": 0.641, "step": 757 }, { "epoch": 0.06245365411551454, "grad_norm": 9.081514003066161, "learning_rate": 9.999965105424289e-06, "loss": 0.6134, "step": 758 }, { "epoch": 0.06253604679904425, "grad_norm": 4.811807174517819, "learning_rate": 9.999962568449374e-06, "loss": 0.4439, "step": 759 }, { "epoch": 0.06261843948257395, "grad_norm": 4.62379971606797, "learning_rate": 9.999959942458246e-06, "loss": 0.6629, "step": 760 }, { "epoch": 0.06270083216610366, "grad_norm": 11.03505749827588, "learning_rate": 9.999957227450953e-06, "loss": 0.6224, "step": 761 }, { "epoch": 0.06278322484963335, "grad_norm": 25.986656518689404, "learning_rate": 9.999954423427545e-06, "loss": 0.3417, "step": 762 }, { "epoch": 0.06286561753316305, "grad_norm": 7.11476454390553, "learning_rate": 9.99995153038807e-06, "loss": 0.6036, "step": 763 }, { "epoch": 0.06294801021669276, "grad_norm": 7.766009203845259, "learning_rate": 9.999948548332579e-06, "loss": 0.5608, "step": 764 }, { "epoch": 0.06303040290022247, "grad_norm": 5.920541614447556, "learning_rate": 9.999945477261124e-06, "loss": 0.5298, "step": 765 }, { "epoch": 0.06311279558375216, "grad_norm": 12.67177199319788, "learning_rate": 9.999942317173764e-06, "loss": 0.7621, "step": 766 }, { "epoch": 0.06319518826728186, "grad_norm": 14.925189851609852, "learning_rate": 9.999939068070552e-06, "loss": 0.6965, "step": 767 }, { "epoch": 0.06327758095081157, "grad_norm": 3.6293587522698427, "learning_rate": 9.999935729951547e-06, "loss": 0.4481, "step": 768 }, { "epoch": 0.06335997363434127, "grad_norm": 4.466378171099753, "learning_rate": 9.999932302816808e-06, "loss": 0.5852, "step": 769 }, { "epoch": 0.06344236631787098, "grad_norm": 5.132249154356962, "learning_rate": 9.999928786666395e-06, "loss": 0.3901, "step": 770 }, { "epoch": 0.06352475900140067, "grad_norm": 4.746744082094216, "learning_rate": 9.999925181500372e-06, "loss": 0.4565, "step": 771 }, { "epoch": 0.06360715168493038, "grad_norm": 5.8813150292186505, "learning_rate": 9.999921487318805e-06, "loss": 0.3263, "step": 772 }, { "epoch": 0.06368954436846008, "grad_norm": 4.410894729084581, "learning_rate": 9.999917704121756e-06, "loss": 0.345, "step": 773 }, { "epoch": 0.06377193705198979, "grad_norm": 6.405832396274801, "learning_rate": 9.999913831909292e-06, "loss": 0.8081, "step": 774 }, { "epoch": 0.06385432973551948, "grad_norm": 6.920285535512553, "learning_rate": 9.999909870681486e-06, "loss": 0.6784, "step": 775 }, { "epoch": 0.06393672241904919, "grad_norm": 5.579593067307145, "learning_rate": 9.999905820438407e-06, "loss": 0.578, "step": 776 }, { "epoch": 0.06401911510257889, "grad_norm": 6.074551579414587, "learning_rate": 9.999901681180123e-06, "loss": 0.5795, "step": 777 }, { "epoch": 0.0641015077861086, "grad_norm": 7.013231380533223, "learning_rate": 9.999897452906715e-06, "loss": 0.437, "step": 778 }, { "epoch": 0.0641839004696383, "grad_norm": 5.4113592934672985, "learning_rate": 9.999893135618255e-06, "loss": 0.5025, "step": 779 }, { "epoch": 0.064266293153168, "grad_norm": 6.6071206609748865, "learning_rate": 9.999888729314817e-06, "loss": 0.7329, "step": 780 }, { "epoch": 0.0643486858366977, "grad_norm": 5.440902339138829, "learning_rate": 9.999884233996482e-06, "loss": 0.4127, "step": 781 }, { "epoch": 0.0644310785202274, "grad_norm": 4.994301771597056, "learning_rate": 9.999879649663332e-06, "loss": 0.5911, "step": 782 }, { "epoch": 0.06451347120375711, "grad_norm": 6.0543541357228055, "learning_rate": 9.999874976315443e-06, "loss": 0.5825, "step": 783 }, { "epoch": 0.0645958638872868, "grad_norm": 4.3097161136235655, "learning_rate": 9.999870213952904e-06, "loss": 0.5132, "step": 784 }, { "epoch": 0.06467825657081651, "grad_norm": 6.888363932085567, "learning_rate": 9.999865362575799e-06, "loss": 0.7543, "step": 785 }, { "epoch": 0.06476064925434621, "grad_norm": 4.724388546371243, "learning_rate": 9.999860422184209e-06, "loss": 0.5942, "step": 786 }, { "epoch": 0.06484304193787592, "grad_norm": 7.4935192383230955, "learning_rate": 9.999855392778228e-06, "loss": 0.4375, "step": 787 }, { "epoch": 0.06492543462140563, "grad_norm": 6.498332631896668, "learning_rate": 9.999850274357943e-06, "loss": 0.6782, "step": 788 }, { "epoch": 0.06500782730493532, "grad_norm": 5.187935409845064, "learning_rate": 9.999845066923445e-06, "loss": 0.5646, "step": 789 }, { "epoch": 0.06509021998846502, "grad_norm": 5.9252659829991075, "learning_rate": 9.999839770474827e-06, "loss": 0.4834, "step": 790 }, { "epoch": 0.06517261267199473, "grad_norm": 4.848728708034422, "learning_rate": 9.999834385012184e-06, "loss": 0.4574, "step": 791 }, { "epoch": 0.06525500535552443, "grad_norm": 4.1703778115278665, "learning_rate": 9.999828910535612e-06, "loss": 0.5278, "step": 792 }, { "epoch": 0.06533739803905413, "grad_norm": 5.464196351317177, "learning_rate": 9.999823347045206e-06, "loss": 0.481, "step": 793 }, { "epoch": 0.06541979072258383, "grad_norm": 9.661214620692148, "learning_rate": 9.999817694541067e-06, "loss": 0.7433, "step": 794 }, { "epoch": 0.06550218340611354, "grad_norm": 6.375475870240334, "learning_rate": 9.999811953023297e-06, "loss": 0.6541, "step": 795 }, { "epoch": 0.06558457608964324, "grad_norm": 5.1053662811437865, "learning_rate": 9.999806122491998e-06, "loss": 0.4034, "step": 796 }, { "epoch": 0.06566696877317295, "grad_norm": 4.706843481268382, "learning_rate": 9.99980020294727e-06, "loss": 0.5359, "step": 797 }, { "epoch": 0.06574936145670264, "grad_norm": 5.942544872541858, "learning_rate": 9.99979419438922e-06, "loss": 0.5096, "step": 798 }, { "epoch": 0.06583175414023235, "grad_norm": 7.443525959047362, "learning_rate": 9.999788096817957e-06, "loss": 0.3826, "step": 799 }, { "epoch": 0.06591414682376205, "grad_norm": 5.603804250629685, "learning_rate": 9.999781910233589e-06, "loss": 0.5671, "step": 800 }, { "epoch": 0.06599653950729176, "grad_norm": 4.5063553175865705, "learning_rate": 9.999775634636226e-06, "loss": 0.4009, "step": 801 }, { "epoch": 0.06607893219082145, "grad_norm": 8.076520038006072, "learning_rate": 9.999769270025978e-06, "loss": 0.8698, "step": 802 }, { "epoch": 0.06616132487435115, "grad_norm": 5.500455378105405, "learning_rate": 9.99976281640296e-06, "loss": 0.4873, "step": 803 }, { "epoch": 0.06624371755788086, "grad_norm": 3.348900655594007, "learning_rate": 9.999756273767288e-06, "loss": 0.4912, "step": 804 }, { "epoch": 0.06632611024141057, "grad_norm": 4.699563891054634, "learning_rate": 9.999749642119075e-06, "loss": 0.4622, "step": 805 }, { "epoch": 0.06640850292494027, "grad_norm": 6.752287413544824, "learning_rate": 9.99974292145844e-06, "loss": 0.7243, "step": 806 }, { "epoch": 0.06649089560846996, "grad_norm": 7.99053258176144, "learning_rate": 9.999736111785507e-06, "loss": 0.6806, "step": 807 }, { "epoch": 0.06657328829199967, "grad_norm": 4.006803923880306, "learning_rate": 9.99972921310039e-06, "loss": 0.3127, "step": 808 }, { "epoch": 0.06665568097552937, "grad_norm": 5.684928104174344, "learning_rate": 9.99972222540322e-06, "loss": 0.3958, "step": 809 }, { "epoch": 0.06673807365905908, "grad_norm": 6.2655870002623715, "learning_rate": 9.999715148694114e-06, "loss": 0.4125, "step": 810 }, { "epoch": 0.06682046634258877, "grad_norm": 4.858773634051298, "learning_rate": 9.999707982973203e-06, "loss": 0.4663, "step": 811 }, { "epoch": 0.06690285902611848, "grad_norm": 4.4468583578233645, "learning_rate": 9.999700728240612e-06, "loss": 0.4221, "step": 812 }, { "epoch": 0.06698525170964818, "grad_norm": 5.30420324889732, "learning_rate": 9.999693384496469e-06, "loss": 0.6381, "step": 813 }, { "epoch": 0.06706764439317789, "grad_norm": 5.03373160052799, "learning_rate": 9.99968595174091e-06, "loss": 0.5375, "step": 814 }, { "epoch": 0.0671500370767076, "grad_norm": 4.342016897255091, "learning_rate": 9.999678429974063e-06, "loss": 0.5741, "step": 815 }, { "epoch": 0.06723242976023729, "grad_norm": 4.020245998757659, "learning_rate": 9.999670819196061e-06, "loss": 0.5157, "step": 816 }, { "epoch": 0.06731482244376699, "grad_norm": 5.037402626653137, "learning_rate": 9.999663119407043e-06, "loss": 0.5872, "step": 817 }, { "epoch": 0.0673972151272967, "grad_norm": 5.351119050045274, "learning_rate": 9.999655330607143e-06, "loss": 0.4749, "step": 818 }, { "epoch": 0.0674796078108264, "grad_norm": 3.9429223358828294, "learning_rate": 9.999647452796502e-06, "loss": 0.4117, "step": 819 }, { "epoch": 0.0675620004943561, "grad_norm": 5.59405888819271, "learning_rate": 9.99963948597526e-06, "loss": 0.314, "step": 820 }, { "epoch": 0.0676443931778858, "grad_norm": 4.687434371571609, "learning_rate": 9.999631430143558e-06, "loss": 0.5861, "step": 821 }, { "epoch": 0.0677267858614155, "grad_norm": 4.995276130163719, "learning_rate": 9.999623285301538e-06, "loss": 0.4674, "step": 822 }, { "epoch": 0.06780917854494521, "grad_norm": 4.533079712780735, "learning_rate": 9.999615051449348e-06, "loss": 0.5473, "step": 823 }, { "epoch": 0.06789157122847492, "grad_norm": 4.406552313322744, "learning_rate": 9.999606728587134e-06, "loss": 0.6765, "step": 824 }, { "epoch": 0.06797396391200461, "grad_norm": 5.688948880327145, "learning_rate": 9.999598316715043e-06, "loss": 0.5709, "step": 825 }, { "epoch": 0.06805635659553431, "grad_norm": 4.327277968958632, "learning_rate": 9.999589815833224e-06, "loss": 0.3639, "step": 826 }, { "epoch": 0.06813874927906402, "grad_norm": 5.27747946166832, "learning_rate": 9.999581225941829e-06, "loss": 0.3616, "step": 827 }, { "epoch": 0.06822114196259373, "grad_norm": 4.249209100815019, "learning_rate": 9.999572547041013e-06, "loss": 0.4393, "step": 828 }, { "epoch": 0.06830353464612342, "grad_norm": 7.1321873495194525, "learning_rate": 9.999563779130928e-06, "loss": 0.7852, "step": 829 }, { "epoch": 0.06838592732965312, "grad_norm": 5.422129659924177, "learning_rate": 9.999554922211732e-06, "loss": 0.4847, "step": 830 }, { "epoch": 0.06846832001318283, "grad_norm": 5.212554077950715, "learning_rate": 9.99954597628358e-06, "loss": 0.5653, "step": 831 }, { "epoch": 0.06855071269671253, "grad_norm": 4.785517487664035, "learning_rate": 9.999536941346635e-06, "loss": 0.3497, "step": 832 }, { "epoch": 0.06863310538024224, "grad_norm": 3.9488684791068165, "learning_rate": 9.999527817401053e-06, "loss": 0.4563, "step": 833 }, { "epoch": 0.06871549806377193, "grad_norm": 5.63061411017133, "learning_rate": 9.999518604447003e-06, "loss": 0.5806, "step": 834 }, { "epoch": 0.06879789074730164, "grad_norm": 4.343401510593881, "learning_rate": 9.999509302484642e-06, "loss": 0.5283, "step": 835 }, { "epoch": 0.06888028343083134, "grad_norm": 4.23058174413733, "learning_rate": 9.99949991151414e-06, "loss": 0.558, "step": 836 }, { "epoch": 0.06896267611436105, "grad_norm": 5.669493224488694, "learning_rate": 9.999490431535664e-06, "loss": 0.5318, "step": 837 }, { "epoch": 0.06904506879789074, "grad_norm": 4.17773496395434, "learning_rate": 9.999480862549383e-06, "loss": 0.531, "step": 838 }, { "epoch": 0.06912746148142045, "grad_norm": 5.0993501898509574, "learning_rate": 9.999471204555464e-06, "loss": 0.5367, "step": 839 }, { "epoch": 0.06920985416495015, "grad_norm": 6.258867241760913, "learning_rate": 9.99946145755408e-06, "loss": 0.625, "step": 840 }, { "epoch": 0.06929224684847986, "grad_norm": 6.14568981769412, "learning_rate": 9.999451621545408e-06, "loss": 0.6203, "step": 841 }, { "epoch": 0.06937463953200956, "grad_norm": 4.50524135844046, "learning_rate": 9.99944169652962e-06, "loss": 0.4648, "step": 842 }, { "epoch": 0.06945703221553925, "grad_norm": 4.759545744558735, "learning_rate": 9.999431682506893e-06, "loss": 0.3331, "step": 843 }, { "epoch": 0.06953942489906896, "grad_norm": 4.161271349934926, "learning_rate": 9.999421579477406e-06, "loss": 0.3352, "step": 844 }, { "epoch": 0.06962181758259867, "grad_norm": 4.557594062709752, "learning_rate": 9.99941138744134e-06, "loss": 0.6043, "step": 845 }, { "epoch": 0.06970421026612837, "grad_norm": 5.655003710422895, "learning_rate": 9.999401106398874e-06, "loss": 0.5817, "step": 846 }, { "epoch": 0.06978660294965806, "grad_norm": 4.925240291130296, "learning_rate": 9.999390736350192e-06, "loss": 0.5053, "step": 847 }, { "epoch": 0.06986899563318777, "grad_norm": 5.566654097661635, "learning_rate": 9.99938027729548e-06, "loss": 0.5705, "step": 848 }, { "epoch": 0.06995138831671747, "grad_norm": 4.473152490217241, "learning_rate": 9.999369729234923e-06, "loss": 0.3942, "step": 849 }, { "epoch": 0.07003378100024718, "grad_norm": 3.0533580021318665, "learning_rate": 9.999359092168707e-06, "loss": 0.3085, "step": 850 }, { "epoch": 0.07011617368377689, "grad_norm": 3.8516009324085747, "learning_rate": 9.999348366097024e-06, "loss": 0.3975, "step": 851 }, { "epoch": 0.07019856636730658, "grad_norm": 5.43210019067299, "learning_rate": 9.999337551020062e-06, "loss": 0.5565, "step": 852 }, { "epoch": 0.07028095905083628, "grad_norm": 5.9263640825101405, "learning_rate": 9.999326646938019e-06, "loss": 0.486, "step": 853 }, { "epoch": 0.07036335173436599, "grad_norm": 4.914603335651239, "learning_rate": 9.999315653851085e-06, "loss": 0.5884, "step": 854 }, { "epoch": 0.0704457444178957, "grad_norm": 4.624934035035607, "learning_rate": 9.999304571759456e-06, "loss": 0.446, "step": 855 }, { "epoch": 0.07052813710142539, "grad_norm": 4.290804101316048, "learning_rate": 9.99929340066333e-06, "loss": 0.501, "step": 856 }, { "epoch": 0.07061052978495509, "grad_norm": 9.34261485450524, "learning_rate": 9.999282140562905e-06, "loss": 0.7741, "step": 857 }, { "epoch": 0.0706929224684848, "grad_norm": 4.590850263399005, "learning_rate": 9.999270791458383e-06, "loss": 0.4548, "step": 858 }, { "epoch": 0.0707753151520145, "grad_norm": 4.7695609921791355, "learning_rate": 9.999259353349964e-06, "loss": 0.53, "step": 859 }, { "epoch": 0.07085770783554421, "grad_norm": 3.904977095403514, "learning_rate": 9.999247826237854e-06, "loss": 0.3604, "step": 860 }, { "epoch": 0.0709401005190739, "grad_norm": 8.682031296595586, "learning_rate": 9.999236210122256e-06, "loss": 0.78, "step": 861 }, { "epoch": 0.0710224932026036, "grad_norm": 4.5193801163329494, "learning_rate": 9.999224505003379e-06, "loss": 0.5151, "step": 862 }, { "epoch": 0.07110488588613331, "grad_norm": 6.453604475727589, "learning_rate": 9.999212710881429e-06, "loss": 0.7898, "step": 863 }, { "epoch": 0.07118727856966302, "grad_norm": 5.0929895898718165, "learning_rate": 9.99920082775662e-06, "loss": 0.3436, "step": 864 }, { "epoch": 0.07126967125319271, "grad_norm": 3.977322671420692, "learning_rate": 9.999188855629159e-06, "loss": 0.6677, "step": 865 }, { "epoch": 0.07135206393672242, "grad_norm": 4.011549270668664, "learning_rate": 9.99917679449926e-06, "loss": 0.5224, "step": 866 }, { "epoch": 0.07143445662025212, "grad_norm": 4.242067648981852, "learning_rate": 9.999164644367139e-06, "loss": 0.6407, "step": 867 }, { "epoch": 0.07151684930378183, "grad_norm": 4.6287285335817225, "learning_rate": 9.999152405233013e-06, "loss": 0.4401, "step": 868 }, { "epoch": 0.07159924198731153, "grad_norm": 7.160675102549129, "learning_rate": 9.999140077097096e-06, "loss": 0.6419, "step": 869 }, { "epoch": 0.07168163467084122, "grad_norm": 5.074238907861252, "learning_rate": 9.999127659959613e-06, "loss": 0.6444, "step": 870 }, { "epoch": 0.07176402735437093, "grad_norm": 3.6382794617465253, "learning_rate": 9.999115153820782e-06, "loss": 0.4528, "step": 871 }, { "epoch": 0.07184642003790064, "grad_norm": 4.599057975224609, "learning_rate": 9.999102558680827e-06, "loss": 0.531, "step": 872 }, { "epoch": 0.07192881272143034, "grad_norm": 4.5872619129769046, "learning_rate": 9.999089874539968e-06, "loss": 0.4827, "step": 873 }, { "epoch": 0.07201120540496005, "grad_norm": 3.689648865108007, "learning_rate": 9.999077101398437e-06, "loss": 0.492, "step": 874 }, { "epoch": 0.07209359808848974, "grad_norm": 5.8761872625125715, "learning_rate": 9.999064239256459e-06, "loss": 0.543, "step": 875 }, { "epoch": 0.07217599077201944, "grad_norm": 5.870430873318824, "learning_rate": 9.99905128811426e-06, "loss": 0.5407, "step": 876 }, { "epoch": 0.07225838345554915, "grad_norm": 6.386341937119981, "learning_rate": 9.999038247972076e-06, "loss": 0.6123, "step": 877 }, { "epoch": 0.07234077613907886, "grad_norm": 5.280967236144987, "learning_rate": 9.999025118830134e-06, "loss": 0.4707, "step": 878 }, { "epoch": 0.07242316882260855, "grad_norm": 5.724340686807843, "learning_rate": 9.999011900688672e-06, "loss": 0.6817, "step": 879 }, { "epoch": 0.07250556150613825, "grad_norm": 5.777864365455918, "learning_rate": 9.998998593547923e-06, "loss": 0.7443, "step": 880 }, { "epoch": 0.07258795418966796, "grad_norm": 4.481030640410297, "learning_rate": 9.998985197408122e-06, "loss": 0.4052, "step": 881 }, { "epoch": 0.07267034687319766, "grad_norm": 5.7539373865432895, "learning_rate": 9.998971712269512e-06, "loss": 0.6414, "step": 882 }, { "epoch": 0.07275273955672737, "grad_norm": 5.074248646089831, "learning_rate": 9.99895813813233e-06, "loss": 0.6889, "step": 883 }, { "epoch": 0.07283513224025706, "grad_norm": 6.08389615807529, "learning_rate": 9.998944474996817e-06, "loss": 0.6358, "step": 884 }, { "epoch": 0.07291752492378677, "grad_norm": 7.007593747793593, "learning_rate": 9.99893072286322e-06, "loss": 0.723, "step": 885 }, { "epoch": 0.07299991760731647, "grad_norm": 5.295613649313166, "learning_rate": 9.998916881731781e-06, "loss": 0.5226, "step": 886 }, { "epoch": 0.07308231029084618, "grad_norm": 5.1499745635077225, "learning_rate": 9.998902951602746e-06, "loss": 0.6138, "step": 887 }, { "epoch": 0.07316470297437587, "grad_norm": 4.404865550062876, "learning_rate": 9.998888932476365e-06, "loss": 0.4733, "step": 888 }, { "epoch": 0.07324709565790558, "grad_norm": 5.914838603616755, "learning_rate": 9.998874824352887e-06, "loss": 0.5345, "step": 889 }, { "epoch": 0.07332948834143528, "grad_norm": 9.339300610542013, "learning_rate": 9.99886062723256e-06, "loss": 0.7452, "step": 890 }, { "epoch": 0.07341188102496499, "grad_norm": 5.575845113308426, "learning_rate": 9.998846341115642e-06, "loss": 0.5513, "step": 891 }, { "epoch": 0.07349427370849469, "grad_norm": 4.403803637718023, "learning_rate": 9.998831966002385e-06, "loss": 0.4927, "step": 892 }, { "epoch": 0.07357666639202438, "grad_norm": 4.716127059524325, "learning_rate": 9.998817501893044e-06, "loss": 0.5894, "step": 893 }, { "epoch": 0.07365905907555409, "grad_norm": 4.162456660043013, "learning_rate": 9.998802948787878e-06, "loss": 0.5348, "step": 894 }, { "epoch": 0.0737414517590838, "grad_norm": 7.865128926097237, "learning_rate": 9.998788306687144e-06, "loss": 0.7982, "step": 895 }, { "epoch": 0.0738238444426135, "grad_norm": 5.062814936226218, "learning_rate": 9.998773575591105e-06, "loss": 0.5444, "step": 896 }, { "epoch": 0.07390623712614319, "grad_norm": 5.0766827309307345, "learning_rate": 9.998758755500022e-06, "loss": 0.5396, "step": 897 }, { "epoch": 0.0739886298096729, "grad_norm": 5.588620720256569, "learning_rate": 9.998743846414158e-06, "loss": 0.607, "step": 898 }, { "epoch": 0.0740710224932026, "grad_norm": 4.508870984621243, "learning_rate": 9.998728848333781e-06, "loss": 0.5817, "step": 899 }, { "epoch": 0.07415341517673231, "grad_norm": 6.255880304807757, "learning_rate": 9.998713761259157e-06, "loss": 0.624, "step": 900 }, { "epoch": 0.07423580786026202, "grad_norm": 6.448010541448679, "learning_rate": 9.998698585190554e-06, "loss": 0.592, "step": 901 }, { "epoch": 0.07431820054379171, "grad_norm": 6.581360057122206, "learning_rate": 9.998683320128242e-06, "loss": 0.7091, "step": 902 }, { "epoch": 0.07440059322732141, "grad_norm": 7.178574221854368, "learning_rate": 9.998667966072492e-06, "loss": 0.514, "step": 903 }, { "epoch": 0.07448298591085112, "grad_norm": 5.640474989894486, "learning_rate": 9.998652523023582e-06, "loss": 0.5192, "step": 904 }, { "epoch": 0.07456537859438082, "grad_norm": 4.859242002152594, "learning_rate": 9.99863699098178e-06, "loss": 0.393, "step": 905 }, { "epoch": 0.07464777127791052, "grad_norm": 4.7363551715969905, "learning_rate": 9.998621369947368e-06, "loss": 0.5423, "step": 906 }, { "epoch": 0.07473016396144022, "grad_norm": 7.11873781156602, "learning_rate": 9.998605659920621e-06, "loss": 0.6214, "step": 907 }, { "epoch": 0.07481255664496993, "grad_norm": 4.642272372196515, "learning_rate": 9.99858986090182e-06, "loss": 0.3049, "step": 908 }, { "epoch": 0.07489494932849963, "grad_norm": 4.975074076132109, "learning_rate": 9.998573972891246e-06, "loss": 0.5958, "step": 909 }, { "epoch": 0.07497734201202934, "grad_norm": 6.247010066207867, "learning_rate": 9.998557995889183e-06, "loss": 0.5501, "step": 910 }, { "epoch": 0.07505973469555903, "grad_norm": 6.758308077386494, "learning_rate": 9.998541929895912e-06, "loss": 0.4744, "step": 911 }, { "epoch": 0.07514212737908874, "grad_norm": 6.321946117889499, "learning_rate": 9.998525774911723e-06, "loss": 0.4651, "step": 912 }, { "epoch": 0.07522452006261844, "grad_norm": 5.910428930462289, "learning_rate": 9.998509530936901e-06, "loss": 0.5662, "step": 913 }, { "epoch": 0.07530691274614815, "grad_norm": 4.4469553646271525, "learning_rate": 9.998493197971737e-06, "loss": 0.529, "step": 914 }, { "epoch": 0.07538930542967784, "grad_norm": 6.494702959527984, "learning_rate": 9.998476776016521e-06, "loss": 0.5877, "step": 915 }, { "epoch": 0.07547169811320754, "grad_norm": 7.669316446508771, "learning_rate": 9.998460265071546e-06, "loss": 0.6812, "step": 916 }, { "epoch": 0.07555409079673725, "grad_norm": 14.38156671027216, "learning_rate": 9.998443665137104e-06, "loss": 0.8116, "step": 917 }, { "epoch": 0.07563648348026696, "grad_norm": 8.288458207815854, "learning_rate": 9.998426976213493e-06, "loss": 0.656, "step": 918 }, { "epoch": 0.07571887616379666, "grad_norm": 3.703380187983116, "learning_rate": 9.998410198301007e-06, "loss": 0.3688, "step": 919 }, { "epoch": 0.07580126884732635, "grad_norm": 5.446454718623625, "learning_rate": 9.99839333139995e-06, "loss": 0.5218, "step": 920 }, { "epoch": 0.07588366153085606, "grad_norm": 5.7339819465370825, "learning_rate": 9.998376375510617e-06, "loss": 0.4874, "step": 921 }, { "epoch": 0.07596605421438576, "grad_norm": 5.321192426624907, "learning_rate": 9.99835933063331e-06, "loss": 0.3845, "step": 922 }, { "epoch": 0.07604844689791547, "grad_norm": 5.797682110550929, "learning_rate": 9.998342196768337e-06, "loss": 0.4487, "step": 923 }, { "epoch": 0.07613083958144516, "grad_norm": 6.324776101604922, "learning_rate": 9.998324973915999e-06, "loss": 0.5774, "step": 924 }, { "epoch": 0.07621323226497487, "grad_norm": 6.886486065521574, "learning_rate": 9.998307662076604e-06, "loss": 0.5918, "step": 925 }, { "epoch": 0.07629562494850457, "grad_norm": 4.598957573831131, "learning_rate": 9.998290261250461e-06, "loss": 0.5424, "step": 926 }, { "epoch": 0.07637801763203428, "grad_norm": 5.054899919494791, "learning_rate": 9.998272771437878e-06, "loss": 0.3453, "step": 927 }, { "epoch": 0.07646041031556398, "grad_norm": 4.915345392459135, "learning_rate": 9.998255192639167e-06, "loss": 0.5505, "step": 928 }, { "epoch": 0.07654280299909368, "grad_norm": 13.147418222703912, "learning_rate": 9.998237524854643e-06, "loss": 0.7975, "step": 929 }, { "epoch": 0.07662519568262338, "grad_norm": 4.249042164087028, "learning_rate": 9.998219768084619e-06, "loss": 0.5132, "step": 930 }, { "epoch": 0.07670758836615309, "grad_norm": 4.6864159906629865, "learning_rate": 9.998201922329409e-06, "loss": 0.5093, "step": 931 }, { "epoch": 0.07678998104968279, "grad_norm": 4.704532568350368, "learning_rate": 9.998183987589332e-06, "loss": 0.5385, "step": 932 }, { "epoch": 0.07687237373321248, "grad_norm": 4.250326737192859, "learning_rate": 9.99816596386471e-06, "loss": 0.5081, "step": 933 }, { "epoch": 0.07695476641674219, "grad_norm": 3.2609943143186393, "learning_rate": 9.998147851155862e-06, "loss": 0.3386, "step": 934 }, { "epoch": 0.0770371591002719, "grad_norm": 4.686119380610933, "learning_rate": 9.998129649463108e-06, "loss": 0.4959, "step": 935 }, { "epoch": 0.0771195517838016, "grad_norm": 8.093847783734134, "learning_rate": 9.998111358786777e-06, "loss": 0.6091, "step": 936 }, { "epoch": 0.07720194446733131, "grad_norm": 4.128988601766982, "learning_rate": 9.998092979127191e-06, "loss": 0.3564, "step": 937 }, { "epoch": 0.077284337150861, "grad_norm": 12.520048545236683, "learning_rate": 9.998074510484679e-06, "loss": 0.8451, "step": 938 }, { "epoch": 0.0773667298343907, "grad_norm": 5.0581596409425025, "learning_rate": 9.998055952859567e-06, "loss": 0.4862, "step": 939 }, { "epoch": 0.07744912251792041, "grad_norm": 6.315311219118197, "learning_rate": 9.998037306252188e-06, "loss": 0.5742, "step": 940 }, { "epoch": 0.07753151520145012, "grad_norm": 5.819386724118808, "learning_rate": 9.998018570662875e-06, "loss": 0.6873, "step": 941 }, { "epoch": 0.07761390788497981, "grad_norm": 6.606119564610502, "learning_rate": 9.99799974609196e-06, "loss": 0.6462, "step": 942 }, { "epoch": 0.07769630056850951, "grad_norm": 4.9064741570302965, "learning_rate": 9.997980832539775e-06, "loss": 0.4563, "step": 943 }, { "epoch": 0.07777869325203922, "grad_norm": 6.403026987978093, "learning_rate": 9.997961830006663e-06, "loss": 0.5919, "step": 944 }, { "epoch": 0.07786108593556892, "grad_norm": 3.995002622383316, "learning_rate": 9.997942738492959e-06, "loss": 0.6035, "step": 945 }, { "epoch": 0.07794347861909863, "grad_norm": 5.979508594311581, "learning_rate": 9.997923557999001e-06, "loss": 0.6438, "step": 946 }, { "epoch": 0.07802587130262832, "grad_norm": 12.403971641606825, "learning_rate": 9.997904288525133e-06, "loss": 0.7055, "step": 947 }, { "epoch": 0.07810826398615803, "grad_norm": 5.065356925074259, "learning_rate": 9.997884930071698e-06, "loss": 0.489, "step": 948 }, { "epoch": 0.07819065666968773, "grad_norm": 6.739162552025105, "learning_rate": 9.99786548263904e-06, "loss": 0.6978, "step": 949 }, { "epoch": 0.07827304935321744, "grad_norm": 5.869680200446658, "learning_rate": 9.997845946227506e-06, "loss": 0.6373, "step": 950 }, { "epoch": 0.07835544203674713, "grad_norm": 5.498434870964948, "learning_rate": 9.997826320837445e-06, "loss": 0.5772, "step": 951 }, { "epoch": 0.07843783472027684, "grad_norm": 6.304272494085451, "learning_rate": 9.997806606469201e-06, "loss": 0.3892, "step": 952 }, { "epoch": 0.07852022740380654, "grad_norm": 6.170055796070485, "learning_rate": 9.997786803123131e-06, "loss": 0.5676, "step": 953 }, { "epoch": 0.07860262008733625, "grad_norm": 4.355001196596673, "learning_rate": 9.997766910799585e-06, "loss": 0.3932, "step": 954 }, { "epoch": 0.07868501277086595, "grad_norm": 4.587353573095941, "learning_rate": 9.997746929498915e-06, "loss": 0.4951, "step": 955 }, { "epoch": 0.07876740545439564, "grad_norm": 4.7560661272306115, "learning_rate": 9.997726859221482e-06, "loss": 0.3379, "step": 956 }, { "epoch": 0.07884979813792535, "grad_norm": 3.875204186498522, "learning_rate": 9.997706699967638e-06, "loss": 0.4937, "step": 957 }, { "epoch": 0.07893219082145506, "grad_norm": 4.5436308343446665, "learning_rate": 9.997686451737745e-06, "loss": 0.4664, "step": 958 }, { "epoch": 0.07901458350498476, "grad_norm": 5.085715886421841, "learning_rate": 9.997666114532166e-06, "loss": 0.3532, "step": 959 }, { "epoch": 0.07909697618851445, "grad_norm": 5.736037074111639, "learning_rate": 9.997645688351256e-06, "loss": 0.6229, "step": 960 }, { "epoch": 0.07917936887204416, "grad_norm": 4.6641539689143885, "learning_rate": 9.997625173195384e-06, "loss": 0.5927, "step": 961 }, { "epoch": 0.07926176155557386, "grad_norm": 6.986142276021053, "learning_rate": 9.997604569064913e-06, "loss": 0.6011, "step": 962 }, { "epoch": 0.07934415423910357, "grad_norm": 4.936390533586499, "learning_rate": 9.99758387596021e-06, "loss": 0.6004, "step": 963 }, { "epoch": 0.07942654692263328, "grad_norm": 25.174262625674295, "learning_rate": 9.997563093881647e-06, "loss": 1.0481, "step": 964 }, { "epoch": 0.07950893960616297, "grad_norm": 5.911910212113453, "learning_rate": 9.997542222829588e-06, "loss": 0.5712, "step": 965 }, { "epoch": 0.07959133228969267, "grad_norm": 5.683154041671146, "learning_rate": 9.997521262804408e-06, "loss": 0.5152, "step": 966 }, { "epoch": 0.07967372497322238, "grad_norm": 6.247591970007456, "learning_rate": 9.997500213806481e-06, "loss": 0.589, "step": 967 }, { "epoch": 0.07975611765675208, "grad_norm": 6.037728527699037, "learning_rate": 9.997479075836179e-06, "loss": 0.4865, "step": 968 }, { "epoch": 0.07983851034028178, "grad_norm": 4.848668306571336, "learning_rate": 9.997457848893881e-06, "loss": 0.5364, "step": 969 }, { "epoch": 0.07992090302381148, "grad_norm": 5.738086820942736, "learning_rate": 9.997436532979963e-06, "loss": 0.4475, "step": 970 }, { "epoch": 0.08000329570734119, "grad_norm": 7.7003958590754245, "learning_rate": 9.997415128094805e-06, "loss": 0.6297, "step": 971 }, { "epoch": 0.0800856883908709, "grad_norm": 4.6301976482339375, "learning_rate": 9.997393634238789e-06, "loss": 0.4354, "step": 972 }, { "epoch": 0.0801680810744006, "grad_norm": 5.051471456165753, "learning_rate": 9.997372051412296e-06, "loss": 0.5246, "step": 973 }, { "epoch": 0.08025047375793029, "grad_norm": 6.840214215762055, "learning_rate": 9.997350379615712e-06, "loss": 0.6289, "step": 974 }, { "epoch": 0.08033286644146, "grad_norm": 5.025951954709692, "learning_rate": 9.997328618849422e-06, "loss": 0.6347, "step": 975 }, { "epoch": 0.0804152591249897, "grad_norm": 4.175068481847607, "learning_rate": 9.997306769113812e-06, "loss": 0.4474, "step": 976 }, { "epoch": 0.08049765180851941, "grad_norm": 6.49606794332559, "learning_rate": 9.997284830409275e-06, "loss": 0.7058, "step": 977 }, { "epoch": 0.0805800444920491, "grad_norm": 4.355993261479888, "learning_rate": 9.997262802736197e-06, "loss": 0.3175, "step": 978 }, { "epoch": 0.0806624371755788, "grad_norm": 4.173278705106478, "learning_rate": 9.997240686094974e-06, "loss": 0.4082, "step": 979 }, { "epoch": 0.08074482985910851, "grad_norm": 3.6798094983336638, "learning_rate": 9.997218480485994e-06, "loss": 0.266, "step": 980 }, { "epoch": 0.08082722254263822, "grad_norm": 6.875883284171418, "learning_rate": 9.997196185909662e-06, "loss": 0.397, "step": 981 }, { "epoch": 0.08090961522616792, "grad_norm": 4.7547519593139125, "learning_rate": 9.997173802366365e-06, "loss": 0.6101, "step": 982 }, { "epoch": 0.08099200790969761, "grad_norm": 7.068828139356332, "learning_rate": 9.997151329856508e-06, "loss": 0.753, "step": 983 }, { "epoch": 0.08107440059322732, "grad_norm": 5.32212084183463, "learning_rate": 9.997128768380486e-06, "loss": 0.5187, "step": 984 }, { "epoch": 0.08115679327675702, "grad_norm": 4.63635990769238, "learning_rate": 9.997106117938704e-06, "loss": 0.5448, "step": 985 }, { "epoch": 0.08123918596028673, "grad_norm": 5.156911960898855, "learning_rate": 9.997083378531567e-06, "loss": 0.6237, "step": 986 }, { "epoch": 0.08132157864381642, "grad_norm": 66.02610969540054, "learning_rate": 9.997060550159477e-06, "loss": 2.7918, "step": 987 }, { "epoch": 0.08140397132734613, "grad_norm": 4.63314347325949, "learning_rate": 9.997037632822839e-06, "loss": 0.4784, "step": 988 }, { "epoch": 0.08148636401087583, "grad_norm": 17.125886475254017, "learning_rate": 9.997014626522064e-06, "loss": 0.5182, "step": 989 }, { "epoch": 0.08156875669440554, "grad_norm": 5.112815655421804, "learning_rate": 9.99699153125756e-06, "loss": 0.5992, "step": 990 }, { "epoch": 0.08165114937793524, "grad_norm": 4.634165444142863, "learning_rate": 9.996968347029739e-06, "loss": 0.5552, "step": 991 }, { "epoch": 0.08173354206146494, "grad_norm": 4.500448623996411, "learning_rate": 9.996945073839015e-06, "loss": 0.5293, "step": 992 }, { "epoch": 0.08181593474499464, "grad_norm": 4.985902758629872, "learning_rate": 9.996921711685798e-06, "loss": 0.5077, "step": 993 }, { "epoch": 0.08189832742852435, "grad_norm": 11.160741895344973, "learning_rate": 9.99689826057051e-06, "loss": 0.6087, "step": 994 }, { "epoch": 0.08198072011205405, "grad_norm": 5.767479903001831, "learning_rate": 9.996874720493563e-06, "loss": 0.5006, "step": 995 }, { "epoch": 0.08206311279558375, "grad_norm": 4.604269350433775, "learning_rate": 9.996851091455379e-06, "loss": 0.4231, "step": 996 }, { "epoch": 0.08214550547911345, "grad_norm": 6.964228664965358, "learning_rate": 9.996827373456379e-06, "loss": 0.7993, "step": 997 }, { "epoch": 0.08222789816264316, "grad_norm": 4.503845133301578, "learning_rate": 9.996803566496982e-06, "loss": 0.574, "step": 998 }, { "epoch": 0.08231029084617286, "grad_norm": 5.102706241374167, "learning_rate": 9.996779670577615e-06, "loss": 0.523, "step": 999 }, { "epoch": 0.08239268352970257, "grad_norm": 4.718633665300077, "learning_rate": 9.996755685698703e-06, "loss": 0.4039, "step": 1000 }, { "epoch": 0.08247507621323226, "grad_norm": 31.817192653927222, "learning_rate": 9.996731611860674e-06, "loss": 0.4298, "step": 1001 }, { "epoch": 0.08255746889676197, "grad_norm": 21.27663857241567, "learning_rate": 9.996707449063952e-06, "loss": 0.2222, "step": 1002 }, { "epoch": 0.08263986158029167, "grad_norm": 4.401888936601498, "learning_rate": 9.996683197308973e-06, "loss": 0.4995, "step": 1003 }, { "epoch": 0.08272225426382138, "grad_norm": 5.508952355584309, "learning_rate": 9.996658856596165e-06, "loss": 0.5681, "step": 1004 }, { "epoch": 0.08280464694735108, "grad_norm": 4.992828446636793, "learning_rate": 9.996634426925962e-06, "loss": 0.4845, "step": 1005 }, { "epoch": 0.08288703963088077, "grad_norm": 7.4045853788823015, "learning_rate": 9.9966099082988e-06, "loss": 0.6107, "step": 1006 }, { "epoch": 0.08296943231441048, "grad_norm": 7.359292979226814, "learning_rate": 9.996585300715117e-06, "loss": 0.7944, "step": 1007 }, { "epoch": 0.08305182499794019, "grad_norm": 5.680106009666031, "learning_rate": 9.996560604175344e-06, "loss": 0.4504, "step": 1008 }, { "epoch": 0.08313421768146989, "grad_norm": 4.403376579772996, "learning_rate": 9.99653581867993e-06, "loss": 0.5481, "step": 1009 }, { "epoch": 0.08321661036499958, "grad_norm": 10.223711349213156, "learning_rate": 9.99651094422931e-06, "loss": 0.9755, "step": 1010 }, { "epoch": 0.08329900304852929, "grad_norm": 4.683680775421051, "learning_rate": 9.99648598082393e-06, "loss": 0.6508, "step": 1011 }, { "epoch": 0.083381395732059, "grad_norm": 6.840856586449906, "learning_rate": 9.99646092846423e-06, "loss": 0.6494, "step": 1012 }, { "epoch": 0.0834637884155887, "grad_norm": 5.564536440133124, "learning_rate": 9.996435787150663e-06, "loss": 0.6494, "step": 1013 }, { "epoch": 0.0835461810991184, "grad_norm": 8.55630832138283, "learning_rate": 9.996410556883672e-06, "loss": 0.5978, "step": 1014 }, { "epoch": 0.0836285737826481, "grad_norm": 4.505682298137676, "learning_rate": 9.996385237663706e-06, "loss": 0.4981, "step": 1015 }, { "epoch": 0.0837109664661778, "grad_norm": 6.026491102637283, "learning_rate": 9.996359829491218e-06, "loss": 0.6929, "step": 1016 }, { "epoch": 0.08379335914970751, "grad_norm": 5.23031637253995, "learning_rate": 9.996334332366658e-06, "loss": 0.468, "step": 1017 }, { "epoch": 0.08387575183323721, "grad_norm": 6.02650829355012, "learning_rate": 9.996308746290482e-06, "loss": 0.6166, "step": 1018 }, { "epoch": 0.0839581445167669, "grad_norm": 4.1819439593996535, "learning_rate": 9.996283071263145e-06, "loss": 0.4417, "step": 1019 }, { "epoch": 0.08404053720029661, "grad_norm": 5.489497484777263, "learning_rate": 9.996257307285102e-06, "loss": 0.441, "step": 1020 }, { "epoch": 0.08412292988382632, "grad_norm": 6.1845291497461545, "learning_rate": 9.996231454356814e-06, "loss": 0.8055, "step": 1021 }, { "epoch": 0.08420532256735602, "grad_norm": 6.335171357274994, "learning_rate": 9.996205512478741e-06, "loss": 0.7177, "step": 1022 }, { "epoch": 0.08428771525088573, "grad_norm": 5.77099800958352, "learning_rate": 9.996179481651345e-06, "loss": 0.6201, "step": 1023 }, { "epoch": 0.08437010793441542, "grad_norm": 5.277909161458072, "learning_rate": 9.996153361875086e-06, "loss": 0.5087, "step": 1024 }, { "epoch": 0.08445250061794513, "grad_norm": 4.995606457759668, "learning_rate": 9.996127153150436e-06, "loss": 0.4032, "step": 1025 }, { "epoch": 0.08453489330147483, "grad_norm": 4.431456907318335, "learning_rate": 9.996100855477856e-06, "loss": 0.3881, "step": 1026 }, { "epoch": 0.08461728598500454, "grad_norm": 3.511167845084898, "learning_rate": 9.996074468857815e-06, "loss": 0.4317, "step": 1027 }, { "epoch": 0.08469967866853423, "grad_norm": 4.355083337852244, "learning_rate": 9.996047993290784e-06, "loss": 0.481, "step": 1028 }, { "epoch": 0.08478207135206393, "grad_norm": 4.7477919241040425, "learning_rate": 9.996021428777234e-06, "loss": 0.4123, "step": 1029 }, { "epoch": 0.08486446403559364, "grad_norm": 4.845625383441968, "learning_rate": 9.99599477531764e-06, "loss": 0.7018, "step": 1030 }, { "epoch": 0.08494685671912335, "grad_norm": 5.263537268769247, "learning_rate": 9.995968032912471e-06, "loss": 0.5584, "step": 1031 }, { "epoch": 0.08502924940265305, "grad_norm": 4.316936086223206, "learning_rate": 9.995941201562207e-06, "loss": 0.5342, "step": 1032 }, { "epoch": 0.08511164208618274, "grad_norm": 4.862616503432274, "learning_rate": 9.995914281267326e-06, "loss": 0.5874, "step": 1033 }, { "epoch": 0.08519403476971245, "grad_norm": 6.347242300001709, "learning_rate": 9.995887272028307e-06, "loss": 0.6603, "step": 1034 }, { "epoch": 0.08527642745324215, "grad_norm": 4.525346388262154, "learning_rate": 9.995860173845629e-06, "loss": 0.441, "step": 1035 }, { "epoch": 0.08535882013677186, "grad_norm": 4.606242184878409, "learning_rate": 9.995832986719776e-06, "loss": 0.5658, "step": 1036 }, { "epoch": 0.08544121282030155, "grad_norm": 5.55541041688934, "learning_rate": 9.995805710651233e-06, "loss": 0.5883, "step": 1037 }, { "epoch": 0.08552360550383126, "grad_norm": 4.805999702498609, "learning_rate": 9.995778345640481e-06, "loss": 0.5197, "step": 1038 }, { "epoch": 0.08560599818736096, "grad_norm": 5.6071103942756535, "learning_rate": 9.995750891688013e-06, "loss": 0.4935, "step": 1039 }, { "epoch": 0.08568839087089067, "grad_norm": 50.57617337717754, "learning_rate": 9.995723348794315e-06, "loss": 2.4806, "step": 1040 }, { "epoch": 0.08577078355442037, "grad_norm": 6.25280098712178, "learning_rate": 9.995695716959877e-06, "loss": 0.6218, "step": 1041 }, { "epoch": 0.08585317623795007, "grad_norm": 3.9373446441524025, "learning_rate": 9.995667996185193e-06, "loss": 0.5496, "step": 1042 }, { "epoch": 0.08593556892147977, "grad_norm": 4.343696783721304, "learning_rate": 9.995640186470755e-06, "loss": 0.4876, "step": 1043 }, { "epoch": 0.08601796160500948, "grad_norm": 5.0506015991461, "learning_rate": 9.995612287817056e-06, "loss": 0.5382, "step": 1044 }, { "epoch": 0.08610035428853918, "grad_norm": 3.3472528865228015, "learning_rate": 9.995584300224597e-06, "loss": 0.4219, "step": 1045 }, { "epoch": 0.08618274697206887, "grad_norm": 3.6590082207213666, "learning_rate": 9.995556223693874e-06, "loss": 0.4519, "step": 1046 }, { "epoch": 0.08626513965559858, "grad_norm": 3.3110168095564414, "learning_rate": 9.995528058225386e-06, "loss": 0.3475, "step": 1047 }, { "epoch": 0.08634753233912829, "grad_norm": 5.863164197441982, "learning_rate": 9.995499803819637e-06, "loss": 0.6212, "step": 1048 }, { "epoch": 0.08642992502265799, "grad_norm": 5.518382693746084, "learning_rate": 9.995471460477127e-06, "loss": 0.5021, "step": 1049 }, { "epoch": 0.0865123177061877, "grad_norm": 6.110381483524074, "learning_rate": 9.995443028198362e-06, "loss": 0.4432, "step": 1050 }, { "epoch": 0.08659471038971739, "grad_norm": 4.892003499667656, "learning_rate": 9.99541450698385e-06, "loss": 0.3957, "step": 1051 }, { "epoch": 0.0866771030732471, "grad_norm": 7.104164940402655, "learning_rate": 9.995385896834095e-06, "loss": 0.676, "step": 1052 }, { "epoch": 0.0867594957567768, "grad_norm": 11.570101337811126, "learning_rate": 9.995357197749611e-06, "loss": 0.5474, "step": 1053 }, { "epoch": 0.0868418884403065, "grad_norm": 4.624106631340867, "learning_rate": 9.995328409730905e-06, "loss": 0.506, "step": 1054 }, { "epoch": 0.0869242811238362, "grad_norm": 5.743273453010476, "learning_rate": 9.99529953277849e-06, "loss": 0.3698, "step": 1055 }, { "epoch": 0.0870066738073659, "grad_norm": 8.029930627600796, "learning_rate": 9.995270566892884e-06, "loss": 0.5471, "step": 1056 }, { "epoch": 0.08708906649089561, "grad_norm": 5.194894530300615, "learning_rate": 9.995241512074596e-06, "loss": 0.6335, "step": 1057 }, { "epoch": 0.08717145917442531, "grad_norm": 6.84466633274184, "learning_rate": 9.995212368324147e-06, "loss": 0.5793, "step": 1058 }, { "epoch": 0.08725385185795502, "grad_norm": 7.268033072915504, "learning_rate": 9.99518313564206e-06, "loss": 0.653, "step": 1059 }, { "epoch": 0.08733624454148471, "grad_norm": 4.021612447179571, "learning_rate": 9.995153814028846e-06, "loss": 0.4125, "step": 1060 }, { "epoch": 0.08741863722501442, "grad_norm": 5.826262293101434, "learning_rate": 9.995124403485036e-06, "loss": 0.5812, "step": 1061 }, { "epoch": 0.08750102990854412, "grad_norm": 4.557140038560946, "learning_rate": 9.995094904011148e-06, "loss": 0.3531, "step": 1062 }, { "epoch": 0.08758342259207383, "grad_norm": 7.238981062698408, "learning_rate": 9.99506531560771e-06, "loss": 0.7622, "step": 1063 }, { "epoch": 0.08766581527560352, "grad_norm": 4.21469929038344, "learning_rate": 9.995035638275248e-06, "loss": 0.6258, "step": 1064 }, { "epoch": 0.08774820795913323, "grad_norm": 3.6655616059927345, "learning_rate": 9.995005872014289e-06, "loss": 0.3423, "step": 1065 }, { "epoch": 0.08783060064266293, "grad_norm": 5.412977135346199, "learning_rate": 9.994976016825367e-06, "loss": 0.6841, "step": 1066 }, { "epoch": 0.08791299332619264, "grad_norm": 5.379025435070761, "learning_rate": 9.994946072709007e-06, "loss": 0.6847, "step": 1067 }, { "epoch": 0.08799538600972234, "grad_norm": 4.472227140989464, "learning_rate": 9.994916039665748e-06, "loss": 0.4647, "step": 1068 }, { "epoch": 0.08807777869325203, "grad_norm": 5.060338129572461, "learning_rate": 9.994885917696122e-06, "loss": 0.6175, "step": 1069 }, { "epoch": 0.08816017137678174, "grad_norm": 4.715322598305957, "learning_rate": 9.994855706800666e-06, "loss": 0.4338, "step": 1070 }, { "epoch": 0.08824256406031145, "grad_norm": 4.281526810708517, "learning_rate": 9.994825406979918e-06, "loss": 0.5457, "step": 1071 }, { "epoch": 0.08832495674384115, "grad_norm": 3.4716224269919476, "learning_rate": 9.994795018234416e-06, "loss": 0.4955, "step": 1072 }, { "epoch": 0.08840734942737084, "grad_norm": 5.027771756957362, "learning_rate": 9.994764540564702e-06, "loss": 0.6585, "step": 1073 }, { "epoch": 0.08848974211090055, "grad_norm": 4.724781093913095, "learning_rate": 9.99473397397132e-06, "loss": 0.4618, "step": 1074 }, { "epoch": 0.08857213479443025, "grad_norm": 6.9846657667334915, "learning_rate": 9.99470331845481e-06, "loss": 0.5641, "step": 1075 }, { "epoch": 0.08865452747795996, "grad_norm": 5.484701110827303, "learning_rate": 9.994672574015724e-06, "loss": 0.542, "step": 1076 }, { "epoch": 0.08873692016148967, "grad_norm": 7.037433703723267, "learning_rate": 9.994641740654604e-06, "loss": 0.4367, "step": 1077 }, { "epoch": 0.08881931284501936, "grad_norm": 5.776582968652833, "learning_rate": 9.994610818372002e-06, "loss": 0.5423, "step": 1078 }, { "epoch": 0.08890170552854906, "grad_norm": 6.745245173679915, "learning_rate": 9.994579807168468e-06, "loss": 0.698, "step": 1079 }, { "epoch": 0.08898409821207877, "grad_norm": 5.473254655410174, "learning_rate": 9.994548707044551e-06, "loss": 0.4812, "step": 1080 }, { "epoch": 0.08906649089560847, "grad_norm": 4.628085367767044, "learning_rate": 9.994517518000809e-06, "loss": 0.4693, "step": 1081 }, { "epoch": 0.08914888357913817, "grad_norm": 7.525465774535276, "learning_rate": 9.994486240037794e-06, "loss": 0.6911, "step": 1082 }, { "epoch": 0.08923127626266787, "grad_norm": 5.893129420637143, "learning_rate": 9.994454873156068e-06, "loss": 0.6289, "step": 1083 }, { "epoch": 0.08931366894619758, "grad_norm": 5.50149802924285, "learning_rate": 9.994423417356183e-06, "loss": 0.6096, "step": 1084 }, { "epoch": 0.08939606162972728, "grad_norm": 4.773359114172165, "learning_rate": 9.994391872638702e-06, "loss": 0.4555, "step": 1085 }, { "epoch": 0.08947845431325699, "grad_norm": 7.212055616353917, "learning_rate": 9.994360239004186e-06, "loss": 0.7443, "step": 1086 }, { "epoch": 0.08956084699678668, "grad_norm": 7.876765760130183, "learning_rate": 9.9943285164532e-06, "loss": 0.7077, "step": 1087 }, { "epoch": 0.08964323968031639, "grad_norm": 7.317645647763081, "learning_rate": 9.994296704986306e-06, "loss": 0.6041, "step": 1088 }, { "epoch": 0.08972563236384609, "grad_norm": 4.530023086536175, "learning_rate": 9.994264804604073e-06, "loss": 0.4931, "step": 1089 }, { "epoch": 0.0898080250473758, "grad_norm": 4.2526214869784065, "learning_rate": 9.994232815307065e-06, "loss": 0.2995, "step": 1090 }, { "epoch": 0.08989041773090549, "grad_norm": 3.505242547342838, "learning_rate": 9.994200737095857e-06, "loss": 0.4473, "step": 1091 }, { "epoch": 0.0899728104144352, "grad_norm": 4.429638177463622, "learning_rate": 9.994168569971017e-06, "loss": 0.5841, "step": 1092 }, { "epoch": 0.0900552030979649, "grad_norm": 5.82955819981545, "learning_rate": 9.994136313933117e-06, "loss": 0.4789, "step": 1093 }, { "epoch": 0.0901375957814946, "grad_norm": 13.672647133825611, "learning_rate": 9.994103968982733e-06, "loss": 0.8772, "step": 1094 }, { "epoch": 0.09021998846502431, "grad_norm": 4.710965238730624, "learning_rate": 9.994071535120439e-06, "loss": 0.4686, "step": 1095 }, { "epoch": 0.090302381148554, "grad_norm": 3.9920741559548367, "learning_rate": 9.994039012346814e-06, "loss": 0.4907, "step": 1096 }, { "epoch": 0.09038477383208371, "grad_norm": 5.7078667377661825, "learning_rate": 9.994006400662436e-06, "loss": 0.617, "step": 1097 }, { "epoch": 0.09046716651561341, "grad_norm": 3.8186640801772125, "learning_rate": 9.993973700067888e-06, "loss": 0.4375, "step": 1098 }, { "epoch": 0.09054955919914312, "grad_norm": 5.873776628166568, "learning_rate": 9.99394091056375e-06, "loss": 0.5642, "step": 1099 }, { "epoch": 0.09063195188267281, "grad_norm": 6.100201545489032, "learning_rate": 9.993908032150604e-06, "loss": 0.603, "step": 1100 }, { "epoch": 0.09071434456620252, "grad_norm": 5.704701398966367, "learning_rate": 9.99387506482904e-06, "loss": 0.4836, "step": 1101 }, { "epoch": 0.09079673724973222, "grad_norm": 3.5476812408346934, "learning_rate": 9.99384200859964e-06, "loss": 0.2808, "step": 1102 }, { "epoch": 0.09087912993326193, "grad_norm": 4.151988355820425, "learning_rate": 9.993808863462995e-06, "loss": 0.381, "step": 1103 }, { "epoch": 0.09096152261679163, "grad_norm": 4.313112079524609, "learning_rate": 9.993775629419696e-06, "loss": 0.3598, "step": 1104 }, { "epoch": 0.09104391530032133, "grad_norm": 3.5113027125878085, "learning_rate": 9.993742306470332e-06, "loss": 0.2947, "step": 1105 }, { "epoch": 0.09112630798385103, "grad_norm": 5.254248735680192, "learning_rate": 9.993708894615502e-06, "loss": 0.3881, "step": 1106 }, { "epoch": 0.09120870066738074, "grad_norm": 7.426494059694848, "learning_rate": 9.993675393855793e-06, "loss": 0.812, "step": 1107 }, { "epoch": 0.09129109335091044, "grad_norm": 8.860170716007438, "learning_rate": 9.993641804191805e-06, "loss": 0.7974, "step": 1108 }, { "epoch": 0.09137348603444014, "grad_norm": 8.101723704520365, "learning_rate": 9.99360812562414e-06, "loss": 0.5309, "step": 1109 }, { "epoch": 0.09145587871796984, "grad_norm": 42.341845463398435, "learning_rate": 9.99357435815339e-06, "loss": 1.8562, "step": 1110 }, { "epoch": 0.09153827140149955, "grad_norm": 6.81287903015811, "learning_rate": 9.993540501780161e-06, "loss": 0.6941, "step": 1111 }, { "epoch": 0.09162066408502925, "grad_norm": 5.635578780831226, "learning_rate": 9.993506556505054e-06, "loss": 0.4578, "step": 1112 }, { "epoch": 0.09170305676855896, "grad_norm": 5.211604855452364, "learning_rate": 9.993472522328676e-06, "loss": 0.6778, "step": 1113 }, { "epoch": 0.09178544945208865, "grad_norm": 5.1777549097773665, "learning_rate": 9.99343839925163e-06, "loss": 0.5578, "step": 1114 }, { "epoch": 0.09186784213561835, "grad_norm": 4.842543684963219, "learning_rate": 9.993404187274522e-06, "loss": 0.5595, "step": 1115 }, { "epoch": 0.09195023481914806, "grad_norm": 6.42302616275195, "learning_rate": 9.993369886397967e-06, "loss": 0.7556, "step": 1116 }, { "epoch": 0.09203262750267777, "grad_norm": 6.211592177133203, "learning_rate": 9.99333549662257e-06, "loss": 0.5456, "step": 1117 }, { "epoch": 0.09211502018620746, "grad_norm": 4.630213204114295, "learning_rate": 9.993301017948946e-06, "loss": 0.4993, "step": 1118 }, { "epoch": 0.09219741286973716, "grad_norm": 7.009132877346605, "learning_rate": 9.99326645037771e-06, "loss": 0.8535, "step": 1119 }, { "epoch": 0.09227980555326687, "grad_norm": 4.964465276218794, "learning_rate": 9.993231793909474e-06, "loss": 0.4111, "step": 1120 }, { "epoch": 0.09236219823679657, "grad_norm": 5.696307168046908, "learning_rate": 9.993197048544857e-06, "loss": 0.5841, "step": 1121 }, { "epoch": 0.09244459092032628, "grad_norm": 5.527989894111306, "learning_rate": 9.993162214284478e-06, "loss": 0.5463, "step": 1122 }, { "epoch": 0.09252698360385597, "grad_norm": 5.171245280928823, "learning_rate": 9.993127291128956e-06, "loss": 0.6916, "step": 1123 }, { "epoch": 0.09260937628738568, "grad_norm": 5.320160130999334, "learning_rate": 9.993092279078914e-06, "loss": 0.406, "step": 1124 }, { "epoch": 0.09269176897091538, "grad_norm": 5.163510317928108, "learning_rate": 9.993057178134973e-06, "loss": 0.6965, "step": 1125 }, { "epoch": 0.09277416165444509, "grad_norm": 6.282384783932995, "learning_rate": 9.99302198829776e-06, "loss": 0.6963, "step": 1126 }, { "epoch": 0.09285655433797478, "grad_norm": 6.815231221580075, "learning_rate": 9.992986709567902e-06, "loss": 0.6793, "step": 1127 }, { "epoch": 0.09293894702150449, "grad_norm": 4.643382322672108, "learning_rate": 9.992951341946025e-06, "loss": 0.3584, "step": 1128 }, { "epoch": 0.09302133970503419, "grad_norm": 17.845535518840695, "learning_rate": 9.992915885432759e-06, "loss": 0.781, "step": 1129 }, { "epoch": 0.0931037323885639, "grad_norm": 4.085175770281815, "learning_rate": 9.992880340028736e-06, "loss": 0.2735, "step": 1130 }, { "epoch": 0.0931861250720936, "grad_norm": 4.864690311245262, "learning_rate": 9.992844705734591e-06, "loss": 0.547, "step": 1131 }, { "epoch": 0.0932685177556233, "grad_norm": 5.03732878733345, "learning_rate": 9.992808982550955e-06, "loss": 0.5577, "step": 1132 }, { "epoch": 0.093350910439153, "grad_norm": 8.201592365589455, "learning_rate": 9.992773170478465e-06, "loss": 0.7697, "step": 1133 }, { "epoch": 0.0934333031226827, "grad_norm": 5.9794222226843114, "learning_rate": 9.992737269517759e-06, "loss": 0.6587, "step": 1134 }, { "epoch": 0.09351569580621241, "grad_norm": 4.465989480795388, "learning_rate": 9.992701279669477e-06, "loss": 0.5631, "step": 1135 }, { "epoch": 0.09359808848974212, "grad_norm": 4.85574702436748, "learning_rate": 9.992665200934258e-06, "loss": 0.4923, "step": 1136 }, { "epoch": 0.09368048117327181, "grad_norm": 5.903537966151732, "learning_rate": 9.992629033312744e-06, "loss": 0.5924, "step": 1137 }, { "epoch": 0.09376287385680152, "grad_norm": 5.9804211363312065, "learning_rate": 9.99259277680558e-06, "loss": 0.6413, "step": 1138 }, { "epoch": 0.09384526654033122, "grad_norm": 6.216811540759033, "learning_rate": 9.992556431413412e-06, "loss": 0.4857, "step": 1139 }, { "epoch": 0.09392765922386093, "grad_norm": 3.992746573683693, "learning_rate": 9.992519997136887e-06, "loss": 0.5609, "step": 1140 }, { "epoch": 0.09401005190739062, "grad_norm": 6.133646477579303, "learning_rate": 9.992483473976652e-06, "loss": 0.7192, "step": 1141 }, { "epoch": 0.09409244459092032, "grad_norm": 3.996697571383684, "learning_rate": 9.992446861933358e-06, "loss": 0.5403, "step": 1142 }, { "epoch": 0.09417483727445003, "grad_norm": 4.876746898177986, "learning_rate": 9.992410161007658e-06, "loss": 0.6047, "step": 1143 }, { "epoch": 0.09425722995797974, "grad_norm": 7.113186432470744, "learning_rate": 9.992373371200206e-06, "loss": 0.6164, "step": 1144 }, { "epoch": 0.09433962264150944, "grad_norm": 4.5979462744336494, "learning_rate": 9.992336492511653e-06, "loss": 0.5623, "step": 1145 }, { "epoch": 0.09442201532503913, "grad_norm": 5.0662774129679935, "learning_rate": 9.992299524942658e-06, "loss": 0.5222, "step": 1146 }, { "epoch": 0.09450440800856884, "grad_norm": 6.317547083033332, "learning_rate": 9.992262468493883e-06, "loss": 0.7313, "step": 1147 }, { "epoch": 0.09458680069209854, "grad_norm": 6.083162361809943, "learning_rate": 9.99222532316598e-06, "loss": 0.7805, "step": 1148 }, { "epoch": 0.09466919337562825, "grad_norm": 6.645090805437031, "learning_rate": 9.992188088959616e-06, "loss": 0.5836, "step": 1149 }, { "epoch": 0.09475158605915794, "grad_norm": 4.61742230222751, "learning_rate": 9.992150765875452e-06, "loss": 0.3845, "step": 1150 }, { "epoch": 0.09483397874268765, "grad_norm": 4.9888454907973, "learning_rate": 9.992113353914153e-06, "loss": 0.5926, "step": 1151 }, { "epoch": 0.09491637142621735, "grad_norm": 4.584756235692825, "learning_rate": 9.992075853076385e-06, "loss": 0.3355, "step": 1152 }, { "epoch": 0.09499876410974706, "grad_norm": 3.6417513821813583, "learning_rate": 9.992038263362815e-06, "loss": 0.4314, "step": 1153 }, { "epoch": 0.09508115679327676, "grad_norm": 4.554586660083454, "learning_rate": 9.992000584774113e-06, "loss": 0.5483, "step": 1154 }, { "epoch": 0.09516354947680646, "grad_norm": 4.795571278019529, "learning_rate": 9.991962817310947e-06, "loss": 0.7088, "step": 1155 }, { "epoch": 0.09524594216033616, "grad_norm": 5.551121573962564, "learning_rate": 9.991924960973995e-06, "loss": 0.6027, "step": 1156 }, { "epoch": 0.09532833484386587, "grad_norm": 5.793039171212162, "learning_rate": 9.991887015763926e-06, "loss": 0.5796, "step": 1157 }, { "epoch": 0.09541072752739557, "grad_norm": 3.914653681105744, "learning_rate": 9.991848981681417e-06, "loss": 0.5456, "step": 1158 }, { "epoch": 0.09549312021092526, "grad_norm": 4.822015558280715, "learning_rate": 9.991810858727147e-06, "loss": 0.4228, "step": 1159 }, { "epoch": 0.09557551289445497, "grad_norm": 6.430460772568001, "learning_rate": 9.991772646901793e-06, "loss": 0.511, "step": 1160 }, { "epoch": 0.09565790557798468, "grad_norm": 5.31544358134059, "learning_rate": 9.991734346206034e-06, "loss": 0.4908, "step": 1161 }, { "epoch": 0.09574029826151438, "grad_norm": 4.579164539846181, "learning_rate": 9.991695956640555e-06, "loss": 0.5216, "step": 1162 }, { "epoch": 0.09582269094504409, "grad_norm": 10.980434026023534, "learning_rate": 9.991657478206037e-06, "loss": 0.4225, "step": 1163 }, { "epoch": 0.09590508362857378, "grad_norm": 11.232153491350044, "learning_rate": 9.991618910903165e-06, "loss": 0.4346, "step": 1164 }, { "epoch": 0.09598747631210348, "grad_norm": 5.016670131834788, "learning_rate": 9.99158025473263e-06, "loss": 0.4156, "step": 1165 }, { "epoch": 0.09606986899563319, "grad_norm": 5.823029951418563, "learning_rate": 9.991541509695113e-06, "loss": 0.6179, "step": 1166 }, { "epoch": 0.0961522616791629, "grad_norm": 6.14466976320633, "learning_rate": 9.991502675791308e-06, "loss": 0.5943, "step": 1167 }, { "epoch": 0.09623465436269259, "grad_norm": 5.663257166183292, "learning_rate": 9.991463753021907e-06, "loss": 0.4774, "step": 1168 }, { "epoch": 0.09631704704622229, "grad_norm": 4.613753681467516, "learning_rate": 9.991424741387601e-06, "loss": 0.3702, "step": 1169 }, { "epoch": 0.096399439729752, "grad_norm": 6.32248078526071, "learning_rate": 9.991385640889087e-06, "loss": 0.548, "step": 1170 }, { "epoch": 0.0964818324132817, "grad_norm": 6.541355663315621, "learning_rate": 9.991346451527058e-06, "loss": 0.6273, "step": 1171 }, { "epoch": 0.09656422509681141, "grad_norm": 5.283297075822956, "learning_rate": 9.991307173302212e-06, "loss": 0.3891, "step": 1172 }, { "epoch": 0.0966466177803411, "grad_norm": 4.233220917693166, "learning_rate": 9.991267806215251e-06, "loss": 0.3118, "step": 1173 }, { "epoch": 0.09672901046387081, "grad_norm": 4.413307234382051, "learning_rate": 9.991228350266875e-06, "loss": 0.5013, "step": 1174 }, { "epoch": 0.09681140314740051, "grad_norm": 3.56821873368059, "learning_rate": 9.991188805457784e-06, "loss": 0.4205, "step": 1175 }, { "epoch": 0.09689379583093022, "grad_norm": 6.493304664428958, "learning_rate": 9.991149171788686e-06, "loss": 0.6007, "step": 1176 }, { "epoch": 0.09697618851445991, "grad_norm": 4.839910460452555, "learning_rate": 9.991109449260283e-06, "loss": 0.511, "step": 1177 }, { "epoch": 0.09705858119798962, "grad_norm": 6.988660129796937, "learning_rate": 9.991069637873282e-06, "loss": 0.8373, "step": 1178 }, { "epoch": 0.09714097388151932, "grad_norm": 7.174369580906187, "learning_rate": 9.991029737628397e-06, "loss": 0.4762, "step": 1179 }, { "epoch": 0.09722336656504903, "grad_norm": 5.530363323174134, "learning_rate": 9.990989748526334e-06, "loss": 0.4079, "step": 1180 }, { "epoch": 0.09730575924857873, "grad_norm": 7.764029045054272, "learning_rate": 9.990949670567804e-06, "loss": 0.5609, "step": 1181 }, { "epoch": 0.09738815193210842, "grad_norm": 4.6082841469746025, "learning_rate": 9.990909503753524e-06, "loss": 0.5465, "step": 1182 }, { "epoch": 0.09747054461563813, "grad_norm": 4.287818673511596, "learning_rate": 9.990869248084205e-06, "loss": 0.3848, "step": 1183 }, { "epoch": 0.09755293729916784, "grad_norm": 5.323729474167395, "learning_rate": 9.990828903560568e-06, "loss": 0.5052, "step": 1184 }, { "epoch": 0.09763532998269754, "grad_norm": 3.5424616789602856, "learning_rate": 9.990788470183328e-06, "loss": 0.2952, "step": 1185 }, { "epoch": 0.09771772266622723, "grad_norm": 3.9137775832429584, "learning_rate": 9.990747947953207e-06, "loss": 0.2791, "step": 1186 }, { "epoch": 0.09780011534975694, "grad_norm": 5.032302762992251, "learning_rate": 9.990707336870925e-06, "loss": 0.3739, "step": 1187 }, { "epoch": 0.09788250803328664, "grad_norm": 5.581163892106261, "learning_rate": 9.990666636937207e-06, "loss": 0.5531, "step": 1188 }, { "epoch": 0.09796490071681635, "grad_norm": 6.884687269301154, "learning_rate": 9.990625848152775e-06, "loss": 0.7531, "step": 1189 }, { "epoch": 0.09804729340034606, "grad_norm": 5.374496441514754, "learning_rate": 9.990584970518355e-06, "loss": 0.6825, "step": 1190 }, { "epoch": 0.09812968608387575, "grad_norm": 4.491076319002902, "learning_rate": 9.99054400403468e-06, "loss": 0.5471, "step": 1191 }, { "epoch": 0.09821207876740545, "grad_norm": 4.568092241493959, "learning_rate": 9.990502948702472e-06, "loss": 0.2779, "step": 1192 }, { "epoch": 0.09829447145093516, "grad_norm": 5.707272721328885, "learning_rate": 9.990461804522466e-06, "loss": 0.6366, "step": 1193 }, { "epoch": 0.09837686413446486, "grad_norm": 4.826130217944562, "learning_rate": 9.990420571495394e-06, "loss": 0.605, "step": 1194 }, { "epoch": 0.09845925681799456, "grad_norm": 4.33472383368196, "learning_rate": 9.990379249621991e-06, "loss": 0.6158, "step": 1195 }, { "epoch": 0.09854164950152426, "grad_norm": 4.997331262544171, "learning_rate": 9.990337838902992e-06, "loss": 0.5247, "step": 1196 }, { "epoch": 0.09862404218505397, "grad_norm": 3.544784718142565, "learning_rate": 9.990296339339131e-06, "loss": 0.5761, "step": 1197 }, { "epoch": 0.09870643486858367, "grad_norm": 4.904002550116714, "learning_rate": 9.990254750931153e-06, "loss": 0.4465, "step": 1198 }, { "epoch": 0.09878882755211338, "grad_norm": 4.097219050092533, "learning_rate": 9.990213073679793e-06, "loss": 0.5315, "step": 1199 }, { "epoch": 0.09887122023564307, "grad_norm": 5.885850480185024, "learning_rate": 9.990171307585797e-06, "loss": 0.4493, "step": 1200 }, { "epoch": 0.09895361291917278, "grad_norm": 4.33215452297953, "learning_rate": 9.990129452649906e-06, "loss": 0.4882, "step": 1201 }, { "epoch": 0.09903600560270248, "grad_norm": 4.268587794277847, "learning_rate": 9.990087508872865e-06, "loss": 0.444, "step": 1202 }, { "epoch": 0.09911839828623219, "grad_norm": 42.319887791095226, "learning_rate": 9.990045476255422e-06, "loss": 1.8771, "step": 1203 }, { "epoch": 0.09920079096976188, "grad_norm": 40.893472301948556, "learning_rate": 9.990003354798326e-06, "loss": 1.546, "step": 1204 }, { "epoch": 0.09928318365329158, "grad_norm": 4.301738471538322, "learning_rate": 9.989961144502324e-06, "loss": 0.6113, "step": 1205 }, { "epoch": 0.09936557633682129, "grad_norm": 4.904331603406959, "learning_rate": 9.98991884536817e-06, "loss": 0.504, "step": 1206 }, { "epoch": 0.099447969020351, "grad_norm": 4.841374601844491, "learning_rate": 9.989876457396616e-06, "loss": 0.6375, "step": 1207 }, { "epoch": 0.0995303617038807, "grad_norm": 3.6457744187919996, "learning_rate": 9.989833980588419e-06, "loss": 0.4475, "step": 1208 }, { "epoch": 0.0996127543874104, "grad_norm": 6.177090683739293, "learning_rate": 9.989791414944332e-06, "loss": 0.4527, "step": 1209 }, { "epoch": 0.0996951470709401, "grad_norm": 6.948933775306833, "learning_rate": 9.989748760465114e-06, "loss": 0.4229, "step": 1210 }, { "epoch": 0.0997775397544698, "grad_norm": 4.081918060719312, "learning_rate": 9.989706017151526e-06, "loss": 0.4226, "step": 1211 }, { "epoch": 0.09985993243799951, "grad_norm": 5.241837641447958, "learning_rate": 9.989663185004326e-06, "loss": 0.6111, "step": 1212 }, { "epoch": 0.0999423251215292, "grad_norm": 4.874711378457223, "learning_rate": 9.989620264024278e-06, "loss": 0.5264, "step": 1213 }, { "epoch": 0.10002471780505891, "grad_norm": 6.576308785350509, "learning_rate": 9.989577254212147e-06, "loss": 0.7179, "step": 1214 }, { "epoch": 0.10010711048858861, "grad_norm": 5.81751849230087, "learning_rate": 9.989534155568696e-06, "loss": 0.4763, "step": 1215 }, { "epoch": 0.10018950317211832, "grad_norm": 5.144482743322481, "learning_rate": 9.989490968094695e-06, "loss": 0.5334, "step": 1216 }, { "epoch": 0.10027189585564802, "grad_norm": 4.558820107084972, "learning_rate": 9.989447691790912e-06, "loss": 0.4786, "step": 1217 }, { "epoch": 0.10035428853917772, "grad_norm": 5.042904719669194, "learning_rate": 9.98940432665812e-06, "loss": 0.351, "step": 1218 }, { "epoch": 0.10043668122270742, "grad_norm": 7.9762498299247095, "learning_rate": 9.989360872697085e-06, "loss": 0.0755, "step": 1219 }, { "epoch": 0.10051907390623713, "grad_norm": 5.90945797643557, "learning_rate": 9.989317329908585e-06, "loss": 0.5389, "step": 1220 }, { "epoch": 0.10060146658976683, "grad_norm": 6.595876901546739, "learning_rate": 9.989273698293396e-06, "loss": 0.5458, "step": 1221 }, { "epoch": 0.10068385927329652, "grad_norm": 3.5037946342073076, "learning_rate": 9.989229977852292e-06, "loss": 0.3967, "step": 1222 }, { "epoch": 0.10076625195682623, "grad_norm": 4.8746671742155145, "learning_rate": 9.989186168586054e-06, "loss": 0.536, "step": 1223 }, { "epoch": 0.10084864464035594, "grad_norm": 6.885826091957547, "learning_rate": 9.989142270495458e-06, "loss": 0.7177, "step": 1224 }, { "epoch": 0.10093103732388564, "grad_norm": 4.581417536969941, "learning_rate": 9.98909828358129e-06, "loss": 0.4632, "step": 1225 }, { "epoch": 0.10101343000741535, "grad_norm": 6.322372039008234, "learning_rate": 9.989054207844331e-06, "loss": 0.6098, "step": 1226 }, { "epoch": 0.10109582269094504, "grad_norm": 5.273630448320928, "learning_rate": 9.989010043285365e-06, "loss": 0.5149, "step": 1227 }, { "epoch": 0.10117821537447474, "grad_norm": 7.791366040723516, "learning_rate": 9.988965789905179e-06, "loss": 0.693, "step": 1228 }, { "epoch": 0.10126060805800445, "grad_norm": 9.738396758859928, "learning_rate": 9.988921447704563e-06, "loss": 0.5488, "step": 1229 }, { "epoch": 0.10134300074153416, "grad_norm": 5.146859630461013, "learning_rate": 9.988877016684302e-06, "loss": 0.5047, "step": 1230 }, { "epoch": 0.10142539342506385, "grad_norm": 5.257519730963288, "learning_rate": 9.98883249684519e-06, "loss": 0.6279, "step": 1231 }, { "epoch": 0.10150778610859355, "grad_norm": 6.374293874468573, "learning_rate": 9.988787888188021e-06, "loss": 0.5565, "step": 1232 }, { "epoch": 0.10159017879212326, "grad_norm": 3.579113688809698, "learning_rate": 9.988743190713585e-06, "loss": 0.3567, "step": 1233 }, { "epoch": 0.10167257147565296, "grad_norm": 5.160088719699968, "learning_rate": 9.988698404422682e-06, "loss": 0.512, "step": 1234 }, { "epoch": 0.10175496415918267, "grad_norm": 6.862175012633274, "learning_rate": 9.988653529316106e-06, "loss": 0.4836, "step": 1235 }, { "epoch": 0.10183735684271236, "grad_norm": 7.94291959200385, "learning_rate": 9.988608565394658e-06, "loss": 0.6196, "step": 1236 }, { "epoch": 0.10191974952624207, "grad_norm": 8.071509431185923, "learning_rate": 9.988563512659137e-06, "loss": 0.7937, "step": 1237 }, { "epoch": 0.10200214220977177, "grad_norm": 4.085112773207024, "learning_rate": 9.988518371110346e-06, "loss": 0.5843, "step": 1238 }, { "epoch": 0.10208453489330148, "grad_norm": 3.4369756535780036, "learning_rate": 9.988473140749089e-06, "loss": 0.4593, "step": 1239 }, { "epoch": 0.10216692757683117, "grad_norm": 4.233546401366111, "learning_rate": 9.98842782157617e-06, "loss": 0.1937, "step": 1240 }, { "epoch": 0.10224932026036088, "grad_norm": 5.0020721675763165, "learning_rate": 9.988382413592398e-06, "loss": 0.3163, "step": 1241 }, { "epoch": 0.10233171294389058, "grad_norm": 6.741423995217005, "learning_rate": 9.98833691679858e-06, "loss": 0.4598, "step": 1242 }, { "epoch": 0.10241410562742029, "grad_norm": 5.601574362819235, "learning_rate": 9.988291331195525e-06, "loss": 0.468, "step": 1243 }, { "epoch": 0.10249649831095, "grad_norm": 5.251186632818674, "learning_rate": 9.988245656784045e-06, "loss": 0.4222, "step": 1244 }, { "epoch": 0.10257889099447969, "grad_norm": 8.378822533378939, "learning_rate": 9.988199893564956e-06, "loss": 0.8973, "step": 1245 }, { "epoch": 0.10266128367800939, "grad_norm": 5.961577135188696, "learning_rate": 9.98815404153907e-06, "loss": 0.5762, "step": 1246 }, { "epoch": 0.1027436763615391, "grad_norm": 9.099142670765637, "learning_rate": 9.988108100707203e-06, "loss": 0.7662, "step": 1247 }, { "epoch": 0.1028260690450688, "grad_norm": 6.092979434479812, "learning_rate": 9.988062071070174e-06, "loss": 0.6146, "step": 1248 }, { "epoch": 0.1029084617285985, "grad_norm": 5.159988340065593, "learning_rate": 9.988015952628802e-06, "loss": 0.4235, "step": 1249 }, { "epoch": 0.1029908544121282, "grad_norm": 5.5843012218596995, "learning_rate": 9.987969745383908e-06, "loss": 0.6002, "step": 1250 }, { "epoch": 0.1030732470956579, "grad_norm": 5.194975705721119, "learning_rate": 9.987923449336316e-06, "loss": 0.3804, "step": 1251 }, { "epoch": 0.10315563977918761, "grad_norm": 4.495661995779881, "learning_rate": 9.98787706448685e-06, "loss": 0.5055, "step": 1252 }, { "epoch": 0.10323803246271732, "grad_norm": 4.23602600968263, "learning_rate": 9.987830590836335e-06, "loss": 0.5776, "step": 1253 }, { "epoch": 0.10332042514624701, "grad_norm": 6.237589314146654, "learning_rate": 9.987784028385596e-06, "loss": 0.4792, "step": 1254 }, { "epoch": 0.10340281782977671, "grad_norm": 7.598718878609416, "learning_rate": 9.987737377135464e-06, "loss": 0.8099, "step": 1255 }, { "epoch": 0.10348521051330642, "grad_norm": 3.2818516957200283, "learning_rate": 9.987690637086772e-06, "loss": 0.3107, "step": 1256 }, { "epoch": 0.10356760319683612, "grad_norm": 5.155044529125087, "learning_rate": 9.987643808240351e-06, "loss": 0.4354, "step": 1257 }, { "epoch": 0.10364999588036582, "grad_norm": 4.263048262690633, "learning_rate": 9.98759689059703e-06, "loss": 0.3961, "step": 1258 }, { "epoch": 0.10373238856389552, "grad_norm": 5.420585016670734, "learning_rate": 9.987549884157652e-06, "loss": 0.5856, "step": 1259 }, { "epoch": 0.10381478124742523, "grad_norm": 5.035305806630145, "learning_rate": 9.987502788923047e-06, "loss": 0.5991, "step": 1260 }, { "epoch": 0.10389717393095493, "grad_norm": 5.210172313300331, "learning_rate": 9.987455604894059e-06, "loss": 0.6802, "step": 1261 }, { "epoch": 0.10397956661448464, "grad_norm": 5.208839152242212, "learning_rate": 9.987408332071522e-06, "loss": 0.5894, "step": 1262 }, { "epoch": 0.10406195929801433, "grad_norm": 5.007424953872612, "learning_rate": 9.987360970456284e-06, "loss": 0.6866, "step": 1263 }, { "epoch": 0.10414435198154404, "grad_norm": 5.492166942733279, "learning_rate": 9.987313520049184e-06, "loss": 0.5856, "step": 1264 }, { "epoch": 0.10422674466507374, "grad_norm": 44.58199918343172, "learning_rate": 9.987265980851069e-06, "loss": 1.9599, "step": 1265 }, { "epoch": 0.10430913734860345, "grad_norm": 4.5829858248139175, "learning_rate": 9.987218352862781e-06, "loss": 0.5187, "step": 1266 }, { "epoch": 0.10439153003213314, "grad_norm": 5.025959968933363, "learning_rate": 9.987170636085175e-06, "loss": 0.5232, "step": 1267 }, { "epoch": 0.10447392271566285, "grad_norm": 33.25337475336521, "learning_rate": 9.987122830519096e-06, "loss": 0.675, "step": 1268 }, { "epoch": 0.10455631539919255, "grad_norm": 6.627024708916473, "learning_rate": 9.987074936165394e-06, "loss": 0.6327, "step": 1269 }, { "epoch": 0.10463870808272226, "grad_norm": 14.391260890255582, "learning_rate": 9.987026953024927e-06, "loss": 0.171, "step": 1270 }, { "epoch": 0.10472110076625196, "grad_norm": 4.702108978609021, "learning_rate": 9.986978881098543e-06, "loss": 0.3207, "step": 1271 }, { "epoch": 0.10480349344978165, "grad_norm": 6.034579770929771, "learning_rate": 9.986930720387103e-06, "loss": 0.4834, "step": 1272 }, { "epoch": 0.10488588613331136, "grad_norm": 5.096157625239223, "learning_rate": 9.986882470891458e-06, "loss": 0.3464, "step": 1273 }, { "epoch": 0.10496827881684107, "grad_norm": 6.817695716429012, "learning_rate": 9.986834132612475e-06, "loss": 0.6021, "step": 1274 }, { "epoch": 0.10505067150037077, "grad_norm": 5.953244971338033, "learning_rate": 9.98678570555101e-06, "loss": 0.682, "step": 1275 }, { "epoch": 0.10513306418390048, "grad_norm": 5.0509725079955965, "learning_rate": 9.986737189707924e-06, "loss": 0.6976, "step": 1276 }, { "epoch": 0.10521545686743017, "grad_norm": 5.45215491215228, "learning_rate": 9.986688585084086e-06, "loss": 0.5298, "step": 1277 }, { "epoch": 0.10529784955095987, "grad_norm": 4.412292451733061, "learning_rate": 9.986639891680356e-06, "loss": 0.3362, "step": 1278 }, { "epoch": 0.10538024223448958, "grad_norm": 5.221866379996899, "learning_rate": 9.986591109497601e-06, "loss": 0.5397, "step": 1279 }, { "epoch": 0.10546263491801929, "grad_norm": 5.480429366404604, "learning_rate": 9.986542238536694e-06, "loss": 0.4179, "step": 1280 }, { "epoch": 0.10554502760154898, "grad_norm": 4.6929338481975, "learning_rate": 9.986493278798502e-06, "loss": 0.3414, "step": 1281 }, { "epoch": 0.10562742028507868, "grad_norm": 5.128214656502921, "learning_rate": 9.986444230283896e-06, "loss": 0.3893, "step": 1282 }, { "epoch": 0.10570981296860839, "grad_norm": 6.237929917953249, "learning_rate": 9.986395092993751e-06, "loss": 0.479, "step": 1283 }, { "epoch": 0.1057922056521381, "grad_norm": 7.2745793133813015, "learning_rate": 9.98634586692894e-06, "loss": 0.6739, "step": 1284 }, { "epoch": 0.1058745983356678, "grad_norm": 4.95514436276389, "learning_rate": 9.986296552090343e-06, "loss": 0.3535, "step": 1285 }, { "epoch": 0.10595699101919749, "grad_norm": 6.709823023172423, "learning_rate": 9.986247148478834e-06, "loss": 0.4273, "step": 1286 }, { "epoch": 0.1060393837027272, "grad_norm": 7.894794011841938, "learning_rate": 9.986197656095293e-06, "loss": 0.5231, "step": 1287 }, { "epoch": 0.1061217763862569, "grad_norm": 6.771758510510305, "learning_rate": 9.986148074940602e-06, "loss": 0.6098, "step": 1288 }, { "epoch": 0.10620416906978661, "grad_norm": 8.620140427216912, "learning_rate": 9.986098405015646e-06, "loss": 0.6816, "step": 1289 }, { "epoch": 0.1062865617533163, "grad_norm": 7.649934951424651, "learning_rate": 9.986048646321306e-06, "loss": 0.6417, "step": 1290 }, { "epoch": 0.106368954436846, "grad_norm": 5.89217219407464, "learning_rate": 9.98599879885847e-06, "loss": 0.436, "step": 1291 }, { "epoch": 0.10645134712037571, "grad_norm": 4.562164944360262, "learning_rate": 9.985948862628023e-06, "loss": 0.4035, "step": 1292 }, { "epoch": 0.10653373980390542, "grad_norm": 10.224434742049766, "learning_rate": 9.985898837630856e-06, "loss": 0.7638, "step": 1293 }, { "epoch": 0.10661613248743512, "grad_norm": 6.688388151927518, "learning_rate": 9.98584872386786e-06, "loss": 0.6856, "step": 1294 }, { "epoch": 0.10669852517096481, "grad_norm": 5.548007921469972, "learning_rate": 9.985798521339924e-06, "loss": 0.4693, "step": 1295 }, { "epoch": 0.10678091785449452, "grad_norm": 6.389072481221684, "learning_rate": 9.985748230047944e-06, "loss": 0.7325, "step": 1296 }, { "epoch": 0.10686331053802423, "grad_norm": 4.857740386437905, "learning_rate": 9.985697849992818e-06, "loss": 0.4256, "step": 1297 }, { "epoch": 0.10694570322155393, "grad_norm": 4.244141671261363, "learning_rate": 9.98564738117544e-06, "loss": 0.4349, "step": 1298 }, { "epoch": 0.10702809590508362, "grad_norm": 8.506166832118577, "learning_rate": 9.985596823596708e-06, "loss": 0.6764, "step": 1299 }, { "epoch": 0.10711048858861333, "grad_norm": 35.49665995961184, "learning_rate": 9.985546177257523e-06, "loss": 0.8473, "step": 1300 }, { "epoch": 0.10719288127214303, "grad_norm": 4.860038932251913, "learning_rate": 9.985495442158785e-06, "loss": 0.4825, "step": 1301 }, { "epoch": 0.10727527395567274, "grad_norm": 6.230406301466884, "learning_rate": 9.985444618301401e-06, "loss": 0.65, "step": 1302 }, { "epoch": 0.10735766663920245, "grad_norm": 4.218360186395634, "learning_rate": 9.985393705686274e-06, "loss": 0.3347, "step": 1303 }, { "epoch": 0.10744005932273214, "grad_norm": 14.47184287987901, "learning_rate": 9.985342704314308e-06, "loss": 1.0207, "step": 1304 }, { "epoch": 0.10752245200626184, "grad_norm": 6.850987039774206, "learning_rate": 9.985291614186417e-06, "loss": 0.7262, "step": 1305 }, { "epoch": 0.10760484468979155, "grad_norm": 5.9020975240354385, "learning_rate": 9.985240435303505e-06, "loss": 0.4397, "step": 1306 }, { "epoch": 0.10768723737332125, "grad_norm": 4.756886284647011, "learning_rate": 9.985189167666484e-06, "loss": 0.3688, "step": 1307 }, { "epoch": 0.10776963005685095, "grad_norm": 5.157047719389633, "learning_rate": 9.985137811276268e-06, "loss": 0.6059, "step": 1308 }, { "epoch": 0.10785202274038065, "grad_norm": 6.27032589224842, "learning_rate": 9.985086366133771e-06, "loss": 0.7094, "step": 1309 }, { "epoch": 0.10793441542391036, "grad_norm": 4.952356354117285, "learning_rate": 9.985034832239908e-06, "loss": 0.4373, "step": 1310 }, { "epoch": 0.10801680810744006, "grad_norm": 6.469389577291621, "learning_rate": 9.984983209595598e-06, "loss": 0.5434, "step": 1311 }, { "epoch": 0.10809920079096977, "grad_norm": 4.993310174827445, "learning_rate": 9.98493149820176e-06, "loss": 0.5393, "step": 1312 }, { "epoch": 0.10818159347449946, "grad_norm": 3.6178325149897046, "learning_rate": 9.984879698059314e-06, "loss": 0.2839, "step": 1313 }, { "epoch": 0.10826398615802917, "grad_norm": 4.710143431767484, "learning_rate": 9.98482780916918e-06, "loss": 0.5721, "step": 1314 }, { "epoch": 0.10834637884155887, "grad_norm": 7.01427162272288, "learning_rate": 9.984775831532288e-06, "loss": 0.6726, "step": 1315 }, { "epoch": 0.10842877152508858, "grad_norm": 6.672994509695413, "learning_rate": 9.984723765149555e-06, "loss": 0.6024, "step": 1316 }, { "epoch": 0.10851116420861827, "grad_norm": 5.677032485840668, "learning_rate": 9.984671610021916e-06, "loss": 0.6153, "step": 1317 }, { "epoch": 0.10859355689214797, "grad_norm": 5.458697212060859, "learning_rate": 9.984619366150294e-06, "loss": 0.599, "step": 1318 }, { "epoch": 0.10867594957567768, "grad_norm": 4.616675469079038, "learning_rate": 9.98456703353562e-06, "loss": 0.5492, "step": 1319 }, { "epoch": 0.10875834225920739, "grad_norm": 6.386675749076506, "learning_rate": 9.98451461217883e-06, "loss": 0.6014, "step": 1320 }, { "epoch": 0.10884073494273709, "grad_norm": 7.024628115810297, "learning_rate": 9.984462102080852e-06, "loss": 0.778, "step": 1321 }, { "epoch": 0.10892312762626678, "grad_norm": 5.033824012329933, "learning_rate": 9.984409503242623e-06, "loss": 0.4687, "step": 1322 }, { "epoch": 0.10900552030979649, "grad_norm": 4.962648224329656, "learning_rate": 9.98435681566508e-06, "loss": 0.5488, "step": 1323 }, { "epoch": 0.1090879129933262, "grad_norm": 7.920473586250384, "learning_rate": 9.984304039349159e-06, "loss": 0.6991, "step": 1324 }, { "epoch": 0.1091703056768559, "grad_norm": 5.738981101778399, "learning_rate": 9.9842511742958e-06, "loss": 0.5916, "step": 1325 }, { "epoch": 0.10925269836038559, "grad_norm": 4.679325132639655, "learning_rate": 9.984198220505947e-06, "loss": 0.5934, "step": 1326 }, { "epoch": 0.1093350910439153, "grad_norm": 3.9674401675344346, "learning_rate": 9.984145177980541e-06, "loss": 0.5008, "step": 1327 }, { "epoch": 0.109417483727445, "grad_norm": 5.381770819469537, "learning_rate": 9.984092046720526e-06, "loss": 0.4177, "step": 1328 }, { "epoch": 0.10949987641097471, "grad_norm": 5.770846813479589, "learning_rate": 9.984038826726847e-06, "loss": 0.4167, "step": 1329 }, { "epoch": 0.10958226909450441, "grad_norm": 6.865983869702629, "learning_rate": 9.983985518000455e-06, "loss": 0.7061, "step": 1330 }, { "epoch": 0.1096646617780341, "grad_norm": 3.9275472528501925, "learning_rate": 9.983932120542294e-06, "loss": 0.2918, "step": 1331 }, { "epoch": 0.10974705446156381, "grad_norm": 6.07045204922866, "learning_rate": 9.983878634353317e-06, "loss": 0.6954, "step": 1332 }, { "epoch": 0.10982944714509352, "grad_norm": 4.141327747866078, "learning_rate": 9.983825059434478e-06, "loss": 0.3842, "step": 1333 }, { "epoch": 0.10991183982862322, "grad_norm": 5.786533535647971, "learning_rate": 9.98377139578673e-06, "loss": 0.5725, "step": 1334 }, { "epoch": 0.10999423251215291, "grad_norm": 6.8133544098549415, "learning_rate": 9.983717643411027e-06, "loss": 0.7385, "step": 1335 }, { "epoch": 0.11007662519568262, "grad_norm": 6.85720202285886, "learning_rate": 9.983663802308326e-06, "loss": 0.5718, "step": 1336 }, { "epoch": 0.11015901787921233, "grad_norm": 6.136362509792475, "learning_rate": 9.983609872479587e-06, "loss": 0.3897, "step": 1337 }, { "epoch": 0.11024141056274203, "grad_norm": 4.797859015857666, "learning_rate": 9.98355585392577e-06, "loss": 0.6325, "step": 1338 }, { "epoch": 0.11032380324627174, "grad_norm": 8.316767655871656, "learning_rate": 9.983501746647835e-06, "loss": 0.6866, "step": 1339 }, { "epoch": 0.11040619592980143, "grad_norm": 9.033213998462069, "learning_rate": 9.983447550646748e-06, "loss": 0.6542, "step": 1340 }, { "epoch": 0.11048858861333113, "grad_norm": 33.85428238105279, "learning_rate": 9.98339326592347e-06, "loss": 0.9791, "step": 1341 }, { "epoch": 0.11057098129686084, "grad_norm": 6.407367839369825, "learning_rate": 9.98333889247897e-06, "loss": 0.6029, "step": 1342 }, { "epoch": 0.11065337398039055, "grad_norm": 20.144367498349798, "learning_rate": 9.983284430314217e-06, "loss": 0.5827, "step": 1343 }, { "epoch": 0.11073576666392024, "grad_norm": 4.6534351475822655, "learning_rate": 9.98322987943018e-06, "loss": 0.4664, "step": 1344 }, { "epoch": 0.11081815934744994, "grad_norm": 8.15101971157244, "learning_rate": 9.983175239827829e-06, "loss": 0.7332, "step": 1345 }, { "epoch": 0.11090055203097965, "grad_norm": 4.753437093306661, "learning_rate": 9.983120511508136e-06, "loss": 0.5571, "step": 1346 }, { "epoch": 0.11098294471450935, "grad_norm": 11.681875120750966, "learning_rate": 9.983065694472078e-06, "loss": 0.4647, "step": 1347 }, { "epoch": 0.11106533739803906, "grad_norm": 12.117460717638025, "learning_rate": 9.983010788720629e-06, "loss": 0.538, "step": 1348 }, { "epoch": 0.11114773008156875, "grad_norm": 6.829274575800984, "learning_rate": 9.982955794254768e-06, "loss": 0.7028, "step": 1349 }, { "epoch": 0.11123012276509846, "grad_norm": 13.602827030669436, "learning_rate": 9.982900711075473e-06, "loss": 0.7064, "step": 1350 }, { "epoch": 0.11131251544862816, "grad_norm": 5.8175552794909775, "learning_rate": 9.982845539183724e-06, "loss": 0.6018, "step": 1351 }, { "epoch": 0.11139490813215787, "grad_norm": 8.31044623036391, "learning_rate": 9.982790278580505e-06, "loss": 0.3879, "step": 1352 }, { "epoch": 0.11147730081568756, "grad_norm": 6.099537144897278, "learning_rate": 9.982734929266799e-06, "loss": 0.5985, "step": 1353 }, { "epoch": 0.11155969349921727, "grad_norm": 5.0871649102753675, "learning_rate": 9.98267949124359e-06, "loss": 0.6366, "step": 1354 }, { "epoch": 0.11164208618274697, "grad_norm": 7.594378062700149, "learning_rate": 9.982623964511868e-06, "loss": 0.3574, "step": 1355 }, { "epoch": 0.11172447886627668, "grad_norm": 6.421114608398679, "learning_rate": 9.982568349072619e-06, "loss": 0.424, "step": 1356 }, { "epoch": 0.11180687154980638, "grad_norm": 4.43829490618891, "learning_rate": 9.982512644926835e-06, "loss": 0.5444, "step": 1357 }, { "epoch": 0.11188926423333607, "grad_norm": 5.484026046951222, "learning_rate": 9.982456852075505e-06, "loss": 0.4623, "step": 1358 }, { "epoch": 0.11197165691686578, "grad_norm": 4.551122319765745, "learning_rate": 9.982400970519625e-06, "loss": 0.482, "step": 1359 }, { "epoch": 0.11205404960039549, "grad_norm": 5.123980967275811, "learning_rate": 9.982345000260189e-06, "loss": 0.6261, "step": 1360 }, { "epoch": 0.11213644228392519, "grad_norm": 7.0859474884143845, "learning_rate": 9.982288941298193e-06, "loss": 0.482, "step": 1361 }, { "epoch": 0.11221883496745488, "grad_norm": 5.062990740013323, "learning_rate": 9.982232793634637e-06, "loss": 0.4078, "step": 1362 }, { "epoch": 0.11230122765098459, "grad_norm": 6.563773730859556, "learning_rate": 9.982176557270518e-06, "loss": 0.6887, "step": 1363 }, { "epoch": 0.1123836203345143, "grad_norm": 3.9679727896989507, "learning_rate": 9.982120232206837e-06, "loss": 0.5059, "step": 1364 }, { "epoch": 0.112466013018044, "grad_norm": 4.109093720360083, "learning_rate": 9.9820638184446e-06, "loss": 0.3438, "step": 1365 }, { "epoch": 0.1125484057015737, "grad_norm": 5.911123344422388, "learning_rate": 9.98200731598481e-06, "loss": 0.5155, "step": 1366 }, { "epoch": 0.1126307983851034, "grad_norm": 3.88318872578385, "learning_rate": 9.98195072482847e-06, "loss": 0.5121, "step": 1367 }, { "epoch": 0.1127131910686331, "grad_norm": 5.233996486233231, "learning_rate": 9.98189404497659e-06, "loss": 0.6168, "step": 1368 }, { "epoch": 0.11279558375216281, "grad_norm": 4.602934758003588, "learning_rate": 9.981837276430181e-06, "loss": 0.3514, "step": 1369 }, { "epoch": 0.11287797643569251, "grad_norm": 6.096124850039538, "learning_rate": 9.98178041919025e-06, "loss": 0.7239, "step": 1370 }, { "epoch": 0.1129603691192222, "grad_norm": 3.74675189974131, "learning_rate": 9.981723473257812e-06, "loss": 0.2741, "step": 1371 }, { "epoch": 0.11304276180275191, "grad_norm": 4.56350112320027, "learning_rate": 9.981666438633877e-06, "loss": 0.4282, "step": 1372 }, { "epoch": 0.11312515448628162, "grad_norm": 14.508417238617053, "learning_rate": 9.981609315319467e-06, "loss": 0.6197, "step": 1373 }, { "epoch": 0.11320754716981132, "grad_norm": 3.0941627955941367, "learning_rate": 9.981552103315593e-06, "loss": 0.2163, "step": 1374 }, { "epoch": 0.11328993985334103, "grad_norm": 4.946676977341611, "learning_rate": 9.981494802623275e-06, "loss": 0.387, "step": 1375 }, { "epoch": 0.11337233253687072, "grad_norm": 7.823751960171278, "learning_rate": 9.981437413243535e-06, "loss": 0.6005, "step": 1376 }, { "epoch": 0.11345472522040043, "grad_norm": 4.525532982557422, "learning_rate": 9.981379935177393e-06, "loss": 0.5959, "step": 1377 }, { "epoch": 0.11353711790393013, "grad_norm": 6.635732248640508, "learning_rate": 9.981322368425873e-06, "loss": 0.5028, "step": 1378 }, { "epoch": 0.11361951058745984, "grad_norm": 7.813709567215677, "learning_rate": 9.98126471299e-06, "loss": 0.5495, "step": 1379 }, { "epoch": 0.11370190327098953, "grad_norm": 4.630427176502277, "learning_rate": 9.981206968870798e-06, "loss": 0.5631, "step": 1380 }, { "epoch": 0.11378429595451924, "grad_norm": 33.40325879735037, "learning_rate": 9.9811491360693e-06, "loss": 1.1373, "step": 1381 }, { "epoch": 0.11386668863804894, "grad_norm": 4.6566891571505575, "learning_rate": 9.981091214586533e-06, "loss": 0.4544, "step": 1382 }, { "epoch": 0.11394908132157865, "grad_norm": 5.069079604968983, "learning_rate": 9.981033204423526e-06, "loss": 0.5782, "step": 1383 }, { "epoch": 0.11403147400510835, "grad_norm": 4.434254165048741, "learning_rate": 9.980975105581315e-06, "loss": 0.5051, "step": 1384 }, { "epoch": 0.11411386668863804, "grad_norm": 7.74689606386784, "learning_rate": 9.980916918060932e-06, "loss": 0.5908, "step": 1385 }, { "epoch": 0.11419625937216775, "grad_norm": 4.577069084019305, "learning_rate": 9.980858641863415e-06, "loss": 0.5266, "step": 1386 }, { "epoch": 0.11427865205569745, "grad_norm": 3.7822421197101046, "learning_rate": 9.980800276989802e-06, "loss": 0.5155, "step": 1387 }, { "epoch": 0.11436104473922716, "grad_norm": 6.354305643899875, "learning_rate": 9.98074182344113e-06, "loss": 0.5797, "step": 1388 }, { "epoch": 0.11444343742275685, "grad_norm": 5.1062797154471085, "learning_rate": 9.980683281218438e-06, "loss": 0.3497, "step": 1389 }, { "epoch": 0.11452583010628656, "grad_norm": 4.339769494165447, "learning_rate": 9.980624650322772e-06, "loss": 0.5299, "step": 1390 }, { "epoch": 0.11460822278981626, "grad_norm": 7.573051401034229, "learning_rate": 9.980565930755174e-06, "loss": 0.681, "step": 1391 }, { "epoch": 0.11469061547334597, "grad_norm": 14.775312547328694, "learning_rate": 9.980507122516692e-06, "loss": 0.7312, "step": 1392 }, { "epoch": 0.11477300815687567, "grad_norm": 4.248998579788246, "learning_rate": 9.980448225608369e-06, "loss": 0.5819, "step": 1393 }, { "epoch": 0.11485540084040537, "grad_norm": 4.820974256874798, "learning_rate": 9.980389240031256e-06, "loss": 0.3988, "step": 1394 }, { "epoch": 0.11493779352393507, "grad_norm": 5.3611932668737134, "learning_rate": 9.980330165786403e-06, "loss": 0.553, "step": 1395 }, { "epoch": 0.11502018620746478, "grad_norm": 4.803109353772144, "learning_rate": 9.98027100287486e-06, "loss": 0.4254, "step": 1396 }, { "epoch": 0.11510257889099448, "grad_norm": 6.166043944368681, "learning_rate": 9.980211751297682e-06, "loss": 0.6435, "step": 1397 }, { "epoch": 0.11518497157452418, "grad_norm": 6.321449333735212, "learning_rate": 9.980152411055923e-06, "loss": 0.5901, "step": 1398 }, { "epoch": 0.11526736425805388, "grad_norm": 3.9058938395757736, "learning_rate": 9.980092982150641e-06, "loss": 0.4481, "step": 1399 }, { "epoch": 0.11534975694158359, "grad_norm": 7.060663282013449, "learning_rate": 9.980033464582892e-06, "loss": 0.7435, "step": 1400 }, { "epoch": 0.11543214962511329, "grad_norm": 5.044696423020422, "learning_rate": 9.979973858353738e-06, "loss": 0.4583, "step": 1401 }, { "epoch": 0.115514542308643, "grad_norm": 4.010481581383983, "learning_rate": 9.979914163464237e-06, "loss": 0.4307, "step": 1402 }, { "epoch": 0.11559693499217269, "grad_norm": 4.715002567821283, "learning_rate": 9.979854379915454e-06, "loss": 0.4067, "step": 1403 }, { "epoch": 0.1156793276757024, "grad_norm": 5.095282676387148, "learning_rate": 9.979794507708453e-06, "loss": 0.5319, "step": 1404 }, { "epoch": 0.1157617203592321, "grad_norm": 5.434641667200585, "learning_rate": 9.979734546844301e-06, "loss": 0.5371, "step": 1405 }, { "epoch": 0.1158441130427618, "grad_norm": 5.864807571595335, "learning_rate": 9.979674497324063e-06, "loss": 0.7502, "step": 1406 }, { "epoch": 0.11592650572629151, "grad_norm": 3.931855774749118, "learning_rate": 9.979614359148809e-06, "loss": 0.4857, "step": 1407 }, { "epoch": 0.1160088984098212, "grad_norm": 4.415958936160234, "learning_rate": 9.97955413231961e-06, "loss": 0.4143, "step": 1408 }, { "epoch": 0.11609129109335091, "grad_norm": 5.477681605181166, "learning_rate": 9.97949381683754e-06, "loss": 0.5853, "step": 1409 }, { "epoch": 0.11617368377688062, "grad_norm": 5.394465919333391, "learning_rate": 9.97943341270367e-06, "loss": 0.6864, "step": 1410 }, { "epoch": 0.11625607646041032, "grad_norm": 7.0540860642905825, "learning_rate": 9.979372919919077e-06, "loss": 0.5353, "step": 1411 }, { "epoch": 0.11633846914394001, "grad_norm": 4.457918745669958, "learning_rate": 9.979312338484837e-06, "loss": 0.5332, "step": 1412 }, { "epoch": 0.11642086182746972, "grad_norm": 4.318862068781239, "learning_rate": 9.979251668402027e-06, "loss": 0.5383, "step": 1413 }, { "epoch": 0.11650325451099942, "grad_norm": 4.15910796680108, "learning_rate": 9.979190909671732e-06, "loss": 0.3852, "step": 1414 }, { "epoch": 0.11658564719452913, "grad_norm": 4.409347363114283, "learning_rate": 9.97913006229503e-06, "loss": 0.5274, "step": 1415 }, { "epoch": 0.11666803987805884, "grad_norm": 3.860140816734429, "learning_rate": 9.979069126273006e-06, "loss": 0.3016, "step": 1416 }, { "epoch": 0.11675043256158853, "grad_norm": 4.05973495912906, "learning_rate": 9.979008101606743e-06, "loss": 0.5707, "step": 1417 }, { "epoch": 0.11683282524511823, "grad_norm": 5.335313848877938, "learning_rate": 9.978946988297329e-06, "loss": 0.4813, "step": 1418 }, { "epoch": 0.11691521792864794, "grad_norm": 5.075262879615677, "learning_rate": 9.978885786345851e-06, "loss": 0.463, "step": 1419 }, { "epoch": 0.11699761061217764, "grad_norm": 5.215466147778947, "learning_rate": 9.978824495753399e-06, "loss": 0.5398, "step": 1420 }, { "epoch": 0.11708000329570734, "grad_norm": 10.408226861855583, "learning_rate": 9.978763116521065e-06, "loss": 0.7895, "step": 1421 }, { "epoch": 0.11716239597923704, "grad_norm": 3.486562350465299, "learning_rate": 9.97870164864994e-06, "loss": 0.5399, "step": 1422 }, { "epoch": 0.11724478866276675, "grad_norm": 3.263975256020736, "learning_rate": 9.97864009214112e-06, "loss": 0.2916, "step": 1423 }, { "epoch": 0.11732718134629645, "grad_norm": 3.687714872193806, "learning_rate": 9.9785784469957e-06, "loss": 0.304, "step": 1424 }, { "epoch": 0.11740957402982616, "grad_norm": 5.509829107986992, "learning_rate": 9.978516713214779e-06, "loss": 0.5531, "step": 1425 }, { "epoch": 0.11749196671335585, "grad_norm": 3.5921556435301474, "learning_rate": 9.978454890799453e-06, "loss": 0.3016, "step": 1426 }, { "epoch": 0.11757435939688556, "grad_norm": 4.173814176850804, "learning_rate": 9.978392979750825e-06, "loss": 0.4396, "step": 1427 }, { "epoch": 0.11765675208041526, "grad_norm": 33.23788817879558, "learning_rate": 9.978330980069996e-06, "loss": 1.1389, "step": 1428 }, { "epoch": 0.11773914476394497, "grad_norm": 5.852780126730159, "learning_rate": 9.978268891758072e-06, "loss": 0.6514, "step": 1429 }, { "epoch": 0.11782153744747466, "grad_norm": 4.939407725307317, "learning_rate": 9.978206714816156e-06, "loss": 0.7286, "step": 1430 }, { "epoch": 0.11790393013100436, "grad_norm": 5.03926308808164, "learning_rate": 9.978144449245357e-06, "loss": 0.3388, "step": 1431 }, { "epoch": 0.11798632281453407, "grad_norm": 6.049230382777763, "learning_rate": 9.978082095046781e-06, "loss": 0.4871, "step": 1432 }, { "epoch": 0.11806871549806378, "grad_norm": 3.489939110522026, "learning_rate": 9.978019652221543e-06, "loss": 0.2893, "step": 1433 }, { "epoch": 0.11815110818159348, "grad_norm": 5.028102433756064, "learning_rate": 9.977957120770748e-06, "loss": 0.5076, "step": 1434 }, { "epoch": 0.11823350086512317, "grad_norm": 5.274453037216691, "learning_rate": 9.977894500695512e-06, "loss": 0.3798, "step": 1435 }, { "epoch": 0.11831589354865288, "grad_norm": 4.751680688931771, "learning_rate": 9.977831791996952e-06, "loss": 0.4783, "step": 1436 }, { "epoch": 0.11839828623218258, "grad_norm": 3.5749837950342274, "learning_rate": 9.977768994676181e-06, "loss": 0.3682, "step": 1437 }, { "epoch": 0.11848067891571229, "grad_norm": 3.671226493661603, "learning_rate": 9.97770610873432e-06, "loss": 0.347, "step": 1438 }, { "epoch": 0.11856307159924198, "grad_norm": 6.196939033251727, "learning_rate": 9.977643134172487e-06, "loss": 0.5274, "step": 1439 }, { "epoch": 0.11864546428277169, "grad_norm": 5.2089347723562085, "learning_rate": 9.977580070991804e-06, "loss": 0.5175, "step": 1440 }, { "epoch": 0.11872785696630139, "grad_norm": 6.334013108356391, "learning_rate": 9.977516919193393e-06, "loss": 0.6194, "step": 1441 }, { "epoch": 0.1188102496498311, "grad_norm": 7.306839104821746, "learning_rate": 9.977453678778379e-06, "loss": 0.7709, "step": 1442 }, { "epoch": 0.1188926423333608, "grad_norm": 4.519396435491379, "learning_rate": 9.977390349747886e-06, "loss": 0.4418, "step": 1443 }, { "epoch": 0.1189750350168905, "grad_norm": 4.698209909244438, "learning_rate": 9.977326932103044e-06, "loss": 0.535, "step": 1444 }, { "epoch": 0.1190574277004202, "grad_norm": 4.8932208776361446, "learning_rate": 9.977263425844981e-06, "loss": 0.5942, "step": 1445 }, { "epoch": 0.11913982038394991, "grad_norm": 5.026667743592898, "learning_rate": 9.977199830974826e-06, "loss": 0.547, "step": 1446 }, { "epoch": 0.11922221306747961, "grad_norm": 5.23253291298616, "learning_rate": 9.977136147493715e-06, "loss": 0.5333, "step": 1447 }, { "epoch": 0.1193046057510093, "grad_norm": 4.719053095707587, "learning_rate": 9.97707237540278e-06, "loss": 0.5265, "step": 1448 }, { "epoch": 0.11938699843453901, "grad_norm": 4.625540723780489, "learning_rate": 9.977008514703153e-06, "loss": 0.5827, "step": 1449 }, { "epoch": 0.11946939111806872, "grad_norm": 3.856805995552182, "learning_rate": 9.976944565395976e-06, "loss": 0.6053, "step": 1450 }, { "epoch": 0.11955178380159842, "grad_norm": 3.330853507113743, "learning_rate": 9.976880527482385e-06, "loss": 0.5254, "step": 1451 }, { "epoch": 0.11963417648512813, "grad_norm": 4.459173017435613, "learning_rate": 9.97681640096352e-06, "loss": 0.4728, "step": 1452 }, { "epoch": 0.11971656916865782, "grad_norm": 7.018979668301613, "learning_rate": 9.976752185840524e-06, "loss": 0.6592, "step": 1453 }, { "epoch": 0.11979896185218752, "grad_norm": 4.695286668122802, "learning_rate": 9.976687882114538e-06, "loss": 0.2927, "step": 1454 }, { "epoch": 0.11988135453571723, "grad_norm": 6.372078077902412, "learning_rate": 9.976623489786708e-06, "loss": 0.7203, "step": 1455 }, { "epoch": 0.11996374721924694, "grad_norm": 13.058182473303905, "learning_rate": 9.976559008858182e-06, "loss": 0.8304, "step": 1456 }, { "epoch": 0.12004613990277663, "grad_norm": 3.750428817875247, "learning_rate": 9.976494439330106e-06, "loss": 0.4544, "step": 1457 }, { "epoch": 0.12012853258630633, "grad_norm": 5.714795325590294, "learning_rate": 9.976429781203631e-06, "loss": 0.5764, "step": 1458 }, { "epoch": 0.12021092526983604, "grad_norm": 6.249017860839444, "learning_rate": 9.976365034479907e-06, "loss": 0.6097, "step": 1459 }, { "epoch": 0.12029331795336574, "grad_norm": 4.366534697481048, "learning_rate": 9.976300199160087e-06, "loss": 0.5872, "step": 1460 }, { "epoch": 0.12037571063689545, "grad_norm": 8.297730150263703, "learning_rate": 9.976235275245325e-06, "loss": 0.6445, "step": 1461 }, { "epoch": 0.12045810332042514, "grad_norm": 3.0791550000816437, "learning_rate": 9.976170262736777e-06, "loss": 0.3265, "step": 1462 }, { "epoch": 0.12054049600395485, "grad_norm": 4.587962090590767, "learning_rate": 9.9761051616356e-06, "loss": 0.4655, "step": 1463 }, { "epoch": 0.12062288868748455, "grad_norm": 4.083352587701113, "learning_rate": 9.976039971942955e-06, "loss": 0.5081, "step": 1464 }, { "epoch": 0.12070528137101426, "grad_norm": 19.36776574624167, "learning_rate": 9.97597469366e-06, "loss": 0.8676, "step": 1465 }, { "epoch": 0.12078767405454395, "grad_norm": 13.102581672872773, "learning_rate": 9.975909326787898e-06, "loss": 0.6868, "step": 1466 }, { "epoch": 0.12087006673807366, "grad_norm": 4.961145426331136, "learning_rate": 9.975843871327815e-06, "loss": 0.5178, "step": 1467 }, { "epoch": 0.12095245942160336, "grad_norm": 3.5357157810853512, "learning_rate": 9.975778327280914e-06, "loss": 0.2661, "step": 1468 }, { "epoch": 0.12103485210513307, "grad_norm": 3.0027106887596036, "learning_rate": 9.97571269464836e-06, "loss": 0.2429, "step": 1469 }, { "epoch": 0.12111724478866277, "grad_norm": 4.596992981690704, "learning_rate": 9.975646973431326e-06, "loss": 0.4726, "step": 1470 }, { "epoch": 0.12119963747219246, "grad_norm": 6.690419035435042, "learning_rate": 9.975581163630981e-06, "loss": 0.7149, "step": 1471 }, { "epoch": 0.12128203015572217, "grad_norm": 5.306103849360317, "learning_rate": 9.975515265248493e-06, "loss": 0.5719, "step": 1472 }, { "epoch": 0.12136442283925188, "grad_norm": 5.095282694288974, "learning_rate": 9.975449278285038e-06, "loss": 0.5793, "step": 1473 }, { "epoch": 0.12144681552278158, "grad_norm": 5.007439354366816, "learning_rate": 9.975383202741793e-06, "loss": 0.5795, "step": 1474 }, { "epoch": 0.12152920820631127, "grad_norm": 4.86506910122601, "learning_rate": 9.97531703861993e-06, "loss": 0.4545, "step": 1475 }, { "epoch": 0.12161160088984098, "grad_norm": 12.362020895190154, "learning_rate": 9.975250785920629e-06, "loss": 0.6358, "step": 1476 }, { "epoch": 0.12169399357337068, "grad_norm": 5.088274293351666, "learning_rate": 9.97518444464507e-06, "loss": 0.4832, "step": 1477 }, { "epoch": 0.12177638625690039, "grad_norm": 4.852830296274398, "learning_rate": 9.975118014794431e-06, "loss": 0.4588, "step": 1478 }, { "epoch": 0.1218587789404301, "grad_norm": 5.076993779434122, "learning_rate": 9.975051496369899e-06, "loss": 0.5489, "step": 1479 }, { "epoch": 0.12194117162395979, "grad_norm": 4.492297833507457, "learning_rate": 9.974984889372658e-06, "loss": 0.3727, "step": 1480 }, { "epoch": 0.1220235643074895, "grad_norm": 4.969531949366711, "learning_rate": 9.97491819380389e-06, "loss": 0.6192, "step": 1481 }, { "epoch": 0.1221059569910192, "grad_norm": 5.5171550464058985, "learning_rate": 9.974851409664786e-06, "loss": 0.5168, "step": 1482 }, { "epoch": 0.1221883496745489, "grad_norm": 4.549692286167438, "learning_rate": 9.974784536956533e-06, "loss": 0.5461, "step": 1483 }, { "epoch": 0.1222707423580786, "grad_norm": 4.638601246264793, "learning_rate": 9.974717575680321e-06, "loss": 0.5586, "step": 1484 }, { "epoch": 0.1223531350416083, "grad_norm": 3.9934881662408617, "learning_rate": 9.974650525837345e-06, "loss": 0.4364, "step": 1485 }, { "epoch": 0.12243552772513801, "grad_norm": 8.227489553944196, "learning_rate": 9.974583387428797e-06, "loss": 0.8104, "step": 1486 }, { "epoch": 0.12251792040866771, "grad_norm": 6.644787549148226, "learning_rate": 9.974516160455872e-06, "loss": 0.4537, "step": 1487 }, { "epoch": 0.12260031309219742, "grad_norm": 4.325870512007751, "learning_rate": 9.974448844919766e-06, "loss": 0.4874, "step": 1488 }, { "epoch": 0.12268270577572711, "grad_norm": 6.081726074936586, "learning_rate": 9.97438144082168e-06, "loss": 0.3875, "step": 1489 }, { "epoch": 0.12276509845925682, "grad_norm": 4.548854894101285, "learning_rate": 9.974313948162812e-06, "loss": 0.5696, "step": 1490 }, { "epoch": 0.12284749114278652, "grad_norm": 9.255791845151686, "learning_rate": 9.974246366944364e-06, "loss": 0.9999, "step": 1491 }, { "epoch": 0.12292988382631623, "grad_norm": 6.508244913389245, "learning_rate": 9.97417869716754e-06, "loss": 0.6256, "step": 1492 }, { "epoch": 0.12301227650984592, "grad_norm": 2.9004728725960067, "learning_rate": 9.974110938833545e-06, "loss": 0.2222, "step": 1493 }, { "epoch": 0.12309466919337562, "grad_norm": 6.840818959886781, "learning_rate": 9.974043091943584e-06, "loss": 0.6488, "step": 1494 }, { "epoch": 0.12317706187690533, "grad_norm": 4.564889329777813, "learning_rate": 9.973975156498866e-06, "loss": 0.4834, "step": 1495 }, { "epoch": 0.12325945456043504, "grad_norm": 6.124262779948301, "learning_rate": 9.973907132500597e-06, "loss": 0.6345, "step": 1496 }, { "epoch": 0.12334184724396474, "grad_norm": 4.143149551754576, "learning_rate": 9.973839019949994e-06, "loss": 0.5449, "step": 1497 }, { "epoch": 0.12342423992749443, "grad_norm": 4.692433215382461, "learning_rate": 9.973770818848265e-06, "loss": 0.381, "step": 1498 }, { "epoch": 0.12350663261102414, "grad_norm": 4.558902674629272, "learning_rate": 9.973702529196627e-06, "loss": 0.4342, "step": 1499 }, { "epoch": 0.12358902529455384, "grad_norm": 4.544800628848942, "learning_rate": 9.973634150996291e-06, "loss": 0.3499, "step": 1500 }, { "epoch": 0.12367141797808355, "grad_norm": 5.059581470168767, "learning_rate": 9.973565684248483e-06, "loss": 0.5135, "step": 1501 }, { "epoch": 0.12375381066161324, "grad_norm": 4.258353140119297, "learning_rate": 9.973497128954414e-06, "loss": 0.269, "step": 1502 }, { "epoch": 0.12383620334514295, "grad_norm": 7.132848659419424, "learning_rate": 9.973428485115308e-06, "loss": 0.6726, "step": 1503 }, { "epoch": 0.12391859602867265, "grad_norm": 3.5983862118163277, "learning_rate": 9.973359752732386e-06, "loss": 0.4669, "step": 1504 }, { "epoch": 0.12400098871220236, "grad_norm": 3.448183524414553, "learning_rate": 9.973290931806874e-06, "loss": 0.1703, "step": 1505 }, { "epoch": 0.12408338139573206, "grad_norm": 3.0366884309895794, "learning_rate": 9.973222022339992e-06, "loss": 0.2643, "step": 1506 }, { "epoch": 0.12416577407926176, "grad_norm": 4.8670408538461745, "learning_rate": 9.973153024332974e-06, "loss": 0.2684, "step": 1507 }, { "epoch": 0.12424816676279146, "grad_norm": 5.9026783007837205, "learning_rate": 9.973083937787042e-06, "loss": 0.5869, "step": 1508 }, { "epoch": 0.12433055944632117, "grad_norm": 6.025970687624214, "learning_rate": 9.973014762703429e-06, "loss": 0.4191, "step": 1509 }, { "epoch": 0.12441295212985087, "grad_norm": 6.51904193753321, "learning_rate": 9.972945499083366e-06, "loss": 0.7139, "step": 1510 }, { "epoch": 0.12449534481338057, "grad_norm": 6.684992245083261, "learning_rate": 9.972876146928088e-06, "loss": 0.6404, "step": 1511 }, { "epoch": 0.12457773749691027, "grad_norm": 5.001971574274414, "learning_rate": 9.972806706238826e-06, "loss": 0.4946, "step": 1512 }, { "epoch": 0.12466013018043998, "grad_norm": 6.7337019014227355, "learning_rate": 9.97273717701682e-06, "loss": 0.5832, "step": 1513 }, { "epoch": 0.12474252286396968, "grad_norm": 5.220718061812807, "learning_rate": 9.972667559263305e-06, "loss": 0.5482, "step": 1514 }, { "epoch": 0.12482491554749939, "grad_norm": 5.150466794347032, "learning_rate": 9.97259785297952e-06, "loss": 0.6143, "step": 1515 }, { "epoch": 0.12490730823102908, "grad_norm": 6.537060800514755, "learning_rate": 9.972528058166711e-06, "loss": 0.6394, "step": 1516 }, { "epoch": 0.12498970091455879, "grad_norm": 4.955150662710675, "learning_rate": 9.972458174826115e-06, "loss": 0.5091, "step": 1517 }, { "epoch": 0.1250720935980885, "grad_norm": 4.842000273038944, "learning_rate": 9.972388202958977e-06, "loss": 0.4601, "step": 1518 }, { "epoch": 0.12515448628161818, "grad_norm": 4.184733000664414, "learning_rate": 9.972318142566547e-06, "loss": 0.4797, "step": 1519 }, { "epoch": 0.1252368789651479, "grad_norm": 5.585543182543029, "learning_rate": 9.972247993650067e-06, "loss": 0.6223, "step": 1520 }, { "epoch": 0.1253192716486776, "grad_norm": 5.879739754393376, "learning_rate": 9.97217775621079e-06, "loss": 0.628, "step": 1521 }, { "epoch": 0.1254016643322073, "grad_norm": 11.002403812555526, "learning_rate": 9.972107430249963e-06, "loss": 0.553, "step": 1522 }, { "epoch": 0.125484057015737, "grad_norm": 6.203040642309966, "learning_rate": 9.972037015768841e-06, "loss": 0.7279, "step": 1523 }, { "epoch": 0.1255664496992667, "grad_norm": 7.577934526871955, "learning_rate": 9.971966512768677e-06, "loss": 0.538, "step": 1524 }, { "epoch": 0.12564884238279642, "grad_norm": 4.019605674039673, "learning_rate": 9.971895921250723e-06, "loss": 0.5441, "step": 1525 }, { "epoch": 0.1257312350663261, "grad_norm": 5.764658658433788, "learning_rate": 9.97182524121624e-06, "loss": 0.6654, "step": 1526 }, { "epoch": 0.1258136277498558, "grad_norm": 4.945527867426451, "learning_rate": 9.971754472666484e-06, "loss": 0.514, "step": 1527 }, { "epoch": 0.12589602043338552, "grad_norm": 6.0441278677454005, "learning_rate": 9.971683615602716e-06, "loss": 0.4181, "step": 1528 }, { "epoch": 0.1259784131169152, "grad_norm": 14.25711273437809, "learning_rate": 9.971612670026196e-06, "loss": 0.6254, "step": 1529 }, { "epoch": 0.12606080580044493, "grad_norm": 4.700273765258455, "learning_rate": 9.97154163593819e-06, "loss": 0.6229, "step": 1530 }, { "epoch": 0.12614319848397462, "grad_norm": 3.3864821654511945, "learning_rate": 9.97147051333996e-06, "loss": 0.399, "step": 1531 }, { "epoch": 0.12622559116750431, "grad_norm": 5.12783193960563, "learning_rate": 9.971399302232772e-06, "loss": 0.5427, "step": 1532 }, { "epoch": 0.12630798385103403, "grad_norm": 4.931238238343592, "learning_rate": 9.971328002617895e-06, "loss": 0.2851, "step": 1533 }, { "epoch": 0.12639037653456373, "grad_norm": 5.285172145335731, "learning_rate": 9.971256614496598e-06, "loss": 0.5647, "step": 1534 }, { "epoch": 0.12647276921809344, "grad_norm": 4.744357889613329, "learning_rate": 9.971185137870155e-06, "loss": 0.5237, "step": 1535 }, { "epoch": 0.12655516190162314, "grad_norm": 8.652911164552046, "learning_rate": 9.971113572739832e-06, "loss": 0.8313, "step": 1536 }, { "epoch": 0.12663755458515283, "grad_norm": 5.869655307998774, "learning_rate": 9.971041919106908e-06, "loss": 0.5802, "step": 1537 }, { "epoch": 0.12671994726868255, "grad_norm": 6.3300754330457, "learning_rate": 9.970970176972658e-06, "loss": 0.6885, "step": 1538 }, { "epoch": 0.12680233995221224, "grad_norm": 7.1758070025576055, "learning_rate": 9.970898346338358e-06, "loss": 0.6389, "step": 1539 }, { "epoch": 0.12688473263574196, "grad_norm": 12.961913535688998, "learning_rate": 9.970826427205287e-06, "loss": 0.6393, "step": 1540 }, { "epoch": 0.12696712531927165, "grad_norm": 5.401262177099866, "learning_rate": 9.970754419574728e-06, "loss": 0.5455, "step": 1541 }, { "epoch": 0.12704951800280134, "grad_norm": 3.3340002399513, "learning_rate": 9.970682323447959e-06, "loss": 0.2345, "step": 1542 }, { "epoch": 0.12713191068633106, "grad_norm": 8.894631262296059, "learning_rate": 9.970610138826267e-06, "loss": 0.7767, "step": 1543 }, { "epoch": 0.12721430336986075, "grad_norm": 4.952958216270172, "learning_rate": 9.970537865710934e-06, "loss": 0.2786, "step": 1544 }, { "epoch": 0.12729669605339045, "grad_norm": 2.6515561410684194, "learning_rate": 9.970465504103249e-06, "loss": 0.2038, "step": 1545 }, { "epoch": 0.12737908873692017, "grad_norm": 5.298892635444798, "learning_rate": 9.9703930540045e-06, "loss": 0.4559, "step": 1546 }, { "epoch": 0.12746148142044986, "grad_norm": 4.197621921991316, "learning_rate": 9.970320515415974e-06, "loss": 0.4502, "step": 1547 }, { "epoch": 0.12754387410397958, "grad_norm": 4.800228256482724, "learning_rate": 9.970247888338966e-06, "loss": 0.5957, "step": 1548 }, { "epoch": 0.12762626678750927, "grad_norm": 4.815816035104436, "learning_rate": 9.970175172774768e-06, "loss": 0.5874, "step": 1549 }, { "epoch": 0.12770865947103896, "grad_norm": 6.545724686124551, "learning_rate": 9.970102368724675e-06, "loss": 0.5925, "step": 1550 }, { "epoch": 0.12779105215456868, "grad_norm": 5.121538774261143, "learning_rate": 9.970029476189984e-06, "loss": 0.5063, "step": 1551 }, { "epoch": 0.12787344483809837, "grad_norm": 5.7759067014268135, "learning_rate": 9.969956495171989e-06, "loss": 0.6866, "step": 1552 }, { "epoch": 0.1279558375216281, "grad_norm": 6.588677492190531, "learning_rate": 9.96988342567199e-06, "loss": 0.7341, "step": 1553 }, { "epoch": 0.12803823020515778, "grad_norm": 6.843692520636973, "learning_rate": 9.969810267691293e-06, "loss": 0.7034, "step": 1554 }, { "epoch": 0.12812062288868747, "grad_norm": 5.436070981898065, "learning_rate": 9.969737021231196e-06, "loss": 0.4046, "step": 1555 }, { "epoch": 0.1282030155722172, "grad_norm": 15.426453647795194, "learning_rate": 9.969663686293003e-06, "loss": 0.8425, "step": 1556 }, { "epoch": 0.12828540825574689, "grad_norm": 6.468281649010783, "learning_rate": 9.969590262878021e-06, "loss": 0.5969, "step": 1557 }, { "epoch": 0.1283678009392766, "grad_norm": 8.361371730974474, "learning_rate": 9.969516750987558e-06, "loss": 0.6787, "step": 1558 }, { "epoch": 0.1284501936228063, "grad_norm": 5.3302943640173925, "learning_rate": 9.969443150622921e-06, "loss": 0.5459, "step": 1559 }, { "epoch": 0.128532586306336, "grad_norm": 4.633429138093913, "learning_rate": 9.96936946178542e-06, "loss": 0.649, "step": 1560 }, { "epoch": 0.1286149789898657, "grad_norm": 8.13100570319315, "learning_rate": 9.96929568447637e-06, "loss": 0.7892, "step": 1561 }, { "epoch": 0.1286973716733954, "grad_norm": 5.039540157664388, "learning_rate": 9.96922181869708e-06, "loss": 0.5216, "step": 1562 }, { "epoch": 0.1287797643569251, "grad_norm": 3.6431682961361633, "learning_rate": 9.969147864448867e-06, "loss": 0.2595, "step": 1563 }, { "epoch": 0.1288621570404548, "grad_norm": 5.4887751624743375, "learning_rate": 9.96907382173305e-06, "loss": 0.5876, "step": 1564 }, { "epoch": 0.1289445497239845, "grad_norm": 4.129305542648151, "learning_rate": 9.968999690550945e-06, "loss": 0.567, "step": 1565 }, { "epoch": 0.12902694240751422, "grad_norm": 4.6202359784395695, "learning_rate": 9.96892547090387e-06, "loss": 0.3541, "step": 1566 }, { "epoch": 0.12910933509104391, "grad_norm": 5.530337251130461, "learning_rate": 9.968851162793149e-06, "loss": 0.4995, "step": 1567 }, { "epoch": 0.1291917277745736, "grad_norm": 6.098832238315356, "learning_rate": 9.968776766220105e-06, "loss": 0.6462, "step": 1568 }, { "epoch": 0.12927412045810333, "grad_norm": 36.110655337933196, "learning_rate": 9.968702281186062e-06, "loss": 1.8144, "step": 1569 }, { "epoch": 0.12935651314163302, "grad_norm": 3.73141968324195, "learning_rate": 9.968627707692345e-06, "loss": 0.4626, "step": 1570 }, { "epoch": 0.12943890582516274, "grad_norm": 5.556066968723632, "learning_rate": 9.968553045740283e-06, "loss": 0.4893, "step": 1571 }, { "epoch": 0.12952129850869243, "grad_norm": 3.5534727840085303, "learning_rate": 9.968478295331206e-06, "loss": 0.3499, "step": 1572 }, { "epoch": 0.12960369119222212, "grad_norm": 5.954589534858549, "learning_rate": 9.96840345646644e-06, "loss": 0.7069, "step": 1573 }, { "epoch": 0.12968608387575184, "grad_norm": 5.504329435904227, "learning_rate": 9.968328529147324e-06, "loss": 0.5345, "step": 1574 }, { "epoch": 0.12976847655928153, "grad_norm": 4.780089942398109, "learning_rate": 9.968253513375187e-06, "loss": 0.6211, "step": 1575 }, { "epoch": 0.12985086924281125, "grad_norm": 3.6400688935977183, "learning_rate": 9.968178409151368e-06, "loss": 0.4675, "step": 1576 }, { "epoch": 0.12993326192634094, "grad_norm": 5.161844362296579, "learning_rate": 9.968103216477203e-06, "loss": 0.5463, "step": 1577 }, { "epoch": 0.13001565460987063, "grad_norm": 6.01338401708055, "learning_rate": 9.968027935354029e-06, "loss": 0.5191, "step": 1578 }, { "epoch": 0.13009804729340035, "grad_norm": 4.717446966537538, "learning_rate": 9.967952565783188e-06, "loss": 0.5651, "step": 1579 }, { "epoch": 0.13018043997693005, "grad_norm": 5.2385252642930125, "learning_rate": 9.96787710776602e-06, "loss": 0.5444, "step": 1580 }, { "epoch": 0.13026283266045974, "grad_norm": 4.091715325329443, "learning_rate": 9.967801561303871e-06, "loss": 0.4193, "step": 1581 }, { "epoch": 0.13034522534398946, "grad_norm": 7.045107352433902, "learning_rate": 9.967725926398086e-06, "loss": 0.4062, "step": 1582 }, { "epoch": 0.13042761802751915, "grad_norm": 4.770986545998961, "learning_rate": 9.967650203050007e-06, "loss": 0.5442, "step": 1583 }, { "epoch": 0.13051001071104887, "grad_norm": 4.436604203751775, "learning_rate": 9.967574391260988e-06, "loss": 0.4965, "step": 1584 }, { "epoch": 0.13059240339457856, "grad_norm": 5.81892153360883, "learning_rate": 9.967498491032376e-06, "loss": 0.5432, "step": 1585 }, { "epoch": 0.13067479607810825, "grad_norm": 6.625865132029466, "learning_rate": 9.967422502365523e-06, "loss": 0.6075, "step": 1586 }, { "epoch": 0.13075718876163797, "grad_norm": 5.357243836445354, "learning_rate": 9.96734642526178e-06, "loss": 0.5635, "step": 1587 }, { "epoch": 0.13083958144516766, "grad_norm": 7.170092927283846, "learning_rate": 9.9672702597225e-06, "loss": 0.5262, "step": 1588 }, { "epoch": 0.13092197412869738, "grad_norm": 5.58870075455088, "learning_rate": 9.967194005749045e-06, "loss": 0.5163, "step": 1589 }, { "epoch": 0.13100436681222707, "grad_norm": 3.3901516521405113, "learning_rate": 9.96711766334277e-06, "loss": 0.3337, "step": 1590 }, { "epoch": 0.13108675949575677, "grad_norm": 6.177430383120361, "learning_rate": 9.967041232505032e-06, "loss": 0.6221, "step": 1591 }, { "epoch": 0.13116915217928649, "grad_norm": 3.409414315356522, "learning_rate": 9.966964713237193e-06, "loss": 0.4258, "step": 1592 }, { "epoch": 0.13125154486281618, "grad_norm": 5.796187847217588, "learning_rate": 9.966888105540615e-06, "loss": 0.6415, "step": 1593 }, { "epoch": 0.1313339375463459, "grad_norm": 4.297779577006258, "learning_rate": 9.966811409416664e-06, "loss": 0.353, "step": 1594 }, { "epoch": 0.1314163302298756, "grad_norm": 8.019855525742495, "learning_rate": 9.966734624866702e-06, "loss": 0.6986, "step": 1595 }, { "epoch": 0.13149872291340528, "grad_norm": 4.454829131276674, "learning_rate": 9.966657751892099e-06, "loss": 0.44, "step": 1596 }, { "epoch": 0.131581115596935, "grad_norm": 5.627067380885521, "learning_rate": 9.966580790494222e-06, "loss": 0.5673, "step": 1597 }, { "epoch": 0.1316635082804647, "grad_norm": 16.272684626405372, "learning_rate": 9.96650374067444e-06, "loss": 0.6624, "step": 1598 }, { "epoch": 0.13174590096399438, "grad_norm": 4.793323279968309, "learning_rate": 9.966426602434128e-06, "loss": 0.543, "step": 1599 }, { "epoch": 0.1318282936475241, "grad_norm": 5.433792865346555, "learning_rate": 9.966349375774658e-06, "loss": 0.5756, "step": 1600 }, { "epoch": 0.1319106863310538, "grad_norm": 6.05169655830317, "learning_rate": 9.966272060697403e-06, "loss": 0.5257, "step": 1601 }, { "epoch": 0.13199307901458351, "grad_norm": 28.334925324741096, "learning_rate": 9.966194657203743e-06, "loss": 0.7121, "step": 1602 }, { "epoch": 0.1320754716981132, "grad_norm": 4.439485716182711, "learning_rate": 9.966117165295053e-06, "loss": 0.4213, "step": 1603 }, { "epoch": 0.1321578643816429, "grad_norm": 4.634956482246697, "learning_rate": 9.966039584972713e-06, "loss": 0.5792, "step": 1604 }, { "epoch": 0.13224025706517262, "grad_norm": 8.01022976032216, "learning_rate": 9.965961916238105e-06, "loss": 0.8657, "step": 1605 }, { "epoch": 0.1323226497487023, "grad_norm": 4.263056023595695, "learning_rate": 9.965884159092613e-06, "loss": 0.5201, "step": 1606 }, { "epoch": 0.13240504243223203, "grad_norm": 5.493539454273528, "learning_rate": 9.965806313537618e-06, "loss": 0.551, "step": 1607 }, { "epoch": 0.13248743511576172, "grad_norm": 5.696550440916956, "learning_rate": 9.965728379574508e-06, "loss": 0.6679, "step": 1608 }, { "epoch": 0.1325698277992914, "grad_norm": 5.757476810730032, "learning_rate": 9.965650357204673e-06, "loss": 0.347, "step": 1609 }, { "epoch": 0.13265222048282113, "grad_norm": 3.9736532348314686, "learning_rate": 9.965572246429498e-06, "loss": 0.4657, "step": 1610 }, { "epoch": 0.13273461316635082, "grad_norm": 5.014191491777652, "learning_rate": 9.965494047250374e-06, "loss": 0.5738, "step": 1611 }, { "epoch": 0.13281700584988054, "grad_norm": 4.378594249950214, "learning_rate": 9.965415759668696e-06, "loss": 0.5816, "step": 1612 }, { "epoch": 0.13289939853341023, "grad_norm": 4.370547607797723, "learning_rate": 9.965337383685854e-06, "loss": 0.4593, "step": 1613 }, { "epoch": 0.13298179121693993, "grad_norm": 2.9688769844150844, "learning_rate": 9.965258919303246e-06, "loss": 0.4406, "step": 1614 }, { "epoch": 0.13306418390046965, "grad_norm": 5.547068937775387, "learning_rate": 9.965180366522269e-06, "loss": 0.6537, "step": 1615 }, { "epoch": 0.13314657658399934, "grad_norm": 6.145359120990056, "learning_rate": 9.96510172534432e-06, "loss": 0.5867, "step": 1616 }, { "epoch": 0.13322896926752906, "grad_norm": 5.312644770705092, "learning_rate": 9.9650229957708e-06, "loss": 0.6586, "step": 1617 }, { "epoch": 0.13331136195105875, "grad_norm": 5.562860466575434, "learning_rate": 9.96494417780311e-06, "loss": 0.5398, "step": 1618 }, { "epoch": 0.13339375463458844, "grad_norm": 4.158707385172828, "learning_rate": 9.964865271442656e-06, "loss": 0.3144, "step": 1619 }, { "epoch": 0.13347614731811816, "grad_norm": 4.431346560539185, "learning_rate": 9.964786276690839e-06, "loss": 0.4856, "step": 1620 }, { "epoch": 0.13355854000164785, "grad_norm": 5.449051878207818, "learning_rate": 9.964707193549069e-06, "loss": 0.5363, "step": 1621 }, { "epoch": 0.13364093268517754, "grad_norm": 6.425628126296071, "learning_rate": 9.964628022018748e-06, "loss": 0.7224, "step": 1622 }, { "epoch": 0.13372332536870726, "grad_norm": 7.144634256191189, "learning_rate": 9.964548762101293e-06, "loss": 0.7207, "step": 1623 }, { "epoch": 0.13380571805223695, "grad_norm": 5.800859450448251, "learning_rate": 9.96446941379811e-06, "loss": 0.6978, "step": 1624 }, { "epoch": 0.13388811073576667, "grad_norm": 5.499598864277338, "learning_rate": 9.964389977110613e-06, "loss": 0.4624, "step": 1625 }, { "epoch": 0.13397050341929637, "grad_norm": 4.9609394948407255, "learning_rate": 9.964310452040216e-06, "loss": 0.555, "step": 1626 }, { "epoch": 0.13405289610282606, "grad_norm": 5.42629026019687, "learning_rate": 9.964230838588336e-06, "loss": 0.4247, "step": 1627 }, { "epoch": 0.13413528878635578, "grad_norm": 4.866848300258573, "learning_rate": 9.964151136756391e-06, "loss": 0.5655, "step": 1628 }, { "epoch": 0.13421768146988547, "grad_norm": 4.615561791061624, "learning_rate": 9.964071346545796e-06, "loss": 0.58, "step": 1629 }, { "epoch": 0.1343000741534152, "grad_norm": 6.71642950132474, "learning_rate": 9.963991467957977e-06, "loss": 0.7631, "step": 1630 }, { "epoch": 0.13438246683694488, "grad_norm": 4.315299600438297, "learning_rate": 9.963911500994352e-06, "loss": 0.5401, "step": 1631 }, { "epoch": 0.13446485952047457, "grad_norm": 4.648644316335041, "learning_rate": 9.963831445656345e-06, "loss": 0.5922, "step": 1632 }, { "epoch": 0.1345472522040043, "grad_norm": 3.5272454240753266, "learning_rate": 9.96375130194538e-06, "loss": 0.4294, "step": 1633 }, { "epoch": 0.13462964488753398, "grad_norm": 4.157793884858299, "learning_rate": 9.963671069862891e-06, "loss": 0.3727, "step": 1634 }, { "epoch": 0.1347120375710637, "grad_norm": 5.325914533957815, "learning_rate": 9.9635907494103e-06, "loss": 0.5745, "step": 1635 }, { "epoch": 0.1347944302545934, "grad_norm": 5.069398707440809, "learning_rate": 9.963510340589037e-06, "loss": 0.706, "step": 1636 }, { "epoch": 0.1348768229381231, "grad_norm": 28.812450237360544, "learning_rate": 9.963429843400536e-06, "loss": 0.7662, "step": 1637 }, { "epoch": 0.1349592156216528, "grad_norm": 3.5444323522446624, "learning_rate": 9.963349257846227e-06, "loss": 0.524, "step": 1638 }, { "epoch": 0.1350416083051825, "grad_norm": 3.5539689124915856, "learning_rate": 9.963268583927549e-06, "loss": 0.2982, "step": 1639 }, { "epoch": 0.1351240009887122, "grad_norm": 5.938623235390135, "learning_rate": 9.963187821645934e-06, "loss": 0.7121, "step": 1640 }, { "epoch": 0.1352063936722419, "grad_norm": 6.0354335604105565, "learning_rate": 9.963106971002825e-06, "loss": 0.4654, "step": 1641 }, { "epoch": 0.1352887863557716, "grad_norm": 4.70397412329452, "learning_rate": 9.963026031999657e-06, "loss": 0.4274, "step": 1642 }, { "epoch": 0.13537117903930132, "grad_norm": 10.030510740341603, "learning_rate": 9.96294500463787e-06, "loss": 0.946, "step": 1643 }, { "epoch": 0.135453571722831, "grad_norm": 3.6329870853318207, "learning_rate": 9.96286388891891e-06, "loss": 0.3884, "step": 1644 }, { "epoch": 0.1355359644063607, "grad_norm": 21.36243576092974, "learning_rate": 9.962782684844222e-06, "loss": 0.352, "step": 1645 }, { "epoch": 0.13561835708989042, "grad_norm": 4.951295627734071, "learning_rate": 9.962701392415248e-06, "loss": 0.4897, "step": 1646 }, { "epoch": 0.13570074977342012, "grad_norm": 8.046321604277672, "learning_rate": 9.962620011633437e-06, "loss": 0.535, "step": 1647 }, { "epoch": 0.13578314245694983, "grad_norm": 6.575825142302053, "learning_rate": 9.962538542500237e-06, "loss": 0.309, "step": 1648 }, { "epoch": 0.13586553514047953, "grad_norm": 6.94532275093602, "learning_rate": 9.9624569850171e-06, "loss": 0.6461, "step": 1649 }, { "epoch": 0.13594792782400922, "grad_norm": 4.7357288915786855, "learning_rate": 9.962375339185477e-06, "loss": 0.4357, "step": 1650 }, { "epoch": 0.13603032050753894, "grad_norm": 6.597949073770168, "learning_rate": 9.962293605006824e-06, "loss": 0.6975, "step": 1651 }, { "epoch": 0.13611271319106863, "grad_norm": 4.15014398746701, "learning_rate": 9.962211782482592e-06, "loss": 0.3552, "step": 1652 }, { "epoch": 0.13619510587459835, "grad_norm": 4.809601849964069, "learning_rate": 9.962129871614238e-06, "loss": 0.6046, "step": 1653 }, { "epoch": 0.13627749855812804, "grad_norm": 4.127491331508017, "learning_rate": 9.962047872403225e-06, "loss": 0.4958, "step": 1654 }, { "epoch": 0.13635989124165773, "grad_norm": 4.711346309289919, "learning_rate": 9.961965784851008e-06, "loss": 0.5391, "step": 1655 }, { "epoch": 0.13644228392518745, "grad_norm": 5.386352678608556, "learning_rate": 9.96188360895905e-06, "loss": 0.5528, "step": 1656 }, { "epoch": 0.13652467660871714, "grad_norm": 3.7171000317960674, "learning_rate": 9.961801344728814e-06, "loss": 0.3146, "step": 1657 }, { "epoch": 0.13660706929224684, "grad_norm": 8.89069958620607, "learning_rate": 9.961718992161766e-06, "loss": 0.7587, "step": 1658 }, { "epoch": 0.13668946197577655, "grad_norm": 4.697622743571729, "learning_rate": 9.961636551259372e-06, "loss": 0.5835, "step": 1659 }, { "epoch": 0.13677185465930625, "grad_norm": 4.830696241299008, "learning_rate": 9.961554022023096e-06, "loss": 0.4971, "step": 1660 }, { "epoch": 0.13685424734283597, "grad_norm": 6.104707702622647, "learning_rate": 9.961471404454412e-06, "loss": 0.7071, "step": 1661 }, { "epoch": 0.13693664002636566, "grad_norm": 4.34172790300022, "learning_rate": 9.961388698554788e-06, "loss": 0.5556, "step": 1662 }, { "epoch": 0.13701903270989535, "grad_norm": 4.298900363667948, "learning_rate": 9.961305904325698e-06, "loss": 0.5294, "step": 1663 }, { "epoch": 0.13710142539342507, "grad_norm": 4.954490101460579, "learning_rate": 9.961223021768616e-06, "loss": 0.6465, "step": 1664 }, { "epoch": 0.13718381807695476, "grad_norm": 3.726228481637675, "learning_rate": 9.961140050885014e-06, "loss": 0.631, "step": 1665 }, { "epoch": 0.13726621076048448, "grad_norm": 4.358895727873771, "learning_rate": 9.961056991676374e-06, "loss": 0.3122, "step": 1666 }, { "epoch": 0.13734860344401417, "grad_norm": 3.7045601167381945, "learning_rate": 9.960973844144173e-06, "loss": 0.4447, "step": 1667 }, { "epoch": 0.13743099612754386, "grad_norm": 4.571967455353021, "learning_rate": 9.960890608289892e-06, "loss": 0.4863, "step": 1668 }, { "epoch": 0.13751338881107358, "grad_norm": 5.0825669590129205, "learning_rate": 9.96080728411501e-06, "loss": 0.6621, "step": 1669 }, { "epoch": 0.13759578149460328, "grad_norm": 5.845767088881154, "learning_rate": 9.960723871621015e-06, "loss": 0.7493, "step": 1670 }, { "epoch": 0.137678174178133, "grad_norm": 13.232880223960649, "learning_rate": 9.960640370809386e-06, "loss": 0.8361, "step": 1671 }, { "epoch": 0.1377605668616627, "grad_norm": 3.393345594707139, "learning_rate": 9.960556781681617e-06, "loss": 0.4943, "step": 1672 }, { "epoch": 0.13784295954519238, "grad_norm": 4.815367039541761, "learning_rate": 9.960473104239188e-06, "loss": 0.5859, "step": 1673 }, { "epoch": 0.1379253522287221, "grad_norm": 25.934815306172943, "learning_rate": 9.960389338483595e-06, "loss": 0.5897, "step": 1674 }, { "epoch": 0.1380077449122518, "grad_norm": 24.48503833988488, "learning_rate": 9.960305484416329e-06, "loss": 0.4866, "step": 1675 }, { "epoch": 0.13809013759578148, "grad_norm": 4.687506824918209, "learning_rate": 9.96022154203888e-06, "loss": 0.6256, "step": 1676 }, { "epoch": 0.1381725302793112, "grad_norm": 4.850750355427269, "learning_rate": 9.960137511352743e-06, "loss": 0.6337, "step": 1677 }, { "epoch": 0.1382549229628409, "grad_norm": 3.8668299030247524, "learning_rate": 9.960053392359415e-06, "loss": 0.469, "step": 1678 }, { "epoch": 0.1383373156463706, "grad_norm": 6.853909013979112, "learning_rate": 9.959969185060393e-06, "loss": 0.6324, "step": 1679 }, { "epoch": 0.1384197083299003, "grad_norm": 6.736126149335651, "learning_rate": 9.959884889457176e-06, "loss": 0.562, "step": 1680 }, { "epoch": 0.13850210101343, "grad_norm": 3.2727359868331862, "learning_rate": 9.959800505551266e-06, "loss": 0.3456, "step": 1681 }, { "epoch": 0.13858449369695972, "grad_norm": 5.234071812547031, "learning_rate": 9.959716033344164e-06, "loss": 0.5409, "step": 1682 }, { "epoch": 0.1386668863804894, "grad_norm": 6.251936957368742, "learning_rate": 9.959631472837376e-06, "loss": 0.5655, "step": 1683 }, { "epoch": 0.13874927906401913, "grad_norm": 5.288184313366442, "learning_rate": 9.959546824032404e-06, "loss": 0.5368, "step": 1684 }, { "epoch": 0.13883167174754882, "grad_norm": 6.679955478940586, "learning_rate": 9.959462086930757e-06, "loss": 0.6028, "step": 1685 }, { "epoch": 0.1389140644310785, "grad_norm": 5.000778668824838, "learning_rate": 9.959377261533945e-06, "loss": 0.5867, "step": 1686 }, { "epoch": 0.13899645711460823, "grad_norm": 5.120889487517192, "learning_rate": 9.959292347843476e-06, "loss": 0.5128, "step": 1687 }, { "epoch": 0.13907884979813792, "grad_norm": 6.017331185338668, "learning_rate": 9.959207345860863e-06, "loss": 0.8164, "step": 1688 }, { "epoch": 0.13916124248166764, "grad_norm": 4.242760533926133, "learning_rate": 9.959122255587617e-06, "loss": 0.4745, "step": 1689 }, { "epoch": 0.13924363516519733, "grad_norm": 6.532137559532413, "learning_rate": 9.959037077025256e-06, "loss": 0.6932, "step": 1690 }, { "epoch": 0.13932602784872702, "grad_norm": 5.970476933345833, "learning_rate": 9.958951810175294e-06, "loss": 0.5707, "step": 1691 }, { "epoch": 0.13940842053225674, "grad_norm": 4.783279541756054, "learning_rate": 9.958866455039253e-06, "loss": 0.4375, "step": 1692 }, { "epoch": 0.13949081321578644, "grad_norm": 5.1671883616236025, "learning_rate": 9.958781011618648e-06, "loss": 0.5305, "step": 1693 }, { "epoch": 0.13957320589931613, "grad_norm": 4.396863523163892, "learning_rate": 9.958695479915002e-06, "loss": 0.4693, "step": 1694 }, { "epoch": 0.13965559858284585, "grad_norm": 6.13192250177938, "learning_rate": 9.958609859929836e-06, "loss": 0.7002, "step": 1695 }, { "epoch": 0.13973799126637554, "grad_norm": 4.437325147612999, "learning_rate": 9.958524151664677e-06, "loss": 0.3917, "step": 1696 }, { "epoch": 0.13982038394990526, "grad_norm": 5.070019626853925, "learning_rate": 9.958438355121052e-06, "loss": 0.4742, "step": 1697 }, { "epoch": 0.13990277663343495, "grad_norm": 6.083396673742178, "learning_rate": 9.958352470300485e-06, "loss": 0.621, "step": 1698 }, { "epoch": 0.13998516931696464, "grad_norm": 6.384051434280243, "learning_rate": 9.958266497204506e-06, "loss": 0.5026, "step": 1699 }, { "epoch": 0.14006756200049436, "grad_norm": 4.010479307928655, "learning_rate": 9.958180435834646e-06, "loss": 0.4158, "step": 1700 }, { "epoch": 0.14014995468402405, "grad_norm": 4.719623857149445, "learning_rate": 9.958094286192437e-06, "loss": 0.4985, "step": 1701 }, { "epoch": 0.14023234736755377, "grad_norm": 5.426958394750245, "learning_rate": 9.958008048279413e-06, "loss": 0.5531, "step": 1702 }, { "epoch": 0.14031474005108346, "grad_norm": 5.08856362765558, "learning_rate": 9.95792172209711e-06, "loss": 0.3943, "step": 1703 }, { "epoch": 0.14039713273461316, "grad_norm": 5.638213422297256, "learning_rate": 9.957835307647063e-06, "loss": 0.6932, "step": 1704 }, { "epoch": 0.14047952541814288, "grad_norm": 6.63185919475462, "learning_rate": 9.957748804930813e-06, "loss": 0.6148, "step": 1705 }, { "epoch": 0.14056191810167257, "grad_norm": 7.754221383064333, "learning_rate": 9.9576622139499e-06, "loss": 0.7738, "step": 1706 }, { "epoch": 0.1406443107852023, "grad_norm": 5.15021907426077, "learning_rate": 9.957575534705861e-06, "loss": 0.3595, "step": 1707 }, { "epoch": 0.14072670346873198, "grad_norm": 7.00492373956156, "learning_rate": 9.957488767200246e-06, "loss": 0.6592, "step": 1708 }, { "epoch": 0.14080909615226167, "grad_norm": 6.8725037736438885, "learning_rate": 9.957401911434594e-06, "loss": 0.6738, "step": 1709 }, { "epoch": 0.1408914888357914, "grad_norm": 5.474394546552511, "learning_rate": 9.957314967410455e-06, "loss": 0.4472, "step": 1710 }, { "epoch": 0.14097388151932108, "grad_norm": 6.395917661111817, "learning_rate": 9.957227935129374e-06, "loss": 0.781, "step": 1711 }, { "epoch": 0.14105627420285077, "grad_norm": 5.013972013359134, "learning_rate": 9.957140814592901e-06, "loss": 0.5114, "step": 1712 }, { "epoch": 0.1411386668863805, "grad_norm": 5.229649358585399, "learning_rate": 9.95705360580259e-06, "loss": 0.7029, "step": 1713 }, { "epoch": 0.14122105956991018, "grad_norm": 4.092719529026001, "learning_rate": 9.956966308759993e-06, "loss": 0.3894, "step": 1714 }, { "epoch": 0.1413034522534399, "grad_norm": 5.2989880902352136, "learning_rate": 9.95687892346666e-06, "loss": 0.6119, "step": 1715 }, { "epoch": 0.1413858449369696, "grad_norm": 3.6082549788649962, "learning_rate": 9.95679144992415e-06, "loss": 0.2587, "step": 1716 }, { "epoch": 0.1414682376204993, "grad_norm": 4.328630549185103, "learning_rate": 9.95670388813402e-06, "loss": 0.4679, "step": 1717 }, { "epoch": 0.141550630304029, "grad_norm": 6.666267680234917, "learning_rate": 9.95661623809783e-06, "loss": 0.6764, "step": 1718 }, { "epoch": 0.1416330229875587, "grad_norm": 6.030588485073613, "learning_rate": 9.956528499817137e-06, "loss": 0.5958, "step": 1719 }, { "epoch": 0.14171541567108842, "grad_norm": 3.697734271168265, "learning_rate": 9.956440673293508e-06, "loss": 0.3724, "step": 1720 }, { "epoch": 0.1417978083546181, "grad_norm": 5.078205684000341, "learning_rate": 9.956352758528501e-06, "loss": 0.4152, "step": 1721 }, { "epoch": 0.1418802010381478, "grad_norm": 5.998477976751115, "learning_rate": 9.956264755523687e-06, "loss": 0.4393, "step": 1722 }, { "epoch": 0.14196259372167752, "grad_norm": 4.1617349318422105, "learning_rate": 9.956176664280628e-06, "loss": 0.5035, "step": 1723 }, { "epoch": 0.1420449864052072, "grad_norm": 5.193159222144419, "learning_rate": 9.956088484800895e-06, "loss": 0.5345, "step": 1724 }, { "epoch": 0.14212737908873693, "grad_norm": 4.085571835755461, "learning_rate": 9.956000217086055e-06, "loss": 0.5145, "step": 1725 }, { "epoch": 0.14220977177226662, "grad_norm": 8.724102472644716, "learning_rate": 9.955911861137683e-06, "loss": 0.7727, "step": 1726 }, { "epoch": 0.14229216445579632, "grad_norm": 5.455750786296661, "learning_rate": 9.95582341695735e-06, "loss": 0.519, "step": 1727 }, { "epoch": 0.14237455713932604, "grad_norm": 7.3612630042618, "learning_rate": 9.955734884546632e-06, "loss": 0.9181, "step": 1728 }, { "epoch": 0.14245694982285573, "grad_norm": 6.860122301504194, "learning_rate": 9.955646263907103e-06, "loss": 0.7269, "step": 1729 }, { "epoch": 0.14253934250638542, "grad_norm": 5.246035437947141, "learning_rate": 9.955557555040344e-06, "loss": 0.6018, "step": 1730 }, { "epoch": 0.14262173518991514, "grad_norm": 4.620747795169099, "learning_rate": 9.95546875794793e-06, "loss": 0.4273, "step": 1731 }, { "epoch": 0.14270412787344483, "grad_norm": 4.879100737888411, "learning_rate": 9.955379872631447e-06, "loss": 0.4053, "step": 1732 }, { "epoch": 0.14278652055697455, "grad_norm": 5.1781106148154965, "learning_rate": 9.955290899092473e-06, "loss": 0.5273, "step": 1733 }, { "epoch": 0.14286891324050424, "grad_norm": 5.28098812734887, "learning_rate": 9.955201837332592e-06, "loss": 0.4492, "step": 1734 }, { "epoch": 0.14295130592403393, "grad_norm": 4.588098614679162, "learning_rate": 9.955112687353395e-06, "loss": 0.6444, "step": 1735 }, { "epoch": 0.14303369860756365, "grad_norm": 6.807890987436337, "learning_rate": 9.955023449156464e-06, "loss": 0.8301, "step": 1736 }, { "epoch": 0.14311609129109334, "grad_norm": 6.19317980765747, "learning_rate": 9.95493412274339e-06, "loss": 0.644, "step": 1737 }, { "epoch": 0.14319848397462306, "grad_norm": 4.804293638065614, "learning_rate": 9.954844708115761e-06, "loss": 0.3949, "step": 1738 }, { "epoch": 0.14328087665815276, "grad_norm": 5.446810889833297, "learning_rate": 9.95475520527517e-06, "loss": 0.4823, "step": 1739 }, { "epoch": 0.14336326934168245, "grad_norm": 5.587157787333849, "learning_rate": 9.954665614223212e-06, "loss": 0.3342, "step": 1740 }, { "epoch": 0.14344566202521217, "grad_norm": 6.671984779337379, "learning_rate": 9.954575934961482e-06, "loss": 0.4438, "step": 1741 }, { "epoch": 0.14352805470874186, "grad_norm": 6.720067281421165, "learning_rate": 9.954486167491574e-06, "loss": 0.6546, "step": 1742 }, { "epoch": 0.14361044739227158, "grad_norm": 4.310942822266015, "learning_rate": 9.954396311815088e-06, "loss": 0.3683, "step": 1743 }, { "epoch": 0.14369284007580127, "grad_norm": 6.662276529673106, "learning_rate": 9.954306367933623e-06, "loss": 0.7405, "step": 1744 }, { "epoch": 0.14377523275933096, "grad_norm": 5.668799637679965, "learning_rate": 9.954216335848781e-06, "loss": 0.5108, "step": 1745 }, { "epoch": 0.14385762544286068, "grad_norm": 11.70486674091479, "learning_rate": 9.954126215562165e-06, "loss": 0.5593, "step": 1746 }, { "epoch": 0.14394001812639037, "grad_norm": 4.7414430921442285, "learning_rate": 9.954036007075378e-06, "loss": 0.6503, "step": 1747 }, { "epoch": 0.1440224108099201, "grad_norm": 5.383455910246553, "learning_rate": 9.953945710390029e-06, "loss": 0.534, "step": 1748 }, { "epoch": 0.14410480349344978, "grad_norm": 5.207132564825904, "learning_rate": 9.953855325507723e-06, "loss": 0.6014, "step": 1749 }, { "epoch": 0.14418719617697948, "grad_norm": 4.522652655624019, "learning_rate": 9.95376485243007e-06, "loss": 0.561, "step": 1750 }, { "epoch": 0.1442695888605092, "grad_norm": 3.575322962748732, "learning_rate": 9.95367429115868e-06, "loss": 0.3608, "step": 1751 }, { "epoch": 0.1443519815440389, "grad_norm": 3.799989212310609, "learning_rate": 9.953583641695163e-06, "loss": 0.5892, "step": 1752 }, { "epoch": 0.14443437422756858, "grad_norm": 4.154410303606598, "learning_rate": 9.95349290404114e-06, "loss": 0.6148, "step": 1753 }, { "epoch": 0.1445167669110983, "grad_norm": 4.526476783100888, "learning_rate": 9.95340207819822e-06, "loss": 0.5512, "step": 1754 }, { "epoch": 0.144599159594628, "grad_norm": 5.470982965204849, "learning_rate": 9.953311164168023e-06, "loss": 0.6535, "step": 1755 }, { "epoch": 0.1446815522781577, "grad_norm": 6.153122788881867, "learning_rate": 9.953220161952165e-06, "loss": 0.5768, "step": 1756 }, { "epoch": 0.1447639449616874, "grad_norm": 3.888887680481176, "learning_rate": 9.95312907155227e-06, "loss": 0.3174, "step": 1757 }, { "epoch": 0.1448463376452171, "grad_norm": 5.669629872152485, "learning_rate": 9.953037892969957e-06, "loss": 0.6727, "step": 1758 }, { "epoch": 0.1449287303287468, "grad_norm": 35.771339221273166, "learning_rate": 9.952946626206848e-06, "loss": 1.7314, "step": 1759 }, { "epoch": 0.1450111230122765, "grad_norm": 4.3289004927992725, "learning_rate": 9.952855271264573e-06, "loss": 0.5573, "step": 1760 }, { "epoch": 0.14509351569580622, "grad_norm": 3.799289594497197, "learning_rate": 9.952763828144752e-06, "loss": 0.3963, "step": 1761 }, { "epoch": 0.14517590837933592, "grad_norm": 5.288650375462546, "learning_rate": 9.952672296849017e-06, "loss": 0.475, "step": 1762 }, { "epoch": 0.1452583010628656, "grad_norm": 3.9972186574876734, "learning_rate": 9.952580677378998e-06, "loss": 0.5127, "step": 1763 }, { "epoch": 0.14534069374639533, "grad_norm": 4.3332751493193475, "learning_rate": 9.952488969736324e-06, "loss": 0.5247, "step": 1764 }, { "epoch": 0.14542308642992502, "grad_norm": 4.195892689128413, "learning_rate": 9.952397173922629e-06, "loss": 0.3199, "step": 1765 }, { "epoch": 0.14550547911345474, "grad_norm": 5.722985806247223, "learning_rate": 9.952305289939545e-06, "loss": 0.6056, "step": 1766 }, { "epoch": 0.14558787179698443, "grad_norm": 6.368312651719165, "learning_rate": 9.952213317788713e-06, "loss": 0.5713, "step": 1767 }, { "epoch": 0.14567026448051412, "grad_norm": 4.5910410814721585, "learning_rate": 9.952121257471765e-06, "loss": 0.5231, "step": 1768 }, { "epoch": 0.14575265716404384, "grad_norm": 4.149070439095053, "learning_rate": 9.952029108990341e-06, "loss": 0.4691, "step": 1769 }, { "epoch": 0.14583504984757353, "grad_norm": 6.698020175906564, "learning_rate": 9.951936872346084e-06, "loss": 0.6816, "step": 1770 }, { "epoch": 0.14591744253110323, "grad_norm": 4.448335960350891, "learning_rate": 9.951844547540634e-06, "loss": 0.5778, "step": 1771 }, { "epoch": 0.14599983521463294, "grad_norm": 5.697992048620652, "learning_rate": 9.951752134575636e-06, "loss": 0.5035, "step": 1772 }, { "epoch": 0.14608222789816264, "grad_norm": 6.626396310224672, "learning_rate": 9.951659633452735e-06, "loss": 0.6606, "step": 1773 }, { "epoch": 0.14616462058169236, "grad_norm": 4.046239182291733, "learning_rate": 9.951567044173577e-06, "loss": 0.3849, "step": 1774 }, { "epoch": 0.14624701326522205, "grad_norm": 3.688760782701429, "learning_rate": 9.951474366739811e-06, "loss": 0.4589, "step": 1775 }, { "epoch": 0.14632940594875174, "grad_norm": 4.790126428067029, "learning_rate": 9.951381601153087e-06, "loss": 0.6115, "step": 1776 }, { "epoch": 0.14641179863228146, "grad_norm": 6.205649281121326, "learning_rate": 9.951288747415055e-06, "loss": 0.5969, "step": 1777 }, { "epoch": 0.14649419131581115, "grad_norm": 3.3187533632736397, "learning_rate": 9.95119580552737e-06, "loss": 0.5073, "step": 1778 }, { "epoch": 0.14657658399934087, "grad_norm": 3.258398469094614, "learning_rate": 9.95110277549169e-06, "loss": 0.2559, "step": 1779 }, { "epoch": 0.14665897668287056, "grad_norm": 4.3933466019017375, "learning_rate": 9.951009657309664e-06, "loss": 0.4237, "step": 1780 }, { "epoch": 0.14674136936640025, "grad_norm": 5.039755273960905, "learning_rate": 9.950916450982954e-06, "loss": 0.5285, "step": 1781 }, { "epoch": 0.14682376204992997, "grad_norm": 5.052820158208282, "learning_rate": 9.95082315651322e-06, "loss": 0.5216, "step": 1782 }, { "epoch": 0.14690615473345967, "grad_norm": 7.221433965049104, "learning_rate": 9.950729773902119e-06, "loss": 0.7875, "step": 1783 }, { "epoch": 0.14698854741698938, "grad_norm": 4.022936489503606, "learning_rate": 9.950636303151318e-06, "loss": 0.3333, "step": 1784 }, { "epoch": 0.14707094010051908, "grad_norm": 4.202488842137441, "learning_rate": 9.950542744262478e-06, "loss": 0.3781, "step": 1785 }, { "epoch": 0.14715333278404877, "grad_norm": 7.63848412977035, "learning_rate": 9.950449097237268e-06, "loss": 0.6791, "step": 1786 }, { "epoch": 0.1472357254675785, "grad_norm": 4.718688788366161, "learning_rate": 9.950355362077351e-06, "loss": 0.5764, "step": 1787 }, { "epoch": 0.14731811815110818, "grad_norm": 7.447234767381768, "learning_rate": 9.950261538784399e-06, "loss": 0.4469, "step": 1788 }, { "epoch": 0.14740051083463787, "grad_norm": 5.095202612388618, "learning_rate": 9.950167627360078e-06, "loss": 0.6372, "step": 1789 }, { "epoch": 0.1474829035181676, "grad_norm": 7.204391060200845, "learning_rate": 9.950073627806068e-06, "loss": 0.4491, "step": 1790 }, { "epoch": 0.14756529620169728, "grad_norm": 5.269735640638179, "learning_rate": 9.949979540124036e-06, "loss": 0.7009, "step": 1791 }, { "epoch": 0.147647688885227, "grad_norm": 5.921433725601925, "learning_rate": 9.949885364315659e-06, "loss": 0.3743, "step": 1792 }, { "epoch": 0.1477300815687567, "grad_norm": 4.473723831399909, "learning_rate": 9.949791100382613e-06, "loss": 0.5765, "step": 1793 }, { "epoch": 0.14781247425228639, "grad_norm": 4.15202235170927, "learning_rate": 9.949696748326576e-06, "loss": 0.4384, "step": 1794 }, { "epoch": 0.1478948669358161, "grad_norm": 5.1563131436767335, "learning_rate": 9.94960230814923e-06, "loss": 0.4914, "step": 1795 }, { "epoch": 0.1479772596193458, "grad_norm": 7.2266024173875, "learning_rate": 9.949507779852255e-06, "loss": 0.8423, "step": 1796 }, { "epoch": 0.14805965230287552, "grad_norm": 3.6509897775167817, "learning_rate": 9.949413163437334e-06, "loss": 0.2087, "step": 1797 }, { "epoch": 0.1481420449864052, "grad_norm": 4.574456787530481, "learning_rate": 9.94931845890615e-06, "loss": 0.4738, "step": 1798 }, { "epoch": 0.1482244376699349, "grad_norm": 5.648889619706296, "learning_rate": 9.949223666260391e-06, "loss": 0.5997, "step": 1799 }, { "epoch": 0.14830683035346462, "grad_norm": 5.202785968386703, "learning_rate": 9.949128785501744e-06, "loss": 0.3894, "step": 1800 }, { "epoch": 0.1483892230369943, "grad_norm": 10.997911457737233, "learning_rate": 9.949033816631897e-06, "loss": 0.6417, "step": 1801 }, { "epoch": 0.14847161572052403, "grad_norm": 4.6037373374277015, "learning_rate": 9.948938759652545e-06, "loss": 0.4054, "step": 1802 }, { "epoch": 0.14855400840405372, "grad_norm": 6.562807171790634, "learning_rate": 9.948843614565373e-06, "loss": 0.5643, "step": 1803 }, { "epoch": 0.14863640108758341, "grad_norm": 4.45858174996238, "learning_rate": 9.948748381372081e-06, "loss": 0.4779, "step": 1804 }, { "epoch": 0.14871879377111313, "grad_norm": 5.1171416882597915, "learning_rate": 9.948653060074365e-06, "loss": 0.6325, "step": 1805 }, { "epoch": 0.14880118645464283, "grad_norm": 5.1648545519748765, "learning_rate": 9.948557650673917e-06, "loss": 0.6289, "step": 1806 }, { "epoch": 0.14888357913817252, "grad_norm": 3.3249976860584125, "learning_rate": 9.94846215317244e-06, "loss": 0.5099, "step": 1807 }, { "epoch": 0.14896597182170224, "grad_norm": 4.210265252795672, "learning_rate": 9.94836656757163e-06, "loss": 0.3661, "step": 1808 }, { "epoch": 0.14904836450523193, "grad_norm": 5.856159591100659, "learning_rate": 9.948270893873194e-06, "loss": 0.6683, "step": 1809 }, { "epoch": 0.14913075718876165, "grad_norm": 5.665378850590255, "learning_rate": 9.94817513207883e-06, "loss": 0.4274, "step": 1810 }, { "epoch": 0.14921314987229134, "grad_norm": 3.2139105645087644, "learning_rate": 9.948079282190246e-06, "loss": 0.3374, "step": 1811 }, { "epoch": 0.14929554255582103, "grad_norm": 4.41472261330769, "learning_rate": 9.947983344209149e-06, "loss": 0.5235, "step": 1812 }, { "epoch": 0.14937793523935075, "grad_norm": 5.0108760558021235, "learning_rate": 9.947887318137246e-06, "loss": 0.4894, "step": 1813 }, { "epoch": 0.14946032792288044, "grad_norm": 4.076125168503218, "learning_rate": 9.947791203976246e-06, "loss": 0.486, "step": 1814 }, { "epoch": 0.14954272060641016, "grad_norm": 4.9262153367813175, "learning_rate": 9.94769500172786e-06, "loss": 0.4339, "step": 1815 }, { "epoch": 0.14962511328993985, "grad_norm": 4.823936819797354, "learning_rate": 9.947598711393803e-06, "loss": 0.5129, "step": 1816 }, { "epoch": 0.14970750597346955, "grad_norm": 7.422686319765741, "learning_rate": 9.947502332975785e-06, "loss": 0.724, "step": 1817 }, { "epoch": 0.14978989865699927, "grad_norm": 5.069308352660441, "learning_rate": 9.947405866475526e-06, "loss": 0.3606, "step": 1818 }, { "epoch": 0.14987229134052896, "grad_norm": 4.817079842052196, "learning_rate": 9.947309311894741e-06, "loss": 0.6129, "step": 1819 }, { "epoch": 0.14995468402405868, "grad_norm": 4.409288579092636, "learning_rate": 9.947212669235151e-06, "loss": 0.3029, "step": 1820 }, { "epoch": 0.15003707670758837, "grad_norm": 4.088287504533524, "learning_rate": 9.947115938498475e-06, "loss": 0.3747, "step": 1821 }, { "epoch": 0.15011946939111806, "grad_norm": 4.526831599614096, "learning_rate": 9.947019119686437e-06, "loss": 0.5938, "step": 1822 }, { "epoch": 0.15020186207464778, "grad_norm": 5.180661766219478, "learning_rate": 9.946922212800758e-06, "loss": 0.5274, "step": 1823 }, { "epoch": 0.15028425475817747, "grad_norm": 4.075075934628573, "learning_rate": 9.946825217843165e-06, "loss": 0.5151, "step": 1824 }, { "epoch": 0.15036664744170716, "grad_norm": 4.371753027926906, "learning_rate": 9.946728134815384e-06, "loss": 0.3841, "step": 1825 }, { "epoch": 0.15044904012523688, "grad_norm": 7.004245095318552, "learning_rate": 9.946630963719143e-06, "loss": 0.7213, "step": 1826 }, { "epoch": 0.15053143280876657, "grad_norm": 4.695580060254283, "learning_rate": 9.946533704556174e-06, "loss": 0.4254, "step": 1827 }, { "epoch": 0.1506138254922963, "grad_norm": 6.379558882024517, "learning_rate": 9.946436357328208e-06, "loss": 0.4716, "step": 1828 }, { "epoch": 0.15069621817582599, "grad_norm": 30.181991180317187, "learning_rate": 9.946338922036977e-06, "loss": 1.3403, "step": 1829 }, { "epoch": 0.15077861085935568, "grad_norm": 4.760372710692016, "learning_rate": 9.946241398684216e-06, "loss": 0.6688, "step": 1830 }, { "epoch": 0.1508610035428854, "grad_norm": 29.77514513346913, "learning_rate": 9.94614378727166e-06, "loss": 1.0769, "step": 1831 }, { "epoch": 0.1509433962264151, "grad_norm": 5.446400107754182, "learning_rate": 9.946046087801052e-06, "loss": 0.4928, "step": 1832 }, { "epoch": 0.1510257889099448, "grad_norm": 4.18231130411727, "learning_rate": 9.945948300274124e-06, "loss": 0.5164, "step": 1833 }, { "epoch": 0.1511081815934745, "grad_norm": 9.784576516645163, "learning_rate": 9.945850424692622e-06, "loss": 0.7336, "step": 1834 }, { "epoch": 0.1511905742770042, "grad_norm": 4.280452136926324, "learning_rate": 9.945752461058286e-06, "loss": 0.5356, "step": 1835 }, { "epoch": 0.1512729669605339, "grad_norm": 4.514588928723341, "learning_rate": 9.945654409372861e-06, "loss": 0.3138, "step": 1836 }, { "epoch": 0.1513553596440636, "grad_norm": 4.977510975280522, "learning_rate": 9.945556269638095e-06, "loss": 0.6125, "step": 1837 }, { "epoch": 0.15143775232759332, "grad_norm": 3.7210768337208595, "learning_rate": 9.945458041855732e-06, "loss": 0.6259, "step": 1838 }, { "epoch": 0.15152014501112301, "grad_norm": 3.836298613785647, "learning_rate": 9.94535972602752e-06, "loss": 0.4432, "step": 1839 }, { "epoch": 0.1516025376946527, "grad_norm": 8.066856187714102, "learning_rate": 9.945261322155213e-06, "loss": 0.8116, "step": 1840 }, { "epoch": 0.15168493037818243, "grad_norm": 4.468142883297208, "learning_rate": 9.94516283024056e-06, "loss": 0.4872, "step": 1841 }, { "epoch": 0.15176732306171212, "grad_norm": 5.892106263712281, "learning_rate": 9.945064250285318e-06, "loss": 0.7393, "step": 1842 }, { "epoch": 0.1518497157452418, "grad_norm": 5.748773653446861, "learning_rate": 9.944965582291236e-06, "loss": 0.491, "step": 1843 }, { "epoch": 0.15193210842877153, "grad_norm": 6.267061350789159, "learning_rate": 9.944866826260076e-06, "loss": 0.5588, "step": 1844 }, { "epoch": 0.15201450111230122, "grad_norm": 5.56231366013553, "learning_rate": 9.944767982193595e-06, "loss": 0.392, "step": 1845 }, { "epoch": 0.15209689379583094, "grad_norm": 5.383200889814545, "learning_rate": 9.944669050093552e-06, "loss": 0.6692, "step": 1846 }, { "epoch": 0.15217928647936063, "grad_norm": 6.870277257184773, "learning_rate": 9.944570029961706e-06, "loss": 0.7725, "step": 1847 }, { "epoch": 0.15226167916289032, "grad_norm": 4.559517814413792, "learning_rate": 9.944470921799825e-06, "loss": 0.6481, "step": 1848 }, { "epoch": 0.15234407184642004, "grad_norm": 4.74898634616392, "learning_rate": 9.944371725609671e-06, "loss": 0.5648, "step": 1849 }, { "epoch": 0.15242646452994973, "grad_norm": 4.860513869955498, "learning_rate": 9.944272441393008e-06, "loss": 0.6931, "step": 1850 }, { "epoch": 0.15250885721347945, "grad_norm": 4.3722520697227205, "learning_rate": 9.944173069151609e-06, "loss": 0.3393, "step": 1851 }, { "epoch": 0.15259124989700915, "grad_norm": 5.0443736184423065, "learning_rate": 9.944073608887235e-06, "loss": 0.5772, "step": 1852 }, { "epoch": 0.15267364258053884, "grad_norm": 8.549939972371906, "learning_rate": 9.943974060601664e-06, "loss": 0.5043, "step": 1853 }, { "epoch": 0.15275603526406856, "grad_norm": 4.757709984298188, "learning_rate": 9.943874424296666e-06, "loss": 0.6423, "step": 1854 }, { "epoch": 0.15283842794759825, "grad_norm": 5.4002386456791305, "learning_rate": 9.943774699974014e-06, "loss": 0.5686, "step": 1855 }, { "epoch": 0.15292082063112797, "grad_norm": 4.215127806823176, "learning_rate": 9.943674887635483e-06, "loss": 0.5367, "step": 1856 }, { "epoch": 0.15300321331465766, "grad_norm": 4.274941922810144, "learning_rate": 9.943574987282853e-06, "loss": 0.5136, "step": 1857 }, { "epoch": 0.15308560599818735, "grad_norm": 5.007470901292181, "learning_rate": 9.943474998917899e-06, "loss": 0.4348, "step": 1858 }, { "epoch": 0.15316799868171707, "grad_norm": 4.406761750552184, "learning_rate": 9.943374922542403e-06, "loss": 0.5084, "step": 1859 }, { "epoch": 0.15325039136524676, "grad_norm": 5.414163225734359, "learning_rate": 9.943274758158146e-06, "loss": 0.545, "step": 1860 }, { "epoch": 0.15333278404877645, "grad_norm": 4.477228911899572, "learning_rate": 9.943174505766912e-06, "loss": 0.5127, "step": 1861 }, { "epoch": 0.15341517673230617, "grad_norm": 5.4286296996844134, "learning_rate": 9.943074165370486e-06, "loss": 0.5424, "step": 1862 }, { "epoch": 0.15349756941583587, "grad_norm": 7.8227006152958625, "learning_rate": 9.94297373697065e-06, "loss": 0.6181, "step": 1863 }, { "epoch": 0.15357996209936559, "grad_norm": 4.58280474696728, "learning_rate": 9.942873220569201e-06, "loss": 0.5705, "step": 1864 }, { "epoch": 0.15366235478289528, "grad_norm": 5.000809086425357, "learning_rate": 9.942772616167921e-06, "loss": 0.5671, "step": 1865 }, { "epoch": 0.15374474746642497, "grad_norm": 4.052269078249802, "learning_rate": 9.942671923768604e-06, "loss": 0.5368, "step": 1866 }, { "epoch": 0.1538271401499547, "grad_norm": 4.354386031164391, "learning_rate": 9.942571143373041e-06, "loss": 0.4506, "step": 1867 }, { "epoch": 0.15390953283348438, "grad_norm": 5.323874342321902, "learning_rate": 9.942470274983029e-06, "loss": 0.5499, "step": 1868 }, { "epoch": 0.1539919255170141, "grad_norm": 4.166290067353925, "learning_rate": 9.94236931860036e-06, "loss": 0.57, "step": 1869 }, { "epoch": 0.1540743182005438, "grad_norm": 4.663093189419887, "learning_rate": 9.942268274226836e-06, "loss": 0.6149, "step": 1870 }, { "epoch": 0.15415671088407348, "grad_norm": 5.011905916587221, "learning_rate": 9.942167141864252e-06, "loss": 0.5146, "step": 1871 }, { "epoch": 0.1542391035676032, "grad_norm": 3.684085210483375, "learning_rate": 9.94206592151441e-06, "loss": 0.471, "step": 1872 }, { "epoch": 0.1543214962511329, "grad_norm": 4.529894364792616, "learning_rate": 9.941964613179113e-06, "loss": 0.6402, "step": 1873 }, { "epoch": 0.15440388893466261, "grad_norm": 5.379119541477112, "learning_rate": 9.941863216860161e-06, "loss": 0.4688, "step": 1874 }, { "epoch": 0.1544862816181923, "grad_norm": 4.711252577285658, "learning_rate": 9.941761732559365e-06, "loss": 0.4731, "step": 1875 }, { "epoch": 0.154568674301722, "grad_norm": 4.627102248023416, "learning_rate": 9.941660160278526e-06, "loss": 0.5882, "step": 1876 }, { "epoch": 0.15465106698525172, "grad_norm": 4.784269516867435, "learning_rate": 9.941558500019458e-06, "loss": 0.5784, "step": 1877 }, { "epoch": 0.1547334596687814, "grad_norm": 3.964863589286543, "learning_rate": 9.941456751783965e-06, "loss": 0.4781, "step": 1878 }, { "epoch": 0.15481585235231113, "grad_norm": 3.7942935218449954, "learning_rate": 9.941354915573863e-06, "loss": 0.3864, "step": 1879 }, { "epoch": 0.15489824503584082, "grad_norm": 3.7341181608097385, "learning_rate": 9.941252991390961e-06, "loss": 0.4249, "step": 1880 }, { "epoch": 0.1549806377193705, "grad_norm": 5.551866143123865, "learning_rate": 9.941150979237078e-06, "loss": 0.5836, "step": 1881 }, { "epoch": 0.15506303040290023, "grad_norm": 9.280745802972426, "learning_rate": 9.941048879114025e-06, "loss": 0.7968, "step": 1882 }, { "epoch": 0.15514542308642992, "grad_norm": 4.957929728130742, "learning_rate": 9.940946691023625e-06, "loss": 0.4156, "step": 1883 }, { "epoch": 0.15522781576995962, "grad_norm": 5.612906720964241, "learning_rate": 9.940844414967697e-06, "loss": 0.5885, "step": 1884 }, { "epoch": 0.15531020845348933, "grad_norm": 4.208810615796374, "learning_rate": 9.940742050948057e-06, "loss": 0.3961, "step": 1885 }, { "epoch": 0.15539260113701903, "grad_norm": 23.92972768953157, "learning_rate": 9.94063959896653e-06, "loss": 0.6231, "step": 1886 }, { "epoch": 0.15547499382054875, "grad_norm": 5.76934083591682, "learning_rate": 9.940537059024942e-06, "loss": 0.4364, "step": 1887 }, { "epoch": 0.15555738650407844, "grad_norm": 5.707451138870683, "learning_rate": 9.940434431125117e-06, "loss": 0.5047, "step": 1888 }, { "epoch": 0.15563977918760813, "grad_norm": 5.776218724756825, "learning_rate": 9.940331715268883e-06, "loss": 0.5968, "step": 1889 }, { "epoch": 0.15572217187113785, "grad_norm": 4.954848575236138, "learning_rate": 9.940228911458065e-06, "loss": 0.5645, "step": 1890 }, { "epoch": 0.15580456455466754, "grad_norm": 4.99108420622021, "learning_rate": 9.940126019694498e-06, "loss": 0.6215, "step": 1891 }, { "epoch": 0.15588695723819726, "grad_norm": 4.922687738818404, "learning_rate": 9.940023039980012e-06, "loss": 0.4969, "step": 1892 }, { "epoch": 0.15596934992172695, "grad_norm": 6.857403736754451, "learning_rate": 9.939919972316437e-06, "loss": 0.7445, "step": 1893 }, { "epoch": 0.15605174260525664, "grad_norm": 5.0582162606668115, "learning_rate": 9.939816816705615e-06, "loss": 0.4787, "step": 1894 }, { "epoch": 0.15613413528878636, "grad_norm": 4.756494625315935, "learning_rate": 9.939713573149377e-06, "loss": 0.6097, "step": 1895 }, { "epoch": 0.15621652797231605, "grad_norm": 6.419763789825375, "learning_rate": 9.939610241649561e-06, "loss": 0.4783, "step": 1896 }, { "epoch": 0.15629892065584577, "grad_norm": 4.5978040753369935, "learning_rate": 9.93950682220801e-06, "loss": 0.4438, "step": 1897 }, { "epoch": 0.15638131333937547, "grad_norm": 4.462028232430049, "learning_rate": 9.939403314826563e-06, "loss": 0.3942, "step": 1898 }, { "epoch": 0.15646370602290516, "grad_norm": 5.337997568156949, "learning_rate": 9.939299719507065e-06, "loss": 0.5616, "step": 1899 }, { "epoch": 0.15654609870643488, "grad_norm": 3.60824355878785, "learning_rate": 9.939196036251357e-06, "loss": 0.4209, "step": 1900 }, { "epoch": 0.15662849138996457, "grad_norm": 3.9143893571730146, "learning_rate": 9.939092265061288e-06, "loss": 0.3278, "step": 1901 }, { "epoch": 0.15671088407349426, "grad_norm": 4.439793896511223, "learning_rate": 9.938988405938703e-06, "loss": 0.5322, "step": 1902 }, { "epoch": 0.15679327675702398, "grad_norm": 4.36845403354661, "learning_rate": 9.938884458885454e-06, "loss": 0.5716, "step": 1903 }, { "epoch": 0.15687566944055367, "grad_norm": 6.284830972083645, "learning_rate": 9.938780423903387e-06, "loss": 0.6454, "step": 1904 }, { "epoch": 0.1569580621240834, "grad_norm": 6.495990019493336, "learning_rate": 9.938676300994358e-06, "loss": 0.5363, "step": 1905 }, { "epoch": 0.15704045480761308, "grad_norm": 5.215129230889581, "learning_rate": 9.938572090160222e-06, "loss": 0.3534, "step": 1906 }, { "epoch": 0.15712284749114278, "grad_norm": 5.0984361516420735, "learning_rate": 9.938467791402828e-06, "loss": 0.3469, "step": 1907 }, { "epoch": 0.1572052401746725, "grad_norm": 35.6236332120135, "learning_rate": 9.938363404724038e-06, "loss": 1.4843, "step": 1908 }, { "epoch": 0.1572876328582022, "grad_norm": 7.458138808494354, "learning_rate": 9.93825893012571e-06, "loss": 0.6091, "step": 1909 }, { "epoch": 0.1573700255417319, "grad_norm": 6.398287378682458, "learning_rate": 9.938154367609705e-06, "loss": 0.5134, "step": 1910 }, { "epoch": 0.1574524182252616, "grad_norm": 3.5343424335631433, "learning_rate": 9.93804971717788e-06, "loss": 0.286, "step": 1911 }, { "epoch": 0.1575348109087913, "grad_norm": 4.504459000128934, "learning_rate": 9.937944978832103e-06, "loss": 0.4624, "step": 1912 }, { "epoch": 0.157617203592321, "grad_norm": 7.108577599603217, "learning_rate": 9.937840152574235e-06, "loss": 0.7483, "step": 1913 }, { "epoch": 0.1576995962758507, "grad_norm": 3.6174046857082898, "learning_rate": 9.937735238406146e-06, "loss": 0.4525, "step": 1914 }, { "epoch": 0.15778198895938042, "grad_norm": 6.389978345222535, "learning_rate": 9.9376302363297e-06, "loss": 0.7896, "step": 1915 }, { "epoch": 0.1578643816429101, "grad_norm": 6.548196848276821, "learning_rate": 9.937525146346767e-06, "loss": 0.6367, "step": 1916 }, { "epoch": 0.1579467743264398, "grad_norm": 3.655411742497471, "learning_rate": 9.937419968459221e-06, "loss": 0.5116, "step": 1917 }, { "epoch": 0.15802916700996952, "grad_norm": 4.929959679569643, "learning_rate": 9.937314702668933e-06, "loss": 0.4611, "step": 1918 }, { "epoch": 0.15811155969349922, "grad_norm": 6.432318019871717, "learning_rate": 9.937209348977776e-06, "loss": 0.5361, "step": 1919 }, { "epoch": 0.1581939523770289, "grad_norm": 4.031201714847671, "learning_rate": 9.937103907387626e-06, "loss": 0.3384, "step": 1920 }, { "epoch": 0.15827634506055863, "grad_norm": 3.9206730902636253, "learning_rate": 9.936998377900362e-06, "loss": 0.5143, "step": 1921 }, { "epoch": 0.15835873774408832, "grad_norm": 5.579814626803759, "learning_rate": 9.93689276051786e-06, "loss": 0.6045, "step": 1922 }, { "epoch": 0.15844113042761804, "grad_norm": 3.917683732272094, "learning_rate": 9.936787055242002e-06, "loss": 0.2376, "step": 1923 }, { "epoch": 0.15852352311114773, "grad_norm": 4.78423472401811, "learning_rate": 9.93668126207467e-06, "loss": 0.4524, "step": 1924 }, { "epoch": 0.15860591579467742, "grad_norm": 3.2708331638186716, "learning_rate": 9.936575381017746e-06, "loss": 0.3629, "step": 1925 }, { "epoch": 0.15868830847820714, "grad_norm": 23.158576456528266, "learning_rate": 9.936469412073117e-06, "loss": 0.4387, "step": 1926 }, { "epoch": 0.15877070116173683, "grad_norm": 5.907761415620391, "learning_rate": 9.936363355242668e-06, "loss": 0.5724, "step": 1927 }, { "epoch": 0.15885309384526655, "grad_norm": 4.421376470888521, "learning_rate": 9.93625721052829e-06, "loss": 0.6202, "step": 1928 }, { "epoch": 0.15893548652879624, "grad_norm": 5.122603282277841, "learning_rate": 9.936150977931869e-06, "loss": 0.4856, "step": 1929 }, { "epoch": 0.15901787921232594, "grad_norm": 5.697392483109681, "learning_rate": 9.936044657455298e-06, "loss": 0.7097, "step": 1930 }, { "epoch": 0.15910027189585565, "grad_norm": 8.995800779777136, "learning_rate": 9.93593824910047e-06, "loss": 0.7263, "step": 1931 }, { "epoch": 0.15918266457938535, "grad_norm": 3.015815073457665, "learning_rate": 9.935831752869278e-06, "loss": 0.4475, "step": 1932 }, { "epoch": 0.15926505726291507, "grad_norm": 5.3090549745473075, "learning_rate": 9.93572516876362e-06, "loss": 0.5545, "step": 1933 }, { "epoch": 0.15934744994644476, "grad_norm": 12.995406107077027, "learning_rate": 9.935618496785396e-06, "loss": 0.1749, "step": 1934 }, { "epoch": 0.15942984262997445, "grad_norm": 4.825072935595757, "learning_rate": 9.935511736936498e-06, "loss": 0.4887, "step": 1935 }, { "epoch": 0.15951223531350417, "grad_norm": 4.746919626773541, "learning_rate": 9.935404889218831e-06, "loss": 0.5112, "step": 1936 }, { "epoch": 0.15959462799703386, "grad_norm": 6.689116278696351, "learning_rate": 9.935297953634298e-06, "loss": 0.7103, "step": 1937 }, { "epoch": 0.15967702068056355, "grad_norm": 6.502278304742261, "learning_rate": 9.935190930184802e-06, "loss": 0.7145, "step": 1938 }, { "epoch": 0.15975941336409327, "grad_norm": 5.451164256483429, "learning_rate": 9.935083818872247e-06, "loss": 0.4737, "step": 1939 }, { "epoch": 0.15984180604762296, "grad_norm": 3.0051587922433067, "learning_rate": 9.93497661969854e-06, "loss": 0.2742, "step": 1940 }, { "epoch": 0.15992419873115268, "grad_norm": 5.427591565725511, "learning_rate": 9.934869332665592e-06, "loss": 0.5207, "step": 1941 }, { "epoch": 0.16000659141468238, "grad_norm": 5.4367848489833115, "learning_rate": 9.934761957775312e-06, "loss": 0.5983, "step": 1942 }, { "epoch": 0.16008898409821207, "grad_norm": 7.154739577060193, "learning_rate": 9.93465449502961e-06, "loss": 0.6409, "step": 1943 }, { "epoch": 0.1601713767817418, "grad_norm": 5.038406869203767, "learning_rate": 9.934546944430402e-06, "loss": 0.4853, "step": 1944 }, { "epoch": 0.16025376946527148, "grad_norm": 5.650856613574145, "learning_rate": 9.934439305979598e-06, "loss": 0.6453, "step": 1945 }, { "epoch": 0.1603361621488012, "grad_norm": 15.099928104720231, "learning_rate": 9.934331579679119e-06, "loss": 0.7712, "step": 1946 }, { "epoch": 0.1604185548323309, "grad_norm": 5.331849451324917, "learning_rate": 9.934223765530883e-06, "loss": 0.5346, "step": 1947 }, { "epoch": 0.16050094751586058, "grad_norm": 6.11974330303117, "learning_rate": 9.934115863536806e-06, "loss": 0.6636, "step": 1948 }, { "epoch": 0.1605833401993903, "grad_norm": 5.259099308839429, "learning_rate": 9.934007873698813e-06, "loss": 0.4581, "step": 1949 }, { "epoch": 0.16066573288292, "grad_norm": 4.763348066034409, "learning_rate": 9.933899796018821e-06, "loss": 0.7946, "step": 1950 }, { "epoch": 0.1607481255664497, "grad_norm": 4.328616169486818, "learning_rate": 9.933791630498761e-06, "loss": 0.5788, "step": 1951 }, { "epoch": 0.1608305182499794, "grad_norm": 5.067172957788821, "learning_rate": 9.933683377140552e-06, "loss": 0.6345, "step": 1952 }, { "epoch": 0.1609129109335091, "grad_norm": 4.119522578831503, "learning_rate": 9.933575035946128e-06, "loss": 0.4333, "step": 1953 }, { "epoch": 0.16099530361703882, "grad_norm": 4.532665725421111, "learning_rate": 9.933466606917412e-06, "loss": 0.4846, "step": 1954 }, { "epoch": 0.1610776963005685, "grad_norm": 6.677989132375096, "learning_rate": 9.933358090056337e-06, "loss": 0.6531, "step": 1955 }, { "epoch": 0.1611600889840982, "grad_norm": 5.737777079771404, "learning_rate": 9.933249485364836e-06, "loss": 0.551, "step": 1956 }, { "epoch": 0.16124248166762792, "grad_norm": 5.303362303054575, "learning_rate": 9.93314079284484e-06, "loss": 0.5989, "step": 1957 }, { "epoch": 0.1613248743511576, "grad_norm": 4.539301099456485, "learning_rate": 9.933032012498287e-06, "loss": 0.5385, "step": 1958 }, { "epoch": 0.16140726703468733, "grad_norm": 7.619380498183769, "learning_rate": 9.932923144327112e-06, "loss": 0.5917, "step": 1959 }, { "epoch": 0.16148965971821702, "grad_norm": 6.2711200766201225, "learning_rate": 9.932814188333252e-06, "loss": 0.6814, "step": 1960 }, { "epoch": 0.1615720524017467, "grad_norm": 4.925280195591163, "learning_rate": 9.932705144518648e-06, "loss": 0.3257, "step": 1961 }, { "epoch": 0.16165444508527643, "grad_norm": 4.900465005853736, "learning_rate": 9.932596012885243e-06, "loss": 0.6121, "step": 1962 }, { "epoch": 0.16173683776880612, "grad_norm": 5.597180202998693, "learning_rate": 9.932486793434976e-06, "loss": 0.4417, "step": 1963 }, { "epoch": 0.16181923045233584, "grad_norm": 3.786036286281869, "learning_rate": 9.932377486169795e-06, "loss": 0.4473, "step": 1964 }, { "epoch": 0.16190162313586554, "grad_norm": 5.333467663504172, "learning_rate": 9.932268091091647e-06, "loss": 0.5273, "step": 1965 }, { "epoch": 0.16198401581939523, "grad_norm": 3.114851131702418, "learning_rate": 9.932158608202473e-06, "loss": 0.2613, "step": 1966 }, { "epoch": 0.16206640850292495, "grad_norm": 4.933213429654218, "learning_rate": 9.932049037504228e-06, "loss": 0.5545, "step": 1967 }, { "epoch": 0.16214880118645464, "grad_norm": 5.732455463753953, "learning_rate": 9.931939378998862e-06, "loss": 0.5004, "step": 1968 }, { "epoch": 0.16223119386998436, "grad_norm": 3.3159988830987936, "learning_rate": 9.931829632688327e-06, "loss": 0.2362, "step": 1969 }, { "epoch": 0.16231358655351405, "grad_norm": 6.801154604066474, "learning_rate": 9.931719798574577e-06, "loss": 0.725, "step": 1970 }, { "epoch": 0.16239597923704374, "grad_norm": 28.52359146325827, "learning_rate": 9.931609876659567e-06, "loss": 0.8996, "step": 1971 }, { "epoch": 0.16247837192057346, "grad_norm": 7.673010563827642, "learning_rate": 9.931499866945254e-06, "loss": 0.6011, "step": 1972 }, { "epoch": 0.16256076460410315, "grad_norm": 2.9164586276644355, "learning_rate": 9.931389769433595e-06, "loss": 0.257, "step": 1973 }, { "epoch": 0.16264315728763284, "grad_norm": 3.4481637153537736, "learning_rate": 9.931279584126552e-06, "loss": 0.2657, "step": 1974 }, { "epoch": 0.16272554997116256, "grad_norm": 5.503134114237322, "learning_rate": 9.931169311026086e-06, "loss": 0.478, "step": 1975 }, { "epoch": 0.16280794265469226, "grad_norm": 7.31936675050305, "learning_rate": 9.93105895013416e-06, "loss": 0.7626, "step": 1976 }, { "epoch": 0.16289033533822198, "grad_norm": 3.325018882378549, "learning_rate": 9.930948501452739e-06, "loss": 0.2196, "step": 1977 }, { "epoch": 0.16297272802175167, "grad_norm": 4.015246823982774, "learning_rate": 9.93083796498379e-06, "loss": 0.3479, "step": 1978 }, { "epoch": 0.16305512070528136, "grad_norm": 4.097019784111488, "learning_rate": 9.930727340729283e-06, "loss": 0.3443, "step": 1979 }, { "epoch": 0.16313751338881108, "grad_norm": 4.548498573960237, "learning_rate": 9.930616628691182e-06, "loss": 0.4211, "step": 1980 }, { "epoch": 0.16321990607234077, "grad_norm": 6.9065001345198045, "learning_rate": 9.930505828871461e-06, "loss": 0.6502, "step": 1981 }, { "epoch": 0.1633022987558705, "grad_norm": 4.716154264543286, "learning_rate": 9.930394941272094e-06, "loss": 0.6323, "step": 1982 }, { "epoch": 0.16338469143940018, "grad_norm": 5.198255317143385, "learning_rate": 9.930283965895054e-06, "loss": 0.5158, "step": 1983 }, { "epoch": 0.16346708412292987, "grad_norm": 3.8635971124237267, "learning_rate": 9.930172902742316e-06, "loss": 0.393, "step": 1984 }, { "epoch": 0.1635494768064596, "grad_norm": 4.73938319426752, "learning_rate": 9.930061751815858e-06, "loss": 0.53, "step": 1985 }, { "epoch": 0.16363186948998928, "grad_norm": 5.053290616839663, "learning_rate": 9.929950513117658e-06, "loss": 0.6883, "step": 1986 }, { "epoch": 0.163714262173519, "grad_norm": 4.256070865568776, "learning_rate": 9.929839186649698e-06, "loss": 0.4755, "step": 1987 }, { "epoch": 0.1637966548570487, "grad_norm": 5.726661518698401, "learning_rate": 9.929727772413959e-06, "loss": 0.6225, "step": 1988 }, { "epoch": 0.1638790475405784, "grad_norm": 6.136396113547857, "learning_rate": 9.929616270412425e-06, "loss": 0.5515, "step": 1989 }, { "epoch": 0.1639614402241081, "grad_norm": 5.803352132593352, "learning_rate": 9.92950468064708e-06, "loss": 0.6151, "step": 1990 }, { "epoch": 0.1640438329076378, "grad_norm": 5.500294699026293, "learning_rate": 9.929393003119911e-06, "loss": 0.645, "step": 1991 }, { "epoch": 0.1641262255911675, "grad_norm": 5.126741967427504, "learning_rate": 9.929281237832909e-06, "loss": 0.4252, "step": 1992 }, { "epoch": 0.1642086182746972, "grad_norm": 5.715611583303511, "learning_rate": 9.92916938478806e-06, "loss": 0.6065, "step": 1993 }, { "epoch": 0.1642910109582269, "grad_norm": 4.5030944875150505, "learning_rate": 9.929057443987356e-06, "loss": 0.4517, "step": 1994 }, { "epoch": 0.16437340364175662, "grad_norm": 4.036448430622849, "learning_rate": 9.928945415432792e-06, "loss": 0.3826, "step": 1995 }, { "epoch": 0.1644557963252863, "grad_norm": 4.77432743749195, "learning_rate": 9.92883329912636e-06, "loss": 0.4729, "step": 1996 }, { "epoch": 0.164538189008816, "grad_norm": 4.24213889175515, "learning_rate": 9.92872109507006e-06, "loss": 0.6217, "step": 1997 }, { "epoch": 0.16462058169234572, "grad_norm": 6.054404874550711, "learning_rate": 9.928608803265884e-06, "loss": 0.5828, "step": 1998 }, { "epoch": 0.16470297437587542, "grad_norm": 5.412374428435805, "learning_rate": 9.928496423715835e-06, "loss": 0.6126, "step": 1999 }, { "epoch": 0.16478536705940514, "grad_norm": 6.090978296501382, "learning_rate": 9.928383956421914e-06, "loss": 0.6151, "step": 2000 } ], "logging_steps": 1.0, "max_steps": 24274, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 7648832073984.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }