{ "best_metric": 0.42018274218503854, "best_model_checkpoint": "train/Large-20241114-Compress:4x-Lr:5e-5-Llama3-8B-instruct-GPT2-Large-RAG-no-ft_token-onlySquad-everymem/checkpoint-1600", "epoch": 1.1793974016400994, "eval_steps": 800, "global_step": 3200, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0003685616880125311, "grad_norm": 27.948738039629134, "learning_rate": 0.0, "loss": 3.7247, "step": 1 }, { "epoch": 0.0007371233760250622, "grad_norm": 17.519613555408675, "learning_rate": 7.525749891599529e-06, "loss": 3.4389, "step": 2 }, { "epoch": 0.0011056850640375933, "grad_norm": 15.666191797296735, "learning_rate": 1.192803136799156e-05, "loss": 3.3938, "step": 3 }, { "epoch": 0.0014742467520501245, "grad_norm": 19.98822743116167, "learning_rate": 1.5051499783199057e-05, "loss": 3.7241, "step": 4 }, { "epoch": 0.0018428084400626554, "grad_norm": 19.036306927692475, "learning_rate": 1.7474250108400467e-05, "loss": 3.3114, "step": 5 }, { "epoch": 0.0022113701280751866, "grad_norm": 15.896471776510857, "learning_rate": 1.945378125959109e-05, "loss": 2.6316, "step": 6 }, { "epoch": 0.0025799318160877175, "grad_norm": 20.1539803106801, "learning_rate": 2.1127451000356418e-05, "loss": 2.5167, "step": 7 }, { "epoch": 0.002948493504100249, "grad_norm": 13.662156143817375, "learning_rate": 2.2577249674798584e-05, "loss": 2.2412, "step": 8 }, { "epoch": 0.00331705519211278, "grad_norm": 16.472188919888577, "learning_rate": 2.385606273598312e-05, "loss": 2.0667, "step": 9 }, { "epoch": 0.003685616880125311, "grad_norm": 15.49433746270563, "learning_rate": 2.4999999999999998e-05, "loss": 1.8683, "step": 10 }, { "epoch": 0.004054178568137842, "grad_norm": 21.158742467653884, "learning_rate": 2.6034817128955623e-05, "loss": 1.8046, "step": 11 }, { "epoch": 0.004422740256150373, "grad_norm": 17.8117581397049, "learning_rate": 2.6979531151190617e-05, "loss": 1.5318, "step": 12 }, { "epoch": 0.0047913019441629045, "grad_norm": 26.731893240302632, "learning_rate": 2.7848583807670913e-05, "loss": 1.5218, "step": 13 }, { "epoch": 0.005159863632175435, "grad_norm": 23.856312733248668, "learning_rate": 2.8653200891955945e-05, "loss": 1.3567, "step": 14 }, { "epoch": 0.005528425320187966, "grad_norm": 29.02499259797494, "learning_rate": 2.940228147639203e-05, "loss": 1.0629, "step": 15 }, { "epoch": 0.005896987008200498, "grad_norm": 20.526495953591603, "learning_rate": 3.0102999566398115e-05, "loss": 1.1201, "step": 16 }, { "epoch": 0.006265548696213028, "grad_norm": 14.915585839239919, "learning_rate": 3.076122303445685e-05, "loss": 0.8752, "step": 17 }, { "epoch": 0.00663411038422556, "grad_norm": 22.283790561243027, "learning_rate": 3.1381812627582646e-05, "loss": 1.3198, "step": 18 }, { "epoch": 0.007002672072238091, "grad_norm": 14.672867128995486, "learning_rate": 3.1968840023820715e-05, "loss": 0.8266, "step": 19 }, { "epoch": 0.007371233760250622, "grad_norm": 16.654490995299135, "learning_rate": 3.2525749891599525e-05, "loss": 1.0334, "step": 20 }, { "epoch": 0.007739795448263153, "grad_norm": 15.999422062926406, "learning_rate": 3.305548236834798e-05, "loss": 0.5562, "step": 21 }, { "epoch": 0.008108357136275684, "grad_norm": 15.964635867713108, "learning_rate": 3.3560567020555153e-05, "loss": 0.7637, "step": 22 }, { "epoch": 0.008476918824288216, "grad_norm": 16.97014015081435, "learning_rate": 3.404319590043982e-05, "loss": 1.1195, "step": 23 }, { "epoch": 0.008845480512300746, "grad_norm": 17.64519928652913, "learning_rate": 3.450528104279015e-05, "loss": 0.6925, "step": 24 }, { "epoch": 0.009214042200313277, "grad_norm": 15.448573291668291, "learning_rate": 3.4948500216800935e-05, "loss": 0.6815, "step": 25 }, { "epoch": 0.009582603888325809, "grad_norm": 12.78556679947311, "learning_rate": 3.537433369927044e-05, "loss": 0.8205, "step": 26 }, { "epoch": 0.00995116557633834, "grad_norm": 15.02961350888846, "learning_rate": 3.578409410397468e-05, "loss": 0.6697, "step": 27 }, { "epoch": 0.01031972726435087, "grad_norm": 15.316659790524561, "learning_rate": 3.6178950783555475e-05, "loss": 0.8106, "step": 28 }, { "epoch": 0.010688288952363402, "grad_norm": 12.552058345194698, "learning_rate": 3.65599499474739e-05, "loss": 0.7277, "step": 29 }, { "epoch": 0.011056850640375933, "grad_norm": 11.860440936327011, "learning_rate": 3.6928031367991554e-05, "loss": 0.6658, "step": 30 }, { "epoch": 0.011425412328388463, "grad_norm": 12.554919028862315, "learning_rate": 3.728404234585681e-05, "loss": 0.6462, "step": 31 }, { "epoch": 0.011793974016400996, "grad_norm": 8.839541670711338, "learning_rate": 3.762874945799765e-05, "loss": 0.5799, "step": 32 }, { "epoch": 0.012162535704413526, "grad_norm": 14.202913893336234, "learning_rate": 3.796284849694718e-05, "loss": 0.8663, "step": 33 }, { "epoch": 0.012531097392426057, "grad_norm": 14.23315236842399, "learning_rate": 3.8286972926056376e-05, "loss": 0.6258, "step": 34 }, { "epoch": 0.012899659080438589, "grad_norm": 12.814364265095428, "learning_rate": 3.8601701108756885e-05, "loss": 0.4986, "step": 35 }, { "epoch": 0.01326822076845112, "grad_norm": 30.054183856580703, "learning_rate": 3.890756251918218e-05, "loss": 0.6946, "step": 36 }, { "epoch": 0.01363678245646365, "grad_norm": 10.204704127417168, "learning_rate": 3.920504310167487e-05, "loss": 0.6491, "step": 37 }, { "epoch": 0.014005344144476182, "grad_norm": 14.155057810588191, "learning_rate": 3.949458991542025e-05, "loss": 0.5924, "step": 38 }, { "epoch": 0.014373905832488713, "grad_norm": 21.336367311901043, "learning_rate": 3.977661517566247e-05, "loss": 0.6291, "step": 39 }, { "epoch": 0.014742467520501243, "grad_norm": 14.135740264734325, "learning_rate": 4.005149978319905e-05, "loss": 0.5519, "step": 40 }, { "epoch": 0.015111029208513775, "grad_norm": 14.580541608716585, "learning_rate": 4.031959641799338e-05, "loss": 0.8142, "step": 41 }, { "epoch": 0.015479590896526306, "grad_norm": 9.967967800914995, "learning_rate": 4.058123225994751e-05, "loss": 0.4958, "step": 42 }, { "epoch": 0.015848152584538838, "grad_norm": 12.607918542937671, "learning_rate": 4.0836711389489654e-05, "loss": 0.4715, "step": 43 }, { "epoch": 0.016216714272551367, "grad_norm": 13.37479179434178, "learning_rate": 4.108631691215468e-05, "loss": 0.4599, "step": 44 }, { "epoch": 0.0165852759605639, "grad_norm": 45.467998275572896, "learning_rate": 4.133031284438358e-05, "loss": 0.4743, "step": 45 }, { "epoch": 0.01695383764857643, "grad_norm": 14.726611198526907, "learning_rate": 4.156894579203935e-05, "loss": 0.6099, "step": 46 }, { "epoch": 0.01732239933658896, "grad_norm": 12.265739450102977, "learning_rate": 4.180244644839293e-05, "loss": 0.5097, "step": 47 }, { "epoch": 0.017690961024601493, "grad_norm": 11.048328697889177, "learning_rate": 4.203103093438968e-05, "loss": 0.4442, "step": 48 }, { "epoch": 0.018059522712614025, "grad_norm": 16.454042182709713, "learning_rate": 4.2254902000712836e-05, "loss": 0.4892, "step": 49 }, { "epoch": 0.018428084400626554, "grad_norm": 10.872252369312958, "learning_rate": 4.247425010840046e-05, "loss": 0.3892, "step": 50 }, { "epoch": 0.018796646088639086, "grad_norm": 12.158687396229034, "learning_rate": 4.2689254402448405e-05, "loss": 0.6186, "step": 51 }, { "epoch": 0.019165207776651618, "grad_norm": 8.900599564491253, "learning_rate": 4.290008359086998e-05, "loss": 0.5846, "step": 52 }, { "epoch": 0.019533769464664147, "grad_norm": 7.595825414048372, "learning_rate": 4.310689674001973e-05, "loss": 0.4128, "step": 53 }, { "epoch": 0.01990233115267668, "grad_norm": 13.459186034850068, "learning_rate": 4.330984399557421e-05, "loss": 0.5021, "step": 54 }, { "epoch": 0.02027089284068921, "grad_norm": 8.57652704193686, "learning_rate": 4.350906723735609e-05, "loss": 0.4033, "step": 55 }, { "epoch": 0.02063945452870174, "grad_norm": 15.10185703201529, "learning_rate": 4.370470067515501e-05, "loss": 0.5173, "step": 56 }, { "epoch": 0.021008016216714272, "grad_norm": 11.900262884233575, "learning_rate": 4.3896871391812285e-05, "loss": 0.5803, "step": 57 }, { "epoch": 0.021376577904726805, "grad_norm": 5.2035313353091235, "learning_rate": 4.408569983907343e-05, "loss": 0.2929, "step": 58 }, { "epoch": 0.021745139592739333, "grad_norm": 10.517546751636123, "learning_rate": 4.42713002910536e-05, "loss": 0.4812, "step": 59 }, { "epoch": 0.022113701280751866, "grad_norm": 6.954578170654083, "learning_rate": 4.445378125959108e-05, "loss": 0.4275, "step": 60 }, { "epoch": 0.022482262968764398, "grad_norm": 11.44200704188231, "learning_rate": 4.463324587526917e-05, "loss": 0.7245, "step": 61 }, { "epoch": 0.022850824656776927, "grad_norm": 10.397192326602754, "learning_rate": 4.4809792237456346e-05, "loss": 0.4844, "step": 62 }, { "epoch": 0.02321938634478946, "grad_norm": 10.040097526897306, "learning_rate": 4.498351373633954e-05, "loss": 0.5781, "step": 63 }, { "epoch": 0.02358794803280199, "grad_norm": 12.19879534667896, "learning_rate": 4.515449934959717e-05, "loss": 0.5337, "step": 64 }, { "epoch": 0.02395650972081452, "grad_norm": 6.792453494061931, "learning_rate": 4.532283391607138e-05, "loss": 0.4389, "step": 65 }, { "epoch": 0.024325071408827052, "grad_norm": 10.116501997770694, "learning_rate": 4.548859838854671e-05, "loss": 0.5591, "step": 66 }, { "epoch": 0.024693633096839585, "grad_norm": 7.097698709112133, "learning_rate": 4.565187006752065e-05, "loss": 0.4178, "step": 67 }, { "epoch": 0.025062194784852113, "grad_norm": 12.872464717387677, "learning_rate": 4.581272281765591e-05, "loss": 0.354, "step": 68 }, { "epoch": 0.025430756472864646, "grad_norm": 10.990804817088863, "learning_rate": 4.597122726843138e-05, "loss": 0.466, "step": 69 }, { "epoch": 0.025799318160877178, "grad_norm": 12.295250597521767, "learning_rate": 4.612745100035642e-05, "loss": 0.4974, "step": 70 }, { "epoch": 0.026167879848889707, "grad_norm": 7.800462161335052, "learning_rate": 4.628145871797688e-05, "loss": 0.4654, "step": 71 }, { "epoch": 0.02653644153690224, "grad_norm": 15.295209067226125, "learning_rate": 4.643331241078171e-05, "loss": 0.7217, "step": 72 }, { "epoch": 0.02690500322491477, "grad_norm": 8.680326944689309, "learning_rate": 4.658307150301139e-05, "loss": 0.5634, "step": 73 }, { "epoch": 0.0272735649129273, "grad_norm": 11.927762358991654, "learning_rate": 4.67307929932744e-05, "loss": 0.5815, "step": 74 }, { "epoch": 0.027642126600939832, "grad_norm": 22.143992058283974, "learning_rate": 4.687653158479249e-05, "loss": 0.7231, "step": 75 }, { "epoch": 0.028010688288952364, "grad_norm": 8.331351794846512, "learning_rate": 4.702033980701978e-05, "loss": 0.793, "step": 76 }, { "epoch": 0.028379249976964893, "grad_norm": 9.717188081006777, "learning_rate": 4.716226812931204e-05, "loss": 0.4496, "step": 77 }, { "epoch": 0.028747811664977425, "grad_norm": 17.698189784010662, "learning_rate": 4.7302365067262006e-05, "loss": 0.6084, "step": 78 }, { "epoch": 0.029116373352989958, "grad_norm": 7.1873593523942585, "learning_rate": 4.744067728226103e-05, "loss": 0.4758, "step": 79 }, { "epoch": 0.029484935041002486, "grad_norm": 18.076772785966767, "learning_rate": 4.757724967479858e-05, "loss": 0.4961, "step": 80 }, { "epoch": 0.02985349672901502, "grad_norm": 6.354235772762872, "learning_rate": 4.771212547196624e-05, "loss": 0.3993, "step": 81 }, { "epoch": 0.03022205841702755, "grad_norm": 31.262533619346677, "learning_rate": 4.7845346309592914e-05, "loss": 0.5614, "step": 82 }, { "epoch": 0.03059062010504008, "grad_norm": 18.12664661162711, "learning_rate": 4.7976952309401844e-05, "loss": 0.6124, "step": 83 }, { "epoch": 0.030959181793052612, "grad_norm": 7.221339698407417, "learning_rate": 4.810698215154703e-05, "loss": 0.4128, "step": 84 }, { "epoch": 0.031327743481065144, "grad_norm": 8.984292628698485, "learning_rate": 4.823547314285732e-05, "loss": 0.2786, "step": 85 }, { "epoch": 0.031696305169077676, "grad_norm": 8.186817402813606, "learning_rate": 4.836246128108918e-05, "loss": 0.4388, "step": 86 }, { "epoch": 0.03206486685709021, "grad_norm": 12.434242320305573, "learning_rate": 4.8487981315465456e-05, "loss": 0.6646, "step": 87 }, { "epoch": 0.032433428545102734, "grad_norm": 20.99632085632988, "learning_rate": 4.8612066803754214e-05, "loss": 0.5131, "step": 88 }, { "epoch": 0.032801990233115266, "grad_norm": 16.817010097317574, "learning_rate": 4.873475016612281e-05, "loss": 0.6371, "step": 89 }, { "epoch": 0.0331705519211278, "grad_norm": 9.619334745820442, "learning_rate": 4.885606273598312e-05, "loss": 0.6461, "step": 90 }, { "epoch": 0.03353911360914033, "grad_norm": 25.433401466501685, "learning_rate": 4.897603480802733e-05, "loss": 0.4388, "step": 91 }, { "epoch": 0.03390767529715286, "grad_norm": 7.669876366604869, "learning_rate": 4.909469568363888e-05, "loss": 0.434, "step": 92 }, { "epoch": 0.034276236985165395, "grad_norm": 24.346526588685986, "learning_rate": 4.9212073713848375e-05, "loss": 0.3527, "step": 93 }, { "epoch": 0.03464479867317792, "grad_norm": 7.526962135987003, "learning_rate": 4.932819633999246e-05, "loss": 0.3962, "step": 94 }, { "epoch": 0.03501336036119045, "grad_norm": 15.779272578773535, "learning_rate": 4.9443090132221186e-05, "loss": 0.6951, "step": 95 }, { "epoch": 0.035381922049202985, "grad_norm": 6.790601900415152, "learning_rate": 4.9556780825989205e-05, "loss": 0.5104, "step": 96 }, { "epoch": 0.03575048373721552, "grad_norm": 11.395688201891387, "learning_rate": 4.9669293356656114e-05, "loss": 0.5571, "step": 97 }, { "epoch": 0.03611904542522805, "grad_norm": 16.4100125893795, "learning_rate": 4.978065189231237e-05, "loss": 0.4919, "step": 98 }, { "epoch": 0.03648760711324058, "grad_norm": 9.807720988650265, "learning_rate": 4.989087986493874e-05, "loss": 0.4965, "step": 99 }, { "epoch": 0.03685616880125311, "grad_norm": 10.002190159330027, "learning_rate": 4.9999999999999996e-05, "loss": 0.3604, "step": 100 }, { "epoch": 0.03722473048926564, "grad_norm": 13.22116117378512, "learning_rate": 5e-05, "loss": 0.3852, "step": 101 }, { "epoch": 0.03759329217727817, "grad_norm": 8.312344886651927, "learning_rate": 4.9996909383112874e-05, "loss": 0.5638, "step": 102 }, { "epoch": 0.037961853865290704, "grad_norm": 7.593891158185195, "learning_rate": 4.9993818766225745e-05, "loss": 0.5339, "step": 103 }, { "epoch": 0.038330415553303236, "grad_norm": 12.628401194277636, "learning_rate": 4.999072814933861e-05, "loss": 0.5685, "step": 104 }, { "epoch": 0.03869897724131577, "grad_norm": 11.430108689052554, "learning_rate": 4.998763753245148e-05, "loss": 0.3969, "step": 105 }, { "epoch": 0.039067538929328294, "grad_norm": 15.663117568131991, "learning_rate": 4.998454691556435e-05, "loss": 0.2764, "step": 106 }, { "epoch": 0.039436100617340826, "grad_norm": 7.4267967850103345, "learning_rate": 4.998145629867722e-05, "loss": 0.456, "step": 107 }, { "epoch": 0.03980466230535336, "grad_norm": 14.217306009266762, "learning_rate": 4.997836568179009e-05, "loss": 0.5439, "step": 108 }, { "epoch": 0.04017322399336589, "grad_norm": 19.49401528880282, "learning_rate": 4.997527506490295e-05, "loss": 0.431, "step": 109 }, { "epoch": 0.04054178568137842, "grad_norm": 11.663021789849653, "learning_rate": 4.997218444801582e-05, "loss": 0.3583, "step": 110 }, { "epoch": 0.040910347369390955, "grad_norm": 8.787951318646812, "learning_rate": 4.9969093831128694e-05, "loss": 0.4742, "step": 111 }, { "epoch": 0.04127890905740348, "grad_norm": 6.903235144230781, "learning_rate": 4.9966003214241565e-05, "loss": 0.4421, "step": 112 }, { "epoch": 0.04164747074541601, "grad_norm": 9.55979053283796, "learning_rate": 4.9962912597354436e-05, "loss": 0.2508, "step": 113 }, { "epoch": 0.042016032433428545, "grad_norm": 10.593674380952457, "learning_rate": 4.99598219804673e-05, "loss": 0.4406, "step": 114 }, { "epoch": 0.04238459412144108, "grad_norm": 9.472479863048502, "learning_rate": 4.995673136358017e-05, "loss": 0.398, "step": 115 }, { "epoch": 0.04275315580945361, "grad_norm": 6.026600997758325, "learning_rate": 4.995364074669304e-05, "loss": 0.2806, "step": 116 }, { "epoch": 0.04312171749746614, "grad_norm": 6.962987321821551, "learning_rate": 4.9950550129805914e-05, "loss": 0.313, "step": 117 }, { "epoch": 0.04349027918547867, "grad_norm": 19.772705932733228, "learning_rate": 4.994745951291878e-05, "loss": 0.5736, "step": 118 }, { "epoch": 0.0438588408734912, "grad_norm": 5.739557818882813, "learning_rate": 4.994436889603165e-05, "loss": 0.2939, "step": 119 }, { "epoch": 0.04422740256150373, "grad_norm": 6.782623011383633, "learning_rate": 4.994127827914452e-05, "loss": 0.3289, "step": 120 }, { "epoch": 0.044595964249516264, "grad_norm": 7.402452485184228, "learning_rate": 4.993818766225739e-05, "loss": 0.4179, "step": 121 }, { "epoch": 0.044964525937528796, "grad_norm": 13.012632834336408, "learning_rate": 4.9935097045370264e-05, "loss": 0.5513, "step": 122 }, { "epoch": 0.04533308762554133, "grad_norm": 8.579663970698249, "learning_rate": 4.993200642848313e-05, "loss": 0.4969, "step": 123 }, { "epoch": 0.04570164931355385, "grad_norm": 11.988966637910119, "learning_rate": 4.992891581159599e-05, "loss": 0.5163, "step": 124 }, { "epoch": 0.046070211001566386, "grad_norm": 8.87853976297645, "learning_rate": 4.9925825194708864e-05, "loss": 0.6245, "step": 125 }, { "epoch": 0.04643877268957892, "grad_norm": 15.306022426148656, "learning_rate": 4.9922734577821735e-05, "loss": 0.4901, "step": 126 }, { "epoch": 0.04680733437759145, "grad_norm": 12.510690618856426, "learning_rate": 4.9919643960934606e-05, "loss": 0.4835, "step": 127 }, { "epoch": 0.04717589606560398, "grad_norm": 8.664459632185578, "learning_rate": 4.991655334404747e-05, "loss": 0.481, "step": 128 }, { "epoch": 0.047544457753616515, "grad_norm": 11.624909431350826, "learning_rate": 4.991346272716034e-05, "loss": 0.4115, "step": 129 }, { "epoch": 0.04791301944162904, "grad_norm": 7.807504994992073, "learning_rate": 4.991037211027321e-05, "loss": 0.3954, "step": 130 }, { "epoch": 0.04828158112964157, "grad_norm": 8.368286174658259, "learning_rate": 4.9907281493386084e-05, "loss": 0.4141, "step": 131 }, { "epoch": 0.048650142817654105, "grad_norm": 16.49948396742778, "learning_rate": 4.9904190876498955e-05, "loss": 0.4592, "step": 132 }, { "epoch": 0.04901870450566664, "grad_norm": 8.587181438550937, "learning_rate": 4.990110025961182e-05, "loss": 0.5241, "step": 133 }, { "epoch": 0.04938726619367917, "grad_norm": 12.538541655849325, "learning_rate": 4.989800964272469e-05, "loss": 0.4291, "step": 134 }, { "epoch": 0.0497558278816917, "grad_norm": 10.521687226847668, "learning_rate": 4.989491902583756e-05, "loss": 0.6541, "step": 135 }, { "epoch": 0.05012438956970423, "grad_norm": 12.338764816119156, "learning_rate": 4.989182840895043e-05, "loss": 0.5435, "step": 136 }, { "epoch": 0.05049295125771676, "grad_norm": 9.374328182308794, "learning_rate": 4.98887377920633e-05, "loss": 0.4677, "step": 137 }, { "epoch": 0.05086151294572929, "grad_norm": 11.91729592393643, "learning_rate": 4.988564717517617e-05, "loss": 0.4061, "step": 138 }, { "epoch": 0.05123007463374182, "grad_norm": 15.062413892064558, "learning_rate": 4.988255655828903e-05, "loss": 0.8671, "step": 139 }, { "epoch": 0.051598636321754356, "grad_norm": 14.772472267350798, "learning_rate": 4.9879465941401904e-05, "loss": 0.5336, "step": 140 }, { "epoch": 0.05196719800976689, "grad_norm": 8.63337499934959, "learning_rate": 4.9876375324514776e-05, "loss": 0.4893, "step": 141 }, { "epoch": 0.05233575969777941, "grad_norm": 18.982394444013774, "learning_rate": 4.987328470762764e-05, "loss": 0.58, "step": 142 }, { "epoch": 0.052704321385791945, "grad_norm": 19.365302039878003, "learning_rate": 4.987019409074051e-05, "loss": 0.5329, "step": 143 }, { "epoch": 0.05307288307380448, "grad_norm": 31.188085149194997, "learning_rate": 4.986710347385338e-05, "loss": 0.4468, "step": 144 }, { "epoch": 0.05344144476181701, "grad_norm": 10.672673520866635, "learning_rate": 4.9864012856966254e-05, "loss": 0.4038, "step": 145 }, { "epoch": 0.05381000644982954, "grad_norm": 5.967042688778674, "learning_rate": 4.9860922240079125e-05, "loss": 0.4598, "step": 146 }, { "epoch": 0.054178568137842074, "grad_norm": 6.845009605116754, "learning_rate": 4.985783162319199e-05, "loss": 0.35, "step": 147 }, { "epoch": 0.0545471298258546, "grad_norm": 11.827519309335129, "learning_rate": 4.985474100630486e-05, "loss": 0.6299, "step": 148 }, { "epoch": 0.05491569151386713, "grad_norm": 12.874335707035954, "learning_rate": 4.985165038941773e-05, "loss": 0.459, "step": 149 }, { "epoch": 0.055284253201879664, "grad_norm": 8.146123570363564, "learning_rate": 4.98485597725306e-05, "loss": 0.4124, "step": 150 }, { "epoch": 0.055652814889892196, "grad_norm": 6.2530705348077795, "learning_rate": 4.9845469155643474e-05, "loss": 0.3146, "step": 151 }, { "epoch": 0.05602137657790473, "grad_norm": 8.952624348594705, "learning_rate": 4.984237853875634e-05, "loss": 0.3439, "step": 152 }, { "epoch": 0.05638993826591726, "grad_norm": 9.49358794003077, "learning_rate": 4.983928792186921e-05, "loss": 0.5186, "step": 153 }, { "epoch": 0.056758499953929786, "grad_norm": 6.909894025020335, "learning_rate": 4.9836197304982074e-05, "loss": 0.3225, "step": 154 }, { "epoch": 0.05712706164194232, "grad_norm": 7.433426454699197, "learning_rate": 4.9833106688094945e-05, "loss": 0.4663, "step": 155 }, { "epoch": 0.05749562332995485, "grad_norm": 18.872247962658356, "learning_rate": 4.9830016071207816e-05, "loss": 0.4048, "step": 156 }, { "epoch": 0.05786418501796738, "grad_norm": 6.464004028734519, "learning_rate": 4.982692545432068e-05, "loss": 0.3812, "step": 157 }, { "epoch": 0.058232746705979915, "grad_norm": 10.646753996712045, "learning_rate": 4.982383483743355e-05, "loss": 0.4131, "step": 158 }, { "epoch": 0.05860130839399245, "grad_norm": 7.305293518060021, "learning_rate": 4.982074422054642e-05, "loss": 0.4675, "step": 159 }, { "epoch": 0.05896987008200497, "grad_norm": 11.594191550511875, "learning_rate": 4.9817653603659294e-05, "loss": 0.585, "step": 160 }, { "epoch": 0.059338431770017505, "grad_norm": 7.316186481032598, "learning_rate": 4.981456298677216e-05, "loss": 0.3539, "step": 161 }, { "epoch": 0.05970699345803004, "grad_norm": 40.00804515041709, "learning_rate": 4.981147236988503e-05, "loss": 0.4738, "step": 162 }, { "epoch": 0.06007555514604257, "grad_norm": 11.610139926852465, "learning_rate": 4.98083817529979e-05, "loss": 0.4278, "step": 163 }, { "epoch": 0.0604441168340551, "grad_norm": 5.994817402837168, "learning_rate": 4.980529113611077e-05, "loss": 0.276, "step": 164 }, { "epoch": 0.060812678522067634, "grad_norm": 23.21037476177477, "learning_rate": 4.9802200519223643e-05, "loss": 0.3431, "step": 165 }, { "epoch": 0.06118124021008016, "grad_norm": 15.123658680355188, "learning_rate": 4.979910990233651e-05, "loss": 0.368, "step": 166 }, { "epoch": 0.06154980189809269, "grad_norm": 8.475502165824196, "learning_rate": 4.979601928544938e-05, "loss": 0.3048, "step": 167 }, { "epoch": 0.061918363586105224, "grad_norm": 7.875931457655253, "learning_rate": 4.979292866856225e-05, "loss": 0.357, "step": 168 }, { "epoch": 0.062286925274117756, "grad_norm": 9.416549366808367, "learning_rate": 4.978983805167512e-05, "loss": 0.4798, "step": 169 }, { "epoch": 0.06265548696213029, "grad_norm": 15.930216721990016, "learning_rate": 4.9786747434787986e-05, "loss": 0.4193, "step": 170 }, { "epoch": 0.06302404865014281, "grad_norm": 6.6590275230314395, "learning_rate": 4.978365681790085e-05, "loss": 0.3806, "step": 171 }, { "epoch": 0.06339261033815535, "grad_norm": 7.4862580608350635, "learning_rate": 4.978056620101372e-05, "loss": 0.3019, "step": 172 }, { "epoch": 0.06376117202616788, "grad_norm": 4.235068176513431, "learning_rate": 4.977747558412659e-05, "loss": 0.2446, "step": 173 }, { "epoch": 0.06412973371418042, "grad_norm": 9.851771595223527, "learning_rate": 4.9774384967239464e-05, "loss": 0.3426, "step": 174 }, { "epoch": 0.06449829540219294, "grad_norm": 10.02736809745778, "learning_rate": 4.9771294350352335e-05, "loss": 0.5668, "step": 175 }, { "epoch": 0.06486685709020547, "grad_norm": 11.713388747589164, "learning_rate": 4.97682037334652e-05, "loss": 0.4724, "step": 176 }, { "epoch": 0.06523541877821801, "grad_norm": 6.461521994042285, "learning_rate": 4.976511311657807e-05, "loss": 0.4287, "step": 177 }, { "epoch": 0.06560398046623053, "grad_norm": 11.469605458133003, "learning_rate": 4.976202249969094e-05, "loss": 0.5007, "step": 178 }, { "epoch": 0.06597254215424307, "grad_norm": 14.5584113803308, "learning_rate": 4.975893188280381e-05, "loss": 0.4972, "step": 179 }, { "epoch": 0.0663411038422556, "grad_norm": 11.665658380443803, "learning_rate": 4.975584126591668e-05, "loss": 0.4373, "step": 180 }, { "epoch": 0.06670966553026812, "grad_norm": 5.694793719406507, "learning_rate": 4.975275064902955e-05, "loss": 0.4294, "step": 181 }, { "epoch": 0.06707822721828066, "grad_norm": 9.96811590300603, "learning_rate": 4.974966003214242e-05, "loss": 0.6771, "step": 182 }, { "epoch": 0.06744678890629319, "grad_norm": 7.848592896909979, "learning_rate": 4.974656941525529e-05, "loss": 0.3906, "step": 183 }, { "epoch": 0.06781535059430573, "grad_norm": 11.77633896522294, "learning_rate": 4.974347879836816e-05, "loss": 0.2937, "step": 184 }, { "epoch": 0.06818391228231825, "grad_norm": 8.458797899338622, "learning_rate": 4.974038818148103e-05, "loss": 0.3853, "step": 185 }, { "epoch": 0.06855247397033079, "grad_norm": 10.500947818665324, "learning_rate": 4.973729756459389e-05, "loss": 0.3784, "step": 186 }, { "epoch": 0.06892103565834332, "grad_norm": 7.794650752457303, "learning_rate": 4.973420694770676e-05, "loss": 0.4862, "step": 187 }, { "epoch": 0.06928959734635584, "grad_norm": 5.928647157379094, "learning_rate": 4.9731116330819633e-05, "loss": 0.3808, "step": 188 }, { "epoch": 0.06965815903436838, "grad_norm": 4.77134926128798, "learning_rate": 4.9728025713932505e-05, "loss": 0.2863, "step": 189 }, { "epoch": 0.0700267207223809, "grad_norm": 6.63129054424752, "learning_rate": 4.972493509704537e-05, "loss": 0.4625, "step": 190 }, { "epoch": 0.07039528241039344, "grad_norm": 17.722938217533233, "learning_rate": 4.972184448015824e-05, "loss": 0.6294, "step": 191 }, { "epoch": 0.07076384409840597, "grad_norm": 7.320418024269435, "learning_rate": 4.971875386327111e-05, "loss": 0.379, "step": 192 }, { "epoch": 0.0711324057864185, "grad_norm": 16.362669051122513, "learning_rate": 4.971566324638398e-05, "loss": 0.6355, "step": 193 }, { "epoch": 0.07150096747443103, "grad_norm": 9.006325935716049, "learning_rate": 4.9712572629496854e-05, "loss": 0.4224, "step": 194 }, { "epoch": 0.07186952916244356, "grad_norm": 8.6705886574105, "learning_rate": 4.970948201260972e-05, "loss": 0.4109, "step": 195 }, { "epoch": 0.0722380908504561, "grad_norm": 9.799439503737055, "learning_rate": 4.970639139572259e-05, "loss": 0.4526, "step": 196 }, { "epoch": 0.07260665253846862, "grad_norm": 11.963795405835697, "learning_rate": 4.970330077883546e-05, "loss": 0.3987, "step": 197 }, { "epoch": 0.07297521422648116, "grad_norm": 8.514646141418574, "learning_rate": 4.970021016194833e-05, "loss": 0.4439, "step": 198 }, { "epoch": 0.07334377591449369, "grad_norm": 18.589930345590492, "learning_rate": 4.9697119545061196e-05, "loss": 0.6489, "step": 199 }, { "epoch": 0.07371233760250621, "grad_norm": 5.903904363296973, "learning_rate": 4.969402892817406e-05, "loss": 0.3778, "step": 200 }, { "epoch": 0.07408089929051875, "grad_norm": 10.925046272092548, "learning_rate": 4.969093831128693e-05, "loss": 0.5356, "step": 201 }, { "epoch": 0.07444946097853128, "grad_norm": 6.192243819037191, "learning_rate": 4.96878476943998e-05, "loss": 0.416, "step": 202 }, { "epoch": 0.07481802266654382, "grad_norm": 8.030689976978794, "learning_rate": 4.9684757077512674e-05, "loss": 0.4246, "step": 203 }, { "epoch": 0.07518658435455634, "grad_norm": 5.659655478408323, "learning_rate": 4.9681666460625545e-05, "loss": 0.4813, "step": 204 }, { "epoch": 0.07555514604256887, "grad_norm": 8.228728644065027, "learning_rate": 4.967857584373841e-05, "loss": 0.3306, "step": 205 }, { "epoch": 0.07592370773058141, "grad_norm": 6.257070890337506, "learning_rate": 4.967548522685128e-05, "loss": 0.4859, "step": 206 }, { "epoch": 0.07629226941859393, "grad_norm": 4.485443094367335, "learning_rate": 4.967239460996415e-05, "loss": 0.3014, "step": 207 }, { "epoch": 0.07666083110660647, "grad_norm": 14.79910226109594, "learning_rate": 4.9669303993077023e-05, "loss": 0.8493, "step": 208 }, { "epoch": 0.077029392794619, "grad_norm": 10.60310423399547, "learning_rate": 4.966621337618989e-05, "loss": 0.4251, "step": 209 }, { "epoch": 0.07739795448263154, "grad_norm": 6.8587623965660764, "learning_rate": 4.966312275930276e-05, "loss": 0.3164, "step": 210 }, { "epoch": 0.07776651617064406, "grad_norm": 7.961795180429926, "learning_rate": 4.966003214241563e-05, "loss": 0.4792, "step": 211 }, { "epoch": 0.07813507785865659, "grad_norm": 9.873377413173708, "learning_rate": 4.96569415255285e-05, "loss": 0.355, "step": 212 }, { "epoch": 0.07850363954666913, "grad_norm": 7.985658187922642, "learning_rate": 4.965385090864137e-05, "loss": 0.4856, "step": 213 }, { "epoch": 0.07887220123468165, "grad_norm": 7.006895755951106, "learning_rate": 4.965076029175424e-05, "loss": 0.5069, "step": 214 }, { "epoch": 0.07924076292269419, "grad_norm": 9.182425206786204, "learning_rate": 4.96476696748671e-05, "loss": 0.3855, "step": 215 }, { "epoch": 0.07960932461070672, "grad_norm": 6.681106220407601, "learning_rate": 4.964457905797997e-05, "loss": 0.348, "step": 216 }, { "epoch": 0.07997788629871924, "grad_norm": 18.26919925331764, "learning_rate": 4.9641488441092844e-05, "loss": 0.5244, "step": 217 }, { "epoch": 0.08034644798673178, "grad_norm": 17.26264291271853, "learning_rate": 4.9638397824205715e-05, "loss": 0.3773, "step": 218 }, { "epoch": 0.0807150096747443, "grad_norm": 15.756986007235856, "learning_rate": 4.963530720731858e-05, "loss": 0.4562, "step": 219 }, { "epoch": 0.08108357136275685, "grad_norm": 13.596426864670295, "learning_rate": 4.963221659043145e-05, "loss": 0.5705, "step": 220 }, { "epoch": 0.08145213305076937, "grad_norm": 5.5334290775763435, "learning_rate": 4.962912597354432e-05, "loss": 0.3433, "step": 221 }, { "epoch": 0.08182069473878191, "grad_norm": 26.289351549302477, "learning_rate": 4.962603535665719e-05, "loss": 0.564, "step": 222 }, { "epoch": 0.08218925642679444, "grad_norm": 6.25880849010579, "learning_rate": 4.9622944739770064e-05, "loss": 0.7557, "step": 223 }, { "epoch": 0.08255781811480696, "grad_norm": 14.419408519111164, "learning_rate": 4.961985412288293e-05, "loss": 0.3325, "step": 224 }, { "epoch": 0.0829263798028195, "grad_norm": 15.106398564445817, "learning_rate": 4.96167635059958e-05, "loss": 0.3414, "step": 225 }, { "epoch": 0.08329494149083203, "grad_norm": 6.683369153973991, "learning_rate": 4.961367288910867e-05, "loss": 0.4624, "step": 226 }, { "epoch": 0.08366350317884456, "grad_norm": 10.457827937059053, "learning_rate": 4.961058227222154e-05, "loss": 0.3954, "step": 227 }, { "epoch": 0.08403206486685709, "grad_norm": 20.008662064645588, "learning_rate": 4.960749165533441e-05, "loss": 0.41, "step": 228 }, { "epoch": 0.08440062655486961, "grad_norm": 14.130647593653153, "learning_rate": 4.960440103844728e-05, "loss": 0.3722, "step": 229 }, { "epoch": 0.08476918824288215, "grad_norm": 6.810013343473069, "learning_rate": 4.960131042156014e-05, "loss": 0.4367, "step": 230 }, { "epoch": 0.08513774993089468, "grad_norm": 7.711059693022449, "learning_rate": 4.9598219804673013e-05, "loss": 0.4121, "step": 231 }, { "epoch": 0.08550631161890722, "grad_norm": 7.729364301788707, "learning_rate": 4.9595129187785885e-05, "loss": 0.4009, "step": 232 }, { "epoch": 0.08587487330691974, "grad_norm": 9.499273975135548, "learning_rate": 4.959203857089875e-05, "loss": 0.5158, "step": 233 }, { "epoch": 0.08624343499493228, "grad_norm": 6.384375167301454, "learning_rate": 4.958894795401162e-05, "loss": 0.297, "step": 234 }, { "epoch": 0.08661199668294481, "grad_norm": 15.946893294987547, "learning_rate": 4.958585733712449e-05, "loss": 0.5061, "step": 235 }, { "epoch": 0.08698055837095733, "grad_norm": 5.8002159999220675, "learning_rate": 4.958276672023736e-05, "loss": 0.4035, "step": 236 }, { "epoch": 0.08734912005896987, "grad_norm": 10.42787320362025, "learning_rate": 4.9579676103350234e-05, "loss": 0.4433, "step": 237 }, { "epoch": 0.0877176817469824, "grad_norm": 8.212935911364443, "learning_rate": 4.95765854864631e-05, "loss": 0.4108, "step": 238 }, { "epoch": 0.08808624343499494, "grad_norm": 9.944799657216226, "learning_rate": 4.957349486957597e-05, "loss": 0.4199, "step": 239 }, { "epoch": 0.08845480512300746, "grad_norm": 6.054688760080514, "learning_rate": 4.957040425268884e-05, "loss": 0.3025, "step": 240 }, { "epoch": 0.08882336681101999, "grad_norm": 16.775919886465523, "learning_rate": 4.956731363580171e-05, "loss": 0.3815, "step": 241 }, { "epoch": 0.08919192849903253, "grad_norm": 12.926733282218263, "learning_rate": 4.9564223018914576e-05, "loss": 0.4169, "step": 242 }, { "epoch": 0.08956049018704505, "grad_norm": 17.70041920655809, "learning_rate": 4.956113240202745e-05, "loss": 0.6847, "step": 243 }, { "epoch": 0.08992905187505759, "grad_norm": 9.755855660347141, "learning_rate": 4.955804178514032e-05, "loss": 0.3967, "step": 244 }, { "epoch": 0.09029761356307012, "grad_norm": 8.392341537638647, "learning_rate": 4.955495116825318e-05, "loss": 0.4497, "step": 245 }, { "epoch": 0.09066617525108266, "grad_norm": 8.806241890854144, "learning_rate": 4.9551860551366054e-05, "loss": 0.2952, "step": 246 }, { "epoch": 0.09103473693909518, "grad_norm": 9.949964851048941, "learning_rate": 4.9548769934478925e-05, "loss": 0.4272, "step": 247 }, { "epoch": 0.0914032986271077, "grad_norm": 7.0008124833654914, "learning_rate": 4.954567931759179e-05, "loss": 0.4557, "step": 248 }, { "epoch": 0.09177186031512025, "grad_norm": 10.991603768038342, "learning_rate": 4.954258870070466e-05, "loss": 0.5041, "step": 249 }, { "epoch": 0.09214042200313277, "grad_norm": 9.749454580848276, "learning_rate": 4.953949808381753e-05, "loss": 0.4252, "step": 250 }, { "epoch": 0.09250898369114531, "grad_norm": 6.627141354532811, "learning_rate": 4.95364074669304e-05, "loss": 0.3905, "step": 251 }, { "epoch": 0.09287754537915784, "grad_norm": 9.490299291363177, "learning_rate": 4.953331685004327e-05, "loss": 0.3526, "step": 252 }, { "epoch": 0.09324610706717036, "grad_norm": 33.361202745596756, "learning_rate": 4.953022623315614e-05, "loss": 0.4038, "step": 253 }, { "epoch": 0.0936146687551829, "grad_norm": 8.054096425936384, "learning_rate": 4.952713561626901e-05, "loss": 0.3807, "step": 254 }, { "epoch": 0.09398323044319543, "grad_norm": 14.830485952882084, "learning_rate": 4.952404499938188e-05, "loss": 0.2997, "step": 255 }, { "epoch": 0.09435179213120796, "grad_norm": 9.837677163221684, "learning_rate": 4.952095438249475e-05, "loss": 0.4424, "step": 256 }, { "epoch": 0.09472035381922049, "grad_norm": 7.215874697931637, "learning_rate": 4.951786376560762e-05, "loss": 0.2422, "step": 257 }, { "epoch": 0.09508891550723303, "grad_norm": 13.03725435897847, "learning_rate": 4.951477314872049e-05, "loss": 0.4605, "step": 258 }, { "epoch": 0.09545747719524555, "grad_norm": 6.956861816262157, "learning_rate": 4.951168253183336e-05, "loss": 0.4944, "step": 259 }, { "epoch": 0.09582603888325808, "grad_norm": 7.968322503085803, "learning_rate": 4.9508591914946224e-05, "loss": 0.3495, "step": 260 }, { "epoch": 0.09619460057127062, "grad_norm": 8.75542695821872, "learning_rate": 4.9505501298059095e-05, "loss": 0.4713, "step": 261 }, { "epoch": 0.09656316225928314, "grad_norm": 7.7050026236003, "learning_rate": 4.950241068117196e-05, "loss": 0.3095, "step": 262 }, { "epoch": 0.09693172394729568, "grad_norm": 8.160919139183347, "learning_rate": 4.949932006428483e-05, "loss": 0.2964, "step": 263 }, { "epoch": 0.09730028563530821, "grad_norm": 10.764817427643084, "learning_rate": 4.94962294473977e-05, "loss": 0.229, "step": 264 }, { "epoch": 0.09766884732332073, "grad_norm": 6.7558805493488645, "learning_rate": 4.949313883051057e-05, "loss": 0.2975, "step": 265 }, { "epoch": 0.09803740901133327, "grad_norm": 21.624652132091846, "learning_rate": 4.9490048213623444e-05, "loss": 0.3914, "step": 266 }, { "epoch": 0.0984059706993458, "grad_norm": 6.344942629098607, "learning_rate": 4.948695759673631e-05, "loss": 0.3568, "step": 267 }, { "epoch": 0.09877453238735834, "grad_norm": 8.198293312917567, "learning_rate": 4.948386697984918e-05, "loss": 0.4752, "step": 268 }, { "epoch": 0.09914309407537086, "grad_norm": 10.21872461531633, "learning_rate": 4.948077636296205e-05, "loss": 0.2894, "step": 269 }, { "epoch": 0.0995116557633834, "grad_norm": 8.224391610970715, "learning_rate": 4.947768574607492e-05, "loss": 0.4179, "step": 270 }, { "epoch": 0.09988021745139593, "grad_norm": 10.197693388186032, "learning_rate": 4.9474595129187787e-05, "loss": 0.6413, "step": 271 }, { "epoch": 0.10024877913940845, "grad_norm": 8.250425847940413, "learning_rate": 4.947150451230066e-05, "loss": 0.3011, "step": 272 }, { "epoch": 0.10061734082742099, "grad_norm": 16.45542017912288, "learning_rate": 4.946841389541353e-05, "loss": 0.4161, "step": 273 }, { "epoch": 0.10098590251543352, "grad_norm": 13.231236253387994, "learning_rate": 4.94653232785264e-05, "loss": 0.5175, "step": 274 }, { "epoch": 0.10135446420344606, "grad_norm": 7.525821232894931, "learning_rate": 4.9462232661639265e-05, "loss": 0.3133, "step": 275 }, { "epoch": 0.10172302589145858, "grad_norm": 11.420527638339918, "learning_rate": 4.9459142044752136e-05, "loss": 0.275, "step": 276 }, { "epoch": 0.10209158757947111, "grad_norm": 15.585028952215849, "learning_rate": 4.9456051427865e-05, "loss": 0.3255, "step": 277 }, { "epoch": 0.10246014926748365, "grad_norm": 44.52815641666728, "learning_rate": 4.945296081097787e-05, "loss": 0.4601, "step": 278 }, { "epoch": 0.10282871095549617, "grad_norm": 23.64866075010248, "learning_rate": 4.944987019409074e-05, "loss": 0.3527, "step": 279 }, { "epoch": 0.10319727264350871, "grad_norm": 8.337257173539411, "learning_rate": 4.9446779577203614e-05, "loss": 0.3591, "step": 280 }, { "epoch": 0.10356583433152124, "grad_norm": 7.472018368700472, "learning_rate": 4.944368896031648e-05, "loss": 0.5968, "step": 281 }, { "epoch": 0.10393439601953378, "grad_norm": 33.010836584843624, "learning_rate": 4.944059834342935e-05, "loss": 0.3084, "step": 282 }, { "epoch": 0.1043029577075463, "grad_norm": 10.19245724506033, "learning_rate": 4.943750772654222e-05, "loss": 0.4392, "step": 283 }, { "epoch": 0.10467151939555883, "grad_norm": 11.719554659874293, "learning_rate": 4.943441710965509e-05, "loss": 0.4417, "step": 284 }, { "epoch": 0.10504008108357137, "grad_norm": 6.6549305928979505, "learning_rate": 4.943132649276796e-05, "loss": 0.3015, "step": 285 }, { "epoch": 0.10540864277158389, "grad_norm": 11.417088039075121, "learning_rate": 4.942823587588083e-05, "loss": 0.4865, "step": 286 }, { "epoch": 0.10577720445959643, "grad_norm": 9.00169865154801, "learning_rate": 4.94251452589937e-05, "loss": 0.4205, "step": 287 }, { "epoch": 0.10614576614760896, "grad_norm": 16.551359755791214, "learning_rate": 4.942205464210657e-05, "loss": 0.5124, "step": 288 }, { "epoch": 0.10651432783562148, "grad_norm": 8.016036649547862, "learning_rate": 4.941896402521944e-05, "loss": 0.3609, "step": 289 }, { "epoch": 0.10688288952363402, "grad_norm": 9.820463605763676, "learning_rate": 4.9415873408332305e-05, "loss": 0.4911, "step": 290 }, { "epoch": 0.10725145121164655, "grad_norm": 6.495485645488869, "learning_rate": 4.941278279144517e-05, "loss": 0.4215, "step": 291 }, { "epoch": 0.10762001289965908, "grad_norm": 16.49413588384331, "learning_rate": 4.940969217455804e-05, "loss": 0.3972, "step": 292 }, { "epoch": 0.10798857458767161, "grad_norm": 18.198981891702932, "learning_rate": 4.940660155767091e-05, "loss": 0.462, "step": 293 }, { "epoch": 0.10835713627568415, "grad_norm": 6.91782096821262, "learning_rate": 4.940351094078378e-05, "loss": 0.2921, "step": 294 }, { "epoch": 0.10872569796369667, "grad_norm": 7.571687020468919, "learning_rate": 4.9400420323896655e-05, "loss": 0.4118, "step": 295 }, { "epoch": 0.1090942596517092, "grad_norm": 5.33490400233325, "learning_rate": 4.939732970700952e-05, "loss": 0.3518, "step": 296 }, { "epoch": 0.10946282133972174, "grad_norm": 10.53157237660326, "learning_rate": 4.939423909012239e-05, "loss": 0.2632, "step": 297 }, { "epoch": 0.10983138302773426, "grad_norm": 5.562210418364193, "learning_rate": 4.939114847323526e-05, "loss": 0.3794, "step": 298 }, { "epoch": 0.1101999447157468, "grad_norm": 8.690483184831546, "learning_rate": 4.938805785634813e-05, "loss": 0.3158, "step": 299 }, { "epoch": 0.11056850640375933, "grad_norm": 62.87730643135487, "learning_rate": 4.9384967239461e-05, "loss": 0.5011, "step": 300 }, { "epoch": 0.11093706809177185, "grad_norm": 10.802627201545299, "learning_rate": 4.938187662257387e-05, "loss": 0.5271, "step": 301 }, { "epoch": 0.11130562977978439, "grad_norm": 9.136059473462108, "learning_rate": 4.937878600568674e-05, "loss": 0.2318, "step": 302 }, { "epoch": 0.11167419146779692, "grad_norm": 15.19056199842932, "learning_rate": 4.937569538879961e-05, "loss": 0.3512, "step": 303 }, { "epoch": 0.11204275315580946, "grad_norm": 18.55133028838498, "learning_rate": 4.937260477191248e-05, "loss": 0.577, "step": 304 }, { "epoch": 0.11241131484382198, "grad_norm": 12.819657871007616, "learning_rate": 4.9369514155025346e-05, "loss": 0.3077, "step": 305 }, { "epoch": 0.11277987653183452, "grad_norm": 16.57231891288575, "learning_rate": 4.936642353813821e-05, "loss": 0.3023, "step": 306 }, { "epoch": 0.11314843821984705, "grad_norm": 19.56455285754379, "learning_rate": 4.936333292125108e-05, "loss": 0.4686, "step": 307 }, { "epoch": 0.11351699990785957, "grad_norm": 11.496315407309245, "learning_rate": 4.936024230436395e-05, "loss": 0.2909, "step": 308 }, { "epoch": 0.11388556159587211, "grad_norm": 14.303091096577417, "learning_rate": 4.9357151687476824e-05, "loss": 0.3013, "step": 309 }, { "epoch": 0.11425412328388464, "grad_norm": 10.42613542626211, "learning_rate": 4.935406107058969e-05, "loss": 0.2646, "step": 310 }, { "epoch": 0.11462268497189718, "grad_norm": 14.645227773346617, "learning_rate": 4.935097045370256e-05, "loss": 0.3085, "step": 311 }, { "epoch": 0.1149912466599097, "grad_norm": 7.111593114226548, "learning_rate": 4.934787983681543e-05, "loss": 0.3265, "step": 312 }, { "epoch": 0.11535980834792223, "grad_norm": 15.386964190254929, "learning_rate": 4.93447892199283e-05, "loss": 0.4131, "step": 313 }, { "epoch": 0.11572837003593477, "grad_norm": 11.674621340637374, "learning_rate": 4.9341698603041167e-05, "loss": 0.4327, "step": 314 }, { "epoch": 0.11609693172394729, "grad_norm": 10.938771549971273, "learning_rate": 4.933860798615404e-05, "loss": 0.5495, "step": 315 }, { "epoch": 0.11646549341195983, "grad_norm": 8.28398852862453, "learning_rate": 4.933551736926691e-05, "loss": 0.332, "step": 316 }, { "epoch": 0.11683405509997236, "grad_norm": 4.929249884622287, "learning_rate": 4.933242675237978e-05, "loss": 0.2662, "step": 317 }, { "epoch": 0.1172026167879849, "grad_norm": 16.068677374884498, "learning_rate": 4.932933613549265e-05, "loss": 0.3815, "step": 318 }, { "epoch": 0.11757117847599742, "grad_norm": 10.844558377874721, "learning_rate": 4.9326245518605516e-05, "loss": 0.5211, "step": 319 }, { "epoch": 0.11793974016400995, "grad_norm": 11.621788319579617, "learning_rate": 4.932315490171839e-05, "loss": 0.5584, "step": 320 }, { "epoch": 0.11830830185202248, "grad_norm": 11.772311839643251, "learning_rate": 4.932006428483125e-05, "loss": 0.5003, "step": 321 }, { "epoch": 0.11867686354003501, "grad_norm": 11.334339620757438, "learning_rate": 4.931697366794412e-05, "loss": 0.4395, "step": 322 }, { "epoch": 0.11904542522804755, "grad_norm": 5.537104035766826, "learning_rate": 4.9313883051056994e-05, "loss": 0.4423, "step": 323 }, { "epoch": 0.11941398691606007, "grad_norm": 6.602878127964646, "learning_rate": 4.931079243416986e-05, "loss": 0.3388, "step": 324 }, { "epoch": 0.1197825486040726, "grad_norm": 10.202714476570913, "learning_rate": 4.930770181728273e-05, "loss": 0.5349, "step": 325 }, { "epoch": 0.12015111029208514, "grad_norm": 10.497322104341883, "learning_rate": 4.93046112003956e-05, "loss": 0.4084, "step": 326 }, { "epoch": 0.12051967198009766, "grad_norm": 13.399182855543433, "learning_rate": 4.930152058350847e-05, "loss": 0.3141, "step": 327 }, { "epoch": 0.1208882336681102, "grad_norm": 8.419863657085324, "learning_rate": 4.929842996662134e-05, "loss": 0.48, "step": 328 }, { "epoch": 0.12125679535612273, "grad_norm": 75.17162654552581, "learning_rate": 4.929533934973421e-05, "loss": 0.4499, "step": 329 }, { "epoch": 0.12162535704413527, "grad_norm": 17.959885721161687, "learning_rate": 4.929224873284708e-05, "loss": 0.4534, "step": 330 }, { "epoch": 0.1219939187321478, "grad_norm": 8.57618321744012, "learning_rate": 4.928915811595995e-05, "loss": 0.3563, "step": 331 }, { "epoch": 0.12236248042016032, "grad_norm": 12.879521733283616, "learning_rate": 4.928606749907282e-05, "loss": 0.3019, "step": 332 }, { "epoch": 0.12273104210817286, "grad_norm": 14.906871342854673, "learning_rate": 4.9282976882185685e-05, "loss": 0.3784, "step": 333 }, { "epoch": 0.12309960379618538, "grad_norm": 7.925869809140528, "learning_rate": 4.9279886265298556e-05, "loss": 0.3053, "step": 334 }, { "epoch": 0.12346816548419792, "grad_norm": 13.773649650060698, "learning_rate": 4.927679564841143e-05, "loss": 0.3296, "step": 335 }, { "epoch": 0.12383672717221045, "grad_norm": 23.581937750870633, "learning_rate": 4.927370503152429e-05, "loss": 0.3182, "step": 336 }, { "epoch": 0.12420528886022297, "grad_norm": 27.5358253859858, "learning_rate": 4.927061441463716e-05, "loss": 0.5734, "step": 337 }, { "epoch": 0.12457385054823551, "grad_norm": 16.260291963140222, "learning_rate": 4.9267523797750034e-05, "loss": 0.4361, "step": 338 }, { "epoch": 0.12494241223624804, "grad_norm": 23.794313870305103, "learning_rate": 4.92644331808629e-05, "loss": 0.6518, "step": 339 }, { "epoch": 0.12531097392426058, "grad_norm": 33.71188892776738, "learning_rate": 4.926134256397577e-05, "loss": 0.4093, "step": 340 }, { "epoch": 0.1256795356122731, "grad_norm": 5.004291981131984, "learning_rate": 4.925825194708864e-05, "loss": 0.23, "step": 341 }, { "epoch": 0.12604809730028563, "grad_norm": 14.20687981085083, "learning_rate": 4.925516133020151e-05, "loss": 0.4476, "step": 342 }, { "epoch": 0.12641665898829815, "grad_norm": 27.485606362765697, "learning_rate": 4.925207071331438e-05, "loss": 0.3893, "step": 343 }, { "epoch": 0.1267852206763107, "grad_norm": 9.715818300284369, "learning_rate": 4.924898009642725e-05, "loss": 0.3164, "step": 344 }, { "epoch": 0.12715378236432323, "grad_norm": 10.44938156999315, "learning_rate": 4.924588947954012e-05, "loss": 0.3087, "step": 345 }, { "epoch": 0.12752234405233576, "grad_norm": 13.88902330121592, "learning_rate": 4.924279886265299e-05, "loss": 0.4587, "step": 346 }, { "epoch": 0.12789090574034828, "grad_norm": 15.322568238128666, "learning_rate": 4.923970824576586e-05, "loss": 0.2573, "step": 347 }, { "epoch": 0.12825946742836083, "grad_norm": 15.127996983229782, "learning_rate": 4.9236617628878726e-05, "loss": 0.4228, "step": 348 }, { "epoch": 0.12862802911637336, "grad_norm": 5.131747386397051, "learning_rate": 4.92335270119916e-05, "loss": 0.2515, "step": 349 }, { "epoch": 0.12899659080438589, "grad_norm": 59.58399689133715, "learning_rate": 4.923043639510447e-05, "loss": 0.2371, "step": 350 }, { "epoch": 0.1293651524923984, "grad_norm": 9.724407156400694, "learning_rate": 4.922734577821733e-05, "loss": 0.438, "step": 351 }, { "epoch": 0.12973371418041094, "grad_norm": 27.779083682410675, "learning_rate": 4.9224255161330204e-05, "loss": 0.5809, "step": 352 }, { "epoch": 0.1301022758684235, "grad_norm": 17.909500652532078, "learning_rate": 4.922116454444307e-05, "loss": 0.4425, "step": 353 }, { "epoch": 0.13047083755643601, "grad_norm": 8.544579680648154, "learning_rate": 4.921807392755594e-05, "loss": 0.2282, "step": 354 }, { "epoch": 0.13083939924444854, "grad_norm": 16.970784832817746, "learning_rate": 4.921498331066881e-05, "loss": 0.4567, "step": 355 }, { "epoch": 0.13120796093246107, "grad_norm": 35.83372414545846, "learning_rate": 4.921189269378168e-05, "loss": 0.3751, "step": 356 }, { "epoch": 0.1315765226204736, "grad_norm": 7.696094949513281, "learning_rate": 4.920880207689455e-05, "loss": 0.4459, "step": 357 }, { "epoch": 0.13194508430848614, "grad_norm": 15.99976014911166, "learning_rate": 4.920571146000742e-05, "loss": 0.471, "step": 358 }, { "epoch": 0.13231364599649867, "grad_norm": 10.54848692832535, "learning_rate": 4.920262084312029e-05, "loss": 0.3655, "step": 359 }, { "epoch": 0.1326822076845112, "grad_norm": 6.966344306946771, "learning_rate": 4.919953022623316e-05, "loss": 0.2074, "step": 360 }, { "epoch": 0.13305076937252372, "grad_norm": 10.763469330735857, "learning_rate": 4.919643960934603e-05, "loss": 0.3426, "step": 361 }, { "epoch": 0.13341933106053624, "grad_norm": 10.301334139148517, "learning_rate": 4.9193348992458896e-05, "loss": 0.4701, "step": 362 }, { "epoch": 0.1337878927485488, "grad_norm": 5.543174691437213, "learning_rate": 4.919025837557177e-05, "loss": 0.186, "step": 363 }, { "epoch": 0.13415645443656132, "grad_norm": 9.552288911549265, "learning_rate": 4.918716775868464e-05, "loss": 0.3742, "step": 364 }, { "epoch": 0.13452501612457385, "grad_norm": 10.25562708774508, "learning_rate": 4.918407714179751e-05, "loss": 0.5377, "step": 365 }, { "epoch": 0.13489357781258637, "grad_norm": 17.87527828400075, "learning_rate": 4.9180986524910374e-05, "loss": 0.469, "step": 366 }, { "epoch": 0.1352621395005989, "grad_norm": 5.48938802508008, "learning_rate": 4.9177895908023245e-05, "loss": 0.2447, "step": 367 }, { "epoch": 0.13563070118861145, "grad_norm": 8.103609538275405, "learning_rate": 4.917480529113611e-05, "loss": 0.3217, "step": 368 }, { "epoch": 0.13599926287662398, "grad_norm": 6.26363533373345, "learning_rate": 4.917171467424898e-05, "loss": 0.2275, "step": 369 }, { "epoch": 0.1363678245646365, "grad_norm": 10.64259658450638, "learning_rate": 4.916862405736185e-05, "loss": 0.3341, "step": 370 }, { "epoch": 0.13673638625264903, "grad_norm": 13.542958609745266, "learning_rate": 4.916553344047472e-05, "loss": 0.485, "step": 371 }, { "epoch": 0.13710494794066158, "grad_norm": 11.242737184702703, "learning_rate": 4.916244282358759e-05, "loss": 0.4405, "step": 372 }, { "epoch": 0.1374735096286741, "grad_norm": 9.509715282458144, "learning_rate": 4.915935220670046e-05, "loss": 0.3536, "step": 373 }, { "epoch": 0.13784207131668663, "grad_norm": 9.98061743101093, "learning_rate": 4.915626158981333e-05, "loss": 0.4049, "step": 374 }, { "epoch": 0.13821063300469916, "grad_norm": 9.668445379665249, "learning_rate": 4.91531709729262e-05, "loss": 0.4889, "step": 375 }, { "epoch": 0.13857919469271168, "grad_norm": 8.32272833912269, "learning_rate": 4.915008035603907e-05, "loss": 0.3345, "step": 376 }, { "epoch": 0.13894775638072424, "grad_norm": 9.406533880163929, "learning_rate": 4.9146989739151936e-05, "loss": 0.4119, "step": 377 }, { "epoch": 0.13931631806873676, "grad_norm": 14.718883732473673, "learning_rate": 4.914389912226481e-05, "loss": 0.4099, "step": 378 }, { "epoch": 0.1396848797567493, "grad_norm": 12.605599482200919, "learning_rate": 4.914080850537768e-05, "loss": 0.2963, "step": 379 }, { "epoch": 0.1400534414447618, "grad_norm": 15.828373717813882, "learning_rate": 4.913771788849055e-05, "loss": 0.6456, "step": 380 }, { "epoch": 0.14042200313277434, "grad_norm": 10.40351790571118, "learning_rate": 4.9134627271603414e-05, "loss": 0.4365, "step": 381 }, { "epoch": 0.1407905648207869, "grad_norm": 8.420149079586473, "learning_rate": 4.913153665471628e-05, "loss": 0.3906, "step": 382 }, { "epoch": 0.14115912650879942, "grad_norm": 6.524458717315566, "learning_rate": 4.912844603782915e-05, "loss": 0.2739, "step": 383 }, { "epoch": 0.14152768819681194, "grad_norm": 6.857592931236651, "learning_rate": 4.912535542094202e-05, "loss": 0.3033, "step": 384 }, { "epoch": 0.14189624988482447, "grad_norm": 4.731482701896902, "learning_rate": 4.912226480405489e-05, "loss": 0.318, "step": 385 }, { "epoch": 0.142264811572837, "grad_norm": 5.442389734121069, "learning_rate": 4.9119174187167764e-05, "loss": 0.345, "step": 386 }, { "epoch": 0.14263337326084954, "grad_norm": 6.562665664761645, "learning_rate": 4.911608357028063e-05, "loss": 0.3638, "step": 387 }, { "epoch": 0.14300193494886207, "grad_norm": 5.995850081571719, "learning_rate": 4.91129929533935e-05, "loss": 0.4389, "step": 388 }, { "epoch": 0.1433704966368746, "grad_norm": 8.709041098225793, "learning_rate": 4.910990233650637e-05, "loss": 0.4365, "step": 389 }, { "epoch": 0.14373905832488712, "grad_norm": 6.7377298234515255, "learning_rate": 4.910681171961924e-05, "loss": 0.2544, "step": 390 }, { "epoch": 0.14410762001289965, "grad_norm": 5.272863467021093, "learning_rate": 4.9103721102732106e-05, "loss": 0.3421, "step": 391 }, { "epoch": 0.1444761817009122, "grad_norm": 9.472723098990926, "learning_rate": 4.910063048584498e-05, "loss": 0.438, "step": 392 }, { "epoch": 0.14484474338892472, "grad_norm": 13.382224155193477, "learning_rate": 4.909753986895785e-05, "loss": 0.7009, "step": 393 }, { "epoch": 0.14521330507693725, "grad_norm": 10.415577417008022, "learning_rate": 4.909444925207072e-05, "loss": 0.4301, "step": 394 }, { "epoch": 0.14558186676494977, "grad_norm": 9.793188055357543, "learning_rate": 4.909135863518359e-05, "loss": 0.2521, "step": 395 }, { "epoch": 0.14595042845296233, "grad_norm": 14.189954100111855, "learning_rate": 4.908826801829645e-05, "loss": 0.321, "step": 396 }, { "epoch": 0.14631899014097485, "grad_norm": 6.353662723089674, "learning_rate": 4.908517740140932e-05, "loss": 0.4351, "step": 397 }, { "epoch": 0.14668755182898738, "grad_norm": 7.63836474229133, "learning_rate": 4.908208678452219e-05, "loss": 0.5044, "step": 398 }, { "epoch": 0.1470561135169999, "grad_norm": 9.323480673553048, "learning_rate": 4.907899616763506e-05, "loss": 0.4199, "step": 399 }, { "epoch": 0.14742467520501243, "grad_norm": 4.733969290191743, "learning_rate": 4.907590555074793e-05, "loss": 0.3167, "step": 400 }, { "epoch": 0.14779323689302498, "grad_norm": 14.134730945316376, "learning_rate": 4.90728149338608e-05, "loss": 0.3987, "step": 401 }, { "epoch": 0.1481617985810375, "grad_norm": 7.638797970625689, "learning_rate": 4.906972431697367e-05, "loss": 0.3867, "step": 402 }, { "epoch": 0.14853036026905003, "grad_norm": 8.3845752238602, "learning_rate": 4.906663370008654e-05, "loss": 0.5592, "step": 403 }, { "epoch": 0.14889892195706256, "grad_norm": 6.280784779491259, "learning_rate": 4.906354308319941e-05, "loss": 0.397, "step": 404 }, { "epoch": 0.14926748364507508, "grad_norm": 6.478659590558222, "learning_rate": 4.9060452466312276e-05, "loss": 0.3464, "step": 405 }, { "epoch": 0.14963604533308764, "grad_norm": 7.103486357600554, "learning_rate": 4.905736184942515e-05, "loss": 0.398, "step": 406 }, { "epoch": 0.15000460702110016, "grad_norm": 10.124423163665803, "learning_rate": 4.905427123253802e-05, "loss": 0.4232, "step": 407 }, { "epoch": 0.1503731687091127, "grad_norm": 9.150085248706361, "learning_rate": 4.905118061565089e-05, "loss": 0.4917, "step": 408 }, { "epoch": 0.1507417303971252, "grad_norm": 8.396748558373, "learning_rate": 4.904808999876376e-05, "loss": 0.5096, "step": 409 }, { "epoch": 0.15111029208513774, "grad_norm": 12.68709168928114, "learning_rate": 4.9044999381876625e-05, "loss": 0.4531, "step": 410 }, { "epoch": 0.1514788537731503, "grad_norm": 6.125091552050148, "learning_rate": 4.904190876498949e-05, "loss": 0.3628, "step": 411 }, { "epoch": 0.15184741546116282, "grad_norm": 6.408718024498293, "learning_rate": 4.903881814810236e-05, "loss": 0.2822, "step": 412 }, { "epoch": 0.15221597714917534, "grad_norm": 7.156348082024361, "learning_rate": 4.903572753121523e-05, "loss": 0.3859, "step": 413 }, { "epoch": 0.15258453883718787, "grad_norm": 5.537952909181913, "learning_rate": 4.90326369143281e-05, "loss": 0.3458, "step": 414 }, { "epoch": 0.1529531005252004, "grad_norm": 8.957180309760572, "learning_rate": 4.902954629744097e-05, "loss": 0.5609, "step": 415 }, { "epoch": 0.15332166221321294, "grad_norm": 6.9551757531409715, "learning_rate": 4.902645568055384e-05, "loss": 0.335, "step": 416 }, { "epoch": 0.15369022390122547, "grad_norm": 10.3801859568144, "learning_rate": 4.902336506366671e-05, "loss": 0.5204, "step": 417 }, { "epoch": 0.154058785589238, "grad_norm": 8.004472436050497, "learning_rate": 4.902027444677958e-05, "loss": 0.4042, "step": 418 }, { "epoch": 0.15442734727725052, "grad_norm": 8.296574761369664, "learning_rate": 4.901718382989245e-05, "loss": 0.3241, "step": 419 }, { "epoch": 0.15479590896526307, "grad_norm": 9.077438775026547, "learning_rate": 4.9014093213005316e-05, "loss": 0.3749, "step": 420 }, { "epoch": 0.1551644706532756, "grad_norm": 12.098781909003923, "learning_rate": 4.901100259611819e-05, "loss": 0.4791, "step": 421 }, { "epoch": 0.15553303234128812, "grad_norm": 7.085063169597537, "learning_rate": 4.900791197923106e-05, "loss": 0.4294, "step": 422 }, { "epoch": 0.15590159402930065, "grad_norm": 10.926891828276487, "learning_rate": 4.900482136234393e-05, "loss": 0.3359, "step": 423 }, { "epoch": 0.15627015571731318, "grad_norm": 7.825417299691221, "learning_rate": 4.9001730745456794e-05, "loss": 0.464, "step": 424 }, { "epoch": 0.15663871740532573, "grad_norm": 9.216927454905152, "learning_rate": 4.8998640128569666e-05, "loss": 0.3492, "step": 425 }, { "epoch": 0.15700727909333825, "grad_norm": 5.606631997659355, "learning_rate": 4.899554951168254e-05, "loss": 0.332, "step": 426 }, { "epoch": 0.15737584078135078, "grad_norm": 9.400040614263572, "learning_rate": 4.89924588947954e-05, "loss": 0.3769, "step": 427 }, { "epoch": 0.1577444024693633, "grad_norm": 8.401386655159504, "learning_rate": 4.898936827790827e-05, "loss": 0.3768, "step": 428 }, { "epoch": 0.15811296415737583, "grad_norm": 6.372656896492244, "learning_rate": 4.8986277661021144e-05, "loss": 0.2865, "step": 429 }, { "epoch": 0.15848152584538838, "grad_norm": 18.377506837663248, "learning_rate": 4.898318704413401e-05, "loss": 0.4874, "step": 430 }, { "epoch": 0.1588500875334009, "grad_norm": 8.99322848421722, "learning_rate": 4.898009642724688e-05, "loss": 0.3835, "step": 431 }, { "epoch": 0.15921864922141343, "grad_norm": 6.2516934961516, "learning_rate": 4.897700581035975e-05, "loss": 0.3216, "step": 432 }, { "epoch": 0.15958721090942596, "grad_norm": 11.730464847651698, "learning_rate": 4.897391519347262e-05, "loss": 0.3477, "step": 433 }, { "epoch": 0.15995577259743848, "grad_norm": 7.250759216544747, "learning_rate": 4.8970824576585486e-05, "loss": 0.5108, "step": 434 }, { "epoch": 0.16032433428545104, "grad_norm": 8.89021522905085, "learning_rate": 4.896773395969836e-05, "loss": 0.4139, "step": 435 }, { "epoch": 0.16069289597346356, "grad_norm": 6.62075727389872, "learning_rate": 4.896464334281123e-05, "loss": 0.342, "step": 436 }, { "epoch": 0.1610614576614761, "grad_norm": 14.63212789959537, "learning_rate": 4.89615527259241e-05, "loss": 0.3192, "step": 437 }, { "epoch": 0.1614300193494886, "grad_norm": 10.321364413414159, "learning_rate": 4.895846210903697e-05, "loss": 0.6709, "step": 438 }, { "epoch": 0.16179858103750114, "grad_norm": 7.295636939937116, "learning_rate": 4.8955371492149835e-05, "loss": 0.4198, "step": 439 }, { "epoch": 0.1621671427255137, "grad_norm": 9.032865069687883, "learning_rate": 4.8952280875262706e-05, "loss": 0.5473, "step": 440 }, { "epoch": 0.16253570441352622, "grad_norm": 5.991383405346655, "learning_rate": 4.894919025837558e-05, "loss": 0.5729, "step": 441 }, { "epoch": 0.16290426610153874, "grad_norm": 5.388565510401418, "learning_rate": 4.894609964148844e-05, "loss": 0.3469, "step": 442 }, { "epoch": 0.16327282778955127, "grad_norm": 5.9647970641182155, "learning_rate": 4.894300902460131e-05, "loss": 0.2249, "step": 443 }, { "epoch": 0.16364138947756382, "grad_norm": 8.104253015000971, "learning_rate": 4.893991840771418e-05, "loss": 0.5725, "step": 444 }, { "epoch": 0.16400995116557635, "grad_norm": 9.840957100074556, "learning_rate": 4.893682779082705e-05, "loss": 0.4803, "step": 445 }, { "epoch": 0.16437851285358887, "grad_norm": 9.999015759669017, "learning_rate": 4.893373717393992e-05, "loss": 0.3257, "step": 446 }, { "epoch": 0.1647470745416014, "grad_norm": 9.35266626866413, "learning_rate": 4.893064655705279e-05, "loss": 0.3155, "step": 447 }, { "epoch": 0.16511563622961392, "grad_norm": 7.335805678478171, "learning_rate": 4.892755594016566e-05, "loss": 0.3268, "step": 448 }, { "epoch": 0.16548419791762647, "grad_norm": 7.198519808888828, "learning_rate": 4.892446532327853e-05, "loss": 0.37, "step": 449 }, { "epoch": 0.165852759605639, "grad_norm": 11.919390458261041, "learning_rate": 4.89213747063914e-05, "loss": 0.4653, "step": 450 }, { "epoch": 0.16622132129365152, "grad_norm": 13.767225538336964, "learning_rate": 4.891828408950427e-05, "loss": 0.4393, "step": 451 }, { "epoch": 0.16658988298166405, "grad_norm": 4.064859497344809, "learning_rate": 4.891519347261714e-05, "loss": 0.218, "step": 452 }, { "epoch": 0.16695844466967658, "grad_norm": 6.83886628108037, "learning_rate": 4.8912102855730005e-05, "loss": 0.4822, "step": 453 }, { "epoch": 0.16732700635768913, "grad_norm": 5.532253713111885, "learning_rate": 4.8909012238842876e-05, "loss": 0.458, "step": 454 }, { "epoch": 0.16769556804570165, "grad_norm": 7.287705339054723, "learning_rate": 4.890592162195575e-05, "loss": 0.308, "step": 455 }, { "epoch": 0.16806412973371418, "grad_norm": 5.481022954259731, "learning_rate": 4.890283100506862e-05, "loss": 0.2898, "step": 456 }, { "epoch": 0.1684326914217267, "grad_norm": 14.075101775484054, "learning_rate": 4.889974038818148e-05, "loss": 0.4482, "step": 457 }, { "epoch": 0.16880125310973923, "grad_norm": 5.127351919013948, "learning_rate": 4.8896649771294354e-05, "loss": 0.2738, "step": 458 }, { "epoch": 0.16916981479775178, "grad_norm": 13.890928181449485, "learning_rate": 4.889355915440722e-05, "loss": 0.3991, "step": 459 }, { "epoch": 0.1695383764857643, "grad_norm": 8.413616370484663, "learning_rate": 4.889046853752009e-05, "loss": 0.4417, "step": 460 }, { "epoch": 0.16990693817377683, "grad_norm": 13.058546426215477, "learning_rate": 4.888737792063296e-05, "loss": 0.4791, "step": 461 }, { "epoch": 0.17027549986178936, "grad_norm": 7.247284479321807, "learning_rate": 4.888428730374583e-05, "loss": 0.5237, "step": 462 }, { "epoch": 0.17064406154980188, "grad_norm": 14.827411855545943, "learning_rate": 4.8881196686858696e-05, "loss": 0.4702, "step": 463 }, { "epoch": 0.17101262323781444, "grad_norm": 9.612260983674014, "learning_rate": 4.887810606997157e-05, "loss": 0.3899, "step": 464 }, { "epoch": 0.17138118492582696, "grad_norm": 5.0607500230596685, "learning_rate": 4.887501545308444e-05, "loss": 0.4095, "step": 465 }, { "epoch": 0.1717497466138395, "grad_norm": 10.027607764590154, "learning_rate": 4.887192483619731e-05, "loss": 0.4223, "step": 466 }, { "epoch": 0.172118308301852, "grad_norm": 7.569746092716923, "learning_rate": 4.886883421931018e-05, "loss": 0.3372, "step": 467 }, { "epoch": 0.17248686998986457, "grad_norm": 9.106806345473963, "learning_rate": 4.8865743602423045e-05, "loss": 0.3715, "step": 468 }, { "epoch": 0.1728554316778771, "grad_norm": 11.994706575812671, "learning_rate": 4.886265298553592e-05, "loss": 0.4167, "step": 469 }, { "epoch": 0.17322399336588962, "grad_norm": 5.9367726733799255, "learning_rate": 4.885956236864879e-05, "loss": 0.3266, "step": 470 }, { "epoch": 0.17359255505390214, "grad_norm": 6.793699588255208, "learning_rate": 4.885647175176166e-05, "loss": 0.4199, "step": 471 }, { "epoch": 0.17396111674191467, "grad_norm": 10.8152536402856, "learning_rate": 4.8853381134874523e-05, "loss": 0.6998, "step": 472 }, { "epoch": 0.17432967842992722, "grad_norm": 51.621322981187966, "learning_rate": 4.885029051798739e-05, "loss": 0.476, "step": 473 }, { "epoch": 0.17469824011793975, "grad_norm": 11.323910854084378, "learning_rate": 4.884719990110026e-05, "loss": 0.4584, "step": 474 }, { "epoch": 0.17506680180595227, "grad_norm": 7.290564869789272, "learning_rate": 4.884410928421313e-05, "loss": 0.4454, "step": 475 }, { "epoch": 0.1754353634939648, "grad_norm": 4.75083835632098, "learning_rate": 4.8841018667326e-05, "loss": 0.272, "step": 476 }, { "epoch": 0.17580392518197732, "grad_norm": 8.973475470737398, "learning_rate": 4.8837928050438866e-05, "loss": 0.4858, "step": 477 }, { "epoch": 0.17617248686998987, "grad_norm": 5.793244327670626, "learning_rate": 4.883483743355174e-05, "loss": 0.3049, "step": 478 }, { "epoch": 0.1765410485580024, "grad_norm": 7.617726843886332, "learning_rate": 4.883174681666461e-05, "loss": 0.3108, "step": 479 }, { "epoch": 0.17690961024601493, "grad_norm": 27.359989597251655, "learning_rate": 4.882865619977748e-05, "loss": 0.5052, "step": 480 }, { "epoch": 0.17727817193402745, "grad_norm": 8.43897601745286, "learning_rate": 4.882556558289035e-05, "loss": 0.39, "step": 481 }, { "epoch": 0.17764673362203998, "grad_norm": 9.713441435405011, "learning_rate": 4.8822474966003215e-05, "loss": 0.4056, "step": 482 }, { "epoch": 0.17801529531005253, "grad_norm": 8.52827284290937, "learning_rate": 4.8819384349116086e-05, "loss": 0.4606, "step": 483 }, { "epoch": 0.17838385699806505, "grad_norm": 11.224517444866715, "learning_rate": 4.881629373222896e-05, "loss": 0.4006, "step": 484 }, { "epoch": 0.17875241868607758, "grad_norm": 6.657479759125375, "learning_rate": 4.881320311534183e-05, "loss": 0.4041, "step": 485 }, { "epoch": 0.1791209803740901, "grad_norm": 8.16680669340121, "learning_rate": 4.88101124984547e-05, "loss": 0.3705, "step": 486 }, { "epoch": 0.17948954206210266, "grad_norm": 7.148089024122275, "learning_rate": 4.880702188156756e-05, "loss": 0.3465, "step": 487 }, { "epoch": 0.17985810375011518, "grad_norm": 7.167527287580627, "learning_rate": 4.880393126468043e-05, "loss": 0.4198, "step": 488 }, { "epoch": 0.1802266654381277, "grad_norm": 4.753330468546542, "learning_rate": 4.88008406477933e-05, "loss": 0.3552, "step": 489 }, { "epoch": 0.18059522712614023, "grad_norm": 6.218216916339631, "learning_rate": 4.879775003090617e-05, "loss": 0.2569, "step": 490 }, { "epoch": 0.18096378881415276, "grad_norm": 14.876640165286712, "learning_rate": 4.879465941401904e-05, "loss": 0.4006, "step": 491 }, { "epoch": 0.1813323505021653, "grad_norm": 12.124773475654607, "learning_rate": 4.879156879713191e-05, "loss": 0.4533, "step": 492 }, { "epoch": 0.18170091219017784, "grad_norm": 7.460564249498723, "learning_rate": 4.878847818024478e-05, "loss": 0.389, "step": 493 }, { "epoch": 0.18206947387819036, "grad_norm": 6.017231357665245, "learning_rate": 4.878538756335765e-05, "loss": 0.3284, "step": 494 }, { "epoch": 0.1824380355662029, "grad_norm": 8.506916149550804, "learning_rate": 4.878229694647052e-05, "loss": 0.3334, "step": 495 }, { "epoch": 0.1828065972542154, "grad_norm": 17.090327225304048, "learning_rate": 4.8779206329583385e-05, "loss": 0.3687, "step": 496 }, { "epoch": 0.18317515894222797, "grad_norm": 8.452207956264298, "learning_rate": 4.8776115712696256e-05, "loss": 0.4193, "step": 497 }, { "epoch": 0.1835437206302405, "grad_norm": 9.12696043928928, "learning_rate": 4.877302509580913e-05, "loss": 0.3272, "step": 498 }, { "epoch": 0.18391228231825302, "grad_norm": 10.275117941223579, "learning_rate": 4.8769934478922e-05, "loss": 0.3646, "step": 499 }, { "epoch": 0.18428084400626554, "grad_norm": 12.830111276645022, "learning_rate": 4.876684386203487e-05, "loss": 0.6136, "step": 500 }, { "epoch": 0.18464940569427807, "grad_norm": 7.190735196658061, "learning_rate": 4.8763753245147734e-05, "loss": 0.4327, "step": 501 }, { "epoch": 0.18501796738229062, "grad_norm": 9.414929615148651, "learning_rate": 4.87606626282606e-05, "loss": 0.4038, "step": 502 }, { "epoch": 0.18538652907030315, "grad_norm": 10.367247477101376, "learning_rate": 4.875757201137347e-05, "loss": 0.4478, "step": 503 }, { "epoch": 0.18575509075831567, "grad_norm": 6.818780925900882, "learning_rate": 4.875448139448634e-05, "loss": 0.3336, "step": 504 }, { "epoch": 0.1861236524463282, "grad_norm": 10.556332203808667, "learning_rate": 4.875139077759921e-05, "loss": 0.2581, "step": 505 }, { "epoch": 0.18649221413434072, "grad_norm": 47.454544965583906, "learning_rate": 4.8748300160712076e-05, "loss": 0.372, "step": 506 }, { "epoch": 0.18686077582235328, "grad_norm": 9.233120750798623, "learning_rate": 4.874520954382495e-05, "loss": 0.4871, "step": 507 }, { "epoch": 0.1872293375103658, "grad_norm": 7.104374260726599, "learning_rate": 4.874211892693782e-05, "loss": 0.4155, "step": 508 }, { "epoch": 0.18759789919837833, "grad_norm": 10.390791675119956, "learning_rate": 4.873902831005069e-05, "loss": 0.5912, "step": 509 }, { "epoch": 0.18796646088639085, "grad_norm": 7.854009840906407, "learning_rate": 4.873593769316356e-05, "loss": 0.7202, "step": 510 }, { "epoch": 0.1883350225744034, "grad_norm": 7.746537942697773, "learning_rate": 4.8732847076276425e-05, "loss": 0.3289, "step": 511 }, { "epoch": 0.18870358426241593, "grad_norm": 8.03696722041659, "learning_rate": 4.87297564593893e-05, "loss": 0.4804, "step": 512 }, { "epoch": 0.18907214595042846, "grad_norm": 7.934970963442389, "learning_rate": 4.872666584250217e-05, "loss": 0.633, "step": 513 }, { "epoch": 0.18944070763844098, "grad_norm": 7.64155108060402, "learning_rate": 4.872357522561504e-05, "loss": 0.4364, "step": 514 }, { "epoch": 0.1898092693264535, "grad_norm": 5.672833322320693, "learning_rate": 4.8720484608727903e-05, "loss": 0.3517, "step": 515 }, { "epoch": 0.19017783101446606, "grad_norm": 4.162776097027605, "learning_rate": 4.8717393991840775e-05, "loss": 0.2443, "step": 516 }, { "epoch": 0.19054639270247858, "grad_norm": 5.285774053876387, "learning_rate": 4.871430337495364e-05, "loss": 0.3989, "step": 517 }, { "epoch": 0.1909149543904911, "grad_norm": 8.03655854164949, "learning_rate": 4.871121275806651e-05, "loss": 0.4647, "step": 518 }, { "epoch": 0.19128351607850363, "grad_norm": 8.992577777151833, "learning_rate": 4.870812214117938e-05, "loss": 0.3574, "step": 519 }, { "epoch": 0.19165207776651616, "grad_norm": 11.507797914535903, "learning_rate": 4.870503152429225e-05, "loss": 0.4151, "step": 520 }, { "epoch": 0.1920206394545287, "grad_norm": 11.76787943727311, "learning_rate": 4.870194090740512e-05, "loss": 0.4062, "step": 521 }, { "epoch": 0.19238920114254124, "grad_norm": 8.448156790058649, "learning_rate": 4.869885029051799e-05, "loss": 0.4814, "step": 522 }, { "epoch": 0.19275776283055376, "grad_norm": 9.626766686734458, "learning_rate": 4.869575967363086e-05, "loss": 0.4431, "step": 523 }, { "epoch": 0.1931263245185663, "grad_norm": 20.1990907067833, "learning_rate": 4.869266905674373e-05, "loss": 0.3891, "step": 524 }, { "epoch": 0.19349488620657881, "grad_norm": 11.272129588349442, "learning_rate": 4.8689578439856595e-05, "loss": 0.4664, "step": 525 }, { "epoch": 0.19386344789459137, "grad_norm": 7.511885317316008, "learning_rate": 4.8686487822969466e-05, "loss": 0.3985, "step": 526 }, { "epoch": 0.1942320095826039, "grad_norm": 7.1818831016143605, "learning_rate": 4.868339720608234e-05, "loss": 0.5553, "step": 527 }, { "epoch": 0.19460057127061642, "grad_norm": 7.052625386429411, "learning_rate": 4.868030658919521e-05, "loss": 0.44, "step": 528 }, { "epoch": 0.19496913295862894, "grad_norm": 5.247383919095759, "learning_rate": 4.867721597230808e-05, "loss": 0.3129, "step": 529 }, { "epoch": 0.19533769464664147, "grad_norm": 8.231156532724361, "learning_rate": 4.8674125355420944e-05, "loss": 0.4656, "step": 530 }, { "epoch": 0.19570625633465402, "grad_norm": 7.625677109968533, "learning_rate": 4.8671034738533815e-05, "loss": 0.3579, "step": 531 }, { "epoch": 0.19607481802266655, "grad_norm": 8.182577036335582, "learning_rate": 4.866794412164668e-05, "loss": 0.239, "step": 532 }, { "epoch": 0.19644337971067907, "grad_norm": 9.964991132901654, "learning_rate": 4.866485350475955e-05, "loss": 0.5579, "step": 533 }, { "epoch": 0.1968119413986916, "grad_norm": 8.46419027963359, "learning_rate": 4.866176288787242e-05, "loss": 0.3153, "step": 534 }, { "epoch": 0.19718050308670415, "grad_norm": 7.454483772073387, "learning_rate": 4.8658672270985287e-05, "loss": 0.305, "step": 535 }, { "epoch": 0.19754906477471668, "grad_norm": 16.42744631498276, "learning_rate": 4.865558165409816e-05, "loss": 0.4811, "step": 536 }, { "epoch": 0.1979176264627292, "grad_norm": 4.31385914490099, "learning_rate": 4.865249103721103e-05, "loss": 0.1253, "step": 537 }, { "epoch": 0.19828618815074173, "grad_norm": 10.55980255173555, "learning_rate": 4.86494004203239e-05, "loss": 0.5034, "step": 538 }, { "epoch": 0.19865474983875425, "grad_norm": 9.848171440063918, "learning_rate": 4.864630980343677e-05, "loss": 0.4421, "step": 539 }, { "epoch": 0.1990233115267668, "grad_norm": 4.947631677819232, "learning_rate": 4.8643219186549636e-05, "loss": 0.1954, "step": 540 }, { "epoch": 0.19939187321477933, "grad_norm": 9.39329960416537, "learning_rate": 4.864012856966251e-05, "loss": 0.2979, "step": 541 }, { "epoch": 0.19976043490279186, "grad_norm": 7.432606858589756, "learning_rate": 4.863703795277538e-05, "loss": 0.5761, "step": 542 }, { "epoch": 0.20012899659080438, "grad_norm": 9.719026322438374, "learning_rate": 4.863394733588825e-05, "loss": 0.4893, "step": 543 }, { "epoch": 0.2004975582788169, "grad_norm": 7.566863763513664, "learning_rate": 4.8630856719001114e-05, "loss": 0.2982, "step": 544 }, { "epoch": 0.20086611996682946, "grad_norm": 5.01622181606669, "learning_rate": 4.8627766102113985e-05, "loss": 0.3993, "step": 545 }, { "epoch": 0.20123468165484198, "grad_norm": 7.570507003757761, "learning_rate": 4.8624675485226856e-05, "loss": 0.3985, "step": 546 }, { "epoch": 0.2016032433428545, "grad_norm": 10.373847621665698, "learning_rate": 4.862158486833973e-05, "loss": 0.3906, "step": 547 }, { "epoch": 0.20197180503086704, "grad_norm": 7.53461858987712, "learning_rate": 4.861849425145259e-05, "loss": 0.3317, "step": 548 }, { "epoch": 0.20234036671887956, "grad_norm": 5.313278320742466, "learning_rate": 4.8615403634565456e-05, "loss": 0.3396, "step": 549 }, { "epoch": 0.2027089284068921, "grad_norm": 12.04358324332369, "learning_rate": 4.861231301767833e-05, "loss": 0.576, "step": 550 }, { "epoch": 0.20307749009490464, "grad_norm": 12.157607446502054, "learning_rate": 4.86092224007912e-05, "loss": 0.4303, "step": 551 }, { "epoch": 0.20344605178291716, "grad_norm": 6.945215557095509, "learning_rate": 4.860613178390407e-05, "loss": 0.481, "step": 552 }, { "epoch": 0.2038146134709297, "grad_norm": 6.733903695804908, "learning_rate": 4.860304116701694e-05, "loss": 0.4738, "step": 553 }, { "epoch": 0.20418317515894222, "grad_norm": 6.332034262221677, "learning_rate": 4.8599950550129805e-05, "loss": 0.35, "step": 554 }, { "epoch": 0.20455173684695477, "grad_norm": 4.514996441980188, "learning_rate": 4.8596859933242677e-05, "loss": 0.2853, "step": 555 }, { "epoch": 0.2049202985349673, "grad_norm": 23.915791600752524, "learning_rate": 4.859376931635555e-05, "loss": 0.4546, "step": 556 }, { "epoch": 0.20528886022297982, "grad_norm": 17.341162356233056, "learning_rate": 4.859067869946842e-05, "loss": 0.4204, "step": 557 }, { "epoch": 0.20565742191099234, "grad_norm": 13.847058454127767, "learning_rate": 4.858758808258129e-05, "loss": 0.6588, "step": 558 }, { "epoch": 0.2060259835990049, "grad_norm": 8.946866336116134, "learning_rate": 4.8584497465694155e-05, "loss": 0.3129, "step": 559 }, { "epoch": 0.20639454528701742, "grad_norm": 8.862731190236484, "learning_rate": 4.8581406848807026e-05, "loss": 0.4211, "step": 560 }, { "epoch": 0.20676310697502995, "grad_norm": 8.5456144808591, "learning_rate": 4.85783162319199e-05, "loss": 0.3669, "step": 561 }, { "epoch": 0.20713166866304247, "grad_norm": 12.32576567648918, "learning_rate": 4.857522561503277e-05, "loss": 0.4375, "step": 562 }, { "epoch": 0.207500230351055, "grad_norm": 5.874650660740789, "learning_rate": 4.857213499814563e-05, "loss": 0.3283, "step": 563 }, { "epoch": 0.20786879203906755, "grad_norm": 11.202037380739817, "learning_rate": 4.85690443812585e-05, "loss": 0.3789, "step": 564 }, { "epoch": 0.20823735372708008, "grad_norm": 9.332292044545515, "learning_rate": 4.856595376437137e-05, "loss": 0.3799, "step": 565 }, { "epoch": 0.2086059154150926, "grad_norm": 5.859783514405064, "learning_rate": 4.856286314748424e-05, "loss": 0.3588, "step": 566 }, { "epoch": 0.20897447710310513, "grad_norm": 24.171087172035207, "learning_rate": 4.855977253059711e-05, "loss": 0.6777, "step": 567 }, { "epoch": 0.20934303879111765, "grad_norm": 6.557146486387605, "learning_rate": 4.8556681913709975e-05, "loss": 0.4009, "step": 568 }, { "epoch": 0.2097116004791302, "grad_norm": 7.325895641369879, "learning_rate": 4.8553591296822846e-05, "loss": 0.4009, "step": 569 }, { "epoch": 0.21008016216714273, "grad_norm": 4.595107727780408, "learning_rate": 4.855050067993572e-05, "loss": 0.2783, "step": 570 }, { "epoch": 0.21044872385515526, "grad_norm": 9.554898301027006, "learning_rate": 4.854741006304859e-05, "loss": 0.3527, "step": 571 }, { "epoch": 0.21081728554316778, "grad_norm": 10.066579051194369, "learning_rate": 4.854431944616146e-05, "loss": 0.5546, "step": 572 }, { "epoch": 0.2111858472311803, "grad_norm": 8.049815527656325, "learning_rate": 4.8541228829274324e-05, "loss": 0.3528, "step": 573 }, { "epoch": 0.21155440891919286, "grad_norm": 34.01029184190978, "learning_rate": 4.8538138212387195e-05, "loss": 0.4216, "step": 574 }, { "epoch": 0.21192297060720539, "grad_norm": 36.22261222936234, "learning_rate": 4.8535047595500067e-05, "loss": 0.4211, "step": 575 }, { "epoch": 0.2122915322952179, "grad_norm": 9.276833456360569, "learning_rate": 4.853195697861294e-05, "loss": 0.3441, "step": 576 }, { "epoch": 0.21266009398323044, "grad_norm": 7.4235868708333035, "learning_rate": 4.85288663617258e-05, "loss": 0.3606, "step": 577 }, { "epoch": 0.21302865567124296, "grad_norm": 12.48387693096736, "learning_rate": 4.8525775744838667e-05, "loss": 0.4129, "step": 578 }, { "epoch": 0.21339721735925551, "grad_norm": 6.225649586071697, "learning_rate": 4.852268512795154e-05, "loss": 0.3954, "step": 579 }, { "epoch": 0.21376577904726804, "grad_norm": 7.070190293747045, "learning_rate": 4.851959451106441e-05, "loss": 0.3943, "step": 580 }, { "epoch": 0.21413434073528057, "grad_norm": 11.709577790686255, "learning_rate": 4.851650389417728e-05, "loss": 0.5956, "step": 581 }, { "epoch": 0.2145029024232931, "grad_norm": 17.230236286263423, "learning_rate": 4.851341327729015e-05, "loss": 0.4841, "step": 582 }, { "epoch": 0.21487146411130564, "grad_norm": 13.623990922710913, "learning_rate": 4.8510322660403016e-05, "loss": 0.4296, "step": 583 }, { "epoch": 0.21524002579931817, "grad_norm": 8.46672501231037, "learning_rate": 4.850723204351589e-05, "loss": 0.3363, "step": 584 }, { "epoch": 0.2156085874873307, "grad_norm": 6.0131231480734275, "learning_rate": 4.850414142662876e-05, "loss": 0.3359, "step": 585 }, { "epoch": 0.21597714917534322, "grad_norm": 12.064254395977374, "learning_rate": 4.850105080974163e-05, "loss": 0.3946, "step": 586 }, { "epoch": 0.21634571086335574, "grad_norm": 5.622525327768774, "learning_rate": 4.8497960192854494e-05, "loss": 0.2286, "step": 587 }, { "epoch": 0.2167142725513683, "grad_norm": 19.077637112414333, "learning_rate": 4.8494869575967365e-05, "loss": 0.6984, "step": 588 }, { "epoch": 0.21708283423938082, "grad_norm": 6.895942003329295, "learning_rate": 4.8491778959080236e-05, "loss": 0.3188, "step": 589 }, { "epoch": 0.21745139592739335, "grad_norm": 14.728193812995674, "learning_rate": 4.848868834219311e-05, "loss": 0.2464, "step": 590 }, { "epoch": 0.21781995761540587, "grad_norm": 6.892061460073727, "learning_rate": 4.848559772530598e-05, "loss": 0.3347, "step": 591 }, { "epoch": 0.2181885193034184, "grad_norm": 9.605030696726512, "learning_rate": 4.848250710841884e-05, "loss": 0.3821, "step": 592 }, { "epoch": 0.21855708099143095, "grad_norm": 16.57607970712711, "learning_rate": 4.847941649153171e-05, "loss": 0.3683, "step": 593 }, { "epoch": 0.21892564267944348, "grad_norm": 8.663784232456537, "learning_rate": 4.847632587464458e-05, "loss": 0.2534, "step": 594 }, { "epoch": 0.219294204367456, "grad_norm": 6.490400121078011, "learning_rate": 4.847323525775745e-05, "loss": 0.3698, "step": 595 }, { "epoch": 0.21966276605546853, "grad_norm": 6.661353791713731, "learning_rate": 4.847014464087032e-05, "loss": 0.3682, "step": 596 }, { "epoch": 0.22003132774348105, "grad_norm": 33.72190852039387, "learning_rate": 4.8467054023983185e-05, "loss": 0.3922, "step": 597 }, { "epoch": 0.2203998894314936, "grad_norm": 6.099799493714968, "learning_rate": 4.8463963407096057e-05, "loss": 0.2745, "step": 598 }, { "epoch": 0.22076845111950613, "grad_norm": 17.181398064567535, "learning_rate": 4.846087279020893e-05, "loss": 0.3761, "step": 599 }, { "epoch": 0.22113701280751866, "grad_norm": 10.567635816861428, "learning_rate": 4.84577821733218e-05, "loss": 0.3456, "step": 600 }, { "epoch": 0.22150557449553118, "grad_norm": 15.129237566569676, "learning_rate": 4.845469155643467e-05, "loss": 0.4134, "step": 601 }, { "epoch": 0.2218741361835437, "grad_norm": 9.807790221218921, "learning_rate": 4.8451600939547534e-05, "loss": 0.2673, "step": 602 }, { "epoch": 0.22224269787155626, "grad_norm": 7.4410043362590885, "learning_rate": 4.8448510322660406e-05, "loss": 0.4043, "step": 603 }, { "epoch": 0.22261125955956879, "grad_norm": 7.010454817314314, "learning_rate": 4.844541970577328e-05, "loss": 0.3197, "step": 604 }, { "epoch": 0.2229798212475813, "grad_norm": 8.227134691936058, "learning_rate": 4.844232908888615e-05, "loss": 0.5677, "step": 605 }, { "epoch": 0.22334838293559384, "grad_norm": 17.36651448032427, "learning_rate": 4.843923847199901e-05, "loss": 0.5177, "step": 606 }, { "epoch": 0.2237169446236064, "grad_norm": 6.676574909851268, "learning_rate": 4.8436147855111884e-05, "loss": 0.3684, "step": 607 }, { "epoch": 0.22408550631161891, "grad_norm": 6.7519257235667585, "learning_rate": 4.843305723822475e-05, "loss": 0.3338, "step": 608 }, { "epoch": 0.22445406799963144, "grad_norm": 9.082691508257874, "learning_rate": 4.842996662133762e-05, "loss": 0.3741, "step": 609 }, { "epoch": 0.22482262968764397, "grad_norm": 7.175257597761789, "learning_rate": 4.842687600445049e-05, "loss": 0.447, "step": 610 }, { "epoch": 0.2251911913756565, "grad_norm": 7.563429089687201, "learning_rate": 4.842378538756336e-05, "loss": 0.3969, "step": 611 }, { "epoch": 0.22555975306366904, "grad_norm": 7.707941433678614, "learning_rate": 4.8420694770676226e-05, "loss": 0.2977, "step": 612 }, { "epoch": 0.22592831475168157, "grad_norm": 8.28908182873571, "learning_rate": 4.84176041537891e-05, "loss": 0.3299, "step": 613 }, { "epoch": 0.2262968764396941, "grad_norm": 5.282815012893146, "learning_rate": 4.841451353690197e-05, "loss": 0.3367, "step": 614 }, { "epoch": 0.22666543812770662, "grad_norm": 4.177237389720987, "learning_rate": 4.841142292001484e-05, "loss": 0.2231, "step": 615 }, { "epoch": 0.22703399981571915, "grad_norm": 10.411949086355325, "learning_rate": 4.8408332303127704e-05, "loss": 0.4355, "step": 616 }, { "epoch": 0.2274025615037317, "grad_norm": 6.97617262960898, "learning_rate": 4.8405241686240575e-05, "loss": 0.3773, "step": 617 }, { "epoch": 0.22777112319174422, "grad_norm": 11.125279798096653, "learning_rate": 4.8402151069353446e-05, "loss": 0.4306, "step": 618 }, { "epoch": 0.22813968487975675, "grad_norm": 7.185370353651969, "learning_rate": 4.839906045246632e-05, "loss": 0.2837, "step": 619 }, { "epoch": 0.22850824656776927, "grad_norm": 16.611368427530476, "learning_rate": 4.839596983557919e-05, "loss": 0.269, "step": 620 }, { "epoch": 0.2288768082557818, "grad_norm": 7.61611520898055, "learning_rate": 4.839287921869205e-05, "loss": 0.3295, "step": 621 }, { "epoch": 0.22924536994379435, "grad_norm": 10.287580865702072, "learning_rate": 4.8389788601804924e-05, "loss": 0.4179, "step": 622 }, { "epoch": 0.22961393163180688, "grad_norm": 8.916135635487503, "learning_rate": 4.838669798491779e-05, "loss": 0.4269, "step": 623 }, { "epoch": 0.2299824933198194, "grad_norm": 11.20201422426628, "learning_rate": 4.838360736803066e-05, "loss": 0.3664, "step": 624 }, { "epoch": 0.23035105500783193, "grad_norm": 18.84910575708558, "learning_rate": 4.838051675114353e-05, "loss": 0.5821, "step": 625 }, { "epoch": 0.23071961669584445, "grad_norm": 12.361722971504367, "learning_rate": 4.8377426134256396e-05, "loss": 0.3687, "step": 626 }, { "epoch": 0.231088178383857, "grad_norm": 9.641994780878704, "learning_rate": 4.837433551736927e-05, "loss": 0.4641, "step": 627 }, { "epoch": 0.23145674007186953, "grad_norm": 13.731732805434417, "learning_rate": 4.837124490048214e-05, "loss": 0.469, "step": 628 }, { "epoch": 0.23182530175988206, "grad_norm": 12.55543311705478, "learning_rate": 4.836815428359501e-05, "loss": 0.235, "step": 629 }, { "epoch": 0.23219386344789458, "grad_norm": 18.621550579801184, "learning_rate": 4.836506366670788e-05, "loss": 0.3658, "step": 630 }, { "epoch": 0.23256242513590714, "grad_norm": 4.486437598999, "learning_rate": 4.8361973049820745e-05, "loss": 0.2133, "step": 631 }, { "epoch": 0.23293098682391966, "grad_norm": 13.674816647822468, "learning_rate": 4.8358882432933616e-05, "loss": 0.4379, "step": 632 }, { "epoch": 0.2332995485119322, "grad_norm": 9.514892950853989, "learning_rate": 4.835579181604649e-05, "loss": 0.3978, "step": 633 }, { "epoch": 0.2336681101999447, "grad_norm": 8.065056920285276, "learning_rate": 4.835270119915936e-05, "loss": 0.2598, "step": 634 }, { "epoch": 0.23403667188795724, "grad_norm": 6.041272310562659, "learning_rate": 4.834961058227222e-05, "loss": 0.3029, "step": 635 }, { "epoch": 0.2344052335759698, "grad_norm": 7.9372428792324214, "learning_rate": 4.8346519965385094e-05, "loss": 0.3359, "step": 636 }, { "epoch": 0.23477379526398232, "grad_norm": 4.8146571489477905, "learning_rate": 4.8343429348497965e-05, "loss": 0.3886, "step": 637 }, { "epoch": 0.23514235695199484, "grad_norm": 5.378994721437905, "learning_rate": 4.834033873161083e-05, "loss": 0.3654, "step": 638 }, { "epoch": 0.23551091864000737, "grad_norm": 9.990254421717424, "learning_rate": 4.83372481147237e-05, "loss": 0.4606, "step": 639 }, { "epoch": 0.2358794803280199, "grad_norm": 6.818596308275947, "learning_rate": 4.8334157497836565e-05, "loss": 0.2126, "step": 640 }, { "epoch": 0.23624804201603244, "grad_norm": 8.985704300437575, "learning_rate": 4.8331066880949436e-05, "loss": 0.4072, "step": 641 }, { "epoch": 0.23661660370404497, "grad_norm": 7.146662390598517, "learning_rate": 4.832797626406231e-05, "loss": 0.5017, "step": 642 }, { "epoch": 0.2369851653920575, "grad_norm": 4.702420102646864, "learning_rate": 4.832488564717518e-05, "loss": 0.2778, "step": 643 }, { "epoch": 0.23735372708007002, "grad_norm": 26.51997169918892, "learning_rate": 4.832179503028805e-05, "loss": 0.3309, "step": 644 }, { "epoch": 0.23772228876808255, "grad_norm": 8.27384600472845, "learning_rate": 4.8318704413400914e-05, "loss": 0.3846, "step": 645 }, { "epoch": 0.2380908504560951, "grad_norm": 6.2708353272833035, "learning_rate": 4.8315613796513786e-05, "loss": 0.3291, "step": 646 }, { "epoch": 0.23845941214410762, "grad_norm": 28.32679774883008, "learning_rate": 4.831252317962666e-05, "loss": 0.28, "step": 647 }, { "epoch": 0.23882797383212015, "grad_norm": 13.964973229294852, "learning_rate": 4.830943256273953e-05, "loss": 0.5045, "step": 648 }, { "epoch": 0.23919653552013267, "grad_norm": 14.423287931023596, "learning_rate": 4.830634194585239e-05, "loss": 0.4524, "step": 649 }, { "epoch": 0.2395650972081452, "grad_norm": 7.669176299068966, "learning_rate": 4.8303251328965264e-05, "loss": 0.2902, "step": 650 }, { "epoch": 0.23993365889615775, "grad_norm": 7.668607742494849, "learning_rate": 4.8300160712078135e-05, "loss": 0.3086, "step": 651 }, { "epoch": 0.24030222058417028, "grad_norm": 10.133714481346567, "learning_rate": 4.8297070095191006e-05, "loss": 0.3081, "step": 652 }, { "epoch": 0.2406707822721828, "grad_norm": 4.891504178226811, "learning_rate": 4.829397947830387e-05, "loss": 0.2946, "step": 653 }, { "epoch": 0.24103934396019533, "grad_norm": 6.272793519445515, "learning_rate": 4.829088886141674e-05, "loss": 0.4988, "step": 654 }, { "epoch": 0.24140790564820788, "grad_norm": 7.59888024363006, "learning_rate": 4.8287798244529606e-05, "loss": 0.3384, "step": 655 }, { "epoch": 0.2417764673362204, "grad_norm": 4.640800498443929, "learning_rate": 4.828470762764248e-05, "loss": 0.2832, "step": 656 }, { "epoch": 0.24214502902423293, "grad_norm": 8.653011225664992, "learning_rate": 4.828161701075535e-05, "loss": 0.3365, "step": 657 }, { "epoch": 0.24251359071224546, "grad_norm": 8.294982349452853, "learning_rate": 4.827852639386822e-05, "loss": 0.4123, "step": 658 }, { "epoch": 0.24288215240025798, "grad_norm": 9.142216157583054, "learning_rate": 4.8275435776981084e-05, "loss": 0.4634, "step": 659 }, { "epoch": 0.24325071408827054, "grad_norm": 6.6711619797936645, "learning_rate": 4.8272345160093955e-05, "loss": 0.2059, "step": 660 }, { "epoch": 0.24361927577628306, "grad_norm": 7.473664905135753, "learning_rate": 4.8269254543206826e-05, "loss": 0.6036, "step": 661 }, { "epoch": 0.2439878374642956, "grad_norm": 9.0163328530881, "learning_rate": 4.82661639263197e-05, "loss": 0.5239, "step": 662 }, { "epoch": 0.2443563991523081, "grad_norm": 5.0566760813743405, "learning_rate": 4.826307330943257e-05, "loss": 0.2772, "step": 663 }, { "epoch": 0.24472496084032064, "grad_norm": 6.085157077489031, "learning_rate": 4.825998269254543e-05, "loss": 0.4691, "step": 664 }, { "epoch": 0.2450935225283332, "grad_norm": 4.626998469596298, "learning_rate": 4.8256892075658304e-05, "loss": 0.2632, "step": 665 }, { "epoch": 0.24546208421634572, "grad_norm": 7.304219831703692, "learning_rate": 4.8253801458771176e-05, "loss": 0.371, "step": 666 }, { "epoch": 0.24583064590435824, "grad_norm": 10.322071788920333, "learning_rate": 4.825071084188405e-05, "loss": 0.5014, "step": 667 }, { "epoch": 0.24619920759237077, "grad_norm": 16.333855341659717, "learning_rate": 4.824762022499691e-05, "loss": 0.59, "step": 668 }, { "epoch": 0.2465677692803833, "grad_norm": 3.3206437966611535, "learning_rate": 4.8244529608109776e-05, "loss": 0.1615, "step": 669 }, { "epoch": 0.24693633096839585, "grad_norm": 5.54560945069949, "learning_rate": 4.824143899122265e-05, "loss": 0.3552, "step": 670 }, { "epoch": 0.24730489265640837, "grad_norm": 12.722781580400495, "learning_rate": 4.823834837433552e-05, "loss": 0.4381, "step": 671 }, { "epoch": 0.2476734543444209, "grad_norm": 6.954178565281615, "learning_rate": 4.823525775744839e-05, "loss": 0.3868, "step": 672 }, { "epoch": 0.24804201603243342, "grad_norm": 8.60472355134199, "learning_rate": 4.823216714056126e-05, "loss": 0.4322, "step": 673 }, { "epoch": 0.24841057772044595, "grad_norm": 7.77404230726628, "learning_rate": 4.8229076523674125e-05, "loss": 0.2999, "step": 674 }, { "epoch": 0.2487791394084585, "grad_norm": 8.171395107204576, "learning_rate": 4.8225985906786996e-05, "loss": 0.409, "step": 675 }, { "epoch": 0.24914770109647102, "grad_norm": 7.168655237903541, "learning_rate": 4.822289528989987e-05, "loss": 0.3734, "step": 676 }, { "epoch": 0.24951626278448355, "grad_norm": 5.37053157569319, "learning_rate": 4.821980467301274e-05, "loss": 0.3772, "step": 677 }, { "epoch": 0.24988482447249608, "grad_norm": 9.757763632658861, "learning_rate": 4.82167140561256e-05, "loss": 0.3303, "step": 678 }, { "epoch": 0.25025338616050863, "grad_norm": 13.656854472105413, "learning_rate": 4.8213623439238474e-05, "loss": 0.6011, "step": 679 }, { "epoch": 0.25062194784852115, "grad_norm": 15.583578886385602, "learning_rate": 4.8210532822351345e-05, "loss": 0.436, "step": 680 }, { "epoch": 0.2509905095365337, "grad_norm": 5.480557322041315, "learning_rate": 4.8207442205464216e-05, "loss": 0.2623, "step": 681 }, { "epoch": 0.2513590712245462, "grad_norm": 5.6542088318922605, "learning_rate": 4.820435158857709e-05, "loss": 0.2903, "step": 682 }, { "epoch": 0.25172763291255873, "grad_norm": 13.765430462054722, "learning_rate": 4.820126097168995e-05, "loss": 0.4001, "step": 683 }, { "epoch": 0.25209619460057126, "grad_norm": 6.075928592746968, "learning_rate": 4.8198170354802816e-05, "loss": 0.3377, "step": 684 }, { "epoch": 0.2524647562885838, "grad_norm": 5.728530275083552, "learning_rate": 4.819507973791569e-05, "loss": 0.3409, "step": 685 }, { "epoch": 0.2528333179765963, "grad_norm": 5.566434004865016, "learning_rate": 4.819198912102856e-05, "loss": 0.3139, "step": 686 }, { "epoch": 0.2532018796646089, "grad_norm": 10.404752056461763, "learning_rate": 4.818889850414143e-05, "loss": 0.3568, "step": 687 }, { "epoch": 0.2535704413526214, "grad_norm": 10.517232650140791, "learning_rate": 4.8185807887254294e-05, "loss": 0.3606, "step": 688 }, { "epoch": 0.25393900304063394, "grad_norm": 7.774434118744542, "learning_rate": 4.8182717270367166e-05, "loss": 0.4071, "step": 689 }, { "epoch": 0.25430756472864646, "grad_norm": 8.786423648991141, "learning_rate": 4.817962665348004e-05, "loss": 0.4376, "step": 690 }, { "epoch": 0.254676126416659, "grad_norm": 7.925386090906954, "learning_rate": 4.817653603659291e-05, "loss": 0.4369, "step": 691 }, { "epoch": 0.2550446881046715, "grad_norm": 13.047260941148002, "learning_rate": 4.817344541970578e-05, "loss": 0.3629, "step": 692 }, { "epoch": 0.25541324979268404, "grad_norm": 32.04309232307332, "learning_rate": 4.8170354802818644e-05, "loss": 0.7418, "step": 693 }, { "epoch": 0.25578181148069656, "grad_norm": 10.14743307192606, "learning_rate": 4.8167264185931515e-05, "loss": 0.4737, "step": 694 }, { "epoch": 0.2561503731687091, "grad_norm": 31.32532133651525, "learning_rate": 4.8164173569044386e-05, "loss": 0.4826, "step": 695 }, { "epoch": 0.25651893485672167, "grad_norm": 6.912505617880996, "learning_rate": 4.816108295215726e-05, "loss": 0.5261, "step": 696 }, { "epoch": 0.2568874965447342, "grad_norm": 25.954491328423053, "learning_rate": 4.815799233527012e-05, "loss": 0.4221, "step": 697 }, { "epoch": 0.2572560582327467, "grad_norm": 10.816690151226332, "learning_rate": 4.815490171838299e-05, "loss": 0.4487, "step": 698 }, { "epoch": 0.25762461992075925, "grad_norm": 10.998290189159446, "learning_rate": 4.815181110149586e-05, "loss": 0.4096, "step": 699 }, { "epoch": 0.25799318160877177, "grad_norm": 6.646034102162686, "learning_rate": 4.814872048460873e-05, "loss": 0.484, "step": 700 }, { "epoch": 0.2583617432967843, "grad_norm": 9.613205061156844, "learning_rate": 4.81456298677216e-05, "loss": 0.5043, "step": 701 }, { "epoch": 0.2587303049847968, "grad_norm": 5.742752815497315, "learning_rate": 4.814253925083447e-05, "loss": 0.2794, "step": 702 }, { "epoch": 0.25909886667280935, "grad_norm": 7.780600647086532, "learning_rate": 4.8139448633947335e-05, "loss": 0.3167, "step": 703 }, { "epoch": 0.2594674283608219, "grad_norm": 15.897817478696172, "learning_rate": 4.8136358017060206e-05, "loss": 0.4902, "step": 704 }, { "epoch": 0.2598359900488344, "grad_norm": 12.18431949685999, "learning_rate": 4.813326740017308e-05, "loss": 0.303, "step": 705 }, { "epoch": 0.260204551736847, "grad_norm": 8.885732259806659, "learning_rate": 4.813017678328595e-05, "loss": 0.4072, "step": 706 }, { "epoch": 0.2605731134248595, "grad_norm": 4.994043235142509, "learning_rate": 4.812708616639881e-05, "loss": 0.2118, "step": 707 }, { "epoch": 0.26094167511287203, "grad_norm": 9.286690604057416, "learning_rate": 4.8123995549511684e-05, "loss": 0.2495, "step": 708 }, { "epoch": 0.26131023680088455, "grad_norm": 10.018997457759621, "learning_rate": 4.8120904932624556e-05, "loss": 0.3991, "step": 709 }, { "epoch": 0.2616787984888971, "grad_norm": 5.180821727543132, "learning_rate": 4.811781431573743e-05, "loss": 0.3534, "step": 710 }, { "epoch": 0.2620473601769096, "grad_norm": 7.064737961236602, "learning_rate": 4.81147236988503e-05, "loss": 0.3126, "step": 711 }, { "epoch": 0.26241592186492213, "grad_norm": 6.181192901933895, "learning_rate": 4.811163308196316e-05, "loss": 0.2127, "step": 712 }, { "epoch": 0.26278448355293466, "grad_norm": 4.922280721744962, "learning_rate": 4.8108542465076034e-05, "loss": 0.2868, "step": 713 }, { "epoch": 0.2631530452409472, "grad_norm": 18.129120457013954, "learning_rate": 4.81054518481889e-05, "loss": 0.3397, "step": 714 }, { "epoch": 0.26352160692895976, "grad_norm": 8.00493755556873, "learning_rate": 4.810236123130177e-05, "loss": 0.3939, "step": 715 }, { "epoch": 0.2638901686169723, "grad_norm": 5.494183846312841, "learning_rate": 4.809927061441464e-05, "loss": 0.2473, "step": 716 }, { "epoch": 0.2642587303049848, "grad_norm": 12.0409476693642, "learning_rate": 4.8096179997527505e-05, "loss": 0.3523, "step": 717 }, { "epoch": 0.26462729199299734, "grad_norm": 8.906962262500356, "learning_rate": 4.8093089380640376e-05, "loss": 0.4281, "step": 718 }, { "epoch": 0.26499585368100986, "grad_norm": 8.061586786641616, "learning_rate": 4.808999876375325e-05, "loss": 0.4559, "step": 719 }, { "epoch": 0.2653644153690224, "grad_norm": 7.24900784774105, "learning_rate": 4.808690814686612e-05, "loss": 0.4653, "step": 720 }, { "epoch": 0.2657329770570349, "grad_norm": 10.562385874475904, "learning_rate": 4.808381752997899e-05, "loss": 0.6441, "step": 721 }, { "epoch": 0.26610153874504744, "grad_norm": 5.6163860911274055, "learning_rate": 4.8080726913091854e-05, "loss": 0.3398, "step": 722 }, { "epoch": 0.26647010043305996, "grad_norm": 12.307926161084927, "learning_rate": 4.8077636296204725e-05, "loss": 0.2349, "step": 723 }, { "epoch": 0.2668386621210725, "grad_norm": 9.717957099388064, "learning_rate": 4.8074545679317596e-05, "loss": 0.4525, "step": 724 }, { "epoch": 0.26720722380908507, "grad_norm": 6.7814184774398125, "learning_rate": 4.807145506243047e-05, "loss": 0.3798, "step": 725 }, { "epoch": 0.2675757854970976, "grad_norm": 13.861766105742532, "learning_rate": 4.806836444554333e-05, "loss": 0.309, "step": 726 }, { "epoch": 0.2679443471851101, "grad_norm": 6.358108581863755, "learning_rate": 4.80652738286562e-05, "loss": 0.4234, "step": 727 }, { "epoch": 0.26831290887312265, "grad_norm": 7.787548534064501, "learning_rate": 4.8062183211769074e-05, "loss": 0.4141, "step": 728 }, { "epoch": 0.26868147056113517, "grad_norm": 8.928429756130484, "learning_rate": 4.805909259488194e-05, "loss": 0.2607, "step": 729 }, { "epoch": 0.2690500322491477, "grad_norm": 7.740905347605404, "learning_rate": 4.805600197799481e-05, "loss": 0.3466, "step": 730 }, { "epoch": 0.2694185939371602, "grad_norm": 5.9071635190727925, "learning_rate": 4.8052911361107674e-05, "loss": 0.4007, "step": 731 }, { "epoch": 0.26978715562517275, "grad_norm": 6.203227698102958, "learning_rate": 4.8049820744220546e-05, "loss": 0.3447, "step": 732 }, { "epoch": 0.2701557173131853, "grad_norm": 6.994352514854451, "learning_rate": 4.804673012733342e-05, "loss": 0.3454, "step": 733 }, { "epoch": 0.2705242790011978, "grad_norm": 9.694562540242838, "learning_rate": 4.804363951044629e-05, "loss": 0.4351, "step": 734 }, { "epoch": 0.2708928406892104, "grad_norm": 15.04254018232023, "learning_rate": 4.804054889355916e-05, "loss": 0.3738, "step": 735 }, { "epoch": 0.2712614023772229, "grad_norm": 12.192730998795323, "learning_rate": 4.8037458276672024e-05, "loss": 0.3314, "step": 736 }, { "epoch": 0.27162996406523543, "grad_norm": 5.544470553356832, "learning_rate": 4.8034367659784895e-05, "loss": 0.2942, "step": 737 }, { "epoch": 0.27199852575324796, "grad_norm": 13.630431606206914, "learning_rate": 4.8031277042897766e-05, "loss": 0.3975, "step": 738 }, { "epoch": 0.2723670874412605, "grad_norm": 12.734109587916633, "learning_rate": 4.802818642601064e-05, "loss": 0.2703, "step": 739 }, { "epoch": 0.272735649129273, "grad_norm": 47.995956250566046, "learning_rate": 4.80250958091235e-05, "loss": 0.41, "step": 740 }, { "epoch": 0.27310421081728553, "grad_norm": 5.059615832434926, "learning_rate": 4.802200519223637e-05, "loss": 0.2418, "step": 741 }, { "epoch": 0.27347277250529806, "grad_norm": 12.922467772830535, "learning_rate": 4.8018914575349244e-05, "loss": 0.4573, "step": 742 }, { "epoch": 0.2738413341933106, "grad_norm": 6.8001316730990125, "learning_rate": 4.8015823958462115e-05, "loss": 0.3318, "step": 743 }, { "epoch": 0.27420989588132316, "grad_norm": 12.80211803078767, "learning_rate": 4.801273334157498e-05, "loss": 0.4339, "step": 744 }, { "epoch": 0.2745784575693357, "grad_norm": 9.177258654740838, "learning_rate": 4.800964272468785e-05, "loss": 0.3901, "step": 745 }, { "epoch": 0.2749470192573482, "grad_norm": 9.329724658693378, "learning_rate": 4.8006552107800715e-05, "loss": 0.3288, "step": 746 }, { "epoch": 0.27531558094536074, "grad_norm": 9.154962963485596, "learning_rate": 4.8003461490913586e-05, "loss": 0.5965, "step": 747 }, { "epoch": 0.27568414263337326, "grad_norm": 15.140329710420257, "learning_rate": 4.800037087402646e-05, "loss": 0.2982, "step": 748 }, { "epoch": 0.2760527043213858, "grad_norm": 8.667945865224409, "learning_rate": 4.799728025713933e-05, "loss": 0.4434, "step": 749 }, { "epoch": 0.2764212660093983, "grad_norm": 15.018093132548174, "learning_rate": 4.799418964025219e-05, "loss": 0.4717, "step": 750 }, { "epoch": 0.27678982769741084, "grad_norm": 5.476983548030831, "learning_rate": 4.7991099023365064e-05, "loss": 0.4307, "step": 751 }, { "epoch": 0.27715838938542336, "grad_norm": 10.196336903755329, "learning_rate": 4.7988008406477935e-05, "loss": 0.4506, "step": 752 }, { "epoch": 0.2775269510734359, "grad_norm": 6.811001954164226, "learning_rate": 4.798491778959081e-05, "loss": 0.5359, "step": 753 }, { "epoch": 0.27789551276144847, "grad_norm": 10.701422652028269, "learning_rate": 4.798182717270368e-05, "loss": 0.4317, "step": 754 }, { "epoch": 0.278264074449461, "grad_norm": 8.338266921337121, "learning_rate": 4.797873655581654e-05, "loss": 0.2647, "step": 755 }, { "epoch": 0.2786326361374735, "grad_norm": 4.847997759425083, "learning_rate": 4.7975645938929413e-05, "loss": 0.3244, "step": 756 }, { "epoch": 0.27900119782548605, "grad_norm": 31.789962969551706, "learning_rate": 4.7972555322042285e-05, "loss": 0.2762, "step": 757 }, { "epoch": 0.2793697595134986, "grad_norm": 29.607608374078655, "learning_rate": 4.7969464705155156e-05, "loss": 0.4582, "step": 758 }, { "epoch": 0.2797383212015111, "grad_norm": 12.499373763609995, "learning_rate": 4.796637408826802e-05, "loss": 0.3509, "step": 759 }, { "epoch": 0.2801068828895236, "grad_norm": 9.198208543258488, "learning_rate": 4.7963283471380885e-05, "loss": 0.4414, "step": 760 }, { "epoch": 0.28047544457753615, "grad_norm": 12.005380695730313, "learning_rate": 4.7960192854493756e-05, "loss": 0.2749, "step": 761 }, { "epoch": 0.2808440062655487, "grad_norm": 5.875566495813703, "learning_rate": 4.795710223760663e-05, "loss": 0.262, "step": 762 }, { "epoch": 0.28121256795356125, "grad_norm": 7.413852057570542, "learning_rate": 4.79540116207195e-05, "loss": 0.4101, "step": 763 }, { "epoch": 0.2815811296415738, "grad_norm": 7.080756433024536, "learning_rate": 4.795092100383237e-05, "loss": 0.3895, "step": 764 }, { "epoch": 0.2819496913295863, "grad_norm": 11.326150824745485, "learning_rate": 4.7947830386945234e-05, "loss": 0.3453, "step": 765 }, { "epoch": 0.28231825301759883, "grad_norm": 13.998514369068442, "learning_rate": 4.7944739770058105e-05, "loss": 0.3338, "step": 766 }, { "epoch": 0.28268681470561136, "grad_norm": 10.78319693414738, "learning_rate": 4.7941649153170976e-05, "loss": 0.347, "step": 767 }, { "epoch": 0.2830553763936239, "grad_norm": 7.999695772108403, "learning_rate": 4.793855853628385e-05, "loss": 0.3819, "step": 768 }, { "epoch": 0.2834239380816364, "grad_norm": 6.624615352188618, "learning_rate": 4.793546791939671e-05, "loss": 0.2897, "step": 769 }, { "epoch": 0.28379249976964893, "grad_norm": 10.574796662957194, "learning_rate": 4.793237730250958e-05, "loss": 0.3498, "step": 770 }, { "epoch": 0.28416106145766146, "grad_norm": 6.882894526327341, "learning_rate": 4.7929286685622454e-05, "loss": 0.3922, "step": 771 }, { "epoch": 0.284529623145674, "grad_norm": 5.103182992412944, "learning_rate": 4.7926196068735325e-05, "loss": 0.2355, "step": 772 }, { "epoch": 0.28489818483368656, "grad_norm": 10.938477565130858, "learning_rate": 4.79231054518482e-05, "loss": 0.4376, "step": 773 }, { "epoch": 0.2852667465216991, "grad_norm": 5.761023196893628, "learning_rate": 4.792001483496106e-05, "loss": 0.2863, "step": 774 }, { "epoch": 0.2856353082097116, "grad_norm": 4.146874889612376, "learning_rate": 4.7916924218073925e-05, "loss": 0.1848, "step": 775 }, { "epoch": 0.28600386989772414, "grad_norm": 11.21327079921829, "learning_rate": 4.79138336011868e-05, "loss": 0.4169, "step": 776 }, { "epoch": 0.28637243158573666, "grad_norm": 11.290790106239001, "learning_rate": 4.791074298429967e-05, "loss": 0.3823, "step": 777 }, { "epoch": 0.2867409932737492, "grad_norm": 4.214413823358086, "learning_rate": 4.790765236741254e-05, "loss": 0.2847, "step": 778 }, { "epoch": 0.2871095549617617, "grad_norm": 34.88794492617579, "learning_rate": 4.7904561750525403e-05, "loss": 0.2684, "step": 779 }, { "epoch": 0.28747811664977424, "grad_norm": 5.498363077885534, "learning_rate": 4.7901471133638275e-05, "loss": 0.2567, "step": 780 }, { "epoch": 0.28784667833778677, "grad_norm": 9.57259299997467, "learning_rate": 4.7898380516751146e-05, "loss": 0.3706, "step": 781 }, { "epoch": 0.2882152400257993, "grad_norm": 18.02792057920592, "learning_rate": 4.789528989986402e-05, "loss": 0.6545, "step": 782 }, { "epoch": 0.28858380171381187, "grad_norm": 5.453963338249598, "learning_rate": 4.789219928297689e-05, "loss": 0.233, "step": 783 }, { "epoch": 0.2889523634018244, "grad_norm": 9.226490559891005, "learning_rate": 4.788910866608975e-05, "loss": 0.2439, "step": 784 }, { "epoch": 0.2893209250898369, "grad_norm": 5.491853923624755, "learning_rate": 4.7886018049202624e-05, "loss": 0.2776, "step": 785 }, { "epoch": 0.28968948677784945, "grad_norm": 8.51622256360939, "learning_rate": 4.7882927432315495e-05, "loss": 0.3775, "step": 786 }, { "epoch": 0.290058048465862, "grad_norm": 11.492557937180614, "learning_rate": 4.7879836815428366e-05, "loss": 0.3852, "step": 787 }, { "epoch": 0.2904266101538745, "grad_norm": 7.689060952105615, "learning_rate": 4.787674619854123e-05, "loss": 0.3618, "step": 788 }, { "epoch": 0.290795171841887, "grad_norm": 9.429376402671691, "learning_rate": 4.7873655581654095e-05, "loss": 0.3542, "step": 789 }, { "epoch": 0.29116373352989955, "grad_norm": 10.602553409153161, "learning_rate": 4.7870564964766966e-05, "loss": 0.4156, "step": 790 }, { "epoch": 0.2915322952179121, "grad_norm": 13.888325109271467, "learning_rate": 4.786747434787984e-05, "loss": 0.4408, "step": 791 }, { "epoch": 0.29190085690592465, "grad_norm": 41.36265688716847, "learning_rate": 4.786438373099271e-05, "loss": 0.362, "step": 792 }, { "epoch": 0.2922694185939372, "grad_norm": 4.237207345224657, "learning_rate": 4.786129311410558e-05, "loss": 0.3451, "step": 793 }, { "epoch": 0.2926379802819497, "grad_norm": 10.070012102167155, "learning_rate": 4.7858202497218444e-05, "loss": 0.5379, "step": 794 }, { "epoch": 0.29300654196996223, "grad_norm": 7.1871568680832025, "learning_rate": 4.7855111880331315e-05, "loss": 0.3076, "step": 795 }, { "epoch": 0.29337510365797476, "grad_norm": 7.3037657130625835, "learning_rate": 4.785202126344419e-05, "loss": 0.3054, "step": 796 }, { "epoch": 0.2937436653459873, "grad_norm": 14.670256666604798, "learning_rate": 4.784893064655706e-05, "loss": 0.3684, "step": 797 }, { "epoch": 0.2941122270339998, "grad_norm": 4.924033621071289, "learning_rate": 4.784584002966992e-05, "loss": 0.2294, "step": 798 }, { "epoch": 0.29448078872201233, "grad_norm": 8.708022652031906, "learning_rate": 4.7842749412782793e-05, "loss": 0.2789, "step": 799 }, { "epoch": 0.29484935041002486, "grad_norm": 6.952265517016147, "learning_rate": 4.7839658795895665e-05, "loss": 0.4133, "step": 800 }, { "epoch": 0.29484935041002486, "eval_bleu": 0.13228481728886513, "eval_bleu_1gram": 0.5003325432920076, "eval_bleu_2gram": 0.2961057151061455, "eval_bleu_3gram": 0.17622122700575452, "eval_bleu_4gram": 0.11273783418631768, "eval_rag_val_loss": 0.43846681295897544, "eval_rouge1": 0.4858541036986238, "eval_rouge2": 0.2832984606140104, "eval_rougeL": 0.48560076055054385, "step": 800 }, { "epoch": 0.2952179120980374, "grad_norm": 8.07530721074137, "learning_rate": 4.7836568179008536e-05, "loss": 0.2587, "step": 801 }, { "epoch": 0.29558647378604996, "grad_norm": 6.7524033965424985, "learning_rate": 4.783347756212141e-05, "loss": 0.2593, "step": 802 }, { "epoch": 0.2959550354740625, "grad_norm": 8.377867705888132, "learning_rate": 4.783038694523427e-05, "loss": 0.2662, "step": 803 }, { "epoch": 0.296323597162075, "grad_norm": 7.236853420120958, "learning_rate": 4.782729632834714e-05, "loss": 0.3712, "step": 804 }, { "epoch": 0.29669215885008754, "grad_norm": 7.174488796405364, "learning_rate": 4.782420571146001e-05, "loss": 0.3914, "step": 805 }, { "epoch": 0.29706072053810006, "grad_norm": 6.794276510294517, "learning_rate": 4.782111509457288e-05, "loss": 0.4029, "step": 806 }, { "epoch": 0.2974292822261126, "grad_norm": 32.13947418333489, "learning_rate": 4.781802447768575e-05, "loss": 0.6825, "step": 807 }, { "epoch": 0.2977978439141251, "grad_norm": 8.170442240405876, "learning_rate": 4.7814933860798614e-05, "loss": 0.2762, "step": 808 }, { "epoch": 0.29816640560213764, "grad_norm": 8.125241789521318, "learning_rate": 4.7811843243911485e-05, "loss": 0.3429, "step": 809 }, { "epoch": 0.29853496729015017, "grad_norm": 6.631858586348639, "learning_rate": 4.7808752627024356e-05, "loss": 0.292, "step": 810 }, { "epoch": 0.29890352897816275, "grad_norm": 16.883656633801962, "learning_rate": 4.780566201013723e-05, "loss": 0.4649, "step": 811 }, { "epoch": 0.29927209066617527, "grad_norm": 7.826902739535009, "learning_rate": 4.780257139325009e-05, "loss": 0.4685, "step": 812 }, { "epoch": 0.2996406523541878, "grad_norm": 8.962606951342774, "learning_rate": 4.779948077636296e-05, "loss": 0.3044, "step": 813 }, { "epoch": 0.3000092140422003, "grad_norm": 6.3650410971921705, "learning_rate": 4.7796390159475834e-05, "loss": 0.3111, "step": 814 }, { "epoch": 0.30037777573021285, "grad_norm": 7.072648310638424, "learning_rate": 4.7793299542588705e-05, "loss": 0.3389, "step": 815 }, { "epoch": 0.3007463374182254, "grad_norm": 5.956485630750396, "learning_rate": 4.7790208925701577e-05, "loss": 0.3325, "step": 816 }, { "epoch": 0.3011148991062379, "grad_norm": 7.338116501963342, "learning_rate": 4.778711830881444e-05, "loss": 0.4023, "step": 817 }, { "epoch": 0.3014834607942504, "grad_norm": 9.25353276604907, "learning_rate": 4.778402769192731e-05, "loss": 0.5001, "step": 818 }, { "epoch": 0.30185202248226295, "grad_norm": 18.5995863017354, "learning_rate": 4.778093707504018e-05, "loss": 0.3538, "step": 819 }, { "epoch": 0.3022205841702755, "grad_norm": 21.7093392381309, "learning_rate": 4.777784645815305e-05, "loss": 0.6634, "step": 820 }, { "epoch": 0.30258914585828806, "grad_norm": 5.902845905384524, "learning_rate": 4.777475584126592e-05, "loss": 0.2925, "step": 821 }, { "epoch": 0.3029577075463006, "grad_norm": 8.339803040737879, "learning_rate": 4.7771665224378783e-05, "loss": 0.4123, "step": 822 }, { "epoch": 0.3033262692343131, "grad_norm": 5.705762446654807, "learning_rate": 4.7768574607491655e-05, "loss": 0.3511, "step": 823 }, { "epoch": 0.30369483092232563, "grad_norm": 8.622776352990456, "learning_rate": 4.7765483990604526e-05, "loss": 0.5102, "step": 824 }, { "epoch": 0.30406339261033816, "grad_norm": 4.974493773502215, "learning_rate": 4.77623933737174e-05, "loss": 0.4285, "step": 825 }, { "epoch": 0.3044319542983507, "grad_norm": 6.623163220557272, "learning_rate": 4.775930275683027e-05, "loss": 0.4044, "step": 826 }, { "epoch": 0.3048005159863632, "grad_norm": 13.232335245334509, "learning_rate": 4.775621213994313e-05, "loss": 0.332, "step": 827 }, { "epoch": 0.30516907767437573, "grad_norm": 4.920380477459924, "learning_rate": 4.7753121523056004e-05, "loss": 0.2598, "step": 828 }, { "epoch": 0.30553763936238826, "grad_norm": 11.715069083361382, "learning_rate": 4.7750030906168875e-05, "loss": 0.4364, "step": 829 }, { "epoch": 0.3059062010504008, "grad_norm": 9.297686167999718, "learning_rate": 4.7746940289281746e-05, "loss": 0.6602, "step": 830 }, { "epoch": 0.30627476273841336, "grad_norm": 9.315923074472662, "learning_rate": 4.774384967239461e-05, "loss": 0.5432, "step": 831 }, { "epoch": 0.3066433244264259, "grad_norm": 7.834369049861247, "learning_rate": 4.774075905550748e-05, "loss": 0.2098, "step": 832 }, { "epoch": 0.3070118861144384, "grad_norm": 17.039306079185156, "learning_rate": 4.773766843862035e-05, "loss": 0.5536, "step": 833 }, { "epoch": 0.30738044780245094, "grad_norm": 15.10169789450428, "learning_rate": 4.7734577821733224e-05, "loss": 0.2348, "step": 834 }, { "epoch": 0.30774900949046347, "grad_norm": 5.970350759863342, "learning_rate": 4.773148720484609e-05, "loss": 0.257, "step": 835 }, { "epoch": 0.308117571178476, "grad_norm": 6.844272541320446, "learning_rate": 4.772839658795896e-05, "loss": 0.4412, "step": 836 }, { "epoch": 0.3084861328664885, "grad_norm": 7.1652921497016555, "learning_rate": 4.7725305971071824e-05, "loss": 0.2376, "step": 837 }, { "epoch": 0.30885469455450104, "grad_norm": 9.059269191075359, "learning_rate": 4.7722215354184695e-05, "loss": 0.2697, "step": 838 }, { "epoch": 0.30922325624251357, "grad_norm": 10.278008132506756, "learning_rate": 4.7719124737297567e-05, "loss": 0.4223, "step": 839 }, { "epoch": 0.30959181793052615, "grad_norm": 10.411479747012955, "learning_rate": 4.771603412041044e-05, "loss": 0.4277, "step": 840 }, { "epoch": 0.3099603796185387, "grad_norm": 7.767749764337623, "learning_rate": 4.77129435035233e-05, "loss": 0.2826, "step": 841 }, { "epoch": 0.3103289413065512, "grad_norm": 5.3079599166311064, "learning_rate": 4.770985288663617e-05, "loss": 0.348, "step": 842 }, { "epoch": 0.3106975029945637, "grad_norm": 7.7011934222530165, "learning_rate": 4.7706762269749045e-05, "loss": 0.3799, "step": 843 }, { "epoch": 0.31106606468257625, "grad_norm": 8.983470572683668, "learning_rate": 4.7703671652861916e-05, "loss": 0.3642, "step": 844 }, { "epoch": 0.3114346263705888, "grad_norm": 22.111483998351964, "learning_rate": 4.770058103597479e-05, "loss": 0.3123, "step": 845 }, { "epoch": 0.3118031880586013, "grad_norm": 7.511619721948653, "learning_rate": 4.769749041908765e-05, "loss": 0.2274, "step": 846 }, { "epoch": 0.3121717497466138, "grad_norm": 3.9416007834178037, "learning_rate": 4.769439980220052e-05, "loss": 0.2799, "step": 847 }, { "epoch": 0.31254031143462635, "grad_norm": 5.027759549627168, "learning_rate": 4.7691309185313394e-05, "loss": 0.3482, "step": 848 }, { "epoch": 0.3129088731226389, "grad_norm": 9.105866320819603, "learning_rate": 4.7688218568426265e-05, "loss": 0.4724, "step": 849 }, { "epoch": 0.31327743481065146, "grad_norm": 8.04104954487927, "learning_rate": 4.768512795153913e-05, "loss": 0.3193, "step": 850 }, { "epoch": 0.313645996498664, "grad_norm": 10.872977934766624, "learning_rate": 4.7682037334651994e-05, "loss": 0.4085, "step": 851 }, { "epoch": 0.3140145581866765, "grad_norm": 6.703485225788395, "learning_rate": 4.7678946717764865e-05, "loss": 0.4318, "step": 852 }, { "epoch": 0.31438311987468903, "grad_norm": 9.972014368052255, "learning_rate": 4.7675856100877736e-05, "loss": 0.301, "step": 853 }, { "epoch": 0.31475168156270156, "grad_norm": 6.22401918091552, "learning_rate": 4.767276548399061e-05, "loss": 0.3428, "step": 854 }, { "epoch": 0.3151202432507141, "grad_norm": 7.190185575889029, "learning_rate": 4.766967486710348e-05, "loss": 0.3858, "step": 855 }, { "epoch": 0.3154888049387266, "grad_norm": 5.294361159390203, "learning_rate": 4.766658425021634e-05, "loss": 0.3158, "step": 856 }, { "epoch": 0.31585736662673913, "grad_norm": 11.772693552308345, "learning_rate": 4.7663493633329214e-05, "loss": 0.4771, "step": 857 }, { "epoch": 0.31622592831475166, "grad_norm": 5.466542872135952, "learning_rate": 4.7660403016442085e-05, "loss": 0.2704, "step": 858 }, { "epoch": 0.31659449000276424, "grad_norm": 7.491219021411522, "learning_rate": 4.7657312399554957e-05, "loss": 0.3346, "step": 859 }, { "epoch": 0.31696305169077676, "grad_norm": 7.794937779910484, "learning_rate": 4.765422178266782e-05, "loss": 0.4081, "step": 860 }, { "epoch": 0.3173316133787893, "grad_norm": 6.851019634458675, "learning_rate": 4.765113116578069e-05, "loss": 0.3069, "step": 861 }, { "epoch": 0.3177001750668018, "grad_norm": 17.13258806282544, "learning_rate": 4.764804054889356e-05, "loss": 0.4262, "step": 862 }, { "epoch": 0.31806873675481434, "grad_norm": 19.253217155831628, "learning_rate": 4.7644949932006435e-05, "loss": 0.5191, "step": 863 }, { "epoch": 0.31843729844282687, "grad_norm": 13.131418166611185, "learning_rate": 4.7641859315119306e-05, "loss": 0.3907, "step": 864 }, { "epoch": 0.3188058601308394, "grad_norm": 12.042488536484774, "learning_rate": 4.763876869823217e-05, "loss": 0.4165, "step": 865 }, { "epoch": 0.3191744218188519, "grad_norm": 10.12240491977392, "learning_rate": 4.7635678081345035e-05, "loss": 0.4533, "step": 866 }, { "epoch": 0.31954298350686444, "grad_norm": 4.853254284337211, "learning_rate": 4.7632587464457906e-05, "loss": 0.4604, "step": 867 }, { "epoch": 0.31991154519487697, "grad_norm": 6.083879664026683, "learning_rate": 4.762949684757078e-05, "loss": 0.3116, "step": 868 }, { "epoch": 0.32028010688288955, "grad_norm": 5.03729245347546, "learning_rate": 4.762640623068365e-05, "loss": 0.2288, "step": 869 }, { "epoch": 0.3206486685709021, "grad_norm": 11.385585680117508, "learning_rate": 4.762331561379651e-05, "loss": 0.3682, "step": 870 }, { "epoch": 0.3210172302589146, "grad_norm": 19.18164609398787, "learning_rate": 4.7620224996909384e-05, "loss": 0.4584, "step": 871 }, { "epoch": 0.3213857919469271, "grad_norm": 7.711712922874787, "learning_rate": 4.7617134380022255e-05, "loss": 0.439, "step": 872 }, { "epoch": 0.32175435363493965, "grad_norm": 4.830509917706771, "learning_rate": 4.7614043763135126e-05, "loss": 0.2684, "step": 873 }, { "epoch": 0.3221229153229522, "grad_norm": 8.702958000950515, "learning_rate": 4.7610953146248e-05, "loss": 0.3307, "step": 874 }, { "epoch": 0.3224914770109647, "grad_norm": 10.616697770277261, "learning_rate": 4.760786252936086e-05, "loss": 0.416, "step": 875 }, { "epoch": 0.3228600386989772, "grad_norm": 5.9302992674784525, "learning_rate": 4.760477191247373e-05, "loss": 0.2268, "step": 876 }, { "epoch": 0.32322860038698975, "grad_norm": 5.510391821802253, "learning_rate": 4.7601681295586604e-05, "loss": 0.452, "step": 877 }, { "epoch": 0.3235971620750023, "grad_norm": 8.95531688442616, "learning_rate": 4.7598590678699475e-05, "loss": 0.3784, "step": 878 }, { "epoch": 0.32396572376301486, "grad_norm": 12.234312015921846, "learning_rate": 4.759550006181234e-05, "loss": 0.2644, "step": 879 }, { "epoch": 0.3243342854510274, "grad_norm": 9.010720967058708, "learning_rate": 4.7592409444925204e-05, "loss": 0.2819, "step": 880 }, { "epoch": 0.3247028471390399, "grad_norm": 24.876620599297773, "learning_rate": 4.7589318828038075e-05, "loss": 0.4781, "step": 881 }, { "epoch": 0.32507140882705243, "grad_norm": 12.700587846262275, "learning_rate": 4.7586228211150947e-05, "loss": 0.4707, "step": 882 }, { "epoch": 0.32543997051506496, "grad_norm": 25.529908834878114, "learning_rate": 4.758313759426382e-05, "loss": 0.4994, "step": 883 }, { "epoch": 0.3258085322030775, "grad_norm": 10.50642416478931, "learning_rate": 4.758004697737668e-05, "loss": 0.3382, "step": 884 }, { "epoch": 0.32617709389109, "grad_norm": 19.534851230497157, "learning_rate": 4.757695636048955e-05, "loss": 0.5828, "step": 885 }, { "epoch": 0.32654565557910253, "grad_norm": 8.728296447446759, "learning_rate": 4.7573865743602424e-05, "loss": 0.4813, "step": 886 }, { "epoch": 0.32691421726711506, "grad_norm": 19.738234480080152, "learning_rate": 4.7570775126715296e-05, "loss": 0.2823, "step": 887 }, { "epoch": 0.32728277895512764, "grad_norm": 9.753129114640874, "learning_rate": 4.756768450982817e-05, "loss": 0.3257, "step": 888 }, { "epoch": 0.32765134064314017, "grad_norm": 7.047559751410912, "learning_rate": 4.756459389294103e-05, "loss": 0.414, "step": 889 }, { "epoch": 0.3280199023311527, "grad_norm": 9.82229417033285, "learning_rate": 4.75615032760539e-05, "loss": 0.3731, "step": 890 }, { "epoch": 0.3283884640191652, "grad_norm": 10.253527522767774, "learning_rate": 4.7558412659166774e-05, "loss": 0.3692, "step": 891 }, { "epoch": 0.32875702570717774, "grad_norm": 13.536197312957302, "learning_rate": 4.7555322042279645e-05, "loss": 0.3591, "step": 892 }, { "epoch": 0.32912558739519027, "grad_norm": 8.65275905464913, "learning_rate": 4.7552231425392516e-05, "loss": 0.4002, "step": 893 }, { "epoch": 0.3294941490832028, "grad_norm": 9.604348755363294, "learning_rate": 4.754914080850538e-05, "loss": 0.2593, "step": 894 }, { "epoch": 0.3298627107712153, "grad_norm": 14.140108113826539, "learning_rate": 4.7546050191618245e-05, "loss": 0.4594, "step": 895 }, { "epoch": 0.33023127245922784, "grad_norm": 11.935415035937298, "learning_rate": 4.7542959574731116e-05, "loss": 0.4036, "step": 896 }, { "epoch": 0.33059983414724037, "grad_norm": 3.5863879882815572, "learning_rate": 4.753986895784399e-05, "loss": 0.2184, "step": 897 }, { "epoch": 0.33096839583525295, "grad_norm": 11.562990910186798, "learning_rate": 4.753677834095686e-05, "loss": 0.4546, "step": 898 }, { "epoch": 0.3313369575232655, "grad_norm": 5.365934489438563, "learning_rate": 4.753368772406972e-05, "loss": 0.32, "step": 899 }, { "epoch": 0.331705519211278, "grad_norm": 11.098916325638086, "learning_rate": 4.7530597107182594e-05, "loss": 0.3455, "step": 900 }, { "epoch": 0.3320740808992905, "grad_norm": 10.48559599587319, "learning_rate": 4.7527506490295465e-05, "loss": 0.3838, "step": 901 }, { "epoch": 0.33244264258730305, "grad_norm": 7.738578501831193, "learning_rate": 4.7524415873408336e-05, "loss": 0.4545, "step": 902 }, { "epoch": 0.3328112042753156, "grad_norm": 10.011308379587126, "learning_rate": 4.75213252565212e-05, "loss": 0.5549, "step": 903 }, { "epoch": 0.3331797659633281, "grad_norm": 4.6401247724660095, "learning_rate": 4.751823463963407e-05, "loss": 0.3411, "step": 904 }, { "epoch": 0.3335483276513406, "grad_norm": 5.559961661336984, "learning_rate": 4.751514402274694e-05, "loss": 0.3955, "step": 905 }, { "epoch": 0.33391688933935315, "grad_norm": 13.410622192786443, "learning_rate": 4.7512053405859814e-05, "loss": 0.3684, "step": 906 }, { "epoch": 0.33428545102736573, "grad_norm": 9.487184413239806, "learning_rate": 4.7508962788972686e-05, "loss": 0.3612, "step": 907 }, { "epoch": 0.33465401271537826, "grad_norm": 22.480338088781885, "learning_rate": 4.750587217208555e-05, "loss": 0.3317, "step": 908 }, { "epoch": 0.3350225744033908, "grad_norm": 11.924278406987307, "learning_rate": 4.750278155519842e-05, "loss": 0.3428, "step": 909 }, { "epoch": 0.3353911360914033, "grad_norm": 22.917229366907964, "learning_rate": 4.7499690938311286e-05, "loss": 0.3181, "step": 910 }, { "epoch": 0.33575969777941583, "grad_norm": 11.842111116820556, "learning_rate": 4.749660032142416e-05, "loss": 0.5888, "step": 911 }, { "epoch": 0.33612825946742836, "grad_norm": 7.363235918709931, "learning_rate": 4.749350970453703e-05, "loss": 0.3803, "step": 912 }, { "epoch": 0.3364968211554409, "grad_norm": 6.853971468881398, "learning_rate": 4.749041908764989e-05, "loss": 0.4136, "step": 913 }, { "epoch": 0.3368653828434534, "grad_norm": 5.5600889316828965, "learning_rate": 4.7487328470762764e-05, "loss": 0.3287, "step": 914 }, { "epoch": 0.33723394453146593, "grad_norm": 25.392223507374016, "learning_rate": 4.7484237853875635e-05, "loss": 0.4073, "step": 915 }, { "epoch": 0.33760250621947846, "grad_norm": 7.25458565746683, "learning_rate": 4.7481147236988506e-05, "loss": 0.2745, "step": 916 }, { "epoch": 0.33797106790749104, "grad_norm": 6.586334542357943, "learning_rate": 4.747805662010138e-05, "loss": 0.2944, "step": 917 }, { "epoch": 0.33833962959550357, "grad_norm": 7.07077918435626, "learning_rate": 4.747496600321424e-05, "loss": 0.3207, "step": 918 }, { "epoch": 0.3387081912835161, "grad_norm": 10.707267460797398, "learning_rate": 4.747187538632711e-05, "loss": 0.3556, "step": 919 }, { "epoch": 0.3390767529715286, "grad_norm": 8.321183108060481, "learning_rate": 4.7468784769439984e-05, "loss": 0.398, "step": 920 }, { "epoch": 0.33944531465954114, "grad_norm": 9.408526477796533, "learning_rate": 4.7465694152552855e-05, "loss": 0.4751, "step": 921 }, { "epoch": 0.33981387634755367, "grad_norm": 8.85115393750389, "learning_rate": 4.746260353566572e-05, "loss": 0.5349, "step": 922 }, { "epoch": 0.3401824380355662, "grad_norm": 15.503634119084387, "learning_rate": 4.745951291877859e-05, "loss": 0.3176, "step": 923 }, { "epoch": 0.3405509997235787, "grad_norm": 7.968647795377054, "learning_rate": 4.745642230189146e-05, "loss": 0.4852, "step": 924 }, { "epoch": 0.34091956141159124, "grad_norm": 6.25270266271945, "learning_rate": 4.745333168500433e-05, "loss": 0.3404, "step": 925 }, { "epoch": 0.34128812309960377, "grad_norm": 6.383002251520825, "learning_rate": 4.74502410681172e-05, "loss": 0.5343, "step": 926 }, { "epoch": 0.34165668478761635, "grad_norm": 8.36743596809535, "learning_rate": 4.744715045123007e-05, "loss": 0.286, "step": 927 }, { "epoch": 0.3420252464756289, "grad_norm": 11.572629378512987, "learning_rate": 4.744405983434293e-05, "loss": 0.3853, "step": 928 }, { "epoch": 0.3423938081636414, "grad_norm": 15.681799593827762, "learning_rate": 4.7440969217455804e-05, "loss": 0.347, "step": 929 }, { "epoch": 0.3427623698516539, "grad_norm": 6.127208700738236, "learning_rate": 4.7437878600568676e-05, "loss": 0.3768, "step": 930 }, { "epoch": 0.34313093153966645, "grad_norm": 5.726709715237682, "learning_rate": 4.743478798368155e-05, "loss": 0.3477, "step": 931 }, { "epoch": 0.343499493227679, "grad_norm": 8.23628620935801, "learning_rate": 4.743169736679441e-05, "loss": 0.3716, "step": 932 }, { "epoch": 0.3438680549156915, "grad_norm": 7.046474132427835, "learning_rate": 4.742860674990728e-05, "loss": 0.4117, "step": 933 }, { "epoch": 0.344236616603704, "grad_norm": 8.963023201582677, "learning_rate": 4.7425516133020154e-05, "loss": 0.4353, "step": 934 }, { "epoch": 0.34460517829171655, "grad_norm": 6.004204230772319, "learning_rate": 4.7422425516133025e-05, "loss": 0.2934, "step": 935 }, { "epoch": 0.34497373997972913, "grad_norm": 3.953090019222082, "learning_rate": 4.7419334899245896e-05, "loss": 0.31, "step": 936 }, { "epoch": 0.34534230166774166, "grad_norm": 13.692205865558835, "learning_rate": 4.741624428235876e-05, "loss": 0.4251, "step": 937 }, { "epoch": 0.3457108633557542, "grad_norm": 20.13200000785829, "learning_rate": 4.741315366547163e-05, "loss": 0.5498, "step": 938 }, { "epoch": 0.3460794250437667, "grad_norm": 5.735341835111154, "learning_rate": 4.74100630485845e-05, "loss": 0.2946, "step": 939 }, { "epoch": 0.34644798673177923, "grad_norm": 14.02534969118278, "learning_rate": 4.7406972431697374e-05, "loss": 0.3096, "step": 940 }, { "epoch": 0.34681654841979176, "grad_norm": 10.870540713115783, "learning_rate": 4.740388181481024e-05, "loss": 0.3607, "step": 941 }, { "epoch": 0.3471851101078043, "grad_norm": 6.813823606366096, "learning_rate": 4.74007911979231e-05, "loss": 0.3429, "step": 942 }, { "epoch": 0.3475536717958168, "grad_norm": 11.066403837032034, "learning_rate": 4.7397700581035974e-05, "loss": 0.3862, "step": 943 }, { "epoch": 0.34792223348382934, "grad_norm": 9.674636668398836, "learning_rate": 4.7394609964148845e-05, "loss": 0.3429, "step": 944 }, { "epoch": 0.34829079517184186, "grad_norm": 9.81945631957846, "learning_rate": 4.7391519347261716e-05, "loss": 0.28, "step": 945 }, { "epoch": 0.34865935685985444, "grad_norm": 12.161917260255986, "learning_rate": 4.738842873037459e-05, "loss": 0.4399, "step": 946 }, { "epoch": 0.34902791854786697, "grad_norm": 8.116637166287536, "learning_rate": 4.738533811348745e-05, "loss": 0.3199, "step": 947 }, { "epoch": 0.3493964802358795, "grad_norm": 7.046750763265299, "learning_rate": 4.738224749660032e-05, "loss": 0.467, "step": 948 }, { "epoch": 0.349765041923892, "grad_norm": 4.997166402883285, "learning_rate": 4.7379156879713194e-05, "loss": 0.3076, "step": 949 }, { "epoch": 0.35013360361190454, "grad_norm": 6.8170553953078095, "learning_rate": 4.7376066262826066e-05, "loss": 0.2146, "step": 950 }, { "epoch": 0.35050216529991707, "grad_norm": 11.042761137838818, "learning_rate": 4.737297564593893e-05, "loss": 0.3907, "step": 951 }, { "epoch": 0.3508707269879296, "grad_norm": 7.2700087581618495, "learning_rate": 4.73698850290518e-05, "loss": 0.4506, "step": 952 }, { "epoch": 0.3512392886759421, "grad_norm": 8.132188170230394, "learning_rate": 4.736679441216467e-05, "loss": 0.3006, "step": 953 }, { "epoch": 0.35160785036395464, "grad_norm": 9.541760662121176, "learning_rate": 4.7363703795277544e-05, "loss": 0.2581, "step": 954 }, { "epoch": 0.3519764120519672, "grad_norm": 10.61722486656412, "learning_rate": 4.7360613178390415e-05, "loss": 0.4587, "step": 955 }, { "epoch": 0.35234497373997975, "grad_norm": 5.343576528845536, "learning_rate": 4.735752256150328e-05, "loss": 0.2956, "step": 956 }, { "epoch": 0.3527135354279923, "grad_norm": 12.200888454417917, "learning_rate": 4.7354431944616144e-05, "loss": 0.3991, "step": 957 }, { "epoch": 0.3530820971160048, "grad_norm": 10.61611962298887, "learning_rate": 4.7351341327729015e-05, "loss": 0.2502, "step": 958 }, { "epoch": 0.3534506588040173, "grad_norm": 8.630916169690368, "learning_rate": 4.7348250710841886e-05, "loss": 0.4434, "step": 959 }, { "epoch": 0.35381922049202985, "grad_norm": 7.838220843638711, "learning_rate": 4.734516009395476e-05, "loss": 0.3025, "step": 960 }, { "epoch": 0.3541877821800424, "grad_norm": 8.995364796915679, "learning_rate": 4.734206947706762e-05, "loss": 0.359, "step": 961 }, { "epoch": 0.3545563438680549, "grad_norm": 5.319184708901095, "learning_rate": 4.733897886018049e-05, "loss": 0.3015, "step": 962 }, { "epoch": 0.3549249055560674, "grad_norm": 11.934770042272037, "learning_rate": 4.7335888243293364e-05, "loss": 0.3896, "step": 963 }, { "epoch": 0.35529346724407995, "grad_norm": 9.227463357667146, "learning_rate": 4.7332797626406235e-05, "loss": 0.4203, "step": 964 }, { "epoch": 0.35566202893209253, "grad_norm": 7.884651687853404, "learning_rate": 4.7329707009519106e-05, "loss": 0.3004, "step": 965 }, { "epoch": 0.35603059062010506, "grad_norm": 5.203242303482512, "learning_rate": 4.732661639263197e-05, "loss": 0.2855, "step": 966 }, { "epoch": 0.3563991523081176, "grad_norm": 4.727960149761668, "learning_rate": 4.732352577574484e-05, "loss": 0.3881, "step": 967 }, { "epoch": 0.3567677139961301, "grad_norm": 6.420720128821708, "learning_rate": 4.732043515885771e-05, "loss": 0.3138, "step": 968 }, { "epoch": 0.35713627568414263, "grad_norm": 11.017186696700794, "learning_rate": 4.7317344541970584e-05, "loss": 0.425, "step": 969 }, { "epoch": 0.35750483737215516, "grad_norm": 20.559296239541723, "learning_rate": 4.731425392508345e-05, "loss": 0.4173, "step": 970 }, { "epoch": 0.3578733990601677, "grad_norm": 8.086986646912665, "learning_rate": 4.731116330819631e-05, "loss": 0.3094, "step": 971 }, { "epoch": 0.3582419607481802, "grad_norm": 5.468991693877865, "learning_rate": 4.7308072691309184e-05, "loss": 0.3192, "step": 972 }, { "epoch": 0.35861052243619274, "grad_norm": 7.50037383101539, "learning_rate": 4.7304982074422056e-05, "loss": 0.3018, "step": 973 }, { "epoch": 0.3589790841242053, "grad_norm": 6.8490939244143245, "learning_rate": 4.730189145753493e-05, "loss": 0.3385, "step": 974 }, { "epoch": 0.35934764581221784, "grad_norm": 10.866209489375409, "learning_rate": 4.729880084064779e-05, "loss": 0.4476, "step": 975 }, { "epoch": 0.35971620750023037, "grad_norm": 8.47953261976785, "learning_rate": 4.729571022376066e-05, "loss": 0.2538, "step": 976 }, { "epoch": 0.3600847691882429, "grad_norm": 5.638601177791128, "learning_rate": 4.7292619606873534e-05, "loss": 0.3318, "step": 977 }, { "epoch": 0.3604533308762554, "grad_norm": 10.986555726809074, "learning_rate": 4.7289528989986405e-05, "loss": 0.6182, "step": 978 }, { "epoch": 0.36082189256426794, "grad_norm": 9.39770728213241, "learning_rate": 4.7286438373099276e-05, "loss": 0.4596, "step": 979 }, { "epoch": 0.36119045425228047, "grad_norm": 9.676330821847097, "learning_rate": 4.728334775621214e-05, "loss": 0.4661, "step": 980 }, { "epoch": 0.361559015940293, "grad_norm": 4.651647763917176, "learning_rate": 4.728025713932501e-05, "loss": 0.2469, "step": 981 }, { "epoch": 0.3619275776283055, "grad_norm": 10.154721940336273, "learning_rate": 4.727716652243788e-05, "loss": 0.3543, "step": 982 }, { "epoch": 0.36229613931631804, "grad_norm": 5.372160938013762, "learning_rate": 4.7274075905550754e-05, "loss": 0.3861, "step": 983 }, { "epoch": 0.3626647010043306, "grad_norm": 9.958432589452613, "learning_rate": 4.7270985288663625e-05, "loss": 0.354, "step": 984 }, { "epoch": 0.36303326269234315, "grad_norm": 5.218420692150421, "learning_rate": 4.726789467177649e-05, "loss": 0.3041, "step": 985 }, { "epoch": 0.3634018243803557, "grad_norm": 7.172272315894025, "learning_rate": 4.7264804054889354e-05, "loss": 0.3942, "step": 986 }, { "epoch": 0.3637703860683682, "grad_norm": 6.955450804831956, "learning_rate": 4.7261713438002225e-05, "loss": 0.3623, "step": 987 }, { "epoch": 0.3641389477563807, "grad_norm": 7.490135826845187, "learning_rate": 4.7258622821115096e-05, "loss": 0.4388, "step": 988 }, { "epoch": 0.36450750944439325, "grad_norm": 14.337752141622806, "learning_rate": 4.725553220422797e-05, "loss": 0.3711, "step": 989 }, { "epoch": 0.3648760711324058, "grad_norm": 11.443744559836228, "learning_rate": 4.725244158734083e-05, "loss": 0.5398, "step": 990 }, { "epoch": 0.3652446328204183, "grad_norm": 7.273629715589181, "learning_rate": 4.72493509704537e-05, "loss": 0.3029, "step": 991 }, { "epoch": 0.3656131945084308, "grad_norm": 6.835231548369391, "learning_rate": 4.7246260353566574e-05, "loss": 0.3373, "step": 992 }, { "epoch": 0.36598175619644335, "grad_norm": 8.438119371074801, "learning_rate": 4.7243169736679446e-05, "loss": 0.3322, "step": 993 }, { "epoch": 0.36635031788445593, "grad_norm": 11.565115308108421, "learning_rate": 4.724007911979231e-05, "loss": 0.4405, "step": 994 }, { "epoch": 0.36671887957246846, "grad_norm": 34.36155276750092, "learning_rate": 4.723698850290518e-05, "loss": 0.2861, "step": 995 }, { "epoch": 0.367087441260481, "grad_norm": 8.831407972949046, "learning_rate": 4.723389788601805e-05, "loss": 0.4259, "step": 996 }, { "epoch": 0.3674560029484935, "grad_norm": 5.635728734578251, "learning_rate": 4.7230807269130924e-05, "loss": 0.4113, "step": 997 }, { "epoch": 0.36782456463650604, "grad_norm": 4.803365385931172, "learning_rate": 4.7227716652243795e-05, "loss": 0.2403, "step": 998 }, { "epoch": 0.36819312632451856, "grad_norm": 11.122591936756539, "learning_rate": 4.722462603535666e-05, "loss": 0.4267, "step": 999 }, { "epoch": 0.3685616880125311, "grad_norm": 20.220529469124163, "learning_rate": 4.722153541846953e-05, "loss": 0.7409, "step": 1000 }, { "epoch": 0.3689302497005436, "grad_norm": 14.156871925149218, "learning_rate": 4.7218444801582395e-05, "loss": 0.4526, "step": 1001 }, { "epoch": 0.36929881138855614, "grad_norm": 11.198127344796246, "learning_rate": 4.7215354184695266e-05, "loss": 0.2643, "step": 1002 }, { "epoch": 0.3696673730765687, "grad_norm": 10.13730730468641, "learning_rate": 4.721226356780814e-05, "loss": 0.5077, "step": 1003 }, { "epoch": 0.37003593476458124, "grad_norm": 9.69711279618997, "learning_rate": 4.7209172950921e-05, "loss": 0.321, "step": 1004 }, { "epoch": 0.37040449645259377, "grad_norm": 5.2473519322631095, "learning_rate": 4.720608233403387e-05, "loss": 0.3164, "step": 1005 }, { "epoch": 0.3707730581406063, "grad_norm": 6.9407360330809444, "learning_rate": 4.7202991717146744e-05, "loss": 0.3204, "step": 1006 }, { "epoch": 0.3711416198286188, "grad_norm": 15.905771784160764, "learning_rate": 4.7199901100259615e-05, "loss": 0.306, "step": 1007 }, { "epoch": 0.37151018151663134, "grad_norm": 10.996384459967114, "learning_rate": 4.7196810483372486e-05, "loss": 0.376, "step": 1008 }, { "epoch": 0.37187874320464387, "grad_norm": 51.98875423018184, "learning_rate": 4.719371986648535e-05, "loss": 0.421, "step": 1009 }, { "epoch": 0.3722473048926564, "grad_norm": 7.922667574688682, "learning_rate": 4.719062924959822e-05, "loss": 0.2691, "step": 1010 }, { "epoch": 0.3726158665806689, "grad_norm": 12.533374026076899, "learning_rate": 4.718753863271109e-05, "loss": 0.4193, "step": 1011 }, { "epoch": 0.37298442826868144, "grad_norm": 44.09807130224773, "learning_rate": 4.7184448015823964e-05, "loss": 0.4301, "step": 1012 }, { "epoch": 0.373352989956694, "grad_norm": 8.950149654224102, "learning_rate": 4.718135739893683e-05, "loss": 0.3536, "step": 1013 }, { "epoch": 0.37372155164470655, "grad_norm": 9.887879290340855, "learning_rate": 4.71782667820497e-05, "loss": 0.4229, "step": 1014 }, { "epoch": 0.3740901133327191, "grad_norm": 6.596816086788396, "learning_rate": 4.717517616516257e-05, "loss": 0.2867, "step": 1015 }, { "epoch": 0.3744586750207316, "grad_norm": 9.896222451573891, "learning_rate": 4.7172085548275436e-05, "loss": 0.3082, "step": 1016 }, { "epoch": 0.3748272367087441, "grad_norm": 10.545942522435984, "learning_rate": 4.716899493138831e-05, "loss": 0.1858, "step": 1017 }, { "epoch": 0.37519579839675665, "grad_norm": 11.221793184377852, "learning_rate": 4.716590431450118e-05, "loss": 0.476, "step": 1018 }, { "epoch": 0.3755643600847692, "grad_norm": 8.327788326479277, "learning_rate": 4.716281369761404e-05, "loss": 0.3871, "step": 1019 }, { "epoch": 0.3759329217727817, "grad_norm": 6.5149174221423545, "learning_rate": 4.7159723080726914e-05, "loss": 0.3103, "step": 1020 }, { "epoch": 0.37630148346079423, "grad_norm": 16.302748297473688, "learning_rate": 4.7156632463839785e-05, "loss": 0.4598, "step": 1021 }, { "epoch": 0.3766700451488068, "grad_norm": 17.121545150762806, "learning_rate": 4.7153541846952656e-05, "loss": 0.5717, "step": 1022 }, { "epoch": 0.37703860683681933, "grad_norm": 5.677376257195879, "learning_rate": 4.715045123006552e-05, "loss": 0.3076, "step": 1023 }, { "epoch": 0.37740716852483186, "grad_norm": 6.142047435839418, "learning_rate": 4.714736061317839e-05, "loss": 0.307, "step": 1024 }, { "epoch": 0.3777757302128444, "grad_norm": 11.308457468954307, "learning_rate": 4.714426999629126e-05, "loss": 0.3321, "step": 1025 }, { "epoch": 0.3781442919008569, "grad_norm": 6.857228840561336, "learning_rate": 4.7141179379404134e-05, "loss": 0.4383, "step": 1026 }, { "epoch": 0.37851285358886944, "grad_norm": 6.384305856323759, "learning_rate": 4.7138088762517005e-05, "loss": 0.3429, "step": 1027 }, { "epoch": 0.37888141527688196, "grad_norm": 5.47129195945221, "learning_rate": 4.713499814562987e-05, "loss": 0.225, "step": 1028 }, { "epoch": 0.3792499769648945, "grad_norm": 10.15311925904924, "learning_rate": 4.713190752874274e-05, "loss": 0.2715, "step": 1029 }, { "epoch": 0.379618538652907, "grad_norm": 13.220820704391095, "learning_rate": 4.712881691185561e-05, "loss": 0.2697, "step": 1030 }, { "epoch": 0.37998710034091954, "grad_norm": 6.532382994580969, "learning_rate": 4.7125726294968476e-05, "loss": 0.4217, "step": 1031 }, { "epoch": 0.3803556620289321, "grad_norm": 12.909598371077973, "learning_rate": 4.712263567808135e-05, "loss": 0.4087, "step": 1032 }, { "epoch": 0.38072422371694464, "grad_norm": 12.011566310008307, "learning_rate": 4.711954506119421e-05, "loss": 0.3529, "step": 1033 }, { "epoch": 0.38109278540495717, "grad_norm": 6.37095094634179, "learning_rate": 4.711645444430708e-05, "loss": 0.2962, "step": 1034 }, { "epoch": 0.3814613470929697, "grad_norm": 7.436123760701718, "learning_rate": 4.7113363827419954e-05, "loss": 0.2765, "step": 1035 }, { "epoch": 0.3818299087809822, "grad_norm": 4.559767231270599, "learning_rate": 4.7110273210532825e-05, "loss": 0.2563, "step": 1036 }, { "epoch": 0.38219847046899474, "grad_norm": 10.317193829980642, "learning_rate": 4.71071825936457e-05, "loss": 0.3269, "step": 1037 }, { "epoch": 0.38256703215700727, "grad_norm": 7.283203386883208, "learning_rate": 4.710409197675856e-05, "loss": 0.4509, "step": 1038 }, { "epoch": 0.3829355938450198, "grad_norm": 8.432143482008112, "learning_rate": 4.710100135987143e-05, "loss": 0.3934, "step": 1039 }, { "epoch": 0.3833041555330323, "grad_norm": 8.744172253614341, "learning_rate": 4.7097910742984303e-05, "loss": 0.4905, "step": 1040 }, { "epoch": 0.38367271722104485, "grad_norm": 9.778043666268479, "learning_rate": 4.7094820126097175e-05, "loss": 0.4942, "step": 1041 }, { "epoch": 0.3840412789090574, "grad_norm": 10.390052263350567, "learning_rate": 4.709172950921004e-05, "loss": 0.2898, "step": 1042 }, { "epoch": 0.38440984059706995, "grad_norm": 7.12929632922509, "learning_rate": 4.708863889232291e-05, "loss": 0.2929, "step": 1043 }, { "epoch": 0.3847784022850825, "grad_norm": 6.592610771512904, "learning_rate": 4.708554827543578e-05, "loss": 0.4284, "step": 1044 }, { "epoch": 0.385146963973095, "grad_norm": 11.992945822929359, "learning_rate": 4.708245765854865e-05, "loss": 0.3831, "step": 1045 }, { "epoch": 0.3855155256611075, "grad_norm": 6.0798717038522065, "learning_rate": 4.707936704166152e-05, "loss": 0.3381, "step": 1046 }, { "epoch": 0.38588408734912005, "grad_norm": 8.35029808014922, "learning_rate": 4.707627642477438e-05, "loss": 0.4921, "step": 1047 }, { "epoch": 0.3862526490371326, "grad_norm": 5.9792790241738825, "learning_rate": 4.707318580788725e-05, "loss": 0.2218, "step": 1048 }, { "epoch": 0.3866212107251451, "grad_norm": 6.925582684953162, "learning_rate": 4.7070095191000124e-05, "loss": 0.4253, "step": 1049 }, { "epoch": 0.38698977241315763, "grad_norm": 9.871331028262158, "learning_rate": 4.7067004574112995e-05, "loss": 0.4616, "step": 1050 }, { "epoch": 0.3873583341011702, "grad_norm": 10.821977111352288, "learning_rate": 4.7063913957225866e-05, "loss": 0.4074, "step": 1051 }, { "epoch": 0.38772689578918274, "grad_norm": 6.162575197824985, "learning_rate": 4.706082334033873e-05, "loss": 0.2887, "step": 1052 }, { "epoch": 0.38809545747719526, "grad_norm": 12.618134397518046, "learning_rate": 4.70577327234516e-05, "loss": 0.4565, "step": 1053 }, { "epoch": 0.3884640191652078, "grad_norm": 7.388644769726254, "learning_rate": 4.705464210656447e-05, "loss": 0.5326, "step": 1054 }, { "epoch": 0.3888325808532203, "grad_norm": 13.078405598281622, "learning_rate": 4.7051551489677344e-05, "loss": 0.3548, "step": 1055 }, { "epoch": 0.38920114254123284, "grad_norm": 6.958313932410812, "learning_rate": 4.7048460872790215e-05, "loss": 0.3532, "step": 1056 }, { "epoch": 0.38956970422924536, "grad_norm": 7.624794879092957, "learning_rate": 4.704537025590308e-05, "loss": 0.352, "step": 1057 }, { "epoch": 0.3899382659172579, "grad_norm": 9.740978272988999, "learning_rate": 4.704227963901595e-05, "loss": 0.5656, "step": 1058 }, { "epoch": 0.3903068276052704, "grad_norm": 17.286514335772246, "learning_rate": 4.703918902212882e-05, "loss": 0.3383, "step": 1059 }, { "epoch": 0.39067538929328294, "grad_norm": 10.284905522959894, "learning_rate": 4.7036098405241693e-05, "loss": 0.5222, "step": 1060 }, { "epoch": 0.3910439509812955, "grad_norm": 12.389703819792768, "learning_rate": 4.703300778835456e-05, "loss": 0.5142, "step": 1061 }, { "epoch": 0.39141251266930804, "grad_norm": 8.626262185442325, "learning_rate": 4.702991717146742e-05, "loss": 0.4991, "step": 1062 }, { "epoch": 0.39178107435732057, "grad_norm": 18.672505056053925, "learning_rate": 4.7026826554580293e-05, "loss": 0.4711, "step": 1063 }, { "epoch": 0.3921496360453331, "grad_norm": 8.427790275379499, "learning_rate": 4.7023735937693165e-05, "loss": 0.4143, "step": 1064 }, { "epoch": 0.3925181977333456, "grad_norm": 14.748369029763458, "learning_rate": 4.7020645320806036e-05, "loss": 0.3898, "step": 1065 }, { "epoch": 0.39288675942135814, "grad_norm": 7.641233330285192, "learning_rate": 4.70175547039189e-05, "loss": 0.4165, "step": 1066 }, { "epoch": 0.39325532110937067, "grad_norm": 12.03984575242505, "learning_rate": 4.701446408703177e-05, "loss": 0.4392, "step": 1067 }, { "epoch": 0.3936238827973832, "grad_norm": 12.133164635907288, "learning_rate": 4.701137347014464e-05, "loss": 0.4018, "step": 1068 }, { "epoch": 0.3939924444853957, "grad_norm": 5.87169635494861, "learning_rate": 4.7008282853257514e-05, "loss": 0.3435, "step": 1069 }, { "epoch": 0.3943610061734083, "grad_norm": 4.860887886856061, "learning_rate": 4.7005192236370385e-05, "loss": 0.2159, "step": 1070 }, { "epoch": 0.3947295678614208, "grad_norm": 4.95819044575114, "learning_rate": 4.700210161948325e-05, "loss": 0.3164, "step": 1071 }, { "epoch": 0.39509812954943335, "grad_norm": 6.312205392742382, "learning_rate": 4.699901100259612e-05, "loss": 0.3938, "step": 1072 }, { "epoch": 0.3954666912374459, "grad_norm": 7.713981112768834, "learning_rate": 4.699592038570899e-05, "loss": 0.3778, "step": 1073 }, { "epoch": 0.3958352529254584, "grad_norm": 4.980270561314074, "learning_rate": 4.699282976882186e-05, "loss": 0.2489, "step": 1074 }, { "epoch": 0.39620381461347093, "grad_norm": 7.387185584798707, "learning_rate": 4.698973915193473e-05, "loss": 0.343, "step": 1075 }, { "epoch": 0.39657237630148345, "grad_norm": 6.656027060343681, "learning_rate": 4.69866485350476e-05, "loss": 0.3245, "step": 1076 }, { "epoch": 0.396940937989496, "grad_norm": 9.414960408423791, "learning_rate": 4.698355791816046e-05, "loss": 0.3562, "step": 1077 }, { "epoch": 0.3973094996775085, "grad_norm": 9.115299032540662, "learning_rate": 4.6980467301273334e-05, "loss": 0.3593, "step": 1078 }, { "epoch": 0.39767806136552103, "grad_norm": 6.779634204100266, "learning_rate": 4.6977376684386205e-05, "loss": 0.4762, "step": 1079 }, { "epoch": 0.3980466230535336, "grad_norm": 10.543271954004306, "learning_rate": 4.6974286067499077e-05, "loss": 0.5215, "step": 1080 }, { "epoch": 0.39841518474154614, "grad_norm": 18.60354915105247, "learning_rate": 4.697119545061194e-05, "loss": 0.3783, "step": 1081 }, { "epoch": 0.39878374642955866, "grad_norm": 11.387006187409813, "learning_rate": 4.696810483372481e-05, "loss": 0.551, "step": 1082 }, { "epoch": 0.3991523081175712, "grad_norm": 6.988492225801492, "learning_rate": 4.6965014216837683e-05, "loss": 0.3763, "step": 1083 }, { "epoch": 0.3995208698055837, "grad_norm": 5.962979868006432, "learning_rate": 4.6961923599950555e-05, "loss": 0.1913, "step": 1084 }, { "epoch": 0.39988943149359624, "grad_norm": 9.896124541699352, "learning_rate": 4.695883298306342e-05, "loss": 0.3471, "step": 1085 }, { "epoch": 0.40025799318160876, "grad_norm": 16.15453478564533, "learning_rate": 4.695574236617629e-05, "loss": 0.2675, "step": 1086 }, { "epoch": 0.4006265548696213, "grad_norm": 8.00574811423969, "learning_rate": 4.695265174928916e-05, "loss": 0.445, "step": 1087 }, { "epoch": 0.4009951165576338, "grad_norm": 5.531776360074219, "learning_rate": 4.694956113240203e-05, "loss": 0.2768, "step": 1088 }, { "epoch": 0.40136367824564634, "grad_norm": 5.055787522486895, "learning_rate": 4.6946470515514904e-05, "loss": 0.4138, "step": 1089 }, { "epoch": 0.4017322399336589, "grad_norm": 7.874745682742465, "learning_rate": 4.694337989862777e-05, "loss": 0.3456, "step": 1090 }, { "epoch": 0.40210080162167144, "grad_norm": 12.476628541622333, "learning_rate": 4.694028928174064e-05, "loss": 0.3465, "step": 1091 }, { "epoch": 0.40246936330968397, "grad_norm": 8.315780565941987, "learning_rate": 4.6937198664853504e-05, "loss": 0.4831, "step": 1092 }, { "epoch": 0.4028379249976965, "grad_norm": 6.203753768609229, "learning_rate": 4.6934108047966375e-05, "loss": 0.4203, "step": 1093 }, { "epoch": 0.403206486685709, "grad_norm": 4.754734891767491, "learning_rate": 4.6931017431079246e-05, "loss": 0.2723, "step": 1094 }, { "epoch": 0.40357504837372155, "grad_norm": 5.855452463077749, "learning_rate": 4.692792681419211e-05, "loss": 0.4124, "step": 1095 }, { "epoch": 0.40394361006173407, "grad_norm": 9.814606458690275, "learning_rate": 4.692483619730498e-05, "loss": 0.4965, "step": 1096 }, { "epoch": 0.4043121717497466, "grad_norm": 6.076852071880277, "learning_rate": 4.692174558041785e-05, "loss": 0.4801, "step": 1097 }, { "epoch": 0.4046807334377591, "grad_norm": 10.505422372801439, "learning_rate": 4.6918654963530724e-05, "loss": 0.5759, "step": 1098 }, { "epoch": 0.4050492951257717, "grad_norm": 10.840365978411358, "learning_rate": 4.6915564346643595e-05, "loss": 0.424, "step": 1099 }, { "epoch": 0.4054178568137842, "grad_norm": 13.231367145464823, "learning_rate": 4.691247372975646e-05, "loss": 0.3254, "step": 1100 }, { "epoch": 0.40578641850179675, "grad_norm": 15.094167004623891, "learning_rate": 4.690938311286933e-05, "loss": 0.4304, "step": 1101 }, { "epoch": 0.4061549801898093, "grad_norm": 12.931264655509155, "learning_rate": 4.69062924959822e-05, "loss": 0.3114, "step": 1102 }, { "epoch": 0.4065235418778218, "grad_norm": 5.2483792300906265, "learning_rate": 4.690320187909507e-05, "loss": 0.3161, "step": 1103 }, { "epoch": 0.40689210356583433, "grad_norm": 5.187859580205095, "learning_rate": 4.690011126220794e-05, "loss": 0.3315, "step": 1104 }, { "epoch": 0.40726066525384685, "grad_norm": 10.118115380351108, "learning_rate": 4.689702064532081e-05, "loss": 0.3906, "step": 1105 }, { "epoch": 0.4076292269418594, "grad_norm": 12.623075735161578, "learning_rate": 4.689393002843368e-05, "loss": 0.4682, "step": 1106 }, { "epoch": 0.4079977886298719, "grad_norm": 8.480990976694725, "learning_rate": 4.6890839411546545e-05, "loss": 0.3118, "step": 1107 }, { "epoch": 0.40836635031788443, "grad_norm": 6.161020047831784, "learning_rate": 4.6887748794659416e-05, "loss": 0.2962, "step": 1108 }, { "epoch": 0.408734912005897, "grad_norm": 6.914357669370342, "learning_rate": 4.688465817777229e-05, "loss": 0.381, "step": 1109 }, { "epoch": 0.40910347369390954, "grad_norm": 4.417237778692229, "learning_rate": 4.688156756088515e-05, "loss": 0.2673, "step": 1110 }, { "epoch": 0.40947203538192206, "grad_norm": 10.957124553969939, "learning_rate": 4.687847694399802e-05, "loss": 0.4385, "step": 1111 }, { "epoch": 0.4098405970699346, "grad_norm": 4.760944206450909, "learning_rate": 4.6875386327110894e-05, "loss": 0.4123, "step": 1112 }, { "epoch": 0.4102091587579471, "grad_norm": 7.083645383655701, "learning_rate": 4.6872295710223765e-05, "loss": 0.4122, "step": 1113 }, { "epoch": 0.41057772044595964, "grad_norm": 6.219446191204886, "learning_rate": 4.686920509333663e-05, "loss": 0.3078, "step": 1114 }, { "epoch": 0.41094628213397216, "grad_norm": 7.831740711179008, "learning_rate": 4.68661144764495e-05, "loss": 0.4578, "step": 1115 }, { "epoch": 0.4113148438219847, "grad_norm": 22.024223170182417, "learning_rate": 4.686302385956237e-05, "loss": 0.3879, "step": 1116 }, { "epoch": 0.4116834055099972, "grad_norm": 7.307482962756503, "learning_rate": 4.685993324267524e-05, "loss": 0.4433, "step": 1117 }, { "epoch": 0.4120519671980098, "grad_norm": 9.017678170809, "learning_rate": 4.6856842625788114e-05, "loss": 0.5571, "step": 1118 }, { "epoch": 0.4124205288860223, "grad_norm": 18.958951276273005, "learning_rate": 4.685375200890098e-05, "loss": 0.3175, "step": 1119 }, { "epoch": 0.41278909057403484, "grad_norm": 12.868883179859665, "learning_rate": 4.685066139201385e-05, "loss": 0.3786, "step": 1120 }, { "epoch": 0.41315765226204737, "grad_norm": 5.959782279729465, "learning_rate": 4.684757077512672e-05, "loss": 0.3352, "step": 1121 }, { "epoch": 0.4135262139500599, "grad_norm": 16.433868123006675, "learning_rate": 4.6844480158239585e-05, "loss": 0.4103, "step": 1122 }, { "epoch": 0.4138947756380724, "grad_norm": 5.457264550477773, "learning_rate": 4.6841389541352457e-05, "loss": 0.3065, "step": 1123 }, { "epoch": 0.41426333732608495, "grad_norm": 4.146007336107196, "learning_rate": 4.683829892446532e-05, "loss": 0.2928, "step": 1124 }, { "epoch": 0.41463189901409747, "grad_norm": 8.663899591288537, "learning_rate": 4.683520830757819e-05, "loss": 0.3709, "step": 1125 }, { "epoch": 0.41500046070211, "grad_norm": 16.347625914697062, "learning_rate": 4.683211769069106e-05, "loss": 0.4164, "step": 1126 }, { "epoch": 0.4153690223901225, "grad_norm": 4.8439697215790085, "learning_rate": 4.6829027073803935e-05, "loss": 0.4556, "step": 1127 }, { "epoch": 0.4157375840781351, "grad_norm": 11.037046298317813, "learning_rate": 4.6825936456916806e-05, "loss": 0.3243, "step": 1128 }, { "epoch": 0.41610614576614763, "grad_norm": 20.108314183849583, "learning_rate": 4.682284584002967e-05, "loss": 0.4209, "step": 1129 }, { "epoch": 0.41647470745416015, "grad_norm": 3.996936817302569, "learning_rate": 4.681975522314254e-05, "loss": 0.3253, "step": 1130 }, { "epoch": 0.4168432691421727, "grad_norm": 12.955318251439977, "learning_rate": 4.681666460625541e-05, "loss": 0.3958, "step": 1131 }, { "epoch": 0.4172118308301852, "grad_norm": 6.916763243709535, "learning_rate": 4.6813573989368284e-05, "loss": 0.605, "step": 1132 }, { "epoch": 0.41758039251819773, "grad_norm": 5.708602490541756, "learning_rate": 4.681048337248115e-05, "loss": 0.3258, "step": 1133 }, { "epoch": 0.41794895420621025, "grad_norm": 9.404800455429413, "learning_rate": 4.680739275559402e-05, "loss": 0.4471, "step": 1134 }, { "epoch": 0.4183175158942228, "grad_norm": 10.58628523702384, "learning_rate": 4.680430213870689e-05, "loss": 0.3345, "step": 1135 }, { "epoch": 0.4186860775822353, "grad_norm": 8.46294674391444, "learning_rate": 4.680121152181976e-05, "loss": 0.3876, "step": 1136 }, { "epoch": 0.41905463927024783, "grad_norm": 20.455427140779015, "learning_rate": 4.6798120904932626e-05, "loss": 0.3021, "step": 1137 }, { "epoch": 0.4194232009582604, "grad_norm": 12.196497955475976, "learning_rate": 4.679503028804549e-05, "loss": 0.397, "step": 1138 }, { "epoch": 0.41979176264627294, "grad_norm": 8.880178310906606, "learning_rate": 4.679193967115836e-05, "loss": 0.3282, "step": 1139 }, { "epoch": 0.42016032433428546, "grad_norm": 4.797934946259188, "learning_rate": 4.678884905427123e-05, "loss": 0.3767, "step": 1140 }, { "epoch": 0.420528886022298, "grad_norm": 6.501151349728009, "learning_rate": 4.6785758437384104e-05, "loss": 0.3183, "step": 1141 }, { "epoch": 0.4208974477103105, "grad_norm": 7.2503317395310924, "learning_rate": 4.6782667820496975e-05, "loss": 0.4741, "step": 1142 }, { "epoch": 0.42126600939832304, "grad_norm": 3.9576530005736, "learning_rate": 4.677957720360984e-05, "loss": 0.2363, "step": 1143 }, { "epoch": 0.42163457108633556, "grad_norm": 11.7644836763827, "learning_rate": 4.677648658672271e-05, "loss": 0.3387, "step": 1144 }, { "epoch": 0.4220031327743481, "grad_norm": 10.133300393034283, "learning_rate": 4.677339596983558e-05, "loss": 0.2967, "step": 1145 }, { "epoch": 0.4223716944623606, "grad_norm": 20.64927844248589, "learning_rate": 4.677030535294845e-05, "loss": 0.4301, "step": 1146 }, { "epoch": 0.4227402561503732, "grad_norm": 6.921415912223486, "learning_rate": 4.676721473606132e-05, "loss": 0.4191, "step": 1147 }, { "epoch": 0.4231088178383857, "grad_norm": 11.105825637841814, "learning_rate": 4.676412411917419e-05, "loss": 0.2337, "step": 1148 }, { "epoch": 0.42347737952639825, "grad_norm": 6.724500741117154, "learning_rate": 4.676103350228706e-05, "loss": 0.3402, "step": 1149 }, { "epoch": 0.42384594121441077, "grad_norm": 8.1429236273095, "learning_rate": 4.675794288539993e-05, "loss": 0.481, "step": 1150 }, { "epoch": 0.4242145029024233, "grad_norm": 5.627614070866555, "learning_rate": 4.67548522685128e-05, "loss": 0.3877, "step": 1151 }, { "epoch": 0.4245830645904358, "grad_norm": 5.624043701262548, "learning_rate": 4.675176165162567e-05, "loss": 0.3745, "step": 1152 }, { "epoch": 0.42495162627844835, "grad_norm": 6.483659010734638, "learning_rate": 4.674867103473853e-05, "loss": 0.3568, "step": 1153 }, { "epoch": 0.42532018796646087, "grad_norm": 12.288658955151112, "learning_rate": 4.67455804178514e-05, "loss": 0.4636, "step": 1154 }, { "epoch": 0.4256887496544734, "grad_norm": 5.3648468644186025, "learning_rate": 4.6742489800964274e-05, "loss": 0.3319, "step": 1155 }, { "epoch": 0.4260573113424859, "grad_norm": 12.563768009700055, "learning_rate": 4.6739399184077145e-05, "loss": 0.3533, "step": 1156 }, { "epoch": 0.4264258730304985, "grad_norm": 15.999254686162582, "learning_rate": 4.673630856719001e-05, "loss": 0.392, "step": 1157 }, { "epoch": 0.42679443471851103, "grad_norm": 8.938427430164458, "learning_rate": 4.673321795030288e-05, "loss": 0.3057, "step": 1158 }, { "epoch": 0.42716299640652355, "grad_norm": 4.817056455756899, "learning_rate": 4.673012733341575e-05, "loss": 0.3113, "step": 1159 }, { "epoch": 0.4275315580945361, "grad_norm": 5.2240982518675905, "learning_rate": 4.672703671652862e-05, "loss": 0.2725, "step": 1160 }, { "epoch": 0.4279001197825486, "grad_norm": 7.803559337256181, "learning_rate": 4.6723946099641494e-05, "loss": 0.1809, "step": 1161 }, { "epoch": 0.42826868147056113, "grad_norm": 4.709859595018217, "learning_rate": 4.672085548275436e-05, "loss": 0.4149, "step": 1162 }, { "epoch": 0.42863724315857366, "grad_norm": 12.989692122618983, "learning_rate": 4.671776486586723e-05, "loss": 0.4257, "step": 1163 }, { "epoch": 0.4290058048465862, "grad_norm": 12.213497037609471, "learning_rate": 4.67146742489801e-05, "loss": 0.5218, "step": 1164 }, { "epoch": 0.4293743665345987, "grad_norm": 7.108300163427182, "learning_rate": 4.671158363209297e-05, "loss": 0.4127, "step": 1165 }, { "epoch": 0.4297429282226113, "grad_norm": 8.737640807582114, "learning_rate": 4.6708493015205836e-05, "loss": 0.3456, "step": 1166 }, { "epoch": 0.4301114899106238, "grad_norm": 6.875383123646553, "learning_rate": 4.67054023983187e-05, "loss": 0.4605, "step": 1167 }, { "epoch": 0.43048005159863634, "grad_norm": 5.614643650993064, "learning_rate": 4.670231178143157e-05, "loss": 0.3364, "step": 1168 }, { "epoch": 0.43084861328664886, "grad_norm": 7.775652852507939, "learning_rate": 4.669922116454444e-05, "loss": 0.375, "step": 1169 }, { "epoch": 0.4312171749746614, "grad_norm": 5.808861034326203, "learning_rate": 4.6696130547657314e-05, "loss": 0.364, "step": 1170 }, { "epoch": 0.4315857366626739, "grad_norm": 13.925614151346702, "learning_rate": 4.6693039930770186e-05, "loss": 0.4356, "step": 1171 }, { "epoch": 0.43195429835068644, "grad_norm": 11.914369232731843, "learning_rate": 4.668994931388305e-05, "loss": 0.4345, "step": 1172 }, { "epoch": 0.43232286003869896, "grad_norm": 7.463779215860117, "learning_rate": 4.668685869699592e-05, "loss": 0.4173, "step": 1173 }, { "epoch": 0.4326914217267115, "grad_norm": 8.085087326984226, "learning_rate": 4.668376808010879e-05, "loss": 0.2837, "step": 1174 }, { "epoch": 0.433059983414724, "grad_norm": 6.875645555185136, "learning_rate": 4.6680677463221664e-05, "loss": 0.3451, "step": 1175 }, { "epoch": 0.4334285451027366, "grad_norm": 6.150459780990208, "learning_rate": 4.667758684633453e-05, "loss": 0.2749, "step": 1176 }, { "epoch": 0.4337971067907491, "grad_norm": 4.286063947491603, "learning_rate": 4.66744962294474e-05, "loss": 0.1721, "step": 1177 }, { "epoch": 0.43416566847876165, "grad_norm": 10.964616567271523, "learning_rate": 4.667140561256027e-05, "loss": 0.5892, "step": 1178 }, { "epoch": 0.43453423016677417, "grad_norm": 7.906613548876617, "learning_rate": 4.666831499567314e-05, "loss": 0.3723, "step": 1179 }, { "epoch": 0.4349027918547867, "grad_norm": 14.959091861516496, "learning_rate": 4.666522437878601e-05, "loss": 0.3161, "step": 1180 }, { "epoch": 0.4352713535427992, "grad_norm": 14.446351875803499, "learning_rate": 4.666213376189888e-05, "loss": 0.4278, "step": 1181 }, { "epoch": 0.43563991523081175, "grad_norm": 7.207059038310249, "learning_rate": 4.665904314501175e-05, "loss": 0.3603, "step": 1182 }, { "epoch": 0.4360084769188243, "grad_norm": 8.388005876137964, "learning_rate": 4.665595252812461e-05, "loss": 0.3099, "step": 1183 }, { "epoch": 0.4363770386068368, "grad_norm": 14.923019221509664, "learning_rate": 4.6652861911237484e-05, "loss": 0.47, "step": 1184 }, { "epoch": 0.4367456002948493, "grad_norm": 9.220282165446415, "learning_rate": 4.6649771294350355e-05, "loss": 0.2986, "step": 1185 }, { "epoch": 0.4371141619828619, "grad_norm": 4.068742855786499, "learning_rate": 4.664668067746322e-05, "loss": 0.2757, "step": 1186 }, { "epoch": 0.43748272367087443, "grad_norm": 9.30304540177885, "learning_rate": 4.664359006057609e-05, "loss": 0.4056, "step": 1187 }, { "epoch": 0.43785128535888695, "grad_norm": 6.694488866437925, "learning_rate": 4.664049944368896e-05, "loss": 0.3369, "step": 1188 }, { "epoch": 0.4382198470468995, "grad_norm": 7.520855707221596, "learning_rate": 4.663740882680183e-05, "loss": 0.3311, "step": 1189 }, { "epoch": 0.438588408734912, "grad_norm": 8.228682749237429, "learning_rate": 4.6634318209914704e-05, "loss": 0.3914, "step": 1190 }, { "epoch": 0.43895697042292453, "grad_norm": 9.96144836312526, "learning_rate": 4.663122759302757e-05, "loss": 0.4728, "step": 1191 }, { "epoch": 0.43932553211093706, "grad_norm": 22.781113361215745, "learning_rate": 4.662813697614044e-05, "loss": 0.3697, "step": 1192 }, { "epoch": 0.4396940937989496, "grad_norm": 9.977701885309532, "learning_rate": 4.662504635925331e-05, "loss": 0.373, "step": 1193 }, { "epoch": 0.4400626554869621, "grad_norm": 4.908908002509276, "learning_rate": 4.662195574236618e-05, "loss": 0.2856, "step": 1194 }, { "epoch": 0.4404312171749747, "grad_norm": 10.026169675860299, "learning_rate": 4.661886512547905e-05, "loss": 0.5707, "step": 1195 }, { "epoch": 0.4407997788629872, "grad_norm": 6.112147021877093, "learning_rate": 4.661577450859192e-05, "loss": 0.3133, "step": 1196 }, { "epoch": 0.44116834055099974, "grad_norm": 8.057790400911378, "learning_rate": 4.661268389170479e-05, "loss": 0.3296, "step": 1197 }, { "epoch": 0.44153690223901226, "grad_norm": 5.679547797456568, "learning_rate": 4.6609593274817654e-05, "loss": 0.3385, "step": 1198 }, { "epoch": 0.4419054639270248, "grad_norm": 21.98553962577946, "learning_rate": 4.6606502657930525e-05, "loss": 0.43, "step": 1199 }, { "epoch": 0.4422740256150373, "grad_norm": 12.863527983133226, "learning_rate": 4.6603412041043396e-05, "loss": 0.4251, "step": 1200 }, { "epoch": 0.44264258730304984, "grad_norm": 8.80000057220457, "learning_rate": 4.660032142415626e-05, "loss": 0.4055, "step": 1201 }, { "epoch": 0.44301114899106236, "grad_norm": 11.91260941446178, "learning_rate": 4.659723080726913e-05, "loss": 0.3388, "step": 1202 }, { "epoch": 0.4433797106790749, "grad_norm": 3.5565768073283306, "learning_rate": 4.6594140190382e-05, "loss": 0.262, "step": 1203 }, { "epoch": 0.4437482723670874, "grad_norm": 6.566839908438201, "learning_rate": 4.6591049573494874e-05, "loss": 0.3204, "step": 1204 }, { "epoch": 0.4441168340551, "grad_norm": 3.9645703752352373, "learning_rate": 4.658795895660774e-05, "loss": 0.2147, "step": 1205 }, { "epoch": 0.4444853957431125, "grad_norm": 8.070733176656818, "learning_rate": 4.658486833972061e-05, "loss": 0.4332, "step": 1206 }, { "epoch": 0.44485395743112505, "grad_norm": 5.0659250988278135, "learning_rate": 4.658177772283348e-05, "loss": 0.2909, "step": 1207 }, { "epoch": 0.44522251911913757, "grad_norm": 20.495826854680228, "learning_rate": 4.657868710594635e-05, "loss": 0.3141, "step": 1208 }, { "epoch": 0.4455910808071501, "grad_norm": 10.254413305915417, "learning_rate": 4.657559648905922e-05, "loss": 0.5339, "step": 1209 }, { "epoch": 0.4459596424951626, "grad_norm": 5.892314314508098, "learning_rate": 4.657250587217209e-05, "loss": 0.3017, "step": 1210 }, { "epoch": 0.44632820418317515, "grad_norm": 18.798365355505315, "learning_rate": 4.656941525528496e-05, "loss": 0.4176, "step": 1211 }, { "epoch": 0.4466967658711877, "grad_norm": 12.16882959936408, "learning_rate": 4.656632463839783e-05, "loss": 0.4161, "step": 1212 }, { "epoch": 0.4470653275592002, "grad_norm": 11.294635978192604, "learning_rate": 4.6563234021510694e-05, "loss": 0.5515, "step": 1213 }, { "epoch": 0.4474338892472128, "grad_norm": 7.6769393707262426, "learning_rate": 4.6560143404623566e-05, "loss": 0.2542, "step": 1214 }, { "epoch": 0.4478024509352253, "grad_norm": 3.6911695939351534, "learning_rate": 4.655705278773643e-05, "loss": 0.3216, "step": 1215 }, { "epoch": 0.44817101262323783, "grad_norm": 7.295947258925965, "learning_rate": 4.65539621708493e-05, "loss": 0.409, "step": 1216 }, { "epoch": 0.44853957431125036, "grad_norm": 4.460287551163321, "learning_rate": 4.655087155396217e-05, "loss": 0.2038, "step": 1217 }, { "epoch": 0.4489081359992629, "grad_norm": 18.35698621427531, "learning_rate": 4.6547780937075044e-05, "loss": 0.4962, "step": 1218 }, { "epoch": 0.4492766976872754, "grad_norm": 5.861085036924357, "learning_rate": 4.6544690320187915e-05, "loss": 0.3228, "step": 1219 }, { "epoch": 0.44964525937528793, "grad_norm": 5.774538505577097, "learning_rate": 4.654159970330078e-05, "loss": 0.4305, "step": 1220 }, { "epoch": 0.45001382106330046, "grad_norm": 5.814525292422478, "learning_rate": 4.653850908641365e-05, "loss": 0.196, "step": 1221 }, { "epoch": 0.450382382751313, "grad_norm": 9.252187109467348, "learning_rate": 4.653541846952652e-05, "loss": 0.3451, "step": 1222 }, { "epoch": 0.4507509444393255, "grad_norm": 15.868222209000564, "learning_rate": 4.653232785263939e-05, "loss": 0.5303, "step": 1223 }, { "epoch": 0.4511195061273381, "grad_norm": 15.894073802246233, "learning_rate": 4.652923723575226e-05, "loss": 0.6703, "step": 1224 }, { "epoch": 0.4514880678153506, "grad_norm": 3.9214185471894236, "learning_rate": 4.652614661886513e-05, "loss": 0.2962, "step": 1225 }, { "epoch": 0.45185662950336314, "grad_norm": 6.2095885917277664, "learning_rate": 4.6523056001978e-05, "loss": 0.3564, "step": 1226 }, { "epoch": 0.45222519119137566, "grad_norm": 6.252097426384517, "learning_rate": 4.651996538509087e-05, "loss": 0.3816, "step": 1227 }, { "epoch": 0.4525937528793882, "grad_norm": 4.429419238597738, "learning_rate": 4.6516874768203735e-05, "loss": 0.3033, "step": 1228 }, { "epoch": 0.4529623145674007, "grad_norm": 9.622767994677755, "learning_rate": 4.65137841513166e-05, "loss": 0.457, "step": 1229 }, { "epoch": 0.45333087625541324, "grad_norm": 6.2651354819685965, "learning_rate": 4.651069353442947e-05, "loss": 0.4503, "step": 1230 }, { "epoch": 0.45369943794342577, "grad_norm": 11.14415677494755, "learning_rate": 4.650760291754234e-05, "loss": 0.8155, "step": 1231 }, { "epoch": 0.4540679996314383, "grad_norm": 8.590196797400303, "learning_rate": 4.650451230065521e-05, "loss": 0.4956, "step": 1232 }, { "epoch": 0.4544365613194508, "grad_norm": 4.595617628295249, "learning_rate": 4.6501421683768084e-05, "loss": 0.316, "step": 1233 }, { "epoch": 0.4548051230074634, "grad_norm": 4.771220939693643, "learning_rate": 4.649833106688095e-05, "loss": 0.3143, "step": 1234 }, { "epoch": 0.4551736846954759, "grad_norm": 6.433605608671524, "learning_rate": 4.649524044999382e-05, "loss": 0.224, "step": 1235 }, { "epoch": 0.45554224638348845, "grad_norm": 8.674503402347812, "learning_rate": 4.649214983310669e-05, "loss": 0.3592, "step": 1236 }, { "epoch": 0.455910808071501, "grad_norm": 12.500596299253651, "learning_rate": 4.648905921621956e-05, "loss": 0.3362, "step": 1237 }, { "epoch": 0.4562793697595135, "grad_norm": 6.102663065455811, "learning_rate": 4.648596859933243e-05, "loss": 0.2941, "step": 1238 }, { "epoch": 0.456647931447526, "grad_norm": 7.107228384715099, "learning_rate": 4.64828779824453e-05, "loss": 0.4313, "step": 1239 }, { "epoch": 0.45701649313553855, "grad_norm": 8.613980537798215, "learning_rate": 4.647978736555817e-05, "loss": 0.3526, "step": 1240 }, { "epoch": 0.4573850548235511, "grad_norm": 12.099102245476082, "learning_rate": 4.647669674867104e-05, "loss": 0.4138, "step": 1241 }, { "epoch": 0.4577536165115636, "grad_norm": 7.712448698642943, "learning_rate": 4.647360613178391e-05, "loss": 0.5051, "step": 1242 }, { "epoch": 0.4581221781995762, "grad_norm": 12.948625434182796, "learning_rate": 4.6470515514896776e-05, "loss": 0.3879, "step": 1243 }, { "epoch": 0.4584907398875887, "grad_norm": 4.982813671323651, "learning_rate": 4.646742489800964e-05, "loss": 0.2829, "step": 1244 }, { "epoch": 0.45885930157560123, "grad_norm": 7.898059568032235, "learning_rate": 4.646433428112251e-05, "loss": 0.5185, "step": 1245 }, { "epoch": 0.45922786326361376, "grad_norm": 6.253814143801752, "learning_rate": 4.646124366423538e-05, "loss": 0.3453, "step": 1246 }, { "epoch": 0.4595964249516263, "grad_norm": 6.27233801074956, "learning_rate": 4.6458153047348254e-05, "loss": 0.3573, "step": 1247 }, { "epoch": 0.4599649866396388, "grad_norm": 10.540691546748317, "learning_rate": 4.645506243046112e-05, "loss": 0.4821, "step": 1248 }, { "epoch": 0.46033354832765133, "grad_norm": 7.611183280633602, "learning_rate": 4.645197181357399e-05, "loss": 0.3036, "step": 1249 }, { "epoch": 0.46070211001566386, "grad_norm": 7.223868532912995, "learning_rate": 4.644888119668686e-05, "loss": 0.3143, "step": 1250 }, { "epoch": 0.4610706717036764, "grad_norm": 9.101996284593072, "learning_rate": 4.644579057979973e-05, "loss": 0.3259, "step": 1251 }, { "epoch": 0.4614392333916889, "grad_norm": 3.927767881974184, "learning_rate": 4.64426999629126e-05, "loss": 0.1957, "step": 1252 }, { "epoch": 0.4618077950797015, "grad_norm": 8.13108011313087, "learning_rate": 4.643960934602547e-05, "loss": 0.2864, "step": 1253 }, { "epoch": 0.462176356767714, "grad_norm": 9.674951707980656, "learning_rate": 4.643651872913834e-05, "loss": 0.4024, "step": 1254 }, { "epoch": 0.46254491845572654, "grad_norm": 10.143046555774907, "learning_rate": 4.643342811225121e-05, "loss": 0.4843, "step": 1255 }, { "epoch": 0.46291348014373906, "grad_norm": 10.152014215866728, "learning_rate": 4.643033749536408e-05, "loss": 0.4006, "step": 1256 }, { "epoch": 0.4632820418317516, "grad_norm": 8.098368501599277, "learning_rate": 4.6427246878476946e-05, "loss": 0.3194, "step": 1257 }, { "epoch": 0.4636506035197641, "grad_norm": 7.204496240734925, "learning_rate": 4.642415626158981e-05, "loss": 0.4773, "step": 1258 }, { "epoch": 0.46401916520777664, "grad_norm": 9.77630002481987, "learning_rate": 4.642106564470268e-05, "loss": 0.4681, "step": 1259 }, { "epoch": 0.46438772689578917, "grad_norm": 4.144716019606013, "learning_rate": 4.641797502781555e-05, "loss": 0.2139, "step": 1260 }, { "epoch": 0.4647562885838017, "grad_norm": 16.682182922231167, "learning_rate": 4.6414884410928424e-05, "loss": 0.3289, "step": 1261 }, { "epoch": 0.46512485027181427, "grad_norm": 6.46949476294656, "learning_rate": 4.6411793794041295e-05, "loss": 0.4106, "step": 1262 }, { "epoch": 0.4654934119598268, "grad_norm": 5.892788841215035, "learning_rate": 4.640870317715416e-05, "loss": 0.324, "step": 1263 }, { "epoch": 0.4658619736478393, "grad_norm": 7.085153084166511, "learning_rate": 4.640561256026703e-05, "loss": 0.4335, "step": 1264 }, { "epoch": 0.46623053533585185, "grad_norm": 16.87200403815954, "learning_rate": 4.64025219433799e-05, "loss": 0.4488, "step": 1265 }, { "epoch": 0.4665990970238644, "grad_norm": 8.464041551743968, "learning_rate": 4.639943132649277e-05, "loss": 0.3825, "step": 1266 }, { "epoch": 0.4669676587118769, "grad_norm": 10.357387070055136, "learning_rate": 4.639634070960564e-05, "loss": 0.3344, "step": 1267 }, { "epoch": 0.4673362203998894, "grad_norm": 6.937999260610682, "learning_rate": 4.639325009271851e-05, "loss": 0.3864, "step": 1268 }, { "epoch": 0.46770478208790195, "grad_norm": 7.607941353412426, "learning_rate": 4.639015947583138e-05, "loss": 0.3861, "step": 1269 }, { "epoch": 0.4680733437759145, "grad_norm": 12.245423999827318, "learning_rate": 4.638706885894425e-05, "loss": 0.507, "step": 1270 }, { "epoch": 0.468441905463927, "grad_norm": 9.372581068142662, "learning_rate": 4.638397824205712e-05, "loss": 0.3067, "step": 1271 }, { "epoch": 0.4688104671519396, "grad_norm": 5.807944861200233, "learning_rate": 4.6380887625169986e-05, "loss": 0.3215, "step": 1272 }, { "epoch": 0.4691790288399521, "grad_norm": 6.083920106502442, "learning_rate": 4.637779700828285e-05, "loss": 0.5258, "step": 1273 }, { "epoch": 0.46954759052796463, "grad_norm": 9.077442136940984, "learning_rate": 4.637470639139572e-05, "loss": 0.3827, "step": 1274 }, { "epoch": 0.46991615221597716, "grad_norm": 4.936717490467576, "learning_rate": 4.637161577450859e-05, "loss": 0.2594, "step": 1275 }, { "epoch": 0.4702847139039897, "grad_norm": 5.036808710491593, "learning_rate": 4.6368525157621464e-05, "loss": 0.2875, "step": 1276 }, { "epoch": 0.4706532755920022, "grad_norm": 5.5135923103540145, "learning_rate": 4.636543454073433e-05, "loss": 0.4093, "step": 1277 }, { "epoch": 0.47102183728001473, "grad_norm": 5.711114507559539, "learning_rate": 4.63623439238472e-05, "loss": 0.3774, "step": 1278 }, { "epoch": 0.47139039896802726, "grad_norm": 4.142027879975712, "learning_rate": 4.635925330696007e-05, "loss": 0.4225, "step": 1279 }, { "epoch": 0.4717589606560398, "grad_norm": 9.114846629381562, "learning_rate": 4.635616269007294e-05, "loss": 0.3694, "step": 1280 }, { "epoch": 0.4721275223440523, "grad_norm": 5.668944891930432, "learning_rate": 4.6353072073185814e-05, "loss": 0.3574, "step": 1281 }, { "epoch": 0.4724960840320649, "grad_norm": 6.849728276787358, "learning_rate": 4.634998145629868e-05, "loss": 0.2282, "step": 1282 }, { "epoch": 0.4728646457200774, "grad_norm": 10.524039950043766, "learning_rate": 4.634689083941155e-05, "loss": 0.3758, "step": 1283 }, { "epoch": 0.47323320740808994, "grad_norm": 8.466718254032317, "learning_rate": 4.634380022252442e-05, "loss": 0.4076, "step": 1284 }, { "epoch": 0.47360176909610247, "grad_norm": 6.606454190334944, "learning_rate": 4.634070960563729e-05, "loss": 0.5513, "step": 1285 }, { "epoch": 0.473970330784115, "grad_norm": 5.068110994636471, "learning_rate": 4.6337618988750156e-05, "loss": 0.2937, "step": 1286 }, { "epoch": 0.4743388924721275, "grad_norm": 5.383661787250565, "learning_rate": 4.633452837186303e-05, "loss": 0.2063, "step": 1287 }, { "epoch": 0.47470745416014004, "grad_norm": 6.9703558880486645, "learning_rate": 4.633143775497589e-05, "loss": 0.4535, "step": 1288 }, { "epoch": 0.47507601584815257, "grad_norm": 11.934609587492142, "learning_rate": 4.632834713808876e-05, "loss": 0.4342, "step": 1289 }, { "epoch": 0.4754445775361651, "grad_norm": 8.108885292746345, "learning_rate": 4.6325256521201634e-05, "loss": 0.467, "step": 1290 }, { "epoch": 0.4758131392241777, "grad_norm": 6.384179182592639, "learning_rate": 4.6322165904314505e-05, "loss": 0.4859, "step": 1291 }, { "epoch": 0.4761817009121902, "grad_norm": 4.208242031403737, "learning_rate": 4.631907528742737e-05, "loss": 0.2833, "step": 1292 }, { "epoch": 0.4765502626002027, "grad_norm": 4.278710273293017, "learning_rate": 4.631598467054024e-05, "loss": 0.2838, "step": 1293 }, { "epoch": 0.47691882428821525, "grad_norm": 34.67505583370144, "learning_rate": 4.631289405365311e-05, "loss": 0.3769, "step": 1294 }, { "epoch": 0.4772873859762278, "grad_norm": 6.826991076559557, "learning_rate": 4.630980343676598e-05, "loss": 0.4367, "step": 1295 }, { "epoch": 0.4776559476642403, "grad_norm": 21.58357637285741, "learning_rate": 4.630671281987885e-05, "loss": 0.3898, "step": 1296 }, { "epoch": 0.4780245093522528, "grad_norm": 5.109813053123002, "learning_rate": 4.630362220299172e-05, "loss": 0.3151, "step": 1297 }, { "epoch": 0.47839307104026535, "grad_norm": 8.90881953064981, "learning_rate": 4.630053158610459e-05, "loss": 0.2936, "step": 1298 }, { "epoch": 0.4787616327282779, "grad_norm": 8.360064954003423, "learning_rate": 4.629744096921746e-05, "loss": 0.2682, "step": 1299 }, { "epoch": 0.4791301944162904, "grad_norm": 6.1933067124653185, "learning_rate": 4.629435035233033e-05, "loss": 0.4191, "step": 1300 }, { "epoch": 0.479498756104303, "grad_norm": 9.431161937466808, "learning_rate": 4.62912597354432e-05, "loss": 0.2448, "step": 1301 }, { "epoch": 0.4798673177923155, "grad_norm": 6.6597249447955456, "learning_rate": 4.628816911855607e-05, "loss": 0.355, "step": 1302 }, { "epoch": 0.48023587948032803, "grad_norm": 10.986447395185921, "learning_rate": 4.628507850166894e-05, "loss": 0.3922, "step": 1303 }, { "epoch": 0.48060444116834056, "grad_norm": 7.196781965176997, "learning_rate": 4.6281987884781804e-05, "loss": 0.2659, "step": 1304 }, { "epoch": 0.4809730028563531, "grad_norm": 10.172628770314624, "learning_rate": 4.6278897267894675e-05, "loss": 0.2566, "step": 1305 }, { "epoch": 0.4813415645443656, "grad_norm": 13.199586942741215, "learning_rate": 4.627580665100754e-05, "loss": 0.4724, "step": 1306 }, { "epoch": 0.48171012623237813, "grad_norm": 10.234524092425476, "learning_rate": 4.627271603412041e-05, "loss": 0.5196, "step": 1307 }, { "epoch": 0.48207868792039066, "grad_norm": 12.56755245200071, "learning_rate": 4.626962541723328e-05, "loss": 0.4371, "step": 1308 }, { "epoch": 0.4824472496084032, "grad_norm": 7.116525797227705, "learning_rate": 4.626653480034615e-05, "loss": 0.3017, "step": 1309 }, { "epoch": 0.48281581129641576, "grad_norm": 29.774809934584386, "learning_rate": 4.626344418345902e-05, "loss": 0.3395, "step": 1310 }, { "epoch": 0.4831843729844283, "grad_norm": 12.609525053320308, "learning_rate": 4.626035356657189e-05, "loss": 0.4825, "step": 1311 }, { "epoch": 0.4835529346724408, "grad_norm": 5.336268491518602, "learning_rate": 4.625726294968476e-05, "loss": 0.2568, "step": 1312 }, { "epoch": 0.48392149636045334, "grad_norm": 6.813684483018777, "learning_rate": 4.625417233279763e-05, "loss": 0.3515, "step": 1313 }, { "epoch": 0.48429005804846587, "grad_norm": 8.252956900858132, "learning_rate": 4.62510817159105e-05, "loss": 0.2923, "step": 1314 }, { "epoch": 0.4846586197364784, "grad_norm": 9.614957251808327, "learning_rate": 4.6247991099023366e-05, "loss": 0.4021, "step": 1315 }, { "epoch": 0.4850271814244909, "grad_norm": 5.689393451454995, "learning_rate": 4.624490048213624e-05, "loss": 0.5331, "step": 1316 }, { "epoch": 0.48539574311250344, "grad_norm": 5.808531031625924, "learning_rate": 4.624180986524911e-05, "loss": 0.2901, "step": 1317 }, { "epoch": 0.48576430480051597, "grad_norm": 9.413899197883422, "learning_rate": 4.623871924836198e-05, "loss": 0.2083, "step": 1318 }, { "epoch": 0.4861328664885285, "grad_norm": 3.334379810733775, "learning_rate": 4.6235628631474844e-05, "loss": 0.2088, "step": 1319 }, { "epoch": 0.4865014281765411, "grad_norm": 7.825040472248944, "learning_rate": 4.623253801458771e-05, "loss": 0.391, "step": 1320 }, { "epoch": 0.4868699898645536, "grad_norm": 5.24442067455214, "learning_rate": 4.622944739770058e-05, "loss": 0.2284, "step": 1321 }, { "epoch": 0.4872385515525661, "grad_norm": 7.613915062284553, "learning_rate": 4.622635678081345e-05, "loss": 0.4358, "step": 1322 }, { "epoch": 0.48760711324057865, "grad_norm": 5.94768285430967, "learning_rate": 4.622326616392632e-05, "loss": 0.4885, "step": 1323 }, { "epoch": 0.4879756749285912, "grad_norm": 5.638384006658554, "learning_rate": 4.6220175547039193e-05, "loss": 0.2351, "step": 1324 }, { "epoch": 0.4883442366166037, "grad_norm": 6.44326020829645, "learning_rate": 4.621708493015206e-05, "loss": 0.2493, "step": 1325 }, { "epoch": 0.4887127983046162, "grad_norm": 35.84690687384164, "learning_rate": 4.621399431326493e-05, "loss": 0.4391, "step": 1326 }, { "epoch": 0.48908135999262875, "grad_norm": 6.1002500639141966, "learning_rate": 4.62109036963778e-05, "loss": 0.345, "step": 1327 }, { "epoch": 0.4894499216806413, "grad_norm": 3.400395112360623, "learning_rate": 4.620781307949067e-05, "loss": 0.347, "step": 1328 }, { "epoch": 0.4898184833686538, "grad_norm": 6.398547496478968, "learning_rate": 4.6204722462603536e-05, "loss": 0.2534, "step": 1329 }, { "epoch": 0.4901870450566664, "grad_norm": 6.291902685054288, "learning_rate": 4.620163184571641e-05, "loss": 0.3799, "step": 1330 }, { "epoch": 0.4905556067446789, "grad_norm": 6.93342209294554, "learning_rate": 4.619854122882928e-05, "loss": 0.2063, "step": 1331 }, { "epoch": 0.49092416843269143, "grad_norm": 6.599989411316672, "learning_rate": 4.619545061194215e-05, "loss": 0.2395, "step": 1332 }, { "epoch": 0.49129273012070396, "grad_norm": 9.845168238607423, "learning_rate": 4.619235999505502e-05, "loss": 0.2982, "step": 1333 }, { "epoch": 0.4916612918087165, "grad_norm": 16.659597092756137, "learning_rate": 4.6189269378167885e-05, "loss": 0.4114, "step": 1334 }, { "epoch": 0.492029853496729, "grad_norm": 7.238399037341012, "learning_rate": 4.618617876128075e-05, "loss": 0.2397, "step": 1335 }, { "epoch": 0.49239841518474153, "grad_norm": 9.407984374078762, "learning_rate": 4.618308814439362e-05, "loss": 0.5127, "step": 1336 }, { "epoch": 0.49276697687275406, "grad_norm": 15.680796859111426, "learning_rate": 4.617999752750649e-05, "loss": 0.4109, "step": 1337 }, { "epoch": 0.4931355385607666, "grad_norm": 8.449830038855627, "learning_rate": 4.617690691061936e-05, "loss": 0.33, "step": 1338 }, { "epoch": 0.49350410024877917, "grad_norm": 6.260559317370022, "learning_rate": 4.617381629373223e-05, "loss": 0.4102, "step": 1339 }, { "epoch": 0.4938726619367917, "grad_norm": 13.75975384248076, "learning_rate": 4.61707256768451e-05, "loss": 0.3602, "step": 1340 }, { "epoch": 0.4942412236248042, "grad_norm": 7.0368933516142045, "learning_rate": 4.616763505995797e-05, "loss": 0.387, "step": 1341 }, { "epoch": 0.49460978531281674, "grad_norm": 12.255887854797587, "learning_rate": 4.616454444307084e-05, "loss": 0.4401, "step": 1342 }, { "epoch": 0.49497834700082927, "grad_norm": 3.7613815520922524, "learning_rate": 4.616145382618371e-05, "loss": 0.2864, "step": 1343 }, { "epoch": 0.4953469086888418, "grad_norm": 3.623953701161429, "learning_rate": 4.615836320929658e-05, "loss": 0.2283, "step": 1344 }, { "epoch": 0.4957154703768543, "grad_norm": 6.525933981512124, "learning_rate": 4.615527259240945e-05, "loss": 0.448, "step": 1345 }, { "epoch": 0.49608403206486684, "grad_norm": 5.2102806214889315, "learning_rate": 4.615218197552232e-05, "loss": 0.2073, "step": 1346 }, { "epoch": 0.49645259375287937, "grad_norm": 5.636056164496186, "learning_rate": 4.614909135863519e-05, "loss": 0.3874, "step": 1347 }, { "epoch": 0.4968211554408919, "grad_norm": 8.502866373892138, "learning_rate": 4.6146000741748055e-05, "loss": 0.5181, "step": 1348 }, { "epoch": 0.4971897171289045, "grad_norm": 6.039265262807288, "learning_rate": 4.614291012486092e-05, "loss": 0.3406, "step": 1349 }, { "epoch": 0.497558278816917, "grad_norm": 13.941285855111978, "learning_rate": 4.613981950797379e-05, "loss": 0.4527, "step": 1350 }, { "epoch": 0.4979268405049295, "grad_norm": 39.57488945025621, "learning_rate": 4.613672889108666e-05, "loss": 0.4206, "step": 1351 }, { "epoch": 0.49829540219294205, "grad_norm": 9.578148896279327, "learning_rate": 4.613363827419953e-05, "loss": 0.4486, "step": 1352 }, { "epoch": 0.4986639638809546, "grad_norm": 10.46857509182243, "learning_rate": 4.6130547657312404e-05, "loss": 0.4101, "step": 1353 }, { "epoch": 0.4990325255689671, "grad_norm": 4.999794383589572, "learning_rate": 4.612745704042527e-05, "loss": 0.2751, "step": 1354 }, { "epoch": 0.4994010872569796, "grad_norm": 30.929775314032955, "learning_rate": 4.612436642353814e-05, "loss": 0.7014, "step": 1355 }, { "epoch": 0.49976964894499215, "grad_norm": 4.388393717542447, "learning_rate": 4.612127580665101e-05, "loss": 0.3729, "step": 1356 }, { "epoch": 0.5001382106330047, "grad_norm": 7.62063348534868, "learning_rate": 4.611818518976388e-05, "loss": 0.4125, "step": 1357 }, { "epoch": 0.5005067723210173, "grad_norm": 6.526099989971159, "learning_rate": 4.6115094572876746e-05, "loss": 0.3007, "step": 1358 }, { "epoch": 0.5008753340090297, "grad_norm": 7.874565960260921, "learning_rate": 4.611200395598962e-05, "loss": 0.5085, "step": 1359 }, { "epoch": 0.5012438956970423, "grad_norm": 5.285087680865247, "learning_rate": 4.610891333910249e-05, "loss": 0.3688, "step": 1360 }, { "epoch": 0.5016124573850548, "grad_norm": 4.939248463533849, "learning_rate": 4.610582272221536e-05, "loss": 0.2631, "step": 1361 }, { "epoch": 0.5019810190730674, "grad_norm": 8.743732496548107, "learning_rate": 4.610273210532823e-05, "loss": 0.3006, "step": 1362 }, { "epoch": 0.5023495807610799, "grad_norm": 4.6888303776164255, "learning_rate": 4.6099641488441095e-05, "loss": 0.3285, "step": 1363 }, { "epoch": 0.5027181424490924, "grad_norm": 7.30528281330859, "learning_rate": 4.609655087155396e-05, "loss": 0.3073, "step": 1364 }, { "epoch": 0.503086704137105, "grad_norm": 4.724584748928173, "learning_rate": 4.609346025466683e-05, "loss": 0.2339, "step": 1365 }, { "epoch": 0.5034552658251175, "grad_norm": 5.01477328760719, "learning_rate": 4.60903696377797e-05, "loss": 0.3643, "step": 1366 }, { "epoch": 0.50382382751313, "grad_norm": 6.42566904822254, "learning_rate": 4.6087279020892573e-05, "loss": 0.3348, "step": 1367 }, { "epoch": 0.5041923892011425, "grad_norm": 6.643049878958906, "learning_rate": 4.608418840400544e-05, "loss": 0.3779, "step": 1368 }, { "epoch": 0.5045609508891551, "grad_norm": 5.54180917580532, "learning_rate": 4.608109778711831e-05, "loss": 0.3235, "step": 1369 }, { "epoch": 0.5049295125771676, "grad_norm": 6.238294702944167, "learning_rate": 4.607800717023118e-05, "loss": 0.3305, "step": 1370 }, { "epoch": 0.5052980742651801, "grad_norm": 19.224263214557226, "learning_rate": 4.607491655334405e-05, "loss": 0.3461, "step": 1371 }, { "epoch": 0.5056666359531926, "grad_norm": 6.469226174635107, "learning_rate": 4.607182593645692e-05, "loss": 0.431, "step": 1372 }, { "epoch": 0.5060351976412052, "grad_norm": 4.487037001502935, "learning_rate": 4.606873531956979e-05, "loss": 0.2921, "step": 1373 }, { "epoch": 0.5064037593292178, "grad_norm": 4.717121171563944, "learning_rate": 4.606564470268266e-05, "loss": 0.2337, "step": 1374 }, { "epoch": 0.5067723210172302, "grad_norm": 7.7256332779843, "learning_rate": 4.606255408579553e-05, "loss": 0.4258, "step": 1375 }, { "epoch": 0.5071408827052428, "grad_norm": 9.358925020485126, "learning_rate": 4.60594634689084e-05, "loss": 0.3476, "step": 1376 }, { "epoch": 0.5075094443932553, "grad_norm": 3.8375479431139676, "learning_rate": 4.6056372852021265e-05, "loss": 0.3798, "step": 1377 }, { "epoch": 0.5078780060812679, "grad_norm": 6.214267361117416, "learning_rate": 4.6053282235134136e-05, "loss": 0.2718, "step": 1378 }, { "epoch": 0.5082465677692803, "grad_norm": 5.348304280152327, "learning_rate": 4.6050191618247e-05, "loss": 0.3055, "step": 1379 }, { "epoch": 0.5086151294572929, "grad_norm": 8.045864241738384, "learning_rate": 4.604710100135987e-05, "loss": 0.49, "step": 1380 }, { "epoch": 0.5089836911453054, "grad_norm": 13.58652713103764, "learning_rate": 4.604401038447274e-05, "loss": 0.3489, "step": 1381 }, { "epoch": 0.509352252833318, "grad_norm": 8.16771421396201, "learning_rate": 4.604091976758561e-05, "loss": 0.5568, "step": 1382 }, { "epoch": 0.5097208145213306, "grad_norm": 6.114579035569543, "learning_rate": 4.603782915069848e-05, "loss": 0.2529, "step": 1383 }, { "epoch": 0.510089376209343, "grad_norm": 5.747769794014063, "learning_rate": 4.603473853381135e-05, "loss": 0.3752, "step": 1384 }, { "epoch": 0.5104579378973556, "grad_norm": 4.825466374602757, "learning_rate": 4.603164791692422e-05, "loss": 0.3654, "step": 1385 }, { "epoch": 0.5108264995853681, "grad_norm": 6.374378248824118, "learning_rate": 4.602855730003709e-05, "loss": 0.3101, "step": 1386 }, { "epoch": 0.5111950612733807, "grad_norm": 9.463472513391622, "learning_rate": 4.6025466683149957e-05, "loss": 0.5447, "step": 1387 }, { "epoch": 0.5115636229613931, "grad_norm": 13.121215274630742, "learning_rate": 4.602237606626283e-05, "loss": 0.5393, "step": 1388 }, { "epoch": 0.5119321846494057, "grad_norm": 5.897204056672548, "learning_rate": 4.60192854493757e-05, "loss": 0.3701, "step": 1389 }, { "epoch": 0.5123007463374182, "grad_norm": 10.486471998279647, "learning_rate": 4.601619483248857e-05, "loss": 0.2816, "step": 1390 }, { "epoch": 0.5126693080254308, "grad_norm": 8.571101191490252, "learning_rate": 4.601310421560144e-05, "loss": 0.2864, "step": 1391 }, { "epoch": 0.5130378697134433, "grad_norm": 7.809305499234301, "learning_rate": 4.6010013598714306e-05, "loss": 0.4357, "step": 1392 }, { "epoch": 0.5134064314014558, "grad_norm": 8.658467253260316, "learning_rate": 4.600692298182718e-05, "loss": 0.2873, "step": 1393 }, { "epoch": 0.5137749930894684, "grad_norm": 5.325536552225682, "learning_rate": 4.600383236494004e-05, "loss": 0.2844, "step": 1394 }, { "epoch": 0.5141435547774809, "grad_norm": 7.12786834268112, "learning_rate": 4.600074174805291e-05, "loss": 0.2746, "step": 1395 }, { "epoch": 0.5145121164654934, "grad_norm": 4.766865553185703, "learning_rate": 4.5997651131165784e-05, "loss": 0.3471, "step": 1396 }, { "epoch": 0.5148806781535059, "grad_norm": 10.933131451797566, "learning_rate": 4.599456051427865e-05, "loss": 0.4468, "step": 1397 }, { "epoch": 0.5152492398415185, "grad_norm": 6.7425326262332845, "learning_rate": 4.599146989739152e-05, "loss": 0.368, "step": 1398 }, { "epoch": 0.515617801529531, "grad_norm": 5.558933100257168, "learning_rate": 4.598837928050439e-05, "loss": 0.2402, "step": 1399 }, { "epoch": 0.5159863632175435, "grad_norm": 5.880991700811162, "learning_rate": 4.598528866361726e-05, "loss": 0.2667, "step": 1400 }, { "epoch": 0.516354924905556, "grad_norm": 5.538227400354477, "learning_rate": 4.5982198046730126e-05, "loss": 0.4852, "step": 1401 }, { "epoch": 0.5167234865935686, "grad_norm": 8.61234405332099, "learning_rate": 4.5979107429843e-05, "loss": 0.3103, "step": 1402 }, { "epoch": 0.5170920482815812, "grad_norm": 6.440592495217588, "learning_rate": 4.597601681295587e-05, "loss": 0.2767, "step": 1403 }, { "epoch": 0.5174606099695936, "grad_norm": 6.1572816245803255, "learning_rate": 4.597292619606874e-05, "loss": 0.3241, "step": 1404 }, { "epoch": 0.5178291716576062, "grad_norm": 3.6462330335680697, "learning_rate": 4.596983557918161e-05, "loss": 0.2269, "step": 1405 }, { "epoch": 0.5181977333456187, "grad_norm": 4.25325695228356, "learning_rate": 4.5966744962294475e-05, "loss": 0.2452, "step": 1406 }, { "epoch": 0.5185662950336313, "grad_norm": 8.862590011333277, "learning_rate": 4.5963654345407347e-05, "loss": 0.3707, "step": 1407 }, { "epoch": 0.5189348567216437, "grad_norm": 7.512914347210767, "learning_rate": 4.596056372852022e-05, "loss": 0.2146, "step": 1408 }, { "epoch": 0.5193034184096563, "grad_norm": 5.594226326721156, "learning_rate": 4.595747311163308e-05, "loss": 0.3084, "step": 1409 }, { "epoch": 0.5196719800976688, "grad_norm": 6.151437805322745, "learning_rate": 4.595438249474595e-05, "loss": 0.3285, "step": 1410 }, { "epoch": 0.5200405417856814, "grad_norm": 8.908725755812958, "learning_rate": 4.595129187785882e-05, "loss": 0.2894, "step": 1411 }, { "epoch": 0.520409103473694, "grad_norm": 5.32736316950926, "learning_rate": 4.594820126097169e-05, "loss": 0.2418, "step": 1412 }, { "epoch": 0.5207776651617064, "grad_norm": 14.477949015327747, "learning_rate": 4.594511064408456e-05, "loss": 0.3487, "step": 1413 }, { "epoch": 0.521146226849719, "grad_norm": 7.7276878347433415, "learning_rate": 4.594202002719743e-05, "loss": 0.3477, "step": 1414 }, { "epoch": 0.5215147885377315, "grad_norm": 9.801896701918364, "learning_rate": 4.59389294103103e-05, "loss": 0.5263, "step": 1415 }, { "epoch": 0.5218833502257441, "grad_norm": 8.943332716232925, "learning_rate": 4.593583879342317e-05, "loss": 0.5403, "step": 1416 }, { "epoch": 0.5222519119137565, "grad_norm": 4.176666365793698, "learning_rate": 4.593274817653604e-05, "loss": 0.3125, "step": 1417 }, { "epoch": 0.5226204736017691, "grad_norm": 11.333990938144318, "learning_rate": 4.592965755964891e-05, "loss": 0.5028, "step": 1418 }, { "epoch": 0.5229890352897816, "grad_norm": 7.551018187727183, "learning_rate": 4.592656694276178e-05, "loss": 0.4889, "step": 1419 }, { "epoch": 0.5233575969777942, "grad_norm": 6.522312894267083, "learning_rate": 4.5923476325874645e-05, "loss": 0.2678, "step": 1420 }, { "epoch": 0.5237261586658067, "grad_norm": 5.703820925540595, "learning_rate": 4.5920385708987516e-05, "loss": 0.333, "step": 1421 }, { "epoch": 0.5240947203538192, "grad_norm": 9.23832047758585, "learning_rate": 4.591729509210039e-05, "loss": 0.3457, "step": 1422 }, { "epoch": 0.5244632820418318, "grad_norm": 6.608706609599023, "learning_rate": 4.591420447521326e-05, "loss": 0.2971, "step": 1423 }, { "epoch": 0.5248318437298443, "grad_norm": 8.009549163817972, "learning_rate": 4.591111385832612e-05, "loss": 0.3043, "step": 1424 }, { "epoch": 0.5252004054178568, "grad_norm": 8.557520399946927, "learning_rate": 4.5908023241438994e-05, "loss": 0.2492, "step": 1425 }, { "epoch": 0.5255689671058693, "grad_norm": 4.416363951667578, "learning_rate": 4.590493262455186e-05, "loss": 0.2376, "step": 1426 }, { "epoch": 0.5259375287938819, "grad_norm": 5.936366565336652, "learning_rate": 4.590184200766473e-05, "loss": 0.2464, "step": 1427 }, { "epoch": 0.5263060904818944, "grad_norm": 6.549909448361731, "learning_rate": 4.58987513907776e-05, "loss": 0.352, "step": 1428 }, { "epoch": 0.5266746521699069, "grad_norm": 4.493669083122753, "learning_rate": 4.589566077389047e-05, "loss": 0.1772, "step": 1429 }, { "epoch": 0.5270432138579195, "grad_norm": 4.589538719917405, "learning_rate": 4.5892570157003337e-05, "loss": 0.2355, "step": 1430 }, { "epoch": 0.527411775545932, "grad_norm": 5.911498306296112, "learning_rate": 4.588947954011621e-05, "loss": 0.3495, "step": 1431 }, { "epoch": 0.5277803372339446, "grad_norm": 7.832129392698681, "learning_rate": 4.588638892322908e-05, "loss": 0.3562, "step": 1432 }, { "epoch": 0.528148898921957, "grad_norm": 6.967141004168973, "learning_rate": 4.588329830634195e-05, "loss": 0.4489, "step": 1433 }, { "epoch": 0.5285174606099696, "grad_norm": 10.097054805589119, "learning_rate": 4.588020768945482e-05, "loss": 0.2953, "step": 1434 }, { "epoch": 0.5288860222979821, "grad_norm": 11.48335786843597, "learning_rate": 4.5877117072567686e-05, "loss": 0.6035, "step": 1435 }, { "epoch": 0.5292545839859947, "grad_norm": 9.360625372967709, "learning_rate": 4.587402645568056e-05, "loss": 0.3742, "step": 1436 }, { "epoch": 0.5296231456740071, "grad_norm": 6.013046385748093, "learning_rate": 4.587093583879343e-05, "loss": 0.271, "step": 1437 }, { "epoch": 0.5299917073620197, "grad_norm": 15.926445212428673, "learning_rate": 4.58678452219063e-05, "loss": 0.3989, "step": 1438 }, { "epoch": 0.5303602690500322, "grad_norm": 9.884516898483518, "learning_rate": 4.5864754605019164e-05, "loss": 0.564, "step": 1439 }, { "epoch": 0.5307288307380448, "grad_norm": 13.518279521654593, "learning_rate": 4.586166398813203e-05, "loss": 0.2944, "step": 1440 }, { "epoch": 0.5310973924260574, "grad_norm": 8.408952356645155, "learning_rate": 4.58585733712449e-05, "loss": 0.2946, "step": 1441 }, { "epoch": 0.5314659541140698, "grad_norm": 6.048887090546466, "learning_rate": 4.585548275435777e-05, "loss": 0.4224, "step": 1442 }, { "epoch": 0.5318345158020824, "grad_norm": 8.514986626731599, "learning_rate": 4.585239213747064e-05, "loss": 0.2032, "step": 1443 }, { "epoch": 0.5322030774900949, "grad_norm": 4.924708928060931, "learning_rate": 4.584930152058351e-05, "loss": 0.302, "step": 1444 }, { "epoch": 0.5325716391781075, "grad_norm": 5.265665569205734, "learning_rate": 4.584621090369638e-05, "loss": 0.2985, "step": 1445 }, { "epoch": 0.5329402008661199, "grad_norm": 6.250636564262563, "learning_rate": 4.584312028680925e-05, "loss": 0.3912, "step": 1446 }, { "epoch": 0.5333087625541325, "grad_norm": 5.001207015261123, "learning_rate": 4.584002966992212e-05, "loss": 0.4072, "step": 1447 }, { "epoch": 0.533677324242145, "grad_norm": 7.245563596677355, "learning_rate": 4.583693905303499e-05, "loss": 0.3195, "step": 1448 }, { "epoch": 0.5340458859301576, "grad_norm": 6.592017646765931, "learning_rate": 4.5833848436147855e-05, "loss": 0.3334, "step": 1449 }, { "epoch": 0.5344144476181701, "grad_norm": 9.80978142077781, "learning_rate": 4.5830757819260726e-05, "loss": 0.2571, "step": 1450 }, { "epoch": 0.5347830093061826, "grad_norm": 8.59961625063602, "learning_rate": 4.58276672023736e-05, "loss": 0.4065, "step": 1451 }, { "epoch": 0.5351515709941952, "grad_norm": 6.812422165732255, "learning_rate": 4.582457658548647e-05, "loss": 0.2342, "step": 1452 }, { "epoch": 0.5355201326822077, "grad_norm": 8.081711229466466, "learning_rate": 4.582148596859934e-05, "loss": 0.2716, "step": 1453 }, { "epoch": 0.5358886943702202, "grad_norm": 15.46926490620292, "learning_rate": 4.5818395351712204e-05, "loss": 0.4673, "step": 1454 }, { "epoch": 0.5362572560582327, "grad_norm": 7.438785217540248, "learning_rate": 4.581530473482507e-05, "loss": 0.3526, "step": 1455 }, { "epoch": 0.5366258177462453, "grad_norm": 6.826768404282009, "learning_rate": 4.581221411793794e-05, "loss": 0.5645, "step": 1456 }, { "epoch": 0.5369943794342578, "grad_norm": 8.575571149038796, "learning_rate": 4.580912350105081e-05, "loss": 0.2884, "step": 1457 }, { "epoch": 0.5373629411222703, "grad_norm": 8.63004365593929, "learning_rate": 4.580603288416368e-05, "loss": 0.4021, "step": 1458 }, { "epoch": 0.5377315028102829, "grad_norm": 20.199080886326083, "learning_rate": 4.580294226727655e-05, "loss": 0.4802, "step": 1459 }, { "epoch": 0.5381000644982954, "grad_norm": 28.27693121375068, "learning_rate": 4.579985165038942e-05, "loss": 0.4025, "step": 1460 }, { "epoch": 0.538468626186308, "grad_norm": 5.1332758631936475, "learning_rate": 4.579676103350229e-05, "loss": 0.1809, "step": 1461 }, { "epoch": 0.5388371878743204, "grad_norm": 9.408129127342896, "learning_rate": 4.579367041661516e-05, "loss": 0.2984, "step": 1462 }, { "epoch": 0.539205749562333, "grad_norm": 5.989292445098997, "learning_rate": 4.579057979972803e-05, "loss": 0.2802, "step": 1463 }, { "epoch": 0.5395743112503455, "grad_norm": 9.620226183986253, "learning_rate": 4.5787489182840896e-05, "loss": 0.4236, "step": 1464 }, { "epoch": 0.5399428729383581, "grad_norm": 14.767798595803335, "learning_rate": 4.578439856595377e-05, "loss": 0.4459, "step": 1465 }, { "epoch": 0.5403114346263705, "grad_norm": 7.014963233508817, "learning_rate": 4.578130794906664e-05, "loss": 0.3613, "step": 1466 }, { "epoch": 0.5406799963143831, "grad_norm": 3.6945363816475516, "learning_rate": 4.577821733217951e-05, "loss": 0.3386, "step": 1467 }, { "epoch": 0.5410485580023956, "grad_norm": 7.290110685452748, "learning_rate": 4.5775126715292374e-05, "loss": 0.4585, "step": 1468 }, { "epoch": 0.5414171196904082, "grad_norm": 8.214059397736955, "learning_rate": 4.5772036098405245e-05, "loss": 0.3207, "step": 1469 }, { "epoch": 0.5417856813784208, "grad_norm": 9.872123589062022, "learning_rate": 4.576894548151811e-05, "loss": 0.3366, "step": 1470 }, { "epoch": 0.5421542430664332, "grad_norm": 11.733699448971286, "learning_rate": 4.576585486463098e-05, "loss": 0.521, "step": 1471 }, { "epoch": 0.5425228047544458, "grad_norm": 5.992540490845584, "learning_rate": 4.576276424774385e-05, "loss": 0.4818, "step": 1472 }, { "epoch": 0.5428913664424583, "grad_norm": 12.389236430115016, "learning_rate": 4.5759673630856716e-05, "loss": 0.6516, "step": 1473 }, { "epoch": 0.5432599281304709, "grad_norm": 6.968421004530849, "learning_rate": 4.575658301396959e-05, "loss": 0.3164, "step": 1474 }, { "epoch": 0.5436284898184833, "grad_norm": 8.096892582628778, "learning_rate": 4.575349239708246e-05, "loss": 0.3644, "step": 1475 }, { "epoch": 0.5439970515064959, "grad_norm": 8.613787896096891, "learning_rate": 4.575040178019533e-05, "loss": 0.262, "step": 1476 }, { "epoch": 0.5443656131945084, "grad_norm": 5.306313896019151, "learning_rate": 4.57473111633082e-05, "loss": 0.4317, "step": 1477 }, { "epoch": 0.544734174882521, "grad_norm": 9.480453459763114, "learning_rate": 4.5744220546421066e-05, "loss": 0.4803, "step": 1478 }, { "epoch": 0.5451027365705335, "grad_norm": 7.5326215199550965, "learning_rate": 4.574112992953394e-05, "loss": 0.3965, "step": 1479 }, { "epoch": 0.545471298258546, "grad_norm": 4.316521138176914, "learning_rate": 4.573803931264681e-05, "loss": 0.2552, "step": 1480 }, { "epoch": 0.5458398599465586, "grad_norm": 4.459555817796321, "learning_rate": 4.573494869575968e-05, "loss": 0.3382, "step": 1481 }, { "epoch": 0.5462084216345711, "grad_norm": 6.333266743092396, "learning_rate": 4.5731858078872544e-05, "loss": 0.2852, "step": 1482 }, { "epoch": 0.5465769833225836, "grad_norm": 5.042463045301614, "learning_rate": 4.5728767461985415e-05, "loss": 0.3386, "step": 1483 }, { "epoch": 0.5469455450105961, "grad_norm": 5.594043915768178, "learning_rate": 4.5725676845098286e-05, "loss": 0.4154, "step": 1484 }, { "epoch": 0.5473141066986087, "grad_norm": 14.166047804535696, "learning_rate": 4.572258622821115e-05, "loss": 0.4614, "step": 1485 }, { "epoch": 0.5476826683866212, "grad_norm": 7.62503552037705, "learning_rate": 4.571949561132402e-05, "loss": 0.3855, "step": 1486 }, { "epoch": 0.5480512300746337, "grad_norm": 4.3511820272403785, "learning_rate": 4.571640499443689e-05, "loss": 0.2265, "step": 1487 }, { "epoch": 0.5484197917626463, "grad_norm": 6.455120083070898, "learning_rate": 4.571331437754976e-05, "loss": 0.3149, "step": 1488 }, { "epoch": 0.5487883534506588, "grad_norm": 10.7545429655087, "learning_rate": 4.571022376066263e-05, "loss": 0.4835, "step": 1489 }, { "epoch": 0.5491569151386714, "grad_norm": 7.725582419355949, "learning_rate": 4.57071331437755e-05, "loss": 0.4315, "step": 1490 }, { "epoch": 0.5495254768266838, "grad_norm": 5.691118367912224, "learning_rate": 4.570404252688837e-05, "loss": 0.2887, "step": 1491 }, { "epoch": 0.5498940385146964, "grad_norm": 5.677970868182721, "learning_rate": 4.5700951910001235e-05, "loss": 0.5579, "step": 1492 }, { "epoch": 0.5502626002027089, "grad_norm": 7.016892350555005, "learning_rate": 4.5697861293114106e-05, "loss": 0.3831, "step": 1493 }, { "epoch": 0.5506311618907215, "grad_norm": 10.989491732006172, "learning_rate": 4.569477067622698e-05, "loss": 0.2739, "step": 1494 }, { "epoch": 0.550999723578734, "grad_norm": 6.814169862807168, "learning_rate": 4.569168005933985e-05, "loss": 0.353, "step": 1495 }, { "epoch": 0.5513682852667465, "grad_norm": 7.703240879262787, "learning_rate": 4.568858944245272e-05, "loss": 0.2818, "step": 1496 }, { "epoch": 0.5517368469547591, "grad_norm": 9.026824394812671, "learning_rate": 4.5685498825565584e-05, "loss": 0.3246, "step": 1497 }, { "epoch": 0.5521054086427716, "grad_norm": 5.475591423803431, "learning_rate": 4.5682408208678456e-05, "loss": 0.4593, "step": 1498 }, { "epoch": 0.5524739703307842, "grad_norm": 8.16906760079194, "learning_rate": 4.567931759179133e-05, "loss": 0.269, "step": 1499 }, { "epoch": 0.5528425320187966, "grad_norm": 9.098554358042271, "learning_rate": 4.567622697490419e-05, "loss": 0.506, "step": 1500 }, { "epoch": 0.5532110937068092, "grad_norm": 7.284922185415476, "learning_rate": 4.567313635801706e-05, "loss": 0.3524, "step": 1501 }, { "epoch": 0.5535796553948217, "grad_norm": 4.084284431603326, "learning_rate": 4.567004574112993e-05, "loss": 0.1623, "step": 1502 }, { "epoch": 0.5539482170828343, "grad_norm": 7.781311280036059, "learning_rate": 4.56669551242428e-05, "loss": 0.4131, "step": 1503 }, { "epoch": 0.5543167787708467, "grad_norm": 6.328191912261983, "learning_rate": 4.566386450735567e-05, "loss": 0.3355, "step": 1504 }, { "epoch": 0.5546853404588593, "grad_norm": 15.220937583406903, "learning_rate": 4.566077389046854e-05, "loss": 0.4438, "step": 1505 }, { "epoch": 0.5550539021468718, "grad_norm": 7.552962917976761, "learning_rate": 4.565768327358141e-05, "loss": 0.5963, "step": 1506 }, { "epoch": 0.5554224638348844, "grad_norm": 6.466598374048755, "learning_rate": 4.5654592656694276e-05, "loss": 0.4144, "step": 1507 }, { "epoch": 0.5557910255228969, "grad_norm": 11.35827345931425, "learning_rate": 4.565150203980715e-05, "loss": 0.3245, "step": 1508 }, { "epoch": 0.5561595872109094, "grad_norm": 5.43660474400872, "learning_rate": 4.564841142292002e-05, "loss": 0.2599, "step": 1509 }, { "epoch": 0.556528148898922, "grad_norm": 11.594930501304358, "learning_rate": 4.564532080603289e-05, "loss": 0.4382, "step": 1510 }, { "epoch": 0.5568967105869345, "grad_norm": 7.304662171998202, "learning_rate": 4.5642230189145754e-05, "loss": 0.4222, "step": 1511 }, { "epoch": 0.557265272274947, "grad_norm": 11.208231721060566, "learning_rate": 4.5639139572258625e-05, "loss": 0.4253, "step": 1512 }, { "epoch": 0.5576338339629595, "grad_norm": 4.276248655834729, "learning_rate": 4.5636048955371496e-05, "loss": 0.2952, "step": 1513 }, { "epoch": 0.5580023956509721, "grad_norm": 4.815351359856524, "learning_rate": 4.563295833848437e-05, "loss": 0.2812, "step": 1514 }, { "epoch": 0.5583709573389846, "grad_norm": 15.546352849271836, "learning_rate": 4.562986772159723e-05, "loss": 0.5012, "step": 1515 }, { "epoch": 0.5587395190269971, "grad_norm": 5.35938923558605, "learning_rate": 4.56267771047101e-05, "loss": 0.336, "step": 1516 }, { "epoch": 0.5591080807150097, "grad_norm": 39.162151016837996, "learning_rate": 4.562368648782297e-05, "loss": 0.386, "step": 1517 }, { "epoch": 0.5594766424030222, "grad_norm": 8.009239106917544, "learning_rate": 4.562059587093584e-05, "loss": 0.5027, "step": 1518 }, { "epoch": 0.5598452040910348, "grad_norm": 6.910603389645826, "learning_rate": 4.561750525404871e-05, "loss": 0.4088, "step": 1519 }, { "epoch": 0.5602137657790472, "grad_norm": 6.862956369925285, "learning_rate": 4.561441463716158e-05, "loss": 0.43, "step": 1520 }, { "epoch": 0.5605823274670598, "grad_norm": 4.149635730980384, "learning_rate": 4.5611324020274446e-05, "loss": 0.2602, "step": 1521 }, { "epoch": 0.5609508891550723, "grad_norm": 7.123582130029894, "learning_rate": 4.560823340338732e-05, "loss": 0.4036, "step": 1522 }, { "epoch": 0.5613194508430849, "grad_norm": 6.720317427086371, "learning_rate": 4.560514278650019e-05, "loss": 0.3289, "step": 1523 }, { "epoch": 0.5616880125310973, "grad_norm": 7.694934258629433, "learning_rate": 4.560205216961306e-05, "loss": 0.5121, "step": 1524 }, { "epoch": 0.5620565742191099, "grad_norm": 6.323844761949979, "learning_rate": 4.559896155272593e-05, "loss": 0.4506, "step": 1525 }, { "epoch": 0.5624251359071225, "grad_norm": 4.493984227980001, "learning_rate": 4.5595870935838795e-05, "loss": 0.3531, "step": 1526 }, { "epoch": 0.562793697595135, "grad_norm": 5.551678364234739, "learning_rate": 4.5592780318951666e-05, "loss": 0.3589, "step": 1527 }, { "epoch": 0.5631622592831476, "grad_norm": 4.8880337318842315, "learning_rate": 4.558968970206454e-05, "loss": 0.3466, "step": 1528 }, { "epoch": 0.56353082097116, "grad_norm": 8.388080004949499, "learning_rate": 4.558659908517741e-05, "loss": 0.4669, "step": 1529 }, { "epoch": 0.5638993826591726, "grad_norm": 3.9115868084664127, "learning_rate": 4.558350846829027e-05, "loss": 0.2891, "step": 1530 }, { "epoch": 0.5642679443471851, "grad_norm": 8.324838301087764, "learning_rate": 4.558041785140314e-05, "loss": 0.3504, "step": 1531 }, { "epoch": 0.5646365060351977, "grad_norm": 7.256455277094772, "learning_rate": 4.557732723451601e-05, "loss": 0.4326, "step": 1532 }, { "epoch": 0.5650050677232101, "grad_norm": 4.139850126493013, "learning_rate": 4.557423661762888e-05, "loss": 0.2348, "step": 1533 }, { "epoch": 0.5653736294112227, "grad_norm": 7.964725689950791, "learning_rate": 4.557114600074175e-05, "loss": 0.3354, "step": 1534 }, { "epoch": 0.5657421910992352, "grad_norm": 8.46256134146204, "learning_rate": 4.556805538385462e-05, "loss": 0.5044, "step": 1535 }, { "epoch": 0.5661107527872478, "grad_norm": 10.39546850223016, "learning_rate": 4.5564964766967486e-05, "loss": 0.3771, "step": 1536 }, { "epoch": 0.5664793144752603, "grad_norm": 7.6405256456027475, "learning_rate": 4.556187415008036e-05, "loss": 0.4367, "step": 1537 }, { "epoch": 0.5668478761632728, "grad_norm": 10.389917151641237, "learning_rate": 4.555878353319323e-05, "loss": 0.4166, "step": 1538 }, { "epoch": 0.5672164378512854, "grad_norm": 4.857679850281617, "learning_rate": 4.55556929163061e-05, "loss": 0.2606, "step": 1539 }, { "epoch": 0.5675849995392979, "grad_norm": 15.029308931322324, "learning_rate": 4.5552602299418964e-05, "loss": 0.4141, "step": 1540 }, { "epoch": 0.5679535612273104, "grad_norm": 6.731417190048067, "learning_rate": 4.5549511682531836e-05, "loss": 0.3016, "step": 1541 }, { "epoch": 0.5683221229153229, "grad_norm": 16.446026929255968, "learning_rate": 4.554642106564471e-05, "loss": 0.3843, "step": 1542 }, { "epoch": 0.5686906846033355, "grad_norm": 6.939878313828146, "learning_rate": 4.554333044875758e-05, "loss": 0.4012, "step": 1543 }, { "epoch": 0.569059246291348, "grad_norm": 4.693721050879227, "learning_rate": 4.554023983187045e-05, "loss": 0.3374, "step": 1544 }, { "epoch": 0.5694278079793605, "grad_norm": 5.127794945769514, "learning_rate": 4.553714921498331e-05, "loss": 0.2611, "step": 1545 }, { "epoch": 0.5697963696673731, "grad_norm": 7.294329376875547, "learning_rate": 4.553405859809618e-05, "loss": 0.3612, "step": 1546 }, { "epoch": 0.5701649313553856, "grad_norm": 6.830683248590739, "learning_rate": 4.553096798120905e-05, "loss": 0.3128, "step": 1547 }, { "epoch": 0.5705334930433982, "grad_norm": 4.783552824471756, "learning_rate": 4.552787736432192e-05, "loss": 0.3456, "step": 1548 }, { "epoch": 0.5709020547314106, "grad_norm": 6.199587642123712, "learning_rate": 4.552478674743479e-05, "loss": 0.2876, "step": 1549 }, { "epoch": 0.5712706164194232, "grad_norm": 4.711674722710276, "learning_rate": 4.5521696130547656e-05, "loss": 0.2934, "step": 1550 }, { "epoch": 0.5716391781074357, "grad_norm": 34.38771781783068, "learning_rate": 4.551860551366053e-05, "loss": 0.4096, "step": 1551 }, { "epoch": 0.5720077397954483, "grad_norm": 8.133924541977866, "learning_rate": 4.55155148967734e-05, "loss": 0.4746, "step": 1552 }, { "epoch": 0.5723763014834607, "grad_norm": 7.259264584119117, "learning_rate": 4.551242427988627e-05, "loss": 0.2949, "step": 1553 }, { "epoch": 0.5727448631714733, "grad_norm": 8.630890092797227, "learning_rate": 4.550933366299914e-05, "loss": 0.2569, "step": 1554 }, { "epoch": 0.5731134248594859, "grad_norm": 7.177262420708392, "learning_rate": 4.5506243046112005e-05, "loss": 0.3205, "step": 1555 }, { "epoch": 0.5734819865474984, "grad_norm": 4.175195180019704, "learning_rate": 4.5503152429224876e-05, "loss": 0.2938, "step": 1556 }, { "epoch": 0.573850548235511, "grad_norm": 11.081556555417563, "learning_rate": 4.550006181233775e-05, "loss": 0.4807, "step": 1557 }, { "epoch": 0.5742191099235234, "grad_norm": 12.536996032991715, "learning_rate": 4.549697119545062e-05, "loss": 0.3606, "step": 1558 }, { "epoch": 0.574587671611536, "grad_norm": 4.456848301254543, "learning_rate": 4.549388057856348e-05, "loss": 0.223, "step": 1559 }, { "epoch": 0.5749562332995485, "grad_norm": 16.740955245132557, "learning_rate": 4.5490789961676354e-05, "loss": 0.4246, "step": 1560 }, { "epoch": 0.5753247949875611, "grad_norm": 8.880929604750353, "learning_rate": 4.548769934478922e-05, "loss": 0.3029, "step": 1561 }, { "epoch": 0.5756933566755735, "grad_norm": 9.461558415318503, "learning_rate": 4.548460872790209e-05, "loss": 0.4556, "step": 1562 }, { "epoch": 0.5760619183635861, "grad_norm": 4.94252974814308, "learning_rate": 4.548151811101496e-05, "loss": 0.361, "step": 1563 }, { "epoch": 0.5764304800515986, "grad_norm": 7.900893979252394, "learning_rate": 4.5478427494127826e-05, "loss": 0.2999, "step": 1564 }, { "epoch": 0.5767990417396112, "grad_norm": 10.65703179408583, "learning_rate": 4.54753368772407e-05, "loss": 0.52, "step": 1565 }, { "epoch": 0.5771676034276237, "grad_norm": 7.5910984009470805, "learning_rate": 4.547224626035357e-05, "loss": 0.2711, "step": 1566 }, { "epoch": 0.5775361651156362, "grad_norm": 5.756966392844398, "learning_rate": 4.546915564346644e-05, "loss": 0.2582, "step": 1567 }, { "epoch": 0.5779047268036488, "grad_norm": 4.362809117249423, "learning_rate": 4.546606502657931e-05, "loss": 0.2474, "step": 1568 }, { "epoch": 0.5782732884916613, "grad_norm": 10.00080677592731, "learning_rate": 4.5462974409692175e-05, "loss": 0.566, "step": 1569 }, { "epoch": 0.5786418501796738, "grad_norm": 8.363639756153006, "learning_rate": 4.5459883792805046e-05, "loss": 0.3731, "step": 1570 }, { "epoch": 0.5790104118676863, "grad_norm": 5.504720742663692, "learning_rate": 4.545679317591792e-05, "loss": 0.221, "step": 1571 }, { "epoch": 0.5793789735556989, "grad_norm": 5.670742962313572, "learning_rate": 4.545370255903079e-05, "loss": 0.5141, "step": 1572 }, { "epoch": 0.5797475352437114, "grad_norm": 7.028665703463561, "learning_rate": 4.545061194214365e-05, "loss": 0.4163, "step": 1573 }, { "epoch": 0.580116096931724, "grad_norm": 6.9818656902611425, "learning_rate": 4.5447521325256524e-05, "loss": 0.3831, "step": 1574 }, { "epoch": 0.5804846586197365, "grad_norm": 6.433453223078191, "learning_rate": 4.5444430708369395e-05, "loss": 0.2141, "step": 1575 }, { "epoch": 0.580853220307749, "grad_norm": 8.885433887198166, "learning_rate": 4.544134009148226e-05, "loss": 0.4141, "step": 1576 }, { "epoch": 0.5812217819957616, "grad_norm": 7.989555693723242, "learning_rate": 4.543824947459513e-05, "loss": 0.2596, "step": 1577 }, { "epoch": 0.581590343683774, "grad_norm": 5.477896578216027, "learning_rate": 4.5435158857708e-05, "loss": 0.3992, "step": 1578 }, { "epoch": 0.5819589053717866, "grad_norm": 14.118572021551413, "learning_rate": 4.5432068240820866e-05, "loss": 0.3831, "step": 1579 }, { "epoch": 0.5823274670597991, "grad_norm": 13.966365192470557, "learning_rate": 4.542897762393374e-05, "loss": 0.3928, "step": 1580 }, { "epoch": 0.5826960287478117, "grad_norm": 17.560525043258867, "learning_rate": 4.542588700704661e-05, "loss": 0.4607, "step": 1581 }, { "epoch": 0.5830645904358241, "grad_norm": 9.050663657118042, "learning_rate": 4.542279639015948e-05, "loss": 0.4125, "step": 1582 }, { "epoch": 0.5834331521238367, "grad_norm": 3.959219836632302, "learning_rate": 4.5419705773272344e-05, "loss": 0.2812, "step": 1583 }, { "epoch": 0.5838017138118493, "grad_norm": 5.92506053265904, "learning_rate": 4.5416615156385216e-05, "loss": 0.4612, "step": 1584 }, { "epoch": 0.5841702754998618, "grad_norm": 8.161188377162915, "learning_rate": 4.541352453949809e-05, "loss": 0.2234, "step": 1585 }, { "epoch": 0.5845388371878744, "grad_norm": 10.715001821655745, "learning_rate": 4.541043392261096e-05, "loss": 0.2647, "step": 1586 }, { "epoch": 0.5849073988758868, "grad_norm": 4.522090942035234, "learning_rate": 4.540734330572383e-05, "loss": 0.224, "step": 1587 }, { "epoch": 0.5852759605638994, "grad_norm": 6.755920074622516, "learning_rate": 4.5404252688836694e-05, "loss": 0.3126, "step": 1588 }, { "epoch": 0.5856445222519119, "grad_norm": 6.500216553822036, "learning_rate": 4.5401162071949565e-05, "loss": 0.2094, "step": 1589 }, { "epoch": 0.5860130839399245, "grad_norm": 6.8203701091137985, "learning_rate": 4.5398071455062436e-05, "loss": 0.298, "step": 1590 }, { "epoch": 0.5863816456279369, "grad_norm": 5.9443782117407995, "learning_rate": 4.53949808381753e-05, "loss": 0.3051, "step": 1591 }, { "epoch": 0.5867502073159495, "grad_norm": 5.799335954733881, "learning_rate": 4.539189022128817e-05, "loss": 0.2853, "step": 1592 }, { "epoch": 0.5871187690039621, "grad_norm": 7.266477867030286, "learning_rate": 4.5388799604401036e-05, "loss": 0.4176, "step": 1593 }, { "epoch": 0.5874873306919746, "grad_norm": 9.284761426055574, "learning_rate": 4.538570898751391e-05, "loss": 0.4526, "step": 1594 }, { "epoch": 0.5878558923799871, "grad_norm": 5.750396051450296, "learning_rate": 4.538261837062678e-05, "loss": 0.3517, "step": 1595 }, { "epoch": 0.5882244540679996, "grad_norm": 6.487099096005585, "learning_rate": 4.537952775373965e-05, "loss": 0.3925, "step": 1596 }, { "epoch": 0.5885930157560122, "grad_norm": 4.717411887311811, "learning_rate": 4.537643713685252e-05, "loss": 0.3214, "step": 1597 }, { "epoch": 0.5889615774440247, "grad_norm": 5.21727416591174, "learning_rate": 4.5373346519965385e-05, "loss": 0.4211, "step": 1598 }, { "epoch": 0.5893301391320372, "grad_norm": 6.172274574189766, "learning_rate": 4.5370255903078256e-05, "loss": 0.2981, "step": 1599 }, { "epoch": 0.5896987008200497, "grad_norm": 9.75559641318526, "learning_rate": 4.536716528619113e-05, "loss": 0.6199, "step": 1600 }, { "epoch": 0.5896987008200497, "eval_bleu": 0.14225626573681474, "eval_bleu_1gram": 0.49872377472440893, "eval_bleu_2gram": 0.2973828346350141, "eval_bleu_3gram": 0.1835567608929293, "eval_bleu_4gram": 0.12113579232280423, "eval_rag_val_loss": 0.42018274218503854, "eval_rouge1": 0.4868654172843418, "eval_rouge2": 0.28816075775596406, "eval_rougeL": 0.48659379433879263, "step": 1600 }, { "epoch": 0.5900672625080623, "grad_norm": 6.329470945791068, "learning_rate": 4.5364074669304e-05, "loss": 0.366, "step": 1601 }, { "epoch": 0.5904358241960748, "grad_norm": 6.897543550072481, "learning_rate": 4.536098405241686e-05, "loss": 0.3771, "step": 1602 }, { "epoch": 0.5908043858840873, "grad_norm": 18.50512294461489, "learning_rate": 4.5357893435529734e-05, "loss": 0.2864, "step": 1603 }, { "epoch": 0.5911729475720999, "grad_norm": 5.092191305565853, "learning_rate": 4.5354802818642605e-05, "loss": 0.3685, "step": 1604 }, { "epoch": 0.5915415092601124, "grad_norm": 8.116509329376001, "learning_rate": 4.535171220175548e-05, "loss": 0.2338, "step": 1605 }, { "epoch": 0.591910070948125, "grad_norm": 6.70315402364783, "learning_rate": 4.534862158486834e-05, "loss": 0.3959, "step": 1606 }, { "epoch": 0.5922786326361374, "grad_norm": 9.366958220843902, "learning_rate": 4.534553096798121e-05, "loss": 0.3614, "step": 1607 }, { "epoch": 0.59264719432415, "grad_norm": 9.58737716496646, "learning_rate": 4.534244035109408e-05, "loss": 0.3731, "step": 1608 }, { "epoch": 0.5930157560121625, "grad_norm": 21.491310670272576, "learning_rate": 4.533934973420695e-05, "loss": 0.3492, "step": 1609 }, { "epoch": 0.5933843177001751, "grad_norm": 5.096020900282227, "learning_rate": 4.533625911731982e-05, "loss": 0.2088, "step": 1610 }, { "epoch": 0.5937528793881875, "grad_norm": 6.509570192367194, "learning_rate": 4.533316850043269e-05, "loss": 0.3472, "step": 1611 }, { "epoch": 0.5941214410762001, "grad_norm": 8.011994430136303, "learning_rate": 4.5330077883545555e-05, "loss": 0.4161, "step": 1612 }, { "epoch": 0.5944900027642127, "grad_norm": 14.546145968909707, "learning_rate": 4.5326987266658426e-05, "loss": 0.2921, "step": 1613 }, { "epoch": 0.5948585644522252, "grad_norm": 7.528206491247003, "learning_rate": 4.53238966497713e-05, "loss": 0.3877, "step": 1614 }, { "epoch": 0.5952271261402378, "grad_norm": 9.199612683975138, "learning_rate": 4.532080603288417e-05, "loss": 0.3753, "step": 1615 }, { "epoch": 0.5955956878282502, "grad_norm": 2.889156814116814, "learning_rate": 4.531771541599704e-05, "loss": 0.1658, "step": 1616 }, { "epoch": 0.5959642495162628, "grad_norm": 6.401807780055818, "learning_rate": 4.5314624799109904e-05, "loss": 0.3307, "step": 1617 }, { "epoch": 0.5963328112042753, "grad_norm": 3.810487560004138, "learning_rate": 4.5311534182222775e-05, "loss": 0.253, "step": 1618 }, { "epoch": 0.5967013728922879, "grad_norm": 8.043899726807975, "learning_rate": 4.5308443565335646e-05, "loss": 0.4503, "step": 1619 }, { "epoch": 0.5970699345803003, "grad_norm": 6.378984589805235, "learning_rate": 4.530535294844852e-05, "loss": 0.3433, "step": 1620 }, { "epoch": 0.5974384962683129, "grad_norm": 12.730658727295276, "learning_rate": 4.530226233156138e-05, "loss": 0.3875, "step": 1621 }, { "epoch": 0.5978070579563255, "grad_norm": 3.29208445412638, "learning_rate": 4.5299171714674246e-05, "loss": 0.2325, "step": 1622 }, { "epoch": 0.598175619644338, "grad_norm": 3.612666491199505, "learning_rate": 4.529608109778712e-05, "loss": 0.2639, "step": 1623 }, { "epoch": 0.5985441813323505, "grad_norm": 6.187397349112997, "learning_rate": 4.529299048089999e-05, "loss": 0.3384, "step": 1624 }, { "epoch": 0.598912743020363, "grad_norm": 6.2504138046606315, "learning_rate": 4.528989986401286e-05, "loss": 0.3275, "step": 1625 }, { "epoch": 0.5992813047083756, "grad_norm": 11.361437414862431, "learning_rate": 4.528680924712573e-05, "loss": 0.3572, "step": 1626 }, { "epoch": 0.5996498663963881, "grad_norm": 9.1650847948546, "learning_rate": 4.5283718630238595e-05, "loss": 0.4029, "step": 1627 }, { "epoch": 0.6000184280844006, "grad_norm": 5.735842649485619, "learning_rate": 4.528062801335147e-05, "loss": 0.277, "step": 1628 }, { "epoch": 0.6003869897724131, "grad_norm": 14.276163276266667, "learning_rate": 4.527753739646434e-05, "loss": 0.401, "step": 1629 }, { "epoch": 0.6007555514604257, "grad_norm": 12.26114505666522, "learning_rate": 4.527444677957721e-05, "loss": 0.2269, "step": 1630 }, { "epoch": 0.6011241131484382, "grad_norm": 8.355442726554847, "learning_rate": 4.5271356162690073e-05, "loss": 0.4922, "step": 1631 }, { "epoch": 0.6014926748364507, "grad_norm": 9.144921699801907, "learning_rate": 4.5268265545802945e-05, "loss": 0.2971, "step": 1632 }, { "epoch": 0.6018612365244633, "grad_norm": 4.845403179955034, "learning_rate": 4.5265174928915816e-05, "loss": 0.3768, "step": 1633 }, { "epoch": 0.6022297982124758, "grad_norm": 8.534648511785742, "learning_rate": 4.526208431202869e-05, "loss": 0.2742, "step": 1634 }, { "epoch": 0.6025983599004884, "grad_norm": 5.3612064535374095, "learning_rate": 4.525899369514156e-05, "loss": 0.3722, "step": 1635 }, { "epoch": 0.6029669215885008, "grad_norm": 7.324351821707795, "learning_rate": 4.5255903078254416e-05, "loss": 0.3321, "step": 1636 }, { "epoch": 0.6033354832765134, "grad_norm": 5.721972948753737, "learning_rate": 4.525281246136729e-05, "loss": 0.4414, "step": 1637 }, { "epoch": 0.6037040449645259, "grad_norm": 5.873982442838039, "learning_rate": 4.524972184448016e-05, "loss": 0.2646, "step": 1638 }, { "epoch": 0.6040726066525385, "grad_norm": 9.254840306120732, "learning_rate": 4.524663122759303e-05, "loss": 0.379, "step": 1639 }, { "epoch": 0.604441168340551, "grad_norm": 11.60600385841412, "learning_rate": 4.52435406107059e-05, "loss": 0.4887, "step": 1640 }, { "epoch": 0.6048097300285635, "grad_norm": 7.230673714426751, "learning_rate": 4.5240449993818765e-05, "loss": 0.3599, "step": 1641 }, { "epoch": 0.6051782917165761, "grad_norm": 7.704527566673687, "learning_rate": 4.5237359376931636e-05, "loss": 0.3309, "step": 1642 }, { "epoch": 0.6055468534045886, "grad_norm": 42.13985156409251, "learning_rate": 4.523426876004451e-05, "loss": 0.5639, "step": 1643 }, { "epoch": 0.6059154150926012, "grad_norm": 5.172751176726765, "learning_rate": 4.523117814315738e-05, "loss": 0.3185, "step": 1644 }, { "epoch": 0.6062839767806136, "grad_norm": 8.727431920795997, "learning_rate": 4.522808752627024e-05, "loss": 0.413, "step": 1645 }, { "epoch": 0.6066525384686262, "grad_norm": 7.033636747688744, "learning_rate": 4.5224996909383114e-05, "loss": 0.4518, "step": 1646 }, { "epoch": 0.6070211001566387, "grad_norm": 4.303634819463863, "learning_rate": 4.5221906292495985e-05, "loss": 0.3277, "step": 1647 }, { "epoch": 0.6073896618446513, "grad_norm": 5.1803403417061515, "learning_rate": 4.5218815675608857e-05, "loss": 0.2889, "step": 1648 }, { "epoch": 0.6077582235326637, "grad_norm": 5.523932492793202, "learning_rate": 4.521572505872173e-05, "loss": 0.3513, "step": 1649 }, { "epoch": 0.6081267852206763, "grad_norm": 4.569172200161268, "learning_rate": 4.521263444183459e-05, "loss": 0.2881, "step": 1650 }, { "epoch": 0.6084953469086889, "grad_norm": 6.797518153838047, "learning_rate": 4.520954382494746e-05, "loss": 0.425, "step": 1651 }, { "epoch": 0.6088639085967014, "grad_norm": 9.167904255871361, "learning_rate": 4.520645320806033e-05, "loss": 0.4017, "step": 1652 }, { "epoch": 0.609232470284714, "grad_norm": 8.463763919357461, "learning_rate": 4.52033625911732e-05, "loss": 0.5224, "step": 1653 }, { "epoch": 0.6096010319727264, "grad_norm": 12.601706592093118, "learning_rate": 4.520027197428607e-05, "loss": 0.3699, "step": 1654 }, { "epoch": 0.609969593660739, "grad_norm": 4.503693230679067, "learning_rate": 4.5197181357398935e-05, "loss": 0.2515, "step": 1655 }, { "epoch": 0.6103381553487515, "grad_norm": 8.355313064672766, "learning_rate": 4.5194090740511806e-05, "loss": 0.3991, "step": 1656 }, { "epoch": 0.610706717036764, "grad_norm": 5.9228389227245595, "learning_rate": 4.519100012362468e-05, "loss": 0.3658, "step": 1657 }, { "epoch": 0.6110752787247765, "grad_norm": 9.640274412436106, "learning_rate": 4.518790950673755e-05, "loss": 0.3412, "step": 1658 }, { "epoch": 0.6114438404127891, "grad_norm": 5.5056761149219335, "learning_rate": 4.518481888985042e-05, "loss": 0.3327, "step": 1659 }, { "epoch": 0.6118124021008016, "grad_norm": 19.168206371438274, "learning_rate": 4.5181728272963284e-05, "loss": 0.3105, "step": 1660 }, { "epoch": 0.6121809637888141, "grad_norm": 10.277910890375914, "learning_rate": 4.5178637656076155e-05, "loss": 0.5601, "step": 1661 }, { "epoch": 0.6125495254768267, "grad_norm": 4.6623695662102795, "learning_rate": 4.5175547039189026e-05, "loss": 0.2648, "step": 1662 }, { "epoch": 0.6129180871648392, "grad_norm": 12.470516987892383, "learning_rate": 4.51724564223019e-05, "loss": 0.4562, "step": 1663 }, { "epoch": 0.6132866488528518, "grad_norm": 13.502503339586585, "learning_rate": 4.516936580541476e-05, "loss": 0.4523, "step": 1664 }, { "epoch": 0.6136552105408642, "grad_norm": 5.226094634094538, "learning_rate": 4.516627518852763e-05, "loss": 0.2788, "step": 1665 }, { "epoch": 0.6140237722288768, "grad_norm": 6.94243439469953, "learning_rate": 4.51631845716405e-05, "loss": 0.2358, "step": 1666 }, { "epoch": 0.6143923339168893, "grad_norm": 8.100656515760196, "learning_rate": 4.516009395475337e-05, "loss": 0.3053, "step": 1667 }, { "epoch": 0.6147608956049019, "grad_norm": 4.620304301425238, "learning_rate": 4.515700333786624e-05, "loss": 0.4316, "step": 1668 }, { "epoch": 0.6151294572929144, "grad_norm": 6.104634584930317, "learning_rate": 4.515391272097911e-05, "loss": 0.3331, "step": 1669 }, { "epoch": 0.6154980189809269, "grad_norm": 22.722533696699763, "learning_rate": 4.5150822104091975e-05, "loss": 0.495, "step": 1670 }, { "epoch": 0.6158665806689395, "grad_norm": 9.960045149980418, "learning_rate": 4.5147731487204847e-05, "loss": 0.3184, "step": 1671 }, { "epoch": 0.616235142356952, "grad_norm": 7.643944890804247, "learning_rate": 4.514464087031772e-05, "loss": 0.3644, "step": 1672 }, { "epoch": 0.6166037040449646, "grad_norm": 6.254760455111777, "learning_rate": 4.514155025343059e-05, "loss": 0.3443, "step": 1673 }, { "epoch": 0.616972265732977, "grad_norm": 5.7374066646522195, "learning_rate": 4.5138459636543453e-05, "loss": 0.3128, "step": 1674 }, { "epoch": 0.6173408274209896, "grad_norm": 6.853945588447178, "learning_rate": 4.5135369019656325e-05, "loss": 0.3842, "step": 1675 }, { "epoch": 0.6177093891090021, "grad_norm": 6.6710885006456975, "learning_rate": 4.5132278402769196e-05, "loss": 0.3625, "step": 1676 }, { "epoch": 0.6180779507970147, "grad_norm": 6.033326419102762, "learning_rate": 4.512918778588207e-05, "loss": 0.3529, "step": 1677 }, { "epoch": 0.6184465124850271, "grad_norm": 4.792370995987643, "learning_rate": 4.512609716899494e-05, "loss": 0.3392, "step": 1678 }, { "epoch": 0.6188150741730397, "grad_norm": 5.831757178129759, "learning_rate": 4.51230065521078e-05, "loss": 0.327, "step": 1679 }, { "epoch": 0.6191836358610523, "grad_norm": 6.752980457261248, "learning_rate": 4.5119915935220674e-05, "loss": 0.3824, "step": 1680 }, { "epoch": 0.6195521975490648, "grad_norm": 5.735228430830684, "learning_rate": 4.5116825318333545e-05, "loss": 0.4594, "step": 1681 }, { "epoch": 0.6199207592370773, "grad_norm": 9.638142122283524, "learning_rate": 4.511373470144641e-05, "loss": 0.5925, "step": 1682 }, { "epoch": 0.6202893209250898, "grad_norm": 7.083047060697995, "learning_rate": 4.511064408455928e-05, "loss": 0.2721, "step": 1683 }, { "epoch": 0.6206578826131024, "grad_norm": 6.80164333008914, "learning_rate": 4.5107553467672145e-05, "loss": 0.3953, "step": 1684 }, { "epoch": 0.6210264443011149, "grad_norm": 3.669838848057954, "learning_rate": 4.5104462850785016e-05, "loss": 0.293, "step": 1685 }, { "epoch": 0.6213950059891274, "grad_norm": 5.505349765237233, "learning_rate": 4.510137223389789e-05, "loss": 0.264, "step": 1686 }, { "epoch": 0.6217635676771399, "grad_norm": 7.60564931411236, "learning_rate": 4.509828161701076e-05, "loss": 0.3458, "step": 1687 }, { "epoch": 0.6221321293651525, "grad_norm": 5.503258173564306, "learning_rate": 4.509519100012363e-05, "loss": 0.23, "step": 1688 }, { "epoch": 0.6225006910531651, "grad_norm": 3.4249326379096496, "learning_rate": 4.5092100383236494e-05, "loss": 0.3076, "step": 1689 }, { "epoch": 0.6228692527411775, "grad_norm": 11.278489723555008, "learning_rate": 4.5089009766349365e-05, "loss": 0.4576, "step": 1690 }, { "epoch": 0.6232378144291901, "grad_norm": 7.335811658598501, "learning_rate": 4.5085919149462237e-05, "loss": 0.4121, "step": 1691 }, { "epoch": 0.6236063761172026, "grad_norm": 16.49101613852979, "learning_rate": 4.508282853257511e-05, "loss": 0.4117, "step": 1692 }, { "epoch": 0.6239749378052152, "grad_norm": 3.3678272462698815, "learning_rate": 4.507973791568797e-05, "loss": 0.2225, "step": 1693 }, { "epoch": 0.6243434994932276, "grad_norm": 7.0571914571894485, "learning_rate": 4.507664729880084e-05, "loss": 0.5157, "step": 1694 }, { "epoch": 0.6247120611812402, "grad_norm": 7.04172884520517, "learning_rate": 4.5073556681913715e-05, "loss": 0.3465, "step": 1695 }, { "epoch": 0.6250806228692527, "grad_norm": 9.32764897497706, "learning_rate": 4.5070466065026586e-05, "loss": 0.5556, "step": 1696 }, { "epoch": 0.6254491845572653, "grad_norm": 8.15638142330061, "learning_rate": 4.506737544813945e-05, "loss": 0.3304, "step": 1697 }, { "epoch": 0.6258177462452778, "grad_norm": 6.073874274295704, "learning_rate": 4.506428483125232e-05, "loss": 0.4436, "step": 1698 }, { "epoch": 0.6261863079332903, "grad_norm": 6.3384563740174364, "learning_rate": 4.5061194214365186e-05, "loss": 0.3564, "step": 1699 }, { "epoch": 0.6265548696213029, "grad_norm": 9.540073800431303, "learning_rate": 4.505810359747806e-05, "loss": 0.3377, "step": 1700 }, { "epoch": 0.6269234313093154, "grad_norm": 5.1709621580410525, "learning_rate": 4.505501298059093e-05, "loss": 0.3397, "step": 1701 }, { "epoch": 0.627291992997328, "grad_norm": 4.6602755560875355, "learning_rate": 4.50519223637038e-05, "loss": 0.3137, "step": 1702 }, { "epoch": 0.6276605546853404, "grad_norm": 8.114072993385763, "learning_rate": 4.5048831746816664e-05, "loss": 0.4379, "step": 1703 }, { "epoch": 0.628029116373353, "grad_norm": 10.013106340391296, "learning_rate": 4.5045741129929535e-05, "loss": 0.3577, "step": 1704 }, { "epoch": 0.6283976780613655, "grad_norm": 5.879398403922039, "learning_rate": 4.5042650513042406e-05, "loss": 0.2843, "step": 1705 }, { "epoch": 0.6287662397493781, "grad_norm": 6.611689601916431, "learning_rate": 4.503955989615528e-05, "loss": 0.3557, "step": 1706 }, { "epoch": 0.6291348014373905, "grad_norm": 5.486415038026418, "learning_rate": 4.503646927926815e-05, "loss": 0.2827, "step": 1707 }, { "epoch": 0.6295033631254031, "grad_norm": 6.233291876071959, "learning_rate": 4.503337866238101e-05, "loss": 0.2819, "step": 1708 }, { "epoch": 0.6298719248134157, "grad_norm": 5.86669367148224, "learning_rate": 4.5030288045493884e-05, "loss": 0.2423, "step": 1709 }, { "epoch": 0.6302404865014282, "grad_norm": 5.110644074999562, "learning_rate": 4.5027197428606755e-05, "loss": 0.4029, "step": 1710 }, { "epoch": 0.6306090481894407, "grad_norm": 5.005936722115084, "learning_rate": 4.5024106811719627e-05, "loss": 0.2455, "step": 1711 }, { "epoch": 0.6309776098774532, "grad_norm": 5.105389548448409, "learning_rate": 4.502101619483249e-05, "loss": 0.2907, "step": 1712 }, { "epoch": 0.6313461715654658, "grad_norm": 11.604963201080713, "learning_rate": 4.5017925577945355e-05, "loss": 0.7123, "step": 1713 }, { "epoch": 0.6317147332534783, "grad_norm": 7.191643888659189, "learning_rate": 4.5014834961058227e-05, "loss": 0.3448, "step": 1714 }, { "epoch": 0.6320832949414908, "grad_norm": 7.500835880747699, "learning_rate": 4.50117443441711e-05, "loss": 0.3159, "step": 1715 }, { "epoch": 0.6324518566295033, "grad_norm": 5.411049537038554, "learning_rate": 4.500865372728397e-05, "loss": 0.3297, "step": 1716 }, { "epoch": 0.6328204183175159, "grad_norm": 15.256069288859578, "learning_rate": 4.500556311039683e-05, "loss": 0.3353, "step": 1717 }, { "epoch": 0.6331889800055285, "grad_norm": 8.135992038779175, "learning_rate": 4.5002472493509705e-05, "loss": 0.3411, "step": 1718 }, { "epoch": 0.633557541693541, "grad_norm": 5.1359394831315575, "learning_rate": 4.4999381876622576e-05, "loss": 0.2679, "step": 1719 }, { "epoch": 0.6339261033815535, "grad_norm": 5.208126379351866, "learning_rate": 4.499629125973545e-05, "loss": 0.2951, "step": 1720 }, { "epoch": 0.634294665069566, "grad_norm": 8.652892194807523, "learning_rate": 4.499320064284832e-05, "loss": 0.4873, "step": 1721 }, { "epoch": 0.6346632267575786, "grad_norm": 6.765832488157869, "learning_rate": 4.499011002596118e-05, "loss": 0.351, "step": 1722 }, { "epoch": 0.635031788445591, "grad_norm": 6.7132409509074975, "learning_rate": 4.4987019409074054e-05, "loss": 0.2713, "step": 1723 }, { "epoch": 0.6354003501336036, "grad_norm": 4.039698065626413, "learning_rate": 4.4983928792186925e-05, "loss": 0.2117, "step": 1724 }, { "epoch": 0.6357689118216161, "grad_norm": 4.159895825182066, "learning_rate": 4.4980838175299796e-05, "loss": 0.3225, "step": 1725 }, { "epoch": 0.6361374735096287, "grad_norm": 8.184785116144525, "learning_rate": 4.497774755841267e-05, "loss": 0.3786, "step": 1726 }, { "epoch": 0.6365060351976412, "grad_norm": 16.2371521851581, "learning_rate": 4.4974656941525525e-05, "loss": 0.3833, "step": 1727 }, { "epoch": 0.6368745968856537, "grad_norm": 9.057977639812313, "learning_rate": 4.4971566324638396e-05, "loss": 0.2745, "step": 1728 }, { "epoch": 0.6372431585736663, "grad_norm": 8.069939544716, "learning_rate": 4.496847570775127e-05, "loss": 0.4404, "step": 1729 }, { "epoch": 0.6376117202616788, "grad_norm": 6.199584873207049, "learning_rate": 4.496538509086414e-05, "loss": 0.3247, "step": 1730 }, { "epoch": 0.6379802819496914, "grad_norm": 7.662643814604212, "learning_rate": 4.496229447397701e-05, "loss": 0.3941, "step": 1731 }, { "epoch": 0.6383488436377038, "grad_norm": 8.953932374563799, "learning_rate": 4.4959203857089874e-05, "loss": 0.3124, "step": 1732 }, { "epoch": 0.6387174053257164, "grad_norm": 5.664182043458309, "learning_rate": 4.4956113240202745e-05, "loss": 0.4196, "step": 1733 }, { "epoch": 0.6390859670137289, "grad_norm": 4.596518584022229, "learning_rate": 4.4953022623315616e-05, "loss": 0.2455, "step": 1734 }, { "epoch": 0.6394545287017415, "grad_norm": 7.936051364178713, "learning_rate": 4.494993200642849e-05, "loss": 0.4408, "step": 1735 }, { "epoch": 0.6398230903897539, "grad_norm": 7.117367588281411, "learning_rate": 4.494684138954135e-05, "loss": 0.5376, "step": 1736 }, { "epoch": 0.6401916520777665, "grad_norm": 5.5037946615284925, "learning_rate": 4.494375077265422e-05, "loss": 0.2947, "step": 1737 }, { "epoch": 0.6405602137657791, "grad_norm": 8.635286251198435, "learning_rate": 4.4940660155767094e-05, "loss": 0.5156, "step": 1738 }, { "epoch": 0.6409287754537916, "grad_norm": 19.882639826116606, "learning_rate": 4.4937569538879966e-05, "loss": 0.388, "step": 1739 }, { "epoch": 0.6412973371418041, "grad_norm": 10.520845338921575, "learning_rate": 4.493447892199284e-05, "loss": 0.4566, "step": 1740 }, { "epoch": 0.6416658988298166, "grad_norm": 7.195789631981634, "learning_rate": 4.49313883051057e-05, "loss": 0.2825, "step": 1741 }, { "epoch": 0.6420344605178292, "grad_norm": 15.745137266285731, "learning_rate": 4.4928297688218566e-05, "loss": 0.313, "step": 1742 }, { "epoch": 0.6424030222058417, "grad_norm": 9.460224606708605, "learning_rate": 4.492520707133144e-05, "loss": 0.5506, "step": 1743 }, { "epoch": 0.6427715838938542, "grad_norm": 12.952369879602994, "learning_rate": 4.492211645444431e-05, "loss": 0.4105, "step": 1744 }, { "epoch": 0.6431401455818667, "grad_norm": 11.545135821323736, "learning_rate": 4.491902583755718e-05, "loss": 0.3914, "step": 1745 }, { "epoch": 0.6435087072698793, "grad_norm": 6.717438325815341, "learning_rate": 4.4915935220670044e-05, "loss": 0.3558, "step": 1746 }, { "epoch": 0.6438772689578919, "grad_norm": 4.25030673546993, "learning_rate": 4.4912844603782915e-05, "loss": 0.2319, "step": 1747 }, { "epoch": 0.6442458306459043, "grad_norm": 11.283747599973262, "learning_rate": 4.4909753986895786e-05, "loss": 0.474, "step": 1748 }, { "epoch": 0.6446143923339169, "grad_norm": 6.283532482333721, "learning_rate": 4.490666337000866e-05, "loss": 0.2806, "step": 1749 }, { "epoch": 0.6449829540219294, "grad_norm": 9.8261670222732, "learning_rate": 4.490357275312153e-05, "loss": 0.242, "step": 1750 }, { "epoch": 0.645351515709942, "grad_norm": 5.394190220201162, "learning_rate": 4.490048213623439e-05, "loss": 0.3008, "step": 1751 }, { "epoch": 0.6457200773979545, "grad_norm": 8.544785936418696, "learning_rate": 4.4897391519347264e-05, "loss": 0.4949, "step": 1752 }, { "epoch": 0.646088639085967, "grad_norm": 6.13161531656878, "learning_rate": 4.4894300902460135e-05, "loss": 0.2814, "step": 1753 }, { "epoch": 0.6464572007739795, "grad_norm": 3.8841059351419953, "learning_rate": 4.4891210285573006e-05, "loss": 0.353, "step": 1754 }, { "epoch": 0.6468257624619921, "grad_norm": 7.027194468432475, "learning_rate": 4.488811966868587e-05, "loss": 0.2175, "step": 1755 }, { "epoch": 0.6471943241500046, "grad_norm": 4.543098480655206, "learning_rate": 4.488502905179874e-05, "loss": 0.2004, "step": 1756 }, { "epoch": 0.6475628858380171, "grad_norm": 7.178623456840971, "learning_rate": 4.4881938434911606e-05, "loss": 0.2179, "step": 1757 }, { "epoch": 0.6479314475260297, "grad_norm": 7.792287329230684, "learning_rate": 4.487884781802448e-05, "loss": 0.3808, "step": 1758 }, { "epoch": 0.6483000092140422, "grad_norm": 4.542373368928836, "learning_rate": 4.487575720113735e-05, "loss": 0.2767, "step": 1759 }, { "epoch": 0.6486685709020548, "grad_norm": 5.4451935808504155, "learning_rate": 4.487266658425022e-05, "loss": 0.409, "step": 1760 }, { "epoch": 0.6490371325900672, "grad_norm": 5.870268519432001, "learning_rate": 4.4869575967363084e-05, "loss": 0.2745, "step": 1761 }, { "epoch": 0.6494056942780798, "grad_norm": 7.836971439139652, "learning_rate": 4.4866485350475956e-05, "loss": 0.3388, "step": 1762 }, { "epoch": 0.6497742559660923, "grad_norm": 6.174416629555543, "learning_rate": 4.486339473358883e-05, "loss": 0.2517, "step": 1763 }, { "epoch": 0.6501428176541049, "grad_norm": 6.056575121582258, "learning_rate": 4.48603041167017e-05, "loss": 0.392, "step": 1764 }, { "epoch": 0.6505113793421173, "grad_norm": 6.354044261649733, "learning_rate": 4.485721349981456e-05, "loss": 0.2405, "step": 1765 }, { "epoch": 0.6508799410301299, "grad_norm": 27.110869915554627, "learning_rate": 4.4854122882927434e-05, "loss": 0.4005, "step": 1766 }, { "epoch": 0.6512485027181425, "grad_norm": 4.078390880116876, "learning_rate": 4.4851032266040305e-05, "loss": 0.2064, "step": 1767 }, { "epoch": 0.651617064406155, "grad_norm": 5.239308324109958, "learning_rate": 4.4847941649153176e-05, "loss": 0.3232, "step": 1768 }, { "epoch": 0.6519856260941675, "grad_norm": 9.439823092079937, "learning_rate": 4.484485103226605e-05, "loss": 0.4731, "step": 1769 }, { "epoch": 0.65235418778218, "grad_norm": 6.117373954493962, "learning_rate": 4.484176041537891e-05, "loss": 0.2787, "step": 1770 }, { "epoch": 0.6527227494701926, "grad_norm": 6.890286737906225, "learning_rate": 4.483866979849178e-05, "loss": 0.3803, "step": 1771 }, { "epoch": 0.6530913111582051, "grad_norm": 12.240747811405228, "learning_rate": 4.483557918160465e-05, "loss": 0.3483, "step": 1772 }, { "epoch": 0.6534598728462176, "grad_norm": 6.529340108853273, "learning_rate": 4.483248856471752e-05, "loss": 0.2992, "step": 1773 }, { "epoch": 0.6538284345342301, "grad_norm": 6.080470870506665, "learning_rate": 4.482939794783039e-05, "loss": 0.2821, "step": 1774 }, { "epoch": 0.6541969962222427, "grad_norm": 5.451074427817193, "learning_rate": 4.4826307330943254e-05, "loss": 0.2945, "step": 1775 }, { "epoch": 0.6545655579102553, "grad_norm": 5.792805438527417, "learning_rate": 4.4823216714056125e-05, "loss": 0.3226, "step": 1776 }, { "epoch": 0.6549341195982677, "grad_norm": 16.72600328204606, "learning_rate": 4.4820126097168996e-05, "loss": 0.371, "step": 1777 }, { "epoch": 0.6553026812862803, "grad_norm": 6.230578038324405, "learning_rate": 4.481703548028187e-05, "loss": 0.376, "step": 1778 }, { "epoch": 0.6556712429742928, "grad_norm": 4.250055873727964, "learning_rate": 4.481394486339474e-05, "loss": 0.2445, "step": 1779 }, { "epoch": 0.6560398046623054, "grad_norm": 7.304563209169855, "learning_rate": 4.48108542465076e-05, "loss": 0.2981, "step": 1780 }, { "epoch": 0.6564083663503179, "grad_norm": 9.60051754510084, "learning_rate": 4.4807763629620474e-05, "loss": 0.6203, "step": 1781 }, { "epoch": 0.6567769280383304, "grad_norm": 5.311854424239844, "learning_rate": 4.4804673012733346e-05, "loss": 0.3852, "step": 1782 }, { "epoch": 0.6571454897263429, "grad_norm": 18.854770758911226, "learning_rate": 4.480158239584622e-05, "loss": 0.3424, "step": 1783 }, { "epoch": 0.6575140514143555, "grad_norm": 10.64097166687031, "learning_rate": 4.479849177895908e-05, "loss": 0.5135, "step": 1784 }, { "epoch": 0.6578826131023681, "grad_norm": 11.288773244564155, "learning_rate": 4.479540116207195e-05, "loss": 0.3753, "step": 1785 }, { "epoch": 0.6582511747903805, "grad_norm": 14.574293608257229, "learning_rate": 4.4792310545184824e-05, "loss": 0.5288, "step": 1786 }, { "epoch": 0.6586197364783931, "grad_norm": 6.809565200705273, "learning_rate": 4.478921992829769e-05, "loss": 0.3469, "step": 1787 }, { "epoch": 0.6589882981664056, "grad_norm": 4.95112834362027, "learning_rate": 4.478612931141056e-05, "loss": 0.2223, "step": 1788 }, { "epoch": 0.6593568598544182, "grad_norm": 6.769348952122035, "learning_rate": 4.478303869452343e-05, "loss": 0.3657, "step": 1789 }, { "epoch": 0.6597254215424306, "grad_norm": 5.351238870232674, "learning_rate": 4.4779948077636295e-05, "loss": 0.3419, "step": 1790 }, { "epoch": 0.6600939832304432, "grad_norm": 6.499430704862133, "learning_rate": 4.4776857460749166e-05, "loss": 0.5427, "step": 1791 }, { "epoch": 0.6604625449184557, "grad_norm": 6.757489605725219, "learning_rate": 4.477376684386204e-05, "loss": 0.4305, "step": 1792 }, { "epoch": 0.6608311066064683, "grad_norm": 7.2359678026856855, "learning_rate": 4.477067622697491e-05, "loss": 0.3153, "step": 1793 }, { "epoch": 0.6611996682944807, "grad_norm": 6.649551387942548, "learning_rate": 4.476758561008777e-05, "loss": 0.4991, "step": 1794 }, { "epoch": 0.6615682299824933, "grad_norm": 5.9240361198235965, "learning_rate": 4.4764494993200644e-05, "loss": 0.3071, "step": 1795 }, { "epoch": 0.6619367916705059, "grad_norm": 6.744207934792981, "learning_rate": 4.4761404376313515e-05, "loss": 0.3187, "step": 1796 }, { "epoch": 0.6623053533585184, "grad_norm": 10.565563231735855, "learning_rate": 4.4758313759426386e-05, "loss": 0.4849, "step": 1797 }, { "epoch": 0.662673915046531, "grad_norm": 15.000623562885332, "learning_rate": 4.475522314253926e-05, "loss": 0.4001, "step": 1798 }, { "epoch": 0.6630424767345434, "grad_norm": 4.371974007587701, "learning_rate": 4.475213252565212e-05, "loss": 0.2279, "step": 1799 }, { "epoch": 0.663411038422556, "grad_norm": 6.208974852049161, "learning_rate": 4.474904190876499e-05, "loss": 0.3495, "step": 1800 }, { "epoch": 0.6637796001105685, "grad_norm": 6.01869214625297, "learning_rate": 4.4745951291877864e-05, "loss": 0.245, "step": 1801 }, { "epoch": 0.664148161798581, "grad_norm": 15.757830898291193, "learning_rate": 4.474286067499073e-05, "loss": 0.2924, "step": 1802 }, { "epoch": 0.6645167234865935, "grad_norm": 8.794570322823391, "learning_rate": 4.47397700581036e-05, "loss": 0.5754, "step": 1803 }, { "epoch": 0.6648852851746061, "grad_norm": 5.0716966523633635, "learning_rate": 4.4736679441216464e-05, "loss": 0.3182, "step": 1804 }, { "epoch": 0.6652538468626187, "grad_norm": 7.048088020078178, "learning_rate": 4.4733588824329336e-05, "loss": 0.3058, "step": 1805 }, { "epoch": 0.6656224085506312, "grad_norm": 4.300639774722341, "learning_rate": 4.473049820744221e-05, "loss": 0.3171, "step": 1806 }, { "epoch": 0.6659909702386437, "grad_norm": 7.775172054435465, "learning_rate": 4.472740759055508e-05, "loss": 0.3268, "step": 1807 }, { "epoch": 0.6663595319266562, "grad_norm": 9.068521538031554, "learning_rate": 4.472431697366794e-05, "loss": 0.3377, "step": 1808 }, { "epoch": 0.6667280936146688, "grad_norm": 23.260579645178396, "learning_rate": 4.4721226356780814e-05, "loss": 0.5297, "step": 1809 }, { "epoch": 0.6670966553026813, "grad_norm": 7.0020645366833705, "learning_rate": 4.4718135739893685e-05, "loss": 0.3261, "step": 1810 }, { "epoch": 0.6674652169906938, "grad_norm": 6.195149326941641, "learning_rate": 4.4715045123006556e-05, "loss": 0.413, "step": 1811 }, { "epoch": 0.6678337786787063, "grad_norm": 10.828349057476254, "learning_rate": 4.471195450611943e-05, "loss": 0.3571, "step": 1812 }, { "epoch": 0.6682023403667189, "grad_norm": 6.7095890488969685, "learning_rate": 4.470886388923229e-05, "loss": 0.1436, "step": 1813 }, { "epoch": 0.6685709020547315, "grad_norm": 9.045027944543794, "learning_rate": 4.470577327234516e-05, "loss": 0.4195, "step": 1814 }, { "epoch": 0.6689394637427439, "grad_norm": 5.9009501581147665, "learning_rate": 4.4702682655458034e-05, "loss": 0.293, "step": 1815 }, { "epoch": 0.6693080254307565, "grad_norm": 7.332905005459419, "learning_rate": 4.4699592038570905e-05, "loss": 0.3107, "step": 1816 }, { "epoch": 0.669676587118769, "grad_norm": 15.670279607073613, "learning_rate": 4.4696501421683776e-05, "loss": 0.4459, "step": 1817 }, { "epoch": 0.6700451488067816, "grad_norm": 7.174083095540415, "learning_rate": 4.4693410804796634e-05, "loss": 0.2767, "step": 1818 }, { "epoch": 0.670413710494794, "grad_norm": 5.143278509619584, "learning_rate": 4.4690320187909505e-05, "loss": 0.3571, "step": 1819 }, { "epoch": 0.6707822721828066, "grad_norm": 6.553645381903727, "learning_rate": 4.4687229571022376e-05, "loss": 0.4899, "step": 1820 }, { "epoch": 0.6711508338708191, "grad_norm": 6.132760250573605, "learning_rate": 4.468413895413525e-05, "loss": 0.2582, "step": 1821 }, { "epoch": 0.6715193955588317, "grad_norm": 4.480014927702966, "learning_rate": 4.468104833724812e-05, "loss": 0.2801, "step": 1822 }, { "epoch": 0.6718879572468441, "grad_norm": 6.748648614180159, "learning_rate": 4.467795772036098e-05, "loss": 0.3619, "step": 1823 }, { "epoch": 0.6722565189348567, "grad_norm": 6.400995248657809, "learning_rate": 4.4674867103473854e-05, "loss": 0.3418, "step": 1824 }, { "epoch": 0.6726250806228693, "grad_norm": 5.598243655937995, "learning_rate": 4.4671776486586726e-05, "loss": 0.4109, "step": 1825 }, { "epoch": 0.6729936423108818, "grad_norm": 4.380145289828839, "learning_rate": 4.46686858696996e-05, "loss": 0.2805, "step": 1826 }, { "epoch": 0.6733622039988943, "grad_norm": 5.866381227889772, "learning_rate": 4.466559525281246e-05, "loss": 0.4398, "step": 1827 }, { "epoch": 0.6737307656869068, "grad_norm": 12.229338991856197, "learning_rate": 4.466250463592533e-05, "loss": 0.3639, "step": 1828 }, { "epoch": 0.6740993273749194, "grad_norm": 7.063182156457029, "learning_rate": 4.4659414019038204e-05, "loss": 0.3516, "step": 1829 }, { "epoch": 0.6744678890629319, "grad_norm": 6.4808395859087, "learning_rate": 4.4656323402151075e-05, "loss": 0.2935, "step": 1830 }, { "epoch": 0.6748364507509444, "grad_norm": 5.570632647159531, "learning_rate": 4.4653232785263946e-05, "loss": 0.4013, "step": 1831 }, { "epoch": 0.6752050124389569, "grad_norm": 13.309925958784305, "learning_rate": 4.465014216837681e-05, "loss": 0.3824, "step": 1832 }, { "epoch": 0.6755735741269695, "grad_norm": 6.15824369882368, "learning_rate": 4.4647051551489675e-05, "loss": 0.3572, "step": 1833 }, { "epoch": 0.6759421358149821, "grad_norm": 10.744952059035748, "learning_rate": 4.4643960934602546e-05, "loss": 0.175, "step": 1834 }, { "epoch": 0.6763106975029946, "grad_norm": 7.158828470723558, "learning_rate": 4.464087031771542e-05, "loss": 0.4354, "step": 1835 }, { "epoch": 0.6766792591910071, "grad_norm": 5.383870811125456, "learning_rate": 4.463777970082829e-05, "loss": 0.2709, "step": 1836 }, { "epoch": 0.6770478208790196, "grad_norm": 6.230111177043337, "learning_rate": 4.463468908394115e-05, "loss": 0.3707, "step": 1837 }, { "epoch": 0.6774163825670322, "grad_norm": 10.16961560448421, "learning_rate": 4.4631598467054024e-05, "loss": 0.6204, "step": 1838 }, { "epoch": 0.6777849442550447, "grad_norm": 6.117383308248639, "learning_rate": 4.4628507850166895e-05, "loss": 0.2936, "step": 1839 }, { "epoch": 0.6781535059430572, "grad_norm": 4.1468091164639675, "learning_rate": 4.4625417233279766e-05, "loss": 0.2192, "step": 1840 }, { "epoch": 0.6785220676310697, "grad_norm": 9.681601814590861, "learning_rate": 4.462232661639264e-05, "loss": 0.6263, "step": 1841 }, { "epoch": 0.6788906293190823, "grad_norm": 10.431841567117731, "learning_rate": 4.46192359995055e-05, "loss": 0.2668, "step": 1842 }, { "epoch": 0.6792591910070949, "grad_norm": 9.618114039445636, "learning_rate": 4.461614538261837e-05, "loss": 0.3892, "step": 1843 }, { "epoch": 0.6796277526951073, "grad_norm": 6.647003198417719, "learning_rate": 4.4613054765731244e-05, "loss": 0.2987, "step": 1844 }, { "epoch": 0.6799963143831199, "grad_norm": 9.41130381325202, "learning_rate": 4.4609964148844116e-05, "loss": 0.5299, "step": 1845 }, { "epoch": 0.6803648760711324, "grad_norm": 9.075121279239752, "learning_rate": 4.460687353195698e-05, "loss": 0.3153, "step": 1846 }, { "epoch": 0.680733437759145, "grad_norm": 3.9731514621133632, "learning_rate": 4.460378291506985e-05, "loss": 0.2871, "step": 1847 }, { "epoch": 0.6811019994471574, "grad_norm": 8.88172077861367, "learning_rate": 4.4600692298182716e-05, "loss": 0.3676, "step": 1848 }, { "epoch": 0.68147056113517, "grad_norm": 5.939984574731392, "learning_rate": 4.459760168129559e-05, "loss": 0.3086, "step": 1849 }, { "epoch": 0.6818391228231825, "grad_norm": 14.449383489531277, "learning_rate": 4.459451106440846e-05, "loss": 0.3388, "step": 1850 }, { "epoch": 0.6822076845111951, "grad_norm": 5.3426776724704315, "learning_rate": 4.459142044752133e-05, "loss": 0.2676, "step": 1851 }, { "epoch": 0.6825762461992075, "grad_norm": 13.111024809587434, "learning_rate": 4.4588329830634194e-05, "loss": 0.3917, "step": 1852 }, { "epoch": 0.6829448078872201, "grad_norm": 6.7174082281206235, "learning_rate": 4.4585239213747065e-05, "loss": 0.3954, "step": 1853 }, { "epoch": 0.6833133695752327, "grad_norm": 6.18118734761916, "learning_rate": 4.4582148596859936e-05, "loss": 0.3389, "step": 1854 }, { "epoch": 0.6836819312632452, "grad_norm": 15.546145750798752, "learning_rate": 4.457905797997281e-05, "loss": 0.351, "step": 1855 }, { "epoch": 0.6840504929512577, "grad_norm": 10.690959883470729, "learning_rate": 4.457596736308567e-05, "loss": 0.4328, "step": 1856 }, { "epoch": 0.6844190546392702, "grad_norm": 6.082741207760358, "learning_rate": 4.457287674619854e-05, "loss": 0.341, "step": 1857 }, { "epoch": 0.6847876163272828, "grad_norm": 6.186829193435843, "learning_rate": 4.4569786129311414e-05, "loss": 0.3346, "step": 1858 }, { "epoch": 0.6851561780152953, "grad_norm": 5.888614544186094, "learning_rate": 4.4566695512424285e-05, "loss": 0.3059, "step": 1859 }, { "epoch": 0.6855247397033079, "grad_norm": 7.816458957843279, "learning_rate": 4.4563604895537156e-05, "loss": 0.427, "step": 1860 }, { "epoch": 0.6858933013913203, "grad_norm": 7.701069091296726, "learning_rate": 4.456051427865002e-05, "loss": 0.3285, "step": 1861 }, { "epoch": 0.6862618630793329, "grad_norm": 6.275251418468446, "learning_rate": 4.455742366176289e-05, "loss": 0.3244, "step": 1862 }, { "epoch": 0.6866304247673455, "grad_norm": 16.84242008316538, "learning_rate": 4.4554333044875756e-05, "loss": 0.3063, "step": 1863 }, { "epoch": 0.686998986455358, "grad_norm": 11.038397615769606, "learning_rate": 4.455124242798863e-05, "loss": 0.3356, "step": 1864 }, { "epoch": 0.6873675481433705, "grad_norm": 7.149760272935675, "learning_rate": 4.45481518111015e-05, "loss": 0.1921, "step": 1865 }, { "epoch": 0.687736109831383, "grad_norm": 6.032374143840823, "learning_rate": 4.454506119421436e-05, "loss": 0.4143, "step": 1866 }, { "epoch": 0.6881046715193956, "grad_norm": 6.28211840040282, "learning_rate": 4.4541970577327234e-05, "loss": 0.386, "step": 1867 }, { "epoch": 0.688473233207408, "grad_norm": 10.002540647104247, "learning_rate": 4.4538879960440106e-05, "loss": 0.4895, "step": 1868 }, { "epoch": 0.6888417948954206, "grad_norm": 9.423805254182092, "learning_rate": 4.453578934355298e-05, "loss": 0.391, "step": 1869 }, { "epoch": 0.6892103565834331, "grad_norm": 5.615144021012194, "learning_rate": 4.453269872666585e-05, "loss": 0.4827, "step": 1870 }, { "epoch": 0.6895789182714457, "grad_norm": 5.119030336707799, "learning_rate": 4.452960810977871e-05, "loss": 0.2634, "step": 1871 }, { "epoch": 0.6899474799594583, "grad_norm": 5.3491126920076075, "learning_rate": 4.4526517492891583e-05, "loss": 0.5154, "step": 1872 }, { "epoch": 0.6903160416474707, "grad_norm": 7.760781448973452, "learning_rate": 4.4523426876004455e-05, "loss": 0.3326, "step": 1873 }, { "epoch": 0.6906846033354833, "grad_norm": 5.542538089165345, "learning_rate": 4.4520336259117326e-05, "loss": 0.4154, "step": 1874 }, { "epoch": 0.6910531650234958, "grad_norm": 5.39435269380001, "learning_rate": 4.451724564223019e-05, "loss": 0.3172, "step": 1875 }, { "epoch": 0.6914217267115084, "grad_norm": 4.698944565788006, "learning_rate": 4.451415502534306e-05, "loss": 0.2373, "step": 1876 }, { "epoch": 0.6917902883995208, "grad_norm": 6.038070064192503, "learning_rate": 4.451106440845593e-05, "loss": 0.419, "step": 1877 }, { "epoch": 0.6921588500875334, "grad_norm": 7.001891289388744, "learning_rate": 4.45079737915688e-05, "loss": 0.4753, "step": 1878 }, { "epoch": 0.6925274117755459, "grad_norm": 4.148360716832962, "learning_rate": 4.450488317468167e-05, "loss": 0.2701, "step": 1879 }, { "epoch": 0.6928959734635585, "grad_norm": 8.32041061415156, "learning_rate": 4.450179255779453e-05, "loss": 0.3738, "step": 1880 }, { "epoch": 0.693264535151571, "grad_norm": 7.034330651081184, "learning_rate": 4.4498701940907404e-05, "loss": 0.2751, "step": 1881 }, { "epoch": 0.6936330968395835, "grad_norm": 5.209601326770286, "learning_rate": 4.4495611324020275e-05, "loss": 0.1965, "step": 1882 }, { "epoch": 0.6940016585275961, "grad_norm": 6.5006197120447675, "learning_rate": 4.4492520707133146e-05, "loss": 0.3915, "step": 1883 }, { "epoch": 0.6943702202156086, "grad_norm": 5.656901453765761, "learning_rate": 4.448943009024602e-05, "loss": 0.4052, "step": 1884 }, { "epoch": 0.6947387819036211, "grad_norm": 7.337286259464075, "learning_rate": 4.448633947335888e-05, "loss": 0.3788, "step": 1885 }, { "epoch": 0.6951073435916336, "grad_norm": 6.3325210184199525, "learning_rate": 4.448324885647175e-05, "loss": 0.5444, "step": 1886 }, { "epoch": 0.6954759052796462, "grad_norm": 5.634773579662929, "learning_rate": 4.4480158239584624e-05, "loss": 0.3184, "step": 1887 }, { "epoch": 0.6958444669676587, "grad_norm": 3.4005591774171804, "learning_rate": 4.4477067622697495e-05, "loss": 0.1858, "step": 1888 }, { "epoch": 0.6962130286556713, "grad_norm": 5.329880411262926, "learning_rate": 4.447397700581037e-05, "loss": 0.353, "step": 1889 }, { "epoch": 0.6965815903436837, "grad_norm": 6.100749685777611, "learning_rate": 4.447088638892323e-05, "loss": 0.3069, "step": 1890 }, { "epoch": 0.6969501520316963, "grad_norm": 12.047037126016711, "learning_rate": 4.44677957720361e-05, "loss": 0.5663, "step": 1891 }, { "epoch": 0.6973187137197089, "grad_norm": 5.249470638653386, "learning_rate": 4.4464705155148973e-05, "loss": 0.344, "step": 1892 }, { "epoch": 0.6976872754077214, "grad_norm": 5.866691395678565, "learning_rate": 4.446161453826184e-05, "loss": 0.2696, "step": 1893 }, { "epoch": 0.6980558370957339, "grad_norm": 8.713450800544846, "learning_rate": 4.445852392137471e-05, "loss": 0.4294, "step": 1894 }, { "epoch": 0.6984243987837464, "grad_norm": 5.800578042526573, "learning_rate": 4.4455433304487573e-05, "loss": 0.4262, "step": 1895 }, { "epoch": 0.698792960471759, "grad_norm": 8.182367710963733, "learning_rate": 4.4452342687600445e-05, "loss": 0.4076, "step": 1896 }, { "epoch": 0.6991615221597715, "grad_norm": 6.9709335122336675, "learning_rate": 4.4449252070713316e-05, "loss": 0.3127, "step": 1897 }, { "epoch": 0.699530083847784, "grad_norm": 6.140449191384345, "learning_rate": 4.444616145382619e-05, "loss": 0.3675, "step": 1898 }, { "epoch": 0.6998986455357965, "grad_norm": 6.098348750515188, "learning_rate": 4.444307083693905e-05, "loss": 0.3152, "step": 1899 }, { "epoch": 0.7002672072238091, "grad_norm": 4.9688858877349995, "learning_rate": 4.443998022005192e-05, "loss": 0.2041, "step": 1900 }, { "epoch": 0.7006357689118217, "grad_norm": 7.828804763244265, "learning_rate": 4.4436889603164794e-05, "loss": 0.4625, "step": 1901 }, { "epoch": 0.7010043305998341, "grad_norm": 6.5626447389172515, "learning_rate": 4.4433798986277665e-05, "loss": 0.3857, "step": 1902 }, { "epoch": 0.7013728922878467, "grad_norm": 19.22225895304224, "learning_rate": 4.4430708369390536e-05, "loss": 0.312, "step": 1903 }, { "epoch": 0.7017414539758592, "grad_norm": 6.648274259434707, "learning_rate": 4.44276177525034e-05, "loss": 0.3317, "step": 1904 }, { "epoch": 0.7021100156638718, "grad_norm": 7.157704834293815, "learning_rate": 4.442452713561627e-05, "loss": 0.4337, "step": 1905 }, { "epoch": 0.7024785773518842, "grad_norm": 4.4861197981669925, "learning_rate": 4.442143651872914e-05, "loss": 0.2638, "step": 1906 }, { "epoch": 0.7028471390398968, "grad_norm": 10.084943112168933, "learning_rate": 4.4418345901842014e-05, "loss": 0.6286, "step": 1907 }, { "epoch": 0.7032157007279093, "grad_norm": 5.565744375397644, "learning_rate": 4.441525528495488e-05, "loss": 0.3037, "step": 1908 }, { "epoch": 0.7035842624159219, "grad_norm": 5.453569191928562, "learning_rate": 4.441216466806774e-05, "loss": 0.4376, "step": 1909 }, { "epoch": 0.7039528241039344, "grad_norm": 4.519028591996648, "learning_rate": 4.4409074051180614e-05, "loss": 0.4272, "step": 1910 }, { "epoch": 0.7043213857919469, "grad_norm": 9.497157424488014, "learning_rate": 4.4405983434293485e-05, "loss": 0.4154, "step": 1911 }, { "epoch": 0.7046899474799595, "grad_norm": 13.181253264468834, "learning_rate": 4.440289281740636e-05, "loss": 0.3747, "step": 1912 }, { "epoch": 0.705058509167972, "grad_norm": 7.237800331011612, "learning_rate": 4.439980220051923e-05, "loss": 0.2998, "step": 1913 }, { "epoch": 0.7054270708559846, "grad_norm": 7.529874937277838, "learning_rate": 4.439671158363209e-05, "loss": 0.3924, "step": 1914 }, { "epoch": 0.705795632543997, "grad_norm": 6.302769176480461, "learning_rate": 4.4393620966744963e-05, "loss": 0.4298, "step": 1915 }, { "epoch": 0.7061641942320096, "grad_norm": 11.547303146663186, "learning_rate": 4.4390530349857835e-05, "loss": 0.38, "step": 1916 }, { "epoch": 0.7065327559200221, "grad_norm": 9.246360268919672, "learning_rate": 4.4387439732970706e-05, "loss": 0.3606, "step": 1917 }, { "epoch": 0.7069013176080347, "grad_norm": 4.314748868739984, "learning_rate": 4.438434911608357e-05, "loss": 0.3332, "step": 1918 }, { "epoch": 0.7072698792960471, "grad_norm": 8.508876877577524, "learning_rate": 4.438125849919644e-05, "loss": 0.3649, "step": 1919 }, { "epoch": 0.7076384409840597, "grad_norm": 9.848427088285787, "learning_rate": 4.437816788230931e-05, "loss": 0.595, "step": 1920 }, { "epoch": 0.7080070026720723, "grad_norm": 5.026383030290021, "learning_rate": 4.4375077265422184e-05, "loss": 0.1285, "step": 1921 }, { "epoch": 0.7083755643600848, "grad_norm": 5.3291812867696535, "learning_rate": 4.4371986648535055e-05, "loss": 0.2809, "step": 1922 }, { "epoch": 0.7087441260480973, "grad_norm": 6.107206384110106, "learning_rate": 4.436889603164792e-05, "loss": 0.3482, "step": 1923 }, { "epoch": 0.7091126877361098, "grad_norm": 6.073415780427301, "learning_rate": 4.4365805414760784e-05, "loss": 0.3311, "step": 1924 }, { "epoch": 0.7094812494241224, "grad_norm": 9.245693957215341, "learning_rate": 4.4362714797873655e-05, "loss": 0.3476, "step": 1925 }, { "epoch": 0.7098498111121349, "grad_norm": 20.801521773790224, "learning_rate": 4.4359624180986526e-05, "loss": 0.3761, "step": 1926 }, { "epoch": 0.7102183728001474, "grad_norm": 6.560422732060781, "learning_rate": 4.43565335640994e-05, "loss": 0.4763, "step": 1927 }, { "epoch": 0.7105869344881599, "grad_norm": 58.463538650705196, "learning_rate": 4.435344294721226e-05, "loss": 0.3653, "step": 1928 }, { "epoch": 0.7109554961761725, "grad_norm": 8.819926710451417, "learning_rate": 4.435035233032513e-05, "loss": 0.6145, "step": 1929 }, { "epoch": 0.7113240578641851, "grad_norm": 6.9085639770929195, "learning_rate": 4.4347261713438004e-05, "loss": 0.3213, "step": 1930 }, { "epoch": 0.7116926195521975, "grad_norm": 6.666228788618572, "learning_rate": 4.4344171096550875e-05, "loss": 0.6611, "step": 1931 }, { "epoch": 0.7120611812402101, "grad_norm": 9.16250011657465, "learning_rate": 4.4341080479663747e-05, "loss": 0.5121, "step": 1932 }, { "epoch": 0.7124297429282226, "grad_norm": 8.608872968433271, "learning_rate": 4.433798986277661e-05, "loss": 0.3374, "step": 1933 }, { "epoch": 0.7127983046162352, "grad_norm": 5.642860540213088, "learning_rate": 4.433489924588948e-05, "loss": 0.3401, "step": 1934 }, { "epoch": 0.7131668663042476, "grad_norm": 11.49277302425787, "learning_rate": 4.4331808629002353e-05, "loss": 0.303, "step": 1935 }, { "epoch": 0.7135354279922602, "grad_norm": 7.436104266851055, "learning_rate": 4.4328718012115225e-05, "loss": 0.5929, "step": 1936 }, { "epoch": 0.7139039896802727, "grad_norm": 4.584813150739871, "learning_rate": 4.432562739522809e-05, "loss": 0.2694, "step": 1937 }, { "epoch": 0.7142725513682853, "grad_norm": 5.979832132592383, "learning_rate": 4.432253677834096e-05, "loss": 0.5886, "step": 1938 }, { "epoch": 0.7146411130562978, "grad_norm": 6.070895016090026, "learning_rate": 4.4319446161453825e-05, "loss": 0.3306, "step": 1939 }, { "epoch": 0.7150096747443103, "grad_norm": 5.167241997628962, "learning_rate": 4.4316355544566696e-05, "loss": 0.2396, "step": 1940 }, { "epoch": 0.7153782364323229, "grad_norm": 5.907006916611037, "learning_rate": 4.431326492767957e-05, "loss": 0.3254, "step": 1941 }, { "epoch": 0.7157467981203354, "grad_norm": 3.6696378345209832, "learning_rate": 4.431017431079244e-05, "loss": 0.1842, "step": 1942 }, { "epoch": 0.716115359808348, "grad_norm": 5.998806834793242, "learning_rate": 4.43070836939053e-05, "loss": 0.3924, "step": 1943 }, { "epoch": 0.7164839214963604, "grad_norm": 7.164051317977915, "learning_rate": 4.4303993077018174e-05, "loss": 0.2405, "step": 1944 }, { "epoch": 0.716852483184373, "grad_norm": 5.437942486878708, "learning_rate": 4.4300902460131045e-05, "loss": 0.3483, "step": 1945 }, { "epoch": 0.7172210448723855, "grad_norm": 6.114681661184575, "learning_rate": 4.4297811843243916e-05, "loss": 0.2453, "step": 1946 }, { "epoch": 0.717589606560398, "grad_norm": 3.4381100200027856, "learning_rate": 4.429472122635678e-05, "loss": 0.2381, "step": 1947 }, { "epoch": 0.7179581682484106, "grad_norm": 6.258861515734673, "learning_rate": 4.429163060946965e-05, "loss": 0.3203, "step": 1948 }, { "epoch": 0.7183267299364231, "grad_norm": 4.36113219583413, "learning_rate": 4.428853999258252e-05, "loss": 0.3117, "step": 1949 }, { "epoch": 0.7186952916244357, "grad_norm": 4.976435828879938, "learning_rate": 4.4285449375695394e-05, "loss": 0.3631, "step": 1950 }, { "epoch": 0.7190638533124482, "grad_norm": 7.357618636621031, "learning_rate": 4.4282358758808265e-05, "loss": 0.3797, "step": 1951 }, { "epoch": 0.7194324150004607, "grad_norm": 4.354000155293397, "learning_rate": 4.427926814192113e-05, "loss": 0.2508, "step": 1952 }, { "epoch": 0.7198009766884732, "grad_norm": 6.840767824678188, "learning_rate": 4.4276177525034e-05, "loss": 0.3213, "step": 1953 }, { "epoch": 0.7201695383764858, "grad_norm": 4.781730677558605, "learning_rate": 4.4273086908146865e-05, "loss": 0.3289, "step": 1954 }, { "epoch": 0.7205381000644983, "grad_norm": 7.213954901765538, "learning_rate": 4.4269996291259737e-05, "loss": 0.3782, "step": 1955 }, { "epoch": 0.7209066617525108, "grad_norm": 12.956494821387482, "learning_rate": 4.426690567437261e-05, "loss": 0.2967, "step": 1956 }, { "epoch": 0.7212752234405233, "grad_norm": 7.703308227093186, "learning_rate": 4.426381505748547e-05, "loss": 0.4834, "step": 1957 }, { "epoch": 0.7216437851285359, "grad_norm": 10.852727422398612, "learning_rate": 4.426072444059834e-05, "loss": 0.3457, "step": 1958 }, { "epoch": 0.7220123468165485, "grad_norm": 3.9150827871170835, "learning_rate": 4.4257633823711215e-05, "loss": 0.2015, "step": 1959 }, { "epoch": 0.7223809085045609, "grad_norm": 4.9280033771453775, "learning_rate": 4.4254543206824086e-05, "loss": 0.2249, "step": 1960 }, { "epoch": 0.7227494701925735, "grad_norm": 6.984124300764925, "learning_rate": 4.425145258993696e-05, "loss": 0.4178, "step": 1961 }, { "epoch": 0.723118031880586, "grad_norm": 5.8297414711217845, "learning_rate": 4.424836197304982e-05, "loss": 0.3312, "step": 1962 }, { "epoch": 0.7234865935685986, "grad_norm": 5.714630143820688, "learning_rate": 4.424527135616269e-05, "loss": 0.3059, "step": 1963 }, { "epoch": 0.723855155256611, "grad_norm": 10.451801666748882, "learning_rate": 4.4242180739275564e-05, "loss": 0.3306, "step": 1964 }, { "epoch": 0.7242237169446236, "grad_norm": 15.714312129803538, "learning_rate": 4.4239090122388435e-05, "loss": 0.2503, "step": 1965 }, { "epoch": 0.7245922786326361, "grad_norm": 5.393255415747727, "learning_rate": 4.42359995055013e-05, "loss": 0.4143, "step": 1966 }, { "epoch": 0.7249608403206487, "grad_norm": 6.827000296208694, "learning_rate": 4.423290888861417e-05, "loss": 0.4716, "step": 1967 }, { "epoch": 0.7253294020086613, "grad_norm": 9.374369282169152, "learning_rate": 4.422981827172704e-05, "loss": 0.4205, "step": 1968 }, { "epoch": 0.7256979636966737, "grad_norm": 7.383177797675595, "learning_rate": 4.4226727654839906e-05, "loss": 0.3822, "step": 1969 }, { "epoch": 0.7260665253846863, "grad_norm": 15.13018155207756, "learning_rate": 4.422363703795278e-05, "loss": 0.6625, "step": 1970 }, { "epoch": 0.7264350870726988, "grad_norm": 5.313891239878074, "learning_rate": 4.422054642106564e-05, "loss": 0.3408, "step": 1971 }, { "epoch": 0.7268036487607114, "grad_norm": 37.471887248767906, "learning_rate": 4.421745580417851e-05, "loss": 0.3266, "step": 1972 }, { "epoch": 0.7271722104487238, "grad_norm": 5.610081535195631, "learning_rate": 4.4214365187291384e-05, "loss": 0.4403, "step": 1973 }, { "epoch": 0.7275407721367364, "grad_norm": 6.74557576181592, "learning_rate": 4.4211274570404255e-05, "loss": 0.4869, "step": 1974 }, { "epoch": 0.7279093338247489, "grad_norm": 5.817377156729876, "learning_rate": 4.4208183953517127e-05, "loss": 0.2196, "step": 1975 }, { "epoch": 0.7282778955127615, "grad_norm": 9.220108811404593, "learning_rate": 4.420509333662999e-05, "loss": 0.4565, "step": 1976 }, { "epoch": 0.728646457200774, "grad_norm": 5.016591962255998, "learning_rate": 4.420200271974286e-05, "loss": 0.3726, "step": 1977 }, { "epoch": 0.7290150188887865, "grad_norm": 5.910043295558087, "learning_rate": 4.419891210285573e-05, "loss": 0.2953, "step": 1978 }, { "epoch": 0.7293835805767991, "grad_norm": 4.4033856111051515, "learning_rate": 4.4195821485968605e-05, "loss": 0.4034, "step": 1979 }, { "epoch": 0.7297521422648116, "grad_norm": 4.956133870211047, "learning_rate": 4.419273086908147e-05, "loss": 0.3437, "step": 1980 }, { "epoch": 0.7301207039528241, "grad_norm": 6.375168442838239, "learning_rate": 4.418964025219434e-05, "loss": 0.5224, "step": 1981 }, { "epoch": 0.7304892656408366, "grad_norm": 5.603911818295888, "learning_rate": 4.418654963530721e-05, "loss": 0.2703, "step": 1982 }, { "epoch": 0.7308578273288492, "grad_norm": 3.1684613579802674, "learning_rate": 4.418345901842008e-05, "loss": 0.2174, "step": 1983 }, { "epoch": 0.7312263890168617, "grad_norm": 5.328681267928847, "learning_rate": 4.418036840153295e-05, "loss": 0.3019, "step": 1984 }, { "epoch": 0.7315949507048742, "grad_norm": 5.917068503618466, "learning_rate": 4.417727778464582e-05, "loss": 0.3404, "step": 1985 }, { "epoch": 0.7319635123928867, "grad_norm": 8.131256042525138, "learning_rate": 4.417418716775868e-05, "loss": 0.4028, "step": 1986 }, { "epoch": 0.7323320740808993, "grad_norm": 8.034947356910331, "learning_rate": 4.4171096550871554e-05, "loss": 0.2458, "step": 1987 }, { "epoch": 0.7327006357689119, "grad_norm": 29.13589522607809, "learning_rate": 4.4168005933984425e-05, "loss": 0.4843, "step": 1988 }, { "epoch": 0.7330691974569243, "grad_norm": 5.91139537985018, "learning_rate": 4.4164915317097296e-05, "loss": 0.2405, "step": 1989 }, { "epoch": 0.7334377591449369, "grad_norm": 7.431111645513472, "learning_rate": 4.416182470021016e-05, "loss": 0.3823, "step": 1990 }, { "epoch": 0.7338063208329494, "grad_norm": 8.556672500867926, "learning_rate": 4.415873408332303e-05, "loss": 0.5479, "step": 1991 }, { "epoch": 0.734174882520962, "grad_norm": 5.599753456138158, "learning_rate": 4.41556434664359e-05, "loss": 0.3302, "step": 1992 }, { "epoch": 0.7345434442089744, "grad_norm": 5.906852247424205, "learning_rate": 4.4152552849548774e-05, "loss": 0.3525, "step": 1993 }, { "epoch": 0.734912005896987, "grad_norm": 14.728258564467046, "learning_rate": 4.4149462232661645e-05, "loss": 0.5694, "step": 1994 }, { "epoch": 0.7352805675849995, "grad_norm": 9.212449018526252, "learning_rate": 4.414637161577451e-05, "loss": 0.3195, "step": 1995 }, { "epoch": 0.7356491292730121, "grad_norm": 11.153212531834647, "learning_rate": 4.414328099888738e-05, "loss": 0.6808, "step": 1996 }, { "epoch": 0.7360176909610247, "grad_norm": 6.496032677854831, "learning_rate": 4.414019038200025e-05, "loss": 0.2877, "step": 1997 }, { "epoch": 0.7363862526490371, "grad_norm": 19.722699188530203, "learning_rate": 4.413709976511312e-05, "loss": 0.4535, "step": 1998 }, { "epoch": 0.7367548143370497, "grad_norm": 7.7509772392158265, "learning_rate": 4.413400914822599e-05, "loss": 0.5222, "step": 1999 }, { "epoch": 0.7371233760250622, "grad_norm": 6.789908651599837, "learning_rate": 4.413091853133885e-05, "loss": 0.3898, "step": 2000 }, { "epoch": 0.7374919377130748, "grad_norm": 4.727624896743252, "learning_rate": 4.412782791445172e-05, "loss": 0.2721, "step": 2001 }, { "epoch": 0.7378604994010872, "grad_norm": 3.8888712534428924, "learning_rate": 4.4124737297564595e-05, "loss": 0.3478, "step": 2002 }, { "epoch": 0.7382290610890998, "grad_norm": 5.913856734572692, "learning_rate": 4.4121646680677466e-05, "loss": 0.3701, "step": 2003 }, { "epoch": 0.7385976227771123, "grad_norm": 8.43757414608625, "learning_rate": 4.411855606379034e-05, "loss": 0.3239, "step": 2004 }, { "epoch": 0.7389661844651249, "grad_norm": 9.788520816267788, "learning_rate": 4.41154654469032e-05, "loss": 0.5122, "step": 2005 }, { "epoch": 0.7393347461531374, "grad_norm": 5.606019272835435, "learning_rate": 4.411237483001607e-05, "loss": 0.3936, "step": 2006 }, { "epoch": 0.7397033078411499, "grad_norm": 7.595594896294383, "learning_rate": 4.4109284213128944e-05, "loss": 0.3887, "step": 2007 }, { "epoch": 0.7400718695291625, "grad_norm": 11.885730914219554, "learning_rate": 4.4106193596241815e-05, "loss": 0.3804, "step": 2008 }, { "epoch": 0.740440431217175, "grad_norm": 4.068600812717093, "learning_rate": 4.410310297935468e-05, "loss": 0.2302, "step": 2009 }, { "epoch": 0.7408089929051875, "grad_norm": 10.375120598884923, "learning_rate": 4.410001236246755e-05, "loss": 0.51, "step": 2010 }, { "epoch": 0.7411775545932, "grad_norm": 5.437534902175611, "learning_rate": 4.409692174558042e-05, "loss": 0.4183, "step": 2011 }, { "epoch": 0.7415461162812126, "grad_norm": 4.616752487835529, "learning_rate": 4.409383112869329e-05, "loss": 0.3221, "step": 2012 }, { "epoch": 0.7419146779692251, "grad_norm": 14.87777760962848, "learning_rate": 4.4090740511806164e-05, "loss": 0.4537, "step": 2013 }, { "epoch": 0.7422832396572376, "grad_norm": 3.918533421422444, "learning_rate": 4.408764989491903e-05, "loss": 0.2488, "step": 2014 }, { "epoch": 0.7426518013452501, "grad_norm": 11.41317881375586, "learning_rate": 4.408455927803189e-05, "loss": 0.4971, "step": 2015 }, { "epoch": 0.7430203630332627, "grad_norm": 7.102316368385096, "learning_rate": 4.4081468661144764e-05, "loss": 0.3289, "step": 2016 }, { "epoch": 0.7433889247212753, "grad_norm": 16.306560034123823, "learning_rate": 4.4078378044257635e-05, "loss": 0.2928, "step": 2017 }, { "epoch": 0.7437574864092877, "grad_norm": 3.905082589227342, "learning_rate": 4.4075287427370506e-05, "loss": 0.2597, "step": 2018 }, { "epoch": 0.7441260480973003, "grad_norm": 4.33459407855057, "learning_rate": 4.407219681048337e-05, "loss": 0.3186, "step": 2019 }, { "epoch": 0.7444946097853128, "grad_norm": 5.9830116730974146, "learning_rate": 4.406910619359624e-05, "loss": 0.3671, "step": 2020 }, { "epoch": 0.7448631714733254, "grad_norm": 7.384795843878266, "learning_rate": 4.406601557670911e-05, "loss": 0.5047, "step": 2021 }, { "epoch": 0.7452317331613378, "grad_norm": 6.900990135912167, "learning_rate": 4.4062924959821984e-05, "loss": 0.2088, "step": 2022 }, { "epoch": 0.7456002948493504, "grad_norm": 4.954435253428984, "learning_rate": 4.4059834342934856e-05, "loss": 0.3036, "step": 2023 }, { "epoch": 0.7459688565373629, "grad_norm": 7.410760592448912, "learning_rate": 4.405674372604772e-05, "loss": 0.4854, "step": 2024 }, { "epoch": 0.7463374182253755, "grad_norm": 7.900062522157905, "learning_rate": 4.405365310916059e-05, "loss": 0.3059, "step": 2025 }, { "epoch": 0.746705979913388, "grad_norm": 9.169580539773378, "learning_rate": 4.405056249227346e-05, "loss": 0.2246, "step": 2026 }, { "epoch": 0.7470745416014005, "grad_norm": 8.914682993577314, "learning_rate": 4.4047471875386334e-05, "loss": 0.5107, "step": 2027 }, { "epoch": 0.7474431032894131, "grad_norm": 10.436219645108904, "learning_rate": 4.40443812584992e-05, "loss": 0.4569, "step": 2028 }, { "epoch": 0.7478116649774256, "grad_norm": 6.144663174541339, "learning_rate": 4.404129064161206e-05, "loss": 0.3086, "step": 2029 }, { "epoch": 0.7481802266654382, "grad_norm": 6.073540770543578, "learning_rate": 4.4038200024724934e-05, "loss": 0.306, "step": 2030 }, { "epoch": 0.7485487883534506, "grad_norm": 4.6791751434620785, "learning_rate": 4.4035109407837805e-05, "loss": 0.4401, "step": 2031 }, { "epoch": 0.7489173500414632, "grad_norm": 9.410814565494999, "learning_rate": 4.4032018790950676e-05, "loss": 0.3595, "step": 2032 }, { "epoch": 0.7492859117294757, "grad_norm": 5.170623904568964, "learning_rate": 4.402892817406355e-05, "loss": 0.4112, "step": 2033 }, { "epoch": 0.7496544734174883, "grad_norm": 10.524385745139714, "learning_rate": 4.402583755717641e-05, "loss": 0.2589, "step": 2034 }, { "epoch": 0.7500230351055008, "grad_norm": 9.043003340916563, "learning_rate": 4.402274694028928e-05, "loss": 0.2549, "step": 2035 }, { "epoch": 0.7503915967935133, "grad_norm": 4.547206591283651, "learning_rate": 4.4019656323402154e-05, "loss": 0.168, "step": 2036 }, { "epoch": 0.7507601584815259, "grad_norm": 12.379159555825822, "learning_rate": 4.4016565706515025e-05, "loss": 0.5179, "step": 2037 }, { "epoch": 0.7511287201695384, "grad_norm": 9.389416812096961, "learning_rate": 4.401347508962789e-05, "loss": 0.34, "step": 2038 }, { "epoch": 0.7514972818575509, "grad_norm": 8.654537761472076, "learning_rate": 4.401038447274076e-05, "loss": 0.2847, "step": 2039 }, { "epoch": 0.7518658435455634, "grad_norm": 10.168628274087046, "learning_rate": 4.400729385585363e-05, "loss": 0.4315, "step": 2040 }, { "epoch": 0.752234405233576, "grad_norm": 22.897660501573355, "learning_rate": 4.40042032389665e-05, "loss": 0.3231, "step": 2041 }, { "epoch": 0.7526029669215885, "grad_norm": 6.245500699839783, "learning_rate": 4.4001112622079374e-05, "loss": 0.2313, "step": 2042 }, { "epoch": 0.752971528609601, "grad_norm": 4.6174547663498675, "learning_rate": 4.399802200519224e-05, "loss": 0.3327, "step": 2043 }, { "epoch": 0.7533400902976136, "grad_norm": 7.8644430455375485, "learning_rate": 4.39949313883051e-05, "loss": 0.5466, "step": 2044 }, { "epoch": 0.7537086519856261, "grad_norm": 8.263601477261593, "learning_rate": 4.3991840771417974e-05, "loss": 0.3662, "step": 2045 }, { "epoch": 0.7540772136736387, "grad_norm": 5.290967499608099, "learning_rate": 4.3988750154530846e-05, "loss": 0.2782, "step": 2046 }, { "epoch": 0.7544457753616511, "grad_norm": 8.616207340278207, "learning_rate": 4.398565953764372e-05, "loss": 0.3872, "step": 2047 }, { "epoch": 0.7548143370496637, "grad_norm": 9.088671547618402, "learning_rate": 4.398256892075658e-05, "loss": 0.4939, "step": 2048 }, { "epoch": 0.7551828987376762, "grad_norm": 6.409766446290855, "learning_rate": 4.397947830386945e-05, "loss": 0.2632, "step": 2049 }, { "epoch": 0.7555514604256888, "grad_norm": 7.360495836307417, "learning_rate": 4.3976387686982324e-05, "loss": 0.2126, "step": 2050 }, { "epoch": 0.7559200221137012, "grad_norm": 5.365532987913201, "learning_rate": 4.3973297070095195e-05, "loss": 0.2799, "step": 2051 }, { "epoch": 0.7562885838017138, "grad_norm": 9.22234023706793, "learning_rate": 4.397020645320806e-05, "loss": 0.3995, "step": 2052 }, { "epoch": 0.7566571454897263, "grad_norm": 13.12410943779258, "learning_rate": 4.396711583632093e-05, "loss": 0.3085, "step": 2053 }, { "epoch": 0.7570257071777389, "grad_norm": 7.275167519649623, "learning_rate": 4.39640252194338e-05, "loss": 0.4646, "step": 2054 }, { "epoch": 0.7573942688657515, "grad_norm": 9.48495577144818, "learning_rate": 4.396093460254667e-05, "loss": 0.3347, "step": 2055 }, { "epoch": 0.7577628305537639, "grad_norm": 7.088495535119988, "learning_rate": 4.3957843985659544e-05, "loss": 0.3689, "step": 2056 }, { "epoch": 0.7581313922417765, "grad_norm": 4.9810849036758, "learning_rate": 4.395475336877241e-05, "loss": 0.341, "step": 2057 }, { "epoch": 0.758499953929789, "grad_norm": 4.211670311062113, "learning_rate": 4.395166275188528e-05, "loss": 0.1971, "step": 2058 }, { "epoch": 0.7588685156178016, "grad_norm": 5.271085449139844, "learning_rate": 4.3948572134998144e-05, "loss": 0.3374, "step": 2059 }, { "epoch": 0.759237077305814, "grad_norm": 10.222879397879726, "learning_rate": 4.3945481518111015e-05, "loss": 0.4183, "step": 2060 }, { "epoch": 0.7596056389938266, "grad_norm": 9.760900028816943, "learning_rate": 4.3942390901223886e-05, "loss": 0.3831, "step": 2061 }, { "epoch": 0.7599742006818391, "grad_norm": 13.380734934330494, "learning_rate": 4.393930028433675e-05, "loss": 0.3002, "step": 2062 }, { "epoch": 0.7603427623698517, "grad_norm": 6.0193664805483005, "learning_rate": 4.393620966744962e-05, "loss": 0.3448, "step": 2063 }, { "epoch": 0.7607113240578642, "grad_norm": 9.870507872403511, "learning_rate": 4.393311905056249e-05, "loss": 0.2933, "step": 2064 }, { "epoch": 0.7610798857458767, "grad_norm": 5.5742919745978865, "learning_rate": 4.3930028433675364e-05, "loss": 0.2579, "step": 2065 }, { "epoch": 0.7614484474338893, "grad_norm": 6.293484288414915, "learning_rate": 4.3926937816788236e-05, "loss": 0.4834, "step": 2066 }, { "epoch": 0.7618170091219018, "grad_norm": 6.605602152518973, "learning_rate": 4.39238471999011e-05, "loss": 0.3033, "step": 2067 }, { "epoch": 0.7621855708099143, "grad_norm": 8.622684776160117, "learning_rate": 4.392075658301397e-05, "loss": 0.3454, "step": 2068 }, { "epoch": 0.7625541324979268, "grad_norm": 10.41606891188554, "learning_rate": 4.391766596612684e-05, "loss": 0.4647, "step": 2069 }, { "epoch": 0.7629226941859394, "grad_norm": 3.8202912811750966, "learning_rate": 4.3914575349239714e-05, "loss": 0.2286, "step": 2070 }, { "epoch": 0.7632912558739519, "grad_norm": 4.733673144397306, "learning_rate": 4.391148473235258e-05, "loss": 0.2303, "step": 2071 }, { "epoch": 0.7636598175619644, "grad_norm": 12.982605080182049, "learning_rate": 4.390839411546545e-05, "loss": 0.3772, "step": 2072 }, { "epoch": 0.764028379249977, "grad_norm": 6.733180247665927, "learning_rate": 4.390530349857832e-05, "loss": 0.4282, "step": 2073 }, { "epoch": 0.7643969409379895, "grad_norm": 7.714130208299927, "learning_rate": 4.390221288169119e-05, "loss": 0.2905, "step": 2074 }, { "epoch": 0.7647655026260021, "grad_norm": 25.207455891452657, "learning_rate": 4.3899122264804056e-05, "loss": 0.4252, "step": 2075 }, { "epoch": 0.7651340643140145, "grad_norm": 5.166031501038932, "learning_rate": 4.389603164791693e-05, "loss": 0.3633, "step": 2076 }, { "epoch": 0.7655026260020271, "grad_norm": 5.161874743414375, "learning_rate": 4.389294103102979e-05, "loss": 0.2986, "step": 2077 }, { "epoch": 0.7658711876900396, "grad_norm": 4.409126803565744, "learning_rate": 4.388985041414266e-05, "loss": 0.3228, "step": 2078 }, { "epoch": 0.7662397493780522, "grad_norm": 7.264365860316994, "learning_rate": 4.3886759797255534e-05, "loss": 0.4109, "step": 2079 }, { "epoch": 0.7666083110660646, "grad_norm": 4.108047096747636, "learning_rate": 4.3883669180368405e-05, "loss": 0.2937, "step": 2080 }, { "epoch": 0.7669768727540772, "grad_norm": 6.876454702276526, "learning_rate": 4.388057856348127e-05, "loss": 0.3074, "step": 2081 }, { "epoch": 0.7673454344420897, "grad_norm": 8.86511195422469, "learning_rate": 4.387748794659414e-05, "loss": 0.328, "step": 2082 }, { "epoch": 0.7677139961301023, "grad_norm": 15.703555458963246, "learning_rate": 4.387439732970701e-05, "loss": 0.354, "step": 2083 }, { "epoch": 0.7680825578181149, "grad_norm": 6.554155548661832, "learning_rate": 4.387130671281988e-05, "loss": 0.3431, "step": 2084 }, { "epoch": 0.7684511195061273, "grad_norm": 12.346522384470632, "learning_rate": 4.3868216095932754e-05, "loss": 0.4366, "step": 2085 }, { "epoch": 0.7688196811941399, "grad_norm": 10.482402405597254, "learning_rate": 4.386512547904562e-05, "loss": 0.3475, "step": 2086 }, { "epoch": 0.7691882428821524, "grad_norm": 5.143071389655387, "learning_rate": 4.386203486215849e-05, "loss": 0.3219, "step": 2087 }, { "epoch": 0.769556804570165, "grad_norm": 3.055161383279263, "learning_rate": 4.385894424527136e-05, "loss": 0.2495, "step": 2088 }, { "epoch": 0.7699253662581774, "grad_norm": 3.3654874329004323, "learning_rate": 4.385585362838423e-05, "loss": 0.1926, "step": 2089 }, { "epoch": 0.77029392794619, "grad_norm": 5.7026775027774095, "learning_rate": 4.38527630114971e-05, "loss": 0.3313, "step": 2090 }, { "epoch": 0.7706624896342025, "grad_norm": 7.07456358517289, "learning_rate": 4.384967239460996e-05, "loss": 0.2807, "step": 2091 }, { "epoch": 0.771031051322215, "grad_norm": 7.1302797264238365, "learning_rate": 4.384658177772283e-05, "loss": 0.4889, "step": 2092 }, { "epoch": 0.7713996130102276, "grad_norm": 6.758532747022694, "learning_rate": 4.3843491160835704e-05, "loss": 0.3347, "step": 2093 }, { "epoch": 0.7717681746982401, "grad_norm": 9.25246747072774, "learning_rate": 4.3840400543948575e-05, "loss": 0.343, "step": 2094 }, { "epoch": 0.7721367363862527, "grad_norm": 12.882277126534879, "learning_rate": 4.3837309927061446e-05, "loss": 0.4542, "step": 2095 }, { "epoch": 0.7725052980742652, "grad_norm": 11.074601101203656, "learning_rate": 4.383421931017431e-05, "loss": 0.2728, "step": 2096 }, { "epoch": 0.7728738597622777, "grad_norm": 6.134917035400723, "learning_rate": 4.383112869328718e-05, "loss": 0.2663, "step": 2097 }, { "epoch": 0.7732424214502902, "grad_norm": 5.464239734701974, "learning_rate": 4.382803807640005e-05, "loss": 0.2746, "step": 2098 }, { "epoch": 0.7736109831383028, "grad_norm": 9.517530629434642, "learning_rate": 4.3824947459512924e-05, "loss": 0.4631, "step": 2099 }, { "epoch": 0.7739795448263153, "grad_norm": 4.794624919203246, "learning_rate": 4.382185684262579e-05, "loss": 0.2057, "step": 2100 }, { "epoch": 0.7743481065143278, "grad_norm": 4.772509595522618, "learning_rate": 4.381876622573866e-05, "loss": 0.4918, "step": 2101 }, { "epoch": 0.7747166682023404, "grad_norm": 6.693883398384818, "learning_rate": 4.381567560885153e-05, "loss": 0.3827, "step": 2102 }, { "epoch": 0.7750852298903529, "grad_norm": 5.479030341518138, "learning_rate": 4.38125849919644e-05, "loss": 0.1779, "step": 2103 }, { "epoch": 0.7754537915783655, "grad_norm": 10.405694986841565, "learning_rate": 4.380949437507727e-05, "loss": 0.444, "step": 2104 }, { "epoch": 0.7758223532663779, "grad_norm": 6.251019509133246, "learning_rate": 4.380640375819014e-05, "loss": 0.4642, "step": 2105 }, { "epoch": 0.7761909149543905, "grad_norm": 23.92451304036824, "learning_rate": 4.3803313141303e-05, "loss": 0.3308, "step": 2106 }, { "epoch": 0.776559476642403, "grad_norm": 15.245067361524574, "learning_rate": 4.380022252441587e-05, "loss": 0.3417, "step": 2107 }, { "epoch": 0.7769280383304156, "grad_norm": 6.254541806790536, "learning_rate": 4.3797131907528744e-05, "loss": 0.2977, "step": 2108 }, { "epoch": 0.777296600018428, "grad_norm": 4.088148638387648, "learning_rate": 4.3794041290641616e-05, "loss": 0.2459, "step": 2109 }, { "epoch": 0.7776651617064406, "grad_norm": 5.396844167792465, "learning_rate": 4.379095067375448e-05, "loss": 0.3022, "step": 2110 }, { "epoch": 0.7780337233944531, "grad_norm": 14.18961321851359, "learning_rate": 4.378786005686735e-05, "loss": 0.363, "step": 2111 }, { "epoch": 0.7784022850824657, "grad_norm": 5.685782372312779, "learning_rate": 4.378476943998022e-05, "loss": 0.3432, "step": 2112 }, { "epoch": 0.7787708467704783, "grad_norm": 4.999449890392238, "learning_rate": 4.3781678823093094e-05, "loss": 0.2708, "step": 2113 }, { "epoch": 0.7791394084584907, "grad_norm": 5.23155824398011, "learning_rate": 4.3778588206205965e-05, "loss": 0.3068, "step": 2114 }, { "epoch": 0.7795079701465033, "grad_norm": 5.6657873855271035, "learning_rate": 4.377549758931883e-05, "loss": 0.3558, "step": 2115 }, { "epoch": 0.7798765318345158, "grad_norm": 5.664430214356984, "learning_rate": 4.37724069724317e-05, "loss": 0.3161, "step": 2116 }, { "epoch": 0.7802450935225284, "grad_norm": 9.369253609973839, "learning_rate": 4.376931635554457e-05, "loss": 0.2851, "step": 2117 }, { "epoch": 0.7806136552105408, "grad_norm": 5.976042282302166, "learning_rate": 4.376622573865744e-05, "loss": 0.3654, "step": 2118 }, { "epoch": 0.7809822168985534, "grad_norm": 7.954830204214346, "learning_rate": 4.376313512177031e-05, "loss": 0.406, "step": 2119 }, { "epoch": 0.7813507785865659, "grad_norm": 6.369045094250602, "learning_rate": 4.376004450488317e-05, "loss": 0.4969, "step": 2120 }, { "epoch": 0.7817193402745785, "grad_norm": 5.59856254657306, "learning_rate": 4.375695388799604e-05, "loss": 0.2958, "step": 2121 }, { "epoch": 0.782087901962591, "grad_norm": 7.597175735929613, "learning_rate": 4.3753863271108914e-05, "loss": 0.421, "step": 2122 }, { "epoch": 0.7824564636506035, "grad_norm": 4.286882245905384, "learning_rate": 4.3750772654221785e-05, "loss": 0.2224, "step": 2123 }, { "epoch": 0.7828250253386161, "grad_norm": 8.768599877462488, "learning_rate": 4.3747682037334656e-05, "loss": 0.4996, "step": 2124 }, { "epoch": 0.7831935870266286, "grad_norm": 7.794272192842208, "learning_rate": 4.374459142044752e-05, "loss": 0.4707, "step": 2125 }, { "epoch": 0.7835621487146411, "grad_norm": 8.026816246354116, "learning_rate": 4.374150080356039e-05, "loss": 0.2764, "step": 2126 }, { "epoch": 0.7839307104026536, "grad_norm": 13.078396847906403, "learning_rate": 4.373841018667326e-05, "loss": 0.4309, "step": 2127 }, { "epoch": 0.7842992720906662, "grad_norm": 8.225015435015912, "learning_rate": 4.3735319569786134e-05, "loss": 0.2968, "step": 2128 }, { "epoch": 0.7846678337786787, "grad_norm": 7.257130503563509, "learning_rate": 4.3732228952899e-05, "loss": 0.2244, "step": 2129 }, { "epoch": 0.7850363954666912, "grad_norm": 5.237403289703223, "learning_rate": 4.372913833601187e-05, "loss": 0.3291, "step": 2130 }, { "epoch": 0.7854049571547038, "grad_norm": 6.155300008990918, "learning_rate": 4.372604771912474e-05, "loss": 0.2853, "step": 2131 }, { "epoch": 0.7857735188427163, "grad_norm": 4.547717249096999, "learning_rate": 4.372295710223761e-05, "loss": 0.3322, "step": 2132 }, { "epoch": 0.7861420805307289, "grad_norm": 5.501919498144488, "learning_rate": 4.3719866485350484e-05, "loss": 0.311, "step": 2133 }, { "epoch": 0.7865106422187413, "grad_norm": 3.6646695478026783, "learning_rate": 4.371677586846335e-05, "loss": 0.2133, "step": 2134 }, { "epoch": 0.7868792039067539, "grad_norm": 4.814223340104633, "learning_rate": 4.371368525157621e-05, "loss": 0.3343, "step": 2135 }, { "epoch": 0.7872477655947664, "grad_norm": 6.857215767427381, "learning_rate": 4.3710594634689084e-05, "loss": 0.404, "step": 2136 }, { "epoch": 0.787616327282779, "grad_norm": 8.09293458579525, "learning_rate": 4.3707504017801955e-05, "loss": 0.4354, "step": 2137 }, { "epoch": 0.7879848889707914, "grad_norm": 5.727735453927301, "learning_rate": 4.3704413400914826e-05, "loss": 0.4217, "step": 2138 }, { "epoch": 0.788353450658804, "grad_norm": 4.279406519590317, "learning_rate": 4.370132278402769e-05, "loss": 0.3201, "step": 2139 }, { "epoch": 0.7887220123468166, "grad_norm": 3.362215340629618, "learning_rate": 4.369823216714056e-05, "loss": 0.1883, "step": 2140 }, { "epoch": 0.7890905740348291, "grad_norm": 6.683960735650881, "learning_rate": 4.369514155025343e-05, "loss": 0.3503, "step": 2141 }, { "epoch": 0.7894591357228417, "grad_norm": 8.895024878567346, "learning_rate": 4.3692050933366304e-05, "loss": 0.413, "step": 2142 }, { "epoch": 0.7898276974108541, "grad_norm": 6.628285582960884, "learning_rate": 4.368896031647917e-05, "loss": 0.4159, "step": 2143 }, { "epoch": 0.7901962590988667, "grad_norm": 6.210702260678802, "learning_rate": 4.368586969959204e-05, "loss": 0.5977, "step": 2144 }, { "epoch": 0.7905648207868792, "grad_norm": 7.710052903287005, "learning_rate": 4.368277908270491e-05, "loss": 0.3944, "step": 2145 }, { "epoch": 0.7909333824748918, "grad_norm": 2.4618991486681354, "learning_rate": 4.367968846581778e-05, "loss": 0.1735, "step": 2146 }, { "epoch": 0.7913019441629042, "grad_norm": 9.946918750394673, "learning_rate": 4.367659784893065e-05, "loss": 0.2346, "step": 2147 }, { "epoch": 0.7916705058509168, "grad_norm": 13.251472823140146, "learning_rate": 4.367350723204352e-05, "loss": 0.3837, "step": 2148 }, { "epoch": 0.7920390675389293, "grad_norm": 8.807996099459055, "learning_rate": 4.367041661515639e-05, "loss": 0.4981, "step": 2149 }, { "epoch": 0.7924076292269419, "grad_norm": 7.191560875153941, "learning_rate": 4.366732599826925e-05, "loss": 0.5099, "step": 2150 }, { "epoch": 0.7927761909149544, "grad_norm": 4.732544397144866, "learning_rate": 4.3664235381382124e-05, "loss": 0.231, "step": 2151 }, { "epoch": 0.7931447526029669, "grad_norm": 5.872966982745597, "learning_rate": 4.3661144764494996e-05, "loss": 0.2876, "step": 2152 }, { "epoch": 0.7935133142909795, "grad_norm": 6.294426150780304, "learning_rate": 4.365805414760786e-05, "loss": 0.304, "step": 2153 }, { "epoch": 0.793881875978992, "grad_norm": 11.498148893561764, "learning_rate": 4.365496353072073e-05, "loss": 0.2597, "step": 2154 }, { "epoch": 0.7942504376670045, "grad_norm": 3.0525088138437453, "learning_rate": 4.36518729138336e-05, "loss": 0.2171, "step": 2155 }, { "epoch": 0.794618999355017, "grad_norm": 13.944172856969288, "learning_rate": 4.3648782296946473e-05, "loss": 0.5218, "step": 2156 }, { "epoch": 0.7949875610430296, "grad_norm": 5.7078282859118215, "learning_rate": 4.3645691680059345e-05, "loss": 0.4264, "step": 2157 }, { "epoch": 0.7953561227310421, "grad_norm": 6.752818896528797, "learning_rate": 4.364260106317221e-05, "loss": 0.3529, "step": 2158 }, { "epoch": 0.7957246844190546, "grad_norm": 5.681108173744548, "learning_rate": 4.363951044628508e-05, "loss": 0.4595, "step": 2159 }, { "epoch": 0.7960932461070672, "grad_norm": 4.737043478516784, "learning_rate": 4.363641982939795e-05, "loss": 0.3435, "step": 2160 }, { "epoch": 0.7964618077950797, "grad_norm": 7.186987817178859, "learning_rate": 4.363332921251082e-05, "loss": 0.3619, "step": 2161 }, { "epoch": 0.7968303694830923, "grad_norm": 3.1642349007815804, "learning_rate": 4.363023859562369e-05, "loss": 0.2461, "step": 2162 }, { "epoch": 0.7971989311711047, "grad_norm": 4.840035669494846, "learning_rate": 4.362714797873656e-05, "loss": 0.4372, "step": 2163 }, { "epoch": 0.7975674928591173, "grad_norm": 6.940584339089726, "learning_rate": 4.362405736184943e-05, "loss": 0.3676, "step": 2164 }, { "epoch": 0.7979360545471298, "grad_norm": 8.69307919292653, "learning_rate": 4.3620966744962294e-05, "loss": 0.5218, "step": 2165 }, { "epoch": 0.7983046162351424, "grad_norm": 6.962151795040632, "learning_rate": 4.3617876128075165e-05, "loss": 0.279, "step": 2166 }, { "epoch": 0.7986731779231548, "grad_norm": 6.0500694333970895, "learning_rate": 4.3614785511188036e-05, "loss": 0.1814, "step": 2167 }, { "epoch": 0.7990417396111674, "grad_norm": 4.91615491024871, "learning_rate": 4.36116948943009e-05, "loss": 0.2928, "step": 2168 }, { "epoch": 0.79941030129918, "grad_norm": 10.266859006524989, "learning_rate": 4.360860427741377e-05, "loss": 0.4732, "step": 2169 }, { "epoch": 0.7997788629871925, "grad_norm": 7.776250864950824, "learning_rate": 4.360551366052664e-05, "loss": 0.4792, "step": 2170 }, { "epoch": 0.800147424675205, "grad_norm": 5.7293872397762176, "learning_rate": 4.3602423043639514e-05, "loss": 0.4001, "step": 2171 }, { "epoch": 0.8005159863632175, "grad_norm": 7.750980930392022, "learning_rate": 4.359933242675238e-05, "loss": 0.3743, "step": 2172 }, { "epoch": 0.8008845480512301, "grad_norm": 4.429121031695038, "learning_rate": 4.359624180986525e-05, "loss": 0.2485, "step": 2173 }, { "epoch": 0.8012531097392426, "grad_norm": 9.845767634868508, "learning_rate": 4.359315119297812e-05, "loss": 0.4896, "step": 2174 }, { "epoch": 0.8016216714272552, "grad_norm": 7.656108995034466, "learning_rate": 4.359006057609099e-05, "loss": 0.3767, "step": 2175 }, { "epoch": 0.8019902331152676, "grad_norm": 6.741991449115845, "learning_rate": 4.3586969959203863e-05, "loss": 0.4767, "step": 2176 }, { "epoch": 0.8023587948032802, "grad_norm": 7.13494714087813, "learning_rate": 4.358387934231673e-05, "loss": 0.4387, "step": 2177 }, { "epoch": 0.8027273564912927, "grad_norm": 5.702138987966352, "learning_rate": 4.35807887254296e-05, "loss": 0.3102, "step": 2178 }, { "epoch": 0.8030959181793053, "grad_norm": 8.109132737379591, "learning_rate": 4.357769810854247e-05, "loss": 0.4351, "step": 2179 }, { "epoch": 0.8034644798673178, "grad_norm": 6.382461370103946, "learning_rate": 4.3574607491655335e-05, "loss": 0.381, "step": 2180 }, { "epoch": 0.8038330415553303, "grad_norm": 9.420777720360741, "learning_rate": 4.3571516874768206e-05, "loss": 0.3014, "step": 2181 }, { "epoch": 0.8042016032433429, "grad_norm": 6.3854165339093925, "learning_rate": 4.356842625788107e-05, "loss": 0.2984, "step": 2182 }, { "epoch": 0.8045701649313554, "grad_norm": 5.974415910973635, "learning_rate": 4.356533564099394e-05, "loss": 0.3255, "step": 2183 }, { "epoch": 0.8049387266193679, "grad_norm": 6.619805945004107, "learning_rate": 4.356224502410681e-05, "loss": 0.4099, "step": 2184 }, { "epoch": 0.8053072883073804, "grad_norm": 5.39571858680502, "learning_rate": 4.3559154407219684e-05, "loss": 0.2208, "step": 2185 }, { "epoch": 0.805675849995393, "grad_norm": 11.949192572094567, "learning_rate": 4.3556063790332555e-05, "loss": 0.3839, "step": 2186 }, { "epoch": 0.8060444116834055, "grad_norm": 8.585285248769221, "learning_rate": 4.355297317344542e-05, "loss": 0.3956, "step": 2187 }, { "epoch": 0.806412973371418, "grad_norm": 5.294493913680697, "learning_rate": 4.354988255655829e-05, "loss": 0.1862, "step": 2188 }, { "epoch": 0.8067815350594306, "grad_norm": 3.8226338766142214, "learning_rate": 4.354679193967116e-05, "loss": 0.2813, "step": 2189 }, { "epoch": 0.8071500967474431, "grad_norm": 5.769829103791801, "learning_rate": 4.354370132278403e-05, "loss": 0.4024, "step": 2190 }, { "epoch": 0.8075186584354557, "grad_norm": 5.266646741676465, "learning_rate": 4.35406107058969e-05, "loss": 0.2946, "step": 2191 }, { "epoch": 0.8078872201234681, "grad_norm": 10.165891607372743, "learning_rate": 4.353752008900977e-05, "loss": 0.636, "step": 2192 }, { "epoch": 0.8082557818114807, "grad_norm": 7.154623188197117, "learning_rate": 4.353442947212264e-05, "loss": 0.5318, "step": 2193 }, { "epoch": 0.8086243434994932, "grad_norm": 6.262193915221401, "learning_rate": 4.353133885523551e-05, "loss": 0.4132, "step": 2194 }, { "epoch": 0.8089929051875058, "grad_norm": 22.11360659593291, "learning_rate": 4.352824823834838e-05, "loss": 0.3036, "step": 2195 }, { "epoch": 0.8093614668755182, "grad_norm": 9.462697326134613, "learning_rate": 4.352515762146125e-05, "loss": 0.4703, "step": 2196 }, { "epoch": 0.8097300285635308, "grad_norm": 6.183313668246329, "learning_rate": 4.352206700457411e-05, "loss": 0.4472, "step": 2197 }, { "epoch": 0.8100985902515434, "grad_norm": 7.267416193859109, "learning_rate": 4.351897638768698e-05, "loss": 0.3204, "step": 2198 }, { "epoch": 0.8104671519395559, "grad_norm": 9.495131600398528, "learning_rate": 4.3515885770799853e-05, "loss": 0.3285, "step": 2199 }, { "epoch": 0.8108357136275685, "grad_norm": 4.837973811482027, "learning_rate": 4.3512795153912725e-05, "loss": 0.2806, "step": 2200 }, { "epoch": 0.8112042753155809, "grad_norm": 5.005644473293882, "learning_rate": 4.350970453702559e-05, "loss": 0.2625, "step": 2201 }, { "epoch": 0.8115728370035935, "grad_norm": 9.890362728552667, "learning_rate": 4.350661392013846e-05, "loss": 0.4178, "step": 2202 }, { "epoch": 0.811941398691606, "grad_norm": 8.480882575693128, "learning_rate": 4.350352330325133e-05, "loss": 0.1952, "step": 2203 }, { "epoch": 0.8123099603796186, "grad_norm": 5.763338953871861, "learning_rate": 4.35004326863642e-05, "loss": 0.3411, "step": 2204 }, { "epoch": 0.812678522067631, "grad_norm": 9.089228079305565, "learning_rate": 4.3497342069477074e-05, "loss": 0.4174, "step": 2205 }, { "epoch": 0.8130470837556436, "grad_norm": 13.532714764469263, "learning_rate": 4.349425145258994e-05, "loss": 0.3977, "step": 2206 }, { "epoch": 0.8134156454436561, "grad_norm": 7.4778787536504545, "learning_rate": 4.349116083570281e-05, "loss": 0.3639, "step": 2207 }, { "epoch": 0.8137842071316687, "grad_norm": 8.218978820632397, "learning_rate": 4.348807021881568e-05, "loss": 0.4269, "step": 2208 }, { "epoch": 0.8141527688196812, "grad_norm": 7.275039316182556, "learning_rate": 4.348497960192855e-05, "loss": 0.4356, "step": 2209 }, { "epoch": 0.8145213305076937, "grad_norm": 8.307924775460297, "learning_rate": 4.3481888985041416e-05, "loss": 0.3211, "step": 2210 }, { "epoch": 0.8148898921957063, "grad_norm": 6.070929575698102, "learning_rate": 4.347879836815428e-05, "loss": 0.1747, "step": 2211 }, { "epoch": 0.8152584538837188, "grad_norm": 12.429100689322317, "learning_rate": 4.347570775126715e-05, "loss": 0.6432, "step": 2212 }, { "epoch": 0.8156270155717313, "grad_norm": 5.669252740506653, "learning_rate": 4.347261713438002e-05, "loss": 0.3806, "step": 2213 }, { "epoch": 0.8159955772597438, "grad_norm": 4.346192037104283, "learning_rate": 4.3469526517492894e-05, "loss": 0.3107, "step": 2214 }, { "epoch": 0.8163641389477564, "grad_norm": 5.64423827364661, "learning_rate": 4.346643590060576e-05, "loss": 0.2236, "step": 2215 }, { "epoch": 0.8167327006357689, "grad_norm": 4.646465492290341, "learning_rate": 4.346334528371863e-05, "loss": 0.3015, "step": 2216 }, { "epoch": 0.8171012623237814, "grad_norm": 7.877323837126473, "learning_rate": 4.34602546668315e-05, "loss": 0.2851, "step": 2217 }, { "epoch": 0.817469824011794, "grad_norm": 5.016125805531978, "learning_rate": 4.345716404994437e-05, "loss": 0.2538, "step": 2218 }, { "epoch": 0.8178383856998065, "grad_norm": 5.159308433911987, "learning_rate": 4.3454073433057243e-05, "loss": 0.2407, "step": 2219 }, { "epoch": 0.8182069473878191, "grad_norm": 6.517626996838794, "learning_rate": 4.345098281617011e-05, "loss": 0.3714, "step": 2220 }, { "epoch": 0.8185755090758315, "grad_norm": 25.328346698597898, "learning_rate": 4.344789219928298e-05, "loss": 0.2786, "step": 2221 }, { "epoch": 0.8189440707638441, "grad_norm": 7.579768114795614, "learning_rate": 4.344480158239585e-05, "loss": 0.2053, "step": 2222 }, { "epoch": 0.8193126324518566, "grad_norm": 9.121639939481653, "learning_rate": 4.344171096550872e-05, "loss": 0.2962, "step": 2223 }, { "epoch": 0.8196811941398692, "grad_norm": 8.01393820580158, "learning_rate": 4.343862034862159e-05, "loss": 0.3048, "step": 2224 }, { "epoch": 0.8200497558278816, "grad_norm": 8.042133008121198, "learning_rate": 4.343552973173446e-05, "loss": 0.4344, "step": 2225 }, { "epoch": 0.8204183175158942, "grad_norm": 6.152281125673339, "learning_rate": 4.343243911484732e-05, "loss": 0.2583, "step": 2226 }, { "epoch": 0.8207868792039068, "grad_norm": 4.03857891205635, "learning_rate": 4.342934849796019e-05, "loss": 0.2873, "step": 2227 }, { "epoch": 0.8211554408919193, "grad_norm": 6.017369876812875, "learning_rate": 4.3426257881073064e-05, "loss": 0.2983, "step": 2228 }, { "epoch": 0.8215240025799319, "grad_norm": 5.017659472871127, "learning_rate": 4.3423167264185935e-05, "loss": 0.307, "step": 2229 }, { "epoch": 0.8218925642679443, "grad_norm": 3.719583626282016, "learning_rate": 4.34200766472988e-05, "loss": 0.2918, "step": 2230 }, { "epoch": 0.8222611259559569, "grad_norm": 3.788700424670437, "learning_rate": 4.341698603041167e-05, "loss": 0.2469, "step": 2231 }, { "epoch": 0.8226296876439694, "grad_norm": 4.75856089852291, "learning_rate": 4.341389541352454e-05, "loss": 0.3918, "step": 2232 }, { "epoch": 0.822998249331982, "grad_norm": 9.491054137585463, "learning_rate": 4.341080479663741e-05, "loss": 0.4793, "step": 2233 }, { "epoch": 0.8233668110199944, "grad_norm": 8.68676710810917, "learning_rate": 4.340771417975028e-05, "loss": 0.4384, "step": 2234 }, { "epoch": 0.823735372708007, "grad_norm": 4.8707472028641545, "learning_rate": 4.340462356286315e-05, "loss": 0.5189, "step": 2235 }, { "epoch": 0.8241039343960196, "grad_norm": 5.173565268544568, "learning_rate": 4.340153294597602e-05, "loss": 0.3013, "step": 2236 }, { "epoch": 0.8244724960840321, "grad_norm": 5.56858030460967, "learning_rate": 4.339844232908889e-05, "loss": 0.3058, "step": 2237 }, { "epoch": 0.8248410577720446, "grad_norm": 8.993727299474044, "learning_rate": 4.339535171220176e-05, "loss": 0.437, "step": 2238 }, { "epoch": 0.8252096194600571, "grad_norm": 5.917969394595675, "learning_rate": 4.3392261095314627e-05, "loss": 0.5594, "step": 2239 }, { "epoch": 0.8255781811480697, "grad_norm": 6.7658358710642466, "learning_rate": 4.33891704784275e-05, "loss": 0.4469, "step": 2240 }, { "epoch": 0.8259467428360822, "grad_norm": 4.638373140051643, "learning_rate": 4.338607986154036e-05, "loss": 0.3857, "step": 2241 }, { "epoch": 0.8263153045240947, "grad_norm": 8.063981267604566, "learning_rate": 4.338298924465323e-05, "loss": 0.5455, "step": 2242 }, { "epoch": 0.8266838662121072, "grad_norm": 6.078401770088358, "learning_rate": 4.3379898627766105e-05, "loss": 0.3845, "step": 2243 }, { "epoch": 0.8270524279001198, "grad_norm": 14.45226823896431, "learning_rate": 4.337680801087897e-05, "loss": 0.2568, "step": 2244 }, { "epoch": 0.8274209895881323, "grad_norm": 7.106909288674731, "learning_rate": 4.337371739399184e-05, "loss": 0.3272, "step": 2245 }, { "epoch": 0.8277895512761448, "grad_norm": 4.366968685120855, "learning_rate": 4.337062677710471e-05, "loss": 0.2516, "step": 2246 }, { "epoch": 0.8281581129641574, "grad_norm": 4.9192546348109865, "learning_rate": 4.336753616021758e-05, "loss": 0.3567, "step": 2247 }, { "epoch": 0.8285266746521699, "grad_norm": 8.900602993200973, "learning_rate": 4.3364445543330454e-05, "loss": 0.4694, "step": 2248 }, { "epoch": 0.8288952363401825, "grad_norm": 13.998044013577564, "learning_rate": 4.336135492644332e-05, "loss": 0.3634, "step": 2249 }, { "epoch": 0.8292637980281949, "grad_norm": 5.69438576189937, "learning_rate": 4.335826430955619e-05, "loss": 0.2639, "step": 2250 }, { "epoch": 0.8296323597162075, "grad_norm": 3.4884712534864564, "learning_rate": 4.335517369266906e-05, "loss": 0.2481, "step": 2251 }, { "epoch": 0.83000092140422, "grad_norm": 4.155028658514424, "learning_rate": 4.335208307578193e-05, "loss": 0.2511, "step": 2252 }, { "epoch": 0.8303694830922326, "grad_norm": 5.1415621474332065, "learning_rate": 4.3348992458894796e-05, "loss": 0.2989, "step": 2253 }, { "epoch": 0.830738044780245, "grad_norm": 4.5012957508799225, "learning_rate": 4.334590184200767e-05, "loss": 0.2915, "step": 2254 }, { "epoch": 0.8311066064682576, "grad_norm": 7.485599170827213, "learning_rate": 4.334281122512054e-05, "loss": 0.3182, "step": 2255 }, { "epoch": 0.8314751681562702, "grad_norm": 6.754640185234327, "learning_rate": 4.33397206082334e-05, "loss": 0.3204, "step": 2256 }, { "epoch": 0.8318437298442827, "grad_norm": 3.9748713826418234, "learning_rate": 4.3336629991346274e-05, "loss": 0.2393, "step": 2257 }, { "epoch": 0.8322122915322953, "grad_norm": 4.625001237198948, "learning_rate": 4.3333539374459145e-05, "loss": 0.268, "step": 2258 }, { "epoch": 0.8325808532203077, "grad_norm": 5.612631869731611, "learning_rate": 4.333044875757201e-05, "loss": 0.2803, "step": 2259 }, { "epoch": 0.8329494149083203, "grad_norm": 5.24624444968071, "learning_rate": 4.332735814068488e-05, "loss": 0.204, "step": 2260 }, { "epoch": 0.8333179765963328, "grad_norm": 4.041997260242347, "learning_rate": 4.332426752379775e-05, "loss": 0.2609, "step": 2261 }, { "epoch": 0.8336865382843454, "grad_norm": 5.388626037601804, "learning_rate": 4.332117690691062e-05, "loss": 0.4169, "step": 2262 }, { "epoch": 0.8340550999723578, "grad_norm": 5.547822597315911, "learning_rate": 4.331808629002349e-05, "loss": 0.2364, "step": 2263 }, { "epoch": 0.8344236616603704, "grad_norm": 8.792057649376563, "learning_rate": 4.331499567313636e-05, "loss": 0.3137, "step": 2264 }, { "epoch": 0.834792223348383, "grad_norm": 10.028332437501799, "learning_rate": 4.331190505624923e-05, "loss": 0.3421, "step": 2265 }, { "epoch": 0.8351607850363955, "grad_norm": 7.321764172029875, "learning_rate": 4.33088144393621e-05, "loss": 0.2924, "step": 2266 }, { "epoch": 0.835529346724408, "grad_norm": 9.64497858356572, "learning_rate": 4.330572382247497e-05, "loss": 0.2833, "step": 2267 }, { "epoch": 0.8358979084124205, "grad_norm": 8.434546737929455, "learning_rate": 4.330263320558784e-05, "loss": 0.4394, "step": 2268 }, { "epoch": 0.8362664701004331, "grad_norm": 4.964839716203613, "learning_rate": 4.329954258870071e-05, "loss": 0.3638, "step": 2269 }, { "epoch": 0.8366350317884456, "grad_norm": 7.377767269990069, "learning_rate": 4.329645197181358e-05, "loss": 0.3304, "step": 2270 }, { "epoch": 0.8370035934764581, "grad_norm": 4.706444870763197, "learning_rate": 4.3293361354926444e-05, "loss": 0.2851, "step": 2271 }, { "epoch": 0.8373721551644706, "grad_norm": 7.097443681891879, "learning_rate": 4.3290270738039315e-05, "loss": 0.2203, "step": 2272 }, { "epoch": 0.8377407168524832, "grad_norm": 5.488014513369129, "learning_rate": 4.328718012115218e-05, "loss": 0.2381, "step": 2273 }, { "epoch": 0.8381092785404957, "grad_norm": 15.439674378751484, "learning_rate": 4.328408950426505e-05, "loss": 0.3846, "step": 2274 }, { "epoch": 0.8384778402285082, "grad_norm": 10.546419622692346, "learning_rate": 4.328099888737792e-05, "loss": 0.4145, "step": 2275 }, { "epoch": 0.8388464019165208, "grad_norm": 8.908763865340603, "learning_rate": 4.327790827049079e-05, "loss": 0.4432, "step": 2276 }, { "epoch": 0.8392149636045333, "grad_norm": 5.887175908654621, "learning_rate": 4.3274817653603664e-05, "loss": 0.3211, "step": 2277 }, { "epoch": 0.8395835252925459, "grad_norm": 4.164867673972274, "learning_rate": 4.327172703671653e-05, "loss": 0.2615, "step": 2278 }, { "epoch": 0.8399520869805583, "grad_norm": 8.63130996539633, "learning_rate": 4.32686364198294e-05, "loss": 0.4655, "step": 2279 }, { "epoch": 0.8403206486685709, "grad_norm": 5.434952764086507, "learning_rate": 4.326554580294227e-05, "loss": 0.3334, "step": 2280 }, { "epoch": 0.8406892103565834, "grad_norm": 9.740038747220838, "learning_rate": 4.326245518605514e-05, "loss": 0.3503, "step": 2281 }, { "epoch": 0.841057772044596, "grad_norm": 6.4570451333599985, "learning_rate": 4.3259364569168007e-05, "loss": 0.252, "step": 2282 }, { "epoch": 0.8414263337326084, "grad_norm": 6.744709060536214, "learning_rate": 4.325627395228088e-05, "loss": 0.367, "step": 2283 }, { "epoch": 0.841794895420621, "grad_norm": 5.389687630797202, "learning_rate": 4.325318333539375e-05, "loss": 0.2189, "step": 2284 }, { "epoch": 0.8421634571086336, "grad_norm": 6.210978036424321, "learning_rate": 4.325009271850662e-05, "loss": 0.3042, "step": 2285 }, { "epoch": 0.8425320187966461, "grad_norm": 5.786807796204643, "learning_rate": 4.3247002101619485e-05, "loss": 0.4379, "step": 2286 }, { "epoch": 0.8429005804846587, "grad_norm": 12.383415058289025, "learning_rate": 4.324391148473235e-05, "loss": 0.5702, "step": 2287 }, { "epoch": 0.8432691421726711, "grad_norm": 9.88069201250581, "learning_rate": 4.324082086784522e-05, "loss": 0.2373, "step": 2288 }, { "epoch": 0.8436377038606837, "grad_norm": 11.360413305796389, "learning_rate": 4.323773025095809e-05, "loss": 0.4288, "step": 2289 }, { "epoch": 0.8440062655486962, "grad_norm": 7.8728319847099915, "learning_rate": 4.323463963407096e-05, "loss": 0.3016, "step": 2290 }, { "epoch": 0.8443748272367088, "grad_norm": 8.207634549160806, "learning_rate": 4.3231549017183834e-05, "loss": 0.3712, "step": 2291 }, { "epoch": 0.8447433889247212, "grad_norm": 6.02958632535291, "learning_rate": 4.32284584002967e-05, "loss": 0.3031, "step": 2292 }, { "epoch": 0.8451119506127338, "grad_norm": 3.399726430321327, "learning_rate": 4.322536778340957e-05, "loss": 0.2033, "step": 2293 }, { "epoch": 0.8454805123007464, "grad_norm": 6.862769049701513, "learning_rate": 4.322227716652244e-05, "loss": 0.3563, "step": 2294 }, { "epoch": 0.8458490739887589, "grad_norm": 9.916940594477438, "learning_rate": 4.321918654963531e-05, "loss": 0.4807, "step": 2295 }, { "epoch": 0.8462176356767714, "grad_norm": 5.874679881363417, "learning_rate": 4.321609593274818e-05, "loss": 0.326, "step": 2296 }, { "epoch": 0.8465861973647839, "grad_norm": 4.805497264079767, "learning_rate": 4.321300531586105e-05, "loss": 0.4488, "step": 2297 }, { "epoch": 0.8469547590527965, "grad_norm": 6.224736844670655, "learning_rate": 4.320991469897392e-05, "loss": 0.3595, "step": 2298 }, { "epoch": 0.847323320740809, "grad_norm": 5.470221272513907, "learning_rate": 4.320682408208679e-05, "loss": 0.5262, "step": 2299 }, { "epoch": 0.8476918824288215, "grad_norm": 14.764476848548618, "learning_rate": 4.320373346519966e-05, "loss": 0.2369, "step": 2300 }, { "epoch": 0.848060444116834, "grad_norm": 11.486482055857278, "learning_rate": 4.3200642848312525e-05, "loss": 0.2695, "step": 2301 }, { "epoch": 0.8484290058048466, "grad_norm": 3.870870543356847, "learning_rate": 4.319755223142539e-05, "loss": 0.3741, "step": 2302 }, { "epoch": 0.8487975674928591, "grad_norm": 7.742193905301336, "learning_rate": 4.319446161453826e-05, "loss": 0.4031, "step": 2303 }, { "epoch": 0.8491661291808716, "grad_norm": 5.097232681157431, "learning_rate": 4.319137099765113e-05, "loss": 0.324, "step": 2304 }, { "epoch": 0.8495346908688842, "grad_norm": 6.298627661538756, "learning_rate": 4.3188280380764e-05, "loss": 0.3282, "step": 2305 }, { "epoch": 0.8499032525568967, "grad_norm": 6.8383868379829496, "learning_rate": 4.318518976387687e-05, "loss": 0.1829, "step": 2306 }, { "epoch": 0.8502718142449093, "grad_norm": 3.382324606442612, "learning_rate": 4.318209914698974e-05, "loss": 0.2348, "step": 2307 }, { "epoch": 0.8506403759329217, "grad_norm": 9.517264489560947, "learning_rate": 4.317900853010261e-05, "loss": 0.34, "step": 2308 }, { "epoch": 0.8510089376209343, "grad_norm": 5.6940876463886525, "learning_rate": 4.317591791321548e-05, "loss": 0.3382, "step": 2309 }, { "epoch": 0.8513774993089468, "grad_norm": 9.732411488116572, "learning_rate": 4.317282729632835e-05, "loss": 0.364, "step": 2310 }, { "epoch": 0.8517460609969594, "grad_norm": 6.969539854166026, "learning_rate": 4.316973667944122e-05, "loss": 0.2723, "step": 2311 }, { "epoch": 0.8521146226849718, "grad_norm": 6.824167322359167, "learning_rate": 4.316664606255409e-05, "loss": 0.3683, "step": 2312 }, { "epoch": 0.8524831843729844, "grad_norm": 6.127618969338616, "learning_rate": 4.316355544566696e-05, "loss": 0.3844, "step": 2313 }, { "epoch": 0.852851746060997, "grad_norm": 11.223536583336823, "learning_rate": 4.316046482877983e-05, "loss": 0.439, "step": 2314 }, { "epoch": 0.8532203077490095, "grad_norm": 5.319784040154893, "learning_rate": 4.3157374211892695e-05, "loss": 0.2705, "step": 2315 }, { "epoch": 0.8535888694370221, "grad_norm": 5.713675493627931, "learning_rate": 4.3154283595005566e-05, "loss": 0.2787, "step": 2316 }, { "epoch": 0.8539574311250345, "grad_norm": 6.809283415838045, "learning_rate": 4.315119297811843e-05, "loss": 0.2109, "step": 2317 }, { "epoch": 0.8543259928130471, "grad_norm": 8.367376245245111, "learning_rate": 4.31481023612313e-05, "loss": 0.3998, "step": 2318 }, { "epoch": 0.8546945545010596, "grad_norm": 7.211164973050796, "learning_rate": 4.314501174434417e-05, "loss": 0.4029, "step": 2319 }, { "epoch": 0.8550631161890722, "grad_norm": 5.738895474274509, "learning_rate": 4.3141921127457044e-05, "loss": 0.3332, "step": 2320 }, { "epoch": 0.8554316778770846, "grad_norm": 7.889154943119873, "learning_rate": 4.313883051056991e-05, "loss": 0.2499, "step": 2321 }, { "epoch": 0.8558002395650972, "grad_norm": 5.344560885113715, "learning_rate": 4.313573989368278e-05, "loss": 0.3724, "step": 2322 }, { "epoch": 0.8561688012531098, "grad_norm": 4.922246428507425, "learning_rate": 4.313264927679565e-05, "loss": 0.3984, "step": 2323 }, { "epoch": 0.8565373629411223, "grad_norm": 8.904833871790068, "learning_rate": 4.312955865990852e-05, "loss": 0.2723, "step": 2324 }, { "epoch": 0.8569059246291348, "grad_norm": 10.206101361679826, "learning_rate": 4.3126468043021386e-05, "loss": 0.3257, "step": 2325 }, { "epoch": 0.8572744863171473, "grad_norm": 3.954130625317681, "learning_rate": 4.312337742613426e-05, "loss": 0.2934, "step": 2326 }, { "epoch": 0.8576430480051599, "grad_norm": 8.751478233810301, "learning_rate": 4.312028680924713e-05, "loss": 0.2584, "step": 2327 }, { "epoch": 0.8580116096931724, "grad_norm": 9.326892356005178, "learning_rate": 4.311719619236e-05, "loss": 0.3427, "step": 2328 }, { "epoch": 0.8583801713811849, "grad_norm": 4.647144400631668, "learning_rate": 4.311410557547287e-05, "loss": 0.3429, "step": 2329 }, { "epoch": 0.8587487330691974, "grad_norm": 9.9773788177923, "learning_rate": 4.3111014958585736e-05, "loss": 0.2258, "step": 2330 }, { "epoch": 0.85911729475721, "grad_norm": 5.306549329818288, "learning_rate": 4.310792434169861e-05, "loss": 0.3151, "step": 2331 }, { "epoch": 0.8594858564452226, "grad_norm": 6.696340831363436, "learning_rate": 4.310483372481147e-05, "loss": 0.3709, "step": 2332 }, { "epoch": 0.859854418133235, "grad_norm": 11.644350415821627, "learning_rate": 4.310174310792434e-05, "loss": 0.558, "step": 2333 }, { "epoch": 0.8602229798212476, "grad_norm": 4.125180558385563, "learning_rate": 4.3098652491037214e-05, "loss": 0.2856, "step": 2334 }, { "epoch": 0.8605915415092601, "grad_norm": 3.6698413168047668, "learning_rate": 4.309556187415008e-05, "loss": 0.226, "step": 2335 }, { "epoch": 0.8609601031972727, "grad_norm": 7.270207098486453, "learning_rate": 4.309247125726295e-05, "loss": 0.3056, "step": 2336 }, { "epoch": 0.8613286648852851, "grad_norm": 12.506118497676635, "learning_rate": 4.308938064037582e-05, "loss": 0.386, "step": 2337 }, { "epoch": 0.8616972265732977, "grad_norm": 5.44580531250512, "learning_rate": 4.308629002348869e-05, "loss": 0.3635, "step": 2338 }, { "epoch": 0.8620657882613102, "grad_norm": 6.880447171782617, "learning_rate": 4.308319940660156e-05, "loss": 0.2947, "step": 2339 }, { "epoch": 0.8624343499493228, "grad_norm": 4.511340791798609, "learning_rate": 4.308010878971443e-05, "loss": 0.2496, "step": 2340 }, { "epoch": 0.8628029116373352, "grad_norm": 10.37058908847478, "learning_rate": 4.30770181728273e-05, "loss": 0.465, "step": 2341 }, { "epoch": 0.8631714733253478, "grad_norm": 5.90978187842715, "learning_rate": 4.307392755594017e-05, "loss": 0.3279, "step": 2342 }, { "epoch": 0.8635400350133604, "grad_norm": 12.973243174015835, "learning_rate": 4.307083693905304e-05, "loss": 0.3456, "step": 2343 }, { "epoch": 0.8639085967013729, "grad_norm": 2.8599177611471456, "learning_rate": 4.3067746322165905e-05, "loss": 0.2156, "step": 2344 }, { "epoch": 0.8642771583893855, "grad_norm": 4.742079706867975, "learning_rate": 4.3064655705278776e-05, "loss": 0.3885, "step": 2345 }, { "epoch": 0.8646457200773979, "grad_norm": 6.933037499694342, "learning_rate": 4.306156508839165e-05, "loss": 0.3949, "step": 2346 }, { "epoch": 0.8650142817654105, "grad_norm": 5.943650443188959, "learning_rate": 4.305847447150451e-05, "loss": 0.4005, "step": 2347 }, { "epoch": 0.865382843453423, "grad_norm": 5.672291301655782, "learning_rate": 4.305538385461738e-05, "loss": 0.3475, "step": 2348 }, { "epoch": 0.8657514051414356, "grad_norm": 6.443869393368325, "learning_rate": 4.3052293237730254e-05, "loss": 0.4157, "step": 2349 }, { "epoch": 0.866119966829448, "grad_norm": 2.7551708158788286, "learning_rate": 4.304920262084312e-05, "loss": 0.3047, "step": 2350 }, { "epoch": 0.8664885285174606, "grad_norm": 6.057405199040372, "learning_rate": 4.304611200395599e-05, "loss": 0.248, "step": 2351 }, { "epoch": 0.8668570902054732, "grad_norm": 5.220247156715596, "learning_rate": 4.304302138706886e-05, "loss": 0.3236, "step": 2352 }, { "epoch": 0.8672256518934857, "grad_norm": 6.987021267072502, "learning_rate": 4.303993077018173e-05, "loss": 0.4142, "step": 2353 }, { "epoch": 0.8675942135814982, "grad_norm": 4.354083168409482, "learning_rate": 4.30368401532946e-05, "loss": 0.3218, "step": 2354 }, { "epoch": 0.8679627752695107, "grad_norm": 13.646096232359367, "learning_rate": 4.303374953640747e-05, "loss": 0.3007, "step": 2355 }, { "epoch": 0.8683313369575233, "grad_norm": 4.857735213054174, "learning_rate": 4.303065891952034e-05, "loss": 0.2565, "step": 2356 }, { "epoch": 0.8686998986455358, "grad_norm": 5.239398060651993, "learning_rate": 4.302756830263321e-05, "loss": 0.3102, "step": 2357 }, { "epoch": 0.8690684603335483, "grad_norm": 6.736653166085289, "learning_rate": 4.302447768574608e-05, "loss": 0.3534, "step": 2358 }, { "epoch": 0.8694370220215608, "grad_norm": 6.715346498958502, "learning_rate": 4.3021387068858946e-05, "loss": 0.3114, "step": 2359 }, { "epoch": 0.8698055837095734, "grad_norm": 5.898593683259218, "learning_rate": 4.301829645197182e-05, "loss": 0.2542, "step": 2360 }, { "epoch": 0.870174145397586, "grad_norm": 8.683466791553908, "learning_rate": 4.301520583508469e-05, "loss": 0.372, "step": 2361 }, { "epoch": 0.8705427070855984, "grad_norm": 6.111638969203971, "learning_rate": 4.301211521819755e-05, "loss": 0.2922, "step": 2362 }, { "epoch": 0.870911268773611, "grad_norm": 5.631617404752018, "learning_rate": 4.3009024601310424e-05, "loss": 0.3045, "step": 2363 }, { "epoch": 0.8712798304616235, "grad_norm": 4.404667346877118, "learning_rate": 4.300593398442329e-05, "loss": 0.2682, "step": 2364 }, { "epoch": 0.8716483921496361, "grad_norm": 6.1154162106669, "learning_rate": 4.300284336753616e-05, "loss": 0.3066, "step": 2365 }, { "epoch": 0.8720169538376485, "grad_norm": 13.797822222049719, "learning_rate": 4.299975275064903e-05, "loss": 0.4558, "step": 2366 }, { "epoch": 0.8723855155256611, "grad_norm": 11.55600799587735, "learning_rate": 4.29966621337619e-05, "loss": 0.3006, "step": 2367 }, { "epoch": 0.8727540772136736, "grad_norm": 9.684770753399299, "learning_rate": 4.299357151687477e-05, "loss": 0.3804, "step": 2368 }, { "epoch": 0.8731226389016862, "grad_norm": 7.220543840801301, "learning_rate": 4.299048089998764e-05, "loss": 0.4099, "step": 2369 }, { "epoch": 0.8734912005896986, "grad_norm": 3.5788204400168393, "learning_rate": 4.298739028310051e-05, "loss": 0.2361, "step": 2370 }, { "epoch": 0.8738597622777112, "grad_norm": 8.87326763599741, "learning_rate": 4.298429966621338e-05, "loss": 0.3633, "step": 2371 }, { "epoch": 0.8742283239657238, "grad_norm": 4.654807462551779, "learning_rate": 4.298120904932625e-05, "loss": 0.3615, "step": 2372 }, { "epoch": 0.8745968856537363, "grad_norm": 3.1251177956314327, "learning_rate": 4.2978118432439116e-05, "loss": 0.2287, "step": 2373 }, { "epoch": 0.8749654473417489, "grad_norm": 5.231646472851021, "learning_rate": 4.297502781555199e-05, "loss": 0.2789, "step": 2374 }, { "epoch": 0.8753340090297613, "grad_norm": 4.902923817151059, "learning_rate": 4.297193719866486e-05, "loss": 0.2121, "step": 2375 }, { "epoch": 0.8757025707177739, "grad_norm": 3.8143234895006155, "learning_rate": 4.296884658177773e-05, "loss": 0.2374, "step": 2376 }, { "epoch": 0.8760711324057864, "grad_norm": 4.435033838256973, "learning_rate": 4.2965755964890594e-05, "loss": 0.3266, "step": 2377 }, { "epoch": 0.876439694093799, "grad_norm": 5.802498036145524, "learning_rate": 4.296266534800346e-05, "loss": 0.4031, "step": 2378 }, { "epoch": 0.8768082557818114, "grad_norm": 5.9030908279681915, "learning_rate": 4.295957473111633e-05, "loss": 0.4205, "step": 2379 }, { "epoch": 0.877176817469824, "grad_norm": 5.62047547524861, "learning_rate": 4.29564841142292e-05, "loss": 0.245, "step": 2380 }, { "epoch": 0.8775453791578366, "grad_norm": 6.433182240558274, "learning_rate": 4.295339349734207e-05, "loss": 0.3231, "step": 2381 }, { "epoch": 0.8779139408458491, "grad_norm": 8.75554198098892, "learning_rate": 4.295030288045494e-05, "loss": 0.402, "step": 2382 }, { "epoch": 0.8782825025338616, "grad_norm": 4.1879185709355795, "learning_rate": 4.294721226356781e-05, "loss": 0.2964, "step": 2383 }, { "epoch": 0.8786510642218741, "grad_norm": 5.957937623157492, "learning_rate": 4.294412164668068e-05, "loss": 0.359, "step": 2384 }, { "epoch": 0.8790196259098867, "grad_norm": 6.471100753233262, "learning_rate": 4.294103102979355e-05, "loss": 0.5074, "step": 2385 }, { "epoch": 0.8793881875978992, "grad_norm": 7.556462928416005, "learning_rate": 4.293794041290642e-05, "loss": 0.2076, "step": 2386 }, { "epoch": 0.8797567492859117, "grad_norm": 7.332068970084779, "learning_rate": 4.293484979601929e-05, "loss": 0.3481, "step": 2387 }, { "epoch": 0.8801253109739242, "grad_norm": 8.005288283561667, "learning_rate": 4.2931759179132156e-05, "loss": 0.454, "step": 2388 }, { "epoch": 0.8804938726619368, "grad_norm": 5.406935455439348, "learning_rate": 4.292866856224503e-05, "loss": 0.3995, "step": 2389 }, { "epoch": 0.8808624343499494, "grad_norm": 6.90021897120562, "learning_rate": 4.29255779453579e-05, "loss": 0.4102, "step": 2390 }, { "epoch": 0.8812309960379618, "grad_norm": 24.198286465596976, "learning_rate": 4.292248732847077e-05, "loss": 0.2, "step": 2391 }, { "epoch": 0.8815995577259744, "grad_norm": 7.065190975688429, "learning_rate": 4.2919396711583634e-05, "loss": 0.3117, "step": 2392 }, { "epoch": 0.8819681194139869, "grad_norm": 10.676685952335804, "learning_rate": 4.29163060946965e-05, "loss": 0.3977, "step": 2393 }, { "epoch": 0.8823366811019995, "grad_norm": 5.674237560528335, "learning_rate": 4.291321547780937e-05, "loss": 0.3303, "step": 2394 }, { "epoch": 0.882705242790012, "grad_norm": 3.541234711877615, "learning_rate": 4.291012486092224e-05, "loss": 0.4623, "step": 2395 }, { "epoch": 0.8830738044780245, "grad_norm": 7.096982782268561, "learning_rate": 4.290703424403511e-05, "loss": 0.2683, "step": 2396 }, { "epoch": 0.883442366166037, "grad_norm": 8.37174127293229, "learning_rate": 4.290394362714798e-05, "loss": 0.4028, "step": 2397 }, { "epoch": 0.8838109278540496, "grad_norm": 9.886394638602857, "learning_rate": 4.290085301026085e-05, "loss": 0.4394, "step": 2398 }, { "epoch": 0.8841794895420622, "grad_norm": 6.25880788061319, "learning_rate": 4.289776239337372e-05, "loss": 0.4829, "step": 2399 }, { "epoch": 0.8845480512300746, "grad_norm": 4.991212752671603, "learning_rate": 4.289467177648659e-05, "loss": 0.3721, "step": 2400 }, { "epoch": 0.8845480512300746, "eval_bleu": 0.13361158489324831, "eval_bleu_1gram": 0.49947480187983945, "eval_bleu_2gram": 0.296656546597884, "eval_bleu_3gram": 0.17772833870250865, "eval_bleu_4gram": 0.11447618346468894, "eval_rag_val_loss": 0.40762046460290385, "eval_rouge1": 0.4829761372151563, "eval_rouge2": 0.2841229705369208, "eval_rougeL": 0.48237513385357506, "step": 2400 }, { "epoch": 0.8849166129180872, "grad_norm": 7.643922433600333, "learning_rate": 4.289158115959946e-05, "loss": 0.2321, "step": 2401 }, { "epoch": 0.8852851746060997, "grad_norm": 14.936485607227755, "learning_rate": 4.2888490542712326e-05, "loss": 0.5074, "step": 2402 }, { "epoch": 0.8856537362941123, "grad_norm": 6.033651713757572, "learning_rate": 4.28853999258252e-05, "loss": 0.3692, "step": 2403 }, { "epoch": 0.8860222979821247, "grad_norm": 8.02636285065356, "learning_rate": 4.288230930893807e-05, "loss": 0.4734, "step": 2404 }, { "epoch": 0.8863908596701373, "grad_norm": 13.284494089092178, "learning_rate": 4.287921869205094e-05, "loss": 0.6027, "step": 2405 }, { "epoch": 0.8867594213581498, "grad_norm": 6.342123189460527, "learning_rate": 4.2876128075163804e-05, "loss": 0.4238, "step": 2406 }, { "epoch": 0.8871279830461624, "grad_norm": 6.813830604447363, "learning_rate": 4.287303745827667e-05, "loss": 0.3646, "step": 2407 }, { "epoch": 0.8874965447341748, "grad_norm": 12.434061313074851, "learning_rate": 4.286994684138954e-05, "loss": 0.4556, "step": 2408 }, { "epoch": 0.8878651064221874, "grad_norm": 6.139382117678068, "learning_rate": 4.286685622450241e-05, "loss": 0.4262, "step": 2409 }, { "epoch": 0.8882336681102, "grad_norm": 6.591964118248771, "learning_rate": 4.286376560761528e-05, "loss": 0.5568, "step": 2410 }, { "epoch": 0.8886022297982125, "grad_norm": 7.052955321847194, "learning_rate": 4.286067499072815e-05, "loss": 0.3309, "step": 2411 }, { "epoch": 0.888970791486225, "grad_norm": 8.931034336996655, "learning_rate": 4.285758437384102e-05, "loss": 0.3142, "step": 2412 }, { "epoch": 0.8893393531742375, "grad_norm": 6.0471796949638925, "learning_rate": 4.285449375695389e-05, "loss": 0.2892, "step": 2413 }, { "epoch": 0.8897079148622501, "grad_norm": 8.144404787687257, "learning_rate": 4.285140314006676e-05, "loss": 0.3471, "step": 2414 }, { "epoch": 0.8900764765502626, "grad_norm": 7.352517206871654, "learning_rate": 4.284831252317963e-05, "loss": 0.4234, "step": 2415 }, { "epoch": 0.8904450382382751, "grad_norm": 6.296772617615251, "learning_rate": 4.2845221906292496e-05, "loss": 0.2894, "step": 2416 }, { "epoch": 0.8908135999262876, "grad_norm": 11.735792945496708, "learning_rate": 4.284213128940537e-05, "loss": 0.3084, "step": 2417 }, { "epoch": 0.8911821616143002, "grad_norm": 5.733192758537164, "learning_rate": 4.283904067251824e-05, "loss": 0.462, "step": 2418 }, { "epoch": 0.8915507233023128, "grad_norm": 5.781416277169624, "learning_rate": 4.283595005563111e-05, "loss": 0.3243, "step": 2419 }, { "epoch": 0.8919192849903252, "grad_norm": 5.121060788116175, "learning_rate": 4.283285943874398e-05, "loss": 0.2335, "step": 2420 }, { "epoch": 0.8922878466783378, "grad_norm": 4.847330259171485, "learning_rate": 4.2829768821856845e-05, "loss": 0.3699, "step": 2421 }, { "epoch": 0.8926564083663503, "grad_norm": 6.112538955033877, "learning_rate": 4.282667820496971e-05, "loss": 0.3292, "step": 2422 }, { "epoch": 0.8930249700543629, "grad_norm": 11.019176760072712, "learning_rate": 4.282358758808258e-05, "loss": 0.5787, "step": 2423 }, { "epoch": 0.8933935317423753, "grad_norm": 4.388784871431394, "learning_rate": 4.282049697119545e-05, "loss": 0.3974, "step": 2424 }, { "epoch": 0.8937620934303879, "grad_norm": 5.080198269676934, "learning_rate": 4.281740635430832e-05, "loss": 0.4839, "step": 2425 }, { "epoch": 0.8941306551184004, "grad_norm": 6.054487773923558, "learning_rate": 4.281431573742119e-05, "loss": 0.4722, "step": 2426 }, { "epoch": 0.894499216806413, "grad_norm": 4.556345093288135, "learning_rate": 4.281122512053406e-05, "loss": 0.41, "step": 2427 }, { "epoch": 0.8948677784944256, "grad_norm": 10.150900033217498, "learning_rate": 4.280813450364693e-05, "loss": 0.4591, "step": 2428 }, { "epoch": 0.895236340182438, "grad_norm": 9.198967452777818, "learning_rate": 4.28050438867598e-05, "loss": 0.5494, "step": 2429 }, { "epoch": 0.8956049018704506, "grad_norm": 8.514948546813587, "learning_rate": 4.280195326987267e-05, "loss": 0.3454, "step": 2430 }, { "epoch": 0.8959734635584631, "grad_norm": 4.4601844913659665, "learning_rate": 4.2798862652985536e-05, "loss": 0.2364, "step": 2431 }, { "epoch": 0.8963420252464757, "grad_norm": 6.071001207713807, "learning_rate": 4.279577203609841e-05, "loss": 0.3835, "step": 2432 }, { "epoch": 0.8967105869344881, "grad_norm": 8.098464594241287, "learning_rate": 4.279268141921128e-05, "loss": 0.4015, "step": 2433 }, { "epoch": 0.8970791486225007, "grad_norm": 6.253205659833196, "learning_rate": 4.278959080232415e-05, "loss": 0.3293, "step": 2434 }, { "epoch": 0.8974477103105132, "grad_norm": 6.75662499727192, "learning_rate": 4.2786500185437014e-05, "loss": 0.3554, "step": 2435 }, { "epoch": 0.8978162719985258, "grad_norm": 5.132368791466675, "learning_rate": 4.2783409568549886e-05, "loss": 0.3641, "step": 2436 }, { "epoch": 0.8981848336865382, "grad_norm": 4.047284551097047, "learning_rate": 4.278031895166275e-05, "loss": 0.2749, "step": 2437 }, { "epoch": 0.8985533953745508, "grad_norm": 3.9574541956069624, "learning_rate": 4.277722833477562e-05, "loss": 0.3786, "step": 2438 }, { "epoch": 0.8989219570625634, "grad_norm": 8.501648686902312, "learning_rate": 4.277413771788849e-05, "loss": 0.5075, "step": 2439 }, { "epoch": 0.8992905187505759, "grad_norm": 34.8187830954788, "learning_rate": 4.2771047101001363e-05, "loss": 0.3842, "step": 2440 }, { "epoch": 0.8996590804385884, "grad_norm": 6.840853700957956, "learning_rate": 4.276795648411423e-05, "loss": 0.2021, "step": 2441 }, { "epoch": 0.9000276421266009, "grad_norm": 7.168815941079134, "learning_rate": 4.27648658672271e-05, "loss": 0.313, "step": 2442 }, { "epoch": 0.9003962038146135, "grad_norm": 5.744510850126977, "learning_rate": 4.276177525033997e-05, "loss": 0.272, "step": 2443 }, { "epoch": 0.900764765502626, "grad_norm": 8.212276331440956, "learning_rate": 4.275868463345284e-05, "loss": 0.241, "step": 2444 }, { "epoch": 0.9011333271906385, "grad_norm": 9.830819104859012, "learning_rate": 4.2755594016565706e-05, "loss": 0.2749, "step": 2445 }, { "epoch": 0.901501888878651, "grad_norm": 10.619862110801694, "learning_rate": 4.275250339967858e-05, "loss": 0.4126, "step": 2446 }, { "epoch": 0.9018704505666636, "grad_norm": 7.075174487223492, "learning_rate": 4.274941278279145e-05, "loss": 0.5206, "step": 2447 }, { "epoch": 0.9022390122546762, "grad_norm": 6.571327593720628, "learning_rate": 4.274632216590432e-05, "loss": 0.3887, "step": 2448 }, { "epoch": 0.9026075739426886, "grad_norm": 5.901328322026785, "learning_rate": 4.274323154901719e-05, "loss": 0.4827, "step": 2449 }, { "epoch": 0.9029761356307012, "grad_norm": 2.9080114461576465, "learning_rate": 4.2740140932130055e-05, "loss": 0.201, "step": 2450 }, { "epoch": 0.9033446973187137, "grad_norm": 14.814235151041432, "learning_rate": 4.2737050315242926e-05, "loss": 0.1791, "step": 2451 }, { "epoch": 0.9037132590067263, "grad_norm": 5.055499853216212, "learning_rate": 4.27339596983558e-05, "loss": 0.3976, "step": 2452 }, { "epoch": 0.9040818206947387, "grad_norm": 5.413378299344727, "learning_rate": 4.273086908146866e-05, "loss": 0.2698, "step": 2453 }, { "epoch": 0.9044503823827513, "grad_norm": 10.06934384584417, "learning_rate": 4.272777846458153e-05, "loss": 0.4745, "step": 2454 }, { "epoch": 0.9048189440707638, "grad_norm": 11.05217920969449, "learning_rate": 4.27246878476944e-05, "loss": 0.2876, "step": 2455 }, { "epoch": 0.9051875057587764, "grad_norm": 17.438222781279382, "learning_rate": 4.272159723080727e-05, "loss": 0.2731, "step": 2456 }, { "epoch": 0.905556067446789, "grad_norm": 6.79932915239492, "learning_rate": 4.271850661392014e-05, "loss": 0.44, "step": 2457 }, { "epoch": 0.9059246291348014, "grad_norm": 4.714006275300683, "learning_rate": 4.271541599703301e-05, "loss": 0.2928, "step": 2458 }, { "epoch": 0.906293190822814, "grad_norm": 11.148904898882936, "learning_rate": 4.271232538014588e-05, "loss": 0.372, "step": 2459 }, { "epoch": 0.9066617525108265, "grad_norm": 15.327642186267083, "learning_rate": 4.270923476325875e-05, "loss": 0.2757, "step": 2460 }, { "epoch": 0.9070303141988391, "grad_norm": 3.9341324226188896, "learning_rate": 4.270614414637162e-05, "loss": 0.3708, "step": 2461 }, { "epoch": 0.9073988758868515, "grad_norm": 6.466152683294885, "learning_rate": 4.270305352948449e-05, "loss": 0.3505, "step": 2462 }, { "epoch": 0.9077674375748641, "grad_norm": 5.723627064091338, "learning_rate": 4.269996291259736e-05, "loss": 0.2999, "step": 2463 }, { "epoch": 0.9081359992628766, "grad_norm": 7.9857565921839475, "learning_rate": 4.2696872295710225e-05, "loss": 0.5057, "step": 2464 }, { "epoch": 0.9085045609508892, "grad_norm": 10.287533402417674, "learning_rate": 4.2693781678823096e-05, "loss": 0.3961, "step": 2465 }, { "epoch": 0.9088731226389016, "grad_norm": 9.168070275321966, "learning_rate": 4.269069106193597e-05, "loss": 0.3858, "step": 2466 }, { "epoch": 0.9092416843269142, "grad_norm": 10.777818039581849, "learning_rate": 4.268760044504884e-05, "loss": 0.2893, "step": 2467 }, { "epoch": 0.9096102460149268, "grad_norm": 4.519842270656281, "learning_rate": 4.26845098281617e-05, "loss": 0.2259, "step": 2468 }, { "epoch": 0.9099788077029393, "grad_norm": 7.129570281573942, "learning_rate": 4.268141921127457e-05, "loss": 0.4017, "step": 2469 }, { "epoch": 0.9103473693909518, "grad_norm": 14.166694072622082, "learning_rate": 4.267832859438744e-05, "loss": 0.3974, "step": 2470 }, { "epoch": 0.9107159310789643, "grad_norm": 6.883212369113025, "learning_rate": 4.267523797750031e-05, "loss": 0.3379, "step": 2471 }, { "epoch": 0.9110844927669769, "grad_norm": 6.526461511387249, "learning_rate": 4.267214736061318e-05, "loss": 0.2909, "step": 2472 }, { "epoch": 0.9114530544549894, "grad_norm": 7.1308326271410145, "learning_rate": 4.266905674372605e-05, "loss": 0.459, "step": 2473 }, { "epoch": 0.911821616143002, "grad_norm": 3.964240448250203, "learning_rate": 4.2665966126838916e-05, "loss": 0.3023, "step": 2474 }, { "epoch": 0.9121901778310144, "grad_norm": 8.580333988118193, "learning_rate": 4.266287550995179e-05, "loss": 0.3893, "step": 2475 }, { "epoch": 0.912558739519027, "grad_norm": 6.369391573559753, "learning_rate": 4.265978489306466e-05, "loss": 0.3652, "step": 2476 }, { "epoch": 0.9129273012070396, "grad_norm": 8.715839515472544, "learning_rate": 4.265669427617753e-05, "loss": 0.3231, "step": 2477 }, { "epoch": 0.913295862895052, "grad_norm": 6.261138755890003, "learning_rate": 4.2653603659290394e-05, "loss": 0.3071, "step": 2478 }, { "epoch": 0.9136644245830646, "grad_norm": 4.259046911084722, "learning_rate": 4.2650513042403265e-05, "loss": 0.1776, "step": 2479 }, { "epoch": 0.9140329862710771, "grad_norm": 5.7677233012756535, "learning_rate": 4.264742242551614e-05, "loss": 0.2304, "step": 2480 }, { "epoch": 0.9144015479590897, "grad_norm": 6.699744307919811, "learning_rate": 4.264433180862901e-05, "loss": 0.254, "step": 2481 }, { "epoch": 0.9147701096471021, "grad_norm": 4.6584315981705675, "learning_rate": 4.264124119174188e-05, "loss": 0.2403, "step": 2482 }, { "epoch": 0.9151386713351147, "grad_norm": 5.360102745893329, "learning_rate": 4.2638150574854743e-05, "loss": 0.2652, "step": 2483 }, { "epoch": 0.9155072330231272, "grad_norm": 7.46685741498803, "learning_rate": 4.263505995796761e-05, "loss": 0.4501, "step": 2484 }, { "epoch": 0.9158757947111398, "grad_norm": 7.046402401523718, "learning_rate": 4.263196934108048e-05, "loss": 0.4153, "step": 2485 }, { "epoch": 0.9162443563991524, "grad_norm": 4.265675526972957, "learning_rate": 4.262887872419335e-05, "loss": 0.1834, "step": 2486 }, { "epoch": 0.9166129180871648, "grad_norm": 7.710325268960696, "learning_rate": 4.262578810730622e-05, "loss": 0.3102, "step": 2487 }, { "epoch": 0.9169814797751774, "grad_norm": 4.743237148300676, "learning_rate": 4.2622697490419086e-05, "loss": 0.2566, "step": 2488 }, { "epoch": 0.9173500414631899, "grad_norm": 9.052883755172601, "learning_rate": 4.261960687353196e-05, "loss": 0.2867, "step": 2489 }, { "epoch": 0.9177186031512025, "grad_norm": 5.206076211764968, "learning_rate": 4.261651625664483e-05, "loss": 0.3424, "step": 2490 }, { "epoch": 0.9180871648392149, "grad_norm": 6.603950890065616, "learning_rate": 4.26134256397577e-05, "loss": 0.3336, "step": 2491 }, { "epoch": 0.9184557265272275, "grad_norm": 6.168491643347888, "learning_rate": 4.261033502287057e-05, "loss": 0.4959, "step": 2492 }, { "epoch": 0.91882428821524, "grad_norm": 5.41382311006873, "learning_rate": 4.2607244405983435e-05, "loss": 0.2405, "step": 2493 }, { "epoch": 0.9191928499032526, "grad_norm": 2.999333784516145, "learning_rate": 4.2604153789096306e-05, "loss": 0.2868, "step": 2494 }, { "epoch": 0.9195614115912651, "grad_norm": 5.231333472384645, "learning_rate": 4.260106317220918e-05, "loss": 0.3248, "step": 2495 }, { "epoch": 0.9199299732792776, "grad_norm": 4.211493234287712, "learning_rate": 4.259797255532205e-05, "loss": 0.35, "step": 2496 }, { "epoch": 0.9202985349672902, "grad_norm": 4.045374293792835, "learning_rate": 4.259488193843491e-05, "loss": 0.2261, "step": 2497 }, { "epoch": 0.9206670966553027, "grad_norm": 11.310880597789941, "learning_rate": 4.259179132154778e-05, "loss": 0.3639, "step": 2498 }, { "epoch": 0.9210356583433152, "grad_norm": 5.886358441196236, "learning_rate": 4.258870070466065e-05, "loss": 0.3625, "step": 2499 }, { "epoch": 0.9214042200313277, "grad_norm": 6.990941589853554, "learning_rate": 4.258561008777352e-05, "loss": 0.4662, "step": 2500 }, { "epoch": 0.9217727817193403, "grad_norm": 6.570194637951482, "learning_rate": 4.258251947088639e-05, "loss": 0.4608, "step": 2501 }, { "epoch": 0.9221413434073528, "grad_norm": 3.487527148506842, "learning_rate": 4.257942885399926e-05, "loss": 0.3689, "step": 2502 }, { "epoch": 0.9225099050953653, "grad_norm": 5.39776085948077, "learning_rate": 4.257633823711213e-05, "loss": 0.3387, "step": 2503 }, { "epoch": 0.9228784667833778, "grad_norm": 9.388539214334761, "learning_rate": 4.2573247620225e-05, "loss": 0.5184, "step": 2504 }, { "epoch": 0.9232470284713904, "grad_norm": 4.130733320328471, "learning_rate": 4.257015700333787e-05, "loss": 0.3106, "step": 2505 }, { "epoch": 0.923615590159403, "grad_norm": 3.5532958943277557, "learning_rate": 4.256706638645074e-05, "loss": 0.3016, "step": 2506 }, { "epoch": 0.9239841518474154, "grad_norm": 7.970638515114272, "learning_rate": 4.2563975769563605e-05, "loss": 0.352, "step": 2507 }, { "epoch": 0.924352713535428, "grad_norm": 8.97244197338831, "learning_rate": 4.2560885152676476e-05, "loss": 0.5145, "step": 2508 }, { "epoch": 0.9247212752234405, "grad_norm": 4.836875122086084, "learning_rate": 4.255779453578935e-05, "loss": 0.2713, "step": 2509 }, { "epoch": 0.9250898369114531, "grad_norm": 5.461805719385174, "learning_rate": 4.255470391890222e-05, "loss": 0.3833, "step": 2510 }, { "epoch": 0.9254583985994655, "grad_norm": 16.480983989190484, "learning_rate": 4.255161330201509e-05, "loss": 0.2376, "step": 2511 }, { "epoch": 0.9258269602874781, "grad_norm": 4.931176881042588, "learning_rate": 4.2548522685127954e-05, "loss": 0.2972, "step": 2512 }, { "epoch": 0.9261955219754906, "grad_norm": 7.221313021561321, "learning_rate": 4.254543206824082e-05, "loss": 0.3348, "step": 2513 }, { "epoch": 0.9265640836635032, "grad_norm": 14.324167085569496, "learning_rate": 4.254234145135369e-05, "loss": 0.4357, "step": 2514 }, { "epoch": 0.9269326453515158, "grad_norm": 5.050409644102366, "learning_rate": 4.253925083446656e-05, "loss": 0.501, "step": 2515 }, { "epoch": 0.9273012070395282, "grad_norm": 9.162388120959598, "learning_rate": 4.253616021757943e-05, "loss": 0.4162, "step": 2516 }, { "epoch": 0.9276697687275408, "grad_norm": 4.305424732695515, "learning_rate": 4.2533069600692296e-05, "loss": 0.2608, "step": 2517 }, { "epoch": 0.9280383304155533, "grad_norm": 4.631235404255425, "learning_rate": 4.252997898380517e-05, "loss": 0.398, "step": 2518 }, { "epoch": 0.9284068921035659, "grad_norm": 44.52729560293256, "learning_rate": 4.252688836691804e-05, "loss": 0.3354, "step": 2519 }, { "epoch": 0.9287754537915783, "grad_norm": 7.683267560278766, "learning_rate": 4.252379775003091e-05, "loss": 0.3783, "step": 2520 }, { "epoch": 0.9291440154795909, "grad_norm": 4.894750640409286, "learning_rate": 4.252070713314378e-05, "loss": 0.1584, "step": 2521 }, { "epoch": 0.9295125771676034, "grad_norm": 9.183018012844075, "learning_rate": 4.2517616516256645e-05, "loss": 0.4226, "step": 2522 }, { "epoch": 0.929881138855616, "grad_norm": 8.004269414345188, "learning_rate": 4.2514525899369517e-05, "loss": 0.4647, "step": 2523 }, { "epoch": 0.9302497005436285, "grad_norm": 23.141082614904683, "learning_rate": 4.251143528248239e-05, "loss": 0.2904, "step": 2524 }, { "epoch": 0.930618262231641, "grad_norm": 8.172922450198161, "learning_rate": 4.250834466559526e-05, "loss": 0.4507, "step": 2525 }, { "epoch": 0.9309868239196536, "grad_norm": 4.213507665898561, "learning_rate": 4.250525404870812e-05, "loss": 0.2914, "step": 2526 }, { "epoch": 0.9313553856076661, "grad_norm": 27.30527650473439, "learning_rate": 4.2502163431820995e-05, "loss": 0.4547, "step": 2527 }, { "epoch": 0.9317239472956786, "grad_norm": 15.274812118448667, "learning_rate": 4.249907281493386e-05, "loss": 0.2989, "step": 2528 }, { "epoch": 0.9320925089836911, "grad_norm": 6.426671077257176, "learning_rate": 4.249598219804673e-05, "loss": 0.3087, "step": 2529 }, { "epoch": 0.9324610706717037, "grad_norm": 26.729284518649322, "learning_rate": 4.24928915811596e-05, "loss": 0.3619, "step": 2530 }, { "epoch": 0.9328296323597162, "grad_norm": 8.926679556820456, "learning_rate": 4.248980096427247e-05, "loss": 0.3641, "step": 2531 }, { "epoch": 0.9331981940477287, "grad_norm": 5.688529455896793, "learning_rate": 4.248671034738534e-05, "loss": 0.2129, "step": 2532 }, { "epoch": 0.9335667557357412, "grad_norm": 3.223874059673659, "learning_rate": 4.248361973049821e-05, "loss": 0.2096, "step": 2533 }, { "epoch": 0.9339353174237538, "grad_norm": 4.8899315290591465, "learning_rate": 4.248052911361108e-05, "loss": 0.3569, "step": 2534 }, { "epoch": 0.9343038791117664, "grad_norm": 3.1618465246006795, "learning_rate": 4.247743849672395e-05, "loss": 0.1646, "step": 2535 }, { "epoch": 0.9346724407997788, "grad_norm": 5.3623383895601355, "learning_rate": 4.2474347879836815e-05, "loss": 0.3322, "step": 2536 }, { "epoch": 0.9350410024877914, "grad_norm": 9.464302050635538, "learning_rate": 4.2471257262949686e-05, "loss": 0.3549, "step": 2537 }, { "epoch": 0.9354095641758039, "grad_norm": 6.308817478617972, "learning_rate": 4.246816664606256e-05, "loss": 0.2678, "step": 2538 }, { "epoch": 0.9357781258638165, "grad_norm": 10.79033103739104, "learning_rate": 4.246507602917543e-05, "loss": 0.3405, "step": 2539 }, { "epoch": 0.936146687551829, "grad_norm": 4.5351394271867145, "learning_rate": 4.24619854122883e-05, "loss": 0.3171, "step": 2540 }, { "epoch": 0.9365152492398415, "grad_norm": 12.996918533153229, "learning_rate": 4.2458894795401164e-05, "loss": 0.3325, "step": 2541 }, { "epoch": 0.936883810927854, "grad_norm": 5.193001644787307, "learning_rate": 4.2455804178514035e-05, "loss": 0.3923, "step": 2542 }, { "epoch": 0.9372523726158666, "grad_norm": 4.5916639509152635, "learning_rate": 4.24527135616269e-05, "loss": 0.175, "step": 2543 }, { "epoch": 0.9376209343038792, "grad_norm": 6.7639640196436, "learning_rate": 4.244962294473977e-05, "loss": 0.3067, "step": 2544 }, { "epoch": 0.9379894959918916, "grad_norm": 5.189095860632185, "learning_rate": 4.244653232785264e-05, "loss": 0.2276, "step": 2545 }, { "epoch": 0.9383580576799042, "grad_norm": 9.259874589041967, "learning_rate": 4.2443441710965507e-05, "loss": 0.3778, "step": 2546 }, { "epoch": 0.9387266193679167, "grad_norm": 8.29306212780095, "learning_rate": 4.244035109407838e-05, "loss": 0.2881, "step": 2547 }, { "epoch": 0.9390951810559293, "grad_norm": 5.551017482382998, "learning_rate": 4.243726047719125e-05, "loss": 0.2937, "step": 2548 }, { "epoch": 0.9394637427439417, "grad_norm": 11.364570279671888, "learning_rate": 4.243416986030412e-05, "loss": 0.451, "step": 2549 }, { "epoch": 0.9398323044319543, "grad_norm": 5.340485049445836, "learning_rate": 4.2431079243416985e-05, "loss": 0.2349, "step": 2550 }, { "epoch": 0.9402008661199668, "grad_norm": 10.018294575240292, "learning_rate": 4.2427988626529856e-05, "loss": 0.2973, "step": 2551 }, { "epoch": 0.9405694278079794, "grad_norm": 4.870744265918659, "learning_rate": 4.242489800964273e-05, "loss": 0.1844, "step": 2552 }, { "epoch": 0.9409379894959919, "grad_norm": 11.355345125316054, "learning_rate": 4.24218073927556e-05, "loss": 0.4044, "step": 2553 }, { "epoch": 0.9413065511840044, "grad_norm": 19.65861518833529, "learning_rate": 4.241871677586847e-05, "loss": 0.3377, "step": 2554 }, { "epoch": 0.941675112872017, "grad_norm": 10.178449296338073, "learning_rate": 4.2415626158981334e-05, "loss": 0.2784, "step": 2555 }, { "epoch": 0.9420436745600295, "grad_norm": 5.621766241695712, "learning_rate": 4.2412535542094205e-05, "loss": 0.3848, "step": 2556 }, { "epoch": 0.942412236248042, "grad_norm": 18.55927081550148, "learning_rate": 4.2409444925207076e-05, "loss": 0.4681, "step": 2557 }, { "epoch": 0.9427807979360545, "grad_norm": 11.216683795192237, "learning_rate": 4.240635430831994e-05, "loss": 0.2872, "step": 2558 }, { "epoch": 0.9431493596240671, "grad_norm": 7.42186240747287, "learning_rate": 4.240326369143281e-05, "loss": 0.4411, "step": 2559 }, { "epoch": 0.9435179213120796, "grad_norm": 6.126086080366688, "learning_rate": 4.2400173074545676e-05, "loss": 0.3712, "step": 2560 }, { "epoch": 0.9438864830000921, "grad_norm": 8.552057511681355, "learning_rate": 4.239708245765855e-05, "loss": 0.274, "step": 2561 }, { "epoch": 0.9442550446881046, "grad_norm": 8.811853696241492, "learning_rate": 4.239399184077142e-05, "loss": 0.4454, "step": 2562 }, { "epoch": 0.9446236063761172, "grad_norm": 19.381041144875965, "learning_rate": 4.239090122388429e-05, "loss": 0.4379, "step": 2563 }, { "epoch": 0.9449921680641298, "grad_norm": 8.624080443083166, "learning_rate": 4.238781060699716e-05, "loss": 0.3805, "step": 2564 }, { "epoch": 0.9453607297521422, "grad_norm": 20.576684946290698, "learning_rate": 4.2384719990110025e-05, "loss": 0.3477, "step": 2565 }, { "epoch": 0.9457292914401548, "grad_norm": 13.789380086640158, "learning_rate": 4.2381629373222897e-05, "loss": 0.3951, "step": 2566 }, { "epoch": 0.9460978531281673, "grad_norm": 7.2642639854289435, "learning_rate": 4.237853875633577e-05, "loss": 0.3199, "step": 2567 }, { "epoch": 0.9464664148161799, "grad_norm": 11.133621475541753, "learning_rate": 4.237544813944864e-05, "loss": 0.3914, "step": 2568 }, { "epoch": 0.9468349765041923, "grad_norm": 6.677445027458224, "learning_rate": 4.23723575225615e-05, "loss": 0.2592, "step": 2569 }, { "epoch": 0.9472035381922049, "grad_norm": 4.916568474947507, "learning_rate": 4.2369266905674375e-05, "loss": 0.2427, "step": 2570 }, { "epoch": 0.9475720998802174, "grad_norm": 6.3987340986468215, "learning_rate": 4.2366176288787246e-05, "loss": 0.3998, "step": 2571 }, { "epoch": 0.94794066156823, "grad_norm": 6.194677640272698, "learning_rate": 4.236308567190012e-05, "loss": 0.3075, "step": 2572 }, { "epoch": 0.9483092232562426, "grad_norm": 10.364538594569481, "learning_rate": 4.235999505501299e-05, "loss": 0.3971, "step": 2573 }, { "epoch": 0.948677784944255, "grad_norm": 5.606473464509749, "learning_rate": 4.235690443812585e-05, "loss": 0.2376, "step": 2574 }, { "epoch": 0.9490463466322676, "grad_norm": 6.781168150517704, "learning_rate": 4.235381382123872e-05, "loss": 0.3199, "step": 2575 }, { "epoch": 0.9494149083202801, "grad_norm": 6.144336927752113, "learning_rate": 4.235072320435159e-05, "loss": 0.3757, "step": 2576 }, { "epoch": 0.9497834700082927, "grad_norm": 5.242604087047332, "learning_rate": 4.234763258746446e-05, "loss": 0.3788, "step": 2577 }, { "epoch": 0.9501520316963051, "grad_norm": 13.02710656174631, "learning_rate": 4.234454197057733e-05, "loss": 0.6222, "step": 2578 }, { "epoch": 0.9505205933843177, "grad_norm": 13.162672636198712, "learning_rate": 4.2341451353690195e-05, "loss": 0.3497, "step": 2579 }, { "epoch": 0.9508891550723302, "grad_norm": 6.820780070599284, "learning_rate": 4.2338360736803066e-05, "loss": 0.3905, "step": 2580 }, { "epoch": 0.9512577167603428, "grad_norm": 17.03405850553302, "learning_rate": 4.233527011991594e-05, "loss": 0.4733, "step": 2581 }, { "epoch": 0.9516262784483553, "grad_norm": 7.8951690582284435, "learning_rate": 4.233217950302881e-05, "loss": 0.3626, "step": 2582 }, { "epoch": 0.9519948401363678, "grad_norm": 5.2143989744781765, "learning_rate": 4.232908888614168e-05, "loss": 0.2704, "step": 2583 }, { "epoch": 0.9523634018243804, "grad_norm": 7.637504455265425, "learning_rate": 4.2325998269254544e-05, "loss": 0.3869, "step": 2584 }, { "epoch": 0.9527319635123929, "grad_norm": 10.985001135137168, "learning_rate": 4.2322907652367415e-05, "loss": 0.5274, "step": 2585 }, { "epoch": 0.9531005252004054, "grad_norm": 7.151407660786911, "learning_rate": 4.2319817035480286e-05, "loss": 0.5982, "step": 2586 }, { "epoch": 0.9534690868884179, "grad_norm": 3.421535666724015, "learning_rate": 4.231672641859316e-05, "loss": 0.3361, "step": 2587 }, { "epoch": 0.9538376485764305, "grad_norm": 26.433038141142493, "learning_rate": 4.231363580170602e-05, "loss": 0.3485, "step": 2588 }, { "epoch": 0.954206210264443, "grad_norm": 34.96987228706215, "learning_rate": 4.2310545184818887e-05, "loss": 0.354, "step": 2589 }, { "epoch": 0.9545747719524555, "grad_norm": 7.657183158215365, "learning_rate": 4.230745456793176e-05, "loss": 0.3669, "step": 2590 }, { "epoch": 0.9549433336404681, "grad_norm": 4.797273203074484, "learning_rate": 4.230436395104463e-05, "loss": 0.2691, "step": 2591 }, { "epoch": 0.9553118953284806, "grad_norm": 6.088218016842487, "learning_rate": 4.23012733341575e-05, "loss": 0.3507, "step": 2592 }, { "epoch": 0.9556804570164932, "grad_norm": 6.187136860989992, "learning_rate": 4.229818271727037e-05, "loss": 0.2879, "step": 2593 }, { "epoch": 0.9560490187045056, "grad_norm": 7.852167605336435, "learning_rate": 4.2295092100383236e-05, "loss": 0.4395, "step": 2594 }, { "epoch": 0.9564175803925182, "grad_norm": 4.547375418795208, "learning_rate": 4.229200148349611e-05, "loss": 0.3514, "step": 2595 }, { "epoch": 0.9567861420805307, "grad_norm": 14.982010799055239, "learning_rate": 4.228891086660898e-05, "loss": 0.2005, "step": 2596 }, { "epoch": 0.9571547037685433, "grad_norm": 4.418200148670376, "learning_rate": 4.228582024972185e-05, "loss": 0.2984, "step": 2597 }, { "epoch": 0.9575232654565557, "grad_norm": 5.882932137416448, "learning_rate": 4.2282729632834714e-05, "loss": 0.3352, "step": 2598 }, { "epoch": 0.9578918271445683, "grad_norm": 6.541695305180478, "learning_rate": 4.2279639015947585e-05, "loss": 0.3703, "step": 2599 }, { "epoch": 0.9582603888325808, "grad_norm": 6.070845689763166, "learning_rate": 4.2276548399060456e-05, "loss": 0.4117, "step": 2600 }, { "epoch": 0.9586289505205934, "grad_norm": 4.339732555815607, "learning_rate": 4.227345778217333e-05, "loss": 0.2951, "step": 2601 }, { "epoch": 0.958997512208606, "grad_norm": 9.374181279354646, "learning_rate": 4.22703671652862e-05, "loss": 0.33, "step": 2602 }, { "epoch": 0.9593660738966184, "grad_norm": 9.642274885410616, "learning_rate": 4.226727654839906e-05, "loss": 0.3431, "step": 2603 }, { "epoch": 0.959734635584631, "grad_norm": 9.897135973405348, "learning_rate": 4.226418593151193e-05, "loss": 0.3597, "step": 2604 }, { "epoch": 0.9601031972726435, "grad_norm": 4.97639711784903, "learning_rate": 4.22610953146248e-05, "loss": 0.2519, "step": 2605 }, { "epoch": 0.9604717589606561, "grad_norm": 6.510297687714903, "learning_rate": 4.225800469773767e-05, "loss": 0.393, "step": 2606 }, { "epoch": 0.9608403206486685, "grad_norm": 7.090118957726788, "learning_rate": 4.225491408085054e-05, "loss": 0.4325, "step": 2607 }, { "epoch": 0.9612088823366811, "grad_norm": 15.095787400503855, "learning_rate": 4.2251823463963405e-05, "loss": 0.3537, "step": 2608 }, { "epoch": 0.9615774440246936, "grad_norm": 17.271628269112504, "learning_rate": 4.2248732847076276e-05, "loss": 0.4463, "step": 2609 }, { "epoch": 0.9619460057127062, "grad_norm": 5.524904220924615, "learning_rate": 4.224564223018915e-05, "loss": 0.3764, "step": 2610 }, { "epoch": 0.9623145674007187, "grad_norm": 6.6903959759815175, "learning_rate": 4.224255161330202e-05, "loss": 0.2862, "step": 2611 }, { "epoch": 0.9626831290887312, "grad_norm": 7.8380480706993465, "learning_rate": 4.223946099641489e-05, "loss": 0.579, "step": 2612 }, { "epoch": 0.9630516907767438, "grad_norm": 6.272637227132072, "learning_rate": 4.2236370379527754e-05, "loss": 0.3993, "step": 2613 }, { "epoch": 0.9634202524647563, "grad_norm": 5.6569233699016515, "learning_rate": 4.2233279762640626e-05, "loss": 0.3032, "step": 2614 }, { "epoch": 0.9637888141527688, "grad_norm": 8.874090873467907, "learning_rate": 4.22301891457535e-05, "loss": 0.2886, "step": 2615 }, { "epoch": 0.9641573758407813, "grad_norm": 7.184778378227006, "learning_rate": 4.222709852886637e-05, "loss": 0.3495, "step": 2616 }, { "epoch": 0.9645259375287939, "grad_norm": 5.104923280443963, "learning_rate": 4.222400791197923e-05, "loss": 0.4139, "step": 2617 }, { "epoch": 0.9648944992168064, "grad_norm": 3.707069195793343, "learning_rate": 4.2220917295092104e-05, "loss": 0.2685, "step": 2618 }, { "epoch": 0.965263060904819, "grad_norm": 9.736970460569346, "learning_rate": 4.221782667820497e-05, "loss": 0.4636, "step": 2619 }, { "epoch": 0.9656316225928315, "grad_norm": 5.2487775196753965, "learning_rate": 4.221473606131784e-05, "loss": 0.2808, "step": 2620 }, { "epoch": 0.966000184280844, "grad_norm": 5.810430999369797, "learning_rate": 4.221164544443071e-05, "loss": 0.3377, "step": 2621 }, { "epoch": 0.9663687459688566, "grad_norm": 8.306537985419837, "learning_rate": 4.220855482754358e-05, "loss": 0.3366, "step": 2622 }, { "epoch": 0.966737307656869, "grad_norm": 9.934865926328403, "learning_rate": 4.2205464210656446e-05, "loss": 0.3732, "step": 2623 }, { "epoch": 0.9671058693448816, "grad_norm": 10.11604685594621, "learning_rate": 4.220237359376932e-05, "loss": 0.4989, "step": 2624 }, { "epoch": 0.9674744310328941, "grad_norm": 5.130188153480563, "learning_rate": 4.219928297688219e-05, "loss": 0.2888, "step": 2625 }, { "epoch": 0.9678429927209067, "grad_norm": 13.92127984312876, "learning_rate": 4.219619235999506e-05, "loss": 0.4026, "step": 2626 }, { "epoch": 0.9682115544089192, "grad_norm": 19.255703167949683, "learning_rate": 4.2193101743107924e-05, "loss": 0.241, "step": 2627 }, { "epoch": 0.9685801160969317, "grad_norm": 8.877346534530497, "learning_rate": 4.2190011126220795e-05, "loss": 0.449, "step": 2628 }, { "epoch": 0.9689486777849442, "grad_norm": 6.736597955508415, "learning_rate": 4.2186920509333666e-05, "loss": 0.3072, "step": 2629 }, { "epoch": 0.9693172394729568, "grad_norm": 9.72666368546766, "learning_rate": 4.218382989244654e-05, "loss": 0.3711, "step": 2630 }, { "epoch": 0.9696858011609694, "grad_norm": 7.861915253696063, "learning_rate": 4.218073927555941e-05, "loss": 0.4224, "step": 2631 }, { "epoch": 0.9700543628489818, "grad_norm": 6.380830081046929, "learning_rate": 4.217764865867227e-05, "loss": 0.4417, "step": 2632 }, { "epoch": 0.9704229245369944, "grad_norm": 7.683056795628516, "learning_rate": 4.2174558041785144e-05, "loss": 0.4631, "step": 2633 }, { "epoch": 0.9707914862250069, "grad_norm": 13.791378390861512, "learning_rate": 4.217146742489801e-05, "loss": 0.3708, "step": 2634 }, { "epoch": 0.9711600479130195, "grad_norm": 5.072672854392969, "learning_rate": 4.216837680801088e-05, "loss": 0.225, "step": 2635 }, { "epoch": 0.9715286096010319, "grad_norm": 2.6522859502458136, "learning_rate": 4.216528619112375e-05, "loss": 0.1068, "step": 2636 }, { "epoch": 0.9718971712890445, "grad_norm": 6.063331782903712, "learning_rate": 4.2162195574236616e-05, "loss": 0.2267, "step": 2637 }, { "epoch": 0.972265732977057, "grad_norm": 4.629250712394299, "learning_rate": 4.215910495734949e-05, "loss": 0.2528, "step": 2638 }, { "epoch": 0.9726342946650696, "grad_norm": 6.140235480847124, "learning_rate": 4.215601434046236e-05, "loss": 0.4736, "step": 2639 }, { "epoch": 0.9730028563530821, "grad_norm": 7.370116087621655, "learning_rate": 4.215292372357523e-05, "loss": 0.2849, "step": 2640 }, { "epoch": 0.9733714180410946, "grad_norm": 3.796713240831531, "learning_rate": 4.2149833106688094e-05, "loss": 0.3001, "step": 2641 }, { "epoch": 0.9737399797291072, "grad_norm": 5.382235248324858, "learning_rate": 4.2146742489800965e-05, "loss": 0.2554, "step": 2642 }, { "epoch": 0.9741085414171197, "grad_norm": 7.131134069705687, "learning_rate": 4.2143651872913836e-05, "loss": 0.3963, "step": 2643 }, { "epoch": 0.9744771031051322, "grad_norm": 3.9493060456088136, "learning_rate": 4.214056125602671e-05, "loss": 0.2153, "step": 2644 }, { "epoch": 0.9748456647931447, "grad_norm": 7.171272867560934, "learning_rate": 4.213747063913958e-05, "loss": 0.2927, "step": 2645 }, { "epoch": 0.9752142264811573, "grad_norm": 7.081344983641437, "learning_rate": 4.213438002225244e-05, "loss": 0.4957, "step": 2646 }, { "epoch": 0.9755827881691698, "grad_norm": 35.57637683726643, "learning_rate": 4.2131289405365314e-05, "loss": 0.3003, "step": 2647 }, { "epoch": 0.9759513498571823, "grad_norm": 10.987994491681308, "learning_rate": 4.2128198788478185e-05, "loss": 0.4821, "step": 2648 }, { "epoch": 0.9763199115451949, "grad_norm": 6.1322993167056, "learning_rate": 4.212510817159105e-05, "loss": 0.4395, "step": 2649 }, { "epoch": 0.9766884732332074, "grad_norm": 5.84502735559019, "learning_rate": 4.212201755470392e-05, "loss": 0.2971, "step": 2650 }, { "epoch": 0.97705703492122, "grad_norm": 4.891243112521705, "learning_rate": 4.2118926937816785e-05, "loss": 0.3543, "step": 2651 }, { "epoch": 0.9774255966092324, "grad_norm": 5.8057195020744485, "learning_rate": 4.2115836320929656e-05, "loss": 0.2642, "step": 2652 }, { "epoch": 0.977794158297245, "grad_norm": 4.570728146163674, "learning_rate": 4.211274570404253e-05, "loss": 0.2138, "step": 2653 }, { "epoch": 0.9781627199852575, "grad_norm": 8.473854694228963, "learning_rate": 4.21096550871554e-05, "loss": 0.3968, "step": 2654 }, { "epoch": 0.9785312816732701, "grad_norm": 10.631643215579562, "learning_rate": 4.210656447026827e-05, "loss": 0.1866, "step": 2655 }, { "epoch": 0.9788998433612826, "grad_norm": 6.545182111638431, "learning_rate": 4.2103473853381134e-05, "loss": 0.2553, "step": 2656 }, { "epoch": 0.9792684050492951, "grad_norm": 4.064077217870312, "learning_rate": 4.2100383236494006e-05, "loss": 0.2838, "step": 2657 }, { "epoch": 0.9796369667373076, "grad_norm": 3.36549437543019, "learning_rate": 4.209729261960688e-05, "loss": 0.2537, "step": 2658 }, { "epoch": 0.9800055284253202, "grad_norm": 4.763216204118572, "learning_rate": 4.209420200271975e-05, "loss": 0.3549, "step": 2659 }, { "epoch": 0.9803740901133328, "grad_norm": 15.150189066799319, "learning_rate": 4.209111138583261e-05, "loss": 0.3426, "step": 2660 }, { "epoch": 0.9807426518013452, "grad_norm": 12.121991915039427, "learning_rate": 4.2088020768945484e-05, "loss": 0.2835, "step": 2661 }, { "epoch": 0.9811112134893578, "grad_norm": 6.026293363290527, "learning_rate": 4.2084930152058355e-05, "loss": 0.2673, "step": 2662 }, { "epoch": 0.9814797751773703, "grad_norm": 5.717636583679251, "learning_rate": 4.2081839535171226e-05, "loss": 0.2836, "step": 2663 }, { "epoch": 0.9818483368653829, "grad_norm": 18.10950138814826, "learning_rate": 4.207874891828409e-05, "loss": 0.4235, "step": 2664 }, { "epoch": 0.9822168985533953, "grad_norm": 4.896438799274332, "learning_rate": 4.207565830139696e-05, "loss": 0.2166, "step": 2665 }, { "epoch": 0.9825854602414079, "grad_norm": 8.586382228728933, "learning_rate": 4.2072567684509826e-05, "loss": 0.3448, "step": 2666 }, { "epoch": 0.9829540219294204, "grad_norm": 7.935701421781446, "learning_rate": 4.20694770676227e-05, "loss": 0.298, "step": 2667 }, { "epoch": 0.983322583617433, "grad_norm": 8.792668964930698, "learning_rate": 4.206638645073557e-05, "loss": 0.3849, "step": 2668 }, { "epoch": 0.9836911453054455, "grad_norm": 3.8401710782089005, "learning_rate": 4.206329583384844e-05, "loss": 0.1852, "step": 2669 }, { "epoch": 0.984059706993458, "grad_norm": 5.46939815494979, "learning_rate": 4.2060205216961304e-05, "loss": 0.2053, "step": 2670 }, { "epoch": 0.9844282686814706, "grad_norm": 7.468170781998866, "learning_rate": 4.2057114600074175e-05, "loss": 0.2871, "step": 2671 }, { "epoch": 0.9847968303694831, "grad_norm": 4.1410354122959, "learning_rate": 4.2054023983187046e-05, "loss": 0.2489, "step": 2672 }, { "epoch": 0.9851653920574956, "grad_norm": 7.427640903796572, "learning_rate": 4.205093336629992e-05, "loss": 0.4104, "step": 2673 }, { "epoch": 0.9855339537455081, "grad_norm": 6.024925592366655, "learning_rate": 4.204784274941279e-05, "loss": 0.4747, "step": 2674 }, { "epoch": 0.9859025154335207, "grad_norm": 6.6373096043637805, "learning_rate": 4.204475213252565e-05, "loss": 0.3751, "step": 2675 }, { "epoch": 0.9862710771215332, "grad_norm": 5.44490756878546, "learning_rate": 4.2041661515638524e-05, "loss": 0.2643, "step": 2676 }, { "epoch": 0.9866396388095457, "grad_norm": 4.743187687331786, "learning_rate": 4.2038570898751396e-05, "loss": 0.3172, "step": 2677 }, { "epoch": 0.9870082004975583, "grad_norm": 8.18948177202538, "learning_rate": 4.203548028186427e-05, "loss": 0.4588, "step": 2678 }, { "epoch": 0.9873767621855708, "grad_norm": 7.11151158013159, "learning_rate": 4.203238966497713e-05, "loss": 0.3905, "step": 2679 }, { "epoch": 0.9877453238735834, "grad_norm": 7.413937212867256, "learning_rate": 4.2029299048089996e-05, "loss": 0.364, "step": 2680 }, { "epoch": 0.9881138855615959, "grad_norm": 6.336004329756982, "learning_rate": 4.202620843120287e-05, "loss": 0.3699, "step": 2681 }, { "epoch": 0.9884824472496084, "grad_norm": 3.6870952319393173, "learning_rate": 4.202311781431574e-05, "loss": 0.16, "step": 2682 }, { "epoch": 0.9888510089376209, "grad_norm": 8.508406128853725, "learning_rate": 4.202002719742861e-05, "loss": 0.3643, "step": 2683 }, { "epoch": 0.9892195706256335, "grad_norm": 4.8329437800428305, "learning_rate": 4.201693658054148e-05, "loss": 0.2604, "step": 2684 }, { "epoch": 0.989588132313646, "grad_norm": 9.588693684221235, "learning_rate": 4.2013845963654345e-05, "loss": 0.2373, "step": 2685 }, { "epoch": 0.9899566940016585, "grad_norm": 5.371266332454588, "learning_rate": 4.2010755346767216e-05, "loss": 0.3072, "step": 2686 }, { "epoch": 0.9903252556896711, "grad_norm": 6.104746750836246, "learning_rate": 4.200766472988009e-05, "loss": 0.3484, "step": 2687 }, { "epoch": 0.9906938173776836, "grad_norm": 12.401950658140562, "learning_rate": 4.200457411299296e-05, "loss": 0.3823, "step": 2688 }, { "epoch": 0.9910623790656962, "grad_norm": 3.501479517350459, "learning_rate": 4.200148349610582e-05, "loss": 0.3622, "step": 2689 }, { "epoch": 0.9914309407537086, "grad_norm": 12.883597753129873, "learning_rate": 4.1998392879218694e-05, "loss": 0.2882, "step": 2690 }, { "epoch": 0.9917995024417212, "grad_norm": 9.078199796647548, "learning_rate": 4.1995302262331565e-05, "loss": 0.3765, "step": 2691 }, { "epoch": 0.9921680641297337, "grad_norm": 4.3067668455142885, "learning_rate": 4.1992211645444436e-05, "loss": 0.2246, "step": 2692 }, { "epoch": 0.9925366258177463, "grad_norm": 4.720881706903508, "learning_rate": 4.198912102855731e-05, "loss": 0.3807, "step": 2693 }, { "epoch": 0.9929051875057587, "grad_norm": 10.439718227749477, "learning_rate": 4.198603041167017e-05, "loss": 0.4246, "step": 2694 }, { "epoch": 0.9932737491937713, "grad_norm": 6.659000025645948, "learning_rate": 4.1982939794783036e-05, "loss": 0.3613, "step": 2695 }, { "epoch": 0.9936423108817838, "grad_norm": 7.416778163571883, "learning_rate": 4.197984917789591e-05, "loss": 0.2819, "step": 2696 }, { "epoch": 0.9940108725697964, "grad_norm": 4.7616891085521225, "learning_rate": 4.197675856100878e-05, "loss": 0.3273, "step": 2697 }, { "epoch": 0.994379434257809, "grad_norm": 6.974746382282869, "learning_rate": 4.197366794412165e-05, "loss": 0.4316, "step": 2698 }, { "epoch": 0.9947479959458214, "grad_norm": 5.1280335543125375, "learning_rate": 4.1970577327234514e-05, "loss": 0.2461, "step": 2699 }, { "epoch": 0.995116557633834, "grad_norm": 5.481272980349312, "learning_rate": 4.1967486710347386e-05, "loss": 0.276, "step": 2700 }, { "epoch": 0.9954851193218465, "grad_norm": 7.475646597667834, "learning_rate": 4.196439609346026e-05, "loss": 0.2541, "step": 2701 }, { "epoch": 0.995853681009859, "grad_norm": 4.145515519169809, "learning_rate": 4.196130547657313e-05, "loss": 0.2742, "step": 2702 }, { "epoch": 0.9962222426978715, "grad_norm": 8.020488728308143, "learning_rate": 4.1958214859686e-05, "loss": 0.4049, "step": 2703 }, { "epoch": 0.9965908043858841, "grad_norm": 3.200586718177805, "learning_rate": 4.1955124242798864e-05, "loss": 0.2426, "step": 2704 }, { "epoch": 0.9969593660738966, "grad_norm": 6.96395750285443, "learning_rate": 4.1952033625911735e-05, "loss": 0.3167, "step": 2705 }, { "epoch": 0.9973279277619091, "grad_norm": 5.026033908133044, "learning_rate": 4.1948943009024606e-05, "loss": 0.3061, "step": 2706 }, { "epoch": 0.9976964894499217, "grad_norm": 10.374473466496083, "learning_rate": 4.194585239213748e-05, "loss": 0.3253, "step": 2707 }, { "epoch": 0.9980650511379342, "grad_norm": 20.769127543918326, "learning_rate": 4.194276177525034e-05, "loss": 0.4423, "step": 2708 }, { "epoch": 0.9984336128259468, "grad_norm": 8.204770110336826, "learning_rate": 4.193967115836321e-05, "loss": 0.5034, "step": 2709 }, { "epoch": 0.9988021745139593, "grad_norm": 3.940524196318614, "learning_rate": 4.193658054147608e-05, "loss": 0.2773, "step": 2710 }, { "epoch": 0.9991707362019718, "grad_norm": 4.952648055642711, "learning_rate": 4.193348992458895e-05, "loss": 0.2355, "step": 2711 }, { "epoch": 0.9995392978899843, "grad_norm": 6.895785728205754, "learning_rate": 4.193039930770182e-05, "loss": 0.4185, "step": 2712 }, { "epoch": 0.9999078595779969, "grad_norm": 8.599844252484587, "learning_rate": 4.1927308690814684e-05, "loss": 0.6176, "step": 2713 }, { "epoch": 1.0002764212660094, "grad_norm": 4.881247405419045, "learning_rate": 4.1924218073927555e-05, "loss": 0.2984, "step": 2714 }, { "epoch": 1.0006449829540218, "grad_norm": 3.4110823484762043, "learning_rate": 4.1921127457040426e-05, "loss": 0.1576, "step": 2715 }, { "epoch": 1.0010135446420345, "grad_norm": 3.993166092526691, "learning_rate": 4.19180368401533e-05, "loss": 0.2551, "step": 2716 }, { "epoch": 1.001382106330047, "grad_norm": 8.847901572895822, "learning_rate": 4.191494622326617e-05, "loss": 0.3029, "step": 2717 }, { "epoch": 1.0017506680180595, "grad_norm": 6.333719676765593, "learning_rate": 4.191185560637903e-05, "loss": 0.2959, "step": 2718 }, { "epoch": 1.0021192297060721, "grad_norm": 6.649570606107245, "learning_rate": 4.1908764989491904e-05, "loss": 0.3365, "step": 2719 }, { "epoch": 1.0024877913940846, "grad_norm": 3.427210162906912, "learning_rate": 4.1905674372604776e-05, "loss": 0.2116, "step": 2720 }, { "epoch": 1.002856353082097, "grad_norm": 3.0906161995695927, "learning_rate": 4.190258375571765e-05, "loss": 0.1721, "step": 2721 }, { "epoch": 1.0032249147701096, "grad_norm": 2.473587897156868, "learning_rate": 4.189949313883052e-05, "loss": 0.2063, "step": 2722 }, { "epoch": 1.0035934764581222, "grad_norm": 3.250573914545827, "learning_rate": 4.189640252194338e-05, "loss": 0.1818, "step": 2723 }, { "epoch": 1.0039620381461347, "grad_norm": 5.861529877192532, "learning_rate": 4.1893311905056253e-05, "loss": 0.2022, "step": 2724 }, { "epoch": 1.0043305998341472, "grad_norm": 7.055855925700175, "learning_rate": 4.189022128816912e-05, "loss": 0.2989, "step": 2725 }, { "epoch": 1.0046991615221599, "grad_norm": 25.53672509569857, "learning_rate": 4.188713067128199e-05, "loss": 0.2823, "step": 2726 }, { "epoch": 1.0050677232101723, "grad_norm": 12.070525508840545, "learning_rate": 4.188404005439486e-05, "loss": 0.3083, "step": 2727 }, { "epoch": 1.0054362848981848, "grad_norm": 3.1140582982794345, "learning_rate": 4.1880949437507725e-05, "loss": 0.1661, "step": 2728 }, { "epoch": 1.0058048465861973, "grad_norm": 10.150955651305232, "learning_rate": 4.1877858820620596e-05, "loss": 0.174, "step": 2729 }, { "epoch": 1.00617340827421, "grad_norm": 6.534776821397097, "learning_rate": 4.187476820373347e-05, "loss": 0.249, "step": 2730 }, { "epoch": 1.0065419699622224, "grad_norm": 14.149430204855259, "learning_rate": 4.187167758684634e-05, "loss": 0.2379, "step": 2731 }, { "epoch": 1.006910531650235, "grad_norm": 2.614560485577903, "learning_rate": 4.18685869699592e-05, "loss": 0.0815, "step": 2732 }, { "epoch": 1.0072790933382474, "grad_norm": 19.133080464640457, "learning_rate": 4.1865496353072074e-05, "loss": 0.2282, "step": 2733 }, { "epoch": 1.00764765502626, "grad_norm": 5.636415554042442, "learning_rate": 4.1862405736184945e-05, "loss": 0.23, "step": 2734 }, { "epoch": 1.0080162167142726, "grad_norm": 4.370759052897601, "learning_rate": 4.1859315119297816e-05, "loss": 0.1912, "step": 2735 }, { "epoch": 1.008384778402285, "grad_norm": 5.142764681030343, "learning_rate": 4.185622450241069e-05, "loss": 0.2085, "step": 2736 }, { "epoch": 1.0087533400902977, "grad_norm": 8.931850541828524, "learning_rate": 4.185313388552355e-05, "loss": 0.3728, "step": 2737 }, { "epoch": 1.0091219017783102, "grad_norm": 9.47876252930276, "learning_rate": 4.185004326863642e-05, "loss": 0.2721, "step": 2738 }, { "epoch": 1.0094904634663227, "grad_norm": 10.451124242147573, "learning_rate": 4.1846952651749294e-05, "loss": 0.1761, "step": 2739 }, { "epoch": 1.0098590251543351, "grad_norm": 4.820800756521891, "learning_rate": 4.184386203486216e-05, "loss": 0.1987, "step": 2740 }, { "epoch": 1.0102275868423478, "grad_norm": 8.333156532955227, "learning_rate": 4.184077141797503e-05, "loss": 0.2347, "step": 2741 }, { "epoch": 1.0105961485303603, "grad_norm": 5.401138623604886, "learning_rate": 4.1837680801087894e-05, "loss": 0.3756, "step": 2742 }, { "epoch": 1.0109647102183728, "grad_norm": 15.710798614813774, "learning_rate": 4.1834590184200765e-05, "loss": 0.2399, "step": 2743 }, { "epoch": 1.0113332719063852, "grad_norm": 5.620724557889673, "learning_rate": 4.183149956731364e-05, "loss": 0.1804, "step": 2744 }, { "epoch": 1.011701833594398, "grad_norm": 7.452790258795799, "learning_rate": 4.182840895042651e-05, "loss": 0.2605, "step": 2745 }, { "epoch": 1.0120703952824104, "grad_norm": 16.271196362120627, "learning_rate": 4.182531833353938e-05, "loss": 0.2721, "step": 2746 }, { "epoch": 1.0124389569704229, "grad_norm": 4.785827838713658, "learning_rate": 4.1822227716652243e-05, "loss": 0.2926, "step": 2747 }, { "epoch": 1.0128075186584355, "grad_norm": 5.884790257789468, "learning_rate": 4.1819137099765115e-05, "loss": 0.2887, "step": 2748 }, { "epoch": 1.013176080346448, "grad_norm": 5.055480800443933, "learning_rate": 4.1816046482877986e-05, "loss": 0.3242, "step": 2749 }, { "epoch": 1.0135446420344605, "grad_norm": 3.894454113532117, "learning_rate": 4.181295586599086e-05, "loss": 0.2109, "step": 2750 }, { "epoch": 1.013913203722473, "grad_norm": 4.5119316017837425, "learning_rate": 4.180986524910372e-05, "loss": 0.1783, "step": 2751 }, { "epoch": 1.0142817654104856, "grad_norm": 5.0491174965429835, "learning_rate": 4.180677463221659e-05, "loss": 0.2885, "step": 2752 }, { "epoch": 1.0146503270984981, "grad_norm": 6.543046874533587, "learning_rate": 4.1803684015329464e-05, "loss": 0.2333, "step": 2753 }, { "epoch": 1.0150188887865106, "grad_norm": 11.478026917929359, "learning_rate": 4.1800593398442335e-05, "loss": 0.2925, "step": 2754 }, { "epoch": 1.0153874504745233, "grad_norm": 4.225040972810066, "learning_rate": 4.17975027815552e-05, "loss": 0.3857, "step": 2755 }, { "epoch": 1.0157560121625357, "grad_norm": 4.034763431539243, "learning_rate": 4.179441216466807e-05, "loss": 0.1822, "step": 2756 }, { "epoch": 1.0161245738505482, "grad_norm": 4.3690568065831314, "learning_rate": 4.1791321547780935e-05, "loss": 0.2042, "step": 2757 }, { "epoch": 1.0164931355385607, "grad_norm": 4.884147083239088, "learning_rate": 4.1788230930893806e-05, "loss": 0.2145, "step": 2758 }, { "epoch": 1.0168616972265734, "grad_norm": 11.76996490965922, "learning_rate": 4.178514031400668e-05, "loss": 0.3793, "step": 2759 }, { "epoch": 1.0172302589145858, "grad_norm": 6.306609474121736, "learning_rate": 4.178204969711955e-05, "loss": 0.1978, "step": 2760 }, { "epoch": 1.0175988206025983, "grad_norm": 5.620670262886942, "learning_rate": 4.177895908023241e-05, "loss": 0.2665, "step": 2761 }, { "epoch": 1.0179673822906108, "grad_norm": 5.044514484918227, "learning_rate": 4.1775868463345284e-05, "loss": 0.2776, "step": 2762 }, { "epoch": 1.0183359439786235, "grad_norm": 10.036173815308015, "learning_rate": 4.1772777846458155e-05, "loss": 0.4004, "step": 2763 }, { "epoch": 1.018704505666636, "grad_norm": 3.74617305031257, "learning_rate": 4.176968722957103e-05, "loss": 0.218, "step": 2764 }, { "epoch": 1.0190730673546484, "grad_norm": 15.07796864799117, "learning_rate": 4.17665966126839e-05, "loss": 0.3249, "step": 2765 }, { "epoch": 1.0194416290426611, "grad_norm": 7.083023902267848, "learning_rate": 4.176350599579676e-05, "loss": 0.2472, "step": 2766 }, { "epoch": 1.0198101907306736, "grad_norm": 8.95233246655847, "learning_rate": 4.1760415378909633e-05, "loss": 0.2157, "step": 2767 }, { "epoch": 1.020178752418686, "grad_norm": 3.8096297669434547, "learning_rate": 4.1757324762022505e-05, "loss": 0.1946, "step": 2768 }, { "epoch": 1.0205473141066985, "grad_norm": 3.005821460784684, "learning_rate": 4.1754234145135376e-05, "loss": 0.1419, "step": 2769 }, { "epoch": 1.0209158757947112, "grad_norm": 3.182113284508321, "learning_rate": 4.175114352824824e-05, "loss": 0.1788, "step": 2770 }, { "epoch": 1.0212844374827237, "grad_norm": 3.382723976721356, "learning_rate": 4.1748052911361105e-05, "loss": 0.1416, "step": 2771 }, { "epoch": 1.0216529991707362, "grad_norm": 5.526373658797457, "learning_rate": 4.1744962294473976e-05, "loss": 0.2411, "step": 2772 }, { "epoch": 1.0220215608587488, "grad_norm": 3.2829383502787493, "learning_rate": 4.174187167758685e-05, "loss": 0.1981, "step": 2773 }, { "epoch": 1.0223901225467613, "grad_norm": 14.342821278144578, "learning_rate": 4.173878106069972e-05, "loss": 0.372, "step": 2774 }, { "epoch": 1.0227586842347738, "grad_norm": 10.025647651889814, "learning_rate": 4.173569044381259e-05, "loss": 0.189, "step": 2775 }, { "epoch": 1.0231272459227863, "grad_norm": 3.787085200090452, "learning_rate": 4.1732599826925454e-05, "loss": 0.2253, "step": 2776 }, { "epoch": 1.023495807610799, "grad_norm": 5.351985897940858, "learning_rate": 4.1729509210038325e-05, "loss": 0.2499, "step": 2777 }, { "epoch": 1.0238643692988114, "grad_norm": 6.324534511399924, "learning_rate": 4.1726418593151196e-05, "loss": 0.2336, "step": 2778 }, { "epoch": 1.0242329309868239, "grad_norm": 2.6642554428619145, "learning_rate": 4.172332797626407e-05, "loss": 0.2122, "step": 2779 }, { "epoch": 1.0246014926748364, "grad_norm": 5.27969716735298, "learning_rate": 4.172023735937693e-05, "loss": 0.1342, "step": 2780 }, { "epoch": 1.024970054362849, "grad_norm": 3.931162259770828, "learning_rate": 4.17171467424898e-05, "loss": 0.2599, "step": 2781 }, { "epoch": 1.0253386160508615, "grad_norm": 3.8000836413363777, "learning_rate": 4.1714056125602674e-05, "loss": 0.1227, "step": 2782 }, { "epoch": 1.025707177738874, "grad_norm": 5.960773990733378, "learning_rate": 4.1710965508715545e-05, "loss": 0.2022, "step": 2783 }, { "epoch": 1.0260757394268867, "grad_norm": 11.780215109138632, "learning_rate": 4.1707874891828417e-05, "loss": 0.2969, "step": 2784 }, { "epoch": 1.0264443011148991, "grad_norm": 6.17805546213474, "learning_rate": 4.1704784274941274e-05, "loss": 0.2526, "step": 2785 }, { "epoch": 1.0268128628029116, "grad_norm": 5.872766597797586, "learning_rate": 4.1701693658054145e-05, "loss": 0.1834, "step": 2786 }, { "epoch": 1.027181424490924, "grad_norm": 4.379797784326555, "learning_rate": 4.169860304116702e-05, "loss": 0.1387, "step": 2787 }, { "epoch": 1.0275499861789368, "grad_norm": 8.267618036387837, "learning_rate": 4.169551242427989e-05, "loss": 0.1641, "step": 2788 }, { "epoch": 1.0279185478669493, "grad_norm": 4.716040028888341, "learning_rate": 4.169242180739276e-05, "loss": 0.2575, "step": 2789 }, { "epoch": 1.0282871095549617, "grad_norm": 4.772350530906521, "learning_rate": 4.1689331190505623e-05, "loss": 0.1933, "step": 2790 }, { "epoch": 1.0286556712429742, "grad_norm": 6.680218591825444, "learning_rate": 4.1686240573618495e-05, "loss": 0.203, "step": 2791 }, { "epoch": 1.0290242329309869, "grad_norm": 4.707953423670749, "learning_rate": 4.1683149956731366e-05, "loss": 0.193, "step": 2792 }, { "epoch": 1.0293927946189994, "grad_norm": 5.105329025679855, "learning_rate": 4.168005933984424e-05, "loss": 0.2598, "step": 2793 }, { "epoch": 1.0297613563070118, "grad_norm": 6.607699278743001, "learning_rate": 4.167696872295711e-05, "loss": 0.2189, "step": 2794 }, { "epoch": 1.0301299179950245, "grad_norm": 10.450085755502837, "learning_rate": 4.167387810606997e-05, "loss": 0.2444, "step": 2795 }, { "epoch": 1.030498479683037, "grad_norm": 7.058727235698251, "learning_rate": 4.1670787489182844e-05, "loss": 0.1706, "step": 2796 }, { "epoch": 1.0308670413710495, "grad_norm": 3.1931050703883126, "learning_rate": 4.1667696872295715e-05, "loss": 0.1756, "step": 2797 }, { "epoch": 1.031235603059062, "grad_norm": 5.156553687197979, "learning_rate": 4.1664606255408586e-05, "loss": 0.1953, "step": 2798 }, { "epoch": 1.0316041647470746, "grad_norm": 3.1274631710918106, "learning_rate": 4.166151563852145e-05, "loss": 0.1609, "step": 2799 }, { "epoch": 1.031972726435087, "grad_norm": 4.597699187020727, "learning_rate": 4.1658425021634315e-05, "loss": 0.1566, "step": 2800 }, { "epoch": 1.0323412881230996, "grad_norm": 4.394281242888485, "learning_rate": 4.1655334404747186e-05, "loss": 0.2536, "step": 2801 }, { "epoch": 1.0327098498111122, "grad_norm": 5.297633294034838, "learning_rate": 4.165224378786006e-05, "loss": 0.2255, "step": 2802 }, { "epoch": 1.0330784114991247, "grad_norm": 4.04656957703644, "learning_rate": 4.164915317097293e-05, "loss": 0.1992, "step": 2803 }, { "epoch": 1.0334469731871372, "grad_norm": 4.749074745410941, "learning_rate": 4.164606255408579e-05, "loss": 0.1739, "step": 2804 }, { "epoch": 1.0338155348751497, "grad_norm": 4.8495979693029385, "learning_rate": 4.1642971937198664e-05, "loss": 0.3016, "step": 2805 }, { "epoch": 1.0341840965631623, "grad_norm": 2.5076510653720585, "learning_rate": 4.1639881320311535e-05, "loss": 0.0952, "step": 2806 }, { "epoch": 1.0345526582511748, "grad_norm": 13.095706474878392, "learning_rate": 4.1636790703424407e-05, "loss": 0.3509, "step": 2807 }, { "epoch": 1.0349212199391873, "grad_norm": 4.254612607433379, "learning_rate": 4.163370008653728e-05, "loss": 0.1768, "step": 2808 }, { "epoch": 1.0352897816271998, "grad_norm": 7.298620370588187, "learning_rate": 4.163060946965014e-05, "loss": 0.2651, "step": 2809 }, { "epoch": 1.0356583433152124, "grad_norm": 4.720261489607693, "learning_rate": 4.162751885276301e-05, "loss": 0.1945, "step": 2810 }, { "epoch": 1.036026905003225, "grad_norm": 5.52792103047696, "learning_rate": 4.1624428235875885e-05, "loss": 0.2948, "step": 2811 }, { "epoch": 1.0363954666912374, "grad_norm": 4.23886107244957, "learning_rate": 4.1621337618988756e-05, "loss": 0.252, "step": 2812 }, { "epoch": 1.03676402837925, "grad_norm": 5.310669347957925, "learning_rate": 4.161824700210162e-05, "loss": 0.1208, "step": 2813 }, { "epoch": 1.0371325900672625, "grad_norm": 3.7215274728614065, "learning_rate": 4.161515638521449e-05, "loss": 0.1609, "step": 2814 }, { "epoch": 1.037501151755275, "grad_norm": 8.602401540027028, "learning_rate": 4.1612065768327356e-05, "loss": 0.2304, "step": 2815 }, { "epoch": 1.0378697134432875, "grad_norm": 5.141483501879361, "learning_rate": 4.160897515144023e-05, "loss": 0.161, "step": 2816 }, { "epoch": 1.0382382751313002, "grad_norm": 6.123316942309663, "learning_rate": 4.16058845345531e-05, "loss": 0.3137, "step": 2817 }, { "epoch": 1.0386068368193127, "grad_norm": 6.997032353896516, "learning_rate": 4.160279391766597e-05, "loss": 0.2106, "step": 2818 }, { "epoch": 1.0389753985073251, "grad_norm": 4.313971227048546, "learning_rate": 4.1599703300778834e-05, "loss": 0.1164, "step": 2819 }, { "epoch": 1.0393439601953376, "grad_norm": 4.425446809958678, "learning_rate": 4.1596612683891705e-05, "loss": 0.1628, "step": 2820 }, { "epoch": 1.0397125218833503, "grad_norm": 3.923382959324774, "learning_rate": 4.1593522067004576e-05, "loss": 0.2498, "step": 2821 }, { "epoch": 1.0400810835713628, "grad_norm": 5.8932102510421505, "learning_rate": 4.159043145011745e-05, "loss": 0.2318, "step": 2822 }, { "epoch": 1.0404496452593752, "grad_norm": 8.101664675856409, "learning_rate": 4.158734083323031e-05, "loss": 0.3654, "step": 2823 }, { "epoch": 1.040818206947388, "grad_norm": 3.254867723065458, "learning_rate": 4.158425021634318e-05, "loss": 0.195, "step": 2824 }, { "epoch": 1.0411867686354004, "grad_norm": 3.972675334693281, "learning_rate": 4.1581159599456054e-05, "loss": 0.2424, "step": 2825 }, { "epoch": 1.0415553303234129, "grad_norm": 6.4213189768414205, "learning_rate": 4.1578068982568925e-05, "loss": 0.2597, "step": 2826 }, { "epoch": 1.0419238920114253, "grad_norm": 10.286031521340377, "learning_rate": 4.1574978365681797e-05, "loss": 0.3224, "step": 2827 }, { "epoch": 1.042292453699438, "grad_norm": 12.594758024174185, "learning_rate": 4.157188774879466e-05, "loss": 0.3201, "step": 2828 }, { "epoch": 1.0426610153874505, "grad_norm": 3.8447726990342055, "learning_rate": 4.156879713190753e-05, "loss": 0.2254, "step": 2829 }, { "epoch": 1.043029577075463, "grad_norm": 6.747194060166557, "learning_rate": 4.15657065150204e-05, "loss": 0.3867, "step": 2830 }, { "epoch": 1.0433981387634756, "grad_norm": 7.474654541198135, "learning_rate": 4.156261589813327e-05, "loss": 0.2813, "step": 2831 }, { "epoch": 1.0437667004514881, "grad_norm": 6.2698236034835, "learning_rate": 4.155952528124614e-05, "loss": 0.1815, "step": 2832 }, { "epoch": 1.0441352621395006, "grad_norm": 3.202573849781713, "learning_rate": 4.1556434664359e-05, "loss": 0.1918, "step": 2833 }, { "epoch": 1.044503823827513, "grad_norm": 8.48270451801437, "learning_rate": 4.1553344047471875e-05, "loss": 0.1628, "step": 2834 }, { "epoch": 1.0448723855155257, "grad_norm": 5.425921686166248, "learning_rate": 4.1550253430584746e-05, "loss": 0.1835, "step": 2835 }, { "epoch": 1.0452409472035382, "grad_norm": 3.5373882855679746, "learning_rate": 4.154716281369762e-05, "loss": 0.1804, "step": 2836 }, { "epoch": 1.0456095088915507, "grad_norm": 5.993262004378334, "learning_rate": 4.154407219681049e-05, "loss": 0.2799, "step": 2837 }, { "epoch": 1.0459780705795632, "grad_norm": 10.94788360348738, "learning_rate": 4.154098157992335e-05, "loss": 0.2931, "step": 2838 }, { "epoch": 1.0463466322675758, "grad_norm": 7.581346468142683, "learning_rate": 4.1537890963036224e-05, "loss": 0.2685, "step": 2839 }, { "epoch": 1.0467151939555883, "grad_norm": 3.8592519393019495, "learning_rate": 4.1534800346149095e-05, "loss": 0.144, "step": 2840 }, { "epoch": 1.0470837556436008, "grad_norm": 5.194810794325696, "learning_rate": 4.1531709729261966e-05, "loss": 0.2252, "step": 2841 }, { "epoch": 1.0474523173316135, "grad_norm": 4.415865531574115, "learning_rate": 4.152861911237483e-05, "loss": 0.2387, "step": 2842 }, { "epoch": 1.047820879019626, "grad_norm": 3.183114119832641, "learning_rate": 4.15255284954877e-05, "loss": 0.1983, "step": 2843 }, { "epoch": 1.0481894407076384, "grad_norm": 8.478477606241066, "learning_rate": 4.152243787860057e-05, "loss": 0.1732, "step": 2844 }, { "epoch": 1.048558002395651, "grad_norm": 7.4133863874407755, "learning_rate": 4.1519347261713444e-05, "loss": 0.2531, "step": 2845 }, { "epoch": 1.0489265640836636, "grad_norm": 8.213046455492513, "learning_rate": 4.151625664482631e-05, "loss": 0.2698, "step": 2846 }, { "epoch": 1.049295125771676, "grad_norm": 11.779975478141921, "learning_rate": 4.151316602793918e-05, "loss": 0.3075, "step": 2847 }, { "epoch": 1.0496636874596885, "grad_norm": 4.470948432135233, "learning_rate": 4.1510075411052044e-05, "loss": 0.2821, "step": 2848 }, { "epoch": 1.050032249147701, "grad_norm": 8.632202237338314, "learning_rate": 4.1506984794164915e-05, "loss": 0.3448, "step": 2849 }, { "epoch": 1.0504008108357137, "grad_norm": 3.6653368590296216, "learning_rate": 4.1503894177277787e-05, "loss": 0.1678, "step": 2850 }, { "epoch": 1.0507693725237262, "grad_norm": 5.85993421810581, "learning_rate": 4.150080356039066e-05, "loss": 0.2737, "step": 2851 }, { "epoch": 1.0511379342117386, "grad_norm": 4.940590193238004, "learning_rate": 4.149771294350352e-05, "loss": 0.1736, "step": 2852 }, { "epoch": 1.0515064958997513, "grad_norm": 11.977514817609482, "learning_rate": 4.149462232661639e-05, "loss": 0.1975, "step": 2853 }, { "epoch": 1.0518750575877638, "grad_norm": 9.33656591112039, "learning_rate": 4.1491531709729265e-05, "loss": 0.3045, "step": 2854 }, { "epoch": 1.0522436192757763, "grad_norm": 4.897999234343937, "learning_rate": 4.1488441092842136e-05, "loss": 0.2116, "step": 2855 }, { "epoch": 1.0526121809637887, "grad_norm": 14.549249604790619, "learning_rate": 4.148535047595501e-05, "loss": 0.2171, "step": 2856 }, { "epoch": 1.0529807426518014, "grad_norm": 3.5704429819650363, "learning_rate": 4.148225985906787e-05, "loss": 0.2271, "step": 2857 }, { "epoch": 1.0533493043398139, "grad_norm": 8.143750899366797, "learning_rate": 4.147916924218074e-05, "loss": 0.1675, "step": 2858 }, { "epoch": 1.0537178660278264, "grad_norm": 7.41434213736013, "learning_rate": 4.1476078625293614e-05, "loss": 0.2432, "step": 2859 }, { "epoch": 1.054086427715839, "grad_norm": 6.76029070799052, "learning_rate": 4.1472988008406485e-05, "loss": 0.2907, "step": 2860 }, { "epoch": 1.0544549894038515, "grad_norm": 4.158116889371176, "learning_rate": 4.146989739151935e-05, "loss": 0.2295, "step": 2861 }, { "epoch": 1.054823551091864, "grad_norm": 7.256672386736084, "learning_rate": 4.1466806774632214e-05, "loss": 0.2969, "step": 2862 }, { "epoch": 1.0551921127798765, "grad_norm": 3.3995891154475757, "learning_rate": 4.1463716157745085e-05, "loss": 0.1579, "step": 2863 }, { "epoch": 1.0555606744678891, "grad_norm": 7.087147332891791, "learning_rate": 4.1460625540857956e-05, "loss": 0.2508, "step": 2864 }, { "epoch": 1.0559292361559016, "grad_norm": 5.300371034698302, "learning_rate": 4.145753492397083e-05, "loss": 0.1244, "step": 2865 }, { "epoch": 1.056297797843914, "grad_norm": 4.103484482717382, "learning_rate": 4.14544443070837e-05, "loss": 0.2096, "step": 2866 }, { "epoch": 1.0566663595319266, "grad_norm": 5.634890021103052, "learning_rate": 4.145135369019656e-05, "loss": 0.3506, "step": 2867 }, { "epoch": 1.0570349212199392, "grad_norm": 6.062774337381825, "learning_rate": 4.1448263073309434e-05, "loss": 0.1916, "step": 2868 }, { "epoch": 1.0574034829079517, "grad_norm": 4.90654855779498, "learning_rate": 4.1445172456422305e-05, "loss": 0.2619, "step": 2869 }, { "epoch": 1.0577720445959642, "grad_norm": 4.235558826458168, "learning_rate": 4.1442081839535176e-05, "loss": 0.2035, "step": 2870 }, { "epoch": 1.0581406062839769, "grad_norm": 3.6883187112519824, "learning_rate": 4.143899122264804e-05, "loss": 0.167, "step": 2871 }, { "epoch": 1.0585091679719894, "grad_norm": 5.594678407482801, "learning_rate": 4.143590060576091e-05, "loss": 0.2323, "step": 2872 }, { "epoch": 1.0588777296600018, "grad_norm": 5.616574907921113, "learning_rate": 4.143280998887378e-05, "loss": 0.2331, "step": 2873 }, { "epoch": 1.0592462913480143, "grad_norm": 4.116640116082157, "learning_rate": 4.1429719371986654e-05, "loss": 0.261, "step": 2874 }, { "epoch": 1.059614853036027, "grad_norm": 6.090728480172914, "learning_rate": 4.1426628755099526e-05, "loss": 0.19, "step": 2875 }, { "epoch": 1.0599834147240395, "grad_norm": 4.600046837609713, "learning_rate": 4.142353813821238e-05, "loss": 0.1803, "step": 2876 }, { "epoch": 1.060351976412052, "grad_norm": 11.1132520732262, "learning_rate": 4.1420447521325254e-05, "loss": 0.2459, "step": 2877 }, { "epoch": 1.0607205381000644, "grad_norm": 3.4517488110719077, "learning_rate": 4.1417356904438126e-05, "loss": 0.2072, "step": 2878 }, { "epoch": 1.061089099788077, "grad_norm": 14.060969426254175, "learning_rate": 4.1414266287551e-05, "loss": 0.1874, "step": 2879 }, { "epoch": 1.0614576614760896, "grad_norm": 6.223325953153141, "learning_rate": 4.141117567066387e-05, "loss": 0.2485, "step": 2880 }, { "epoch": 1.061826223164102, "grad_norm": 8.656467696664642, "learning_rate": 4.140808505377673e-05, "loss": 0.1996, "step": 2881 }, { "epoch": 1.0621947848521147, "grad_norm": 4.944323110818095, "learning_rate": 4.1404994436889604e-05, "loss": 0.2239, "step": 2882 }, { "epoch": 1.0625633465401272, "grad_norm": 2.8210958729611577, "learning_rate": 4.1401903820002475e-05, "loss": 0.1771, "step": 2883 }, { "epoch": 1.0629319082281397, "grad_norm": 5.8299675450149575, "learning_rate": 4.1398813203115346e-05, "loss": 0.2543, "step": 2884 }, { "epoch": 1.0633004699161521, "grad_norm": 4.480423199555946, "learning_rate": 4.139572258622821e-05, "loss": 0.1941, "step": 2885 }, { "epoch": 1.0636690316041648, "grad_norm": 7.259875707481908, "learning_rate": 4.139263196934108e-05, "loss": 0.2771, "step": 2886 }, { "epoch": 1.0640375932921773, "grad_norm": 5.750701861461265, "learning_rate": 4.138954135245395e-05, "loss": 0.1777, "step": 2887 }, { "epoch": 1.0644061549801898, "grad_norm": 8.531100652596157, "learning_rate": 4.1386450735566824e-05, "loss": 0.1656, "step": 2888 }, { "epoch": 1.0647747166682024, "grad_norm": 10.296869072456564, "learning_rate": 4.1383360118679695e-05, "loss": 0.2521, "step": 2889 }, { "epoch": 1.065143278356215, "grad_norm": 6.7858052928816335, "learning_rate": 4.138026950179256e-05, "loss": 0.3081, "step": 2890 }, { "epoch": 1.0655118400442274, "grad_norm": 10.80032562895241, "learning_rate": 4.1377178884905424e-05, "loss": 0.3046, "step": 2891 }, { "epoch": 1.0658804017322399, "grad_norm": 4.344477750342125, "learning_rate": 4.1374088268018295e-05, "loss": 0.1384, "step": 2892 }, { "epoch": 1.0662489634202525, "grad_norm": 3.8215135429927947, "learning_rate": 4.1370997651131166e-05, "loss": 0.1524, "step": 2893 }, { "epoch": 1.066617525108265, "grad_norm": 6.414653643967873, "learning_rate": 4.136790703424404e-05, "loss": 0.2591, "step": 2894 }, { "epoch": 1.0669860867962775, "grad_norm": 3.7175172958536056, "learning_rate": 4.13648164173569e-05, "loss": 0.1616, "step": 2895 }, { "epoch": 1.06735464848429, "grad_norm": 6.936143072269641, "learning_rate": 4.136172580046977e-05, "loss": 0.2533, "step": 2896 }, { "epoch": 1.0677232101723027, "grad_norm": 12.456838324587899, "learning_rate": 4.1358635183582644e-05, "loss": 0.3461, "step": 2897 }, { "epoch": 1.0680917718603151, "grad_norm": 6.152244542850931, "learning_rate": 4.1355544566695516e-05, "loss": 0.1328, "step": 2898 }, { "epoch": 1.0684603335483276, "grad_norm": 7.0885044146438805, "learning_rate": 4.135245394980839e-05, "loss": 0.2284, "step": 2899 }, { "epoch": 1.0688288952363403, "grad_norm": 5.035821011078058, "learning_rate": 4.134936333292125e-05, "loss": 0.3148, "step": 2900 }, { "epoch": 1.0691974569243528, "grad_norm": 5.663051499210902, "learning_rate": 4.134627271603412e-05, "loss": 0.1833, "step": 2901 }, { "epoch": 1.0695660186123652, "grad_norm": 5.453526348220691, "learning_rate": 4.1343182099146994e-05, "loss": 0.1556, "step": 2902 }, { "epoch": 1.0699345803003777, "grad_norm": 4.829142145462689, "learning_rate": 4.1340091482259865e-05, "loss": 0.1799, "step": 2903 }, { "epoch": 1.0703031419883904, "grad_norm": 22.365911758501205, "learning_rate": 4.133700086537273e-05, "loss": 0.4383, "step": 2904 }, { "epoch": 1.0706717036764029, "grad_norm": 4.342838768095568, "learning_rate": 4.13339102484856e-05, "loss": 0.196, "step": 2905 }, { "epoch": 1.0710402653644153, "grad_norm": 4.853700323447461, "learning_rate": 4.1330819631598465e-05, "loss": 0.2865, "step": 2906 }, { "epoch": 1.0714088270524278, "grad_norm": 3.168758119952691, "learning_rate": 4.1327729014711336e-05, "loss": 0.1488, "step": 2907 }, { "epoch": 1.0717773887404405, "grad_norm": 2.348666514830955, "learning_rate": 4.132463839782421e-05, "loss": 0.0907, "step": 2908 }, { "epoch": 1.072145950428453, "grad_norm": 5.44290068315413, "learning_rate": 4.132154778093708e-05, "loss": 0.1544, "step": 2909 }, { "epoch": 1.0725145121164654, "grad_norm": 4.444770981131315, "learning_rate": 4.131845716404994e-05, "loss": 0.2111, "step": 2910 }, { "epoch": 1.0728830738044781, "grad_norm": 7.175024733018946, "learning_rate": 4.1315366547162814e-05, "loss": 0.1686, "step": 2911 }, { "epoch": 1.0732516354924906, "grad_norm": 9.065833590138418, "learning_rate": 4.1312275930275685e-05, "loss": 0.2705, "step": 2912 }, { "epoch": 1.073620197180503, "grad_norm": 2.6578578794845, "learning_rate": 4.1309185313388556e-05, "loss": 0.1251, "step": 2913 }, { "epoch": 1.0739887588685155, "grad_norm": 6.211459232705351, "learning_rate": 4.130609469650142e-05, "loss": 0.2199, "step": 2914 }, { "epoch": 1.0743573205565282, "grad_norm": 5.636423337178345, "learning_rate": 4.130300407961429e-05, "loss": 0.1345, "step": 2915 }, { "epoch": 1.0747258822445407, "grad_norm": 4.82110381470753, "learning_rate": 4.129991346272716e-05, "loss": 0.1727, "step": 2916 }, { "epoch": 1.0750944439325532, "grad_norm": 3.8654586472485857, "learning_rate": 4.1296822845840034e-05, "loss": 0.2353, "step": 2917 }, { "epoch": 1.0754630056205658, "grad_norm": 5.611853601831455, "learning_rate": 4.1293732228952906e-05, "loss": 0.1255, "step": 2918 }, { "epoch": 1.0758315673085783, "grad_norm": 10.184584873374503, "learning_rate": 4.129064161206577e-05, "loss": 0.358, "step": 2919 }, { "epoch": 1.0762001289965908, "grad_norm": 4.219227855604597, "learning_rate": 4.128755099517864e-05, "loss": 0.165, "step": 2920 }, { "epoch": 1.0765686906846033, "grad_norm": 7.376593983265265, "learning_rate": 4.1284460378291506e-05, "loss": 0.2145, "step": 2921 }, { "epoch": 1.076937252372616, "grad_norm": 6.055767922755738, "learning_rate": 4.128136976140438e-05, "loss": 0.1471, "step": 2922 }, { "epoch": 1.0773058140606284, "grad_norm": 15.948028079025162, "learning_rate": 4.127827914451725e-05, "loss": 0.2644, "step": 2923 }, { "epoch": 1.077674375748641, "grad_norm": 8.474699175961756, "learning_rate": 4.127518852763011e-05, "loss": 0.2055, "step": 2924 }, { "epoch": 1.0780429374366534, "grad_norm": 8.160544715352618, "learning_rate": 4.1272097910742984e-05, "loss": 0.2349, "step": 2925 }, { "epoch": 1.078411499124666, "grad_norm": 6.401216641655569, "learning_rate": 4.1269007293855855e-05, "loss": 0.2001, "step": 2926 }, { "epoch": 1.0787800608126785, "grad_norm": 3.980698149325377, "learning_rate": 4.1265916676968726e-05, "loss": 0.1295, "step": 2927 }, { "epoch": 1.079148622500691, "grad_norm": 9.367239219389122, "learning_rate": 4.12628260600816e-05, "loss": 0.3027, "step": 2928 }, { "epoch": 1.0795171841887037, "grad_norm": 8.154251262971234, "learning_rate": 4.125973544319446e-05, "loss": 0.2293, "step": 2929 }, { "epoch": 1.0798857458767162, "grad_norm": 3.5853340123143895, "learning_rate": 4.125664482630733e-05, "loss": 0.1818, "step": 2930 }, { "epoch": 1.0802543075647286, "grad_norm": 7.808634541773768, "learning_rate": 4.1253554209420204e-05, "loss": 0.3128, "step": 2931 }, { "epoch": 1.080622869252741, "grad_norm": 8.408100364136862, "learning_rate": 4.1250463592533075e-05, "loss": 0.195, "step": 2932 }, { "epoch": 1.0809914309407538, "grad_norm": 4.083051139785959, "learning_rate": 4.124737297564594e-05, "loss": 0.1873, "step": 2933 }, { "epoch": 1.0813599926287663, "grad_norm": 4.76260630571562, "learning_rate": 4.124428235875881e-05, "loss": 0.2808, "step": 2934 }, { "epoch": 1.0817285543167787, "grad_norm": 10.257077634568361, "learning_rate": 4.124119174187168e-05, "loss": 0.2233, "step": 2935 }, { "epoch": 1.0820971160047912, "grad_norm": 10.673354754199092, "learning_rate": 4.1238101124984546e-05, "loss": 0.238, "step": 2936 }, { "epoch": 1.0824656776928039, "grad_norm": 8.309567500678952, "learning_rate": 4.123501050809742e-05, "loss": 0.1902, "step": 2937 }, { "epoch": 1.0828342393808164, "grad_norm": 7.93331342208458, "learning_rate": 4.123191989121029e-05, "loss": 0.2446, "step": 2938 }, { "epoch": 1.0832028010688288, "grad_norm": 10.03602595734993, "learning_rate": 4.122882927432315e-05, "loss": 0.1976, "step": 2939 }, { "epoch": 1.0835713627568415, "grad_norm": 4.040253749470128, "learning_rate": 4.1225738657436024e-05, "loss": 0.2366, "step": 2940 }, { "epoch": 1.083939924444854, "grad_norm": 7.578728280191255, "learning_rate": 4.1222648040548896e-05, "loss": 0.1742, "step": 2941 }, { "epoch": 1.0843084861328665, "grad_norm": 61.00115465415901, "learning_rate": 4.121955742366177e-05, "loss": 0.2306, "step": 2942 }, { "epoch": 1.084677047820879, "grad_norm": 5.434211174819147, "learning_rate": 4.121646680677463e-05, "loss": 0.2697, "step": 2943 }, { "epoch": 1.0850456095088916, "grad_norm": 8.797113500041771, "learning_rate": 4.12133761898875e-05, "loss": 0.2255, "step": 2944 }, { "epoch": 1.085414171196904, "grad_norm": 5.4987963313019375, "learning_rate": 4.1210285573000374e-05, "loss": 0.2673, "step": 2945 }, { "epoch": 1.0857827328849166, "grad_norm": 3.7415940485125785, "learning_rate": 4.1207194956113245e-05, "loss": 0.1925, "step": 2946 }, { "epoch": 1.0861512945729292, "grad_norm": 11.051284535861798, "learning_rate": 4.1204104339226116e-05, "loss": 0.1967, "step": 2947 }, { "epoch": 1.0865198562609417, "grad_norm": 6.778396779684119, "learning_rate": 4.120101372233898e-05, "loss": 0.1499, "step": 2948 }, { "epoch": 1.0868884179489542, "grad_norm": 5.026170903522483, "learning_rate": 4.119792310545185e-05, "loss": 0.2417, "step": 2949 }, { "epoch": 1.0872569796369667, "grad_norm": 6.547034068212264, "learning_rate": 4.119483248856472e-05, "loss": 0.201, "step": 2950 }, { "epoch": 1.0876255413249794, "grad_norm": 19.796286124824967, "learning_rate": 4.1191741871677594e-05, "loss": 0.212, "step": 2951 }, { "epoch": 1.0879941030129918, "grad_norm": 10.438448742902516, "learning_rate": 4.118865125479046e-05, "loss": 0.2481, "step": 2952 }, { "epoch": 1.0883626647010043, "grad_norm": 7.153146128958665, "learning_rate": 4.118556063790332e-05, "loss": 0.2876, "step": 2953 }, { "epoch": 1.0887312263890168, "grad_norm": 3.1332328685589528, "learning_rate": 4.1182470021016194e-05, "loss": 0.1314, "step": 2954 }, { "epoch": 1.0890997880770295, "grad_norm": 11.321700764599152, "learning_rate": 4.1179379404129065e-05, "loss": 0.2194, "step": 2955 }, { "epoch": 1.089468349765042, "grad_norm": 4.311823916306126, "learning_rate": 4.1176288787241936e-05, "loss": 0.1869, "step": 2956 }, { "epoch": 1.0898369114530544, "grad_norm": 8.345985827399442, "learning_rate": 4.117319817035481e-05, "loss": 0.3562, "step": 2957 }, { "epoch": 1.090205473141067, "grad_norm": 4.950231820274975, "learning_rate": 4.117010755346767e-05, "loss": 0.2631, "step": 2958 }, { "epoch": 1.0905740348290796, "grad_norm": 9.966069832817631, "learning_rate": 4.116701693658054e-05, "loss": 0.2595, "step": 2959 }, { "epoch": 1.090942596517092, "grad_norm": 5.268566905830364, "learning_rate": 4.1163926319693414e-05, "loss": 0.2638, "step": 2960 }, { "epoch": 1.0913111582051045, "grad_norm": 7.612612558419062, "learning_rate": 4.1160835702806286e-05, "loss": 0.2617, "step": 2961 }, { "epoch": 1.0916797198931172, "grad_norm": 5.028973176779585, "learning_rate": 4.115774508591915e-05, "loss": 0.2734, "step": 2962 }, { "epoch": 1.0920482815811297, "grad_norm": 3.736639509371763, "learning_rate": 4.115465446903202e-05, "loss": 0.1261, "step": 2963 }, { "epoch": 1.0924168432691421, "grad_norm": 6.2265330927787375, "learning_rate": 4.115156385214489e-05, "loss": 0.2236, "step": 2964 }, { "epoch": 1.0927854049571546, "grad_norm": 4.468716707972579, "learning_rate": 4.1148473235257764e-05, "loss": 0.1896, "step": 2965 }, { "epoch": 1.0931539666451673, "grad_norm": 3.469042516523156, "learning_rate": 4.1145382618370635e-05, "loss": 0.1668, "step": 2966 }, { "epoch": 1.0935225283331798, "grad_norm": 6.306175160412894, "learning_rate": 4.114229200148349e-05, "loss": 0.2542, "step": 2967 }, { "epoch": 1.0938910900211922, "grad_norm": 10.456197290008358, "learning_rate": 4.1139201384596364e-05, "loss": 0.2337, "step": 2968 }, { "epoch": 1.094259651709205, "grad_norm": 5.11512251094751, "learning_rate": 4.1136110767709235e-05, "loss": 0.2463, "step": 2969 }, { "epoch": 1.0946282133972174, "grad_norm": 7.336902558992879, "learning_rate": 4.1133020150822106e-05, "loss": 0.2865, "step": 2970 }, { "epoch": 1.0949967750852299, "grad_norm": 5.722001615714065, "learning_rate": 4.112992953393498e-05, "loss": 0.2672, "step": 2971 }, { "epoch": 1.0953653367732423, "grad_norm": 11.060403916224717, "learning_rate": 4.112683891704784e-05, "loss": 0.2484, "step": 2972 }, { "epoch": 1.095733898461255, "grad_norm": 6.394257126550312, "learning_rate": 4.112374830016071e-05, "loss": 0.3696, "step": 2973 }, { "epoch": 1.0961024601492675, "grad_norm": 4.696303175827462, "learning_rate": 4.1120657683273584e-05, "loss": 0.2055, "step": 2974 }, { "epoch": 1.09647102183728, "grad_norm": 14.820236310426194, "learning_rate": 4.1117567066386455e-05, "loss": 0.3131, "step": 2975 }, { "epoch": 1.0968395835252926, "grad_norm": 9.111370620546666, "learning_rate": 4.111447644949932e-05, "loss": 0.2587, "step": 2976 }, { "epoch": 1.0972081452133051, "grad_norm": 5.289609894817903, "learning_rate": 4.111138583261219e-05, "loss": 0.1737, "step": 2977 }, { "epoch": 1.0975767069013176, "grad_norm": 10.416476153856792, "learning_rate": 4.110829521572506e-05, "loss": 0.2677, "step": 2978 }, { "epoch": 1.09794526858933, "grad_norm": 8.493490082368783, "learning_rate": 4.110520459883793e-05, "loss": 0.2432, "step": 2979 }, { "epoch": 1.0983138302773428, "grad_norm": 7.064493893734312, "learning_rate": 4.1102113981950804e-05, "loss": 0.2195, "step": 2980 }, { "epoch": 1.0986823919653552, "grad_norm": 6.130362518195167, "learning_rate": 4.109902336506367e-05, "loss": 0.2322, "step": 2981 }, { "epoch": 1.0990509536533677, "grad_norm": 4.473077985711826, "learning_rate": 4.109593274817653e-05, "loss": 0.1998, "step": 2982 }, { "epoch": 1.0994195153413802, "grad_norm": 5.997568909542185, "learning_rate": 4.1092842131289404e-05, "loss": 0.2614, "step": 2983 }, { "epoch": 1.0997880770293929, "grad_norm": 5.158003445349404, "learning_rate": 4.1089751514402276e-05, "loss": 0.1886, "step": 2984 }, { "epoch": 1.1001566387174053, "grad_norm": 5.666560602130466, "learning_rate": 4.108666089751515e-05, "loss": 0.2498, "step": 2985 }, { "epoch": 1.1005252004054178, "grad_norm": 4.250636109263415, "learning_rate": 4.108357028062801e-05, "loss": 0.1778, "step": 2986 }, { "epoch": 1.1008937620934305, "grad_norm": 7.017635473690662, "learning_rate": 4.108047966374088e-05, "loss": 0.262, "step": 2987 }, { "epoch": 1.101262323781443, "grad_norm": 7.482517465863553, "learning_rate": 4.1077389046853754e-05, "loss": 0.3134, "step": 2988 }, { "epoch": 1.1016308854694554, "grad_norm": 13.713402889992045, "learning_rate": 4.1074298429966625e-05, "loss": 0.2859, "step": 2989 }, { "epoch": 1.101999447157468, "grad_norm": 3.6088790222507057, "learning_rate": 4.1071207813079496e-05, "loss": 0.2065, "step": 2990 }, { "epoch": 1.1023680088454806, "grad_norm": 6.031546629889813, "learning_rate": 4.106811719619236e-05, "loss": 0.2641, "step": 2991 }, { "epoch": 1.102736570533493, "grad_norm": 2.882349985128034, "learning_rate": 4.106502657930523e-05, "loss": 0.1918, "step": 2992 }, { "epoch": 1.1031051322215055, "grad_norm": 5.793755280493312, "learning_rate": 4.10619359624181e-05, "loss": 0.1543, "step": 2993 }, { "epoch": 1.103473693909518, "grad_norm": 9.46747522617319, "learning_rate": 4.1058845345530974e-05, "loss": 0.2241, "step": 2994 }, { "epoch": 1.1038422555975307, "grad_norm": 3.88088738902194, "learning_rate": 4.105575472864384e-05, "loss": 0.2252, "step": 2995 }, { "epoch": 1.1042108172855432, "grad_norm": 4.83814964170874, "learning_rate": 4.105266411175671e-05, "loss": 0.2475, "step": 2996 }, { "epoch": 1.1045793789735556, "grad_norm": 6.434072822934188, "learning_rate": 4.1049573494869574e-05, "loss": 0.3482, "step": 2997 }, { "epoch": 1.1049479406615683, "grad_norm": 5.6250262789642225, "learning_rate": 4.1046482877982445e-05, "loss": 0.1505, "step": 2998 }, { "epoch": 1.1053165023495808, "grad_norm": 2.8392917773515345, "learning_rate": 4.1043392261095316e-05, "loss": 0.152, "step": 2999 }, { "epoch": 1.1056850640375933, "grad_norm": 7.2287333099315045, "learning_rate": 4.104030164420819e-05, "loss": 0.2386, "step": 3000 }, { "epoch": 1.1060536257256057, "grad_norm": 4.13912349323336, "learning_rate": 4.103721102732105e-05, "loss": 0.1539, "step": 3001 }, { "epoch": 1.1064221874136184, "grad_norm": 9.89556250486033, "learning_rate": 4.103412041043392e-05, "loss": 0.2381, "step": 3002 }, { "epoch": 1.1067907491016309, "grad_norm": 12.951476283228837, "learning_rate": 4.1031029793546794e-05, "loss": 0.308, "step": 3003 }, { "epoch": 1.1071593107896434, "grad_norm": 4.405764938601907, "learning_rate": 4.1027939176659665e-05, "loss": 0.393, "step": 3004 }, { "epoch": 1.107527872477656, "grad_norm": 3.511290731007082, "learning_rate": 4.102484855977253e-05, "loss": 0.1125, "step": 3005 }, { "epoch": 1.1078964341656685, "grad_norm": 4.3426204832028334, "learning_rate": 4.10217579428854e-05, "loss": 0.2634, "step": 3006 }, { "epoch": 1.108264995853681, "grad_norm": 7.57201398658166, "learning_rate": 4.101866732599827e-05, "loss": 0.2225, "step": 3007 }, { "epoch": 1.1086335575416935, "grad_norm": 7.199174240231371, "learning_rate": 4.1015576709111143e-05, "loss": 0.17, "step": 3008 }, { "epoch": 1.1090021192297062, "grad_norm": 3.5892730690978056, "learning_rate": 4.1012486092224015e-05, "loss": 0.1978, "step": 3009 }, { "epoch": 1.1093706809177186, "grad_norm": 7.412613464825092, "learning_rate": 4.100939547533688e-05, "loss": 0.4261, "step": 3010 }, { "epoch": 1.109739242605731, "grad_norm": 23.618039029058384, "learning_rate": 4.100630485844975e-05, "loss": 0.2573, "step": 3011 }, { "epoch": 1.1101078042937438, "grad_norm": 3.2358293442493196, "learning_rate": 4.1003214241562615e-05, "loss": 0.1111, "step": 3012 }, { "epoch": 1.1104763659817563, "grad_norm": 7.023550471838923, "learning_rate": 4.1000123624675486e-05, "loss": 0.2791, "step": 3013 }, { "epoch": 1.1108449276697687, "grad_norm": 4.070421682346823, "learning_rate": 4.099703300778836e-05, "loss": 0.175, "step": 3014 }, { "epoch": 1.1112134893577812, "grad_norm": 2.8859095697543617, "learning_rate": 4.099394239090122e-05, "loss": 0.1273, "step": 3015 }, { "epoch": 1.1115820510457939, "grad_norm": 7.688088495322752, "learning_rate": 4.099085177401409e-05, "loss": 0.1902, "step": 3016 }, { "epoch": 1.1119506127338064, "grad_norm": 5.890033074951569, "learning_rate": 4.0987761157126964e-05, "loss": 0.2719, "step": 3017 }, { "epoch": 1.1123191744218188, "grad_norm": 11.804066387324632, "learning_rate": 4.0984670540239835e-05, "loss": 0.2186, "step": 3018 }, { "epoch": 1.1126877361098313, "grad_norm": 8.147692638085156, "learning_rate": 4.0981579923352706e-05, "loss": 0.2811, "step": 3019 }, { "epoch": 1.113056297797844, "grad_norm": 9.091311622724433, "learning_rate": 4.097848930646557e-05, "loss": 0.4109, "step": 3020 }, { "epoch": 1.1134248594858565, "grad_norm": 7.875883264810858, "learning_rate": 4.097539868957844e-05, "loss": 0.2519, "step": 3021 }, { "epoch": 1.113793421173869, "grad_norm": 13.390847203159305, "learning_rate": 4.097230807269131e-05, "loss": 0.3393, "step": 3022 }, { "epoch": 1.1141619828618814, "grad_norm": 10.336955348361695, "learning_rate": 4.0969217455804184e-05, "loss": 0.2714, "step": 3023 }, { "epoch": 1.114530544549894, "grad_norm": 6.250992352859489, "learning_rate": 4.096612683891705e-05, "loss": 0.225, "step": 3024 }, { "epoch": 1.1148991062379066, "grad_norm": 4.930794907373364, "learning_rate": 4.096303622202992e-05, "loss": 0.2716, "step": 3025 }, { "epoch": 1.115267667925919, "grad_norm": 8.407233311883834, "learning_rate": 4.095994560514279e-05, "loss": 0.3625, "step": 3026 }, { "epoch": 1.1156362296139317, "grad_norm": 5.5101564438448145, "learning_rate": 4.0956854988255655e-05, "loss": 0.2201, "step": 3027 }, { "epoch": 1.1160047913019442, "grad_norm": 8.014300915553337, "learning_rate": 4.095376437136853e-05, "loss": 0.2615, "step": 3028 }, { "epoch": 1.1163733529899567, "grad_norm": 11.974193161958045, "learning_rate": 4.09506737544814e-05, "loss": 0.1806, "step": 3029 }, { "epoch": 1.1167419146779691, "grad_norm": 4.449595555131729, "learning_rate": 4.094758313759426e-05, "loss": 0.1466, "step": 3030 }, { "epoch": 1.1171104763659818, "grad_norm": 6.3109358558737645, "learning_rate": 4.0944492520707133e-05, "loss": 0.2869, "step": 3031 }, { "epoch": 1.1174790380539943, "grad_norm": 10.09984864980775, "learning_rate": 4.0941401903820005e-05, "loss": 0.1594, "step": 3032 }, { "epoch": 1.1178475997420068, "grad_norm": 4.32839050580323, "learning_rate": 4.0938311286932876e-05, "loss": 0.1777, "step": 3033 }, { "epoch": 1.1182161614300195, "grad_norm": 13.570240536654325, "learning_rate": 4.093522067004574e-05, "loss": 0.1544, "step": 3034 }, { "epoch": 1.118584723118032, "grad_norm": 5.353575835920981, "learning_rate": 4.093213005315861e-05, "loss": 0.1779, "step": 3035 }, { "epoch": 1.1189532848060444, "grad_norm": 5.265034356652708, "learning_rate": 4.092903943627148e-05, "loss": 0.2024, "step": 3036 }, { "epoch": 1.1193218464940569, "grad_norm": 7.354685072597592, "learning_rate": 4.0925948819384354e-05, "loss": 0.3211, "step": 3037 }, { "epoch": 1.1196904081820696, "grad_norm": 3.3558754369195705, "learning_rate": 4.0922858202497225e-05, "loss": 0.1411, "step": 3038 }, { "epoch": 1.120058969870082, "grad_norm": 5.156807054426414, "learning_rate": 4.091976758561009e-05, "loss": 0.1577, "step": 3039 }, { "epoch": 1.1204275315580945, "grad_norm": 4.569167608334875, "learning_rate": 4.091667696872296e-05, "loss": 0.1513, "step": 3040 }, { "epoch": 1.1207960932461072, "grad_norm": 6.764880696645267, "learning_rate": 4.091358635183583e-05, "loss": 0.387, "step": 3041 }, { "epoch": 1.1211646549341197, "grad_norm": 5.745276791749755, "learning_rate": 4.0910495734948696e-05, "loss": 0.1161, "step": 3042 }, { "epoch": 1.1215332166221321, "grad_norm": 3.862766520863272, "learning_rate": 4.090740511806157e-05, "loss": 0.18, "step": 3043 }, { "epoch": 1.1219017783101446, "grad_norm": 9.10108719923108, "learning_rate": 4.090431450117443e-05, "loss": 0.2617, "step": 3044 }, { "epoch": 1.1222703399981573, "grad_norm": 10.460806585835986, "learning_rate": 4.09012238842873e-05, "loss": 0.4467, "step": 3045 }, { "epoch": 1.1226389016861698, "grad_norm": 7.209765106480585, "learning_rate": 4.0898133267400174e-05, "loss": 0.3861, "step": 3046 }, { "epoch": 1.1230074633741822, "grad_norm": 3.5059426130438918, "learning_rate": 4.0895042650513045e-05, "loss": 0.2245, "step": 3047 }, { "epoch": 1.1233760250621947, "grad_norm": 3.944386107202908, "learning_rate": 4.089195203362591e-05, "loss": 0.2496, "step": 3048 }, { "epoch": 1.1237445867502074, "grad_norm": 5.3804726352625005, "learning_rate": 4.088886141673878e-05, "loss": 0.2272, "step": 3049 }, { "epoch": 1.1241131484382199, "grad_norm": 4.71513339261534, "learning_rate": 4.088577079985165e-05, "loss": 0.1642, "step": 3050 }, { "epoch": 1.1244817101262323, "grad_norm": 7.769092791015574, "learning_rate": 4.0882680182964523e-05, "loss": 0.1651, "step": 3051 }, { "epoch": 1.1248502718142448, "grad_norm": 7.526231035184257, "learning_rate": 4.0879589566077395e-05, "loss": 0.1276, "step": 3052 }, { "epoch": 1.1252188335022575, "grad_norm": 5.705020286530677, "learning_rate": 4.087649894919026e-05, "loss": 0.1984, "step": 3053 }, { "epoch": 1.12558739519027, "grad_norm": 6.200222306726754, "learning_rate": 4.087340833230313e-05, "loss": 0.2296, "step": 3054 }, { "epoch": 1.1259559568782824, "grad_norm": 2.919769670991297, "learning_rate": 4.0870317715416e-05, "loss": 0.1526, "step": 3055 }, { "epoch": 1.1263245185662951, "grad_norm": 6.923886801144181, "learning_rate": 4.086722709852887e-05, "loss": 0.2098, "step": 3056 }, { "epoch": 1.1266930802543076, "grad_norm": 4.746693313466689, "learning_rate": 4.086413648164174e-05, "loss": 0.1879, "step": 3057 }, { "epoch": 1.12706164194232, "grad_norm": 22.473262922564643, "learning_rate": 4.08610458647546e-05, "loss": 0.3692, "step": 3058 }, { "epoch": 1.1274302036303325, "grad_norm": 4.366757503079172, "learning_rate": 4.085795524786747e-05, "loss": 0.1515, "step": 3059 }, { "epoch": 1.1277987653183452, "grad_norm": 7.954285182905884, "learning_rate": 4.0854864630980344e-05, "loss": 0.1747, "step": 3060 }, { "epoch": 1.1281673270063577, "grad_norm": 5.58857871615408, "learning_rate": 4.0851774014093215e-05, "loss": 0.2067, "step": 3061 }, { "epoch": 1.1285358886943702, "grad_norm": 7.128667590183184, "learning_rate": 4.0848683397206086e-05, "loss": 0.2554, "step": 3062 }, { "epoch": 1.1289044503823829, "grad_norm": 3.7589140167834763, "learning_rate": 4.084559278031895e-05, "loss": 0.1666, "step": 3063 }, { "epoch": 1.1292730120703953, "grad_norm": 11.427821666101837, "learning_rate": 4.084250216343182e-05, "loss": 0.1838, "step": 3064 }, { "epoch": 1.1296415737584078, "grad_norm": 4.608532637873016, "learning_rate": 4.083941154654469e-05, "loss": 0.2035, "step": 3065 }, { "epoch": 1.1300101354464203, "grad_norm": 4.736826850353461, "learning_rate": 4.0836320929657564e-05, "loss": 0.2121, "step": 3066 }, { "epoch": 1.130378697134433, "grad_norm": 11.291987753569245, "learning_rate": 4.083323031277043e-05, "loss": 0.2701, "step": 3067 }, { "epoch": 1.1307472588224454, "grad_norm": 7.481066392575138, "learning_rate": 4.08301396958833e-05, "loss": 0.2191, "step": 3068 }, { "epoch": 1.131115820510458, "grad_norm": 5.59716896976268, "learning_rate": 4.082704907899617e-05, "loss": 0.2819, "step": 3069 }, { "epoch": 1.1314843821984706, "grad_norm": 4.649207402431644, "learning_rate": 4.082395846210904e-05, "loss": 0.2352, "step": 3070 }, { "epoch": 1.131852943886483, "grad_norm": 4.495799117124572, "learning_rate": 4.0820867845221913e-05, "loss": 0.3019, "step": 3071 }, { "epoch": 1.1322215055744955, "grad_norm": 2.479625648731905, "learning_rate": 4.081777722833478e-05, "loss": 0.0951, "step": 3072 }, { "epoch": 1.132590067262508, "grad_norm": 2.8117188110521143, "learning_rate": 4.081468661144764e-05, "loss": 0.1924, "step": 3073 }, { "epoch": 1.1329586289505207, "grad_norm": 4.541759851648789, "learning_rate": 4.0811595994560513e-05, "loss": 0.1936, "step": 3074 }, { "epoch": 1.1333271906385332, "grad_norm": 27.412442733210817, "learning_rate": 4.0808505377673385e-05, "loss": 0.2228, "step": 3075 }, { "epoch": 1.1336957523265456, "grad_norm": 5.604856918917999, "learning_rate": 4.0805414760786256e-05, "loss": 0.2474, "step": 3076 }, { "epoch": 1.134064314014558, "grad_norm": 6.19723977204661, "learning_rate": 4.080232414389912e-05, "loss": 0.1566, "step": 3077 }, { "epoch": 1.1344328757025708, "grad_norm": 10.928048371550041, "learning_rate": 4.079923352701199e-05, "loss": 0.2853, "step": 3078 }, { "epoch": 1.1348014373905833, "grad_norm": 4.692991981302788, "learning_rate": 4.079614291012486e-05, "loss": 0.1411, "step": 3079 }, { "epoch": 1.1351699990785957, "grad_norm": 4.707224532468391, "learning_rate": 4.0793052293237734e-05, "loss": 0.1472, "step": 3080 }, { "epoch": 1.1355385607666082, "grad_norm": 4.098155672168894, "learning_rate": 4.0789961676350605e-05, "loss": 0.2141, "step": 3081 }, { "epoch": 1.1359071224546209, "grad_norm": 4.561055830138953, "learning_rate": 4.078687105946347e-05, "loss": 0.1987, "step": 3082 }, { "epoch": 1.1362756841426334, "grad_norm": 2.7158733364115406, "learning_rate": 4.078378044257634e-05, "loss": 0.1347, "step": 3083 }, { "epoch": 1.1366442458306458, "grad_norm": 13.337034728954114, "learning_rate": 4.078068982568921e-05, "loss": 0.2854, "step": 3084 }, { "epoch": 1.1370128075186585, "grad_norm": 8.684555762332492, "learning_rate": 4.077759920880208e-05, "loss": 0.2328, "step": 3085 }, { "epoch": 1.137381369206671, "grad_norm": 3.346737534151365, "learning_rate": 4.077450859191495e-05, "loss": 0.1184, "step": 3086 }, { "epoch": 1.1377499308946835, "grad_norm": 21.42021741512742, "learning_rate": 4.077141797502782e-05, "loss": 0.336, "step": 3087 }, { "epoch": 1.138118492582696, "grad_norm": 3.9994043860449064, "learning_rate": 4.076832735814068e-05, "loss": 0.1541, "step": 3088 }, { "epoch": 1.1384870542707086, "grad_norm": 16.084341910302083, "learning_rate": 4.0765236741253554e-05, "loss": 0.1892, "step": 3089 }, { "epoch": 1.138855615958721, "grad_norm": 3.3974619199571015, "learning_rate": 4.0762146124366425e-05, "loss": 0.2235, "step": 3090 }, { "epoch": 1.1392241776467336, "grad_norm": 6.113241313767318, "learning_rate": 4.0759055507479297e-05, "loss": 0.1959, "step": 3091 }, { "epoch": 1.1395927393347463, "grad_norm": 2.5513640551418995, "learning_rate": 4.075596489059216e-05, "loss": 0.1833, "step": 3092 }, { "epoch": 1.1399613010227587, "grad_norm": 5.221652931043521, "learning_rate": 4.075287427370503e-05, "loss": 0.1585, "step": 3093 }, { "epoch": 1.1403298627107712, "grad_norm": 6.465663302470827, "learning_rate": 4.07497836568179e-05, "loss": 0.2448, "step": 3094 }, { "epoch": 1.1406984243987837, "grad_norm": 5.055182737826337, "learning_rate": 4.0746693039930775e-05, "loss": 0.2279, "step": 3095 }, { "epoch": 1.1410669860867964, "grad_norm": 8.547870696543335, "learning_rate": 4.074360242304364e-05, "loss": 0.2512, "step": 3096 }, { "epoch": 1.1414355477748088, "grad_norm": 13.129547784459831, "learning_rate": 4.074051180615651e-05, "loss": 0.2787, "step": 3097 }, { "epoch": 1.1418041094628213, "grad_norm": 3.9361916442208678, "learning_rate": 4.073742118926938e-05, "loss": 0.2439, "step": 3098 }, { "epoch": 1.142172671150834, "grad_norm": 4.301614928281114, "learning_rate": 4.073433057238225e-05, "loss": 0.1887, "step": 3099 }, { "epoch": 1.1425412328388465, "grad_norm": 6.637123962210295, "learning_rate": 4.0731239955495124e-05, "loss": 0.1655, "step": 3100 }, { "epoch": 1.142909794526859, "grad_norm": 6.726504371028595, "learning_rate": 4.072814933860799e-05, "loss": 0.2677, "step": 3101 }, { "epoch": 1.1432783562148714, "grad_norm": 15.245960138250046, "learning_rate": 4.072505872172086e-05, "loss": 0.3148, "step": 3102 }, { "epoch": 1.143646917902884, "grad_norm": 4.2434878103980775, "learning_rate": 4.0721968104833724e-05, "loss": 0.1673, "step": 3103 }, { "epoch": 1.1440154795908966, "grad_norm": 4.834692183028685, "learning_rate": 4.0718877487946595e-05, "loss": 0.281, "step": 3104 }, { "epoch": 1.144384041278909, "grad_norm": 9.310258634029429, "learning_rate": 4.0715786871059466e-05, "loss": 0.2343, "step": 3105 }, { "epoch": 1.1447526029669215, "grad_norm": 6.132766470766756, "learning_rate": 4.071269625417233e-05, "loss": 0.3071, "step": 3106 }, { "epoch": 1.1451211646549342, "grad_norm": 4.3307253741417675, "learning_rate": 4.07096056372852e-05, "loss": 0.1984, "step": 3107 }, { "epoch": 1.1454897263429467, "grad_norm": 6.788045124342311, "learning_rate": 4.070651502039807e-05, "loss": 0.1383, "step": 3108 }, { "epoch": 1.1458582880309591, "grad_norm": 5.0399884459953475, "learning_rate": 4.0703424403510944e-05, "loss": 0.2328, "step": 3109 }, { "epoch": 1.1462268497189716, "grad_norm": 6.333589883259428, "learning_rate": 4.0700333786623815e-05, "loss": 0.2501, "step": 3110 }, { "epoch": 1.1465954114069843, "grad_norm": 13.488160769987553, "learning_rate": 4.069724316973668e-05, "loss": 0.3991, "step": 3111 }, { "epoch": 1.1469639730949968, "grad_norm": 11.359426044417638, "learning_rate": 4.069415255284955e-05, "loss": 0.3167, "step": 3112 }, { "epoch": 1.1473325347830092, "grad_norm": 1.9586135041587698, "learning_rate": 4.069106193596242e-05, "loss": 0.105, "step": 3113 }, { "epoch": 1.147701096471022, "grad_norm": 6.620761019017925, "learning_rate": 4.068797131907529e-05, "loss": 0.2163, "step": 3114 }, { "epoch": 1.1480696581590344, "grad_norm": 3.816989693999275, "learning_rate": 4.068488070218816e-05, "loss": 0.2087, "step": 3115 }, { "epoch": 1.1484382198470469, "grad_norm": 5.959680827254438, "learning_rate": 4.068179008530103e-05, "loss": 0.2513, "step": 3116 }, { "epoch": 1.1488067815350593, "grad_norm": 12.458164856110681, "learning_rate": 4.06786994684139e-05, "loss": 0.191, "step": 3117 }, { "epoch": 1.149175343223072, "grad_norm": 5.097813021415983, "learning_rate": 4.0675608851526765e-05, "loss": 0.222, "step": 3118 }, { "epoch": 1.1495439049110845, "grad_norm": 5.605315798490364, "learning_rate": 4.0672518234639636e-05, "loss": 0.2221, "step": 3119 }, { "epoch": 1.149912466599097, "grad_norm": 3.7048135119002934, "learning_rate": 4.06694276177525e-05, "loss": 0.138, "step": 3120 }, { "epoch": 1.1502810282871097, "grad_norm": 9.67137842336397, "learning_rate": 4.066633700086537e-05, "loss": 0.3497, "step": 3121 }, { "epoch": 1.1506495899751221, "grad_norm": 4.455790046827414, "learning_rate": 4.066324638397824e-05, "loss": 0.2133, "step": 3122 }, { "epoch": 1.1510181516631346, "grad_norm": 4.995782981196726, "learning_rate": 4.0660155767091114e-05, "loss": 0.2368, "step": 3123 }, { "epoch": 1.151386713351147, "grad_norm": 3.8942976320038585, "learning_rate": 4.0657065150203985e-05, "loss": 0.1788, "step": 3124 }, { "epoch": 1.1517552750391598, "grad_norm": 3.42920641468569, "learning_rate": 4.065397453331685e-05, "loss": 0.1658, "step": 3125 }, { "epoch": 1.1521238367271722, "grad_norm": 6.5850402293211046, "learning_rate": 4.065088391642972e-05, "loss": 0.1449, "step": 3126 }, { "epoch": 1.1524923984151847, "grad_norm": 12.300766984550318, "learning_rate": 4.064779329954259e-05, "loss": 0.3088, "step": 3127 }, { "epoch": 1.1528609601031974, "grad_norm": 8.520222561933627, "learning_rate": 4.064470268265546e-05, "loss": 0.3671, "step": 3128 }, { "epoch": 1.1532295217912099, "grad_norm": 8.84717981264126, "learning_rate": 4.0641612065768334e-05, "loss": 0.3753, "step": 3129 }, { "epoch": 1.1535980834792223, "grad_norm": 6.35185378041355, "learning_rate": 4.06385214488812e-05, "loss": 0.2555, "step": 3130 }, { "epoch": 1.1539666451672348, "grad_norm": 6.1671421108310405, "learning_rate": 4.063543083199407e-05, "loss": 0.1655, "step": 3131 }, { "epoch": 1.1543352068552475, "grad_norm": 4.946767873080029, "learning_rate": 4.063234021510694e-05, "loss": 0.1955, "step": 3132 }, { "epoch": 1.15470376854326, "grad_norm": 8.55816585249369, "learning_rate": 4.0629249598219805e-05, "loss": 0.2353, "step": 3133 }, { "epoch": 1.1550723302312724, "grad_norm": 16.54506932133124, "learning_rate": 4.0626158981332677e-05, "loss": 0.2232, "step": 3134 }, { "epoch": 1.155440891919285, "grad_norm": 5.766067942737379, "learning_rate": 4.062306836444554e-05, "loss": 0.2039, "step": 3135 }, { "epoch": 1.1558094536072976, "grad_norm": 5.606185984166285, "learning_rate": 4.061997774755841e-05, "loss": 0.2372, "step": 3136 }, { "epoch": 1.15617801529531, "grad_norm": 5.7604596744010275, "learning_rate": 4.061688713067128e-05, "loss": 0.203, "step": 3137 }, { "epoch": 1.1565465769833225, "grad_norm": 3.019957281656849, "learning_rate": 4.0613796513784155e-05, "loss": 0.0758, "step": 3138 }, { "epoch": 1.156915138671335, "grad_norm": 3.803390626556614, "learning_rate": 4.061070589689702e-05, "loss": 0.2049, "step": 3139 }, { "epoch": 1.1572837003593477, "grad_norm": 6.3808746197591075, "learning_rate": 4.060761528000989e-05, "loss": 0.2713, "step": 3140 }, { "epoch": 1.1576522620473602, "grad_norm": 2.866297612350029, "learning_rate": 4.060452466312276e-05, "loss": 0.1967, "step": 3141 }, { "epoch": 1.1580208237353726, "grad_norm": 10.914807653964722, "learning_rate": 4.060143404623563e-05, "loss": 0.2624, "step": 3142 }, { "epoch": 1.1583893854233853, "grad_norm": 8.956553390273097, "learning_rate": 4.0598343429348504e-05, "loss": 0.2867, "step": 3143 }, { "epoch": 1.1587579471113978, "grad_norm": 5.697160502115526, "learning_rate": 4.059525281246137e-05, "loss": 0.2786, "step": 3144 }, { "epoch": 1.1591265087994103, "grad_norm": 10.600146857629866, "learning_rate": 4.059216219557424e-05, "loss": 0.2726, "step": 3145 }, { "epoch": 1.1594950704874227, "grad_norm": 4.514012877236769, "learning_rate": 4.058907157868711e-05, "loss": 0.2323, "step": 3146 }, { "epoch": 1.1598636321754354, "grad_norm": 8.068194130706184, "learning_rate": 4.058598096179998e-05, "loss": 0.2892, "step": 3147 }, { "epoch": 1.160232193863448, "grad_norm": 4.530765902211736, "learning_rate": 4.0582890344912846e-05, "loss": 0.2733, "step": 3148 }, { "epoch": 1.1606007555514604, "grad_norm": 5.797018455519424, "learning_rate": 4.057979972802571e-05, "loss": 0.256, "step": 3149 }, { "epoch": 1.160969317239473, "grad_norm": 8.188664710215464, "learning_rate": 4.057670911113858e-05, "loss": 0.2612, "step": 3150 }, { "epoch": 1.1613378789274855, "grad_norm": 10.417729641402087, "learning_rate": 4.057361849425145e-05, "loss": 0.1831, "step": 3151 }, { "epoch": 1.161706440615498, "grad_norm": 3.0497135340504538, "learning_rate": 4.0570527877364324e-05, "loss": 0.0984, "step": 3152 }, { "epoch": 1.1620750023035105, "grad_norm": 6.647384255310064, "learning_rate": 4.0567437260477195e-05, "loss": 0.2423, "step": 3153 }, { "epoch": 1.1624435639915232, "grad_norm": 3.181793789955873, "learning_rate": 4.056434664359006e-05, "loss": 0.2314, "step": 3154 }, { "epoch": 1.1628121256795356, "grad_norm": 4.9103023049081145, "learning_rate": 4.056125602670293e-05, "loss": 0.296, "step": 3155 }, { "epoch": 1.163180687367548, "grad_norm": 3.2248168361015184, "learning_rate": 4.05581654098158e-05, "loss": 0.1132, "step": 3156 }, { "epoch": 1.1635492490555608, "grad_norm": 5.03370900215332, "learning_rate": 4.055507479292867e-05, "loss": 0.247, "step": 3157 }, { "epoch": 1.1639178107435733, "grad_norm": 3.4102832119478523, "learning_rate": 4.055198417604154e-05, "loss": 0.1824, "step": 3158 }, { "epoch": 1.1642863724315857, "grad_norm": 4.682196707040187, "learning_rate": 4.054889355915441e-05, "loss": 0.1789, "step": 3159 }, { "epoch": 1.1646549341195982, "grad_norm": 8.573586519259175, "learning_rate": 4.054580294226728e-05, "loss": 0.2399, "step": 3160 }, { "epoch": 1.1650234958076109, "grad_norm": 11.634835625156638, "learning_rate": 4.054271232538015e-05, "loss": 0.343, "step": 3161 }, { "epoch": 1.1653920574956234, "grad_norm": 15.380737722733192, "learning_rate": 4.053962170849302e-05, "loss": 0.4175, "step": 3162 }, { "epoch": 1.1657606191836358, "grad_norm": 8.403854085404477, "learning_rate": 4.053653109160589e-05, "loss": 0.1627, "step": 3163 }, { "epoch": 1.1661291808716483, "grad_norm": 11.690086690485007, "learning_rate": 4.053344047471875e-05, "loss": 0.2211, "step": 3164 }, { "epoch": 1.166497742559661, "grad_norm": 5.055241408502881, "learning_rate": 4.053034985783162e-05, "loss": 0.2422, "step": 3165 }, { "epoch": 1.1668663042476735, "grad_norm": 9.967689194635804, "learning_rate": 4.0527259240944494e-05, "loss": 0.3323, "step": 3166 }, { "epoch": 1.167234865935686, "grad_norm": 2.955138635110402, "learning_rate": 4.0524168624057365e-05, "loss": 0.2167, "step": 3167 }, { "epoch": 1.1676034276236984, "grad_norm": 5.3460410042334425, "learning_rate": 4.052107800717023e-05, "loss": 0.1796, "step": 3168 }, { "epoch": 1.167971989311711, "grad_norm": 3.106808624080369, "learning_rate": 4.05179873902831e-05, "loss": 0.2027, "step": 3169 }, { "epoch": 1.1683405509997236, "grad_norm": 3.4748558398745346, "learning_rate": 4.051489677339597e-05, "loss": 0.1214, "step": 3170 }, { "epoch": 1.168709112687736, "grad_norm": 3.944090882575432, "learning_rate": 4.051180615650884e-05, "loss": 0.2917, "step": 3171 }, { "epoch": 1.1690776743757487, "grad_norm": 3.9056281243267676, "learning_rate": 4.0508715539621714e-05, "loss": 0.1733, "step": 3172 }, { "epoch": 1.1694462360637612, "grad_norm": 2.759050907098125, "learning_rate": 4.050562492273458e-05, "loss": 0.1807, "step": 3173 }, { "epoch": 1.1698147977517737, "grad_norm": 4.28119653821547, "learning_rate": 4.050253430584745e-05, "loss": 0.1393, "step": 3174 }, { "epoch": 1.1701833594397861, "grad_norm": 4.965556527992651, "learning_rate": 4.049944368896032e-05, "loss": 0.2023, "step": 3175 }, { "epoch": 1.1705519211277988, "grad_norm": 6.57439427359641, "learning_rate": 4.049635307207319e-05, "loss": 0.2795, "step": 3176 }, { "epoch": 1.1709204828158113, "grad_norm": 5.418818511001802, "learning_rate": 4.0493262455186056e-05, "loss": 0.1918, "step": 3177 }, { "epoch": 1.1712890445038238, "grad_norm": 2.3810882833841007, "learning_rate": 4.049017183829892e-05, "loss": 0.0748, "step": 3178 }, { "epoch": 1.1716576061918365, "grad_norm": 3.96624484557482, "learning_rate": 4.048708122141179e-05, "loss": 0.1262, "step": 3179 }, { "epoch": 1.172026167879849, "grad_norm": 3.823220111885501, "learning_rate": 4.048399060452466e-05, "loss": 0.1506, "step": 3180 }, { "epoch": 1.1723947295678614, "grad_norm": 18.371614935861313, "learning_rate": 4.0480899987637534e-05, "loss": 0.2326, "step": 3181 }, { "epoch": 1.1727632912558739, "grad_norm": 4.888880019228047, "learning_rate": 4.0477809370750406e-05, "loss": 0.1875, "step": 3182 }, { "epoch": 1.1731318529438866, "grad_norm": 4.340313107495097, "learning_rate": 4.047471875386327e-05, "loss": 0.1907, "step": 3183 }, { "epoch": 1.173500414631899, "grad_norm": 7.30742345153201, "learning_rate": 4.047162813697614e-05, "loss": 0.236, "step": 3184 }, { "epoch": 1.1738689763199115, "grad_norm": 7.738458715153349, "learning_rate": 4.046853752008901e-05, "loss": 0.2542, "step": 3185 }, { "epoch": 1.1742375380079242, "grad_norm": 28.378256455482337, "learning_rate": 4.0465446903201884e-05, "loss": 0.4141, "step": 3186 }, { "epoch": 1.1746060996959367, "grad_norm": 2.7762473043937943, "learning_rate": 4.046235628631475e-05, "loss": 0.1624, "step": 3187 }, { "epoch": 1.1749746613839491, "grad_norm": 4.976844959237832, "learning_rate": 4.045926566942762e-05, "loss": 0.1647, "step": 3188 }, { "epoch": 1.1753432230719616, "grad_norm": 6.418927409572059, "learning_rate": 4.045617505254049e-05, "loss": 0.4796, "step": 3189 }, { "epoch": 1.1757117847599743, "grad_norm": 12.50583909030254, "learning_rate": 4.045308443565336e-05, "loss": 0.0767, "step": 3190 }, { "epoch": 1.1760803464479868, "grad_norm": 6.298925326284896, "learning_rate": 4.044999381876623e-05, "loss": 0.3094, "step": 3191 }, { "epoch": 1.1764489081359992, "grad_norm": 10.720715045140485, "learning_rate": 4.04469032018791e-05, "loss": 0.2691, "step": 3192 }, { "epoch": 1.1768174698240117, "grad_norm": 8.712745923549035, "learning_rate": 4.044381258499196e-05, "loss": 0.2561, "step": 3193 }, { "epoch": 1.1771860315120244, "grad_norm": 5.112147894115408, "learning_rate": 4.044072196810483e-05, "loss": 0.2622, "step": 3194 }, { "epoch": 1.1775545932000369, "grad_norm": 6.878567411622166, "learning_rate": 4.0437631351217704e-05, "loss": 0.2078, "step": 3195 }, { "epoch": 1.1779231548880493, "grad_norm": 4.447830425680083, "learning_rate": 4.0434540734330575e-05, "loss": 0.2664, "step": 3196 }, { "epoch": 1.1782917165760618, "grad_norm": 5.076767584502531, "learning_rate": 4.043145011744344e-05, "loss": 0.2363, "step": 3197 }, { "epoch": 1.1786602782640745, "grad_norm": 6.4827399341036855, "learning_rate": 4.042835950055631e-05, "loss": 0.2797, "step": 3198 }, { "epoch": 1.179028839952087, "grad_norm": 7.84195278293022, "learning_rate": 4.042526888366918e-05, "loss": 0.3299, "step": 3199 }, { "epoch": 1.1793974016400994, "grad_norm": 13.195624718775159, "learning_rate": 4.042217826678205e-05, "loss": 0.2321, "step": 3200 }, { "epoch": 1.1793974016400994, "eval_bleu": 0.15343699847257694, "eval_bleu_1gram": 0.5014416880253451, "eval_bleu_2gram": 0.306967135799664, "eval_bleu_3gram": 0.19359272856877072, "eval_bleu_4gram": 0.1325891528983081, "eval_rag_val_loss": 0.40663746912542564, "eval_rouge1": 0.4953397507965068, "eval_rouge2": 0.3013715842094907, "eval_rougeL": 0.4950749470534422, "step": 3200 } ], "logging_steps": 1, "max_steps": 16278, "num_input_tokens_seen": 0, "num_train_epochs": 6, "save_steps": 1600, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": true, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }