{ "best_metric": 0.8220645315783643, "best_model_checkpoint": "train/Large-20241115-Compress:16x-Lr:5e-5-Llama3-8B-instruct-GPT2-Large-RAG-no-ft_token-onlySquad-everymem/checkpoint-1600", "epoch": 1.7690961024601493, "eval_steps": 800, "global_step": 4800, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0003685616880125311, "grad_norm": 17.08695020146954, "learning_rate": 0.0, "loss": 4.3491, "step": 1 }, { "epoch": 0.0007371233760250622, "grad_norm": 14.003965088788526, "learning_rate": 7.525749891599529e-06, "loss": 4.077, "step": 2 }, { "epoch": 0.0011056850640375933, "grad_norm": 10.574541981114434, "learning_rate": 1.192803136799156e-05, "loss": 3.7797, "step": 3 }, { "epoch": 0.0014742467520501245, "grad_norm": 12.570540706443131, "learning_rate": 1.5051499783199057e-05, "loss": 4.1339, "step": 4 }, { "epoch": 0.0018428084400626554, "grad_norm": 12.877325644211624, "learning_rate": 1.7474250108400467e-05, "loss": 3.7705, "step": 5 }, { "epoch": 0.0022113701280751866, "grad_norm": 8.730810430953705, "learning_rate": 1.945378125959109e-05, "loss": 3.2196, "step": 6 }, { "epoch": 0.0025799318160877175, "grad_norm": 9.949026656346408, "learning_rate": 2.1127451000356418e-05, "loss": 3.1704, "step": 7 }, { "epoch": 0.002948493504100249, "grad_norm": 9.82407818612165, "learning_rate": 2.2577249674798584e-05, "loss": 2.9147, "step": 8 }, { "epoch": 0.00331705519211278, "grad_norm": 12.70496568094938, "learning_rate": 2.385606273598312e-05, "loss": 2.863, "step": 9 }, { "epoch": 0.003685616880125311, "grad_norm": 10.538594478002146, "learning_rate": 2.4999999999999998e-05, "loss": 2.5078, "step": 10 }, { "epoch": 0.004054178568137842, "grad_norm": 14.839728073548889, "learning_rate": 2.6034817128955623e-05, "loss": 2.723, "step": 11 }, { "epoch": 0.004422740256150373, "grad_norm": 14.175350145862799, "learning_rate": 2.6979531151190617e-05, "loss": 2.3541, "step": 12 }, { "epoch": 0.0047913019441629045, "grad_norm": 18.34496442423813, "learning_rate": 2.7848583807670913e-05, "loss": 2.3718, "step": 13 }, { "epoch": 0.005159863632175435, "grad_norm": 12.013128093133414, "learning_rate": 2.8653200891955945e-05, "loss": 2.154, "step": 14 }, { "epoch": 0.005528425320187966, "grad_norm": 10.032356557688026, "learning_rate": 2.940228147639203e-05, "loss": 1.7959, "step": 15 }, { "epoch": 0.005896987008200498, "grad_norm": 11.041216532810779, "learning_rate": 3.0102999566398115e-05, "loss": 1.8764, "step": 16 }, { "epoch": 0.006265548696213028, "grad_norm": 10.721213188415074, "learning_rate": 3.076122303445685e-05, "loss": 2.1062, "step": 17 }, { "epoch": 0.00663411038422556, "grad_norm": 9.859734046552676, "learning_rate": 3.1381812627582646e-05, "loss": 1.6777, "step": 18 }, { "epoch": 0.007002672072238091, "grad_norm": 8.153670213570093, "learning_rate": 3.1968840023820715e-05, "loss": 1.3923, "step": 19 }, { "epoch": 0.007371233760250622, "grad_norm": 12.193304156780705, "learning_rate": 3.2525749891599525e-05, "loss": 1.8566, "step": 20 }, { "epoch": 0.007739795448263153, "grad_norm": 9.370564950300114, "learning_rate": 3.305548236834798e-05, "loss": 1.6516, "step": 21 }, { "epoch": 0.008108357136275684, "grad_norm": 10.006029218802984, "learning_rate": 3.3560567020555153e-05, "loss": 1.4579, "step": 22 }, { "epoch": 0.008476918824288216, "grad_norm": 9.904147240846099, "learning_rate": 3.404319590043982e-05, "loss": 1.8116, "step": 23 }, { "epoch": 0.008845480512300746, "grad_norm": 9.390724930572702, "learning_rate": 3.450528104279015e-05, "loss": 1.4801, "step": 24 }, { "epoch": 0.009214042200313277, "grad_norm": 9.141449615581697, "learning_rate": 3.4948500216800935e-05, "loss": 1.6275, "step": 25 }, { "epoch": 0.009582603888325809, "grad_norm": 7.46651256038943, "learning_rate": 3.537433369927044e-05, "loss": 1.8063, "step": 26 }, { "epoch": 0.00995116557633834, "grad_norm": 7.96380940784144, "learning_rate": 3.578409410397468e-05, "loss": 1.8354, "step": 27 }, { "epoch": 0.01031972726435087, "grad_norm": 9.594905423348779, "learning_rate": 3.6178950783555475e-05, "loss": 1.6978, "step": 28 }, { "epoch": 0.010688288952363402, "grad_norm": 7.500168862031333, "learning_rate": 3.65599499474739e-05, "loss": 1.6192, "step": 29 }, { "epoch": 0.011056850640375933, "grad_norm": 6.711874207037387, "learning_rate": 3.6928031367991554e-05, "loss": 1.4321, "step": 30 }, { "epoch": 0.011425412328388463, "grad_norm": 6.59835090842351, "learning_rate": 3.728404234585681e-05, "loss": 1.0742, "step": 31 }, { "epoch": 0.011793974016400996, "grad_norm": 6.4326678167365765, "learning_rate": 3.762874945799765e-05, "loss": 1.6376, "step": 32 }, { "epoch": 0.012162535704413526, "grad_norm": 7.480975117489437, "learning_rate": 3.796284849694718e-05, "loss": 1.7764, "step": 33 }, { "epoch": 0.012531097392426057, "grad_norm": 7.296322918086891, "learning_rate": 3.8286972926056376e-05, "loss": 1.597, "step": 34 }, { "epoch": 0.012899659080438589, "grad_norm": 7.604731402046073, "learning_rate": 3.8601701108756885e-05, "loss": 1.42, "step": 35 }, { "epoch": 0.01326822076845112, "grad_norm": 9.144102403442803, "learning_rate": 3.890756251918218e-05, "loss": 1.4569, "step": 36 }, { "epoch": 0.01363678245646365, "grad_norm": 9.497510885105386, "learning_rate": 3.920504310167487e-05, "loss": 1.5261, "step": 37 }, { "epoch": 0.014005344144476182, "grad_norm": 7.4045006339841635, "learning_rate": 3.949458991542025e-05, "loss": 1.4517, "step": 38 }, { "epoch": 0.014373905832488713, "grad_norm": 7.4389205986387275, "learning_rate": 3.977661517566247e-05, "loss": 1.1842, "step": 39 }, { "epoch": 0.014742467520501243, "grad_norm": 8.386499964397068, "learning_rate": 4.005149978319905e-05, "loss": 1.0976, "step": 40 }, { "epoch": 0.015111029208513775, "grad_norm": 6.606676204506424, "learning_rate": 4.031959641799338e-05, "loss": 1.5894, "step": 41 }, { "epoch": 0.015479590896526306, "grad_norm": 9.10051252996805, "learning_rate": 4.058123225994751e-05, "loss": 1.2626, "step": 42 }, { "epoch": 0.015848152584538838, "grad_norm": 8.222229427758345, "learning_rate": 4.0836711389489654e-05, "loss": 1.1489, "step": 43 }, { "epoch": 0.016216714272551367, "grad_norm": 7.637782654471986, "learning_rate": 4.108631691215468e-05, "loss": 1.335, "step": 44 }, { "epoch": 0.0165852759605639, "grad_norm": 12.60952626342047, "learning_rate": 4.133031284438358e-05, "loss": 1.7177, "step": 45 }, { "epoch": 0.01695383764857643, "grad_norm": 9.558559827662528, "learning_rate": 4.156894579203935e-05, "loss": 1.2862, "step": 46 }, { "epoch": 0.01732239933658896, "grad_norm": 9.371911112228274, "learning_rate": 4.180244644839293e-05, "loss": 1.2267, "step": 47 }, { "epoch": 0.017690961024601493, "grad_norm": 9.396742365833159, "learning_rate": 4.203103093438968e-05, "loss": 1.0079, "step": 48 }, { "epoch": 0.018059522712614025, "grad_norm": 10.572497089247575, "learning_rate": 4.2254902000712836e-05, "loss": 1.4973, "step": 49 }, { "epoch": 0.018428084400626554, "grad_norm": 10.797814070277791, "learning_rate": 4.247425010840046e-05, "loss": 1.197, "step": 50 }, { "epoch": 0.018796646088639086, "grad_norm": 6.567832306295075, "learning_rate": 4.2689254402448405e-05, "loss": 1.176, "step": 51 }, { "epoch": 0.019165207776651618, "grad_norm": 7.41438304019284, "learning_rate": 4.290008359086998e-05, "loss": 1.2515, "step": 52 }, { "epoch": 0.019533769464664147, "grad_norm": 10.073525779530193, "learning_rate": 4.310689674001973e-05, "loss": 1.4137, "step": 53 }, { "epoch": 0.01990233115267668, "grad_norm": 10.302066217966908, "learning_rate": 4.330984399557421e-05, "loss": 1.4525, "step": 54 }, { "epoch": 0.02027089284068921, "grad_norm": 8.852289935907827, "learning_rate": 4.350906723735609e-05, "loss": 1.2853, "step": 55 }, { "epoch": 0.02063945452870174, "grad_norm": 7.842389600497803, "learning_rate": 4.370470067515501e-05, "loss": 1.0821, "step": 56 }, { "epoch": 0.021008016216714272, "grad_norm": 6.963131679733383, "learning_rate": 4.3896871391812285e-05, "loss": 1.1803, "step": 57 }, { "epoch": 0.021376577904726805, "grad_norm": 7.191714436124513, "learning_rate": 4.408569983907343e-05, "loss": 0.9637, "step": 58 }, { "epoch": 0.021745139592739333, "grad_norm": 8.768544192084292, "learning_rate": 4.42713002910536e-05, "loss": 1.3951, "step": 59 }, { "epoch": 0.022113701280751866, "grad_norm": 7.488125365161497, "learning_rate": 4.445378125959108e-05, "loss": 1.4901, "step": 60 }, { "epoch": 0.022482262968764398, "grad_norm": 8.517440406404976, "learning_rate": 4.463324587526917e-05, "loss": 1.4377, "step": 61 }, { "epoch": 0.022850824656776927, "grad_norm": 7.882466969322544, "learning_rate": 4.4809792237456346e-05, "loss": 1.2871, "step": 62 }, { "epoch": 0.02321938634478946, "grad_norm": 7.022014333703284, "learning_rate": 4.498351373633954e-05, "loss": 1.1053, "step": 63 }, { "epoch": 0.02358794803280199, "grad_norm": 6.94816251107251, "learning_rate": 4.515449934959717e-05, "loss": 1.3862, "step": 64 }, { "epoch": 0.02395650972081452, "grad_norm": 6.306484566627538, "learning_rate": 4.532283391607138e-05, "loss": 1.2331, "step": 65 }, { "epoch": 0.024325071408827052, "grad_norm": 7.905758808543756, "learning_rate": 4.548859838854671e-05, "loss": 1.3515, "step": 66 }, { "epoch": 0.024693633096839585, "grad_norm": 8.577300485477005, "learning_rate": 4.565187006752065e-05, "loss": 1.2955, "step": 67 }, { "epoch": 0.025062194784852113, "grad_norm": 7.659793703174734, "learning_rate": 4.581272281765591e-05, "loss": 1.2573, "step": 68 }, { "epoch": 0.025430756472864646, "grad_norm": 8.977748065475463, "learning_rate": 4.597122726843138e-05, "loss": 1.2622, "step": 69 }, { "epoch": 0.025799318160877178, "grad_norm": 7.545151160200399, "learning_rate": 4.612745100035642e-05, "loss": 1.3396, "step": 70 }, { "epoch": 0.026167879848889707, "grad_norm": 7.240808382375095, "learning_rate": 4.628145871797688e-05, "loss": 1.1469, "step": 71 }, { "epoch": 0.02653644153690224, "grad_norm": 7.1685245974024845, "learning_rate": 4.643331241078171e-05, "loss": 1.2568, "step": 72 }, { "epoch": 0.02690500322491477, "grad_norm": 8.102639284004887, "learning_rate": 4.658307150301139e-05, "loss": 1.1563, "step": 73 }, { "epoch": 0.0272735649129273, "grad_norm": 10.805219194482836, "learning_rate": 4.67307929932744e-05, "loss": 1.1324, "step": 74 }, { "epoch": 0.027642126600939832, "grad_norm": 8.318542120189758, "learning_rate": 4.687653158479249e-05, "loss": 1.2996, "step": 75 }, { "epoch": 0.028010688288952364, "grad_norm": 7.725025421397037, "learning_rate": 4.702033980701978e-05, "loss": 1.2137, "step": 76 }, { "epoch": 0.028379249976964893, "grad_norm": 7.891557997130244, "learning_rate": 4.716226812931204e-05, "loss": 1.1889, "step": 77 }, { "epoch": 0.028747811664977425, "grad_norm": 8.475846023867618, "learning_rate": 4.7302365067262006e-05, "loss": 1.3367, "step": 78 }, { "epoch": 0.029116373352989958, "grad_norm": 7.047132669208227, "learning_rate": 4.744067728226103e-05, "loss": 1.3136, "step": 79 }, { "epoch": 0.029484935041002486, "grad_norm": 8.691936430458567, "learning_rate": 4.757724967479858e-05, "loss": 1.2414, "step": 80 }, { "epoch": 0.02985349672901502, "grad_norm": 6.270306367583594, "learning_rate": 4.771212547196624e-05, "loss": 0.8592, "step": 81 }, { "epoch": 0.03022205841702755, "grad_norm": 7.1836638454981445, "learning_rate": 4.7845346309592914e-05, "loss": 1.0936, "step": 82 }, { "epoch": 0.03059062010504008, "grad_norm": 9.757392623822703, "learning_rate": 4.7976952309401844e-05, "loss": 1.3019, "step": 83 }, { "epoch": 0.030959181793052612, "grad_norm": 6.352036949915038, "learning_rate": 4.810698215154703e-05, "loss": 0.8554, "step": 84 }, { "epoch": 0.031327743481065144, "grad_norm": 7.076412574829086, "learning_rate": 4.823547314285732e-05, "loss": 0.8587, "step": 85 }, { "epoch": 0.031696305169077676, "grad_norm": 7.531433862998124, "learning_rate": 4.836246128108918e-05, "loss": 1.2212, "step": 86 }, { "epoch": 0.03206486685709021, "grad_norm": 7.814088949743403, "learning_rate": 4.8487981315465456e-05, "loss": 1.0986, "step": 87 }, { "epoch": 0.032433428545102734, "grad_norm": 9.085220374214755, "learning_rate": 4.8612066803754214e-05, "loss": 1.4418, "step": 88 }, { "epoch": 0.032801990233115266, "grad_norm": 6.926004043670452, "learning_rate": 4.873475016612281e-05, "loss": 0.9738, "step": 89 }, { "epoch": 0.0331705519211278, "grad_norm": 8.359619137059967, "learning_rate": 4.885606273598312e-05, "loss": 1.3825, "step": 90 }, { "epoch": 0.03353911360914033, "grad_norm": 9.508429551926296, "learning_rate": 4.897603480802733e-05, "loss": 1.3226, "step": 91 }, { "epoch": 0.03390767529715286, "grad_norm": 8.280806849876463, "learning_rate": 4.909469568363888e-05, "loss": 1.1939, "step": 92 }, { "epoch": 0.034276236985165395, "grad_norm": 7.25762407039775, "learning_rate": 4.9212073713848375e-05, "loss": 1.1127, "step": 93 }, { "epoch": 0.03464479867317792, "grad_norm": 8.638007486674445, "learning_rate": 4.932819633999246e-05, "loss": 1.1823, "step": 94 }, { "epoch": 0.03501336036119045, "grad_norm": 10.435299213033009, "learning_rate": 4.9443090132221186e-05, "loss": 1.2441, "step": 95 }, { "epoch": 0.035381922049202985, "grad_norm": 7.15082807747491, "learning_rate": 4.9556780825989205e-05, "loss": 1.2836, "step": 96 }, { "epoch": 0.03575048373721552, "grad_norm": 11.645289281558096, "learning_rate": 4.9669293356656114e-05, "loss": 1.2553, "step": 97 }, { "epoch": 0.03611904542522805, "grad_norm": 15.298792593194088, "learning_rate": 4.978065189231237e-05, "loss": 0.9935, "step": 98 }, { "epoch": 0.03648760711324058, "grad_norm": 12.55551211219256, "learning_rate": 4.989087986493874e-05, "loss": 1.2311, "step": 99 }, { "epoch": 0.03685616880125311, "grad_norm": 10.704352404531095, "learning_rate": 4.9999999999999996e-05, "loss": 1.2879, "step": 100 }, { "epoch": 0.03722473048926564, "grad_norm": 9.78960220541836, "learning_rate": 5e-05, "loss": 1.0964, "step": 101 }, { "epoch": 0.03759329217727817, "grad_norm": 11.00952083199788, "learning_rate": 4.9996909383112874e-05, "loss": 1.3336, "step": 102 }, { "epoch": 0.037961853865290704, "grad_norm": 6.792476800760323, "learning_rate": 4.9993818766225745e-05, "loss": 1.085, "step": 103 }, { "epoch": 0.038330415553303236, "grad_norm": 8.095454563056947, "learning_rate": 4.999072814933861e-05, "loss": 1.0396, "step": 104 }, { "epoch": 0.03869897724131577, "grad_norm": 11.03173827044397, "learning_rate": 4.998763753245148e-05, "loss": 1.2126, "step": 105 }, { "epoch": 0.039067538929328294, "grad_norm": 7.537062558564759, "learning_rate": 4.998454691556435e-05, "loss": 0.6971, "step": 106 }, { "epoch": 0.039436100617340826, "grad_norm": 7.9562981950928835, "learning_rate": 4.998145629867722e-05, "loss": 1.0253, "step": 107 }, { "epoch": 0.03980466230535336, "grad_norm": 11.273703124577713, "learning_rate": 4.997836568179009e-05, "loss": 1.234, "step": 108 }, { "epoch": 0.04017322399336589, "grad_norm": 7.3164643846962925, "learning_rate": 4.997527506490295e-05, "loss": 0.8721, "step": 109 }, { "epoch": 0.04054178568137842, "grad_norm": 8.33893346173819, "learning_rate": 4.997218444801582e-05, "loss": 1.0719, "step": 110 }, { "epoch": 0.040910347369390955, "grad_norm": 8.619923299992518, "learning_rate": 4.9969093831128694e-05, "loss": 0.9656, "step": 111 }, { "epoch": 0.04127890905740348, "grad_norm": 9.127339102487475, "learning_rate": 4.9966003214241565e-05, "loss": 1.3164, "step": 112 }, { "epoch": 0.04164747074541601, "grad_norm": 8.317940903200782, "learning_rate": 4.9962912597354436e-05, "loss": 1.0306, "step": 113 }, { "epoch": 0.042016032433428545, "grad_norm": 9.117014696171925, "learning_rate": 4.99598219804673e-05, "loss": 0.9979, "step": 114 }, { "epoch": 0.04238459412144108, "grad_norm": 7.764056348577509, "learning_rate": 4.995673136358017e-05, "loss": 0.8814, "step": 115 }, { "epoch": 0.04275315580945361, "grad_norm": 6.460045601887559, "learning_rate": 4.995364074669304e-05, "loss": 0.975, "step": 116 }, { "epoch": 0.04312171749746614, "grad_norm": 5.412764136232284, "learning_rate": 4.9950550129805914e-05, "loss": 0.7499, "step": 117 }, { "epoch": 0.04349027918547867, "grad_norm": 11.362631308758948, "learning_rate": 4.994745951291878e-05, "loss": 1.483, "step": 118 }, { "epoch": 0.0438588408734912, "grad_norm": 6.587044872797665, "learning_rate": 4.994436889603165e-05, "loss": 0.7419, "step": 119 }, { "epoch": 0.04422740256150373, "grad_norm": 8.435744717594867, "learning_rate": 4.994127827914452e-05, "loss": 1.0477, "step": 120 }, { "epoch": 0.044595964249516264, "grad_norm": 9.369788590593807, "learning_rate": 4.993818766225739e-05, "loss": 0.8313, "step": 121 }, { "epoch": 0.044964525937528796, "grad_norm": 7.760962823343022, "learning_rate": 4.9935097045370264e-05, "loss": 1.1757, "step": 122 }, { "epoch": 0.04533308762554133, "grad_norm": 9.922150279153676, "learning_rate": 4.993200642848313e-05, "loss": 1.0082, "step": 123 }, { "epoch": 0.04570164931355385, "grad_norm": 10.64638888607196, "learning_rate": 4.992891581159599e-05, "loss": 1.1332, "step": 124 }, { "epoch": 0.046070211001566386, "grad_norm": 10.456160807300403, "learning_rate": 4.9925825194708864e-05, "loss": 1.4375, "step": 125 }, { "epoch": 0.04643877268957892, "grad_norm": 7.698436865739775, "learning_rate": 4.9922734577821735e-05, "loss": 1.1124, "step": 126 }, { "epoch": 0.04680733437759145, "grad_norm": 7.53142803820615, "learning_rate": 4.9919643960934606e-05, "loss": 1.0969, "step": 127 }, { "epoch": 0.04717589606560398, "grad_norm": 8.823110266809174, "learning_rate": 4.991655334404747e-05, "loss": 1.0045, "step": 128 }, { "epoch": 0.047544457753616515, "grad_norm": 18.835715534258316, "learning_rate": 4.991346272716034e-05, "loss": 1.1462, "step": 129 }, { "epoch": 0.04791301944162904, "grad_norm": 7.818204460353203, "learning_rate": 4.991037211027321e-05, "loss": 1.0922, "step": 130 }, { "epoch": 0.04828158112964157, "grad_norm": 6.961341923226463, "learning_rate": 4.9907281493386084e-05, "loss": 0.9743, "step": 131 }, { "epoch": 0.048650142817654105, "grad_norm": 7.62314695761229, "learning_rate": 4.9904190876498955e-05, "loss": 0.7256, "step": 132 }, { "epoch": 0.04901870450566664, "grad_norm": 12.182296815990432, "learning_rate": 4.990110025961182e-05, "loss": 1.1536, "step": 133 }, { "epoch": 0.04938726619367917, "grad_norm": 7.750135605148684, "learning_rate": 4.989800964272469e-05, "loss": 1.1613, "step": 134 }, { "epoch": 0.0497558278816917, "grad_norm": 9.507524421734756, "learning_rate": 4.989491902583756e-05, "loss": 1.2696, "step": 135 }, { "epoch": 0.05012438956970423, "grad_norm": 7.789147716121109, "learning_rate": 4.989182840895043e-05, "loss": 0.832, "step": 136 }, { "epoch": 0.05049295125771676, "grad_norm": 6.751105889044968, "learning_rate": 4.98887377920633e-05, "loss": 1.008, "step": 137 }, { "epoch": 0.05086151294572929, "grad_norm": 9.41619529782166, "learning_rate": 4.988564717517617e-05, "loss": 1.0395, "step": 138 }, { "epoch": 0.05123007463374182, "grad_norm": 11.519700797751934, "learning_rate": 4.988255655828903e-05, "loss": 1.4756, "step": 139 }, { "epoch": 0.051598636321754356, "grad_norm": 7.295033518640332, "learning_rate": 4.9879465941401904e-05, "loss": 1.0429, "step": 140 }, { "epoch": 0.05196719800976689, "grad_norm": 10.19877253607883, "learning_rate": 4.9876375324514776e-05, "loss": 1.0992, "step": 141 }, { "epoch": 0.05233575969777941, "grad_norm": 14.702084057532987, "learning_rate": 4.987328470762764e-05, "loss": 1.1515, "step": 142 }, { "epoch": 0.052704321385791945, "grad_norm": 8.460250361640643, "learning_rate": 4.987019409074051e-05, "loss": 1.0327, "step": 143 }, { "epoch": 0.05307288307380448, "grad_norm": 14.729504845319182, "learning_rate": 4.986710347385338e-05, "loss": 1.0841, "step": 144 }, { "epoch": 0.05344144476181701, "grad_norm": 9.791541392289268, "learning_rate": 4.9864012856966254e-05, "loss": 0.8941, "step": 145 }, { "epoch": 0.05381000644982954, "grad_norm": 8.590496543237474, "learning_rate": 4.9860922240079125e-05, "loss": 1.1488, "step": 146 }, { "epoch": 0.054178568137842074, "grad_norm": 5.790544286784018, "learning_rate": 4.985783162319199e-05, "loss": 1.014, "step": 147 }, { "epoch": 0.0545471298258546, "grad_norm": 10.040475946217017, "learning_rate": 4.985474100630486e-05, "loss": 1.3445, "step": 148 }, { "epoch": 0.05491569151386713, "grad_norm": 10.478702926739333, "learning_rate": 4.985165038941773e-05, "loss": 1.2926, "step": 149 }, { "epoch": 0.055284253201879664, "grad_norm": 9.707358599233938, "learning_rate": 4.98485597725306e-05, "loss": 1.0117, "step": 150 }, { "epoch": 0.055652814889892196, "grad_norm": 7.632565356717265, "learning_rate": 4.9845469155643474e-05, "loss": 0.8071, "step": 151 }, { "epoch": 0.05602137657790473, "grad_norm": 8.518086656812594, "learning_rate": 4.984237853875634e-05, "loss": 1.1132, "step": 152 }, { "epoch": 0.05638993826591726, "grad_norm": 8.85226321834784, "learning_rate": 4.983928792186921e-05, "loss": 1.1272, "step": 153 }, { "epoch": 0.056758499953929786, "grad_norm": 7.061578471059406, "learning_rate": 4.9836197304982074e-05, "loss": 0.8596, "step": 154 }, { "epoch": 0.05712706164194232, "grad_norm": 8.404427369789241, "learning_rate": 4.9833106688094945e-05, "loss": 1.0816, "step": 155 }, { "epoch": 0.05749562332995485, "grad_norm": 7.951672014714254, "learning_rate": 4.9830016071207816e-05, "loss": 0.9151, "step": 156 }, { "epoch": 0.05786418501796738, "grad_norm": 11.638802159081585, "learning_rate": 4.982692545432068e-05, "loss": 0.9608, "step": 157 }, { "epoch": 0.058232746705979915, "grad_norm": 9.502547876106457, "learning_rate": 4.982383483743355e-05, "loss": 0.8213, "step": 158 }, { "epoch": 0.05860130839399245, "grad_norm": 8.782990524097514, "learning_rate": 4.982074422054642e-05, "loss": 0.9409, "step": 159 }, { "epoch": 0.05896987008200497, "grad_norm": 7.346496328235743, "learning_rate": 4.9817653603659294e-05, "loss": 1.0843, "step": 160 }, { "epoch": 0.059338431770017505, "grad_norm": 6.226593745020414, "learning_rate": 4.981456298677216e-05, "loss": 0.7355, "step": 161 }, { "epoch": 0.05970699345803004, "grad_norm": 17.740135353027892, "learning_rate": 4.981147236988503e-05, "loss": 0.9995, "step": 162 }, { "epoch": 0.06007555514604257, "grad_norm": 7.78611070416167, "learning_rate": 4.98083817529979e-05, "loss": 0.8862, "step": 163 }, { "epoch": 0.0604441168340551, "grad_norm": 8.093831536902279, "learning_rate": 4.980529113611077e-05, "loss": 0.8571, "step": 164 }, { "epoch": 0.060812678522067634, "grad_norm": 9.857318742339924, "learning_rate": 4.9802200519223643e-05, "loss": 0.9556, "step": 165 }, { "epoch": 0.06118124021008016, "grad_norm": 6.224101002439968, "learning_rate": 4.979910990233651e-05, "loss": 0.8589, "step": 166 }, { "epoch": 0.06154980189809269, "grad_norm": 5.812338549925092, "learning_rate": 4.979601928544938e-05, "loss": 0.892, "step": 167 }, { "epoch": 0.061918363586105224, "grad_norm": 6.287497682362427, "learning_rate": 4.979292866856225e-05, "loss": 0.7492, "step": 168 }, { "epoch": 0.062286925274117756, "grad_norm": 8.78212616917279, "learning_rate": 4.978983805167512e-05, "loss": 1.0509, "step": 169 }, { "epoch": 0.06265548696213029, "grad_norm": 8.390717291013614, "learning_rate": 4.9786747434787986e-05, "loss": 1.0064, "step": 170 }, { "epoch": 0.06302404865014281, "grad_norm": 7.778951895360456, "learning_rate": 4.978365681790085e-05, "loss": 0.8025, "step": 171 }, { "epoch": 0.06339261033815535, "grad_norm": 8.307202938805812, "learning_rate": 4.978056620101372e-05, "loss": 0.8596, "step": 172 }, { "epoch": 0.06376117202616788, "grad_norm": 6.577821121989456, "learning_rate": 4.977747558412659e-05, "loss": 0.8146, "step": 173 }, { "epoch": 0.06412973371418042, "grad_norm": 8.852237793498134, "learning_rate": 4.9774384967239464e-05, "loss": 0.9119, "step": 174 }, { "epoch": 0.06449829540219294, "grad_norm": 10.80638875965983, "learning_rate": 4.9771294350352335e-05, "loss": 0.983, "step": 175 }, { "epoch": 0.06486685709020547, "grad_norm": 11.205028930790128, "learning_rate": 4.97682037334652e-05, "loss": 0.8527, "step": 176 }, { "epoch": 0.06523541877821801, "grad_norm": 9.850606026377974, "learning_rate": 4.976511311657807e-05, "loss": 0.9078, "step": 177 }, { "epoch": 0.06560398046623053, "grad_norm": 11.116444930839654, "learning_rate": 4.976202249969094e-05, "loss": 1.0494, "step": 178 }, { "epoch": 0.06597254215424307, "grad_norm": 9.72915847636388, "learning_rate": 4.975893188280381e-05, "loss": 1.1274, "step": 179 }, { "epoch": 0.0663411038422556, "grad_norm": 10.053595921588943, "learning_rate": 4.975584126591668e-05, "loss": 1.013, "step": 180 }, { "epoch": 0.06670966553026812, "grad_norm": 7.876690395522352, "learning_rate": 4.975275064902955e-05, "loss": 1.0791, "step": 181 }, { "epoch": 0.06707822721828066, "grad_norm": 5.910853937875841, "learning_rate": 4.974966003214242e-05, "loss": 0.9965, "step": 182 }, { "epoch": 0.06744678890629319, "grad_norm": 7.245363265487587, "learning_rate": 4.974656941525529e-05, "loss": 1.049, "step": 183 }, { "epoch": 0.06781535059430573, "grad_norm": 7.013658005677598, "learning_rate": 4.974347879836816e-05, "loss": 0.7372, "step": 184 }, { "epoch": 0.06818391228231825, "grad_norm": 7.0220447555415175, "learning_rate": 4.974038818148103e-05, "loss": 0.8089, "step": 185 }, { "epoch": 0.06855247397033079, "grad_norm": 9.902664257633603, "learning_rate": 4.973729756459389e-05, "loss": 1.0578, "step": 186 }, { "epoch": 0.06892103565834332, "grad_norm": 6.711133605436531, "learning_rate": 4.973420694770676e-05, "loss": 1.071, "step": 187 }, { "epoch": 0.06928959734635584, "grad_norm": 7.235693661035385, "learning_rate": 4.9731116330819633e-05, "loss": 1.2935, "step": 188 }, { "epoch": 0.06965815903436838, "grad_norm": 5.504333522827044, "learning_rate": 4.9728025713932505e-05, "loss": 0.6557, "step": 189 }, { "epoch": 0.0700267207223809, "grad_norm": 9.219689686615297, "learning_rate": 4.972493509704537e-05, "loss": 1.1014, "step": 190 }, { "epoch": 0.07039528241039344, "grad_norm": 9.629807101622188, "learning_rate": 4.972184448015824e-05, "loss": 1.1227, "step": 191 }, { "epoch": 0.07076384409840597, "grad_norm": 7.552198471419365, "learning_rate": 4.971875386327111e-05, "loss": 0.7694, "step": 192 }, { "epoch": 0.0711324057864185, "grad_norm": 9.532811171452334, "learning_rate": 4.971566324638398e-05, "loss": 1.0605, "step": 193 }, { "epoch": 0.07150096747443103, "grad_norm": 9.346039252380189, "learning_rate": 4.9712572629496854e-05, "loss": 0.8139, "step": 194 }, { "epoch": 0.07186952916244356, "grad_norm": 10.014717715584975, "learning_rate": 4.970948201260972e-05, "loss": 0.881, "step": 195 }, { "epoch": 0.0722380908504561, "grad_norm": 10.572276269219193, "learning_rate": 4.970639139572259e-05, "loss": 1.2554, "step": 196 }, { "epoch": 0.07260665253846862, "grad_norm": 7.381589887192008, "learning_rate": 4.970330077883546e-05, "loss": 0.9746, "step": 197 }, { "epoch": 0.07297521422648116, "grad_norm": 7.5247445402158215, "learning_rate": 4.970021016194833e-05, "loss": 0.9611, "step": 198 }, { "epoch": 0.07334377591449369, "grad_norm": 10.28514438140219, "learning_rate": 4.9697119545061196e-05, "loss": 1.1633, "step": 199 }, { "epoch": 0.07371233760250621, "grad_norm": 5.452128059612868, "learning_rate": 4.969402892817406e-05, "loss": 0.9781, "step": 200 }, { "epoch": 0.07408089929051875, "grad_norm": 9.05496934216774, "learning_rate": 4.969093831128693e-05, "loss": 1.0513, "step": 201 }, { "epoch": 0.07444946097853128, "grad_norm": 8.879822186588518, "learning_rate": 4.96878476943998e-05, "loss": 1.1447, "step": 202 }, { "epoch": 0.07481802266654382, "grad_norm": 9.510610978870194, "learning_rate": 4.9684757077512674e-05, "loss": 1.0407, "step": 203 }, { "epoch": 0.07518658435455634, "grad_norm": 6.189247578138727, "learning_rate": 4.9681666460625545e-05, "loss": 1.0539, "step": 204 }, { "epoch": 0.07555514604256887, "grad_norm": 9.267381987189498, "learning_rate": 4.967857584373841e-05, "loss": 1.0193, "step": 205 }, { "epoch": 0.07592370773058141, "grad_norm": 6.384549038942692, "learning_rate": 4.967548522685128e-05, "loss": 1.0214, "step": 206 }, { "epoch": 0.07629226941859393, "grad_norm": 8.754485806060549, "learning_rate": 4.967239460996415e-05, "loss": 1.0121, "step": 207 }, { "epoch": 0.07666083110660647, "grad_norm": 12.028837521926475, "learning_rate": 4.9669303993077023e-05, "loss": 1.1574, "step": 208 }, { "epoch": 0.077029392794619, "grad_norm": 6.25706692753438, "learning_rate": 4.966621337618989e-05, "loss": 0.814, "step": 209 }, { "epoch": 0.07739795448263154, "grad_norm": 9.245700971272829, "learning_rate": 4.966312275930276e-05, "loss": 0.8856, "step": 210 }, { "epoch": 0.07776651617064406, "grad_norm": 5.413264844739898, "learning_rate": 4.966003214241563e-05, "loss": 1.0104, "step": 211 }, { "epoch": 0.07813507785865659, "grad_norm": 7.039511747768637, "learning_rate": 4.96569415255285e-05, "loss": 0.9073, "step": 212 }, { "epoch": 0.07850363954666913, "grad_norm": 7.564220390413416, "learning_rate": 4.965385090864137e-05, "loss": 1.1031, "step": 213 }, { "epoch": 0.07887220123468165, "grad_norm": 8.755753723500996, "learning_rate": 4.965076029175424e-05, "loss": 0.8735, "step": 214 }, { "epoch": 0.07924076292269419, "grad_norm": 7.180891896906581, "learning_rate": 4.96476696748671e-05, "loss": 0.8301, "step": 215 }, { "epoch": 0.07960932461070672, "grad_norm": 6.969018494775113, "learning_rate": 4.964457905797997e-05, "loss": 0.7859, "step": 216 }, { "epoch": 0.07997788629871924, "grad_norm": 8.701859058512655, "learning_rate": 4.9641488441092844e-05, "loss": 1.2138, "step": 217 }, { "epoch": 0.08034644798673178, "grad_norm": 8.436248460419922, "learning_rate": 4.9638397824205715e-05, "loss": 1.0954, "step": 218 }, { "epoch": 0.0807150096747443, "grad_norm": 7.3677702558983995, "learning_rate": 4.963530720731858e-05, "loss": 0.8739, "step": 219 }, { "epoch": 0.08108357136275685, "grad_norm": 7.372982460397429, "learning_rate": 4.963221659043145e-05, "loss": 0.8481, "step": 220 }, { "epoch": 0.08145213305076937, "grad_norm": 5.37333227499927, "learning_rate": 4.962912597354432e-05, "loss": 0.8185, "step": 221 }, { "epoch": 0.08182069473878191, "grad_norm": 9.613020935626276, "learning_rate": 4.962603535665719e-05, "loss": 0.9735, "step": 222 }, { "epoch": 0.08218925642679444, "grad_norm": 10.45408546129365, "learning_rate": 4.9622944739770064e-05, "loss": 1.4117, "step": 223 }, { "epoch": 0.08255781811480696, "grad_norm": 12.859865051795474, "learning_rate": 4.961985412288293e-05, "loss": 0.9153, "step": 224 }, { "epoch": 0.0829263798028195, "grad_norm": 8.385249911401576, "learning_rate": 4.96167635059958e-05, "loss": 1.0837, "step": 225 }, { "epoch": 0.08329494149083203, "grad_norm": 7.794127077526691, "learning_rate": 4.961367288910867e-05, "loss": 1.1145, "step": 226 }, { "epoch": 0.08366350317884456, "grad_norm": 7.891761984915821, "learning_rate": 4.961058227222154e-05, "loss": 1.1516, "step": 227 }, { "epoch": 0.08403206486685709, "grad_norm": 8.643659227933087, "learning_rate": 4.960749165533441e-05, "loss": 0.8465, "step": 228 }, { "epoch": 0.08440062655486961, "grad_norm": 9.312038743033547, "learning_rate": 4.960440103844728e-05, "loss": 0.9661, "step": 229 }, { "epoch": 0.08476918824288215, "grad_norm": 6.178090348481795, "learning_rate": 4.960131042156014e-05, "loss": 0.7608, "step": 230 }, { "epoch": 0.08513774993089468, "grad_norm": 7.333210106016626, "learning_rate": 4.9598219804673013e-05, "loss": 0.9467, "step": 231 }, { "epoch": 0.08550631161890722, "grad_norm": 8.71288865479971, "learning_rate": 4.9595129187785885e-05, "loss": 0.9975, "step": 232 }, { "epoch": 0.08587487330691974, "grad_norm": 11.266890667676812, "learning_rate": 4.959203857089875e-05, "loss": 1.2148, "step": 233 }, { "epoch": 0.08624343499493228, "grad_norm": 6.136488125241543, "learning_rate": 4.958894795401162e-05, "loss": 0.7328, "step": 234 }, { "epoch": 0.08661199668294481, "grad_norm": 9.760151591139927, "learning_rate": 4.958585733712449e-05, "loss": 0.986, "step": 235 }, { "epoch": 0.08698055837095733, "grad_norm": 6.658115752305133, "learning_rate": 4.958276672023736e-05, "loss": 0.8702, "step": 236 }, { "epoch": 0.08734912005896987, "grad_norm": 6.315009798478765, "learning_rate": 4.9579676103350234e-05, "loss": 0.8692, "step": 237 }, { "epoch": 0.0877176817469824, "grad_norm": 8.290290705573005, "learning_rate": 4.95765854864631e-05, "loss": 0.9061, "step": 238 }, { "epoch": 0.08808624343499494, "grad_norm": 7.792988330175903, "learning_rate": 4.957349486957597e-05, "loss": 1.0462, "step": 239 }, { "epoch": 0.08845480512300746, "grad_norm": 8.648423385264001, "learning_rate": 4.957040425268884e-05, "loss": 1.0299, "step": 240 }, { "epoch": 0.08882336681101999, "grad_norm": 14.411832906829547, "learning_rate": 4.956731363580171e-05, "loss": 1.2424, "step": 241 }, { "epoch": 0.08919192849903253, "grad_norm": 8.509561882530567, "learning_rate": 4.9564223018914576e-05, "loss": 0.8491, "step": 242 }, { "epoch": 0.08956049018704505, "grad_norm": 8.167168219079977, "learning_rate": 4.956113240202745e-05, "loss": 0.9716, "step": 243 }, { "epoch": 0.08992905187505759, "grad_norm": 7.6385425311856725, "learning_rate": 4.955804178514032e-05, "loss": 0.7333, "step": 244 }, { "epoch": 0.09029761356307012, "grad_norm": 9.34219194263149, "learning_rate": 4.955495116825318e-05, "loss": 0.7673, "step": 245 }, { "epoch": 0.09066617525108266, "grad_norm": 7.947517621649891, "learning_rate": 4.9551860551366054e-05, "loss": 0.913, "step": 246 }, { "epoch": 0.09103473693909518, "grad_norm": 5.984212768586758, "learning_rate": 4.9548769934478925e-05, "loss": 0.856, "step": 247 }, { "epoch": 0.0914032986271077, "grad_norm": 8.992257496605284, "learning_rate": 4.954567931759179e-05, "loss": 1.0889, "step": 248 }, { "epoch": 0.09177186031512025, "grad_norm": 11.774432812938645, "learning_rate": 4.954258870070466e-05, "loss": 0.9547, "step": 249 }, { "epoch": 0.09214042200313277, "grad_norm": 8.937992109075424, "learning_rate": 4.953949808381753e-05, "loss": 0.8219, "step": 250 }, { "epoch": 0.09250898369114531, "grad_norm": 9.204926241652228, "learning_rate": 4.95364074669304e-05, "loss": 0.8994, "step": 251 }, { "epoch": 0.09287754537915784, "grad_norm": 11.91342723716212, "learning_rate": 4.953331685004327e-05, "loss": 1.0524, "step": 252 }, { "epoch": 0.09324610706717036, "grad_norm": 15.316475488398718, "learning_rate": 4.953022623315614e-05, "loss": 0.9587, "step": 253 }, { "epoch": 0.0936146687551829, "grad_norm": 8.491407651635269, "learning_rate": 4.952713561626901e-05, "loss": 0.7794, "step": 254 }, { "epoch": 0.09398323044319543, "grad_norm": 6.650975963135563, "learning_rate": 4.952404499938188e-05, "loss": 0.6909, "step": 255 }, { "epoch": 0.09435179213120796, "grad_norm": 7.635411892655681, "learning_rate": 4.952095438249475e-05, "loss": 0.8173, "step": 256 }, { "epoch": 0.09472035381922049, "grad_norm": 6.028855555094949, "learning_rate": 4.951786376560762e-05, "loss": 0.7429, "step": 257 }, { "epoch": 0.09508891550723303, "grad_norm": 8.087930951435931, "learning_rate": 4.951477314872049e-05, "loss": 1.1185, "step": 258 }, { "epoch": 0.09545747719524555, "grad_norm": 7.476547193362075, "learning_rate": 4.951168253183336e-05, "loss": 0.9703, "step": 259 }, { "epoch": 0.09582603888325808, "grad_norm": 7.564506831964649, "learning_rate": 4.9508591914946224e-05, "loss": 0.7199, "step": 260 }, { "epoch": 0.09619460057127062, "grad_norm": 5.970425834867912, "learning_rate": 4.9505501298059095e-05, "loss": 0.9623, "step": 261 }, { "epoch": 0.09656316225928314, "grad_norm": 5.220703125, "learning_rate": 4.950241068117196e-05, "loss": 0.6649, "step": 262 }, { "epoch": 0.09693172394729568, "grad_norm": 7.068018015123432, "learning_rate": 4.949932006428483e-05, "loss": 0.831, "step": 263 }, { "epoch": 0.09730028563530821, "grad_norm": 8.636889238197432, "learning_rate": 4.94962294473977e-05, "loss": 0.9478, "step": 264 }, { "epoch": 0.09766884732332073, "grad_norm": 7.5677759351727945, "learning_rate": 4.949313883051057e-05, "loss": 0.912, "step": 265 }, { "epoch": 0.09803740901133327, "grad_norm": 8.79145930860755, "learning_rate": 4.9490048213623444e-05, "loss": 0.7538, "step": 266 }, { "epoch": 0.0984059706993458, "grad_norm": 6.350109201528576, "learning_rate": 4.948695759673631e-05, "loss": 0.9318, "step": 267 }, { "epoch": 0.09877453238735834, "grad_norm": 10.22504905185266, "learning_rate": 4.948386697984918e-05, "loss": 0.9271, "step": 268 }, { "epoch": 0.09914309407537086, "grad_norm": 8.547271328790771, "learning_rate": 4.948077636296205e-05, "loss": 0.7471, "step": 269 }, { "epoch": 0.0995116557633834, "grad_norm": 8.499339807340599, "learning_rate": 4.947768574607492e-05, "loss": 0.9419, "step": 270 }, { "epoch": 0.09988021745139593, "grad_norm": 9.61054362812661, "learning_rate": 4.9474595129187787e-05, "loss": 1.2451, "step": 271 }, { "epoch": 0.10024877913940845, "grad_norm": 7.098082441971729, "learning_rate": 4.947150451230066e-05, "loss": 0.7948, "step": 272 }, { "epoch": 0.10061734082742099, "grad_norm": 11.843419535915753, "learning_rate": 4.946841389541353e-05, "loss": 1.0712, "step": 273 }, { "epoch": 0.10098590251543352, "grad_norm": 7.25072686070472, "learning_rate": 4.94653232785264e-05, "loss": 0.8167, "step": 274 }, { "epoch": 0.10135446420344606, "grad_norm": 9.780647039871885, "learning_rate": 4.9462232661639265e-05, "loss": 0.899, "step": 275 }, { "epoch": 0.10172302589145858, "grad_norm": 7.802101967097102, "learning_rate": 4.9459142044752136e-05, "loss": 0.8318, "step": 276 }, { "epoch": 0.10209158757947111, "grad_norm": 13.58926998285751, "learning_rate": 4.9456051427865e-05, "loss": 1.1349, "step": 277 }, { "epoch": 0.10246014926748365, "grad_norm": 9.486263985718418, "learning_rate": 4.945296081097787e-05, "loss": 0.9922, "step": 278 }, { "epoch": 0.10282871095549617, "grad_norm": 10.511837780355432, "learning_rate": 4.944987019409074e-05, "loss": 0.8636, "step": 279 }, { "epoch": 0.10319727264350871, "grad_norm": 10.625029440446344, "learning_rate": 4.9446779577203614e-05, "loss": 1.0805, "step": 280 }, { "epoch": 0.10356583433152124, "grad_norm": 8.614048293510415, "learning_rate": 4.944368896031648e-05, "loss": 1.1087, "step": 281 }, { "epoch": 0.10393439601953378, "grad_norm": 9.061682512967463, "learning_rate": 4.944059834342935e-05, "loss": 0.8097, "step": 282 }, { "epoch": 0.1043029577075463, "grad_norm": 6.635870023258154, "learning_rate": 4.943750772654222e-05, "loss": 0.8532, "step": 283 }, { "epoch": 0.10467151939555883, "grad_norm": 9.137012256802386, "learning_rate": 4.943441710965509e-05, "loss": 0.6945, "step": 284 }, { "epoch": 0.10504008108357137, "grad_norm": 7.097694946922579, "learning_rate": 4.943132649276796e-05, "loss": 0.7593, "step": 285 }, { "epoch": 0.10540864277158389, "grad_norm": 10.821549878481406, "learning_rate": 4.942823587588083e-05, "loss": 0.8906, "step": 286 }, { "epoch": 0.10577720445959643, "grad_norm": 8.855770716758252, "learning_rate": 4.94251452589937e-05, "loss": 0.9838, "step": 287 }, { "epoch": 0.10614576614760896, "grad_norm": 10.29987538086399, "learning_rate": 4.942205464210657e-05, "loss": 1.2839, "step": 288 }, { "epoch": 0.10651432783562148, "grad_norm": 7.976710513364119, "learning_rate": 4.941896402521944e-05, "loss": 0.989, "step": 289 }, { "epoch": 0.10688288952363402, "grad_norm": 8.906289833381317, "learning_rate": 4.9415873408332305e-05, "loss": 1.0636, "step": 290 }, { "epoch": 0.10725145121164655, "grad_norm": 8.146146984517243, "learning_rate": 4.941278279144517e-05, "loss": 1.0106, "step": 291 }, { "epoch": 0.10762001289965908, "grad_norm": 9.988556990474306, "learning_rate": 4.940969217455804e-05, "loss": 1.1683, "step": 292 }, { "epoch": 0.10798857458767161, "grad_norm": 10.851904186035307, "learning_rate": 4.940660155767091e-05, "loss": 1.0055, "step": 293 }, { "epoch": 0.10835713627568415, "grad_norm": 9.079204029031468, "learning_rate": 4.940351094078378e-05, "loss": 0.9408, "step": 294 }, { "epoch": 0.10872569796369667, "grad_norm": 7.937820307959552, "learning_rate": 4.9400420323896655e-05, "loss": 0.9862, "step": 295 }, { "epoch": 0.1090942596517092, "grad_norm": 6.799505597261282, "learning_rate": 4.939732970700952e-05, "loss": 0.9306, "step": 296 }, { "epoch": 0.10946282133972174, "grad_norm": 6.72884359460119, "learning_rate": 4.939423909012239e-05, "loss": 0.7655, "step": 297 }, { "epoch": 0.10983138302773426, "grad_norm": 6.083150712847319, "learning_rate": 4.939114847323526e-05, "loss": 0.831, "step": 298 }, { "epoch": 0.1101999447157468, "grad_norm": 5.9528673972316435, "learning_rate": 4.938805785634813e-05, "loss": 0.7239, "step": 299 }, { "epoch": 0.11056850640375933, "grad_norm": 9.881236750613745, "learning_rate": 4.9384967239461e-05, "loss": 1.0067, "step": 300 }, { "epoch": 0.11093706809177185, "grad_norm": 8.42329759915009, "learning_rate": 4.938187662257387e-05, "loss": 1.0726, "step": 301 }, { "epoch": 0.11130562977978439, "grad_norm": 7.766750814394049, "learning_rate": 4.937878600568674e-05, "loss": 0.8179, "step": 302 }, { "epoch": 0.11167419146779692, "grad_norm": 5.728611848727308, "learning_rate": 4.937569538879961e-05, "loss": 0.7402, "step": 303 }, { "epoch": 0.11204275315580946, "grad_norm": 7.721204359757403, "learning_rate": 4.937260477191248e-05, "loss": 1.0016, "step": 304 }, { "epoch": 0.11241131484382198, "grad_norm": 7.605824858445253, "learning_rate": 4.9369514155025346e-05, "loss": 0.6924, "step": 305 }, { "epoch": 0.11277987653183452, "grad_norm": 5.471508442154536, "learning_rate": 4.936642353813821e-05, "loss": 0.7798, "step": 306 }, { "epoch": 0.11314843821984705, "grad_norm": 8.189224178098934, "learning_rate": 4.936333292125108e-05, "loss": 0.9552, "step": 307 }, { "epoch": 0.11351699990785957, "grad_norm": 13.302507115451606, "learning_rate": 4.936024230436395e-05, "loss": 0.8063, "step": 308 }, { "epoch": 0.11388556159587211, "grad_norm": 8.371255977196146, "learning_rate": 4.9357151687476824e-05, "loss": 0.8839, "step": 309 }, { "epoch": 0.11425412328388464, "grad_norm": 7.503970303279458, "learning_rate": 4.935406107058969e-05, "loss": 0.681, "step": 310 }, { "epoch": 0.11462268497189718, "grad_norm": 7.8956840990871795, "learning_rate": 4.935097045370256e-05, "loss": 0.791, "step": 311 }, { "epoch": 0.1149912466599097, "grad_norm": 7.13856580162301, "learning_rate": 4.934787983681543e-05, "loss": 0.9037, "step": 312 }, { "epoch": 0.11535980834792223, "grad_norm": 9.254895538614365, "learning_rate": 4.93447892199283e-05, "loss": 0.8312, "step": 313 }, { "epoch": 0.11572837003593477, "grad_norm": 10.24528804495137, "learning_rate": 4.9341698603041167e-05, "loss": 1.1635, "step": 314 }, { "epoch": 0.11609693172394729, "grad_norm": 9.674945399403828, "learning_rate": 4.933860798615404e-05, "loss": 1.1469, "step": 315 }, { "epoch": 0.11646549341195983, "grad_norm": 14.419766719375557, "learning_rate": 4.933551736926691e-05, "loss": 0.9187, "step": 316 }, { "epoch": 0.11683405509997236, "grad_norm": 11.081033300700367, "learning_rate": 4.933242675237978e-05, "loss": 0.8658, "step": 317 }, { "epoch": 0.1172026167879849, "grad_norm": 7.031228841113997, "learning_rate": 4.932933613549265e-05, "loss": 0.7624, "step": 318 }, { "epoch": 0.11757117847599742, "grad_norm": 9.954199237432215, "learning_rate": 4.9326245518605516e-05, "loss": 1.2493, "step": 319 }, { "epoch": 0.11793974016400995, "grad_norm": 9.378529609003381, "learning_rate": 4.932315490171839e-05, "loss": 1.0956, "step": 320 }, { "epoch": 0.11830830185202248, "grad_norm": 10.505141679624524, "learning_rate": 4.932006428483125e-05, "loss": 1.0313, "step": 321 }, { "epoch": 0.11867686354003501, "grad_norm": 5.718546545860055, "learning_rate": 4.931697366794412e-05, "loss": 0.8131, "step": 322 }, { "epoch": 0.11904542522804755, "grad_norm": 7.085108396229949, "learning_rate": 4.9313883051056994e-05, "loss": 0.9821, "step": 323 }, { "epoch": 0.11941398691606007, "grad_norm": 6.991795500097636, "learning_rate": 4.931079243416986e-05, "loss": 0.8371, "step": 324 }, { "epoch": 0.1197825486040726, "grad_norm": 8.315761757994432, "learning_rate": 4.930770181728273e-05, "loss": 0.9439, "step": 325 }, { "epoch": 0.12015111029208514, "grad_norm": 6.350900913518497, "learning_rate": 4.93046112003956e-05, "loss": 0.7873, "step": 326 }, { "epoch": 0.12051967198009766, "grad_norm": 6.369861401737384, "learning_rate": 4.930152058350847e-05, "loss": 0.7525, "step": 327 }, { "epoch": 0.1208882336681102, "grad_norm": 6.437291410455566, "learning_rate": 4.929842996662134e-05, "loss": 0.9745, "step": 328 }, { "epoch": 0.12125679535612273, "grad_norm": 9.906305836420913, "learning_rate": 4.929533934973421e-05, "loss": 0.6927, "step": 329 }, { "epoch": 0.12162535704413527, "grad_norm": 12.371804797808528, "learning_rate": 4.929224873284708e-05, "loss": 1.1086, "step": 330 }, { "epoch": 0.1219939187321478, "grad_norm": 7.189932502779929, "learning_rate": 4.928915811595995e-05, "loss": 0.8287, "step": 331 }, { "epoch": 0.12236248042016032, "grad_norm": 11.462895117619242, "learning_rate": 4.928606749907282e-05, "loss": 0.8889, "step": 332 }, { "epoch": 0.12273104210817286, "grad_norm": 9.350638105680192, "learning_rate": 4.9282976882185685e-05, "loss": 1.011, "step": 333 }, { "epoch": 0.12309960379618538, "grad_norm": 6.083721340120558, "learning_rate": 4.9279886265298556e-05, "loss": 0.8559, "step": 334 }, { "epoch": 0.12346816548419792, "grad_norm": 11.349103688882908, "learning_rate": 4.927679564841143e-05, "loss": 0.7302, "step": 335 }, { "epoch": 0.12383672717221045, "grad_norm": 7.0686068168729665, "learning_rate": 4.927370503152429e-05, "loss": 0.7054, "step": 336 }, { "epoch": 0.12420528886022297, "grad_norm": 12.571747218465697, "learning_rate": 4.927061441463716e-05, "loss": 0.7981, "step": 337 }, { "epoch": 0.12457385054823551, "grad_norm": 9.841633887132735, "learning_rate": 4.9267523797750034e-05, "loss": 1.0779, "step": 338 }, { "epoch": 0.12494241223624804, "grad_norm": 10.951462209957755, "learning_rate": 4.92644331808629e-05, "loss": 1.0281, "step": 339 }, { "epoch": 0.12531097392426058, "grad_norm": 6.0221897520071295, "learning_rate": 4.926134256397577e-05, "loss": 0.5808, "step": 340 }, { "epoch": 0.1256795356122731, "grad_norm": 9.780923953731154, "learning_rate": 4.925825194708864e-05, "loss": 0.7289, "step": 341 }, { "epoch": 0.12604809730028563, "grad_norm": 16.23460621526878, "learning_rate": 4.925516133020151e-05, "loss": 0.9341, "step": 342 }, { "epoch": 0.12641665898829815, "grad_norm": 14.2055248419464, "learning_rate": 4.925207071331438e-05, "loss": 0.9072, "step": 343 }, { "epoch": 0.1267852206763107, "grad_norm": 8.074650561445562, "learning_rate": 4.924898009642725e-05, "loss": 0.5714, "step": 344 }, { "epoch": 0.12715378236432323, "grad_norm": 22.78511573425594, "learning_rate": 4.924588947954012e-05, "loss": 1.0674, "step": 345 }, { "epoch": 0.12752234405233576, "grad_norm": 28.04165384897862, "learning_rate": 4.924279886265299e-05, "loss": 0.826, "step": 346 }, { "epoch": 0.12789090574034828, "grad_norm": 15.543648573817157, "learning_rate": 4.923970824576586e-05, "loss": 0.6961, "step": 347 }, { "epoch": 0.12825946742836083, "grad_norm": 7.7778869469867855, "learning_rate": 4.9236617628878726e-05, "loss": 0.7834, "step": 348 }, { "epoch": 0.12862802911637336, "grad_norm": 9.662402122169611, "learning_rate": 4.92335270119916e-05, "loss": 0.6144, "step": 349 }, { "epoch": 0.12899659080438589, "grad_norm": 156.60649955581346, "learning_rate": 4.923043639510447e-05, "loss": 0.7686, "step": 350 }, { "epoch": 0.1293651524923984, "grad_norm": 9.359408421528391, "learning_rate": 4.922734577821733e-05, "loss": 0.9739, "step": 351 }, { "epoch": 0.12973371418041094, "grad_norm": 13.463146628432444, "learning_rate": 4.9224255161330204e-05, "loss": 0.9783, "step": 352 }, { "epoch": 0.1301022758684235, "grad_norm": 8.545110488989769, "learning_rate": 4.922116454444307e-05, "loss": 0.7609, "step": 353 }, { "epoch": 0.13047083755643601, "grad_norm": 11.66079942216924, "learning_rate": 4.921807392755594e-05, "loss": 0.6109, "step": 354 }, { "epoch": 0.13083939924444854, "grad_norm": 11.436692172846366, "learning_rate": 4.921498331066881e-05, "loss": 0.9625, "step": 355 }, { "epoch": 0.13120796093246107, "grad_norm": 20.281626176210885, "learning_rate": 4.921189269378168e-05, "loss": 0.8646, "step": 356 }, { "epoch": 0.1315765226204736, "grad_norm": 9.663502361621608, "learning_rate": 4.920880207689455e-05, "loss": 0.9338, "step": 357 }, { "epoch": 0.13194508430848614, "grad_norm": 11.675077476489449, "learning_rate": 4.920571146000742e-05, "loss": 0.9891, "step": 358 }, { "epoch": 0.13231364599649867, "grad_norm": 8.573515329204097, "learning_rate": 4.920262084312029e-05, "loss": 0.8306, "step": 359 }, { "epoch": 0.1326822076845112, "grad_norm": 7.711406473200714, "learning_rate": 4.919953022623316e-05, "loss": 0.6102, "step": 360 }, { "epoch": 0.13305076937252372, "grad_norm": 10.573457530552181, "learning_rate": 4.919643960934603e-05, "loss": 0.7229, "step": 361 }, { "epoch": 0.13341933106053624, "grad_norm": 6.858933419371207, "learning_rate": 4.9193348992458896e-05, "loss": 0.9241, "step": 362 }, { "epoch": 0.1337878927485488, "grad_norm": 7.918080521534737, "learning_rate": 4.919025837557177e-05, "loss": 0.583, "step": 363 }, { "epoch": 0.13415645443656132, "grad_norm": 8.52395843676168, "learning_rate": 4.918716775868464e-05, "loss": 0.8247, "step": 364 }, { "epoch": 0.13452501612457385, "grad_norm": 8.894753407688926, "learning_rate": 4.918407714179751e-05, "loss": 1.0215, "step": 365 }, { "epoch": 0.13489357781258637, "grad_norm": 8.491404057698217, "learning_rate": 4.9180986524910374e-05, "loss": 0.8003, "step": 366 }, { "epoch": 0.1352621395005989, "grad_norm": 7.554501173212842, "learning_rate": 4.9177895908023245e-05, "loss": 0.682, "step": 367 }, { "epoch": 0.13563070118861145, "grad_norm": 8.13287816676179, "learning_rate": 4.917480529113611e-05, "loss": 0.7797, "step": 368 }, { "epoch": 0.13599926287662398, "grad_norm": 7.138839932614829, "learning_rate": 4.917171467424898e-05, "loss": 0.6723, "step": 369 }, { "epoch": 0.1363678245646365, "grad_norm": 11.248559138958052, "learning_rate": 4.916862405736185e-05, "loss": 1.1317, "step": 370 }, { "epoch": 0.13673638625264903, "grad_norm": 7.918082930386652, "learning_rate": 4.916553344047472e-05, "loss": 1.0417, "step": 371 }, { "epoch": 0.13710494794066158, "grad_norm": 6.428417131297941, "learning_rate": 4.916244282358759e-05, "loss": 0.9717, "step": 372 }, { "epoch": 0.1374735096286741, "grad_norm": 6.843747770404888, "learning_rate": 4.915935220670046e-05, "loss": 0.8022, "step": 373 }, { "epoch": 0.13784207131668663, "grad_norm": 9.412513244538532, "learning_rate": 4.915626158981333e-05, "loss": 1.247, "step": 374 }, { "epoch": 0.13821063300469916, "grad_norm": 5.391472702821877, "learning_rate": 4.91531709729262e-05, "loss": 0.8708, "step": 375 }, { "epoch": 0.13857919469271168, "grad_norm": 6.502411028390077, "learning_rate": 4.915008035603907e-05, "loss": 0.6196, "step": 376 }, { "epoch": 0.13894775638072424, "grad_norm": 6.868270076962441, "learning_rate": 4.9146989739151936e-05, "loss": 0.8575, "step": 377 }, { "epoch": 0.13931631806873676, "grad_norm": 9.826298627375186, "learning_rate": 4.914389912226481e-05, "loss": 0.9112, "step": 378 }, { "epoch": 0.1396848797567493, "grad_norm": 6.41084980394489, "learning_rate": 4.914080850537768e-05, "loss": 0.7988, "step": 379 }, { "epoch": 0.1400534414447618, "grad_norm": 5.822828407050905, "learning_rate": 4.913771788849055e-05, "loss": 0.7859, "step": 380 }, { "epoch": 0.14042200313277434, "grad_norm": 6.668453295041102, "learning_rate": 4.9134627271603414e-05, "loss": 0.845, "step": 381 }, { "epoch": 0.1407905648207869, "grad_norm": 9.235180505001564, "learning_rate": 4.913153665471628e-05, "loss": 0.8648, "step": 382 }, { "epoch": 0.14115912650879942, "grad_norm": 7.243517378066243, "learning_rate": 4.912844603782915e-05, "loss": 0.6481, "step": 383 }, { "epoch": 0.14152768819681194, "grad_norm": 7.34345338912304, "learning_rate": 4.912535542094202e-05, "loss": 0.9818, "step": 384 }, { "epoch": 0.14189624988482447, "grad_norm": 8.179792897183908, "learning_rate": 4.912226480405489e-05, "loss": 0.8137, "step": 385 }, { "epoch": 0.142264811572837, "grad_norm": 5.81845280167942, "learning_rate": 4.9119174187167764e-05, "loss": 0.6668, "step": 386 }, { "epoch": 0.14263337326084954, "grad_norm": 6.564896790738365, "learning_rate": 4.911608357028063e-05, "loss": 0.6972, "step": 387 }, { "epoch": 0.14300193494886207, "grad_norm": 12.875674331931245, "learning_rate": 4.91129929533935e-05, "loss": 1.1181, "step": 388 }, { "epoch": 0.1433704966368746, "grad_norm": 7.0430908869574695, "learning_rate": 4.910990233650637e-05, "loss": 0.963, "step": 389 }, { "epoch": 0.14373905832488712, "grad_norm": 7.6806935517750805, "learning_rate": 4.910681171961924e-05, "loss": 0.5984, "step": 390 }, { "epoch": 0.14410762001289965, "grad_norm": 6.666031076014909, "learning_rate": 4.9103721102732106e-05, "loss": 0.7549, "step": 391 }, { "epoch": 0.1444761817009122, "grad_norm": 6.963314382704748, "learning_rate": 4.910063048584498e-05, "loss": 0.8127, "step": 392 }, { "epoch": 0.14484474338892472, "grad_norm": 7.519538859574071, "learning_rate": 4.909753986895785e-05, "loss": 1.0234, "step": 393 }, { "epoch": 0.14521330507693725, "grad_norm": 6.621277249024335, "learning_rate": 4.909444925207072e-05, "loss": 0.9086, "step": 394 }, { "epoch": 0.14558186676494977, "grad_norm": 10.444417705456571, "learning_rate": 4.909135863518359e-05, "loss": 0.8472, "step": 395 }, { "epoch": 0.14595042845296233, "grad_norm": 7.275420249520224, "learning_rate": 4.908826801829645e-05, "loss": 0.8952, "step": 396 }, { "epoch": 0.14631899014097485, "grad_norm": 6.11033556477149, "learning_rate": 4.908517740140932e-05, "loss": 0.6989, "step": 397 }, { "epoch": 0.14668755182898738, "grad_norm": 7.0068772456317205, "learning_rate": 4.908208678452219e-05, "loss": 0.8299, "step": 398 }, { "epoch": 0.1470561135169999, "grad_norm": 8.9021976286549, "learning_rate": 4.907899616763506e-05, "loss": 1.0059, "step": 399 }, { "epoch": 0.14742467520501243, "grad_norm": 8.873592211326265, "learning_rate": 4.907590555074793e-05, "loss": 0.7812, "step": 400 }, { "epoch": 0.14779323689302498, "grad_norm": 9.660198108229167, "learning_rate": 4.90728149338608e-05, "loss": 0.9165, "step": 401 }, { "epoch": 0.1481617985810375, "grad_norm": 7.326527761034085, "learning_rate": 4.906972431697367e-05, "loss": 0.6955, "step": 402 }, { "epoch": 0.14853036026905003, "grad_norm": 7.409650705936439, "learning_rate": 4.906663370008654e-05, "loss": 1.1681, "step": 403 }, { "epoch": 0.14889892195706256, "grad_norm": 9.135937132557286, "learning_rate": 4.906354308319941e-05, "loss": 0.7792, "step": 404 }, { "epoch": 0.14926748364507508, "grad_norm": 6.5811273605063345, "learning_rate": 4.9060452466312276e-05, "loss": 0.6628, "step": 405 }, { "epoch": 0.14963604533308764, "grad_norm": 11.026804776724052, "learning_rate": 4.905736184942515e-05, "loss": 0.8182, "step": 406 }, { "epoch": 0.15000460702110016, "grad_norm": 10.40673826979499, "learning_rate": 4.905427123253802e-05, "loss": 0.9397, "step": 407 }, { "epoch": 0.1503731687091127, "grad_norm": 8.447356619813366, "learning_rate": 4.905118061565089e-05, "loss": 0.9863, "step": 408 }, { "epoch": 0.1507417303971252, "grad_norm": 6.49552293794, "learning_rate": 4.904808999876376e-05, "loss": 0.8331, "step": 409 }, { "epoch": 0.15111029208513774, "grad_norm": 6.8577962461893, "learning_rate": 4.9044999381876625e-05, "loss": 0.8046, "step": 410 }, { "epoch": 0.1514788537731503, "grad_norm": 7.8304276724952695, "learning_rate": 4.904190876498949e-05, "loss": 0.7159, "step": 411 }, { "epoch": 0.15184741546116282, "grad_norm": 6.391259793789405, "learning_rate": 4.903881814810236e-05, "loss": 0.659, "step": 412 }, { "epoch": 0.15221597714917534, "grad_norm": 13.304028031586313, "learning_rate": 4.903572753121523e-05, "loss": 0.8082, "step": 413 }, { "epoch": 0.15258453883718787, "grad_norm": 8.294237770162384, "learning_rate": 4.90326369143281e-05, "loss": 1.0462, "step": 414 }, { "epoch": 0.1529531005252004, "grad_norm": 9.985728952602496, "learning_rate": 4.902954629744097e-05, "loss": 1.3824, "step": 415 }, { "epoch": 0.15332166221321294, "grad_norm": 8.615846060876818, "learning_rate": 4.902645568055384e-05, "loss": 0.8938, "step": 416 }, { "epoch": 0.15369022390122547, "grad_norm": 8.289005433945883, "learning_rate": 4.902336506366671e-05, "loss": 1.0796, "step": 417 }, { "epoch": 0.154058785589238, "grad_norm": 6.670396842074819, "learning_rate": 4.902027444677958e-05, "loss": 0.824, "step": 418 }, { "epoch": 0.15442734727725052, "grad_norm": 5.949635201178271, "learning_rate": 4.901718382989245e-05, "loss": 0.6901, "step": 419 }, { "epoch": 0.15479590896526307, "grad_norm": 8.837064662716198, "learning_rate": 4.9014093213005316e-05, "loss": 1.0319, "step": 420 }, { "epoch": 0.1551644706532756, "grad_norm": 6.593179831779917, "learning_rate": 4.901100259611819e-05, "loss": 0.971, "step": 421 }, { "epoch": 0.15553303234128812, "grad_norm": 7.3285044752826805, "learning_rate": 4.900791197923106e-05, "loss": 0.8329, "step": 422 }, { "epoch": 0.15590159402930065, "grad_norm": 7.482131271275298, "learning_rate": 4.900482136234393e-05, "loss": 0.5455, "step": 423 }, { "epoch": 0.15627015571731318, "grad_norm": 6.402764986714541, "learning_rate": 4.9001730745456794e-05, "loss": 0.8118, "step": 424 }, { "epoch": 0.15663871740532573, "grad_norm": 5.473057559953616, "learning_rate": 4.8998640128569666e-05, "loss": 0.7824, "step": 425 }, { "epoch": 0.15700727909333825, "grad_norm": 8.472089833821626, "learning_rate": 4.899554951168254e-05, "loss": 0.9552, "step": 426 }, { "epoch": 0.15737584078135078, "grad_norm": 6.3566555797771604, "learning_rate": 4.89924588947954e-05, "loss": 0.6972, "step": 427 }, { "epoch": 0.1577444024693633, "grad_norm": 8.41964573275721, "learning_rate": 4.898936827790827e-05, "loss": 0.8904, "step": 428 }, { "epoch": 0.15811296415737583, "grad_norm": 8.321653904486457, "learning_rate": 4.8986277661021144e-05, "loss": 0.6899, "step": 429 }, { "epoch": 0.15848152584538838, "grad_norm": 5.503692427868419, "learning_rate": 4.898318704413401e-05, "loss": 0.7257, "step": 430 }, { "epoch": 0.1588500875334009, "grad_norm": 7.168876070811403, "learning_rate": 4.898009642724688e-05, "loss": 0.7646, "step": 431 }, { "epoch": 0.15921864922141343, "grad_norm": 8.088096971604031, "learning_rate": 4.897700581035975e-05, "loss": 0.8142, "step": 432 }, { "epoch": 0.15958721090942596, "grad_norm": 13.199670174841847, "learning_rate": 4.897391519347262e-05, "loss": 0.9044, "step": 433 }, { "epoch": 0.15995577259743848, "grad_norm": 6.979528191915144, "learning_rate": 4.8970824576585486e-05, "loss": 0.9559, "step": 434 }, { "epoch": 0.16032433428545104, "grad_norm": 8.89511622472507, "learning_rate": 4.896773395969836e-05, "loss": 1.1393, "step": 435 }, { "epoch": 0.16069289597346356, "grad_norm": 5.066849711943109, "learning_rate": 4.896464334281123e-05, "loss": 0.6759, "step": 436 }, { "epoch": 0.1610614576614761, "grad_norm": 5.4577503426870475, "learning_rate": 4.89615527259241e-05, "loss": 0.8373, "step": 437 }, { "epoch": 0.1614300193494886, "grad_norm": 8.59038819187523, "learning_rate": 4.895846210903697e-05, "loss": 1.2523, "step": 438 }, { "epoch": 0.16179858103750114, "grad_norm": 6.981631565851141, "learning_rate": 4.8955371492149835e-05, "loss": 0.7483, "step": 439 }, { "epoch": 0.1621671427255137, "grad_norm": 8.640280194154359, "learning_rate": 4.8952280875262706e-05, "loss": 0.8421, "step": 440 }, { "epoch": 0.16253570441352622, "grad_norm": 9.797010502950473, "learning_rate": 4.894919025837558e-05, "loss": 0.9964, "step": 441 }, { "epoch": 0.16290426610153874, "grad_norm": 6.943700820103179, "learning_rate": 4.894609964148844e-05, "loss": 0.7658, "step": 442 }, { "epoch": 0.16327282778955127, "grad_norm": 7.687440453275475, "learning_rate": 4.894300902460131e-05, "loss": 0.6364, "step": 443 }, { "epoch": 0.16364138947756382, "grad_norm": 8.216614652498695, "learning_rate": 4.893991840771418e-05, "loss": 0.9897, "step": 444 }, { "epoch": 0.16400995116557635, "grad_norm": 6.573730740762873, "learning_rate": 4.893682779082705e-05, "loss": 1.0686, "step": 445 }, { "epoch": 0.16437851285358887, "grad_norm": 9.846734263662226, "learning_rate": 4.893373717393992e-05, "loss": 0.8936, "step": 446 }, { "epoch": 0.1647470745416014, "grad_norm": 7.239911920422687, "learning_rate": 4.893064655705279e-05, "loss": 0.6516, "step": 447 }, { "epoch": 0.16511563622961392, "grad_norm": 10.678568716545508, "learning_rate": 4.892755594016566e-05, "loss": 0.6812, "step": 448 }, { "epoch": 0.16548419791762647, "grad_norm": 5.9140583073589825, "learning_rate": 4.892446532327853e-05, "loss": 0.6794, "step": 449 }, { "epoch": 0.165852759605639, "grad_norm": 9.649280390813963, "learning_rate": 4.89213747063914e-05, "loss": 1.1173, "step": 450 }, { "epoch": 0.16622132129365152, "grad_norm": 8.943780572973601, "learning_rate": 4.891828408950427e-05, "loss": 1.0686, "step": 451 }, { "epoch": 0.16658988298166405, "grad_norm": 7.159788098316386, "learning_rate": 4.891519347261714e-05, "loss": 0.7747, "step": 452 }, { "epoch": 0.16695844466967658, "grad_norm": 6.970390639885789, "learning_rate": 4.8912102855730005e-05, "loss": 0.8826, "step": 453 }, { "epoch": 0.16732700635768913, "grad_norm": 8.054331345181222, "learning_rate": 4.8909012238842876e-05, "loss": 0.9973, "step": 454 }, { "epoch": 0.16769556804570165, "grad_norm": 5.977329977785239, "learning_rate": 4.890592162195575e-05, "loss": 0.6281, "step": 455 }, { "epoch": 0.16806412973371418, "grad_norm": 8.103524333795633, "learning_rate": 4.890283100506862e-05, "loss": 1.2745, "step": 456 }, { "epoch": 0.1684326914217267, "grad_norm": 10.714072583213134, "learning_rate": 4.889974038818148e-05, "loss": 0.9614, "step": 457 }, { "epoch": 0.16880125310973923, "grad_norm": 5.380398899875114, "learning_rate": 4.8896649771294354e-05, "loss": 0.6115, "step": 458 }, { "epoch": 0.16916981479775178, "grad_norm": 15.934323940580578, "learning_rate": 4.889355915440722e-05, "loss": 1.0823, "step": 459 }, { "epoch": 0.1695383764857643, "grad_norm": 7.740547075676352, "learning_rate": 4.889046853752009e-05, "loss": 0.8615, "step": 460 }, { "epoch": 0.16990693817377683, "grad_norm": 11.23787900902933, "learning_rate": 4.888737792063296e-05, "loss": 1.3548, "step": 461 }, { "epoch": 0.17027549986178936, "grad_norm": 6.137970251026819, "learning_rate": 4.888428730374583e-05, "loss": 0.9544, "step": 462 }, { "epoch": 0.17064406154980188, "grad_norm": 8.253860957341418, "learning_rate": 4.8881196686858696e-05, "loss": 0.7704, "step": 463 }, { "epoch": 0.17101262323781444, "grad_norm": 11.91105110507609, "learning_rate": 4.887810606997157e-05, "loss": 0.9681, "step": 464 }, { "epoch": 0.17138118492582696, "grad_norm": 5.59723150075307, "learning_rate": 4.887501545308444e-05, "loss": 0.9192, "step": 465 }, { "epoch": 0.1717497466138395, "grad_norm": 7.6022447081945135, "learning_rate": 4.887192483619731e-05, "loss": 0.8577, "step": 466 }, { "epoch": 0.172118308301852, "grad_norm": 7.089969114766287, "learning_rate": 4.886883421931018e-05, "loss": 0.6177, "step": 467 }, { "epoch": 0.17248686998986457, "grad_norm": 7.7627915676232915, "learning_rate": 4.8865743602423045e-05, "loss": 0.7318, "step": 468 }, { "epoch": 0.1728554316778771, "grad_norm": 6.063279563884897, "learning_rate": 4.886265298553592e-05, "loss": 0.9018, "step": 469 }, { "epoch": 0.17322399336588962, "grad_norm": 6.077180999957958, "learning_rate": 4.885956236864879e-05, "loss": 0.7286, "step": 470 }, { "epoch": 0.17359255505390214, "grad_norm": 13.04558376843961, "learning_rate": 4.885647175176166e-05, "loss": 0.7854, "step": 471 }, { "epoch": 0.17396111674191467, "grad_norm": 9.418757338796286, "learning_rate": 4.8853381134874523e-05, "loss": 0.7603, "step": 472 }, { "epoch": 0.17432967842992722, "grad_norm": 7.824961009553044, "learning_rate": 4.885029051798739e-05, "loss": 0.8334, "step": 473 }, { "epoch": 0.17469824011793975, "grad_norm": 8.813865177114497, "learning_rate": 4.884719990110026e-05, "loss": 0.8047, "step": 474 }, { "epoch": 0.17506680180595227, "grad_norm": 7.74383496598867, "learning_rate": 4.884410928421313e-05, "loss": 0.8236, "step": 475 }, { "epoch": 0.1754353634939648, "grad_norm": 6.681711133525229, "learning_rate": 4.8841018667326e-05, "loss": 0.6446, "step": 476 }, { "epoch": 0.17580392518197732, "grad_norm": 7.698191581500702, "learning_rate": 4.8837928050438866e-05, "loss": 0.778, "step": 477 }, { "epoch": 0.17617248686998987, "grad_norm": 5.854777493178448, "learning_rate": 4.883483743355174e-05, "loss": 0.5604, "step": 478 }, { "epoch": 0.1765410485580024, "grad_norm": 7.636548161857498, "learning_rate": 4.883174681666461e-05, "loss": 0.7153, "step": 479 }, { "epoch": 0.17690961024601493, "grad_norm": 15.16470447185668, "learning_rate": 4.882865619977748e-05, "loss": 1.0356, "step": 480 }, { "epoch": 0.17727817193402745, "grad_norm": 5.752056251728322, "learning_rate": 4.882556558289035e-05, "loss": 0.5821, "step": 481 }, { "epoch": 0.17764673362203998, "grad_norm": 8.724039421653618, "learning_rate": 4.8822474966003215e-05, "loss": 0.9034, "step": 482 }, { "epoch": 0.17801529531005253, "grad_norm": 7.209553727347468, "learning_rate": 4.8819384349116086e-05, "loss": 0.6555, "step": 483 }, { "epoch": 0.17838385699806505, "grad_norm": 8.881165847540707, "learning_rate": 4.881629373222896e-05, "loss": 0.7142, "step": 484 }, { "epoch": 0.17875241868607758, "grad_norm": 6.717261144993476, "learning_rate": 4.881320311534183e-05, "loss": 0.663, "step": 485 }, { "epoch": 0.1791209803740901, "grad_norm": 8.284177327540775, "learning_rate": 4.88101124984547e-05, "loss": 1.0132, "step": 486 }, { "epoch": 0.17948954206210266, "grad_norm": 10.139270662641987, "learning_rate": 4.880702188156756e-05, "loss": 0.877, "step": 487 }, { "epoch": 0.17985810375011518, "grad_norm": 7.41772293398659, "learning_rate": 4.880393126468043e-05, "loss": 0.8754, "step": 488 }, { "epoch": 0.1802266654381277, "grad_norm": 11.195070571574753, "learning_rate": 4.88008406477933e-05, "loss": 0.8664, "step": 489 }, { "epoch": 0.18059522712614023, "grad_norm": 8.295809172654682, "learning_rate": 4.879775003090617e-05, "loss": 0.6264, "step": 490 }, { "epoch": 0.18096378881415276, "grad_norm": 9.395635248931125, "learning_rate": 4.879465941401904e-05, "loss": 0.9539, "step": 491 }, { "epoch": 0.1813323505021653, "grad_norm": 12.051265564619525, "learning_rate": 4.879156879713191e-05, "loss": 1.249, "step": 492 }, { "epoch": 0.18170091219017784, "grad_norm": 6.591768518457527, "learning_rate": 4.878847818024478e-05, "loss": 0.741, "step": 493 }, { "epoch": 0.18206947387819036, "grad_norm": 8.85164222782386, "learning_rate": 4.878538756335765e-05, "loss": 0.7747, "step": 494 }, { "epoch": 0.1824380355662029, "grad_norm": 7.627425511539397, "learning_rate": 4.878229694647052e-05, "loss": 0.646, "step": 495 }, { "epoch": 0.1828065972542154, "grad_norm": 17.0535136162656, "learning_rate": 4.8779206329583385e-05, "loss": 0.859, "step": 496 }, { "epoch": 0.18317515894222797, "grad_norm": 8.710852543027416, "learning_rate": 4.8776115712696256e-05, "loss": 1.0686, "step": 497 }, { "epoch": 0.1835437206302405, "grad_norm": 6.956517452777703, "learning_rate": 4.877302509580913e-05, "loss": 0.6425, "step": 498 }, { "epoch": 0.18391228231825302, "grad_norm": 13.761601218274722, "learning_rate": 4.8769934478922e-05, "loss": 0.6185, "step": 499 }, { "epoch": 0.18428084400626554, "grad_norm": 7.440544723340442, "learning_rate": 4.876684386203487e-05, "loss": 0.9753, "step": 500 }, { "epoch": 0.18464940569427807, "grad_norm": 6.113763584206885, "learning_rate": 4.8763753245147734e-05, "loss": 0.9607, "step": 501 }, { "epoch": 0.18501796738229062, "grad_norm": 8.746900608996988, "learning_rate": 4.87606626282606e-05, "loss": 0.9656, "step": 502 }, { "epoch": 0.18538652907030315, "grad_norm": 5.178913777878637, "learning_rate": 4.875757201137347e-05, "loss": 0.6522, "step": 503 }, { "epoch": 0.18575509075831567, "grad_norm": 9.206484325609603, "learning_rate": 4.875448139448634e-05, "loss": 0.8288, "step": 504 }, { "epoch": 0.1861236524463282, "grad_norm": 10.413543436403167, "learning_rate": 4.875139077759921e-05, "loss": 0.6368, "step": 505 }, { "epoch": 0.18649221413434072, "grad_norm": 7.030327359170383, "learning_rate": 4.8748300160712076e-05, "loss": 0.8296, "step": 506 }, { "epoch": 0.18686077582235328, "grad_norm": 9.887093394101798, "learning_rate": 4.874520954382495e-05, "loss": 0.835, "step": 507 }, { "epoch": 0.1872293375103658, "grad_norm": 8.988788403092288, "learning_rate": 4.874211892693782e-05, "loss": 0.9289, "step": 508 }, { "epoch": 0.18759789919837833, "grad_norm": 11.71779553404711, "learning_rate": 4.873902831005069e-05, "loss": 1.1403, "step": 509 }, { "epoch": 0.18796646088639085, "grad_norm": 6.188263200642785, "learning_rate": 4.873593769316356e-05, "loss": 1.0606, "step": 510 }, { "epoch": 0.1883350225744034, "grad_norm": 5.296813424351718, "learning_rate": 4.8732847076276425e-05, "loss": 0.7014, "step": 511 }, { "epoch": 0.18870358426241593, "grad_norm": 8.101583688647557, "learning_rate": 4.87297564593893e-05, "loss": 1.041, "step": 512 }, { "epoch": 0.18907214595042846, "grad_norm": 8.182352326036954, "learning_rate": 4.872666584250217e-05, "loss": 1.3595, "step": 513 }, { "epoch": 0.18944070763844098, "grad_norm": 5.34619263263897, "learning_rate": 4.872357522561504e-05, "loss": 0.7581, "step": 514 }, { "epoch": 0.1898092693264535, "grad_norm": 5.622421012516956, "learning_rate": 4.8720484608727903e-05, "loss": 0.6578, "step": 515 }, { "epoch": 0.19017783101446606, "grad_norm": 6.340997709597836, "learning_rate": 4.8717393991840775e-05, "loss": 0.827, "step": 516 }, { "epoch": 0.19054639270247858, "grad_norm": 7.962894697003496, "learning_rate": 4.871430337495364e-05, "loss": 0.945, "step": 517 }, { "epoch": 0.1909149543904911, "grad_norm": 9.562452129169392, "learning_rate": 4.871121275806651e-05, "loss": 1.0357, "step": 518 }, { "epoch": 0.19128351607850363, "grad_norm": 5.864915999829716, "learning_rate": 4.870812214117938e-05, "loss": 0.8089, "step": 519 }, { "epoch": 0.19165207776651616, "grad_norm": 7.795741058026416, "learning_rate": 4.870503152429225e-05, "loss": 0.8201, "step": 520 }, { "epoch": 0.1920206394545287, "grad_norm": 6.577537528510563, "learning_rate": 4.870194090740512e-05, "loss": 0.9318, "step": 521 }, { "epoch": 0.19238920114254124, "grad_norm": 5.253682434533401, "learning_rate": 4.869885029051799e-05, "loss": 0.7895, "step": 522 }, { "epoch": 0.19275776283055376, "grad_norm": 8.219344084992427, "learning_rate": 4.869575967363086e-05, "loss": 1.0141, "step": 523 }, { "epoch": 0.1931263245185663, "grad_norm": 5.65284398837084, "learning_rate": 4.869266905674373e-05, "loss": 0.8392, "step": 524 }, { "epoch": 0.19349488620657881, "grad_norm": 6.804084549653825, "learning_rate": 4.8689578439856595e-05, "loss": 0.7974, "step": 525 }, { "epoch": 0.19386344789459137, "grad_norm": 8.794425447140506, "learning_rate": 4.8686487822969466e-05, "loss": 0.9292, "step": 526 }, { "epoch": 0.1942320095826039, "grad_norm": 5.385331271863862, "learning_rate": 4.868339720608234e-05, "loss": 1.0104, "step": 527 }, { "epoch": 0.19460057127061642, "grad_norm": 6.820105550626009, "learning_rate": 4.868030658919521e-05, "loss": 1.1333, "step": 528 }, { "epoch": 0.19496913295862894, "grad_norm": 6.354947134979087, "learning_rate": 4.867721597230808e-05, "loss": 0.873, "step": 529 }, { "epoch": 0.19533769464664147, "grad_norm": 5.5325374505817715, "learning_rate": 4.8674125355420944e-05, "loss": 0.8472, "step": 530 }, { "epoch": 0.19570625633465402, "grad_norm": 7.198123676803055, "learning_rate": 4.8671034738533815e-05, "loss": 0.8959, "step": 531 }, { "epoch": 0.19607481802266655, "grad_norm": 6.076371516127199, "learning_rate": 4.866794412164668e-05, "loss": 0.7192, "step": 532 }, { "epoch": 0.19644337971067907, "grad_norm": 7.237783201800779, "learning_rate": 4.866485350475955e-05, "loss": 1.0735, "step": 533 }, { "epoch": 0.1968119413986916, "grad_norm": 5.575512014919852, "learning_rate": 4.866176288787242e-05, "loss": 0.6169, "step": 534 }, { "epoch": 0.19718050308670415, "grad_norm": 12.933235681958251, "learning_rate": 4.8658672270985287e-05, "loss": 0.8393, "step": 535 }, { "epoch": 0.19754906477471668, "grad_norm": 6.728873357689113, "learning_rate": 4.865558165409816e-05, "loss": 0.7065, "step": 536 }, { "epoch": 0.1979176264627292, "grad_norm": 7.004445980145006, "learning_rate": 4.865249103721103e-05, "loss": 0.5157, "step": 537 }, { "epoch": 0.19828618815074173, "grad_norm": 8.595217381824634, "learning_rate": 4.86494004203239e-05, "loss": 1.199, "step": 538 }, { "epoch": 0.19865474983875425, "grad_norm": 6.897755089052864, "learning_rate": 4.864630980343677e-05, "loss": 0.7981, "step": 539 }, { "epoch": 0.1990233115267668, "grad_norm": 5.364900548747313, "learning_rate": 4.8643219186549636e-05, "loss": 0.5739, "step": 540 }, { "epoch": 0.19939187321477933, "grad_norm": 8.233220926153242, "learning_rate": 4.864012856966251e-05, "loss": 1.0226, "step": 541 }, { "epoch": 0.19976043490279186, "grad_norm": 7.683362886560938, "learning_rate": 4.863703795277538e-05, "loss": 1.0898, "step": 542 }, { "epoch": 0.20012899659080438, "grad_norm": 7.494724707252429, "learning_rate": 4.863394733588825e-05, "loss": 1.0289, "step": 543 }, { "epoch": 0.2004975582788169, "grad_norm": 5.042098013760849, "learning_rate": 4.8630856719001114e-05, "loss": 0.5465, "step": 544 }, { "epoch": 0.20086611996682946, "grad_norm": 6.392795632470598, "learning_rate": 4.8627766102113985e-05, "loss": 0.8064, "step": 545 }, { "epoch": 0.20123468165484198, "grad_norm": 6.406219035748339, "learning_rate": 4.8624675485226856e-05, "loss": 0.7443, "step": 546 }, { "epoch": 0.2016032433428545, "grad_norm": 8.945373481642577, "learning_rate": 4.862158486833973e-05, "loss": 0.6541, "step": 547 }, { "epoch": 0.20197180503086704, "grad_norm": 5.12054417132336, "learning_rate": 4.861849425145259e-05, "loss": 0.6588, "step": 548 }, { "epoch": 0.20234036671887956, "grad_norm": 7.240302604969176, "learning_rate": 4.8615403634565456e-05, "loss": 0.9026, "step": 549 }, { "epoch": 0.2027089284068921, "grad_norm": 7.328482613042619, "learning_rate": 4.861231301767833e-05, "loss": 0.9874, "step": 550 }, { "epoch": 0.20307749009490464, "grad_norm": 7.984900291169083, "learning_rate": 4.86092224007912e-05, "loss": 1.1016, "step": 551 }, { "epoch": 0.20344605178291716, "grad_norm": 6.235135522025752, "learning_rate": 4.860613178390407e-05, "loss": 0.9137, "step": 552 }, { "epoch": 0.2038146134709297, "grad_norm": 7.152789616086313, "learning_rate": 4.860304116701694e-05, "loss": 0.814, "step": 553 }, { "epoch": 0.20418317515894222, "grad_norm": 8.847605373749971, "learning_rate": 4.8599950550129805e-05, "loss": 0.8388, "step": 554 }, { "epoch": 0.20455173684695477, "grad_norm": 4.487499676972699, "learning_rate": 4.8596859933242677e-05, "loss": 0.6036, "step": 555 }, { "epoch": 0.2049202985349673, "grad_norm": 6.917094420415791, "learning_rate": 4.859376931635555e-05, "loss": 0.8324, "step": 556 }, { "epoch": 0.20528886022297982, "grad_norm": 9.68215026806941, "learning_rate": 4.859067869946842e-05, "loss": 0.8057, "step": 557 }, { "epoch": 0.20565742191099234, "grad_norm": 6.9394852830489056, "learning_rate": 4.858758808258129e-05, "loss": 0.8848, "step": 558 }, { "epoch": 0.2060259835990049, "grad_norm": 7.887818092183103, "learning_rate": 4.8584497465694155e-05, "loss": 0.8708, "step": 559 }, { "epoch": 0.20639454528701742, "grad_norm": 7.2662681376980425, "learning_rate": 4.8581406848807026e-05, "loss": 0.6349, "step": 560 }, { "epoch": 0.20676310697502995, "grad_norm": 6.825774252993724, "learning_rate": 4.85783162319199e-05, "loss": 0.7672, "step": 561 }, { "epoch": 0.20713166866304247, "grad_norm": 7.394540019969707, "learning_rate": 4.857522561503277e-05, "loss": 0.8126, "step": 562 }, { "epoch": 0.207500230351055, "grad_norm": 8.191015988259446, "learning_rate": 4.857213499814563e-05, "loss": 0.7453, "step": 563 }, { "epoch": 0.20786879203906755, "grad_norm": 7.412174993533654, "learning_rate": 4.85690443812585e-05, "loss": 0.4755, "step": 564 }, { "epoch": 0.20823735372708008, "grad_norm": 6.63075841003766, "learning_rate": 4.856595376437137e-05, "loss": 0.781, "step": 565 }, { "epoch": 0.2086059154150926, "grad_norm": 5.726601802175841, "learning_rate": 4.856286314748424e-05, "loss": 0.6644, "step": 566 }, { "epoch": 0.20897447710310513, "grad_norm": 12.59991561619189, "learning_rate": 4.855977253059711e-05, "loss": 1.0593, "step": 567 }, { "epoch": 0.20934303879111765, "grad_norm": 6.284810893052056, "learning_rate": 4.8556681913709975e-05, "loss": 0.7625, "step": 568 }, { "epoch": 0.2097116004791302, "grad_norm": 7.3678564615423, "learning_rate": 4.8553591296822846e-05, "loss": 1.0486, "step": 569 }, { "epoch": 0.21008016216714273, "grad_norm": 7.757014392008824, "learning_rate": 4.855050067993572e-05, "loss": 0.81, "step": 570 }, { "epoch": 0.21044872385515526, "grad_norm": 6.316026665600469, "learning_rate": 4.854741006304859e-05, "loss": 1.036, "step": 571 }, { "epoch": 0.21081728554316778, "grad_norm": 9.893789451072244, "learning_rate": 4.854431944616146e-05, "loss": 1.1461, "step": 572 }, { "epoch": 0.2111858472311803, "grad_norm": 5.830300260110699, "learning_rate": 4.8541228829274324e-05, "loss": 0.771, "step": 573 }, { "epoch": 0.21155440891919286, "grad_norm": 6.942104976351593, "learning_rate": 4.8538138212387195e-05, "loss": 0.8221, "step": 574 }, { "epoch": 0.21192297060720539, "grad_norm": 7.630322709164803, "learning_rate": 4.8535047595500067e-05, "loss": 0.8944, "step": 575 }, { "epoch": 0.2122915322952179, "grad_norm": 5.472009875227288, "learning_rate": 4.853195697861294e-05, "loss": 0.6937, "step": 576 }, { "epoch": 0.21266009398323044, "grad_norm": 7.707250268127838, "learning_rate": 4.85288663617258e-05, "loss": 0.7215, "step": 577 }, { "epoch": 0.21302865567124296, "grad_norm": 8.944752984685001, "learning_rate": 4.8525775744838667e-05, "loss": 0.9727, "step": 578 }, { "epoch": 0.21339721735925551, "grad_norm": 9.082151376893721, "learning_rate": 4.852268512795154e-05, "loss": 0.7788, "step": 579 }, { "epoch": 0.21376577904726804, "grad_norm": 6.0002711552702, "learning_rate": 4.851959451106441e-05, "loss": 0.6589, "step": 580 }, { "epoch": 0.21413434073528057, "grad_norm": 8.375403095622334, "learning_rate": 4.851650389417728e-05, "loss": 1.0668, "step": 581 }, { "epoch": 0.2145029024232931, "grad_norm": 5.8636742040207555, "learning_rate": 4.851341327729015e-05, "loss": 0.7106, "step": 582 }, { "epoch": 0.21487146411130564, "grad_norm": 7.1722446587399835, "learning_rate": 4.8510322660403016e-05, "loss": 0.597, "step": 583 }, { "epoch": 0.21524002579931817, "grad_norm": 7.070204861490738, "learning_rate": 4.850723204351589e-05, "loss": 0.7616, "step": 584 }, { "epoch": 0.2156085874873307, "grad_norm": 11.822693962430765, "learning_rate": 4.850414142662876e-05, "loss": 0.8836, "step": 585 }, { "epoch": 0.21597714917534322, "grad_norm": 8.035614846234932, "learning_rate": 4.850105080974163e-05, "loss": 0.7222, "step": 586 }, { "epoch": 0.21634571086335574, "grad_norm": 6.831363983459798, "learning_rate": 4.8497960192854494e-05, "loss": 0.9029, "step": 587 }, { "epoch": 0.2167142725513683, "grad_norm": 7.441511338660848, "learning_rate": 4.8494869575967365e-05, "loss": 0.7496, "step": 588 }, { "epoch": 0.21708283423938082, "grad_norm": 9.975237896599259, "learning_rate": 4.8491778959080236e-05, "loss": 0.889, "step": 589 }, { "epoch": 0.21745139592739335, "grad_norm": 6.764996294583069, "learning_rate": 4.848868834219311e-05, "loss": 0.6684, "step": 590 }, { "epoch": 0.21781995761540587, "grad_norm": 8.296943046448831, "learning_rate": 4.848559772530598e-05, "loss": 0.935, "step": 591 }, { "epoch": 0.2181885193034184, "grad_norm": 6.611960191813393, "learning_rate": 4.848250710841884e-05, "loss": 0.8318, "step": 592 }, { "epoch": 0.21855708099143095, "grad_norm": 6.539516929960966, "learning_rate": 4.847941649153171e-05, "loss": 0.6502, "step": 593 }, { "epoch": 0.21892564267944348, "grad_norm": 9.071123321855596, "learning_rate": 4.847632587464458e-05, "loss": 0.767, "step": 594 }, { "epoch": 0.219294204367456, "grad_norm": 6.279291711214829, "learning_rate": 4.847323525775745e-05, "loss": 0.6096, "step": 595 }, { "epoch": 0.21966276605546853, "grad_norm": 7.6793728178947465, "learning_rate": 4.847014464087032e-05, "loss": 0.7661, "step": 596 }, { "epoch": 0.22003132774348105, "grad_norm": 7.997248176313818, "learning_rate": 4.8467054023983185e-05, "loss": 0.8548, "step": 597 }, { "epoch": 0.2203998894314936, "grad_norm": 6.545240976721894, "learning_rate": 4.8463963407096057e-05, "loss": 0.6561, "step": 598 }, { "epoch": 0.22076845111950613, "grad_norm": 9.594576225364825, "learning_rate": 4.846087279020893e-05, "loss": 0.8541, "step": 599 }, { "epoch": 0.22113701280751866, "grad_norm": 7.575405238425677, "learning_rate": 4.84577821733218e-05, "loss": 0.7336, "step": 600 }, { "epoch": 0.22150557449553118, "grad_norm": 24.907834259917728, "learning_rate": 4.845469155643467e-05, "loss": 0.9898, "step": 601 }, { "epoch": 0.2218741361835437, "grad_norm": 10.751559831873669, "learning_rate": 4.8451600939547534e-05, "loss": 0.64, "step": 602 }, { "epoch": 0.22224269787155626, "grad_norm": 8.521985511636762, "learning_rate": 4.8448510322660406e-05, "loss": 0.8241, "step": 603 }, { "epoch": 0.22261125955956879, "grad_norm": 6.967796089333362, "learning_rate": 4.844541970577328e-05, "loss": 0.8131, "step": 604 }, { "epoch": 0.2229798212475813, "grad_norm": 5.570378414154633, "learning_rate": 4.844232908888615e-05, "loss": 0.7072, "step": 605 }, { "epoch": 0.22334838293559384, "grad_norm": 14.061009035544222, "learning_rate": 4.843923847199901e-05, "loss": 1.2932, "step": 606 }, { "epoch": 0.2237169446236064, "grad_norm": 9.432771218022967, "learning_rate": 4.8436147855111884e-05, "loss": 0.9193, "step": 607 }, { "epoch": 0.22408550631161891, "grad_norm": 8.779233218971116, "learning_rate": 4.843305723822475e-05, "loss": 0.7613, "step": 608 }, { "epoch": 0.22445406799963144, "grad_norm": 7.699730548850547, "learning_rate": 4.842996662133762e-05, "loss": 0.8793, "step": 609 }, { "epoch": 0.22482262968764397, "grad_norm": 6.195184117029653, "learning_rate": 4.842687600445049e-05, "loss": 0.9513, "step": 610 }, { "epoch": 0.2251911913756565, "grad_norm": 8.573168714177845, "learning_rate": 4.842378538756336e-05, "loss": 0.7426, "step": 611 }, { "epoch": 0.22555975306366904, "grad_norm": 7.5871578535334026, "learning_rate": 4.8420694770676226e-05, "loss": 0.8815, "step": 612 }, { "epoch": 0.22592831475168157, "grad_norm": 7.711286017089076, "learning_rate": 4.84176041537891e-05, "loss": 0.9626, "step": 613 }, { "epoch": 0.2262968764396941, "grad_norm": 11.502989670616273, "learning_rate": 4.841451353690197e-05, "loss": 0.9008, "step": 614 }, { "epoch": 0.22666543812770662, "grad_norm": 5.456506942869048, "learning_rate": 4.841142292001484e-05, "loss": 0.6623, "step": 615 }, { "epoch": 0.22703399981571915, "grad_norm": 5.549928152633117, "learning_rate": 4.8408332303127704e-05, "loss": 0.8131, "step": 616 }, { "epoch": 0.2274025615037317, "grad_norm": 8.048571006039383, "learning_rate": 4.8405241686240575e-05, "loss": 0.8864, "step": 617 }, { "epoch": 0.22777112319174422, "grad_norm": 8.895521910410526, "learning_rate": 4.8402151069353446e-05, "loss": 1.0838, "step": 618 }, { "epoch": 0.22813968487975675, "grad_norm": 5.88093267348874, "learning_rate": 4.839906045246632e-05, "loss": 0.734, "step": 619 }, { "epoch": 0.22850824656776927, "grad_norm": 5.653147315407776, "learning_rate": 4.839596983557919e-05, "loss": 0.6481, "step": 620 }, { "epoch": 0.2288768082557818, "grad_norm": 5.812752338797625, "learning_rate": 4.839287921869205e-05, "loss": 0.7881, "step": 621 }, { "epoch": 0.22924536994379435, "grad_norm": 6.383029144980761, "learning_rate": 4.8389788601804924e-05, "loss": 0.6908, "step": 622 }, { "epoch": 0.22961393163180688, "grad_norm": 6.08873491548891, "learning_rate": 4.838669798491779e-05, "loss": 0.8436, "step": 623 }, { "epoch": 0.2299824933198194, "grad_norm": 7.22591054913913, "learning_rate": 4.838360736803066e-05, "loss": 0.6733, "step": 624 }, { "epoch": 0.23035105500783193, "grad_norm": 6.052127735108564, "learning_rate": 4.838051675114353e-05, "loss": 0.9732, "step": 625 }, { "epoch": 0.23071961669584445, "grad_norm": 6.307895907610556, "learning_rate": 4.8377426134256396e-05, "loss": 0.7608, "step": 626 }, { "epoch": 0.231088178383857, "grad_norm": 6.522071338982827, "learning_rate": 4.837433551736927e-05, "loss": 0.9849, "step": 627 }, { "epoch": 0.23145674007186953, "grad_norm": 6.346447300739561, "learning_rate": 4.837124490048214e-05, "loss": 0.784, "step": 628 }, { "epoch": 0.23182530175988206, "grad_norm": 7.622389549788863, "learning_rate": 4.836815428359501e-05, "loss": 0.7024, "step": 629 }, { "epoch": 0.23219386344789458, "grad_norm": 14.378968329633715, "learning_rate": 4.836506366670788e-05, "loss": 0.7159, "step": 630 }, { "epoch": 0.23256242513590714, "grad_norm": 12.448633472867488, "learning_rate": 4.8361973049820745e-05, "loss": 0.7032, "step": 631 }, { "epoch": 0.23293098682391966, "grad_norm": 6.825038744917783, "learning_rate": 4.8358882432933616e-05, "loss": 0.6569, "step": 632 }, { "epoch": 0.2332995485119322, "grad_norm": 7.676202179139078, "learning_rate": 4.835579181604649e-05, "loss": 0.8502, "step": 633 }, { "epoch": 0.2336681101999447, "grad_norm": 8.392937425038118, "learning_rate": 4.835270119915936e-05, "loss": 0.6795, "step": 634 }, { "epoch": 0.23403667188795724, "grad_norm": 11.624316123617634, "learning_rate": 4.834961058227222e-05, "loss": 0.7646, "step": 635 }, { "epoch": 0.2344052335759698, "grad_norm": 9.881033683788162, "learning_rate": 4.8346519965385094e-05, "loss": 0.8943, "step": 636 }, { "epoch": 0.23477379526398232, "grad_norm": 11.390304163219877, "learning_rate": 4.8343429348497965e-05, "loss": 0.9407, "step": 637 }, { "epoch": 0.23514235695199484, "grad_norm": 7.977345815786629, "learning_rate": 4.834033873161083e-05, "loss": 0.81, "step": 638 }, { "epoch": 0.23551091864000737, "grad_norm": 8.098445752636641, "learning_rate": 4.83372481147237e-05, "loss": 0.8727, "step": 639 }, { "epoch": 0.2358794803280199, "grad_norm": 6.4362803664892265, "learning_rate": 4.8334157497836565e-05, "loss": 0.6225, "step": 640 }, { "epoch": 0.23624804201603244, "grad_norm": 8.785332541861676, "learning_rate": 4.8331066880949436e-05, "loss": 0.88, "step": 641 }, { "epoch": 0.23661660370404497, "grad_norm": 6.270855097769886, "learning_rate": 4.832797626406231e-05, "loss": 0.8221, "step": 642 }, { "epoch": 0.2369851653920575, "grad_norm": 6.917854605389492, "learning_rate": 4.832488564717518e-05, "loss": 0.7043, "step": 643 }, { "epoch": 0.23735372708007002, "grad_norm": 6.8152938853293925, "learning_rate": 4.832179503028805e-05, "loss": 0.7546, "step": 644 }, { "epoch": 0.23772228876808255, "grad_norm": 6.444941987456496, "learning_rate": 4.8318704413400914e-05, "loss": 0.6345, "step": 645 }, { "epoch": 0.2380908504560951, "grad_norm": 8.54775734205255, "learning_rate": 4.8315613796513786e-05, "loss": 0.7977, "step": 646 }, { "epoch": 0.23845941214410762, "grad_norm": 6.086175681314837, "learning_rate": 4.831252317962666e-05, "loss": 0.6418, "step": 647 }, { "epoch": 0.23882797383212015, "grad_norm": 7.595401537227558, "learning_rate": 4.830943256273953e-05, "loss": 0.9025, "step": 648 }, { "epoch": 0.23919653552013267, "grad_norm": 7.865037702020202, "learning_rate": 4.830634194585239e-05, "loss": 0.8886, "step": 649 }, { "epoch": 0.2395650972081452, "grad_norm": 7.579018703342487, "learning_rate": 4.8303251328965264e-05, "loss": 0.6476, "step": 650 }, { "epoch": 0.23993365889615775, "grad_norm": 6.286966483988982, "learning_rate": 4.8300160712078135e-05, "loss": 0.6612, "step": 651 }, { "epoch": 0.24030222058417028, "grad_norm": 8.003236116101919, "learning_rate": 4.8297070095191006e-05, "loss": 0.9309, "step": 652 }, { "epoch": 0.2406707822721828, "grad_norm": 9.342847697601943, "learning_rate": 4.829397947830387e-05, "loss": 0.7082, "step": 653 }, { "epoch": 0.24103934396019533, "grad_norm": 6.197084344279751, "learning_rate": 4.829088886141674e-05, "loss": 0.8694, "step": 654 }, { "epoch": 0.24140790564820788, "grad_norm": 7.653228537255178, "learning_rate": 4.8287798244529606e-05, "loss": 0.7631, "step": 655 }, { "epoch": 0.2417764673362204, "grad_norm": 6.276908019063618, "learning_rate": 4.828470762764248e-05, "loss": 0.611, "step": 656 }, { "epoch": 0.24214502902423293, "grad_norm": 10.88281320104987, "learning_rate": 4.828161701075535e-05, "loss": 0.8661, "step": 657 }, { "epoch": 0.24251359071224546, "grad_norm": 6.730676188402247, "learning_rate": 4.827852639386822e-05, "loss": 0.9604, "step": 658 }, { "epoch": 0.24288215240025798, "grad_norm": 8.015663072521667, "learning_rate": 4.8275435776981084e-05, "loss": 0.9072, "step": 659 }, { "epoch": 0.24325071408827054, "grad_norm": 6.409349240791118, "learning_rate": 4.8272345160093955e-05, "loss": 0.633, "step": 660 }, { "epoch": 0.24361927577628306, "grad_norm": 6.683532108131874, "learning_rate": 4.8269254543206826e-05, "loss": 0.8856, "step": 661 }, { "epoch": 0.2439878374642956, "grad_norm": 7.340426857824967, "learning_rate": 4.82661639263197e-05, "loss": 1.206, "step": 662 }, { "epoch": 0.2443563991523081, "grad_norm": 7.042874505401053, "learning_rate": 4.826307330943257e-05, "loss": 0.8283, "step": 663 }, { "epoch": 0.24472496084032064, "grad_norm": 7.597345952853782, "learning_rate": 4.825998269254543e-05, "loss": 0.8084, "step": 664 }, { "epoch": 0.2450935225283332, "grad_norm": 6.6186080902995394, "learning_rate": 4.8256892075658304e-05, "loss": 0.6847, "step": 665 }, { "epoch": 0.24546208421634572, "grad_norm": 5.232374122005872, "learning_rate": 4.8253801458771176e-05, "loss": 0.7299, "step": 666 }, { "epoch": 0.24583064590435824, "grad_norm": 7.077739646139518, "learning_rate": 4.825071084188405e-05, "loss": 0.7086, "step": 667 }, { "epoch": 0.24619920759237077, "grad_norm": 5.753508451116615, "learning_rate": 4.824762022499691e-05, "loss": 1.0008, "step": 668 }, { "epoch": 0.2465677692803833, "grad_norm": 5.854210614710297, "learning_rate": 4.8244529608109776e-05, "loss": 0.5212, "step": 669 }, { "epoch": 0.24693633096839585, "grad_norm": 10.146489826454262, "learning_rate": 4.824143899122265e-05, "loss": 0.7101, "step": 670 }, { "epoch": 0.24730489265640837, "grad_norm": 6.654797319382503, "learning_rate": 4.823834837433552e-05, "loss": 0.7915, "step": 671 }, { "epoch": 0.2476734543444209, "grad_norm": 6.765575946163104, "learning_rate": 4.823525775744839e-05, "loss": 0.8502, "step": 672 }, { "epoch": 0.24804201603243342, "grad_norm": 13.644163880861754, "learning_rate": 4.823216714056126e-05, "loss": 0.8276, "step": 673 }, { "epoch": 0.24841057772044595, "grad_norm": 8.442885650962848, "learning_rate": 4.8229076523674125e-05, "loss": 0.7905, "step": 674 }, { "epoch": 0.2487791394084585, "grad_norm": 9.862682531333308, "learning_rate": 4.8225985906786996e-05, "loss": 0.9078, "step": 675 }, { "epoch": 0.24914770109647102, "grad_norm": 6.4387999768281015, "learning_rate": 4.822289528989987e-05, "loss": 0.7665, "step": 676 }, { "epoch": 0.24951626278448355, "grad_norm": 6.971166901512974, "learning_rate": 4.821980467301274e-05, "loss": 0.5786, "step": 677 }, { "epoch": 0.24988482447249608, "grad_norm": 7.775184810686627, "learning_rate": 4.82167140561256e-05, "loss": 0.7991, "step": 678 }, { "epoch": 0.25025338616050863, "grad_norm": 6.94174779117192, "learning_rate": 4.8213623439238474e-05, "loss": 0.8433, "step": 679 }, { "epoch": 0.25062194784852115, "grad_norm": 6.567908392654155, "learning_rate": 4.8210532822351345e-05, "loss": 0.7505, "step": 680 }, { "epoch": 0.2509905095365337, "grad_norm": 5.7280531292374715, "learning_rate": 4.8207442205464216e-05, "loss": 0.7736, "step": 681 }, { "epoch": 0.2513590712245462, "grad_norm": 5.2944510435952985, "learning_rate": 4.820435158857709e-05, "loss": 0.7101, "step": 682 }, { "epoch": 0.25172763291255873, "grad_norm": 12.78002983555423, "learning_rate": 4.820126097168995e-05, "loss": 1.0615, "step": 683 }, { "epoch": 0.25209619460057126, "grad_norm": 5.877377292872514, "learning_rate": 4.8198170354802816e-05, "loss": 0.7712, "step": 684 }, { "epoch": 0.2524647562885838, "grad_norm": 8.297446940621064, "learning_rate": 4.819507973791569e-05, "loss": 0.8325, "step": 685 }, { "epoch": 0.2528333179765963, "grad_norm": 6.190247821438194, "learning_rate": 4.819198912102856e-05, "loss": 0.6891, "step": 686 }, { "epoch": 0.2532018796646089, "grad_norm": 4.968763435393589, "learning_rate": 4.818889850414143e-05, "loss": 0.696, "step": 687 }, { "epoch": 0.2535704413526214, "grad_norm": 7.109925770647069, "learning_rate": 4.8185807887254294e-05, "loss": 0.96, "step": 688 }, { "epoch": 0.25393900304063394, "grad_norm": 7.474230682692079, "learning_rate": 4.8182717270367166e-05, "loss": 0.9032, "step": 689 }, { "epoch": 0.25430756472864646, "grad_norm": 5.624420644806155, "learning_rate": 4.817962665348004e-05, "loss": 0.7155, "step": 690 }, { "epoch": 0.254676126416659, "grad_norm": 8.363914326809603, "learning_rate": 4.817653603659291e-05, "loss": 0.9604, "step": 691 }, { "epoch": 0.2550446881046715, "grad_norm": 7.81530516045129, "learning_rate": 4.817344541970578e-05, "loss": 0.9059, "step": 692 }, { "epoch": 0.25541324979268404, "grad_norm": 8.886941102987503, "learning_rate": 4.8170354802818644e-05, "loss": 0.9995, "step": 693 }, { "epoch": 0.25578181148069656, "grad_norm": 5.986747729482581, "learning_rate": 4.8167264185931515e-05, "loss": 0.8124, "step": 694 }, { "epoch": 0.2561503731687091, "grad_norm": 6.406135074235905, "learning_rate": 4.8164173569044386e-05, "loss": 1.0084, "step": 695 }, { "epoch": 0.25651893485672167, "grad_norm": 8.781301260693354, "learning_rate": 4.816108295215726e-05, "loss": 1.0689, "step": 696 }, { "epoch": 0.2568874965447342, "grad_norm": 5.016519532100157, "learning_rate": 4.815799233527012e-05, "loss": 0.7552, "step": 697 }, { "epoch": 0.2572560582327467, "grad_norm": 6.763557664445054, "learning_rate": 4.815490171838299e-05, "loss": 0.8393, "step": 698 }, { "epoch": 0.25762461992075925, "grad_norm": 6.963978318357595, "learning_rate": 4.815181110149586e-05, "loss": 0.8374, "step": 699 }, { "epoch": 0.25799318160877177, "grad_norm": 5.709283717135114, "learning_rate": 4.814872048460873e-05, "loss": 0.9063, "step": 700 }, { "epoch": 0.2583617432967843, "grad_norm": 8.268311033469375, "learning_rate": 4.81456298677216e-05, "loss": 1.2959, "step": 701 }, { "epoch": 0.2587303049847968, "grad_norm": 8.98420601063898, "learning_rate": 4.814253925083447e-05, "loss": 0.6727, "step": 702 }, { "epoch": 0.25909886667280935, "grad_norm": 6.964979580139328, "learning_rate": 4.8139448633947335e-05, "loss": 0.8096, "step": 703 }, { "epoch": 0.2594674283608219, "grad_norm": 13.440364088848945, "learning_rate": 4.8136358017060206e-05, "loss": 0.9052, "step": 704 }, { "epoch": 0.2598359900488344, "grad_norm": 6.24596671618786, "learning_rate": 4.813326740017308e-05, "loss": 0.8188, "step": 705 }, { "epoch": 0.260204551736847, "grad_norm": 7.742087478068586, "learning_rate": 4.813017678328595e-05, "loss": 0.8411, "step": 706 }, { "epoch": 0.2605731134248595, "grad_norm": 9.484220550636989, "learning_rate": 4.812708616639881e-05, "loss": 0.6713, "step": 707 }, { "epoch": 0.26094167511287203, "grad_norm": 7.173347936896641, "learning_rate": 4.8123995549511684e-05, "loss": 0.7867, "step": 708 }, { "epoch": 0.26131023680088455, "grad_norm": 7.232712227074154, "learning_rate": 4.8120904932624556e-05, "loss": 0.8336, "step": 709 }, { "epoch": 0.2616787984888971, "grad_norm": 5.477677388246057, "learning_rate": 4.811781431573743e-05, "loss": 0.7024, "step": 710 }, { "epoch": 0.2620473601769096, "grad_norm": 4.594863445257803, "learning_rate": 4.81147236988503e-05, "loss": 0.6001, "step": 711 }, { "epoch": 0.26241592186492213, "grad_norm": 9.830292139844445, "learning_rate": 4.811163308196316e-05, "loss": 0.5911, "step": 712 }, { "epoch": 0.26278448355293466, "grad_norm": 5.772778051394671, "learning_rate": 4.8108542465076034e-05, "loss": 0.7176, "step": 713 }, { "epoch": 0.2631530452409472, "grad_norm": 7.077046225825482, "learning_rate": 4.81054518481889e-05, "loss": 0.6971, "step": 714 }, { "epoch": 0.26352160692895976, "grad_norm": 7.555556459364494, "learning_rate": 4.810236123130177e-05, "loss": 0.8229, "step": 715 }, { "epoch": 0.2638901686169723, "grad_norm": 6.369896135905715, "learning_rate": 4.809927061441464e-05, "loss": 0.5324, "step": 716 }, { "epoch": 0.2642587303049848, "grad_norm": 7.203363046415667, "learning_rate": 4.8096179997527505e-05, "loss": 0.9731, "step": 717 }, { "epoch": 0.26462729199299734, "grad_norm": 6.754021188564348, "learning_rate": 4.8093089380640376e-05, "loss": 0.8501, "step": 718 }, { "epoch": 0.26499585368100986, "grad_norm": 7.8847658669030185, "learning_rate": 4.808999876375325e-05, "loss": 0.8448, "step": 719 }, { "epoch": 0.2653644153690224, "grad_norm": 8.604107749558674, "learning_rate": 4.808690814686612e-05, "loss": 0.8087, "step": 720 }, { "epoch": 0.2657329770570349, "grad_norm": 9.269023407127365, "learning_rate": 4.808381752997899e-05, "loss": 0.9598, "step": 721 }, { "epoch": 0.26610153874504744, "grad_norm": 8.059773301942862, "learning_rate": 4.8080726913091854e-05, "loss": 0.7651, "step": 722 }, { "epoch": 0.26647010043305996, "grad_norm": 6.377933201389516, "learning_rate": 4.8077636296204725e-05, "loss": 0.4918, "step": 723 }, { "epoch": 0.2668386621210725, "grad_norm": 8.294200056523637, "learning_rate": 4.8074545679317596e-05, "loss": 0.9534, "step": 724 }, { "epoch": 0.26720722380908507, "grad_norm": 9.374314753602492, "learning_rate": 4.807145506243047e-05, "loss": 0.8597, "step": 725 }, { "epoch": 0.2675757854970976, "grad_norm": 9.441223622561376, "learning_rate": 4.806836444554333e-05, "loss": 0.6721, "step": 726 }, { "epoch": 0.2679443471851101, "grad_norm": 6.636328458395865, "learning_rate": 4.80652738286562e-05, "loss": 0.752, "step": 727 }, { "epoch": 0.26831290887312265, "grad_norm": 9.375297846831154, "learning_rate": 4.8062183211769074e-05, "loss": 1.0671, "step": 728 }, { "epoch": 0.26868147056113517, "grad_norm": 7.338062437684451, "learning_rate": 4.805909259488194e-05, "loss": 0.8129, "step": 729 }, { "epoch": 0.2690500322491477, "grad_norm": 8.424150318800704, "learning_rate": 4.805600197799481e-05, "loss": 0.7995, "step": 730 }, { "epoch": 0.2694185939371602, "grad_norm": 8.497676756261457, "learning_rate": 4.8052911361107674e-05, "loss": 0.9867, "step": 731 }, { "epoch": 0.26978715562517275, "grad_norm": 5.323815356370886, "learning_rate": 4.8049820744220546e-05, "loss": 0.6157, "step": 732 }, { "epoch": 0.2701557173131853, "grad_norm": 5.594097105370661, "learning_rate": 4.804673012733342e-05, "loss": 0.716, "step": 733 }, { "epoch": 0.2705242790011978, "grad_norm": 9.577776904104871, "learning_rate": 4.804363951044629e-05, "loss": 0.8655, "step": 734 }, { "epoch": 0.2708928406892104, "grad_norm": 13.69175507758199, "learning_rate": 4.804054889355916e-05, "loss": 0.7305, "step": 735 }, { "epoch": 0.2712614023772229, "grad_norm": 9.577712779796915, "learning_rate": 4.8037458276672024e-05, "loss": 0.8709, "step": 736 }, { "epoch": 0.27162996406523543, "grad_norm": 8.029692858835048, "learning_rate": 4.8034367659784895e-05, "loss": 0.6497, "step": 737 }, { "epoch": 0.27199852575324796, "grad_norm": 10.16346160730987, "learning_rate": 4.8031277042897766e-05, "loss": 0.8452, "step": 738 }, { "epoch": 0.2723670874412605, "grad_norm": 8.360645347399924, "learning_rate": 4.802818642601064e-05, "loss": 0.7467, "step": 739 }, { "epoch": 0.272735649129273, "grad_norm": 10.69669022752975, "learning_rate": 4.80250958091235e-05, "loss": 0.9475, "step": 740 }, { "epoch": 0.27310421081728553, "grad_norm": 6.308945060845095, "learning_rate": 4.802200519223637e-05, "loss": 0.7366, "step": 741 }, { "epoch": 0.27347277250529806, "grad_norm": 7.2138755822045715, "learning_rate": 4.8018914575349244e-05, "loss": 0.7434, "step": 742 }, { "epoch": 0.2738413341933106, "grad_norm": 8.921817705869012, "learning_rate": 4.8015823958462115e-05, "loss": 0.9079, "step": 743 }, { "epoch": 0.27420989588132316, "grad_norm": 11.574518009968534, "learning_rate": 4.801273334157498e-05, "loss": 1.0267, "step": 744 }, { "epoch": 0.2745784575693357, "grad_norm": 12.692306134417246, "learning_rate": 4.800964272468785e-05, "loss": 1.027, "step": 745 }, { "epoch": 0.2749470192573482, "grad_norm": 8.592363169348953, "learning_rate": 4.8006552107800715e-05, "loss": 0.8527, "step": 746 }, { "epoch": 0.27531558094536074, "grad_norm": 9.769690950439008, "learning_rate": 4.8003461490913586e-05, "loss": 1.1169, "step": 747 }, { "epoch": 0.27568414263337326, "grad_norm": 10.02570168187081, "learning_rate": 4.800037087402646e-05, "loss": 0.7265, "step": 748 }, { "epoch": 0.2760527043213858, "grad_norm": 8.723115436581237, "learning_rate": 4.799728025713933e-05, "loss": 0.878, "step": 749 }, { "epoch": 0.2764212660093983, "grad_norm": 12.968194641723457, "learning_rate": 4.799418964025219e-05, "loss": 1.1912, "step": 750 }, { "epoch": 0.27678982769741084, "grad_norm": 5.639924320227234, "learning_rate": 4.7991099023365064e-05, "loss": 0.8765, "step": 751 }, { "epoch": 0.27715838938542336, "grad_norm": 6.590885064251673, "learning_rate": 4.7988008406477935e-05, "loss": 0.8803, "step": 752 }, { "epoch": 0.2775269510734359, "grad_norm": 5.809639175424994, "learning_rate": 4.798491778959081e-05, "loss": 0.9403, "step": 753 }, { "epoch": 0.27789551276144847, "grad_norm": 8.210547237395325, "learning_rate": 4.798182717270368e-05, "loss": 0.893, "step": 754 }, { "epoch": 0.278264074449461, "grad_norm": 5.766288574407271, "learning_rate": 4.797873655581654e-05, "loss": 0.5202, "step": 755 }, { "epoch": 0.2786326361374735, "grad_norm": 7.588442859167846, "learning_rate": 4.7975645938929413e-05, "loss": 0.8849, "step": 756 }, { "epoch": 0.27900119782548605, "grad_norm": 6.465476861937261, "learning_rate": 4.7972555322042285e-05, "loss": 0.6, "step": 757 }, { "epoch": 0.2793697595134986, "grad_norm": 4.962166509304626, "learning_rate": 4.7969464705155156e-05, "loss": 0.9215, "step": 758 }, { "epoch": 0.2797383212015111, "grad_norm": 6.854517996305059, "learning_rate": 4.796637408826802e-05, "loss": 0.7478, "step": 759 }, { "epoch": 0.2801068828895236, "grad_norm": 6.998561166299649, "learning_rate": 4.7963283471380885e-05, "loss": 0.7921, "step": 760 }, { "epoch": 0.28047544457753615, "grad_norm": 4.7646719558050945, "learning_rate": 4.7960192854493756e-05, "loss": 0.6565, "step": 761 }, { "epoch": 0.2808440062655487, "grad_norm": 7.152826948110053, "learning_rate": 4.795710223760663e-05, "loss": 0.6266, "step": 762 }, { "epoch": 0.28121256795356125, "grad_norm": 6.6684009521432435, "learning_rate": 4.79540116207195e-05, "loss": 0.8902, "step": 763 }, { "epoch": 0.2815811296415738, "grad_norm": 15.555119123845733, "learning_rate": 4.795092100383237e-05, "loss": 0.8506, "step": 764 }, { "epoch": 0.2819496913295863, "grad_norm": 5.775201386082107, "learning_rate": 4.7947830386945234e-05, "loss": 0.7939, "step": 765 }, { "epoch": 0.28231825301759883, "grad_norm": 7.5446306449423695, "learning_rate": 4.7944739770058105e-05, "loss": 0.601, "step": 766 }, { "epoch": 0.28268681470561136, "grad_norm": 9.369778819532142, "learning_rate": 4.7941649153170976e-05, "loss": 0.9005, "step": 767 }, { "epoch": 0.2830553763936239, "grad_norm": 9.24622577271608, "learning_rate": 4.793855853628385e-05, "loss": 0.8861, "step": 768 }, { "epoch": 0.2834239380816364, "grad_norm": 6.644413843300621, "learning_rate": 4.793546791939671e-05, "loss": 0.7205, "step": 769 }, { "epoch": 0.28379249976964893, "grad_norm": 5.434596898098719, "learning_rate": 4.793237730250958e-05, "loss": 0.5607, "step": 770 }, { "epoch": 0.28416106145766146, "grad_norm": 9.101605250704319, "learning_rate": 4.7929286685622454e-05, "loss": 0.9403, "step": 771 }, { "epoch": 0.284529623145674, "grad_norm": 6.179836883861322, "learning_rate": 4.7926196068735325e-05, "loss": 0.4798, "step": 772 }, { "epoch": 0.28489818483368656, "grad_norm": 8.211282216538928, "learning_rate": 4.79231054518482e-05, "loss": 0.7473, "step": 773 }, { "epoch": 0.2852667465216991, "grad_norm": 14.94890794114448, "learning_rate": 4.792001483496106e-05, "loss": 0.7754, "step": 774 }, { "epoch": 0.2856353082097116, "grad_norm": 5.812513453970484, "learning_rate": 4.7916924218073925e-05, "loss": 0.6005, "step": 775 }, { "epoch": 0.28600386989772414, "grad_norm": 6.062298328938682, "learning_rate": 4.79138336011868e-05, "loss": 0.7981, "step": 776 }, { "epoch": 0.28637243158573666, "grad_norm": 6.566042570609979, "learning_rate": 4.791074298429967e-05, "loss": 0.7427, "step": 777 }, { "epoch": 0.2867409932737492, "grad_norm": 6.697652080841173, "learning_rate": 4.790765236741254e-05, "loss": 0.718, "step": 778 }, { "epoch": 0.2871095549617617, "grad_norm": 9.147188148572797, "learning_rate": 4.7904561750525403e-05, "loss": 0.8228, "step": 779 }, { "epoch": 0.28747811664977424, "grad_norm": 6.722259598445058, "learning_rate": 4.7901471133638275e-05, "loss": 0.663, "step": 780 }, { "epoch": 0.28784667833778677, "grad_norm": 7.346890691028943, "learning_rate": 4.7898380516751146e-05, "loss": 0.6543, "step": 781 }, { "epoch": 0.2882152400257993, "grad_norm": 10.449209988314085, "learning_rate": 4.789528989986402e-05, "loss": 0.9265, "step": 782 }, { "epoch": 0.28858380171381187, "grad_norm": 5.989443711574648, "learning_rate": 4.789219928297689e-05, "loss": 0.7055, "step": 783 }, { "epoch": 0.2889523634018244, "grad_norm": 4.992599350920179, "learning_rate": 4.788910866608975e-05, "loss": 0.5238, "step": 784 }, { "epoch": 0.2893209250898369, "grad_norm": 7.245451454013127, "learning_rate": 4.7886018049202624e-05, "loss": 0.8739, "step": 785 }, { "epoch": 0.28968948677784945, "grad_norm": 7.5030460529868925, "learning_rate": 4.7882927432315495e-05, "loss": 0.9242, "step": 786 }, { "epoch": 0.290058048465862, "grad_norm": 8.898540814396947, "learning_rate": 4.7879836815428366e-05, "loss": 0.7777, "step": 787 }, { "epoch": 0.2904266101538745, "grad_norm": 9.395905239960667, "learning_rate": 4.787674619854123e-05, "loss": 0.8461, "step": 788 }, { "epoch": 0.290795171841887, "grad_norm": 8.653405779205656, "learning_rate": 4.7873655581654095e-05, "loss": 0.963, "step": 789 }, { "epoch": 0.29116373352989955, "grad_norm": 7.726113455812054, "learning_rate": 4.7870564964766966e-05, "loss": 0.8295, "step": 790 }, { "epoch": 0.2915322952179121, "grad_norm": 8.424712260508562, "learning_rate": 4.786747434787984e-05, "loss": 0.8041, "step": 791 }, { "epoch": 0.29190085690592465, "grad_norm": 7.995978298201696, "learning_rate": 4.786438373099271e-05, "loss": 0.9144, "step": 792 }, { "epoch": 0.2922694185939372, "grad_norm": 9.019410712030203, "learning_rate": 4.786129311410558e-05, "loss": 0.8421, "step": 793 }, { "epoch": 0.2926379802819497, "grad_norm": 9.66607225717144, "learning_rate": 4.7858202497218444e-05, "loss": 0.8821, "step": 794 }, { "epoch": 0.29300654196996223, "grad_norm": 7.295081103934916, "learning_rate": 4.7855111880331315e-05, "loss": 0.7479, "step": 795 }, { "epoch": 0.29337510365797476, "grad_norm": 6.555464397306682, "learning_rate": 4.785202126344419e-05, "loss": 0.7604, "step": 796 }, { "epoch": 0.2937436653459873, "grad_norm": 10.359076724354091, "learning_rate": 4.784893064655706e-05, "loss": 0.6979, "step": 797 }, { "epoch": 0.2941122270339998, "grad_norm": 8.258895240038918, "learning_rate": 4.784584002966992e-05, "loss": 0.857, "step": 798 }, { "epoch": 0.29448078872201233, "grad_norm": 4.694515420852412, "learning_rate": 4.7842749412782793e-05, "loss": 0.4921, "step": 799 }, { "epoch": 0.29484935041002486, "grad_norm": 6.5991056125246175, "learning_rate": 4.7839658795895665e-05, "loss": 0.7917, "step": 800 }, { "epoch": 0.29484935041002486, "eval_bleu": 0.05491603477017873, "eval_bleu_1gram": 0.40078720819653285, "eval_bleu_2gram": 0.18584224998063378, "eval_bleu_3gram": 0.08287622851918461, "eval_bleu_4gram": 0.04174297198179999, "eval_rag_val_loss": 0.8779409628643625, "eval_rouge1": 0.38665359626568463, "eval_rouge2": 0.17530052568127902, "eval_rougeL": 0.3824496680912904, "step": 800 }, { "epoch": 0.2952179120980374, "grad_norm": 6.2440593715388975, "learning_rate": 4.7836568179008536e-05, "loss": 0.6721, "step": 801 }, { "epoch": 0.29558647378604996, "grad_norm": 5.014271396031196, "learning_rate": 4.783347756212141e-05, "loss": 0.6104, "step": 802 }, { "epoch": 0.2959550354740625, "grad_norm": 6.7189878909556455, "learning_rate": 4.783038694523427e-05, "loss": 0.6895, "step": 803 }, { "epoch": 0.296323597162075, "grad_norm": 6.27451612575063, "learning_rate": 4.782729632834714e-05, "loss": 0.9899, "step": 804 }, { "epoch": 0.29669215885008754, "grad_norm": 5.7363437526387395, "learning_rate": 4.782420571146001e-05, "loss": 0.7504, "step": 805 }, { "epoch": 0.29706072053810006, "grad_norm": 7.329258682630465, "learning_rate": 4.782111509457288e-05, "loss": 0.7788, "step": 806 }, { "epoch": 0.2974292822261126, "grad_norm": 6.383556532713379, "learning_rate": 4.781802447768575e-05, "loss": 1.0166, "step": 807 }, { "epoch": 0.2977978439141251, "grad_norm": 7.694762234450559, "learning_rate": 4.7814933860798614e-05, "loss": 0.7162, "step": 808 }, { "epoch": 0.29816640560213764, "grad_norm": 9.98613235219065, "learning_rate": 4.7811843243911485e-05, "loss": 0.953, "step": 809 }, { "epoch": 0.29853496729015017, "grad_norm": 8.586794059866726, "learning_rate": 4.7808752627024356e-05, "loss": 0.763, "step": 810 }, { "epoch": 0.29890352897816275, "grad_norm": 6.454106217017727, "learning_rate": 4.780566201013723e-05, "loss": 0.8687, "step": 811 }, { "epoch": 0.29927209066617527, "grad_norm": 5.880866185960022, "learning_rate": 4.780257139325009e-05, "loss": 0.8715, "step": 812 }, { "epoch": 0.2996406523541878, "grad_norm": 6.992800143377104, "learning_rate": 4.779948077636296e-05, "loss": 0.6465, "step": 813 }, { "epoch": 0.3000092140422003, "grad_norm": 6.0334456010071476, "learning_rate": 4.7796390159475834e-05, "loss": 0.6762, "step": 814 }, { "epoch": 0.30037777573021285, "grad_norm": 7.087118536154719, "learning_rate": 4.7793299542588705e-05, "loss": 0.8202, "step": 815 }, { "epoch": 0.3007463374182254, "grad_norm": 5.375031936905483, "learning_rate": 4.7790208925701577e-05, "loss": 0.7512, "step": 816 }, { "epoch": 0.3011148991062379, "grad_norm": 10.52372676732165, "learning_rate": 4.778711830881444e-05, "loss": 0.8848, "step": 817 }, { "epoch": 0.3014834607942504, "grad_norm": 10.207951711151804, "learning_rate": 4.778402769192731e-05, "loss": 0.8715, "step": 818 }, { "epoch": 0.30185202248226295, "grad_norm": 8.506126439908988, "learning_rate": 4.778093707504018e-05, "loss": 0.8853, "step": 819 }, { "epoch": 0.3022205841702755, "grad_norm": 11.670364956285242, "learning_rate": 4.777784645815305e-05, "loss": 1.1497, "step": 820 }, { "epoch": 0.30258914585828806, "grad_norm": 6.470227059060662, "learning_rate": 4.777475584126592e-05, "loss": 0.6817, "step": 821 }, { "epoch": 0.3029577075463006, "grad_norm": 9.433477289653803, "learning_rate": 4.7771665224378783e-05, "loss": 1.0885, "step": 822 }, { "epoch": 0.3033262692343131, "grad_norm": 7.748698463668552, "learning_rate": 4.7768574607491655e-05, "loss": 0.8243, "step": 823 }, { "epoch": 0.30369483092232563, "grad_norm": 5.897231224956799, "learning_rate": 4.7765483990604526e-05, "loss": 0.8731, "step": 824 }, { "epoch": 0.30406339261033816, "grad_norm": 5.528597265657002, "learning_rate": 4.77623933737174e-05, "loss": 0.7936, "step": 825 }, { "epoch": 0.3044319542983507, "grad_norm": 13.793276843592475, "learning_rate": 4.775930275683027e-05, "loss": 0.8801, "step": 826 }, { "epoch": 0.3048005159863632, "grad_norm": 4.746863484070441, "learning_rate": 4.775621213994313e-05, "loss": 0.7171, "step": 827 }, { "epoch": 0.30516907767437573, "grad_norm": 4.812572924569235, "learning_rate": 4.7753121523056004e-05, "loss": 0.6249, "step": 828 }, { "epoch": 0.30553763936238826, "grad_norm": 9.189732701977194, "learning_rate": 4.7750030906168875e-05, "loss": 0.8497, "step": 829 }, { "epoch": 0.3059062010504008, "grad_norm": 7.248144767427047, "learning_rate": 4.7746940289281746e-05, "loss": 0.9221, "step": 830 }, { "epoch": 0.30627476273841336, "grad_norm": 11.004479623196353, "learning_rate": 4.774384967239461e-05, "loss": 0.8754, "step": 831 }, { "epoch": 0.3066433244264259, "grad_norm": 4.885361053649128, "learning_rate": 4.774075905550748e-05, "loss": 0.737, "step": 832 }, { "epoch": 0.3070118861144384, "grad_norm": 6.529265910037254, "learning_rate": 4.773766843862035e-05, "loss": 0.8232, "step": 833 }, { "epoch": 0.30738044780245094, "grad_norm": 6.302493785596338, "learning_rate": 4.7734577821733224e-05, "loss": 0.6499, "step": 834 }, { "epoch": 0.30774900949046347, "grad_norm": 9.797006609213682, "learning_rate": 4.773148720484609e-05, "loss": 0.8085, "step": 835 }, { "epoch": 0.308117571178476, "grad_norm": 6.396247311923841, "learning_rate": 4.772839658795896e-05, "loss": 0.8194, "step": 836 }, { "epoch": 0.3084861328664885, "grad_norm": 5.776038878164716, "learning_rate": 4.7725305971071824e-05, "loss": 0.6135, "step": 837 }, { "epoch": 0.30885469455450104, "grad_norm": 6.616828341234438, "learning_rate": 4.7722215354184695e-05, "loss": 0.709, "step": 838 }, { "epoch": 0.30922325624251357, "grad_norm": 8.576575523195615, "learning_rate": 4.7719124737297567e-05, "loss": 0.7964, "step": 839 }, { "epoch": 0.30959181793052615, "grad_norm": 7.545505818070589, "learning_rate": 4.771603412041044e-05, "loss": 0.7787, "step": 840 }, { "epoch": 0.3099603796185387, "grad_norm": 9.626192885951282, "learning_rate": 4.77129435035233e-05, "loss": 0.8465, "step": 841 }, { "epoch": 0.3103289413065512, "grad_norm": 7.629689259932215, "learning_rate": 4.770985288663617e-05, "loss": 0.7654, "step": 842 }, { "epoch": 0.3106975029945637, "grad_norm": 9.642977484703975, "learning_rate": 4.7706762269749045e-05, "loss": 0.9538, "step": 843 }, { "epoch": 0.31106606468257625, "grad_norm": 8.221020751095415, "learning_rate": 4.7703671652861916e-05, "loss": 0.8128, "step": 844 }, { "epoch": 0.3114346263705888, "grad_norm": 5.202524511379442, "learning_rate": 4.770058103597479e-05, "loss": 0.6879, "step": 845 }, { "epoch": 0.3118031880586013, "grad_norm": 6.638643431371105, "learning_rate": 4.769749041908765e-05, "loss": 0.7272, "step": 846 }, { "epoch": 0.3121717497466138, "grad_norm": 4.251255074475296, "learning_rate": 4.769439980220052e-05, "loss": 0.5732, "step": 847 }, { "epoch": 0.31254031143462635, "grad_norm": 6.225557674568199, "learning_rate": 4.7691309185313394e-05, "loss": 0.9258, "step": 848 }, { "epoch": 0.3129088731226389, "grad_norm": 9.375305170814357, "learning_rate": 4.7688218568426265e-05, "loss": 1.1514, "step": 849 }, { "epoch": 0.31327743481065146, "grad_norm": 6.504737887896358, "learning_rate": 4.768512795153913e-05, "loss": 0.6432, "step": 850 }, { "epoch": 0.313645996498664, "grad_norm": 10.076665349723216, "learning_rate": 4.7682037334651994e-05, "loss": 0.9443, "step": 851 }, { "epoch": 0.3140145581866765, "grad_norm": 10.053256320540344, "learning_rate": 4.7678946717764865e-05, "loss": 1.0898, "step": 852 }, { "epoch": 0.31438311987468903, "grad_norm": 6.993129628393264, "learning_rate": 4.7675856100877736e-05, "loss": 0.655, "step": 853 }, { "epoch": 0.31475168156270156, "grad_norm": 5.968824775586913, "learning_rate": 4.767276548399061e-05, "loss": 0.6572, "step": 854 }, { "epoch": 0.3151202432507141, "grad_norm": 5.753416621967136, "learning_rate": 4.766967486710348e-05, "loss": 0.6592, "step": 855 }, { "epoch": 0.3154888049387266, "grad_norm": 5.751364836214796, "learning_rate": 4.766658425021634e-05, "loss": 0.7177, "step": 856 }, { "epoch": 0.31585736662673913, "grad_norm": 7.46235517044997, "learning_rate": 4.7663493633329214e-05, "loss": 0.7428, "step": 857 }, { "epoch": 0.31622592831475166, "grad_norm": 5.147485671095245, "learning_rate": 4.7660403016442085e-05, "loss": 0.5396, "step": 858 }, { "epoch": 0.31659449000276424, "grad_norm": 8.739425809317366, "learning_rate": 4.7657312399554957e-05, "loss": 0.8434, "step": 859 }, { "epoch": 0.31696305169077676, "grad_norm": 7.146739787301784, "learning_rate": 4.765422178266782e-05, "loss": 0.8126, "step": 860 }, { "epoch": 0.3173316133787893, "grad_norm": 7.335512126578463, "learning_rate": 4.765113116578069e-05, "loss": 0.8951, "step": 861 }, { "epoch": 0.3177001750668018, "grad_norm": 7.97908432986919, "learning_rate": 4.764804054889356e-05, "loss": 0.5899, "step": 862 }, { "epoch": 0.31806873675481434, "grad_norm": 5.9434225960940825, "learning_rate": 4.7644949932006435e-05, "loss": 0.8605, "step": 863 }, { "epoch": 0.31843729844282687, "grad_norm": 6.5872684097840635, "learning_rate": 4.7641859315119306e-05, "loss": 0.8696, "step": 864 }, { "epoch": 0.3188058601308394, "grad_norm": 8.884008000026297, "learning_rate": 4.763876869823217e-05, "loss": 0.8455, "step": 865 }, { "epoch": 0.3191744218188519, "grad_norm": 8.564672932256283, "learning_rate": 4.7635678081345035e-05, "loss": 0.6255, "step": 866 }, { "epoch": 0.31954298350686444, "grad_norm": 7.675788704671953, "learning_rate": 4.7632587464457906e-05, "loss": 0.882, "step": 867 }, { "epoch": 0.31991154519487697, "grad_norm": 6.703420352431042, "learning_rate": 4.762949684757078e-05, "loss": 0.7644, "step": 868 }, { "epoch": 0.32028010688288955, "grad_norm": 6.332476440819591, "learning_rate": 4.762640623068365e-05, "loss": 0.6012, "step": 869 }, { "epoch": 0.3206486685709021, "grad_norm": 7.317985107690296, "learning_rate": 4.762331561379651e-05, "loss": 0.8256, "step": 870 }, { "epoch": 0.3210172302589146, "grad_norm": 8.417056940176874, "learning_rate": 4.7620224996909384e-05, "loss": 0.8931, "step": 871 }, { "epoch": 0.3213857919469271, "grad_norm": 8.817910123177128, "learning_rate": 4.7617134380022255e-05, "loss": 1.025, "step": 872 }, { "epoch": 0.32175435363493965, "grad_norm": 7.392493239687685, "learning_rate": 4.7614043763135126e-05, "loss": 0.7568, "step": 873 }, { "epoch": 0.3221229153229522, "grad_norm": 6.8345523150152045, "learning_rate": 4.7610953146248e-05, "loss": 0.6907, "step": 874 }, { "epoch": 0.3224914770109647, "grad_norm": 8.05968242767424, "learning_rate": 4.760786252936086e-05, "loss": 0.8274, "step": 875 }, { "epoch": 0.3228600386989772, "grad_norm": 6.227161029302461, "learning_rate": 4.760477191247373e-05, "loss": 0.7291, "step": 876 }, { "epoch": 0.32322860038698975, "grad_norm": 5.276942375500284, "learning_rate": 4.7601681295586604e-05, "loss": 0.8685, "step": 877 }, { "epoch": 0.3235971620750023, "grad_norm": 5.388417375174462, "learning_rate": 4.7598590678699475e-05, "loss": 0.5822, "step": 878 }, { "epoch": 0.32396572376301486, "grad_norm": 8.310553989788376, "learning_rate": 4.759550006181234e-05, "loss": 0.6595, "step": 879 }, { "epoch": 0.3243342854510274, "grad_norm": 6.2474994997999, "learning_rate": 4.7592409444925204e-05, "loss": 0.6302, "step": 880 }, { "epoch": 0.3247028471390399, "grad_norm": 7.8477699951313795, "learning_rate": 4.7589318828038075e-05, "loss": 0.9355, "step": 881 }, { "epoch": 0.32507140882705243, "grad_norm": 8.558318739163276, "learning_rate": 4.7586228211150947e-05, "loss": 1.0311, "step": 882 }, { "epoch": 0.32543997051506496, "grad_norm": 8.676708517111656, "learning_rate": 4.758313759426382e-05, "loss": 0.9032, "step": 883 }, { "epoch": 0.3258085322030775, "grad_norm": 9.022231724091503, "learning_rate": 4.758004697737668e-05, "loss": 0.8986, "step": 884 }, { "epoch": 0.32617709389109, "grad_norm": 7.286550794391545, "learning_rate": 4.757695636048955e-05, "loss": 0.9291, "step": 885 }, { "epoch": 0.32654565557910253, "grad_norm": 8.628556112746809, "learning_rate": 4.7573865743602424e-05, "loss": 0.8711, "step": 886 }, { "epoch": 0.32691421726711506, "grad_norm": 7.088406739269295, "learning_rate": 4.7570775126715296e-05, "loss": 0.541, "step": 887 }, { "epoch": 0.32728277895512764, "grad_norm": 7.334993636863772, "learning_rate": 4.756768450982817e-05, "loss": 0.7647, "step": 888 }, { "epoch": 0.32765134064314017, "grad_norm": 6.931243549753255, "learning_rate": 4.756459389294103e-05, "loss": 0.7264, "step": 889 }, { "epoch": 0.3280199023311527, "grad_norm": 5.709476477144503, "learning_rate": 4.75615032760539e-05, "loss": 0.6947, "step": 890 }, { "epoch": 0.3283884640191652, "grad_norm": 8.617429645141575, "learning_rate": 4.7558412659166774e-05, "loss": 0.8252, "step": 891 }, { "epoch": 0.32875702570717774, "grad_norm": 12.039563762074383, "learning_rate": 4.7555322042279645e-05, "loss": 0.8619, "step": 892 }, { "epoch": 0.32912558739519027, "grad_norm": 6.022961232792284, "learning_rate": 4.7552231425392516e-05, "loss": 0.7053, "step": 893 }, { "epoch": 0.3294941490832028, "grad_norm": 8.999580373518205, "learning_rate": 4.754914080850538e-05, "loss": 0.9816, "step": 894 }, { "epoch": 0.3298627107712153, "grad_norm": 6.2188150220375835, "learning_rate": 4.7546050191618245e-05, "loss": 0.6897, "step": 895 }, { "epoch": 0.33023127245922784, "grad_norm": 9.42140695095732, "learning_rate": 4.7542959574731116e-05, "loss": 0.9843, "step": 896 }, { "epoch": 0.33059983414724037, "grad_norm": 5.428026738482562, "learning_rate": 4.753986895784399e-05, "loss": 0.6462, "step": 897 }, { "epoch": 0.33096839583525295, "grad_norm": 7.527214013542066, "learning_rate": 4.753677834095686e-05, "loss": 0.9556, "step": 898 }, { "epoch": 0.3313369575232655, "grad_norm": 6.895480082550283, "learning_rate": 4.753368772406972e-05, "loss": 0.8068, "step": 899 }, { "epoch": 0.331705519211278, "grad_norm": 8.3030580517213, "learning_rate": 4.7530597107182594e-05, "loss": 0.976, "step": 900 }, { "epoch": 0.3320740808992905, "grad_norm": 12.976690669520556, "learning_rate": 4.7527506490295465e-05, "loss": 0.8187, "step": 901 }, { "epoch": 0.33244264258730305, "grad_norm": 6.7338476129413865, "learning_rate": 4.7524415873408336e-05, "loss": 0.8497, "step": 902 }, { "epoch": 0.3328112042753156, "grad_norm": 7.816769097259624, "learning_rate": 4.75213252565212e-05, "loss": 1.0908, "step": 903 }, { "epoch": 0.3331797659633281, "grad_norm": 6.684098849448534, "learning_rate": 4.751823463963407e-05, "loss": 0.6977, "step": 904 }, { "epoch": 0.3335483276513406, "grad_norm": 6.593610861517021, "learning_rate": 4.751514402274694e-05, "loss": 0.8192, "step": 905 }, { "epoch": 0.33391688933935315, "grad_norm": 7.344967485515593, "learning_rate": 4.7512053405859814e-05, "loss": 0.8587, "step": 906 }, { "epoch": 0.33428545102736573, "grad_norm": 8.275618109998737, "learning_rate": 4.7508962788972686e-05, "loss": 0.8678, "step": 907 }, { "epoch": 0.33465401271537826, "grad_norm": 5.7161128733270505, "learning_rate": 4.750587217208555e-05, "loss": 0.7369, "step": 908 }, { "epoch": 0.3350225744033908, "grad_norm": 6.854037978366236, "learning_rate": 4.750278155519842e-05, "loss": 0.5564, "step": 909 }, { "epoch": 0.3353911360914033, "grad_norm": 7.2740720806016945, "learning_rate": 4.7499690938311286e-05, "loss": 0.7301, "step": 910 }, { "epoch": 0.33575969777941583, "grad_norm": 11.513110441411254, "learning_rate": 4.749660032142416e-05, "loss": 1.1115, "step": 911 }, { "epoch": 0.33612825946742836, "grad_norm": 8.07717010343863, "learning_rate": 4.749350970453703e-05, "loss": 0.9067, "step": 912 }, { "epoch": 0.3364968211554409, "grad_norm": 5.797409978893929, "learning_rate": 4.749041908764989e-05, "loss": 0.8484, "step": 913 }, { "epoch": 0.3368653828434534, "grad_norm": 6.0108369870925955, "learning_rate": 4.7487328470762764e-05, "loss": 0.9264, "step": 914 }, { "epoch": 0.33723394453146593, "grad_norm": 32.42409630944824, "learning_rate": 4.7484237853875635e-05, "loss": 0.9045, "step": 915 }, { "epoch": 0.33760250621947846, "grad_norm": 5.779722599645888, "learning_rate": 4.7481147236988506e-05, "loss": 0.5526, "step": 916 }, { "epoch": 0.33797106790749104, "grad_norm": 6.3260808209717645, "learning_rate": 4.747805662010138e-05, "loss": 0.8284, "step": 917 }, { "epoch": 0.33833962959550357, "grad_norm": 6.038145244861776, "learning_rate": 4.747496600321424e-05, "loss": 0.5847, "step": 918 }, { "epoch": 0.3387081912835161, "grad_norm": 7.497643164030729, "learning_rate": 4.747187538632711e-05, "loss": 0.9153, "step": 919 }, { "epoch": 0.3390767529715286, "grad_norm": 5.6469023796224205, "learning_rate": 4.7468784769439984e-05, "loss": 0.5552, "step": 920 }, { "epoch": 0.33944531465954114, "grad_norm": 9.501068858199176, "learning_rate": 4.7465694152552855e-05, "loss": 0.8732, "step": 921 }, { "epoch": 0.33981387634755367, "grad_norm": 5.941221014843396, "learning_rate": 4.746260353566572e-05, "loss": 0.5885, "step": 922 }, { "epoch": 0.3401824380355662, "grad_norm": 7.906935108038662, "learning_rate": 4.745951291877859e-05, "loss": 0.8314, "step": 923 }, { "epoch": 0.3405509997235787, "grad_norm": 6.352428794801055, "learning_rate": 4.745642230189146e-05, "loss": 1.1543, "step": 924 }, { "epoch": 0.34091956141159124, "grad_norm": 6.678009429202945, "learning_rate": 4.745333168500433e-05, "loss": 0.8358, "step": 925 }, { "epoch": 0.34128812309960377, "grad_norm": 8.510181387952866, "learning_rate": 4.74502410681172e-05, "loss": 0.9978, "step": 926 }, { "epoch": 0.34165668478761635, "grad_norm": 5.7318821615603435, "learning_rate": 4.744715045123007e-05, "loss": 0.685, "step": 927 }, { "epoch": 0.3420252464756289, "grad_norm": 9.462032138238518, "learning_rate": 4.744405983434293e-05, "loss": 0.8265, "step": 928 }, { "epoch": 0.3423938081636414, "grad_norm": 6.103622186569768, "learning_rate": 4.7440969217455804e-05, "loss": 0.7605, "step": 929 }, { "epoch": 0.3427623698516539, "grad_norm": 8.372158650330785, "learning_rate": 4.7437878600568676e-05, "loss": 0.6473, "step": 930 }, { "epoch": 0.34313093153966645, "grad_norm": 4.309724661048863, "learning_rate": 4.743478798368155e-05, "loss": 0.5116, "step": 931 }, { "epoch": 0.343499493227679, "grad_norm": 9.440531868903014, "learning_rate": 4.743169736679441e-05, "loss": 0.8118, "step": 932 }, { "epoch": 0.3438680549156915, "grad_norm": 8.431083443412547, "learning_rate": 4.742860674990728e-05, "loss": 0.9585, "step": 933 }, { "epoch": 0.344236616603704, "grad_norm": 7.328539871152125, "learning_rate": 4.7425516133020154e-05, "loss": 0.8693, "step": 934 }, { "epoch": 0.34460517829171655, "grad_norm": 5.628321027545544, "learning_rate": 4.7422425516133025e-05, "loss": 0.5438, "step": 935 }, { "epoch": 0.34497373997972913, "grad_norm": 6.784751445178487, "learning_rate": 4.7419334899245896e-05, "loss": 0.8919, "step": 936 }, { "epoch": 0.34534230166774166, "grad_norm": 9.018960689375684, "learning_rate": 4.741624428235876e-05, "loss": 0.8183, "step": 937 }, { "epoch": 0.3457108633557542, "grad_norm": 11.05751813362251, "learning_rate": 4.741315366547163e-05, "loss": 1.017, "step": 938 }, { "epoch": 0.3460794250437667, "grad_norm": 8.221745514439085, "learning_rate": 4.74100630485845e-05, "loss": 0.7356, "step": 939 }, { "epoch": 0.34644798673177923, "grad_norm": 9.978389659302302, "learning_rate": 4.7406972431697374e-05, "loss": 0.7919, "step": 940 }, { "epoch": 0.34681654841979176, "grad_norm": 7.461082193459512, "learning_rate": 4.740388181481024e-05, "loss": 0.8308, "step": 941 }, { "epoch": 0.3471851101078043, "grad_norm": 9.482469144127807, "learning_rate": 4.74007911979231e-05, "loss": 0.7932, "step": 942 }, { "epoch": 0.3475536717958168, "grad_norm": 6.665856820826691, "learning_rate": 4.7397700581035974e-05, "loss": 0.6767, "step": 943 }, { "epoch": 0.34792223348382934, "grad_norm": 21.426256617903416, "learning_rate": 4.7394609964148845e-05, "loss": 0.9374, "step": 944 }, { "epoch": 0.34829079517184186, "grad_norm": 7.693521761820745, "learning_rate": 4.7391519347261716e-05, "loss": 0.597, "step": 945 }, { "epoch": 0.34865935685985444, "grad_norm": 8.288131445854942, "learning_rate": 4.738842873037459e-05, "loss": 0.9834, "step": 946 }, { "epoch": 0.34902791854786697, "grad_norm": 63.656238494107946, "learning_rate": 4.738533811348745e-05, "loss": 0.6927, "step": 947 }, { "epoch": 0.3493964802358795, "grad_norm": 5.6121448701851335, "learning_rate": 4.738224749660032e-05, "loss": 0.6894, "step": 948 }, { "epoch": 0.349765041923892, "grad_norm": 6.442258390989105, "learning_rate": 4.7379156879713194e-05, "loss": 0.7837, "step": 949 }, { "epoch": 0.35013360361190454, "grad_norm": 10.886048065320525, "learning_rate": 4.7376066262826066e-05, "loss": 0.4463, "step": 950 }, { "epoch": 0.35050216529991707, "grad_norm": 7.014736467605959, "learning_rate": 4.737297564593893e-05, "loss": 0.706, "step": 951 }, { "epoch": 0.3508707269879296, "grad_norm": 6.643807830989483, "learning_rate": 4.73698850290518e-05, "loss": 0.8131, "step": 952 }, { "epoch": 0.3512392886759421, "grad_norm": 7.095616569799642, "learning_rate": 4.736679441216467e-05, "loss": 0.7932, "step": 953 }, { "epoch": 0.35160785036395464, "grad_norm": 6.754623525026672, "learning_rate": 4.7363703795277544e-05, "loss": 0.5887, "step": 954 }, { "epoch": 0.3519764120519672, "grad_norm": 7.622977316810501, "learning_rate": 4.7360613178390415e-05, "loss": 0.677, "step": 955 }, { "epoch": 0.35234497373997975, "grad_norm": 5.371316401635134, "learning_rate": 4.735752256150328e-05, "loss": 0.6599, "step": 956 }, { "epoch": 0.3527135354279923, "grad_norm": 7.949845452875759, "learning_rate": 4.7354431944616144e-05, "loss": 0.7922, "step": 957 }, { "epoch": 0.3530820971160048, "grad_norm": 8.451343172302, "learning_rate": 4.7351341327729015e-05, "loss": 0.5983, "step": 958 }, { "epoch": 0.3534506588040173, "grad_norm": 12.582824966233114, "learning_rate": 4.7348250710841886e-05, "loss": 0.8167, "step": 959 }, { "epoch": 0.35381922049202985, "grad_norm": 11.21646103313831, "learning_rate": 4.734516009395476e-05, "loss": 0.8668, "step": 960 }, { "epoch": 0.3541877821800424, "grad_norm": 6.775241422398696, "learning_rate": 4.734206947706762e-05, "loss": 0.4363, "step": 961 }, { "epoch": 0.3545563438680549, "grad_norm": 7.817746285370348, "learning_rate": 4.733897886018049e-05, "loss": 0.607, "step": 962 }, { "epoch": 0.3549249055560674, "grad_norm": 14.749706717986008, "learning_rate": 4.7335888243293364e-05, "loss": 1.0023, "step": 963 }, { "epoch": 0.35529346724407995, "grad_norm": 12.166086958221202, "learning_rate": 4.7332797626406235e-05, "loss": 0.8199, "step": 964 }, { "epoch": 0.35566202893209253, "grad_norm": 6.92690147983412, "learning_rate": 4.7329707009519106e-05, "loss": 0.73, "step": 965 }, { "epoch": 0.35603059062010506, "grad_norm": 6.730749300413701, "learning_rate": 4.732661639263197e-05, "loss": 0.5453, "step": 966 }, { "epoch": 0.3563991523081176, "grad_norm": 6.627385123979364, "learning_rate": 4.732352577574484e-05, "loss": 0.8671, "step": 967 }, { "epoch": 0.3567677139961301, "grad_norm": 6.746436414646995, "learning_rate": 4.732043515885771e-05, "loss": 0.7993, "step": 968 }, { "epoch": 0.35713627568414263, "grad_norm": 12.651359008645079, "learning_rate": 4.7317344541970584e-05, "loss": 0.7881, "step": 969 }, { "epoch": 0.35750483737215516, "grad_norm": 9.289116707684556, "learning_rate": 4.731425392508345e-05, "loss": 0.8925, "step": 970 }, { "epoch": 0.3578733990601677, "grad_norm": 7.027418389594582, "learning_rate": 4.731116330819631e-05, "loss": 0.7142, "step": 971 }, { "epoch": 0.3582419607481802, "grad_norm": 7.786306431201907, "learning_rate": 4.7308072691309184e-05, "loss": 0.806, "step": 972 }, { "epoch": 0.35861052243619274, "grad_norm": 7.725403423947784, "learning_rate": 4.7304982074422056e-05, "loss": 0.6523, "step": 973 }, { "epoch": 0.3589790841242053, "grad_norm": 13.370491765407438, "learning_rate": 4.730189145753493e-05, "loss": 0.7629, "step": 974 }, { "epoch": 0.35934764581221784, "grad_norm": 7.211297221525923, "learning_rate": 4.729880084064779e-05, "loss": 1.0213, "step": 975 }, { "epoch": 0.35971620750023037, "grad_norm": 6.825314290282376, "learning_rate": 4.729571022376066e-05, "loss": 0.7428, "step": 976 }, { "epoch": 0.3600847691882429, "grad_norm": 6.32249731676943, "learning_rate": 4.7292619606873534e-05, "loss": 0.6787, "step": 977 }, { "epoch": 0.3604533308762554, "grad_norm": 8.550339577160992, "learning_rate": 4.7289528989986405e-05, "loss": 1.1934, "step": 978 }, { "epoch": 0.36082189256426794, "grad_norm": 8.411168584567058, "learning_rate": 4.7286438373099276e-05, "loss": 0.729, "step": 979 }, { "epoch": 0.36119045425228047, "grad_norm": 7.991983688902987, "learning_rate": 4.728334775621214e-05, "loss": 1.0611, "step": 980 }, { "epoch": 0.361559015940293, "grad_norm": 10.98928553944616, "learning_rate": 4.728025713932501e-05, "loss": 0.5258, "step": 981 }, { "epoch": 0.3619275776283055, "grad_norm": 7.066148203898155, "learning_rate": 4.727716652243788e-05, "loss": 0.7116, "step": 982 }, { "epoch": 0.36229613931631804, "grad_norm": 5.442464557159065, "learning_rate": 4.7274075905550754e-05, "loss": 0.6403, "step": 983 }, { "epoch": 0.3626647010043306, "grad_norm": 8.426921174917993, "learning_rate": 4.7270985288663625e-05, "loss": 0.6192, "step": 984 }, { "epoch": 0.36303326269234315, "grad_norm": 7.908895057636992, "learning_rate": 4.726789467177649e-05, "loss": 0.8491, "step": 985 }, { "epoch": 0.3634018243803557, "grad_norm": 6.6988386485645375, "learning_rate": 4.7264804054889354e-05, "loss": 0.6526, "step": 986 }, { "epoch": 0.3637703860683682, "grad_norm": 6.574399785831807, "learning_rate": 4.7261713438002225e-05, "loss": 0.7256, "step": 987 }, { "epoch": 0.3641389477563807, "grad_norm": 16.435923685705763, "learning_rate": 4.7258622821115096e-05, "loss": 0.8877, "step": 988 }, { "epoch": 0.36450750944439325, "grad_norm": 7.369668358803536, "learning_rate": 4.725553220422797e-05, "loss": 0.7796, "step": 989 }, { "epoch": 0.3648760711324058, "grad_norm": 7.227798405718104, "learning_rate": 4.725244158734083e-05, "loss": 0.7976, "step": 990 }, { "epoch": 0.3652446328204183, "grad_norm": 5.5672037189171375, "learning_rate": 4.72493509704537e-05, "loss": 0.663, "step": 991 }, { "epoch": 0.3656131945084308, "grad_norm": 6.967082418542055, "learning_rate": 4.7246260353566574e-05, "loss": 0.7891, "step": 992 }, { "epoch": 0.36598175619644335, "grad_norm": 9.373900488909436, "learning_rate": 4.7243169736679446e-05, "loss": 0.7077, "step": 993 }, { "epoch": 0.36635031788445593, "grad_norm": 7.918893708307781, "learning_rate": 4.724007911979231e-05, "loss": 0.9598, "step": 994 }, { "epoch": 0.36671887957246846, "grad_norm": 8.916726893521655, "learning_rate": 4.723698850290518e-05, "loss": 0.8646, "step": 995 }, { "epoch": 0.367087441260481, "grad_norm": 8.229959364428035, "learning_rate": 4.723389788601805e-05, "loss": 0.7764, "step": 996 }, { "epoch": 0.3674560029484935, "grad_norm": 5.438674449146037, "learning_rate": 4.7230807269130924e-05, "loss": 0.7961, "step": 997 }, { "epoch": 0.36782456463650604, "grad_norm": 8.451479034096312, "learning_rate": 4.7227716652243795e-05, "loss": 0.8701, "step": 998 }, { "epoch": 0.36819312632451856, "grad_norm": 8.708881221005488, "learning_rate": 4.722462603535666e-05, "loss": 0.8342, "step": 999 }, { "epoch": 0.3685616880125311, "grad_norm": 11.317797016066113, "learning_rate": 4.722153541846953e-05, "loss": 1.0828, "step": 1000 }, { "epoch": 0.3689302497005436, "grad_norm": 8.085960144895827, "learning_rate": 4.7218444801582395e-05, "loss": 0.9277, "step": 1001 }, { "epoch": 0.36929881138855614, "grad_norm": 5.8523864573431705, "learning_rate": 4.7215354184695266e-05, "loss": 0.7518, "step": 1002 }, { "epoch": 0.3696673730765687, "grad_norm": 6.259627594519526, "learning_rate": 4.721226356780814e-05, "loss": 0.8732, "step": 1003 }, { "epoch": 0.37003593476458124, "grad_norm": 6.127415394877902, "learning_rate": 4.7209172950921e-05, "loss": 0.5922, "step": 1004 }, { "epoch": 0.37040449645259377, "grad_norm": 6.359064267976682, "learning_rate": 4.720608233403387e-05, "loss": 0.7311, "step": 1005 }, { "epoch": 0.3707730581406063, "grad_norm": 5.162509517337625, "learning_rate": 4.7202991717146744e-05, "loss": 0.5654, "step": 1006 }, { "epoch": 0.3711416198286188, "grad_norm": 17.474794382787543, "learning_rate": 4.7199901100259615e-05, "loss": 0.6739, "step": 1007 }, { "epoch": 0.37151018151663134, "grad_norm": 9.183637780360208, "learning_rate": 4.7196810483372486e-05, "loss": 0.9996, "step": 1008 }, { "epoch": 0.37187874320464387, "grad_norm": 8.189319670463417, "learning_rate": 4.719371986648535e-05, "loss": 0.6918, "step": 1009 }, { "epoch": 0.3722473048926564, "grad_norm": 8.319954877877757, "learning_rate": 4.719062924959822e-05, "loss": 0.7041, "step": 1010 }, { "epoch": 0.3726158665806689, "grad_norm": 11.778007729002946, "learning_rate": 4.718753863271109e-05, "loss": 0.9557, "step": 1011 }, { "epoch": 0.37298442826868144, "grad_norm": 11.402462850525636, "learning_rate": 4.7184448015823964e-05, "loss": 1.176, "step": 1012 }, { "epoch": 0.373352989956694, "grad_norm": 8.37761180633459, "learning_rate": 4.718135739893683e-05, "loss": 0.8674, "step": 1013 }, { "epoch": 0.37372155164470655, "grad_norm": 9.881728185945915, "learning_rate": 4.71782667820497e-05, "loss": 0.9149, "step": 1014 }, { "epoch": 0.3740901133327191, "grad_norm": 7.886022694798125, "learning_rate": 4.717517616516257e-05, "loss": 0.6714, "step": 1015 }, { "epoch": 0.3744586750207316, "grad_norm": 7.931599022991734, "learning_rate": 4.7172085548275436e-05, "loss": 0.7258, "step": 1016 }, { "epoch": 0.3748272367087441, "grad_norm": 10.042701720608907, "learning_rate": 4.716899493138831e-05, "loss": 0.5695, "step": 1017 }, { "epoch": 0.37519579839675665, "grad_norm": 7.355096371017386, "learning_rate": 4.716590431450118e-05, "loss": 0.857, "step": 1018 }, { "epoch": 0.3755643600847692, "grad_norm": 10.468846198252043, "learning_rate": 4.716281369761404e-05, "loss": 0.99, "step": 1019 }, { "epoch": 0.3759329217727817, "grad_norm": 8.10579771546292, "learning_rate": 4.7159723080726914e-05, "loss": 0.8055, "step": 1020 }, { "epoch": 0.37630148346079423, "grad_norm": 8.251179004000258, "learning_rate": 4.7156632463839785e-05, "loss": 1.0882, "step": 1021 }, { "epoch": 0.3766700451488068, "grad_norm": 5.868566358656689, "learning_rate": 4.7153541846952656e-05, "loss": 0.8704, "step": 1022 }, { "epoch": 0.37703860683681933, "grad_norm": 9.24979957157223, "learning_rate": 4.715045123006552e-05, "loss": 0.7505, "step": 1023 }, { "epoch": 0.37740716852483186, "grad_norm": 9.447593293689602, "learning_rate": 4.714736061317839e-05, "loss": 0.7161, "step": 1024 }, { "epoch": 0.3777757302128444, "grad_norm": 7.923458144323333, "learning_rate": 4.714426999629126e-05, "loss": 0.7262, "step": 1025 }, { "epoch": 0.3781442919008569, "grad_norm": 7.039692739370525, "learning_rate": 4.7141179379404134e-05, "loss": 0.7075, "step": 1026 }, { "epoch": 0.37851285358886944, "grad_norm": 4.460662566095361, "learning_rate": 4.7138088762517005e-05, "loss": 0.5332, "step": 1027 }, { "epoch": 0.37888141527688196, "grad_norm": 6.917386978819827, "learning_rate": 4.713499814562987e-05, "loss": 0.7812, "step": 1028 }, { "epoch": 0.3792499769648945, "grad_norm": 10.751674787780157, "learning_rate": 4.713190752874274e-05, "loss": 0.6834, "step": 1029 }, { "epoch": 0.379618538652907, "grad_norm": 5.763596092457571, "learning_rate": 4.712881691185561e-05, "loss": 0.5908, "step": 1030 }, { "epoch": 0.37998710034091954, "grad_norm": 6.257478131184787, "learning_rate": 4.7125726294968476e-05, "loss": 0.9071, "step": 1031 }, { "epoch": 0.3803556620289321, "grad_norm": 6.88457280315088, "learning_rate": 4.712263567808135e-05, "loss": 1.0291, "step": 1032 }, { "epoch": 0.38072422371694464, "grad_norm": 6.8633793501988745, "learning_rate": 4.711954506119421e-05, "loss": 0.9098, "step": 1033 }, { "epoch": 0.38109278540495717, "grad_norm": 8.702481531980009, "learning_rate": 4.711645444430708e-05, "loss": 0.7531, "step": 1034 }, { "epoch": 0.3814613470929697, "grad_norm": 7.6950210128804635, "learning_rate": 4.7113363827419954e-05, "loss": 0.7106, "step": 1035 }, { "epoch": 0.3818299087809822, "grad_norm": 7.675712418186926, "learning_rate": 4.7110273210532825e-05, "loss": 0.812, "step": 1036 }, { "epoch": 0.38219847046899474, "grad_norm": 6.4585015387831906, "learning_rate": 4.71071825936457e-05, "loss": 0.633, "step": 1037 }, { "epoch": 0.38256703215700727, "grad_norm": 8.87170520927699, "learning_rate": 4.710409197675856e-05, "loss": 0.7896, "step": 1038 }, { "epoch": 0.3829355938450198, "grad_norm": 289.1226626580836, "learning_rate": 4.710100135987143e-05, "loss": 1.0454, "step": 1039 }, { "epoch": 0.3833041555330323, "grad_norm": 7.060628457343332, "learning_rate": 4.7097910742984303e-05, "loss": 1.0068, "step": 1040 }, { "epoch": 0.38367271722104485, "grad_norm": 7.998577468282299, "learning_rate": 4.7094820126097175e-05, "loss": 0.9068, "step": 1041 }, { "epoch": 0.3840412789090574, "grad_norm": 6.079036222665923, "learning_rate": 4.709172950921004e-05, "loss": 0.5342, "step": 1042 }, { "epoch": 0.38440984059706995, "grad_norm": 10.91769132585817, "learning_rate": 4.708863889232291e-05, "loss": 0.7915, "step": 1043 }, { "epoch": 0.3847784022850825, "grad_norm": 7.212474126124788, "learning_rate": 4.708554827543578e-05, "loss": 0.73, "step": 1044 }, { "epoch": 0.385146963973095, "grad_norm": 12.173165523490786, "learning_rate": 4.708245765854865e-05, "loss": 0.8914, "step": 1045 }, { "epoch": 0.3855155256611075, "grad_norm": 5.4608906974478675, "learning_rate": 4.707936704166152e-05, "loss": 0.9006, "step": 1046 }, { "epoch": 0.38588408734912005, "grad_norm": 6.412280412450054, "learning_rate": 4.707627642477438e-05, "loss": 0.6903, "step": 1047 }, { "epoch": 0.3862526490371326, "grad_norm": 9.517678125436586, "learning_rate": 4.707318580788725e-05, "loss": 0.6171, "step": 1048 }, { "epoch": 0.3866212107251451, "grad_norm": 9.715711505022579, "learning_rate": 4.7070095191000124e-05, "loss": 0.8149, "step": 1049 }, { "epoch": 0.38698977241315763, "grad_norm": 5.575213701061153, "learning_rate": 4.7067004574112995e-05, "loss": 0.888, "step": 1050 }, { "epoch": 0.3873583341011702, "grad_norm": 6.657157155576468, "learning_rate": 4.7063913957225866e-05, "loss": 0.743, "step": 1051 }, { "epoch": 0.38772689578918274, "grad_norm": 6.29897740859919, "learning_rate": 4.706082334033873e-05, "loss": 0.5189, "step": 1052 }, { "epoch": 0.38809545747719526, "grad_norm": 8.747295724686412, "learning_rate": 4.70577327234516e-05, "loss": 0.7997, "step": 1053 }, { "epoch": 0.3884640191652078, "grad_norm": 7.295173659058454, "learning_rate": 4.705464210656447e-05, "loss": 0.7832, "step": 1054 }, { "epoch": 0.3888325808532203, "grad_norm": 4.974931818049196, "learning_rate": 4.7051551489677344e-05, "loss": 0.5458, "step": 1055 }, { "epoch": 0.38920114254123284, "grad_norm": 7.917622826373022, "learning_rate": 4.7048460872790215e-05, "loss": 0.7011, "step": 1056 }, { "epoch": 0.38956970422924536, "grad_norm": 7.8800064400583665, "learning_rate": 4.704537025590308e-05, "loss": 0.7744, "step": 1057 }, { "epoch": 0.3899382659172579, "grad_norm": 8.189630828101224, "learning_rate": 4.704227963901595e-05, "loss": 0.9142, "step": 1058 }, { "epoch": 0.3903068276052704, "grad_norm": 13.938183139087748, "learning_rate": 4.703918902212882e-05, "loss": 0.8282, "step": 1059 }, { "epoch": 0.39067538929328294, "grad_norm": 7.308189230019797, "learning_rate": 4.7036098405241693e-05, "loss": 0.8525, "step": 1060 }, { "epoch": 0.3910439509812955, "grad_norm": 7.806481324212799, "learning_rate": 4.703300778835456e-05, "loss": 0.8451, "step": 1061 }, { "epoch": 0.39141251266930804, "grad_norm": 8.091494448472913, "learning_rate": 4.702991717146742e-05, "loss": 0.9093, "step": 1062 }, { "epoch": 0.39178107435732057, "grad_norm": 5.574968401216329, "learning_rate": 4.7026826554580293e-05, "loss": 0.7638, "step": 1063 }, { "epoch": 0.3921496360453331, "grad_norm": 11.880314651586595, "learning_rate": 4.7023735937693165e-05, "loss": 0.989, "step": 1064 }, { "epoch": 0.3925181977333456, "grad_norm": 19.099904069110366, "learning_rate": 4.7020645320806036e-05, "loss": 0.7191, "step": 1065 }, { "epoch": 0.39288675942135814, "grad_norm": 8.64906911085667, "learning_rate": 4.70175547039189e-05, "loss": 0.8128, "step": 1066 }, { "epoch": 0.39325532110937067, "grad_norm": 8.968105173356891, "learning_rate": 4.701446408703177e-05, "loss": 0.8062, "step": 1067 }, { "epoch": 0.3936238827973832, "grad_norm": 5.220553332119676, "learning_rate": 4.701137347014464e-05, "loss": 0.5872, "step": 1068 }, { "epoch": 0.3939924444853957, "grad_norm": 6.438253710428298, "learning_rate": 4.7008282853257514e-05, "loss": 1.0555, "step": 1069 }, { "epoch": 0.3943610061734083, "grad_norm": 4.760385102794132, "learning_rate": 4.7005192236370385e-05, "loss": 0.5621, "step": 1070 }, { "epoch": 0.3947295678614208, "grad_norm": 8.181442699100927, "learning_rate": 4.700210161948325e-05, "loss": 0.6884, "step": 1071 }, { "epoch": 0.39509812954943335, "grad_norm": 6.421266698683757, "learning_rate": 4.699901100259612e-05, "loss": 0.8033, "step": 1072 }, { "epoch": 0.3954666912374459, "grad_norm": 10.038265258956832, "learning_rate": 4.699592038570899e-05, "loss": 0.8876, "step": 1073 }, { "epoch": 0.3958352529254584, "grad_norm": 9.54067637095161, "learning_rate": 4.699282976882186e-05, "loss": 0.7087, "step": 1074 }, { "epoch": 0.39620381461347093, "grad_norm": 6.219971196638431, "learning_rate": 4.698973915193473e-05, "loss": 0.51, "step": 1075 }, { "epoch": 0.39657237630148345, "grad_norm": 6.386210140776625, "learning_rate": 4.69866485350476e-05, "loss": 0.7469, "step": 1076 }, { "epoch": 0.396940937989496, "grad_norm": 5.02673592260868, "learning_rate": 4.698355791816046e-05, "loss": 0.7921, "step": 1077 }, { "epoch": 0.3973094996775085, "grad_norm": 7.787819169118443, "learning_rate": 4.6980467301273334e-05, "loss": 0.7954, "step": 1078 }, { "epoch": 0.39767806136552103, "grad_norm": 6.830061369546219, "learning_rate": 4.6977376684386205e-05, "loss": 0.801, "step": 1079 }, { "epoch": 0.3980466230535336, "grad_norm": 7.900529925765117, "learning_rate": 4.6974286067499077e-05, "loss": 0.7838, "step": 1080 }, { "epoch": 0.39841518474154614, "grad_norm": 8.86499060753477, "learning_rate": 4.697119545061194e-05, "loss": 0.6561, "step": 1081 }, { "epoch": 0.39878374642955866, "grad_norm": 8.218945867140881, "learning_rate": 4.696810483372481e-05, "loss": 0.7766, "step": 1082 }, { "epoch": 0.3991523081175712, "grad_norm": 5.334900069583441, "learning_rate": 4.6965014216837683e-05, "loss": 0.5841, "step": 1083 }, { "epoch": 0.3995208698055837, "grad_norm": 5.900571969966463, "learning_rate": 4.6961923599950555e-05, "loss": 0.5697, "step": 1084 }, { "epoch": 0.39988943149359624, "grad_norm": 12.082371605495391, "learning_rate": 4.695883298306342e-05, "loss": 0.8014, "step": 1085 }, { "epoch": 0.40025799318160876, "grad_norm": 8.877714789587122, "learning_rate": 4.695574236617629e-05, "loss": 0.7348, "step": 1086 }, { "epoch": 0.4006265548696213, "grad_norm": 10.299027954454761, "learning_rate": 4.695265174928916e-05, "loss": 0.8119, "step": 1087 }, { "epoch": 0.4009951165576338, "grad_norm": 9.205733494665413, "learning_rate": 4.694956113240203e-05, "loss": 0.6809, "step": 1088 }, { "epoch": 0.40136367824564634, "grad_norm": 9.3027948589389, "learning_rate": 4.6946470515514904e-05, "loss": 1.0371, "step": 1089 }, { "epoch": 0.4017322399336589, "grad_norm": 8.551068102058078, "learning_rate": 4.694337989862777e-05, "loss": 0.7714, "step": 1090 }, { "epoch": 0.40210080162167144, "grad_norm": 10.327201582679447, "learning_rate": 4.694028928174064e-05, "loss": 0.8172, "step": 1091 }, { "epoch": 0.40246936330968397, "grad_norm": 12.26281877233632, "learning_rate": 4.6937198664853504e-05, "loss": 1.1233, "step": 1092 }, { "epoch": 0.4028379249976965, "grad_norm": 8.901314421898599, "learning_rate": 4.6934108047966375e-05, "loss": 1.0513, "step": 1093 }, { "epoch": 0.403206486685709, "grad_norm": 7.493912833308288, "learning_rate": 4.6931017431079246e-05, "loss": 0.6441, "step": 1094 }, { "epoch": 0.40357504837372155, "grad_norm": 8.09356901342047, "learning_rate": 4.692792681419211e-05, "loss": 0.9302, "step": 1095 }, { "epoch": 0.40394361006173407, "grad_norm": 8.665845489723308, "learning_rate": 4.692483619730498e-05, "loss": 0.7574, "step": 1096 }, { "epoch": 0.4043121717497466, "grad_norm": 10.061408131755217, "learning_rate": 4.692174558041785e-05, "loss": 1.139, "step": 1097 }, { "epoch": 0.4046807334377591, "grad_norm": 8.661785903275357, "learning_rate": 4.6918654963530724e-05, "loss": 0.9175, "step": 1098 }, { "epoch": 0.4050492951257717, "grad_norm": 9.004057075743255, "learning_rate": 4.6915564346643595e-05, "loss": 0.7404, "step": 1099 }, { "epoch": 0.4054178568137842, "grad_norm": 9.595909645968627, "learning_rate": 4.691247372975646e-05, "loss": 0.6201, "step": 1100 }, { "epoch": 0.40578641850179675, "grad_norm": 9.792435852573037, "learning_rate": 4.690938311286933e-05, "loss": 1.0225, "step": 1101 }, { "epoch": 0.4061549801898093, "grad_norm": 7.848147675473185, "learning_rate": 4.69062924959822e-05, "loss": 0.7788, "step": 1102 }, { "epoch": 0.4065235418778218, "grad_norm": 5.978390881477915, "learning_rate": 4.690320187909507e-05, "loss": 0.7814, "step": 1103 }, { "epoch": 0.40689210356583433, "grad_norm": 6.782226492193124, "learning_rate": 4.690011126220794e-05, "loss": 0.6625, "step": 1104 }, { "epoch": 0.40726066525384685, "grad_norm": 5.724143231511881, "learning_rate": 4.689702064532081e-05, "loss": 0.6742, "step": 1105 }, { "epoch": 0.4076292269418594, "grad_norm": 8.371162560233989, "learning_rate": 4.689393002843368e-05, "loss": 1.0769, "step": 1106 }, { "epoch": 0.4079977886298719, "grad_norm": 6.805840833571139, "learning_rate": 4.6890839411546545e-05, "loss": 0.774, "step": 1107 }, { "epoch": 0.40836635031788443, "grad_norm": 8.444582530894808, "learning_rate": 4.6887748794659416e-05, "loss": 0.9264, "step": 1108 }, { "epoch": 0.408734912005897, "grad_norm": 8.827736526989318, "learning_rate": 4.688465817777229e-05, "loss": 0.9498, "step": 1109 }, { "epoch": 0.40910347369390954, "grad_norm": 8.297127872500049, "learning_rate": 4.688156756088515e-05, "loss": 0.7117, "step": 1110 }, { "epoch": 0.40947203538192206, "grad_norm": 7.265534431139141, "learning_rate": 4.687847694399802e-05, "loss": 0.9441, "step": 1111 }, { "epoch": 0.4098405970699346, "grad_norm": 7.973469133443276, "learning_rate": 4.6875386327110894e-05, "loss": 0.9543, "step": 1112 }, { "epoch": 0.4102091587579471, "grad_norm": 7.150756326434223, "learning_rate": 4.6872295710223765e-05, "loss": 0.7728, "step": 1113 }, { "epoch": 0.41057772044595964, "grad_norm": 7.08976035161004, "learning_rate": 4.686920509333663e-05, "loss": 0.6934, "step": 1114 }, { "epoch": 0.41094628213397216, "grad_norm": 7.051576521978082, "learning_rate": 4.68661144764495e-05, "loss": 0.9007, "step": 1115 }, { "epoch": 0.4113148438219847, "grad_norm": 7.2808759208494065, "learning_rate": 4.686302385956237e-05, "loss": 0.6952, "step": 1116 }, { "epoch": 0.4116834055099972, "grad_norm": 7.5329636013256085, "learning_rate": 4.685993324267524e-05, "loss": 0.96, "step": 1117 }, { "epoch": 0.4120519671980098, "grad_norm": 8.762802784867658, "learning_rate": 4.6856842625788114e-05, "loss": 0.9365, "step": 1118 }, { "epoch": 0.4124205288860223, "grad_norm": 6.900725904234358, "learning_rate": 4.685375200890098e-05, "loss": 0.5717, "step": 1119 }, { "epoch": 0.41278909057403484, "grad_norm": 11.763324081951973, "learning_rate": 4.685066139201385e-05, "loss": 0.6597, "step": 1120 }, { "epoch": 0.41315765226204737, "grad_norm": 8.258348806644523, "learning_rate": 4.684757077512672e-05, "loss": 0.8369, "step": 1121 }, { "epoch": 0.4135262139500599, "grad_norm": 15.142914054743095, "learning_rate": 4.6844480158239585e-05, "loss": 0.8512, "step": 1122 }, { "epoch": 0.4138947756380724, "grad_norm": 5.504383074625762, "learning_rate": 4.6841389541352457e-05, "loss": 0.7349, "step": 1123 }, { "epoch": 0.41426333732608495, "grad_norm": 8.245443762131464, "learning_rate": 4.683829892446532e-05, "loss": 0.8419, "step": 1124 }, { "epoch": 0.41463189901409747, "grad_norm": 14.376590541252133, "learning_rate": 4.683520830757819e-05, "loss": 0.6654, "step": 1125 }, { "epoch": 0.41500046070211, "grad_norm": 9.386655758865839, "learning_rate": 4.683211769069106e-05, "loss": 0.7097, "step": 1126 }, { "epoch": 0.4153690223901225, "grad_norm": 5.405102734855354, "learning_rate": 4.6829027073803935e-05, "loss": 0.8528, "step": 1127 }, { "epoch": 0.4157375840781351, "grad_norm": 7.480854775450912, "learning_rate": 4.6825936456916806e-05, "loss": 0.8327, "step": 1128 }, { "epoch": 0.41610614576614763, "grad_norm": 9.037169948265227, "learning_rate": 4.682284584002967e-05, "loss": 0.7318, "step": 1129 }, { "epoch": 0.41647470745416015, "grad_norm": 5.586342280733848, "learning_rate": 4.681975522314254e-05, "loss": 0.5875, "step": 1130 }, { "epoch": 0.4168432691421727, "grad_norm": 6.701674511169736, "learning_rate": 4.681666460625541e-05, "loss": 0.8925, "step": 1131 }, { "epoch": 0.4172118308301852, "grad_norm": 7.557635295337042, "learning_rate": 4.6813573989368284e-05, "loss": 1.116, "step": 1132 }, { "epoch": 0.41758039251819773, "grad_norm": 5.535032196916151, "learning_rate": 4.681048337248115e-05, "loss": 0.6854, "step": 1133 }, { "epoch": 0.41794895420621025, "grad_norm": 6.258575659613725, "learning_rate": 4.680739275559402e-05, "loss": 0.6677, "step": 1134 }, { "epoch": 0.4183175158942228, "grad_norm": 8.403194056376485, "learning_rate": 4.680430213870689e-05, "loss": 0.7394, "step": 1135 }, { "epoch": 0.4186860775822353, "grad_norm": 5.77255238109033, "learning_rate": 4.680121152181976e-05, "loss": 0.6869, "step": 1136 }, { "epoch": 0.41905463927024783, "grad_norm": 6.303746868240937, "learning_rate": 4.6798120904932626e-05, "loss": 0.6814, "step": 1137 }, { "epoch": 0.4194232009582604, "grad_norm": 19.853835250901067, "learning_rate": 4.679503028804549e-05, "loss": 0.8195, "step": 1138 }, { "epoch": 0.41979176264627294, "grad_norm": 15.876406149276885, "learning_rate": 4.679193967115836e-05, "loss": 0.6315, "step": 1139 }, { "epoch": 0.42016032433428546, "grad_norm": 6.315192526497392, "learning_rate": 4.678884905427123e-05, "loss": 0.9877, "step": 1140 }, { "epoch": 0.420528886022298, "grad_norm": 6.592481156612658, "learning_rate": 4.6785758437384104e-05, "loss": 0.8092, "step": 1141 }, { "epoch": 0.4208974477103105, "grad_norm": 6.861275585569553, "learning_rate": 4.6782667820496975e-05, "loss": 0.824, "step": 1142 }, { "epoch": 0.42126600939832304, "grad_norm": 6.8309786701676325, "learning_rate": 4.677957720360984e-05, "loss": 0.6852, "step": 1143 }, { "epoch": 0.42163457108633556, "grad_norm": 11.735520552022953, "learning_rate": 4.677648658672271e-05, "loss": 0.7304, "step": 1144 }, { "epoch": 0.4220031327743481, "grad_norm": 5.789001546891494, "learning_rate": 4.677339596983558e-05, "loss": 0.4459, "step": 1145 }, { "epoch": 0.4223716944623606, "grad_norm": 11.243677354697748, "learning_rate": 4.677030535294845e-05, "loss": 0.7706, "step": 1146 }, { "epoch": 0.4227402561503732, "grad_norm": 5.739733611514766, "learning_rate": 4.676721473606132e-05, "loss": 0.6726, "step": 1147 }, { "epoch": 0.4231088178383857, "grad_norm": 6.59495815399753, "learning_rate": 4.676412411917419e-05, "loss": 0.7753, "step": 1148 }, { "epoch": 0.42347737952639825, "grad_norm": 6.4750465303023175, "learning_rate": 4.676103350228706e-05, "loss": 0.5995, "step": 1149 }, { "epoch": 0.42384594121441077, "grad_norm": 7.901280948751507, "learning_rate": 4.675794288539993e-05, "loss": 0.8078, "step": 1150 }, { "epoch": 0.4242145029024233, "grad_norm": 6.98725002435379, "learning_rate": 4.67548522685128e-05, "loss": 0.8267, "step": 1151 }, { "epoch": 0.4245830645904358, "grad_norm": 6.260182135616643, "learning_rate": 4.675176165162567e-05, "loss": 0.9895, "step": 1152 }, { "epoch": 0.42495162627844835, "grad_norm": 6.124426367231197, "learning_rate": 4.674867103473853e-05, "loss": 0.801, "step": 1153 }, { "epoch": 0.42532018796646087, "grad_norm": 5.44255549993503, "learning_rate": 4.67455804178514e-05, "loss": 0.8002, "step": 1154 }, { "epoch": 0.4256887496544734, "grad_norm": 7.9737662290774844, "learning_rate": 4.6742489800964274e-05, "loss": 0.7013, "step": 1155 }, { "epoch": 0.4260573113424859, "grad_norm": 7.6221400666451045, "learning_rate": 4.6739399184077145e-05, "loss": 0.7098, "step": 1156 }, { "epoch": 0.4264258730304985, "grad_norm": 5.553481087674839, "learning_rate": 4.673630856719001e-05, "loss": 0.6753, "step": 1157 }, { "epoch": 0.42679443471851103, "grad_norm": 17.700140759614207, "learning_rate": 4.673321795030288e-05, "loss": 0.733, "step": 1158 }, { "epoch": 0.42716299640652355, "grad_norm": 6.36443785289397, "learning_rate": 4.673012733341575e-05, "loss": 0.6433, "step": 1159 }, { "epoch": 0.4275315580945361, "grad_norm": 5.7540945318821395, "learning_rate": 4.672703671652862e-05, "loss": 0.8513, "step": 1160 }, { "epoch": 0.4279001197825486, "grad_norm": 7.782335075495974, "learning_rate": 4.6723946099641494e-05, "loss": 0.7156, "step": 1161 }, { "epoch": 0.42826868147056113, "grad_norm": 7.861325456133088, "learning_rate": 4.672085548275436e-05, "loss": 1.1848, "step": 1162 }, { "epoch": 0.42863724315857366, "grad_norm": 8.97069610854559, "learning_rate": 4.671776486586723e-05, "loss": 0.8214, "step": 1163 }, { "epoch": 0.4290058048465862, "grad_norm": 5.254813394617916, "learning_rate": 4.67146742489801e-05, "loss": 0.8473, "step": 1164 }, { "epoch": 0.4293743665345987, "grad_norm": 9.302350344070478, "learning_rate": 4.671158363209297e-05, "loss": 0.7737, "step": 1165 }, { "epoch": 0.4297429282226113, "grad_norm": 7.040138695824488, "learning_rate": 4.6708493015205836e-05, "loss": 0.656, "step": 1166 }, { "epoch": 0.4301114899106238, "grad_norm": 8.829815241953325, "learning_rate": 4.67054023983187e-05, "loss": 0.9148, "step": 1167 }, { "epoch": 0.43048005159863634, "grad_norm": 7.179009506208665, "learning_rate": 4.670231178143157e-05, "loss": 0.8902, "step": 1168 }, { "epoch": 0.43084861328664886, "grad_norm": 6.039374537135836, "learning_rate": 4.669922116454444e-05, "loss": 0.7799, "step": 1169 }, { "epoch": 0.4312171749746614, "grad_norm": 8.715439034465021, "learning_rate": 4.6696130547657314e-05, "loss": 0.9008, "step": 1170 }, { "epoch": 0.4315857366626739, "grad_norm": 8.275345680415477, "learning_rate": 4.6693039930770186e-05, "loss": 0.858, "step": 1171 }, { "epoch": 0.43195429835068644, "grad_norm": 7.364756566392847, "learning_rate": 4.668994931388305e-05, "loss": 0.7159, "step": 1172 }, { "epoch": 0.43232286003869896, "grad_norm": 9.058821990710216, "learning_rate": 4.668685869699592e-05, "loss": 0.9446, "step": 1173 }, { "epoch": 0.4326914217267115, "grad_norm": 8.034950680251997, "learning_rate": 4.668376808010879e-05, "loss": 0.6497, "step": 1174 }, { "epoch": 0.433059983414724, "grad_norm": 6.902107204492313, "learning_rate": 4.6680677463221664e-05, "loss": 0.8705, "step": 1175 }, { "epoch": 0.4334285451027366, "grad_norm": 7.638602958491212, "learning_rate": 4.667758684633453e-05, "loss": 0.5686, "step": 1176 }, { "epoch": 0.4337971067907491, "grad_norm": 9.272452246655353, "learning_rate": 4.66744962294474e-05, "loss": 0.5448, "step": 1177 }, { "epoch": 0.43416566847876165, "grad_norm": 8.15893510489564, "learning_rate": 4.667140561256027e-05, "loss": 1.0564, "step": 1178 }, { "epoch": 0.43453423016677417, "grad_norm": 5.788123750957341, "learning_rate": 4.666831499567314e-05, "loss": 0.7358, "step": 1179 }, { "epoch": 0.4349027918547867, "grad_norm": 5.944999375788445, "learning_rate": 4.666522437878601e-05, "loss": 0.5901, "step": 1180 }, { "epoch": 0.4352713535427992, "grad_norm": 9.62932737098889, "learning_rate": 4.666213376189888e-05, "loss": 0.9933, "step": 1181 }, { "epoch": 0.43563991523081175, "grad_norm": 7.2258244977892705, "learning_rate": 4.665904314501175e-05, "loss": 0.7982, "step": 1182 }, { "epoch": 0.4360084769188243, "grad_norm": 8.474049166563985, "learning_rate": 4.665595252812461e-05, "loss": 0.8799, "step": 1183 }, { "epoch": 0.4363770386068368, "grad_norm": 7.085910044220514, "learning_rate": 4.6652861911237484e-05, "loss": 0.8365, "step": 1184 }, { "epoch": 0.4367456002948493, "grad_norm": 7.7270122586174415, "learning_rate": 4.6649771294350355e-05, "loss": 0.7463, "step": 1185 }, { "epoch": 0.4371141619828619, "grad_norm": 10.36744617176823, "learning_rate": 4.664668067746322e-05, "loss": 0.9677, "step": 1186 }, { "epoch": 0.43748272367087443, "grad_norm": 12.180136622772919, "learning_rate": 4.664359006057609e-05, "loss": 0.8339, "step": 1187 }, { "epoch": 0.43785128535888695, "grad_norm": 6.7674318504213735, "learning_rate": 4.664049944368896e-05, "loss": 0.6805, "step": 1188 }, { "epoch": 0.4382198470468995, "grad_norm": 8.714151680695037, "learning_rate": 4.663740882680183e-05, "loss": 0.6119, "step": 1189 }, { "epoch": 0.438588408734912, "grad_norm": 7.593254669737621, "learning_rate": 4.6634318209914704e-05, "loss": 0.9066, "step": 1190 }, { "epoch": 0.43895697042292453, "grad_norm": 7.635739627129799, "learning_rate": 4.663122759302757e-05, "loss": 0.8484, "step": 1191 }, { "epoch": 0.43932553211093706, "grad_norm": 8.216056585456899, "learning_rate": 4.662813697614044e-05, "loss": 0.7281, "step": 1192 }, { "epoch": 0.4396940937989496, "grad_norm": 7.213637089471146, "learning_rate": 4.662504635925331e-05, "loss": 0.8433, "step": 1193 }, { "epoch": 0.4400626554869621, "grad_norm": 7.469601889831755, "learning_rate": 4.662195574236618e-05, "loss": 0.7205, "step": 1194 }, { "epoch": 0.4404312171749747, "grad_norm": 11.498323401263294, "learning_rate": 4.661886512547905e-05, "loss": 1.1849, "step": 1195 }, { "epoch": 0.4407997788629872, "grad_norm": 6.109833917186741, "learning_rate": 4.661577450859192e-05, "loss": 0.6454, "step": 1196 }, { "epoch": 0.44116834055099974, "grad_norm": 6.04934460581575, "learning_rate": 4.661268389170479e-05, "loss": 0.7244, "step": 1197 }, { "epoch": 0.44153690223901226, "grad_norm": 47.43828227286481, "learning_rate": 4.6609593274817654e-05, "loss": 0.7945, "step": 1198 }, { "epoch": 0.4419054639270248, "grad_norm": 9.783037601227507, "learning_rate": 4.6606502657930525e-05, "loss": 0.9579, "step": 1199 }, { "epoch": 0.4422740256150373, "grad_norm": 9.388127202645205, "learning_rate": 4.6603412041043396e-05, "loss": 0.7838, "step": 1200 }, { "epoch": 0.44264258730304984, "grad_norm": 6.504812952939875, "learning_rate": 4.660032142415626e-05, "loss": 0.6353, "step": 1201 }, { "epoch": 0.44301114899106236, "grad_norm": 8.14529138763929, "learning_rate": 4.659723080726913e-05, "loss": 0.7924, "step": 1202 }, { "epoch": 0.4433797106790749, "grad_norm": 9.35955107305166, "learning_rate": 4.6594140190382e-05, "loss": 0.5909, "step": 1203 }, { "epoch": 0.4437482723670874, "grad_norm": 6.357207657681229, "learning_rate": 4.6591049573494874e-05, "loss": 0.8997, "step": 1204 }, { "epoch": 0.4441168340551, "grad_norm": 5.48897522580864, "learning_rate": 4.658795895660774e-05, "loss": 0.5535, "step": 1205 }, { "epoch": 0.4444853957431125, "grad_norm": 6.3422098028294105, "learning_rate": 4.658486833972061e-05, "loss": 0.879, "step": 1206 }, { "epoch": 0.44485395743112505, "grad_norm": 8.083093174254017, "learning_rate": 4.658177772283348e-05, "loss": 0.7468, "step": 1207 }, { "epoch": 0.44522251911913757, "grad_norm": 5.234578689483661, "learning_rate": 4.657868710594635e-05, "loss": 0.7153, "step": 1208 }, { "epoch": 0.4455910808071501, "grad_norm": 10.0728978995484, "learning_rate": 4.657559648905922e-05, "loss": 0.9417, "step": 1209 }, { "epoch": 0.4459596424951626, "grad_norm": 5.566217787379772, "learning_rate": 4.657250587217209e-05, "loss": 0.6348, "step": 1210 }, { "epoch": 0.44632820418317515, "grad_norm": 7.122915933554486, "learning_rate": 4.656941525528496e-05, "loss": 0.7277, "step": 1211 }, { "epoch": 0.4466967658711877, "grad_norm": 9.897633556785888, "learning_rate": 4.656632463839783e-05, "loss": 0.8578, "step": 1212 }, { "epoch": 0.4470653275592002, "grad_norm": 8.320925006797621, "learning_rate": 4.6563234021510694e-05, "loss": 0.8691, "step": 1213 }, { "epoch": 0.4474338892472128, "grad_norm": 6.698335514844982, "learning_rate": 4.6560143404623566e-05, "loss": 0.8271, "step": 1214 }, { "epoch": 0.4478024509352253, "grad_norm": 6.807505605459269, "learning_rate": 4.655705278773643e-05, "loss": 0.7265, "step": 1215 }, { "epoch": 0.44817101262323783, "grad_norm": 8.626466488213214, "learning_rate": 4.65539621708493e-05, "loss": 0.9391, "step": 1216 }, { "epoch": 0.44853957431125036, "grad_norm": 4.692464818644755, "learning_rate": 4.655087155396217e-05, "loss": 0.5241, "step": 1217 }, { "epoch": 0.4489081359992629, "grad_norm": 9.04547329652811, "learning_rate": 4.6547780937075044e-05, "loss": 0.9527, "step": 1218 }, { "epoch": 0.4492766976872754, "grad_norm": 4.885064714953847, "learning_rate": 4.6544690320187915e-05, "loss": 0.5145, "step": 1219 }, { "epoch": 0.44964525937528793, "grad_norm": 5.720652779630805, "learning_rate": 4.654159970330078e-05, "loss": 0.9673, "step": 1220 }, { "epoch": 0.45001382106330046, "grad_norm": 5.070945570912223, "learning_rate": 4.653850908641365e-05, "loss": 0.4192, "step": 1221 }, { "epoch": 0.450382382751313, "grad_norm": 5.706231762960315, "learning_rate": 4.653541846952652e-05, "loss": 0.6511, "step": 1222 }, { "epoch": 0.4507509444393255, "grad_norm": 6.722968050493694, "learning_rate": 4.653232785263939e-05, "loss": 0.6754, "step": 1223 }, { "epoch": 0.4511195061273381, "grad_norm": 7.903785291234298, "learning_rate": 4.652923723575226e-05, "loss": 1.0495, "step": 1224 }, { "epoch": 0.4514880678153506, "grad_norm": 4.796700837890837, "learning_rate": 4.652614661886513e-05, "loss": 0.6268, "step": 1225 }, { "epoch": 0.45185662950336314, "grad_norm": 11.134346454079497, "learning_rate": 4.6523056001978e-05, "loss": 0.6302, "step": 1226 }, { "epoch": 0.45222519119137566, "grad_norm": 8.612182823951432, "learning_rate": 4.651996538509087e-05, "loss": 0.7143, "step": 1227 }, { "epoch": 0.4525937528793882, "grad_norm": 6.098026594294794, "learning_rate": 4.6516874768203735e-05, "loss": 0.721, "step": 1228 }, { "epoch": 0.4529623145674007, "grad_norm": 7.046292232260319, "learning_rate": 4.65137841513166e-05, "loss": 0.6966, "step": 1229 }, { "epoch": 0.45333087625541324, "grad_norm": 6.122659664291826, "learning_rate": 4.651069353442947e-05, "loss": 0.6947, "step": 1230 }, { "epoch": 0.45369943794342577, "grad_norm": 7.754187160371254, "learning_rate": 4.650760291754234e-05, "loss": 1.083, "step": 1231 }, { "epoch": 0.4540679996314383, "grad_norm": 8.777458485565187, "learning_rate": 4.650451230065521e-05, "loss": 0.892, "step": 1232 }, { "epoch": 0.4544365613194508, "grad_norm": 5.589502522778609, "learning_rate": 4.6501421683768084e-05, "loss": 0.8506, "step": 1233 }, { "epoch": 0.4548051230074634, "grad_norm": 5.629259997125232, "learning_rate": 4.649833106688095e-05, "loss": 0.5389, "step": 1234 }, { "epoch": 0.4551736846954759, "grad_norm": 11.87080196663455, "learning_rate": 4.649524044999382e-05, "loss": 0.7562, "step": 1235 }, { "epoch": 0.45554224638348845, "grad_norm": 6.890019050649319, "learning_rate": 4.649214983310669e-05, "loss": 0.7812, "step": 1236 }, { "epoch": 0.455910808071501, "grad_norm": 5.437016958945892, "learning_rate": 4.648905921621956e-05, "loss": 0.7748, "step": 1237 }, { "epoch": 0.4562793697595135, "grad_norm": 7.940212477139948, "learning_rate": 4.648596859933243e-05, "loss": 0.5763, "step": 1238 }, { "epoch": 0.456647931447526, "grad_norm": 6.446942857245583, "learning_rate": 4.64828779824453e-05, "loss": 0.6133, "step": 1239 }, { "epoch": 0.45701649313553855, "grad_norm": 6.996381641899015, "learning_rate": 4.647978736555817e-05, "loss": 0.7565, "step": 1240 }, { "epoch": 0.4573850548235511, "grad_norm": 7.431630102242878, "learning_rate": 4.647669674867104e-05, "loss": 0.7583, "step": 1241 }, { "epoch": 0.4577536165115636, "grad_norm": 8.58298909012422, "learning_rate": 4.647360613178391e-05, "loss": 0.8507, "step": 1242 }, { "epoch": 0.4581221781995762, "grad_norm": 6.767196226147399, "learning_rate": 4.6470515514896776e-05, "loss": 0.7261, "step": 1243 }, { "epoch": 0.4584907398875887, "grad_norm": 5.224079631438007, "learning_rate": 4.646742489800964e-05, "loss": 0.7174, "step": 1244 }, { "epoch": 0.45885930157560123, "grad_norm": 7.513752091299124, "learning_rate": 4.646433428112251e-05, "loss": 0.7498, "step": 1245 }, { "epoch": 0.45922786326361376, "grad_norm": 10.885693083300144, "learning_rate": 4.646124366423538e-05, "loss": 0.9254, "step": 1246 }, { "epoch": 0.4595964249516263, "grad_norm": 6.177761235813404, "learning_rate": 4.6458153047348254e-05, "loss": 0.7655, "step": 1247 }, { "epoch": 0.4599649866396388, "grad_norm": 6.8428547591544575, "learning_rate": 4.645506243046112e-05, "loss": 0.9039, "step": 1248 }, { "epoch": 0.46033354832765133, "grad_norm": 7.907583011236103, "learning_rate": 4.645197181357399e-05, "loss": 0.8074, "step": 1249 }, { "epoch": 0.46070211001566386, "grad_norm": 9.27551955749776, "learning_rate": 4.644888119668686e-05, "loss": 0.7438, "step": 1250 }, { "epoch": 0.4610706717036764, "grad_norm": 5.501189536724737, "learning_rate": 4.644579057979973e-05, "loss": 0.7237, "step": 1251 }, { "epoch": 0.4614392333916889, "grad_norm": 8.531117644348761, "learning_rate": 4.64426999629126e-05, "loss": 0.6211, "step": 1252 }, { "epoch": 0.4618077950797015, "grad_norm": 9.290593745348747, "learning_rate": 4.643960934602547e-05, "loss": 0.7976, "step": 1253 }, { "epoch": 0.462176356767714, "grad_norm": 7.936557743885033, "learning_rate": 4.643651872913834e-05, "loss": 0.9331, "step": 1254 }, { "epoch": 0.46254491845572654, "grad_norm": 9.983869895038431, "learning_rate": 4.643342811225121e-05, "loss": 0.8569, "step": 1255 }, { "epoch": 0.46291348014373906, "grad_norm": 9.732578852775264, "learning_rate": 4.643033749536408e-05, "loss": 0.8613, "step": 1256 }, { "epoch": 0.4632820418317516, "grad_norm": 8.821542415910221, "learning_rate": 4.6427246878476946e-05, "loss": 0.602, "step": 1257 }, { "epoch": 0.4636506035197641, "grad_norm": 6.20185035658383, "learning_rate": 4.642415626158981e-05, "loss": 0.799, "step": 1258 }, { "epoch": 0.46401916520777664, "grad_norm": 8.02052772897579, "learning_rate": 4.642106564470268e-05, "loss": 0.8919, "step": 1259 }, { "epoch": 0.46438772689578917, "grad_norm": 8.909827440776509, "learning_rate": 4.641797502781555e-05, "loss": 0.6457, "step": 1260 }, { "epoch": 0.4647562885838017, "grad_norm": 7.728948735731808, "learning_rate": 4.6414884410928424e-05, "loss": 0.8213, "step": 1261 }, { "epoch": 0.46512485027181427, "grad_norm": 8.546617465698372, "learning_rate": 4.6411793794041295e-05, "loss": 0.829, "step": 1262 }, { "epoch": 0.4654934119598268, "grad_norm": 6.835595973611454, "learning_rate": 4.640870317715416e-05, "loss": 0.871, "step": 1263 }, { "epoch": 0.4658619736478393, "grad_norm": 5.504231818961672, "learning_rate": 4.640561256026703e-05, "loss": 0.8347, "step": 1264 }, { "epoch": 0.46623053533585185, "grad_norm": 6.845890633576798, "learning_rate": 4.64025219433799e-05, "loss": 1.0165, "step": 1265 }, { "epoch": 0.4665990970238644, "grad_norm": 9.885833206217487, "learning_rate": 4.639943132649277e-05, "loss": 0.8471, "step": 1266 }, { "epoch": 0.4669676587118769, "grad_norm": 6.991983728263305, "learning_rate": 4.639634070960564e-05, "loss": 0.7414, "step": 1267 }, { "epoch": 0.4673362203998894, "grad_norm": 6.298612823355247, "learning_rate": 4.639325009271851e-05, "loss": 0.6724, "step": 1268 }, { "epoch": 0.46770478208790195, "grad_norm": 5.99987538526192, "learning_rate": 4.639015947583138e-05, "loss": 0.8071, "step": 1269 }, { "epoch": 0.4680733437759145, "grad_norm": 6.804884827793285, "learning_rate": 4.638706885894425e-05, "loss": 0.7222, "step": 1270 }, { "epoch": 0.468441905463927, "grad_norm": 6.470278352059129, "learning_rate": 4.638397824205712e-05, "loss": 0.6514, "step": 1271 }, { "epoch": 0.4688104671519396, "grad_norm": 6.66056032269477, "learning_rate": 4.6380887625169986e-05, "loss": 0.8141, "step": 1272 }, { "epoch": 0.4691790288399521, "grad_norm": 6.52913182421137, "learning_rate": 4.637779700828285e-05, "loss": 0.9512, "step": 1273 }, { "epoch": 0.46954759052796463, "grad_norm": 10.001910217943996, "learning_rate": 4.637470639139572e-05, "loss": 0.6906, "step": 1274 }, { "epoch": 0.46991615221597716, "grad_norm": 6.1913563434502805, "learning_rate": 4.637161577450859e-05, "loss": 0.6424, "step": 1275 }, { "epoch": 0.4702847139039897, "grad_norm": 4.735967286512441, "learning_rate": 4.6368525157621464e-05, "loss": 0.4168, "step": 1276 }, { "epoch": 0.4706532755920022, "grad_norm": 8.12008659279038, "learning_rate": 4.636543454073433e-05, "loss": 0.767, "step": 1277 }, { "epoch": 0.47102183728001473, "grad_norm": 6.340536271288062, "learning_rate": 4.63623439238472e-05, "loss": 0.854, "step": 1278 }, { "epoch": 0.47139039896802726, "grad_norm": 5.678239934384721, "learning_rate": 4.635925330696007e-05, "loss": 0.8456, "step": 1279 }, { "epoch": 0.4717589606560398, "grad_norm": 8.329414768795285, "learning_rate": 4.635616269007294e-05, "loss": 0.8698, "step": 1280 }, { "epoch": 0.4721275223440523, "grad_norm": 5.701640565588132, "learning_rate": 4.6353072073185814e-05, "loss": 0.6601, "step": 1281 }, { "epoch": 0.4724960840320649, "grad_norm": 5.242063245139269, "learning_rate": 4.634998145629868e-05, "loss": 0.543, "step": 1282 }, { "epoch": 0.4728646457200774, "grad_norm": 9.417880044817634, "learning_rate": 4.634689083941155e-05, "loss": 0.5977, "step": 1283 }, { "epoch": 0.47323320740808994, "grad_norm": 8.95555883217692, "learning_rate": 4.634380022252442e-05, "loss": 0.8657, "step": 1284 }, { "epoch": 0.47360176909610247, "grad_norm": 7.476995154670675, "learning_rate": 4.634070960563729e-05, "loss": 0.9578, "step": 1285 }, { "epoch": 0.473970330784115, "grad_norm": 6.519683378992011, "learning_rate": 4.6337618988750156e-05, "loss": 0.6942, "step": 1286 }, { "epoch": 0.4743388924721275, "grad_norm": 5.019034584499333, "learning_rate": 4.633452837186303e-05, "loss": 0.351, "step": 1287 }, { "epoch": 0.47470745416014004, "grad_norm": 7.386964306716337, "learning_rate": 4.633143775497589e-05, "loss": 0.8115, "step": 1288 }, { "epoch": 0.47507601584815257, "grad_norm": 10.100056004132902, "learning_rate": 4.632834713808876e-05, "loss": 0.735, "step": 1289 }, { "epoch": 0.4754445775361651, "grad_norm": 8.793175253017163, "learning_rate": 4.6325256521201634e-05, "loss": 0.7169, "step": 1290 }, { "epoch": 0.4758131392241777, "grad_norm": 10.202293841170329, "learning_rate": 4.6322165904314505e-05, "loss": 0.9969, "step": 1291 }, { "epoch": 0.4761817009121902, "grad_norm": 7.049216953375866, "learning_rate": 4.631907528742737e-05, "loss": 0.779, "step": 1292 }, { "epoch": 0.4765502626002027, "grad_norm": 8.787209874189125, "learning_rate": 4.631598467054024e-05, "loss": 0.7164, "step": 1293 }, { "epoch": 0.47691882428821525, "grad_norm": 7.861436092038097, "learning_rate": 4.631289405365311e-05, "loss": 0.7458, "step": 1294 }, { "epoch": 0.4772873859762278, "grad_norm": 6.063892637142821, "learning_rate": 4.630980343676598e-05, "loss": 0.8842, "step": 1295 }, { "epoch": 0.4776559476642403, "grad_norm": 11.219340954360701, "learning_rate": 4.630671281987885e-05, "loss": 0.7319, "step": 1296 }, { "epoch": 0.4780245093522528, "grad_norm": 11.942341566657962, "learning_rate": 4.630362220299172e-05, "loss": 0.8831, "step": 1297 }, { "epoch": 0.47839307104026535, "grad_norm": 7.308780082639422, "learning_rate": 4.630053158610459e-05, "loss": 0.6861, "step": 1298 }, { "epoch": 0.4787616327282779, "grad_norm": 7.69019599099287, "learning_rate": 4.629744096921746e-05, "loss": 0.6258, "step": 1299 }, { "epoch": 0.4791301944162904, "grad_norm": 8.8867663975371, "learning_rate": 4.629435035233033e-05, "loss": 0.8191, "step": 1300 }, { "epoch": 0.479498756104303, "grad_norm": 8.135590678823473, "learning_rate": 4.62912597354432e-05, "loss": 0.7372, "step": 1301 }, { "epoch": 0.4798673177923155, "grad_norm": 7.638159231151815, "learning_rate": 4.628816911855607e-05, "loss": 0.6716, "step": 1302 }, { "epoch": 0.48023587948032803, "grad_norm": 9.499846607776837, "learning_rate": 4.628507850166894e-05, "loss": 0.8286, "step": 1303 }, { "epoch": 0.48060444116834056, "grad_norm": 5.528527403353789, "learning_rate": 4.6281987884781804e-05, "loss": 0.6114, "step": 1304 }, { "epoch": 0.4809730028563531, "grad_norm": 9.517274510022347, "learning_rate": 4.6278897267894675e-05, "loss": 0.6424, "step": 1305 }, { "epoch": 0.4813415645443656, "grad_norm": 8.801898612768333, "learning_rate": 4.627580665100754e-05, "loss": 1.0997, "step": 1306 }, { "epoch": 0.48171012623237813, "grad_norm": 7.523116028632912, "learning_rate": 4.627271603412041e-05, "loss": 0.6958, "step": 1307 }, { "epoch": 0.48207868792039066, "grad_norm": 12.978680074437824, "learning_rate": 4.626962541723328e-05, "loss": 0.8143, "step": 1308 }, { "epoch": 0.4824472496084032, "grad_norm": 6.4116872647956695, "learning_rate": 4.626653480034615e-05, "loss": 0.7557, "step": 1309 }, { "epoch": 0.48281581129641576, "grad_norm": 4.830206462483239, "learning_rate": 4.626344418345902e-05, "loss": 0.8238, "step": 1310 }, { "epoch": 0.4831843729844283, "grad_norm": 8.797900937899104, "learning_rate": 4.626035356657189e-05, "loss": 1.026, "step": 1311 }, { "epoch": 0.4835529346724408, "grad_norm": 8.763167582506998, "learning_rate": 4.625726294968476e-05, "loss": 0.7782, "step": 1312 }, { "epoch": 0.48392149636045334, "grad_norm": 4.744129769825992, "learning_rate": 4.625417233279763e-05, "loss": 0.7012, "step": 1313 }, { "epoch": 0.48429005804846587, "grad_norm": 8.29277187119103, "learning_rate": 4.62510817159105e-05, "loss": 0.6731, "step": 1314 }, { "epoch": 0.4846586197364784, "grad_norm": 5.643779686124168, "learning_rate": 4.6247991099023366e-05, "loss": 0.6375, "step": 1315 }, { "epoch": 0.4850271814244909, "grad_norm": 5.985339692051576, "learning_rate": 4.624490048213624e-05, "loss": 0.913, "step": 1316 }, { "epoch": 0.48539574311250344, "grad_norm": 8.964442478083681, "learning_rate": 4.624180986524911e-05, "loss": 0.6182, "step": 1317 }, { "epoch": 0.48576430480051597, "grad_norm": 6.750990053260244, "learning_rate": 4.623871924836198e-05, "loss": 0.5943, "step": 1318 }, { "epoch": 0.4861328664885285, "grad_norm": 4.167878597753926, "learning_rate": 4.6235628631474844e-05, "loss": 0.4733, "step": 1319 }, { "epoch": 0.4865014281765411, "grad_norm": 5.704706009453675, "learning_rate": 4.623253801458771e-05, "loss": 0.7281, "step": 1320 }, { "epoch": 0.4868699898645536, "grad_norm": 7.4433985808958285, "learning_rate": 4.622944739770058e-05, "loss": 0.6401, "step": 1321 }, { "epoch": 0.4872385515525661, "grad_norm": 9.605885735916516, "learning_rate": 4.622635678081345e-05, "loss": 1.0115, "step": 1322 }, { "epoch": 0.48760711324057865, "grad_norm": 8.222584341282959, "learning_rate": 4.622326616392632e-05, "loss": 0.7515, "step": 1323 }, { "epoch": 0.4879756749285912, "grad_norm": 5.985325989222778, "learning_rate": 4.6220175547039193e-05, "loss": 0.5193, "step": 1324 }, { "epoch": 0.4883442366166037, "grad_norm": 10.193920148579029, "learning_rate": 4.621708493015206e-05, "loss": 0.5024, "step": 1325 }, { "epoch": 0.4887127983046162, "grad_norm": 7.5196408271236805, "learning_rate": 4.621399431326493e-05, "loss": 0.898, "step": 1326 }, { "epoch": 0.48908135999262875, "grad_norm": 6.446107313682554, "learning_rate": 4.62109036963778e-05, "loss": 0.7118, "step": 1327 }, { "epoch": 0.4894499216806413, "grad_norm": 7.013036305528173, "learning_rate": 4.620781307949067e-05, "loss": 0.7869, "step": 1328 }, { "epoch": 0.4898184833686538, "grad_norm": 11.463312424351054, "learning_rate": 4.6204722462603536e-05, "loss": 0.7451, "step": 1329 }, { "epoch": 0.4901870450566664, "grad_norm": 8.06068293477668, "learning_rate": 4.620163184571641e-05, "loss": 0.8309, "step": 1330 }, { "epoch": 0.4905556067446789, "grad_norm": 7.800125453624778, "learning_rate": 4.619854122882928e-05, "loss": 0.691, "step": 1331 }, { "epoch": 0.49092416843269143, "grad_norm": 6.372999887296136, "learning_rate": 4.619545061194215e-05, "loss": 0.5651, "step": 1332 }, { "epoch": 0.49129273012070396, "grad_norm": 7.788015832854539, "learning_rate": 4.619235999505502e-05, "loss": 0.7313, "step": 1333 }, { "epoch": 0.4916612918087165, "grad_norm": 10.30284304593162, "learning_rate": 4.6189269378167885e-05, "loss": 0.8674, "step": 1334 }, { "epoch": 0.492029853496729, "grad_norm": 6.326555071463902, "learning_rate": 4.618617876128075e-05, "loss": 0.682, "step": 1335 }, { "epoch": 0.49239841518474153, "grad_norm": 6.660511926903064, "learning_rate": 4.618308814439362e-05, "loss": 1.138, "step": 1336 }, { "epoch": 0.49276697687275406, "grad_norm": 7.702949692160851, "learning_rate": 4.617999752750649e-05, "loss": 0.7231, "step": 1337 }, { "epoch": 0.4931355385607666, "grad_norm": 7.076777247773142, "learning_rate": 4.617690691061936e-05, "loss": 0.6515, "step": 1338 }, { "epoch": 0.49350410024877917, "grad_norm": 6.734416350837375, "learning_rate": 4.617381629373223e-05, "loss": 0.8943, "step": 1339 }, { "epoch": 0.4938726619367917, "grad_norm": 7.196480887086627, "learning_rate": 4.61707256768451e-05, "loss": 0.7291, "step": 1340 }, { "epoch": 0.4942412236248042, "grad_norm": 6.711262918205142, "learning_rate": 4.616763505995797e-05, "loss": 0.634, "step": 1341 }, { "epoch": 0.49460978531281674, "grad_norm": 8.629793908055268, "learning_rate": 4.616454444307084e-05, "loss": 0.8746, "step": 1342 }, { "epoch": 0.49497834700082927, "grad_norm": 6.866621425936405, "learning_rate": 4.616145382618371e-05, "loss": 0.8823, "step": 1343 }, { "epoch": 0.4953469086888418, "grad_norm": 8.528876229394358, "learning_rate": 4.615836320929658e-05, "loss": 0.7079, "step": 1344 }, { "epoch": 0.4957154703768543, "grad_norm": 8.284741396232075, "learning_rate": 4.615527259240945e-05, "loss": 0.915, "step": 1345 }, { "epoch": 0.49608403206486684, "grad_norm": 6.347195295763208, "learning_rate": 4.615218197552232e-05, "loss": 0.6242, "step": 1346 }, { "epoch": 0.49645259375287937, "grad_norm": 7.527898145450777, "learning_rate": 4.614909135863519e-05, "loss": 0.6886, "step": 1347 }, { "epoch": 0.4968211554408919, "grad_norm": 7.5301289970625565, "learning_rate": 4.6146000741748055e-05, "loss": 0.6318, "step": 1348 }, { "epoch": 0.4971897171289045, "grad_norm": 4.707647538104099, "learning_rate": 4.614291012486092e-05, "loss": 0.5615, "step": 1349 }, { "epoch": 0.497558278816917, "grad_norm": 6.673797544834095, "learning_rate": 4.613981950797379e-05, "loss": 0.6491, "step": 1350 }, { "epoch": 0.4979268405049295, "grad_norm": 15.320218877199911, "learning_rate": 4.613672889108666e-05, "loss": 0.9966, "step": 1351 }, { "epoch": 0.49829540219294205, "grad_norm": 9.232084503368876, "learning_rate": 4.613363827419953e-05, "loss": 0.9665, "step": 1352 }, { "epoch": 0.4986639638809546, "grad_norm": 6.093301140319679, "learning_rate": 4.6130547657312404e-05, "loss": 0.6699, "step": 1353 }, { "epoch": 0.4990325255689671, "grad_norm": 4.82211571696418, "learning_rate": 4.612745704042527e-05, "loss": 0.5342, "step": 1354 }, { "epoch": 0.4994010872569796, "grad_norm": 19.135302193565234, "learning_rate": 4.612436642353814e-05, "loss": 0.9953, "step": 1355 }, { "epoch": 0.49976964894499215, "grad_norm": 5.791326206363757, "learning_rate": 4.612127580665101e-05, "loss": 0.7894, "step": 1356 }, { "epoch": 0.5001382106330047, "grad_norm": 7.712447709411724, "learning_rate": 4.611818518976388e-05, "loss": 0.9759, "step": 1357 }, { "epoch": 0.5005067723210173, "grad_norm": 8.366711970760088, "learning_rate": 4.6115094572876746e-05, "loss": 0.6466, "step": 1358 }, { "epoch": 0.5008753340090297, "grad_norm": 8.001917609223723, "learning_rate": 4.611200395598962e-05, "loss": 0.9287, "step": 1359 }, { "epoch": 0.5012438956970423, "grad_norm": 8.985267024331284, "learning_rate": 4.610891333910249e-05, "loss": 0.6836, "step": 1360 }, { "epoch": 0.5016124573850548, "grad_norm": 8.217685180842919, "learning_rate": 4.610582272221536e-05, "loss": 0.6163, "step": 1361 }, { "epoch": 0.5019810190730674, "grad_norm": 6.567198606268862, "learning_rate": 4.610273210532823e-05, "loss": 0.5717, "step": 1362 }, { "epoch": 0.5023495807610799, "grad_norm": 7.682569706082618, "learning_rate": 4.6099641488441095e-05, "loss": 0.8686, "step": 1363 }, { "epoch": 0.5027181424490924, "grad_norm": 8.295736978361848, "learning_rate": 4.609655087155396e-05, "loss": 0.6007, "step": 1364 }, { "epoch": 0.503086704137105, "grad_norm": 4.166800738402868, "learning_rate": 4.609346025466683e-05, "loss": 0.3317, "step": 1365 }, { "epoch": 0.5034552658251175, "grad_norm": 9.745594692257713, "learning_rate": 4.60903696377797e-05, "loss": 0.7778, "step": 1366 }, { "epoch": 0.50382382751313, "grad_norm": 8.800176133213808, "learning_rate": 4.6087279020892573e-05, "loss": 0.8163, "step": 1367 }, { "epoch": 0.5041923892011425, "grad_norm": 6.073252786939014, "learning_rate": 4.608418840400544e-05, "loss": 0.7825, "step": 1368 }, { "epoch": 0.5045609508891551, "grad_norm": 8.356422887025099, "learning_rate": 4.608109778711831e-05, "loss": 0.8017, "step": 1369 }, { "epoch": 0.5049295125771676, "grad_norm": 34.554854418260646, "learning_rate": 4.607800717023118e-05, "loss": 0.6575, "step": 1370 }, { "epoch": 0.5052980742651801, "grad_norm": 8.335827212690065, "learning_rate": 4.607491655334405e-05, "loss": 0.7121, "step": 1371 }, { "epoch": 0.5056666359531926, "grad_norm": 7.305418057551907, "learning_rate": 4.607182593645692e-05, "loss": 0.683, "step": 1372 }, { "epoch": 0.5060351976412052, "grad_norm": 6.133798935397444, "learning_rate": 4.606873531956979e-05, "loss": 0.5751, "step": 1373 }, { "epoch": 0.5064037593292178, "grad_norm": 6.251333475911422, "learning_rate": 4.606564470268266e-05, "loss": 0.5549, "step": 1374 }, { "epoch": 0.5067723210172302, "grad_norm": 9.058930634636964, "learning_rate": 4.606255408579553e-05, "loss": 0.8161, "step": 1375 }, { "epoch": 0.5071408827052428, "grad_norm": 10.049798187203344, "learning_rate": 4.60594634689084e-05, "loss": 0.7118, "step": 1376 }, { "epoch": 0.5075094443932553, "grad_norm": 7.596249769398188, "learning_rate": 4.6056372852021265e-05, "loss": 0.9179, "step": 1377 }, { "epoch": 0.5078780060812679, "grad_norm": 9.29304838527249, "learning_rate": 4.6053282235134136e-05, "loss": 0.5843, "step": 1378 }, { "epoch": 0.5082465677692803, "grad_norm": 8.468832881282625, "learning_rate": 4.6050191618247e-05, "loss": 0.6929, "step": 1379 }, { "epoch": 0.5086151294572929, "grad_norm": 7.643705343894239, "learning_rate": 4.604710100135987e-05, "loss": 0.9173, "step": 1380 }, { "epoch": 0.5089836911453054, "grad_norm": 6.829454235875082, "learning_rate": 4.604401038447274e-05, "loss": 0.7244, "step": 1381 }, { "epoch": 0.509352252833318, "grad_norm": 5.906877433914395, "learning_rate": 4.604091976758561e-05, "loss": 0.7876, "step": 1382 }, { "epoch": 0.5097208145213306, "grad_norm": 8.370417921418214, "learning_rate": 4.603782915069848e-05, "loss": 0.5713, "step": 1383 }, { "epoch": 0.510089376209343, "grad_norm": 6.128186078426177, "learning_rate": 4.603473853381135e-05, "loss": 0.8184, "step": 1384 }, { "epoch": 0.5104579378973556, "grad_norm": 6.859911921522384, "learning_rate": 4.603164791692422e-05, "loss": 0.6899, "step": 1385 }, { "epoch": 0.5108264995853681, "grad_norm": 6.514847450607535, "learning_rate": 4.602855730003709e-05, "loss": 0.6562, "step": 1386 }, { "epoch": 0.5111950612733807, "grad_norm": 7.604683747848108, "learning_rate": 4.6025466683149957e-05, "loss": 0.7614, "step": 1387 }, { "epoch": 0.5115636229613931, "grad_norm": 6.659218283017094, "learning_rate": 4.602237606626283e-05, "loss": 0.9661, "step": 1388 }, { "epoch": 0.5119321846494057, "grad_norm": 18.05510077009706, "learning_rate": 4.60192854493757e-05, "loss": 0.5388, "step": 1389 }, { "epoch": 0.5123007463374182, "grad_norm": 7.380299103007514, "learning_rate": 4.601619483248857e-05, "loss": 0.5961, "step": 1390 }, { "epoch": 0.5126693080254308, "grad_norm": 5.840956729226413, "learning_rate": 4.601310421560144e-05, "loss": 0.7102, "step": 1391 }, { "epoch": 0.5130378697134433, "grad_norm": 8.09575566365642, "learning_rate": 4.6010013598714306e-05, "loss": 0.8246, "step": 1392 }, { "epoch": 0.5134064314014558, "grad_norm": 9.509583708660003, "learning_rate": 4.600692298182718e-05, "loss": 1.0285, "step": 1393 }, { "epoch": 0.5137749930894684, "grad_norm": 7.694993251626549, "learning_rate": 4.600383236494004e-05, "loss": 0.7165, "step": 1394 }, { "epoch": 0.5141435547774809, "grad_norm": 7.943736112208798, "learning_rate": 4.600074174805291e-05, "loss": 0.7351, "step": 1395 }, { "epoch": 0.5145121164654934, "grad_norm": 5.707143543556427, "learning_rate": 4.5997651131165784e-05, "loss": 0.8181, "step": 1396 }, { "epoch": 0.5148806781535059, "grad_norm": 6.216290931835378, "learning_rate": 4.599456051427865e-05, "loss": 0.6897, "step": 1397 }, { "epoch": 0.5152492398415185, "grad_norm": 6.514954017834062, "learning_rate": 4.599146989739152e-05, "loss": 0.7415, "step": 1398 }, { "epoch": 0.515617801529531, "grad_norm": 6.50625133679291, "learning_rate": 4.598837928050439e-05, "loss": 0.6084, "step": 1399 }, { "epoch": 0.5159863632175435, "grad_norm": 5.715756492708166, "learning_rate": 4.598528866361726e-05, "loss": 0.5751, "step": 1400 }, { "epoch": 0.516354924905556, "grad_norm": 6.945562403230423, "learning_rate": 4.5982198046730126e-05, "loss": 0.8604, "step": 1401 }, { "epoch": 0.5167234865935686, "grad_norm": 6.823165524047814, "learning_rate": 4.5979107429843e-05, "loss": 0.9038, "step": 1402 }, { "epoch": 0.5170920482815812, "grad_norm": 5.0821581039605945, "learning_rate": 4.597601681295587e-05, "loss": 0.5438, "step": 1403 }, { "epoch": 0.5174606099695936, "grad_norm": 5.836583930270998, "learning_rate": 4.597292619606874e-05, "loss": 0.6386, "step": 1404 }, { "epoch": 0.5178291716576062, "grad_norm": 5.041432567027041, "learning_rate": 4.596983557918161e-05, "loss": 0.4762, "step": 1405 }, { "epoch": 0.5181977333456187, "grad_norm": 7.263566576693746, "learning_rate": 4.5966744962294475e-05, "loss": 0.6902, "step": 1406 }, { "epoch": 0.5185662950336313, "grad_norm": 7.7470830534726, "learning_rate": 4.5963654345407347e-05, "loss": 0.8432, "step": 1407 }, { "epoch": 0.5189348567216437, "grad_norm": 6.313529053460253, "learning_rate": 4.596056372852022e-05, "loss": 0.6473, "step": 1408 }, { "epoch": 0.5193034184096563, "grad_norm": 8.029351273456568, "learning_rate": 4.595747311163308e-05, "loss": 0.7577, "step": 1409 }, { "epoch": 0.5196719800976688, "grad_norm": 8.284013395069294, "learning_rate": 4.595438249474595e-05, "loss": 0.8479, "step": 1410 }, { "epoch": 0.5200405417856814, "grad_norm": 7.881345100040823, "learning_rate": 4.595129187785882e-05, "loss": 0.6731, "step": 1411 }, { "epoch": 0.520409103473694, "grad_norm": 6.586848547788533, "learning_rate": 4.594820126097169e-05, "loss": 0.7013, "step": 1412 }, { "epoch": 0.5207776651617064, "grad_norm": 5.962781228524827, "learning_rate": 4.594511064408456e-05, "loss": 0.5027, "step": 1413 }, { "epoch": 0.521146226849719, "grad_norm": 6.999174886847713, "learning_rate": 4.594202002719743e-05, "loss": 0.6923, "step": 1414 }, { "epoch": 0.5215147885377315, "grad_norm": 8.04730401238053, "learning_rate": 4.59389294103103e-05, "loss": 0.9645, "step": 1415 }, { "epoch": 0.5218833502257441, "grad_norm": 10.588288196107438, "learning_rate": 4.593583879342317e-05, "loss": 1.0159, "step": 1416 }, { "epoch": 0.5222519119137565, "grad_norm": 4.957835367452384, "learning_rate": 4.593274817653604e-05, "loss": 0.6978, "step": 1417 }, { "epoch": 0.5226204736017691, "grad_norm": 8.926679129483803, "learning_rate": 4.592965755964891e-05, "loss": 0.7879, "step": 1418 }, { "epoch": 0.5229890352897816, "grad_norm": 7.27951251734856, "learning_rate": 4.592656694276178e-05, "loss": 0.8181, "step": 1419 }, { "epoch": 0.5233575969777942, "grad_norm": 11.82806952802382, "learning_rate": 4.5923476325874645e-05, "loss": 0.7633, "step": 1420 }, { "epoch": 0.5237261586658067, "grad_norm": 5.778990927923276, "learning_rate": 4.5920385708987516e-05, "loss": 0.7018, "step": 1421 }, { "epoch": 0.5240947203538192, "grad_norm": 7.580591677317982, "learning_rate": 4.591729509210039e-05, "loss": 0.7926, "step": 1422 }, { "epoch": 0.5244632820418318, "grad_norm": 5.739979180487647, "learning_rate": 4.591420447521326e-05, "loss": 0.6214, "step": 1423 }, { "epoch": 0.5248318437298443, "grad_norm": 5.933162349566089, "learning_rate": 4.591111385832612e-05, "loss": 0.6605, "step": 1424 }, { "epoch": 0.5252004054178568, "grad_norm": 6.6623409860611895, "learning_rate": 4.5908023241438994e-05, "loss": 0.769, "step": 1425 }, { "epoch": 0.5255689671058693, "grad_norm": 6.067753982917963, "learning_rate": 4.590493262455186e-05, "loss": 0.5553, "step": 1426 }, { "epoch": 0.5259375287938819, "grad_norm": 6.871267519299285, "learning_rate": 4.590184200766473e-05, "loss": 0.5817, "step": 1427 }, { "epoch": 0.5263060904818944, "grad_norm": 7.220070986411985, "learning_rate": 4.58987513907776e-05, "loss": 0.7358, "step": 1428 }, { "epoch": 0.5266746521699069, "grad_norm": 5.3638898777819515, "learning_rate": 4.589566077389047e-05, "loss": 0.6057, "step": 1429 }, { "epoch": 0.5270432138579195, "grad_norm": 5.798411039858955, "learning_rate": 4.5892570157003337e-05, "loss": 0.613, "step": 1430 }, { "epoch": 0.527411775545932, "grad_norm": 6.898944010991601, "learning_rate": 4.588947954011621e-05, "loss": 0.6465, "step": 1431 }, { "epoch": 0.5277803372339446, "grad_norm": 8.045453644190632, "learning_rate": 4.588638892322908e-05, "loss": 0.6137, "step": 1432 }, { "epoch": 0.528148898921957, "grad_norm": 8.91584041962302, "learning_rate": 4.588329830634195e-05, "loss": 0.7246, "step": 1433 }, { "epoch": 0.5285174606099696, "grad_norm": 5.992168401615588, "learning_rate": 4.588020768945482e-05, "loss": 0.628, "step": 1434 }, { "epoch": 0.5288860222979821, "grad_norm": 9.00434897569359, "learning_rate": 4.5877117072567686e-05, "loss": 1.1307, "step": 1435 }, { "epoch": 0.5292545839859947, "grad_norm": 7.673300430046128, "learning_rate": 4.587402645568056e-05, "loss": 0.7375, "step": 1436 }, { "epoch": 0.5296231456740071, "grad_norm": 7.5301072136084155, "learning_rate": 4.587093583879343e-05, "loss": 0.6311, "step": 1437 }, { "epoch": 0.5299917073620197, "grad_norm": 10.614793969417569, "learning_rate": 4.58678452219063e-05, "loss": 0.7769, "step": 1438 }, { "epoch": 0.5303602690500322, "grad_norm": 8.430343192322127, "learning_rate": 4.5864754605019164e-05, "loss": 0.7608, "step": 1439 }, { "epoch": 0.5307288307380448, "grad_norm": 5.890851003829278, "learning_rate": 4.586166398813203e-05, "loss": 0.7326, "step": 1440 }, { "epoch": 0.5310973924260574, "grad_norm": 5.656736742858282, "learning_rate": 4.58585733712449e-05, "loss": 0.6107, "step": 1441 }, { "epoch": 0.5314659541140698, "grad_norm": 6.122068053741208, "learning_rate": 4.585548275435777e-05, "loss": 0.8315, "step": 1442 }, { "epoch": 0.5318345158020824, "grad_norm": 6.446295497956378, "learning_rate": 4.585239213747064e-05, "loss": 0.6068, "step": 1443 }, { "epoch": 0.5322030774900949, "grad_norm": 6.332016791316003, "learning_rate": 4.584930152058351e-05, "loss": 0.7271, "step": 1444 }, { "epoch": 0.5325716391781075, "grad_norm": 7.300952107250347, "learning_rate": 4.584621090369638e-05, "loss": 0.6856, "step": 1445 }, { "epoch": 0.5329402008661199, "grad_norm": 7.60116102485543, "learning_rate": 4.584312028680925e-05, "loss": 0.6995, "step": 1446 }, { "epoch": 0.5333087625541325, "grad_norm": 7.991401581516258, "learning_rate": 4.584002966992212e-05, "loss": 0.9868, "step": 1447 }, { "epoch": 0.533677324242145, "grad_norm": 7.20794900152559, "learning_rate": 4.583693905303499e-05, "loss": 0.7145, "step": 1448 }, { "epoch": 0.5340458859301576, "grad_norm": 5.7489424023134195, "learning_rate": 4.5833848436147855e-05, "loss": 0.7104, "step": 1449 }, { "epoch": 0.5344144476181701, "grad_norm": 8.94833592123571, "learning_rate": 4.5830757819260726e-05, "loss": 0.7625, "step": 1450 }, { "epoch": 0.5347830093061826, "grad_norm": 9.971748881557133, "learning_rate": 4.58276672023736e-05, "loss": 0.8142, "step": 1451 }, { "epoch": 0.5351515709941952, "grad_norm": 6.287194622655205, "learning_rate": 4.582457658548647e-05, "loss": 0.7811, "step": 1452 }, { "epoch": 0.5355201326822077, "grad_norm": 10.873940602583573, "learning_rate": 4.582148596859934e-05, "loss": 0.7439, "step": 1453 }, { "epoch": 0.5358886943702202, "grad_norm": 8.15035013956216, "learning_rate": 4.5818395351712204e-05, "loss": 0.7384, "step": 1454 }, { "epoch": 0.5362572560582327, "grad_norm": 5.955816346400448, "learning_rate": 4.581530473482507e-05, "loss": 0.8797, "step": 1455 }, { "epoch": 0.5366258177462453, "grad_norm": 7.925048192596716, "learning_rate": 4.581221411793794e-05, "loss": 1.1002, "step": 1456 }, { "epoch": 0.5369943794342578, "grad_norm": 5.22540169955889, "learning_rate": 4.580912350105081e-05, "loss": 0.4743, "step": 1457 }, { "epoch": 0.5373629411222703, "grad_norm": 7.399811726829899, "learning_rate": 4.580603288416368e-05, "loss": 0.7734, "step": 1458 }, { "epoch": 0.5377315028102829, "grad_norm": 7.046755906005836, "learning_rate": 4.580294226727655e-05, "loss": 0.7906, "step": 1459 }, { "epoch": 0.5381000644982954, "grad_norm": 6.704595942373282, "learning_rate": 4.579985165038942e-05, "loss": 0.7352, "step": 1460 }, { "epoch": 0.538468626186308, "grad_norm": 5.7596606493697555, "learning_rate": 4.579676103350229e-05, "loss": 0.6575, "step": 1461 }, { "epoch": 0.5388371878743204, "grad_norm": 6.948513327567193, "learning_rate": 4.579367041661516e-05, "loss": 0.6304, "step": 1462 }, { "epoch": 0.539205749562333, "grad_norm": 6.196039645418487, "learning_rate": 4.579057979972803e-05, "loss": 0.6654, "step": 1463 }, { "epoch": 0.5395743112503455, "grad_norm": 9.777535705267121, "learning_rate": 4.5787489182840896e-05, "loss": 0.8901, "step": 1464 }, { "epoch": 0.5399428729383581, "grad_norm": 5.443150182672972, "learning_rate": 4.578439856595377e-05, "loss": 0.7259, "step": 1465 }, { "epoch": 0.5403114346263705, "grad_norm": 6.812657065724505, "learning_rate": 4.578130794906664e-05, "loss": 0.6661, "step": 1466 }, { "epoch": 0.5406799963143831, "grad_norm": 8.370719157216085, "learning_rate": 4.577821733217951e-05, "loss": 0.6277, "step": 1467 }, { "epoch": 0.5410485580023956, "grad_norm": 6.213716703274675, "learning_rate": 4.5775126715292374e-05, "loss": 0.7778, "step": 1468 }, { "epoch": 0.5414171196904082, "grad_norm": 6.358596641651419, "learning_rate": 4.5772036098405245e-05, "loss": 0.4992, "step": 1469 }, { "epoch": 0.5417856813784208, "grad_norm": 12.290035919807405, "learning_rate": 4.576894548151811e-05, "loss": 0.6835, "step": 1470 }, { "epoch": 0.5421542430664332, "grad_norm": 12.367035740145933, "learning_rate": 4.576585486463098e-05, "loss": 0.9684, "step": 1471 }, { "epoch": 0.5425228047544458, "grad_norm": 12.809513990844486, "learning_rate": 4.576276424774385e-05, "loss": 0.9293, "step": 1472 }, { "epoch": 0.5428913664424583, "grad_norm": 8.455043610246602, "learning_rate": 4.5759673630856716e-05, "loss": 0.9118, "step": 1473 }, { "epoch": 0.5432599281304709, "grad_norm": 6.39397105919019, "learning_rate": 4.575658301396959e-05, "loss": 0.6327, "step": 1474 }, { "epoch": 0.5436284898184833, "grad_norm": 22.570879703629256, "learning_rate": 4.575349239708246e-05, "loss": 0.9059, "step": 1475 }, { "epoch": 0.5439970515064959, "grad_norm": 6.3145056833568916, "learning_rate": 4.575040178019533e-05, "loss": 0.5161, "step": 1476 }, { "epoch": 0.5443656131945084, "grad_norm": 5.914820351665845, "learning_rate": 4.57473111633082e-05, "loss": 0.668, "step": 1477 }, { "epoch": 0.544734174882521, "grad_norm": 6.791699493266464, "learning_rate": 4.5744220546421066e-05, "loss": 0.6825, "step": 1478 }, { "epoch": 0.5451027365705335, "grad_norm": 8.266302474008368, "learning_rate": 4.574112992953394e-05, "loss": 0.7315, "step": 1479 }, { "epoch": 0.545471298258546, "grad_norm": 5.715886300581204, "learning_rate": 4.573803931264681e-05, "loss": 0.6222, "step": 1480 }, { "epoch": 0.5458398599465586, "grad_norm": 4.594698023124703, "learning_rate": 4.573494869575968e-05, "loss": 0.6661, "step": 1481 }, { "epoch": 0.5462084216345711, "grad_norm": 3.590751657667377, "learning_rate": 4.5731858078872544e-05, "loss": 0.3988, "step": 1482 }, { "epoch": 0.5465769833225836, "grad_norm": 5.037200441753859, "learning_rate": 4.5728767461985415e-05, "loss": 0.6072, "step": 1483 }, { "epoch": 0.5469455450105961, "grad_norm": 5.928665173523134, "learning_rate": 4.5725676845098286e-05, "loss": 0.9837, "step": 1484 }, { "epoch": 0.5473141066986087, "grad_norm": 6.638937055695903, "learning_rate": 4.572258622821115e-05, "loss": 0.8469, "step": 1485 }, { "epoch": 0.5476826683866212, "grad_norm": 10.069699572842513, "learning_rate": 4.571949561132402e-05, "loss": 0.8128, "step": 1486 }, { "epoch": 0.5480512300746337, "grad_norm": 4.62998837727021, "learning_rate": 4.571640499443689e-05, "loss": 0.4906, "step": 1487 }, { "epoch": 0.5484197917626463, "grad_norm": 5.892763594508244, "learning_rate": 4.571331437754976e-05, "loss": 0.8655, "step": 1488 }, { "epoch": 0.5487883534506588, "grad_norm": 7.479569901529448, "learning_rate": 4.571022376066263e-05, "loss": 1.0929, "step": 1489 }, { "epoch": 0.5491569151386714, "grad_norm": 6.375262086754135, "learning_rate": 4.57071331437755e-05, "loss": 0.8838, "step": 1490 }, { "epoch": 0.5495254768266838, "grad_norm": 6.554383117373931, "learning_rate": 4.570404252688837e-05, "loss": 0.6674, "step": 1491 }, { "epoch": 0.5498940385146964, "grad_norm": 7.452809197155071, "learning_rate": 4.5700951910001235e-05, "loss": 1.0538, "step": 1492 }, { "epoch": 0.5502626002027089, "grad_norm": 7.081770810589728, "learning_rate": 4.5697861293114106e-05, "loss": 0.6055, "step": 1493 }, { "epoch": 0.5506311618907215, "grad_norm": 8.267017730313654, "learning_rate": 4.569477067622698e-05, "loss": 0.7878, "step": 1494 }, { "epoch": 0.550999723578734, "grad_norm": 7.511097136916504, "learning_rate": 4.569168005933985e-05, "loss": 0.7197, "step": 1495 }, { "epoch": 0.5513682852667465, "grad_norm": 6.828740072590544, "learning_rate": 4.568858944245272e-05, "loss": 0.6059, "step": 1496 }, { "epoch": 0.5517368469547591, "grad_norm": 7.125494688830164, "learning_rate": 4.5685498825565584e-05, "loss": 0.8496, "step": 1497 }, { "epoch": 0.5521054086427716, "grad_norm": 5.693340949262637, "learning_rate": 4.5682408208678456e-05, "loss": 0.7659, "step": 1498 }, { "epoch": 0.5524739703307842, "grad_norm": 6.119224472275048, "learning_rate": 4.567931759179133e-05, "loss": 0.5793, "step": 1499 }, { "epoch": 0.5528425320187966, "grad_norm": 11.700792230373095, "learning_rate": 4.567622697490419e-05, "loss": 1.0472, "step": 1500 }, { "epoch": 0.5532110937068092, "grad_norm": 4.119246661004803, "learning_rate": 4.567313635801706e-05, "loss": 0.4404, "step": 1501 }, { "epoch": 0.5535796553948217, "grad_norm": 9.168765943184418, "learning_rate": 4.567004574112993e-05, "loss": 0.6529, "step": 1502 }, { "epoch": 0.5539482170828343, "grad_norm": 10.449866729324567, "learning_rate": 4.56669551242428e-05, "loss": 0.8573, "step": 1503 }, { "epoch": 0.5543167787708467, "grad_norm": 7.125514497083318, "learning_rate": 4.566386450735567e-05, "loss": 0.7098, "step": 1504 }, { "epoch": 0.5546853404588593, "grad_norm": 8.51625884610647, "learning_rate": 4.566077389046854e-05, "loss": 0.8209, "step": 1505 }, { "epoch": 0.5550539021468718, "grad_norm": 5.460560623283715, "learning_rate": 4.565768327358141e-05, "loss": 0.8898, "step": 1506 }, { "epoch": 0.5554224638348844, "grad_norm": 9.223741944910893, "learning_rate": 4.5654592656694276e-05, "loss": 0.7938, "step": 1507 }, { "epoch": 0.5557910255228969, "grad_norm": 8.302157055101892, "learning_rate": 4.565150203980715e-05, "loss": 0.6324, "step": 1508 }, { "epoch": 0.5561595872109094, "grad_norm": 5.894216722473341, "learning_rate": 4.564841142292002e-05, "loss": 0.6195, "step": 1509 }, { "epoch": 0.556528148898922, "grad_norm": 7.93291479123285, "learning_rate": 4.564532080603289e-05, "loss": 0.9252, "step": 1510 }, { "epoch": 0.5568967105869345, "grad_norm": 6.752137022463124, "learning_rate": 4.5642230189145754e-05, "loss": 0.7823, "step": 1511 }, { "epoch": 0.557265272274947, "grad_norm": 8.183363473788795, "learning_rate": 4.5639139572258625e-05, "loss": 0.8005, "step": 1512 }, { "epoch": 0.5576338339629595, "grad_norm": 11.059469206560475, "learning_rate": 4.5636048955371496e-05, "loss": 0.8153, "step": 1513 }, { "epoch": 0.5580023956509721, "grad_norm": 5.709930456376013, "learning_rate": 4.563295833848437e-05, "loss": 0.6639, "step": 1514 }, { "epoch": 0.5583709573389846, "grad_norm": 5.7373797368761625, "learning_rate": 4.562986772159723e-05, "loss": 0.6972, "step": 1515 }, { "epoch": 0.5587395190269971, "grad_norm": 5.252367121791857, "learning_rate": 4.56267771047101e-05, "loss": 0.5652, "step": 1516 }, { "epoch": 0.5591080807150097, "grad_norm": 5.210703785929704, "learning_rate": 4.562368648782297e-05, "loss": 0.8199, "step": 1517 }, { "epoch": 0.5594766424030222, "grad_norm": 11.486716518312372, "learning_rate": 4.562059587093584e-05, "loss": 1.0522, "step": 1518 }, { "epoch": 0.5598452040910348, "grad_norm": 7.193437064727766, "learning_rate": 4.561750525404871e-05, "loss": 0.8076, "step": 1519 }, { "epoch": 0.5602137657790472, "grad_norm": 6.153398347113709, "learning_rate": 4.561441463716158e-05, "loss": 1.0757, "step": 1520 }, { "epoch": 0.5605823274670598, "grad_norm": 4.983428195488508, "learning_rate": 4.5611324020274446e-05, "loss": 0.6547, "step": 1521 }, { "epoch": 0.5609508891550723, "grad_norm": 7.61009084479388, "learning_rate": 4.560823340338732e-05, "loss": 0.7303, "step": 1522 }, { "epoch": 0.5613194508430849, "grad_norm": 7.83194065564811, "learning_rate": 4.560514278650019e-05, "loss": 0.734, "step": 1523 }, { "epoch": 0.5616880125310973, "grad_norm": 6.380529660807276, "learning_rate": 4.560205216961306e-05, "loss": 0.7228, "step": 1524 }, { "epoch": 0.5620565742191099, "grad_norm": 8.108460479502645, "learning_rate": 4.559896155272593e-05, "loss": 0.7094, "step": 1525 }, { "epoch": 0.5624251359071225, "grad_norm": 5.1232211119341295, "learning_rate": 4.5595870935838795e-05, "loss": 0.6108, "step": 1526 }, { "epoch": 0.562793697595135, "grad_norm": 5.0637486177817825, "learning_rate": 4.5592780318951666e-05, "loss": 0.6321, "step": 1527 }, { "epoch": 0.5631622592831476, "grad_norm": 5.836594387603969, "learning_rate": 4.558968970206454e-05, "loss": 0.8386, "step": 1528 }, { "epoch": 0.56353082097116, "grad_norm": 7.189412003668735, "learning_rate": 4.558659908517741e-05, "loss": 0.7625, "step": 1529 }, { "epoch": 0.5638993826591726, "grad_norm": 8.129829262680246, "learning_rate": 4.558350846829027e-05, "loss": 0.5878, "step": 1530 }, { "epoch": 0.5642679443471851, "grad_norm": 8.212185286671094, "learning_rate": 4.558041785140314e-05, "loss": 0.7804, "step": 1531 }, { "epoch": 0.5646365060351977, "grad_norm": 8.556402332407204, "learning_rate": 4.557732723451601e-05, "loss": 0.7353, "step": 1532 }, { "epoch": 0.5650050677232101, "grad_norm": 5.111528508562547, "learning_rate": 4.557423661762888e-05, "loss": 0.4149, "step": 1533 }, { "epoch": 0.5653736294112227, "grad_norm": 7.956695654434992, "learning_rate": 4.557114600074175e-05, "loss": 0.6239, "step": 1534 }, { "epoch": 0.5657421910992352, "grad_norm": 10.185850044924837, "learning_rate": 4.556805538385462e-05, "loss": 0.985, "step": 1535 }, { "epoch": 0.5661107527872478, "grad_norm": 10.375510329598702, "learning_rate": 4.5564964766967486e-05, "loss": 0.7566, "step": 1536 }, { "epoch": 0.5664793144752603, "grad_norm": 5.163430133369435, "learning_rate": 4.556187415008036e-05, "loss": 0.7272, "step": 1537 }, { "epoch": 0.5668478761632728, "grad_norm": 9.310635988206327, "learning_rate": 4.555878353319323e-05, "loss": 0.9389, "step": 1538 }, { "epoch": 0.5672164378512854, "grad_norm": 6.5553194998502695, "learning_rate": 4.55556929163061e-05, "loss": 0.6897, "step": 1539 }, { "epoch": 0.5675849995392979, "grad_norm": 6.666895417421511, "learning_rate": 4.5552602299418964e-05, "loss": 0.7689, "step": 1540 }, { "epoch": 0.5679535612273104, "grad_norm": 6.501646860277168, "learning_rate": 4.5549511682531836e-05, "loss": 0.7669, "step": 1541 }, { "epoch": 0.5683221229153229, "grad_norm": 6.566837875277247, "learning_rate": 4.554642106564471e-05, "loss": 0.8482, "step": 1542 }, { "epoch": 0.5686906846033355, "grad_norm": 5.222814929865501, "learning_rate": 4.554333044875758e-05, "loss": 0.614, "step": 1543 }, { "epoch": 0.569059246291348, "grad_norm": 5.175776459313821, "learning_rate": 4.554023983187045e-05, "loss": 0.7457, "step": 1544 }, { "epoch": 0.5694278079793605, "grad_norm": 11.023114973577258, "learning_rate": 4.553714921498331e-05, "loss": 0.6094, "step": 1545 }, { "epoch": 0.5697963696673731, "grad_norm": 5.666743034895367, "learning_rate": 4.553405859809618e-05, "loss": 0.6029, "step": 1546 }, { "epoch": 0.5701649313553856, "grad_norm": 5.0278909973735235, "learning_rate": 4.553096798120905e-05, "loss": 0.6269, "step": 1547 }, { "epoch": 0.5705334930433982, "grad_norm": 5.9986478553528455, "learning_rate": 4.552787736432192e-05, "loss": 0.6079, "step": 1548 }, { "epoch": 0.5709020547314106, "grad_norm": 6.265787250957981, "learning_rate": 4.552478674743479e-05, "loss": 0.7521, "step": 1549 }, { "epoch": 0.5712706164194232, "grad_norm": 7.859956935929609, "learning_rate": 4.5521696130547656e-05, "loss": 0.927, "step": 1550 }, { "epoch": 0.5716391781074357, "grad_norm": 4.038134233953927, "learning_rate": 4.551860551366053e-05, "loss": 0.5981, "step": 1551 }, { "epoch": 0.5720077397954483, "grad_norm": 6.696551035896552, "learning_rate": 4.55155148967734e-05, "loss": 0.829, "step": 1552 }, { "epoch": 0.5723763014834607, "grad_norm": 7.634581502400565, "learning_rate": 4.551242427988627e-05, "loss": 0.6895, "step": 1553 }, { "epoch": 0.5727448631714733, "grad_norm": 5.94974387763647, "learning_rate": 4.550933366299914e-05, "loss": 0.7234, "step": 1554 }, { "epoch": 0.5731134248594859, "grad_norm": 6.261624931065717, "learning_rate": 4.5506243046112005e-05, "loss": 0.5169, "step": 1555 }, { "epoch": 0.5734819865474984, "grad_norm": 6.540623996842076, "learning_rate": 4.5503152429224876e-05, "loss": 0.5249, "step": 1556 }, { "epoch": 0.573850548235511, "grad_norm": 6.7548361513721975, "learning_rate": 4.550006181233775e-05, "loss": 0.9984, "step": 1557 }, { "epoch": 0.5742191099235234, "grad_norm": 9.3845507991151, "learning_rate": 4.549697119545062e-05, "loss": 0.7589, "step": 1558 }, { "epoch": 0.574587671611536, "grad_norm": 5.547229164529675, "learning_rate": 4.549388057856348e-05, "loss": 0.637, "step": 1559 }, { "epoch": 0.5749562332995485, "grad_norm": 5.580712837436252, "learning_rate": 4.5490789961676354e-05, "loss": 0.6496, "step": 1560 }, { "epoch": 0.5753247949875611, "grad_norm": 7.360150921974242, "learning_rate": 4.548769934478922e-05, "loss": 0.6469, "step": 1561 }, { "epoch": 0.5756933566755735, "grad_norm": 7.223305854035083, "learning_rate": 4.548460872790209e-05, "loss": 0.9649, "step": 1562 }, { "epoch": 0.5760619183635861, "grad_norm": 5.41150211711815, "learning_rate": 4.548151811101496e-05, "loss": 0.806, "step": 1563 }, { "epoch": 0.5764304800515986, "grad_norm": 6.719972862915468, "learning_rate": 4.5478427494127826e-05, "loss": 0.7076, "step": 1564 }, { "epoch": 0.5767990417396112, "grad_norm": 9.064993258861428, "learning_rate": 4.54753368772407e-05, "loss": 0.8497, "step": 1565 }, { "epoch": 0.5771676034276237, "grad_norm": 5.1604519455399025, "learning_rate": 4.547224626035357e-05, "loss": 0.5962, "step": 1566 }, { "epoch": 0.5775361651156362, "grad_norm": 5.0579674292526695, "learning_rate": 4.546915564346644e-05, "loss": 0.6221, "step": 1567 }, { "epoch": 0.5779047268036488, "grad_norm": 6.8803351768642855, "learning_rate": 4.546606502657931e-05, "loss": 0.6725, "step": 1568 }, { "epoch": 0.5782732884916613, "grad_norm": 6.802767459724313, "learning_rate": 4.5462974409692175e-05, "loss": 0.7572, "step": 1569 }, { "epoch": 0.5786418501796738, "grad_norm": 6.35359397767165, "learning_rate": 4.5459883792805046e-05, "loss": 0.6237, "step": 1570 }, { "epoch": 0.5790104118676863, "grad_norm": 8.883086482966783, "learning_rate": 4.545679317591792e-05, "loss": 0.5416, "step": 1571 }, { "epoch": 0.5793789735556989, "grad_norm": 6.464100811729287, "learning_rate": 4.545370255903079e-05, "loss": 0.8735, "step": 1572 }, { "epoch": 0.5797475352437114, "grad_norm": 8.003566423819338, "learning_rate": 4.545061194214365e-05, "loss": 0.7414, "step": 1573 }, { "epoch": 0.580116096931724, "grad_norm": 8.451962882946601, "learning_rate": 4.5447521325256524e-05, "loss": 0.6378, "step": 1574 }, { "epoch": 0.5804846586197365, "grad_norm": 7.289676358295436, "learning_rate": 4.5444430708369395e-05, "loss": 0.6897, "step": 1575 }, { "epoch": 0.580853220307749, "grad_norm": 9.74868922103481, "learning_rate": 4.544134009148226e-05, "loss": 0.8987, "step": 1576 }, { "epoch": 0.5812217819957616, "grad_norm": 8.428957988012034, "learning_rate": 4.543824947459513e-05, "loss": 0.8622, "step": 1577 }, { "epoch": 0.581590343683774, "grad_norm": 5.746682578387142, "learning_rate": 4.5435158857708e-05, "loss": 0.5381, "step": 1578 }, { "epoch": 0.5819589053717866, "grad_norm": 9.061461921957513, "learning_rate": 4.5432068240820866e-05, "loss": 0.7942, "step": 1579 }, { "epoch": 0.5823274670597991, "grad_norm": 9.715578794568884, "learning_rate": 4.542897762393374e-05, "loss": 0.9199, "step": 1580 }, { "epoch": 0.5826960287478117, "grad_norm": 7.627430512828507, "learning_rate": 4.542588700704661e-05, "loss": 0.9469, "step": 1581 }, { "epoch": 0.5830645904358241, "grad_norm": 6.118486950902108, "learning_rate": 4.542279639015948e-05, "loss": 0.5835, "step": 1582 }, { "epoch": 0.5834331521238367, "grad_norm": 5.160114851272239, "learning_rate": 4.5419705773272344e-05, "loss": 0.5727, "step": 1583 }, { "epoch": 0.5838017138118493, "grad_norm": 6.567738213548042, "learning_rate": 4.5416615156385216e-05, "loss": 0.9027, "step": 1584 }, { "epoch": 0.5841702754998618, "grad_norm": 5.409658779587883, "learning_rate": 4.541352453949809e-05, "loss": 0.5105, "step": 1585 }, { "epoch": 0.5845388371878744, "grad_norm": 8.80121208859982, "learning_rate": 4.541043392261096e-05, "loss": 0.7117, "step": 1586 }, { "epoch": 0.5849073988758868, "grad_norm": 6.595511973929104, "learning_rate": 4.540734330572383e-05, "loss": 0.6278, "step": 1587 }, { "epoch": 0.5852759605638994, "grad_norm": 6.347747595496596, "learning_rate": 4.5404252688836694e-05, "loss": 0.6234, "step": 1588 }, { "epoch": 0.5856445222519119, "grad_norm": 7.851715784806583, "learning_rate": 4.5401162071949565e-05, "loss": 0.46, "step": 1589 }, { "epoch": 0.5860130839399245, "grad_norm": 9.52340585579751, "learning_rate": 4.5398071455062436e-05, "loss": 0.8081, "step": 1590 }, { "epoch": 0.5863816456279369, "grad_norm": 8.02670123645186, "learning_rate": 4.53949808381753e-05, "loss": 0.6881, "step": 1591 }, { "epoch": 0.5867502073159495, "grad_norm": 4.147373213804729, "learning_rate": 4.539189022128817e-05, "loss": 0.4815, "step": 1592 }, { "epoch": 0.5871187690039621, "grad_norm": 6.334654051285113, "learning_rate": 4.5388799604401036e-05, "loss": 0.7298, "step": 1593 }, { "epoch": 0.5874873306919746, "grad_norm": 10.045481441309585, "learning_rate": 4.538570898751391e-05, "loss": 0.8279, "step": 1594 }, { "epoch": 0.5878558923799871, "grad_norm": 5.336912186213073, "learning_rate": 4.538261837062678e-05, "loss": 0.6891, "step": 1595 }, { "epoch": 0.5882244540679996, "grad_norm": 6.209651252426163, "learning_rate": 4.537952775373965e-05, "loss": 0.6914, "step": 1596 }, { "epoch": 0.5885930157560122, "grad_norm": 6.6492915070077965, "learning_rate": 4.537643713685252e-05, "loss": 0.753, "step": 1597 }, { "epoch": 0.5889615774440247, "grad_norm": 5.5370738948378255, "learning_rate": 4.5373346519965385e-05, "loss": 0.7909, "step": 1598 }, { "epoch": 0.5893301391320372, "grad_norm": 7.589822640539353, "learning_rate": 4.5370255903078256e-05, "loss": 0.8184, "step": 1599 }, { "epoch": 0.5896987008200497, "grad_norm": 9.383634126312892, "learning_rate": 4.536716528619113e-05, "loss": 1.0953, "step": 1600 }, { "epoch": 0.5896987008200497, "eval_bleu": 0.059549847484403624, "eval_bleu_1gram": 0.41239142447914534, "eval_bleu_2gram": 0.1942260221663101, "eval_bleu_3gram": 0.09034895874036726, "eval_bleu_4gram": 0.04528327937965561, "eval_rag_val_loss": 0.8220645315783643, "eval_rouge1": 0.39484843988509716, "eval_rouge2": 0.18306406834930353, "eval_rougeL": 0.39134014164378605, "step": 1600 }, { "epoch": 0.5900672625080623, "grad_norm": 10.092851628340686, "learning_rate": 4.5364074669304e-05, "loss": 0.7886, "step": 1601 }, { "epoch": 0.5904358241960748, "grad_norm": 5.713845740137709, "learning_rate": 4.536098405241686e-05, "loss": 0.8102, "step": 1602 }, { "epoch": 0.5908043858840873, "grad_norm": 10.71918844766536, "learning_rate": 4.5357893435529734e-05, "loss": 0.8404, "step": 1603 }, { "epoch": 0.5911729475720999, "grad_norm": 7.435120273994869, "learning_rate": 4.5354802818642605e-05, "loss": 0.8279, "step": 1604 }, { "epoch": 0.5915415092601124, "grad_norm": 5.873614858114621, "learning_rate": 4.535171220175548e-05, "loss": 0.4246, "step": 1605 }, { "epoch": 0.591910070948125, "grad_norm": 7.016901048866932, "learning_rate": 4.534862158486834e-05, "loss": 0.8368, "step": 1606 }, { "epoch": 0.5922786326361374, "grad_norm": 8.995714650881169, "learning_rate": 4.534553096798121e-05, "loss": 0.7406, "step": 1607 }, { "epoch": 0.59264719432415, "grad_norm": 10.353544547863299, "learning_rate": 4.534244035109408e-05, "loss": 0.8985, "step": 1608 }, { "epoch": 0.5930157560121625, "grad_norm": 8.155270572479589, "learning_rate": 4.533934973420695e-05, "loss": 0.7286, "step": 1609 }, { "epoch": 0.5933843177001751, "grad_norm": 8.573079722179862, "learning_rate": 4.533625911731982e-05, "loss": 0.663, "step": 1610 }, { "epoch": 0.5937528793881875, "grad_norm": 5.709282714899687, "learning_rate": 4.533316850043269e-05, "loss": 0.5037, "step": 1611 }, { "epoch": 0.5941214410762001, "grad_norm": 6.468524431359272, "learning_rate": 4.5330077883545555e-05, "loss": 0.6907, "step": 1612 }, { "epoch": 0.5944900027642127, "grad_norm": 9.576065314966472, "learning_rate": 4.5326987266658426e-05, "loss": 0.5048, "step": 1613 }, { "epoch": 0.5948585644522252, "grad_norm": 6.34286447743074, "learning_rate": 4.53238966497713e-05, "loss": 0.7207, "step": 1614 }, { "epoch": 0.5952271261402378, "grad_norm": 5.080197143333782, "learning_rate": 4.532080603288417e-05, "loss": 0.591, "step": 1615 }, { "epoch": 0.5955956878282502, "grad_norm": 4.030040470800394, "learning_rate": 4.531771541599704e-05, "loss": 0.5626, "step": 1616 }, { "epoch": 0.5959642495162628, "grad_norm": 5.3212509783976225, "learning_rate": 4.5314624799109904e-05, "loss": 0.5313, "step": 1617 }, { "epoch": 0.5963328112042753, "grad_norm": 5.1919942484090535, "learning_rate": 4.5311534182222775e-05, "loss": 0.7422, "step": 1618 }, { "epoch": 0.5967013728922879, "grad_norm": 8.564420387266392, "learning_rate": 4.5308443565335646e-05, "loss": 1.0155, "step": 1619 }, { "epoch": 0.5970699345803003, "grad_norm": 7.304493229368525, "learning_rate": 4.530535294844852e-05, "loss": 0.8121, "step": 1620 }, { "epoch": 0.5974384962683129, "grad_norm": 7.994232005723134, "learning_rate": 4.530226233156138e-05, "loss": 0.7727, "step": 1621 }, { "epoch": 0.5978070579563255, "grad_norm": 5.151386718712974, "learning_rate": 4.5299171714674246e-05, "loss": 0.6591, "step": 1622 }, { "epoch": 0.598175619644338, "grad_norm": 8.476705607950956, "learning_rate": 4.529608109778712e-05, "loss": 0.7886, "step": 1623 }, { "epoch": 0.5985441813323505, "grad_norm": 6.9390477009172855, "learning_rate": 4.529299048089999e-05, "loss": 0.6535, "step": 1624 }, { "epoch": 0.598912743020363, "grad_norm": 5.326378499727867, "learning_rate": 4.528989986401286e-05, "loss": 0.5298, "step": 1625 }, { "epoch": 0.5992813047083756, "grad_norm": 9.925199369258596, "learning_rate": 4.528680924712573e-05, "loss": 0.6902, "step": 1626 }, { "epoch": 0.5996498663963881, "grad_norm": 5.971176852424461, "learning_rate": 4.5283718630238595e-05, "loss": 0.7114, "step": 1627 }, { "epoch": 0.6000184280844006, "grad_norm": 6.271735277506002, "learning_rate": 4.528062801335147e-05, "loss": 0.6125, "step": 1628 }, { "epoch": 0.6003869897724131, "grad_norm": 7.832590866073177, "learning_rate": 4.527753739646434e-05, "loss": 0.9019, "step": 1629 }, { "epoch": 0.6007555514604257, "grad_norm": 6.044881171611751, "learning_rate": 4.527444677957721e-05, "loss": 0.7289, "step": 1630 }, { "epoch": 0.6011241131484382, "grad_norm": 8.61979850161801, "learning_rate": 4.5271356162690073e-05, "loss": 0.7073, "step": 1631 }, { "epoch": 0.6014926748364507, "grad_norm": 5.967283697918619, "learning_rate": 4.5268265545802945e-05, "loss": 0.4966, "step": 1632 }, { "epoch": 0.6018612365244633, "grad_norm": 5.587530162482381, "learning_rate": 4.5265174928915816e-05, "loss": 0.7634, "step": 1633 }, { "epoch": 0.6022297982124758, "grad_norm": 8.443251620365503, "learning_rate": 4.526208431202869e-05, "loss": 0.7862, "step": 1634 }, { "epoch": 0.6025983599004884, "grad_norm": 6.71376257020764, "learning_rate": 4.525899369514156e-05, "loss": 0.7373, "step": 1635 }, { "epoch": 0.6029669215885008, "grad_norm": 12.874061328039609, "learning_rate": 4.5255903078254416e-05, "loss": 0.7469, "step": 1636 }, { "epoch": 0.6033354832765134, "grad_norm": 6.9536121669222215, "learning_rate": 4.525281246136729e-05, "loss": 0.7519, "step": 1637 }, { "epoch": 0.6037040449645259, "grad_norm": 5.40959249375462, "learning_rate": 4.524972184448016e-05, "loss": 0.6286, "step": 1638 }, { "epoch": 0.6040726066525385, "grad_norm": 7.12483027741248, "learning_rate": 4.524663122759303e-05, "loss": 0.8539, "step": 1639 }, { "epoch": 0.604441168340551, "grad_norm": 6.035669476559617, "learning_rate": 4.52435406107059e-05, "loss": 0.701, "step": 1640 }, { "epoch": 0.6048097300285635, "grad_norm": 6.553113345971248, "learning_rate": 4.5240449993818765e-05, "loss": 0.6245, "step": 1641 }, { "epoch": 0.6051782917165761, "grad_norm": 7.971518839585294, "learning_rate": 4.5237359376931636e-05, "loss": 0.5358, "step": 1642 }, { "epoch": 0.6055468534045886, "grad_norm": 8.983210692220254, "learning_rate": 4.523426876004451e-05, "loss": 0.8017, "step": 1643 }, { "epoch": 0.6059154150926012, "grad_norm": 19.396809171992373, "learning_rate": 4.523117814315738e-05, "loss": 0.7095, "step": 1644 }, { "epoch": 0.6062839767806136, "grad_norm": 7.1656086458600985, "learning_rate": 4.522808752627024e-05, "loss": 0.8772, "step": 1645 }, { "epoch": 0.6066525384686262, "grad_norm": 6.026045218697185, "learning_rate": 4.5224996909383114e-05, "loss": 0.66, "step": 1646 }, { "epoch": 0.6070211001566387, "grad_norm": 4.148733124912718, "learning_rate": 4.5221906292495985e-05, "loss": 0.5806, "step": 1647 }, { "epoch": 0.6073896618446513, "grad_norm": 7.72148744790815, "learning_rate": 4.5218815675608857e-05, "loss": 0.8941, "step": 1648 }, { "epoch": 0.6077582235326637, "grad_norm": 5.051097697695043, "learning_rate": 4.521572505872173e-05, "loss": 0.7014, "step": 1649 }, { "epoch": 0.6081267852206763, "grad_norm": 5.585145096494931, "learning_rate": 4.521263444183459e-05, "loss": 0.7653, "step": 1650 }, { "epoch": 0.6084953469086889, "grad_norm": 7.249790583249737, "learning_rate": 4.520954382494746e-05, "loss": 0.7228, "step": 1651 }, { "epoch": 0.6088639085967014, "grad_norm": 6.340930330869241, "learning_rate": 4.520645320806033e-05, "loss": 0.7888, "step": 1652 }, { "epoch": 0.609232470284714, "grad_norm": 5.468954028002099, "learning_rate": 4.52033625911732e-05, "loss": 0.7318, "step": 1653 }, { "epoch": 0.6096010319727264, "grad_norm": 6.114185049504151, "learning_rate": 4.520027197428607e-05, "loss": 0.6351, "step": 1654 }, { "epoch": 0.609969593660739, "grad_norm": 6.970112073043805, "learning_rate": 4.5197181357398935e-05, "loss": 0.7565, "step": 1655 }, { "epoch": 0.6103381553487515, "grad_norm": 9.103828848733441, "learning_rate": 4.5194090740511806e-05, "loss": 0.7549, "step": 1656 }, { "epoch": 0.610706717036764, "grad_norm": 5.297739504143231, "learning_rate": 4.519100012362468e-05, "loss": 0.784, "step": 1657 }, { "epoch": 0.6110752787247765, "grad_norm": 4.932289174558284, "learning_rate": 4.518790950673755e-05, "loss": 0.5384, "step": 1658 }, { "epoch": 0.6114438404127891, "grad_norm": 7.539116134848074, "learning_rate": 4.518481888985042e-05, "loss": 0.9107, "step": 1659 }, { "epoch": 0.6118124021008016, "grad_norm": 7.09656996279304, "learning_rate": 4.5181728272963284e-05, "loss": 0.7212, "step": 1660 }, { "epoch": 0.6121809637888141, "grad_norm": 8.196144723592802, "learning_rate": 4.5178637656076155e-05, "loss": 1.136, "step": 1661 }, { "epoch": 0.6125495254768267, "grad_norm": 7.157433824196624, "learning_rate": 4.5175547039189026e-05, "loss": 0.6452, "step": 1662 }, { "epoch": 0.6129180871648392, "grad_norm": 8.40506733229137, "learning_rate": 4.51724564223019e-05, "loss": 0.9742, "step": 1663 }, { "epoch": 0.6132866488528518, "grad_norm": 6.44541104328077, "learning_rate": 4.516936580541476e-05, "loss": 0.7312, "step": 1664 }, { "epoch": 0.6136552105408642, "grad_norm": 6.347152023242577, "learning_rate": 4.516627518852763e-05, "loss": 0.587, "step": 1665 }, { "epoch": 0.6140237722288768, "grad_norm": 11.54278301986787, "learning_rate": 4.51631845716405e-05, "loss": 0.8848, "step": 1666 }, { "epoch": 0.6143923339168893, "grad_norm": 8.18585701102107, "learning_rate": 4.516009395475337e-05, "loss": 0.6699, "step": 1667 }, { "epoch": 0.6147608956049019, "grad_norm": 5.441560128275592, "learning_rate": 4.515700333786624e-05, "loss": 0.6797, "step": 1668 }, { "epoch": 0.6151294572929144, "grad_norm": 8.508083314675117, "learning_rate": 4.515391272097911e-05, "loss": 0.9216, "step": 1669 }, { "epoch": 0.6154980189809269, "grad_norm": 10.00158526253073, "learning_rate": 4.5150822104091975e-05, "loss": 0.8461, "step": 1670 }, { "epoch": 0.6158665806689395, "grad_norm": 5.205287541225827, "learning_rate": 4.5147731487204847e-05, "loss": 0.636, "step": 1671 }, { "epoch": 0.616235142356952, "grad_norm": 6.7838138357026505, "learning_rate": 4.514464087031772e-05, "loss": 0.7472, "step": 1672 }, { "epoch": 0.6166037040449646, "grad_norm": 7.660324818469703, "learning_rate": 4.514155025343059e-05, "loss": 0.8215, "step": 1673 }, { "epoch": 0.616972265732977, "grad_norm": 7.1633063430005475, "learning_rate": 4.5138459636543453e-05, "loss": 0.5606, "step": 1674 }, { "epoch": 0.6173408274209896, "grad_norm": 9.0781518932476, "learning_rate": 4.5135369019656325e-05, "loss": 0.8814, "step": 1675 }, { "epoch": 0.6177093891090021, "grad_norm": 5.357876137093825, "learning_rate": 4.5132278402769196e-05, "loss": 0.6407, "step": 1676 }, { "epoch": 0.6180779507970147, "grad_norm": 7.954475572947911, "learning_rate": 4.512918778588207e-05, "loss": 0.982, "step": 1677 }, { "epoch": 0.6184465124850271, "grad_norm": 5.884295961697011, "learning_rate": 4.512609716899494e-05, "loss": 0.655, "step": 1678 }, { "epoch": 0.6188150741730397, "grad_norm": 7.047933494765034, "learning_rate": 4.51230065521078e-05, "loss": 0.6258, "step": 1679 }, { "epoch": 0.6191836358610523, "grad_norm": 5.715220211455521, "learning_rate": 4.5119915935220674e-05, "loss": 0.6187, "step": 1680 }, { "epoch": 0.6195521975490648, "grad_norm": 6.881705603211848, "learning_rate": 4.5116825318333545e-05, "loss": 0.7631, "step": 1681 }, { "epoch": 0.6199207592370773, "grad_norm": 8.778915583697316, "learning_rate": 4.511373470144641e-05, "loss": 1.0181, "step": 1682 }, { "epoch": 0.6202893209250898, "grad_norm": 5.840689280867008, "learning_rate": 4.511064408455928e-05, "loss": 0.5257, "step": 1683 }, { "epoch": 0.6206578826131024, "grad_norm": 5.8006980607460825, "learning_rate": 4.5107553467672145e-05, "loss": 0.7343, "step": 1684 }, { "epoch": 0.6210264443011149, "grad_norm": 6.929230581399014, "learning_rate": 4.5104462850785016e-05, "loss": 0.7175, "step": 1685 }, { "epoch": 0.6213950059891274, "grad_norm": 5.053189794234689, "learning_rate": 4.510137223389789e-05, "loss": 0.5163, "step": 1686 }, { "epoch": 0.6217635676771399, "grad_norm": 6.598167060876257, "learning_rate": 4.509828161701076e-05, "loss": 0.5955, "step": 1687 }, { "epoch": 0.6221321293651525, "grad_norm": 6.784681726354575, "learning_rate": 4.509519100012363e-05, "loss": 0.5837, "step": 1688 }, { "epoch": 0.6225006910531651, "grad_norm": 6.14152601780637, "learning_rate": 4.5092100383236494e-05, "loss": 0.6204, "step": 1689 }, { "epoch": 0.6228692527411775, "grad_norm": 7.357535162636574, "learning_rate": 4.5089009766349365e-05, "loss": 1.0055, "step": 1690 }, { "epoch": 0.6232378144291901, "grad_norm": 12.355781814125775, "learning_rate": 4.5085919149462237e-05, "loss": 0.6075, "step": 1691 }, { "epoch": 0.6236063761172026, "grad_norm": 5.998285684455497, "learning_rate": 4.508282853257511e-05, "loss": 0.7972, "step": 1692 }, { "epoch": 0.6239749378052152, "grad_norm": 3.701300320371811, "learning_rate": 4.507973791568797e-05, "loss": 0.4894, "step": 1693 }, { "epoch": 0.6243434994932276, "grad_norm": 5.504131152808267, "learning_rate": 4.507664729880084e-05, "loss": 0.7507, "step": 1694 }, { "epoch": 0.6247120611812402, "grad_norm": 5.439738547269205, "learning_rate": 4.5073556681913715e-05, "loss": 0.7246, "step": 1695 }, { "epoch": 0.6250806228692527, "grad_norm": 6.933685628792654, "learning_rate": 4.5070466065026586e-05, "loss": 0.8049, "step": 1696 }, { "epoch": 0.6254491845572653, "grad_norm": 5.714089417491973, "learning_rate": 4.506737544813945e-05, "loss": 0.6363, "step": 1697 }, { "epoch": 0.6258177462452778, "grad_norm": 5.519119668505404, "learning_rate": 4.506428483125232e-05, "loss": 0.8035, "step": 1698 }, { "epoch": 0.6261863079332903, "grad_norm": 6.674375400537842, "learning_rate": 4.5061194214365186e-05, "loss": 0.6694, "step": 1699 }, { "epoch": 0.6265548696213029, "grad_norm": 5.466636380057446, "learning_rate": 4.505810359747806e-05, "loss": 0.5726, "step": 1700 }, { "epoch": 0.6269234313093154, "grad_norm": 5.4723786440899715, "learning_rate": 4.505501298059093e-05, "loss": 0.6614, "step": 1701 }, { "epoch": 0.627291992997328, "grad_norm": 5.884552028462625, "learning_rate": 4.50519223637038e-05, "loss": 0.7777, "step": 1702 }, { "epoch": 0.6276605546853404, "grad_norm": 5.694810130878624, "learning_rate": 4.5048831746816664e-05, "loss": 0.7459, "step": 1703 }, { "epoch": 0.628029116373353, "grad_norm": 8.209155149677972, "learning_rate": 4.5045741129929535e-05, "loss": 0.6604, "step": 1704 }, { "epoch": 0.6283976780613655, "grad_norm": 5.753503478456524, "learning_rate": 4.5042650513042406e-05, "loss": 0.7676, "step": 1705 }, { "epoch": 0.6287662397493781, "grad_norm": 7.250481951249215, "learning_rate": 4.503955989615528e-05, "loss": 0.5659, "step": 1706 }, { "epoch": 0.6291348014373905, "grad_norm": 4.9868464067946725, "learning_rate": 4.503646927926815e-05, "loss": 0.6064, "step": 1707 }, { "epoch": 0.6295033631254031, "grad_norm": 9.750599378111167, "learning_rate": 4.503337866238101e-05, "loss": 0.7293, "step": 1708 }, { "epoch": 0.6298719248134157, "grad_norm": 6.093952195163135, "learning_rate": 4.5030288045493884e-05, "loss": 0.6563, "step": 1709 }, { "epoch": 0.6302404865014282, "grad_norm": 8.038006623291263, "learning_rate": 4.5027197428606755e-05, "loss": 0.9737, "step": 1710 }, { "epoch": 0.6306090481894407, "grad_norm": 7.1259810458264985, "learning_rate": 4.5024106811719627e-05, "loss": 0.6336, "step": 1711 }, { "epoch": 0.6309776098774532, "grad_norm": 7.890368769045173, "learning_rate": 4.502101619483249e-05, "loss": 0.7565, "step": 1712 }, { "epoch": 0.6313461715654658, "grad_norm": 5.437185694939065, "learning_rate": 4.5017925577945355e-05, "loss": 0.9234, "step": 1713 }, { "epoch": 0.6317147332534783, "grad_norm": 10.974502837232038, "learning_rate": 4.5014834961058227e-05, "loss": 0.894, "step": 1714 }, { "epoch": 0.6320832949414908, "grad_norm": 5.448114317401803, "learning_rate": 4.50117443441711e-05, "loss": 0.5351, "step": 1715 }, { "epoch": 0.6324518566295033, "grad_norm": 5.433443507538233, "learning_rate": 4.500865372728397e-05, "loss": 0.7435, "step": 1716 }, { "epoch": 0.6328204183175159, "grad_norm": 8.564198123698045, "learning_rate": 4.500556311039683e-05, "loss": 0.5901, "step": 1717 }, { "epoch": 0.6331889800055285, "grad_norm": 6.083565206768149, "learning_rate": 4.5002472493509705e-05, "loss": 0.8467, "step": 1718 }, { "epoch": 0.633557541693541, "grad_norm": 6.801238945684019, "learning_rate": 4.4999381876622576e-05, "loss": 0.5684, "step": 1719 }, { "epoch": 0.6339261033815535, "grad_norm": 5.569628487598447, "learning_rate": 4.499629125973545e-05, "loss": 0.5851, "step": 1720 }, { "epoch": 0.634294665069566, "grad_norm": 5.762553567699434, "learning_rate": 4.499320064284832e-05, "loss": 0.7312, "step": 1721 }, { "epoch": 0.6346632267575786, "grad_norm": 6.695284581877324, "learning_rate": 4.499011002596118e-05, "loss": 0.8127, "step": 1722 }, { "epoch": 0.635031788445591, "grad_norm": 5.10649153512858, "learning_rate": 4.4987019409074054e-05, "loss": 0.5626, "step": 1723 }, { "epoch": 0.6354003501336036, "grad_norm": 5.60629281290265, "learning_rate": 4.4983928792186925e-05, "loss": 0.4794, "step": 1724 }, { "epoch": 0.6357689118216161, "grad_norm": 5.121037137368239, "learning_rate": 4.4980838175299796e-05, "loss": 0.5347, "step": 1725 }, { "epoch": 0.6361374735096287, "grad_norm": 7.131066667505378, "learning_rate": 4.497774755841267e-05, "loss": 0.648, "step": 1726 }, { "epoch": 0.6365060351976412, "grad_norm": 6.916458249303285, "learning_rate": 4.4974656941525525e-05, "loss": 0.7293, "step": 1727 }, { "epoch": 0.6368745968856537, "grad_norm": 8.389403753361384, "learning_rate": 4.4971566324638396e-05, "loss": 0.6458, "step": 1728 }, { "epoch": 0.6372431585736663, "grad_norm": 6.162718806531652, "learning_rate": 4.496847570775127e-05, "loss": 0.7242, "step": 1729 }, { "epoch": 0.6376117202616788, "grad_norm": 7.88042154336758, "learning_rate": 4.496538509086414e-05, "loss": 0.7364, "step": 1730 }, { "epoch": 0.6379802819496914, "grad_norm": 7.729947142426463, "learning_rate": 4.496229447397701e-05, "loss": 0.9503, "step": 1731 }, { "epoch": 0.6383488436377038, "grad_norm": 8.815251779752504, "learning_rate": 4.4959203857089874e-05, "loss": 0.7843, "step": 1732 }, { "epoch": 0.6387174053257164, "grad_norm": 6.285759516961981, "learning_rate": 4.4956113240202745e-05, "loss": 1.0071, "step": 1733 }, { "epoch": 0.6390859670137289, "grad_norm": 54.638511540481915, "learning_rate": 4.4953022623315616e-05, "loss": 0.7942, "step": 1734 }, { "epoch": 0.6394545287017415, "grad_norm": 6.051090476382678, "learning_rate": 4.494993200642849e-05, "loss": 0.6705, "step": 1735 }, { "epoch": 0.6398230903897539, "grad_norm": 7.8867603469061525, "learning_rate": 4.494684138954135e-05, "loss": 0.7804, "step": 1736 }, { "epoch": 0.6401916520777665, "grad_norm": 10.266684746003849, "learning_rate": 4.494375077265422e-05, "loss": 0.8796, "step": 1737 }, { "epoch": 0.6405602137657791, "grad_norm": 9.358594858711012, "learning_rate": 4.4940660155767094e-05, "loss": 1.1235, "step": 1738 }, { "epoch": 0.6409287754537916, "grad_norm": 7.972422273483654, "learning_rate": 4.4937569538879966e-05, "loss": 0.7923, "step": 1739 }, { "epoch": 0.6412973371418041, "grad_norm": 6.395116444855537, "learning_rate": 4.493447892199284e-05, "loss": 0.7086, "step": 1740 }, { "epoch": 0.6416658988298166, "grad_norm": 7.616313050768376, "learning_rate": 4.49313883051057e-05, "loss": 0.686, "step": 1741 }, { "epoch": 0.6420344605178292, "grad_norm": 8.470095819609089, "learning_rate": 4.4928297688218566e-05, "loss": 0.7442, "step": 1742 }, { "epoch": 0.6424030222058417, "grad_norm": 8.090458616783279, "learning_rate": 4.492520707133144e-05, "loss": 0.8732, "step": 1743 }, { "epoch": 0.6427715838938542, "grad_norm": 6.371633800302419, "learning_rate": 4.492211645444431e-05, "loss": 0.894, "step": 1744 }, { "epoch": 0.6431401455818667, "grad_norm": 6.003876705258309, "learning_rate": 4.491902583755718e-05, "loss": 0.7444, "step": 1745 }, { "epoch": 0.6435087072698793, "grad_norm": 5.630309925933078, "learning_rate": 4.4915935220670044e-05, "loss": 0.5094, "step": 1746 }, { "epoch": 0.6438772689578919, "grad_norm": 9.254323412927317, "learning_rate": 4.4912844603782915e-05, "loss": 0.6382, "step": 1747 }, { "epoch": 0.6442458306459043, "grad_norm": 9.515131895320819, "learning_rate": 4.4909753986895786e-05, "loss": 0.8029, "step": 1748 }, { "epoch": 0.6446143923339169, "grad_norm": 7.122358936909307, "learning_rate": 4.490666337000866e-05, "loss": 0.8517, "step": 1749 }, { "epoch": 0.6449829540219294, "grad_norm": 5.136062220401534, "learning_rate": 4.490357275312153e-05, "loss": 0.6139, "step": 1750 }, { "epoch": 0.645351515709942, "grad_norm": 4.549584908467526, "learning_rate": 4.490048213623439e-05, "loss": 0.5625, "step": 1751 }, { "epoch": 0.6457200773979545, "grad_norm": 6.652962459300615, "learning_rate": 4.4897391519347264e-05, "loss": 0.9437, "step": 1752 }, { "epoch": 0.646088639085967, "grad_norm": 7.511877699988855, "learning_rate": 4.4894300902460135e-05, "loss": 0.8089, "step": 1753 }, { "epoch": 0.6464572007739795, "grad_norm": 6.494193491326171, "learning_rate": 4.4891210285573006e-05, "loss": 0.8715, "step": 1754 }, { "epoch": 0.6468257624619921, "grad_norm": 7.273477884051422, "learning_rate": 4.488811966868587e-05, "loss": 0.6806, "step": 1755 }, { "epoch": 0.6471943241500046, "grad_norm": 4.957364455162501, "learning_rate": 4.488502905179874e-05, "loss": 0.4749, "step": 1756 }, { "epoch": 0.6475628858380171, "grad_norm": 5.592182302032894, "learning_rate": 4.4881938434911606e-05, "loss": 0.5597, "step": 1757 }, { "epoch": 0.6479314475260297, "grad_norm": 8.034233756142514, "learning_rate": 4.487884781802448e-05, "loss": 0.757, "step": 1758 }, { "epoch": 0.6483000092140422, "grad_norm": 6.511074608681309, "learning_rate": 4.487575720113735e-05, "loss": 0.7378, "step": 1759 }, { "epoch": 0.6486685709020548, "grad_norm": 6.813094646854114, "learning_rate": 4.487266658425022e-05, "loss": 0.7739, "step": 1760 }, { "epoch": 0.6490371325900672, "grad_norm": 8.11819125771714, "learning_rate": 4.4869575967363084e-05, "loss": 0.5876, "step": 1761 }, { "epoch": 0.6494056942780798, "grad_norm": 7.104086180946047, "learning_rate": 4.4866485350475956e-05, "loss": 0.7838, "step": 1762 }, { "epoch": 0.6497742559660923, "grad_norm": 7.823816267184833, "learning_rate": 4.486339473358883e-05, "loss": 0.448, "step": 1763 }, { "epoch": 0.6501428176541049, "grad_norm": 5.483748267029801, "learning_rate": 4.48603041167017e-05, "loss": 0.6789, "step": 1764 }, { "epoch": 0.6505113793421173, "grad_norm": 6.552431648186068, "learning_rate": 4.485721349981456e-05, "loss": 0.6185, "step": 1765 }, { "epoch": 0.6508799410301299, "grad_norm": 7.664475113924805, "learning_rate": 4.4854122882927434e-05, "loss": 0.9461, "step": 1766 }, { "epoch": 0.6512485027181425, "grad_norm": 6.03949865268882, "learning_rate": 4.4851032266040305e-05, "loss": 0.5895, "step": 1767 }, { "epoch": 0.651617064406155, "grad_norm": 6.319123975664999, "learning_rate": 4.4847941649153176e-05, "loss": 0.652, "step": 1768 }, { "epoch": 0.6519856260941675, "grad_norm": 6.667717024393193, "learning_rate": 4.484485103226605e-05, "loss": 0.6525, "step": 1769 }, { "epoch": 0.65235418778218, "grad_norm": 6.580078414867165, "learning_rate": 4.484176041537891e-05, "loss": 0.5889, "step": 1770 }, { "epoch": 0.6527227494701926, "grad_norm": 14.594081441967, "learning_rate": 4.483866979849178e-05, "loss": 0.7156, "step": 1771 }, { "epoch": 0.6530913111582051, "grad_norm": 5.658588073719195, "learning_rate": 4.483557918160465e-05, "loss": 0.6376, "step": 1772 }, { "epoch": 0.6534598728462176, "grad_norm": 14.669524925772189, "learning_rate": 4.483248856471752e-05, "loss": 0.7362, "step": 1773 }, { "epoch": 0.6538284345342301, "grad_norm": 6.596469119626909, "learning_rate": 4.482939794783039e-05, "loss": 0.5409, "step": 1774 }, { "epoch": 0.6541969962222427, "grad_norm": 10.386348171158115, "learning_rate": 4.4826307330943254e-05, "loss": 0.7356, "step": 1775 }, { "epoch": 0.6545655579102553, "grad_norm": 8.238302087011345, "learning_rate": 4.4823216714056125e-05, "loss": 0.6935, "step": 1776 }, { "epoch": 0.6549341195982677, "grad_norm": 6.0439812096253815, "learning_rate": 4.4820126097168996e-05, "loss": 0.6654, "step": 1777 }, { "epoch": 0.6553026812862803, "grad_norm": 8.932609874221479, "learning_rate": 4.481703548028187e-05, "loss": 0.712, "step": 1778 }, { "epoch": 0.6556712429742928, "grad_norm": 5.269934455710096, "learning_rate": 4.481394486339474e-05, "loss": 0.5528, "step": 1779 }, { "epoch": 0.6560398046623054, "grad_norm": 9.719911757331571, "learning_rate": 4.48108542465076e-05, "loss": 0.7809, "step": 1780 }, { "epoch": 0.6564083663503179, "grad_norm": 5.636337045288006, "learning_rate": 4.4807763629620474e-05, "loss": 0.8769, "step": 1781 }, { "epoch": 0.6567769280383304, "grad_norm": 6.813061612279376, "learning_rate": 4.4804673012733346e-05, "loss": 0.8264, "step": 1782 }, { "epoch": 0.6571454897263429, "grad_norm": 6.889104074104822, "learning_rate": 4.480158239584622e-05, "loss": 0.9091, "step": 1783 }, { "epoch": 0.6575140514143555, "grad_norm": 6.309503127181574, "learning_rate": 4.479849177895908e-05, "loss": 0.7094, "step": 1784 }, { "epoch": 0.6578826131023681, "grad_norm": 7.263277720634831, "learning_rate": 4.479540116207195e-05, "loss": 0.7048, "step": 1785 }, { "epoch": 0.6582511747903805, "grad_norm": 10.342016161213856, "learning_rate": 4.4792310545184824e-05, "loss": 0.7452, "step": 1786 }, { "epoch": 0.6586197364783931, "grad_norm": 5.638829502905882, "learning_rate": 4.478921992829769e-05, "loss": 0.7034, "step": 1787 }, { "epoch": 0.6589882981664056, "grad_norm": 7.190468084089557, "learning_rate": 4.478612931141056e-05, "loss": 0.8854, "step": 1788 }, { "epoch": 0.6593568598544182, "grad_norm": 8.390162165714807, "learning_rate": 4.478303869452343e-05, "loss": 0.9815, "step": 1789 }, { "epoch": 0.6597254215424306, "grad_norm": 7.087196313988881, "learning_rate": 4.4779948077636295e-05, "loss": 0.9102, "step": 1790 }, { "epoch": 0.6600939832304432, "grad_norm": 11.818480577227213, "learning_rate": 4.4776857460749166e-05, "loss": 0.9349, "step": 1791 }, { "epoch": 0.6604625449184557, "grad_norm": 8.01687606358285, "learning_rate": 4.477376684386204e-05, "loss": 0.8927, "step": 1792 }, { "epoch": 0.6608311066064683, "grad_norm": 5.652655370804846, "learning_rate": 4.477067622697491e-05, "loss": 0.7515, "step": 1793 }, { "epoch": 0.6611996682944807, "grad_norm": 7.879460116253703, "learning_rate": 4.476758561008777e-05, "loss": 0.7849, "step": 1794 }, { "epoch": 0.6615682299824933, "grad_norm": 6.932346388679429, "learning_rate": 4.4764494993200644e-05, "loss": 0.6266, "step": 1795 }, { "epoch": 0.6619367916705059, "grad_norm": 6.298371168017743, "learning_rate": 4.4761404376313515e-05, "loss": 0.5579, "step": 1796 }, { "epoch": 0.6623053533585184, "grad_norm": 6.405841795778977, "learning_rate": 4.4758313759426386e-05, "loss": 0.7188, "step": 1797 }, { "epoch": 0.662673915046531, "grad_norm": 5.935131363479482, "learning_rate": 4.475522314253926e-05, "loss": 0.5929, "step": 1798 }, { "epoch": 0.6630424767345434, "grad_norm": 4.377324386028248, "learning_rate": 4.475213252565212e-05, "loss": 0.3968, "step": 1799 }, { "epoch": 0.663411038422556, "grad_norm": 5.977751171111148, "learning_rate": 4.474904190876499e-05, "loss": 0.6384, "step": 1800 }, { "epoch": 0.6637796001105685, "grad_norm": 5.059690095963141, "learning_rate": 4.4745951291877864e-05, "loss": 0.5872, "step": 1801 }, { "epoch": 0.664148161798581, "grad_norm": 4.883443318626749, "learning_rate": 4.474286067499073e-05, "loss": 0.5383, "step": 1802 }, { "epoch": 0.6645167234865935, "grad_norm": 5.689967029120685, "learning_rate": 4.47397700581036e-05, "loss": 0.7994, "step": 1803 }, { "epoch": 0.6648852851746061, "grad_norm": 5.477845045858412, "learning_rate": 4.4736679441216464e-05, "loss": 0.6572, "step": 1804 }, { "epoch": 0.6652538468626187, "grad_norm": 6.65015114060358, "learning_rate": 4.4733588824329336e-05, "loss": 0.6449, "step": 1805 }, { "epoch": 0.6656224085506312, "grad_norm": 8.304969531605666, "learning_rate": 4.473049820744221e-05, "loss": 0.6887, "step": 1806 }, { "epoch": 0.6659909702386437, "grad_norm": 4.7502967340240225, "learning_rate": 4.472740759055508e-05, "loss": 0.6323, "step": 1807 }, { "epoch": 0.6663595319266562, "grad_norm": 7.048060958096416, "learning_rate": 4.472431697366794e-05, "loss": 0.6491, "step": 1808 }, { "epoch": 0.6667280936146688, "grad_norm": 6.494049870136532, "learning_rate": 4.4721226356780814e-05, "loss": 0.9946, "step": 1809 }, { "epoch": 0.6670966553026813, "grad_norm": 7.60471635338424, "learning_rate": 4.4718135739893685e-05, "loss": 0.729, "step": 1810 }, { "epoch": 0.6674652169906938, "grad_norm": 6.362154116718866, "learning_rate": 4.4715045123006556e-05, "loss": 0.7593, "step": 1811 }, { "epoch": 0.6678337786787063, "grad_norm": 7.180525339613532, "learning_rate": 4.471195450611943e-05, "loss": 0.671, "step": 1812 }, { "epoch": 0.6682023403667189, "grad_norm": 9.652270635978654, "learning_rate": 4.470886388923229e-05, "loss": 0.6674, "step": 1813 }, { "epoch": 0.6685709020547315, "grad_norm": 7.60217069445156, "learning_rate": 4.470577327234516e-05, "loss": 0.7228, "step": 1814 }, { "epoch": 0.6689394637427439, "grad_norm": 12.101536021194402, "learning_rate": 4.4702682655458034e-05, "loss": 0.7028, "step": 1815 }, { "epoch": 0.6693080254307565, "grad_norm": 7.331950086125088, "learning_rate": 4.4699592038570905e-05, "loss": 0.7402, "step": 1816 }, { "epoch": 0.669676587118769, "grad_norm": 7.886587428095014, "learning_rate": 4.4696501421683776e-05, "loss": 0.774, "step": 1817 }, { "epoch": 0.6700451488067816, "grad_norm": 7.755522206455489, "learning_rate": 4.4693410804796634e-05, "loss": 0.569, "step": 1818 }, { "epoch": 0.670413710494794, "grad_norm": 7.1021555032939645, "learning_rate": 4.4690320187909505e-05, "loss": 0.7288, "step": 1819 }, { "epoch": 0.6707822721828066, "grad_norm": 8.788570732770053, "learning_rate": 4.4687229571022376e-05, "loss": 0.9023, "step": 1820 }, { "epoch": 0.6711508338708191, "grad_norm": 13.120598917991392, "learning_rate": 4.468413895413525e-05, "loss": 0.6321, "step": 1821 }, { "epoch": 0.6715193955588317, "grad_norm": 6.666844492765874, "learning_rate": 4.468104833724812e-05, "loss": 0.5494, "step": 1822 }, { "epoch": 0.6718879572468441, "grad_norm": 8.984730801378648, "learning_rate": 4.467795772036098e-05, "loss": 0.7405, "step": 1823 }, { "epoch": 0.6722565189348567, "grad_norm": 7.557963374069071, "learning_rate": 4.4674867103473854e-05, "loss": 0.591, "step": 1824 }, { "epoch": 0.6726250806228693, "grad_norm": 10.304987349930448, "learning_rate": 4.4671776486586726e-05, "loss": 0.8605, "step": 1825 }, { "epoch": 0.6729936423108818, "grad_norm": 6.949795386506472, "learning_rate": 4.46686858696996e-05, "loss": 0.7391, "step": 1826 }, { "epoch": 0.6733622039988943, "grad_norm": 11.513767128793848, "learning_rate": 4.466559525281246e-05, "loss": 0.716, "step": 1827 }, { "epoch": 0.6737307656869068, "grad_norm": 12.74174441672921, "learning_rate": 4.466250463592533e-05, "loss": 0.8114, "step": 1828 }, { "epoch": 0.6740993273749194, "grad_norm": 6.995677703465313, "learning_rate": 4.4659414019038204e-05, "loss": 0.6723, "step": 1829 }, { "epoch": 0.6744678890629319, "grad_norm": 5.8312803016194215, "learning_rate": 4.4656323402151075e-05, "loss": 0.7826, "step": 1830 }, { "epoch": 0.6748364507509444, "grad_norm": 5.8010429759979365, "learning_rate": 4.4653232785263946e-05, "loss": 0.7456, "step": 1831 }, { "epoch": 0.6752050124389569, "grad_norm": 9.568852726679559, "learning_rate": 4.465014216837681e-05, "loss": 0.7065, "step": 1832 }, { "epoch": 0.6755735741269695, "grad_norm": 6.292506517688041, "learning_rate": 4.4647051551489675e-05, "loss": 0.5761, "step": 1833 }, { "epoch": 0.6759421358149821, "grad_norm": 8.423656268172177, "learning_rate": 4.4643960934602546e-05, "loss": 0.5558, "step": 1834 }, { "epoch": 0.6763106975029946, "grad_norm": 6.8805339388695375, "learning_rate": 4.464087031771542e-05, "loss": 0.977, "step": 1835 }, { "epoch": 0.6766792591910071, "grad_norm": 6.153483897330603, "learning_rate": 4.463777970082829e-05, "loss": 0.7335, "step": 1836 }, { "epoch": 0.6770478208790196, "grad_norm": 6.2963879115268835, "learning_rate": 4.463468908394115e-05, "loss": 0.7764, "step": 1837 }, { "epoch": 0.6774163825670322, "grad_norm": 8.935999177418724, "learning_rate": 4.4631598467054024e-05, "loss": 1.0201, "step": 1838 }, { "epoch": 0.6777849442550447, "grad_norm": 7.706297679546337, "learning_rate": 4.4628507850166895e-05, "loss": 0.7577, "step": 1839 }, { "epoch": 0.6781535059430572, "grad_norm": 4.711527975600714, "learning_rate": 4.4625417233279766e-05, "loss": 0.4284, "step": 1840 }, { "epoch": 0.6785220676310697, "grad_norm": 7.469257161945191, "learning_rate": 4.462232661639264e-05, "loss": 0.8966, "step": 1841 }, { "epoch": 0.6788906293190823, "grad_norm": 7.5485208742452325, "learning_rate": 4.46192359995055e-05, "loss": 0.7638, "step": 1842 }, { "epoch": 0.6792591910070949, "grad_norm": 7.966373822351133, "learning_rate": 4.461614538261837e-05, "loss": 0.6993, "step": 1843 }, { "epoch": 0.6796277526951073, "grad_norm": 6.624355356774141, "learning_rate": 4.4613054765731244e-05, "loss": 0.7902, "step": 1844 }, { "epoch": 0.6799963143831199, "grad_norm": 9.152688853928987, "learning_rate": 4.4609964148844116e-05, "loss": 0.8282, "step": 1845 }, { "epoch": 0.6803648760711324, "grad_norm": 5.4913435351287925, "learning_rate": 4.460687353195698e-05, "loss": 0.5076, "step": 1846 }, { "epoch": 0.680733437759145, "grad_norm": 5.743332023480237, "learning_rate": 4.460378291506985e-05, "loss": 0.8056, "step": 1847 }, { "epoch": 0.6811019994471574, "grad_norm": 6.0790556756551295, "learning_rate": 4.4600692298182716e-05, "loss": 0.6084, "step": 1848 }, { "epoch": 0.68147056113517, "grad_norm": 4.417337954387403, "learning_rate": 4.459760168129559e-05, "loss": 0.5792, "step": 1849 }, { "epoch": 0.6818391228231825, "grad_norm": 11.969775601660274, "learning_rate": 4.459451106440846e-05, "loss": 0.7158, "step": 1850 }, { "epoch": 0.6822076845111951, "grad_norm": 4.4864686338603, "learning_rate": 4.459142044752133e-05, "loss": 0.457, "step": 1851 }, { "epoch": 0.6825762461992075, "grad_norm": 6.497305678440992, "learning_rate": 4.4588329830634194e-05, "loss": 0.8389, "step": 1852 }, { "epoch": 0.6829448078872201, "grad_norm": 6.777772635925232, "learning_rate": 4.4585239213747065e-05, "loss": 0.8836, "step": 1853 }, { "epoch": 0.6833133695752327, "grad_norm": 6.198563981646903, "learning_rate": 4.4582148596859936e-05, "loss": 0.6584, "step": 1854 }, { "epoch": 0.6836819312632452, "grad_norm": 6.164698037381056, "learning_rate": 4.457905797997281e-05, "loss": 0.5565, "step": 1855 }, { "epoch": 0.6840504929512577, "grad_norm": 5.73917796877952, "learning_rate": 4.457596736308567e-05, "loss": 0.7224, "step": 1856 }, { "epoch": 0.6844190546392702, "grad_norm": 5.573457875814748, "learning_rate": 4.457287674619854e-05, "loss": 0.5641, "step": 1857 }, { "epoch": 0.6847876163272828, "grad_norm": 6.535800352579575, "learning_rate": 4.4569786129311414e-05, "loss": 0.6666, "step": 1858 }, { "epoch": 0.6851561780152953, "grad_norm": 5.912627153057617, "learning_rate": 4.4566695512424285e-05, "loss": 0.5705, "step": 1859 }, { "epoch": 0.6855247397033079, "grad_norm": 7.196028716220355, "learning_rate": 4.4563604895537156e-05, "loss": 0.7354, "step": 1860 }, { "epoch": 0.6858933013913203, "grad_norm": 6.012298377576237, "learning_rate": 4.456051427865002e-05, "loss": 0.4859, "step": 1861 }, { "epoch": 0.6862618630793329, "grad_norm": 10.344128374169042, "learning_rate": 4.455742366176289e-05, "loss": 0.7592, "step": 1862 }, { "epoch": 0.6866304247673455, "grad_norm": 5.600827646402541, "learning_rate": 4.4554333044875756e-05, "loss": 0.69, "step": 1863 }, { "epoch": 0.686998986455358, "grad_norm": 7.712688830767396, "learning_rate": 4.455124242798863e-05, "loss": 0.9044, "step": 1864 }, { "epoch": 0.6873675481433705, "grad_norm": 6.861651970278766, "learning_rate": 4.45481518111015e-05, "loss": 0.5724, "step": 1865 }, { "epoch": 0.687736109831383, "grad_norm": 10.004773335860895, "learning_rate": 4.454506119421436e-05, "loss": 0.8266, "step": 1866 }, { "epoch": 0.6881046715193956, "grad_norm": 6.693239975135069, "learning_rate": 4.4541970577327234e-05, "loss": 0.7676, "step": 1867 }, { "epoch": 0.688473233207408, "grad_norm": 6.710940910578295, "learning_rate": 4.4538879960440106e-05, "loss": 0.6181, "step": 1868 }, { "epoch": 0.6888417948954206, "grad_norm": 11.167276014716712, "learning_rate": 4.453578934355298e-05, "loss": 0.8158, "step": 1869 }, { "epoch": 0.6892103565834331, "grad_norm": 5.60063148752728, "learning_rate": 4.453269872666585e-05, "loss": 0.7438, "step": 1870 }, { "epoch": 0.6895789182714457, "grad_norm": 7.173337567031537, "learning_rate": 4.452960810977871e-05, "loss": 0.6144, "step": 1871 }, { "epoch": 0.6899474799594583, "grad_norm": 6.139010850587643, "learning_rate": 4.4526517492891583e-05, "loss": 0.6747, "step": 1872 }, { "epoch": 0.6903160416474707, "grad_norm": 6.673590339084054, "learning_rate": 4.4523426876004455e-05, "loss": 0.6667, "step": 1873 }, { "epoch": 0.6906846033354833, "grad_norm": 5.31350806152789, "learning_rate": 4.4520336259117326e-05, "loss": 0.5919, "step": 1874 }, { "epoch": 0.6910531650234958, "grad_norm": 7.125947588098042, "learning_rate": 4.451724564223019e-05, "loss": 0.7475, "step": 1875 }, { "epoch": 0.6914217267115084, "grad_norm": 6.737665845991926, "learning_rate": 4.451415502534306e-05, "loss": 0.7068, "step": 1876 }, { "epoch": 0.6917902883995208, "grad_norm": 9.334363426403103, "learning_rate": 4.451106440845593e-05, "loss": 0.7378, "step": 1877 }, { "epoch": 0.6921588500875334, "grad_norm": 9.1791364528755, "learning_rate": 4.45079737915688e-05, "loss": 0.8395, "step": 1878 }, { "epoch": 0.6925274117755459, "grad_norm": 8.982570301081266, "learning_rate": 4.450488317468167e-05, "loss": 0.7111, "step": 1879 }, { "epoch": 0.6928959734635585, "grad_norm": 7.333081876894081, "learning_rate": 4.450179255779453e-05, "loss": 0.6491, "step": 1880 }, { "epoch": 0.693264535151571, "grad_norm": 7.1961723749911615, "learning_rate": 4.4498701940907404e-05, "loss": 0.6352, "step": 1881 }, { "epoch": 0.6936330968395835, "grad_norm": 7.150495455988577, "learning_rate": 4.4495611324020275e-05, "loss": 0.6192, "step": 1882 }, { "epoch": 0.6940016585275961, "grad_norm": 12.213596359596801, "learning_rate": 4.4492520707133146e-05, "loss": 0.8089, "step": 1883 }, { "epoch": 0.6943702202156086, "grad_norm": 9.417515089135858, "learning_rate": 4.448943009024602e-05, "loss": 0.8693, "step": 1884 }, { "epoch": 0.6947387819036211, "grad_norm": 6.297174564931538, "learning_rate": 4.448633947335888e-05, "loss": 0.5602, "step": 1885 }, { "epoch": 0.6951073435916336, "grad_norm": 8.032741779163741, "learning_rate": 4.448324885647175e-05, "loss": 0.9369, "step": 1886 }, { "epoch": 0.6954759052796462, "grad_norm": 7.204871373507584, "learning_rate": 4.4480158239584624e-05, "loss": 0.6592, "step": 1887 }, { "epoch": 0.6958444669676587, "grad_norm": 11.300577699190836, "learning_rate": 4.4477067622697495e-05, "loss": 0.6164, "step": 1888 }, { "epoch": 0.6962130286556713, "grad_norm": 5.731623600188543, "learning_rate": 4.447397700581037e-05, "loss": 0.65, "step": 1889 }, { "epoch": 0.6965815903436837, "grad_norm": 8.279899285482546, "learning_rate": 4.447088638892323e-05, "loss": 0.5829, "step": 1890 }, { "epoch": 0.6969501520316963, "grad_norm": 8.934148840428488, "learning_rate": 4.44677957720361e-05, "loss": 0.8879, "step": 1891 }, { "epoch": 0.6973187137197089, "grad_norm": 6.876028642862167, "learning_rate": 4.4464705155148973e-05, "loss": 0.7889, "step": 1892 }, { "epoch": 0.6976872754077214, "grad_norm": 7.8105520032335, "learning_rate": 4.446161453826184e-05, "loss": 0.6831, "step": 1893 }, { "epoch": 0.6980558370957339, "grad_norm": 10.433829935786104, "learning_rate": 4.445852392137471e-05, "loss": 0.8889, "step": 1894 }, { "epoch": 0.6984243987837464, "grad_norm": 5.57614724709973, "learning_rate": 4.4455433304487573e-05, "loss": 0.7058, "step": 1895 }, { "epoch": 0.698792960471759, "grad_norm": 6.831330478818884, "learning_rate": 4.4452342687600445e-05, "loss": 0.7113, "step": 1896 }, { "epoch": 0.6991615221597715, "grad_norm": 16.179394198512483, "learning_rate": 4.4449252070713316e-05, "loss": 0.7924, "step": 1897 }, { "epoch": 0.699530083847784, "grad_norm": 6.573034931293749, "learning_rate": 4.444616145382619e-05, "loss": 0.7949, "step": 1898 }, { "epoch": 0.6998986455357965, "grad_norm": 7.3009228475768415, "learning_rate": 4.444307083693905e-05, "loss": 0.6666, "step": 1899 }, { "epoch": 0.7002672072238091, "grad_norm": 7.740094407168737, "learning_rate": 4.443998022005192e-05, "loss": 0.7111, "step": 1900 }, { "epoch": 0.7006357689118217, "grad_norm": 5.109019042515131, "learning_rate": 4.4436889603164794e-05, "loss": 0.6724, "step": 1901 }, { "epoch": 0.7010043305998341, "grad_norm": 10.744269684566165, "learning_rate": 4.4433798986277665e-05, "loss": 0.7218, "step": 1902 }, { "epoch": 0.7013728922878467, "grad_norm": 6.537371089678186, "learning_rate": 4.4430708369390536e-05, "loss": 0.657, "step": 1903 }, { "epoch": 0.7017414539758592, "grad_norm": 7.2417450309818285, "learning_rate": 4.44276177525034e-05, "loss": 0.6466, "step": 1904 }, { "epoch": 0.7021100156638718, "grad_norm": 8.524162506401838, "learning_rate": 4.442452713561627e-05, "loss": 0.7993, "step": 1905 }, { "epoch": 0.7024785773518842, "grad_norm": 7.261393572096662, "learning_rate": 4.442143651872914e-05, "loss": 0.5947, "step": 1906 }, { "epoch": 0.7028471390398968, "grad_norm": 11.390571416197696, "learning_rate": 4.4418345901842014e-05, "loss": 0.9385, "step": 1907 }, { "epoch": 0.7032157007279093, "grad_norm": 6.185472252470621, "learning_rate": 4.441525528495488e-05, "loss": 0.6591, "step": 1908 }, { "epoch": 0.7035842624159219, "grad_norm": 8.608150665127507, "learning_rate": 4.441216466806774e-05, "loss": 0.5998, "step": 1909 }, { "epoch": 0.7039528241039344, "grad_norm": 5.996898485289799, "learning_rate": 4.4409074051180614e-05, "loss": 0.8235, "step": 1910 }, { "epoch": 0.7043213857919469, "grad_norm": 9.299064211781944, "learning_rate": 4.4405983434293485e-05, "loss": 0.604, "step": 1911 }, { "epoch": 0.7046899474799595, "grad_norm": 6.005128575964647, "learning_rate": 4.440289281740636e-05, "loss": 0.8266, "step": 1912 }, { "epoch": 0.705058509167972, "grad_norm": 7.167056524037605, "learning_rate": 4.439980220051923e-05, "loss": 0.5843, "step": 1913 }, { "epoch": 0.7054270708559846, "grad_norm": 7.497316007375781, "learning_rate": 4.439671158363209e-05, "loss": 0.6859, "step": 1914 }, { "epoch": 0.705795632543997, "grad_norm": 6.232659354806473, "learning_rate": 4.4393620966744963e-05, "loss": 0.7933, "step": 1915 }, { "epoch": 0.7061641942320096, "grad_norm": 6.772451291562165, "learning_rate": 4.4390530349857835e-05, "loss": 0.7057, "step": 1916 }, { "epoch": 0.7065327559200221, "grad_norm": 6.349265721437087, "learning_rate": 4.4387439732970706e-05, "loss": 0.609, "step": 1917 }, { "epoch": 0.7069013176080347, "grad_norm": 8.044875642847856, "learning_rate": 4.438434911608357e-05, "loss": 0.9153, "step": 1918 }, { "epoch": 0.7072698792960471, "grad_norm": 8.31791476234593, "learning_rate": 4.438125849919644e-05, "loss": 0.8939, "step": 1919 }, { "epoch": 0.7076384409840597, "grad_norm": 8.403807785196383, "learning_rate": 4.437816788230931e-05, "loss": 0.8173, "step": 1920 }, { "epoch": 0.7080070026720723, "grad_norm": 6.001048314384915, "learning_rate": 4.4375077265422184e-05, "loss": 0.3674, "step": 1921 }, { "epoch": 0.7083755643600848, "grad_norm": 4.259892562800903, "learning_rate": 4.4371986648535055e-05, "loss": 0.5252, "step": 1922 }, { "epoch": 0.7087441260480973, "grad_norm": 7.730668599772708, "learning_rate": 4.436889603164792e-05, "loss": 0.7583, "step": 1923 }, { "epoch": 0.7091126877361098, "grad_norm": 7.166059276066289, "learning_rate": 4.4365805414760784e-05, "loss": 0.6492, "step": 1924 }, { "epoch": 0.7094812494241224, "grad_norm": 8.312869873996116, "learning_rate": 4.4362714797873655e-05, "loss": 0.7585, "step": 1925 }, { "epoch": 0.7098498111121349, "grad_norm": 5.610228747159868, "learning_rate": 4.4359624180986526e-05, "loss": 0.6588, "step": 1926 }, { "epoch": 0.7102183728001474, "grad_norm": 8.548312943195556, "learning_rate": 4.43565335640994e-05, "loss": 0.7159, "step": 1927 }, { "epoch": 0.7105869344881599, "grad_norm": 6.828516619383115, "learning_rate": 4.435344294721226e-05, "loss": 0.627, "step": 1928 }, { "epoch": 0.7109554961761725, "grad_norm": 6.517522814512835, "learning_rate": 4.435035233032513e-05, "loss": 0.9667, "step": 1929 }, { "epoch": 0.7113240578641851, "grad_norm": 5.203812287448313, "learning_rate": 4.4347261713438004e-05, "loss": 0.5499, "step": 1930 }, { "epoch": 0.7116926195521975, "grad_norm": 6.938174928018687, "learning_rate": 4.4344171096550875e-05, "loss": 1.0836, "step": 1931 }, { "epoch": 0.7120611812402101, "grad_norm": 8.255877626325343, "learning_rate": 4.4341080479663747e-05, "loss": 0.971, "step": 1932 }, { "epoch": 0.7124297429282226, "grad_norm": 8.266771781417166, "learning_rate": 4.433798986277661e-05, "loss": 0.718, "step": 1933 }, { "epoch": 0.7127983046162352, "grad_norm": 7.235532334363595, "learning_rate": 4.433489924588948e-05, "loss": 0.7535, "step": 1934 }, { "epoch": 0.7131668663042476, "grad_norm": 6.240298867526674, "learning_rate": 4.4331808629002353e-05, "loss": 0.57, "step": 1935 }, { "epoch": 0.7135354279922602, "grad_norm": 6.747479003330846, "learning_rate": 4.4328718012115225e-05, "loss": 0.9445, "step": 1936 }, { "epoch": 0.7139039896802727, "grad_norm": 4.590856354390801, "learning_rate": 4.432562739522809e-05, "loss": 0.5766, "step": 1937 }, { "epoch": 0.7142725513682853, "grad_norm": 6.559205018231356, "learning_rate": 4.432253677834096e-05, "loss": 1.0476, "step": 1938 }, { "epoch": 0.7146411130562978, "grad_norm": 5.61258735163963, "learning_rate": 4.4319446161453825e-05, "loss": 0.6613, "step": 1939 }, { "epoch": 0.7150096747443103, "grad_norm": 6.070400477966064, "learning_rate": 4.4316355544566696e-05, "loss": 0.865, "step": 1940 }, { "epoch": 0.7153782364323229, "grad_norm": 8.740737372048706, "learning_rate": 4.431326492767957e-05, "loss": 0.8983, "step": 1941 }, { "epoch": 0.7157467981203354, "grad_norm": 3.940443845815642, "learning_rate": 4.431017431079244e-05, "loss": 0.3606, "step": 1942 }, { "epoch": 0.716115359808348, "grad_norm": 6.7820605659356605, "learning_rate": 4.43070836939053e-05, "loss": 0.6986, "step": 1943 }, { "epoch": 0.7164839214963604, "grad_norm": 4.636695713927612, "learning_rate": 4.4303993077018174e-05, "loss": 0.6165, "step": 1944 }, { "epoch": 0.716852483184373, "grad_norm": 5.978838797517392, "learning_rate": 4.4300902460131045e-05, "loss": 0.7375, "step": 1945 }, { "epoch": 0.7172210448723855, "grad_norm": 5.978960341594299, "learning_rate": 4.4297811843243916e-05, "loss": 0.5102, "step": 1946 }, { "epoch": 0.717589606560398, "grad_norm": 5.807163208594193, "learning_rate": 4.429472122635678e-05, "loss": 0.5053, "step": 1947 }, { "epoch": 0.7179581682484106, "grad_norm": 7.5737745244280275, "learning_rate": 4.429163060946965e-05, "loss": 0.6859, "step": 1948 }, { "epoch": 0.7183267299364231, "grad_norm": 6.380320404521663, "learning_rate": 4.428853999258252e-05, "loss": 0.6004, "step": 1949 }, { "epoch": 0.7186952916244357, "grad_norm": 7.356836743805044, "learning_rate": 4.4285449375695394e-05, "loss": 0.797, "step": 1950 }, { "epoch": 0.7190638533124482, "grad_norm": 12.769029366475515, "learning_rate": 4.4282358758808265e-05, "loss": 0.8928, "step": 1951 }, { "epoch": 0.7194324150004607, "grad_norm": 10.846595236979358, "learning_rate": 4.427926814192113e-05, "loss": 0.6863, "step": 1952 }, { "epoch": 0.7198009766884732, "grad_norm": 5.595819016033763, "learning_rate": 4.4276177525034e-05, "loss": 0.6069, "step": 1953 }, { "epoch": 0.7201695383764858, "grad_norm": 9.474299954040964, "learning_rate": 4.4273086908146865e-05, "loss": 0.9154, "step": 1954 }, { "epoch": 0.7205381000644983, "grad_norm": 10.905036924387996, "learning_rate": 4.4269996291259737e-05, "loss": 0.8604, "step": 1955 }, { "epoch": 0.7209066617525108, "grad_norm": 9.299330443203669, "learning_rate": 4.426690567437261e-05, "loss": 0.6243, "step": 1956 }, { "epoch": 0.7212752234405233, "grad_norm": 7.7359629840773945, "learning_rate": 4.426381505748547e-05, "loss": 0.839, "step": 1957 }, { "epoch": 0.7216437851285359, "grad_norm": 6.780031437533702, "learning_rate": 4.426072444059834e-05, "loss": 0.6086, "step": 1958 }, { "epoch": 0.7220123468165485, "grad_norm": 73.02067544859298, "learning_rate": 4.4257633823711215e-05, "loss": 0.5777, "step": 1959 }, { "epoch": 0.7223809085045609, "grad_norm": 4.699465522406677, "learning_rate": 4.4254543206824086e-05, "loss": 0.3394, "step": 1960 }, { "epoch": 0.7227494701925735, "grad_norm": 8.838152836234539, "learning_rate": 4.425145258993696e-05, "loss": 1.0265, "step": 1961 }, { "epoch": 0.723118031880586, "grad_norm": 63.305707079442094, "learning_rate": 4.424836197304982e-05, "loss": 1.0882, "step": 1962 }, { "epoch": 0.7234865935685986, "grad_norm": 6.998827699862548, "learning_rate": 4.424527135616269e-05, "loss": 0.7401, "step": 1963 }, { "epoch": 0.723855155256611, "grad_norm": 7.816405273925477, "learning_rate": 4.4242180739275564e-05, "loss": 0.7703, "step": 1964 }, { "epoch": 0.7242237169446236, "grad_norm": 309.7282579132876, "learning_rate": 4.4239090122388435e-05, "loss": 0.6215, "step": 1965 }, { "epoch": 0.7245922786326361, "grad_norm": 352.69213513700583, "learning_rate": 4.42359995055013e-05, "loss": 0.9367, "step": 1966 }, { "epoch": 0.7249608403206487, "grad_norm": 6.637379146991527, "learning_rate": 4.423290888861417e-05, "loss": 0.8727, "step": 1967 }, { "epoch": 0.7253294020086613, "grad_norm": 7.362422757480263, "learning_rate": 4.422981827172704e-05, "loss": 0.8774, "step": 1968 }, { "epoch": 0.7256979636966737, "grad_norm": 7.682443832355997, "learning_rate": 4.4226727654839906e-05, "loss": 0.767, "step": 1969 }, { "epoch": 0.7260665253846863, "grad_norm": 6.84355769475108, "learning_rate": 4.422363703795278e-05, "loss": 0.8754, "step": 1970 }, { "epoch": 0.7264350870726988, "grad_norm": 10.35691488982874, "learning_rate": 4.422054642106564e-05, "loss": 0.6287, "step": 1971 }, { "epoch": 0.7268036487607114, "grad_norm": 10.54986285803006, "learning_rate": 4.421745580417851e-05, "loss": 0.6726, "step": 1972 }, { "epoch": 0.7271722104487238, "grad_norm": 6.338062160621675, "learning_rate": 4.4214365187291384e-05, "loss": 0.7486, "step": 1973 }, { "epoch": 0.7275407721367364, "grad_norm": 8.290909571244562, "learning_rate": 4.4211274570404255e-05, "loss": 0.7178, "step": 1974 }, { "epoch": 0.7279093338247489, "grad_norm": 7.530095561967543, "learning_rate": 4.4208183953517127e-05, "loss": 0.5784, "step": 1975 }, { "epoch": 0.7282778955127615, "grad_norm": 9.045333704657518, "learning_rate": 4.420509333662999e-05, "loss": 0.6577, "step": 1976 }, { "epoch": 0.728646457200774, "grad_norm": 6.790052756552601, "learning_rate": 4.420200271974286e-05, "loss": 0.7796, "step": 1977 }, { "epoch": 0.7290150188887865, "grad_norm": 6.860348156693995, "learning_rate": 4.419891210285573e-05, "loss": 0.6692, "step": 1978 }, { "epoch": 0.7293835805767991, "grad_norm": 4.9433450988780425, "learning_rate": 4.4195821485968605e-05, "loss": 0.8206, "step": 1979 }, { "epoch": 0.7297521422648116, "grad_norm": 7.262652176593864, "learning_rate": 4.419273086908147e-05, "loss": 0.7613, "step": 1980 }, { "epoch": 0.7301207039528241, "grad_norm": 7.042418701222619, "learning_rate": 4.418964025219434e-05, "loss": 0.7809, "step": 1981 }, { "epoch": 0.7304892656408366, "grad_norm": 5.587895745038724, "learning_rate": 4.418654963530721e-05, "loss": 0.495, "step": 1982 }, { "epoch": 0.7308578273288492, "grad_norm": 4.894071394498973, "learning_rate": 4.418345901842008e-05, "loss": 0.5616, "step": 1983 }, { "epoch": 0.7312263890168617, "grad_norm": 8.956557649385047, "learning_rate": 4.418036840153295e-05, "loss": 0.7178, "step": 1984 }, { "epoch": 0.7315949507048742, "grad_norm": 4.672073206955073, "learning_rate": 4.417727778464582e-05, "loss": 0.5809, "step": 1985 }, { "epoch": 0.7319635123928867, "grad_norm": 8.13631555049979, "learning_rate": 4.417418716775868e-05, "loss": 0.7189, "step": 1986 }, { "epoch": 0.7323320740808993, "grad_norm": 8.450478299841599, "learning_rate": 4.4171096550871554e-05, "loss": 0.5972, "step": 1987 }, { "epoch": 0.7327006357689119, "grad_norm": 8.548995234473052, "learning_rate": 4.4168005933984425e-05, "loss": 0.6756, "step": 1988 }, { "epoch": 0.7330691974569243, "grad_norm": 6.69995525971463, "learning_rate": 4.4164915317097296e-05, "loss": 0.5532, "step": 1989 }, { "epoch": 0.7334377591449369, "grad_norm": 8.204917493890049, "learning_rate": 4.416182470021016e-05, "loss": 0.5886, "step": 1990 }, { "epoch": 0.7338063208329494, "grad_norm": 9.905983521159541, "learning_rate": 4.415873408332303e-05, "loss": 0.9617, "step": 1991 }, { "epoch": 0.734174882520962, "grad_norm": 9.897889469093036, "learning_rate": 4.41556434664359e-05, "loss": 0.9656, "step": 1992 }, { "epoch": 0.7345434442089744, "grad_norm": 5.654344437947623, "learning_rate": 4.4152552849548774e-05, "loss": 0.7585, "step": 1993 }, { "epoch": 0.734912005896987, "grad_norm": 6.981600967915261, "learning_rate": 4.4149462232661645e-05, "loss": 0.8922, "step": 1994 }, { "epoch": 0.7352805675849995, "grad_norm": 6.850853425484353, "learning_rate": 4.414637161577451e-05, "loss": 0.711, "step": 1995 }, { "epoch": 0.7356491292730121, "grad_norm": 8.409920837537792, "learning_rate": 4.414328099888738e-05, "loss": 1.2557, "step": 1996 }, { "epoch": 0.7360176909610247, "grad_norm": 7.804035942366366, "learning_rate": 4.414019038200025e-05, "loss": 0.8624, "step": 1997 }, { "epoch": 0.7363862526490371, "grad_norm": 5.39607330534892, "learning_rate": 4.413709976511312e-05, "loss": 0.6433, "step": 1998 }, { "epoch": 0.7367548143370497, "grad_norm": 6.7752642252731095, "learning_rate": 4.413400914822599e-05, "loss": 0.8539, "step": 1999 }, { "epoch": 0.7371233760250622, "grad_norm": 6.185450975634085, "learning_rate": 4.413091853133885e-05, "loss": 0.732, "step": 2000 }, { "epoch": 0.7374919377130748, "grad_norm": 5.377689132450242, "learning_rate": 4.412782791445172e-05, "loss": 0.5906, "step": 2001 }, { "epoch": 0.7378604994010872, "grad_norm": 5.9400237492044825, "learning_rate": 4.4124737297564595e-05, "loss": 0.7496, "step": 2002 }, { "epoch": 0.7382290610890998, "grad_norm": 7.342169149164394, "learning_rate": 4.4121646680677466e-05, "loss": 0.6771, "step": 2003 }, { "epoch": 0.7385976227771123, "grad_norm": 8.315292462993542, "learning_rate": 4.411855606379034e-05, "loss": 0.7951, "step": 2004 }, { "epoch": 0.7389661844651249, "grad_norm": 5.54312462617772, "learning_rate": 4.41154654469032e-05, "loss": 0.7234, "step": 2005 }, { "epoch": 0.7393347461531374, "grad_norm": 5.763890281993692, "learning_rate": 4.411237483001607e-05, "loss": 0.86, "step": 2006 }, { "epoch": 0.7397033078411499, "grad_norm": 6.090299441237315, "learning_rate": 4.4109284213128944e-05, "loss": 0.6892, "step": 2007 }, { "epoch": 0.7400718695291625, "grad_norm": 5.597598323801817, "learning_rate": 4.4106193596241815e-05, "loss": 0.6836, "step": 2008 }, { "epoch": 0.740440431217175, "grad_norm": 6.116654296086385, "learning_rate": 4.410310297935468e-05, "loss": 0.6926, "step": 2009 }, { "epoch": 0.7408089929051875, "grad_norm": 4.319251704575179, "learning_rate": 4.410001236246755e-05, "loss": 0.6048, "step": 2010 }, { "epoch": 0.7411775545932, "grad_norm": 5.9215232724537135, "learning_rate": 4.409692174558042e-05, "loss": 0.833, "step": 2011 }, { "epoch": 0.7415461162812126, "grad_norm": 7.103266445485248, "learning_rate": 4.409383112869329e-05, "loss": 0.6533, "step": 2012 }, { "epoch": 0.7419146779692251, "grad_norm": 6.6025629673663175, "learning_rate": 4.4090740511806164e-05, "loss": 0.8383, "step": 2013 }, { "epoch": 0.7422832396572376, "grad_norm": 5.382574377897044, "learning_rate": 4.408764989491903e-05, "loss": 0.5606, "step": 2014 }, { "epoch": 0.7426518013452501, "grad_norm": 7.036110516207734, "learning_rate": 4.408455927803189e-05, "loss": 0.7754, "step": 2015 }, { "epoch": 0.7430203630332627, "grad_norm": 5.166550952887167, "learning_rate": 4.4081468661144764e-05, "loss": 0.624, "step": 2016 }, { "epoch": 0.7433889247212753, "grad_norm": 6.593993557594492, "learning_rate": 4.4078378044257635e-05, "loss": 0.7494, "step": 2017 }, { "epoch": 0.7437574864092877, "grad_norm": 7.211004949152994, "learning_rate": 4.4075287427370506e-05, "loss": 0.7006, "step": 2018 }, { "epoch": 0.7441260480973003, "grad_norm": 5.3692997714227495, "learning_rate": 4.407219681048337e-05, "loss": 0.6521, "step": 2019 }, { "epoch": 0.7444946097853128, "grad_norm": 8.554111353315543, "learning_rate": 4.406910619359624e-05, "loss": 0.7637, "step": 2020 }, { "epoch": 0.7448631714733254, "grad_norm": 9.828632675182844, "learning_rate": 4.406601557670911e-05, "loss": 1.1209, "step": 2021 }, { "epoch": 0.7452317331613378, "grad_norm": 6.839026924652589, "learning_rate": 4.4062924959821984e-05, "loss": 0.4256, "step": 2022 }, { "epoch": 0.7456002948493504, "grad_norm": 6.847905817614175, "learning_rate": 4.4059834342934856e-05, "loss": 0.881, "step": 2023 }, { "epoch": 0.7459688565373629, "grad_norm": 7.438123147969216, "learning_rate": 4.405674372604772e-05, "loss": 0.99, "step": 2024 }, { "epoch": 0.7463374182253755, "grad_norm": 7.417902411334489, "learning_rate": 4.405365310916059e-05, "loss": 0.5676, "step": 2025 }, { "epoch": 0.746705979913388, "grad_norm": 7.2453535251965535, "learning_rate": 4.405056249227346e-05, "loss": 0.5933, "step": 2026 }, { "epoch": 0.7470745416014005, "grad_norm": 10.5873078420016, "learning_rate": 4.4047471875386334e-05, "loss": 1.0014, "step": 2027 }, { "epoch": 0.7474431032894131, "grad_norm": 7.597623112074262, "learning_rate": 4.40443812584992e-05, "loss": 1.0256, "step": 2028 }, { "epoch": 0.7478116649774256, "grad_norm": 7.589310969462968, "learning_rate": 4.404129064161206e-05, "loss": 0.5359, "step": 2029 }, { "epoch": 0.7481802266654382, "grad_norm": 6.213864348208726, "learning_rate": 4.4038200024724934e-05, "loss": 0.7283, "step": 2030 }, { "epoch": 0.7485487883534506, "grad_norm": 6.644643200666245, "learning_rate": 4.4035109407837805e-05, "loss": 0.7374, "step": 2031 }, { "epoch": 0.7489173500414632, "grad_norm": 6.979648159745439, "learning_rate": 4.4032018790950676e-05, "loss": 0.6925, "step": 2032 }, { "epoch": 0.7492859117294757, "grad_norm": 7.817932681414749, "learning_rate": 4.402892817406355e-05, "loss": 0.9765, "step": 2033 }, { "epoch": 0.7496544734174883, "grad_norm": 5.845212702344075, "learning_rate": 4.402583755717641e-05, "loss": 0.6293, "step": 2034 }, { "epoch": 0.7500230351055008, "grad_norm": 11.304627772280883, "learning_rate": 4.402274694028928e-05, "loss": 0.6398, "step": 2035 }, { "epoch": 0.7503915967935133, "grad_norm": 5.993115289870999, "learning_rate": 4.4019656323402154e-05, "loss": 0.5597, "step": 2036 }, { "epoch": 0.7507601584815259, "grad_norm": 6.053394834287236, "learning_rate": 4.4016565706515025e-05, "loss": 0.816, "step": 2037 }, { "epoch": 0.7511287201695384, "grad_norm": 5.26148058515833, "learning_rate": 4.401347508962789e-05, "loss": 0.6834, "step": 2038 }, { "epoch": 0.7514972818575509, "grad_norm": 10.704533081989982, "learning_rate": 4.401038447274076e-05, "loss": 0.5357, "step": 2039 }, { "epoch": 0.7518658435455634, "grad_norm": 5.603672539956149, "learning_rate": 4.400729385585363e-05, "loss": 0.7372, "step": 2040 }, { "epoch": 0.752234405233576, "grad_norm": 12.114331143029265, "learning_rate": 4.40042032389665e-05, "loss": 0.717, "step": 2041 }, { "epoch": 0.7526029669215885, "grad_norm": 12.941366161649348, "learning_rate": 4.4001112622079374e-05, "loss": 0.4824, "step": 2042 }, { "epoch": 0.752971528609601, "grad_norm": 11.869829517866005, "learning_rate": 4.399802200519224e-05, "loss": 0.6612, "step": 2043 }, { "epoch": 0.7533400902976136, "grad_norm": 6.712732361478144, "learning_rate": 4.39949313883051e-05, "loss": 0.8897, "step": 2044 }, { "epoch": 0.7537086519856261, "grad_norm": 7.1304321996981495, "learning_rate": 4.3991840771417974e-05, "loss": 0.5739, "step": 2045 }, { "epoch": 0.7540772136736387, "grad_norm": 5.871704475886745, "learning_rate": 4.3988750154530846e-05, "loss": 0.7222, "step": 2046 }, { "epoch": 0.7544457753616511, "grad_norm": 7.679713826410878, "learning_rate": 4.398565953764372e-05, "loss": 0.8067, "step": 2047 }, { "epoch": 0.7548143370496637, "grad_norm": 7.107560347476311, "learning_rate": 4.398256892075658e-05, "loss": 0.8133, "step": 2048 }, { "epoch": 0.7551828987376762, "grad_norm": 9.489829793354353, "learning_rate": 4.397947830386945e-05, "loss": 0.5447, "step": 2049 }, { "epoch": 0.7555514604256888, "grad_norm": 6.5167806135721404, "learning_rate": 4.3976387686982324e-05, "loss": 0.632, "step": 2050 }, { "epoch": 0.7559200221137012, "grad_norm": 7.2480034545486, "learning_rate": 4.3973297070095195e-05, "loss": 0.7298, "step": 2051 }, { "epoch": 0.7562885838017138, "grad_norm": 7.511901567588117, "learning_rate": 4.397020645320806e-05, "loss": 0.8209, "step": 2052 }, { "epoch": 0.7566571454897263, "grad_norm": 9.116670752413414, "learning_rate": 4.396711583632093e-05, "loss": 0.6431, "step": 2053 }, { "epoch": 0.7570257071777389, "grad_norm": 7.004326164733668, "learning_rate": 4.39640252194338e-05, "loss": 0.9971, "step": 2054 }, { "epoch": 0.7573942688657515, "grad_norm": 6.9276740795057234, "learning_rate": 4.396093460254667e-05, "loss": 0.6384, "step": 2055 }, { "epoch": 0.7577628305537639, "grad_norm": 7.494531799717222, "learning_rate": 4.3957843985659544e-05, "loss": 0.7218, "step": 2056 }, { "epoch": 0.7581313922417765, "grad_norm": 7.0503865562138515, "learning_rate": 4.395475336877241e-05, "loss": 0.6606, "step": 2057 }, { "epoch": 0.758499953929789, "grad_norm": 6.112932423154913, "learning_rate": 4.395166275188528e-05, "loss": 0.6468, "step": 2058 }, { "epoch": 0.7588685156178016, "grad_norm": 6.0143448693738595, "learning_rate": 4.3948572134998144e-05, "loss": 0.6575, "step": 2059 }, { "epoch": 0.759237077305814, "grad_norm": 7.7555986916339235, "learning_rate": 4.3945481518111015e-05, "loss": 0.8413, "step": 2060 }, { "epoch": 0.7596056389938266, "grad_norm": 8.14831905699512, "learning_rate": 4.3942390901223886e-05, "loss": 0.7163, "step": 2061 }, { "epoch": 0.7599742006818391, "grad_norm": 6.706073388212057, "learning_rate": 4.393930028433675e-05, "loss": 0.6909, "step": 2062 }, { "epoch": 0.7603427623698517, "grad_norm": 7.150712315216592, "learning_rate": 4.393620966744962e-05, "loss": 0.6929, "step": 2063 }, { "epoch": 0.7607113240578642, "grad_norm": 7.070126896735129, "learning_rate": 4.393311905056249e-05, "loss": 0.7363, "step": 2064 }, { "epoch": 0.7610798857458767, "grad_norm": 7.404807163486296, "learning_rate": 4.3930028433675364e-05, "loss": 0.7478, "step": 2065 }, { "epoch": 0.7614484474338893, "grad_norm": 6.066748634595085, "learning_rate": 4.3926937816788236e-05, "loss": 0.7378, "step": 2066 }, { "epoch": 0.7618170091219018, "grad_norm": 8.599774610537812, "learning_rate": 4.39238471999011e-05, "loss": 0.7666, "step": 2067 }, { "epoch": 0.7621855708099143, "grad_norm": 6.95215989176346, "learning_rate": 4.392075658301397e-05, "loss": 0.7256, "step": 2068 }, { "epoch": 0.7625541324979268, "grad_norm": 5.820141435195849, "learning_rate": 4.391766596612684e-05, "loss": 0.5547, "step": 2069 }, { "epoch": 0.7629226941859394, "grad_norm": 5.257753731794522, "learning_rate": 4.3914575349239714e-05, "loss": 0.5551, "step": 2070 }, { "epoch": 0.7632912558739519, "grad_norm": 5.236670511698794, "learning_rate": 4.391148473235258e-05, "loss": 0.4763, "step": 2071 }, { "epoch": 0.7636598175619644, "grad_norm": 6.216993534334504, "learning_rate": 4.390839411546545e-05, "loss": 0.6335, "step": 2072 }, { "epoch": 0.764028379249977, "grad_norm": 9.94582339860164, "learning_rate": 4.390530349857832e-05, "loss": 0.9029, "step": 2073 }, { "epoch": 0.7643969409379895, "grad_norm": 5.3217606555170605, "learning_rate": 4.390221288169119e-05, "loss": 0.6509, "step": 2074 }, { "epoch": 0.7647655026260021, "grad_norm": 8.148981004914482, "learning_rate": 4.3899122264804056e-05, "loss": 0.866, "step": 2075 }, { "epoch": 0.7651340643140145, "grad_norm": 5.643815340358802, "learning_rate": 4.389603164791693e-05, "loss": 0.6139, "step": 2076 }, { "epoch": 0.7655026260020271, "grad_norm": 5.284003087541716, "learning_rate": 4.389294103102979e-05, "loss": 0.6314, "step": 2077 }, { "epoch": 0.7658711876900396, "grad_norm": 5.406750429839654, "learning_rate": 4.388985041414266e-05, "loss": 0.5238, "step": 2078 }, { "epoch": 0.7662397493780522, "grad_norm": 6.930869843469099, "learning_rate": 4.3886759797255534e-05, "loss": 0.9019, "step": 2079 }, { "epoch": 0.7666083110660646, "grad_norm": 8.012182496634422, "learning_rate": 4.3883669180368405e-05, "loss": 0.7973, "step": 2080 }, { "epoch": 0.7669768727540772, "grad_norm": 5.555696248286317, "learning_rate": 4.388057856348127e-05, "loss": 0.7119, "step": 2081 }, { "epoch": 0.7673454344420897, "grad_norm": 4.922575594978837, "learning_rate": 4.387748794659414e-05, "loss": 0.4709, "step": 2082 }, { "epoch": 0.7677139961301023, "grad_norm": 5.712137370212804, "learning_rate": 4.387439732970701e-05, "loss": 0.7579, "step": 2083 }, { "epoch": 0.7680825578181149, "grad_norm": 7.811693317727878, "learning_rate": 4.387130671281988e-05, "loss": 0.6957, "step": 2084 }, { "epoch": 0.7684511195061273, "grad_norm": 8.69026326754142, "learning_rate": 4.3868216095932754e-05, "loss": 0.7484, "step": 2085 }, { "epoch": 0.7688196811941399, "grad_norm": 9.0612366946798, "learning_rate": 4.386512547904562e-05, "loss": 0.752, "step": 2086 }, { "epoch": 0.7691882428821524, "grad_norm": 8.480315359477583, "learning_rate": 4.386203486215849e-05, "loss": 0.8339, "step": 2087 }, { "epoch": 0.769556804570165, "grad_norm": 4.890857825054065, "learning_rate": 4.385894424527136e-05, "loss": 0.6043, "step": 2088 }, { "epoch": 0.7699253662581774, "grad_norm": 8.530609665758435, "learning_rate": 4.385585362838423e-05, "loss": 0.6629, "step": 2089 }, { "epoch": 0.77029392794619, "grad_norm": 5.748276161859989, "learning_rate": 4.38527630114971e-05, "loss": 0.5682, "step": 2090 }, { "epoch": 0.7706624896342025, "grad_norm": 6.135313420703434, "learning_rate": 4.384967239460996e-05, "loss": 0.6077, "step": 2091 }, { "epoch": 0.771031051322215, "grad_norm": 5.696465771865839, "learning_rate": 4.384658177772283e-05, "loss": 0.711, "step": 2092 }, { "epoch": 0.7713996130102276, "grad_norm": 5.916249041078684, "learning_rate": 4.3843491160835704e-05, "loss": 0.7587, "step": 2093 }, { "epoch": 0.7717681746982401, "grad_norm": 6.607046885143201, "learning_rate": 4.3840400543948575e-05, "loss": 0.6264, "step": 2094 }, { "epoch": 0.7721367363862527, "grad_norm": 5.816360996088225, "learning_rate": 4.3837309927061446e-05, "loss": 0.7796, "step": 2095 }, { "epoch": 0.7725052980742652, "grad_norm": 10.057225331674255, "learning_rate": 4.383421931017431e-05, "loss": 0.6327, "step": 2096 }, { "epoch": 0.7728738597622777, "grad_norm": 8.600828938122858, "learning_rate": 4.383112869328718e-05, "loss": 0.6045, "step": 2097 }, { "epoch": 0.7732424214502902, "grad_norm": 7.2326378600783725, "learning_rate": 4.382803807640005e-05, "loss": 0.6498, "step": 2098 }, { "epoch": 0.7736109831383028, "grad_norm": 8.552921924682057, "learning_rate": 4.3824947459512924e-05, "loss": 1.0744, "step": 2099 }, { "epoch": 0.7739795448263153, "grad_norm": 5.142662131684249, "learning_rate": 4.382185684262579e-05, "loss": 0.5179, "step": 2100 }, { "epoch": 0.7743481065143278, "grad_norm": 7.225020422461134, "learning_rate": 4.381876622573866e-05, "loss": 0.973, "step": 2101 }, { "epoch": 0.7747166682023404, "grad_norm": 7.564023707715286, "learning_rate": 4.381567560885153e-05, "loss": 0.8372, "step": 2102 }, { "epoch": 0.7750852298903529, "grad_norm": 9.31891058125287, "learning_rate": 4.38125849919644e-05, "loss": 0.6842, "step": 2103 }, { "epoch": 0.7754537915783655, "grad_norm": 14.387805310414462, "learning_rate": 4.380949437507727e-05, "loss": 0.7654, "step": 2104 }, { "epoch": 0.7758223532663779, "grad_norm": 9.215093786349161, "learning_rate": 4.380640375819014e-05, "loss": 0.793, "step": 2105 }, { "epoch": 0.7761909149543905, "grad_norm": 4.936479293973005, "learning_rate": 4.3803313141303e-05, "loss": 0.575, "step": 2106 }, { "epoch": 0.776559476642403, "grad_norm": 7.751412539808773, "learning_rate": 4.380022252441587e-05, "loss": 0.8372, "step": 2107 }, { "epoch": 0.7769280383304156, "grad_norm": 5.261981190095787, "learning_rate": 4.3797131907528744e-05, "loss": 0.5664, "step": 2108 }, { "epoch": 0.777296600018428, "grad_norm": 5.940420617004624, "learning_rate": 4.3794041290641616e-05, "loss": 0.7285, "step": 2109 }, { "epoch": 0.7776651617064406, "grad_norm": 7.015646205946514, "learning_rate": 4.379095067375448e-05, "loss": 0.777, "step": 2110 }, { "epoch": 0.7780337233944531, "grad_norm": 6.725590971325436, "learning_rate": 4.378786005686735e-05, "loss": 0.6526, "step": 2111 }, { "epoch": 0.7784022850824657, "grad_norm": 5.029820209422135, "learning_rate": 4.378476943998022e-05, "loss": 0.581, "step": 2112 }, { "epoch": 0.7787708467704783, "grad_norm": 7.576262758461715, "learning_rate": 4.3781678823093094e-05, "loss": 0.6178, "step": 2113 }, { "epoch": 0.7791394084584907, "grad_norm": 11.268480846302934, "learning_rate": 4.3778588206205965e-05, "loss": 0.688, "step": 2114 }, { "epoch": 0.7795079701465033, "grad_norm": 5.664699587015432, "learning_rate": 4.377549758931883e-05, "loss": 0.573, "step": 2115 }, { "epoch": 0.7798765318345158, "grad_norm": 5.011487733982927, "learning_rate": 4.37724069724317e-05, "loss": 0.5381, "step": 2116 }, { "epoch": 0.7802450935225284, "grad_norm": 6.4859196672996235, "learning_rate": 4.376931635554457e-05, "loss": 0.6906, "step": 2117 }, { "epoch": 0.7806136552105408, "grad_norm": 7.85581838170213, "learning_rate": 4.376622573865744e-05, "loss": 0.7979, "step": 2118 }, { "epoch": 0.7809822168985534, "grad_norm": 6.424138990368913, "learning_rate": 4.376313512177031e-05, "loss": 0.6508, "step": 2119 }, { "epoch": 0.7813507785865659, "grad_norm": 7.143084680475334, "learning_rate": 4.376004450488317e-05, "loss": 0.9199, "step": 2120 }, { "epoch": 0.7817193402745785, "grad_norm": 5.882484051982306, "learning_rate": 4.375695388799604e-05, "loss": 0.4834, "step": 2121 }, { "epoch": 0.782087901962591, "grad_norm": 7.2023867148083935, "learning_rate": 4.3753863271108914e-05, "loss": 0.9503, "step": 2122 }, { "epoch": 0.7824564636506035, "grad_norm": 7.850386650136452, "learning_rate": 4.3750772654221785e-05, "loss": 0.537, "step": 2123 }, { "epoch": 0.7828250253386161, "grad_norm": 7.895216891768484, "learning_rate": 4.3747682037334656e-05, "loss": 0.6721, "step": 2124 }, { "epoch": 0.7831935870266286, "grad_norm": 7.126535885892259, "learning_rate": 4.374459142044752e-05, "loss": 0.7274, "step": 2125 }, { "epoch": 0.7835621487146411, "grad_norm": 6.815254424531322, "learning_rate": 4.374150080356039e-05, "loss": 0.4679, "step": 2126 }, { "epoch": 0.7839307104026536, "grad_norm": 8.903109883663458, "learning_rate": 4.373841018667326e-05, "loss": 0.7846, "step": 2127 }, { "epoch": 0.7842992720906662, "grad_norm": 6.882999828963227, "learning_rate": 4.3735319569786134e-05, "loss": 0.6305, "step": 2128 }, { "epoch": 0.7846678337786787, "grad_norm": 7.818755308251595, "learning_rate": 4.3732228952899e-05, "loss": 0.6235, "step": 2129 }, { "epoch": 0.7850363954666912, "grad_norm": 11.613781838353397, "learning_rate": 4.372913833601187e-05, "loss": 0.7481, "step": 2130 }, { "epoch": 0.7854049571547038, "grad_norm": 9.088158635299623, "learning_rate": 4.372604771912474e-05, "loss": 0.5868, "step": 2131 }, { "epoch": 0.7857735188427163, "grad_norm": 6.961021619774414, "learning_rate": 4.372295710223761e-05, "loss": 0.8569, "step": 2132 }, { "epoch": 0.7861420805307289, "grad_norm": 6.604894684176463, "learning_rate": 4.3719866485350484e-05, "loss": 0.5009, "step": 2133 }, { "epoch": 0.7865106422187413, "grad_norm": 4.894422525353676, "learning_rate": 4.371677586846335e-05, "loss": 0.5679, "step": 2134 }, { "epoch": 0.7868792039067539, "grad_norm": 6.098323103750274, "learning_rate": 4.371368525157621e-05, "loss": 0.621, "step": 2135 }, { "epoch": 0.7872477655947664, "grad_norm": 7.597641438369238, "learning_rate": 4.3710594634689084e-05, "loss": 0.6751, "step": 2136 }, { "epoch": 0.787616327282779, "grad_norm": 7.4066607526360135, "learning_rate": 4.3707504017801955e-05, "loss": 0.7053, "step": 2137 }, { "epoch": 0.7879848889707914, "grad_norm": 6.837700107582261, "learning_rate": 4.3704413400914826e-05, "loss": 0.7842, "step": 2138 }, { "epoch": 0.788353450658804, "grad_norm": 5.987728922057425, "learning_rate": 4.370132278402769e-05, "loss": 0.5241, "step": 2139 }, { "epoch": 0.7887220123468166, "grad_norm": 7.019254633357459, "learning_rate": 4.369823216714056e-05, "loss": 0.5634, "step": 2140 }, { "epoch": 0.7890905740348291, "grad_norm": 5.82011587335655, "learning_rate": 4.369514155025343e-05, "loss": 0.5629, "step": 2141 }, { "epoch": 0.7894591357228417, "grad_norm": 6.790698803545811, "learning_rate": 4.3692050933366304e-05, "loss": 0.7378, "step": 2142 }, { "epoch": 0.7898276974108541, "grad_norm": 5.447861369079072, "learning_rate": 4.368896031647917e-05, "loss": 0.8462, "step": 2143 }, { "epoch": 0.7901962590988667, "grad_norm": 7.987919507778239, "learning_rate": 4.368586969959204e-05, "loss": 0.8239, "step": 2144 }, { "epoch": 0.7905648207868792, "grad_norm": 4.431423701445001, "learning_rate": 4.368277908270491e-05, "loss": 0.5194, "step": 2145 }, { "epoch": 0.7909333824748918, "grad_norm": 3.909372409298265, "learning_rate": 4.367968846581778e-05, "loss": 0.4267, "step": 2146 }, { "epoch": 0.7913019441629042, "grad_norm": 8.561391494167653, "learning_rate": 4.367659784893065e-05, "loss": 0.4716, "step": 2147 }, { "epoch": 0.7916705058509168, "grad_norm": 7.1265637204353816, "learning_rate": 4.367350723204352e-05, "loss": 0.7704, "step": 2148 }, { "epoch": 0.7920390675389293, "grad_norm": 6.777940355584961, "learning_rate": 4.367041661515639e-05, "loss": 0.8702, "step": 2149 }, { "epoch": 0.7924076292269419, "grad_norm": 5.557704272649505, "learning_rate": 4.366732599826925e-05, "loss": 0.8106, "step": 2150 }, { "epoch": 0.7927761909149544, "grad_norm": 6.311050097449673, "learning_rate": 4.3664235381382124e-05, "loss": 0.8049, "step": 2151 }, { "epoch": 0.7931447526029669, "grad_norm": 6.09068495132276, "learning_rate": 4.3661144764494996e-05, "loss": 0.5826, "step": 2152 }, { "epoch": 0.7935133142909795, "grad_norm": 7.569198290198749, "learning_rate": 4.365805414760786e-05, "loss": 0.7776, "step": 2153 }, { "epoch": 0.793881875978992, "grad_norm": 10.036650062766055, "learning_rate": 4.365496353072073e-05, "loss": 0.6027, "step": 2154 }, { "epoch": 0.7942504376670045, "grad_norm": 4.701851403033802, "learning_rate": 4.36518729138336e-05, "loss": 0.4507, "step": 2155 }, { "epoch": 0.794618999355017, "grad_norm": 9.47320351189776, "learning_rate": 4.3648782296946473e-05, "loss": 1.1022, "step": 2156 }, { "epoch": 0.7949875610430296, "grad_norm": 5.736776654557622, "learning_rate": 4.3645691680059345e-05, "loss": 0.7326, "step": 2157 }, { "epoch": 0.7953561227310421, "grad_norm": 7.173856041924308, "learning_rate": 4.364260106317221e-05, "loss": 0.6926, "step": 2158 }, { "epoch": 0.7957246844190546, "grad_norm": 7.557044212939879, "learning_rate": 4.363951044628508e-05, "loss": 0.8535, "step": 2159 }, { "epoch": 0.7960932461070672, "grad_norm": 6.297216363556469, "learning_rate": 4.363641982939795e-05, "loss": 0.6505, "step": 2160 }, { "epoch": 0.7964618077950797, "grad_norm": 12.452839502295124, "learning_rate": 4.363332921251082e-05, "loss": 0.7794, "step": 2161 }, { "epoch": 0.7968303694830923, "grad_norm": 7.4098370714074875, "learning_rate": 4.363023859562369e-05, "loss": 0.7392, "step": 2162 }, { "epoch": 0.7971989311711047, "grad_norm": 7.16204892214397, "learning_rate": 4.362714797873656e-05, "loss": 0.6696, "step": 2163 }, { "epoch": 0.7975674928591173, "grad_norm": 7.32851280373793, "learning_rate": 4.362405736184943e-05, "loss": 0.7686, "step": 2164 }, { "epoch": 0.7979360545471298, "grad_norm": 12.07232740059627, "learning_rate": 4.3620966744962294e-05, "loss": 0.8746, "step": 2165 }, { "epoch": 0.7983046162351424, "grad_norm": 7.941060866966774, "learning_rate": 4.3617876128075165e-05, "loss": 0.6771, "step": 2166 }, { "epoch": 0.7986731779231548, "grad_norm": 5.943935400220311, "learning_rate": 4.3614785511188036e-05, "loss": 0.5561, "step": 2167 }, { "epoch": 0.7990417396111674, "grad_norm": 7.079496473952581, "learning_rate": 4.36116948943009e-05, "loss": 0.6182, "step": 2168 }, { "epoch": 0.79941030129918, "grad_norm": 7.044220621114556, "learning_rate": 4.360860427741377e-05, "loss": 0.7131, "step": 2169 }, { "epoch": 0.7997788629871925, "grad_norm": 7.687248411946413, "learning_rate": 4.360551366052664e-05, "loss": 0.8087, "step": 2170 }, { "epoch": 0.800147424675205, "grad_norm": 6.040483907047417, "learning_rate": 4.3602423043639514e-05, "loss": 0.6406, "step": 2171 }, { "epoch": 0.8005159863632175, "grad_norm": 5.352317322451542, "learning_rate": 4.359933242675238e-05, "loss": 0.8192, "step": 2172 }, { "epoch": 0.8008845480512301, "grad_norm": 7.79046746747693, "learning_rate": 4.359624180986525e-05, "loss": 0.5099, "step": 2173 }, { "epoch": 0.8012531097392426, "grad_norm": 5.91147926987849, "learning_rate": 4.359315119297812e-05, "loss": 0.6767, "step": 2174 }, { "epoch": 0.8016216714272552, "grad_norm": 10.585663986895533, "learning_rate": 4.359006057609099e-05, "loss": 0.5492, "step": 2175 }, { "epoch": 0.8019902331152676, "grad_norm": 5.974603947876744, "learning_rate": 4.3586969959203863e-05, "loss": 0.9769, "step": 2176 }, { "epoch": 0.8023587948032802, "grad_norm": 7.325280912565609, "learning_rate": 4.358387934231673e-05, "loss": 0.7578, "step": 2177 }, { "epoch": 0.8027273564912927, "grad_norm": 5.0952427086938705, "learning_rate": 4.35807887254296e-05, "loss": 0.5773, "step": 2178 }, { "epoch": 0.8030959181793053, "grad_norm": 6.867004739284477, "learning_rate": 4.357769810854247e-05, "loss": 0.6989, "step": 2179 }, { "epoch": 0.8034644798673178, "grad_norm": 7.133675897994452, "learning_rate": 4.3574607491655335e-05, "loss": 0.7007, "step": 2180 }, { "epoch": 0.8038330415553303, "grad_norm": 5.762706814114306, "learning_rate": 4.3571516874768206e-05, "loss": 0.5537, "step": 2181 }, { "epoch": 0.8042016032433429, "grad_norm": 7.539652462343983, "learning_rate": 4.356842625788107e-05, "loss": 0.6056, "step": 2182 }, { "epoch": 0.8045701649313554, "grad_norm": 6.774410050385327, "learning_rate": 4.356533564099394e-05, "loss": 0.5296, "step": 2183 }, { "epoch": 0.8049387266193679, "grad_norm": 7.368711214877519, "learning_rate": 4.356224502410681e-05, "loss": 0.8108, "step": 2184 }, { "epoch": 0.8053072883073804, "grad_norm": 11.431267379488908, "learning_rate": 4.3559154407219684e-05, "loss": 0.6563, "step": 2185 }, { "epoch": 0.805675849995393, "grad_norm": 10.015272303901131, "learning_rate": 4.3556063790332555e-05, "loss": 0.8289, "step": 2186 }, { "epoch": 0.8060444116834055, "grad_norm": 7.9103206965931046, "learning_rate": 4.355297317344542e-05, "loss": 0.7262, "step": 2187 }, { "epoch": 0.806412973371418, "grad_norm": 9.350836372707022, "learning_rate": 4.354988255655829e-05, "loss": 0.9545, "step": 2188 }, { "epoch": 0.8067815350594306, "grad_norm": 4.222622146962774, "learning_rate": 4.354679193967116e-05, "loss": 0.5737, "step": 2189 }, { "epoch": 0.8071500967474431, "grad_norm": 5.467386827871788, "learning_rate": 4.354370132278403e-05, "loss": 0.6386, "step": 2190 }, { "epoch": 0.8075186584354557, "grad_norm": 6.150419465933489, "learning_rate": 4.35406107058969e-05, "loss": 0.6163, "step": 2191 }, { "epoch": 0.8078872201234681, "grad_norm": 8.124209321503809, "learning_rate": 4.353752008900977e-05, "loss": 1.0397, "step": 2192 }, { "epoch": 0.8082557818114807, "grad_norm": 8.043456305037946, "learning_rate": 4.353442947212264e-05, "loss": 0.9237, "step": 2193 }, { "epoch": 0.8086243434994932, "grad_norm": 6.302359414687053, "learning_rate": 4.353133885523551e-05, "loss": 0.7947, "step": 2194 }, { "epoch": 0.8089929051875058, "grad_norm": 5.108346818135537, "learning_rate": 4.352824823834838e-05, "loss": 0.528, "step": 2195 }, { "epoch": 0.8093614668755182, "grad_norm": 7.738326602592995, "learning_rate": 4.352515762146125e-05, "loss": 0.8813, "step": 2196 }, { "epoch": 0.8097300285635308, "grad_norm": 6.271887030635105, "learning_rate": 4.352206700457411e-05, "loss": 0.6821, "step": 2197 }, { "epoch": 0.8100985902515434, "grad_norm": 6.5702863731273045, "learning_rate": 4.351897638768698e-05, "loss": 0.6142, "step": 2198 }, { "epoch": 0.8104671519395559, "grad_norm": 8.01155971776583, "learning_rate": 4.3515885770799853e-05, "loss": 0.8216, "step": 2199 }, { "epoch": 0.8108357136275685, "grad_norm": 6.287233757312654, "learning_rate": 4.3512795153912725e-05, "loss": 0.7263, "step": 2200 }, { "epoch": 0.8112042753155809, "grad_norm": 8.009914929799017, "learning_rate": 4.350970453702559e-05, "loss": 0.6261, "step": 2201 }, { "epoch": 0.8115728370035935, "grad_norm": 7.522602355808346, "learning_rate": 4.350661392013846e-05, "loss": 0.8258, "step": 2202 }, { "epoch": 0.811941398691606, "grad_norm": 5.802119676714702, "learning_rate": 4.350352330325133e-05, "loss": 0.657, "step": 2203 }, { "epoch": 0.8123099603796186, "grad_norm": 11.120445433552785, "learning_rate": 4.35004326863642e-05, "loss": 0.7175, "step": 2204 }, { "epoch": 0.812678522067631, "grad_norm": 5.722810231503469, "learning_rate": 4.3497342069477074e-05, "loss": 0.894, "step": 2205 }, { "epoch": 0.8130470837556436, "grad_norm": 5.384416006400366, "learning_rate": 4.349425145258994e-05, "loss": 0.5387, "step": 2206 }, { "epoch": 0.8134156454436561, "grad_norm": 6.047875136827338, "learning_rate": 4.349116083570281e-05, "loss": 0.6287, "step": 2207 }, { "epoch": 0.8137842071316687, "grad_norm": 7.472156976590662, "learning_rate": 4.348807021881568e-05, "loss": 0.8181, "step": 2208 }, { "epoch": 0.8141527688196812, "grad_norm": 6.375618100588427, "learning_rate": 4.348497960192855e-05, "loss": 0.8237, "step": 2209 }, { "epoch": 0.8145213305076937, "grad_norm": 6.9768309675038775, "learning_rate": 4.3481888985041416e-05, "loss": 0.6588, "step": 2210 }, { "epoch": 0.8148898921957063, "grad_norm": 5.923424993495271, "learning_rate": 4.347879836815428e-05, "loss": 0.5394, "step": 2211 }, { "epoch": 0.8152584538837188, "grad_norm": 7.622967558596459, "learning_rate": 4.347570775126715e-05, "loss": 0.8343, "step": 2212 }, { "epoch": 0.8156270155717313, "grad_norm": 5.874705855128182, "learning_rate": 4.347261713438002e-05, "loss": 0.6914, "step": 2213 }, { "epoch": 0.8159955772597438, "grad_norm": 7.534071777520316, "learning_rate": 4.3469526517492894e-05, "loss": 0.5648, "step": 2214 }, { "epoch": 0.8163641389477564, "grad_norm": 7.521861956164687, "learning_rate": 4.346643590060576e-05, "loss": 0.5347, "step": 2215 }, { "epoch": 0.8167327006357689, "grad_norm": 6.9447761388612435, "learning_rate": 4.346334528371863e-05, "loss": 0.5743, "step": 2216 }, { "epoch": 0.8171012623237814, "grad_norm": 6.944300162617912, "learning_rate": 4.34602546668315e-05, "loss": 0.8638, "step": 2217 }, { "epoch": 0.817469824011794, "grad_norm": 5.325804442771672, "learning_rate": 4.345716404994437e-05, "loss": 0.4826, "step": 2218 }, { "epoch": 0.8178383856998065, "grad_norm": 6.5737664287541016, "learning_rate": 4.3454073433057243e-05, "loss": 0.5757, "step": 2219 }, { "epoch": 0.8182069473878191, "grad_norm": 4.846888159334973, "learning_rate": 4.345098281617011e-05, "loss": 0.7225, "step": 2220 }, { "epoch": 0.8185755090758315, "grad_norm": 7.0603507493507305, "learning_rate": 4.344789219928298e-05, "loss": 0.5951, "step": 2221 }, { "epoch": 0.8189440707638441, "grad_norm": 5.385236352180622, "learning_rate": 4.344480158239585e-05, "loss": 0.6169, "step": 2222 }, { "epoch": 0.8193126324518566, "grad_norm": 6.174150959904009, "learning_rate": 4.344171096550872e-05, "loss": 0.678, "step": 2223 }, { "epoch": 0.8196811941398692, "grad_norm": 8.64059718646631, "learning_rate": 4.343862034862159e-05, "loss": 0.8125, "step": 2224 }, { "epoch": 0.8200497558278816, "grad_norm": 5.468595840461114, "learning_rate": 4.343552973173446e-05, "loss": 0.6357, "step": 2225 }, { "epoch": 0.8204183175158942, "grad_norm": 5.537008789871236, "learning_rate": 4.343243911484732e-05, "loss": 0.5234, "step": 2226 }, { "epoch": 0.8207868792039068, "grad_norm": 10.318112597311622, "learning_rate": 4.342934849796019e-05, "loss": 0.6411, "step": 2227 }, { "epoch": 0.8211554408919193, "grad_norm": 6.521803161196194, "learning_rate": 4.3426257881073064e-05, "loss": 0.6484, "step": 2228 }, { "epoch": 0.8215240025799319, "grad_norm": 8.750038364871251, "learning_rate": 4.3423167264185935e-05, "loss": 0.6508, "step": 2229 }, { "epoch": 0.8218925642679443, "grad_norm": 5.523734121168164, "learning_rate": 4.34200766472988e-05, "loss": 0.6484, "step": 2230 }, { "epoch": 0.8222611259559569, "grad_norm": 6.289670547566369, "learning_rate": 4.341698603041167e-05, "loss": 0.5666, "step": 2231 }, { "epoch": 0.8226296876439694, "grad_norm": 5.9471921819151605, "learning_rate": 4.341389541352454e-05, "loss": 0.7075, "step": 2232 }, { "epoch": 0.822998249331982, "grad_norm": 9.717482898207392, "learning_rate": 4.341080479663741e-05, "loss": 0.7697, "step": 2233 }, { "epoch": 0.8233668110199944, "grad_norm": 5.966246714100551, "learning_rate": 4.340771417975028e-05, "loss": 0.6291, "step": 2234 }, { "epoch": 0.823735372708007, "grad_norm": 6.673375124093879, "learning_rate": 4.340462356286315e-05, "loss": 0.8848, "step": 2235 }, { "epoch": 0.8241039343960196, "grad_norm": 6.5909753539004114, "learning_rate": 4.340153294597602e-05, "loss": 0.6462, "step": 2236 }, { "epoch": 0.8244724960840321, "grad_norm": 7.853183377857655, "learning_rate": 4.339844232908889e-05, "loss": 0.617, "step": 2237 }, { "epoch": 0.8248410577720446, "grad_norm": 7.487461035217896, "learning_rate": 4.339535171220176e-05, "loss": 0.6886, "step": 2238 }, { "epoch": 0.8252096194600571, "grad_norm": 6.017538187550042, "learning_rate": 4.3392261095314627e-05, "loss": 0.9342, "step": 2239 }, { "epoch": 0.8255781811480697, "grad_norm": 7.330160090219189, "learning_rate": 4.33891704784275e-05, "loss": 0.7837, "step": 2240 }, { "epoch": 0.8259467428360822, "grad_norm": 6.71997314674825, "learning_rate": 4.338607986154036e-05, "loss": 0.8083, "step": 2241 }, { "epoch": 0.8263153045240947, "grad_norm": 8.741116182821392, "learning_rate": 4.338298924465323e-05, "loss": 0.8141, "step": 2242 }, { "epoch": 0.8266838662121072, "grad_norm": 9.548806352565176, "learning_rate": 4.3379898627766105e-05, "loss": 0.9178, "step": 2243 }, { "epoch": 0.8270524279001198, "grad_norm": 18.847657464378198, "learning_rate": 4.337680801087897e-05, "loss": 0.7524, "step": 2244 }, { "epoch": 0.8274209895881323, "grad_norm": 6.9842835149384515, "learning_rate": 4.337371739399184e-05, "loss": 0.6779, "step": 2245 }, { "epoch": 0.8277895512761448, "grad_norm": 4.9829868787347396, "learning_rate": 4.337062677710471e-05, "loss": 0.4984, "step": 2246 }, { "epoch": 0.8281581129641574, "grad_norm": 5.914089590839537, "learning_rate": 4.336753616021758e-05, "loss": 0.5714, "step": 2247 }, { "epoch": 0.8285266746521699, "grad_norm": 7.115309700997791, "learning_rate": 4.3364445543330454e-05, "loss": 1.0218, "step": 2248 }, { "epoch": 0.8288952363401825, "grad_norm": 9.19568585956123, "learning_rate": 4.336135492644332e-05, "loss": 0.7267, "step": 2249 }, { "epoch": 0.8292637980281949, "grad_norm": 7.027547853484017, "learning_rate": 4.335826430955619e-05, "loss": 0.6901, "step": 2250 }, { "epoch": 0.8296323597162075, "grad_norm": 5.751104497563449, "learning_rate": 4.335517369266906e-05, "loss": 0.5478, "step": 2251 }, { "epoch": 0.83000092140422, "grad_norm": 5.386716094372533, "learning_rate": 4.335208307578193e-05, "loss": 0.5432, "step": 2252 }, { "epoch": 0.8303694830922326, "grad_norm": 6.074468694616427, "learning_rate": 4.3348992458894796e-05, "loss": 0.5009, "step": 2253 }, { "epoch": 0.830738044780245, "grad_norm": 5.442499252270221, "learning_rate": 4.334590184200767e-05, "loss": 0.6295, "step": 2254 }, { "epoch": 0.8311066064682576, "grad_norm": 8.014412771548137, "learning_rate": 4.334281122512054e-05, "loss": 0.7282, "step": 2255 }, { "epoch": 0.8314751681562702, "grad_norm": 9.908940572608374, "learning_rate": 4.33397206082334e-05, "loss": 0.6352, "step": 2256 }, { "epoch": 0.8318437298442827, "grad_norm": 8.662713787749023, "learning_rate": 4.3336629991346274e-05, "loss": 0.714, "step": 2257 }, { "epoch": 0.8322122915322953, "grad_norm": 4.360202724398438, "learning_rate": 4.3333539374459145e-05, "loss": 0.384, "step": 2258 }, { "epoch": 0.8325808532203077, "grad_norm": 5.803839024672929, "learning_rate": 4.333044875757201e-05, "loss": 0.5959, "step": 2259 }, { "epoch": 0.8329494149083203, "grad_norm": 5.087482364258438, "learning_rate": 4.332735814068488e-05, "loss": 0.5024, "step": 2260 }, { "epoch": 0.8333179765963328, "grad_norm": 7.024886171769501, "learning_rate": 4.332426752379775e-05, "loss": 0.7041, "step": 2261 }, { "epoch": 0.8336865382843454, "grad_norm": 5.121999280817055, "learning_rate": 4.332117690691062e-05, "loss": 0.6513, "step": 2262 }, { "epoch": 0.8340550999723578, "grad_norm": 11.036169754343202, "learning_rate": 4.331808629002349e-05, "loss": 0.6902, "step": 2263 }, { "epoch": 0.8344236616603704, "grad_norm": 7.656231066621998, "learning_rate": 4.331499567313636e-05, "loss": 0.6336, "step": 2264 }, { "epoch": 0.834792223348383, "grad_norm": 7.579767359885161, "learning_rate": 4.331190505624923e-05, "loss": 0.6873, "step": 2265 }, { "epoch": 0.8351607850363955, "grad_norm": 7.191929256850826, "learning_rate": 4.33088144393621e-05, "loss": 0.5882, "step": 2266 }, { "epoch": 0.835529346724408, "grad_norm": 5.9913764016729445, "learning_rate": 4.330572382247497e-05, "loss": 0.5531, "step": 2267 }, { "epoch": 0.8358979084124205, "grad_norm": 6.872330025756196, "learning_rate": 4.330263320558784e-05, "loss": 0.7184, "step": 2268 }, { "epoch": 0.8362664701004331, "grad_norm": 8.032269246182182, "learning_rate": 4.329954258870071e-05, "loss": 0.7946, "step": 2269 }, { "epoch": 0.8366350317884456, "grad_norm": 9.670699977215527, "learning_rate": 4.329645197181358e-05, "loss": 0.7873, "step": 2270 }, { "epoch": 0.8370035934764581, "grad_norm": 7.145300872212685, "learning_rate": 4.3293361354926444e-05, "loss": 0.5875, "step": 2271 }, { "epoch": 0.8373721551644706, "grad_norm": 4.350830893251302, "learning_rate": 4.3290270738039315e-05, "loss": 0.3695, "step": 2272 }, { "epoch": 0.8377407168524832, "grad_norm": 5.631357288079479, "learning_rate": 4.328718012115218e-05, "loss": 0.5188, "step": 2273 }, { "epoch": 0.8381092785404957, "grad_norm": 8.94548904711152, "learning_rate": 4.328408950426505e-05, "loss": 0.6992, "step": 2274 }, { "epoch": 0.8384778402285082, "grad_norm": 5.342635189018567, "learning_rate": 4.328099888737792e-05, "loss": 0.67, "step": 2275 }, { "epoch": 0.8388464019165208, "grad_norm": 7.352445608156407, "learning_rate": 4.327790827049079e-05, "loss": 0.7403, "step": 2276 }, { "epoch": 0.8392149636045333, "grad_norm": 5.81493958170394, "learning_rate": 4.3274817653603664e-05, "loss": 0.5963, "step": 2277 }, { "epoch": 0.8395835252925459, "grad_norm": 5.977665339424435, "learning_rate": 4.327172703671653e-05, "loss": 0.5639, "step": 2278 }, { "epoch": 0.8399520869805583, "grad_norm": 8.12270193612717, "learning_rate": 4.32686364198294e-05, "loss": 0.8086, "step": 2279 }, { "epoch": 0.8403206486685709, "grad_norm": 9.672639338992568, "learning_rate": 4.326554580294227e-05, "loss": 0.8353, "step": 2280 }, { "epoch": 0.8406892103565834, "grad_norm": 7.559596994143748, "learning_rate": 4.326245518605514e-05, "loss": 0.8211, "step": 2281 }, { "epoch": 0.841057772044596, "grad_norm": 7.597699931590368, "learning_rate": 4.3259364569168007e-05, "loss": 0.5442, "step": 2282 }, { "epoch": 0.8414263337326084, "grad_norm": 6.46949446812467, "learning_rate": 4.325627395228088e-05, "loss": 0.4797, "step": 2283 }, { "epoch": 0.841794895420621, "grad_norm": 7.0577365711545585, "learning_rate": 4.325318333539375e-05, "loss": 0.7075, "step": 2284 }, { "epoch": 0.8421634571086336, "grad_norm": 7.830489298414748, "learning_rate": 4.325009271850662e-05, "loss": 0.7112, "step": 2285 }, { "epoch": 0.8425320187966461, "grad_norm": 6.09150662331533, "learning_rate": 4.3247002101619485e-05, "loss": 0.8833, "step": 2286 }, { "epoch": 0.8429005804846587, "grad_norm": 7.725636981269403, "learning_rate": 4.324391148473235e-05, "loss": 0.8202, "step": 2287 }, { "epoch": 0.8432691421726711, "grad_norm": 9.944533444177424, "learning_rate": 4.324082086784522e-05, "loss": 0.5187, "step": 2288 }, { "epoch": 0.8436377038606837, "grad_norm": 6.125964030479968, "learning_rate": 4.323773025095809e-05, "loss": 0.607, "step": 2289 }, { "epoch": 0.8440062655486962, "grad_norm": 7.108887506895391, "learning_rate": 4.323463963407096e-05, "loss": 0.7424, "step": 2290 }, { "epoch": 0.8443748272367088, "grad_norm": 5.278016313661354, "learning_rate": 4.3231549017183834e-05, "loss": 0.8262, "step": 2291 }, { "epoch": 0.8447433889247212, "grad_norm": 7.1822942042651725, "learning_rate": 4.32284584002967e-05, "loss": 0.5853, "step": 2292 }, { "epoch": 0.8451119506127338, "grad_norm": 5.991148141098164, "learning_rate": 4.322536778340957e-05, "loss": 0.4578, "step": 2293 }, { "epoch": 0.8454805123007464, "grad_norm": 7.189544652280067, "learning_rate": 4.322227716652244e-05, "loss": 0.7781, "step": 2294 }, { "epoch": 0.8458490739887589, "grad_norm": 6.43617012583352, "learning_rate": 4.321918654963531e-05, "loss": 0.822, "step": 2295 }, { "epoch": 0.8462176356767714, "grad_norm": 7.292025079546983, "learning_rate": 4.321609593274818e-05, "loss": 0.6486, "step": 2296 }, { "epoch": 0.8465861973647839, "grad_norm": 7.719056158133072, "learning_rate": 4.321300531586105e-05, "loss": 0.9729, "step": 2297 }, { "epoch": 0.8469547590527965, "grad_norm": 5.99220086883287, "learning_rate": 4.320991469897392e-05, "loss": 0.6176, "step": 2298 }, { "epoch": 0.847323320740809, "grad_norm": 6.225283463853053, "learning_rate": 4.320682408208679e-05, "loss": 0.9167, "step": 2299 }, { "epoch": 0.8476918824288215, "grad_norm": 6.5027090442792215, "learning_rate": 4.320373346519966e-05, "loss": 0.6574, "step": 2300 }, { "epoch": 0.848060444116834, "grad_norm": 7.94197737201742, "learning_rate": 4.3200642848312525e-05, "loss": 0.5563, "step": 2301 }, { "epoch": 0.8484290058048466, "grad_norm": 7.221740103385721, "learning_rate": 4.319755223142539e-05, "loss": 0.7268, "step": 2302 }, { "epoch": 0.8487975674928591, "grad_norm": 7.480132937550491, "learning_rate": 4.319446161453826e-05, "loss": 0.5124, "step": 2303 }, { "epoch": 0.8491661291808716, "grad_norm": 14.727673200820156, "learning_rate": 4.319137099765113e-05, "loss": 0.6182, "step": 2304 }, { "epoch": 0.8495346908688842, "grad_norm": 7.885484769889062, "learning_rate": 4.3188280380764e-05, "loss": 0.7263, "step": 2305 }, { "epoch": 0.8499032525568967, "grad_norm": 7.391006685687739, "learning_rate": 4.318518976387687e-05, "loss": 0.705, "step": 2306 }, { "epoch": 0.8502718142449093, "grad_norm": 5.028469668015155, "learning_rate": 4.318209914698974e-05, "loss": 0.5797, "step": 2307 }, { "epoch": 0.8506403759329217, "grad_norm": 10.010105367709984, "learning_rate": 4.317900853010261e-05, "loss": 0.7244, "step": 2308 }, { "epoch": 0.8510089376209343, "grad_norm": 13.231012522963331, "learning_rate": 4.317591791321548e-05, "loss": 0.7666, "step": 2309 }, { "epoch": 0.8513774993089468, "grad_norm": 8.307875185635474, "learning_rate": 4.317282729632835e-05, "loss": 0.8468, "step": 2310 }, { "epoch": 0.8517460609969594, "grad_norm": 4.792040249179451, "learning_rate": 4.316973667944122e-05, "loss": 0.5691, "step": 2311 }, { "epoch": 0.8521146226849718, "grad_norm": 8.488050700820951, "learning_rate": 4.316664606255409e-05, "loss": 0.7723, "step": 2312 }, { "epoch": 0.8524831843729844, "grad_norm": 9.95689485979557, "learning_rate": 4.316355544566696e-05, "loss": 0.9821, "step": 2313 }, { "epoch": 0.852851746060997, "grad_norm": 7.7544651089534575, "learning_rate": 4.316046482877983e-05, "loss": 0.9278, "step": 2314 }, { "epoch": 0.8532203077490095, "grad_norm": 6.6221272700000355, "learning_rate": 4.3157374211892695e-05, "loss": 0.7974, "step": 2315 }, { "epoch": 0.8535888694370221, "grad_norm": 5.547324750692755, "learning_rate": 4.3154283595005566e-05, "loss": 0.6864, "step": 2316 }, { "epoch": 0.8539574311250345, "grad_norm": 6.215971512914553, "learning_rate": 4.315119297811843e-05, "loss": 0.6185, "step": 2317 }, { "epoch": 0.8543259928130471, "grad_norm": 7.824172920172817, "learning_rate": 4.31481023612313e-05, "loss": 0.7445, "step": 2318 }, { "epoch": 0.8546945545010596, "grad_norm": 5.01295889944155, "learning_rate": 4.314501174434417e-05, "loss": 0.6086, "step": 2319 }, { "epoch": 0.8550631161890722, "grad_norm": 7.184042986564912, "learning_rate": 4.3141921127457044e-05, "loss": 0.8331, "step": 2320 }, { "epoch": 0.8554316778770846, "grad_norm": 8.537357142581097, "learning_rate": 4.313883051056991e-05, "loss": 0.6824, "step": 2321 }, { "epoch": 0.8558002395650972, "grad_norm": 6.999376541711143, "learning_rate": 4.313573989368278e-05, "loss": 0.672, "step": 2322 }, { "epoch": 0.8561688012531098, "grad_norm": 5.739661500632742, "learning_rate": 4.313264927679565e-05, "loss": 0.7597, "step": 2323 }, { "epoch": 0.8565373629411223, "grad_norm": 7.493275743355959, "learning_rate": 4.312955865990852e-05, "loss": 0.7066, "step": 2324 }, { "epoch": 0.8569059246291348, "grad_norm": 7.53641722131222, "learning_rate": 4.3126468043021386e-05, "loss": 0.7617, "step": 2325 }, { "epoch": 0.8572744863171473, "grad_norm": 4.6464562561544085, "learning_rate": 4.312337742613426e-05, "loss": 0.5301, "step": 2326 }, { "epoch": 0.8576430480051599, "grad_norm": 10.188321507569418, "learning_rate": 4.312028680924713e-05, "loss": 0.8682, "step": 2327 }, { "epoch": 0.8580116096931724, "grad_norm": 5.106848975995183, "learning_rate": 4.311719619236e-05, "loss": 0.5495, "step": 2328 }, { "epoch": 0.8583801713811849, "grad_norm": 6.228797042580233, "learning_rate": 4.311410557547287e-05, "loss": 0.8131, "step": 2329 }, { "epoch": 0.8587487330691974, "grad_norm": 7.243995284501831, "learning_rate": 4.3111014958585736e-05, "loss": 0.8325, "step": 2330 }, { "epoch": 0.85911729475721, "grad_norm": 5.9807608823979805, "learning_rate": 4.310792434169861e-05, "loss": 0.7862, "step": 2331 }, { "epoch": 0.8594858564452226, "grad_norm": 6.577993069954478, "learning_rate": 4.310483372481147e-05, "loss": 0.7171, "step": 2332 }, { "epoch": 0.859854418133235, "grad_norm": 12.427825692390028, "learning_rate": 4.310174310792434e-05, "loss": 0.8263, "step": 2333 }, { "epoch": 0.8602229798212476, "grad_norm": 9.7556339516624, "learning_rate": 4.3098652491037214e-05, "loss": 0.6194, "step": 2334 }, { "epoch": 0.8605915415092601, "grad_norm": 7.813904902976723, "learning_rate": 4.309556187415008e-05, "loss": 0.7089, "step": 2335 }, { "epoch": 0.8609601031972727, "grad_norm": 5.292630365682468, "learning_rate": 4.309247125726295e-05, "loss": 0.7196, "step": 2336 }, { "epoch": 0.8613286648852851, "grad_norm": 7.755858145382048, "learning_rate": 4.308938064037582e-05, "loss": 0.8123, "step": 2337 }, { "epoch": 0.8616972265732977, "grad_norm": 5.789007148014729, "learning_rate": 4.308629002348869e-05, "loss": 0.6781, "step": 2338 }, { "epoch": 0.8620657882613102, "grad_norm": 8.00131882287373, "learning_rate": 4.308319940660156e-05, "loss": 0.7231, "step": 2339 }, { "epoch": 0.8624343499493228, "grad_norm": 9.287126010434628, "learning_rate": 4.308010878971443e-05, "loss": 0.8139, "step": 2340 }, { "epoch": 0.8628029116373352, "grad_norm": 8.595390468424776, "learning_rate": 4.30770181728273e-05, "loss": 0.7831, "step": 2341 }, { "epoch": 0.8631714733253478, "grad_norm": 9.061714506637509, "learning_rate": 4.307392755594017e-05, "loss": 0.7774, "step": 2342 }, { "epoch": 0.8635400350133604, "grad_norm": 11.40801971502326, "learning_rate": 4.307083693905304e-05, "loss": 0.6944, "step": 2343 }, { "epoch": 0.8639085967013729, "grad_norm": 5.354338567722697, "learning_rate": 4.3067746322165905e-05, "loss": 0.5659, "step": 2344 }, { "epoch": 0.8642771583893855, "grad_norm": 4.876132540096094, "learning_rate": 4.3064655705278776e-05, "loss": 0.6719, "step": 2345 }, { "epoch": 0.8646457200773979, "grad_norm": 8.850850951238257, "learning_rate": 4.306156508839165e-05, "loss": 0.8475, "step": 2346 }, { "epoch": 0.8650142817654105, "grad_norm": 4.9973492271499, "learning_rate": 4.305847447150451e-05, "loss": 0.7084, "step": 2347 }, { "epoch": 0.865382843453423, "grad_norm": 5.411451362358158, "learning_rate": 4.305538385461738e-05, "loss": 0.6782, "step": 2348 }, { "epoch": 0.8657514051414356, "grad_norm": 8.036252374949825, "learning_rate": 4.3052293237730254e-05, "loss": 0.7343, "step": 2349 }, { "epoch": 0.866119966829448, "grad_norm": 6.8753035044868085, "learning_rate": 4.304920262084312e-05, "loss": 0.6777, "step": 2350 }, { "epoch": 0.8664885285174606, "grad_norm": 12.635437531649268, "learning_rate": 4.304611200395599e-05, "loss": 0.4613, "step": 2351 }, { "epoch": 0.8668570902054732, "grad_norm": 6.689295162368537, "learning_rate": 4.304302138706886e-05, "loss": 0.5548, "step": 2352 }, { "epoch": 0.8672256518934857, "grad_norm": 6.348030938460705, "learning_rate": 4.303993077018173e-05, "loss": 0.7268, "step": 2353 }, { "epoch": 0.8675942135814982, "grad_norm": 9.392715602224808, "learning_rate": 4.30368401532946e-05, "loss": 0.7223, "step": 2354 }, { "epoch": 0.8679627752695107, "grad_norm": 6.850252311125059, "learning_rate": 4.303374953640747e-05, "loss": 0.7101, "step": 2355 }, { "epoch": 0.8683313369575233, "grad_norm": 4.740727660262642, "learning_rate": 4.303065891952034e-05, "loss": 0.4325, "step": 2356 }, { "epoch": 0.8686998986455358, "grad_norm": 5.274153958853444, "learning_rate": 4.302756830263321e-05, "loss": 0.5764, "step": 2357 }, { "epoch": 0.8690684603335483, "grad_norm": 7.842541848564429, "learning_rate": 4.302447768574608e-05, "loss": 0.62, "step": 2358 }, { "epoch": 0.8694370220215608, "grad_norm": 6.230037394452725, "learning_rate": 4.3021387068858946e-05, "loss": 0.5213, "step": 2359 }, { "epoch": 0.8698055837095734, "grad_norm": 5.187211729565678, "learning_rate": 4.301829645197182e-05, "loss": 0.4694, "step": 2360 }, { "epoch": 0.870174145397586, "grad_norm": 5.744979657036709, "learning_rate": 4.301520583508469e-05, "loss": 0.5895, "step": 2361 }, { "epoch": 0.8705427070855984, "grad_norm": 7.017409881350176, "learning_rate": 4.301211521819755e-05, "loss": 0.5975, "step": 2362 }, { "epoch": 0.870911268773611, "grad_norm": 8.357431688545885, "learning_rate": 4.3009024601310424e-05, "loss": 0.6633, "step": 2363 }, { "epoch": 0.8712798304616235, "grad_norm": 4.856612521157526, "learning_rate": 4.300593398442329e-05, "loss": 0.5749, "step": 2364 }, { "epoch": 0.8716483921496361, "grad_norm": 6.931793615351663, "learning_rate": 4.300284336753616e-05, "loss": 0.6376, "step": 2365 }, { "epoch": 0.8720169538376485, "grad_norm": 7.178202843980932, "learning_rate": 4.299975275064903e-05, "loss": 0.8355, "step": 2366 }, { "epoch": 0.8723855155256611, "grad_norm": 6.695846910191911, "learning_rate": 4.29966621337619e-05, "loss": 0.6807, "step": 2367 }, { "epoch": 0.8727540772136736, "grad_norm": 9.514633553003984, "learning_rate": 4.299357151687477e-05, "loss": 0.6, "step": 2368 }, { "epoch": 0.8731226389016862, "grad_norm": 7.858012936459847, "learning_rate": 4.299048089998764e-05, "loss": 0.9374, "step": 2369 }, { "epoch": 0.8734912005896986, "grad_norm": 5.635202945551192, "learning_rate": 4.298739028310051e-05, "loss": 0.6115, "step": 2370 }, { "epoch": 0.8738597622777112, "grad_norm": 8.920475036562218, "learning_rate": 4.298429966621338e-05, "loss": 0.5612, "step": 2371 }, { "epoch": 0.8742283239657238, "grad_norm": 8.566843532215863, "learning_rate": 4.298120904932625e-05, "loss": 0.7818, "step": 2372 }, { "epoch": 0.8745968856537363, "grad_norm": 7.195656303285321, "learning_rate": 4.2978118432439116e-05, "loss": 0.6259, "step": 2373 }, { "epoch": 0.8749654473417489, "grad_norm": 6.453219286065888, "learning_rate": 4.297502781555199e-05, "loss": 0.5329, "step": 2374 }, { "epoch": 0.8753340090297613, "grad_norm": 4.64244254640064, "learning_rate": 4.297193719866486e-05, "loss": 0.345, "step": 2375 }, { "epoch": 0.8757025707177739, "grad_norm": 6.107078647514996, "learning_rate": 4.296884658177773e-05, "loss": 0.5531, "step": 2376 }, { "epoch": 0.8760711324057864, "grad_norm": 6.695044994017876, "learning_rate": 4.2965755964890594e-05, "loss": 0.4889, "step": 2377 }, { "epoch": 0.876439694093799, "grad_norm": 11.599672352339985, "learning_rate": 4.296266534800346e-05, "loss": 0.7426, "step": 2378 }, { "epoch": 0.8768082557818114, "grad_norm": 8.24596653220791, "learning_rate": 4.295957473111633e-05, "loss": 0.8104, "step": 2379 }, { "epoch": 0.877176817469824, "grad_norm": 6.182804059651176, "learning_rate": 4.29564841142292e-05, "loss": 0.6111, "step": 2380 }, { "epoch": 0.8775453791578366, "grad_norm": 7.330892012003068, "learning_rate": 4.295339349734207e-05, "loss": 0.629, "step": 2381 }, { "epoch": 0.8779139408458491, "grad_norm": 14.150932879749925, "learning_rate": 4.295030288045494e-05, "loss": 0.8206, "step": 2382 }, { "epoch": 0.8782825025338616, "grad_norm": 6.7905811152157085, "learning_rate": 4.294721226356781e-05, "loss": 0.7425, "step": 2383 }, { "epoch": 0.8786510642218741, "grad_norm": 5.89447656483497, "learning_rate": 4.294412164668068e-05, "loss": 0.6549, "step": 2384 }, { "epoch": 0.8790196259098867, "grad_norm": 5.698099508844621, "learning_rate": 4.294103102979355e-05, "loss": 0.8646, "step": 2385 }, { "epoch": 0.8793881875978992, "grad_norm": 8.101430187414008, "learning_rate": 4.293794041290642e-05, "loss": 0.7288, "step": 2386 }, { "epoch": 0.8797567492859117, "grad_norm": 5.781476650434687, "learning_rate": 4.293484979601929e-05, "loss": 0.6639, "step": 2387 }, { "epoch": 0.8801253109739242, "grad_norm": 9.1012070746052, "learning_rate": 4.2931759179132156e-05, "loss": 0.9902, "step": 2388 }, { "epoch": 0.8804938726619368, "grad_norm": 5.606884416728137, "learning_rate": 4.292866856224503e-05, "loss": 0.5251, "step": 2389 }, { "epoch": 0.8808624343499494, "grad_norm": 5.760940757856076, "learning_rate": 4.29255779453579e-05, "loss": 0.5862, "step": 2390 }, { "epoch": 0.8812309960379618, "grad_norm": 6.0351436084009995, "learning_rate": 4.292248732847077e-05, "loss": 0.6036, "step": 2391 }, { "epoch": 0.8815995577259744, "grad_norm": 8.4628430701266, "learning_rate": 4.2919396711583634e-05, "loss": 0.6842, "step": 2392 }, { "epoch": 0.8819681194139869, "grad_norm": 8.246693266340246, "learning_rate": 4.29163060946965e-05, "loss": 0.5866, "step": 2393 }, { "epoch": 0.8823366811019995, "grad_norm": 9.161393423249894, "learning_rate": 4.291321547780937e-05, "loss": 0.8711, "step": 2394 }, { "epoch": 0.882705242790012, "grad_norm": 6.7792572046700155, "learning_rate": 4.291012486092224e-05, "loss": 0.8235, "step": 2395 }, { "epoch": 0.8830738044780245, "grad_norm": 6.909818391966676, "learning_rate": 4.290703424403511e-05, "loss": 0.691, "step": 2396 }, { "epoch": 0.883442366166037, "grad_norm": 6.023562260583757, "learning_rate": 4.290394362714798e-05, "loss": 0.8012, "step": 2397 }, { "epoch": 0.8838109278540496, "grad_norm": 8.27859074343367, "learning_rate": 4.290085301026085e-05, "loss": 1.0008, "step": 2398 }, { "epoch": 0.8841794895420622, "grad_norm": 54.831967662435524, "learning_rate": 4.289776239337372e-05, "loss": 0.8314, "step": 2399 }, { "epoch": 0.8845480512300746, "grad_norm": 6.9822347549227795, "learning_rate": 4.289467177648659e-05, "loss": 0.7591, "step": 2400 }, { "epoch": 0.8845480512300746, "eval_bleu": 0.06272924588970775, "eval_bleu_1gram": 0.41173961546563564, "eval_bleu_2gram": 0.19818194110901793, "eval_bleu_3gram": 0.09548031595896364, "eval_bleu_4gram": 0.048330595664347976, "eval_rag_val_loss": 0.8066987228592701, "eval_rouge1": 0.39456407360602497, "eval_rouge2": 0.188065278628304, "eval_rougeL": 0.39195065555433845, "step": 2400 }, { "epoch": 0.8849166129180872, "grad_norm": 5.120732461273622, "learning_rate": 4.289158115959946e-05, "loss": 0.4797, "step": 2401 }, { "epoch": 0.8852851746060997, "grad_norm": 9.603163086815336, "learning_rate": 4.2888490542712326e-05, "loss": 0.7037, "step": 2402 }, { "epoch": 0.8856537362941123, "grad_norm": 5.474054695633852, "learning_rate": 4.28853999258252e-05, "loss": 0.6661, "step": 2403 }, { "epoch": 0.8860222979821247, "grad_norm": 5.707397532735887, "learning_rate": 4.288230930893807e-05, "loss": 0.6152, "step": 2404 }, { "epoch": 0.8863908596701373, "grad_norm": 10.916692680347305, "learning_rate": 4.287921869205094e-05, "loss": 0.958, "step": 2405 }, { "epoch": 0.8867594213581498, "grad_norm": 10.73733910472258, "learning_rate": 4.2876128075163804e-05, "loss": 0.6209, "step": 2406 }, { "epoch": 0.8871279830461624, "grad_norm": 7.811510679547182, "learning_rate": 4.287303745827667e-05, "loss": 0.7788, "step": 2407 }, { "epoch": 0.8874965447341748, "grad_norm": 7.920042770347321, "learning_rate": 4.286994684138954e-05, "loss": 0.7426, "step": 2408 }, { "epoch": 0.8878651064221874, "grad_norm": 10.908161684399797, "learning_rate": 4.286685622450241e-05, "loss": 0.9391, "step": 2409 }, { "epoch": 0.8882336681102, "grad_norm": 7.546072781023511, "learning_rate": 4.286376560761528e-05, "loss": 0.8648, "step": 2410 }, { "epoch": 0.8886022297982125, "grad_norm": 4.551886768883677, "learning_rate": 4.286067499072815e-05, "loss": 0.6114, "step": 2411 }, { "epoch": 0.888970791486225, "grad_norm": 7.574373870252721, "learning_rate": 4.285758437384102e-05, "loss": 0.7664, "step": 2412 }, { "epoch": 0.8893393531742375, "grad_norm": 7.761928853079218, "learning_rate": 4.285449375695389e-05, "loss": 0.6304, "step": 2413 }, { "epoch": 0.8897079148622501, "grad_norm": 6.631703278404632, "learning_rate": 4.285140314006676e-05, "loss": 0.8541, "step": 2414 }, { "epoch": 0.8900764765502626, "grad_norm": 10.28047896163715, "learning_rate": 4.284831252317963e-05, "loss": 0.863, "step": 2415 }, { "epoch": 0.8904450382382751, "grad_norm": 4.8043572040941385, "learning_rate": 4.2845221906292496e-05, "loss": 0.5043, "step": 2416 }, { "epoch": 0.8908135999262876, "grad_norm": 6.371925061203695, "learning_rate": 4.284213128940537e-05, "loss": 0.7101, "step": 2417 }, { "epoch": 0.8911821616143002, "grad_norm": 4.8855410345171775, "learning_rate": 4.283904067251824e-05, "loss": 0.5518, "step": 2418 }, { "epoch": 0.8915507233023128, "grad_norm": 7.042405701027265, "learning_rate": 4.283595005563111e-05, "loss": 0.7465, "step": 2419 }, { "epoch": 0.8919192849903252, "grad_norm": 7.60401530354669, "learning_rate": 4.283285943874398e-05, "loss": 0.5486, "step": 2420 }, { "epoch": 0.8922878466783378, "grad_norm": 6.754403549957156, "learning_rate": 4.2829768821856845e-05, "loss": 0.5652, "step": 2421 }, { "epoch": 0.8926564083663503, "grad_norm": 7.324534368199626, "learning_rate": 4.282667820496971e-05, "loss": 0.7216, "step": 2422 }, { "epoch": 0.8930249700543629, "grad_norm": 7.416781763905315, "learning_rate": 4.282358758808258e-05, "loss": 0.9819, "step": 2423 }, { "epoch": 0.8933935317423753, "grad_norm": 6.439175285938185, "learning_rate": 4.282049697119545e-05, "loss": 0.8425, "step": 2424 }, { "epoch": 0.8937620934303879, "grad_norm": 9.135947153716893, "learning_rate": 4.281740635430832e-05, "loss": 0.8115, "step": 2425 }, { "epoch": 0.8941306551184004, "grad_norm": 6.846683239395206, "learning_rate": 4.281431573742119e-05, "loss": 0.8651, "step": 2426 }, { "epoch": 0.894499216806413, "grad_norm": 5.625302115910191, "learning_rate": 4.281122512053406e-05, "loss": 0.6739, "step": 2427 }, { "epoch": 0.8948677784944256, "grad_norm": 6.620450455013988, "learning_rate": 4.280813450364693e-05, "loss": 0.654, "step": 2428 }, { "epoch": 0.895236340182438, "grad_norm": 7.806042252915166, "learning_rate": 4.28050438867598e-05, "loss": 0.8793, "step": 2429 }, { "epoch": 0.8956049018704506, "grad_norm": 8.045974236599047, "learning_rate": 4.280195326987267e-05, "loss": 0.5712, "step": 2430 }, { "epoch": 0.8959734635584631, "grad_norm": 6.011732075399462, "learning_rate": 4.2798862652985536e-05, "loss": 0.6639, "step": 2431 }, { "epoch": 0.8963420252464757, "grad_norm": 4.920633601507351, "learning_rate": 4.279577203609841e-05, "loss": 0.613, "step": 2432 }, { "epoch": 0.8967105869344881, "grad_norm": 5.197939743367904, "learning_rate": 4.279268141921128e-05, "loss": 0.6724, "step": 2433 }, { "epoch": 0.8970791486225007, "grad_norm": 8.538868168105488, "learning_rate": 4.278959080232415e-05, "loss": 0.8929, "step": 2434 }, { "epoch": 0.8974477103105132, "grad_norm": 6.196111062381248, "learning_rate": 4.2786500185437014e-05, "loss": 0.8917, "step": 2435 }, { "epoch": 0.8978162719985258, "grad_norm": 10.457140688631664, "learning_rate": 4.2783409568549886e-05, "loss": 0.706, "step": 2436 }, { "epoch": 0.8981848336865382, "grad_norm": 7.344102696816455, "learning_rate": 4.278031895166275e-05, "loss": 0.7363, "step": 2437 }, { "epoch": 0.8985533953745508, "grad_norm": 11.327603032155636, "learning_rate": 4.277722833477562e-05, "loss": 0.7055, "step": 2438 }, { "epoch": 0.8989219570625634, "grad_norm": 8.748186631990913, "learning_rate": 4.277413771788849e-05, "loss": 0.8833, "step": 2439 }, { "epoch": 0.8992905187505759, "grad_norm": 7.849817611616498, "learning_rate": 4.2771047101001363e-05, "loss": 0.7164, "step": 2440 }, { "epoch": 0.8996590804385884, "grad_norm": 6.675538962104132, "learning_rate": 4.276795648411423e-05, "loss": 0.6785, "step": 2441 }, { "epoch": 0.9000276421266009, "grad_norm": 12.07671755625649, "learning_rate": 4.27648658672271e-05, "loss": 0.6654, "step": 2442 }, { "epoch": 0.9003962038146135, "grad_norm": 5.267600212574196, "learning_rate": 4.276177525033997e-05, "loss": 0.6455, "step": 2443 }, { "epoch": 0.900764765502626, "grad_norm": 6.702030261439003, "learning_rate": 4.275868463345284e-05, "loss": 0.6527, "step": 2444 }, { "epoch": 0.9011333271906385, "grad_norm": 7.636123797956319, "learning_rate": 4.2755594016565706e-05, "loss": 0.5563, "step": 2445 }, { "epoch": 0.901501888878651, "grad_norm": 6.2032000251287895, "learning_rate": 4.275250339967858e-05, "loss": 0.6179, "step": 2446 }, { "epoch": 0.9018704505666636, "grad_norm": 6.45756765769519, "learning_rate": 4.274941278279145e-05, "loss": 0.762, "step": 2447 }, { "epoch": 0.9022390122546762, "grad_norm": 8.03148748736905, "learning_rate": 4.274632216590432e-05, "loss": 0.8028, "step": 2448 }, { "epoch": 0.9026075739426886, "grad_norm": 6.288288784413582, "learning_rate": 4.274323154901719e-05, "loss": 0.7644, "step": 2449 }, { "epoch": 0.9029761356307012, "grad_norm": 5.024622752134684, "learning_rate": 4.2740140932130055e-05, "loss": 0.6023, "step": 2450 }, { "epoch": 0.9033446973187137, "grad_norm": 5.16895444527934, "learning_rate": 4.2737050315242926e-05, "loss": 0.4896, "step": 2451 }, { "epoch": 0.9037132590067263, "grad_norm": 7.369077729240603, "learning_rate": 4.27339596983558e-05, "loss": 0.8122, "step": 2452 }, { "epoch": 0.9040818206947387, "grad_norm": 6.130567861518825, "learning_rate": 4.273086908146866e-05, "loss": 0.4105, "step": 2453 }, { "epoch": 0.9044503823827513, "grad_norm": 7.026320157612759, "learning_rate": 4.272777846458153e-05, "loss": 0.7176, "step": 2454 }, { "epoch": 0.9048189440707638, "grad_norm": 5.584190340836827, "learning_rate": 4.27246878476944e-05, "loss": 0.5246, "step": 2455 }, { "epoch": 0.9051875057587764, "grad_norm": 6.2636328976414335, "learning_rate": 4.272159723080727e-05, "loss": 0.618, "step": 2456 }, { "epoch": 0.905556067446789, "grad_norm": 10.350224338624901, "learning_rate": 4.271850661392014e-05, "loss": 0.7141, "step": 2457 }, { "epoch": 0.9059246291348014, "grad_norm": 5.409569399297455, "learning_rate": 4.271541599703301e-05, "loss": 0.5991, "step": 2458 }, { "epoch": 0.906293190822814, "grad_norm": 7.056393574418848, "learning_rate": 4.271232538014588e-05, "loss": 0.6598, "step": 2459 }, { "epoch": 0.9066617525108265, "grad_norm": 6.2129799609460505, "learning_rate": 4.270923476325875e-05, "loss": 0.5131, "step": 2460 }, { "epoch": 0.9070303141988391, "grad_norm": 5.876662323521513, "learning_rate": 4.270614414637162e-05, "loss": 0.655, "step": 2461 }, { "epoch": 0.9073988758868515, "grad_norm": 6.757485089611972, "learning_rate": 4.270305352948449e-05, "loss": 0.727, "step": 2462 }, { "epoch": 0.9077674375748641, "grad_norm": 5.553491391195279, "learning_rate": 4.269996291259736e-05, "loss": 0.5116, "step": 2463 }, { "epoch": 0.9081359992628766, "grad_norm": 7.597242768713369, "learning_rate": 4.2696872295710225e-05, "loss": 1.0143, "step": 2464 }, { "epoch": 0.9085045609508892, "grad_norm": 5.987842640798671, "learning_rate": 4.2693781678823096e-05, "loss": 0.6496, "step": 2465 }, { "epoch": 0.9088731226389016, "grad_norm": 10.043172340896342, "learning_rate": 4.269069106193597e-05, "loss": 0.7887, "step": 2466 }, { "epoch": 0.9092416843269142, "grad_norm": 7.556670661367677, "learning_rate": 4.268760044504884e-05, "loss": 0.7361, "step": 2467 }, { "epoch": 0.9096102460149268, "grad_norm": 4.2479411355374515, "learning_rate": 4.26845098281617e-05, "loss": 0.4719, "step": 2468 }, { "epoch": 0.9099788077029393, "grad_norm": 9.139382093648942, "learning_rate": 4.268141921127457e-05, "loss": 0.83, "step": 2469 }, { "epoch": 0.9103473693909518, "grad_norm": 6.488634884158, "learning_rate": 4.267832859438744e-05, "loss": 0.6823, "step": 2470 }, { "epoch": 0.9107159310789643, "grad_norm": 5.048135983746911, "learning_rate": 4.267523797750031e-05, "loss": 0.6393, "step": 2471 }, { "epoch": 0.9110844927669769, "grad_norm": 7.72077896694662, "learning_rate": 4.267214736061318e-05, "loss": 0.6394, "step": 2472 }, { "epoch": 0.9114530544549894, "grad_norm": 7.42532711657109, "learning_rate": 4.266905674372605e-05, "loss": 1.0301, "step": 2473 }, { "epoch": 0.911821616143002, "grad_norm": 5.538612078389568, "learning_rate": 4.2665966126838916e-05, "loss": 0.6101, "step": 2474 }, { "epoch": 0.9121901778310144, "grad_norm": 8.482843475088501, "learning_rate": 4.266287550995179e-05, "loss": 0.7954, "step": 2475 }, { "epoch": 0.912558739519027, "grad_norm": 7.91495803330285, "learning_rate": 4.265978489306466e-05, "loss": 0.7141, "step": 2476 }, { "epoch": 0.9129273012070396, "grad_norm": 8.429316416183498, "learning_rate": 4.265669427617753e-05, "loss": 0.6246, "step": 2477 }, { "epoch": 0.913295862895052, "grad_norm": 6.3124694823481065, "learning_rate": 4.2653603659290394e-05, "loss": 0.7309, "step": 2478 }, { "epoch": 0.9136644245830646, "grad_norm": 7.791588713176085, "learning_rate": 4.2650513042403265e-05, "loss": 0.522, "step": 2479 }, { "epoch": 0.9140329862710771, "grad_norm": 5.275731199669114, "learning_rate": 4.264742242551614e-05, "loss": 0.5534, "step": 2480 }, { "epoch": 0.9144015479590897, "grad_norm": 7.856850675624585, "learning_rate": 4.264433180862901e-05, "loss": 0.7146, "step": 2481 }, { "epoch": 0.9147701096471021, "grad_norm": 5.228052311840596, "learning_rate": 4.264124119174188e-05, "loss": 0.5984, "step": 2482 }, { "epoch": 0.9151386713351147, "grad_norm": 6.657714109627068, "learning_rate": 4.2638150574854743e-05, "loss": 0.6316, "step": 2483 }, { "epoch": 0.9155072330231272, "grad_norm": 6.547725066940891, "learning_rate": 4.263505995796761e-05, "loss": 0.8869, "step": 2484 }, { "epoch": 0.9158757947111398, "grad_norm": 5.898284869713981, "learning_rate": 4.263196934108048e-05, "loss": 0.9457, "step": 2485 }, { "epoch": 0.9162443563991524, "grad_norm": 7.06035696278131, "learning_rate": 4.262887872419335e-05, "loss": 0.3435, "step": 2486 }, { "epoch": 0.9166129180871648, "grad_norm": 7.213801285424745, "learning_rate": 4.262578810730622e-05, "loss": 0.5868, "step": 2487 }, { "epoch": 0.9169814797751774, "grad_norm": 7.49617211885819, "learning_rate": 4.2622697490419086e-05, "loss": 0.6146, "step": 2488 }, { "epoch": 0.9173500414631899, "grad_norm": 10.893129867963225, "learning_rate": 4.261960687353196e-05, "loss": 0.672, "step": 2489 }, { "epoch": 0.9177186031512025, "grad_norm": 6.450266798548796, "learning_rate": 4.261651625664483e-05, "loss": 0.6322, "step": 2490 }, { "epoch": 0.9180871648392149, "grad_norm": 5.79039210650456, "learning_rate": 4.26134256397577e-05, "loss": 0.6755, "step": 2491 }, { "epoch": 0.9184557265272275, "grad_norm": 8.680231580325835, "learning_rate": 4.261033502287057e-05, "loss": 0.7766, "step": 2492 }, { "epoch": 0.91882428821524, "grad_norm": 3.826588474121664, "learning_rate": 4.2607244405983435e-05, "loss": 0.4381, "step": 2493 }, { "epoch": 0.9191928499032526, "grad_norm": 6.399706666899919, "learning_rate": 4.2604153789096306e-05, "loss": 0.7916, "step": 2494 }, { "epoch": 0.9195614115912651, "grad_norm": 5.893323528114781, "learning_rate": 4.260106317220918e-05, "loss": 0.6099, "step": 2495 }, { "epoch": 0.9199299732792776, "grad_norm": 9.455046453740078, "learning_rate": 4.259797255532205e-05, "loss": 0.8189, "step": 2496 }, { "epoch": 0.9202985349672902, "grad_norm": 16.273690663537266, "learning_rate": 4.259488193843491e-05, "loss": 0.6919, "step": 2497 }, { "epoch": 0.9206670966553027, "grad_norm": 9.031168056235082, "learning_rate": 4.259179132154778e-05, "loss": 0.9385, "step": 2498 }, { "epoch": 0.9210356583433152, "grad_norm": 7.570923204856071, "learning_rate": 4.258870070466065e-05, "loss": 0.8608, "step": 2499 }, { "epoch": 0.9214042200313277, "grad_norm": 7.2365998704879955, "learning_rate": 4.258561008777352e-05, "loss": 0.9002, "step": 2500 }, { "epoch": 0.9217727817193403, "grad_norm": 7.473759331972504, "learning_rate": 4.258251947088639e-05, "loss": 0.951, "step": 2501 }, { "epoch": 0.9221413434073528, "grad_norm": 5.995332491883367, "learning_rate": 4.257942885399926e-05, "loss": 0.8217, "step": 2502 }, { "epoch": 0.9225099050953653, "grad_norm": 7.078334106320718, "learning_rate": 4.257633823711213e-05, "loss": 0.6676, "step": 2503 }, { "epoch": 0.9228784667833778, "grad_norm": 7.130700491245555, "learning_rate": 4.2573247620225e-05, "loss": 0.8452, "step": 2504 }, { "epoch": 0.9232470284713904, "grad_norm": 5.589816793807296, "learning_rate": 4.257015700333787e-05, "loss": 0.5895, "step": 2505 }, { "epoch": 0.923615590159403, "grad_norm": 5.7474475668713625, "learning_rate": 4.256706638645074e-05, "loss": 0.5964, "step": 2506 }, { "epoch": 0.9239841518474154, "grad_norm": 5.875083111114259, "learning_rate": 4.2563975769563605e-05, "loss": 0.62, "step": 2507 }, { "epoch": 0.924352713535428, "grad_norm": 7.017803440145052, "learning_rate": 4.2560885152676476e-05, "loss": 1.0074, "step": 2508 }, { "epoch": 0.9247212752234405, "grad_norm": 6.242642458877872, "learning_rate": 4.255779453578935e-05, "loss": 0.8062, "step": 2509 }, { "epoch": 0.9250898369114531, "grad_norm": 7.4347216120860065, "learning_rate": 4.255470391890222e-05, "loss": 0.7786, "step": 2510 }, { "epoch": 0.9254583985994655, "grad_norm": 5.432941850159708, "learning_rate": 4.255161330201509e-05, "loss": 0.6002, "step": 2511 }, { "epoch": 0.9258269602874781, "grad_norm": 6.240964538170299, "learning_rate": 4.2548522685127954e-05, "loss": 0.5803, "step": 2512 }, { "epoch": 0.9261955219754906, "grad_norm": 6.51640187121451, "learning_rate": 4.254543206824082e-05, "loss": 0.6898, "step": 2513 }, { "epoch": 0.9265640836635032, "grad_norm": 6.914098362371682, "learning_rate": 4.254234145135369e-05, "loss": 0.7032, "step": 2514 }, { "epoch": 0.9269326453515158, "grad_norm": 6.205919995710874, "learning_rate": 4.253925083446656e-05, "loss": 0.9197, "step": 2515 }, { "epoch": 0.9273012070395282, "grad_norm": 6.779578499534293, "learning_rate": 4.253616021757943e-05, "loss": 0.8623, "step": 2516 }, { "epoch": 0.9276697687275408, "grad_norm": 8.579189991807347, "learning_rate": 4.2533069600692296e-05, "loss": 0.7338, "step": 2517 }, { "epoch": 0.9280383304155533, "grad_norm": 4.991187531280718, "learning_rate": 4.252997898380517e-05, "loss": 0.7251, "step": 2518 }, { "epoch": 0.9284068921035659, "grad_norm": 6.432019317256331, "learning_rate": 4.252688836691804e-05, "loss": 0.6843, "step": 2519 }, { "epoch": 0.9287754537915783, "grad_norm": 7.780863189275007, "learning_rate": 4.252379775003091e-05, "loss": 0.8995, "step": 2520 }, { "epoch": 0.9291440154795909, "grad_norm": 5.039003451688917, "learning_rate": 4.252070713314378e-05, "loss": 0.4268, "step": 2521 }, { "epoch": 0.9295125771676034, "grad_norm": 5.275443411898062, "learning_rate": 4.2517616516256645e-05, "loss": 0.6146, "step": 2522 }, { "epoch": 0.929881138855616, "grad_norm": 5.311464994248935, "learning_rate": 4.2514525899369517e-05, "loss": 0.8559, "step": 2523 }, { "epoch": 0.9302497005436285, "grad_norm": 5.274570914077914, "learning_rate": 4.251143528248239e-05, "loss": 0.5332, "step": 2524 }, { "epoch": 0.930618262231641, "grad_norm": 8.149993952649453, "learning_rate": 4.250834466559526e-05, "loss": 0.6547, "step": 2525 }, { "epoch": 0.9309868239196536, "grad_norm": 4.357073870432121, "learning_rate": 4.250525404870812e-05, "loss": 0.5242, "step": 2526 }, { "epoch": 0.9313553856076661, "grad_norm": 13.384682261386756, "learning_rate": 4.2502163431820995e-05, "loss": 0.8758, "step": 2527 }, { "epoch": 0.9317239472956786, "grad_norm": 6.6327492359320965, "learning_rate": 4.249907281493386e-05, "loss": 0.4884, "step": 2528 }, { "epoch": 0.9320925089836911, "grad_norm": 5.859279296093412, "learning_rate": 4.249598219804673e-05, "loss": 0.7368, "step": 2529 }, { "epoch": 0.9324610706717037, "grad_norm": 6.0931402439322335, "learning_rate": 4.24928915811596e-05, "loss": 0.7729, "step": 2530 }, { "epoch": 0.9328296323597162, "grad_norm": 7.151278572205474, "learning_rate": 4.248980096427247e-05, "loss": 0.6196, "step": 2531 }, { "epoch": 0.9331981940477287, "grad_norm": 4.833024091803003, "learning_rate": 4.248671034738534e-05, "loss": 0.5555, "step": 2532 }, { "epoch": 0.9335667557357412, "grad_norm": 5.955598252334764, "learning_rate": 4.248361973049821e-05, "loss": 0.7097, "step": 2533 }, { "epoch": 0.9339353174237538, "grad_norm": 5.742788353449026, "learning_rate": 4.248052911361108e-05, "loss": 0.6033, "step": 2534 }, { "epoch": 0.9343038791117664, "grad_norm": 6.294868244128506, "learning_rate": 4.247743849672395e-05, "loss": 0.4826, "step": 2535 }, { "epoch": 0.9346724407997788, "grad_norm": 6.242465551340109, "learning_rate": 4.2474347879836815e-05, "loss": 0.7356, "step": 2536 }, { "epoch": 0.9350410024877914, "grad_norm": 5.989586694688821, "learning_rate": 4.2471257262949686e-05, "loss": 0.6731, "step": 2537 }, { "epoch": 0.9354095641758039, "grad_norm": 6.440920615410769, "learning_rate": 4.246816664606256e-05, "loss": 0.6527, "step": 2538 }, { "epoch": 0.9357781258638165, "grad_norm": 8.75754069604982, "learning_rate": 4.246507602917543e-05, "loss": 0.672, "step": 2539 }, { "epoch": 0.936146687551829, "grad_norm": 10.185953783518777, "learning_rate": 4.24619854122883e-05, "loss": 0.6343, "step": 2540 }, { "epoch": 0.9365152492398415, "grad_norm": 8.620781793036205, "learning_rate": 4.2458894795401164e-05, "loss": 0.7348, "step": 2541 }, { "epoch": 0.936883810927854, "grad_norm": 7.803603821631418, "learning_rate": 4.2455804178514035e-05, "loss": 0.7242, "step": 2542 }, { "epoch": 0.9372523726158666, "grad_norm": 5.87443052737847, "learning_rate": 4.24527135616269e-05, "loss": 0.5737, "step": 2543 }, { "epoch": 0.9376209343038792, "grad_norm": 4.727615213992385, "learning_rate": 4.244962294473977e-05, "loss": 0.505, "step": 2544 }, { "epoch": 0.9379894959918916, "grad_norm": 4.608089151288276, "learning_rate": 4.244653232785264e-05, "loss": 0.5151, "step": 2545 }, { "epoch": 0.9383580576799042, "grad_norm": 7.443509791267512, "learning_rate": 4.2443441710965507e-05, "loss": 0.7811, "step": 2546 }, { "epoch": 0.9387266193679167, "grad_norm": 8.414050712329255, "learning_rate": 4.244035109407838e-05, "loss": 0.7742, "step": 2547 }, { "epoch": 0.9390951810559293, "grad_norm": 6.053155993302011, "learning_rate": 4.243726047719125e-05, "loss": 0.6273, "step": 2548 }, { "epoch": 0.9394637427439417, "grad_norm": 6.14584602311543, "learning_rate": 4.243416986030412e-05, "loss": 0.806, "step": 2549 }, { "epoch": 0.9398323044319543, "grad_norm": 5.085245158569438, "learning_rate": 4.2431079243416985e-05, "loss": 0.5103, "step": 2550 }, { "epoch": 0.9402008661199668, "grad_norm": 5.779033174006663, "learning_rate": 4.2427988626529856e-05, "loss": 0.5986, "step": 2551 }, { "epoch": 0.9405694278079794, "grad_norm": 5.949638086419627, "learning_rate": 4.242489800964273e-05, "loss": 0.6967, "step": 2552 }, { "epoch": 0.9409379894959919, "grad_norm": 7.7041294766139305, "learning_rate": 4.24218073927556e-05, "loss": 0.6092, "step": 2553 }, { "epoch": 0.9413065511840044, "grad_norm": 7.982758539435211, "learning_rate": 4.241871677586847e-05, "loss": 0.7376, "step": 2554 }, { "epoch": 0.941675112872017, "grad_norm": 10.000201413987245, "learning_rate": 4.2415626158981334e-05, "loss": 0.7132, "step": 2555 }, { "epoch": 0.9420436745600295, "grad_norm": 5.752458461460637, "learning_rate": 4.2412535542094205e-05, "loss": 0.6092, "step": 2556 }, { "epoch": 0.942412236248042, "grad_norm": 6.558334627195793, "learning_rate": 4.2409444925207076e-05, "loss": 0.7658, "step": 2557 }, { "epoch": 0.9427807979360545, "grad_norm": 8.103943757143513, "learning_rate": 4.240635430831994e-05, "loss": 0.8525, "step": 2558 }, { "epoch": 0.9431493596240671, "grad_norm": 5.173725454046875, "learning_rate": 4.240326369143281e-05, "loss": 0.7324, "step": 2559 }, { "epoch": 0.9435179213120796, "grad_norm": 7.100469826210776, "learning_rate": 4.2400173074545676e-05, "loss": 0.7631, "step": 2560 }, { "epoch": 0.9438864830000921, "grad_norm": 7.746690597278359, "learning_rate": 4.239708245765855e-05, "loss": 0.8733, "step": 2561 }, { "epoch": 0.9442550446881046, "grad_norm": 8.088207335398286, "learning_rate": 4.239399184077142e-05, "loss": 0.8339, "step": 2562 }, { "epoch": 0.9446236063761172, "grad_norm": 7.359255002090294, "learning_rate": 4.239090122388429e-05, "loss": 0.7442, "step": 2563 }, { "epoch": 0.9449921680641298, "grad_norm": 6.602176145665794, "learning_rate": 4.238781060699716e-05, "loss": 0.7871, "step": 2564 }, { "epoch": 0.9453607297521422, "grad_norm": 8.57983159111156, "learning_rate": 4.2384719990110025e-05, "loss": 0.7918, "step": 2565 }, { "epoch": 0.9457292914401548, "grad_norm": 6.460493485347597, "learning_rate": 4.2381629373222897e-05, "loss": 0.7156, "step": 2566 }, { "epoch": 0.9460978531281673, "grad_norm": 5.395619077614068, "learning_rate": 4.237853875633577e-05, "loss": 0.6235, "step": 2567 }, { "epoch": 0.9464664148161799, "grad_norm": 6.060897772733297, "learning_rate": 4.237544813944864e-05, "loss": 0.6936, "step": 2568 }, { "epoch": 0.9468349765041923, "grad_norm": 4.7118346212538995, "learning_rate": 4.23723575225615e-05, "loss": 0.4755, "step": 2569 }, { "epoch": 0.9472035381922049, "grad_norm": 5.535861576775272, "learning_rate": 4.2369266905674375e-05, "loss": 0.639, "step": 2570 }, { "epoch": 0.9475720998802174, "grad_norm": 5.260635502358235, "learning_rate": 4.2366176288787246e-05, "loss": 0.5056, "step": 2571 }, { "epoch": 0.94794066156823, "grad_norm": 6.027881689782541, "learning_rate": 4.236308567190012e-05, "loss": 0.6128, "step": 2572 }, { "epoch": 0.9483092232562426, "grad_norm": 8.4567790991567, "learning_rate": 4.235999505501299e-05, "loss": 0.8748, "step": 2573 }, { "epoch": 0.948677784944255, "grad_norm": 7.557528288116664, "learning_rate": 4.235690443812585e-05, "loss": 0.9373, "step": 2574 }, { "epoch": 0.9490463466322676, "grad_norm": 4.180352787297925, "learning_rate": 4.235381382123872e-05, "loss": 0.5531, "step": 2575 }, { "epoch": 0.9494149083202801, "grad_norm": 6.2008935099923805, "learning_rate": 4.235072320435159e-05, "loss": 0.8235, "step": 2576 }, { "epoch": 0.9497834700082927, "grad_norm": 6.77457644556807, "learning_rate": 4.234763258746446e-05, "loss": 0.7198, "step": 2577 }, { "epoch": 0.9501520316963051, "grad_norm": 9.012425640079853, "learning_rate": 4.234454197057733e-05, "loss": 0.9441, "step": 2578 }, { "epoch": 0.9505205933843177, "grad_norm": 6.880436083258152, "learning_rate": 4.2341451353690195e-05, "loss": 0.6606, "step": 2579 }, { "epoch": 0.9508891550723302, "grad_norm": 9.879598198878792, "learning_rate": 4.2338360736803066e-05, "loss": 0.8912, "step": 2580 }, { "epoch": 0.9512577167603428, "grad_norm": 7.37264356305865, "learning_rate": 4.233527011991594e-05, "loss": 0.7861, "step": 2581 }, { "epoch": 0.9516262784483553, "grad_norm": 10.298017099857859, "learning_rate": 4.233217950302881e-05, "loss": 0.7731, "step": 2582 }, { "epoch": 0.9519948401363678, "grad_norm": 4.249315318886243, "learning_rate": 4.232908888614168e-05, "loss": 0.3559, "step": 2583 }, { "epoch": 0.9523634018243804, "grad_norm": 7.353073370250105, "learning_rate": 4.2325998269254544e-05, "loss": 0.6397, "step": 2584 }, { "epoch": 0.9527319635123929, "grad_norm": 6.589599460128201, "learning_rate": 4.2322907652367415e-05, "loss": 0.9258, "step": 2585 }, { "epoch": 0.9531005252004054, "grad_norm": 5.981786422033816, "learning_rate": 4.2319817035480286e-05, "loss": 0.7113, "step": 2586 }, { "epoch": 0.9534690868884179, "grad_norm": 6.481726562871952, "learning_rate": 4.231672641859316e-05, "loss": 0.6885, "step": 2587 }, { "epoch": 0.9538376485764305, "grad_norm": 9.04689608293096, "learning_rate": 4.231363580170602e-05, "loss": 0.7788, "step": 2588 }, { "epoch": 0.954206210264443, "grad_norm": 5.74252762598111, "learning_rate": 4.2310545184818887e-05, "loss": 0.8174, "step": 2589 }, { "epoch": 0.9545747719524555, "grad_norm": 6.475199409860744, "learning_rate": 4.230745456793176e-05, "loss": 0.7142, "step": 2590 }, { "epoch": 0.9549433336404681, "grad_norm": 7.878957087249228, "learning_rate": 4.230436395104463e-05, "loss": 0.5939, "step": 2591 }, { "epoch": 0.9553118953284806, "grad_norm": 6.641572772438887, "learning_rate": 4.23012733341575e-05, "loss": 0.785, "step": 2592 }, { "epoch": 0.9556804570164932, "grad_norm": 7.639529034543223, "learning_rate": 4.229818271727037e-05, "loss": 0.7773, "step": 2593 }, { "epoch": 0.9560490187045056, "grad_norm": 6.147935238479111, "learning_rate": 4.2295092100383236e-05, "loss": 0.9386, "step": 2594 }, { "epoch": 0.9564175803925182, "grad_norm": 7.348867118876773, "learning_rate": 4.229200148349611e-05, "loss": 0.6773, "step": 2595 }, { "epoch": 0.9567861420805307, "grad_norm": 6.023160104264336, "learning_rate": 4.228891086660898e-05, "loss": 0.4579, "step": 2596 }, { "epoch": 0.9571547037685433, "grad_norm": 4.085622392396531, "learning_rate": 4.228582024972185e-05, "loss": 0.4878, "step": 2597 }, { "epoch": 0.9575232654565557, "grad_norm": 5.232724422010869, "learning_rate": 4.2282729632834714e-05, "loss": 0.6825, "step": 2598 }, { "epoch": 0.9578918271445683, "grad_norm": 6.429230053783474, "learning_rate": 4.2279639015947585e-05, "loss": 0.7361, "step": 2599 }, { "epoch": 0.9582603888325808, "grad_norm": 8.651492798166034, "learning_rate": 4.2276548399060456e-05, "loss": 0.8428, "step": 2600 }, { "epoch": 0.9586289505205934, "grad_norm": 4.392776993608504, "learning_rate": 4.227345778217333e-05, "loss": 0.5449, "step": 2601 }, { "epoch": 0.958997512208606, "grad_norm": 4.395094364962908, "learning_rate": 4.22703671652862e-05, "loss": 0.5872, "step": 2602 }, { "epoch": 0.9593660738966184, "grad_norm": 7.095773819923774, "learning_rate": 4.226727654839906e-05, "loss": 0.7362, "step": 2603 }, { "epoch": 0.959734635584631, "grad_norm": 7.0290751844904, "learning_rate": 4.226418593151193e-05, "loss": 0.5539, "step": 2604 }, { "epoch": 0.9601031972726435, "grad_norm": 5.282844257682384, "learning_rate": 4.22610953146248e-05, "loss": 0.465, "step": 2605 }, { "epoch": 0.9604717589606561, "grad_norm": 5.9719296918487546, "learning_rate": 4.225800469773767e-05, "loss": 0.7365, "step": 2606 }, { "epoch": 0.9608403206486685, "grad_norm": 9.358658446328675, "learning_rate": 4.225491408085054e-05, "loss": 0.9, "step": 2607 }, { "epoch": 0.9612088823366811, "grad_norm": 6.060934277519123, "learning_rate": 4.2251823463963405e-05, "loss": 0.685, "step": 2608 }, { "epoch": 0.9615774440246936, "grad_norm": 6.481000863476475, "learning_rate": 4.2248732847076276e-05, "loss": 0.9095, "step": 2609 }, { "epoch": 0.9619460057127062, "grad_norm": 5.256943243987689, "learning_rate": 4.224564223018915e-05, "loss": 0.7593, "step": 2610 }, { "epoch": 0.9623145674007187, "grad_norm": 5.032055144718924, "learning_rate": 4.224255161330202e-05, "loss": 0.6069, "step": 2611 }, { "epoch": 0.9626831290887312, "grad_norm": 8.230895608443218, "learning_rate": 4.223946099641489e-05, "loss": 1.0817, "step": 2612 }, { "epoch": 0.9630516907767438, "grad_norm": 5.895945769227518, "learning_rate": 4.2236370379527754e-05, "loss": 0.5842, "step": 2613 }, { "epoch": 0.9634202524647563, "grad_norm": 7.41989951093958, "learning_rate": 4.2233279762640626e-05, "loss": 0.5803, "step": 2614 }, { "epoch": 0.9637888141527688, "grad_norm": 8.540571883839357, "learning_rate": 4.22301891457535e-05, "loss": 0.5478, "step": 2615 }, { "epoch": 0.9641573758407813, "grad_norm": 7.468205515914093, "learning_rate": 4.222709852886637e-05, "loss": 0.6873, "step": 2616 }, { "epoch": 0.9645259375287939, "grad_norm": 5.419277657378222, "learning_rate": 4.222400791197923e-05, "loss": 0.8077, "step": 2617 }, { "epoch": 0.9648944992168064, "grad_norm": 8.120755539494251, "learning_rate": 4.2220917295092104e-05, "loss": 0.607, "step": 2618 }, { "epoch": 0.965263060904819, "grad_norm": 10.69654222751446, "learning_rate": 4.221782667820497e-05, "loss": 0.71, "step": 2619 }, { "epoch": 0.9656316225928315, "grad_norm": 11.847401883809141, "learning_rate": 4.221473606131784e-05, "loss": 0.7714, "step": 2620 }, { "epoch": 0.966000184280844, "grad_norm": 6.3637982855669435, "learning_rate": 4.221164544443071e-05, "loss": 0.6526, "step": 2621 }, { "epoch": 0.9663687459688566, "grad_norm": 7.2557237327496376, "learning_rate": 4.220855482754358e-05, "loss": 0.7699, "step": 2622 }, { "epoch": 0.966737307656869, "grad_norm": 13.493798420948615, "learning_rate": 4.2205464210656446e-05, "loss": 0.6795, "step": 2623 }, { "epoch": 0.9671058693448816, "grad_norm": 7.436605191297991, "learning_rate": 4.220237359376932e-05, "loss": 0.7829, "step": 2624 }, { "epoch": 0.9674744310328941, "grad_norm": 6.291188438716134, "learning_rate": 4.219928297688219e-05, "loss": 0.6379, "step": 2625 }, { "epoch": 0.9678429927209067, "grad_norm": 6.432504437797548, "learning_rate": 4.219619235999506e-05, "loss": 0.8899, "step": 2626 }, { "epoch": 0.9682115544089192, "grad_norm": 5.976541755989326, "learning_rate": 4.2193101743107924e-05, "loss": 0.6032, "step": 2627 }, { "epoch": 0.9685801160969317, "grad_norm": 4.52796090554658, "learning_rate": 4.2190011126220795e-05, "loss": 0.799, "step": 2628 }, { "epoch": 0.9689486777849442, "grad_norm": 4.913606801456178, "learning_rate": 4.2186920509333666e-05, "loss": 0.5867, "step": 2629 }, { "epoch": 0.9693172394729568, "grad_norm": 6.787162488551169, "learning_rate": 4.218382989244654e-05, "loss": 0.6634, "step": 2630 }, { "epoch": 0.9696858011609694, "grad_norm": 5.485634162011155, "learning_rate": 4.218073927555941e-05, "loss": 0.8161, "step": 2631 }, { "epoch": 0.9700543628489818, "grad_norm": 8.154521656707217, "learning_rate": 4.217764865867227e-05, "loss": 0.6496, "step": 2632 }, { "epoch": 0.9704229245369944, "grad_norm": 5.265957151175613, "learning_rate": 4.2174558041785144e-05, "loss": 0.7373, "step": 2633 }, { "epoch": 0.9707914862250069, "grad_norm": 6.10956450263562, "learning_rate": 4.217146742489801e-05, "loss": 0.5111, "step": 2634 }, { "epoch": 0.9711600479130195, "grad_norm": 6.246000307110606, "learning_rate": 4.216837680801088e-05, "loss": 0.547, "step": 2635 }, { "epoch": 0.9715286096010319, "grad_norm": 4.230708826074007, "learning_rate": 4.216528619112375e-05, "loss": 0.3457, "step": 2636 }, { "epoch": 0.9718971712890445, "grad_norm": 6.055241280118342, "learning_rate": 4.2162195574236616e-05, "loss": 0.4827, "step": 2637 }, { "epoch": 0.972265732977057, "grad_norm": 5.517908245102231, "learning_rate": 4.215910495734949e-05, "loss": 0.4994, "step": 2638 }, { "epoch": 0.9726342946650696, "grad_norm": 30.77142034516109, "learning_rate": 4.215601434046236e-05, "loss": 0.8945, "step": 2639 }, { "epoch": 0.9730028563530821, "grad_norm": 7.507622977414096, "learning_rate": 4.215292372357523e-05, "loss": 0.7135, "step": 2640 }, { "epoch": 0.9733714180410946, "grad_norm": 6.601764454658688, "learning_rate": 4.2149833106688094e-05, "loss": 0.6861, "step": 2641 }, { "epoch": 0.9737399797291072, "grad_norm": 5.762490679222397, "learning_rate": 4.2146742489800965e-05, "loss": 0.5124, "step": 2642 }, { "epoch": 0.9741085414171197, "grad_norm": 7.987028573083115, "learning_rate": 4.2143651872913836e-05, "loss": 0.828, "step": 2643 }, { "epoch": 0.9744771031051322, "grad_norm": 5.494348570207164, "learning_rate": 4.214056125602671e-05, "loss": 0.2931, "step": 2644 }, { "epoch": 0.9748456647931447, "grad_norm": 6.991301445255919, "learning_rate": 4.213747063913958e-05, "loss": 0.698, "step": 2645 }, { "epoch": 0.9752142264811573, "grad_norm": 6.073254043167597, "learning_rate": 4.213438002225244e-05, "loss": 0.6451, "step": 2646 }, { "epoch": 0.9755827881691698, "grad_norm": 5.069505342207743, "learning_rate": 4.2131289405365314e-05, "loss": 0.6069, "step": 2647 }, { "epoch": 0.9759513498571823, "grad_norm": 7.119508199529925, "learning_rate": 4.2128198788478185e-05, "loss": 0.8353, "step": 2648 }, { "epoch": 0.9763199115451949, "grad_norm": 6.779286746427179, "learning_rate": 4.212510817159105e-05, "loss": 0.6883, "step": 2649 }, { "epoch": 0.9766884732332074, "grad_norm": 6.331003998837844, "learning_rate": 4.212201755470392e-05, "loss": 0.6487, "step": 2650 }, { "epoch": 0.97705703492122, "grad_norm": 4.9489537915585196, "learning_rate": 4.2118926937816785e-05, "loss": 0.7901, "step": 2651 }, { "epoch": 0.9774255966092324, "grad_norm": 6.7663289115810255, "learning_rate": 4.2115836320929656e-05, "loss": 0.7706, "step": 2652 }, { "epoch": 0.977794158297245, "grad_norm": 6.872538177839347, "learning_rate": 4.211274570404253e-05, "loss": 0.6754, "step": 2653 }, { "epoch": 0.9781627199852575, "grad_norm": 7.599169394629998, "learning_rate": 4.21096550871554e-05, "loss": 0.7214, "step": 2654 }, { "epoch": 0.9785312816732701, "grad_norm": 7.341318840017168, "learning_rate": 4.210656447026827e-05, "loss": 0.5925, "step": 2655 }, { "epoch": 0.9788998433612826, "grad_norm": 8.73063565975797, "learning_rate": 4.2103473853381134e-05, "loss": 0.5372, "step": 2656 }, { "epoch": 0.9792684050492951, "grad_norm": 9.865262796001923, "learning_rate": 4.2100383236494006e-05, "loss": 0.5729, "step": 2657 }, { "epoch": 0.9796369667373076, "grad_norm": 7.7102077645140445, "learning_rate": 4.209729261960688e-05, "loss": 0.8129, "step": 2658 }, { "epoch": 0.9800055284253202, "grad_norm": 7.457379612246673, "learning_rate": 4.209420200271975e-05, "loss": 0.8074, "step": 2659 }, { "epoch": 0.9803740901133328, "grad_norm": 8.137579935973315, "learning_rate": 4.209111138583261e-05, "loss": 0.7523, "step": 2660 }, { "epoch": 0.9807426518013452, "grad_norm": 7.119359510952611, "learning_rate": 4.2088020768945484e-05, "loss": 0.6638, "step": 2661 }, { "epoch": 0.9811112134893578, "grad_norm": 5.661304554843814, "learning_rate": 4.2084930152058355e-05, "loss": 0.5027, "step": 2662 }, { "epoch": 0.9814797751773703, "grad_norm": 5.180193063680345, "learning_rate": 4.2081839535171226e-05, "loss": 0.5285, "step": 2663 }, { "epoch": 0.9818483368653829, "grad_norm": 7.068789761783064, "learning_rate": 4.207874891828409e-05, "loss": 0.6676, "step": 2664 }, { "epoch": 0.9822168985533953, "grad_norm": 6.841564648231975, "learning_rate": 4.207565830139696e-05, "loss": 0.7542, "step": 2665 }, { "epoch": 0.9825854602414079, "grad_norm": 6.846069500270872, "learning_rate": 4.2072567684509826e-05, "loss": 0.6961, "step": 2666 }, { "epoch": 0.9829540219294204, "grad_norm": 6.581922001597058, "learning_rate": 4.20694770676227e-05, "loss": 0.8197, "step": 2667 }, { "epoch": 0.983322583617433, "grad_norm": 7.378254059402393, "learning_rate": 4.206638645073557e-05, "loss": 0.7353, "step": 2668 }, { "epoch": 0.9836911453054455, "grad_norm": 5.0560133078194776, "learning_rate": 4.206329583384844e-05, "loss": 0.5003, "step": 2669 }, { "epoch": 0.984059706993458, "grad_norm": 4.882586322886617, "learning_rate": 4.2060205216961304e-05, "loss": 0.465, "step": 2670 }, { "epoch": 0.9844282686814706, "grad_norm": 7.1998047696241825, "learning_rate": 4.2057114600074175e-05, "loss": 0.6731, "step": 2671 }, { "epoch": 0.9847968303694831, "grad_norm": 5.21580047403665, "learning_rate": 4.2054023983187046e-05, "loss": 0.6234, "step": 2672 }, { "epoch": 0.9851653920574956, "grad_norm": 8.075954359601061, "learning_rate": 4.205093336629992e-05, "loss": 0.7621, "step": 2673 }, { "epoch": 0.9855339537455081, "grad_norm": 8.568214903970663, "learning_rate": 4.204784274941279e-05, "loss": 0.9015, "step": 2674 }, { "epoch": 0.9859025154335207, "grad_norm": 22.42274606321949, "learning_rate": 4.204475213252565e-05, "loss": 0.7842, "step": 2675 }, { "epoch": 0.9862710771215332, "grad_norm": 5.984154759430479, "learning_rate": 4.2041661515638524e-05, "loss": 0.5869, "step": 2676 }, { "epoch": 0.9866396388095457, "grad_norm": 9.041588380067427, "learning_rate": 4.2038570898751396e-05, "loss": 0.6882, "step": 2677 }, { "epoch": 0.9870082004975583, "grad_norm": 8.030557446732663, "learning_rate": 4.203548028186427e-05, "loss": 0.7154, "step": 2678 }, { "epoch": 0.9873767621855708, "grad_norm": 5.8784766765812195, "learning_rate": 4.203238966497713e-05, "loss": 0.7405, "step": 2679 }, { "epoch": 0.9877453238735834, "grad_norm": 12.478393802475079, "learning_rate": 4.2029299048089996e-05, "loss": 0.6334, "step": 2680 }, { "epoch": 0.9881138855615959, "grad_norm": 6.805461642423966, "learning_rate": 4.202620843120287e-05, "loss": 0.7194, "step": 2681 }, { "epoch": 0.9884824472496084, "grad_norm": 8.49543100863405, "learning_rate": 4.202311781431574e-05, "loss": 0.4884, "step": 2682 }, { "epoch": 0.9888510089376209, "grad_norm": 6.629842463904509, "learning_rate": 4.202002719742861e-05, "loss": 0.7655, "step": 2683 }, { "epoch": 0.9892195706256335, "grad_norm": 6.832262125798104, "learning_rate": 4.201693658054148e-05, "loss": 0.6176, "step": 2684 }, { "epoch": 0.989588132313646, "grad_norm": 6.473631265674314, "learning_rate": 4.2013845963654345e-05, "loss": 0.6105, "step": 2685 }, { "epoch": 0.9899566940016585, "grad_norm": 6.708759776080397, "learning_rate": 4.2010755346767216e-05, "loss": 0.7568, "step": 2686 }, { "epoch": 0.9903252556896711, "grad_norm": 6.268670233906458, "learning_rate": 4.200766472988009e-05, "loss": 0.8136, "step": 2687 }, { "epoch": 0.9906938173776836, "grad_norm": 8.322154010560505, "learning_rate": 4.200457411299296e-05, "loss": 0.766, "step": 2688 }, { "epoch": 0.9910623790656962, "grad_norm": 5.971050358422084, "learning_rate": 4.200148349610582e-05, "loss": 0.7639, "step": 2689 }, { "epoch": 0.9914309407537086, "grad_norm": 5.966542419901691, "learning_rate": 4.1998392879218694e-05, "loss": 0.5587, "step": 2690 }, { "epoch": 0.9917995024417212, "grad_norm": 7.0130996746887275, "learning_rate": 4.1995302262331565e-05, "loss": 0.782, "step": 2691 }, { "epoch": 0.9921680641297337, "grad_norm": 6.4359508242912575, "learning_rate": 4.1992211645444436e-05, "loss": 0.5831, "step": 2692 }, { "epoch": 0.9925366258177463, "grad_norm": 7.128496081950305, "learning_rate": 4.198912102855731e-05, "loss": 0.8206, "step": 2693 }, { "epoch": 0.9929051875057587, "grad_norm": 6.834229139211392, "learning_rate": 4.198603041167017e-05, "loss": 0.658, "step": 2694 }, { "epoch": 0.9932737491937713, "grad_norm": 5.362934320819056, "learning_rate": 4.1982939794783036e-05, "loss": 0.6807, "step": 2695 }, { "epoch": 0.9936423108817838, "grad_norm": 7.97737665906504, "learning_rate": 4.197984917789591e-05, "loss": 0.7072, "step": 2696 }, { "epoch": 0.9940108725697964, "grad_norm": 7.396621303937845, "learning_rate": 4.197675856100878e-05, "loss": 0.7644, "step": 2697 }, { "epoch": 0.994379434257809, "grad_norm": 7.798185616768443, "learning_rate": 4.197366794412165e-05, "loss": 0.691, "step": 2698 }, { "epoch": 0.9947479959458214, "grad_norm": 5.964425162846237, "learning_rate": 4.1970577327234514e-05, "loss": 0.5192, "step": 2699 }, { "epoch": 0.995116557633834, "grad_norm": 4.319742948160642, "learning_rate": 4.1967486710347386e-05, "loss": 0.5203, "step": 2700 }, { "epoch": 0.9954851193218465, "grad_norm": 6.540728394428827, "learning_rate": 4.196439609346026e-05, "loss": 0.5173, "step": 2701 }, { "epoch": 0.995853681009859, "grad_norm": 5.836064307942923, "learning_rate": 4.196130547657313e-05, "loss": 0.5563, "step": 2702 }, { "epoch": 0.9962222426978715, "grad_norm": 12.061657495674828, "learning_rate": 4.1958214859686e-05, "loss": 0.89, "step": 2703 }, { "epoch": 0.9965908043858841, "grad_norm": 5.055502305548022, "learning_rate": 4.1955124242798864e-05, "loss": 0.4804, "step": 2704 }, { "epoch": 0.9969593660738966, "grad_norm": 6.878900427222834, "learning_rate": 4.1952033625911735e-05, "loss": 0.6147, "step": 2705 }, { "epoch": 0.9973279277619091, "grad_norm": 5.3761446421556505, "learning_rate": 4.1948943009024606e-05, "loss": 0.7663, "step": 2706 }, { "epoch": 0.9976964894499217, "grad_norm": 7.359415430820469, "learning_rate": 4.194585239213748e-05, "loss": 0.7149, "step": 2707 }, { "epoch": 0.9980650511379342, "grad_norm": 8.639811306764974, "learning_rate": 4.194276177525034e-05, "loss": 0.7232, "step": 2708 }, { "epoch": 0.9984336128259468, "grad_norm": 8.0479372971445, "learning_rate": 4.193967115836321e-05, "loss": 1.172, "step": 2709 }, { "epoch": 0.9988021745139593, "grad_norm": 6.417430815671158, "learning_rate": 4.193658054147608e-05, "loss": 0.4899, "step": 2710 }, { "epoch": 0.9991707362019718, "grad_norm": 6.338639027943025, "learning_rate": 4.193348992458895e-05, "loss": 0.6365, "step": 2711 }, { "epoch": 0.9995392978899843, "grad_norm": 9.000245832688579, "learning_rate": 4.193039930770182e-05, "loss": 0.8583, "step": 2712 }, { "epoch": 0.9999078595779969, "grad_norm": 8.909335488693737, "learning_rate": 4.1927308690814684e-05, "loss": 0.7897, "step": 2713 }, { "epoch": 1.0002764212660094, "grad_norm": 4.388971309183057, "learning_rate": 4.1924218073927555e-05, "loss": 0.5781, "step": 2714 }, { "epoch": 1.0006449829540218, "grad_norm": 4.7859314582530015, "learning_rate": 4.1921127457040426e-05, "loss": 0.4181, "step": 2715 }, { "epoch": 1.0010135446420345, "grad_norm": 6.528818070146112, "learning_rate": 4.19180368401533e-05, "loss": 0.5611, "step": 2716 }, { "epoch": 1.001382106330047, "grad_norm": 4.85480954520513, "learning_rate": 4.191494622326617e-05, "loss": 0.5771, "step": 2717 }, { "epoch": 1.0017506680180595, "grad_norm": 4.7299334650824045, "learning_rate": 4.191185560637903e-05, "loss": 0.5477, "step": 2718 }, { "epoch": 1.0021192297060721, "grad_norm": 6.040089508808132, "learning_rate": 4.1908764989491904e-05, "loss": 0.5352, "step": 2719 }, { "epoch": 1.0024877913940846, "grad_norm": 5.234298659949378, "learning_rate": 4.1905674372604776e-05, "loss": 0.4543, "step": 2720 }, { "epoch": 1.002856353082097, "grad_norm": 7.691981552252664, "learning_rate": 4.190258375571765e-05, "loss": 0.4884, "step": 2721 }, { "epoch": 1.0032249147701096, "grad_norm": 3.243695232539068, "learning_rate": 4.189949313883052e-05, "loss": 0.383, "step": 2722 }, { "epoch": 1.0035934764581222, "grad_norm": 5.423016430756222, "learning_rate": 4.189640252194338e-05, "loss": 0.5016, "step": 2723 }, { "epoch": 1.0039620381461347, "grad_norm": 3.687252488962262, "learning_rate": 4.1893311905056253e-05, "loss": 0.3285, "step": 2724 }, { "epoch": 1.0043305998341472, "grad_norm": 5.65052328471561, "learning_rate": 4.189022128816912e-05, "loss": 0.6486, "step": 2725 }, { "epoch": 1.0046991615221599, "grad_norm": 5.236347430377505, "learning_rate": 4.188713067128199e-05, "loss": 0.4657, "step": 2726 }, { "epoch": 1.0050677232101723, "grad_norm": 4.84310140881889, "learning_rate": 4.188404005439486e-05, "loss": 0.4513, "step": 2727 }, { "epoch": 1.0054362848981848, "grad_norm": 5.776412672184614, "learning_rate": 4.1880949437507725e-05, "loss": 0.4484, "step": 2728 }, { "epoch": 1.0058048465861973, "grad_norm": 5.993013128624305, "learning_rate": 4.1877858820620596e-05, "loss": 0.3713, "step": 2729 }, { "epoch": 1.00617340827421, "grad_norm": 6.36935683663988, "learning_rate": 4.187476820373347e-05, "loss": 0.509, "step": 2730 }, { "epoch": 1.0065419699622224, "grad_norm": 9.93194463183932, "learning_rate": 4.187167758684634e-05, "loss": 0.6782, "step": 2731 }, { "epoch": 1.006910531650235, "grad_norm": 7.382608142426302, "learning_rate": 4.18685869699592e-05, "loss": 0.4485, "step": 2732 }, { "epoch": 1.0072790933382474, "grad_norm": 5.852376680048524, "learning_rate": 4.1865496353072074e-05, "loss": 0.418, "step": 2733 }, { "epoch": 1.00764765502626, "grad_norm": 19.328284470340513, "learning_rate": 4.1862405736184945e-05, "loss": 0.6128, "step": 2734 }, { "epoch": 1.0080162167142726, "grad_norm": 3.8861273694253105, "learning_rate": 4.1859315119297816e-05, "loss": 0.3085, "step": 2735 }, { "epoch": 1.008384778402285, "grad_norm": 6.855406984230301, "learning_rate": 4.185622450241069e-05, "loss": 0.4669, "step": 2736 }, { "epoch": 1.0087533400902977, "grad_norm": 5.570409230894422, "learning_rate": 4.185313388552355e-05, "loss": 0.4314, "step": 2737 }, { "epoch": 1.0091219017783102, "grad_norm": 5.071195504956134, "learning_rate": 4.185004326863642e-05, "loss": 0.4448, "step": 2738 }, { "epoch": 1.0094904634663227, "grad_norm": 5.056124216219747, "learning_rate": 4.1846952651749294e-05, "loss": 0.2984, "step": 2739 }, { "epoch": 1.0098590251543351, "grad_norm": 4.613253135498582, "learning_rate": 4.184386203486216e-05, "loss": 0.3044, "step": 2740 }, { "epoch": 1.0102275868423478, "grad_norm": 6.343200359116716, "learning_rate": 4.184077141797503e-05, "loss": 0.5207, "step": 2741 }, { "epoch": 1.0105961485303603, "grad_norm": 5.130907515179093, "learning_rate": 4.1837680801087894e-05, "loss": 0.6518, "step": 2742 }, { "epoch": 1.0109647102183728, "grad_norm": 8.202539971698076, "learning_rate": 4.1834590184200765e-05, "loss": 0.409, "step": 2743 }, { "epoch": 1.0113332719063852, "grad_norm": 3.9392054663879743, "learning_rate": 4.183149956731364e-05, "loss": 0.424, "step": 2744 }, { "epoch": 1.011701833594398, "grad_norm": 7.540219612464827, "learning_rate": 4.182840895042651e-05, "loss": 0.4122, "step": 2745 }, { "epoch": 1.0120703952824104, "grad_norm": 6.618917876249792, "learning_rate": 4.182531833353938e-05, "loss": 0.5492, "step": 2746 }, { "epoch": 1.0124389569704229, "grad_norm": 6.155078137395285, "learning_rate": 4.1822227716652243e-05, "loss": 0.726, "step": 2747 }, { "epoch": 1.0128075186584355, "grad_norm": 5.670157012111057, "learning_rate": 4.1819137099765115e-05, "loss": 0.4189, "step": 2748 }, { "epoch": 1.013176080346448, "grad_norm": 5.195368670396538, "learning_rate": 4.1816046482877986e-05, "loss": 0.5637, "step": 2749 }, { "epoch": 1.0135446420344605, "grad_norm": 4.808002897353735, "learning_rate": 4.181295586599086e-05, "loss": 0.534, "step": 2750 }, { "epoch": 1.013913203722473, "grad_norm": 5.328878489266681, "learning_rate": 4.180986524910372e-05, "loss": 0.3572, "step": 2751 }, { "epoch": 1.0142817654104856, "grad_norm": 5.654129558189396, "learning_rate": 4.180677463221659e-05, "loss": 0.4226, "step": 2752 }, { "epoch": 1.0146503270984981, "grad_norm": 7.648565928361868, "learning_rate": 4.1803684015329464e-05, "loss": 0.6325, "step": 2753 }, { "epoch": 1.0150188887865106, "grad_norm": 6.071360611735591, "learning_rate": 4.1800593398442335e-05, "loss": 0.4196, "step": 2754 }, { "epoch": 1.0153874504745233, "grad_norm": 4.90981673324004, "learning_rate": 4.17975027815552e-05, "loss": 0.5738, "step": 2755 }, { "epoch": 1.0157560121625357, "grad_norm": 7.025774777949315, "learning_rate": 4.179441216466807e-05, "loss": 0.534, "step": 2756 }, { "epoch": 1.0161245738505482, "grad_norm": 6.1927378669549995, "learning_rate": 4.1791321547780935e-05, "loss": 0.5878, "step": 2757 }, { "epoch": 1.0164931355385607, "grad_norm": 7.667293384754518, "learning_rate": 4.1788230930893806e-05, "loss": 0.4998, "step": 2758 }, { "epoch": 1.0168616972265734, "grad_norm": 4.209132782693916, "learning_rate": 4.178514031400668e-05, "loss": 0.4423, "step": 2759 }, { "epoch": 1.0172302589145858, "grad_norm": 5.851288365053107, "learning_rate": 4.178204969711955e-05, "loss": 0.3917, "step": 2760 }, { "epoch": 1.0175988206025983, "grad_norm": 8.270080173385185, "learning_rate": 4.177895908023241e-05, "loss": 0.5499, "step": 2761 }, { "epoch": 1.0179673822906108, "grad_norm": 4.557210704265952, "learning_rate": 4.1775868463345284e-05, "loss": 0.6111, "step": 2762 }, { "epoch": 1.0183359439786235, "grad_norm": 6.9056470789035505, "learning_rate": 4.1772777846458155e-05, "loss": 0.5235, "step": 2763 }, { "epoch": 1.018704505666636, "grad_norm": 4.685106810957262, "learning_rate": 4.176968722957103e-05, "loss": 0.3982, "step": 2764 }, { "epoch": 1.0190730673546484, "grad_norm": 4.646098085166971, "learning_rate": 4.17665966126839e-05, "loss": 0.3072, "step": 2765 }, { "epoch": 1.0194416290426611, "grad_norm": 4.038012606082944, "learning_rate": 4.176350599579676e-05, "loss": 0.2653, "step": 2766 }, { "epoch": 1.0198101907306736, "grad_norm": 4.729124273482667, "learning_rate": 4.1760415378909633e-05, "loss": 0.3103, "step": 2767 }, { "epoch": 1.020178752418686, "grad_norm": 4.576368808782682, "learning_rate": 4.1757324762022505e-05, "loss": 0.2496, "step": 2768 }, { "epoch": 1.0205473141066985, "grad_norm": 6.681945775971797, "learning_rate": 4.1754234145135376e-05, "loss": 0.3699, "step": 2769 }, { "epoch": 1.0209158757947112, "grad_norm": 5.098206612412035, "learning_rate": 4.175114352824824e-05, "loss": 0.3211, "step": 2770 }, { "epoch": 1.0212844374827237, "grad_norm": 10.244490468585418, "learning_rate": 4.1748052911361105e-05, "loss": 0.311, "step": 2771 }, { "epoch": 1.0216529991707362, "grad_norm": 7.883814404939604, "learning_rate": 4.1744962294473976e-05, "loss": 0.5184, "step": 2772 }, { "epoch": 1.0220215608587488, "grad_norm": 4.70963442669444, "learning_rate": 4.174187167758685e-05, "loss": 0.2934, "step": 2773 }, { "epoch": 1.0223901225467613, "grad_norm": 6.035595529118339, "learning_rate": 4.173878106069972e-05, "loss": 0.6524, "step": 2774 }, { "epoch": 1.0227586842347738, "grad_norm": 6.227691509743185, "learning_rate": 4.173569044381259e-05, "loss": 0.42, "step": 2775 }, { "epoch": 1.0231272459227863, "grad_norm": 4.367715929847963, "learning_rate": 4.1732599826925454e-05, "loss": 0.3811, "step": 2776 }, { "epoch": 1.023495807610799, "grad_norm": 7.5340044358465486, "learning_rate": 4.1729509210038325e-05, "loss": 0.3642, "step": 2777 }, { "epoch": 1.0238643692988114, "grad_norm": 6.278947246778876, "learning_rate": 4.1726418593151196e-05, "loss": 0.3157, "step": 2778 }, { "epoch": 1.0242329309868239, "grad_norm": 4.871716298178847, "learning_rate": 4.172332797626407e-05, "loss": 0.4946, "step": 2779 }, { "epoch": 1.0246014926748364, "grad_norm": 5.720672117639593, "learning_rate": 4.172023735937693e-05, "loss": 0.3648, "step": 2780 }, { "epoch": 1.024970054362849, "grad_norm": 7.854736901046907, "learning_rate": 4.17171467424898e-05, "loss": 0.6171, "step": 2781 }, { "epoch": 1.0253386160508615, "grad_norm": 5.154806038938057, "learning_rate": 4.1714056125602674e-05, "loss": 0.3152, "step": 2782 }, { "epoch": 1.025707177738874, "grad_norm": 6.385152175511823, "learning_rate": 4.1710965508715545e-05, "loss": 0.4003, "step": 2783 }, { "epoch": 1.0260757394268867, "grad_norm": 8.52792796756175, "learning_rate": 4.1707874891828417e-05, "loss": 0.4583, "step": 2784 }, { "epoch": 1.0264443011148991, "grad_norm": 7.045639572622537, "learning_rate": 4.1704784274941274e-05, "loss": 0.4762, "step": 2785 }, { "epoch": 1.0268128628029116, "grad_norm": 5.034284920244694, "learning_rate": 4.1701693658054145e-05, "loss": 0.3092, "step": 2786 }, { "epoch": 1.027181424490924, "grad_norm": 6.163335914480612, "learning_rate": 4.169860304116702e-05, "loss": 0.3731, "step": 2787 }, { "epoch": 1.0275499861789368, "grad_norm": 6.710533334198894, "learning_rate": 4.169551242427989e-05, "loss": 0.333, "step": 2788 }, { "epoch": 1.0279185478669493, "grad_norm": 4.8845424669762405, "learning_rate": 4.169242180739276e-05, "loss": 0.3695, "step": 2789 }, { "epoch": 1.0282871095549617, "grad_norm": 3.6774410833829725, "learning_rate": 4.1689331190505623e-05, "loss": 0.1904, "step": 2790 }, { "epoch": 1.0286556712429742, "grad_norm": 5.096324995292353, "learning_rate": 4.1686240573618495e-05, "loss": 0.2899, "step": 2791 }, { "epoch": 1.0290242329309869, "grad_norm": 7.849709484774665, "learning_rate": 4.1683149956731366e-05, "loss": 0.5154, "step": 2792 }, { "epoch": 1.0293927946189994, "grad_norm": 5.9857603217668744, "learning_rate": 4.168005933984424e-05, "loss": 0.4232, "step": 2793 }, { "epoch": 1.0297613563070118, "grad_norm": 6.862420798367525, "learning_rate": 4.167696872295711e-05, "loss": 0.5422, "step": 2794 }, { "epoch": 1.0301299179950245, "grad_norm": 7.733175412023511, "learning_rate": 4.167387810606997e-05, "loss": 0.5881, "step": 2795 }, { "epoch": 1.030498479683037, "grad_norm": 4.7578100946730375, "learning_rate": 4.1670787489182844e-05, "loss": 0.2561, "step": 2796 }, { "epoch": 1.0308670413710495, "grad_norm": 7.211632592785098, "learning_rate": 4.1667696872295715e-05, "loss": 0.4526, "step": 2797 }, { "epoch": 1.031235603059062, "grad_norm": 4.797352124072586, "learning_rate": 4.1664606255408586e-05, "loss": 0.3533, "step": 2798 }, { "epoch": 1.0316041647470746, "grad_norm": 7.452620835116444, "learning_rate": 4.166151563852145e-05, "loss": 0.4298, "step": 2799 }, { "epoch": 1.031972726435087, "grad_norm": 15.432258488334792, "learning_rate": 4.1658425021634315e-05, "loss": 0.3171, "step": 2800 }, { "epoch": 1.0323412881230996, "grad_norm": 7.269822481236485, "learning_rate": 4.1655334404747186e-05, "loss": 0.4572, "step": 2801 }, { "epoch": 1.0327098498111122, "grad_norm": 5.015665309660839, "learning_rate": 4.165224378786006e-05, "loss": 0.3983, "step": 2802 }, { "epoch": 1.0330784114991247, "grad_norm": 5.048306383220732, "learning_rate": 4.164915317097293e-05, "loss": 0.3809, "step": 2803 }, { "epoch": 1.0334469731871372, "grad_norm": 9.216217624240613, "learning_rate": 4.164606255408579e-05, "loss": 0.4792, "step": 2804 }, { "epoch": 1.0338155348751497, "grad_norm": 4.797823635076719, "learning_rate": 4.1642971937198664e-05, "loss": 0.4595, "step": 2805 }, { "epoch": 1.0341840965631623, "grad_norm": 6.718316210560361, "learning_rate": 4.1639881320311535e-05, "loss": 0.3, "step": 2806 }, { "epoch": 1.0345526582511748, "grad_norm": 4.955777874812633, "learning_rate": 4.1636790703424407e-05, "loss": 0.4835, "step": 2807 }, { "epoch": 1.0349212199391873, "grad_norm": 7.189650769407034, "learning_rate": 4.163370008653728e-05, "loss": 0.3331, "step": 2808 }, { "epoch": 1.0352897816271998, "grad_norm": 4.809411581295656, "learning_rate": 4.163060946965014e-05, "loss": 0.4141, "step": 2809 }, { "epoch": 1.0356583433152124, "grad_norm": 5.947230667487573, "learning_rate": 4.162751885276301e-05, "loss": 0.4263, "step": 2810 }, { "epoch": 1.036026905003225, "grad_norm": 38.29115943171478, "learning_rate": 4.1624428235875885e-05, "loss": 0.3512, "step": 2811 }, { "epoch": 1.0363954666912374, "grad_norm": 10.973048397673972, "learning_rate": 4.1621337618988756e-05, "loss": 0.4458, "step": 2812 }, { "epoch": 1.03676402837925, "grad_norm": 6.23120832209294, "learning_rate": 4.161824700210162e-05, "loss": 0.4085, "step": 2813 }, { "epoch": 1.0371325900672625, "grad_norm": 13.173572001077693, "learning_rate": 4.161515638521449e-05, "loss": 0.3884, "step": 2814 }, { "epoch": 1.037501151755275, "grad_norm": 9.903151162842137, "learning_rate": 4.1612065768327356e-05, "loss": 0.7147, "step": 2815 }, { "epoch": 1.0378697134432875, "grad_norm": 4.646972014922266, "learning_rate": 4.160897515144023e-05, "loss": 0.2398, "step": 2816 }, { "epoch": 1.0382382751313002, "grad_norm": 5.969780158974397, "learning_rate": 4.16058845345531e-05, "loss": 0.6647, "step": 2817 }, { "epoch": 1.0386068368193127, "grad_norm": 4.6566002221215115, "learning_rate": 4.160279391766597e-05, "loss": 0.3741, "step": 2818 }, { "epoch": 1.0389753985073251, "grad_norm": 4.555394741000816, "learning_rate": 4.1599703300778834e-05, "loss": 0.1882, "step": 2819 }, { "epoch": 1.0393439601953376, "grad_norm": 8.869118044248397, "learning_rate": 4.1596612683891705e-05, "loss": 0.4418, "step": 2820 }, { "epoch": 1.0397125218833503, "grad_norm": 5.970067063662594, "learning_rate": 4.1593522067004576e-05, "loss": 0.5275, "step": 2821 }, { "epoch": 1.0400810835713628, "grad_norm": 9.952985490093695, "learning_rate": 4.159043145011745e-05, "loss": 0.477, "step": 2822 }, { "epoch": 1.0404496452593752, "grad_norm": 5.767392598290193, "learning_rate": 4.158734083323031e-05, "loss": 0.4354, "step": 2823 }, { "epoch": 1.040818206947388, "grad_norm": 5.7794354864671975, "learning_rate": 4.158425021634318e-05, "loss": 0.4625, "step": 2824 }, { "epoch": 1.0411867686354004, "grad_norm": 5.286160144633062, "learning_rate": 4.1581159599456054e-05, "loss": 0.4531, "step": 2825 }, { "epoch": 1.0415553303234129, "grad_norm": 4.549444252699757, "learning_rate": 4.1578068982568925e-05, "loss": 0.5193, "step": 2826 }, { "epoch": 1.0419238920114253, "grad_norm": 8.192018616088934, "learning_rate": 4.1574978365681797e-05, "loss": 0.5356, "step": 2827 }, { "epoch": 1.042292453699438, "grad_norm": 4.769893148253109, "learning_rate": 4.157188774879466e-05, "loss": 0.4133, "step": 2828 }, { "epoch": 1.0426610153874505, "grad_norm": 4.157541174853891, "learning_rate": 4.156879713190753e-05, "loss": 0.4503, "step": 2829 }, { "epoch": 1.043029577075463, "grad_norm": 7.955701968848423, "learning_rate": 4.15657065150204e-05, "loss": 0.6432, "step": 2830 }, { "epoch": 1.0433981387634756, "grad_norm": 12.261467373578103, "learning_rate": 4.156261589813327e-05, "loss": 0.4385, "step": 2831 }, { "epoch": 1.0437667004514881, "grad_norm": 4.1908747041796435, "learning_rate": 4.155952528124614e-05, "loss": 0.3891, "step": 2832 }, { "epoch": 1.0441352621395006, "grad_norm": 5.1865998371856055, "learning_rate": 4.1556434664359e-05, "loss": 0.2241, "step": 2833 }, { "epoch": 1.044503823827513, "grad_norm": 6.583958881773547, "learning_rate": 4.1553344047471875e-05, "loss": 0.2802, "step": 2834 }, { "epoch": 1.0448723855155257, "grad_norm": 6.003726120122357, "learning_rate": 4.1550253430584746e-05, "loss": 0.3856, "step": 2835 }, { "epoch": 1.0452409472035382, "grad_norm": 6.846272878840207, "learning_rate": 4.154716281369762e-05, "loss": 0.279, "step": 2836 }, { "epoch": 1.0456095088915507, "grad_norm": 10.576791699000362, "learning_rate": 4.154407219681049e-05, "loss": 0.6024, "step": 2837 }, { "epoch": 1.0459780705795632, "grad_norm": 6.528231420603672, "learning_rate": 4.154098157992335e-05, "loss": 0.399, "step": 2838 }, { "epoch": 1.0463466322675758, "grad_norm": 4.0956807715770545, "learning_rate": 4.1537890963036224e-05, "loss": 0.4104, "step": 2839 }, { "epoch": 1.0467151939555883, "grad_norm": 6.061721909132925, "learning_rate": 4.1534800346149095e-05, "loss": 0.3785, "step": 2840 }, { "epoch": 1.0470837556436008, "grad_norm": 6.204333901014148, "learning_rate": 4.1531709729261966e-05, "loss": 0.3478, "step": 2841 }, { "epoch": 1.0474523173316135, "grad_norm": 6.284006301633903, "learning_rate": 4.152861911237483e-05, "loss": 0.3188, "step": 2842 }, { "epoch": 1.047820879019626, "grad_norm": 6.825442278159011, "learning_rate": 4.15255284954877e-05, "loss": 0.5876, "step": 2843 }, { "epoch": 1.0481894407076384, "grad_norm": 5.1205421226310275, "learning_rate": 4.152243787860057e-05, "loss": 0.3293, "step": 2844 }, { "epoch": 1.048558002395651, "grad_norm": 5.259360370400311, "learning_rate": 4.1519347261713444e-05, "loss": 0.5447, "step": 2845 }, { "epoch": 1.0489265640836636, "grad_norm": 12.575089404658572, "learning_rate": 4.151625664482631e-05, "loss": 0.3502, "step": 2846 }, { "epoch": 1.049295125771676, "grad_norm": 9.464508415953999, "learning_rate": 4.151316602793918e-05, "loss": 0.4726, "step": 2847 }, { "epoch": 1.0496636874596885, "grad_norm": 6.33183485005408, "learning_rate": 4.1510075411052044e-05, "loss": 0.5297, "step": 2848 }, { "epoch": 1.050032249147701, "grad_norm": 6.397892557303904, "learning_rate": 4.1506984794164915e-05, "loss": 0.4953, "step": 2849 }, { "epoch": 1.0504008108357137, "grad_norm": 5.133317292588838, "learning_rate": 4.1503894177277787e-05, "loss": 0.3347, "step": 2850 }, { "epoch": 1.0507693725237262, "grad_norm": 7.876743184268354, "learning_rate": 4.150080356039066e-05, "loss": 0.3857, "step": 2851 }, { "epoch": 1.0511379342117386, "grad_norm": 7.341172045854527, "learning_rate": 4.149771294350352e-05, "loss": 0.4436, "step": 2852 }, { "epoch": 1.0515064958997513, "grad_norm": 5.611798201355282, "learning_rate": 4.149462232661639e-05, "loss": 0.3748, "step": 2853 }, { "epoch": 1.0518750575877638, "grad_norm": 5.82405238486405, "learning_rate": 4.1491531709729265e-05, "loss": 0.4273, "step": 2854 }, { "epoch": 1.0522436192757763, "grad_norm": 3.9042805093610227, "learning_rate": 4.1488441092842136e-05, "loss": 0.2957, "step": 2855 }, { "epoch": 1.0526121809637887, "grad_norm": 7.081454876918169, "learning_rate": 4.148535047595501e-05, "loss": 0.5068, "step": 2856 }, { "epoch": 1.0529807426518014, "grad_norm": 7.081744954655117, "learning_rate": 4.148225985906787e-05, "loss": 0.4399, "step": 2857 }, { "epoch": 1.0533493043398139, "grad_norm": 6.632886978288417, "learning_rate": 4.147916924218074e-05, "loss": 0.3655, "step": 2858 }, { "epoch": 1.0537178660278264, "grad_norm": 5.959397583285081, "learning_rate": 4.1476078625293614e-05, "loss": 0.2875, "step": 2859 }, { "epoch": 1.054086427715839, "grad_norm": 5.202537892985894, "learning_rate": 4.1472988008406485e-05, "loss": 0.3475, "step": 2860 }, { "epoch": 1.0544549894038515, "grad_norm": 23.210385280371398, "learning_rate": 4.146989739151935e-05, "loss": 0.5518, "step": 2861 }, { "epoch": 1.054823551091864, "grad_norm": 5.140292913499411, "learning_rate": 4.1466806774632214e-05, "loss": 0.511, "step": 2862 }, { "epoch": 1.0551921127798765, "grad_norm": 3.761440627293134, "learning_rate": 4.1463716157745085e-05, "loss": 0.4099, "step": 2863 }, { "epoch": 1.0555606744678891, "grad_norm": 6.561735272128413, "learning_rate": 4.1460625540857956e-05, "loss": 0.5264, "step": 2864 }, { "epoch": 1.0559292361559016, "grad_norm": 5.354678039275273, "learning_rate": 4.145753492397083e-05, "loss": 0.3676, "step": 2865 }, { "epoch": 1.056297797843914, "grad_norm": 3.476978611421449, "learning_rate": 4.14544443070837e-05, "loss": 0.3435, "step": 2866 }, { "epoch": 1.0566663595319266, "grad_norm": 6.294553418669073, "learning_rate": 4.145135369019656e-05, "loss": 0.5426, "step": 2867 }, { "epoch": 1.0570349212199392, "grad_norm": 5.9924204953887115, "learning_rate": 4.1448263073309434e-05, "loss": 0.467, "step": 2868 }, { "epoch": 1.0574034829079517, "grad_norm": 5.2317193881502915, "learning_rate": 4.1445172456422305e-05, "loss": 0.5168, "step": 2869 }, { "epoch": 1.0577720445959642, "grad_norm": 4.025146595435512, "learning_rate": 4.1442081839535176e-05, "loss": 0.2694, "step": 2870 }, { "epoch": 1.0581406062839769, "grad_norm": 4.2434799445496285, "learning_rate": 4.143899122264804e-05, "loss": 0.2967, "step": 2871 }, { "epoch": 1.0585091679719894, "grad_norm": 5.8447121220842915, "learning_rate": 4.143590060576091e-05, "loss": 0.4275, "step": 2872 }, { "epoch": 1.0588777296600018, "grad_norm": 7.174741882572708, "learning_rate": 4.143280998887378e-05, "loss": 0.4894, "step": 2873 }, { "epoch": 1.0592462913480143, "grad_norm": 4.675056522456084, "learning_rate": 4.1429719371986654e-05, "loss": 0.4751, "step": 2874 }, { "epoch": 1.059614853036027, "grad_norm": 5.715355371087809, "learning_rate": 4.1426628755099526e-05, "loss": 0.3848, "step": 2875 }, { "epoch": 1.0599834147240395, "grad_norm": 9.552754141834486, "learning_rate": 4.142353813821238e-05, "loss": 0.4714, "step": 2876 }, { "epoch": 1.060351976412052, "grad_norm": 47.71956091730754, "learning_rate": 4.1420447521325254e-05, "loss": 0.4554, "step": 2877 }, { "epoch": 1.0607205381000644, "grad_norm": 4.98045726101616, "learning_rate": 4.1417356904438126e-05, "loss": 0.3955, "step": 2878 }, { "epoch": 1.061089099788077, "grad_norm": 10.37229571660089, "learning_rate": 4.1414266287551e-05, "loss": 0.4527, "step": 2879 }, { "epoch": 1.0614576614760896, "grad_norm": 5.359333716586623, "learning_rate": 4.141117567066387e-05, "loss": 0.3856, "step": 2880 }, { "epoch": 1.061826223164102, "grad_norm": 3.821552473231797, "learning_rate": 4.140808505377673e-05, "loss": 0.3121, "step": 2881 }, { "epoch": 1.0621947848521147, "grad_norm": 7.602935886500575, "learning_rate": 4.1404994436889604e-05, "loss": 0.3699, "step": 2882 }, { "epoch": 1.0625633465401272, "grad_norm": 4.393042282474125, "learning_rate": 4.1401903820002475e-05, "loss": 0.2981, "step": 2883 }, { "epoch": 1.0629319082281397, "grad_norm": 7.169642281646787, "learning_rate": 4.1398813203115346e-05, "loss": 0.4515, "step": 2884 }, { "epoch": 1.0633004699161521, "grad_norm": 9.84897632217311, "learning_rate": 4.139572258622821e-05, "loss": 0.4383, "step": 2885 }, { "epoch": 1.0636690316041648, "grad_norm": 5.788937298325601, "learning_rate": 4.139263196934108e-05, "loss": 0.4661, "step": 2886 }, { "epoch": 1.0640375932921773, "grad_norm": 6.233314519571618, "learning_rate": 4.138954135245395e-05, "loss": 0.325, "step": 2887 }, { "epoch": 1.0644061549801898, "grad_norm": 6.211119911826049, "learning_rate": 4.1386450735566824e-05, "loss": 0.3865, "step": 2888 }, { "epoch": 1.0647747166682024, "grad_norm": 8.008981907753974, "learning_rate": 4.1383360118679695e-05, "loss": 0.3183, "step": 2889 }, { "epoch": 1.065143278356215, "grad_norm": 7.354441290924835, "learning_rate": 4.138026950179256e-05, "loss": 0.4972, "step": 2890 }, { "epoch": 1.0655118400442274, "grad_norm": 8.058923682200207, "learning_rate": 4.1377178884905424e-05, "loss": 0.4971, "step": 2891 }, { "epoch": 1.0658804017322399, "grad_norm": 6.402323491807486, "learning_rate": 4.1374088268018295e-05, "loss": 0.3699, "step": 2892 }, { "epoch": 1.0662489634202525, "grad_norm": 7.595097677319716, "learning_rate": 4.1370997651131166e-05, "loss": 0.4585, "step": 2893 }, { "epoch": 1.066617525108265, "grad_norm": 4.988105740965867, "learning_rate": 4.136790703424404e-05, "loss": 0.2447, "step": 2894 }, { "epoch": 1.0669860867962775, "grad_norm": 5.965373577177141, "learning_rate": 4.13648164173569e-05, "loss": 0.3664, "step": 2895 }, { "epoch": 1.06735464848429, "grad_norm": 8.632302551397697, "learning_rate": 4.136172580046977e-05, "loss": 0.4948, "step": 2896 }, { "epoch": 1.0677232101723027, "grad_norm": 4.659215609960292, "learning_rate": 4.1358635183582644e-05, "loss": 0.4743, "step": 2897 }, { "epoch": 1.0680917718603151, "grad_norm": 5.234812248528475, "learning_rate": 4.1355544566695516e-05, "loss": 0.2944, "step": 2898 }, { "epoch": 1.0684603335483276, "grad_norm": 9.528001175298225, "learning_rate": 4.135245394980839e-05, "loss": 0.4333, "step": 2899 }, { "epoch": 1.0688288952363403, "grad_norm": 7.059753964381312, "learning_rate": 4.134936333292125e-05, "loss": 0.5921, "step": 2900 }, { "epoch": 1.0691974569243528, "grad_norm": 6.283507287866381, "learning_rate": 4.134627271603412e-05, "loss": 0.3593, "step": 2901 }, { "epoch": 1.0695660186123652, "grad_norm": 5.48685670875032, "learning_rate": 4.1343182099146994e-05, "loss": 0.3123, "step": 2902 }, { "epoch": 1.0699345803003777, "grad_norm": 5.421438765772018, "learning_rate": 4.1340091482259865e-05, "loss": 0.3695, "step": 2903 }, { "epoch": 1.0703031419883904, "grad_norm": 13.497019120771977, "learning_rate": 4.133700086537273e-05, "loss": 0.5177, "step": 2904 }, { "epoch": 1.0706717036764029, "grad_norm": 2.9479204768683047, "learning_rate": 4.13339102484856e-05, "loss": 0.2631, "step": 2905 }, { "epoch": 1.0710402653644153, "grad_norm": 6.008520751795392, "learning_rate": 4.1330819631598465e-05, "loss": 0.5603, "step": 2906 }, { "epoch": 1.0714088270524278, "grad_norm": 73.24609364583888, "learning_rate": 4.1327729014711336e-05, "loss": 0.3446, "step": 2907 }, { "epoch": 1.0717773887404405, "grad_norm": 3.5138835747787103, "learning_rate": 4.132463839782421e-05, "loss": 0.2489, "step": 2908 }, { "epoch": 1.072145950428453, "grad_norm": 4.902711406158849, "learning_rate": 4.132154778093708e-05, "loss": 0.3249, "step": 2909 }, { "epoch": 1.0725145121164654, "grad_norm": 5.592071451846676, "learning_rate": 4.131845716404994e-05, "loss": 0.3961, "step": 2910 }, { "epoch": 1.0728830738044781, "grad_norm": 8.43049613461056, "learning_rate": 4.1315366547162814e-05, "loss": 0.3796, "step": 2911 }, { "epoch": 1.0732516354924906, "grad_norm": 6.5485284215085064, "learning_rate": 4.1312275930275685e-05, "loss": 0.6541, "step": 2912 }, { "epoch": 1.073620197180503, "grad_norm": 5.480833469688797, "learning_rate": 4.1309185313388556e-05, "loss": 0.2908, "step": 2913 }, { "epoch": 1.0739887588685155, "grad_norm": 4.998306750644177, "learning_rate": 4.130609469650142e-05, "loss": 0.4095, "step": 2914 }, { "epoch": 1.0743573205565282, "grad_norm": 5.964697935349362, "learning_rate": 4.130300407961429e-05, "loss": 0.3333, "step": 2915 }, { "epoch": 1.0747258822445407, "grad_norm": 6.17149425249621, "learning_rate": 4.129991346272716e-05, "loss": 0.4016, "step": 2916 }, { "epoch": 1.0750944439325532, "grad_norm": 8.00010299616316, "learning_rate": 4.1296822845840034e-05, "loss": 0.4856, "step": 2917 }, { "epoch": 1.0754630056205658, "grad_norm": 3.786979433046654, "learning_rate": 4.1293732228952906e-05, "loss": 0.2402, "step": 2918 }, { "epoch": 1.0758315673085783, "grad_norm": 6.788299974259192, "learning_rate": 4.129064161206577e-05, "loss": 0.5323, "step": 2919 }, { "epoch": 1.0762001289965908, "grad_norm": 5.466909741990917, "learning_rate": 4.128755099517864e-05, "loss": 0.272, "step": 2920 }, { "epoch": 1.0765686906846033, "grad_norm": 6.058534878924969, "learning_rate": 4.1284460378291506e-05, "loss": 0.3476, "step": 2921 }, { "epoch": 1.076937252372616, "grad_norm": 5.849868590354322, "learning_rate": 4.128136976140438e-05, "loss": 0.365, "step": 2922 }, { "epoch": 1.0773058140606284, "grad_norm": 9.660367908619362, "learning_rate": 4.127827914451725e-05, "loss": 0.4926, "step": 2923 }, { "epoch": 1.077674375748641, "grad_norm": 6.592451645721178, "learning_rate": 4.127518852763011e-05, "loss": 0.342, "step": 2924 }, { "epoch": 1.0780429374366534, "grad_norm": 10.355595838643948, "learning_rate": 4.1272097910742984e-05, "loss": 0.5544, "step": 2925 }, { "epoch": 1.078411499124666, "grad_norm": 7.787021932118148, "learning_rate": 4.1269007293855855e-05, "loss": 0.4294, "step": 2926 }, { "epoch": 1.0787800608126785, "grad_norm": 12.283278746636752, "learning_rate": 4.1265916676968726e-05, "loss": 0.3464, "step": 2927 }, { "epoch": 1.079148622500691, "grad_norm": 6.430959990462216, "learning_rate": 4.12628260600816e-05, "loss": 0.456, "step": 2928 }, { "epoch": 1.0795171841887037, "grad_norm": 9.680047300238945, "learning_rate": 4.125973544319446e-05, "loss": 0.458, "step": 2929 }, { "epoch": 1.0798857458767162, "grad_norm": 4.774702787886015, "learning_rate": 4.125664482630733e-05, "loss": 0.206, "step": 2930 }, { "epoch": 1.0802543075647286, "grad_norm": 8.983459532163298, "learning_rate": 4.1253554209420204e-05, "loss": 0.5652, "step": 2931 }, { "epoch": 1.080622869252741, "grad_norm": 6.149045494875353, "learning_rate": 4.1250463592533075e-05, "loss": 0.4037, "step": 2932 }, { "epoch": 1.0809914309407538, "grad_norm": 5.801192246593246, "learning_rate": 4.124737297564594e-05, "loss": 0.3202, "step": 2933 }, { "epoch": 1.0813599926287663, "grad_norm": 7.787029280297271, "learning_rate": 4.124428235875881e-05, "loss": 0.5323, "step": 2934 }, { "epoch": 1.0817285543167787, "grad_norm": 5.707615419708731, "learning_rate": 4.124119174187168e-05, "loss": 0.2378, "step": 2935 }, { "epoch": 1.0820971160047912, "grad_norm": 6.416303979441108, "learning_rate": 4.1238101124984546e-05, "loss": 0.4321, "step": 2936 }, { "epoch": 1.0824656776928039, "grad_norm": 6.558439324510902, "learning_rate": 4.123501050809742e-05, "loss": 0.4209, "step": 2937 }, { "epoch": 1.0828342393808164, "grad_norm": 5.926923466980763, "learning_rate": 4.123191989121029e-05, "loss": 0.3936, "step": 2938 }, { "epoch": 1.0832028010688288, "grad_norm": 5.7637208518335905, "learning_rate": 4.122882927432315e-05, "loss": 0.4667, "step": 2939 }, { "epoch": 1.0835713627568415, "grad_norm": 5.306810990548306, "learning_rate": 4.1225738657436024e-05, "loss": 0.4182, "step": 2940 }, { "epoch": 1.083939924444854, "grad_norm": 8.507116592291641, "learning_rate": 4.1222648040548896e-05, "loss": 0.3618, "step": 2941 }, { "epoch": 1.0843084861328665, "grad_norm": 5.935458183567378, "learning_rate": 4.121955742366177e-05, "loss": 0.4054, "step": 2942 }, { "epoch": 1.084677047820879, "grad_norm": 9.616763467373124, "learning_rate": 4.121646680677463e-05, "loss": 0.6123, "step": 2943 }, { "epoch": 1.0850456095088916, "grad_norm": 20.188559678553872, "learning_rate": 4.12133761898875e-05, "loss": 0.5858, "step": 2944 }, { "epoch": 1.085414171196904, "grad_norm": 10.077309650884242, "learning_rate": 4.1210285573000374e-05, "loss": 0.7292, "step": 2945 }, { "epoch": 1.0857827328849166, "grad_norm": 6.275553839205711, "learning_rate": 4.1207194956113245e-05, "loss": 0.4016, "step": 2946 }, { "epoch": 1.0861512945729292, "grad_norm": 6.265726673699685, "learning_rate": 4.1204104339226116e-05, "loss": 0.3727, "step": 2947 }, { "epoch": 1.0865198562609417, "grad_norm": 6.254088323493412, "learning_rate": 4.120101372233898e-05, "loss": 0.3497, "step": 2948 }, { "epoch": 1.0868884179489542, "grad_norm": 6.798158441145511, "learning_rate": 4.119792310545185e-05, "loss": 0.3338, "step": 2949 }, { "epoch": 1.0872569796369667, "grad_norm": 7.815728824967617, "learning_rate": 4.119483248856472e-05, "loss": 0.4629, "step": 2950 }, { "epoch": 1.0876255413249794, "grad_norm": 7.9386105699147205, "learning_rate": 4.1191741871677594e-05, "loss": 0.4248, "step": 2951 }, { "epoch": 1.0879941030129918, "grad_norm": 8.442308200310116, "learning_rate": 4.118865125479046e-05, "loss": 0.5625, "step": 2952 }, { "epoch": 1.0883626647010043, "grad_norm": 19.464387113974492, "learning_rate": 4.118556063790332e-05, "loss": 0.4777, "step": 2953 }, { "epoch": 1.0887312263890168, "grad_norm": 10.743874157359611, "learning_rate": 4.1182470021016194e-05, "loss": 0.3253, "step": 2954 }, { "epoch": 1.0890997880770295, "grad_norm": 6.239383683701573, "learning_rate": 4.1179379404129065e-05, "loss": 0.3977, "step": 2955 }, { "epoch": 1.089468349765042, "grad_norm": 9.916408973572658, "learning_rate": 4.1176288787241936e-05, "loss": 0.5077, "step": 2956 }, { "epoch": 1.0898369114530544, "grad_norm": 16.78909067441796, "learning_rate": 4.117319817035481e-05, "loss": 0.7212, "step": 2957 }, { "epoch": 1.090205473141067, "grad_norm": 5.02808370531763, "learning_rate": 4.117010755346767e-05, "loss": 0.4389, "step": 2958 }, { "epoch": 1.0905740348290796, "grad_norm": 5.512382440329865, "learning_rate": 4.116701693658054e-05, "loss": 0.3832, "step": 2959 }, { "epoch": 1.090942596517092, "grad_norm": 11.054703373442667, "learning_rate": 4.1163926319693414e-05, "loss": 0.5465, "step": 2960 }, { "epoch": 1.0913111582051045, "grad_norm": 6.872828747604329, "learning_rate": 4.1160835702806286e-05, "loss": 0.5385, "step": 2961 }, { "epoch": 1.0916797198931172, "grad_norm": 6.144272359255165, "learning_rate": 4.115774508591915e-05, "loss": 0.426, "step": 2962 }, { "epoch": 1.0920482815811297, "grad_norm": 5.531944791616257, "learning_rate": 4.115465446903202e-05, "loss": 0.258, "step": 2963 }, { "epoch": 1.0924168432691421, "grad_norm": 6.632208015925308, "learning_rate": 4.115156385214489e-05, "loss": 0.1919, "step": 2964 }, { "epoch": 1.0927854049571546, "grad_norm": 4.803208732991981, "learning_rate": 4.1148473235257764e-05, "loss": 0.3489, "step": 2965 }, { "epoch": 1.0931539666451673, "grad_norm": 7.148757143639059, "learning_rate": 4.1145382618370635e-05, "loss": 0.5389, "step": 2966 }, { "epoch": 1.0935225283331798, "grad_norm": 6.9844752225576325, "learning_rate": 4.114229200148349e-05, "loss": 0.3347, "step": 2967 }, { "epoch": 1.0938910900211922, "grad_norm": 5.247433171167048, "learning_rate": 4.1139201384596364e-05, "loss": 0.4588, "step": 2968 }, { "epoch": 1.094259651709205, "grad_norm": 5.8237360160022735, "learning_rate": 4.1136110767709235e-05, "loss": 0.4906, "step": 2969 }, { "epoch": 1.0946282133972174, "grad_norm": 5.949250169196417, "learning_rate": 4.1133020150822106e-05, "loss": 0.4521, "step": 2970 }, { "epoch": 1.0949967750852299, "grad_norm": 6.475642416173737, "learning_rate": 4.112992953393498e-05, "loss": 0.533, "step": 2971 }, { "epoch": 1.0953653367732423, "grad_norm": 7.309513362440972, "learning_rate": 4.112683891704784e-05, "loss": 0.452, "step": 2972 }, { "epoch": 1.095733898461255, "grad_norm": 6.822493756026852, "learning_rate": 4.112374830016071e-05, "loss": 0.6262, "step": 2973 }, { "epoch": 1.0961024601492675, "grad_norm": 6.907174579301244, "learning_rate": 4.1120657683273584e-05, "loss": 0.3582, "step": 2974 }, { "epoch": 1.09647102183728, "grad_norm": 5.568443637559577, "learning_rate": 4.1117567066386455e-05, "loss": 0.45, "step": 2975 }, { "epoch": 1.0968395835252926, "grad_norm": 6.507443787386572, "learning_rate": 4.111447644949932e-05, "loss": 0.3675, "step": 2976 }, { "epoch": 1.0972081452133051, "grad_norm": 7.978501998534876, "learning_rate": 4.111138583261219e-05, "loss": 0.4737, "step": 2977 }, { "epoch": 1.0975767069013176, "grad_norm": 5.750057717738511, "learning_rate": 4.110829521572506e-05, "loss": 0.537, "step": 2978 }, { "epoch": 1.09794526858933, "grad_norm": 5.609877709944507, "learning_rate": 4.110520459883793e-05, "loss": 0.3916, "step": 2979 }, { "epoch": 1.0983138302773428, "grad_norm": 4.691424138704446, "learning_rate": 4.1102113981950804e-05, "loss": 0.3848, "step": 2980 }, { "epoch": 1.0986823919653552, "grad_norm": 5.3910403768859725, "learning_rate": 4.109902336506367e-05, "loss": 0.5075, "step": 2981 }, { "epoch": 1.0990509536533677, "grad_norm": 7.111658823589932, "learning_rate": 4.109593274817653e-05, "loss": 0.4101, "step": 2982 }, { "epoch": 1.0994195153413802, "grad_norm": 4.07593462938592, "learning_rate": 4.1092842131289404e-05, "loss": 0.3255, "step": 2983 }, { "epoch": 1.0997880770293929, "grad_norm": 7.8025505493052005, "learning_rate": 4.1089751514402276e-05, "loss": 0.4271, "step": 2984 }, { "epoch": 1.1001566387174053, "grad_norm": 7.912620661630588, "learning_rate": 4.108666089751515e-05, "loss": 0.461, "step": 2985 }, { "epoch": 1.1005252004054178, "grad_norm": 5.953020230069703, "learning_rate": 4.108357028062801e-05, "loss": 0.4375, "step": 2986 }, { "epoch": 1.1008937620934305, "grad_norm": 5.001132646064317, "learning_rate": 4.108047966374088e-05, "loss": 0.455, "step": 2987 }, { "epoch": 1.101262323781443, "grad_norm": 6.507369925114536, "learning_rate": 4.1077389046853754e-05, "loss": 0.4725, "step": 2988 }, { "epoch": 1.1016308854694554, "grad_norm": 6.7742827876903435, "learning_rate": 4.1074298429966625e-05, "loss": 0.428, "step": 2989 }, { "epoch": 1.101999447157468, "grad_norm": 4.521646359139461, "learning_rate": 4.1071207813079496e-05, "loss": 0.3036, "step": 2990 }, { "epoch": 1.1023680088454806, "grad_norm": 5.206569688376588, "learning_rate": 4.106811719619236e-05, "loss": 0.3851, "step": 2991 }, { "epoch": 1.102736570533493, "grad_norm": 5.152801470203661, "learning_rate": 4.106502657930523e-05, "loss": 0.3714, "step": 2992 }, { "epoch": 1.1031051322215055, "grad_norm": 4.725000952665042, "learning_rate": 4.10619359624181e-05, "loss": 0.2524, "step": 2993 }, { "epoch": 1.103473693909518, "grad_norm": 5.2949467302764095, "learning_rate": 4.1058845345530974e-05, "loss": 0.4705, "step": 2994 }, { "epoch": 1.1038422555975307, "grad_norm": 6.087147740505901, "learning_rate": 4.105575472864384e-05, "loss": 0.3868, "step": 2995 }, { "epoch": 1.1042108172855432, "grad_norm": 6.423290384436388, "learning_rate": 4.105266411175671e-05, "loss": 0.5464, "step": 2996 }, { "epoch": 1.1045793789735556, "grad_norm": 7.305508393045577, "learning_rate": 4.1049573494869574e-05, "loss": 0.5972, "step": 2997 }, { "epoch": 1.1049479406615683, "grad_norm": 6.416052190249688, "learning_rate": 4.1046482877982445e-05, "loss": 0.3481, "step": 2998 }, { "epoch": 1.1053165023495808, "grad_norm": 3.9735872121216147, "learning_rate": 4.1043392261095316e-05, "loss": 0.2321, "step": 2999 }, { "epoch": 1.1056850640375933, "grad_norm": 5.830184122780135, "learning_rate": 4.104030164420819e-05, "loss": 0.3747, "step": 3000 }, { "epoch": 1.1060536257256057, "grad_norm": 7.293761673552677, "learning_rate": 4.103721102732105e-05, "loss": 0.2814, "step": 3001 }, { "epoch": 1.1064221874136184, "grad_norm": 6.141088414845149, "learning_rate": 4.103412041043392e-05, "loss": 0.3479, "step": 3002 }, { "epoch": 1.1067907491016309, "grad_norm": 6.755878008430504, "learning_rate": 4.1031029793546794e-05, "loss": 0.3155, "step": 3003 }, { "epoch": 1.1071593107896434, "grad_norm": 5.421727598973477, "learning_rate": 4.1027939176659665e-05, "loss": 0.5822, "step": 3004 }, { "epoch": 1.107527872477656, "grad_norm": 6.431617196947157, "learning_rate": 4.102484855977253e-05, "loss": 0.2743, "step": 3005 }, { "epoch": 1.1078964341656685, "grad_norm": 8.509802159162453, "learning_rate": 4.10217579428854e-05, "loss": 0.6424, "step": 3006 }, { "epoch": 1.108264995853681, "grad_norm": 7.058645090951912, "learning_rate": 4.101866732599827e-05, "loss": 0.5572, "step": 3007 }, { "epoch": 1.1086335575416935, "grad_norm": 4.510261280416795, "learning_rate": 4.1015576709111143e-05, "loss": 0.2756, "step": 3008 }, { "epoch": 1.1090021192297062, "grad_norm": 4.618843310991848, "learning_rate": 4.1012486092224015e-05, "loss": 0.3371, "step": 3009 }, { "epoch": 1.1093706809177186, "grad_norm": 6.291732619320957, "learning_rate": 4.100939547533688e-05, "loss": 0.4703, "step": 3010 }, { "epoch": 1.109739242605731, "grad_norm": 6.737738882157769, "learning_rate": 4.100630485844975e-05, "loss": 0.5363, "step": 3011 }, { "epoch": 1.1101078042937438, "grad_norm": 7.017643627494882, "learning_rate": 4.1003214241562615e-05, "loss": 0.2791, "step": 3012 }, { "epoch": 1.1104763659817563, "grad_norm": 9.909105725631264, "learning_rate": 4.1000123624675486e-05, "loss": 0.5244, "step": 3013 }, { "epoch": 1.1108449276697687, "grad_norm": 7.057014968534025, "learning_rate": 4.099703300778836e-05, "loss": 0.43, "step": 3014 }, { "epoch": 1.1112134893577812, "grad_norm": 6.563399770906415, "learning_rate": 4.099394239090122e-05, "loss": 0.3061, "step": 3015 }, { "epoch": 1.1115820510457939, "grad_norm": 5.6449522712791405, "learning_rate": 4.099085177401409e-05, "loss": 0.2462, "step": 3016 }, { "epoch": 1.1119506127338064, "grad_norm": 7.263269842581393, "learning_rate": 4.0987761157126964e-05, "loss": 0.4394, "step": 3017 }, { "epoch": 1.1123191744218188, "grad_norm": 9.326624866289219, "learning_rate": 4.0984670540239835e-05, "loss": 0.3785, "step": 3018 }, { "epoch": 1.1126877361098313, "grad_norm": 7.442816364918224, "learning_rate": 4.0981579923352706e-05, "loss": 0.4824, "step": 3019 }, { "epoch": 1.113056297797844, "grad_norm": 6.5144229210001665, "learning_rate": 4.097848930646557e-05, "loss": 0.563, "step": 3020 }, { "epoch": 1.1134248594858565, "grad_norm": 5.171631591232083, "learning_rate": 4.097539868957844e-05, "loss": 0.4112, "step": 3021 }, { "epoch": 1.113793421173869, "grad_norm": 7.310801773458284, "learning_rate": 4.097230807269131e-05, "loss": 0.4309, "step": 3022 }, { "epoch": 1.1141619828618814, "grad_norm": 9.780710223724489, "learning_rate": 4.0969217455804184e-05, "loss": 0.4852, "step": 3023 }, { "epoch": 1.114530544549894, "grad_norm": 4.668423503607316, "learning_rate": 4.096612683891705e-05, "loss": 0.3373, "step": 3024 }, { "epoch": 1.1148991062379066, "grad_norm": 7.285752635609114, "learning_rate": 4.096303622202992e-05, "loss": 0.5344, "step": 3025 }, { "epoch": 1.115267667925919, "grad_norm": 5.001701065618045, "learning_rate": 4.095994560514279e-05, "loss": 0.4733, "step": 3026 }, { "epoch": 1.1156362296139317, "grad_norm": 10.290171732159951, "learning_rate": 4.0956854988255655e-05, "loss": 0.4127, "step": 3027 }, { "epoch": 1.1160047913019442, "grad_norm": 5.1825153347290875, "learning_rate": 4.095376437136853e-05, "loss": 0.4378, "step": 3028 }, { "epoch": 1.1163733529899567, "grad_norm": 4.085278547261597, "learning_rate": 4.09506737544814e-05, "loss": 0.317, "step": 3029 }, { "epoch": 1.1167419146779691, "grad_norm": 6.431859480409411, "learning_rate": 4.094758313759426e-05, "loss": 0.2644, "step": 3030 }, { "epoch": 1.1171104763659818, "grad_norm": 6.525274582270346, "learning_rate": 4.0944492520707133e-05, "loss": 0.3523, "step": 3031 }, { "epoch": 1.1174790380539943, "grad_norm": 11.344110488418098, "learning_rate": 4.0941401903820005e-05, "loss": 0.4035, "step": 3032 }, { "epoch": 1.1178475997420068, "grad_norm": 5.6434937681919255, "learning_rate": 4.0938311286932876e-05, "loss": 0.352, "step": 3033 }, { "epoch": 1.1182161614300195, "grad_norm": 3.7012147765076238, "learning_rate": 4.093522067004574e-05, "loss": 0.3293, "step": 3034 }, { "epoch": 1.118584723118032, "grad_norm": 6.433617467321189, "learning_rate": 4.093213005315861e-05, "loss": 0.4216, "step": 3035 }, { "epoch": 1.1189532848060444, "grad_norm": 4.866579853659641, "learning_rate": 4.092903943627148e-05, "loss": 0.3098, "step": 3036 }, { "epoch": 1.1193218464940569, "grad_norm": 10.025359994413074, "learning_rate": 4.0925948819384354e-05, "loss": 0.4727, "step": 3037 }, { "epoch": 1.1196904081820696, "grad_norm": 6.38138186087823, "learning_rate": 4.0922858202497225e-05, "loss": 0.4045, "step": 3038 }, { "epoch": 1.120058969870082, "grad_norm": 7.928968515357423, "learning_rate": 4.091976758561009e-05, "loss": 0.6294, "step": 3039 }, { "epoch": 1.1204275315580945, "grad_norm": 9.217561496480847, "learning_rate": 4.091667696872296e-05, "loss": 0.3603, "step": 3040 }, { "epoch": 1.1207960932461072, "grad_norm": 8.241132189549262, "learning_rate": 4.091358635183583e-05, "loss": 0.6152, "step": 3041 }, { "epoch": 1.1211646549341197, "grad_norm": 4.654730837012927, "learning_rate": 4.0910495734948696e-05, "loss": 0.3051, "step": 3042 }, { "epoch": 1.1215332166221321, "grad_norm": 4.950736543978165, "learning_rate": 4.090740511806157e-05, "loss": 0.4821, "step": 3043 }, { "epoch": 1.1219017783101446, "grad_norm": 7.626043529707406, "learning_rate": 4.090431450117443e-05, "loss": 0.5946, "step": 3044 }, { "epoch": 1.1222703399981573, "grad_norm": 9.361870689276497, "learning_rate": 4.09012238842873e-05, "loss": 0.5689, "step": 3045 }, { "epoch": 1.1226389016861698, "grad_norm": 7.703321102346661, "learning_rate": 4.0898133267400174e-05, "loss": 0.6975, "step": 3046 }, { "epoch": 1.1230074633741822, "grad_norm": 6.310629992585551, "learning_rate": 4.0895042650513045e-05, "loss": 0.4075, "step": 3047 }, { "epoch": 1.1233760250621947, "grad_norm": 6.342623004494941, "learning_rate": 4.089195203362591e-05, "loss": 0.4116, "step": 3048 }, { "epoch": 1.1237445867502074, "grad_norm": 6.057831215110051, "learning_rate": 4.088886141673878e-05, "loss": 0.4781, "step": 3049 }, { "epoch": 1.1241131484382199, "grad_norm": 5.855585363039946, "learning_rate": 4.088577079985165e-05, "loss": 0.4604, "step": 3050 }, { "epoch": 1.1244817101262323, "grad_norm": 8.886991754048355, "learning_rate": 4.0882680182964523e-05, "loss": 0.2098, "step": 3051 }, { "epoch": 1.1248502718142448, "grad_norm": 4.584993073469648, "learning_rate": 4.0879589566077395e-05, "loss": 0.204, "step": 3052 }, { "epoch": 1.1252188335022575, "grad_norm": 5.029300288226997, "learning_rate": 4.087649894919026e-05, "loss": 0.3578, "step": 3053 }, { "epoch": 1.12558739519027, "grad_norm": 5.953893897195557, "learning_rate": 4.087340833230313e-05, "loss": 0.3996, "step": 3054 }, { "epoch": 1.1259559568782824, "grad_norm": 6.132865992999189, "learning_rate": 4.0870317715416e-05, "loss": 0.5097, "step": 3055 }, { "epoch": 1.1263245185662951, "grad_norm": 9.027154436028779, "learning_rate": 4.086722709852887e-05, "loss": 0.4272, "step": 3056 }, { "epoch": 1.1266930802543076, "grad_norm": 6.747925616222389, "learning_rate": 4.086413648164174e-05, "loss": 0.4844, "step": 3057 }, { "epoch": 1.12706164194232, "grad_norm": 9.74354456892365, "learning_rate": 4.08610458647546e-05, "loss": 0.5128, "step": 3058 }, { "epoch": 1.1274302036303325, "grad_norm": 5.758603169825935, "learning_rate": 4.085795524786747e-05, "loss": 0.2501, "step": 3059 }, { "epoch": 1.1277987653183452, "grad_norm": 7.930072583399128, "learning_rate": 4.0854864630980344e-05, "loss": 0.3925, "step": 3060 }, { "epoch": 1.1281673270063577, "grad_norm": 6.9212814317664355, "learning_rate": 4.0851774014093215e-05, "loss": 0.3682, "step": 3061 }, { "epoch": 1.1285358886943702, "grad_norm": 13.182449603130008, "learning_rate": 4.0848683397206086e-05, "loss": 0.4039, "step": 3062 }, { "epoch": 1.1289044503823829, "grad_norm": 10.368119497599547, "learning_rate": 4.084559278031895e-05, "loss": 0.3415, "step": 3063 }, { "epoch": 1.1292730120703953, "grad_norm": 7.0303618145620606, "learning_rate": 4.084250216343182e-05, "loss": 0.4269, "step": 3064 }, { "epoch": 1.1296415737584078, "grad_norm": 4.64538330270525, "learning_rate": 4.083941154654469e-05, "loss": 0.4113, "step": 3065 }, { "epoch": 1.1300101354464203, "grad_norm": 5.749034634697988, "learning_rate": 4.0836320929657564e-05, "loss": 0.4511, "step": 3066 }, { "epoch": 1.130378697134433, "grad_norm": 8.362019514526866, "learning_rate": 4.083323031277043e-05, "loss": 0.5867, "step": 3067 }, { "epoch": 1.1307472588224454, "grad_norm": 4.782545525054492, "learning_rate": 4.08301396958833e-05, "loss": 0.316, "step": 3068 }, { "epoch": 1.131115820510458, "grad_norm": 8.086846548387571, "learning_rate": 4.082704907899617e-05, "loss": 0.5591, "step": 3069 }, { "epoch": 1.1314843821984706, "grad_norm": 7.1382668104982825, "learning_rate": 4.082395846210904e-05, "loss": 0.4686, "step": 3070 }, { "epoch": 1.131852943886483, "grad_norm": 4.98511119416248, "learning_rate": 4.0820867845221913e-05, "loss": 0.4539, "step": 3071 }, { "epoch": 1.1322215055744955, "grad_norm": 5.304022633568988, "learning_rate": 4.081777722833478e-05, "loss": 0.3413, "step": 3072 }, { "epoch": 1.132590067262508, "grad_norm": 7.614472673258265, "learning_rate": 4.081468661144764e-05, "loss": 0.3901, "step": 3073 }, { "epoch": 1.1329586289505207, "grad_norm": 5.374378966893306, "learning_rate": 4.0811595994560513e-05, "loss": 0.4494, "step": 3074 }, { "epoch": 1.1333271906385332, "grad_norm": 6.321410886479626, "learning_rate": 4.0808505377673385e-05, "loss": 0.428, "step": 3075 }, { "epoch": 1.1336957523265456, "grad_norm": 5.805814774773185, "learning_rate": 4.0805414760786256e-05, "loss": 0.5089, "step": 3076 }, { "epoch": 1.134064314014558, "grad_norm": 17.54296831510242, "learning_rate": 4.080232414389912e-05, "loss": 0.4573, "step": 3077 }, { "epoch": 1.1344328757025708, "grad_norm": 5.691965885963535, "learning_rate": 4.079923352701199e-05, "loss": 0.4875, "step": 3078 }, { "epoch": 1.1348014373905833, "grad_norm": 5.517409254882608, "learning_rate": 4.079614291012486e-05, "loss": 0.2155, "step": 3079 }, { "epoch": 1.1351699990785957, "grad_norm": 6.091324073909438, "learning_rate": 4.0793052293237734e-05, "loss": 0.3445, "step": 3080 }, { "epoch": 1.1355385607666082, "grad_norm": 6.810027347459372, "learning_rate": 4.0789961676350605e-05, "loss": 0.5253, "step": 3081 }, { "epoch": 1.1359071224546209, "grad_norm": 3.903985916127925, "learning_rate": 4.078687105946347e-05, "loss": 0.3007, "step": 3082 }, { "epoch": 1.1362756841426334, "grad_norm": 5.186061429861478, "learning_rate": 4.078378044257634e-05, "loss": 0.3368, "step": 3083 }, { "epoch": 1.1366442458306458, "grad_norm": 8.163910640782614, "learning_rate": 4.078068982568921e-05, "loss": 0.4239, "step": 3084 }, { "epoch": 1.1370128075186585, "grad_norm": 6.635611043705007, "learning_rate": 4.077759920880208e-05, "loss": 0.5057, "step": 3085 }, { "epoch": 1.137381369206671, "grad_norm": 8.061090863716023, "learning_rate": 4.077450859191495e-05, "loss": 0.2976, "step": 3086 }, { "epoch": 1.1377499308946835, "grad_norm": 6.185299260417233, "learning_rate": 4.077141797502782e-05, "loss": 0.4735, "step": 3087 }, { "epoch": 1.138118492582696, "grad_norm": 10.191267756101912, "learning_rate": 4.076832735814068e-05, "loss": 0.3601, "step": 3088 }, { "epoch": 1.1384870542707086, "grad_norm": 6.515090152002557, "learning_rate": 4.0765236741253554e-05, "loss": 0.4002, "step": 3089 }, { "epoch": 1.138855615958721, "grad_norm": 5.810324969502196, "learning_rate": 4.0762146124366425e-05, "loss": 0.5982, "step": 3090 }, { "epoch": 1.1392241776467336, "grad_norm": 5.70476752888264, "learning_rate": 4.0759055507479297e-05, "loss": 0.3253, "step": 3091 }, { "epoch": 1.1395927393347463, "grad_norm": 7.013495379027538, "learning_rate": 4.075596489059216e-05, "loss": 0.472, "step": 3092 }, { "epoch": 1.1399613010227587, "grad_norm": 9.264150157969919, "learning_rate": 4.075287427370503e-05, "loss": 0.3542, "step": 3093 }, { "epoch": 1.1403298627107712, "grad_norm": 6.783280450988136, "learning_rate": 4.07497836568179e-05, "loss": 0.5486, "step": 3094 }, { "epoch": 1.1406984243987837, "grad_norm": 4.897336017692724, "learning_rate": 4.0746693039930775e-05, "loss": 0.3533, "step": 3095 }, { "epoch": 1.1410669860867964, "grad_norm": 8.896492736343491, "learning_rate": 4.074360242304364e-05, "loss": 0.5314, "step": 3096 }, { "epoch": 1.1414355477748088, "grad_norm": 12.158999879766917, "learning_rate": 4.074051180615651e-05, "loss": 0.6886, "step": 3097 }, { "epoch": 1.1418041094628213, "grad_norm": 9.063798061670084, "learning_rate": 4.073742118926938e-05, "loss": 0.7346, "step": 3098 }, { "epoch": 1.142172671150834, "grad_norm": 6.424083469195388, "learning_rate": 4.073433057238225e-05, "loss": 0.3894, "step": 3099 }, { "epoch": 1.1425412328388465, "grad_norm": 7.499402340281909, "learning_rate": 4.0731239955495124e-05, "loss": 0.297, "step": 3100 }, { "epoch": 1.142909794526859, "grad_norm": 5.868906960302864, "learning_rate": 4.072814933860799e-05, "loss": 0.5211, "step": 3101 }, { "epoch": 1.1432783562148714, "grad_norm": 6.5613553456711955, "learning_rate": 4.072505872172086e-05, "loss": 0.4133, "step": 3102 }, { "epoch": 1.143646917902884, "grad_norm": 8.007897770125275, "learning_rate": 4.0721968104833724e-05, "loss": 0.3691, "step": 3103 }, { "epoch": 1.1440154795908966, "grad_norm": 5.942343573334909, "learning_rate": 4.0718877487946595e-05, "loss": 0.4713, "step": 3104 }, { "epoch": 1.144384041278909, "grad_norm": 9.801811860420006, "learning_rate": 4.0715786871059466e-05, "loss": 0.5356, "step": 3105 }, { "epoch": 1.1447526029669215, "grad_norm": 7.947232024842955, "learning_rate": 4.071269625417233e-05, "loss": 0.4934, "step": 3106 }, { "epoch": 1.1451211646549342, "grad_norm": 6.307785842230641, "learning_rate": 4.07096056372852e-05, "loss": 0.5468, "step": 3107 }, { "epoch": 1.1454897263429467, "grad_norm": 4.393246991163394, "learning_rate": 4.070651502039807e-05, "loss": 0.2453, "step": 3108 }, { "epoch": 1.1458582880309591, "grad_norm": 5.696635528235305, "learning_rate": 4.0703424403510944e-05, "loss": 0.508, "step": 3109 }, { "epoch": 1.1462268497189716, "grad_norm": 7.304432127053442, "learning_rate": 4.0700333786623815e-05, "loss": 0.5108, "step": 3110 }, { "epoch": 1.1465954114069843, "grad_norm": 9.602235899265166, "learning_rate": 4.069724316973668e-05, "loss": 0.4793, "step": 3111 }, { "epoch": 1.1469639730949968, "grad_norm": 7.713633459258927, "learning_rate": 4.069415255284955e-05, "loss": 0.5659, "step": 3112 }, { "epoch": 1.1473325347830092, "grad_norm": 9.20813411537423, "learning_rate": 4.069106193596242e-05, "loss": 0.3323, "step": 3113 }, { "epoch": 1.147701096471022, "grad_norm": 5.825889670084566, "learning_rate": 4.068797131907529e-05, "loss": 0.6306, "step": 3114 }, { "epoch": 1.1480696581590344, "grad_norm": 5.174616245654834, "learning_rate": 4.068488070218816e-05, "loss": 0.4271, "step": 3115 }, { "epoch": 1.1484382198470469, "grad_norm": 7.1622550456449945, "learning_rate": 4.068179008530103e-05, "loss": 0.6515, "step": 3116 }, { "epoch": 1.1488067815350593, "grad_norm": 8.833572192531403, "learning_rate": 4.06786994684139e-05, "loss": 0.3032, "step": 3117 }, { "epoch": 1.149175343223072, "grad_norm": 5.4646161831639075, "learning_rate": 4.0675608851526765e-05, "loss": 0.418, "step": 3118 }, { "epoch": 1.1495439049110845, "grad_norm": 7.734895322617849, "learning_rate": 4.0672518234639636e-05, "loss": 0.7469, "step": 3119 }, { "epoch": 1.149912466599097, "grad_norm": 4.610939303546115, "learning_rate": 4.06694276177525e-05, "loss": 0.3618, "step": 3120 }, { "epoch": 1.1502810282871097, "grad_norm": 5.279512198485299, "learning_rate": 4.066633700086537e-05, "loss": 0.3342, "step": 3121 }, { "epoch": 1.1506495899751221, "grad_norm": 4.994197148438446, "learning_rate": 4.066324638397824e-05, "loss": 0.5561, "step": 3122 }, { "epoch": 1.1510181516631346, "grad_norm": 7.059071746837678, "learning_rate": 4.0660155767091114e-05, "loss": 0.5419, "step": 3123 }, { "epoch": 1.151386713351147, "grad_norm": 5.118853162534061, "learning_rate": 4.0657065150203985e-05, "loss": 0.4285, "step": 3124 }, { "epoch": 1.1517552750391598, "grad_norm": 5.97853923345551, "learning_rate": 4.065397453331685e-05, "loss": 0.2847, "step": 3125 }, { "epoch": 1.1521238367271722, "grad_norm": 5.4657893422775885, "learning_rate": 4.065088391642972e-05, "loss": 0.266, "step": 3126 }, { "epoch": 1.1524923984151847, "grad_norm": 8.31873609626233, "learning_rate": 4.064779329954259e-05, "loss": 0.4582, "step": 3127 }, { "epoch": 1.1528609601031974, "grad_norm": 9.909695095448983, "learning_rate": 4.064470268265546e-05, "loss": 0.5435, "step": 3128 }, { "epoch": 1.1532295217912099, "grad_norm": 7.801564612331979, "learning_rate": 4.0641612065768334e-05, "loss": 0.4207, "step": 3129 }, { "epoch": 1.1535980834792223, "grad_norm": 7.314556647715343, "learning_rate": 4.06385214488812e-05, "loss": 0.4822, "step": 3130 }, { "epoch": 1.1539666451672348, "grad_norm": 14.594155152800845, "learning_rate": 4.063543083199407e-05, "loss": 0.4613, "step": 3131 }, { "epoch": 1.1543352068552475, "grad_norm": 6.156953258332645, "learning_rate": 4.063234021510694e-05, "loss": 0.346, "step": 3132 }, { "epoch": 1.15470376854326, "grad_norm": 7.2932281856640175, "learning_rate": 4.0629249598219805e-05, "loss": 0.4808, "step": 3133 }, { "epoch": 1.1550723302312724, "grad_norm": 6.632637947221001, "learning_rate": 4.0626158981332677e-05, "loss": 0.3893, "step": 3134 }, { "epoch": 1.155440891919285, "grad_norm": 5.388032593920929, "learning_rate": 4.062306836444554e-05, "loss": 0.382, "step": 3135 }, { "epoch": 1.1558094536072976, "grad_norm": 4.544647824918787, "learning_rate": 4.061997774755841e-05, "loss": 0.3484, "step": 3136 }, { "epoch": 1.15617801529531, "grad_norm": 6.533219182061733, "learning_rate": 4.061688713067128e-05, "loss": 0.4182, "step": 3137 }, { "epoch": 1.1565465769833225, "grad_norm": 4.78837443600434, "learning_rate": 4.0613796513784155e-05, "loss": 0.2505, "step": 3138 }, { "epoch": 1.156915138671335, "grad_norm": 4.209064583777511, "learning_rate": 4.061070589689702e-05, "loss": 0.2792, "step": 3139 }, { "epoch": 1.1572837003593477, "grad_norm": 6.935653853282947, "learning_rate": 4.060761528000989e-05, "loss": 0.5433, "step": 3140 }, { "epoch": 1.1576522620473602, "grad_norm": 6.235256352664842, "learning_rate": 4.060452466312276e-05, "loss": 0.4219, "step": 3141 }, { "epoch": 1.1580208237353726, "grad_norm": 8.370946102291002, "learning_rate": 4.060143404623563e-05, "loss": 0.557, "step": 3142 }, { "epoch": 1.1583893854233853, "grad_norm": 12.745966254533972, "learning_rate": 4.0598343429348504e-05, "loss": 0.5524, "step": 3143 }, { "epoch": 1.1587579471113978, "grad_norm": 9.265539365118606, "learning_rate": 4.059525281246137e-05, "loss": 0.7304, "step": 3144 }, { "epoch": 1.1591265087994103, "grad_norm": 8.222458615499, "learning_rate": 4.059216219557424e-05, "loss": 0.5615, "step": 3145 }, { "epoch": 1.1594950704874227, "grad_norm": 8.71939795489349, "learning_rate": 4.058907157868711e-05, "loss": 0.4383, "step": 3146 }, { "epoch": 1.1598636321754354, "grad_norm": 8.258334949030045, "learning_rate": 4.058598096179998e-05, "loss": 0.5125, "step": 3147 }, { "epoch": 1.160232193863448, "grad_norm": 6.827291407211528, "learning_rate": 4.0582890344912846e-05, "loss": 0.4793, "step": 3148 }, { "epoch": 1.1606007555514604, "grad_norm": 11.652919262731979, "learning_rate": 4.057979972802571e-05, "loss": 0.4557, "step": 3149 }, { "epoch": 1.160969317239473, "grad_norm": 9.99369918686269, "learning_rate": 4.057670911113858e-05, "loss": 0.6097, "step": 3150 }, { "epoch": 1.1613378789274855, "grad_norm": 5.946192431905972, "learning_rate": 4.057361849425145e-05, "loss": 0.3085, "step": 3151 }, { "epoch": 1.161706440615498, "grad_norm": 7.14724711528182, "learning_rate": 4.0570527877364324e-05, "loss": 0.3159, "step": 3152 }, { "epoch": 1.1620750023035105, "grad_norm": 5.720333026181248, "learning_rate": 4.0567437260477195e-05, "loss": 0.3378, "step": 3153 }, { "epoch": 1.1624435639915232, "grad_norm": 5.82528625570774, "learning_rate": 4.056434664359006e-05, "loss": 0.4336, "step": 3154 }, { "epoch": 1.1628121256795356, "grad_norm": 7.192919470919768, "learning_rate": 4.056125602670293e-05, "loss": 0.4957, "step": 3155 }, { "epoch": 1.163180687367548, "grad_norm": 7.421180580178149, "learning_rate": 4.05581654098158e-05, "loss": 0.5245, "step": 3156 }, { "epoch": 1.1635492490555608, "grad_norm": 6.438933870196651, "learning_rate": 4.055507479292867e-05, "loss": 0.6668, "step": 3157 }, { "epoch": 1.1639178107435733, "grad_norm": 7.303516053293965, "learning_rate": 4.055198417604154e-05, "loss": 0.3874, "step": 3158 }, { "epoch": 1.1642863724315857, "grad_norm": 5.289804967158635, "learning_rate": 4.054889355915441e-05, "loss": 0.4038, "step": 3159 }, { "epoch": 1.1646549341195982, "grad_norm": 5.873749620202591, "learning_rate": 4.054580294226728e-05, "loss": 0.4202, "step": 3160 }, { "epoch": 1.1650234958076109, "grad_norm": 8.426636887187891, "learning_rate": 4.054271232538015e-05, "loss": 0.6332, "step": 3161 }, { "epoch": 1.1653920574956234, "grad_norm": 9.06617609641631, "learning_rate": 4.053962170849302e-05, "loss": 0.6695, "step": 3162 }, { "epoch": 1.1657606191836358, "grad_norm": 5.030194094725892, "learning_rate": 4.053653109160589e-05, "loss": 0.3282, "step": 3163 }, { "epoch": 1.1661291808716483, "grad_norm": 8.463397034020604, "learning_rate": 4.053344047471875e-05, "loss": 0.3957, "step": 3164 }, { "epoch": 1.166497742559661, "grad_norm": 7.635697162328084, "learning_rate": 4.053034985783162e-05, "loss": 0.4908, "step": 3165 }, { "epoch": 1.1668663042476735, "grad_norm": 7.273553406870514, "learning_rate": 4.0527259240944494e-05, "loss": 0.6299, "step": 3166 }, { "epoch": 1.167234865935686, "grad_norm": 4.414789676689513, "learning_rate": 4.0524168624057365e-05, "loss": 0.457, "step": 3167 }, { "epoch": 1.1676034276236984, "grad_norm": 5.093334344860462, "learning_rate": 4.052107800717023e-05, "loss": 0.309, "step": 3168 }, { "epoch": 1.167971989311711, "grad_norm": 5.390770597841259, "learning_rate": 4.05179873902831e-05, "loss": 0.4471, "step": 3169 }, { "epoch": 1.1683405509997236, "grad_norm": 3.8547624831991767, "learning_rate": 4.051489677339597e-05, "loss": 0.2297, "step": 3170 }, { "epoch": 1.168709112687736, "grad_norm": 5.783618338796846, "learning_rate": 4.051180615650884e-05, "loss": 0.6179, "step": 3171 }, { "epoch": 1.1690776743757487, "grad_norm": 8.378112214707235, "learning_rate": 4.0508715539621714e-05, "loss": 0.3363, "step": 3172 }, { "epoch": 1.1694462360637612, "grad_norm": 5.121899667230107, "learning_rate": 4.050562492273458e-05, "loss": 0.365, "step": 3173 }, { "epoch": 1.1698147977517737, "grad_norm": 6.627964723943236, "learning_rate": 4.050253430584745e-05, "loss": 0.3256, "step": 3174 }, { "epoch": 1.1701833594397861, "grad_norm": 4.481851645374106, "learning_rate": 4.049944368896032e-05, "loss": 0.3375, "step": 3175 }, { "epoch": 1.1705519211277988, "grad_norm": 7.490947600902883, "learning_rate": 4.049635307207319e-05, "loss": 0.5804, "step": 3176 }, { "epoch": 1.1709204828158113, "grad_norm": 8.616817407623776, "learning_rate": 4.0493262455186056e-05, "loss": 0.3393, "step": 3177 }, { "epoch": 1.1712890445038238, "grad_norm": 5.537467953005814, "learning_rate": 4.049017183829892e-05, "loss": 0.3163, "step": 3178 }, { "epoch": 1.1716576061918365, "grad_norm": 5.615962928280101, "learning_rate": 4.048708122141179e-05, "loss": 0.272, "step": 3179 }, { "epoch": 1.172026167879849, "grad_norm": 4.2281737015337715, "learning_rate": 4.048399060452466e-05, "loss": 0.2468, "step": 3180 }, { "epoch": 1.1723947295678614, "grad_norm": 7.5435212385862584, "learning_rate": 4.0480899987637534e-05, "loss": 0.4458, "step": 3181 }, { "epoch": 1.1727632912558739, "grad_norm": 6.0889140967770015, "learning_rate": 4.0477809370750406e-05, "loss": 0.4252, "step": 3182 }, { "epoch": 1.1731318529438866, "grad_norm": 8.741122292538039, "learning_rate": 4.047471875386327e-05, "loss": 0.4414, "step": 3183 }, { "epoch": 1.173500414631899, "grad_norm": 7.16813106509002, "learning_rate": 4.047162813697614e-05, "loss": 0.3734, "step": 3184 }, { "epoch": 1.1738689763199115, "grad_norm": 7.8616626966848, "learning_rate": 4.046853752008901e-05, "loss": 0.4373, "step": 3185 }, { "epoch": 1.1742375380079242, "grad_norm": 10.01726110369153, "learning_rate": 4.0465446903201884e-05, "loss": 0.6796, "step": 3186 }, { "epoch": 1.1746060996959367, "grad_norm": 6.541406646549911, "learning_rate": 4.046235628631475e-05, "loss": 0.3426, "step": 3187 }, { "epoch": 1.1749746613839491, "grad_norm": 4.759259083889833, "learning_rate": 4.045926566942762e-05, "loss": 0.34, "step": 3188 }, { "epoch": 1.1753432230719616, "grad_norm": 7.409491107470181, "learning_rate": 4.045617505254049e-05, "loss": 0.7299, "step": 3189 }, { "epoch": 1.1757117847599743, "grad_norm": 5.011277070061563, "learning_rate": 4.045308443565336e-05, "loss": 0.2474, "step": 3190 }, { "epoch": 1.1760803464479868, "grad_norm": 4.856515318939169, "learning_rate": 4.044999381876623e-05, "loss": 0.3895, "step": 3191 }, { "epoch": 1.1764489081359992, "grad_norm": 5.808049949326124, "learning_rate": 4.04469032018791e-05, "loss": 0.4892, "step": 3192 }, { "epoch": 1.1768174698240117, "grad_norm": 5.042395147652873, "learning_rate": 4.044381258499196e-05, "loss": 0.3377, "step": 3193 }, { "epoch": 1.1771860315120244, "grad_norm": 6.4329017586639, "learning_rate": 4.044072196810483e-05, "loss": 0.4743, "step": 3194 }, { "epoch": 1.1775545932000369, "grad_norm": 5.775614533197657, "learning_rate": 4.0437631351217704e-05, "loss": 0.5226, "step": 3195 }, { "epoch": 1.1779231548880493, "grad_norm": 13.681395680578596, "learning_rate": 4.0434540734330575e-05, "loss": 0.5926, "step": 3196 }, { "epoch": 1.1782917165760618, "grad_norm": 6.43685287630313, "learning_rate": 4.043145011744344e-05, "loss": 0.4703, "step": 3197 }, { "epoch": 1.1786602782640745, "grad_norm": 5.337312088457058, "learning_rate": 4.042835950055631e-05, "loss": 0.3944, "step": 3198 }, { "epoch": 1.179028839952087, "grad_norm": 8.32206875170504, "learning_rate": 4.042526888366918e-05, "loss": 0.5904, "step": 3199 }, { "epoch": 1.1793974016400994, "grad_norm": 5.799552032008391, "learning_rate": 4.042217826678205e-05, "loss": 0.4051, "step": 3200 }, { "epoch": 1.1793974016400994, "eval_bleu": 0.08239022934865231, "eval_bleu_1gram": 0.42687884282929084, "eval_bleu_2gram": 0.21959127227329853, "eval_bleu_3gram": 0.11193662059120642, "eval_bleu_4gram": 0.0629068621916171, "eval_rag_val_loss": 0.7724439239171215, "eval_rouge1": 0.4196332211641382, "eval_rouge2": 0.21342030365402523, "eval_rougeL": 0.4168437194443293, "step": 3200 }, { "epoch": 1.1797659633281121, "grad_norm": 6.087999026545322, "learning_rate": 4.0419087649894924e-05, "loss": 0.3583, "step": 3201 }, { "epoch": 1.1801345250161246, "grad_norm": 9.427386593375548, "learning_rate": 4.041599703300779e-05, "loss": 0.3709, "step": 3202 }, { "epoch": 1.180503086704137, "grad_norm": 7.063868837290189, "learning_rate": 4.041290641612066e-05, "loss": 0.4793, "step": 3203 }, { "epoch": 1.1808716483921495, "grad_norm": 5.158457890220272, "learning_rate": 4.040981579923353e-05, "loss": 0.4789, "step": 3204 }, { "epoch": 1.1812402100801622, "grad_norm": 5.532332664610297, "learning_rate": 4.04067251823464e-05, "loss": 0.4249, "step": 3205 }, { "epoch": 1.1816087717681747, "grad_norm": 6.175834370818347, "learning_rate": 4.040363456545927e-05, "loss": 0.3085, "step": 3206 }, { "epoch": 1.1819773334561872, "grad_norm": 6.616701506836046, "learning_rate": 4.040054394857214e-05, "loss": 0.5119, "step": 3207 }, { "epoch": 1.1823458951441999, "grad_norm": 8.085194900985014, "learning_rate": 4.039745333168501e-05, "loss": 0.4673, "step": 3208 }, { "epoch": 1.1827144568322123, "grad_norm": 5.250507148588869, "learning_rate": 4.0394362714797874e-05, "loss": 0.4197, "step": 3209 }, { "epoch": 1.1830830185202248, "grad_norm": 6.10796494077802, "learning_rate": 4.0391272097910745e-05, "loss": 0.4771, "step": 3210 }, { "epoch": 1.1834515802082373, "grad_norm": 6.892400188532782, "learning_rate": 4.038818148102361e-05, "loss": 0.7491, "step": 3211 }, { "epoch": 1.18382014189625, "grad_norm": 6.246306892274864, "learning_rate": 4.038509086413648e-05, "loss": 0.4407, "step": 3212 }, { "epoch": 1.1841887035842624, "grad_norm": 7.6116701773121775, "learning_rate": 4.038200024724935e-05, "loss": 0.6072, "step": 3213 }, { "epoch": 1.184557265272275, "grad_norm": 7.806551690605507, "learning_rate": 4.037890963036222e-05, "loss": 0.4525, "step": 3214 }, { "epoch": 1.1849258269602876, "grad_norm": 5.690497981104046, "learning_rate": 4.0375819013475094e-05, "loss": 0.4323, "step": 3215 }, { "epoch": 1.1852943886483, "grad_norm": 8.29599126502288, "learning_rate": 4.037272839658796e-05, "loss": 0.3545, "step": 3216 }, { "epoch": 1.1856629503363125, "grad_norm": 10.554537508764929, "learning_rate": 4.036963777970083e-05, "loss": 0.3711, "step": 3217 }, { "epoch": 1.186031512024325, "grad_norm": 4.916055781458458, "learning_rate": 4.03665471628137e-05, "loss": 0.4831, "step": 3218 }, { "epoch": 1.1864000737123377, "grad_norm": 8.917503769223822, "learning_rate": 4.036345654592657e-05, "loss": 0.6847, "step": 3219 }, { "epoch": 1.1867686354003502, "grad_norm": 6.869733978407172, "learning_rate": 4.036036592903944e-05, "loss": 0.3985, "step": 3220 }, { "epoch": 1.1871371970883626, "grad_norm": 3.959346052754963, "learning_rate": 4.035727531215231e-05, "loss": 0.2985, "step": 3221 }, { "epoch": 1.1875057587763753, "grad_norm": 5.240618179489729, "learning_rate": 4.035418469526518e-05, "loss": 0.4162, "step": 3222 }, { "epoch": 1.1878743204643878, "grad_norm": 23.956351961774278, "learning_rate": 4.035109407837805e-05, "loss": 0.6639, "step": 3223 }, { "epoch": 1.1882428821524003, "grad_norm": 7.531011933802144, "learning_rate": 4.0348003461490914e-05, "loss": 0.3813, "step": 3224 }, { "epoch": 1.1886114438404127, "grad_norm": 7.55581394701163, "learning_rate": 4.0344912844603786e-05, "loss": 0.5236, "step": 3225 }, { "epoch": 1.1889800055284252, "grad_norm": 7.604732405289136, "learning_rate": 4.034182222771665e-05, "loss": 0.4311, "step": 3226 }, { "epoch": 1.189348567216438, "grad_norm": 6.281690895207252, "learning_rate": 4.033873161082952e-05, "loss": 0.4441, "step": 3227 }, { "epoch": 1.1897171289044504, "grad_norm": 6.143475751683524, "learning_rate": 4.033564099394239e-05, "loss": 0.3357, "step": 3228 }, { "epoch": 1.1900856905924628, "grad_norm": 4.939653144427841, "learning_rate": 4.0332550377055264e-05, "loss": 0.3463, "step": 3229 }, { "epoch": 1.1904542522804755, "grad_norm": 6.482378916700083, "learning_rate": 4.032945976016813e-05, "loss": 0.3906, "step": 3230 }, { "epoch": 1.190822813968488, "grad_norm": 6.339311823033194, "learning_rate": 4.0326369143281e-05, "loss": 0.4156, "step": 3231 }, { "epoch": 1.1911913756565005, "grad_norm": 6.095200898987944, "learning_rate": 4.032327852639387e-05, "loss": 0.4931, "step": 3232 }, { "epoch": 1.191559937344513, "grad_norm": 5.35536012323733, "learning_rate": 4.032018790950674e-05, "loss": 0.5215, "step": 3233 }, { "epoch": 1.1919284990325256, "grad_norm": 6.010717039574728, "learning_rate": 4.031709729261961e-05, "loss": 0.3882, "step": 3234 }, { "epoch": 1.192297060720538, "grad_norm": 7.138478162983931, "learning_rate": 4.031400667573248e-05, "loss": 0.5182, "step": 3235 }, { "epoch": 1.1926656224085506, "grad_norm": 5.513759394784489, "learning_rate": 4.031091605884535e-05, "loss": 0.3657, "step": 3236 }, { "epoch": 1.1930341840965633, "grad_norm": 6.112920254417297, "learning_rate": 4.030782544195822e-05, "loss": 0.4918, "step": 3237 }, { "epoch": 1.1934027457845757, "grad_norm": 5.546640998310215, "learning_rate": 4.030473482507109e-05, "loss": 0.3996, "step": 3238 }, { "epoch": 1.1937713074725882, "grad_norm": 8.888063085131284, "learning_rate": 4.0301644208183955e-05, "loss": 0.366, "step": 3239 }, { "epoch": 1.1941398691606007, "grad_norm": 7.001540423245434, "learning_rate": 4.029855359129682e-05, "loss": 0.46, "step": 3240 }, { "epoch": 1.1945084308486134, "grad_norm": 10.989660779249228, "learning_rate": 4.029546297440969e-05, "loss": 0.4295, "step": 3241 }, { "epoch": 1.1948769925366258, "grad_norm": 6.504653145458612, "learning_rate": 4.029237235752256e-05, "loss": 0.3126, "step": 3242 }, { "epoch": 1.1952455542246383, "grad_norm": 4.74156222549666, "learning_rate": 4.028928174063543e-05, "loss": 0.4649, "step": 3243 }, { "epoch": 1.195614115912651, "grad_norm": 8.269826935106991, "learning_rate": 4.0286191123748304e-05, "loss": 0.4096, "step": 3244 }, { "epoch": 1.1959826776006635, "grad_norm": 7.060742184798657, "learning_rate": 4.028310050686117e-05, "loss": 0.5584, "step": 3245 }, { "epoch": 1.196351239288676, "grad_norm": 8.308959665925721, "learning_rate": 4.028000988997404e-05, "loss": 0.4411, "step": 3246 }, { "epoch": 1.1967198009766884, "grad_norm": 5.8870932922487835, "learning_rate": 4.027691927308691e-05, "loss": 0.4185, "step": 3247 }, { "epoch": 1.197088362664701, "grad_norm": 5.763429301168733, "learning_rate": 4.027382865619978e-05, "loss": 0.3241, "step": 3248 }, { "epoch": 1.1974569243527136, "grad_norm": 5.825487946468787, "learning_rate": 4.027073803931265e-05, "loss": 0.3103, "step": 3249 }, { "epoch": 1.197825486040726, "grad_norm": 11.635670348668814, "learning_rate": 4.026764742242552e-05, "loss": 0.5362, "step": 3250 }, { "epoch": 1.1981940477287387, "grad_norm": 5.725336003334147, "learning_rate": 4.026455680553839e-05, "loss": 0.4241, "step": 3251 }, { "epoch": 1.1985626094167512, "grad_norm": 6.148998966779636, "learning_rate": 4.026146618865126e-05, "loss": 0.4269, "step": 3252 }, { "epoch": 1.1989311711047637, "grad_norm": 6.002663339142546, "learning_rate": 4.025837557176413e-05, "loss": 0.3275, "step": 3253 }, { "epoch": 1.1992997327927761, "grad_norm": 12.215601991176527, "learning_rate": 4.0255284954876996e-05, "loss": 0.5388, "step": 3254 }, { "epoch": 1.1996682944807886, "grad_norm": 6.418833511240148, "learning_rate": 4.025219433798986e-05, "loss": 0.4666, "step": 3255 }, { "epoch": 1.2000368561688013, "grad_norm": 6.426953611708333, "learning_rate": 4.024910372110273e-05, "loss": 0.4572, "step": 3256 }, { "epoch": 1.2004054178568138, "grad_norm": 8.596765717311357, "learning_rate": 4.02460131042156e-05, "loss": 0.2928, "step": 3257 }, { "epoch": 1.2007739795448262, "grad_norm": 7.604546803069735, "learning_rate": 4.0242922487328474e-05, "loss": 0.375, "step": 3258 }, { "epoch": 1.201142541232839, "grad_norm": 6.7421125317203705, "learning_rate": 4.023983187044134e-05, "loss": 0.5151, "step": 3259 }, { "epoch": 1.2015111029208514, "grad_norm": 3.7678180813943793, "learning_rate": 4.023674125355421e-05, "loss": 0.2167, "step": 3260 }, { "epoch": 1.2018796646088639, "grad_norm": 5.744454404788274, "learning_rate": 4.023365063666708e-05, "loss": 0.3019, "step": 3261 }, { "epoch": 1.2022482262968763, "grad_norm": 5.305053878064758, "learning_rate": 4.023056001977995e-05, "loss": 0.4845, "step": 3262 }, { "epoch": 1.202616787984889, "grad_norm": 7.643313069616852, "learning_rate": 4.022746940289282e-05, "loss": 0.266, "step": 3263 }, { "epoch": 1.2029853496729015, "grad_norm": 8.141732695151376, "learning_rate": 4.022437878600569e-05, "loss": 0.3892, "step": 3264 }, { "epoch": 1.203353911360914, "grad_norm": 7.85542140284401, "learning_rate": 4.022128816911856e-05, "loss": 0.4488, "step": 3265 }, { "epoch": 1.2037224730489267, "grad_norm": 8.048391847327329, "learning_rate": 4.021819755223143e-05, "loss": 0.4477, "step": 3266 }, { "epoch": 1.2040910347369391, "grad_norm": 7.162426278099411, "learning_rate": 4.02151069353443e-05, "loss": 0.5003, "step": 3267 }, { "epoch": 1.2044595964249516, "grad_norm": 7.581546475116123, "learning_rate": 4.0212016318457166e-05, "loss": 0.3417, "step": 3268 }, { "epoch": 1.204828158112964, "grad_norm": 5.7016579609283795, "learning_rate": 4.020892570157003e-05, "loss": 0.3615, "step": 3269 }, { "epoch": 1.2051967198009768, "grad_norm": 5.633252342288982, "learning_rate": 4.02058350846829e-05, "loss": 0.3939, "step": 3270 }, { "epoch": 1.2055652814889892, "grad_norm": 8.242425389043284, "learning_rate": 4.020274446779577e-05, "loss": 0.3745, "step": 3271 }, { "epoch": 1.2059338431770017, "grad_norm": 5.26057060192726, "learning_rate": 4.0199653850908644e-05, "loss": 0.4152, "step": 3272 }, { "epoch": 1.2063024048650144, "grad_norm": 7.426731038844738, "learning_rate": 4.0196563234021515e-05, "loss": 0.3808, "step": 3273 }, { "epoch": 1.2066709665530269, "grad_norm": 6.200767703364155, "learning_rate": 4.019347261713438e-05, "loss": 0.3319, "step": 3274 }, { "epoch": 1.2070395282410393, "grad_norm": 7.3679242862949, "learning_rate": 4.019038200024725e-05, "loss": 0.4401, "step": 3275 }, { "epoch": 1.2074080899290518, "grad_norm": 5.311902539933315, "learning_rate": 4.018729138336012e-05, "loss": 0.3016, "step": 3276 }, { "epoch": 1.2077766516170645, "grad_norm": 4.220920032892386, "learning_rate": 4.018420076647299e-05, "loss": 0.2227, "step": 3277 }, { "epoch": 1.208145213305077, "grad_norm": 7.45326575060363, "learning_rate": 4.018111014958586e-05, "loss": 0.3669, "step": 3278 }, { "epoch": 1.2085137749930894, "grad_norm": 7.4102689888300555, "learning_rate": 4.017801953269873e-05, "loss": 0.4357, "step": 3279 }, { "epoch": 1.2088823366811021, "grad_norm": 11.234040532551576, "learning_rate": 4.01749289158116e-05, "loss": 0.528, "step": 3280 }, { "epoch": 1.2092508983691146, "grad_norm": 5.942770455849525, "learning_rate": 4.017183829892447e-05, "loss": 0.4608, "step": 3281 }, { "epoch": 1.209619460057127, "grad_norm": 4.627390887902529, "learning_rate": 4.016874768203734e-05, "loss": 0.2243, "step": 3282 }, { "epoch": 1.2099880217451395, "grad_norm": 6.749370121707279, "learning_rate": 4.0165657065150206e-05, "loss": 0.5548, "step": 3283 }, { "epoch": 1.210356583433152, "grad_norm": 10.836336355425212, "learning_rate": 4.016256644826307e-05, "loss": 0.614, "step": 3284 }, { "epoch": 1.2107251451211647, "grad_norm": 6.669067935818945, "learning_rate": 4.015947583137594e-05, "loss": 0.5339, "step": 3285 }, { "epoch": 1.2110937068091772, "grad_norm": 6.891119353192296, "learning_rate": 4.015638521448881e-05, "loss": 0.6189, "step": 3286 }, { "epoch": 1.2114622684971896, "grad_norm": 5.513285629791817, "learning_rate": 4.0153294597601684e-05, "loss": 0.4778, "step": 3287 }, { "epoch": 1.2118308301852023, "grad_norm": 6.360819284703524, "learning_rate": 4.015020398071455e-05, "loss": 0.6103, "step": 3288 }, { "epoch": 1.2121993918732148, "grad_norm": 5.277189059076278, "learning_rate": 4.014711336382742e-05, "loss": 0.285, "step": 3289 }, { "epoch": 1.2125679535612273, "grad_norm": 5.049712809337488, "learning_rate": 4.014402274694029e-05, "loss": 0.353, "step": 3290 }, { "epoch": 1.2129365152492397, "grad_norm": 7.4523050109067555, "learning_rate": 4.014093213005316e-05, "loss": 0.4972, "step": 3291 }, { "epoch": 1.2133050769372524, "grad_norm": 7.779942904461426, "learning_rate": 4.0137841513166033e-05, "loss": 0.5937, "step": 3292 }, { "epoch": 1.213673638625265, "grad_norm": 12.827733060585288, "learning_rate": 4.01347508962789e-05, "loss": 0.7198, "step": 3293 }, { "epoch": 1.2140422003132774, "grad_norm": 6.646070549855915, "learning_rate": 4.013166027939177e-05, "loss": 0.4614, "step": 3294 }, { "epoch": 1.21441076200129, "grad_norm": 7.504299203174252, "learning_rate": 4.012856966250464e-05, "loss": 0.4815, "step": 3295 }, { "epoch": 1.2147793236893025, "grad_norm": 5.322774488288399, "learning_rate": 4.012547904561751e-05, "loss": 0.2804, "step": 3296 }, { "epoch": 1.215147885377315, "grad_norm": 7.352379456549811, "learning_rate": 4.0122388428730376e-05, "loss": 0.4909, "step": 3297 }, { "epoch": 1.2155164470653275, "grad_norm": 16.1171846597792, "learning_rate": 4.011929781184325e-05, "loss": 0.4923, "step": 3298 }, { "epoch": 1.2158850087533402, "grad_norm": 5.747319799305436, "learning_rate": 4.011620719495611e-05, "loss": 0.3536, "step": 3299 }, { "epoch": 1.2162535704413526, "grad_norm": 6.685846712619759, "learning_rate": 4.011311657806898e-05, "loss": 0.4303, "step": 3300 }, { "epoch": 1.216622132129365, "grad_norm": 9.178127360773871, "learning_rate": 4.0110025961181854e-05, "loss": 0.603, "step": 3301 }, { "epoch": 1.2169906938173778, "grad_norm": 9.305072459369732, "learning_rate": 4.010693534429472e-05, "loss": 0.342, "step": 3302 }, { "epoch": 1.2173592555053903, "grad_norm": 4.504173039312145, "learning_rate": 4.010384472740759e-05, "loss": 0.2715, "step": 3303 }, { "epoch": 1.2177278171934027, "grad_norm": 6.309033943620673, "learning_rate": 4.010075411052046e-05, "loss": 0.5385, "step": 3304 }, { "epoch": 1.2180963788814152, "grad_norm": 5.938734709202715, "learning_rate": 4.009766349363333e-05, "loss": 0.3903, "step": 3305 }, { "epoch": 1.218464940569428, "grad_norm": 10.01608280081847, "learning_rate": 4.00945728767462e-05, "loss": 0.3058, "step": 3306 }, { "epoch": 1.2188335022574404, "grad_norm": 5.552860780525691, "learning_rate": 4.009148225985907e-05, "loss": 0.4022, "step": 3307 }, { "epoch": 1.2192020639454528, "grad_norm": 6.6325795702253805, "learning_rate": 4.008839164297194e-05, "loss": 0.4997, "step": 3308 }, { "epoch": 1.2195706256334655, "grad_norm": 8.066347608745682, "learning_rate": 4.008530102608481e-05, "loss": 0.4465, "step": 3309 }, { "epoch": 1.219939187321478, "grad_norm": 4.224929014433491, "learning_rate": 4.008221040919768e-05, "loss": 0.2731, "step": 3310 }, { "epoch": 1.2203077490094905, "grad_norm": 14.086945982686972, "learning_rate": 4.0079119792310545e-05, "loss": 0.573, "step": 3311 }, { "epoch": 1.220676310697503, "grad_norm": 17.77103263263219, "learning_rate": 4.007602917542342e-05, "loss": 0.4292, "step": 3312 }, { "epoch": 1.2210448723855156, "grad_norm": 6.528752921927446, "learning_rate": 4.007293855853629e-05, "loss": 0.3854, "step": 3313 }, { "epoch": 1.221413434073528, "grad_norm": 9.437917940093849, "learning_rate": 4.006984794164915e-05, "loss": 0.3903, "step": 3314 }, { "epoch": 1.2217819957615406, "grad_norm": 7.25884154942429, "learning_rate": 4.0066757324762023e-05, "loss": 0.4652, "step": 3315 }, { "epoch": 1.222150557449553, "grad_norm": 9.964607932823188, "learning_rate": 4.0063666707874895e-05, "loss": 0.3322, "step": 3316 }, { "epoch": 1.2225191191375657, "grad_norm": 6.645307689515217, "learning_rate": 4.006057609098776e-05, "loss": 0.5602, "step": 3317 }, { "epoch": 1.2228876808255782, "grad_norm": 7.0795282652860685, "learning_rate": 4.005748547410063e-05, "loss": 0.3859, "step": 3318 }, { "epoch": 1.2232562425135907, "grad_norm": 5.9307546748439695, "learning_rate": 4.00543948572135e-05, "loss": 0.364, "step": 3319 }, { "epoch": 1.2236248042016031, "grad_norm": 8.020215718324353, "learning_rate": 4.005130424032637e-05, "loss": 0.4749, "step": 3320 }, { "epoch": 1.2239933658896158, "grad_norm": 6.74328675044525, "learning_rate": 4.004821362343924e-05, "loss": 0.5343, "step": 3321 }, { "epoch": 1.2243619275776283, "grad_norm": 9.110952773953926, "learning_rate": 4.004512300655211e-05, "loss": 0.4332, "step": 3322 }, { "epoch": 1.2247304892656408, "grad_norm": 5.197170206517925, "learning_rate": 4.004203238966498e-05, "loss": 0.4488, "step": 3323 }, { "epoch": 1.2250990509536535, "grad_norm": 7.055393931355734, "learning_rate": 4.003894177277785e-05, "loss": 0.3822, "step": 3324 }, { "epoch": 1.225467612641666, "grad_norm": 6.570792054808846, "learning_rate": 4.003585115589072e-05, "loss": 0.4979, "step": 3325 }, { "epoch": 1.2258361743296784, "grad_norm": 7.164187098493908, "learning_rate": 4.0032760539003586e-05, "loss": 0.3317, "step": 3326 }, { "epoch": 1.2262047360176909, "grad_norm": 5.863575317409139, "learning_rate": 4.002966992211646e-05, "loss": 0.3352, "step": 3327 }, { "epoch": 1.2265732977057036, "grad_norm": 5.8078699847504875, "learning_rate": 4.002657930522933e-05, "loss": 0.5625, "step": 3328 }, { "epoch": 1.226941859393716, "grad_norm": 6.82596258844048, "learning_rate": 4.00234886883422e-05, "loss": 0.5222, "step": 3329 }, { "epoch": 1.2273104210817285, "grad_norm": 9.554089406617903, "learning_rate": 4.0020398071455064e-05, "loss": 0.2384, "step": 3330 }, { "epoch": 1.2276789827697412, "grad_norm": 4.86015485289222, "learning_rate": 4.001730745456793e-05, "loss": 0.3068, "step": 3331 }, { "epoch": 1.2280475444577537, "grad_norm": 6.934193415969743, "learning_rate": 4.00142168376808e-05, "loss": 0.3845, "step": 3332 }, { "epoch": 1.2284161061457661, "grad_norm": 4.047123374828594, "learning_rate": 4.001112622079367e-05, "loss": 0.3215, "step": 3333 }, { "epoch": 1.2287846678337786, "grad_norm": 5.834032434623773, "learning_rate": 4.000803560390654e-05, "loss": 0.4892, "step": 3334 }, { "epoch": 1.2291532295217913, "grad_norm": 6.083725729353683, "learning_rate": 4.0004944987019413e-05, "loss": 0.4189, "step": 3335 }, { "epoch": 1.2295217912098038, "grad_norm": 4.887909839354267, "learning_rate": 4.000185437013228e-05, "loss": 0.3457, "step": 3336 }, { "epoch": 1.2298903528978162, "grad_norm": 7.855710337455542, "learning_rate": 3.999876375324515e-05, "loss": 0.4412, "step": 3337 }, { "epoch": 1.230258914585829, "grad_norm": 10.122898967000202, "learning_rate": 3.999567313635802e-05, "loss": 0.5781, "step": 3338 }, { "epoch": 1.2306274762738414, "grad_norm": 5.342996823063769, "learning_rate": 3.999258251947089e-05, "loss": 0.339, "step": 3339 }, { "epoch": 1.2309960379618539, "grad_norm": 8.112800712478437, "learning_rate": 3.9989491902583756e-05, "loss": 0.386, "step": 3340 }, { "epoch": 1.2313645996498663, "grad_norm": 7.371332064659053, "learning_rate": 3.998640128569663e-05, "loss": 0.469, "step": 3341 }, { "epoch": 1.231733161337879, "grad_norm": 6.4394309101641385, "learning_rate": 3.99833106688095e-05, "loss": 0.3023, "step": 3342 }, { "epoch": 1.2321017230258915, "grad_norm": 3.884106917271949, "learning_rate": 3.998022005192237e-05, "loss": 0.1513, "step": 3343 }, { "epoch": 1.232470284713904, "grad_norm": 9.007454222330681, "learning_rate": 3.997712943503524e-05, "loss": 0.4041, "step": 3344 }, { "epoch": 1.2328388464019164, "grad_norm": 7.4054160634466655, "learning_rate": 3.9974038818148105e-05, "loss": 0.2864, "step": 3345 }, { "epoch": 1.2332074080899291, "grad_norm": 9.442489012695901, "learning_rate": 3.997094820126097e-05, "loss": 0.6409, "step": 3346 }, { "epoch": 1.2335759697779416, "grad_norm": 10.994117551106093, "learning_rate": 3.996785758437384e-05, "loss": 0.4468, "step": 3347 }, { "epoch": 1.233944531465954, "grad_norm": 14.849844554768284, "learning_rate": 3.996476696748671e-05, "loss": 0.5252, "step": 3348 }, { "epoch": 1.2343130931539665, "grad_norm": 5.406395706241886, "learning_rate": 3.996167635059958e-05, "loss": 0.3182, "step": 3349 }, { "epoch": 1.2346816548419792, "grad_norm": 8.169420154403841, "learning_rate": 3.995858573371245e-05, "loss": 0.6999, "step": 3350 }, { "epoch": 1.2350502165299917, "grad_norm": 4.992508807759545, "learning_rate": 3.995549511682532e-05, "loss": 0.3697, "step": 3351 }, { "epoch": 1.2354187782180042, "grad_norm": 6.823398656540172, "learning_rate": 3.995240449993819e-05, "loss": 0.4079, "step": 3352 }, { "epoch": 1.2357873399060169, "grad_norm": 6.934040478566478, "learning_rate": 3.994931388305106e-05, "loss": 0.4532, "step": 3353 }, { "epoch": 1.2361559015940293, "grad_norm": 12.45361082117759, "learning_rate": 3.994622326616393e-05, "loss": 0.4216, "step": 3354 }, { "epoch": 1.2365244632820418, "grad_norm": 8.930163807151501, "learning_rate": 3.99431326492768e-05, "loss": 0.4543, "step": 3355 }, { "epoch": 1.2368930249700543, "grad_norm": 7.410357788755753, "learning_rate": 3.994004203238967e-05, "loss": 0.4058, "step": 3356 }, { "epoch": 1.237261586658067, "grad_norm": 6.510128053498581, "learning_rate": 3.993695141550254e-05, "loss": 0.3561, "step": 3357 }, { "epoch": 1.2376301483460794, "grad_norm": 7.552510370918535, "learning_rate": 3.993386079861541e-05, "loss": 0.3733, "step": 3358 }, { "epoch": 1.237998710034092, "grad_norm": 9.508809873478434, "learning_rate": 3.9930770181728275e-05, "loss": 0.4503, "step": 3359 }, { "epoch": 1.2383672717221046, "grad_norm": 7.078183744409925, "learning_rate": 3.992767956484114e-05, "loss": 0.352, "step": 3360 }, { "epoch": 1.238735833410117, "grad_norm": 4.9895901079337595, "learning_rate": 3.992458894795401e-05, "loss": 0.2962, "step": 3361 }, { "epoch": 1.2391043950981295, "grad_norm": 7.19480193373818, "learning_rate": 3.992149833106688e-05, "loss": 0.5541, "step": 3362 }, { "epoch": 1.239472956786142, "grad_norm": 9.06114449722602, "learning_rate": 3.991840771417975e-05, "loss": 0.4052, "step": 3363 }, { "epoch": 1.2398415184741547, "grad_norm": 8.124408406948202, "learning_rate": 3.9915317097292624e-05, "loss": 0.4545, "step": 3364 }, { "epoch": 1.2402100801621672, "grad_norm": 5.873277127490971, "learning_rate": 3.991222648040549e-05, "loss": 0.5576, "step": 3365 }, { "epoch": 1.2405786418501796, "grad_norm": 6.835536539580163, "learning_rate": 3.990913586351836e-05, "loss": 0.3814, "step": 3366 }, { "epoch": 1.2409472035381923, "grad_norm": 6.761598018491774, "learning_rate": 3.990604524663123e-05, "loss": 0.4128, "step": 3367 }, { "epoch": 1.2413157652262048, "grad_norm": 4.628500773380046, "learning_rate": 3.99029546297441e-05, "loss": 0.3894, "step": 3368 }, { "epoch": 1.2416843269142173, "grad_norm": 5.149750468310504, "learning_rate": 3.9899864012856966e-05, "loss": 0.4298, "step": 3369 }, { "epoch": 1.2420528886022297, "grad_norm": 38.93825553540663, "learning_rate": 3.989677339596984e-05, "loss": 0.7061, "step": 3370 }, { "epoch": 1.2424214502902424, "grad_norm": 6.161482857413243, "learning_rate": 3.989368277908271e-05, "loss": 0.4114, "step": 3371 }, { "epoch": 1.242790011978255, "grad_norm": 6.167780732568163, "learning_rate": 3.989059216219558e-05, "loss": 0.4944, "step": 3372 }, { "epoch": 1.2431585736662674, "grad_norm": 7.585890482040825, "learning_rate": 3.988750154530845e-05, "loss": 0.4524, "step": 3373 }, { "epoch": 1.2435271353542798, "grad_norm": 13.677082713529753, "learning_rate": 3.9884410928421315e-05, "loss": 0.6972, "step": 3374 }, { "epoch": 1.2438956970422925, "grad_norm": 7.542186346818588, "learning_rate": 3.988132031153418e-05, "loss": 0.4753, "step": 3375 }, { "epoch": 1.244264258730305, "grad_norm": 9.690915791448873, "learning_rate": 3.987822969464705e-05, "loss": 0.5858, "step": 3376 }, { "epoch": 1.2446328204183175, "grad_norm": 6.833560970825693, "learning_rate": 3.987513907775992e-05, "loss": 0.2874, "step": 3377 }, { "epoch": 1.24500138210633, "grad_norm": 7.0870404883350835, "learning_rate": 3.987204846087279e-05, "loss": 0.348, "step": 3378 }, { "epoch": 1.2453699437943426, "grad_norm": 5.604518137218849, "learning_rate": 3.986895784398566e-05, "loss": 0.3234, "step": 3379 }, { "epoch": 1.245738505482355, "grad_norm": 5.676546721222565, "learning_rate": 3.986586722709853e-05, "loss": 0.2589, "step": 3380 }, { "epoch": 1.2461070671703676, "grad_norm": 6.920440869702933, "learning_rate": 3.98627766102114e-05, "loss": 0.5921, "step": 3381 }, { "epoch": 1.2464756288583803, "grad_norm": 6.499522851890444, "learning_rate": 3.985968599332427e-05, "loss": 0.2863, "step": 3382 }, { "epoch": 1.2468441905463927, "grad_norm": 4.973007777060881, "learning_rate": 3.9856595376437136e-05, "loss": 0.2985, "step": 3383 }, { "epoch": 1.2472127522344052, "grad_norm": 5.677656101649143, "learning_rate": 3.985350475955001e-05, "loss": 0.3891, "step": 3384 }, { "epoch": 1.2475813139224177, "grad_norm": 7.299107996122711, "learning_rate": 3.985041414266288e-05, "loss": 0.3528, "step": 3385 }, { "epoch": 1.2479498756104304, "grad_norm": 6.153824536579017, "learning_rate": 3.984732352577575e-05, "loss": 0.358, "step": 3386 }, { "epoch": 1.2483184372984428, "grad_norm": 4.908166984670964, "learning_rate": 3.984423290888862e-05, "loss": 0.322, "step": 3387 }, { "epoch": 1.2486869989864553, "grad_norm": 5.602537275170754, "learning_rate": 3.9841142292001485e-05, "loss": 0.3498, "step": 3388 }, { "epoch": 1.249055560674468, "grad_norm": 6.050317538416792, "learning_rate": 3.9838051675114356e-05, "loss": 0.5699, "step": 3389 }, { "epoch": 1.2494241223624805, "grad_norm": 7.539828025414906, "learning_rate": 3.983496105822722e-05, "loss": 0.4083, "step": 3390 }, { "epoch": 1.249792684050493, "grad_norm": 9.526473252308701, "learning_rate": 3.983187044134009e-05, "loss": 0.5083, "step": 3391 }, { "epoch": 1.2501612457385054, "grad_norm": 8.438929909941328, "learning_rate": 3.982877982445296e-05, "loss": 0.6744, "step": 3392 }, { "epoch": 1.250529807426518, "grad_norm": 6.748274971271593, "learning_rate": 3.982568920756583e-05, "loss": 0.4219, "step": 3393 }, { "epoch": 1.2508983691145306, "grad_norm": 7.659107406841299, "learning_rate": 3.98225985906787e-05, "loss": 0.2945, "step": 3394 }, { "epoch": 1.251266930802543, "grad_norm": 18.60230157209846, "learning_rate": 3.981950797379157e-05, "loss": 0.3766, "step": 3395 }, { "epoch": 1.2516354924905557, "grad_norm": 4.550037241353644, "learning_rate": 3.981641735690444e-05, "loss": 0.3846, "step": 3396 }, { "epoch": 1.2520040541785682, "grad_norm": 5.277105567464613, "learning_rate": 3.981332674001731e-05, "loss": 0.4525, "step": 3397 }, { "epoch": 1.2523726158665807, "grad_norm": 6.353194398965368, "learning_rate": 3.9810236123130177e-05, "loss": 0.2919, "step": 3398 }, { "epoch": 1.2527411775545931, "grad_norm": 5.73158366689054, "learning_rate": 3.980714550624305e-05, "loss": 0.454, "step": 3399 }, { "epoch": 1.2531097392426056, "grad_norm": 6.274666291744439, "learning_rate": 3.980405488935592e-05, "loss": 0.4308, "step": 3400 }, { "epoch": 1.2534783009306183, "grad_norm": 6.265601560020837, "learning_rate": 3.980096427246879e-05, "loss": 0.3405, "step": 3401 }, { "epoch": 1.2538468626186308, "grad_norm": 6.499392554436157, "learning_rate": 3.9797873655581655e-05, "loss": 0.3396, "step": 3402 }, { "epoch": 1.2542154243066435, "grad_norm": 5.734192057921404, "learning_rate": 3.9794783038694526e-05, "loss": 0.5335, "step": 3403 }, { "epoch": 1.254583985994656, "grad_norm": 4.700448424678906, "learning_rate": 3.97916924218074e-05, "loss": 0.2343, "step": 3404 }, { "epoch": 1.2549525476826684, "grad_norm": 6.733760938506291, "learning_rate": 3.978860180492026e-05, "loss": 0.4301, "step": 3405 }, { "epoch": 1.2553211093706809, "grad_norm": 5.007280770814978, "learning_rate": 3.978551118803313e-05, "loss": 0.308, "step": 3406 }, { "epoch": 1.2556896710586933, "grad_norm": 5.514977263112377, "learning_rate": 3.9782420571146004e-05, "loss": 0.3637, "step": 3407 }, { "epoch": 1.256058232746706, "grad_norm": 4.064614317968312, "learning_rate": 3.977932995425887e-05, "loss": 0.1792, "step": 3408 }, { "epoch": 1.2564267944347185, "grad_norm": 8.008133091905739, "learning_rate": 3.977623933737174e-05, "loss": 0.2868, "step": 3409 }, { "epoch": 1.256795356122731, "grad_norm": 7.722665635268703, "learning_rate": 3.977314872048461e-05, "loss": 0.6572, "step": 3410 }, { "epoch": 1.2571639178107437, "grad_norm": 5.416865340281785, "learning_rate": 3.977005810359748e-05, "loss": 0.3363, "step": 3411 }, { "epoch": 1.2575324794987561, "grad_norm": 5.179372648448279, "learning_rate": 3.9766967486710346e-05, "loss": 0.3618, "step": 3412 }, { "epoch": 1.2579010411867686, "grad_norm": 6.978759146412263, "learning_rate": 3.976387686982322e-05, "loss": 0.5669, "step": 3413 }, { "epoch": 1.258269602874781, "grad_norm": 5.702110555646774, "learning_rate": 3.976078625293609e-05, "loss": 0.4206, "step": 3414 }, { "epoch": 1.2586381645627938, "grad_norm": 7.033012203219526, "learning_rate": 3.975769563604896e-05, "loss": 0.3879, "step": 3415 }, { "epoch": 1.2590067262508062, "grad_norm": 9.822088719760561, "learning_rate": 3.975460501916183e-05, "loss": 0.4078, "step": 3416 }, { "epoch": 1.2593752879388187, "grad_norm": 5.774010987247892, "learning_rate": 3.9751514402274695e-05, "loss": 0.2744, "step": 3417 }, { "epoch": 1.2597438496268314, "grad_norm": 6.200511775890679, "learning_rate": 3.9748423785387567e-05, "loss": 0.4432, "step": 3418 }, { "epoch": 1.2601124113148439, "grad_norm": 9.56191516159902, "learning_rate": 3.974533316850044e-05, "loss": 0.5777, "step": 3419 }, { "epoch": 1.2604809730028563, "grad_norm": 7.60070683053428, "learning_rate": 3.97422425516133e-05, "loss": 0.4801, "step": 3420 }, { "epoch": 1.2608495346908688, "grad_norm": 6.05107818328071, "learning_rate": 3.973915193472617e-05, "loss": 0.4193, "step": 3421 }, { "epoch": 1.2612180963788815, "grad_norm": 9.056562912446939, "learning_rate": 3.973606131783904e-05, "loss": 0.6947, "step": 3422 }, { "epoch": 1.261586658066894, "grad_norm": 6.937352084395824, "learning_rate": 3.973297070095191e-05, "loss": 0.478, "step": 3423 }, { "epoch": 1.2619552197549064, "grad_norm": 6.869474375244344, "learning_rate": 3.972988008406478e-05, "loss": 0.3425, "step": 3424 }, { "epoch": 1.2623237814429191, "grad_norm": 3.6329656753241553, "learning_rate": 3.972678946717765e-05, "loss": 0.2846, "step": 3425 }, { "epoch": 1.2626923431309316, "grad_norm": 6.096961793814553, "learning_rate": 3.972369885029052e-05, "loss": 0.3482, "step": 3426 }, { "epoch": 1.263060904818944, "grad_norm": 7.6362641728516785, "learning_rate": 3.972060823340339e-05, "loss": 0.48, "step": 3427 }, { "epoch": 1.2634294665069565, "grad_norm": 6.713036383255356, "learning_rate": 3.971751761651626e-05, "loss": 0.4436, "step": 3428 }, { "epoch": 1.263798028194969, "grad_norm": 5.770549707523708, "learning_rate": 3.971442699962913e-05, "loss": 0.5086, "step": 3429 }, { "epoch": 1.2641665898829817, "grad_norm": 10.061316379096397, "learning_rate": 3.9711336382742e-05, "loss": 0.4549, "step": 3430 }, { "epoch": 1.2645351515709942, "grad_norm": 9.347683183987524, "learning_rate": 3.9708245765854865e-05, "loss": 0.5516, "step": 3431 }, { "epoch": 1.2649037132590069, "grad_norm": 7.7531033117013255, "learning_rate": 3.9705155148967736e-05, "loss": 0.3002, "step": 3432 }, { "epoch": 1.2652722749470193, "grad_norm": 6.593110690830465, "learning_rate": 3.970206453208061e-05, "loss": 0.5467, "step": 3433 }, { "epoch": 1.2656408366350318, "grad_norm": 3.5087902855811257, "learning_rate": 3.969897391519348e-05, "loss": 0.2184, "step": 3434 }, { "epoch": 1.2660093983230443, "grad_norm": 5.000557677639799, "learning_rate": 3.969588329830634e-05, "loss": 0.2813, "step": 3435 }, { "epoch": 1.2663779600110567, "grad_norm": 4.177756517662802, "learning_rate": 3.9692792681419214e-05, "loss": 0.1876, "step": 3436 }, { "epoch": 1.2667465216990694, "grad_norm": 5.567971956335895, "learning_rate": 3.968970206453208e-05, "loss": 0.389, "step": 3437 }, { "epoch": 1.267115083387082, "grad_norm": 9.940489121497462, "learning_rate": 3.968661144764495e-05, "loss": 0.4844, "step": 3438 }, { "epoch": 1.2674836450750944, "grad_norm": 3.408416898739829, "learning_rate": 3.968352083075782e-05, "loss": 0.2199, "step": 3439 }, { "epoch": 1.267852206763107, "grad_norm": 7.453294412218992, "learning_rate": 3.968043021387069e-05, "loss": 0.3749, "step": 3440 }, { "epoch": 1.2682207684511195, "grad_norm": 4.889115463169901, "learning_rate": 3.9677339596983557e-05, "loss": 0.2861, "step": 3441 }, { "epoch": 1.268589330139132, "grad_norm": 4.399238798014166, "learning_rate": 3.967424898009643e-05, "loss": 0.2981, "step": 3442 }, { "epoch": 1.2689578918271445, "grad_norm": 5.02374503942939, "learning_rate": 3.96711583632093e-05, "loss": 0.3795, "step": 3443 }, { "epoch": 1.2693264535151572, "grad_norm": 8.371409998984266, "learning_rate": 3.966806774632217e-05, "loss": 0.379, "step": 3444 }, { "epoch": 1.2696950152031696, "grad_norm": 4.862508566880709, "learning_rate": 3.966497712943504e-05, "loss": 0.312, "step": 3445 }, { "epoch": 1.270063576891182, "grad_norm": 11.574145581738016, "learning_rate": 3.9661886512547906e-05, "loss": 0.4544, "step": 3446 }, { "epoch": 1.2704321385791948, "grad_norm": 12.06451367583356, "learning_rate": 3.965879589566078e-05, "loss": 0.5005, "step": 3447 }, { "epoch": 1.2708007002672073, "grad_norm": 5.258454554269382, "learning_rate": 3.965570527877365e-05, "loss": 0.3739, "step": 3448 }, { "epoch": 1.2711692619552197, "grad_norm": 9.430080301341347, "learning_rate": 3.965261466188652e-05, "loss": 0.5826, "step": 3449 }, { "epoch": 1.2715378236432322, "grad_norm": 8.555207871587587, "learning_rate": 3.9649524044999384e-05, "loss": 0.5258, "step": 3450 }, { "epoch": 1.271906385331245, "grad_norm": 9.291122169677623, "learning_rate": 3.964643342811225e-05, "loss": 0.6153, "step": 3451 }, { "epoch": 1.2722749470192574, "grad_norm": 6.196473675465925, "learning_rate": 3.964334281122512e-05, "loss": 0.5906, "step": 3452 }, { "epoch": 1.2726435087072698, "grad_norm": 7.866118251413136, "learning_rate": 3.964025219433799e-05, "loss": 0.3479, "step": 3453 }, { "epoch": 1.2730120703952825, "grad_norm": 6.475325186675443, "learning_rate": 3.963716157745086e-05, "loss": 0.4642, "step": 3454 }, { "epoch": 1.273380632083295, "grad_norm": 6.157076862400043, "learning_rate": 3.963407096056373e-05, "loss": 0.3627, "step": 3455 }, { "epoch": 1.2737491937713075, "grad_norm": 7.550739317815538, "learning_rate": 3.96309803436766e-05, "loss": 0.2566, "step": 3456 }, { "epoch": 1.27411775545932, "grad_norm": 6.835080861504184, "learning_rate": 3.962788972678947e-05, "loss": 0.4737, "step": 3457 }, { "epoch": 1.2744863171473324, "grad_norm": 8.27586149128002, "learning_rate": 3.962479910990234e-05, "loss": 0.6676, "step": 3458 }, { "epoch": 1.274854878835345, "grad_norm": 7.943154070911522, "learning_rate": 3.962170849301521e-05, "loss": 0.3565, "step": 3459 }, { "epoch": 1.2752234405233576, "grad_norm": 6.356916022501688, "learning_rate": 3.9618617876128075e-05, "loss": 0.3501, "step": 3460 }, { "epoch": 1.2755920022113703, "grad_norm": 4.467257163647747, "learning_rate": 3.9615527259240946e-05, "loss": 0.3508, "step": 3461 }, { "epoch": 1.2759605638993827, "grad_norm": 7.887097467225794, "learning_rate": 3.961243664235382e-05, "loss": 0.4068, "step": 3462 }, { "epoch": 1.2763291255873952, "grad_norm": 4.599915395870877, "learning_rate": 3.960934602546669e-05, "loss": 0.3336, "step": 3463 }, { "epoch": 1.2766976872754077, "grad_norm": 4.555085729133521, "learning_rate": 3.960625540857956e-05, "loss": 0.2561, "step": 3464 }, { "epoch": 1.2770662489634201, "grad_norm": 8.109467199018264, "learning_rate": 3.9603164791692424e-05, "loss": 0.4723, "step": 3465 }, { "epoch": 1.2774348106514328, "grad_norm": 5.69655014844511, "learning_rate": 3.960007417480529e-05, "loss": 0.3677, "step": 3466 }, { "epoch": 1.2778033723394453, "grad_norm": 5.411994131644733, "learning_rate": 3.959698355791816e-05, "loss": 0.4798, "step": 3467 }, { "epoch": 1.2781719340274578, "grad_norm": 6.192560765781037, "learning_rate": 3.959389294103103e-05, "loss": 0.422, "step": 3468 }, { "epoch": 1.2785404957154705, "grad_norm": 7.043859678292894, "learning_rate": 3.95908023241439e-05, "loss": 0.4214, "step": 3469 }, { "epoch": 1.278909057403483, "grad_norm": 6.546984541893838, "learning_rate": 3.958771170725677e-05, "loss": 0.4317, "step": 3470 }, { "epoch": 1.2792776190914954, "grad_norm": 4.34931644023684, "learning_rate": 3.958462109036964e-05, "loss": 0.2401, "step": 3471 }, { "epoch": 1.2796461807795079, "grad_norm": 6.906448292366831, "learning_rate": 3.958153047348251e-05, "loss": 0.6527, "step": 3472 }, { "epoch": 1.2800147424675206, "grad_norm": 6.121304701314736, "learning_rate": 3.957843985659538e-05, "loss": 0.3914, "step": 3473 }, { "epoch": 1.280383304155533, "grad_norm": 4.904324098888996, "learning_rate": 3.9575349239708245e-05, "loss": 0.3116, "step": 3474 }, { "epoch": 1.2807518658435455, "grad_norm": 9.84279121694805, "learning_rate": 3.9572258622821116e-05, "loss": 0.5052, "step": 3475 }, { "epoch": 1.2811204275315582, "grad_norm": 9.688893470218677, "learning_rate": 3.956916800593399e-05, "loss": 0.353, "step": 3476 }, { "epoch": 1.2814889892195707, "grad_norm": 5.088740781898929, "learning_rate": 3.956607738904686e-05, "loss": 0.3811, "step": 3477 }, { "epoch": 1.2818575509075831, "grad_norm": 8.558735040624022, "learning_rate": 3.956298677215973e-05, "loss": 0.5724, "step": 3478 }, { "epoch": 1.2822261125955956, "grad_norm": 9.477621926560085, "learning_rate": 3.9559896155272594e-05, "loss": 0.4725, "step": 3479 }, { "epoch": 1.2825946742836083, "grad_norm": 7.532104950176196, "learning_rate": 3.9556805538385465e-05, "loss": 0.3772, "step": 3480 }, { "epoch": 1.2829632359716208, "grad_norm": 6.635448924834101, "learning_rate": 3.955371492149833e-05, "loss": 0.3304, "step": 3481 }, { "epoch": 1.2833317976596332, "grad_norm": 4.969044226806006, "learning_rate": 3.95506243046112e-05, "loss": 0.3015, "step": 3482 }, { "epoch": 1.283700359347646, "grad_norm": 7.912483020059165, "learning_rate": 3.954753368772407e-05, "loss": 0.5819, "step": 3483 }, { "epoch": 1.2840689210356584, "grad_norm": 4.213482542373671, "learning_rate": 3.9544443070836936e-05, "loss": 0.3515, "step": 3484 }, { "epoch": 1.2844374827236709, "grad_norm": 5.575353281092093, "learning_rate": 3.954135245394981e-05, "loss": 0.2845, "step": 3485 }, { "epoch": 1.2848060444116833, "grad_norm": 9.00578927232258, "learning_rate": 3.953826183706268e-05, "loss": 0.3523, "step": 3486 }, { "epoch": 1.2851746060996958, "grad_norm": 4.61491676429088, "learning_rate": 3.953517122017555e-05, "loss": 0.3717, "step": 3487 }, { "epoch": 1.2855431677877085, "grad_norm": 4.418518302055079, "learning_rate": 3.953208060328842e-05, "loss": 0.344, "step": 3488 }, { "epoch": 1.285911729475721, "grad_norm": 4.564129682057541, "learning_rate": 3.9528989986401286e-05, "loss": 0.3209, "step": 3489 }, { "epoch": 1.2862802911637337, "grad_norm": 8.576512808854728, "learning_rate": 3.952589936951416e-05, "loss": 0.5767, "step": 3490 }, { "epoch": 1.2866488528517461, "grad_norm": 4.621624101144033, "learning_rate": 3.952280875262703e-05, "loss": 0.2597, "step": 3491 }, { "epoch": 1.2870174145397586, "grad_norm": 4.503519589186389, "learning_rate": 3.95197181357399e-05, "loss": 0.3386, "step": 3492 }, { "epoch": 1.287385976227771, "grad_norm": 8.211352830539996, "learning_rate": 3.9516627518852764e-05, "loss": 0.5387, "step": 3493 }, { "epoch": 1.2877545379157835, "grad_norm": 5.27749988203532, "learning_rate": 3.9513536901965635e-05, "loss": 0.2962, "step": 3494 }, { "epoch": 1.2881230996037962, "grad_norm": 7.9525630729929855, "learning_rate": 3.9510446285078506e-05, "loss": 0.369, "step": 3495 }, { "epoch": 1.2884916612918087, "grad_norm": 5.175932707100019, "learning_rate": 3.950735566819137e-05, "loss": 0.3449, "step": 3496 }, { "epoch": 1.2888602229798212, "grad_norm": 6.980601816985046, "learning_rate": 3.950426505130424e-05, "loss": 0.7193, "step": 3497 }, { "epoch": 1.2892287846678339, "grad_norm": 13.700280201963196, "learning_rate": 3.950117443441711e-05, "loss": 0.4499, "step": 3498 }, { "epoch": 1.2895973463558463, "grad_norm": 4.822508671793797, "learning_rate": 3.949808381752998e-05, "loss": 0.3927, "step": 3499 }, { "epoch": 1.2899659080438588, "grad_norm": 7.273103405155865, "learning_rate": 3.949499320064285e-05, "loss": 0.3912, "step": 3500 }, { "epoch": 1.2903344697318713, "grad_norm": 9.463108912923467, "learning_rate": 3.949190258375572e-05, "loss": 0.3971, "step": 3501 }, { "epoch": 1.290703031419884, "grad_norm": 8.958548136656791, "learning_rate": 3.948881196686859e-05, "loss": 0.4496, "step": 3502 }, { "epoch": 1.2910715931078964, "grad_norm": 13.12109491292046, "learning_rate": 3.9485721349981455e-05, "loss": 0.405, "step": 3503 }, { "epoch": 1.291440154795909, "grad_norm": 6.943573089186363, "learning_rate": 3.9482630733094326e-05, "loss": 0.4764, "step": 3504 }, { "epoch": 1.2918087164839216, "grad_norm": 10.092040114944085, "learning_rate": 3.94795401162072e-05, "loss": 0.5441, "step": 3505 }, { "epoch": 1.292177278171934, "grad_norm": 13.222203301880137, "learning_rate": 3.947644949932007e-05, "loss": 0.4763, "step": 3506 }, { "epoch": 1.2925458398599465, "grad_norm": 5.933499886107482, "learning_rate": 3.947335888243294e-05, "loss": 0.5305, "step": 3507 }, { "epoch": 1.292914401547959, "grad_norm": 4.827237439593973, "learning_rate": 3.9470268265545804e-05, "loss": 0.3394, "step": 3508 }, { "epoch": 1.2932829632359717, "grad_norm": 5.609739668954557, "learning_rate": 3.9467177648658676e-05, "loss": 0.4547, "step": 3509 }, { "epoch": 1.2936515249239842, "grad_norm": 5.825029548275378, "learning_rate": 3.946408703177155e-05, "loss": 0.4683, "step": 3510 }, { "epoch": 1.2940200866119966, "grad_norm": 7.164730993466554, "learning_rate": 3.946099641488441e-05, "loss": 0.4514, "step": 3511 }, { "epoch": 1.2943886483000093, "grad_norm": 8.220030014094668, "learning_rate": 3.945790579799728e-05, "loss": 0.4169, "step": 3512 }, { "epoch": 1.2947572099880218, "grad_norm": 6.435904592201618, "learning_rate": 3.945481518111015e-05, "loss": 0.3906, "step": 3513 }, { "epoch": 1.2951257716760343, "grad_norm": 5.525237355376867, "learning_rate": 3.945172456422302e-05, "loss": 0.3842, "step": 3514 }, { "epoch": 1.2954943333640467, "grad_norm": 7.170179910570667, "learning_rate": 3.944863394733589e-05, "loss": 0.502, "step": 3515 }, { "epoch": 1.2958628950520592, "grad_norm": 9.032731195442555, "learning_rate": 3.944554333044876e-05, "loss": 0.457, "step": 3516 }, { "epoch": 1.296231456740072, "grad_norm": 6.015495952856912, "learning_rate": 3.944245271356163e-05, "loss": 0.3917, "step": 3517 }, { "epoch": 1.2966000184280844, "grad_norm": 6.282567735300019, "learning_rate": 3.9439362096674496e-05, "loss": 0.366, "step": 3518 }, { "epoch": 1.296968580116097, "grad_norm": 8.886481796291505, "learning_rate": 3.943627147978737e-05, "loss": 0.4936, "step": 3519 }, { "epoch": 1.2973371418041095, "grad_norm": 6.800026770146352, "learning_rate": 3.943318086290024e-05, "loss": 0.3707, "step": 3520 }, { "epoch": 1.297705703492122, "grad_norm": 7.7870557336846975, "learning_rate": 3.943009024601311e-05, "loss": 0.6732, "step": 3521 }, { "epoch": 1.2980742651801345, "grad_norm": 9.576979501750703, "learning_rate": 3.9426999629125974e-05, "loss": 0.6066, "step": 3522 }, { "epoch": 1.298442826868147, "grad_norm": 13.622698388239868, "learning_rate": 3.9423909012238845e-05, "loss": 0.3286, "step": 3523 }, { "epoch": 1.2988113885561596, "grad_norm": 9.108446548284137, "learning_rate": 3.9420818395351716e-05, "loss": 0.5027, "step": 3524 }, { "epoch": 1.299179950244172, "grad_norm": 6.505018790919427, "learning_rate": 3.941772777846459e-05, "loss": 0.4309, "step": 3525 }, { "epoch": 1.2995485119321846, "grad_norm": 10.047481346521197, "learning_rate": 3.941463716157745e-05, "loss": 0.5017, "step": 3526 }, { "epoch": 1.2999170736201973, "grad_norm": 6.862599512153466, "learning_rate": 3.941154654469032e-05, "loss": 0.5277, "step": 3527 }, { "epoch": 1.3002856353082097, "grad_norm": 7.449712647906424, "learning_rate": 3.940845592780319e-05, "loss": 0.3478, "step": 3528 }, { "epoch": 1.3006541969962222, "grad_norm": 6.724576189378615, "learning_rate": 3.940536531091606e-05, "loss": 0.3683, "step": 3529 }, { "epoch": 1.3010227586842347, "grad_norm": 7.214481825908041, "learning_rate": 3.940227469402893e-05, "loss": 0.352, "step": 3530 }, { "epoch": 1.3013913203722474, "grad_norm": 6.391045218458011, "learning_rate": 3.93991840771418e-05, "loss": 0.462, "step": 3531 }, { "epoch": 1.3017598820602598, "grad_norm": 5.156706633722187, "learning_rate": 3.9396093460254666e-05, "loss": 0.4224, "step": 3532 }, { "epoch": 1.3021284437482723, "grad_norm": 4.492534166311218, "learning_rate": 3.939300284336754e-05, "loss": 0.3621, "step": 3533 }, { "epoch": 1.302497005436285, "grad_norm": 5.852637402316009, "learning_rate": 3.938991222648041e-05, "loss": 0.4884, "step": 3534 }, { "epoch": 1.3028655671242975, "grad_norm": 6.292060014500356, "learning_rate": 3.938682160959328e-05, "loss": 0.6081, "step": 3535 }, { "epoch": 1.30323412881231, "grad_norm": 4.964560223759204, "learning_rate": 3.938373099270615e-05, "loss": 0.5742, "step": 3536 }, { "epoch": 1.3036026905003224, "grad_norm": 5.975730129945419, "learning_rate": 3.9380640375819015e-05, "loss": 0.3728, "step": 3537 }, { "epoch": 1.303971252188335, "grad_norm": 4.932722073566857, "learning_rate": 3.9377549758931886e-05, "loss": 0.3528, "step": 3538 }, { "epoch": 1.3043398138763476, "grad_norm": 6.7483738953864725, "learning_rate": 3.937445914204476e-05, "loss": 0.4892, "step": 3539 }, { "epoch": 1.30470837556436, "grad_norm": 7.2233686988057535, "learning_rate": 3.937136852515763e-05, "loss": 0.502, "step": 3540 }, { "epoch": 1.3050769372523727, "grad_norm": 7.600767307698051, "learning_rate": 3.936827790827049e-05, "loss": 0.4247, "step": 3541 }, { "epoch": 1.3054454989403852, "grad_norm": 6.01776417930891, "learning_rate": 3.936518729138336e-05, "loss": 0.4171, "step": 3542 }, { "epoch": 1.3058140606283977, "grad_norm": 7.879306887046834, "learning_rate": 3.936209667449623e-05, "loss": 0.5724, "step": 3543 }, { "epoch": 1.3061826223164101, "grad_norm": 6.295685989537393, "learning_rate": 3.93590060576091e-05, "loss": 0.3995, "step": 3544 }, { "epoch": 1.3065511840044226, "grad_norm": 13.896250354890029, "learning_rate": 3.935591544072197e-05, "loss": 0.5649, "step": 3545 }, { "epoch": 1.3069197456924353, "grad_norm": 8.299383154318608, "learning_rate": 3.9352824823834835e-05, "loss": 0.4815, "step": 3546 }, { "epoch": 1.3072883073804478, "grad_norm": 5.49264589569971, "learning_rate": 3.9349734206947706e-05, "loss": 0.3574, "step": 3547 }, { "epoch": 1.3076568690684605, "grad_norm": 4.9259036092058786, "learning_rate": 3.934664359006058e-05, "loss": 0.4176, "step": 3548 }, { "epoch": 1.308025430756473, "grad_norm": 5.846722667836216, "learning_rate": 3.934355297317345e-05, "loss": 0.5686, "step": 3549 }, { "epoch": 1.3083939924444854, "grad_norm": 7.349035300958645, "learning_rate": 3.934046235628632e-05, "loss": 0.5315, "step": 3550 }, { "epoch": 1.3087625541324979, "grad_norm": 10.328503946574937, "learning_rate": 3.9337371739399184e-05, "loss": 0.5072, "step": 3551 }, { "epoch": 1.3091311158205103, "grad_norm": 5.402251777224602, "learning_rate": 3.9334281122512056e-05, "loss": 0.3426, "step": 3552 }, { "epoch": 1.309499677508523, "grad_norm": 5.060137538925379, "learning_rate": 3.933119050562493e-05, "loss": 0.3357, "step": 3553 }, { "epoch": 1.3098682391965355, "grad_norm": 4.689921046326512, "learning_rate": 3.93280998887378e-05, "loss": 0.312, "step": 3554 }, { "epoch": 1.310236800884548, "grad_norm": 7.641073326890278, "learning_rate": 3.932500927185067e-05, "loss": 0.4438, "step": 3555 }, { "epoch": 1.3106053625725607, "grad_norm": 8.703886755543198, "learning_rate": 3.932191865496353e-05, "loss": 0.5067, "step": 3556 }, { "epoch": 1.3109739242605731, "grad_norm": 8.541430760048982, "learning_rate": 3.93188280380764e-05, "loss": 0.4979, "step": 3557 }, { "epoch": 1.3113424859485856, "grad_norm": 5.19817109590571, "learning_rate": 3.931573742118927e-05, "loss": 0.3346, "step": 3558 }, { "epoch": 1.311711047636598, "grad_norm": 6.0553115227822465, "learning_rate": 3.931264680430214e-05, "loss": 0.4383, "step": 3559 }, { "epoch": 1.3120796093246108, "grad_norm": 4.755022706179151, "learning_rate": 3.930955618741501e-05, "loss": 0.3754, "step": 3560 }, { "epoch": 1.3124481710126232, "grad_norm": 4.838034524883513, "learning_rate": 3.9306465570527876e-05, "loss": 0.2935, "step": 3561 }, { "epoch": 1.3128167327006357, "grad_norm": 5.371135653245634, "learning_rate": 3.930337495364075e-05, "loss": 0.5362, "step": 3562 }, { "epoch": 1.3131852943886484, "grad_norm": 7.437721569703823, "learning_rate": 3.930028433675362e-05, "loss": 0.5731, "step": 3563 }, { "epoch": 1.3135538560766609, "grad_norm": 8.004273703589392, "learning_rate": 3.929719371986649e-05, "loss": 0.5503, "step": 3564 }, { "epoch": 1.3139224177646733, "grad_norm": 9.320072244519697, "learning_rate": 3.9294103102979354e-05, "loss": 0.235, "step": 3565 }, { "epoch": 1.3142909794526858, "grad_norm": 8.993940220690783, "learning_rate": 3.9291012486092225e-05, "loss": 0.6898, "step": 3566 }, { "epoch": 1.3146595411406985, "grad_norm": 10.120949771023376, "learning_rate": 3.9287921869205096e-05, "loss": 0.5904, "step": 3567 }, { "epoch": 1.315028102828711, "grad_norm": 7.288766280489557, "learning_rate": 3.928483125231797e-05, "loss": 0.4991, "step": 3568 }, { "epoch": 1.3153966645167234, "grad_norm": 6.3110893864026405, "learning_rate": 3.928174063543084e-05, "loss": 0.2749, "step": 3569 }, { "epoch": 1.3157652262047361, "grad_norm": 5.7554050417693645, "learning_rate": 3.92786500185437e-05, "loss": 0.4917, "step": 3570 }, { "epoch": 1.3161337878927486, "grad_norm": 6.200968562119828, "learning_rate": 3.927555940165657e-05, "loss": 0.4351, "step": 3571 }, { "epoch": 1.316502349580761, "grad_norm": 7.353490983939459, "learning_rate": 3.927246878476944e-05, "loss": 0.5868, "step": 3572 }, { "epoch": 1.3168709112687735, "grad_norm": 6.3279411383397814, "learning_rate": 3.926937816788231e-05, "loss": 0.3927, "step": 3573 }, { "epoch": 1.317239472956786, "grad_norm": 8.681273500041312, "learning_rate": 3.926628755099518e-05, "loss": 0.5432, "step": 3574 }, { "epoch": 1.3176080346447987, "grad_norm": 6.4192717907474774, "learning_rate": 3.9263196934108046e-05, "loss": 0.3405, "step": 3575 }, { "epoch": 1.3179765963328112, "grad_norm": 8.03280209041827, "learning_rate": 3.926010631722092e-05, "loss": 0.3708, "step": 3576 }, { "epoch": 1.3183451580208239, "grad_norm": 5.499841687785044, "learning_rate": 3.925701570033379e-05, "loss": 0.4089, "step": 3577 }, { "epoch": 1.3187137197088363, "grad_norm": 5.843117419597113, "learning_rate": 3.925392508344666e-05, "loss": 0.4578, "step": 3578 }, { "epoch": 1.3190822813968488, "grad_norm": 7.374167443820485, "learning_rate": 3.925083446655953e-05, "loss": 0.4859, "step": 3579 }, { "epoch": 1.3194508430848613, "grad_norm": 8.56901669855858, "learning_rate": 3.9247743849672395e-05, "loss": 0.3601, "step": 3580 }, { "epoch": 1.3198194047728737, "grad_norm": 4.772400888619817, "learning_rate": 3.9244653232785266e-05, "loss": 0.4072, "step": 3581 }, { "epoch": 1.3201879664608864, "grad_norm": 4.257520193162967, "learning_rate": 3.924156261589814e-05, "loss": 0.1974, "step": 3582 }, { "epoch": 1.320556528148899, "grad_norm": 14.277808110608554, "learning_rate": 3.923847199901101e-05, "loss": 0.4605, "step": 3583 }, { "epoch": 1.3209250898369114, "grad_norm": 7.953126918590142, "learning_rate": 3.923538138212387e-05, "loss": 0.3087, "step": 3584 }, { "epoch": 1.321293651524924, "grad_norm": 5.5375054972946405, "learning_rate": 3.9232290765236744e-05, "loss": 0.4974, "step": 3585 }, { "epoch": 1.3216622132129365, "grad_norm": 6.39121711804567, "learning_rate": 3.9229200148349615e-05, "loss": 0.5254, "step": 3586 }, { "epoch": 1.322030774900949, "grad_norm": 5.581948217651344, "learning_rate": 3.922610953146248e-05, "loss": 0.3368, "step": 3587 }, { "epoch": 1.3223993365889615, "grad_norm": 6.672320384541261, "learning_rate": 3.922301891457535e-05, "loss": 0.4062, "step": 3588 }, { "epoch": 1.3227678982769742, "grad_norm": 7.761738409008823, "learning_rate": 3.921992829768822e-05, "loss": 0.3441, "step": 3589 }, { "epoch": 1.3231364599649866, "grad_norm": 6.357167153595224, "learning_rate": 3.9216837680801086e-05, "loss": 0.3795, "step": 3590 }, { "epoch": 1.323505021652999, "grad_norm": 7.322912383496554, "learning_rate": 3.921374706391396e-05, "loss": 0.5189, "step": 3591 }, { "epoch": 1.3238735833410118, "grad_norm": 6.265776901116137, "learning_rate": 3.921065644702683e-05, "loss": 0.4319, "step": 3592 }, { "epoch": 1.3242421450290243, "grad_norm": 5.687806508954114, "learning_rate": 3.92075658301397e-05, "loss": 0.3459, "step": 3593 }, { "epoch": 1.3246107067170367, "grad_norm": 5.760690122693397, "learning_rate": 3.9204475213252564e-05, "loss": 0.2776, "step": 3594 }, { "epoch": 1.3249792684050492, "grad_norm": 6.203369443290571, "learning_rate": 3.9201384596365435e-05, "loss": 0.3686, "step": 3595 }, { "epoch": 1.325347830093062, "grad_norm": 8.264717614714767, "learning_rate": 3.919829397947831e-05, "loss": 0.4279, "step": 3596 }, { "epoch": 1.3257163917810744, "grad_norm": 7.218294996313783, "learning_rate": 3.919520336259118e-05, "loss": 0.3869, "step": 3597 }, { "epoch": 1.3260849534690868, "grad_norm": 8.578989899049205, "learning_rate": 3.919211274570405e-05, "loss": 0.5184, "step": 3598 }, { "epoch": 1.3264535151570995, "grad_norm": 8.371129750126743, "learning_rate": 3.9189022128816913e-05, "loss": 0.496, "step": 3599 }, { "epoch": 1.326822076845112, "grad_norm": 3.7739395218346217, "learning_rate": 3.9185931511929785e-05, "loss": 0.277, "step": 3600 }, { "epoch": 1.3271906385331245, "grad_norm": 5.766272035594277, "learning_rate": 3.9182840895042656e-05, "loss": 0.4023, "step": 3601 }, { "epoch": 1.327559200221137, "grad_norm": 6.9589671076652175, "learning_rate": 3.917975027815552e-05, "loss": 0.5079, "step": 3602 }, { "epoch": 1.3279277619091494, "grad_norm": 4.602164522176991, "learning_rate": 3.917665966126839e-05, "loss": 0.33, "step": 3603 }, { "epoch": 1.328296323597162, "grad_norm": 5.65499847223209, "learning_rate": 3.9173569044381256e-05, "loss": 0.287, "step": 3604 }, { "epoch": 1.3286648852851746, "grad_norm": 9.395500453513515, "learning_rate": 3.917047842749413e-05, "loss": 0.5448, "step": 3605 }, { "epoch": 1.3290334469731873, "grad_norm": 5.718077573171993, "learning_rate": 3.9167387810607e-05, "loss": 0.4931, "step": 3606 }, { "epoch": 1.3294020086611997, "grad_norm": 7.544809884216092, "learning_rate": 3.916429719371987e-05, "loss": 0.6475, "step": 3607 }, { "epoch": 1.3297705703492122, "grad_norm": 6.689415773162143, "learning_rate": 3.916120657683274e-05, "loss": 0.4342, "step": 3608 }, { "epoch": 1.3301391320372247, "grad_norm": 6.170957395691803, "learning_rate": 3.9158115959945605e-05, "loss": 0.3789, "step": 3609 }, { "epoch": 1.3305076937252371, "grad_norm": 9.184004553365076, "learning_rate": 3.9155025343058476e-05, "loss": 0.4953, "step": 3610 }, { "epoch": 1.3308762554132498, "grad_norm": 6.380161963054284, "learning_rate": 3.915193472617135e-05, "loss": 0.407, "step": 3611 }, { "epoch": 1.3312448171012623, "grad_norm": 6.379644459829865, "learning_rate": 3.914884410928422e-05, "loss": 0.5413, "step": 3612 }, { "epoch": 1.331613378789275, "grad_norm": 6.992044833129027, "learning_rate": 3.914575349239708e-05, "loss": 0.5125, "step": 3613 }, { "epoch": 1.3319819404772875, "grad_norm": 5.957397849887625, "learning_rate": 3.9142662875509954e-05, "loss": 0.4476, "step": 3614 }, { "epoch": 1.3323505021653, "grad_norm": 23.687603067058095, "learning_rate": 3.9139572258622825e-05, "loss": 0.4356, "step": 3615 }, { "epoch": 1.3327190638533124, "grad_norm": 4.53059540328332, "learning_rate": 3.91364816417357e-05, "loss": 0.3669, "step": 3616 }, { "epoch": 1.3330876255413249, "grad_norm": 8.943178307090935, "learning_rate": 3.913339102484856e-05, "loss": 0.4072, "step": 3617 }, { "epoch": 1.3334561872293376, "grad_norm": 5.195494776231484, "learning_rate": 3.9130300407961425e-05, "loss": 0.3336, "step": 3618 }, { "epoch": 1.33382474891735, "grad_norm": 5.566852880930553, "learning_rate": 3.91272097910743e-05, "loss": 0.511, "step": 3619 }, { "epoch": 1.3341933106053625, "grad_norm": 6.772953706951193, "learning_rate": 3.912411917418717e-05, "loss": 0.2417, "step": 3620 }, { "epoch": 1.3345618722933752, "grad_norm": 7.493148471592831, "learning_rate": 3.912102855730004e-05, "loss": 0.4668, "step": 3621 }, { "epoch": 1.3349304339813877, "grad_norm": 11.610556541068966, "learning_rate": 3.911793794041291e-05, "loss": 0.4237, "step": 3622 }, { "epoch": 1.3352989956694001, "grad_norm": 4.892235828844445, "learning_rate": 3.9114847323525775e-05, "loss": 0.4033, "step": 3623 }, { "epoch": 1.3356675573574126, "grad_norm": 6.669527806806977, "learning_rate": 3.9111756706638646e-05, "loss": 0.4231, "step": 3624 }, { "epoch": 1.3360361190454253, "grad_norm": 7.0306819431990935, "learning_rate": 3.910866608975152e-05, "loss": 0.34, "step": 3625 }, { "epoch": 1.3364046807334378, "grad_norm": 10.69014809801164, "learning_rate": 3.910557547286439e-05, "loss": 0.7336, "step": 3626 }, { "epoch": 1.3367732424214502, "grad_norm": 8.949197436956288, "learning_rate": 3.910248485597726e-05, "loss": 0.5168, "step": 3627 }, { "epoch": 1.337141804109463, "grad_norm": 5.41475383533638, "learning_rate": 3.9099394239090124e-05, "loss": 0.3155, "step": 3628 }, { "epoch": 1.3375103657974754, "grad_norm": 8.653459560529688, "learning_rate": 3.9096303622202995e-05, "loss": 0.4203, "step": 3629 }, { "epoch": 1.3378789274854879, "grad_norm": 8.106908757627176, "learning_rate": 3.9093213005315866e-05, "loss": 0.564, "step": 3630 }, { "epoch": 1.3382474891735003, "grad_norm": 8.310563629169364, "learning_rate": 3.909012238842874e-05, "loss": 0.3561, "step": 3631 }, { "epoch": 1.3386160508615128, "grad_norm": 7.368061745996858, "learning_rate": 3.90870317715416e-05, "loss": 0.259, "step": 3632 }, { "epoch": 1.3389846125495255, "grad_norm": 7.220965949453304, "learning_rate": 3.9083941154654466e-05, "loss": 0.4042, "step": 3633 }, { "epoch": 1.339353174237538, "grad_norm": 5.166270927942448, "learning_rate": 3.908085053776734e-05, "loss": 0.3929, "step": 3634 }, { "epoch": 1.3397217359255507, "grad_norm": 8.335894941169474, "learning_rate": 3.907775992088021e-05, "loss": 0.3007, "step": 3635 }, { "epoch": 1.3400902976135631, "grad_norm": 7.3887335713952025, "learning_rate": 3.907466930399308e-05, "loss": 0.3371, "step": 3636 }, { "epoch": 1.3404588593015756, "grad_norm": 5.769196683342816, "learning_rate": 3.9071578687105944e-05, "loss": 0.3637, "step": 3637 }, { "epoch": 1.340827420989588, "grad_norm": 30.17803617181708, "learning_rate": 3.9068488070218815e-05, "loss": 0.579, "step": 3638 }, { "epoch": 1.3411959826776005, "grad_norm": 4.690177864654987, "learning_rate": 3.9065397453331687e-05, "loss": 0.2651, "step": 3639 }, { "epoch": 1.3415645443656132, "grad_norm": 6.294043119512544, "learning_rate": 3.906230683644456e-05, "loss": 0.4218, "step": 3640 }, { "epoch": 1.3419331060536257, "grad_norm": 8.736852741416403, "learning_rate": 3.905921621955743e-05, "loss": 0.8479, "step": 3641 }, { "epoch": 1.3423016677416384, "grad_norm": 6.247679622978976, "learning_rate": 3.9056125602670293e-05, "loss": 0.4433, "step": 3642 }, { "epoch": 1.3426702294296509, "grad_norm": 7.92937335815557, "learning_rate": 3.9053034985783165e-05, "loss": 0.4683, "step": 3643 }, { "epoch": 1.3430387911176633, "grad_norm": 6.003481490647316, "learning_rate": 3.9049944368896036e-05, "loss": 0.3359, "step": 3644 }, { "epoch": 1.3434073528056758, "grad_norm": 8.617555363153272, "learning_rate": 3.904685375200891e-05, "loss": 0.5326, "step": 3645 }, { "epoch": 1.3437759144936883, "grad_norm": 8.024494818357336, "learning_rate": 3.904376313512177e-05, "loss": 0.4165, "step": 3646 }, { "epoch": 1.344144476181701, "grad_norm": 7.817180726717283, "learning_rate": 3.9040672518234636e-05, "loss": 0.48, "step": 3647 }, { "epoch": 1.3445130378697134, "grad_norm": 6.762751085622903, "learning_rate": 3.903758190134751e-05, "loss": 0.3574, "step": 3648 }, { "epoch": 1.344881599557726, "grad_norm": 6.497204399474048, "learning_rate": 3.903449128446038e-05, "loss": 0.6025, "step": 3649 }, { "epoch": 1.3452501612457386, "grad_norm": 8.192131305089307, "learning_rate": 3.903140066757325e-05, "loss": 0.4038, "step": 3650 }, { "epoch": 1.345618722933751, "grad_norm": 7.51977221610661, "learning_rate": 3.902831005068612e-05, "loss": 0.4839, "step": 3651 }, { "epoch": 1.3459872846217635, "grad_norm": 5.686315392172493, "learning_rate": 3.9025219433798985e-05, "loss": 0.3844, "step": 3652 }, { "epoch": 1.346355846309776, "grad_norm": 4.103619508367949, "learning_rate": 3.9022128816911856e-05, "loss": 0.3412, "step": 3653 }, { "epoch": 1.3467244079977887, "grad_norm": 4.075536147343765, "learning_rate": 3.901903820002473e-05, "loss": 0.293, "step": 3654 }, { "epoch": 1.3470929696858012, "grad_norm": 4.365211217080631, "learning_rate": 3.90159475831376e-05, "loss": 0.234, "step": 3655 }, { "epoch": 1.3474615313738136, "grad_norm": 7.357755511167069, "learning_rate": 3.901285696625046e-05, "loss": 0.3026, "step": 3656 }, { "epoch": 1.3478300930618263, "grad_norm": 7.374860341964217, "learning_rate": 3.9009766349363334e-05, "loss": 0.4742, "step": 3657 }, { "epoch": 1.3481986547498388, "grad_norm": 4.343593457371996, "learning_rate": 3.9006675732476205e-05, "loss": 0.4076, "step": 3658 }, { "epoch": 1.3485672164378513, "grad_norm": 4.696000999203629, "learning_rate": 3.9003585115589077e-05, "loss": 0.39, "step": 3659 }, { "epoch": 1.3489357781258637, "grad_norm": 4.498834035077411, "learning_rate": 3.900049449870195e-05, "loss": 0.2668, "step": 3660 }, { "epoch": 1.3493043398138764, "grad_norm": 6.010094099022208, "learning_rate": 3.899740388181481e-05, "loss": 0.5664, "step": 3661 }, { "epoch": 1.349672901501889, "grad_norm": 4.6212027142690015, "learning_rate": 3.8994313264927677e-05, "loss": 0.3078, "step": 3662 }, { "epoch": 1.3500414631899014, "grad_norm": 6.551194106884196, "learning_rate": 3.899122264804055e-05, "loss": 0.3937, "step": 3663 }, { "epoch": 1.350410024877914, "grad_norm": 4.107423732465105, "learning_rate": 3.898813203115342e-05, "loss": 0.3756, "step": 3664 }, { "epoch": 1.3507785865659265, "grad_norm": 4.4596058583266185, "learning_rate": 3.898504141426629e-05, "loss": 0.2545, "step": 3665 }, { "epoch": 1.351147148253939, "grad_norm": 5.307805938312382, "learning_rate": 3.8981950797379155e-05, "loss": 0.3353, "step": 3666 }, { "epoch": 1.3515157099419515, "grad_norm": 8.055505838642677, "learning_rate": 3.8978860180492026e-05, "loss": 0.6271, "step": 3667 }, { "epoch": 1.351884271629964, "grad_norm": 8.601797545891662, "learning_rate": 3.89757695636049e-05, "loss": 0.4536, "step": 3668 }, { "epoch": 1.3522528333179766, "grad_norm": 6.482971480730939, "learning_rate": 3.897267894671777e-05, "loss": 0.5369, "step": 3669 }, { "epoch": 1.352621395005989, "grad_norm": 6.4250370603449936, "learning_rate": 3.896958832983064e-05, "loss": 0.3134, "step": 3670 }, { "epoch": 1.3529899566940018, "grad_norm": 8.23931562822328, "learning_rate": 3.8966497712943504e-05, "loss": 0.4324, "step": 3671 }, { "epoch": 1.3533585183820143, "grad_norm": 8.330995104496962, "learning_rate": 3.8963407096056375e-05, "loss": 0.5496, "step": 3672 }, { "epoch": 1.3537270800700267, "grad_norm": 8.252029631671968, "learning_rate": 3.8960316479169246e-05, "loss": 0.3453, "step": 3673 }, { "epoch": 1.3540956417580392, "grad_norm": 5.54512430421486, "learning_rate": 3.895722586228212e-05, "loss": 0.3004, "step": 3674 }, { "epoch": 1.3544642034460517, "grad_norm": 6.181805388799475, "learning_rate": 3.895413524539498e-05, "loss": 0.3922, "step": 3675 }, { "epoch": 1.3548327651340644, "grad_norm": 8.803737748243286, "learning_rate": 3.895104462850785e-05, "loss": 0.4342, "step": 3676 }, { "epoch": 1.3552013268220768, "grad_norm": 9.756385472864281, "learning_rate": 3.894795401162072e-05, "loss": 0.3117, "step": 3677 }, { "epoch": 1.3555698885100893, "grad_norm": 6.574963169937544, "learning_rate": 3.894486339473359e-05, "loss": 0.5581, "step": 3678 }, { "epoch": 1.355938450198102, "grad_norm": 7.90154961948352, "learning_rate": 3.894177277784646e-05, "loss": 0.4913, "step": 3679 }, { "epoch": 1.3563070118861145, "grad_norm": 7.1416720033831185, "learning_rate": 3.893868216095933e-05, "loss": 0.4989, "step": 3680 }, { "epoch": 1.356675573574127, "grad_norm": 5.348429811312214, "learning_rate": 3.8935591544072195e-05, "loss": 0.354, "step": 3681 }, { "epoch": 1.3570441352621394, "grad_norm": 5.54134572564931, "learning_rate": 3.8932500927185067e-05, "loss": 0.3677, "step": 3682 }, { "epoch": 1.357412696950152, "grad_norm": 5.642992869947067, "learning_rate": 3.892941031029794e-05, "loss": 0.2914, "step": 3683 }, { "epoch": 1.3577812586381646, "grad_norm": 6.062222503918156, "learning_rate": 3.892631969341081e-05, "loss": 0.4998, "step": 3684 }, { "epoch": 1.358149820326177, "grad_norm": 9.003319869734598, "learning_rate": 3.892322907652367e-05, "loss": 0.4897, "step": 3685 }, { "epoch": 1.3585183820141897, "grad_norm": 7.938684810610432, "learning_rate": 3.8920138459636545e-05, "loss": 0.4106, "step": 3686 }, { "epoch": 1.3588869437022022, "grad_norm": 9.188050960706681, "learning_rate": 3.8917047842749416e-05, "loss": 0.4915, "step": 3687 }, { "epoch": 1.3592555053902147, "grad_norm": 12.190281007827698, "learning_rate": 3.891395722586229e-05, "loss": 0.5295, "step": 3688 }, { "epoch": 1.3596240670782271, "grad_norm": 8.778147302030899, "learning_rate": 3.891086660897516e-05, "loss": 0.511, "step": 3689 }, { "epoch": 1.3599926287662398, "grad_norm": 6.6515351552761866, "learning_rate": 3.890777599208802e-05, "loss": 0.4555, "step": 3690 }, { "epoch": 1.3603611904542523, "grad_norm": 5.803246464277257, "learning_rate": 3.8904685375200894e-05, "loss": 0.3009, "step": 3691 }, { "epoch": 1.3607297521422648, "grad_norm": 6.480239174278938, "learning_rate": 3.890159475831376e-05, "loss": 0.3647, "step": 3692 }, { "epoch": 1.3610983138302775, "grad_norm": 5.8010863766551575, "learning_rate": 3.889850414142663e-05, "loss": 0.4725, "step": 3693 }, { "epoch": 1.36146687551829, "grad_norm": 5.281996093055237, "learning_rate": 3.88954135245395e-05, "loss": 0.2553, "step": 3694 }, { "epoch": 1.3618354372063024, "grad_norm": 4.61107477431681, "learning_rate": 3.8892322907652365e-05, "loss": 0.2978, "step": 3695 }, { "epoch": 1.3622039988943149, "grad_norm": 4.850235860536272, "learning_rate": 3.8889232290765236e-05, "loss": 0.3387, "step": 3696 }, { "epoch": 1.3625725605823273, "grad_norm": 11.145569034663296, "learning_rate": 3.888614167387811e-05, "loss": 0.3759, "step": 3697 }, { "epoch": 1.36294112227034, "grad_norm": 6.966091315986049, "learning_rate": 3.888305105699098e-05, "loss": 0.4369, "step": 3698 }, { "epoch": 1.3633096839583525, "grad_norm": 6.050506999312575, "learning_rate": 3.887996044010385e-05, "loss": 0.3485, "step": 3699 }, { "epoch": 1.3636782456463652, "grad_norm": 7.432716690738666, "learning_rate": 3.8876869823216714e-05, "loss": 0.5941, "step": 3700 }, { "epoch": 1.3640468073343777, "grad_norm": 7.288479208223151, "learning_rate": 3.8873779206329585e-05, "loss": 0.4647, "step": 3701 }, { "epoch": 1.3644153690223901, "grad_norm": 5.667654437572078, "learning_rate": 3.8870688589442457e-05, "loss": 0.2606, "step": 3702 }, { "epoch": 1.3647839307104026, "grad_norm": 7.510532358859561, "learning_rate": 3.886759797255533e-05, "loss": 0.4721, "step": 3703 }, { "epoch": 1.365152492398415, "grad_norm": 6.182655364481917, "learning_rate": 3.886450735566819e-05, "loss": 0.4671, "step": 3704 }, { "epoch": 1.3655210540864278, "grad_norm": 7.091696555421677, "learning_rate": 3.886141673878106e-05, "loss": 0.632, "step": 3705 }, { "epoch": 1.3658896157744402, "grad_norm": 6.544322387961807, "learning_rate": 3.8858326121893935e-05, "loss": 0.4109, "step": 3706 }, { "epoch": 1.3662581774624527, "grad_norm": 8.702584104320591, "learning_rate": 3.8855235505006806e-05, "loss": 0.6989, "step": 3707 }, { "epoch": 1.3666267391504654, "grad_norm": 6.551686706020646, "learning_rate": 3.885214488811967e-05, "loss": 0.3642, "step": 3708 }, { "epoch": 1.3669953008384779, "grad_norm": 8.131758945027764, "learning_rate": 3.8849054271232535e-05, "loss": 0.361, "step": 3709 }, { "epoch": 1.3673638625264903, "grad_norm": 6.9468098625804995, "learning_rate": 3.8845963654345406e-05, "loss": 0.5418, "step": 3710 }, { "epoch": 1.3677324242145028, "grad_norm": 4.268336960566205, "learning_rate": 3.884287303745828e-05, "loss": 0.275, "step": 3711 }, { "epoch": 1.3681009859025155, "grad_norm": 5.867128658888534, "learning_rate": 3.883978242057115e-05, "loss": 0.491, "step": 3712 }, { "epoch": 1.368469547590528, "grad_norm": 9.19650719724104, "learning_rate": 3.883669180368402e-05, "loss": 0.4662, "step": 3713 }, { "epoch": 1.3688381092785404, "grad_norm": 6.213876319265842, "learning_rate": 3.8833601186796884e-05, "loss": 0.4866, "step": 3714 }, { "epoch": 1.3692066709665531, "grad_norm": 9.69992744969244, "learning_rate": 3.8830510569909755e-05, "loss": 0.521, "step": 3715 }, { "epoch": 1.3695752326545656, "grad_norm": 5.151624419673561, "learning_rate": 3.8827419953022626e-05, "loss": 0.4332, "step": 3716 }, { "epoch": 1.369943794342578, "grad_norm": 37.382739844495745, "learning_rate": 3.88243293361355e-05, "loss": 0.3994, "step": 3717 }, { "epoch": 1.3703123560305905, "grad_norm": 6.7785416921284645, "learning_rate": 3.882123871924836e-05, "loss": 0.4628, "step": 3718 }, { "epoch": 1.3706809177186032, "grad_norm": 5.393793827731933, "learning_rate": 3.881814810236123e-05, "loss": 0.3762, "step": 3719 }, { "epoch": 1.3710494794066157, "grad_norm": 8.34989468628034, "learning_rate": 3.8815057485474104e-05, "loss": 0.7304, "step": 3720 }, { "epoch": 1.3714180410946282, "grad_norm": 8.019077442408431, "learning_rate": 3.8811966868586975e-05, "loss": 0.5898, "step": 3721 }, { "epoch": 1.3717866027826409, "grad_norm": 7.513796006821646, "learning_rate": 3.8808876251699846e-05, "loss": 0.3562, "step": 3722 }, { "epoch": 1.3721551644706533, "grad_norm": 7.23083251169987, "learning_rate": 3.880578563481271e-05, "loss": 0.6122, "step": 3723 }, { "epoch": 1.3725237261586658, "grad_norm": 5.516195554103687, "learning_rate": 3.8802695017925575e-05, "loss": 0.3576, "step": 3724 }, { "epoch": 1.3728922878466783, "grad_norm": 7.320923737988356, "learning_rate": 3.8799604401038446e-05, "loss": 0.6115, "step": 3725 }, { "epoch": 1.3732608495346907, "grad_norm": 6.153925887838414, "learning_rate": 3.879651378415132e-05, "loss": 0.4791, "step": 3726 }, { "epoch": 1.3736294112227034, "grad_norm": 5.400586287142633, "learning_rate": 3.879342316726419e-05, "loss": 0.5051, "step": 3727 }, { "epoch": 1.373997972910716, "grad_norm": 6.489185433310611, "learning_rate": 3.879033255037705e-05, "loss": 0.3298, "step": 3728 }, { "epoch": 1.3743665345987286, "grad_norm": 12.665549245702394, "learning_rate": 3.8787241933489924e-05, "loss": 0.5284, "step": 3729 }, { "epoch": 1.374735096286741, "grad_norm": 5.17037582566189, "learning_rate": 3.8784151316602796e-05, "loss": 0.3253, "step": 3730 }, { "epoch": 1.3751036579747535, "grad_norm": 5.327920232960424, "learning_rate": 3.878106069971567e-05, "loss": 0.316, "step": 3731 }, { "epoch": 1.375472219662766, "grad_norm": 6.913128357622535, "learning_rate": 3.877797008282854e-05, "loss": 0.385, "step": 3732 }, { "epoch": 1.3758407813507785, "grad_norm": 5.394217623599548, "learning_rate": 3.87748794659414e-05, "loss": 0.4925, "step": 3733 }, { "epoch": 1.3762093430387912, "grad_norm": 5.276795444198316, "learning_rate": 3.8771788849054274e-05, "loss": 0.4492, "step": 3734 }, { "epoch": 1.3765779047268036, "grad_norm": 8.373084005705518, "learning_rate": 3.8768698232167145e-05, "loss": 0.5739, "step": 3735 }, { "epoch": 1.376946466414816, "grad_norm": 6.880450498336471, "learning_rate": 3.8765607615280016e-05, "loss": 0.3978, "step": 3736 }, { "epoch": 1.3773150281028288, "grad_norm": 4.595894864253477, "learning_rate": 3.876251699839288e-05, "loss": 0.3934, "step": 3737 }, { "epoch": 1.3776835897908413, "grad_norm": 5.763495488765775, "learning_rate": 3.8759426381505745e-05, "loss": 0.3084, "step": 3738 }, { "epoch": 1.3780521514788537, "grad_norm": 6.9793664097077635, "learning_rate": 3.8756335764618616e-05, "loss": 0.5459, "step": 3739 }, { "epoch": 1.3784207131668662, "grad_norm": 6.544884281170008, "learning_rate": 3.875324514773149e-05, "loss": 0.6255, "step": 3740 }, { "epoch": 1.378789274854879, "grad_norm": 9.17862069931504, "learning_rate": 3.875015453084436e-05, "loss": 0.6298, "step": 3741 }, { "epoch": 1.3791578365428914, "grad_norm": 5.754805505616215, "learning_rate": 3.874706391395723e-05, "loss": 0.3737, "step": 3742 }, { "epoch": 1.3795263982309038, "grad_norm": 8.148306416730872, "learning_rate": 3.8743973297070094e-05, "loss": 0.3959, "step": 3743 }, { "epoch": 1.3798949599189165, "grad_norm": 6.0580270528003535, "learning_rate": 3.8740882680182965e-05, "loss": 0.4241, "step": 3744 }, { "epoch": 1.380263521606929, "grad_norm": 8.742855970309595, "learning_rate": 3.8737792063295836e-05, "loss": 0.6823, "step": 3745 }, { "epoch": 1.3806320832949415, "grad_norm": 8.828765359795236, "learning_rate": 3.873470144640871e-05, "loss": 0.3847, "step": 3746 }, { "epoch": 1.381000644982954, "grad_norm": 7.651756992403822, "learning_rate": 3.873161082952157e-05, "loss": 0.453, "step": 3747 }, { "epoch": 1.3813692066709666, "grad_norm": 3.7811877032736283, "learning_rate": 3.872852021263444e-05, "loss": 0.2271, "step": 3748 }, { "epoch": 1.381737768358979, "grad_norm": 5.990910957368762, "learning_rate": 3.8725429595747314e-05, "loss": 0.2676, "step": 3749 }, { "epoch": 1.3821063300469916, "grad_norm": 5.782075400326304, "learning_rate": 3.8722338978860186e-05, "loss": 0.355, "step": 3750 }, { "epoch": 1.3824748917350043, "grad_norm": 6.800455347410856, "learning_rate": 3.871924836197306e-05, "loss": 0.5349, "step": 3751 }, { "epoch": 1.3828434534230167, "grad_norm": 7.835307041717528, "learning_rate": 3.871615774508592e-05, "loss": 0.4674, "step": 3752 }, { "epoch": 1.3832120151110292, "grad_norm": 12.57118160451092, "learning_rate": 3.8713067128198786e-05, "loss": 0.439, "step": 3753 }, { "epoch": 1.3835805767990417, "grad_norm": 5.317505038339284, "learning_rate": 3.870997651131166e-05, "loss": 0.2865, "step": 3754 }, { "epoch": 1.3839491384870541, "grad_norm": 5.308533388177796, "learning_rate": 3.870688589442453e-05, "loss": 0.3038, "step": 3755 }, { "epoch": 1.3843177001750668, "grad_norm": 33.155824764591706, "learning_rate": 3.87037952775374e-05, "loss": 0.4688, "step": 3756 }, { "epoch": 1.3846862618630793, "grad_norm": 3.9142206278094425, "learning_rate": 3.8700704660650264e-05, "loss": 0.2472, "step": 3757 }, { "epoch": 1.385054823551092, "grad_norm": 6.5570999452793775, "learning_rate": 3.8697614043763135e-05, "loss": 0.3807, "step": 3758 }, { "epoch": 1.3854233852391045, "grad_norm": 5.87984704889161, "learning_rate": 3.8694523426876006e-05, "loss": 0.3645, "step": 3759 }, { "epoch": 1.385791946927117, "grad_norm": 6.739579811468683, "learning_rate": 3.869143280998888e-05, "loss": 0.469, "step": 3760 }, { "epoch": 1.3861605086151294, "grad_norm": 5.965085807240675, "learning_rate": 3.868834219310175e-05, "loss": 0.336, "step": 3761 }, { "epoch": 1.3865290703031419, "grad_norm": 4.526057563405567, "learning_rate": 3.868525157621461e-05, "loss": 0.2116, "step": 3762 }, { "epoch": 1.3868976319911546, "grad_norm": 8.137495087184098, "learning_rate": 3.8682160959327484e-05, "loss": 0.3927, "step": 3763 }, { "epoch": 1.387266193679167, "grad_norm": 5.269479127800044, "learning_rate": 3.8679070342440355e-05, "loss": 0.3472, "step": 3764 }, { "epoch": 1.3876347553671795, "grad_norm": 7.411016934069866, "learning_rate": 3.8675979725553226e-05, "loss": 0.4956, "step": 3765 }, { "epoch": 1.3880033170551922, "grad_norm": 7.543490644166851, "learning_rate": 3.867288910866609e-05, "loss": 0.4118, "step": 3766 }, { "epoch": 1.3883718787432047, "grad_norm": 4.261464755621396, "learning_rate": 3.866979849177896e-05, "loss": 0.2441, "step": 3767 }, { "epoch": 1.3887404404312171, "grad_norm": 8.092793647498748, "learning_rate": 3.8666707874891826e-05, "loss": 0.3352, "step": 3768 }, { "epoch": 1.3891090021192296, "grad_norm": 8.11690646319314, "learning_rate": 3.86636172580047e-05, "loss": 0.4377, "step": 3769 }, { "epoch": 1.3894775638072423, "grad_norm": 6.888785118770773, "learning_rate": 3.866052664111757e-05, "loss": 0.2914, "step": 3770 }, { "epoch": 1.3898461254952548, "grad_norm": 7.882489956777691, "learning_rate": 3.865743602423044e-05, "loss": 0.492, "step": 3771 }, { "epoch": 1.3902146871832672, "grad_norm": 5.22856195405791, "learning_rate": 3.8654345407343304e-05, "loss": 0.45, "step": 3772 }, { "epoch": 1.39058324887128, "grad_norm": 7.020189325545566, "learning_rate": 3.8651254790456176e-05, "loss": 0.5064, "step": 3773 }, { "epoch": 1.3909518105592924, "grad_norm": 10.200051281370017, "learning_rate": 3.864816417356905e-05, "loss": 0.5079, "step": 3774 }, { "epoch": 1.3913203722473049, "grad_norm": 6.69902571241561, "learning_rate": 3.864507355668192e-05, "loss": 0.4179, "step": 3775 }, { "epoch": 1.3916889339353173, "grad_norm": 7.176465395904399, "learning_rate": 3.864198293979478e-05, "loss": 0.3844, "step": 3776 }, { "epoch": 1.39205749562333, "grad_norm": 9.266797871198255, "learning_rate": 3.8638892322907654e-05, "loss": 0.5261, "step": 3777 }, { "epoch": 1.3924260573113425, "grad_norm": 8.136045489428797, "learning_rate": 3.8635801706020525e-05, "loss": 0.3105, "step": 3778 }, { "epoch": 1.392794618999355, "grad_norm": 13.084138752308018, "learning_rate": 3.8632711089133396e-05, "loss": 0.6013, "step": 3779 }, { "epoch": 1.3931631806873677, "grad_norm": 5.555564414123254, "learning_rate": 3.862962047224627e-05, "loss": 0.4463, "step": 3780 }, { "epoch": 1.3935317423753801, "grad_norm": 8.727808249544688, "learning_rate": 3.862652985535913e-05, "loss": 0.3391, "step": 3781 }, { "epoch": 1.3939003040633926, "grad_norm": 8.78590568307417, "learning_rate": 3.8623439238472e-05, "loss": 0.5001, "step": 3782 }, { "epoch": 1.394268865751405, "grad_norm": 4.891593666550076, "learning_rate": 3.862034862158487e-05, "loss": 0.3729, "step": 3783 }, { "epoch": 1.3946374274394175, "grad_norm": 10.315727329988889, "learning_rate": 3.861725800469774e-05, "loss": 0.5209, "step": 3784 }, { "epoch": 1.3950059891274302, "grad_norm": 5.495405879351278, "learning_rate": 3.861416738781061e-05, "loss": 0.436, "step": 3785 }, { "epoch": 1.3953745508154427, "grad_norm": 7.279901862688198, "learning_rate": 3.8611076770923474e-05, "loss": 0.4878, "step": 3786 }, { "epoch": 1.3957431125034554, "grad_norm": 8.592165603235191, "learning_rate": 3.8607986154036345e-05, "loss": 0.4611, "step": 3787 }, { "epoch": 1.3961116741914679, "grad_norm": 7.840061734014858, "learning_rate": 3.8604895537149216e-05, "loss": 0.6193, "step": 3788 }, { "epoch": 1.3964802358794803, "grad_norm": 7.275240141061147, "learning_rate": 3.860180492026209e-05, "loss": 0.5342, "step": 3789 }, { "epoch": 1.3968487975674928, "grad_norm": 10.185995728065468, "learning_rate": 3.859871430337496e-05, "loss": 0.4269, "step": 3790 }, { "epoch": 1.3972173592555053, "grad_norm": 6.129941542877208, "learning_rate": 3.859562368648782e-05, "loss": 0.6385, "step": 3791 }, { "epoch": 1.397585920943518, "grad_norm": 6.037753536990186, "learning_rate": 3.8592533069600694e-05, "loss": 0.4027, "step": 3792 }, { "epoch": 1.3979544826315304, "grad_norm": 5.343804967050997, "learning_rate": 3.8589442452713566e-05, "loss": 0.4326, "step": 3793 }, { "epoch": 1.398323044319543, "grad_norm": 4.583703505855264, "learning_rate": 3.858635183582644e-05, "loss": 0.2394, "step": 3794 }, { "epoch": 1.3986916060075556, "grad_norm": 6.06576858971593, "learning_rate": 3.85832612189393e-05, "loss": 0.4578, "step": 3795 }, { "epoch": 1.399060167695568, "grad_norm": 5.330007270965484, "learning_rate": 3.858017060205217e-05, "loss": 0.3155, "step": 3796 }, { "epoch": 1.3994287293835805, "grad_norm": 7.0699358929394425, "learning_rate": 3.8577079985165044e-05, "loss": 0.2724, "step": 3797 }, { "epoch": 1.399797291071593, "grad_norm": 6.808023924627526, "learning_rate": 3.857398936827791e-05, "loss": 0.532, "step": 3798 }, { "epoch": 1.4001658527596057, "grad_norm": 8.499557483598318, "learning_rate": 3.857089875139078e-05, "loss": 0.5309, "step": 3799 }, { "epoch": 1.4005344144476182, "grad_norm": 5.6178703052275445, "learning_rate": 3.8567808134503644e-05, "loss": 0.2794, "step": 3800 }, { "epoch": 1.4009029761356306, "grad_norm": 4.453683061934483, "learning_rate": 3.8564717517616515e-05, "loss": 0.2223, "step": 3801 }, { "epoch": 1.4012715378236433, "grad_norm": 6.363002482959168, "learning_rate": 3.8561626900729386e-05, "loss": 0.4747, "step": 3802 }, { "epoch": 1.4016400995116558, "grad_norm": 3.349027230526516, "learning_rate": 3.855853628384226e-05, "loss": 0.1871, "step": 3803 }, { "epoch": 1.4020086611996683, "grad_norm": 6.054115711408161, "learning_rate": 3.855544566695513e-05, "loss": 0.6091, "step": 3804 }, { "epoch": 1.4023772228876807, "grad_norm": 6.6275592395231495, "learning_rate": 3.855235505006799e-05, "loss": 0.4204, "step": 3805 }, { "epoch": 1.4027457845756934, "grad_norm": 6.085543226758487, "learning_rate": 3.8549264433180864e-05, "loss": 0.6544, "step": 3806 }, { "epoch": 1.403114346263706, "grad_norm": 5.566919863913997, "learning_rate": 3.8546173816293735e-05, "loss": 0.3401, "step": 3807 }, { "epoch": 1.4034829079517184, "grad_norm": 9.6779308680229, "learning_rate": 3.8543083199406606e-05, "loss": 0.6131, "step": 3808 }, { "epoch": 1.403851469639731, "grad_norm": 5.585779403903476, "learning_rate": 3.853999258251947e-05, "loss": 0.3201, "step": 3809 }, { "epoch": 1.4042200313277435, "grad_norm": 10.11003155470394, "learning_rate": 3.853690196563234e-05, "loss": 0.4171, "step": 3810 }, { "epoch": 1.404588593015756, "grad_norm": 5.64897117362397, "learning_rate": 3.853381134874521e-05, "loss": 0.4679, "step": 3811 }, { "epoch": 1.4049571547037685, "grad_norm": 5.852156360678021, "learning_rate": 3.8530720731858084e-05, "loss": 0.5275, "step": 3812 }, { "epoch": 1.405325716391781, "grad_norm": 6.806389824558072, "learning_rate": 3.852763011497095e-05, "loss": 0.3792, "step": 3813 }, { "epoch": 1.4056942780797936, "grad_norm": 8.977633765086408, "learning_rate": 3.852453949808382e-05, "loss": 0.4756, "step": 3814 }, { "epoch": 1.406062839767806, "grad_norm": 6.145475456895796, "learning_rate": 3.8521448881196684e-05, "loss": 0.4533, "step": 3815 }, { "epoch": 1.4064314014558188, "grad_norm": 6.615715574744577, "learning_rate": 3.8518358264309556e-05, "loss": 0.6143, "step": 3816 }, { "epoch": 1.4067999631438313, "grad_norm": 7.08063440702576, "learning_rate": 3.851526764742243e-05, "loss": 0.6369, "step": 3817 }, { "epoch": 1.4071685248318437, "grad_norm": 7.502039822705878, "learning_rate": 3.85121770305353e-05, "loss": 0.5156, "step": 3818 }, { "epoch": 1.4075370865198562, "grad_norm": 4.781493336587643, "learning_rate": 3.850908641364816e-05, "loss": 0.3726, "step": 3819 }, { "epoch": 1.4079056482078687, "grad_norm": 5.703412944892795, "learning_rate": 3.8505995796761034e-05, "loss": 0.4219, "step": 3820 }, { "epoch": 1.4082742098958814, "grad_norm": 4.700955622492988, "learning_rate": 3.8502905179873905e-05, "loss": 0.3379, "step": 3821 }, { "epoch": 1.4086427715838938, "grad_norm": 5.982564069482163, "learning_rate": 3.8499814562986776e-05, "loss": 0.3813, "step": 3822 }, { "epoch": 1.4090113332719063, "grad_norm": 5.3203297081219505, "learning_rate": 3.849672394609965e-05, "loss": 0.3125, "step": 3823 }, { "epoch": 1.409379894959919, "grad_norm": 5.000328053202954, "learning_rate": 3.849363332921251e-05, "loss": 0.3836, "step": 3824 }, { "epoch": 1.4097484566479315, "grad_norm": 6.650720152904202, "learning_rate": 3.849054271232538e-05, "loss": 0.635, "step": 3825 }, { "epoch": 1.410117018335944, "grad_norm": 4.495510404842683, "learning_rate": 3.8487452095438254e-05, "loss": 0.3348, "step": 3826 }, { "epoch": 1.4104855800239564, "grad_norm": 7.360549994941095, "learning_rate": 3.8484361478551125e-05, "loss": 0.4402, "step": 3827 }, { "epoch": 1.410854141711969, "grad_norm": 4.467526568486943, "learning_rate": 3.848127086166399e-05, "loss": 0.3209, "step": 3828 }, { "epoch": 1.4112227033999816, "grad_norm": 7.367372868492434, "learning_rate": 3.8478180244776854e-05, "loss": 0.6408, "step": 3829 }, { "epoch": 1.411591265087994, "grad_norm": 7.59327627203548, "learning_rate": 3.8475089627889725e-05, "loss": 0.4937, "step": 3830 }, { "epoch": 1.4119598267760067, "grad_norm": 5.0227403409121765, "learning_rate": 3.8471999011002596e-05, "loss": 0.3507, "step": 3831 }, { "epoch": 1.4123283884640192, "grad_norm": 8.105679120010983, "learning_rate": 3.846890839411547e-05, "loss": 0.4754, "step": 3832 }, { "epoch": 1.4126969501520317, "grad_norm": 7.600745475748374, "learning_rate": 3.846581777722834e-05, "loss": 0.3831, "step": 3833 }, { "epoch": 1.4130655118400441, "grad_norm": 7.795486357113667, "learning_rate": 3.84627271603412e-05, "loss": 0.5223, "step": 3834 }, { "epoch": 1.4134340735280568, "grad_norm": 6.954497538337488, "learning_rate": 3.8459636543454074e-05, "loss": 0.5418, "step": 3835 }, { "epoch": 1.4138026352160693, "grad_norm": 7.519871897722347, "learning_rate": 3.8456545926566946e-05, "loss": 0.4469, "step": 3836 }, { "epoch": 1.4141711969040818, "grad_norm": 6.916355110719672, "learning_rate": 3.845345530967982e-05, "loss": 0.4527, "step": 3837 }, { "epoch": 1.4145397585920945, "grad_norm": 8.021789916493418, "learning_rate": 3.845036469279268e-05, "loss": 0.5238, "step": 3838 }, { "epoch": 1.414908320280107, "grad_norm": 5.8398323186551995, "learning_rate": 3.844727407590555e-05, "loss": 0.3984, "step": 3839 }, { "epoch": 1.4152768819681194, "grad_norm": 8.423195701755844, "learning_rate": 3.8444183459018424e-05, "loss": 0.5526, "step": 3840 }, { "epoch": 1.4156454436561319, "grad_norm": 6.130522437679086, "learning_rate": 3.8441092842131295e-05, "loss": 0.3948, "step": 3841 }, { "epoch": 1.4160140053441443, "grad_norm": 7.268512111192144, "learning_rate": 3.8438002225244166e-05, "loss": 0.4938, "step": 3842 }, { "epoch": 1.416382567032157, "grad_norm": 7.132050890336085, "learning_rate": 3.843491160835703e-05, "loss": 0.3963, "step": 3843 }, { "epoch": 1.4167511287201695, "grad_norm": 5.560561281593561, "learning_rate": 3.8431820991469895e-05, "loss": 0.3132, "step": 3844 }, { "epoch": 1.4171196904081822, "grad_norm": 7.762372385348962, "learning_rate": 3.8428730374582766e-05, "loss": 0.5349, "step": 3845 }, { "epoch": 1.4174882520961947, "grad_norm": 7.403053334228627, "learning_rate": 3.842563975769564e-05, "loss": 0.6959, "step": 3846 }, { "epoch": 1.4178568137842071, "grad_norm": 8.007136499668055, "learning_rate": 3.842254914080851e-05, "loss": 0.5386, "step": 3847 }, { "epoch": 1.4182253754722196, "grad_norm": 5.608975962097163, "learning_rate": 3.841945852392137e-05, "loss": 0.4093, "step": 3848 }, { "epoch": 1.418593937160232, "grad_norm": 7.977930144139244, "learning_rate": 3.8416367907034244e-05, "loss": 0.5891, "step": 3849 }, { "epoch": 1.4189624988482448, "grad_norm": 5.624926079158382, "learning_rate": 3.8413277290147115e-05, "loss": 0.4128, "step": 3850 }, { "epoch": 1.4193310605362572, "grad_norm": 7.4391041796774, "learning_rate": 3.8410186673259986e-05, "loss": 0.4535, "step": 3851 }, { "epoch": 1.4196996222242697, "grad_norm": 8.174048169454503, "learning_rate": 3.840709605637286e-05, "loss": 0.5507, "step": 3852 }, { "epoch": 1.4200681839122824, "grad_norm": 5.6101117938593354, "learning_rate": 3.840400543948572e-05, "loss": 0.3027, "step": 3853 }, { "epoch": 1.4204367456002949, "grad_norm": 5.650343703896956, "learning_rate": 3.840091482259859e-05, "loss": 0.2968, "step": 3854 }, { "epoch": 1.4208053072883073, "grad_norm": 4.603751369520652, "learning_rate": 3.8397824205711464e-05, "loss": 0.3106, "step": 3855 }, { "epoch": 1.4211738689763198, "grad_norm": 8.09748100389241, "learning_rate": 3.8394733588824335e-05, "loss": 0.4359, "step": 3856 }, { "epoch": 1.4215424306643325, "grad_norm": 7.4490745942560554, "learning_rate": 3.83916429719372e-05, "loss": 0.6745, "step": 3857 }, { "epoch": 1.421910992352345, "grad_norm": 6.258669524257307, "learning_rate": 3.838855235505007e-05, "loss": 0.3645, "step": 3858 }, { "epoch": 1.4222795540403574, "grad_norm": 6.625836013731353, "learning_rate": 3.8385461738162936e-05, "loss": 0.4862, "step": 3859 }, { "epoch": 1.4226481157283701, "grad_norm": 12.58374898531398, "learning_rate": 3.838237112127581e-05, "loss": 0.3957, "step": 3860 }, { "epoch": 1.4230166774163826, "grad_norm": 11.103615428036512, "learning_rate": 3.837928050438868e-05, "loss": 0.3643, "step": 3861 }, { "epoch": 1.423385239104395, "grad_norm": 5.820500927633633, "learning_rate": 3.837618988750155e-05, "loss": 0.4303, "step": 3862 }, { "epoch": 1.4237538007924075, "grad_norm": 7.904567856193283, "learning_rate": 3.8373099270614414e-05, "loss": 0.6005, "step": 3863 }, { "epoch": 1.4241223624804202, "grad_norm": 5.407701242399656, "learning_rate": 3.8370008653727285e-05, "loss": 0.316, "step": 3864 }, { "epoch": 1.4244909241684327, "grad_norm": 3.6542091868821824, "learning_rate": 3.8366918036840156e-05, "loss": 0.1966, "step": 3865 }, { "epoch": 1.4248594858564452, "grad_norm": 5.939287739702503, "learning_rate": 3.836382741995303e-05, "loss": 0.4056, "step": 3866 }, { "epoch": 1.4252280475444579, "grad_norm": 8.332304115316314, "learning_rate": 3.836073680306589e-05, "loss": 0.286, "step": 3867 }, { "epoch": 1.4255966092324703, "grad_norm": 6.612302019295909, "learning_rate": 3.835764618617876e-05, "loss": 0.5233, "step": 3868 }, { "epoch": 1.4259651709204828, "grad_norm": 8.482993672084461, "learning_rate": 3.8354555569291634e-05, "loss": 0.4028, "step": 3869 }, { "epoch": 1.4263337326084953, "grad_norm": 5.503693467542317, "learning_rate": 3.8351464952404505e-05, "loss": 0.3303, "step": 3870 }, { "epoch": 1.4267022942965077, "grad_norm": 7.3887129199395245, "learning_rate": 3.8348374335517376e-05, "loss": 0.4069, "step": 3871 }, { "epoch": 1.4270708559845204, "grad_norm": 6.717990566182306, "learning_rate": 3.834528371863024e-05, "loss": 0.3154, "step": 3872 }, { "epoch": 1.427439417672533, "grad_norm": 6.173237092577271, "learning_rate": 3.834219310174311e-05, "loss": 0.5382, "step": 3873 }, { "epoch": 1.4278079793605456, "grad_norm": 3.971634184757476, "learning_rate": 3.8339102484855976e-05, "loss": 0.3828, "step": 3874 }, { "epoch": 1.428176541048558, "grad_norm": 6.127120604580603, "learning_rate": 3.833601186796885e-05, "loss": 0.3407, "step": 3875 }, { "epoch": 1.4285451027365705, "grad_norm": 7.11554827273905, "learning_rate": 3.833292125108172e-05, "loss": 0.4754, "step": 3876 }, { "epoch": 1.428913664424583, "grad_norm": 5.6332228851024695, "learning_rate": 3.832983063419458e-05, "loss": 0.4567, "step": 3877 }, { "epoch": 1.4292822261125955, "grad_norm": 5.264834200285336, "learning_rate": 3.8326740017307454e-05, "loss": 0.4314, "step": 3878 }, { "epoch": 1.4296507878006082, "grad_norm": 6.482834083666468, "learning_rate": 3.8323649400420325e-05, "loss": 0.3515, "step": 3879 }, { "epoch": 1.4300193494886206, "grad_norm": 5.3610232296045295, "learning_rate": 3.83205587835332e-05, "loss": 0.4373, "step": 3880 }, { "epoch": 1.430387911176633, "grad_norm": 6.444933700989968, "learning_rate": 3.831746816664606e-05, "loss": 0.4744, "step": 3881 }, { "epoch": 1.4307564728646458, "grad_norm": 5.690084017114876, "learning_rate": 3.831437754975893e-05, "loss": 0.5581, "step": 3882 }, { "epoch": 1.4311250345526583, "grad_norm": 7.4393085236340655, "learning_rate": 3.8311286932871803e-05, "loss": 0.616, "step": 3883 }, { "epoch": 1.4314935962406707, "grad_norm": 6.701725455766795, "learning_rate": 3.8308196315984675e-05, "loss": 0.5318, "step": 3884 }, { "epoch": 1.4318621579286832, "grad_norm": 5.085553835975342, "learning_rate": 3.8305105699097546e-05, "loss": 0.246, "step": 3885 }, { "epoch": 1.432230719616696, "grad_norm": 5.884784747832355, "learning_rate": 3.830201508221041e-05, "loss": 0.3883, "step": 3886 }, { "epoch": 1.4325992813047084, "grad_norm": 4.034260416804511, "learning_rate": 3.829892446532328e-05, "loss": 0.2695, "step": 3887 }, { "epoch": 1.4329678429927208, "grad_norm": 5.933831617598837, "learning_rate": 3.829583384843615e-05, "loss": 0.4209, "step": 3888 }, { "epoch": 1.4333364046807335, "grad_norm": 4.995126638095899, "learning_rate": 3.829274323154902e-05, "loss": 0.4275, "step": 3889 }, { "epoch": 1.433704966368746, "grad_norm": 4.625939351134113, "learning_rate": 3.828965261466189e-05, "loss": 0.3349, "step": 3890 }, { "epoch": 1.4340735280567585, "grad_norm": 8.50365369471664, "learning_rate": 3.828656199777475e-05, "loss": 0.556, "step": 3891 }, { "epoch": 1.434442089744771, "grad_norm": 5.799624713726601, "learning_rate": 3.8283471380887624e-05, "loss": 0.3767, "step": 3892 }, { "epoch": 1.4348106514327836, "grad_norm": 4.940565485538526, "learning_rate": 3.8280380764000495e-05, "loss": 0.3509, "step": 3893 }, { "epoch": 1.435179213120796, "grad_norm": 5.026384168692172, "learning_rate": 3.8277290147113366e-05, "loss": 0.3909, "step": 3894 }, { "epoch": 1.4355477748088086, "grad_norm": 12.956509542556867, "learning_rate": 3.827419953022624e-05, "loss": 0.2928, "step": 3895 }, { "epoch": 1.4359163364968213, "grad_norm": 7.343735715162303, "learning_rate": 3.82711089133391e-05, "loss": 0.4678, "step": 3896 }, { "epoch": 1.4362848981848337, "grad_norm": 17.025769617103585, "learning_rate": 3.826801829645197e-05, "loss": 0.4642, "step": 3897 }, { "epoch": 1.4366534598728462, "grad_norm": 7.134371032580067, "learning_rate": 3.8264927679564844e-05, "loss": 0.3973, "step": 3898 }, { "epoch": 1.4370220215608587, "grad_norm": 6.578808094357428, "learning_rate": 3.8261837062677715e-05, "loss": 0.3673, "step": 3899 }, { "epoch": 1.4373905832488711, "grad_norm": 6.328851352025512, "learning_rate": 3.825874644579058e-05, "loss": 0.4277, "step": 3900 }, { "epoch": 1.4377591449368838, "grad_norm": 7.418153620902613, "learning_rate": 3.825565582890345e-05, "loss": 0.2699, "step": 3901 }, { "epoch": 1.4381277066248963, "grad_norm": 8.405481693368834, "learning_rate": 3.825256521201632e-05, "loss": 0.4598, "step": 3902 }, { "epoch": 1.438496268312909, "grad_norm": 6.004236632907244, "learning_rate": 3.8249474595129193e-05, "loss": 0.3195, "step": 3903 }, { "epoch": 1.4388648300009215, "grad_norm": 7.874261730647158, "learning_rate": 3.824638397824206e-05, "loss": 0.4264, "step": 3904 }, { "epoch": 1.439233391688934, "grad_norm": 10.502439669604005, "learning_rate": 3.824329336135493e-05, "loss": 0.5945, "step": 3905 }, { "epoch": 1.4396019533769464, "grad_norm": 7.779476591807477, "learning_rate": 3.8240202744467793e-05, "loss": 0.4988, "step": 3906 }, { "epoch": 1.4399705150649589, "grad_norm": 6.854435352133831, "learning_rate": 3.8237112127580665e-05, "loss": 0.4262, "step": 3907 }, { "epoch": 1.4403390767529716, "grad_norm": 6.741782378471726, "learning_rate": 3.8234021510693536e-05, "loss": 0.4355, "step": 3908 }, { "epoch": 1.440707638440984, "grad_norm": 8.809640406913388, "learning_rate": 3.823093089380641e-05, "loss": 0.4815, "step": 3909 }, { "epoch": 1.4410762001289965, "grad_norm": 7.941520573759002, "learning_rate": 3.822784027691927e-05, "loss": 0.4723, "step": 3910 }, { "epoch": 1.4414447618170092, "grad_norm": 7.667161041021316, "learning_rate": 3.822474966003214e-05, "loss": 0.4895, "step": 3911 }, { "epoch": 1.4418133235050217, "grad_norm": 8.723856206647254, "learning_rate": 3.8221659043145014e-05, "loss": 0.4037, "step": 3912 }, { "epoch": 1.4421818851930341, "grad_norm": 12.292061975016894, "learning_rate": 3.8218568426257885e-05, "loss": 0.5323, "step": 3913 }, { "epoch": 1.4425504468810466, "grad_norm": 5.647586320451263, "learning_rate": 3.8215477809370756e-05, "loss": 0.4481, "step": 3914 }, { "epoch": 1.4429190085690593, "grad_norm": 7.569863005876287, "learning_rate": 3.821238719248362e-05, "loss": 0.5093, "step": 3915 }, { "epoch": 1.4432875702570718, "grad_norm": 9.030516703577584, "learning_rate": 3.820929657559649e-05, "loss": 0.4322, "step": 3916 }, { "epoch": 1.4436561319450842, "grad_norm": 4.780685092240637, "learning_rate": 3.820620595870936e-05, "loss": 0.3554, "step": 3917 }, { "epoch": 1.444024693633097, "grad_norm": 5.7097911597439905, "learning_rate": 3.8203115341822234e-05, "loss": 0.4169, "step": 3918 }, { "epoch": 1.4443932553211094, "grad_norm": 5.018003476156503, "learning_rate": 3.82000247249351e-05, "loss": 0.3793, "step": 3919 }, { "epoch": 1.4447618170091219, "grad_norm": 8.403220839856491, "learning_rate": 3.819693410804796e-05, "loss": 0.5095, "step": 3920 }, { "epoch": 1.4451303786971343, "grad_norm": 5.311122581082936, "learning_rate": 3.8193843491160834e-05, "loss": 0.4402, "step": 3921 }, { "epoch": 1.445498940385147, "grad_norm": 4.981806078113314, "learning_rate": 3.8190752874273705e-05, "loss": 0.4551, "step": 3922 }, { "epoch": 1.4458675020731595, "grad_norm": 9.51354256128755, "learning_rate": 3.8187662257386577e-05, "loss": 0.4028, "step": 3923 }, { "epoch": 1.446236063761172, "grad_norm": 9.01528226574491, "learning_rate": 3.818457164049945e-05, "loss": 0.5099, "step": 3924 }, { "epoch": 1.4466046254491847, "grad_norm": 6.561888457369046, "learning_rate": 3.818148102361231e-05, "loss": 0.5901, "step": 3925 }, { "epoch": 1.4469731871371971, "grad_norm": 6.994104354922013, "learning_rate": 3.8178390406725183e-05, "loss": 0.5064, "step": 3926 }, { "epoch": 1.4473417488252096, "grad_norm": 7.907412236245546, "learning_rate": 3.8175299789838055e-05, "loss": 0.6442, "step": 3927 }, { "epoch": 1.447710310513222, "grad_norm": 6.732757019961714, "learning_rate": 3.8172209172950926e-05, "loss": 0.4786, "step": 3928 }, { "epoch": 1.4480788722012345, "grad_norm": 7.243338846250503, "learning_rate": 3.816911855606379e-05, "loss": 0.3205, "step": 3929 }, { "epoch": 1.4484474338892472, "grad_norm": 5.797018455519424, "learning_rate": 3.816602793917666e-05, "loss": 0.505, "step": 3930 }, { "epoch": 1.4488159955772597, "grad_norm": 7.445235901445557, "learning_rate": 3.816293732228953e-05, "loss": 0.5468, "step": 3931 }, { "epoch": 1.4491845572652724, "grad_norm": 7.398723141595215, "learning_rate": 3.8159846705402404e-05, "loss": 0.7543, "step": 3932 }, { "epoch": 1.4495531189532849, "grad_norm": 6.731064693128136, "learning_rate": 3.8156756088515275e-05, "loss": 0.6606, "step": 3933 }, { "epoch": 1.4499216806412973, "grad_norm": 4.390412349234805, "learning_rate": 3.815366547162814e-05, "loss": 0.3811, "step": 3934 }, { "epoch": 1.4502902423293098, "grad_norm": 10.383755953185364, "learning_rate": 3.8150574854741004e-05, "loss": 0.6448, "step": 3935 }, { "epoch": 1.4506588040173223, "grad_norm": 6.26108514027633, "learning_rate": 3.8147484237853875e-05, "loss": 0.4426, "step": 3936 }, { "epoch": 1.451027365705335, "grad_norm": 9.865280583253988, "learning_rate": 3.8144393620966746e-05, "loss": 0.5455, "step": 3937 }, { "epoch": 1.4513959273933474, "grad_norm": 9.095568707738263, "learning_rate": 3.814130300407962e-05, "loss": 0.527, "step": 3938 }, { "epoch": 1.45176448908136, "grad_norm": 5.573878619396868, "learning_rate": 3.813821238719248e-05, "loss": 0.3875, "step": 3939 }, { "epoch": 1.4521330507693726, "grad_norm": 5.309497658051972, "learning_rate": 3.813512177030535e-05, "loss": 0.4196, "step": 3940 }, { "epoch": 1.452501612457385, "grad_norm": 7.793281537425813, "learning_rate": 3.8132031153418224e-05, "loss": 0.4783, "step": 3941 }, { "epoch": 1.4528701741453975, "grad_norm": 7.649022766725738, "learning_rate": 3.8128940536531095e-05, "loss": 0.6631, "step": 3942 }, { "epoch": 1.45323873583341, "grad_norm": 5.343041802487138, "learning_rate": 3.8125849919643967e-05, "loss": 0.4232, "step": 3943 }, { "epoch": 1.4536072975214227, "grad_norm": 9.363456434792122, "learning_rate": 3.812275930275683e-05, "loss": 0.5454, "step": 3944 }, { "epoch": 1.4539758592094352, "grad_norm": 7.916026708603146, "learning_rate": 3.81196686858697e-05, "loss": 0.5402, "step": 3945 }, { "epoch": 1.4543444208974476, "grad_norm": 5.629810226250415, "learning_rate": 3.811657806898257e-05, "loss": 0.5184, "step": 3946 }, { "epoch": 1.4547129825854603, "grad_norm": 5.935532414530757, "learning_rate": 3.8113487452095445e-05, "loss": 0.3409, "step": 3947 }, { "epoch": 1.4550815442734728, "grad_norm": 5.539458482628218, "learning_rate": 3.811039683520831e-05, "loss": 0.4007, "step": 3948 }, { "epoch": 1.4554501059614853, "grad_norm": 7.4017871657506005, "learning_rate": 3.8107306218321173e-05, "loss": 0.4586, "step": 3949 }, { "epoch": 1.4558186676494977, "grad_norm": 5.597846380014218, "learning_rate": 3.8104215601434045e-05, "loss": 0.4535, "step": 3950 }, { "epoch": 1.4561872293375104, "grad_norm": 4.306437335763226, "learning_rate": 3.8101124984546916e-05, "loss": 0.2983, "step": 3951 }, { "epoch": 1.456555791025523, "grad_norm": 4.065262544794059, "learning_rate": 3.809803436765979e-05, "loss": 0.3809, "step": 3952 }, { "epoch": 1.4569243527135354, "grad_norm": 5.046658821318226, "learning_rate": 3.809494375077265e-05, "loss": 0.4103, "step": 3953 }, { "epoch": 1.457292914401548, "grad_norm": 7.472802758463154, "learning_rate": 3.809185313388552e-05, "loss": 0.2351, "step": 3954 }, { "epoch": 1.4576614760895605, "grad_norm": 5.197122129619024, "learning_rate": 3.8088762516998394e-05, "loss": 0.3116, "step": 3955 }, { "epoch": 1.458030037777573, "grad_norm": 6.15468178540846, "learning_rate": 3.8085671900111265e-05, "loss": 0.438, "step": 3956 }, { "epoch": 1.4583985994655855, "grad_norm": 6.627667735656172, "learning_rate": 3.8082581283224136e-05, "loss": 0.3768, "step": 3957 }, { "epoch": 1.458767161153598, "grad_norm": 6.741177196990204, "learning_rate": 3.8079490666337e-05, "loss": 0.4251, "step": 3958 }, { "epoch": 1.4591357228416106, "grad_norm": 6.4893159357112955, "learning_rate": 3.807640004944987e-05, "loss": 0.6098, "step": 3959 }, { "epoch": 1.459504284529623, "grad_norm": 8.034505340219255, "learning_rate": 3.807330943256274e-05, "loss": 0.3767, "step": 3960 }, { "epoch": 1.4598728462176358, "grad_norm": 8.767433891036271, "learning_rate": 3.8070218815675614e-05, "loss": 0.4768, "step": 3961 }, { "epoch": 1.4602414079056483, "grad_norm": 5.4549714817941535, "learning_rate": 3.8067128198788485e-05, "loss": 0.2571, "step": 3962 }, { "epoch": 1.4606099695936607, "grad_norm": 5.503401831314786, "learning_rate": 3.806403758190135e-05, "loss": 0.4538, "step": 3963 }, { "epoch": 1.4609785312816732, "grad_norm": 5.354527363582477, "learning_rate": 3.806094696501422e-05, "loss": 0.3703, "step": 3964 }, { "epoch": 1.4613470929696857, "grad_norm": 4.652366913578461, "learning_rate": 3.8057856348127085e-05, "loss": 0.2806, "step": 3965 }, { "epoch": 1.4617156546576984, "grad_norm": 12.522031366006475, "learning_rate": 3.8054765731239957e-05, "loss": 0.7781, "step": 3966 }, { "epoch": 1.4620842163457108, "grad_norm": 5.7601901441555015, "learning_rate": 3.805167511435283e-05, "loss": 0.4753, "step": 3967 }, { "epoch": 1.4624527780337235, "grad_norm": 8.579631513316691, "learning_rate": 3.804858449746569e-05, "loss": 0.3551, "step": 3968 }, { "epoch": 1.462821339721736, "grad_norm": 7.7446053710641465, "learning_rate": 3.804549388057856e-05, "loss": 0.5787, "step": 3969 }, { "epoch": 1.4631899014097485, "grad_norm": 6.5517836493612265, "learning_rate": 3.8042403263691435e-05, "loss": 0.3058, "step": 3970 }, { "epoch": 1.463558463097761, "grad_norm": 7.024427028109993, "learning_rate": 3.8039312646804306e-05, "loss": 0.4034, "step": 3971 }, { "epoch": 1.4639270247857734, "grad_norm": 4.133461337004584, "learning_rate": 3.803622202991717e-05, "loss": 0.2109, "step": 3972 }, { "epoch": 1.464295586473786, "grad_norm": 7.177353306210984, "learning_rate": 3.803313141303004e-05, "loss": 0.4063, "step": 3973 }, { "epoch": 1.4646641481617986, "grad_norm": 6.052229843948789, "learning_rate": 3.803004079614291e-05, "loss": 0.3738, "step": 3974 }, { "epoch": 1.465032709849811, "grad_norm": 5.137026747836086, "learning_rate": 3.8026950179255784e-05, "loss": 0.3781, "step": 3975 }, { "epoch": 1.4654012715378237, "grad_norm": 7.915369857850754, "learning_rate": 3.8023859562368655e-05, "loss": 0.6557, "step": 3976 }, { "epoch": 1.4657698332258362, "grad_norm": 6.334111149189993, "learning_rate": 3.802076894548152e-05, "loss": 0.4775, "step": 3977 }, { "epoch": 1.4661383949138487, "grad_norm": 6.218603697929648, "learning_rate": 3.801767832859439e-05, "loss": 0.5062, "step": 3978 }, { "epoch": 1.4665069566018611, "grad_norm": 7.602125031376147, "learning_rate": 3.801458771170726e-05, "loss": 0.4376, "step": 3979 }, { "epoch": 1.4668755182898738, "grad_norm": 4.811966309122689, "learning_rate": 3.8011497094820126e-05, "loss": 0.3288, "step": 3980 }, { "epoch": 1.4672440799778863, "grad_norm": 5.425618311369585, "learning_rate": 3.8008406477933e-05, "loss": 0.3061, "step": 3981 }, { "epoch": 1.4676126416658988, "grad_norm": 3.7881490019160773, "learning_rate": 3.800531586104586e-05, "loss": 0.2349, "step": 3982 }, { "epoch": 1.4679812033539115, "grad_norm": 8.068465990051466, "learning_rate": 3.800222524415873e-05, "loss": 0.2924, "step": 3983 }, { "epoch": 1.468349765041924, "grad_norm": 3.8108894823716137, "learning_rate": 3.7999134627271604e-05, "loss": 0.3275, "step": 3984 }, { "epoch": 1.4687183267299364, "grad_norm": 6.764847144678817, "learning_rate": 3.7996044010384475e-05, "loss": 0.3859, "step": 3985 }, { "epoch": 1.4690868884179489, "grad_norm": 5.721243559633897, "learning_rate": 3.7992953393497347e-05, "loss": 0.4419, "step": 3986 }, { "epoch": 1.4694554501059613, "grad_norm": 13.857214173258443, "learning_rate": 3.798986277661021e-05, "loss": 0.4711, "step": 3987 }, { "epoch": 1.469824011793974, "grad_norm": 4.941999871705137, "learning_rate": 3.798677215972308e-05, "loss": 0.4619, "step": 3988 }, { "epoch": 1.4701925734819865, "grad_norm": 5.418533394784111, "learning_rate": 3.798368154283595e-05, "loss": 0.3645, "step": 3989 }, { "epoch": 1.4705611351699992, "grad_norm": 10.142078454197332, "learning_rate": 3.7980590925948825e-05, "loss": 0.4428, "step": 3990 }, { "epoch": 1.4709296968580117, "grad_norm": 5.887725683207662, "learning_rate": 3.797750030906169e-05, "loss": 0.3953, "step": 3991 }, { "epoch": 1.4712982585460241, "grad_norm": 6.0791883935816795, "learning_rate": 3.797440969217456e-05, "loss": 0.4562, "step": 3992 }, { "epoch": 1.4716668202340366, "grad_norm": 5.4700678518156245, "learning_rate": 3.797131907528743e-05, "loss": 0.3743, "step": 3993 }, { "epoch": 1.472035381922049, "grad_norm": 5.9510048923283225, "learning_rate": 3.79682284584003e-05, "loss": 0.2359, "step": 3994 }, { "epoch": 1.4724039436100618, "grad_norm": 9.34148713845969, "learning_rate": 3.796513784151317e-05, "loss": 0.6009, "step": 3995 }, { "epoch": 1.4727725052980742, "grad_norm": 6.609358262205461, "learning_rate": 3.796204722462604e-05, "loss": 0.5407, "step": 3996 }, { "epoch": 1.473141066986087, "grad_norm": 5.871334474543491, "learning_rate": 3.79589566077389e-05, "loss": 0.4433, "step": 3997 }, { "epoch": 1.4735096286740994, "grad_norm": 5.946254339819995, "learning_rate": 3.7955865990851774e-05, "loss": 0.3821, "step": 3998 }, { "epoch": 1.4738781903621119, "grad_norm": 8.856580074730719, "learning_rate": 3.7952775373964645e-05, "loss": 0.4459, "step": 3999 }, { "epoch": 1.4742467520501243, "grad_norm": 23.654096820354795, "learning_rate": 3.7949684757077516e-05, "loss": 0.4024, "step": 4000 }, { "epoch": 1.4742467520501243, "eval_bleu": 0.07907290230209624, "eval_bleu_1gram": 0.43233705742771733, "eval_bleu_2gram": 0.21828389996821054, "eval_bleu_3gram": 0.11150767226762426, "eval_bleu_4gram": 0.060532887948848715, "eval_rag_val_loss": 0.7910259763559222, "eval_rouge1": 0.42155704512575093, "eval_rouge2": 0.21307293792596688, "eval_rougeL": 0.4193056379068561, "step": 4000 }, { "epoch": 1.4746153137381368, "grad_norm": 9.251983429929785, "learning_rate": 3.794659414019038e-05, "loss": 0.4266, "step": 4001 }, { "epoch": 1.4749838754261495, "grad_norm": 5.911829981835647, "learning_rate": 3.794350352330325e-05, "loss": 0.386, "step": 4002 }, { "epoch": 1.475352437114162, "grad_norm": 7.915686242144899, "learning_rate": 3.794041290641612e-05, "loss": 0.5985, "step": 4003 }, { "epoch": 1.4757209988021744, "grad_norm": 8.810657497989295, "learning_rate": 3.7937322289528994e-05, "loss": 0.8177, "step": 4004 }, { "epoch": 1.4760895604901871, "grad_norm": 10.442282671668083, "learning_rate": 3.7934231672641865e-05, "loss": 0.4473, "step": 4005 }, { "epoch": 1.4764581221781996, "grad_norm": 7.079487044292315, "learning_rate": 3.793114105575473e-05, "loss": 0.3146, "step": 4006 }, { "epoch": 1.476826683866212, "grad_norm": 8.632756822501449, "learning_rate": 3.79280504388676e-05, "loss": 0.3974, "step": 4007 }, { "epoch": 1.4771952455542245, "grad_norm": 6.22363947819057, "learning_rate": 3.792495982198047e-05, "loss": 0.4307, "step": 4008 }, { "epoch": 1.4775638072422372, "grad_norm": 6.168014516533815, "learning_rate": 3.792186920509334e-05, "loss": 0.4086, "step": 4009 }, { "epoch": 1.4779323689302497, "grad_norm": 5.503121617244133, "learning_rate": 3.791877858820621e-05, "loss": 0.3464, "step": 4010 }, { "epoch": 1.4783009306182622, "grad_norm": 7.5729590357487675, "learning_rate": 3.791568797131907e-05, "loss": 0.3504, "step": 4011 }, { "epoch": 1.4786694923062749, "grad_norm": 7.971313782200939, "learning_rate": 3.791259735443194e-05, "loss": 0.4812, "step": 4012 }, { "epoch": 1.4790380539942873, "grad_norm": 9.316797685697106, "learning_rate": 3.7909506737544814e-05, "loss": 0.2901, "step": 4013 }, { "epoch": 1.4794066156822998, "grad_norm": 8.284130358456368, "learning_rate": 3.7906416120657686e-05, "loss": 0.5312, "step": 4014 }, { "epoch": 1.4797751773703123, "grad_norm": 8.15753046851353, "learning_rate": 3.790332550377056e-05, "loss": 0.617, "step": 4015 }, { "epoch": 1.480143739058325, "grad_norm": 6.902989787430595, "learning_rate": 3.790023488688342e-05, "loss": 0.4109, "step": 4016 }, { "epoch": 1.4805123007463374, "grad_norm": 5.419242637618366, "learning_rate": 3.789714426999629e-05, "loss": 0.3493, "step": 4017 }, { "epoch": 1.48088086243435, "grad_norm": 5.620763242759149, "learning_rate": 3.7894053653109164e-05, "loss": 0.2781, "step": 4018 }, { "epoch": 1.4812494241223626, "grad_norm": 8.06474048327037, "learning_rate": 3.7890963036222035e-05, "loss": 0.643, "step": 4019 }, { "epoch": 1.481617985810375, "grad_norm": 9.5157853537392, "learning_rate": 3.78878724193349e-05, "loss": 0.7897, "step": 4020 }, { "epoch": 1.4819865474983875, "grad_norm": 7.465290369619549, "learning_rate": 3.788478180244777e-05, "loss": 0.3948, "step": 4021 }, { "epoch": 1.4823551091864, "grad_norm": 7.7579211700911, "learning_rate": 3.788169118556064e-05, "loss": 0.4785, "step": 4022 }, { "epoch": 1.4827236708744125, "grad_norm": 6.975817188807356, "learning_rate": 3.787860056867351e-05, "loss": 0.2653, "step": 4023 }, { "epoch": 1.4830922325624252, "grad_norm": 13.051883161166222, "learning_rate": 3.7875509951786384e-05, "loss": 0.362, "step": 4024 }, { "epoch": 1.4834607942504376, "grad_norm": 5.182921813787281, "learning_rate": 3.787241933489925e-05, "loss": 0.463, "step": 4025 }, { "epoch": 1.4838293559384503, "grad_norm": 4.051640477369793, "learning_rate": 3.786932871801211e-05, "loss": 0.3086, "step": 4026 }, { "epoch": 1.4841979176264628, "grad_norm": 9.58219686210326, "learning_rate": 3.7866238101124984e-05, "loss": 0.4674, "step": 4027 }, { "epoch": 1.4845664793144753, "grad_norm": 11.465031408174584, "learning_rate": 3.7863147484237855e-05, "loss": 0.5122, "step": 4028 }, { "epoch": 1.4849350410024877, "grad_norm": 4.749567915688982, "learning_rate": 3.7860056867350726e-05, "loss": 0.3762, "step": 4029 }, { "epoch": 1.4853036026905002, "grad_norm": 7.648103825554727, "learning_rate": 3.785696625046359e-05, "loss": 0.4495, "step": 4030 }, { "epoch": 1.485672164378513, "grad_norm": 6.412689991417439, "learning_rate": 3.785387563357646e-05, "loss": 0.3738, "step": 4031 }, { "epoch": 1.4860407260665254, "grad_norm": 6.773683607300861, "learning_rate": 3.785078501668933e-05, "loss": 0.4808, "step": 4032 }, { "epoch": 1.4864092877545378, "grad_norm": 4.629039958841548, "learning_rate": 3.7847694399802204e-05, "loss": 0.3212, "step": 4033 }, { "epoch": 1.4867778494425505, "grad_norm": 4.50605387902419, "learning_rate": 3.7844603782915076e-05, "loss": 0.3964, "step": 4034 }, { "epoch": 1.487146411130563, "grad_norm": 6.584554760130723, "learning_rate": 3.784151316602794e-05, "loss": 0.7804, "step": 4035 }, { "epoch": 1.4875149728185755, "grad_norm": 6.671492482996859, "learning_rate": 3.783842254914081e-05, "loss": 0.573, "step": 4036 }, { "epoch": 1.487883534506588, "grad_norm": 7.649211528760252, "learning_rate": 3.783533193225368e-05, "loss": 0.3992, "step": 4037 }, { "epoch": 1.4882520961946006, "grad_norm": 7.773654156449964, "learning_rate": 3.7832241315366554e-05, "loss": 0.515, "step": 4038 }, { "epoch": 1.488620657882613, "grad_norm": 6.993572553986345, "learning_rate": 3.782915069847942e-05, "loss": 0.3609, "step": 4039 }, { "epoch": 1.4889892195706256, "grad_norm": 6.5068634183767955, "learning_rate": 3.782606008159228e-05, "loss": 0.6048, "step": 4040 }, { "epoch": 1.4893577812586383, "grad_norm": 6.609798336675157, "learning_rate": 3.7822969464705154e-05, "loss": 0.484, "step": 4041 }, { "epoch": 1.4897263429466507, "grad_norm": 5.176378392734684, "learning_rate": 3.7819878847818025e-05, "loss": 0.2751, "step": 4042 }, { "epoch": 1.4900949046346632, "grad_norm": 5.829592931548359, "learning_rate": 3.7816788230930896e-05, "loss": 0.3832, "step": 4043 }, { "epoch": 1.4904634663226757, "grad_norm": 6.057311994191045, "learning_rate": 3.781369761404376e-05, "loss": 0.4481, "step": 4044 }, { "epoch": 1.4908320280106884, "grad_norm": 5.678830760370627, "learning_rate": 3.781060699715663e-05, "loss": 0.4419, "step": 4045 }, { "epoch": 1.4912005896987008, "grad_norm": 7.017805614441729, "learning_rate": 3.78075163802695e-05, "loss": 0.4585, "step": 4046 }, { "epoch": 1.4915691513867133, "grad_norm": 8.404403767403544, "learning_rate": 3.7804425763382374e-05, "loss": 0.6505, "step": 4047 }, { "epoch": 1.491937713074726, "grad_norm": 9.099682108598028, "learning_rate": 3.7801335146495245e-05, "loss": 0.4715, "step": 4048 }, { "epoch": 1.4923062747627385, "grad_norm": 6.707324155478111, "learning_rate": 3.779824452960811e-05, "loss": 0.3078, "step": 4049 }, { "epoch": 1.492674836450751, "grad_norm": 11.48191938032357, "learning_rate": 3.779515391272098e-05, "loss": 0.4042, "step": 4050 }, { "epoch": 1.4930433981387634, "grad_norm": 13.096433523861444, "learning_rate": 3.779206329583385e-05, "loss": 0.6861, "step": 4051 }, { "epoch": 1.4934119598267759, "grad_norm": 4.634623644712157, "learning_rate": 3.778897267894672e-05, "loss": 0.3252, "step": 4052 }, { "epoch": 1.4937805215147886, "grad_norm": 6.7407516418077, "learning_rate": 3.7785882062059594e-05, "loss": 0.4363, "step": 4053 }, { "epoch": 1.494149083202801, "grad_norm": 7.455654011111194, "learning_rate": 3.778279144517246e-05, "loss": 0.4907, "step": 4054 }, { "epoch": 1.4945176448908137, "grad_norm": 6.245550479129403, "learning_rate": 3.777970082828532e-05, "loss": 0.4189, "step": 4055 }, { "epoch": 1.4948862065788262, "grad_norm": 6.0608294830177165, "learning_rate": 3.7776610211398194e-05, "loss": 0.5642, "step": 4056 }, { "epoch": 1.4952547682668387, "grad_norm": 7.298249795207218, "learning_rate": 3.7773519594511066e-05, "loss": 0.4783, "step": 4057 }, { "epoch": 1.4956233299548511, "grad_norm": 5.097004605439942, "learning_rate": 3.777042897762394e-05, "loss": 0.3734, "step": 4058 }, { "epoch": 1.4959918916428636, "grad_norm": 5.569549551099122, "learning_rate": 3.77673383607368e-05, "loss": 0.3166, "step": 4059 }, { "epoch": 1.4963604533308763, "grad_norm": 8.144123753295032, "learning_rate": 3.776424774384967e-05, "loss": 0.5274, "step": 4060 }, { "epoch": 1.4967290150188888, "grad_norm": 12.288319967236673, "learning_rate": 3.7761157126962544e-05, "loss": 0.6762, "step": 4061 }, { "epoch": 1.4970975767069012, "grad_norm": 5.0709267642405385, "learning_rate": 3.7758066510075415e-05, "loss": 0.4234, "step": 4062 }, { "epoch": 1.497466138394914, "grad_norm": 4.594241161428646, "learning_rate": 3.775497589318828e-05, "loss": 0.3122, "step": 4063 }, { "epoch": 1.4978347000829264, "grad_norm": 6.346655870690233, "learning_rate": 3.775188527630115e-05, "loss": 0.3966, "step": 4064 }, { "epoch": 1.4982032617709389, "grad_norm": 4.903151779150301, "learning_rate": 3.774879465941402e-05, "loss": 0.3741, "step": 4065 }, { "epoch": 1.4985718234589513, "grad_norm": 7.335391478281744, "learning_rate": 3.774570404252689e-05, "loss": 0.4699, "step": 4066 }, { "epoch": 1.498940385146964, "grad_norm": 5.2300632311060244, "learning_rate": 3.7742613425639764e-05, "loss": 0.4136, "step": 4067 }, { "epoch": 1.4993089468349765, "grad_norm": 10.768573418141981, "learning_rate": 3.773952280875263e-05, "loss": 0.3833, "step": 4068 }, { "epoch": 1.499677508522989, "grad_norm": 8.794390746023854, "learning_rate": 3.77364321918655e-05, "loss": 0.4795, "step": 4069 }, { "epoch": 1.5000460702110017, "grad_norm": 6.139787534336744, "learning_rate": 3.7733341574978364e-05, "loss": 0.485, "step": 4070 }, { "epoch": 1.5004146318990141, "grad_norm": 6.051013880494209, "learning_rate": 3.7730250958091235e-05, "loss": 0.4876, "step": 4071 }, { "epoch": 1.5007831935870266, "grad_norm": 5.713499233073207, "learning_rate": 3.7727160341204106e-05, "loss": 0.3971, "step": 4072 }, { "epoch": 1.501151755275039, "grad_norm": 8.770593442075164, "learning_rate": 3.772406972431697e-05, "loss": 0.4187, "step": 4073 }, { "epoch": 1.5015203169630515, "grad_norm": 6.150841520467919, "learning_rate": 3.772097910742984e-05, "loss": 0.4397, "step": 4074 }, { "epoch": 1.5018888786510642, "grad_norm": 4.9200470936079785, "learning_rate": 3.771788849054271e-05, "loss": 0.2733, "step": 4075 }, { "epoch": 1.5022574403390767, "grad_norm": 6.728365099459236, "learning_rate": 3.7714797873655584e-05, "loss": 0.4239, "step": 4076 }, { "epoch": 1.5026260020270894, "grad_norm": 5.052408592005046, "learning_rate": 3.7711707256768456e-05, "loss": 0.3747, "step": 4077 }, { "epoch": 1.5029945637151019, "grad_norm": 10.119849884970426, "learning_rate": 3.770861663988132e-05, "loss": 0.4987, "step": 4078 }, { "epoch": 1.5033631254031143, "grad_norm": 8.015405127649604, "learning_rate": 3.770552602299419e-05, "loss": 0.4836, "step": 4079 }, { "epoch": 1.5037316870911268, "grad_norm": 5.159377750463001, "learning_rate": 3.770243540610706e-05, "loss": 0.3653, "step": 4080 }, { "epoch": 1.5041002487791393, "grad_norm": 7.705432600369553, "learning_rate": 3.7699344789219934e-05, "loss": 0.419, "step": 4081 }, { "epoch": 1.504468810467152, "grad_norm": 4.63774415064962, "learning_rate": 3.76962541723328e-05, "loss": 0.3127, "step": 4082 }, { "epoch": 1.5048373721551644, "grad_norm": 6.476776303580439, "learning_rate": 3.769316355544567e-05, "loss": 0.4513, "step": 4083 }, { "epoch": 1.5052059338431771, "grad_norm": 5.598112312686671, "learning_rate": 3.769007293855854e-05, "loss": 0.3784, "step": 4084 }, { "epoch": 1.5055744955311896, "grad_norm": 6.244571311781131, "learning_rate": 3.768698232167141e-05, "loss": 0.4895, "step": 4085 }, { "epoch": 1.505943057219202, "grad_norm": 5.332484098214906, "learning_rate": 3.7683891704784276e-05, "loss": 0.3581, "step": 4086 }, { "epoch": 1.5063116189072145, "grad_norm": 5.877202696491294, "learning_rate": 3.768080108789715e-05, "loss": 0.3468, "step": 4087 }, { "epoch": 1.506680180595227, "grad_norm": 4.611631713852374, "learning_rate": 3.767771047101001e-05, "loss": 0.3918, "step": 4088 }, { "epoch": 1.5070487422832397, "grad_norm": 6.249169866745302, "learning_rate": 3.767461985412288e-05, "loss": 0.3863, "step": 4089 }, { "epoch": 1.5074173039712522, "grad_norm": 10.869179757622726, "learning_rate": 3.7671529237235754e-05, "loss": 0.4687, "step": 4090 }, { "epoch": 1.5077858656592649, "grad_norm": 7.379470763213117, "learning_rate": 3.7668438620348625e-05, "loss": 0.5941, "step": 4091 }, { "epoch": 1.5081544273472773, "grad_norm": 8.539083049780853, "learning_rate": 3.766534800346149e-05, "loss": 0.4466, "step": 4092 }, { "epoch": 1.5085229890352898, "grad_norm": 5.254542429246059, "learning_rate": 3.766225738657436e-05, "loss": 0.4049, "step": 4093 }, { "epoch": 1.5088915507233023, "grad_norm": 5.4571528820553485, "learning_rate": 3.765916676968723e-05, "loss": 0.2139, "step": 4094 }, { "epoch": 1.5092601124113147, "grad_norm": 18.74207759060103, "learning_rate": 3.76560761528001e-05, "loss": 0.2851, "step": 4095 }, { "epoch": 1.5096286740993272, "grad_norm": 6.113621009428359, "learning_rate": 3.7652985535912974e-05, "loss": 0.4857, "step": 4096 }, { "epoch": 1.50999723578734, "grad_norm": 6.763208816808473, "learning_rate": 3.764989491902584e-05, "loss": 0.3873, "step": 4097 }, { "epoch": 1.5103657974753526, "grad_norm": 6.383988569741784, "learning_rate": 3.764680430213871e-05, "loss": 0.4833, "step": 4098 }, { "epoch": 1.510734359163365, "grad_norm": 6.040901012794922, "learning_rate": 3.764371368525158e-05, "loss": 0.4162, "step": 4099 }, { "epoch": 1.5111029208513775, "grad_norm": 5.726480897163298, "learning_rate": 3.764062306836445e-05, "loss": 0.5439, "step": 4100 }, { "epoch": 1.51147148253939, "grad_norm": 5.459179159930523, "learning_rate": 3.763753245147732e-05, "loss": 0.5341, "step": 4101 }, { "epoch": 1.5118400442274025, "grad_norm": 5.413912420133068, "learning_rate": 3.763444183459018e-05, "loss": 0.3946, "step": 4102 }, { "epoch": 1.512208605915415, "grad_norm": 6.079024299835318, "learning_rate": 3.763135121770305e-05, "loss": 0.4877, "step": 4103 }, { "epoch": 1.5125771676034276, "grad_norm": 5.561320147596038, "learning_rate": 3.7628260600815924e-05, "loss": 0.339, "step": 4104 }, { "epoch": 1.5129457292914401, "grad_norm": 5.623392511194727, "learning_rate": 3.7625169983928795e-05, "loss": 0.5535, "step": 4105 }, { "epoch": 1.5133142909794528, "grad_norm": 6.609684352984925, "learning_rate": 3.7622079367041666e-05, "loss": 0.4077, "step": 4106 }, { "epoch": 1.5136828526674653, "grad_norm": 6.621723444429835, "learning_rate": 3.761898875015453e-05, "loss": 0.3398, "step": 4107 }, { "epoch": 1.5140514143554777, "grad_norm": 8.296175652464392, "learning_rate": 3.76158981332674e-05, "loss": 0.4802, "step": 4108 }, { "epoch": 1.5144199760434902, "grad_norm": 7.123878256852586, "learning_rate": 3.761280751638027e-05, "loss": 0.6822, "step": 4109 }, { "epoch": 1.5147885377315027, "grad_norm": 4.844489847100736, "learning_rate": 3.7609716899493144e-05, "loss": 0.3557, "step": 4110 }, { "epoch": 1.5151570994195154, "grad_norm": 10.244258854477422, "learning_rate": 3.760662628260601e-05, "loss": 0.3476, "step": 4111 }, { "epoch": 1.5155256611075278, "grad_norm": 7.975967071159136, "learning_rate": 3.760353566571888e-05, "loss": 0.4523, "step": 4112 }, { "epoch": 1.5158942227955405, "grad_norm": 6.790799075764379, "learning_rate": 3.760044504883175e-05, "loss": 0.3645, "step": 4113 }, { "epoch": 1.516262784483553, "grad_norm": 9.106687381584917, "learning_rate": 3.759735443194462e-05, "loss": 0.3988, "step": 4114 }, { "epoch": 1.5166313461715655, "grad_norm": 4.677370851284334, "learning_rate": 3.759426381505749e-05, "loss": 0.3621, "step": 4115 }, { "epoch": 1.516999907859578, "grad_norm": 3.8170976276100217, "learning_rate": 3.759117319817035e-05, "loss": 0.2065, "step": 4116 }, { "epoch": 1.5173684695475904, "grad_norm": 7.060520671199646, "learning_rate": 3.758808258128322e-05, "loss": 0.5495, "step": 4117 }, { "epoch": 1.517737031235603, "grad_norm": 6.672394707693326, "learning_rate": 3.758499196439609e-05, "loss": 0.3588, "step": 4118 }, { "epoch": 1.5181055929236156, "grad_norm": 8.69446487613941, "learning_rate": 3.7581901347508964e-05, "loss": 0.5715, "step": 4119 }, { "epoch": 1.5184741546116283, "grad_norm": 6.480537915449919, "learning_rate": 3.7578810730621836e-05, "loss": 0.5865, "step": 4120 }, { "epoch": 1.5188427162996407, "grad_norm": 6.520214632398159, "learning_rate": 3.75757201137347e-05, "loss": 0.3619, "step": 4121 }, { "epoch": 1.5192112779876532, "grad_norm": 5.277722868159686, "learning_rate": 3.757262949684757e-05, "loss": 0.4714, "step": 4122 }, { "epoch": 1.5195798396756657, "grad_norm": 5.265822953032191, "learning_rate": 3.756953887996044e-05, "loss": 0.5709, "step": 4123 }, { "epoch": 1.5199484013636781, "grad_norm": 5.651750492579365, "learning_rate": 3.7566448263073314e-05, "loss": 0.3838, "step": 4124 }, { "epoch": 1.5203169630516906, "grad_norm": 7.42986309433666, "learning_rate": 3.7563357646186185e-05, "loss": 0.3664, "step": 4125 }, { "epoch": 1.5206855247397033, "grad_norm": 5.976076113786824, "learning_rate": 3.756026702929905e-05, "loss": 0.3607, "step": 4126 }, { "epoch": 1.521054086427716, "grad_norm": 7.835977419202904, "learning_rate": 3.755717641241192e-05, "loss": 0.4152, "step": 4127 }, { "epoch": 1.5214226481157285, "grad_norm": 8.200820453815744, "learning_rate": 3.755408579552479e-05, "loss": 0.683, "step": 4128 }, { "epoch": 1.521791209803741, "grad_norm": 5.444466523798247, "learning_rate": 3.755099517863766e-05, "loss": 0.3586, "step": 4129 }, { "epoch": 1.5221597714917534, "grad_norm": 5.6956741783307026, "learning_rate": 3.754790456175053e-05, "loss": 0.3434, "step": 4130 }, { "epoch": 1.5225283331797659, "grad_norm": 10.437771182620464, "learning_rate": 3.754481394486339e-05, "loss": 0.6583, "step": 4131 }, { "epoch": 1.5228968948677784, "grad_norm": 5.709534938599456, "learning_rate": 3.754172332797626e-05, "loss": 0.4086, "step": 4132 }, { "epoch": 1.523265456555791, "grad_norm": 7.435123352382038, "learning_rate": 3.7538632711089134e-05, "loss": 0.5476, "step": 4133 }, { "epoch": 1.5236340182438035, "grad_norm": 7.550353833932428, "learning_rate": 3.7535542094202005e-05, "loss": 0.3386, "step": 4134 }, { "epoch": 1.5240025799318162, "grad_norm": 3.804672209847998, "learning_rate": 3.753245147731487e-05, "loss": 0.2369, "step": 4135 }, { "epoch": 1.5243711416198287, "grad_norm": 7.396079762253066, "learning_rate": 3.752936086042774e-05, "loss": 0.4337, "step": 4136 }, { "epoch": 1.5247397033078411, "grad_norm": 6.333359801923086, "learning_rate": 3.752627024354061e-05, "loss": 0.4948, "step": 4137 }, { "epoch": 1.5251082649958536, "grad_norm": 6.087461699378999, "learning_rate": 3.752317962665348e-05, "loss": 0.4178, "step": 4138 }, { "epoch": 1.525476826683866, "grad_norm": 8.099118370772775, "learning_rate": 3.7520089009766354e-05, "loss": 0.5722, "step": 4139 }, { "epoch": 1.5258453883718788, "grad_norm": 5.606709051315968, "learning_rate": 3.751699839287922e-05, "loss": 0.566, "step": 4140 }, { "epoch": 1.5262139500598912, "grad_norm": 4.930739204437084, "learning_rate": 3.751390777599209e-05, "loss": 0.2321, "step": 4141 }, { "epoch": 1.526582511747904, "grad_norm": 9.466245817402392, "learning_rate": 3.751081715910496e-05, "loss": 0.5769, "step": 4142 }, { "epoch": 1.5269510734359164, "grad_norm": 6.267966120839533, "learning_rate": 3.750772654221783e-05, "loss": 0.5525, "step": 4143 }, { "epoch": 1.5273196351239289, "grad_norm": 6.762806646759545, "learning_rate": 3.75046359253307e-05, "loss": 0.435, "step": 4144 }, { "epoch": 1.5276881968119413, "grad_norm": 5.6124024788385105, "learning_rate": 3.750154530844357e-05, "loss": 0.4764, "step": 4145 }, { "epoch": 1.5280567584999538, "grad_norm": 5.7066749706980335, "learning_rate": 3.749845469155643e-05, "loss": 0.4583, "step": 4146 }, { "epoch": 1.5284253201879665, "grad_norm": 5.887521265076544, "learning_rate": 3.7495364074669304e-05, "loss": 0.2742, "step": 4147 }, { "epoch": 1.528793881875979, "grad_norm": 6.575277622874268, "learning_rate": 3.7492273457782175e-05, "loss": 0.6345, "step": 4148 }, { "epoch": 1.5291624435639917, "grad_norm": 5.002456824379912, "learning_rate": 3.7489182840895046e-05, "loss": 0.3295, "step": 4149 }, { "epoch": 1.5295310052520041, "grad_norm": 5.194850264328975, "learning_rate": 3.748609222400791e-05, "loss": 0.4161, "step": 4150 }, { "epoch": 1.5298995669400166, "grad_norm": 6.562950772743841, "learning_rate": 3.748300160712078e-05, "loss": 0.5825, "step": 4151 }, { "epoch": 1.530268128628029, "grad_norm": 6.409496842988023, "learning_rate": 3.747991099023365e-05, "loss": 0.5049, "step": 4152 }, { "epoch": 1.5306366903160415, "grad_norm": 6.992047561012367, "learning_rate": 3.7476820373346524e-05, "loss": 0.5108, "step": 4153 }, { "epoch": 1.531005252004054, "grad_norm": 11.792515880257818, "learning_rate": 3.747372975645939e-05, "loss": 0.5804, "step": 4154 }, { "epoch": 1.5313738136920667, "grad_norm": 6.891716903017556, "learning_rate": 3.747063913957226e-05, "loss": 0.5633, "step": 4155 }, { "epoch": 1.5317423753800794, "grad_norm": 5.811965097405269, "learning_rate": 3.746754852268513e-05, "loss": 0.4434, "step": 4156 }, { "epoch": 1.5321109370680919, "grad_norm": 6.815854707718506, "learning_rate": 3.7464457905798e-05, "loss": 0.4363, "step": 4157 }, { "epoch": 1.5324794987561043, "grad_norm": 5.5376064178367885, "learning_rate": 3.746136728891087e-05, "loss": 0.3667, "step": 4158 }, { "epoch": 1.5328480604441168, "grad_norm": 4.308066037492744, "learning_rate": 3.745827667202374e-05, "loss": 0.3765, "step": 4159 }, { "epoch": 1.5332166221321293, "grad_norm": 5.295255070090979, "learning_rate": 3.745518605513661e-05, "loss": 0.459, "step": 4160 }, { "epoch": 1.5335851838201418, "grad_norm": 9.564832614804022, "learning_rate": 3.745209543824947e-05, "loss": 0.5481, "step": 4161 }, { "epoch": 1.5339537455081544, "grad_norm": 6.041511621349283, "learning_rate": 3.7449004821362344e-05, "loss": 0.3188, "step": 4162 }, { "epoch": 1.534322307196167, "grad_norm": 6.941417790683991, "learning_rate": 3.7445914204475215e-05, "loss": 0.5876, "step": 4163 }, { "epoch": 1.5346908688841796, "grad_norm": 11.420730053335989, "learning_rate": 3.744282358758808e-05, "loss": 0.5435, "step": 4164 }, { "epoch": 1.535059430572192, "grad_norm": 7.314929004462861, "learning_rate": 3.743973297070095e-05, "loss": 0.4657, "step": 4165 }, { "epoch": 1.5354279922602045, "grad_norm": 8.452088805483603, "learning_rate": 3.743664235381382e-05, "loss": 0.3355, "step": 4166 }, { "epoch": 1.535796553948217, "grad_norm": 5.787489374863695, "learning_rate": 3.7433551736926693e-05, "loss": 0.3881, "step": 4167 }, { "epoch": 1.5361651156362295, "grad_norm": 8.343204551898761, "learning_rate": 3.7430461120039565e-05, "loss": 0.516, "step": 4168 }, { "epoch": 1.5365336773242422, "grad_norm": 6.503365452366278, "learning_rate": 3.742737050315243e-05, "loss": 0.3934, "step": 4169 }, { "epoch": 1.5369022390122546, "grad_norm": 4.374332050015072, "learning_rate": 3.74242798862653e-05, "loss": 0.3952, "step": 4170 }, { "epoch": 1.5372708007002673, "grad_norm": 6.039310109736252, "learning_rate": 3.742118926937817e-05, "loss": 0.5825, "step": 4171 }, { "epoch": 1.5376393623882798, "grad_norm": 6.04059221206505, "learning_rate": 3.741809865249104e-05, "loss": 0.5463, "step": 4172 }, { "epoch": 1.5380079240762923, "grad_norm": 5.872157281624592, "learning_rate": 3.741500803560391e-05, "loss": 0.2941, "step": 4173 }, { "epoch": 1.5383764857643047, "grad_norm": 6.475726065559898, "learning_rate": 3.741191741871678e-05, "loss": 0.4374, "step": 4174 }, { "epoch": 1.5387450474523172, "grad_norm": 5.660057514558949, "learning_rate": 3.740882680182965e-05, "loss": 0.2948, "step": 4175 }, { "epoch": 1.53911360914033, "grad_norm": 5.20211076445318, "learning_rate": 3.7405736184942514e-05, "loss": 0.3633, "step": 4176 }, { "epoch": 1.5394821708283424, "grad_norm": 6.439852090404644, "learning_rate": 3.7402645568055385e-05, "loss": 0.4488, "step": 4177 }, { "epoch": 1.539850732516355, "grad_norm": 6.984992151494898, "learning_rate": 3.7399554951168256e-05, "loss": 0.2638, "step": 4178 }, { "epoch": 1.5402192942043675, "grad_norm": 4.959922963128651, "learning_rate": 3.739646433428112e-05, "loss": 0.3113, "step": 4179 }, { "epoch": 1.54058785589238, "grad_norm": 7.573278391374618, "learning_rate": 3.739337371739399e-05, "loss": 0.4717, "step": 4180 }, { "epoch": 1.5409564175803925, "grad_norm": 5.911302454121389, "learning_rate": 3.739028310050686e-05, "loss": 0.608, "step": 4181 }, { "epoch": 1.541324979268405, "grad_norm": 8.11241419295674, "learning_rate": 3.7387192483619734e-05, "loss": 0.5988, "step": 4182 }, { "epoch": 1.5416935409564174, "grad_norm": 6.885901668239822, "learning_rate": 3.73841018667326e-05, "loss": 0.3215, "step": 4183 }, { "epoch": 1.54206210264443, "grad_norm": 12.745801047244441, "learning_rate": 3.738101124984547e-05, "loss": 0.4075, "step": 4184 }, { "epoch": 1.5424306643324428, "grad_norm": 14.238227666222903, "learning_rate": 3.737792063295834e-05, "loss": 0.6255, "step": 4185 }, { "epoch": 1.5427992260204553, "grad_norm": 8.082290844368863, "learning_rate": 3.737483001607121e-05, "loss": 0.3456, "step": 4186 }, { "epoch": 1.5431677877084677, "grad_norm": 5.086576313267917, "learning_rate": 3.7371739399184083e-05, "loss": 0.3557, "step": 4187 }, { "epoch": 1.5435363493964802, "grad_norm": 7.86565122785655, "learning_rate": 3.736864878229695e-05, "loss": 0.3234, "step": 4188 }, { "epoch": 1.5439049110844927, "grad_norm": 5.8525676602473755, "learning_rate": 3.736555816540982e-05, "loss": 0.2478, "step": 4189 }, { "epoch": 1.5442734727725052, "grad_norm": 6.4856623463264444, "learning_rate": 3.736246754852269e-05, "loss": 0.4322, "step": 4190 }, { "epoch": 1.5446420344605178, "grad_norm": 7.598685964122195, "learning_rate": 3.7359376931635555e-05, "loss": 0.6524, "step": 4191 }, { "epoch": 1.5450105961485303, "grad_norm": 7.279676275496417, "learning_rate": 3.7356286314748426e-05, "loss": 0.5228, "step": 4192 }, { "epoch": 1.545379157836543, "grad_norm": 5.112545604549044, "learning_rate": 3.735319569786129e-05, "loss": 0.4159, "step": 4193 }, { "epoch": 1.5457477195245555, "grad_norm": 7.025354516257636, "learning_rate": 3.735010508097416e-05, "loss": 0.3183, "step": 4194 }, { "epoch": 1.546116281212568, "grad_norm": 7.058419458193367, "learning_rate": 3.734701446408703e-05, "loss": 0.4839, "step": 4195 }, { "epoch": 1.5464848429005804, "grad_norm": 6.21514021036313, "learning_rate": 3.7343923847199904e-05, "loss": 0.448, "step": 4196 }, { "epoch": 1.5468534045885929, "grad_norm": 6.124155414505385, "learning_rate": 3.7340833230312775e-05, "loss": 0.348, "step": 4197 }, { "epoch": 1.5472219662766056, "grad_norm": 5.96806419387124, "learning_rate": 3.733774261342564e-05, "loss": 0.5568, "step": 4198 }, { "epoch": 1.547590527964618, "grad_norm": 5.104959522352114, "learning_rate": 3.733465199653851e-05, "loss": 0.3441, "step": 4199 }, { "epoch": 1.5479590896526307, "grad_norm": 6.6375090345763015, "learning_rate": 3.733156137965138e-05, "loss": 0.4663, "step": 4200 }, { "epoch": 1.5483276513406432, "grad_norm": 4.581404308931327, "learning_rate": 3.732847076276425e-05, "loss": 0.4144, "step": 4201 }, { "epoch": 1.5486962130286557, "grad_norm": 9.05676761724908, "learning_rate": 3.732538014587712e-05, "loss": 0.6596, "step": 4202 }, { "epoch": 1.5490647747166681, "grad_norm": 4.666771796722607, "learning_rate": 3.732228952898999e-05, "loss": 0.3216, "step": 4203 }, { "epoch": 1.5494333364046806, "grad_norm": 7.791268758422263, "learning_rate": 3.731919891210286e-05, "loss": 0.6576, "step": 4204 }, { "epoch": 1.5498018980926933, "grad_norm": 10.934662717929749, "learning_rate": 3.731610829521573e-05, "loss": 0.4074, "step": 4205 }, { "epoch": 1.5501704597807058, "grad_norm": 6.289523469131028, "learning_rate": 3.7313017678328595e-05, "loss": 0.4727, "step": 4206 }, { "epoch": 1.5505390214687185, "grad_norm": 9.323637785629447, "learning_rate": 3.730992706144146e-05, "loss": 0.6238, "step": 4207 }, { "epoch": 1.550907583156731, "grad_norm": 5.083660407044836, "learning_rate": 3.730683644455433e-05, "loss": 0.3564, "step": 4208 }, { "epoch": 1.5512761448447434, "grad_norm": 7.347997597649877, "learning_rate": 3.73037458276672e-05, "loss": 0.4604, "step": 4209 }, { "epoch": 1.5516447065327559, "grad_norm": 6.391405426349873, "learning_rate": 3.7300655210780073e-05, "loss": 0.415, "step": 4210 }, { "epoch": 1.5520132682207683, "grad_norm": 4.603255007604843, "learning_rate": 3.7297564593892945e-05, "loss": 0.2506, "step": 4211 }, { "epoch": 1.5523818299087808, "grad_norm": 8.024659298708672, "learning_rate": 3.729447397700581e-05, "loss": 0.4539, "step": 4212 }, { "epoch": 1.5527503915967935, "grad_norm": 9.365274903783524, "learning_rate": 3.729138336011868e-05, "loss": 0.4007, "step": 4213 }, { "epoch": 1.5531189532848062, "grad_norm": 9.541600345120104, "learning_rate": 3.728829274323155e-05, "loss": 0.4359, "step": 4214 }, { "epoch": 1.5534875149728187, "grad_norm": 6.330800938849977, "learning_rate": 3.728520212634442e-05, "loss": 0.4956, "step": 4215 }, { "epoch": 1.5538560766608311, "grad_norm": 5.387322428668944, "learning_rate": 3.728211150945729e-05, "loss": 0.4131, "step": 4216 }, { "epoch": 1.5542246383488436, "grad_norm": 7.956748871247456, "learning_rate": 3.727902089257016e-05, "loss": 0.3292, "step": 4217 }, { "epoch": 1.554593200036856, "grad_norm": 8.405236619186383, "learning_rate": 3.727593027568303e-05, "loss": 0.3971, "step": 4218 }, { "epoch": 1.5549617617248686, "grad_norm": 5.694458446123644, "learning_rate": 3.72728396587959e-05, "loss": 0.4184, "step": 4219 }, { "epoch": 1.5553303234128812, "grad_norm": 9.789747550600689, "learning_rate": 3.726974904190877e-05, "loss": 0.4381, "step": 4220 }, { "epoch": 1.5556988851008937, "grad_norm": 6.154445635499778, "learning_rate": 3.7266658425021636e-05, "loss": 0.5871, "step": 4221 }, { "epoch": 1.5560674467889064, "grad_norm": 5.766065296429233, "learning_rate": 3.72635678081345e-05, "loss": 0.3851, "step": 4222 }, { "epoch": 1.5564360084769189, "grad_norm": 15.722265620147395, "learning_rate": 3.726047719124737e-05, "loss": 0.3479, "step": 4223 }, { "epoch": 1.5568045701649313, "grad_norm": 6.363060035864783, "learning_rate": 3.725738657436024e-05, "loss": 0.6032, "step": 4224 }, { "epoch": 1.5571731318529438, "grad_norm": 5.9012345913502235, "learning_rate": 3.7254295957473114e-05, "loss": 0.3769, "step": 4225 }, { "epoch": 1.5575416935409563, "grad_norm": 20.850303797715544, "learning_rate": 3.725120534058598e-05, "loss": 0.3224, "step": 4226 }, { "epoch": 1.557910255228969, "grad_norm": 11.233309423343858, "learning_rate": 3.724811472369885e-05, "loss": 0.3292, "step": 4227 }, { "epoch": 1.5582788169169814, "grad_norm": 12.168527399896474, "learning_rate": 3.724502410681172e-05, "loss": 0.5826, "step": 4228 }, { "epoch": 1.5586473786049941, "grad_norm": 6.0199555105802265, "learning_rate": 3.724193348992459e-05, "loss": 0.3319, "step": 4229 }, { "epoch": 1.5590159402930066, "grad_norm": 7.7981614024425605, "learning_rate": 3.723884287303746e-05, "loss": 0.5056, "step": 4230 }, { "epoch": 1.559384501981019, "grad_norm": 6.095677154202125, "learning_rate": 3.723575225615033e-05, "loss": 0.4098, "step": 4231 }, { "epoch": 1.5597530636690315, "grad_norm": 6.714745184067888, "learning_rate": 3.72326616392632e-05, "loss": 0.6056, "step": 4232 }, { "epoch": 1.560121625357044, "grad_norm": 7.0959334857149985, "learning_rate": 3.722957102237607e-05, "loss": 0.3914, "step": 4233 }, { "epoch": 1.5604901870450567, "grad_norm": 6.37636476755571, "learning_rate": 3.722648040548894e-05, "loss": 0.2963, "step": 4234 }, { "epoch": 1.5608587487330692, "grad_norm": 3.921909772863151, "learning_rate": 3.7223389788601806e-05, "loss": 0.249, "step": 4235 }, { "epoch": 1.5612273104210819, "grad_norm": 6.145569497172746, "learning_rate": 3.722029917171468e-05, "loss": 0.5358, "step": 4236 }, { "epoch": 1.5615958721090943, "grad_norm": 6.943811243647277, "learning_rate": 3.721720855482754e-05, "loss": 0.4191, "step": 4237 }, { "epoch": 1.5619644337971068, "grad_norm": 6.619540286049539, "learning_rate": 3.721411793794041e-05, "loss": 0.4627, "step": 4238 }, { "epoch": 1.5623329954851193, "grad_norm": 6.180504128841036, "learning_rate": 3.7211027321053284e-05, "loss": 0.5571, "step": 4239 }, { "epoch": 1.5627015571731318, "grad_norm": 8.260184734948048, "learning_rate": 3.7207936704166155e-05, "loss": 0.4963, "step": 4240 }, { "epoch": 1.5630701188611442, "grad_norm": 8.33012277427952, "learning_rate": 3.720484608727902e-05, "loss": 0.4498, "step": 4241 }, { "epoch": 1.563438680549157, "grad_norm": 7.897508938293132, "learning_rate": 3.720175547039189e-05, "loss": 0.6318, "step": 4242 }, { "epoch": 1.5638072422371696, "grad_norm": 6.466491599842982, "learning_rate": 3.719866485350476e-05, "loss": 0.3757, "step": 4243 }, { "epoch": 1.564175803925182, "grad_norm": 7.573509084694956, "learning_rate": 3.719557423661763e-05, "loss": 0.5332, "step": 4244 }, { "epoch": 1.5645443656131945, "grad_norm": 14.212697243642852, "learning_rate": 3.71924836197305e-05, "loss": 0.6185, "step": 4245 }, { "epoch": 1.564912927301207, "grad_norm": 10.31061106634924, "learning_rate": 3.718939300284337e-05, "loss": 0.5428, "step": 4246 }, { "epoch": 1.5652814889892195, "grad_norm": 5.93991007829927, "learning_rate": 3.718630238595624e-05, "loss": 0.4086, "step": 4247 }, { "epoch": 1.565650050677232, "grad_norm": 7.355209694472225, "learning_rate": 3.718321176906911e-05, "loss": 0.4464, "step": 4248 }, { "epoch": 1.5660186123652446, "grad_norm": 5.0808458756403665, "learning_rate": 3.718012115218198e-05, "loss": 0.3906, "step": 4249 }, { "epoch": 1.5663871740532571, "grad_norm": 7.118144166803208, "learning_rate": 3.7177030535294847e-05, "loss": 0.4411, "step": 4250 }, { "epoch": 1.5667557357412698, "grad_norm": 5.0345775902096115, "learning_rate": 3.717393991840772e-05, "loss": 0.4095, "step": 4251 }, { "epoch": 1.5671242974292823, "grad_norm": 4.903443912907667, "learning_rate": 3.717084930152058e-05, "loss": 0.4616, "step": 4252 }, { "epoch": 1.5674928591172947, "grad_norm": 9.942081954566085, "learning_rate": 3.716775868463345e-05, "loss": 0.5834, "step": 4253 }, { "epoch": 1.5678614208053072, "grad_norm": 11.460596663302756, "learning_rate": 3.7164668067746325e-05, "loss": 0.2906, "step": 4254 }, { "epoch": 1.5682299824933197, "grad_norm": 5.816335089704893, "learning_rate": 3.716157745085919e-05, "loss": 0.3994, "step": 4255 }, { "epoch": 1.5685985441813324, "grad_norm": 4.887693654001966, "learning_rate": 3.715848683397206e-05, "loss": 0.3724, "step": 4256 }, { "epoch": 1.5689671058693448, "grad_norm": 5.275373270288003, "learning_rate": 3.715539621708493e-05, "loss": 0.4626, "step": 4257 }, { "epoch": 1.5693356675573575, "grad_norm": 6.4692698099383135, "learning_rate": 3.71523056001978e-05, "loss": 0.4012, "step": 4258 }, { "epoch": 1.56970422924537, "grad_norm": 5.003580337381119, "learning_rate": 3.7149214983310674e-05, "loss": 0.425, "step": 4259 }, { "epoch": 1.5700727909333825, "grad_norm": 7.166483263138193, "learning_rate": 3.714612436642354e-05, "loss": 0.3451, "step": 4260 }, { "epoch": 1.570441352621395, "grad_norm": 7.1897547626719005, "learning_rate": 3.714303374953641e-05, "loss": 0.4621, "step": 4261 }, { "epoch": 1.5708099143094074, "grad_norm": 10.48371860137855, "learning_rate": 3.713994313264928e-05, "loss": 0.5971, "step": 4262 }, { "epoch": 1.57117847599742, "grad_norm": 5.315430674289133, "learning_rate": 3.713685251576215e-05, "loss": 0.3225, "step": 4263 }, { "epoch": 1.5715470376854326, "grad_norm": 8.714593368209568, "learning_rate": 3.7133761898875016e-05, "loss": 0.484, "step": 4264 }, { "epoch": 1.5719155993734453, "grad_norm": 4.461104675474126, "learning_rate": 3.713067128198789e-05, "loss": 0.3495, "step": 4265 }, { "epoch": 1.5722841610614577, "grad_norm": 3.9425837078412123, "learning_rate": 3.712758066510076e-05, "loss": 0.2843, "step": 4266 }, { "epoch": 1.5726527227494702, "grad_norm": 5.804411479810259, "learning_rate": 3.712449004821362e-05, "loss": 0.5489, "step": 4267 }, { "epoch": 1.5730212844374827, "grad_norm": 7.932403610418107, "learning_rate": 3.7121399431326494e-05, "loss": 0.4882, "step": 4268 }, { "epoch": 1.5733898461254952, "grad_norm": 5.596678239099664, "learning_rate": 3.7118308814439365e-05, "loss": 0.3559, "step": 4269 }, { "epoch": 1.5737584078135076, "grad_norm": 6.362843010730283, "learning_rate": 3.711521819755223e-05, "loss": 0.3189, "step": 4270 }, { "epoch": 1.5741269695015203, "grad_norm": 6.654502101876364, "learning_rate": 3.71121275806651e-05, "loss": 0.4544, "step": 4271 }, { "epoch": 1.574495531189533, "grad_norm": 5.017553606340931, "learning_rate": 3.710903696377797e-05, "loss": 0.3537, "step": 4272 }, { "epoch": 1.5748640928775455, "grad_norm": 6.511158095802555, "learning_rate": 3.710594634689084e-05, "loss": 0.5512, "step": 4273 }, { "epoch": 1.575232654565558, "grad_norm": 8.056351557314208, "learning_rate": 3.710285573000371e-05, "loss": 0.6053, "step": 4274 }, { "epoch": 1.5756012162535704, "grad_norm": 6.384713943753055, "learning_rate": 3.709976511311658e-05, "loss": 0.5367, "step": 4275 }, { "epoch": 1.5759697779415829, "grad_norm": 8.439465103193884, "learning_rate": 3.709667449622945e-05, "loss": 0.6442, "step": 4276 }, { "epoch": 1.5763383396295954, "grad_norm": 5.784859185380205, "learning_rate": 3.709358387934232e-05, "loss": 0.4532, "step": 4277 }, { "epoch": 1.576706901317608, "grad_norm": 6.076088689261014, "learning_rate": 3.709049326245519e-05, "loss": 0.3892, "step": 4278 }, { "epoch": 1.5770754630056205, "grad_norm": 6.510192509161896, "learning_rate": 3.708740264556806e-05, "loss": 0.4387, "step": 4279 }, { "epoch": 1.5774440246936332, "grad_norm": 4.7424927665080165, "learning_rate": 3.708431202868093e-05, "loss": 0.5222, "step": 4280 }, { "epoch": 1.5778125863816457, "grad_norm": 6.094742757674565, "learning_rate": 3.70812214117938e-05, "loss": 0.3713, "step": 4281 }, { "epoch": 1.5781811480696581, "grad_norm": 6.114150422436759, "learning_rate": 3.7078130794906664e-05, "loss": 0.5268, "step": 4282 }, { "epoch": 1.5785497097576706, "grad_norm": 9.238176779932562, "learning_rate": 3.7075040178019535e-05, "loss": 0.3781, "step": 4283 }, { "epoch": 1.578918271445683, "grad_norm": 8.004406669019689, "learning_rate": 3.70719495611324e-05, "loss": 0.4055, "step": 4284 }, { "epoch": 1.5792868331336958, "grad_norm": 12.119294072608328, "learning_rate": 3.706885894424527e-05, "loss": 0.5041, "step": 4285 }, { "epoch": 1.5796553948217082, "grad_norm": 8.121264259122707, "learning_rate": 3.706576832735814e-05, "loss": 0.5351, "step": 4286 }, { "epoch": 1.580023956509721, "grad_norm": 9.17971866683539, "learning_rate": 3.706267771047101e-05, "loss": 0.4885, "step": 4287 }, { "epoch": 1.5803925181977334, "grad_norm": 6.9961182867032035, "learning_rate": 3.7059587093583884e-05, "loss": 0.5503, "step": 4288 }, { "epoch": 1.5807610798857459, "grad_norm": 4.630010210854, "learning_rate": 3.705649647669675e-05, "loss": 0.3305, "step": 4289 }, { "epoch": 1.5811296415737583, "grad_norm": 8.279576315981874, "learning_rate": 3.705340585980962e-05, "loss": 0.6159, "step": 4290 }, { "epoch": 1.5814982032617708, "grad_norm": 5.101942717095654, "learning_rate": 3.705031524292249e-05, "loss": 0.3127, "step": 4291 }, { "epoch": 1.5818667649497835, "grad_norm": 6.4273026077068405, "learning_rate": 3.704722462603536e-05, "loss": 0.4994, "step": 4292 }, { "epoch": 1.582235326637796, "grad_norm": 6.463261879258263, "learning_rate": 3.7044134009148226e-05, "loss": 0.5281, "step": 4293 }, { "epoch": 1.5826038883258087, "grad_norm": 5.200358070369635, "learning_rate": 3.70410433922611e-05, "loss": 0.5005, "step": 4294 }, { "epoch": 1.5829724500138211, "grad_norm": 6.065226777342765, "learning_rate": 3.703795277537397e-05, "loss": 0.4468, "step": 4295 }, { "epoch": 1.5833410117018336, "grad_norm": 5.720185646832498, "learning_rate": 3.703486215848684e-05, "loss": 0.3534, "step": 4296 }, { "epoch": 1.583709573389846, "grad_norm": 5.998912712763169, "learning_rate": 3.7031771541599704e-05, "loss": 0.5565, "step": 4297 }, { "epoch": 1.5840781350778586, "grad_norm": 13.857783453043467, "learning_rate": 3.702868092471257e-05, "loss": 0.3904, "step": 4298 }, { "epoch": 1.5844466967658712, "grad_norm": 4.828287560155236, "learning_rate": 3.702559030782544e-05, "loss": 0.4076, "step": 4299 }, { "epoch": 1.5848152584538837, "grad_norm": 4.151935833679537, "learning_rate": 3.702249969093831e-05, "loss": 0.374, "step": 4300 }, { "epoch": 1.5851838201418964, "grad_norm": 6.782225929738091, "learning_rate": 3.701940907405118e-05, "loss": 0.4826, "step": 4301 }, { "epoch": 1.5855523818299089, "grad_norm": 6.333835916487906, "learning_rate": 3.7016318457164054e-05, "loss": 0.539, "step": 4302 }, { "epoch": 1.5859209435179213, "grad_norm": 5.411906728319988, "learning_rate": 3.701322784027692e-05, "loss": 0.5164, "step": 4303 }, { "epoch": 1.5862895052059338, "grad_norm": 6.186736088615368, "learning_rate": 3.701013722338979e-05, "loss": 0.6223, "step": 4304 }, { "epoch": 1.5866580668939463, "grad_norm": 14.92903184474094, "learning_rate": 3.700704660650266e-05, "loss": 0.6531, "step": 4305 }, { "epoch": 1.5870266285819588, "grad_norm": 4.395126478786062, "learning_rate": 3.700395598961553e-05, "loss": 0.2393, "step": 4306 }, { "epoch": 1.5873951902699714, "grad_norm": 7.321884523874782, "learning_rate": 3.7000865372728396e-05, "loss": 0.4197, "step": 4307 }, { "epoch": 1.587763751957984, "grad_norm": 5.996479909185849, "learning_rate": 3.699777475584127e-05, "loss": 0.3221, "step": 4308 }, { "epoch": 1.5881323136459966, "grad_norm": 5.03359077923628, "learning_rate": 3.699468413895414e-05, "loss": 0.4215, "step": 4309 }, { "epoch": 1.588500875334009, "grad_norm": 4.646592333366464, "learning_rate": 3.699159352206701e-05, "loss": 0.3795, "step": 4310 }, { "epoch": 1.5888694370220215, "grad_norm": 9.524744834385656, "learning_rate": 3.698850290517988e-05, "loss": 0.6348, "step": 4311 }, { "epoch": 1.589237998710034, "grad_norm": 7.165975167676746, "learning_rate": 3.6985412288292745e-05, "loss": 0.3771, "step": 4312 }, { "epoch": 1.5896065603980465, "grad_norm": 4.913641737252555, "learning_rate": 3.698232167140561e-05, "loss": 0.3594, "step": 4313 }, { "epoch": 1.5899751220860592, "grad_norm": 5.901630512519301, "learning_rate": 3.697923105451848e-05, "loss": 0.4507, "step": 4314 }, { "epoch": 1.5903436837740716, "grad_norm": 7.608722509964638, "learning_rate": 3.697614043763135e-05, "loss": 0.3226, "step": 4315 }, { "epoch": 1.5907122454620843, "grad_norm": 8.595624351703922, "learning_rate": 3.697304982074422e-05, "loss": 0.4384, "step": 4316 }, { "epoch": 1.5910808071500968, "grad_norm": 6.090231168020771, "learning_rate": 3.696995920385709e-05, "loss": 0.4599, "step": 4317 }, { "epoch": 1.5914493688381093, "grad_norm": 6.109585107180746, "learning_rate": 3.696686858696996e-05, "loss": 0.415, "step": 4318 }, { "epoch": 1.5918179305261217, "grad_norm": 8.08648614910989, "learning_rate": 3.696377797008283e-05, "loss": 0.6435, "step": 4319 }, { "epoch": 1.5921864922141342, "grad_norm": 7.346160363245518, "learning_rate": 3.69606873531957e-05, "loss": 0.4747, "step": 4320 }, { "epoch": 1.592555053902147, "grad_norm": 5.321796675154413, "learning_rate": 3.695759673630857e-05, "loss": 0.2399, "step": 4321 }, { "epoch": 1.5929236155901594, "grad_norm": 6.829828185214528, "learning_rate": 3.695450611942144e-05, "loss": 0.5251, "step": 4322 }, { "epoch": 1.593292177278172, "grad_norm": 5.623016177137029, "learning_rate": 3.695141550253431e-05, "loss": 0.2992, "step": 4323 }, { "epoch": 1.5936607389661845, "grad_norm": 5.9467008275531965, "learning_rate": 3.694832488564718e-05, "loss": 0.3608, "step": 4324 }, { "epoch": 1.594029300654197, "grad_norm": 6.993568735782138, "learning_rate": 3.694523426876005e-05, "loss": 0.5029, "step": 4325 }, { "epoch": 1.5943978623422095, "grad_norm": 7.231502217977485, "learning_rate": 3.6942143651872915e-05, "loss": 0.4594, "step": 4326 }, { "epoch": 1.594766424030222, "grad_norm": 5.523026554947944, "learning_rate": 3.693905303498578e-05, "loss": 0.3454, "step": 4327 }, { "epoch": 1.5951349857182346, "grad_norm": 11.072634779798207, "learning_rate": 3.693596241809865e-05, "loss": 0.5935, "step": 4328 }, { "epoch": 1.5955035474062471, "grad_norm": 6.294304031791673, "learning_rate": 3.693287180121152e-05, "loss": 0.4434, "step": 4329 }, { "epoch": 1.5958721090942598, "grad_norm": 11.513514000454938, "learning_rate": 3.692978118432439e-05, "loss": 0.5363, "step": 4330 }, { "epoch": 1.5962406707822723, "grad_norm": 3.6473572661387954, "learning_rate": 3.6926690567437264e-05, "loss": 0.2263, "step": 4331 }, { "epoch": 1.5966092324702847, "grad_norm": 5.81124601886274, "learning_rate": 3.692359995055013e-05, "loss": 0.4508, "step": 4332 }, { "epoch": 1.5969777941582972, "grad_norm": 5.544844822685844, "learning_rate": 3.6920509333663e-05, "loss": 0.5127, "step": 4333 }, { "epoch": 1.5973463558463097, "grad_norm": 7.107333047243079, "learning_rate": 3.691741871677587e-05, "loss": 0.5176, "step": 4334 }, { "epoch": 1.5977149175343222, "grad_norm": 5.563237173545731, "learning_rate": 3.691432809988874e-05, "loss": 0.2178, "step": 4335 }, { "epoch": 1.5980834792223348, "grad_norm": 7.970392753467752, "learning_rate": 3.6911237483001606e-05, "loss": 0.5282, "step": 4336 }, { "epoch": 1.5984520409103473, "grad_norm": 6.508655946805509, "learning_rate": 3.690814686611448e-05, "loss": 0.5078, "step": 4337 }, { "epoch": 1.59882060259836, "grad_norm": 7.3814531962102015, "learning_rate": 3.690505624922735e-05, "loss": 0.5734, "step": 4338 }, { "epoch": 1.5991891642863725, "grad_norm": 6.1591786810537075, "learning_rate": 3.690196563234022e-05, "loss": 0.5467, "step": 4339 }, { "epoch": 1.599557725974385, "grad_norm": 7.363539244485502, "learning_rate": 3.689887501545309e-05, "loss": 0.4122, "step": 4340 }, { "epoch": 1.5999262876623974, "grad_norm": 4.229965334991469, "learning_rate": 3.6895784398565956e-05, "loss": 0.28, "step": 4341 }, { "epoch": 1.6002948493504099, "grad_norm": 7.64430993587727, "learning_rate": 3.689269378167883e-05, "loss": 0.4571, "step": 4342 }, { "epoch": 1.6006634110384226, "grad_norm": 10.515905050598503, "learning_rate": 3.688960316479169e-05, "loss": 0.436, "step": 4343 }, { "epoch": 1.601031972726435, "grad_norm": 7.403319732822989, "learning_rate": 3.688651254790456e-05, "loss": 0.579, "step": 4344 }, { "epoch": 1.6014005344144477, "grad_norm": 4.968230214311968, "learning_rate": 3.6883421931017434e-05, "loss": 0.4011, "step": 4345 }, { "epoch": 1.6017690961024602, "grad_norm": 7.579983764470008, "learning_rate": 3.68803313141303e-05, "loss": 0.4059, "step": 4346 }, { "epoch": 1.6021376577904727, "grad_norm": 5.862610760193543, "learning_rate": 3.687724069724317e-05, "loss": 0.5157, "step": 4347 }, { "epoch": 1.6025062194784851, "grad_norm": 6.793203480346413, "learning_rate": 3.687415008035604e-05, "loss": 0.402, "step": 4348 }, { "epoch": 1.6028747811664976, "grad_norm": 14.495681645819666, "learning_rate": 3.687105946346891e-05, "loss": 0.5356, "step": 4349 }, { "epoch": 1.6032433428545103, "grad_norm": 10.61082823832901, "learning_rate": 3.686796884658178e-05, "loss": 0.6078, "step": 4350 }, { "epoch": 1.6036119045425228, "grad_norm": 6.179377609565672, "learning_rate": 3.686487822969465e-05, "loss": 0.27, "step": 4351 }, { "epoch": 1.6039804662305355, "grad_norm": 10.870112230551493, "learning_rate": 3.686178761280752e-05, "loss": 0.4417, "step": 4352 }, { "epoch": 1.604349027918548, "grad_norm": 9.933509265271875, "learning_rate": 3.685869699592039e-05, "loss": 0.6029, "step": 4353 }, { "epoch": 1.6047175896065604, "grad_norm": 7.671846657756965, "learning_rate": 3.685560637903326e-05, "loss": 0.5363, "step": 4354 }, { "epoch": 1.6050861512945729, "grad_norm": 5.905372191092788, "learning_rate": 3.6852515762146125e-05, "loss": 0.4932, "step": 4355 }, { "epoch": 1.6054547129825854, "grad_norm": 12.245910584754597, "learning_rate": 3.6849425145258996e-05, "loss": 0.5904, "step": 4356 }, { "epoch": 1.605823274670598, "grad_norm": 7.0655624366632495, "learning_rate": 3.684633452837187e-05, "loss": 0.3214, "step": 4357 }, { "epoch": 1.6061918363586105, "grad_norm": 5.288249416765524, "learning_rate": 3.684324391148473e-05, "loss": 0.4202, "step": 4358 }, { "epoch": 1.6065603980466232, "grad_norm": 7.07914837645319, "learning_rate": 3.68401532945976e-05, "loss": 0.5694, "step": 4359 }, { "epoch": 1.6069289597346357, "grad_norm": 7.559515750453045, "learning_rate": 3.6837062677710474e-05, "loss": 0.8008, "step": 4360 }, { "epoch": 1.6072975214226481, "grad_norm": 6.940427969893027, "learning_rate": 3.683397206082334e-05, "loss": 0.2721, "step": 4361 }, { "epoch": 1.6076660831106606, "grad_norm": 6.312342877840357, "learning_rate": 3.683088144393621e-05, "loss": 0.324, "step": 4362 }, { "epoch": 1.608034644798673, "grad_norm": 5.10065371493715, "learning_rate": 3.682779082704908e-05, "loss": 0.4371, "step": 4363 }, { "epoch": 1.6084032064866856, "grad_norm": 7.495873905713166, "learning_rate": 3.682470021016195e-05, "loss": 0.4143, "step": 4364 }, { "epoch": 1.6087717681746982, "grad_norm": 7.261421152370906, "learning_rate": 3.682160959327482e-05, "loss": 0.4195, "step": 4365 }, { "epoch": 1.6091403298627107, "grad_norm": 6.0172789046428425, "learning_rate": 3.681851897638769e-05, "loss": 0.4651, "step": 4366 }, { "epoch": 1.6095088915507234, "grad_norm": 6.3588027134235094, "learning_rate": 3.681542835950056e-05, "loss": 0.3891, "step": 4367 }, { "epoch": 1.6098774532387359, "grad_norm": 4.545230318954558, "learning_rate": 3.681233774261343e-05, "loss": 0.3401, "step": 4368 }, { "epoch": 1.6102460149267483, "grad_norm": 10.741647003447827, "learning_rate": 3.68092471257263e-05, "loss": 0.5813, "step": 4369 }, { "epoch": 1.6106145766147608, "grad_norm": 5.148959838117868, "learning_rate": 3.6806156508839166e-05, "loss": 0.4555, "step": 4370 }, { "epoch": 1.6109831383027733, "grad_norm": 5.0527098381941915, "learning_rate": 3.680306589195204e-05, "loss": 0.3265, "step": 4371 }, { "epoch": 1.611351699990786, "grad_norm": 6.418084652597702, "learning_rate": 3.679997527506491e-05, "loss": 0.4602, "step": 4372 }, { "epoch": 1.6117202616787984, "grad_norm": 6.270786965368786, "learning_rate": 3.679688465817777e-05, "loss": 0.4998, "step": 4373 }, { "epoch": 1.6120888233668111, "grad_norm": 5.664141971425234, "learning_rate": 3.6793794041290644e-05, "loss": 0.4288, "step": 4374 }, { "epoch": 1.6124573850548236, "grad_norm": 4.949650361014307, "learning_rate": 3.679070342440351e-05, "loss": 0.4301, "step": 4375 }, { "epoch": 1.612825946742836, "grad_norm": 5.765384824138815, "learning_rate": 3.678761280751638e-05, "loss": 0.4628, "step": 4376 }, { "epoch": 1.6131945084308486, "grad_norm": 6.542169960568798, "learning_rate": 3.678452219062925e-05, "loss": 0.3589, "step": 4377 }, { "epoch": 1.613563070118861, "grad_norm": 8.427954126987258, "learning_rate": 3.678143157374212e-05, "loss": 0.6818, "step": 4378 }, { "epoch": 1.6139316318068737, "grad_norm": 13.253000171651877, "learning_rate": 3.6778340956854986e-05, "loss": 0.504, "step": 4379 }, { "epoch": 1.6143001934948862, "grad_norm": 6.18477933004308, "learning_rate": 3.677525033996786e-05, "loss": 0.4843, "step": 4380 }, { "epoch": 1.6146687551828989, "grad_norm": 5.916320289179766, "learning_rate": 3.677215972308073e-05, "loss": 0.3086, "step": 4381 }, { "epoch": 1.6150373168709113, "grad_norm": 6.33845787860175, "learning_rate": 3.67690691061936e-05, "loss": 0.3705, "step": 4382 }, { "epoch": 1.6154058785589238, "grad_norm": 7.898201324825828, "learning_rate": 3.676597848930647e-05, "loss": 0.4038, "step": 4383 }, { "epoch": 1.6157744402469363, "grad_norm": 9.645429060327617, "learning_rate": 3.6762887872419336e-05, "loss": 0.4446, "step": 4384 }, { "epoch": 1.6161430019349488, "grad_norm": 8.121073081984282, "learning_rate": 3.675979725553221e-05, "loss": 0.3925, "step": 4385 }, { "epoch": 1.6165115636229614, "grad_norm": 9.217683167878924, "learning_rate": 3.675670663864508e-05, "loss": 0.4248, "step": 4386 }, { "epoch": 1.616880125310974, "grad_norm": 5.192877131262652, "learning_rate": 3.675361602175795e-05, "loss": 0.4099, "step": 4387 }, { "epoch": 1.6172486869989866, "grad_norm": 6.980838708194326, "learning_rate": 3.6750525404870814e-05, "loss": 0.2816, "step": 4388 }, { "epoch": 1.617617248686999, "grad_norm": 5.376550849917449, "learning_rate": 3.674743478798368e-05, "loss": 0.3884, "step": 4389 }, { "epoch": 1.6179858103750115, "grad_norm": 6.225144975191999, "learning_rate": 3.674434417109655e-05, "loss": 0.4393, "step": 4390 }, { "epoch": 1.618354372063024, "grad_norm": 5.73910585091594, "learning_rate": 3.674125355420942e-05, "loss": 0.3543, "step": 4391 }, { "epoch": 1.6187229337510365, "grad_norm": 6.027250713191544, "learning_rate": 3.673816293732229e-05, "loss": 0.4436, "step": 4392 }, { "epoch": 1.619091495439049, "grad_norm": 10.545179261826823, "learning_rate": 3.673507232043516e-05, "loss": 0.4568, "step": 4393 }, { "epoch": 1.6194600571270616, "grad_norm": 5.60395181047848, "learning_rate": 3.673198170354803e-05, "loss": 0.3575, "step": 4394 }, { "epoch": 1.6198286188150743, "grad_norm": 12.607729741835525, "learning_rate": 3.67288910866609e-05, "loss": 0.5245, "step": 4395 }, { "epoch": 1.6201971805030868, "grad_norm": 3.495843872005929, "learning_rate": 3.672580046977377e-05, "loss": 0.2022, "step": 4396 }, { "epoch": 1.6205657421910993, "grad_norm": 6.905774958916857, "learning_rate": 3.672270985288664e-05, "loss": 0.4876, "step": 4397 }, { "epoch": 1.6209343038791117, "grad_norm": 8.717452195356183, "learning_rate": 3.6719619235999505e-05, "loss": 0.8467, "step": 4398 }, { "epoch": 1.6213028655671242, "grad_norm": 10.947403440535282, "learning_rate": 3.6716528619112376e-05, "loss": 0.4333, "step": 4399 }, { "epoch": 1.6216714272551367, "grad_norm": 9.542674937274983, "learning_rate": 3.671343800222525e-05, "loss": 0.4978, "step": 4400 }, { "epoch": 1.6220399889431494, "grad_norm": 5.324293263418424, "learning_rate": 3.671034738533812e-05, "loss": 0.2637, "step": 4401 }, { "epoch": 1.6224085506311618, "grad_norm": 6.468929269858923, "learning_rate": 3.670725676845099e-05, "loss": 0.5202, "step": 4402 }, { "epoch": 1.6227771123191745, "grad_norm": 7.386229677919613, "learning_rate": 3.6704166151563854e-05, "loss": 0.5128, "step": 4403 }, { "epoch": 1.623145674007187, "grad_norm": 7.647863411326119, "learning_rate": 3.670107553467672e-05, "loss": 0.5694, "step": 4404 }, { "epoch": 1.6235142356951995, "grad_norm": 5.67955988723557, "learning_rate": 3.669798491778959e-05, "loss": 0.408, "step": 4405 }, { "epoch": 1.623882797383212, "grad_norm": 8.958555801349815, "learning_rate": 3.669489430090246e-05, "loss": 0.3741, "step": 4406 }, { "epoch": 1.6242513590712244, "grad_norm": 6.444097304951532, "learning_rate": 3.669180368401533e-05, "loss": 0.4246, "step": 4407 }, { "epoch": 1.6246199207592371, "grad_norm": 6.897188758367766, "learning_rate": 3.66887130671282e-05, "loss": 0.4173, "step": 4408 }, { "epoch": 1.6249884824472496, "grad_norm": 7.114647555491922, "learning_rate": 3.668562245024107e-05, "loss": 0.613, "step": 4409 }, { "epoch": 1.6253570441352623, "grad_norm": 5.868923534878602, "learning_rate": 3.668253183335394e-05, "loss": 0.2888, "step": 4410 }, { "epoch": 1.6257256058232747, "grad_norm": 5.881705171036214, "learning_rate": 3.667944121646681e-05, "loss": 0.4102, "step": 4411 }, { "epoch": 1.6260941675112872, "grad_norm": 5.956110648639588, "learning_rate": 3.667635059957968e-05, "loss": 0.3395, "step": 4412 }, { "epoch": 1.6264627291992997, "grad_norm": 5.1111760596050955, "learning_rate": 3.6673259982692546e-05, "loss": 0.3263, "step": 4413 }, { "epoch": 1.6268312908873122, "grad_norm": 9.12386074531513, "learning_rate": 3.667016936580542e-05, "loss": 0.5776, "step": 4414 }, { "epoch": 1.6271998525753248, "grad_norm": 5.859560543937268, "learning_rate": 3.666707874891829e-05, "loss": 0.5534, "step": 4415 }, { "epoch": 1.6275684142633373, "grad_norm": 4.103878391940356, "learning_rate": 3.666398813203116e-05, "loss": 0.2296, "step": 4416 }, { "epoch": 1.62793697595135, "grad_norm": 7.106289573431753, "learning_rate": 3.6660897515144024e-05, "loss": 0.5084, "step": 4417 }, { "epoch": 1.6283055376393625, "grad_norm": 11.086031095447042, "learning_rate": 3.665780689825689e-05, "loss": 0.4896, "step": 4418 }, { "epoch": 1.628674099327375, "grad_norm": 8.554029298308963, "learning_rate": 3.665471628136976e-05, "loss": 0.4507, "step": 4419 }, { "epoch": 1.6290426610153874, "grad_norm": 10.56041954363083, "learning_rate": 3.665162566448263e-05, "loss": 0.4762, "step": 4420 }, { "epoch": 1.6294112227033999, "grad_norm": 4.486194201717904, "learning_rate": 3.66485350475955e-05, "loss": 0.3752, "step": 4421 }, { "epoch": 1.6297797843914124, "grad_norm": 8.189059276467358, "learning_rate": 3.664544443070837e-05, "loss": 0.4293, "step": 4422 }, { "epoch": 1.630148346079425, "grad_norm": 5.129258596225018, "learning_rate": 3.664235381382124e-05, "loss": 0.3234, "step": 4423 }, { "epoch": 1.6305169077674377, "grad_norm": 5.944921733831886, "learning_rate": 3.663926319693411e-05, "loss": 0.4822, "step": 4424 }, { "epoch": 1.6308854694554502, "grad_norm": 6.688131427367893, "learning_rate": 3.663617258004698e-05, "loss": 0.4469, "step": 4425 }, { "epoch": 1.6312540311434627, "grad_norm": 6.028684080152256, "learning_rate": 3.663308196315985e-05, "loss": 0.3339, "step": 4426 }, { "epoch": 1.6316225928314751, "grad_norm": 6.415502797712813, "learning_rate": 3.6629991346272716e-05, "loss": 0.3419, "step": 4427 }, { "epoch": 1.6319911545194876, "grad_norm": 9.279192661157063, "learning_rate": 3.662690072938559e-05, "loss": 0.4751, "step": 4428 }, { "epoch": 1.6323597162075, "grad_norm": 5.672457410266653, "learning_rate": 3.662381011249846e-05, "loss": 0.3221, "step": 4429 }, { "epoch": 1.6327282778955128, "grad_norm": 7.919267514968339, "learning_rate": 3.662071949561133e-05, "loss": 0.3378, "step": 4430 }, { "epoch": 1.6330968395835253, "grad_norm": 6.905662822359146, "learning_rate": 3.66176288787242e-05, "loss": 0.4326, "step": 4431 }, { "epoch": 1.633465401271538, "grad_norm": 8.837116031317942, "learning_rate": 3.6614538261837065e-05, "loss": 0.5231, "step": 4432 }, { "epoch": 1.6338339629595504, "grad_norm": 10.011483655916129, "learning_rate": 3.661144764494993e-05, "loss": 0.3663, "step": 4433 }, { "epoch": 1.6342025246475629, "grad_norm": 4.555601365290698, "learning_rate": 3.66083570280628e-05, "loss": 0.281, "step": 4434 }, { "epoch": 1.6345710863355754, "grad_norm": 8.01269668123465, "learning_rate": 3.660526641117567e-05, "loss": 0.4492, "step": 4435 }, { "epoch": 1.6349396480235878, "grad_norm": 7.110025028232996, "learning_rate": 3.660217579428854e-05, "loss": 0.3927, "step": 4436 }, { "epoch": 1.6353082097116005, "grad_norm": 8.480711200328596, "learning_rate": 3.659908517740141e-05, "loss": 0.7328, "step": 4437 }, { "epoch": 1.635676771399613, "grad_norm": 10.983379639068223, "learning_rate": 3.659599456051428e-05, "loss": 0.6235, "step": 4438 }, { "epoch": 1.6360453330876257, "grad_norm": 8.31724149237477, "learning_rate": 3.659290394362715e-05, "loss": 0.5144, "step": 4439 }, { "epoch": 1.6364138947756381, "grad_norm": 6.216822953428652, "learning_rate": 3.658981332674002e-05, "loss": 0.4242, "step": 4440 }, { "epoch": 1.6367824564636506, "grad_norm": 7.754052855981791, "learning_rate": 3.658672270985289e-05, "loss": 0.4662, "step": 4441 }, { "epoch": 1.637151018151663, "grad_norm": 9.155084633469867, "learning_rate": 3.6583632092965756e-05, "loss": 0.5705, "step": 4442 }, { "epoch": 1.6375195798396756, "grad_norm": 7.7876151527965485, "learning_rate": 3.658054147607863e-05, "loss": 0.3426, "step": 4443 }, { "epoch": 1.6378881415276882, "grad_norm": 19.91303955419939, "learning_rate": 3.65774508591915e-05, "loss": 0.3642, "step": 4444 }, { "epoch": 1.6382567032157007, "grad_norm": 6.126812413709367, "learning_rate": 3.657436024230437e-05, "loss": 0.4386, "step": 4445 }, { "epoch": 1.6386252649037134, "grad_norm": 6.4024709580674, "learning_rate": 3.6571269625417234e-05, "loss": 0.363, "step": 4446 }, { "epoch": 1.6389938265917259, "grad_norm": 6.340989888889281, "learning_rate": 3.6568179008530105e-05, "loss": 0.3332, "step": 4447 }, { "epoch": 1.6393623882797383, "grad_norm": 5.360709777638439, "learning_rate": 3.656508839164297e-05, "loss": 0.3402, "step": 4448 }, { "epoch": 1.6397309499677508, "grad_norm": 7.368982996342889, "learning_rate": 3.656199777475584e-05, "loss": 0.3944, "step": 4449 }, { "epoch": 1.6400995116557633, "grad_norm": 7.432162381076681, "learning_rate": 3.655890715786871e-05, "loss": 0.3806, "step": 4450 }, { "epoch": 1.6404680733437758, "grad_norm": 14.560389549996808, "learning_rate": 3.655581654098158e-05, "loss": 0.4859, "step": 4451 }, { "epoch": 1.6408366350317884, "grad_norm": 5.738243357388503, "learning_rate": 3.655272592409445e-05, "loss": 0.4264, "step": 4452 }, { "epoch": 1.6412051967198011, "grad_norm": 5.556858931344337, "learning_rate": 3.654963530720732e-05, "loss": 0.5102, "step": 4453 }, { "epoch": 1.6415737584078136, "grad_norm": 8.856785956017179, "learning_rate": 3.654654469032019e-05, "loss": 0.572, "step": 4454 }, { "epoch": 1.641942320095826, "grad_norm": 8.247721964429878, "learning_rate": 3.654345407343306e-05, "loss": 0.592, "step": 4455 }, { "epoch": 1.6423108817838385, "grad_norm": 6.084693477937815, "learning_rate": 3.6540363456545926e-05, "loss": 0.422, "step": 4456 }, { "epoch": 1.642679443471851, "grad_norm": 9.693910114631086, "learning_rate": 3.65372728396588e-05, "loss": 0.4494, "step": 4457 }, { "epoch": 1.6430480051598635, "grad_norm": 5.4886979240183225, "learning_rate": 3.653418222277167e-05, "loss": 0.4227, "step": 4458 }, { "epoch": 1.6434165668478762, "grad_norm": 6.81474729096013, "learning_rate": 3.653109160588454e-05, "loss": 0.6932, "step": 4459 }, { "epoch": 1.6437851285358887, "grad_norm": 7.9397690300774215, "learning_rate": 3.652800098899741e-05, "loss": 0.4983, "step": 4460 }, { "epoch": 1.6441536902239013, "grad_norm": 5.598584180055435, "learning_rate": 3.6524910372110275e-05, "loss": 0.4048, "step": 4461 }, { "epoch": 1.6445222519119138, "grad_norm": 6.12263411936156, "learning_rate": 3.6521819755223146e-05, "loss": 0.4881, "step": 4462 }, { "epoch": 1.6448908135999263, "grad_norm": 5.258292415759956, "learning_rate": 3.651872913833602e-05, "loss": 0.4914, "step": 4463 }, { "epoch": 1.6452593752879388, "grad_norm": 6.155158086461788, "learning_rate": 3.651563852144888e-05, "loss": 0.3775, "step": 4464 }, { "epoch": 1.6456279369759512, "grad_norm": 4.822268393706172, "learning_rate": 3.651254790456175e-05, "loss": 0.2901, "step": 4465 }, { "epoch": 1.645996498663964, "grad_norm": 12.109962813354148, "learning_rate": 3.650945728767462e-05, "loss": 0.3798, "step": 4466 }, { "epoch": 1.6463650603519764, "grad_norm": 7.154265415915, "learning_rate": 3.650636667078749e-05, "loss": 0.3652, "step": 4467 }, { "epoch": 1.646733622039989, "grad_norm": 5.773462277412158, "learning_rate": 3.650327605390036e-05, "loss": 0.4697, "step": 4468 }, { "epoch": 1.6471021837280015, "grad_norm": 6.322265926687512, "learning_rate": 3.650018543701323e-05, "loss": 0.6014, "step": 4469 }, { "epoch": 1.647470745416014, "grad_norm": 3.9230211262597194, "learning_rate": 3.6497094820126095e-05, "loss": 0.2958, "step": 4470 }, { "epoch": 1.6478393071040265, "grad_norm": 12.352633522113438, "learning_rate": 3.649400420323897e-05, "loss": 0.6304, "step": 4471 }, { "epoch": 1.648207868792039, "grad_norm": 8.367070329916064, "learning_rate": 3.649091358635184e-05, "loss": 0.4389, "step": 4472 }, { "epoch": 1.6485764304800516, "grad_norm": 6.448033279901007, "learning_rate": 3.648782296946471e-05, "loss": 0.3746, "step": 4473 }, { "epoch": 1.6489449921680641, "grad_norm": 6.522428697259929, "learning_rate": 3.648473235257758e-05, "loss": 0.4754, "step": 4474 }, { "epoch": 1.6493135538560768, "grad_norm": 4.2384398821839735, "learning_rate": 3.6481641735690445e-05, "loss": 0.3438, "step": 4475 }, { "epoch": 1.6496821155440893, "grad_norm": 6.817313917003437, "learning_rate": 3.6478551118803316e-05, "loss": 0.3276, "step": 4476 }, { "epoch": 1.6500506772321017, "grad_norm": 7.467666611244721, "learning_rate": 3.647546050191619e-05, "loss": 0.4702, "step": 4477 }, { "epoch": 1.6504192389201142, "grad_norm": 9.384670305512286, "learning_rate": 3.647236988502906e-05, "loss": 0.8005, "step": 4478 }, { "epoch": 1.6507878006081267, "grad_norm": 4.560021341173527, "learning_rate": 3.646927926814192e-05, "loss": 0.2295, "step": 4479 }, { "epoch": 1.6511563622961392, "grad_norm": 5.7413759657190475, "learning_rate": 3.646618865125479e-05, "loss": 0.3449, "step": 4480 }, { "epoch": 1.6515249239841518, "grad_norm": 7.885600871895549, "learning_rate": 3.646309803436766e-05, "loss": 0.4115, "step": 4481 }, { "epoch": 1.6518934856721645, "grad_norm": 3.8927550096225434, "learning_rate": 3.646000741748053e-05, "loss": 0.3005, "step": 4482 }, { "epoch": 1.652262047360177, "grad_norm": 4.8516753301845625, "learning_rate": 3.64569168005934e-05, "loss": 0.4408, "step": 4483 }, { "epoch": 1.6526306090481895, "grad_norm": 6.188078882130127, "learning_rate": 3.645382618370627e-05, "loss": 0.3897, "step": 4484 }, { "epoch": 1.652999170736202, "grad_norm": 9.33297261040565, "learning_rate": 3.6450735566819136e-05, "loss": 0.601, "step": 4485 }, { "epoch": 1.6533677324242144, "grad_norm": 7.414637970372488, "learning_rate": 3.644764494993201e-05, "loss": 0.4474, "step": 4486 }, { "epoch": 1.653736294112227, "grad_norm": 23.78071359036956, "learning_rate": 3.644455433304488e-05, "loss": 0.5746, "step": 4487 }, { "epoch": 1.6541048558002396, "grad_norm": 6.020680708309174, "learning_rate": 3.644146371615775e-05, "loss": 0.5289, "step": 4488 }, { "epoch": 1.654473417488252, "grad_norm": 6.838914809223922, "learning_rate": 3.6438373099270614e-05, "loss": 0.5562, "step": 4489 }, { "epoch": 1.6548419791762647, "grad_norm": 7.109079878833323, "learning_rate": 3.6435282482383485e-05, "loss": 0.438, "step": 4490 }, { "epoch": 1.6552105408642772, "grad_norm": 5.7231178488108165, "learning_rate": 3.6432191865496357e-05, "loss": 0.4944, "step": 4491 }, { "epoch": 1.6555791025522897, "grad_norm": 5.272881372585614, "learning_rate": 3.642910124860923e-05, "loss": 0.3699, "step": 4492 }, { "epoch": 1.6559476642403022, "grad_norm": 6.648356884334249, "learning_rate": 3.64260106317221e-05, "loss": 0.4494, "step": 4493 }, { "epoch": 1.6563162259283146, "grad_norm": 5.674503106365384, "learning_rate": 3.6422920014834963e-05, "loss": 0.393, "step": 4494 }, { "epoch": 1.6566847876163273, "grad_norm": 6.614602909409598, "learning_rate": 3.641982939794783e-05, "loss": 0.4886, "step": 4495 }, { "epoch": 1.6570533493043398, "grad_norm": 3.56121026501325, "learning_rate": 3.64167387810607e-05, "loss": 0.2978, "step": 4496 }, { "epoch": 1.6574219109923525, "grad_norm": 6.823685728414374, "learning_rate": 3.641364816417357e-05, "loss": 0.5708, "step": 4497 }, { "epoch": 1.657790472680365, "grad_norm": 6.776846726191538, "learning_rate": 3.641055754728644e-05, "loss": 0.2857, "step": 4498 }, { "epoch": 1.6581590343683774, "grad_norm": 7.202512768856913, "learning_rate": 3.6407466930399306e-05, "loss": 0.4857, "step": 4499 }, { "epoch": 1.6585275960563899, "grad_norm": 5.160542129307673, "learning_rate": 3.640437631351218e-05, "loss": 0.2953, "step": 4500 }, { "epoch": 1.6588961577444024, "grad_norm": 8.60123165093732, "learning_rate": 3.640128569662505e-05, "loss": 0.4284, "step": 4501 }, { "epoch": 1.659264719432415, "grad_norm": 4.974981658748383, "learning_rate": 3.639819507973792e-05, "loss": 0.3921, "step": 4502 }, { "epoch": 1.6596332811204275, "grad_norm": 5.895962914794067, "learning_rate": 3.639510446285079e-05, "loss": 0.3114, "step": 4503 }, { "epoch": 1.6600018428084402, "grad_norm": 7.770448349594181, "learning_rate": 3.6392013845963655e-05, "loss": 0.3386, "step": 4504 }, { "epoch": 1.6603704044964527, "grad_norm": 8.539855417640231, "learning_rate": 3.6388923229076526e-05, "loss": 0.4287, "step": 4505 }, { "epoch": 1.6607389661844651, "grad_norm": 6.190375382437504, "learning_rate": 3.63858326121894e-05, "loss": 0.5053, "step": 4506 }, { "epoch": 1.6611075278724776, "grad_norm": 5.612984603656835, "learning_rate": 3.638274199530227e-05, "loss": 0.3768, "step": 4507 }, { "epoch": 1.66147608956049, "grad_norm": 3.947654104770098, "learning_rate": 3.637965137841513e-05, "loss": 0.4373, "step": 4508 }, { "epoch": 1.6618446512485026, "grad_norm": 6.0712704485650395, "learning_rate": 3.6376560761528e-05, "loss": 0.446, "step": 4509 }, { "epoch": 1.6622132129365152, "grad_norm": 6.40026337558506, "learning_rate": 3.637347014464087e-05, "loss": 0.5621, "step": 4510 }, { "epoch": 1.662581774624528, "grad_norm": 3.6035848305939213, "learning_rate": 3.637037952775374e-05, "loss": 0.263, "step": 4511 }, { "epoch": 1.6629503363125404, "grad_norm": 7.0605509271253935, "learning_rate": 3.636728891086661e-05, "loss": 0.4837, "step": 4512 }, { "epoch": 1.6633188980005529, "grad_norm": 7.324060414955222, "learning_rate": 3.636419829397948e-05, "loss": 0.375, "step": 4513 }, { "epoch": 1.6636874596885654, "grad_norm": 8.470150764861993, "learning_rate": 3.6361107677092347e-05, "loss": 0.5246, "step": 4514 }, { "epoch": 1.6640560213765778, "grad_norm": 5.588879046843306, "learning_rate": 3.635801706020522e-05, "loss": 0.4803, "step": 4515 }, { "epoch": 1.6644245830645903, "grad_norm": 5.220331923064173, "learning_rate": 3.635492644331809e-05, "loss": 0.3437, "step": 4516 }, { "epoch": 1.664793144752603, "grad_norm": 13.730458748218263, "learning_rate": 3.635183582643096e-05, "loss": 0.5227, "step": 4517 }, { "epoch": 1.6651617064406155, "grad_norm": 6.67448599332911, "learning_rate": 3.6348745209543825e-05, "loss": 0.6484, "step": 4518 }, { "epoch": 1.6655302681286281, "grad_norm": 7.220948516167701, "learning_rate": 3.6345654592656696e-05, "loss": 0.3731, "step": 4519 }, { "epoch": 1.6658988298166406, "grad_norm": 18.794507721880148, "learning_rate": 3.634256397576957e-05, "loss": 0.4972, "step": 4520 }, { "epoch": 1.666267391504653, "grad_norm": 6.262823149058795, "learning_rate": 3.633947335888244e-05, "loss": 0.3967, "step": 4521 }, { "epoch": 1.6666359531926656, "grad_norm": 5.935069018173949, "learning_rate": 3.633638274199531e-05, "loss": 0.3326, "step": 4522 }, { "epoch": 1.667004514880678, "grad_norm": 6.075536181488686, "learning_rate": 3.6333292125108174e-05, "loss": 0.3845, "step": 4523 }, { "epoch": 1.6673730765686907, "grad_norm": 5.1608134103213175, "learning_rate": 3.633020150822104e-05, "loss": 0.4484, "step": 4524 }, { "epoch": 1.6677416382567032, "grad_norm": 4.719667863313078, "learning_rate": 3.632711089133391e-05, "loss": 0.4006, "step": 4525 }, { "epoch": 1.6681101999447159, "grad_norm": 6.840473383534201, "learning_rate": 3.632402027444678e-05, "loss": 0.3294, "step": 4526 }, { "epoch": 1.6684787616327283, "grad_norm": 9.227730001282142, "learning_rate": 3.632092965755965e-05, "loss": 0.6503, "step": 4527 }, { "epoch": 1.6688473233207408, "grad_norm": 8.167923914869773, "learning_rate": 3.6317839040672516e-05, "loss": 0.5833, "step": 4528 }, { "epoch": 1.6692158850087533, "grad_norm": 3.4346573606842843, "learning_rate": 3.631474842378539e-05, "loss": 0.2454, "step": 4529 }, { "epoch": 1.6695844466967658, "grad_norm": 10.967416535532294, "learning_rate": 3.631165780689826e-05, "loss": 0.3753, "step": 4530 }, { "epoch": 1.6699530083847784, "grad_norm": 4.6434919447325305, "learning_rate": 3.630856719001113e-05, "loss": 0.3984, "step": 4531 }, { "epoch": 1.670321570072791, "grad_norm": 5.923280413292299, "learning_rate": 3.6305476573124e-05, "loss": 0.3778, "step": 4532 }, { "epoch": 1.6706901317608036, "grad_norm": 9.576678765839546, "learning_rate": 3.6302385956236865e-05, "loss": 0.4229, "step": 4533 }, { "epoch": 1.671058693448816, "grad_norm": 6.193181059696511, "learning_rate": 3.6299295339349737e-05, "loss": 0.4423, "step": 4534 }, { "epoch": 1.6714272551368285, "grad_norm": 8.640549947402347, "learning_rate": 3.629620472246261e-05, "loss": 0.3789, "step": 4535 }, { "epoch": 1.671795816824841, "grad_norm": 5.405519998469844, "learning_rate": 3.629311410557548e-05, "loss": 0.3437, "step": 4536 }, { "epoch": 1.6721643785128535, "grad_norm": 5.97381153497255, "learning_rate": 3.629002348868834e-05, "loss": 0.3308, "step": 4537 }, { "epoch": 1.672532940200866, "grad_norm": 6.178896075664907, "learning_rate": 3.6286932871801215e-05, "loss": 0.4247, "step": 4538 }, { "epoch": 1.6729015018888787, "grad_norm": 4.614592311784378, "learning_rate": 3.628384225491408e-05, "loss": 0.4166, "step": 4539 }, { "epoch": 1.6732700635768913, "grad_norm": 7.685276361185896, "learning_rate": 3.628075163802695e-05, "loss": 0.5295, "step": 4540 }, { "epoch": 1.6736386252649038, "grad_norm": 5.258523107974292, "learning_rate": 3.627766102113982e-05, "loss": 0.2405, "step": 4541 }, { "epoch": 1.6740071869529163, "grad_norm": 5.419427764639232, "learning_rate": 3.6274570404252686e-05, "loss": 0.4424, "step": 4542 }, { "epoch": 1.6743757486409288, "grad_norm": 8.219645752339046, "learning_rate": 3.627147978736556e-05, "loss": 0.4371, "step": 4543 }, { "epoch": 1.6747443103289412, "grad_norm": 6.262639502153668, "learning_rate": 3.626838917047843e-05, "loss": 0.3872, "step": 4544 }, { "epoch": 1.6751128720169537, "grad_norm": 7.25762801248974, "learning_rate": 3.62652985535913e-05, "loss": 0.5477, "step": 4545 }, { "epoch": 1.6754814337049664, "grad_norm": 5.450184026288484, "learning_rate": 3.626220793670417e-05, "loss": 0.3521, "step": 4546 }, { "epoch": 1.6758499953929789, "grad_norm": 8.465895055418267, "learning_rate": 3.6259117319817035e-05, "loss": 0.5079, "step": 4547 }, { "epoch": 1.6762185570809915, "grad_norm": 27.10236873620609, "learning_rate": 3.6256026702929906e-05, "loss": 0.5741, "step": 4548 }, { "epoch": 1.676587118769004, "grad_norm": 5.286313851422576, "learning_rate": 3.625293608604278e-05, "loss": 0.2894, "step": 4549 }, { "epoch": 1.6769556804570165, "grad_norm": 14.389740658902396, "learning_rate": 3.624984546915565e-05, "loss": 0.6565, "step": 4550 }, { "epoch": 1.677324242145029, "grad_norm": 5.935487747525505, "learning_rate": 3.624675485226851e-05, "loss": 0.4896, "step": 4551 }, { "epoch": 1.6776928038330414, "grad_norm": 5.22609335671219, "learning_rate": 3.6243664235381384e-05, "loss": 0.5136, "step": 4552 }, { "epoch": 1.6780613655210541, "grad_norm": 9.035215358647985, "learning_rate": 3.6240573618494255e-05, "loss": 0.4957, "step": 4553 }, { "epoch": 1.6784299272090666, "grad_norm": 4.098740891437467, "learning_rate": 3.623748300160712e-05, "loss": 0.3022, "step": 4554 }, { "epoch": 1.6787984888970793, "grad_norm": 6.337551151347598, "learning_rate": 3.623439238471999e-05, "loss": 0.344, "step": 4555 }, { "epoch": 1.6791670505850917, "grad_norm": 9.174706926784246, "learning_rate": 3.623130176783286e-05, "loss": 0.4295, "step": 4556 }, { "epoch": 1.6795356122731042, "grad_norm": 5.743666104273717, "learning_rate": 3.6228211150945727e-05, "loss": 0.3899, "step": 4557 }, { "epoch": 1.6799041739611167, "grad_norm": 5.065689024382374, "learning_rate": 3.62251205340586e-05, "loss": 0.3867, "step": 4558 }, { "epoch": 1.6802727356491292, "grad_norm": 8.30267349755675, "learning_rate": 3.622202991717147e-05, "loss": 0.7135, "step": 4559 }, { "epoch": 1.6806412973371418, "grad_norm": 16.695921183920564, "learning_rate": 3.621893930028434e-05, "loss": 0.616, "step": 4560 }, { "epoch": 1.6810098590251543, "grad_norm": 9.398379458076178, "learning_rate": 3.6215848683397205e-05, "loss": 0.4864, "step": 4561 }, { "epoch": 1.681378420713167, "grad_norm": 7.7946607851381895, "learning_rate": 3.6212758066510076e-05, "loss": 0.5452, "step": 4562 }, { "epoch": 1.6817469824011795, "grad_norm": 6.815525607958945, "learning_rate": 3.620966744962295e-05, "loss": 0.454, "step": 4563 }, { "epoch": 1.682115544089192, "grad_norm": 8.254892459124997, "learning_rate": 3.620657683273582e-05, "loss": 0.5868, "step": 4564 }, { "epoch": 1.6824841057772044, "grad_norm": 4.268945540366067, "learning_rate": 3.620348621584869e-05, "loss": 0.2758, "step": 4565 }, { "epoch": 1.682852667465217, "grad_norm": 4.160334136455043, "learning_rate": 3.6200395598961554e-05, "loss": 0.2923, "step": 4566 }, { "epoch": 1.6832212291532294, "grad_norm": 5.820840082003104, "learning_rate": 3.6197304982074425e-05, "loss": 0.3827, "step": 4567 }, { "epoch": 1.683589790841242, "grad_norm": 6.58374015742186, "learning_rate": 3.6194214365187296e-05, "loss": 0.4496, "step": 4568 }, { "epoch": 1.6839583525292547, "grad_norm": 5.5588902108223515, "learning_rate": 3.619112374830016e-05, "loss": 0.4194, "step": 4569 }, { "epoch": 1.6843269142172672, "grad_norm": 4.848084313292113, "learning_rate": 3.618803313141303e-05, "loss": 0.3187, "step": 4570 }, { "epoch": 1.6846954759052797, "grad_norm": 5.309716067436834, "learning_rate": 3.6184942514525896e-05, "loss": 0.2883, "step": 4571 }, { "epoch": 1.6850640375932922, "grad_norm": 17.513277956600707, "learning_rate": 3.618185189763877e-05, "loss": 0.6608, "step": 4572 }, { "epoch": 1.6854325992813046, "grad_norm": 7.511529325972269, "learning_rate": 3.617876128075164e-05, "loss": 0.6211, "step": 4573 }, { "epoch": 1.685801160969317, "grad_norm": 6.338466304267305, "learning_rate": 3.617567066386451e-05, "loss": 0.2835, "step": 4574 }, { "epoch": 1.6861697226573298, "grad_norm": 10.229455222561494, "learning_rate": 3.617258004697738e-05, "loss": 0.5314, "step": 4575 }, { "epoch": 1.6865382843453423, "grad_norm": 5.553071678991861, "learning_rate": 3.6169489430090245e-05, "loss": 0.457, "step": 4576 }, { "epoch": 1.686906846033355, "grad_norm": 7.238339485144177, "learning_rate": 3.6166398813203116e-05, "loss": 0.6862, "step": 4577 }, { "epoch": 1.6872754077213674, "grad_norm": 6.688266603257529, "learning_rate": 3.616330819631599e-05, "loss": 0.4674, "step": 4578 }, { "epoch": 1.6876439694093799, "grad_norm": 21.382026097614382, "learning_rate": 3.616021757942886e-05, "loss": 0.4576, "step": 4579 }, { "epoch": 1.6880125310973924, "grad_norm": 5.468549452237976, "learning_rate": 3.615712696254172e-05, "loss": 0.5112, "step": 4580 }, { "epoch": 1.6883810927854048, "grad_norm": 11.439401604000848, "learning_rate": 3.6154036345654594e-05, "loss": 0.6359, "step": 4581 }, { "epoch": 1.6887496544734175, "grad_norm": 16.104712210129485, "learning_rate": 3.6150945728767466e-05, "loss": 0.6041, "step": 4582 }, { "epoch": 1.68911821616143, "grad_norm": 7.614161057424412, "learning_rate": 3.614785511188034e-05, "loss": 0.6171, "step": 4583 }, { "epoch": 1.6894867778494427, "grad_norm": 10.035843507509798, "learning_rate": 3.61447644949932e-05, "loss": 0.3679, "step": 4584 }, { "epoch": 1.6898553395374551, "grad_norm": 6.946148404773167, "learning_rate": 3.614167387810607e-05, "loss": 0.3613, "step": 4585 }, { "epoch": 1.6902239012254676, "grad_norm": 6.053432959746439, "learning_rate": 3.613858326121894e-05, "loss": 0.4445, "step": 4586 }, { "epoch": 1.69059246291348, "grad_norm": 9.342050659382068, "learning_rate": 3.613549264433181e-05, "loss": 0.5057, "step": 4587 }, { "epoch": 1.6909610246014926, "grad_norm": 6.0035593283797075, "learning_rate": 3.613240202744468e-05, "loss": 0.4066, "step": 4588 }, { "epoch": 1.6913295862895052, "grad_norm": 7.718747528940647, "learning_rate": 3.612931141055755e-05, "loss": 0.5049, "step": 4589 }, { "epoch": 1.6916981479775177, "grad_norm": 6.158907089382379, "learning_rate": 3.6126220793670415e-05, "loss": 0.4646, "step": 4590 }, { "epoch": 1.6920667096655304, "grad_norm": 9.976099061769304, "learning_rate": 3.6123130176783286e-05, "loss": 0.5267, "step": 4591 }, { "epoch": 1.6924352713535429, "grad_norm": 7.065033854776679, "learning_rate": 3.612003955989616e-05, "loss": 0.6567, "step": 4592 }, { "epoch": 1.6928038330415554, "grad_norm": 4.477162798564903, "learning_rate": 3.611694894300903e-05, "loss": 0.3237, "step": 4593 }, { "epoch": 1.6931723947295678, "grad_norm": 7.467233671503389, "learning_rate": 3.61138583261219e-05, "loss": 0.4728, "step": 4594 }, { "epoch": 1.6935409564175803, "grad_norm": 10.350492648134292, "learning_rate": 3.6110767709234764e-05, "loss": 0.5847, "step": 4595 }, { "epoch": 1.6939095181055928, "grad_norm": 7.744081760100994, "learning_rate": 3.6107677092347635e-05, "loss": 0.407, "step": 4596 }, { "epoch": 1.6942780797936055, "grad_norm": 4.848045364243178, "learning_rate": 3.6104586475460506e-05, "loss": 0.2829, "step": 4597 }, { "epoch": 1.6946466414816181, "grad_norm": 6.509823052364458, "learning_rate": 3.610149585857338e-05, "loss": 0.4285, "step": 4598 }, { "epoch": 1.6950152031696306, "grad_norm": 5.0401634728698985, "learning_rate": 3.609840524168624e-05, "loss": 0.449, "step": 4599 }, { "epoch": 1.695383764857643, "grad_norm": 4.454279164439052, "learning_rate": 3.6095314624799106e-05, "loss": 0.2623, "step": 4600 }, { "epoch": 1.6957523265456556, "grad_norm": 8.585323905366565, "learning_rate": 3.609222400791198e-05, "loss": 0.5764, "step": 4601 }, { "epoch": 1.696120888233668, "grad_norm": 10.90629547022219, "learning_rate": 3.608913339102485e-05, "loss": 0.4382, "step": 4602 }, { "epoch": 1.6964894499216805, "grad_norm": 8.818517483701644, "learning_rate": 3.608604277413772e-05, "loss": 0.4869, "step": 4603 }, { "epoch": 1.6968580116096932, "grad_norm": 6.871296943058676, "learning_rate": 3.608295215725059e-05, "loss": 0.3417, "step": 4604 }, { "epoch": 1.6972265732977057, "grad_norm": 5.425174819309361, "learning_rate": 3.6079861540363456e-05, "loss": 0.5247, "step": 4605 }, { "epoch": 1.6975951349857183, "grad_norm": 6.247472022926241, "learning_rate": 3.607677092347633e-05, "loss": 0.4509, "step": 4606 }, { "epoch": 1.6979636966737308, "grad_norm": 9.135847776731472, "learning_rate": 3.60736803065892e-05, "loss": 0.3768, "step": 4607 }, { "epoch": 1.6983322583617433, "grad_norm": 5.375599317627909, "learning_rate": 3.607058968970207e-05, "loss": 0.3318, "step": 4608 }, { "epoch": 1.6987008200497558, "grad_norm": 3.4039556493144856, "learning_rate": 3.6067499072814934e-05, "loss": 0.2352, "step": 4609 }, { "epoch": 1.6990693817377682, "grad_norm": 7.097490710782822, "learning_rate": 3.6064408455927805e-05, "loss": 0.5005, "step": 4610 }, { "epoch": 1.699437943425781, "grad_norm": 6.4754868961331535, "learning_rate": 3.6061317839040676e-05, "loss": 0.4535, "step": 4611 }, { "epoch": 1.6998065051137934, "grad_norm": 8.357509283619208, "learning_rate": 3.605822722215355e-05, "loss": 0.5194, "step": 4612 }, { "epoch": 1.700175066801806, "grad_norm": 4.7208176686946475, "learning_rate": 3.605513660526642e-05, "loss": 0.4162, "step": 4613 }, { "epoch": 1.7005436284898185, "grad_norm": 9.019673355853252, "learning_rate": 3.605204598837928e-05, "loss": 0.4691, "step": 4614 }, { "epoch": 1.700912190177831, "grad_norm": 5.7397359376571675, "learning_rate": 3.604895537149215e-05, "loss": 0.2475, "step": 4615 }, { "epoch": 1.7012807518658435, "grad_norm": 6.604892951507913, "learning_rate": 3.604586475460502e-05, "loss": 0.3495, "step": 4616 }, { "epoch": 1.701649313553856, "grad_norm": 4.644491415278082, "learning_rate": 3.604277413771789e-05, "loss": 0.4008, "step": 4617 }, { "epoch": 1.7020178752418686, "grad_norm": 6.878617600683698, "learning_rate": 3.603968352083076e-05, "loss": 0.4462, "step": 4618 }, { "epoch": 1.7023864369298811, "grad_norm": 6.795332529718698, "learning_rate": 3.6036592903943625e-05, "loss": 0.3946, "step": 4619 }, { "epoch": 1.7027549986178938, "grad_norm": 8.978356085092786, "learning_rate": 3.6033502287056496e-05, "loss": 0.4416, "step": 4620 }, { "epoch": 1.7031235603059063, "grad_norm": 9.939887761608725, "learning_rate": 3.603041167016937e-05, "loss": 0.6495, "step": 4621 }, { "epoch": 1.7034921219939188, "grad_norm": 5.411886991890008, "learning_rate": 3.602732105328224e-05, "loss": 0.4166, "step": 4622 }, { "epoch": 1.7038606836819312, "grad_norm": 4.93494436060683, "learning_rate": 3.602423043639511e-05, "loss": 0.3878, "step": 4623 }, { "epoch": 1.7042292453699437, "grad_norm": 5.494738923553393, "learning_rate": 3.6021139819507974e-05, "loss": 0.4521, "step": 4624 }, { "epoch": 1.7045978070579562, "grad_norm": 5.491350134534205, "learning_rate": 3.6018049202620846e-05, "loss": 0.3851, "step": 4625 }, { "epoch": 1.7049663687459689, "grad_norm": 5.793135020055695, "learning_rate": 3.601495858573372e-05, "loss": 0.3319, "step": 4626 }, { "epoch": 1.7053349304339815, "grad_norm": 6.605143028633894, "learning_rate": 3.601186796884659e-05, "loss": 0.4696, "step": 4627 }, { "epoch": 1.705703492121994, "grad_norm": 9.122725111739117, "learning_rate": 3.600877735195945e-05, "loss": 0.5514, "step": 4628 }, { "epoch": 1.7060720538100065, "grad_norm": 5.604357332108386, "learning_rate": 3.6005686735072324e-05, "loss": 0.3818, "step": 4629 }, { "epoch": 1.706440615498019, "grad_norm": 6.985329650221643, "learning_rate": 3.600259611818519e-05, "loss": 0.595, "step": 4630 }, { "epoch": 1.7068091771860314, "grad_norm": 7.710314384413526, "learning_rate": 3.599950550129806e-05, "loss": 0.4751, "step": 4631 }, { "epoch": 1.707177738874044, "grad_norm": 4.8449679904506295, "learning_rate": 3.599641488441093e-05, "loss": 0.2198, "step": 4632 }, { "epoch": 1.7075463005620566, "grad_norm": 5.273322300535702, "learning_rate": 3.5993324267523795e-05, "loss": 0.369, "step": 4633 }, { "epoch": 1.707914862250069, "grad_norm": 4.835333421289988, "learning_rate": 3.5990233650636666e-05, "loss": 0.3384, "step": 4634 }, { "epoch": 1.7082834239380817, "grad_norm": 6.547704675948474, "learning_rate": 3.598714303374954e-05, "loss": 0.6265, "step": 4635 }, { "epoch": 1.7086519856260942, "grad_norm": 5.93406304719177, "learning_rate": 3.598405241686241e-05, "loss": 0.5854, "step": 4636 }, { "epoch": 1.7090205473141067, "grad_norm": 7.162102983532736, "learning_rate": 3.598096179997528e-05, "loss": 0.4485, "step": 4637 }, { "epoch": 1.7093891090021192, "grad_norm": 5.876393254426056, "learning_rate": 3.5977871183088144e-05, "loss": 0.4122, "step": 4638 }, { "epoch": 1.7097576706901316, "grad_norm": 5.984070932018904, "learning_rate": 3.5974780566201015e-05, "loss": 0.5477, "step": 4639 }, { "epoch": 1.7101262323781443, "grad_norm": 5.732298430847173, "learning_rate": 3.5971689949313886e-05, "loss": 0.3789, "step": 4640 }, { "epoch": 1.7104947940661568, "grad_norm": 7.193362556848332, "learning_rate": 3.596859933242676e-05, "loss": 0.4568, "step": 4641 }, { "epoch": 1.7108633557541695, "grad_norm": 7.62136704244738, "learning_rate": 3.596550871553962e-05, "loss": 0.4195, "step": 4642 }, { "epoch": 1.711231917442182, "grad_norm": 5.033025013825673, "learning_rate": 3.596241809865249e-05, "loss": 0.4088, "step": 4643 }, { "epoch": 1.7116004791301944, "grad_norm": 8.124646927424754, "learning_rate": 3.5959327481765364e-05, "loss": 0.4923, "step": 4644 }, { "epoch": 1.7119690408182069, "grad_norm": 7.854041654231055, "learning_rate": 3.595623686487823e-05, "loss": 0.5426, "step": 4645 }, { "epoch": 1.7123376025062194, "grad_norm": 6.716117591596382, "learning_rate": 3.59531462479911e-05, "loss": 0.3626, "step": 4646 }, { "epoch": 1.712706164194232, "grad_norm": 6.14536310325911, "learning_rate": 3.595005563110397e-05, "loss": 0.3352, "step": 4647 }, { "epoch": 1.7130747258822445, "grad_norm": 7.881053232978138, "learning_rate": 3.5946965014216836e-05, "loss": 0.3801, "step": 4648 }, { "epoch": 1.7134432875702572, "grad_norm": 4.680807804261937, "learning_rate": 3.594387439732971e-05, "loss": 0.4041, "step": 4649 }, { "epoch": 1.7138118492582697, "grad_norm": 10.915732734469596, "learning_rate": 3.594078378044258e-05, "loss": 0.5063, "step": 4650 }, { "epoch": 1.7141804109462822, "grad_norm": 7.229892338838883, "learning_rate": 3.593769316355545e-05, "loss": 0.478, "step": 4651 }, { "epoch": 1.7145489726342946, "grad_norm": 4.2394156208029665, "learning_rate": 3.5934602546668314e-05, "loss": 0.3759, "step": 4652 }, { "epoch": 1.714917534322307, "grad_norm": 8.773867507410722, "learning_rate": 3.5931511929781185e-05, "loss": 0.4443, "step": 4653 }, { "epoch": 1.7152860960103196, "grad_norm": 5.142438481713516, "learning_rate": 3.5928421312894056e-05, "loss": 0.3773, "step": 4654 }, { "epoch": 1.7156546576983323, "grad_norm": 5.88312828563962, "learning_rate": 3.592533069600693e-05, "loss": 0.4231, "step": 4655 }, { "epoch": 1.716023219386345, "grad_norm": 9.92062807432877, "learning_rate": 3.59222400791198e-05, "loss": 0.7658, "step": 4656 }, { "epoch": 1.7163917810743574, "grad_norm": 7.45656597430415, "learning_rate": 3.591914946223266e-05, "loss": 0.4862, "step": 4657 }, { "epoch": 1.7167603427623699, "grad_norm": 4.508298216358051, "learning_rate": 3.5916058845345534e-05, "loss": 0.2719, "step": 4658 }, { "epoch": 1.7171289044503824, "grad_norm": 5.353815247794747, "learning_rate": 3.5912968228458405e-05, "loss": 0.5169, "step": 4659 }, { "epoch": 1.7174974661383948, "grad_norm": 6.266415210846355, "learning_rate": 3.590987761157127e-05, "loss": 0.3436, "step": 4660 }, { "epoch": 1.7178660278264073, "grad_norm": 8.193948550404725, "learning_rate": 3.590678699468414e-05, "loss": 0.4934, "step": 4661 }, { "epoch": 1.71823458951442, "grad_norm": 17.599943056881425, "learning_rate": 3.5903696377797005e-05, "loss": 0.5989, "step": 4662 }, { "epoch": 1.7186031512024325, "grad_norm": 5.419151479594297, "learning_rate": 3.5900605760909876e-05, "loss": 0.5293, "step": 4663 }, { "epoch": 1.7189717128904451, "grad_norm": 5.935318395466784, "learning_rate": 3.589751514402275e-05, "loss": 0.3581, "step": 4664 }, { "epoch": 1.7193402745784576, "grad_norm": 5.279674046598907, "learning_rate": 3.589442452713562e-05, "loss": 0.4933, "step": 4665 }, { "epoch": 1.71970883626647, "grad_norm": 5.358446758171275, "learning_rate": 3.589133391024849e-05, "loss": 0.4, "step": 4666 }, { "epoch": 1.7200773979544826, "grad_norm": 7.145177812973159, "learning_rate": 3.5888243293361354e-05, "loss": 0.4748, "step": 4667 }, { "epoch": 1.720445959642495, "grad_norm": 8.163631679721252, "learning_rate": 3.5885152676474226e-05, "loss": 0.4884, "step": 4668 }, { "epoch": 1.7208145213305077, "grad_norm": 9.090928941184826, "learning_rate": 3.58820620595871e-05, "loss": 0.6927, "step": 4669 }, { "epoch": 1.7211830830185202, "grad_norm": 9.53782392079257, "learning_rate": 3.587897144269997e-05, "loss": 0.3127, "step": 4670 }, { "epoch": 1.7215516447065329, "grad_norm": 8.37265619532635, "learning_rate": 3.587588082581283e-05, "loss": 0.4369, "step": 4671 }, { "epoch": 1.7219202063945453, "grad_norm": 7.269023272821481, "learning_rate": 3.5872790208925704e-05, "loss": 0.311, "step": 4672 }, { "epoch": 1.7222887680825578, "grad_norm": 12.24936347397603, "learning_rate": 3.5869699592038575e-05, "loss": 0.3469, "step": 4673 }, { "epoch": 1.7226573297705703, "grad_norm": 5.482913439287077, "learning_rate": 3.5866608975151446e-05, "loss": 0.5537, "step": 4674 }, { "epoch": 1.7230258914585828, "grad_norm": 10.774050047170995, "learning_rate": 3.586351835826431e-05, "loss": 0.5459, "step": 4675 }, { "epoch": 1.7233944531465955, "grad_norm": 5.119767844480067, "learning_rate": 3.586042774137718e-05, "loss": 0.3272, "step": 4676 }, { "epoch": 1.723763014834608, "grad_norm": 8.735022232560162, "learning_rate": 3.5857337124490046e-05, "loss": 0.5219, "step": 4677 }, { "epoch": 1.7241315765226206, "grad_norm": 6.051812258332823, "learning_rate": 3.585424650760292e-05, "loss": 0.3968, "step": 4678 }, { "epoch": 1.724500138210633, "grad_norm": 15.87455688819327, "learning_rate": 3.585115589071579e-05, "loss": 0.3974, "step": 4679 }, { "epoch": 1.7248686998986456, "grad_norm": 5.3827295836774205, "learning_rate": 3.584806527382866e-05, "loss": 0.2904, "step": 4680 }, { "epoch": 1.725237261586658, "grad_norm": 6.397628118164581, "learning_rate": 3.5844974656941524e-05, "loss": 0.6485, "step": 4681 }, { "epoch": 1.7256058232746705, "grad_norm": 4.406473357038377, "learning_rate": 3.5841884040054395e-05, "loss": 0.2962, "step": 4682 }, { "epoch": 1.7259743849626832, "grad_norm": 5.241416089851398, "learning_rate": 3.5838793423167266e-05, "loss": 0.4143, "step": 4683 }, { "epoch": 1.7263429466506957, "grad_norm": 7.377786142403228, "learning_rate": 3.583570280628014e-05, "loss": 0.3827, "step": 4684 }, { "epoch": 1.7267115083387083, "grad_norm": 5.598736634116071, "learning_rate": 3.583261218939301e-05, "loss": 0.3269, "step": 4685 }, { "epoch": 1.7270800700267208, "grad_norm": 6.110374895697219, "learning_rate": 3.582952157250587e-05, "loss": 0.4355, "step": 4686 }, { "epoch": 1.7274486317147333, "grad_norm": 9.228732016312222, "learning_rate": 3.5826430955618744e-05, "loss": 0.5543, "step": 4687 }, { "epoch": 1.7278171934027458, "grad_norm": 5.274755333021825, "learning_rate": 3.5823340338731616e-05, "loss": 0.3797, "step": 4688 }, { "epoch": 1.7281857550907582, "grad_norm": 7.1909534947951546, "learning_rate": 3.582024972184449e-05, "loss": 0.6472, "step": 4689 }, { "epoch": 1.7285543167787707, "grad_norm": 9.78144499855546, "learning_rate": 3.581715910495735e-05, "loss": 0.4128, "step": 4690 }, { "epoch": 1.7289228784667834, "grad_norm": 6.27594346936491, "learning_rate": 3.5814068488070216e-05, "loss": 0.5296, "step": 4691 }, { "epoch": 1.7292914401547959, "grad_norm": 6.107109566887014, "learning_rate": 3.581097787118309e-05, "loss": 0.6074, "step": 4692 }, { "epoch": 1.7296600018428085, "grad_norm": 6.1697200055017944, "learning_rate": 3.580788725429596e-05, "loss": 0.5406, "step": 4693 }, { "epoch": 1.730028563530821, "grad_norm": 4.886419175782016, "learning_rate": 3.580479663740883e-05, "loss": 0.3675, "step": 4694 }, { "epoch": 1.7303971252188335, "grad_norm": 4.3768594332620685, "learning_rate": 3.58017060205217e-05, "loss": 0.5004, "step": 4695 }, { "epoch": 1.730765686906846, "grad_norm": 5.676242292441365, "learning_rate": 3.5798615403634565e-05, "loss": 0.2504, "step": 4696 }, { "epoch": 1.7311342485948584, "grad_norm": 4.429114787436985, "learning_rate": 3.5795524786747436e-05, "loss": 0.496, "step": 4697 }, { "epoch": 1.7315028102828711, "grad_norm": 6.494815813115031, "learning_rate": 3.579243416986031e-05, "loss": 0.4144, "step": 4698 }, { "epoch": 1.7318713719708836, "grad_norm": 8.539457404523537, "learning_rate": 3.578934355297318e-05, "loss": 0.3239, "step": 4699 }, { "epoch": 1.7322399336588963, "grad_norm": 5.32595646824453, "learning_rate": 3.578625293608604e-05, "loss": 0.3999, "step": 4700 }, { "epoch": 1.7326084953469088, "grad_norm": 5.523975998933376, "learning_rate": 3.5783162319198914e-05, "loss": 0.4269, "step": 4701 }, { "epoch": 1.7329770570349212, "grad_norm": 5.891009978405556, "learning_rate": 3.5780071702311785e-05, "loss": 0.4766, "step": 4702 }, { "epoch": 1.7333456187229337, "grad_norm": 6.364918535045336, "learning_rate": 3.5776981085424656e-05, "loss": 0.6846, "step": 4703 }, { "epoch": 1.7337141804109462, "grad_norm": 8.86174891723244, "learning_rate": 3.577389046853753e-05, "loss": 0.6333, "step": 4704 }, { "epoch": 1.7340827420989589, "grad_norm": 9.498382430647238, "learning_rate": 3.5770799851650385e-05, "loss": 0.5158, "step": 4705 }, { "epoch": 1.7344513037869713, "grad_norm": 5.49267784306483, "learning_rate": 3.5767709234763256e-05, "loss": 0.3907, "step": 4706 }, { "epoch": 1.734819865474984, "grad_norm": 6.688568600280204, "learning_rate": 3.576461861787613e-05, "loss": 0.3583, "step": 4707 }, { "epoch": 1.7351884271629965, "grad_norm": 5.700767385124293, "learning_rate": 3.5761528000989e-05, "loss": 0.3916, "step": 4708 }, { "epoch": 1.735556988851009, "grad_norm": 4.874712422031659, "learning_rate": 3.575843738410187e-05, "loss": 0.4377, "step": 4709 }, { "epoch": 1.7359255505390214, "grad_norm": 21.806152119612296, "learning_rate": 3.5755346767214734e-05, "loss": 0.7499, "step": 4710 }, { "epoch": 1.736294112227034, "grad_norm": 6.68359260848909, "learning_rate": 3.5752256150327606e-05, "loss": 0.4247, "step": 4711 }, { "epoch": 1.7366626739150466, "grad_norm": 8.293592475366808, "learning_rate": 3.574916553344048e-05, "loss": 0.3708, "step": 4712 }, { "epoch": 1.737031235603059, "grad_norm": 7.116157800615563, "learning_rate": 3.574607491655335e-05, "loss": 0.5452, "step": 4713 }, { "epoch": 1.7373997972910717, "grad_norm": 6.242350971244196, "learning_rate": 3.574298429966621e-05, "loss": 0.4277, "step": 4714 }, { "epoch": 1.7377683589790842, "grad_norm": 5.446718316333283, "learning_rate": 3.5739893682779083e-05, "loss": 0.4124, "step": 4715 }, { "epoch": 1.7381369206670967, "grad_norm": 5.866394233157888, "learning_rate": 3.5736803065891955e-05, "loss": 0.4252, "step": 4716 }, { "epoch": 1.7385054823551092, "grad_norm": 7.80344885848008, "learning_rate": 3.5733712449004826e-05, "loss": 0.6824, "step": 4717 }, { "epoch": 1.7388740440431216, "grad_norm": 4.353305981030686, "learning_rate": 3.57306218321177e-05, "loss": 0.2929, "step": 4718 }, { "epoch": 1.739242605731134, "grad_norm": 18.019366548266376, "learning_rate": 3.572753121523056e-05, "loss": 0.6831, "step": 4719 }, { "epoch": 1.7396111674191468, "grad_norm": 4.917020989705861, "learning_rate": 3.572444059834343e-05, "loss": 0.298, "step": 4720 }, { "epoch": 1.7399797291071593, "grad_norm": 6.248188824485523, "learning_rate": 3.57213499814563e-05, "loss": 0.5177, "step": 4721 }, { "epoch": 1.740348290795172, "grad_norm": 8.171506213296622, "learning_rate": 3.571825936456917e-05, "loss": 0.4268, "step": 4722 }, { "epoch": 1.7407168524831844, "grad_norm": 11.300249242143746, "learning_rate": 3.571516874768204e-05, "loss": 0.3604, "step": 4723 }, { "epoch": 1.7410854141711969, "grad_norm": 6.601865285216037, "learning_rate": 3.5712078130794904e-05, "loss": 0.4829, "step": 4724 }, { "epoch": 1.7414539758592094, "grad_norm": 4.77065004665777, "learning_rate": 3.5708987513907775e-05, "loss": 0.3384, "step": 4725 }, { "epoch": 1.7418225375472218, "grad_norm": 10.721890626127129, "learning_rate": 3.5705896897020646e-05, "loss": 0.6635, "step": 4726 }, { "epoch": 1.7421910992352345, "grad_norm": 6.941684320190013, "learning_rate": 3.570280628013352e-05, "loss": 0.495, "step": 4727 }, { "epoch": 1.742559660923247, "grad_norm": 5.724336490745224, "learning_rate": 3.569971566324639e-05, "loss": 0.2583, "step": 4728 }, { "epoch": 1.7429282226112597, "grad_norm": 15.801857484206465, "learning_rate": 3.569662504635925e-05, "loss": 0.551, "step": 4729 }, { "epoch": 1.7432967842992722, "grad_norm": 7.048703652094392, "learning_rate": 3.5693534429472124e-05, "loss": 0.456, "step": 4730 }, { "epoch": 1.7436653459872846, "grad_norm": 5.585341970002822, "learning_rate": 3.5690443812584995e-05, "loss": 0.3824, "step": 4731 }, { "epoch": 1.744033907675297, "grad_norm": 6.205898174290181, "learning_rate": 3.568735319569787e-05, "loss": 0.461, "step": 4732 }, { "epoch": 1.7444024693633096, "grad_norm": 7.459643314619517, "learning_rate": 3.568426257881073e-05, "loss": 0.4089, "step": 4733 }, { "epoch": 1.7447710310513223, "grad_norm": 7.666930429095457, "learning_rate": 3.56811719619236e-05, "loss": 0.5401, "step": 4734 }, { "epoch": 1.7451395927393347, "grad_norm": 8.719359892702936, "learning_rate": 3.5678081345036473e-05, "loss": 0.4388, "step": 4735 }, { "epoch": 1.7455081544273474, "grad_norm": 4.960696813797228, "learning_rate": 3.567499072814934e-05, "loss": 0.2874, "step": 4736 }, { "epoch": 1.7458767161153599, "grad_norm": 6.227478036615882, "learning_rate": 3.567190011126221e-05, "loss": 0.4231, "step": 4737 }, { "epoch": 1.7462452778033724, "grad_norm": 6.798442370466251, "learning_rate": 3.566880949437508e-05, "loss": 0.5461, "step": 4738 }, { "epoch": 1.7466138394913848, "grad_norm": 5.057905207725035, "learning_rate": 3.5665718877487945e-05, "loss": 0.2526, "step": 4739 }, { "epoch": 1.7469824011793973, "grad_norm": 7.534387718389307, "learning_rate": 3.5662628260600816e-05, "loss": 0.4432, "step": 4740 }, { "epoch": 1.74735096286741, "grad_norm": 9.267260145032248, "learning_rate": 3.565953764371369e-05, "loss": 0.4553, "step": 4741 }, { "epoch": 1.7477195245554225, "grad_norm": 5.675392426353902, "learning_rate": 3.565644702682656e-05, "loss": 0.3575, "step": 4742 }, { "epoch": 1.7480880862434351, "grad_norm": 8.311481155253896, "learning_rate": 3.565335640993942e-05, "loss": 0.4861, "step": 4743 }, { "epoch": 1.7484566479314476, "grad_norm": 11.684083490237304, "learning_rate": 3.5650265793052294e-05, "loss": 0.4977, "step": 4744 }, { "epoch": 1.74882520961946, "grad_norm": 7.578587091259471, "learning_rate": 3.5647175176165165e-05, "loss": 0.6878, "step": 4745 }, { "epoch": 1.7491937713074726, "grad_norm": 6.741243687560784, "learning_rate": 3.5644084559278036e-05, "loss": 0.4076, "step": 4746 }, { "epoch": 1.749562332995485, "grad_norm": 5.4960182689046615, "learning_rate": 3.564099394239091e-05, "loss": 0.2816, "step": 4747 }, { "epoch": 1.7499308946834975, "grad_norm": 20.0978471960074, "learning_rate": 3.563790332550377e-05, "loss": 0.5342, "step": 4748 }, { "epoch": 1.7502994563715102, "grad_norm": 8.821662198134106, "learning_rate": 3.563481270861664e-05, "loss": 0.4445, "step": 4749 }, { "epoch": 1.7506680180595229, "grad_norm": 8.841608549808505, "learning_rate": 3.5631722091729514e-05, "loss": 0.572, "step": 4750 }, { "epoch": 1.7510365797475353, "grad_norm": 6.662105653124876, "learning_rate": 3.562863147484238e-05, "loss": 0.3309, "step": 4751 }, { "epoch": 1.7514051414355478, "grad_norm": 6.459495521901144, "learning_rate": 3.562554085795525e-05, "loss": 0.56, "step": 4752 }, { "epoch": 1.7517737031235603, "grad_norm": 6.251427449295768, "learning_rate": 3.5622450241068114e-05, "loss": 0.2688, "step": 4753 }, { "epoch": 1.7521422648115728, "grad_norm": 5.436776299383764, "learning_rate": 3.5619359624180985e-05, "loss": 0.3644, "step": 4754 }, { "epoch": 1.7525108264995852, "grad_norm": 7.395864681840106, "learning_rate": 3.561626900729386e-05, "loss": 0.3695, "step": 4755 }, { "epoch": 1.752879388187598, "grad_norm": 8.473374346442, "learning_rate": 3.561317839040673e-05, "loss": 0.4812, "step": 4756 }, { "epoch": 1.7532479498756104, "grad_norm": 8.547549819109348, "learning_rate": 3.56100877735196e-05, "loss": 0.4827, "step": 4757 }, { "epoch": 1.753616511563623, "grad_norm": 8.377216558212137, "learning_rate": 3.5606997156632463e-05, "loss": 0.3545, "step": 4758 }, { "epoch": 1.7539850732516356, "grad_norm": 7.61773460575266, "learning_rate": 3.5603906539745335e-05, "loss": 0.494, "step": 4759 }, { "epoch": 1.754353634939648, "grad_norm": 6.5808778199309685, "learning_rate": 3.5600815922858206e-05, "loss": 0.412, "step": 4760 }, { "epoch": 1.7547221966276605, "grad_norm": 6.352388260244073, "learning_rate": 3.559772530597108e-05, "loss": 0.7021, "step": 4761 }, { "epoch": 1.755090758315673, "grad_norm": 6.151484935098051, "learning_rate": 3.559463468908394e-05, "loss": 0.5157, "step": 4762 }, { "epoch": 1.7554593200036857, "grad_norm": 4.930764541614591, "learning_rate": 3.559154407219681e-05, "loss": 0.3607, "step": 4763 }, { "epoch": 1.7558278816916981, "grad_norm": 12.318377181343163, "learning_rate": 3.5588453455309684e-05, "loss": 0.5658, "step": 4764 }, { "epoch": 1.7561964433797108, "grad_norm": 6.671553092534294, "learning_rate": 3.5585362838422555e-05, "loss": 0.3019, "step": 4765 }, { "epoch": 1.7565650050677233, "grad_norm": 6.481251012347739, "learning_rate": 3.558227222153542e-05, "loss": 0.502, "step": 4766 }, { "epoch": 1.7569335667557358, "grad_norm": 6.288618481996612, "learning_rate": 3.557918160464829e-05, "loss": 0.4592, "step": 4767 }, { "epoch": 1.7573021284437482, "grad_norm": 8.795456007889458, "learning_rate": 3.5576090987761155e-05, "loss": 0.4759, "step": 4768 }, { "epoch": 1.7576706901317607, "grad_norm": 4.147196381109331, "learning_rate": 3.5573000370874026e-05, "loss": 0.2733, "step": 4769 }, { "epoch": 1.7580392518197734, "grad_norm": 9.153813268976009, "learning_rate": 3.55699097539869e-05, "loss": 0.4236, "step": 4770 }, { "epoch": 1.7584078135077859, "grad_norm": 4.7019219871808975, "learning_rate": 3.556681913709977e-05, "loss": 0.3914, "step": 4771 }, { "epoch": 1.7587763751957985, "grad_norm": 7.439445177125578, "learning_rate": 3.556372852021263e-05, "loss": 0.5142, "step": 4772 }, { "epoch": 1.759144936883811, "grad_norm": 8.067346343523823, "learning_rate": 3.5560637903325504e-05, "loss": 0.4736, "step": 4773 }, { "epoch": 1.7595134985718235, "grad_norm": 11.651248953346228, "learning_rate": 3.5557547286438375e-05, "loss": 0.5567, "step": 4774 }, { "epoch": 1.759882060259836, "grad_norm": 9.07180708105173, "learning_rate": 3.5554456669551247e-05, "loss": 0.3563, "step": 4775 }, { "epoch": 1.7602506219478484, "grad_norm": 5.39947014081138, "learning_rate": 3.555136605266412e-05, "loss": 0.3484, "step": 4776 }, { "epoch": 1.760619183635861, "grad_norm": 5.723720704116377, "learning_rate": 3.554827543577698e-05, "loss": 0.2967, "step": 4777 }, { "epoch": 1.7609877453238736, "grad_norm": 7.924685248734104, "learning_rate": 3.5545184818889853e-05, "loss": 0.7043, "step": 4778 }, { "epoch": 1.7613563070118863, "grad_norm": 8.925005522226714, "learning_rate": 3.5542094202002725e-05, "loss": 0.5064, "step": 4779 }, { "epoch": 1.7617248686998987, "grad_norm": 7.790121758308874, "learning_rate": 3.5539003585115596e-05, "loss": 0.5216, "step": 4780 }, { "epoch": 1.7620934303879112, "grad_norm": 6.366671039253584, "learning_rate": 3.553591296822846e-05, "loss": 0.4286, "step": 4781 }, { "epoch": 1.7624619920759237, "grad_norm": 7.659781252772956, "learning_rate": 3.5532822351341325e-05, "loss": 0.5658, "step": 4782 }, { "epoch": 1.7628305537639362, "grad_norm": 7.973384930412877, "learning_rate": 3.5529731734454196e-05, "loss": 0.6012, "step": 4783 }, { "epoch": 1.7631991154519486, "grad_norm": 30.819994651402844, "learning_rate": 3.552664111756707e-05, "loss": 0.5959, "step": 4784 }, { "epoch": 1.7635676771399613, "grad_norm": 9.526537321087769, "learning_rate": 3.552355050067994e-05, "loss": 0.5075, "step": 4785 }, { "epoch": 1.7639362388279738, "grad_norm": 6.439470304420517, "learning_rate": 3.55204598837928e-05, "loss": 0.3985, "step": 4786 }, { "epoch": 1.7643048005159865, "grad_norm": 4.320795897882948, "learning_rate": 3.5517369266905674e-05, "loss": 0.4396, "step": 4787 }, { "epoch": 1.764673362203999, "grad_norm": 5.599406701402823, "learning_rate": 3.5514278650018545e-05, "loss": 0.4142, "step": 4788 }, { "epoch": 1.7650419238920114, "grad_norm": 12.886725262388472, "learning_rate": 3.5511188033131416e-05, "loss": 0.7006, "step": 4789 }, { "epoch": 1.765410485580024, "grad_norm": 3.412096238615955, "learning_rate": 3.550809741624429e-05, "loss": 0.3149, "step": 4790 }, { "epoch": 1.7657790472680364, "grad_norm": 4.446383114361899, "learning_rate": 3.550500679935715e-05, "loss": 0.2841, "step": 4791 }, { "epoch": 1.766147608956049, "grad_norm": 4.8107796108185195, "learning_rate": 3.550191618247002e-05, "loss": 0.4004, "step": 4792 }, { "epoch": 1.7665161706440615, "grad_norm": 5.611441823867674, "learning_rate": 3.5498825565582894e-05, "loss": 0.46, "step": 4793 }, { "epoch": 1.7668847323320742, "grad_norm": 8.56767439598775, "learning_rate": 3.5495734948695765e-05, "loss": 0.569, "step": 4794 }, { "epoch": 1.7672532940200867, "grad_norm": 7.336056839334819, "learning_rate": 3.5492644331808637e-05, "loss": 0.2464, "step": 4795 }, { "epoch": 1.7676218557080992, "grad_norm": 9.22268396265955, "learning_rate": 3.5489553714921494e-05, "loss": 0.6117, "step": 4796 }, { "epoch": 1.7679904173961116, "grad_norm": 5.301443644708258, "learning_rate": 3.5486463098034365e-05, "loss": 0.3252, "step": 4797 }, { "epoch": 1.768358979084124, "grad_norm": 8.92103265370094, "learning_rate": 3.5483372481147237e-05, "loss": 0.5036, "step": 4798 }, { "epoch": 1.7687275407721368, "grad_norm": 12.204777291909803, "learning_rate": 3.548028186426011e-05, "loss": 0.509, "step": 4799 }, { "epoch": 1.7690961024601493, "grad_norm": 10.138767254475747, "learning_rate": 3.547719124737298e-05, "loss": 0.4861, "step": 4800 }, { "epoch": 1.7690961024601493, "eval_bleu": 0.08241818087796421, "eval_bleu_1gram": 0.4326240473374284, "eval_bleu_2gram": 0.21970628710934567, "eval_bleu_3gram": 0.11314612093463658, "eval_bleu_4gram": 0.06421119779050063, "eval_rag_val_loss": 0.7688115625229052, "eval_rouge1": 0.42258662389485996, "eval_rouge2": 0.21313552957184237, "eval_rougeL": 0.42019365290603916, "step": 4800 } ], "logging_steps": 1, "max_steps": 16278, "num_input_tokens_seen": 0, "num_train_epochs": 6, "save_steps": 1600, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": true, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }