{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 26.668896321070235, "eval_steps": 500, "global_step": 2000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.013377926421404682, "grad_norm": 0.5197079181671143, "learning_rate": 0.0, "loss": 4.2636, "step": 1 }, { "epoch": 0.026755852842809364, "grad_norm": 0.5626901984214783, "learning_rate": 4e-05, "loss": 4.3971, "step": 2 }, { "epoch": 0.04013377926421405, "grad_norm": 0.5167903304100037, "learning_rate": 8e-05, "loss": 4.3249, "step": 3 }, { "epoch": 0.05351170568561873, "grad_norm": 0.4764951169490814, "learning_rate": 0.00012, "loss": 4.2031, "step": 4 }, { "epoch": 0.06688963210702341, "grad_norm": 0.45488491654396057, "learning_rate": 0.00016, "loss": 4.3914, "step": 5 }, { "epoch": 0.0802675585284281, "grad_norm": 0.568274736404419, "learning_rate": 0.0002, "loss": 4.2346, "step": 6 }, { "epoch": 0.09364548494983277, "grad_norm": 0.5974003076553345, "learning_rate": 0.0001999555061179088, "loss": 4.131, "step": 7 }, { "epoch": 0.10702341137123746, "grad_norm": 0.6204471588134766, "learning_rate": 0.00019991101223581757, "loss": 4.2256, "step": 8 }, { "epoch": 0.12040133779264214, "grad_norm": 0.7143808603286743, "learning_rate": 0.00019986651835372636, "loss": 3.8449, "step": 9 }, { "epoch": 0.13377926421404682, "grad_norm": 0.7799420356750488, "learning_rate": 0.00019982202447163517, "loss": 4.4301, "step": 10 }, { "epoch": 0.14715719063545152, "grad_norm": 0.8880407214164734, "learning_rate": 0.00019977753058954395, "loss": 4.2266, "step": 11 }, { "epoch": 0.1605351170568562, "grad_norm": 0.7776209712028503, "learning_rate": 0.00019973303670745273, "loss": 4.3208, "step": 12 }, { "epoch": 0.17391304347826086, "grad_norm": 0.9125858545303345, "learning_rate": 0.0001996885428253615, "loss": 4.4363, "step": 13 }, { "epoch": 0.18729096989966554, "grad_norm": 0.9000256657600403, "learning_rate": 0.00019964404894327032, "loss": 4.2917, "step": 14 }, { "epoch": 0.20066889632107024, "grad_norm": 0.9995108246803284, "learning_rate": 0.00019959955506117908, "loss": 4.1784, "step": 15 }, { "epoch": 0.2140468227424749, "grad_norm": 0.9209024310112, "learning_rate": 0.0001995550611790879, "loss": 4.7852, "step": 16 }, { "epoch": 0.22742474916387959, "grad_norm": 0.9421981573104858, "learning_rate": 0.00019951056729699667, "loss": 4.8501, "step": 17 }, { "epoch": 0.2408026755852843, "grad_norm": 0.9213201403617859, "learning_rate": 0.00019946607341490545, "loss": 4.7923, "step": 18 }, { "epoch": 0.25418060200668896, "grad_norm": 0.9378194212913513, "learning_rate": 0.00019942157953281423, "loss": 4.9593, "step": 19 }, { "epoch": 0.26755852842809363, "grad_norm": 1.0096492767333984, "learning_rate": 0.00019937708565072304, "loss": 4.7099, "step": 20 }, { "epoch": 0.2809364548494983, "grad_norm": 0.8903587460517883, "learning_rate": 0.00019933259176863183, "loss": 4.3746, "step": 21 }, { "epoch": 0.29431438127090304, "grad_norm": 0.7808490991592407, "learning_rate": 0.0001992880978865406, "loss": 4.5873, "step": 22 }, { "epoch": 0.3076923076923077, "grad_norm": 0.8145670294761658, "learning_rate": 0.0001992436040044494, "loss": 4.7924, "step": 23 }, { "epoch": 0.3210702341137124, "grad_norm": 0.7945849299430847, "learning_rate": 0.0001991991101223582, "loss": 4.8881, "step": 24 }, { "epoch": 0.33444816053511706, "grad_norm": 0.7871395349502563, "learning_rate": 0.00019915461624026696, "loss": 4.6922, "step": 25 }, { "epoch": 0.34782608695652173, "grad_norm": 0.9111238718032837, "learning_rate": 0.00019911012235817577, "loss": 4.9982, "step": 26 }, { "epoch": 0.3612040133779264, "grad_norm": 0.7121369242668152, "learning_rate": 0.00019906562847608455, "loss": 4.4756, "step": 27 }, { "epoch": 0.3745819397993311, "grad_norm": 0.7118422389030457, "learning_rate": 0.00019902113459399333, "loss": 5.1389, "step": 28 }, { "epoch": 0.3879598662207358, "grad_norm": 0.7100292444229126, "learning_rate": 0.0001989766407119021, "loss": 4.7691, "step": 29 }, { "epoch": 0.4013377926421405, "grad_norm": 0.708591639995575, "learning_rate": 0.00019893214682981092, "loss": 4.8721, "step": 30 }, { "epoch": 0.41471571906354515, "grad_norm": 0.6711616516113281, "learning_rate": 0.0001988876529477197, "loss": 4.9152, "step": 31 }, { "epoch": 0.4280936454849498, "grad_norm": 0.7158232927322388, "learning_rate": 0.0001988431590656285, "loss": 4.828, "step": 32 }, { "epoch": 0.4414715719063545, "grad_norm": 0.6246087551116943, "learning_rate": 0.00019879866518353727, "loss": 4.8452, "step": 33 }, { "epoch": 0.45484949832775917, "grad_norm": 0.6088873744010925, "learning_rate": 0.00019875417130144608, "loss": 4.9702, "step": 34 }, { "epoch": 0.4682274247491639, "grad_norm": 0.5798126459121704, "learning_rate": 0.00019870967741935483, "loss": 4.9838, "step": 35 }, { "epoch": 0.4816053511705686, "grad_norm": 0.6268919706344604, "learning_rate": 0.00019866518353726364, "loss": 4.7636, "step": 36 }, { "epoch": 0.49498327759197325, "grad_norm": 0.5649904012680054, "learning_rate": 0.00019862068965517243, "loss": 4.506, "step": 37 }, { "epoch": 0.5083612040133779, "grad_norm": 0.5947792530059814, "learning_rate": 0.0001985761957730812, "loss": 4.8057, "step": 38 }, { "epoch": 0.5217391304347826, "grad_norm": 0.6204257011413574, "learning_rate": 0.00019853170189099, "loss": 5.0511, "step": 39 }, { "epoch": 0.5351170568561873, "grad_norm": 0.5972265601158142, "learning_rate": 0.0001984872080088988, "loss": 4.924, "step": 40 }, { "epoch": 0.5484949832775919, "grad_norm": 0.6117077469825745, "learning_rate": 0.00019844271412680758, "loss": 4.8729, "step": 41 }, { "epoch": 0.5618729096989966, "grad_norm": 0.5085508823394775, "learning_rate": 0.00019839822024471637, "loss": 4.3616, "step": 42 }, { "epoch": 0.5752508361204013, "grad_norm": 0.550647497177124, "learning_rate": 0.00019835372636262515, "loss": 5.2512, "step": 43 }, { "epoch": 0.5886287625418061, "grad_norm": 0.48329588770866394, "learning_rate": 0.00019830923248053396, "loss": 4.9501, "step": 44 }, { "epoch": 0.6020066889632107, "grad_norm": 0.6313246488571167, "learning_rate": 0.0001982647385984427, "loss": 4.5767, "step": 45 }, { "epoch": 0.6153846153846154, "grad_norm": 0.5111928582191467, "learning_rate": 0.00019822024471635152, "loss": 4.5586, "step": 46 }, { "epoch": 0.6287625418060201, "grad_norm": 0.5264492630958557, "learning_rate": 0.0001981757508342603, "loss": 4.7033, "step": 47 }, { "epoch": 0.6421404682274248, "grad_norm": 0.5058289170265198, "learning_rate": 0.0001981312569521691, "loss": 4.8396, "step": 48 }, { "epoch": 0.6555183946488294, "grad_norm": 0.5688439607620239, "learning_rate": 0.00019808676307007787, "loss": 5.1887, "step": 49 }, { "epoch": 0.6688963210702341, "grad_norm": 0.5488842129707336, "learning_rate": 0.00019804226918798665, "loss": 4.6075, "step": 50 }, { "epoch": 0.6822742474916388, "grad_norm": 0.5358632206916809, "learning_rate": 0.00019799777530589546, "loss": 5.0205, "step": 51 }, { "epoch": 0.6956521739130435, "grad_norm": 0.47869494557380676, "learning_rate": 0.00019795328142380422, "loss": 4.8495, "step": 52 }, { "epoch": 0.7090301003344481, "grad_norm": 0.49378660321235657, "learning_rate": 0.00019790878754171303, "loss": 4.6563, "step": 53 }, { "epoch": 0.7224080267558528, "grad_norm": 0.5167868733406067, "learning_rate": 0.0001978642936596218, "loss": 5.2558, "step": 54 }, { "epoch": 0.7357859531772575, "grad_norm": 0.5230040550231934, "learning_rate": 0.0001978197997775306, "loss": 4.7769, "step": 55 }, { "epoch": 0.7491638795986622, "grad_norm": 0.4822310507297516, "learning_rate": 0.00019777530589543937, "loss": 4.9282, "step": 56 }, { "epoch": 0.7625418060200669, "grad_norm": 0.500045895576477, "learning_rate": 0.00019773081201334818, "loss": 5.0399, "step": 57 }, { "epoch": 0.7759197324414716, "grad_norm": 0.4740642309188843, "learning_rate": 0.00019768631813125696, "loss": 4.8041, "step": 58 }, { "epoch": 0.7892976588628763, "grad_norm": 0.45918184518814087, "learning_rate": 0.00019764182424916575, "loss": 4.6304, "step": 59 }, { "epoch": 0.802675585284281, "grad_norm": 0.53122878074646, "learning_rate": 0.00019759733036707453, "loss": 4.8377, "step": 60 }, { "epoch": 0.8160535117056856, "grad_norm": 0.4925791919231415, "learning_rate": 0.00019755283648498334, "loss": 5.0919, "step": 61 }, { "epoch": 0.8294314381270903, "grad_norm": 0.4777262806892395, "learning_rate": 0.0001975083426028921, "loss": 4.8379, "step": 62 }, { "epoch": 0.842809364548495, "grad_norm": 0.49119675159454346, "learning_rate": 0.0001974638487208009, "loss": 5.0819, "step": 63 }, { "epoch": 0.8561872909698997, "grad_norm": 0.4732685089111328, "learning_rate": 0.00019741935483870969, "loss": 4.8948, "step": 64 }, { "epoch": 0.8695652173913043, "grad_norm": 0.46269145607948303, "learning_rate": 0.00019737486095661847, "loss": 4.824, "step": 65 }, { "epoch": 0.882943143812709, "grad_norm": 0.49532708525657654, "learning_rate": 0.00019733036707452725, "loss": 4.8986, "step": 66 }, { "epoch": 0.8963210702341137, "grad_norm": 0.5253002643585205, "learning_rate": 0.00019728587319243606, "loss": 4.9073, "step": 67 }, { "epoch": 0.9096989966555183, "grad_norm": 0.5069419145584106, "learning_rate": 0.00019724137931034484, "loss": 4.8962, "step": 68 }, { "epoch": 0.9230769230769231, "grad_norm": 0.5038817524909973, "learning_rate": 0.00019719688542825363, "loss": 4.8711, "step": 69 }, { "epoch": 0.9364548494983278, "grad_norm": 0.4987100064754486, "learning_rate": 0.0001971523915461624, "loss": 4.8816, "step": 70 }, { "epoch": 0.9498327759197325, "grad_norm": 0.47370976209640503, "learning_rate": 0.00019710789766407122, "loss": 4.9675, "step": 71 }, { "epoch": 0.9632107023411371, "grad_norm": 0.5081727504730225, "learning_rate": 0.00019706340378197997, "loss": 4.2768, "step": 72 }, { "epoch": 0.9765886287625418, "grad_norm": 0.45571258664131165, "learning_rate": 0.00019701890989988878, "loss": 4.6182, "step": 73 }, { "epoch": 0.9899665551839465, "grad_norm": 0.5216127634048462, "learning_rate": 0.00019697441601779756, "loss": 4.7126, "step": 74 }, { "epoch": 1.0, "grad_norm": 0.5393329858779907, "learning_rate": 0.00019692992213570635, "loss": 4.4919, "step": 75 }, { "epoch": 1.0133779264214047, "grad_norm": 0.4506986737251282, "learning_rate": 0.00019688542825361513, "loss": 4.5089, "step": 76 }, { "epoch": 1.0267558528428093, "grad_norm": 0.4328899085521698, "learning_rate": 0.00019684093437152394, "loss": 4.7518, "step": 77 }, { "epoch": 1.040133779264214, "grad_norm": 0.4397362470626831, "learning_rate": 0.00019679644048943272, "loss": 4.5069, "step": 78 }, { "epoch": 1.0535117056856187, "grad_norm": 0.4604664146900177, "learning_rate": 0.0001967519466073415, "loss": 4.7054, "step": 79 }, { "epoch": 1.0668896321070234, "grad_norm": 0.4398234784603119, "learning_rate": 0.00019670745272525029, "loss": 4.2743, "step": 80 }, { "epoch": 1.080267558528428, "grad_norm": 0.4570735692977905, "learning_rate": 0.0001966629588431591, "loss": 4.8012, "step": 81 }, { "epoch": 1.0936454849498327, "grad_norm": 0.4814144968986511, "learning_rate": 0.00019661846496106785, "loss": 4.6449, "step": 82 }, { "epoch": 1.1070234113712374, "grad_norm": 0.4526231288909912, "learning_rate": 0.00019657397107897666, "loss": 4.5546, "step": 83 }, { "epoch": 1.120401337792642, "grad_norm": 0.4847906827926636, "learning_rate": 0.00019652947719688544, "loss": 4.4421, "step": 84 }, { "epoch": 1.1337792642140467, "grad_norm": 0.5136271715164185, "learning_rate": 0.00019648498331479422, "loss": 4.7136, "step": 85 }, { "epoch": 1.1471571906354514, "grad_norm": 0.49209895730018616, "learning_rate": 0.000196440489432703, "loss": 4.3145, "step": 86 }, { "epoch": 1.160535117056856, "grad_norm": 0.4972032904624939, "learning_rate": 0.00019639599555061182, "loss": 4.0408, "step": 87 }, { "epoch": 1.1739130434782608, "grad_norm": 0.5077862739562988, "learning_rate": 0.0001963515016685206, "loss": 4.4074, "step": 88 }, { "epoch": 1.1872909698996654, "grad_norm": 0.5293861031532288, "learning_rate": 0.00019630700778642935, "loss": 4.5385, "step": 89 }, { "epoch": 1.2006688963210703, "grad_norm": 0.5062645673751831, "learning_rate": 0.00019626251390433816, "loss": 4.5141, "step": 90 }, { "epoch": 1.214046822742475, "grad_norm": 0.49655866622924805, "learning_rate": 0.00019621802002224695, "loss": 4.4765, "step": 91 }, { "epoch": 1.2274247491638797, "grad_norm": 0.6059755086898804, "learning_rate": 0.00019617352614015573, "loss": 4.568, "step": 92 }, { "epoch": 1.2408026755852843, "grad_norm": 0.5442761778831482, "learning_rate": 0.0001961290322580645, "loss": 4.7724, "step": 93 }, { "epoch": 1.254180602006689, "grad_norm": 0.5426056385040283, "learning_rate": 0.00019608453837597332, "loss": 4.5308, "step": 94 }, { "epoch": 1.2675585284280937, "grad_norm": 0.525372326374054, "learning_rate": 0.0001960400444938821, "loss": 4.394, "step": 95 }, { "epoch": 1.2809364548494984, "grad_norm": 0.5407588481903076, "learning_rate": 0.00019599555061179089, "loss": 4.7347, "step": 96 }, { "epoch": 1.294314381270903, "grad_norm": 0.5726659893989563, "learning_rate": 0.00019595105672969967, "loss": 4.9446, "step": 97 }, { "epoch": 1.3076923076923077, "grad_norm": 0.6211283206939697, "learning_rate": 0.00019590656284760848, "loss": 4.697, "step": 98 }, { "epoch": 1.3210702341137124, "grad_norm": 0.5627567172050476, "learning_rate": 0.00019586206896551723, "loss": 4.4892, "step": 99 }, { "epoch": 1.334448160535117, "grad_norm": 0.6174790859222412, "learning_rate": 0.00019581757508342604, "loss": 4.5686, "step": 100 }, { "epoch": 1.3478260869565217, "grad_norm": 0.5586990118026733, "learning_rate": 0.00019577308120133482, "loss": 4.3916, "step": 101 }, { "epoch": 1.3612040133779264, "grad_norm": 0.5655365586280823, "learning_rate": 0.0001957285873192436, "loss": 4.1789, "step": 102 }, { "epoch": 1.374581939799331, "grad_norm": 0.5834594964981079, "learning_rate": 0.0001956840934371524, "loss": 4.3316, "step": 103 }, { "epoch": 1.3879598662207357, "grad_norm": 0.6065447926521301, "learning_rate": 0.0001956395995550612, "loss": 4.5167, "step": 104 }, { "epoch": 1.4013377926421404, "grad_norm": 0.5250216722488403, "learning_rate": 0.00019559510567296998, "loss": 4.1718, "step": 105 }, { "epoch": 1.414715719063545, "grad_norm": 0.5861116051673889, "learning_rate": 0.00019555061179087876, "loss": 4.3077, "step": 106 }, { "epoch": 1.4280936454849498, "grad_norm": 0.6138104796409607, "learning_rate": 0.00019550611790878755, "loss": 4.4748, "step": 107 }, { "epoch": 1.4414715719063544, "grad_norm": 0.6742071509361267, "learning_rate": 0.00019546162402669636, "loss": 4.8769, "step": 108 }, { "epoch": 1.4548494983277591, "grad_norm": 0.6634951233863831, "learning_rate": 0.0001954171301446051, "loss": 4.6423, "step": 109 }, { "epoch": 1.468227424749164, "grad_norm": 0.626646876335144, "learning_rate": 0.00019537263626251392, "loss": 4.4654, "step": 110 }, { "epoch": 1.4816053511705687, "grad_norm": 0.6306963562965393, "learning_rate": 0.0001953281423804227, "loss": 4.7021, "step": 111 }, { "epoch": 1.4949832775919734, "grad_norm": 0.620370626449585, "learning_rate": 0.00019528364849833149, "loss": 4.587, "step": 112 }, { "epoch": 1.508361204013378, "grad_norm": 0.6410287618637085, "learning_rate": 0.00019523915461624027, "loss": 4.8089, "step": 113 }, { "epoch": 1.5217391304347827, "grad_norm": 0.676434338092804, "learning_rate": 0.00019519466073414908, "loss": 4.668, "step": 114 }, { "epoch": 1.5351170568561874, "grad_norm": 0.5756319761276245, "learning_rate": 0.00019515016685205786, "loss": 4.3223, "step": 115 }, { "epoch": 1.548494983277592, "grad_norm": 0.5850693583488464, "learning_rate": 0.00019510567296996664, "loss": 4.2343, "step": 116 }, { "epoch": 1.5618729096989967, "grad_norm": 0.6172360777854919, "learning_rate": 0.00019506117908787542, "loss": 4.6102, "step": 117 }, { "epoch": 1.5752508361204014, "grad_norm": 0.5887568593025208, "learning_rate": 0.00019501668520578423, "loss": 4.8097, "step": 118 }, { "epoch": 1.588628762541806, "grad_norm": 0.5763369798660278, "learning_rate": 0.000194972191323693, "loss": 4.2001, "step": 119 }, { "epoch": 1.6020066889632107, "grad_norm": 0.6158986687660217, "learning_rate": 0.0001949276974416018, "loss": 4.7075, "step": 120 }, { "epoch": 1.6153846153846154, "grad_norm": 0.5540957450866699, "learning_rate": 0.00019488320355951058, "loss": 4.452, "step": 121 }, { "epoch": 1.62876254180602, "grad_norm": 0.6193795204162598, "learning_rate": 0.00019483870967741936, "loss": 4.4583, "step": 122 }, { "epoch": 1.6421404682274248, "grad_norm": 0.6699966788291931, "learning_rate": 0.00019479421579532815, "loss": 4.3728, "step": 123 }, { "epoch": 1.6555183946488294, "grad_norm": 0.5904677510261536, "learning_rate": 0.00019474972191323696, "loss": 4.5452, "step": 124 }, { "epoch": 1.6688963210702341, "grad_norm": 0.6137760281562805, "learning_rate": 0.00019470522803114574, "loss": 4.2853, "step": 125 }, { "epoch": 1.6822742474916388, "grad_norm": 0.6396192908287048, "learning_rate": 0.00019466073414905452, "loss": 4.4258, "step": 126 }, { "epoch": 1.6956521739130435, "grad_norm": 0.6190487742424011, "learning_rate": 0.0001946162402669633, "loss": 4.9866, "step": 127 }, { "epoch": 1.7090301003344481, "grad_norm": 0.6971675157546997, "learning_rate": 0.0001945717463848721, "loss": 4.2126, "step": 128 }, { "epoch": 1.7224080267558528, "grad_norm": 0.6245931386947632, "learning_rate": 0.00019452725250278087, "loss": 4.8477, "step": 129 }, { "epoch": 1.7357859531772575, "grad_norm": 0.5675052404403687, "learning_rate": 0.00019448275862068965, "loss": 4.4097, "step": 130 }, { "epoch": 1.7491638795986622, "grad_norm": 0.6594040393829346, "learning_rate": 0.00019443826473859846, "loss": 4.3747, "step": 131 }, { "epoch": 1.7625418060200668, "grad_norm": 0.6377655267715454, "learning_rate": 0.00019439377085650724, "loss": 4.2733, "step": 132 }, { "epoch": 1.7759197324414715, "grad_norm": 0.6167862415313721, "learning_rate": 0.00019434927697441602, "loss": 4.5694, "step": 133 }, { "epoch": 1.7892976588628762, "grad_norm": 0.577671468257904, "learning_rate": 0.0001943047830923248, "loss": 4.5006, "step": 134 }, { "epoch": 1.8026755852842808, "grad_norm": 0.6361016035079956, "learning_rate": 0.00019426028921023362, "loss": 4.9907, "step": 135 }, { "epoch": 1.8160535117056855, "grad_norm": 0.6445321440696716, "learning_rate": 0.00019421579532814237, "loss": 4.779, "step": 136 }, { "epoch": 1.8294314381270902, "grad_norm": 0.5955402851104736, "learning_rate": 0.00019417130144605118, "loss": 4.6026, "step": 137 }, { "epoch": 1.8428093645484949, "grad_norm": 0.6807080507278442, "learning_rate": 0.00019412680756395996, "loss": 4.7124, "step": 138 }, { "epoch": 1.8561872909698995, "grad_norm": 0.5799288153648376, "learning_rate": 0.00019408231368186875, "loss": 4.0701, "step": 139 }, { "epoch": 1.8695652173913042, "grad_norm": 0.6187757253646851, "learning_rate": 0.00019403781979977753, "loss": 4.705, "step": 140 }, { "epoch": 1.8829431438127089, "grad_norm": 0.6614826917648315, "learning_rate": 0.00019399332591768634, "loss": 4.8146, "step": 141 }, { "epoch": 1.8963210702341136, "grad_norm": 0.6204859614372253, "learning_rate": 0.00019394883203559512, "loss": 4.3041, "step": 142 }, { "epoch": 1.9096989966555182, "grad_norm": 0.6527450680732727, "learning_rate": 0.0001939043381535039, "loss": 4.4493, "step": 143 }, { "epoch": 1.9230769230769231, "grad_norm": 0.6470615267753601, "learning_rate": 0.00019385984427141268, "loss": 4.7771, "step": 144 }, { "epoch": 1.9364548494983278, "grad_norm": 0.5642555952072144, "learning_rate": 0.0001938153503893215, "loss": 4.3344, "step": 145 }, { "epoch": 1.9498327759197325, "grad_norm": 0.6206467151641846, "learning_rate": 0.00019377085650723025, "loss": 4.2191, "step": 146 }, { "epoch": 1.9632107023411371, "grad_norm": 0.6079016923904419, "learning_rate": 0.00019372636262513906, "loss": 4.7397, "step": 147 }, { "epoch": 1.9765886287625418, "grad_norm": 0.6197662353515625, "learning_rate": 0.00019368186874304784, "loss": 4.5342, "step": 148 }, { "epoch": 1.9899665551839465, "grad_norm": 0.6556297540664673, "learning_rate": 0.00019363737486095662, "loss": 4.6709, "step": 149 }, { "epoch": 2.0, "grad_norm": 0.7837930917739868, "learning_rate": 0.0001935928809788654, "loss": 4.6215, "step": 150 }, { "epoch": 2.0133779264214047, "grad_norm": 0.5267267227172852, "learning_rate": 0.00019354838709677422, "loss": 4.2695, "step": 151 }, { "epoch": 2.0267558528428093, "grad_norm": 0.5862157344818115, "learning_rate": 0.000193503893214683, "loss": 4.3702, "step": 152 }, { "epoch": 2.040133779264214, "grad_norm": 0.538254976272583, "learning_rate": 0.00019345939933259178, "loss": 4.3953, "step": 153 }, { "epoch": 2.0535117056856187, "grad_norm": 0.5977053642272949, "learning_rate": 0.00019341490545050056, "loss": 4.2156, "step": 154 }, { "epoch": 2.0668896321070234, "grad_norm": 0.606006383895874, "learning_rate": 0.00019337041156840937, "loss": 4.2802, "step": 155 }, { "epoch": 2.080267558528428, "grad_norm": 0.6071277856826782, "learning_rate": 0.00019332591768631813, "loss": 4.5545, "step": 156 }, { "epoch": 2.0936454849498327, "grad_norm": 0.6281546354293823, "learning_rate": 0.00019328142380422694, "loss": 4.6105, "step": 157 }, { "epoch": 2.1070234113712374, "grad_norm": 0.5703116655349731, "learning_rate": 0.00019323692992213572, "loss": 4.2751, "step": 158 }, { "epoch": 2.120401337792642, "grad_norm": 0.6587452292442322, "learning_rate": 0.0001931924360400445, "loss": 4.6342, "step": 159 }, { "epoch": 2.1337792642140467, "grad_norm": 0.6141905784606934, "learning_rate": 0.00019314794215795328, "loss": 4.4345, "step": 160 }, { "epoch": 2.1471571906354514, "grad_norm": 0.6741939187049866, "learning_rate": 0.0001931034482758621, "loss": 4.0257, "step": 161 }, { "epoch": 2.160535117056856, "grad_norm": 0.6468759179115295, "learning_rate": 0.00019305895439377088, "loss": 4.2313, "step": 162 }, { "epoch": 2.1739130434782608, "grad_norm": 0.6703383326530457, "learning_rate": 0.00019301446051167966, "loss": 4.2164, "step": 163 }, { "epoch": 2.1872909698996654, "grad_norm": 0.710967481136322, "learning_rate": 0.00019296996662958844, "loss": 4.3398, "step": 164 }, { "epoch": 2.20066889632107, "grad_norm": 0.6862124800682068, "learning_rate": 0.00019292547274749725, "loss": 4.3379, "step": 165 }, { "epoch": 2.2140468227424748, "grad_norm": 0.6288430690765381, "learning_rate": 0.000192880978865406, "loss": 4.3487, "step": 166 }, { "epoch": 2.2274247491638794, "grad_norm": 0.6358796954154968, "learning_rate": 0.00019283648498331481, "loss": 4.1656, "step": 167 }, { "epoch": 2.240802675585284, "grad_norm": 0.6818917393684387, "learning_rate": 0.0001927919911012236, "loss": 4.5363, "step": 168 }, { "epoch": 2.254180602006689, "grad_norm": 0.6996105313301086, "learning_rate": 0.00019274749721913238, "loss": 4.3208, "step": 169 }, { "epoch": 2.2675585284280935, "grad_norm": 0.6730326414108276, "learning_rate": 0.00019270300333704116, "loss": 4.1401, "step": 170 }, { "epoch": 2.280936454849498, "grad_norm": 0.7022603750228882, "learning_rate": 0.00019265850945494994, "loss": 4.5761, "step": 171 }, { "epoch": 2.294314381270903, "grad_norm": 0.6525995135307312, "learning_rate": 0.00019261401557285875, "loss": 4.4017, "step": 172 }, { "epoch": 2.3076923076923075, "grad_norm": 0.7066033482551575, "learning_rate": 0.0001925695216907675, "loss": 4.0037, "step": 173 }, { "epoch": 2.321070234113712, "grad_norm": 0.6708059310913086, "learning_rate": 0.00019252502780867632, "loss": 4.1947, "step": 174 }, { "epoch": 2.334448160535117, "grad_norm": 0.8711172342300415, "learning_rate": 0.0001924805339265851, "loss": 3.9958, "step": 175 }, { "epoch": 2.3478260869565215, "grad_norm": 0.7258634567260742, "learning_rate": 0.00019243604004449388, "loss": 4.4682, "step": 176 }, { "epoch": 2.361204013377926, "grad_norm": 0.7693021893501282, "learning_rate": 0.00019239154616240267, "loss": 4.54, "step": 177 }, { "epoch": 2.374581939799331, "grad_norm": 0.7271276116371155, "learning_rate": 0.00019234705228031148, "loss": 4.2942, "step": 178 }, { "epoch": 2.387959866220736, "grad_norm": 0.6836609244346619, "learning_rate": 0.00019230255839822026, "loss": 4.3099, "step": 179 }, { "epoch": 2.4013377926421406, "grad_norm": 0.731164813041687, "learning_rate": 0.00019225806451612904, "loss": 4.4077, "step": 180 }, { "epoch": 2.4147157190635453, "grad_norm": 0.7575274109840393, "learning_rate": 0.00019221357063403782, "loss": 4.6572, "step": 181 }, { "epoch": 2.42809364548495, "grad_norm": 0.8461325168609619, "learning_rate": 0.00019216907675194663, "loss": 4.4922, "step": 182 }, { "epoch": 2.4414715719063547, "grad_norm": 0.7225251197814941, "learning_rate": 0.0001921245828698554, "loss": 4.0372, "step": 183 }, { "epoch": 2.4548494983277593, "grad_norm": 3.563720703125, "learning_rate": 0.0001920800889877642, "loss": 4.5412, "step": 184 }, { "epoch": 2.468227424749164, "grad_norm": 0.8452121019363403, "learning_rate": 0.00019203559510567298, "loss": 4.4961, "step": 185 }, { "epoch": 2.4816053511705687, "grad_norm": 0.8734024167060852, "learning_rate": 0.00019199110122358176, "loss": 4.0884, "step": 186 }, { "epoch": 2.4949832775919734, "grad_norm": 1.1765823364257812, "learning_rate": 0.00019194660734149054, "loss": 4.2228, "step": 187 }, { "epoch": 2.508361204013378, "grad_norm": 0.750206708908081, "learning_rate": 0.00019190211345939935, "loss": 4.4305, "step": 188 }, { "epoch": 2.5217391304347827, "grad_norm": 0.7574430704116821, "learning_rate": 0.00019185761957730814, "loss": 4.1511, "step": 189 }, { "epoch": 2.5351170568561874, "grad_norm": 0.7105517387390137, "learning_rate": 0.00019181312569521692, "loss": 4.4793, "step": 190 }, { "epoch": 2.548494983277592, "grad_norm": 0.7495557069778442, "learning_rate": 0.0001917686318131257, "loss": 4.1335, "step": 191 }, { "epoch": 2.5618729096989967, "grad_norm": 0.8001168966293335, "learning_rate": 0.0001917241379310345, "loss": 4.7898, "step": 192 }, { "epoch": 2.5752508361204014, "grad_norm": 0.7402104735374451, "learning_rate": 0.00019167964404894327, "loss": 4.4482, "step": 193 }, { "epoch": 2.588628762541806, "grad_norm": 0.748267650604248, "learning_rate": 0.00019163515016685207, "loss": 4.3167, "step": 194 }, { "epoch": 2.6020066889632107, "grad_norm": 0.8291250467300415, "learning_rate": 0.00019159065628476086, "loss": 4.058, "step": 195 }, { "epoch": 2.6153846153846154, "grad_norm": 0.6945542693138123, "learning_rate": 0.00019154616240266964, "loss": 3.9751, "step": 196 }, { "epoch": 2.62876254180602, "grad_norm": 0.7307319045066833, "learning_rate": 0.00019150166852057842, "loss": 4.2736, "step": 197 }, { "epoch": 2.6421404682274248, "grad_norm": 0.7489168047904968, "learning_rate": 0.00019145717463848723, "loss": 4.3075, "step": 198 }, { "epoch": 2.6555183946488294, "grad_norm": 0.9727582931518555, "learning_rate": 0.00019141268075639601, "loss": 4.6474, "step": 199 }, { "epoch": 2.668896321070234, "grad_norm": 0.6776256561279297, "learning_rate": 0.0001913681868743048, "loss": 4.4217, "step": 200 }, { "epoch": 2.682274247491639, "grad_norm": 0.7305111885070801, "learning_rate": 0.00019132369299221358, "loss": 4.2804, "step": 201 }, { "epoch": 2.6956521739130435, "grad_norm": 0.7196978330612183, "learning_rate": 0.0001912791991101224, "loss": 4.3941, "step": 202 }, { "epoch": 2.709030100334448, "grad_norm": 0.7988458871841431, "learning_rate": 0.00019123470522803114, "loss": 4.437, "step": 203 }, { "epoch": 2.722408026755853, "grad_norm": 0.7004797458648682, "learning_rate": 0.00019119021134593995, "loss": 4.4986, "step": 204 }, { "epoch": 2.7357859531772575, "grad_norm": 0.677796483039856, "learning_rate": 0.00019114571746384874, "loss": 4.0851, "step": 205 }, { "epoch": 2.749163879598662, "grad_norm": 0.7527475357055664, "learning_rate": 0.00019110122358175752, "loss": 4.4469, "step": 206 }, { "epoch": 2.762541806020067, "grad_norm": 1.1659115552902222, "learning_rate": 0.0001910567296996663, "loss": 4.3284, "step": 207 }, { "epoch": 2.7759197324414715, "grad_norm": 0.7238364815711975, "learning_rate": 0.0001910122358175751, "loss": 4.2605, "step": 208 }, { "epoch": 2.789297658862876, "grad_norm": 0.7537760734558105, "learning_rate": 0.0001909677419354839, "loss": 4.3775, "step": 209 }, { "epoch": 2.802675585284281, "grad_norm": 0.6874127388000488, "learning_rate": 0.00019092324805339267, "loss": 4.3404, "step": 210 }, { "epoch": 2.8160535117056855, "grad_norm": 0.7045959830284119, "learning_rate": 0.00019087875417130146, "loss": 4.1568, "step": 211 }, { "epoch": 2.82943143812709, "grad_norm": 0.7249194383621216, "learning_rate": 0.00019083426028921027, "loss": 4.1969, "step": 212 }, { "epoch": 2.842809364548495, "grad_norm": 0.8331268429756165, "learning_rate": 0.00019078976640711902, "loss": 4.3169, "step": 213 }, { "epoch": 2.8561872909698995, "grad_norm": 0.7171936631202698, "learning_rate": 0.0001907452725250278, "loss": 4.5123, "step": 214 }, { "epoch": 2.869565217391304, "grad_norm": 0.759919285774231, "learning_rate": 0.0001907007786429366, "loss": 4.5412, "step": 215 }, { "epoch": 2.882943143812709, "grad_norm": 0.7451274991035461, "learning_rate": 0.0001906562847608454, "loss": 4.5253, "step": 216 }, { "epoch": 2.8963210702341136, "grad_norm": 0.6564481258392334, "learning_rate": 0.00019061179087875418, "loss": 4.1092, "step": 217 }, { "epoch": 2.9096989966555182, "grad_norm": 0.7339865565299988, "learning_rate": 0.00019056729699666296, "loss": 4.5092, "step": 218 }, { "epoch": 2.9230769230769234, "grad_norm": 0.7113937735557556, "learning_rate": 0.00019052280311457177, "loss": 4.3355, "step": 219 }, { "epoch": 2.936454849498328, "grad_norm": 0.7306456565856934, "learning_rate": 0.00019047830923248053, "loss": 4.5745, "step": 220 }, { "epoch": 2.9498327759197327, "grad_norm": 0.7971818447113037, "learning_rate": 0.00019043381535038933, "loss": 4.2903, "step": 221 }, { "epoch": 2.9632107023411374, "grad_norm": 0.7757331728935242, "learning_rate": 0.00019038932146829812, "loss": 4.2832, "step": 222 }, { "epoch": 2.976588628762542, "grad_norm": 0.7326288223266602, "learning_rate": 0.0001903448275862069, "loss": 4.2444, "step": 223 }, { "epoch": 2.9899665551839467, "grad_norm": 0.7363834381103516, "learning_rate": 0.00019030033370411568, "loss": 4.6744, "step": 224 }, { "epoch": 3.0, "grad_norm": 0.8835271596908569, "learning_rate": 0.0001902558398220245, "loss": 4.432, "step": 225 }, { "epoch": 3.0133779264214047, "grad_norm": 0.6591921448707581, "learning_rate": 0.00019021134593993327, "loss": 4.1353, "step": 226 }, { "epoch": 3.0267558528428093, "grad_norm": 0.6895263195037842, "learning_rate": 0.00019016685205784206, "loss": 4.1253, "step": 227 }, { "epoch": 3.040133779264214, "grad_norm": 0.6476898789405823, "learning_rate": 0.00019012235817575084, "loss": 4.0354, "step": 228 }, { "epoch": 3.0535117056856187, "grad_norm": 0.6398957967758179, "learning_rate": 0.00019007786429365965, "loss": 4.024, "step": 229 }, { "epoch": 3.0668896321070234, "grad_norm": 0.7483389973640442, "learning_rate": 0.0001900333704115684, "loss": 4.1405, "step": 230 }, { "epoch": 3.080267558528428, "grad_norm": 0.7003724575042725, "learning_rate": 0.0001899888765294772, "loss": 4.3593, "step": 231 }, { "epoch": 3.0936454849498327, "grad_norm": 0.7426732182502747, "learning_rate": 0.000189944382647386, "loss": 4.485, "step": 232 }, { "epoch": 3.1070234113712374, "grad_norm": 0.6957541108131409, "learning_rate": 0.00018989988876529478, "loss": 4.1017, "step": 233 }, { "epoch": 3.120401337792642, "grad_norm": 0.8613067865371704, "learning_rate": 0.00018985539488320356, "loss": 4.3038, "step": 234 }, { "epoch": 3.1337792642140467, "grad_norm": 0.8375754952430725, "learning_rate": 0.00018981090100111237, "loss": 4.4356, "step": 235 }, { "epoch": 3.1471571906354514, "grad_norm": 0.7878522872924805, "learning_rate": 0.00018976640711902115, "loss": 3.9916, "step": 236 }, { "epoch": 3.160535117056856, "grad_norm": 0.7463901042938232, "learning_rate": 0.00018972191323692993, "loss": 3.6761, "step": 237 }, { "epoch": 3.1739130434782608, "grad_norm": 0.7360939979553223, "learning_rate": 0.00018967741935483872, "loss": 3.9573, "step": 238 }, { "epoch": 3.1872909698996654, "grad_norm": 0.891861081123352, "learning_rate": 0.00018963292547274753, "loss": 4.1853, "step": 239 }, { "epoch": 3.20066889632107, "grad_norm": 0.8589549660682678, "learning_rate": 0.00018958843159065628, "loss": 4.0679, "step": 240 }, { "epoch": 3.2140468227424748, "grad_norm": 0.9534163475036621, "learning_rate": 0.0001895439377085651, "loss": 3.732, "step": 241 }, { "epoch": 3.2274247491638794, "grad_norm": 0.8968185186386108, "learning_rate": 0.00018949944382647387, "loss": 4.2217, "step": 242 }, { "epoch": 3.240802675585284, "grad_norm": 0.81589275598526, "learning_rate": 0.00018945494994438266, "loss": 4.428, "step": 243 }, { "epoch": 3.254180602006689, "grad_norm": 0.929050862789154, "learning_rate": 0.00018941045606229144, "loss": 4.3468, "step": 244 }, { "epoch": 3.2675585284280935, "grad_norm": 0.8535035252571106, "learning_rate": 0.00018936596218020025, "loss": 3.8489, "step": 245 }, { "epoch": 3.280936454849498, "grad_norm": 0.9484681487083435, "learning_rate": 0.00018932146829810903, "loss": 4.0132, "step": 246 }, { "epoch": 3.294314381270903, "grad_norm": 0.8190047144889832, "learning_rate": 0.0001892769744160178, "loss": 4.3574, "step": 247 }, { "epoch": 3.3076923076923075, "grad_norm": 0.8764749765396118, "learning_rate": 0.0001892324805339266, "loss": 4.3103, "step": 248 }, { "epoch": 3.321070234113712, "grad_norm": 0.8929185271263123, "learning_rate": 0.0001891879866518354, "loss": 4.3606, "step": 249 }, { "epoch": 3.334448160535117, "grad_norm": 0.9096692204475403, "learning_rate": 0.00018914349276974416, "loss": 4.0047, "step": 250 }, { "epoch": 3.3478260869565215, "grad_norm": 0.885143518447876, "learning_rate": 0.00018909899888765297, "loss": 4.182, "step": 251 }, { "epoch": 3.361204013377926, "grad_norm": 0.7724215984344482, "learning_rate": 0.00018905450500556175, "loss": 3.9529, "step": 252 }, { "epoch": 3.374581939799331, "grad_norm": 0.8351865410804749, "learning_rate": 0.00018901001112347053, "loss": 3.9533, "step": 253 }, { "epoch": 3.387959866220736, "grad_norm": 0.8684999942779541, "learning_rate": 0.00018896551724137932, "loss": 3.8594, "step": 254 }, { "epoch": 3.4013377926421406, "grad_norm": 0.8903334736824036, "learning_rate": 0.0001889210233592881, "loss": 3.9248, "step": 255 }, { "epoch": 3.4147157190635453, "grad_norm": 0.826690137386322, "learning_rate": 0.0001888765294771969, "loss": 4.0389, "step": 256 }, { "epoch": 3.42809364548495, "grad_norm": 0.8306142687797546, "learning_rate": 0.00018883203559510566, "loss": 3.8168, "step": 257 }, { "epoch": 3.4414715719063547, "grad_norm": 0.9032199382781982, "learning_rate": 0.00018878754171301447, "loss": 4.178, "step": 258 }, { "epoch": 3.4548494983277593, "grad_norm": 0.9081966280937195, "learning_rate": 0.00018874304783092326, "loss": 4.2583, "step": 259 }, { "epoch": 3.468227424749164, "grad_norm": 0.8424077033996582, "learning_rate": 0.00018869855394883204, "loss": 4.3285, "step": 260 }, { "epoch": 3.4816053511705687, "grad_norm": 0.8302170038223267, "learning_rate": 0.00018865406006674082, "loss": 4.1346, "step": 261 }, { "epoch": 3.4949832775919734, "grad_norm": 0.8747193217277527, "learning_rate": 0.00018860956618464963, "loss": 4.0747, "step": 262 }, { "epoch": 3.508361204013378, "grad_norm": 0.8613927364349365, "learning_rate": 0.0001885650723025584, "loss": 4.2346, "step": 263 }, { "epoch": 3.5217391304347827, "grad_norm": 0.8321558833122253, "learning_rate": 0.0001885205784204672, "loss": 3.9781, "step": 264 }, { "epoch": 3.5351170568561874, "grad_norm": 0.8961741328239441, "learning_rate": 0.00018847608453837598, "loss": 4.311, "step": 265 }, { "epoch": 3.548494983277592, "grad_norm": 0.7703898549079895, "learning_rate": 0.00018843159065628479, "loss": 4.1163, "step": 266 }, { "epoch": 3.5618729096989967, "grad_norm": 0.880051851272583, "learning_rate": 0.00018838709677419354, "loss": 3.8032, "step": 267 }, { "epoch": 3.5752508361204014, "grad_norm": 0.8287038207054138, "learning_rate": 0.00018834260289210235, "loss": 4.1627, "step": 268 }, { "epoch": 3.588628762541806, "grad_norm": 0.9726569652557373, "learning_rate": 0.00018829810901001113, "loss": 4.4055, "step": 269 }, { "epoch": 3.6020066889632107, "grad_norm": 0.8071132898330688, "learning_rate": 0.00018825361512791992, "loss": 4.1709, "step": 270 }, { "epoch": 3.6153846153846154, "grad_norm": 0.8310988545417786, "learning_rate": 0.0001882091212458287, "loss": 4.2359, "step": 271 }, { "epoch": 3.62876254180602, "grad_norm": 0.8713561296463013, "learning_rate": 0.0001881646273637375, "loss": 4.1247, "step": 272 }, { "epoch": 3.6421404682274248, "grad_norm": 0.8964342474937439, "learning_rate": 0.0001881201334816463, "loss": 4.0794, "step": 273 }, { "epoch": 3.6555183946488294, "grad_norm": 0.9901681542396545, "learning_rate": 0.00018807563959955507, "loss": 4.0217, "step": 274 }, { "epoch": 3.668896321070234, "grad_norm": 0.9279042482376099, "learning_rate": 0.00018803114571746385, "loss": 4.3244, "step": 275 }, { "epoch": 3.682274247491639, "grad_norm": 0.8105964660644531, "learning_rate": 0.00018798665183537266, "loss": 3.9041, "step": 276 }, { "epoch": 3.6956521739130435, "grad_norm": 0.8511622548103333, "learning_rate": 0.00018794215795328142, "loss": 3.8969, "step": 277 }, { "epoch": 3.709030100334448, "grad_norm": 0.9072037935256958, "learning_rate": 0.00018789766407119023, "loss": 4.2185, "step": 278 }, { "epoch": 3.722408026755853, "grad_norm": 0.9792962670326233, "learning_rate": 0.000187853170189099, "loss": 4.1915, "step": 279 }, { "epoch": 3.7357859531772575, "grad_norm": 0.8579828143119812, "learning_rate": 0.0001878086763070078, "loss": 3.8903, "step": 280 }, { "epoch": 3.749163879598662, "grad_norm": 0.9866719841957092, "learning_rate": 0.00018776418242491658, "loss": 4.2022, "step": 281 }, { "epoch": 3.762541806020067, "grad_norm": 0.9251964688301086, "learning_rate": 0.00018771968854282539, "loss": 3.9536, "step": 282 }, { "epoch": 3.7759197324414715, "grad_norm": 1.0300836563110352, "learning_rate": 0.00018767519466073417, "loss": 4.2908, "step": 283 }, { "epoch": 3.789297658862876, "grad_norm": 1.0194575786590576, "learning_rate": 0.00018763070077864295, "loss": 4.1851, "step": 284 }, { "epoch": 3.802675585284281, "grad_norm": 0.8165330290794373, "learning_rate": 0.00018758620689655173, "loss": 4.138, "step": 285 }, { "epoch": 3.8160535117056855, "grad_norm": 1.0104280710220337, "learning_rate": 0.00018754171301446054, "loss": 3.9481, "step": 286 }, { "epoch": 3.82943143812709, "grad_norm": 0.9972538352012634, "learning_rate": 0.0001874972191323693, "loss": 4.3932, "step": 287 }, { "epoch": 3.842809364548495, "grad_norm": 0.96323162317276, "learning_rate": 0.0001874527252502781, "loss": 4.1133, "step": 288 }, { "epoch": 3.8561872909698995, "grad_norm": 0.8500615954399109, "learning_rate": 0.0001874082313681869, "loss": 4.2205, "step": 289 }, { "epoch": 3.869565217391304, "grad_norm": 0.8451250195503235, "learning_rate": 0.00018736373748609567, "loss": 4.1371, "step": 290 }, { "epoch": 3.882943143812709, "grad_norm": 0.9399815201759338, "learning_rate": 0.00018731924360400445, "loss": 4.5237, "step": 291 }, { "epoch": 3.8963210702341136, "grad_norm": 0.8061622977256775, "learning_rate": 0.00018727474972191326, "loss": 4.1033, "step": 292 }, { "epoch": 3.9096989966555182, "grad_norm": 0.7987121343612671, "learning_rate": 0.00018723025583982205, "loss": 3.9311, "step": 293 }, { "epoch": 3.9230769230769234, "grad_norm": 0.9041138291358948, "learning_rate": 0.00018718576195773083, "loss": 4.0252, "step": 294 }, { "epoch": 3.936454849498328, "grad_norm": 1.0002484321594238, "learning_rate": 0.0001871412680756396, "loss": 4.4605, "step": 295 }, { "epoch": 3.9498327759197327, "grad_norm": 0.9991098046302795, "learning_rate": 0.0001870967741935484, "loss": 4.1528, "step": 296 }, { "epoch": 3.9632107023411374, "grad_norm": 1.2179397344589233, "learning_rate": 0.00018705228031145718, "loss": 4.5224, "step": 297 }, { "epoch": 3.976588628762542, "grad_norm": 0.8279774785041809, "learning_rate": 0.00018700778642936596, "loss": 3.9464, "step": 298 }, { "epoch": 3.9899665551839467, "grad_norm": 0.8012803792953491, "learning_rate": 0.00018696329254727477, "loss": 4.0139, "step": 299 }, { "epoch": 4.0, "grad_norm": 0.9700272083282471, "learning_rate": 0.00018691879866518355, "loss": 3.8306, "step": 300 }, { "epoch": 4.013377926421405, "grad_norm": 0.7136749625205994, "learning_rate": 0.00018687430478309233, "loss": 3.9253, "step": 301 }, { "epoch": 4.026755852842809, "grad_norm": 0.7885096669197083, "learning_rate": 0.00018682981090100111, "loss": 3.927, "step": 302 }, { "epoch": 4.040133779264214, "grad_norm": 0.7801666855812073, "learning_rate": 0.00018678531701890992, "loss": 3.6482, "step": 303 }, { "epoch": 4.053511705685619, "grad_norm": 0.7857955098152161, "learning_rate": 0.00018674082313681868, "loss": 4.0665, "step": 304 }, { "epoch": 4.066889632107023, "grad_norm": 0.707421064376831, "learning_rate": 0.0001866963292547275, "loss": 3.9142, "step": 305 }, { "epoch": 4.080267558528428, "grad_norm": 0.7936912775039673, "learning_rate": 0.00018665183537263627, "loss": 4.1227, "step": 306 }, { "epoch": 4.093645484949833, "grad_norm": 0.8899754881858826, "learning_rate": 0.00018660734149054505, "loss": 3.7661, "step": 307 }, { "epoch": 4.107023411371237, "grad_norm": 0.7760347723960876, "learning_rate": 0.00018656284760845384, "loss": 3.8921, "step": 308 }, { "epoch": 4.120401337792642, "grad_norm": 0.8672968745231628, "learning_rate": 0.00018651835372636265, "loss": 3.6037, "step": 309 }, { "epoch": 4.133779264214047, "grad_norm": 0.8046863675117493, "learning_rate": 0.0001864738598442714, "loss": 3.9117, "step": 310 }, { "epoch": 4.147157190635451, "grad_norm": 0.9172897934913635, "learning_rate": 0.0001864293659621802, "loss": 3.7229, "step": 311 }, { "epoch": 4.160535117056856, "grad_norm": 0.9616653919219971, "learning_rate": 0.000186384872080089, "loss": 3.8851, "step": 312 }, { "epoch": 4.173913043478261, "grad_norm": 0.9659278988838196, "learning_rate": 0.0001863403781979978, "loss": 4.005, "step": 313 }, { "epoch": 4.187290969899665, "grad_norm": 0.9171205163002014, "learning_rate": 0.00018629588431590656, "loss": 3.8634, "step": 314 }, { "epoch": 4.20066889632107, "grad_norm": 0.9968683123588562, "learning_rate": 0.00018625139043381537, "loss": 3.7321, "step": 315 }, { "epoch": 4.214046822742475, "grad_norm": 0.8762083053588867, "learning_rate": 0.00018620689655172415, "loss": 3.931, "step": 316 }, { "epoch": 4.2274247491638794, "grad_norm": 0.9815887212753296, "learning_rate": 0.00018616240266963293, "loss": 3.9975, "step": 317 }, { "epoch": 4.240802675585284, "grad_norm": 1.0065505504608154, "learning_rate": 0.00018611790878754171, "loss": 3.8364, "step": 318 }, { "epoch": 4.254180602006689, "grad_norm": 0.9785431623458862, "learning_rate": 0.00018607341490545052, "loss": 3.8822, "step": 319 }, { "epoch": 4.2675585284280935, "grad_norm": 1.077799677848816, "learning_rate": 0.00018602892102335928, "loss": 3.8299, "step": 320 }, { "epoch": 4.280936454849498, "grad_norm": 0.8109619617462158, "learning_rate": 0.0001859844271412681, "loss": 3.8096, "step": 321 }, { "epoch": 4.294314381270903, "grad_norm": 0.967856764793396, "learning_rate": 0.00018593993325917687, "loss": 3.8639, "step": 322 }, { "epoch": 4.3076923076923075, "grad_norm": 0.8657905459403992, "learning_rate": 0.00018589543937708568, "loss": 3.7556, "step": 323 }, { "epoch": 4.321070234113712, "grad_norm": 0.9641517400741577, "learning_rate": 0.00018585094549499444, "loss": 3.9702, "step": 324 }, { "epoch": 4.334448160535117, "grad_norm": 0.9664435982704163, "learning_rate": 0.00018580645161290325, "loss": 3.8754, "step": 325 }, { "epoch": 4.3478260869565215, "grad_norm": 0.8322617411613464, "learning_rate": 0.00018576195773081203, "loss": 3.83, "step": 326 }, { "epoch": 4.361204013377926, "grad_norm": 1.0363450050354004, "learning_rate": 0.0001857174638487208, "loss": 3.9825, "step": 327 }, { "epoch": 4.374581939799331, "grad_norm": 1.0125840902328491, "learning_rate": 0.0001856729699666296, "loss": 3.6525, "step": 328 }, { "epoch": 4.3879598662207355, "grad_norm": 0.9922601580619812, "learning_rate": 0.0001856284760845384, "loss": 4.2373, "step": 329 }, { "epoch": 4.40133779264214, "grad_norm": 0.9070426225662231, "learning_rate": 0.00018558398220244716, "loss": 3.9623, "step": 330 }, { "epoch": 4.414715719063545, "grad_norm": 0.9369637370109558, "learning_rate": 0.00018553948832035597, "loss": 3.9297, "step": 331 }, { "epoch": 4.4280936454849495, "grad_norm": 1.108876347541809, "learning_rate": 0.00018549499443826475, "loss": 3.7325, "step": 332 }, { "epoch": 4.441471571906354, "grad_norm": 0.9405660629272461, "learning_rate": 0.00018545050055617356, "loss": 3.8615, "step": 333 }, { "epoch": 4.454849498327759, "grad_norm": 0.9730128645896912, "learning_rate": 0.00018540600667408231, "loss": 4.1794, "step": 334 }, { "epoch": 4.468227424749164, "grad_norm": 0.9341335892677307, "learning_rate": 0.00018536151279199112, "loss": 3.9422, "step": 335 }, { "epoch": 4.481605351170568, "grad_norm": 0.9262625575065613, "learning_rate": 0.0001853170189098999, "loss": 3.9819, "step": 336 }, { "epoch": 4.494983277591973, "grad_norm": 1.0419141054153442, "learning_rate": 0.00018527252502780866, "loss": 3.7481, "step": 337 }, { "epoch": 4.508361204013378, "grad_norm": 0.8986826539039612, "learning_rate": 0.00018522803114571747, "loss": 4.1195, "step": 338 }, { "epoch": 4.521739130434782, "grad_norm": 0.9502431154251099, "learning_rate": 0.00018518353726362625, "loss": 3.8521, "step": 339 }, { "epoch": 4.535117056856187, "grad_norm": 0.8936267495155334, "learning_rate": 0.00018513904338153504, "loss": 3.647, "step": 340 }, { "epoch": 4.548494983277592, "grad_norm": 0.8870158195495605, "learning_rate": 0.00018509454949944382, "loss": 3.7624, "step": 341 }, { "epoch": 4.561872909698996, "grad_norm": 0.9030978679656982, "learning_rate": 0.00018505005561735263, "loss": 3.8018, "step": 342 }, { "epoch": 4.575250836120401, "grad_norm": 0.8690946698188782, "learning_rate": 0.0001850055617352614, "loss": 3.6183, "step": 343 }, { "epoch": 4.588628762541806, "grad_norm": 0.9812071323394775, "learning_rate": 0.0001849610678531702, "loss": 4.1231, "step": 344 }, { "epoch": 4.602006688963211, "grad_norm": 0.9404383301734924, "learning_rate": 0.00018491657397107897, "loss": 3.9645, "step": 345 }, { "epoch": 4.615384615384615, "grad_norm": 1.0422123670578003, "learning_rate": 0.00018487208008898778, "loss": 3.9031, "step": 346 }, { "epoch": 4.6287625418060205, "grad_norm": 0.9838129281997681, "learning_rate": 0.00018482758620689654, "loss": 3.9985, "step": 347 }, { "epoch": 4.642140468227424, "grad_norm": 0.9232532978057861, "learning_rate": 0.00018478309232480535, "loss": 4.0343, "step": 348 }, { "epoch": 4.65551839464883, "grad_norm": 0.9242956042289734, "learning_rate": 0.00018473859844271413, "loss": 4.0669, "step": 349 }, { "epoch": 4.668896321070234, "grad_norm": 0.919269859790802, "learning_rate": 0.0001846941045606229, "loss": 4.0549, "step": 350 }, { "epoch": 4.682274247491639, "grad_norm": 0.93565833568573, "learning_rate": 0.0001846496106785317, "loss": 4.1306, "step": 351 }, { "epoch": 4.695652173913043, "grad_norm": 0.9001899361610413, "learning_rate": 0.0001846051167964405, "loss": 3.8916, "step": 352 }, { "epoch": 4.709030100334449, "grad_norm": 0.8896821737289429, "learning_rate": 0.0001845606229143493, "loss": 3.8377, "step": 353 }, { "epoch": 4.722408026755852, "grad_norm": 1.0137807130813599, "learning_rate": 0.00018451612903225807, "loss": 3.9923, "step": 354 }, { "epoch": 4.735785953177258, "grad_norm": 1.075823426246643, "learning_rate": 0.00018447163515016685, "loss": 4.0706, "step": 355 }, { "epoch": 4.749163879598662, "grad_norm": 1.0076895952224731, "learning_rate": 0.00018442714126807566, "loss": 4.0759, "step": 356 }, { "epoch": 4.762541806020067, "grad_norm": 0.9387428164482117, "learning_rate": 0.00018438264738598442, "loss": 3.6959, "step": 357 }, { "epoch": 4.775919732441472, "grad_norm": 0.8920648097991943, "learning_rate": 0.00018433815350389323, "loss": 3.9213, "step": 358 }, { "epoch": 4.789297658862877, "grad_norm": 1.0252491235733032, "learning_rate": 0.000184293659621802, "loss": 3.9118, "step": 359 }, { "epoch": 4.802675585284281, "grad_norm": 1.0382707118988037, "learning_rate": 0.0001842491657397108, "loss": 4.0172, "step": 360 }, { "epoch": 4.816053511705686, "grad_norm": 1.07838773727417, "learning_rate": 0.00018420467185761957, "loss": 3.8531, "step": 361 }, { "epoch": 4.829431438127091, "grad_norm": 0.9974546432495117, "learning_rate": 0.00018416017797552838, "loss": 4.0387, "step": 362 }, { "epoch": 4.842809364548495, "grad_norm": 1.024491548538208, "learning_rate": 0.00018411568409343717, "loss": 3.9504, "step": 363 }, { "epoch": 4.8561872909699, "grad_norm": 0.9236369132995605, "learning_rate": 0.00018407119021134595, "loss": 3.7119, "step": 364 }, { "epoch": 4.869565217391305, "grad_norm": 0.935644268989563, "learning_rate": 0.00018402669632925473, "loss": 4.0077, "step": 365 }, { "epoch": 4.882943143812709, "grad_norm": 0.9328681230545044, "learning_rate": 0.00018398220244716354, "loss": 3.9133, "step": 366 }, { "epoch": 4.896321070234114, "grad_norm": 0.9596607089042664, "learning_rate": 0.0001839377085650723, "loss": 3.8003, "step": 367 }, { "epoch": 4.909698996655519, "grad_norm": 0.9878052473068237, "learning_rate": 0.0001838932146829811, "loss": 3.8805, "step": 368 }, { "epoch": 4.923076923076923, "grad_norm": 1.00381600856781, "learning_rate": 0.0001838487208008899, "loss": 4.0264, "step": 369 }, { "epoch": 4.936454849498328, "grad_norm": 1.024754524230957, "learning_rate": 0.00018380422691879867, "loss": 3.7291, "step": 370 }, { "epoch": 4.949832775919733, "grad_norm": 0.9670823812484741, "learning_rate": 0.00018375973303670745, "loss": 3.9418, "step": 371 }, { "epoch": 4.963210702341137, "grad_norm": 0.9736581444740295, "learning_rate": 0.00018371523915461626, "loss": 3.8813, "step": 372 }, { "epoch": 4.976588628762542, "grad_norm": 0.9752672910690308, "learning_rate": 0.00018367074527252504, "loss": 3.6717, "step": 373 }, { "epoch": 4.989966555183947, "grad_norm": 1.1268304586410522, "learning_rate": 0.00018362625139043383, "loss": 3.9782, "step": 374 }, { "epoch": 5.0, "grad_norm": 1.7933701276779175, "learning_rate": 0.0001835817575083426, "loss": 3.001, "step": 375 }, { "epoch": 5.013377926421405, "grad_norm": 0.8035010099411011, "learning_rate": 0.00018353726362625142, "loss": 3.7943, "step": 376 }, { "epoch": 5.026755852842809, "grad_norm": 0.8016420006752014, "learning_rate": 0.00018349276974416017, "loss": 3.7454, "step": 377 }, { "epoch": 5.040133779264214, "grad_norm": 0.6844643354415894, "learning_rate": 0.00018344827586206896, "loss": 3.699, "step": 378 }, { "epoch": 5.053511705685619, "grad_norm": 0.8649943470954895, "learning_rate": 0.00018340378197997777, "loss": 3.7197, "step": 379 }, { "epoch": 5.066889632107023, "grad_norm": 0.9685015678405762, "learning_rate": 0.00018335928809788655, "loss": 3.6952, "step": 380 }, { "epoch": 5.080267558528428, "grad_norm": 0.8728330135345459, "learning_rate": 0.00018331479421579533, "loss": 3.7164, "step": 381 }, { "epoch": 5.093645484949833, "grad_norm": 0.962504506111145, "learning_rate": 0.0001832703003337041, "loss": 3.6123, "step": 382 }, { "epoch": 5.107023411371237, "grad_norm": 0.9194462895393372, "learning_rate": 0.00018322580645161292, "loss": 3.275, "step": 383 }, { "epoch": 5.120401337792642, "grad_norm": 0.9851329326629639, "learning_rate": 0.00018318131256952168, "loss": 3.6222, "step": 384 }, { "epoch": 5.133779264214047, "grad_norm": 1.0702580213546753, "learning_rate": 0.0001831368186874305, "loss": 3.8728, "step": 385 }, { "epoch": 5.147157190635451, "grad_norm": 1.3237228393554688, "learning_rate": 0.00018309232480533927, "loss": 3.8948, "step": 386 }, { "epoch": 5.160535117056856, "grad_norm": 1.0076218843460083, "learning_rate": 0.00018304783092324805, "loss": 3.8894, "step": 387 }, { "epoch": 5.173913043478261, "grad_norm": 1.084722876548767, "learning_rate": 0.00018300333704115683, "loss": 3.7398, "step": 388 }, { "epoch": 5.187290969899665, "grad_norm": 0.9112711548805237, "learning_rate": 0.00018295884315906564, "loss": 3.5901, "step": 389 }, { "epoch": 5.20066889632107, "grad_norm": 0.9451406002044678, "learning_rate": 0.00018291434927697443, "loss": 3.6313, "step": 390 }, { "epoch": 5.214046822742475, "grad_norm": 0.8901047706604004, "learning_rate": 0.0001828698553948832, "loss": 3.3191, "step": 391 }, { "epoch": 5.2274247491638794, "grad_norm": 0.9838565587997437, "learning_rate": 0.000182825361512792, "loss": 3.882, "step": 392 }, { "epoch": 5.240802675585284, "grad_norm": 0.9839156866073608, "learning_rate": 0.0001827808676307008, "loss": 3.6068, "step": 393 }, { "epoch": 5.254180602006689, "grad_norm": 0.9328583478927612, "learning_rate": 0.00018273637374860956, "loss": 3.6856, "step": 394 }, { "epoch": 5.2675585284280935, "grad_norm": 0.8705796003341675, "learning_rate": 0.00018269187986651837, "loss": 3.7282, "step": 395 }, { "epoch": 5.280936454849498, "grad_norm": 0.9675374031066895, "learning_rate": 0.00018264738598442715, "loss": 3.6588, "step": 396 }, { "epoch": 5.294314381270903, "grad_norm": 1.145280361175537, "learning_rate": 0.00018260289210233593, "loss": 3.8843, "step": 397 }, { "epoch": 5.3076923076923075, "grad_norm": 0.9769694805145264, "learning_rate": 0.0001825583982202447, "loss": 3.7207, "step": 398 }, { "epoch": 5.321070234113712, "grad_norm": 0.9277816414833069, "learning_rate": 0.00018251390433815352, "loss": 3.712, "step": 399 }, { "epoch": 5.334448160535117, "grad_norm": 1.1015180349349976, "learning_rate": 0.0001824694104560623, "loss": 3.7941, "step": 400 }, { "epoch": 5.3478260869565215, "grad_norm": 1.2234200239181519, "learning_rate": 0.0001824249165739711, "loss": 3.6559, "step": 401 }, { "epoch": 5.361204013377926, "grad_norm": 0.9358471035957336, "learning_rate": 0.00018238042269187987, "loss": 3.7665, "step": 402 }, { "epoch": 5.374581939799331, "grad_norm": 0.8287034630775452, "learning_rate": 0.00018233592880978868, "loss": 3.8265, "step": 403 }, { "epoch": 5.3879598662207355, "grad_norm": 1.0219204425811768, "learning_rate": 0.00018229143492769743, "loss": 3.6829, "step": 404 }, { "epoch": 5.40133779264214, "grad_norm": 1.0601041316986084, "learning_rate": 0.00018224694104560624, "loss": 3.5879, "step": 405 }, { "epoch": 5.414715719063545, "grad_norm": 1.2221566438674927, "learning_rate": 0.00018220244716351503, "loss": 3.6023, "step": 406 }, { "epoch": 5.4280936454849495, "grad_norm": 0.9589087963104248, "learning_rate": 0.0001821579532814238, "loss": 3.9109, "step": 407 }, { "epoch": 5.441471571906354, "grad_norm": 1.088295340538025, "learning_rate": 0.0001821134593993326, "loss": 3.5695, "step": 408 }, { "epoch": 5.454849498327759, "grad_norm": 1.1284915208816528, "learning_rate": 0.0001820689655172414, "loss": 3.6004, "step": 409 }, { "epoch": 5.468227424749164, "grad_norm": 1.0108689069747925, "learning_rate": 0.00018202447163515018, "loss": 3.7797, "step": 410 }, { "epoch": 5.481605351170568, "grad_norm": 0.8550918102264404, "learning_rate": 0.00018197997775305896, "loss": 3.5758, "step": 411 }, { "epoch": 5.494983277591973, "grad_norm": 0.8765145540237427, "learning_rate": 0.00018193548387096775, "loss": 3.7625, "step": 412 }, { "epoch": 5.508361204013378, "grad_norm": 1.0253541469573975, "learning_rate": 0.00018189098998887656, "loss": 3.6033, "step": 413 }, { "epoch": 5.521739130434782, "grad_norm": 1.0475622415542603, "learning_rate": 0.0001818464961067853, "loss": 3.813, "step": 414 }, { "epoch": 5.535117056856187, "grad_norm": 1.053133249282837, "learning_rate": 0.00018180200222469412, "loss": 3.4779, "step": 415 }, { "epoch": 5.548494983277592, "grad_norm": 1.0151216983795166, "learning_rate": 0.0001817575083426029, "loss": 3.9038, "step": 416 }, { "epoch": 5.561872909698996, "grad_norm": 1.4666434526443481, "learning_rate": 0.00018171301446051169, "loss": 3.4735, "step": 417 }, { "epoch": 5.575250836120401, "grad_norm": 1.1043344736099243, "learning_rate": 0.00018166852057842047, "loss": 3.7449, "step": 418 }, { "epoch": 5.588628762541806, "grad_norm": 0.900745153427124, "learning_rate": 0.00018162402669632925, "loss": 3.7401, "step": 419 }, { "epoch": 5.602006688963211, "grad_norm": 0.9771101474761963, "learning_rate": 0.00018157953281423806, "loss": 3.8328, "step": 420 }, { "epoch": 5.615384615384615, "grad_norm": 0.9099516272544861, "learning_rate": 0.00018153503893214682, "loss": 3.6245, "step": 421 }, { "epoch": 5.6287625418060205, "grad_norm": 0.9844585657119751, "learning_rate": 0.00018149054505005563, "loss": 3.5776, "step": 422 }, { "epoch": 5.642140468227424, "grad_norm": 1.0481154918670654, "learning_rate": 0.0001814460511679644, "loss": 3.6304, "step": 423 }, { "epoch": 5.65551839464883, "grad_norm": 0.9971081614494324, "learning_rate": 0.0001814015572858732, "loss": 3.7863, "step": 424 }, { "epoch": 5.668896321070234, "grad_norm": 0.9247872829437256, "learning_rate": 0.00018135706340378197, "loss": 3.7319, "step": 425 }, { "epoch": 5.682274247491639, "grad_norm": 0.9895725846290588, "learning_rate": 0.00018131256952169078, "loss": 3.78, "step": 426 }, { "epoch": 5.695652173913043, "grad_norm": 1.0847641229629517, "learning_rate": 0.00018126807563959956, "loss": 3.8662, "step": 427 }, { "epoch": 5.709030100334449, "grad_norm": 0.986259937286377, "learning_rate": 0.00018122358175750835, "loss": 3.5621, "step": 428 }, { "epoch": 5.722408026755852, "grad_norm": 0.9166681170463562, "learning_rate": 0.00018117908787541713, "loss": 3.6153, "step": 429 }, { "epoch": 5.735785953177258, "grad_norm": 1.1331177949905396, "learning_rate": 0.00018113459399332594, "loss": 3.5976, "step": 430 }, { "epoch": 5.749163879598662, "grad_norm": 0.8743540644645691, "learning_rate": 0.0001810901001112347, "loss": 3.2511, "step": 431 }, { "epoch": 5.762541806020067, "grad_norm": 1.0700207948684692, "learning_rate": 0.0001810456062291435, "loss": 3.7634, "step": 432 }, { "epoch": 5.775919732441472, "grad_norm": 0.9412694573402405, "learning_rate": 0.00018100111234705229, "loss": 3.6264, "step": 433 }, { "epoch": 5.789297658862877, "grad_norm": 1.0398496389389038, "learning_rate": 0.00018095661846496107, "loss": 3.744, "step": 434 }, { "epoch": 5.802675585284281, "grad_norm": 0.9605004787445068, "learning_rate": 0.00018091212458286985, "loss": 3.5532, "step": 435 }, { "epoch": 5.816053511705686, "grad_norm": 1.0449095964431763, "learning_rate": 0.00018086763070077866, "loss": 4.0611, "step": 436 }, { "epoch": 5.829431438127091, "grad_norm": 0.9342606663703918, "learning_rate": 0.00018082313681868744, "loss": 3.9957, "step": 437 }, { "epoch": 5.842809364548495, "grad_norm": 0.9687880873680115, "learning_rate": 0.00018077864293659622, "loss": 3.9299, "step": 438 }, { "epoch": 5.8561872909699, "grad_norm": 1.1390576362609863, "learning_rate": 0.000180734149054505, "loss": 3.6552, "step": 439 }, { "epoch": 5.869565217391305, "grad_norm": 0.9280988574028015, "learning_rate": 0.00018068965517241382, "loss": 3.7828, "step": 440 }, { "epoch": 5.882943143812709, "grad_norm": 1.2928193807601929, "learning_rate": 0.00018064516129032257, "loss": 3.6292, "step": 441 }, { "epoch": 5.896321070234114, "grad_norm": 1.0959875583648682, "learning_rate": 0.00018060066740823138, "loss": 3.4293, "step": 442 }, { "epoch": 5.909698996655519, "grad_norm": 1.0713289976119995, "learning_rate": 0.00018055617352614016, "loss": 3.7767, "step": 443 }, { "epoch": 5.923076923076923, "grad_norm": 0.9309440851211548, "learning_rate": 0.00018051167964404895, "loss": 3.5473, "step": 444 }, { "epoch": 5.936454849498328, "grad_norm": 1.0999056100845337, "learning_rate": 0.00018046718576195773, "loss": 3.9694, "step": 445 }, { "epoch": 5.949832775919733, "grad_norm": 1.1073781251907349, "learning_rate": 0.00018042269187986654, "loss": 3.6882, "step": 446 }, { "epoch": 5.963210702341137, "grad_norm": 1.0430257320404053, "learning_rate": 0.00018037819799777532, "loss": 3.4009, "step": 447 }, { "epoch": 5.976588628762542, "grad_norm": 1.1132690906524658, "learning_rate": 0.0001803337041156841, "loss": 3.8832, "step": 448 }, { "epoch": 5.989966555183947, "grad_norm": 1.0147771835327148, "learning_rate": 0.00018028921023359289, "loss": 3.7117, "step": 449 }, { "epoch": 6.0, "grad_norm": 1.458959698677063, "learning_rate": 0.0001802447163515017, "loss": 3.5745, "step": 450 }, { "epoch": 6.013377926421405, "grad_norm": 0.8363592028617859, "learning_rate": 0.00018020022246941045, "loss": 3.5835, "step": 451 }, { "epoch": 6.026755852842809, "grad_norm": 0.8071937561035156, "learning_rate": 0.00018015572858731926, "loss": 3.5923, "step": 452 }, { "epoch": 6.040133779264214, "grad_norm": 0.7746313214302063, "learning_rate": 0.00018011123470522804, "loss": 3.5688, "step": 453 }, { "epoch": 6.053511705685619, "grad_norm": 0.689179539680481, "learning_rate": 0.00018006674082313682, "loss": 3.412, "step": 454 }, { "epoch": 6.066889632107023, "grad_norm": 0.8438050746917725, "learning_rate": 0.0001800222469410456, "loss": 3.403, "step": 455 }, { "epoch": 6.080267558528428, "grad_norm": 0.7670062780380249, "learning_rate": 0.00017997775305895442, "loss": 3.5029, "step": 456 }, { "epoch": 6.093645484949833, "grad_norm": 0.8185870051383972, "learning_rate": 0.0001799332591768632, "loss": 3.4584, "step": 457 }, { "epoch": 6.107023411371237, "grad_norm": 0.9618543386459351, "learning_rate": 0.00017988876529477198, "loss": 3.6538, "step": 458 }, { "epoch": 6.120401337792642, "grad_norm": 0.959724485874176, "learning_rate": 0.00017984427141268076, "loss": 3.5284, "step": 459 }, { "epoch": 6.133779264214047, "grad_norm": 0.8044765591621399, "learning_rate": 0.00017979977753058955, "loss": 3.2198, "step": 460 }, { "epoch": 6.147157190635451, "grad_norm": 0.8287092447280884, "learning_rate": 0.00017975528364849833, "loss": 3.4977, "step": 461 }, { "epoch": 6.160535117056856, "grad_norm": 0.8855329155921936, "learning_rate": 0.0001797107897664071, "loss": 3.5008, "step": 462 }, { "epoch": 6.173913043478261, "grad_norm": 0.8839483857154846, "learning_rate": 0.00017966629588431592, "loss": 3.6135, "step": 463 }, { "epoch": 6.187290969899665, "grad_norm": 0.963446319103241, "learning_rate": 0.0001796218020022247, "loss": 3.6156, "step": 464 }, { "epoch": 6.20066889632107, "grad_norm": 0.896743655204773, "learning_rate": 0.00017957730812013348, "loss": 3.6623, "step": 465 }, { "epoch": 6.214046822742475, "grad_norm": 0.9268617033958435, "learning_rate": 0.00017953281423804227, "loss": 3.4343, "step": 466 }, { "epoch": 6.2274247491638794, "grad_norm": 0.8335449695587158, "learning_rate": 0.00017948832035595108, "loss": 3.5716, "step": 467 }, { "epoch": 6.240802675585284, "grad_norm": 0.7771849036216736, "learning_rate": 0.00017944382647385983, "loss": 3.5191, "step": 468 }, { "epoch": 6.254180602006689, "grad_norm": 0.9157488346099854, "learning_rate": 0.00017939933259176864, "loss": 3.5583, "step": 469 }, { "epoch": 6.2675585284280935, "grad_norm": 0.9348477721214294, "learning_rate": 0.00017935483870967742, "loss": 3.3137, "step": 470 }, { "epoch": 6.280936454849498, "grad_norm": 0.8791135549545288, "learning_rate": 0.0001793103448275862, "loss": 3.5111, "step": 471 }, { "epoch": 6.294314381270903, "grad_norm": 0.9963672757148743, "learning_rate": 0.000179265850945495, "loss": 3.7518, "step": 472 }, { "epoch": 6.3076923076923075, "grad_norm": 0.9291539192199707, "learning_rate": 0.0001792213570634038, "loss": 3.4524, "step": 473 }, { "epoch": 6.321070234113712, "grad_norm": 0.9349279403686523, "learning_rate": 0.00017917686318131258, "loss": 3.4753, "step": 474 }, { "epoch": 6.334448160535117, "grad_norm": 0.8984476327896118, "learning_rate": 0.00017913236929922136, "loss": 3.7325, "step": 475 }, { "epoch": 6.3478260869565215, "grad_norm": 0.8452139496803284, "learning_rate": 0.00017908787541713015, "loss": 3.8021, "step": 476 }, { "epoch": 6.361204013377926, "grad_norm": 0.9418376088142395, "learning_rate": 0.00017904338153503895, "loss": 3.7426, "step": 477 }, { "epoch": 6.374581939799331, "grad_norm": 1.0661097764968872, "learning_rate": 0.0001789988876529477, "loss": 3.7556, "step": 478 }, { "epoch": 6.3879598662207355, "grad_norm": 0.9645984768867493, "learning_rate": 0.00017895439377085652, "loss": 3.3353, "step": 479 }, { "epoch": 6.40133779264214, "grad_norm": 0.9243470430374146, "learning_rate": 0.0001789098998887653, "loss": 3.5729, "step": 480 }, { "epoch": 6.414715719063545, "grad_norm": 0.885061502456665, "learning_rate": 0.00017886540600667408, "loss": 3.5699, "step": 481 }, { "epoch": 6.4280936454849495, "grad_norm": 0.9025402069091797, "learning_rate": 0.00017882091212458287, "loss": 3.4532, "step": 482 }, { "epoch": 6.441471571906354, "grad_norm": 0.9760842323303223, "learning_rate": 0.00017877641824249168, "loss": 3.7222, "step": 483 }, { "epoch": 6.454849498327759, "grad_norm": 1.2709609270095825, "learning_rate": 0.00017873192436040046, "loss": 3.756, "step": 484 }, { "epoch": 6.468227424749164, "grad_norm": 0.904513955116272, "learning_rate": 0.00017868743047830924, "loss": 3.657, "step": 485 }, { "epoch": 6.481605351170568, "grad_norm": 1.158915400505066, "learning_rate": 0.00017864293659621802, "loss": 3.5897, "step": 486 }, { "epoch": 6.494983277591973, "grad_norm": 0.9457879066467285, "learning_rate": 0.00017859844271412683, "loss": 3.4394, "step": 487 }, { "epoch": 6.508361204013378, "grad_norm": 1.04762601852417, "learning_rate": 0.0001785539488320356, "loss": 3.39, "step": 488 }, { "epoch": 6.521739130434782, "grad_norm": 0.9370948076248169, "learning_rate": 0.0001785094549499444, "loss": 3.4816, "step": 489 }, { "epoch": 6.535117056856187, "grad_norm": 1.1307988166809082, "learning_rate": 0.00017846496106785318, "loss": 3.5079, "step": 490 }, { "epoch": 6.548494983277592, "grad_norm": 1.0025054216384888, "learning_rate": 0.00017842046718576196, "loss": 3.3447, "step": 491 }, { "epoch": 6.561872909698996, "grad_norm": 0.9893412590026855, "learning_rate": 0.00017837597330367074, "loss": 3.4698, "step": 492 }, { "epoch": 6.575250836120401, "grad_norm": 1.0476347208023071, "learning_rate": 0.00017833147942157955, "loss": 3.4955, "step": 493 }, { "epoch": 6.588628762541806, "grad_norm": 1.0508891344070435, "learning_rate": 0.00017828698553948834, "loss": 3.6568, "step": 494 }, { "epoch": 6.602006688963211, "grad_norm": 1.0397465229034424, "learning_rate": 0.00017824249165739712, "loss": 3.5087, "step": 495 }, { "epoch": 6.615384615384615, "grad_norm": 0.9884181022644043, "learning_rate": 0.0001781979977753059, "loss": 3.6778, "step": 496 }, { "epoch": 6.6287625418060205, "grad_norm": 1.1187562942504883, "learning_rate": 0.0001781535038932147, "loss": 3.4345, "step": 497 }, { "epoch": 6.642140468227424, "grad_norm": 1.1133880615234375, "learning_rate": 0.00017810901001112347, "loss": 3.4959, "step": 498 }, { "epoch": 6.65551839464883, "grad_norm": 0.9368589520454407, "learning_rate": 0.00017806451612903228, "loss": 3.628, "step": 499 }, { "epoch": 6.668896321070234, "grad_norm": 1.0427212715148926, "learning_rate": 0.00017802002224694106, "loss": 3.6073, "step": 500 }, { "epoch": 6.682274247491639, "grad_norm": 0.9564261436462402, "learning_rate": 0.00017797552836484984, "loss": 3.4406, "step": 501 }, { "epoch": 6.695652173913043, "grad_norm": 0.9754629135131836, "learning_rate": 0.00017793103448275862, "loss": 3.7025, "step": 502 }, { "epoch": 6.709030100334449, "grad_norm": 0.9669683575630188, "learning_rate": 0.0001778865406006674, "loss": 3.5872, "step": 503 }, { "epoch": 6.722408026755852, "grad_norm": 0.959619402885437, "learning_rate": 0.00017784204671857621, "loss": 3.5124, "step": 504 }, { "epoch": 6.735785953177258, "grad_norm": 0.981737494468689, "learning_rate": 0.00017779755283648497, "loss": 3.4038, "step": 505 }, { "epoch": 6.749163879598662, "grad_norm": 0.9372640252113342, "learning_rate": 0.00017775305895439378, "loss": 3.6725, "step": 506 }, { "epoch": 6.762541806020067, "grad_norm": 0.947066605091095, "learning_rate": 0.00017770856507230256, "loss": 3.807, "step": 507 }, { "epoch": 6.775919732441472, "grad_norm": 0.7659755349159241, "learning_rate": 0.00017766407119021134, "loss": 3.0411, "step": 508 }, { "epoch": 6.789297658862877, "grad_norm": 1.0432168245315552, "learning_rate": 0.00017761957730812013, "loss": 3.6259, "step": 509 }, { "epoch": 6.802675585284281, "grad_norm": 1.0104693174362183, "learning_rate": 0.00017757508342602894, "loss": 3.5971, "step": 510 }, { "epoch": 6.816053511705686, "grad_norm": 0.9007440805435181, "learning_rate": 0.00017753058954393772, "loss": 3.5894, "step": 511 }, { "epoch": 6.829431438127091, "grad_norm": 0.8829946517944336, "learning_rate": 0.0001774860956618465, "loss": 3.6094, "step": 512 }, { "epoch": 6.842809364548495, "grad_norm": 0.9823127388954163, "learning_rate": 0.00017744160177975528, "loss": 3.6568, "step": 513 }, { "epoch": 6.8561872909699, "grad_norm": 1.0247899293899536, "learning_rate": 0.0001773971078976641, "loss": 3.259, "step": 514 }, { "epoch": 6.869565217391305, "grad_norm": 0.9435336589813232, "learning_rate": 0.00017735261401557285, "loss": 3.4978, "step": 515 }, { "epoch": 6.882943143812709, "grad_norm": 1.0135424137115479, "learning_rate": 0.00017730812013348166, "loss": 3.4765, "step": 516 }, { "epoch": 6.896321070234114, "grad_norm": 1.1327738761901855, "learning_rate": 0.00017726362625139044, "loss": 3.6927, "step": 517 }, { "epoch": 6.909698996655519, "grad_norm": 0.9335159659385681, "learning_rate": 0.00017721913236929922, "loss": 3.7278, "step": 518 }, { "epoch": 6.923076923076923, "grad_norm": 0.8229056000709534, "learning_rate": 0.000177174638487208, "loss": 3.549, "step": 519 }, { "epoch": 6.936454849498328, "grad_norm": 0.8996124267578125, "learning_rate": 0.00017713014460511681, "loss": 3.5093, "step": 520 }, { "epoch": 6.949832775919733, "grad_norm": 0.9984102845191956, "learning_rate": 0.0001770856507230256, "loss": 3.4772, "step": 521 }, { "epoch": 6.963210702341137, "grad_norm": 0.9136049747467041, "learning_rate": 0.00017704115684093438, "loss": 3.6487, "step": 522 }, { "epoch": 6.976588628762542, "grad_norm": 0.9707899689674377, "learning_rate": 0.00017699666295884316, "loss": 3.7471, "step": 523 }, { "epoch": 6.989966555183947, "grad_norm": 0.919865071773529, "learning_rate": 0.00017695216907675197, "loss": 3.8013, "step": 524 }, { "epoch": 7.0, "grad_norm": 1.125541090965271, "learning_rate": 0.00017690767519466073, "loss": 3.4869, "step": 525 }, { "epoch": 7.013377926421405, "grad_norm": 0.8512810468673706, "learning_rate": 0.00017686318131256954, "loss": 3.3334, "step": 526 }, { "epoch": 7.026755852842809, "grad_norm": 0.8450623750686646, "learning_rate": 0.00017681868743047832, "loss": 3.5769, "step": 527 }, { "epoch": 7.040133779264214, "grad_norm": 0.8526298403739929, "learning_rate": 0.0001767741935483871, "loss": 3.4546, "step": 528 }, { "epoch": 7.053511705685619, "grad_norm": 0.75905442237854, "learning_rate": 0.00017672969966629588, "loss": 3.4228, "step": 529 }, { "epoch": 7.066889632107023, "grad_norm": 0.8442811965942383, "learning_rate": 0.0001766852057842047, "loss": 3.5766, "step": 530 }, { "epoch": 7.080267558528428, "grad_norm": 0.9584814310073853, "learning_rate": 0.00017664071190211347, "loss": 3.5312, "step": 531 }, { "epoch": 7.093645484949833, "grad_norm": 0.9741052985191345, "learning_rate": 0.00017659621802002226, "loss": 3.4877, "step": 532 }, { "epoch": 7.107023411371237, "grad_norm": 0.8638135194778442, "learning_rate": 0.00017655172413793104, "loss": 3.5701, "step": 533 }, { "epoch": 7.120401337792642, "grad_norm": 1.0128440856933594, "learning_rate": 0.00017650723025583985, "loss": 3.3629, "step": 534 }, { "epoch": 7.133779264214047, "grad_norm": 0.9763593673706055, "learning_rate": 0.0001764627363737486, "loss": 3.5651, "step": 535 }, { "epoch": 7.147157190635451, "grad_norm": 0.8706293702125549, "learning_rate": 0.00017641824249165741, "loss": 3.454, "step": 536 }, { "epoch": 7.160535117056856, "grad_norm": 0.9227468967437744, "learning_rate": 0.0001763737486095662, "loss": 3.5528, "step": 537 }, { "epoch": 7.173913043478261, "grad_norm": 0.7493206262588501, "learning_rate": 0.00017632925472747498, "loss": 3.2662, "step": 538 }, { "epoch": 7.187290969899665, "grad_norm": 0.8414123058319092, "learning_rate": 0.00017628476084538376, "loss": 3.3864, "step": 539 }, { "epoch": 7.20066889632107, "grad_norm": 0.8352764248847961, "learning_rate": 0.00017624026696329257, "loss": 3.2407, "step": 540 }, { "epoch": 7.214046822742475, "grad_norm": 0.7413480281829834, "learning_rate": 0.00017619577308120135, "loss": 3.3989, "step": 541 }, { "epoch": 7.2274247491638794, "grad_norm": 0.7661281228065491, "learning_rate": 0.00017615127919911014, "loss": 3.3792, "step": 542 }, { "epoch": 7.240802675585284, "grad_norm": 0.86900395154953, "learning_rate": 0.00017610678531701892, "loss": 3.2191, "step": 543 }, { "epoch": 7.254180602006689, "grad_norm": 0.8536344170570374, "learning_rate": 0.0001760622914349277, "loss": 3.3366, "step": 544 }, { "epoch": 7.2675585284280935, "grad_norm": 0.8729544878005981, "learning_rate": 0.00017601779755283648, "loss": 3.4976, "step": 545 }, { "epoch": 7.280936454849498, "grad_norm": 0.8263023495674133, "learning_rate": 0.00017597330367074526, "loss": 3.4102, "step": 546 }, { "epoch": 7.294314381270903, "grad_norm": 0.748373806476593, "learning_rate": 0.00017592880978865407, "loss": 3.332, "step": 547 }, { "epoch": 7.3076923076923075, "grad_norm": 0.7606791853904724, "learning_rate": 0.00017588431590656286, "loss": 3.361, "step": 548 }, { "epoch": 7.321070234113712, "grad_norm": 0.9155070781707764, "learning_rate": 0.00017583982202447164, "loss": 3.657, "step": 549 }, { "epoch": 7.334448160535117, "grad_norm": 0.7440597414970398, "learning_rate": 0.00017579532814238042, "loss": 3.335, "step": 550 }, { "epoch": 7.3478260869565215, "grad_norm": 0.8781002759933472, "learning_rate": 0.00017575083426028923, "loss": 3.5579, "step": 551 }, { "epoch": 7.361204013377926, "grad_norm": 0.7886701822280884, "learning_rate": 0.00017570634037819799, "loss": 3.4636, "step": 552 }, { "epoch": 7.374581939799331, "grad_norm": 0.8931376934051514, "learning_rate": 0.0001756618464961068, "loss": 3.4375, "step": 553 }, { "epoch": 7.3879598662207355, "grad_norm": 0.7599623799324036, "learning_rate": 0.00017561735261401558, "loss": 3.6551, "step": 554 }, { "epoch": 7.40133779264214, "grad_norm": 0.7692762613296509, "learning_rate": 0.00017557285873192436, "loss": 3.6373, "step": 555 }, { "epoch": 7.414715719063545, "grad_norm": 0.8861828446388245, "learning_rate": 0.00017552836484983314, "loss": 3.4791, "step": 556 }, { "epoch": 7.4280936454849495, "grad_norm": 0.9560372829437256, "learning_rate": 0.00017548387096774195, "loss": 3.7291, "step": 557 }, { "epoch": 7.441471571906354, "grad_norm": 0.8745344281196594, "learning_rate": 0.00017543937708565073, "loss": 3.3071, "step": 558 }, { "epoch": 7.454849498327759, "grad_norm": 0.8178285360336304, "learning_rate": 0.00017539488320355952, "loss": 3.4738, "step": 559 }, { "epoch": 7.468227424749164, "grad_norm": 0.8611259460449219, "learning_rate": 0.0001753503893214683, "loss": 3.25, "step": 560 }, { "epoch": 7.481605351170568, "grad_norm": 0.8623505234718323, "learning_rate": 0.0001753058954393771, "loss": 3.3701, "step": 561 }, { "epoch": 7.494983277591973, "grad_norm": 0.76930171251297, "learning_rate": 0.00017526140155728586, "loss": 3.332, "step": 562 }, { "epoch": 7.508361204013378, "grad_norm": 0.8986758589744568, "learning_rate": 0.00017521690767519467, "loss": 3.3927, "step": 563 }, { "epoch": 7.521739130434782, "grad_norm": 0.9844257831573486, "learning_rate": 0.00017517241379310346, "loss": 3.5664, "step": 564 }, { "epoch": 7.535117056856187, "grad_norm": 0.983921229839325, "learning_rate": 0.00017512791991101224, "loss": 3.3888, "step": 565 }, { "epoch": 7.548494983277592, "grad_norm": 0.8052308559417725, "learning_rate": 0.00017508342602892102, "loss": 3.4809, "step": 566 }, { "epoch": 7.561872909698996, "grad_norm": 0.7996425032615662, "learning_rate": 0.00017503893214682983, "loss": 3.4793, "step": 567 }, { "epoch": 7.575250836120401, "grad_norm": 0.8453391194343567, "learning_rate": 0.0001749944382647386, "loss": 3.3199, "step": 568 }, { "epoch": 7.588628762541806, "grad_norm": 0.8720147013664246, "learning_rate": 0.0001749499443826474, "loss": 3.5612, "step": 569 }, { "epoch": 7.602006688963211, "grad_norm": 0.9093672633171082, "learning_rate": 0.00017490545050055618, "loss": 3.0509, "step": 570 }, { "epoch": 7.615384615384615, "grad_norm": 0.8936579823493958, "learning_rate": 0.000174860956618465, "loss": 3.4408, "step": 571 }, { "epoch": 7.6287625418060205, "grad_norm": 0.7683162689208984, "learning_rate": 0.00017481646273637374, "loss": 3.3536, "step": 572 }, { "epoch": 7.642140468227424, "grad_norm": 0.7943581342697144, "learning_rate": 0.00017477196885428255, "loss": 3.4542, "step": 573 }, { "epoch": 7.65551839464883, "grad_norm": 0.8183353543281555, "learning_rate": 0.00017472747497219133, "loss": 3.603, "step": 574 }, { "epoch": 7.668896321070234, "grad_norm": 0.7816463708877563, "learning_rate": 0.00017468298109010012, "loss": 3.7388, "step": 575 }, { "epoch": 7.682274247491639, "grad_norm": 0.8167930245399475, "learning_rate": 0.0001746384872080089, "loss": 3.7743, "step": 576 }, { "epoch": 7.695652173913043, "grad_norm": 0.832392156124115, "learning_rate": 0.0001745939933259177, "loss": 3.7488, "step": 577 }, { "epoch": 7.709030100334449, "grad_norm": 0.9362333416938782, "learning_rate": 0.0001745494994438265, "loss": 3.5722, "step": 578 }, { "epoch": 7.722408026755852, "grad_norm": 1.0247248411178589, "learning_rate": 0.00017450500556173527, "loss": 3.2048, "step": 579 }, { "epoch": 7.735785953177258, "grad_norm": 0.8833767175674438, "learning_rate": 0.00017446051167964406, "loss": 3.389, "step": 580 }, { "epoch": 7.749163879598662, "grad_norm": 0.8344758749008179, "learning_rate": 0.00017441601779755287, "loss": 3.5264, "step": 581 }, { "epoch": 7.762541806020067, "grad_norm": 0.9771448373794556, "learning_rate": 0.00017437152391546162, "loss": 3.041, "step": 582 }, { "epoch": 7.775919732441472, "grad_norm": 0.8279567956924438, "learning_rate": 0.00017432703003337043, "loss": 3.6273, "step": 583 }, { "epoch": 7.789297658862877, "grad_norm": 0.957206130027771, "learning_rate": 0.0001742825361512792, "loss": 3.3831, "step": 584 }, { "epoch": 7.802675585284281, "grad_norm": 0.860619843006134, "learning_rate": 0.000174238042269188, "loss": 3.4566, "step": 585 }, { "epoch": 7.816053511705686, "grad_norm": 0.8725448846817017, "learning_rate": 0.00017419354838709678, "loss": 3.6594, "step": 586 }, { "epoch": 7.829431438127091, "grad_norm": 0.8343111276626587, "learning_rate": 0.00017414905450500556, "loss": 3.3423, "step": 587 }, { "epoch": 7.842809364548495, "grad_norm": 0.9043267965316772, "learning_rate": 0.00017410456062291437, "loss": 3.221, "step": 588 }, { "epoch": 7.8561872909699, "grad_norm": 0.9563114643096924, "learning_rate": 0.00017406006674082312, "loss": 3.5143, "step": 589 }, { "epoch": 7.869565217391305, "grad_norm": 0.9726302027702332, "learning_rate": 0.00017401557285873193, "loss": 3.4373, "step": 590 }, { "epoch": 7.882943143812709, "grad_norm": 0.9203178882598877, "learning_rate": 0.00017397107897664072, "loss": 3.6014, "step": 591 }, { "epoch": 7.896321070234114, "grad_norm": 0.9120233654975891, "learning_rate": 0.0001739265850945495, "loss": 3.1429, "step": 592 }, { "epoch": 7.909698996655519, "grad_norm": 0.7576518058776855, "learning_rate": 0.00017388209121245828, "loss": 3.2065, "step": 593 }, { "epoch": 7.923076923076923, "grad_norm": 0.9629240036010742, "learning_rate": 0.0001738375973303671, "loss": 3.4788, "step": 594 }, { "epoch": 7.936454849498328, "grad_norm": 0.8390881419181824, "learning_rate": 0.00017379310344827587, "loss": 3.2857, "step": 595 }, { "epoch": 7.949832775919733, "grad_norm": 0.8708979487419128, "learning_rate": 0.00017374860956618466, "loss": 3.3321, "step": 596 }, { "epoch": 7.963210702341137, "grad_norm": 0.7076835632324219, "learning_rate": 0.00017370411568409344, "loss": 3.5905, "step": 597 }, { "epoch": 7.976588628762542, "grad_norm": 1.016526222229004, "learning_rate": 0.00017365962180200225, "loss": 3.4053, "step": 598 }, { "epoch": 7.989966555183947, "grad_norm": 0.7592278718948364, "learning_rate": 0.000173615127919911, "loss": 3.3968, "step": 599 }, { "epoch": 8.0, "grad_norm": 1.0106462240219116, "learning_rate": 0.0001735706340378198, "loss": 3.3722, "step": 600 }, { "epoch": 8.013377926421406, "grad_norm": 0.740808367729187, "learning_rate": 0.0001735261401557286, "loss": 3.394, "step": 601 }, { "epoch": 8.02675585284281, "grad_norm": 0.6732498407363892, "learning_rate": 0.00017348164627363738, "loss": 3.297, "step": 602 }, { "epoch": 8.040133779264215, "grad_norm": 0.8319197297096252, "learning_rate": 0.00017343715239154616, "loss": 3.2898, "step": 603 }, { "epoch": 8.053511705685619, "grad_norm": 0.7834349870681763, "learning_rate": 0.00017339265850945497, "loss": 3.2341, "step": 604 }, { "epoch": 8.066889632107024, "grad_norm": 0.705737292766571, "learning_rate": 0.00017334816462736375, "loss": 3.3429, "step": 605 }, { "epoch": 8.080267558528428, "grad_norm": 0.8270958065986633, "learning_rate": 0.00017330367074527253, "loss": 3.0458, "step": 606 }, { "epoch": 8.093645484949834, "grad_norm": 0.7254801392555237, "learning_rate": 0.00017325917686318132, "loss": 3.5143, "step": 607 }, { "epoch": 8.107023411371237, "grad_norm": 0.8450751900672913, "learning_rate": 0.00017321468298109013, "loss": 3.1507, "step": 608 }, { "epoch": 8.120401337792643, "grad_norm": 0.7936837673187256, "learning_rate": 0.00017317018909899888, "loss": 3.3979, "step": 609 }, { "epoch": 8.133779264214047, "grad_norm": 0.6496401429176331, "learning_rate": 0.0001731256952169077, "loss": 3.3613, "step": 610 }, { "epoch": 8.147157190635452, "grad_norm": 0.8721235990524292, "learning_rate": 0.00017308120133481647, "loss": 3.4299, "step": 611 }, { "epoch": 8.160535117056856, "grad_norm": 0.7671874761581421, "learning_rate": 0.00017303670745272525, "loss": 3.3333, "step": 612 }, { "epoch": 8.173913043478262, "grad_norm": 0.6427144408226013, "learning_rate": 0.00017299221357063404, "loss": 3.3304, "step": 613 }, { "epoch": 8.187290969899665, "grad_norm": 0.7999966144561768, "learning_rate": 0.00017294771968854285, "loss": 3.4016, "step": 614 }, { "epoch": 8.200668896321071, "grad_norm": 0.8216206431388855, "learning_rate": 0.00017290322580645163, "loss": 3.1724, "step": 615 }, { "epoch": 8.214046822742475, "grad_norm": 0.7364024519920349, "learning_rate": 0.0001728587319243604, "loss": 3.34, "step": 616 }, { "epoch": 8.22742474916388, "grad_norm": 0.7688239812850952, "learning_rate": 0.0001728142380422692, "loss": 3.2114, "step": 617 }, { "epoch": 8.240802675585284, "grad_norm": 0.8786870837211609, "learning_rate": 0.000172769744160178, "loss": 3.378, "step": 618 }, { "epoch": 8.25418060200669, "grad_norm": 0.9048855900764465, "learning_rate": 0.00017272525027808676, "loss": 3.1801, "step": 619 }, { "epoch": 8.267558528428093, "grad_norm": 0.657189130783081, "learning_rate": 0.00017268075639599557, "loss": 3.1389, "step": 620 }, { "epoch": 8.280936454849499, "grad_norm": 0.8015987873077393, "learning_rate": 0.00017263626251390435, "loss": 3.4621, "step": 621 }, { "epoch": 8.294314381270903, "grad_norm": 0.8232793807983398, "learning_rate": 0.00017259176863181313, "loss": 3.1763, "step": 622 }, { "epoch": 8.307692307692308, "grad_norm": 0.7447130680084229, "learning_rate": 0.00017254727474972192, "loss": 3.1266, "step": 623 }, { "epoch": 8.321070234113712, "grad_norm": 0.7649840116500854, "learning_rate": 0.00017250278086763072, "loss": 3.1959, "step": 624 }, { "epoch": 8.334448160535118, "grad_norm": 0.7119699120521545, "learning_rate": 0.0001724582869855395, "loss": 3.5626, "step": 625 }, { "epoch": 8.347826086956522, "grad_norm": 0.8238518834114075, "learning_rate": 0.00017241379310344826, "loss": 3.1873, "step": 626 }, { "epoch": 8.361204013377927, "grad_norm": 0.8248497843742371, "learning_rate": 0.00017236929922135707, "loss": 3.5686, "step": 627 }, { "epoch": 8.37458193979933, "grad_norm": 0.8704475164413452, "learning_rate": 0.00017232480533926585, "loss": 3.3195, "step": 628 }, { "epoch": 8.387959866220736, "grad_norm": 0.8160929083824158, "learning_rate": 0.00017228031145717464, "loss": 3.3808, "step": 629 }, { "epoch": 8.40133779264214, "grad_norm": 0.8537085652351379, "learning_rate": 0.00017223581757508342, "loss": 3.3638, "step": 630 }, { "epoch": 8.414715719063546, "grad_norm": 0.876519501209259, "learning_rate": 0.00017219132369299223, "loss": 3.2019, "step": 631 }, { "epoch": 8.42809364548495, "grad_norm": 0.6573703289031982, "learning_rate": 0.000172146829810901, "loss": 3.4998, "step": 632 }, { "epoch": 8.441471571906355, "grad_norm": 0.8822937607765198, "learning_rate": 0.0001721023359288098, "loss": 3.4281, "step": 633 }, { "epoch": 8.454849498327759, "grad_norm": 0.764872670173645, "learning_rate": 0.00017205784204671858, "loss": 3.3693, "step": 634 }, { "epoch": 8.468227424749164, "grad_norm": 0.7492384910583496, "learning_rate": 0.00017201334816462739, "loss": 3.5672, "step": 635 }, { "epoch": 8.481605351170568, "grad_norm": 0.8037416934967041, "learning_rate": 0.00017196885428253614, "loss": 3.3804, "step": 636 }, { "epoch": 8.494983277591974, "grad_norm": 0.8380945324897766, "learning_rate": 0.00017192436040044495, "loss": 3.2272, "step": 637 }, { "epoch": 8.508361204013378, "grad_norm": 0.8467932939529419, "learning_rate": 0.00017187986651835373, "loss": 3.2649, "step": 638 }, { "epoch": 8.521739130434783, "grad_norm": 0.751542866230011, "learning_rate": 0.00017183537263626252, "loss": 3.4135, "step": 639 }, { "epoch": 8.535117056856187, "grad_norm": 0.7618190050125122, "learning_rate": 0.0001717908787541713, "loss": 3.483, "step": 640 }, { "epoch": 8.548494983277592, "grad_norm": 0.9661890864372253, "learning_rate": 0.0001717463848720801, "loss": 3.2201, "step": 641 }, { "epoch": 8.561872909698996, "grad_norm": 0.8655393719673157, "learning_rate": 0.0001717018909899889, "loss": 3.4142, "step": 642 }, { "epoch": 8.575250836120402, "grad_norm": 0.796047031879425, "learning_rate": 0.00017165739710789767, "loss": 3.3558, "step": 643 }, { "epoch": 8.588628762541806, "grad_norm": 1.0098161697387695, "learning_rate": 0.00017161290322580645, "loss": 3.553, "step": 644 }, { "epoch": 8.602006688963211, "grad_norm": 1.1880302429199219, "learning_rate": 0.00017156840934371526, "loss": 3.3581, "step": 645 }, { "epoch": 8.615384615384615, "grad_norm": 0.9361609220504761, "learning_rate": 0.00017152391546162402, "loss": 3.4543, "step": 646 }, { "epoch": 8.62876254180602, "grad_norm": 0.8794479966163635, "learning_rate": 0.00017147942157953283, "loss": 3.3654, "step": 647 }, { "epoch": 8.642140468227424, "grad_norm": 0.9263080954551697, "learning_rate": 0.0001714349276974416, "loss": 3.4376, "step": 648 }, { "epoch": 8.65551839464883, "grad_norm": 1.0015815496444702, "learning_rate": 0.0001713904338153504, "loss": 3.3533, "step": 649 }, { "epoch": 8.668896321070234, "grad_norm": 0.8525484204292297, "learning_rate": 0.00017134593993325918, "loss": 3.3897, "step": 650 }, { "epoch": 8.68227424749164, "grad_norm": 0.7196484804153442, "learning_rate": 0.00017130144605116799, "loss": 3.2428, "step": 651 }, { "epoch": 8.695652173913043, "grad_norm": 0.8779593706130981, "learning_rate": 0.00017125695216907677, "loss": 3.5471, "step": 652 }, { "epoch": 8.709030100334449, "grad_norm": 0.9256909489631653, "learning_rate": 0.00017121245828698555, "loss": 3.1776, "step": 653 }, { "epoch": 8.722408026755852, "grad_norm": 0.7774620652198792, "learning_rate": 0.00017116796440489433, "loss": 3.4836, "step": 654 }, { "epoch": 8.735785953177258, "grad_norm": 0.8112596273422241, "learning_rate": 0.00017112347052280314, "loss": 3.5194, "step": 655 }, { "epoch": 8.749163879598662, "grad_norm": 0.7350602746009827, "learning_rate": 0.0001710789766407119, "loss": 3.2248, "step": 656 }, { "epoch": 8.762541806020067, "grad_norm": 0.8231781125068665, "learning_rate": 0.0001710344827586207, "loss": 3.4659, "step": 657 }, { "epoch": 8.775919732441471, "grad_norm": 0.8921564221382141, "learning_rate": 0.0001709899888765295, "loss": 3.2712, "step": 658 }, { "epoch": 8.789297658862877, "grad_norm": 0.8921830058097839, "learning_rate": 0.00017094549499443827, "loss": 3.5071, "step": 659 }, { "epoch": 8.80267558528428, "grad_norm": 0.7809077501296997, "learning_rate": 0.00017090100111234705, "loss": 3.6639, "step": 660 }, { "epoch": 8.816053511705686, "grad_norm": 0.9431234002113342, "learning_rate": 0.00017085650723025586, "loss": 3.2795, "step": 661 }, { "epoch": 8.82943143812709, "grad_norm": 0.9707314968109131, "learning_rate": 0.00017081201334816465, "loss": 3.3395, "step": 662 }, { "epoch": 8.842809364548495, "grad_norm": 0.7547470331192017, "learning_rate": 0.00017076751946607343, "loss": 3.5316, "step": 663 }, { "epoch": 8.856187290969899, "grad_norm": 0.8989250659942627, "learning_rate": 0.0001707230255839822, "loss": 3.4029, "step": 664 }, { "epoch": 8.869565217391305, "grad_norm": 1.0237400531768799, "learning_rate": 0.00017067853170189102, "loss": 3.5014, "step": 665 }, { "epoch": 8.882943143812708, "grad_norm": 0.7289263010025024, "learning_rate": 0.00017063403781979978, "loss": 3.4211, "step": 666 }, { "epoch": 8.896321070234114, "grad_norm": 0.7978695034980774, "learning_rate": 0.00017058954393770856, "loss": 3.5421, "step": 667 }, { "epoch": 8.909698996655518, "grad_norm": 0.7401835918426514, "learning_rate": 0.00017054505005561737, "loss": 3.3419, "step": 668 }, { "epoch": 8.923076923076923, "grad_norm": 0.8952983617782593, "learning_rate": 0.00017050055617352615, "loss": 3.1322, "step": 669 }, { "epoch": 8.936454849498327, "grad_norm": 0.6922047734260559, "learning_rate": 0.00017045606229143493, "loss": 3.5872, "step": 670 }, { "epoch": 8.949832775919733, "grad_norm": 0.8618977665901184, "learning_rate": 0.00017041156840934371, "loss": 3.1366, "step": 671 }, { "epoch": 8.963210702341136, "grad_norm": 0.7933799624443054, "learning_rate": 0.00017036707452725252, "loss": 3.3108, "step": 672 }, { "epoch": 8.976588628762542, "grad_norm": 0.718401312828064, "learning_rate": 0.00017032258064516128, "loss": 3.3771, "step": 673 }, { "epoch": 8.989966555183946, "grad_norm": 0.8096804618835449, "learning_rate": 0.0001702780867630701, "loss": 3.3225, "step": 674 }, { "epoch": 9.0, "grad_norm": 1.0055694580078125, "learning_rate": 0.00017023359288097887, "loss": 3.5545, "step": 675 }, { "epoch": 9.013377926421406, "grad_norm": 0.710986852645874, "learning_rate": 0.00017018909899888765, "loss": 3.3333, "step": 676 }, { "epoch": 9.02675585284281, "grad_norm": 0.672132134437561, "learning_rate": 0.00017014460511679644, "loss": 2.9995, "step": 677 }, { "epoch": 9.040133779264215, "grad_norm": 0.6752933263778687, "learning_rate": 0.00017010011123470525, "loss": 3.3571, "step": 678 }, { "epoch": 9.053511705685619, "grad_norm": 0.6553521156311035, "learning_rate": 0.00017005561735261403, "loss": 3.3407, "step": 679 }, { "epoch": 9.066889632107024, "grad_norm": 0.7492311596870422, "learning_rate": 0.0001700111234705228, "loss": 3.325, "step": 680 }, { "epoch": 9.080267558528428, "grad_norm": 0.736139714717865, "learning_rate": 0.0001699666295884316, "loss": 3.2626, "step": 681 }, { "epoch": 9.093645484949834, "grad_norm": 0.7131486535072327, "learning_rate": 0.0001699221357063404, "loss": 3.2612, "step": 682 }, { "epoch": 9.107023411371237, "grad_norm": 0.7037603855133057, "learning_rate": 0.00016987764182424916, "loss": 3.2418, "step": 683 }, { "epoch": 9.120401337792643, "grad_norm": 0.685518205165863, "learning_rate": 0.00016983314794215797, "loss": 3.4854, "step": 684 }, { "epoch": 9.133779264214047, "grad_norm": 0.6050254106521606, "learning_rate": 0.00016978865406006675, "loss": 3.2312, "step": 685 }, { "epoch": 9.147157190635452, "grad_norm": 0.6932830810546875, "learning_rate": 0.00016974416017797553, "loss": 3.4634, "step": 686 }, { "epoch": 9.160535117056856, "grad_norm": 0.7055158615112305, "learning_rate": 0.00016969966629588431, "loss": 3.1408, "step": 687 }, { "epoch": 9.173913043478262, "grad_norm": 0.6887643337249756, "learning_rate": 0.00016965517241379312, "loss": 3.0697, "step": 688 }, { "epoch": 9.187290969899665, "grad_norm": 0.7201237082481384, "learning_rate": 0.0001696106785317019, "loss": 3.303, "step": 689 }, { "epoch": 9.200668896321071, "grad_norm": 0.6617894768714905, "learning_rate": 0.0001695661846496107, "loss": 3.4846, "step": 690 }, { "epoch": 9.214046822742475, "grad_norm": 0.8979818224906921, "learning_rate": 0.00016952169076751947, "loss": 3.1898, "step": 691 }, { "epoch": 9.22742474916388, "grad_norm": 0.9507981538772583, "learning_rate": 0.00016947719688542828, "loss": 3.4748, "step": 692 }, { "epoch": 9.240802675585284, "grad_norm": 0.7935391068458557, "learning_rate": 0.00016943270300333704, "loss": 3.3661, "step": 693 }, { "epoch": 9.25418060200669, "grad_norm": 0.7437114715576172, "learning_rate": 0.00016938820912124584, "loss": 3.4407, "step": 694 }, { "epoch": 9.267558528428093, "grad_norm": 0.680610179901123, "learning_rate": 0.00016934371523915463, "loss": 3.3135, "step": 695 }, { "epoch": 9.280936454849499, "grad_norm": 0.846716582775116, "learning_rate": 0.0001692992213570634, "loss": 3.3923, "step": 696 }, { "epoch": 9.294314381270903, "grad_norm": 0.9567786455154419, "learning_rate": 0.0001692547274749722, "loss": 3.1405, "step": 697 }, { "epoch": 9.307692307692308, "grad_norm": 0.7509344816207886, "learning_rate": 0.000169210233592881, "loss": 3.4031, "step": 698 }, { "epoch": 9.321070234113712, "grad_norm": 0.8118243217468262, "learning_rate": 0.00016916573971078978, "loss": 3.317, "step": 699 }, { "epoch": 9.334448160535118, "grad_norm": 0.7445617318153381, "learning_rate": 0.00016912124582869857, "loss": 3.3989, "step": 700 }, { "epoch": 9.347826086956522, "grad_norm": 0.7520869970321655, "learning_rate": 0.00016907675194660735, "loss": 3.08, "step": 701 }, { "epoch": 9.361204013377927, "grad_norm": 0.7466426491737366, "learning_rate": 0.00016903225806451616, "loss": 3.3338, "step": 702 }, { "epoch": 9.37458193979933, "grad_norm": 0.7595514059066772, "learning_rate": 0.0001689877641824249, "loss": 3.08, "step": 703 }, { "epoch": 9.387959866220736, "grad_norm": 0.713771641254425, "learning_rate": 0.00016894327030033372, "loss": 3.236, "step": 704 }, { "epoch": 9.40133779264214, "grad_norm": 0.670863687992096, "learning_rate": 0.0001688987764182425, "loss": 3.4491, "step": 705 }, { "epoch": 9.414715719063546, "grad_norm": 0.8842789530754089, "learning_rate": 0.0001688542825361513, "loss": 3.3444, "step": 706 }, { "epoch": 9.42809364548495, "grad_norm": 0.8298172950744629, "learning_rate": 0.00016880978865406007, "loss": 3.3818, "step": 707 }, { "epoch": 9.441471571906355, "grad_norm": 0.7407504320144653, "learning_rate": 0.00016876529477196885, "loss": 3.2294, "step": 708 }, { "epoch": 9.454849498327759, "grad_norm": 0.6642070412635803, "learning_rate": 0.00016872080088987766, "loss": 3.3009, "step": 709 }, { "epoch": 9.468227424749164, "grad_norm": 0.7627503275871277, "learning_rate": 0.00016867630700778642, "loss": 3.3486, "step": 710 }, { "epoch": 9.481605351170568, "grad_norm": 0.7307603359222412, "learning_rate": 0.00016863181312569523, "loss": 3.0257, "step": 711 }, { "epoch": 9.494983277591974, "grad_norm": 0.7932866215705872, "learning_rate": 0.000168587319243604, "loss": 3.4191, "step": 712 }, { "epoch": 9.508361204013378, "grad_norm": 0.7457575798034668, "learning_rate": 0.0001685428253615128, "loss": 3.2283, "step": 713 }, { "epoch": 9.521739130434783, "grad_norm": 0.6718200445175171, "learning_rate": 0.00016849833147942157, "loss": 3.4569, "step": 714 }, { "epoch": 9.535117056856187, "grad_norm": 0.8189072608947754, "learning_rate": 0.00016845383759733038, "loss": 3.2585, "step": 715 }, { "epoch": 9.548494983277592, "grad_norm": 0.6895336508750916, "learning_rate": 0.00016840934371523917, "loss": 3.2417, "step": 716 }, { "epoch": 9.561872909698996, "grad_norm": 0.723173975944519, "learning_rate": 0.00016836484983314795, "loss": 3.3243, "step": 717 }, { "epoch": 9.575250836120402, "grad_norm": 0.8354344964027405, "learning_rate": 0.00016832035595105673, "loss": 3.3585, "step": 718 }, { "epoch": 9.588628762541806, "grad_norm": 0.6736294031143188, "learning_rate": 0.00016827586206896554, "loss": 3.188, "step": 719 }, { "epoch": 9.602006688963211, "grad_norm": 0.7790263295173645, "learning_rate": 0.0001682313681868743, "loss": 3.3171, "step": 720 }, { "epoch": 9.615384615384615, "grad_norm": 0.6426937580108643, "learning_rate": 0.0001681868743047831, "loss": 3.0854, "step": 721 }, { "epoch": 9.62876254180602, "grad_norm": 0.7029106020927429, "learning_rate": 0.0001681423804226919, "loss": 3.3629, "step": 722 }, { "epoch": 9.642140468227424, "grad_norm": 0.8353022933006287, "learning_rate": 0.00016809788654060067, "loss": 3.3715, "step": 723 }, { "epoch": 9.65551839464883, "grad_norm": 0.8578335642814636, "learning_rate": 0.00016805339265850945, "loss": 3.3554, "step": 724 }, { "epoch": 9.668896321070234, "grad_norm": 0.6998556852340698, "learning_rate": 0.00016800889877641826, "loss": 3.3043, "step": 725 }, { "epoch": 9.68227424749164, "grad_norm": 0.7134855389595032, "learning_rate": 0.00016796440489432704, "loss": 3.5856, "step": 726 }, { "epoch": 9.695652173913043, "grad_norm": 0.6636050939559937, "learning_rate": 0.00016791991101223583, "loss": 3.2156, "step": 727 }, { "epoch": 9.709030100334449, "grad_norm": 0.7757130861282349, "learning_rate": 0.0001678754171301446, "loss": 3.4974, "step": 728 }, { "epoch": 9.722408026755852, "grad_norm": 0.770648181438446, "learning_rate": 0.00016783092324805342, "loss": 3.3251, "step": 729 }, { "epoch": 9.735785953177258, "grad_norm": 0.7728201746940613, "learning_rate": 0.00016778642936596217, "loss": 3.2666, "step": 730 }, { "epoch": 9.749163879598662, "grad_norm": 0.8277239203453064, "learning_rate": 0.00016774193548387098, "loss": 3.3867, "step": 731 }, { "epoch": 9.762541806020067, "grad_norm": 0.6534886360168457, "learning_rate": 0.00016769744160177977, "loss": 3.175, "step": 732 }, { "epoch": 9.775919732441471, "grad_norm": 0.8508428335189819, "learning_rate": 0.00016765294771968855, "loss": 3.2084, "step": 733 }, { "epoch": 9.789297658862877, "grad_norm": 0.7656721472740173, "learning_rate": 0.00016760845383759733, "loss": 3.1426, "step": 734 }, { "epoch": 9.80267558528428, "grad_norm": 0.9495553970336914, "learning_rate": 0.00016756395995550614, "loss": 3.1623, "step": 735 }, { "epoch": 9.816053511705686, "grad_norm": 0.7998641729354858, "learning_rate": 0.00016751946607341492, "loss": 3.3893, "step": 736 }, { "epoch": 9.82943143812709, "grad_norm": 0.8124551177024841, "learning_rate": 0.0001674749721913237, "loss": 3.2012, "step": 737 }, { "epoch": 9.842809364548495, "grad_norm": 0.6332049369812012, "learning_rate": 0.0001674304783092325, "loss": 3.3384, "step": 738 }, { "epoch": 9.856187290969899, "grad_norm": 0.7114555835723877, "learning_rate": 0.0001673859844271413, "loss": 3.0802, "step": 739 }, { "epoch": 9.869565217391305, "grad_norm": 0.7175182700157166, "learning_rate": 0.00016734149054505005, "loss": 3.2572, "step": 740 }, { "epoch": 9.882943143812708, "grad_norm": 0.7724816799163818, "learning_rate": 0.00016729699666295886, "loss": 3.1078, "step": 741 }, { "epoch": 9.896321070234114, "grad_norm": 0.7834901213645935, "learning_rate": 0.00016725250278086764, "loss": 3.2513, "step": 742 }, { "epoch": 9.909698996655518, "grad_norm": 0.663495659828186, "learning_rate": 0.00016720800889877643, "loss": 3.327, "step": 743 }, { "epoch": 9.923076923076923, "grad_norm": 0.7828975319862366, "learning_rate": 0.0001671635150166852, "loss": 3.609, "step": 744 }, { "epoch": 9.936454849498327, "grad_norm": 0.6747825145721436, "learning_rate": 0.00016711902113459402, "loss": 3.4479, "step": 745 }, { "epoch": 9.949832775919733, "grad_norm": 0.7816379070281982, "learning_rate": 0.0001670745272525028, "loss": 3.2369, "step": 746 }, { "epoch": 9.963210702341136, "grad_norm": 0.7011098265647888, "learning_rate": 0.00016703003337041158, "loss": 2.9103, "step": 747 }, { "epoch": 9.976588628762542, "grad_norm": 0.7165176868438721, "learning_rate": 0.00016698553948832036, "loss": 3.1669, "step": 748 }, { "epoch": 9.989966555183946, "grad_norm": 0.766315758228302, "learning_rate": 0.00016694104560622915, "loss": 3.1138, "step": 749 }, { "epoch": 10.0, "grad_norm": 0.876315176486969, "learning_rate": 0.00016689655172413793, "loss": 3.4851, "step": 750 }, { "epoch": 10.013377926421406, "grad_norm": 0.807686984539032, "learning_rate": 0.0001668520578420467, "loss": 3.193, "step": 751 }, { "epoch": 10.02675585284281, "grad_norm": 0.7085704803466797, "learning_rate": 0.00016680756395995552, "loss": 3.4797, "step": 752 }, { "epoch": 10.040133779264215, "grad_norm": 0.7119605541229248, "learning_rate": 0.0001667630700778643, "loss": 3.1713, "step": 753 }, { "epoch": 10.053511705685619, "grad_norm": 0.6569423675537109, "learning_rate": 0.00016671857619577309, "loss": 3.1661, "step": 754 }, { "epoch": 10.066889632107024, "grad_norm": 0.8173550367355347, "learning_rate": 0.00016667408231368187, "loss": 2.8467, "step": 755 }, { "epoch": 10.080267558528428, "grad_norm": 0.7261365056037903, "learning_rate": 0.00016662958843159068, "loss": 3.3679, "step": 756 }, { "epoch": 10.093645484949834, "grad_norm": 0.7997227311134338, "learning_rate": 0.00016658509454949943, "loss": 3.0985, "step": 757 }, { "epoch": 10.107023411371237, "grad_norm": 0.653391420841217, "learning_rate": 0.00016654060066740824, "loss": 3.2156, "step": 758 }, { "epoch": 10.120401337792643, "grad_norm": 0.6799002289772034, "learning_rate": 0.00016649610678531703, "loss": 3.3302, "step": 759 }, { "epoch": 10.133779264214047, "grad_norm": 0.6444498896598816, "learning_rate": 0.0001664516129032258, "loss": 3.2813, "step": 760 }, { "epoch": 10.147157190635452, "grad_norm": 1.064769983291626, "learning_rate": 0.0001664071190211346, "loss": 3.1852, "step": 761 }, { "epoch": 10.160535117056856, "grad_norm": 0.6534339189529419, "learning_rate": 0.0001663626251390434, "loss": 3.1563, "step": 762 }, { "epoch": 10.173913043478262, "grad_norm": 0.6909127235412598, "learning_rate": 0.00016631813125695218, "loss": 3.2728, "step": 763 }, { "epoch": 10.187290969899665, "grad_norm": 0.6549767851829529, "learning_rate": 0.00016627363737486096, "loss": 3.0491, "step": 764 }, { "epoch": 10.200668896321071, "grad_norm": 0.678054928779602, "learning_rate": 0.00016622914349276975, "loss": 3.4807, "step": 765 }, { "epoch": 10.214046822742475, "grad_norm": 0.613358199596405, "learning_rate": 0.00016618464961067856, "loss": 3.1746, "step": 766 }, { "epoch": 10.22742474916388, "grad_norm": 0.6624737977981567, "learning_rate": 0.0001661401557285873, "loss": 2.8528, "step": 767 }, { "epoch": 10.240802675585284, "grad_norm": 0.65067458152771, "learning_rate": 0.00016609566184649612, "loss": 3.1843, "step": 768 }, { "epoch": 10.25418060200669, "grad_norm": 0.6192435622215271, "learning_rate": 0.0001660511679644049, "loss": 3.3162, "step": 769 }, { "epoch": 10.267558528428093, "grad_norm": 0.6456341743469238, "learning_rate": 0.00016600667408231369, "loss": 3.2302, "step": 770 }, { "epoch": 10.280936454849499, "grad_norm": 2.357724189758301, "learning_rate": 0.00016596218020022247, "loss": 3.2741, "step": 771 }, { "epoch": 10.294314381270903, "grad_norm": 0.6833475828170776, "learning_rate": 0.00016591768631813128, "loss": 3.1516, "step": 772 }, { "epoch": 10.307692307692308, "grad_norm": 0.5557199716567993, "learning_rate": 0.00016587319243604006, "loss": 3.281, "step": 773 }, { "epoch": 10.321070234113712, "grad_norm": 0.6617905497550964, "learning_rate": 0.00016582869855394884, "loss": 3.375, "step": 774 }, { "epoch": 10.334448160535118, "grad_norm": 0.5671921372413635, "learning_rate": 0.00016578420467185762, "loss": 3.4335, "step": 775 }, { "epoch": 10.347826086956522, "grad_norm": 0.8487278819084167, "learning_rate": 0.00016573971078976643, "loss": 3.143, "step": 776 }, { "epoch": 10.361204013377927, "grad_norm": 0.6489982604980469, "learning_rate": 0.0001656952169076752, "loss": 3.3258, "step": 777 }, { "epoch": 10.37458193979933, "grad_norm": 0.8773537278175354, "learning_rate": 0.000165650723025584, "loss": 3.0466, "step": 778 }, { "epoch": 10.387959866220736, "grad_norm": 0.5961865782737732, "learning_rate": 0.00016560622914349278, "loss": 3.3417, "step": 779 }, { "epoch": 10.40133779264214, "grad_norm": 0.6149600148200989, "learning_rate": 0.00016556173526140156, "loss": 3.0622, "step": 780 }, { "epoch": 10.414715719063546, "grad_norm": 0.7591158151626587, "learning_rate": 0.00016551724137931035, "loss": 3.2078, "step": 781 }, { "epoch": 10.42809364548495, "grad_norm": 0.7915151119232178, "learning_rate": 0.00016547274749721916, "loss": 3.3082, "step": 782 }, { "epoch": 10.441471571906355, "grad_norm": 0.8709903359413147, "learning_rate": 0.00016542825361512794, "loss": 3.3073, "step": 783 }, { "epoch": 10.454849498327759, "grad_norm": 0.6593959331512451, "learning_rate": 0.00016538375973303672, "loss": 3.1574, "step": 784 }, { "epoch": 10.468227424749164, "grad_norm": 0.8101013898849487, "learning_rate": 0.0001653392658509455, "loss": 3.3631, "step": 785 }, { "epoch": 10.481605351170568, "grad_norm": 0.8200273513793945, "learning_rate": 0.0001652947719688543, "loss": 3.0447, "step": 786 }, { "epoch": 10.494983277591974, "grad_norm": 0.7304090857505798, "learning_rate": 0.00016525027808676307, "loss": 3.3382, "step": 787 }, { "epoch": 10.508361204013378, "grad_norm": 0.7059088349342346, "learning_rate": 0.00016520578420467188, "loss": 3.085, "step": 788 }, { "epoch": 10.521739130434783, "grad_norm": 0.6664522886276245, "learning_rate": 0.00016516129032258066, "loss": 3.3198, "step": 789 }, { "epoch": 10.535117056856187, "grad_norm": 0.9230799078941345, "learning_rate": 0.00016511679644048944, "loss": 3.2502, "step": 790 }, { "epoch": 10.548494983277592, "grad_norm": 0.6974027752876282, "learning_rate": 0.00016507230255839822, "loss": 3.2432, "step": 791 }, { "epoch": 10.561872909698996, "grad_norm": 0.7186788320541382, "learning_rate": 0.000165027808676307, "loss": 3.2273, "step": 792 }, { "epoch": 10.575250836120402, "grad_norm": 0.6168047189712524, "learning_rate": 0.00016498331479421582, "loss": 3.2319, "step": 793 }, { "epoch": 10.588628762541806, "grad_norm": 0.6219142079353333, "learning_rate": 0.00016493882091212457, "loss": 2.9733, "step": 794 }, { "epoch": 10.602006688963211, "grad_norm": 0.573359489440918, "learning_rate": 0.00016489432703003338, "loss": 3.214, "step": 795 }, { "epoch": 10.615384615384615, "grad_norm": 0.678263783454895, "learning_rate": 0.00016484983314794216, "loss": 3.2331, "step": 796 }, { "epoch": 10.62876254180602, "grad_norm": 0.6593761444091797, "learning_rate": 0.00016480533926585095, "loss": 3.3605, "step": 797 }, { "epoch": 10.642140468227424, "grad_norm": 0.8732627034187317, "learning_rate": 0.00016476084538375973, "loss": 3.3531, "step": 798 }, { "epoch": 10.65551839464883, "grad_norm": 0.7198925614356995, "learning_rate": 0.00016471635150166854, "loss": 3.2927, "step": 799 }, { "epoch": 10.668896321070234, "grad_norm": 0.7275107502937317, "learning_rate": 0.00016467185761957732, "loss": 3.3695, "step": 800 }, { "epoch": 10.68227424749164, "grad_norm": 0.7077828049659729, "learning_rate": 0.0001646273637374861, "loss": 2.846, "step": 801 }, { "epoch": 10.695652173913043, "grad_norm": 0.7579251527786255, "learning_rate": 0.00016458286985539488, "loss": 3.1917, "step": 802 }, { "epoch": 10.709030100334449, "grad_norm": 0.7607265114784241, "learning_rate": 0.0001645383759733037, "loss": 3.2919, "step": 803 }, { "epoch": 10.722408026755852, "grad_norm": 0.7122685313224792, "learning_rate": 0.00016449388209121245, "loss": 3.7108, "step": 804 }, { "epoch": 10.735785953177258, "grad_norm": 0.7256726622581482, "learning_rate": 0.00016444938820912126, "loss": 3.3209, "step": 805 }, { "epoch": 10.749163879598662, "grad_norm": 0.7903631925582886, "learning_rate": 0.00016440489432703004, "loss": 3.398, "step": 806 }, { "epoch": 10.762541806020067, "grad_norm": 1.78204345703125, "learning_rate": 0.00016436040044493882, "loss": 3.1118, "step": 807 }, { "epoch": 10.775919732441471, "grad_norm": 0.7647016644477844, "learning_rate": 0.0001643159065628476, "loss": 3.1889, "step": 808 }, { "epoch": 10.789297658862877, "grad_norm": 0.8039811253547668, "learning_rate": 0.00016427141268075642, "loss": 3.0104, "step": 809 }, { "epoch": 10.80267558528428, "grad_norm": 0.6011155843734741, "learning_rate": 0.0001642269187986652, "loss": 3.0988, "step": 810 }, { "epoch": 10.816053511705686, "grad_norm": 0.8137276768684387, "learning_rate": 0.00016418242491657398, "loss": 3.4308, "step": 811 }, { "epoch": 10.82943143812709, "grad_norm": 0.6501771807670593, "learning_rate": 0.00016413793103448276, "loss": 3.1405, "step": 812 }, { "epoch": 10.842809364548495, "grad_norm": 0.678032636642456, "learning_rate": 0.00016409343715239157, "loss": 3.2243, "step": 813 }, { "epoch": 10.856187290969899, "grad_norm": 0.6830305457115173, "learning_rate": 0.00016404894327030033, "loss": 3.2685, "step": 814 }, { "epoch": 10.869565217391305, "grad_norm": 0.7482068538665771, "learning_rate": 0.00016400444938820914, "loss": 3.3363, "step": 815 }, { "epoch": 10.882943143812708, "grad_norm": 0.6592227816581726, "learning_rate": 0.00016395995550611792, "loss": 3.2914, "step": 816 }, { "epoch": 10.896321070234114, "grad_norm": 0.7520759105682373, "learning_rate": 0.0001639154616240267, "loss": 3.1371, "step": 817 }, { "epoch": 10.909698996655518, "grad_norm": 0.6802201271057129, "learning_rate": 0.00016387096774193548, "loss": 3.2925, "step": 818 }, { "epoch": 10.923076923076923, "grad_norm": 0.7528939247131348, "learning_rate": 0.0001638264738598443, "loss": 3.2147, "step": 819 }, { "epoch": 10.936454849498327, "grad_norm": 0.7070727348327637, "learning_rate": 0.00016378197997775308, "loss": 3.2649, "step": 820 }, { "epoch": 10.949832775919733, "grad_norm": 0.6121620535850525, "learning_rate": 0.00016373748609566186, "loss": 3.3999, "step": 821 }, { "epoch": 10.963210702341136, "grad_norm": 0.7355679273605347, "learning_rate": 0.00016369299221357064, "loss": 3.2561, "step": 822 }, { "epoch": 10.976588628762542, "grad_norm": 0.7294445037841797, "learning_rate": 0.00016364849833147945, "loss": 2.7019, "step": 823 }, { "epoch": 10.989966555183946, "grad_norm": 0.8628729581832886, "learning_rate": 0.0001636040044493882, "loss": 3.3655, "step": 824 }, { "epoch": 11.0, "grad_norm": 0.8784325122833252, "learning_rate": 0.00016355951056729702, "loss": 3.504, "step": 825 }, { "epoch": 11.013377926421406, "grad_norm": 0.6880869269371033, "learning_rate": 0.0001635150166852058, "loss": 3.1878, "step": 826 }, { "epoch": 11.02675585284281, "grad_norm": 0.5625393390655518, "learning_rate": 0.00016347052280311458, "loss": 3.2757, "step": 827 }, { "epoch": 11.040133779264215, "grad_norm": 0.5854038596153259, "learning_rate": 0.00016342602892102336, "loss": 3.0994, "step": 828 }, { "epoch": 11.053511705685619, "grad_norm": 0.6682130098342896, "learning_rate": 0.00016338153503893217, "loss": 3.1935, "step": 829 }, { "epoch": 11.066889632107024, "grad_norm": 0.6216278076171875, "learning_rate": 0.00016333704115684095, "loss": 3.1273, "step": 830 }, { "epoch": 11.080267558528428, "grad_norm": 0.61285001039505, "learning_rate": 0.0001632925472747497, "loss": 3.2377, "step": 831 }, { "epoch": 11.093645484949834, "grad_norm": 0.6559188365936279, "learning_rate": 0.00016324805339265852, "loss": 3.1084, "step": 832 }, { "epoch": 11.107023411371237, "grad_norm": 0.6322848200798035, "learning_rate": 0.0001632035595105673, "loss": 3.3178, "step": 833 }, { "epoch": 11.120401337792643, "grad_norm": 0.6306194067001343, "learning_rate": 0.00016315906562847608, "loss": 3.2012, "step": 834 }, { "epoch": 11.133779264214047, "grad_norm": 0.6923161149024963, "learning_rate": 0.00016311457174638487, "loss": 2.9328, "step": 835 }, { "epoch": 11.147157190635452, "grad_norm": 0.6900002360343933, "learning_rate": 0.00016307007786429368, "loss": 3.1436, "step": 836 }, { "epoch": 11.160535117056856, "grad_norm": 0.817669153213501, "learning_rate": 0.00016302558398220246, "loss": 3.0935, "step": 837 }, { "epoch": 11.173913043478262, "grad_norm": 0.7544119954109192, "learning_rate": 0.00016298109010011124, "loss": 3.0646, "step": 838 }, { "epoch": 11.187290969899665, "grad_norm": 0.7996231913566589, "learning_rate": 0.00016293659621802002, "loss": 3.1975, "step": 839 }, { "epoch": 11.200668896321071, "grad_norm": 0.6186792850494385, "learning_rate": 0.00016289210233592883, "loss": 3.31, "step": 840 }, { "epoch": 11.214046822742475, "grad_norm": 0.6926666498184204, "learning_rate": 0.0001628476084538376, "loss": 3.0765, "step": 841 }, { "epoch": 11.22742474916388, "grad_norm": 0.7475588917732239, "learning_rate": 0.0001628031145717464, "loss": 3.1743, "step": 842 }, { "epoch": 11.240802675585284, "grad_norm": 0.5520989298820496, "learning_rate": 0.00016275862068965518, "loss": 3.4243, "step": 843 }, { "epoch": 11.25418060200669, "grad_norm": 0.6556730270385742, "learning_rate": 0.00016271412680756396, "loss": 3.3293, "step": 844 }, { "epoch": 11.267558528428093, "grad_norm": 0.6509199738502502, "learning_rate": 0.00016266963292547274, "loss": 2.999, "step": 845 }, { "epoch": 11.280936454849499, "grad_norm": 0.6254273653030396, "learning_rate": 0.00016262513904338155, "loss": 3.1869, "step": 846 }, { "epoch": 11.294314381270903, "grad_norm": 0.7454530000686646, "learning_rate": 0.00016258064516129034, "loss": 3.3694, "step": 847 }, { "epoch": 11.307692307692308, "grad_norm": 0.7563592791557312, "learning_rate": 0.00016253615127919912, "loss": 3.0057, "step": 848 }, { "epoch": 11.321070234113712, "grad_norm": 0.6986783742904663, "learning_rate": 0.0001624916573971079, "loss": 2.9893, "step": 849 }, { "epoch": 11.334448160535118, "grad_norm": 0.7260631322860718, "learning_rate": 0.0001624471635150167, "loss": 3.1733, "step": 850 }, { "epoch": 11.347826086956522, "grad_norm": 0.7522863745689392, "learning_rate": 0.00016240266963292547, "loss": 2.9829, "step": 851 }, { "epoch": 11.361204013377927, "grad_norm": 0.7290140986442566, "learning_rate": 0.00016235817575083428, "loss": 3.1887, "step": 852 }, { "epoch": 11.37458193979933, "grad_norm": 0.6470169425010681, "learning_rate": 0.00016231368186874306, "loss": 3.2537, "step": 853 }, { "epoch": 11.387959866220736, "grad_norm": 0.863742470741272, "learning_rate": 0.00016226918798665184, "loss": 3.3443, "step": 854 }, { "epoch": 11.40133779264214, "grad_norm": 0.7363939881324768, "learning_rate": 0.00016222469410456062, "loss": 3.3653, "step": 855 }, { "epoch": 11.414715719063546, "grad_norm": 0.6548926830291748, "learning_rate": 0.00016218020022246943, "loss": 3.0373, "step": 856 }, { "epoch": 11.42809364548495, "grad_norm": 0.8087872862815857, "learning_rate": 0.00016213570634037821, "loss": 3.0118, "step": 857 }, { "epoch": 11.441471571906355, "grad_norm": 0.677811324596405, "learning_rate": 0.000162091212458287, "loss": 3.0339, "step": 858 }, { "epoch": 11.454849498327759, "grad_norm": 0.6907945275306702, "learning_rate": 0.00016204671857619578, "loss": 2.9496, "step": 859 }, { "epoch": 11.468227424749164, "grad_norm": 0.6940027475357056, "learning_rate": 0.0001620022246941046, "loss": 3.2825, "step": 860 }, { "epoch": 11.481605351170568, "grad_norm": 0.7132136225700378, "learning_rate": 0.00016195773081201334, "loss": 3.1271, "step": 861 }, { "epoch": 11.494983277591974, "grad_norm": 0.5997372269630432, "learning_rate": 0.00016191323692992215, "loss": 3.1292, "step": 862 }, { "epoch": 11.508361204013378, "grad_norm": 0.6468494534492493, "learning_rate": 0.00016186874304783094, "loss": 3.32, "step": 863 }, { "epoch": 11.521739130434783, "grad_norm": 0.5792532563209534, "learning_rate": 0.00016182424916573972, "loss": 3.3657, "step": 864 }, { "epoch": 11.535117056856187, "grad_norm": 0.8242068290710449, "learning_rate": 0.0001617797552836485, "loss": 3.0094, "step": 865 }, { "epoch": 11.548494983277592, "grad_norm": 0.9260333776473999, "learning_rate": 0.0001617352614015573, "loss": 3.1123, "step": 866 }, { "epoch": 11.561872909698996, "grad_norm": 0.6337956786155701, "learning_rate": 0.0001616907675194661, "loss": 3.1761, "step": 867 }, { "epoch": 11.575250836120402, "grad_norm": 0.6010364294052124, "learning_rate": 0.00016164627363737487, "loss": 3.2987, "step": 868 }, { "epoch": 11.588628762541806, "grad_norm": 0.7492111921310425, "learning_rate": 0.00016160177975528366, "loss": 2.9085, "step": 869 }, { "epoch": 11.602006688963211, "grad_norm": 0.6329553127288818, "learning_rate": 0.00016155728587319247, "loss": 3.1841, "step": 870 }, { "epoch": 11.615384615384615, "grad_norm": 0.768527626991272, "learning_rate": 0.00016151279199110122, "loss": 3.0061, "step": 871 }, { "epoch": 11.62876254180602, "grad_norm": 0.6333640813827515, "learning_rate": 0.00016146829810901003, "loss": 3.3046, "step": 872 }, { "epoch": 11.642140468227424, "grad_norm": 0.7457571625709534, "learning_rate": 0.00016142380422691881, "loss": 3.2728, "step": 873 }, { "epoch": 11.65551839464883, "grad_norm": 0.6389586925506592, "learning_rate": 0.0001613793103448276, "loss": 3.419, "step": 874 }, { "epoch": 11.668896321070234, "grad_norm": 0.8885436058044434, "learning_rate": 0.00016133481646273638, "loss": 3.0938, "step": 875 }, { "epoch": 11.68227424749164, "grad_norm": 0.7936431765556335, "learning_rate": 0.00016129032258064516, "loss": 3.305, "step": 876 }, { "epoch": 11.695652173913043, "grad_norm": 0.6133994460105896, "learning_rate": 0.00016124582869855397, "loss": 3.3474, "step": 877 }, { "epoch": 11.709030100334449, "grad_norm": 0.6638192534446716, "learning_rate": 0.00016120133481646273, "loss": 3.1418, "step": 878 }, { "epoch": 11.722408026755852, "grad_norm": 0.6820496320724487, "learning_rate": 0.00016115684093437154, "loss": 3.1421, "step": 879 }, { "epoch": 11.735785953177258, "grad_norm": 0.6057732105255127, "learning_rate": 0.00016111234705228032, "loss": 3.091, "step": 880 }, { "epoch": 11.749163879598662, "grad_norm": 0.6267048716545105, "learning_rate": 0.0001610678531701891, "loss": 3.1289, "step": 881 }, { "epoch": 11.762541806020067, "grad_norm": 0.6822847723960876, "learning_rate": 0.00016102335928809788, "loss": 3.157, "step": 882 }, { "epoch": 11.775919732441471, "grad_norm": 0.6809714436531067, "learning_rate": 0.0001609788654060067, "loss": 3.0806, "step": 883 }, { "epoch": 11.789297658862877, "grad_norm": 0.5546092391014099, "learning_rate": 0.00016093437152391547, "loss": 3.1853, "step": 884 }, { "epoch": 11.80267558528428, "grad_norm": 0.7375029921531677, "learning_rate": 0.00016088987764182426, "loss": 3.1287, "step": 885 }, { "epoch": 11.816053511705686, "grad_norm": 0.6246840953826904, "learning_rate": 0.00016084538375973304, "loss": 3.1331, "step": 886 }, { "epoch": 11.82943143812709, "grad_norm": 0.6088026762008667, "learning_rate": 0.00016080088987764185, "loss": 3.3781, "step": 887 }, { "epoch": 11.842809364548495, "grad_norm": 0.7996237874031067, "learning_rate": 0.0001607563959955506, "loss": 3.0161, "step": 888 }, { "epoch": 11.856187290969899, "grad_norm": 0.6221441626548767, "learning_rate": 0.0001607119021134594, "loss": 3.1491, "step": 889 }, { "epoch": 11.869565217391305, "grad_norm": 0.6276041269302368, "learning_rate": 0.0001606674082313682, "loss": 3.2575, "step": 890 }, { "epoch": 11.882943143812708, "grad_norm": 0.6394500136375427, "learning_rate": 0.00016062291434927698, "loss": 3.3437, "step": 891 }, { "epoch": 11.896321070234114, "grad_norm": 0.7674509286880493, "learning_rate": 0.00016057842046718576, "loss": 3.1995, "step": 892 }, { "epoch": 11.909698996655518, "grad_norm": 0.7502215504646301, "learning_rate": 0.00016053392658509457, "loss": 3.3129, "step": 893 }, { "epoch": 11.923076923076923, "grad_norm": 0.6078189611434937, "learning_rate": 0.00016048943270300335, "loss": 3.1623, "step": 894 }, { "epoch": 11.936454849498327, "grad_norm": 0.6113708019256592, "learning_rate": 0.00016044493882091213, "loss": 3.6063, "step": 895 }, { "epoch": 11.949832775919733, "grad_norm": 0.6606878638267517, "learning_rate": 0.00016040044493882092, "loss": 3.3216, "step": 896 }, { "epoch": 11.963210702341136, "grad_norm": 0.7055956125259399, "learning_rate": 0.00016035595105672973, "loss": 3.2006, "step": 897 }, { "epoch": 11.976588628762542, "grad_norm": 0.7424116730690002, "learning_rate": 0.00016031145717463848, "loss": 3.1298, "step": 898 }, { "epoch": 11.989966555183946, "grad_norm": 0.6675695180892944, "learning_rate": 0.0001602669632925473, "loss": 3.1116, "step": 899 }, { "epoch": 12.0, "grad_norm": 0.9356181621551514, "learning_rate": 0.00016022246941045607, "loss": 3.2461, "step": 900 }, { "epoch": 12.013377926421406, "grad_norm": 0.8539507985115051, "learning_rate": 0.00016017797552836486, "loss": 3.0671, "step": 901 }, { "epoch": 12.02675585284281, "grad_norm": 0.573266327381134, "learning_rate": 0.00016013348164627364, "loss": 3.0789, "step": 902 }, { "epoch": 12.040133779264215, "grad_norm": 0.5849746465682983, "learning_rate": 0.00016008898776418245, "loss": 3.1623, "step": 903 }, { "epoch": 12.053511705685619, "grad_norm": 0.6523334980010986, "learning_rate": 0.00016004449388209123, "loss": 3.0243, "step": 904 }, { "epoch": 12.066889632107024, "grad_norm": 0.6428223848342896, "learning_rate": 0.00016, "loss": 3.1241, "step": 905 }, { "epoch": 12.080267558528428, "grad_norm": 0.5881937742233276, "learning_rate": 0.0001599555061179088, "loss": 3.1744, "step": 906 }, { "epoch": 12.093645484949834, "grad_norm": 0.7523583173751831, "learning_rate": 0.0001599110122358176, "loss": 3.0821, "step": 907 }, { "epoch": 12.107023411371237, "grad_norm": 0.6120390295982361, "learning_rate": 0.00015986651835372636, "loss": 3.16, "step": 908 }, { "epoch": 12.120401337792643, "grad_norm": 0.6486253142356873, "learning_rate": 0.00015982202447163517, "loss": 3.1402, "step": 909 }, { "epoch": 12.133779264214047, "grad_norm": 0.6322839260101318, "learning_rate": 0.00015977753058954395, "loss": 3.1325, "step": 910 }, { "epoch": 12.147157190635452, "grad_norm": 0.5858875513076782, "learning_rate": 0.00015973303670745273, "loss": 3.1705, "step": 911 }, { "epoch": 12.160535117056856, "grad_norm": 0.6495780348777771, "learning_rate": 0.00015968854282536152, "loss": 3.1977, "step": 912 }, { "epoch": 12.173913043478262, "grad_norm": 0.6483474969863892, "learning_rate": 0.00015964404894327033, "loss": 2.9101, "step": 913 }, { "epoch": 12.187290969899665, "grad_norm": 0.6021110415458679, "learning_rate": 0.0001595995550611791, "loss": 3.2106, "step": 914 }, { "epoch": 12.200668896321071, "grad_norm": 0.5101630687713623, "learning_rate": 0.00015955506117908786, "loss": 3.1316, "step": 915 }, { "epoch": 12.214046822742475, "grad_norm": 0.6226193904876709, "learning_rate": 0.00015951056729699667, "loss": 3.0897, "step": 916 }, { "epoch": 12.22742474916388, "grad_norm": 0.6268473267555237, "learning_rate": 0.00015946607341490546, "loss": 3.2036, "step": 917 }, { "epoch": 12.240802675585284, "grad_norm": 0.7825391292572021, "learning_rate": 0.00015942157953281424, "loss": 3.0082, "step": 918 }, { "epoch": 12.25418060200669, "grad_norm": 0.7881148457527161, "learning_rate": 0.00015937708565072302, "loss": 3.0779, "step": 919 }, { "epoch": 12.267558528428093, "grad_norm": 0.6725586652755737, "learning_rate": 0.00015933259176863183, "loss": 3.04, "step": 920 }, { "epoch": 12.280936454849499, "grad_norm": 0.5831689238548279, "learning_rate": 0.0001592880978865406, "loss": 3.3319, "step": 921 }, { "epoch": 12.294314381270903, "grad_norm": 0.6057907342910767, "learning_rate": 0.0001592436040044494, "loss": 3.0869, "step": 922 }, { "epoch": 12.307692307692308, "grad_norm": 0.771857500076294, "learning_rate": 0.00015919911012235818, "loss": 3.1472, "step": 923 }, { "epoch": 12.321070234113712, "grad_norm": 0.7447528839111328, "learning_rate": 0.000159154616240267, "loss": 3.174, "step": 924 }, { "epoch": 12.334448160535118, "grad_norm": 0.5772632956504822, "learning_rate": 0.00015911012235817574, "loss": 3.1767, "step": 925 }, { "epoch": 12.347826086956522, "grad_norm": 0.6952618956565857, "learning_rate": 0.00015906562847608455, "loss": 3.1597, "step": 926 }, { "epoch": 12.361204013377927, "grad_norm": 0.600922703742981, "learning_rate": 0.00015902113459399333, "loss": 3.3612, "step": 927 }, { "epoch": 12.37458193979933, "grad_norm": 0.7571472525596619, "learning_rate": 0.00015897664071190212, "loss": 2.9405, "step": 928 }, { "epoch": 12.387959866220736, "grad_norm": 0.9343985915184021, "learning_rate": 0.0001589321468298109, "loss": 3.2886, "step": 929 }, { "epoch": 12.40133779264214, "grad_norm": 0.7046729922294617, "learning_rate": 0.0001588876529477197, "loss": 3.4421, "step": 930 }, { "epoch": 12.414715719063546, "grad_norm": 0.7591777443885803, "learning_rate": 0.0001588431590656285, "loss": 3.005, "step": 931 }, { "epoch": 12.42809364548495, "grad_norm": 0.6508903503417969, "learning_rate": 0.00015879866518353727, "loss": 2.8554, "step": 932 }, { "epoch": 12.441471571906355, "grad_norm": 0.6557784676551819, "learning_rate": 0.00015875417130144606, "loss": 3.3268, "step": 933 }, { "epoch": 12.454849498327759, "grad_norm": 0.6941578984260559, "learning_rate": 0.00015870967741935487, "loss": 3.2088, "step": 934 }, { "epoch": 12.468227424749164, "grad_norm": 0.6824263334274292, "learning_rate": 0.00015866518353726362, "loss": 3.0897, "step": 935 }, { "epoch": 12.481605351170568, "grad_norm": 0.7324599027633667, "learning_rate": 0.00015862068965517243, "loss": 3.1443, "step": 936 }, { "epoch": 12.494983277591974, "grad_norm": 0.577022135257721, "learning_rate": 0.0001585761957730812, "loss": 3.0896, "step": 937 }, { "epoch": 12.508361204013378, "grad_norm": 0.6165060997009277, "learning_rate": 0.00015853170189099, "loss": 2.7546, "step": 938 }, { "epoch": 12.521739130434783, "grad_norm": 0.561906635761261, "learning_rate": 0.00015848720800889878, "loss": 3.4192, "step": 939 }, { "epoch": 12.535117056856187, "grad_norm": 0.5894923806190491, "learning_rate": 0.0001584427141268076, "loss": 3.0388, "step": 940 }, { "epoch": 12.548494983277592, "grad_norm": 0.6261674761772156, "learning_rate": 0.00015839822024471637, "loss": 3.0705, "step": 941 }, { "epoch": 12.561872909698996, "grad_norm": 0.695101261138916, "learning_rate": 0.00015835372636262515, "loss": 3.1684, "step": 942 }, { "epoch": 12.575250836120402, "grad_norm": 0.6176817417144775, "learning_rate": 0.00015830923248053393, "loss": 3.0708, "step": 943 }, { "epoch": 12.588628762541806, "grad_norm": 0.6548507213592529, "learning_rate": 0.00015826473859844274, "loss": 3.1569, "step": 944 }, { "epoch": 12.602006688963211, "grad_norm": 0.6046382188796997, "learning_rate": 0.0001582202447163515, "loss": 3.2479, "step": 945 }, { "epoch": 12.615384615384615, "grad_norm": 0.7103912234306335, "learning_rate": 0.0001581757508342603, "loss": 3.127, "step": 946 }, { "epoch": 12.62876254180602, "grad_norm": 0.7131765484809875, "learning_rate": 0.0001581312569521691, "loss": 3.0975, "step": 947 }, { "epoch": 12.642140468227424, "grad_norm": 0.6442859768867493, "learning_rate": 0.00015808676307007787, "loss": 3.2885, "step": 948 }, { "epoch": 12.65551839464883, "grad_norm": 0.6430050134658813, "learning_rate": 0.00015804226918798666, "loss": 3.0397, "step": 949 }, { "epoch": 12.668896321070234, "grad_norm": 0.6894303560256958, "learning_rate": 0.00015799777530589546, "loss": 3.0756, "step": 950 }, { "epoch": 12.68227424749164, "grad_norm": 0.7319600582122803, "learning_rate": 0.00015795328142380425, "loss": 3.0457, "step": 951 }, { "epoch": 12.695652173913043, "grad_norm": 0.6445140838623047, "learning_rate": 0.00015790878754171303, "loss": 2.9828, "step": 952 }, { "epoch": 12.709030100334449, "grad_norm": 0.7522070407867432, "learning_rate": 0.0001578642936596218, "loss": 2.8942, "step": 953 }, { "epoch": 12.722408026755852, "grad_norm": 0.7962691783905029, "learning_rate": 0.00015781979977753062, "loss": 3.0985, "step": 954 }, { "epoch": 12.735785953177258, "grad_norm": 0.6391687393188477, "learning_rate": 0.00015777530589543938, "loss": 3.1752, "step": 955 }, { "epoch": 12.749163879598662, "grad_norm": 0.7632976174354553, "learning_rate": 0.00015773081201334816, "loss": 3.3505, "step": 956 }, { "epoch": 12.762541806020067, "grad_norm": 0.7491022944450378, "learning_rate": 0.00015768631813125697, "loss": 3.0721, "step": 957 }, { "epoch": 12.775919732441471, "grad_norm": 0.6163421273231506, "learning_rate": 0.00015764182424916572, "loss": 3.3242, "step": 958 }, { "epoch": 12.789297658862877, "grad_norm": 0.6831198334693909, "learning_rate": 0.00015759733036707453, "loss": 3.1409, "step": 959 }, { "epoch": 12.80267558528428, "grad_norm": 0.812300980091095, "learning_rate": 0.00015755283648498332, "loss": 2.9606, "step": 960 }, { "epoch": 12.816053511705686, "grad_norm": 0.6904334425926208, "learning_rate": 0.00015750834260289213, "loss": 3.0398, "step": 961 }, { "epoch": 12.82943143812709, "grad_norm": 0.6349720358848572, "learning_rate": 0.00015746384872080088, "loss": 3.1033, "step": 962 }, { "epoch": 12.842809364548495, "grad_norm": 0.6837566494941711, "learning_rate": 0.0001574193548387097, "loss": 3.2353, "step": 963 }, { "epoch": 12.856187290969899, "grad_norm": 0.5852749943733215, "learning_rate": 0.00015737486095661847, "loss": 3.0972, "step": 964 }, { "epoch": 12.869565217391305, "grad_norm": 0.6641372442245483, "learning_rate": 0.00015733036707452725, "loss": 3.2243, "step": 965 }, { "epoch": 12.882943143812708, "grad_norm": 0.6613900065422058, "learning_rate": 0.00015728587319243604, "loss": 3.1263, "step": 966 }, { "epoch": 12.896321070234114, "grad_norm": 0.6126120090484619, "learning_rate": 0.00015724137931034485, "loss": 3.069, "step": 967 }, { "epoch": 12.909698996655518, "grad_norm": 0.6764604449272156, "learning_rate": 0.0001571968854282536, "loss": 3.1397, "step": 968 }, { "epoch": 12.923076923076923, "grad_norm": 0.6447578072547913, "learning_rate": 0.0001571523915461624, "loss": 3.1839, "step": 969 }, { "epoch": 12.936454849498327, "grad_norm": 0.5872016549110413, "learning_rate": 0.0001571078976640712, "loss": 3.3144, "step": 970 }, { "epoch": 12.949832775919733, "grad_norm": 0.626276969909668, "learning_rate": 0.00015706340378198, "loss": 3.1295, "step": 971 }, { "epoch": 12.963210702341136, "grad_norm": 0.6829231381416321, "learning_rate": 0.00015701890989988876, "loss": 3.2261, "step": 972 }, { "epoch": 12.976588628762542, "grad_norm": 0.6197345852851868, "learning_rate": 0.00015697441601779757, "loss": 3.1117, "step": 973 }, { "epoch": 12.989966555183946, "grad_norm": 0.6137062907218933, "learning_rate": 0.00015692992213570635, "loss": 3.1548, "step": 974 }, { "epoch": 13.0, "grad_norm": 0.7483121752738953, "learning_rate": 0.00015688542825361513, "loss": 3.4944, "step": 975 }, { "epoch": 13.013377926421406, "grad_norm": 0.6102525591850281, "learning_rate": 0.00015684093437152392, "loss": 3.1403, "step": 976 }, { "epoch": 13.02675585284281, "grad_norm": 0.7258747220039368, "learning_rate": 0.00015679644048943272, "loss": 2.8474, "step": 977 }, { "epoch": 13.040133779264215, "grad_norm": 0.6891087293624878, "learning_rate": 0.00015675194660734148, "loss": 2.9722, "step": 978 }, { "epoch": 13.053511705685619, "grad_norm": 0.6320910453796387, "learning_rate": 0.0001567074527252503, "loss": 2.9807, "step": 979 }, { "epoch": 13.066889632107024, "grad_norm": 0.8684266209602356, "learning_rate": 0.00015666295884315907, "loss": 2.8448, "step": 980 }, { "epoch": 13.080267558528428, "grad_norm": 0.7126099467277527, "learning_rate": 0.00015661846496106788, "loss": 3.1915, "step": 981 }, { "epoch": 13.093645484949834, "grad_norm": 0.7001529335975647, "learning_rate": 0.00015657397107897664, "loss": 3.2017, "step": 982 }, { "epoch": 13.107023411371237, "grad_norm": 0.7901191711425781, "learning_rate": 0.00015652947719688545, "loss": 3.1273, "step": 983 }, { "epoch": 13.120401337792643, "grad_norm": 0.5769410729408264, "learning_rate": 0.00015648498331479423, "loss": 2.8582, "step": 984 }, { "epoch": 13.133779264214047, "grad_norm": 0.5969700813293457, "learning_rate": 0.000156440489432703, "loss": 3.118, "step": 985 }, { "epoch": 13.147157190635452, "grad_norm": 0.5789377093315125, "learning_rate": 0.0001563959955506118, "loss": 2.8121, "step": 986 }, { "epoch": 13.160535117056856, "grad_norm": 0.7945278882980347, "learning_rate": 0.0001563515016685206, "loss": 2.9212, "step": 987 }, { "epoch": 13.173913043478262, "grad_norm": 0.60884690284729, "learning_rate": 0.00015630700778642936, "loss": 3.1993, "step": 988 }, { "epoch": 13.187290969899665, "grad_norm": 0.616142213344574, "learning_rate": 0.00015626251390433817, "loss": 3.107, "step": 989 }, { "epoch": 13.200668896321071, "grad_norm": 0.6428812146186829, "learning_rate": 0.00015621802002224695, "loss": 3.1401, "step": 990 }, { "epoch": 13.214046822742475, "grad_norm": 0.5723693370819092, "learning_rate": 0.00015617352614015576, "loss": 3.0474, "step": 991 }, { "epoch": 13.22742474916388, "grad_norm": 0.5820907950401306, "learning_rate": 0.00015612903225806451, "loss": 3.2211, "step": 992 }, { "epoch": 13.240802675585284, "grad_norm": 0.555957555770874, "learning_rate": 0.00015608453837597332, "loss": 3.1306, "step": 993 }, { "epoch": 13.25418060200669, "grad_norm": 0.528698205947876, "learning_rate": 0.0001560400444938821, "loss": 3.0989, "step": 994 }, { "epoch": 13.267558528428093, "grad_norm": 0.5959749817848206, "learning_rate": 0.0001559955506117909, "loss": 2.9128, "step": 995 }, { "epoch": 13.280936454849499, "grad_norm": 0.6702240705490112, "learning_rate": 0.00015595105672969967, "loss": 2.9969, "step": 996 }, { "epoch": 13.294314381270903, "grad_norm": 0.5363825559616089, "learning_rate": 0.00015590656284760845, "loss": 3.2027, "step": 997 }, { "epoch": 13.307692307692308, "grad_norm": 0.6402661204338074, "learning_rate": 0.00015586206896551724, "loss": 3.2733, "step": 998 }, { "epoch": 13.321070234113712, "grad_norm": 0.700517475605011, "learning_rate": 0.00015581757508342602, "loss": 3.2629, "step": 999 }, { "epoch": 13.334448160535118, "grad_norm": 0.5500949621200562, "learning_rate": 0.00015577308120133483, "loss": 3.172, "step": 1000 }, { "epoch": 13.347826086956522, "grad_norm": 0.6199147701263428, "learning_rate": 0.0001557285873192436, "loss": 3.0556, "step": 1001 }, { "epoch": 13.361204013377927, "grad_norm": 0.5900529026985168, "learning_rate": 0.0001556840934371524, "loss": 3.1248, "step": 1002 }, { "epoch": 13.37458193979933, "grad_norm": 0.7272413372993469, "learning_rate": 0.00015563959955506118, "loss": 3.1142, "step": 1003 }, { "epoch": 13.387959866220736, "grad_norm": 0.6461951732635498, "learning_rate": 0.00015559510567296998, "loss": 3.1145, "step": 1004 }, { "epoch": 13.40133779264214, "grad_norm": 0.5750373005867004, "learning_rate": 0.00015555061179087874, "loss": 3.2117, "step": 1005 }, { "epoch": 13.414715719063546, "grad_norm": 0.6486302614212036, "learning_rate": 0.00015550611790878755, "loss": 2.9018, "step": 1006 }, { "epoch": 13.42809364548495, "grad_norm": 0.6897476315498352, "learning_rate": 0.00015546162402669633, "loss": 3.2987, "step": 1007 }, { "epoch": 13.441471571906355, "grad_norm": 0.5997576713562012, "learning_rate": 0.00015541713014460511, "loss": 2.974, "step": 1008 }, { "epoch": 13.454849498327759, "grad_norm": 0.6484793424606323, "learning_rate": 0.0001553726362625139, "loss": 2.9827, "step": 1009 }, { "epoch": 13.468227424749164, "grad_norm": 0.562312126159668, "learning_rate": 0.0001553281423804227, "loss": 3.2243, "step": 1010 }, { "epoch": 13.481605351170568, "grad_norm": 0.7257137298583984, "learning_rate": 0.0001552836484983315, "loss": 3.1081, "step": 1011 }, { "epoch": 13.494983277591974, "grad_norm": 0.7201404571533203, "learning_rate": 0.00015523915461624027, "loss": 3.1468, "step": 1012 }, { "epoch": 13.508361204013378, "grad_norm": 0.666539192199707, "learning_rate": 0.00015519466073414905, "loss": 3.0081, "step": 1013 }, { "epoch": 13.521739130434783, "grad_norm": 0.6867642998695374, "learning_rate": 0.00015515016685205786, "loss": 2.7889, "step": 1014 }, { "epoch": 13.535117056856187, "grad_norm": 0.5799785256385803, "learning_rate": 0.00015510567296996662, "loss": 3.2995, "step": 1015 }, { "epoch": 13.548494983277592, "grad_norm": 0.6155371069908142, "learning_rate": 0.00015506117908787543, "loss": 3.2227, "step": 1016 }, { "epoch": 13.561872909698996, "grad_norm": 0.7604040503501892, "learning_rate": 0.0001550166852057842, "loss": 3.0384, "step": 1017 }, { "epoch": 13.575250836120402, "grad_norm": 0.8445917963981628, "learning_rate": 0.000154972191323693, "loss": 2.9566, "step": 1018 }, { "epoch": 13.588628762541806, "grad_norm": 0.7978566288948059, "learning_rate": 0.00015492769744160177, "loss": 3.0175, "step": 1019 }, { "epoch": 13.602006688963211, "grad_norm": 0.5899437069892883, "learning_rate": 0.00015488320355951058, "loss": 3.2418, "step": 1020 }, { "epoch": 13.615384615384615, "grad_norm": 0.7204627990722656, "learning_rate": 0.00015483870967741937, "loss": 3.1568, "step": 1021 }, { "epoch": 13.62876254180602, "grad_norm": 0.6504855155944824, "learning_rate": 0.00015479421579532815, "loss": 2.8952, "step": 1022 }, { "epoch": 13.642140468227424, "grad_norm": 0.8101251125335693, "learning_rate": 0.00015474972191323693, "loss": 3.1975, "step": 1023 }, { "epoch": 13.65551839464883, "grad_norm": 0.6161416172981262, "learning_rate": 0.00015470522803114574, "loss": 3.1565, "step": 1024 }, { "epoch": 13.668896321070234, "grad_norm": 0.6131258606910706, "learning_rate": 0.0001546607341490545, "loss": 3.0382, "step": 1025 }, { "epoch": 13.68227424749164, "grad_norm": 0.8008583784103394, "learning_rate": 0.0001546162402669633, "loss": 2.995, "step": 1026 }, { "epoch": 13.695652173913043, "grad_norm": 0.7101227045059204, "learning_rate": 0.0001545717463848721, "loss": 3.0704, "step": 1027 }, { "epoch": 13.709030100334449, "grad_norm": 0.7988458275794983, "learning_rate": 0.00015452725250278087, "loss": 3.1424, "step": 1028 }, { "epoch": 13.722408026755852, "grad_norm": 0.6013655662536621, "learning_rate": 0.00015448275862068965, "loss": 2.9986, "step": 1029 }, { "epoch": 13.735785953177258, "grad_norm": 0.6368236541748047, "learning_rate": 0.00015443826473859846, "loss": 3.0193, "step": 1030 }, { "epoch": 13.749163879598662, "grad_norm": 0.8222694396972656, "learning_rate": 0.00015439377085650724, "loss": 3.0888, "step": 1031 }, { "epoch": 13.762541806020067, "grad_norm": 0.7270404696464539, "learning_rate": 0.00015434927697441603, "loss": 3.0929, "step": 1032 }, { "epoch": 13.775919732441471, "grad_norm": 0.7292355298995972, "learning_rate": 0.0001543047830923248, "loss": 2.7676, "step": 1033 }, { "epoch": 13.789297658862877, "grad_norm": 0.6662157773971558, "learning_rate": 0.00015426028921023362, "loss": 3.1984, "step": 1034 }, { "epoch": 13.80267558528428, "grad_norm": 0.6350163817405701, "learning_rate": 0.00015421579532814237, "loss": 3.3042, "step": 1035 }, { "epoch": 13.816053511705686, "grad_norm": 0.5999907851219177, "learning_rate": 0.00015417130144605118, "loss": 3.0983, "step": 1036 }, { "epoch": 13.82943143812709, "grad_norm": 0.5942257642745972, "learning_rate": 0.00015412680756395997, "loss": 3.0474, "step": 1037 }, { "epoch": 13.842809364548495, "grad_norm": 0.662589430809021, "learning_rate": 0.00015408231368186875, "loss": 2.9251, "step": 1038 }, { "epoch": 13.856187290969899, "grad_norm": 0.5817089080810547, "learning_rate": 0.00015403781979977753, "loss": 3.0716, "step": 1039 }, { "epoch": 13.869565217391305, "grad_norm": 0.6019257307052612, "learning_rate": 0.0001539933259176863, "loss": 3.1754, "step": 1040 }, { "epoch": 13.882943143812708, "grad_norm": 0.6301860213279724, "learning_rate": 0.00015394883203559512, "loss": 3.2066, "step": 1041 }, { "epoch": 13.896321070234114, "grad_norm": 0.6468888521194458, "learning_rate": 0.00015390433815350388, "loss": 3.3001, "step": 1042 }, { "epoch": 13.909698996655518, "grad_norm": 0.6510801911354065, "learning_rate": 0.0001538598442714127, "loss": 3.2198, "step": 1043 }, { "epoch": 13.923076923076923, "grad_norm": 0.5692014694213867, "learning_rate": 0.00015381535038932147, "loss": 3.3257, "step": 1044 }, { "epoch": 13.936454849498327, "grad_norm": 0.594219982624054, "learning_rate": 0.00015377085650723025, "loss": 2.9918, "step": 1045 }, { "epoch": 13.949832775919733, "grad_norm": 0.6501769423484802, "learning_rate": 0.00015372636262513903, "loss": 2.9653, "step": 1046 }, { "epoch": 13.963210702341136, "grad_norm": 0.6310623288154602, "learning_rate": 0.00015368186874304784, "loss": 3.19, "step": 1047 }, { "epoch": 13.976588628762542, "grad_norm": 0.5795436501502991, "learning_rate": 0.00015363737486095663, "loss": 3.272, "step": 1048 }, { "epoch": 13.989966555183946, "grad_norm": 0.5421392917633057, "learning_rate": 0.0001535928809788654, "loss": 3.2109, "step": 1049 }, { "epoch": 14.0, "grad_norm": 0.833959698677063, "learning_rate": 0.0001535483870967742, "loss": 3.3017, "step": 1050 }, { "epoch": 14.013377926421406, "grad_norm": 0.6856208443641663, "learning_rate": 0.000153503893214683, "loss": 3.0784, "step": 1051 }, { "epoch": 14.02675585284281, "grad_norm": 0.5841811895370483, "learning_rate": 0.00015345939933259176, "loss": 2.848, "step": 1052 }, { "epoch": 14.040133779264215, "grad_norm": 0.557906985282898, "learning_rate": 0.00015341490545050057, "loss": 3.1564, "step": 1053 }, { "epoch": 14.053511705685619, "grad_norm": 0.5468619465827942, "learning_rate": 0.00015337041156840935, "loss": 3.1237, "step": 1054 }, { "epoch": 14.066889632107024, "grad_norm": 0.7213225364685059, "learning_rate": 0.00015332591768631813, "loss": 2.8993, "step": 1055 }, { "epoch": 14.080267558528428, "grad_norm": 0.7413175106048584, "learning_rate": 0.0001532814238042269, "loss": 3.0028, "step": 1056 }, { "epoch": 14.093645484949834, "grad_norm": 0.6072244644165039, "learning_rate": 0.00015323692992213572, "loss": 2.8534, "step": 1057 }, { "epoch": 14.107023411371237, "grad_norm": 0.683262288570404, "learning_rate": 0.0001531924360400445, "loss": 2.8982, "step": 1058 }, { "epoch": 14.120401337792643, "grad_norm": 0.5880157351493835, "learning_rate": 0.0001531479421579533, "loss": 2.9567, "step": 1059 }, { "epoch": 14.133779264214047, "grad_norm": 0.7519298195838928, "learning_rate": 0.00015310344827586207, "loss": 2.9615, "step": 1060 }, { "epoch": 14.147157190635452, "grad_norm": 0.7747945189476013, "learning_rate": 0.00015305895439377088, "loss": 2.7823, "step": 1061 }, { "epoch": 14.160535117056856, "grad_norm": 0.6560395956039429, "learning_rate": 0.00015301446051167963, "loss": 3.1791, "step": 1062 }, { "epoch": 14.173913043478262, "grad_norm": 0.6388076543807983, "learning_rate": 0.00015296996662958844, "loss": 2.9868, "step": 1063 }, { "epoch": 14.187290969899665, "grad_norm": 0.7349525690078735, "learning_rate": 0.00015292547274749723, "loss": 2.9291, "step": 1064 }, { "epoch": 14.200668896321071, "grad_norm": 0.7184433341026306, "learning_rate": 0.000152880978865406, "loss": 2.7889, "step": 1065 }, { "epoch": 14.214046822742475, "grad_norm": 0.6776930093765259, "learning_rate": 0.0001528364849833148, "loss": 3.1572, "step": 1066 }, { "epoch": 14.22742474916388, "grad_norm": 0.818756103515625, "learning_rate": 0.0001527919911012236, "loss": 2.767, "step": 1067 }, { "epoch": 14.240802675585284, "grad_norm": 0.6005066633224487, "learning_rate": 0.00015274749721913238, "loss": 3.1796, "step": 1068 }, { "epoch": 14.25418060200669, "grad_norm": 0.6367926001548767, "learning_rate": 0.00015270300333704117, "loss": 2.9999, "step": 1069 }, { "epoch": 14.267558528428093, "grad_norm": 0.6823679208755493, "learning_rate": 0.00015265850945494995, "loss": 2.7663, "step": 1070 }, { "epoch": 14.280936454849499, "grad_norm": 0.6238808631896973, "learning_rate": 0.00015261401557285876, "loss": 3.1908, "step": 1071 }, { "epoch": 14.294314381270903, "grad_norm": 0.6983721256256104, "learning_rate": 0.0001525695216907675, "loss": 3.0264, "step": 1072 }, { "epoch": 14.307692307692308, "grad_norm": 0.7568501234054565, "learning_rate": 0.00015252502780867632, "loss": 3.0474, "step": 1073 }, { "epoch": 14.321070234113712, "grad_norm": 0.6250051259994507, "learning_rate": 0.0001524805339265851, "loss": 3.114, "step": 1074 }, { "epoch": 14.334448160535118, "grad_norm": 0.5907386541366577, "learning_rate": 0.0001524360400444939, "loss": 3.1823, "step": 1075 }, { "epoch": 14.347826086956522, "grad_norm": 0.6719332337379456, "learning_rate": 0.00015239154616240267, "loss": 2.994, "step": 1076 }, { "epoch": 14.361204013377927, "grad_norm": 0.5911534428596497, "learning_rate": 0.00015234705228031148, "loss": 3.4178, "step": 1077 }, { "epoch": 14.37458193979933, "grad_norm": 0.8071689009666443, "learning_rate": 0.00015230255839822026, "loss": 2.9633, "step": 1078 }, { "epoch": 14.387959866220736, "grad_norm": 0.5957038998603821, "learning_rate": 0.00015225806451612902, "loss": 3.1126, "step": 1079 }, { "epoch": 14.40133779264214, "grad_norm": 0.6604459285736084, "learning_rate": 0.00015221357063403783, "loss": 2.9193, "step": 1080 }, { "epoch": 14.414715719063546, "grad_norm": 0.626081109046936, "learning_rate": 0.0001521690767519466, "loss": 2.9321, "step": 1081 }, { "epoch": 14.42809364548495, "grad_norm": 0.5767174959182739, "learning_rate": 0.0001521245828698554, "loss": 3.1518, "step": 1082 }, { "epoch": 14.441471571906355, "grad_norm": 0.6444874405860901, "learning_rate": 0.00015208008898776417, "loss": 3.1948, "step": 1083 }, { "epoch": 14.454849498327759, "grad_norm": 0.668171763420105, "learning_rate": 0.00015203559510567298, "loss": 3.016, "step": 1084 }, { "epoch": 14.468227424749164, "grad_norm": 0.6998944878578186, "learning_rate": 0.00015199110122358176, "loss": 2.9558, "step": 1085 }, { "epoch": 14.481605351170568, "grad_norm": 0.5896235704421997, "learning_rate": 0.00015194660734149055, "loss": 3.0997, "step": 1086 }, { "epoch": 14.494983277591974, "grad_norm": 0.6724826097488403, "learning_rate": 0.00015190211345939933, "loss": 2.9369, "step": 1087 }, { "epoch": 14.508361204013378, "grad_norm": 0.5710486769676208, "learning_rate": 0.00015185761957730814, "loss": 3.4724, "step": 1088 }, { "epoch": 14.521739130434783, "grad_norm": 0.9997962117195129, "learning_rate": 0.0001518131256952169, "loss": 2.9058, "step": 1089 }, { "epoch": 14.535117056856187, "grad_norm": 0.668074905872345, "learning_rate": 0.0001517686318131257, "loss": 3.2401, "step": 1090 }, { "epoch": 14.548494983277592, "grad_norm": 0.6180433630943298, "learning_rate": 0.00015172413793103449, "loss": 2.8975, "step": 1091 }, { "epoch": 14.561872909698996, "grad_norm": 0.6412661075592041, "learning_rate": 0.00015167964404894327, "loss": 2.9882, "step": 1092 }, { "epoch": 14.575250836120402, "grad_norm": 0.715288519859314, "learning_rate": 0.00015163515016685205, "loss": 3.4004, "step": 1093 }, { "epoch": 14.588628762541806, "grad_norm": 0.689164400100708, "learning_rate": 0.00015159065628476086, "loss": 3.0705, "step": 1094 }, { "epoch": 14.602006688963211, "grad_norm": 0.7713497281074524, "learning_rate": 0.00015154616240266964, "loss": 3.159, "step": 1095 }, { "epoch": 14.615384615384615, "grad_norm": 0.6725841164588928, "learning_rate": 0.00015150166852057843, "loss": 2.8938, "step": 1096 }, { "epoch": 14.62876254180602, "grad_norm": 0.658108651638031, "learning_rate": 0.0001514571746384872, "loss": 2.8747, "step": 1097 }, { "epoch": 14.642140468227424, "grad_norm": 0.5711888074874878, "learning_rate": 0.00015141268075639602, "loss": 2.8989, "step": 1098 }, { "epoch": 14.65551839464883, "grad_norm": 0.6184161305427551, "learning_rate": 0.00015136818687430477, "loss": 3.0904, "step": 1099 }, { "epoch": 14.668896321070234, "grad_norm": 0.5937799215316772, "learning_rate": 0.00015132369299221358, "loss": 3.1637, "step": 1100 }, { "epoch": 14.68227424749164, "grad_norm": 0.591673731803894, "learning_rate": 0.00015127919911012236, "loss": 2.9547, "step": 1101 }, { "epoch": 14.695652173913043, "grad_norm": 0.7317401170730591, "learning_rate": 0.00015123470522803115, "loss": 3.2043, "step": 1102 }, { "epoch": 14.709030100334449, "grad_norm": 0.5784003734588623, "learning_rate": 0.00015119021134593993, "loss": 3.102, "step": 1103 }, { "epoch": 14.722408026755852, "grad_norm": 0.7077385187149048, "learning_rate": 0.00015114571746384874, "loss": 3.011, "step": 1104 }, { "epoch": 14.735785953177258, "grad_norm": 0.6472675204277039, "learning_rate": 0.00015110122358175752, "loss": 3.2075, "step": 1105 }, { "epoch": 14.749163879598662, "grad_norm": 0.6789306998252869, "learning_rate": 0.0001510567296996663, "loss": 2.9458, "step": 1106 }, { "epoch": 14.762541806020067, "grad_norm": 0.6602732539176941, "learning_rate": 0.00015101223581757509, "loss": 2.9941, "step": 1107 }, { "epoch": 14.775919732441471, "grad_norm": 0.7484832406044006, "learning_rate": 0.0001509677419354839, "loss": 3.0358, "step": 1108 }, { "epoch": 14.789297658862877, "grad_norm": 0.704139769077301, "learning_rate": 0.00015092324805339265, "loss": 3.1006, "step": 1109 }, { "epoch": 14.80267558528428, "grad_norm": 0.6545978784561157, "learning_rate": 0.00015087875417130146, "loss": 3.0369, "step": 1110 }, { "epoch": 14.816053511705686, "grad_norm": 0.5718163847923279, "learning_rate": 0.00015083426028921024, "loss": 3.1683, "step": 1111 }, { "epoch": 14.82943143812709, "grad_norm": 0.5773367285728455, "learning_rate": 0.00015078976640711902, "loss": 3.2753, "step": 1112 }, { "epoch": 14.842809364548495, "grad_norm": 0.6617185473442078, "learning_rate": 0.0001507452725250278, "loss": 2.9713, "step": 1113 }, { "epoch": 14.856187290969899, "grad_norm": 0.6748194098472595, "learning_rate": 0.00015070077864293662, "loss": 3.0961, "step": 1114 }, { "epoch": 14.869565217391305, "grad_norm": 0.6942034959793091, "learning_rate": 0.0001506562847608454, "loss": 3.0778, "step": 1115 }, { "epoch": 14.882943143812708, "grad_norm": 1.0203640460968018, "learning_rate": 0.00015061179087875418, "loss": 3.0705, "step": 1116 }, { "epoch": 14.896321070234114, "grad_norm": 0.5746601223945618, "learning_rate": 0.00015056729699666296, "loss": 3.1204, "step": 1117 }, { "epoch": 14.909698996655518, "grad_norm": 0.7374005317687988, "learning_rate": 0.00015052280311457177, "loss": 3.1289, "step": 1118 }, { "epoch": 14.923076923076923, "grad_norm": 0.5524411201477051, "learning_rate": 0.00015047830923248053, "loss": 3.2795, "step": 1119 }, { "epoch": 14.936454849498327, "grad_norm": 0.7024741768836975, "learning_rate": 0.0001504338153503893, "loss": 3.0675, "step": 1120 }, { "epoch": 14.949832775919733, "grad_norm": 0.7431137561798096, "learning_rate": 0.00015038932146829812, "loss": 3.1222, "step": 1121 }, { "epoch": 14.963210702341136, "grad_norm": 0.6568113565444946, "learning_rate": 0.0001503448275862069, "loss": 3.1523, "step": 1122 }, { "epoch": 14.976588628762542, "grad_norm": 0.6193330883979797, "learning_rate": 0.00015030033370411569, "loss": 3.1632, "step": 1123 }, { "epoch": 14.989966555183946, "grad_norm": 0.6371363401412964, "learning_rate": 0.00015025583982202447, "loss": 3.0525, "step": 1124 }, { "epoch": 15.0, "grad_norm": 0.652542233467102, "learning_rate": 0.00015021134593993328, "loss": 3.1846, "step": 1125 }, { "epoch": 15.013377926421406, "grad_norm": 0.6337831616401672, "learning_rate": 0.00015016685205784203, "loss": 2.9292, "step": 1126 }, { "epoch": 15.02675585284281, "grad_norm": 0.85350501537323, "learning_rate": 0.00015012235817575084, "loss": 2.913, "step": 1127 }, { "epoch": 15.040133779264215, "grad_norm": 0.6439313888549805, "learning_rate": 0.00015007786429365962, "loss": 3.0809, "step": 1128 }, { "epoch": 15.053511705685619, "grad_norm": 0.5232247114181519, "learning_rate": 0.0001500333704115684, "loss": 3.1348, "step": 1129 }, { "epoch": 15.066889632107024, "grad_norm": 0.6082741618156433, "learning_rate": 0.0001499888765294772, "loss": 2.9282, "step": 1130 }, { "epoch": 15.080267558528428, "grad_norm": 0.5736444592475891, "learning_rate": 0.000149944382647386, "loss": 2.9891, "step": 1131 }, { "epoch": 15.093645484949834, "grad_norm": 0.7732790112495422, "learning_rate": 0.00014989988876529478, "loss": 3.0461, "step": 1132 }, { "epoch": 15.107023411371237, "grad_norm": 0.618357241153717, "learning_rate": 0.00014985539488320356, "loss": 3.0296, "step": 1133 }, { "epoch": 15.120401337792643, "grad_norm": 0.7245836853981018, "learning_rate": 0.00014981090100111235, "loss": 3.0078, "step": 1134 }, { "epoch": 15.133779264214047, "grad_norm": 0.6738787293434143, "learning_rate": 0.00014976640711902116, "loss": 3.1171, "step": 1135 }, { "epoch": 15.147157190635452, "grad_norm": 0.5802761316299438, "learning_rate": 0.0001497219132369299, "loss": 3.1403, "step": 1136 }, { "epoch": 15.160535117056856, "grad_norm": 0.5941367745399475, "learning_rate": 0.00014967741935483872, "loss": 2.9149, "step": 1137 }, { "epoch": 15.173913043478262, "grad_norm": 0.8884940147399902, "learning_rate": 0.0001496329254727475, "loss": 2.8983, "step": 1138 }, { "epoch": 15.187290969899665, "grad_norm": 0.7229192852973938, "learning_rate": 0.00014958843159065628, "loss": 2.9774, "step": 1139 }, { "epoch": 15.200668896321071, "grad_norm": 0.6714467406272888, "learning_rate": 0.00014954393770856507, "loss": 2.9219, "step": 1140 }, { "epoch": 15.214046822742475, "grad_norm": 0.6785704493522644, "learning_rate": 0.00014949944382647388, "loss": 3.0205, "step": 1141 }, { "epoch": 15.22742474916388, "grad_norm": 0.6349677443504333, "learning_rate": 0.00014945494994438266, "loss": 3.1601, "step": 1142 }, { "epoch": 15.240802675585284, "grad_norm": 0.557123064994812, "learning_rate": 0.00014941045606229144, "loss": 2.6297, "step": 1143 }, { "epoch": 15.25418060200669, "grad_norm": 0.6714944243431091, "learning_rate": 0.00014936596218020022, "loss": 2.7951, "step": 1144 }, { "epoch": 15.267558528428093, "grad_norm": 0.6747463345527649, "learning_rate": 0.00014932146829810903, "loss": 2.7909, "step": 1145 }, { "epoch": 15.280936454849499, "grad_norm": 0.5717387199401855, "learning_rate": 0.0001492769744160178, "loss": 3.2896, "step": 1146 }, { "epoch": 15.294314381270903, "grad_norm": 0.6589123010635376, "learning_rate": 0.0001492324805339266, "loss": 2.8332, "step": 1147 }, { "epoch": 15.307692307692308, "grad_norm": 0.6273646950721741, "learning_rate": 0.00014918798665183538, "loss": 3.0084, "step": 1148 }, { "epoch": 15.321070234113712, "grad_norm": 0.6551377773284912, "learning_rate": 0.00014914349276974416, "loss": 2.8147, "step": 1149 }, { "epoch": 15.334448160535118, "grad_norm": 0.6751659512519836, "learning_rate": 0.00014909899888765295, "loss": 3.1345, "step": 1150 }, { "epoch": 15.347826086956522, "grad_norm": 0.677094042301178, "learning_rate": 0.00014905450500556175, "loss": 3.1958, "step": 1151 }, { "epoch": 15.361204013377927, "grad_norm": 0.6613426804542542, "learning_rate": 0.00014901001112347054, "loss": 3.081, "step": 1152 }, { "epoch": 15.37458193979933, "grad_norm": 0.7645783424377441, "learning_rate": 0.00014896551724137932, "loss": 2.8799, "step": 1153 }, { "epoch": 15.387959866220736, "grad_norm": 0.5698953866958618, "learning_rate": 0.0001489210233592881, "loss": 3.1691, "step": 1154 }, { "epoch": 15.40133779264214, "grad_norm": 0.6581351161003113, "learning_rate": 0.0001488765294771969, "loss": 3.2365, "step": 1155 }, { "epoch": 15.414715719063546, "grad_norm": 0.7809271812438965, "learning_rate": 0.00014883203559510567, "loss": 2.833, "step": 1156 }, { "epoch": 15.42809364548495, "grad_norm": 0.6226280927658081, "learning_rate": 0.00014878754171301448, "loss": 3.1502, "step": 1157 }, { "epoch": 15.441471571906355, "grad_norm": 0.5494824051856995, "learning_rate": 0.00014874304783092326, "loss": 3.2195, "step": 1158 }, { "epoch": 15.454849498327759, "grad_norm": 0.5729116797447205, "learning_rate": 0.00014869855394883204, "loss": 2.9463, "step": 1159 }, { "epoch": 15.468227424749164, "grad_norm": 0.6673750877380371, "learning_rate": 0.00014865406006674082, "loss": 3.1514, "step": 1160 }, { "epoch": 15.481605351170568, "grad_norm": 0.6746686697006226, "learning_rate": 0.0001486095661846496, "loss": 3.0088, "step": 1161 }, { "epoch": 15.494983277591974, "grad_norm": 0.6898564100265503, "learning_rate": 0.00014856507230255842, "loss": 2.8468, "step": 1162 }, { "epoch": 15.508361204013378, "grad_norm": 0.7262438535690308, "learning_rate": 0.00014852057842046717, "loss": 2.8017, "step": 1163 }, { "epoch": 15.521739130434783, "grad_norm": 0.6878666877746582, "learning_rate": 0.00014847608453837598, "loss": 2.9486, "step": 1164 }, { "epoch": 15.535117056856187, "grad_norm": 0.6375080347061157, "learning_rate": 0.00014843159065628476, "loss": 3.0757, "step": 1165 }, { "epoch": 15.548494983277592, "grad_norm": 0.6540268063545227, "learning_rate": 0.00014838709677419355, "loss": 3.1262, "step": 1166 }, { "epoch": 15.561872909698996, "grad_norm": 0.6036689877510071, "learning_rate": 0.00014834260289210233, "loss": 3.0607, "step": 1167 }, { "epoch": 15.575250836120402, "grad_norm": 0.5899893641471863, "learning_rate": 0.00014829810901001114, "loss": 3.1222, "step": 1168 }, { "epoch": 15.588628762541806, "grad_norm": 0.7268028259277344, "learning_rate": 0.00014825361512791992, "loss": 3.1777, "step": 1169 }, { "epoch": 15.602006688963211, "grad_norm": 0.6990141272544861, "learning_rate": 0.0001482091212458287, "loss": 3.2142, "step": 1170 }, { "epoch": 15.615384615384615, "grad_norm": 0.6009657382965088, "learning_rate": 0.00014816462736373748, "loss": 3.141, "step": 1171 }, { "epoch": 15.62876254180602, "grad_norm": 0.6287830471992493, "learning_rate": 0.0001481201334816463, "loss": 3.0621, "step": 1172 }, { "epoch": 15.642140468227424, "grad_norm": 0.6720128655433655, "learning_rate": 0.00014807563959955505, "loss": 3.0353, "step": 1173 }, { "epoch": 15.65551839464883, "grad_norm": 0.6694427132606506, "learning_rate": 0.00014803114571746386, "loss": 3.0171, "step": 1174 }, { "epoch": 15.668896321070234, "grad_norm": 0.5630237460136414, "learning_rate": 0.00014798665183537264, "loss": 3.1444, "step": 1175 }, { "epoch": 15.68227424749164, "grad_norm": 0.7139558792114258, "learning_rate": 0.00014794215795328142, "loss": 3.0012, "step": 1176 }, { "epoch": 15.695652173913043, "grad_norm": 0.6374551057815552, "learning_rate": 0.0001478976640711902, "loss": 2.9123, "step": 1177 }, { "epoch": 15.709030100334449, "grad_norm": 0.5957819223403931, "learning_rate": 0.00014785317018909902, "loss": 3.09, "step": 1178 }, { "epoch": 15.722408026755852, "grad_norm": 0.6083621382713318, "learning_rate": 0.0001478086763070078, "loss": 3.0231, "step": 1179 }, { "epoch": 15.735785953177258, "grad_norm": 0.6169192790985107, "learning_rate": 0.00014776418242491658, "loss": 2.9863, "step": 1180 }, { "epoch": 15.749163879598662, "grad_norm": 0.6058081984519958, "learning_rate": 0.00014771968854282536, "loss": 3.0261, "step": 1181 }, { "epoch": 15.762541806020067, "grad_norm": 0.5816760659217834, "learning_rate": 0.00014767519466073417, "loss": 3.1593, "step": 1182 }, { "epoch": 15.775919732441471, "grad_norm": 0.6246895790100098, "learning_rate": 0.00014763070077864293, "loss": 3.1029, "step": 1183 }, { "epoch": 15.789297658862877, "grad_norm": 0.56280517578125, "learning_rate": 0.00014758620689655174, "loss": 2.9778, "step": 1184 }, { "epoch": 15.80267558528428, "grad_norm": 0.5743212699890137, "learning_rate": 0.00014754171301446052, "loss": 2.8799, "step": 1185 }, { "epoch": 15.816053511705686, "grad_norm": 0.6163922548294067, "learning_rate": 0.0001474972191323693, "loss": 3.0226, "step": 1186 }, { "epoch": 15.82943143812709, "grad_norm": 0.5892409682273865, "learning_rate": 0.00014745272525027808, "loss": 3.1167, "step": 1187 }, { "epoch": 15.842809364548495, "grad_norm": 0.7977785468101501, "learning_rate": 0.0001474082313681869, "loss": 2.8427, "step": 1188 }, { "epoch": 15.856187290969899, "grad_norm": 0.7396023273468018, "learning_rate": 0.00014736373748609568, "loss": 2.7809, "step": 1189 }, { "epoch": 15.869565217391305, "grad_norm": 0.58844393491745, "learning_rate": 0.00014731924360400446, "loss": 3.0624, "step": 1190 }, { "epoch": 15.882943143812708, "grad_norm": 0.6903204321861267, "learning_rate": 0.00014727474972191324, "loss": 3.1246, "step": 1191 }, { "epoch": 15.896321070234114, "grad_norm": 0.5902391672134399, "learning_rate": 0.00014723025583982205, "loss": 3.0505, "step": 1192 }, { "epoch": 15.909698996655518, "grad_norm": 0.575752317905426, "learning_rate": 0.0001471857619577308, "loss": 2.8508, "step": 1193 }, { "epoch": 15.923076923076923, "grad_norm": 0.7248224020004272, "learning_rate": 0.00014714126807563961, "loss": 3.1438, "step": 1194 }, { "epoch": 15.936454849498327, "grad_norm": 0.5669791102409363, "learning_rate": 0.0001470967741935484, "loss": 3.1765, "step": 1195 }, { "epoch": 15.949832775919733, "grad_norm": 0.6656806468963623, "learning_rate": 0.00014705228031145718, "loss": 3.1456, "step": 1196 }, { "epoch": 15.963210702341136, "grad_norm": 0.6073266863822937, "learning_rate": 0.00014700778642936596, "loss": 3.1836, "step": 1197 }, { "epoch": 15.976588628762542, "grad_norm": 0.8209658861160278, "learning_rate": 0.00014696329254727477, "loss": 2.8457, "step": 1198 }, { "epoch": 15.989966555183946, "grad_norm": 0.6495081186294556, "learning_rate": 0.00014691879866518355, "loss": 3.0161, "step": 1199 }, { "epoch": 16.0, "grad_norm": 0.7522635459899902, "learning_rate": 0.00014687430478309234, "loss": 2.8293, "step": 1200 }, { "epoch": 16.013377926421406, "grad_norm": 0.8024417161941528, "learning_rate": 0.00014682981090100112, "loss": 2.909, "step": 1201 }, { "epoch": 16.02675585284281, "grad_norm": 0.7147983908653259, "learning_rate": 0.0001467853170189099, "loss": 2.7664, "step": 1202 }, { "epoch": 16.040133779264213, "grad_norm": 0.602427065372467, "learning_rate": 0.00014674082313681868, "loss": 2.8189, "step": 1203 }, { "epoch": 16.05351170568562, "grad_norm": 0.7274264097213745, "learning_rate": 0.00014669632925472747, "loss": 2.9957, "step": 1204 }, { "epoch": 16.066889632107024, "grad_norm": 0.6042147278785706, "learning_rate": 0.00014665183537263628, "loss": 2.8925, "step": 1205 }, { "epoch": 16.08026755852843, "grad_norm": 0.592339813709259, "learning_rate": 0.00014660734149054506, "loss": 3.1463, "step": 1206 }, { "epoch": 16.093645484949832, "grad_norm": 0.6099987030029297, "learning_rate": 0.00014656284760845384, "loss": 3.2264, "step": 1207 }, { "epoch": 16.107023411371237, "grad_norm": 0.9324999451637268, "learning_rate": 0.00014651835372636262, "loss": 2.916, "step": 1208 }, { "epoch": 16.120401337792643, "grad_norm": 0.6000798344612122, "learning_rate": 0.00014647385984427143, "loss": 2.9242, "step": 1209 }, { "epoch": 16.13377926421405, "grad_norm": 0.609199047088623, "learning_rate": 0.0001464293659621802, "loss": 2.9405, "step": 1210 }, { "epoch": 16.14715719063545, "grad_norm": 0.6789796948432922, "learning_rate": 0.000146384872080089, "loss": 3.003, "step": 1211 }, { "epoch": 16.160535117056856, "grad_norm": 0.6567651629447937, "learning_rate": 0.00014634037819799778, "loss": 2.8733, "step": 1212 }, { "epoch": 16.17391304347826, "grad_norm": 0.5860549211502075, "learning_rate": 0.00014629588431590656, "loss": 3.1429, "step": 1213 }, { "epoch": 16.187290969899667, "grad_norm": 0.6623414158821106, "learning_rate": 0.00014625139043381534, "loss": 3.1741, "step": 1214 }, { "epoch": 16.20066889632107, "grad_norm": 0.8366180062294006, "learning_rate": 0.00014620689655172415, "loss": 2.8627, "step": 1215 }, { "epoch": 16.214046822742475, "grad_norm": 0.616780698299408, "learning_rate": 0.00014616240266963294, "loss": 2.7619, "step": 1216 }, { "epoch": 16.22742474916388, "grad_norm": 0.6345306634902954, "learning_rate": 0.00014611790878754172, "loss": 2.9104, "step": 1217 }, { "epoch": 16.240802675585286, "grad_norm": 0.6326844096183777, "learning_rate": 0.0001460734149054505, "loss": 2.8453, "step": 1218 }, { "epoch": 16.254180602006688, "grad_norm": 0.5441793203353882, "learning_rate": 0.0001460289210233593, "loss": 3.2924, "step": 1219 }, { "epoch": 16.267558528428093, "grad_norm": 0.604637086391449, "learning_rate": 0.00014598442714126807, "loss": 3.0362, "step": 1220 }, { "epoch": 16.2809364548495, "grad_norm": 0.6100621819496155, "learning_rate": 0.00014593993325917687, "loss": 2.917, "step": 1221 }, { "epoch": 16.294314381270905, "grad_norm": 0.6323224902153015, "learning_rate": 0.00014589543937708566, "loss": 3.144, "step": 1222 }, { "epoch": 16.307692307692307, "grad_norm": 0.595485270023346, "learning_rate": 0.00014585094549499444, "loss": 2.8869, "step": 1223 }, { "epoch": 16.321070234113712, "grad_norm": 0.6350538730621338, "learning_rate": 0.00014580645161290322, "loss": 2.7253, "step": 1224 }, { "epoch": 16.334448160535118, "grad_norm": 0.5804395079612732, "learning_rate": 0.00014576195773081203, "loss": 3.1523, "step": 1225 }, { "epoch": 16.347826086956523, "grad_norm": 0.5905717015266418, "learning_rate": 0.00014571746384872081, "loss": 3.1419, "step": 1226 }, { "epoch": 16.361204013377925, "grad_norm": 0.6824894547462463, "learning_rate": 0.0001456729699666296, "loss": 2.8953, "step": 1227 }, { "epoch": 16.37458193979933, "grad_norm": 0.5840978622436523, "learning_rate": 0.00014562847608453838, "loss": 3.1229, "step": 1228 }, { "epoch": 16.387959866220736, "grad_norm": 0.7102469801902771, "learning_rate": 0.0001455839822024472, "loss": 2.6571, "step": 1229 }, { "epoch": 16.401337792642142, "grad_norm": 0.6148349046707153, "learning_rate": 0.00014553948832035594, "loss": 3.0084, "step": 1230 }, { "epoch": 16.414715719063544, "grad_norm": 0.60859215259552, "learning_rate": 0.00014549499443826475, "loss": 3.1348, "step": 1231 }, { "epoch": 16.42809364548495, "grad_norm": 0.6059513688087463, "learning_rate": 0.00014545050055617354, "loss": 2.9678, "step": 1232 }, { "epoch": 16.441471571906355, "grad_norm": 0.6012231707572937, "learning_rate": 0.00014540600667408232, "loss": 2.9279, "step": 1233 }, { "epoch": 16.45484949832776, "grad_norm": 0.6185587644577026, "learning_rate": 0.0001453615127919911, "loss": 2.9922, "step": 1234 }, { "epoch": 16.468227424749163, "grad_norm": 0.5989127159118652, "learning_rate": 0.0001453170189098999, "loss": 3.0736, "step": 1235 }, { "epoch": 16.48160535117057, "grad_norm": 0.673633337020874, "learning_rate": 0.0001452725250278087, "loss": 2.9703, "step": 1236 }, { "epoch": 16.494983277591974, "grad_norm": 0.6815900206565857, "learning_rate": 0.00014522803114571747, "loss": 3.0171, "step": 1237 }, { "epoch": 16.50836120401338, "grad_norm": 0.7687232494354248, "learning_rate": 0.00014518353726362626, "loss": 2.8988, "step": 1238 }, { "epoch": 16.52173913043478, "grad_norm": 0.7992755174636841, "learning_rate": 0.00014513904338153507, "loss": 3.0347, "step": 1239 }, { "epoch": 16.535117056856187, "grad_norm": 0.8673639893531799, "learning_rate": 0.00014509454949944382, "loss": 2.9654, "step": 1240 }, { "epoch": 16.548494983277592, "grad_norm": 0.6200671792030334, "learning_rate": 0.00014505005561735263, "loss": 3.1075, "step": 1241 }, { "epoch": 16.561872909698998, "grad_norm": 0.8055624961853027, "learning_rate": 0.0001450055617352614, "loss": 2.877, "step": 1242 }, { "epoch": 16.5752508361204, "grad_norm": 0.6428245902061462, "learning_rate": 0.0001449610678531702, "loss": 3.0326, "step": 1243 }, { "epoch": 16.588628762541806, "grad_norm": 0.844804584980011, "learning_rate": 0.00014491657397107898, "loss": 2.8093, "step": 1244 }, { "epoch": 16.60200668896321, "grad_norm": 0.5699613690376282, "learning_rate": 0.00014487208008898776, "loss": 3.1914, "step": 1245 }, { "epoch": 16.615384615384617, "grad_norm": 0.6638582348823547, "learning_rate": 0.00014482758620689657, "loss": 2.7484, "step": 1246 }, { "epoch": 16.62876254180602, "grad_norm": 0.7390914559364319, "learning_rate": 0.00014478309232480533, "loss": 3.0355, "step": 1247 }, { "epoch": 16.642140468227424, "grad_norm": 0.6177923083305359, "learning_rate": 0.00014473859844271413, "loss": 3.0067, "step": 1248 }, { "epoch": 16.65551839464883, "grad_norm": 0.6234062314033508, "learning_rate": 0.00014469410456062292, "loss": 2.8153, "step": 1249 }, { "epoch": 16.668896321070235, "grad_norm": 0.8505418300628662, "learning_rate": 0.0001446496106785317, "loss": 2.8292, "step": 1250 }, { "epoch": 16.682274247491637, "grad_norm": 0.8339266180992126, "learning_rate": 0.00014460511679644048, "loss": 2.8072, "step": 1251 }, { "epoch": 16.695652173913043, "grad_norm": 0.5782635807991028, "learning_rate": 0.0001445606229143493, "loss": 3.1079, "step": 1252 }, { "epoch": 16.70903010033445, "grad_norm": 0.687126874923706, "learning_rate": 0.00014451612903225807, "loss": 3.1401, "step": 1253 }, { "epoch": 16.722408026755854, "grad_norm": 0.7313762307167053, "learning_rate": 0.00014447163515016686, "loss": 2.9167, "step": 1254 }, { "epoch": 16.735785953177256, "grad_norm": 0.8815247416496277, "learning_rate": 0.00014442714126807564, "loss": 3.0294, "step": 1255 }, { "epoch": 16.74916387959866, "grad_norm": 0.7636277675628662, "learning_rate": 0.00014438264738598445, "loss": 3.0758, "step": 1256 }, { "epoch": 16.762541806020067, "grad_norm": 0.5961578488349915, "learning_rate": 0.0001443381535038932, "loss": 3.1027, "step": 1257 }, { "epoch": 16.775919732441473, "grad_norm": 0.6840028762817383, "learning_rate": 0.000144293659621802, "loss": 2.9192, "step": 1258 }, { "epoch": 16.789297658862875, "grad_norm": 0.7895340323448181, "learning_rate": 0.0001442491657397108, "loss": 2.8772, "step": 1259 }, { "epoch": 16.80267558528428, "grad_norm": 0.8516091704368591, "learning_rate": 0.00014420467185761958, "loss": 2.9168, "step": 1260 }, { "epoch": 16.816053511705686, "grad_norm": 0.6745076179504395, "learning_rate": 0.00014416017797552836, "loss": 3.1352, "step": 1261 }, { "epoch": 16.82943143812709, "grad_norm": 0.6744667887687683, "learning_rate": 0.00014411568409343717, "loss": 2.8439, "step": 1262 }, { "epoch": 16.842809364548494, "grad_norm": 0.6307089924812317, "learning_rate": 0.00014407119021134595, "loss": 3.2054, "step": 1263 }, { "epoch": 16.8561872909699, "grad_norm": 0.6480753421783447, "learning_rate": 0.00014402669632925473, "loss": 3.039, "step": 1264 }, { "epoch": 16.869565217391305, "grad_norm": 0.6143667697906494, "learning_rate": 0.00014398220244716352, "loss": 2.9475, "step": 1265 }, { "epoch": 16.88294314381271, "grad_norm": 0.6289299130439758, "learning_rate": 0.00014393770856507233, "loss": 3.1437, "step": 1266 }, { "epoch": 16.896321070234112, "grad_norm": 0.6618160009384155, "learning_rate": 0.00014389321468298108, "loss": 2.8641, "step": 1267 }, { "epoch": 16.909698996655518, "grad_norm": 0.6053375601768494, "learning_rate": 0.0001438487208008899, "loss": 2.9129, "step": 1268 }, { "epoch": 16.923076923076923, "grad_norm": 0.5706185102462769, "learning_rate": 0.00014380422691879867, "loss": 3.005, "step": 1269 }, { "epoch": 16.93645484949833, "grad_norm": 0.6779253482818604, "learning_rate": 0.00014375973303670746, "loss": 3.1071, "step": 1270 }, { "epoch": 16.94983277591973, "grad_norm": 0.6679616570472717, "learning_rate": 0.00014371523915461624, "loss": 3.1792, "step": 1271 }, { "epoch": 16.963210702341136, "grad_norm": 0.6018584966659546, "learning_rate": 0.00014367074527252505, "loss": 2.9947, "step": 1272 }, { "epoch": 16.976588628762542, "grad_norm": 0.6106094717979431, "learning_rate": 0.00014362625139043383, "loss": 3.1965, "step": 1273 }, { "epoch": 16.989966555183948, "grad_norm": 0.5486257672309875, "learning_rate": 0.0001435817575083426, "loss": 3.0975, "step": 1274 }, { "epoch": 17.0, "grad_norm": 0.6516265273094177, "learning_rate": 0.0001435372636262514, "loss": 3.1064, "step": 1275 }, { "epoch": 17.013377926421406, "grad_norm": 0.6434454917907715, "learning_rate": 0.0001434927697441602, "loss": 2.8903, "step": 1276 }, { "epoch": 17.02675585284281, "grad_norm": 0.6237186193466187, "learning_rate": 0.00014344827586206896, "loss": 3.135, "step": 1277 }, { "epoch": 17.040133779264213, "grad_norm": 0.5920026302337646, "learning_rate": 0.00014340378197997777, "loss": 2.8612, "step": 1278 }, { "epoch": 17.05351170568562, "grad_norm": 0.6545232534408569, "learning_rate": 0.00014335928809788655, "loss": 2.8934, "step": 1279 }, { "epoch": 17.066889632107024, "grad_norm": 0.7839710116386414, "learning_rate": 0.00014331479421579533, "loss": 2.965, "step": 1280 }, { "epoch": 17.08026755852843, "grad_norm": 0.6448978781700134, "learning_rate": 0.00014327030033370412, "loss": 2.9343, "step": 1281 }, { "epoch": 17.093645484949832, "grad_norm": 0.5713958144187927, "learning_rate": 0.00014322580645161293, "loss": 3.1566, "step": 1282 }, { "epoch": 17.107023411371237, "grad_norm": 0.666409969329834, "learning_rate": 0.0001431813125695217, "loss": 2.7705, "step": 1283 }, { "epoch": 17.120401337792643, "grad_norm": 0.6068354249000549, "learning_rate": 0.0001431368186874305, "loss": 3.088, "step": 1284 }, { "epoch": 17.13377926421405, "grad_norm": 0.8292580246925354, "learning_rate": 0.00014309232480533927, "loss": 2.8939, "step": 1285 }, { "epoch": 17.14715719063545, "grad_norm": 0.6789494156837463, "learning_rate": 0.00014304783092324806, "loss": 3.2317, "step": 1286 }, { "epoch": 17.160535117056856, "grad_norm": 0.6030963063240051, "learning_rate": 0.00014300333704115684, "loss": 2.9248, "step": 1287 }, { "epoch": 17.17391304347826, "grad_norm": 0.7090041041374207, "learning_rate": 0.00014295884315906562, "loss": 2.9097, "step": 1288 }, { "epoch": 17.187290969899667, "grad_norm": 0.5750879645347595, "learning_rate": 0.00014291434927697443, "loss": 2.9585, "step": 1289 }, { "epoch": 17.20066889632107, "grad_norm": 0.6379792094230652, "learning_rate": 0.0001428698553948832, "loss": 2.794, "step": 1290 }, { "epoch": 17.214046822742475, "grad_norm": 0.6736400127410889, "learning_rate": 0.000142825361512792, "loss": 2.8812, "step": 1291 }, { "epoch": 17.22742474916388, "grad_norm": 0.6580933332443237, "learning_rate": 0.00014278086763070078, "loss": 2.8029, "step": 1292 }, { "epoch": 17.240802675585286, "grad_norm": 0.6550527215003967, "learning_rate": 0.00014273637374860959, "loss": 3.1135, "step": 1293 }, { "epoch": 17.254180602006688, "grad_norm": 0.6616796255111694, "learning_rate": 0.00014269187986651834, "loss": 2.8916, "step": 1294 }, { "epoch": 17.267558528428093, "grad_norm": 0.7247623801231384, "learning_rate": 0.00014264738598442715, "loss": 2.8482, "step": 1295 }, { "epoch": 17.2809364548495, "grad_norm": 0.7138639092445374, "learning_rate": 0.00014260289210233593, "loss": 2.9283, "step": 1296 }, { "epoch": 17.294314381270905, "grad_norm": 0.6413894891738892, "learning_rate": 0.00014255839822024472, "loss": 3.0273, "step": 1297 }, { "epoch": 17.307692307692307, "grad_norm": 0.6106882095336914, "learning_rate": 0.0001425139043381535, "loss": 3.062, "step": 1298 }, { "epoch": 17.321070234113712, "grad_norm": 0.6762199997901917, "learning_rate": 0.0001424694104560623, "loss": 2.7864, "step": 1299 }, { "epoch": 17.334448160535118, "grad_norm": 0.65083909034729, "learning_rate": 0.0001424249165739711, "loss": 3.115, "step": 1300 }, { "epoch": 17.347826086956523, "grad_norm": 0.7381249666213989, "learning_rate": 0.00014238042269187987, "loss": 2.6981, "step": 1301 }, { "epoch": 17.361204013377925, "grad_norm": 0.5674475431442261, "learning_rate": 0.00014233592880978865, "loss": 2.8646, "step": 1302 }, { "epoch": 17.37458193979933, "grad_norm": 0.6201330423355103, "learning_rate": 0.00014229143492769746, "loss": 3.207, "step": 1303 }, { "epoch": 17.387959866220736, "grad_norm": 0.7004446983337402, "learning_rate": 0.00014224694104560622, "loss": 2.7623, "step": 1304 }, { "epoch": 17.401337792642142, "grad_norm": 0.7278717756271362, "learning_rate": 0.00014220244716351503, "loss": 2.7967, "step": 1305 }, { "epoch": 17.414715719063544, "grad_norm": 0.6384133696556091, "learning_rate": 0.0001421579532814238, "loss": 2.7542, "step": 1306 }, { "epoch": 17.42809364548495, "grad_norm": 0.6443619132041931, "learning_rate": 0.0001421134593993326, "loss": 2.9376, "step": 1307 }, { "epoch": 17.441471571906355, "grad_norm": 0.6889017224311829, "learning_rate": 0.00014206896551724138, "loss": 2.8868, "step": 1308 }, { "epoch": 17.45484949832776, "grad_norm": 0.624864935874939, "learning_rate": 0.00014202447163515019, "loss": 3.0459, "step": 1309 }, { "epoch": 17.468227424749163, "grad_norm": 0.5964120626449585, "learning_rate": 0.00014197997775305897, "loss": 2.8604, "step": 1310 }, { "epoch": 17.48160535117057, "grad_norm": 0.5584320425987244, "learning_rate": 0.00014193548387096775, "loss": 3.1267, "step": 1311 }, { "epoch": 17.494983277591974, "grad_norm": 0.6566155552864075, "learning_rate": 0.00014189098998887653, "loss": 2.7059, "step": 1312 }, { "epoch": 17.50836120401338, "grad_norm": 0.7166509032249451, "learning_rate": 0.00014184649610678534, "loss": 3.1076, "step": 1313 }, { "epoch": 17.52173913043478, "grad_norm": 0.6076642870903015, "learning_rate": 0.0001418020022246941, "loss": 3.1121, "step": 1314 }, { "epoch": 17.535117056856187, "grad_norm": 0.6293672323226929, "learning_rate": 0.0001417575083426029, "loss": 3.0719, "step": 1315 }, { "epoch": 17.548494983277592, "grad_norm": 0.6696231365203857, "learning_rate": 0.0001417130144605117, "loss": 2.4281, "step": 1316 }, { "epoch": 17.561872909698998, "grad_norm": 0.7928171157836914, "learning_rate": 0.00014166852057842047, "loss": 2.8602, "step": 1317 }, { "epoch": 17.5752508361204, "grad_norm": 0.5897494554519653, "learning_rate": 0.00014162402669632925, "loss": 3.1556, "step": 1318 }, { "epoch": 17.588628762541806, "grad_norm": 0.6028451323509216, "learning_rate": 0.00014157953281423806, "loss": 3.1228, "step": 1319 }, { "epoch": 17.60200668896321, "grad_norm": 0.6237207651138306, "learning_rate": 0.00014153503893214685, "loss": 2.9691, "step": 1320 }, { "epoch": 17.615384615384617, "grad_norm": 0.6401494741439819, "learning_rate": 0.00014149054505005563, "loss": 3.0659, "step": 1321 }, { "epoch": 17.62876254180602, "grad_norm": 0.7098166942596436, "learning_rate": 0.0001414460511679644, "loss": 2.9606, "step": 1322 }, { "epoch": 17.642140468227424, "grad_norm": 0.6416228413581848, "learning_rate": 0.00014140155728587322, "loss": 3.1056, "step": 1323 }, { "epoch": 17.65551839464883, "grad_norm": 0.7303211092948914, "learning_rate": 0.00014135706340378198, "loss": 2.8604, "step": 1324 }, { "epoch": 17.668896321070235, "grad_norm": 0.64544677734375, "learning_rate": 0.00014131256952169079, "loss": 2.6751, "step": 1325 }, { "epoch": 17.682274247491637, "grad_norm": 0.6870211362838745, "learning_rate": 0.00014126807563959957, "loss": 2.9802, "step": 1326 }, { "epoch": 17.695652173913043, "grad_norm": 0.6570687294006348, "learning_rate": 0.00014122358175750835, "loss": 3.0496, "step": 1327 }, { "epoch": 17.70903010033445, "grad_norm": 0.7057302594184875, "learning_rate": 0.00014117908787541713, "loss": 2.845, "step": 1328 }, { "epoch": 17.722408026755854, "grad_norm": 0.8613574504852295, "learning_rate": 0.00014113459399332591, "loss": 3.0314, "step": 1329 }, { "epoch": 17.735785953177256, "grad_norm": 0.7408957481384277, "learning_rate": 0.00014109010011123472, "loss": 2.8651, "step": 1330 }, { "epoch": 17.74916387959866, "grad_norm": 0.6553664803504944, "learning_rate": 0.00014104560622914348, "loss": 2.9647, "step": 1331 }, { "epoch": 17.762541806020067, "grad_norm": 0.5991332530975342, "learning_rate": 0.0001410011123470523, "loss": 2.9796, "step": 1332 }, { "epoch": 17.775919732441473, "grad_norm": 0.6124044060707092, "learning_rate": 0.00014095661846496107, "loss": 3.0682, "step": 1333 }, { "epoch": 17.789297658862875, "grad_norm": 0.5788628458976746, "learning_rate": 0.00014091212458286985, "loss": 3.0357, "step": 1334 }, { "epoch": 17.80267558528428, "grad_norm": 0.6785842776298523, "learning_rate": 0.00014086763070077864, "loss": 3.1128, "step": 1335 }, { "epoch": 17.816053511705686, "grad_norm": 0.5994388461112976, "learning_rate": 0.00014082313681868745, "loss": 2.9087, "step": 1336 }, { "epoch": 17.82943143812709, "grad_norm": 0.6150069236755371, "learning_rate": 0.00014077864293659623, "loss": 2.9488, "step": 1337 }, { "epoch": 17.842809364548494, "grad_norm": 0.6211126446723938, "learning_rate": 0.000140734149054505, "loss": 2.8006, "step": 1338 }, { "epoch": 17.8561872909699, "grad_norm": 0.6093603372573853, "learning_rate": 0.0001406896551724138, "loss": 3.0013, "step": 1339 }, { "epoch": 17.869565217391305, "grad_norm": 0.6861109137535095, "learning_rate": 0.0001406451612903226, "loss": 2.9423, "step": 1340 }, { "epoch": 17.88294314381271, "grad_norm": 0.6148517727851868, "learning_rate": 0.00014060066740823136, "loss": 2.8471, "step": 1341 }, { "epoch": 17.896321070234112, "grad_norm": 0.8285694718360901, "learning_rate": 0.00014055617352614017, "loss": 3.055, "step": 1342 }, { "epoch": 17.909698996655518, "grad_norm": 0.5955973863601685, "learning_rate": 0.00014051167964404895, "loss": 3.0088, "step": 1343 }, { "epoch": 17.923076923076923, "grad_norm": 0.6020825505256653, "learning_rate": 0.00014046718576195773, "loss": 3.1445, "step": 1344 }, { "epoch": 17.93645484949833, "grad_norm": 0.6115384101867676, "learning_rate": 0.00014042269187986651, "loss": 3.0889, "step": 1345 }, { "epoch": 17.94983277591973, "grad_norm": 0.6469634175300598, "learning_rate": 0.00014037819799777532, "loss": 3.1916, "step": 1346 }, { "epoch": 17.963210702341136, "grad_norm": 0.6653386354446411, "learning_rate": 0.0001403337041156841, "loss": 2.8805, "step": 1347 }, { "epoch": 17.976588628762542, "grad_norm": 0.6167243719100952, "learning_rate": 0.0001402892102335929, "loss": 2.9819, "step": 1348 }, { "epoch": 17.989966555183948, "grad_norm": 0.6281883716583252, "learning_rate": 0.00014024471635150167, "loss": 3.1109, "step": 1349 }, { "epoch": 18.0, "grad_norm": 0.6747295260429382, "learning_rate": 0.00014020022246941048, "loss": 2.8772, "step": 1350 }, { "epoch": 18.013377926421406, "grad_norm": 0.5834116339683533, "learning_rate": 0.00014015572858731924, "loss": 2.9399, "step": 1351 }, { "epoch": 18.02675585284281, "grad_norm": 0.620858371257782, "learning_rate": 0.00014011123470522805, "loss": 2.7351, "step": 1352 }, { "epoch": 18.040133779264213, "grad_norm": 0.5407689809799194, "learning_rate": 0.00014006674082313683, "loss": 2.8329, "step": 1353 }, { "epoch": 18.05351170568562, "grad_norm": 0.6045056581497192, "learning_rate": 0.0001400222469410456, "loss": 2.7382, "step": 1354 }, { "epoch": 18.066889632107024, "grad_norm": 0.5433570146560669, "learning_rate": 0.0001399777530589544, "loss": 2.9154, "step": 1355 }, { "epoch": 18.08026755852843, "grad_norm": 0.6174083352088928, "learning_rate": 0.0001399332591768632, "loss": 2.7438, "step": 1356 }, { "epoch": 18.093645484949832, "grad_norm": 0.6720690727233887, "learning_rate": 0.00013988876529477198, "loss": 3.1324, "step": 1357 }, { "epoch": 18.107023411371237, "grad_norm": 0.648423433303833, "learning_rate": 0.00013984427141268077, "loss": 2.7802, "step": 1358 }, { "epoch": 18.120401337792643, "grad_norm": 0.6625978350639343, "learning_rate": 0.00013979977753058955, "loss": 2.9152, "step": 1359 }, { "epoch": 18.13377926421405, "grad_norm": 0.5362007021903992, "learning_rate": 0.00013975528364849836, "loss": 2.9552, "step": 1360 }, { "epoch": 18.14715719063545, "grad_norm": 0.6275555491447449, "learning_rate": 0.00013971078976640711, "loss": 3.0742, "step": 1361 }, { "epoch": 18.160535117056856, "grad_norm": 0.5755884647369385, "learning_rate": 0.00013966629588431592, "loss": 2.8705, "step": 1362 }, { "epoch": 18.17391304347826, "grad_norm": 0.5092719793319702, "learning_rate": 0.0001396218020022247, "loss": 2.9684, "step": 1363 }, { "epoch": 18.187290969899667, "grad_norm": 0.7400075197219849, "learning_rate": 0.0001395773081201335, "loss": 2.9726, "step": 1364 }, { "epoch": 18.20066889632107, "grad_norm": 0.6478124260902405, "learning_rate": 0.00013953281423804227, "loss": 2.7427, "step": 1365 }, { "epoch": 18.214046822742475, "grad_norm": 0.6313418745994568, "learning_rate": 0.00013948832035595108, "loss": 2.7713, "step": 1366 }, { "epoch": 18.22742474916388, "grad_norm": 0.5571421980857849, "learning_rate": 0.00013944382647385986, "loss": 2.9221, "step": 1367 }, { "epoch": 18.240802675585286, "grad_norm": 0.5346395373344421, "learning_rate": 0.00013939933259176862, "loss": 2.9842, "step": 1368 }, { "epoch": 18.254180602006688, "grad_norm": 0.5828048586845398, "learning_rate": 0.00013935483870967743, "loss": 2.8683, "step": 1369 }, { "epoch": 18.267558528428093, "grad_norm": 0.6446037888526917, "learning_rate": 0.0001393103448275862, "loss": 2.8, "step": 1370 }, { "epoch": 18.2809364548495, "grad_norm": 0.612689197063446, "learning_rate": 0.000139265850945495, "loss": 3.0642, "step": 1371 }, { "epoch": 18.294314381270905, "grad_norm": 0.5511941909790039, "learning_rate": 0.00013922135706340377, "loss": 3.0633, "step": 1372 }, { "epoch": 18.307692307692307, "grad_norm": 0.7538149356842041, "learning_rate": 0.00013917686318131258, "loss": 2.7224, "step": 1373 }, { "epoch": 18.321070234113712, "grad_norm": 0.6194874048233032, "learning_rate": 0.00013913236929922137, "loss": 2.9357, "step": 1374 }, { "epoch": 18.334448160535118, "grad_norm": 0.5770833492279053, "learning_rate": 0.00013908787541713015, "loss": 2.8585, "step": 1375 }, { "epoch": 18.347826086956523, "grad_norm": 0.609080970287323, "learning_rate": 0.00013904338153503893, "loss": 2.7763, "step": 1376 }, { "epoch": 18.361204013377925, "grad_norm": 0.5578462481498718, "learning_rate": 0.00013899888765294774, "loss": 3.0324, "step": 1377 }, { "epoch": 18.37458193979933, "grad_norm": 0.5949610471725464, "learning_rate": 0.0001389543937708565, "loss": 2.7332, "step": 1378 }, { "epoch": 18.387959866220736, "grad_norm": 0.6248785257339478, "learning_rate": 0.0001389098998887653, "loss": 2.7355, "step": 1379 }, { "epoch": 18.401337792642142, "grad_norm": 0.606239378452301, "learning_rate": 0.0001388654060066741, "loss": 2.9883, "step": 1380 }, { "epoch": 18.414715719063544, "grad_norm": 0.6222496628761292, "learning_rate": 0.00013882091212458287, "loss": 2.7384, "step": 1381 }, { "epoch": 18.42809364548495, "grad_norm": 0.6253412365913391, "learning_rate": 0.00013877641824249165, "loss": 2.7555, "step": 1382 }, { "epoch": 18.441471571906355, "grad_norm": 0.6204626560211182, "learning_rate": 0.00013873192436040046, "loss": 2.7279, "step": 1383 }, { "epoch": 18.45484949832776, "grad_norm": 0.7254919409751892, "learning_rate": 0.00013868743047830924, "loss": 2.9053, "step": 1384 }, { "epoch": 18.468227424749163, "grad_norm": 0.6207154393196106, "learning_rate": 0.00013864293659621803, "loss": 3.0648, "step": 1385 }, { "epoch": 18.48160535117057, "grad_norm": 0.6959066390991211, "learning_rate": 0.0001385984427141268, "loss": 2.9641, "step": 1386 }, { "epoch": 18.494983277591974, "grad_norm": 0.6345707774162292, "learning_rate": 0.00013855394883203562, "loss": 2.9452, "step": 1387 }, { "epoch": 18.50836120401338, "grad_norm": 0.5806639790534973, "learning_rate": 0.00013850945494994437, "loss": 2.9375, "step": 1388 }, { "epoch": 18.52173913043478, "grad_norm": 0.6498666405677795, "learning_rate": 0.00013846496106785318, "loss": 2.867, "step": 1389 }, { "epoch": 18.535117056856187, "grad_norm": 0.629264771938324, "learning_rate": 0.00013842046718576197, "loss": 2.822, "step": 1390 }, { "epoch": 18.548494983277592, "grad_norm": 0.6734644174575806, "learning_rate": 0.00013837597330367075, "loss": 2.9065, "step": 1391 }, { "epoch": 18.561872909698998, "grad_norm": 0.5705899000167847, "learning_rate": 0.00013833147942157953, "loss": 2.9909, "step": 1392 }, { "epoch": 18.5752508361204, "grad_norm": 0.6786744594573975, "learning_rate": 0.00013828698553948834, "loss": 3.0667, "step": 1393 }, { "epoch": 18.588628762541806, "grad_norm": 0.6044118404388428, "learning_rate": 0.00013824249165739712, "loss": 2.8345, "step": 1394 }, { "epoch": 18.60200668896321, "grad_norm": 0.5928333401679993, "learning_rate": 0.0001381979977753059, "loss": 2.974, "step": 1395 }, { "epoch": 18.615384615384617, "grad_norm": 0.636883556842804, "learning_rate": 0.0001381535038932147, "loss": 3.1034, "step": 1396 }, { "epoch": 18.62876254180602, "grad_norm": 0.6029159426689148, "learning_rate": 0.0001381090100111235, "loss": 3.0303, "step": 1397 }, { "epoch": 18.642140468227424, "grad_norm": 0.6479122638702393, "learning_rate": 0.00013806451612903225, "loss": 3.1782, "step": 1398 }, { "epoch": 18.65551839464883, "grad_norm": 0.6547753810882568, "learning_rate": 0.00013802002224694106, "loss": 3.0594, "step": 1399 }, { "epoch": 18.668896321070235, "grad_norm": 0.6506614089012146, "learning_rate": 0.00013797552836484984, "loss": 3.0955, "step": 1400 }, { "epoch": 18.682274247491637, "grad_norm": 0.6073411107063293, "learning_rate": 0.00013793103448275863, "loss": 2.7452, "step": 1401 }, { "epoch": 18.695652173913043, "grad_norm": 0.6307429075241089, "learning_rate": 0.0001378865406006674, "loss": 2.8329, "step": 1402 }, { "epoch": 18.70903010033445, "grad_norm": 0.7205286026000977, "learning_rate": 0.00013784204671857622, "loss": 2.8228, "step": 1403 }, { "epoch": 18.722408026755854, "grad_norm": 0.6154866814613342, "learning_rate": 0.000137797552836485, "loss": 2.7081, "step": 1404 }, { "epoch": 18.735785953177256, "grad_norm": 0.548799455165863, "learning_rate": 0.00013775305895439378, "loss": 2.9336, "step": 1405 }, { "epoch": 18.74916387959866, "grad_norm": 0.5723371505737305, "learning_rate": 0.00013770856507230257, "loss": 2.9661, "step": 1406 }, { "epoch": 18.762541806020067, "grad_norm": 0.5917731523513794, "learning_rate": 0.00013766407119021137, "loss": 2.8458, "step": 1407 }, { "epoch": 18.775919732441473, "grad_norm": 0.6160814166069031, "learning_rate": 0.00013761957730812013, "loss": 2.8453, "step": 1408 }, { "epoch": 18.789297658862875, "grad_norm": 0.5995500683784485, "learning_rate": 0.0001375750834260289, "loss": 3.1361, "step": 1409 }, { "epoch": 18.80267558528428, "grad_norm": 0.5574924945831299, "learning_rate": 0.00013753058954393772, "loss": 2.9808, "step": 1410 }, { "epoch": 18.816053511705686, "grad_norm": 0.6444510221481323, "learning_rate": 0.0001374860956618465, "loss": 2.9204, "step": 1411 }, { "epoch": 18.82943143812709, "grad_norm": 0.6153264045715332, "learning_rate": 0.0001374416017797553, "loss": 3.036, "step": 1412 }, { "epoch": 18.842809364548494, "grad_norm": 0.5951060652732849, "learning_rate": 0.00013739710789766407, "loss": 3.1012, "step": 1413 }, { "epoch": 18.8561872909699, "grad_norm": 0.5688861012458801, "learning_rate": 0.00013735261401557288, "loss": 3.1568, "step": 1414 }, { "epoch": 18.869565217391305, "grad_norm": 0.6256094574928284, "learning_rate": 0.00013730812013348163, "loss": 2.7721, "step": 1415 }, { "epoch": 18.88294314381271, "grad_norm": 0.646250307559967, "learning_rate": 0.00013726362625139044, "loss": 3.0091, "step": 1416 }, { "epoch": 18.896321070234112, "grad_norm": 0.6805879473686218, "learning_rate": 0.00013721913236929923, "loss": 2.8397, "step": 1417 }, { "epoch": 18.909698996655518, "grad_norm": 0.6164728999137878, "learning_rate": 0.000137174638487208, "loss": 2.8318, "step": 1418 }, { "epoch": 18.923076923076923, "grad_norm": 0.6298549771308899, "learning_rate": 0.0001371301446051168, "loss": 2.8493, "step": 1419 }, { "epoch": 18.93645484949833, "grad_norm": 0.5760109424591064, "learning_rate": 0.0001370856507230256, "loss": 3.1736, "step": 1420 }, { "epoch": 18.94983277591973, "grad_norm": 0.6126035451889038, "learning_rate": 0.00013704115684093438, "loss": 3.1313, "step": 1421 }, { "epoch": 18.963210702341136, "grad_norm": 0.6092283129692078, "learning_rate": 0.00013699666295884316, "loss": 3.0601, "step": 1422 }, { "epoch": 18.976588628762542, "grad_norm": 0.6506980657577515, "learning_rate": 0.00013695216907675195, "loss": 2.7787, "step": 1423 }, { "epoch": 18.989966555183948, "grad_norm": 0.6060482263565063, "learning_rate": 0.00013690767519466076, "loss": 3.062, "step": 1424 }, { "epoch": 19.0, "grad_norm": 0.7881284952163696, "learning_rate": 0.0001368631813125695, "loss": 2.9882, "step": 1425 }, { "epoch": 19.013377926421406, "grad_norm": 0.5459823608398438, "learning_rate": 0.00013681868743047832, "loss": 2.9312, "step": 1426 }, { "epoch": 19.02675585284281, "grad_norm": 0.71108078956604, "learning_rate": 0.0001367741935483871, "loss": 2.6003, "step": 1427 }, { "epoch": 19.040133779264213, "grad_norm": 0.6824572682380676, "learning_rate": 0.00013672969966629589, "loss": 2.8354, "step": 1428 }, { "epoch": 19.05351170568562, "grad_norm": 0.607400119304657, "learning_rate": 0.00013668520578420467, "loss": 2.95, "step": 1429 }, { "epoch": 19.066889632107024, "grad_norm": 0.5925526022911072, "learning_rate": 0.00013664071190211348, "loss": 2.7101, "step": 1430 }, { "epoch": 19.08026755852843, "grad_norm": 0.6208476424217224, "learning_rate": 0.00013659621802002226, "loss": 2.7759, "step": 1431 }, { "epoch": 19.093645484949832, "grad_norm": 0.6047778725624084, "learning_rate": 0.00013655172413793104, "loss": 3.0377, "step": 1432 }, { "epoch": 19.107023411371237, "grad_norm": 0.5979378819465637, "learning_rate": 0.00013650723025583983, "loss": 2.8671, "step": 1433 }, { "epoch": 19.120401337792643, "grad_norm": 0.6338753700256348, "learning_rate": 0.00013646273637374863, "loss": 2.7869, "step": 1434 }, { "epoch": 19.13377926421405, "grad_norm": 0.6263737678527832, "learning_rate": 0.0001364182424916574, "loss": 2.6591, "step": 1435 }, { "epoch": 19.14715719063545, "grad_norm": 0.646990180015564, "learning_rate": 0.0001363737486095662, "loss": 2.896, "step": 1436 }, { "epoch": 19.160535117056856, "grad_norm": 0.5691138505935669, "learning_rate": 0.00013632925472747498, "loss": 2.8774, "step": 1437 }, { "epoch": 19.17391304347826, "grad_norm": 0.5838844180107117, "learning_rate": 0.00013628476084538376, "loss": 2.9722, "step": 1438 }, { "epoch": 19.187290969899667, "grad_norm": 0.6154463887214661, "learning_rate": 0.00013624026696329255, "loss": 3.0143, "step": 1439 }, { "epoch": 19.20066889632107, "grad_norm": 0.6480549573898315, "learning_rate": 0.00013619577308120136, "loss": 2.8118, "step": 1440 }, { "epoch": 19.214046822742475, "grad_norm": 0.7675592303276062, "learning_rate": 0.00013615127919911014, "loss": 2.865, "step": 1441 }, { "epoch": 19.22742474916388, "grad_norm": 0.7231382131576538, "learning_rate": 0.00013610678531701892, "loss": 2.6942, "step": 1442 }, { "epoch": 19.240802675585286, "grad_norm": 0.6359425187110901, "learning_rate": 0.0001360622914349277, "loss": 2.5976, "step": 1443 }, { "epoch": 19.254180602006688, "grad_norm": 0.6486908793449402, "learning_rate": 0.0001360177975528365, "loss": 2.9494, "step": 1444 }, { "epoch": 19.267558528428093, "grad_norm": 0.5930846929550171, "learning_rate": 0.00013597330367074527, "loss": 3.1295, "step": 1445 }, { "epoch": 19.2809364548495, "grad_norm": 0.6988996267318726, "learning_rate": 0.00013592880978865408, "loss": 2.9627, "step": 1446 }, { "epoch": 19.294314381270905, "grad_norm": 0.5971337556838989, "learning_rate": 0.00013588431590656286, "loss": 2.9636, "step": 1447 }, { "epoch": 19.307692307692307, "grad_norm": 0.6479155421257019, "learning_rate": 0.00013583982202447164, "loss": 2.6575, "step": 1448 }, { "epoch": 19.321070234113712, "grad_norm": 0.6771759986877441, "learning_rate": 0.00013579532814238043, "loss": 3.0667, "step": 1449 }, { "epoch": 19.334448160535118, "grad_norm": 0.5785907506942749, "learning_rate": 0.0001357508342602892, "loss": 2.8839, "step": 1450 }, { "epoch": 19.347826086956523, "grad_norm": 0.6315357089042664, "learning_rate": 0.00013570634037819802, "loss": 2.6925, "step": 1451 }, { "epoch": 19.361204013377925, "grad_norm": 0.6047807931900024, "learning_rate": 0.00013566184649610677, "loss": 2.7948, "step": 1452 }, { "epoch": 19.37458193979933, "grad_norm": 0.6079906225204468, "learning_rate": 0.00013561735261401558, "loss": 3.1798, "step": 1453 }, { "epoch": 19.387959866220736, "grad_norm": 0.5820274353027344, "learning_rate": 0.00013557285873192436, "loss": 3.1159, "step": 1454 }, { "epoch": 19.401337792642142, "grad_norm": 0.562022864818573, "learning_rate": 0.00013552836484983315, "loss": 2.9869, "step": 1455 }, { "epoch": 19.414715719063544, "grad_norm": 0.6663182973861694, "learning_rate": 0.00013548387096774193, "loss": 2.8038, "step": 1456 }, { "epoch": 19.42809364548495, "grad_norm": 0.7092719078063965, "learning_rate": 0.00013543937708565074, "loss": 3.0159, "step": 1457 }, { "epoch": 19.441471571906355, "grad_norm": 0.5990714430809021, "learning_rate": 0.00013539488320355952, "loss": 2.8858, "step": 1458 }, { "epoch": 19.45484949832776, "grad_norm": 0.600847065448761, "learning_rate": 0.0001353503893214683, "loss": 2.9352, "step": 1459 }, { "epoch": 19.468227424749163, "grad_norm": 0.6393849849700928, "learning_rate": 0.00013530589543937709, "loss": 2.6911, "step": 1460 }, { "epoch": 19.48160535117057, "grad_norm": 0.6170421838760376, "learning_rate": 0.0001352614015572859, "loss": 2.749, "step": 1461 }, { "epoch": 19.494983277591974, "grad_norm": 0.53690105676651, "learning_rate": 0.00013521690767519465, "loss": 2.9516, "step": 1462 }, { "epoch": 19.50836120401338, "grad_norm": 0.5976501703262329, "learning_rate": 0.00013517241379310346, "loss": 2.9996, "step": 1463 }, { "epoch": 19.52173913043478, "grad_norm": 0.5892135500907898, "learning_rate": 0.00013512791991101224, "loss": 3.2039, "step": 1464 }, { "epoch": 19.535117056856187, "grad_norm": 0.65968918800354, "learning_rate": 0.00013508342602892102, "loss": 2.5897, "step": 1465 }, { "epoch": 19.548494983277592, "grad_norm": 0.591454267501831, "learning_rate": 0.0001350389321468298, "loss": 3.0104, "step": 1466 }, { "epoch": 19.561872909698998, "grad_norm": 0.6272184252738953, "learning_rate": 0.00013499443826473862, "loss": 2.7266, "step": 1467 }, { "epoch": 19.5752508361204, "grad_norm": 0.6142420172691345, "learning_rate": 0.0001349499443826474, "loss": 2.8358, "step": 1468 }, { "epoch": 19.588628762541806, "grad_norm": 0.6268441677093506, "learning_rate": 0.00013490545050055618, "loss": 3.0196, "step": 1469 }, { "epoch": 19.60200668896321, "grad_norm": 0.6512436866760254, "learning_rate": 0.00013486095661846496, "loss": 2.9558, "step": 1470 }, { "epoch": 19.615384615384617, "grad_norm": 0.5983771681785583, "learning_rate": 0.00013481646273637377, "loss": 2.9958, "step": 1471 }, { "epoch": 19.62876254180602, "grad_norm": 0.6994190216064453, "learning_rate": 0.00013477196885428253, "loss": 2.8019, "step": 1472 }, { "epoch": 19.642140468227424, "grad_norm": 0.5878567695617676, "learning_rate": 0.00013472747497219134, "loss": 2.7007, "step": 1473 }, { "epoch": 19.65551839464883, "grad_norm": 0.6140199303627014, "learning_rate": 0.00013468298109010012, "loss": 2.9212, "step": 1474 }, { "epoch": 19.668896321070235, "grad_norm": 0.648714542388916, "learning_rate": 0.0001346384872080089, "loss": 3.0053, "step": 1475 }, { "epoch": 19.682274247491637, "grad_norm": 0.5991750359535217, "learning_rate": 0.00013459399332591769, "loss": 2.9129, "step": 1476 }, { "epoch": 19.695652173913043, "grad_norm": 0.5538223385810852, "learning_rate": 0.0001345494994438265, "loss": 2.9097, "step": 1477 }, { "epoch": 19.70903010033445, "grad_norm": 0.5864409804344177, "learning_rate": 0.00013450500556173528, "loss": 2.9348, "step": 1478 }, { "epoch": 19.722408026755854, "grad_norm": 0.6004533767700195, "learning_rate": 0.00013446051167964406, "loss": 2.8845, "step": 1479 }, { "epoch": 19.735785953177256, "grad_norm": 0.6316581964492798, "learning_rate": 0.00013441601779755284, "loss": 2.8619, "step": 1480 }, { "epoch": 19.74916387959866, "grad_norm": 0.593138575553894, "learning_rate": 0.00013437152391546165, "loss": 3.041, "step": 1481 }, { "epoch": 19.762541806020067, "grad_norm": 0.5826678276062012, "learning_rate": 0.0001343270300333704, "loss": 2.7432, "step": 1482 }, { "epoch": 19.775919732441473, "grad_norm": 0.6341697573661804, "learning_rate": 0.00013428253615127922, "loss": 2.9755, "step": 1483 }, { "epoch": 19.789297658862875, "grad_norm": 0.5894901156425476, "learning_rate": 0.000134238042269188, "loss": 2.9754, "step": 1484 }, { "epoch": 19.80267558528428, "grad_norm": 0.5840655565261841, "learning_rate": 0.00013419354838709678, "loss": 2.9959, "step": 1485 }, { "epoch": 19.816053511705686, "grad_norm": 0.6006319522857666, "learning_rate": 0.00013414905450500556, "loss": 3.0196, "step": 1486 }, { "epoch": 19.82943143812709, "grad_norm": 0.5647453665733337, "learning_rate": 0.00013410456062291437, "loss": 2.9662, "step": 1487 }, { "epoch": 19.842809364548494, "grad_norm": 0.6583006978034973, "learning_rate": 0.00013406006674082316, "loss": 2.8719, "step": 1488 }, { "epoch": 19.8561872909699, "grad_norm": 0.6041131615638733, "learning_rate": 0.00013401557285873194, "loss": 3.0019, "step": 1489 }, { "epoch": 19.869565217391305, "grad_norm": 0.5524600148200989, "learning_rate": 0.00013397107897664072, "loss": 2.7409, "step": 1490 }, { "epoch": 19.88294314381271, "grad_norm": 0.6532869338989258, "learning_rate": 0.0001339265850945495, "loss": 2.9336, "step": 1491 }, { "epoch": 19.896321070234112, "grad_norm": 0.6459875106811523, "learning_rate": 0.00013388209121245828, "loss": 2.9572, "step": 1492 }, { "epoch": 19.909698996655518, "grad_norm": 0.6051417589187622, "learning_rate": 0.00013383759733036707, "loss": 2.864, "step": 1493 }, { "epoch": 19.923076923076923, "grad_norm": 0.6565695405006409, "learning_rate": 0.00013379310344827588, "loss": 2.865, "step": 1494 }, { "epoch": 19.93645484949833, "grad_norm": 0.6118014454841614, "learning_rate": 0.00013374860956618466, "loss": 2.7594, "step": 1495 }, { "epoch": 19.94983277591973, "grad_norm": 0.6801209449768066, "learning_rate": 0.00013370411568409344, "loss": 2.7532, "step": 1496 }, { "epoch": 19.963210702341136, "grad_norm": 0.5785267353057861, "learning_rate": 0.00013365962180200222, "loss": 3.0618, "step": 1497 }, { "epoch": 19.976588628762542, "grad_norm": 0.6344903707504272, "learning_rate": 0.00013361512791991103, "loss": 2.7908, "step": 1498 }, { "epoch": 19.989966555183948, "grad_norm": 0.6073011159896851, "learning_rate": 0.0001335706340378198, "loss": 3.0699, "step": 1499 }, { "epoch": 20.0, "grad_norm": 0.6989748477935791, "learning_rate": 0.0001335261401557286, "loss": 2.7932, "step": 1500 }, { "epoch": 20.013377926421406, "grad_norm": 0.5897710919380188, "learning_rate": 0.00013348164627363738, "loss": 3.0379, "step": 1501 }, { "epoch": 20.02675585284281, "grad_norm": 0.5845353603363037, "learning_rate": 0.00013343715239154616, "loss": 2.7123, "step": 1502 }, { "epoch": 20.040133779264213, "grad_norm": 0.5720913410186768, "learning_rate": 0.00013339265850945495, "loss": 3.0196, "step": 1503 }, { "epoch": 20.05351170568562, "grad_norm": 0.7616726756095886, "learning_rate": 0.00013334816462736375, "loss": 2.6719, "step": 1504 }, { "epoch": 20.066889632107024, "grad_norm": 0.6211048364639282, "learning_rate": 0.00013330367074527254, "loss": 2.8263, "step": 1505 }, { "epoch": 20.08026755852843, "grad_norm": 0.6016023755073547, "learning_rate": 0.00013325917686318132, "loss": 2.7226, "step": 1506 }, { "epoch": 20.093645484949832, "grad_norm": 0.6265879273414612, "learning_rate": 0.0001332146829810901, "loss": 2.8597, "step": 1507 }, { "epoch": 20.107023411371237, "grad_norm": 0.6129719614982605, "learning_rate": 0.0001331701890989989, "loss": 2.9449, "step": 1508 }, { "epoch": 20.120401337792643, "grad_norm": 0.6351513266563416, "learning_rate": 0.00013312569521690767, "loss": 2.9768, "step": 1509 }, { "epoch": 20.13377926421405, "grad_norm": 0.5772795677185059, "learning_rate": 0.00013308120133481648, "loss": 3.0424, "step": 1510 }, { "epoch": 20.14715719063545, "grad_norm": 0.600697934627533, "learning_rate": 0.00013303670745272526, "loss": 2.793, "step": 1511 }, { "epoch": 20.160535117056856, "grad_norm": 0.5418747663497925, "learning_rate": 0.00013299221357063404, "loss": 2.8706, "step": 1512 }, { "epoch": 20.17391304347826, "grad_norm": 0.5500038862228394, "learning_rate": 0.00013294771968854282, "loss": 2.7352, "step": 1513 }, { "epoch": 20.187290969899667, "grad_norm": 0.5478414297103882, "learning_rate": 0.00013290322580645163, "loss": 2.9113, "step": 1514 }, { "epoch": 20.20066889632107, "grad_norm": 0.5382576584815979, "learning_rate": 0.00013285873192436042, "loss": 2.8276, "step": 1515 }, { "epoch": 20.214046822742475, "grad_norm": 0.6003616452217102, "learning_rate": 0.0001328142380422692, "loss": 2.9112, "step": 1516 }, { "epoch": 20.22742474916388, "grad_norm": 0.6674323678016663, "learning_rate": 0.00013276974416017798, "loss": 2.7382, "step": 1517 }, { "epoch": 20.240802675585286, "grad_norm": 0.591314435005188, "learning_rate": 0.0001327252502780868, "loss": 2.8003, "step": 1518 }, { "epoch": 20.254180602006688, "grad_norm": 0.5268637537956238, "learning_rate": 0.00013268075639599554, "loss": 2.7134, "step": 1519 }, { "epoch": 20.267558528428093, "grad_norm": 0.5217694640159607, "learning_rate": 0.00013263626251390435, "loss": 2.6157, "step": 1520 }, { "epoch": 20.2809364548495, "grad_norm": 0.6171165704727173, "learning_rate": 0.00013259176863181314, "loss": 2.8738, "step": 1521 }, { "epoch": 20.294314381270905, "grad_norm": 0.5457054972648621, "learning_rate": 0.00013254727474972192, "loss": 2.6106, "step": 1522 }, { "epoch": 20.307692307692307, "grad_norm": 0.6596150994300842, "learning_rate": 0.0001325027808676307, "loss": 2.9974, "step": 1523 }, { "epoch": 20.321070234113712, "grad_norm": 0.7236288785934448, "learning_rate": 0.0001324582869855395, "loss": 2.8669, "step": 1524 }, { "epoch": 20.334448160535118, "grad_norm": 0.6390851736068726, "learning_rate": 0.0001324137931034483, "loss": 2.909, "step": 1525 }, { "epoch": 20.347826086956523, "grad_norm": 0.6010439991950989, "learning_rate": 0.00013236929922135708, "loss": 2.9567, "step": 1526 }, { "epoch": 20.361204013377925, "grad_norm": 0.5825399160385132, "learning_rate": 0.00013232480533926586, "loss": 2.7688, "step": 1527 }, { "epoch": 20.37458193979933, "grad_norm": 0.6000121831893921, "learning_rate": 0.00013228031145717467, "loss": 3.0515, "step": 1528 }, { "epoch": 20.387959866220736, "grad_norm": 0.5775492787361145, "learning_rate": 0.00013223581757508342, "loss": 2.8958, "step": 1529 }, { "epoch": 20.401337792642142, "grad_norm": 0.6193161010742188, "learning_rate": 0.00013219132369299223, "loss": 2.9648, "step": 1530 }, { "epoch": 20.414715719063544, "grad_norm": 0.7751132249832153, "learning_rate": 0.00013214682981090101, "loss": 2.677, "step": 1531 }, { "epoch": 20.42809364548495, "grad_norm": 0.6269053220748901, "learning_rate": 0.0001321023359288098, "loss": 2.9483, "step": 1532 }, { "epoch": 20.441471571906355, "grad_norm": 0.5793836116790771, "learning_rate": 0.00013205784204671858, "loss": 2.8025, "step": 1533 }, { "epoch": 20.45484949832776, "grad_norm": 0.5609649419784546, "learning_rate": 0.00013201334816462736, "loss": 2.8104, "step": 1534 }, { "epoch": 20.468227424749163, "grad_norm": 0.6055050492286682, "learning_rate": 0.00013196885428253617, "loss": 2.8649, "step": 1535 }, { "epoch": 20.48160535117057, "grad_norm": 0.5952653884887695, "learning_rate": 0.00013192436040044493, "loss": 2.6815, "step": 1536 }, { "epoch": 20.494983277591974, "grad_norm": 0.5716878175735474, "learning_rate": 0.00013187986651835374, "loss": 2.637, "step": 1537 }, { "epoch": 20.50836120401338, "grad_norm": 0.5684781074523926, "learning_rate": 0.00013183537263626252, "loss": 2.8601, "step": 1538 }, { "epoch": 20.52173913043478, "grad_norm": 0.5603588223457336, "learning_rate": 0.0001317908787541713, "loss": 2.9173, "step": 1539 }, { "epoch": 20.535117056856187, "grad_norm": 0.6022379398345947, "learning_rate": 0.00013174638487208008, "loss": 2.873, "step": 1540 }, { "epoch": 20.548494983277592, "grad_norm": 0.6124081611633301, "learning_rate": 0.0001317018909899889, "loss": 3.05, "step": 1541 }, { "epoch": 20.561872909698998, "grad_norm": 0.5928300023078918, "learning_rate": 0.00013165739710789768, "loss": 2.7411, "step": 1542 }, { "epoch": 20.5752508361204, "grad_norm": 0.5931242108345032, "learning_rate": 0.00013161290322580646, "loss": 3.0032, "step": 1543 }, { "epoch": 20.588628762541806, "grad_norm": 0.7123291492462158, "learning_rate": 0.00013156840934371524, "loss": 2.6485, "step": 1544 }, { "epoch": 20.60200668896321, "grad_norm": 0.565980851650238, "learning_rate": 0.00013152391546162405, "loss": 2.7143, "step": 1545 }, { "epoch": 20.615384615384617, "grad_norm": 0.6504361033439636, "learning_rate": 0.0001314794215795328, "loss": 2.6285, "step": 1546 }, { "epoch": 20.62876254180602, "grad_norm": 0.6558433175086975, "learning_rate": 0.00013143492769744161, "loss": 3.0403, "step": 1547 }, { "epoch": 20.642140468227424, "grad_norm": 0.6159015893936157, "learning_rate": 0.0001313904338153504, "loss": 2.8836, "step": 1548 }, { "epoch": 20.65551839464883, "grad_norm": 0.6061471104621887, "learning_rate": 0.00013134593993325918, "loss": 2.9076, "step": 1549 }, { "epoch": 20.668896321070235, "grad_norm": 0.6891756057739258, "learning_rate": 0.00013130144605116796, "loss": 3.0326, "step": 1550 }, { "epoch": 20.682274247491637, "grad_norm": 0.5920016765594482, "learning_rate": 0.00013125695216907677, "loss": 2.949, "step": 1551 }, { "epoch": 20.695652173913043, "grad_norm": 0.7947055697441101, "learning_rate": 0.00013121245828698555, "loss": 2.7796, "step": 1552 }, { "epoch": 20.70903010033445, "grad_norm": 0.6409865021705627, "learning_rate": 0.00013116796440489434, "loss": 2.9103, "step": 1553 }, { "epoch": 20.722408026755854, "grad_norm": 0.5864197611808777, "learning_rate": 0.00013112347052280312, "loss": 3.0493, "step": 1554 }, { "epoch": 20.735785953177256, "grad_norm": 0.6567059755325317, "learning_rate": 0.00013107897664071193, "loss": 2.8359, "step": 1555 }, { "epoch": 20.74916387959866, "grad_norm": 0.5808839201927185, "learning_rate": 0.00013103448275862068, "loss": 2.9595, "step": 1556 }, { "epoch": 20.762541806020067, "grad_norm": 0.6176792979240417, "learning_rate": 0.0001309899888765295, "loss": 2.6975, "step": 1557 }, { "epoch": 20.775919732441473, "grad_norm": 0.5763049721717834, "learning_rate": 0.00013094549499443827, "loss": 2.9752, "step": 1558 }, { "epoch": 20.789297658862875, "grad_norm": 0.5877807140350342, "learning_rate": 0.00013090100111234706, "loss": 2.7214, "step": 1559 }, { "epoch": 20.80267558528428, "grad_norm": 0.6624999642372131, "learning_rate": 0.00013085650723025584, "loss": 2.8642, "step": 1560 }, { "epoch": 20.816053511705686, "grad_norm": 0.7411154508590698, "learning_rate": 0.00013081201334816465, "loss": 2.7526, "step": 1561 }, { "epoch": 20.82943143812709, "grad_norm": 0.6246466636657715, "learning_rate": 0.00013076751946607343, "loss": 3.0441, "step": 1562 }, { "epoch": 20.842809364548494, "grad_norm": 0.5515304803848267, "learning_rate": 0.00013072302558398221, "loss": 2.7211, "step": 1563 }, { "epoch": 20.8561872909699, "grad_norm": 0.5537006258964539, "learning_rate": 0.000130678531701891, "loss": 2.9619, "step": 1564 }, { "epoch": 20.869565217391305, "grad_norm": 0.6326210498809814, "learning_rate": 0.0001306340378197998, "loss": 3.2433, "step": 1565 }, { "epoch": 20.88294314381271, "grad_norm": 0.6463941335678101, "learning_rate": 0.00013058954393770856, "loss": 2.7263, "step": 1566 }, { "epoch": 20.896321070234112, "grad_norm": 0.6212389469146729, "learning_rate": 0.00013054505005561737, "loss": 2.9775, "step": 1567 }, { "epoch": 20.909698996655518, "grad_norm": 0.5878139734268188, "learning_rate": 0.00013050055617352615, "loss": 2.7427, "step": 1568 }, { "epoch": 20.923076923076923, "grad_norm": 0.5821187496185303, "learning_rate": 0.00013045606229143494, "loss": 2.9058, "step": 1569 }, { "epoch": 20.93645484949833, "grad_norm": 0.6192734837532043, "learning_rate": 0.00013041156840934372, "loss": 2.9744, "step": 1570 }, { "epoch": 20.94983277591973, "grad_norm": 0.5540494322776794, "learning_rate": 0.00013036707452725253, "loss": 2.8299, "step": 1571 }, { "epoch": 20.963210702341136, "grad_norm": 0.6620298624038696, "learning_rate": 0.0001303225806451613, "loss": 2.8291, "step": 1572 }, { "epoch": 20.976588628762542, "grad_norm": 0.7161823511123657, "learning_rate": 0.0001302780867630701, "loss": 2.9829, "step": 1573 }, { "epoch": 20.989966555183948, "grad_norm": 1.1449205875396729, "learning_rate": 0.00013023359288097887, "loss": 2.8807, "step": 1574 }, { "epoch": 21.0, "grad_norm": 0.6721773743629456, "learning_rate": 0.00013018909899888766, "loss": 2.8398, "step": 1575 }, { "epoch": 21.013377926421406, "grad_norm": 0.5506662130355835, "learning_rate": 0.00013014460511679644, "loss": 2.6858, "step": 1576 }, { "epoch": 21.02675585284281, "grad_norm": 0.5651558041572571, "learning_rate": 0.00013010011123470522, "loss": 2.8692, "step": 1577 }, { "epoch": 21.040133779264213, "grad_norm": 0.5379538536071777, "learning_rate": 0.00013005561735261403, "loss": 2.8221, "step": 1578 }, { "epoch": 21.05351170568562, "grad_norm": 0.5756275653839111, "learning_rate": 0.0001300111234705228, "loss": 2.5469, "step": 1579 }, { "epoch": 21.066889632107024, "grad_norm": 0.672498881816864, "learning_rate": 0.0001299666295884316, "loss": 2.825, "step": 1580 }, { "epoch": 21.08026755852843, "grad_norm": 0.5217440128326416, "learning_rate": 0.00012992213570634038, "loss": 2.732, "step": 1581 }, { "epoch": 21.093645484949832, "grad_norm": 0.6326786875724792, "learning_rate": 0.0001298776418242492, "loss": 2.6537, "step": 1582 }, { "epoch": 21.107023411371237, "grad_norm": 0.6899091005325317, "learning_rate": 0.00012983314794215794, "loss": 2.7133, "step": 1583 }, { "epoch": 21.120401337792643, "grad_norm": 0.604834794998169, "learning_rate": 0.00012978865406006675, "loss": 2.9341, "step": 1584 }, { "epoch": 21.13377926421405, "grad_norm": 0.7540157437324524, "learning_rate": 0.00012974416017797553, "loss": 2.8224, "step": 1585 }, { "epoch": 21.14715719063545, "grad_norm": 0.6161774396896362, "learning_rate": 0.00012969966629588432, "loss": 2.9968, "step": 1586 }, { "epoch": 21.160535117056856, "grad_norm": 0.618781328201294, "learning_rate": 0.0001296551724137931, "loss": 2.8074, "step": 1587 }, { "epoch": 21.17391304347826, "grad_norm": 0.6708270907402039, "learning_rate": 0.0001296106785317019, "loss": 2.9327, "step": 1588 }, { "epoch": 21.187290969899667, "grad_norm": 0.6362271904945374, "learning_rate": 0.0001295661846496107, "loss": 2.46, "step": 1589 }, { "epoch": 21.20066889632107, "grad_norm": 0.6039800047874451, "learning_rate": 0.00012952169076751947, "loss": 2.7728, "step": 1590 }, { "epoch": 21.214046822742475, "grad_norm": 0.6119550466537476, "learning_rate": 0.00012947719688542826, "loss": 2.7123, "step": 1591 }, { "epoch": 21.22742474916388, "grad_norm": 0.8122309446334839, "learning_rate": 0.00012943270300333707, "loss": 2.7424, "step": 1592 }, { "epoch": 21.240802675585286, "grad_norm": 0.5658085346221924, "learning_rate": 0.00012938820912124582, "loss": 2.4553, "step": 1593 }, { "epoch": 21.254180602006688, "grad_norm": 0.7365813851356506, "learning_rate": 0.00012934371523915463, "loss": 2.9437, "step": 1594 }, { "epoch": 21.267558528428093, "grad_norm": 0.5746303796768188, "learning_rate": 0.0001292992213570634, "loss": 2.8262, "step": 1595 }, { "epoch": 21.2809364548495, "grad_norm": 0.5708132386207581, "learning_rate": 0.0001292547274749722, "loss": 2.9298, "step": 1596 }, { "epoch": 21.294314381270905, "grad_norm": 0.6003934741020203, "learning_rate": 0.00012921023359288098, "loss": 2.7943, "step": 1597 }, { "epoch": 21.307692307692307, "grad_norm": 0.5722339749336243, "learning_rate": 0.0001291657397107898, "loss": 3.0547, "step": 1598 }, { "epoch": 21.321070234113712, "grad_norm": 0.6287402510643005, "learning_rate": 0.00012912124582869857, "loss": 2.8814, "step": 1599 }, { "epoch": 21.334448160535118, "grad_norm": 0.6594595313072205, "learning_rate": 0.00012907675194660735, "loss": 2.7927, "step": 1600 }, { "epoch": 21.347826086956523, "grad_norm": 0.6678686141967773, "learning_rate": 0.00012903225806451613, "loss": 2.7981, "step": 1601 }, { "epoch": 21.361204013377925, "grad_norm": 0.6103408336639404, "learning_rate": 0.00012898776418242494, "loss": 2.9009, "step": 1602 }, { "epoch": 21.37458193979933, "grad_norm": 0.5791922807693481, "learning_rate": 0.0001289432703003337, "loss": 2.6689, "step": 1603 }, { "epoch": 21.387959866220736, "grad_norm": 0.5776186585426331, "learning_rate": 0.0001288987764182425, "loss": 2.903, "step": 1604 }, { "epoch": 21.401337792642142, "grad_norm": 0.5746363401412964, "learning_rate": 0.0001288542825361513, "loss": 2.832, "step": 1605 }, { "epoch": 21.414715719063544, "grad_norm": 0.5495322346687317, "learning_rate": 0.00012880978865406007, "loss": 2.7305, "step": 1606 }, { "epoch": 21.42809364548495, "grad_norm": 0.6944610476493835, "learning_rate": 0.00012876529477196886, "loss": 2.7534, "step": 1607 }, { "epoch": 21.441471571906355, "grad_norm": 0.6640026569366455, "learning_rate": 0.00012872080088987767, "loss": 2.814, "step": 1608 }, { "epoch": 21.45484949832776, "grad_norm": 0.7469935417175293, "learning_rate": 0.00012867630700778642, "loss": 2.8523, "step": 1609 }, { "epoch": 21.468227424749163, "grad_norm": 0.600739061832428, "learning_rate": 0.00012863181312569523, "loss": 3.0074, "step": 1610 }, { "epoch": 21.48160535117057, "grad_norm": 0.6701086759567261, "learning_rate": 0.000128587319243604, "loss": 2.6693, "step": 1611 }, { "epoch": 21.494983277591974, "grad_norm": 0.6365100145339966, "learning_rate": 0.00012854282536151282, "loss": 2.9558, "step": 1612 }, { "epoch": 21.50836120401338, "grad_norm": 0.6170171499252319, "learning_rate": 0.00012849833147942158, "loss": 2.605, "step": 1613 }, { "epoch": 21.52173913043478, "grad_norm": 0.5899895429611206, "learning_rate": 0.0001284538375973304, "loss": 2.7816, "step": 1614 }, { "epoch": 21.535117056856187, "grad_norm": 0.6815734505653381, "learning_rate": 0.00012840934371523917, "loss": 2.6663, "step": 1615 }, { "epoch": 21.548494983277592, "grad_norm": 0.6620442271232605, "learning_rate": 0.00012836484983314792, "loss": 2.9333, "step": 1616 }, { "epoch": 21.561872909698998, "grad_norm": 0.6552561521530151, "learning_rate": 0.00012832035595105673, "loss": 2.6833, "step": 1617 }, { "epoch": 21.5752508361204, "grad_norm": 0.6288586854934692, "learning_rate": 0.00012827586206896552, "loss": 2.959, "step": 1618 }, { "epoch": 21.588628762541806, "grad_norm": 0.6969814300537109, "learning_rate": 0.0001282313681868743, "loss": 2.7129, "step": 1619 }, { "epoch": 21.60200668896321, "grad_norm": 0.5918965339660645, "learning_rate": 0.00012818687430478308, "loss": 2.9057, "step": 1620 }, { "epoch": 21.615384615384617, "grad_norm": 0.5739157199859619, "learning_rate": 0.0001281423804226919, "loss": 2.9275, "step": 1621 }, { "epoch": 21.62876254180602, "grad_norm": 0.6649355888366699, "learning_rate": 0.00012809788654060067, "loss": 2.7629, "step": 1622 }, { "epoch": 21.642140468227424, "grad_norm": 0.6449686884880066, "learning_rate": 0.00012805339265850946, "loss": 2.9173, "step": 1623 }, { "epoch": 21.65551839464883, "grad_norm": 0.5940249562263489, "learning_rate": 0.00012800889877641824, "loss": 2.875, "step": 1624 }, { "epoch": 21.668896321070235, "grad_norm": 0.8104184865951538, "learning_rate": 0.00012796440489432705, "loss": 2.7381, "step": 1625 }, { "epoch": 21.682274247491637, "grad_norm": 0.6892338395118713, "learning_rate": 0.0001279199110122358, "loss": 2.7516, "step": 1626 }, { "epoch": 21.695652173913043, "grad_norm": 0.5888476967811584, "learning_rate": 0.0001278754171301446, "loss": 2.9234, "step": 1627 }, { "epoch": 21.70903010033445, "grad_norm": 0.6148846745491028, "learning_rate": 0.0001278309232480534, "loss": 3.0824, "step": 1628 }, { "epoch": 21.722408026755854, "grad_norm": 0.6559250354766846, "learning_rate": 0.00012778642936596218, "loss": 3.0316, "step": 1629 }, { "epoch": 21.735785953177256, "grad_norm": 0.5679346919059753, "learning_rate": 0.00012774193548387096, "loss": 3.0012, "step": 1630 }, { "epoch": 21.74916387959866, "grad_norm": 0.6776624917984009, "learning_rate": 0.00012769744160177977, "loss": 2.8331, "step": 1631 }, { "epoch": 21.762541806020067, "grad_norm": 0.6982457637786865, "learning_rate": 0.00012765294771968855, "loss": 2.8175, "step": 1632 }, { "epoch": 21.775919732441473, "grad_norm": 0.6834619045257568, "learning_rate": 0.00012760845383759733, "loss": 3.1136, "step": 1633 }, { "epoch": 21.789297658862875, "grad_norm": 0.6460130214691162, "learning_rate": 0.00012756395995550612, "loss": 2.7737, "step": 1634 }, { "epoch": 21.80267558528428, "grad_norm": 0.6818533539772034, "learning_rate": 0.00012751946607341493, "loss": 2.5013, "step": 1635 }, { "epoch": 21.816053511705686, "grad_norm": 0.7875826954841614, "learning_rate": 0.00012747497219132368, "loss": 2.9702, "step": 1636 }, { "epoch": 21.82943143812709, "grad_norm": 0.5845924019813538, "learning_rate": 0.0001274304783092325, "loss": 2.8592, "step": 1637 }, { "epoch": 21.842809364548494, "grad_norm": 0.6136084794998169, "learning_rate": 0.00012738598442714127, "loss": 2.8456, "step": 1638 }, { "epoch": 21.8561872909699, "grad_norm": 0.7345831394195557, "learning_rate": 0.00012734149054505005, "loss": 2.9846, "step": 1639 }, { "epoch": 21.869565217391305, "grad_norm": 0.5709069967269897, "learning_rate": 0.00012729699666295884, "loss": 2.9736, "step": 1640 }, { "epoch": 21.88294314381271, "grad_norm": 0.7338367104530334, "learning_rate": 0.00012725250278086765, "loss": 2.8215, "step": 1641 }, { "epoch": 21.896321070234112, "grad_norm": 0.6029645800590515, "learning_rate": 0.00012720800889877643, "loss": 2.7419, "step": 1642 }, { "epoch": 21.909698996655518, "grad_norm": 0.7492797374725342, "learning_rate": 0.0001271635150166852, "loss": 2.8256, "step": 1643 }, { "epoch": 21.923076923076923, "grad_norm": 0.6877094507217407, "learning_rate": 0.000127119021134594, "loss": 2.8241, "step": 1644 }, { "epoch": 21.93645484949833, "grad_norm": 0.6263371109962463, "learning_rate": 0.0001270745272525028, "loss": 2.8459, "step": 1645 }, { "epoch": 21.94983277591973, "grad_norm": 0.5821470618247986, "learning_rate": 0.00012703003337041156, "loss": 3.0513, "step": 1646 }, { "epoch": 21.963210702341136, "grad_norm": 0.5494033098220825, "learning_rate": 0.00012698553948832037, "loss": 3.0951, "step": 1647 }, { "epoch": 21.976588628762542, "grad_norm": 0.5838212370872498, "learning_rate": 0.00012694104560622915, "loss": 3.0089, "step": 1648 }, { "epoch": 21.989966555183948, "grad_norm": 0.7115112543106079, "learning_rate": 0.00012689655172413793, "loss": 2.9137, "step": 1649 }, { "epoch": 22.0, "grad_norm": 0.7208350896835327, "learning_rate": 0.00012685205784204672, "loss": 2.8793, "step": 1650 }, { "epoch": 22.013377926421406, "grad_norm": 0.912466824054718, "learning_rate": 0.00012680756395995552, "loss": 2.7563, "step": 1651 }, { "epoch": 22.02675585284281, "grad_norm": 0.7778460383415222, "learning_rate": 0.0001267630700778643, "loss": 2.7859, "step": 1652 }, { "epoch": 22.040133779264213, "grad_norm": 0.6658245921134949, "learning_rate": 0.0001267185761957731, "loss": 2.8353, "step": 1653 }, { "epoch": 22.05351170568562, "grad_norm": 0.5711331963539124, "learning_rate": 0.00012667408231368187, "loss": 2.7731, "step": 1654 }, { "epoch": 22.066889632107024, "grad_norm": 0.6227294206619263, "learning_rate": 0.00012662958843159068, "loss": 2.6841, "step": 1655 }, { "epoch": 22.08026755852843, "grad_norm": 0.6221343874931335, "learning_rate": 0.00012658509454949944, "loss": 2.6303, "step": 1656 }, { "epoch": 22.093645484949832, "grad_norm": 0.6461536288261414, "learning_rate": 0.00012654060066740822, "loss": 2.7591, "step": 1657 }, { "epoch": 22.107023411371237, "grad_norm": 0.9253934025764465, "learning_rate": 0.00012649610678531703, "loss": 2.7619, "step": 1658 }, { "epoch": 22.120401337792643, "grad_norm": 1.0310887098312378, "learning_rate": 0.0001264516129032258, "loss": 2.7614, "step": 1659 }, { "epoch": 22.13377926421405, "grad_norm": 0.682077944278717, "learning_rate": 0.0001264071190211346, "loss": 2.9013, "step": 1660 }, { "epoch": 22.14715719063545, "grad_norm": 0.6438060998916626, "learning_rate": 0.00012636262513904338, "loss": 2.8715, "step": 1661 }, { "epoch": 22.160535117056856, "grad_norm": 0.5626085996627808, "learning_rate": 0.00012631813125695219, "loss": 2.7459, "step": 1662 }, { "epoch": 22.17391304347826, "grad_norm": 0.6211214065551758, "learning_rate": 0.00012627363737486094, "loss": 2.9223, "step": 1663 }, { "epoch": 22.187290969899667, "grad_norm": 0.5843247175216675, "learning_rate": 0.00012622914349276975, "loss": 3.0176, "step": 1664 }, { "epoch": 22.20066889632107, "grad_norm": 0.6780831813812256, "learning_rate": 0.00012618464961067853, "loss": 3.0641, "step": 1665 }, { "epoch": 22.214046822742475, "grad_norm": 0.6507664918899536, "learning_rate": 0.00012614015572858731, "loss": 2.9043, "step": 1666 }, { "epoch": 22.22742474916388, "grad_norm": 0.6472734212875366, "learning_rate": 0.0001260956618464961, "loss": 2.678, "step": 1667 }, { "epoch": 22.240802675585286, "grad_norm": 0.5920576453208923, "learning_rate": 0.0001260511679644049, "loss": 2.9801, "step": 1668 }, { "epoch": 22.254180602006688, "grad_norm": 0.5785727500915527, "learning_rate": 0.0001260066740823137, "loss": 2.6949, "step": 1669 }, { "epoch": 22.267558528428093, "grad_norm": 0.8260320425033569, "learning_rate": 0.00012596218020022247, "loss": 2.939, "step": 1670 }, { "epoch": 22.2809364548495, "grad_norm": 0.6697145700454712, "learning_rate": 0.00012591768631813125, "loss": 2.6997, "step": 1671 }, { "epoch": 22.294314381270905, "grad_norm": 0.6255594491958618, "learning_rate": 0.00012587319243604006, "loss": 2.7875, "step": 1672 }, { "epoch": 22.307692307692307, "grad_norm": 0.707083523273468, "learning_rate": 0.00012582869855394882, "loss": 2.8843, "step": 1673 }, { "epoch": 22.321070234113712, "grad_norm": 0.5888648629188538, "learning_rate": 0.00012578420467185763, "loss": 2.8862, "step": 1674 }, { "epoch": 22.334448160535118, "grad_norm": 0.6249898672103882, "learning_rate": 0.0001257397107897664, "loss": 2.6108, "step": 1675 }, { "epoch": 22.347826086956523, "grad_norm": 0.6932422518730164, "learning_rate": 0.0001256952169076752, "loss": 2.8393, "step": 1676 }, { "epoch": 22.361204013377925, "grad_norm": 0.6037236452102661, "learning_rate": 0.00012565072302558398, "loss": 2.6517, "step": 1677 }, { "epoch": 22.37458193979933, "grad_norm": 0.5549351572990417, "learning_rate": 0.00012560622914349278, "loss": 2.7725, "step": 1678 }, { "epoch": 22.387959866220736, "grad_norm": 0.6309815645217896, "learning_rate": 0.00012556173526140157, "loss": 2.7539, "step": 1679 }, { "epoch": 22.401337792642142, "grad_norm": 0.6618108153343201, "learning_rate": 0.00012551724137931035, "loss": 2.9342, "step": 1680 }, { "epoch": 22.414715719063544, "grad_norm": 0.7427678108215332, "learning_rate": 0.00012547274749721913, "loss": 2.9707, "step": 1681 }, { "epoch": 22.42809364548495, "grad_norm": 0.6226203441619873, "learning_rate": 0.00012542825361512794, "loss": 2.728, "step": 1682 }, { "epoch": 22.441471571906355, "grad_norm": 0.6432873606681824, "learning_rate": 0.0001253837597330367, "loss": 2.6271, "step": 1683 }, { "epoch": 22.45484949832776, "grad_norm": 0.618368923664093, "learning_rate": 0.0001253392658509455, "loss": 2.7364, "step": 1684 }, { "epoch": 22.468227424749163, "grad_norm": 0.601243257522583, "learning_rate": 0.0001252947719688543, "loss": 2.7071, "step": 1685 }, { "epoch": 22.48160535117057, "grad_norm": 0.5549631714820862, "learning_rate": 0.00012525027808676307, "loss": 2.7703, "step": 1686 }, { "epoch": 22.494983277591974, "grad_norm": 0.8125869035720825, "learning_rate": 0.00012520578420467185, "loss": 2.6798, "step": 1687 }, { "epoch": 22.50836120401338, "grad_norm": 0.5712392330169678, "learning_rate": 0.00012516129032258066, "loss": 2.7694, "step": 1688 }, { "epoch": 22.52173913043478, "grad_norm": 0.5955860614776611, "learning_rate": 0.00012511679644048945, "loss": 2.8128, "step": 1689 }, { "epoch": 22.535117056856187, "grad_norm": 0.6090961694717407, "learning_rate": 0.00012507230255839823, "loss": 2.9096, "step": 1690 }, { "epoch": 22.548494983277592, "grad_norm": 0.6430270671844482, "learning_rate": 0.000125027808676307, "loss": 2.8046, "step": 1691 }, { "epoch": 22.561872909698998, "grad_norm": 0.6325036883354187, "learning_rate": 0.00012498331479421582, "loss": 2.9619, "step": 1692 }, { "epoch": 22.5752508361204, "grad_norm": 0.5997435450553894, "learning_rate": 0.00012493882091212458, "loss": 2.9809, "step": 1693 }, { "epoch": 22.588628762541806, "grad_norm": 0.6140502095222473, "learning_rate": 0.00012489432703003338, "loss": 2.9622, "step": 1694 }, { "epoch": 22.60200668896321, "grad_norm": 0.5908461809158325, "learning_rate": 0.00012484983314794217, "loss": 2.8282, "step": 1695 }, { "epoch": 22.615384615384617, "grad_norm": 0.5987250208854675, "learning_rate": 0.00012480533926585095, "loss": 2.8239, "step": 1696 }, { "epoch": 22.62876254180602, "grad_norm": 0.6643062233924866, "learning_rate": 0.00012476084538375973, "loss": 2.8677, "step": 1697 }, { "epoch": 22.642140468227424, "grad_norm": 0.640986442565918, "learning_rate": 0.00012471635150166851, "loss": 2.6359, "step": 1698 }, { "epoch": 22.65551839464883, "grad_norm": 0.6315325498580933, "learning_rate": 0.00012467185761957732, "loss": 2.7919, "step": 1699 }, { "epoch": 22.668896321070235, "grad_norm": 0.6384916305541992, "learning_rate": 0.00012462736373748608, "loss": 3.0353, "step": 1700 }, { "epoch": 22.682274247491637, "grad_norm": 0.6934499144554138, "learning_rate": 0.0001245828698553949, "loss": 2.8694, "step": 1701 }, { "epoch": 22.695652173913043, "grad_norm": 0.6261676549911499, "learning_rate": 0.00012453837597330367, "loss": 2.6893, "step": 1702 }, { "epoch": 22.70903010033445, "grad_norm": 0.6069616079330444, "learning_rate": 0.00012449388209121245, "loss": 2.9214, "step": 1703 }, { "epoch": 22.722408026755854, "grad_norm": 0.6354973316192627, "learning_rate": 0.00012444938820912124, "loss": 2.754, "step": 1704 }, { "epoch": 22.735785953177256, "grad_norm": 0.7385901808738708, "learning_rate": 0.00012440489432703004, "loss": 2.7633, "step": 1705 }, { "epoch": 22.74916387959866, "grad_norm": 0.680008053779602, "learning_rate": 0.00012436040044493883, "loss": 2.851, "step": 1706 }, { "epoch": 22.762541806020067, "grad_norm": 0.5894846320152283, "learning_rate": 0.0001243159065628476, "loss": 2.8797, "step": 1707 }, { "epoch": 22.775919732441473, "grad_norm": 0.5705847144126892, "learning_rate": 0.0001242714126807564, "loss": 2.7471, "step": 1708 }, { "epoch": 22.789297658862875, "grad_norm": 0.6021112203598022, "learning_rate": 0.0001242269187986652, "loss": 3.0716, "step": 1709 }, { "epoch": 22.80267558528428, "grad_norm": 0.63507080078125, "learning_rate": 0.00012418242491657396, "loss": 2.8873, "step": 1710 }, { "epoch": 22.816053511705686, "grad_norm": 0.6159088015556335, "learning_rate": 0.00012413793103448277, "loss": 2.5226, "step": 1711 }, { "epoch": 22.82943143812709, "grad_norm": 0.6099200248718262, "learning_rate": 0.00012409343715239155, "loss": 2.9937, "step": 1712 }, { "epoch": 22.842809364548494, "grad_norm": 0.5942363142967224, "learning_rate": 0.00012404894327030033, "loss": 2.5113, "step": 1713 }, { "epoch": 22.8561872909699, "grad_norm": 0.6209394335746765, "learning_rate": 0.00012400444938820911, "loss": 3.0766, "step": 1714 }, { "epoch": 22.869565217391305, "grad_norm": 0.6850425601005554, "learning_rate": 0.00012395995550611792, "loss": 2.8807, "step": 1715 }, { "epoch": 22.88294314381271, "grad_norm": 0.5698277354240417, "learning_rate": 0.0001239154616240267, "loss": 2.6821, "step": 1716 }, { "epoch": 22.896321070234112, "grad_norm": 0.7140303254127502, "learning_rate": 0.0001238709677419355, "loss": 2.7047, "step": 1717 }, { "epoch": 22.909698996655518, "grad_norm": 0.6129325032234192, "learning_rate": 0.00012382647385984427, "loss": 2.705, "step": 1718 }, { "epoch": 22.923076923076923, "grad_norm": 0.6409560441970825, "learning_rate": 0.00012378197997775308, "loss": 2.7126, "step": 1719 }, { "epoch": 22.93645484949833, "grad_norm": 0.5590237975120544, "learning_rate": 0.00012373748609566184, "loss": 2.7945, "step": 1720 }, { "epoch": 22.94983277591973, "grad_norm": 0.5763843655586243, "learning_rate": 0.00012369299221357064, "loss": 2.6802, "step": 1721 }, { "epoch": 22.963210702341136, "grad_norm": 0.6224062442779541, "learning_rate": 0.00012364849833147943, "loss": 2.7035, "step": 1722 }, { "epoch": 22.976588628762542, "grad_norm": 0.5638879537582397, "learning_rate": 0.0001236040044493882, "loss": 2.8259, "step": 1723 }, { "epoch": 22.989966555183948, "grad_norm": 0.6214390993118286, "learning_rate": 0.000123559510567297, "loss": 2.9997, "step": 1724 }, { "epoch": 23.0, "grad_norm": 0.6950054168701172, "learning_rate": 0.0001235150166852058, "loss": 2.7497, "step": 1725 }, { "epoch": 23.013377926421406, "grad_norm": 0.67231285572052, "learning_rate": 0.00012347052280311458, "loss": 2.859, "step": 1726 }, { "epoch": 23.02675585284281, "grad_norm": 0.6401329040527344, "learning_rate": 0.00012342602892102337, "loss": 2.7056, "step": 1727 }, { "epoch": 23.040133779264213, "grad_norm": 0.6234462261199951, "learning_rate": 0.00012338153503893215, "loss": 2.8271, "step": 1728 }, { "epoch": 23.05351170568562, "grad_norm": 0.6482179164886475, "learning_rate": 0.00012333704115684096, "loss": 2.898, "step": 1729 }, { "epoch": 23.066889632107024, "grad_norm": 0.6077515482902527, "learning_rate": 0.0001232925472747497, "loss": 2.6865, "step": 1730 }, { "epoch": 23.08026755852843, "grad_norm": 0.5866036415100098, "learning_rate": 0.00012324805339265852, "loss": 2.9001, "step": 1731 }, { "epoch": 23.093645484949832, "grad_norm": 0.6499370336532593, "learning_rate": 0.0001232035595105673, "loss": 2.8718, "step": 1732 }, { "epoch": 23.107023411371237, "grad_norm": 0.6188756227493286, "learning_rate": 0.0001231590656284761, "loss": 2.7928, "step": 1733 }, { "epoch": 23.120401337792643, "grad_norm": 0.5990676879882812, "learning_rate": 0.00012311457174638487, "loss": 2.7918, "step": 1734 }, { "epoch": 23.13377926421405, "grad_norm": 0.5592599511146545, "learning_rate": 0.00012307007786429368, "loss": 2.7948, "step": 1735 }, { "epoch": 23.14715719063545, "grad_norm": 0.5751471519470215, "learning_rate": 0.00012302558398220246, "loss": 2.6472, "step": 1736 }, { "epoch": 23.160535117056856, "grad_norm": 0.6300846338272095, "learning_rate": 0.00012298109010011124, "loss": 2.7491, "step": 1737 }, { "epoch": 23.17391304347826, "grad_norm": 0.5754296779632568, "learning_rate": 0.00012293659621802003, "loss": 2.7557, "step": 1738 }, { "epoch": 23.187290969899667, "grad_norm": 0.6483306884765625, "learning_rate": 0.0001228921023359288, "loss": 2.776, "step": 1739 }, { "epoch": 23.20066889632107, "grad_norm": 0.5819816589355469, "learning_rate": 0.0001228476084538376, "loss": 2.9192, "step": 1740 }, { "epoch": 23.214046822742475, "grad_norm": 0.6570101380348206, "learning_rate": 0.00012280311457174637, "loss": 2.8571, "step": 1741 }, { "epoch": 23.22742474916388, "grad_norm": 0.6435232162475586, "learning_rate": 0.00012275862068965518, "loss": 2.6134, "step": 1742 }, { "epoch": 23.240802675585286, "grad_norm": 0.5817757844924927, "learning_rate": 0.00012271412680756397, "loss": 2.8384, "step": 1743 }, { "epoch": 23.254180602006688, "grad_norm": 0.5564932823181152, "learning_rate": 0.00012266963292547275, "loss": 2.4606, "step": 1744 }, { "epoch": 23.267558528428093, "grad_norm": 0.6028578281402588, "learning_rate": 0.00012262513904338153, "loss": 2.824, "step": 1745 }, { "epoch": 23.2809364548495, "grad_norm": 0.5562644600868225, "learning_rate": 0.00012258064516129034, "loss": 2.8195, "step": 1746 }, { "epoch": 23.294314381270905, "grad_norm": 0.6838886141777039, "learning_rate": 0.0001225361512791991, "loss": 2.8453, "step": 1747 }, { "epoch": 23.307692307692307, "grad_norm": 0.6102856993675232, "learning_rate": 0.0001224916573971079, "loss": 2.8331, "step": 1748 }, { "epoch": 23.321070234113712, "grad_norm": 0.5895605087280273, "learning_rate": 0.0001224471635150167, "loss": 2.7479, "step": 1749 }, { "epoch": 23.334448160535118, "grad_norm": 0.7318586111068726, "learning_rate": 0.00012240266963292547, "loss": 2.7201, "step": 1750 }, { "epoch": 23.347826086956523, "grad_norm": 0.6560878157615662, "learning_rate": 0.00012235817575083425, "loss": 2.931, "step": 1751 }, { "epoch": 23.361204013377925, "grad_norm": 0.5879004001617432, "learning_rate": 0.00012231368186874306, "loss": 2.8607, "step": 1752 }, { "epoch": 23.37458193979933, "grad_norm": 0.5916743278503418, "learning_rate": 0.00012226918798665184, "loss": 2.6923, "step": 1753 }, { "epoch": 23.387959866220736, "grad_norm": 0.5569814443588257, "learning_rate": 0.00012222469410456063, "loss": 2.634, "step": 1754 }, { "epoch": 23.401337792642142, "grad_norm": 0.7285422682762146, "learning_rate": 0.0001221802002224694, "loss": 2.7135, "step": 1755 }, { "epoch": 23.414715719063544, "grad_norm": 0.7323710918426514, "learning_rate": 0.00012213570634037822, "loss": 2.6855, "step": 1756 }, { "epoch": 23.42809364548495, "grad_norm": 0.65497887134552, "learning_rate": 0.00012209121245828697, "loss": 2.8777, "step": 1757 }, { "epoch": 23.441471571906355, "grad_norm": 0.5919074416160583, "learning_rate": 0.00012204671857619578, "loss": 2.657, "step": 1758 }, { "epoch": 23.45484949832776, "grad_norm": 0.6749523282051086, "learning_rate": 0.00012200222469410457, "loss": 2.8142, "step": 1759 }, { "epoch": 23.468227424749163, "grad_norm": 0.6859252452850342, "learning_rate": 0.00012195773081201336, "loss": 2.8572, "step": 1760 }, { "epoch": 23.48160535117057, "grad_norm": 0.5956023931503296, "learning_rate": 0.00012191323692992213, "loss": 2.8385, "step": 1761 }, { "epoch": 23.494983277591974, "grad_norm": 0.684101939201355, "learning_rate": 0.00012186874304783094, "loss": 2.4779, "step": 1762 }, { "epoch": 23.50836120401338, "grad_norm": 0.726864755153656, "learning_rate": 0.00012182424916573971, "loss": 2.8523, "step": 1763 }, { "epoch": 23.52173913043478, "grad_norm": 0.5874570608139038, "learning_rate": 0.00012177975528364852, "loss": 2.7866, "step": 1764 }, { "epoch": 23.535117056856187, "grad_norm": 0.6077170968055725, "learning_rate": 0.00012173526140155729, "loss": 2.711, "step": 1765 }, { "epoch": 23.548494983277592, "grad_norm": 0.6854214668273926, "learning_rate": 0.00012169076751946608, "loss": 2.655, "step": 1766 }, { "epoch": 23.561872909698998, "grad_norm": 0.6409576535224915, "learning_rate": 0.00012164627363737486, "loss": 2.7309, "step": 1767 }, { "epoch": 23.5752508361204, "grad_norm": 0.6241947412490845, "learning_rate": 0.00012160177975528366, "loss": 2.67, "step": 1768 }, { "epoch": 23.588628762541806, "grad_norm": 0.5962851643562317, "learning_rate": 0.00012155728587319244, "loss": 2.6147, "step": 1769 }, { "epoch": 23.60200668896321, "grad_norm": 0.6183756589889526, "learning_rate": 0.00012151279199110124, "loss": 2.9772, "step": 1770 }, { "epoch": 23.615384615384617, "grad_norm": 0.6092197299003601, "learning_rate": 0.00012146829810901001, "loss": 2.7146, "step": 1771 }, { "epoch": 23.62876254180602, "grad_norm": 0.6660104990005493, "learning_rate": 0.00012142380422691882, "loss": 2.6339, "step": 1772 }, { "epoch": 23.642140468227424, "grad_norm": 0.9198834896087646, "learning_rate": 0.00012137931034482759, "loss": 2.7188, "step": 1773 }, { "epoch": 23.65551839464883, "grad_norm": 0.7053755521774292, "learning_rate": 0.0001213348164627364, "loss": 2.7484, "step": 1774 }, { "epoch": 23.668896321070235, "grad_norm": 0.7443736791610718, "learning_rate": 0.00012129032258064516, "loss": 2.6805, "step": 1775 }, { "epoch": 23.682274247491637, "grad_norm": 0.579801082611084, "learning_rate": 0.00012124582869855396, "loss": 2.6467, "step": 1776 }, { "epoch": 23.695652173913043, "grad_norm": 0.6014502644538879, "learning_rate": 0.00012120133481646274, "loss": 2.5816, "step": 1777 }, { "epoch": 23.70903010033445, "grad_norm": 0.6318315863609314, "learning_rate": 0.00012115684093437154, "loss": 2.8137, "step": 1778 }, { "epoch": 23.722408026755854, "grad_norm": 0.6172413229942322, "learning_rate": 0.00012111234705228032, "loss": 2.7789, "step": 1779 }, { "epoch": 23.735785953177256, "grad_norm": 0.6184534430503845, "learning_rate": 0.00012106785317018909, "loss": 2.6802, "step": 1780 }, { "epoch": 23.74916387959866, "grad_norm": 0.6380288600921631, "learning_rate": 0.00012102335928809789, "loss": 2.8755, "step": 1781 }, { "epoch": 23.762541806020067, "grad_norm": 0.5941389799118042, "learning_rate": 0.00012097886540600667, "loss": 2.945, "step": 1782 }, { "epoch": 23.775919732441473, "grad_norm": 0.6913108825683594, "learning_rate": 0.00012093437152391546, "loss": 2.7036, "step": 1783 }, { "epoch": 23.789297658862875, "grad_norm": 0.563119113445282, "learning_rate": 0.00012088987764182425, "loss": 2.6608, "step": 1784 }, { "epoch": 23.80267558528428, "grad_norm": 0.6387828588485718, "learning_rate": 0.00012084538375973304, "loss": 2.8933, "step": 1785 }, { "epoch": 23.816053511705686, "grad_norm": 0.7530612945556641, "learning_rate": 0.00012080088987764183, "loss": 2.8399, "step": 1786 }, { "epoch": 23.82943143812709, "grad_norm": 0.6401646733283997, "learning_rate": 0.00012075639599555062, "loss": 2.8757, "step": 1787 }, { "epoch": 23.842809364548494, "grad_norm": 0.7403398752212524, "learning_rate": 0.00012071190211345939, "loss": 2.629, "step": 1788 }, { "epoch": 23.8561872909699, "grad_norm": 0.6479887366294861, "learning_rate": 0.0001206674082313682, "loss": 2.9069, "step": 1789 }, { "epoch": 23.869565217391305, "grad_norm": 0.588141679763794, "learning_rate": 0.00012062291434927697, "loss": 2.732, "step": 1790 }, { "epoch": 23.88294314381271, "grad_norm": 0.6330631971359253, "learning_rate": 0.00012057842046718576, "loss": 2.9348, "step": 1791 }, { "epoch": 23.896321070234112, "grad_norm": 0.6657344698905945, "learning_rate": 0.00012053392658509455, "loss": 3.0499, "step": 1792 }, { "epoch": 23.909698996655518, "grad_norm": 0.6816025972366333, "learning_rate": 0.00012048943270300334, "loss": 2.9961, "step": 1793 }, { "epoch": 23.923076923076923, "grad_norm": 0.6501593589782715, "learning_rate": 0.00012044493882091212, "loss": 2.8555, "step": 1794 }, { "epoch": 23.93645484949833, "grad_norm": 0.6157960295677185, "learning_rate": 0.00012040044493882092, "loss": 2.8133, "step": 1795 }, { "epoch": 23.94983277591973, "grad_norm": 0.5868191719055176, "learning_rate": 0.0001203559510567297, "loss": 2.7887, "step": 1796 }, { "epoch": 23.963210702341136, "grad_norm": 0.6586911678314209, "learning_rate": 0.0001203114571746385, "loss": 3.0916, "step": 1797 }, { "epoch": 23.976588628762542, "grad_norm": 0.6382162570953369, "learning_rate": 0.00012026696329254727, "loss": 2.7469, "step": 1798 }, { "epoch": 23.989966555183948, "grad_norm": 0.6481708288192749, "learning_rate": 0.00012022246941045608, "loss": 2.9216, "step": 1799 }, { "epoch": 24.0, "grad_norm": 0.7272862792015076, "learning_rate": 0.00012017797552836485, "loss": 2.686, "step": 1800 }, { "epoch": 24.013377926421406, "grad_norm": 0.743366003036499, "learning_rate": 0.00012013348164627364, "loss": 2.6791, "step": 1801 }, { "epoch": 24.02675585284281, "grad_norm": 0.6428125500679016, "learning_rate": 0.00012008898776418242, "loss": 2.782, "step": 1802 }, { "epoch": 24.040133779264213, "grad_norm": 0.6210600137710571, "learning_rate": 0.00012004449388209122, "loss": 2.9352, "step": 1803 }, { "epoch": 24.05351170568562, "grad_norm": 0.9589283466339111, "learning_rate": 0.00012, "loss": 2.7686, "step": 1804 }, { "epoch": 24.066889632107024, "grad_norm": 0.5503745675086975, "learning_rate": 0.0001199555061179088, "loss": 2.9241, "step": 1805 }, { "epoch": 24.08026755852843, "grad_norm": 0.6018356084823608, "learning_rate": 0.00011991101223581758, "loss": 2.7512, "step": 1806 }, { "epoch": 24.093645484949832, "grad_norm": 0.7458929419517517, "learning_rate": 0.00011986651835372638, "loss": 2.5795, "step": 1807 }, { "epoch": 24.107023411371237, "grad_norm": 0.705740213394165, "learning_rate": 0.00011982202447163515, "loss": 2.8537, "step": 1808 }, { "epoch": 24.120401337792643, "grad_norm": 0.6299166679382324, "learning_rate": 0.00011977753058954396, "loss": 2.7815, "step": 1809 }, { "epoch": 24.13377926421405, "grad_norm": 1.6271346807479858, "learning_rate": 0.00011973303670745272, "loss": 2.6373, "step": 1810 }, { "epoch": 24.14715719063545, "grad_norm": 0.6286007165908813, "learning_rate": 0.00011968854282536152, "loss": 2.5587, "step": 1811 }, { "epoch": 24.160535117056856, "grad_norm": 0.6259005069732666, "learning_rate": 0.0001196440489432703, "loss": 2.6414, "step": 1812 }, { "epoch": 24.17391304347826, "grad_norm": 0.6058273911476135, "learning_rate": 0.0001195995550611791, "loss": 2.8571, "step": 1813 }, { "epoch": 24.187290969899667, "grad_norm": 0.6144312620162964, "learning_rate": 0.00011955506117908788, "loss": 2.7777, "step": 1814 }, { "epoch": 24.20066889632107, "grad_norm": 0.7302254438400269, "learning_rate": 0.00011951056729699668, "loss": 2.6298, "step": 1815 }, { "epoch": 24.214046822742475, "grad_norm": 0.7253098487854004, "learning_rate": 0.00011946607341490546, "loss": 2.9066, "step": 1816 }, { "epoch": 24.22742474916388, "grad_norm": 0.7629284858703613, "learning_rate": 0.00011942157953281426, "loss": 2.5391, "step": 1817 }, { "epoch": 24.240802675585286, "grad_norm": 0.6672796607017517, "learning_rate": 0.00011937708565072302, "loss": 2.8014, "step": 1818 }, { "epoch": 24.254180602006688, "grad_norm": 0.6049585938453674, "learning_rate": 0.00011933259176863183, "loss": 2.5539, "step": 1819 }, { "epoch": 24.267558528428093, "grad_norm": 0.5817275047302246, "learning_rate": 0.0001192880978865406, "loss": 2.5771, "step": 1820 }, { "epoch": 24.2809364548495, "grad_norm": 0.6785464882850647, "learning_rate": 0.00011924360400444938, "loss": 2.8043, "step": 1821 }, { "epoch": 24.294314381270905, "grad_norm": 0.6705557107925415, "learning_rate": 0.00011919911012235818, "loss": 2.7126, "step": 1822 }, { "epoch": 24.307692307692307, "grad_norm": 0.6267027854919434, "learning_rate": 0.00011915461624026696, "loss": 2.8739, "step": 1823 }, { "epoch": 24.321070234113712, "grad_norm": 0.6952410340309143, "learning_rate": 0.00011911012235817576, "loss": 2.8402, "step": 1824 }, { "epoch": 24.334448160535118, "grad_norm": 0.6892711520195007, "learning_rate": 0.00011906562847608453, "loss": 2.7314, "step": 1825 }, { "epoch": 24.347826086956523, "grad_norm": 0.5816205739974976, "learning_rate": 0.00011902113459399334, "loss": 2.6115, "step": 1826 }, { "epoch": 24.361204013377925, "grad_norm": 0.6549028158187866, "learning_rate": 0.0001189766407119021, "loss": 2.7503, "step": 1827 }, { "epoch": 24.37458193979933, "grad_norm": 0.5902572870254517, "learning_rate": 0.0001189321468298109, "loss": 2.7549, "step": 1828 }, { "epoch": 24.387959866220736, "grad_norm": 0.8412529230117798, "learning_rate": 0.00011888765294771968, "loss": 2.9654, "step": 1829 }, { "epoch": 24.401337792642142, "grad_norm": 0.5785757303237915, "learning_rate": 0.00011884315906562848, "loss": 2.8101, "step": 1830 }, { "epoch": 24.414715719063544, "grad_norm": 0.5835216045379639, "learning_rate": 0.00011879866518353726, "loss": 2.5368, "step": 1831 }, { "epoch": 24.42809364548495, "grad_norm": 0.6589221954345703, "learning_rate": 0.00011875417130144606, "loss": 2.7917, "step": 1832 }, { "epoch": 24.441471571906355, "grad_norm": 0.6370606422424316, "learning_rate": 0.00011870967741935484, "loss": 2.7514, "step": 1833 }, { "epoch": 24.45484949832776, "grad_norm": 0.644980788230896, "learning_rate": 0.00011866518353726364, "loss": 2.6756, "step": 1834 }, { "epoch": 24.468227424749163, "grad_norm": 0.7281529307365417, "learning_rate": 0.0001186206896551724, "loss": 2.5308, "step": 1835 }, { "epoch": 24.48160535117057, "grad_norm": 0.5669592618942261, "learning_rate": 0.00011857619577308122, "loss": 2.7197, "step": 1836 }, { "epoch": 24.494983277591974, "grad_norm": 0.7093492150306702, "learning_rate": 0.00011853170189098998, "loss": 2.9053, "step": 1837 }, { "epoch": 24.50836120401338, "grad_norm": 0.5949118733406067, "learning_rate": 0.00011848720800889878, "loss": 2.8594, "step": 1838 }, { "epoch": 24.52173913043478, "grad_norm": 0.5725157260894775, "learning_rate": 0.00011844271412680756, "loss": 2.5906, "step": 1839 }, { "epoch": 24.535117056856187, "grad_norm": 0.7810790538787842, "learning_rate": 0.00011839822024471636, "loss": 2.7852, "step": 1840 }, { "epoch": 24.548494983277592, "grad_norm": 0.6285178661346436, "learning_rate": 0.00011835372636262514, "loss": 2.7201, "step": 1841 }, { "epoch": 24.561872909698998, "grad_norm": 0.6262674927711487, "learning_rate": 0.00011830923248053394, "loss": 2.7249, "step": 1842 }, { "epoch": 24.5752508361204, "grad_norm": 0.5526009202003479, "learning_rate": 0.00011826473859844272, "loss": 2.6668, "step": 1843 }, { "epoch": 24.588628762541806, "grad_norm": 0.6295923590660095, "learning_rate": 0.00011822024471635152, "loss": 2.8788, "step": 1844 }, { "epoch": 24.60200668896321, "grad_norm": 0.6965957283973694, "learning_rate": 0.00011817575083426028, "loss": 2.8676, "step": 1845 }, { "epoch": 24.615384615384617, "grad_norm": 0.6246476173400879, "learning_rate": 0.0001181312569521691, "loss": 2.8326, "step": 1846 }, { "epoch": 24.62876254180602, "grad_norm": 0.6312415599822998, "learning_rate": 0.00011808676307007786, "loss": 2.5748, "step": 1847 }, { "epoch": 24.642140468227424, "grad_norm": 0.7262862920761108, "learning_rate": 0.00011804226918798666, "loss": 2.922, "step": 1848 }, { "epoch": 24.65551839464883, "grad_norm": 0.6640278697013855, "learning_rate": 0.00011799777530589544, "loss": 2.7494, "step": 1849 }, { "epoch": 24.668896321070235, "grad_norm": 0.6154195070266724, "learning_rate": 0.00011795328142380424, "loss": 2.6168, "step": 1850 }, { "epoch": 24.682274247491637, "grad_norm": 0.6020949482917786, "learning_rate": 0.00011790878754171302, "loss": 2.7566, "step": 1851 }, { "epoch": 24.695652173913043, "grad_norm": 0.5775970816612244, "learning_rate": 0.00011786429365962182, "loss": 2.6862, "step": 1852 }, { "epoch": 24.70903010033445, "grad_norm": 0.6687362194061279, "learning_rate": 0.0001178197997775306, "loss": 2.554, "step": 1853 }, { "epoch": 24.722408026755854, "grad_norm": 0.6184269785881042, "learning_rate": 0.0001177753058954394, "loss": 2.9353, "step": 1854 }, { "epoch": 24.735785953177256, "grad_norm": 0.6340799331665039, "learning_rate": 0.00011773081201334816, "loss": 2.857, "step": 1855 }, { "epoch": 24.74916387959866, "grad_norm": 0.8216782808303833, "learning_rate": 0.00011768631813125697, "loss": 2.6295, "step": 1856 }, { "epoch": 24.762541806020067, "grad_norm": 0.5764971375465393, "learning_rate": 0.00011764182424916574, "loss": 2.8431, "step": 1857 }, { "epoch": 24.775919732441473, "grad_norm": 0.6233363747596741, "learning_rate": 0.00011759733036707454, "loss": 2.6867, "step": 1858 }, { "epoch": 24.789297658862875, "grad_norm": 0.6669492721557617, "learning_rate": 0.00011755283648498332, "loss": 2.5405, "step": 1859 }, { "epoch": 24.80267558528428, "grad_norm": 0.6156982183456421, "learning_rate": 0.00011750834260289212, "loss": 2.8071, "step": 1860 }, { "epoch": 24.816053511705686, "grad_norm": 0.6210219860076904, "learning_rate": 0.0001174638487208009, "loss": 2.8308, "step": 1861 }, { "epoch": 24.82943143812709, "grad_norm": 0.6786196231842041, "learning_rate": 0.00011741935483870967, "loss": 2.9039, "step": 1862 }, { "epoch": 24.842809364548494, "grad_norm": 0.6467169523239136, "learning_rate": 0.00011737486095661848, "loss": 2.6601, "step": 1863 }, { "epoch": 24.8561872909699, "grad_norm": 0.6734811663627625, "learning_rate": 0.00011733036707452724, "loss": 2.8614, "step": 1864 }, { "epoch": 24.869565217391305, "grad_norm": 0.6223945021629333, "learning_rate": 0.00011728587319243604, "loss": 2.9397, "step": 1865 }, { "epoch": 24.88294314381271, "grad_norm": 0.6042306423187256, "learning_rate": 0.00011724137931034482, "loss": 2.8984, "step": 1866 }, { "epoch": 24.896321070234112, "grad_norm": 0.945598840713501, "learning_rate": 0.00011719688542825362, "loss": 2.6882, "step": 1867 }, { "epoch": 24.909698996655518, "grad_norm": 0.6739409565925598, "learning_rate": 0.0001171523915461624, "loss": 2.6285, "step": 1868 }, { "epoch": 24.923076923076923, "grad_norm": 0.6158167719841003, "learning_rate": 0.0001171078976640712, "loss": 2.8019, "step": 1869 }, { "epoch": 24.93645484949833, "grad_norm": 0.5984783172607422, "learning_rate": 0.00011706340378197998, "loss": 2.785, "step": 1870 }, { "epoch": 24.94983277591973, "grad_norm": 0.594236433506012, "learning_rate": 0.00011701890989988878, "loss": 3.0126, "step": 1871 }, { "epoch": 24.963210702341136, "grad_norm": 0.6268919706344604, "learning_rate": 0.00011697441601779754, "loss": 2.7908, "step": 1872 }, { "epoch": 24.976588628762542, "grad_norm": 0.6119821071624756, "learning_rate": 0.00011692992213570635, "loss": 2.5886, "step": 1873 }, { "epoch": 24.989966555183948, "grad_norm": 0.8935804963111877, "learning_rate": 0.00011688542825361512, "loss": 2.9702, "step": 1874 }, { "epoch": 25.0, "grad_norm": 0.6583001613616943, "learning_rate": 0.00011684093437152392, "loss": 2.7691, "step": 1875 }, { "epoch": 25.013377926421406, "grad_norm": 0.5952125787734985, "learning_rate": 0.0001167964404894327, "loss": 2.6709, "step": 1876 }, { "epoch": 25.02675585284281, "grad_norm": 0.6740133762359619, "learning_rate": 0.0001167519466073415, "loss": 2.6548, "step": 1877 }, { "epoch": 25.040133779264213, "grad_norm": 0.6665695309638977, "learning_rate": 0.00011670745272525028, "loss": 2.6141, "step": 1878 }, { "epoch": 25.05351170568562, "grad_norm": 0.698957085609436, "learning_rate": 0.00011666295884315908, "loss": 2.7752, "step": 1879 }, { "epoch": 25.066889632107024, "grad_norm": 0.7277541160583496, "learning_rate": 0.00011661846496106786, "loss": 2.8876, "step": 1880 }, { "epoch": 25.08026755852843, "grad_norm": 0.5987577438354492, "learning_rate": 0.00011657397107897665, "loss": 2.5601, "step": 1881 }, { "epoch": 25.093645484949832, "grad_norm": 0.5897870063781738, "learning_rate": 0.00011652947719688542, "loss": 2.5854, "step": 1882 }, { "epoch": 25.107023411371237, "grad_norm": 0.573273241519928, "learning_rate": 0.00011648498331479423, "loss": 2.7141, "step": 1883 }, { "epoch": 25.120401337792643, "grad_norm": 0.6173187494277954, "learning_rate": 0.000116440489432703, "loss": 2.6697, "step": 1884 }, { "epoch": 25.13377926421405, "grad_norm": 0.5728760361671448, "learning_rate": 0.0001163959955506118, "loss": 2.7058, "step": 1885 }, { "epoch": 25.14715719063545, "grad_norm": 0.543645441532135, "learning_rate": 0.00011635150166852058, "loss": 2.441, "step": 1886 }, { "epoch": 25.160535117056856, "grad_norm": 0.6582958698272705, "learning_rate": 0.00011630700778642938, "loss": 2.8462, "step": 1887 }, { "epoch": 25.17391304347826, "grad_norm": 0.6733880043029785, "learning_rate": 0.00011626251390433816, "loss": 2.7294, "step": 1888 }, { "epoch": 25.187290969899667, "grad_norm": 0.8249802589416504, "learning_rate": 0.00011621802002224695, "loss": 2.7738, "step": 1889 }, { "epoch": 25.20066889632107, "grad_norm": 0.6004601120948792, "learning_rate": 0.00011617352614015574, "loss": 2.6992, "step": 1890 }, { "epoch": 25.214046822742475, "grad_norm": 0.6188962459564209, "learning_rate": 0.00011612903225806453, "loss": 2.5715, "step": 1891 }, { "epoch": 25.22742474916388, "grad_norm": 0.6831389665603638, "learning_rate": 0.0001160845383759733, "loss": 2.8777, "step": 1892 }, { "epoch": 25.240802675585286, "grad_norm": 0.5578925013542175, "learning_rate": 0.00011604004449388211, "loss": 2.4875, "step": 1893 }, { "epoch": 25.254180602006688, "grad_norm": 0.5995929837226868, "learning_rate": 0.00011599555061179088, "loss": 2.5166, "step": 1894 }, { "epoch": 25.267558528428093, "grad_norm": 0.6817669868469238, "learning_rate": 0.00011595105672969967, "loss": 2.7628, "step": 1895 }, { "epoch": 25.2809364548495, "grad_norm": 0.671890139579773, "learning_rate": 0.00011590656284760846, "loss": 2.5406, "step": 1896 }, { "epoch": 25.294314381270905, "grad_norm": 0.6167321801185608, "learning_rate": 0.00011586206896551725, "loss": 2.8789, "step": 1897 }, { "epoch": 25.307692307692307, "grad_norm": 0.6145638227462769, "learning_rate": 0.00011581757508342604, "loss": 2.7629, "step": 1898 }, { "epoch": 25.321070234113712, "grad_norm": 0.6117168068885803, "learning_rate": 0.00011577308120133483, "loss": 2.7048, "step": 1899 }, { "epoch": 25.334448160535118, "grad_norm": 0.6903496384620667, "learning_rate": 0.00011572858731924361, "loss": 2.9096, "step": 1900 }, { "epoch": 25.347826086956523, "grad_norm": 0.5942792296409607, "learning_rate": 0.00011568409343715241, "loss": 2.4525, "step": 1901 }, { "epoch": 25.361204013377925, "grad_norm": 0.5714770555496216, "learning_rate": 0.00011563959955506118, "loss": 2.6605, "step": 1902 }, { "epoch": 25.37458193979933, "grad_norm": 1.0252799987792969, "learning_rate": 0.00011559510567296996, "loss": 2.4599, "step": 1903 }, { "epoch": 25.387959866220736, "grad_norm": 0.6459619998931885, "learning_rate": 0.00011555061179087876, "loss": 2.657, "step": 1904 }, { "epoch": 25.401337792642142, "grad_norm": 0.6513925790786743, "learning_rate": 0.00011550611790878754, "loss": 2.6491, "step": 1905 }, { "epoch": 25.414715719063544, "grad_norm": 0.838390588760376, "learning_rate": 0.00011546162402669634, "loss": 2.8909, "step": 1906 }, { "epoch": 25.42809364548495, "grad_norm": 0.6873288750648499, "learning_rate": 0.00011541713014460512, "loss": 2.9252, "step": 1907 }, { "epoch": 25.441471571906355, "grad_norm": 0.6325615644454956, "learning_rate": 0.00011537263626251391, "loss": 2.8784, "step": 1908 }, { "epoch": 25.45484949832776, "grad_norm": 0.7683016061782837, "learning_rate": 0.00011532814238042268, "loss": 2.7951, "step": 1909 }, { "epoch": 25.468227424749163, "grad_norm": 0.6581037044525146, "learning_rate": 0.00011528364849833149, "loss": 2.7674, "step": 1910 }, { "epoch": 25.48160535117057, "grad_norm": 0.6761939525604248, "learning_rate": 0.00011523915461624026, "loss": 2.7315, "step": 1911 }, { "epoch": 25.494983277591974, "grad_norm": 0.621744692325592, "learning_rate": 0.00011519466073414906, "loss": 2.6977, "step": 1912 }, { "epoch": 25.50836120401338, "grad_norm": 0.5744712352752686, "learning_rate": 0.00011515016685205784, "loss": 2.6568, "step": 1913 }, { "epoch": 25.52173913043478, "grad_norm": 0.6587238907814026, "learning_rate": 0.00011510567296996664, "loss": 3.0222, "step": 1914 }, { "epoch": 25.535117056856187, "grad_norm": 0.6338929533958435, "learning_rate": 0.00011506117908787542, "loss": 2.4611, "step": 1915 }, { "epoch": 25.548494983277592, "grad_norm": 0.6468850374221802, "learning_rate": 0.00011501668520578421, "loss": 2.5532, "step": 1916 }, { "epoch": 25.561872909698998, "grad_norm": 0.6900503635406494, "learning_rate": 0.000114972191323693, "loss": 2.7291, "step": 1917 }, { "epoch": 25.5752508361204, "grad_norm": 0.9081000089645386, "learning_rate": 0.00011492769744160179, "loss": 2.528, "step": 1918 }, { "epoch": 25.588628762541806, "grad_norm": 0.6108705401420593, "learning_rate": 0.00011488320355951056, "loss": 2.7942, "step": 1919 }, { "epoch": 25.60200668896321, "grad_norm": 0.5745365023612976, "learning_rate": 0.00011483870967741937, "loss": 2.7273, "step": 1920 }, { "epoch": 25.615384615384617, "grad_norm": 0.7215850949287415, "learning_rate": 0.00011479421579532814, "loss": 2.85, "step": 1921 }, { "epoch": 25.62876254180602, "grad_norm": 0.6535587310791016, "learning_rate": 0.00011474972191323693, "loss": 2.7488, "step": 1922 }, { "epoch": 25.642140468227424, "grad_norm": 0.6285836696624756, "learning_rate": 0.00011470522803114572, "loss": 2.6874, "step": 1923 }, { "epoch": 25.65551839464883, "grad_norm": 0.652277410030365, "learning_rate": 0.00011466073414905451, "loss": 2.889, "step": 1924 }, { "epoch": 25.668896321070235, "grad_norm": 0.6526362299919128, "learning_rate": 0.0001146162402669633, "loss": 2.8342, "step": 1925 }, { "epoch": 25.682274247491637, "grad_norm": 0.6387808918952942, "learning_rate": 0.00011457174638487209, "loss": 2.6713, "step": 1926 }, { "epoch": 25.695652173913043, "grad_norm": 0.5775367617607117, "learning_rate": 0.00011452725250278087, "loss": 2.6848, "step": 1927 }, { "epoch": 25.70903010033445, "grad_norm": 0.7041560411453247, "learning_rate": 0.00011448275862068967, "loss": 2.7532, "step": 1928 }, { "epoch": 25.722408026755854, "grad_norm": 0.6833249926567078, "learning_rate": 0.00011443826473859844, "loss": 2.8257, "step": 1929 }, { "epoch": 25.735785953177256, "grad_norm": 0.6916714310646057, "learning_rate": 0.00011439377085650725, "loss": 2.5491, "step": 1930 }, { "epoch": 25.74916387959866, "grad_norm": 0.8915151357650757, "learning_rate": 0.00011434927697441602, "loss": 2.8361, "step": 1931 }, { "epoch": 25.762541806020067, "grad_norm": 0.6278401613235474, "learning_rate": 0.00011430478309232481, "loss": 2.8022, "step": 1932 }, { "epoch": 25.775919732441473, "grad_norm": 0.6929000020027161, "learning_rate": 0.0001142602892102336, "loss": 2.8938, "step": 1933 }, { "epoch": 25.789297658862875, "grad_norm": 0.6363662481307983, "learning_rate": 0.00011421579532814239, "loss": 2.8736, "step": 1934 }, { "epoch": 25.80267558528428, "grad_norm": 0.6596313118934631, "learning_rate": 0.00011417130144605117, "loss": 2.9864, "step": 1935 }, { "epoch": 25.816053511705686, "grad_norm": 0.9694854617118835, "learning_rate": 0.00011412680756395997, "loss": 2.7362, "step": 1936 }, { "epoch": 25.82943143812709, "grad_norm": 0.721339225769043, "learning_rate": 0.00011408231368186875, "loss": 2.9919, "step": 1937 }, { "epoch": 25.842809364548494, "grad_norm": 0.6164969205856323, "learning_rate": 0.00011403781979977755, "loss": 3.0446, "step": 1938 }, { "epoch": 25.8561872909699, "grad_norm": 0.8240790367126465, "learning_rate": 0.00011399332591768632, "loss": 2.7113, "step": 1939 }, { "epoch": 25.869565217391305, "grad_norm": 0.865219235420227, "learning_rate": 0.00011394883203559513, "loss": 2.6692, "step": 1940 }, { "epoch": 25.88294314381271, "grad_norm": 0.6014530062675476, "learning_rate": 0.0001139043381535039, "loss": 2.7527, "step": 1941 }, { "epoch": 25.896321070234112, "grad_norm": 0.6110913157463074, "learning_rate": 0.00011385984427141269, "loss": 2.6213, "step": 1942 }, { "epoch": 25.909698996655518, "grad_norm": 0.6967377662658691, "learning_rate": 0.00011381535038932147, "loss": 3.0013, "step": 1943 }, { "epoch": 25.923076923076923, "grad_norm": 0.6235397458076477, "learning_rate": 0.00011377085650723027, "loss": 2.8256, "step": 1944 }, { "epoch": 25.93645484949833, "grad_norm": 0.6463499665260315, "learning_rate": 0.00011372636262513905, "loss": 2.6573, "step": 1945 }, { "epoch": 25.94983277591973, "grad_norm": 0.7898368239402771, "learning_rate": 0.00011368186874304782, "loss": 2.6484, "step": 1946 }, { "epoch": 25.963210702341136, "grad_norm": 0.6130072474479675, "learning_rate": 0.00011363737486095663, "loss": 2.6912, "step": 1947 }, { "epoch": 25.976588628762542, "grad_norm": 0.6541060209274292, "learning_rate": 0.0001135928809788654, "loss": 2.789, "step": 1948 }, { "epoch": 25.989966555183948, "grad_norm": 0.6722179651260376, "learning_rate": 0.0001135483870967742, "loss": 2.7197, "step": 1949 }, { "epoch": 26.0, "grad_norm": 0.6884087920188904, "learning_rate": 0.00011350389321468298, "loss": 2.83, "step": 1950 }, { "epoch": 26.013377926421406, "grad_norm": 0.5991981029510498, "learning_rate": 0.00011345939933259177, "loss": 2.8456, "step": 1951 }, { "epoch": 26.02675585284281, "grad_norm": 0.660169243812561, "learning_rate": 0.00011341490545050056, "loss": 2.8782, "step": 1952 }, { "epoch": 26.040133779264213, "grad_norm": 0.651779294013977, "learning_rate": 0.00011337041156840935, "loss": 2.7286, "step": 1953 }, { "epoch": 26.05351170568562, "grad_norm": 0.6962876915931702, "learning_rate": 0.00011332591768631813, "loss": 2.5009, "step": 1954 }, { "epoch": 26.066889632107024, "grad_norm": 0.6752526164054871, "learning_rate": 0.00011328142380422693, "loss": 2.6137, "step": 1955 }, { "epoch": 26.08026755852843, "grad_norm": 0.6335856914520264, "learning_rate": 0.0001132369299221357, "loss": 2.4744, "step": 1956 }, { "epoch": 26.093645484949832, "grad_norm": 0.6627519726753235, "learning_rate": 0.00011319243604004451, "loss": 2.9633, "step": 1957 }, { "epoch": 26.107023411371237, "grad_norm": 0.5948102474212646, "learning_rate": 0.00011314794215795328, "loss": 2.6333, "step": 1958 }, { "epoch": 26.120401337792643, "grad_norm": 0.5609325766563416, "learning_rate": 0.00011310344827586207, "loss": 2.5427, "step": 1959 }, { "epoch": 26.13377926421405, "grad_norm": 0.6280467510223389, "learning_rate": 0.00011305895439377086, "loss": 2.6838, "step": 1960 }, { "epoch": 26.14715719063545, "grad_norm": 0.5900541543960571, "learning_rate": 0.00011301446051167965, "loss": 2.8057, "step": 1961 }, { "epoch": 26.160535117056856, "grad_norm": 0.6185488700866699, "learning_rate": 0.00011296996662958843, "loss": 2.7778, "step": 1962 }, { "epoch": 26.17391304347826, "grad_norm": 0.6149762272834778, "learning_rate": 0.00011292547274749723, "loss": 2.7384, "step": 1963 }, { "epoch": 26.187290969899667, "grad_norm": 0.5975783467292786, "learning_rate": 0.00011288097886540601, "loss": 2.6957, "step": 1964 }, { "epoch": 26.20066889632107, "grad_norm": 0.7946596741676331, "learning_rate": 0.00011283648498331481, "loss": 2.6071, "step": 1965 }, { "epoch": 26.214046822742475, "grad_norm": 0.5987370014190674, "learning_rate": 0.00011279199110122358, "loss": 2.5812, "step": 1966 }, { "epoch": 26.22742474916388, "grad_norm": 0.7613934278488159, "learning_rate": 0.00011274749721913239, "loss": 2.7886, "step": 1967 }, { "epoch": 26.240802675585286, "grad_norm": 0.6035755276679993, "learning_rate": 0.00011270300333704116, "loss": 2.4709, "step": 1968 }, { "epoch": 26.254180602006688, "grad_norm": 0.9545117616653442, "learning_rate": 0.00011265850945494995, "loss": 2.6039, "step": 1969 }, { "epoch": 26.267558528428093, "grad_norm": 0.5982191562652588, "learning_rate": 0.00011261401557285873, "loss": 2.664, "step": 1970 }, { "epoch": 26.2809364548495, "grad_norm": 0.6070300340652466, "learning_rate": 0.00011256952169076753, "loss": 2.671, "step": 1971 }, { "epoch": 26.294314381270905, "grad_norm": 0.6093387603759766, "learning_rate": 0.00011252502780867631, "loss": 2.6441, "step": 1972 }, { "epoch": 26.307692307692307, "grad_norm": 0.6499563455581665, "learning_rate": 0.00011248053392658511, "loss": 2.6851, "step": 1973 }, { "epoch": 26.321070234113712, "grad_norm": 0.6848227977752686, "learning_rate": 0.00011243604004449389, "loss": 2.6448, "step": 1974 }, { "epoch": 26.334448160535118, "grad_norm": 0.6483498811721802, "learning_rate": 0.00011239154616240269, "loss": 2.5361, "step": 1975 }, { "epoch": 26.347826086956523, "grad_norm": 0.6412089467048645, "learning_rate": 0.00011234705228031146, "loss": 2.6625, "step": 1976 }, { "epoch": 26.361204013377925, "grad_norm": 0.6891600489616394, "learning_rate": 0.00011230255839822026, "loss": 2.6901, "step": 1977 }, { "epoch": 26.37458193979933, "grad_norm": 0.606422483921051, "learning_rate": 0.00011225806451612903, "loss": 2.7778, "step": 1978 }, { "epoch": 26.387959866220736, "grad_norm": 0.6754788756370544, "learning_rate": 0.00011221357063403783, "loss": 2.7757, "step": 1979 }, { "epoch": 26.401337792642142, "grad_norm": 0.6534073352813721, "learning_rate": 0.00011216907675194661, "loss": 2.7682, "step": 1980 }, { "epoch": 26.414715719063544, "grad_norm": 0.6333116888999939, "learning_rate": 0.00011212458286985541, "loss": 2.739, "step": 1981 }, { "epoch": 26.42809364548495, "grad_norm": 0.6784356236457825, "learning_rate": 0.00011208008898776419, "loss": 2.499, "step": 1982 }, { "epoch": 26.441471571906355, "grad_norm": 0.6247740983963013, "learning_rate": 0.00011203559510567299, "loss": 2.8451, "step": 1983 }, { "epoch": 26.45484949832776, "grad_norm": 0.6405763030052185, "learning_rate": 0.00011199110122358177, "loss": 2.7237, "step": 1984 }, { "epoch": 26.468227424749163, "grad_norm": 0.7398512959480286, "learning_rate": 0.00011194660734149056, "loss": 2.6931, "step": 1985 }, { "epoch": 26.48160535117057, "grad_norm": 0.6067454814910889, "learning_rate": 0.00011190211345939933, "loss": 2.8153, "step": 1986 }, { "epoch": 26.494983277591974, "grad_norm": 0.5860223770141602, "learning_rate": 0.00011185761957730812, "loss": 2.9522, "step": 1987 }, { "epoch": 26.50836120401338, "grad_norm": 0.6325815320014954, "learning_rate": 0.00011181312569521691, "loss": 2.7294, "step": 1988 }, { "epoch": 26.52173913043478, "grad_norm": 0.6049959063529968, "learning_rate": 0.0001117686318131257, "loss": 2.7989, "step": 1989 }, { "epoch": 26.535117056856187, "grad_norm": 0.5589177012443542, "learning_rate": 0.00011172413793103449, "loss": 2.4431, "step": 1990 }, { "epoch": 26.548494983277592, "grad_norm": 0.6467594504356384, "learning_rate": 0.00011167964404894327, "loss": 2.7166, "step": 1991 }, { "epoch": 26.561872909698998, "grad_norm": 0.7082380056381226, "learning_rate": 0.00011163515016685207, "loss": 2.7768, "step": 1992 }, { "epoch": 26.5752508361204, "grad_norm": 0.7438964247703552, "learning_rate": 0.00011159065628476084, "loss": 2.8655, "step": 1993 }, { "epoch": 26.588628762541806, "grad_norm": 0.7755714654922485, "learning_rate": 0.00011154616240266965, "loss": 2.6956, "step": 1994 }, { "epoch": 26.60200668896321, "grad_norm": 0.576988160610199, "learning_rate": 0.00011150166852057842, "loss": 2.5831, "step": 1995 }, { "epoch": 26.615384615384617, "grad_norm": 0.6044856309890747, "learning_rate": 0.00011145717463848721, "loss": 2.6834, "step": 1996 }, { "epoch": 26.62876254180602, "grad_norm": 0.5846719741821289, "learning_rate": 0.000111412680756396, "loss": 2.5553, "step": 1997 }, { "epoch": 26.642140468227424, "grad_norm": 0.6511263251304626, "learning_rate": 0.00011136818687430479, "loss": 2.5404, "step": 1998 }, { "epoch": 26.65551839464883, "grad_norm": 0.6784057021141052, "learning_rate": 0.00011132369299221357, "loss": 2.5896, "step": 1999 }, { "epoch": 26.668896321070235, "grad_norm": 0.7156029939651489, "learning_rate": 0.00011127919911012237, "loss": 2.6683, "step": 2000 } ], "logging_steps": 1, "max_steps": 4500, "num_input_tokens_seen": 0, "num_train_epochs": 60, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 6.18781793617152e+16, "train_batch_size": 4, "trial_name": null, "trial_params": null }