diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,28034 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 53.34782608695652, + "eval_steps": 500, + "global_step": 4000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.013377926421404682, + "grad_norm": 0.5197079181671143, + "learning_rate": 0.0, + "loss": 4.2636, + "step": 1 + }, + { + "epoch": 0.026755852842809364, + "grad_norm": 0.5626901984214783, + "learning_rate": 4e-05, + "loss": 4.3971, + "step": 2 + }, + { + "epoch": 0.04013377926421405, + "grad_norm": 0.5167903304100037, + "learning_rate": 8e-05, + "loss": 4.3249, + "step": 3 + }, + { + "epoch": 0.05351170568561873, + "grad_norm": 0.4764951169490814, + "learning_rate": 0.00012, + "loss": 4.2031, + "step": 4 + }, + { + "epoch": 0.06688963210702341, + "grad_norm": 0.45488491654396057, + "learning_rate": 0.00016, + "loss": 4.3914, + "step": 5 + }, + { + "epoch": 0.0802675585284281, + "grad_norm": 0.568274736404419, + "learning_rate": 0.0002, + "loss": 4.2346, + "step": 6 + }, + { + "epoch": 0.09364548494983277, + "grad_norm": 0.5974003076553345, + "learning_rate": 0.0001999555061179088, + "loss": 4.131, + "step": 7 + }, + { + "epoch": 0.10702341137123746, + "grad_norm": 0.6204471588134766, + "learning_rate": 0.00019991101223581757, + "loss": 4.2256, + "step": 8 + }, + { + "epoch": 0.12040133779264214, + "grad_norm": 0.7143808603286743, + "learning_rate": 0.00019986651835372636, + "loss": 3.8449, + "step": 9 + }, + { + "epoch": 0.13377926421404682, + "grad_norm": 0.7799420356750488, + "learning_rate": 0.00019982202447163517, + "loss": 4.4301, + "step": 10 + }, + { + "epoch": 0.14715719063545152, + "grad_norm": 0.8880407214164734, + "learning_rate": 0.00019977753058954395, + "loss": 4.2266, + "step": 11 + }, + { + "epoch": 0.1605351170568562, + "grad_norm": 0.7776209712028503, + "learning_rate": 0.00019973303670745273, + "loss": 4.3208, + "step": 12 + }, + { + "epoch": 0.17391304347826086, + "grad_norm": 0.9125858545303345, + "learning_rate": 0.0001996885428253615, + "loss": 4.4363, + "step": 13 + }, + { + "epoch": 0.18729096989966554, + "grad_norm": 0.9000256657600403, + "learning_rate": 0.00019964404894327032, + "loss": 4.2917, + "step": 14 + }, + { + "epoch": 0.20066889632107024, + "grad_norm": 0.9995108246803284, + "learning_rate": 0.00019959955506117908, + "loss": 4.1784, + "step": 15 + }, + { + "epoch": 0.2140468227424749, + "grad_norm": 0.9209024310112, + "learning_rate": 0.0001995550611790879, + "loss": 4.7852, + "step": 16 + }, + { + "epoch": 0.22742474916387959, + "grad_norm": 0.9421981573104858, + "learning_rate": 0.00019951056729699667, + "loss": 4.8501, + "step": 17 + }, + { + "epoch": 0.2408026755852843, + "grad_norm": 0.9213201403617859, + "learning_rate": 0.00019946607341490545, + "loss": 4.7923, + "step": 18 + }, + { + "epoch": 0.25418060200668896, + "grad_norm": 0.9378194212913513, + "learning_rate": 0.00019942157953281423, + "loss": 4.9593, + "step": 19 + }, + { + "epoch": 0.26755852842809363, + "grad_norm": 1.0096492767333984, + "learning_rate": 0.00019937708565072304, + "loss": 4.7099, + "step": 20 + }, + { + "epoch": 0.2809364548494983, + "grad_norm": 0.8903587460517883, + "learning_rate": 0.00019933259176863183, + "loss": 4.3746, + "step": 21 + }, + { + "epoch": 0.29431438127090304, + "grad_norm": 0.7808490991592407, + "learning_rate": 0.0001992880978865406, + "loss": 4.5873, + "step": 22 + }, + { + "epoch": 0.3076923076923077, + "grad_norm": 0.8145670294761658, + "learning_rate": 0.0001992436040044494, + "loss": 4.7924, + "step": 23 + }, + { + "epoch": 0.3210702341137124, + "grad_norm": 0.7945849299430847, + "learning_rate": 0.0001991991101223582, + "loss": 4.8881, + "step": 24 + }, + { + "epoch": 0.33444816053511706, + "grad_norm": 0.7871395349502563, + "learning_rate": 0.00019915461624026696, + "loss": 4.6922, + "step": 25 + }, + { + "epoch": 0.34782608695652173, + "grad_norm": 0.9111238718032837, + "learning_rate": 0.00019911012235817577, + "loss": 4.9982, + "step": 26 + }, + { + "epoch": 0.3612040133779264, + "grad_norm": 0.7121369242668152, + "learning_rate": 0.00019906562847608455, + "loss": 4.4756, + "step": 27 + }, + { + "epoch": 0.3745819397993311, + "grad_norm": 0.7118422389030457, + "learning_rate": 0.00019902113459399333, + "loss": 5.1389, + "step": 28 + }, + { + "epoch": 0.3879598662207358, + "grad_norm": 0.7100292444229126, + "learning_rate": 0.0001989766407119021, + "loss": 4.7691, + "step": 29 + }, + { + "epoch": 0.4013377926421405, + "grad_norm": 0.708591639995575, + "learning_rate": 0.00019893214682981092, + "loss": 4.8721, + "step": 30 + }, + { + "epoch": 0.41471571906354515, + "grad_norm": 0.6711616516113281, + "learning_rate": 0.0001988876529477197, + "loss": 4.9152, + "step": 31 + }, + { + "epoch": 0.4280936454849498, + "grad_norm": 0.7158232927322388, + "learning_rate": 0.0001988431590656285, + "loss": 4.828, + "step": 32 + }, + { + "epoch": 0.4414715719063545, + "grad_norm": 0.6246087551116943, + "learning_rate": 0.00019879866518353727, + "loss": 4.8452, + "step": 33 + }, + { + "epoch": 0.45484949832775917, + "grad_norm": 0.6088873744010925, + "learning_rate": 0.00019875417130144608, + "loss": 4.9702, + "step": 34 + }, + { + "epoch": 0.4682274247491639, + "grad_norm": 0.5798126459121704, + "learning_rate": 0.00019870967741935483, + "loss": 4.9838, + "step": 35 + }, + { + "epoch": 0.4816053511705686, + "grad_norm": 0.6268919706344604, + "learning_rate": 0.00019866518353726364, + "loss": 4.7636, + "step": 36 + }, + { + "epoch": 0.49498327759197325, + "grad_norm": 0.5649904012680054, + "learning_rate": 0.00019862068965517243, + "loss": 4.506, + "step": 37 + }, + { + "epoch": 0.5083612040133779, + "grad_norm": 0.5947792530059814, + "learning_rate": 0.0001985761957730812, + "loss": 4.8057, + "step": 38 + }, + { + "epoch": 0.5217391304347826, + "grad_norm": 0.6204257011413574, + "learning_rate": 0.00019853170189099, + "loss": 5.0511, + "step": 39 + }, + { + "epoch": 0.5351170568561873, + "grad_norm": 0.5972265601158142, + "learning_rate": 0.0001984872080088988, + "loss": 4.924, + "step": 40 + }, + { + "epoch": 0.5484949832775919, + "grad_norm": 0.6117077469825745, + "learning_rate": 0.00019844271412680758, + "loss": 4.8729, + "step": 41 + }, + { + "epoch": 0.5618729096989966, + "grad_norm": 0.5085508823394775, + "learning_rate": 0.00019839822024471637, + "loss": 4.3616, + "step": 42 + }, + { + "epoch": 0.5752508361204013, + "grad_norm": 0.550647497177124, + "learning_rate": 0.00019835372636262515, + "loss": 5.2512, + "step": 43 + }, + { + "epoch": 0.5886287625418061, + "grad_norm": 0.48329588770866394, + "learning_rate": 0.00019830923248053396, + "loss": 4.9501, + "step": 44 + }, + { + "epoch": 0.6020066889632107, + "grad_norm": 0.6313246488571167, + "learning_rate": 0.0001982647385984427, + "loss": 4.5767, + "step": 45 + }, + { + "epoch": 0.6153846153846154, + "grad_norm": 0.5111928582191467, + "learning_rate": 0.00019822024471635152, + "loss": 4.5586, + "step": 46 + }, + { + "epoch": 0.6287625418060201, + "grad_norm": 0.5264492630958557, + "learning_rate": 0.0001981757508342603, + "loss": 4.7033, + "step": 47 + }, + { + "epoch": 0.6421404682274248, + "grad_norm": 0.5058289170265198, + "learning_rate": 0.0001981312569521691, + "loss": 4.8396, + "step": 48 + }, + { + "epoch": 0.6555183946488294, + "grad_norm": 0.5688439607620239, + "learning_rate": 0.00019808676307007787, + "loss": 5.1887, + "step": 49 + }, + { + "epoch": 0.6688963210702341, + "grad_norm": 0.5488842129707336, + "learning_rate": 0.00019804226918798665, + "loss": 4.6075, + "step": 50 + }, + { + "epoch": 0.6822742474916388, + "grad_norm": 0.5358632206916809, + "learning_rate": 0.00019799777530589546, + "loss": 5.0205, + "step": 51 + }, + { + "epoch": 0.6956521739130435, + "grad_norm": 0.47869494557380676, + "learning_rate": 0.00019795328142380422, + "loss": 4.8495, + "step": 52 + }, + { + "epoch": 0.7090301003344481, + "grad_norm": 0.49378660321235657, + "learning_rate": 0.00019790878754171303, + "loss": 4.6563, + "step": 53 + }, + { + "epoch": 0.7224080267558528, + "grad_norm": 0.5167868733406067, + "learning_rate": 0.0001978642936596218, + "loss": 5.2558, + "step": 54 + }, + { + "epoch": 0.7357859531772575, + "grad_norm": 0.5230040550231934, + "learning_rate": 0.0001978197997775306, + "loss": 4.7769, + "step": 55 + }, + { + "epoch": 0.7491638795986622, + "grad_norm": 0.4822310507297516, + "learning_rate": 0.00019777530589543937, + "loss": 4.9282, + "step": 56 + }, + { + "epoch": 0.7625418060200669, + "grad_norm": 0.500045895576477, + "learning_rate": 0.00019773081201334818, + "loss": 5.0399, + "step": 57 + }, + { + "epoch": 0.7759197324414716, + "grad_norm": 0.4740642309188843, + "learning_rate": 0.00019768631813125696, + "loss": 4.8041, + "step": 58 + }, + { + "epoch": 0.7892976588628763, + "grad_norm": 0.45918184518814087, + "learning_rate": 0.00019764182424916575, + "loss": 4.6304, + "step": 59 + }, + { + "epoch": 0.802675585284281, + "grad_norm": 0.53122878074646, + "learning_rate": 0.00019759733036707453, + "loss": 4.8377, + "step": 60 + }, + { + "epoch": 0.8160535117056856, + "grad_norm": 0.4925791919231415, + "learning_rate": 0.00019755283648498334, + "loss": 5.0919, + "step": 61 + }, + { + "epoch": 0.8294314381270903, + "grad_norm": 0.4777262806892395, + "learning_rate": 0.0001975083426028921, + "loss": 4.8379, + "step": 62 + }, + { + "epoch": 0.842809364548495, + "grad_norm": 0.49119675159454346, + "learning_rate": 0.0001974638487208009, + "loss": 5.0819, + "step": 63 + }, + { + "epoch": 0.8561872909698997, + "grad_norm": 0.4732685089111328, + "learning_rate": 0.00019741935483870969, + "loss": 4.8948, + "step": 64 + }, + { + "epoch": 0.8695652173913043, + "grad_norm": 0.46269145607948303, + "learning_rate": 0.00019737486095661847, + "loss": 4.824, + "step": 65 + }, + { + "epoch": 0.882943143812709, + "grad_norm": 0.49532708525657654, + "learning_rate": 0.00019733036707452725, + "loss": 4.8986, + "step": 66 + }, + { + "epoch": 0.8963210702341137, + "grad_norm": 0.5253002643585205, + "learning_rate": 0.00019728587319243606, + "loss": 4.9073, + "step": 67 + }, + { + "epoch": 0.9096989966555183, + "grad_norm": 0.5069419145584106, + "learning_rate": 0.00019724137931034484, + "loss": 4.8962, + "step": 68 + }, + { + "epoch": 0.9230769230769231, + "grad_norm": 0.5038817524909973, + "learning_rate": 0.00019719688542825363, + "loss": 4.8711, + "step": 69 + }, + { + "epoch": 0.9364548494983278, + "grad_norm": 0.4987100064754486, + "learning_rate": 0.0001971523915461624, + "loss": 4.8816, + "step": 70 + }, + { + "epoch": 0.9498327759197325, + "grad_norm": 0.47370976209640503, + "learning_rate": 0.00019710789766407122, + "loss": 4.9675, + "step": 71 + }, + { + "epoch": 0.9632107023411371, + "grad_norm": 0.5081727504730225, + "learning_rate": 0.00019706340378197997, + "loss": 4.2768, + "step": 72 + }, + { + "epoch": 0.9765886287625418, + "grad_norm": 0.45571258664131165, + "learning_rate": 0.00019701890989988878, + "loss": 4.6182, + "step": 73 + }, + { + "epoch": 0.9899665551839465, + "grad_norm": 0.5216127634048462, + "learning_rate": 0.00019697441601779756, + "loss": 4.7126, + "step": 74 + }, + { + "epoch": 1.0, + "grad_norm": 0.5393329858779907, + "learning_rate": 0.00019692992213570635, + "loss": 4.4919, + "step": 75 + }, + { + "epoch": 1.0133779264214047, + "grad_norm": 0.4506986737251282, + "learning_rate": 0.00019688542825361513, + "loss": 4.5089, + "step": 76 + }, + { + "epoch": 1.0267558528428093, + "grad_norm": 0.4328899085521698, + "learning_rate": 0.00019684093437152394, + "loss": 4.7518, + "step": 77 + }, + { + "epoch": 1.040133779264214, + "grad_norm": 0.4397362470626831, + "learning_rate": 0.00019679644048943272, + "loss": 4.5069, + "step": 78 + }, + { + "epoch": 1.0535117056856187, + "grad_norm": 0.4604664146900177, + "learning_rate": 0.0001967519466073415, + "loss": 4.7054, + "step": 79 + }, + { + "epoch": 1.0668896321070234, + "grad_norm": 0.4398234784603119, + "learning_rate": 0.00019670745272525029, + "loss": 4.2743, + "step": 80 + }, + { + "epoch": 1.080267558528428, + "grad_norm": 0.4570735692977905, + "learning_rate": 0.0001966629588431591, + "loss": 4.8012, + "step": 81 + }, + { + "epoch": 1.0936454849498327, + "grad_norm": 0.4814144968986511, + "learning_rate": 0.00019661846496106785, + "loss": 4.6449, + "step": 82 + }, + { + "epoch": 1.1070234113712374, + "grad_norm": 0.4526231288909912, + "learning_rate": 0.00019657397107897666, + "loss": 4.5546, + "step": 83 + }, + { + "epoch": 1.120401337792642, + "grad_norm": 0.4847906827926636, + "learning_rate": 0.00019652947719688544, + "loss": 4.4421, + "step": 84 + }, + { + "epoch": 1.1337792642140467, + "grad_norm": 0.5136271715164185, + "learning_rate": 0.00019648498331479422, + "loss": 4.7136, + "step": 85 + }, + { + "epoch": 1.1471571906354514, + "grad_norm": 0.49209895730018616, + "learning_rate": 0.000196440489432703, + "loss": 4.3145, + "step": 86 + }, + { + "epoch": 1.160535117056856, + "grad_norm": 0.4972032904624939, + "learning_rate": 0.00019639599555061182, + "loss": 4.0408, + "step": 87 + }, + { + "epoch": 1.1739130434782608, + "grad_norm": 0.5077862739562988, + "learning_rate": 0.0001963515016685206, + "loss": 4.4074, + "step": 88 + }, + { + "epoch": 1.1872909698996654, + "grad_norm": 0.5293861031532288, + "learning_rate": 0.00019630700778642935, + "loss": 4.5385, + "step": 89 + }, + { + "epoch": 1.2006688963210703, + "grad_norm": 0.5062645673751831, + "learning_rate": 0.00019626251390433816, + "loss": 4.5141, + "step": 90 + }, + { + "epoch": 1.214046822742475, + "grad_norm": 0.49655866622924805, + "learning_rate": 0.00019621802002224695, + "loss": 4.4765, + "step": 91 + }, + { + "epoch": 1.2274247491638797, + "grad_norm": 0.6059755086898804, + "learning_rate": 0.00019617352614015573, + "loss": 4.568, + "step": 92 + }, + { + "epoch": 1.2408026755852843, + "grad_norm": 0.5442761778831482, + "learning_rate": 0.0001961290322580645, + "loss": 4.7724, + "step": 93 + }, + { + "epoch": 1.254180602006689, + "grad_norm": 0.5426056385040283, + "learning_rate": 0.00019608453837597332, + "loss": 4.5308, + "step": 94 + }, + { + "epoch": 1.2675585284280937, + "grad_norm": 0.525372326374054, + "learning_rate": 0.0001960400444938821, + "loss": 4.394, + "step": 95 + }, + { + "epoch": 1.2809364548494984, + "grad_norm": 0.5407588481903076, + "learning_rate": 0.00019599555061179089, + "loss": 4.7347, + "step": 96 + }, + { + "epoch": 1.294314381270903, + "grad_norm": 0.5726659893989563, + "learning_rate": 0.00019595105672969967, + "loss": 4.9446, + "step": 97 + }, + { + "epoch": 1.3076923076923077, + "grad_norm": 0.6211283206939697, + "learning_rate": 0.00019590656284760848, + "loss": 4.697, + "step": 98 + }, + { + "epoch": 1.3210702341137124, + "grad_norm": 0.5627567172050476, + "learning_rate": 0.00019586206896551723, + "loss": 4.4892, + "step": 99 + }, + { + "epoch": 1.334448160535117, + "grad_norm": 0.6174790859222412, + "learning_rate": 0.00019581757508342604, + "loss": 4.5686, + "step": 100 + }, + { + "epoch": 1.3478260869565217, + "grad_norm": 0.5586990118026733, + "learning_rate": 0.00019577308120133482, + "loss": 4.3916, + "step": 101 + }, + { + "epoch": 1.3612040133779264, + "grad_norm": 0.5655365586280823, + "learning_rate": 0.0001957285873192436, + "loss": 4.1789, + "step": 102 + }, + { + "epoch": 1.374581939799331, + "grad_norm": 0.5834594964981079, + "learning_rate": 0.0001956840934371524, + "loss": 4.3316, + "step": 103 + }, + { + "epoch": 1.3879598662207357, + "grad_norm": 0.6065447926521301, + "learning_rate": 0.0001956395995550612, + "loss": 4.5167, + "step": 104 + }, + { + "epoch": 1.4013377926421404, + "grad_norm": 0.5250216722488403, + "learning_rate": 0.00019559510567296998, + "loss": 4.1718, + "step": 105 + }, + { + "epoch": 1.414715719063545, + "grad_norm": 0.5861116051673889, + "learning_rate": 0.00019555061179087876, + "loss": 4.3077, + "step": 106 + }, + { + "epoch": 1.4280936454849498, + "grad_norm": 0.6138104796409607, + "learning_rate": 0.00019550611790878755, + "loss": 4.4748, + "step": 107 + }, + { + "epoch": 1.4414715719063544, + "grad_norm": 0.6742071509361267, + "learning_rate": 0.00019546162402669636, + "loss": 4.8769, + "step": 108 + }, + { + "epoch": 1.4548494983277591, + "grad_norm": 0.6634951233863831, + "learning_rate": 0.0001954171301446051, + "loss": 4.6423, + "step": 109 + }, + { + "epoch": 1.468227424749164, + "grad_norm": 0.626646876335144, + "learning_rate": 0.00019537263626251392, + "loss": 4.4654, + "step": 110 + }, + { + "epoch": 1.4816053511705687, + "grad_norm": 0.6306963562965393, + "learning_rate": 0.0001953281423804227, + "loss": 4.7021, + "step": 111 + }, + { + "epoch": 1.4949832775919734, + "grad_norm": 0.620370626449585, + "learning_rate": 0.00019528364849833149, + "loss": 4.587, + "step": 112 + }, + { + "epoch": 1.508361204013378, + "grad_norm": 0.6410287618637085, + "learning_rate": 0.00019523915461624027, + "loss": 4.8089, + "step": 113 + }, + { + "epoch": 1.5217391304347827, + "grad_norm": 0.676434338092804, + "learning_rate": 0.00019519466073414908, + "loss": 4.668, + "step": 114 + }, + { + "epoch": 1.5351170568561874, + "grad_norm": 0.5756319761276245, + "learning_rate": 0.00019515016685205786, + "loss": 4.3223, + "step": 115 + }, + { + "epoch": 1.548494983277592, + "grad_norm": 0.5850693583488464, + "learning_rate": 0.00019510567296996664, + "loss": 4.2343, + "step": 116 + }, + { + "epoch": 1.5618729096989967, + "grad_norm": 0.6172360777854919, + "learning_rate": 0.00019506117908787542, + "loss": 4.6102, + "step": 117 + }, + { + "epoch": 1.5752508361204014, + "grad_norm": 0.5887568593025208, + "learning_rate": 0.00019501668520578423, + "loss": 4.8097, + "step": 118 + }, + { + "epoch": 1.588628762541806, + "grad_norm": 0.5763369798660278, + "learning_rate": 0.000194972191323693, + "loss": 4.2001, + "step": 119 + }, + { + "epoch": 1.6020066889632107, + "grad_norm": 0.6158986687660217, + "learning_rate": 0.0001949276974416018, + "loss": 4.7075, + "step": 120 + }, + { + "epoch": 1.6153846153846154, + "grad_norm": 0.5540957450866699, + "learning_rate": 0.00019488320355951058, + "loss": 4.452, + "step": 121 + }, + { + "epoch": 1.62876254180602, + "grad_norm": 0.6193795204162598, + "learning_rate": 0.00019483870967741936, + "loss": 4.4583, + "step": 122 + }, + { + "epoch": 1.6421404682274248, + "grad_norm": 0.6699966788291931, + "learning_rate": 0.00019479421579532815, + "loss": 4.3728, + "step": 123 + }, + { + "epoch": 1.6555183946488294, + "grad_norm": 0.5904677510261536, + "learning_rate": 0.00019474972191323696, + "loss": 4.5452, + "step": 124 + }, + { + "epoch": 1.6688963210702341, + "grad_norm": 0.6137760281562805, + "learning_rate": 0.00019470522803114574, + "loss": 4.2853, + "step": 125 + }, + { + "epoch": 1.6822742474916388, + "grad_norm": 0.6396192908287048, + "learning_rate": 0.00019466073414905452, + "loss": 4.4258, + "step": 126 + }, + { + "epoch": 1.6956521739130435, + "grad_norm": 0.6190487742424011, + "learning_rate": 0.0001946162402669633, + "loss": 4.9866, + "step": 127 + }, + { + "epoch": 1.7090301003344481, + "grad_norm": 0.6971675157546997, + "learning_rate": 0.0001945717463848721, + "loss": 4.2126, + "step": 128 + }, + { + "epoch": 1.7224080267558528, + "grad_norm": 0.6245931386947632, + "learning_rate": 0.00019452725250278087, + "loss": 4.8477, + "step": 129 + }, + { + "epoch": 1.7357859531772575, + "grad_norm": 0.5675052404403687, + "learning_rate": 0.00019448275862068965, + "loss": 4.4097, + "step": 130 + }, + { + "epoch": 1.7491638795986622, + "grad_norm": 0.6594040393829346, + "learning_rate": 0.00019443826473859846, + "loss": 4.3747, + "step": 131 + }, + { + "epoch": 1.7625418060200668, + "grad_norm": 0.6377655267715454, + "learning_rate": 0.00019439377085650724, + "loss": 4.2733, + "step": 132 + }, + { + "epoch": 1.7759197324414715, + "grad_norm": 0.6167862415313721, + "learning_rate": 0.00019434927697441602, + "loss": 4.5694, + "step": 133 + }, + { + "epoch": 1.7892976588628762, + "grad_norm": 0.577671468257904, + "learning_rate": 0.0001943047830923248, + "loss": 4.5006, + "step": 134 + }, + { + "epoch": 1.8026755852842808, + "grad_norm": 0.6361016035079956, + "learning_rate": 0.00019426028921023362, + "loss": 4.9907, + "step": 135 + }, + { + "epoch": 1.8160535117056855, + "grad_norm": 0.6445321440696716, + "learning_rate": 0.00019421579532814237, + "loss": 4.779, + "step": 136 + }, + { + "epoch": 1.8294314381270902, + "grad_norm": 0.5955402851104736, + "learning_rate": 0.00019417130144605118, + "loss": 4.6026, + "step": 137 + }, + { + "epoch": 1.8428093645484949, + "grad_norm": 0.6807080507278442, + "learning_rate": 0.00019412680756395996, + "loss": 4.7124, + "step": 138 + }, + { + "epoch": 1.8561872909698995, + "grad_norm": 0.5799288153648376, + "learning_rate": 0.00019408231368186875, + "loss": 4.0701, + "step": 139 + }, + { + "epoch": 1.8695652173913042, + "grad_norm": 0.6187757253646851, + "learning_rate": 0.00019403781979977753, + "loss": 4.705, + "step": 140 + }, + { + "epoch": 1.8829431438127089, + "grad_norm": 0.6614826917648315, + "learning_rate": 0.00019399332591768634, + "loss": 4.8146, + "step": 141 + }, + { + "epoch": 1.8963210702341136, + "grad_norm": 0.6204859614372253, + "learning_rate": 0.00019394883203559512, + "loss": 4.3041, + "step": 142 + }, + { + "epoch": 1.9096989966555182, + "grad_norm": 0.6527450680732727, + "learning_rate": 0.0001939043381535039, + "loss": 4.4493, + "step": 143 + }, + { + "epoch": 1.9230769230769231, + "grad_norm": 0.6470615267753601, + "learning_rate": 0.00019385984427141268, + "loss": 4.7771, + "step": 144 + }, + { + "epoch": 1.9364548494983278, + "grad_norm": 0.5642555952072144, + "learning_rate": 0.0001938153503893215, + "loss": 4.3344, + "step": 145 + }, + { + "epoch": 1.9498327759197325, + "grad_norm": 0.6206467151641846, + "learning_rate": 0.00019377085650723025, + "loss": 4.2191, + "step": 146 + }, + { + "epoch": 1.9632107023411371, + "grad_norm": 0.6079016923904419, + "learning_rate": 0.00019372636262513906, + "loss": 4.7397, + "step": 147 + }, + { + "epoch": 1.9765886287625418, + "grad_norm": 0.6197662353515625, + "learning_rate": 0.00019368186874304784, + "loss": 4.5342, + "step": 148 + }, + { + "epoch": 1.9899665551839465, + "grad_norm": 0.6556297540664673, + "learning_rate": 0.00019363737486095662, + "loss": 4.6709, + "step": 149 + }, + { + "epoch": 2.0, + "grad_norm": 0.7837930917739868, + "learning_rate": 0.0001935928809788654, + "loss": 4.6215, + "step": 150 + }, + { + "epoch": 2.0133779264214047, + "grad_norm": 0.5267267227172852, + "learning_rate": 0.00019354838709677422, + "loss": 4.2695, + "step": 151 + }, + { + "epoch": 2.0267558528428093, + "grad_norm": 0.5862157344818115, + "learning_rate": 0.000193503893214683, + "loss": 4.3702, + "step": 152 + }, + { + "epoch": 2.040133779264214, + "grad_norm": 0.538254976272583, + "learning_rate": 0.00019345939933259178, + "loss": 4.3953, + "step": 153 + }, + { + "epoch": 2.0535117056856187, + "grad_norm": 0.5977053642272949, + "learning_rate": 0.00019341490545050056, + "loss": 4.2156, + "step": 154 + }, + { + "epoch": 2.0668896321070234, + "grad_norm": 0.606006383895874, + "learning_rate": 0.00019337041156840937, + "loss": 4.2802, + "step": 155 + }, + { + "epoch": 2.080267558528428, + "grad_norm": 0.6071277856826782, + "learning_rate": 0.00019332591768631813, + "loss": 4.5545, + "step": 156 + }, + { + "epoch": 2.0936454849498327, + "grad_norm": 0.6281546354293823, + "learning_rate": 0.00019328142380422694, + "loss": 4.6105, + "step": 157 + }, + { + "epoch": 2.1070234113712374, + "grad_norm": 0.5703116655349731, + "learning_rate": 0.00019323692992213572, + "loss": 4.2751, + "step": 158 + }, + { + "epoch": 2.120401337792642, + "grad_norm": 0.6587452292442322, + "learning_rate": 0.0001931924360400445, + "loss": 4.6342, + "step": 159 + }, + { + "epoch": 2.1337792642140467, + "grad_norm": 0.6141905784606934, + "learning_rate": 0.00019314794215795328, + "loss": 4.4345, + "step": 160 + }, + { + "epoch": 2.1471571906354514, + "grad_norm": 0.6741939187049866, + "learning_rate": 0.0001931034482758621, + "loss": 4.0257, + "step": 161 + }, + { + "epoch": 2.160535117056856, + "grad_norm": 0.6468759179115295, + "learning_rate": 0.00019305895439377088, + "loss": 4.2313, + "step": 162 + }, + { + "epoch": 2.1739130434782608, + "grad_norm": 0.6703383326530457, + "learning_rate": 0.00019301446051167966, + "loss": 4.2164, + "step": 163 + }, + { + "epoch": 2.1872909698996654, + "grad_norm": 0.710967481136322, + "learning_rate": 0.00019296996662958844, + "loss": 4.3398, + "step": 164 + }, + { + "epoch": 2.20066889632107, + "grad_norm": 0.6862124800682068, + "learning_rate": 0.00019292547274749725, + "loss": 4.3379, + "step": 165 + }, + { + "epoch": 2.2140468227424748, + "grad_norm": 0.6288430690765381, + "learning_rate": 0.000192880978865406, + "loss": 4.3487, + "step": 166 + }, + { + "epoch": 2.2274247491638794, + "grad_norm": 0.6358796954154968, + "learning_rate": 0.00019283648498331481, + "loss": 4.1656, + "step": 167 + }, + { + "epoch": 2.240802675585284, + "grad_norm": 0.6818917393684387, + "learning_rate": 0.0001927919911012236, + "loss": 4.5363, + "step": 168 + }, + { + "epoch": 2.254180602006689, + "grad_norm": 0.6996105313301086, + "learning_rate": 0.00019274749721913238, + "loss": 4.3208, + "step": 169 + }, + { + "epoch": 2.2675585284280935, + "grad_norm": 0.6730326414108276, + "learning_rate": 0.00019270300333704116, + "loss": 4.1401, + "step": 170 + }, + { + "epoch": 2.280936454849498, + "grad_norm": 0.7022603750228882, + "learning_rate": 0.00019265850945494994, + "loss": 4.5761, + "step": 171 + }, + { + "epoch": 2.294314381270903, + "grad_norm": 0.6525995135307312, + "learning_rate": 0.00019261401557285875, + "loss": 4.4017, + "step": 172 + }, + { + "epoch": 2.3076923076923075, + "grad_norm": 0.7066033482551575, + "learning_rate": 0.0001925695216907675, + "loss": 4.0037, + "step": 173 + }, + { + "epoch": 2.321070234113712, + "grad_norm": 0.6708059310913086, + "learning_rate": 0.00019252502780867632, + "loss": 4.1947, + "step": 174 + }, + { + "epoch": 2.334448160535117, + "grad_norm": 0.8711172342300415, + "learning_rate": 0.0001924805339265851, + "loss": 3.9958, + "step": 175 + }, + { + "epoch": 2.3478260869565215, + "grad_norm": 0.7258634567260742, + "learning_rate": 0.00019243604004449388, + "loss": 4.4682, + "step": 176 + }, + { + "epoch": 2.361204013377926, + "grad_norm": 0.7693021893501282, + "learning_rate": 0.00019239154616240267, + "loss": 4.54, + "step": 177 + }, + { + "epoch": 2.374581939799331, + "grad_norm": 0.7271276116371155, + "learning_rate": 0.00019234705228031148, + "loss": 4.2942, + "step": 178 + }, + { + "epoch": 2.387959866220736, + "grad_norm": 0.6836609244346619, + "learning_rate": 0.00019230255839822026, + "loss": 4.3099, + "step": 179 + }, + { + "epoch": 2.4013377926421406, + "grad_norm": 0.731164813041687, + "learning_rate": 0.00019225806451612904, + "loss": 4.4077, + "step": 180 + }, + { + "epoch": 2.4147157190635453, + "grad_norm": 0.7575274109840393, + "learning_rate": 0.00019221357063403782, + "loss": 4.6572, + "step": 181 + }, + { + "epoch": 2.42809364548495, + "grad_norm": 0.8461325168609619, + "learning_rate": 0.00019216907675194663, + "loss": 4.4922, + "step": 182 + }, + { + "epoch": 2.4414715719063547, + "grad_norm": 0.7225251197814941, + "learning_rate": 0.0001921245828698554, + "loss": 4.0372, + "step": 183 + }, + { + "epoch": 2.4548494983277593, + "grad_norm": 3.563720703125, + "learning_rate": 0.0001920800889877642, + "loss": 4.5412, + "step": 184 + }, + { + "epoch": 2.468227424749164, + "grad_norm": 0.8452121019363403, + "learning_rate": 0.00019203559510567298, + "loss": 4.4961, + "step": 185 + }, + { + "epoch": 2.4816053511705687, + "grad_norm": 0.8734024167060852, + "learning_rate": 0.00019199110122358176, + "loss": 4.0884, + "step": 186 + }, + { + "epoch": 2.4949832775919734, + "grad_norm": 1.1765823364257812, + "learning_rate": 0.00019194660734149054, + "loss": 4.2228, + "step": 187 + }, + { + "epoch": 2.508361204013378, + "grad_norm": 0.750206708908081, + "learning_rate": 0.00019190211345939935, + "loss": 4.4305, + "step": 188 + }, + { + "epoch": 2.5217391304347827, + "grad_norm": 0.7574430704116821, + "learning_rate": 0.00019185761957730814, + "loss": 4.1511, + "step": 189 + }, + { + "epoch": 2.5351170568561874, + "grad_norm": 0.7105517387390137, + "learning_rate": 0.00019181312569521692, + "loss": 4.4793, + "step": 190 + }, + { + "epoch": 2.548494983277592, + "grad_norm": 0.7495557069778442, + "learning_rate": 0.0001917686318131257, + "loss": 4.1335, + "step": 191 + }, + { + "epoch": 2.5618729096989967, + "grad_norm": 0.8001168966293335, + "learning_rate": 0.0001917241379310345, + "loss": 4.7898, + "step": 192 + }, + { + "epoch": 2.5752508361204014, + "grad_norm": 0.7402104735374451, + "learning_rate": 0.00019167964404894327, + "loss": 4.4482, + "step": 193 + }, + { + "epoch": 2.588628762541806, + "grad_norm": 0.748267650604248, + "learning_rate": 0.00019163515016685207, + "loss": 4.3167, + "step": 194 + }, + { + "epoch": 2.6020066889632107, + "grad_norm": 0.8291250467300415, + "learning_rate": 0.00019159065628476086, + "loss": 4.058, + "step": 195 + }, + { + "epoch": 2.6153846153846154, + "grad_norm": 0.6945542693138123, + "learning_rate": 0.00019154616240266964, + "loss": 3.9751, + "step": 196 + }, + { + "epoch": 2.62876254180602, + "grad_norm": 0.7307319045066833, + "learning_rate": 0.00019150166852057842, + "loss": 4.2736, + "step": 197 + }, + { + "epoch": 2.6421404682274248, + "grad_norm": 0.7489168047904968, + "learning_rate": 0.00019145717463848723, + "loss": 4.3075, + "step": 198 + }, + { + "epoch": 2.6555183946488294, + "grad_norm": 0.9727582931518555, + "learning_rate": 0.00019141268075639601, + "loss": 4.6474, + "step": 199 + }, + { + "epoch": 2.668896321070234, + "grad_norm": 0.6776256561279297, + "learning_rate": 0.0001913681868743048, + "loss": 4.4217, + "step": 200 + }, + { + "epoch": 2.682274247491639, + "grad_norm": 0.7305111885070801, + "learning_rate": 0.00019132369299221358, + "loss": 4.2804, + "step": 201 + }, + { + "epoch": 2.6956521739130435, + "grad_norm": 0.7196978330612183, + "learning_rate": 0.0001912791991101224, + "loss": 4.3941, + "step": 202 + }, + { + "epoch": 2.709030100334448, + "grad_norm": 0.7988458871841431, + "learning_rate": 0.00019123470522803114, + "loss": 4.437, + "step": 203 + }, + { + "epoch": 2.722408026755853, + "grad_norm": 0.7004797458648682, + "learning_rate": 0.00019119021134593995, + "loss": 4.4986, + "step": 204 + }, + { + "epoch": 2.7357859531772575, + "grad_norm": 0.677796483039856, + "learning_rate": 0.00019114571746384874, + "loss": 4.0851, + "step": 205 + }, + { + "epoch": 2.749163879598662, + "grad_norm": 0.7527475357055664, + "learning_rate": 0.00019110122358175752, + "loss": 4.4469, + "step": 206 + }, + { + "epoch": 2.762541806020067, + "grad_norm": 1.1659115552902222, + "learning_rate": 0.0001910567296996663, + "loss": 4.3284, + "step": 207 + }, + { + "epoch": 2.7759197324414715, + "grad_norm": 0.7238364815711975, + "learning_rate": 0.0001910122358175751, + "loss": 4.2605, + "step": 208 + }, + { + "epoch": 2.789297658862876, + "grad_norm": 0.7537760734558105, + "learning_rate": 0.0001909677419354839, + "loss": 4.3775, + "step": 209 + }, + { + "epoch": 2.802675585284281, + "grad_norm": 0.6874127388000488, + "learning_rate": 0.00019092324805339267, + "loss": 4.3404, + "step": 210 + }, + { + "epoch": 2.8160535117056855, + "grad_norm": 0.7045959830284119, + "learning_rate": 0.00019087875417130146, + "loss": 4.1568, + "step": 211 + }, + { + "epoch": 2.82943143812709, + "grad_norm": 0.7249194383621216, + "learning_rate": 0.00019083426028921027, + "loss": 4.1969, + "step": 212 + }, + { + "epoch": 2.842809364548495, + "grad_norm": 0.8331268429756165, + "learning_rate": 0.00019078976640711902, + "loss": 4.3169, + "step": 213 + }, + { + "epoch": 2.8561872909698995, + "grad_norm": 0.7171936631202698, + "learning_rate": 0.0001907452725250278, + "loss": 4.5123, + "step": 214 + }, + { + "epoch": 2.869565217391304, + "grad_norm": 0.759919285774231, + "learning_rate": 0.0001907007786429366, + "loss": 4.5412, + "step": 215 + }, + { + "epoch": 2.882943143812709, + "grad_norm": 0.7451274991035461, + "learning_rate": 0.0001906562847608454, + "loss": 4.5253, + "step": 216 + }, + { + "epoch": 2.8963210702341136, + "grad_norm": 0.6564481258392334, + "learning_rate": 0.00019061179087875418, + "loss": 4.1092, + "step": 217 + }, + { + "epoch": 2.9096989966555182, + "grad_norm": 0.7339865565299988, + "learning_rate": 0.00019056729699666296, + "loss": 4.5092, + "step": 218 + }, + { + "epoch": 2.9230769230769234, + "grad_norm": 0.7113937735557556, + "learning_rate": 0.00019052280311457177, + "loss": 4.3355, + "step": 219 + }, + { + "epoch": 2.936454849498328, + "grad_norm": 0.7306456565856934, + "learning_rate": 0.00019047830923248053, + "loss": 4.5745, + "step": 220 + }, + { + "epoch": 2.9498327759197327, + "grad_norm": 0.7971818447113037, + "learning_rate": 0.00019043381535038933, + "loss": 4.2903, + "step": 221 + }, + { + "epoch": 2.9632107023411374, + "grad_norm": 0.7757331728935242, + "learning_rate": 0.00019038932146829812, + "loss": 4.2832, + "step": 222 + }, + { + "epoch": 2.976588628762542, + "grad_norm": 0.7326288223266602, + "learning_rate": 0.0001903448275862069, + "loss": 4.2444, + "step": 223 + }, + { + "epoch": 2.9899665551839467, + "grad_norm": 0.7363834381103516, + "learning_rate": 0.00019030033370411568, + "loss": 4.6744, + "step": 224 + }, + { + "epoch": 3.0, + "grad_norm": 0.8835271596908569, + "learning_rate": 0.0001902558398220245, + "loss": 4.432, + "step": 225 + }, + { + "epoch": 3.0133779264214047, + "grad_norm": 0.6591921448707581, + "learning_rate": 0.00019021134593993327, + "loss": 4.1353, + "step": 226 + }, + { + "epoch": 3.0267558528428093, + "grad_norm": 0.6895263195037842, + "learning_rate": 0.00019016685205784206, + "loss": 4.1253, + "step": 227 + }, + { + "epoch": 3.040133779264214, + "grad_norm": 0.6476898789405823, + "learning_rate": 0.00019012235817575084, + "loss": 4.0354, + "step": 228 + }, + { + "epoch": 3.0535117056856187, + "grad_norm": 0.6398957967758179, + "learning_rate": 0.00019007786429365965, + "loss": 4.024, + "step": 229 + }, + { + "epoch": 3.0668896321070234, + "grad_norm": 0.7483389973640442, + "learning_rate": 0.0001900333704115684, + "loss": 4.1405, + "step": 230 + }, + { + "epoch": 3.080267558528428, + "grad_norm": 0.7003724575042725, + "learning_rate": 0.0001899888765294772, + "loss": 4.3593, + "step": 231 + }, + { + "epoch": 3.0936454849498327, + "grad_norm": 0.7426732182502747, + "learning_rate": 0.000189944382647386, + "loss": 4.485, + "step": 232 + }, + { + "epoch": 3.1070234113712374, + "grad_norm": 0.6957541108131409, + "learning_rate": 0.00018989988876529478, + "loss": 4.1017, + "step": 233 + }, + { + "epoch": 3.120401337792642, + "grad_norm": 0.8613067865371704, + "learning_rate": 0.00018985539488320356, + "loss": 4.3038, + "step": 234 + }, + { + "epoch": 3.1337792642140467, + "grad_norm": 0.8375754952430725, + "learning_rate": 0.00018981090100111237, + "loss": 4.4356, + "step": 235 + }, + { + "epoch": 3.1471571906354514, + "grad_norm": 0.7878522872924805, + "learning_rate": 0.00018976640711902115, + "loss": 3.9916, + "step": 236 + }, + { + "epoch": 3.160535117056856, + "grad_norm": 0.7463901042938232, + "learning_rate": 0.00018972191323692993, + "loss": 3.6761, + "step": 237 + }, + { + "epoch": 3.1739130434782608, + "grad_norm": 0.7360939979553223, + "learning_rate": 0.00018967741935483872, + "loss": 3.9573, + "step": 238 + }, + { + "epoch": 3.1872909698996654, + "grad_norm": 0.891861081123352, + "learning_rate": 0.00018963292547274753, + "loss": 4.1853, + "step": 239 + }, + { + "epoch": 3.20066889632107, + "grad_norm": 0.8589549660682678, + "learning_rate": 0.00018958843159065628, + "loss": 4.0679, + "step": 240 + }, + { + "epoch": 3.2140468227424748, + "grad_norm": 0.9534163475036621, + "learning_rate": 0.0001895439377085651, + "loss": 3.732, + "step": 241 + }, + { + "epoch": 3.2274247491638794, + "grad_norm": 0.8968185186386108, + "learning_rate": 0.00018949944382647387, + "loss": 4.2217, + "step": 242 + }, + { + "epoch": 3.240802675585284, + "grad_norm": 0.81589275598526, + "learning_rate": 0.00018945494994438266, + "loss": 4.428, + "step": 243 + }, + { + "epoch": 3.254180602006689, + "grad_norm": 0.929050862789154, + "learning_rate": 0.00018941045606229144, + "loss": 4.3468, + "step": 244 + }, + { + "epoch": 3.2675585284280935, + "grad_norm": 0.8535035252571106, + "learning_rate": 0.00018936596218020025, + "loss": 3.8489, + "step": 245 + }, + { + "epoch": 3.280936454849498, + "grad_norm": 0.9484681487083435, + "learning_rate": 0.00018932146829810903, + "loss": 4.0132, + "step": 246 + }, + { + "epoch": 3.294314381270903, + "grad_norm": 0.8190047144889832, + "learning_rate": 0.0001892769744160178, + "loss": 4.3574, + "step": 247 + }, + { + "epoch": 3.3076923076923075, + "grad_norm": 0.8764749765396118, + "learning_rate": 0.0001892324805339266, + "loss": 4.3103, + "step": 248 + }, + { + "epoch": 3.321070234113712, + "grad_norm": 0.8929185271263123, + "learning_rate": 0.0001891879866518354, + "loss": 4.3606, + "step": 249 + }, + { + "epoch": 3.334448160535117, + "grad_norm": 0.9096692204475403, + "learning_rate": 0.00018914349276974416, + "loss": 4.0047, + "step": 250 + }, + { + "epoch": 3.3478260869565215, + "grad_norm": 0.885143518447876, + "learning_rate": 0.00018909899888765297, + "loss": 4.182, + "step": 251 + }, + { + "epoch": 3.361204013377926, + "grad_norm": 0.7724215984344482, + "learning_rate": 0.00018905450500556175, + "loss": 3.9529, + "step": 252 + }, + { + "epoch": 3.374581939799331, + "grad_norm": 0.8351865410804749, + "learning_rate": 0.00018901001112347053, + "loss": 3.9533, + "step": 253 + }, + { + "epoch": 3.387959866220736, + "grad_norm": 0.8684999942779541, + "learning_rate": 0.00018896551724137932, + "loss": 3.8594, + "step": 254 + }, + { + "epoch": 3.4013377926421406, + "grad_norm": 0.8903334736824036, + "learning_rate": 0.0001889210233592881, + "loss": 3.9248, + "step": 255 + }, + { + "epoch": 3.4147157190635453, + "grad_norm": 0.826690137386322, + "learning_rate": 0.0001888765294771969, + "loss": 4.0389, + "step": 256 + }, + { + "epoch": 3.42809364548495, + "grad_norm": 0.8306142687797546, + "learning_rate": 0.00018883203559510566, + "loss": 3.8168, + "step": 257 + }, + { + "epoch": 3.4414715719063547, + "grad_norm": 0.9032199382781982, + "learning_rate": 0.00018878754171301447, + "loss": 4.178, + "step": 258 + }, + { + "epoch": 3.4548494983277593, + "grad_norm": 0.9081966280937195, + "learning_rate": 0.00018874304783092326, + "loss": 4.2583, + "step": 259 + }, + { + "epoch": 3.468227424749164, + "grad_norm": 0.8424077033996582, + "learning_rate": 0.00018869855394883204, + "loss": 4.3285, + "step": 260 + }, + { + "epoch": 3.4816053511705687, + "grad_norm": 0.8302170038223267, + "learning_rate": 0.00018865406006674082, + "loss": 4.1346, + "step": 261 + }, + { + "epoch": 3.4949832775919734, + "grad_norm": 0.8747193217277527, + "learning_rate": 0.00018860956618464963, + "loss": 4.0747, + "step": 262 + }, + { + "epoch": 3.508361204013378, + "grad_norm": 0.8613927364349365, + "learning_rate": 0.0001885650723025584, + "loss": 4.2346, + "step": 263 + }, + { + "epoch": 3.5217391304347827, + "grad_norm": 0.8321558833122253, + "learning_rate": 0.0001885205784204672, + "loss": 3.9781, + "step": 264 + }, + { + "epoch": 3.5351170568561874, + "grad_norm": 0.8961741328239441, + "learning_rate": 0.00018847608453837598, + "loss": 4.311, + "step": 265 + }, + { + "epoch": 3.548494983277592, + "grad_norm": 0.7703898549079895, + "learning_rate": 0.00018843159065628479, + "loss": 4.1163, + "step": 266 + }, + { + "epoch": 3.5618729096989967, + "grad_norm": 0.880051851272583, + "learning_rate": 0.00018838709677419354, + "loss": 3.8032, + "step": 267 + }, + { + "epoch": 3.5752508361204014, + "grad_norm": 0.8287038207054138, + "learning_rate": 0.00018834260289210235, + "loss": 4.1627, + "step": 268 + }, + { + "epoch": 3.588628762541806, + "grad_norm": 0.9726569652557373, + "learning_rate": 0.00018829810901001113, + "loss": 4.4055, + "step": 269 + }, + { + "epoch": 3.6020066889632107, + "grad_norm": 0.8071132898330688, + "learning_rate": 0.00018825361512791992, + "loss": 4.1709, + "step": 270 + }, + { + "epoch": 3.6153846153846154, + "grad_norm": 0.8310988545417786, + "learning_rate": 0.0001882091212458287, + "loss": 4.2359, + "step": 271 + }, + { + "epoch": 3.62876254180602, + "grad_norm": 0.8713561296463013, + "learning_rate": 0.0001881646273637375, + "loss": 4.1247, + "step": 272 + }, + { + "epoch": 3.6421404682274248, + "grad_norm": 0.8964342474937439, + "learning_rate": 0.0001881201334816463, + "loss": 4.0794, + "step": 273 + }, + { + "epoch": 3.6555183946488294, + "grad_norm": 0.9901681542396545, + "learning_rate": 0.00018807563959955507, + "loss": 4.0217, + "step": 274 + }, + { + "epoch": 3.668896321070234, + "grad_norm": 0.9279042482376099, + "learning_rate": 0.00018803114571746385, + "loss": 4.3244, + "step": 275 + }, + { + "epoch": 3.682274247491639, + "grad_norm": 0.8105964660644531, + "learning_rate": 0.00018798665183537266, + "loss": 3.9041, + "step": 276 + }, + { + "epoch": 3.6956521739130435, + "grad_norm": 0.8511622548103333, + "learning_rate": 0.00018794215795328142, + "loss": 3.8969, + "step": 277 + }, + { + "epoch": 3.709030100334448, + "grad_norm": 0.9072037935256958, + "learning_rate": 0.00018789766407119023, + "loss": 4.2185, + "step": 278 + }, + { + "epoch": 3.722408026755853, + "grad_norm": 0.9792962670326233, + "learning_rate": 0.000187853170189099, + "loss": 4.1915, + "step": 279 + }, + { + "epoch": 3.7357859531772575, + "grad_norm": 0.8579828143119812, + "learning_rate": 0.0001878086763070078, + "loss": 3.8903, + "step": 280 + }, + { + "epoch": 3.749163879598662, + "grad_norm": 0.9866719841957092, + "learning_rate": 0.00018776418242491658, + "loss": 4.2022, + "step": 281 + }, + { + "epoch": 3.762541806020067, + "grad_norm": 0.9251964688301086, + "learning_rate": 0.00018771968854282539, + "loss": 3.9536, + "step": 282 + }, + { + "epoch": 3.7759197324414715, + "grad_norm": 1.0300836563110352, + "learning_rate": 0.00018767519466073417, + "loss": 4.2908, + "step": 283 + }, + { + "epoch": 3.789297658862876, + "grad_norm": 1.0194575786590576, + "learning_rate": 0.00018763070077864295, + "loss": 4.1851, + "step": 284 + }, + { + "epoch": 3.802675585284281, + "grad_norm": 0.8165330290794373, + "learning_rate": 0.00018758620689655173, + "loss": 4.138, + "step": 285 + }, + { + "epoch": 3.8160535117056855, + "grad_norm": 1.0104280710220337, + "learning_rate": 0.00018754171301446054, + "loss": 3.9481, + "step": 286 + }, + { + "epoch": 3.82943143812709, + "grad_norm": 0.9972538352012634, + "learning_rate": 0.0001874972191323693, + "loss": 4.3932, + "step": 287 + }, + { + "epoch": 3.842809364548495, + "grad_norm": 0.96323162317276, + "learning_rate": 0.0001874527252502781, + "loss": 4.1133, + "step": 288 + }, + { + "epoch": 3.8561872909698995, + "grad_norm": 0.8500615954399109, + "learning_rate": 0.0001874082313681869, + "loss": 4.2205, + "step": 289 + }, + { + "epoch": 3.869565217391304, + "grad_norm": 0.8451250195503235, + "learning_rate": 0.00018736373748609567, + "loss": 4.1371, + "step": 290 + }, + { + "epoch": 3.882943143812709, + "grad_norm": 0.9399815201759338, + "learning_rate": 0.00018731924360400445, + "loss": 4.5237, + "step": 291 + }, + { + "epoch": 3.8963210702341136, + "grad_norm": 0.8061622977256775, + "learning_rate": 0.00018727474972191326, + "loss": 4.1033, + "step": 292 + }, + { + "epoch": 3.9096989966555182, + "grad_norm": 0.7987121343612671, + "learning_rate": 0.00018723025583982205, + "loss": 3.9311, + "step": 293 + }, + { + "epoch": 3.9230769230769234, + "grad_norm": 0.9041138291358948, + "learning_rate": 0.00018718576195773083, + "loss": 4.0252, + "step": 294 + }, + { + "epoch": 3.936454849498328, + "grad_norm": 1.0002484321594238, + "learning_rate": 0.0001871412680756396, + "loss": 4.4605, + "step": 295 + }, + { + "epoch": 3.9498327759197327, + "grad_norm": 0.9991098046302795, + "learning_rate": 0.0001870967741935484, + "loss": 4.1528, + "step": 296 + }, + { + "epoch": 3.9632107023411374, + "grad_norm": 1.2179397344589233, + "learning_rate": 0.00018705228031145718, + "loss": 4.5224, + "step": 297 + }, + { + "epoch": 3.976588628762542, + "grad_norm": 0.8279774785041809, + "learning_rate": 0.00018700778642936596, + "loss": 3.9464, + "step": 298 + }, + { + "epoch": 3.9899665551839467, + "grad_norm": 0.8012803792953491, + "learning_rate": 0.00018696329254727477, + "loss": 4.0139, + "step": 299 + }, + { + "epoch": 4.0, + "grad_norm": 0.9700272083282471, + "learning_rate": 0.00018691879866518355, + "loss": 3.8306, + "step": 300 + }, + { + "epoch": 4.013377926421405, + "grad_norm": 0.7136749625205994, + "learning_rate": 0.00018687430478309233, + "loss": 3.9253, + "step": 301 + }, + { + "epoch": 4.026755852842809, + "grad_norm": 0.7885096669197083, + "learning_rate": 0.00018682981090100111, + "loss": 3.927, + "step": 302 + }, + { + "epoch": 4.040133779264214, + "grad_norm": 0.7801666855812073, + "learning_rate": 0.00018678531701890992, + "loss": 3.6482, + "step": 303 + }, + { + "epoch": 4.053511705685619, + "grad_norm": 0.7857955098152161, + "learning_rate": 0.00018674082313681868, + "loss": 4.0665, + "step": 304 + }, + { + "epoch": 4.066889632107023, + "grad_norm": 0.707421064376831, + "learning_rate": 0.0001866963292547275, + "loss": 3.9142, + "step": 305 + }, + { + "epoch": 4.080267558528428, + "grad_norm": 0.7936912775039673, + "learning_rate": 0.00018665183537263627, + "loss": 4.1227, + "step": 306 + }, + { + "epoch": 4.093645484949833, + "grad_norm": 0.8899754881858826, + "learning_rate": 0.00018660734149054505, + "loss": 3.7661, + "step": 307 + }, + { + "epoch": 4.107023411371237, + "grad_norm": 0.7760347723960876, + "learning_rate": 0.00018656284760845384, + "loss": 3.8921, + "step": 308 + }, + { + "epoch": 4.120401337792642, + "grad_norm": 0.8672968745231628, + "learning_rate": 0.00018651835372636265, + "loss": 3.6037, + "step": 309 + }, + { + "epoch": 4.133779264214047, + "grad_norm": 0.8046863675117493, + "learning_rate": 0.0001864738598442714, + "loss": 3.9117, + "step": 310 + }, + { + "epoch": 4.147157190635451, + "grad_norm": 0.9172897934913635, + "learning_rate": 0.0001864293659621802, + "loss": 3.7229, + "step": 311 + }, + { + "epoch": 4.160535117056856, + "grad_norm": 0.9616653919219971, + "learning_rate": 0.000186384872080089, + "loss": 3.8851, + "step": 312 + }, + { + "epoch": 4.173913043478261, + "grad_norm": 0.9659278988838196, + "learning_rate": 0.0001863403781979978, + "loss": 4.005, + "step": 313 + }, + { + "epoch": 4.187290969899665, + "grad_norm": 0.9171205163002014, + "learning_rate": 0.00018629588431590656, + "loss": 3.8634, + "step": 314 + }, + { + "epoch": 4.20066889632107, + "grad_norm": 0.9968683123588562, + "learning_rate": 0.00018625139043381537, + "loss": 3.7321, + "step": 315 + }, + { + "epoch": 4.214046822742475, + "grad_norm": 0.8762083053588867, + "learning_rate": 0.00018620689655172415, + "loss": 3.931, + "step": 316 + }, + { + "epoch": 4.2274247491638794, + "grad_norm": 0.9815887212753296, + "learning_rate": 0.00018616240266963293, + "loss": 3.9975, + "step": 317 + }, + { + "epoch": 4.240802675585284, + "grad_norm": 1.0065505504608154, + "learning_rate": 0.00018611790878754171, + "loss": 3.8364, + "step": 318 + }, + { + "epoch": 4.254180602006689, + "grad_norm": 0.9785431623458862, + "learning_rate": 0.00018607341490545052, + "loss": 3.8822, + "step": 319 + }, + { + "epoch": 4.2675585284280935, + "grad_norm": 1.077799677848816, + "learning_rate": 0.00018602892102335928, + "loss": 3.8299, + "step": 320 + }, + { + "epoch": 4.280936454849498, + "grad_norm": 0.8109619617462158, + "learning_rate": 0.0001859844271412681, + "loss": 3.8096, + "step": 321 + }, + { + "epoch": 4.294314381270903, + "grad_norm": 0.967856764793396, + "learning_rate": 0.00018593993325917687, + "loss": 3.8639, + "step": 322 + }, + { + "epoch": 4.3076923076923075, + "grad_norm": 0.8657905459403992, + "learning_rate": 0.00018589543937708568, + "loss": 3.7556, + "step": 323 + }, + { + "epoch": 4.321070234113712, + "grad_norm": 0.9641517400741577, + "learning_rate": 0.00018585094549499444, + "loss": 3.9702, + "step": 324 + }, + { + "epoch": 4.334448160535117, + "grad_norm": 0.9664435982704163, + "learning_rate": 0.00018580645161290325, + "loss": 3.8754, + "step": 325 + }, + { + "epoch": 4.3478260869565215, + "grad_norm": 0.8322617411613464, + "learning_rate": 0.00018576195773081203, + "loss": 3.83, + "step": 326 + }, + { + "epoch": 4.361204013377926, + "grad_norm": 1.0363450050354004, + "learning_rate": 0.0001857174638487208, + "loss": 3.9825, + "step": 327 + }, + { + "epoch": 4.374581939799331, + "grad_norm": 1.0125840902328491, + "learning_rate": 0.0001856729699666296, + "loss": 3.6525, + "step": 328 + }, + { + "epoch": 4.3879598662207355, + "grad_norm": 0.9922601580619812, + "learning_rate": 0.0001856284760845384, + "loss": 4.2373, + "step": 329 + }, + { + "epoch": 4.40133779264214, + "grad_norm": 0.9070426225662231, + "learning_rate": 0.00018558398220244716, + "loss": 3.9623, + "step": 330 + }, + { + "epoch": 4.414715719063545, + "grad_norm": 0.9369637370109558, + "learning_rate": 0.00018553948832035597, + "loss": 3.9297, + "step": 331 + }, + { + "epoch": 4.4280936454849495, + "grad_norm": 1.108876347541809, + "learning_rate": 0.00018549499443826475, + "loss": 3.7325, + "step": 332 + }, + { + "epoch": 4.441471571906354, + "grad_norm": 0.9405660629272461, + "learning_rate": 0.00018545050055617356, + "loss": 3.8615, + "step": 333 + }, + { + "epoch": 4.454849498327759, + "grad_norm": 0.9730128645896912, + "learning_rate": 0.00018540600667408231, + "loss": 4.1794, + "step": 334 + }, + { + "epoch": 4.468227424749164, + "grad_norm": 0.9341335892677307, + "learning_rate": 0.00018536151279199112, + "loss": 3.9422, + "step": 335 + }, + { + "epoch": 4.481605351170568, + "grad_norm": 0.9262625575065613, + "learning_rate": 0.0001853170189098999, + "loss": 3.9819, + "step": 336 + }, + { + "epoch": 4.494983277591973, + "grad_norm": 1.0419141054153442, + "learning_rate": 0.00018527252502780866, + "loss": 3.7481, + "step": 337 + }, + { + "epoch": 4.508361204013378, + "grad_norm": 0.8986826539039612, + "learning_rate": 0.00018522803114571747, + "loss": 4.1195, + "step": 338 + }, + { + "epoch": 4.521739130434782, + "grad_norm": 0.9502431154251099, + "learning_rate": 0.00018518353726362625, + "loss": 3.8521, + "step": 339 + }, + { + "epoch": 4.535117056856187, + "grad_norm": 0.8936267495155334, + "learning_rate": 0.00018513904338153504, + "loss": 3.647, + "step": 340 + }, + { + "epoch": 4.548494983277592, + "grad_norm": 0.8870158195495605, + "learning_rate": 0.00018509454949944382, + "loss": 3.7624, + "step": 341 + }, + { + "epoch": 4.561872909698996, + "grad_norm": 0.9030978679656982, + "learning_rate": 0.00018505005561735263, + "loss": 3.8018, + "step": 342 + }, + { + "epoch": 4.575250836120401, + "grad_norm": 0.8690946698188782, + "learning_rate": 0.0001850055617352614, + "loss": 3.6183, + "step": 343 + }, + { + "epoch": 4.588628762541806, + "grad_norm": 0.9812071323394775, + "learning_rate": 0.0001849610678531702, + "loss": 4.1231, + "step": 344 + }, + { + "epoch": 4.602006688963211, + "grad_norm": 0.9404383301734924, + "learning_rate": 0.00018491657397107897, + "loss": 3.9645, + "step": 345 + }, + { + "epoch": 4.615384615384615, + "grad_norm": 1.0422123670578003, + "learning_rate": 0.00018487208008898778, + "loss": 3.9031, + "step": 346 + }, + { + "epoch": 4.6287625418060205, + "grad_norm": 0.9838129281997681, + "learning_rate": 0.00018482758620689654, + "loss": 3.9985, + "step": 347 + }, + { + "epoch": 4.642140468227424, + "grad_norm": 0.9232532978057861, + "learning_rate": 0.00018478309232480535, + "loss": 4.0343, + "step": 348 + }, + { + "epoch": 4.65551839464883, + "grad_norm": 0.9242956042289734, + "learning_rate": 0.00018473859844271413, + "loss": 4.0669, + "step": 349 + }, + { + "epoch": 4.668896321070234, + "grad_norm": 0.919269859790802, + "learning_rate": 0.0001846941045606229, + "loss": 4.0549, + "step": 350 + }, + { + "epoch": 4.682274247491639, + "grad_norm": 0.93565833568573, + "learning_rate": 0.0001846496106785317, + "loss": 4.1306, + "step": 351 + }, + { + "epoch": 4.695652173913043, + "grad_norm": 0.9001899361610413, + "learning_rate": 0.0001846051167964405, + "loss": 3.8916, + "step": 352 + }, + { + "epoch": 4.709030100334449, + "grad_norm": 0.8896821737289429, + "learning_rate": 0.0001845606229143493, + "loss": 3.8377, + "step": 353 + }, + { + "epoch": 4.722408026755852, + "grad_norm": 1.0137807130813599, + "learning_rate": 0.00018451612903225807, + "loss": 3.9923, + "step": 354 + }, + { + "epoch": 4.735785953177258, + "grad_norm": 1.075823426246643, + "learning_rate": 0.00018447163515016685, + "loss": 4.0706, + "step": 355 + }, + { + "epoch": 4.749163879598662, + "grad_norm": 1.0076895952224731, + "learning_rate": 0.00018442714126807566, + "loss": 4.0759, + "step": 356 + }, + { + "epoch": 4.762541806020067, + "grad_norm": 0.9387428164482117, + "learning_rate": 0.00018438264738598442, + "loss": 3.6959, + "step": 357 + }, + { + "epoch": 4.775919732441472, + "grad_norm": 0.8920648097991943, + "learning_rate": 0.00018433815350389323, + "loss": 3.9213, + "step": 358 + }, + { + "epoch": 4.789297658862877, + "grad_norm": 1.0252491235733032, + "learning_rate": 0.000184293659621802, + "loss": 3.9118, + "step": 359 + }, + { + "epoch": 4.802675585284281, + "grad_norm": 1.0382707118988037, + "learning_rate": 0.0001842491657397108, + "loss": 4.0172, + "step": 360 + }, + { + "epoch": 4.816053511705686, + "grad_norm": 1.07838773727417, + "learning_rate": 0.00018420467185761957, + "loss": 3.8531, + "step": 361 + }, + { + "epoch": 4.829431438127091, + "grad_norm": 0.9974546432495117, + "learning_rate": 0.00018416017797552838, + "loss": 4.0387, + "step": 362 + }, + { + "epoch": 4.842809364548495, + "grad_norm": 1.024491548538208, + "learning_rate": 0.00018411568409343717, + "loss": 3.9504, + "step": 363 + }, + { + "epoch": 4.8561872909699, + "grad_norm": 0.9236369132995605, + "learning_rate": 0.00018407119021134595, + "loss": 3.7119, + "step": 364 + }, + { + "epoch": 4.869565217391305, + "grad_norm": 0.935644268989563, + "learning_rate": 0.00018402669632925473, + "loss": 4.0077, + "step": 365 + }, + { + "epoch": 4.882943143812709, + "grad_norm": 0.9328681230545044, + "learning_rate": 0.00018398220244716354, + "loss": 3.9133, + "step": 366 + }, + { + "epoch": 4.896321070234114, + "grad_norm": 0.9596607089042664, + "learning_rate": 0.0001839377085650723, + "loss": 3.8003, + "step": 367 + }, + { + "epoch": 4.909698996655519, + "grad_norm": 0.9878052473068237, + "learning_rate": 0.0001838932146829811, + "loss": 3.8805, + "step": 368 + }, + { + "epoch": 4.923076923076923, + "grad_norm": 1.00381600856781, + "learning_rate": 0.0001838487208008899, + "loss": 4.0264, + "step": 369 + }, + { + "epoch": 4.936454849498328, + "grad_norm": 1.024754524230957, + "learning_rate": 0.00018380422691879867, + "loss": 3.7291, + "step": 370 + }, + { + "epoch": 4.949832775919733, + "grad_norm": 0.9670823812484741, + "learning_rate": 0.00018375973303670745, + "loss": 3.9418, + "step": 371 + }, + { + "epoch": 4.963210702341137, + "grad_norm": 0.9736581444740295, + "learning_rate": 0.00018371523915461626, + "loss": 3.8813, + "step": 372 + }, + { + "epoch": 4.976588628762542, + "grad_norm": 0.9752672910690308, + "learning_rate": 0.00018367074527252504, + "loss": 3.6717, + "step": 373 + }, + { + "epoch": 4.989966555183947, + "grad_norm": 1.1268304586410522, + "learning_rate": 0.00018362625139043383, + "loss": 3.9782, + "step": 374 + }, + { + "epoch": 5.0, + "grad_norm": 1.7933701276779175, + "learning_rate": 0.0001835817575083426, + "loss": 3.001, + "step": 375 + }, + { + "epoch": 5.013377926421405, + "grad_norm": 0.8035010099411011, + "learning_rate": 0.00018353726362625142, + "loss": 3.7943, + "step": 376 + }, + { + "epoch": 5.026755852842809, + "grad_norm": 0.8016420006752014, + "learning_rate": 0.00018349276974416017, + "loss": 3.7454, + "step": 377 + }, + { + "epoch": 5.040133779264214, + "grad_norm": 0.6844643354415894, + "learning_rate": 0.00018344827586206896, + "loss": 3.699, + "step": 378 + }, + { + "epoch": 5.053511705685619, + "grad_norm": 0.8649943470954895, + "learning_rate": 0.00018340378197997777, + "loss": 3.7197, + "step": 379 + }, + { + "epoch": 5.066889632107023, + "grad_norm": 0.9685015678405762, + "learning_rate": 0.00018335928809788655, + "loss": 3.6952, + "step": 380 + }, + { + "epoch": 5.080267558528428, + "grad_norm": 0.8728330135345459, + "learning_rate": 0.00018331479421579533, + "loss": 3.7164, + "step": 381 + }, + { + "epoch": 5.093645484949833, + "grad_norm": 0.962504506111145, + "learning_rate": 0.0001832703003337041, + "loss": 3.6123, + "step": 382 + }, + { + "epoch": 5.107023411371237, + "grad_norm": 0.9194462895393372, + "learning_rate": 0.00018322580645161292, + "loss": 3.275, + "step": 383 + }, + { + "epoch": 5.120401337792642, + "grad_norm": 0.9851329326629639, + "learning_rate": 0.00018318131256952168, + "loss": 3.6222, + "step": 384 + }, + { + "epoch": 5.133779264214047, + "grad_norm": 1.0702580213546753, + "learning_rate": 0.0001831368186874305, + "loss": 3.8728, + "step": 385 + }, + { + "epoch": 5.147157190635451, + "grad_norm": 1.3237228393554688, + "learning_rate": 0.00018309232480533927, + "loss": 3.8948, + "step": 386 + }, + { + "epoch": 5.160535117056856, + "grad_norm": 1.0076218843460083, + "learning_rate": 0.00018304783092324805, + "loss": 3.8894, + "step": 387 + }, + { + "epoch": 5.173913043478261, + "grad_norm": 1.084722876548767, + "learning_rate": 0.00018300333704115683, + "loss": 3.7398, + "step": 388 + }, + { + "epoch": 5.187290969899665, + "grad_norm": 0.9112711548805237, + "learning_rate": 0.00018295884315906564, + "loss": 3.5901, + "step": 389 + }, + { + "epoch": 5.20066889632107, + "grad_norm": 0.9451406002044678, + "learning_rate": 0.00018291434927697443, + "loss": 3.6313, + "step": 390 + }, + { + "epoch": 5.214046822742475, + "grad_norm": 0.8901047706604004, + "learning_rate": 0.0001828698553948832, + "loss": 3.3191, + "step": 391 + }, + { + "epoch": 5.2274247491638794, + "grad_norm": 0.9838565587997437, + "learning_rate": 0.000182825361512792, + "loss": 3.882, + "step": 392 + }, + { + "epoch": 5.240802675585284, + "grad_norm": 0.9839156866073608, + "learning_rate": 0.0001827808676307008, + "loss": 3.6068, + "step": 393 + }, + { + "epoch": 5.254180602006689, + "grad_norm": 0.9328583478927612, + "learning_rate": 0.00018273637374860956, + "loss": 3.6856, + "step": 394 + }, + { + "epoch": 5.2675585284280935, + "grad_norm": 0.8705796003341675, + "learning_rate": 0.00018269187986651837, + "loss": 3.7282, + "step": 395 + }, + { + "epoch": 5.280936454849498, + "grad_norm": 0.9675374031066895, + "learning_rate": 0.00018264738598442715, + "loss": 3.6588, + "step": 396 + }, + { + "epoch": 5.294314381270903, + "grad_norm": 1.145280361175537, + "learning_rate": 0.00018260289210233593, + "loss": 3.8843, + "step": 397 + }, + { + "epoch": 5.3076923076923075, + "grad_norm": 0.9769694805145264, + "learning_rate": 0.0001825583982202447, + "loss": 3.7207, + "step": 398 + }, + { + "epoch": 5.321070234113712, + "grad_norm": 0.9277816414833069, + "learning_rate": 0.00018251390433815352, + "loss": 3.712, + "step": 399 + }, + { + "epoch": 5.334448160535117, + "grad_norm": 1.1015180349349976, + "learning_rate": 0.0001824694104560623, + "loss": 3.7941, + "step": 400 + }, + { + "epoch": 5.3478260869565215, + "grad_norm": 1.2234200239181519, + "learning_rate": 0.0001824249165739711, + "loss": 3.6559, + "step": 401 + }, + { + "epoch": 5.361204013377926, + "grad_norm": 0.9358471035957336, + "learning_rate": 0.00018238042269187987, + "loss": 3.7665, + "step": 402 + }, + { + "epoch": 5.374581939799331, + "grad_norm": 0.8287034630775452, + "learning_rate": 0.00018233592880978868, + "loss": 3.8265, + "step": 403 + }, + { + "epoch": 5.3879598662207355, + "grad_norm": 1.0219204425811768, + "learning_rate": 0.00018229143492769743, + "loss": 3.6829, + "step": 404 + }, + { + "epoch": 5.40133779264214, + "grad_norm": 1.0601041316986084, + "learning_rate": 0.00018224694104560624, + "loss": 3.5879, + "step": 405 + }, + { + "epoch": 5.414715719063545, + "grad_norm": 1.2221566438674927, + "learning_rate": 0.00018220244716351503, + "loss": 3.6023, + "step": 406 + }, + { + "epoch": 5.4280936454849495, + "grad_norm": 0.9589087963104248, + "learning_rate": 0.0001821579532814238, + "loss": 3.9109, + "step": 407 + }, + { + "epoch": 5.441471571906354, + "grad_norm": 1.088295340538025, + "learning_rate": 0.0001821134593993326, + "loss": 3.5695, + "step": 408 + }, + { + "epoch": 5.454849498327759, + "grad_norm": 1.1284915208816528, + "learning_rate": 0.0001820689655172414, + "loss": 3.6004, + "step": 409 + }, + { + "epoch": 5.468227424749164, + "grad_norm": 1.0108689069747925, + "learning_rate": 0.00018202447163515018, + "loss": 3.7797, + "step": 410 + }, + { + "epoch": 5.481605351170568, + "grad_norm": 0.8550918102264404, + "learning_rate": 0.00018197997775305896, + "loss": 3.5758, + "step": 411 + }, + { + "epoch": 5.494983277591973, + "grad_norm": 0.8765145540237427, + "learning_rate": 0.00018193548387096775, + "loss": 3.7625, + "step": 412 + }, + { + "epoch": 5.508361204013378, + "grad_norm": 1.0253541469573975, + "learning_rate": 0.00018189098998887656, + "loss": 3.6033, + "step": 413 + }, + { + "epoch": 5.521739130434782, + "grad_norm": 1.0475622415542603, + "learning_rate": 0.0001818464961067853, + "loss": 3.813, + "step": 414 + }, + { + "epoch": 5.535117056856187, + "grad_norm": 1.053133249282837, + "learning_rate": 0.00018180200222469412, + "loss": 3.4779, + "step": 415 + }, + { + "epoch": 5.548494983277592, + "grad_norm": 1.0151216983795166, + "learning_rate": 0.0001817575083426029, + "loss": 3.9038, + "step": 416 + }, + { + "epoch": 5.561872909698996, + "grad_norm": 1.4666434526443481, + "learning_rate": 0.00018171301446051169, + "loss": 3.4735, + "step": 417 + }, + { + "epoch": 5.575250836120401, + "grad_norm": 1.1043344736099243, + "learning_rate": 0.00018166852057842047, + "loss": 3.7449, + "step": 418 + }, + { + "epoch": 5.588628762541806, + "grad_norm": 0.900745153427124, + "learning_rate": 0.00018162402669632925, + "loss": 3.7401, + "step": 419 + }, + { + "epoch": 5.602006688963211, + "grad_norm": 0.9771101474761963, + "learning_rate": 0.00018157953281423806, + "loss": 3.8328, + "step": 420 + }, + { + "epoch": 5.615384615384615, + "grad_norm": 0.9099516272544861, + "learning_rate": 0.00018153503893214682, + "loss": 3.6245, + "step": 421 + }, + { + "epoch": 5.6287625418060205, + "grad_norm": 0.9844585657119751, + "learning_rate": 0.00018149054505005563, + "loss": 3.5776, + "step": 422 + }, + { + "epoch": 5.642140468227424, + "grad_norm": 1.0481154918670654, + "learning_rate": 0.0001814460511679644, + "loss": 3.6304, + "step": 423 + }, + { + "epoch": 5.65551839464883, + "grad_norm": 0.9971081614494324, + "learning_rate": 0.0001814015572858732, + "loss": 3.7863, + "step": 424 + }, + { + "epoch": 5.668896321070234, + "grad_norm": 0.9247872829437256, + "learning_rate": 0.00018135706340378197, + "loss": 3.7319, + "step": 425 + }, + { + "epoch": 5.682274247491639, + "grad_norm": 0.9895725846290588, + "learning_rate": 0.00018131256952169078, + "loss": 3.78, + "step": 426 + }, + { + "epoch": 5.695652173913043, + "grad_norm": 1.0847641229629517, + "learning_rate": 0.00018126807563959956, + "loss": 3.8662, + "step": 427 + }, + { + "epoch": 5.709030100334449, + "grad_norm": 0.986259937286377, + "learning_rate": 0.00018122358175750835, + "loss": 3.5621, + "step": 428 + }, + { + "epoch": 5.722408026755852, + "grad_norm": 0.9166681170463562, + "learning_rate": 0.00018117908787541713, + "loss": 3.6153, + "step": 429 + }, + { + "epoch": 5.735785953177258, + "grad_norm": 1.1331177949905396, + "learning_rate": 0.00018113459399332594, + "loss": 3.5976, + "step": 430 + }, + { + "epoch": 5.749163879598662, + "grad_norm": 0.8743540644645691, + "learning_rate": 0.0001810901001112347, + "loss": 3.2511, + "step": 431 + }, + { + "epoch": 5.762541806020067, + "grad_norm": 1.0700207948684692, + "learning_rate": 0.0001810456062291435, + "loss": 3.7634, + "step": 432 + }, + { + "epoch": 5.775919732441472, + "grad_norm": 0.9412694573402405, + "learning_rate": 0.00018100111234705229, + "loss": 3.6264, + "step": 433 + }, + { + "epoch": 5.789297658862877, + "grad_norm": 1.0398496389389038, + "learning_rate": 0.00018095661846496107, + "loss": 3.744, + "step": 434 + }, + { + "epoch": 5.802675585284281, + "grad_norm": 0.9605004787445068, + "learning_rate": 0.00018091212458286985, + "loss": 3.5532, + "step": 435 + }, + { + "epoch": 5.816053511705686, + "grad_norm": 1.0449095964431763, + "learning_rate": 0.00018086763070077866, + "loss": 4.0611, + "step": 436 + }, + { + "epoch": 5.829431438127091, + "grad_norm": 0.9342606663703918, + "learning_rate": 0.00018082313681868744, + "loss": 3.9957, + "step": 437 + }, + { + "epoch": 5.842809364548495, + "grad_norm": 0.9687880873680115, + "learning_rate": 0.00018077864293659622, + "loss": 3.9299, + "step": 438 + }, + { + "epoch": 5.8561872909699, + "grad_norm": 1.1390576362609863, + "learning_rate": 0.000180734149054505, + "loss": 3.6552, + "step": 439 + }, + { + "epoch": 5.869565217391305, + "grad_norm": 0.9280988574028015, + "learning_rate": 0.00018068965517241382, + "loss": 3.7828, + "step": 440 + }, + { + "epoch": 5.882943143812709, + "grad_norm": 1.2928193807601929, + "learning_rate": 0.00018064516129032257, + "loss": 3.6292, + "step": 441 + }, + { + "epoch": 5.896321070234114, + "grad_norm": 1.0959875583648682, + "learning_rate": 0.00018060066740823138, + "loss": 3.4293, + "step": 442 + }, + { + "epoch": 5.909698996655519, + "grad_norm": 1.0713289976119995, + "learning_rate": 0.00018055617352614016, + "loss": 3.7767, + "step": 443 + }, + { + "epoch": 5.923076923076923, + "grad_norm": 0.9309440851211548, + "learning_rate": 0.00018051167964404895, + "loss": 3.5473, + "step": 444 + }, + { + "epoch": 5.936454849498328, + "grad_norm": 1.0999056100845337, + "learning_rate": 0.00018046718576195773, + "loss": 3.9694, + "step": 445 + }, + { + "epoch": 5.949832775919733, + "grad_norm": 1.1073781251907349, + "learning_rate": 0.00018042269187986654, + "loss": 3.6882, + "step": 446 + }, + { + "epoch": 5.963210702341137, + "grad_norm": 1.0430257320404053, + "learning_rate": 0.00018037819799777532, + "loss": 3.4009, + "step": 447 + }, + { + "epoch": 5.976588628762542, + "grad_norm": 1.1132690906524658, + "learning_rate": 0.0001803337041156841, + "loss": 3.8832, + "step": 448 + }, + { + "epoch": 5.989966555183947, + "grad_norm": 1.0147771835327148, + "learning_rate": 0.00018028921023359289, + "loss": 3.7117, + "step": 449 + }, + { + "epoch": 6.0, + "grad_norm": 1.458959698677063, + "learning_rate": 0.0001802447163515017, + "loss": 3.5745, + "step": 450 + }, + { + "epoch": 6.013377926421405, + "grad_norm": 0.8363592028617859, + "learning_rate": 0.00018020022246941045, + "loss": 3.5835, + "step": 451 + }, + { + "epoch": 6.026755852842809, + "grad_norm": 0.8071937561035156, + "learning_rate": 0.00018015572858731926, + "loss": 3.5923, + "step": 452 + }, + { + "epoch": 6.040133779264214, + "grad_norm": 0.7746313214302063, + "learning_rate": 0.00018011123470522804, + "loss": 3.5688, + "step": 453 + }, + { + "epoch": 6.053511705685619, + "grad_norm": 0.689179539680481, + "learning_rate": 0.00018006674082313682, + "loss": 3.412, + "step": 454 + }, + { + "epoch": 6.066889632107023, + "grad_norm": 0.8438050746917725, + "learning_rate": 0.0001800222469410456, + "loss": 3.403, + "step": 455 + }, + { + "epoch": 6.080267558528428, + "grad_norm": 0.7670062780380249, + "learning_rate": 0.00017997775305895442, + "loss": 3.5029, + "step": 456 + }, + { + "epoch": 6.093645484949833, + "grad_norm": 0.8185870051383972, + "learning_rate": 0.0001799332591768632, + "loss": 3.4584, + "step": 457 + }, + { + "epoch": 6.107023411371237, + "grad_norm": 0.9618543386459351, + "learning_rate": 0.00017988876529477198, + "loss": 3.6538, + "step": 458 + }, + { + "epoch": 6.120401337792642, + "grad_norm": 0.959724485874176, + "learning_rate": 0.00017984427141268076, + "loss": 3.5284, + "step": 459 + }, + { + "epoch": 6.133779264214047, + "grad_norm": 0.8044765591621399, + "learning_rate": 0.00017979977753058955, + "loss": 3.2198, + "step": 460 + }, + { + "epoch": 6.147157190635451, + "grad_norm": 0.8287092447280884, + "learning_rate": 0.00017975528364849833, + "loss": 3.4977, + "step": 461 + }, + { + "epoch": 6.160535117056856, + "grad_norm": 0.8855329155921936, + "learning_rate": 0.0001797107897664071, + "loss": 3.5008, + "step": 462 + }, + { + "epoch": 6.173913043478261, + "grad_norm": 0.8839483857154846, + "learning_rate": 0.00017966629588431592, + "loss": 3.6135, + "step": 463 + }, + { + "epoch": 6.187290969899665, + "grad_norm": 0.963446319103241, + "learning_rate": 0.0001796218020022247, + "loss": 3.6156, + "step": 464 + }, + { + "epoch": 6.20066889632107, + "grad_norm": 0.896743655204773, + "learning_rate": 0.00017957730812013348, + "loss": 3.6623, + "step": 465 + }, + { + "epoch": 6.214046822742475, + "grad_norm": 0.9268617033958435, + "learning_rate": 0.00017953281423804227, + "loss": 3.4343, + "step": 466 + }, + { + "epoch": 6.2274247491638794, + "grad_norm": 0.8335449695587158, + "learning_rate": 0.00017948832035595108, + "loss": 3.5716, + "step": 467 + }, + { + "epoch": 6.240802675585284, + "grad_norm": 0.7771849036216736, + "learning_rate": 0.00017944382647385983, + "loss": 3.5191, + "step": 468 + }, + { + "epoch": 6.254180602006689, + "grad_norm": 0.9157488346099854, + "learning_rate": 0.00017939933259176864, + "loss": 3.5583, + "step": 469 + }, + { + "epoch": 6.2675585284280935, + "grad_norm": 0.9348477721214294, + "learning_rate": 0.00017935483870967742, + "loss": 3.3137, + "step": 470 + }, + { + "epoch": 6.280936454849498, + "grad_norm": 0.8791135549545288, + "learning_rate": 0.0001793103448275862, + "loss": 3.5111, + "step": 471 + }, + { + "epoch": 6.294314381270903, + "grad_norm": 0.9963672757148743, + "learning_rate": 0.000179265850945495, + "loss": 3.7518, + "step": 472 + }, + { + "epoch": 6.3076923076923075, + "grad_norm": 0.9291539192199707, + "learning_rate": 0.0001792213570634038, + "loss": 3.4524, + "step": 473 + }, + { + "epoch": 6.321070234113712, + "grad_norm": 0.9349279403686523, + "learning_rate": 0.00017917686318131258, + "loss": 3.4753, + "step": 474 + }, + { + "epoch": 6.334448160535117, + "grad_norm": 0.8984476327896118, + "learning_rate": 0.00017913236929922136, + "loss": 3.7325, + "step": 475 + }, + { + "epoch": 6.3478260869565215, + "grad_norm": 0.8452139496803284, + "learning_rate": 0.00017908787541713015, + "loss": 3.8021, + "step": 476 + }, + { + "epoch": 6.361204013377926, + "grad_norm": 0.9418376088142395, + "learning_rate": 0.00017904338153503895, + "loss": 3.7426, + "step": 477 + }, + { + "epoch": 6.374581939799331, + "grad_norm": 1.0661097764968872, + "learning_rate": 0.0001789988876529477, + "loss": 3.7556, + "step": 478 + }, + { + "epoch": 6.3879598662207355, + "grad_norm": 0.9645984768867493, + "learning_rate": 0.00017895439377085652, + "loss": 3.3353, + "step": 479 + }, + { + "epoch": 6.40133779264214, + "grad_norm": 0.9243470430374146, + "learning_rate": 0.0001789098998887653, + "loss": 3.5729, + "step": 480 + }, + { + "epoch": 6.414715719063545, + "grad_norm": 0.885061502456665, + "learning_rate": 0.00017886540600667408, + "loss": 3.5699, + "step": 481 + }, + { + "epoch": 6.4280936454849495, + "grad_norm": 0.9025402069091797, + "learning_rate": 0.00017882091212458287, + "loss": 3.4532, + "step": 482 + }, + { + "epoch": 6.441471571906354, + "grad_norm": 0.9760842323303223, + "learning_rate": 0.00017877641824249168, + "loss": 3.7222, + "step": 483 + }, + { + "epoch": 6.454849498327759, + "grad_norm": 1.2709609270095825, + "learning_rate": 0.00017873192436040046, + "loss": 3.756, + "step": 484 + }, + { + "epoch": 6.468227424749164, + "grad_norm": 0.904513955116272, + "learning_rate": 0.00017868743047830924, + "loss": 3.657, + "step": 485 + }, + { + "epoch": 6.481605351170568, + "grad_norm": 1.158915400505066, + "learning_rate": 0.00017864293659621802, + "loss": 3.5897, + "step": 486 + }, + { + "epoch": 6.494983277591973, + "grad_norm": 0.9457879066467285, + "learning_rate": 0.00017859844271412683, + "loss": 3.4394, + "step": 487 + }, + { + "epoch": 6.508361204013378, + "grad_norm": 1.04762601852417, + "learning_rate": 0.0001785539488320356, + "loss": 3.39, + "step": 488 + }, + { + "epoch": 6.521739130434782, + "grad_norm": 0.9370948076248169, + "learning_rate": 0.0001785094549499444, + "loss": 3.4816, + "step": 489 + }, + { + "epoch": 6.535117056856187, + "grad_norm": 1.1307988166809082, + "learning_rate": 0.00017846496106785318, + "loss": 3.5079, + "step": 490 + }, + { + "epoch": 6.548494983277592, + "grad_norm": 1.0025054216384888, + "learning_rate": 0.00017842046718576196, + "loss": 3.3447, + "step": 491 + }, + { + "epoch": 6.561872909698996, + "grad_norm": 0.9893412590026855, + "learning_rate": 0.00017837597330367074, + "loss": 3.4698, + "step": 492 + }, + { + "epoch": 6.575250836120401, + "grad_norm": 1.0476347208023071, + "learning_rate": 0.00017833147942157955, + "loss": 3.4955, + "step": 493 + }, + { + "epoch": 6.588628762541806, + "grad_norm": 1.0508891344070435, + "learning_rate": 0.00017828698553948834, + "loss": 3.6568, + "step": 494 + }, + { + "epoch": 6.602006688963211, + "grad_norm": 1.0397465229034424, + "learning_rate": 0.00017824249165739712, + "loss": 3.5087, + "step": 495 + }, + { + "epoch": 6.615384615384615, + "grad_norm": 0.9884181022644043, + "learning_rate": 0.0001781979977753059, + "loss": 3.6778, + "step": 496 + }, + { + "epoch": 6.6287625418060205, + "grad_norm": 1.1187562942504883, + "learning_rate": 0.0001781535038932147, + "loss": 3.4345, + "step": 497 + }, + { + "epoch": 6.642140468227424, + "grad_norm": 1.1133880615234375, + "learning_rate": 0.00017810901001112347, + "loss": 3.4959, + "step": 498 + }, + { + "epoch": 6.65551839464883, + "grad_norm": 0.9368589520454407, + "learning_rate": 0.00017806451612903228, + "loss": 3.628, + "step": 499 + }, + { + "epoch": 6.668896321070234, + "grad_norm": 1.0427212715148926, + "learning_rate": 0.00017802002224694106, + "loss": 3.6073, + "step": 500 + }, + { + "epoch": 6.682274247491639, + "grad_norm": 0.9564261436462402, + "learning_rate": 0.00017797552836484984, + "loss": 3.4406, + "step": 501 + }, + { + "epoch": 6.695652173913043, + "grad_norm": 0.9754629135131836, + "learning_rate": 0.00017793103448275862, + "loss": 3.7025, + "step": 502 + }, + { + "epoch": 6.709030100334449, + "grad_norm": 0.9669683575630188, + "learning_rate": 0.0001778865406006674, + "loss": 3.5872, + "step": 503 + }, + { + "epoch": 6.722408026755852, + "grad_norm": 0.959619402885437, + "learning_rate": 0.00017784204671857621, + "loss": 3.5124, + "step": 504 + }, + { + "epoch": 6.735785953177258, + "grad_norm": 0.981737494468689, + "learning_rate": 0.00017779755283648497, + "loss": 3.4038, + "step": 505 + }, + { + "epoch": 6.749163879598662, + "grad_norm": 0.9372640252113342, + "learning_rate": 0.00017775305895439378, + "loss": 3.6725, + "step": 506 + }, + { + "epoch": 6.762541806020067, + "grad_norm": 0.947066605091095, + "learning_rate": 0.00017770856507230256, + "loss": 3.807, + "step": 507 + }, + { + "epoch": 6.775919732441472, + "grad_norm": 0.7659755349159241, + "learning_rate": 0.00017766407119021134, + "loss": 3.0411, + "step": 508 + }, + { + "epoch": 6.789297658862877, + "grad_norm": 1.0432168245315552, + "learning_rate": 0.00017761957730812013, + "loss": 3.6259, + "step": 509 + }, + { + "epoch": 6.802675585284281, + "grad_norm": 1.0104693174362183, + "learning_rate": 0.00017757508342602894, + "loss": 3.5971, + "step": 510 + }, + { + "epoch": 6.816053511705686, + "grad_norm": 0.9007440805435181, + "learning_rate": 0.00017753058954393772, + "loss": 3.5894, + "step": 511 + }, + { + "epoch": 6.829431438127091, + "grad_norm": 0.8829946517944336, + "learning_rate": 0.0001774860956618465, + "loss": 3.6094, + "step": 512 + }, + { + "epoch": 6.842809364548495, + "grad_norm": 0.9823127388954163, + "learning_rate": 0.00017744160177975528, + "loss": 3.6568, + "step": 513 + }, + { + "epoch": 6.8561872909699, + "grad_norm": 1.0247899293899536, + "learning_rate": 0.0001773971078976641, + "loss": 3.259, + "step": 514 + }, + { + "epoch": 6.869565217391305, + "grad_norm": 0.9435336589813232, + "learning_rate": 0.00017735261401557285, + "loss": 3.4978, + "step": 515 + }, + { + "epoch": 6.882943143812709, + "grad_norm": 1.0135424137115479, + "learning_rate": 0.00017730812013348166, + "loss": 3.4765, + "step": 516 + }, + { + "epoch": 6.896321070234114, + "grad_norm": 1.1327738761901855, + "learning_rate": 0.00017726362625139044, + "loss": 3.6927, + "step": 517 + }, + { + "epoch": 6.909698996655519, + "grad_norm": 0.9335159659385681, + "learning_rate": 0.00017721913236929922, + "loss": 3.7278, + "step": 518 + }, + { + "epoch": 6.923076923076923, + "grad_norm": 0.8229056000709534, + "learning_rate": 0.000177174638487208, + "loss": 3.549, + "step": 519 + }, + { + "epoch": 6.936454849498328, + "grad_norm": 0.8996124267578125, + "learning_rate": 0.00017713014460511681, + "loss": 3.5093, + "step": 520 + }, + { + "epoch": 6.949832775919733, + "grad_norm": 0.9984102845191956, + "learning_rate": 0.0001770856507230256, + "loss": 3.4772, + "step": 521 + }, + { + "epoch": 6.963210702341137, + "grad_norm": 0.9136049747467041, + "learning_rate": 0.00017704115684093438, + "loss": 3.6487, + "step": 522 + }, + { + "epoch": 6.976588628762542, + "grad_norm": 0.9707899689674377, + "learning_rate": 0.00017699666295884316, + "loss": 3.7471, + "step": 523 + }, + { + "epoch": 6.989966555183947, + "grad_norm": 0.919865071773529, + "learning_rate": 0.00017695216907675197, + "loss": 3.8013, + "step": 524 + }, + { + "epoch": 7.0, + "grad_norm": 1.125541090965271, + "learning_rate": 0.00017690767519466073, + "loss": 3.4869, + "step": 525 + }, + { + "epoch": 7.013377926421405, + "grad_norm": 0.8512810468673706, + "learning_rate": 0.00017686318131256954, + "loss": 3.3334, + "step": 526 + }, + { + "epoch": 7.026755852842809, + "grad_norm": 0.8450623750686646, + "learning_rate": 0.00017681868743047832, + "loss": 3.5769, + "step": 527 + }, + { + "epoch": 7.040133779264214, + "grad_norm": 0.8526298403739929, + "learning_rate": 0.0001767741935483871, + "loss": 3.4546, + "step": 528 + }, + { + "epoch": 7.053511705685619, + "grad_norm": 0.75905442237854, + "learning_rate": 0.00017672969966629588, + "loss": 3.4228, + "step": 529 + }, + { + "epoch": 7.066889632107023, + "grad_norm": 0.8442811965942383, + "learning_rate": 0.0001766852057842047, + "loss": 3.5766, + "step": 530 + }, + { + "epoch": 7.080267558528428, + "grad_norm": 0.9584814310073853, + "learning_rate": 0.00017664071190211347, + "loss": 3.5312, + "step": 531 + }, + { + "epoch": 7.093645484949833, + "grad_norm": 0.9741052985191345, + "learning_rate": 0.00017659621802002226, + "loss": 3.4877, + "step": 532 + }, + { + "epoch": 7.107023411371237, + "grad_norm": 0.8638135194778442, + "learning_rate": 0.00017655172413793104, + "loss": 3.5701, + "step": 533 + }, + { + "epoch": 7.120401337792642, + "grad_norm": 1.0128440856933594, + "learning_rate": 0.00017650723025583985, + "loss": 3.3629, + "step": 534 + }, + { + "epoch": 7.133779264214047, + "grad_norm": 0.9763593673706055, + "learning_rate": 0.0001764627363737486, + "loss": 3.5651, + "step": 535 + }, + { + "epoch": 7.147157190635451, + "grad_norm": 0.8706293702125549, + "learning_rate": 0.00017641824249165741, + "loss": 3.454, + "step": 536 + }, + { + "epoch": 7.160535117056856, + "grad_norm": 0.9227468967437744, + "learning_rate": 0.0001763737486095662, + "loss": 3.5528, + "step": 537 + }, + { + "epoch": 7.173913043478261, + "grad_norm": 0.7493206262588501, + "learning_rate": 0.00017632925472747498, + "loss": 3.2662, + "step": 538 + }, + { + "epoch": 7.187290969899665, + "grad_norm": 0.8414123058319092, + "learning_rate": 0.00017628476084538376, + "loss": 3.3864, + "step": 539 + }, + { + "epoch": 7.20066889632107, + "grad_norm": 0.8352764248847961, + "learning_rate": 0.00017624026696329257, + "loss": 3.2407, + "step": 540 + }, + { + "epoch": 7.214046822742475, + "grad_norm": 0.7413480281829834, + "learning_rate": 0.00017619577308120135, + "loss": 3.3989, + "step": 541 + }, + { + "epoch": 7.2274247491638794, + "grad_norm": 0.7661281228065491, + "learning_rate": 0.00017615127919911014, + "loss": 3.3792, + "step": 542 + }, + { + "epoch": 7.240802675585284, + "grad_norm": 0.86900395154953, + "learning_rate": 0.00017610678531701892, + "loss": 3.2191, + "step": 543 + }, + { + "epoch": 7.254180602006689, + "grad_norm": 0.8536344170570374, + "learning_rate": 0.0001760622914349277, + "loss": 3.3366, + "step": 544 + }, + { + "epoch": 7.2675585284280935, + "grad_norm": 0.8729544878005981, + "learning_rate": 0.00017601779755283648, + "loss": 3.4976, + "step": 545 + }, + { + "epoch": 7.280936454849498, + "grad_norm": 0.8263023495674133, + "learning_rate": 0.00017597330367074526, + "loss": 3.4102, + "step": 546 + }, + { + "epoch": 7.294314381270903, + "grad_norm": 0.748373806476593, + "learning_rate": 0.00017592880978865407, + "loss": 3.332, + "step": 547 + }, + { + "epoch": 7.3076923076923075, + "grad_norm": 0.7606791853904724, + "learning_rate": 0.00017588431590656286, + "loss": 3.361, + "step": 548 + }, + { + "epoch": 7.321070234113712, + "grad_norm": 0.9155070781707764, + "learning_rate": 0.00017583982202447164, + "loss": 3.657, + "step": 549 + }, + { + "epoch": 7.334448160535117, + "grad_norm": 0.7440597414970398, + "learning_rate": 0.00017579532814238042, + "loss": 3.335, + "step": 550 + }, + { + "epoch": 7.3478260869565215, + "grad_norm": 0.8781002759933472, + "learning_rate": 0.00017575083426028923, + "loss": 3.5579, + "step": 551 + }, + { + "epoch": 7.361204013377926, + "grad_norm": 0.7886701822280884, + "learning_rate": 0.00017570634037819799, + "loss": 3.4636, + "step": 552 + }, + { + "epoch": 7.374581939799331, + "grad_norm": 0.8931376934051514, + "learning_rate": 0.0001756618464961068, + "loss": 3.4375, + "step": 553 + }, + { + "epoch": 7.3879598662207355, + "grad_norm": 0.7599623799324036, + "learning_rate": 0.00017561735261401558, + "loss": 3.6551, + "step": 554 + }, + { + "epoch": 7.40133779264214, + "grad_norm": 0.7692762613296509, + "learning_rate": 0.00017557285873192436, + "loss": 3.6373, + "step": 555 + }, + { + "epoch": 7.414715719063545, + "grad_norm": 0.8861828446388245, + "learning_rate": 0.00017552836484983314, + "loss": 3.4791, + "step": 556 + }, + { + "epoch": 7.4280936454849495, + "grad_norm": 0.9560372829437256, + "learning_rate": 0.00017548387096774195, + "loss": 3.7291, + "step": 557 + }, + { + "epoch": 7.441471571906354, + "grad_norm": 0.8745344281196594, + "learning_rate": 0.00017543937708565073, + "loss": 3.3071, + "step": 558 + }, + { + "epoch": 7.454849498327759, + "grad_norm": 0.8178285360336304, + "learning_rate": 0.00017539488320355952, + "loss": 3.4738, + "step": 559 + }, + { + "epoch": 7.468227424749164, + "grad_norm": 0.8611259460449219, + "learning_rate": 0.0001753503893214683, + "loss": 3.25, + "step": 560 + }, + { + "epoch": 7.481605351170568, + "grad_norm": 0.8623505234718323, + "learning_rate": 0.0001753058954393771, + "loss": 3.3701, + "step": 561 + }, + { + "epoch": 7.494983277591973, + "grad_norm": 0.76930171251297, + "learning_rate": 0.00017526140155728586, + "loss": 3.332, + "step": 562 + }, + { + "epoch": 7.508361204013378, + "grad_norm": 0.8986758589744568, + "learning_rate": 0.00017521690767519467, + "loss": 3.3927, + "step": 563 + }, + { + "epoch": 7.521739130434782, + "grad_norm": 0.9844257831573486, + "learning_rate": 0.00017517241379310346, + "loss": 3.5664, + "step": 564 + }, + { + "epoch": 7.535117056856187, + "grad_norm": 0.983921229839325, + "learning_rate": 0.00017512791991101224, + "loss": 3.3888, + "step": 565 + }, + { + "epoch": 7.548494983277592, + "grad_norm": 0.8052308559417725, + "learning_rate": 0.00017508342602892102, + "loss": 3.4809, + "step": 566 + }, + { + "epoch": 7.561872909698996, + "grad_norm": 0.7996425032615662, + "learning_rate": 0.00017503893214682983, + "loss": 3.4793, + "step": 567 + }, + { + "epoch": 7.575250836120401, + "grad_norm": 0.8453391194343567, + "learning_rate": 0.0001749944382647386, + "loss": 3.3199, + "step": 568 + }, + { + "epoch": 7.588628762541806, + "grad_norm": 0.8720147013664246, + "learning_rate": 0.0001749499443826474, + "loss": 3.5612, + "step": 569 + }, + { + "epoch": 7.602006688963211, + "grad_norm": 0.9093672633171082, + "learning_rate": 0.00017490545050055618, + "loss": 3.0509, + "step": 570 + }, + { + "epoch": 7.615384615384615, + "grad_norm": 0.8936579823493958, + "learning_rate": 0.000174860956618465, + "loss": 3.4408, + "step": 571 + }, + { + "epoch": 7.6287625418060205, + "grad_norm": 0.7683162689208984, + "learning_rate": 0.00017481646273637374, + "loss": 3.3536, + "step": 572 + }, + { + "epoch": 7.642140468227424, + "grad_norm": 0.7943581342697144, + "learning_rate": 0.00017477196885428255, + "loss": 3.4542, + "step": 573 + }, + { + "epoch": 7.65551839464883, + "grad_norm": 0.8183353543281555, + "learning_rate": 0.00017472747497219133, + "loss": 3.603, + "step": 574 + }, + { + "epoch": 7.668896321070234, + "grad_norm": 0.7816463708877563, + "learning_rate": 0.00017468298109010012, + "loss": 3.7388, + "step": 575 + }, + { + "epoch": 7.682274247491639, + "grad_norm": 0.8167930245399475, + "learning_rate": 0.0001746384872080089, + "loss": 3.7743, + "step": 576 + }, + { + "epoch": 7.695652173913043, + "grad_norm": 0.832392156124115, + "learning_rate": 0.0001745939933259177, + "loss": 3.7488, + "step": 577 + }, + { + "epoch": 7.709030100334449, + "grad_norm": 0.9362333416938782, + "learning_rate": 0.0001745494994438265, + "loss": 3.5722, + "step": 578 + }, + { + "epoch": 7.722408026755852, + "grad_norm": 1.0247248411178589, + "learning_rate": 0.00017450500556173527, + "loss": 3.2048, + "step": 579 + }, + { + "epoch": 7.735785953177258, + "grad_norm": 0.8833767175674438, + "learning_rate": 0.00017446051167964406, + "loss": 3.389, + "step": 580 + }, + { + "epoch": 7.749163879598662, + "grad_norm": 0.8344758749008179, + "learning_rate": 0.00017441601779755287, + "loss": 3.5264, + "step": 581 + }, + { + "epoch": 7.762541806020067, + "grad_norm": 0.9771448373794556, + "learning_rate": 0.00017437152391546162, + "loss": 3.041, + "step": 582 + }, + { + "epoch": 7.775919732441472, + "grad_norm": 0.8279567956924438, + "learning_rate": 0.00017432703003337043, + "loss": 3.6273, + "step": 583 + }, + { + "epoch": 7.789297658862877, + "grad_norm": 0.957206130027771, + "learning_rate": 0.0001742825361512792, + "loss": 3.3831, + "step": 584 + }, + { + "epoch": 7.802675585284281, + "grad_norm": 0.860619843006134, + "learning_rate": 0.000174238042269188, + "loss": 3.4566, + "step": 585 + }, + { + "epoch": 7.816053511705686, + "grad_norm": 0.8725448846817017, + "learning_rate": 0.00017419354838709678, + "loss": 3.6594, + "step": 586 + }, + { + "epoch": 7.829431438127091, + "grad_norm": 0.8343111276626587, + "learning_rate": 0.00017414905450500556, + "loss": 3.3423, + "step": 587 + }, + { + "epoch": 7.842809364548495, + "grad_norm": 0.9043267965316772, + "learning_rate": 0.00017410456062291437, + "loss": 3.221, + "step": 588 + }, + { + "epoch": 7.8561872909699, + "grad_norm": 0.9563114643096924, + "learning_rate": 0.00017406006674082312, + "loss": 3.5143, + "step": 589 + }, + { + "epoch": 7.869565217391305, + "grad_norm": 0.9726302027702332, + "learning_rate": 0.00017401557285873193, + "loss": 3.4373, + "step": 590 + }, + { + "epoch": 7.882943143812709, + "grad_norm": 0.9203178882598877, + "learning_rate": 0.00017397107897664072, + "loss": 3.6014, + "step": 591 + }, + { + "epoch": 7.896321070234114, + "grad_norm": 0.9120233654975891, + "learning_rate": 0.0001739265850945495, + "loss": 3.1429, + "step": 592 + }, + { + "epoch": 7.909698996655519, + "grad_norm": 0.7576518058776855, + "learning_rate": 0.00017388209121245828, + "loss": 3.2065, + "step": 593 + }, + { + "epoch": 7.923076923076923, + "grad_norm": 0.9629240036010742, + "learning_rate": 0.0001738375973303671, + "loss": 3.4788, + "step": 594 + }, + { + "epoch": 7.936454849498328, + "grad_norm": 0.8390881419181824, + "learning_rate": 0.00017379310344827587, + "loss": 3.2857, + "step": 595 + }, + { + "epoch": 7.949832775919733, + "grad_norm": 0.8708979487419128, + "learning_rate": 0.00017374860956618466, + "loss": 3.3321, + "step": 596 + }, + { + "epoch": 7.963210702341137, + "grad_norm": 0.7076835632324219, + "learning_rate": 0.00017370411568409344, + "loss": 3.5905, + "step": 597 + }, + { + "epoch": 7.976588628762542, + "grad_norm": 1.016526222229004, + "learning_rate": 0.00017365962180200225, + "loss": 3.4053, + "step": 598 + }, + { + "epoch": 7.989966555183947, + "grad_norm": 0.7592278718948364, + "learning_rate": 0.000173615127919911, + "loss": 3.3968, + "step": 599 + }, + { + "epoch": 8.0, + "grad_norm": 1.0106462240219116, + "learning_rate": 0.0001735706340378198, + "loss": 3.3722, + "step": 600 + }, + { + "epoch": 8.013377926421406, + "grad_norm": 0.740808367729187, + "learning_rate": 0.0001735261401557286, + "loss": 3.394, + "step": 601 + }, + { + "epoch": 8.02675585284281, + "grad_norm": 0.6732498407363892, + "learning_rate": 0.00017348164627363738, + "loss": 3.297, + "step": 602 + }, + { + "epoch": 8.040133779264215, + "grad_norm": 0.8319197297096252, + "learning_rate": 0.00017343715239154616, + "loss": 3.2898, + "step": 603 + }, + { + "epoch": 8.053511705685619, + "grad_norm": 0.7834349870681763, + "learning_rate": 0.00017339265850945497, + "loss": 3.2341, + "step": 604 + }, + { + "epoch": 8.066889632107024, + "grad_norm": 0.705737292766571, + "learning_rate": 0.00017334816462736375, + "loss": 3.3429, + "step": 605 + }, + { + "epoch": 8.080267558528428, + "grad_norm": 0.8270958065986633, + "learning_rate": 0.00017330367074527253, + "loss": 3.0458, + "step": 606 + }, + { + "epoch": 8.093645484949834, + "grad_norm": 0.7254801392555237, + "learning_rate": 0.00017325917686318132, + "loss": 3.5143, + "step": 607 + }, + { + "epoch": 8.107023411371237, + "grad_norm": 0.8450751900672913, + "learning_rate": 0.00017321468298109013, + "loss": 3.1507, + "step": 608 + }, + { + "epoch": 8.120401337792643, + "grad_norm": 0.7936837673187256, + "learning_rate": 0.00017317018909899888, + "loss": 3.3979, + "step": 609 + }, + { + "epoch": 8.133779264214047, + "grad_norm": 0.6496401429176331, + "learning_rate": 0.0001731256952169077, + "loss": 3.3613, + "step": 610 + }, + { + "epoch": 8.147157190635452, + "grad_norm": 0.8721235990524292, + "learning_rate": 0.00017308120133481647, + "loss": 3.4299, + "step": 611 + }, + { + "epoch": 8.160535117056856, + "grad_norm": 0.7671874761581421, + "learning_rate": 0.00017303670745272525, + "loss": 3.3333, + "step": 612 + }, + { + "epoch": 8.173913043478262, + "grad_norm": 0.6427144408226013, + "learning_rate": 0.00017299221357063404, + "loss": 3.3304, + "step": 613 + }, + { + "epoch": 8.187290969899665, + "grad_norm": 0.7999966144561768, + "learning_rate": 0.00017294771968854285, + "loss": 3.4016, + "step": 614 + }, + { + "epoch": 8.200668896321071, + "grad_norm": 0.8216206431388855, + "learning_rate": 0.00017290322580645163, + "loss": 3.1724, + "step": 615 + }, + { + "epoch": 8.214046822742475, + "grad_norm": 0.7364024519920349, + "learning_rate": 0.0001728587319243604, + "loss": 3.34, + "step": 616 + }, + { + "epoch": 8.22742474916388, + "grad_norm": 0.7688239812850952, + "learning_rate": 0.0001728142380422692, + "loss": 3.2114, + "step": 617 + }, + { + "epoch": 8.240802675585284, + "grad_norm": 0.8786870837211609, + "learning_rate": 0.000172769744160178, + "loss": 3.378, + "step": 618 + }, + { + "epoch": 8.25418060200669, + "grad_norm": 0.9048855900764465, + "learning_rate": 0.00017272525027808676, + "loss": 3.1801, + "step": 619 + }, + { + "epoch": 8.267558528428093, + "grad_norm": 0.657189130783081, + "learning_rate": 0.00017268075639599557, + "loss": 3.1389, + "step": 620 + }, + { + "epoch": 8.280936454849499, + "grad_norm": 0.8015987873077393, + "learning_rate": 0.00017263626251390435, + "loss": 3.4621, + "step": 621 + }, + { + "epoch": 8.294314381270903, + "grad_norm": 0.8232793807983398, + "learning_rate": 0.00017259176863181313, + "loss": 3.1763, + "step": 622 + }, + { + "epoch": 8.307692307692308, + "grad_norm": 0.7447130680084229, + "learning_rate": 0.00017254727474972192, + "loss": 3.1266, + "step": 623 + }, + { + "epoch": 8.321070234113712, + "grad_norm": 0.7649840116500854, + "learning_rate": 0.00017250278086763072, + "loss": 3.1959, + "step": 624 + }, + { + "epoch": 8.334448160535118, + "grad_norm": 0.7119699120521545, + "learning_rate": 0.0001724582869855395, + "loss": 3.5626, + "step": 625 + }, + { + "epoch": 8.347826086956522, + "grad_norm": 0.8238518834114075, + "learning_rate": 0.00017241379310344826, + "loss": 3.1873, + "step": 626 + }, + { + "epoch": 8.361204013377927, + "grad_norm": 0.8248497843742371, + "learning_rate": 0.00017236929922135707, + "loss": 3.5686, + "step": 627 + }, + { + "epoch": 8.37458193979933, + "grad_norm": 0.8704475164413452, + "learning_rate": 0.00017232480533926585, + "loss": 3.3195, + "step": 628 + }, + { + "epoch": 8.387959866220736, + "grad_norm": 0.8160929083824158, + "learning_rate": 0.00017228031145717464, + "loss": 3.3808, + "step": 629 + }, + { + "epoch": 8.40133779264214, + "grad_norm": 0.8537085652351379, + "learning_rate": 0.00017223581757508342, + "loss": 3.3638, + "step": 630 + }, + { + "epoch": 8.414715719063546, + "grad_norm": 0.876519501209259, + "learning_rate": 0.00017219132369299223, + "loss": 3.2019, + "step": 631 + }, + { + "epoch": 8.42809364548495, + "grad_norm": 0.6573703289031982, + "learning_rate": 0.000172146829810901, + "loss": 3.4998, + "step": 632 + }, + { + "epoch": 8.441471571906355, + "grad_norm": 0.8822937607765198, + "learning_rate": 0.0001721023359288098, + "loss": 3.4281, + "step": 633 + }, + { + "epoch": 8.454849498327759, + "grad_norm": 0.764872670173645, + "learning_rate": 0.00017205784204671858, + "loss": 3.3693, + "step": 634 + }, + { + "epoch": 8.468227424749164, + "grad_norm": 0.7492384910583496, + "learning_rate": 0.00017201334816462739, + "loss": 3.5672, + "step": 635 + }, + { + "epoch": 8.481605351170568, + "grad_norm": 0.8037416934967041, + "learning_rate": 0.00017196885428253614, + "loss": 3.3804, + "step": 636 + }, + { + "epoch": 8.494983277591974, + "grad_norm": 0.8380945324897766, + "learning_rate": 0.00017192436040044495, + "loss": 3.2272, + "step": 637 + }, + { + "epoch": 8.508361204013378, + "grad_norm": 0.8467932939529419, + "learning_rate": 0.00017187986651835373, + "loss": 3.2649, + "step": 638 + }, + { + "epoch": 8.521739130434783, + "grad_norm": 0.751542866230011, + "learning_rate": 0.00017183537263626252, + "loss": 3.4135, + "step": 639 + }, + { + "epoch": 8.535117056856187, + "grad_norm": 0.7618190050125122, + "learning_rate": 0.0001717908787541713, + "loss": 3.483, + "step": 640 + }, + { + "epoch": 8.548494983277592, + "grad_norm": 0.9661890864372253, + "learning_rate": 0.0001717463848720801, + "loss": 3.2201, + "step": 641 + }, + { + "epoch": 8.561872909698996, + "grad_norm": 0.8655393719673157, + "learning_rate": 0.0001717018909899889, + "loss": 3.4142, + "step": 642 + }, + { + "epoch": 8.575250836120402, + "grad_norm": 0.796047031879425, + "learning_rate": 0.00017165739710789767, + "loss": 3.3558, + "step": 643 + }, + { + "epoch": 8.588628762541806, + "grad_norm": 1.0098161697387695, + "learning_rate": 0.00017161290322580645, + "loss": 3.553, + "step": 644 + }, + { + "epoch": 8.602006688963211, + "grad_norm": 1.1880302429199219, + "learning_rate": 0.00017156840934371526, + "loss": 3.3581, + "step": 645 + }, + { + "epoch": 8.615384615384615, + "grad_norm": 0.9361609220504761, + "learning_rate": 0.00017152391546162402, + "loss": 3.4543, + "step": 646 + }, + { + "epoch": 8.62876254180602, + "grad_norm": 0.8794479966163635, + "learning_rate": 0.00017147942157953283, + "loss": 3.3654, + "step": 647 + }, + { + "epoch": 8.642140468227424, + "grad_norm": 0.9263080954551697, + "learning_rate": 0.0001714349276974416, + "loss": 3.4376, + "step": 648 + }, + { + "epoch": 8.65551839464883, + "grad_norm": 1.0015815496444702, + "learning_rate": 0.0001713904338153504, + "loss": 3.3533, + "step": 649 + }, + { + "epoch": 8.668896321070234, + "grad_norm": 0.8525484204292297, + "learning_rate": 0.00017134593993325918, + "loss": 3.3897, + "step": 650 + }, + { + "epoch": 8.68227424749164, + "grad_norm": 0.7196484804153442, + "learning_rate": 0.00017130144605116799, + "loss": 3.2428, + "step": 651 + }, + { + "epoch": 8.695652173913043, + "grad_norm": 0.8779593706130981, + "learning_rate": 0.00017125695216907677, + "loss": 3.5471, + "step": 652 + }, + { + "epoch": 8.709030100334449, + "grad_norm": 0.9256909489631653, + "learning_rate": 0.00017121245828698555, + "loss": 3.1776, + "step": 653 + }, + { + "epoch": 8.722408026755852, + "grad_norm": 0.7774620652198792, + "learning_rate": 0.00017116796440489433, + "loss": 3.4836, + "step": 654 + }, + { + "epoch": 8.735785953177258, + "grad_norm": 0.8112596273422241, + "learning_rate": 0.00017112347052280314, + "loss": 3.5194, + "step": 655 + }, + { + "epoch": 8.749163879598662, + "grad_norm": 0.7350602746009827, + "learning_rate": 0.0001710789766407119, + "loss": 3.2248, + "step": 656 + }, + { + "epoch": 8.762541806020067, + "grad_norm": 0.8231781125068665, + "learning_rate": 0.0001710344827586207, + "loss": 3.4659, + "step": 657 + }, + { + "epoch": 8.775919732441471, + "grad_norm": 0.8921564221382141, + "learning_rate": 0.0001709899888765295, + "loss": 3.2712, + "step": 658 + }, + { + "epoch": 8.789297658862877, + "grad_norm": 0.8921830058097839, + "learning_rate": 0.00017094549499443827, + "loss": 3.5071, + "step": 659 + }, + { + "epoch": 8.80267558528428, + "grad_norm": 0.7809077501296997, + "learning_rate": 0.00017090100111234705, + "loss": 3.6639, + "step": 660 + }, + { + "epoch": 8.816053511705686, + "grad_norm": 0.9431234002113342, + "learning_rate": 0.00017085650723025586, + "loss": 3.2795, + "step": 661 + }, + { + "epoch": 8.82943143812709, + "grad_norm": 0.9707314968109131, + "learning_rate": 0.00017081201334816465, + "loss": 3.3395, + "step": 662 + }, + { + "epoch": 8.842809364548495, + "grad_norm": 0.7547470331192017, + "learning_rate": 0.00017076751946607343, + "loss": 3.5316, + "step": 663 + }, + { + "epoch": 8.856187290969899, + "grad_norm": 0.8989250659942627, + "learning_rate": 0.0001707230255839822, + "loss": 3.4029, + "step": 664 + }, + { + "epoch": 8.869565217391305, + "grad_norm": 1.0237400531768799, + "learning_rate": 0.00017067853170189102, + "loss": 3.5014, + "step": 665 + }, + { + "epoch": 8.882943143812708, + "grad_norm": 0.7289263010025024, + "learning_rate": 0.00017063403781979978, + "loss": 3.4211, + "step": 666 + }, + { + "epoch": 8.896321070234114, + "grad_norm": 0.7978695034980774, + "learning_rate": 0.00017058954393770856, + "loss": 3.5421, + "step": 667 + }, + { + "epoch": 8.909698996655518, + "grad_norm": 0.7401835918426514, + "learning_rate": 0.00017054505005561737, + "loss": 3.3419, + "step": 668 + }, + { + "epoch": 8.923076923076923, + "grad_norm": 0.8952983617782593, + "learning_rate": 0.00017050055617352615, + "loss": 3.1322, + "step": 669 + }, + { + "epoch": 8.936454849498327, + "grad_norm": 0.6922047734260559, + "learning_rate": 0.00017045606229143493, + "loss": 3.5872, + "step": 670 + }, + { + "epoch": 8.949832775919733, + "grad_norm": 0.8618977665901184, + "learning_rate": 0.00017041156840934371, + "loss": 3.1366, + "step": 671 + }, + { + "epoch": 8.963210702341136, + "grad_norm": 0.7933799624443054, + "learning_rate": 0.00017036707452725252, + "loss": 3.3108, + "step": 672 + }, + { + "epoch": 8.976588628762542, + "grad_norm": 0.718401312828064, + "learning_rate": 0.00017032258064516128, + "loss": 3.3771, + "step": 673 + }, + { + "epoch": 8.989966555183946, + "grad_norm": 0.8096804618835449, + "learning_rate": 0.0001702780867630701, + "loss": 3.3225, + "step": 674 + }, + { + "epoch": 9.0, + "grad_norm": 1.0055694580078125, + "learning_rate": 0.00017023359288097887, + "loss": 3.5545, + "step": 675 + }, + { + "epoch": 9.013377926421406, + "grad_norm": 0.710986852645874, + "learning_rate": 0.00017018909899888765, + "loss": 3.3333, + "step": 676 + }, + { + "epoch": 9.02675585284281, + "grad_norm": 0.672132134437561, + "learning_rate": 0.00017014460511679644, + "loss": 2.9995, + "step": 677 + }, + { + "epoch": 9.040133779264215, + "grad_norm": 0.6752933263778687, + "learning_rate": 0.00017010011123470525, + "loss": 3.3571, + "step": 678 + }, + { + "epoch": 9.053511705685619, + "grad_norm": 0.6553521156311035, + "learning_rate": 0.00017005561735261403, + "loss": 3.3407, + "step": 679 + }, + { + "epoch": 9.066889632107024, + "grad_norm": 0.7492311596870422, + "learning_rate": 0.0001700111234705228, + "loss": 3.325, + "step": 680 + }, + { + "epoch": 9.080267558528428, + "grad_norm": 0.736139714717865, + "learning_rate": 0.0001699666295884316, + "loss": 3.2626, + "step": 681 + }, + { + "epoch": 9.093645484949834, + "grad_norm": 0.7131486535072327, + "learning_rate": 0.0001699221357063404, + "loss": 3.2612, + "step": 682 + }, + { + "epoch": 9.107023411371237, + "grad_norm": 0.7037603855133057, + "learning_rate": 0.00016987764182424916, + "loss": 3.2418, + "step": 683 + }, + { + "epoch": 9.120401337792643, + "grad_norm": 0.685518205165863, + "learning_rate": 0.00016983314794215797, + "loss": 3.4854, + "step": 684 + }, + { + "epoch": 9.133779264214047, + "grad_norm": 0.6050254106521606, + "learning_rate": 0.00016978865406006675, + "loss": 3.2312, + "step": 685 + }, + { + "epoch": 9.147157190635452, + "grad_norm": 0.6932830810546875, + "learning_rate": 0.00016974416017797553, + "loss": 3.4634, + "step": 686 + }, + { + "epoch": 9.160535117056856, + "grad_norm": 0.7055158615112305, + "learning_rate": 0.00016969966629588431, + "loss": 3.1408, + "step": 687 + }, + { + "epoch": 9.173913043478262, + "grad_norm": 0.6887643337249756, + "learning_rate": 0.00016965517241379312, + "loss": 3.0697, + "step": 688 + }, + { + "epoch": 9.187290969899665, + "grad_norm": 0.7201237082481384, + "learning_rate": 0.0001696106785317019, + "loss": 3.303, + "step": 689 + }, + { + "epoch": 9.200668896321071, + "grad_norm": 0.6617894768714905, + "learning_rate": 0.0001695661846496107, + "loss": 3.4846, + "step": 690 + }, + { + "epoch": 9.214046822742475, + "grad_norm": 0.8979818224906921, + "learning_rate": 0.00016952169076751947, + "loss": 3.1898, + "step": 691 + }, + { + "epoch": 9.22742474916388, + "grad_norm": 0.9507981538772583, + "learning_rate": 0.00016947719688542828, + "loss": 3.4748, + "step": 692 + }, + { + "epoch": 9.240802675585284, + "grad_norm": 0.7935391068458557, + "learning_rate": 0.00016943270300333704, + "loss": 3.3661, + "step": 693 + }, + { + "epoch": 9.25418060200669, + "grad_norm": 0.7437114715576172, + "learning_rate": 0.00016938820912124584, + "loss": 3.4407, + "step": 694 + }, + { + "epoch": 9.267558528428093, + "grad_norm": 0.680610179901123, + "learning_rate": 0.00016934371523915463, + "loss": 3.3135, + "step": 695 + }, + { + "epoch": 9.280936454849499, + "grad_norm": 0.846716582775116, + "learning_rate": 0.0001692992213570634, + "loss": 3.3923, + "step": 696 + }, + { + "epoch": 9.294314381270903, + "grad_norm": 0.9567786455154419, + "learning_rate": 0.0001692547274749722, + "loss": 3.1405, + "step": 697 + }, + { + "epoch": 9.307692307692308, + "grad_norm": 0.7509344816207886, + "learning_rate": 0.000169210233592881, + "loss": 3.4031, + "step": 698 + }, + { + "epoch": 9.321070234113712, + "grad_norm": 0.8118243217468262, + "learning_rate": 0.00016916573971078978, + "loss": 3.317, + "step": 699 + }, + { + "epoch": 9.334448160535118, + "grad_norm": 0.7445617318153381, + "learning_rate": 0.00016912124582869857, + "loss": 3.3989, + "step": 700 + }, + { + "epoch": 9.347826086956522, + "grad_norm": 0.7520869970321655, + "learning_rate": 0.00016907675194660735, + "loss": 3.08, + "step": 701 + }, + { + "epoch": 9.361204013377927, + "grad_norm": 0.7466426491737366, + "learning_rate": 0.00016903225806451616, + "loss": 3.3338, + "step": 702 + }, + { + "epoch": 9.37458193979933, + "grad_norm": 0.7595514059066772, + "learning_rate": 0.0001689877641824249, + "loss": 3.08, + "step": 703 + }, + { + "epoch": 9.387959866220736, + "grad_norm": 0.713771641254425, + "learning_rate": 0.00016894327030033372, + "loss": 3.236, + "step": 704 + }, + { + "epoch": 9.40133779264214, + "grad_norm": 0.670863687992096, + "learning_rate": 0.0001688987764182425, + "loss": 3.4491, + "step": 705 + }, + { + "epoch": 9.414715719063546, + "grad_norm": 0.8842789530754089, + "learning_rate": 0.0001688542825361513, + "loss": 3.3444, + "step": 706 + }, + { + "epoch": 9.42809364548495, + "grad_norm": 0.8298172950744629, + "learning_rate": 0.00016880978865406007, + "loss": 3.3818, + "step": 707 + }, + { + "epoch": 9.441471571906355, + "grad_norm": 0.7407504320144653, + "learning_rate": 0.00016876529477196885, + "loss": 3.2294, + "step": 708 + }, + { + "epoch": 9.454849498327759, + "grad_norm": 0.6642070412635803, + "learning_rate": 0.00016872080088987766, + "loss": 3.3009, + "step": 709 + }, + { + "epoch": 9.468227424749164, + "grad_norm": 0.7627503275871277, + "learning_rate": 0.00016867630700778642, + "loss": 3.3486, + "step": 710 + }, + { + "epoch": 9.481605351170568, + "grad_norm": 0.7307603359222412, + "learning_rate": 0.00016863181312569523, + "loss": 3.0257, + "step": 711 + }, + { + "epoch": 9.494983277591974, + "grad_norm": 0.7932866215705872, + "learning_rate": 0.000168587319243604, + "loss": 3.4191, + "step": 712 + }, + { + "epoch": 9.508361204013378, + "grad_norm": 0.7457575798034668, + "learning_rate": 0.0001685428253615128, + "loss": 3.2283, + "step": 713 + }, + { + "epoch": 9.521739130434783, + "grad_norm": 0.6718200445175171, + "learning_rate": 0.00016849833147942157, + "loss": 3.4569, + "step": 714 + }, + { + "epoch": 9.535117056856187, + "grad_norm": 0.8189072608947754, + "learning_rate": 0.00016845383759733038, + "loss": 3.2585, + "step": 715 + }, + { + "epoch": 9.548494983277592, + "grad_norm": 0.6895336508750916, + "learning_rate": 0.00016840934371523917, + "loss": 3.2417, + "step": 716 + }, + { + "epoch": 9.561872909698996, + "grad_norm": 0.723173975944519, + "learning_rate": 0.00016836484983314795, + "loss": 3.3243, + "step": 717 + }, + { + "epoch": 9.575250836120402, + "grad_norm": 0.8354344964027405, + "learning_rate": 0.00016832035595105673, + "loss": 3.3585, + "step": 718 + }, + { + "epoch": 9.588628762541806, + "grad_norm": 0.6736294031143188, + "learning_rate": 0.00016827586206896554, + "loss": 3.188, + "step": 719 + }, + { + "epoch": 9.602006688963211, + "grad_norm": 0.7790263295173645, + "learning_rate": 0.0001682313681868743, + "loss": 3.3171, + "step": 720 + }, + { + "epoch": 9.615384615384615, + "grad_norm": 0.6426937580108643, + "learning_rate": 0.0001681868743047831, + "loss": 3.0854, + "step": 721 + }, + { + "epoch": 9.62876254180602, + "grad_norm": 0.7029106020927429, + "learning_rate": 0.0001681423804226919, + "loss": 3.3629, + "step": 722 + }, + { + "epoch": 9.642140468227424, + "grad_norm": 0.8353022933006287, + "learning_rate": 0.00016809788654060067, + "loss": 3.3715, + "step": 723 + }, + { + "epoch": 9.65551839464883, + "grad_norm": 0.8578335642814636, + "learning_rate": 0.00016805339265850945, + "loss": 3.3554, + "step": 724 + }, + { + "epoch": 9.668896321070234, + "grad_norm": 0.6998556852340698, + "learning_rate": 0.00016800889877641826, + "loss": 3.3043, + "step": 725 + }, + { + "epoch": 9.68227424749164, + "grad_norm": 0.7134855389595032, + "learning_rate": 0.00016796440489432704, + "loss": 3.5856, + "step": 726 + }, + { + "epoch": 9.695652173913043, + "grad_norm": 0.6636050939559937, + "learning_rate": 0.00016791991101223583, + "loss": 3.2156, + "step": 727 + }, + { + "epoch": 9.709030100334449, + "grad_norm": 0.7757130861282349, + "learning_rate": 0.0001678754171301446, + "loss": 3.4974, + "step": 728 + }, + { + "epoch": 9.722408026755852, + "grad_norm": 0.770648181438446, + "learning_rate": 0.00016783092324805342, + "loss": 3.3251, + "step": 729 + }, + { + "epoch": 9.735785953177258, + "grad_norm": 0.7728201746940613, + "learning_rate": 0.00016778642936596217, + "loss": 3.2666, + "step": 730 + }, + { + "epoch": 9.749163879598662, + "grad_norm": 0.8277239203453064, + "learning_rate": 0.00016774193548387098, + "loss": 3.3867, + "step": 731 + }, + { + "epoch": 9.762541806020067, + "grad_norm": 0.6534886360168457, + "learning_rate": 0.00016769744160177977, + "loss": 3.175, + "step": 732 + }, + { + "epoch": 9.775919732441471, + "grad_norm": 0.8508428335189819, + "learning_rate": 0.00016765294771968855, + "loss": 3.2084, + "step": 733 + }, + { + "epoch": 9.789297658862877, + "grad_norm": 0.7656721472740173, + "learning_rate": 0.00016760845383759733, + "loss": 3.1426, + "step": 734 + }, + { + "epoch": 9.80267558528428, + "grad_norm": 0.9495553970336914, + "learning_rate": 0.00016756395995550614, + "loss": 3.1623, + "step": 735 + }, + { + "epoch": 9.816053511705686, + "grad_norm": 0.7998641729354858, + "learning_rate": 0.00016751946607341492, + "loss": 3.3893, + "step": 736 + }, + { + "epoch": 9.82943143812709, + "grad_norm": 0.8124551177024841, + "learning_rate": 0.0001674749721913237, + "loss": 3.2012, + "step": 737 + }, + { + "epoch": 9.842809364548495, + "grad_norm": 0.6332049369812012, + "learning_rate": 0.0001674304783092325, + "loss": 3.3384, + "step": 738 + }, + { + "epoch": 9.856187290969899, + "grad_norm": 0.7114555835723877, + "learning_rate": 0.0001673859844271413, + "loss": 3.0802, + "step": 739 + }, + { + "epoch": 9.869565217391305, + "grad_norm": 0.7175182700157166, + "learning_rate": 0.00016734149054505005, + "loss": 3.2572, + "step": 740 + }, + { + "epoch": 9.882943143812708, + "grad_norm": 0.7724816799163818, + "learning_rate": 0.00016729699666295886, + "loss": 3.1078, + "step": 741 + }, + { + "epoch": 9.896321070234114, + "grad_norm": 0.7834901213645935, + "learning_rate": 0.00016725250278086764, + "loss": 3.2513, + "step": 742 + }, + { + "epoch": 9.909698996655518, + "grad_norm": 0.663495659828186, + "learning_rate": 0.00016720800889877643, + "loss": 3.327, + "step": 743 + }, + { + "epoch": 9.923076923076923, + "grad_norm": 0.7828975319862366, + "learning_rate": 0.0001671635150166852, + "loss": 3.609, + "step": 744 + }, + { + "epoch": 9.936454849498327, + "grad_norm": 0.6747825145721436, + "learning_rate": 0.00016711902113459402, + "loss": 3.4479, + "step": 745 + }, + { + "epoch": 9.949832775919733, + "grad_norm": 0.7816379070281982, + "learning_rate": 0.0001670745272525028, + "loss": 3.2369, + "step": 746 + }, + { + "epoch": 9.963210702341136, + "grad_norm": 0.7011098265647888, + "learning_rate": 0.00016703003337041158, + "loss": 2.9103, + "step": 747 + }, + { + "epoch": 9.976588628762542, + "grad_norm": 0.7165176868438721, + "learning_rate": 0.00016698553948832036, + "loss": 3.1669, + "step": 748 + }, + { + "epoch": 9.989966555183946, + "grad_norm": 0.766315758228302, + "learning_rate": 0.00016694104560622915, + "loss": 3.1138, + "step": 749 + }, + { + "epoch": 10.0, + "grad_norm": 0.876315176486969, + "learning_rate": 0.00016689655172413793, + "loss": 3.4851, + "step": 750 + }, + { + "epoch": 10.013377926421406, + "grad_norm": 0.807686984539032, + "learning_rate": 0.0001668520578420467, + "loss": 3.193, + "step": 751 + }, + { + "epoch": 10.02675585284281, + "grad_norm": 0.7085704803466797, + "learning_rate": 0.00016680756395995552, + "loss": 3.4797, + "step": 752 + }, + { + "epoch": 10.040133779264215, + "grad_norm": 0.7119605541229248, + "learning_rate": 0.0001667630700778643, + "loss": 3.1713, + "step": 753 + }, + { + "epoch": 10.053511705685619, + "grad_norm": 0.6569423675537109, + "learning_rate": 0.00016671857619577309, + "loss": 3.1661, + "step": 754 + }, + { + "epoch": 10.066889632107024, + "grad_norm": 0.8173550367355347, + "learning_rate": 0.00016667408231368187, + "loss": 2.8467, + "step": 755 + }, + { + "epoch": 10.080267558528428, + "grad_norm": 0.7261365056037903, + "learning_rate": 0.00016662958843159068, + "loss": 3.3679, + "step": 756 + }, + { + "epoch": 10.093645484949834, + "grad_norm": 0.7997227311134338, + "learning_rate": 0.00016658509454949943, + "loss": 3.0985, + "step": 757 + }, + { + "epoch": 10.107023411371237, + "grad_norm": 0.653391420841217, + "learning_rate": 0.00016654060066740824, + "loss": 3.2156, + "step": 758 + }, + { + "epoch": 10.120401337792643, + "grad_norm": 0.6799002289772034, + "learning_rate": 0.00016649610678531703, + "loss": 3.3302, + "step": 759 + }, + { + "epoch": 10.133779264214047, + "grad_norm": 0.6444498896598816, + "learning_rate": 0.0001664516129032258, + "loss": 3.2813, + "step": 760 + }, + { + "epoch": 10.147157190635452, + "grad_norm": 1.064769983291626, + "learning_rate": 0.0001664071190211346, + "loss": 3.1852, + "step": 761 + }, + { + "epoch": 10.160535117056856, + "grad_norm": 0.6534339189529419, + "learning_rate": 0.0001663626251390434, + "loss": 3.1563, + "step": 762 + }, + { + "epoch": 10.173913043478262, + "grad_norm": 0.6909127235412598, + "learning_rate": 0.00016631813125695218, + "loss": 3.2728, + "step": 763 + }, + { + "epoch": 10.187290969899665, + "grad_norm": 0.6549767851829529, + "learning_rate": 0.00016627363737486096, + "loss": 3.0491, + "step": 764 + }, + { + "epoch": 10.200668896321071, + "grad_norm": 0.678054928779602, + "learning_rate": 0.00016622914349276975, + "loss": 3.4807, + "step": 765 + }, + { + "epoch": 10.214046822742475, + "grad_norm": 0.613358199596405, + "learning_rate": 0.00016618464961067856, + "loss": 3.1746, + "step": 766 + }, + { + "epoch": 10.22742474916388, + "grad_norm": 0.6624737977981567, + "learning_rate": 0.0001661401557285873, + "loss": 2.8528, + "step": 767 + }, + { + "epoch": 10.240802675585284, + "grad_norm": 0.65067458152771, + "learning_rate": 0.00016609566184649612, + "loss": 3.1843, + "step": 768 + }, + { + "epoch": 10.25418060200669, + "grad_norm": 0.6192435622215271, + "learning_rate": 0.0001660511679644049, + "loss": 3.3162, + "step": 769 + }, + { + "epoch": 10.267558528428093, + "grad_norm": 0.6456341743469238, + "learning_rate": 0.00016600667408231369, + "loss": 3.2302, + "step": 770 + }, + { + "epoch": 10.280936454849499, + "grad_norm": 2.357724189758301, + "learning_rate": 0.00016596218020022247, + "loss": 3.2741, + "step": 771 + }, + { + "epoch": 10.294314381270903, + "grad_norm": 0.6833475828170776, + "learning_rate": 0.00016591768631813128, + "loss": 3.1516, + "step": 772 + }, + { + "epoch": 10.307692307692308, + "grad_norm": 0.5557199716567993, + "learning_rate": 0.00016587319243604006, + "loss": 3.281, + "step": 773 + }, + { + "epoch": 10.321070234113712, + "grad_norm": 0.6617905497550964, + "learning_rate": 0.00016582869855394884, + "loss": 3.375, + "step": 774 + }, + { + "epoch": 10.334448160535118, + "grad_norm": 0.5671921372413635, + "learning_rate": 0.00016578420467185762, + "loss": 3.4335, + "step": 775 + }, + { + "epoch": 10.347826086956522, + "grad_norm": 0.8487278819084167, + "learning_rate": 0.00016573971078976643, + "loss": 3.143, + "step": 776 + }, + { + "epoch": 10.361204013377927, + "grad_norm": 0.6489982604980469, + "learning_rate": 0.0001656952169076752, + "loss": 3.3258, + "step": 777 + }, + { + "epoch": 10.37458193979933, + "grad_norm": 0.8773537278175354, + "learning_rate": 0.000165650723025584, + "loss": 3.0466, + "step": 778 + }, + { + "epoch": 10.387959866220736, + "grad_norm": 0.5961865782737732, + "learning_rate": 0.00016560622914349278, + "loss": 3.3417, + "step": 779 + }, + { + "epoch": 10.40133779264214, + "grad_norm": 0.6149600148200989, + "learning_rate": 0.00016556173526140156, + "loss": 3.0622, + "step": 780 + }, + { + "epoch": 10.414715719063546, + "grad_norm": 0.7591158151626587, + "learning_rate": 0.00016551724137931035, + "loss": 3.2078, + "step": 781 + }, + { + "epoch": 10.42809364548495, + "grad_norm": 0.7915151119232178, + "learning_rate": 0.00016547274749721916, + "loss": 3.3082, + "step": 782 + }, + { + "epoch": 10.441471571906355, + "grad_norm": 0.8709903359413147, + "learning_rate": 0.00016542825361512794, + "loss": 3.3073, + "step": 783 + }, + { + "epoch": 10.454849498327759, + "grad_norm": 0.6593959331512451, + "learning_rate": 0.00016538375973303672, + "loss": 3.1574, + "step": 784 + }, + { + "epoch": 10.468227424749164, + "grad_norm": 0.8101013898849487, + "learning_rate": 0.0001653392658509455, + "loss": 3.3631, + "step": 785 + }, + { + "epoch": 10.481605351170568, + "grad_norm": 0.8200273513793945, + "learning_rate": 0.0001652947719688543, + "loss": 3.0447, + "step": 786 + }, + { + "epoch": 10.494983277591974, + "grad_norm": 0.7304090857505798, + "learning_rate": 0.00016525027808676307, + "loss": 3.3382, + "step": 787 + }, + { + "epoch": 10.508361204013378, + "grad_norm": 0.7059088349342346, + "learning_rate": 0.00016520578420467188, + "loss": 3.085, + "step": 788 + }, + { + "epoch": 10.521739130434783, + "grad_norm": 0.6664522886276245, + "learning_rate": 0.00016516129032258066, + "loss": 3.3198, + "step": 789 + }, + { + "epoch": 10.535117056856187, + "grad_norm": 0.9230799078941345, + "learning_rate": 0.00016511679644048944, + "loss": 3.2502, + "step": 790 + }, + { + "epoch": 10.548494983277592, + "grad_norm": 0.6974027752876282, + "learning_rate": 0.00016507230255839822, + "loss": 3.2432, + "step": 791 + }, + { + "epoch": 10.561872909698996, + "grad_norm": 0.7186788320541382, + "learning_rate": 0.000165027808676307, + "loss": 3.2273, + "step": 792 + }, + { + "epoch": 10.575250836120402, + "grad_norm": 0.6168047189712524, + "learning_rate": 0.00016498331479421582, + "loss": 3.2319, + "step": 793 + }, + { + "epoch": 10.588628762541806, + "grad_norm": 0.6219142079353333, + "learning_rate": 0.00016493882091212457, + "loss": 2.9733, + "step": 794 + }, + { + "epoch": 10.602006688963211, + "grad_norm": 0.573359489440918, + "learning_rate": 0.00016489432703003338, + "loss": 3.214, + "step": 795 + }, + { + "epoch": 10.615384615384615, + "grad_norm": 0.678263783454895, + "learning_rate": 0.00016484983314794216, + "loss": 3.2331, + "step": 796 + }, + { + "epoch": 10.62876254180602, + "grad_norm": 0.6593761444091797, + "learning_rate": 0.00016480533926585095, + "loss": 3.3605, + "step": 797 + }, + { + "epoch": 10.642140468227424, + "grad_norm": 0.8732627034187317, + "learning_rate": 0.00016476084538375973, + "loss": 3.3531, + "step": 798 + }, + { + "epoch": 10.65551839464883, + "grad_norm": 0.7198925614356995, + "learning_rate": 0.00016471635150166854, + "loss": 3.2927, + "step": 799 + }, + { + "epoch": 10.668896321070234, + "grad_norm": 0.7275107502937317, + "learning_rate": 0.00016467185761957732, + "loss": 3.3695, + "step": 800 + }, + { + "epoch": 10.68227424749164, + "grad_norm": 0.7077828049659729, + "learning_rate": 0.0001646273637374861, + "loss": 2.846, + "step": 801 + }, + { + "epoch": 10.695652173913043, + "grad_norm": 0.7579251527786255, + "learning_rate": 0.00016458286985539488, + "loss": 3.1917, + "step": 802 + }, + { + "epoch": 10.709030100334449, + "grad_norm": 0.7607265114784241, + "learning_rate": 0.0001645383759733037, + "loss": 3.2919, + "step": 803 + }, + { + "epoch": 10.722408026755852, + "grad_norm": 0.7122685313224792, + "learning_rate": 0.00016449388209121245, + "loss": 3.7108, + "step": 804 + }, + { + "epoch": 10.735785953177258, + "grad_norm": 0.7256726622581482, + "learning_rate": 0.00016444938820912126, + "loss": 3.3209, + "step": 805 + }, + { + "epoch": 10.749163879598662, + "grad_norm": 0.7903631925582886, + "learning_rate": 0.00016440489432703004, + "loss": 3.398, + "step": 806 + }, + { + "epoch": 10.762541806020067, + "grad_norm": 1.78204345703125, + "learning_rate": 0.00016436040044493882, + "loss": 3.1118, + "step": 807 + }, + { + "epoch": 10.775919732441471, + "grad_norm": 0.7647016644477844, + "learning_rate": 0.0001643159065628476, + "loss": 3.1889, + "step": 808 + }, + { + "epoch": 10.789297658862877, + "grad_norm": 0.8039811253547668, + "learning_rate": 0.00016427141268075642, + "loss": 3.0104, + "step": 809 + }, + { + "epoch": 10.80267558528428, + "grad_norm": 0.6011155843734741, + "learning_rate": 0.0001642269187986652, + "loss": 3.0988, + "step": 810 + }, + { + "epoch": 10.816053511705686, + "grad_norm": 0.8137276768684387, + "learning_rate": 0.00016418242491657398, + "loss": 3.4308, + "step": 811 + }, + { + "epoch": 10.82943143812709, + "grad_norm": 0.6501771807670593, + "learning_rate": 0.00016413793103448276, + "loss": 3.1405, + "step": 812 + }, + { + "epoch": 10.842809364548495, + "grad_norm": 0.678032636642456, + "learning_rate": 0.00016409343715239157, + "loss": 3.2243, + "step": 813 + }, + { + "epoch": 10.856187290969899, + "grad_norm": 0.6830305457115173, + "learning_rate": 0.00016404894327030033, + "loss": 3.2685, + "step": 814 + }, + { + "epoch": 10.869565217391305, + "grad_norm": 0.7482068538665771, + "learning_rate": 0.00016400444938820914, + "loss": 3.3363, + "step": 815 + }, + { + "epoch": 10.882943143812708, + "grad_norm": 0.6592227816581726, + "learning_rate": 0.00016395995550611792, + "loss": 3.2914, + "step": 816 + }, + { + "epoch": 10.896321070234114, + "grad_norm": 0.7520759105682373, + "learning_rate": 0.0001639154616240267, + "loss": 3.1371, + "step": 817 + }, + { + "epoch": 10.909698996655518, + "grad_norm": 0.6802201271057129, + "learning_rate": 0.00016387096774193548, + "loss": 3.2925, + "step": 818 + }, + { + "epoch": 10.923076923076923, + "grad_norm": 0.7528939247131348, + "learning_rate": 0.0001638264738598443, + "loss": 3.2147, + "step": 819 + }, + { + "epoch": 10.936454849498327, + "grad_norm": 0.7070727348327637, + "learning_rate": 0.00016378197997775308, + "loss": 3.2649, + "step": 820 + }, + { + "epoch": 10.949832775919733, + "grad_norm": 0.6121620535850525, + "learning_rate": 0.00016373748609566186, + "loss": 3.3999, + "step": 821 + }, + { + "epoch": 10.963210702341136, + "grad_norm": 0.7355679273605347, + "learning_rate": 0.00016369299221357064, + "loss": 3.2561, + "step": 822 + }, + { + "epoch": 10.976588628762542, + "grad_norm": 0.7294445037841797, + "learning_rate": 0.00016364849833147945, + "loss": 2.7019, + "step": 823 + }, + { + "epoch": 10.989966555183946, + "grad_norm": 0.8628729581832886, + "learning_rate": 0.0001636040044493882, + "loss": 3.3655, + "step": 824 + }, + { + "epoch": 11.0, + "grad_norm": 0.8784325122833252, + "learning_rate": 0.00016355951056729702, + "loss": 3.504, + "step": 825 + }, + { + "epoch": 11.013377926421406, + "grad_norm": 0.6880869269371033, + "learning_rate": 0.0001635150166852058, + "loss": 3.1878, + "step": 826 + }, + { + "epoch": 11.02675585284281, + "grad_norm": 0.5625393390655518, + "learning_rate": 0.00016347052280311458, + "loss": 3.2757, + "step": 827 + }, + { + "epoch": 11.040133779264215, + "grad_norm": 0.5854038596153259, + "learning_rate": 0.00016342602892102336, + "loss": 3.0994, + "step": 828 + }, + { + "epoch": 11.053511705685619, + "grad_norm": 0.6682130098342896, + "learning_rate": 0.00016338153503893217, + "loss": 3.1935, + "step": 829 + }, + { + "epoch": 11.066889632107024, + "grad_norm": 0.6216278076171875, + "learning_rate": 0.00016333704115684095, + "loss": 3.1273, + "step": 830 + }, + { + "epoch": 11.080267558528428, + "grad_norm": 0.61285001039505, + "learning_rate": 0.0001632925472747497, + "loss": 3.2377, + "step": 831 + }, + { + "epoch": 11.093645484949834, + "grad_norm": 0.6559188365936279, + "learning_rate": 0.00016324805339265852, + "loss": 3.1084, + "step": 832 + }, + { + "epoch": 11.107023411371237, + "grad_norm": 0.6322848200798035, + "learning_rate": 0.0001632035595105673, + "loss": 3.3178, + "step": 833 + }, + { + "epoch": 11.120401337792643, + "grad_norm": 0.6306194067001343, + "learning_rate": 0.00016315906562847608, + "loss": 3.2012, + "step": 834 + }, + { + "epoch": 11.133779264214047, + "grad_norm": 0.6923161149024963, + "learning_rate": 0.00016311457174638487, + "loss": 2.9328, + "step": 835 + }, + { + "epoch": 11.147157190635452, + "grad_norm": 0.6900002360343933, + "learning_rate": 0.00016307007786429368, + "loss": 3.1436, + "step": 836 + }, + { + "epoch": 11.160535117056856, + "grad_norm": 0.817669153213501, + "learning_rate": 0.00016302558398220246, + "loss": 3.0935, + "step": 837 + }, + { + "epoch": 11.173913043478262, + "grad_norm": 0.7544119954109192, + "learning_rate": 0.00016298109010011124, + "loss": 3.0646, + "step": 838 + }, + { + "epoch": 11.187290969899665, + "grad_norm": 0.7996231913566589, + "learning_rate": 0.00016293659621802002, + "loss": 3.1975, + "step": 839 + }, + { + "epoch": 11.200668896321071, + "grad_norm": 0.6186792850494385, + "learning_rate": 0.00016289210233592883, + "loss": 3.31, + "step": 840 + }, + { + "epoch": 11.214046822742475, + "grad_norm": 0.6926666498184204, + "learning_rate": 0.0001628476084538376, + "loss": 3.0765, + "step": 841 + }, + { + "epoch": 11.22742474916388, + "grad_norm": 0.7475588917732239, + "learning_rate": 0.0001628031145717464, + "loss": 3.1743, + "step": 842 + }, + { + "epoch": 11.240802675585284, + "grad_norm": 0.5520989298820496, + "learning_rate": 0.00016275862068965518, + "loss": 3.4243, + "step": 843 + }, + { + "epoch": 11.25418060200669, + "grad_norm": 0.6556730270385742, + "learning_rate": 0.00016271412680756396, + "loss": 3.3293, + "step": 844 + }, + { + "epoch": 11.267558528428093, + "grad_norm": 0.6509199738502502, + "learning_rate": 0.00016266963292547274, + "loss": 2.999, + "step": 845 + }, + { + "epoch": 11.280936454849499, + "grad_norm": 0.6254273653030396, + "learning_rate": 0.00016262513904338155, + "loss": 3.1869, + "step": 846 + }, + { + "epoch": 11.294314381270903, + "grad_norm": 0.7454530000686646, + "learning_rate": 0.00016258064516129034, + "loss": 3.3694, + "step": 847 + }, + { + "epoch": 11.307692307692308, + "grad_norm": 0.7563592791557312, + "learning_rate": 0.00016253615127919912, + "loss": 3.0057, + "step": 848 + }, + { + "epoch": 11.321070234113712, + "grad_norm": 0.6986783742904663, + "learning_rate": 0.0001624916573971079, + "loss": 2.9893, + "step": 849 + }, + { + "epoch": 11.334448160535118, + "grad_norm": 0.7260631322860718, + "learning_rate": 0.0001624471635150167, + "loss": 3.1733, + "step": 850 + }, + { + "epoch": 11.347826086956522, + "grad_norm": 0.7522863745689392, + "learning_rate": 0.00016240266963292547, + "loss": 2.9829, + "step": 851 + }, + { + "epoch": 11.361204013377927, + "grad_norm": 0.7290140986442566, + "learning_rate": 0.00016235817575083428, + "loss": 3.1887, + "step": 852 + }, + { + "epoch": 11.37458193979933, + "grad_norm": 0.6470169425010681, + "learning_rate": 0.00016231368186874306, + "loss": 3.2537, + "step": 853 + }, + { + "epoch": 11.387959866220736, + "grad_norm": 0.863742470741272, + "learning_rate": 0.00016226918798665184, + "loss": 3.3443, + "step": 854 + }, + { + "epoch": 11.40133779264214, + "grad_norm": 0.7363939881324768, + "learning_rate": 0.00016222469410456062, + "loss": 3.3653, + "step": 855 + }, + { + "epoch": 11.414715719063546, + "grad_norm": 0.6548926830291748, + "learning_rate": 0.00016218020022246943, + "loss": 3.0373, + "step": 856 + }, + { + "epoch": 11.42809364548495, + "grad_norm": 0.8087872862815857, + "learning_rate": 0.00016213570634037821, + "loss": 3.0118, + "step": 857 + }, + { + "epoch": 11.441471571906355, + "grad_norm": 0.677811324596405, + "learning_rate": 0.000162091212458287, + "loss": 3.0339, + "step": 858 + }, + { + "epoch": 11.454849498327759, + "grad_norm": 0.6907945275306702, + "learning_rate": 0.00016204671857619578, + "loss": 2.9496, + "step": 859 + }, + { + "epoch": 11.468227424749164, + "grad_norm": 0.6940027475357056, + "learning_rate": 0.0001620022246941046, + "loss": 3.2825, + "step": 860 + }, + { + "epoch": 11.481605351170568, + "grad_norm": 0.7132136225700378, + "learning_rate": 0.00016195773081201334, + "loss": 3.1271, + "step": 861 + }, + { + "epoch": 11.494983277591974, + "grad_norm": 0.5997372269630432, + "learning_rate": 0.00016191323692992215, + "loss": 3.1292, + "step": 862 + }, + { + "epoch": 11.508361204013378, + "grad_norm": 0.6468494534492493, + "learning_rate": 0.00016186874304783094, + "loss": 3.32, + "step": 863 + }, + { + "epoch": 11.521739130434783, + "grad_norm": 0.5792532563209534, + "learning_rate": 0.00016182424916573972, + "loss": 3.3657, + "step": 864 + }, + { + "epoch": 11.535117056856187, + "grad_norm": 0.8242068290710449, + "learning_rate": 0.0001617797552836485, + "loss": 3.0094, + "step": 865 + }, + { + "epoch": 11.548494983277592, + "grad_norm": 0.9260333776473999, + "learning_rate": 0.0001617352614015573, + "loss": 3.1123, + "step": 866 + }, + { + "epoch": 11.561872909698996, + "grad_norm": 0.6337956786155701, + "learning_rate": 0.0001616907675194661, + "loss": 3.1761, + "step": 867 + }, + { + "epoch": 11.575250836120402, + "grad_norm": 0.6010364294052124, + "learning_rate": 0.00016164627363737487, + "loss": 3.2987, + "step": 868 + }, + { + "epoch": 11.588628762541806, + "grad_norm": 0.7492111921310425, + "learning_rate": 0.00016160177975528366, + "loss": 2.9085, + "step": 869 + }, + { + "epoch": 11.602006688963211, + "grad_norm": 0.6329553127288818, + "learning_rate": 0.00016155728587319247, + "loss": 3.1841, + "step": 870 + }, + { + "epoch": 11.615384615384615, + "grad_norm": 0.768527626991272, + "learning_rate": 0.00016151279199110122, + "loss": 3.0061, + "step": 871 + }, + { + "epoch": 11.62876254180602, + "grad_norm": 0.6333640813827515, + "learning_rate": 0.00016146829810901003, + "loss": 3.3046, + "step": 872 + }, + { + "epoch": 11.642140468227424, + "grad_norm": 0.7457571625709534, + "learning_rate": 0.00016142380422691881, + "loss": 3.2728, + "step": 873 + }, + { + "epoch": 11.65551839464883, + "grad_norm": 0.6389586925506592, + "learning_rate": 0.0001613793103448276, + "loss": 3.419, + "step": 874 + }, + { + "epoch": 11.668896321070234, + "grad_norm": 0.8885436058044434, + "learning_rate": 0.00016133481646273638, + "loss": 3.0938, + "step": 875 + }, + { + "epoch": 11.68227424749164, + "grad_norm": 0.7936431765556335, + "learning_rate": 0.00016129032258064516, + "loss": 3.305, + "step": 876 + }, + { + "epoch": 11.695652173913043, + "grad_norm": 0.6133994460105896, + "learning_rate": 0.00016124582869855397, + "loss": 3.3474, + "step": 877 + }, + { + "epoch": 11.709030100334449, + "grad_norm": 0.6638192534446716, + "learning_rate": 0.00016120133481646273, + "loss": 3.1418, + "step": 878 + }, + { + "epoch": 11.722408026755852, + "grad_norm": 0.6820496320724487, + "learning_rate": 0.00016115684093437154, + "loss": 3.1421, + "step": 879 + }, + { + "epoch": 11.735785953177258, + "grad_norm": 0.6057732105255127, + "learning_rate": 0.00016111234705228032, + "loss": 3.091, + "step": 880 + }, + { + "epoch": 11.749163879598662, + "grad_norm": 0.6267048716545105, + "learning_rate": 0.0001610678531701891, + "loss": 3.1289, + "step": 881 + }, + { + "epoch": 11.762541806020067, + "grad_norm": 0.6822847723960876, + "learning_rate": 0.00016102335928809788, + "loss": 3.157, + "step": 882 + }, + { + "epoch": 11.775919732441471, + "grad_norm": 0.6809714436531067, + "learning_rate": 0.0001609788654060067, + "loss": 3.0806, + "step": 883 + }, + { + "epoch": 11.789297658862877, + "grad_norm": 0.5546092391014099, + "learning_rate": 0.00016093437152391547, + "loss": 3.1853, + "step": 884 + }, + { + "epoch": 11.80267558528428, + "grad_norm": 0.7375029921531677, + "learning_rate": 0.00016088987764182426, + "loss": 3.1287, + "step": 885 + }, + { + "epoch": 11.816053511705686, + "grad_norm": 0.6246840953826904, + "learning_rate": 0.00016084538375973304, + "loss": 3.1331, + "step": 886 + }, + { + "epoch": 11.82943143812709, + "grad_norm": 0.6088026762008667, + "learning_rate": 0.00016080088987764185, + "loss": 3.3781, + "step": 887 + }, + { + "epoch": 11.842809364548495, + "grad_norm": 0.7996237874031067, + "learning_rate": 0.0001607563959955506, + "loss": 3.0161, + "step": 888 + }, + { + "epoch": 11.856187290969899, + "grad_norm": 0.6221441626548767, + "learning_rate": 0.0001607119021134594, + "loss": 3.1491, + "step": 889 + }, + { + "epoch": 11.869565217391305, + "grad_norm": 0.6276041269302368, + "learning_rate": 0.0001606674082313682, + "loss": 3.2575, + "step": 890 + }, + { + "epoch": 11.882943143812708, + "grad_norm": 0.6394500136375427, + "learning_rate": 0.00016062291434927698, + "loss": 3.3437, + "step": 891 + }, + { + "epoch": 11.896321070234114, + "grad_norm": 0.7674509286880493, + "learning_rate": 0.00016057842046718576, + "loss": 3.1995, + "step": 892 + }, + { + "epoch": 11.909698996655518, + "grad_norm": 0.7502215504646301, + "learning_rate": 0.00016053392658509457, + "loss": 3.3129, + "step": 893 + }, + { + "epoch": 11.923076923076923, + "grad_norm": 0.6078189611434937, + "learning_rate": 0.00016048943270300335, + "loss": 3.1623, + "step": 894 + }, + { + "epoch": 11.936454849498327, + "grad_norm": 0.6113708019256592, + "learning_rate": 0.00016044493882091213, + "loss": 3.6063, + "step": 895 + }, + { + "epoch": 11.949832775919733, + "grad_norm": 0.6606878638267517, + "learning_rate": 0.00016040044493882092, + "loss": 3.3216, + "step": 896 + }, + { + "epoch": 11.963210702341136, + "grad_norm": 0.7055956125259399, + "learning_rate": 0.00016035595105672973, + "loss": 3.2006, + "step": 897 + }, + { + "epoch": 11.976588628762542, + "grad_norm": 0.7424116730690002, + "learning_rate": 0.00016031145717463848, + "loss": 3.1298, + "step": 898 + }, + { + "epoch": 11.989966555183946, + "grad_norm": 0.6675695180892944, + "learning_rate": 0.0001602669632925473, + "loss": 3.1116, + "step": 899 + }, + { + "epoch": 12.0, + "grad_norm": 0.9356181621551514, + "learning_rate": 0.00016022246941045607, + "loss": 3.2461, + "step": 900 + }, + { + "epoch": 12.013377926421406, + "grad_norm": 0.8539507985115051, + "learning_rate": 0.00016017797552836486, + "loss": 3.0671, + "step": 901 + }, + { + "epoch": 12.02675585284281, + "grad_norm": 0.573266327381134, + "learning_rate": 0.00016013348164627364, + "loss": 3.0789, + "step": 902 + }, + { + "epoch": 12.040133779264215, + "grad_norm": 0.5849746465682983, + "learning_rate": 0.00016008898776418245, + "loss": 3.1623, + "step": 903 + }, + { + "epoch": 12.053511705685619, + "grad_norm": 0.6523334980010986, + "learning_rate": 0.00016004449388209123, + "loss": 3.0243, + "step": 904 + }, + { + "epoch": 12.066889632107024, + "grad_norm": 0.6428223848342896, + "learning_rate": 0.00016, + "loss": 3.1241, + "step": 905 + }, + { + "epoch": 12.080267558528428, + "grad_norm": 0.5881937742233276, + "learning_rate": 0.0001599555061179088, + "loss": 3.1744, + "step": 906 + }, + { + "epoch": 12.093645484949834, + "grad_norm": 0.7523583173751831, + "learning_rate": 0.0001599110122358176, + "loss": 3.0821, + "step": 907 + }, + { + "epoch": 12.107023411371237, + "grad_norm": 0.6120390295982361, + "learning_rate": 0.00015986651835372636, + "loss": 3.16, + "step": 908 + }, + { + "epoch": 12.120401337792643, + "grad_norm": 0.6486253142356873, + "learning_rate": 0.00015982202447163517, + "loss": 3.1402, + "step": 909 + }, + { + "epoch": 12.133779264214047, + "grad_norm": 0.6322839260101318, + "learning_rate": 0.00015977753058954395, + "loss": 3.1325, + "step": 910 + }, + { + "epoch": 12.147157190635452, + "grad_norm": 0.5858875513076782, + "learning_rate": 0.00015973303670745273, + "loss": 3.1705, + "step": 911 + }, + { + "epoch": 12.160535117056856, + "grad_norm": 0.6495780348777771, + "learning_rate": 0.00015968854282536152, + "loss": 3.1977, + "step": 912 + }, + { + "epoch": 12.173913043478262, + "grad_norm": 0.6483474969863892, + "learning_rate": 0.00015964404894327033, + "loss": 2.9101, + "step": 913 + }, + { + "epoch": 12.187290969899665, + "grad_norm": 0.6021110415458679, + "learning_rate": 0.0001595995550611791, + "loss": 3.2106, + "step": 914 + }, + { + "epoch": 12.200668896321071, + "grad_norm": 0.5101630687713623, + "learning_rate": 0.00015955506117908786, + "loss": 3.1316, + "step": 915 + }, + { + "epoch": 12.214046822742475, + "grad_norm": 0.6226193904876709, + "learning_rate": 0.00015951056729699667, + "loss": 3.0897, + "step": 916 + }, + { + "epoch": 12.22742474916388, + "grad_norm": 0.6268473267555237, + "learning_rate": 0.00015946607341490546, + "loss": 3.2036, + "step": 917 + }, + { + "epoch": 12.240802675585284, + "grad_norm": 0.7825391292572021, + "learning_rate": 0.00015942157953281424, + "loss": 3.0082, + "step": 918 + }, + { + "epoch": 12.25418060200669, + "grad_norm": 0.7881148457527161, + "learning_rate": 0.00015937708565072302, + "loss": 3.0779, + "step": 919 + }, + { + "epoch": 12.267558528428093, + "grad_norm": 0.6725586652755737, + "learning_rate": 0.00015933259176863183, + "loss": 3.04, + "step": 920 + }, + { + "epoch": 12.280936454849499, + "grad_norm": 0.5831689238548279, + "learning_rate": 0.0001592880978865406, + "loss": 3.3319, + "step": 921 + }, + { + "epoch": 12.294314381270903, + "grad_norm": 0.6057907342910767, + "learning_rate": 0.0001592436040044494, + "loss": 3.0869, + "step": 922 + }, + { + "epoch": 12.307692307692308, + "grad_norm": 0.771857500076294, + "learning_rate": 0.00015919911012235818, + "loss": 3.1472, + "step": 923 + }, + { + "epoch": 12.321070234113712, + "grad_norm": 0.7447528839111328, + "learning_rate": 0.000159154616240267, + "loss": 3.174, + "step": 924 + }, + { + "epoch": 12.334448160535118, + "grad_norm": 0.5772632956504822, + "learning_rate": 0.00015911012235817574, + "loss": 3.1767, + "step": 925 + }, + { + "epoch": 12.347826086956522, + "grad_norm": 0.6952618956565857, + "learning_rate": 0.00015906562847608455, + "loss": 3.1597, + "step": 926 + }, + { + "epoch": 12.361204013377927, + "grad_norm": 0.600922703742981, + "learning_rate": 0.00015902113459399333, + "loss": 3.3612, + "step": 927 + }, + { + "epoch": 12.37458193979933, + "grad_norm": 0.7571472525596619, + "learning_rate": 0.00015897664071190212, + "loss": 2.9405, + "step": 928 + }, + { + "epoch": 12.387959866220736, + "grad_norm": 0.9343985915184021, + "learning_rate": 0.0001589321468298109, + "loss": 3.2886, + "step": 929 + }, + { + "epoch": 12.40133779264214, + "grad_norm": 0.7046729922294617, + "learning_rate": 0.0001588876529477197, + "loss": 3.4421, + "step": 930 + }, + { + "epoch": 12.414715719063546, + "grad_norm": 0.7591777443885803, + "learning_rate": 0.0001588431590656285, + "loss": 3.005, + "step": 931 + }, + { + "epoch": 12.42809364548495, + "grad_norm": 0.6508903503417969, + "learning_rate": 0.00015879866518353727, + "loss": 2.8554, + "step": 932 + }, + { + "epoch": 12.441471571906355, + "grad_norm": 0.6557784676551819, + "learning_rate": 0.00015875417130144606, + "loss": 3.3268, + "step": 933 + }, + { + "epoch": 12.454849498327759, + "grad_norm": 0.6941578984260559, + "learning_rate": 0.00015870967741935487, + "loss": 3.2088, + "step": 934 + }, + { + "epoch": 12.468227424749164, + "grad_norm": 0.6824263334274292, + "learning_rate": 0.00015866518353726362, + "loss": 3.0897, + "step": 935 + }, + { + "epoch": 12.481605351170568, + "grad_norm": 0.7324599027633667, + "learning_rate": 0.00015862068965517243, + "loss": 3.1443, + "step": 936 + }, + { + "epoch": 12.494983277591974, + "grad_norm": 0.577022135257721, + "learning_rate": 0.0001585761957730812, + "loss": 3.0896, + "step": 937 + }, + { + "epoch": 12.508361204013378, + "grad_norm": 0.6165060997009277, + "learning_rate": 0.00015853170189099, + "loss": 2.7546, + "step": 938 + }, + { + "epoch": 12.521739130434783, + "grad_norm": 0.561906635761261, + "learning_rate": 0.00015848720800889878, + "loss": 3.4192, + "step": 939 + }, + { + "epoch": 12.535117056856187, + "grad_norm": 0.5894923806190491, + "learning_rate": 0.0001584427141268076, + "loss": 3.0388, + "step": 940 + }, + { + "epoch": 12.548494983277592, + "grad_norm": 0.6261674761772156, + "learning_rate": 0.00015839822024471637, + "loss": 3.0705, + "step": 941 + }, + { + "epoch": 12.561872909698996, + "grad_norm": 0.695101261138916, + "learning_rate": 0.00015835372636262515, + "loss": 3.1684, + "step": 942 + }, + { + "epoch": 12.575250836120402, + "grad_norm": 0.6176817417144775, + "learning_rate": 0.00015830923248053393, + "loss": 3.0708, + "step": 943 + }, + { + "epoch": 12.588628762541806, + "grad_norm": 0.6548507213592529, + "learning_rate": 0.00015826473859844274, + "loss": 3.1569, + "step": 944 + }, + { + "epoch": 12.602006688963211, + "grad_norm": 0.6046382188796997, + "learning_rate": 0.0001582202447163515, + "loss": 3.2479, + "step": 945 + }, + { + "epoch": 12.615384615384615, + "grad_norm": 0.7103912234306335, + "learning_rate": 0.0001581757508342603, + "loss": 3.127, + "step": 946 + }, + { + "epoch": 12.62876254180602, + "grad_norm": 0.7131765484809875, + "learning_rate": 0.0001581312569521691, + "loss": 3.0975, + "step": 947 + }, + { + "epoch": 12.642140468227424, + "grad_norm": 0.6442859768867493, + "learning_rate": 0.00015808676307007787, + "loss": 3.2885, + "step": 948 + }, + { + "epoch": 12.65551839464883, + "grad_norm": 0.6430050134658813, + "learning_rate": 0.00015804226918798666, + "loss": 3.0397, + "step": 949 + }, + { + "epoch": 12.668896321070234, + "grad_norm": 0.6894303560256958, + "learning_rate": 0.00015799777530589546, + "loss": 3.0756, + "step": 950 + }, + { + "epoch": 12.68227424749164, + "grad_norm": 0.7319600582122803, + "learning_rate": 0.00015795328142380425, + "loss": 3.0457, + "step": 951 + }, + { + "epoch": 12.695652173913043, + "grad_norm": 0.6445140838623047, + "learning_rate": 0.00015790878754171303, + "loss": 2.9828, + "step": 952 + }, + { + "epoch": 12.709030100334449, + "grad_norm": 0.7522070407867432, + "learning_rate": 0.0001578642936596218, + "loss": 2.8942, + "step": 953 + }, + { + "epoch": 12.722408026755852, + "grad_norm": 0.7962691783905029, + "learning_rate": 0.00015781979977753062, + "loss": 3.0985, + "step": 954 + }, + { + "epoch": 12.735785953177258, + "grad_norm": 0.6391687393188477, + "learning_rate": 0.00015777530589543938, + "loss": 3.1752, + "step": 955 + }, + { + "epoch": 12.749163879598662, + "grad_norm": 0.7632976174354553, + "learning_rate": 0.00015773081201334816, + "loss": 3.3505, + "step": 956 + }, + { + "epoch": 12.762541806020067, + "grad_norm": 0.7491022944450378, + "learning_rate": 0.00015768631813125697, + "loss": 3.0721, + "step": 957 + }, + { + "epoch": 12.775919732441471, + "grad_norm": 0.6163421273231506, + "learning_rate": 0.00015764182424916572, + "loss": 3.3242, + "step": 958 + }, + { + "epoch": 12.789297658862877, + "grad_norm": 0.6831198334693909, + "learning_rate": 0.00015759733036707453, + "loss": 3.1409, + "step": 959 + }, + { + "epoch": 12.80267558528428, + "grad_norm": 0.812300980091095, + "learning_rate": 0.00015755283648498332, + "loss": 2.9606, + "step": 960 + }, + { + "epoch": 12.816053511705686, + "grad_norm": 0.6904334425926208, + "learning_rate": 0.00015750834260289213, + "loss": 3.0398, + "step": 961 + }, + { + "epoch": 12.82943143812709, + "grad_norm": 0.6349720358848572, + "learning_rate": 0.00015746384872080088, + "loss": 3.1033, + "step": 962 + }, + { + "epoch": 12.842809364548495, + "grad_norm": 0.6837566494941711, + "learning_rate": 0.0001574193548387097, + "loss": 3.2353, + "step": 963 + }, + { + "epoch": 12.856187290969899, + "grad_norm": 0.5852749943733215, + "learning_rate": 0.00015737486095661847, + "loss": 3.0972, + "step": 964 + }, + { + "epoch": 12.869565217391305, + "grad_norm": 0.6641372442245483, + "learning_rate": 0.00015733036707452725, + "loss": 3.2243, + "step": 965 + }, + { + "epoch": 12.882943143812708, + "grad_norm": 0.6613900065422058, + "learning_rate": 0.00015728587319243604, + "loss": 3.1263, + "step": 966 + }, + { + "epoch": 12.896321070234114, + "grad_norm": 0.6126120090484619, + "learning_rate": 0.00015724137931034485, + "loss": 3.069, + "step": 967 + }, + { + "epoch": 12.909698996655518, + "grad_norm": 0.6764604449272156, + "learning_rate": 0.0001571968854282536, + "loss": 3.1397, + "step": 968 + }, + { + "epoch": 12.923076923076923, + "grad_norm": 0.6447578072547913, + "learning_rate": 0.0001571523915461624, + "loss": 3.1839, + "step": 969 + }, + { + "epoch": 12.936454849498327, + "grad_norm": 0.5872016549110413, + "learning_rate": 0.0001571078976640712, + "loss": 3.3144, + "step": 970 + }, + { + "epoch": 12.949832775919733, + "grad_norm": 0.626276969909668, + "learning_rate": 0.00015706340378198, + "loss": 3.1295, + "step": 971 + }, + { + "epoch": 12.963210702341136, + "grad_norm": 0.6829231381416321, + "learning_rate": 0.00015701890989988876, + "loss": 3.2261, + "step": 972 + }, + { + "epoch": 12.976588628762542, + "grad_norm": 0.6197345852851868, + "learning_rate": 0.00015697441601779757, + "loss": 3.1117, + "step": 973 + }, + { + "epoch": 12.989966555183946, + "grad_norm": 0.6137062907218933, + "learning_rate": 0.00015692992213570635, + "loss": 3.1548, + "step": 974 + }, + { + "epoch": 13.0, + "grad_norm": 0.7483121752738953, + "learning_rate": 0.00015688542825361513, + "loss": 3.4944, + "step": 975 + }, + { + "epoch": 13.013377926421406, + "grad_norm": 0.6102525591850281, + "learning_rate": 0.00015684093437152392, + "loss": 3.1403, + "step": 976 + }, + { + "epoch": 13.02675585284281, + "grad_norm": 0.7258747220039368, + "learning_rate": 0.00015679644048943272, + "loss": 2.8474, + "step": 977 + }, + { + "epoch": 13.040133779264215, + "grad_norm": 0.6891087293624878, + "learning_rate": 0.00015675194660734148, + "loss": 2.9722, + "step": 978 + }, + { + "epoch": 13.053511705685619, + "grad_norm": 0.6320910453796387, + "learning_rate": 0.0001567074527252503, + "loss": 2.9807, + "step": 979 + }, + { + "epoch": 13.066889632107024, + "grad_norm": 0.8684266209602356, + "learning_rate": 0.00015666295884315907, + "loss": 2.8448, + "step": 980 + }, + { + "epoch": 13.080267558528428, + "grad_norm": 0.7126099467277527, + "learning_rate": 0.00015661846496106788, + "loss": 3.1915, + "step": 981 + }, + { + "epoch": 13.093645484949834, + "grad_norm": 0.7001529335975647, + "learning_rate": 0.00015657397107897664, + "loss": 3.2017, + "step": 982 + }, + { + "epoch": 13.107023411371237, + "grad_norm": 0.7901191711425781, + "learning_rate": 0.00015652947719688545, + "loss": 3.1273, + "step": 983 + }, + { + "epoch": 13.120401337792643, + "grad_norm": 0.5769410729408264, + "learning_rate": 0.00015648498331479423, + "loss": 2.8582, + "step": 984 + }, + { + "epoch": 13.133779264214047, + "grad_norm": 0.5969700813293457, + "learning_rate": 0.000156440489432703, + "loss": 3.118, + "step": 985 + }, + { + "epoch": 13.147157190635452, + "grad_norm": 0.5789377093315125, + "learning_rate": 0.0001563959955506118, + "loss": 2.8121, + "step": 986 + }, + { + "epoch": 13.160535117056856, + "grad_norm": 0.7945278882980347, + "learning_rate": 0.0001563515016685206, + "loss": 2.9212, + "step": 987 + }, + { + "epoch": 13.173913043478262, + "grad_norm": 0.60884690284729, + "learning_rate": 0.00015630700778642936, + "loss": 3.1993, + "step": 988 + }, + { + "epoch": 13.187290969899665, + "grad_norm": 0.616142213344574, + "learning_rate": 0.00015626251390433817, + "loss": 3.107, + "step": 989 + }, + { + "epoch": 13.200668896321071, + "grad_norm": 0.6428812146186829, + "learning_rate": 0.00015621802002224695, + "loss": 3.1401, + "step": 990 + }, + { + "epoch": 13.214046822742475, + "grad_norm": 0.5723693370819092, + "learning_rate": 0.00015617352614015576, + "loss": 3.0474, + "step": 991 + }, + { + "epoch": 13.22742474916388, + "grad_norm": 0.5820907950401306, + "learning_rate": 0.00015612903225806451, + "loss": 3.2211, + "step": 992 + }, + { + "epoch": 13.240802675585284, + "grad_norm": 0.555957555770874, + "learning_rate": 0.00015608453837597332, + "loss": 3.1306, + "step": 993 + }, + { + "epoch": 13.25418060200669, + "grad_norm": 0.528698205947876, + "learning_rate": 0.0001560400444938821, + "loss": 3.0989, + "step": 994 + }, + { + "epoch": 13.267558528428093, + "grad_norm": 0.5959749817848206, + "learning_rate": 0.0001559955506117909, + "loss": 2.9128, + "step": 995 + }, + { + "epoch": 13.280936454849499, + "grad_norm": 0.6702240705490112, + "learning_rate": 0.00015595105672969967, + "loss": 2.9969, + "step": 996 + }, + { + "epoch": 13.294314381270903, + "grad_norm": 0.5363825559616089, + "learning_rate": 0.00015590656284760845, + "loss": 3.2027, + "step": 997 + }, + { + "epoch": 13.307692307692308, + "grad_norm": 0.6402661204338074, + "learning_rate": 0.00015586206896551724, + "loss": 3.2733, + "step": 998 + }, + { + "epoch": 13.321070234113712, + "grad_norm": 0.700517475605011, + "learning_rate": 0.00015581757508342602, + "loss": 3.2629, + "step": 999 + }, + { + "epoch": 13.334448160535118, + "grad_norm": 0.5500949621200562, + "learning_rate": 0.00015577308120133483, + "loss": 3.172, + "step": 1000 + }, + { + "epoch": 13.347826086956522, + "grad_norm": 0.6199147701263428, + "learning_rate": 0.0001557285873192436, + "loss": 3.0556, + "step": 1001 + }, + { + "epoch": 13.361204013377927, + "grad_norm": 0.5900529026985168, + "learning_rate": 0.0001556840934371524, + "loss": 3.1248, + "step": 1002 + }, + { + "epoch": 13.37458193979933, + "grad_norm": 0.7272413372993469, + "learning_rate": 0.00015563959955506118, + "loss": 3.1142, + "step": 1003 + }, + { + "epoch": 13.387959866220736, + "grad_norm": 0.6461951732635498, + "learning_rate": 0.00015559510567296998, + "loss": 3.1145, + "step": 1004 + }, + { + "epoch": 13.40133779264214, + "grad_norm": 0.5750373005867004, + "learning_rate": 0.00015555061179087874, + "loss": 3.2117, + "step": 1005 + }, + { + "epoch": 13.414715719063546, + "grad_norm": 0.6486302614212036, + "learning_rate": 0.00015550611790878755, + "loss": 2.9018, + "step": 1006 + }, + { + "epoch": 13.42809364548495, + "grad_norm": 0.6897476315498352, + "learning_rate": 0.00015546162402669633, + "loss": 3.2987, + "step": 1007 + }, + { + "epoch": 13.441471571906355, + "grad_norm": 0.5997576713562012, + "learning_rate": 0.00015541713014460511, + "loss": 2.974, + "step": 1008 + }, + { + "epoch": 13.454849498327759, + "grad_norm": 0.6484793424606323, + "learning_rate": 0.0001553726362625139, + "loss": 2.9827, + "step": 1009 + }, + { + "epoch": 13.468227424749164, + "grad_norm": 0.562312126159668, + "learning_rate": 0.0001553281423804227, + "loss": 3.2243, + "step": 1010 + }, + { + "epoch": 13.481605351170568, + "grad_norm": 0.7257137298583984, + "learning_rate": 0.0001552836484983315, + "loss": 3.1081, + "step": 1011 + }, + { + "epoch": 13.494983277591974, + "grad_norm": 0.7201404571533203, + "learning_rate": 0.00015523915461624027, + "loss": 3.1468, + "step": 1012 + }, + { + "epoch": 13.508361204013378, + "grad_norm": 0.666539192199707, + "learning_rate": 0.00015519466073414905, + "loss": 3.0081, + "step": 1013 + }, + { + "epoch": 13.521739130434783, + "grad_norm": 0.6867642998695374, + "learning_rate": 0.00015515016685205786, + "loss": 2.7889, + "step": 1014 + }, + { + "epoch": 13.535117056856187, + "grad_norm": 0.5799785256385803, + "learning_rate": 0.00015510567296996662, + "loss": 3.2995, + "step": 1015 + }, + { + "epoch": 13.548494983277592, + "grad_norm": 0.6155371069908142, + "learning_rate": 0.00015506117908787543, + "loss": 3.2227, + "step": 1016 + }, + { + "epoch": 13.561872909698996, + "grad_norm": 0.7604040503501892, + "learning_rate": 0.0001550166852057842, + "loss": 3.0384, + "step": 1017 + }, + { + "epoch": 13.575250836120402, + "grad_norm": 0.8445917963981628, + "learning_rate": 0.000154972191323693, + "loss": 2.9566, + "step": 1018 + }, + { + "epoch": 13.588628762541806, + "grad_norm": 0.7978566288948059, + "learning_rate": 0.00015492769744160177, + "loss": 3.0175, + "step": 1019 + }, + { + "epoch": 13.602006688963211, + "grad_norm": 0.5899437069892883, + "learning_rate": 0.00015488320355951058, + "loss": 3.2418, + "step": 1020 + }, + { + "epoch": 13.615384615384615, + "grad_norm": 0.7204627990722656, + "learning_rate": 0.00015483870967741937, + "loss": 3.1568, + "step": 1021 + }, + { + "epoch": 13.62876254180602, + "grad_norm": 0.6504855155944824, + "learning_rate": 0.00015479421579532815, + "loss": 2.8952, + "step": 1022 + }, + { + "epoch": 13.642140468227424, + "grad_norm": 0.8101251125335693, + "learning_rate": 0.00015474972191323693, + "loss": 3.1975, + "step": 1023 + }, + { + "epoch": 13.65551839464883, + "grad_norm": 0.6161416172981262, + "learning_rate": 0.00015470522803114574, + "loss": 3.1565, + "step": 1024 + }, + { + "epoch": 13.668896321070234, + "grad_norm": 0.6131258606910706, + "learning_rate": 0.0001546607341490545, + "loss": 3.0382, + "step": 1025 + }, + { + "epoch": 13.68227424749164, + "grad_norm": 0.8008583784103394, + "learning_rate": 0.0001546162402669633, + "loss": 2.995, + "step": 1026 + }, + { + "epoch": 13.695652173913043, + "grad_norm": 0.7101227045059204, + "learning_rate": 0.0001545717463848721, + "loss": 3.0704, + "step": 1027 + }, + { + "epoch": 13.709030100334449, + "grad_norm": 0.7988458275794983, + "learning_rate": 0.00015452725250278087, + "loss": 3.1424, + "step": 1028 + }, + { + "epoch": 13.722408026755852, + "grad_norm": 0.6013655662536621, + "learning_rate": 0.00015448275862068965, + "loss": 2.9986, + "step": 1029 + }, + { + "epoch": 13.735785953177258, + "grad_norm": 0.6368236541748047, + "learning_rate": 0.00015443826473859846, + "loss": 3.0193, + "step": 1030 + }, + { + "epoch": 13.749163879598662, + "grad_norm": 0.8222694396972656, + "learning_rate": 0.00015439377085650724, + "loss": 3.0888, + "step": 1031 + }, + { + "epoch": 13.762541806020067, + "grad_norm": 0.7270404696464539, + "learning_rate": 0.00015434927697441603, + "loss": 3.0929, + "step": 1032 + }, + { + "epoch": 13.775919732441471, + "grad_norm": 0.7292355298995972, + "learning_rate": 0.0001543047830923248, + "loss": 2.7676, + "step": 1033 + }, + { + "epoch": 13.789297658862877, + "grad_norm": 0.6662157773971558, + "learning_rate": 0.00015426028921023362, + "loss": 3.1984, + "step": 1034 + }, + { + "epoch": 13.80267558528428, + "grad_norm": 0.6350163817405701, + "learning_rate": 0.00015421579532814237, + "loss": 3.3042, + "step": 1035 + }, + { + "epoch": 13.816053511705686, + "grad_norm": 0.5999907851219177, + "learning_rate": 0.00015417130144605118, + "loss": 3.0983, + "step": 1036 + }, + { + "epoch": 13.82943143812709, + "grad_norm": 0.5942257642745972, + "learning_rate": 0.00015412680756395997, + "loss": 3.0474, + "step": 1037 + }, + { + "epoch": 13.842809364548495, + "grad_norm": 0.662589430809021, + "learning_rate": 0.00015408231368186875, + "loss": 2.9251, + "step": 1038 + }, + { + "epoch": 13.856187290969899, + "grad_norm": 0.5817089080810547, + "learning_rate": 0.00015403781979977753, + "loss": 3.0716, + "step": 1039 + }, + { + "epoch": 13.869565217391305, + "grad_norm": 0.6019257307052612, + "learning_rate": 0.0001539933259176863, + "loss": 3.1754, + "step": 1040 + }, + { + "epoch": 13.882943143812708, + "grad_norm": 0.6301860213279724, + "learning_rate": 0.00015394883203559512, + "loss": 3.2066, + "step": 1041 + }, + { + "epoch": 13.896321070234114, + "grad_norm": 0.6468888521194458, + "learning_rate": 0.00015390433815350388, + "loss": 3.3001, + "step": 1042 + }, + { + "epoch": 13.909698996655518, + "grad_norm": 0.6510801911354065, + "learning_rate": 0.0001538598442714127, + "loss": 3.2198, + "step": 1043 + }, + { + "epoch": 13.923076923076923, + "grad_norm": 0.5692014694213867, + "learning_rate": 0.00015381535038932147, + "loss": 3.3257, + "step": 1044 + }, + { + "epoch": 13.936454849498327, + "grad_norm": 0.594219982624054, + "learning_rate": 0.00015377085650723025, + "loss": 2.9918, + "step": 1045 + }, + { + "epoch": 13.949832775919733, + "grad_norm": 0.6501769423484802, + "learning_rate": 0.00015372636262513903, + "loss": 2.9653, + "step": 1046 + }, + { + "epoch": 13.963210702341136, + "grad_norm": 0.6310623288154602, + "learning_rate": 0.00015368186874304784, + "loss": 3.19, + "step": 1047 + }, + { + "epoch": 13.976588628762542, + "grad_norm": 0.5795436501502991, + "learning_rate": 0.00015363737486095663, + "loss": 3.272, + "step": 1048 + }, + { + "epoch": 13.989966555183946, + "grad_norm": 0.5421392917633057, + "learning_rate": 0.0001535928809788654, + "loss": 3.2109, + "step": 1049 + }, + { + "epoch": 14.0, + "grad_norm": 0.833959698677063, + "learning_rate": 0.0001535483870967742, + "loss": 3.3017, + "step": 1050 + }, + { + "epoch": 14.013377926421406, + "grad_norm": 0.6856208443641663, + "learning_rate": 0.000153503893214683, + "loss": 3.0784, + "step": 1051 + }, + { + "epoch": 14.02675585284281, + "grad_norm": 0.5841811895370483, + "learning_rate": 0.00015345939933259176, + "loss": 2.848, + "step": 1052 + }, + { + "epoch": 14.040133779264215, + "grad_norm": 0.557906985282898, + "learning_rate": 0.00015341490545050057, + "loss": 3.1564, + "step": 1053 + }, + { + "epoch": 14.053511705685619, + "grad_norm": 0.5468619465827942, + "learning_rate": 0.00015337041156840935, + "loss": 3.1237, + "step": 1054 + }, + { + "epoch": 14.066889632107024, + "grad_norm": 0.7213225364685059, + "learning_rate": 0.00015332591768631813, + "loss": 2.8993, + "step": 1055 + }, + { + "epoch": 14.080267558528428, + "grad_norm": 0.7413175106048584, + "learning_rate": 0.0001532814238042269, + "loss": 3.0028, + "step": 1056 + }, + { + "epoch": 14.093645484949834, + "grad_norm": 0.6072244644165039, + "learning_rate": 0.00015323692992213572, + "loss": 2.8534, + "step": 1057 + }, + { + "epoch": 14.107023411371237, + "grad_norm": 0.683262288570404, + "learning_rate": 0.0001531924360400445, + "loss": 2.8982, + "step": 1058 + }, + { + "epoch": 14.120401337792643, + "grad_norm": 0.5880157351493835, + "learning_rate": 0.0001531479421579533, + "loss": 2.9567, + "step": 1059 + }, + { + "epoch": 14.133779264214047, + "grad_norm": 0.7519298195838928, + "learning_rate": 0.00015310344827586207, + "loss": 2.9615, + "step": 1060 + }, + { + "epoch": 14.147157190635452, + "grad_norm": 0.7747945189476013, + "learning_rate": 0.00015305895439377088, + "loss": 2.7823, + "step": 1061 + }, + { + "epoch": 14.160535117056856, + "grad_norm": 0.6560395956039429, + "learning_rate": 0.00015301446051167963, + "loss": 3.1791, + "step": 1062 + }, + { + "epoch": 14.173913043478262, + "grad_norm": 0.6388076543807983, + "learning_rate": 0.00015296996662958844, + "loss": 2.9868, + "step": 1063 + }, + { + "epoch": 14.187290969899665, + "grad_norm": 0.7349525690078735, + "learning_rate": 0.00015292547274749723, + "loss": 2.9291, + "step": 1064 + }, + { + "epoch": 14.200668896321071, + "grad_norm": 0.7184433341026306, + "learning_rate": 0.000152880978865406, + "loss": 2.7889, + "step": 1065 + }, + { + "epoch": 14.214046822742475, + "grad_norm": 0.6776930093765259, + "learning_rate": 0.0001528364849833148, + "loss": 3.1572, + "step": 1066 + }, + { + "epoch": 14.22742474916388, + "grad_norm": 0.818756103515625, + "learning_rate": 0.0001527919911012236, + "loss": 2.767, + "step": 1067 + }, + { + "epoch": 14.240802675585284, + "grad_norm": 0.6005066633224487, + "learning_rate": 0.00015274749721913238, + "loss": 3.1796, + "step": 1068 + }, + { + "epoch": 14.25418060200669, + "grad_norm": 0.6367926001548767, + "learning_rate": 0.00015270300333704117, + "loss": 2.9999, + "step": 1069 + }, + { + "epoch": 14.267558528428093, + "grad_norm": 0.6823679208755493, + "learning_rate": 0.00015265850945494995, + "loss": 2.7663, + "step": 1070 + }, + { + "epoch": 14.280936454849499, + "grad_norm": 0.6238808631896973, + "learning_rate": 0.00015261401557285876, + "loss": 3.1908, + "step": 1071 + }, + { + "epoch": 14.294314381270903, + "grad_norm": 0.6983721256256104, + "learning_rate": 0.0001525695216907675, + "loss": 3.0264, + "step": 1072 + }, + { + "epoch": 14.307692307692308, + "grad_norm": 0.7568501234054565, + "learning_rate": 0.00015252502780867632, + "loss": 3.0474, + "step": 1073 + }, + { + "epoch": 14.321070234113712, + "grad_norm": 0.6250051259994507, + "learning_rate": 0.0001524805339265851, + "loss": 3.114, + "step": 1074 + }, + { + "epoch": 14.334448160535118, + "grad_norm": 0.5907386541366577, + "learning_rate": 0.0001524360400444939, + "loss": 3.1823, + "step": 1075 + }, + { + "epoch": 14.347826086956522, + "grad_norm": 0.6719332337379456, + "learning_rate": 0.00015239154616240267, + "loss": 2.994, + "step": 1076 + }, + { + "epoch": 14.361204013377927, + "grad_norm": 0.5911534428596497, + "learning_rate": 0.00015234705228031148, + "loss": 3.4178, + "step": 1077 + }, + { + "epoch": 14.37458193979933, + "grad_norm": 0.8071689009666443, + "learning_rate": 0.00015230255839822026, + "loss": 2.9633, + "step": 1078 + }, + { + "epoch": 14.387959866220736, + "grad_norm": 0.5957038998603821, + "learning_rate": 0.00015225806451612902, + "loss": 3.1126, + "step": 1079 + }, + { + "epoch": 14.40133779264214, + "grad_norm": 0.6604459285736084, + "learning_rate": 0.00015221357063403783, + "loss": 2.9193, + "step": 1080 + }, + { + "epoch": 14.414715719063546, + "grad_norm": 0.626081109046936, + "learning_rate": 0.0001521690767519466, + "loss": 2.9321, + "step": 1081 + }, + { + "epoch": 14.42809364548495, + "grad_norm": 0.5767174959182739, + "learning_rate": 0.0001521245828698554, + "loss": 3.1518, + "step": 1082 + }, + { + "epoch": 14.441471571906355, + "grad_norm": 0.6444874405860901, + "learning_rate": 0.00015208008898776417, + "loss": 3.1948, + "step": 1083 + }, + { + "epoch": 14.454849498327759, + "grad_norm": 0.668171763420105, + "learning_rate": 0.00015203559510567298, + "loss": 3.016, + "step": 1084 + }, + { + "epoch": 14.468227424749164, + "grad_norm": 0.6998944878578186, + "learning_rate": 0.00015199110122358176, + "loss": 2.9558, + "step": 1085 + }, + { + "epoch": 14.481605351170568, + "grad_norm": 0.5896235704421997, + "learning_rate": 0.00015194660734149055, + "loss": 3.0997, + "step": 1086 + }, + { + "epoch": 14.494983277591974, + "grad_norm": 0.6724826097488403, + "learning_rate": 0.00015190211345939933, + "loss": 2.9369, + "step": 1087 + }, + { + "epoch": 14.508361204013378, + "grad_norm": 0.5710486769676208, + "learning_rate": 0.00015185761957730814, + "loss": 3.4724, + "step": 1088 + }, + { + "epoch": 14.521739130434783, + "grad_norm": 0.9997962117195129, + "learning_rate": 0.0001518131256952169, + "loss": 2.9058, + "step": 1089 + }, + { + "epoch": 14.535117056856187, + "grad_norm": 0.668074905872345, + "learning_rate": 0.0001517686318131257, + "loss": 3.2401, + "step": 1090 + }, + { + "epoch": 14.548494983277592, + "grad_norm": 0.6180433630943298, + "learning_rate": 0.00015172413793103449, + "loss": 2.8975, + "step": 1091 + }, + { + "epoch": 14.561872909698996, + "grad_norm": 0.6412661075592041, + "learning_rate": 0.00015167964404894327, + "loss": 2.9882, + "step": 1092 + }, + { + "epoch": 14.575250836120402, + "grad_norm": 0.715288519859314, + "learning_rate": 0.00015163515016685205, + "loss": 3.4004, + "step": 1093 + }, + { + "epoch": 14.588628762541806, + "grad_norm": 0.689164400100708, + "learning_rate": 0.00015159065628476086, + "loss": 3.0705, + "step": 1094 + }, + { + "epoch": 14.602006688963211, + "grad_norm": 0.7713497281074524, + "learning_rate": 0.00015154616240266964, + "loss": 3.159, + "step": 1095 + }, + { + "epoch": 14.615384615384615, + "grad_norm": 0.6725841164588928, + "learning_rate": 0.00015150166852057843, + "loss": 2.8938, + "step": 1096 + }, + { + "epoch": 14.62876254180602, + "grad_norm": 0.658108651638031, + "learning_rate": 0.0001514571746384872, + "loss": 2.8747, + "step": 1097 + }, + { + "epoch": 14.642140468227424, + "grad_norm": 0.5711888074874878, + "learning_rate": 0.00015141268075639602, + "loss": 2.8989, + "step": 1098 + }, + { + "epoch": 14.65551839464883, + "grad_norm": 0.6184161305427551, + "learning_rate": 0.00015136818687430477, + "loss": 3.0904, + "step": 1099 + }, + { + "epoch": 14.668896321070234, + "grad_norm": 0.5937799215316772, + "learning_rate": 0.00015132369299221358, + "loss": 3.1637, + "step": 1100 + }, + { + "epoch": 14.68227424749164, + "grad_norm": 0.591673731803894, + "learning_rate": 0.00015127919911012236, + "loss": 2.9547, + "step": 1101 + }, + { + "epoch": 14.695652173913043, + "grad_norm": 0.7317401170730591, + "learning_rate": 0.00015123470522803115, + "loss": 3.2043, + "step": 1102 + }, + { + "epoch": 14.709030100334449, + "grad_norm": 0.5784003734588623, + "learning_rate": 0.00015119021134593993, + "loss": 3.102, + "step": 1103 + }, + { + "epoch": 14.722408026755852, + "grad_norm": 0.7077385187149048, + "learning_rate": 0.00015114571746384874, + "loss": 3.011, + "step": 1104 + }, + { + "epoch": 14.735785953177258, + "grad_norm": 0.6472675204277039, + "learning_rate": 0.00015110122358175752, + "loss": 3.2075, + "step": 1105 + }, + { + "epoch": 14.749163879598662, + "grad_norm": 0.6789306998252869, + "learning_rate": 0.0001510567296996663, + "loss": 2.9458, + "step": 1106 + }, + { + "epoch": 14.762541806020067, + "grad_norm": 0.6602732539176941, + "learning_rate": 0.00015101223581757509, + "loss": 2.9941, + "step": 1107 + }, + { + "epoch": 14.775919732441471, + "grad_norm": 0.7484832406044006, + "learning_rate": 0.0001509677419354839, + "loss": 3.0358, + "step": 1108 + }, + { + "epoch": 14.789297658862877, + "grad_norm": 0.704139769077301, + "learning_rate": 0.00015092324805339265, + "loss": 3.1006, + "step": 1109 + }, + { + "epoch": 14.80267558528428, + "grad_norm": 0.6545978784561157, + "learning_rate": 0.00015087875417130146, + "loss": 3.0369, + "step": 1110 + }, + { + "epoch": 14.816053511705686, + "grad_norm": 0.5718163847923279, + "learning_rate": 0.00015083426028921024, + "loss": 3.1683, + "step": 1111 + }, + { + "epoch": 14.82943143812709, + "grad_norm": 0.5773367285728455, + "learning_rate": 0.00015078976640711902, + "loss": 3.2753, + "step": 1112 + }, + { + "epoch": 14.842809364548495, + "grad_norm": 0.6617185473442078, + "learning_rate": 0.0001507452725250278, + "loss": 2.9713, + "step": 1113 + }, + { + "epoch": 14.856187290969899, + "grad_norm": 0.6748194098472595, + "learning_rate": 0.00015070077864293662, + "loss": 3.0961, + "step": 1114 + }, + { + "epoch": 14.869565217391305, + "grad_norm": 0.6942034959793091, + "learning_rate": 0.0001506562847608454, + "loss": 3.0778, + "step": 1115 + }, + { + "epoch": 14.882943143812708, + "grad_norm": 1.0203640460968018, + "learning_rate": 0.00015061179087875418, + "loss": 3.0705, + "step": 1116 + }, + { + "epoch": 14.896321070234114, + "grad_norm": 0.5746601223945618, + "learning_rate": 0.00015056729699666296, + "loss": 3.1204, + "step": 1117 + }, + { + "epoch": 14.909698996655518, + "grad_norm": 0.7374005317687988, + "learning_rate": 0.00015052280311457177, + "loss": 3.1289, + "step": 1118 + }, + { + "epoch": 14.923076923076923, + "grad_norm": 0.5524411201477051, + "learning_rate": 0.00015047830923248053, + "loss": 3.2795, + "step": 1119 + }, + { + "epoch": 14.936454849498327, + "grad_norm": 0.7024741768836975, + "learning_rate": 0.0001504338153503893, + "loss": 3.0675, + "step": 1120 + }, + { + "epoch": 14.949832775919733, + "grad_norm": 0.7431137561798096, + "learning_rate": 0.00015038932146829812, + "loss": 3.1222, + "step": 1121 + }, + { + "epoch": 14.963210702341136, + "grad_norm": 0.6568113565444946, + "learning_rate": 0.0001503448275862069, + "loss": 3.1523, + "step": 1122 + }, + { + "epoch": 14.976588628762542, + "grad_norm": 0.6193330883979797, + "learning_rate": 0.00015030033370411569, + "loss": 3.1632, + "step": 1123 + }, + { + "epoch": 14.989966555183946, + "grad_norm": 0.6371363401412964, + "learning_rate": 0.00015025583982202447, + "loss": 3.0525, + "step": 1124 + }, + { + "epoch": 15.0, + "grad_norm": 0.652542233467102, + "learning_rate": 0.00015021134593993328, + "loss": 3.1846, + "step": 1125 + }, + { + "epoch": 15.013377926421406, + "grad_norm": 0.6337831616401672, + "learning_rate": 0.00015016685205784203, + "loss": 2.9292, + "step": 1126 + }, + { + "epoch": 15.02675585284281, + "grad_norm": 0.85350501537323, + "learning_rate": 0.00015012235817575084, + "loss": 2.913, + "step": 1127 + }, + { + "epoch": 15.040133779264215, + "grad_norm": 0.6439313888549805, + "learning_rate": 0.00015007786429365962, + "loss": 3.0809, + "step": 1128 + }, + { + "epoch": 15.053511705685619, + "grad_norm": 0.5232247114181519, + "learning_rate": 0.0001500333704115684, + "loss": 3.1348, + "step": 1129 + }, + { + "epoch": 15.066889632107024, + "grad_norm": 0.6082741618156433, + "learning_rate": 0.0001499888765294772, + "loss": 2.9282, + "step": 1130 + }, + { + "epoch": 15.080267558528428, + "grad_norm": 0.5736444592475891, + "learning_rate": 0.000149944382647386, + "loss": 2.9891, + "step": 1131 + }, + { + "epoch": 15.093645484949834, + "grad_norm": 0.7732790112495422, + "learning_rate": 0.00014989988876529478, + "loss": 3.0461, + "step": 1132 + }, + { + "epoch": 15.107023411371237, + "grad_norm": 0.618357241153717, + "learning_rate": 0.00014985539488320356, + "loss": 3.0296, + "step": 1133 + }, + { + "epoch": 15.120401337792643, + "grad_norm": 0.7245836853981018, + "learning_rate": 0.00014981090100111235, + "loss": 3.0078, + "step": 1134 + }, + { + "epoch": 15.133779264214047, + "grad_norm": 0.6738787293434143, + "learning_rate": 0.00014976640711902116, + "loss": 3.1171, + "step": 1135 + }, + { + "epoch": 15.147157190635452, + "grad_norm": 0.5802761316299438, + "learning_rate": 0.0001497219132369299, + "loss": 3.1403, + "step": 1136 + }, + { + "epoch": 15.160535117056856, + "grad_norm": 0.5941367745399475, + "learning_rate": 0.00014967741935483872, + "loss": 2.9149, + "step": 1137 + }, + { + "epoch": 15.173913043478262, + "grad_norm": 0.8884940147399902, + "learning_rate": 0.0001496329254727475, + "loss": 2.8983, + "step": 1138 + }, + { + "epoch": 15.187290969899665, + "grad_norm": 0.7229192852973938, + "learning_rate": 0.00014958843159065628, + "loss": 2.9774, + "step": 1139 + }, + { + "epoch": 15.200668896321071, + "grad_norm": 0.6714467406272888, + "learning_rate": 0.00014954393770856507, + "loss": 2.9219, + "step": 1140 + }, + { + "epoch": 15.214046822742475, + "grad_norm": 0.6785704493522644, + "learning_rate": 0.00014949944382647388, + "loss": 3.0205, + "step": 1141 + }, + { + "epoch": 15.22742474916388, + "grad_norm": 0.6349677443504333, + "learning_rate": 0.00014945494994438266, + "loss": 3.1601, + "step": 1142 + }, + { + "epoch": 15.240802675585284, + "grad_norm": 0.557123064994812, + "learning_rate": 0.00014941045606229144, + "loss": 2.6297, + "step": 1143 + }, + { + "epoch": 15.25418060200669, + "grad_norm": 0.6714944243431091, + "learning_rate": 0.00014936596218020022, + "loss": 2.7951, + "step": 1144 + }, + { + "epoch": 15.267558528428093, + "grad_norm": 0.6747463345527649, + "learning_rate": 0.00014932146829810903, + "loss": 2.7909, + "step": 1145 + }, + { + "epoch": 15.280936454849499, + "grad_norm": 0.5717387199401855, + "learning_rate": 0.0001492769744160178, + "loss": 3.2896, + "step": 1146 + }, + { + "epoch": 15.294314381270903, + "grad_norm": 0.6589123010635376, + "learning_rate": 0.0001492324805339266, + "loss": 2.8332, + "step": 1147 + }, + { + "epoch": 15.307692307692308, + "grad_norm": 0.6273646950721741, + "learning_rate": 0.00014918798665183538, + "loss": 3.0084, + "step": 1148 + }, + { + "epoch": 15.321070234113712, + "grad_norm": 0.6551377773284912, + "learning_rate": 0.00014914349276974416, + "loss": 2.8147, + "step": 1149 + }, + { + "epoch": 15.334448160535118, + "grad_norm": 0.6751659512519836, + "learning_rate": 0.00014909899888765295, + "loss": 3.1345, + "step": 1150 + }, + { + "epoch": 15.347826086956522, + "grad_norm": 0.677094042301178, + "learning_rate": 0.00014905450500556175, + "loss": 3.1958, + "step": 1151 + }, + { + "epoch": 15.361204013377927, + "grad_norm": 0.6613426804542542, + "learning_rate": 0.00014901001112347054, + "loss": 3.081, + "step": 1152 + }, + { + "epoch": 15.37458193979933, + "grad_norm": 0.7645783424377441, + "learning_rate": 0.00014896551724137932, + "loss": 2.8799, + "step": 1153 + }, + { + "epoch": 15.387959866220736, + "grad_norm": 0.5698953866958618, + "learning_rate": 0.0001489210233592881, + "loss": 3.1691, + "step": 1154 + }, + { + "epoch": 15.40133779264214, + "grad_norm": 0.6581351161003113, + "learning_rate": 0.0001488765294771969, + "loss": 3.2365, + "step": 1155 + }, + { + "epoch": 15.414715719063546, + "grad_norm": 0.7809271812438965, + "learning_rate": 0.00014883203559510567, + "loss": 2.833, + "step": 1156 + }, + { + "epoch": 15.42809364548495, + "grad_norm": 0.6226280927658081, + "learning_rate": 0.00014878754171301448, + "loss": 3.1502, + "step": 1157 + }, + { + "epoch": 15.441471571906355, + "grad_norm": 0.5494824051856995, + "learning_rate": 0.00014874304783092326, + "loss": 3.2195, + "step": 1158 + }, + { + "epoch": 15.454849498327759, + "grad_norm": 0.5729116797447205, + "learning_rate": 0.00014869855394883204, + "loss": 2.9463, + "step": 1159 + }, + { + "epoch": 15.468227424749164, + "grad_norm": 0.6673750877380371, + "learning_rate": 0.00014865406006674082, + "loss": 3.1514, + "step": 1160 + }, + { + "epoch": 15.481605351170568, + "grad_norm": 0.6746686697006226, + "learning_rate": 0.0001486095661846496, + "loss": 3.0088, + "step": 1161 + }, + { + "epoch": 15.494983277591974, + "grad_norm": 0.6898564100265503, + "learning_rate": 0.00014856507230255842, + "loss": 2.8468, + "step": 1162 + }, + { + "epoch": 15.508361204013378, + "grad_norm": 0.7262438535690308, + "learning_rate": 0.00014852057842046717, + "loss": 2.8017, + "step": 1163 + }, + { + "epoch": 15.521739130434783, + "grad_norm": 0.6878666877746582, + "learning_rate": 0.00014847608453837598, + "loss": 2.9486, + "step": 1164 + }, + { + "epoch": 15.535117056856187, + "grad_norm": 0.6375080347061157, + "learning_rate": 0.00014843159065628476, + "loss": 3.0757, + "step": 1165 + }, + { + "epoch": 15.548494983277592, + "grad_norm": 0.6540268063545227, + "learning_rate": 0.00014838709677419355, + "loss": 3.1262, + "step": 1166 + }, + { + "epoch": 15.561872909698996, + "grad_norm": 0.6036689877510071, + "learning_rate": 0.00014834260289210233, + "loss": 3.0607, + "step": 1167 + }, + { + "epoch": 15.575250836120402, + "grad_norm": 0.5899893641471863, + "learning_rate": 0.00014829810901001114, + "loss": 3.1222, + "step": 1168 + }, + { + "epoch": 15.588628762541806, + "grad_norm": 0.7268028259277344, + "learning_rate": 0.00014825361512791992, + "loss": 3.1777, + "step": 1169 + }, + { + "epoch": 15.602006688963211, + "grad_norm": 0.6990141272544861, + "learning_rate": 0.0001482091212458287, + "loss": 3.2142, + "step": 1170 + }, + { + "epoch": 15.615384615384615, + "grad_norm": 0.6009657382965088, + "learning_rate": 0.00014816462736373748, + "loss": 3.141, + "step": 1171 + }, + { + "epoch": 15.62876254180602, + "grad_norm": 0.6287830471992493, + "learning_rate": 0.0001481201334816463, + "loss": 3.0621, + "step": 1172 + }, + { + "epoch": 15.642140468227424, + "grad_norm": 0.6720128655433655, + "learning_rate": 0.00014807563959955505, + "loss": 3.0353, + "step": 1173 + }, + { + "epoch": 15.65551839464883, + "grad_norm": 0.6694427132606506, + "learning_rate": 0.00014803114571746386, + "loss": 3.0171, + "step": 1174 + }, + { + "epoch": 15.668896321070234, + "grad_norm": 0.5630237460136414, + "learning_rate": 0.00014798665183537264, + "loss": 3.1444, + "step": 1175 + }, + { + "epoch": 15.68227424749164, + "grad_norm": 0.7139558792114258, + "learning_rate": 0.00014794215795328142, + "loss": 3.0012, + "step": 1176 + }, + { + "epoch": 15.695652173913043, + "grad_norm": 0.6374551057815552, + "learning_rate": 0.0001478976640711902, + "loss": 2.9123, + "step": 1177 + }, + { + "epoch": 15.709030100334449, + "grad_norm": 0.5957819223403931, + "learning_rate": 0.00014785317018909902, + "loss": 3.09, + "step": 1178 + }, + { + "epoch": 15.722408026755852, + "grad_norm": 0.6083621382713318, + "learning_rate": 0.0001478086763070078, + "loss": 3.0231, + "step": 1179 + }, + { + "epoch": 15.735785953177258, + "grad_norm": 0.6169192790985107, + "learning_rate": 0.00014776418242491658, + "loss": 2.9863, + "step": 1180 + }, + { + "epoch": 15.749163879598662, + "grad_norm": 0.6058081984519958, + "learning_rate": 0.00014771968854282536, + "loss": 3.0261, + "step": 1181 + }, + { + "epoch": 15.762541806020067, + "grad_norm": 0.5816760659217834, + "learning_rate": 0.00014767519466073417, + "loss": 3.1593, + "step": 1182 + }, + { + "epoch": 15.775919732441471, + "grad_norm": 0.6246895790100098, + "learning_rate": 0.00014763070077864293, + "loss": 3.1029, + "step": 1183 + }, + { + "epoch": 15.789297658862877, + "grad_norm": 0.56280517578125, + "learning_rate": 0.00014758620689655174, + "loss": 2.9778, + "step": 1184 + }, + { + "epoch": 15.80267558528428, + "grad_norm": 0.5743212699890137, + "learning_rate": 0.00014754171301446052, + "loss": 2.8799, + "step": 1185 + }, + { + "epoch": 15.816053511705686, + "grad_norm": 0.6163922548294067, + "learning_rate": 0.0001474972191323693, + "loss": 3.0226, + "step": 1186 + }, + { + "epoch": 15.82943143812709, + "grad_norm": 0.5892409682273865, + "learning_rate": 0.00014745272525027808, + "loss": 3.1167, + "step": 1187 + }, + { + "epoch": 15.842809364548495, + "grad_norm": 0.7977785468101501, + "learning_rate": 0.0001474082313681869, + "loss": 2.8427, + "step": 1188 + }, + { + "epoch": 15.856187290969899, + "grad_norm": 0.7396023273468018, + "learning_rate": 0.00014736373748609568, + "loss": 2.7809, + "step": 1189 + }, + { + "epoch": 15.869565217391305, + "grad_norm": 0.58844393491745, + "learning_rate": 0.00014731924360400446, + "loss": 3.0624, + "step": 1190 + }, + { + "epoch": 15.882943143812708, + "grad_norm": 0.6903204321861267, + "learning_rate": 0.00014727474972191324, + "loss": 3.1246, + "step": 1191 + }, + { + "epoch": 15.896321070234114, + "grad_norm": 0.5902391672134399, + "learning_rate": 0.00014723025583982205, + "loss": 3.0505, + "step": 1192 + }, + { + "epoch": 15.909698996655518, + "grad_norm": 0.575752317905426, + "learning_rate": 0.0001471857619577308, + "loss": 2.8508, + "step": 1193 + }, + { + "epoch": 15.923076923076923, + "grad_norm": 0.7248224020004272, + "learning_rate": 0.00014714126807563961, + "loss": 3.1438, + "step": 1194 + }, + { + "epoch": 15.936454849498327, + "grad_norm": 0.5669791102409363, + "learning_rate": 0.0001470967741935484, + "loss": 3.1765, + "step": 1195 + }, + { + "epoch": 15.949832775919733, + "grad_norm": 0.6656806468963623, + "learning_rate": 0.00014705228031145718, + "loss": 3.1456, + "step": 1196 + }, + { + "epoch": 15.963210702341136, + "grad_norm": 0.6073266863822937, + "learning_rate": 0.00014700778642936596, + "loss": 3.1836, + "step": 1197 + }, + { + "epoch": 15.976588628762542, + "grad_norm": 0.8209658861160278, + "learning_rate": 0.00014696329254727477, + "loss": 2.8457, + "step": 1198 + }, + { + "epoch": 15.989966555183946, + "grad_norm": 0.6495081186294556, + "learning_rate": 0.00014691879866518355, + "loss": 3.0161, + "step": 1199 + }, + { + "epoch": 16.0, + "grad_norm": 0.7522635459899902, + "learning_rate": 0.00014687430478309234, + "loss": 2.8293, + "step": 1200 + }, + { + "epoch": 16.013377926421406, + "grad_norm": 0.8024417161941528, + "learning_rate": 0.00014682981090100112, + "loss": 2.909, + "step": 1201 + }, + { + "epoch": 16.02675585284281, + "grad_norm": 0.7147983908653259, + "learning_rate": 0.0001467853170189099, + "loss": 2.7664, + "step": 1202 + }, + { + "epoch": 16.040133779264213, + "grad_norm": 0.602427065372467, + "learning_rate": 0.00014674082313681868, + "loss": 2.8189, + "step": 1203 + }, + { + "epoch": 16.05351170568562, + "grad_norm": 0.7274264097213745, + "learning_rate": 0.00014669632925472747, + "loss": 2.9957, + "step": 1204 + }, + { + "epoch": 16.066889632107024, + "grad_norm": 0.6042147278785706, + "learning_rate": 0.00014665183537263628, + "loss": 2.8925, + "step": 1205 + }, + { + "epoch": 16.08026755852843, + "grad_norm": 0.592339813709259, + "learning_rate": 0.00014660734149054506, + "loss": 3.1463, + "step": 1206 + }, + { + "epoch": 16.093645484949832, + "grad_norm": 0.6099987030029297, + "learning_rate": 0.00014656284760845384, + "loss": 3.2264, + "step": 1207 + }, + { + "epoch": 16.107023411371237, + "grad_norm": 0.9324999451637268, + "learning_rate": 0.00014651835372636262, + "loss": 2.916, + "step": 1208 + }, + { + "epoch": 16.120401337792643, + "grad_norm": 0.6000798344612122, + "learning_rate": 0.00014647385984427143, + "loss": 2.9242, + "step": 1209 + }, + { + "epoch": 16.13377926421405, + "grad_norm": 0.609199047088623, + "learning_rate": 0.0001464293659621802, + "loss": 2.9405, + "step": 1210 + }, + { + "epoch": 16.14715719063545, + "grad_norm": 0.6789796948432922, + "learning_rate": 0.000146384872080089, + "loss": 3.003, + "step": 1211 + }, + { + "epoch": 16.160535117056856, + "grad_norm": 0.6567651629447937, + "learning_rate": 0.00014634037819799778, + "loss": 2.8733, + "step": 1212 + }, + { + "epoch": 16.17391304347826, + "grad_norm": 0.5860549211502075, + "learning_rate": 0.00014629588431590656, + "loss": 3.1429, + "step": 1213 + }, + { + "epoch": 16.187290969899667, + "grad_norm": 0.6623414158821106, + "learning_rate": 0.00014625139043381534, + "loss": 3.1741, + "step": 1214 + }, + { + "epoch": 16.20066889632107, + "grad_norm": 0.8366180062294006, + "learning_rate": 0.00014620689655172415, + "loss": 2.8627, + "step": 1215 + }, + { + "epoch": 16.214046822742475, + "grad_norm": 0.616780698299408, + "learning_rate": 0.00014616240266963294, + "loss": 2.7619, + "step": 1216 + }, + { + "epoch": 16.22742474916388, + "grad_norm": 0.6345306634902954, + "learning_rate": 0.00014611790878754172, + "loss": 2.9104, + "step": 1217 + }, + { + "epoch": 16.240802675585286, + "grad_norm": 0.6326844096183777, + "learning_rate": 0.0001460734149054505, + "loss": 2.8453, + "step": 1218 + }, + { + "epoch": 16.254180602006688, + "grad_norm": 0.5441793203353882, + "learning_rate": 0.0001460289210233593, + "loss": 3.2924, + "step": 1219 + }, + { + "epoch": 16.267558528428093, + "grad_norm": 0.604637086391449, + "learning_rate": 0.00014598442714126807, + "loss": 3.0362, + "step": 1220 + }, + { + "epoch": 16.2809364548495, + "grad_norm": 0.6100621819496155, + "learning_rate": 0.00014593993325917687, + "loss": 2.917, + "step": 1221 + }, + { + "epoch": 16.294314381270905, + "grad_norm": 0.6323224902153015, + "learning_rate": 0.00014589543937708566, + "loss": 3.144, + "step": 1222 + }, + { + "epoch": 16.307692307692307, + "grad_norm": 0.595485270023346, + "learning_rate": 0.00014585094549499444, + "loss": 2.8869, + "step": 1223 + }, + { + "epoch": 16.321070234113712, + "grad_norm": 0.6350538730621338, + "learning_rate": 0.00014580645161290322, + "loss": 2.7253, + "step": 1224 + }, + { + "epoch": 16.334448160535118, + "grad_norm": 0.5804395079612732, + "learning_rate": 0.00014576195773081203, + "loss": 3.1523, + "step": 1225 + }, + { + "epoch": 16.347826086956523, + "grad_norm": 0.5905717015266418, + "learning_rate": 0.00014571746384872081, + "loss": 3.1419, + "step": 1226 + }, + { + "epoch": 16.361204013377925, + "grad_norm": 0.6824894547462463, + "learning_rate": 0.0001456729699666296, + "loss": 2.8953, + "step": 1227 + }, + { + "epoch": 16.37458193979933, + "grad_norm": 0.5840978622436523, + "learning_rate": 0.00014562847608453838, + "loss": 3.1229, + "step": 1228 + }, + { + "epoch": 16.387959866220736, + "grad_norm": 0.7102469801902771, + "learning_rate": 0.0001455839822024472, + "loss": 2.6571, + "step": 1229 + }, + { + "epoch": 16.401337792642142, + "grad_norm": 0.6148349046707153, + "learning_rate": 0.00014553948832035594, + "loss": 3.0084, + "step": 1230 + }, + { + "epoch": 16.414715719063544, + "grad_norm": 0.60859215259552, + "learning_rate": 0.00014549499443826475, + "loss": 3.1348, + "step": 1231 + }, + { + "epoch": 16.42809364548495, + "grad_norm": 0.6059513688087463, + "learning_rate": 0.00014545050055617354, + "loss": 2.9678, + "step": 1232 + }, + { + "epoch": 16.441471571906355, + "grad_norm": 0.6012231707572937, + "learning_rate": 0.00014540600667408232, + "loss": 2.9279, + "step": 1233 + }, + { + "epoch": 16.45484949832776, + "grad_norm": 0.6185587644577026, + "learning_rate": 0.0001453615127919911, + "loss": 2.9922, + "step": 1234 + }, + { + "epoch": 16.468227424749163, + "grad_norm": 0.5989127159118652, + "learning_rate": 0.0001453170189098999, + "loss": 3.0736, + "step": 1235 + }, + { + "epoch": 16.48160535117057, + "grad_norm": 0.673633337020874, + "learning_rate": 0.0001452725250278087, + "loss": 2.9703, + "step": 1236 + }, + { + "epoch": 16.494983277591974, + "grad_norm": 0.6815900206565857, + "learning_rate": 0.00014522803114571747, + "loss": 3.0171, + "step": 1237 + }, + { + "epoch": 16.50836120401338, + "grad_norm": 0.7687232494354248, + "learning_rate": 0.00014518353726362626, + "loss": 2.8988, + "step": 1238 + }, + { + "epoch": 16.52173913043478, + "grad_norm": 0.7992755174636841, + "learning_rate": 0.00014513904338153507, + "loss": 3.0347, + "step": 1239 + }, + { + "epoch": 16.535117056856187, + "grad_norm": 0.8673639893531799, + "learning_rate": 0.00014509454949944382, + "loss": 2.9654, + "step": 1240 + }, + { + "epoch": 16.548494983277592, + "grad_norm": 0.6200671792030334, + "learning_rate": 0.00014505005561735263, + "loss": 3.1075, + "step": 1241 + }, + { + "epoch": 16.561872909698998, + "grad_norm": 0.8055624961853027, + "learning_rate": 0.0001450055617352614, + "loss": 2.877, + "step": 1242 + }, + { + "epoch": 16.5752508361204, + "grad_norm": 0.6428245902061462, + "learning_rate": 0.0001449610678531702, + "loss": 3.0326, + "step": 1243 + }, + { + "epoch": 16.588628762541806, + "grad_norm": 0.844804584980011, + "learning_rate": 0.00014491657397107898, + "loss": 2.8093, + "step": 1244 + }, + { + "epoch": 16.60200668896321, + "grad_norm": 0.5699613690376282, + "learning_rate": 0.00014487208008898776, + "loss": 3.1914, + "step": 1245 + }, + { + "epoch": 16.615384615384617, + "grad_norm": 0.6638582348823547, + "learning_rate": 0.00014482758620689657, + "loss": 2.7484, + "step": 1246 + }, + { + "epoch": 16.62876254180602, + "grad_norm": 0.7390914559364319, + "learning_rate": 0.00014478309232480533, + "loss": 3.0355, + "step": 1247 + }, + { + "epoch": 16.642140468227424, + "grad_norm": 0.6177923083305359, + "learning_rate": 0.00014473859844271413, + "loss": 3.0067, + "step": 1248 + }, + { + "epoch": 16.65551839464883, + "grad_norm": 0.6234062314033508, + "learning_rate": 0.00014469410456062292, + "loss": 2.8153, + "step": 1249 + }, + { + "epoch": 16.668896321070235, + "grad_norm": 0.8505418300628662, + "learning_rate": 0.0001446496106785317, + "loss": 2.8292, + "step": 1250 + }, + { + "epoch": 16.682274247491637, + "grad_norm": 0.8339266180992126, + "learning_rate": 0.00014460511679644048, + "loss": 2.8072, + "step": 1251 + }, + { + "epoch": 16.695652173913043, + "grad_norm": 0.5782635807991028, + "learning_rate": 0.0001445606229143493, + "loss": 3.1079, + "step": 1252 + }, + { + "epoch": 16.70903010033445, + "grad_norm": 0.687126874923706, + "learning_rate": 0.00014451612903225807, + "loss": 3.1401, + "step": 1253 + }, + { + "epoch": 16.722408026755854, + "grad_norm": 0.7313762307167053, + "learning_rate": 0.00014447163515016686, + "loss": 2.9167, + "step": 1254 + }, + { + "epoch": 16.735785953177256, + "grad_norm": 0.8815247416496277, + "learning_rate": 0.00014442714126807564, + "loss": 3.0294, + "step": 1255 + }, + { + "epoch": 16.74916387959866, + "grad_norm": 0.7636277675628662, + "learning_rate": 0.00014438264738598445, + "loss": 3.0758, + "step": 1256 + }, + { + "epoch": 16.762541806020067, + "grad_norm": 0.5961578488349915, + "learning_rate": 0.0001443381535038932, + "loss": 3.1027, + "step": 1257 + }, + { + "epoch": 16.775919732441473, + "grad_norm": 0.6840028762817383, + "learning_rate": 0.000144293659621802, + "loss": 2.9192, + "step": 1258 + }, + { + "epoch": 16.789297658862875, + "grad_norm": 0.7895340323448181, + "learning_rate": 0.0001442491657397108, + "loss": 2.8772, + "step": 1259 + }, + { + "epoch": 16.80267558528428, + "grad_norm": 0.8516091704368591, + "learning_rate": 0.00014420467185761958, + "loss": 2.9168, + "step": 1260 + }, + { + "epoch": 16.816053511705686, + "grad_norm": 0.6745076179504395, + "learning_rate": 0.00014416017797552836, + "loss": 3.1352, + "step": 1261 + }, + { + "epoch": 16.82943143812709, + "grad_norm": 0.6744667887687683, + "learning_rate": 0.00014411568409343717, + "loss": 2.8439, + "step": 1262 + }, + { + "epoch": 16.842809364548494, + "grad_norm": 0.6307089924812317, + "learning_rate": 0.00014407119021134595, + "loss": 3.2054, + "step": 1263 + }, + { + "epoch": 16.8561872909699, + "grad_norm": 0.6480753421783447, + "learning_rate": 0.00014402669632925473, + "loss": 3.039, + "step": 1264 + }, + { + "epoch": 16.869565217391305, + "grad_norm": 0.6143667697906494, + "learning_rate": 0.00014398220244716352, + "loss": 2.9475, + "step": 1265 + }, + { + "epoch": 16.88294314381271, + "grad_norm": 0.6289299130439758, + "learning_rate": 0.00014393770856507233, + "loss": 3.1437, + "step": 1266 + }, + { + "epoch": 16.896321070234112, + "grad_norm": 0.6618160009384155, + "learning_rate": 0.00014389321468298108, + "loss": 2.8641, + "step": 1267 + }, + { + "epoch": 16.909698996655518, + "grad_norm": 0.6053375601768494, + "learning_rate": 0.0001438487208008899, + "loss": 2.9129, + "step": 1268 + }, + { + "epoch": 16.923076923076923, + "grad_norm": 0.5706185102462769, + "learning_rate": 0.00014380422691879867, + "loss": 3.005, + "step": 1269 + }, + { + "epoch": 16.93645484949833, + "grad_norm": 0.6779253482818604, + "learning_rate": 0.00014375973303670746, + "loss": 3.1071, + "step": 1270 + }, + { + "epoch": 16.94983277591973, + "grad_norm": 0.6679616570472717, + "learning_rate": 0.00014371523915461624, + "loss": 3.1792, + "step": 1271 + }, + { + "epoch": 16.963210702341136, + "grad_norm": 0.6018584966659546, + "learning_rate": 0.00014367074527252505, + "loss": 2.9947, + "step": 1272 + }, + { + "epoch": 16.976588628762542, + "grad_norm": 0.6106094717979431, + "learning_rate": 0.00014362625139043383, + "loss": 3.1965, + "step": 1273 + }, + { + "epoch": 16.989966555183948, + "grad_norm": 0.5486257672309875, + "learning_rate": 0.0001435817575083426, + "loss": 3.0975, + "step": 1274 + }, + { + "epoch": 17.0, + "grad_norm": 0.6516265273094177, + "learning_rate": 0.0001435372636262514, + "loss": 3.1064, + "step": 1275 + }, + { + "epoch": 17.013377926421406, + "grad_norm": 0.6434454917907715, + "learning_rate": 0.0001434927697441602, + "loss": 2.8903, + "step": 1276 + }, + { + "epoch": 17.02675585284281, + "grad_norm": 0.6237186193466187, + "learning_rate": 0.00014344827586206896, + "loss": 3.135, + "step": 1277 + }, + { + "epoch": 17.040133779264213, + "grad_norm": 0.5920026302337646, + "learning_rate": 0.00014340378197997777, + "loss": 2.8612, + "step": 1278 + }, + { + "epoch": 17.05351170568562, + "grad_norm": 0.6545232534408569, + "learning_rate": 0.00014335928809788655, + "loss": 2.8934, + "step": 1279 + }, + { + "epoch": 17.066889632107024, + "grad_norm": 0.7839710116386414, + "learning_rate": 0.00014331479421579533, + "loss": 2.965, + "step": 1280 + }, + { + "epoch": 17.08026755852843, + "grad_norm": 0.6448978781700134, + "learning_rate": 0.00014327030033370412, + "loss": 2.9343, + "step": 1281 + }, + { + "epoch": 17.093645484949832, + "grad_norm": 0.5713958144187927, + "learning_rate": 0.00014322580645161293, + "loss": 3.1566, + "step": 1282 + }, + { + "epoch": 17.107023411371237, + "grad_norm": 0.666409969329834, + "learning_rate": 0.0001431813125695217, + "loss": 2.7705, + "step": 1283 + }, + { + "epoch": 17.120401337792643, + "grad_norm": 0.6068354249000549, + "learning_rate": 0.0001431368186874305, + "loss": 3.088, + "step": 1284 + }, + { + "epoch": 17.13377926421405, + "grad_norm": 0.8292580246925354, + "learning_rate": 0.00014309232480533927, + "loss": 2.8939, + "step": 1285 + }, + { + "epoch": 17.14715719063545, + "grad_norm": 0.6789494156837463, + "learning_rate": 0.00014304783092324806, + "loss": 3.2317, + "step": 1286 + }, + { + "epoch": 17.160535117056856, + "grad_norm": 0.6030963063240051, + "learning_rate": 0.00014300333704115684, + "loss": 2.9248, + "step": 1287 + }, + { + "epoch": 17.17391304347826, + "grad_norm": 0.7090041041374207, + "learning_rate": 0.00014295884315906562, + "loss": 2.9097, + "step": 1288 + }, + { + "epoch": 17.187290969899667, + "grad_norm": 0.5750879645347595, + "learning_rate": 0.00014291434927697443, + "loss": 2.9585, + "step": 1289 + }, + { + "epoch": 17.20066889632107, + "grad_norm": 0.6379792094230652, + "learning_rate": 0.0001428698553948832, + "loss": 2.794, + "step": 1290 + }, + { + "epoch": 17.214046822742475, + "grad_norm": 0.6736400127410889, + "learning_rate": 0.000142825361512792, + "loss": 2.8812, + "step": 1291 + }, + { + "epoch": 17.22742474916388, + "grad_norm": 0.6580933332443237, + "learning_rate": 0.00014278086763070078, + "loss": 2.8029, + "step": 1292 + }, + { + "epoch": 17.240802675585286, + "grad_norm": 0.6550527215003967, + "learning_rate": 0.00014273637374860959, + "loss": 3.1135, + "step": 1293 + }, + { + "epoch": 17.254180602006688, + "grad_norm": 0.6616796255111694, + "learning_rate": 0.00014269187986651834, + "loss": 2.8916, + "step": 1294 + }, + { + "epoch": 17.267558528428093, + "grad_norm": 0.7247623801231384, + "learning_rate": 0.00014264738598442715, + "loss": 2.8482, + "step": 1295 + }, + { + "epoch": 17.2809364548495, + "grad_norm": 0.7138639092445374, + "learning_rate": 0.00014260289210233593, + "loss": 2.9283, + "step": 1296 + }, + { + "epoch": 17.294314381270905, + "grad_norm": 0.6413894891738892, + "learning_rate": 0.00014255839822024472, + "loss": 3.0273, + "step": 1297 + }, + { + "epoch": 17.307692307692307, + "grad_norm": 0.6106882095336914, + "learning_rate": 0.0001425139043381535, + "loss": 3.062, + "step": 1298 + }, + { + "epoch": 17.321070234113712, + "grad_norm": 0.6762199997901917, + "learning_rate": 0.0001424694104560623, + "loss": 2.7864, + "step": 1299 + }, + { + "epoch": 17.334448160535118, + "grad_norm": 0.65083909034729, + "learning_rate": 0.0001424249165739711, + "loss": 3.115, + "step": 1300 + }, + { + "epoch": 17.347826086956523, + "grad_norm": 0.7381249666213989, + "learning_rate": 0.00014238042269187987, + "loss": 2.6981, + "step": 1301 + }, + { + "epoch": 17.361204013377925, + "grad_norm": 0.5674475431442261, + "learning_rate": 0.00014233592880978865, + "loss": 2.8646, + "step": 1302 + }, + { + "epoch": 17.37458193979933, + "grad_norm": 0.6201330423355103, + "learning_rate": 0.00014229143492769746, + "loss": 3.207, + "step": 1303 + }, + { + "epoch": 17.387959866220736, + "grad_norm": 0.7004446983337402, + "learning_rate": 0.00014224694104560622, + "loss": 2.7623, + "step": 1304 + }, + { + "epoch": 17.401337792642142, + "grad_norm": 0.7278717756271362, + "learning_rate": 0.00014220244716351503, + "loss": 2.7967, + "step": 1305 + }, + { + "epoch": 17.414715719063544, + "grad_norm": 0.6384133696556091, + "learning_rate": 0.0001421579532814238, + "loss": 2.7542, + "step": 1306 + }, + { + "epoch": 17.42809364548495, + "grad_norm": 0.6443619132041931, + "learning_rate": 0.0001421134593993326, + "loss": 2.9376, + "step": 1307 + }, + { + "epoch": 17.441471571906355, + "grad_norm": 0.6889017224311829, + "learning_rate": 0.00014206896551724138, + "loss": 2.8868, + "step": 1308 + }, + { + "epoch": 17.45484949832776, + "grad_norm": 0.624864935874939, + "learning_rate": 0.00014202447163515019, + "loss": 3.0459, + "step": 1309 + }, + { + "epoch": 17.468227424749163, + "grad_norm": 0.5964120626449585, + "learning_rate": 0.00014197997775305897, + "loss": 2.8604, + "step": 1310 + }, + { + "epoch": 17.48160535117057, + "grad_norm": 0.5584320425987244, + "learning_rate": 0.00014193548387096775, + "loss": 3.1267, + "step": 1311 + }, + { + "epoch": 17.494983277591974, + "grad_norm": 0.6566155552864075, + "learning_rate": 0.00014189098998887653, + "loss": 2.7059, + "step": 1312 + }, + { + "epoch": 17.50836120401338, + "grad_norm": 0.7166509032249451, + "learning_rate": 0.00014184649610678534, + "loss": 3.1076, + "step": 1313 + }, + { + "epoch": 17.52173913043478, + "grad_norm": 0.6076642870903015, + "learning_rate": 0.0001418020022246941, + "loss": 3.1121, + "step": 1314 + }, + { + "epoch": 17.535117056856187, + "grad_norm": 0.6293672323226929, + "learning_rate": 0.0001417575083426029, + "loss": 3.0719, + "step": 1315 + }, + { + "epoch": 17.548494983277592, + "grad_norm": 0.6696231365203857, + "learning_rate": 0.0001417130144605117, + "loss": 2.4281, + "step": 1316 + }, + { + "epoch": 17.561872909698998, + "grad_norm": 0.7928171157836914, + "learning_rate": 0.00014166852057842047, + "loss": 2.8602, + "step": 1317 + }, + { + "epoch": 17.5752508361204, + "grad_norm": 0.5897494554519653, + "learning_rate": 0.00014162402669632925, + "loss": 3.1556, + "step": 1318 + }, + { + "epoch": 17.588628762541806, + "grad_norm": 0.6028451323509216, + "learning_rate": 0.00014157953281423806, + "loss": 3.1228, + "step": 1319 + }, + { + "epoch": 17.60200668896321, + "grad_norm": 0.6237207651138306, + "learning_rate": 0.00014153503893214685, + "loss": 2.9691, + "step": 1320 + }, + { + "epoch": 17.615384615384617, + "grad_norm": 0.6401494741439819, + "learning_rate": 0.00014149054505005563, + "loss": 3.0659, + "step": 1321 + }, + { + "epoch": 17.62876254180602, + "grad_norm": 0.7098166942596436, + "learning_rate": 0.0001414460511679644, + "loss": 2.9606, + "step": 1322 + }, + { + "epoch": 17.642140468227424, + "grad_norm": 0.6416228413581848, + "learning_rate": 0.00014140155728587322, + "loss": 3.1056, + "step": 1323 + }, + { + "epoch": 17.65551839464883, + "grad_norm": 0.7303211092948914, + "learning_rate": 0.00014135706340378198, + "loss": 2.8604, + "step": 1324 + }, + { + "epoch": 17.668896321070235, + "grad_norm": 0.64544677734375, + "learning_rate": 0.00014131256952169079, + "loss": 2.6751, + "step": 1325 + }, + { + "epoch": 17.682274247491637, + "grad_norm": 0.6870211362838745, + "learning_rate": 0.00014126807563959957, + "loss": 2.9802, + "step": 1326 + }, + { + "epoch": 17.695652173913043, + "grad_norm": 0.6570687294006348, + "learning_rate": 0.00014122358175750835, + "loss": 3.0496, + "step": 1327 + }, + { + "epoch": 17.70903010033445, + "grad_norm": 0.7057302594184875, + "learning_rate": 0.00014117908787541713, + "loss": 2.845, + "step": 1328 + }, + { + "epoch": 17.722408026755854, + "grad_norm": 0.8613574504852295, + "learning_rate": 0.00014113459399332591, + "loss": 3.0314, + "step": 1329 + }, + { + "epoch": 17.735785953177256, + "grad_norm": 0.7408957481384277, + "learning_rate": 0.00014109010011123472, + "loss": 2.8651, + "step": 1330 + }, + { + "epoch": 17.74916387959866, + "grad_norm": 0.6553664803504944, + "learning_rate": 0.00014104560622914348, + "loss": 2.9647, + "step": 1331 + }, + { + "epoch": 17.762541806020067, + "grad_norm": 0.5991332530975342, + "learning_rate": 0.0001410011123470523, + "loss": 2.9796, + "step": 1332 + }, + { + "epoch": 17.775919732441473, + "grad_norm": 0.6124044060707092, + "learning_rate": 0.00014095661846496107, + "loss": 3.0682, + "step": 1333 + }, + { + "epoch": 17.789297658862875, + "grad_norm": 0.5788628458976746, + "learning_rate": 0.00014091212458286985, + "loss": 3.0357, + "step": 1334 + }, + { + "epoch": 17.80267558528428, + "grad_norm": 0.6785842776298523, + "learning_rate": 0.00014086763070077864, + "loss": 3.1128, + "step": 1335 + }, + { + "epoch": 17.816053511705686, + "grad_norm": 0.5994388461112976, + "learning_rate": 0.00014082313681868745, + "loss": 2.9087, + "step": 1336 + }, + { + "epoch": 17.82943143812709, + "grad_norm": 0.6150069236755371, + "learning_rate": 0.00014077864293659623, + "loss": 2.9488, + "step": 1337 + }, + { + "epoch": 17.842809364548494, + "grad_norm": 0.6211126446723938, + "learning_rate": 0.000140734149054505, + "loss": 2.8006, + "step": 1338 + }, + { + "epoch": 17.8561872909699, + "grad_norm": 0.6093603372573853, + "learning_rate": 0.0001406896551724138, + "loss": 3.0013, + "step": 1339 + }, + { + "epoch": 17.869565217391305, + "grad_norm": 0.6861109137535095, + "learning_rate": 0.0001406451612903226, + "loss": 2.9423, + "step": 1340 + }, + { + "epoch": 17.88294314381271, + "grad_norm": 0.6148517727851868, + "learning_rate": 0.00014060066740823136, + "loss": 2.8471, + "step": 1341 + }, + { + "epoch": 17.896321070234112, + "grad_norm": 0.8285694718360901, + "learning_rate": 0.00014055617352614017, + "loss": 3.055, + "step": 1342 + }, + { + "epoch": 17.909698996655518, + "grad_norm": 0.5955973863601685, + "learning_rate": 0.00014051167964404895, + "loss": 3.0088, + "step": 1343 + }, + { + "epoch": 17.923076923076923, + "grad_norm": 0.6020825505256653, + "learning_rate": 0.00014046718576195773, + "loss": 3.1445, + "step": 1344 + }, + { + "epoch": 17.93645484949833, + "grad_norm": 0.6115384101867676, + "learning_rate": 0.00014042269187986651, + "loss": 3.0889, + "step": 1345 + }, + { + "epoch": 17.94983277591973, + "grad_norm": 0.6469634175300598, + "learning_rate": 0.00014037819799777532, + "loss": 3.1916, + "step": 1346 + }, + { + "epoch": 17.963210702341136, + "grad_norm": 0.6653386354446411, + "learning_rate": 0.0001403337041156841, + "loss": 2.8805, + "step": 1347 + }, + { + "epoch": 17.976588628762542, + "grad_norm": 0.6167243719100952, + "learning_rate": 0.0001402892102335929, + "loss": 2.9819, + "step": 1348 + }, + { + "epoch": 17.989966555183948, + "grad_norm": 0.6281883716583252, + "learning_rate": 0.00014024471635150167, + "loss": 3.1109, + "step": 1349 + }, + { + "epoch": 18.0, + "grad_norm": 0.6747295260429382, + "learning_rate": 0.00014020022246941048, + "loss": 2.8772, + "step": 1350 + }, + { + "epoch": 18.013377926421406, + "grad_norm": 0.5834116339683533, + "learning_rate": 0.00014015572858731924, + "loss": 2.9399, + "step": 1351 + }, + { + "epoch": 18.02675585284281, + "grad_norm": 0.620858371257782, + "learning_rate": 0.00014011123470522805, + "loss": 2.7351, + "step": 1352 + }, + { + "epoch": 18.040133779264213, + "grad_norm": 0.5407689809799194, + "learning_rate": 0.00014006674082313683, + "loss": 2.8329, + "step": 1353 + }, + { + "epoch": 18.05351170568562, + "grad_norm": 0.6045056581497192, + "learning_rate": 0.0001400222469410456, + "loss": 2.7382, + "step": 1354 + }, + { + "epoch": 18.066889632107024, + "grad_norm": 0.5433570146560669, + "learning_rate": 0.0001399777530589544, + "loss": 2.9154, + "step": 1355 + }, + { + "epoch": 18.08026755852843, + "grad_norm": 0.6174083352088928, + "learning_rate": 0.0001399332591768632, + "loss": 2.7438, + "step": 1356 + }, + { + "epoch": 18.093645484949832, + "grad_norm": 0.6720690727233887, + "learning_rate": 0.00013988876529477198, + "loss": 3.1324, + "step": 1357 + }, + { + "epoch": 18.107023411371237, + "grad_norm": 0.648423433303833, + "learning_rate": 0.00013984427141268077, + "loss": 2.7802, + "step": 1358 + }, + { + "epoch": 18.120401337792643, + "grad_norm": 0.6625978350639343, + "learning_rate": 0.00013979977753058955, + "loss": 2.9152, + "step": 1359 + }, + { + "epoch": 18.13377926421405, + "grad_norm": 0.5362007021903992, + "learning_rate": 0.00013975528364849836, + "loss": 2.9552, + "step": 1360 + }, + { + "epoch": 18.14715719063545, + "grad_norm": 0.6275555491447449, + "learning_rate": 0.00013971078976640711, + "loss": 3.0742, + "step": 1361 + }, + { + "epoch": 18.160535117056856, + "grad_norm": 0.5755884647369385, + "learning_rate": 0.00013966629588431592, + "loss": 2.8705, + "step": 1362 + }, + { + "epoch": 18.17391304347826, + "grad_norm": 0.5092719793319702, + "learning_rate": 0.0001396218020022247, + "loss": 2.9684, + "step": 1363 + }, + { + "epoch": 18.187290969899667, + "grad_norm": 0.7400075197219849, + "learning_rate": 0.0001395773081201335, + "loss": 2.9726, + "step": 1364 + }, + { + "epoch": 18.20066889632107, + "grad_norm": 0.6478124260902405, + "learning_rate": 0.00013953281423804227, + "loss": 2.7427, + "step": 1365 + }, + { + "epoch": 18.214046822742475, + "grad_norm": 0.6313418745994568, + "learning_rate": 0.00013948832035595108, + "loss": 2.7713, + "step": 1366 + }, + { + "epoch": 18.22742474916388, + "grad_norm": 0.5571421980857849, + "learning_rate": 0.00013944382647385986, + "loss": 2.9221, + "step": 1367 + }, + { + "epoch": 18.240802675585286, + "grad_norm": 0.5346395373344421, + "learning_rate": 0.00013939933259176862, + "loss": 2.9842, + "step": 1368 + }, + { + "epoch": 18.254180602006688, + "grad_norm": 0.5828048586845398, + "learning_rate": 0.00013935483870967743, + "loss": 2.8683, + "step": 1369 + }, + { + "epoch": 18.267558528428093, + "grad_norm": 0.6446037888526917, + "learning_rate": 0.0001393103448275862, + "loss": 2.8, + "step": 1370 + }, + { + "epoch": 18.2809364548495, + "grad_norm": 0.612689197063446, + "learning_rate": 0.000139265850945495, + "loss": 3.0642, + "step": 1371 + }, + { + "epoch": 18.294314381270905, + "grad_norm": 0.5511941909790039, + "learning_rate": 0.00013922135706340377, + "loss": 3.0633, + "step": 1372 + }, + { + "epoch": 18.307692307692307, + "grad_norm": 0.7538149356842041, + "learning_rate": 0.00013917686318131258, + "loss": 2.7224, + "step": 1373 + }, + { + "epoch": 18.321070234113712, + "grad_norm": 0.6194874048233032, + "learning_rate": 0.00013913236929922137, + "loss": 2.9357, + "step": 1374 + }, + { + "epoch": 18.334448160535118, + "grad_norm": 0.5770833492279053, + "learning_rate": 0.00013908787541713015, + "loss": 2.8585, + "step": 1375 + }, + { + "epoch": 18.347826086956523, + "grad_norm": 0.609080970287323, + "learning_rate": 0.00013904338153503893, + "loss": 2.7763, + "step": 1376 + }, + { + "epoch": 18.361204013377925, + "grad_norm": 0.5578462481498718, + "learning_rate": 0.00013899888765294774, + "loss": 3.0324, + "step": 1377 + }, + { + "epoch": 18.37458193979933, + "grad_norm": 0.5949610471725464, + "learning_rate": 0.0001389543937708565, + "loss": 2.7332, + "step": 1378 + }, + { + "epoch": 18.387959866220736, + "grad_norm": 0.6248785257339478, + "learning_rate": 0.0001389098998887653, + "loss": 2.7355, + "step": 1379 + }, + { + "epoch": 18.401337792642142, + "grad_norm": 0.606239378452301, + "learning_rate": 0.0001388654060066741, + "loss": 2.9883, + "step": 1380 + }, + { + "epoch": 18.414715719063544, + "grad_norm": 0.6222496628761292, + "learning_rate": 0.00013882091212458287, + "loss": 2.7384, + "step": 1381 + }, + { + "epoch": 18.42809364548495, + "grad_norm": 0.6253412365913391, + "learning_rate": 0.00013877641824249165, + "loss": 2.7555, + "step": 1382 + }, + { + "epoch": 18.441471571906355, + "grad_norm": 0.6204626560211182, + "learning_rate": 0.00013873192436040046, + "loss": 2.7279, + "step": 1383 + }, + { + "epoch": 18.45484949832776, + "grad_norm": 0.7254919409751892, + "learning_rate": 0.00013868743047830924, + "loss": 2.9053, + "step": 1384 + }, + { + "epoch": 18.468227424749163, + "grad_norm": 0.6207154393196106, + "learning_rate": 0.00013864293659621803, + "loss": 3.0648, + "step": 1385 + }, + { + "epoch": 18.48160535117057, + "grad_norm": 0.6959066390991211, + "learning_rate": 0.0001385984427141268, + "loss": 2.9641, + "step": 1386 + }, + { + "epoch": 18.494983277591974, + "grad_norm": 0.6345707774162292, + "learning_rate": 0.00013855394883203562, + "loss": 2.9452, + "step": 1387 + }, + { + "epoch": 18.50836120401338, + "grad_norm": 0.5806639790534973, + "learning_rate": 0.00013850945494994437, + "loss": 2.9375, + "step": 1388 + }, + { + "epoch": 18.52173913043478, + "grad_norm": 0.6498666405677795, + "learning_rate": 0.00013846496106785318, + "loss": 2.867, + "step": 1389 + }, + { + "epoch": 18.535117056856187, + "grad_norm": 0.629264771938324, + "learning_rate": 0.00013842046718576197, + "loss": 2.822, + "step": 1390 + }, + { + "epoch": 18.548494983277592, + "grad_norm": 0.6734644174575806, + "learning_rate": 0.00013837597330367075, + "loss": 2.9065, + "step": 1391 + }, + { + "epoch": 18.561872909698998, + "grad_norm": 0.5705899000167847, + "learning_rate": 0.00013833147942157953, + "loss": 2.9909, + "step": 1392 + }, + { + "epoch": 18.5752508361204, + "grad_norm": 0.6786744594573975, + "learning_rate": 0.00013828698553948834, + "loss": 3.0667, + "step": 1393 + }, + { + "epoch": 18.588628762541806, + "grad_norm": 0.6044118404388428, + "learning_rate": 0.00013824249165739712, + "loss": 2.8345, + "step": 1394 + }, + { + "epoch": 18.60200668896321, + "grad_norm": 0.5928333401679993, + "learning_rate": 0.0001381979977753059, + "loss": 2.974, + "step": 1395 + }, + { + "epoch": 18.615384615384617, + "grad_norm": 0.636883556842804, + "learning_rate": 0.0001381535038932147, + "loss": 3.1034, + "step": 1396 + }, + { + "epoch": 18.62876254180602, + "grad_norm": 0.6029159426689148, + "learning_rate": 0.0001381090100111235, + "loss": 3.0303, + "step": 1397 + }, + { + "epoch": 18.642140468227424, + "grad_norm": 0.6479122638702393, + "learning_rate": 0.00013806451612903225, + "loss": 3.1782, + "step": 1398 + }, + { + "epoch": 18.65551839464883, + "grad_norm": 0.6547753810882568, + "learning_rate": 0.00013802002224694106, + "loss": 3.0594, + "step": 1399 + }, + { + "epoch": 18.668896321070235, + "grad_norm": 0.6506614089012146, + "learning_rate": 0.00013797552836484984, + "loss": 3.0955, + "step": 1400 + }, + { + "epoch": 18.682274247491637, + "grad_norm": 0.6073411107063293, + "learning_rate": 0.00013793103448275863, + "loss": 2.7452, + "step": 1401 + }, + { + "epoch": 18.695652173913043, + "grad_norm": 0.6307429075241089, + "learning_rate": 0.0001378865406006674, + "loss": 2.8329, + "step": 1402 + }, + { + "epoch": 18.70903010033445, + "grad_norm": 0.7205286026000977, + "learning_rate": 0.00013784204671857622, + "loss": 2.8228, + "step": 1403 + }, + { + "epoch": 18.722408026755854, + "grad_norm": 0.6154866814613342, + "learning_rate": 0.000137797552836485, + "loss": 2.7081, + "step": 1404 + }, + { + "epoch": 18.735785953177256, + "grad_norm": 0.548799455165863, + "learning_rate": 0.00013775305895439378, + "loss": 2.9336, + "step": 1405 + }, + { + "epoch": 18.74916387959866, + "grad_norm": 0.5723371505737305, + "learning_rate": 0.00013770856507230257, + "loss": 2.9661, + "step": 1406 + }, + { + "epoch": 18.762541806020067, + "grad_norm": 0.5917731523513794, + "learning_rate": 0.00013766407119021137, + "loss": 2.8458, + "step": 1407 + }, + { + "epoch": 18.775919732441473, + "grad_norm": 0.6160814166069031, + "learning_rate": 0.00013761957730812013, + "loss": 2.8453, + "step": 1408 + }, + { + "epoch": 18.789297658862875, + "grad_norm": 0.5995500683784485, + "learning_rate": 0.0001375750834260289, + "loss": 3.1361, + "step": 1409 + }, + { + "epoch": 18.80267558528428, + "grad_norm": 0.5574924945831299, + "learning_rate": 0.00013753058954393772, + "loss": 2.9808, + "step": 1410 + }, + { + "epoch": 18.816053511705686, + "grad_norm": 0.6444510221481323, + "learning_rate": 0.0001374860956618465, + "loss": 2.9204, + "step": 1411 + }, + { + "epoch": 18.82943143812709, + "grad_norm": 0.6153264045715332, + "learning_rate": 0.0001374416017797553, + "loss": 3.036, + "step": 1412 + }, + { + "epoch": 18.842809364548494, + "grad_norm": 0.5951060652732849, + "learning_rate": 0.00013739710789766407, + "loss": 3.1012, + "step": 1413 + }, + { + "epoch": 18.8561872909699, + "grad_norm": 0.5688861012458801, + "learning_rate": 0.00013735261401557288, + "loss": 3.1568, + "step": 1414 + }, + { + "epoch": 18.869565217391305, + "grad_norm": 0.6256094574928284, + "learning_rate": 0.00013730812013348163, + "loss": 2.7721, + "step": 1415 + }, + { + "epoch": 18.88294314381271, + "grad_norm": 0.646250307559967, + "learning_rate": 0.00013726362625139044, + "loss": 3.0091, + "step": 1416 + }, + { + "epoch": 18.896321070234112, + "grad_norm": 0.6805879473686218, + "learning_rate": 0.00013721913236929923, + "loss": 2.8397, + "step": 1417 + }, + { + "epoch": 18.909698996655518, + "grad_norm": 0.6164728999137878, + "learning_rate": 0.000137174638487208, + "loss": 2.8318, + "step": 1418 + }, + { + "epoch": 18.923076923076923, + "grad_norm": 0.6298549771308899, + "learning_rate": 0.0001371301446051168, + "loss": 2.8493, + "step": 1419 + }, + { + "epoch": 18.93645484949833, + "grad_norm": 0.5760109424591064, + "learning_rate": 0.0001370856507230256, + "loss": 3.1736, + "step": 1420 + }, + { + "epoch": 18.94983277591973, + "grad_norm": 0.6126035451889038, + "learning_rate": 0.00013704115684093438, + "loss": 3.1313, + "step": 1421 + }, + { + "epoch": 18.963210702341136, + "grad_norm": 0.6092283129692078, + "learning_rate": 0.00013699666295884316, + "loss": 3.0601, + "step": 1422 + }, + { + "epoch": 18.976588628762542, + "grad_norm": 0.6506980657577515, + "learning_rate": 0.00013695216907675195, + "loss": 2.7787, + "step": 1423 + }, + { + "epoch": 18.989966555183948, + "grad_norm": 0.6060482263565063, + "learning_rate": 0.00013690767519466076, + "loss": 3.062, + "step": 1424 + }, + { + "epoch": 19.0, + "grad_norm": 0.7881284952163696, + "learning_rate": 0.0001368631813125695, + "loss": 2.9882, + "step": 1425 + }, + { + "epoch": 19.013377926421406, + "grad_norm": 0.5459823608398438, + "learning_rate": 0.00013681868743047832, + "loss": 2.9312, + "step": 1426 + }, + { + "epoch": 19.02675585284281, + "grad_norm": 0.71108078956604, + "learning_rate": 0.0001367741935483871, + "loss": 2.6003, + "step": 1427 + }, + { + "epoch": 19.040133779264213, + "grad_norm": 0.6824572682380676, + "learning_rate": 0.00013672969966629589, + "loss": 2.8354, + "step": 1428 + }, + { + "epoch": 19.05351170568562, + "grad_norm": 0.607400119304657, + "learning_rate": 0.00013668520578420467, + "loss": 2.95, + "step": 1429 + }, + { + "epoch": 19.066889632107024, + "grad_norm": 0.5925526022911072, + "learning_rate": 0.00013664071190211348, + "loss": 2.7101, + "step": 1430 + }, + { + "epoch": 19.08026755852843, + "grad_norm": 0.6208476424217224, + "learning_rate": 0.00013659621802002226, + "loss": 2.7759, + "step": 1431 + }, + { + "epoch": 19.093645484949832, + "grad_norm": 0.6047778725624084, + "learning_rate": 0.00013655172413793104, + "loss": 3.0377, + "step": 1432 + }, + { + "epoch": 19.107023411371237, + "grad_norm": 0.5979378819465637, + "learning_rate": 0.00013650723025583983, + "loss": 2.8671, + "step": 1433 + }, + { + "epoch": 19.120401337792643, + "grad_norm": 0.6338753700256348, + "learning_rate": 0.00013646273637374863, + "loss": 2.7869, + "step": 1434 + }, + { + "epoch": 19.13377926421405, + "grad_norm": 0.6263737678527832, + "learning_rate": 0.0001364182424916574, + "loss": 2.6591, + "step": 1435 + }, + { + "epoch": 19.14715719063545, + "grad_norm": 0.646990180015564, + "learning_rate": 0.0001363737486095662, + "loss": 2.896, + "step": 1436 + }, + { + "epoch": 19.160535117056856, + "grad_norm": 0.5691138505935669, + "learning_rate": 0.00013632925472747498, + "loss": 2.8774, + "step": 1437 + }, + { + "epoch": 19.17391304347826, + "grad_norm": 0.5838844180107117, + "learning_rate": 0.00013628476084538376, + "loss": 2.9722, + "step": 1438 + }, + { + "epoch": 19.187290969899667, + "grad_norm": 0.6154463887214661, + "learning_rate": 0.00013624026696329255, + "loss": 3.0143, + "step": 1439 + }, + { + "epoch": 19.20066889632107, + "grad_norm": 0.6480549573898315, + "learning_rate": 0.00013619577308120136, + "loss": 2.8118, + "step": 1440 + }, + { + "epoch": 19.214046822742475, + "grad_norm": 0.7675592303276062, + "learning_rate": 0.00013615127919911014, + "loss": 2.865, + "step": 1441 + }, + { + "epoch": 19.22742474916388, + "grad_norm": 0.7231382131576538, + "learning_rate": 0.00013610678531701892, + "loss": 2.6942, + "step": 1442 + }, + { + "epoch": 19.240802675585286, + "grad_norm": 0.6359425187110901, + "learning_rate": 0.0001360622914349277, + "loss": 2.5976, + "step": 1443 + }, + { + "epoch": 19.254180602006688, + "grad_norm": 0.6486908793449402, + "learning_rate": 0.0001360177975528365, + "loss": 2.9494, + "step": 1444 + }, + { + "epoch": 19.267558528428093, + "grad_norm": 0.5930846929550171, + "learning_rate": 0.00013597330367074527, + "loss": 3.1295, + "step": 1445 + }, + { + "epoch": 19.2809364548495, + "grad_norm": 0.6988996267318726, + "learning_rate": 0.00013592880978865408, + "loss": 2.9627, + "step": 1446 + }, + { + "epoch": 19.294314381270905, + "grad_norm": 0.5971337556838989, + "learning_rate": 0.00013588431590656286, + "loss": 2.9636, + "step": 1447 + }, + { + "epoch": 19.307692307692307, + "grad_norm": 0.6479155421257019, + "learning_rate": 0.00013583982202447164, + "loss": 2.6575, + "step": 1448 + }, + { + "epoch": 19.321070234113712, + "grad_norm": 0.6771759986877441, + "learning_rate": 0.00013579532814238043, + "loss": 3.0667, + "step": 1449 + }, + { + "epoch": 19.334448160535118, + "grad_norm": 0.5785907506942749, + "learning_rate": 0.0001357508342602892, + "loss": 2.8839, + "step": 1450 + }, + { + "epoch": 19.347826086956523, + "grad_norm": 0.6315357089042664, + "learning_rate": 0.00013570634037819802, + "loss": 2.6925, + "step": 1451 + }, + { + "epoch": 19.361204013377925, + "grad_norm": 0.6047807931900024, + "learning_rate": 0.00013566184649610677, + "loss": 2.7948, + "step": 1452 + }, + { + "epoch": 19.37458193979933, + "grad_norm": 0.6079906225204468, + "learning_rate": 0.00013561735261401558, + "loss": 3.1798, + "step": 1453 + }, + { + "epoch": 19.387959866220736, + "grad_norm": 0.5820274353027344, + "learning_rate": 0.00013557285873192436, + "loss": 3.1159, + "step": 1454 + }, + { + "epoch": 19.401337792642142, + "grad_norm": 0.562022864818573, + "learning_rate": 0.00013552836484983315, + "loss": 2.9869, + "step": 1455 + }, + { + "epoch": 19.414715719063544, + "grad_norm": 0.6663182973861694, + "learning_rate": 0.00013548387096774193, + "loss": 2.8038, + "step": 1456 + }, + { + "epoch": 19.42809364548495, + "grad_norm": 0.7092719078063965, + "learning_rate": 0.00013543937708565074, + "loss": 3.0159, + "step": 1457 + }, + { + "epoch": 19.441471571906355, + "grad_norm": 0.5990714430809021, + "learning_rate": 0.00013539488320355952, + "loss": 2.8858, + "step": 1458 + }, + { + "epoch": 19.45484949832776, + "grad_norm": 0.600847065448761, + "learning_rate": 0.0001353503893214683, + "loss": 2.9352, + "step": 1459 + }, + { + "epoch": 19.468227424749163, + "grad_norm": 0.6393849849700928, + "learning_rate": 0.00013530589543937709, + "loss": 2.6911, + "step": 1460 + }, + { + "epoch": 19.48160535117057, + "grad_norm": 0.6170421838760376, + "learning_rate": 0.0001352614015572859, + "loss": 2.749, + "step": 1461 + }, + { + "epoch": 19.494983277591974, + "grad_norm": 0.53690105676651, + "learning_rate": 0.00013521690767519465, + "loss": 2.9516, + "step": 1462 + }, + { + "epoch": 19.50836120401338, + "grad_norm": 0.5976501703262329, + "learning_rate": 0.00013517241379310346, + "loss": 2.9996, + "step": 1463 + }, + { + "epoch": 19.52173913043478, + "grad_norm": 0.5892135500907898, + "learning_rate": 0.00013512791991101224, + "loss": 3.2039, + "step": 1464 + }, + { + "epoch": 19.535117056856187, + "grad_norm": 0.65968918800354, + "learning_rate": 0.00013508342602892102, + "loss": 2.5897, + "step": 1465 + }, + { + "epoch": 19.548494983277592, + "grad_norm": 0.591454267501831, + "learning_rate": 0.0001350389321468298, + "loss": 3.0104, + "step": 1466 + }, + { + "epoch": 19.561872909698998, + "grad_norm": 0.6272184252738953, + "learning_rate": 0.00013499443826473862, + "loss": 2.7266, + "step": 1467 + }, + { + "epoch": 19.5752508361204, + "grad_norm": 0.6142420172691345, + "learning_rate": 0.0001349499443826474, + "loss": 2.8358, + "step": 1468 + }, + { + "epoch": 19.588628762541806, + "grad_norm": 0.6268441677093506, + "learning_rate": 0.00013490545050055618, + "loss": 3.0196, + "step": 1469 + }, + { + "epoch": 19.60200668896321, + "grad_norm": 0.6512436866760254, + "learning_rate": 0.00013486095661846496, + "loss": 2.9558, + "step": 1470 + }, + { + "epoch": 19.615384615384617, + "grad_norm": 0.5983771681785583, + "learning_rate": 0.00013481646273637377, + "loss": 2.9958, + "step": 1471 + }, + { + "epoch": 19.62876254180602, + "grad_norm": 0.6994190216064453, + "learning_rate": 0.00013477196885428253, + "loss": 2.8019, + "step": 1472 + }, + { + "epoch": 19.642140468227424, + "grad_norm": 0.5878567695617676, + "learning_rate": 0.00013472747497219134, + "loss": 2.7007, + "step": 1473 + }, + { + "epoch": 19.65551839464883, + "grad_norm": 0.6140199303627014, + "learning_rate": 0.00013468298109010012, + "loss": 2.9212, + "step": 1474 + }, + { + "epoch": 19.668896321070235, + "grad_norm": 0.648714542388916, + "learning_rate": 0.0001346384872080089, + "loss": 3.0053, + "step": 1475 + }, + { + "epoch": 19.682274247491637, + "grad_norm": 0.5991750359535217, + "learning_rate": 0.00013459399332591769, + "loss": 2.9129, + "step": 1476 + }, + { + "epoch": 19.695652173913043, + "grad_norm": 0.5538223385810852, + "learning_rate": 0.0001345494994438265, + "loss": 2.9097, + "step": 1477 + }, + { + "epoch": 19.70903010033445, + "grad_norm": 0.5864409804344177, + "learning_rate": 0.00013450500556173528, + "loss": 2.9348, + "step": 1478 + }, + { + "epoch": 19.722408026755854, + "grad_norm": 0.6004533767700195, + "learning_rate": 0.00013446051167964406, + "loss": 2.8845, + "step": 1479 + }, + { + "epoch": 19.735785953177256, + "grad_norm": 0.6316581964492798, + "learning_rate": 0.00013441601779755284, + "loss": 2.8619, + "step": 1480 + }, + { + "epoch": 19.74916387959866, + "grad_norm": 0.593138575553894, + "learning_rate": 0.00013437152391546165, + "loss": 3.041, + "step": 1481 + }, + { + "epoch": 19.762541806020067, + "grad_norm": 0.5826678276062012, + "learning_rate": 0.0001343270300333704, + "loss": 2.7432, + "step": 1482 + }, + { + "epoch": 19.775919732441473, + "grad_norm": 0.6341697573661804, + "learning_rate": 0.00013428253615127922, + "loss": 2.9755, + "step": 1483 + }, + { + "epoch": 19.789297658862875, + "grad_norm": 0.5894901156425476, + "learning_rate": 0.000134238042269188, + "loss": 2.9754, + "step": 1484 + }, + { + "epoch": 19.80267558528428, + "grad_norm": 0.5840655565261841, + "learning_rate": 0.00013419354838709678, + "loss": 2.9959, + "step": 1485 + }, + { + "epoch": 19.816053511705686, + "grad_norm": 0.6006319522857666, + "learning_rate": 0.00013414905450500556, + "loss": 3.0196, + "step": 1486 + }, + { + "epoch": 19.82943143812709, + "grad_norm": 0.5647453665733337, + "learning_rate": 0.00013410456062291437, + "loss": 2.9662, + "step": 1487 + }, + { + "epoch": 19.842809364548494, + "grad_norm": 0.6583006978034973, + "learning_rate": 0.00013406006674082316, + "loss": 2.8719, + "step": 1488 + }, + { + "epoch": 19.8561872909699, + "grad_norm": 0.6041131615638733, + "learning_rate": 0.00013401557285873194, + "loss": 3.0019, + "step": 1489 + }, + { + "epoch": 19.869565217391305, + "grad_norm": 0.5524600148200989, + "learning_rate": 0.00013397107897664072, + "loss": 2.7409, + "step": 1490 + }, + { + "epoch": 19.88294314381271, + "grad_norm": 0.6532869338989258, + "learning_rate": 0.0001339265850945495, + "loss": 2.9336, + "step": 1491 + }, + { + "epoch": 19.896321070234112, + "grad_norm": 0.6459875106811523, + "learning_rate": 0.00013388209121245828, + "loss": 2.9572, + "step": 1492 + }, + { + "epoch": 19.909698996655518, + "grad_norm": 0.6051417589187622, + "learning_rate": 0.00013383759733036707, + "loss": 2.864, + "step": 1493 + }, + { + "epoch": 19.923076923076923, + "grad_norm": 0.6565695405006409, + "learning_rate": 0.00013379310344827588, + "loss": 2.865, + "step": 1494 + }, + { + "epoch": 19.93645484949833, + "grad_norm": 0.6118014454841614, + "learning_rate": 0.00013374860956618466, + "loss": 2.7594, + "step": 1495 + }, + { + "epoch": 19.94983277591973, + "grad_norm": 0.6801209449768066, + "learning_rate": 0.00013370411568409344, + "loss": 2.7532, + "step": 1496 + }, + { + "epoch": 19.963210702341136, + "grad_norm": 0.5785267353057861, + "learning_rate": 0.00013365962180200222, + "loss": 3.0618, + "step": 1497 + }, + { + "epoch": 19.976588628762542, + "grad_norm": 0.6344903707504272, + "learning_rate": 0.00013361512791991103, + "loss": 2.7908, + "step": 1498 + }, + { + "epoch": 19.989966555183948, + "grad_norm": 0.6073011159896851, + "learning_rate": 0.0001335706340378198, + "loss": 3.0699, + "step": 1499 + }, + { + "epoch": 20.0, + "grad_norm": 0.6989748477935791, + "learning_rate": 0.0001335261401557286, + "loss": 2.7932, + "step": 1500 + }, + { + "epoch": 20.013377926421406, + "grad_norm": 0.5897710919380188, + "learning_rate": 0.00013348164627363738, + "loss": 3.0379, + "step": 1501 + }, + { + "epoch": 20.02675585284281, + "grad_norm": 0.5845353603363037, + "learning_rate": 0.00013343715239154616, + "loss": 2.7123, + "step": 1502 + }, + { + "epoch": 20.040133779264213, + "grad_norm": 0.5720913410186768, + "learning_rate": 0.00013339265850945495, + "loss": 3.0196, + "step": 1503 + }, + { + "epoch": 20.05351170568562, + "grad_norm": 0.7616726756095886, + "learning_rate": 0.00013334816462736375, + "loss": 2.6719, + "step": 1504 + }, + { + "epoch": 20.066889632107024, + "grad_norm": 0.6211048364639282, + "learning_rate": 0.00013330367074527254, + "loss": 2.8263, + "step": 1505 + }, + { + "epoch": 20.08026755852843, + "grad_norm": 0.6016023755073547, + "learning_rate": 0.00013325917686318132, + "loss": 2.7226, + "step": 1506 + }, + { + "epoch": 20.093645484949832, + "grad_norm": 0.6265879273414612, + "learning_rate": 0.0001332146829810901, + "loss": 2.8597, + "step": 1507 + }, + { + "epoch": 20.107023411371237, + "grad_norm": 0.6129719614982605, + "learning_rate": 0.0001331701890989989, + "loss": 2.9449, + "step": 1508 + }, + { + "epoch": 20.120401337792643, + "grad_norm": 0.6351513266563416, + "learning_rate": 0.00013312569521690767, + "loss": 2.9768, + "step": 1509 + }, + { + "epoch": 20.13377926421405, + "grad_norm": 0.5772795677185059, + "learning_rate": 0.00013308120133481648, + "loss": 3.0424, + "step": 1510 + }, + { + "epoch": 20.14715719063545, + "grad_norm": 0.600697934627533, + "learning_rate": 0.00013303670745272526, + "loss": 2.793, + "step": 1511 + }, + { + "epoch": 20.160535117056856, + "grad_norm": 0.5418747663497925, + "learning_rate": 0.00013299221357063404, + "loss": 2.8706, + "step": 1512 + }, + { + "epoch": 20.17391304347826, + "grad_norm": 0.5500038862228394, + "learning_rate": 0.00013294771968854282, + "loss": 2.7352, + "step": 1513 + }, + { + "epoch": 20.187290969899667, + "grad_norm": 0.5478414297103882, + "learning_rate": 0.00013290322580645163, + "loss": 2.9113, + "step": 1514 + }, + { + "epoch": 20.20066889632107, + "grad_norm": 0.5382576584815979, + "learning_rate": 0.00013285873192436042, + "loss": 2.8276, + "step": 1515 + }, + { + "epoch": 20.214046822742475, + "grad_norm": 0.6003616452217102, + "learning_rate": 0.0001328142380422692, + "loss": 2.9112, + "step": 1516 + }, + { + "epoch": 20.22742474916388, + "grad_norm": 0.6674323678016663, + "learning_rate": 0.00013276974416017798, + "loss": 2.7382, + "step": 1517 + }, + { + "epoch": 20.240802675585286, + "grad_norm": 0.591314435005188, + "learning_rate": 0.0001327252502780868, + "loss": 2.8003, + "step": 1518 + }, + { + "epoch": 20.254180602006688, + "grad_norm": 0.5268637537956238, + "learning_rate": 0.00013268075639599554, + "loss": 2.7134, + "step": 1519 + }, + { + "epoch": 20.267558528428093, + "grad_norm": 0.5217694640159607, + "learning_rate": 0.00013263626251390435, + "loss": 2.6157, + "step": 1520 + }, + { + "epoch": 20.2809364548495, + "grad_norm": 0.6171165704727173, + "learning_rate": 0.00013259176863181314, + "loss": 2.8738, + "step": 1521 + }, + { + "epoch": 20.294314381270905, + "grad_norm": 0.5457054972648621, + "learning_rate": 0.00013254727474972192, + "loss": 2.6106, + "step": 1522 + }, + { + "epoch": 20.307692307692307, + "grad_norm": 0.6596150994300842, + "learning_rate": 0.0001325027808676307, + "loss": 2.9974, + "step": 1523 + }, + { + "epoch": 20.321070234113712, + "grad_norm": 0.7236288785934448, + "learning_rate": 0.0001324582869855395, + "loss": 2.8669, + "step": 1524 + }, + { + "epoch": 20.334448160535118, + "grad_norm": 0.6390851736068726, + "learning_rate": 0.0001324137931034483, + "loss": 2.909, + "step": 1525 + }, + { + "epoch": 20.347826086956523, + "grad_norm": 0.6010439991950989, + "learning_rate": 0.00013236929922135708, + "loss": 2.9567, + "step": 1526 + }, + { + "epoch": 20.361204013377925, + "grad_norm": 0.5825399160385132, + "learning_rate": 0.00013232480533926586, + "loss": 2.7688, + "step": 1527 + }, + { + "epoch": 20.37458193979933, + "grad_norm": 0.6000121831893921, + "learning_rate": 0.00013228031145717467, + "loss": 3.0515, + "step": 1528 + }, + { + "epoch": 20.387959866220736, + "grad_norm": 0.5775492787361145, + "learning_rate": 0.00013223581757508342, + "loss": 2.8958, + "step": 1529 + }, + { + "epoch": 20.401337792642142, + "grad_norm": 0.6193161010742188, + "learning_rate": 0.00013219132369299223, + "loss": 2.9648, + "step": 1530 + }, + { + "epoch": 20.414715719063544, + "grad_norm": 0.7751132249832153, + "learning_rate": 0.00013214682981090101, + "loss": 2.677, + "step": 1531 + }, + { + "epoch": 20.42809364548495, + "grad_norm": 0.6269053220748901, + "learning_rate": 0.0001321023359288098, + "loss": 2.9483, + "step": 1532 + }, + { + "epoch": 20.441471571906355, + "grad_norm": 0.5793836116790771, + "learning_rate": 0.00013205784204671858, + "loss": 2.8025, + "step": 1533 + }, + { + "epoch": 20.45484949832776, + "grad_norm": 0.5609649419784546, + "learning_rate": 0.00013201334816462736, + "loss": 2.8104, + "step": 1534 + }, + { + "epoch": 20.468227424749163, + "grad_norm": 0.6055050492286682, + "learning_rate": 0.00013196885428253617, + "loss": 2.8649, + "step": 1535 + }, + { + "epoch": 20.48160535117057, + "grad_norm": 0.5952653884887695, + "learning_rate": 0.00013192436040044493, + "loss": 2.6815, + "step": 1536 + }, + { + "epoch": 20.494983277591974, + "grad_norm": 0.5716878175735474, + "learning_rate": 0.00013187986651835374, + "loss": 2.637, + "step": 1537 + }, + { + "epoch": 20.50836120401338, + "grad_norm": 0.5684781074523926, + "learning_rate": 0.00013183537263626252, + "loss": 2.8601, + "step": 1538 + }, + { + "epoch": 20.52173913043478, + "grad_norm": 0.5603588223457336, + "learning_rate": 0.0001317908787541713, + "loss": 2.9173, + "step": 1539 + }, + { + "epoch": 20.535117056856187, + "grad_norm": 0.6022379398345947, + "learning_rate": 0.00013174638487208008, + "loss": 2.873, + "step": 1540 + }, + { + "epoch": 20.548494983277592, + "grad_norm": 0.6124081611633301, + "learning_rate": 0.0001317018909899889, + "loss": 3.05, + "step": 1541 + }, + { + "epoch": 20.561872909698998, + "grad_norm": 0.5928300023078918, + "learning_rate": 0.00013165739710789768, + "loss": 2.7411, + "step": 1542 + }, + { + "epoch": 20.5752508361204, + "grad_norm": 0.5931242108345032, + "learning_rate": 0.00013161290322580646, + "loss": 3.0032, + "step": 1543 + }, + { + "epoch": 20.588628762541806, + "grad_norm": 0.7123291492462158, + "learning_rate": 0.00013156840934371524, + "loss": 2.6485, + "step": 1544 + }, + { + "epoch": 20.60200668896321, + "grad_norm": 0.565980851650238, + "learning_rate": 0.00013152391546162405, + "loss": 2.7143, + "step": 1545 + }, + { + "epoch": 20.615384615384617, + "grad_norm": 0.6504361033439636, + "learning_rate": 0.0001314794215795328, + "loss": 2.6285, + "step": 1546 + }, + { + "epoch": 20.62876254180602, + "grad_norm": 0.6558433175086975, + "learning_rate": 0.00013143492769744161, + "loss": 3.0403, + "step": 1547 + }, + { + "epoch": 20.642140468227424, + "grad_norm": 0.6159015893936157, + "learning_rate": 0.0001313904338153504, + "loss": 2.8836, + "step": 1548 + }, + { + "epoch": 20.65551839464883, + "grad_norm": 0.6061471104621887, + "learning_rate": 0.00013134593993325918, + "loss": 2.9076, + "step": 1549 + }, + { + "epoch": 20.668896321070235, + "grad_norm": 0.6891756057739258, + "learning_rate": 0.00013130144605116796, + "loss": 3.0326, + "step": 1550 + }, + { + "epoch": 20.682274247491637, + "grad_norm": 0.5920016765594482, + "learning_rate": 0.00013125695216907677, + "loss": 2.949, + "step": 1551 + }, + { + "epoch": 20.695652173913043, + "grad_norm": 0.7947055697441101, + "learning_rate": 0.00013121245828698555, + "loss": 2.7796, + "step": 1552 + }, + { + "epoch": 20.70903010033445, + "grad_norm": 0.6409865021705627, + "learning_rate": 0.00013116796440489434, + "loss": 2.9103, + "step": 1553 + }, + { + "epoch": 20.722408026755854, + "grad_norm": 0.5864197611808777, + "learning_rate": 0.00013112347052280312, + "loss": 3.0493, + "step": 1554 + }, + { + "epoch": 20.735785953177256, + "grad_norm": 0.6567059755325317, + "learning_rate": 0.00013107897664071193, + "loss": 2.8359, + "step": 1555 + }, + { + "epoch": 20.74916387959866, + "grad_norm": 0.5808839201927185, + "learning_rate": 0.00013103448275862068, + "loss": 2.9595, + "step": 1556 + }, + { + "epoch": 20.762541806020067, + "grad_norm": 0.6176792979240417, + "learning_rate": 0.0001309899888765295, + "loss": 2.6975, + "step": 1557 + }, + { + "epoch": 20.775919732441473, + "grad_norm": 0.5763049721717834, + "learning_rate": 0.00013094549499443827, + "loss": 2.9752, + "step": 1558 + }, + { + "epoch": 20.789297658862875, + "grad_norm": 0.5877807140350342, + "learning_rate": 0.00013090100111234706, + "loss": 2.7214, + "step": 1559 + }, + { + "epoch": 20.80267558528428, + "grad_norm": 0.6624999642372131, + "learning_rate": 0.00013085650723025584, + "loss": 2.8642, + "step": 1560 + }, + { + "epoch": 20.816053511705686, + "grad_norm": 0.7411154508590698, + "learning_rate": 0.00013081201334816465, + "loss": 2.7526, + "step": 1561 + }, + { + "epoch": 20.82943143812709, + "grad_norm": 0.6246466636657715, + "learning_rate": 0.00013076751946607343, + "loss": 3.0441, + "step": 1562 + }, + { + "epoch": 20.842809364548494, + "grad_norm": 0.5515304803848267, + "learning_rate": 0.00013072302558398221, + "loss": 2.7211, + "step": 1563 + }, + { + "epoch": 20.8561872909699, + "grad_norm": 0.5537006258964539, + "learning_rate": 0.000130678531701891, + "loss": 2.9619, + "step": 1564 + }, + { + "epoch": 20.869565217391305, + "grad_norm": 0.6326210498809814, + "learning_rate": 0.0001306340378197998, + "loss": 3.2433, + "step": 1565 + }, + { + "epoch": 20.88294314381271, + "grad_norm": 0.6463941335678101, + "learning_rate": 0.00013058954393770856, + "loss": 2.7263, + "step": 1566 + }, + { + "epoch": 20.896321070234112, + "grad_norm": 0.6212389469146729, + "learning_rate": 0.00013054505005561737, + "loss": 2.9775, + "step": 1567 + }, + { + "epoch": 20.909698996655518, + "grad_norm": 0.5878139734268188, + "learning_rate": 0.00013050055617352615, + "loss": 2.7427, + "step": 1568 + }, + { + "epoch": 20.923076923076923, + "grad_norm": 0.5821187496185303, + "learning_rate": 0.00013045606229143494, + "loss": 2.9058, + "step": 1569 + }, + { + "epoch": 20.93645484949833, + "grad_norm": 0.6192734837532043, + "learning_rate": 0.00013041156840934372, + "loss": 2.9744, + "step": 1570 + }, + { + "epoch": 20.94983277591973, + "grad_norm": 0.5540494322776794, + "learning_rate": 0.00013036707452725253, + "loss": 2.8299, + "step": 1571 + }, + { + "epoch": 20.963210702341136, + "grad_norm": 0.6620298624038696, + "learning_rate": 0.0001303225806451613, + "loss": 2.8291, + "step": 1572 + }, + { + "epoch": 20.976588628762542, + "grad_norm": 0.7161823511123657, + "learning_rate": 0.0001302780867630701, + "loss": 2.9829, + "step": 1573 + }, + { + "epoch": 20.989966555183948, + "grad_norm": 1.1449205875396729, + "learning_rate": 0.00013023359288097887, + "loss": 2.8807, + "step": 1574 + }, + { + "epoch": 21.0, + "grad_norm": 0.6721773743629456, + "learning_rate": 0.00013018909899888766, + "loss": 2.8398, + "step": 1575 + }, + { + "epoch": 21.013377926421406, + "grad_norm": 0.5506662130355835, + "learning_rate": 0.00013014460511679644, + "loss": 2.6858, + "step": 1576 + }, + { + "epoch": 21.02675585284281, + "grad_norm": 0.5651558041572571, + "learning_rate": 0.00013010011123470522, + "loss": 2.8692, + "step": 1577 + }, + { + "epoch": 21.040133779264213, + "grad_norm": 0.5379538536071777, + "learning_rate": 0.00013005561735261403, + "loss": 2.8221, + "step": 1578 + }, + { + "epoch": 21.05351170568562, + "grad_norm": 0.5756275653839111, + "learning_rate": 0.0001300111234705228, + "loss": 2.5469, + "step": 1579 + }, + { + "epoch": 21.066889632107024, + "grad_norm": 0.672498881816864, + "learning_rate": 0.0001299666295884316, + "loss": 2.825, + "step": 1580 + }, + { + "epoch": 21.08026755852843, + "grad_norm": 0.5217440128326416, + "learning_rate": 0.00012992213570634038, + "loss": 2.732, + "step": 1581 + }, + { + "epoch": 21.093645484949832, + "grad_norm": 0.6326786875724792, + "learning_rate": 0.0001298776418242492, + "loss": 2.6537, + "step": 1582 + }, + { + "epoch": 21.107023411371237, + "grad_norm": 0.6899091005325317, + "learning_rate": 0.00012983314794215794, + "loss": 2.7133, + "step": 1583 + }, + { + "epoch": 21.120401337792643, + "grad_norm": 0.604834794998169, + "learning_rate": 0.00012978865406006675, + "loss": 2.9341, + "step": 1584 + }, + { + "epoch": 21.13377926421405, + "grad_norm": 0.7540157437324524, + "learning_rate": 0.00012974416017797553, + "loss": 2.8224, + "step": 1585 + }, + { + "epoch": 21.14715719063545, + "grad_norm": 0.6161774396896362, + "learning_rate": 0.00012969966629588432, + "loss": 2.9968, + "step": 1586 + }, + { + "epoch": 21.160535117056856, + "grad_norm": 0.618781328201294, + "learning_rate": 0.0001296551724137931, + "loss": 2.8074, + "step": 1587 + }, + { + "epoch": 21.17391304347826, + "grad_norm": 0.6708270907402039, + "learning_rate": 0.0001296106785317019, + "loss": 2.9327, + "step": 1588 + }, + { + "epoch": 21.187290969899667, + "grad_norm": 0.6362271904945374, + "learning_rate": 0.0001295661846496107, + "loss": 2.46, + "step": 1589 + }, + { + "epoch": 21.20066889632107, + "grad_norm": 0.6039800047874451, + "learning_rate": 0.00012952169076751947, + "loss": 2.7728, + "step": 1590 + }, + { + "epoch": 21.214046822742475, + "grad_norm": 0.6119550466537476, + "learning_rate": 0.00012947719688542826, + "loss": 2.7123, + "step": 1591 + }, + { + "epoch": 21.22742474916388, + "grad_norm": 0.8122309446334839, + "learning_rate": 0.00012943270300333707, + "loss": 2.7424, + "step": 1592 + }, + { + "epoch": 21.240802675585286, + "grad_norm": 0.5658085346221924, + "learning_rate": 0.00012938820912124582, + "loss": 2.4553, + "step": 1593 + }, + { + "epoch": 21.254180602006688, + "grad_norm": 0.7365813851356506, + "learning_rate": 0.00012934371523915463, + "loss": 2.9437, + "step": 1594 + }, + { + "epoch": 21.267558528428093, + "grad_norm": 0.5746303796768188, + "learning_rate": 0.0001292992213570634, + "loss": 2.8262, + "step": 1595 + }, + { + "epoch": 21.2809364548495, + "grad_norm": 0.5708132386207581, + "learning_rate": 0.0001292547274749722, + "loss": 2.9298, + "step": 1596 + }, + { + "epoch": 21.294314381270905, + "grad_norm": 0.6003934741020203, + "learning_rate": 0.00012921023359288098, + "loss": 2.7943, + "step": 1597 + }, + { + "epoch": 21.307692307692307, + "grad_norm": 0.5722339749336243, + "learning_rate": 0.0001291657397107898, + "loss": 3.0547, + "step": 1598 + }, + { + "epoch": 21.321070234113712, + "grad_norm": 0.6287402510643005, + "learning_rate": 0.00012912124582869857, + "loss": 2.8814, + "step": 1599 + }, + { + "epoch": 21.334448160535118, + "grad_norm": 0.6594595313072205, + "learning_rate": 0.00012907675194660735, + "loss": 2.7927, + "step": 1600 + }, + { + "epoch": 21.347826086956523, + "grad_norm": 0.6678686141967773, + "learning_rate": 0.00012903225806451613, + "loss": 2.7981, + "step": 1601 + }, + { + "epoch": 21.361204013377925, + "grad_norm": 0.6103408336639404, + "learning_rate": 0.00012898776418242494, + "loss": 2.9009, + "step": 1602 + }, + { + "epoch": 21.37458193979933, + "grad_norm": 0.5791922807693481, + "learning_rate": 0.0001289432703003337, + "loss": 2.6689, + "step": 1603 + }, + { + "epoch": 21.387959866220736, + "grad_norm": 0.5776186585426331, + "learning_rate": 0.0001288987764182425, + "loss": 2.903, + "step": 1604 + }, + { + "epoch": 21.401337792642142, + "grad_norm": 0.5746363401412964, + "learning_rate": 0.0001288542825361513, + "loss": 2.832, + "step": 1605 + }, + { + "epoch": 21.414715719063544, + "grad_norm": 0.5495322346687317, + "learning_rate": 0.00012880978865406007, + "loss": 2.7305, + "step": 1606 + }, + { + "epoch": 21.42809364548495, + "grad_norm": 0.6944610476493835, + "learning_rate": 0.00012876529477196886, + "loss": 2.7534, + "step": 1607 + }, + { + "epoch": 21.441471571906355, + "grad_norm": 0.6640026569366455, + "learning_rate": 0.00012872080088987767, + "loss": 2.814, + "step": 1608 + }, + { + "epoch": 21.45484949832776, + "grad_norm": 0.7469935417175293, + "learning_rate": 0.00012867630700778642, + "loss": 2.8523, + "step": 1609 + }, + { + "epoch": 21.468227424749163, + "grad_norm": 0.600739061832428, + "learning_rate": 0.00012863181312569523, + "loss": 3.0074, + "step": 1610 + }, + { + "epoch": 21.48160535117057, + "grad_norm": 0.6701086759567261, + "learning_rate": 0.000128587319243604, + "loss": 2.6693, + "step": 1611 + }, + { + "epoch": 21.494983277591974, + "grad_norm": 0.6365100145339966, + "learning_rate": 0.00012854282536151282, + "loss": 2.9558, + "step": 1612 + }, + { + "epoch": 21.50836120401338, + "grad_norm": 0.6170171499252319, + "learning_rate": 0.00012849833147942158, + "loss": 2.605, + "step": 1613 + }, + { + "epoch": 21.52173913043478, + "grad_norm": 0.5899895429611206, + "learning_rate": 0.0001284538375973304, + "loss": 2.7816, + "step": 1614 + }, + { + "epoch": 21.535117056856187, + "grad_norm": 0.6815734505653381, + "learning_rate": 0.00012840934371523917, + "loss": 2.6663, + "step": 1615 + }, + { + "epoch": 21.548494983277592, + "grad_norm": 0.6620442271232605, + "learning_rate": 0.00012836484983314792, + "loss": 2.9333, + "step": 1616 + }, + { + "epoch": 21.561872909698998, + "grad_norm": 0.6552561521530151, + "learning_rate": 0.00012832035595105673, + "loss": 2.6833, + "step": 1617 + }, + { + "epoch": 21.5752508361204, + "grad_norm": 0.6288586854934692, + "learning_rate": 0.00012827586206896552, + "loss": 2.959, + "step": 1618 + }, + { + "epoch": 21.588628762541806, + "grad_norm": 0.6969814300537109, + "learning_rate": 0.0001282313681868743, + "loss": 2.7129, + "step": 1619 + }, + { + "epoch": 21.60200668896321, + "grad_norm": 0.5918965339660645, + "learning_rate": 0.00012818687430478308, + "loss": 2.9057, + "step": 1620 + }, + { + "epoch": 21.615384615384617, + "grad_norm": 0.5739157199859619, + "learning_rate": 0.0001281423804226919, + "loss": 2.9275, + "step": 1621 + }, + { + "epoch": 21.62876254180602, + "grad_norm": 0.6649355888366699, + "learning_rate": 0.00012809788654060067, + "loss": 2.7629, + "step": 1622 + }, + { + "epoch": 21.642140468227424, + "grad_norm": 0.6449686884880066, + "learning_rate": 0.00012805339265850946, + "loss": 2.9173, + "step": 1623 + }, + { + "epoch": 21.65551839464883, + "grad_norm": 0.5940249562263489, + "learning_rate": 0.00012800889877641824, + "loss": 2.875, + "step": 1624 + }, + { + "epoch": 21.668896321070235, + "grad_norm": 0.8104184865951538, + "learning_rate": 0.00012796440489432705, + "loss": 2.7381, + "step": 1625 + }, + { + "epoch": 21.682274247491637, + "grad_norm": 0.6892338395118713, + "learning_rate": 0.0001279199110122358, + "loss": 2.7516, + "step": 1626 + }, + { + "epoch": 21.695652173913043, + "grad_norm": 0.5888476967811584, + "learning_rate": 0.0001278754171301446, + "loss": 2.9234, + "step": 1627 + }, + { + "epoch": 21.70903010033445, + "grad_norm": 0.6148846745491028, + "learning_rate": 0.0001278309232480534, + "loss": 3.0824, + "step": 1628 + }, + { + "epoch": 21.722408026755854, + "grad_norm": 0.6559250354766846, + "learning_rate": 0.00012778642936596218, + "loss": 3.0316, + "step": 1629 + }, + { + "epoch": 21.735785953177256, + "grad_norm": 0.5679346919059753, + "learning_rate": 0.00012774193548387096, + "loss": 3.0012, + "step": 1630 + }, + { + "epoch": 21.74916387959866, + "grad_norm": 0.6776624917984009, + "learning_rate": 0.00012769744160177977, + "loss": 2.8331, + "step": 1631 + }, + { + "epoch": 21.762541806020067, + "grad_norm": 0.6982457637786865, + "learning_rate": 0.00012765294771968855, + "loss": 2.8175, + "step": 1632 + }, + { + "epoch": 21.775919732441473, + "grad_norm": 0.6834619045257568, + "learning_rate": 0.00012760845383759733, + "loss": 3.1136, + "step": 1633 + }, + { + "epoch": 21.789297658862875, + "grad_norm": 0.6460130214691162, + "learning_rate": 0.00012756395995550612, + "loss": 2.7737, + "step": 1634 + }, + { + "epoch": 21.80267558528428, + "grad_norm": 0.6818533539772034, + "learning_rate": 0.00012751946607341493, + "loss": 2.5013, + "step": 1635 + }, + { + "epoch": 21.816053511705686, + "grad_norm": 0.7875826954841614, + "learning_rate": 0.00012747497219132368, + "loss": 2.9702, + "step": 1636 + }, + { + "epoch": 21.82943143812709, + "grad_norm": 0.5845924019813538, + "learning_rate": 0.0001274304783092325, + "loss": 2.8592, + "step": 1637 + }, + { + "epoch": 21.842809364548494, + "grad_norm": 0.6136084794998169, + "learning_rate": 0.00012738598442714127, + "loss": 2.8456, + "step": 1638 + }, + { + "epoch": 21.8561872909699, + "grad_norm": 0.7345831394195557, + "learning_rate": 0.00012734149054505005, + "loss": 2.9846, + "step": 1639 + }, + { + "epoch": 21.869565217391305, + "grad_norm": 0.5709069967269897, + "learning_rate": 0.00012729699666295884, + "loss": 2.9736, + "step": 1640 + }, + { + "epoch": 21.88294314381271, + "grad_norm": 0.7338367104530334, + "learning_rate": 0.00012725250278086765, + "loss": 2.8215, + "step": 1641 + }, + { + "epoch": 21.896321070234112, + "grad_norm": 0.6029645800590515, + "learning_rate": 0.00012720800889877643, + "loss": 2.7419, + "step": 1642 + }, + { + "epoch": 21.909698996655518, + "grad_norm": 0.7492797374725342, + "learning_rate": 0.0001271635150166852, + "loss": 2.8256, + "step": 1643 + }, + { + "epoch": 21.923076923076923, + "grad_norm": 0.6877094507217407, + "learning_rate": 0.000127119021134594, + "loss": 2.8241, + "step": 1644 + }, + { + "epoch": 21.93645484949833, + "grad_norm": 0.6263371109962463, + "learning_rate": 0.0001270745272525028, + "loss": 2.8459, + "step": 1645 + }, + { + "epoch": 21.94983277591973, + "grad_norm": 0.5821470618247986, + "learning_rate": 0.00012703003337041156, + "loss": 3.0513, + "step": 1646 + }, + { + "epoch": 21.963210702341136, + "grad_norm": 0.5494033098220825, + "learning_rate": 0.00012698553948832037, + "loss": 3.0951, + "step": 1647 + }, + { + "epoch": 21.976588628762542, + "grad_norm": 0.5838212370872498, + "learning_rate": 0.00012694104560622915, + "loss": 3.0089, + "step": 1648 + }, + { + "epoch": 21.989966555183948, + "grad_norm": 0.7115112543106079, + "learning_rate": 0.00012689655172413793, + "loss": 2.9137, + "step": 1649 + }, + { + "epoch": 22.0, + "grad_norm": 0.7208350896835327, + "learning_rate": 0.00012685205784204672, + "loss": 2.8793, + "step": 1650 + }, + { + "epoch": 22.013377926421406, + "grad_norm": 0.912466824054718, + "learning_rate": 0.00012680756395995552, + "loss": 2.7563, + "step": 1651 + }, + { + "epoch": 22.02675585284281, + "grad_norm": 0.7778460383415222, + "learning_rate": 0.0001267630700778643, + "loss": 2.7859, + "step": 1652 + }, + { + "epoch": 22.040133779264213, + "grad_norm": 0.6658245921134949, + "learning_rate": 0.0001267185761957731, + "loss": 2.8353, + "step": 1653 + }, + { + "epoch": 22.05351170568562, + "grad_norm": 0.5711331963539124, + "learning_rate": 0.00012667408231368187, + "loss": 2.7731, + "step": 1654 + }, + { + "epoch": 22.066889632107024, + "grad_norm": 0.6227294206619263, + "learning_rate": 0.00012662958843159068, + "loss": 2.6841, + "step": 1655 + }, + { + "epoch": 22.08026755852843, + "grad_norm": 0.6221343874931335, + "learning_rate": 0.00012658509454949944, + "loss": 2.6303, + "step": 1656 + }, + { + "epoch": 22.093645484949832, + "grad_norm": 0.6461536288261414, + "learning_rate": 0.00012654060066740822, + "loss": 2.7591, + "step": 1657 + }, + { + "epoch": 22.107023411371237, + "grad_norm": 0.9253934025764465, + "learning_rate": 0.00012649610678531703, + "loss": 2.7619, + "step": 1658 + }, + { + "epoch": 22.120401337792643, + "grad_norm": 1.0310887098312378, + "learning_rate": 0.0001264516129032258, + "loss": 2.7614, + "step": 1659 + }, + { + "epoch": 22.13377926421405, + "grad_norm": 0.682077944278717, + "learning_rate": 0.0001264071190211346, + "loss": 2.9013, + "step": 1660 + }, + { + "epoch": 22.14715719063545, + "grad_norm": 0.6438060998916626, + "learning_rate": 0.00012636262513904338, + "loss": 2.8715, + "step": 1661 + }, + { + "epoch": 22.160535117056856, + "grad_norm": 0.5626085996627808, + "learning_rate": 0.00012631813125695219, + "loss": 2.7459, + "step": 1662 + }, + { + "epoch": 22.17391304347826, + "grad_norm": 0.6211214065551758, + "learning_rate": 0.00012627363737486094, + "loss": 2.9223, + "step": 1663 + }, + { + "epoch": 22.187290969899667, + "grad_norm": 0.5843247175216675, + "learning_rate": 0.00012622914349276975, + "loss": 3.0176, + "step": 1664 + }, + { + "epoch": 22.20066889632107, + "grad_norm": 0.6780831813812256, + "learning_rate": 0.00012618464961067853, + "loss": 3.0641, + "step": 1665 + }, + { + "epoch": 22.214046822742475, + "grad_norm": 0.6507664918899536, + "learning_rate": 0.00012614015572858731, + "loss": 2.9043, + "step": 1666 + }, + { + "epoch": 22.22742474916388, + "grad_norm": 0.6472734212875366, + "learning_rate": 0.0001260956618464961, + "loss": 2.678, + "step": 1667 + }, + { + "epoch": 22.240802675585286, + "grad_norm": 0.5920576453208923, + "learning_rate": 0.0001260511679644049, + "loss": 2.9801, + "step": 1668 + }, + { + "epoch": 22.254180602006688, + "grad_norm": 0.5785727500915527, + "learning_rate": 0.0001260066740823137, + "loss": 2.6949, + "step": 1669 + }, + { + "epoch": 22.267558528428093, + "grad_norm": 0.8260320425033569, + "learning_rate": 0.00012596218020022247, + "loss": 2.939, + "step": 1670 + }, + { + "epoch": 22.2809364548495, + "grad_norm": 0.6697145700454712, + "learning_rate": 0.00012591768631813125, + "loss": 2.6997, + "step": 1671 + }, + { + "epoch": 22.294314381270905, + "grad_norm": 0.6255594491958618, + "learning_rate": 0.00012587319243604006, + "loss": 2.7875, + "step": 1672 + }, + { + "epoch": 22.307692307692307, + "grad_norm": 0.707083523273468, + "learning_rate": 0.00012582869855394882, + "loss": 2.8843, + "step": 1673 + }, + { + "epoch": 22.321070234113712, + "grad_norm": 0.5888648629188538, + "learning_rate": 0.00012578420467185763, + "loss": 2.8862, + "step": 1674 + }, + { + "epoch": 22.334448160535118, + "grad_norm": 0.6249898672103882, + "learning_rate": 0.0001257397107897664, + "loss": 2.6108, + "step": 1675 + }, + { + "epoch": 22.347826086956523, + "grad_norm": 0.6932422518730164, + "learning_rate": 0.0001256952169076752, + "loss": 2.8393, + "step": 1676 + }, + { + "epoch": 22.361204013377925, + "grad_norm": 0.6037236452102661, + "learning_rate": 0.00012565072302558398, + "loss": 2.6517, + "step": 1677 + }, + { + "epoch": 22.37458193979933, + "grad_norm": 0.5549351572990417, + "learning_rate": 0.00012560622914349278, + "loss": 2.7725, + "step": 1678 + }, + { + "epoch": 22.387959866220736, + "grad_norm": 0.6309815645217896, + "learning_rate": 0.00012556173526140157, + "loss": 2.7539, + "step": 1679 + }, + { + "epoch": 22.401337792642142, + "grad_norm": 0.6618108153343201, + "learning_rate": 0.00012551724137931035, + "loss": 2.9342, + "step": 1680 + }, + { + "epoch": 22.414715719063544, + "grad_norm": 0.7427678108215332, + "learning_rate": 0.00012547274749721913, + "loss": 2.9707, + "step": 1681 + }, + { + "epoch": 22.42809364548495, + "grad_norm": 0.6226203441619873, + "learning_rate": 0.00012542825361512794, + "loss": 2.728, + "step": 1682 + }, + { + "epoch": 22.441471571906355, + "grad_norm": 0.6432873606681824, + "learning_rate": 0.0001253837597330367, + "loss": 2.6271, + "step": 1683 + }, + { + "epoch": 22.45484949832776, + "grad_norm": 0.618368923664093, + "learning_rate": 0.0001253392658509455, + "loss": 2.7364, + "step": 1684 + }, + { + "epoch": 22.468227424749163, + "grad_norm": 0.601243257522583, + "learning_rate": 0.0001252947719688543, + "loss": 2.7071, + "step": 1685 + }, + { + "epoch": 22.48160535117057, + "grad_norm": 0.5549631714820862, + "learning_rate": 0.00012525027808676307, + "loss": 2.7703, + "step": 1686 + }, + { + "epoch": 22.494983277591974, + "grad_norm": 0.8125869035720825, + "learning_rate": 0.00012520578420467185, + "loss": 2.6798, + "step": 1687 + }, + { + "epoch": 22.50836120401338, + "grad_norm": 0.5712392330169678, + "learning_rate": 0.00012516129032258066, + "loss": 2.7694, + "step": 1688 + }, + { + "epoch": 22.52173913043478, + "grad_norm": 0.5955860614776611, + "learning_rate": 0.00012511679644048945, + "loss": 2.8128, + "step": 1689 + }, + { + "epoch": 22.535117056856187, + "grad_norm": 0.6090961694717407, + "learning_rate": 0.00012507230255839823, + "loss": 2.9096, + "step": 1690 + }, + { + "epoch": 22.548494983277592, + "grad_norm": 0.6430270671844482, + "learning_rate": 0.000125027808676307, + "loss": 2.8046, + "step": 1691 + }, + { + "epoch": 22.561872909698998, + "grad_norm": 0.6325036883354187, + "learning_rate": 0.00012498331479421582, + "loss": 2.9619, + "step": 1692 + }, + { + "epoch": 22.5752508361204, + "grad_norm": 0.5997435450553894, + "learning_rate": 0.00012493882091212458, + "loss": 2.9809, + "step": 1693 + }, + { + "epoch": 22.588628762541806, + "grad_norm": 0.6140502095222473, + "learning_rate": 0.00012489432703003338, + "loss": 2.9622, + "step": 1694 + }, + { + "epoch": 22.60200668896321, + "grad_norm": 0.5908461809158325, + "learning_rate": 0.00012484983314794217, + "loss": 2.8282, + "step": 1695 + }, + { + "epoch": 22.615384615384617, + "grad_norm": 0.5987250208854675, + "learning_rate": 0.00012480533926585095, + "loss": 2.8239, + "step": 1696 + }, + { + "epoch": 22.62876254180602, + "grad_norm": 0.6643062233924866, + "learning_rate": 0.00012476084538375973, + "loss": 2.8677, + "step": 1697 + }, + { + "epoch": 22.642140468227424, + "grad_norm": 0.640986442565918, + "learning_rate": 0.00012471635150166851, + "loss": 2.6359, + "step": 1698 + }, + { + "epoch": 22.65551839464883, + "grad_norm": 0.6315325498580933, + "learning_rate": 0.00012467185761957732, + "loss": 2.7919, + "step": 1699 + }, + { + "epoch": 22.668896321070235, + "grad_norm": 0.6384916305541992, + "learning_rate": 0.00012462736373748608, + "loss": 3.0353, + "step": 1700 + }, + { + "epoch": 22.682274247491637, + "grad_norm": 0.6934499144554138, + "learning_rate": 0.0001245828698553949, + "loss": 2.8694, + "step": 1701 + }, + { + "epoch": 22.695652173913043, + "grad_norm": 0.6261676549911499, + "learning_rate": 0.00012453837597330367, + "loss": 2.6893, + "step": 1702 + }, + { + "epoch": 22.70903010033445, + "grad_norm": 0.6069616079330444, + "learning_rate": 0.00012449388209121245, + "loss": 2.9214, + "step": 1703 + }, + { + "epoch": 22.722408026755854, + "grad_norm": 0.6354973316192627, + "learning_rate": 0.00012444938820912124, + "loss": 2.754, + "step": 1704 + }, + { + "epoch": 22.735785953177256, + "grad_norm": 0.7385901808738708, + "learning_rate": 0.00012440489432703004, + "loss": 2.7633, + "step": 1705 + }, + { + "epoch": 22.74916387959866, + "grad_norm": 0.680008053779602, + "learning_rate": 0.00012436040044493883, + "loss": 2.851, + "step": 1706 + }, + { + "epoch": 22.762541806020067, + "grad_norm": 0.5894846320152283, + "learning_rate": 0.0001243159065628476, + "loss": 2.8797, + "step": 1707 + }, + { + "epoch": 22.775919732441473, + "grad_norm": 0.5705847144126892, + "learning_rate": 0.0001242714126807564, + "loss": 2.7471, + "step": 1708 + }, + { + "epoch": 22.789297658862875, + "grad_norm": 0.6021112203598022, + "learning_rate": 0.0001242269187986652, + "loss": 3.0716, + "step": 1709 + }, + { + "epoch": 22.80267558528428, + "grad_norm": 0.63507080078125, + "learning_rate": 0.00012418242491657396, + "loss": 2.8873, + "step": 1710 + }, + { + "epoch": 22.816053511705686, + "grad_norm": 0.6159088015556335, + "learning_rate": 0.00012413793103448277, + "loss": 2.5226, + "step": 1711 + }, + { + "epoch": 22.82943143812709, + "grad_norm": 0.6099200248718262, + "learning_rate": 0.00012409343715239155, + "loss": 2.9937, + "step": 1712 + }, + { + "epoch": 22.842809364548494, + "grad_norm": 0.5942363142967224, + "learning_rate": 0.00012404894327030033, + "loss": 2.5113, + "step": 1713 + }, + { + "epoch": 22.8561872909699, + "grad_norm": 0.6209394335746765, + "learning_rate": 0.00012400444938820911, + "loss": 3.0766, + "step": 1714 + }, + { + "epoch": 22.869565217391305, + "grad_norm": 0.6850425601005554, + "learning_rate": 0.00012395995550611792, + "loss": 2.8807, + "step": 1715 + }, + { + "epoch": 22.88294314381271, + "grad_norm": 0.5698277354240417, + "learning_rate": 0.0001239154616240267, + "loss": 2.6821, + "step": 1716 + }, + { + "epoch": 22.896321070234112, + "grad_norm": 0.7140303254127502, + "learning_rate": 0.0001238709677419355, + "loss": 2.7047, + "step": 1717 + }, + { + "epoch": 22.909698996655518, + "grad_norm": 0.6129325032234192, + "learning_rate": 0.00012382647385984427, + "loss": 2.705, + "step": 1718 + }, + { + "epoch": 22.923076923076923, + "grad_norm": 0.6409560441970825, + "learning_rate": 0.00012378197997775308, + "loss": 2.7126, + "step": 1719 + }, + { + "epoch": 22.93645484949833, + "grad_norm": 0.5590237975120544, + "learning_rate": 0.00012373748609566184, + "loss": 2.7945, + "step": 1720 + }, + { + "epoch": 22.94983277591973, + "grad_norm": 0.5763843655586243, + "learning_rate": 0.00012369299221357064, + "loss": 2.6802, + "step": 1721 + }, + { + "epoch": 22.963210702341136, + "grad_norm": 0.6224062442779541, + "learning_rate": 0.00012364849833147943, + "loss": 2.7035, + "step": 1722 + }, + { + "epoch": 22.976588628762542, + "grad_norm": 0.5638879537582397, + "learning_rate": 0.0001236040044493882, + "loss": 2.8259, + "step": 1723 + }, + { + "epoch": 22.989966555183948, + "grad_norm": 0.6214390993118286, + "learning_rate": 0.000123559510567297, + "loss": 2.9997, + "step": 1724 + }, + { + "epoch": 23.0, + "grad_norm": 0.6950054168701172, + "learning_rate": 0.0001235150166852058, + "loss": 2.7497, + "step": 1725 + }, + { + "epoch": 23.013377926421406, + "grad_norm": 0.67231285572052, + "learning_rate": 0.00012347052280311458, + "loss": 2.859, + "step": 1726 + }, + { + "epoch": 23.02675585284281, + "grad_norm": 0.6401329040527344, + "learning_rate": 0.00012342602892102337, + "loss": 2.7056, + "step": 1727 + }, + { + "epoch": 23.040133779264213, + "grad_norm": 0.6234462261199951, + "learning_rate": 0.00012338153503893215, + "loss": 2.8271, + "step": 1728 + }, + { + "epoch": 23.05351170568562, + "grad_norm": 0.6482179164886475, + "learning_rate": 0.00012333704115684096, + "loss": 2.898, + "step": 1729 + }, + { + "epoch": 23.066889632107024, + "grad_norm": 0.6077515482902527, + "learning_rate": 0.0001232925472747497, + "loss": 2.6865, + "step": 1730 + }, + { + "epoch": 23.08026755852843, + "grad_norm": 0.5866036415100098, + "learning_rate": 0.00012324805339265852, + "loss": 2.9001, + "step": 1731 + }, + { + "epoch": 23.093645484949832, + "grad_norm": 0.6499370336532593, + "learning_rate": 0.0001232035595105673, + "loss": 2.8718, + "step": 1732 + }, + { + "epoch": 23.107023411371237, + "grad_norm": 0.6188756227493286, + "learning_rate": 0.0001231590656284761, + "loss": 2.7928, + "step": 1733 + }, + { + "epoch": 23.120401337792643, + "grad_norm": 0.5990676879882812, + "learning_rate": 0.00012311457174638487, + "loss": 2.7918, + "step": 1734 + }, + { + "epoch": 23.13377926421405, + "grad_norm": 0.5592599511146545, + "learning_rate": 0.00012307007786429368, + "loss": 2.7948, + "step": 1735 + }, + { + "epoch": 23.14715719063545, + "grad_norm": 0.5751471519470215, + "learning_rate": 0.00012302558398220246, + "loss": 2.6472, + "step": 1736 + }, + { + "epoch": 23.160535117056856, + "grad_norm": 0.6300846338272095, + "learning_rate": 0.00012298109010011124, + "loss": 2.7491, + "step": 1737 + }, + { + "epoch": 23.17391304347826, + "grad_norm": 0.5754296779632568, + "learning_rate": 0.00012293659621802003, + "loss": 2.7557, + "step": 1738 + }, + { + "epoch": 23.187290969899667, + "grad_norm": 0.6483306884765625, + "learning_rate": 0.0001228921023359288, + "loss": 2.776, + "step": 1739 + }, + { + "epoch": 23.20066889632107, + "grad_norm": 0.5819816589355469, + "learning_rate": 0.0001228476084538376, + "loss": 2.9192, + "step": 1740 + }, + { + "epoch": 23.214046822742475, + "grad_norm": 0.6570101380348206, + "learning_rate": 0.00012280311457174637, + "loss": 2.8571, + "step": 1741 + }, + { + "epoch": 23.22742474916388, + "grad_norm": 0.6435232162475586, + "learning_rate": 0.00012275862068965518, + "loss": 2.6134, + "step": 1742 + }, + { + "epoch": 23.240802675585286, + "grad_norm": 0.5817757844924927, + "learning_rate": 0.00012271412680756397, + "loss": 2.8384, + "step": 1743 + }, + { + "epoch": 23.254180602006688, + "grad_norm": 0.5564932823181152, + "learning_rate": 0.00012266963292547275, + "loss": 2.4606, + "step": 1744 + }, + { + "epoch": 23.267558528428093, + "grad_norm": 0.6028578281402588, + "learning_rate": 0.00012262513904338153, + "loss": 2.824, + "step": 1745 + }, + { + "epoch": 23.2809364548495, + "grad_norm": 0.5562644600868225, + "learning_rate": 0.00012258064516129034, + "loss": 2.8195, + "step": 1746 + }, + { + "epoch": 23.294314381270905, + "grad_norm": 0.6838886141777039, + "learning_rate": 0.0001225361512791991, + "loss": 2.8453, + "step": 1747 + }, + { + "epoch": 23.307692307692307, + "grad_norm": 0.6102856993675232, + "learning_rate": 0.0001224916573971079, + "loss": 2.8331, + "step": 1748 + }, + { + "epoch": 23.321070234113712, + "grad_norm": 0.5895605087280273, + "learning_rate": 0.0001224471635150167, + "loss": 2.7479, + "step": 1749 + }, + { + "epoch": 23.334448160535118, + "grad_norm": 0.7318586111068726, + "learning_rate": 0.00012240266963292547, + "loss": 2.7201, + "step": 1750 + }, + { + "epoch": 23.347826086956523, + "grad_norm": 0.6560878157615662, + "learning_rate": 0.00012235817575083425, + "loss": 2.931, + "step": 1751 + }, + { + "epoch": 23.361204013377925, + "grad_norm": 0.5879004001617432, + "learning_rate": 0.00012231368186874306, + "loss": 2.8607, + "step": 1752 + }, + { + "epoch": 23.37458193979933, + "grad_norm": 0.5916743278503418, + "learning_rate": 0.00012226918798665184, + "loss": 2.6923, + "step": 1753 + }, + { + "epoch": 23.387959866220736, + "grad_norm": 0.5569814443588257, + "learning_rate": 0.00012222469410456063, + "loss": 2.634, + "step": 1754 + }, + { + "epoch": 23.401337792642142, + "grad_norm": 0.7285422682762146, + "learning_rate": 0.0001221802002224694, + "loss": 2.7135, + "step": 1755 + }, + { + "epoch": 23.414715719063544, + "grad_norm": 0.7323710918426514, + "learning_rate": 0.00012213570634037822, + "loss": 2.6855, + "step": 1756 + }, + { + "epoch": 23.42809364548495, + "grad_norm": 0.65497887134552, + "learning_rate": 0.00012209121245828697, + "loss": 2.8777, + "step": 1757 + }, + { + "epoch": 23.441471571906355, + "grad_norm": 0.5919074416160583, + "learning_rate": 0.00012204671857619578, + "loss": 2.657, + "step": 1758 + }, + { + "epoch": 23.45484949832776, + "grad_norm": 0.6749523282051086, + "learning_rate": 0.00012200222469410457, + "loss": 2.8142, + "step": 1759 + }, + { + "epoch": 23.468227424749163, + "grad_norm": 0.6859252452850342, + "learning_rate": 0.00012195773081201336, + "loss": 2.8572, + "step": 1760 + }, + { + "epoch": 23.48160535117057, + "grad_norm": 0.5956023931503296, + "learning_rate": 0.00012191323692992213, + "loss": 2.8385, + "step": 1761 + }, + { + "epoch": 23.494983277591974, + "grad_norm": 0.684101939201355, + "learning_rate": 0.00012186874304783094, + "loss": 2.4779, + "step": 1762 + }, + { + "epoch": 23.50836120401338, + "grad_norm": 0.726864755153656, + "learning_rate": 0.00012182424916573971, + "loss": 2.8523, + "step": 1763 + }, + { + "epoch": 23.52173913043478, + "grad_norm": 0.5874570608139038, + "learning_rate": 0.00012177975528364852, + "loss": 2.7866, + "step": 1764 + }, + { + "epoch": 23.535117056856187, + "grad_norm": 0.6077170968055725, + "learning_rate": 0.00012173526140155729, + "loss": 2.711, + "step": 1765 + }, + { + "epoch": 23.548494983277592, + "grad_norm": 0.6854214668273926, + "learning_rate": 0.00012169076751946608, + "loss": 2.655, + "step": 1766 + }, + { + "epoch": 23.561872909698998, + "grad_norm": 0.6409576535224915, + "learning_rate": 0.00012164627363737486, + "loss": 2.7309, + "step": 1767 + }, + { + "epoch": 23.5752508361204, + "grad_norm": 0.6241947412490845, + "learning_rate": 0.00012160177975528366, + "loss": 2.67, + "step": 1768 + }, + { + "epoch": 23.588628762541806, + "grad_norm": 0.5962851643562317, + "learning_rate": 0.00012155728587319244, + "loss": 2.6147, + "step": 1769 + }, + { + "epoch": 23.60200668896321, + "grad_norm": 0.6183756589889526, + "learning_rate": 0.00012151279199110124, + "loss": 2.9772, + "step": 1770 + }, + { + "epoch": 23.615384615384617, + "grad_norm": 0.6092197299003601, + "learning_rate": 0.00012146829810901001, + "loss": 2.7146, + "step": 1771 + }, + { + "epoch": 23.62876254180602, + "grad_norm": 0.6660104990005493, + "learning_rate": 0.00012142380422691882, + "loss": 2.6339, + "step": 1772 + }, + { + "epoch": 23.642140468227424, + "grad_norm": 0.9198834896087646, + "learning_rate": 0.00012137931034482759, + "loss": 2.7188, + "step": 1773 + }, + { + "epoch": 23.65551839464883, + "grad_norm": 0.7053755521774292, + "learning_rate": 0.0001213348164627364, + "loss": 2.7484, + "step": 1774 + }, + { + "epoch": 23.668896321070235, + "grad_norm": 0.7443736791610718, + "learning_rate": 0.00012129032258064516, + "loss": 2.6805, + "step": 1775 + }, + { + "epoch": 23.682274247491637, + "grad_norm": 0.579801082611084, + "learning_rate": 0.00012124582869855396, + "loss": 2.6467, + "step": 1776 + }, + { + "epoch": 23.695652173913043, + "grad_norm": 0.6014502644538879, + "learning_rate": 0.00012120133481646274, + "loss": 2.5816, + "step": 1777 + }, + { + "epoch": 23.70903010033445, + "grad_norm": 0.6318315863609314, + "learning_rate": 0.00012115684093437154, + "loss": 2.8137, + "step": 1778 + }, + { + "epoch": 23.722408026755854, + "grad_norm": 0.6172413229942322, + "learning_rate": 0.00012111234705228032, + "loss": 2.7789, + "step": 1779 + }, + { + "epoch": 23.735785953177256, + "grad_norm": 0.6184534430503845, + "learning_rate": 0.00012106785317018909, + "loss": 2.6802, + "step": 1780 + }, + { + "epoch": 23.74916387959866, + "grad_norm": 0.6380288600921631, + "learning_rate": 0.00012102335928809789, + "loss": 2.8755, + "step": 1781 + }, + { + "epoch": 23.762541806020067, + "grad_norm": 0.5941389799118042, + "learning_rate": 0.00012097886540600667, + "loss": 2.945, + "step": 1782 + }, + { + "epoch": 23.775919732441473, + "grad_norm": 0.6913108825683594, + "learning_rate": 0.00012093437152391546, + "loss": 2.7036, + "step": 1783 + }, + { + "epoch": 23.789297658862875, + "grad_norm": 0.563119113445282, + "learning_rate": 0.00012088987764182425, + "loss": 2.6608, + "step": 1784 + }, + { + "epoch": 23.80267558528428, + "grad_norm": 0.6387828588485718, + "learning_rate": 0.00012084538375973304, + "loss": 2.8933, + "step": 1785 + }, + { + "epoch": 23.816053511705686, + "grad_norm": 0.7530612945556641, + "learning_rate": 0.00012080088987764183, + "loss": 2.8399, + "step": 1786 + }, + { + "epoch": 23.82943143812709, + "grad_norm": 0.6401646733283997, + "learning_rate": 0.00012075639599555062, + "loss": 2.8757, + "step": 1787 + }, + { + "epoch": 23.842809364548494, + "grad_norm": 0.7403398752212524, + "learning_rate": 0.00012071190211345939, + "loss": 2.629, + "step": 1788 + }, + { + "epoch": 23.8561872909699, + "grad_norm": 0.6479887366294861, + "learning_rate": 0.0001206674082313682, + "loss": 2.9069, + "step": 1789 + }, + { + "epoch": 23.869565217391305, + "grad_norm": 0.588141679763794, + "learning_rate": 0.00012062291434927697, + "loss": 2.732, + "step": 1790 + }, + { + "epoch": 23.88294314381271, + "grad_norm": 0.6330631971359253, + "learning_rate": 0.00012057842046718576, + "loss": 2.9348, + "step": 1791 + }, + { + "epoch": 23.896321070234112, + "grad_norm": 0.6657344698905945, + "learning_rate": 0.00012053392658509455, + "loss": 3.0499, + "step": 1792 + }, + { + "epoch": 23.909698996655518, + "grad_norm": 0.6816025972366333, + "learning_rate": 0.00012048943270300334, + "loss": 2.9961, + "step": 1793 + }, + { + "epoch": 23.923076923076923, + "grad_norm": 0.6501593589782715, + "learning_rate": 0.00012044493882091212, + "loss": 2.8555, + "step": 1794 + }, + { + "epoch": 23.93645484949833, + "grad_norm": 0.6157960295677185, + "learning_rate": 0.00012040044493882092, + "loss": 2.8133, + "step": 1795 + }, + { + "epoch": 23.94983277591973, + "grad_norm": 0.5868191719055176, + "learning_rate": 0.0001203559510567297, + "loss": 2.7887, + "step": 1796 + }, + { + "epoch": 23.963210702341136, + "grad_norm": 0.6586911678314209, + "learning_rate": 0.0001203114571746385, + "loss": 3.0916, + "step": 1797 + }, + { + "epoch": 23.976588628762542, + "grad_norm": 0.6382162570953369, + "learning_rate": 0.00012026696329254727, + "loss": 2.7469, + "step": 1798 + }, + { + "epoch": 23.989966555183948, + "grad_norm": 0.6481708288192749, + "learning_rate": 0.00012022246941045608, + "loss": 2.9216, + "step": 1799 + }, + { + "epoch": 24.0, + "grad_norm": 0.7272862792015076, + "learning_rate": 0.00012017797552836485, + "loss": 2.686, + "step": 1800 + }, + { + "epoch": 24.013377926421406, + "grad_norm": 0.743366003036499, + "learning_rate": 0.00012013348164627364, + "loss": 2.6791, + "step": 1801 + }, + { + "epoch": 24.02675585284281, + "grad_norm": 0.6428125500679016, + "learning_rate": 0.00012008898776418242, + "loss": 2.782, + "step": 1802 + }, + { + "epoch": 24.040133779264213, + "grad_norm": 0.6210600137710571, + "learning_rate": 0.00012004449388209122, + "loss": 2.9352, + "step": 1803 + }, + { + "epoch": 24.05351170568562, + "grad_norm": 0.9589283466339111, + "learning_rate": 0.00012, + "loss": 2.7686, + "step": 1804 + }, + { + "epoch": 24.066889632107024, + "grad_norm": 0.5503745675086975, + "learning_rate": 0.0001199555061179088, + "loss": 2.9241, + "step": 1805 + }, + { + "epoch": 24.08026755852843, + "grad_norm": 0.6018356084823608, + "learning_rate": 0.00011991101223581758, + "loss": 2.7512, + "step": 1806 + }, + { + "epoch": 24.093645484949832, + "grad_norm": 0.7458929419517517, + "learning_rate": 0.00011986651835372638, + "loss": 2.5795, + "step": 1807 + }, + { + "epoch": 24.107023411371237, + "grad_norm": 0.705740213394165, + "learning_rate": 0.00011982202447163515, + "loss": 2.8537, + "step": 1808 + }, + { + "epoch": 24.120401337792643, + "grad_norm": 0.6299166679382324, + "learning_rate": 0.00011977753058954396, + "loss": 2.7815, + "step": 1809 + }, + { + "epoch": 24.13377926421405, + "grad_norm": 1.6271346807479858, + "learning_rate": 0.00011973303670745272, + "loss": 2.6373, + "step": 1810 + }, + { + "epoch": 24.14715719063545, + "grad_norm": 0.6286007165908813, + "learning_rate": 0.00011968854282536152, + "loss": 2.5587, + "step": 1811 + }, + { + "epoch": 24.160535117056856, + "grad_norm": 0.6259005069732666, + "learning_rate": 0.0001196440489432703, + "loss": 2.6414, + "step": 1812 + }, + { + "epoch": 24.17391304347826, + "grad_norm": 0.6058273911476135, + "learning_rate": 0.0001195995550611791, + "loss": 2.8571, + "step": 1813 + }, + { + "epoch": 24.187290969899667, + "grad_norm": 0.6144312620162964, + "learning_rate": 0.00011955506117908788, + "loss": 2.7777, + "step": 1814 + }, + { + "epoch": 24.20066889632107, + "grad_norm": 0.7302254438400269, + "learning_rate": 0.00011951056729699668, + "loss": 2.6298, + "step": 1815 + }, + { + "epoch": 24.214046822742475, + "grad_norm": 0.7253098487854004, + "learning_rate": 0.00011946607341490546, + "loss": 2.9066, + "step": 1816 + }, + { + "epoch": 24.22742474916388, + "grad_norm": 0.7629284858703613, + "learning_rate": 0.00011942157953281426, + "loss": 2.5391, + "step": 1817 + }, + { + "epoch": 24.240802675585286, + "grad_norm": 0.6672796607017517, + "learning_rate": 0.00011937708565072302, + "loss": 2.8014, + "step": 1818 + }, + { + "epoch": 24.254180602006688, + "grad_norm": 0.6049585938453674, + "learning_rate": 0.00011933259176863183, + "loss": 2.5539, + "step": 1819 + }, + { + "epoch": 24.267558528428093, + "grad_norm": 0.5817275047302246, + "learning_rate": 0.0001192880978865406, + "loss": 2.5771, + "step": 1820 + }, + { + "epoch": 24.2809364548495, + "grad_norm": 0.6785464882850647, + "learning_rate": 0.00011924360400444938, + "loss": 2.8043, + "step": 1821 + }, + { + "epoch": 24.294314381270905, + "grad_norm": 0.6705557107925415, + "learning_rate": 0.00011919911012235818, + "loss": 2.7126, + "step": 1822 + }, + { + "epoch": 24.307692307692307, + "grad_norm": 0.6267027854919434, + "learning_rate": 0.00011915461624026696, + "loss": 2.8739, + "step": 1823 + }, + { + "epoch": 24.321070234113712, + "grad_norm": 0.6952410340309143, + "learning_rate": 0.00011911012235817576, + "loss": 2.8402, + "step": 1824 + }, + { + "epoch": 24.334448160535118, + "grad_norm": 0.6892711520195007, + "learning_rate": 0.00011906562847608453, + "loss": 2.7314, + "step": 1825 + }, + { + "epoch": 24.347826086956523, + "grad_norm": 0.5816205739974976, + "learning_rate": 0.00011902113459399334, + "loss": 2.6115, + "step": 1826 + }, + { + "epoch": 24.361204013377925, + "grad_norm": 0.6549028158187866, + "learning_rate": 0.0001189766407119021, + "loss": 2.7503, + "step": 1827 + }, + { + "epoch": 24.37458193979933, + "grad_norm": 0.5902572870254517, + "learning_rate": 0.0001189321468298109, + "loss": 2.7549, + "step": 1828 + }, + { + "epoch": 24.387959866220736, + "grad_norm": 0.8412529230117798, + "learning_rate": 0.00011888765294771968, + "loss": 2.9654, + "step": 1829 + }, + { + "epoch": 24.401337792642142, + "grad_norm": 0.5785757303237915, + "learning_rate": 0.00011884315906562848, + "loss": 2.8101, + "step": 1830 + }, + { + "epoch": 24.414715719063544, + "grad_norm": 0.5835216045379639, + "learning_rate": 0.00011879866518353726, + "loss": 2.5368, + "step": 1831 + }, + { + "epoch": 24.42809364548495, + "grad_norm": 0.6589221954345703, + "learning_rate": 0.00011875417130144606, + "loss": 2.7917, + "step": 1832 + }, + { + "epoch": 24.441471571906355, + "grad_norm": 0.6370606422424316, + "learning_rate": 0.00011870967741935484, + "loss": 2.7514, + "step": 1833 + }, + { + "epoch": 24.45484949832776, + "grad_norm": 0.644980788230896, + "learning_rate": 0.00011866518353726364, + "loss": 2.6756, + "step": 1834 + }, + { + "epoch": 24.468227424749163, + "grad_norm": 0.7281529307365417, + "learning_rate": 0.0001186206896551724, + "loss": 2.5308, + "step": 1835 + }, + { + "epoch": 24.48160535117057, + "grad_norm": 0.5669592618942261, + "learning_rate": 0.00011857619577308122, + "loss": 2.7197, + "step": 1836 + }, + { + "epoch": 24.494983277591974, + "grad_norm": 0.7093492150306702, + "learning_rate": 0.00011853170189098998, + "loss": 2.9053, + "step": 1837 + }, + { + "epoch": 24.50836120401338, + "grad_norm": 0.5949118733406067, + "learning_rate": 0.00011848720800889878, + "loss": 2.8594, + "step": 1838 + }, + { + "epoch": 24.52173913043478, + "grad_norm": 0.5725157260894775, + "learning_rate": 0.00011844271412680756, + "loss": 2.5906, + "step": 1839 + }, + { + "epoch": 24.535117056856187, + "grad_norm": 0.7810790538787842, + "learning_rate": 0.00011839822024471636, + "loss": 2.7852, + "step": 1840 + }, + { + "epoch": 24.548494983277592, + "grad_norm": 0.6285178661346436, + "learning_rate": 0.00011835372636262514, + "loss": 2.7201, + "step": 1841 + }, + { + "epoch": 24.561872909698998, + "grad_norm": 0.6262674927711487, + "learning_rate": 0.00011830923248053394, + "loss": 2.7249, + "step": 1842 + }, + { + "epoch": 24.5752508361204, + "grad_norm": 0.5526009202003479, + "learning_rate": 0.00011826473859844272, + "loss": 2.6668, + "step": 1843 + }, + { + "epoch": 24.588628762541806, + "grad_norm": 0.6295923590660095, + "learning_rate": 0.00011822024471635152, + "loss": 2.8788, + "step": 1844 + }, + { + "epoch": 24.60200668896321, + "grad_norm": 0.6965957283973694, + "learning_rate": 0.00011817575083426028, + "loss": 2.8676, + "step": 1845 + }, + { + "epoch": 24.615384615384617, + "grad_norm": 0.6246476173400879, + "learning_rate": 0.0001181312569521691, + "loss": 2.8326, + "step": 1846 + }, + { + "epoch": 24.62876254180602, + "grad_norm": 0.6312415599822998, + "learning_rate": 0.00011808676307007786, + "loss": 2.5748, + "step": 1847 + }, + { + "epoch": 24.642140468227424, + "grad_norm": 0.7262862920761108, + "learning_rate": 0.00011804226918798666, + "loss": 2.922, + "step": 1848 + }, + { + "epoch": 24.65551839464883, + "grad_norm": 0.6640278697013855, + "learning_rate": 0.00011799777530589544, + "loss": 2.7494, + "step": 1849 + }, + { + "epoch": 24.668896321070235, + "grad_norm": 0.6154195070266724, + "learning_rate": 0.00011795328142380424, + "loss": 2.6168, + "step": 1850 + }, + { + "epoch": 24.682274247491637, + "grad_norm": 0.6020949482917786, + "learning_rate": 0.00011790878754171302, + "loss": 2.7566, + "step": 1851 + }, + { + "epoch": 24.695652173913043, + "grad_norm": 0.5775970816612244, + "learning_rate": 0.00011786429365962182, + "loss": 2.6862, + "step": 1852 + }, + { + "epoch": 24.70903010033445, + "grad_norm": 0.6687362194061279, + "learning_rate": 0.0001178197997775306, + "loss": 2.554, + "step": 1853 + }, + { + "epoch": 24.722408026755854, + "grad_norm": 0.6184269785881042, + "learning_rate": 0.0001177753058954394, + "loss": 2.9353, + "step": 1854 + }, + { + "epoch": 24.735785953177256, + "grad_norm": 0.6340799331665039, + "learning_rate": 0.00011773081201334816, + "loss": 2.857, + "step": 1855 + }, + { + "epoch": 24.74916387959866, + "grad_norm": 0.8216782808303833, + "learning_rate": 0.00011768631813125697, + "loss": 2.6295, + "step": 1856 + }, + { + "epoch": 24.762541806020067, + "grad_norm": 0.5764971375465393, + "learning_rate": 0.00011764182424916574, + "loss": 2.8431, + "step": 1857 + }, + { + "epoch": 24.775919732441473, + "grad_norm": 0.6233363747596741, + "learning_rate": 0.00011759733036707454, + "loss": 2.6867, + "step": 1858 + }, + { + "epoch": 24.789297658862875, + "grad_norm": 0.6669492721557617, + "learning_rate": 0.00011755283648498332, + "loss": 2.5405, + "step": 1859 + }, + { + "epoch": 24.80267558528428, + "grad_norm": 0.6156982183456421, + "learning_rate": 0.00011750834260289212, + "loss": 2.8071, + "step": 1860 + }, + { + "epoch": 24.816053511705686, + "grad_norm": 0.6210219860076904, + "learning_rate": 0.0001174638487208009, + "loss": 2.8308, + "step": 1861 + }, + { + "epoch": 24.82943143812709, + "grad_norm": 0.6786196231842041, + "learning_rate": 0.00011741935483870967, + "loss": 2.9039, + "step": 1862 + }, + { + "epoch": 24.842809364548494, + "grad_norm": 0.6467169523239136, + "learning_rate": 0.00011737486095661848, + "loss": 2.6601, + "step": 1863 + }, + { + "epoch": 24.8561872909699, + "grad_norm": 0.6734811663627625, + "learning_rate": 0.00011733036707452724, + "loss": 2.8614, + "step": 1864 + }, + { + "epoch": 24.869565217391305, + "grad_norm": 0.6223945021629333, + "learning_rate": 0.00011728587319243604, + "loss": 2.9397, + "step": 1865 + }, + { + "epoch": 24.88294314381271, + "grad_norm": 0.6042306423187256, + "learning_rate": 0.00011724137931034482, + "loss": 2.8984, + "step": 1866 + }, + { + "epoch": 24.896321070234112, + "grad_norm": 0.945598840713501, + "learning_rate": 0.00011719688542825362, + "loss": 2.6882, + "step": 1867 + }, + { + "epoch": 24.909698996655518, + "grad_norm": 0.6739409565925598, + "learning_rate": 0.0001171523915461624, + "loss": 2.6285, + "step": 1868 + }, + { + "epoch": 24.923076923076923, + "grad_norm": 0.6158167719841003, + "learning_rate": 0.0001171078976640712, + "loss": 2.8019, + "step": 1869 + }, + { + "epoch": 24.93645484949833, + "grad_norm": 0.5984783172607422, + "learning_rate": 0.00011706340378197998, + "loss": 2.785, + "step": 1870 + }, + { + "epoch": 24.94983277591973, + "grad_norm": 0.594236433506012, + "learning_rate": 0.00011701890989988878, + "loss": 3.0126, + "step": 1871 + }, + { + "epoch": 24.963210702341136, + "grad_norm": 0.6268919706344604, + "learning_rate": 0.00011697441601779754, + "loss": 2.7908, + "step": 1872 + }, + { + "epoch": 24.976588628762542, + "grad_norm": 0.6119821071624756, + "learning_rate": 0.00011692992213570635, + "loss": 2.5886, + "step": 1873 + }, + { + "epoch": 24.989966555183948, + "grad_norm": 0.8935804963111877, + "learning_rate": 0.00011688542825361512, + "loss": 2.9702, + "step": 1874 + }, + { + "epoch": 25.0, + "grad_norm": 0.6583001613616943, + "learning_rate": 0.00011684093437152392, + "loss": 2.7691, + "step": 1875 + }, + { + "epoch": 25.013377926421406, + "grad_norm": 0.5952125787734985, + "learning_rate": 0.0001167964404894327, + "loss": 2.6709, + "step": 1876 + }, + { + "epoch": 25.02675585284281, + "grad_norm": 0.6740133762359619, + "learning_rate": 0.0001167519466073415, + "loss": 2.6548, + "step": 1877 + }, + { + "epoch": 25.040133779264213, + "grad_norm": 0.6665695309638977, + "learning_rate": 0.00011670745272525028, + "loss": 2.6141, + "step": 1878 + }, + { + "epoch": 25.05351170568562, + "grad_norm": 0.698957085609436, + "learning_rate": 0.00011666295884315908, + "loss": 2.7752, + "step": 1879 + }, + { + "epoch": 25.066889632107024, + "grad_norm": 0.7277541160583496, + "learning_rate": 0.00011661846496106786, + "loss": 2.8876, + "step": 1880 + }, + { + "epoch": 25.08026755852843, + "grad_norm": 0.5987577438354492, + "learning_rate": 0.00011657397107897665, + "loss": 2.5601, + "step": 1881 + }, + { + "epoch": 25.093645484949832, + "grad_norm": 0.5897870063781738, + "learning_rate": 0.00011652947719688542, + "loss": 2.5854, + "step": 1882 + }, + { + "epoch": 25.107023411371237, + "grad_norm": 0.573273241519928, + "learning_rate": 0.00011648498331479423, + "loss": 2.7141, + "step": 1883 + }, + { + "epoch": 25.120401337792643, + "grad_norm": 0.6173187494277954, + "learning_rate": 0.000116440489432703, + "loss": 2.6697, + "step": 1884 + }, + { + "epoch": 25.13377926421405, + "grad_norm": 0.5728760361671448, + "learning_rate": 0.0001163959955506118, + "loss": 2.7058, + "step": 1885 + }, + { + "epoch": 25.14715719063545, + "grad_norm": 0.543645441532135, + "learning_rate": 0.00011635150166852058, + "loss": 2.441, + "step": 1886 + }, + { + "epoch": 25.160535117056856, + "grad_norm": 0.6582958698272705, + "learning_rate": 0.00011630700778642938, + "loss": 2.8462, + "step": 1887 + }, + { + "epoch": 25.17391304347826, + "grad_norm": 0.6733880043029785, + "learning_rate": 0.00011626251390433816, + "loss": 2.7294, + "step": 1888 + }, + { + "epoch": 25.187290969899667, + "grad_norm": 0.8249802589416504, + "learning_rate": 0.00011621802002224695, + "loss": 2.7738, + "step": 1889 + }, + { + "epoch": 25.20066889632107, + "grad_norm": 0.6004601120948792, + "learning_rate": 0.00011617352614015574, + "loss": 2.6992, + "step": 1890 + }, + { + "epoch": 25.214046822742475, + "grad_norm": 0.6188962459564209, + "learning_rate": 0.00011612903225806453, + "loss": 2.5715, + "step": 1891 + }, + { + "epoch": 25.22742474916388, + "grad_norm": 0.6831389665603638, + "learning_rate": 0.0001160845383759733, + "loss": 2.8777, + "step": 1892 + }, + { + "epoch": 25.240802675585286, + "grad_norm": 0.5578925013542175, + "learning_rate": 0.00011604004449388211, + "loss": 2.4875, + "step": 1893 + }, + { + "epoch": 25.254180602006688, + "grad_norm": 0.5995929837226868, + "learning_rate": 0.00011599555061179088, + "loss": 2.5166, + "step": 1894 + }, + { + "epoch": 25.267558528428093, + "grad_norm": 0.6817669868469238, + "learning_rate": 0.00011595105672969967, + "loss": 2.7628, + "step": 1895 + }, + { + "epoch": 25.2809364548495, + "grad_norm": 0.671890139579773, + "learning_rate": 0.00011590656284760846, + "loss": 2.5406, + "step": 1896 + }, + { + "epoch": 25.294314381270905, + "grad_norm": 0.6167321801185608, + "learning_rate": 0.00011586206896551725, + "loss": 2.8789, + "step": 1897 + }, + { + "epoch": 25.307692307692307, + "grad_norm": 0.6145638227462769, + "learning_rate": 0.00011581757508342604, + "loss": 2.7629, + "step": 1898 + }, + { + "epoch": 25.321070234113712, + "grad_norm": 0.6117168068885803, + "learning_rate": 0.00011577308120133483, + "loss": 2.7048, + "step": 1899 + }, + { + "epoch": 25.334448160535118, + "grad_norm": 0.6903496384620667, + "learning_rate": 0.00011572858731924361, + "loss": 2.9096, + "step": 1900 + }, + { + "epoch": 25.347826086956523, + "grad_norm": 0.5942792296409607, + "learning_rate": 0.00011568409343715241, + "loss": 2.4525, + "step": 1901 + }, + { + "epoch": 25.361204013377925, + "grad_norm": 0.5714770555496216, + "learning_rate": 0.00011563959955506118, + "loss": 2.6605, + "step": 1902 + }, + { + "epoch": 25.37458193979933, + "grad_norm": 1.0252799987792969, + "learning_rate": 0.00011559510567296996, + "loss": 2.4599, + "step": 1903 + }, + { + "epoch": 25.387959866220736, + "grad_norm": 0.6459619998931885, + "learning_rate": 0.00011555061179087876, + "loss": 2.657, + "step": 1904 + }, + { + "epoch": 25.401337792642142, + "grad_norm": 0.6513925790786743, + "learning_rate": 0.00011550611790878754, + "loss": 2.6491, + "step": 1905 + }, + { + "epoch": 25.414715719063544, + "grad_norm": 0.838390588760376, + "learning_rate": 0.00011546162402669634, + "loss": 2.8909, + "step": 1906 + }, + { + "epoch": 25.42809364548495, + "grad_norm": 0.6873288750648499, + "learning_rate": 0.00011541713014460512, + "loss": 2.9252, + "step": 1907 + }, + { + "epoch": 25.441471571906355, + "grad_norm": 0.6325615644454956, + "learning_rate": 0.00011537263626251391, + "loss": 2.8784, + "step": 1908 + }, + { + "epoch": 25.45484949832776, + "grad_norm": 0.7683016061782837, + "learning_rate": 0.00011532814238042268, + "loss": 2.7951, + "step": 1909 + }, + { + "epoch": 25.468227424749163, + "grad_norm": 0.6581037044525146, + "learning_rate": 0.00011528364849833149, + "loss": 2.7674, + "step": 1910 + }, + { + "epoch": 25.48160535117057, + "grad_norm": 0.6761939525604248, + "learning_rate": 0.00011523915461624026, + "loss": 2.7315, + "step": 1911 + }, + { + "epoch": 25.494983277591974, + "grad_norm": 0.621744692325592, + "learning_rate": 0.00011519466073414906, + "loss": 2.6977, + "step": 1912 + }, + { + "epoch": 25.50836120401338, + "grad_norm": 0.5744712352752686, + "learning_rate": 0.00011515016685205784, + "loss": 2.6568, + "step": 1913 + }, + { + "epoch": 25.52173913043478, + "grad_norm": 0.6587238907814026, + "learning_rate": 0.00011510567296996664, + "loss": 3.0222, + "step": 1914 + }, + { + "epoch": 25.535117056856187, + "grad_norm": 0.6338929533958435, + "learning_rate": 0.00011506117908787542, + "loss": 2.4611, + "step": 1915 + }, + { + "epoch": 25.548494983277592, + "grad_norm": 0.6468850374221802, + "learning_rate": 0.00011501668520578421, + "loss": 2.5532, + "step": 1916 + }, + { + "epoch": 25.561872909698998, + "grad_norm": 0.6900503635406494, + "learning_rate": 0.000114972191323693, + "loss": 2.7291, + "step": 1917 + }, + { + "epoch": 25.5752508361204, + "grad_norm": 0.9081000089645386, + "learning_rate": 0.00011492769744160179, + "loss": 2.528, + "step": 1918 + }, + { + "epoch": 25.588628762541806, + "grad_norm": 0.6108705401420593, + "learning_rate": 0.00011488320355951056, + "loss": 2.7942, + "step": 1919 + }, + { + "epoch": 25.60200668896321, + "grad_norm": 0.5745365023612976, + "learning_rate": 0.00011483870967741937, + "loss": 2.7273, + "step": 1920 + }, + { + "epoch": 25.615384615384617, + "grad_norm": 0.7215850949287415, + "learning_rate": 0.00011479421579532814, + "loss": 2.85, + "step": 1921 + }, + { + "epoch": 25.62876254180602, + "grad_norm": 0.6535587310791016, + "learning_rate": 0.00011474972191323693, + "loss": 2.7488, + "step": 1922 + }, + { + "epoch": 25.642140468227424, + "grad_norm": 0.6285836696624756, + "learning_rate": 0.00011470522803114572, + "loss": 2.6874, + "step": 1923 + }, + { + "epoch": 25.65551839464883, + "grad_norm": 0.652277410030365, + "learning_rate": 0.00011466073414905451, + "loss": 2.889, + "step": 1924 + }, + { + "epoch": 25.668896321070235, + "grad_norm": 0.6526362299919128, + "learning_rate": 0.0001146162402669633, + "loss": 2.8342, + "step": 1925 + }, + { + "epoch": 25.682274247491637, + "grad_norm": 0.6387808918952942, + "learning_rate": 0.00011457174638487209, + "loss": 2.6713, + "step": 1926 + }, + { + "epoch": 25.695652173913043, + "grad_norm": 0.5775367617607117, + "learning_rate": 0.00011452725250278087, + "loss": 2.6848, + "step": 1927 + }, + { + "epoch": 25.70903010033445, + "grad_norm": 0.7041560411453247, + "learning_rate": 0.00011448275862068967, + "loss": 2.7532, + "step": 1928 + }, + { + "epoch": 25.722408026755854, + "grad_norm": 0.6833249926567078, + "learning_rate": 0.00011443826473859844, + "loss": 2.8257, + "step": 1929 + }, + { + "epoch": 25.735785953177256, + "grad_norm": 0.6916714310646057, + "learning_rate": 0.00011439377085650725, + "loss": 2.5491, + "step": 1930 + }, + { + "epoch": 25.74916387959866, + "grad_norm": 0.8915151357650757, + "learning_rate": 0.00011434927697441602, + "loss": 2.8361, + "step": 1931 + }, + { + "epoch": 25.762541806020067, + "grad_norm": 0.6278401613235474, + "learning_rate": 0.00011430478309232481, + "loss": 2.8022, + "step": 1932 + }, + { + "epoch": 25.775919732441473, + "grad_norm": 0.6929000020027161, + "learning_rate": 0.0001142602892102336, + "loss": 2.8938, + "step": 1933 + }, + { + "epoch": 25.789297658862875, + "grad_norm": 0.6363662481307983, + "learning_rate": 0.00011421579532814239, + "loss": 2.8736, + "step": 1934 + }, + { + "epoch": 25.80267558528428, + "grad_norm": 0.6596313118934631, + "learning_rate": 0.00011417130144605117, + "loss": 2.9864, + "step": 1935 + }, + { + "epoch": 25.816053511705686, + "grad_norm": 0.9694854617118835, + "learning_rate": 0.00011412680756395997, + "loss": 2.7362, + "step": 1936 + }, + { + "epoch": 25.82943143812709, + "grad_norm": 0.721339225769043, + "learning_rate": 0.00011408231368186875, + "loss": 2.9919, + "step": 1937 + }, + { + "epoch": 25.842809364548494, + "grad_norm": 0.6164969205856323, + "learning_rate": 0.00011403781979977755, + "loss": 3.0446, + "step": 1938 + }, + { + "epoch": 25.8561872909699, + "grad_norm": 0.8240790367126465, + "learning_rate": 0.00011399332591768632, + "loss": 2.7113, + "step": 1939 + }, + { + "epoch": 25.869565217391305, + "grad_norm": 0.865219235420227, + "learning_rate": 0.00011394883203559513, + "loss": 2.6692, + "step": 1940 + }, + { + "epoch": 25.88294314381271, + "grad_norm": 0.6014530062675476, + "learning_rate": 0.0001139043381535039, + "loss": 2.7527, + "step": 1941 + }, + { + "epoch": 25.896321070234112, + "grad_norm": 0.6110913157463074, + "learning_rate": 0.00011385984427141269, + "loss": 2.6213, + "step": 1942 + }, + { + "epoch": 25.909698996655518, + "grad_norm": 0.6967377662658691, + "learning_rate": 0.00011381535038932147, + "loss": 3.0013, + "step": 1943 + }, + { + "epoch": 25.923076923076923, + "grad_norm": 0.6235397458076477, + "learning_rate": 0.00011377085650723027, + "loss": 2.8256, + "step": 1944 + }, + { + "epoch": 25.93645484949833, + "grad_norm": 0.6463499665260315, + "learning_rate": 0.00011372636262513905, + "loss": 2.6573, + "step": 1945 + }, + { + "epoch": 25.94983277591973, + "grad_norm": 0.7898368239402771, + "learning_rate": 0.00011368186874304782, + "loss": 2.6484, + "step": 1946 + }, + { + "epoch": 25.963210702341136, + "grad_norm": 0.6130072474479675, + "learning_rate": 0.00011363737486095663, + "loss": 2.6912, + "step": 1947 + }, + { + "epoch": 25.976588628762542, + "grad_norm": 0.6541060209274292, + "learning_rate": 0.0001135928809788654, + "loss": 2.789, + "step": 1948 + }, + { + "epoch": 25.989966555183948, + "grad_norm": 0.6722179651260376, + "learning_rate": 0.0001135483870967742, + "loss": 2.7197, + "step": 1949 + }, + { + "epoch": 26.0, + "grad_norm": 0.6884087920188904, + "learning_rate": 0.00011350389321468298, + "loss": 2.83, + "step": 1950 + }, + { + "epoch": 26.013377926421406, + "grad_norm": 0.5991981029510498, + "learning_rate": 0.00011345939933259177, + "loss": 2.8456, + "step": 1951 + }, + { + "epoch": 26.02675585284281, + "grad_norm": 0.660169243812561, + "learning_rate": 0.00011341490545050056, + "loss": 2.8782, + "step": 1952 + }, + { + "epoch": 26.040133779264213, + "grad_norm": 0.651779294013977, + "learning_rate": 0.00011337041156840935, + "loss": 2.7286, + "step": 1953 + }, + { + "epoch": 26.05351170568562, + "grad_norm": 0.6962876915931702, + "learning_rate": 0.00011332591768631813, + "loss": 2.5009, + "step": 1954 + }, + { + "epoch": 26.066889632107024, + "grad_norm": 0.6752526164054871, + "learning_rate": 0.00011328142380422693, + "loss": 2.6137, + "step": 1955 + }, + { + "epoch": 26.08026755852843, + "grad_norm": 0.6335856914520264, + "learning_rate": 0.0001132369299221357, + "loss": 2.4744, + "step": 1956 + }, + { + "epoch": 26.093645484949832, + "grad_norm": 0.6627519726753235, + "learning_rate": 0.00011319243604004451, + "loss": 2.9633, + "step": 1957 + }, + { + "epoch": 26.107023411371237, + "grad_norm": 0.5948102474212646, + "learning_rate": 0.00011314794215795328, + "loss": 2.6333, + "step": 1958 + }, + { + "epoch": 26.120401337792643, + "grad_norm": 0.5609325766563416, + "learning_rate": 0.00011310344827586207, + "loss": 2.5427, + "step": 1959 + }, + { + "epoch": 26.13377926421405, + "grad_norm": 0.6280467510223389, + "learning_rate": 0.00011305895439377086, + "loss": 2.6838, + "step": 1960 + }, + { + "epoch": 26.14715719063545, + "grad_norm": 0.5900541543960571, + "learning_rate": 0.00011301446051167965, + "loss": 2.8057, + "step": 1961 + }, + { + "epoch": 26.160535117056856, + "grad_norm": 0.6185488700866699, + "learning_rate": 0.00011296996662958843, + "loss": 2.7778, + "step": 1962 + }, + { + "epoch": 26.17391304347826, + "grad_norm": 0.6149762272834778, + "learning_rate": 0.00011292547274749723, + "loss": 2.7384, + "step": 1963 + }, + { + "epoch": 26.187290969899667, + "grad_norm": 0.5975783467292786, + "learning_rate": 0.00011288097886540601, + "loss": 2.6957, + "step": 1964 + }, + { + "epoch": 26.20066889632107, + "grad_norm": 0.7946596741676331, + "learning_rate": 0.00011283648498331481, + "loss": 2.6071, + "step": 1965 + }, + { + "epoch": 26.214046822742475, + "grad_norm": 0.5987370014190674, + "learning_rate": 0.00011279199110122358, + "loss": 2.5812, + "step": 1966 + }, + { + "epoch": 26.22742474916388, + "grad_norm": 0.7613934278488159, + "learning_rate": 0.00011274749721913239, + "loss": 2.7886, + "step": 1967 + }, + { + "epoch": 26.240802675585286, + "grad_norm": 0.6035755276679993, + "learning_rate": 0.00011270300333704116, + "loss": 2.4709, + "step": 1968 + }, + { + "epoch": 26.254180602006688, + "grad_norm": 0.9545117616653442, + "learning_rate": 0.00011265850945494995, + "loss": 2.6039, + "step": 1969 + }, + { + "epoch": 26.267558528428093, + "grad_norm": 0.5982191562652588, + "learning_rate": 0.00011261401557285873, + "loss": 2.664, + "step": 1970 + }, + { + "epoch": 26.2809364548495, + "grad_norm": 0.6070300340652466, + "learning_rate": 0.00011256952169076753, + "loss": 2.671, + "step": 1971 + }, + { + "epoch": 26.294314381270905, + "grad_norm": 0.6093387603759766, + "learning_rate": 0.00011252502780867631, + "loss": 2.6441, + "step": 1972 + }, + { + "epoch": 26.307692307692307, + "grad_norm": 0.6499563455581665, + "learning_rate": 0.00011248053392658511, + "loss": 2.6851, + "step": 1973 + }, + { + "epoch": 26.321070234113712, + "grad_norm": 0.6848227977752686, + "learning_rate": 0.00011243604004449389, + "loss": 2.6448, + "step": 1974 + }, + { + "epoch": 26.334448160535118, + "grad_norm": 0.6483498811721802, + "learning_rate": 0.00011239154616240269, + "loss": 2.5361, + "step": 1975 + }, + { + "epoch": 26.347826086956523, + "grad_norm": 0.6412089467048645, + "learning_rate": 0.00011234705228031146, + "loss": 2.6625, + "step": 1976 + }, + { + "epoch": 26.361204013377925, + "grad_norm": 0.6891600489616394, + "learning_rate": 0.00011230255839822026, + "loss": 2.6901, + "step": 1977 + }, + { + "epoch": 26.37458193979933, + "grad_norm": 0.606422483921051, + "learning_rate": 0.00011225806451612903, + "loss": 2.7778, + "step": 1978 + }, + { + "epoch": 26.387959866220736, + "grad_norm": 0.6754788756370544, + "learning_rate": 0.00011221357063403783, + "loss": 2.7757, + "step": 1979 + }, + { + "epoch": 26.401337792642142, + "grad_norm": 0.6534073352813721, + "learning_rate": 0.00011216907675194661, + "loss": 2.7682, + "step": 1980 + }, + { + "epoch": 26.414715719063544, + "grad_norm": 0.6333116888999939, + "learning_rate": 0.00011212458286985541, + "loss": 2.739, + "step": 1981 + }, + { + "epoch": 26.42809364548495, + "grad_norm": 0.6784356236457825, + "learning_rate": 0.00011208008898776419, + "loss": 2.499, + "step": 1982 + }, + { + "epoch": 26.441471571906355, + "grad_norm": 0.6247740983963013, + "learning_rate": 0.00011203559510567299, + "loss": 2.8451, + "step": 1983 + }, + { + "epoch": 26.45484949832776, + "grad_norm": 0.6405763030052185, + "learning_rate": 0.00011199110122358177, + "loss": 2.7237, + "step": 1984 + }, + { + "epoch": 26.468227424749163, + "grad_norm": 0.7398512959480286, + "learning_rate": 0.00011194660734149056, + "loss": 2.6931, + "step": 1985 + }, + { + "epoch": 26.48160535117057, + "grad_norm": 0.6067454814910889, + "learning_rate": 0.00011190211345939933, + "loss": 2.8153, + "step": 1986 + }, + { + "epoch": 26.494983277591974, + "grad_norm": 0.5860223770141602, + "learning_rate": 0.00011185761957730812, + "loss": 2.9522, + "step": 1987 + }, + { + "epoch": 26.50836120401338, + "grad_norm": 0.6325815320014954, + "learning_rate": 0.00011181312569521691, + "loss": 2.7294, + "step": 1988 + }, + { + "epoch": 26.52173913043478, + "grad_norm": 0.6049959063529968, + "learning_rate": 0.0001117686318131257, + "loss": 2.7989, + "step": 1989 + }, + { + "epoch": 26.535117056856187, + "grad_norm": 0.5589177012443542, + "learning_rate": 0.00011172413793103449, + "loss": 2.4431, + "step": 1990 + }, + { + "epoch": 26.548494983277592, + "grad_norm": 0.6467594504356384, + "learning_rate": 0.00011167964404894327, + "loss": 2.7166, + "step": 1991 + }, + { + "epoch": 26.561872909698998, + "grad_norm": 0.7082380056381226, + "learning_rate": 0.00011163515016685207, + "loss": 2.7768, + "step": 1992 + }, + { + "epoch": 26.5752508361204, + "grad_norm": 0.7438964247703552, + "learning_rate": 0.00011159065628476084, + "loss": 2.8655, + "step": 1993 + }, + { + "epoch": 26.588628762541806, + "grad_norm": 0.7755714654922485, + "learning_rate": 0.00011154616240266965, + "loss": 2.6956, + "step": 1994 + }, + { + "epoch": 26.60200668896321, + "grad_norm": 0.576988160610199, + "learning_rate": 0.00011150166852057842, + "loss": 2.5831, + "step": 1995 + }, + { + "epoch": 26.615384615384617, + "grad_norm": 0.6044856309890747, + "learning_rate": 0.00011145717463848721, + "loss": 2.6834, + "step": 1996 + }, + { + "epoch": 26.62876254180602, + "grad_norm": 0.5846719741821289, + "learning_rate": 0.000111412680756396, + "loss": 2.5553, + "step": 1997 + }, + { + "epoch": 26.642140468227424, + "grad_norm": 0.6511263251304626, + "learning_rate": 0.00011136818687430479, + "loss": 2.5404, + "step": 1998 + }, + { + "epoch": 26.65551839464883, + "grad_norm": 0.6784057021141052, + "learning_rate": 0.00011132369299221357, + "loss": 2.5896, + "step": 1999 + }, + { + "epoch": 26.668896321070235, + "grad_norm": 0.7156029939651489, + "learning_rate": 0.00011127919911012237, + "loss": 2.6683, + "step": 2000 + }, + { + "epoch": 26.682274247491637, + "grad_norm": 1.5313527584075928, + "learning_rate": 0.00011123470522803115, + "loss": 3.1002, + "step": 2001 + }, + { + "epoch": 26.695652173913043, + "grad_norm": 1.570999264717102, + "learning_rate": 0.00011119021134593995, + "loss": 3.0022, + "step": 2002 + }, + { + "epoch": 26.70903010033445, + "grad_norm": 1.672279953956604, + "learning_rate": 0.00011114571746384872, + "loss": 2.9443, + "step": 2003 + }, + { + "epoch": 26.722408026755854, + "grad_norm": 1.3200056552886963, + "learning_rate": 0.00011110122358175752, + "loss": 3.3101, + "step": 2004 + }, + { + "epoch": 26.735785953177256, + "grad_norm": 1.9621714353561401, + "learning_rate": 0.0001110567296996663, + "loss": 3.0294, + "step": 2005 + }, + { + "epoch": 26.74916387959866, + "grad_norm": 1.5567591190338135, + "learning_rate": 0.00011101223581757509, + "loss": 3.1361, + "step": 2006 + }, + { + "epoch": 26.762541806020067, + "grad_norm": 1.6681092977523804, + "learning_rate": 0.00011096774193548387, + "loss": 3.2709, + "step": 2007 + }, + { + "epoch": 26.775919732441473, + "grad_norm": 1.461212158203125, + "learning_rate": 0.00011092324805339267, + "loss": 3.0585, + "step": 2008 + }, + { + "epoch": 26.789297658862875, + "grad_norm": 1.4997388124465942, + "learning_rate": 0.00011087875417130145, + "loss": 3.0182, + "step": 2009 + }, + { + "epoch": 26.80267558528428, + "grad_norm": 1.3930811882019043, + "learning_rate": 0.00011083426028921025, + "loss": 2.9796, + "step": 2010 + }, + { + "epoch": 26.816053511705686, + "grad_norm": 1.3426967859268188, + "learning_rate": 0.00011078976640711903, + "loss": 3.2029, + "step": 2011 + }, + { + "epoch": 26.82943143812709, + "grad_norm": 1.7054353952407837, + "learning_rate": 0.00011074527252502782, + "loss": 3.0889, + "step": 2012 + }, + { + "epoch": 26.842809364548494, + "grad_norm": 1.6260415315628052, + "learning_rate": 0.00011070077864293659, + "loss": 3.2461, + "step": 2013 + }, + { + "epoch": 26.8561872909699, + "grad_norm": 1.7708278894424438, + "learning_rate": 0.0001106562847608454, + "loss": 3.1737, + "step": 2014 + }, + { + "epoch": 26.869565217391305, + "grad_norm": 1.4107633829116821, + "learning_rate": 0.00011061179087875417, + "loss": 3.0006, + "step": 2015 + }, + { + "epoch": 26.88294314381271, + "grad_norm": 1.929655909538269, + "learning_rate": 0.00011056729699666297, + "loss": 3.0805, + "step": 2016 + }, + { + "epoch": 26.896321070234112, + "grad_norm": 1.835286021232605, + "learning_rate": 0.00011052280311457175, + "loss": 3.0995, + "step": 2017 + }, + { + "epoch": 26.909698996655518, + "grad_norm": 1.9293112754821777, + "learning_rate": 0.00011047830923248055, + "loss": 3.1714, + "step": 2018 + }, + { + "epoch": 26.923076923076923, + "grad_norm": 1.5640552043914795, + "learning_rate": 0.00011043381535038933, + "loss": 3.1381, + "step": 2019 + }, + { + "epoch": 26.93645484949833, + "grad_norm": 1.7890485525131226, + "learning_rate": 0.00011038932146829812, + "loss": 3.0785, + "step": 2020 + }, + { + "epoch": 26.94983277591973, + "grad_norm": 1.3167777061462402, + "learning_rate": 0.0001103448275862069, + "loss": 3.2041, + "step": 2021 + }, + { + "epoch": 26.963210702341136, + "grad_norm": 1.3006356954574585, + "learning_rate": 0.0001103003337041157, + "loss": 2.6744, + "step": 2022 + }, + { + "epoch": 26.976588628762542, + "grad_norm": 1.3111026287078857, + "learning_rate": 0.00011025583982202447, + "loss": 2.9279, + "step": 2023 + }, + { + "epoch": 26.989966555183948, + "grad_norm": 1.9163484573364258, + "learning_rate": 0.00011021134593993328, + "loss": 3.0983, + "step": 2024 + }, + { + "epoch": 27.013377926421406, + "grad_norm": 2.121852159500122, + "learning_rate": 0.00011016685205784205, + "loss": 6.1272, + "step": 2025 + }, + { + "epoch": 27.02675585284281, + "grad_norm": 1.2503304481506348, + "learning_rate": 0.00011012235817575085, + "loss": 2.8418, + "step": 2026 + }, + { + "epoch": 27.040133779264213, + "grad_norm": 1.2677263021469116, + "learning_rate": 0.00011007786429365963, + "loss": 3.166, + "step": 2027 + }, + { + "epoch": 27.05351170568562, + "grad_norm": 1.1323914527893066, + "learning_rate": 0.00011003337041156841, + "loss": 2.881, + "step": 2028 + }, + { + "epoch": 27.066889632107024, + "grad_norm": 1.32306969165802, + "learning_rate": 0.0001099888765294772, + "loss": 3.1736, + "step": 2029 + }, + { + "epoch": 27.08026755852843, + "grad_norm": 1.1854408979415894, + "learning_rate": 0.00010994438264738598, + "loss": 3.0644, + "step": 2030 + }, + { + "epoch": 27.093645484949832, + "grad_norm": 1.3617606163024902, + "learning_rate": 0.00010989988876529478, + "loss": 3.0845, + "step": 2031 + }, + { + "epoch": 27.107023411371237, + "grad_norm": 1.3771755695343018, + "learning_rate": 0.00010985539488320355, + "loss": 3.215, + "step": 2032 + }, + { + "epoch": 27.120401337792643, + "grad_norm": 1.4441111087799072, + "learning_rate": 0.00010981090100111235, + "loss": 3.1639, + "step": 2033 + }, + { + "epoch": 27.13377926421405, + "grad_norm": 1.4004138708114624, + "learning_rate": 0.00010976640711902113, + "loss": 3.2066, + "step": 2034 + }, + { + "epoch": 27.14715719063545, + "grad_norm": 1.3546315431594849, + "learning_rate": 0.00010972191323692993, + "loss": 2.8154, + "step": 2035 + }, + { + "epoch": 27.160535117056856, + "grad_norm": 1.122201681137085, + "learning_rate": 0.00010967741935483871, + "loss": 2.8852, + "step": 2036 + }, + { + "epoch": 27.17391304347826, + "grad_norm": 1.4592616558074951, + "learning_rate": 0.0001096329254727475, + "loss": 2.9766, + "step": 2037 + }, + { + "epoch": 27.187290969899667, + "grad_norm": 1.1791727542877197, + "learning_rate": 0.00010958843159065629, + "loss": 2.8115, + "step": 2038 + }, + { + "epoch": 27.20066889632107, + "grad_norm": 1.3347991704940796, + "learning_rate": 0.00010954393770856508, + "loss": 2.839, + "step": 2039 + }, + { + "epoch": 27.214046822742475, + "grad_norm": 1.6515034437179565, + "learning_rate": 0.00010949944382647385, + "loss": 3.2088, + "step": 2040 + }, + { + "epoch": 27.22742474916388, + "grad_norm": 1.4179280996322632, + "learning_rate": 0.00010945494994438266, + "loss": 3.0364, + "step": 2041 + }, + { + "epoch": 27.240802675585286, + "grad_norm": 1.5073821544647217, + "learning_rate": 0.00010941045606229143, + "loss": 2.9634, + "step": 2042 + }, + { + "epoch": 27.254180602006688, + "grad_norm": 1.3612418174743652, + "learning_rate": 0.00010936596218020023, + "loss": 3.0204, + "step": 2043 + }, + { + "epoch": 27.267558528428093, + "grad_norm": 1.3515981435775757, + "learning_rate": 0.00010932146829810901, + "loss": 2.7506, + "step": 2044 + }, + { + "epoch": 27.2809364548495, + "grad_norm": 1.394405722618103, + "learning_rate": 0.0001092769744160178, + "loss": 2.8979, + "step": 2045 + }, + { + "epoch": 27.294314381270905, + "grad_norm": 1.4995383024215698, + "learning_rate": 0.00010923248053392659, + "loss": 3.2501, + "step": 2046 + }, + { + "epoch": 27.307692307692307, + "grad_norm": 1.4658801555633545, + "learning_rate": 0.00010918798665183538, + "loss": 3.1901, + "step": 2047 + }, + { + "epoch": 27.321070234113712, + "grad_norm": 1.4346429109573364, + "learning_rate": 0.00010914349276974417, + "loss": 2.9835, + "step": 2048 + }, + { + "epoch": 27.334448160535118, + "grad_norm": 1.455994963645935, + "learning_rate": 0.00010909899888765296, + "loss": 3.0354, + "step": 2049 + }, + { + "epoch": 27.347826086956523, + "grad_norm": 1.2409014701843262, + "learning_rate": 0.00010905450500556173, + "loss": 3.1903, + "step": 2050 + }, + { + "epoch": 27.361204013377925, + "grad_norm": 1.417441487312317, + "learning_rate": 0.00010901001112347054, + "loss": 2.9081, + "step": 2051 + }, + { + "epoch": 27.37458193979933, + "grad_norm": 1.4116290807724, + "learning_rate": 0.00010896551724137931, + "loss": 2.8466, + "step": 2052 + }, + { + "epoch": 27.387959866220736, + "grad_norm": 1.58635675907135, + "learning_rate": 0.0001089210233592881, + "loss": 3.134, + "step": 2053 + }, + { + "epoch": 27.401337792642142, + "grad_norm": 1.4947638511657715, + "learning_rate": 0.00010887652947719689, + "loss": 2.9585, + "step": 2054 + }, + { + "epoch": 27.414715719063544, + "grad_norm": 1.5971535444259644, + "learning_rate": 0.00010883203559510568, + "loss": 2.954, + "step": 2055 + }, + { + "epoch": 27.42809364548495, + "grad_norm": 1.5000510215759277, + "learning_rate": 0.00010878754171301447, + "loss": 3.1699, + "step": 2056 + }, + { + "epoch": 27.441471571906355, + "grad_norm": 1.3633960485458374, + "learning_rate": 0.00010874304783092326, + "loss": 3.0109, + "step": 2057 + }, + { + "epoch": 27.45484949832776, + "grad_norm": 1.2857588529586792, + "learning_rate": 0.00010869855394883204, + "loss": 3.1932, + "step": 2058 + }, + { + "epoch": 27.468227424749163, + "grad_norm": 1.6654231548309326, + "learning_rate": 0.00010865406006674084, + "loss": 3.2411, + "step": 2059 + }, + { + "epoch": 27.48160535117057, + "grad_norm": 1.398990511894226, + "learning_rate": 0.00010860956618464961, + "loss": 3.2183, + "step": 2060 + }, + { + "epoch": 27.494983277591974, + "grad_norm": 1.4811097383499146, + "learning_rate": 0.00010856507230255842, + "loss": 2.9565, + "step": 2061 + }, + { + "epoch": 27.50836120401338, + "grad_norm": 1.5817798376083374, + "learning_rate": 0.00010852057842046719, + "loss": 3.0652, + "step": 2062 + }, + { + "epoch": 27.52173913043478, + "grad_norm": 1.2948859930038452, + "learning_rate": 0.00010847608453837598, + "loss": 2.8978, + "step": 2063 + }, + { + "epoch": 27.535117056856187, + "grad_norm": 1.3314701318740845, + "learning_rate": 0.00010843159065628477, + "loss": 2.8764, + "step": 2064 + }, + { + "epoch": 27.548494983277592, + "grad_norm": 1.531958818435669, + "learning_rate": 0.00010838709677419356, + "loss": 3.3317, + "step": 2065 + }, + { + "epoch": 27.561872909698998, + "grad_norm": 1.5060667991638184, + "learning_rate": 0.00010834260289210234, + "loss": 3.0724, + "step": 2066 + }, + { + "epoch": 27.5752508361204, + "grad_norm": 1.4489496946334839, + "learning_rate": 0.00010829810901001114, + "loss": 3.0841, + "step": 2067 + }, + { + "epoch": 27.588628762541806, + "grad_norm": 1.4431540966033936, + "learning_rate": 0.00010825361512791992, + "loss": 3.145, + "step": 2068 + }, + { + "epoch": 27.60200668896321, + "grad_norm": 1.6104519367218018, + "learning_rate": 0.00010820912124582869, + "loss": 3.0226, + "step": 2069 + }, + { + "epoch": 27.615384615384617, + "grad_norm": 1.249314546585083, + "learning_rate": 0.00010816462736373749, + "loss": 3.1273, + "step": 2070 + }, + { + "epoch": 27.62876254180602, + "grad_norm": 1.1928060054779053, + "learning_rate": 0.00010812013348164627, + "loss": 2.8877, + "step": 2071 + }, + { + "epoch": 27.642140468227424, + "grad_norm": 1.5014128684997559, + "learning_rate": 0.00010807563959955507, + "loss": 3.4129, + "step": 2072 + }, + { + "epoch": 27.65551839464883, + "grad_norm": 1.344773530960083, + "learning_rate": 0.00010803114571746385, + "loss": 3.1112, + "step": 2073 + }, + { + "epoch": 27.668896321070235, + "grad_norm": 1.616158366203308, + "learning_rate": 0.00010798665183537264, + "loss": 3.1799, + "step": 2074 + }, + { + "epoch": 27.682274247491637, + "grad_norm": 1.4087249040603638, + "learning_rate": 0.00010794215795328141, + "loss": 2.9315, + "step": 2075 + }, + { + "epoch": 27.695652173913043, + "grad_norm": 1.48402738571167, + "learning_rate": 0.00010789766407119022, + "loss": 3.1755, + "step": 2076 + }, + { + "epoch": 27.70903010033445, + "grad_norm": 1.3316876888275146, + "learning_rate": 0.00010785317018909899, + "loss": 3.087, + "step": 2077 + }, + { + "epoch": 27.722408026755854, + "grad_norm": 1.3765549659729004, + "learning_rate": 0.0001078086763070078, + "loss": 3.2595, + "step": 2078 + }, + { + "epoch": 27.735785953177256, + "grad_norm": 1.646952748298645, + "learning_rate": 0.00010776418242491657, + "loss": 3.0627, + "step": 2079 + }, + { + "epoch": 27.74916387959866, + "grad_norm": 1.2064934968948364, + "learning_rate": 0.00010771968854282537, + "loss": 3.0569, + "step": 2080 + }, + { + "epoch": 27.762541806020067, + "grad_norm": 1.3027973175048828, + "learning_rate": 0.00010767519466073415, + "loss": 3.0246, + "step": 2081 + }, + { + "epoch": 27.775919732441473, + "grad_norm": 1.4610519409179688, + "learning_rate": 0.00010763070077864294, + "loss": 3.0123, + "step": 2082 + }, + { + "epoch": 27.789297658862875, + "grad_norm": 1.376183032989502, + "learning_rate": 0.00010758620689655173, + "loss": 3.1312, + "step": 2083 + }, + { + "epoch": 27.80267558528428, + "grad_norm": 1.3715664148330688, + "learning_rate": 0.00010754171301446052, + "loss": 3.0104, + "step": 2084 + }, + { + "epoch": 27.816053511705686, + "grad_norm": 1.2951513528823853, + "learning_rate": 0.00010749721913236929, + "loss": 3.0353, + "step": 2085 + }, + { + "epoch": 27.82943143812709, + "grad_norm": 1.1614545583724976, + "learning_rate": 0.0001074527252502781, + "loss": 2.8977, + "step": 2086 + }, + { + "epoch": 27.842809364548494, + "grad_norm": 1.3948044776916504, + "learning_rate": 0.00010740823136818687, + "loss": 2.8611, + "step": 2087 + }, + { + "epoch": 27.8561872909699, + "grad_norm": 1.3053209781646729, + "learning_rate": 0.00010736373748609568, + "loss": 2.7965, + "step": 2088 + }, + { + "epoch": 27.869565217391305, + "grad_norm": 1.289396047592163, + "learning_rate": 0.00010731924360400445, + "loss": 2.8663, + "step": 2089 + }, + { + "epoch": 27.88294314381271, + "grad_norm": 1.4188008308410645, + "learning_rate": 0.00010727474972191324, + "loss": 3.0139, + "step": 2090 + }, + { + "epoch": 27.896321070234112, + "grad_norm": 1.5196731090545654, + "learning_rate": 0.00010723025583982203, + "loss": 2.9593, + "step": 2091 + }, + { + "epoch": 27.909698996655518, + "grad_norm": 1.5437039136886597, + "learning_rate": 0.00010718576195773082, + "loss": 2.8009, + "step": 2092 + }, + { + "epoch": 27.923076923076923, + "grad_norm": 1.4248530864715576, + "learning_rate": 0.0001071412680756396, + "loss": 2.8469, + "step": 2093 + }, + { + "epoch": 27.93645484949833, + "grad_norm": 1.4018962383270264, + "learning_rate": 0.0001070967741935484, + "loss": 3.0897, + "step": 2094 + }, + { + "epoch": 27.94983277591973, + "grad_norm": 1.2483067512512207, + "learning_rate": 0.00010705228031145717, + "loss": 3.1738, + "step": 2095 + }, + { + "epoch": 27.963210702341136, + "grad_norm": 1.25924813747406, + "learning_rate": 0.00010700778642936598, + "loss": 3.263, + "step": 2096 + }, + { + "epoch": 27.976588628762542, + "grad_norm": 1.2528305053710938, + "learning_rate": 0.00010696329254727475, + "loss": 3.0374, + "step": 2097 + }, + { + "epoch": 27.989966555183948, + "grad_norm": 1.7555912733078003, + "learning_rate": 0.00010691879866518356, + "loss": 3.1906, + "step": 2098 + }, + { + "epoch": 28.0, + "grad_norm": 1.7080843448638916, + "learning_rate": 0.00010687430478309233, + "loss": 2.9748, + "step": 2099 + }, + { + "epoch": 28.013377926421406, + "grad_norm": 1.120409369468689, + "learning_rate": 0.00010682981090100112, + "loss": 2.9385, + "step": 2100 + }, + { + "epoch": 28.02675585284281, + "grad_norm": 0.8793032169342041, + "learning_rate": 0.0001067853170189099, + "loss": 2.9205, + "step": 2101 + }, + { + "epoch": 28.040133779264213, + "grad_norm": 1.018744707107544, + "learning_rate": 0.0001067408231368187, + "loss": 3.1447, + "step": 2102 + }, + { + "epoch": 28.05351170568562, + "grad_norm": 0.9645024538040161, + "learning_rate": 0.00010669632925472748, + "loss": 2.783, + "step": 2103 + }, + { + "epoch": 28.066889632107024, + "grad_norm": 1.0723059177398682, + "learning_rate": 0.00010665183537263628, + "loss": 2.9627, + "step": 2104 + }, + { + "epoch": 28.08026755852843, + "grad_norm": 1.0304062366485596, + "learning_rate": 0.00010660734149054505, + "loss": 2.8118, + "step": 2105 + }, + { + "epoch": 28.093645484949832, + "grad_norm": 1.0894114971160889, + "learning_rate": 0.00010656284760845386, + "loss": 2.9643, + "step": 2106 + }, + { + "epoch": 28.107023411371237, + "grad_norm": 1.1615228652954102, + "learning_rate": 0.00010651835372636263, + "loss": 2.9402, + "step": 2107 + }, + { + "epoch": 28.120401337792643, + "grad_norm": 1.066375732421875, + "learning_rate": 0.00010647385984427144, + "loss": 2.8733, + "step": 2108 + }, + { + "epoch": 28.13377926421405, + "grad_norm": 1.1981687545776367, + "learning_rate": 0.0001064293659621802, + "loss": 2.9301, + "step": 2109 + }, + { + "epoch": 28.14715719063545, + "grad_norm": 1.0867598056793213, + "learning_rate": 0.00010638487208008899, + "loss": 3.1676, + "step": 2110 + }, + { + "epoch": 28.160535117056856, + "grad_norm": 1.156256079673767, + "learning_rate": 0.00010634037819799778, + "loss": 2.9555, + "step": 2111 + }, + { + "epoch": 28.17391304347826, + "grad_norm": 0.9931294322013855, + "learning_rate": 0.00010629588431590655, + "loss": 2.7119, + "step": 2112 + }, + { + "epoch": 28.187290969899667, + "grad_norm": 0.9383543729782104, + "learning_rate": 0.00010625139043381536, + "loss": 2.9251, + "step": 2113 + }, + { + "epoch": 28.20066889632107, + "grad_norm": 0.859805166721344, + "learning_rate": 0.00010620689655172413, + "loss": 2.8734, + "step": 2114 + }, + { + "epoch": 28.214046822742475, + "grad_norm": 1.081536889076233, + "learning_rate": 0.00010616240266963293, + "loss": 3.0356, + "step": 2115 + }, + { + "epoch": 28.22742474916388, + "grad_norm": 1.132004976272583, + "learning_rate": 0.00010611790878754171, + "loss": 3.1751, + "step": 2116 + }, + { + "epoch": 28.240802675585286, + "grad_norm": 1.0406700372695923, + "learning_rate": 0.0001060734149054505, + "loss": 2.913, + "step": 2117 + }, + { + "epoch": 28.254180602006688, + "grad_norm": 1.4362869262695312, + "learning_rate": 0.00010602892102335929, + "loss": 3.0823, + "step": 2118 + }, + { + "epoch": 28.267558528428093, + "grad_norm": 1.0609610080718994, + "learning_rate": 0.00010598442714126808, + "loss": 3.0693, + "step": 2119 + }, + { + "epoch": 28.2809364548495, + "grad_norm": 1.0952329635620117, + "learning_rate": 0.00010593993325917686, + "loss": 2.9953, + "step": 2120 + }, + { + "epoch": 28.294314381270905, + "grad_norm": 1.0358316898345947, + "learning_rate": 0.00010589543937708566, + "loss": 2.7737, + "step": 2121 + }, + { + "epoch": 28.307692307692307, + "grad_norm": 1.1332812309265137, + "learning_rate": 0.00010585094549499443, + "loss": 2.9874, + "step": 2122 + }, + { + "epoch": 28.321070234113712, + "grad_norm": 1.0737558603286743, + "learning_rate": 0.00010580645161290324, + "loss": 3.0547, + "step": 2123 + }, + { + "epoch": 28.334448160535118, + "grad_norm": 1.030110478401184, + "learning_rate": 0.00010576195773081201, + "loss": 2.9786, + "step": 2124 + }, + { + "epoch": 28.347826086956523, + "grad_norm": 1.09152352809906, + "learning_rate": 0.0001057174638487208, + "loss": 2.6407, + "step": 2125 + }, + { + "epoch": 28.361204013377925, + "grad_norm": 0.9880086183547974, + "learning_rate": 0.00010567296996662959, + "loss": 3.1367, + "step": 2126 + }, + { + "epoch": 28.37458193979933, + "grad_norm": 1.1486655473709106, + "learning_rate": 0.00010562847608453838, + "loss": 3.1705, + "step": 2127 + }, + { + "epoch": 28.387959866220736, + "grad_norm": 1.199541449546814, + "learning_rate": 0.00010558398220244716, + "loss": 3.0254, + "step": 2128 + }, + { + "epoch": 28.401337792642142, + "grad_norm": 0.9470932483673096, + "learning_rate": 0.00010553948832035596, + "loss": 3.1296, + "step": 2129 + }, + { + "epoch": 28.414715719063544, + "grad_norm": 1.1546344757080078, + "learning_rate": 0.00010549499443826474, + "loss": 2.9981, + "step": 2130 + }, + { + "epoch": 28.42809364548495, + "grad_norm": 1.0525349378585815, + "learning_rate": 0.00010545050055617354, + "loss": 2.884, + "step": 2131 + }, + { + "epoch": 28.441471571906355, + "grad_norm": 0.8965997695922852, + "learning_rate": 0.00010540600667408231, + "loss": 3.1666, + "step": 2132 + }, + { + "epoch": 28.45484949832776, + "grad_norm": 1.0740344524383545, + "learning_rate": 0.00010536151279199112, + "loss": 2.9477, + "step": 2133 + }, + { + "epoch": 28.468227424749163, + "grad_norm": 1.0058174133300781, + "learning_rate": 0.00010531701890989989, + "loss": 2.9355, + "step": 2134 + }, + { + "epoch": 28.48160535117057, + "grad_norm": 1.0790162086486816, + "learning_rate": 0.00010527252502780868, + "loss": 2.8728, + "step": 2135 + }, + { + "epoch": 28.494983277591974, + "grad_norm": 1.0231183767318726, + "learning_rate": 0.00010522803114571746, + "loss": 2.6208, + "step": 2136 + }, + { + "epoch": 28.50836120401338, + "grad_norm": 1.1527316570281982, + "learning_rate": 0.00010518353726362626, + "loss": 3.0255, + "step": 2137 + }, + { + "epoch": 28.52173913043478, + "grad_norm": 1.0380228757858276, + "learning_rate": 0.00010513904338153504, + "loss": 3.1382, + "step": 2138 + }, + { + "epoch": 28.535117056856187, + "grad_norm": 1.1528582572937012, + "learning_rate": 0.00010509454949944384, + "loss": 2.9921, + "step": 2139 + }, + { + "epoch": 28.548494983277592, + "grad_norm": 1.0610437393188477, + "learning_rate": 0.00010505005561735262, + "loss": 2.832, + "step": 2140 + }, + { + "epoch": 28.561872909698998, + "grad_norm": 1.1252180337905884, + "learning_rate": 0.00010500556173526142, + "loss": 3.0318, + "step": 2141 + }, + { + "epoch": 28.5752508361204, + "grad_norm": 0.910602331161499, + "learning_rate": 0.00010496106785317019, + "loss": 3.0174, + "step": 2142 + }, + { + "epoch": 28.588628762541806, + "grad_norm": 1.1615829467773438, + "learning_rate": 0.000104916573971079, + "loss": 3.1465, + "step": 2143 + }, + { + "epoch": 28.60200668896321, + "grad_norm": 1.0431264638900757, + "learning_rate": 0.00010487208008898776, + "loss": 2.9406, + "step": 2144 + }, + { + "epoch": 28.615384615384617, + "grad_norm": 1.1530067920684814, + "learning_rate": 0.00010482758620689656, + "loss": 2.7672, + "step": 2145 + }, + { + "epoch": 28.62876254180602, + "grad_norm": 1.0598475933074951, + "learning_rate": 0.00010478309232480534, + "loss": 3.0477, + "step": 2146 + }, + { + "epoch": 28.642140468227424, + "grad_norm": 0.9107276797294617, + "learning_rate": 0.00010473859844271414, + "loss": 2.949, + "step": 2147 + }, + { + "epoch": 28.65551839464883, + "grad_norm": 1.0739660263061523, + "learning_rate": 0.00010469410456062292, + "loss": 3.0195, + "step": 2148 + }, + { + "epoch": 28.668896321070235, + "grad_norm": 1.0200115442276, + "learning_rate": 0.00010464961067853172, + "loss": 2.9086, + "step": 2149 + }, + { + "epoch": 28.682274247491637, + "grad_norm": 0.9945818185806274, + "learning_rate": 0.0001046051167964405, + "loss": 3.3103, + "step": 2150 + }, + { + "epoch": 28.695652173913043, + "grad_norm": 1.231123447418213, + "learning_rate": 0.00010456062291434927, + "loss": 2.9266, + "step": 2151 + }, + { + "epoch": 28.70903010033445, + "grad_norm": 1.02529776096344, + "learning_rate": 0.00010451612903225806, + "loss": 2.8796, + "step": 2152 + }, + { + "epoch": 28.722408026755854, + "grad_norm": 1.1735590696334839, + "learning_rate": 0.00010447163515016685, + "loss": 2.8582, + "step": 2153 + }, + { + "epoch": 28.735785953177256, + "grad_norm": 1.0260053873062134, + "learning_rate": 0.00010442714126807564, + "loss": 3.1459, + "step": 2154 + }, + { + "epoch": 28.74916387959866, + "grad_norm": 1.0473296642303467, + "learning_rate": 0.00010438264738598442, + "loss": 3.1804, + "step": 2155 + }, + { + "epoch": 28.762541806020067, + "grad_norm": 1.0932636260986328, + "learning_rate": 0.00010433815350389322, + "loss": 2.8114, + "step": 2156 + }, + { + "epoch": 28.775919732441473, + "grad_norm": 1.115605115890503, + "learning_rate": 0.000104293659621802, + "loss": 2.925, + "step": 2157 + }, + { + "epoch": 28.789297658862875, + "grad_norm": 1.0479992628097534, + "learning_rate": 0.0001042491657397108, + "loss": 3.0461, + "step": 2158 + }, + { + "epoch": 28.80267558528428, + "grad_norm": 0.9241504669189453, + "learning_rate": 0.00010420467185761957, + "loss": 3.0755, + "step": 2159 + }, + { + "epoch": 28.816053511705686, + "grad_norm": 0.9609296917915344, + "learning_rate": 0.00010416017797552838, + "loss": 3.077, + "step": 2160 + }, + { + "epoch": 28.82943143812709, + "grad_norm": 0.9561198949813843, + "learning_rate": 0.00010411568409343715, + "loss": 2.9041, + "step": 2161 + }, + { + "epoch": 28.842809364548494, + "grad_norm": 1.1552175283432007, + "learning_rate": 0.00010407119021134594, + "loss": 2.8253, + "step": 2162 + }, + { + "epoch": 28.8561872909699, + "grad_norm": 0.8945892453193665, + "learning_rate": 0.00010402669632925472, + "loss": 2.9228, + "step": 2163 + }, + { + "epoch": 28.869565217391305, + "grad_norm": 1.0247336626052856, + "learning_rate": 0.00010398220244716352, + "loss": 2.6202, + "step": 2164 + }, + { + "epoch": 28.88294314381271, + "grad_norm": 1.0500261783599854, + "learning_rate": 0.0001039377085650723, + "loss": 2.777, + "step": 2165 + }, + { + "epoch": 28.896321070234112, + "grad_norm": 1.0250235795974731, + "learning_rate": 0.0001038932146829811, + "loss": 3.0606, + "step": 2166 + }, + { + "epoch": 28.909698996655518, + "grad_norm": 1.0018290281295776, + "learning_rate": 0.00010384872080088988, + "loss": 3.0099, + "step": 2167 + }, + { + "epoch": 28.923076923076923, + "grad_norm": 0.8957269191741943, + "learning_rate": 0.00010380422691879868, + "loss": 2.7081, + "step": 2168 + }, + { + "epoch": 28.93645484949833, + "grad_norm": 1.060746431350708, + "learning_rate": 0.00010375973303670745, + "loss": 2.9771, + "step": 2169 + }, + { + "epoch": 28.94983277591973, + "grad_norm": 0.9782920479774475, + "learning_rate": 0.00010371523915461626, + "loss": 3.035, + "step": 2170 + }, + { + "epoch": 28.963210702341136, + "grad_norm": 0.9972925186157227, + "learning_rate": 0.00010367074527252502, + "loss": 3.0531, + "step": 2171 + }, + { + "epoch": 28.976588628762542, + "grad_norm": 1.1463640928268433, + "learning_rate": 0.00010362625139043382, + "loss": 2.9485, + "step": 2172 + }, + { + "epoch": 28.989966555183948, + "grad_norm": 1.0142980813980103, + "learning_rate": 0.0001035817575083426, + "loss": 2.8852, + "step": 2173 + }, + { + "epoch": 29.0, + "grad_norm": 1.347244381904602, + "learning_rate": 0.0001035372636262514, + "loss": 2.898, + "step": 2174 + }, + { + "epoch": 29.013377926421406, + "grad_norm": 0.9205636978149414, + "learning_rate": 0.00010349276974416018, + "loss": 2.7324, + "step": 2175 + }, + { + "epoch": 29.02675585284281, + "grad_norm": 0.9123748540878296, + "learning_rate": 0.00010344827586206898, + "loss": 2.943, + "step": 2176 + }, + { + "epoch": 29.040133779264213, + "grad_norm": 0.8594069480895996, + "learning_rate": 0.00010340378197997776, + "loss": 2.8531, + "step": 2177 + }, + { + "epoch": 29.05351170568562, + "grad_norm": 0.7439792156219482, + "learning_rate": 0.00010335928809788655, + "loss": 2.521, + "step": 2178 + }, + { + "epoch": 29.066889632107024, + "grad_norm": 0.7489947080612183, + "learning_rate": 0.00010331479421579532, + "loss": 2.9317, + "step": 2179 + }, + { + "epoch": 29.08026755852843, + "grad_norm": 0.8311702013015747, + "learning_rate": 0.00010327030033370413, + "loss": 2.7137, + "step": 2180 + }, + { + "epoch": 29.093645484949832, + "grad_norm": 0.9763099551200867, + "learning_rate": 0.0001032258064516129, + "loss": 2.7502, + "step": 2181 + }, + { + "epoch": 29.107023411371237, + "grad_norm": 0.8277449607849121, + "learning_rate": 0.0001031813125695217, + "loss": 2.9792, + "step": 2182 + }, + { + "epoch": 29.120401337792643, + "grad_norm": 0.7491191029548645, + "learning_rate": 0.00010313681868743048, + "loss": 2.8479, + "step": 2183 + }, + { + "epoch": 29.13377926421405, + "grad_norm": 0.7314183115959167, + "learning_rate": 0.00010309232480533928, + "loss": 2.8962, + "step": 2184 + }, + { + "epoch": 29.14715719063545, + "grad_norm": 0.8334509134292603, + "learning_rate": 0.00010304783092324806, + "loss": 2.86, + "step": 2185 + }, + { + "epoch": 29.160535117056856, + "grad_norm": 0.9736613631248474, + "learning_rate": 0.00010300333704115685, + "loss": 2.9289, + "step": 2186 + }, + { + "epoch": 29.17391304347826, + "grad_norm": 0.9118865132331848, + "learning_rate": 0.00010295884315906564, + "loss": 3.0865, + "step": 2187 + }, + { + "epoch": 29.187290969899667, + "grad_norm": 0.992799699306488, + "learning_rate": 0.00010291434927697443, + "loss": 2.7811, + "step": 2188 + }, + { + "epoch": 29.20066889632107, + "grad_norm": 1.0415345430374146, + "learning_rate": 0.0001028698553948832, + "loss": 2.7707, + "step": 2189 + }, + { + "epoch": 29.214046822742475, + "grad_norm": 0.7599477171897888, + "learning_rate": 0.00010282536151279201, + "loss": 2.8162, + "step": 2190 + }, + { + "epoch": 29.22742474916388, + "grad_norm": 1.17939293384552, + "learning_rate": 0.00010278086763070078, + "loss": 2.7939, + "step": 2191 + }, + { + "epoch": 29.240802675585286, + "grad_norm": 0.8793815970420837, + "learning_rate": 0.00010273637374860956, + "loss": 2.7704, + "step": 2192 + }, + { + "epoch": 29.254180602006688, + "grad_norm": 0.838680624961853, + "learning_rate": 0.00010269187986651836, + "loss": 2.8513, + "step": 2193 + }, + { + "epoch": 29.267558528428093, + "grad_norm": 0.8918489217758179, + "learning_rate": 0.00010264738598442714, + "loss": 2.7748, + "step": 2194 + }, + { + "epoch": 29.2809364548495, + "grad_norm": 0.7513167858123779, + "learning_rate": 0.00010260289210233594, + "loss": 2.8151, + "step": 2195 + }, + { + "epoch": 29.294314381270905, + "grad_norm": 0.7921515703201294, + "learning_rate": 0.0001025583982202447, + "loss": 2.8004, + "step": 2196 + }, + { + "epoch": 29.307692307692307, + "grad_norm": 0.7862170934677124, + "learning_rate": 0.00010251390433815352, + "loss": 2.9749, + "step": 2197 + }, + { + "epoch": 29.321070234113712, + "grad_norm": 0.8512644171714783, + "learning_rate": 0.00010246941045606228, + "loss": 2.9936, + "step": 2198 + }, + { + "epoch": 29.334448160535118, + "grad_norm": 0.9074665307998657, + "learning_rate": 0.00010242491657397108, + "loss": 2.814, + "step": 2199 + }, + { + "epoch": 29.347826086956523, + "grad_norm": 0.9875670075416565, + "learning_rate": 0.00010238042269187986, + "loss": 2.9862, + "step": 2200 + }, + { + "epoch": 29.361204013377925, + "grad_norm": 0.7445062398910522, + "learning_rate": 0.00010233592880978866, + "loss": 2.8233, + "step": 2201 + }, + { + "epoch": 29.37458193979933, + "grad_norm": 0.8388944268226624, + "learning_rate": 0.00010229143492769744, + "loss": 2.8983, + "step": 2202 + }, + { + "epoch": 29.387959866220736, + "grad_norm": 0.7882921099662781, + "learning_rate": 0.00010224694104560624, + "loss": 2.9521, + "step": 2203 + }, + { + "epoch": 29.401337792642142, + "grad_norm": 0.8031525015830994, + "learning_rate": 0.00010220244716351502, + "loss": 2.8722, + "step": 2204 + }, + { + "epoch": 29.414715719063544, + "grad_norm": 0.8128193616867065, + "learning_rate": 0.00010215795328142381, + "loss": 2.972, + "step": 2205 + }, + { + "epoch": 29.42809364548495, + "grad_norm": 1.153869867324829, + "learning_rate": 0.00010211345939933258, + "loss": 2.8711, + "step": 2206 + }, + { + "epoch": 29.441471571906355, + "grad_norm": 0.8942824602127075, + "learning_rate": 0.0001020689655172414, + "loss": 3.2308, + "step": 2207 + }, + { + "epoch": 29.45484949832776, + "grad_norm": 0.8985738158226013, + "learning_rate": 0.00010202447163515016, + "loss": 2.8042, + "step": 2208 + }, + { + "epoch": 29.468227424749163, + "grad_norm": 0.8202070593833923, + "learning_rate": 0.00010197997775305896, + "loss": 2.7193, + "step": 2209 + }, + { + "epoch": 29.48160535117057, + "grad_norm": 0.7752913236618042, + "learning_rate": 0.00010193548387096774, + "loss": 2.8162, + "step": 2210 + }, + { + "epoch": 29.494983277591974, + "grad_norm": 0.8399960994720459, + "learning_rate": 0.00010189098998887654, + "loss": 3.0488, + "step": 2211 + }, + { + "epoch": 29.50836120401338, + "grad_norm": 0.9991989731788635, + "learning_rate": 0.00010184649610678532, + "loss": 2.8647, + "step": 2212 + }, + { + "epoch": 29.52173913043478, + "grad_norm": 0.8456683158874512, + "learning_rate": 0.00010180200222469411, + "loss": 2.9635, + "step": 2213 + }, + { + "epoch": 29.535117056856187, + "grad_norm": 0.7978901863098145, + "learning_rate": 0.0001017575083426029, + "loss": 2.8927, + "step": 2214 + }, + { + "epoch": 29.548494983277592, + "grad_norm": 0.8560264706611633, + "learning_rate": 0.00010171301446051169, + "loss": 2.9403, + "step": 2215 + }, + { + "epoch": 29.561872909698998, + "grad_norm": 0.7215510010719299, + "learning_rate": 0.00010166852057842046, + "loss": 2.7367, + "step": 2216 + }, + { + "epoch": 29.5752508361204, + "grad_norm": 0.8395666480064392, + "learning_rate": 0.00010162402669632927, + "loss": 2.9857, + "step": 2217 + }, + { + "epoch": 29.588628762541806, + "grad_norm": 0.9163089394569397, + "learning_rate": 0.00010157953281423804, + "loss": 2.9605, + "step": 2218 + }, + { + "epoch": 29.60200668896321, + "grad_norm": 0.8586741089820862, + "learning_rate": 0.00010153503893214684, + "loss": 2.9543, + "step": 2219 + }, + { + "epoch": 29.615384615384617, + "grad_norm": 0.7774202227592468, + "learning_rate": 0.00010149054505005562, + "loss": 3.0245, + "step": 2220 + }, + { + "epoch": 29.62876254180602, + "grad_norm": 0.8288631439208984, + "learning_rate": 0.00010144605116796441, + "loss": 2.975, + "step": 2221 + }, + { + "epoch": 29.642140468227424, + "grad_norm": 0.7886912226676941, + "learning_rate": 0.0001014015572858732, + "loss": 2.6679, + "step": 2222 + }, + { + "epoch": 29.65551839464883, + "grad_norm": 0.7964534759521484, + "learning_rate": 0.00010135706340378199, + "loss": 2.9989, + "step": 2223 + }, + { + "epoch": 29.668896321070235, + "grad_norm": 1.0234196186065674, + "learning_rate": 0.00010131256952169078, + "loss": 3.2245, + "step": 2224 + }, + { + "epoch": 29.682274247491637, + "grad_norm": 0.8414390683174133, + "learning_rate": 0.00010126807563959957, + "loss": 3.0525, + "step": 2225 + }, + { + "epoch": 29.695652173913043, + "grad_norm": 0.8034384250640869, + "learning_rate": 0.00010122358175750834, + "loss": 2.9012, + "step": 2226 + }, + { + "epoch": 29.70903010033445, + "grad_norm": 0.7711749076843262, + "learning_rate": 0.00010117908787541715, + "loss": 2.7548, + "step": 2227 + }, + { + "epoch": 29.722408026755854, + "grad_norm": 0.9712610840797424, + "learning_rate": 0.00010113459399332592, + "loss": 3.1162, + "step": 2228 + }, + { + "epoch": 29.735785953177256, + "grad_norm": 0.8966991305351257, + "learning_rate": 0.00010109010011123471, + "loss": 3.0883, + "step": 2229 + }, + { + "epoch": 29.74916387959866, + "grad_norm": 0.8569051027297974, + "learning_rate": 0.0001010456062291435, + "loss": 2.8909, + "step": 2230 + }, + { + "epoch": 29.762541806020067, + "grad_norm": 0.9149661660194397, + "learning_rate": 0.00010100111234705229, + "loss": 2.7559, + "step": 2231 + }, + { + "epoch": 29.775919732441473, + "grad_norm": 1.1889090538024902, + "learning_rate": 0.00010095661846496107, + "loss": 2.7872, + "step": 2232 + }, + { + "epoch": 29.789297658862875, + "grad_norm": 0.8464928865432739, + "learning_rate": 0.00010091212458286984, + "loss": 2.8496, + "step": 2233 + }, + { + "epoch": 29.80267558528428, + "grad_norm": 0.8210122585296631, + "learning_rate": 0.00010086763070077865, + "loss": 3.0782, + "step": 2234 + }, + { + "epoch": 29.816053511705686, + "grad_norm": 0.7978618741035461, + "learning_rate": 0.00010082313681868742, + "loss": 2.9633, + "step": 2235 + }, + { + "epoch": 29.82943143812709, + "grad_norm": 0.7863582968711853, + "learning_rate": 0.00010077864293659622, + "loss": 2.9536, + "step": 2236 + }, + { + "epoch": 29.842809364548494, + "grad_norm": 0.9451448917388916, + "learning_rate": 0.000100734149054505, + "loss": 2.945, + "step": 2237 + }, + { + "epoch": 29.8561872909699, + "grad_norm": 0.9478015303611755, + "learning_rate": 0.0001006896551724138, + "loss": 2.8408, + "step": 2238 + }, + { + "epoch": 29.869565217391305, + "grad_norm": 0.804568886756897, + "learning_rate": 0.00010064516129032258, + "loss": 3.093, + "step": 2239 + }, + { + "epoch": 29.88294314381271, + "grad_norm": 0.8370556235313416, + "learning_rate": 0.00010060066740823137, + "loss": 3.0561, + "step": 2240 + }, + { + "epoch": 29.896321070234112, + "grad_norm": 0.9402264952659607, + "learning_rate": 0.00010055617352614016, + "loss": 3.0024, + "step": 2241 + }, + { + "epoch": 29.909698996655518, + "grad_norm": 0.7789230942726135, + "learning_rate": 0.00010051167964404895, + "loss": 2.8388, + "step": 2242 + }, + { + "epoch": 29.923076923076923, + "grad_norm": 0.8228744268417358, + "learning_rate": 0.00010046718576195772, + "loss": 3.1335, + "step": 2243 + }, + { + "epoch": 29.93645484949833, + "grad_norm": 0.9612992405891418, + "learning_rate": 0.00010042269187986653, + "loss": 3.0355, + "step": 2244 + }, + { + "epoch": 29.94983277591973, + "grad_norm": 0.989894688129425, + "learning_rate": 0.0001003781979977753, + "loss": 2.6879, + "step": 2245 + }, + { + "epoch": 29.963210702341136, + "grad_norm": 0.7333267331123352, + "learning_rate": 0.0001003337041156841, + "loss": 3.004, + "step": 2246 + }, + { + "epoch": 29.976588628762542, + "grad_norm": 0.802925705909729, + "learning_rate": 0.00010028921023359288, + "loss": 3.0526, + "step": 2247 + }, + { + "epoch": 29.989966555183948, + "grad_norm": 0.8313626646995544, + "learning_rate": 0.00010024471635150167, + "loss": 2.554, + "step": 2248 + }, + { + "epoch": 30.0, + "grad_norm": 1.1626179218292236, + "learning_rate": 0.00010020022246941046, + "loss": 3.0765, + "step": 2249 + }, + { + "epoch": 30.013377926421406, + "grad_norm": 0.6676865220069885, + "learning_rate": 0.00010015572858731925, + "loss": 2.8693, + "step": 2250 + }, + { + "epoch": 30.02675585284281, + "grad_norm": 0.719235897064209, + "learning_rate": 0.00010011123470522804, + "loss": 2.8247, + "step": 2251 + }, + { + "epoch": 30.040133779264213, + "grad_norm": 0.8199754953384399, + "learning_rate": 0.00010006674082313683, + "loss": 3.0088, + "step": 2252 + }, + { + "epoch": 30.05351170568562, + "grad_norm": 0.7587671279907227, + "learning_rate": 0.0001000222469410456, + "loss": 3.118, + "step": 2253 + }, + { + "epoch": 30.066889632107024, + "grad_norm": 0.7287217974662781, + "learning_rate": 9.99777530589544e-05, + "loss": 2.6612, + "step": 2254 + }, + { + "epoch": 30.08026755852843, + "grad_norm": 0.6484748125076294, + "learning_rate": 9.993325917686318e-05, + "loss": 3.0679, + "step": 2255 + }, + { + "epoch": 30.093645484949832, + "grad_norm": 0.6975134015083313, + "learning_rate": 9.988876529477197e-05, + "loss": 2.9633, + "step": 2256 + }, + { + "epoch": 30.107023411371237, + "grad_norm": 0.7293218970298767, + "learning_rate": 9.984427141268076e-05, + "loss": 2.9374, + "step": 2257 + }, + { + "epoch": 30.120401337792643, + "grad_norm": 0.8367610573768616, + "learning_rate": 9.979977753058954e-05, + "loss": 2.8749, + "step": 2258 + }, + { + "epoch": 30.13377926421405, + "grad_norm": 0.6851879954338074, + "learning_rate": 9.975528364849834e-05, + "loss": 2.4921, + "step": 2259 + }, + { + "epoch": 30.14715719063545, + "grad_norm": 0.6916420459747314, + "learning_rate": 9.971078976640712e-05, + "loss": 2.6425, + "step": 2260 + }, + { + "epoch": 30.160535117056856, + "grad_norm": 0.6615001559257507, + "learning_rate": 9.966629588431591e-05, + "loss": 2.6696, + "step": 2261 + }, + { + "epoch": 30.17391304347826, + "grad_norm": 0.7402496933937073, + "learning_rate": 9.96218020022247e-05, + "loss": 2.7479, + "step": 2262 + }, + { + "epoch": 30.187290969899667, + "grad_norm": 0.6824116110801697, + "learning_rate": 9.957730812013348e-05, + "loss": 2.427, + "step": 2263 + }, + { + "epoch": 30.20066889632107, + "grad_norm": 0.6979151964187622, + "learning_rate": 9.953281423804227e-05, + "loss": 3.0677, + "step": 2264 + }, + { + "epoch": 30.214046822742475, + "grad_norm": 0.7541396617889404, + "learning_rate": 9.948832035595106e-05, + "loss": 2.9202, + "step": 2265 + }, + { + "epoch": 30.22742474916388, + "grad_norm": 0.671625554561615, + "learning_rate": 9.944382647385985e-05, + "loss": 2.6953, + "step": 2266 + }, + { + "epoch": 30.240802675585286, + "grad_norm": 0.6994143128395081, + "learning_rate": 9.939933259176863e-05, + "loss": 2.877, + "step": 2267 + }, + { + "epoch": 30.254180602006688, + "grad_norm": 0.7016029953956604, + "learning_rate": 9.935483870967742e-05, + "loss": 2.7883, + "step": 2268 + }, + { + "epoch": 30.267558528428093, + "grad_norm": 0.7190927267074585, + "learning_rate": 9.931034482758621e-05, + "loss": 2.9066, + "step": 2269 + }, + { + "epoch": 30.2809364548495, + "grad_norm": 0.8793424963951111, + "learning_rate": 9.9265850945495e-05, + "loss": 2.7433, + "step": 2270 + }, + { + "epoch": 30.294314381270905, + "grad_norm": 0.7914022207260132, + "learning_rate": 9.922135706340379e-05, + "loss": 2.6283, + "step": 2271 + }, + { + "epoch": 30.307692307692307, + "grad_norm": 0.7672899961471558, + "learning_rate": 9.917686318131257e-05, + "loss": 3.0767, + "step": 2272 + }, + { + "epoch": 30.321070234113712, + "grad_norm": 0.7170990109443665, + "learning_rate": 9.913236929922136e-05, + "loss": 2.9666, + "step": 2273 + }, + { + "epoch": 30.334448160535118, + "grad_norm": 0.7215063571929932, + "learning_rate": 9.908787541713015e-05, + "loss": 2.9646, + "step": 2274 + }, + { + "epoch": 30.347826086956523, + "grad_norm": 0.6509780287742615, + "learning_rate": 9.904338153503893e-05, + "loss": 2.9134, + "step": 2275 + }, + { + "epoch": 30.361204013377925, + "grad_norm": 0.6361657977104187, + "learning_rate": 9.899888765294773e-05, + "loss": 2.6955, + "step": 2276 + }, + { + "epoch": 30.37458193979933, + "grad_norm": 0.6722748279571533, + "learning_rate": 9.895439377085651e-05, + "loss": 3.1694, + "step": 2277 + }, + { + "epoch": 30.387959866220736, + "grad_norm": 0.796759307384491, + "learning_rate": 9.89098998887653e-05, + "loss": 2.9863, + "step": 2278 + }, + { + "epoch": 30.401337792642142, + "grad_norm": 0.8303789496421814, + "learning_rate": 9.886540600667409e-05, + "loss": 2.6192, + "step": 2279 + }, + { + "epoch": 30.414715719063544, + "grad_norm": 0.7565797567367554, + "learning_rate": 9.882091212458287e-05, + "loss": 2.8709, + "step": 2280 + }, + { + "epoch": 30.42809364548495, + "grad_norm": 0.7000163197517395, + "learning_rate": 9.877641824249167e-05, + "loss": 2.9017, + "step": 2281 + }, + { + "epoch": 30.441471571906355, + "grad_norm": 0.7144603729248047, + "learning_rate": 9.873192436040045e-05, + "loss": 2.9685, + "step": 2282 + }, + { + "epoch": 30.45484949832776, + "grad_norm": 0.8616060018539429, + "learning_rate": 9.868743047830923e-05, + "loss": 2.554, + "step": 2283 + }, + { + "epoch": 30.468227424749163, + "grad_norm": 0.6722779273986816, + "learning_rate": 9.864293659621803e-05, + "loss": 2.7132, + "step": 2284 + }, + { + "epoch": 30.48160535117057, + "grad_norm": 0.8574376702308655, + "learning_rate": 9.859844271412681e-05, + "loss": 2.8295, + "step": 2285 + }, + { + "epoch": 30.494983277591974, + "grad_norm": 0.6491283178329468, + "learning_rate": 9.855394883203561e-05, + "loss": 2.6042, + "step": 2286 + }, + { + "epoch": 30.50836120401338, + "grad_norm": 0.677865743637085, + "learning_rate": 9.850945494994439e-05, + "loss": 2.9783, + "step": 2287 + }, + { + "epoch": 30.52173913043478, + "grad_norm": 0.6842278242111206, + "learning_rate": 9.846496106785317e-05, + "loss": 2.8781, + "step": 2288 + }, + { + "epoch": 30.535117056856187, + "grad_norm": 0.7441837787628174, + "learning_rate": 9.842046718576197e-05, + "loss": 2.73, + "step": 2289 + }, + { + "epoch": 30.548494983277592, + "grad_norm": 0.8213462233543396, + "learning_rate": 9.837597330367075e-05, + "loss": 3.0742, + "step": 2290 + }, + { + "epoch": 30.561872909698998, + "grad_norm": 0.7320848703384399, + "learning_rate": 9.833147942157955e-05, + "loss": 3.0144, + "step": 2291 + }, + { + "epoch": 30.5752508361204, + "grad_norm": 0.6924611330032349, + "learning_rate": 9.828698553948833e-05, + "loss": 2.825, + "step": 2292 + }, + { + "epoch": 30.588628762541806, + "grad_norm": 0.6979595422744751, + "learning_rate": 9.824249165739711e-05, + "loss": 2.5886, + "step": 2293 + }, + { + "epoch": 30.60200668896321, + "grad_norm": 0.772278368473053, + "learning_rate": 9.819799777530591e-05, + "loss": 2.9113, + "step": 2294 + }, + { + "epoch": 30.615384615384617, + "grad_norm": 0.7430849075317383, + "learning_rate": 9.815350389321468e-05, + "loss": 2.8571, + "step": 2295 + }, + { + "epoch": 30.62876254180602, + "grad_norm": 0.7554115056991577, + "learning_rate": 9.810901001112347e-05, + "loss": 2.8272, + "step": 2296 + }, + { + "epoch": 30.642140468227424, + "grad_norm": 0.8022170662879944, + "learning_rate": 9.806451612903226e-05, + "loss": 2.8113, + "step": 2297 + }, + { + "epoch": 30.65551839464883, + "grad_norm": 0.7076969146728516, + "learning_rate": 9.802002224694105e-05, + "loss": 2.8282, + "step": 2298 + }, + { + "epoch": 30.668896321070235, + "grad_norm": 0.8672876954078674, + "learning_rate": 9.797552836484983e-05, + "loss": 2.8059, + "step": 2299 + }, + { + "epoch": 30.682274247491637, + "grad_norm": 0.7063407301902771, + "learning_rate": 9.793103448275862e-05, + "loss": 3.0516, + "step": 2300 + }, + { + "epoch": 30.695652173913043, + "grad_norm": 0.6833634376525879, + "learning_rate": 9.788654060066741e-05, + "loss": 3.0762, + "step": 2301 + }, + { + "epoch": 30.70903010033445, + "grad_norm": 0.7486119270324707, + "learning_rate": 9.78420467185762e-05, + "loss": 2.7423, + "step": 2302 + }, + { + "epoch": 30.722408026755854, + "grad_norm": 0.7409520149230957, + "learning_rate": 9.779755283648499e-05, + "loss": 2.9579, + "step": 2303 + }, + { + "epoch": 30.735785953177256, + "grad_norm": 0.7436200380325317, + "learning_rate": 9.775305895439377e-05, + "loss": 2.796, + "step": 2304 + }, + { + "epoch": 30.74916387959866, + "grad_norm": 0.7009103298187256, + "learning_rate": 9.770856507230256e-05, + "loss": 2.9099, + "step": 2305 + }, + { + "epoch": 30.762541806020067, + "grad_norm": 0.7946734428405762, + "learning_rate": 9.766407119021135e-05, + "loss": 3.014, + "step": 2306 + }, + { + "epoch": 30.775919732441473, + "grad_norm": 0.7147461175918579, + "learning_rate": 9.761957730812013e-05, + "loss": 2.5407, + "step": 2307 + }, + { + "epoch": 30.789297658862875, + "grad_norm": 0.7347055673599243, + "learning_rate": 9.757508342602893e-05, + "loss": 3.0734, + "step": 2308 + }, + { + "epoch": 30.80267558528428, + "grad_norm": 0.7037932872772217, + "learning_rate": 9.753058954393771e-05, + "loss": 2.8854, + "step": 2309 + }, + { + "epoch": 30.816053511705686, + "grad_norm": 0.6475211977958679, + "learning_rate": 9.74860956618465e-05, + "loss": 2.8844, + "step": 2310 + }, + { + "epoch": 30.82943143812709, + "grad_norm": 0.7679370045661926, + "learning_rate": 9.744160177975529e-05, + "loss": 2.8667, + "step": 2311 + }, + { + "epoch": 30.842809364548494, + "grad_norm": 0.7334820032119751, + "learning_rate": 9.739710789766407e-05, + "loss": 2.8936, + "step": 2312 + }, + { + "epoch": 30.8561872909699, + "grad_norm": 0.6336019039154053, + "learning_rate": 9.735261401557287e-05, + "loss": 2.7476, + "step": 2313 + }, + { + "epoch": 30.869565217391305, + "grad_norm": 0.6566322445869446, + "learning_rate": 9.730812013348165e-05, + "loss": 2.8259, + "step": 2314 + }, + { + "epoch": 30.88294314381271, + "grad_norm": 0.6635501384735107, + "learning_rate": 9.726362625139043e-05, + "loss": 2.7668, + "step": 2315 + }, + { + "epoch": 30.896321070234112, + "grad_norm": 0.7602748274803162, + "learning_rate": 9.721913236929923e-05, + "loss": 2.8645, + "step": 2316 + }, + { + "epoch": 30.909698996655518, + "grad_norm": 0.8196545243263245, + "learning_rate": 9.717463848720801e-05, + "loss": 2.8413, + "step": 2317 + }, + { + "epoch": 30.923076923076923, + "grad_norm": 0.6762588620185852, + "learning_rate": 9.713014460511681e-05, + "loss": 2.6832, + "step": 2318 + }, + { + "epoch": 30.93645484949833, + "grad_norm": 0.665920078754425, + "learning_rate": 9.708565072302559e-05, + "loss": 3.0145, + "step": 2319 + }, + { + "epoch": 30.94983277591973, + "grad_norm": 0.6335712671279907, + "learning_rate": 9.704115684093437e-05, + "loss": 2.9063, + "step": 2320 + }, + { + "epoch": 30.963210702341136, + "grad_norm": 0.7016931176185608, + "learning_rate": 9.699666295884317e-05, + "loss": 2.7531, + "step": 2321 + }, + { + "epoch": 30.976588628762542, + "grad_norm": 0.8611680865287781, + "learning_rate": 9.695216907675195e-05, + "loss": 2.9685, + "step": 2322 + }, + { + "epoch": 30.989966555183948, + "grad_norm": 0.6724279522895813, + "learning_rate": 9.690767519466075e-05, + "loss": 2.8643, + "step": 2323 + }, + { + "epoch": 31.0, + "grad_norm": 0.7072698473930359, + "learning_rate": 9.686318131256953e-05, + "loss": 2.8678, + "step": 2324 + }, + { + "epoch": 31.013377926421406, + "grad_norm": 0.7127351760864258, + "learning_rate": 9.681868743047831e-05, + "loss": 2.7762, + "step": 2325 + }, + { + "epoch": 31.02675585284281, + "grad_norm": 0.6769905686378479, + "learning_rate": 9.677419354838711e-05, + "loss": 2.3606, + "step": 2326 + }, + { + "epoch": 31.040133779264213, + "grad_norm": 0.6301751732826233, + "learning_rate": 9.672969966629589e-05, + "loss": 2.8011, + "step": 2327 + }, + { + "epoch": 31.05351170568562, + "grad_norm": 0.6441762447357178, + "learning_rate": 9.668520578420469e-05, + "loss": 2.902, + "step": 2328 + }, + { + "epoch": 31.066889632107024, + "grad_norm": 0.6481513381004333, + "learning_rate": 9.664071190211347e-05, + "loss": 2.7218, + "step": 2329 + }, + { + "epoch": 31.08026755852843, + "grad_norm": 0.5800172686576843, + "learning_rate": 9.659621802002225e-05, + "loss": 2.6825, + "step": 2330 + }, + { + "epoch": 31.093645484949832, + "grad_norm": 0.574657142162323, + "learning_rate": 9.655172413793105e-05, + "loss": 3.0232, + "step": 2331 + }, + { + "epoch": 31.107023411371237, + "grad_norm": 0.7543913125991821, + "learning_rate": 9.650723025583983e-05, + "loss": 2.8385, + "step": 2332 + }, + { + "epoch": 31.120401337792643, + "grad_norm": 0.5946618914604187, + "learning_rate": 9.646273637374862e-05, + "loss": 2.9112, + "step": 2333 + }, + { + "epoch": 31.13377926421405, + "grad_norm": 0.6467399001121521, + "learning_rate": 9.641824249165741e-05, + "loss": 2.6113, + "step": 2334 + }, + { + "epoch": 31.14715719063545, + "grad_norm": 0.6605884432792664, + "learning_rate": 9.637374860956619e-05, + "loss": 2.6679, + "step": 2335 + }, + { + "epoch": 31.160535117056856, + "grad_norm": 0.6213564276695251, + "learning_rate": 9.632925472747497e-05, + "loss": 2.7759, + "step": 2336 + }, + { + "epoch": 31.17391304347826, + "grad_norm": 0.5951142907142639, + "learning_rate": 9.628476084538375e-05, + "loss": 2.7004, + "step": 2337 + }, + { + "epoch": 31.187290969899667, + "grad_norm": 0.7088178396224976, + "learning_rate": 9.624026696329255e-05, + "loss": 2.6991, + "step": 2338 + }, + { + "epoch": 31.20066889632107, + "grad_norm": 0.6114148497581482, + "learning_rate": 9.619577308120133e-05, + "loss": 2.8156, + "step": 2339 + }, + { + "epoch": 31.214046822742475, + "grad_norm": 0.6434885859489441, + "learning_rate": 9.615127919911013e-05, + "loss": 2.7579, + "step": 2340 + }, + { + "epoch": 31.22742474916388, + "grad_norm": 0.6285985708236694, + "learning_rate": 9.610678531701891e-05, + "loss": 2.748, + "step": 2341 + }, + { + "epoch": 31.240802675585286, + "grad_norm": 0.6324411630630493, + "learning_rate": 9.60622914349277e-05, + "loss": 3.0152, + "step": 2342 + }, + { + "epoch": 31.254180602006688, + "grad_norm": 0.6331474781036377, + "learning_rate": 9.601779755283649e-05, + "loss": 2.6819, + "step": 2343 + }, + { + "epoch": 31.267558528428093, + "grad_norm": 0.6276957392692566, + "learning_rate": 9.597330367074527e-05, + "loss": 2.8349, + "step": 2344 + }, + { + "epoch": 31.2809364548495, + "grad_norm": 0.6469705104827881, + "learning_rate": 9.592880978865407e-05, + "loss": 2.9038, + "step": 2345 + }, + { + "epoch": 31.294314381270905, + "grad_norm": 0.6182751059532166, + "learning_rate": 9.588431590656285e-05, + "loss": 2.6157, + "step": 2346 + }, + { + "epoch": 31.307692307692307, + "grad_norm": 0.6365599632263184, + "learning_rate": 9.583982202447163e-05, + "loss": 2.6948, + "step": 2347 + }, + { + "epoch": 31.321070234113712, + "grad_norm": 0.6663181781768799, + "learning_rate": 9.579532814238043e-05, + "loss": 2.7856, + "step": 2348 + }, + { + "epoch": 31.334448160535118, + "grad_norm": 0.6316637396812439, + "learning_rate": 9.575083426028921e-05, + "loss": 2.7797, + "step": 2349 + }, + { + "epoch": 31.347826086956523, + "grad_norm": 0.6328778862953186, + "learning_rate": 9.570634037819801e-05, + "loss": 2.856, + "step": 2350 + }, + { + "epoch": 31.361204013377925, + "grad_norm": 0.6759666204452515, + "learning_rate": 9.566184649610679e-05, + "loss": 2.8553, + "step": 2351 + }, + { + "epoch": 31.37458193979933, + "grad_norm": 0.6212480664253235, + "learning_rate": 9.561735261401557e-05, + "loss": 2.9275, + "step": 2352 + }, + { + "epoch": 31.387959866220736, + "grad_norm": 0.6709821224212646, + "learning_rate": 9.557285873192437e-05, + "loss": 2.6805, + "step": 2353 + }, + { + "epoch": 31.401337792642142, + "grad_norm": 0.883128821849823, + "learning_rate": 9.552836484983315e-05, + "loss": 2.9176, + "step": 2354 + }, + { + "epoch": 31.414715719063544, + "grad_norm": 0.6021890640258789, + "learning_rate": 9.548387096774195e-05, + "loss": 2.9855, + "step": 2355 + }, + { + "epoch": 31.42809364548495, + "grad_norm": 0.6398679614067078, + "learning_rate": 9.543937708565073e-05, + "loss": 2.6652, + "step": 2356 + }, + { + "epoch": 31.441471571906355, + "grad_norm": 0.6757022738456726, + "learning_rate": 9.539488320355951e-05, + "loss": 2.7315, + "step": 2357 + }, + { + "epoch": 31.45484949832776, + "grad_norm": 0.6144642233848572, + "learning_rate": 9.53503893214683e-05, + "loss": 2.9645, + "step": 2358 + }, + { + "epoch": 31.468227424749163, + "grad_norm": 0.6620282530784607, + "learning_rate": 9.530589543937709e-05, + "loss": 2.6869, + "step": 2359 + }, + { + "epoch": 31.48160535117057, + "grad_norm": 0.7360846996307373, + "learning_rate": 9.526140155728588e-05, + "loss": 2.8083, + "step": 2360 + }, + { + "epoch": 31.494983277591974, + "grad_norm": 0.6184893250465393, + "learning_rate": 9.521690767519467e-05, + "loss": 2.9699, + "step": 2361 + }, + { + "epoch": 31.50836120401338, + "grad_norm": 0.5998436808586121, + "learning_rate": 9.517241379310345e-05, + "loss": 2.6591, + "step": 2362 + }, + { + "epoch": 31.52173913043478, + "grad_norm": 0.6120555996894836, + "learning_rate": 9.512791991101225e-05, + "loss": 3.0051, + "step": 2363 + }, + { + "epoch": 31.535117056856187, + "grad_norm": 0.6484112739562988, + "learning_rate": 9.508342602892103e-05, + "loss": 2.8998, + "step": 2364 + }, + { + "epoch": 31.548494983277592, + "grad_norm": 0.7333430051803589, + "learning_rate": 9.503893214682982e-05, + "loss": 2.8552, + "step": 2365 + }, + { + "epoch": 31.561872909698998, + "grad_norm": 0.6599973440170288, + "learning_rate": 9.49944382647386e-05, + "loss": 2.8119, + "step": 2366 + }, + { + "epoch": 31.5752508361204, + "grad_norm": 0.6654923558235168, + "learning_rate": 9.494994438264739e-05, + "loss": 2.6518, + "step": 2367 + }, + { + "epoch": 31.588628762541806, + "grad_norm": 0.6521298289299011, + "learning_rate": 9.490545050055618e-05, + "loss": 2.9758, + "step": 2368 + }, + { + "epoch": 31.60200668896321, + "grad_norm": 0.6431354284286499, + "learning_rate": 9.486095661846497e-05, + "loss": 2.7054, + "step": 2369 + }, + { + "epoch": 31.615384615384617, + "grad_norm": 0.7166205644607544, + "learning_rate": 9.481646273637376e-05, + "loss": 2.8292, + "step": 2370 + }, + { + "epoch": 31.62876254180602, + "grad_norm": 0.6184803247451782, + "learning_rate": 9.477196885428255e-05, + "loss": 2.8888, + "step": 2371 + }, + { + "epoch": 31.642140468227424, + "grad_norm": 0.6148157119750977, + "learning_rate": 9.472747497219133e-05, + "loss": 3.0765, + "step": 2372 + }, + { + "epoch": 31.65551839464883, + "grad_norm": 0.5624946355819702, + "learning_rate": 9.468298109010012e-05, + "loss": 2.4355, + "step": 2373 + }, + { + "epoch": 31.668896321070235, + "grad_norm": 0.6958364844322205, + "learning_rate": 9.46384872080089e-05, + "loss": 2.7913, + "step": 2374 + }, + { + "epoch": 31.682274247491637, + "grad_norm": 0.5998165011405945, + "learning_rate": 9.45939933259177e-05, + "loss": 2.7362, + "step": 2375 + }, + { + "epoch": 31.695652173913043, + "grad_norm": 0.7454515695571899, + "learning_rate": 9.454949944382648e-05, + "loss": 2.6624, + "step": 2376 + }, + { + "epoch": 31.70903010033445, + "grad_norm": 0.7100658416748047, + "learning_rate": 9.450500556173527e-05, + "loss": 3.0165, + "step": 2377 + }, + { + "epoch": 31.722408026755854, + "grad_norm": 0.6329564452171326, + "learning_rate": 9.446051167964405e-05, + "loss": 2.6719, + "step": 2378 + }, + { + "epoch": 31.735785953177256, + "grad_norm": 0.6892597675323486, + "learning_rate": 9.441601779755283e-05, + "loss": 2.9397, + "step": 2379 + }, + { + "epoch": 31.74916387959866, + "grad_norm": 0.7385604381561279, + "learning_rate": 9.437152391546163e-05, + "loss": 2.8576, + "step": 2380 + }, + { + "epoch": 31.762541806020067, + "grad_norm": 0.6018693447113037, + "learning_rate": 9.432703003337041e-05, + "loss": 2.6954, + "step": 2381 + }, + { + "epoch": 31.775919732441473, + "grad_norm": 0.8231506943702698, + "learning_rate": 9.42825361512792e-05, + "loss": 2.6043, + "step": 2382 + }, + { + "epoch": 31.789297658862875, + "grad_norm": 0.6664673686027527, + "learning_rate": 9.423804226918799e-05, + "loss": 2.6166, + "step": 2383 + }, + { + "epoch": 31.80267558528428, + "grad_norm": 0.6987932324409485, + "learning_rate": 9.419354838709677e-05, + "loss": 2.8913, + "step": 2384 + }, + { + "epoch": 31.816053511705686, + "grad_norm": 0.6309788823127747, + "learning_rate": 9.414905450500557e-05, + "loss": 2.9431, + "step": 2385 + }, + { + "epoch": 31.82943143812709, + "grad_norm": 0.6008022427558899, + "learning_rate": 9.410456062291435e-05, + "loss": 2.8104, + "step": 2386 + }, + { + "epoch": 31.842809364548494, + "grad_norm": 0.5797815918922424, + "learning_rate": 9.406006674082315e-05, + "loss": 2.685, + "step": 2387 + }, + { + "epoch": 31.8561872909699, + "grad_norm": 0.6194100379943848, + "learning_rate": 9.401557285873193e-05, + "loss": 2.9663, + "step": 2388 + }, + { + "epoch": 31.869565217391305, + "grad_norm": 0.633310079574585, + "learning_rate": 9.397107897664071e-05, + "loss": 2.9512, + "step": 2389 + }, + { + "epoch": 31.88294314381271, + "grad_norm": 0.6811971664428711, + "learning_rate": 9.39265850945495e-05, + "loss": 2.8803, + "step": 2390 + }, + { + "epoch": 31.896321070234112, + "grad_norm": 0.6683096885681152, + "learning_rate": 9.388209121245829e-05, + "loss": 2.8334, + "step": 2391 + }, + { + "epoch": 31.909698996655518, + "grad_norm": 0.6561327576637268, + "learning_rate": 9.383759733036708e-05, + "loss": 2.7997, + "step": 2392 + }, + { + "epoch": 31.923076923076923, + "grad_norm": 0.7540359497070312, + "learning_rate": 9.379310344827587e-05, + "loss": 2.9634, + "step": 2393 + }, + { + "epoch": 31.93645484949833, + "grad_norm": 0.6072854399681091, + "learning_rate": 9.374860956618465e-05, + "loss": 2.8317, + "step": 2394 + }, + { + "epoch": 31.94983277591973, + "grad_norm": 0.6614176630973816, + "learning_rate": 9.370411568409344e-05, + "loss": 2.9773, + "step": 2395 + }, + { + "epoch": 31.963210702341136, + "grad_norm": 0.6928074955940247, + "learning_rate": 9.365962180200223e-05, + "loss": 2.8665, + "step": 2396 + }, + { + "epoch": 31.976588628762542, + "grad_norm": 0.644250750541687, + "learning_rate": 9.361512791991102e-05, + "loss": 2.7206, + "step": 2397 + }, + { + "epoch": 31.989966555183948, + "grad_norm": 0.6529290080070496, + "learning_rate": 9.35706340378198e-05, + "loss": 2.7764, + "step": 2398 + }, + { + "epoch": 32.0, + "grad_norm": 0.7077373266220093, + "learning_rate": 9.352614015572859e-05, + "loss": 2.9435, + "step": 2399 + }, + { + "epoch": 32.013377926421406, + "grad_norm": 0.614982545375824, + "learning_rate": 9.348164627363738e-05, + "loss": 2.9358, + "step": 2400 + }, + { + "epoch": 32.02675585284281, + "grad_norm": 0.6013137102127075, + "learning_rate": 9.343715239154617e-05, + "loss": 2.7346, + "step": 2401 + }, + { + "epoch": 32.04013377926422, + "grad_norm": 0.653290867805481, + "learning_rate": 9.339265850945496e-05, + "loss": 2.9841, + "step": 2402 + }, + { + "epoch": 32.05351170568562, + "grad_norm": 0.6256197094917297, + "learning_rate": 9.334816462736374e-05, + "loss": 2.8465, + "step": 2403 + }, + { + "epoch": 32.06688963210702, + "grad_norm": 0.5798116326332092, + "learning_rate": 9.330367074527253e-05, + "loss": 2.5661, + "step": 2404 + }, + { + "epoch": 32.080267558528426, + "grad_norm": 0.6489708423614502, + "learning_rate": 9.325917686318132e-05, + "loss": 2.8186, + "step": 2405 + }, + { + "epoch": 32.09364548494983, + "grad_norm": 0.6225152015686035, + "learning_rate": 9.32146829810901e-05, + "loss": 2.6598, + "step": 2406 + }, + { + "epoch": 32.10702341137124, + "grad_norm": 0.6490108966827393, + "learning_rate": 9.31701890989989e-05, + "loss": 2.7589, + "step": 2407 + }, + { + "epoch": 32.12040133779264, + "grad_norm": 0.5600370764732361, + "learning_rate": 9.312569521690768e-05, + "loss": 2.6324, + "step": 2408 + }, + { + "epoch": 32.13377926421405, + "grad_norm": 0.6141876578330994, + "learning_rate": 9.308120133481647e-05, + "loss": 2.7371, + "step": 2409 + }, + { + "epoch": 32.147157190635454, + "grad_norm": 0.5927343368530273, + "learning_rate": 9.303670745272526e-05, + "loss": 2.9498, + "step": 2410 + }, + { + "epoch": 32.16053511705686, + "grad_norm": 0.686123788356781, + "learning_rate": 9.299221357063404e-05, + "loss": 2.698, + "step": 2411 + }, + { + "epoch": 32.17391304347826, + "grad_norm": 0.6357244253158569, + "learning_rate": 9.294771968854284e-05, + "loss": 2.7431, + "step": 2412 + }, + { + "epoch": 32.187290969899664, + "grad_norm": 0.6475897431373596, + "learning_rate": 9.290322580645162e-05, + "loss": 2.7808, + "step": 2413 + }, + { + "epoch": 32.20066889632107, + "grad_norm": 0.5825409293174744, + "learning_rate": 9.28587319243604e-05, + "loss": 2.6867, + "step": 2414 + }, + { + "epoch": 32.214046822742475, + "grad_norm": 0.6462129354476929, + "learning_rate": 9.28142380422692e-05, + "loss": 2.5075, + "step": 2415 + }, + { + "epoch": 32.22742474916388, + "grad_norm": 0.6217963099479675, + "learning_rate": 9.276974416017798e-05, + "loss": 3.1276, + "step": 2416 + }, + { + "epoch": 32.240802675585286, + "grad_norm": 1.1910638809204102, + "learning_rate": 9.272525027808678e-05, + "loss": 2.7246, + "step": 2417 + }, + { + "epoch": 32.25418060200669, + "grad_norm": 0.6929856538772583, + "learning_rate": 9.268075639599556e-05, + "loss": 2.9576, + "step": 2418 + }, + { + "epoch": 32.2675585284281, + "grad_norm": 0.6331362128257751, + "learning_rate": 9.263626251390433e-05, + "loss": 2.7267, + "step": 2419 + }, + { + "epoch": 32.280936454849495, + "grad_norm": 0.6223586797714233, + "learning_rate": 9.259176863181313e-05, + "loss": 2.5904, + "step": 2420 + }, + { + "epoch": 32.2943143812709, + "grad_norm": 0.6478082537651062, + "learning_rate": 9.254727474972191e-05, + "loss": 2.7874, + "step": 2421 + }, + { + "epoch": 32.30769230769231, + "grad_norm": 0.6407091617584229, + "learning_rate": 9.25027808676307e-05, + "loss": 2.7926, + "step": 2422 + }, + { + "epoch": 32.32107023411371, + "grad_norm": 0.5770341753959656, + "learning_rate": 9.245828698553949e-05, + "loss": 2.9566, + "step": 2423 + }, + { + "epoch": 32.33444816053512, + "grad_norm": 0.6170133948326111, + "learning_rate": 9.241379310344827e-05, + "loss": 2.8071, + "step": 2424 + }, + { + "epoch": 32.34782608695652, + "grad_norm": 0.6636959910392761, + "learning_rate": 9.236929922135707e-05, + "loss": 2.8502, + "step": 2425 + }, + { + "epoch": 32.36120401337793, + "grad_norm": 0.5865075588226318, + "learning_rate": 9.232480533926585e-05, + "loss": 2.9213, + "step": 2426 + }, + { + "epoch": 32.374581939799334, + "grad_norm": 0.6461585760116577, + "learning_rate": 9.228031145717464e-05, + "loss": 2.7773, + "step": 2427 + }, + { + "epoch": 32.38795986622073, + "grad_norm": 0.6047717928886414, + "learning_rate": 9.223581757508343e-05, + "loss": 2.7859, + "step": 2428 + }, + { + "epoch": 32.40133779264214, + "grad_norm": 0.6110160946846008, + "learning_rate": 9.219132369299221e-05, + "loss": 3.0866, + "step": 2429 + }, + { + "epoch": 32.414715719063544, + "grad_norm": 0.6165122389793396, + "learning_rate": 9.2146829810901e-05, + "loss": 2.5561, + "step": 2430 + }, + { + "epoch": 32.42809364548495, + "grad_norm": 0.6536113023757935, + "learning_rate": 9.210233592880979e-05, + "loss": 2.6207, + "step": 2431 + }, + { + "epoch": 32.441471571906355, + "grad_norm": 0.6606733202934265, + "learning_rate": 9.205784204671858e-05, + "loss": 2.8492, + "step": 2432 + }, + { + "epoch": 32.45484949832776, + "grad_norm": 0.5955353379249573, + "learning_rate": 9.201334816462737e-05, + "loss": 2.517, + "step": 2433 + }, + { + "epoch": 32.468227424749166, + "grad_norm": 0.6918089985847473, + "learning_rate": 9.196885428253615e-05, + "loss": 2.7207, + "step": 2434 + }, + { + "epoch": 32.48160535117057, + "grad_norm": 0.6295514702796936, + "learning_rate": 9.192436040044494e-05, + "loss": 2.8519, + "step": 2435 + }, + { + "epoch": 32.49498327759197, + "grad_norm": 0.5654889345169067, + "learning_rate": 9.187986651835373e-05, + "loss": 2.6741, + "step": 2436 + }, + { + "epoch": 32.508361204013376, + "grad_norm": 0.6156261563301086, + "learning_rate": 9.183537263626252e-05, + "loss": 2.8508, + "step": 2437 + }, + { + "epoch": 32.52173913043478, + "grad_norm": 0.578255832195282, + "learning_rate": 9.17908787541713e-05, + "loss": 2.7162, + "step": 2438 + }, + { + "epoch": 32.53511705685619, + "grad_norm": 0.6073182225227356, + "learning_rate": 9.174638487208009e-05, + "loss": 2.9915, + "step": 2439 + }, + { + "epoch": 32.54849498327759, + "grad_norm": 0.6045039892196655, + "learning_rate": 9.170189098998888e-05, + "loss": 2.7507, + "step": 2440 + }, + { + "epoch": 32.561872909699, + "grad_norm": 0.6131772398948669, + "learning_rate": 9.165739710789767e-05, + "loss": 2.8352, + "step": 2441 + }, + { + "epoch": 32.575250836120404, + "grad_norm": 0.5804582834243774, + "learning_rate": 9.161290322580646e-05, + "loss": 2.844, + "step": 2442 + }, + { + "epoch": 32.58862876254181, + "grad_norm": 0.6363027095794678, + "learning_rate": 9.156840934371524e-05, + "loss": 2.7884, + "step": 2443 + }, + { + "epoch": 32.60200668896321, + "grad_norm": 0.599229097366333, + "learning_rate": 9.152391546162403e-05, + "loss": 2.5051, + "step": 2444 + }, + { + "epoch": 32.61538461538461, + "grad_norm": 0.6206624507904053, + "learning_rate": 9.147942157953282e-05, + "loss": 2.9053, + "step": 2445 + }, + { + "epoch": 32.62876254180602, + "grad_norm": 0.6836562752723694, + "learning_rate": 9.14349276974416e-05, + "loss": 2.8332, + "step": 2446 + }, + { + "epoch": 32.642140468227424, + "grad_norm": 0.6113333702087402, + "learning_rate": 9.13904338153504e-05, + "loss": 2.8872, + "step": 2447 + }, + { + "epoch": 32.65551839464883, + "grad_norm": 0.5945793986320496, + "learning_rate": 9.134593993325918e-05, + "loss": 2.7071, + "step": 2448 + }, + { + "epoch": 32.668896321070235, + "grad_norm": 0.647544264793396, + "learning_rate": 9.130144605116796e-05, + "loss": 2.7016, + "step": 2449 + }, + { + "epoch": 32.68227424749164, + "grad_norm": 0.5891870856285095, + "learning_rate": 9.125695216907676e-05, + "loss": 2.7178, + "step": 2450 + }, + { + "epoch": 32.69565217391305, + "grad_norm": 0.5718060731887817, + "learning_rate": 9.121245828698554e-05, + "loss": 2.5202, + "step": 2451 + }, + { + "epoch": 32.709030100334445, + "grad_norm": 0.6582256555557251, + "learning_rate": 9.116796440489434e-05, + "loss": 2.9955, + "step": 2452 + }, + { + "epoch": 32.72240802675585, + "grad_norm": 0.6060263514518738, + "learning_rate": 9.112347052280312e-05, + "loss": 2.7279, + "step": 2453 + }, + { + "epoch": 32.735785953177256, + "grad_norm": 0.6045027375221252, + "learning_rate": 9.10789766407119e-05, + "loss": 2.7305, + "step": 2454 + }, + { + "epoch": 32.74916387959866, + "grad_norm": 0.5832952857017517, + "learning_rate": 9.10344827586207e-05, + "loss": 2.8703, + "step": 2455 + }, + { + "epoch": 32.76254180602007, + "grad_norm": 0.5899837613105774, + "learning_rate": 9.098998887652948e-05, + "loss": 2.5243, + "step": 2456 + }, + { + "epoch": 32.77591973244147, + "grad_norm": 0.6895177960395813, + "learning_rate": 9.094549499443828e-05, + "loss": 2.7974, + "step": 2457 + }, + { + "epoch": 32.78929765886288, + "grad_norm": 0.5995526909828186, + "learning_rate": 9.090100111234706e-05, + "loss": 2.6908, + "step": 2458 + }, + { + "epoch": 32.802675585284284, + "grad_norm": 0.592761754989624, + "learning_rate": 9.085650723025584e-05, + "loss": 2.7314, + "step": 2459 + }, + { + "epoch": 32.81605351170568, + "grad_norm": 0.5758498311042786, + "learning_rate": 9.081201334816463e-05, + "loss": 2.5282, + "step": 2460 + }, + { + "epoch": 32.82943143812709, + "grad_norm": 0.6093677282333374, + "learning_rate": 9.076751946607341e-05, + "loss": 2.9135, + "step": 2461 + }, + { + "epoch": 32.84280936454849, + "grad_norm": 0.617378830909729, + "learning_rate": 9.07230255839822e-05, + "loss": 2.6457, + "step": 2462 + }, + { + "epoch": 32.8561872909699, + "grad_norm": 0.5618733763694763, + "learning_rate": 9.067853170189099e-05, + "loss": 2.4018, + "step": 2463 + }, + { + "epoch": 32.869565217391305, + "grad_norm": 0.6228212714195251, + "learning_rate": 9.063403781979978e-05, + "loss": 2.7449, + "step": 2464 + }, + { + "epoch": 32.88294314381271, + "grad_norm": 0.6647076606750488, + "learning_rate": 9.058954393770856e-05, + "loss": 2.7208, + "step": 2465 + }, + { + "epoch": 32.896321070234116, + "grad_norm": 0.6458041071891785, + "learning_rate": 9.054505005561735e-05, + "loss": 2.9473, + "step": 2466 + }, + { + "epoch": 32.90969899665552, + "grad_norm": 0.5941137671470642, + "learning_rate": 9.050055617352614e-05, + "loss": 2.807, + "step": 2467 + }, + { + "epoch": 32.92307692307692, + "grad_norm": 0.7032221555709839, + "learning_rate": 9.045606229143493e-05, + "loss": 2.7777, + "step": 2468 + }, + { + "epoch": 32.936454849498325, + "grad_norm": 0.5803418755531311, + "learning_rate": 9.041156840934372e-05, + "loss": 2.7822, + "step": 2469 + }, + { + "epoch": 32.94983277591973, + "grad_norm": 0.5971820950508118, + "learning_rate": 9.03670745272525e-05, + "loss": 2.8719, + "step": 2470 + }, + { + "epoch": 32.96321070234114, + "grad_norm": 0.6287712454795837, + "learning_rate": 9.032258064516129e-05, + "loss": 2.9398, + "step": 2471 + }, + { + "epoch": 32.97658862876254, + "grad_norm": 0.5914057493209839, + "learning_rate": 9.027808676307008e-05, + "loss": 2.8867, + "step": 2472 + }, + { + "epoch": 32.98996655518395, + "grad_norm": 0.5863922834396362, + "learning_rate": 9.023359288097886e-05, + "loss": 2.8591, + "step": 2473 + }, + { + "epoch": 33.0, + "grad_norm": 0.7483989000320435, + "learning_rate": 9.018909899888766e-05, + "loss": 2.6477, + "step": 2474 + }, + { + "epoch": 33.013377926421406, + "grad_norm": 0.6205896139144897, + "learning_rate": 9.014460511679644e-05, + "loss": 2.5337, + "step": 2475 + }, + { + "epoch": 33.02675585284281, + "grad_norm": 0.5723957419395447, + "learning_rate": 9.010011123470522e-05, + "loss": 2.5932, + "step": 2476 + }, + { + "epoch": 33.04013377926422, + "grad_norm": 0.5575253963470459, + "learning_rate": 9.005561735261402e-05, + "loss": 2.6566, + "step": 2477 + }, + { + "epoch": 33.05351170568562, + "grad_norm": 0.5518794655799866, + "learning_rate": 9.00111234705228e-05, + "loss": 2.6017, + "step": 2478 + }, + { + "epoch": 33.06688963210702, + "grad_norm": 0.5580832958221436, + "learning_rate": 8.99666295884316e-05, + "loss": 2.5346, + "step": 2479 + }, + { + "epoch": 33.080267558528426, + "grad_norm": 0.5831477046012878, + "learning_rate": 8.992213570634038e-05, + "loss": 2.4491, + "step": 2480 + }, + { + "epoch": 33.09364548494983, + "grad_norm": 0.5773186683654785, + "learning_rate": 8.987764182424916e-05, + "loss": 2.6822, + "step": 2481 + }, + { + "epoch": 33.10702341137124, + "grad_norm": 0.63813316822052, + "learning_rate": 8.983314794215796e-05, + "loss": 2.6537, + "step": 2482 + }, + { + "epoch": 33.12040133779264, + "grad_norm": 0.6081259250640869, + "learning_rate": 8.978865406006674e-05, + "loss": 2.6482, + "step": 2483 + }, + { + "epoch": 33.13377926421405, + "grad_norm": 0.5737481117248535, + "learning_rate": 8.974416017797554e-05, + "loss": 2.6858, + "step": 2484 + }, + { + "epoch": 33.147157190635454, + "grad_norm": 0.6300746202468872, + "learning_rate": 8.969966629588432e-05, + "loss": 2.7791, + "step": 2485 + }, + { + "epoch": 33.16053511705686, + "grad_norm": 0.6003954410552979, + "learning_rate": 8.96551724137931e-05, + "loss": 2.8088, + "step": 2486 + }, + { + "epoch": 33.17391304347826, + "grad_norm": 0.5618523955345154, + "learning_rate": 8.96106785317019e-05, + "loss": 2.6345, + "step": 2487 + }, + { + "epoch": 33.187290969899664, + "grad_norm": 0.6398965120315552, + "learning_rate": 8.956618464961068e-05, + "loss": 2.7798, + "step": 2488 + }, + { + "epoch": 33.20066889632107, + "grad_norm": 0.6220288276672363, + "learning_rate": 8.952169076751948e-05, + "loss": 2.8217, + "step": 2489 + }, + { + "epoch": 33.214046822742475, + "grad_norm": 0.6096678376197815, + "learning_rate": 8.947719688542826e-05, + "loss": 2.6255, + "step": 2490 + }, + { + "epoch": 33.22742474916388, + "grad_norm": 0.5997774600982666, + "learning_rate": 8.943270300333704e-05, + "loss": 2.8361, + "step": 2491 + }, + { + "epoch": 33.240802675585286, + "grad_norm": 0.6017264127731323, + "learning_rate": 8.938820912124584e-05, + "loss": 2.7003, + "step": 2492 + }, + { + "epoch": 33.25418060200669, + "grad_norm": 0.6213170289993286, + "learning_rate": 8.934371523915462e-05, + "loss": 2.8992, + "step": 2493 + }, + { + "epoch": 33.2675585284281, + "grad_norm": 0.6049240231513977, + "learning_rate": 8.929922135706342e-05, + "loss": 2.79, + "step": 2494 + }, + { + "epoch": 33.280936454849495, + "grad_norm": 0.6120469570159912, + "learning_rate": 8.92547274749722e-05, + "loss": 2.7695, + "step": 2495 + }, + { + "epoch": 33.2943143812709, + "grad_norm": 0.5764347314834595, + "learning_rate": 8.921023359288098e-05, + "loss": 2.5934, + "step": 2496 + }, + { + "epoch": 33.30769230769231, + "grad_norm": 0.5952526926994324, + "learning_rate": 8.916573971078978e-05, + "loss": 2.8907, + "step": 2497 + }, + { + "epoch": 33.32107023411371, + "grad_norm": 0.5542176961898804, + "learning_rate": 8.912124582869856e-05, + "loss": 2.6922, + "step": 2498 + }, + { + "epoch": 33.33444816053512, + "grad_norm": 0.622314453125, + "learning_rate": 8.907675194660736e-05, + "loss": 2.7322, + "step": 2499 + }, + { + "epoch": 33.34782608695652, + "grad_norm": 0.5831202864646912, + "learning_rate": 8.903225806451614e-05, + "loss": 2.766, + "step": 2500 + }, + { + "epoch": 33.36120401337793, + "grad_norm": 0.6198363900184631, + "learning_rate": 8.898776418242492e-05, + "loss": 3.0431, + "step": 2501 + }, + { + "epoch": 33.374581939799334, + "grad_norm": 0.6069219708442688, + "learning_rate": 8.89432703003337e-05, + "loss": 2.8924, + "step": 2502 + }, + { + "epoch": 33.38795986622073, + "grad_norm": 0.6229302287101746, + "learning_rate": 8.889877641824249e-05, + "loss": 2.807, + "step": 2503 + }, + { + "epoch": 33.40133779264214, + "grad_norm": 0.6178746819496155, + "learning_rate": 8.885428253615128e-05, + "loss": 2.7833, + "step": 2504 + }, + { + "epoch": 33.414715719063544, + "grad_norm": 0.5880036950111389, + "learning_rate": 8.880978865406006e-05, + "loss": 2.7688, + "step": 2505 + }, + { + "epoch": 33.42809364548495, + "grad_norm": 0.6083427667617798, + "learning_rate": 8.876529477196886e-05, + "loss": 2.5995, + "step": 2506 + }, + { + "epoch": 33.441471571906355, + "grad_norm": 0.5846229195594788, + "learning_rate": 8.872080088987764e-05, + "loss": 2.7198, + "step": 2507 + }, + { + "epoch": 33.45484949832776, + "grad_norm": 0.602645754814148, + "learning_rate": 8.867630700778642e-05, + "loss": 2.8395, + "step": 2508 + }, + { + "epoch": 33.468227424749166, + "grad_norm": 0.5731549859046936, + "learning_rate": 8.863181312569522e-05, + "loss": 2.6083, + "step": 2509 + }, + { + "epoch": 33.48160535117057, + "grad_norm": 0.6011414527893066, + "learning_rate": 8.8587319243604e-05, + "loss": 2.6036, + "step": 2510 + }, + { + "epoch": 33.49498327759197, + "grad_norm": 0.5945054292678833, + "learning_rate": 8.85428253615128e-05, + "loss": 2.8461, + "step": 2511 + }, + { + "epoch": 33.508361204013376, + "grad_norm": 0.6700778007507324, + "learning_rate": 8.849833147942158e-05, + "loss": 2.9723, + "step": 2512 + }, + { + "epoch": 33.52173913043478, + "grad_norm": 0.5661737322807312, + "learning_rate": 8.845383759733036e-05, + "loss": 2.5432, + "step": 2513 + }, + { + "epoch": 33.53511705685619, + "grad_norm": 0.615665078163147, + "learning_rate": 8.840934371523916e-05, + "loss": 2.5919, + "step": 2514 + }, + { + "epoch": 33.54849498327759, + "grad_norm": 0.6195909380912781, + "learning_rate": 8.836484983314794e-05, + "loss": 2.6273, + "step": 2515 + }, + { + "epoch": 33.561872909699, + "grad_norm": 0.6008119583129883, + "learning_rate": 8.832035595105674e-05, + "loss": 2.6099, + "step": 2516 + }, + { + "epoch": 33.575250836120404, + "grad_norm": 0.6525030732154846, + "learning_rate": 8.827586206896552e-05, + "loss": 2.7715, + "step": 2517 + }, + { + "epoch": 33.58862876254181, + "grad_norm": 0.6317800879478455, + "learning_rate": 8.82313681868743e-05, + "loss": 3.1264, + "step": 2518 + }, + { + "epoch": 33.60200668896321, + "grad_norm": 0.5794599056243896, + "learning_rate": 8.81868743047831e-05, + "loss": 2.5186, + "step": 2519 + }, + { + "epoch": 33.61538461538461, + "grad_norm": 0.633000373840332, + "learning_rate": 8.814238042269188e-05, + "loss": 2.7709, + "step": 2520 + }, + { + "epoch": 33.62876254180602, + "grad_norm": 0.6132048964500427, + "learning_rate": 8.809788654060068e-05, + "loss": 2.6311, + "step": 2521 + }, + { + "epoch": 33.642140468227424, + "grad_norm": 0.6357754468917847, + "learning_rate": 8.805339265850946e-05, + "loss": 2.8635, + "step": 2522 + }, + { + "epoch": 33.65551839464883, + "grad_norm": 0.5678791403770447, + "learning_rate": 8.800889877641824e-05, + "loss": 2.773, + "step": 2523 + }, + { + "epoch": 33.668896321070235, + "grad_norm": 0.5938114523887634, + "learning_rate": 8.796440489432704e-05, + "loss": 2.6978, + "step": 2524 + }, + { + "epoch": 33.68227424749164, + "grad_norm": 0.6006829142570496, + "learning_rate": 8.791991101223582e-05, + "loss": 2.8147, + "step": 2525 + }, + { + "epoch": 33.69565217391305, + "grad_norm": 0.6094152331352234, + "learning_rate": 8.787541713014462e-05, + "loss": 2.9597, + "step": 2526 + }, + { + "epoch": 33.709030100334445, + "grad_norm": 0.6261194944381714, + "learning_rate": 8.78309232480534e-05, + "loss": 2.7116, + "step": 2527 + }, + { + "epoch": 33.72240802675585, + "grad_norm": 0.6670746803283691, + "learning_rate": 8.778642936596218e-05, + "loss": 2.8481, + "step": 2528 + }, + { + "epoch": 33.735785953177256, + "grad_norm": 0.5976547598838806, + "learning_rate": 8.774193548387098e-05, + "loss": 2.7368, + "step": 2529 + }, + { + "epoch": 33.74916387959866, + "grad_norm": 0.5903379321098328, + "learning_rate": 8.769744160177976e-05, + "loss": 2.784, + "step": 2530 + }, + { + "epoch": 33.76254180602007, + "grad_norm": 0.604728102684021, + "learning_rate": 8.765294771968855e-05, + "loss": 2.7636, + "step": 2531 + }, + { + "epoch": 33.77591973244147, + "grad_norm": 0.6338250041007996, + "learning_rate": 8.760845383759734e-05, + "loss": 2.8111, + "step": 2532 + }, + { + "epoch": 33.78929765886288, + "grad_norm": 0.6076844334602356, + "learning_rate": 8.756395995550612e-05, + "loss": 2.7592, + "step": 2533 + }, + { + "epoch": 33.802675585284284, + "grad_norm": 0.6152723431587219, + "learning_rate": 8.751946607341492e-05, + "loss": 2.889, + "step": 2534 + }, + { + "epoch": 33.81605351170568, + "grad_norm": 0.5518035292625427, + "learning_rate": 8.74749721913237e-05, + "loss": 2.5681, + "step": 2535 + }, + { + "epoch": 33.82943143812709, + "grad_norm": 0.624555766582489, + "learning_rate": 8.74304783092325e-05, + "loss": 2.6651, + "step": 2536 + }, + { + "epoch": 33.84280936454849, + "grad_norm": 0.6052064895629883, + "learning_rate": 8.738598442714128e-05, + "loss": 2.6292, + "step": 2537 + }, + { + "epoch": 33.8561872909699, + "grad_norm": 0.6039239168167114, + "learning_rate": 8.734149054505006e-05, + "loss": 2.9519, + "step": 2538 + }, + { + "epoch": 33.869565217391305, + "grad_norm": 0.633802056312561, + "learning_rate": 8.729699666295885e-05, + "loss": 2.8169, + "step": 2539 + }, + { + "epoch": 33.88294314381271, + "grad_norm": 0.6555929780006409, + "learning_rate": 8.725250278086764e-05, + "loss": 2.7988, + "step": 2540 + }, + { + "epoch": 33.896321070234116, + "grad_norm": 0.6146313548088074, + "learning_rate": 8.720800889877643e-05, + "loss": 2.7496, + "step": 2541 + }, + { + "epoch": 33.90969899665552, + "grad_norm": 0.5777727365493774, + "learning_rate": 8.716351501668522e-05, + "loss": 2.7106, + "step": 2542 + }, + { + "epoch": 33.92307692307692, + "grad_norm": 0.6095665097236633, + "learning_rate": 8.7119021134594e-05, + "loss": 2.8617, + "step": 2543 + }, + { + "epoch": 33.936454849498325, + "grad_norm": 0.5426962375640869, + "learning_rate": 8.707452725250278e-05, + "loss": 2.7434, + "step": 2544 + }, + { + "epoch": 33.94983277591973, + "grad_norm": 0.5962931513786316, + "learning_rate": 8.703003337041156e-05, + "loss": 2.6328, + "step": 2545 + }, + { + "epoch": 33.96321070234114, + "grad_norm": 0.609198808670044, + "learning_rate": 8.698553948832036e-05, + "loss": 3.0128, + "step": 2546 + }, + { + "epoch": 33.97658862876254, + "grad_norm": 0.5592045783996582, + "learning_rate": 8.694104560622914e-05, + "loss": 2.4845, + "step": 2547 + }, + { + "epoch": 33.98996655518395, + "grad_norm": 0.6074299812316895, + "learning_rate": 8.689655172413794e-05, + "loss": 2.76, + "step": 2548 + }, + { + "epoch": 34.0, + "grad_norm": 0.7298697233200073, + "learning_rate": 8.685205784204672e-05, + "loss": 2.7132, + "step": 2549 + }, + { + "epoch": 34.013377926421406, + "grad_norm": 0.5669110417366028, + "learning_rate": 8.68075639599555e-05, + "loss": 2.6124, + "step": 2550 + }, + { + "epoch": 34.02675585284281, + "grad_norm": 0.6375609636306763, + "learning_rate": 8.67630700778643e-05, + "loss": 2.8555, + "step": 2551 + }, + { + "epoch": 34.04013377926422, + "grad_norm": 0.6021872162818909, + "learning_rate": 8.671857619577308e-05, + "loss": 2.6396, + "step": 2552 + }, + { + "epoch": 34.05351170568562, + "grad_norm": 0.5895376801490784, + "learning_rate": 8.667408231368188e-05, + "loss": 2.7079, + "step": 2553 + }, + { + "epoch": 34.06688963210702, + "grad_norm": 0.5841491222381592, + "learning_rate": 8.662958843159066e-05, + "loss": 2.5644, + "step": 2554 + }, + { + "epoch": 34.080267558528426, + "grad_norm": 0.6124491691589355, + "learning_rate": 8.658509454949944e-05, + "loss": 2.8281, + "step": 2555 + }, + { + "epoch": 34.09364548494983, + "grad_norm": 0.5991553664207458, + "learning_rate": 8.654060066740824e-05, + "loss": 2.7442, + "step": 2556 + }, + { + "epoch": 34.10702341137124, + "grad_norm": 0.5902360081672668, + "learning_rate": 8.649610678531702e-05, + "loss": 2.6759, + "step": 2557 + }, + { + "epoch": 34.12040133779264, + "grad_norm": 0.590696394443512, + "learning_rate": 8.645161290322581e-05, + "loss": 2.6966, + "step": 2558 + }, + { + "epoch": 34.13377926421405, + "grad_norm": 0.5867067575454712, + "learning_rate": 8.64071190211346e-05, + "loss": 2.7264, + "step": 2559 + }, + { + "epoch": 34.147157190635454, + "grad_norm": 0.6447772979736328, + "learning_rate": 8.636262513904338e-05, + "loss": 2.7659, + "step": 2560 + }, + { + "epoch": 34.16053511705686, + "grad_norm": 0.6006119251251221, + "learning_rate": 8.631813125695218e-05, + "loss": 2.7641, + "step": 2561 + }, + { + "epoch": 34.17391304347826, + "grad_norm": 0.593132495880127, + "learning_rate": 8.627363737486096e-05, + "loss": 2.6671, + "step": 2562 + }, + { + "epoch": 34.187290969899664, + "grad_norm": 0.5959640145301819, + "learning_rate": 8.622914349276975e-05, + "loss": 2.7752, + "step": 2563 + }, + { + "epoch": 34.20066889632107, + "grad_norm": 0.6102313995361328, + "learning_rate": 8.618464961067854e-05, + "loss": 2.8268, + "step": 2564 + }, + { + "epoch": 34.214046822742475, + "grad_norm": 0.6111622452735901, + "learning_rate": 8.614015572858732e-05, + "loss": 2.5703, + "step": 2565 + }, + { + "epoch": 34.22742474916388, + "grad_norm": 0.5814309120178223, + "learning_rate": 8.609566184649611e-05, + "loss": 2.5823, + "step": 2566 + }, + { + "epoch": 34.240802675585286, + "grad_norm": 0.5712332129478455, + "learning_rate": 8.60511679644049e-05, + "loss": 2.3881, + "step": 2567 + }, + { + "epoch": 34.25418060200669, + "grad_norm": 0.5990560054779053, + "learning_rate": 8.600667408231369e-05, + "loss": 2.6209, + "step": 2568 + }, + { + "epoch": 34.2675585284281, + "grad_norm": 0.6006229519844055, + "learning_rate": 8.596218020022248e-05, + "loss": 2.6623, + "step": 2569 + }, + { + "epoch": 34.280936454849495, + "grad_norm": 0.607763946056366, + "learning_rate": 8.591768631813126e-05, + "loss": 2.7403, + "step": 2570 + }, + { + "epoch": 34.2943143812709, + "grad_norm": 0.6283872127532959, + "learning_rate": 8.587319243604005e-05, + "loss": 2.9186, + "step": 2571 + }, + { + "epoch": 34.30769230769231, + "grad_norm": 0.6027917861938477, + "learning_rate": 8.582869855394884e-05, + "loss": 2.7604, + "step": 2572 + }, + { + "epoch": 34.32107023411371, + "grad_norm": 0.617796778678894, + "learning_rate": 8.578420467185763e-05, + "loss": 2.2285, + "step": 2573 + }, + { + "epoch": 34.33444816053512, + "grad_norm": 0.6242703199386597, + "learning_rate": 8.573971078976641e-05, + "loss": 2.7596, + "step": 2574 + }, + { + "epoch": 34.34782608695652, + "grad_norm": 0.6676938533782959, + "learning_rate": 8.56952169076752e-05, + "loss": 2.8307, + "step": 2575 + }, + { + "epoch": 34.36120401337793, + "grad_norm": 0.616120457649231, + "learning_rate": 8.565072302558399e-05, + "loss": 2.562, + "step": 2576 + }, + { + "epoch": 34.374581939799334, + "grad_norm": 0.6003774404525757, + "learning_rate": 8.560622914349277e-05, + "loss": 2.913, + "step": 2577 + }, + { + "epoch": 34.38795986622073, + "grad_norm": 0.615688145160675, + "learning_rate": 8.556173526140157e-05, + "loss": 2.707, + "step": 2578 + }, + { + "epoch": 34.40133779264214, + "grad_norm": 0.6033686995506287, + "learning_rate": 8.551724137931035e-05, + "loss": 2.8834, + "step": 2579 + }, + { + "epoch": 34.414715719063544, + "grad_norm": 0.5940545201301575, + "learning_rate": 8.547274749721914e-05, + "loss": 2.5229, + "step": 2580 + }, + { + "epoch": 34.42809364548495, + "grad_norm": 0.587794303894043, + "learning_rate": 8.542825361512793e-05, + "loss": 2.4714, + "step": 2581 + }, + { + "epoch": 34.441471571906355, + "grad_norm": 0.6004595160484314, + "learning_rate": 8.538375973303671e-05, + "loss": 2.7798, + "step": 2582 + }, + { + "epoch": 34.45484949832776, + "grad_norm": 0.5780850648880005, + "learning_rate": 8.533926585094551e-05, + "loss": 2.5977, + "step": 2583 + }, + { + "epoch": 34.468227424749166, + "grad_norm": 0.6270626187324524, + "learning_rate": 8.529477196885428e-05, + "loss": 2.8688, + "step": 2584 + }, + { + "epoch": 34.48160535117057, + "grad_norm": 0.5620662569999695, + "learning_rate": 8.525027808676307e-05, + "loss": 2.5738, + "step": 2585 + }, + { + "epoch": 34.49498327759197, + "grad_norm": 0.6294083595275879, + "learning_rate": 8.520578420467186e-05, + "loss": 2.9486, + "step": 2586 + }, + { + "epoch": 34.508361204013376, + "grad_norm": 0.596815824508667, + "learning_rate": 8.516129032258064e-05, + "loss": 2.7196, + "step": 2587 + }, + { + "epoch": 34.52173913043478, + "grad_norm": 0.5836464166641235, + "learning_rate": 8.511679644048944e-05, + "loss": 2.7812, + "step": 2588 + }, + { + "epoch": 34.53511705685619, + "grad_norm": 0.6028749942779541, + "learning_rate": 8.507230255839822e-05, + "loss": 2.7632, + "step": 2589 + }, + { + "epoch": 34.54849498327759, + "grad_norm": 0.602236807346344, + "learning_rate": 8.502780867630701e-05, + "loss": 2.7839, + "step": 2590 + }, + { + "epoch": 34.561872909699, + "grad_norm": 0.6275736093521118, + "learning_rate": 8.49833147942158e-05, + "loss": 2.7257, + "step": 2591 + }, + { + "epoch": 34.575250836120404, + "grad_norm": 0.5825592875480652, + "learning_rate": 8.493882091212458e-05, + "loss": 2.6373, + "step": 2592 + }, + { + "epoch": 34.58862876254181, + "grad_norm": 0.6081156134605408, + "learning_rate": 8.489432703003337e-05, + "loss": 2.7701, + "step": 2593 + }, + { + "epoch": 34.60200668896321, + "grad_norm": 0.5824214220046997, + "learning_rate": 8.484983314794216e-05, + "loss": 2.7086, + "step": 2594 + }, + { + "epoch": 34.61538461538461, + "grad_norm": 0.5969799160957336, + "learning_rate": 8.480533926585095e-05, + "loss": 2.6757, + "step": 2595 + }, + { + "epoch": 34.62876254180602, + "grad_norm": 0.6191763877868652, + "learning_rate": 8.476084538375974e-05, + "loss": 2.8289, + "step": 2596 + }, + { + "epoch": 34.642140468227424, + "grad_norm": 0.570646345615387, + "learning_rate": 8.471635150166852e-05, + "loss": 2.6874, + "step": 2597 + }, + { + "epoch": 34.65551839464883, + "grad_norm": 0.6193318963050842, + "learning_rate": 8.467185761957731e-05, + "loss": 2.7932, + "step": 2598 + }, + { + "epoch": 34.668896321070235, + "grad_norm": 0.607635498046875, + "learning_rate": 8.46273637374861e-05, + "loss": 2.7382, + "step": 2599 + }, + { + "epoch": 34.68227424749164, + "grad_norm": 0.5707620978355408, + "learning_rate": 8.458286985539489e-05, + "loss": 2.5729, + "step": 2600 + }, + { + "epoch": 34.69565217391305, + "grad_norm": 0.5609964728355408, + "learning_rate": 8.453837597330367e-05, + "loss": 2.6938, + "step": 2601 + }, + { + "epoch": 34.709030100334445, + "grad_norm": 0.6459850072860718, + "learning_rate": 8.449388209121246e-05, + "loss": 2.7604, + "step": 2602 + }, + { + "epoch": 34.72240802675585, + "grad_norm": 0.5685142874717712, + "learning_rate": 8.444938820912125e-05, + "loss": 2.6518, + "step": 2603 + }, + { + "epoch": 34.735785953177256, + "grad_norm": 0.5800818204879761, + "learning_rate": 8.440489432703003e-05, + "loss": 2.6193, + "step": 2604 + }, + { + "epoch": 34.74916387959866, + "grad_norm": 0.6133848428726196, + "learning_rate": 8.436040044493883e-05, + "loss": 2.8524, + "step": 2605 + }, + { + "epoch": 34.76254180602007, + "grad_norm": 0.5716528296470642, + "learning_rate": 8.431590656284761e-05, + "loss": 2.9175, + "step": 2606 + }, + { + "epoch": 34.77591973244147, + "grad_norm": 0.5581215023994446, + "learning_rate": 8.42714126807564e-05, + "loss": 2.6181, + "step": 2607 + }, + { + "epoch": 34.78929765886288, + "grad_norm": 0.5720329284667969, + "learning_rate": 8.422691879866519e-05, + "loss": 2.6401, + "step": 2608 + }, + { + "epoch": 34.802675585284284, + "grad_norm": 0.6074360609054565, + "learning_rate": 8.418242491657397e-05, + "loss": 2.8911, + "step": 2609 + }, + { + "epoch": 34.81605351170568, + "grad_norm": 0.5685964226722717, + "learning_rate": 8.413793103448277e-05, + "loss": 2.868, + "step": 2610 + }, + { + "epoch": 34.82943143812709, + "grad_norm": 0.5762805938720703, + "learning_rate": 8.409343715239155e-05, + "loss": 2.7641, + "step": 2611 + }, + { + "epoch": 34.84280936454849, + "grad_norm": 0.6570479869842529, + "learning_rate": 8.404894327030033e-05, + "loss": 2.8734, + "step": 2612 + }, + { + "epoch": 34.8561872909699, + "grad_norm": 0.6143054366111755, + "learning_rate": 8.400444938820913e-05, + "loss": 2.7792, + "step": 2613 + }, + { + "epoch": 34.869565217391305, + "grad_norm": 0.6144394278526306, + "learning_rate": 8.395995550611791e-05, + "loss": 2.8023, + "step": 2614 + }, + { + "epoch": 34.88294314381271, + "grad_norm": 0.6102853417396545, + "learning_rate": 8.391546162402671e-05, + "loss": 2.7508, + "step": 2615 + }, + { + "epoch": 34.896321070234116, + "grad_norm": 0.6034586429595947, + "learning_rate": 8.387096774193549e-05, + "loss": 2.5528, + "step": 2616 + }, + { + "epoch": 34.90969899665552, + "grad_norm": 0.6032466292381287, + "learning_rate": 8.382647385984427e-05, + "loss": 2.7771, + "step": 2617 + }, + { + "epoch": 34.92307692307692, + "grad_norm": 0.6024323105812073, + "learning_rate": 8.378197997775307e-05, + "loss": 2.5514, + "step": 2618 + }, + { + "epoch": 34.936454849498325, + "grad_norm": 0.6762076616287231, + "learning_rate": 8.373748609566185e-05, + "loss": 2.8336, + "step": 2619 + }, + { + "epoch": 34.94983277591973, + "grad_norm": 0.5701456069946289, + "learning_rate": 8.369299221357065e-05, + "loss": 2.4598, + "step": 2620 + }, + { + "epoch": 34.96321070234114, + "grad_norm": 0.5878699421882629, + "learning_rate": 8.364849833147943e-05, + "loss": 2.6511, + "step": 2621 + }, + { + "epoch": 34.97658862876254, + "grad_norm": 0.6187490820884705, + "learning_rate": 8.360400444938821e-05, + "loss": 2.6226, + "step": 2622 + }, + { + "epoch": 34.98996655518395, + "grad_norm": 0.5954105854034424, + "learning_rate": 8.355951056729701e-05, + "loss": 2.7115, + "step": 2623 + }, + { + "epoch": 35.0, + "grad_norm": 0.7193446755409241, + "learning_rate": 8.351501668520579e-05, + "loss": 2.7859, + "step": 2624 + }, + { + "epoch": 35.013377926421406, + "grad_norm": 0.6013773083686829, + "learning_rate": 8.347052280311457e-05, + "loss": 2.8222, + "step": 2625 + }, + { + "epoch": 35.02675585284281, + "grad_norm": 0.594046950340271, + "learning_rate": 8.342602892102336e-05, + "loss": 2.6455, + "step": 2626 + }, + { + "epoch": 35.04013377926422, + "grad_norm": 0.5741755962371826, + "learning_rate": 8.338153503893215e-05, + "loss": 2.6076, + "step": 2627 + }, + { + "epoch": 35.05351170568562, + "grad_norm": 0.6248610019683838, + "learning_rate": 8.333704115684093e-05, + "loss": 2.8714, + "step": 2628 + }, + { + "epoch": 35.06688963210702, + "grad_norm": 0.6038011312484741, + "learning_rate": 8.329254727474972e-05, + "loss": 2.6117, + "step": 2629 + }, + { + "epoch": 35.080267558528426, + "grad_norm": 0.618163526058197, + "learning_rate": 8.324805339265851e-05, + "loss": 2.8792, + "step": 2630 + }, + { + "epoch": 35.09364548494983, + "grad_norm": 0.5919619798660278, + "learning_rate": 8.32035595105673e-05, + "loss": 2.7317, + "step": 2631 + }, + { + "epoch": 35.10702341137124, + "grad_norm": 0.5996441841125488, + "learning_rate": 8.315906562847609e-05, + "loss": 2.6743, + "step": 2632 + }, + { + "epoch": 35.12040133779264, + "grad_norm": 0.5751325488090515, + "learning_rate": 8.311457174638487e-05, + "loss": 2.7814, + "step": 2633 + }, + { + "epoch": 35.13377926421405, + "grad_norm": 0.5554592609405518, + "learning_rate": 8.307007786429366e-05, + "loss": 2.4874, + "step": 2634 + }, + { + "epoch": 35.147157190635454, + "grad_norm": 0.5908927321434021, + "learning_rate": 8.302558398220245e-05, + "loss": 2.6838, + "step": 2635 + }, + { + "epoch": 35.16053511705686, + "grad_norm": 0.6261016130447388, + "learning_rate": 8.298109010011123e-05, + "loss": 2.8281, + "step": 2636 + }, + { + "epoch": 35.17391304347826, + "grad_norm": 0.5825657844543457, + "learning_rate": 8.293659621802003e-05, + "loss": 2.806, + "step": 2637 + }, + { + "epoch": 35.187290969899664, + "grad_norm": 0.5951602458953857, + "learning_rate": 8.289210233592881e-05, + "loss": 2.728, + "step": 2638 + }, + { + "epoch": 35.20066889632107, + "grad_norm": 0.6045755743980408, + "learning_rate": 8.28476084538376e-05, + "loss": 2.5632, + "step": 2639 + }, + { + "epoch": 35.214046822742475, + "grad_norm": 0.5793343782424927, + "learning_rate": 8.280311457174639e-05, + "loss": 2.4912, + "step": 2640 + }, + { + "epoch": 35.22742474916388, + "grad_norm": 0.5778194665908813, + "learning_rate": 8.275862068965517e-05, + "loss": 2.649, + "step": 2641 + }, + { + "epoch": 35.240802675585286, + "grad_norm": 0.5981440544128418, + "learning_rate": 8.271412680756397e-05, + "loss": 2.7538, + "step": 2642 + }, + { + "epoch": 35.25418060200669, + "grad_norm": 0.601005494594574, + "learning_rate": 8.266963292547275e-05, + "loss": 2.5793, + "step": 2643 + }, + { + "epoch": 35.2675585284281, + "grad_norm": 0.5964012742042542, + "learning_rate": 8.262513904338153e-05, + "loss": 2.7611, + "step": 2644 + }, + { + "epoch": 35.280936454849495, + "grad_norm": 0.603792130947113, + "learning_rate": 8.258064516129033e-05, + "loss": 2.5607, + "step": 2645 + }, + { + "epoch": 35.2943143812709, + "grad_norm": 0.6139290928840637, + "learning_rate": 8.253615127919911e-05, + "loss": 2.7616, + "step": 2646 + }, + { + "epoch": 35.30769230769231, + "grad_norm": 0.5915001034736633, + "learning_rate": 8.249165739710791e-05, + "loss": 2.7107, + "step": 2647 + }, + { + "epoch": 35.32107023411371, + "grad_norm": 0.6058906316757202, + "learning_rate": 8.244716351501669e-05, + "loss": 2.7078, + "step": 2648 + }, + { + "epoch": 35.33444816053512, + "grad_norm": 0.6192322969436646, + "learning_rate": 8.240266963292547e-05, + "loss": 2.6736, + "step": 2649 + }, + { + "epoch": 35.34782608695652, + "grad_norm": 0.6301470994949341, + "learning_rate": 8.235817575083427e-05, + "loss": 2.6024, + "step": 2650 + }, + { + "epoch": 35.36120401337793, + "grad_norm": 0.59424889087677, + "learning_rate": 8.231368186874305e-05, + "loss": 2.7615, + "step": 2651 + }, + { + "epoch": 35.374581939799334, + "grad_norm": 0.5878474712371826, + "learning_rate": 8.226918798665185e-05, + "loss": 2.7163, + "step": 2652 + }, + { + "epoch": 35.38795986622073, + "grad_norm": 0.6637581586837769, + "learning_rate": 8.222469410456063e-05, + "loss": 2.3804, + "step": 2653 + }, + { + "epoch": 35.40133779264214, + "grad_norm": 0.6142652630805969, + "learning_rate": 8.218020022246941e-05, + "loss": 2.8305, + "step": 2654 + }, + { + "epoch": 35.414715719063544, + "grad_norm": 0.59095698595047, + "learning_rate": 8.213570634037821e-05, + "loss": 2.7017, + "step": 2655 + }, + { + "epoch": 35.42809364548495, + "grad_norm": 0.6011471748352051, + "learning_rate": 8.209121245828699e-05, + "loss": 2.5261, + "step": 2656 + }, + { + "epoch": 35.441471571906355, + "grad_norm": 0.6142131686210632, + "learning_rate": 8.204671857619579e-05, + "loss": 2.8304, + "step": 2657 + }, + { + "epoch": 35.45484949832776, + "grad_norm": 0.5916325449943542, + "learning_rate": 8.200222469410457e-05, + "loss": 2.5681, + "step": 2658 + }, + { + "epoch": 35.468227424749166, + "grad_norm": 0.6357722282409668, + "learning_rate": 8.195773081201335e-05, + "loss": 2.6106, + "step": 2659 + }, + { + "epoch": 35.48160535117057, + "grad_norm": 0.6173760890960693, + "learning_rate": 8.191323692992215e-05, + "loss": 2.644, + "step": 2660 + }, + { + "epoch": 35.49498327759197, + "grad_norm": 0.6166900396347046, + "learning_rate": 8.186874304783093e-05, + "loss": 2.6629, + "step": 2661 + }, + { + "epoch": 35.508361204013376, + "grad_norm": 0.593647837638855, + "learning_rate": 8.182424916573973e-05, + "loss": 2.722, + "step": 2662 + }, + { + "epoch": 35.52173913043478, + "grad_norm": 0.6336297988891602, + "learning_rate": 8.177975528364851e-05, + "loss": 2.7934, + "step": 2663 + }, + { + "epoch": 35.53511705685619, + "grad_norm": 0.6457211971282959, + "learning_rate": 8.173526140155729e-05, + "loss": 2.5204, + "step": 2664 + }, + { + "epoch": 35.54849498327759, + "grad_norm": 0.5988196730613708, + "learning_rate": 8.169076751946609e-05, + "loss": 2.8071, + "step": 2665 + }, + { + "epoch": 35.561872909699, + "grad_norm": 0.6295507550239563, + "learning_rate": 8.164627363737485e-05, + "loss": 2.6807, + "step": 2666 + }, + { + "epoch": 35.575250836120404, + "grad_norm": 0.5988739728927612, + "learning_rate": 8.160177975528365e-05, + "loss": 2.5091, + "step": 2667 + }, + { + "epoch": 35.58862876254181, + "grad_norm": 0.5924574732780457, + "learning_rate": 8.155728587319243e-05, + "loss": 2.5552, + "step": 2668 + }, + { + "epoch": 35.60200668896321, + "grad_norm": 0.6240471005439758, + "learning_rate": 8.151279199110123e-05, + "loss": 2.8841, + "step": 2669 + }, + { + "epoch": 35.61538461538461, + "grad_norm": 0.5939052700996399, + "learning_rate": 8.146829810901001e-05, + "loss": 2.4867, + "step": 2670 + }, + { + "epoch": 35.62876254180602, + "grad_norm": 0.6398495435714722, + "learning_rate": 8.14238042269188e-05, + "loss": 2.801, + "step": 2671 + }, + { + "epoch": 35.642140468227424, + "grad_norm": 0.6106517910957336, + "learning_rate": 8.137931034482759e-05, + "loss": 2.7849, + "step": 2672 + }, + { + "epoch": 35.65551839464883, + "grad_norm": 0.5723669528961182, + "learning_rate": 8.133481646273637e-05, + "loss": 2.6566, + "step": 2673 + }, + { + "epoch": 35.668896321070235, + "grad_norm": 0.616449236869812, + "learning_rate": 8.129032258064517e-05, + "loss": 2.823, + "step": 2674 + }, + { + "epoch": 35.68227424749164, + "grad_norm": 0.6001511812210083, + "learning_rate": 8.124582869855395e-05, + "loss": 2.6833, + "step": 2675 + }, + { + "epoch": 35.69565217391305, + "grad_norm": 0.5830066800117493, + "learning_rate": 8.120133481646273e-05, + "loss": 2.8018, + "step": 2676 + }, + { + "epoch": 35.709030100334445, + "grad_norm": 0.5999500155448914, + "learning_rate": 8.115684093437153e-05, + "loss": 2.7339, + "step": 2677 + }, + { + "epoch": 35.72240802675585, + "grad_norm": 0.6584502458572388, + "learning_rate": 8.111234705228031e-05, + "loss": 2.8347, + "step": 2678 + }, + { + "epoch": 35.735785953177256, + "grad_norm": 0.6561383008956909, + "learning_rate": 8.106785317018911e-05, + "loss": 2.6747, + "step": 2679 + }, + { + "epoch": 35.74916387959866, + "grad_norm": 0.6040836572647095, + "learning_rate": 8.102335928809789e-05, + "loss": 2.713, + "step": 2680 + }, + { + "epoch": 35.76254180602007, + "grad_norm": 0.6353011131286621, + "learning_rate": 8.097886540600667e-05, + "loss": 2.51, + "step": 2681 + }, + { + "epoch": 35.77591973244147, + "grad_norm": 0.5977175235748291, + "learning_rate": 8.093437152391547e-05, + "loss": 2.7502, + "step": 2682 + }, + { + "epoch": 35.78929765886288, + "grad_norm": 0.6166728734970093, + "learning_rate": 8.088987764182425e-05, + "loss": 2.8618, + "step": 2683 + }, + { + "epoch": 35.802675585284284, + "grad_norm": 0.5676133036613464, + "learning_rate": 8.084538375973305e-05, + "loss": 2.6979, + "step": 2684 + }, + { + "epoch": 35.81605351170568, + "grad_norm": 0.6121795773506165, + "learning_rate": 8.080088987764183e-05, + "loss": 2.7341, + "step": 2685 + }, + { + "epoch": 35.82943143812709, + "grad_norm": 0.6005775332450867, + "learning_rate": 8.075639599555061e-05, + "loss": 2.6737, + "step": 2686 + }, + { + "epoch": 35.84280936454849, + "grad_norm": 0.5900837182998657, + "learning_rate": 8.071190211345941e-05, + "loss": 2.7243, + "step": 2687 + }, + { + "epoch": 35.8561872909699, + "grad_norm": 0.5916823744773865, + "learning_rate": 8.066740823136819e-05, + "loss": 2.5637, + "step": 2688 + }, + { + "epoch": 35.869565217391305, + "grad_norm": 0.6083678603172302, + "learning_rate": 8.062291434927699e-05, + "loss": 2.7644, + "step": 2689 + }, + { + "epoch": 35.88294314381271, + "grad_norm": 0.5921754837036133, + "learning_rate": 8.057842046718577e-05, + "loss": 2.6599, + "step": 2690 + }, + { + "epoch": 35.896321070234116, + "grad_norm": 0.6174843907356262, + "learning_rate": 8.053392658509455e-05, + "loss": 2.6216, + "step": 2691 + }, + { + "epoch": 35.90969899665552, + "grad_norm": 0.5929603576660156, + "learning_rate": 8.048943270300335e-05, + "loss": 2.5201, + "step": 2692 + }, + { + "epoch": 35.92307692307692, + "grad_norm": 0.6305480599403381, + "learning_rate": 8.044493882091213e-05, + "loss": 2.726, + "step": 2693 + }, + { + "epoch": 35.936454849498325, + "grad_norm": 0.662198543548584, + "learning_rate": 8.040044493882092e-05, + "loss": 2.7794, + "step": 2694 + }, + { + "epoch": 35.94983277591973, + "grad_norm": 0.5887805223464966, + "learning_rate": 8.03559510567297e-05, + "loss": 2.5348, + "step": 2695 + }, + { + "epoch": 35.96321070234114, + "grad_norm": 0.5848047137260437, + "learning_rate": 8.031145717463849e-05, + "loss": 2.4549, + "step": 2696 + }, + { + "epoch": 35.97658862876254, + "grad_norm": 0.5758711695671082, + "learning_rate": 8.026696329254729e-05, + "loss": 2.5325, + "step": 2697 + }, + { + "epoch": 35.98996655518395, + "grad_norm": 0.6385326981544495, + "learning_rate": 8.022246941045607e-05, + "loss": 2.7417, + "step": 2698 + }, + { + "epoch": 36.0, + "grad_norm": 0.7033568024635315, + "learning_rate": 8.017797552836486e-05, + "loss": 2.6844, + "step": 2699 + }, + { + "epoch": 36.013377926421406, + "grad_norm": 0.5869501233100891, + "learning_rate": 8.013348164627365e-05, + "loss": 2.5899, + "step": 2700 + }, + { + "epoch": 36.02675585284281, + "grad_norm": 0.6519103050231934, + "learning_rate": 8.008898776418243e-05, + "loss": 2.8153, + "step": 2701 + }, + { + "epoch": 36.04013377926422, + "grad_norm": 0.5899248123168945, + "learning_rate": 8.004449388209122e-05, + "loss": 2.8749, + "step": 2702 + }, + { + "epoch": 36.05351170568562, + "grad_norm": 0.6234768629074097, + "learning_rate": 8e-05, + "loss": 2.5244, + "step": 2703 + }, + { + "epoch": 36.06688963210702, + "grad_norm": 0.5662122368812561, + "learning_rate": 7.99555061179088e-05, + "loss": 2.4421, + "step": 2704 + }, + { + "epoch": 36.080267558528426, + "grad_norm": 0.584144651889801, + "learning_rate": 7.991101223581758e-05, + "loss": 2.7261, + "step": 2705 + }, + { + "epoch": 36.09364548494983, + "grad_norm": 0.6016060709953308, + "learning_rate": 7.986651835372637e-05, + "loss": 2.607, + "step": 2706 + }, + { + "epoch": 36.10702341137124, + "grad_norm": 0.6164819002151489, + "learning_rate": 7.982202447163516e-05, + "loss": 2.7332, + "step": 2707 + }, + { + "epoch": 36.12040133779264, + "grad_norm": 0.6115155816078186, + "learning_rate": 7.977753058954393e-05, + "loss": 2.706, + "step": 2708 + }, + { + "epoch": 36.13377926421405, + "grad_norm": 0.6102612018585205, + "learning_rate": 7.973303670745273e-05, + "loss": 2.8179, + "step": 2709 + }, + { + "epoch": 36.147157190635454, + "grad_norm": 0.5910987257957458, + "learning_rate": 7.968854282536151e-05, + "loss": 2.6354, + "step": 2710 + }, + { + "epoch": 36.16053511705686, + "grad_norm": 0.6355498433113098, + "learning_rate": 7.96440489432703e-05, + "loss": 2.705, + "step": 2711 + }, + { + "epoch": 36.17391304347826, + "grad_norm": 0.6213639378547668, + "learning_rate": 7.959955506117909e-05, + "loss": 2.481, + "step": 2712 + }, + { + "epoch": 36.187290969899664, + "grad_norm": 0.587666392326355, + "learning_rate": 7.955506117908787e-05, + "loss": 2.6295, + "step": 2713 + }, + { + "epoch": 36.20066889632107, + "grad_norm": 0.5778639316558838, + "learning_rate": 7.951056729699667e-05, + "loss": 2.5795, + "step": 2714 + }, + { + "epoch": 36.214046822742475, + "grad_norm": 0.6050285696983337, + "learning_rate": 7.946607341490545e-05, + "loss": 2.671, + "step": 2715 + }, + { + "epoch": 36.22742474916388, + "grad_norm": 0.6501610279083252, + "learning_rate": 7.942157953281425e-05, + "loss": 2.7712, + "step": 2716 + }, + { + "epoch": 36.240802675585286, + "grad_norm": 0.6189882159233093, + "learning_rate": 7.937708565072303e-05, + "loss": 2.5324, + "step": 2717 + }, + { + "epoch": 36.25418060200669, + "grad_norm": 0.6189801096916199, + "learning_rate": 7.933259176863181e-05, + "loss": 2.5716, + "step": 2718 + }, + { + "epoch": 36.2675585284281, + "grad_norm": 0.617070734500885, + "learning_rate": 7.92880978865406e-05, + "loss": 2.7485, + "step": 2719 + }, + { + "epoch": 36.280936454849495, + "grad_norm": 0.6042205095291138, + "learning_rate": 7.924360400444939e-05, + "loss": 2.4323, + "step": 2720 + }, + { + "epoch": 36.2943143812709, + "grad_norm": 0.6032358407974243, + "learning_rate": 7.919911012235818e-05, + "loss": 2.6852, + "step": 2721 + }, + { + "epoch": 36.30769230769231, + "grad_norm": 0.5686265230178833, + "learning_rate": 7.915461624026697e-05, + "loss": 2.551, + "step": 2722 + }, + { + "epoch": 36.32107023411371, + "grad_norm": 0.5876219868659973, + "learning_rate": 7.911012235817575e-05, + "loss": 2.3279, + "step": 2723 + }, + { + "epoch": 36.33444816053512, + "grad_norm": 0.624603807926178, + "learning_rate": 7.906562847608455e-05, + "loss": 2.6303, + "step": 2724 + }, + { + "epoch": 36.34782608695652, + "grad_norm": 0.5873613953590393, + "learning_rate": 7.902113459399333e-05, + "loss": 2.7109, + "step": 2725 + }, + { + "epoch": 36.36120401337793, + "grad_norm": 0.6027836203575134, + "learning_rate": 7.897664071190212e-05, + "loss": 2.6051, + "step": 2726 + }, + { + "epoch": 36.374581939799334, + "grad_norm": 0.5907491445541382, + "learning_rate": 7.89321468298109e-05, + "loss": 2.5136, + "step": 2727 + }, + { + "epoch": 36.38795986622073, + "grad_norm": 0.6281634569168091, + "learning_rate": 7.888765294771969e-05, + "loss": 2.6751, + "step": 2728 + }, + { + "epoch": 36.40133779264214, + "grad_norm": 0.6197676658630371, + "learning_rate": 7.884315906562848e-05, + "loss": 2.6621, + "step": 2729 + }, + { + "epoch": 36.414715719063544, + "grad_norm": 0.6759832501411438, + "learning_rate": 7.879866518353727e-05, + "loss": 2.9938, + "step": 2730 + }, + { + "epoch": 36.42809364548495, + "grad_norm": 0.6368512511253357, + "learning_rate": 7.875417130144606e-05, + "loss": 2.8629, + "step": 2731 + }, + { + "epoch": 36.441471571906355, + "grad_norm": 0.5914281606674194, + "learning_rate": 7.870967741935484e-05, + "loss": 2.6094, + "step": 2732 + }, + { + "epoch": 36.45484949832776, + "grad_norm": 0.60133957862854, + "learning_rate": 7.866518353726363e-05, + "loss": 2.7982, + "step": 2733 + }, + { + "epoch": 36.468227424749166, + "grad_norm": 0.621593177318573, + "learning_rate": 7.862068965517242e-05, + "loss": 2.595, + "step": 2734 + }, + { + "epoch": 36.48160535117057, + "grad_norm": 0.6033498048782349, + "learning_rate": 7.85761957730812e-05, + "loss": 2.7714, + "step": 2735 + }, + { + "epoch": 36.49498327759197, + "grad_norm": 0.6060223579406738, + "learning_rate": 7.853170189099e-05, + "loss": 2.5214, + "step": 2736 + }, + { + "epoch": 36.508361204013376, + "grad_norm": 0.6156677007675171, + "learning_rate": 7.848720800889878e-05, + "loss": 2.7125, + "step": 2737 + }, + { + "epoch": 36.52173913043478, + "grad_norm": 0.611400306224823, + "learning_rate": 7.844271412680757e-05, + "loss": 2.8178, + "step": 2738 + }, + { + "epoch": 36.53511705685619, + "grad_norm": 0.6582430601119995, + "learning_rate": 7.839822024471636e-05, + "loss": 2.9621, + "step": 2739 + }, + { + "epoch": 36.54849498327759, + "grad_norm": 0.6210145950317383, + "learning_rate": 7.835372636262514e-05, + "loss": 2.6619, + "step": 2740 + }, + { + "epoch": 36.561872909699, + "grad_norm": 0.6173349022865295, + "learning_rate": 7.830923248053394e-05, + "loss": 2.6075, + "step": 2741 + }, + { + "epoch": 36.575250836120404, + "grad_norm": 0.5770074129104614, + "learning_rate": 7.826473859844272e-05, + "loss": 2.5834, + "step": 2742 + }, + { + "epoch": 36.58862876254181, + "grad_norm": 0.6317800283432007, + "learning_rate": 7.82202447163515e-05, + "loss": 2.6134, + "step": 2743 + }, + { + "epoch": 36.60200668896321, + "grad_norm": 0.6122284531593323, + "learning_rate": 7.81757508342603e-05, + "loss": 2.7218, + "step": 2744 + }, + { + "epoch": 36.61538461538461, + "grad_norm": 0.5796928405761719, + "learning_rate": 7.813125695216908e-05, + "loss": 2.6459, + "step": 2745 + }, + { + "epoch": 36.62876254180602, + "grad_norm": 0.5904363393783569, + "learning_rate": 7.808676307007788e-05, + "loss": 2.5216, + "step": 2746 + }, + { + "epoch": 36.642140468227424, + "grad_norm": 0.6074969172477722, + "learning_rate": 7.804226918798666e-05, + "loss": 2.5776, + "step": 2747 + }, + { + "epoch": 36.65551839464883, + "grad_norm": 0.6080595850944519, + "learning_rate": 7.799777530589544e-05, + "loss": 2.7651, + "step": 2748 + }, + { + "epoch": 36.668896321070235, + "grad_norm": 0.5953335762023926, + "learning_rate": 7.795328142380423e-05, + "loss": 2.5139, + "step": 2749 + }, + { + "epoch": 36.68227424749164, + "grad_norm": 0.6192560195922852, + "learning_rate": 7.790878754171301e-05, + "loss": 2.5091, + "step": 2750 + }, + { + "epoch": 36.69565217391305, + "grad_norm": 0.6019810438156128, + "learning_rate": 7.78642936596218e-05, + "loss": 2.7217, + "step": 2751 + }, + { + "epoch": 36.709030100334445, + "grad_norm": 0.5785876512527466, + "learning_rate": 7.781979977753059e-05, + "loss": 2.4336, + "step": 2752 + }, + { + "epoch": 36.72240802675585, + "grad_norm": 0.6099944114685059, + "learning_rate": 7.777530589543937e-05, + "loss": 2.6614, + "step": 2753 + }, + { + "epoch": 36.735785953177256, + "grad_norm": 0.6206619143486023, + "learning_rate": 7.773081201334817e-05, + "loss": 2.7906, + "step": 2754 + }, + { + "epoch": 36.74916387959866, + "grad_norm": 0.5885736346244812, + "learning_rate": 7.768631813125695e-05, + "loss": 2.4836, + "step": 2755 + }, + { + "epoch": 36.76254180602007, + "grad_norm": 0.6022318005561829, + "learning_rate": 7.764182424916574e-05, + "loss": 2.6954, + "step": 2756 + }, + { + "epoch": 36.77591973244147, + "grad_norm": 0.6479224562644958, + "learning_rate": 7.759733036707453e-05, + "loss": 2.6576, + "step": 2757 + }, + { + "epoch": 36.78929765886288, + "grad_norm": 0.6288067102432251, + "learning_rate": 7.755283648498331e-05, + "loss": 2.6265, + "step": 2758 + }, + { + "epoch": 36.802675585284284, + "grad_norm": 0.6241925358772278, + "learning_rate": 7.75083426028921e-05, + "loss": 2.7554, + "step": 2759 + }, + { + "epoch": 36.81605351170568, + "grad_norm": 0.6327086687088013, + "learning_rate": 7.746384872080089e-05, + "loss": 2.6145, + "step": 2760 + }, + { + "epoch": 36.82943143812709, + "grad_norm": 0.6177266836166382, + "learning_rate": 7.741935483870968e-05, + "loss": 2.8887, + "step": 2761 + }, + { + "epoch": 36.84280936454849, + "grad_norm": 0.6014055013656616, + "learning_rate": 7.737486095661847e-05, + "loss": 2.6512, + "step": 2762 + }, + { + "epoch": 36.8561872909699, + "grad_norm": 0.5890874266624451, + "learning_rate": 7.733036707452725e-05, + "loss": 2.4959, + "step": 2763 + }, + { + "epoch": 36.869565217391305, + "grad_norm": 0.5938498973846436, + "learning_rate": 7.728587319243604e-05, + "loss": 2.6364, + "step": 2764 + }, + { + "epoch": 36.88294314381271, + "grad_norm": 0.587141752243042, + "learning_rate": 7.724137931034483e-05, + "loss": 2.804, + "step": 2765 + }, + { + "epoch": 36.896321070234116, + "grad_norm": 0.5674299597740173, + "learning_rate": 7.719688542825362e-05, + "loss": 2.5209, + "step": 2766 + }, + { + "epoch": 36.90969899665552, + "grad_norm": 0.616888165473938, + "learning_rate": 7.71523915461624e-05, + "loss": 2.6253, + "step": 2767 + }, + { + "epoch": 36.92307692307692, + "grad_norm": 0.6040933132171631, + "learning_rate": 7.710789766407119e-05, + "loss": 2.6645, + "step": 2768 + }, + { + "epoch": 36.936454849498325, + "grad_norm": 0.5765551328659058, + "learning_rate": 7.706340378197998e-05, + "loss": 2.5598, + "step": 2769 + }, + { + "epoch": 36.94983277591973, + "grad_norm": 0.598349928855896, + "learning_rate": 7.701890989988877e-05, + "loss": 2.6193, + "step": 2770 + }, + { + "epoch": 36.96321070234114, + "grad_norm": 0.5944122076034546, + "learning_rate": 7.697441601779756e-05, + "loss": 2.7833, + "step": 2771 + }, + { + "epoch": 36.97658862876254, + "grad_norm": 0.6495612263679504, + "learning_rate": 7.692992213570634e-05, + "loss": 2.8149, + "step": 2772 + }, + { + "epoch": 36.98996655518395, + "grad_norm": 0.6290954351425171, + "learning_rate": 7.688542825361513e-05, + "loss": 2.8039, + "step": 2773 + }, + { + "epoch": 37.0, + "grad_norm": 0.7225489020347595, + "learning_rate": 7.684093437152392e-05, + "loss": 2.6676, + "step": 2774 + }, + { + "epoch": 37.013377926421406, + "grad_norm": 0.5684818029403687, + "learning_rate": 7.67964404894327e-05, + "loss": 2.5921, + "step": 2775 + }, + { + "epoch": 37.02675585284281, + "grad_norm": 0.6261118054389954, + "learning_rate": 7.67519466073415e-05, + "loss": 2.6034, + "step": 2776 + }, + { + "epoch": 37.04013377926422, + "grad_norm": 0.5897383689880371, + "learning_rate": 7.670745272525028e-05, + "loss": 2.6218, + "step": 2777 + }, + { + "epoch": 37.05351170568562, + "grad_norm": 0.6284380555152893, + "learning_rate": 7.666295884315907e-05, + "loss": 2.8228, + "step": 2778 + }, + { + "epoch": 37.06688963210702, + "grad_norm": 0.6081552505493164, + "learning_rate": 7.661846496106786e-05, + "loss": 2.4883, + "step": 2779 + }, + { + "epoch": 37.080267558528426, + "grad_norm": 0.5826512575149536, + "learning_rate": 7.657397107897664e-05, + "loss": 2.288, + "step": 2780 + }, + { + "epoch": 37.09364548494983, + "grad_norm": 0.609711766242981, + "learning_rate": 7.652947719688544e-05, + "loss": 2.7462, + "step": 2781 + }, + { + "epoch": 37.10702341137124, + "grad_norm": 0.6180025339126587, + "learning_rate": 7.648498331479422e-05, + "loss": 2.5538, + "step": 2782 + }, + { + "epoch": 37.12040133779264, + "grad_norm": 0.5980042815208435, + "learning_rate": 7.6440489432703e-05, + "loss": 2.5066, + "step": 2783 + }, + { + "epoch": 37.13377926421405, + "grad_norm": 0.5932002067565918, + "learning_rate": 7.63959955506118e-05, + "loss": 2.5456, + "step": 2784 + }, + { + "epoch": 37.147157190635454, + "grad_norm": 0.5952025055885315, + "learning_rate": 7.635150166852058e-05, + "loss": 2.5584, + "step": 2785 + }, + { + "epoch": 37.16053511705686, + "grad_norm": 0.6259213089942932, + "learning_rate": 7.630700778642938e-05, + "loss": 2.7745, + "step": 2786 + }, + { + "epoch": 37.17391304347826, + "grad_norm": 0.620085597038269, + "learning_rate": 7.626251390433816e-05, + "loss": 2.7472, + "step": 2787 + }, + { + "epoch": 37.187290969899664, + "grad_norm": 0.5972263216972351, + "learning_rate": 7.621802002224694e-05, + "loss": 2.4507, + "step": 2788 + }, + { + "epoch": 37.20066889632107, + "grad_norm": 0.6285966634750366, + "learning_rate": 7.617352614015574e-05, + "loss": 2.6729, + "step": 2789 + }, + { + "epoch": 37.214046822742475, + "grad_norm": 0.6250035762786865, + "learning_rate": 7.612903225806451e-05, + "loss": 2.7339, + "step": 2790 + }, + { + "epoch": 37.22742474916388, + "grad_norm": 0.6274656653404236, + "learning_rate": 7.60845383759733e-05, + "loss": 2.4538, + "step": 2791 + }, + { + "epoch": 37.240802675585286, + "grad_norm": 0.632949948310852, + "learning_rate": 7.604004449388209e-05, + "loss": 2.6448, + "step": 2792 + }, + { + "epoch": 37.25418060200669, + "grad_norm": 0.5869743824005127, + "learning_rate": 7.599555061179088e-05, + "loss": 2.6065, + "step": 2793 + }, + { + "epoch": 37.2675585284281, + "grad_norm": 0.6650564670562744, + "learning_rate": 7.595105672969966e-05, + "loss": 2.6521, + "step": 2794 + }, + { + "epoch": 37.280936454849495, + "grad_norm": 0.597977340221405, + "learning_rate": 7.590656284760845e-05, + "loss": 2.7055, + "step": 2795 + }, + { + "epoch": 37.2943143812709, + "grad_norm": 0.6276383996009827, + "learning_rate": 7.586206896551724e-05, + "loss": 2.6584, + "step": 2796 + }, + { + "epoch": 37.30769230769231, + "grad_norm": 0.637657105922699, + "learning_rate": 7.581757508342603e-05, + "loss": 2.763, + "step": 2797 + }, + { + "epoch": 37.32107023411371, + "grad_norm": 0.5994833707809448, + "learning_rate": 7.577308120133482e-05, + "loss": 2.6043, + "step": 2798 + }, + { + "epoch": 37.33444816053512, + "grad_norm": 0.6213527321815491, + "learning_rate": 7.57285873192436e-05, + "loss": 2.7679, + "step": 2799 + }, + { + "epoch": 37.34782608695652, + "grad_norm": 0.6137163639068604, + "learning_rate": 7.568409343715239e-05, + "loss": 2.6287, + "step": 2800 + }, + { + "epoch": 37.36120401337793, + "grad_norm": 0.6083935499191284, + "learning_rate": 7.563959955506118e-05, + "loss": 2.6636, + "step": 2801 + }, + { + "epoch": 37.374581939799334, + "grad_norm": 0.6070682406425476, + "learning_rate": 7.559510567296996e-05, + "loss": 2.5606, + "step": 2802 + }, + { + "epoch": 37.38795986622073, + "grad_norm": 0.6027565002441406, + "learning_rate": 7.555061179087876e-05, + "loss": 2.55, + "step": 2803 + }, + { + "epoch": 37.40133779264214, + "grad_norm": 0.6037593483924866, + "learning_rate": 7.550611790878754e-05, + "loss": 2.4951, + "step": 2804 + }, + { + "epoch": 37.414715719063544, + "grad_norm": 0.588081955909729, + "learning_rate": 7.546162402669633e-05, + "loss": 2.4888, + "step": 2805 + }, + { + "epoch": 37.42809364548495, + "grad_norm": 0.640067994594574, + "learning_rate": 7.541713014460512e-05, + "loss": 2.6508, + "step": 2806 + }, + { + "epoch": 37.441471571906355, + "grad_norm": 0.6123721599578857, + "learning_rate": 7.53726362625139e-05, + "loss": 2.6542, + "step": 2807 + }, + { + "epoch": 37.45484949832776, + "grad_norm": 0.6153370141983032, + "learning_rate": 7.53281423804227e-05, + "loss": 2.5986, + "step": 2808 + }, + { + "epoch": 37.468227424749166, + "grad_norm": 0.6331472396850586, + "learning_rate": 7.528364849833148e-05, + "loss": 2.5883, + "step": 2809 + }, + { + "epoch": 37.48160535117057, + "grad_norm": 0.6108154654502869, + "learning_rate": 7.523915461624026e-05, + "loss": 2.5254, + "step": 2810 + }, + { + "epoch": 37.49498327759197, + "grad_norm": 0.6167747974395752, + "learning_rate": 7.519466073414906e-05, + "loss": 2.5088, + "step": 2811 + }, + { + "epoch": 37.508361204013376, + "grad_norm": 0.6372016668319702, + "learning_rate": 7.515016685205784e-05, + "loss": 2.6996, + "step": 2812 + }, + { + "epoch": 37.52173913043478, + "grad_norm": 0.5725007057189941, + "learning_rate": 7.510567296996664e-05, + "loss": 2.5294, + "step": 2813 + }, + { + "epoch": 37.53511705685619, + "grad_norm": 0.5692083835601807, + "learning_rate": 7.506117908787542e-05, + "loss": 2.5158, + "step": 2814 + }, + { + "epoch": 37.54849498327759, + "grad_norm": 0.6351197361946106, + "learning_rate": 7.50166852057842e-05, + "loss": 2.7666, + "step": 2815 + }, + { + "epoch": 37.561872909699, + "grad_norm": 0.5791570544242859, + "learning_rate": 7.4972191323693e-05, + "loss": 2.4728, + "step": 2816 + }, + { + "epoch": 37.575250836120404, + "grad_norm": 0.6294588446617126, + "learning_rate": 7.492769744160178e-05, + "loss": 2.657, + "step": 2817 + }, + { + "epoch": 37.58862876254181, + "grad_norm": 0.6071592569351196, + "learning_rate": 7.488320355951058e-05, + "loss": 2.7995, + "step": 2818 + }, + { + "epoch": 37.60200668896321, + "grad_norm": 0.6079913377761841, + "learning_rate": 7.483870967741936e-05, + "loss": 2.8394, + "step": 2819 + }, + { + "epoch": 37.61538461538461, + "grad_norm": 0.5867500901222229, + "learning_rate": 7.479421579532814e-05, + "loss": 2.413, + "step": 2820 + }, + { + "epoch": 37.62876254180602, + "grad_norm": 0.6049195528030396, + "learning_rate": 7.474972191323694e-05, + "loss": 2.64, + "step": 2821 + }, + { + "epoch": 37.642140468227424, + "grad_norm": 0.5787717700004578, + "learning_rate": 7.470522803114572e-05, + "loss": 2.5029, + "step": 2822 + }, + { + "epoch": 37.65551839464883, + "grad_norm": 0.6378308534622192, + "learning_rate": 7.466073414905452e-05, + "loss": 2.6937, + "step": 2823 + }, + { + "epoch": 37.668896321070235, + "grad_norm": 0.6016746163368225, + "learning_rate": 7.46162402669633e-05, + "loss": 2.477, + "step": 2824 + }, + { + "epoch": 37.68227424749164, + "grad_norm": 0.5864558219909668, + "learning_rate": 7.457174638487208e-05, + "loss": 2.7177, + "step": 2825 + }, + { + "epoch": 37.69565217391305, + "grad_norm": 0.6352173686027527, + "learning_rate": 7.452725250278088e-05, + "loss": 2.73, + "step": 2826 + }, + { + "epoch": 37.709030100334445, + "grad_norm": 0.6432989239692688, + "learning_rate": 7.448275862068966e-05, + "loss": 2.5062, + "step": 2827 + }, + { + "epoch": 37.72240802675585, + "grad_norm": 0.6061347723007202, + "learning_rate": 7.443826473859846e-05, + "loss": 2.5776, + "step": 2828 + }, + { + "epoch": 37.735785953177256, + "grad_norm": 0.6208527684211731, + "learning_rate": 7.439377085650724e-05, + "loss": 2.4125, + "step": 2829 + }, + { + "epoch": 37.74916387959866, + "grad_norm": 0.6052190661430359, + "learning_rate": 7.434927697441602e-05, + "loss": 2.5331, + "step": 2830 + }, + { + "epoch": 37.76254180602007, + "grad_norm": 0.6361626982688904, + "learning_rate": 7.43047830923248e-05, + "loss": 2.7675, + "step": 2831 + }, + { + "epoch": 37.77591973244147, + "grad_norm": 0.6102336645126343, + "learning_rate": 7.426028921023359e-05, + "loss": 2.3707, + "step": 2832 + }, + { + "epoch": 37.78929765886288, + "grad_norm": 0.6248254776000977, + "learning_rate": 7.421579532814238e-05, + "loss": 2.8015, + "step": 2833 + }, + { + "epoch": 37.802675585284284, + "grad_norm": 0.6449552178382874, + "learning_rate": 7.417130144605116e-05, + "loss": 2.7157, + "step": 2834 + }, + { + "epoch": 37.81605351170568, + "grad_norm": 0.6328997611999512, + "learning_rate": 7.412680756395996e-05, + "loss": 2.8754, + "step": 2835 + }, + { + "epoch": 37.82943143812709, + "grad_norm": 0.617190957069397, + "learning_rate": 7.408231368186874e-05, + "loss": 2.8825, + "step": 2836 + }, + { + "epoch": 37.84280936454849, + "grad_norm": 0.6431034207344055, + "learning_rate": 7.403781979977752e-05, + "loss": 2.877, + "step": 2837 + }, + { + "epoch": 37.8561872909699, + "grad_norm": 0.6334032416343689, + "learning_rate": 7.399332591768632e-05, + "loss": 2.5787, + "step": 2838 + }, + { + "epoch": 37.869565217391305, + "grad_norm": 0.599069356918335, + "learning_rate": 7.39488320355951e-05, + "loss": 2.6774, + "step": 2839 + }, + { + "epoch": 37.88294314381271, + "grad_norm": 0.6226022243499756, + "learning_rate": 7.39043381535039e-05, + "loss": 2.7821, + "step": 2840 + }, + { + "epoch": 37.896321070234116, + "grad_norm": 0.6168680191040039, + "learning_rate": 7.385984427141268e-05, + "loss": 2.6552, + "step": 2841 + }, + { + "epoch": 37.90969899665552, + "grad_norm": 0.5987963080406189, + "learning_rate": 7.381535038932146e-05, + "loss": 2.6139, + "step": 2842 + }, + { + "epoch": 37.92307692307692, + "grad_norm": 0.5855274796485901, + "learning_rate": 7.377085650723026e-05, + "loss": 2.687, + "step": 2843 + }, + { + "epoch": 37.936454849498325, + "grad_norm": 0.6431803703308105, + "learning_rate": 7.372636262513904e-05, + "loss": 2.7411, + "step": 2844 + }, + { + "epoch": 37.94983277591973, + "grad_norm": 0.6025601029396057, + "learning_rate": 7.368186874304784e-05, + "loss": 2.6054, + "step": 2845 + }, + { + "epoch": 37.96321070234114, + "grad_norm": 0.6150123476982117, + "learning_rate": 7.363737486095662e-05, + "loss": 2.7483, + "step": 2846 + }, + { + "epoch": 37.97658862876254, + "grad_norm": 0.5857491493225098, + "learning_rate": 7.35928809788654e-05, + "loss": 2.68, + "step": 2847 + }, + { + "epoch": 37.98996655518395, + "grad_norm": 0.6244483590126038, + "learning_rate": 7.35483870967742e-05, + "loss": 2.7171, + "step": 2848 + }, + { + "epoch": 38.0, + "grad_norm": 0.7437806725502014, + "learning_rate": 7.350389321468298e-05, + "loss": 2.5646, + "step": 2849 + }, + { + "epoch": 38.013377926421406, + "grad_norm": 0.6323840618133545, + "learning_rate": 7.345939933259178e-05, + "loss": 2.7457, + "step": 2850 + }, + { + "epoch": 38.02675585284281, + "grad_norm": 0.6000533103942871, + "learning_rate": 7.341490545050056e-05, + "loss": 2.6829, + "step": 2851 + }, + { + "epoch": 38.04013377926422, + "grad_norm": 0.621135950088501, + "learning_rate": 7.337041156840934e-05, + "loss": 2.795, + "step": 2852 + }, + { + "epoch": 38.05351170568562, + "grad_norm": 0.6256486773490906, + "learning_rate": 7.332591768631814e-05, + "loss": 2.6762, + "step": 2853 + }, + { + "epoch": 38.06688963210702, + "grad_norm": 0.6535592079162598, + "learning_rate": 7.328142380422692e-05, + "loss": 2.8339, + "step": 2854 + }, + { + "epoch": 38.080267558528426, + "grad_norm": 0.6324830055236816, + "learning_rate": 7.323692992213572e-05, + "loss": 2.6129, + "step": 2855 + }, + { + "epoch": 38.09364548494983, + "grad_norm": 0.5966958403587341, + "learning_rate": 7.31924360400445e-05, + "loss": 2.3738, + "step": 2856 + }, + { + "epoch": 38.10702341137124, + "grad_norm": 0.6118017435073853, + "learning_rate": 7.314794215795328e-05, + "loss": 2.5778, + "step": 2857 + }, + { + "epoch": 38.12040133779264, + "grad_norm": 0.5761687159538269, + "learning_rate": 7.310344827586208e-05, + "loss": 2.347, + "step": 2858 + }, + { + "epoch": 38.13377926421405, + "grad_norm": 0.6576936841011047, + "learning_rate": 7.305895439377086e-05, + "loss": 2.6811, + "step": 2859 + }, + { + "epoch": 38.147157190635454, + "grad_norm": 0.6086276173591614, + "learning_rate": 7.301446051167965e-05, + "loss": 2.713, + "step": 2860 + }, + { + "epoch": 38.16053511705686, + "grad_norm": 0.5824201107025146, + "learning_rate": 7.296996662958844e-05, + "loss": 2.457, + "step": 2861 + }, + { + "epoch": 38.17391304347826, + "grad_norm": 0.6397709250450134, + "learning_rate": 7.292547274749722e-05, + "loss": 2.5384, + "step": 2862 + }, + { + "epoch": 38.187290969899664, + "grad_norm": 0.6120484471321106, + "learning_rate": 7.288097886540602e-05, + "loss": 2.5742, + "step": 2863 + }, + { + "epoch": 38.20066889632107, + "grad_norm": 0.6424990892410278, + "learning_rate": 7.28364849833148e-05, + "loss": 2.8879, + "step": 2864 + }, + { + "epoch": 38.214046822742475, + "grad_norm": 0.610888659954071, + "learning_rate": 7.27919911012236e-05, + "loss": 2.5114, + "step": 2865 + }, + { + "epoch": 38.22742474916388, + "grad_norm": 0.5722895264625549, + "learning_rate": 7.274749721913238e-05, + "loss": 2.1089, + "step": 2866 + }, + { + "epoch": 38.240802675585286, + "grad_norm": 0.6000377535820007, + "learning_rate": 7.270300333704116e-05, + "loss": 2.3571, + "step": 2867 + }, + { + "epoch": 38.25418060200669, + "grad_norm": 0.6441863775253296, + "learning_rate": 7.265850945494995e-05, + "loss": 2.828, + "step": 2868 + }, + { + "epoch": 38.2675585284281, + "grad_norm": 0.6954602003097534, + "learning_rate": 7.261401557285874e-05, + "loss": 2.586, + "step": 2869 + }, + { + "epoch": 38.280936454849495, + "grad_norm": 0.6308737397193909, + "learning_rate": 7.256952169076753e-05, + "loss": 2.558, + "step": 2870 + }, + { + "epoch": 38.2943143812709, + "grad_norm": 0.5915984511375427, + "learning_rate": 7.252502780867632e-05, + "loss": 2.6369, + "step": 2871 + }, + { + "epoch": 38.30769230769231, + "grad_norm": 0.5993978381156921, + "learning_rate": 7.24805339265851e-05, + "loss": 2.5857, + "step": 2872 + }, + { + "epoch": 38.32107023411371, + "grad_norm": 0.5899291038513184, + "learning_rate": 7.243604004449388e-05, + "loss": 2.396, + "step": 2873 + }, + { + "epoch": 38.33444816053512, + "grad_norm": 0.6214590668678284, + "learning_rate": 7.239154616240266e-05, + "loss": 2.3691, + "step": 2874 + }, + { + "epoch": 38.34782608695652, + "grad_norm": 0.5765502452850342, + "learning_rate": 7.234705228031146e-05, + "loss": 2.5417, + "step": 2875 + }, + { + "epoch": 38.36120401337793, + "grad_norm": 0.6192215085029602, + "learning_rate": 7.230255839822024e-05, + "loss": 2.459, + "step": 2876 + }, + { + "epoch": 38.374581939799334, + "grad_norm": 0.6413353085517883, + "learning_rate": 7.225806451612904e-05, + "loss": 2.5345, + "step": 2877 + }, + { + "epoch": 38.38795986622073, + "grad_norm": 0.6169536709785461, + "learning_rate": 7.221357063403782e-05, + "loss": 2.5972, + "step": 2878 + }, + { + "epoch": 38.40133779264214, + "grad_norm": 0.6331331133842468, + "learning_rate": 7.21690767519466e-05, + "loss": 2.625, + "step": 2879 + }, + { + "epoch": 38.414715719063544, + "grad_norm": 0.6282365918159485, + "learning_rate": 7.21245828698554e-05, + "loss": 2.6334, + "step": 2880 + }, + { + "epoch": 38.42809364548495, + "grad_norm": 0.6211024522781372, + "learning_rate": 7.208008898776418e-05, + "loss": 2.4833, + "step": 2881 + }, + { + "epoch": 38.441471571906355, + "grad_norm": 0.6600573062896729, + "learning_rate": 7.203559510567298e-05, + "loss": 2.7515, + "step": 2882 + }, + { + "epoch": 38.45484949832776, + "grad_norm": 0.6234501004219055, + "learning_rate": 7.199110122358176e-05, + "loss": 2.4914, + "step": 2883 + }, + { + "epoch": 38.468227424749166, + "grad_norm": 0.6104183197021484, + "learning_rate": 7.194660734149054e-05, + "loss": 2.6202, + "step": 2884 + }, + { + "epoch": 38.48160535117057, + "grad_norm": 0.6139323711395264, + "learning_rate": 7.190211345939934e-05, + "loss": 2.6918, + "step": 2885 + }, + { + "epoch": 38.49498327759197, + "grad_norm": 0.6672762632369995, + "learning_rate": 7.185761957730812e-05, + "loss": 2.7207, + "step": 2886 + }, + { + "epoch": 38.508361204013376, + "grad_norm": 0.636197030544281, + "learning_rate": 7.181312569521691e-05, + "loss": 2.8393, + "step": 2887 + }, + { + "epoch": 38.52173913043478, + "grad_norm": 0.6464013457298279, + "learning_rate": 7.17686318131257e-05, + "loss": 2.6601, + "step": 2888 + }, + { + "epoch": 38.53511705685619, + "grad_norm": 0.5882225632667542, + "learning_rate": 7.172413793103448e-05, + "loss": 2.4206, + "step": 2889 + }, + { + "epoch": 38.54849498327759, + "grad_norm": 0.6114091873168945, + "learning_rate": 7.167964404894328e-05, + "loss": 2.6474, + "step": 2890 + }, + { + "epoch": 38.561872909699, + "grad_norm": 0.5976521968841553, + "learning_rate": 7.163515016685206e-05, + "loss": 2.5393, + "step": 2891 + }, + { + "epoch": 38.575250836120404, + "grad_norm": 0.6302757859230042, + "learning_rate": 7.159065628476085e-05, + "loss": 2.5687, + "step": 2892 + }, + { + "epoch": 38.58862876254181, + "grad_norm": 0.6598967909812927, + "learning_rate": 7.154616240266964e-05, + "loss": 2.8124, + "step": 2893 + }, + { + "epoch": 38.60200668896321, + "grad_norm": 0.6214116215705872, + "learning_rate": 7.150166852057842e-05, + "loss": 2.6531, + "step": 2894 + }, + { + "epoch": 38.61538461538461, + "grad_norm": 0.6264208555221558, + "learning_rate": 7.145717463848721e-05, + "loss": 2.6821, + "step": 2895 + }, + { + "epoch": 38.62876254180602, + "grad_norm": 0.647375226020813, + "learning_rate": 7.1412680756396e-05, + "loss": 2.5769, + "step": 2896 + }, + { + "epoch": 38.642140468227424, + "grad_norm": 0.6418511271476746, + "learning_rate": 7.136818687430479e-05, + "loss": 2.682, + "step": 2897 + }, + { + "epoch": 38.65551839464883, + "grad_norm": 0.6308450102806091, + "learning_rate": 7.132369299221358e-05, + "loss": 2.6151, + "step": 2898 + }, + { + "epoch": 38.668896321070235, + "grad_norm": 0.6675850749015808, + "learning_rate": 7.127919911012236e-05, + "loss": 2.7805, + "step": 2899 + }, + { + "epoch": 38.68227424749164, + "grad_norm": 0.5915642380714417, + "learning_rate": 7.123470522803115e-05, + "loss": 2.6036, + "step": 2900 + }, + { + "epoch": 38.69565217391305, + "grad_norm": 0.6425783038139343, + "learning_rate": 7.119021134593994e-05, + "loss": 2.805, + "step": 2901 + }, + { + "epoch": 38.709030100334445, + "grad_norm": 0.613692045211792, + "learning_rate": 7.114571746384873e-05, + "loss": 2.5263, + "step": 2902 + }, + { + "epoch": 38.72240802675585, + "grad_norm": 0.7906155586242676, + "learning_rate": 7.110122358175751e-05, + "loss": 2.7032, + "step": 2903 + }, + { + "epoch": 38.735785953177256, + "grad_norm": 0.6317926645278931, + "learning_rate": 7.10567296996663e-05, + "loss": 2.7303, + "step": 2904 + }, + { + "epoch": 38.74916387959866, + "grad_norm": 0.6563540101051331, + "learning_rate": 7.101223581757509e-05, + "loss": 2.6913, + "step": 2905 + }, + { + "epoch": 38.76254180602007, + "grad_norm": 0.6583380699157715, + "learning_rate": 7.096774193548388e-05, + "loss": 2.8355, + "step": 2906 + }, + { + "epoch": 38.77591973244147, + "grad_norm": 0.6026883125305176, + "learning_rate": 7.092324805339267e-05, + "loss": 2.4306, + "step": 2907 + }, + { + "epoch": 38.78929765886288, + "grad_norm": 0.6352461576461792, + "learning_rate": 7.087875417130145e-05, + "loss": 2.7963, + "step": 2908 + }, + { + "epoch": 38.802675585284284, + "grad_norm": 0.6036055684089661, + "learning_rate": 7.083426028921024e-05, + "loss": 2.6946, + "step": 2909 + }, + { + "epoch": 38.81605351170568, + "grad_norm": 0.5911431312561035, + "learning_rate": 7.078976640711903e-05, + "loss": 2.5936, + "step": 2910 + }, + { + "epoch": 38.82943143812709, + "grad_norm": 0.6125901341438293, + "learning_rate": 7.074527252502781e-05, + "loss": 2.4901, + "step": 2911 + }, + { + "epoch": 38.84280936454849, + "grad_norm": 0.600537896156311, + "learning_rate": 7.070077864293661e-05, + "loss": 2.6603, + "step": 2912 + }, + { + "epoch": 38.8561872909699, + "grad_norm": 0.6200169324874878, + "learning_rate": 7.065628476084539e-05, + "loss": 2.629, + "step": 2913 + }, + { + "epoch": 38.869565217391305, + "grad_norm": 0.5789446830749512, + "learning_rate": 7.061179087875418e-05, + "loss": 2.3878, + "step": 2914 + }, + { + "epoch": 38.88294314381271, + "grad_norm": 0.6316003799438477, + "learning_rate": 7.056729699666296e-05, + "loss": 2.4585, + "step": 2915 + }, + { + "epoch": 38.896321070234116, + "grad_norm": 0.6021048426628113, + "learning_rate": 7.052280311457174e-05, + "loss": 2.4609, + "step": 2916 + }, + { + "epoch": 38.90969899665552, + "grad_norm": 0.6088526844978333, + "learning_rate": 7.047830923248054e-05, + "loss": 2.6102, + "step": 2917 + }, + { + "epoch": 38.92307692307692, + "grad_norm": 0.658087432384491, + "learning_rate": 7.043381535038932e-05, + "loss": 2.6058, + "step": 2918 + }, + { + "epoch": 38.936454849498325, + "grad_norm": 0.6708016991615295, + "learning_rate": 7.038932146829811e-05, + "loss": 2.9315, + "step": 2919 + }, + { + "epoch": 38.94983277591973, + "grad_norm": 0.6475014686584473, + "learning_rate": 7.03448275862069e-05, + "loss": 2.8692, + "step": 2920 + }, + { + "epoch": 38.96321070234114, + "grad_norm": 0.6264966726303101, + "learning_rate": 7.030033370411568e-05, + "loss": 2.7866, + "step": 2921 + }, + { + "epoch": 38.97658862876254, + "grad_norm": 0.6225918531417847, + "learning_rate": 7.025583982202447e-05, + "loss": 2.4795, + "step": 2922 + }, + { + "epoch": 38.98996655518395, + "grad_norm": 0.6052890419960022, + "learning_rate": 7.021134593993326e-05, + "loss": 2.2833, + "step": 2923 + }, + { + "epoch": 39.0, + "grad_norm": 0.695241391658783, + "learning_rate": 7.016685205784205e-05, + "loss": 2.4251, + "step": 2924 + }, + { + "epoch": 39.013377926421406, + "grad_norm": 0.5891976952552795, + "learning_rate": 7.012235817575084e-05, + "loss": 2.561, + "step": 2925 + }, + { + "epoch": 39.02675585284281, + "grad_norm": 0.6551567316055298, + "learning_rate": 7.007786429365962e-05, + "loss": 2.4206, + "step": 2926 + }, + { + "epoch": 39.04013377926422, + "grad_norm": 0.5852656364440918, + "learning_rate": 7.003337041156841e-05, + "loss": 2.4159, + "step": 2927 + }, + { + "epoch": 39.05351170568562, + "grad_norm": 0.5813770294189453, + "learning_rate": 6.99888765294772e-05, + "loss": 2.5426, + "step": 2928 + }, + { + "epoch": 39.06688963210702, + "grad_norm": 0.5840299725532532, + "learning_rate": 6.994438264738599e-05, + "loss": 2.5013, + "step": 2929 + }, + { + "epoch": 39.080267558528426, + "grad_norm": 0.5940836071968079, + "learning_rate": 6.989988876529477e-05, + "loss": 2.4776, + "step": 2930 + }, + { + "epoch": 39.09364548494983, + "grad_norm": 0.6290647387504578, + "learning_rate": 6.985539488320356e-05, + "loss": 2.6289, + "step": 2931 + }, + { + "epoch": 39.10702341137124, + "grad_norm": 0.6070288419723511, + "learning_rate": 6.981090100111235e-05, + "loss": 2.3952, + "step": 2932 + }, + { + "epoch": 39.12040133779264, + "grad_norm": 0.6416279077529907, + "learning_rate": 6.976640711902114e-05, + "loss": 2.8001, + "step": 2933 + }, + { + "epoch": 39.13377926421405, + "grad_norm": 0.6238275170326233, + "learning_rate": 6.972191323692993e-05, + "loss": 2.5927, + "step": 2934 + }, + { + "epoch": 39.147157190635454, + "grad_norm": 0.6179841756820679, + "learning_rate": 6.967741935483871e-05, + "loss": 2.4691, + "step": 2935 + }, + { + "epoch": 39.16053511705686, + "grad_norm": 0.5916646122932434, + "learning_rate": 6.96329254727475e-05, + "loss": 2.4734, + "step": 2936 + }, + { + "epoch": 39.17391304347826, + "grad_norm": 0.6340898275375366, + "learning_rate": 6.958843159065629e-05, + "loss": 2.6764, + "step": 2937 + }, + { + "epoch": 39.187290969899664, + "grad_norm": 0.6154446005821228, + "learning_rate": 6.954393770856507e-05, + "loss": 2.666, + "step": 2938 + }, + { + "epoch": 39.20066889632107, + "grad_norm": 0.6166310906410217, + "learning_rate": 6.949944382647387e-05, + "loss": 2.5198, + "step": 2939 + }, + { + "epoch": 39.214046822742475, + "grad_norm": 0.6393052935600281, + "learning_rate": 6.945494994438265e-05, + "loss": 2.6016, + "step": 2940 + }, + { + "epoch": 39.22742474916388, + "grad_norm": 0.6041097640991211, + "learning_rate": 6.941045606229144e-05, + "loss": 2.462, + "step": 2941 + }, + { + "epoch": 39.240802675585286, + "grad_norm": 0.6157374382019043, + "learning_rate": 6.936596218020023e-05, + "loss": 2.5844, + "step": 2942 + }, + { + "epoch": 39.25418060200669, + "grad_norm": 0.641880452632904, + "learning_rate": 6.932146829810901e-05, + "loss": 2.7159, + "step": 2943 + }, + { + "epoch": 39.2675585284281, + "grad_norm": 0.6598557233810425, + "learning_rate": 6.927697441601781e-05, + "loss": 2.5649, + "step": 2944 + }, + { + "epoch": 39.280936454849495, + "grad_norm": 0.6511268615722656, + "learning_rate": 6.923248053392659e-05, + "loss": 2.7623, + "step": 2945 + }, + { + "epoch": 39.2943143812709, + "grad_norm": 0.6559281945228577, + "learning_rate": 6.918798665183537e-05, + "loss": 2.3524, + "step": 2946 + }, + { + "epoch": 39.30769230769231, + "grad_norm": 0.601424515247345, + "learning_rate": 6.914349276974417e-05, + "loss": 2.2888, + "step": 2947 + }, + { + "epoch": 39.32107023411371, + "grad_norm": 0.6525271534919739, + "learning_rate": 6.909899888765295e-05, + "loss": 2.4266, + "step": 2948 + }, + { + "epoch": 39.33444816053512, + "grad_norm": 0.6467943787574768, + "learning_rate": 6.905450500556175e-05, + "loss": 2.654, + "step": 2949 + }, + { + "epoch": 39.34782608695652, + "grad_norm": 0.6527394652366638, + "learning_rate": 6.901001112347053e-05, + "loss": 2.6237, + "step": 2950 + }, + { + "epoch": 39.36120401337793, + "grad_norm": 0.615578830242157, + "learning_rate": 6.896551724137931e-05, + "loss": 2.6382, + "step": 2951 + }, + { + "epoch": 39.374581939799334, + "grad_norm": 0.6257203221321106, + "learning_rate": 6.892102335928811e-05, + "loss": 2.8014, + "step": 2952 + }, + { + "epoch": 39.38795986622073, + "grad_norm": 0.6732933521270752, + "learning_rate": 6.887652947719689e-05, + "loss": 2.7796, + "step": 2953 + }, + { + "epoch": 39.40133779264214, + "grad_norm": 0.5996230840682983, + "learning_rate": 6.883203559510569e-05, + "loss": 2.4852, + "step": 2954 + }, + { + "epoch": 39.414715719063544, + "grad_norm": 0.6640390157699585, + "learning_rate": 6.878754171301446e-05, + "loss": 2.4979, + "step": 2955 + }, + { + "epoch": 39.42809364548495, + "grad_norm": 0.6220453381538391, + "learning_rate": 6.874304783092325e-05, + "loss": 2.4849, + "step": 2956 + }, + { + "epoch": 39.441471571906355, + "grad_norm": 0.6283277273178101, + "learning_rate": 6.869855394883203e-05, + "loss": 2.5193, + "step": 2957 + }, + { + "epoch": 39.45484949832776, + "grad_norm": 0.6696292757987976, + "learning_rate": 6.865406006674082e-05, + "loss": 2.6485, + "step": 2958 + }, + { + "epoch": 39.468227424749166, + "grad_norm": 0.6544603705406189, + "learning_rate": 6.860956618464961e-05, + "loss": 2.5956, + "step": 2959 + }, + { + "epoch": 39.48160535117057, + "grad_norm": 0.6541810631752014, + "learning_rate": 6.85650723025584e-05, + "loss": 2.502, + "step": 2960 + }, + { + "epoch": 39.49498327759197, + "grad_norm": 0.6799680590629578, + "learning_rate": 6.852057842046719e-05, + "loss": 2.7545, + "step": 2961 + }, + { + "epoch": 39.508361204013376, + "grad_norm": 0.6350213289260864, + "learning_rate": 6.847608453837597e-05, + "loss": 2.707, + "step": 2962 + }, + { + "epoch": 39.52173913043478, + "grad_norm": 0.5974067449569702, + "learning_rate": 6.843159065628476e-05, + "loss": 2.5572, + "step": 2963 + }, + { + "epoch": 39.53511705685619, + "grad_norm": 0.5829086899757385, + "learning_rate": 6.838709677419355e-05, + "loss": 2.5003, + "step": 2964 + }, + { + "epoch": 39.54849498327759, + "grad_norm": 0.617396891117096, + "learning_rate": 6.834260289210233e-05, + "loss": 2.5413, + "step": 2965 + }, + { + "epoch": 39.561872909699, + "grad_norm": 0.6405021548271179, + "learning_rate": 6.829810901001113e-05, + "loss": 2.4371, + "step": 2966 + }, + { + "epoch": 39.575250836120404, + "grad_norm": 0.6281189322471619, + "learning_rate": 6.825361512791991e-05, + "loss": 2.6484, + "step": 2967 + }, + { + "epoch": 39.58862876254181, + "grad_norm": 0.7368245720863342, + "learning_rate": 6.82091212458287e-05, + "loss": 2.5467, + "step": 2968 + }, + { + "epoch": 39.60200668896321, + "grad_norm": 0.6534647941589355, + "learning_rate": 6.816462736373749e-05, + "loss": 2.7322, + "step": 2969 + }, + { + "epoch": 39.61538461538461, + "grad_norm": 0.821247935295105, + "learning_rate": 6.812013348164627e-05, + "loss": 2.4748, + "step": 2970 + }, + { + "epoch": 39.62876254180602, + "grad_norm": 0.6398321390151978, + "learning_rate": 6.807563959955507e-05, + "loss": 2.6597, + "step": 2971 + }, + { + "epoch": 39.642140468227424, + "grad_norm": 0.6529011130332947, + "learning_rate": 6.803114571746385e-05, + "loss": 2.5417, + "step": 2972 + }, + { + "epoch": 39.65551839464883, + "grad_norm": 0.6198508143424988, + "learning_rate": 6.798665183537263e-05, + "loss": 2.4684, + "step": 2973 + }, + { + "epoch": 39.668896321070235, + "grad_norm": 0.6332361698150635, + "learning_rate": 6.794215795328143e-05, + "loss": 2.4961, + "step": 2974 + }, + { + "epoch": 39.68227424749164, + "grad_norm": 0.6088337898254395, + "learning_rate": 6.789766407119021e-05, + "loss": 2.6169, + "step": 2975 + }, + { + "epoch": 39.69565217391305, + "grad_norm": 0.6401640176773071, + "learning_rate": 6.785317018909901e-05, + "loss": 2.4942, + "step": 2976 + }, + { + "epoch": 39.709030100334445, + "grad_norm": 0.6432260870933533, + "learning_rate": 6.780867630700779e-05, + "loss": 2.4886, + "step": 2977 + }, + { + "epoch": 39.72240802675585, + "grad_norm": 0.6184273362159729, + "learning_rate": 6.776418242491657e-05, + "loss": 2.6445, + "step": 2978 + }, + { + "epoch": 39.735785953177256, + "grad_norm": 0.6584762334823608, + "learning_rate": 6.771968854282537e-05, + "loss": 2.5727, + "step": 2979 + }, + { + "epoch": 39.74916387959866, + "grad_norm": 0.577503502368927, + "learning_rate": 6.767519466073415e-05, + "loss": 2.3277, + "step": 2980 + }, + { + "epoch": 39.76254180602007, + "grad_norm": 0.62856525182724, + "learning_rate": 6.763070077864295e-05, + "loss": 2.7912, + "step": 2981 + }, + { + "epoch": 39.77591973244147, + "grad_norm": 0.6405977606773376, + "learning_rate": 6.758620689655173e-05, + "loss": 2.6165, + "step": 2982 + }, + { + "epoch": 39.78929765886288, + "grad_norm": 0.5938370823860168, + "learning_rate": 6.754171301446051e-05, + "loss": 2.6061, + "step": 2983 + }, + { + "epoch": 39.802675585284284, + "grad_norm": 0.6175585985183716, + "learning_rate": 6.749721913236931e-05, + "loss": 2.4875, + "step": 2984 + }, + { + "epoch": 39.81605351170568, + "grad_norm": 0.7778329253196716, + "learning_rate": 6.745272525027809e-05, + "loss": 2.8471, + "step": 2985 + }, + { + "epoch": 39.82943143812709, + "grad_norm": 0.6055701375007629, + "learning_rate": 6.740823136818689e-05, + "loss": 2.4975, + "step": 2986 + }, + { + "epoch": 39.84280936454849, + "grad_norm": 0.68842613697052, + "learning_rate": 6.736373748609567e-05, + "loss": 2.7126, + "step": 2987 + }, + { + "epoch": 39.8561872909699, + "grad_norm": 0.6329212784767151, + "learning_rate": 6.731924360400445e-05, + "loss": 2.6741, + "step": 2988 + }, + { + "epoch": 39.869565217391305, + "grad_norm": 0.6239383220672607, + "learning_rate": 6.727474972191325e-05, + "loss": 2.6681, + "step": 2989 + }, + { + "epoch": 39.88294314381271, + "grad_norm": 0.6401740908622742, + "learning_rate": 6.723025583982203e-05, + "loss": 2.7272, + "step": 2990 + }, + { + "epoch": 39.896321070234116, + "grad_norm": 0.6988689303398132, + "learning_rate": 6.718576195773083e-05, + "loss": 2.7339, + "step": 2991 + }, + { + "epoch": 39.90969899665552, + "grad_norm": 0.6372131705284119, + "learning_rate": 6.714126807563961e-05, + "loss": 2.4379, + "step": 2992 + }, + { + "epoch": 39.92307692307692, + "grad_norm": 0.6818649172782898, + "learning_rate": 6.709677419354839e-05, + "loss": 2.8075, + "step": 2993 + }, + { + "epoch": 39.936454849498325, + "grad_norm": 0.653063952922821, + "learning_rate": 6.705228031145719e-05, + "loss": 2.5813, + "step": 2994 + }, + { + "epoch": 39.94983277591973, + "grad_norm": 0.657010018825531, + "learning_rate": 6.700778642936597e-05, + "loss": 2.7227, + "step": 2995 + }, + { + "epoch": 39.96321070234114, + "grad_norm": 0.6535374522209167, + "learning_rate": 6.696329254727475e-05, + "loss": 2.7369, + "step": 2996 + }, + { + "epoch": 39.97658862876254, + "grad_norm": 0.6069753766059875, + "learning_rate": 6.691879866518353e-05, + "loss": 2.5248, + "step": 2997 + }, + { + "epoch": 39.98996655518395, + "grad_norm": 0.6699077486991882, + "learning_rate": 6.687430478309233e-05, + "loss": 2.6428, + "step": 2998 + }, + { + "epoch": 40.0, + "grad_norm": 0.7744498252868652, + "learning_rate": 6.682981090100111e-05, + "loss": 2.7199, + "step": 2999 + }, + { + "epoch": 40.013377926421406, + "grad_norm": 0.6341271996498108, + "learning_rate": 6.67853170189099e-05, + "loss": 2.5324, + "step": 3000 + }, + { + "epoch": 40.02675585284281, + "grad_norm": 0.6379808783531189, + "learning_rate": 6.674082313681869e-05, + "loss": 2.6118, + "step": 3001 + }, + { + "epoch": 40.04013377926422, + "grad_norm": 0.6141436696052551, + "learning_rate": 6.669632925472747e-05, + "loss": 2.5697, + "step": 3002 + }, + { + "epoch": 40.05351170568562, + "grad_norm": 0.6789229512214661, + "learning_rate": 6.665183537263627e-05, + "loss": 2.6907, + "step": 3003 + }, + { + "epoch": 40.06688963210702, + "grad_norm": 0.6370909214019775, + "learning_rate": 6.660734149054505e-05, + "loss": 2.7101, + "step": 3004 + }, + { + "epoch": 40.080267558528426, + "grad_norm": 0.6512884497642517, + "learning_rate": 6.656284760845383e-05, + "loss": 2.6242, + "step": 3005 + }, + { + "epoch": 40.09364548494983, + "grad_norm": 0.6215392351150513, + "learning_rate": 6.651835372636263e-05, + "loss": 2.513, + "step": 3006 + }, + { + "epoch": 40.10702341137124, + "grad_norm": 0.634795606136322, + "learning_rate": 6.647385984427141e-05, + "loss": 2.6536, + "step": 3007 + }, + { + "epoch": 40.12040133779264, + "grad_norm": 0.6299925446510315, + "learning_rate": 6.642936596218021e-05, + "loss": 2.5797, + "step": 3008 + }, + { + "epoch": 40.13377926421405, + "grad_norm": 0.602271318435669, + "learning_rate": 6.638487208008899e-05, + "loss": 2.4958, + "step": 3009 + }, + { + "epoch": 40.147157190635454, + "grad_norm": 0.6046922206878662, + "learning_rate": 6.634037819799777e-05, + "loss": 2.591, + "step": 3010 + }, + { + "epoch": 40.16053511705686, + "grad_norm": 0.6569716334342957, + "learning_rate": 6.629588431590657e-05, + "loss": 2.7014, + "step": 3011 + }, + { + "epoch": 40.17391304347826, + "grad_norm": 0.8074204325675964, + "learning_rate": 6.625139043381535e-05, + "loss": 2.38, + "step": 3012 + }, + { + "epoch": 40.187290969899664, + "grad_norm": 0.6500153541564941, + "learning_rate": 6.620689655172415e-05, + "loss": 2.5114, + "step": 3013 + }, + { + "epoch": 40.20066889632107, + "grad_norm": 0.6205062866210938, + "learning_rate": 6.616240266963293e-05, + "loss": 2.4542, + "step": 3014 + }, + { + "epoch": 40.214046822742475, + "grad_norm": 0.6093875169754028, + "learning_rate": 6.611790878754171e-05, + "loss": 2.4217, + "step": 3015 + }, + { + "epoch": 40.22742474916388, + "grad_norm": 0.6581112146377563, + "learning_rate": 6.607341490545051e-05, + "loss": 2.5728, + "step": 3016 + }, + { + "epoch": 40.240802675585286, + "grad_norm": 0.6695924401283264, + "learning_rate": 6.602892102335929e-05, + "loss": 2.7236, + "step": 3017 + }, + { + "epoch": 40.25418060200669, + "grad_norm": 0.7370408773422241, + "learning_rate": 6.598442714126809e-05, + "loss": 2.4101, + "step": 3018 + }, + { + "epoch": 40.2675585284281, + "grad_norm": 0.6048701405525208, + "learning_rate": 6.593993325917687e-05, + "loss": 2.517, + "step": 3019 + }, + { + "epoch": 40.280936454849495, + "grad_norm": 0.628162682056427, + "learning_rate": 6.589543937708565e-05, + "loss": 2.3851, + "step": 3020 + }, + { + "epoch": 40.2943143812709, + "grad_norm": 0.6344666481018066, + "learning_rate": 6.585094549499445e-05, + "loss": 2.6034, + "step": 3021 + }, + { + "epoch": 40.30769230769231, + "grad_norm": 0.6417535543441772, + "learning_rate": 6.580645161290323e-05, + "loss": 2.6893, + "step": 3022 + }, + { + "epoch": 40.32107023411371, + "grad_norm": 0.6148284077644348, + "learning_rate": 6.576195773081202e-05, + "loss": 2.4466, + "step": 3023 + }, + { + "epoch": 40.33444816053512, + "grad_norm": 0.6240962147712708, + "learning_rate": 6.571746384872081e-05, + "loss": 2.5427, + "step": 3024 + }, + { + "epoch": 40.34782608695652, + "grad_norm": 0.6034325957298279, + "learning_rate": 6.567296996662959e-05, + "loss": 2.588, + "step": 3025 + }, + { + "epoch": 40.36120401337793, + "grad_norm": 0.6440005302429199, + "learning_rate": 6.562847608453839e-05, + "loss": 2.5972, + "step": 3026 + }, + { + "epoch": 40.374581939799334, + "grad_norm": 0.6619619727134705, + "learning_rate": 6.558398220244717e-05, + "loss": 2.6194, + "step": 3027 + }, + { + "epoch": 40.38795986622073, + "grad_norm": 0.6329185962677002, + "learning_rate": 6.553948832035596e-05, + "loss": 2.463, + "step": 3028 + }, + { + "epoch": 40.40133779264214, + "grad_norm": 0.5924078822135925, + "learning_rate": 6.549499443826475e-05, + "loss": 2.5448, + "step": 3029 + }, + { + "epoch": 40.414715719063544, + "grad_norm": 0.6422038078308105, + "learning_rate": 6.545050055617353e-05, + "loss": 2.5527, + "step": 3030 + }, + { + "epoch": 40.42809364548495, + "grad_norm": 0.8861187696456909, + "learning_rate": 6.540600667408232e-05, + "loss": 2.5809, + "step": 3031 + }, + { + "epoch": 40.441471571906355, + "grad_norm": 0.6287595629692078, + "learning_rate": 6.536151279199111e-05, + "loss": 2.3957, + "step": 3032 + }, + { + "epoch": 40.45484949832776, + "grad_norm": 0.6341630816459656, + "learning_rate": 6.53170189098999e-05, + "loss": 2.7261, + "step": 3033 + }, + { + "epoch": 40.468227424749166, + "grad_norm": 0.6193274855613708, + "learning_rate": 6.527252502780869e-05, + "loss": 2.6581, + "step": 3034 + }, + { + "epoch": 40.48160535117057, + "grad_norm": 0.6899332404136658, + "learning_rate": 6.522803114571747e-05, + "loss": 2.5425, + "step": 3035 + }, + { + "epoch": 40.49498327759197, + "grad_norm": 0.6204500794410706, + "learning_rate": 6.518353726362626e-05, + "loss": 2.5298, + "step": 3036 + }, + { + "epoch": 40.508361204013376, + "grad_norm": 0.642670750617981, + "learning_rate": 6.513904338153505e-05, + "loss": 2.5961, + "step": 3037 + }, + { + "epoch": 40.52173913043478, + "grad_norm": 0.6434988975524902, + "learning_rate": 6.509454949944383e-05, + "loss": 2.5516, + "step": 3038 + }, + { + "epoch": 40.53511705685619, + "grad_norm": 0.62471604347229, + "learning_rate": 6.505005561735261e-05, + "loss": 2.4798, + "step": 3039 + }, + { + "epoch": 40.54849498327759, + "grad_norm": 0.6490535140037537, + "learning_rate": 6.50055617352614e-05, + "loss": 2.6373, + "step": 3040 + }, + { + "epoch": 40.561872909699, + "grad_norm": 0.6673942804336548, + "learning_rate": 6.496106785317019e-05, + "loss": 2.3785, + "step": 3041 + }, + { + "epoch": 40.575250836120404, + "grad_norm": 0.6491774320602417, + "learning_rate": 6.491657397107897e-05, + "loss": 2.7776, + "step": 3042 + }, + { + "epoch": 40.58862876254181, + "grad_norm": 0.6260173916816711, + "learning_rate": 6.487208008898777e-05, + "loss": 2.33, + "step": 3043 + }, + { + "epoch": 40.60200668896321, + "grad_norm": 0.6691784262657166, + "learning_rate": 6.482758620689655e-05, + "loss": 2.5681, + "step": 3044 + }, + { + "epoch": 40.61538461538461, + "grad_norm": 0.6271287798881531, + "learning_rate": 6.478309232480535e-05, + "loss": 2.5003, + "step": 3045 + }, + { + "epoch": 40.62876254180602, + "grad_norm": 0.5982306599617004, + "learning_rate": 6.473859844271413e-05, + "loss": 2.4117, + "step": 3046 + }, + { + "epoch": 40.642140468227424, + "grad_norm": 0.6666748523712158, + "learning_rate": 6.469410456062291e-05, + "loss": 2.9225, + "step": 3047 + }, + { + "epoch": 40.65551839464883, + "grad_norm": 0.7303210496902466, + "learning_rate": 6.46496106785317e-05, + "loss": 2.4966, + "step": 3048 + }, + { + "epoch": 40.668896321070235, + "grad_norm": 0.6557690501213074, + "learning_rate": 6.460511679644049e-05, + "loss": 2.7931, + "step": 3049 + }, + { + "epoch": 40.68227424749164, + "grad_norm": 0.6602864265441895, + "learning_rate": 6.456062291434928e-05, + "loss": 2.6292, + "step": 3050 + }, + { + "epoch": 40.69565217391305, + "grad_norm": 0.6641650199890137, + "learning_rate": 6.451612903225807e-05, + "loss": 2.4421, + "step": 3051 + }, + { + "epoch": 40.709030100334445, + "grad_norm": 0.6553093791007996, + "learning_rate": 6.447163515016685e-05, + "loss": 2.7653, + "step": 3052 + }, + { + "epoch": 40.72240802675585, + "grad_norm": 0.6673446893692017, + "learning_rate": 6.442714126807565e-05, + "loss": 2.7016, + "step": 3053 + }, + { + "epoch": 40.735785953177256, + "grad_norm": 0.6132397055625916, + "learning_rate": 6.438264738598443e-05, + "loss": 2.6634, + "step": 3054 + }, + { + "epoch": 40.74916387959866, + "grad_norm": 0.6243513822555542, + "learning_rate": 6.433815350389321e-05, + "loss": 2.3653, + "step": 3055 + }, + { + "epoch": 40.76254180602007, + "grad_norm": 0.6419558525085449, + "learning_rate": 6.4293659621802e-05, + "loss": 2.5712, + "step": 3056 + }, + { + "epoch": 40.77591973244147, + "grad_norm": 0.6716275215148926, + "learning_rate": 6.424916573971079e-05, + "loss": 2.645, + "step": 3057 + }, + { + "epoch": 40.78929765886288, + "grad_norm": 0.6217707991600037, + "learning_rate": 6.420467185761958e-05, + "loss": 2.427, + "step": 3058 + }, + { + "epoch": 40.802675585284284, + "grad_norm": 0.6085132956504822, + "learning_rate": 6.416017797552837e-05, + "loss": 2.6017, + "step": 3059 + }, + { + "epoch": 40.81605351170568, + "grad_norm": 0.6166917681694031, + "learning_rate": 6.411568409343715e-05, + "loss": 2.383, + "step": 3060 + }, + { + "epoch": 40.82943143812709, + "grad_norm": 0.607100248336792, + "learning_rate": 6.407119021134595e-05, + "loss": 2.5013, + "step": 3061 + }, + { + "epoch": 40.84280936454849, + "grad_norm": 0.641970694065094, + "learning_rate": 6.402669632925473e-05, + "loss": 2.4467, + "step": 3062 + }, + { + "epoch": 40.8561872909699, + "grad_norm": 0.598092257976532, + "learning_rate": 6.398220244716352e-05, + "loss": 2.3341, + "step": 3063 + }, + { + "epoch": 40.869565217391305, + "grad_norm": 0.6640897393226624, + "learning_rate": 6.39377085650723e-05, + "loss": 2.533, + "step": 3064 + }, + { + "epoch": 40.88294314381271, + "grad_norm": 0.6127122044563293, + "learning_rate": 6.389321468298109e-05, + "loss": 2.3842, + "step": 3065 + }, + { + "epoch": 40.896321070234116, + "grad_norm": 0.6670165657997131, + "learning_rate": 6.384872080088988e-05, + "loss": 2.5481, + "step": 3066 + }, + { + "epoch": 40.90969899665552, + "grad_norm": 0.6466744542121887, + "learning_rate": 6.380422691879867e-05, + "loss": 2.7235, + "step": 3067 + }, + { + "epoch": 40.92307692307692, + "grad_norm": 0.6578346490859985, + "learning_rate": 6.375973303670746e-05, + "loss": 2.779, + "step": 3068 + }, + { + "epoch": 40.936454849498325, + "grad_norm": 0.6433802843093872, + "learning_rate": 6.371523915461625e-05, + "loss": 2.3942, + "step": 3069 + }, + { + "epoch": 40.94983277591973, + "grad_norm": 0.6575675010681152, + "learning_rate": 6.367074527252503e-05, + "loss": 2.705, + "step": 3070 + }, + { + "epoch": 40.96321070234114, + "grad_norm": 0.6280662417411804, + "learning_rate": 6.362625139043382e-05, + "loss": 2.4813, + "step": 3071 + }, + { + "epoch": 40.97658862876254, + "grad_norm": 0.6336925625801086, + "learning_rate": 6.35817575083426e-05, + "loss": 2.6624, + "step": 3072 + }, + { + "epoch": 40.98996655518395, + "grad_norm": 0.6524997353553772, + "learning_rate": 6.35372636262514e-05, + "loss": 2.7407, + "step": 3073 + }, + { + "epoch": 41.0, + "grad_norm": 0.7705938816070557, + "learning_rate": 6.349276974416018e-05, + "loss": 2.4095, + "step": 3074 + }, + { + "epoch": 41.013377926421406, + "grad_norm": 0.6220411658287048, + "learning_rate": 6.344827586206897e-05, + "loss": 2.6689, + "step": 3075 + }, + { + "epoch": 41.02675585284281, + "grad_norm": 0.7290953397750854, + "learning_rate": 6.340378197997776e-05, + "loss": 2.5802, + "step": 3076 + }, + { + "epoch": 41.04013377926422, + "grad_norm": 0.8503483533859253, + "learning_rate": 6.335928809788654e-05, + "loss": 2.4214, + "step": 3077 + }, + { + "epoch": 41.05351170568562, + "grad_norm": 0.6377903819084167, + "learning_rate": 6.331479421579534e-05, + "loss": 2.4564, + "step": 3078 + }, + { + "epoch": 41.06688963210702, + "grad_norm": 0.6296485662460327, + "learning_rate": 6.327030033370411e-05, + "loss": 2.3787, + "step": 3079 + }, + { + "epoch": 41.080267558528426, + "grad_norm": 0.680673360824585, + "learning_rate": 6.32258064516129e-05, + "loss": 2.6136, + "step": 3080 + }, + { + "epoch": 41.09364548494983, + "grad_norm": 0.67911297082901, + "learning_rate": 6.318131256952169e-05, + "loss": 2.6889, + "step": 3081 + }, + { + "epoch": 41.10702341137124, + "grad_norm": 0.7195813655853271, + "learning_rate": 6.313681868743047e-05, + "loss": 2.5189, + "step": 3082 + }, + { + "epoch": 41.12040133779264, + "grad_norm": 0.644444465637207, + "learning_rate": 6.309232480533927e-05, + "loss": 2.4799, + "step": 3083 + }, + { + "epoch": 41.13377926421405, + "grad_norm": 0.6452529430389404, + "learning_rate": 6.304783092324805e-05, + "loss": 2.452, + "step": 3084 + }, + { + "epoch": 41.147157190635454, + "grad_norm": 0.6581646800041199, + "learning_rate": 6.300333704115684e-05, + "loss": 2.5695, + "step": 3085 + }, + { + "epoch": 41.16053511705686, + "grad_norm": 0.6414064168930054, + "learning_rate": 6.295884315906563e-05, + "loss": 2.4686, + "step": 3086 + }, + { + "epoch": 41.17391304347826, + "grad_norm": 0.620027482509613, + "learning_rate": 6.291434927697441e-05, + "loss": 2.4335, + "step": 3087 + }, + { + "epoch": 41.187290969899664, + "grad_norm": 0.6725049018859863, + "learning_rate": 6.28698553948832e-05, + "loss": 2.4423, + "step": 3088 + }, + { + "epoch": 41.20066889632107, + "grad_norm": 0.6626586318016052, + "learning_rate": 6.282536151279199e-05, + "loss": 2.3875, + "step": 3089 + }, + { + "epoch": 41.214046822742475, + "grad_norm": 0.6290739178657532, + "learning_rate": 6.278086763070078e-05, + "loss": 2.6957, + "step": 3090 + }, + { + "epoch": 41.22742474916388, + "grad_norm": 0.6320621967315674, + "learning_rate": 6.273637374860957e-05, + "loss": 2.6297, + "step": 3091 + }, + { + "epoch": 41.240802675585286, + "grad_norm": 0.6563973426818848, + "learning_rate": 6.269187986651835e-05, + "loss": 2.5361, + "step": 3092 + }, + { + "epoch": 41.25418060200669, + "grad_norm": 0.6116068363189697, + "learning_rate": 6.264738598442714e-05, + "loss": 2.644, + "step": 3093 + }, + { + "epoch": 41.2675585284281, + "grad_norm": 0.6175321340560913, + "learning_rate": 6.260289210233593e-05, + "loss": 2.2662, + "step": 3094 + }, + { + "epoch": 41.280936454849495, + "grad_norm": 0.6325392723083496, + "learning_rate": 6.255839822024472e-05, + "loss": 2.4981, + "step": 3095 + }, + { + "epoch": 41.2943143812709, + "grad_norm": 0.6011773943901062, + "learning_rate": 6.25139043381535e-05, + "loss": 2.4708, + "step": 3096 + }, + { + "epoch": 41.30769230769231, + "grad_norm": 0.6720942258834839, + "learning_rate": 6.246941045606229e-05, + "loss": 2.6639, + "step": 3097 + }, + { + "epoch": 41.32107023411371, + "grad_norm": 0.6427400708198547, + "learning_rate": 6.242491657397108e-05, + "loss": 2.3893, + "step": 3098 + }, + { + "epoch": 41.33444816053512, + "grad_norm": 0.6931129693984985, + "learning_rate": 6.238042269187987e-05, + "loss": 2.6919, + "step": 3099 + }, + { + "epoch": 41.34782608695652, + "grad_norm": 0.6366786956787109, + "learning_rate": 6.233592880978866e-05, + "loss": 2.3554, + "step": 3100 + }, + { + "epoch": 41.36120401337793, + "grad_norm": 0.5965732336044312, + "learning_rate": 6.229143492769744e-05, + "loss": 2.5368, + "step": 3101 + }, + { + "epoch": 41.374581939799334, + "grad_norm": 0.6699475646018982, + "learning_rate": 6.224694104560623e-05, + "loss": 2.614, + "step": 3102 + }, + { + "epoch": 41.38795986622073, + "grad_norm": 0.6514851450920105, + "learning_rate": 6.220244716351502e-05, + "loss": 2.611, + "step": 3103 + }, + { + "epoch": 41.40133779264214, + "grad_norm": 0.6431285738945007, + "learning_rate": 6.21579532814238e-05, + "loss": 2.5508, + "step": 3104 + }, + { + "epoch": 41.414715719063544, + "grad_norm": 0.6158801913261414, + "learning_rate": 6.21134593993326e-05, + "loss": 2.5008, + "step": 3105 + }, + { + "epoch": 41.42809364548495, + "grad_norm": 0.6571468114852905, + "learning_rate": 6.206896551724138e-05, + "loss": 2.5121, + "step": 3106 + }, + { + "epoch": 41.441471571906355, + "grad_norm": 0.6691536903381348, + "learning_rate": 6.202447163515017e-05, + "loss": 2.7706, + "step": 3107 + }, + { + "epoch": 41.45484949832776, + "grad_norm": 0.6521206498146057, + "learning_rate": 6.197997775305896e-05, + "loss": 2.5349, + "step": 3108 + }, + { + "epoch": 41.468227424749166, + "grad_norm": 0.6497208476066589, + "learning_rate": 6.193548387096774e-05, + "loss": 2.5069, + "step": 3109 + }, + { + "epoch": 41.48160535117057, + "grad_norm": 0.6708588004112244, + "learning_rate": 6.189098998887654e-05, + "loss": 2.5195, + "step": 3110 + }, + { + "epoch": 41.49498327759197, + "grad_norm": 0.6343464851379395, + "learning_rate": 6.184649610678532e-05, + "loss": 2.5025, + "step": 3111 + }, + { + "epoch": 41.508361204013376, + "grad_norm": 0.6211134195327759, + "learning_rate": 6.18020022246941e-05, + "loss": 2.3835, + "step": 3112 + }, + { + "epoch": 41.52173913043478, + "grad_norm": 0.7531595230102539, + "learning_rate": 6.17575083426029e-05, + "loss": 2.6583, + "step": 3113 + }, + { + "epoch": 41.53511705685619, + "grad_norm": 0.6848407983779907, + "learning_rate": 6.171301446051168e-05, + "loss": 2.5161, + "step": 3114 + }, + { + "epoch": 41.54849498327759, + "grad_norm": 0.6121324300765991, + "learning_rate": 6.166852057842048e-05, + "loss": 2.6031, + "step": 3115 + }, + { + "epoch": 41.561872909699, + "grad_norm": 0.6391879320144653, + "learning_rate": 6.162402669632926e-05, + "loss": 2.4762, + "step": 3116 + }, + { + "epoch": 41.575250836120404, + "grad_norm": 0.6806944012641907, + "learning_rate": 6.157953281423804e-05, + "loss": 2.4322, + "step": 3117 + }, + { + "epoch": 41.58862876254181, + "grad_norm": 0.5879799127578735, + "learning_rate": 6.153503893214684e-05, + "loss": 2.4121, + "step": 3118 + }, + { + "epoch": 41.60200668896321, + "grad_norm": 0.6548652052879333, + "learning_rate": 6.149054505005562e-05, + "loss": 2.516, + "step": 3119 + }, + { + "epoch": 41.61538461538461, + "grad_norm": 0.6357155442237854, + "learning_rate": 6.14460511679644e-05, + "loss": 2.6071, + "step": 3120 + }, + { + "epoch": 41.62876254180602, + "grad_norm": 0.6546043753623962, + "learning_rate": 6.140155728587319e-05, + "loss": 2.5423, + "step": 3121 + }, + { + "epoch": 41.642140468227424, + "grad_norm": 0.6443083882331848, + "learning_rate": 6.135706340378198e-05, + "loss": 2.4373, + "step": 3122 + }, + { + "epoch": 41.65551839464883, + "grad_norm": 0.599860429763794, + "learning_rate": 6.131256952169077e-05, + "loss": 2.3978, + "step": 3123 + }, + { + "epoch": 41.668896321070235, + "grad_norm": 0.6442713737487793, + "learning_rate": 6.126807563959955e-05, + "loss": 2.437, + "step": 3124 + }, + { + "epoch": 41.68227424749164, + "grad_norm": 0.704804003238678, + "learning_rate": 6.122358175750834e-05, + "loss": 2.5634, + "step": 3125 + }, + { + "epoch": 41.69565217391305, + "grad_norm": 0.6029696464538574, + "learning_rate": 6.117908787541713e-05, + "loss": 2.5276, + "step": 3126 + }, + { + "epoch": 41.709030100334445, + "grad_norm": 0.6845595240592957, + "learning_rate": 6.113459399332592e-05, + "loss": 2.6695, + "step": 3127 + }, + { + "epoch": 41.72240802675585, + "grad_norm": 0.6166744232177734, + "learning_rate": 6.10901001112347e-05, + "loss": 2.5261, + "step": 3128 + }, + { + "epoch": 41.735785953177256, + "grad_norm": 0.6824954152107239, + "learning_rate": 6.104560622914349e-05, + "loss": 2.6611, + "step": 3129 + }, + { + "epoch": 41.74916387959866, + "grad_norm": 0.6522831320762634, + "learning_rate": 6.100111234705228e-05, + "loss": 2.5558, + "step": 3130 + }, + { + "epoch": 41.76254180602007, + "grad_norm": 0.6635300517082214, + "learning_rate": 6.0956618464961065e-05, + "loss": 2.6164, + "step": 3131 + }, + { + "epoch": 41.77591973244147, + "grad_norm": 0.6519820094108582, + "learning_rate": 6.0912124582869854e-05, + "loss": 2.5598, + "step": 3132 + }, + { + "epoch": 41.78929765886288, + "grad_norm": 0.6299057006835938, + "learning_rate": 6.086763070077864e-05, + "loss": 2.3672, + "step": 3133 + }, + { + "epoch": 41.802675585284284, + "grad_norm": 0.6761189103126526, + "learning_rate": 6.082313681868743e-05, + "loss": 2.4973, + "step": 3134 + }, + { + "epoch": 41.81605351170568, + "grad_norm": 0.6459671258926392, + "learning_rate": 6.077864293659622e-05, + "loss": 2.5184, + "step": 3135 + }, + { + "epoch": 41.82943143812709, + "grad_norm": 0.6295469999313354, + "learning_rate": 6.0734149054505004e-05, + "loss": 2.5129, + "step": 3136 + }, + { + "epoch": 41.84280936454849, + "grad_norm": 0.6619572639465332, + "learning_rate": 6.068965517241379e-05, + "loss": 2.602, + "step": 3137 + }, + { + "epoch": 41.8561872909699, + "grad_norm": 0.619455873966217, + "learning_rate": 6.064516129032258e-05, + "loss": 2.3657, + "step": 3138 + }, + { + "epoch": 41.869565217391305, + "grad_norm": 0.6936202049255371, + "learning_rate": 6.060066740823137e-05, + "loss": 2.6134, + "step": 3139 + }, + { + "epoch": 41.88294314381271, + "grad_norm": 0.7031989097595215, + "learning_rate": 6.055617352614016e-05, + "loss": 2.7589, + "step": 3140 + }, + { + "epoch": 41.896321070234116, + "grad_norm": 0.6757580041885376, + "learning_rate": 6.051167964404894e-05, + "loss": 2.5874, + "step": 3141 + }, + { + "epoch": 41.90969899665552, + "grad_norm": 0.6400772333145142, + "learning_rate": 6.046718576195773e-05, + "loss": 2.5219, + "step": 3142 + }, + { + "epoch": 41.92307692307692, + "grad_norm": 0.652505099773407, + "learning_rate": 6.042269187986652e-05, + "loss": 2.8772, + "step": 3143 + }, + { + "epoch": 41.936454849498325, + "grad_norm": 0.6441666483879089, + "learning_rate": 6.037819799777531e-05, + "loss": 2.5647, + "step": 3144 + }, + { + "epoch": 41.94983277591973, + "grad_norm": 0.6462740898132324, + "learning_rate": 6.03337041156841e-05, + "loss": 2.755, + "step": 3145 + }, + { + "epoch": 41.96321070234114, + "grad_norm": 0.6710271835327148, + "learning_rate": 6.028921023359288e-05, + "loss": 2.5574, + "step": 3146 + }, + { + "epoch": 41.97658862876254, + "grad_norm": 0.6961186528205872, + "learning_rate": 6.024471635150167e-05, + "loss": 2.5408, + "step": 3147 + }, + { + "epoch": 41.98996655518395, + "grad_norm": 0.6388704180717468, + "learning_rate": 6.020022246941046e-05, + "loss": 2.4168, + "step": 3148 + }, + { + "epoch": 42.0, + "grad_norm": 0.8058933615684509, + "learning_rate": 6.015572858731925e-05, + "loss": 2.7006, + "step": 3149 + }, + { + "epoch": 42.013377926421406, + "grad_norm": 0.6950250864028931, + "learning_rate": 6.011123470522804e-05, + "loss": 2.5687, + "step": 3150 + }, + { + "epoch": 42.02675585284281, + "grad_norm": 0.6203860640525818, + "learning_rate": 6.006674082313682e-05, + "loss": 2.5466, + "step": 3151 + }, + { + "epoch": 42.04013377926422, + "grad_norm": 0.6443012952804565, + "learning_rate": 6.002224694104561e-05, + "loss": 2.6772, + "step": 3152 + }, + { + "epoch": 42.05351170568562, + "grad_norm": 0.6492132544517517, + "learning_rate": 5.99777530589544e-05, + "loss": 2.6421, + "step": 3153 + }, + { + "epoch": 42.06688963210702, + "grad_norm": 0.6684079766273499, + "learning_rate": 5.993325917686319e-05, + "loss": 2.4521, + "step": 3154 + }, + { + "epoch": 42.080267558528426, + "grad_norm": 0.6225764751434326, + "learning_rate": 5.988876529477198e-05, + "loss": 2.3612, + "step": 3155 + }, + { + "epoch": 42.09364548494983, + "grad_norm": 0.7330244183540344, + "learning_rate": 5.984427141268076e-05, + "loss": 2.6417, + "step": 3156 + }, + { + "epoch": 42.10702341137124, + "grad_norm": 0.671696662902832, + "learning_rate": 5.979977753058955e-05, + "loss": 2.4971, + "step": 3157 + }, + { + "epoch": 42.12040133779264, + "grad_norm": 0.6462723612785339, + "learning_rate": 5.975528364849834e-05, + "loss": 2.4472, + "step": 3158 + }, + { + "epoch": 42.13377926421405, + "grad_norm": 0.646503210067749, + "learning_rate": 5.971078976640713e-05, + "loss": 2.5354, + "step": 3159 + }, + { + "epoch": 42.147157190635454, + "grad_norm": 0.6394603848457336, + "learning_rate": 5.966629588431592e-05, + "loss": 2.4596, + "step": 3160 + }, + { + "epoch": 42.16053511705686, + "grad_norm": 0.6669955253601074, + "learning_rate": 5.962180200222469e-05, + "loss": 2.4145, + "step": 3161 + }, + { + "epoch": 42.17391304347826, + "grad_norm": 0.756793737411499, + "learning_rate": 5.957730812013348e-05, + "loss": 2.5212, + "step": 3162 + }, + { + "epoch": 42.187290969899664, + "grad_norm": 0.6672840714454651, + "learning_rate": 5.9532814238042264e-05, + "loss": 2.4857, + "step": 3163 + }, + { + "epoch": 42.20066889632107, + "grad_norm": 0.6417210698127747, + "learning_rate": 5.948832035595105e-05, + "loss": 2.2809, + "step": 3164 + }, + { + "epoch": 42.214046822742475, + "grad_norm": 0.8348161578178406, + "learning_rate": 5.944382647385984e-05, + "loss": 2.4916, + "step": 3165 + }, + { + "epoch": 42.22742474916388, + "grad_norm": 0.6534797549247742, + "learning_rate": 5.939933259176863e-05, + "loss": 2.5452, + "step": 3166 + }, + { + "epoch": 42.240802675585286, + "grad_norm": 0.6620765924453735, + "learning_rate": 5.935483870967742e-05, + "loss": 2.4918, + "step": 3167 + }, + { + "epoch": 42.25418060200669, + "grad_norm": 0.6680149435997009, + "learning_rate": 5.93103448275862e-05, + "loss": 2.5057, + "step": 3168 + }, + { + "epoch": 42.2675585284281, + "grad_norm": 0.6373328566551208, + "learning_rate": 5.926585094549499e-05, + "loss": 2.4261, + "step": 3169 + }, + { + "epoch": 42.280936454849495, + "grad_norm": 0.647787868976593, + "learning_rate": 5.922135706340378e-05, + "loss": 2.5882, + "step": 3170 + }, + { + "epoch": 42.2943143812709, + "grad_norm": 0.7209821343421936, + "learning_rate": 5.917686318131257e-05, + "loss": 2.5832, + "step": 3171 + }, + { + "epoch": 42.30769230769231, + "grad_norm": 0.650434672832489, + "learning_rate": 5.913236929922136e-05, + "loss": 2.6609, + "step": 3172 + }, + { + "epoch": 42.32107023411371, + "grad_norm": 0.6905040740966797, + "learning_rate": 5.908787541713014e-05, + "loss": 2.4769, + "step": 3173 + }, + { + "epoch": 42.33444816053512, + "grad_norm": 0.6566370725631714, + "learning_rate": 5.904338153503893e-05, + "loss": 2.3585, + "step": 3174 + }, + { + "epoch": 42.34782608695652, + "grad_norm": 0.6680585741996765, + "learning_rate": 5.899888765294772e-05, + "loss": 2.4853, + "step": 3175 + }, + { + "epoch": 42.36120401337793, + "grad_norm": 0.6622024774551392, + "learning_rate": 5.895439377085651e-05, + "loss": 2.525, + "step": 3176 + }, + { + "epoch": 42.374581939799334, + "grad_norm": 0.6646168828010559, + "learning_rate": 5.89098998887653e-05, + "loss": 2.7159, + "step": 3177 + }, + { + "epoch": 42.38795986622073, + "grad_norm": 0.6752247214317322, + "learning_rate": 5.886540600667408e-05, + "loss": 2.6018, + "step": 3178 + }, + { + "epoch": 42.40133779264214, + "grad_norm": 0.7147390842437744, + "learning_rate": 5.882091212458287e-05, + "loss": 2.52, + "step": 3179 + }, + { + "epoch": 42.414715719063544, + "grad_norm": 0.7092293500900269, + "learning_rate": 5.877641824249166e-05, + "loss": 2.3825, + "step": 3180 + }, + { + "epoch": 42.42809364548495, + "grad_norm": 0.6321873664855957, + "learning_rate": 5.873192436040045e-05, + "loss": 2.5694, + "step": 3181 + }, + { + "epoch": 42.441471571906355, + "grad_norm": 0.7254124879837036, + "learning_rate": 5.868743047830924e-05, + "loss": 2.5501, + "step": 3182 + }, + { + "epoch": 42.45484949832776, + "grad_norm": 0.6851321458816528, + "learning_rate": 5.864293659621802e-05, + "loss": 2.3981, + "step": 3183 + }, + { + "epoch": 42.468227424749166, + "grad_norm": 0.7157125473022461, + "learning_rate": 5.859844271412681e-05, + "loss": 2.4062, + "step": 3184 + }, + { + "epoch": 42.48160535117057, + "grad_norm": 0.6364109516143799, + "learning_rate": 5.85539488320356e-05, + "loss": 2.4445, + "step": 3185 + }, + { + "epoch": 42.49498327759197, + "grad_norm": 0.7198134064674377, + "learning_rate": 5.850945494994439e-05, + "loss": 2.4817, + "step": 3186 + }, + { + "epoch": 42.508361204013376, + "grad_norm": 0.6572685837745667, + "learning_rate": 5.846496106785318e-05, + "loss": 2.7384, + "step": 3187 + }, + { + "epoch": 42.52173913043478, + "grad_norm": 0.6623586416244507, + "learning_rate": 5.842046718576196e-05, + "loss": 2.4099, + "step": 3188 + }, + { + "epoch": 42.53511705685619, + "grad_norm": 0.6533327102661133, + "learning_rate": 5.837597330367075e-05, + "loss": 2.4957, + "step": 3189 + }, + { + "epoch": 42.54849498327759, + "grad_norm": 0.706375241279602, + "learning_rate": 5.833147942157954e-05, + "loss": 2.6125, + "step": 3190 + }, + { + "epoch": 42.561872909699, + "grad_norm": 0.6902095675468445, + "learning_rate": 5.828698553948833e-05, + "loss": 2.616, + "step": 3191 + }, + { + "epoch": 42.575250836120404, + "grad_norm": 0.6547108888626099, + "learning_rate": 5.8242491657397116e-05, + "loss": 2.3856, + "step": 3192 + }, + { + "epoch": 42.58862876254181, + "grad_norm": 0.6478321552276611, + "learning_rate": 5.81979977753059e-05, + "loss": 2.5016, + "step": 3193 + }, + { + "epoch": 42.60200668896321, + "grad_norm": 0.6727077960968018, + "learning_rate": 5.815350389321469e-05, + "loss": 2.4331, + "step": 3194 + }, + { + "epoch": 42.61538461538461, + "grad_norm": 0.6678333878517151, + "learning_rate": 5.810901001112348e-05, + "loss": 2.5317, + "step": 3195 + }, + { + "epoch": 42.62876254180602, + "grad_norm": 0.7883992791175842, + "learning_rate": 5.8064516129032266e-05, + "loss": 2.5823, + "step": 3196 + }, + { + "epoch": 42.642140468227424, + "grad_norm": 0.6439074873924255, + "learning_rate": 5.8020022246941055e-05, + "loss": 2.4141, + "step": 3197 + }, + { + "epoch": 42.65551839464883, + "grad_norm": 0.6541165113449097, + "learning_rate": 5.797552836484984e-05, + "loss": 2.5384, + "step": 3198 + }, + { + "epoch": 42.668896321070235, + "grad_norm": 0.6592057943344116, + "learning_rate": 5.7931034482758627e-05, + "loss": 2.4035, + "step": 3199 + }, + { + "epoch": 42.68227424749164, + "grad_norm": 0.6413922905921936, + "learning_rate": 5.7886540600667416e-05, + "loss": 2.4653, + "step": 3200 + }, + { + "epoch": 42.69565217391305, + "grad_norm": 0.6790466904640198, + "learning_rate": 5.7842046718576205e-05, + "loss": 2.5069, + "step": 3201 + }, + { + "epoch": 42.709030100334445, + "grad_norm": 0.6454100608825684, + "learning_rate": 5.779755283648498e-05, + "loss": 2.4377, + "step": 3202 + }, + { + "epoch": 42.72240802675585, + "grad_norm": 0.6789502501487732, + "learning_rate": 5.775305895439377e-05, + "loss": 2.6028, + "step": 3203 + }, + { + "epoch": 42.735785953177256, + "grad_norm": 0.6526380777359009, + "learning_rate": 5.770856507230256e-05, + "loss": 2.544, + "step": 3204 + }, + { + "epoch": 42.74916387959866, + "grad_norm": 0.6715470552444458, + "learning_rate": 5.766407119021134e-05, + "loss": 2.5226, + "step": 3205 + }, + { + "epoch": 42.76254180602007, + "grad_norm": 0.6436430811882019, + "learning_rate": 5.761957730812013e-05, + "loss": 2.4716, + "step": 3206 + }, + { + "epoch": 42.77591973244147, + "grad_norm": 0.6560038328170776, + "learning_rate": 5.757508342602892e-05, + "loss": 2.575, + "step": 3207 + }, + { + "epoch": 42.78929765886288, + "grad_norm": 0.6760275363922119, + "learning_rate": 5.753058954393771e-05, + "loss": 2.6278, + "step": 3208 + }, + { + "epoch": 42.802675585284284, + "grad_norm": 0.6424068212509155, + "learning_rate": 5.74860956618465e-05, + "loss": 2.5249, + "step": 3209 + }, + { + "epoch": 42.81605351170568, + "grad_norm": 0.6159402132034302, + "learning_rate": 5.744160177975528e-05, + "loss": 2.4096, + "step": 3210 + }, + { + "epoch": 42.82943143812709, + "grad_norm": 0.6637108325958252, + "learning_rate": 5.739710789766407e-05, + "loss": 2.5771, + "step": 3211 + }, + { + "epoch": 42.84280936454849, + "grad_norm": 0.6485425233840942, + "learning_rate": 5.735261401557286e-05, + "loss": 2.4803, + "step": 3212 + }, + { + "epoch": 42.8561872909699, + "grad_norm": 0.6670514941215515, + "learning_rate": 5.730812013348165e-05, + "loss": 2.7293, + "step": 3213 + }, + { + "epoch": 42.869565217391305, + "grad_norm": 0.6216664910316467, + "learning_rate": 5.726362625139044e-05, + "loss": 2.488, + "step": 3214 + }, + { + "epoch": 42.88294314381271, + "grad_norm": 0.6202486157417297, + "learning_rate": 5.721913236929922e-05, + "loss": 2.4099, + "step": 3215 + }, + { + "epoch": 42.896321070234116, + "grad_norm": 0.6572627425193787, + "learning_rate": 5.717463848720801e-05, + "loss": 2.6004, + "step": 3216 + }, + { + "epoch": 42.90969899665552, + "grad_norm": 0.6133331060409546, + "learning_rate": 5.71301446051168e-05, + "loss": 2.4422, + "step": 3217 + }, + { + "epoch": 42.92307692307692, + "grad_norm": 0.6674253940582275, + "learning_rate": 5.708565072302559e-05, + "loss": 2.5856, + "step": 3218 + }, + { + "epoch": 42.936454849498325, + "grad_norm": 0.6581224799156189, + "learning_rate": 5.7041156840934376e-05, + "loss": 2.4517, + "step": 3219 + }, + { + "epoch": 42.94983277591973, + "grad_norm": 0.6469165682792664, + "learning_rate": 5.699666295884316e-05, + "loss": 2.6374, + "step": 3220 + }, + { + "epoch": 42.96321070234114, + "grad_norm": 0.6413242816925049, + "learning_rate": 5.695216907675195e-05, + "loss": 2.5076, + "step": 3221 + }, + { + "epoch": 42.97658862876254, + "grad_norm": 0.6252263188362122, + "learning_rate": 5.690767519466074e-05, + "loss": 2.4679, + "step": 3222 + }, + { + "epoch": 42.98996655518395, + "grad_norm": 0.7174087166786194, + "learning_rate": 5.6863181312569526e-05, + "loss": 2.6383, + "step": 3223 + }, + { + "epoch": 43.0, + "grad_norm": 0.7846776247024536, + "learning_rate": 5.6818687430478315e-05, + "loss": 2.7, + "step": 3224 + }, + { + "epoch": 43.013377926421406, + "grad_norm": 0.6958154439926147, + "learning_rate": 5.67741935483871e-05, + "loss": 2.5076, + "step": 3225 + }, + { + "epoch": 43.02675585284281, + "grad_norm": 0.6751554608345032, + "learning_rate": 5.672969966629589e-05, + "loss": 2.6423, + "step": 3226 + }, + { + "epoch": 43.04013377926422, + "grad_norm": 0.6690580248832703, + "learning_rate": 5.6685205784204676e-05, + "loss": 2.5008, + "step": 3227 + }, + { + "epoch": 43.05351170568562, + "grad_norm": 0.7116791605949402, + "learning_rate": 5.6640711902113465e-05, + "loss": 2.3609, + "step": 3228 + }, + { + "epoch": 43.06688963210702, + "grad_norm": 0.626956582069397, + "learning_rate": 5.6596218020022254e-05, + "loss": 2.471, + "step": 3229 + }, + { + "epoch": 43.080267558528426, + "grad_norm": 0.6250149607658386, + "learning_rate": 5.6551724137931037e-05, + "loss": 2.4207, + "step": 3230 + }, + { + "epoch": 43.09364548494983, + "grad_norm": 0.6632840037345886, + "learning_rate": 5.6507230255839826e-05, + "loss": 2.4791, + "step": 3231 + }, + { + "epoch": 43.10702341137124, + "grad_norm": 1.4978615045547485, + "learning_rate": 5.6462736373748615e-05, + "loss": 2.3859, + "step": 3232 + }, + { + "epoch": 43.12040133779264, + "grad_norm": 0.7302461862564087, + "learning_rate": 5.6418242491657404e-05, + "loss": 2.4571, + "step": 3233 + }, + { + "epoch": 43.13377926421405, + "grad_norm": 0.6785010099411011, + "learning_rate": 5.637374860956619e-05, + "loss": 2.4689, + "step": 3234 + }, + { + "epoch": 43.147157190635454, + "grad_norm": 0.6378248929977417, + "learning_rate": 5.6329254727474976e-05, + "loss": 2.5509, + "step": 3235 + }, + { + "epoch": 43.16053511705686, + "grad_norm": 0.6675599217414856, + "learning_rate": 5.6284760845383765e-05, + "loss": 2.6542, + "step": 3236 + }, + { + "epoch": 43.17391304347826, + "grad_norm": 0.6947749853134155, + "learning_rate": 5.6240266963292554e-05, + "loss": 2.7001, + "step": 3237 + }, + { + "epoch": 43.187290969899664, + "grad_norm": 0.6531405448913574, + "learning_rate": 5.619577308120134e-05, + "loss": 2.4718, + "step": 3238 + }, + { + "epoch": 43.20066889632107, + "grad_norm": 0.6625925898551941, + "learning_rate": 5.615127919911013e-05, + "loss": 2.344, + "step": 3239 + }, + { + "epoch": 43.214046822742475, + "grad_norm": 1.235804796218872, + "learning_rate": 5.6106785317018915e-05, + "loss": 2.5819, + "step": 3240 + }, + { + "epoch": 43.22742474916388, + "grad_norm": 0.6504685878753662, + "learning_rate": 5.6062291434927704e-05, + "loss": 2.3638, + "step": 3241 + }, + { + "epoch": 43.240802675585286, + "grad_norm": 0.7184932231903076, + "learning_rate": 5.601779755283649e-05, + "loss": 2.4465, + "step": 3242 + }, + { + "epoch": 43.25418060200669, + "grad_norm": 0.6429609656333923, + "learning_rate": 5.597330367074528e-05, + "loss": 2.1915, + "step": 3243 + }, + { + "epoch": 43.2675585284281, + "grad_norm": 0.7342157959938049, + "learning_rate": 5.592880978865406e-05, + "loss": 2.3047, + "step": 3244 + }, + { + "epoch": 43.280936454849495, + "grad_norm": 0.7221858501434326, + "learning_rate": 5.588431590656285e-05, + "loss": 2.5297, + "step": 3245 + }, + { + "epoch": 43.2943143812709, + "grad_norm": 0.7088733315467834, + "learning_rate": 5.5839822024471636e-05, + "loss": 2.6321, + "step": 3246 + }, + { + "epoch": 43.30769230769231, + "grad_norm": 0.6901667714118958, + "learning_rate": 5.579532814238042e-05, + "loss": 2.4381, + "step": 3247 + }, + { + "epoch": 43.32107023411371, + "grad_norm": 0.6635287404060364, + "learning_rate": 5.575083426028921e-05, + "loss": 2.5138, + "step": 3248 + }, + { + "epoch": 43.33444816053512, + "grad_norm": 0.6960332989692688, + "learning_rate": 5.5706340378198e-05, + "loss": 2.6753, + "step": 3249 + }, + { + "epoch": 43.34782608695652, + "grad_norm": 0.7167268395423889, + "learning_rate": 5.5661846496106786e-05, + "loss": 2.7389, + "step": 3250 + }, + { + "epoch": 43.36120401337793, + "grad_norm": 0.6687808036804199, + "learning_rate": 5.5617352614015575e-05, + "loss": 2.4048, + "step": 3251 + }, + { + "epoch": 43.374581939799334, + "grad_norm": 0.6490589380264282, + "learning_rate": 5.557285873192436e-05, + "loss": 2.5123, + "step": 3252 + }, + { + "epoch": 43.38795986622073, + "grad_norm": 0.6539404392242432, + "learning_rate": 5.552836484983315e-05, + "loss": 2.676, + "step": 3253 + }, + { + "epoch": 43.40133779264214, + "grad_norm": 0.6659723520278931, + "learning_rate": 5.5483870967741936e-05, + "loss": 2.3908, + "step": 3254 + }, + { + "epoch": 43.414715719063544, + "grad_norm": 0.6532467603683472, + "learning_rate": 5.5439377085650725e-05, + "loss": 2.4925, + "step": 3255 + }, + { + "epoch": 43.42809364548495, + "grad_norm": 0.8012048006057739, + "learning_rate": 5.5394883203559514e-05, + "loss": 2.5035, + "step": 3256 + }, + { + "epoch": 43.441471571906355, + "grad_norm": 0.6359190344810486, + "learning_rate": 5.5350389321468297e-05, + "loss": 2.4463, + "step": 3257 + }, + { + "epoch": 43.45484949832776, + "grad_norm": 0.6980485916137695, + "learning_rate": 5.5305895439377086e-05, + "loss": 2.6038, + "step": 3258 + }, + { + "epoch": 43.468227424749166, + "grad_norm": 0.757401168346405, + "learning_rate": 5.5261401557285875e-05, + "loss": 2.5263, + "step": 3259 + }, + { + "epoch": 43.48160535117057, + "grad_norm": 0.7248904705047607, + "learning_rate": 5.5216907675194664e-05, + "loss": 2.4536, + "step": 3260 + }, + { + "epoch": 43.49498327759197, + "grad_norm": 1.246519923210144, + "learning_rate": 5.517241379310345e-05, + "loss": 2.7486, + "step": 3261 + }, + { + "epoch": 43.508361204013376, + "grad_norm": 0.6592410802841187, + "learning_rate": 5.5127919911012236e-05, + "loss": 2.4943, + "step": 3262 + }, + { + "epoch": 43.52173913043478, + "grad_norm": 0.8287456035614014, + "learning_rate": 5.5083426028921025e-05, + "loss": 2.3626, + "step": 3263 + }, + { + "epoch": 43.53511705685619, + "grad_norm": 0.6615451574325562, + "learning_rate": 5.5038932146829814e-05, + "loss": 2.4929, + "step": 3264 + }, + { + "epoch": 43.54849498327759, + "grad_norm": 0.6615239977836609, + "learning_rate": 5.49944382647386e-05, + "loss": 2.2937, + "step": 3265 + }, + { + "epoch": 43.561872909699, + "grad_norm": 0.6681683659553528, + "learning_rate": 5.494994438264739e-05, + "loss": 2.5765, + "step": 3266 + }, + { + "epoch": 43.575250836120404, + "grad_norm": 0.6718909740447998, + "learning_rate": 5.4905450500556175e-05, + "loss": 2.4609, + "step": 3267 + }, + { + "epoch": 43.58862876254181, + "grad_norm": 0.6635290384292603, + "learning_rate": 5.4860956618464964e-05, + "loss": 2.3023, + "step": 3268 + }, + { + "epoch": 43.60200668896321, + "grad_norm": 0.6895936727523804, + "learning_rate": 5.481646273637375e-05, + "loss": 2.4802, + "step": 3269 + }, + { + "epoch": 43.61538461538461, + "grad_norm": 0.6714615821838379, + "learning_rate": 5.477196885428254e-05, + "loss": 2.4943, + "step": 3270 + }, + { + "epoch": 43.62876254180602, + "grad_norm": 0.6521191596984863, + "learning_rate": 5.472747497219133e-05, + "loss": 2.5329, + "step": 3271 + }, + { + "epoch": 43.642140468227424, + "grad_norm": 0.6796808242797852, + "learning_rate": 5.4682981090100114e-05, + "loss": 2.7126, + "step": 3272 + }, + { + "epoch": 43.65551839464883, + "grad_norm": 0.6271299123764038, + "learning_rate": 5.46384872080089e-05, + "loss": 2.3406, + "step": 3273 + }, + { + "epoch": 43.668896321070235, + "grad_norm": 0.6406214237213135, + "learning_rate": 5.459399332591769e-05, + "loss": 2.5473, + "step": 3274 + }, + { + "epoch": 43.68227424749164, + "grad_norm": 0.688023030757904, + "learning_rate": 5.454949944382648e-05, + "loss": 2.5628, + "step": 3275 + }, + { + "epoch": 43.69565217391305, + "grad_norm": 0.6261418461799622, + "learning_rate": 5.450500556173527e-05, + "loss": 2.3595, + "step": 3276 + }, + { + "epoch": 43.709030100334445, + "grad_norm": 0.6406996846199036, + "learning_rate": 5.446051167964405e-05, + "loss": 2.4768, + "step": 3277 + }, + { + "epoch": 43.72240802675585, + "grad_norm": 0.6346117258071899, + "learning_rate": 5.441601779755284e-05, + "loss": 2.3375, + "step": 3278 + }, + { + "epoch": 43.735785953177256, + "grad_norm": 0.6415894627571106, + "learning_rate": 5.437152391546163e-05, + "loss": 2.2956, + "step": 3279 + }, + { + "epoch": 43.74916387959866, + "grad_norm": 0.681527316570282, + "learning_rate": 5.432703003337042e-05, + "loss": 2.4731, + "step": 3280 + }, + { + "epoch": 43.76254180602007, + "grad_norm": 0.6715549826622009, + "learning_rate": 5.428253615127921e-05, + "loss": 2.4454, + "step": 3281 + }, + { + "epoch": 43.77591973244147, + "grad_norm": 0.6230939626693726, + "learning_rate": 5.423804226918799e-05, + "loss": 2.2612, + "step": 3282 + }, + { + "epoch": 43.78929765886288, + "grad_norm": 0.7007676362991333, + "learning_rate": 5.419354838709678e-05, + "loss": 2.5626, + "step": 3283 + }, + { + "epoch": 43.802675585284284, + "grad_norm": 0.6708741784095764, + "learning_rate": 5.414905450500557e-05, + "loss": 2.4612, + "step": 3284 + }, + { + "epoch": 43.81605351170568, + "grad_norm": 0.6595252156257629, + "learning_rate": 5.4104560622914346e-05, + "loss": 2.4559, + "step": 3285 + }, + { + "epoch": 43.82943143812709, + "grad_norm": 0.627967119216919, + "learning_rate": 5.4060066740823135e-05, + "loss": 2.5175, + "step": 3286 + }, + { + "epoch": 43.84280936454849, + "grad_norm": 0.6663414835929871, + "learning_rate": 5.4015572858731924e-05, + "loss": 2.3807, + "step": 3287 + }, + { + "epoch": 43.8561872909699, + "grad_norm": 0.6707153916358948, + "learning_rate": 5.3971078976640707e-05, + "loss": 2.4467, + "step": 3288 + }, + { + "epoch": 43.869565217391305, + "grad_norm": 0.6654629707336426, + "learning_rate": 5.3926585094549496e-05, + "loss": 2.5185, + "step": 3289 + }, + { + "epoch": 43.88294314381271, + "grad_norm": 0.6775473356246948, + "learning_rate": 5.3882091212458285e-05, + "loss": 2.5433, + "step": 3290 + }, + { + "epoch": 43.896321070234116, + "grad_norm": 0.7619598507881165, + "learning_rate": 5.3837597330367074e-05, + "loss": 2.5873, + "step": 3291 + }, + { + "epoch": 43.90969899665552, + "grad_norm": 0.6738859415054321, + "learning_rate": 5.379310344827586e-05, + "loss": 2.7934, + "step": 3292 + }, + { + "epoch": 43.92307692307692, + "grad_norm": 0.6576844453811646, + "learning_rate": 5.3748609566184646e-05, + "loss": 2.4892, + "step": 3293 + }, + { + "epoch": 43.936454849498325, + "grad_norm": 0.7075631022453308, + "learning_rate": 5.3704115684093435e-05, + "loss": 2.6011, + "step": 3294 + }, + { + "epoch": 43.94983277591973, + "grad_norm": 0.6499934196472168, + "learning_rate": 5.3659621802002224e-05, + "loss": 2.5615, + "step": 3295 + }, + { + "epoch": 43.96321070234114, + "grad_norm": 0.6628806591033936, + "learning_rate": 5.361512791991101e-05, + "loss": 2.4376, + "step": 3296 + }, + { + "epoch": 43.97658862876254, + "grad_norm": 0.6394050121307373, + "learning_rate": 5.35706340378198e-05, + "loss": 2.5635, + "step": 3297 + }, + { + "epoch": 43.98996655518395, + "grad_norm": 0.6746408343315125, + "learning_rate": 5.3526140155728585e-05, + "loss": 2.676, + "step": 3298 + }, + { + "epoch": 44.0, + "grad_norm": 0.8157567381858826, + "learning_rate": 5.3481646273637374e-05, + "loss": 2.7095, + "step": 3299 + }, + { + "epoch": 44.013377926421406, + "grad_norm": 0.648750901222229, + "learning_rate": 5.343715239154616e-05, + "loss": 2.511, + "step": 3300 + }, + { + "epoch": 44.02675585284281, + "grad_norm": 0.646703839302063, + "learning_rate": 5.339265850945495e-05, + "loss": 2.4419, + "step": 3301 + }, + { + "epoch": 44.04013377926422, + "grad_norm": 0.6128621697425842, + "learning_rate": 5.334816462736374e-05, + "loss": 2.3166, + "step": 3302 + }, + { + "epoch": 44.05351170568562, + "grad_norm": 0.7439555525779724, + "learning_rate": 5.3303670745272524e-05, + "loss": 2.4277, + "step": 3303 + }, + { + "epoch": 44.06688963210702, + "grad_norm": 0.6968475580215454, + "learning_rate": 5.325917686318131e-05, + "loss": 2.4495, + "step": 3304 + }, + { + "epoch": 44.080267558528426, + "grad_norm": 0.6829770803451538, + "learning_rate": 5.32146829810901e-05, + "loss": 2.5492, + "step": 3305 + }, + { + "epoch": 44.09364548494983, + "grad_norm": 0.7078229784965515, + "learning_rate": 5.317018909899889e-05, + "loss": 2.3142, + "step": 3306 + }, + { + "epoch": 44.10702341137124, + "grad_norm": 0.7869110107421875, + "learning_rate": 5.312569521690768e-05, + "loss": 2.6079, + "step": 3307 + }, + { + "epoch": 44.12040133779264, + "grad_norm": 0.6723588705062866, + "learning_rate": 5.308120133481646e-05, + "loss": 2.5012, + "step": 3308 + }, + { + "epoch": 44.13377926421405, + "grad_norm": 0.6307100057601929, + "learning_rate": 5.303670745272525e-05, + "loss": 2.2734, + "step": 3309 + }, + { + "epoch": 44.147157190635454, + "grad_norm": 0.7307317852973938, + "learning_rate": 5.299221357063404e-05, + "loss": 2.5035, + "step": 3310 + }, + { + "epoch": 44.16053511705686, + "grad_norm": 0.6610035300254822, + "learning_rate": 5.294771968854283e-05, + "loss": 2.5255, + "step": 3311 + }, + { + "epoch": 44.17391304347826, + "grad_norm": 0.6795505285263062, + "learning_rate": 5.290322580645162e-05, + "loss": 2.4404, + "step": 3312 + }, + { + "epoch": 44.187290969899664, + "grad_norm": 0.8478922843933105, + "learning_rate": 5.28587319243604e-05, + "loss": 2.5065, + "step": 3313 + }, + { + "epoch": 44.20066889632107, + "grad_norm": 0.6836498975753784, + "learning_rate": 5.281423804226919e-05, + "loss": 2.5934, + "step": 3314 + }, + { + "epoch": 44.214046822742475, + "grad_norm": 0.6741980314254761, + "learning_rate": 5.276974416017798e-05, + "loss": 2.4481, + "step": 3315 + }, + { + "epoch": 44.22742474916388, + "grad_norm": 0.6965019702911377, + "learning_rate": 5.272525027808677e-05, + "loss": 2.3167, + "step": 3316 + }, + { + "epoch": 44.240802675585286, + "grad_norm": 0.7769578695297241, + "learning_rate": 5.268075639599556e-05, + "loss": 2.3156, + "step": 3317 + }, + { + "epoch": 44.25418060200669, + "grad_norm": 0.6968414783477783, + "learning_rate": 5.263626251390434e-05, + "loss": 2.5261, + "step": 3318 + }, + { + "epoch": 44.2675585284281, + "grad_norm": 0.6269694566726685, + "learning_rate": 5.259176863181313e-05, + "loss": 2.3642, + "step": 3319 + }, + { + "epoch": 44.280936454849495, + "grad_norm": 0.6887113451957703, + "learning_rate": 5.254727474972192e-05, + "loss": 2.5911, + "step": 3320 + }, + { + "epoch": 44.2943143812709, + "grad_norm": 0.6772909760475159, + "learning_rate": 5.250278086763071e-05, + "loss": 2.5551, + "step": 3321 + }, + { + "epoch": 44.30769230769231, + "grad_norm": 0.7027330994606018, + "learning_rate": 5.24582869855395e-05, + "loss": 2.7092, + "step": 3322 + }, + { + "epoch": 44.32107023411371, + "grad_norm": 0.6497491598129272, + "learning_rate": 5.241379310344828e-05, + "loss": 2.4204, + "step": 3323 + }, + { + "epoch": 44.33444816053512, + "grad_norm": 0.9224070906639099, + "learning_rate": 5.236929922135707e-05, + "loss": 2.3124, + "step": 3324 + }, + { + "epoch": 44.34782608695652, + "grad_norm": 0.6555243730545044, + "learning_rate": 5.232480533926586e-05, + "loss": 2.1393, + "step": 3325 + }, + { + "epoch": 44.36120401337793, + "grad_norm": 1.082617163658142, + "learning_rate": 5.2280311457174634e-05, + "loss": 2.2029, + "step": 3326 + }, + { + "epoch": 44.374581939799334, + "grad_norm": 0.6710803508758545, + "learning_rate": 5.223581757508342e-05, + "loss": 2.5351, + "step": 3327 + }, + { + "epoch": 44.38795986622073, + "grad_norm": 0.6694113612174988, + "learning_rate": 5.219132369299221e-05, + "loss": 2.4342, + "step": 3328 + }, + { + "epoch": 44.40133779264214, + "grad_norm": 0.6941688656806946, + "learning_rate": 5.2146829810901e-05, + "loss": 2.3725, + "step": 3329 + }, + { + "epoch": 44.414715719063544, + "grad_norm": 0.6413290500640869, + "learning_rate": 5.2102335928809784e-05, + "loss": 2.4962, + "step": 3330 + }, + { + "epoch": 44.42809364548495, + "grad_norm": 0.6977859139442444, + "learning_rate": 5.205784204671857e-05, + "loss": 2.5549, + "step": 3331 + }, + { + "epoch": 44.441471571906355, + "grad_norm": 0.7250884771347046, + "learning_rate": 5.201334816462736e-05, + "loss": 2.6143, + "step": 3332 + }, + { + "epoch": 44.45484949832776, + "grad_norm": 0.7633907794952393, + "learning_rate": 5.196885428253615e-05, + "loss": 2.4557, + "step": 3333 + }, + { + "epoch": 44.468227424749166, + "grad_norm": 0.7065114378929138, + "learning_rate": 5.192436040044494e-05, + "loss": 2.5935, + "step": 3334 + }, + { + "epoch": 44.48160535117057, + "grad_norm": 0.6201562285423279, + "learning_rate": 5.187986651835372e-05, + "loss": 2.3543, + "step": 3335 + }, + { + "epoch": 44.49498327759197, + "grad_norm": 0.6835589408874512, + "learning_rate": 5.183537263626251e-05, + "loss": 2.5158, + "step": 3336 + }, + { + "epoch": 44.508361204013376, + "grad_norm": 0.7697896957397461, + "learning_rate": 5.17908787541713e-05, + "loss": 2.2409, + "step": 3337 + }, + { + "epoch": 44.52173913043478, + "grad_norm": 0.7114497423171997, + "learning_rate": 5.174638487208009e-05, + "loss": 2.6477, + "step": 3338 + }, + { + "epoch": 44.53511705685619, + "grad_norm": 0.6574714183807373, + "learning_rate": 5.170189098998888e-05, + "loss": 2.3153, + "step": 3339 + }, + { + "epoch": 44.54849498327759, + "grad_norm": 0.6645333170890808, + "learning_rate": 5.165739710789766e-05, + "loss": 2.4608, + "step": 3340 + }, + { + "epoch": 44.561872909699, + "grad_norm": 0.6737465262413025, + "learning_rate": 5.161290322580645e-05, + "loss": 2.3891, + "step": 3341 + }, + { + "epoch": 44.575250836120404, + "grad_norm": 0.732427179813385, + "learning_rate": 5.156840934371524e-05, + "loss": 2.5159, + "step": 3342 + }, + { + "epoch": 44.58862876254181, + "grad_norm": 0.6582529544830322, + "learning_rate": 5.152391546162403e-05, + "loss": 2.4066, + "step": 3343 + }, + { + "epoch": 44.60200668896321, + "grad_norm": 0.7296741008758545, + "learning_rate": 5.147942157953282e-05, + "loss": 2.5349, + "step": 3344 + }, + { + "epoch": 44.61538461538461, + "grad_norm": 0.6754583716392517, + "learning_rate": 5.14349276974416e-05, + "loss": 2.5574, + "step": 3345 + }, + { + "epoch": 44.62876254180602, + "grad_norm": 0.6714019775390625, + "learning_rate": 5.139043381535039e-05, + "loss": 2.3795, + "step": 3346 + }, + { + "epoch": 44.642140468227424, + "grad_norm": 0.818821132183075, + "learning_rate": 5.134593993325918e-05, + "loss": 2.6163, + "step": 3347 + }, + { + "epoch": 44.65551839464883, + "grad_norm": 0.6695653200149536, + "learning_rate": 5.130144605116797e-05, + "loss": 2.5223, + "step": 3348 + }, + { + "epoch": 44.668896321070235, + "grad_norm": 0.674275815486908, + "learning_rate": 5.125695216907676e-05, + "loss": 2.3804, + "step": 3349 + }, + { + "epoch": 44.68227424749164, + "grad_norm": 0.6844233870506287, + "learning_rate": 5.121245828698554e-05, + "loss": 2.5665, + "step": 3350 + }, + { + "epoch": 44.69565217391305, + "grad_norm": 0.7201416492462158, + "learning_rate": 5.116796440489433e-05, + "loss": 2.5106, + "step": 3351 + }, + { + "epoch": 44.709030100334445, + "grad_norm": 0.6668140888214111, + "learning_rate": 5.112347052280312e-05, + "loss": 2.6049, + "step": 3352 + }, + { + "epoch": 44.72240802675585, + "grad_norm": 0.7152537703514099, + "learning_rate": 5.107897664071191e-05, + "loss": 2.4075, + "step": 3353 + }, + { + "epoch": 44.735785953177256, + "grad_norm": 0.6661379933357239, + "learning_rate": 5.10344827586207e-05, + "loss": 2.5558, + "step": 3354 + }, + { + "epoch": 44.74916387959866, + "grad_norm": 0.6641297936439514, + "learning_rate": 5.098998887652948e-05, + "loss": 2.4548, + "step": 3355 + }, + { + "epoch": 44.76254180602007, + "grad_norm": 0.689754843711853, + "learning_rate": 5.094549499443827e-05, + "loss": 2.4735, + "step": 3356 + }, + { + "epoch": 44.77591973244147, + "grad_norm": 0.6680580377578735, + "learning_rate": 5.090100111234706e-05, + "loss": 2.483, + "step": 3357 + }, + { + "epoch": 44.78929765886288, + "grad_norm": 1.1155476570129395, + "learning_rate": 5.0856507230255847e-05, + "loss": 2.6322, + "step": 3358 + }, + { + "epoch": 44.802675585284284, + "grad_norm": 0.6346137523651123, + "learning_rate": 5.0812013348164636e-05, + "loss": 2.4057, + "step": 3359 + }, + { + "epoch": 44.81605351170568, + "grad_norm": 0.6553255319595337, + "learning_rate": 5.076751946607342e-05, + "loss": 2.4353, + "step": 3360 + }, + { + "epoch": 44.82943143812709, + "grad_norm": 0.7519358992576599, + "learning_rate": 5.072302558398221e-05, + "loss": 2.5155, + "step": 3361 + }, + { + "epoch": 44.84280936454849, + "grad_norm": 0.6849926114082336, + "learning_rate": 5.0678531701890996e-05, + "loss": 2.6557, + "step": 3362 + }, + { + "epoch": 44.8561872909699, + "grad_norm": 0.6906036138534546, + "learning_rate": 5.0634037819799786e-05, + "loss": 2.5513, + "step": 3363 + }, + { + "epoch": 44.869565217391305, + "grad_norm": 0.712207019329071, + "learning_rate": 5.0589543937708575e-05, + "loss": 2.5477, + "step": 3364 + }, + { + "epoch": 44.88294314381271, + "grad_norm": 0.7856250405311584, + "learning_rate": 5.054505005561736e-05, + "loss": 2.5645, + "step": 3365 + }, + { + "epoch": 44.896321070234116, + "grad_norm": 0.6578722596168518, + "learning_rate": 5.0500556173526146e-05, + "loss": 2.5097, + "step": 3366 + }, + { + "epoch": 44.90969899665552, + "grad_norm": 0.6745465397834778, + "learning_rate": 5.045606229143492e-05, + "loss": 2.4405, + "step": 3367 + }, + { + "epoch": 44.92307692307692, + "grad_norm": 0.6686838865280151, + "learning_rate": 5.041156840934371e-05, + "loss": 2.4774, + "step": 3368 + }, + { + "epoch": 44.936454849498325, + "grad_norm": 0.6725925803184509, + "learning_rate": 5.03670745272525e-05, + "loss": 2.5365, + "step": 3369 + }, + { + "epoch": 44.94983277591973, + "grad_norm": 0.7038573026657104, + "learning_rate": 5.032258064516129e-05, + "loss": 2.5415, + "step": 3370 + }, + { + "epoch": 44.96321070234114, + "grad_norm": 0.670062780380249, + "learning_rate": 5.027808676307008e-05, + "loss": 2.5289, + "step": 3371 + }, + { + "epoch": 44.97658862876254, + "grad_norm": 0.6854854226112366, + "learning_rate": 5.023359288097886e-05, + "loss": 2.5487, + "step": 3372 + }, + { + "epoch": 44.98996655518395, + "grad_norm": 0.840345025062561, + "learning_rate": 5.018909899888765e-05, + "loss": 2.6067, + "step": 3373 + }, + { + "epoch": 45.0, + "grad_norm": 0.790497362613678, + "learning_rate": 5.014460511679644e-05, + "loss": 2.3824, + "step": 3374 + }, + { + "epoch": 45.013377926421406, + "grad_norm": 0.6614347100257874, + "learning_rate": 5.010011123470523e-05, + "loss": 2.3392, + "step": 3375 + }, + { + "epoch": 45.02675585284281, + "grad_norm": 0.6427727341651917, + "learning_rate": 5.005561735261402e-05, + "loss": 2.526, + "step": 3376 + }, + { + "epoch": 45.04013377926422, + "grad_norm": 0.6234596967697144, + "learning_rate": 5.00111234705228e-05, + "loss": 2.4324, + "step": 3377 + }, + { + "epoch": 45.05351170568562, + "grad_norm": 0.6480961441993713, + "learning_rate": 4.996662958843159e-05, + "loss": 2.4503, + "step": 3378 + }, + { + "epoch": 45.06688963210702, + "grad_norm": 0.6448456048965454, + "learning_rate": 4.992213570634038e-05, + "loss": 2.5241, + "step": 3379 + }, + { + "epoch": 45.080267558528426, + "grad_norm": 0.6678739786148071, + "learning_rate": 4.987764182424917e-05, + "loss": 2.5715, + "step": 3380 + }, + { + "epoch": 45.09364548494983, + "grad_norm": 0.6756695508956909, + "learning_rate": 4.983314794215796e-05, + "loss": 2.5005, + "step": 3381 + }, + { + "epoch": 45.10702341137124, + "grad_norm": 0.6389010548591614, + "learning_rate": 4.978865406006674e-05, + "loss": 2.3691, + "step": 3382 + }, + { + "epoch": 45.12040133779264, + "grad_norm": 0.66241854429245, + "learning_rate": 4.974416017797553e-05, + "loss": 2.251, + "step": 3383 + }, + { + "epoch": 45.13377926421405, + "grad_norm": 0.7921980619430542, + "learning_rate": 4.969966629588432e-05, + "loss": 2.2895, + "step": 3384 + }, + { + "epoch": 45.147157190635454, + "grad_norm": 0.6602982878684998, + "learning_rate": 4.9655172413793107e-05, + "loss": 2.4911, + "step": 3385 + }, + { + "epoch": 45.16053511705686, + "grad_norm": 0.6390572190284729, + "learning_rate": 4.9610678531701896e-05, + "loss": 2.2301, + "step": 3386 + }, + { + "epoch": 45.17391304347826, + "grad_norm": 0.6479654312133789, + "learning_rate": 4.956618464961068e-05, + "loss": 2.416, + "step": 3387 + }, + { + "epoch": 45.187290969899664, + "grad_norm": 0.6543484330177307, + "learning_rate": 4.952169076751947e-05, + "loss": 2.6481, + "step": 3388 + }, + { + "epoch": 45.20066889632107, + "grad_norm": 0.6472519636154175, + "learning_rate": 4.9477196885428256e-05, + "loss": 2.2218, + "step": 3389 + }, + { + "epoch": 45.214046822742475, + "grad_norm": 0.6654759049415588, + "learning_rate": 4.9432703003337046e-05, + "loss": 2.559, + "step": 3390 + }, + { + "epoch": 45.22742474916388, + "grad_norm": 0.833139955997467, + "learning_rate": 4.9388209121245835e-05, + "loss": 2.4773, + "step": 3391 + }, + { + "epoch": 45.240802675585286, + "grad_norm": 0.7028539180755615, + "learning_rate": 4.934371523915462e-05, + "loss": 2.5329, + "step": 3392 + }, + { + "epoch": 45.25418060200669, + "grad_norm": 0.6716699600219727, + "learning_rate": 4.9299221357063406e-05, + "loss": 2.6069, + "step": 3393 + }, + { + "epoch": 45.2675585284281, + "grad_norm": 0.8206474184989929, + "learning_rate": 4.9254727474972196e-05, + "loss": 2.5227, + "step": 3394 + }, + { + "epoch": 45.280936454849495, + "grad_norm": 0.6916563510894775, + "learning_rate": 4.9210233592880985e-05, + "loss": 2.5564, + "step": 3395 + }, + { + "epoch": 45.2943143812709, + "grad_norm": 0.67179936170578, + "learning_rate": 4.9165739710789774e-05, + "loss": 2.5468, + "step": 3396 + }, + { + "epoch": 45.30769230769231, + "grad_norm": 0.7964248657226562, + "learning_rate": 4.9121245828698556e-05, + "loss": 2.6126, + "step": 3397 + }, + { + "epoch": 45.32107023411371, + "grad_norm": 0.6877651810646057, + "learning_rate": 4.907675194660734e-05, + "loss": 2.6272, + "step": 3398 + }, + { + "epoch": 45.33444816053512, + "grad_norm": 0.7093799710273743, + "learning_rate": 4.903225806451613e-05, + "loss": 2.4046, + "step": 3399 + }, + { + "epoch": 45.34782608695652, + "grad_norm": 0.772832453250885, + "learning_rate": 4.898776418242492e-05, + "loss": 2.5126, + "step": 3400 + }, + { + "epoch": 45.36120401337793, + "grad_norm": 0.7250183820724487, + "learning_rate": 4.8943270300333706e-05, + "loss": 2.4396, + "step": 3401 + }, + { + "epoch": 45.374581939799334, + "grad_norm": 0.6946377754211426, + "learning_rate": 4.8898776418242495e-05, + "loss": 2.6412, + "step": 3402 + }, + { + "epoch": 45.38795986622073, + "grad_norm": 0.6398693919181824, + "learning_rate": 4.885428253615128e-05, + "loss": 2.4037, + "step": 3403 + }, + { + "epoch": 45.40133779264214, + "grad_norm": 0.7096537947654724, + "learning_rate": 4.880978865406007e-05, + "loss": 2.5131, + "step": 3404 + }, + { + "epoch": 45.414715719063544, + "grad_norm": 0.6392034888267517, + "learning_rate": 4.8765294771968856e-05, + "loss": 2.4144, + "step": 3405 + }, + { + "epoch": 45.42809364548495, + "grad_norm": 0.647911787033081, + "learning_rate": 4.8720800889877645e-05, + "loss": 2.4081, + "step": 3406 + }, + { + "epoch": 45.441471571906355, + "grad_norm": 0.8484491109848022, + "learning_rate": 4.8676307007786434e-05, + "loss": 2.2981, + "step": 3407 + }, + { + "epoch": 45.45484949832776, + "grad_norm": 0.6788907051086426, + "learning_rate": 4.863181312569522e-05, + "loss": 2.3547, + "step": 3408 + }, + { + "epoch": 45.468227424749166, + "grad_norm": 0.7060588002204895, + "learning_rate": 4.8587319243604006e-05, + "loss": 2.506, + "step": 3409 + }, + { + "epoch": 45.48160535117057, + "grad_norm": 0.7509225606918335, + "learning_rate": 4.8542825361512795e-05, + "loss": 2.4514, + "step": 3410 + }, + { + "epoch": 45.49498327759197, + "grad_norm": 0.6925180554389954, + "learning_rate": 4.8498331479421584e-05, + "loss": 2.5358, + "step": 3411 + }, + { + "epoch": 45.508361204013376, + "grad_norm": 0.6838158965110779, + "learning_rate": 4.8453837597330373e-05, + "loss": 2.2683, + "step": 3412 + }, + { + "epoch": 45.52173913043478, + "grad_norm": 0.6462475657463074, + "learning_rate": 4.8409343715239156e-05, + "loss": 2.3029, + "step": 3413 + }, + { + "epoch": 45.53511705685619, + "grad_norm": 0.6882359981536865, + "learning_rate": 4.8364849833147945e-05, + "loss": 2.6943, + "step": 3414 + }, + { + "epoch": 45.54849498327759, + "grad_norm": 0.6654660105705261, + "learning_rate": 4.8320355951056734e-05, + "loss": 2.2613, + "step": 3415 + }, + { + "epoch": 45.561872909699, + "grad_norm": 0.8237322568893433, + "learning_rate": 4.827586206896552e-05, + "loss": 2.2294, + "step": 3416 + }, + { + "epoch": 45.575250836120404, + "grad_norm": 0.7393941283226013, + "learning_rate": 4.823136818687431e-05, + "loss": 2.5682, + "step": 3417 + }, + { + "epoch": 45.58862876254181, + "grad_norm": 0.7490308880805969, + "learning_rate": 4.8186874304783095e-05, + "loss": 2.5922, + "step": 3418 + }, + { + "epoch": 45.60200668896321, + "grad_norm": 0.7629905343055725, + "learning_rate": 4.814238042269188e-05, + "loss": 2.6547, + "step": 3419 + }, + { + "epoch": 45.61538461538461, + "grad_norm": 0.7757886648178101, + "learning_rate": 4.8097886540600666e-05, + "loss": 2.4786, + "step": 3420 + }, + { + "epoch": 45.62876254180602, + "grad_norm": 0.6690565943717957, + "learning_rate": 4.8053392658509456e-05, + "loss": 2.4899, + "step": 3421 + }, + { + "epoch": 45.642140468227424, + "grad_norm": 0.674462616443634, + "learning_rate": 4.8008898776418245e-05, + "loss": 2.3315, + "step": 3422 + }, + { + "epoch": 45.65551839464883, + "grad_norm": 0.7295622229576111, + "learning_rate": 4.7964404894327034e-05, + "loss": 2.6889, + "step": 3423 + }, + { + "epoch": 45.668896321070235, + "grad_norm": 0.7250505685806274, + "learning_rate": 4.7919911012235816e-05, + "loss": 2.2723, + "step": 3424 + }, + { + "epoch": 45.68227424749164, + "grad_norm": 1.0119596719741821, + "learning_rate": 4.7875417130144605e-05, + "loss": 2.4533, + "step": 3425 + }, + { + "epoch": 45.69565217391305, + "grad_norm": 0.6516237854957581, + "learning_rate": 4.7830923248053395e-05, + "loss": 2.3416, + "step": 3426 + }, + { + "epoch": 45.709030100334445, + "grad_norm": 0.6897451281547546, + "learning_rate": 4.7786429365962184e-05, + "loss": 2.532, + "step": 3427 + }, + { + "epoch": 45.72240802675585, + "grad_norm": 0.6455211043357849, + "learning_rate": 4.774193548387097e-05, + "loss": 2.2748, + "step": 3428 + }, + { + "epoch": 45.735785953177256, + "grad_norm": 0.6655951142311096, + "learning_rate": 4.7697441601779755e-05, + "loss": 2.2649, + "step": 3429 + }, + { + "epoch": 45.74916387959866, + "grad_norm": 0.8087528944015503, + "learning_rate": 4.7652947719688545e-05, + "loss": 2.4608, + "step": 3430 + }, + { + "epoch": 45.76254180602007, + "grad_norm": 0.661077618598938, + "learning_rate": 4.7608453837597334e-05, + "loss": 2.3469, + "step": 3431 + }, + { + "epoch": 45.77591973244147, + "grad_norm": 0.7252074480056763, + "learning_rate": 4.756395995550612e-05, + "loss": 2.6173, + "step": 3432 + }, + { + "epoch": 45.78929765886288, + "grad_norm": 0.7233737111091614, + "learning_rate": 4.751946607341491e-05, + "loss": 2.4967, + "step": 3433 + }, + { + "epoch": 45.802675585284284, + "grad_norm": 0.6680254340171814, + "learning_rate": 4.7474972191323694e-05, + "loss": 2.4175, + "step": 3434 + }, + { + "epoch": 45.81605351170568, + "grad_norm": 0.6761989593505859, + "learning_rate": 4.7430478309232484e-05, + "loss": 2.5149, + "step": 3435 + }, + { + "epoch": 45.82943143812709, + "grad_norm": 0.6871892213821411, + "learning_rate": 4.738598442714127e-05, + "loss": 2.4105, + "step": 3436 + }, + { + "epoch": 45.84280936454849, + "grad_norm": 0.7473261952400208, + "learning_rate": 4.734149054505006e-05, + "loss": 2.5763, + "step": 3437 + }, + { + "epoch": 45.8561872909699, + "grad_norm": 0.703223705291748, + "learning_rate": 4.729699666295885e-05, + "loss": 2.4285, + "step": 3438 + }, + { + "epoch": 45.869565217391305, + "grad_norm": 0.7379417419433594, + "learning_rate": 4.7252502780867633e-05, + "loss": 2.5296, + "step": 3439 + }, + { + "epoch": 45.88294314381271, + "grad_norm": 0.8234548568725586, + "learning_rate": 4.7208008898776416e-05, + "loss": 2.4177, + "step": 3440 + }, + { + "epoch": 45.896321070234116, + "grad_norm": 0.6252479553222656, + "learning_rate": 4.7163515016685205e-05, + "loss": 2.174, + "step": 3441 + }, + { + "epoch": 45.90969899665552, + "grad_norm": 0.6623761057853699, + "learning_rate": 4.7119021134593994e-05, + "loss": 2.4613, + "step": 3442 + }, + { + "epoch": 45.92307692307692, + "grad_norm": 0.7237697839736938, + "learning_rate": 4.707452725250278e-05, + "loss": 2.5461, + "step": 3443 + }, + { + "epoch": 45.936454849498325, + "grad_norm": 0.7058880925178528, + "learning_rate": 4.703003337041157e-05, + "loss": 2.3805, + "step": 3444 + }, + { + "epoch": 45.94983277591973, + "grad_norm": 0.7062870860099792, + "learning_rate": 4.6985539488320355e-05, + "loss": 2.6266, + "step": 3445 + }, + { + "epoch": 45.96321070234114, + "grad_norm": 0.6760918498039246, + "learning_rate": 4.6941045606229144e-05, + "loss": 2.5284, + "step": 3446 + }, + { + "epoch": 45.97658862876254, + "grad_norm": 0.6973199844360352, + "learning_rate": 4.689655172413793e-05, + "loss": 2.3388, + "step": 3447 + }, + { + "epoch": 45.98996655518395, + "grad_norm": 0.6965627670288086, + "learning_rate": 4.685205784204672e-05, + "loss": 2.4212, + "step": 3448 + }, + { + "epoch": 46.0, + "grad_norm": 0.8428727388381958, + "learning_rate": 4.680756395995551e-05, + "loss": 2.5794, + "step": 3449 + }, + { + "epoch": 46.013377926421406, + "grad_norm": 0.6538412570953369, + "learning_rate": 4.6763070077864294e-05, + "loss": 2.3321, + "step": 3450 + }, + { + "epoch": 46.02675585284281, + "grad_norm": 0.6650435328483582, + "learning_rate": 4.671857619577308e-05, + "loss": 2.3299, + "step": 3451 + }, + { + "epoch": 46.04013377926422, + "grad_norm": 0.6518568396568298, + "learning_rate": 4.667408231368187e-05, + "loss": 2.3671, + "step": 3452 + }, + { + "epoch": 46.05351170568562, + "grad_norm": 0.6615071296691895, + "learning_rate": 4.662958843159066e-05, + "loss": 2.2856, + "step": 3453 + }, + { + "epoch": 46.06688963210702, + "grad_norm": 0.7280906438827515, + "learning_rate": 4.658509454949945e-05, + "loss": 2.6337, + "step": 3454 + }, + { + "epoch": 46.080267558528426, + "grad_norm": 0.6038973927497864, + "learning_rate": 4.654060066740823e-05, + "loss": 2.0686, + "step": 3455 + }, + { + "epoch": 46.09364548494983, + "grad_norm": 0.7241243720054626, + "learning_rate": 4.649610678531702e-05, + "loss": 2.4142, + "step": 3456 + }, + { + "epoch": 46.10702341137124, + "grad_norm": 0.716877281665802, + "learning_rate": 4.645161290322581e-05, + "loss": 2.6186, + "step": 3457 + }, + { + "epoch": 46.12040133779264, + "grad_norm": 0.7181987166404724, + "learning_rate": 4.64071190211346e-05, + "loss": 2.4412, + "step": 3458 + }, + { + "epoch": 46.13377926421405, + "grad_norm": 0.699880838394165, + "learning_rate": 4.636262513904339e-05, + "loss": 2.5438, + "step": 3459 + }, + { + "epoch": 46.147157190635454, + "grad_norm": 0.7127376198768616, + "learning_rate": 4.6318131256952165e-05, + "loss": 2.3073, + "step": 3460 + }, + { + "epoch": 46.16053511705686, + "grad_norm": 0.7011304497718811, + "learning_rate": 4.6273637374860954e-05, + "loss": 2.4327, + "step": 3461 + }, + { + "epoch": 46.17391304347826, + "grad_norm": 0.7037985920906067, + "learning_rate": 4.6229143492769744e-05, + "loss": 2.444, + "step": 3462 + }, + { + "epoch": 46.187290969899664, + "grad_norm": 0.6529151201248169, + "learning_rate": 4.618464961067853e-05, + "loss": 2.3799, + "step": 3463 + }, + { + "epoch": 46.20066889632107, + "grad_norm": 0.7062216997146606, + "learning_rate": 4.614015572858732e-05, + "loss": 2.4849, + "step": 3464 + }, + { + "epoch": 46.214046822742475, + "grad_norm": 0.7128186821937561, + "learning_rate": 4.6095661846496104e-05, + "loss": 2.3768, + "step": 3465 + }, + { + "epoch": 46.22742474916388, + "grad_norm": 0.629882276058197, + "learning_rate": 4.6051167964404894e-05, + "loss": 2.4646, + "step": 3466 + }, + { + "epoch": 46.240802675585286, + "grad_norm": 0.6844696998596191, + "learning_rate": 4.600667408231368e-05, + "loss": 2.6883, + "step": 3467 + }, + { + "epoch": 46.25418060200669, + "grad_norm": 0.6740692853927612, + "learning_rate": 4.596218020022247e-05, + "loss": 2.4321, + "step": 3468 + }, + { + "epoch": 46.2675585284281, + "grad_norm": 0.6562219262123108, + "learning_rate": 4.591768631813126e-05, + "loss": 2.6083, + "step": 3469 + }, + { + "epoch": 46.280936454849495, + "grad_norm": 0.7180445194244385, + "learning_rate": 4.5873192436040043e-05, + "loss": 2.4661, + "step": 3470 + }, + { + "epoch": 46.2943143812709, + "grad_norm": 0.683229923248291, + "learning_rate": 4.582869855394883e-05, + "loss": 2.3699, + "step": 3471 + }, + { + "epoch": 46.30769230769231, + "grad_norm": 0.7104426622390747, + "learning_rate": 4.578420467185762e-05, + "loss": 2.4708, + "step": 3472 + }, + { + "epoch": 46.32107023411371, + "grad_norm": 0.6398786306381226, + "learning_rate": 4.573971078976641e-05, + "loss": 2.3402, + "step": 3473 + }, + { + "epoch": 46.33444816053512, + "grad_norm": 0.6786426901817322, + "learning_rate": 4.56952169076752e-05, + "loss": 2.4108, + "step": 3474 + }, + { + "epoch": 46.34782608695652, + "grad_norm": 0.718450129032135, + "learning_rate": 4.565072302558398e-05, + "loss": 2.5797, + "step": 3475 + }, + { + "epoch": 46.36120401337793, + "grad_norm": 0.6450575590133667, + "learning_rate": 4.560622914349277e-05, + "loss": 2.0375, + "step": 3476 + }, + { + "epoch": 46.374581939799334, + "grad_norm": 0.6493234038352966, + "learning_rate": 4.556173526140156e-05, + "loss": 2.4572, + "step": 3477 + }, + { + "epoch": 46.38795986622073, + "grad_norm": 0.5922126173973083, + "learning_rate": 4.551724137931035e-05, + "loss": 2.2419, + "step": 3478 + }, + { + "epoch": 46.40133779264214, + "grad_norm": 0.6579440832138062, + "learning_rate": 4.547274749721914e-05, + "loss": 2.2189, + "step": 3479 + }, + { + "epoch": 46.414715719063544, + "grad_norm": 0.6737288236618042, + "learning_rate": 4.542825361512792e-05, + "loss": 2.4462, + "step": 3480 + }, + { + "epoch": 46.42809364548495, + "grad_norm": 0.7465442419052124, + "learning_rate": 4.5383759733036704e-05, + "loss": 2.2322, + "step": 3481 + }, + { + "epoch": 46.441471571906355, + "grad_norm": 0.8456260561943054, + "learning_rate": 4.533926585094549e-05, + "loss": 2.7195, + "step": 3482 + }, + { + "epoch": 46.45484949832776, + "grad_norm": 0.6909487843513489, + "learning_rate": 4.529477196885428e-05, + "loss": 2.3794, + "step": 3483 + }, + { + "epoch": 46.468227424749166, + "grad_norm": 0.6962025761604309, + "learning_rate": 4.525027808676307e-05, + "loss": 2.4683, + "step": 3484 + }, + { + "epoch": 46.48160535117057, + "grad_norm": 0.6593672037124634, + "learning_rate": 4.520578420467186e-05, + "loss": 2.5137, + "step": 3485 + }, + { + "epoch": 46.49498327759197, + "grad_norm": 0.8271158933639526, + "learning_rate": 4.516129032258064e-05, + "loss": 2.4657, + "step": 3486 + }, + { + "epoch": 46.508361204013376, + "grad_norm": 0.7557095289230347, + "learning_rate": 4.511679644048943e-05, + "loss": 2.5075, + "step": 3487 + }, + { + "epoch": 46.52173913043478, + "grad_norm": 0.7621429562568665, + "learning_rate": 4.507230255839822e-05, + "loss": 2.4163, + "step": 3488 + }, + { + "epoch": 46.53511705685619, + "grad_norm": 0.6624872088432312, + "learning_rate": 4.502780867630701e-05, + "loss": 2.5192, + "step": 3489 + }, + { + "epoch": 46.54849498327759, + "grad_norm": 0.7335086464881897, + "learning_rate": 4.49833147942158e-05, + "loss": 2.4538, + "step": 3490 + }, + { + "epoch": 46.561872909699, + "grad_norm": 0.7712231278419495, + "learning_rate": 4.493882091212458e-05, + "loss": 2.4444, + "step": 3491 + }, + { + "epoch": 46.575250836120404, + "grad_norm": 0.6709562540054321, + "learning_rate": 4.489432703003337e-05, + "loss": 2.3849, + "step": 3492 + }, + { + "epoch": 46.58862876254181, + "grad_norm": 0.704768180847168, + "learning_rate": 4.484983314794216e-05, + "loss": 2.5614, + "step": 3493 + }, + { + "epoch": 46.60200668896321, + "grad_norm": 0.6898999214172363, + "learning_rate": 4.480533926585095e-05, + "loss": 2.4023, + "step": 3494 + }, + { + "epoch": 46.61538461538461, + "grad_norm": 0.6936464309692383, + "learning_rate": 4.476084538375974e-05, + "loss": 2.4075, + "step": 3495 + }, + { + "epoch": 46.62876254180602, + "grad_norm": 0.7783868908882141, + "learning_rate": 4.471635150166852e-05, + "loss": 2.4102, + "step": 3496 + }, + { + "epoch": 46.642140468227424, + "grad_norm": 1.0688285827636719, + "learning_rate": 4.467185761957731e-05, + "loss": 2.2541, + "step": 3497 + }, + { + "epoch": 46.65551839464883, + "grad_norm": 0.6693544983863831, + "learning_rate": 4.46273637374861e-05, + "loss": 2.3223, + "step": 3498 + }, + { + "epoch": 46.668896321070235, + "grad_norm": 0.6926013231277466, + "learning_rate": 4.458286985539489e-05, + "loss": 2.3858, + "step": 3499 + }, + { + "epoch": 46.68227424749164, + "grad_norm": 0.6820552945137024, + "learning_rate": 4.453837597330368e-05, + "loss": 2.6187, + "step": 3500 + }, + { + "epoch": 46.68227424749164, + "grad_norm": 1.874079704284668, + "learning_rate": 4.449388209121246e-05, + "loss": 2.7609, + "step": 3501 + }, + { + "epoch": 46.69565217391305, + "grad_norm": 1.8859320878982544, + "learning_rate": 4.444938820912124e-05, + "loss": 2.6921, + "step": 3502 + }, + { + "epoch": 46.709030100334445, + "grad_norm": 2.095224618911743, + "learning_rate": 4.440489432703003e-05, + "loss": 2.641, + "step": 3503 + }, + { + "epoch": 46.72240802675585, + "grad_norm": 1.8568474054336548, + "learning_rate": 4.436040044493882e-05, + "loss": 2.9348, + "step": 3504 + }, + { + "epoch": 46.735785953177256, + "grad_norm": 2.0055668354034424, + "learning_rate": 4.431590656284761e-05, + "loss": 2.7498, + "step": 3505 + }, + { + "epoch": 46.74916387959866, + "grad_norm": 1.6685417890548706, + "learning_rate": 4.42714126807564e-05, + "loss": 2.8001, + "step": 3506 + }, + { + "epoch": 46.76254180602007, + "grad_norm": 1.9633355140686035, + "learning_rate": 4.422691879866518e-05, + "loss": 2.9822, + "step": 3507 + }, + { + "epoch": 46.77591973244147, + "grad_norm": 1.688116192817688, + "learning_rate": 4.418242491657397e-05, + "loss": 2.8157, + "step": 3508 + }, + { + "epoch": 46.78929765886288, + "grad_norm": 1.7179639339447021, + "learning_rate": 4.413793103448276e-05, + "loss": 2.7049, + "step": 3509 + }, + { + "epoch": 46.802675585284284, + "grad_norm": 1.546454906463623, + "learning_rate": 4.409343715239155e-05, + "loss": 2.6513, + "step": 3510 + }, + { + "epoch": 46.81605351170568, + "grad_norm": 1.8116494417190552, + "learning_rate": 4.404894327030034e-05, + "loss": 2.933, + "step": 3511 + }, + { + "epoch": 46.82943143812709, + "grad_norm": 1.8387575149536133, + "learning_rate": 4.400444938820912e-05, + "loss": 2.864, + "step": 3512 + }, + { + "epoch": 46.84280936454849, + "grad_norm": 1.7248547077178955, + "learning_rate": 4.395995550611791e-05, + "loss": 2.928, + "step": 3513 + }, + { + "epoch": 46.8561872909699, + "grad_norm": 1.5455210208892822, + "learning_rate": 4.39154616240267e-05, + "loss": 2.8224, + "step": 3514 + }, + { + "epoch": 46.869565217391305, + "grad_norm": 1.7810604572296143, + "learning_rate": 4.387096774193549e-05, + "loss": 2.726, + "step": 3515 + }, + { + "epoch": 46.88294314381271, + "grad_norm": 1.8111591339111328, + "learning_rate": 4.382647385984428e-05, + "loss": 2.7562, + "step": 3516 + }, + { + "epoch": 46.896321070234116, + "grad_norm": 1.5729373693466187, + "learning_rate": 4.378197997775306e-05, + "loss": 2.7545, + "step": 3517 + }, + { + "epoch": 46.90969899665552, + "grad_norm": 1.7884842157363892, + "learning_rate": 4.373748609566185e-05, + "loss": 2.8122, + "step": 3518 + }, + { + "epoch": 46.92307692307692, + "grad_norm": 1.8262654542922974, + "learning_rate": 4.369299221357064e-05, + "loss": 2.814, + "step": 3519 + }, + { + "epoch": 46.936454849498325, + "grad_norm": 1.5744242668151855, + "learning_rate": 4.364849833147943e-05, + "loss": 2.7071, + "step": 3520 + }, + { + "epoch": 46.94983277591973, + "grad_norm": 1.506430983543396, + "learning_rate": 4.3604004449388216e-05, + "loss": 2.8561, + "step": 3521 + }, + { + "epoch": 46.96321070234114, + "grad_norm": 1.4957503080368042, + "learning_rate": 4.3559510567297e-05, + "loss": 2.3908, + "step": 3522 + }, + { + "epoch": 46.97658862876254, + "grad_norm": 1.5997592210769653, + "learning_rate": 4.351501668520578e-05, + "loss": 2.6824, + "step": 3523 + }, + { + "epoch": 46.98996655518395, + "grad_norm": 1.5681554079055786, + "learning_rate": 4.347052280311457e-05, + "loss": 2.6847, + "step": 3524 + }, + { + "epoch": 47.013377926421406, + "grad_norm": 2.1808457374572754, + "learning_rate": 4.342602892102336e-05, + "loss": 5.1443, + "step": 3525 + }, + { + "epoch": 47.02675585284281, + "grad_norm": 1.6048583984375, + "learning_rate": 4.338153503893215e-05, + "loss": 2.8358, + "step": 3526 + }, + { + "epoch": 47.04013377926422, + "grad_norm": 1.60443115234375, + "learning_rate": 4.333704115684094e-05, + "loss": 2.8681, + "step": 3527 + }, + { + "epoch": 47.05351170568562, + "grad_norm": 1.571325421333313, + "learning_rate": 4.329254727474972e-05, + "loss": 2.7476, + "step": 3528 + }, + { + "epoch": 47.06688963210702, + "grad_norm": 1.3896905183792114, + "learning_rate": 4.324805339265851e-05, + "loss": 2.9488, + "step": 3529 + }, + { + "epoch": 47.080267558528426, + "grad_norm": 1.6123690605163574, + "learning_rate": 4.32035595105673e-05, + "loss": 2.6438, + "step": 3530 + }, + { + "epoch": 47.09364548494983, + "grad_norm": 1.3029465675354004, + "learning_rate": 4.315906562847609e-05, + "loss": 2.4713, + "step": 3531 + }, + { + "epoch": 47.10702341137124, + "grad_norm": 1.6188567876815796, + "learning_rate": 4.311457174638488e-05, + "loss": 3.0083, + "step": 3532 + }, + { + "epoch": 47.12040133779264, + "grad_norm": 1.6200324296951294, + "learning_rate": 4.307007786429366e-05, + "loss": 2.7221, + "step": 3533 + }, + { + "epoch": 47.13377926421405, + "grad_norm": 1.5296354293823242, + "learning_rate": 4.302558398220245e-05, + "loss": 2.9623, + "step": 3534 + }, + { + "epoch": 47.147157190635454, + "grad_norm": 1.4072962999343872, + "learning_rate": 4.298109010011124e-05, + "loss": 2.7184, + "step": 3535 + }, + { + "epoch": 47.16053511705686, + "grad_norm": 1.2409498691558838, + "learning_rate": 4.293659621802003e-05, + "loss": 2.842, + "step": 3536 + }, + { + "epoch": 47.17391304347826, + "grad_norm": 1.6017128229141235, + "learning_rate": 4.2892102335928816e-05, + "loss": 2.7034, + "step": 3537 + }, + { + "epoch": 47.187290969899664, + "grad_norm": 1.248794436454773, + "learning_rate": 4.28476084538376e-05, + "loss": 2.4749, + "step": 3538 + }, + { + "epoch": 47.20066889632107, + "grad_norm": 1.3597090244293213, + "learning_rate": 4.280311457174639e-05, + "loss": 2.7913, + "step": 3539 + }, + { + "epoch": 47.214046822742475, + "grad_norm": 1.3427294492721558, + "learning_rate": 4.275862068965518e-05, + "loss": 2.6795, + "step": 3540 + }, + { + "epoch": 47.22742474916388, + "grad_norm": 1.4270846843719482, + "learning_rate": 4.2714126807563966e-05, + "loss": 2.5893, + "step": 3541 + }, + { + "epoch": 47.240802675585286, + "grad_norm": 1.5535361766815186, + "learning_rate": 4.2669632925472755e-05, + "loss": 2.5322, + "step": 3542 + }, + { + "epoch": 47.25418060200669, + "grad_norm": 1.269716739654541, + "learning_rate": 4.262513904338154e-05, + "loss": 2.7256, + "step": 3543 + }, + { + "epoch": 47.2675585284281, + "grad_norm": 1.465528130531311, + "learning_rate": 4.258064516129032e-05, + "loss": 2.7631, + "step": 3544 + }, + { + "epoch": 47.280936454849495, + "grad_norm": 1.4620834589004517, + "learning_rate": 4.253615127919911e-05, + "loss": 2.7169, + "step": 3545 + }, + { + "epoch": 47.2943143812709, + "grad_norm": 1.4662063121795654, + "learning_rate": 4.24916573971079e-05, + "loss": 2.8878, + "step": 3546 + }, + { + "epoch": 47.30769230769231, + "grad_norm": 1.6685733795166016, + "learning_rate": 4.244716351501669e-05, + "loss": 2.7765, + "step": 3547 + }, + { + "epoch": 47.32107023411371, + "grad_norm": 1.516026258468628, + "learning_rate": 4.2402669632925476e-05, + "loss": 2.8278, + "step": 3548 + }, + { + "epoch": 47.33444816053512, + "grad_norm": 1.5270005464553833, + "learning_rate": 4.235817575083426e-05, + "loss": 2.7561, + "step": 3549 + }, + { + "epoch": 47.34782608695652, + "grad_norm": 1.5817084312438965, + "learning_rate": 4.231368186874305e-05, + "loss": 2.8527, + "step": 3550 + }, + { + "epoch": 47.36120401337793, + "grad_norm": 1.3843607902526855, + "learning_rate": 4.226918798665184e-05, + "loss": 2.819, + "step": 3551 + }, + { + "epoch": 47.374581939799334, + "grad_norm": 1.4046587944030762, + "learning_rate": 4.2224694104560626e-05, + "loss": 2.5891, + "step": 3552 + }, + { + "epoch": 47.38795986622073, + "grad_norm": 1.660375952720642, + "learning_rate": 4.2180200222469415e-05, + "loss": 2.8561, + "step": 3553 + }, + { + "epoch": 47.40133779264214, + "grad_norm": 1.3785984516143799, + "learning_rate": 4.21357063403782e-05, + "loss": 2.6979, + "step": 3554 + }, + { + "epoch": 47.414715719063544, + "grad_norm": 1.667795181274414, + "learning_rate": 4.209121245828699e-05, + "loss": 2.6532, + "step": 3555 + }, + { + "epoch": 47.42809364548495, + "grad_norm": 1.4713093042373657, + "learning_rate": 4.2046718576195776e-05, + "loss": 2.8053, + "step": 3556 + }, + { + "epoch": 47.441471571906355, + "grad_norm": 1.7168391942977905, + "learning_rate": 4.2002224694104565e-05, + "loss": 2.6546, + "step": 3557 + }, + { + "epoch": 47.45484949832776, + "grad_norm": 1.476688265800476, + "learning_rate": 4.1957730812013354e-05, + "loss": 2.604, + "step": 3558 + }, + { + "epoch": 47.468227424749166, + "grad_norm": 1.4487282037734985, + "learning_rate": 4.191323692992214e-05, + "loss": 2.5379, + "step": 3559 + }, + { + "epoch": 47.48160535117057, + "grad_norm": 1.1151924133300781, + "learning_rate": 4.1868743047830926e-05, + "loss": 2.5306, + "step": 3560 + }, + { + "epoch": 47.49498327759197, + "grad_norm": 1.3186345100402832, + "learning_rate": 4.1824249165739715e-05, + "loss": 2.6516, + "step": 3561 + }, + { + "epoch": 47.508361204013376, + "grad_norm": 1.4641391038894653, + "learning_rate": 4.1779755283648504e-05, + "loss": 2.8421, + "step": 3562 + }, + { + "epoch": 47.52173913043478, + "grad_norm": 1.3929246664047241, + "learning_rate": 4.173526140155729e-05, + "loss": 2.6351, + "step": 3563 + }, + { + "epoch": 47.53511705685619, + "grad_norm": 1.8099298477172852, + "learning_rate": 4.1690767519466076e-05, + "loss": 2.6873, + "step": 3564 + }, + { + "epoch": 47.54849498327759, + "grad_norm": 1.5447300672531128, + "learning_rate": 4.164627363737486e-05, + "loss": 2.8718, + "step": 3565 + }, + { + "epoch": 47.561872909699, + "grad_norm": 1.5041502714157104, + "learning_rate": 4.160177975528365e-05, + "loss": 2.6741, + "step": 3566 + }, + { + "epoch": 47.575250836120404, + "grad_norm": 1.713394284248352, + "learning_rate": 4.155728587319244e-05, + "loss": 2.9518, + "step": 3567 + }, + { + "epoch": 47.58862876254181, + "grad_norm": 1.3753432035446167, + "learning_rate": 4.1512791991101226e-05, + "loss": 2.9874, + "step": 3568 + }, + { + "epoch": 47.60200668896321, + "grad_norm": 1.901676893234253, + "learning_rate": 4.1468298109010015e-05, + "loss": 2.7524, + "step": 3569 + }, + { + "epoch": 47.61538461538461, + "grad_norm": 1.8974645137786865, + "learning_rate": 4.14238042269188e-05, + "loss": 2.744, + "step": 3570 + }, + { + "epoch": 47.62876254180602, + "grad_norm": 1.278735637664795, + "learning_rate": 4.1379310344827587e-05, + "loss": 2.6875, + "step": 3571 + }, + { + "epoch": 47.642140468227424, + "grad_norm": 1.7426670789718628, + "learning_rate": 4.1334816462736376e-05, + "loss": 2.7088, + "step": 3572 + }, + { + "epoch": 47.65551839464883, + "grad_norm": 1.2926254272460938, + "learning_rate": 4.1290322580645165e-05, + "loss": 2.536, + "step": 3573 + }, + { + "epoch": 47.668896321070235, + "grad_norm": 1.4211947917938232, + "learning_rate": 4.1245828698553954e-05, + "loss": 2.7895, + "step": 3574 + }, + { + "epoch": 47.68227424749164, + "grad_norm": 1.4811222553253174, + "learning_rate": 4.1201334816462736e-05, + "loss": 2.8819, + "step": 3575 + }, + { + "epoch": 47.69565217391305, + "grad_norm": 1.441867470741272, + "learning_rate": 4.1156840934371526e-05, + "loss": 2.9332, + "step": 3576 + }, + { + "epoch": 47.709030100334445, + "grad_norm": 1.3309211730957031, + "learning_rate": 4.1112347052280315e-05, + "loss": 2.5375, + "step": 3577 + }, + { + "epoch": 47.72240802675585, + "grad_norm": 1.1746070384979248, + "learning_rate": 4.1067853170189104e-05, + "loss": 2.8976, + "step": 3578 + }, + { + "epoch": 47.735785953177256, + "grad_norm": 1.572267770767212, + "learning_rate": 4.102335928809789e-05, + "loss": 2.7175, + "step": 3579 + }, + { + "epoch": 47.74916387959866, + "grad_norm": 1.6516088247299194, + "learning_rate": 4.0978865406006675e-05, + "loss": 2.7369, + "step": 3580 + }, + { + "epoch": 47.76254180602007, + "grad_norm": 1.2800843715667725, + "learning_rate": 4.0934371523915465e-05, + "loss": 2.6551, + "step": 3581 + }, + { + "epoch": 47.77591973244147, + "grad_norm": 1.115822434425354, + "learning_rate": 4.0889877641824254e-05, + "loss": 2.8808, + "step": 3582 + }, + { + "epoch": 47.78929765886288, + "grad_norm": 1.5338307619094849, + "learning_rate": 4.084538375973304e-05, + "loss": 2.6126, + "step": 3583 + }, + { + "epoch": 47.802675585284284, + "grad_norm": 1.226580262184143, + "learning_rate": 4.0800889877641825e-05, + "loss": 2.6029, + "step": 3584 + }, + { + "epoch": 47.81605351170568, + "grad_norm": 1.2477607727050781, + "learning_rate": 4.0756395995550615e-05, + "loss": 2.7603, + "step": 3585 + }, + { + "epoch": 47.82943143812709, + "grad_norm": 1.5089812278747559, + "learning_rate": 4.07119021134594e-05, + "loss": 2.9377, + "step": 3586 + }, + { + "epoch": 47.84280936454849, + "grad_norm": 1.3173625469207764, + "learning_rate": 4.0667408231368186e-05, + "loss": 2.7398, + "step": 3587 + }, + { + "epoch": 47.8561872909699, + "grad_norm": 1.217244267463684, + "learning_rate": 4.0622914349276975e-05, + "loss": 2.6319, + "step": 3588 + }, + { + "epoch": 47.869565217391305, + "grad_norm": 1.5038195848464966, + "learning_rate": 4.0578420467185764e-05, + "loss": 2.6373, + "step": 3589 + }, + { + "epoch": 47.88294314381271, + "grad_norm": 1.4669184684753418, + "learning_rate": 4.0533926585094554e-05, + "loss": 2.7596, + "step": 3590 + }, + { + "epoch": 47.896321070234116, + "grad_norm": 1.1874650716781616, + "learning_rate": 4.0489432703003336e-05, + "loss": 2.8438, + "step": 3591 + }, + { + "epoch": 47.90969899665552, + "grad_norm": 1.3088831901550293, + "learning_rate": 4.0444938820912125e-05, + "loss": 2.6426, + "step": 3592 + }, + { + "epoch": 47.92307692307692, + "grad_norm": 1.2181847095489502, + "learning_rate": 4.0400444938820914e-05, + "loss": 2.577, + "step": 3593 + }, + { + "epoch": 47.936454849498325, + "grad_norm": 1.4106916189193726, + "learning_rate": 4.0355951056729703e-05, + "loss": 2.8785, + "step": 3594 + }, + { + "epoch": 47.94983277591973, + "grad_norm": 1.3725144863128662, + "learning_rate": 4.031145717463849e-05, + "loss": 2.6663, + "step": 3595 + }, + { + "epoch": 47.96321070234114, + "grad_norm": 1.6895320415496826, + "learning_rate": 4.0266963292547275e-05, + "loss": 2.8136, + "step": 3596 + }, + { + "epoch": 47.97658862876254, + "grad_norm": 1.2098865509033203, + "learning_rate": 4.0222469410456064e-05, + "loss": 2.6721, + "step": 3597 + }, + { + "epoch": 47.98996655518395, + "grad_norm": 1.6445807218551636, + "learning_rate": 4.017797552836485e-05, + "loss": 2.8354, + "step": 3598 + }, + { + "epoch": 48.0, + "grad_norm": 1.6511502265930176, + "learning_rate": 4.013348164627364e-05, + "loss": 2.7395, + "step": 3599 + }, + { + "epoch": 48.013377926421406, + "grad_norm": 0.930826723575592, + "learning_rate": 4.008898776418243e-05, + "loss": 2.6881, + "step": 3600 + }, + { + "epoch": 48.02675585284281, + "grad_norm": 1.0226924419403076, + "learning_rate": 4.0044493882091214e-05, + "loss": 2.4951, + "step": 3601 + }, + { + "epoch": 48.04013377926422, + "grad_norm": 0.8309192061424255, + "learning_rate": 4e-05, + "loss": 2.7274, + "step": 3602 + }, + { + "epoch": 48.05351170568562, + "grad_norm": 0.8343853950500488, + "learning_rate": 3.995550611790879e-05, + "loss": 2.7413, + "step": 3603 + }, + { + "epoch": 48.06688963210702, + "grad_norm": 0.8467966914176941, + "learning_rate": 3.991101223581758e-05, + "loss": 2.4937, + "step": 3604 + }, + { + "epoch": 48.080267558528426, + "grad_norm": 0.8166081309318542, + "learning_rate": 3.9866518353726364e-05, + "loss": 2.6457, + "step": 3605 + }, + { + "epoch": 48.09364548494983, + "grad_norm": 0.9212111234664917, + "learning_rate": 3.982202447163515e-05, + "loss": 2.6507, + "step": 3606 + }, + { + "epoch": 48.10702341137124, + "grad_norm": 0.8982736468315125, + "learning_rate": 3.9777530589543936e-05, + "loss": 2.6162, + "step": 3607 + }, + { + "epoch": 48.12040133779264, + "grad_norm": 1.1569843292236328, + "learning_rate": 3.9733036707452725e-05, + "loss": 2.8513, + "step": 3608 + }, + { + "epoch": 48.13377926421405, + "grad_norm": 1.0270570516586304, + "learning_rate": 3.9688542825361514e-05, + "loss": 2.8707, + "step": 3609 + }, + { + "epoch": 48.147157190635454, + "grad_norm": 0.9687206745147705, + "learning_rate": 3.96440489432703e-05, + "loss": 2.7854, + "step": 3610 + }, + { + "epoch": 48.16053511705686, + "grad_norm": 0.9363980293273926, + "learning_rate": 3.959955506117909e-05, + "loss": 2.7802, + "step": 3611 + }, + { + "epoch": 48.17391304347826, + "grad_norm": 0.8202338218688965, + "learning_rate": 3.9555061179087875e-05, + "loss": 2.7608, + "step": 3612 + }, + { + "epoch": 48.187290969899664, + "grad_norm": 0.9861024618148804, + "learning_rate": 3.9510567296996664e-05, + "loss": 2.5243, + "step": 3613 + }, + { + "epoch": 48.20066889632107, + "grad_norm": 0.8582180738449097, + "learning_rate": 3.946607341490545e-05, + "loss": 2.7575, + "step": 3614 + }, + { + "epoch": 48.214046822742475, + "grad_norm": 0.8339632749557495, + "learning_rate": 3.942157953281424e-05, + "loss": 2.6449, + "step": 3615 + }, + { + "epoch": 48.22742474916388, + "grad_norm": 0.9281173944473267, + "learning_rate": 3.937708565072303e-05, + "loss": 2.6076, + "step": 3616 + }, + { + "epoch": 48.240802675585286, + "grad_norm": 1.1367732286453247, + "learning_rate": 3.9332591768631814e-05, + "loss": 2.3871, + "step": 3617 + }, + { + "epoch": 48.25418060200669, + "grad_norm": 0.8201642632484436, + "learning_rate": 3.92880978865406e-05, + "loss": 2.5026, + "step": 3618 + }, + { + "epoch": 48.2675585284281, + "grad_norm": 0.9126654267311096, + "learning_rate": 3.924360400444939e-05, + "loss": 2.8502, + "step": 3619 + }, + { + "epoch": 48.280936454849495, + "grad_norm": 1.1465530395507812, + "learning_rate": 3.919911012235818e-05, + "loss": 2.8908, + "step": 3620 + }, + { + "epoch": 48.2943143812709, + "grad_norm": 0.8558413982391357, + "learning_rate": 3.915461624026697e-05, + "loss": 2.7329, + "step": 3621 + }, + { + "epoch": 48.30769230769231, + "grad_norm": 0.8293155431747437, + "learning_rate": 3.911012235817575e-05, + "loss": 2.7223, + "step": 3622 + }, + { + "epoch": 48.32107023411371, + "grad_norm": 0.9223231077194214, + "learning_rate": 3.906562847608454e-05, + "loss": 2.7142, + "step": 3623 + }, + { + "epoch": 48.33444816053512, + "grad_norm": 0.8491235971450806, + "learning_rate": 3.902113459399333e-05, + "loss": 2.6362, + "step": 3624 + }, + { + "epoch": 48.34782608695652, + "grad_norm": 0.8782801032066345, + "learning_rate": 3.8976640711902113e-05, + "loss": 2.7339, + "step": 3625 + }, + { + "epoch": 48.36120401337793, + "grad_norm": 0.8530482053756714, + "learning_rate": 3.89321468298109e-05, + "loss": 2.8429, + "step": 3626 + }, + { + "epoch": 48.374581939799334, + "grad_norm": 1.0677621364593506, + "learning_rate": 3.8887652947719685e-05, + "loss": 2.6479, + "step": 3627 + }, + { + "epoch": 48.38795986622073, + "grad_norm": 0.9075163006782532, + "learning_rate": 3.8843159065628474e-05, + "loss": 2.4386, + "step": 3628 + }, + { + "epoch": 48.40133779264214, + "grad_norm": 0.8299190402030945, + "learning_rate": 3.879866518353726e-05, + "loss": 2.5828, + "step": 3629 + }, + { + "epoch": 48.414715719063544, + "grad_norm": 0.877830982208252, + "learning_rate": 3.875417130144605e-05, + "loss": 2.7312, + "step": 3630 + }, + { + "epoch": 48.42809364548495, + "grad_norm": 1.0116523504257202, + "learning_rate": 3.870967741935484e-05, + "loss": 2.6551, + "step": 3631 + }, + { + "epoch": 48.441471571906355, + "grad_norm": 0.9170435667037964, + "learning_rate": 3.8665183537263624e-05, + "loss": 2.6219, + "step": 3632 + }, + { + "epoch": 48.45484949832776, + "grad_norm": 1.041170358657837, + "learning_rate": 3.862068965517241e-05, + "loss": 2.6453, + "step": 3633 + }, + { + "epoch": 48.468227424749166, + "grad_norm": 1.0471312999725342, + "learning_rate": 3.85761957730812e-05, + "loss": 2.7593, + "step": 3634 + }, + { + "epoch": 48.48160535117057, + "grad_norm": 0.962745726108551, + "learning_rate": 3.853170189098999e-05, + "loss": 2.7035, + "step": 3635 + }, + { + "epoch": 48.49498327759197, + "grad_norm": 1.091069221496582, + "learning_rate": 3.848720800889878e-05, + "loss": 2.55, + "step": 3636 + }, + { + "epoch": 48.508361204013376, + "grad_norm": 0.932161808013916, + "learning_rate": 3.844271412680756e-05, + "loss": 2.6504, + "step": 3637 + }, + { + "epoch": 48.52173913043478, + "grad_norm": 0.9401114583015442, + "learning_rate": 3.839822024471635e-05, + "loss": 2.6304, + "step": 3638 + }, + { + "epoch": 48.53511705685619, + "grad_norm": 0.8302827477455139, + "learning_rate": 3.835372636262514e-05, + "loss": 2.6115, + "step": 3639 + }, + { + "epoch": 48.54849498327759, + "grad_norm": 0.9349223971366882, + "learning_rate": 3.830923248053393e-05, + "loss": 2.4827, + "step": 3640 + }, + { + "epoch": 48.561872909699, + "grad_norm": 1.053091287612915, + "learning_rate": 3.826473859844272e-05, + "loss": 2.515, + "step": 3641 + }, + { + "epoch": 48.575250836120404, + "grad_norm": 0.9354698657989502, + "learning_rate": 3.82202447163515e-05, + "loss": 2.8267, + "step": 3642 + }, + { + "epoch": 48.58862876254181, + "grad_norm": 0.9621760845184326, + "learning_rate": 3.817575083426029e-05, + "loss": 2.7924, + "step": 3643 + }, + { + "epoch": 48.60200668896321, + "grad_norm": 1.0033005475997925, + "learning_rate": 3.813125695216908e-05, + "loss": 2.7694, + "step": 3644 + }, + { + "epoch": 48.61538461538461, + "grad_norm": 0.9117392301559448, + "learning_rate": 3.808676307007787e-05, + "loss": 2.8041, + "step": 3645 + }, + { + "epoch": 48.62876254180602, + "grad_norm": 1.1089485883712769, + "learning_rate": 3.804226918798665e-05, + "loss": 2.8602, + "step": 3646 + }, + { + "epoch": 48.642140468227424, + "grad_norm": 1.098798155784607, + "learning_rate": 3.799777530589544e-05, + "loss": 2.6619, + "step": 3647 + }, + { + "epoch": 48.65551839464883, + "grad_norm": 0.9343369603157043, + "learning_rate": 3.7953281423804224e-05, + "loss": 2.4415, + "step": 3648 + }, + { + "epoch": 48.668896321070235, + "grad_norm": 1.1379038095474243, + "learning_rate": 3.790878754171301e-05, + "loss": 2.6495, + "step": 3649 + }, + { + "epoch": 48.68227424749164, + "grad_norm": 0.8679940700531006, + "learning_rate": 3.78642936596218e-05, + "loss": 2.6208, + "step": 3650 + }, + { + "epoch": 48.69565217391305, + "grad_norm": 0.8417056202888489, + "learning_rate": 3.781979977753059e-05, + "loss": 2.7265, + "step": 3651 + }, + { + "epoch": 48.709030100334445, + "grad_norm": 1.0414758920669556, + "learning_rate": 3.777530589543938e-05, + "loss": 2.876, + "step": 3652 + }, + { + "epoch": 48.72240802675585, + "grad_norm": 0.9891830682754517, + "learning_rate": 3.773081201334816e-05, + "loss": 2.5307, + "step": 3653 + }, + { + "epoch": 48.735785953177256, + "grad_norm": 0.8791108727455139, + "learning_rate": 3.768631813125695e-05, + "loss": 2.6067, + "step": 3654 + }, + { + "epoch": 48.74916387959866, + "grad_norm": 1.0120600461959839, + "learning_rate": 3.764182424916574e-05, + "loss": 2.6719, + "step": 3655 + }, + { + "epoch": 48.76254180602007, + "grad_norm": 0.9853880405426025, + "learning_rate": 3.759733036707453e-05, + "loss": 2.6551, + "step": 3656 + }, + { + "epoch": 48.77591973244147, + "grad_norm": 0.8162937164306641, + "learning_rate": 3.755283648498332e-05, + "loss": 2.7002, + "step": 3657 + }, + { + "epoch": 48.78929765886288, + "grad_norm": 0.8841816782951355, + "learning_rate": 3.75083426028921e-05, + "loss": 2.6199, + "step": 3658 + }, + { + "epoch": 48.802675585284284, + "grad_norm": 0.8989855051040649, + "learning_rate": 3.746384872080089e-05, + "loss": 2.8484, + "step": 3659 + }, + { + "epoch": 48.81605351170568, + "grad_norm": 0.9171043634414673, + "learning_rate": 3.741935483870968e-05, + "loss": 2.7621, + "step": 3660 + }, + { + "epoch": 48.82943143812709, + "grad_norm": 0.9595106840133667, + "learning_rate": 3.737486095661847e-05, + "loss": 2.8166, + "step": 3661 + }, + { + "epoch": 48.84280936454849, + "grad_norm": 1.1387951374053955, + "learning_rate": 3.733036707452726e-05, + "loss": 2.6994, + "step": 3662 + }, + { + "epoch": 48.8561872909699, + "grad_norm": 0.9676811695098877, + "learning_rate": 3.728587319243604e-05, + "loss": 2.7282, + "step": 3663 + }, + { + "epoch": 48.869565217391305, + "grad_norm": 1.1474648714065552, + "learning_rate": 3.724137931034483e-05, + "loss": 2.6101, + "step": 3664 + }, + { + "epoch": 48.88294314381271, + "grad_norm": 0.8521884679794312, + "learning_rate": 3.719688542825362e-05, + "loss": 2.3969, + "step": 3665 + }, + { + "epoch": 48.896321070234116, + "grad_norm": 0.8633421063423157, + "learning_rate": 3.71523915461624e-05, + "loss": 2.7976, + "step": 3666 + }, + { + "epoch": 48.90969899665552, + "grad_norm": 0.9763745665550232, + "learning_rate": 3.710789766407119e-05, + "loss": 2.6065, + "step": 3667 + }, + { + "epoch": 48.92307692307692, + "grad_norm": 0.7422575354576111, + "learning_rate": 3.706340378197998e-05, + "loss": 2.6796, + "step": 3668 + }, + { + "epoch": 48.936454849498325, + "grad_norm": 0.933746874332428, + "learning_rate": 3.701890989988876e-05, + "loss": 2.6876, + "step": 3669 + }, + { + "epoch": 48.94983277591973, + "grad_norm": 0.8498068451881409, + "learning_rate": 3.697441601779755e-05, + "loss": 2.7027, + "step": 3670 + }, + { + "epoch": 48.96321070234114, + "grad_norm": 0.9815676212310791, + "learning_rate": 3.692992213570634e-05, + "loss": 2.5987, + "step": 3671 + }, + { + "epoch": 48.97658862876254, + "grad_norm": 0.9350228309631348, + "learning_rate": 3.688542825361513e-05, + "loss": 2.6082, + "step": 3672 + }, + { + "epoch": 48.98996655518395, + "grad_norm": 0.8007590770721436, + "learning_rate": 3.684093437152392e-05, + "loss": 2.3251, + "step": 3673 + }, + { + "epoch": 49.0, + "grad_norm": 1.2886775732040405, + "learning_rate": 3.67964404894327e-05, + "loss": 2.5578, + "step": 3674 + }, + { + "epoch": 49.013377926421406, + "grad_norm": 0.8050103187561035, + "learning_rate": 3.675194660734149e-05, + "loss": 2.6582, + "step": 3675 + }, + { + "epoch": 49.02675585284281, + "grad_norm": 0.7852005958557129, + "learning_rate": 3.670745272525028e-05, + "loss": 2.7088, + "step": 3676 + }, + { + "epoch": 49.04013377926422, + "grad_norm": 0.8211432695388794, + "learning_rate": 3.666295884315907e-05, + "loss": 2.3469, + "step": 3677 + }, + { + "epoch": 49.05351170568562, + "grad_norm": 0.8142633438110352, + "learning_rate": 3.661846496106786e-05, + "loss": 2.5533, + "step": 3678 + }, + { + "epoch": 49.06688963210702, + "grad_norm": 0.7460059523582458, + "learning_rate": 3.657397107897664e-05, + "loss": 2.7169, + "step": 3679 + }, + { + "epoch": 49.080267558528426, + "grad_norm": 0.8187868595123291, + "learning_rate": 3.652947719688543e-05, + "loss": 2.6557, + "step": 3680 + }, + { + "epoch": 49.09364548494983, + "grad_norm": 0.6976158022880554, + "learning_rate": 3.648498331479422e-05, + "loss": 2.6892, + "step": 3681 + }, + { + "epoch": 49.10702341137124, + "grad_norm": 0.8464581370353699, + "learning_rate": 3.644048943270301e-05, + "loss": 2.6912, + "step": 3682 + }, + { + "epoch": 49.12040133779264, + "grad_norm": 0.7306657433509827, + "learning_rate": 3.63959955506118e-05, + "loss": 2.4801, + "step": 3683 + }, + { + "epoch": 49.13377926421405, + "grad_norm": 0.7567654252052307, + "learning_rate": 3.635150166852058e-05, + "loss": 2.6735, + "step": 3684 + }, + { + "epoch": 49.147157190635454, + "grad_norm": 0.804836630821228, + "learning_rate": 3.630700778642937e-05, + "loss": 2.5206, + "step": 3685 + }, + { + "epoch": 49.16053511705686, + "grad_norm": 0.7669990658760071, + "learning_rate": 3.626251390433816e-05, + "loss": 2.6387, + "step": 3686 + }, + { + "epoch": 49.17391304347826, + "grad_norm": 0.7316834926605225, + "learning_rate": 3.621802002224694e-05, + "loss": 2.5791, + "step": 3687 + }, + { + "epoch": 49.187290969899664, + "grad_norm": 0.6893475651741028, + "learning_rate": 3.617352614015573e-05, + "loss": 2.5242, + "step": 3688 + }, + { + "epoch": 49.20066889632107, + "grad_norm": 0.7602696418762207, + "learning_rate": 3.612903225806452e-05, + "loss": 2.6918, + "step": 3689 + }, + { + "epoch": 49.214046822742475, + "grad_norm": 0.7876319289207458, + "learning_rate": 3.60845383759733e-05, + "loss": 2.5738, + "step": 3690 + }, + { + "epoch": 49.22742474916388, + "grad_norm": 0.9337881207466125, + "learning_rate": 3.604004449388209e-05, + "loss": 2.6884, + "step": 3691 + }, + { + "epoch": 49.240802675585286, + "grad_norm": 0.7203758955001831, + "learning_rate": 3.599555061179088e-05, + "loss": 2.4134, + "step": 3692 + }, + { + "epoch": 49.25418060200669, + "grad_norm": 0.8985403776168823, + "learning_rate": 3.595105672969967e-05, + "loss": 2.4833, + "step": 3693 + }, + { + "epoch": 49.2675585284281, + "grad_norm": 0.7891301512718201, + "learning_rate": 3.590656284760846e-05, + "loss": 2.8944, + "step": 3694 + }, + { + "epoch": 49.280936454849495, + "grad_norm": 0.7096188068389893, + "learning_rate": 3.586206896551724e-05, + "loss": 2.4957, + "step": 3695 + }, + { + "epoch": 49.2943143812709, + "grad_norm": 0.7646763920783997, + "learning_rate": 3.581757508342603e-05, + "loss": 2.2093, + "step": 3696 + }, + { + "epoch": 49.30769230769231, + "grad_norm": 0.8256480693817139, + "learning_rate": 3.577308120133482e-05, + "loss": 2.6597, + "step": 3697 + }, + { + "epoch": 49.32107023411371, + "grad_norm": 0.7693411707878113, + "learning_rate": 3.572858731924361e-05, + "loss": 2.5376, + "step": 3698 + }, + { + "epoch": 49.33444816053512, + "grad_norm": 0.7902242541313171, + "learning_rate": 3.5684093437152397e-05, + "loss": 2.887, + "step": 3699 + }, + { + "epoch": 49.34782608695652, + "grad_norm": 0.7505605220794678, + "learning_rate": 3.563959955506118e-05, + "loss": 2.7285, + "step": 3700 + }, + { + "epoch": 49.36120401337793, + "grad_norm": 0.9204686284065247, + "learning_rate": 3.559510567296997e-05, + "loss": 2.7478, + "step": 3701 + }, + { + "epoch": 49.374581939799334, + "grad_norm": 0.6998848915100098, + "learning_rate": 3.555061179087876e-05, + "loss": 2.405, + "step": 3702 + }, + { + "epoch": 49.38795986622073, + "grad_norm": 0.7633227109909058, + "learning_rate": 3.5506117908787546e-05, + "loss": 2.4304, + "step": 3703 + }, + { + "epoch": 49.40133779264214, + "grad_norm": 0.8731907606124878, + "learning_rate": 3.5461624026696336e-05, + "loss": 2.6215, + "step": 3704 + }, + { + "epoch": 49.414715719063544, + "grad_norm": 0.7742621302604675, + "learning_rate": 3.541713014460512e-05, + "loss": 2.6656, + "step": 3705 + }, + { + "epoch": 49.42809364548495, + "grad_norm": 0.7795779705047607, + "learning_rate": 3.537263626251391e-05, + "loss": 2.6521, + "step": 3706 + }, + { + "epoch": 49.441471571906355, + "grad_norm": 0.8041906356811523, + "learning_rate": 3.5328142380422696e-05, + "loss": 2.6883, + "step": 3707 + }, + { + "epoch": 49.45484949832776, + "grad_norm": 0.8720248937606812, + "learning_rate": 3.528364849833148e-05, + "loss": 2.295, + "step": 3708 + }, + { + "epoch": 49.468227424749166, + "grad_norm": 0.8696540594100952, + "learning_rate": 3.523915461624027e-05, + "loss": 2.6877, + "step": 3709 + }, + { + "epoch": 49.48160535117057, + "grad_norm": 0.9745573401451111, + "learning_rate": 3.519466073414906e-05, + "loss": 2.5912, + "step": 3710 + }, + { + "epoch": 49.49498327759197, + "grad_norm": 1.1198558807373047, + "learning_rate": 3.515016685205784e-05, + "loss": 2.7686, + "step": 3711 + }, + { + "epoch": 49.508361204013376, + "grad_norm": 0.759647011756897, + "learning_rate": 3.510567296996663e-05, + "loss": 2.6488, + "step": 3712 + }, + { + "epoch": 49.52173913043478, + "grad_norm": 0.7952780723571777, + "learning_rate": 3.506117908787542e-05, + "loss": 2.7659, + "step": 3713 + }, + { + "epoch": 49.53511705685619, + "grad_norm": 0.8217593431472778, + "learning_rate": 3.501668520578421e-05, + "loss": 2.7766, + "step": 3714 + }, + { + "epoch": 49.54849498327759, + "grad_norm": 0.7650991678237915, + "learning_rate": 3.4972191323692996e-05, + "loss": 2.5743, + "step": 3715 + }, + { + "epoch": 49.561872909699, + "grad_norm": 0.726296067237854, + "learning_rate": 3.492769744160178e-05, + "loss": 2.6984, + "step": 3716 + }, + { + "epoch": 49.575250836120404, + "grad_norm": 0.7600445747375488, + "learning_rate": 3.488320355951057e-05, + "loss": 2.454, + "step": 3717 + }, + { + "epoch": 49.58862876254181, + "grad_norm": 0.8778021335601807, + "learning_rate": 3.483870967741936e-05, + "loss": 2.7389, + "step": 3718 + }, + { + "epoch": 49.60200668896321, + "grad_norm": 0.7271983623504639, + "learning_rate": 3.4794215795328146e-05, + "loss": 2.4673, + "step": 3719 + }, + { + "epoch": 49.61538461538461, + "grad_norm": 0.7534515261650085, + "learning_rate": 3.4749721913236935e-05, + "loss": 2.7838, + "step": 3720 + }, + { + "epoch": 49.62876254180602, + "grad_norm": 0.9334620237350464, + "learning_rate": 3.470522803114572e-05, + "loss": 2.7135, + "step": 3721 + }, + { + "epoch": 49.642140468227424, + "grad_norm": 0.7726050019264221, + "learning_rate": 3.466073414905451e-05, + "loss": 2.5617, + "step": 3722 + }, + { + "epoch": 49.65551839464883, + "grad_norm": 0.8581987023353577, + "learning_rate": 3.4616240266963296e-05, + "loss": 2.7521, + "step": 3723 + }, + { + "epoch": 49.668896321070235, + "grad_norm": 0.8535495400428772, + "learning_rate": 3.4571746384872085e-05, + "loss": 2.4661, + "step": 3724 + }, + { + "epoch": 49.68227424749164, + "grad_norm": 0.691425621509552, + "learning_rate": 3.4527252502780874e-05, + "loss": 2.4683, + "step": 3725 + }, + { + "epoch": 49.69565217391305, + "grad_norm": 0.7674373984336853, + "learning_rate": 3.4482758620689657e-05, + "loss": 2.6441, + "step": 3726 + }, + { + "epoch": 49.709030100334445, + "grad_norm": 0.7647296190261841, + "learning_rate": 3.4438264738598446e-05, + "loss": 2.6286, + "step": 3727 + }, + { + "epoch": 49.72240802675585, + "grad_norm": 0.7668997645378113, + "learning_rate": 3.439377085650723e-05, + "loss": 2.762, + "step": 3728 + }, + { + "epoch": 49.735785953177256, + "grad_norm": 0.8111737370491028, + "learning_rate": 3.434927697441602e-05, + "loss": 2.6294, + "step": 3729 + }, + { + "epoch": 49.74916387959866, + "grad_norm": 0.7185457348823547, + "learning_rate": 3.4304783092324806e-05, + "loss": 2.5763, + "step": 3730 + }, + { + "epoch": 49.76254180602007, + "grad_norm": 0.8142096996307373, + "learning_rate": 3.4260289210233596e-05, + "loss": 2.8209, + "step": 3731 + }, + { + "epoch": 49.77591973244147, + "grad_norm": 0.7466010451316833, + "learning_rate": 3.421579532814238e-05, + "loss": 2.7076, + "step": 3732 + }, + { + "epoch": 49.78929765886288, + "grad_norm": 0.8199285268783569, + "learning_rate": 3.417130144605117e-05, + "loss": 2.3427, + "step": 3733 + }, + { + "epoch": 49.802675585284284, + "grad_norm": 0.8280915021896362, + "learning_rate": 3.4126807563959956e-05, + "loss": 2.5477, + "step": 3734 + }, + { + "epoch": 49.81605351170568, + "grad_norm": 0.7132146954536438, + "learning_rate": 3.4082313681868746e-05, + "loss": 2.6196, + "step": 3735 + }, + { + "epoch": 49.82943143812709, + "grad_norm": 0.7380895018577576, + "learning_rate": 3.4037819799777535e-05, + "loss": 2.8091, + "step": 3736 + }, + { + "epoch": 49.84280936454849, + "grad_norm": 0.7611865401268005, + "learning_rate": 3.399332591768632e-05, + "loss": 2.7362, + "step": 3737 + }, + { + "epoch": 49.8561872909699, + "grad_norm": 0.7730408906936646, + "learning_rate": 3.3948832035595106e-05, + "loss": 2.7238, + "step": 3738 + }, + { + "epoch": 49.869565217391305, + "grad_norm": 0.7552332282066345, + "learning_rate": 3.3904338153503895e-05, + "loss": 2.7884, + "step": 3739 + }, + { + "epoch": 49.88294314381271, + "grad_norm": 0.8960534334182739, + "learning_rate": 3.3859844271412685e-05, + "loss": 2.7067, + "step": 3740 + }, + { + "epoch": 49.896321070234116, + "grad_norm": 0.8455086350440979, + "learning_rate": 3.3815350389321474e-05, + "loss": 2.5756, + "step": 3741 + }, + { + "epoch": 49.90969899665552, + "grad_norm": 0.8840497732162476, + "learning_rate": 3.3770856507230256e-05, + "loss": 2.6779, + "step": 3742 + }, + { + "epoch": 49.92307692307692, + "grad_norm": 0.7964041829109192, + "learning_rate": 3.3726362625139045e-05, + "loss": 2.7642, + "step": 3743 + }, + { + "epoch": 49.936454849498325, + "grad_norm": 0.7956447601318359, + "learning_rate": 3.3681868743047834e-05, + "loss": 2.5019, + "step": 3744 + }, + { + "epoch": 49.94983277591973, + "grad_norm": 0.743416965007782, + "learning_rate": 3.3637374860956624e-05, + "loss": 2.5265, + "step": 3745 + }, + { + "epoch": 49.96321070234114, + "grad_norm": 0.8189771771430969, + "learning_rate": 3.359288097886541e-05, + "loss": 2.671, + "step": 3746 + }, + { + "epoch": 49.97658862876254, + "grad_norm": 0.778721034526825, + "learning_rate": 3.3548387096774195e-05, + "loss": 2.6753, + "step": 3747 + }, + { + "epoch": 49.98996655518395, + "grad_norm": 0.7933366894721985, + "learning_rate": 3.3503893214682984e-05, + "loss": 2.6491, + "step": 3748 + }, + { + "epoch": 50.0, + "grad_norm": 1.0016274452209473, + "learning_rate": 3.345939933259177e-05, + "loss": 2.5347, + "step": 3749 + }, + { + "epoch": 50.013377926421406, + "grad_norm": 0.7115795016288757, + "learning_rate": 3.3414905450500556e-05, + "loss": 2.4834, + "step": 3750 + }, + { + "epoch": 50.02675585284281, + "grad_norm": 0.6772646307945251, + "learning_rate": 3.3370411568409345e-05, + "loss": 2.7454, + "step": 3751 + }, + { + "epoch": 50.04013377926422, + "grad_norm": 0.7457217574119568, + "learning_rate": 3.3325917686318134e-05, + "loss": 2.6569, + "step": 3752 + }, + { + "epoch": 50.05351170568562, + "grad_norm": 0.6905792951583862, + "learning_rate": 3.328142380422692e-05, + "loss": 2.5026, + "step": 3753 + }, + { + "epoch": 50.06688963210702, + "grad_norm": 0.6815123558044434, + "learning_rate": 3.3236929922135706e-05, + "loss": 2.6324, + "step": 3754 + }, + { + "epoch": 50.080267558528426, + "grad_norm": 0.7720118761062622, + "learning_rate": 3.3192436040044495e-05, + "loss": 2.6267, + "step": 3755 + }, + { + "epoch": 50.09364548494983, + "grad_norm": 0.7029479146003723, + "learning_rate": 3.3147942157953284e-05, + "loss": 2.3862, + "step": 3756 + }, + { + "epoch": 50.10702341137124, + "grad_norm": 0.6969912648200989, + "learning_rate": 3.310344827586207e-05, + "loss": 2.6517, + "step": 3757 + }, + { + "epoch": 50.12040133779264, + "grad_norm": 0.71588134765625, + "learning_rate": 3.3058954393770856e-05, + "loss": 2.5768, + "step": 3758 + }, + { + "epoch": 50.13377926421405, + "grad_norm": 0.7532247304916382, + "learning_rate": 3.3014460511679645e-05, + "loss": 2.7685, + "step": 3759 + }, + { + "epoch": 50.147157190635454, + "grad_norm": 0.7348672747612, + "learning_rate": 3.2969966629588434e-05, + "loss": 2.653, + "step": 3760 + }, + { + "epoch": 50.16053511705686, + "grad_norm": 0.7328438758850098, + "learning_rate": 3.292547274749722e-05, + "loss": 2.4988, + "step": 3761 + }, + { + "epoch": 50.17391304347826, + "grad_norm": 0.7021594047546387, + "learning_rate": 3.288097886540601e-05, + "loss": 2.4435, + "step": 3762 + }, + { + "epoch": 50.187290969899664, + "grad_norm": 0.6721189022064209, + "learning_rate": 3.2836484983314795e-05, + "loss": 2.5429, + "step": 3763 + }, + { + "epoch": 50.20066889632107, + "grad_norm": 0.7436168193817139, + "learning_rate": 3.2791991101223584e-05, + "loss": 2.9638, + "step": 3764 + }, + { + "epoch": 50.214046822742475, + "grad_norm": 0.7094192504882812, + "learning_rate": 3.274749721913237e-05, + "loss": 2.6454, + "step": 3765 + }, + { + "epoch": 50.22742474916388, + "grad_norm": 0.7569435238838196, + "learning_rate": 3.270300333704116e-05, + "loss": 2.5448, + "step": 3766 + }, + { + "epoch": 50.240802675585286, + "grad_norm": 0.692587673664093, + "learning_rate": 3.265850945494995e-05, + "loss": 2.5065, + "step": 3767 + }, + { + "epoch": 50.25418060200669, + "grad_norm": 0.6578419208526611, + "learning_rate": 3.2614015572858734e-05, + "loss": 2.3608, + "step": 3768 + }, + { + "epoch": 50.2675585284281, + "grad_norm": 0.7179403305053711, + "learning_rate": 3.256952169076752e-05, + "loss": 2.5417, + "step": 3769 + }, + { + "epoch": 50.280936454849495, + "grad_norm": 0.7056894898414612, + "learning_rate": 3.2525027808676305e-05, + "loss": 2.5924, + "step": 3770 + }, + { + "epoch": 50.2943143812709, + "grad_norm": 0.6727268695831299, + "learning_rate": 3.2480533926585095e-05, + "loss": 2.7164, + "step": 3771 + }, + { + "epoch": 50.30769230769231, + "grad_norm": 0.7499269247055054, + "learning_rate": 3.2436040044493884e-05, + "loss": 2.6172, + "step": 3772 + }, + { + "epoch": 50.32107023411371, + "grad_norm": 0.735407829284668, + "learning_rate": 3.239154616240267e-05, + "loss": 2.5634, + "step": 3773 + }, + { + "epoch": 50.33444816053512, + "grad_norm": 0.8683627247810364, + "learning_rate": 3.2347052280311455e-05, + "loss": 2.6344, + "step": 3774 + }, + { + "epoch": 50.34782608695652, + "grad_norm": 0.6927014589309692, + "learning_rate": 3.2302558398220244e-05, + "loss": 2.4513, + "step": 3775 + }, + { + "epoch": 50.36120401337793, + "grad_norm": 0.7480311989784241, + "learning_rate": 3.2258064516129034e-05, + "loss": 2.5682, + "step": 3776 + }, + { + "epoch": 50.374581939799334, + "grad_norm": 0.6959646344184875, + "learning_rate": 3.221357063403782e-05, + "loss": 2.4993, + "step": 3777 + }, + { + "epoch": 50.38795986622073, + "grad_norm": 0.6775467395782471, + "learning_rate": 3.2169076751946605e-05, + "loss": 2.4055, + "step": 3778 + }, + { + "epoch": 50.40133779264214, + "grad_norm": 0.74509197473526, + "learning_rate": 3.2124582869855394e-05, + "loss": 2.4928, + "step": 3779 + }, + { + "epoch": 50.414715719063544, + "grad_norm": 0.7157049775123596, + "learning_rate": 3.2080088987764183e-05, + "loss": 2.7255, + "step": 3780 + }, + { + "epoch": 50.42809364548495, + "grad_norm": 0.8020923733711243, + "learning_rate": 3.203559510567297e-05, + "loss": 2.6627, + "step": 3781 + }, + { + "epoch": 50.441471571906355, + "grad_norm": 0.7195694446563721, + "learning_rate": 3.199110122358176e-05, + "loss": 2.5675, + "step": 3782 + }, + { + "epoch": 50.45484949832776, + "grad_norm": 0.6762703657150269, + "learning_rate": 3.1946607341490544e-05, + "loss": 2.5307, + "step": 3783 + }, + { + "epoch": 50.468227424749166, + "grad_norm": 0.697120726108551, + "learning_rate": 3.190211345939933e-05, + "loss": 2.7164, + "step": 3784 + }, + { + "epoch": 50.48160535117057, + "grad_norm": 0.7056689262390137, + "learning_rate": 3.185761957730812e-05, + "loss": 2.5651, + "step": 3785 + }, + { + "epoch": 50.49498327759197, + "grad_norm": 0.7277424335479736, + "learning_rate": 3.181312569521691e-05, + "loss": 2.482, + "step": 3786 + }, + { + "epoch": 50.508361204013376, + "grad_norm": 0.7807835340499878, + "learning_rate": 3.17686318131257e-05, + "loss": 2.5597, + "step": 3787 + }, + { + "epoch": 50.52173913043478, + "grad_norm": 0.7482343912124634, + "learning_rate": 3.172413793103448e-05, + "loss": 2.4444, + "step": 3788 + }, + { + "epoch": 50.53511705685619, + "grad_norm": 0.6976211071014404, + "learning_rate": 3.167964404894327e-05, + "loss": 2.6283, + "step": 3789 + }, + { + "epoch": 50.54849498327759, + "grad_norm": 0.6839179396629333, + "learning_rate": 3.1635150166852055e-05, + "loss": 2.712, + "step": 3790 + }, + { + "epoch": 50.561872909699, + "grad_norm": 0.7053922414779663, + "learning_rate": 3.1590656284760844e-05, + "loss": 2.5473, + "step": 3791 + }, + { + "epoch": 50.575250836120404, + "grad_norm": 0.6922830939292908, + "learning_rate": 3.154616240266963e-05, + "loss": 2.6293, + "step": 3792 + }, + { + "epoch": 50.58862876254181, + "grad_norm": 0.8351788520812988, + "learning_rate": 3.150166852057842e-05, + "loss": 2.807, + "step": 3793 + }, + { + "epoch": 50.60200668896321, + "grad_norm": 0.7981114387512207, + "learning_rate": 3.1457174638487205e-05, + "loss": 2.3676, + "step": 3794 + }, + { + "epoch": 50.61538461538461, + "grad_norm": 0.7550584077835083, + "learning_rate": 3.1412680756395994e-05, + "loss": 2.7333, + "step": 3795 + }, + { + "epoch": 50.62876254180602, + "grad_norm": 0.7444553971290588, + "learning_rate": 3.136818687430478e-05, + "loss": 2.5449, + "step": 3796 + }, + { + "epoch": 50.642140468227424, + "grad_norm": 0.7490617632865906, + "learning_rate": 3.132369299221357e-05, + "loss": 2.523, + "step": 3797 + }, + { + "epoch": 50.65551839464883, + "grad_norm": 0.7713622450828552, + "learning_rate": 3.127919911012236e-05, + "loss": 2.2943, + "step": 3798 + }, + { + "epoch": 50.668896321070235, + "grad_norm": 0.6862428784370422, + "learning_rate": 3.1234705228031144e-05, + "loss": 2.5255, + "step": 3799 + }, + { + "epoch": 50.68227424749164, + "grad_norm": 0.755214273929596, + "learning_rate": 3.119021134593993e-05, + "loss": 2.5067, + "step": 3800 + }, + { + "epoch": 50.69565217391305, + "grad_norm": 0.690545380115509, + "learning_rate": 3.114571746384872e-05, + "loss": 2.5803, + "step": 3801 + }, + { + "epoch": 50.709030100334445, + "grad_norm": 0.7627785801887512, + "learning_rate": 3.110122358175751e-05, + "loss": 2.5416, + "step": 3802 + }, + { + "epoch": 50.72240802675585, + "grad_norm": 0.7653105854988098, + "learning_rate": 3.10567296996663e-05, + "loss": 2.8272, + "step": 3803 + }, + { + "epoch": 50.735785953177256, + "grad_norm": 0.7366088628768921, + "learning_rate": 3.101223581757508e-05, + "loss": 2.6717, + "step": 3804 + }, + { + "epoch": 50.74916387959866, + "grad_norm": 0.7387148141860962, + "learning_rate": 3.096774193548387e-05, + "loss": 2.6479, + "step": 3805 + }, + { + "epoch": 50.76254180602007, + "grad_norm": 0.76811283826828, + "learning_rate": 3.092324805339266e-05, + "loss": 2.6197, + "step": 3806 + }, + { + "epoch": 50.77591973244147, + "grad_norm": 0.7530215978622437, + "learning_rate": 3.087875417130145e-05, + "loss": 2.7431, + "step": 3807 + }, + { + "epoch": 50.78929765886288, + "grad_norm": 0.7515009641647339, + "learning_rate": 3.083426028921024e-05, + "loss": 2.7594, + "step": 3808 + }, + { + "epoch": 50.802675585284284, + "grad_norm": 0.6830329298973083, + "learning_rate": 3.078976640711902e-05, + "loss": 2.6105, + "step": 3809 + }, + { + "epoch": 50.81605351170568, + "grad_norm": 0.7070818543434143, + "learning_rate": 3.074527252502781e-05, + "loss": 2.6362, + "step": 3810 + }, + { + "epoch": 50.82943143812709, + "grad_norm": 0.7041717171669006, + "learning_rate": 3.0700778642936593e-05, + "loss": 2.5358, + "step": 3811 + }, + { + "epoch": 50.84280936454849, + "grad_norm": 0.7511767148971558, + "learning_rate": 3.065628476084538e-05, + "loss": 2.482, + "step": 3812 + }, + { + "epoch": 50.8561872909699, + "grad_norm": 0.7591724395751953, + "learning_rate": 3.061179087875417e-05, + "loss": 2.7301, + "step": 3813 + }, + { + "epoch": 50.869565217391305, + "grad_norm": 0.7175691723823547, + "learning_rate": 3.056729699666296e-05, + "loss": 2.689, + "step": 3814 + }, + { + "epoch": 50.88294314381271, + "grad_norm": 0.7458793520927429, + "learning_rate": 3.052280311457174e-05, + "loss": 2.5394, + "step": 3815 + }, + { + "epoch": 50.896321070234116, + "grad_norm": 0.7417124509811401, + "learning_rate": 3.0478309232480532e-05, + "loss": 2.7544, + "step": 3816 + }, + { + "epoch": 50.90969899665552, + "grad_norm": 0.7595741748809814, + "learning_rate": 3.043381535038932e-05, + "loss": 2.5443, + "step": 3817 + }, + { + "epoch": 50.92307692307692, + "grad_norm": 0.6968060731887817, + "learning_rate": 3.038932146829811e-05, + "loss": 2.8318, + "step": 3818 + }, + { + "epoch": 50.936454849498325, + "grad_norm": 0.8147920966148376, + "learning_rate": 3.0344827586206897e-05, + "loss": 2.51, + "step": 3819 + }, + { + "epoch": 50.94983277591973, + "grad_norm": 0.7169308066368103, + "learning_rate": 3.0300333704115686e-05, + "loss": 2.7062, + "step": 3820 + }, + { + "epoch": 50.96321070234114, + "grad_norm": 0.6849644184112549, + "learning_rate": 3.025583982202447e-05, + "loss": 2.5129, + "step": 3821 + }, + { + "epoch": 50.97658862876254, + "grad_norm": 0.6856783628463745, + "learning_rate": 3.021134593993326e-05, + "loss": 2.606, + "step": 3822 + }, + { + "epoch": 50.98996655518395, + "grad_norm": 0.6814429759979248, + "learning_rate": 3.016685205784205e-05, + "loss": 2.606, + "step": 3823 + }, + { + "epoch": 51.0, + "grad_norm": 0.8198628425598145, + "learning_rate": 3.0122358175750836e-05, + "loss": 2.1563, + "step": 3824 + }, + { + "epoch": 51.013377926421406, + "grad_norm": 0.6644238829612732, + "learning_rate": 3.0077864293659625e-05, + "loss": 2.4632, + "step": 3825 + }, + { + "epoch": 51.02675585284281, + "grad_norm": 0.6920905113220215, + "learning_rate": 3.003337041156841e-05, + "loss": 2.436, + "step": 3826 + }, + { + "epoch": 51.04013377926422, + "grad_norm": 0.6688025593757629, + "learning_rate": 2.99888765294772e-05, + "loss": 2.6136, + "step": 3827 + }, + { + "epoch": 51.05351170568562, + "grad_norm": 0.6641514897346497, + "learning_rate": 2.994438264738599e-05, + "loss": 2.5133, + "step": 3828 + }, + { + "epoch": 51.06688963210702, + "grad_norm": 0.6722247004508972, + "learning_rate": 2.9899888765294775e-05, + "loss": 2.5963, + "step": 3829 + }, + { + "epoch": 51.080267558528426, + "grad_norm": 0.698442280292511, + "learning_rate": 2.9855394883203564e-05, + "loss": 2.5602, + "step": 3830 + }, + { + "epoch": 51.09364548494983, + "grad_norm": 0.6918148398399353, + "learning_rate": 2.9810901001112346e-05, + "loss": 2.6382, + "step": 3831 + }, + { + "epoch": 51.10702341137124, + "grad_norm": 0.7268350720405579, + "learning_rate": 2.9766407119021132e-05, + "loss": 2.6523, + "step": 3832 + }, + { + "epoch": 51.12040133779264, + "grad_norm": 0.708554744720459, + "learning_rate": 2.972191323692992e-05, + "loss": 2.6575, + "step": 3833 + }, + { + "epoch": 51.13377926421405, + "grad_norm": 0.6651294231414795, + "learning_rate": 2.967741935483871e-05, + "loss": 2.6154, + "step": 3834 + }, + { + "epoch": 51.147157190635454, + "grad_norm": 0.6972305774688721, + "learning_rate": 2.9632925472747496e-05, + "loss": 2.3533, + "step": 3835 + }, + { + "epoch": 51.16053511705686, + "grad_norm": 0.6912644505500793, + "learning_rate": 2.9588431590656285e-05, + "loss": 2.6753, + "step": 3836 + }, + { + "epoch": 51.17391304347826, + "grad_norm": 0.7234140038490295, + "learning_rate": 2.954393770856507e-05, + "loss": 2.7457, + "step": 3837 + }, + { + "epoch": 51.187290969899664, + "grad_norm": 0.7113907337188721, + "learning_rate": 2.949944382647386e-05, + "loss": 2.5897, + "step": 3838 + }, + { + "epoch": 51.20066889632107, + "grad_norm": 0.6739325523376465, + "learning_rate": 2.945494994438265e-05, + "loss": 2.4148, + "step": 3839 + }, + { + "epoch": 51.214046822742475, + "grad_norm": 0.6763201951980591, + "learning_rate": 2.9410456062291435e-05, + "loss": 2.4693, + "step": 3840 + }, + { + "epoch": 51.22742474916388, + "grad_norm": 0.6864936947822571, + "learning_rate": 2.9365962180200224e-05, + "loss": 2.5916, + "step": 3841 + }, + { + "epoch": 51.240802675585286, + "grad_norm": 0.6833862066268921, + "learning_rate": 2.932146829810901e-05, + "loss": 2.5388, + "step": 3842 + }, + { + "epoch": 51.25418060200669, + "grad_norm": 0.6838318705558777, + "learning_rate": 2.92769744160178e-05, + "loss": 2.7633, + "step": 3843 + }, + { + "epoch": 51.2675585284281, + "grad_norm": 0.729987382888794, + "learning_rate": 2.923248053392659e-05, + "loss": 2.7342, + "step": 3844 + }, + { + "epoch": 51.280936454849495, + "grad_norm": 0.7263936400413513, + "learning_rate": 2.9187986651835374e-05, + "loss": 2.668, + "step": 3845 + }, + { + "epoch": 51.2943143812709, + "grad_norm": 0.6780832409858704, + "learning_rate": 2.9143492769744163e-05, + "loss": 2.5854, + "step": 3846 + }, + { + "epoch": 51.30769230769231, + "grad_norm": 0.6763190031051636, + "learning_rate": 2.909899888765295e-05, + "loss": 2.2402, + "step": 3847 + }, + { + "epoch": 51.32107023411371, + "grad_norm": 0.725577712059021, + "learning_rate": 2.905450500556174e-05, + "loss": 2.4074, + "step": 3848 + }, + { + "epoch": 51.33444816053512, + "grad_norm": 0.720762312412262, + "learning_rate": 2.9010011123470528e-05, + "loss": 2.6617, + "step": 3849 + }, + { + "epoch": 51.34782608695652, + "grad_norm": 0.7184842228889465, + "learning_rate": 2.8965517241379313e-05, + "loss": 2.7605, + "step": 3850 + }, + { + "epoch": 51.36120401337793, + "grad_norm": 0.7217162847518921, + "learning_rate": 2.8921023359288102e-05, + "loss": 2.735, + "step": 3851 + }, + { + "epoch": 51.374581939799334, + "grad_norm": 0.7379841208457947, + "learning_rate": 2.8876529477196885e-05, + "loss": 2.4665, + "step": 3852 + }, + { + "epoch": 51.38795986622073, + "grad_norm": 0.7272505760192871, + "learning_rate": 2.883203559510567e-05, + "loss": 2.5609, + "step": 3853 + }, + { + "epoch": 51.40133779264214, + "grad_norm": 0.7013726830482483, + "learning_rate": 2.878754171301446e-05, + "loss": 2.7946, + "step": 3854 + }, + { + "epoch": 51.414715719063544, + "grad_norm": 0.7304871082305908, + "learning_rate": 2.874304783092325e-05, + "loss": 2.3282, + "step": 3855 + }, + { + "epoch": 51.42809364548495, + "grad_norm": 0.6831701993942261, + "learning_rate": 2.8698553948832035e-05, + "loss": 2.6357, + "step": 3856 + }, + { + "epoch": 51.441471571906355, + "grad_norm": 0.7182192206382751, + "learning_rate": 2.8654060066740824e-05, + "loss": 2.319, + "step": 3857 + }, + { + "epoch": 51.45484949832776, + "grad_norm": 0.7064784169197083, + "learning_rate": 2.860956618464961e-05, + "loss": 2.6095, + "step": 3858 + }, + { + "epoch": 51.468227424749166, + "grad_norm": 0.687251627445221, + "learning_rate": 2.85650723025584e-05, + "loss": 2.4059, + "step": 3859 + }, + { + "epoch": 51.48160535117057, + "grad_norm": 0.672291100025177, + "learning_rate": 2.8520578420467188e-05, + "loss": 2.4958, + "step": 3860 + }, + { + "epoch": 51.49498327759197, + "grad_norm": 0.7099835276603699, + "learning_rate": 2.8476084538375974e-05, + "loss": 2.6736, + "step": 3861 + }, + { + "epoch": 51.508361204013376, + "grad_norm": 0.7420845031738281, + "learning_rate": 2.8431590656284763e-05, + "loss": 2.6133, + "step": 3862 + }, + { + "epoch": 51.52173913043478, + "grad_norm": 0.6821379065513611, + "learning_rate": 2.838709677419355e-05, + "loss": 2.4062, + "step": 3863 + }, + { + "epoch": 51.53511705685619, + "grad_norm": 0.7185118794441223, + "learning_rate": 2.8342602892102338e-05, + "loss": 2.7629, + "step": 3864 + }, + { + "epoch": 51.54849498327759, + "grad_norm": 0.7404976487159729, + "learning_rate": 2.8298109010011127e-05, + "loss": 2.6438, + "step": 3865 + }, + { + "epoch": 51.561872909699, + "grad_norm": 0.7032206654548645, + "learning_rate": 2.8253615127919913e-05, + "loss": 2.7265, + "step": 3866 + }, + { + "epoch": 51.575250836120404, + "grad_norm": 0.6758597493171692, + "learning_rate": 2.8209121245828702e-05, + "loss": 2.6224, + "step": 3867 + }, + { + "epoch": 51.58862876254181, + "grad_norm": 0.6467934250831604, + "learning_rate": 2.8164627363737488e-05, + "loss": 2.4554, + "step": 3868 + }, + { + "epoch": 51.60200668896321, + "grad_norm": 0.6494783759117126, + "learning_rate": 2.8120133481646277e-05, + "loss": 2.4571, + "step": 3869 + }, + { + "epoch": 51.61538461538461, + "grad_norm": 0.6202491521835327, + "learning_rate": 2.8075639599555066e-05, + "loss": 2.1063, + "step": 3870 + }, + { + "epoch": 51.62876254180602, + "grad_norm": 0.6956090331077576, + "learning_rate": 2.8031145717463852e-05, + "loss": 2.719, + "step": 3871 + }, + { + "epoch": 51.642140468227424, + "grad_norm": 0.6946972012519836, + "learning_rate": 2.798665183537264e-05, + "loss": 2.5629, + "step": 3872 + }, + { + "epoch": 51.65551839464883, + "grad_norm": 0.6977154612541199, + "learning_rate": 2.7942157953281423e-05, + "loss": 2.6202, + "step": 3873 + }, + { + "epoch": 51.668896321070235, + "grad_norm": 0.7147237062454224, + "learning_rate": 2.789766407119021e-05, + "loss": 2.4834, + "step": 3874 + }, + { + "epoch": 51.68227424749164, + "grad_norm": 0.644244372844696, + "learning_rate": 2.7853170189099e-05, + "loss": 2.4317, + "step": 3875 + }, + { + "epoch": 51.69565217391305, + "grad_norm": 0.6681378483772278, + "learning_rate": 2.7808676307007788e-05, + "loss": 2.459, + "step": 3876 + }, + { + "epoch": 51.709030100334445, + "grad_norm": 0.6902260780334473, + "learning_rate": 2.7764182424916573e-05, + "loss": 2.5687, + "step": 3877 + }, + { + "epoch": 51.72240802675585, + "grad_norm": 0.6847380995750427, + "learning_rate": 2.7719688542825363e-05, + "loss": 2.7226, + "step": 3878 + }, + { + "epoch": 51.735785953177256, + "grad_norm": 0.664297342300415, + "learning_rate": 2.7675194660734148e-05, + "loss": 2.5813, + "step": 3879 + }, + { + "epoch": 51.74916387959866, + "grad_norm": 0.6731799840927124, + "learning_rate": 2.7630700778642937e-05, + "loss": 2.4737, + "step": 3880 + }, + { + "epoch": 51.76254180602007, + "grad_norm": 0.642483115196228, + "learning_rate": 2.7586206896551727e-05, + "loss": 2.4885, + "step": 3881 + }, + { + "epoch": 51.77591973244147, + "grad_norm": 0.7031340599060059, + "learning_rate": 2.7541713014460512e-05, + "loss": 2.574, + "step": 3882 + }, + { + "epoch": 51.78929765886288, + "grad_norm": 0.6814936995506287, + "learning_rate": 2.74972191323693e-05, + "loss": 2.6551, + "step": 3883 + }, + { + "epoch": 51.802675585284284, + "grad_norm": 0.6501394510269165, + "learning_rate": 2.7452725250278087e-05, + "loss": 2.458, + "step": 3884 + }, + { + "epoch": 51.81605351170568, + "grad_norm": 0.6935567855834961, + "learning_rate": 2.7408231368186877e-05, + "loss": 2.621, + "step": 3885 + }, + { + "epoch": 51.82943143812709, + "grad_norm": 0.6672533750534058, + "learning_rate": 2.7363737486095666e-05, + "loss": 2.4009, + "step": 3886 + }, + { + "epoch": 51.84280936454849, + "grad_norm": 0.6834401488304138, + "learning_rate": 2.731924360400445e-05, + "loss": 2.6903, + "step": 3887 + }, + { + "epoch": 51.8561872909699, + "grad_norm": 0.7198247909545898, + "learning_rate": 2.727474972191324e-05, + "loss": 2.7527, + "step": 3888 + }, + { + "epoch": 51.869565217391305, + "grad_norm": 0.6896756887435913, + "learning_rate": 2.7230255839822026e-05, + "loss": 2.6941, + "step": 3889 + }, + { + "epoch": 51.88294314381271, + "grad_norm": 0.6827479600906372, + "learning_rate": 2.7185761957730816e-05, + "loss": 2.6794, + "step": 3890 + }, + { + "epoch": 51.896321070234116, + "grad_norm": 0.7218233942985535, + "learning_rate": 2.7141268075639605e-05, + "loss": 2.5273, + "step": 3891 + }, + { + "epoch": 51.90969899665552, + "grad_norm": 0.6801145076751709, + "learning_rate": 2.709677419354839e-05, + "loss": 2.4922, + "step": 3892 + }, + { + "epoch": 51.92307692307692, + "grad_norm": 0.6611994504928589, + "learning_rate": 2.7052280311457173e-05, + "loss": 2.5052, + "step": 3893 + }, + { + "epoch": 51.936454849498325, + "grad_norm": 0.6727268099784851, + "learning_rate": 2.7007786429365962e-05, + "loss": 2.6148, + "step": 3894 + }, + { + "epoch": 51.94983277591973, + "grad_norm": 0.7804195284843445, + "learning_rate": 2.6963292547274748e-05, + "loss": 2.5588, + "step": 3895 + }, + { + "epoch": 51.96321070234114, + "grad_norm": 0.6668366193771362, + "learning_rate": 2.6918798665183537e-05, + "loss": 2.5478, + "step": 3896 + }, + { + "epoch": 51.97658862876254, + "grad_norm": 0.6974259614944458, + "learning_rate": 2.6874304783092323e-05, + "loss": 2.7024, + "step": 3897 + }, + { + "epoch": 51.98996655518395, + "grad_norm": 0.6899526715278625, + "learning_rate": 2.6829810901001112e-05, + "loss": 2.5015, + "step": 3898 + }, + { + "epoch": 52.0, + "grad_norm": 0.824705183506012, + "learning_rate": 2.67853170189099e-05, + "loss": 2.668, + "step": 3899 + }, + { + "epoch": 52.013377926421406, + "grad_norm": 0.6349574327468872, + "learning_rate": 2.6740823136818687e-05, + "loss": 2.3712, + "step": 3900 + }, + { + "epoch": 52.02675585284281, + "grad_norm": 0.6901255249977112, + "learning_rate": 2.6696329254727476e-05, + "loss": 2.4776, + "step": 3901 + }, + { + "epoch": 52.04013377926422, + "grad_norm": 0.6744613647460938, + "learning_rate": 2.6651835372636262e-05, + "loss": 2.5294, + "step": 3902 + }, + { + "epoch": 52.05351170568562, + "grad_norm": 0.6527228951454163, + "learning_rate": 2.660734149054505e-05, + "loss": 2.5578, + "step": 3903 + }, + { + "epoch": 52.06688963210702, + "grad_norm": 0.6679868698120117, + "learning_rate": 2.656284760845384e-05, + "loss": 2.5264, + "step": 3904 + }, + { + "epoch": 52.080267558528426, + "grad_norm": 0.7045819759368896, + "learning_rate": 2.6518353726362626e-05, + "loss": 2.8701, + "step": 3905 + }, + { + "epoch": 52.09364548494983, + "grad_norm": 0.6314506530761719, + "learning_rate": 2.6473859844271415e-05, + "loss": 2.394, + "step": 3906 + }, + { + "epoch": 52.10702341137124, + "grad_norm": 0.681894838809967, + "learning_rate": 2.64293659621802e-05, + "loss": 2.2975, + "step": 3907 + }, + { + "epoch": 52.12040133779264, + "grad_norm": 0.6718135476112366, + "learning_rate": 2.638487208008899e-05, + "loss": 2.4091, + "step": 3908 + }, + { + "epoch": 52.13377926421405, + "grad_norm": 0.6556568145751953, + "learning_rate": 2.634037819799778e-05, + "loss": 2.652, + "step": 3909 + }, + { + "epoch": 52.147157190635454, + "grad_norm": 0.6845338940620422, + "learning_rate": 2.6295884315906565e-05, + "loss": 2.6104, + "step": 3910 + }, + { + "epoch": 52.16053511705686, + "grad_norm": 0.6780291199684143, + "learning_rate": 2.6251390433815354e-05, + "loss": 2.6291, + "step": 3911 + }, + { + "epoch": 52.17391304347826, + "grad_norm": 0.6501689553260803, + "learning_rate": 2.620689655172414e-05, + "loss": 2.4703, + "step": 3912 + }, + { + "epoch": 52.187290969899664, + "grad_norm": 0.6705746650695801, + "learning_rate": 2.616240266963293e-05, + "loss": 2.5792, + "step": 3913 + }, + { + "epoch": 52.20066889632107, + "grad_norm": 0.7104156017303467, + "learning_rate": 2.611790878754171e-05, + "loss": 2.6099, + "step": 3914 + }, + { + "epoch": 52.214046822742475, + "grad_norm": 0.6317254304885864, + "learning_rate": 2.60734149054505e-05, + "loss": 2.2911, + "step": 3915 + }, + { + "epoch": 52.22742474916388, + "grad_norm": 0.7213663458824158, + "learning_rate": 2.6028921023359286e-05, + "loss": 2.5813, + "step": 3916 + }, + { + "epoch": 52.240802675585286, + "grad_norm": 0.6910906434059143, + "learning_rate": 2.5984427141268076e-05, + "loss": 2.599, + "step": 3917 + }, + { + "epoch": 52.25418060200669, + "grad_norm": 0.7245139479637146, + "learning_rate": 2.593993325917686e-05, + "loss": 2.6023, + "step": 3918 + }, + { + "epoch": 52.2675585284281, + "grad_norm": 0.6673336029052734, + "learning_rate": 2.589543937708565e-05, + "loss": 2.5356, + "step": 3919 + }, + { + "epoch": 52.280936454849495, + "grad_norm": 0.6572391390800476, + "learning_rate": 2.585094549499444e-05, + "loss": 2.492, + "step": 3920 + }, + { + "epoch": 52.2943143812709, + "grad_norm": 0.6730433106422424, + "learning_rate": 2.5806451612903226e-05, + "loss": 2.559, + "step": 3921 + }, + { + "epoch": 52.30769230769231, + "grad_norm": 0.699166476726532, + "learning_rate": 2.5761957730812015e-05, + "loss": 2.6597, + "step": 3922 + }, + { + "epoch": 52.32107023411371, + "grad_norm": 0.6620599031448364, + "learning_rate": 2.57174638487208e-05, + "loss": 2.5504, + "step": 3923 + }, + { + "epoch": 52.33444816053512, + "grad_norm": 0.7208895683288574, + "learning_rate": 2.567296996662959e-05, + "loss": 2.568, + "step": 3924 + }, + { + "epoch": 52.34782608695652, + "grad_norm": 0.7013632655143738, + "learning_rate": 2.562847608453838e-05, + "loss": 2.7051, + "step": 3925 + }, + { + "epoch": 52.36120401337793, + "grad_norm": 0.7046389579772949, + "learning_rate": 2.5583982202447165e-05, + "loss": 2.6715, + "step": 3926 + }, + { + "epoch": 52.374581939799334, + "grad_norm": 0.6850835084915161, + "learning_rate": 2.5539488320355954e-05, + "loss": 2.5796, + "step": 3927 + }, + { + "epoch": 52.38795986622073, + "grad_norm": 0.6935270428657532, + "learning_rate": 2.549499443826474e-05, + "loss": 2.3779, + "step": 3928 + }, + { + "epoch": 52.40133779264214, + "grad_norm": 0.659723699092865, + "learning_rate": 2.545050055617353e-05, + "loss": 2.3896, + "step": 3929 + }, + { + "epoch": 52.414715719063544, + "grad_norm": 0.6537664532661438, + "learning_rate": 2.5406006674082318e-05, + "loss": 2.3389, + "step": 3930 + }, + { + "epoch": 52.42809364548495, + "grad_norm": 0.6706110239028931, + "learning_rate": 2.5361512791991104e-05, + "loss": 2.5741, + "step": 3931 + }, + { + "epoch": 52.441471571906355, + "grad_norm": 0.6661157011985779, + "learning_rate": 2.5317018909899893e-05, + "loss": 2.4866, + "step": 3932 + }, + { + "epoch": 52.45484949832776, + "grad_norm": 0.6940733194351196, + "learning_rate": 2.527252502780868e-05, + "loss": 2.576, + "step": 3933 + }, + { + "epoch": 52.468227424749166, + "grad_norm": 0.6871435642242432, + "learning_rate": 2.522803114571746e-05, + "loss": 2.5895, + "step": 3934 + }, + { + "epoch": 52.48160535117057, + "grad_norm": 0.6974420547485352, + "learning_rate": 2.518353726362625e-05, + "loss": 2.6832, + "step": 3935 + }, + { + "epoch": 52.49498327759197, + "grad_norm": 0.6962250471115112, + "learning_rate": 2.513904338153504e-05, + "loss": 2.3006, + "step": 3936 + }, + { + "epoch": 52.508361204013376, + "grad_norm": 0.6863418817520142, + "learning_rate": 2.5094549499443825e-05, + "loss": 2.6124, + "step": 3937 + }, + { + "epoch": 52.52173913043478, + "grad_norm": 0.6994590163230896, + "learning_rate": 2.5050055617352614e-05, + "loss": 2.5151, + "step": 3938 + }, + { + "epoch": 52.53511705685619, + "grad_norm": 0.6534810662269592, + "learning_rate": 2.50055617352614e-05, + "loss": 2.6399, + "step": 3939 + }, + { + "epoch": 52.54849498327759, + "grad_norm": 0.6296973824501038, + "learning_rate": 2.496106785317019e-05, + "loss": 2.3892, + "step": 3940 + }, + { + "epoch": 52.561872909699, + "grad_norm": 0.6417976021766663, + "learning_rate": 2.491657397107898e-05, + "loss": 2.3044, + "step": 3941 + }, + { + "epoch": 52.575250836120404, + "grad_norm": 0.6997331976890564, + "learning_rate": 2.4872080088987764e-05, + "loss": 2.5604, + "step": 3942 + }, + { + "epoch": 52.58862876254181, + "grad_norm": 0.6694924235343933, + "learning_rate": 2.4827586206896553e-05, + "loss": 2.4752, + "step": 3943 + }, + { + "epoch": 52.60200668896321, + "grad_norm": 0.6598913669586182, + "learning_rate": 2.478309232480534e-05, + "loss": 2.5687, + "step": 3944 + }, + { + "epoch": 52.61538461538461, + "grad_norm": 0.7147283554077148, + "learning_rate": 2.4738598442714128e-05, + "loss": 2.5372, + "step": 3945 + }, + { + "epoch": 52.62876254180602, + "grad_norm": 0.6867512464523315, + "learning_rate": 2.4694104560622917e-05, + "loss": 2.6, + "step": 3946 + }, + { + "epoch": 52.642140468227424, + "grad_norm": 0.6952562928199768, + "learning_rate": 2.4649610678531703e-05, + "loss": 2.3294, + "step": 3947 + }, + { + "epoch": 52.65551839464883, + "grad_norm": 0.6790039539337158, + "learning_rate": 2.4605116796440492e-05, + "loss": 2.2903, + "step": 3948 + }, + { + "epoch": 52.668896321070235, + "grad_norm": 0.6904420852661133, + "learning_rate": 2.4560622914349278e-05, + "loss": 2.5636, + "step": 3949 + }, + { + "epoch": 52.68227424749164, + "grad_norm": 0.6746246218681335, + "learning_rate": 2.4516129032258064e-05, + "loss": 2.5318, + "step": 3950 + }, + { + "epoch": 52.69565217391305, + "grad_norm": 0.7066764235496521, + "learning_rate": 2.4471635150166853e-05, + "loss": 2.4879, + "step": 3951 + }, + { + "epoch": 52.709030100334445, + "grad_norm": 0.667173445224762, + "learning_rate": 2.442714126807564e-05, + "loss": 2.6719, + "step": 3952 + }, + { + "epoch": 52.72240802675585, + "grad_norm": 0.651569128036499, + "learning_rate": 2.4382647385984428e-05, + "loss": 2.5145, + "step": 3953 + }, + { + "epoch": 52.735785953177256, + "grad_norm": 0.6653866767883301, + "learning_rate": 2.4338153503893217e-05, + "loss": 2.647, + "step": 3954 + }, + { + "epoch": 52.74916387959866, + "grad_norm": 0.6874131560325623, + "learning_rate": 2.4293659621802003e-05, + "loss": 2.6872, + "step": 3955 + }, + { + "epoch": 52.76254180602007, + "grad_norm": 0.7200673222541809, + "learning_rate": 2.4249165739710792e-05, + "loss": 2.538, + "step": 3956 + }, + { + "epoch": 52.77591973244147, + "grad_norm": 0.6785191893577576, + "learning_rate": 2.4204671857619578e-05, + "loss": 2.7182, + "step": 3957 + }, + { + "epoch": 52.78929765886288, + "grad_norm": 0.6519473195075989, + "learning_rate": 2.4160177975528367e-05, + "loss": 2.3929, + "step": 3958 + }, + { + "epoch": 52.802675585284284, + "grad_norm": 0.7064288258552551, + "learning_rate": 2.4115684093437156e-05, + "loss": 2.6197, + "step": 3959 + }, + { + "epoch": 52.81605351170568, + "grad_norm": 0.6361091732978821, + "learning_rate": 2.407119021134594e-05, + "loss": 2.5221, + "step": 3960 + }, + { + "epoch": 52.82943143812709, + "grad_norm": 0.7092622518539429, + "learning_rate": 2.4026696329254728e-05, + "loss": 2.6919, + "step": 3961 + }, + { + "epoch": 52.84280936454849, + "grad_norm": 0.6915587782859802, + "learning_rate": 2.3982202447163517e-05, + "loss": 2.627, + "step": 3962 + }, + { + "epoch": 52.8561872909699, + "grad_norm": 0.6888433694839478, + "learning_rate": 2.3937708565072303e-05, + "loss": 2.7742, + "step": 3963 + }, + { + "epoch": 52.869565217391305, + "grad_norm": 0.6663147211074829, + "learning_rate": 2.3893214682981092e-05, + "loss": 2.2218, + "step": 3964 + }, + { + "epoch": 52.88294314381271, + "grad_norm": 0.6786331534385681, + "learning_rate": 2.3848720800889878e-05, + "loss": 2.6369, + "step": 3965 + }, + { + "epoch": 52.896321070234116, + "grad_norm": 0.6943885087966919, + "learning_rate": 2.3804226918798667e-05, + "loss": 2.464, + "step": 3966 + }, + { + "epoch": 52.90969899665552, + "grad_norm": 0.7094883322715759, + "learning_rate": 2.3759733036707456e-05, + "loss": 2.801, + "step": 3967 + }, + { + "epoch": 52.92307692307692, + "grad_norm": 0.6632527709007263, + "learning_rate": 2.3715239154616242e-05, + "loss": 2.5922, + "step": 3968 + }, + { + "epoch": 52.936454849498325, + "grad_norm": 0.7414517998695374, + "learning_rate": 2.367074527252503e-05, + "loss": 2.9419, + "step": 3969 + }, + { + "epoch": 52.94983277591973, + "grad_norm": 0.6803048849105835, + "learning_rate": 2.3626251390433817e-05, + "loss": 2.5849, + "step": 3970 + }, + { + "epoch": 52.96321070234114, + "grad_norm": 0.6940471529960632, + "learning_rate": 2.3581757508342603e-05, + "loss": 2.7496, + "step": 3971 + }, + { + "epoch": 52.97658862876254, + "grad_norm": 0.6914702653884888, + "learning_rate": 2.353726362625139e-05, + "loss": 2.5185, + "step": 3972 + }, + { + "epoch": 52.98996655518395, + "grad_norm": 0.6751214265823364, + "learning_rate": 2.3492769744160177e-05, + "loss": 2.4305, + "step": 3973 + }, + { + "epoch": 53.0, + "grad_norm": 0.831278920173645, + "learning_rate": 2.3448275862068967e-05, + "loss": 2.3934, + "step": 3974 + }, + { + "epoch": 53.013377926421406, + "grad_norm": 0.662552535533905, + "learning_rate": 2.3403781979977756e-05, + "loss": 2.388, + "step": 3975 + }, + { + "epoch": 53.02675585284281, + "grad_norm": 0.7117273211479187, + "learning_rate": 2.335928809788654e-05, + "loss": 2.6864, + "step": 3976 + }, + { + "epoch": 53.04013377926422, + "grad_norm": 0.623512327671051, + "learning_rate": 2.331479421579533e-05, + "loss": 2.3419, + "step": 3977 + }, + { + "epoch": 53.05351170568562, + "grad_norm": 0.6845181584358215, + "learning_rate": 2.3270300333704117e-05, + "loss": 2.5291, + "step": 3978 + }, + { + "epoch": 53.06688963210702, + "grad_norm": 0.6794723272323608, + "learning_rate": 2.3225806451612906e-05, + "loss": 2.6106, + "step": 3979 + }, + { + "epoch": 53.080267558528426, + "grad_norm": 0.6423758864402771, + "learning_rate": 2.3181312569521695e-05, + "loss": 2.5044, + "step": 3980 + }, + { + "epoch": 53.09364548494983, + "grad_norm": 0.6895222663879395, + "learning_rate": 2.3136818687430477e-05, + "loss": 2.5586, + "step": 3981 + }, + { + "epoch": 53.10702341137124, + "grad_norm": 0.711919903755188, + "learning_rate": 2.3092324805339266e-05, + "loss": 2.6076, + "step": 3982 + }, + { + "epoch": 53.12040133779264, + "grad_norm": 0.6487022042274475, + "learning_rate": 2.3047830923248052e-05, + "loss": 2.4783, + "step": 3983 + }, + { + "epoch": 53.13377926421405, + "grad_norm": 0.664364218711853, + "learning_rate": 2.300333704115684e-05, + "loss": 2.4453, + "step": 3984 + }, + { + "epoch": 53.147157190635454, + "grad_norm": 0.672069251537323, + "learning_rate": 2.295884315906563e-05, + "loss": 2.5366, + "step": 3985 + }, + { + "epoch": 53.16053511705686, + "grad_norm": 0.724062979221344, + "learning_rate": 2.2914349276974416e-05, + "loss": 2.6009, + "step": 3986 + }, + { + "epoch": 53.17391304347826, + "grad_norm": 0.660426914691925, + "learning_rate": 2.2869855394883205e-05, + "loss": 2.4559, + "step": 3987 + }, + { + "epoch": 53.187290969899664, + "grad_norm": 0.6827124357223511, + "learning_rate": 2.282536151279199e-05, + "loss": 2.2968, + "step": 3988 + }, + { + "epoch": 53.20066889632107, + "grad_norm": 0.6487954258918762, + "learning_rate": 2.278086763070078e-05, + "loss": 2.4694, + "step": 3989 + }, + { + "epoch": 53.214046822742475, + "grad_norm": 0.7343469262123108, + "learning_rate": 2.273637374860957e-05, + "loss": 2.584, + "step": 3990 + }, + { + "epoch": 53.22742474916388, + "grad_norm": 0.6878166198730469, + "learning_rate": 2.2691879866518352e-05, + "loss": 2.6855, + "step": 3991 + }, + { + "epoch": 53.240802675585286, + "grad_norm": 0.69071364402771, + "learning_rate": 2.264738598442714e-05, + "loss": 2.579, + "step": 3992 + }, + { + "epoch": 53.25418060200669, + "grad_norm": 0.6821252107620239, + "learning_rate": 2.260289210233593e-05, + "loss": 2.6543, + "step": 3993 + }, + { + "epoch": 53.2675585284281, + "grad_norm": 0.6615633964538574, + "learning_rate": 2.2558398220244716e-05, + "loss": 2.3427, + "step": 3994 + }, + { + "epoch": 53.280936454849495, + "grad_norm": 0.641324520111084, + "learning_rate": 2.2513904338153505e-05, + "loss": 2.479, + "step": 3995 + }, + { + "epoch": 53.2943143812709, + "grad_norm": 0.7013201117515564, + "learning_rate": 2.246941045606229e-05, + "loss": 2.6016, + "step": 3996 + }, + { + "epoch": 53.30769230769231, + "grad_norm": 0.7007108330726624, + "learning_rate": 2.242491657397108e-05, + "loss": 2.6184, + "step": 3997 + }, + { + "epoch": 53.32107023411371, + "grad_norm": 0.6686177849769592, + "learning_rate": 2.238042269187987e-05, + "loss": 2.6286, + "step": 3998 + }, + { + "epoch": 53.33444816053512, + "grad_norm": 0.6843096017837524, + "learning_rate": 2.2335928809788655e-05, + "loss": 2.4684, + "step": 3999 + }, + { + "epoch": 53.34782608695652, + "grad_norm": 0.66323322057724, + "learning_rate": 2.2291434927697444e-05, + "loss": 2.4277, + "step": 4000 + } + ], + "logging_steps": 1, + "max_steps": 4500, + "num_input_tokens_seen": 0, + "num_train_epochs": 60, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.2380878140440832e+17, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}