{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.06610403063494089, "eval_steps": 500, "global_step": 8500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 7.77694478058128e-06, "grad_norm": 1.3781788873470509, "learning_rate": 5e-06, "loss": 3.1841, "step": 1 }, { "epoch": 1.555388956116256e-05, "grad_norm": 1.397067262498387, "learning_rate": 1e-05, "loss": 3.2165, "step": 2 }, { "epoch": 2.333083434174384e-05, "grad_norm": 1.0727434571191712, "learning_rate": 1.5e-05, "loss": 3.1771, "step": 3 }, { "epoch": 3.110777912232512e-05, "grad_norm": 1.0502195907542482, "learning_rate": 2e-05, "loss": 3.1029, "step": 4 }, { "epoch": 3.88847239029064e-05, "grad_norm": 1.4933289424055272, "learning_rate": 2.5e-05, "loss": 3.139, "step": 5 }, { "epoch": 4.666166868348768e-05, "grad_norm": 1.2576900793459878, "learning_rate": 3e-05, "loss": 2.9803, "step": 6 }, { "epoch": 5.443861346406896e-05, "grad_norm": 1.0464129635558295, "learning_rate": 3.5e-05, "loss": 2.9986, "step": 7 }, { "epoch": 6.221555824465024e-05, "grad_norm": 0.7992188571540303, "learning_rate": 4e-05, "loss": 2.9898, "step": 8 }, { "epoch": 6.999250302523153e-05, "grad_norm": 0.723400607214803, "learning_rate": 4.5e-05, "loss": 3.0757, "step": 9 }, { "epoch": 7.77694478058128e-05, "grad_norm": 0.6388962301147968, "learning_rate": 5e-05, "loss": 2.9756, "step": 10 }, { "epoch": 8.554639258639408e-05, "grad_norm": 0.5245280006999345, "learning_rate": 5.500000000000001e-05, "loss": 3.0175, "step": 11 }, { "epoch": 9.332333736697535e-05, "grad_norm": 0.541541048548596, "learning_rate": 6e-05, "loss": 3.0286, "step": 12 }, { "epoch": 0.00010110028214755664, "grad_norm": 0.7562081733424671, "learning_rate": 6.500000000000001e-05, "loss": 3.0681, "step": 13 }, { "epoch": 0.00010887722692813792, "grad_norm": 0.3865543992301327, "learning_rate": 7e-05, "loss": 3.0185, "step": 14 }, { "epoch": 0.0001166541717087192, "grad_norm": 0.4257216924338073, "learning_rate": 7.500000000000001e-05, "loss": 2.9643, "step": 15 }, { "epoch": 0.00012443111648930048, "grad_norm": 0.43630726227790967, "learning_rate": 8e-05, "loss": 3.0795, "step": 16 }, { "epoch": 0.00013220806126988175, "grad_norm": 0.3305450423749661, "learning_rate": 8.5e-05, "loss": 2.9075, "step": 17 }, { "epoch": 0.00013998500605046305, "grad_norm": 0.37385772299934034, "learning_rate": 9e-05, "loss": 2.9717, "step": 18 }, { "epoch": 0.00014776195083104432, "grad_norm": 0.31184501230670364, "learning_rate": 9.5e-05, "loss": 2.8433, "step": 19 }, { "epoch": 0.0001555388956116256, "grad_norm": 0.2889611032846881, "learning_rate": 0.0001, "loss": 2.8891, "step": 20 }, { "epoch": 0.0001633158403922069, "grad_norm": 0.9194314408567125, "learning_rate": 9.999999998507226e-05, "loss": 2.9003, "step": 21 }, { "epoch": 0.00017109278517278816, "grad_norm": 0.29227691345089646, "learning_rate": 9.9999999940289e-05, "loss": 2.8865, "step": 22 }, { "epoch": 0.00017886972995336944, "grad_norm": 0.3084995672876271, "learning_rate": 9.999999986565029e-05, "loss": 2.9348, "step": 23 }, { "epoch": 0.0001866466747339507, "grad_norm": 0.32299762745147437, "learning_rate": 9.999999976115607e-05, "loss": 2.8554, "step": 24 }, { "epoch": 0.000194423619514532, "grad_norm": 0.37595294044154787, "learning_rate": 9.999999962680635e-05, "loss": 2.9366, "step": 25 }, { "epoch": 0.00020220056429511328, "grad_norm": 0.3504604345750453, "learning_rate": 9.999999946260114e-05, "loss": 2.8742, "step": 26 }, { "epoch": 0.00020997750907569455, "grad_norm": 0.2737091383695973, "learning_rate": 9.999999926854045e-05, "loss": 2.8718, "step": 27 }, { "epoch": 0.00021775445385627585, "grad_norm": 0.19701570792013343, "learning_rate": 9.999999904462425e-05, "loss": 2.8748, "step": 28 }, { "epoch": 0.00022553139863685712, "grad_norm": 0.22127552495691005, "learning_rate": 9.999999879085256e-05, "loss": 2.8534, "step": 29 }, { "epoch": 0.0002333083434174384, "grad_norm": 0.4010806409784276, "learning_rate": 9.999999850722539e-05, "loss": 2.9192, "step": 30 }, { "epoch": 0.0002410852881980197, "grad_norm": 0.29132829823106265, "learning_rate": 9.99999981937427e-05, "loss": 2.9698, "step": 31 }, { "epoch": 0.00024886223297860096, "grad_norm": 0.21467560867127056, "learning_rate": 9.999999785040455e-05, "loss": 2.8389, "step": 32 }, { "epoch": 0.00025663917775918223, "grad_norm": 0.3548078987465395, "learning_rate": 9.99999974772109e-05, "loss": 2.8348, "step": 33 }, { "epoch": 0.0002644161225397635, "grad_norm": 0.26895240393567443, "learning_rate": 9.999999707416175e-05, "loss": 2.8585, "step": 34 }, { "epoch": 0.0002721930673203448, "grad_norm": 0.22199572618251467, "learning_rate": 9.999999664125711e-05, "loss": 2.8558, "step": 35 }, { "epoch": 0.0002799700121009261, "grad_norm": 0.42865007690900536, "learning_rate": 9.999999617849701e-05, "loss": 2.8995, "step": 36 }, { "epoch": 0.00028774695688150737, "grad_norm": 0.32016609999304896, "learning_rate": 9.999999568588139e-05, "loss": 2.7884, "step": 37 }, { "epoch": 0.00029552390166208864, "grad_norm": 0.2358200554344537, "learning_rate": 9.999999516341027e-05, "loss": 2.7605, "step": 38 }, { "epoch": 0.0003033008464426699, "grad_norm": 0.2564725816993172, "learning_rate": 9.99999946110837e-05, "loss": 2.7751, "step": 39 }, { "epoch": 0.0003110777912232512, "grad_norm": 0.17848695714018736, "learning_rate": 9.99999940289016e-05, "loss": 2.7802, "step": 40 }, { "epoch": 0.00031885473600383246, "grad_norm": 0.32111280827026734, "learning_rate": 9.999999341686403e-05, "loss": 2.8557, "step": 41 }, { "epoch": 0.0003266316807844138, "grad_norm": 0.17542866776519941, "learning_rate": 9.999999277497096e-05, "loss": 2.7247, "step": 42 }, { "epoch": 0.00033440862556499506, "grad_norm": 0.1927552784864359, "learning_rate": 9.999999210322241e-05, "loss": 2.7175, "step": 43 }, { "epoch": 0.00034218557034557633, "grad_norm": 0.18926674288243422, "learning_rate": 9.999999140161838e-05, "loss": 2.8188, "step": 44 }, { "epoch": 0.0003499625151261576, "grad_norm": 0.2020233380219591, "learning_rate": 9.999999067015887e-05, "loss": 2.7317, "step": 45 }, { "epoch": 0.00035773945990673887, "grad_norm": 0.153588174543195, "learning_rate": 9.999998990884384e-05, "loss": 2.7461, "step": 46 }, { "epoch": 0.00036551640468732014, "grad_norm": 0.1623402408353152, "learning_rate": 9.999998911767335e-05, "loss": 2.6825, "step": 47 }, { "epoch": 0.0003732933494679014, "grad_norm": 0.2085737879072871, "learning_rate": 9.999998829664736e-05, "loss": 2.859, "step": 48 }, { "epoch": 0.00038107029424848274, "grad_norm": 0.1920697574511579, "learning_rate": 9.999998744576589e-05, "loss": 2.7489, "step": 49 }, { "epoch": 0.000388847239029064, "grad_norm": 0.18382534231585423, "learning_rate": 9.999998656502893e-05, "loss": 2.7408, "step": 50 }, { "epoch": 0.0003966241838096453, "grad_norm": 0.16489421171889615, "learning_rate": 9.999998565443649e-05, "loss": 2.8208, "step": 51 }, { "epoch": 0.00040440112859022655, "grad_norm": 0.20588080246353976, "learning_rate": 9.999998471398858e-05, "loss": 2.6954, "step": 52 }, { "epoch": 0.0004121780733708078, "grad_norm": 0.3294131546810207, "learning_rate": 9.999998374368516e-05, "loss": 2.6844, "step": 53 }, { "epoch": 0.0004199550181513891, "grad_norm": 0.15451221429154138, "learning_rate": 9.999998274352626e-05, "loss": 2.7275, "step": 54 }, { "epoch": 0.0004277319629319704, "grad_norm": 0.19923776999178322, "learning_rate": 9.999998171351189e-05, "loss": 2.7608, "step": 55 }, { "epoch": 0.0004355089077125517, "grad_norm": 0.2312681856870332, "learning_rate": 9.999998065364205e-05, "loss": 2.7326, "step": 56 }, { "epoch": 0.00044328585249313297, "grad_norm": 0.2914138722475602, "learning_rate": 9.99999795639167e-05, "loss": 2.7033, "step": 57 }, { "epoch": 0.00045106279727371424, "grad_norm": 0.5189051132000227, "learning_rate": 9.999997844433589e-05, "loss": 2.6978, "step": 58 }, { "epoch": 0.0004588397420542955, "grad_norm": 0.3791267919585133, "learning_rate": 9.999997729489958e-05, "loss": 2.681, "step": 59 }, { "epoch": 0.0004666166868348768, "grad_norm": 0.30655077417616483, "learning_rate": 9.99999761156078e-05, "loss": 2.6337, "step": 60 }, { "epoch": 0.00047439363161545805, "grad_norm": 0.20761457409096823, "learning_rate": 9.999997490646056e-05, "loss": 2.743, "step": 61 }, { "epoch": 0.0004821705763960394, "grad_norm": 0.28816742220935765, "learning_rate": 9.999997366745783e-05, "loss": 2.7254, "step": 62 }, { "epoch": 0.0004899475211766206, "grad_norm": 0.40359947585317263, "learning_rate": 9.999997239859962e-05, "loss": 2.7654, "step": 63 }, { "epoch": 0.0004977244659572019, "grad_norm": 0.3560195421935845, "learning_rate": 9.999997109988593e-05, "loss": 2.7281, "step": 64 }, { "epoch": 0.0005055014107377832, "grad_norm": 0.19289787801604794, "learning_rate": 9.999996977131679e-05, "loss": 2.7096, "step": 65 }, { "epoch": 0.0005132783555183645, "grad_norm": 0.27649417454093306, "learning_rate": 9.999996841289214e-05, "loss": 2.6067, "step": 66 }, { "epoch": 0.0005210553002989457, "grad_norm": 0.2601106437339233, "learning_rate": 9.999996702461204e-05, "loss": 2.6498, "step": 67 }, { "epoch": 0.000528832245079527, "grad_norm": 0.17256980559017462, "learning_rate": 9.999996560647647e-05, "loss": 2.6453, "step": 68 }, { "epoch": 0.0005366091898601083, "grad_norm": 0.23255855878056572, "learning_rate": 9.999996415848541e-05, "loss": 2.5923, "step": 69 }, { "epoch": 0.0005443861346406895, "grad_norm": 0.22317256215611597, "learning_rate": 9.999996268063888e-05, "loss": 2.6408, "step": 70 }, { "epoch": 0.0005521630794212709, "grad_norm": 0.1824711264242648, "learning_rate": 9.999996117293689e-05, "loss": 2.6456, "step": 71 }, { "epoch": 0.0005599400242018522, "grad_norm": 0.15361379895815847, "learning_rate": 9.999995963537942e-05, "loss": 2.6317, "step": 72 }, { "epoch": 0.0005677169689824335, "grad_norm": 0.18666474400121172, "learning_rate": 9.999995806796649e-05, "loss": 2.6692, "step": 73 }, { "epoch": 0.0005754939137630147, "grad_norm": 0.1781997210265697, "learning_rate": 9.999995647069808e-05, "loss": 2.7251, "step": 74 }, { "epoch": 0.000583270858543596, "grad_norm": 0.20103305528925536, "learning_rate": 9.999995484357423e-05, "loss": 2.7155, "step": 75 }, { "epoch": 0.0005910478033241773, "grad_norm": 0.21945058440497245, "learning_rate": 9.99999531865949e-05, "loss": 2.7095, "step": 76 }, { "epoch": 0.0005988247481047586, "grad_norm": 0.23777047765012665, "learning_rate": 9.999995149976009e-05, "loss": 2.6364, "step": 77 }, { "epoch": 0.0006066016928853398, "grad_norm": 0.20138607993474417, "learning_rate": 9.999994978306984e-05, "loss": 2.6454, "step": 78 }, { "epoch": 0.0006143786376659211, "grad_norm": 0.15457938869731433, "learning_rate": 9.999994803652411e-05, "loss": 2.5185, "step": 79 }, { "epoch": 0.0006221555824465024, "grad_norm": 0.18841190104067893, "learning_rate": 9.999994626012293e-05, "loss": 2.668, "step": 80 }, { "epoch": 0.0006299325272270836, "grad_norm": 0.22653035127301224, "learning_rate": 9.999994445386627e-05, "loss": 2.5988, "step": 81 }, { "epoch": 0.0006377094720076649, "grad_norm": 0.21322880890622437, "learning_rate": 9.999994261775419e-05, "loss": 2.5974, "step": 82 }, { "epoch": 0.0006454864167882462, "grad_norm": 0.18068809255214, "learning_rate": 9.999994075178661e-05, "loss": 2.6646, "step": 83 }, { "epoch": 0.0006532633615688276, "grad_norm": 0.18427155311893045, "learning_rate": 9.99999388559636e-05, "loss": 2.707, "step": 84 }, { "epoch": 0.0006610403063494088, "grad_norm": 0.19636624046094941, "learning_rate": 9.999993693028511e-05, "loss": 2.7092, "step": 85 }, { "epoch": 0.0006688172511299901, "grad_norm": 0.19002954551390233, "learning_rate": 9.999993497475119e-05, "loss": 2.5937, "step": 86 }, { "epoch": 0.0006765941959105714, "grad_norm": 0.24427284742197042, "learning_rate": 9.99999329893618e-05, "loss": 2.6417, "step": 87 }, { "epoch": 0.0006843711406911527, "grad_norm": 0.31691948746649795, "learning_rate": 9.999993097411696e-05, "loss": 2.637, "step": 88 }, { "epoch": 0.0006921480854717339, "grad_norm": 0.353935176712756, "learning_rate": 9.999992892901668e-05, "loss": 2.5851, "step": 89 }, { "epoch": 0.0006999250302523152, "grad_norm": 0.3421858735559647, "learning_rate": 9.999992685406094e-05, "loss": 2.6847, "step": 90 }, { "epoch": 0.0007077019750328965, "grad_norm": 0.3003947850652209, "learning_rate": 9.999992474924975e-05, "loss": 2.571, "step": 91 }, { "epoch": 0.0007154789198134777, "grad_norm": 0.27546005371039617, "learning_rate": 9.999992261458311e-05, "loss": 2.6708, "step": 92 }, { "epoch": 0.000723255864594059, "grad_norm": 0.3004523661314765, "learning_rate": 9.999992045006104e-05, "loss": 2.5324, "step": 93 }, { "epoch": 0.0007310328093746403, "grad_norm": 0.35346336003414963, "learning_rate": 9.99999182556835e-05, "loss": 2.6999, "step": 94 }, { "epoch": 0.0007388097541552216, "grad_norm": 0.319746526822393, "learning_rate": 9.999991603145054e-05, "loss": 2.5559, "step": 95 }, { "epoch": 0.0007465866989358028, "grad_norm": 0.2660813753235509, "learning_rate": 9.999991377736212e-05, "loss": 2.5477, "step": 96 }, { "epoch": 0.0007543636437163842, "grad_norm": 0.28711560929318275, "learning_rate": 9.999991149341826e-05, "loss": 2.5845, "step": 97 }, { "epoch": 0.0007621405884969655, "grad_norm": 0.27656190840653305, "learning_rate": 9.999990917961896e-05, "loss": 2.547, "step": 98 }, { "epoch": 0.0007699175332775467, "grad_norm": 0.2652238795046946, "learning_rate": 9.999990683596423e-05, "loss": 2.5931, "step": 99 }, { "epoch": 0.000777694478058128, "grad_norm": 0.2406883700508297, "learning_rate": 9.999990446245406e-05, "loss": 2.558, "step": 100 }, { "epoch": 0.0007854714228387093, "grad_norm": 0.2526591431378865, "learning_rate": 9.999990205908847e-05, "loss": 2.6091, "step": 101 }, { "epoch": 0.0007932483676192906, "grad_norm": 0.25130938812883563, "learning_rate": 9.999989962586742e-05, "loss": 2.6827, "step": 102 }, { "epoch": 0.0008010253123998718, "grad_norm": 0.25419028992340087, "learning_rate": 9.999989716279095e-05, "loss": 2.6098, "step": 103 }, { "epoch": 0.0008088022571804531, "grad_norm": 0.2019058548254475, "learning_rate": 9.999989466985903e-05, "loss": 2.5804, "step": 104 }, { "epoch": 0.0008165792019610344, "grad_norm": 0.2687703414638524, "learning_rate": 9.99998921470717e-05, "loss": 2.5817, "step": 105 }, { "epoch": 0.0008243561467416157, "grad_norm": 0.2665177492360026, "learning_rate": 9.999988959442895e-05, "loss": 2.5475, "step": 106 }, { "epoch": 0.0008321330915221969, "grad_norm": 0.20084722234140648, "learning_rate": 9.999988701193077e-05, "loss": 2.5833, "step": 107 }, { "epoch": 0.0008399100363027782, "grad_norm": 0.47131395603146314, "learning_rate": 9.999988439957715e-05, "loss": 2.5327, "step": 108 }, { "epoch": 0.0008476869810833595, "grad_norm": 0.19175681689198412, "learning_rate": 9.999988175736812e-05, "loss": 2.6153, "step": 109 }, { "epoch": 0.0008554639258639408, "grad_norm": 0.22437865277346222, "learning_rate": 9.999987908530366e-05, "loss": 2.4957, "step": 110 }, { "epoch": 0.0008632408706445221, "grad_norm": 0.20687703446268516, "learning_rate": 9.999987638338377e-05, "loss": 2.6106, "step": 111 }, { "epoch": 0.0008710178154251034, "grad_norm": 0.18680947538348416, "learning_rate": 9.999987365160848e-05, "loss": 2.5883, "step": 112 }, { "epoch": 0.0008787947602056847, "grad_norm": 0.2283803285495701, "learning_rate": 9.999987088997776e-05, "loss": 2.6869, "step": 113 }, { "epoch": 0.0008865717049862659, "grad_norm": 0.2447688788978082, "learning_rate": 9.999986809849163e-05, "loss": 2.5756, "step": 114 }, { "epoch": 0.0008943486497668472, "grad_norm": 0.31046069460226794, "learning_rate": 9.999986527715009e-05, "loss": 2.5272, "step": 115 }, { "epoch": 0.0009021255945474285, "grad_norm": 0.35013502906671273, "learning_rate": 9.999986242595312e-05, "loss": 2.577, "step": 116 }, { "epoch": 0.0009099025393280097, "grad_norm": 0.37411171521557085, "learning_rate": 9.999985954490077e-05, "loss": 2.5193, "step": 117 }, { "epoch": 0.000917679484108591, "grad_norm": 0.2967779187562417, "learning_rate": 9.999985663399298e-05, "loss": 2.5546, "step": 118 }, { "epoch": 0.0009254564288891723, "grad_norm": 0.18986664184991875, "learning_rate": 9.999985369322981e-05, "loss": 2.6008, "step": 119 }, { "epoch": 0.0009332333736697536, "grad_norm": 0.24884136563758832, "learning_rate": 9.999985072261121e-05, "loss": 2.5514, "step": 120 }, { "epoch": 0.0009410103184503348, "grad_norm": 0.3795542078488493, "learning_rate": 9.999984772213721e-05, "loss": 2.5013, "step": 121 }, { "epoch": 0.0009487872632309161, "grad_norm": 0.4048015593640644, "learning_rate": 9.999984469180784e-05, "loss": 2.572, "step": 122 }, { "epoch": 0.0009565642080114975, "grad_norm": 0.3568164109698804, "learning_rate": 9.999984163162304e-05, "loss": 2.5522, "step": 123 }, { "epoch": 0.0009643411527920788, "grad_norm": 0.1975908136716411, "learning_rate": 9.999983854158285e-05, "loss": 2.5762, "step": 124 }, { "epoch": 0.00097211809757266, "grad_norm": 0.18031440004668006, "learning_rate": 9.999983542168726e-05, "loss": 2.5784, "step": 125 }, { "epoch": 0.0009798950423532413, "grad_norm": 0.2885544398331827, "learning_rate": 9.999983227193628e-05, "loss": 2.5582, "step": 126 }, { "epoch": 0.0009876719871338225, "grad_norm": 0.2663955338466254, "learning_rate": 9.99998290923299e-05, "loss": 2.5649, "step": 127 }, { "epoch": 0.0009954489319144038, "grad_norm": 0.176602116030421, "learning_rate": 9.999982588286813e-05, "loss": 2.5786, "step": 128 }, { "epoch": 0.0010032258766949852, "grad_norm": 0.20603976999045476, "learning_rate": 9.999982264355099e-05, "loss": 2.568, "step": 129 }, { "epoch": 0.0010110028214755664, "grad_norm": 0.21190817966619743, "learning_rate": 9.999981937437845e-05, "loss": 2.49, "step": 130 }, { "epoch": 0.0010187797662561478, "grad_norm": 0.22964900444463537, "learning_rate": 9.999981607535051e-05, "loss": 2.6575, "step": 131 }, { "epoch": 0.001026556711036729, "grad_norm": 0.24488340120853333, "learning_rate": 9.999981274646722e-05, "loss": 2.5902, "step": 132 }, { "epoch": 0.0010343336558173103, "grad_norm": 0.2544326723006938, "learning_rate": 9.999980938772853e-05, "loss": 2.4766, "step": 133 }, { "epoch": 0.0010421106005978915, "grad_norm": 0.22469982747487616, "learning_rate": 9.999980599913445e-05, "loss": 2.5536, "step": 134 }, { "epoch": 0.0010498875453784729, "grad_norm": 0.1949843748529875, "learning_rate": 9.9999802580685e-05, "loss": 2.4859, "step": 135 }, { "epoch": 0.001057664490159054, "grad_norm": 0.22253896029357823, "learning_rate": 9.99997991323802e-05, "loss": 2.5762, "step": 136 }, { "epoch": 0.0010654414349396354, "grad_norm": 0.24278060133068685, "learning_rate": 9.999979565422e-05, "loss": 2.6286, "step": 137 }, { "epoch": 0.0010732183797202166, "grad_norm": 0.21754265977683515, "learning_rate": 9.999979214620444e-05, "loss": 2.5191, "step": 138 }, { "epoch": 0.001080995324500798, "grad_norm": 0.16591650002526143, "learning_rate": 9.99997886083335e-05, "loss": 2.5655, "step": 139 }, { "epoch": 0.001088772269281379, "grad_norm": 0.2055463783205094, "learning_rate": 9.99997850406072e-05, "loss": 2.4494, "step": 140 }, { "epoch": 0.0010965492140619605, "grad_norm": 0.20284297256774428, "learning_rate": 9.999978144302555e-05, "loss": 2.5382, "step": 141 }, { "epoch": 0.0011043261588425419, "grad_norm": 0.19701112927647577, "learning_rate": 9.999977781558852e-05, "loss": 2.4864, "step": 142 }, { "epoch": 0.001112103103623123, "grad_norm": 0.18270729157791718, "learning_rate": 9.999977415829615e-05, "loss": 2.6145, "step": 143 }, { "epoch": 0.0011198800484037044, "grad_norm": 0.16344094320226996, "learning_rate": 9.99997704711484e-05, "loss": 2.5399, "step": 144 }, { "epoch": 0.0011276569931842856, "grad_norm": 0.1737773246147734, "learning_rate": 9.999976675414531e-05, "loss": 2.4631, "step": 145 }, { "epoch": 0.001135433937964867, "grad_norm": 0.23850571569622248, "learning_rate": 9.999976300728686e-05, "loss": 2.5542, "step": 146 }, { "epoch": 0.0011432108827454481, "grad_norm": 0.22872726530950357, "learning_rate": 9.999975923057305e-05, "loss": 2.5613, "step": 147 }, { "epoch": 0.0011509878275260295, "grad_norm": 0.26683050295324096, "learning_rate": 9.99997554240039e-05, "loss": 2.5331, "step": 148 }, { "epoch": 0.0011587647723066107, "grad_norm": 0.27363609203927575, "learning_rate": 9.999975158757942e-05, "loss": 2.5999, "step": 149 }, { "epoch": 0.001166541717087192, "grad_norm": 0.22615775378406122, "learning_rate": 9.999974772129957e-05, "loss": 2.4787, "step": 150 }, { "epoch": 0.0011743186618677732, "grad_norm": 0.2175974074392971, "learning_rate": 9.999974382516439e-05, "loss": 2.5071, "step": 151 }, { "epoch": 0.0011820956066483546, "grad_norm": 0.20603906040418357, "learning_rate": 9.999973989917386e-05, "loss": 2.5327, "step": 152 }, { "epoch": 0.0011898725514289357, "grad_norm": 0.18879903080321112, "learning_rate": 9.999973594332801e-05, "loss": 2.5513, "step": 153 }, { "epoch": 0.0011976494962095171, "grad_norm": 0.19093705270267988, "learning_rate": 9.999973195762681e-05, "loss": 2.5199, "step": 154 }, { "epoch": 0.0012054264409900985, "grad_norm": 0.2031079030658894, "learning_rate": 9.999972794207029e-05, "loss": 2.5007, "step": 155 }, { "epoch": 0.0012132033857706797, "grad_norm": 0.2338253519926602, "learning_rate": 9.999972389665843e-05, "loss": 2.5205, "step": 156 }, { "epoch": 0.001220980330551261, "grad_norm": 0.2792419805240614, "learning_rate": 9.999971982139124e-05, "loss": 2.4949, "step": 157 }, { "epoch": 0.0012287572753318422, "grad_norm": 0.3277448962546274, "learning_rate": 9.999971571626872e-05, "loss": 2.5461, "step": 158 }, { "epoch": 0.0012365342201124236, "grad_norm": 0.44444749031119857, "learning_rate": 9.999971158129088e-05, "loss": 2.5338, "step": 159 }, { "epoch": 0.0012443111648930047, "grad_norm": 0.5846653760309161, "learning_rate": 9.999970741645774e-05, "loss": 2.5339, "step": 160 }, { "epoch": 0.0012520881096735861, "grad_norm": 0.5608133526455837, "learning_rate": 9.999970322176927e-05, "loss": 2.4881, "step": 161 }, { "epoch": 0.0012598650544541673, "grad_norm": 0.5039185916978005, "learning_rate": 9.999969899722547e-05, "loss": 2.5376, "step": 162 }, { "epoch": 0.0012676419992347487, "grad_norm": 0.344547017088849, "learning_rate": 9.999969474282638e-05, "loss": 2.5245, "step": 163 }, { "epoch": 0.0012754189440153298, "grad_norm": 0.3031159500381915, "learning_rate": 9.999969045857197e-05, "loss": 2.4302, "step": 164 }, { "epoch": 0.0012831958887959112, "grad_norm": 0.2734332886778911, "learning_rate": 9.999968614446225e-05, "loss": 2.5336, "step": 165 }, { "epoch": 0.0012909728335764924, "grad_norm": 0.3018051661974538, "learning_rate": 9.999968180049723e-05, "loss": 2.4568, "step": 166 }, { "epoch": 0.0012987497783570738, "grad_norm": 0.3455378491936719, "learning_rate": 9.99996774266769e-05, "loss": 2.5641, "step": 167 }, { "epoch": 0.0013065267231376551, "grad_norm": 0.2480465795667683, "learning_rate": 9.999967302300128e-05, "loss": 2.5716, "step": 168 }, { "epoch": 0.0013143036679182363, "grad_norm": 0.2514287696086687, "learning_rate": 9.999966858947036e-05, "loss": 2.5079, "step": 169 }, { "epoch": 0.0013220806126988177, "grad_norm": 0.22533172504271282, "learning_rate": 9.999966412608414e-05, "loss": 2.4725, "step": 170 }, { "epoch": 0.0013298575574793988, "grad_norm": 0.23971264057999153, "learning_rate": 9.999965963284264e-05, "loss": 2.502, "step": 171 }, { "epoch": 0.0013376345022599802, "grad_norm": 0.18784028823391496, "learning_rate": 9.999965510974583e-05, "loss": 2.4469, "step": 172 }, { "epoch": 0.0013454114470405614, "grad_norm": 0.17857976406515053, "learning_rate": 9.999965055679374e-05, "loss": 2.5444, "step": 173 }, { "epoch": 0.0013531883918211428, "grad_norm": 0.19542590673936472, "learning_rate": 9.999964597398637e-05, "loss": 2.5928, "step": 174 }, { "epoch": 0.001360965336601724, "grad_norm": 0.16486830983204365, "learning_rate": 9.999964136132371e-05, "loss": 2.476, "step": 175 }, { "epoch": 0.0013687422813823053, "grad_norm": 0.14951204656038203, "learning_rate": 9.999963671880579e-05, "loss": 2.4748, "step": 176 }, { "epoch": 0.0013765192261628865, "grad_norm": 0.1724965245370557, "learning_rate": 9.99996320464326e-05, "loss": 2.4217, "step": 177 }, { "epoch": 0.0013842961709434679, "grad_norm": 0.1513826250398295, "learning_rate": 9.999962734420412e-05, "loss": 2.4795, "step": 178 }, { "epoch": 0.001392073115724049, "grad_norm": 0.15083051046641818, "learning_rate": 9.999962261212036e-05, "loss": 2.4869, "step": 179 }, { "epoch": 0.0013998500605046304, "grad_norm": 0.16583738068322795, "learning_rate": 9.999961785018135e-05, "loss": 2.4583, "step": 180 }, { "epoch": 0.0014076270052852118, "grad_norm": 0.1999735230887107, "learning_rate": 9.999961305838707e-05, "loss": 2.4767, "step": 181 }, { "epoch": 0.001415403950065793, "grad_norm": 0.16678356190398105, "learning_rate": 9.999960823673754e-05, "loss": 2.482, "step": 182 }, { "epoch": 0.0014231808948463743, "grad_norm": 0.22738298567243978, "learning_rate": 9.999960338523272e-05, "loss": 2.5687, "step": 183 }, { "epoch": 0.0014309578396269555, "grad_norm": 0.19518786426220736, "learning_rate": 9.999959850387266e-05, "loss": 2.4665, "step": 184 }, { "epoch": 0.0014387347844075369, "grad_norm": 0.21184703214996103, "learning_rate": 9.999959359265736e-05, "loss": 2.5011, "step": 185 }, { "epoch": 0.001446511729188118, "grad_norm": 0.16913264290117558, "learning_rate": 9.99995886515868e-05, "loss": 2.512, "step": 186 }, { "epoch": 0.0014542886739686994, "grad_norm": 0.17622120126810106, "learning_rate": 9.999958368066099e-05, "loss": 2.4729, "step": 187 }, { "epoch": 0.0014620656187492806, "grad_norm": 0.15467929647703021, "learning_rate": 9.999957867987995e-05, "loss": 2.5215, "step": 188 }, { "epoch": 0.001469842563529862, "grad_norm": 0.18719535328307715, "learning_rate": 9.999957364924366e-05, "loss": 2.4168, "step": 189 }, { "epoch": 0.0014776195083104431, "grad_norm": 0.23959254003410493, "learning_rate": 9.999956858875212e-05, "loss": 2.4665, "step": 190 }, { "epoch": 0.0014853964530910245, "grad_norm": 0.28552799057938066, "learning_rate": 9.999956349840536e-05, "loss": 2.5351, "step": 191 }, { "epoch": 0.0014931733978716057, "grad_norm": 0.25929025376035514, "learning_rate": 9.999955837820336e-05, "loss": 2.5162, "step": 192 }, { "epoch": 0.001500950342652187, "grad_norm": 0.19804466396897186, "learning_rate": 9.999955322814613e-05, "loss": 2.5364, "step": 193 }, { "epoch": 0.0015087272874327684, "grad_norm": 0.16095878745966657, "learning_rate": 9.999954804823368e-05, "loss": 2.5345, "step": 194 }, { "epoch": 0.0015165042322133496, "grad_norm": 0.1971001983582156, "learning_rate": 9.999954283846602e-05, "loss": 2.4145, "step": 195 }, { "epoch": 0.001524281176993931, "grad_norm": 0.21054181663552127, "learning_rate": 9.999953759884313e-05, "loss": 2.4438, "step": 196 }, { "epoch": 0.0015320581217745121, "grad_norm": 0.1742488015644025, "learning_rate": 9.999953232936501e-05, "loss": 2.4676, "step": 197 }, { "epoch": 0.0015398350665550935, "grad_norm": 0.164875720651087, "learning_rate": 9.999952703003168e-05, "loss": 2.4937, "step": 198 }, { "epoch": 0.0015476120113356747, "grad_norm": 0.18484071001659083, "learning_rate": 9.999952170084316e-05, "loss": 2.5146, "step": 199 }, { "epoch": 0.001555388956116256, "grad_norm": 0.20317539596237552, "learning_rate": 9.999951634179941e-05, "loss": 2.4366, "step": 200 }, { "epoch": 0.0015631659008968372, "grad_norm": 2.4465710262786224, "learning_rate": 9.999951095290048e-05, "loss": 2.4763, "step": 201 }, { "epoch": 0.0015709428456774186, "grad_norm": 0.25347394670379253, "learning_rate": 9.999950553414633e-05, "loss": 2.443, "step": 202 }, { "epoch": 0.0015787197904579997, "grad_norm": 0.3894222799590408, "learning_rate": 9.999950008553698e-05, "loss": 2.4251, "step": 203 }, { "epoch": 0.0015864967352385811, "grad_norm": 0.3920324953337565, "learning_rate": 9.999949460707245e-05, "loss": 2.5516, "step": 204 }, { "epoch": 0.0015942736800191623, "grad_norm": 0.4729131293029798, "learning_rate": 9.999948909875273e-05, "loss": 2.4776, "step": 205 }, { "epoch": 0.0016020506247997437, "grad_norm": 0.6932196426632694, "learning_rate": 9.99994835605778e-05, "loss": 2.4843, "step": 206 }, { "epoch": 0.001609827569580325, "grad_norm": 0.7503411106111623, "learning_rate": 9.999947799254771e-05, "loss": 2.5252, "step": 207 }, { "epoch": 0.0016176045143609062, "grad_norm": 0.43267972022815754, "learning_rate": 9.999947239466243e-05, "loss": 2.4545, "step": 208 }, { "epoch": 0.0016253814591414876, "grad_norm": 0.3965708809015969, "learning_rate": 9.999946676692198e-05, "loss": 2.4902, "step": 209 }, { "epoch": 0.0016331584039220688, "grad_norm": 0.6228463782556325, "learning_rate": 9.999946110932634e-05, "loss": 2.4883, "step": 210 }, { "epoch": 0.0016409353487026501, "grad_norm": 0.3709589595394729, "learning_rate": 9.999945542187555e-05, "loss": 2.5056, "step": 211 }, { "epoch": 0.0016487122934832313, "grad_norm": 0.3511040936843577, "learning_rate": 9.999944970456957e-05, "loss": 2.4071, "step": 212 }, { "epoch": 0.0016564892382638127, "grad_norm": 0.45827408543736553, "learning_rate": 9.999944395740844e-05, "loss": 2.4862, "step": 213 }, { "epoch": 0.0016642661830443938, "grad_norm": 0.2883804572854823, "learning_rate": 9.999943818039215e-05, "loss": 2.4915, "step": 214 }, { "epoch": 0.0016720431278249752, "grad_norm": 0.3677002242101439, "learning_rate": 9.99994323735207e-05, "loss": 2.4433, "step": 215 }, { "epoch": 0.0016798200726055564, "grad_norm": 0.2486439994642731, "learning_rate": 9.999942653679409e-05, "loss": 2.5016, "step": 216 }, { "epoch": 0.0016875970173861378, "grad_norm": 0.25158863170738166, "learning_rate": 9.999942067021233e-05, "loss": 2.4746, "step": 217 }, { "epoch": 0.001695373962166719, "grad_norm": 0.2023248211465051, "learning_rate": 9.999941477377543e-05, "loss": 2.4454, "step": 218 }, { "epoch": 0.0017031509069473003, "grad_norm": 0.2504558032573616, "learning_rate": 9.999940884748338e-05, "loss": 2.4659, "step": 219 }, { "epoch": 0.0017109278517278817, "grad_norm": 0.18838722157525986, "learning_rate": 9.999940289133621e-05, "loss": 2.475, "step": 220 }, { "epoch": 0.0017187047965084629, "grad_norm": 0.20142852995398497, "learning_rate": 9.999939690533388e-05, "loss": 2.4936, "step": 221 }, { "epoch": 0.0017264817412890442, "grad_norm": 0.19417837305195848, "learning_rate": 9.999939088947643e-05, "loss": 2.4997, "step": 222 }, { "epoch": 0.0017342586860696254, "grad_norm": 0.16099701276739664, "learning_rate": 9.999938484376383e-05, "loss": 2.416, "step": 223 }, { "epoch": 0.0017420356308502068, "grad_norm": 0.19268000744918307, "learning_rate": 9.999937876819615e-05, "loss": 2.459, "step": 224 }, { "epoch": 0.001749812575630788, "grad_norm": 0.18112704678025796, "learning_rate": 9.999937266277331e-05, "loss": 2.4473, "step": 225 }, { "epoch": 0.0017575895204113693, "grad_norm": 0.19124204404376016, "learning_rate": 9.999936652749536e-05, "loss": 2.4488, "step": 226 }, { "epoch": 0.0017653664651919505, "grad_norm": 0.17035649424438393, "learning_rate": 9.99993603623623e-05, "loss": 2.4566, "step": 227 }, { "epoch": 0.0017731434099725319, "grad_norm": 0.17082879184057445, "learning_rate": 9.999935416737411e-05, "loss": 2.4518, "step": 228 }, { "epoch": 0.001780920354753113, "grad_norm": 0.19361335360276244, "learning_rate": 9.999934794253084e-05, "loss": 2.4839, "step": 229 }, { "epoch": 0.0017886972995336944, "grad_norm": 0.16486294631351225, "learning_rate": 9.999934168783246e-05, "loss": 2.3732, "step": 230 }, { "epoch": 0.0017964742443142756, "grad_norm": 0.16584271730986772, "learning_rate": 9.999933540327896e-05, "loss": 2.4586, "step": 231 }, { "epoch": 0.001804251189094857, "grad_norm": 0.19880816366875337, "learning_rate": 9.99993290888704e-05, "loss": 2.4904, "step": 232 }, { "epoch": 0.0018120281338754383, "grad_norm": 0.18147271233408407, "learning_rate": 9.999932274460673e-05, "loss": 2.4589, "step": 233 }, { "epoch": 0.0018198050786560195, "grad_norm": 0.16307524162198617, "learning_rate": 9.999931637048797e-05, "loss": 2.4424, "step": 234 }, { "epoch": 0.0018275820234366009, "grad_norm": 0.19810155537927837, "learning_rate": 9.999930996651413e-05, "loss": 2.4675, "step": 235 }, { "epoch": 0.001835358968217182, "grad_norm": 0.171604370533007, "learning_rate": 9.999930353268521e-05, "loss": 2.4609, "step": 236 }, { "epoch": 0.0018431359129977634, "grad_norm": 0.1724700742270467, "learning_rate": 9.999929706900121e-05, "loss": 2.4866, "step": 237 }, { "epoch": 0.0018509128577783446, "grad_norm": 0.24253271527026674, "learning_rate": 9.999929057546213e-05, "loss": 2.3991, "step": 238 }, { "epoch": 0.001858689802558926, "grad_norm": 0.17382686582713502, "learning_rate": 9.9999284052068e-05, "loss": 2.4564, "step": 239 }, { "epoch": 0.0018664667473395071, "grad_norm": 0.14780640406853757, "learning_rate": 9.99992774988188e-05, "loss": 2.4298, "step": 240 }, { "epoch": 0.0018742436921200885, "grad_norm": 0.1895813373878783, "learning_rate": 9.999927091571452e-05, "loss": 2.4725, "step": 241 }, { "epoch": 0.0018820206369006697, "grad_norm": 0.1731568880461865, "learning_rate": 9.99992643027552e-05, "loss": 2.4222, "step": 242 }, { "epoch": 0.001889797581681251, "grad_norm": 0.13767524425586766, "learning_rate": 9.999925765994082e-05, "loss": 2.4238, "step": 243 }, { "epoch": 0.0018975745264618322, "grad_norm": 0.1787532876997416, "learning_rate": 9.999925098727139e-05, "loss": 2.4375, "step": 244 }, { "epoch": 0.0019053514712424136, "grad_norm": 0.17380940275685675, "learning_rate": 9.999924428474692e-05, "loss": 2.4771, "step": 245 }, { "epoch": 0.001913128416022995, "grad_norm": 0.15856213125892146, "learning_rate": 9.99992375523674e-05, "loss": 2.4167, "step": 246 }, { "epoch": 0.0019209053608035761, "grad_norm": 0.14245691991534729, "learning_rate": 9.999923079013284e-05, "loss": 2.3758, "step": 247 }, { "epoch": 0.0019286823055841575, "grad_norm": 0.1509464897000436, "learning_rate": 9.999922399804326e-05, "loss": 2.4579, "step": 248 }, { "epoch": 0.0019364592503647387, "grad_norm": 0.14877274888915573, "learning_rate": 9.999921717609865e-05, "loss": 2.4454, "step": 249 }, { "epoch": 0.00194423619514532, "grad_norm": 0.16638724262506066, "learning_rate": 9.999921032429901e-05, "loss": 2.3996, "step": 250 }, { "epoch": 0.0019520131399259012, "grad_norm": 0.1630855897396164, "learning_rate": 9.999920344264435e-05, "loss": 2.377, "step": 251 }, { "epoch": 0.0019597900847064826, "grad_norm": 0.17678107817849936, "learning_rate": 9.999919653113467e-05, "loss": 2.3942, "step": 252 }, { "epoch": 0.0019675670294870638, "grad_norm": 0.168621779187893, "learning_rate": 9.999918958976997e-05, "loss": 2.4451, "step": 253 }, { "epoch": 0.001975343974267645, "grad_norm": 0.1865576473785629, "learning_rate": 9.999918261855028e-05, "loss": 2.4732, "step": 254 }, { "epoch": 0.0019831209190482265, "grad_norm": 0.15259982245223289, "learning_rate": 9.999917561747557e-05, "loss": 2.3742, "step": 255 }, { "epoch": 0.0019908978638288077, "grad_norm": 0.15911779331159193, "learning_rate": 9.999916858654588e-05, "loss": 2.477, "step": 256 }, { "epoch": 0.001998674808609389, "grad_norm": 0.15598616581334626, "learning_rate": 9.999916152576117e-05, "loss": 2.3394, "step": 257 }, { "epoch": 0.0020064517533899704, "grad_norm": 0.15015729517422818, "learning_rate": 9.999915443512147e-05, "loss": 2.4384, "step": 258 }, { "epoch": 0.0020142286981705516, "grad_norm": 0.1618808517285124, "learning_rate": 9.99991473146268e-05, "loss": 2.4165, "step": 259 }, { "epoch": 0.0020220056429511328, "grad_norm": 0.20341998282011275, "learning_rate": 9.999914016427713e-05, "loss": 2.3916, "step": 260 }, { "epoch": 0.002029782587731714, "grad_norm": 0.2146783027401428, "learning_rate": 9.999913298407249e-05, "loss": 2.4346, "step": 261 }, { "epoch": 0.0020375595325122955, "grad_norm": 0.1580502923359377, "learning_rate": 9.999912577401287e-05, "loss": 2.4143, "step": 262 }, { "epoch": 0.0020453364772928767, "grad_norm": 0.17133451331742386, "learning_rate": 9.99991185340983e-05, "loss": 2.4091, "step": 263 }, { "epoch": 0.002053113422073458, "grad_norm": 0.18043732583780955, "learning_rate": 9.999911126432873e-05, "loss": 2.4092, "step": 264 }, { "epoch": 0.002060890366854039, "grad_norm": 0.16393460173696264, "learning_rate": 9.999910396470421e-05, "loss": 2.4654, "step": 265 }, { "epoch": 0.0020686673116346206, "grad_norm": 0.2018552041172487, "learning_rate": 9.999909663522473e-05, "loss": 2.3808, "step": 266 }, { "epoch": 0.0020764442564152018, "grad_norm": 0.23500336852571094, "learning_rate": 9.999908927589031e-05, "loss": 2.4444, "step": 267 }, { "epoch": 0.002084221201195783, "grad_norm": 0.20288450407820938, "learning_rate": 9.999908188670093e-05, "loss": 2.399, "step": 268 }, { "epoch": 0.002091998145976364, "grad_norm": 0.15612761470658057, "learning_rate": 9.999907446765661e-05, "loss": 2.421, "step": 269 }, { "epoch": 0.0020997750907569457, "grad_norm": 0.19033073667044548, "learning_rate": 9.999906701875735e-05, "loss": 2.4366, "step": 270 }, { "epoch": 0.002107552035537527, "grad_norm": 0.202350067670901, "learning_rate": 9.999905954000317e-05, "loss": 2.4068, "step": 271 }, { "epoch": 0.002115328980318108, "grad_norm": 0.20462602365165744, "learning_rate": 9.999905203139403e-05, "loss": 2.3661, "step": 272 }, { "epoch": 0.0021231059250986896, "grad_norm": 0.1845337803536294, "learning_rate": 9.999904449292999e-05, "loss": 2.4242, "step": 273 }, { "epoch": 0.002130882869879271, "grad_norm": 0.16146803023921025, "learning_rate": 9.9999036924611e-05, "loss": 2.4284, "step": 274 }, { "epoch": 0.002138659814659852, "grad_norm": 0.15783240342212074, "learning_rate": 9.999902932643711e-05, "loss": 2.4165, "step": 275 }, { "epoch": 0.002146436759440433, "grad_norm": 0.15997213703052235, "learning_rate": 9.99990216984083e-05, "loss": 2.4344, "step": 276 }, { "epoch": 0.0021542137042210147, "grad_norm": 0.15362187105292835, "learning_rate": 9.999901404052459e-05, "loss": 2.3859, "step": 277 }, { "epoch": 0.002161990649001596, "grad_norm": 0.15138636833264527, "learning_rate": 9.999900635278596e-05, "loss": 2.3384, "step": 278 }, { "epoch": 0.002169767593782177, "grad_norm": 0.13774488406527713, "learning_rate": 9.999899863519244e-05, "loss": 2.3882, "step": 279 }, { "epoch": 0.002177544538562758, "grad_norm": 0.16972783623663618, "learning_rate": 9.999899088774404e-05, "loss": 2.4555, "step": 280 }, { "epoch": 0.00218532148334334, "grad_norm": 0.1670839636311913, "learning_rate": 9.999898311044074e-05, "loss": 2.3925, "step": 281 }, { "epoch": 0.002193098428123921, "grad_norm": 0.14624213903094777, "learning_rate": 9.999897530328255e-05, "loss": 2.3835, "step": 282 }, { "epoch": 0.002200875372904502, "grad_norm": 0.14918952655783513, "learning_rate": 9.999896746626948e-05, "loss": 2.4287, "step": 283 }, { "epoch": 0.0022086523176850837, "grad_norm": 0.159037549062395, "learning_rate": 9.999895959940153e-05, "loss": 2.3948, "step": 284 }, { "epoch": 0.002216429262465665, "grad_norm": 0.15445401056386035, "learning_rate": 9.999895170267873e-05, "loss": 2.4492, "step": 285 }, { "epoch": 0.002224206207246246, "grad_norm": 0.17299713412731843, "learning_rate": 9.999894377610104e-05, "loss": 2.3778, "step": 286 }, { "epoch": 0.002231983152026827, "grad_norm": 0.16327645959842868, "learning_rate": 9.999893581966849e-05, "loss": 2.3812, "step": 287 }, { "epoch": 0.002239760096807409, "grad_norm": 0.16294552107548027, "learning_rate": 9.999892783338109e-05, "loss": 2.4095, "step": 288 }, { "epoch": 0.00224753704158799, "grad_norm": 0.18319836663461153, "learning_rate": 9.999891981723883e-05, "loss": 2.4584, "step": 289 }, { "epoch": 0.002255313986368571, "grad_norm": 0.1513051019224681, "learning_rate": 9.999891177124173e-05, "loss": 2.3497, "step": 290 }, { "epoch": 0.0022630909311491523, "grad_norm": 0.1550168302286235, "learning_rate": 9.999890369538977e-05, "loss": 2.4083, "step": 291 }, { "epoch": 0.002270867875929734, "grad_norm": 0.18024034371359984, "learning_rate": 9.999889558968301e-05, "loss": 2.4339, "step": 292 }, { "epoch": 0.002278644820710315, "grad_norm": 0.17122737740944394, "learning_rate": 9.999888745412138e-05, "loss": 2.4556, "step": 293 }, { "epoch": 0.0022864217654908962, "grad_norm": 0.15723305405858828, "learning_rate": 9.999887928870494e-05, "loss": 2.4021, "step": 294 }, { "epoch": 0.0022941987102714774, "grad_norm": 0.16701837660242708, "learning_rate": 9.999887109343366e-05, "loss": 2.3743, "step": 295 }, { "epoch": 0.002301975655052059, "grad_norm": 0.23747531356523147, "learning_rate": 9.999886286830758e-05, "loss": 2.4499, "step": 296 }, { "epoch": 0.00230975259983264, "grad_norm": 0.2792808020350421, "learning_rate": 9.999885461332668e-05, "loss": 2.4163, "step": 297 }, { "epoch": 0.0023175295446132213, "grad_norm": 0.37603520324311956, "learning_rate": 9.999884632849099e-05, "loss": 2.4053, "step": 298 }, { "epoch": 0.002325306489393803, "grad_norm": 0.40745049840767594, "learning_rate": 9.999883801380047e-05, "loss": 2.4399, "step": 299 }, { "epoch": 0.002333083434174384, "grad_norm": 0.37641668683858853, "learning_rate": 9.999882966925516e-05, "loss": 2.3761, "step": 300 }, { "epoch": 0.0023408603789549652, "grad_norm": 0.2670076481621427, "learning_rate": 9.999882129485506e-05, "loss": 2.3523, "step": 301 }, { "epoch": 0.0023486373237355464, "grad_norm": 0.18859284450741742, "learning_rate": 9.999881289060016e-05, "loss": 2.4026, "step": 302 }, { "epoch": 0.002356414268516128, "grad_norm": 0.1734946762978537, "learning_rate": 9.999880445649048e-05, "loss": 2.3756, "step": 303 }, { "epoch": 0.002364191213296709, "grad_norm": 0.20542833608857777, "learning_rate": 9.999879599252602e-05, "loss": 2.4271, "step": 304 }, { "epoch": 0.0023719681580772903, "grad_norm": 0.21905990172232573, "learning_rate": 9.999878749870679e-05, "loss": 2.4191, "step": 305 }, { "epoch": 0.0023797451028578715, "grad_norm": 0.19405218001411959, "learning_rate": 9.99987789750328e-05, "loss": 2.371, "step": 306 }, { "epoch": 0.002387522047638453, "grad_norm": 0.17588909413338383, "learning_rate": 9.999877042150403e-05, "loss": 2.4475, "step": 307 }, { "epoch": 0.0023952989924190342, "grad_norm": 0.195902496909208, "learning_rate": 9.999876183812052e-05, "loss": 2.4212, "step": 308 }, { "epoch": 0.0024030759371996154, "grad_norm": 0.23655474160841422, "learning_rate": 9.999875322488223e-05, "loss": 2.4137, "step": 309 }, { "epoch": 0.002410852881980197, "grad_norm": 0.27405200953986897, "learning_rate": 9.999874458178921e-05, "loss": 2.426, "step": 310 }, { "epoch": 0.002418629826760778, "grad_norm": 0.34338852157013855, "learning_rate": 9.999873590884146e-05, "loss": 2.3822, "step": 311 }, { "epoch": 0.0024264067715413593, "grad_norm": 0.22361181390164164, "learning_rate": 9.999872720603896e-05, "loss": 2.4208, "step": 312 }, { "epoch": 0.0024341837163219405, "grad_norm": 0.17342245811266435, "learning_rate": 9.999871847338171e-05, "loss": 2.3599, "step": 313 }, { "epoch": 0.002441960661102522, "grad_norm": 0.21836382735591098, "learning_rate": 9.999870971086976e-05, "loss": 2.4119, "step": 314 }, { "epoch": 0.0024497376058831032, "grad_norm": 0.2816694778889243, "learning_rate": 9.999870091850306e-05, "loss": 2.3618, "step": 315 }, { "epoch": 0.0024575145506636844, "grad_norm": 0.2107214287320401, "learning_rate": 9.999869209628167e-05, "loss": 2.3812, "step": 316 }, { "epoch": 0.0024652914954442656, "grad_norm": 0.14837175229128466, "learning_rate": 9.999868324420554e-05, "loss": 2.3974, "step": 317 }, { "epoch": 0.002473068440224847, "grad_norm": 0.18787317985439259, "learning_rate": 9.999867436227472e-05, "loss": 2.46, "step": 318 }, { "epoch": 0.0024808453850054283, "grad_norm": 0.2511476138687557, "learning_rate": 9.999866545048921e-05, "loss": 2.3847, "step": 319 }, { "epoch": 0.0024886223297860095, "grad_norm": 0.1744221676120278, "learning_rate": 9.9998656508849e-05, "loss": 2.4406, "step": 320 }, { "epoch": 0.0024963992745665907, "grad_norm": 0.1454487631017628, "learning_rate": 9.999864753735409e-05, "loss": 2.4015, "step": 321 }, { "epoch": 0.0025041762193471723, "grad_norm": 0.1886537945761126, "learning_rate": 9.999863853600448e-05, "loss": 2.3603, "step": 322 }, { "epoch": 0.0025119531641277534, "grad_norm": 0.19746905742248413, "learning_rate": 9.999862950480023e-05, "loss": 2.4467, "step": 323 }, { "epoch": 0.0025197301089083346, "grad_norm": 0.20088552121514433, "learning_rate": 9.999862044374128e-05, "loss": 2.4415, "step": 324 }, { "epoch": 0.002527507053688916, "grad_norm": 0.2080930125580025, "learning_rate": 9.999861135282767e-05, "loss": 2.3521, "step": 325 }, { "epoch": 0.0025352839984694973, "grad_norm": 0.20553596395631557, "learning_rate": 9.999860223205937e-05, "loss": 2.3634, "step": 326 }, { "epoch": 0.0025430609432500785, "grad_norm": 0.18016739814537666, "learning_rate": 9.999859308143645e-05, "loss": 2.4461, "step": 327 }, { "epoch": 0.0025508378880306597, "grad_norm": 0.1494065762684154, "learning_rate": 9.999858390095886e-05, "loss": 2.4499, "step": 328 }, { "epoch": 0.0025586148328112413, "grad_norm": 0.17683425165134678, "learning_rate": 9.999857469062662e-05, "loss": 2.3864, "step": 329 }, { "epoch": 0.0025663917775918224, "grad_norm": 0.20087944173710282, "learning_rate": 9.999856545043974e-05, "loss": 2.4483, "step": 330 }, { "epoch": 0.0025741687223724036, "grad_norm": 0.2222091452293225, "learning_rate": 9.999855618039823e-05, "loss": 2.459, "step": 331 }, { "epoch": 0.0025819456671529848, "grad_norm": 0.21319249532300485, "learning_rate": 9.999854688050209e-05, "loss": 2.4086, "step": 332 }, { "epoch": 0.0025897226119335663, "grad_norm": 0.17952553075793862, "learning_rate": 9.999853755075131e-05, "loss": 2.3994, "step": 333 }, { "epoch": 0.0025974995567141475, "grad_norm": 0.15540285825661437, "learning_rate": 9.999852819114592e-05, "loss": 2.3616, "step": 334 }, { "epoch": 0.0026052765014947287, "grad_norm": 0.16539170217767304, "learning_rate": 9.999851880168592e-05, "loss": 2.3876, "step": 335 }, { "epoch": 0.0026130534462753103, "grad_norm": 0.24250669574605746, "learning_rate": 9.999850938237133e-05, "loss": 2.4133, "step": 336 }, { "epoch": 0.0026208303910558914, "grad_norm": 0.3297802455052229, "learning_rate": 9.99984999332021e-05, "loss": 2.3682, "step": 337 }, { "epoch": 0.0026286073358364726, "grad_norm": 0.3330225011333816, "learning_rate": 9.99984904541783e-05, "loss": 2.3612, "step": 338 }, { "epoch": 0.0026363842806170538, "grad_norm": 0.21748969980657928, "learning_rate": 9.999848094529989e-05, "loss": 2.3552, "step": 339 }, { "epoch": 0.0026441612253976354, "grad_norm": 0.15534324977105235, "learning_rate": 9.999847140656691e-05, "loss": 2.4183, "step": 340 }, { "epoch": 0.0026519381701782165, "grad_norm": 0.2503822691943333, "learning_rate": 9.999846183797935e-05, "loss": 2.3483, "step": 341 }, { "epoch": 0.0026597151149587977, "grad_norm": 0.31249553875406616, "learning_rate": 9.99984522395372e-05, "loss": 2.3542, "step": 342 }, { "epoch": 0.002667492059739379, "grad_norm": 0.30141242857819966, "learning_rate": 9.999844261124049e-05, "loss": 2.388, "step": 343 }, { "epoch": 0.0026752690045199604, "grad_norm": 0.189710072706099, "learning_rate": 9.999843295308922e-05, "loss": 2.4124, "step": 344 }, { "epoch": 0.0026830459493005416, "grad_norm": 0.17553374899725585, "learning_rate": 9.99984232650834e-05, "loss": 2.3755, "step": 345 }, { "epoch": 0.0026908228940811228, "grad_norm": 0.182815787262757, "learning_rate": 9.999841354722301e-05, "loss": 2.3864, "step": 346 }, { "epoch": 0.002698599838861704, "grad_norm": 0.1693823815662843, "learning_rate": 9.999840379950809e-05, "loss": 2.4332, "step": 347 }, { "epoch": 0.0027063767836422855, "grad_norm": 0.1493044806817198, "learning_rate": 9.999839402193863e-05, "loss": 2.3806, "step": 348 }, { "epoch": 0.0027141537284228667, "grad_norm": 1.0512601282560217, "learning_rate": 9.999838421451462e-05, "loss": 2.4482, "step": 349 }, { "epoch": 0.002721930673203448, "grad_norm": 0.2352934934182863, "learning_rate": 9.99983743772361e-05, "loss": 2.4302, "step": 350 }, { "epoch": 0.0027297076179840295, "grad_norm": 0.28367561301612476, "learning_rate": 9.999836451010305e-05, "loss": 2.4385, "step": 351 }, { "epoch": 0.0027374845627646106, "grad_norm": 0.23592862598912256, "learning_rate": 9.999835461311548e-05, "loss": 2.3743, "step": 352 }, { "epoch": 0.0027452615075451918, "grad_norm": 0.2925924454983817, "learning_rate": 9.999834468627341e-05, "loss": 2.4411, "step": 353 }, { "epoch": 0.002753038452325773, "grad_norm": 0.39342528904188945, "learning_rate": 9.999833472957682e-05, "loss": 2.4037, "step": 354 }, { "epoch": 0.0027608153971063545, "grad_norm": 0.5797902578366818, "learning_rate": 9.999832474302575e-05, "loss": 2.3761, "step": 355 }, { "epoch": 0.0027685923418869357, "grad_norm": 0.7007507136436109, "learning_rate": 9.999831472662017e-05, "loss": 2.3897, "step": 356 }, { "epoch": 0.002776369286667517, "grad_norm": 0.5664157314588293, "learning_rate": 9.999830468036014e-05, "loss": 2.4031, "step": 357 }, { "epoch": 0.002784146231448098, "grad_norm": 0.2617666916855854, "learning_rate": 9.99982946042456e-05, "loss": 2.366, "step": 358 }, { "epoch": 0.0027919231762286796, "grad_norm": 0.28623455251569735, "learning_rate": 9.999828449827658e-05, "loss": 2.396, "step": 359 }, { "epoch": 0.002799700121009261, "grad_norm": 0.3991981147641396, "learning_rate": 9.99982743624531e-05, "loss": 2.4348, "step": 360 }, { "epoch": 0.002807477065789842, "grad_norm": 0.3794573667453484, "learning_rate": 9.999826419677516e-05, "loss": 2.3996, "step": 361 }, { "epoch": 0.0028152540105704235, "grad_norm": 0.3369686826281135, "learning_rate": 9.999825400124277e-05, "loss": 2.3206, "step": 362 }, { "epoch": 0.0028230309553510047, "grad_norm": 0.25895027714512103, "learning_rate": 9.999824377585592e-05, "loss": 2.3626, "step": 363 }, { "epoch": 0.002830807900131586, "grad_norm": 0.4203255092409985, "learning_rate": 9.999823352061464e-05, "loss": 2.369, "step": 364 }, { "epoch": 0.002838584844912167, "grad_norm": 0.4702457727855839, "learning_rate": 9.999822323551891e-05, "loss": 2.4257, "step": 365 }, { "epoch": 0.0028463617896927486, "grad_norm": 0.34112087479373554, "learning_rate": 9.999821292056875e-05, "loss": 2.3262, "step": 366 }, { "epoch": 0.00285413873447333, "grad_norm": 0.19546478157651898, "learning_rate": 9.999820257576416e-05, "loss": 2.4147, "step": 367 }, { "epoch": 0.002861915679253911, "grad_norm": 0.3298770043405219, "learning_rate": 9.999819220110516e-05, "loss": 2.4145, "step": 368 }, { "epoch": 0.002869692624034492, "grad_norm": 0.2743268982720131, "learning_rate": 9.999818179659172e-05, "loss": 2.3713, "step": 369 }, { "epoch": 0.0028774695688150737, "grad_norm": 0.21639635830486792, "learning_rate": 9.999817136222392e-05, "loss": 2.463, "step": 370 }, { "epoch": 0.002885246513595655, "grad_norm": 0.7555383673905623, "learning_rate": 9.99981608980017e-05, "loss": 2.4552, "step": 371 }, { "epoch": 0.002893023458376236, "grad_norm": 0.40917895845761343, "learning_rate": 9.999815040392506e-05, "loss": 2.3939, "step": 372 }, { "epoch": 0.002900800403156817, "grad_norm": 0.2288519146284771, "learning_rate": 9.999813987999406e-05, "loss": 2.4113, "step": 373 }, { "epoch": 0.002908577347937399, "grad_norm": 0.42575551744447593, "learning_rate": 9.999812932620866e-05, "loss": 2.3856, "step": 374 }, { "epoch": 0.00291635429271798, "grad_norm": 3.8268549806346006, "learning_rate": 9.99981187425689e-05, "loss": 2.3931, "step": 375 }, { "epoch": 0.002924131237498561, "grad_norm": 0.901436408379824, "learning_rate": 9.999810812907476e-05, "loss": 2.4072, "step": 376 }, { "epoch": 0.0029319081822791427, "grad_norm": 0.7380155234910668, "learning_rate": 9.999809748572627e-05, "loss": 2.4174, "step": 377 }, { "epoch": 0.002939685127059724, "grad_norm": 0.6046098503754478, "learning_rate": 9.999808681252341e-05, "loss": 2.4154, "step": 378 }, { "epoch": 0.002947462071840305, "grad_norm": 0.4628502285737607, "learning_rate": 9.999807610946621e-05, "loss": 2.4027, "step": 379 }, { "epoch": 0.0029552390166208862, "grad_norm": 0.44262949498257587, "learning_rate": 9.999806537655465e-05, "loss": 2.3648, "step": 380 }, { "epoch": 0.002963015961401468, "grad_norm": 0.3022548969369873, "learning_rate": 9.999805461378877e-05, "loss": 2.3886, "step": 381 }, { "epoch": 0.002970792906182049, "grad_norm": 0.283708374121209, "learning_rate": 9.999804382116855e-05, "loss": 2.4319, "step": 382 }, { "epoch": 0.00297856985096263, "grad_norm": 0.48748562257053313, "learning_rate": 9.999803299869401e-05, "loss": 2.4614, "step": 383 }, { "epoch": 0.0029863467957432113, "grad_norm": 0.2979036534918445, "learning_rate": 9.999802214636516e-05, "loss": 2.3659, "step": 384 }, { "epoch": 0.002994123740523793, "grad_norm": 0.22190511607600016, "learning_rate": 9.999801126418198e-05, "loss": 2.3656, "step": 385 }, { "epoch": 0.003001900685304374, "grad_norm": 0.49489669941350883, "learning_rate": 9.99980003521445e-05, "loss": 2.3919, "step": 386 }, { "epoch": 0.0030096776300849552, "grad_norm": 0.4001525045260654, "learning_rate": 9.999798941025274e-05, "loss": 2.3988, "step": 387 }, { "epoch": 0.003017454574865537, "grad_norm": 0.25266184144501547, "learning_rate": 9.999797843850667e-05, "loss": 2.3709, "step": 388 }, { "epoch": 0.003025231519646118, "grad_norm": 0.37790314850010726, "learning_rate": 9.99979674369063e-05, "loss": 2.3716, "step": 389 }, { "epoch": 0.003033008464426699, "grad_norm": 0.35327358149089866, "learning_rate": 9.999795640545167e-05, "loss": 2.3525, "step": 390 }, { "epoch": 0.0030407854092072803, "grad_norm": 0.26446050071727045, "learning_rate": 9.999794534414277e-05, "loss": 2.4501, "step": 391 }, { "epoch": 0.003048562353987862, "grad_norm": 0.33993550875240175, "learning_rate": 9.999793425297961e-05, "loss": 2.3905, "step": 392 }, { "epoch": 0.003056339298768443, "grad_norm": 0.2604106161654596, "learning_rate": 9.999792313196217e-05, "loss": 2.3715, "step": 393 }, { "epoch": 0.0030641162435490242, "grad_norm": 0.2837404584348805, "learning_rate": 9.99979119810905e-05, "loss": 2.3813, "step": 394 }, { "epoch": 0.0030718931883296054, "grad_norm": 0.2120071725903381, "learning_rate": 9.999790080036457e-05, "loss": 2.3822, "step": 395 }, { "epoch": 0.003079670133110187, "grad_norm": 0.20899840884715942, "learning_rate": 9.99978895897844e-05, "loss": 2.385, "step": 396 }, { "epoch": 0.003087447077890768, "grad_norm": 0.25953954366211274, "learning_rate": 9.999787834935e-05, "loss": 2.382, "step": 397 }, { "epoch": 0.0030952240226713493, "grad_norm": 0.153162477834582, "learning_rate": 9.999786707906138e-05, "loss": 2.3591, "step": 398 }, { "epoch": 0.0031030009674519305, "grad_norm": 0.2264751111640312, "learning_rate": 9.999785577891854e-05, "loss": 2.333, "step": 399 }, { "epoch": 0.003110777912232512, "grad_norm": 0.19987420309199966, "learning_rate": 9.999784444892147e-05, "loss": 2.3713, "step": 400 }, { "epoch": 0.0031185548570130932, "grad_norm": 0.18796460266672715, "learning_rate": 9.999783308907022e-05, "loss": 2.3731, "step": 401 }, { "epoch": 0.0031263318017936744, "grad_norm": 0.24793833042527927, "learning_rate": 9.999782169936476e-05, "loss": 2.3082, "step": 402 }, { "epoch": 0.003134108746574256, "grad_norm": 0.16195308300006153, "learning_rate": 9.999781027980511e-05, "loss": 2.4295, "step": 403 }, { "epoch": 0.003141885691354837, "grad_norm": 0.23741540593606417, "learning_rate": 9.999779883039127e-05, "loss": 2.3358, "step": 404 }, { "epoch": 0.0031496626361354183, "grad_norm": 0.15395033369288452, "learning_rate": 9.999778735112324e-05, "loss": 2.3183, "step": 405 }, { "epoch": 0.0031574395809159995, "grad_norm": 0.19991330776131677, "learning_rate": 9.999777584200105e-05, "loss": 2.3448, "step": 406 }, { "epoch": 0.003165216525696581, "grad_norm": 0.20326729230107304, "learning_rate": 9.99977643030247e-05, "loss": 2.4373, "step": 407 }, { "epoch": 0.0031729934704771623, "grad_norm": 0.15699480762409046, "learning_rate": 9.99977527341942e-05, "loss": 2.4391, "step": 408 }, { "epoch": 0.0031807704152577434, "grad_norm": 0.1881015804780194, "learning_rate": 9.999774113550954e-05, "loss": 2.3816, "step": 409 }, { "epoch": 0.0031885473600383246, "grad_norm": 0.14635925701874922, "learning_rate": 9.999772950697073e-05, "loss": 2.3676, "step": 410 }, { "epoch": 0.003196324304818906, "grad_norm": 0.18296241123198856, "learning_rate": 9.999771784857779e-05, "loss": 2.3333, "step": 411 }, { "epoch": 0.0032041012495994873, "grad_norm": 0.1462381753341106, "learning_rate": 9.999770616033073e-05, "loss": 2.4158, "step": 412 }, { "epoch": 0.0032118781943800685, "grad_norm": 0.2036922371282941, "learning_rate": 9.999769444222952e-05, "loss": 2.3762, "step": 413 }, { "epoch": 0.00321965513916065, "grad_norm": 0.13363715674363028, "learning_rate": 9.999768269427423e-05, "loss": 2.3393, "step": 414 }, { "epoch": 0.0032274320839412313, "grad_norm": 0.17839955054963266, "learning_rate": 9.99976709164648e-05, "loss": 2.3491, "step": 415 }, { "epoch": 0.0032352090287218124, "grad_norm": 0.156345690094181, "learning_rate": 9.999765910880128e-05, "loss": 2.366, "step": 416 }, { "epoch": 0.0032429859735023936, "grad_norm": 0.16938038051344556, "learning_rate": 9.999764727128366e-05, "loss": 2.3269, "step": 417 }, { "epoch": 0.003250762918282975, "grad_norm": 0.13256591099182635, "learning_rate": 9.999763540391197e-05, "loss": 2.3297, "step": 418 }, { "epoch": 0.0032585398630635564, "grad_norm": 0.17752668309075395, "learning_rate": 9.999762350668618e-05, "loss": 2.38, "step": 419 }, { "epoch": 0.0032663168078441375, "grad_norm": 0.17946947783724101, "learning_rate": 9.999761157960632e-05, "loss": 2.3468, "step": 420 }, { "epoch": 0.0032740937526247187, "grad_norm": 0.16579138727221857, "learning_rate": 9.99975996226724e-05, "loss": 2.3471, "step": 421 }, { "epoch": 0.0032818706974053003, "grad_norm": 0.15488715017271032, "learning_rate": 9.999758763588441e-05, "loss": 2.32, "step": 422 }, { "epoch": 0.0032896476421858814, "grad_norm": 0.14952787633294617, "learning_rate": 9.999757561924237e-05, "loss": 2.3786, "step": 423 }, { "epoch": 0.0032974245869664626, "grad_norm": 0.1753023561230869, "learning_rate": 9.99975635727463e-05, "loss": 2.3475, "step": 424 }, { "epoch": 0.003305201531747044, "grad_norm": 0.14414550994380038, "learning_rate": 9.999755149639618e-05, "loss": 2.3301, "step": 425 }, { "epoch": 0.0033129784765276254, "grad_norm": 0.15922777757976736, "learning_rate": 9.999753939019203e-05, "loss": 2.332, "step": 426 }, { "epoch": 0.0033207554213082065, "grad_norm": 0.14511504793815275, "learning_rate": 9.999752725413387e-05, "loss": 2.3738, "step": 427 }, { "epoch": 0.0033285323660887877, "grad_norm": 0.3619948787723085, "learning_rate": 9.999751508822167e-05, "loss": 2.3916, "step": 428 }, { "epoch": 0.0033363093108693693, "grad_norm": 0.2303283911142328, "learning_rate": 9.999750289245547e-05, "loss": 2.3219, "step": 429 }, { "epoch": 0.0033440862556499504, "grad_norm": 0.1675449313418024, "learning_rate": 9.999749066683529e-05, "loss": 2.3494, "step": 430 }, { "epoch": 0.0033518632004305316, "grad_norm": 0.2305876679565056, "learning_rate": 9.999747841136109e-05, "loss": 2.3726, "step": 431 }, { "epoch": 0.0033596401452111128, "grad_norm": 0.16724697865184354, "learning_rate": 9.999746612603291e-05, "loss": 2.3435, "step": 432 }, { "epoch": 0.0033674170899916944, "grad_norm": 0.19583117520183826, "learning_rate": 9.999745381085075e-05, "loss": 2.3699, "step": 433 }, { "epoch": 0.0033751940347722755, "grad_norm": 0.21252901832350102, "learning_rate": 9.999744146581463e-05, "loss": 2.3591, "step": 434 }, { "epoch": 0.0033829709795528567, "grad_norm": 0.16118535013727886, "learning_rate": 9.999742909092452e-05, "loss": 2.4211, "step": 435 }, { "epoch": 0.003390747924333438, "grad_norm": 0.19122887428706303, "learning_rate": 9.999741668618047e-05, "loss": 2.3775, "step": 436 }, { "epoch": 0.0033985248691140195, "grad_norm": 0.1501914343126595, "learning_rate": 9.999740425158246e-05, "loss": 2.3142, "step": 437 }, { "epoch": 0.0034063018138946006, "grad_norm": 0.17941238894401595, "learning_rate": 9.999739178713053e-05, "loss": 2.3234, "step": 438 }, { "epoch": 0.003414078758675182, "grad_norm": 0.17406487943517215, "learning_rate": 9.999737929282463e-05, "loss": 2.373, "step": 439 }, { "epoch": 0.0034218557034557634, "grad_norm": 0.1693890572061884, "learning_rate": 9.999736676866482e-05, "loss": 2.3624, "step": 440 }, { "epoch": 0.0034296326482363445, "grad_norm": 0.22828026946577204, "learning_rate": 9.99973542146511e-05, "loss": 2.3642, "step": 441 }, { "epoch": 0.0034374095930169257, "grad_norm": 0.14612158964432598, "learning_rate": 9.999734163078345e-05, "loss": 2.3469, "step": 442 }, { "epoch": 0.003445186537797507, "grad_norm": 0.17902419708231368, "learning_rate": 9.99973290170619e-05, "loss": 2.3071, "step": 443 }, { "epoch": 0.0034529634825780885, "grad_norm": 0.16416084214405974, "learning_rate": 9.999731637348646e-05, "loss": 2.2874, "step": 444 }, { "epoch": 0.0034607404273586696, "grad_norm": 0.14541019920778384, "learning_rate": 9.999730370005713e-05, "loss": 2.3106, "step": 445 }, { "epoch": 0.003468517372139251, "grad_norm": 0.17474022886956483, "learning_rate": 9.99972909967739e-05, "loss": 2.375, "step": 446 }, { "epoch": 0.003476294316919832, "grad_norm": 0.13122601823636784, "learning_rate": 9.999727826363681e-05, "loss": 2.2971, "step": 447 }, { "epoch": 0.0034840712617004136, "grad_norm": 0.1441507527763934, "learning_rate": 9.999726550064585e-05, "loss": 2.3164, "step": 448 }, { "epoch": 0.0034918482064809947, "grad_norm": 0.12957620956510382, "learning_rate": 9.999725270780103e-05, "loss": 2.3182, "step": 449 }, { "epoch": 0.003499625151261576, "grad_norm": 0.13956427992576123, "learning_rate": 9.999723988510235e-05, "loss": 2.3049, "step": 450 }, { "epoch": 0.0035074020960421575, "grad_norm": 0.154248824135562, "learning_rate": 9.999722703254984e-05, "loss": 2.327, "step": 451 }, { "epoch": 0.0035151790408227386, "grad_norm": 0.13817263391621382, "learning_rate": 9.999721415014349e-05, "loss": 2.371, "step": 452 }, { "epoch": 0.00352295598560332, "grad_norm": 0.13996373526130235, "learning_rate": 9.99972012378833e-05, "loss": 2.2898, "step": 453 }, { "epoch": 0.003530732930383901, "grad_norm": 0.1322226221333067, "learning_rate": 9.99971882957693e-05, "loss": 2.3424, "step": 454 }, { "epoch": 0.0035385098751644826, "grad_norm": 0.15462104042373995, "learning_rate": 9.999717532380147e-05, "loss": 2.3328, "step": 455 }, { "epoch": 0.0035462868199450637, "grad_norm": 0.1489297852620825, "learning_rate": 9.999716232197984e-05, "loss": 2.3109, "step": 456 }, { "epoch": 0.003554063764725645, "grad_norm": 0.13615553956603602, "learning_rate": 9.999714929030443e-05, "loss": 2.3783, "step": 457 }, { "epoch": 0.003561840709506226, "grad_norm": 0.1526958269446435, "learning_rate": 9.999713622877521e-05, "loss": 2.3436, "step": 458 }, { "epoch": 0.0035696176542868076, "grad_norm": 0.13802021130586897, "learning_rate": 9.999712313739222e-05, "loss": 2.2794, "step": 459 }, { "epoch": 0.003577394599067389, "grad_norm": 0.23384990539442962, "learning_rate": 9.999711001615544e-05, "loss": 2.3775, "step": 460 }, { "epoch": 0.00358517154384797, "grad_norm": 0.13620639539438964, "learning_rate": 9.99970968650649e-05, "loss": 2.376, "step": 461 }, { "epoch": 0.003592948488628551, "grad_norm": 0.1322613882890518, "learning_rate": 9.999708368412061e-05, "loss": 2.3319, "step": 462 }, { "epoch": 0.0036007254334091327, "grad_norm": 0.1486346275629189, "learning_rate": 9.999707047332256e-05, "loss": 2.3787, "step": 463 }, { "epoch": 0.003608502378189714, "grad_norm": 0.1711667206175729, "learning_rate": 9.999705723267078e-05, "loss": 2.2993, "step": 464 }, { "epoch": 0.003616279322970295, "grad_norm": 0.14936409810677137, "learning_rate": 9.999704396216525e-05, "loss": 2.3698, "step": 465 }, { "epoch": 0.0036240562677508767, "grad_norm": 0.13851509639890816, "learning_rate": 9.999703066180599e-05, "loss": 2.3419, "step": 466 }, { "epoch": 0.003631833212531458, "grad_norm": 0.1608749651604363, "learning_rate": 9.999701733159301e-05, "loss": 2.3199, "step": 467 }, { "epoch": 0.003639610157312039, "grad_norm": 0.1917251310829306, "learning_rate": 9.999700397152634e-05, "loss": 2.3704, "step": 468 }, { "epoch": 0.00364738710209262, "grad_norm": 0.20010841673646523, "learning_rate": 9.999699058160594e-05, "loss": 2.3478, "step": 469 }, { "epoch": 0.0036551640468732017, "grad_norm": 0.20080526971836424, "learning_rate": 9.999697716183185e-05, "loss": 2.3445, "step": 470 }, { "epoch": 0.003662940991653783, "grad_norm": 0.19263289436508227, "learning_rate": 9.999696371220408e-05, "loss": 2.4104, "step": 471 }, { "epoch": 0.003670717936434364, "grad_norm": 0.14362473788210617, "learning_rate": 9.999695023272264e-05, "loss": 2.3463, "step": 472 }, { "epoch": 0.0036784948812149452, "grad_norm": 0.13984411196949284, "learning_rate": 9.99969367233875e-05, "loss": 2.2895, "step": 473 }, { "epoch": 0.003686271825995527, "grad_norm": 0.1398832428387508, "learning_rate": 9.999692318419872e-05, "loss": 2.2721, "step": 474 }, { "epoch": 0.003694048770776108, "grad_norm": 0.16334009864144614, "learning_rate": 9.999690961515629e-05, "loss": 2.3149, "step": 475 }, { "epoch": 0.003701825715556689, "grad_norm": 0.19476700694539997, "learning_rate": 9.99968960162602e-05, "loss": 2.4171, "step": 476 }, { "epoch": 0.0037096026603372708, "grad_norm": 0.19579256000018427, "learning_rate": 9.999688238751045e-05, "loss": 2.3121, "step": 477 }, { "epoch": 0.003717379605117852, "grad_norm": 0.15428820324641696, "learning_rate": 9.99968687289071e-05, "loss": 2.3867, "step": 478 }, { "epoch": 0.003725156549898433, "grad_norm": 0.16826084109689415, "learning_rate": 9.999685504045011e-05, "loss": 2.3059, "step": 479 }, { "epoch": 0.0037329334946790142, "grad_norm": 0.14973344302607397, "learning_rate": 9.999684132213951e-05, "loss": 2.2898, "step": 480 }, { "epoch": 0.003740710439459596, "grad_norm": 0.17965456990710615, "learning_rate": 9.999682757397531e-05, "loss": 2.3059, "step": 481 }, { "epoch": 0.003748487384240177, "grad_norm": 0.1895716187463182, "learning_rate": 9.999681379595751e-05, "loss": 2.3147, "step": 482 }, { "epoch": 0.003756264329020758, "grad_norm": 0.1611919206983583, "learning_rate": 9.999679998808612e-05, "loss": 2.3703, "step": 483 }, { "epoch": 0.0037640412738013393, "grad_norm": 0.16317205026976495, "learning_rate": 9.999678615036114e-05, "loss": 2.3301, "step": 484 }, { "epoch": 0.003771818218581921, "grad_norm": 0.17172833360044784, "learning_rate": 9.999677228278259e-05, "loss": 2.3181, "step": 485 }, { "epoch": 0.003779595163362502, "grad_norm": 0.17463053813059995, "learning_rate": 9.999675838535047e-05, "loss": 2.3498, "step": 486 }, { "epoch": 0.0037873721081430833, "grad_norm": 0.156830937321242, "learning_rate": 9.999674445806481e-05, "loss": 2.3264, "step": 487 }, { "epoch": 0.0037951490529236644, "grad_norm": 0.139659800495259, "learning_rate": 9.99967305009256e-05, "loss": 2.3411, "step": 488 }, { "epoch": 0.003802925997704246, "grad_norm": 0.13979230676908497, "learning_rate": 9.999671651393283e-05, "loss": 2.3346, "step": 489 }, { "epoch": 0.003810702942484827, "grad_norm": 0.1638279871309997, "learning_rate": 9.999670249708655e-05, "loss": 2.3439, "step": 490 }, { "epoch": 0.0038184798872654083, "grad_norm": 0.18877861115375735, "learning_rate": 9.999668845038673e-05, "loss": 2.3754, "step": 491 }, { "epoch": 0.00382625683204599, "grad_norm": 0.1696567217750582, "learning_rate": 9.99966743738334e-05, "loss": 2.2894, "step": 492 }, { "epoch": 0.003834033776826571, "grad_norm": 0.14714688894193292, "learning_rate": 9.999666026742656e-05, "loss": 2.332, "step": 493 }, { "epoch": 0.0038418107216071523, "grad_norm": 0.14088266835317292, "learning_rate": 9.999664613116621e-05, "loss": 2.2757, "step": 494 }, { "epoch": 0.0038495876663877334, "grad_norm": 0.1390090033036952, "learning_rate": 9.99966319650524e-05, "loss": 2.3352, "step": 495 }, { "epoch": 0.003857364611168315, "grad_norm": 0.17917823774556282, "learning_rate": 9.999661776908508e-05, "loss": 2.3607, "step": 496 }, { "epoch": 0.003865141555948896, "grad_norm": 0.13888877414502812, "learning_rate": 9.999660354326431e-05, "loss": 2.3728, "step": 497 }, { "epoch": 0.0038729185007294773, "grad_norm": 0.14734736378163382, "learning_rate": 9.999658928759006e-05, "loss": 2.4115, "step": 498 }, { "epoch": 0.0038806954455100585, "grad_norm": 0.16051907232324522, "learning_rate": 9.999657500206236e-05, "loss": 2.3062, "step": 499 }, { "epoch": 0.00388847239029064, "grad_norm": 0.1434737750674915, "learning_rate": 9.999656068668122e-05, "loss": 2.3396, "step": 500 }, { "epoch": 0.0038962493350712213, "grad_norm": 0.17458867890143798, "learning_rate": 9.999654634144663e-05, "loss": 2.3856, "step": 501 }, { "epoch": 0.0039040262798518024, "grad_norm": 0.20467345934638467, "learning_rate": 9.999653196635861e-05, "loss": 2.33, "step": 502 }, { "epoch": 0.003911803224632384, "grad_norm": 0.1896978730524515, "learning_rate": 9.999651756141717e-05, "loss": 2.3027, "step": 503 }, { "epoch": 0.003919580169412965, "grad_norm": 0.1442689992861098, "learning_rate": 9.999650312662232e-05, "loss": 2.4118, "step": 504 }, { "epoch": 0.003927357114193546, "grad_norm": 0.18921012830471418, "learning_rate": 9.999648866197407e-05, "loss": 2.3074, "step": 505 }, { "epoch": 0.0039351340589741275, "grad_norm": 0.17460950202693665, "learning_rate": 9.99964741674724e-05, "loss": 2.279, "step": 506 }, { "epoch": 0.003942911003754709, "grad_norm": 0.13665472999596634, "learning_rate": 9.999645964311737e-05, "loss": 2.3065, "step": 507 }, { "epoch": 0.00395068794853529, "grad_norm": 0.1771876279341042, "learning_rate": 9.999644508890895e-05, "loss": 2.3817, "step": 508 }, { "epoch": 0.003958464893315872, "grad_norm": 0.1949584804104512, "learning_rate": 9.999643050484715e-05, "loss": 2.3309, "step": 509 }, { "epoch": 0.003966241838096453, "grad_norm": 0.1771361817137608, "learning_rate": 9.9996415890932e-05, "loss": 2.3739, "step": 510 }, { "epoch": 0.003974018782877034, "grad_norm": 0.19535655617126035, "learning_rate": 9.99964012471635e-05, "loss": 2.3222, "step": 511 }, { "epoch": 0.003981795727657615, "grad_norm": 0.20925084494638296, "learning_rate": 9.999638657354166e-05, "loss": 2.3478, "step": 512 }, { "epoch": 0.0039895726724381965, "grad_norm": 0.18801542426100076, "learning_rate": 9.999637187006647e-05, "loss": 2.3186, "step": 513 }, { "epoch": 0.003997349617218778, "grad_norm": 0.2271620773416596, "learning_rate": 9.999635713673796e-05, "loss": 2.3391, "step": 514 }, { "epoch": 0.004005126561999359, "grad_norm": 0.21658917229639787, "learning_rate": 9.999634237355614e-05, "loss": 2.3432, "step": 515 }, { "epoch": 0.004012903506779941, "grad_norm": 0.15830258761605784, "learning_rate": 9.9996327580521e-05, "loss": 2.3303, "step": 516 }, { "epoch": 0.004020680451560522, "grad_norm": 0.17799661969924277, "learning_rate": 9.999631275763257e-05, "loss": 2.3475, "step": 517 }, { "epoch": 0.004028457396341103, "grad_norm": 0.1788968193701804, "learning_rate": 9.999629790489084e-05, "loss": 2.3303, "step": 518 }, { "epoch": 0.004036234341121684, "grad_norm": 0.19741374012410176, "learning_rate": 9.999628302229585e-05, "loss": 2.3615, "step": 519 }, { "epoch": 0.0040440112859022655, "grad_norm": 0.2189240429889949, "learning_rate": 9.999626810984757e-05, "loss": 2.3863, "step": 520 }, { "epoch": 0.004051788230682847, "grad_norm": 0.20069456207050437, "learning_rate": 9.999625316754602e-05, "loss": 2.276, "step": 521 }, { "epoch": 0.004059565175463428, "grad_norm": 0.16941380250880542, "learning_rate": 9.999623819539121e-05, "loss": 2.3504, "step": 522 }, { "epoch": 0.004067342120244009, "grad_norm": 0.1485134234428808, "learning_rate": 9.999622319338318e-05, "loss": 2.3047, "step": 523 }, { "epoch": 0.004075119065024591, "grad_norm": 0.16280374376908677, "learning_rate": 9.999620816152188e-05, "loss": 2.3268, "step": 524 }, { "epoch": 0.004082896009805172, "grad_norm": 0.16233680724554111, "learning_rate": 9.999619309980738e-05, "loss": 2.3169, "step": 525 }, { "epoch": 0.004090672954585753, "grad_norm": 0.14621456282596332, "learning_rate": 9.999617800823964e-05, "loss": 2.3368, "step": 526 }, { "epoch": 0.0040984498993663345, "grad_norm": 0.1370289309809495, "learning_rate": 9.99961628868187e-05, "loss": 2.3232, "step": 527 }, { "epoch": 0.004106226844146916, "grad_norm": 0.14791346528847363, "learning_rate": 9.999614773554455e-05, "loss": 2.2914, "step": 528 }, { "epoch": 0.004114003788927497, "grad_norm": 0.18645594767619603, "learning_rate": 9.999613255441723e-05, "loss": 2.2987, "step": 529 }, { "epoch": 0.004121780733708078, "grad_norm": 0.15081909325875031, "learning_rate": 9.999611734343671e-05, "loss": 2.3181, "step": 530 }, { "epoch": 0.00412955767848866, "grad_norm": 0.15288561306453635, "learning_rate": 9.999610210260302e-05, "loss": 2.3104, "step": 531 }, { "epoch": 0.004137334623269241, "grad_norm": 0.17515014790337766, "learning_rate": 9.999608683191615e-05, "loss": 2.2611, "step": 532 }, { "epoch": 0.004145111568049822, "grad_norm": 0.2884156726095332, "learning_rate": 9.999607153137613e-05, "loss": 2.3399, "step": 533 }, { "epoch": 0.0041528885128304036, "grad_norm": 0.20232130827219127, "learning_rate": 9.999605620098298e-05, "loss": 2.3162, "step": 534 }, { "epoch": 0.004160665457610985, "grad_norm": 0.1571471592820117, "learning_rate": 9.999604084073669e-05, "loss": 2.3788, "step": 535 }, { "epoch": 0.004168442402391566, "grad_norm": 0.14220382832346853, "learning_rate": 9.999602545063724e-05, "loss": 2.3915, "step": 536 }, { "epoch": 0.004176219347172147, "grad_norm": 0.21097251923089028, "learning_rate": 9.999601003068471e-05, "loss": 2.3142, "step": 537 }, { "epoch": 0.004183996291952728, "grad_norm": 0.17490778907417676, "learning_rate": 9.999599458087905e-05, "loss": 2.2953, "step": 538 }, { "epoch": 0.00419177323673331, "grad_norm": 0.13107543592000284, "learning_rate": 9.99959791012203e-05, "loss": 2.3272, "step": 539 }, { "epoch": 0.004199550181513891, "grad_norm": 0.1469664030209014, "learning_rate": 9.999596359170844e-05, "loss": 2.3113, "step": 540 }, { "epoch": 0.0042073271262944726, "grad_norm": 0.13055451204301752, "learning_rate": 9.999594805234352e-05, "loss": 2.3287, "step": 541 }, { "epoch": 0.004215104071075054, "grad_norm": 0.13879648173404074, "learning_rate": 9.99959324831255e-05, "loss": 2.3398, "step": 542 }, { "epoch": 0.004222881015855635, "grad_norm": 0.1576821640366984, "learning_rate": 9.999591688405443e-05, "loss": 2.3403, "step": 543 }, { "epoch": 0.004230657960636216, "grad_norm": 0.1447032017518288, "learning_rate": 9.99959012551303e-05, "loss": 2.3799, "step": 544 }, { "epoch": 0.004238434905416797, "grad_norm": 0.14891312598917744, "learning_rate": 9.999588559635313e-05, "loss": 2.3232, "step": 545 }, { "epoch": 0.004246211850197379, "grad_norm": 0.1915582364563055, "learning_rate": 9.999586990772294e-05, "loss": 2.3419, "step": 546 }, { "epoch": 0.00425398879497796, "grad_norm": 0.2484564880030742, "learning_rate": 9.99958541892397e-05, "loss": 2.3002, "step": 547 }, { "epoch": 0.004261765739758542, "grad_norm": 0.16264995169524882, "learning_rate": 9.999583844090345e-05, "loss": 2.3144, "step": 548 }, { "epoch": 0.004269542684539123, "grad_norm": 0.1510034918133882, "learning_rate": 9.99958226627142e-05, "loss": 2.3226, "step": 549 }, { "epoch": 0.004277319629319704, "grad_norm": 0.1568549384353263, "learning_rate": 9.999580685467194e-05, "loss": 2.3094, "step": 550 }, { "epoch": 0.004285096574100285, "grad_norm": 0.16449838230989927, "learning_rate": 9.999579101677669e-05, "loss": 2.2891, "step": 551 }, { "epoch": 0.004292873518880866, "grad_norm": 0.1487601820341068, "learning_rate": 9.999577514902848e-05, "loss": 2.3177, "step": 552 }, { "epoch": 0.004300650463661447, "grad_norm": 0.1605866391241864, "learning_rate": 9.999575925142728e-05, "loss": 2.3297, "step": 553 }, { "epoch": 0.004308427408442029, "grad_norm": 0.1605487735635482, "learning_rate": 9.999574332397314e-05, "loss": 2.2751, "step": 554 }, { "epoch": 0.004316204353222611, "grad_norm": 0.16955827822836575, "learning_rate": 9.999572736666602e-05, "loss": 2.2916, "step": 555 }, { "epoch": 0.004323981298003192, "grad_norm": 0.19727041236730897, "learning_rate": 9.999571137950598e-05, "loss": 2.3297, "step": 556 }, { "epoch": 0.004331758242783773, "grad_norm": 0.22762216102347843, "learning_rate": 9.9995695362493e-05, "loss": 2.3764, "step": 557 }, { "epoch": 0.004339535187564354, "grad_norm": 0.25080689551236646, "learning_rate": 9.99956793156271e-05, "loss": 2.2596, "step": 558 }, { "epoch": 0.004347312132344935, "grad_norm": 0.2249938612855262, "learning_rate": 9.99956632389083e-05, "loss": 2.2802, "step": 559 }, { "epoch": 0.004355089077125516, "grad_norm": 0.17813487602021782, "learning_rate": 9.999564713233658e-05, "loss": 2.2803, "step": 560 }, { "epoch": 0.004362866021906098, "grad_norm": 0.14792094962616653, "learning_rate": 9.999563099591197e-05, "loss": 2.2921, "step": 561 }, { "epoch": 0.00437064296668668, "grad_norm": 0.17667662841144746, "learning_rate": 9.999561482963449e-05, "loss": 2.3035, "step": 562 }, { "epoch": 0.004378419911467261, "grad_norm": 0.18728384061164005, "learning_rate": 9.999559863350412e-05, "loss": 2.2791, "step": 563 }, { "epoch": 0.004386196856247842, "grad_norm": 0.1947179804426456, "learning_rate": 9.999558240752089e-05, "loss": 2.267, "step": 564 }, { "epoch": 0.004393973801028423, "grad_norm": 0.20054280167106736, "learning_rate": 9.99955661516848e-05, "loss": 2.2865, "step": 565 }, { "epoch": 0.004401750745809004, "grad_norm": 0.1636621071560139, "learning_rate": 9.999554986599587e-05, "loss": 2.3066, "step": 566 }, { "epoch": 0.004409527690589585, "grad_norm": 0.14154948948776602, "learning_rate": 9.99955335504541e-05, "loss": 2.3591, "step": 567 }, { "epoch": 0.0044173046353701674, "grad_norm": 0.16370693728926808, "learning_rate": 9.999551720505951e-05, "loss": 2.3702, "step": 568 }, { "epoch": 0.004425081580150749, "grad_norm": 0.17127895126697623, "learning_rate": 9.99955008298121e-05, "loss": 2.3053, "step": 569 }, { "epoch": 0.00443285852493133, "grad_norm": 0.1466826607123416, "learning_rate": 9.999548442471189e-05, "loss": 2.3193, "step": 570 }, { "epoch": 0.004440635469711911, "grad_norm": 0.14124117147859386, "learning_rate": 9.999546798975888e-05, "loss": 2.3316, "step": 571 }, { "epoch": 0.004448412414492492, "grad_norm": 0.15792530156945964, "learning_rate": 9.999545152495308e-05, "loss": 2.3176, "step": 572 }, { "epoch": 0.004456189359273073, "grad_norm": 0.14940758586293176, "learning_rate": 9.999543503029451e-05, "loss": 2.3124, "step": 573 }, { "epoch": 0.004463966304053654, "grad_norm": 0.13196620282339333, "learning_rate": 9.999541850578315e-05, "loss": 2.2654, "step": 574 }, { "epoch": 0.004471743248834236, "grad_norm": 0.1334813910754098, "learning_rate": 9.999540195141907e-05, "loss": 2.3155, "step": 575 }, { "epoch": 0.004479520193614818, "grad_norm": 0.15603868975845736, "learning_rate": 9.999538536720223e-05, "loss": 2.3219, "step": 576 }, { "epoch": 0.004487297138395399, "grad_norm": 0.15899673433235442, "learning_rate": 9.999536875313265e-05, "loss": 2.2269, "step": 577 }, { "epoch": 0.00449507408317598, "grad_norm": 0.1429177756311552, "learning_rate": 9.999535210921033e-05, "loss": 2.2411, "step": 578 }, { "epoch": 0.004502851027956561, "grad_norm": 0.13099760305925734, "learning_rate": 9.999533543543531e-05, "loss": 2.3494, "step": 579 }, { "epoch": 0.004510627972737142, "grad_norm": 0.13571060867830598, "learning_rate": 9.999531873180757e-05, "loss": 2.3144, "step": 580 }, { "epoch": 0.004518404917517723, "grad_norm": 0.13120305545864336, "learning_rate": 9.999530199832714e-05, "loss": 2.2965, "step": 581 }, { "epoch": 0.004526181862298305, "grad_norm": 0.13310571766220677, "learning_rate": 9.999528523499402e-05, "loss": 2.3352, "step": 582 }, { "epoch": 0.004533958807078887, "grad_norm": 0.1313527768878608, "learning_rate": 9.999526844180822e-05, "loss": 2.2934, "step": 583 }, { "epoch": 0.004541735751859468, "grad_norm": 0.1888992092012677, "learning_rate": 9.999525161876977e-05, "loss": 2.3221, "step": 584 }, { "epoch": 0.004549512696640049, "grad_norm": 0.15446524760878821, "learning_rate": 9.999523476587865e-05, "loss": 2.2918, "step": 585 }, { "epoch": 0.00455728964142063, "grad_norm": 0.16494018740568678, "learning_rate": 9.999521788313487e-05, "loss": 2.3494, "step": 586 }, { "epoch": 0.004565066586201211, "grad_norm": 0.15856966787587456, "learning_rate": 9.999520097053848e-05, "loss": 2.3534, "step": 587 }, { "epoch": 0.0045728435309817924, "grad_norm": 0.1436338581776061, "learning_rate": 9.999518402808943e-05, "loss": 2.2701, "step": 588 }, { "epoch": 0.004580620475762374, "grad_norm": 0.14246699724927667, "learning_rate": 9.999516705578779e-05, "loss": 2.3144, "step": 589 }, { "epoch": 0.004588397420542955, "grad_norm": 0.1500551276409836, "learning_rate": 9.999515005363353e-05, "loss": 2.344, "step": 590 }, { "epoch": 0.004596174365323537, "grad_norm": 0.15847750604441274, "learning_rate": 9.999513302162668e-05, "loss": 2.2934, "step": 591 }, { "epoch": 0.004603951310104118, "grad_norm": 0.1809751807080797, "learning_rate": 9.999511595976725e-05, "loss": 2.3275, "step": 592 }, { "epoch": 0.004611728254884699, "grad_norm": 0.19951223060150805, "learning_rate": 9.999509886805523e-05, "loss": 2.33, "step": 593 }, { "epoch": 0.00461950519966528, "grad_norm": 0.16958476550446716, "learning_rate": 9.999508174649065e-05, "loss": 2.2955, "step": 594 }, { "epoch": 0.0046272821444458614, "grad_norm": 0.21594321566131797, "learning_rate": 9.99950645950735e-05, "loss": 2.3483, "step": 595 }, { "epoch": 0.004635059089226443, "grad_norm": 0.1726392581657767, "learning_rate": 9.999504741380383e-05, "loss": 2.2718, "step": 596 }, { "epoch": 0.004642836034007024, "grad_norm": 0.1954093966383671, "learning_rate": 9.999503020268162e-05, "loss": 2.3342, "step": 597 }, { "epoch": 0.004650612978787606, "grad_norm": 0.18414475515974257, "learning_rate": 9.999501296170686e-05, "loss": 2.2792, "step": 598 }, { "epoch": 0.004658389923568187, "grad_norm": 0.1779724127525238, "learning_rate": 9.99949956908796e-05, "loss": 2.2943, "step": 599 }, { "epoch": 0.004666166868348768, "grad_norm": 0.1654532755577357, "learning_rate": 9.999497839019985e-05, "loss": 2.3764, "step": 600 }, { "epoch": 0.004673943813129349, "grad_norm": 0.20471625438108226, "learning_rate": 9.999496105966759e-05, "loss": 2.3114, "step": 601 }, { "epoch": 0.0046817207579099305, "grad_norm": 0.25886823638132805, "learning_rate": 9.999494369928285e-05, "loss": 2.3442, "step": 602 }, { "epoch": 0.004689497702690512, "grad_norm": 0.2526518770436192, "learning_rate": 9.999492630904563e-05, "loss": 2.311, "step": 603 }, { "epoch": 0.004697274647471093, "grad_norm": 0.17786834717527925, "learning_rate": 9.999490888895595e-05, "loss": 2.3395, "step": 604 }, { "epoch": 0.004705051592251674, "grad_norm": 0.18992106824351906, "learning_rate": 9.999489143901382e-05, "loss": 2.3195, "step": 605 }, { "epoch": 0.004712828537032256, "grad_norm": 0.2426880445467182, "learning_rate": 9.999487395921925e-05, "loss": 2.3158, "step": 606 }, { "epoch": 0.004720605481812837, "grad_norm": 0.2473503387037462, "learning_rate": 9.999485644957225e-05, "loss": 2.3404, "step": 607 }, { "epoch": 0.004728382426593418, "grad_norm": 0.20637230137758597, "learning_rate": 9.999483891007283e-05, "loss": 2.2864, "step": 608 }, { "epoch": 0.0047361593713739995, "grad_norm": 0.16989919310654691, "learning_rate": 9.999482134072099e-05, "loss": 2.2723, "step": 609 }, { "epoch": 0.004743936316154581, "grad_norm": 0.1965350576473374, "learning_rate": 9.999480374151675e-05, "loss": 2.324, "step": 610 }, { "epoch": 0.004751713260935162, "grad_norm": 0.19865853133702513, "learning_rate": 9.999478611246014e-05, "loss": 2.2647, "step": 611 }, { "epoch": 0.004759490205715743, "grad_norm": 0.20639088938442438, "learning_rate": 9.999476845355112e-05, "loss": 2.3295, "step": 612 }, { "epoch": 0.004767267150496325, "grad_norm": 0.25963374660054267, "learning_rate": 9.999475076478975e-05, "loss": 2.2838, "step": 613 }, { "epoch": 0.004775044095276906, "grad_norm": 0.2305064576032616, "learning_rate": 9.999473304617601e-05, "loss": 2.2712, "step": 614 }, { "epoch": 0.004782821040057487, "grad_norm": 0.15618291028541553, "learning_rate": 9.999471529770994e-05, "loss": 2.3412, "step": 615 }, { "epoch": 0.0047905979848380685, "grad_norm": 0.20792857679496315, "learning_rate": 9.999469751939152e-05, "loss": 2.3352, "step": 616 }, { "epoch": 0.00479837492961865, "grad_norm": 0.22313450316103947, "learning_rate": 9.999467971122077e-05, "loss": 2.3219, "step": 617 }, { "epoch": 0.004806151874399231, "grad_norm": 0.17864669183608084, "learning_rate": 9.999466187319772e-05, "loss": 2.365, "step": 618 }, { "epoch": 0.004813928819179812, "grad_norm": 0.2169726444830505, "learning_rate": 9.999464400532234e-05, "loss": 2.349, "step": 619 }, { "epoch": 0.004821705763960394, "grad_norm": 0.2578137728506735, "learning_rate": 9.999462610759469e-05, "loss": 2.3629, "step": 620 }, { "epoch": 0.004829482708740975, "grad_norm": 0.20471807024990962, "learning_rate": 9.999460818001475e-05, "loss": 2.2734, "step": 621 }, { "epoch": 0.004837259653521556, "grad_norm": 0.23281132249936468, "learning_rate": 9.999459022258252e-05, "loss": 2.3484, "step": 622 }, { "epoch": 0.0048450365983021375, "grad_norm": 0.2291078421447036, "learning_rate": 9.999457223529805e-05, "loss": 2.3169, "step": 623 }, { "epoch": 0.004852813543082719, "grad_norm": 0.24464687026823717, "learning_rate": 9.999455421816133e-05, "loss": 2.2704, "step": 624 }, { "epoch": 0.0048605904878633, "grad_norm": 0.18029923250092011, "learning_rate": 9.999453617117235e-05, "loss": 2.3224, "step": 625 }, { "epoch": 0.004868367432643881, "grad_norm": 0.22820127218528424, "learning_rate": 9.999451809433115e-05, "loss": 2.306, "step": 626 }, { "epoch": 0.004876144377424462, "grad_norm": 0.24654242882548233, "learning_rate": 9.999449998763774e-05, "loss": 2.3193, "step": 627 }, { "epoch": 0.004883921322205044, "grad_norm": 0.17200169817309496, "learning_rate": 9.99944818510921e-05, "loss": 2.3367, "step": 628 }, { "epoch": 0.004891698266985625, "grad_norm": 0.2087380818965298, "learning_rate": 9.999446368469427e-05, "loss": 2.3141, "step": 629 }, { "epoch": 0.0048994752117662065, "grad_norm": 0.25632571705756463, "learning_rate": 9.999444548844426e-05, "loss": 2.3327, "step": 630 }, { "epoch": 0.004907252156546788, "grad_norm": 0.2123316052641992, "learning_rate": 9.999442726234208e-05, "loss": 2.3479, "step": 631 }, { "epoch": 0.004915029101327369, "grad_norm": 0.17286411984753053, "learning_rate": 9.999440900638772e-05, "loss": 2.3615, "step": 632 }, { "epoch": 0.00492280604610795, "grad_norm": 0.19468490081664935, "learning_rate": 9.99943907205812e-05, "loss": 2.3022, "step": 633 }, { "epoch": 0.004930582990888531, "grad_norm": 0.19109351364684335, "learning_rate": 9.999437240492255e-05, "loss": 2.3218, "step": 634 }, { "epoch": 0.004938359935669113, "grad_norm": 0.16240559830390902, "learning_rate": 9.999435405941177e-05, "loss": 2.3158, "step": 635 }, { "epoch": 0.004946136880449694, "grad_norm": 0.14519187624330332, "learning_rate": 9.999433568404888e-05, "loss": 2.2804, "step": 636 }, { "epoch": 0.0049539138252302755, "grad_norm": 0.14557148997666625, "learning_rate": 9.999431727883387e-05, "loss": 2.2746, "step": 637 }, { "epoch": 0.004961690770010857, "grad_norm": 0.14951306353262367, "learning_rate": 9.999429884376674e-05, "loss": 2.2775, "step": 638 }, { "epoch": 0.004969467714791438, "grad_norm": 0.14674021390611494, "learning_rate": 9.999428037884753e-05, "loss": 2.278, "step": 639 }, { "epoch": 0.004977244659572019, "grad_norm": 0.14987745241655104, "learning_rate": 9.999426188407625e-05, "loss": 2.3084, "step": 640 }, { "epoch": 0.0049850216043526, "grad_norm": 0.2590715017705942, "learning_rate": 9.999424335945292e-05, "loss": 2.269, "step": 641 }, { "epoch": 0.004992798549133181, "grad_norm": 0.1455031395929321, "learning_rate": 9.999422480497751e-05, "loss": 2.2822, "step": 642 }, { "epoch": 0.005000575493913763, "grad_norm": 0.1562924786110806, "learning_rate": 9.999420622065006e-05, "loss": 2.2649, "step": 643 }, { "epoch": 0.0050083524386943445, "grad_norm": 0.1584298492804658, "learning_rate": 9.999418760647058e-05, "loss": 2.2786, "step": 644 }, { "epoch": 0.005016129383474926, "grad_norm": 0.14223074754311335, "learning_rate": 9.999416896243907e-05, "loss": 2.329, "step": 645 }, { "epoch": 0.005023906328255507, "grad_norm": 0.14605659533652068, "learning_rate": 9.999415028855557e-05, "loss": 2.2981, "step": 646 }, { "epoch": 0.005031683273036088, "grad_norm": 0.14587357107373206, "learning_rate": 9.999413158482004e-05, "loss": 2.3038, "step": 647 }, { "epoch": 0.005039460217816669, "grad_norm": 0.14054619827684567, "learning_rate": 9.999411285123254e-05, "loss": 2.3171, "step": 648 }, { "epoch": 0.00504723716259725, "grad_norm": 0.13855588118838638, "learning_rate": 9.999409408779308e-05, "loss": 2.2922, "step": 649 }, { "epoch": 0.005055014107377832, "grad_norm": 0.17388088860021297, "learning_rate": 9.999407529450162e-05, "loss": 2.2537, "step": 650 }, { "epoch": 0.0050627910521584135, "grad_norm": 0.2301798961317738, "learning_rate": 9.999405647135822e-05, "loss": 2.3529, "step": 651 }, { "epoch": 0.005070567996938995, "grad_norm": 0.25856044013682306, "learning_rate": 9.999403761836288e-05, "loss": 2.3449, "step": 652 }, { "epoch": 0.005078344941719576, "grad_norm": 0.2747831757638395, "learning_rate": 9.999401873551561e-05, "loss": 2.2669, "step": 653 }, { "epoch": 0.005086121886500157, "grad_norm": 0.21001387925048134, "learning_rate": 9.99939998228164e-05, "loss": 2.2931, "step": 654 }, { "epoch": 0.005093898831280738, "grad_norm": 0.15237699571954896, "learning_rate": 9.99939808802653e-05, "loss": 2.3169, "step": 655 }, { "epoch": 0.005101675776061319, "grad_norm": 0.18284561658086168, "learning_rate": 9.99939619078623e-05, "loss": 2.3939, "step": 656 }, { "epoch": 0.005109452720841901, "grad_norm": 0.23613953615269057, "learning_rate": 9.999394290560741e-05, "loss": 2.253, "step": 657 }, { "epoch": 0.0051172296656224825, "grad_norm": 0.24043097066875532, "learning_rate": 9.999392387350064e-05, "loss": 2.2781, "step": 658 }, { "epoch": 0.005125006610403064, "grad_norm": 0.17849299994552525, "learning_rate": 9.9993904811542e-05, "loss": 2.2932, "step": 659 }, { "epoch": 0.005132783555183645, "grad_norm": 0.1444413281030243, "learning_rate": 9.999388571973152e-05, "loss": 2.2278, "step": 660 }, { "epoch": 0.005140560499964226, "grad_norm": 0.2593014304255416, "learning_rate": 9.99938665980692e-05, "loss": 2.3032, "step": 661 }, { "epoch": 0.005148337444744807, "grad_norm": 0.30771255333783587, "learning_rate": 9.999384744655504e-05, "loss": 2.2995, "step": 662 }, { "epoch": 0.005156114389525388, "grad_norm": 0.2121127561081385, "learning_rate": 9.999382826518906e-05, "loss": 2.3072, "step": 663 }, { "epoch": 0.0051638913343059695, "grad_norm": 0.14716860402247997, "learning_rate": 9.999380905397128e-05, "loss": 2.2556, "step": 664 }, { "epoch": 0.0051716682790865515, "grad_norm": 0.26491726365551194, "learning_rate": 9.999378981290169e-05, "loss": 2.2565, "step": 665 }, { "epoch": 0.005179445223867133, "grad_norm": 0.26321776753301246, "learning_rate": 9.999377054198034e-05, "loss": 2.2851, "step": 666 }, { "epoch": 0.005187222168647714, "grad_norm": 0.14720293285668098, "learning_rate": 9.99937512412072e-05, "loss": 2.2845, "step": 667 }, { "epoch": 0.005194999113428295, "grad_norm": 0.30886010008808784, "learning_rate": 9.99937319105823e-05, "loss": 2.3066, "step": 668 }, { "epoch": 0.005202776058208876, "grad_norm": 0.3708155075774891, "learning_rate": 9.999371255010566e-05, "loss": 2.3281, "step": 669 }, { "epoch": 0.005210553002989457, "grad_norm": 0.9672133042511197, "learning_rate": 9.999369315977727e-05, "loss": 2.3374, "step": 670 }, { "epoch": 0.0052183299477700385, "grad_norm": 0.2552777423592813, "learning_rate": 9.999367373959716e-05, "loss": 2.2867, "step": 671 }, { "epoch": 0.0052261068925506205, "grad_norm": 0.38034275242877275, "learning_rate": 9.999365428956532e-05, "loss": 2.2661, "step": 672 }, { "epoch": 0.005233883837331202, "grad_norm": 0.34357711925476647, "learning_rate": 9.999363480968178e-05, "loss": 2.3179, "step": 673 }, { "epoch": 0.005241660782111783, "grad_norm": 0.2759303425703033, "learning_rate": 9.999361529994656e-05, "loss": 2.2785, "step": 674 }, { "epoch": 0.005249437726892364, "grad_norm": 0.25958967160802, "learning_rate": 9.999359576035965e-05, "loss": 2.2919, "step": 675 }, { "epoch": 0.005257214671672945, "grad_norm": 0.21376074828304645, "learning_rate": 9.99935761909211e-05, "loss": 2.3288, "step": 676 }, { "epoch": 0.005264991616453526, "grad_norm": 0.280911491926753, "learning_rate": 9.999355659163086e-05, "loss": 2.2824, "step": 677 }, { "epoch": 0.0052727685612341075, "grad_norm": 0.2717442765260637, "learning_rate": 9.999353696248899e-05, "loss": 2.294, "step": 678 }, { "epoch": 0.005280545506014689, "grad_norm": 0.20172832759517462, "learning_rate": 9.999351730349547e-05, "loss": 2.2997, "step": 679 }, { "epoch": 0.005288322450795271, "grad_norm": 0.3241645846725689, "learning_rate": 9.999349761465035e-05, "loss": 2.3541, "step": 680 }, { "epoch": 0.005296099395575852, "grad_norm": 0.2783821265737767, "learning_rate": 9.999347789595361e-05, "loss": 2.2327, "step": 681 }, { "epoch": 0.005303876340356433, "grad_norm": 0.19336723438832754, "learning_rate": 9.999345814740526e-05, "loss": 2.3172, "step": 682 }, { "epoch": 0.005311653285137014, "grad_norm": 0.33513884567436364, "learning_rate": 9.999343836900534e-05, "loss": 2.2249, "step": 683 }, { "epoch": 0.005319430229917595, "grad_norm": 0.2803881255776657, "learning_rate": 9.999341856075384e-05, "loss": 2.2863, "step": 684 }, { "epoch": 0.0053272071746981765, "grad_norm": 0.1802476702256254, "learning_rate": 9.999339872265078e-05, "loss": 2.2828, "step": 685 }, { "epoch": 0.005334984119478758, "grad_norm": 0.8029661985920227, "learning_rate": 9.999337885469617e-05, "loss": 2.3629, "step": 686 }, { "epoch": 0.00534276106425934, "grad_norm": 2.0487704192301126, "learning_rate": 9.999335895689001e-05, "loss": 2.2553, "step": 687 }, { "epoch": 0.005350538009039921, "grad_norm": 0.38928492213878324, "learning_rate": 9.999333902923233e-05, "loss": 2.3291, "step": 688 }, { "epoch": 0.005358314953820502, "grad_norm": 0.5748089327073876, "learning_rate": 9.999331907172313e-05, "loss": 2.2733, "step": 689 }, { "epoch": 0.005366091898601083, "grad_norm": 0.59256641996068, "learning_rate": 9.999329908436244e-05, "loss": 2.2717, "step": 690 }, { "epoch": 0.005373868843381664, "grad_norm": 0.5493380060500375, "learning_rate": 9.999327906715025e-05, "loss": 2.2945, "step": 691 }, { "epoch": 0.0053816457881622455, "grad_norm": 0.3503095183839453, "learning_rate": 9.999325902008658e-05, "loss": 2.2975, "step": 692 }, { "epoch": 0.005389422732942827, "grad_norm": 0.24446578646244402, "learning_rate": 9.999323894317146e-05, "loss": 2.273, "step": 693 }, { "epoch": 0.005397199677723408, "grad_norm": 0.4408427807088831, "learning_rate": 9.999321883640485e-05, "loss": 2.2905, "step": 694 }, { "epoch": 0.00540497662250399, "grad_norm": 0.528251006518951, "learning_rate": 9.999319869978681e-05, "loss": 2.3364, "step": 695 }, { "epoch": 0.005412753567284571, "grad_norm": 0.42282762558829406, "learning_rate": 9.999317853331735e-05, "loss": 2.3142, "step": 696 }, { "epoch": 0.005420530512065152, "grad_norm": 0.246815125109041, "learning_rate": 9.999315833699646e-05, "loss": 2.2855, "step": 697 }, { "epoch": 0.005428307456845733, "grad_norm": 0.2106368423099327, "learning_rate": 9.999313811082417e-05, "loss": 2.3407, "step": 698 }, { "epoch": 0.0054360844016263146, "grad_norm": 0.2929567292343542, "learning_rate": 9.999311785480048e-05, "loss": 2.2985, "step": 699 }, { "epoch": 0.005443861346406896, "grad_norm": 0.2864821760654291, "learning_rate": 9.999309756892541e-05, "loss": 2.2789, "step": 700 }, { "epoch": 0.005451638291187477, "grad_norm": 0.16821317484767312, "learning_rate": 9.999307725319896e-05, "loss": 2.2764, "step": 701 }, { "epoch": 0.005459415235968059, "grad_norm": 0.5669821963474783, "learning_rate": 9.999305690762117e-05, "loss": 2.3291, "step": 702 }, { "epoch": 0.00546719218074864, "grad_norm": 0.3858468958635142, "learning_rate": 9.9993036532192e-05, "loss": 2.2636, "step": 703 }, { "epoch": 0.005474969125529221, "grad_norm": 0.1969042047427921, "learning_rate": 9.999301612691154e-05, "loss": 2.2883, "step": 704 }, { "epoch": 0.005482746070309802, "grad_norm": 0.3726403313495079, "learning_rate": 9.999299569177972e-05, "loss": 2.2666, "step": 705 }, { "epoch": 0.0054905230150903836, "grad_norm": 0.29636321755007317, "learning_rate": 9.99929752267966e-05, "loss": 2.3121, "step": 706 }, { "epoch": 0.005498299959870965, "grad_norm": 0.23451287824801756, "learning_rate": 9.999295473196219e-05, "loss": 2.2751, "step": 707 }, { "epoch": 0.005506076904651546, "grad_norm": 0.3247535560870617, "learning_rate": 9.999293420727649e-05, "loss": 2.3266, "step": 708 }, { "epoch": 0.005513853849432128, "grad_norm": 0.19222421095510459, "learning_rate": 9.999291365273951e-05, "loss": 2.2731, "step": 709 }, { "epoch": 0.005521630794212709, "grad_norm": 0.2761013399943059, "learning_rate": 9.999289306835129e-05, "loss": 2.3318, "step": 710 }, { "epoch": 0.00552940773899329, "grad_norm": 0.1767203939907259, "learning_rate": 9.999287245411179e-05, "loss": 2.2685, "step": 711 }, { "epoch": 0.005537184683773871, "grad_norm": 0.20652690568871332, "learning_rate": 9.999285181002107e-05, "loss": 2.3056, "step": 712 }, { "epoch": 0.005544961628554453, "grad_norm": 0.20530009464060106, "learning_rate": 9.999283113607914e-05, "loss": 2.3045, "step": 713 }, { "epoch": 0.005552738573335034, "grad_norm": 0.14554498363095456, "learning_rate": 9.999281043228596e-05, "loss": 2.2608, "step": 714 }, { "epoch": 0.005560515518115615, "grad_norm": 0.1936816183928132, "learning_rate": 9.99927896986416e-05, "loss": 2.3481, "step": 715 }, { "epoch": 0.005568292462896196, "grad_norm": 0.2040376768378784, "learning_rate": 9.999276893514608e-05, "loss": 2.3551, "step": 716 }, { "epoch": 0.005576069407676778, "grad_norm": 0.15210080645204563, "learning_rate": 9.999274814179935e-05, "loss": 2.2724, "step": 717 }, { "epoch": 0.005583846352457359, "grad_norm": 0.1514845381798207, "learning_rate": 9.999272731860148e-05, "loss": 2.2993, "step": 718 }, { "epoch": 0.00559162329723794, "grad_norm": 0.1670256622436606, "learning_rate": 9.999270646555243e-05, "loss": 2.2897, "step": 719 }, { "epoch": 0.005599400242018522, "grad_norm": 0.20981156297434722, "learning_rate": 9.999268558265227e-05, "loss": 2.3348, "step": 720 }, { "epoch": 0.005607177186799103, "grad_norm": 0.18084781194858182, "learning_rate": 9.999266466990099e-05, "loss": 2.3143, "step": 721 }, { "epoch": 0.005614954131579684, "grad_norm": 0.16774096052602647, "learning_rate": 9.999264372729857e-05, "loss": 2.2949, "step": 722 }, { "epoch": 0.005622731076360265, "grad_norm": 0.15093748160865797, "learning_rate": 9.999262275484506e-05, "loss": 2.2832, "step": 723 }, { "epoch": 0.005630508021140847, "grad_norm": 0.18260559330188245, "learning_rate": 9.999260175254046e-05, "loss": 2.2916, "step": 724 }, { "epoch": 0.005638284965921428, "grad_norm": 0.1524325298602502, "learning_rate": 9.99925807203848e-05, "loss": 2.3321, "step": 725 }, { "epoch": 0.005646061910702009, "grad_norm": 0.1754397837876983, "learning_rate": 9.999255965837806e-05, "loss": 2.3163, "step": 726 }, { "epoch": 0.005653838855482591, "grad_norm": 0.20937020727499814, "learning_rate": 9.999253856652029e-05, "loss": 2.2623, "step": 727 }, { "epoch": 0.005661615800263172, "grad_norm": 0.151190777521521, "learning_rate": 9.999251744481146e-05, "loss": 2.2574, "step": 728 }, { "epoch": 0.005669392745043753, "grad_norm": 0.14379181276437242, "learning_rate": 9.999249629325163e-05, "loss": 2.23, "step": 729 }, { "epoch": 0.005677169689824334, "grad_norm": 0.1416591618199621, "learning_rate": 9.999247511184077e-05, "loss": 2.2925, "step": 730 }, { "epoch": 0.005684946634604915, "grad_norm": 0.13259086338429618, "learning_rate": 9.999245390057891e-05, "loss": 2.2276, "step": 731 }, { "epoch": 0.005692723579385497, "grad_norm": 0.14688366265019917, "learning_rate": 9.999243265946608e-05, "loss": 2.3503, "step": 732 }, { "epoch": 0.0057005005241660784, "grad_norm": 0.13147625550771472, "learning_rate": 9.999241138850224e-05, "loss": 2.2548, "step": 733 }, { "epoch": 0.00570827746894666, "grad_norm": 0.20279769236012632, "learning_rate": 9.999239008768747e-05, "loss": 2.3091, "step": 734 }, { "epoch": 0.005716054413727241, "grad_norm": 0.1367350476242964, "learning_rate": 9.999236875702175e-05, "loss": 2.2358, "step": 735 }, { "epoch": 0.005723831358507822, "grad_norm": 0.13819184861515643, "learning_rate": 9.999234739650508e-05, "loss": 2.2593, "step": 736 }, { "epoch": 0.005731608303288403, "grad_norm": 0.5231647075364745, "learning_rate": 9.99923260061375e-05, "loss": 2.3112, "step": 737 }, { "epoch": 0.005739385248068984, "grad_norm": 0.14610938775069504, "learning_rate": 9.9992304585919e-05, "loss": 2.2787, "step": 738 }, { "epoch": 0.005747162192849566, "grad_norm": 0.13753186815644722, "learning_rate": 9.99922831358496e-05, "loss": 2.2484, "step": 739 }, { "epoch": 0.0057549391376301474, "grad_norm": 0.14544444900231393, "learning_rate": 9.999226165592933e-05, "loss": 2.2529, "step": 740 }, { "epoch": 0.005762716082410729, "grad_norm": 0.14987338834496092, "learning_rate": 9.999224014615818e-05, "loss": 2.2889, "step": 741 }, { "epoch": 0.00577049302719131, "grad_norm": 0.1516914360458625, "learning_rate": 9.999221860653617e-05, "loss": 2.2816, "step": 742 }, { "epoch": 0.005778269971971891, "grad_norm": 0.14293321250249003, "learning_rate": 9.99921970370633e-05, "loss": 2.3076, "step": 743 }, { "epoch": 0.005786046916752472, "grad_norm": 0.155532030714047, "learning_rate": 9.999217543773963e-05, "loss": 2.3593, "step": 744 }, { "epoch": 0.005793823861533053, "grad_norm": 0.16226697649870264, "learning_rate": 9.999215380856511e-05, "loss": 2.3413, "step": 745 }, { "epoch": 0.005801600806313634, "grad_norm": 0.14208741236110561, "learning_rate": 9.999213214953979e-05, "loss": 2.3386, "step": 746 }, { "epoch": 0.0058093777510942165, "grad_norm": 0.1656831561067922, "learning_rate": 9.999211046066367e-05, "loss": 2.2701, "step": 747 }, { "epoch": 0.005817154695874798, "grad_norm": 0.15210182641190653, "learning_rate": 9.99920887419368e-05, "loss": 2.2988, "step": 748 }, { "epoch": 0.005824931640655379, "grad_norm": 0.14253325854402424, "learning_rate": 9.999206699335913e-05, "loss": 2.2093, "step": 749 }, { "epoch": 0.00583270858543596, "grad_norm": 0.1454693263475155, "learning_rate": 9.99920452149307e-05, "loss": 2.2438, "step": 750 }, { "epoch": 0.005840485530216541, "grad_norm": 0.2304396919725454, "learning_rate": 9.999202340665155e-05, "loss": 2.3401, "step": 751 }, { "epoch": 0.005848262474997122, "grad_norm": 0.15486142580812945, "learning_rate": 9.999200156852167e-05, "loss": 2.2678, "step": 752 }, { "epoch": 0.005856039419777703, "grad_norm": 0.15149420553830936, "learning_rate": 9.999197970054106e-05, "loss": 2.2293, "step": 753 }, { "epoch": 0.0058638163645582855, "grad_norm": 0.24564068471725237, "learning_rate": 9.999195780270974e-05, "loss": 2.3476, "step": 754 }, { "epoch": 0.005871593309338867, "grad_norm": 0.17682745473201927, "learning_rate": 9.999193587502774e-05, "loss": 2.2848, "step": 755 }, { "epoch": 0.005879370254119448, "grad_norm": 0.14814806498680627, "learning_rate": 9.999191391749507e-05, "loss": 2.2925, "step": 756 }, { "epoch": 0.005887147198900029, "grad_norm": 0.17897939329506066, "learning_rate": 9.999189193011172e-05, "loss": 2.275, "step": 757 }, { "epoch": 0.00589492414368061, "grad_norm": 0.17344491505243623, "learning_rate": 9.999186991287772e-05, "loss": 2.2889, "step": 758 }, { "epoch": 0.005902701088461191, "grad_norm": 0.1482273791533416, "learning_rate": 9.99918478657931e-05, "loss": 2.2747, "step": 759 }, { "epoch": 0.0059104780332417724, "grad_norm": 0.31903047398503936, "learning_rate": 9.999182578885784e-05, "loss": 2.3273, "step": 760 }, { "epoch": 0.0059182549780223545, "grad_norm": 0.17259524438546428, "learning_rate": 9.999180368207197e-05, "loss": 2.3576, "step": 761 }, { "epoch": 0.005926031922802936, "grad_norm": 0.14565669135392129, "learning_rate": 9.99917815454355e-05, "loss": 2.2302, "step": 762 }, { "epoch": 0.005933808867583517, "grad_norm": 0.14911223909138813, "learning_rate": 9.999175937894845e-05, "loss": 2.2964, "step": 763 }, { "epoch": 0.005941585812364098, "grad_norm": 0.14239454488910627, "learning_rate": 9.999173718261082e-05, "loss": 2.2458, "step": 764 }, { "epoch": 0.005949362757144679, "grad_norm": 0.1575905412266833, "learning_rate": 9.999171495642264e-05, "loss": 2.3366, "step": 765 }, { "epoch": 0.00595713970192526, "grad_norm": 0.18068850028199493, "learning_rate": 9.999169270038391e-05, "loss": 2.2638, "step": 766 }, { "epoch": 0.0059649166467058415, "grad_norm": 0.14742698256238979, "learning_rate": 9.999167041449464e-05, "loss": 2.3128, "step": 767 }, { "epoch": 0.005972693591486423, "grad_norm": 0.18928885700221088, "learning_rate": 9.999164809875488e-05, "loss": 2.2949, "step": 768 }, { "epoch": 0.005980470536267005, "grad_norm": 0.1428351970496084, "learning_rate": 9.999162575316459e-05, "loss": 2.3063, "step": 769 }, { "epoch": 0.005988247481047586, "grad_norm": 0.14982453942464308, "learning_rate": 9.999160337772381e-05, "loss": 2.27, "step": 770 }, { "epoch": 0.005996024425828167, "grad_norm": 0.142340702777197, "learning_rate": 9.999158097243256e-05, "loss": 2.277, "step": 771 }, { "epoch": 0.006003801370608748, "grad_norm": 2.069133357038432, "learning_rate": 9.999155853729083e-05, "loss": 2.2868, "step": 772 }, { "epoch": 0.006011578315389329, "grad_norm": 0.1581128049680232, "learning_rate": 9.999153607229865e-05, "loss": 2.2099, "step": 773 }, { "epoch": 0.0060193552601699105, "grad_norm": 0.1703887267312782, "learning_rate": 9.999151357745605e-05, "loss": 2.2859, "step": 774 }, { "epoch": 0.006027132204950492, "grad_norm": 0.1572405085665989, "learning_rate": 9.9991491052763e-05, "loss": 2.2871, "step": 775 }, { "epoch": 0.006034909149731074, "grad_norm": 0.1849731776122652, "learning_rate": 9.999146849821955e-05, "loss": 2.2884, "step": 776 }, { "epoch": 0.006042686094511655, "grad_norm": 0.15804941584233315, "learning_rate": 9.999144591382569e-05, "loss": 2.3114, "step": 777 }, { "epoch": 0.006050463039292236, "grad_norm": 0.16398897495213785, "learning_rate": 9.999142329958146e-05, "loss": 2.3249, "step": 778 }, { "epoch": 0.006058239984072817, "grad_norm": 0.16274613550811148, "learning_rate": 9.999140065548687e-05, "loss": 2.2989, "step": 779 }, { "epoch": 0.006066016928853398, "grad_norm": 0.20381258007168035, "learning_rate": 9.99913779815419e-05, "loss": 2.2689, "step": 780 }, { "epoch": 0.0060737938736339795, "grad_norm": 0.14490927797992956, "learning_rate": 9.999135527774659e-05, "loss": 2.297, "step": 781 }, { "epoch": 0.006081570818414561, "grad_norm": 0.15299732235904676, "learning_rate": 9.999133254410095e-05, "loss": 2.294, "step": 782 }, { "epoch": 0.006089347763195142, "grad_norm": 0.15353467741968235, "learning_rate": 9.9991309780605e-05, "loss": 2.2643, "step": 783 }, { "epoch": 0.006097124707975724, "grad_norm": 0.14943063600408182, "learning_rate": 9.999128698725873e-05, "loss": 2.289, "step": 784 }, { "epoch": 0.006104901652756305, "grad_norm": 0.1671047529216166, "learning_rate": 9.999126416406219e-05, "loss": 2.3697, "step": 785 }, { "epoch": 0.006112678597536886, "grad_norm": 0.15690906827283488, "learning_rate": 9.999124131101536e-05, "loss": 2.2971, "step": 786 }, { "epoch": 0.006120455542317467, "grad_norm": 0.15507075188678118, "learning_rate": 9.999121842811827e-05, "loss": 2.2757, "step": 787 }, { "epoch": 0.0061282324870980485, "grad_norm": 0.16052789908864537, "learning_rate": 9.999119551537094e-05, "loss": 2.2606, "step": 788 }, { "epoch": 0.00613600943187863, "grad_norm": 0.23638016461681083, "learning_rate": 9.999117257277337e-05, "loss": 2.3101, "step": 789 }, { "epoch": 0.006143786376659211, "grad_norm": 0.29139129826576643, "learning_rate": 9.999114960032557e-05, "loss": 2.2795, "step": 790 }, { "epoch": 0.006151563321439793, "grad_norm": 0.24544304912175444, "learning_rate": 9.999112659802758e-05, "loss": 2.2484, "step": 791 }, { "epoch": 0.006159340266220374, "grad_norm": 0.147141755711408, "learning_rate": 9.999110356587937e-05, "loss": 2.27, "step": 792 }, { "epoch": 0.006167117211000955, "grad_norm": 0.22130598271191348, "learning_rate": 9.9991080503881e-05, "loss": 2.2601, "step": 793 }, { "epoch": 0.006174894155781536, "grad_norm": 0.2000655965560728, "learning_rate": 9.999105741203246e-05, "loss": 2.2179, "step": 794 }, { "epoch": 0.0061826711005621175, "grad_norm": 0.13193660410733546, "learning_rate": 9.999103429033377e-05, "loss": 2.2537, "step": 795 }, { "epoch": 0.006190448045342699, "grad_norm": 0.21093984240660313, "learning_rate": 9.999101113878493e-05, "loss": 2.2241, "step": 796 }, { "epoch": 0.00619822499012328, "grad_norm": 2.8655882657332157, "learning_rate": 9.999098795738597e-05, "loss": 2.2576, "step": 797 }, { "epoch": 0.006206001934903861, "grad_norm": 0.3125174311463441, "learning_rate": 9.99909647461369e-05, "loss": 2.2946, "step": 798 }, { "epoch": 0.006213778879684443, "grad_norm": 0.39667860185567283, "learning_rate": 9.999094150503772e-05, "loss": 2.3175, "step": 799 }, { "epoch": 0.006221555824465024, "grad_norm": 0.27027173378992264, "learning_rate": 9.999091823408847e-05, "loss": 2.286, "step": 800 }, { "epoch": 0.006229332769245605, "grad_norm": 0.37214136738068904, "learning_rate": 9.999089493328915e-05, "loss": 2.3349, "step": 801 }, { "epoch": 0.0062371097140261865, "grad_norm": 0.34056035720226513, "learning_rate": 9.999087160263976e-05, "loss": 2.3093, "step": 802 }, { "epoch": 0.006244886658806768, "grad_norm": 0.18547267473564852, "learning_rate": 9.999084824214034e-05, "loss": 2.2982, "step": 803 }, { "epoch": 0.006252663603587349, "grad_norm": 0.4544432139184431, "learning_rate": 9.99908248517909e-05, "loss": 2.2554, "step": 804 }, { "epoch": 0.00626044054836793, "grad_norm": 0.32094439910904404, "learning_rate": 9.999080143159143e-05, "loss": 2.213, "step": 805 }, { "epoch": 0.006268217493148512, "grad_norm": 0.25864948911143676, "learning_rate": 9.999077798154198e-05, "loss": 2.2678, "step": 806 }, { "epoch": 0.006275994437929093, "grad_norm": 0.329565770134202, "learning_rate": 9.999075450164253e-05, "loss": 2.315, "step": 807 }, { "epoch": 0.006283771382709674, "grad_norm": 0.19758663420737965, "learning_rate": 9.99907309918931e-05, "loss": 2.2536, "step": 808 }, { "epoch": 0.0062915483274902555, "grad_norm": 0.22950708470192727, "learning_rate": 9.999070745229372e-05, "loss": 2.2992, "step": 809 }, { "epoch": 0.006299325272270837, "grad_norm": 0.15815771644723073, "learning_rate": 9.999068388284441e-05, "loss": 2.2942, "step": 810 }, { "epoch": 0.006307102217051418, "grad_norm": 0.3616240405994263, "learning_rate": 9.999066028354515e-05, "loss": 2.3402, "step": 811 }, { "epoch": 0.006314879161831999, "grad_norm": 0.18489508754457246, "learning_rate": 9.999063665439597e-05, "loss": 2.262, "step": 812 }, { "epoch": 0.006322656106612581, "grad_norm": 0.19642682651342733, "learning_rate": 9.99906129953969e-05, "loss": 2.2838, "step": 813 }, { "epoch": 0.006330433051393162, "grad_norm": 0.20317119271953313, "learning_rate": 9.999058930654795e-05, "loss": 2.2376, "step": 814 }, { "epoch": 0.006338209996173743, "grad_norm": 0.14100614042640086, "learning_rate": 9.999056558784912e-05, "loss": 2.2472, "step": 815 }, { "epoch": 0.0063459869409543245, "grad_norm": 0.20500310432717544, "learning_rate": 9.999054183930043e-05, "loss": 2.2646, "step": 816 }, { "epoch": 0.006353763885734906, "grad_norm": 0.14099119758281128, "learning_rate": 9.99905180609019e-05, "loss": 2.2206, "step": 817 }, { "epoch": 0.006361540830515487, "grad_norm": 0.18861440354442782, "learning_rate": 9.999049425265352e-05, "loss": 2.2875, "step": 818 }, { "epoch": 0.006369317775296068, "grad_norm": 0.17663124611913103, "learning_rate": 9.999047041455534e-05, "loss": 2.3047, "step": 819 }, { "epoch": 0.006377094720076649, "grad_norm": 0.15386983628682638, "learning_rate": 9.999044654660736e-05, "loss": 2.2591, "step": 820 }, { "epoch": 0.006384871664857231, "grad_norm": 0.1864262238820932, "learning_rate": 9.999042264880958e-05, "loss": 2.2769, "step": 821 }, { "epoch": 0.006392648609637812, "grad_norm": 0.19656066686588733, "learning_rate": 9.999039872116203e-05, "loss": 2.309, "step": 822 }, { "epoch": 0.0064004255544183935, "grad_norm": 0.1442734115422272, "learning_rate": 9.999037476366473e-05, "loss": 2.2835, "step": 823 }, { "epoch": 0.006408202499198975, "grad_norm": 0.1664862166485466, "learning_rate": 9.999035077631767e-05, "loss": 2.207, "step": 824 }, { "epoch": 0.006415979443979556, "grad_norm": 0.12703229886504308, "learning_rate": 9.999032675912088e-05, "loss": 2.3177, "step": 825 }, { "epoch": 0.006423756388760137, "grad_norm": 0.19776743486903106, "learning_rate": 9.999030271207438e-05, "loss": 2.2914, "step": 826 }, { "epoch": 0.006431533333540718, "grad_norm": 0.1884333304147124, "learning_rate": 9.999027863517819e-05, "loss": 2.2838, "step": 827 }, { "epoch": 0.0064393102783213, "grad_norm": 0.13220480031792622, "learning_rate": 9.999025452843229e-05, "loss": 2.2716, "step": 828 }, { "epoch": 0.006447087223101881, "grad_norm": 0.19778782286256158, "learning_rate": 9.999023039183674e-05, "loss": 2.2694, "step": 829 }, { "epoch": 0.0064548641678824625, "grad_norm": 0.1362893116357139, "learning_rate": 9.999020622539151e-05, "loss": 2.2641, "step": 830 }, { "epoch": 0.006462641112663044, "grad_norm": 0.17603128015961836, "learning_rate": 9.999018202909664e-05, "loss": 2.2751, "step": 831 }, { "epoch": 0.006470418057443625, "grad_norm": 0.17615439935379051, "learning_rate": 9.999015780295214e-05, "loss": 2.2608, "step": 832 }, { "epoch": 0.006478195002224206, "grad_norm": 0.13678075864558145, "learning_rate": 9.999013354695803e-05, "loss": 2.2468, "step": 833 }, { "epoch": 0.006485971947004787, "grad_norm": 0.21100460325483483, "learning_rate": 9.999010926111431e-05, "loss": 2.2574, "step": 834 }, { "epoch": 0.006493748891785368, "grad_norm": 0.1595128170635357, "learning_rate": 9.999008494542101e-05, "loss": 2.2743, "step": 835 }, { "epoch": 0.00650152583656595, "grad_norm": 0.16187043397015097, "learning_rate": 9.999006059987814e-05, "loss": 2.2662, "step": 836 }, { "epoch": 0.0065093027813465315, "grad_norm": 0.3032932494638125, "learning_rate": 9.99900362244857e-05, "loss": 2.2982, "step": 837 }, { "epoch": 0.006517079726127113, "grad_norm": 0.16838275158928886, "learning_rate": 9.999001181924374e-05, "loss": 2.2635, "step": 838 }, { "epoch": 0.006524856670907694, "grad_norm": 0.1604358751210722, "learning_rate": 9.998998738415224e-05, "loss": 2.2189, "step": 839 }, { "epoch": 0.006532633615688275, "grad_norm": 0.15461463692359978, "learning_rate": 9.998996291921121e-05, "loss": 2.2637, "step": 840 }, { "epoch": 0.006540410560468856, "grad_norm": 0.13808616341084026, "learning_rate": 9.99899384244207e-05, "loss": 2.2979, "step": 841 }, { "epoch": 0.006548187505249437, "grad_norm": 0.17633784277433107, "learning_rate": 9.99899138997807e-05, "loss": 2.236, "step": 842 }, { "epoch": 0.006555964450030019, "grad_norm": 0.16972426302695154, "learning_rate": 9.998988934529124e-05, "loss": 2.3263, "step": 843 }, { "epoch": 0.0065637413948106006, "grad_norm": 0.14232119441829355, "learning_rate": 9.998986476095231e-05, "loss": 2.2101, "step": 844 }, { "epoch": 0.006571518339591182, "grad_norm": 0.15139989055252795, "learning_rate": 9.998984014676393e-05, "loss": 2.3262, "step": 845 }, { "epoch": 0.006579295284371763, "grad_norm": 0.1392602432252553, "learning_rate": 9.998981550272615e-05, "loss": 2.2591, "step": 846 }, { "epoch": 0.006587072229152344, "grad_norm": 0.14249549680828383, "learning_rate": 9.998979082883893e-05, "loss": 2.2955, "step": 847 }, { "epoch": 0.006594849173932925, "grad_norm": 0.13808621225273993, "learning_rate": 9.998976612510233e-05, "loss": 2.2271, "step": 848 }, { "epoch": 0.006602626118713506, "grad_norm": 0.13997812127182138, "learning_rate": 9.998974139151636e-05, "loss": 2.1969, "step": 849 }, { "epoch": 0.006610403063494088, "grad_norm": 0.12624676380471195, "learning_rate": 9.998971662808101e-05, "loss": 2.2321, "step": 850 }, { "epoch": 0.0066181800082746696, "grad_norm": 0.1336224848744023, "learning_rate": 9.99896918347963e-05, "loss": 2.2598, "step": 851 }, { "epoch": 0.006625956953055251, "grad_norm": 0.12911487225910537, "learning_rate": 9.998966701166226e-05, "loss": 2.2361, "step": 852 }, { "epoch": 0.006633733897835832, "grad_norm": 0.13085919060374843, "learning_rate": 9.99896421586789e-05, "loss": 2.2351, "step": 853 }, { "epoch": 0.006641510842616413, "grad_norm": 0.12976793527701658, "learning_rate": 9.998961727584622e-05, "loss": 2.2215, "step": 854 }, { "epoch": 0.006649287787396994, "grad_norm": 0.12392508697516527, "learning_rate": 9.998959236316427e-05, "loss": 2.3004, "step": 855 }, { "epoch": 0.006657064732177575, "grad_norm": 0.137354043850675, "learning_rate": 9.998956742063302e-05, "loss": 2.3026, "step": 856 }, { "epoch": 0.0066648416769581565, "grad_norm": 0.1300583591093223, "learning_rate": 9.998954244825252e-05, "loss": 2.2847, "step": 857 }, { "epoch": 0.006672618621738739, "grad_norm": 0.128301951310384, "learning_rate": 9.998951744602276e-05, "loss": 2.3136, "step": 858 }, { "epoch": 0.00668039556651932, "grad_norm": 0.14158567000150304, "learning_rate": 9.998949241394378e-05, "loss": 2.3173, "step": 859 }, { "epoch": 0.006688172511299901, "grad_norm": 0.13111656725509313, "learning_rate": 9.998946735201557e-05, "loss": 2.278, "step": 860 }, { "epoch": 0.006695949456080482, "grad_norm": 0.12793199533456945, "learning_rate": 9.998944226023816e-05, "loss": 2.2916, "step": 861 }, { "epoch": 0.006703726400861063, "grad_norm": 0.1288395187745778, "learning_rate": 9.998941713861156e-05, "loss": 2.2471, "step": 862 }, { "epoch": 0.006711503345641644, "grad_norm": 0.12129195695043743, "learning_rate": 9.998939198713579e-05, "loss": 2.2127, "step": 863 }, { "epoch": 0.0067192802904222255, "grad_norm": 0.7537185227963001, "learning_rate": 9.998936680581086e-05, "loss": 2.2891, "step": 864 }, { "epoch": 0.006727057235202808, "grad_norm": 0.14141213525540694, "learning_rate": 9.99893415946368e-05, "loss": 2.2563, "step": 865 }, { "epoch": 0.006734834179983389, "grad_norm": 0.15133515391897984, "learning_rate": 9.998931635361359e-05, "loss": 2.2231, "step": 866 }, { "epoch": 0.00674261112476397, "grad_norm": 0.15914588183144254, "learning_rate": 9.998929108274128e-05, "loss": 2.3139, "step": 867 }, { "epoch": 0.006750388069544551, "grad_norm": 0.15051950075329384, "learning_rate": 9.998926578201985e-05, "loss": 2.2437, "step": 868 }, { "epoch": 0.006758165014325132, "grad_norm": 0.15461433226835208, "learning_rate": 9.998924045144937e-05, "loss": 2.2802, "step": 869 }, { "epoch": 0.006765941959105713, "grad_norm": 0.25606764759245654, "learning_rate": 9.99892150910298e-05, "loss": 2.3104, "step": 870 }, { "epoch": 0.0067737189038862946, "grad_norm": 0.16667835941808445, "learning_rate": 9.998918970076118e-05, "loss": 2.2594, "step": 871 }, { "epoch": 0.006781495848666876, "grad_norm": 0.18111249222018663, "learning_rate": 9.998916428064353e-05, "loss": 2.3192, "step": 872 }, { "epoch": 0.006789272793447458, "grad_norm": 0.14474254464603495, "learning_rate": 9.998913883067686e-05, "loss": 2.2521, "step": 873 }, { "epoch": 0.006797049738228039, "grad_norm": 0.2523605147588283, "learning_rate": 9.998911335086118e-05, "loss": 2.277, "step": 874 }, { "epoch": 0.00680482668300862, "grad_norm": 0.17147523285672978, "learning_rate": 9.99890878411965e-05, "loss": 2.3016, "step": 875 }, { "epoch": 0.006812603627789201, "grad_norm": 0.19111789504897497, "learning_rate": 9.998906230168285e-05, "loss": 2.2432, "step": 876 }, { "epoch": 0.006820380572569782, "grad_norm": 0.2083748333682898, "learning_rate": 9.998903673232022e-05, "loss": 2.2997, "step": 877 }, { "epoch": 0.006828157517350364, "grad_norm": 0.45260096117010423, "learning_rate": 9.998901113310868e-05, "loss": 2.296, "step": 878 }, { "epoch": 0.006835934462130945, "grad_norm": 0.14883823906535873, "learning_rate": 9.998898550404818e-05, "loss": 2.2967, "step": 879 }, { "epoch": 0.006843711406911527, "grad_norm": 0.16090929217326738, "learning_rate": 9.998895984513877e-05, "loss": 2.2921, "step": 880 }, { "epoch": 0.006851488351692108, "grad_norm": 0.13432259229917204, "learning_rate": 9.998893415638047e-05, "loss": 2.2066, "step": 881 }, { "epoch": 0.006859265296472689, "grad_norm": 0.14523432844629147, "learning_rate": 9.998890843777327e-05, "loss": 2.3023, "step": 882 }, { "epoch": 0.00686704224125327, "grad_norm": 0.13564150138272948, "learning_rate": 9.99888826893172e-05, "loss": 2.2655, "step": 883 }, { "epoch": 0.006874819186033851, "grad_norm": 0.14439698687273392, "learning_rate": 9.99888569110123e-05, "loss": 2.2455, "step": 884 }, { "epoch": 0.006882596130814433, "grad_norm": 0.15444900013618706, "learning_rate": 9.998883110285854e-05, "loss": 2.293, "step": 885 }, { "epoch": 0.006890373075595014, "grad_norm": 0.13381264265532627, "learning_rate": 9.998880526485595e-05, "loss": 2.2813, "step": 886 }, { "epoch": 0.006898150020375595, "grad_norm": 0.14096109474788032, "learning_rate": 9.998877939700457e-05, "loss": 2.2921, "step": 887 }, { "epoch": 0.006905926965156177, "grad_norm": 0.2387895256080743, "learning_rate": 9.998875349930439e-05, "loss": 2.2933, "step": 888 }, { "epoch": 0.006913703909936758, "grad_norm": 0.1457940793767532, "learning_rate": 9.998872757175544e-05, "loss": 2.1909, "step": 889 }, { "epoch": 0.006921480854717339, "grad_norm": 0.1539203546351972, "learning_rate": 9.998870161435771e-05, "loss": 2.2753, "step": 890 }, { "epoch": 0.00692925779949792, "grad_norm": 0.3604886615165128, "learning_rate": 9.998867562711126e-05, "loss": 2.2821, "step": 891 }, { "epoch": 0.006937034744278502, "grad_norm": 0.139446993545013, "learning_rate": 9.998864961001605e-05, "loss": 2.2214, "step": 892 }, { "epoch": 0.006944811689059083, "grad_norm": 0.16601465489387485, "learning_rate": 9.998862356307214e-05, "loss": 2.212, "step": 893 }, { "epoch": 0.006952588633839664, "grad_norm": 0.16317278277304156, "learning_rate": 9.998859748627953e-05, "loss": 2.2434, "step": 894 }, { "epoch": 0.006960365578620246, "grad_norm": 0.14424324387200013, "learning_rate": 9.998857137963824e-05, "loss": 2.2934, "step": 895 }, { "epoch": 0.006968142523400827, "grad_norm": 0.1586143544950591, "learning_rate": 9.998854524314827e-05, "loss": 2.3105, "step": 896 }, { "epoch": 0.006975919468181408, "grad_norm": 0.1739261922306834, "learning_rate": 9.998851907680964e-05, "loss": 2.3001, "step": 897 }, { "epoch": 0.006983696412961989, "grad_norm": 0.19546718926213275, "learning_rate": 9.99884928806224e-05, "loss": 2.2967, "step": 898 }, { "epoch": 0.006991473357742571, "grad_norm": 0.18440321677935506, "learning_rate": 9.998846665458652e-05, "loss": 2.2345, "step": 899 }, { "epoch": 0.006999250302523152, "grad_norm": 0.1754065613690978, "learning_rate": 9.998844039870205e-05, "loss": 2.2758, "step": 900 }, { "epoch": 0.007007027247303733, "grad_norm": 0.5486731369462853, "learning_rate": 9.998841411296898e-05, "loss": 2.2421, "step": 901 }, { "epoch": 0.007014804192084315, "grad_norm": 0.16781169016814096, "learning_rate": 9.998838779738733e-05, "loss": 2.2791, "step": 902 }, { "epoch": 0.007022581136864896, "grad_norm": 0.15954961376608004, "learning_rate": 9.998836145195712e-05, "loss": 2.2583, "step": 903 }, { "epoch": 0.007030358081645477, "grad_norm": 0.15785264966366347, "learning_rate": 9.998833507667838e-05, "loss": 2.3244, "step": 904 }, { "epoch": 0.0070381350264260584, "grad_norm": 0.1908213516486348, "learning_rate": 9.99883086715511e-05, "loss": 2.2029, "step": 905 }, { "epoch": 0.00704591197120664, "grad_norm": 0.19031964029826842, "learning_rate": 9.99882822365753e-05, "loss": 2.2411, "step": 906 }, { "epoch": 0.007053688915987221, "grad_norm": 0.1732701697986419, "learning_rate": 9.998825577175103e-05, "loss": 2.2931, "step": 907 }, { "epoch": 0.007061465860767802, "grad_norm": 0.14401285204285988, "learning_rate": 9.998822927707827e-05, "loss": 2.2326, "step": 908 }, { "epoch": 0.007069242805548383, "grad_norm": 0.18219905515593077, "learning_rate": 9.998820275255703e-05, "loss": 2.1982, "step": 909 }, { "epoch": 0.007077019750328965, "grad_norm": 0.18128743966771188, "learning_rate": 9.998817619818737e-05, "loss": 2.2404, "step": 910 }, { "epoch": 0.007084796695109546, "grad_norm": 0.15336947764150297, "learning_rate": 9.998814961396925e-05, "loss": 2.2596, "step": 911 }, { "epoch": 0.0070925736398901275, "grad_norm": 0.14575834106733285, "learning_rate": 9.998812299990272e-05, "loss": 2.2706, "step": 912 }, { "epoch": 0.007100350584670709, "grad_norm": 0.13931461843405313, "learning_rate": 9.998809635598781e-05, "loss": 2.2436, "step": 913 }, { "epoch": 0.00710812752945129, "grad_norm": 0.13119407009842332, "learning_rate": 9.99880696822245e-05, "loss": 2.2746, "step": 914 }, { "epoch": 0.007115904474231871, "grad_norm": 0.5057178823828647, "learning_rate": 9.998804297861282e-05, "loss": 2.2747, "step": 915 }, { "epoch": 0.007123681419012452, "grad_norm": 0.1720630388961833, "learning_rate": 9.998801624515278e-05, "loss": 2.3136, "step": 916 }, { "epoch": 0.007131458363793034, "grad_norm": 0.18052152283210945, "learning_rate": 9.998798948184442e-05, "loss": 2.2184, "step": 917 }, { "epoch": 0.007139235308573615, "grad_norm": 0.9443665735305845, "learning_rate": 9.998796268868774e-05, "loss": 2.2902, "step": 918 }, { "epoch": 0.0071470122533541965, "grad_norm": 0.14393144898910884, "learning_rate": 9.998793586568274e-05, "loss": 2.2134, "step": 919 }, { "epoch": 0.007154789198134778, "grad_norm": 0.18316138721235403, "learning_rate": 9.998790901282944e-05, "loss": 2.2449, "step": 920 }, { "epoch": 0.007162566142915359, "grad_norm": 0.20295580996959708, "learning_rate": 9.998788213012789e-05, "loss": 2.3257, "step": 921 }, { "epoch": 0.00717034308769594, "grad_norm": 0.20984694282857044, "learning_rate": 9.998785521757807e-05, "loss": 2.2383, "step": 922 }, { "epoch": 0.007178120032476521, "grad_norm": 0.19450494574590646, "learning_rate": 9.998782827518001e-05, "loss": 2.209, "step": 923 }, { "epoch": 0.007185896977257102, "grad_norm": 0.18712397370366393, "learning_rate": 9.998780130293373e-05, "loss": 2.3175, "step": 924 }, { "epoch": 0.007193673922037684, "grad_norm": 0.17579559581656656, "learning_rate": 9.998777430083926e-05, "loss": 2.3144, "step": 925 }, { "epoch": 0.0072014508668182655, "grad_norm": 0.14841089731664342, "learning_rate": 9.998774726889657e-05, "loss": 2.2257, "step": 926 }, { "epoch": 0.007209227811598847, "grad_norm": 0.15259492355646728, "learning_rate": 9.998772020710572e-05, "loss": 2.2841, "step": 927 }, { "epoch": 0.007217004756379428, "grad_norm": 0.2855297579803707, "learning_rate": 9.998769311546669e-05, "loss": 2.2706, "step": 928 }, { "epoch": 0.007224781701160009, "grad_norm": 0.1511558035780087, "learning_rate": 9.998766599397953e-05, "loss": 2.2415, "step": 929 }, { "epoch": 0.00723255864594059, "grad_norm": 0.1459624003252647, "learning_rate": 9.998763884264423e-05, "loss": 2.224, "step": 930 }, { "epoch": 0.007240335590721171, "grad_norm": 0.16894895753711192, "learning_rate": 9.998761166146085e-05, "loss": 2.2388, "step": 931 }, { "epoch": 0.007248112535501753, "grad_norm": 0.19027970789813753, "learning_rate": 9.998758445042935e-05, "loss": 2.2807, "step": 932 }, { "epoch": 0.0072558894802823345, "grad_norm": 0.2185491127102095, "learning_rate": 9.998755720954977e-05, "loss": 2.3088, "step": 933 }, { "epoch": 0.007263666425062916, "grad_norm": 0.22471820396502618, "learning_rate": 9.998752993882213e-05, "loss": 2.241, "step": 934 }, { "epoch": 0.007271443369843497, "grad_norm": 0.22731513207726683, "learning_rate": 9.998750263824646e-05, "loss": 2.2696, "step": 935 }, { "epoch": 0.007279220314624078, "grad_norm": 0.16720196816384264, "learning_rate": 9.998747530782273e-05, "loss": 2.2348, "step": 936 }, { "epoch": 0.007286997259404659, "grad_norm": 0.14348625050999775, "learning_rate": 9.9987447947551e-05, "loss": 2.2097, "step": 937 }, { "epoch": 0.00729477420418524, "grad_norm": 0.1596299251492955, "learning_rate": 9.998742055743128e-05, "loss": 2.2937, "step": 938 }, { "epoch": 0.0073025511489658215, "grad_norm": 0.16744033372054068, "learning_rate": 9.998739313746357e-05, "loss": 2.2526, "step": 939 }, { "epoch": 0.0073103280937464035, "grad_norm": 0.17809798472099087, "learning_rate": 9.99873656876479e-05, "loss": 2.3207, "step": 940 }, { "epoch": 0.007318105038526985, "grad_norm": 0.1629445179119342, "learning_rate": 9.998733820798427e-05, "loss": 2.3041, "step": 941 }, { "epoch": 0.007325881983307566, "grad_norm": 0.15799311775518787, "learning_rate": 9.998731069847272e-05, "loss": 2.2266, "step": 942 }, { "epoch": 0.007333658928088147, "grad_norm": 0.14946820852116602, "learning_rate": 9.998728315911325e-05, "loss": 2.2864, "step": 943 }, { "epoch": 0.007341435872868728, "grad_norm": 0.16817265811858553, "learning_rate": 9.998725558990588e-05, "loss": 2.2059, "step": 944 }, { "epoch": 0.007349212817649309, "grad_norm": 0.1521706977266589, "learning_rate": 9.998722799085064e-05, "loss": 2.2663, "step": 945 }, { "epoch": 0.0073569897624298905, "grad_norm": 0.14938874911997863, "learning_rate": 9.998720036194751e-05, "loss": 2.2548, "step": 946 }, { "epoch": 0.0073647667072104725, "grad_norm": 0.16020479846645932, "learning_rate": 9.998717270319656e-05, "loss": 2.2657, "step": 947 }, { "epoch": 0.007372543651991054, "grad_norm": 0.16089585975642218, "learning_rate": 9.998714501459773e-05, "loss": 2.2721, "step": 948 }, { "epoch": 0.007380320596771635, "grad_norm": 0.2083665770353207, "learning_rate": 9.998711729615113e-05, "loss": 2.3009, "step": 949 }, { "epoch": 0.007388097541552216, "grad_norm": 0.1462507330155645, "learning_rate": 9.998708954785672e-05, "loss": 2.1868, "step": 950 }, { "epoch": 0.007395874486332797, "grad_norm": 0.15596446924637117, "learning_rate": 9.998706176971451e-05, "loss": 2.245, "step": 951 }, { "epoch": 0.007403651431113378, "grad_norm": 0.17091998782404247, "learning_rate": 9.998703396172454e-05, "loss": 2.1876, "step": 952 }, { "epoch": 0.0074114283758939595, "grad_norm": 0.31339259269775727, "learning_rate": 9.998700612388681e-05, "loss": 2.2771, "step": 953 }, { "epoch": 0.0074192053206745415, "grad_norm": 0.15255743887249018, "learning_rate": 9.998697825620137e-05, "loss": 2.2761, "step": 954 }, { "epoch": 0.007426982265455123, "grad_norm": 0.17822402842199064, "learning_rate": 9.99869503586682e-05, "loss": 2.2291, "step": 955 }, { "epoch": 0.007434759210235704, "grad_norm": 0.2041887500340801, "learning_rate": 9.998692243128733e-05, "loss": 2.2469, "step": 956 }, { "epoch": 0.007442536155016285, "grad_norm": 0.17580304543063774, "learning_rate": 9.998689447405877e-05, "loss": 2.2125, "step": 957 }, { "epoch": 0.007450313099796866, "grad_norm": 0.1379448296147396, "learning_rate": 9.998686648698255e-05, "loss": 2.2652, "step": 958 }, { "epoch": 0.007458090044577447, "grad_norm": 0.1322487386172374, "learning_rate": 9.998683847005867e-05, "loss": 2.2382, "step": 959 }, { "epoch": 0.0074658669893580285, "grad_norm": 0.14751898730742552, "learning_rate": 9.998681042328717e-05, "loss": 2.1975, "step": 960 }, { "epoch": 0.00747364393413861, "grad_norm": 0.1677793141490688, "learning_rate": 9.998678234666805e-05, "loss": 2.243, "step": 961 }, { "epoch": 0.007481420878919192, "grad_norm": 0.18056428212336353, "learning_rate": 9.998675424020132e-05, "loss": 2.2571, "step": 962 }, { "epoch": 0.007489197823699773, "grad_norm": 0.17892199915980858, "learning_rate": 9.998672610388702e-05, "loss": 2.2707, "step": 963 }, { "epoch": 0.007496974768480354, "grad_norm": 0.1691906738695236, "learning_rate": 9.998669793772515e-05, "loss": 2.244, "step": 964 }, { "epoch": 0.007504751713260935, "grad_norm": 0.14650406068319916, "learning_rate": 9.998666974171573e-05, "loss": 2.2349, "step": 965 }, { "epoch": 0.007512528658041516, "grad_norm": 0.1402367484540074, "learning_rate": 9.998664151585877e-05, "loss": 2.2641, "step": 966 }, { "epoch": 0.0075203056028220975, "grad_norm": 0.15186394831465497, "learning_rate": 9.998661326015429e-05, "loss": 2.2195, "step": 967 }, { "epoch": 0.007528082547602679, "grad_norm": 0.1594049644334333, "learning_rate": 9.998658497460231e-05, "loss": 2.2111, "step": 968 }, { "epoch": 0.007535859492383261, "grad_norm": 0.13747924662834793, "learning_rate": 9.998655665920286e-05, "loss": 2.2485, "step": 969 }, { "epoch": 0.007543636437163842, "grad_norm": 0.16249439307725327, "learning_rate": 9.998652831395595e-05, "loss": 2.2536, "step": 970 }, { "epoch": 0.007551413381944423, "grad_norm": 0.1568824244955487, "learning_rate": 9.998649993886158e-05, "loss": 2.2571, "step": 971 }, { "epoch": 0.007559190326725004, "grad_norm": 0.23159383361922672, "learning_rate": 9.998647153391978e-05, "loss": 2.1953, "step": 972 }, { "epoch": 0.007566967271505585, "grad_norm": 0.1566604356752481, "learning_rate": 9.998644309913056e-05, "loss": 2.1904, "step": 973 }, { "epoch": 0.0075747442162861665, "grad_norm": 0.1415502511240576, "learning_rate": 9.998641463449396e-05, "loss": 2.256, "step": 974 }, { "epoch": 0.007582521161066748, "grad_norm": 0.12700870658424757, "learning_rate": 9.998638614000997e-05, "loss": 2.255, "step": 975 }, { "epoch": 0.007590298105847329, "grad_norm": 0.17485574591881964, "learning_rate": 9.998635761567861e-05, "loss": 2.2424, "step": 976 }, { "epoch": 0.007598075050627911, "grad_norm": 0.14402436737405333, "learning_rate": 9.99863290614999e-05, "loss": 2.2967, "step": 977 }, { "epoch": 0.007605851995408492, "grad_norm": 0.14566516753207223, "learning_rate": 9.998630047747388e-05, "loss": 2.2511, "step": 978 }, { "epoch": 0.007613628940189073, "grad_norm": 0.15318929684014307, "learning_rate": 9.998627186360052e-05, "loss": 2.3044, "step": 979 }, { "epoch": 0.007621405884969654, "grad_norm": 0.13285369281774675, "learning_rate": 9.998624321987988e-05, "loss": 2.2299, "step": 980 }, { "epoch": 0.0076291828297502355, "grad_norm": 0.1401085189490006, "learning_rate": 9.998621454631196e-05, "loss": 2.1939, "step": 981 }, { "epoch": 0.007636959774530817, "grad_norm": 0.16354104700630215, "learning_rate": 9.998618584289679e-05, "loss": 2.2961, "step": 982 }, { "epoch": 0.007644736719311398, "grad_norm": 0.20251528713790873, "learning_rate": 9.998615710963437e-05, "loss": 2.2395, "step": 983 }, { "epoch": 0.00765251366409198, "grad_norm": 0.19459623959286937, "learning_rate": 9.998612834652472e-05, "loss": 2.2411, "step": 984 }, { "epoch": 0.007660290608872561, "grad_norm": 0.14272914145643403, "learning_rate": 9.998609955356786e-05, "loss": 2.2574, "step": 985 }, { "epoch": 0.007668067553653142, "grad_norm": 0.12967325461255036, "learning_rate": 9.99860707307638e-05, "loss": 2.2604, "step": 986 }, { "epoch": 0.007675844498433723, "grad_norm": 0.14682817538725046, "learning_rate": 9.998604187811258e-05, "loss": 2.2894, "step": 987 }, { "epoch": 0.0076836214432143045, "grad_norm": 0.13947116165637405, "learning_rate": 9.998601299561419e-05, "loss": 2.2717, "step": 988 }, { "epoch": 0.007691398387994886, "grad_norm": 0.1526948063775282, "learning_rate": 9.998598408326866e-05, "loss": 2.217, "step": 989 }, { "epoch": 0.007699175332775467, "grad_norm": 0.18556993603045221, "learning_rate": 9.9985955141076e-05, "loss": 2.2664, "step": 990 }, { "epoch": 0.007706952277556048, "grad_norm": 0.22769439936383684, "learning_rate": 9.998592616903627e-05, "loss": 2.2414, "step": 991 }, { "epoch": 0.00771472922233663, "grad_norm": 0.22269676060507218, "learning_rate": 9.998589716714941e-05, "loss": 2.1836, "step": 992 }, { "epoch": 0.007722506167117211, "grad_norm": 0.15582103910432452, "learning_rate": 9.998586813541548e-05, "loss": 2.3309, "step": 993 }, { "epoch": 0.007730283111897792, "grad_norm": 0.15786838106517043, "learning_rate": 9.998583907383451e-05, "loss": 2.1771, "step": 994 }, { "epoch": 0.0077380600566783735, "grad_norm": 0.2503398405600742, "learning_rate": 9.99858099824065e-05, "loss": 2.2781, "step": 995 }, { "epoch": 0.007745837001458955, "grad_norm": 0.300010710279996, "learning_rate": 9.998578086113148e-05, "loss": 2.2335, "step": 996 }, { "epoch": 0.007753613946239536, "grad_norm": 0.23004871926144493, "learning_rate": 9.998575171000945e-05, "loss": 2.2553, "step": 997 }, { "epoch": 0.007761390891020117, "grad_norm": 0.17875773971182435, "learning_rate": 9.998572252904044e-05, "loss": 2.2881, "step": 998 }, { "epoch": 0.007769167835800699, "grad_norm": 0.21946318831603426, "learning_rate": 9.998569331822445e-05, "loss": 2.2898, "step": 999 }, { "epoch": 0.00777694478058128, "grad_norm": 0.16797193355273593, "learning_rate": 9.998566407756151e-05, "loss": 2.2606, "step": 1000 }, { "epoch": 0.007784721725361861, "grad_norm": 0.1408570052275838, "learning_rate": 9.998563480705165e-05, "loss": 2.2035, "step": 1001 }, { "epoch": 0.0077924986701424425, "grad_norm": 0.162507069324404, "learning_rate": 9.998560550669487e-05, "loss": 2.2176, "step": 1002 }, { "epoch": 0.007800275614923024, "grad_norm": 0.1733945599910748, "learning_rate": 9.99855761764912e-05, "loss": 2.2249, "step": 1003 }, { "epoch": 0.007808052559703605, "grad_norm": 0.14955806171595684, "learning_rate": 9.998554681644064e-05, "loss": 2.2898, "step": 1004 }, { "epoch": 0.007815829504484187, "grad_norm": 0.1840047052293933, "learning_rate": 9.998551742654322e-05, "loss": 2.2689, "step": 1005 }, { "epoch": 0.007823606449264768, "grad_norm": 0.19877796586648938, "learning_rate": 9.998548800679895e-05, "loss": 2.2598, "step": 1006 }, { "epoch": 0.00783138339404535, "grad_norm": 0.1656600362281123, "learning_rate": 9.998545855720787e-05, "loss": 2.226, "step": 1007 }, { "epoch": 0.00783916033882593, "grad_norm": 0.17097414280299067, "learning_rate": 9.998542907776995e-05, "loss": 2.2069, "step": 1008 }, { "epoch": 0.007846937283606512, "grad_norm": 0.22498993416278334, "learning_rate": 9.998539956848527e-05, "loss": 2.268, "step": 1009 }, { "epoch": 0.007854714228387093, "grad_norm": 0.153377032854707, "learning_rate": 9.99853700293538e-05, "loss": 2.2515, "step": 1010 }, { "epoch": 0.007862491173167674, "grad_norm": 0.14619297485528382, "learning_rate": 9.998534046037558e-05, "loss": 2.2886, "step": 1011 }, { "epoch": 0.007870268117948255, "grad_norm": 0.21373539072555367, "learning_rate": 9.998531086155061e-05, "loss": 2.2581, "step": 1012 }, { "epoch": 0.007878045062728836, "grad_norm": 0.20054177322742772, "learning_rate": 9.998528123287892e-05, "loss": 2.2242, "step": 1013 }, { "epoch": 0.007885822007509417, "grad_norm": 0.1551321807073183, "learning_rate": 9.998525157436053e-05, "loss": 2.17, "step": 1014 }, { "epoch": 0.007893598952289999, "grad_norm": 0.1750039383967191, "learning_rate": 9.998522188599546e-05, "loss": 2.24, "step": 1015 }, { "epoch": 0.00790137589707058, "grad_norm": 0.1985703494400703, "learning_rate": 9.99851921677837e-05, "loss": 2.252, "step": 1016 }, { "epoch": 0.00790915284185116, "grad_norm": 0.1607635284270119, "learning_rate": 9.99851624197253e-05, "loss": 2.2282, "step": 1017 }, { "epoch": 0.007916929786631744, "grad_norm": 0.14479545016325263, "learning_rate": 9.998513264182028e-05, "loss": 2.2271, "step": 1018 }, { "epoch": 0.007924706731412325, "grad_norm": 0.18361419528885464, "learning_rate": 9.998510283406864e-05, "loss": 2.2634, "step": 1019 }, { "epoch": 0.007932483676192906, "grad_norm": 0.1805663572171506, "learning_rate": 9.99850729964704e-05, "loss": 2.2515, "step": 1020 }, { "epoch": 0.007940260620973487, "grad_norm": 0.15058308351548838, "learning_rate": 9.998504312902556e-05, "loss": 2.2465, "step": 1021 }, { "epoch": 0.007948037565754068, "grad_norm": 0.15857705595927796, "learning_rate": 9.998501323173419e-05, "loss": 2.2878, "step": 1022 }, { "epoch": 0.00795581451053465, "grad_norm": 0.1543672368211313, "learning_rate": 9.998498330459627e-05, "loss": 2.2173, "step": 1023 }, { "epoch": 0.00796359145531523, "grad_norm": 0.15274393619431675, "learning_rate": 9.998495334761182e-05, "loss": 2.2802, "step": 1024 }, { "epoch": 0.007971368400095812, "grad_norm": 0.1553077502170077, "learning_rate": 9.998492336078084e-05, "loss": 2.2187, "step": 1025 }, { "epoch": 0.007979145344876393, "grad_norm": 0.1382615972671636, "learning_rate": 9.998489334410339e-05, "loss": 2.2235, "step": 1026 }, { "epoch": 0.007986922289656974, "grad_norm": 0.13217772371926048, "learning_rate": 9.998486329757948e-05, "loss": 2.2476, "step": 1027 }, { "epoch": 0.007994699234437555, "grad_norm": 0.14487875750411033, "learning_rate": 9.99848332212091e-05, "loss": 2.2449, "step": 1028 }, { "epoch": 0.008002476179218137, "grad_norm": 0.564700759352549, "learning_rate": 9.998480311499227e-05, "loss": 2.1888, "step": 1029 }, { "epoch": 0.008010253123998718, "grad_norm": 0.1668582190388619, "learning_rate": 9.998477297892906e-05, "loss": 2.2628, "step": 1030 }, { "epoch": 0.008018030068779299, "grad_norm": 0.2693836779130838, "learning_rate": 9.998474281301941e-05, "loss": 2.2302, "step": 1031 }, { "epoch": 0.008025807013559882, "grad_norm": 0.28267438663891054, "learning_rate": 9.998471261726341e-05, "loss": 2.1882, "step": 1032 }, { "epoch": 0.008033583958340463, "grad_norm": 0.22990607090410864, "learning_rate": 9.998468239166102e-05, "loss": 2.2136, "step": 1033 }, { "epoch": 0.008041360903121044, "grad_norm": 0.1704206305183344, "learning_rate": 9.99846521362123e-05, "loss": 2.2081, "step": 1034 }, { "epoch": 0.008049137847901625, "grad_norm": 0.18157512985157964, "learning_rate": 9.998462185091723e-05, "loss": 2.2424, "step": 1035 }, { "epoch": 0.008056914792682206, "grad_norm": 0.1554459905058917, "learning_rate": 9.998459153577588e-05, "loss": 2.2421, "step": 1036 }, { "epoch": 0.008064691737462788, "grad_norm": 0.16080314880400418, "learning_rate": 9.998456119078823e-05, "loss": 2.2766, "step": 1037 }, { "epoch": 0.008072468682243369, "grad_norm": 0.1926404652840869, "learning_rate": 9.998453081595429e-05, "loss": 2.2583, "step": 1038 }, { "epoch": 0.00808024562702395, "grad_norm": 0.22462560850968194, "learning_rate": 9.99845004112741e-05, "loss": 2.2694, "step": 1039 }, { "epoch": 0.008088022571804531, "grad_norm": 0.18244037156598047, "learning_rate": 9.998446997674769e-05, "loss": 2.2682, "step": 1040 }, { "epoch": 0.008095799516585112, "grad_norm": 0.15068718456808866, "learning_rate": 9.998443951237505e-05, "loss": 2.2239, "step": 1041 }, { "epoch": 0.008103576461365693, "grad_norm": 0.14843990151659586, "learning_rate": 9.99844090181562e-05, "loss": 2.249, "step": 1042 }, { "epoch": 0.008111353406146275, "grad_norm": 0.13261511783517338, "learning_rate": 9.998437849409117e-05, "loss": 2.1891, "step": 1043 }, { "epoch": 0.008119130350926856, "grad_norm": 0.14089840085733638, "learning_rate": 9.998434794017997e-05, "loss": 2.2443, "step": 1044 }, { "epoch": 0.008126907295707437, "grad_norm": 0.13809459768512328, "learning_rate": 9.998431735642265e-05, "loss": 2.2393, "step": 1045 }, { "epoch": 0.008134684240488018, "grad_norm": 0.14449352878786728, "learning_rate": 9.998428674281917e-05, "loss": 2.2876, "step": 1046 }, { "epoch": 0.008142461185268601, "grad_norm": 0.15230598772140536, "learning_rate": 9.998425609936959e-05, "loss": 2.2601, "step": 1047 }, { "epoch": 0.008150238130049182, "grad_norm": 0.15431408321019735, "learning_rate": 9.998422542607392e-05, "loss": 2.2663, "step": 1048 }, { "epoch": 0.008158015074829763, "grad_norm": 0.19854525820810856, "learning_rate": 9.998419472293218e-05, "loss": 2.2434, "step": 1049 }, { "epoch": 0.008165792019610344, "grad_norm": 0.19687642983269207, "learning_rate": 9.998416398994438e-05, "loss": 2.1973, "step": 1050 }, { "epoch": 0.008173568964390926, "grad_norm": 0.16243593545368767, "learning_rate": 9.998413322711055e-05, "loss": 2.2485, "step": 1051 }, { "epoch": 0.008181345909171507, "grad_norm": 0.13926302629780313, "learning_rate": 9.99841024344307e-05, "loss": 2.1877, "step": 1052 }, { "epoch": 0.008189122853952088, "grad_norm": 0.14889716841657438, "learning_rate": 9.998407161190484e-05, "loss": 2.2569, "step": 1053 }, { "epoch": 0.008196899798732669, "grad_norm": 0.13384490243396105, "learning_rate": 9.998404075953301e-05, "loss": 2.2327, "step": 1054 }, { "epoch": 0.00820467674351325, "grad_norm": 0.15109157279264518, "learning_rate": 9.99840098773152e-05, "loss": 2.2118, "step": 1055 }, { "epoch": 0.008212453688293831, "grad_norm": 0.15208063861362955, "learning_rate": 9.998397896525147e-05, "loss": 2.2504, "step": 1056 }, { "epoch": 0.008220230633074413, "grad_norm": 0.13251820388330665, "learning_rate": 9.998394802334179e-05, "loss": 2.2588, "step": 1057 }, { "epoch": 0.008228007577854994, "grad_norm": 0.16199683443324528, "learning_rate": 9.998391705158622e-05, "loss": 2.2601, "step": 1058 }, { "epoch": 0.008235784522635575, "grad_norm": 0.24476998476716244, "learning_rate": 9.998388604998475e-05, "loss": 2.2425, "step": 1059 }, { "epoch": 0.008243561467416156, "grad_norm": 0.13883155388154103, "learning_rate": 9.998385501853742e-05, "loss": 2.2147, "step": 1060 }, { "epoch": 0.008251338412196737, "grad_norm": 0.15156520357453998, "learning_rate": 9.998382395724423e-05, "loss": 2.2307, "step": 1061 }, { "epoch": 0.00825911535697732, "grad_norm": 0.19312666512296509, "learning_rate": 9.99837928661052e-05, "loss": 2.2411, "step": 1062 }, { "epoch": 0.008266892301757901, "grad_norm": 0.19646815621679534, "learning_rate": 9.998376174512036e-05, "loss": 2.2606, "step": 1063 }, { "epoch": 0.008274669246538482, "grad_norm": 0.16069784541693966, "learning_rate": 9.998373059428973e-05, "loss": 2.279, "step": 1064 }, { "epoch": 0.008282446191319064, "grad_norm": 0.13583720489730958, "learning_rate": 9.998369941361332e-05, "loss": 2.2064, "step": 1065 }, { "epoch": 0.008290223136099645, "grad_norm": 0.19408509259827256, "learning_rate": 9.998366820309115e-05, "loss": 2.2537, "step": 1066 }, { "epoch": 0.008298000080880226, "grad_norm": 0.18706875573600698, "learning_rate": 9.998363696272324e-05, "loss": 2.2125, "step": 1067 }, { "epoch": 0.008305777025660807, "grad_norm": 0.137211324185147, "learning_rate": 9.99836056925096e-05, "loss": 2.2225, "step": 1068 }, { "epoch": 0.008313553970441388, "grad_norm": 0.16044369064888836, "learning_rate": 9.998357439245026e-05, "loss": 2.2092, "step": 1069 }, { "epoch": 0.00832133091522197, "grad_norm": 0.2022070263925855, "learning_rate": 9.998354306254523e-05, "loss": 2.222, "step": 1070 }, { "epoch": 0.00832910786000255, "grad_norm": 0.19483399789067127, "learning_rate": 9.998351170279454e-05, "loss": 2.2213, "step": 1071 }, { "epoch": 0.008336884804783132, "grad_norm": 0.14433732040293176, "learning_rate": 9.998348031319821e-05, "loss": 2.2446, "step": 1072 }, { "epoch": 0.008344661749563713, "grad_norm": 0.20162886880933684, "learning_rate": 9.998344889375625e-05, "loss": 2.2091, "step": 1073 }, { "epoch": 0.008352438694344294, "grad_norm": 0.20457742595663408, "learning_rate": 9.998341744446868e-05, "loss": 2.1992, "step": 1074 }, { "epoch": 0.008360215639124875, "grad_norm": 0.13212840269816023, "learning_rate": 9.99833859653355e-05, "loss": 2.2133, "step": 1075 }, { "epoch": 0.008367992583905456, "grad_norm": 0.21660153042342606, "learning_rate": 9.998335445635677e-05, "loss": 2.1848, "step": 1076 }, { "epoch": 0.00837576952868604, "grad_norm": 0.2268605196290998, "learning_rate": 9.998332291753247e-05, "loss": 2.2708, "step": 1077 }, { "epoch": 0.00838354647346662, "grad_norm": 0.16555406160579175, "learning_rate": 9.998329134886265e-05, "loss": 2.2655, "step": 1078 }, { "epoch": 0.008391323418247202, "grad_norm": 0.2113407789272847, "learning_rate": 9.998325975034731e-05, "loss": 2.2204, "step": 1079 }, { "epoch": 0.008399100363027783, "grad_norm": 0.16268724702691342, "learning_rate": 9.998322812198648e-05, "loss": 2.1868, "step": 1080 }, { "epoch": 0.008406877307808364, "grad_norm": 0.22462913935080356, "learning_rate": 9.998319646378016e-05, "loss": 2.2628, "step": 1081 }, { "epoch": 0.008414654252588945, "grad_norm": 0.4496935915671188, "learning_rate": 9.998316477572838e-05, "loss": 2.2446, "step": 1082 }, { "epoch": 0.008422431197369526, "grad_norm": 0.20619294146067524, "learning_rate": 9.998313305783116e-05, "loss": 2.313, "step": 1083 }, { "epoch": 0.008430208142150107, "grad_norm": 0.2578611394310169, "learning_rate": 9.998310131008853e-05, "loss": 2.2284, "step": 1084 }, { "epoch": 0.008437985086930689, "grad_norm": 0.34124306835881923, "learning_rate": 9.998306953250049e-05, "loss": 2.2282, "step": 1085 }, { "epoch": 0.00844576203171127, "grad_norm": 0.3887470585078817, "learning_rate": 9.998303772506707e-05, "loss": 2.2085, "step": 1086 }, { "epoch": 0.008453538976491851, "grad_norm": 0.29512390421479595, "learning_rate": 9.998300588778828e-05, "loss": 2.2534, "step": 1087 }, { "epoch": 0.008461315921272432, "grad_norm": 0.17408177234529576, "learning_rate": 9.998297402066414e-05, "loss": 2.2566, "step": 1088 }, { "epoch": 0.008469092866053013, "grad_norm": 0.1658888472842109, "learning_rate": 9.998294212369468e-05, "loss": 2.2297, "step": 1089 }, { "epoch": 0.008476869810833594, "grad_norm": 0.1686490987973448, "learning_rate": 9.998291019687994e-05, "loss": 2.2778, "step": 1090 }, { "epoch": 0.008484646755614176, "grad_norm": 0.15194638389358878, "learning_rate": 9.998287824021988e-05, "loss": 2.2835, "step": 1091 }, { "epoch": 0.008492423700394758, "grad_norm": 0.14435892082514018, "learning_rate": 9.998284625371454e-05, "loss": 2.2502, "step": 1092 }, { "epoch": 0.00850020064517534, "grad_norm": 0.1476270689157087, "learning_rate": 9.998281423736398e-05, "loss": 2.2568, "step": 1093 }, { "epoch": 0.00850797758995592, "grad_norm": 0.15886715940658952, "learning_rate": 9.998278219116818e-05, "loss": 2.1734, "step": 1094 }, { "epoch": 0.008515754534736502, "grad_norm": 0.15078261547631072, "learning_rate": 9.998275011512716e-05, "loss": 2.3131, "step": 1095 }, { "epoch": 0.008523531479517083, "grad_norm": 0.15121450798644276, "learning_rate": 9.998271800924095e-05, "loss": 2.2291, "step": 1096 }, { "epoch": 0.008531308424297664, "grad_norm": 0.2406477726885324, "learning_rate": 9.998268587350958e-05, "loss": 2.2909, "step": 1097 }, { "epoch": 0.008539085369078245, "grad_norm": 0.1826939857746834, "learning_rate": 9.998265370793303e-05, "loss": 2.2044, "step": 1098 }, { "epoch": 0.008546862313858827, "grad_norm": 0.2236120400119188, "learning_rate": 9.998262151251137e-05, "loss": 2.2579, "step": 1099 }, { "epoch": 0.008554639258639408, "grad_norm": 0.2762244583698815, "learning_rate": 9.998258928724458e-05, "loss": 2.2638, "step": 1100 }, { "epoch": 0.008562416203419989, "grad_norm": 0.2895650814432769, "learning_rate": 9.99825570321327e-05, "loss": 2.1681, "step": 1101 }, { "epoch": 0.00857019314820057, "grad_norm": 0.18034685975779147, "learning_rate": 9.998252474717574e-05, "loss": 2.2554, "step": 1102 }, { "epoch": 0.008577970092981151, "grad_norm": 0.17119153457660127, "learning_rate": 9.998249243237372e-05, "loss": 2.2851, "step": 1103 }, { "epoch": 0.008585747037761732, "grad_norm": 0.2768851601250348, "learning_rate": 9.998246008772667e-05, "loss": 2.2454, "step": 1104 }, { "epoch": 0.008593523982542314, "grad_norm": 0.21010556570932862, "learning_rate": 9.99824277132346e-05, "loss": 2.1927, "step": 1105 }, { "epoch": 0.008601300927322895, "grad_norm": 0.1391688974965661, "learning_rate": 9.998239530889752e-05, "loss": 2.2228, "step": 1106 }, { "epoch": 0.008609077872103478, "grad_norm": 0.2501965822065839, "learning_rate": 9.998236287471546e-05, "loss": 2.2611, "step": 1107 }, { "epoch": 0.008616854816884059, "grad_norm": 0.24010711628913917, "learning_rate": 9.998233041068845e-05, "loss": 2.2196, "step": 1108 }, { "epoch": 0.00862463176166464, "grad_norm": 0.14770025449758797, "learning_rate": 9.998229791681648e-05, "loss": 2.244, "step": 1109 }, { "epoch": 0.008632408706445221, "grad_norm": 0.17221808404845565, "learning_rate": 9.99822653930996e-05, "loss": 2.2335, "step": 1110 }, { "epoch": 0.008640185651225802, "grad_norm": 0.26880935968564795, "learning_rate": 9.998223283953783e-05, "loss": 2.2557, "step": 1111 }, { "epoch": 0.008647962596006383, "grad_norm": 0.26243844545415884, "learning_rate": 9.998220025613114e-05, "loss": 2.2118, "step": 1112 }, { "epoch": 0.008655739540786965, "grad_norm": 0.1635644806766919, "learning_rate": 9.998216764287962e-05, "loss": 2.207, "step": 1113 }, { "epoch": 0.008663516485567546, "grad_norm": 0.1586959831935214, "learning_rate": 9.998213499978324e-05, "loss": 2.2486, "step": 1114 }, { "epoch": 0.008671293430348127, "grad_norm": 0.22908767717004988, "learning_rate": 9.998210232684205e-05, "loss": 2.2002, "step": 1115 }, { "epoch": 0.008679070375128708, "grad_norm": 0.17272809738705563, "learning_rate": 9.998206962405602e-05, "loss": 2.2262, "step": 1116 }, { "epoch": 0.00868684731990929, "grad_norm": 0.15278994411238478, "learning_rate": 9.998203689142524e-05, "loss": 2.1864, "step": 1117 }, { "epoch": 0.00869462426468987, "grad_norm": 0.15856446811529915, "learning_rate": 9.998200412894967e-05, "loss": 2.2232, "step": 1118 }, { "epoch": 0.008702401209470452, "grad_norm": 0.15159927440436763, "learning_rate": 9.998197133662937e-05, "loss": 2.2523, "step": 1119 }, { "epoch": 0.008710178154251033, "grad_norm": 0.16756046595488985, "learning_rate": 9.998193851446434e-05, "loss": 2.2277, "step": 1120 }, { "epoch": 0.008717955099031616, "grad_norm": 0.1847025216484369, "learning_rate": 9.998190566245458e-05, "loss": 2.1788, "step": 1121 }, { "epoch": 0.008725732043812197, "grad_norm": 0.14798303683646757, "learning_rate": 9.998187278060016e-05, "loss": 2.1918, "step": 1122 }, { "epoch": 0.008733508988592778, "grad_norm": 0.17208738696330728, "learning_rate": 9.998183986890105e-05, "loss": 2.2513, "step": 1123 }, { "epoch": 0.00874128593337336, "grad_norm": 0.1597959760863478, "learning_rate": 9.998180692735731e-05, "loss": 2.1905, "step": 1124 }, { "epoch": 0.00874906287815394, "grad_norm": 0.14096615881681282, "learning_rate": 9.998177395596892e-05, "loss": 2.2464, "step": 1125 }, { "epoch": 0.008756839822934522, "grad_norm": 0.14172237330453444, "learning_rate": 9.998174095473594e-05, "loss": 2.1747, "step": 1126 }, { "epoch": 0.008764616767715103, "grad_norm": 0.14810450413781756, "learning_rate": 9.998170792365835e-05, "loss": 2.2859, "step": 1127 }, { "epoch": 0.008772393712495684, "grad_norm": 0.1298249477725989, "learning_rate": 9.998167486273621e-05, "loss": 2.25, "step": 1128 }, { "epoch": 0.008780170657276265, "grad_norm": 0.13975903693158567, "learning_rate": 9.998164177196951e-05, "loss": 2.2353, "step": 1129 }, { "epoch": 0.008787947602056846, "grad_norm": 0.16261303416605316, "learning_rate": 9.998160865135828e-05, "loss": 2.2667, "step": 1130 }, { "epoch": 0.008795724546837427, "grad_norm": 0.17336359608791396, "learning_rate": 9.998157550090254e-05, "loss": 2.2421, "step": 1131 }, { "epoch": 0.008803501491618008, "grad_norm": 0.13363517039054829, "learning_rate": 9.99815423206023e-05, "loss": 2.2634, "step": 1132 }, { "epoch": 0.00881127843639859, "grad_norm": 0.136724926569296, "learning_rate": 9.99815091104576e-05, "loss": 2.2558, "step": 1133 }, { "epoch": 0.00881905538117917, "grad_norm": 0.18588157934227734, "learning_rate": 9.998147587046844e-05, "loss": 2.2015, "step": 1134 }, { "epoch": 0.008826832325959752, "grad_norm": 0.18685339074060137, "learning_rate": 9.998144260063486e-05, "loss": 2.268, "step": 1135 }, { "epoch": 0.008834609270740335, "grad_norm": 0.16405510374524404, "learning_rate": 9.998140930095685e-05, "loss": 2.2782, "step": 1136 }, { "epoch": 0.008842386215520916, "grad_norm": 0.13568925173991814, "learning_rate": 9.998137597143446e-05, "loss": 2.2491, "step": 1137 }, { "epoch": 0.008850163160301497, "grad_norm": 0.18745927654961977, "learning_rate": 9.998134261206771e-05, "loss": 2.2067, "step": 1138 }, { "epoch": 0.008857940105082078, "grad_norm": 0.22304594051587365, "learning_rate": 9.99813092228566e-05, "loss": 2.2469, "step": 1139 }, { "epoch": 0.00886571704986266, "grad_norm": 0.18328415635260004, "learning_rate": 9.998127580380115e-05, "loss": 2.2286, "step": 1140 }, { "epoch": 0.00887349399464324, "grad_norm": 0.13247719500777672, "learning_rate": 9.998124235490138e-05, "loss": 2.1906, "step": 1141 }, { "epoch": 0.008881270939423822, "grad_norm": 0.14820455284669917, "learning_rate": 9.998120887615734e-05, "loss": 2.2126, "step": 1142 }, { "epoch": 0.008889047884204403, "grad_norm": 0.1853996251616806, "learning_rate": 9.998117536756901e-05, "loss": 2.2443, "step": 1143 }, { "epoch": 0.008896824828984984, "grad_norm": 0.19000637438717516, "learning_rate": 9.998114182913642e-05, "loss": 2.2605, "step": 1144 }, { "epoch": 0.008904601773765565, "grad_norm": 0.150245635959467, "learning_rate": 9.998110826085962e-05, "loss": 2.2586, "step": 1145 }, { "epoch": 0.008912378718546147, "grad_norm": 0.14628325321302935, "learning_rate": 9.99810746627386e-05, "loss": 2.2786, "step": 1146 }, { "epoch": 0.008920155663326728, "grad_norm": 0.14102865792998065, "learning_rate": 9.998104103477339e-05, "loss": 2.2429, "step": 1147 }, { "epoch": 0.008927932608107309, "grad_norm": 0.14397566381938237, "learning_rate": 9.9981007376964e-05, "loss": 2.2286, "step": 1148 }, { "epoch": 0.00893570955288789, "grad_norm": 0.1324222197059745, "learning_rate": 9.998097368931047e-05, "loss": 2.2366, "step": 1149 }, { "epoch": 0.008943486497668471, "grad_norm": 0.132031675592816, "learning_rate": 9.99809399718128e-05, "loss": 2.2034, "step": 1150 }, { "epoch": 0.008951263442449054, "grad_norm": 0.1280379767540447, "learning_rate": 9.998090622447102e-05, "loss": 2.2865, "step": 1151 }, { "epoch": 0.008959040387229635, "grad_norm": 0.13472539445954926, "learning_rate": 9.998087244728515e-05, "loss": 2.2013, "step": 1152 }, { "epoch": 0.008966817332010216, "grad_norm": 0.14117658094459806, "learning_rate": 9.99808386402552e-05, "loss": 2.1789, "step": 1153 }, { "epoch": 0.008974594276790798, "grad_norm": 0.13206992381834354, "learning_rate": 9.998080480338121e-05, "loss": 2.2062, "step": 1154 }, { "epoch": 0.008982371221571379, "grad_norm": 0.13515868668352923, "learning_rate": 9.998077093666318e-05, "loss": 2.2227, "step": 1155 }, { "epoch": 0.00899014816635196, "grad_norm": 0.1439098253762606, "learning_rate": 9.998073704010115e-05, "loss": 2.2198, "step": 1156 }, { "epoch": 0.008997925111132541, "grad_norm": 0.1672968746348558, "learning_rate": 9.998070311369513e-05, "loss": 2.2112, "step": 1157 }, { "epoch": 0.009005702055913122, "grad_norm": 0.17115891104778785, "learning_rate": 9.998066915744513e-05, "loss": 2.2645, "step": 1158 }, { "epoch": 0.009013479000693703, "grad_norm": 0.14095116328568277, "learning_rate": 9.998063517135119e-05, "loss": 2.1836, "step": 1159 }, { "epoch": 0.009021255945474285, "grad_norm": 0.1342583422620378, "learning_rate": 9.99806011554133e-05, "loss": 2.1904, "step": 1160 }, { "epoch": 0.009029032890254866, "grad_norm": 0.15834507671124298, "learning_rate": 9.998056710963153e-05, "loss": 2.2009, "step": 1161 }, { "epoch": 0.009036809835035447, "grad_norm": 0.13949733399222944, "learning_rate": 9.998053303400585e-05, "loss": 2.2477, "step": 1162 }, { "epoch": 0.009044586779816028, "grad_norm": 0.14318934366665662, "learning_rate": 9.99804989285363e-05, "loss": 2.2405, "step": 1163 }, { "epoch": 0.00905236372459661, "grad_norm": 0.17451765807301617, "learning_rate": 9.998046479322291e-05, "loss": 2.1905, "step": 1164 }, { "epoch": 0.00906014066937719, "grad_norm": 0.2079516872850907, "learning_rate": 9.99804306280657e-05, "loss": 2.2509, "step": 1165 }, { "epoch": 0.009067917614157773, "grad_norm": 0.2026374946626618, "learning_rate": 9.998039643306467e-05, "loss": 2.1933, "step": 1166 }, { "epoch": 0.009075694558938354, "grad_norm": 0.16875086204221557, "learning_rate": 9.998036220821984e-05, "loss": 2.2655, "step": 1167 }, { "epoch": 0.009083471503718936, "grad_norm": 0.1311517160855742, "learning_rate": 9.998032795353128e-05, "loss": 2.1618, "step": 1168 }, { "epoch": 0.009091248448499517, "grad_norm": 0.18950608595621174, "learning_rate": 9.998029366899895e-05, "loss": 2.3023, "step": 1169 }, { "epoch": 0.009099025393280098, "grad_norm": 0.20845729451463726, "learning_rate": 9.998025935462289e-05, "loss": 2.3136, "step": 1170 }, { "epoch": 0.009106802338060679, "grad_norm": 0.16587744378453897, "learning_rate": 9.998022501040313e-05, "loss": 2.266, "step": 1171 }, { "epoch": 0.00911457928284126, "grad_norm": 0.15472122963618115, "learning_rate": 9.99801906363397e-05, "loss": 2.2826, "step": 1172 }, { "epoch": 0.009122356227621841, "grad_norm": 0.19663335413012817, "learning_rate": 9.998015623243259e-05, "loss": 2.2045, "step": 1173 }, { "epoch": 0.009130133172402423, "grad_norm": 0.17599740059250202, "learning_rate": 9.998012179868183e-05, "loss": 2.1978, "step": 1174 }, { "epoch": 0.009137910117183004, "grad_norm": 0.1545299254131994, "learning_rate": 9.998008733508746e-05, "loss": 2.226, "step": 1175 }, { "epoch": 0.009145687061963585, "grad_norm": 0.18079493977271371, "learning_rate": 9.998005284164949e-05, "loss": 2.2731, "step": 1176 }, { "epoch": 0.009153464006744166, "grad_norm": 0.20387737813078916, "learning_rate": 9.998001831836792e-05, "loss": 2.2224, "step": 1177 }, { "epoch": 0.009161240951524747, "grad_norm": 0.155704864742666, "learning_rate": 9.99799837652428e-05, "loss": 2.2116, "step": 1178 }, { "epoch": 0.009169017896305328, "grad_norm": 0.13409865492674308, "learning_rate": 9.997994918227415e-05, "loss": 2.1922, "step": 1179 }, { "epoch": 0.00917679484108591, "grad_norm": 0.16025077120575001, "learning_rate": 9.997991456946196e-05, "loss": 2.209, "step": 1180 }, { "epoch": 0.009184571785866492, "grad_norm": 0.16560652097733142, "learning_rate": 9.997987992680629e-05, "loss": 2.2469, "step": 1181 }, { "epoch": 0.009192348730647074, "grad_norm": 0.18503190660491933, "learning_rate": 9.997984525430713e-05, "loss": 2.2004, "step": 1182 }, { "epoch": 0.009200125675427655, "grad_norm": 0.1382780167694495, "learning_rate": 9.997981055196451e-05, "loss": 2.2443, "step": 1183 }, { "epoch": 0.009207902620208236, "grad_norm": 0.1370651587491091, "learning_rate": 9.997977581977846e-05, "loss": 2.2032, "step": 1184 }, { "epoch": 0.009215679564988817, "grad_norm": 0.128887523418658, "learning_rate": 9.997974105774899e-05, "loss": 2.1938, "step": 1185 }, { "epoch": 0.009223456509769398, "grad_norm": 0.13006792277199455, "learning_rate": 9.997970626587612e-05, "loss": 2.2112, "step": 1186 }, { "epoch": 0.00923123345454998, "grad_norm": 0.1578417519369992, "learning_rate": 9.997967144415987e-05, "loss": 2.236, "step": 1187 }, { "epoch": 0.00923901039933056, "grad_norm": 0.30965845085344484, "learning_rate": 9.997963659260027e-05, "loss": 2.2679, "step": 1188 }, { "epoch": 0.009246787344111142, "grad_norm": 0.23489652571518158, "learning_rate": 9.997960171119736e-05, "loss": 2.2211, "step": 1189 }, { "epoch": 0.009254564288891723, "grad_norm": 0.14487616409094548, "learning_rate": 9.99795667999511e-05, "loss": 2.2456, "step": 1190 }, { "epoch": 0.009262341233672304, "grad_norm": 0.1576886392141936, "learning_rate": 9.997953185886157e-05, "loss": 2.1901, "step": 1191 }, { "epoch": 0.009270118178452885, "grad_norm": 0.140924700576912, "learning_rate": 9.997949688792878e-05, "loss": 2.271, "step": 1192 }, { "epoch": 0.009277895123233466, "grad_norm": 0.324640016813621, "learning_rate": 9.997946188715272e-05, "loss": 2.2142, "step": 1193 }, { "epoch": 0.009285672068014048, "grad_norm": 0.1335641438012376, "learning_rate": 9.997942685653344e-05, "loss": 2.2498, "step": 1194 }, { "epoch": 0.009293449012794629, "grad_norm": 0.25770202017692595, "learning_rate": 9.997939179607094e-05, "loss": 2.2239, "step": 1195 }, { "epoch": 0.009301225957575212, "grad_norm": 0.13307765542843317, "learning_rate": 9.997935670576525e-05, "loss": 2.2348, "step": 1196 }, { "epoch": 0.009309002902355793, "grad_norm": 0.12140711586789749, "learning_rate": 9.997932158561643e-05, "loss": 2.1762, "step": 1197 }, { "epoch": 0.009316779847136374, "grad_norm": 0.12603025773617293, "learning_rate": 9.997928643562443e-05, "loss": 2.2503, "step": 1198 }, { "epoch": 0.009324556791916955, "grad_norm": 0.13412269720034184, "learning_rate": 9.997925125578931e-05, "loss": 2.2047, "step": 1199 }, { "epoch": 0.009332333736697536, "grad_norm": 0.12825187962724707, "learning_rate": 9.99792160461111e-05, "loss": 2.2203, "step": 1200 }, { "epoch": 0.009340110681478117, "grad_norm": 0.13653341053876605, "learning_rate": 9.997918080658979e-05, "loss": 2.2448, "step": 1201 }, { "epoch": 0.009347887626258699, "grad_norm": 0.13289300602309206, "learning_rate": 9.997914553722543e-05, "loss": 2.2427, "step": 1202 }, { "epoch": 0.00935566457103928, "grad_norm": 0.1671755503491523, "learning_rate": 9.997911023801803e-05, "loss": 2.2366, "step": 1203 }, { "epoch": 0.009363441515819861, "grad_norm": 0.13734894972527625, "learning_rate": 9.99790749089676e-05, "loss": 2.2578, "step": 1204 }, { "epoch": 0.009371218460600442, "grad_norm": 0.1409414399710952, "learning_rate": 9.99790395500742e-05, "loss": 2.2054, "step": 1205 }, { "epoch": 0.009378995405381023, "grad_norm": 0.13580660951208554, "learning_rate": 9.997900416133779e-05, "loss": 2.2404, "step": 1206 }, { "epoch": 0.009386772350161604, "grad_norm": 0.1600666812312513, "learning_rate": 9.997896874275845e-05, "loss": 2.2947, "step": 1207 }, { "epoch": 0.009394549294942186, "grad_norm": 0.13481465240567972, "learning_rate": 9.997893329433617e-05, "loss": 2.1522, "step": 1208 }, { "epoch": 0.009402326239722767, "grad_norm": 0.14100466496859382, "learning_rate": 9.997889781607096e-05, "loss": 2.2796, "step": 1209 }, { "epoch": 0.009410103184503348, "grad_norm": 0.2144046812068426, "learning_rate": 9.997886230796288e-05, "loss": 2.2207, "step": 1210 }, { "epoch": 0.00941788012928393, "grad_norm": 0.21885608476365506, "learning_rate": 9.99788267700119e-05, "loss": 2.3035, "step": 1211 }, { "epoch": 0.009425657074064512, "grad_norm": 0.15475883723627856, "learning_rate": 9.997879120221809e-05, "loss": 2.2946, "step": 1212 }, { "epoch": 0.009433434018845093, "grad_norm": 0.13875145515533535, "learning_rate": 9.997875560458145e-05, "loss": 2.2126, "step": 1213 }, { "epoch": 0.009441210963625674, "grad_norm": 0.17545711843685136, "learning_rate": 9.9978719977102e-05, "loss": 2.2163, "step": 1214 }, { "epoch": 0.009448987908406255, "grad_norm": 0.1474916300595212, "learning_rate": 9.997868431977977e-05, "loss": 2.1357, "step": 1215 }, { "epoch": 0.009456764853186837, "grad_norm": 0.13781077734525482, "learning_rate": 9.997864863261477e-05, "loss": 2.2362, "step": 1216 }, { "epoch": 0.009464541797967418, "grad_norm": 0.3759038100017104, "learning_rate": 9.997861291560702e-05, "loss": 2.2528, "step": 1217 }, { "epoch": 0.009472318742747999, "grad_norm": 0.15778330559457288, "learning_rate": 9.997857716875657e-05, "loss": 2.2029, "step": 1218 }, { "epoch": 0.00948009568752858, "grad_norm": 0.1449050882450023, "learning_rate": 9.997854139206338e-05, "loss": 2.1741, "step": 1219 }, { "epoch": 0.009487872632309161, "grad_norm": 0.3147491022020202, "learning_rate": 9.997850558552754e-05, "loss": 2.2292, "step": 1220 }, { "epoch": 0.009495649577089742, "grad_norm": 0.15229922333896723, "learning_rate": 9.997846974914904e-05, "loss": 2.2208, "step": 1221 }, { "epoch": 0.009503426521870324, "grad_norm": 0.1670443914387256, "learning_rate": 9.997843388292789e-05, "loss": 2.2198, "step": 1222 }, { "epoch": 0.009511203466650905, "grad_norm": 0.35745582483276, "learning_rate": 9.997839798686415e-05, "loss": 2.2586, "step": 1223 }, { "epoch": 0.009518980411431486, "grad_norm": 0.23087400967168378, "learning_rate": 9.997836206095779e-05, "loss": 2.2205, "step": 1224 }, { "epoch": 0.009526757356212069, "grad_norm": 0.3706808770547865, "learning_rate": 9.997832610520887e-05, "loss": 2.2271, "step": 1225 }, { "epoch": 0.00953453430099265, "grad_norm": 0.37376056222614307, "learning_rate": 9.997829011961739e-05, "loss": 2.197, "step": 1226 }, { "epoch": 0.009542311245773231, "grad_norm": 0.3269171767719524, "learning_rate": 9.997825410418338e-05, "loss": 2.2396, "step": 1227 }, { "epoch": 0.009550088190553812, "grad_norm": 0.20592361386103053, "learning_rate": 9.997821805890688e-05, "loss": 2.2704, "step": 1228 }, { "epoch": 0.009557865135334393, "grad_norm": 0.2062440626763719, "learning_rate": 9.997818198378787e-05, "loss": 2.2037, "step": 1229 }, { "epoch": 0.009565642080114975, "grad_norm": 0.24950409578775803, "learning_rate": 9.997814587882642e-05, "loss": 2.2005, "step": 1230 }, { "epoch": 0.009573419024895556, "grad_norm": 0.2226190382318479, "learning_rate": 9.997810974402251e-05, "loss": 2.2319, "step": 1231 }, { "epoch": 0.009581195969676137, "grad_norm": 0.8165972366170056, "learning_rate": 9.997807357937619e-05, "loss": 2.2045, "step": 1232 }, { "epoch": 0.009588972914456718, "grad_norm": 0.22155480190968324, "learning_rate": 9.997803738488747e-05, "loss": 2.2184, "step": 1233 }, { "epoch": 0.0095967498592373, "grad_norm": 0.2522922249265384, "learning_rate": 9.997800116055636e-05, "loss": 2.2047, "step": 1234 }, { "epoch": 0.00960452680401788, "grad_norm": 0.4659747410525044, "learning_rate": 9.99779649063829e-05, "loss": 2.2538, "step": 1235 }, { "epoch": 0.009612303748798462, "grad_norm": 0.1896728586675948, "learning_rate": 9.99779286223671e-05, "loss": 2.1947, "step": 1236 }, { "epoch": 0.009620080693579043, "grad_norm": 0.1735096822607616, "learning_rate": 9.997789230850898e-05, "loss": 2.1953, "step": 1237 }, { "epoch": 0.009627857638359624, "grad_norm": 0.19376606992269568, "learning_rate": 9.997785596480858e-05, "loss": 2.2214, "step": 1238 }, { "epoch": 0.009635634583140205, "grad_norm": 0.21258897811551952, "learning_rate": 9.99778195912659e-05, "loss": 2.2787, "step": 1239 }, { "epoch": 0.009643411527920788, "grad_norm": 0.18629086937151756, "learning_rate": 9.997778318788098e-05, "loss": 2.2229, "step": 1240 }, { "epoch": 0.00965118847270137, "grad_norm": 0.13552036703189788, "learning_rate": 9.997774675465384e-05, "loss": 2.24, "step": 1241 }, { "epoch": 0.00965896541748195, "grad_norm": 0.17822011525535977, "learning_rate": 9.99777102915845e-05, "loss": 2.2624, "step": 1242 }, { "epoch": 0.009666742362262531, "grad_norm": 0.20098652874815298, "learning_rate": 9.997767379867295e-05, "loss": 2.2277, "step": 1243 }, { "epoch": 0.009674519307043113, "grad_norm": 0.17650787357640677, "learning_rate": 9.997763727591925e-05, "loss": 2.2699, "step": 1244 }, { "epoch": 0.009682296251823694, "grad_norm": 0.15806256746698294, "learning_rate": 9.99776007233234e-05, "loss": 2.2596, "step": 1245 }, { "epoch": 0.009690073196604275, "grad_norm": 0.18210426261089507, "learning_rate": 9.997756414088546e-05, "loss": 2.2655, "step": 1246 }, { "epoch": 0.009697850141384856, "grad_norm": 0.24490166525144827, "learning_rate": 9.99775275286054e-05, "loss": 2.2514, "step": 1247 }, { "epoch": 0.009705627086165437, "grad_norm": 0.21898735520719015, "learning_rate": 9.997749088648329e-05, "loss": 2.2478, "step": 1248 }, { "epoch": 0.009713404030946018, "grad_norm": 0.14927177574520117, "learning_rate": 9.99774542145191e-05, "loss": 2.1921, "step": 1249 }, { "epoch": 0.0097211809757266, "grad_norm": 0.15260572024714483, "learning_rate": 9.99774175127129e-05, "loss": 2.2283, "step": 1250 }, { "epoch": 0.00972895792050718, "grad_norm": 0.2031059208391225, "learning_rate": 9.997738078106467e-05, "loss": 2.2457, "step": 1251 }, { "epoch": 0.009736734865287762, "grad_norm": 0.18658105356214463, "learning_rate": 9.997734401957449e-05, "loss": 2.2403, "step": 1252 }, { "epoch": 0.009744511810068343, "grad_norm": 0.20309519069556173, "learning_rate": 9.99773072282423e-05, "loss": 2.2113, "step": 1253 }, { "epoch": 0.009752288754848924, "grad_norm": 0.16745466957570826, "learning_rate": 9.997727040706819e-05, "loss": 2.2524, "step": 1254 }, { "epoch": 0.009760065699629507, "grad_norm": 0.19369492681438316, "learning_rate": 9.997723355605218e-05, "loss": 2.2209, "step": 1255 }, { "epoch": 0.009767842644410088, "grad_norm": 0.23367831580080706, "learning_rate": 9.997719667519425e-05, "loss": 2.2403, "step": 1256 }, { "epoch": 0.00977561958919067, "grad_norm": 0.13864572222166546, "learning_rate": 9.997715976449444e-05, "loss": 2.2261, "step": 1257 }, { "epoch": 0.00978339653397125, "grad_norm": 0.1835267298530077, "learning_rate": 9.997712282395277e-05, "loss": 2.2255, "step": 1258 }, { "epoch": 0.009791173478751832, "grad_norm": 0.15495876618469345, "learning_rate": 9.997708585356928e-05, "loss": 2.2244, "step": 1259 }, { "epoch": 0.009798950423532413, "grad_norm": 0.13122440080319017, "learning_rate": 9.997704885334399e-05, "loss": 2.1469, "step": 1260 }, { "epoch": 0.009806727368312994, "grad_norm": 0.14270947089997724, "learning_rate": 9.997701182327689e-05, "loss": 2.2525, "step": 1261 }, { "epoch": 0.009814504313093575, "grad_norm": 0.18676294108787245, "learning_rate": 9.997697476336802e-05, "loss": 2.2554, "step": 1262 }, { "epoch": 0.009822281257874156, "grad_norm": 0.21507020455650755, "learning_rate": 9.997693767361741e-05, "loss": 2.252, "step": 1263 }, { "epoch": 0.009830058202654738, "grad_norm": 0.21190441337623694, "learning_rate": 9.997690055402512e-05, "loss": 2.2579, "step": 1264 }, { "epoch": 0.009837835147435319, "grad_norm": 0.19314982014095694, "learning_rate": 9.997686340459107e-05, "loss": 2.2597, "step": 1265 }, { "epoch": 0.0098456120922159, "grad_norm": 0.18431072504522075, "learning_rate": 9.997682622531538e-05, "loss": 2.242, "step": 1266 }, { "epoch": 0.009853389036996481, "grad_norm": 0.1458799148407708, "learning_rate": 9.997678901619802e-05, "loss": 2.2246, "step": 1267 }, { "epoch": 0.009861165981777062, "grad_norm": 0.1460362146349854, "learning_rate": 9.997675177723905e-05, "loss": 2.2508, "step": 1268 }, { "epoch": 0.009868942926557643, "grad_norm": 0.17391222958440739, "learning_rate": 9.997671450843844e-05, "loss": 2.2205, "step": 1269 }, { "epoch": 0.009876719871338226, "grad_norm": 0.2007680443935168, "learning_rate": 9.997667720979625e-05, "loss": 2.2238, "step": 1270 }, { "epoch": 0.009884496816118808, "grad_norm": 0.19910814464048265, "learning_rate": 9.99766398813125e-05, "loss": 2.2142, "step": 1271 }, { "epoch": 0.009892273760899389, "grad_norm": 0.183564347671667, "learning_rate": 9.99766025229872e-05, "loss": 2.1941, "step": 1272 }, { "epoch": 0.00990005070567997, "grad_norm": 0.1514040467285261, "learning_rate": 9.997656513482038e-05, "loss": 2.235, "step": 1273 }, { "epoch": 0.009907827650460551, "grad_norm": 0.2620185141814745, "learning_rate": 9.997652771681205e-05, "loss": 2.2572, "step": 1274 }, { "epoch": 0.009915604595241132, "grad_norm": 0.3043355091653213, "learning_rate": 9.997649026896225e-05, "loss": 2.1474, "step": 1275 }, { "epoch": 0.009923381540021713, "grad_norm": 0.20431707042448274, "learning_rate": 9.997645279127101e-05, "loss": 2.2685, "step": 1276 }, { "epoch": 0.009931158484802294, "grad_norm": 0.1427692563698559, "learning_rate": 9.997641528373833e-05, "loss": 2.1631, "step": 1277 }, { "epoch": 0.009938935429582876, "grad_norm": 0.2720521675143776, "learning_rate": 9.997637774636424e-05, "loss": 2.2324, "step": 1278 }, { "epoch": 0.009946712374363457, "grad_norm": 0.26295378587283913, "learning_rate": 9.997634017914875e-05, "loss": 2.2118, "step": 1279 }, { "epoch": 0.009954489319144038, "grad_norm": 0.1381238959217263, "learning_rate": 9.99763025820919e-05, "loss": 2.2179, "step": 1280 }, { "epoch": 0.00996226626392462, "grad_norm": 0.14451168531277617, "learning_rate": 9.997626495519372e-05, "loss": 2.2418, "step": 1281 }, { "epoch": 0.0099700432087052, "grad_norm": 0.16388713069376215, "learning_rate": 9.997622729845421e-05, "loss": 2.2183, "step": 1282 }, { "epoch": 0.009977820153485781, "grad_norm": 0.1535096297351309, "learning_rate": 9.997618961187341e-05, "loss": 2.2477, "step": 1283 }, { "epoch": 0.009985597098266363, "grad_norm": 0.15574549661853007, "learning_rate": 9.997615189545131e-05, "loss": 2.24, "step": 1284 }, { "epoch": 0.009993374043046946, "grad_norm": 0.135204213265533, "learning_rate": 9.997611414918798e-05, "loss": 2.1975, "step": 1285 }, { "epoch": 0.010001150987827527, "grad_norm": 0.13148950325897438, "learning_rate": 9.997607637308341e-05, "loss": 2.2513, "step": 1286 }, { "epoch": 0.010008927932608108, "grad_norm": 0.1322830338435023, "learning_rate": 9.997603856713764e-05, "loss": 2.2585, "step": 1287 }, { "epoch": 0.010016704877388689, "grad_norm": 0.1245064879144316, "learning_rate": 9.997600073135068e-05, "loss": 2.1456, "step": 1288 }, { "epoch": 0.01002448182216927, "grad_norm": 0.1233301348041187, "learning_rate": 9.997596286572257e-05, "loss": 2.2361, "step": 1289 }, { "epoch": 0.010032258766949851, "grad_norm": 0.1308362984960193, "learning_rate": 9.99759249702533e-05, "loss": 2.2856, "step": 1290 }, { "epoch": 0.010040035711730433, "grad_norm": 0.13329870914688507, "learning_rate": 9.997588704494293e-05, "loss": 2.2015, "step": 1291 }, { "epoch": 0.010047812656511014, "grad_norm": 0.15836717241470274, "learning_rate": 9.997584908979146e-05, "loss": 2.2451, "step": 1292 }, { "epoch": 0.010055589601291595, "grad_norm": 0.26783334592465036, "learning_rate": 9.997581110479892e-05, "loss": 2.213, "step": 1293 }, { "epoch": 0.010063366546072176, "grad_norm": 0.13975134446417967, "learning_rate": 9.997577308996531e-05, "loss": 2.2572, "step": 1294 }, { "epoch": 0.010071143490852757, "grad_norm": 0.15513106410480518, "learning_rate": 9.99757350452907e-05, "loss": 2.2452, "step": 1295 }, { "epoch": 0.010078920435633338, "grad_norm": 0.14705648193103601, "learning_rate": 9.997569697077508e-05, "loss": 2.1946, "step": 1296 }, { "epoch": 0.01008669738041392, "grad_norm": 0.13798309755380153, "learning_rate": 9.997565886641847e-05, "loss": 2.2354, "step": 1297 }, { "epoch": 0.0100944743251945, "grad_norm": 0.21662794534690474, "learning_rate": 9.99756207322209e-05, "loss": 2.1923, "step": 1298 }, { "epoch": 0.010102251269975082, "grad_norm": 0.24497710839945167, "learning_rate": 9.997558256818242e-05, "loss": 2.2566, "step": 1299 }, { "epoch": 0.010110028214755665, "grad_norm": 0.2190623191786516, "learning_rate": 9.9975544374303e-05, "loss": 2.1964, "step": 1300 }, { "epoch": 0.010117805159536246, "grad_norm": 0.13608183115493436, "learning_rate": 9.997550615058271e-05, "loss": 2.2171, "step": 1301 }, { "epoch": 0.010125582104316827, "grad_norm": 0.18844455093550486, "learning_rate": 9.997546789702153e-05, "loss": 2.2604, "step": 1302 }, { "epoch": 0.010133359049097408, "grad_norm": 0.2266043610243144, "learning_rate": 9.997542961361952e-05, "loss": 2.2308, "step": 1303 }, { "epoch": 0.01014113599387799, "grad_norm": 1.4651777333925418, "learning_rate": 9.99753913003767e-05, "loss": 2.2149, "step": 1304 }, { "epoch": 0.01014891293865857, "grad_norm": 0.18137699902613574, "learning_rate": 9.997535295729305e-05, "loss": 2.2721, "step": 1305 }, { "epoch": 0.010156689883439152, "grad_norm": 0.3223159810749645, "learning_rate": 9.997531458436865e-05, "loss": 2.2302, "step": 1306 }, { "epoch": 0.010164466828219733, "grad_norm": 0.2086613269739043, "learning_rate": 9.99752761816035e-05, "loss": 2.2378, "step": 1307 }, { "epoch": 0.010172243773000314, "grad_norm": 0.1612039837443635, "learning_rate": 9.997523774899759e-05, "loss": 2.1896, "step": 1308 }, { "epoch": 0.010180020717780895, "grad_norm": 2.9109750565763757, "learning_rate": 9.997519928655101e-05, "loss": 2.2618, "step": 1309 }, { "epoch": 0.010187797662561476, "grad_norm": 0.24558602049960052, "learning_rate": 9.997516079426372e-05, "loss": 2.2491, "step": 1310 }, { "epoch": 0.010195574607342058, "grad_norm": 0.5790628082013305, "learning_rate": 9.99751222721358e-05, "loss": 2.2067, "step": 1311 }, { "epoch": 0.010203351552122639, "grad_norm": 0.22381536595884724, "learning_rate": 9.997508372016721e-05, "loss": 2.2506, "step": 1312 }, { "epoch": 0.01021112849690322, "grad_norm": 1.2890021636976223, "learning_rate": 9.997504513835801e-05, "loss": 2.2309, "step": 1313 }, { "epoch": 0.010218905441683803, "grad_norm": 0.6389960757652277, "learning_rate": 9.997500652670823e-05, "loss": 2.2774, "step": 1314 }, { "epoch": 0.010226682386464384, "grad_norm": 0.24512074235226688, "learning_rate": 9.997496788521787e-05, "loss": 2.177, "step": 1315 }, { "epoch": 0.010234459331244965, "grad_norm": 0.2228826217333211, "learning_rate": 9.997492921388698e-05, "loss": 2.242, "step": 1316 }, { "epoch": 0.010242236276025546, "grad_norm": 0.5422907053420832, "learning_rate": 9.997489051271554e-05, "loss": 2.2625, "step": 1317 }, { "epoch": 0.010250013220806127, "grad_norm": 1.4109227546179164, "learning_rate": 9.997485178170363e-05, "loss": 2.2358, "step": 1318 }, { "epoch": 0.010257790165586709, "grad_norm": 0.311613190877352, "learning_rate": 9.997481302085123e-05, "loss": 2.2564, "step": 1319 }, { "epoch": 0.01026556711036729, "grad_norm": 0.5170804037834733, "learning_rate": 9.997477423015838e-05, "loss": 2.2344, "step": 1320 }, { "epoch": 0.010273344055147871, "grad_norm": 0.5062975888851058, "learning_rate": 9.99747354096251e-05, "loss": 2.2304, "step": 1321 }, { "epoch": 0.010281120999928452, "grad_norm": 0.34578255365438, "learning_rate": 9.99746965592514e-05, "loss": 2.2202, "step": 1322 }, { "epoch": 0.010288897944709033, "grad_norm": 0.44842616220928677, "learning_rate": 9.997465767903733e-05, "loss": 2.2531, "step": 1323 }, { "epoch": 0.010296674889489614, "grad_norm": 0.6989242455441902, "learning_rate": 9.99746187689829e-05, "loss": 2.3039, "step": 1324 }, { "epoch": 0.010304451834270196, "grad_norm": 7.238882883003214, "learning_rate": 9.997457982908813e-05, "loss": 2.2477, "step": 1325 }, { "epoch": 0.010312228779050777, "grad_norm": 0.8475637229328166, "learning_rate": 9.997454085935305e-05, "loss": 2.2616, "step": 1326 }, { "epoch": 0.010320005723831358, "grad_norm": 1.3809762684686564, "learning_rate": 9.997450185977767e-05, "loss": 2.2486, "step": 1327 }, { "epoch": 0.010327782668611939, "grad_norm": 0.8979862964150682, "learning_rate": 9.997446283036202e-05, "loss": 2.3052, "step": 1328 }, { "epoch": 0.010335559613392522, "grad_norm": 1.2232089349969062, "learning_rate": 9.997442377110614e-05, "loss": 2.3398, "step": 1329 }, { "epoch": 0.010343336558173103, "grad_norm": 0.7760369958430254, "learning_rate": 9.997438468201003e-05, "loss": 2.272, "step": 1330 }, { "epoch": 0.010351113502953684, "grad_norm": 0.5509525632903131, "learning_rate": 9.997434556307372e-05, "loss": 2.2738, "step": 1331 }, { "epoch": 0.010358890447734265, "grad_norm": 0.6100051154406791, "learning_rate": 9.997430641429726e-05, "loss": 2.2676, "step": 1332 }, { "epoch": 0.010366667392514847, "grad_norm": 0.8893917979463929, "learning_rate": 9.997426723568062e-05, "loss": 2.272, "step": 1333 }, { "epoch": 0.010374444337295428, "grad_norm": 2.22801924595185, "learning_rate": 9.997422802722387e-05, "loss": 2.3345, "step": 1334 }, { "epoch": 0.010382221282076009, "grad_norm": 0.6070914318167961, "learning_rate": 9.9974188788927e-05, "loss": 2.2843, "step": 1335 }, { "epoch": 0.01038999822685659, "grad_norm": 0.5337789928971445, "learning_rate": 9.997414952079007e-05, "loss": 2.2433, "step": 1336 }, { "epoch": 0.010397775171637171, "grad_norm": 0.45762051203150544, "learning_rate": 9.997411022281307e-05, "loss": 2.3286, "step": 1337 }, { "epoch": 0.010405552116417752, "grad_norm": 0.8974971309847571, "learning_rate": 9.997407089499603e-05, "loss": 2.322, "step": 1338 }, { "epoch": 0.010413329061198334, "grad_norm": 9.675498694648763, "learning_rate": 9.9974031537339e-05, "loss": 2.5643, "step": 1339 }, { "epoch": 0.010421106005978915, "grad_norm": 0.8555600768546242, "learning_rate": 9.997399214984195e-05, "loss": 2.2931, "step": 1340 }, { "epoch": 0.010428882950759496, "grad_norm": 0.9880351530273892, "learning_rate": 9.997395273250498e-05, "loss": 2.2396, "step": 1341 }, { "epoch": 0.010436659895540077, "grad_norm": 2.9808027849681538, "learning_rate": 9.997391328532803e-05, "loss": 2.311, "step": 1342 }, { "epoch": 0.010444436840320658, "grad_norm": 1.0689757929872468, "learning_rate": 9.997387380831119e-05, "loss": 2.3636, "step": 1343 }, { "epoch": 0.010452213785101241, "grad_norm": 0.6820865782369157, "learning_rate": 9.997383430145446e-05, "loss": 2.3252, "step": 1344 }, { "epoch": 0.010459990729881822, "grad_norm": 0.7781823965066365, "learning_rate": 9.997379476475784e-05, "loss": 2.255, "step": 1345 }, { "epoch": 0.010467767674662403, "grad_norm": 0.9511388669606913, "learning_rate": 9.997375519822138e-05, "loss": 2.2737, "step": 1346 }, { "epoch": 0.010475544619442985, "grad_norm": 1.30518970898008, "learning_rate": 9.997371560184511e-05, "loss": 2.2326, "step": 1347 }, { "epoch": 0.010483321564223566, "grad_norm": 1.1038230331897025, "learning_rate": 9.997367597562904e-05, "loss": 2.3438, "step": 1348 }, { "epoch": 0.010491098509004147, "grad_norm": 0.7072499645496436, "learning_rate": 9.997363631957319e-05, "loss": 2.2636, "step": 1349 }, { "epoch": 0.010498875453784728, "grad_norm": 0.40084187358219175, "learning_rate": 9.99735966336776e-05, "loss": 2.264, "step": 1350 }, { "epoch": 0.01050665239856531, "grad_norm": 0.3942278427096939, "learning_rate": 9.997355691794228e-05, "loss": 2.2911, "step": 1351 }, { "epoch": 0.01051442934334589, "grad_norm": 0.4524794032543244, "learning_rate": 9.997351717236724e-05, "loss": 2.2142, "step": 1352 }, { "epoch": 0.010522206288126472, "grad_norm": 0.4220429447573467, "learning_rate": 9.997347739695254e-05, "loss": 2.2338, "step": 1353 }, { "epoch": 0.010529983232907053, "grad_norm": 0.25255741985238217, "learning_rate": 9.997343759169817e-05, "loss": 2.2189, "step": 1354 }, { "epoch": 0.010537760177687634, "grad_norm": 0.36566399853686404, "learning_rate": 9.997339775660418e-05, "loss": 2.2749, "step": 1355 }, { "epoch": 0.010545537122468215, "grad_norm": 0.3771702598382865, "learning_rate": 9.997335789167058e-05, "loss": 2.2327, "step": 1356 }, { "epoch": 0.010553314067248796, "grad_norm": 0.20706714418781372, "learning_rate": 9.997331799689738e-05, "loss": 2.2365, "step": 1357 }, { "epoch": 0.010561091012029377, "grad_norm": 0.37726166439775183, "learning_rate": 9.997327807228465e-05, "loss": 2.2332, "step": 1358 }, { "epoch": 0.01056886795680996, "grad_norm": 0.30382220038467345, "learning_rate": 9.997323811783236e-05, "loss": 2.2401, "step": 1359 }, { "epoch": 0.010576644901590541, "grad_norm": 0.2801322832839745, "learning_rate": 9.997319813354055e-05, "loss": 2.2742, "step": 1360 }, { "epoch": 0.010584421846371123, "grad_norm": 0.3834556247447373, "learning_rate": 9.997315811940928e-05, "loss": 2.1809, "step": 1361 }, { "epoch": 0.010592198791151704, "grad_norm": 0.24729718627794325, "learning_rate": 9.997311807543852e-05, "loss": 2.2325, "step": 1362 }, { "epoch": 0.010599975735932285, "grad_norm": 0.24441884006936496, "learning_rate": 9.997307800162832e-05, "loss": 2.2258, "step": 1363 }, { "epoch": 0.010607752680712866, "grad_norm": 0.40115178739068014, "learning_rate": 9.997303789797871e-05, "loss": 2.2237, "step": 1364 }, { "epoch": 0.010615529625493447, "grad_norm": 0.22161012052421525, "learning_rate": 9.997299776448972e-05, "loss": 2.2229, "step": 1365 }, { "epoch": 0.010623306570274028, "grad_norm": 0.1857833207788664, "learning_rate": 9.997295760116134e-05, "loss": 2.2586, "step": 1366 }, { "epoch": 0.01063108351505461, "grad_norm": 0.23508453810784546, "learning_rate": 9.997291740799362e-05, "loss": 2.2528, "step": 1367 }, { "epoch": 0.01063886045983519, "grad_norm": 0.13602342746736612, "learning_rate": 9.997287718498658e-05, "loss": 2.1488, "step": 1368 }, { "epoch": 0.010646637404615772, "grad_norm": 0.1889627879921102, "learning_rate": 9.997283693214025e-05, "loss": 2.2559, "step": 1369 }, { "epoch": 0.010654414349396353, "grad_norm": 0.2412916726403704, "learning_rate": 9.997279664945464e-05, "loss": 2.2677, "step": 1370 }, { "epoch": 0.010662191294176934, "grad_norm": 0.15030642549556977, "learning_rate": 9.997275633692977e-05, "loss": 2.2436, "step": 1371 }, { "epoch": 0.010669968238957515, "grad_norm": 0.16109344211665216, "learning_rate": 9.99727159945657e-05, "loss": 2.2644, "step": 1372 }, { "epoch": 0.010677745183738097, "grad_norm": 0.17629438825559907, "learning_rate": 9.997267562236242e-05, "loss": 2.2185, "step": 1373 }, { "epoch": 0.01068552212851868, "grad_norm": 0.15346701308445895, "learning_rate": 9.997263522031995e-05, "loss": 2.2274, "step": 1374 }, { "epoch": 0.01069329907329926, "grad_norm": 0.13665167143951376, "learning_rate": 9.997259478843833e-05, "loss": 2.1995, "step": 1375 }, { "epoch": 0.010701076018079842, "grad_norm": 0.14608426006681172, "learning_rate": 9.99725543267176e-05, "loss": 2.1966, "step": 1376 }, { "epoch": 0.010708852962860423, "grad_norm": 0.1358898971965564, "learning_rate": 9.997251383515773e-05, "loss": 2.2263, "step": 1377 }, { "epoch": 0.010716629907641004, "grad_norm": 0.12410695021560912, "learning_rate": 9.99724733137588e-05, "loss": 2.1852, "step": 1378 }, { "epoch": 0.010724406852421585, "grad_norm": 0.13902961791010887, "learning_rate": 9.997243276252082e-05, "loss": 2.2182, "step": 1379 }, { "epoch": 0.010732183797202166, "grad_norm": 0.12428140140830804, "learning_rate": 9.997239218144381e-05, "loss": 2.1658, "step": 1380 }, { "epoch": 0.010739960741982748, "grad_norm": 0.12816524515236496, "learning_rate": 9.997235157052777e-05, "loss": 2.1533, "step": 1381 }, { "epoch": 0.010747737686763329, "grad_norm": 0.4304834154844412, "learning_rate": 9.997231092977277e-05, "loss": 2.2494, "step": 1382 }, { "epoch": 0.01075551463154391, "grad_norm": 0.12313179552435359, "learning_rate": 9.997227025917882e-05, "loss": 2.2343, "step": 1383 }, { "epoch": 0.010763291576324491, "grad_norm": 0.12394792675586612, "learning_rate": 9.99722295587459e-05, "loss": 2.2489, "step": 1384 }, { "epoch": 0.010771068521105072, "grad_norm": 0.1333052921422295, "learning_rate": 9.99721888284741e-05, "loss": 2.2199, "step": 1385 }, { "epoch": 0.010778845465885653, "grad_norm": 0.1384494455349185, "learning_rate": 9.99721480683634e-05, "loss": 2.2321, "step": 1386 }, { "epoch": 0.010786622410666235, "grad_norm": 0.13018904356922317, "learning_rate": 9.997210727841384e-05, "loss": 2.181, "step": 1387 }, { "epoch": 0.010794399355446816, "grad_norm": 0.12907446503335626, "learning_rate": 9.997206645862544e-05, "loss": 2.1886, "step": 1388 }, { "epoch": 0.010802176300227399, "grad_norm": 0.13958073078952854, "learning_rate": 9.997202560899824e-05, "loss": 2.162, "step": 1389 }, { "epoch": 0.01080995324500798, "grad_norm": 0.13697641504910177, "learning_rate": 9.997198472953222e-05, "loss": 2.2505, "step": 1390 }, { "epoch": 0.010817730189788561, "grad_norm": 0.137594283510862, "learning_rate": 9.997194382022745e-05, "loss": 2.1641, "step": 1391 }, { "epoch": 0.010825507134569142, "grad_norm": 0.1257887946811963, "learning_rate": 9.997190288108395e-05, "loss": 2.1684, "step": 1392 }, { "epoch": 0.010833284079349723, "grad_norm": 0.1161682407832425, "learning_rate": 9.997186191210173e-05, "loss": 2.1945, "step": 1393 }, { "epoch": 0.010841061024130304, "grad_norm": 0.13293285325469448, "learning_rate": 9.997182091328083e-05, "loss": 2.2369, "step": 1394 }, { "epoch": 0.010848837968910886, "grad_norm": 0.11736340329408558, "learning_rate": 9.997177988462125e-05, "loss": 2.1956, "step": 1395 }, { "epoch": 0.010856614913691467, "grad_norm": 0.12603657258545592, "learning_rate": 9.997173882612303e-05, "loss": 2.1976, "step": 1396 }, { "epoch": 0.010864391858472048, "grad_norm": 0.11415191789005998, "learning_rate": 9.997169773778619e-05, "loss": 2.2581, "step": 1397 }, { "epoch": 0.010872168803252629, "grad_norm": 0.1270617950124437, "learning_rate": 9.997165661961076e-05, "loss": 2.2526, "step": 1398 }, { "epoch": 0.01087994574803321, "grad_norm": 0.12105395977028902, "learning_rate": 9.997161547159675e-05, "loss": 2.1963, "step": 1399 }, { "epoch": 0.010887722692813791, "grad_norm": 0.12148319319299036, "learning_rate": 9.997157429374421e-05, "loss": 2.2431, "step": 1400 }, { "epoch": 0.010895499637594373, "grad_norm": 0.12638361529582628, "learning_rate": 9.997153308605315e-05, "loss": 2.1673, "step": 1401 }, { "epoch": 0.010903276582374954, "grad_norm": 0.113818265001305, "learning_rate": 9.997149184852359e-05, "loss": 2.196, "step": 1402 }, { "epoch": 0.010911053527155535, "grad_norm": 0.1168209106648843, "learning_rate": 9.997145058115555e-05, "loss": 2.1768, "step": 1403 }, { "epoch": 0.010918830471936118, "grad_norm": 0.1203041058324103, "learning_rate": 9.99714092839491e-05, "loss": 2.2138, "step": 1404 }, { "epoch": 0.010926607416716699, "grad_norm": 0.1234735673834316, "learning_rate": 9.99713679569042e-05, "loss": 2.2343, "step": 1405 }, { "epoch": 0.01093438436149728, "grad_norm": 0.12714730293669002, "learning_rate": 9.997132660002091e-05, "loss": 2.2411, "step": 1406 }, { "epoch": 0.010942161306277861, "grad_norm": 0.11211517989263922, "learning_rate": 9.997128521329925e-05, "loss": 2.1603, "step": 1407 }, { "epoch": 0.010949938251058442, "grad_norm": 0.1190991727497235, "learning_rate": 9.997124379673923e-05, "loss": 2.1639, "step": 1408 }, { "epoch": 0.010957715195839024, "grad_norm": 0.1188164576646129, "learning_rate": 9.99712023503409e-05, "loss": 2.2522, "step": 1409 }, { "epoch": 0.010965492140619605, "grad_norm": 0.11356187657184343, "learning_rate": 9.997116087410428e-05, "loss": 2.2323, "step": 1410 }, { "epoch": 0.010973269085400186, "grad_norm": 0.11818217585714569, "learning_rate": 9.997111936802939e-05, "loss": 2.2409, "step": 1411 }, { "epoch": 0.010981046030180767, "grad_norm": 0.19258780823855048, "learning_rate": 9.997107783211624e-05, "loss": 2.2087, "step": 1412 }, { "epoch": 0.010988822974961348, "grad_norm": 0.11922185113907241, "learning_rate": 9.997103626636487e-05, "loss": 2.1708, "step": 1413 }, { "epoch": 0.01099659991974193, "grad_norm": 0.12900963615455144, "learning_rate": 9.997099467077529e-05, "loss": 2.1842, "step": 1414 }, { "epoch": 0.01100437686452251, "grad_norm": 0.3120830589586333, "learning_rate": 9.997095304534756e-05, "loss": 2.1957, "step": 1415 }, { "epoch": 0.011012153809303092, "grad_norm": 0.13061704771040944, "learning_rate": 9.997091139008166e-05, "loss": 2.2665, "step": 1416 }, { "epoch": 0.011019930754083673, "grad_norm": 0.1321295153349464, "learning_rate": 9.997086970497764e-05, "loss": 2.1752, "step": 1417 }, { "epoch": 0.011027707698864256, "grad_norm": 0.11368447794528953, "learning_rate": 9.997082799003554e-05, "loss": 2.1687, "step": 1418 }, { "epoch": 0.011035484643644837, "grad_norm": 0.18492899316863487, "learning_rate": 9.997078624525535e-05, "loss": 2.1916, "step": 1419 }, { "epoch": 0.011043261588425418, "grad_norm": 0.2535997050360963, "learning_rate": 9.997074447063712e-05, "loss": 2.1884, "step": 1420 }, { "epoch": 0.011051038533206, "grad_norm": 0.1251065309024154, "learning_rate": 9.997070266618085e-05, "loss": 2.2089, "step": 1421 }, { "epoch": 0.01105881547798658, "grad_norm": 0.13174430819934435, "learning_rate": 9.99706608318866e-05, "loss": 2.249, "step": 1422 }, { "epoch": 0.011066592422767162, "grad_norm": 0.1352715609462399, "learning_rate": 9.997061896775436e-05, "loss": 2.1963, "step": 1423 }, { "epoch": 0.011074369367547743, "grad_norm": 0.12371214804980017, "learning_rate": 9.997057707378419e-05, "loss": 2.1892, "step": 1424 }, { "epoch": 0.011082146312328324, "grad_norm": 0.13031385209365032, "learning_rate": 9.997053514997608e-05, "loss": 2.1396, "step": 1425 }, { "epoch": 0.011089923257108905, "grad_norm": 0.1237573227880835, "learning_rate": 9.997049319633007e-05, "loss": 2.1734, "step": 1426 }, { "epoch": 0.011097700201889486, "grad_norm": 0.1234419007735947, "learning_rate": 9.99704512128462e-05, "loss": 2.2424, "step": 1427 }, { "epoch": 0.011105477146670067, "grad_norm": 0.1322671818939037, "learning_rate": 9.997040919952448e-05, "loss": 2.1662, "step": 1428 }, { "epoch": 0.011113254091450649, "grad_norm": 0.17816781484353503, "learning_rate": 9.997036715636493e-05, "loss": 2.1731, "step": 1429 }, { "epoch": 0.01112103103623123, "grad_norm": 0.28667400069427784, "learning_rate": 9.997032508336758e-05, "loss": 2.2632, "step": 1430 }, { "epoch": 0.011128807981011811, "grad_norm": 0.16106089135612794, "learning_rate": 9.997028298053246e-05, "loss": 2.2363, "step": 1431 }, { "epoch": 0.011136584925792392, "grad_norm": 0.12968298329283906, "learning_rate": 9.997024084785958e-05, "loss": 2.2099, "step": 1432 }, { "epoch": 0.011144361870572975, "grad_norm": 0.14455551060608873, "learning_rate": 9.997019868534901e-05, "loss": 2.197, "step": 1433 }, { "epoch": 0.011152138815353556, "grad_norm": 0.12428712696438478, "learning_rate": 9.99701564930007e-05, "loss": 2.1844, "step": 1434 }, { "epoch": 0.011159915760134137, "grad_norm": 0.13547272829590354, "learning_rate": 9.997011427081474e-05, "loss": 2.2111, "step": 1435 }, { "epoch": 0.011167692704914719, "grad_norm": 0.1252426376390414, "learning_rate": 9.997007201879114e-05, "loss": 2.2392, "step": 1436 }, { "epoch": 0.0111754696496953, "grad_norm": 0.121774074604515, "learning_rate": 9.99700297369299e-05, "loss": 2.1638, "step": 1437 }, { "epoch": 0.01118324659447588, "grad_norm": 0.12570712238032442, "learning_rate": 9.996998742523108e-05, "loss": 2.1715, "step": 1438 }, { "epoch": 0.011191023539256462, "grad_norm": 0.13173928290923523, "learning_rate": 9.996994508369466e-05, "loss": 2.1814, "step": 1439 }, { "epoch": 0.011198800484037043, "grad_norm": 0.12235628869508422, "learning_rate": 9.996990271232072e-05, "loss": 2.1768, "step": 1440 }, { "epoch": 0.011206577428817624, "grad_norm": 0.1274703966821024, "learning_rate": 9.996986031110925e-05, "loss": 2.2064, "step": 1441 }, { "epoch": 0.011214354373598205, "grad_norm": 0.13326385499528687, "learning_rate": 9.996981788006029e-05, "loss": 2.2337, "step": 1442 }, { "epoch": 0.011222131318378787, "grad_norm": 0.11990593778498466, "learning_rate": 9.996977541917385e-05, "loss": 2.1698, "step": 1443 }, { "epoch": 0.011229908263159368, "grad_norm": 0.1322080403437459, "learning_rate": 9.996973292844995e-05, "loss": 2.2009, "step": 1444 }, { "epoch": 0.011237685207939949, "grad_norm": 0.11464346852301827, "learning_rate": 9.996969040788866e-05, "loss": 2.2349, "step": 1445 }, { "epoch": 0.01124546215272053, "grad_norm": 0.14021429277861167, "learning_rate": 9.996964785748996e-05, "loss": 2.2169, "step": 1446 }, { "epoch": 0.011253239097501111, "grad_norm": 0.14664192881786714, "learning_rate": 9.99696052772539e-05, "loss": 2.1979, "step": 1447 }, { "epoch": 0.011261016042281694, "grad_norm": 0.13666598334214936, "learning_rate": 9.99695626671805e-05, "loss": 2.2137, "step": 1448 }, { "epoch": 0.011268792987062275, "grad_norm": 0.12385986332821071, "learning_rate": 9.996952002726976e-05, "loss": 2.211, "step": 1449 }, { "epoch": 0.011276569931842857, "grad_norm": 0.11866369502776003, "learning_rate": 9.996947735752175e-05, "loss": 2.2264, "step": 1450 }, { "epoch": 0.011284346876623438, "grad_norm": 0.12693986455485295, "learning_rate": 9.996943465793645e-05, "loss": 2.2364, "step": 1451 }, { "epoch": 0.011292123821404019, "grad_norm": 0.3005078857371476, "learning_rate": 9.996939192851394e-05, "loss": 2.2524, "step": 1452 }, { "epoch": 0.0112999007661846, "grad_norm": 0.11996855946567173, "learning_rate": 9.996934916925418e-05, "loss": 2.1654, "step": 1453 }, { "epoch": 0.011307677710965181, "grad_norm": 0.1286007356772065, "learning_rate": 9.996930638015725e-05, "loss": 2.2183, "step": 1454 }, { "epoch": 0.011315454655745762, "grad_norm": 0.19610757505642548, "learning_rate": 9.996926356122314e-05, "loss": 2.2308, "step": 1455 }, { "epoch": 0.011323231600526344, "grad_norm": 0.13915636054672043, "learning_rate": 9.996922071245191e-05, "loss": 2.202, "step": 1456 }, { "epoch": 0.011331008545306925, "grad_norm": 0.12293922473210828, "learning_rate": 9.996917783384355e-05, "loss": 2.2092, "step": 1457 }, { "epoch": 0.011338785490087506, "grad_norm": 0.12200371143944982, "learning_rate": 9.996913492539812e-05, "loss": 2.2484, "step": 1458 }, { "epoch": 0.011346562434868087, "grad_norm": 0.1231014562642159, "learning_rate": 9.996909198711562e-05, "loss": 2.255, "step": 1459 }, { "epoch": 0.011354339379648668, "grad_norm": 0.13717458486859574, "learning_rate": 9.996904901899606e-05, "loss": 2.2175, "step": 1460 }, { "epoch": 0.01136211632442925, "grad_norm": 0.14846395091205566, "learning_rate": 9.99690060210395e-05, "loss": 2.1794, "step": 1461 }, { "epoch": 0.01136989326920983, "grad_norm": 0.12202689392482415, "learning_rate": 9.996896299324597e-05, "loss": 2.1509, "step": 1462 }, { "epoch": 0.011377670213990413, "grad_norm": 0.23042233439542628, "learning_rate": 9.996891993561547e-05, "loss": 2.1793, "step": 1463 }, { "epoch": 0.011385447158770995, "grad_norm": 0.12634935584469528, "learning_rate": 9.996887684814803e-05, "loss": 2.2023, "step": 1464 }, { "epoch": 0.011393224103551576, "grad_norm": 0.11723647042632906, "learning_rate": 9.99688337308437e-05, "loss": 2.188, "step": 1465 }, { "epoch": 0.011401001048332157, "grad_norm": 0.12190956427000323, "learning_rate": 9.996879058370247e-05, "loss": 2.1819, "step": 1466 }, { "epoch": 0.011408777993112738, "grad_norm": 0.13029097167026713, "learning_rate": 9.996874740672439e-05, "loss": 2.1822, "step": 1467 }, { "epoch": 0.01141655493789332, "grad_norm": 0.12694242578256482, "learning_rate": 9.996870419990947e-05, "loss": 2.2012, "step": 1468 }, { "epoch": 0.0114243318826739, "grad_norm": 0.12611098029320011, "learning_rate": 9.996866096325775e-05, "loss": 2.2381, "step": 1469 }, { "epoch": 0.011432108827454482, "grad_norm": 0.12116840779645345, "learning_rate": 9.996861769676925e-05, "loss": 2.1619, "step": 1470 }, { "epoch": 0.011439885772235063, "grad_norm": 0.1291084745782598, "learning_rate": 9.9968574400444e-05, "loss": 2.2515, "step": 1471 }, { "epoch": 0.011447662717015644, "grad_norm": 0.12397570420729384, "learning_rate": 9.9968531074282e-05, "loss": 2.189, "step": 1472 }, { "epoch": 0.011455439661796225, "grad_norm": 0.18541872112859822, "learning_rate": 9.996848771828334e-05, "loss": 2.1965, "step": 1473 }, { "epoch": 0.011463216606576806, "grad_norm": 0.11658897730842008, "learning_rate": 9.996844433244797e-05, "loss": 2.2071, "step": 1474 }, { "epoch": 0.011470993551357387, "grad_norm": 0.13490349539542057, "learning_rate": 9.996840091677596e-05, "loss": 2.256, "step": 1475 }, { "epoch": 0.011478770496137969, "grad_norm": 0.15606893117430173, "learning_rate": 9.996835747126732e-05, "loss": 2.1516, "step": 1476 }, { "epoch": 0.01148654744091855, "grad_norm": 0.12116797287209581, "learning_rate": 9.99683139959221e-05, "loss": 2.2044, "step": 1477 }, { "epoch": 0.011494324385699133, "grad_norm": 0.1415521394719542, "learning_rate": 9.99682704907403e-05, "loss": 2.1711, "step": 1478 }, { "epoch": 0.011502101330479714, "grad_norm": 0.124454579873994, "learning_rate": 9.996822695572193e-05, "loss": 2.2084, "step": 1479 }, { "epoch": 0.011509878275260295, "grad_norm": 0.12157302385013694, "learning_rate": 9.996818339086707e-05, "loss": 2.1823, "step": 1480 }, { "epoch": 0.011517655220040876, "grad_norm": 0.13237783611128637, "learning_rate": 9.99681397961757e-05, "loss": 2.2122, "step": 1481 }, { "epoch": 0.011525432164821457, "grad_norm": 0.13824785007536136, "learning_rate": 9.996809617164785e-05, "loss": 2.1942, "step": 1482 }, { "epoch": 0.011533209109602038, "grad_norm": 0.11688258079019641, "learning_rate": 9.996805251728357e-05, "loss": 2.191, "step": 1483 }, { "epoch": 0.01154098605438262, "grad_norm": 0.1331930132588146, "learning_rate": 9.996800883308288e-05, "loss": 2.2027, "step": 1484 }, { "epoch": 0.0115487629991632, "grad_norm": 0.12268018296029633, "learning_rate": 9.99679651190458e-05, "loss": 2.1843, "step": 1485 }, { "epoch": 0.011556539943943782, "grad_norm": 0.11730556925393038, "learning_rate": 9.996792137517235e-05, "loss": 2.2019, "step": 1486 }, { "epoch": 0.011564316888724363, "grad_norm": 0.17037133294301365, "learning_rate": 9.996787760146256e-05, "loss": 2.2179, "step": 1487 }, { "epoch": 0.011572093833504944, "grad_norm": 0.16523404188680366, "learning_rate": 9.996783379791645e-05, "loss": 2.1527, "step": 1488 }, { "epoch": 0.011579870778285525, "grad_norm": 0.1424064579411381, "learning_rate": 9.996778996453407e-05, "loss": 2.204, "step": 1489 }, { "epoch": 0.011587647723066107, "grad_norm": 0.13282607033659913, "learning_rate": 9.996774610131543e-05, "loss": 2.2143, "step": 1490 }, { "epoch": 0.011595424667846688, "grad_norm": 0.16102147305251965, "learning_rate": 9.996770220826055e-05, "loss": 2.2472, "step": 1491 }, { "epoch": 0.011603201612627269, "grad_norm": 0.14871258307215876, "learning_rate": 9.996765828536945e-05, "loss": 2.2574, "step": 1492 }, { "epoch": 0.011610978557407852, "grad_norm": 0.11965169537157645, "learning_rate": 9.996761433264218e-05, "loss": 2.1382, "step": 1493 }, { "epoch": 0.011618755502188433, "grad_norm": 0.1282925234299939, "learning_rate": 9.996757035007877e-05, "loss": 2.2342, "step": 1494 }, { "epoch": 0.011626532446969014, "grad_norm": 0.1252545318908515, "learning_rate": 9.996752633767921e-05, "loss": 2.2248, "step": 1495 }, { "epoch": 0.011634309391749595, "grad_norm": 0.12409195975879128, "learning_rate": 9.996748229544356e-05, "loss": 2.1986, "step": 1496 }, { "epoch": 0.011642086336530176, "grad_norm": 0.1509490831045356, "learning_rate": 9.996743822337183e-05, "loss": 2.1908, "step": 1497 }, { "epoch": 0.011649863281310758, "grad_norm": 0.14707173873868598, "learning_rate": 9.996739412146405e-05, "loss": 2.2, "step": 1498 }, { "epoch": 0.011657640226091339, "grad_norm": 0.12724899961663216, "learning_rate": 9.996734998972024e-05, "loss": 2.1914, "step": 1499 }, { "epoch": 0.01166541717087192, "grad_norm": 0.12281465727630352, "learning_rate": 9.996730582814044e-05, "loss": 2.214, "step": 1500 }, { "epoch": 0.011673194115652501, "grad_norm": 0.13687313526065584, "learning_rate": 9.996726163672467e-05, "loss": 2.1704, "step": 1501 }, { "epoch": 0.011680971060433082, "grad_norm": 0.1306269078546722, "learning_rate": 9.996721741547297e-05, "loss": 2.2188, "step": 1502 }, { "epoch": 0.011688748005213663, "grad_norm": 0.12345209364700875, "learning_rate": 9.996717316438532e-05, "loss": 2.1822, "step": 1503 }, { "epoch": 0.011696524949994245, "grad_norm": 0.15128426025862632, "learning_rate": 9.99671288834618e-05, "loss": 2.2278, "step": 1504 }, { "epoch": 0.011704301894774826, "grad_norm": 0.2164663828056315, "learning_rate": 9.996708457270242e-05, "loss": 2.2353, "step": 1505 }, { "epoch": 0.011712078839555407, "grad_norm": 0.12274947499452046, "learning_rate": 9.996704023210718e-05, "loss": 2.1847, "step": 1506 }, { "epoch": 0.01171985578433599, "grad_norm": 0.14956851259687662, "learning_rate": 9.996699586167615e-05, "loss": 2.1303, "step": 1507 }, { "epoch": 0.011727632729116571, "grad_norm": 0.13779252709233886, "learning_rate": 9.996695146140933e-05, "loss": 2.1944, "step": 1508 }, { "epoch": 0.011735409673897152, "grad_norm": 0.1344620247213119, "learning_rate": 9.996690703130675e-05, "loss": 2.249, "step": 1509 }, { "epoch": 0.011743186618677733, "grad_norm": 0.177308132303281, "learning_rate": 9.996686257136843e-05, "loss": 2.206, "step": 1510 }, { "epoch": 0.011750963563458314, "grad_norm": 0.15223389659167733, "learning_rate": 9.996681808159441e-05, "loss": 2.1952, "step": 1511 }, { "epoch": 0.011758740508238896, "grad_norm": 0.12661002991770645, "learning_rate": 9.996677356198472e-05, "loss": 2.2042, "step": 1512 }, { "epoch": 0.011766517453019477, "grad_norm": 0.15073881215487045, "learning_rate": 9.996672901253937e-05, "loss": 2.2325, "step": 1513 }, { "epoch": 0.011774294397800058, "grad_norm": 0.32605119667957044, "learning_rate": 9.996668443325837e-05, "loss": 2.2538, "step": 1514 }, { "epoch": 0.011782071342580639, "grad_norm": 0.2513153090627278, "learning_rate": 9.99666398241418e-05, "loss": 2.1824, "step": 1515 }, { "epoch": 0.01178984828736122, "grad_norm": 0.12380702865919235, "learning_rate": 9.996659518518965e-05, "loss": 2.1622, "step": 1516 }, { "epoch": 0.011797625232141801, "grad_norm": 0.12512439871812442, "learning_rate": 9.996655051640195e-05, "loss": 2.1674, "step": 1517 }, { "epoch": 0.011805402176922383, "grad_norm": 0.134903154865217, "learning_rate": 9.996650581777873e-05, "loss": 2.1967, "step": 1518 }, { "epoch": 0.011813179121702964, "grad_norm": 0.1273491084372837, "learning_rate": 9.996646108932002e-05, "loss": 2.2588, "step": 1519 }, { "epoch": 0.011820956066483545, "grad_norm": 0.14453174001680721, "learning_rate": 9.996641633102583e-05, "loss": 2.2231, "step": 1520 }, { "epoch": 0.011828733011264126, "grad_norm": 0.1312019193678396, "learning_rate": 9.996637154289622e-05, "loss": 2.2232, "step": 1521 }, { "epoch": 0.011836509956044709, "grad_norm": 0.12665287324119442, "learning_rate": 9.996632672493119e-05, "loss": 2.2121, "step": 1522 }, { "epoch": 0.01184428690082529, "grad_norm": 0.14485591757613386, "learning_rate": 9.996628187713078e-05, "loss": 2.2196, "step": 1523 }, { "epoch": 0.011852063845605871, "grad_norm": 0.12625506889999394, "learning_rate": 9.9966236999495e-05, "loss": 2.1799, "step": 1524 }, { "epoch": 0.011859840790386452, "grad_norm": 0.1359512478539618, "learning_rate": 9.996619209202389e-05, "loss": 2.1493, "step": 1525 }, { "epoch": 0.011867617735167034, "grad_norm": 0.1343097783016391, "learning_rate": 9.996614715471749e-05, "loss": 2.226, "step": 1526 }, { "epoch": 0.011875394679947615, "grad_norm": 0.1217802403344546, "learning_rate": 9.996610218757578e-05, "loss": 2.2407, "step": 1527 }, { "epoch": 0.011883171624728196, "grad_norm": 0.13243980351520915, "learning_rate": 9.996605719059884e-05, "loss": 2.1404, "step": 1528 }, { "epoch": 0.011890948569508777, "grad_norm": 0.1256893196940778, "learning_rate": 9.996601216378667e-05, "loss": 2.2166, "step": 1529 }, { "epoch": 0.011898725514289358, "grad_norm": 0.11773711933567972, "learning_rate": 9.996596710713931e-05, "loss": 2.1932, "step": 1530 }, { "epoch": 0.01190650245906994, "grad_norm": 0.12523336489282946, "learning_rate": 9.996592202065677e-05, "loss": 2.2925, "step": 1531 }, { "epoch": 0.01191427940385052, "grad_norm": 0.1189397116609711, "learning_rate": 9.99658769043391e-05, "loss": 2.2289, "step": 1532 }, { "epoch": 0.011922056348631102, "grad_norm": 0.12369909259922783, "learning_rate": 9.996583175818629e-05, "loss": 2.1548, "step": 1533 }, { "epoch": 0.011929833293411683, "grad_norm": 0.11858471130934949, "learning_rate": 9.996578658219839e-05, "loss": 2.1741, "step": 1534 }, { "epoch": 0.011937610238192264, "grad_norm": 0.5476881984893874, "learning_rate": 9.996574137637545e-05, "loss": 2.2535, "step": 1535 }, { "epoch": 0.011945387182972845, "grad_norm": 0.1197912768918128, "learning_rate": 9.996569614071745e-05, "loss": 2.1953, "step": 1536 }, { "epoch": 0.011953164127753428, "grad_norm": 0.14734705186585112, "learning_rate": 9.996565087522445e-05, "loss": 2.1707, "step": 1537 }, { "epoch": 0.01196094107253401, "grad_norm": 0.2331906431892184, "learning_rate": 9.996560557989647e-05, "loss": 2.1445, "step": 1538 }, { "epoch": 0.01196871801731459, "grad_norm": 0.1692210232247035, "learning_rate": 9.996556025473354e-05, "loss": 2.1662, "step": 1539 }, { "epoch": 0.011976494962095172, "grad_norm": 0.17759280429861385, "learning_rate": 9.996551489973566e-05, "loss": 2.1652, "step": 1540 }, { "epoch": 0.011984271906875753, "grad_norm": 0.18761762390519876, "learning_rate": 9.99654695149029e-05, "loss": 2.1876, "step": 1541 }, { "epoch": 0.011992048851656334, "grad_norm": 0.16530110518799362, "learning_rate": 9.996542410023526e-05, "loss": 2.1826, "step": 1542 }, { "epoch": 0.011999825796436915, "grad_norm": 0.13684014760411128, "learning_rate": 9.996537865573278e-05, "loss": 2.1845, "step": 1543 }, { "epoch": 0.012007602741217496, "grad_norm": 0.12579800974831779, "learning_rate": 9.996533318139547e-05, "loss": 2.2054, "step": 1544 }, { "epoch": 0.012015379685998077, "grad_norm": 0.12941708361832824, "learning_rate": 9.996528767722337e-05, "loss": 2.1946, "step": 1545 }, { "epoch": 0.012023156630778659, "grad_norm": 0.11852289601303516, "learning_rate": 9.99652421432165e-05, "loss": 2.188, "step": 1546 }, { "epoch": 0.01203093357555924, "grad_norm": 0.128037788880594, "learning_rate": 9.99651965793749e-05, "loss": 2.1552, "step": 1547 }, { "epoch": 0.012038710520339821, "grad_norm": 0.12535099520233717, "learning_rate": 9.996515098569859e-05, "loss": 2.2109, "step": 1548 }, { "epoch": 0.012046487465120402, "grad_norm": 0.11749500484171425, "learning_rate": 9.996510536218758e-05, "loss": 2.1658, "step": 1549 }, { "epoch": 0.012054264409900983, "grad_norm": 0.12727155760359904, "learning_rate": 9.996505970884194e-05, "loss": 2.2102, "step": 1550 }, { "epoch": 0.012062041354681564, "grad_norm": 0.1150164709924562, "learning_rate": 9.996501402566164e-05, "loss": 2.2108, "step": 1551 }, { "epoch": 0.012069818299462147, "grad_norm": 0.12710613604055018, "learning_rate": 9.996496831264676e-05, "loss": 2.2079, "step": 1552 }, { "epoch": 0.012077595244242728, "grad_norm": 0.1326957524489851, "learning_rate": 9.99649225697973e-05, "loss": 2.1698, "step": 1553 }, { "epoch": 0.01208537218902331, "grad_norm": 0.12955655381454978, "learning_rate": 9.996487679711329e-05, "loss": 2.2249, "step": 1554 }, { "epoch": 0.01209314913380389, "grad_norm": 0.12383876496920151, "learning_rate": 9.996483099459474e-05, "loss": 2.2751, "step": 1555 }, { "epoch": 0.012100926078584472, "grad_norm": 0.12297104759334046, "learning_rate": 9.996478516224172e-05, "loss": 2.1616, "step": 1556 }, { "epoch": 0.012108703023365053, "grad_norm": 0.12384769064231364, "learning_rate": 9.996473930005425e-05, "loss": 2.2379, "step": 1557 }, { "epoch": 0.012116479968145634, "grad_norm": 0.11590435581394652, "learning_rate": 9.996469340803232e-05, "loss": 2.1924, "step": 1558 }, { "epoch": 0.012124256912926215, "grad_norm": 0.123674098622679, "learning_rate": 9.996464748617598e-05, "loss": 2.3184, "step": 1559 }, { "epoch": 0.012132033857706797, "grad_norm": 0.1255693479323118, "learning_rate": 9.996460153448526e-05, "loss": 2.2399, "step": 1560 }, { "epoch": 0.012139810802487378, "grad_norm": 0.13329235052143176, "learning_rate": 9.996455555296018e-05, "loss": 2.2073, "step": 1561 }, { "epoch": 0.012147587747267959, "grad_norm": 0.12232889377642629, "learning_rate": 9.996450954160077e-05, "loss": 2.1944, "step": 1562 }, { "epoch": 0.01215536469204854, "grad_norm": 0.1198545943726621, "learning_rate": 9.996446350040706e-05, "loss": 2.2327, "step": 1563 }, { "epoch": 0.012163141636829121, "grad_norm": 0.12906194561314605, "learning_rate": 9.99644174293791e-05, "loss": 2.1564, "step": 1564 }, { "epoch": 0.012170918581609702, "grad_norm": 0.13688954606477582, "learning_rate": 9.996437132851687e-05, "loss": 2.2368, "step": 1565 }, { "epoch": 0.012178695526390284, "grad_norm": 0.11874579165629517, "learning_rate": 9.996432519782042e-05, "loss": 2.2239, "step": 1566 }, { "epoch": 0.012186472471170866, "grad_norm": 0.12707403427052275, "learning_rate": 9.996427903728978e-05, "loss": 2.2551, "step": 1567 }, { "epoch": 0.012194249415951448, "grad_norm": 0.12952275456471118, "learning_rate": 9.9964232846925e-05, "loss": 2.2149, "step": 1568 }, { "epoch": 0.012202026360732029, "grad_norm": 0.12121880210029781, "learning_rate": 9.996418662672607e-05, "loss": 2.1912, "step": 1569 }, { "epoch": 0.01220980330551261, "grad_norm": 0.11839305153448798, "learning_rate": 9.996414037669301e-05, "loss": 2.2374, "step": 1570 }, { "epoch": 0.012217580250293191, "grad_norm": 0.11910265842580793, "learning_rate": 9.99640940968259e-05, "loss": 2.2365, "step": 1571 }, { "epoch": 0.012225357195073772, "grad_norm": 0.12251959215532399, "learning_rate": 9.996404778712471e-05, "loss": 2.2175, "step": 1572 }, { "epoch": 0.012233134139854353, "grad_norm": 0.12620881426939243, "learning_rate": 9.996400144758953e-05, "loss": 2.1869, "step": 1573 }, { "epoch": 0.012240911084634935, "grad_norm": 0.12100638492020865, "learning_rate": 9.996395507822032e-05, "loss": 2.2071, "step": 1574 }, { "epoch": 0.012248688029415516, "grad_norm": 0.12973289375983577, "learning_rate": 9.996390867901715e-05, "loss": 2.1774, "step": 1575 }, { "epoch": 0.012256464974196097, "grad_norm": 0.1219000077333318, "learning_rate": 9.996386224998004e-05, "loss": 2.2092, "step": 1576 }, { "epoch": 0.012264241918976678, "grad_norm": 0.11854570029673957, "learning_rate": 9.996381579110903e-05, "loss": 2.2276, "step": 1577 }, { "epoch": 0.01227201886375726, "grad_norm": 0.11866028673142999, "learning_rate": 9.996376930240412e-05, "loss": 2.1872, "step": 1578 }, { "epoch": 0.01227979580853784, "grad_norm": 0.16759042008569855, "learning_rate": 9.996372278386534e-05, "loss": 2.2378, "step": 1579 }, { "epoch": 0.012287572753318422, "grad_norm": 0.11938955100267475, "learning_rate": 9.996367623549274e-05, "loss": 2.225, "step": 1580 }, { "epoch": 0.012295349698099003, "grad_norm": 0.12048119197872685, "learning_rate": 9.996362965728634e-05, "loss": 2.1617, "step": 1581 }, { "epoch": 0.012303126642879586, "grad_norm": 0.1388346560303725, "learning_rate": 9.996358304924616e-05, "loss": 2.2286, "step": 1582 }, { "epoch": 0.012310903587660167, "grad_norm": 0.14076484332943442, "learning_rate": 9.996353641137223e-05, "loss": 2.187, "step": 1583 }, { "epoch": 0.012318680532440748, "grad_norm": 0.12223750948316305, "learning_rate": 9.996348974366459e-05, "loss": 2.1554, "step": 1584 }, { "epoch": 0.01232645747722133, "grad_norm": 0.1224345636195547, "learning_rate": 9.996344304612324e-05, "loss": 2.1994, "step": 1585 }, { "epoch": 0.01233423442200191, "grad_norm": 0.13173233285433192, "learning_rate": 9.996339631874825e-05, "loss": 2.1503, "step": 1586 }, { "epoch": 0.012342011366782491, "grad_norm": 0.15082613633990927, "learning_rate": 9.99633495615396e-05, "loss": 2.1605, "step": 1587 }, { "epoch": 0.012349788311563073, "grad_norm": 0.12271034927694156, "learning_rate": 9.996330277449736e-05, "loss": 2.2276, "step": 1588 }, { "epoch": 0.012357565256343654, "grad_norm": 0.1440288339694103, "learning_rate": 9.996325595762153e-05, "loss": 2.1459, "step": 1589 }, { "epoch": 0.012365342201124235, "grad_norm": 0.14629391171093814, "learning_rate": 9.996320911091214e-05, "loss": 2.2366, "step": 1590 }, { "epoch": 0.012373119145904816, "grad_norm": 0.12197290786902779, "learning_rate": 9.996316223436924e-05, "loss": 2.182, "step": 1591 }, { "epoch": 0.012380896090685397, "grad_norm": 0.1505726270876803, "learning_rate": 9.996311532799285e-05, "loss": 2.1961, "step": 1592 }, { "epoch": 0.012388673035465978, "grad_norm": 0.1686611821048205, "learning_rate": 9.996306839178298e-05, "loss": 2.217, "step": 1593 }, { "epoch": 0.01239644998024656, "grad_norm": 0.12266470393274759, "learning_rate": 9.996302142573967e-05, "loss": 2.1914, "step": 1594 }, { "epoch": 0.01240422692502714, "grad_norm": 0.1575322456393739, "learning_rate": 9.996297442986295e-05, "loss": 2.1654, "step": 1595 }, { "epoch": 0.012412003869807722, "grad_norm": 0.17799994321989362, "learning_rate": 9.996292740415284e-05, "loss": 2.2011, "step": 1596 }, { "epoch": 0.012419780814588305, "grad_norm": 0.12276338873440792, "learning_rate": 9.996288034860938e-05, "loss": 2.1887, "step": 1597 }, { "epoch": 0.012427557759368886, "grad_norm": 0.15183964696733188, "learning_rate": 9.996283326323259e-05, "loss": 2.1896, "step": 1598 }, { "epoch": 0.012435334704149467, "grad_norm": 0.18717695662029338, "learning_rate": 9.99627861480225e-05, "loss": 2.219, "step": 1599 }, { "epoch": 0.012443111648930048, "grad_norm": 0.1269140584096638, "learning_rate": 9.996273900297913e-05, "loss": 2.1707, "step": 1600 }, { "epoch": 0.01245088859371063, "grad_norm": 0.15327005946492883, "learning_rate": 9.996269182810253e-05, "loss": 2.1225, "step": 1601 }, { "epoch": 0.01245866553849121, "grad_norm": 0.17172955667655643, "learning_rate": 9.996264462339271e-05, "loss": 2.1127, "step": 1602 }, { "epoch": 0.012466442483271792, "grad_norm": 0.12484175570778772, "learning_rate": 9.99625973888497e-05, "loss": 2.1957, "step": 1603 }, { "epoch": 0.012474219428052373, "grad_norm": 0.1545719941912, "learning_rate": 9.996255012447353e-05, "loss": 2.1448, "step": 1604 }, { "epoch": 0.012481996372832954, "grad_norm": 0.20349907222916117, "learning_rate": 9.996250283026423e-05, "loss": 2.2072, "step": 1605 }, { "epoch": 0.012489773317613535, "grad_norm": 0.12997544275445339, "learning_rate": 9.996245550622183e-05, "loss": 2.2445, "step": 1606 }, { "epoch": 0.012497550262394116, "grad_norm": 0.16710737469462272, "learning_rate": 9.996240815234634e-05, "loss": 2.2522, "step": 1607 }, { "epoch": 0.012505327207174698, "grad_norm": 0.19617242769005805, "learning_rate": 9.996236076863783e-05, "loss": 2.2321, "step": 1608 }, { "epoch": 0.012513104151955279, "grad_norm": 0.12817066966645727, "learning_rate": 9.99623133550963e-05, "loss": 2.2099, "step": 1609 }, { "epoch": 0.01252088109673586, "grad_norm": 0.19367943829735604, "learning_rate": 9.996226591172175e-05, "loss": 2.2122, "step": 1610 }, { "epoch": 0.012528658041516443, "grad_norm": 0.14702933677981647, "learning_rate": 9.996221843851427e-05, "loss": 2.2465, "step": 1611 }, { "epoch": 0.012536434986297024, "grad_norm": 0.12251516640410287, "learning_rate": 9.996217093547384e-05, "loss": 2.2181, "step": 1612 }, { "epoch": 0.012544211931077605, "grad_norm": 0.14339543920954284, "learning_rate": 9.996212340260052e-05, "loss": 2.1816, "step": 1613 }, { "epoch": 0.012551988875858186, "grad_norm": 0.2992606072481104, "learning_rate": 9.99620758398943e-05, "loss": 2.1753, "step": 1614 }, { "epoch": 0.012559765820638768, "grad_norm": 0.1222771674811537, "learning_rate": 9.996202824735525e-05, "loss": 2.1987, "step": 1615 }, { "epoch": 0.012567542765419349, "grad_norm": 0.12179386001435745, "learning_rate": 9.996198062498338e-05, "loss": 2.112, "step": 1616 }, { "epoch": 0.01257531971019993, "grad_norm": 0.125447128851283, "learning_rate": 9.996193297277872e-05, "loss": 2.1931, "step": 1617 }, { "epoch": 0.012583096654980511, "grad_norm": 0.13447263242074067, "learning_rate": 9.99618852907413e-05, "loss": 2.1445, "step": 1618 }, { "epoch": 0.012590873599761092, "grad_norm": 0.21103954837271016, "learning_rate": 9.996183757887114e-05, "loss": 2.1664, "step": 1619 }, { "epoch": 0.012598650544541673, "grad_norm": 0.13675478927118215, "learning_rate": 9.996178983716827e-05, "loss": 2.1936, "step": 1620 }, { "epoch": 0.012606427489322255, "grad_norm": 0.13734774176399062, "learning_rate": 9.996174206563274e-05, "loss": 2.1864, "step": 1621 }, { "epoch": 0.012614204434102836, "grad_norm": 0.13844600665018114, "learning_rate": 9.996169426426454e-05, "loss": 2.1931, "step": 1622 }, { "epoch": 0.012621981378883417, "grad_norm": 0.13282412127998655, "learning_rate": 9.996164643306374e-05, "loss": 2.2072, "step": 1623 }, { "epoch": 0.012629758323663998, "grad_norm": 0.12406630742787382, "learning_rate": 9.996159857203033e-05, "loss": 2.1532, "step": 1624 }, { "epoch": 0.01263753526844458, "grad_norm": 0.14233190234172843, "learning_rate": 9.996155068116438e-05, "loss": 2.1868, "step": 1625 }, { "epoch": 0.012645312213225162, "grad_norm": 0.1311841419622481, "learning_rate": 9.996150276046587e-05, "loss": 2.1413, "step": 1626 }, { "epoch": 0.012653089158005743, "grad_norm": 0.1286544968808752, "learning_rate": 9.996145480993487e-05, "loss": 2.1638, "step": 1627 }, { "epoch": 0.012660866102786324, "grad_norm": 0.23478776926230033, "learning_rate": 9.996140682957139e-05, "loss": 2.1745, "step": 1628 }, { "epoch": 0.012668643047566906, "grad_norm": 0.14281915571873854, "learning_rate": 9.996135881937546e-05, "loss": 2.1741, "step": 1629 }, { "epoch": 0.012676419992347487, "grad_norm": 0.1259242295912761, "learning_rate": 9.996131077934712e-05, "loss": 2.1972, "step": 1630 }, { "epoch": 0.012684196937128068, "grad_norm": 0.120387736527745, "learning_rate": 9.996126270948638e-05, "loss": 2.1518, "step": 1631 }, { "epoch": 0.012691973881908649, "grad_norm": 0.1424458631126004, "learning_rate": 9.996121460979328e-05, "loss": 2.1903, "step": 1632 }, { "epoch": 0.01269975082668923, "grad_norm": 0.12292211984069311, "learning_rate": 9.996116648026785e-05, "loss": 2.1656, "step": 1633 }, { "epoch": 0.012707527771469811, "grad_norm": 0.11730433707620233, "learning_rate": 9.99611183209101e-05, "loss": 2.1985, "step": 1634 }, { "epoch": 0.012715304716250393, "grad_norm": 0.12507089202831553, "learning_rate": 9.99610701317201e-05, "loss": 2.1782, "step": 1635 }, { "epoch": 0.012723081661030974, "grad_norm": 0.13761813303629908, "learning_rate": 9.996102191269783e-05, "loss": 2.1853, "step": 1636 }, { "epoch": 0.012730858605811555, "grad_norm": 0.12532708105196633, "learning_rate": 9.996097366384334e-05, "loss": 2.1425, "step": 1637 }, { "epoch": 0.012738635550592136, "grad_norm": 0.14884568064449277, "learning_rate": 9.996092538515667e-05, "loss": 2.2454, "step": 1638 }, { "epoch": 0.012746412495372717, "grad_norm": 0.13587366510279608, "learning_rate": 9.996087707663784e-05, "loss": 2.2154, "step": 1639 }, { "epoch": 0.012754189440153298, "grad_norm": 0.13555189044106214, "learning_rate": 9.996082873828687e-05, "loss": 2.1757, "step": 1640 }, { "epoch": 0.012761966384933881, "grad_norm": 0.1659769172123874, "learning_rate": 9.996078037010381e-05, "loss": 2.1693, "step": 1641 }, { "epoch": 0.012769743329714462, "grad_norm": 0.13397975755347236, "learning_rate": 9.996073197208867e-05, "loss": 2.2525, "step": 1642 }, { "epoch": 0.012777520274495044, "grad_norm": 0.1420139121481891, "learning_rate": 9.996068354424148e-05, "loss": 2.2382, "step": 1643 }, { "epoch": 0.012785297219275625, "grad_norm": 0.12169926311351266, "learning_rate": 9.996063508656228e-05, "loss": 2.1919, "step": 1644 }, { "epoch": 0.012793074164056206, "grad_norm": 0.12671921500084002, "learning_rate": 9.99605865990511e-05, "loss": 2.2085, "step": 1645 }, { "epoch": 0.012800851108836787, "grad_norm": 0.1342375297550058, "learning_rate": 9.996053808170796e-05, "loss": 2.1774, "step": 1646 }, { "epoch": 0.012808628053617368, "grad_norm": 0.13325211843100265, "learning_rate": 9.996048953453288e-05, "loss": 2.1752, "step": 1647 }, { "epoch": 0.01281640499839795, "grad_norm": 0.1296019714437295, "learning_rate": 9.996044095752589e-05, "loss": 2.2523, "step": 1648 }, { "epoch": 0.01282418194317853, "grad_norm": 0.11750072660302632, "learning_rate": 9.996039235068704e-05, "loss": 2.1862, "step": 1649 }, { "epoch": 0.012831958887959112, "grad_norm": 0.13005719009948918, "learning_rate": 9.996034371401635e-05, "loss": 2.1648, "step": 1650 }, { "epoch": 0.012839735832739693, "grad_norm": 0.12908958294810385, "learning_rate": 9.996029504751386e-05, "loss": 2.1843, "step": 1651 }, { "epoch": 0.012847512777520274, "grad_norm": 0.11970831832815967, "learning_rate": 9.996024635117955e-05, "loss": 2.1914, "step": 1652 }, { "epoch": 0.012855289722300855, "grad_norm": 0.13000167974676322, "learning_rate": 9.99601976250135e-05, "loss": 2.2116, "step": 1653 }, { "epoch": 0.012863066667081436, "grad_norm": 0.11712447083845182, "learning_rate": 9.996014886901575e-05, "loss": 2.2106, "step": 1654 }, { "epoch": 0.012870843611862018, "grad_norm": 0.13006755835727257, "learning_rate": 9.996010008318627e-05, "loss": 2.2139, "step": 1655 }, { "epoch": 0.0128786205566426, "grad_norm": 0.12664004702531276, "learning_rate": 9.996005126752514e-05, "loss": 2.2125, "step": 1656 }, { "epoch": 0.012886397501423182, "grad_norm": 0.11373869329233305, "learning_rate": 9.996000242203237e-05, "loss": 2.2271, "step": 1657 }, { "epoch": 0.012894174446203763, "grad_norm": 0.12651207739537756, "learning_rate": 9.995995354670798e-05, "loss": 2.1411, "step": 1658 }, { "epoch": 0.012901951390984344, "grad_norm": 0.14808111814696087, "learning_rate": 9.9959904641552e-05, "loss": 2.1634, "step": 1659 }, { "epoch": 0.012909728335764925, "grad_norm": 0.1306057862071512, "learning_rate": 9.99598557065645e-05, "loss": 2.1631, "step": 1660 }, { "epoch": 0.012917505280545506, "grad_norm": 0.13368473999345604, "learning_rate": 9.995980674174545e-05, "loss": 2.2257, "step": 1661 }, { "epoch": 0.012925282225326087, "grad_norm": 0.15100451338241297, "learning_rate": 9.995975774709491e-05, "loss": 2.1933, "step": 1662 }, { "epoch": 0.012933059170106669, "grad_norm": 0.13248040430773556, "learning_rate": 9.995970872261293e-05, "loss": 2.1431, "step": 1663 }, { "epoch": 0.01294083611488725, "grad_norm": 0.1372015782260775, "learning_rate": 9.99596596682995e-05, "loss": 2.187, "step": 1664 }, { "epoch": 0.012948613059667831, "grad_norm": 0.19341009765387743, "learning_rate": 9.995961058415466e-05, "loss": 2.2377, "step": 1665 }, { "epoch": 0.012956390004448412, "grad_norm": 0.18316081024407554, "learning_rate": 9.995956147017845e-05, "loss": 2.2036, "step": 1666 }, { "epoch": 0.012964166949228993, "grad_norm": 0.11558831105162133, "learning_rate": 9.99595123263709e-05, "loss": 2.2224, "step": 1667 }, { "epoch": 0.012971943894009574, "grad_norm": 0.16666192994084963, "learning_rate": 9.995946315273201e-05, "loss": 2.1635, "step": 1668 }, { "epoch": 0.012979720838790156, "grad_norm": 0.1705262841057382, "learning_rate": 9.995941394926185e-05, "loss": 2.1935, "step": 1669 }, { "epoch": 0.012987497783570737, "grad_norm": 0.12351976456471353, "learning_rate": 9.995936471596043e-05, "loss": 2.1714, "step": 1670 }, { "epoch": 0.01299527472835132, "grad_norm": 0.21499275710072083, "learning_rate": 9.99593154528278e-05, "loss": 2.183, "step": 1671 }, { "epoch": 0.0130030516731319, "grad_norm": 0.20693533192471572, "learning_rate": 9.995926615986395e-05, "loss": 2.152, "step": 1672 }, { "epoch": 0.013010828617912482, "grad_norm": 0.1196239383717196, "learning_rate": 9.995921683706893e-05, "loss": 2.2357, "step": 1673 }, { "epoch": 0.013018605562693063, "grad_norm": 0.1954175877499477, "learning_rate": 9.995916748444276e-05, "loss": 2.1239, "step": 1674 }, { "epoch": 0.013026382507473644, "grad_norm": 0.33246760234175293, "learning_rate": 9.99591181019855e-05, "loss": 2.1975, "step": 1675 }, { "epoch": 0.013034159452254225, "grad_norm": 0.14728838609606376, "learning_rate": 9.995906868969716e-05, "loss": 2.1897, "step": 1676 }, { "epoch": 0.013041936397034807, "grad_norm": 0.1711574469859005, "learning_rate": 9.995901924757774e-05, "loss": 2.1294, "step": 1677 }, { "epoch": 0.013049713341815388, "grad_norm": 0.1973977600305796, "learning_rate": 9.995896977562732e-05, "loss": 2.2115, "step": 1678 }, { "epoch": 0.013057490286595969, "grad_norm": 0.1395091352696561, "learning_rate": 9.99589202738459e-05, "loss": 2.1984, "step": 1679 }, { "epoch": 0.01306526723137655, "grad_norm": 0.15718899914804485, "learning_rate": 9.995887074223352e-05, "loss": 2.115, "step": 1680 }, { "epoch": 0.013073044176157131, "grad_norm": 0.1688094810865591, "learning_rate": 9.99588211807902e-05, "loss": 2.1846, "step": 1681 }, { "epoch": 0.013080821120937712, "grad_norm": 0.1550973122082121, "learning_rate": 9.995877158951599e-05, "loss": 2.172, "step": 1682 }, { "epoch": 0.013088598065718294, "grad_norm": 0.12106542934630676, "learning_rate": 9.995872196841088e-05, "loss": 2.1444, "step": 1683 }, { "epoch": 0.013096375010498875, "grad_norm": 0.13786982841939197, "learning_rate": 9.995867231747495e-05, "loss": 2.2072, "step": 1684 }, { "epoch": 0.013104151955279456, "grad_norm": 0.15944274090441327, "learning_rate": 9.995862263670819e-05, "loss": 2.2088, "step": 1685 }, { "epoch": 0.013111928900060039, "grad_norm": 0.1256327643245378, "learning_rate": 9.995857292611064e-05, "loss": 2.2162, "step": 1686 }, { "epoch": 0.01311970584484062, "grad_norm": 0.12486110802956821, "learning_rate": 9.995852318568235e-05, "loss": 2.1432, "step": 1687 }, { "epoch": 0.013127482789621201, "grad_norm": 0.12826332023368706, "learning_rate": 9.995847341542333e-05, "loss": 2.1907, "step": 1688 }, { "epoch": 0.013135259734401782, "grad_norm": 0.12345534537136077, "learning_rate": 9.99584236153336e-05, "loss": 2.2059, "step": 1689 }, { "epoch": 0.013143036679182363, "grad_norm": 0.12066936455670606, "learning_rate": 9.995837378541322e-05, "loss": 2.1894, "step": 1690 }, { "epoch": 0.013150813623962945, "grad_norm": 0.12810448567371183, "learning_rate": 9.99583239256622e-05, "loss": 2.1917, "step": 1691 }, { "epoch": 0.013158590568743526, "grad_norm": 0.11275109125073482, "learning_rate": 9.995827403608055e-05, "loss": 2.1336, "step": 1692 }, { "epoch": 0.013166367513524107, "grad_norm": 0.11780850726348531, "learning_rate": 9.995822411666835e-05, "loss": 2.2038, "step": 1693 }, { "epoch": 0.013174144458304688, "grad_norm": 0.12119290441426162, "learning_rate": 9.995817416742559e-05, "loss": 2.1735, "step": 1694 }, { "epoch": 0.01318192140308527, "grad_norm": 0.12378044414342673, "learning_rate": 9.99581241883523e-05, "loss": 2.1892, "step": 1695 }, { "epoch": 0.01318969834786585, "grad_norm": 0.12937820100983485, "learning_rate": 9.995807417944856e-05, "loss": 2.173, "step": 1696 }, { "epoch": 0.013197475292646432, "grad_norm": 0.1160056657191603, "learning_rate": 9.995802414071434e-05, "loss": 2.1999, "step": 1697 }, { "epoch": 0.013205252237427013, "grad_norm": 0.21707620419020737, "learning_rate": 9.995797407214967e-05, "loss": 2.1938, "step": 1698 }, { "epoch": 0.013213029182207594, "grad_norm": 0.11986282979325125, "learning_rate": 9.995792397375464e-05, "loss": 2.2255, "step": 1699 }, { "epoch": 0.013220806126988177, "grad_norm": 0.1190861336152738, "learning_rate": 9.995787384552921e-05, "loss": 2.1856, "step": 1700 }, { "epoch": 0.013228583071768758, "grad_norm": 0.12052259405879577, "learning_rate": 9.995782368747344e-05, "loss": 2.1925, "step": 1701 }, { "epoch": 0.013236360016549339, "grad_norm": 0.12195177238641142, "learning_rate": 9.99577734995874e-05, "loss": 2.1447, "step": 1702 }, { "epoch": 0.01324413696132992, "grad_norm": 0.11634990231658049, "learning_rate": 9.995772328187105e-05, "loss": 2.1356, "step": 1703 }, { "epoch": 0.013251913906110501, "grad_norm": 0.12618103972623596, "learning_rate": 9.995767303432444e-05, "loss": 2.1873, "step": 1704 }, { "epoch": 0.013259690850891083, "grad_norm": 0.19894125453887423, "learning_rate": 9.995762275694765e-05, "loss": 2.1553, "step": 1705 }, { "epoch": 0.013267467795671664, "grad_norm": 0.12433768206714337, "learning_rate": 9.995757244974064e-05, "loss": 2.1852, "step": 1706 }, { "epoch": 0.013275244740452245, "grad_norm": 0.12649749789341708, "learning_rate": 9.995752211270349e-05, "loss": 2.1489, "step": 1707 }, { "epoch": 0.013283021685232826, "grad_norm": 0.1213555065166289, "learning_rate": 9.995747174583618e-05, "loss": 2.2101, "step": 1708 }, { "epoch": 0.013290798630013407, "grad_norm": 0.12622243679445508, "learning_rate": 9.995742134913881e-05, "loss": 2.1699, "step": 1709 }, { "epoch": 0.013298575574793988, "grad_norm": 0.11811061764267441, "learning_rate": 9.995737092261134e-05, "loss": 2.1782, "step": 1710 }, { "epoch": 0.01330635251957457, "grad_norm": 0.12309582797548496, "learning_rate": 9.995732046625385e-05, "loss": 2.1849, "step": 1711 }, { "epoch": 0.01331412946435515, "grad_norm": 0.1357189165779703, "learning_rate": 9.995726998006636e-05, "loss": 2.1515, "step": 1712 }, { "epoch": 0.013321906409135732, "grad_norm": 0.14298544307775943, "learning_rate": 9.995721946404888e-05, "loss": 2.1782, "step": 1713 }, { "epoch": 0.013329683353916313, "grad_norm": 0.7619575137577246, "learning_rate": 9.995716891820145e-05, "loss": 2.2304, "step": 1714 }, { "epoch": 0.013337460298696896, "grad_norm": 0.11519505420272386, "learning_rate": 9.99571183425241e-05, "loss": 2.2009, "step": 1715 }, { "epoch": 0.013345237243477477, "grad_norm": 0.146104951602159, "learning_rate": 9.995706773701687e-05, "loss": 2.1877, "step": 1716 }, { "epoch": 0.013353014188258058, "grad_norm": 0.15106264840440198, "learning_rate": 9.995701710167978e-05, "loss": 2.225, "step": 1717 }, { "epoch": 0.01336079113303864, "grad_norm": 0.12573960637785309, "learning_rate": 9.995696643651286e-05, "loss": 2.1887, "step": 1718 }, { "epoch": 0.01336856807781922, "grad_norm": 0.131020415587525, "learning_rate": 9.995691574151615e-05, "loss": 2.1289, "step": 1719 }, { "epoch": 0.013376345022599802, "grad_norm": 0.11848125176845875, "learning_rate": 9.995686501668967e-05, "loss": 2.1647, "step": 1720 }, { "epoch": 0.013384121967380383, "grad_norm": 0.12182086859265766, "learning_rate": 9.995681426203345e-05, "loss": 2.2163, "step": 1721 }, { "epoch": 0.013391898912160964, "grad_norm": 0.12567863007214708, "learning_rate": 9.995676347754753e-05, "loss": 2.2369, "step": 1722 }, { "epoch": 0.013399675856941545, "grad_norm": 0.1235288801616873, "learning_rate": 9.995671266323194e-05, "loss": 2.219, "step": 1723 }, { "epoch": 0.013407452801722126, "grad_norm": 0.1259378276040012, "learning_rate": 9.995666181908669e-05, "loss": 2.2364, "step": 1724 }, { "epoch": 0.013415229746502708, "grad_norm": 0.13578674334602928, "learning_rate": 9.995661094511183e-05, "loss": 2.1717, "step": 1725 }, { "epoch": 0.013423006691283289, "grad_norm": 0.13564235057247023, "learning_rate": 9.995656004130737e-05, "loss": 2.1955, "step": 1726 }, { "epoch": 0.01343078363606387, "grad_norm": 0.12280018096521851, "learning_rate": 9.99565091076734e-05, "loss": 2.1987, "step": 1727 }, { "epoch": 0.013438560580844451, "grad_norm": 0.12707592935729373, "learning_rate": 9.995645814420987e-05, "loss": 2.1307, "step": 1728 }, { "epoch": 0.013446337525625032, "grad_norm": 0.1256356844449772, "learning_rate": 9.995640715091685e-05, "loss": 2.2271, "step": 1729 }, { "epoch": 0.013454114470405615, "grad_norm": 0.12985457355699748, "learning_rate": 9.995635612779437e-05, "loss": 2.17, "step": 1730 }, { "epoch": 0.013461891415186196, "grad_norm": 0.14074218675811567, "learning_rate": 9.995630507484247e-05, "loss": 2.2383, "step": 1731 }, { "epoch": 0.013469668359966777, "grad_norm": 0.12692300416936564, "learning_rate": 9.995625399206115e-05, "loss": 2.113, "step": 1732 }, { "epoch": 0.013477445304747359, "grad_norm": 0.12843378580907638, "learning_rate": 9.995620287945047e-05, "loss": 2.1978, "step": 1733 }, { "epoch": 0.01348522224952794, "grad_norm": 0.13973442731108232, "learning_rate": 9.995615173701045e-05, "loss": 2.1499, "step": 1734 }, { "epoch": 0.013492999194308521, "grad_norm": 0.1280197529133058, "learning_rate": 9.995610056474112e-05, "loss": 2.1835, "step": 1735 }, { "epoch": 0.013500776139089102, "grad_norm": 0.12397608152952079, "learning_rate": 9.995604936264249e-05, "loss": 2.2082, "step": 1736 }, { "epoch": 0.013508553083869683, "grad_norm": 0.12570541532723334, "learning_rate": 9.995599813071464e-05, "loss": 2.1836, "step": 1737 }, { "epoch": 0.013516330028650264, "grad_norm": 0.12648641305887628, "learning_rate": 9.995594686895755e-05, "loss": 2.1704, "step": 1738 }, { "epoch": 0.013524106973430846, "grad_norm": 0.1216441236108241, "learning_rate": 9.995589557737128e-05, "loss": 2.1591, "step": 1739 }, { "epoch": 0.013531883918211427, "grad_norm": 0.13807527455037608, "learning_rate": 9.995584425595585e-05, "loss": 2.1605, "step": 1740 }, { "epoch": 0.013539660862992008, "grad_norm": 0.11637119274536764, "learning_rate": 9.99557929047113e-05, "loss": 2.1886, "step": 1741 }, { "epoch": 0.013547437807772589, "grad_norm": 0.12372710412641552, "learning_rate": 9.995574152363765e-05, "loss": 2.2473, "step": 1742 }, { "epoch": 0.01355521475255317, "grad_norm": 0.13915668685298022, "learning_rate": 9.995569011273493e-05, "loss": 2.1754, "step": 1743 }, { "epoch": 0.013562991697333751, "grad_norm": 0.1359402748235422, "learning_rate": 9.995563867200317e-05, "loss": 2.2021, "step": 1744 }, { "epoch": 0.013570768642114334, "grad_norm": 0.12027199223448513, "learning_rate": 9.995558720144242e-05, "loss": 2.1539, "step": 1745 }, { "epoch": 0.013578545586894916, "grad_norm": 0.14764949992998672, "learning_rate": 9.995553570105269e-05, "loss": 2.1593, "step": 1746 }, { "epoch": 0.013586322531675497, "grad_norm": 0.14605083821080406, "learning_rate": 9.995548417083403e-05, "loss": 2.1595, "step": 1747 }, { "epoch": 0.013594099476456078, "grad_norm": 0.13196752147744362, "learning_rate": 9.995543261078644e-05, "loss": 2.1516, "step": 1748 }, { "epoch": 0.013601876421236659, "grad_norm": 0.13030599235830703, "learning_rate": 9.995538102090997e-05, "loss": 2.191, "step": 1749 }, { "epoch": 0.01360965336601724, "grad_norm": 0.1457104759410615, "learning_rate": 9.995532940120466e-05, "loss": 2.1943, "step": 1750 }, { "epoch": 0.013617430310797821, "grad_norm": 0.19641712795584307, "learning_rate": 9.995527775167053e-05, "loss": 2.17, "step": 1751 }, { "epoch": 0.013625207255578402, "grad_norm": 0.1891523259395541, "learning_rate": 9.99552260723076e-05, "loss": 2.1884, "step": 1752 }, { "epoch": 0.013632984200358984, "grad_norm": 0.12833378601241607, "learning_rate": 9.995517436311593e-05, "loss": 2.2107, "step": 1753 }, { "epoch": 0.013640761145139565, "grad_norm": 0.3459171582439039, "learning_rate": 9.995512262409552e-05, "loss": 2.2092, "step": 1754 }, { "epoch": 0.013648538089920146, "grad_norm": 0.2006794662625139, "learning_rate": 9.995507085524642e-05, "loss": 2.1806, "step": 1755 }, { "epoch": 0.013656315034700727, "grad_norm": 0.13625357393463375, "learning_rate": 9.995501905656866e-05, "loss": 2.2426, "step": 1756 }, { "epoch": 0.013664091979481308, "grad_norm": 0.21294143034450158, "learning_rate": 9.995496722806226e-05, "loss": 2.1716, "step": 1757 }, { "epoch": 0.01367186892426189, "grad_norm": 0.24683467393094513, "learning_rate": 9.995491536972725e-05, "loss": 2.2146, "step": 1758 }, { "epoch": 0.01367964586904247, "grad_norm": 0.1336405749434824, "learning_rate": 9.995486348156367e-05, "loss": 2.2145, "step": 1759 }, { "epoch": 0.013687422813823054, "grad_norm": 0.20759264775790864, "learning_rate": 9.995481156357157e-05, "loss": 2.1833, "step": 1760 }, { "epoch": 0.013695199758603635, "grad_norm": 0.2235918274540743, "learning_rate": 9.995475961575094e-05, "loss": 2.1978, "step": 1761 }, { "epoch": 0.013702976703384216, "grad_norm": 0.12477680810671142, "learning_rate": 9.995470763810181e-05, "loss": 2.183, "step": 1762 }, { "epoch": 0.013710753648164797, "grad_norm": 0.16885520289054437, "learning_rate": 9.995465563062427e-05, "loss": 2.2023, "step": 1763 }, { "epoch": 0.013718530592945378, "grad_norm": 0.151790763782119, "learning_rate": 9.99546035933183e-05, "loss": 2.2356, "step": 1764 }, { "epoch": 0.01372630753772596, "grad_norm": 0.12425298081586073, "learning_rate": 9.995455152618393e-05, "loss": 2.1979, "step": 1765 }, { "epoch": 0.01373408448250654, "grad_norm": 0.1513218871347836, "learning_rate": 9.995449942922123e-05, "loss": 2.1322, "step": 1766 }, { "epoch": 0.013741861427287122, "grad_norm": 0.13425350468290642, "learning_rate": 9.99544473024302e-05, "loss": 2.1639, "step": 1767 }, { "epoch": 0.013749638372067703, "grad_norm": 0.12166006785503976, "learning_rate": 9.995439514581086e-05, "loss": 2.1621, "step": 1768 }, { "epoch": 0.013757415316848284, "grad_norm": 0.12478896314396715, "learning_rate": 9.995434295936327e-05, "loss": 2.1677, "step": 1769 }, { "epoch": 0.013765192261628865, "grad_norm": 0.11733596882869302, "learning_rate": 9.995429074308744e-05, "loss": 2.1953, "step": 1770 }, { "epoch": 0.013772969206409446, "grad_norm": 0.13195931026174906, "learning_rate": 9.995423849698343e-05, "loss": 2.2185, "step": 1771 }, { "epoch": 0.013780746151190027, "grad_norm": 0.1227905899356478, "learning_rate": 9.995418622105123e-05, "loss": 2.1409, "step": 1772 }, { "epoch": 0.013788523095970609, "grad_norm": 0.11987019517302365, "learning_rate": 9.995413391529091e-05, "loss": 2.1836, "step": 1773 }, { "epoch": 0.01379630004075119, "grad_norm": 0.12212730144173685, "learning_rate": 9.995408157970249e-05, "loss": 2.1488, "step": 1774 }, { "epoch": 0.013804076985531773, "grad_norm": 0.1344395007396869, "learning_rate": 9.995402921428598e-05, "loss": 2.2023, "step": 1775 }, { "epoch": 0.013811853930312354, "grad_norm": 0.12413661946685375, "learning_rate": 9.995397681904144e-05, "loss": 2.1659, "step": 1776 }, { "epoch": 0.013819630875092935, "grad_norm": 0.1277557221465957, "learning_rate": 9.995392439396887e-05, "loss": 2.2183, "step": 1777 }, { "epoch": 0.013827407819873516, "grad_norm": 0.1467484534283373, "learning_rate": 9.995387193906833e-05, "loss": 2.1541, "step": 1778 }, { "epoch": 0.013835184764654097, "grad_norm": 0.1271923316320424, "learning_rate": 9.995381945433984e-05, "loss": 2.1755, "step": 1779 }, { "epoch": 0.013842961709434679, "grad_norm": 0.12685417441927876, "learning_rate": 9.995376693978345e-05, "loss": 2.1816, "step": 1780 }, { "epoch": 0.01385073865421526, "grad_norm": 0.162283154856087, "learning_rate": 9.995371439539914e-05, "loss": 2.2408, "step": 1781 }, { "epoch": 0.01385851559899584, "grad_norm": 0.15223257969521045, "learning_rate": 9.995366182118699e-05, "loss": 2.2046, "step": 1782 }, { "epoch": 0.013866292543776422, "grad_norm": 0.11687873688769154, "learning_rate": 9.995360921714702e-05, "loss": 2.1897, "step": 1783 }, { "epoch": 0.013874069488557003, "grad_norm": 0.18041810182938522, "learning_rate": 9.995355658327926e-05, "loss": 2.1312, "step": 1784 }, { "epoch": 0.013881846433337584, "grad_norm": 0.2248390172990453, "learning_rate": 9.995350391958374e-05, "loss": 2.1525, "step": 1785 }, { "epoch": 0.013889623378118165, "grad_norm": 0.1788471563790717, "learning_rate": 9.995345122606047e-05, "loss": 2.1288, "step": 1786 }, { "epoch": 0.013897400322898747, "grad_norm": 0.12236429003678909, "learning_rate": 9.995339850270953e-05, "loss": 2.166, "step": 1787 }, { "epoch": 0.013905177267679328, "grad_norm": 0.18091143315305347, "learning_rate": 9.995334574953092e-05, "loss": 2.1838, "step": 1788 }, { "epoch": 0.013912954212459909, "grad_norm": 0.16729259655486126, "learning_rate": 9.995329296652467e-05, "loss": 2.2028, "step": 1789 }, { "epoch": 0.013920731157240492, "grad_norm": 0.12524511770920235, "learning_rate": 9.995324015369081e-05, "loss": 2.1239, "step": 1790 }, { "epoch": 0.013928508102021073, "grad_norm": 0.15561585322299687, "learning_rate": 9.995318731102938e-05, "loss": 2.1867, "step": 1791 }, { "epoch": 0.013936285046801654, "grad_norm": 0.1429581011822307, "learning_rate": 9.99531344385404e-05, "loss": 2.1448, "step": 1792 }, { "epoch": 0.013944061991582235, "grad_norm": 0.128780874180258, "learning_rate": 9.995308153622394e-05, "loss": 2.2018, "step": 1793 }, { "epoch": 0.013951838936362817, "grad_norm": 0.24015576301593936, "learning_rate": 9.995302860407998e-05, "loss": 2.1896, "step": 1794 }, { "epoch": 0.013959615881143398, "grad_norm": 0.13021617815940792, "learning_rate": 9.995297564210857e-05, "loss": 2.2319, "step": 1795 }, { "epoch": 0.013967392825923979, "grad_norm": 0.12395995867496738, "learning_rate": 9.995292265030976e-05, "loss": 2.183, "step": 1796 }, { "epoch": 0.01397516977070456, "grad_norm": 0.12819517037808037, "learning_rate": 9.995286962868357e-05, "loss": 2.1818, "step": 1797 }, { "epoch": 0.013982946715485141, "grad_norm": 0.13331980041919966, "learning_rate": 9.995281657723001e-05, "loss": 2.1996, "step": 1798 }, { "epoch": 0.013990723660265722, "grad_norm": 0.12088069297287335, "learning_rate": 9.995276349594916e-05, "loss": 2.1949, "step": 1799 }, { "epoch": 0.013998500605046304, "grad_norm": 0.13532992124025953, "learning_rate": 9.9952710384841e-05, "loss": 2.1579, "step": 1800 }, { "epoch": 0.014006277549826885, "grad_norm": 0.1549701502275863, "learning_rate": 9.995265724390559e-05, "loss": 2.1745, "step": 1801 }, { "epoch": 0.014014054494607466, "grad_norm": 0.12764366531546043, "learning_rate": 9.995260407314296e-05, "loss": 2.2188, "step": 1802 }, { "epoch": 0.014021831439388047, "grad_norm": 0.14104605589895458, "learning_rate": 9.995255087255313e-05, "loss": 2.1916, "step": 1803 }, { "epoch": 0.01402960838416863, "grad_norm": 0.15174888222788632, "learning_rate": 9.995249764213614e-05, "loss": 2.181, "step": 1804 }, { "epoch": 0.014037385328949211, "grad_norm": 0.12790950003501664, "learning_rate": 9.995244438189204e-05, "loss": 2.1777, "step": 1805 }, { "epoch": 0.014045162273729792, "grad_norm": 0.12822589246558355, "learning_rate": 9.995239109182083e-05, "loss": 2.2362, "step": 1806 }, { "epoch": 0.014052939218510373, "grad_norm": 0.1503757531750428, "learning_rate": 9.995233777192255e-05, "loss": 2.2442, "step": 1807 }, { "epoch": 0.014060716163290955, "grad_norm": 0.11108674714102422, "learning_rate": 9.995228442219724e-05, "loss": 2.2299, "step": 1808 }, { "epoch": 0.014068493108071536, "grad_norm": 0.22366320870238687, "learning_rate": 9.995223104264494e-05, "loss": 2.2114, "step": 1809 }, { "epoch": 0.014076270052852117, "grad_norm": 0.18097784133865813, "learning_rate": 9.995217763326565e-05, "loss": 2.2269, "step": 1810 }, { "epoch": 0.014084046997632698, "grad_norm": 0.149807785444131, "learning_rate": 9.995212419405944e-05, "loss": 2.137, "step": 1811 }, { "epoch": 0.01409182394241328, "grad_norm": 0.21163646202428807, "learning_rate": 9.995207072502632e-05, "loss": 2.2022, "step": 1812 }, { "epoch": 0.01409960088719386, "grad_norm": 0.1990666534563043, "learning_rate": 9.995201722616633e-05, "loss": 2.2293, "step": 1813 }, { "epoch": 0.014107377831974442, "grad_norm": 0.12605560740650112, "learning_rate": 9.995196369747949e-05, "loss": 2.1345, "step": 1814 }, { "epoch": 0.014115154776755023, "grad_norm": 0.22962303308396664, "learning_rate": 9.995191013896585e-05, "loss": 2.1552, "step": 1815 }, { "epoch": 0.014122931721535604, "grad_norm": 0.20337173175940146, "learning_rate": 9.995185655062542e-05, "loss": 2.1784, "step": 1816 }, { "epoch": 0.014130708666316185, "grad_norm": 0.12048678710865009, "learning_rate": 9.995180293245825e-05, "loss": 2.1695, "step": 1817 }, { "epoch": 0.014138485611096766, "grad_norm": 0.1464025572201055, "learning_rate": 9.995174928446437e-05, "loss": 2.1635, "step": 1818 }, { "epoch": 0.014146262555877349, "grad_norm": 0.1316618714096625, "learning_rate": 9.995169560664381e-05, "loss": 2.1645, "step": 1819 }, { "epoch": 0.01415403950065793, "grad_norm": 0.12989083532617798, "learning_rate": 9.99516418989966e-05, "loss": 2.2107, "step": 1820 }, { "epoch": 0.014161816445438511, "grad_norm": 0.1568484851294269, "learning_rate": 9.995158816152276e-05, "loss": 2.1887, "step": 1821 }, { "epoch": 0.014169593390219093, "grad_norm": 0.12908582201595112, "learning_rate": 9.995153439422233e-05, "loss": 2.1399, "step": 1822 }, { "epoch": 0.014177370334999674, "grad_norm": 0.12206123905099325, "learning_rate": 9.995148059709537e-05, "loss": 2.1712, "step": 1823 }, { "epoch": 0.014185147279780255, "grad_norm": 0.12407459717786874, "learning_rate": 9.995142677014188e-05, "loss": 2.0984, "step": 1824 }, { "epoch": 0.014192924224560836, "grad_norm": 0.12287990767248631, "learning_rate": 9.99513729133619e-05, "loss": 2.1945, "step": 1825 }, { "epoch": 0.014200701169341417, "grad_norm": 0.13520799764284003, "learning_rate": 9.995131902675546e-05, "loss": 2.1834, "step": 1826 }, { "epoch": 0.014208478114121998, "grad_norm": 0.13006608805473185, "learning_rate": 9.99512651103226e-05, "loss": 2.1948, "step": 1827 }, { "epoch": 0.01421625505890258, "grad_norm": 0.12325799684904602, "learning_rate": 9.995121116406335e-05, "loss": 2.2328, "step": 1828 }, { "epoch": 0.01422403200368316, "grad_norm": 0.11713024081939762, "learning_rate": 9.995115718797773e-05, "loss": 2.2074, "step": 1829 }, { "epoch": 0.014231808948463742, "grad_norm": 0.12087310187786168, "learning_rate": 9.995110318206579e-05, "loss": 2.1484, "step": 1830 }, { "epoch": 0.014239585893244323, "grad_norm": 0.1261920575266246, "learning_rate": 9.995104914632754e-05, "loss": 2.2118, "step": 1831 }, { "epoch": 0.014247362838024904, "grad_norm": 0.13676816822012006, "learning_rate": 9.995099508076305e-05, "loss": 2.1971, "step": 1832 }, { "epoch": 0.014255139782805485, "grad_norm": 0.12127178162207784, "learning_rate": 9.995094098537231e-05, "loss": 2.1396, "step": 1833 }, { "epoch": 0.014262916727586068, "grad_norm": 0.12548084080602365, "learning_rate": 9.995088686015539e-05, "loss": 2.2095, "step": 1834 }, { "epoch": 0.01427069367236665, "grad_norm": 0.13210544611692993, "learning_rate": 9.995083270511228e-05, "loss": 2.162, "step": 1835 }, { "epoch": 0.01427847061714723, "grad_norm": 0.13683343972472886, "learning_rate": 9.995077852024304e-05, "loss": 2.2004, "step": 1836 }, { "epoch": 0.014286247561927812, "grad_norm": 0.12996201194659257, "learning_rate": 9.99507243055477e-05, "loss": 2.1665, "step": 1837 }, { "epoch": 0.014294024506708393, "grad_norm": 0.13519105747645846, "learning_rate": 9.99506700610263e-05, "loss": 2.1416, "step": 1838 }, { "epoch": 0.014301801451488974, "grad_norm": 0.1212768080279922, "learning_rate": 9.995061578667885e-05, "loss": 2.2042, "step": 1839 }, { "epoch": 0.014309578396269555, "grad_norm": 0.1613558011824642, "learning_rate": 9.995056148250542e-05, "loss": 2.1464, "step": 1840 }, { "epoch": 0.014317355341050136, "grad_norm": 0.14355676109994708, "learning_rate": 9.9950507148506e-05, "loss": 2.1786, "step": 1841 }, { "epoch": 0.014325132285830718, "grad_norm": 0.12313538158578605, "learning_rate": 9.995045278468063e-05, "loss": 2.1616, "step": 1842 }, { "epoch": 0.014332909230611299, "grad_norm": 0.16455074459390195, "learning_rate": 9.995039839102937e-05, "loss": 2.1981, "step": 1843 }, { "epoch": 0.01434068617539188, "grad_norm": 0.1434798704755212, "learning_rate": 9.995034396755223e-05, "loss": 2.2047, "step": 1844 }, { "epoch": 0.014348463120172461, "grad_norm": 0.1306325617393696, "learning_rate": 9.995028951424923e-05, "loss": 2.2054, "step": 1845 }, { "epoch": 0.014356240064953042, "grad_norm": 0.1261912773745319, "learning_rate": 9.995023503112044e-05, "loss": 2.1428, "step": 1846 }, { "epoch": 0.014364017009733623, "grad_norm": 0.16348946479096652, "learning_rate": 9.995018051816588e-05, "loss": 2.2003, "step": 1847 }, { "epoch": 0.014371793954514205, "grad_norm": 0.16223549677097754, "learning_rate": 9.995012597538555e-05, "loss": 2.181, "step": 1848 }, { "epoch": 0.014379570899294787, "grad_norm": 0.11102102070939796, "learning_rate": 9.995007140277953e-05, "loss": 2.2028, "step": 1849 }, { "epoch": 0.014387347844075369, "grad_norm": 0.16086359499389924, "learning_rate": 9.99500168003478e-05, "loss": 2.1253, "step": 1850 }, { "epoch": 0.01439512478885595, "grad_norm": 0.17068763958760833, "learning_rate": 9.994996216809046e-05, "loss": 2.1722, "step": 1851 }, { "epoch": 0.014402901733636531, "grad_norm": 0.12117374594431767, "learning_rate": 9.994990750600749e-05, "loss": 2.1926, "step": 1852 }, { "epoch": 0.014410678678417112, "grad_norm": 0.14044124211225228, "learning_rate": 9.994985281409895e-05, "loss": 2.2059, "step": 1853 }, { "epoch": 0.014418455623197693, "grad_norm": 0.1416739274964551, "learning_rate": 9.994979809236483e-05, "loss": 2.176, "step": 1854 }, { "epoch": 0.014426232567978274, "grad_norm": 0.12124352249355594, "learning_rate": 9.994974334080521e-05, "loss": 2.2157, "step": 1855 }, { "epoch": 0.014434009512758856, "grad_norm": 0.13172097885770961, "learning_rate": 9.994968855942011e-05, "loss": 2.2301, "step": 1856 }, { "epoch": 0.014441786457539437, "grad_norm": 0.13653537579629843, "learning_rate": 9.994963374820957e-05, "loss": 2.2116, "step": 1857 }, { "epoch": 0.014449563402320018, "grad_norm": 0.37434620842019073, "learning_rate": 9.994957890717358e-05, "loss": 2.1547, "step": 1858 }, { "epoch": 0.014457340347100599, "grad_norm": 0.12739782098013575, "learning_rate": 9.994952403631224e-05, "loss": 2.2332, "step": 1859 }, { "epoch": 0.01446511729188118, "grad_norm": 0.14071572387297018, "learning_rate": 9.994946913562552e-05, "loss": 2.1608, "step": 1860 }, { "epoch": 0.014472894236661761, "grad_norm": 0.1336451767693672, "learning_rate": 9.994941420511349e-05, "loss": 2.2283, "step": 1861 }, { "epoch": 0.014480671181442343, "grad_norm": 0.12201781750661857, "learning_rate": 9.994935924477618e-05, "loss": 2.1491, "step": 1862 }, { "epoch": 0.014488448126222924, "grad_norm": 0.1258719940743395, "learning_rate": 9.99493042546136e-05, "loss": 2.1822, "step": 1863 }, { "epoch": 0.014496225071003507, "grad_norm": 0.11796615708667008, "learning_rate": 9.99492492346258e-05, "loss": 2.1752, "step": 1864 }, { "epoch": 0.014504002015784088, "grad_norm": 0.1468744103081468, "learning_rate": 9.994919418481282e-05, "loss": 2.1833, "step": 1865 }, { "epoch": 0.014511778960564669, "grad_norm": 0.15044500535061703, "learning_rate": 9.994913910517467e-05, "loss": 2.1584, "step": 1866 }, { "epoch": 0.01451955590534525, "grad_norm": 0.13254918257208326, "learning_rate": 9.994908399571142e-05, "loss": 2.1831, "step": 1867 }, { "epoch": 0.014527332850125831, "grad_norm": 0.1636274889607404, "learning_rate": 9.994902885642307e-05, "loss": 2.2037, "step": 1868 }, { "epoch": 0.014535109794906412, "grad_norm": 0.1610982254345066, "learning_rate": 9.994897368730965e-05, "loss": 2.2078, "step": 1869 }, { "epoch": 0.014542886739686994, "grad_norm": 0.18430921048871496, "learning_rate": 9.994891848837123e-05, "loss": 2.1714, "step": 1870 }, { "epoch": 0.014550663684467575, "grad_norm": 0.18164090610608036, "learning_rate": 9.99488632596078e-05, "loss": 2.1947, "step": 1871 }, { "epoch": 0.014558440629248156, "grad_norm": 0.12165962817631878, "learning_rate": 9.994880800101942e-05, "loss": 2.1641, "step": 1872 }, { "epoch": 0.014566217574028737, "grad_norm": 0.18026186441717734, "learning_rate": 9.994875271260611e-05, "loss": 2.2116, "step": 1873 }, { "epoch": 0.014573994518809318, "grad_norm": 0.20376426663300037, "learning_rate": 9.99486973943679e-05, "loss": 2.1387, "step": 1874 }, { "epoch": 0.0145817714635899, "grad_norm": 0.13658334086801793, "learning_rate": 9.994864204630485e-05, "loss": 2.1389, "step": 1875 }, { "epoch": 0.01458954840837048, "grad_norm": 0.17364935846165158, "learning_rate": 9.994858666841696e-05, "loss": 2.1591, "step": 1876 }, { "epoch": 0.014597325353151062, "grad_norm": 0.18804307654117883, "learning_rate": 9.994853126070426e-05, "loss": 2.1483, "step": 1877 }, { "epoch": 0.014605102297931643, "grad_norm": 0.12868951599284018, "learning_rate": 9.994847582316684e-05, "loss": 2.1307, "step": 1878 }, { "epoch": 0.014612879242712226, "grad_norm": 0.2099012072930813, "learning_rate": 9.994842035580467e-05, "loss": 2.1628, "step": 1879 }, { "epoch": 0.014620656187492807, "grad_norm": 0.22853564407901852, "learning_rate": 9.99483648586178e-05, "loss": 2.2392, "step": 1880 }, { "epoch": 0.014628433132273388, "grad_norm": 0.13347965510348248, "learning_rate": 9.994830933160628e-05, "loss": 2.182, "step": 1881 }, { "epoch": 0.01463621007705397, "grad_norm": 0.1609816142066746, "learning_rate": 9.994825377477013e-05, "loss": 2.1074, "step": 1882 }, { "epoch": 0.01464398702183455, "grad_norm": 0.2032395129239771, "learning_rate": 9.994819818810938e-05, "loss": 2.1759, "step": 1883 }, { "epoch": 0.014651763966615132, "grad_norm": 0.12638208749364196, "learning_rate": 9.994814257162408e-05, "loss": 2.1386, "step": 1884 }, { "epoch": 0.014659540911395713, "grad_norm": 0.17314832192500823, "learning_rate": 9.994808692531425e-05, "loss": 2.1785, "step": 1885 }, { "epoch": 0.014667317856176294, "grad_norm": 0.19295306469245296, "learning_rate": 9.994803124917993e-05, "loss": 2.176, "step": 1886 }, { "epoch": 0.014675094800956875, "grad_norm": 0.1159555317679867, "learning_rate": 9.994797554322113e-05, "loss": 2.2019, "step": 1887 }, { "epoch": 0.014682871745737456, "grad_norm": 0.20410093120709186, "learning_rate": 9.994791980743792e-05, "loss": 2.1675, "step": 1888 }, { "epoch": 0.014690648690518037, "grad_norm": 0.21608066917397936, "learning_rate": 9.99478640418303e-05, "loss": 2.1875, "step": 1889 }, { "epoch": 0.014698425635298619, "grad_norm": 0.14513679961187892, "learning_rate": 9.994780824639833e-05, "loss": 2.1844, "step": 1890 }, { "epoch": 0.0147062025800792, "grad_norm": 0.13354623346410657, "learning_rate": 9.994775242114203e-05, "loss": 2.1755, "step": 1891 }, { "epoch": 0.014713979524859781, "grad_norm": 0.16774879341380708, "learning_rate": 9.994769656606142e-05, "loss": 2.2047, "step": 1892 }, { "epoch": 0.014721756469640364, "grad_norm": 0.12241196062249687, "learning_rate": 9.994764068115656e-05, "loss": 2.18, "step": 1893 }, { "epoch": 0.014729533414420945, "grad_norm": 0.19533768769657253, "learning_rate": 9.994758476642747e-05, "loss": 2.1861, "step": 1894 }, { "epoch": 0.014737310359201526, "grad_norm": 0.19294533416310844, "learning_rate": 9.994752882187419e-05, "loss": 2.2036, "step": 1895 }, { "epoch": 0.014745087303982107, "grad_norm": 0.12876905237068262, "learning_rate": 9.994747284749675e-05, "loss": 2.2606, "step": 1896 }, { "epoch": 0.014752864248762688, "grad_norm": 0.2636735077400741, "learning_rate": 9.994741684329519e-05, "loss": 2.2099, "step": 1897 }, { "epoch": 0.01476064119354327, "grad_norm": 0.2500566143550606, "learning_rate": 9.99473608092695e-05, "loss": 2.163, "step": 1898 }, { "epoch": 0.01476841813832385, "grad_norm": 0.1570451277628582, "learning_rate": 9.994730474541977e-05, "loss": 2.1904, "step": 1899 }, { "epoch": 0.014776195083104432, "grad_norm": 0.21050034784040358, "learning_rate": 9.994724865174603e-05, "loss": 2.1436, "step": 1900 }, { "epoch": 0.014783972027885013, "grad_norm": 0.12026153624745196, "learning_rate": 9.994719252824826e-05, "loss": 2.1296, "step": 1901 }, { "epoch": 0.014791748972665594, "grad_norm": 0.18854228821916305, "learning_rate": 9.994713637492657e-05, "loss": 2.1305, "step": 1902 }, { "epoch": 0.014799525917446175, "grad_norm": 0.15062664380515933, "learning_rate": 9.994708019178092e-05, "loss": 2.1639, "step": 1903 }, { "epoch": 0.014807302862226757, "grad_norm": 0.14626348374177783, "learning_rate": 9.994702397881139e-05, "loss": 2.1756, "step": 1904 }, { "epoch": 0.014815079807007338, "grad_norm": 0.20779732480092153, "learning_rate": 9.9946967736018e-05, "loss": 2.1256, "step": 1905 }, { "epoch": 0.014822856751787919, "grad_norm": 0.14275799714109455, "learning_rate": 9.994691146340079e-05, "loss": 2.1434, "step": 1906 }, { "epoch": 0.0148306336965685, "grad_norm": 0.14221422624903096, "learning_rate": 9.994685516095977e-05, "loss": 2.1939, "step": 1907 }, { "epoch": 0.014838410641349083, "grad_norm": 0.15457538226711662, "learning_rate": 9.9946798828695e-05, "loss": 2.1814, "step": 1908 }, { "epoch": 0.014846187586129664, "grad_norm": 0.12456279972960016, "learning_rate": 9.994674246660649e-05, "loss": 2.1899, "step": 1909 }, { "epoch": 0.014853964530910245, "grad_norm": 0.1824814717469908, "learning_rate": 9.994668607469431e-05, "loss": 2.1111, "step": 1910 }, { "epoch": 0.014861741475690827, "grad_norm": 0.1460861505307216, "learning_rate": 9.994662965295847e-05, "loss": 2.1193, "step": 1911 }, { "epoch": 0.014869518420471408, "grad_norm": 0.13244124945744148, "learning_rate": 9.994657320139899e-05, "loss": 2.2285, "step": 1912 }, { "epoch": 0.014877295365251989, "grad_norm": 0.18330939378656633, "learning_rate": 9.994651672001593e-05, "loss": 2.1615, "step": 1913 }, { "epoch": 0.01488507231003257, "grad_norm": 0.14879835555568302, "learning_rate": 9.994646020880932e-05, "loss": 2.157, "step": 1914 }, { "epoch": 0.014892849254813151, "grad_norm": 0.13930951176795495, "learning_rate": 9.994640366777917e-05, "loss": 2.1722, "step": 1915 }, { "epoch": 0.014900626199593732, "grad_norm": 0.20480012445268422, "learning_rate": 9.994634709692552e-05, "loss": 2.1731, "step": 1916 }, { "epoch": 0.014908403144374313, "grad_norm": 0.15279249534301828, "learning_rate": 9.994629049624844e-05, "loss": 2.1791, "step": 1917 }, { "epoch": 0.014916180089154895, "grad_norm": 0.13387975031507035, "learning_rate": 9.994623386574793e-05, "loss": 2.2002, "step": 1918 }, { "epoch": 0.014923957033935476, "grad_norm": 0.19827866432111863, "learning_rate": 9.994617720542404e-05, "loss": 2.152, "step": 1919 }, { "epoch": 0.014931733978716057, "grad_norm": 0.14989480675952696, "learning_rate": 9.994612051527678e-05, "loss": 2.1423, "step": 1920 }, { "epoch": 0.014939510923496638, "grad_norm": 0.13655286718794865, "learning_rate": 9.99460637953062e-05, "loss": 2.1817, "step": 1921 }, { "epoch": 0.01494728786827722, "grad_norm": 0.15701873507762923, "learning_rate": 9.994600704551233e-05, "loss": 2.2178, "step": 1922 }, { "epoch": 0.014955064813057802, "grad_norm": 0.1253447112163309, "learning_rate": 9.994595026589522e-05, "loss": 2.1862, "step": 1923 }, { "epoch": 0.014962841757838383, "grad_norm": 0.14435165718796283, "learning_rate": 9.994589345645489e-05, "loss": 2.2129, "step": 1924 }, { "epoch": 0.014970618702618965, "grad_norm": 0.15096717861658113, "learning_rate": 9.994583661719137e-05, "loss": 2.1904, "step": 1925 }, { "epoch": 0.014978395647399546, "grad_norm": 0.11178545192076897, "learning_rate": 9.99457797481047e-05, "loss": 2.1642, "step": 1926 }, { "epoch": 0.014986172592180127, "grad_norm": 0.12638992647499947, "learning_rate": 9.994572284919492e-05, "loss": 2.214, "step": 1927 }, { "epoch": 0.014993949536960708, "grad_norm": 0.15130499378915743, "learning_rate": 9.994566592046204e-05, "loss": 2.0972, "step": 1928 }, { "epoch": 0.01500172648174129, "grad_norm": 0.12292437310667753, "learning_rate": 9.994560896190612e-05, "loss": 2.1386, "step": 1929 }, { "epoch": 0.01500950342652187, "grad_norm": 0.13681411026806947, "learning_rate": 9.99455519735272e-05, "loss": 2.1391, "step": 1930 }, { "epoch": 0.015017280371302451, "grad_norm": 0.13927466705289415, "learning_rate": 9.994549495532528e-05, "loss": 2.1948, "step": 1931 }, { "epoch": 0.015025057316083033, "grad_norm": 0.14966581050532843, "learning_rate": 9.994543790730042e-05, "loss": 2.2087, "step": 1932 }, { "epoch": 0.015032834260863614, "grad_norm": 0.13397765984557441, "learning_rate": 9.994538082945264e-05, "loss": 2.235, "step": 1933 }, { "epoch": 0.015040611205644195, "grad_norm": 0.14316155174037326, "learning_rate": 9.994532372178199e-05, "loss": 2.1335, "step": 1934 }, { "epoch": 0.015048388150424776, "grad_norm": 0.1617792860408269, "learning_rate": 9.994526658428849e-05, "loss": 2.1898, "step": 1935 }, { "epoch": 0.015056165095205357, "grad_norm": 0.13628163191575762, "learning_rate": 9.994520941697217e-05, "loss": 2.1354, "step": 1936 }, { "epoch": 0.015063942039985938, "grad_norm": 0.12833737722042127, "learning_rate": 9.99451522198331e-05, "loss": 2.2039, "step": 1937 }, { "epoch": 0.015071718984766521, "grad_norm": 0.12070945527393082, "learning_rate": 9.994509499287125e-05, "loss": 2.1468, "step": 1938 }, { "epoch": 0.015079495929547103, "grad_norm": 0.14729727066409654, "learning_rate": 9.994503773608672e-05, "loss": 2.1553, "step": 1939 }, { "epoch": 0.015087272874327684, "grad_norm": 0.1409946737822001, "learning_rate": 9.99449804494795e-05, "loss": 2.2154, "step": 1940 }, { "epoch": 0.015095049819108265, "grad_norm": 0.115972179394, "learning_rate": 9.994492313304966e-05, "loss": 2.2218, "step": 1941 }, { "epoch": 0.015102826763888846, "grad_norm": 0.13919377959520368, "learning_rate": 9.99448657867972e-05, "loss": 2.1571, "step": 1942 }, { "epoch": 0.015110603708669427, "grad_norm": 0.1446812242424704, "learning_rate": 9.994480841072217e-05, "loss": 2.1741, "step": 1943 }, { "epoch": 0.015118380653450008, "grad_norm": 0.12487426838020414, "learning_rate": 9.994475100482462e-05, "loss": 2.1651, "step": 1944 }, { "epoch": 0.01512615759823059, "grad_norm": 0.13426386453639919, "learning_rate": 9.994469356910454e-05, "loss": 2.1374, "step": 1945 }, { "epoch": 0.01513393454301117, "grad_norm": 0.17648411491648114, "learning_rate": 9.9944636103562e-05, "loss": 2.1696, "step": 1946 }, { "epoch": 0.015141711487791752, "grad_norm": 0.12197841570542556, "learning_rate": 9.994457860819703e-05, "loss": 2.2011, "step": 1947 }, { "epoch": 0.015149488432572333, "grad_norm": 0.1616461801633883, "learning_rate": 9.994452108300966e-05, "loss": 2.1217, "step": 1948 }, { "epoch": 0.015157265377352914, "grad_norm": 0.14009274623964119, "learning_rate": 9.994446352799991e-05, "loss": 2.1962, "step": 1949 }, { "epoch": 0.015165042322133495, "grad_norm": 0.11222649185123881, "learning_rate": 9.994440594316785e-05, "loss": 2.2092, "step": 1950 }, { "epoch": 0.015172819266914076, "grad_norm": 0.12602399671465672, "learning_rate": 9.994434832851348e-05, "loss": 2.1388, "step": 1951 }, { "epoch": 0.015180596211694658, "grad_norm": 0.11644933909115468, "learning_rate": 9.994429068403686e-05, "loss": 2.1898, "step": 1952 }, { "epoch": 0.01518837315647524, "grad_norm": 0.13086086674571837, "learning_rate": 9.9944233009738e-05, "loss": 2.2085, "step": 1953 }, { "epoch": 0.015196150101255822, "grad_norm": 0.11951312640261721, "learning_rate": 9.994417530561695e-05, "loss": 2.1935, "step": 1954 }, { "epoch": 0.015203927046036403, "grad_norm": 0.11913626924845212, "learning_rate": 9.994411757167374e-05, "loss": 2.1603, "step": 1955 }, { "epoch": 0.015211703990816984, "grad_norm": 0.11993545091585596, "learning_rate": 9.994405980790841e-05, "loss": 2.128, "step": 1956 }, { "epoch": 0.015219480935597565, "grad_norm": 0.16250796734236178, "learning_rate": 9.994400201432099e-05, "loss": 2.2317, "step": 1957 }, { "epoch": 0.015227257880378146, "grad_norm": 0.11999618543145361, "learning_rate": 9.99439441909115e-05, "loss": 2.2089, "step": 1958 }, { "epoch": 0.015235034825158728, "grad_norm": 0.13995992954093625, "learning_rate": 9.994388633768001e-05, "loss": 2.1684, "step": 1959 }, { "epoch": 0.015242811769939309, "grad_norm": 0.11976935134427144, "learning_rate": 9.994382845462652e-05, "loss": 2.1533, "step": 1960 }, { "epoch": 0.01525058871471989, "grad_norm": 0.12294383671215743, "learning_rate": 9.994377054175108e-05, "loss": 2.2135, "step": 1961 }, { "epoch": 0.015258365659500471, "grad_norm": 0.12523531585124645, "learning_rate": 9.994371259905373e-05, "loss": 2.1803, "step": 1962 }, { "epoch": 0.015266142604281052, "grad_norm": 0.14326375568736321, "learning_rate": 9.994365462653447e-05, "loss": 2.1497, "step": 1963 }, { "epoch": 0.015273919549061633, "grad_norm": 0.1253694066857873, "learning_rate": 9.99435966241934e-05, "loss": 2.1311, "step": 1964 }, { "epoch": 0.015281696493842215, "grad_norm": 0.1392009730549658, "learning_rate": 9.99435385920305e-05, "loss": 2.1651, "step": 1965 }, { "epoch": 0.015289473438622796, "grad_norm": 0.13934020485526558, "learning_rate": 9.994348053004581e-05, "loss": 2.1075, "step": 1966 }, { "epoch": 0.015297250383403377, "grad_norm": 0.1259732892637668, "learning_rate": 9.994342243823939e-05, "loss": 2.1653, "step": 1967 }, { "epoch": 0.01530502732818396, "grad_norm": 0.12386325008285093, "learning_rate": 9.994336431661126e-05, "loss": 2.1873, "step": 1968 }, { "epoch": 0.015312804272964541, "grad_norm": 0.1534011842123293, "learning_rate": 9.994330616516145e-05, "loss": 2.1681, "step": 1969 }, { "epoch": 0.015320581217745122, "grad_norm": 0.15745714811786068, "learning_rate": 9.994324798389e-05, "loss": 2.194, "step": 1970 }, { "epoch": 0.015328358162525703, "grad_norm": 0.14485945617259866, "learning_rate": 9.994318977279695e-05, "loss": 2.1742, "step": 1971 }, { "epoch": 0.015336135107306284, "grad_norm": 0.13395269960287198, "learning_rate": 9.994313153188233e-05, "loss": 2.1602, "step": 1972 }, { "epoch": 0.015343912052086866, "grad_norm": 0.11809709553854761, "learning_rate": 9.994307326114616e-05, "loss": 2.0868, "step": 1973 }, { "epoch": 0.015351688996867447, "grad_norm": 0.15093321532444137, "learning_rate": 9.99430149605885e-05, "loss": 2.1088, "step": 1974 }, { "epoch": 0.015359465941648028, "grad_norm": 0.14570755284698303, "learning_rate": 9.994295663020937e-05, "loss": 2.1468, "step": 1975 }, { "epoch": 0.015367242886428609, "grad_norm": 0.14437010165138578, "learning_rate": 9.994289827000882e-05, "loss": 2.2105, "step": 1976 }, { "epoch": 0.01537501983120919, "grad_norm": 0.12322855827118338, "learning_rate": 9.994283987998685e-05, "loss": 2.1541, "step": 1977 }, { "epoch": 0.015382796775989771, "grad_norm": 0.14657743871616044, "learning_rate": 9.994278146014353e-05, "loss": 2.1634, "step": 1978 }, { "epoch": 0.015390573720770353, "grad_norm": 0.17567631992896923, "learning_rate": 9.994272301047889e-05, "loss": 2.1428, "step": 1979 }, { "epoch": 0.015398350665550934, "grad_norm": 0.12762625026624155, "learning_rate": 9.994266453099295e-05, "loss": 2.163, "step": 1980 }, { "epoch": 0.015406127610331515, "grad_norm": 0.1549062783155912, "learning_rate": 9.994260602168576e-05, "loss": 2.1643, "step": 1981 }, { "epoch": 0.015413904555112096, "grad_norm": 0.19805591138976752, "learning_rate": 9.994254748255734e-05, "loss": 2.1897, "step": 1982 }, { "epoch": 0.015421681499892679, "grad_norm": 0.12839106866493516, "learning_rate": 9.994248891360773e-05, "loss": 2.1821, "step": 1983 }, { "epoch": 0.01542945844467326, "grad_norm": 0.1529147127272843, "learning_rate": 9.994243031483699e-05, "loss": 2.234, "step": 1984 }, { "epoch": 0.015437235389453841, "grad_norm": 0.18017905706777634, "learning_rate": 9.994237168624512e-05, "loss": 2.1761, "step": 1985 }, { "epoch": 0.015445012334234422, "grad_norm": 0.12873857322510943, "learning_rate": 9.994231302783216e-05, "loss": 2.1641, "step": 1986 }, { "epoch": 0.015452789279015004, "grad_norm": 0.13178827612827493, "learning_rate": 9.994225433959816e-05, "loss": 2.1641, "step": 1987 }, { "epoch": 0.015460566223795585, "grad_norm": 0.1321940940050828, "learning_rate": 9.994219562154315e-05, "loss": 2.1705, "step": 1988 }, { "epoch": 0.015468343168576166, "grad_norm": 0.12113869602593391, "learning_rate": 9.994213687366716e-05, "loss": 2.1721, "step": 1989 }, { "epoch": 0.015476120113356747, "grad_norm": 0.1474908564654234, "learning_rate": 9.994207809597023e-05, "loss": 2.2051, "step": 1990 }, { "epoch": 0.015483897058137328, "grad_norm": 0.1383428463108477, "learning_rate": 9.994201928845237e-05, "loss": 2.1238, "step": 1991 }, { "epoch": 0.01549167400291791, "grad_norm": 0.19972988309576056, "learning_rate": 9.994196045111367e-05, "loss": 2.1044, "step": 1992 }, { "epoch": 0.01549945094769849, "grad_norm": 0.1413485831548887, "learning_rate": 9.994190158395411e-05, "loss": 2.1564, "step": 1993 }, { "epoch": 0.015507227892479072, "grad_norm": 0.14196255573576655, "learning_rate": 9.994184268697377e-05, "loss": 2.2013, "step": 1994 }, { "epoch": 0.015515004837259653, "grad_norm": 0.13185016097697988, "learning_rate": 9.994178376017265e-05, "loss": 2.1694, "step": 1995 }, { "epoch": 0.015522781782040234, "grad_norm": 0.12641013074029098, "learning_rate": 9.994172480355081e-05, "loss": 2.1119, "step": 1996 }, { "epoch": 0.015530558726820817, "grad_norm": 0.24461331025790983, "learning_rate": 9.994166581710826e-05, "loss": 2.1674, "step": 1997 }, { "epoch": 0.015538335671601398, "grad_norm": 0.12618374444561836, "learning_rate": 9.994160680084506e-05, "loss": 2.1838, "step": 1998 }, { "epoch": 0.01554611261638198, "grad_norm": 0.12527169110407296, "learning_rate": 9.994154775476124e-05, "loss": 2.1691, "step": 1999 }, { "epoch": 0.01555388956116256, "grad_norm": 0.1254959519065607, "learning_rate": 9.99414886788568e-05, "loss": 2.1452, "step": 2000 }, { "epoch": 0.015561666505943142, "grad_norm": 0.12467603162611648, "learning_rate": 9.994142957313184e-05, "loss": 2.1665, "step": 2001 }, { "epoch": 0.015569443450723723, "grad_norm": 0.13271336024111627, "learning_rate": 9.994137043758634e-05, "loss": 2.1571, "step": 2002 }, { "epoch": 0.015577220395504304, "grad_norm": 0.2331488493470543, "learning_rate": 9.994131127222036e-05, "loss": 2.2181, "step": 2003 }, { "epoch": 0.015584997340284885, "grad_norm": 0.12316325020314497, "learning_rate": 9.994125207703393e-05, "loss": 2.1452, "step": 2004 }, { "epoch": 0.015592774285065466, "grad_norm": 0.12745083718525874, "learning_rate": 9.99411928520271e-05, "loss": 2.1776, "step": 2005 }, { "epoch": 0.015600551229846047, "grad_norm": 0.11489341797190197, "learning_rate": 9.994113359719988e-05, "loss": 2.1612, "step": 2006 }, { "epoch": 0.015608328174626629, "grad_norm": 0.2036603962326646, "learning_rate": 9.994107431255231e-05, "loss": 2.1465, "step": 2007 }, { "epoch": 0.01561610511940721, "grad_norm": 0.11990412964696978, "learning_rate": 9.994101499808444e-05, "loss": 2.1813, "step": 2008 }, { "epoch": 0.015623882064187791, "grad_norm": 0.12323765112625726, "learning_rate": 9.994095565379631e-05, "loss": 2.1869, "step": 2009 }, { "epoch": 0.015631659008968374, "grad_norm": 0.1275102954410153, "learning_rate": 9.994089627968793e-05, "loss": 2.1292, "step": 2010 }, { "epoch": 0.015639435953748955, "grad_norm": 0.1391694269677792, "learning_rate": 9.994083687575935e-05, "loss": 2.1579, "step": 2011 }, { "epoch": 0.015647212898529536, "grad_norm": 0.13029447031138214, "learning_rate": 9.99407774420106e-05, "loss": 2.1804, "step": 2012 }, { "epoch": 0.015654989843310117, "grad_norm": 0.14307303948103778, "learning_rate": 9.994071797844172e-05, "loss": 2.1838, "step": 2013 }, { "epoch": 0.0156627667880907, "grad_norm": 0.13432050786708605, "learning_rate": 9.994065848505275e-05, "loss": 2.1347, "step": 2014 }, { "epoch": 0.01567054373287128, "grad_norm": 0.1323404798371813, "learning_rate": 9.994059896184373e-05, "loss": 2.1671, "step": 2015 }, { "epoch": 0.01567832067765186, "grad_norm": 0.11579032391379289, "learning_rate": 9.994053940881467e-05, "loss": 2.1628, "step": 2016 }, { "epoch": 0.015686097622432442, "grad_norm": 0.13214162894882173, "learning_rate": 9.994047982596564e-05, "loss": 2.2297, "step": 2017 }, { "epoch": 0.015693874567213023, "grad_norm": 0.1500163953339119, "learning_rate": 9.994042021329664e-05, "loss": 2.1525, "step": 2018 }, { "epoch": 0.015701651511993604, "grad_norm": 0.13536027051399954, "learning_rate": 9.994036057080773e-05, "loss": 2.157, "step": 2019 }, { "epoch": 0.015709428456774185, "grad_norm": 0.12937919339790982, "learning_rate": 9.994030089849893e-05, "loss": 2.209, "step": 2020 }, { "epoch": 0.015717205401554767, "grad_norm": 0.19569852885867467, "learning_rate": 9.994024119637029e-05, "loss": 2.1772, "step": 2021 }, { "epoch": 0.015724982346335348, "grad_norm": 0.15147368525975785, "learning_rate": 9.994018146442185e-05, "loss": 2.152, "step": 2022 }, { "epoch": 0.01573275929111593, "grad_norm": 0.13369161786138337, "learning_rate": 9.994012170265363e-05, "loss": 2.1349, "step": 2023 }, { "epoch": 0.01574053623589651, "grad_norm": 0.12650148164925637, "learning_rate": 9.994006191106567e-05, "loss": 2.2361, "step": 2024 }, { "epoch": 0.01574831318067709, "grad_norm": 0.1243382846741018, "learning_rate": 9.9940002089658e-05, "loss": 2.1316, "step": 2025 }, { "epoch": 0.015756090125457672, "grad_norm": 0.13400085322295377, "learning_rate": 9.993994223843068e-05, "loss": 2.1099, "step": 2026 }, { "epoch": 0.015763867070238254, "grad_norm": 0.14972131964017807, "learning_rate": 9.993988235738371e-05, "loss": 2.2143, "step": 2027 }, { "epoch": 0.015771644015018835, "grad_norm": 0.13070890140024743, "learning_rate": 9.993982244651715e-05, "loss": 2.1522, "step": 2028 }, { "epoch": 0.015779420959799416, "grad_norm": 0.13052760193226212, "learning_rate": 9.993976250583104e-05, "loss": 2.1415, "step": 2029 }, { "epoch": 0.015787197904579997, "grad_norm": 0.15535501660359058, "learning_rate": 9.993970253532539e-05, "loss": 2.1581, "step": 2030 }, { "epoch": 0.015794974849360578, "grad_norm": 0.1255869976256805, "learning_rate": 9.993964253500026e-05, "loss": 2.1812, "step": 2031 }, { "epoch": 0.01580275179414116, "grad_norm": 0.16766056701264254, "learning_rate": 9.993958250485566e-05, "loss": 2.1872, "step": 2032 }, { "epoch": 0.01581052873892174, "grad_norm": 0.18483661811025215, "learning_rate": 9.993952244489168e-05, "loss": 2.1631, "step": 2033 }, { "epoch": 0.01581830568370232, "grad_norm": 0.15479924758740293, "learning_rate": 9.99394623551083e-05, "loss": 2.1952, "step": 2034 }, { "epoch": 0.015826082628482906, "grad_norm": 0.14079000052439092, "learning_rate": 9.993940223550556e-05, "loss": 2.133, "step": 2035 }, { "epoch": 0.015833859573263488, "grad_norm": 0.12980630416985403, "learning_rate": 9.993934208608353e-05, "loss": 2.1617, "step": 2036 }, { "epoch": 0.01584163651804407, "grad_norm": 0.1545980592800816, "learning_rate": 9.993928190684224e-05, "loss": 2.1769, "step": 2037 }, { "epoch": 0.01584941346282465, "grad_norm": 0.1230220816917447, "learning_rate": 9.993922169778167e-05, "loss": 2.199, "step": 2038 }, { "epoch": 0.01585719040760523, "grad_norm": 0.12562007680697895, "learning_rate": 9.993916145890193e-05, "loss": 2.1853, "step": 2039 }, { "epoch": 0.015864967352385812, "grad_norm": 0.13924679953448948, "learning_rate": 9.993910119020302e-05, "loss": 2.2087, "step": 2040 }, { "epoch": 0.015872744297166393, "grad_norm": 0.1319619290885077, "learning_rate": 9.993904089168498e-05, "loss": 2.1638, "step": 2041 }, { "epoch": 0.015880521241946974, "grad_norm": 0.1326501939977941, "learning_rate": 9.993898056334784e-05, "loss": 2.1607, "step": 2042 }, { "epoch": 0.015888298186727556, "grad_norm": 0.13992775460251708, "learning_rate": 9.993892020519165e-05, "loss": 2.1869, "step": 2043 }, { "epoch": 0.015896075131508137, "grad_norm": 0.3039768022869953, "learning_rate": 9.993885981721644e-05, "loss": 2.1777, "step": 2044 }, { "epoch": 0.015903852076288718, "grad_norm": 0.13744474084953398, "learning_rate": 9.993879939942223e-05, "loss": 2.194, "step": 2045 }, { "epoch": 0.0159116290210693, "grad_norm": 0.14950664491133409, "learning_rate": 9.99387389518091e-05, "loss": 2.1729, "step": 2046 }, { "epoch": 0.01591940596584988, "grad_norm": 0.23151527956203735, "learning_rate": 9.993867847437704e-05, "loss": 2.2081, "step": 2047 }, { "epoch": 0.01592718291063046, "grad_norm": 0.18322681739041122, "learning_rate": 9.99386179671261e-05, "loss": 2.1753, "step": 2048 }, { "epoch": 0.015934959855411043, "grad_norm": 0.22090398347372187, "learning_rate": 9.993855743005631e-05, "loss": 2.2016, "step": 2049 }, { "epoch": 0.015942736800191624, "grad_norm": 0.27212418446686776, "learning_rate": 9.993849686316773e-05, "loss": 2.1418, "step": 2050 }, { "epoch": 0.015950513744972205, "grad_norm": 0.23431382582481586, "learning_rate": 9.993843626646038e-05, "loss": 2.1669, "step": 2051 }, { "epoch": 0.015958290689752786, "grad_norm": 0.14464018012638816, "learning_rate": 9.99383756399343e-05, "loss": 2.1416, "step": 2052 }, { "epoch": 0.015966067634533367, "grad_norm": 0.14109484315748055, "learning_rate": 9.993831498358951e-05, "loss": 2.1797, "step": 2053 }, { "epoch": 0.01597384457931395, "grad_norm": 0.11766120644436777, "learning_rate": 9.993825429742607e-05, "loss": 2.1424, "step": 2054 }, { "epoch": 0.01598162152409453, "grad_norm": 0.1449845876621761, "learning_rate": 9.993819358144401e-05, "loss": 2.1689, "step": 2055 }, { "epoch": 0.01598939846887511, "grad_norm": 0.307629689641504, "learning_rate": 9.993813283564335e-05, "loss": 2.2189, "step": 2056 }, { "epoch": 0.015997175413655692, "grad_norm": 0.15061922902255684, "learning_rate": 9.993807206002416e-05, "loss": 2.2239, "step": 2057 }, { "epoch": 0.016004952358436273, "grad_norm": 0.12291648074771645, "learning_rate": 9.993801125458643e-05, "loss": 2.1468, "step": 2058 }, { "epoch": 0.016012729303216854, "grad_norm": 0.12086914647796936, "learning_rate": 9.993795041933025e-05, "loss": 2.159, "step": 2059 }, { "epoch": 0.016020506247997435, "grad_norm": 0.13802109799128948, "learning_rate": 9.99378895542556e-05, "loss": 2.1673, "step": 2060 }, { "epoch": 0.016028283192778017, "grad_norm": 0.25213472015683996, "learning_rate": 9.993782865936257e-05, "loss": 2.1697, "step": 2061 }, { "epoch": 0.016036060137558598, "grad_norm": 0.16396452968930342, "learning_rate": 9.993776773465115e-05, "loss": 2.1939, "step": 2062 }, { "epoch": 0.01604383708233918, "grad_norm": 0.14806143647854, "learning_rate": 9.99377067801214e-05, "loss": 2.2078, "step": 2063 }, { "epoch": 0.016051614027119764, "grad_norm": 0.3460462315711752, "learning_rate": 9.993764579577336e-05, "loss": 2.1859, "step": 2064 }, { "epoch": 0.016059390971900345, "grad_norm": 0.2640362606960125, "learning_rate": 9.993758478160706e-05, "loss": 2.1844, "step": 2065 }, { "epoch": 0.016067167916680926, "grad_norm": 0.18614461432856544, "learning_rate": 9.993752373762253e-05, "loss": 2.1144, "step": 2066 }, { "epoch": 0.016074944861461507, "grad_norm": 0.14671098509794067, "learning_rate": 9.993746266381981e-05, "loss": 2.1505, "step": 2067 }, { "epoch": 0.016082721806242088, "grad_norm": 0.19059948404133092, "learning_rate": 9.993740156019895e-05, "loss": 2.1416, "step": 2068 }, { "epoch": 0.01609049875102267, "grad_norm": 0.19229663992751592, "learning_rate": 9.993734042675997e-05, "loss": 2.2042, "step": 2069 }, { "epoch": 0.01609827569580325, "grad_norm": 0.16992907833091894, "learning_rate": 9.993727926350291e-05, "loss": 2.1635, "step": 2070 }, { "epoch": 0.01610605264058383, "grad_norm": 0.1412750568289624, "learning_rate": 9.993721807042782e-05, "loss": 2.124, "step": 2071 }, { "epoch": 0.016113829585364413, "grad_norm": 0.1452882424921817, "learning_rate": 9.99371568475347e-05, "loss": 2.165, "step": 2072 }, { "epoch": 0.016121606530144994, "grad_norm": 0.18281153260111546, "learning_rate": 9.993709559482363e-05, "loss": 2.1454, "step": 2073 }, { "epoch": 0.016129383474925575, "grad_norm": 0.18425529721286119, "learning_rate": 9.993703431229463e-05, "loss": 2.1738, "step": 2074 }, { "epoch": 0.016137160419706156, "grad_norm": 0.14027846459458135, "learning_rate": 9.993697299994773e-05, "loss": 2.1955, "step": 2075 }, { "epoch": 0.016144937364486737, "grad_norm": 0.13696613158498877, "learning_rate": 9.993691165778297e-05, "loss": 2.1654, "step": 2076 }, { "epoch": 0.01615271430926732, "grad_norm": 0.42615416921049365, "learning_rate": 9.99368502858004e-05, "loss": 2.2211, "step": 2077 }, { "epoch": 0.0161604912540479, "grad_norm": 0.2237125141768299, "learning_rate": 9.993678888400002e-05, "loss": 2.1304, "step": 2078 }, { "epoch": 0.01616826819882848, "grad_norm": 0.1740076407610878, "learning_rate": 9.993672745238192e-05, "loss": 2.1748, "step": 2079 }, { "epoch": 0.016176045143609062, "grad_norm": 0.12616614430065212, "learning_rate": 9.99366659909461e-05, "loss": 2.216, "step": 2080 }, { "epoch": 0.016183822088389643, "grad_norm": 0.18565067313447417, "learning_rate": 9.993660449969258e-05, "loss": 2.1874, "step": 2081 }, { "epoch": 0.016191599033170224, "grad_norm": 0.23102258365304815, "learning_rate": 9.993654297862146e-05, "loss": 2.1211, "step": 2082 }, { "epoch": 0.016199375977950806, "grad_norm": 0.22739324853679901, "learning_rate": 9.993648142773271e-05, "loss": 2.1535, "step": 2083 }, { "epoch": 0.016207152922731387, "grad_norm": 0.15342735752024309, "learning_rate": 9.99364198470264e-05, "loss": 2.1328, "step": 2084 }, { "epoch": 0.016214929867511968, "grad_norm": 1.1056101897506008, "learning_rate": 9.993635823650256e-05, "loss": 2.1882, "step": 2085 }, { "epoch": 0.01622270681229255, "grad_norm": 0.18713001490548078, "learning_rate": 9.993629659616123e-05, "loss": 2.1411, "step": 2086 }, { "epoch": 0.01623048375707313, "grad_norm": 0.18872340855221104, "learning_rate": 9.993623492600245e-05, "loss": 2.1349, "step": 2087 }, { "epoch": 0.01623826070185371, "grad_norm": 0.15294067304029724, "learning_rate": 9.993617322602627e-05, "loss": 2.185, "step": 2088 }, { "epoch": 0.016246037646634293, "grad_norm": 0.1342523944578063, "learning_rate": 9.993611149623267e-05, "loss": 2.2299, "step": 2089 }, { "epoch": 0.016253814591414874, "grad_norm": 0.190405030473947, "learning_rate": 9.993604973662176e-05, "loss": 2.1725, "step": 2090 }, { "epoch": 0.016261591536195455, "grad_norm": 0.1953448777878211, "learning_rate": 9.993598794719352e-05, "loss": 2.1894, "step": 2091 }, { "epoch": 0.016269368480976036, "grad_norm": 0.1546283217293866, "learning_rate": 9.993592612794802e-05, "loss": 2.1474, "step": 2092 }, { "epoch": 0.016277145425756617, "grad_norm": 0.13599049452521025, "learning_rate": 9.993586427888529e-05, "loss": 2.146, "step": 2093 }, { "epoch": 0.016284922370537202, "grad_norm": 0.18029167301447993, "learning_rate": 9.993580240000535e-05, "loss": 2.1935, "step": 2094 }, { "epoch": 0.016292699315317783, "grad_norm": 0.12661208678830063, "learning_rate": 9.993574049130826e-05, "loss": 2.165, "step": 2095 }, { "epoch": 0.016300476260098364, "grad_norm": 0.13604379724917007, "learning_rate": 9.993567855279404e-05, "loss": 2.158, "step": 2096 }, { "epoch": 0.016308253204878945, "grad_norm": 0.12109599669642569, "learning_rate": 9.993561658446275e-05, "loss": 2.1965, "step": 2097 }, { "epoch": 0.016316030149659527, "grad_norm": 0.11946742079666935, "learning_rate": 9.993555458631441e-05, "loss": 2.1337, "step": 2098 }, { "epoch": 0.016323807094440108, "grad_norm": 0.13197377084750544, "learning_rate": 9.993549255834905e-05, "loss": 2.1429, "step": 2099 }, { "epoch": 0.01633158403922069, "grad_norm": 0.20936333346655334, "learning_rate": 9.993543050056671e-05, "loss": 2.1468, "step": 2100 }, { "epoch": 0.01633936098400127, "grad_norm": 0.12024363295932318, "learning_rate": 9.993536841296743e-05, "loss": 2.1549, "step": 2101 }, { "epoch": 0.01634713792878185, "grad_norm": 0.13727989456886988, "learning_rate": 9.993530629555126e-05, "loss": 2.1729, "step": 2102 }, { "epoch": 0.016354914873562432, "grad_norm": 0.1519862199574232, "learning_rate": 9.993524414831823e-05, "loss": 2.1164, "step": 2103 }, { "epoch": 0.016362691818343014, "grad_norm": 0.16533222186129776, "learning_rate": 9.993518197126836e-05, "loss": 2.1587, "step": 2104 }, { "epoch": 0.016370468763123595, "grad_norm": 0.1554259569485541, "learning_rate": 9.993511976440171e-05, "loss": 2.2008, "step": 2105 }, { "epoch": 0.016378245707904176, "grad_norm": 0.1366833017343756, "learning_rate": 9.993505752771832e-05, "loss": 2.188, "step": 2106 }, { "epoch": 0.016386022652684757, "grad_norm": 0.13556861852776567, "learning_rate": 9.993499526121819e-05, "loss": 2.1963, "step": 2107 }, { "epoch": 0.016393799597465338, "grad_norm": 0.1600131849717862, "learning_rate": 9.99349329649014e-05, "loss": 2.155, "step": 2108 }, { "epoch": 0.01640157654224592, "grad_norm": 0.16778320722945758, "learning_rate": 9.993487063876796e-05, "loss": 2.1989, "step": 2109 }, { "epoch": 0.0164093534870265, "grad_norm": 0.20370890388004945, "learning_rate": 9.993480828281791e-05, "loss": 2.1742, "step": 2110 }, { "epoch": 0.01641713043180708, "grad_norm": 0.13468614738752552, "learning_rate": 9.993474589705131e-05, "loss": 2.1347, "step": 2111 }, { "epoch": 0.016424907376587663, "grad_norm": 0.11872341636191401, "learning_rate": 9.993468348146818e-05, "loss": 2.1732, "step": 2112 }, { "epoch": 0.016432684321368244, "grad_norm": 0.12379652022681963, "learning_rate": 9.993462103606857e-05, "loss": 2.1346, "step": 2113 }, { "epoch": 0.016440461266148825, "grad_norm": 0.13482419525440148, "learning_rate": 9.993455856085247e-05, "loss": 2.1272, "step": 2114 }, { "epoch": 0.016448238210929406, "grad_norm": 0.13741051673103274, "learning_rate": 9.993449605581998e-05, "loss": 2.1801, "step": 2115 }, { "epoch": 0.016456015155709987, "grad_norm": 0.13099590041595532, "learning_rate": 9.993443352097111e-05, "loss": 2.2101, "step": 2116 }, { "epoch": 0.01646379210049057, "grad_norm": 0.1235491957570434, "learning_rate": 9.993437095630588e-05, "loss": 2.1693, "step": 2117 }, { "epoch": 0.01647156904527115, "grad_norm": 0.12003175938964983, "learning_rate": 9.993430836182435e-05, "loss": 2.1085, "step": 2118 }, { "epoch": 0.01647934599005173, "grad_norm": 0.13911788522574917, "learning_rate": 9.993424573752657e-05, "loss": 2.1753, "step": 2119 }, { "epoch": 0.016487122934832312, "grad_norm": 0.12646525491624458, "learning_rate": 9.993418308341256e-05, "loss": 2.1435, "step": 2120 }, { "epoch": 0.016494899879612893, "grad_norm": 0.12586654517029486, "learning_rate": 9.993412039948233e-05, "loss": 2.2682, "step": 2121 }, { "epoch": 0.016502676824393474, "grad_norm": 0.41206830337688566, "learning_rate": 9.993405768573597e-05, "loss": 2.1388, "step": 2122 }, { "epoch": 0.016510453769174056, "grad_norm": 0.11988474598936232, "learning_rate": 9.993399494217348e-05, "loss": 2.1372, "step": 2123 }, { "epoch": 0.01651823071395464, "grad_norm": 0.13035650440325453, "learning_rate": 9.993393216879492e-05, "loss": 2.1333, "step": 2124 }, { "epoch": 0.01652600765873522, "grad_norm": 0.12669091944109448, "learning_rate": 9.993386936560031e-05, "loss": 2.1933, "step": 2125 }, { "epoch": 0.016533784603515803, "grad_norm": 0.14401948685670918, "learning_rate": 9.993380653258969e-05, "loss": 2.1194, "step": 2126 }, { "epoch": 0.016541561548296384, "grad_norm": 0.13675487537132902, "learning_rate": 9.993374366976312e-05, "loss": 2.1468, "step": 2127 }, { "epoch": 0.016549338493076965, "grad_norm": 0.14363352683050323, "learning_rate": 9.99336807771206e-05, "loss": 2.2033, "step": 2128 }, { "epoch": 0.016557115437857546, "grad_norm": 0.12068482493441889, "learning_rate": 9.993361785466219e-05, "loss": 2.1688, "step": 2129 }, { "epoch": 0.016564892382638127, "grad_norm": 0.14672506334708998, "learning_rate": 9.993355490238792e-05, "loss": 2.2415, "step": 2130 }, { "epoch": 0.01657266932741871, "grad_norm": 0.12947819330082033, "learning_rate": 9.993349192029785e-05, "loss": 2.1546, "step": 2131 }, { "epoch": 0.01658044627219929, "grad_norm": 0.12701525172074712, "learning_rate": 9.993342890839199e-05, "loss": 2.118, "step": 2132 }, { "epoch": 0.01658822321697987, "grad_norm": 0.12529967594591343, "learning_rate": 9.99333658666704e-05, "loss": 2.1477, "step": 2133 }, { "epoch": 0.016596000161760452, "grad_norm": 0.12320145262204459, "learning_rate": 9.993330279513309e-05, "loss": 2.1576, "step": 2134 }, { "epoch": 0.016603777106541033, "grad_norm": 0.1342120933177447, "learning_rate": 9.993323969378012e-05, "loss": 2.167, "step": 2135 }, { "epoch": 0.016611554051321614, "grad_norm": 0.125379742101021, "learning_rate": 9.99331765626115e-05, "loss": 2.172, "step": 2136 }, { "epoch": 0.016619330996102195, "grad_norm": 0.11591387939213542, "learning_rate": 9.993311340162732e-05, "loss": 2.1832, "step": 2137 }, { "epoch": 0.016627107940882777, "grad_norm": 0.12558351597803485, "learning_rate": 9.993305021082757e-05, "loss": 2.1668, "step": 2138 }, { "epoch": 0.016634884885663358, "grad_norm": 0.13218634594104278, "learning_rate": 9.99329869902123e-05, "loss": 2.1145, "step": 2139 }, { "epoch": 0.01664266183044394, "grad_norm": 0.13148733115106567, "learning_rate": 9.993292373978156e-05, "loss": 2.2105, "step": 2140 }, { "epoch": 0.01665043877522452, "grad_norm": 0.15612956958277796, "learning_rate": 9.993286045953539e-05, "loss": 2.147, "step": 2141 }, { "epoch": 0.0166582157200051, "grad_norm": 0.25976683418622376, "learning_rate": 9.99327971494738e-05, "loss": 2.1655, "step": 2142 }, { "epoch": 0.016665992664785682, "grad_norm": 0.1409803026912072, "learning_rate": 9.993273380959685e-05, "loss": 2.1715, "step": 2143 }, { "epoch": 0.016673769609566264, "grad_norm": 0.12156576087960223, "learning_rate": 9.993267043990457e-05, "loss": 2.1857, "step": 2144 }, { "epoch": 0.016681546554346845, "grad_norm": 0.15387198277900907, "learning_rate": 9.9932607040397e-05, "loss": 2.1719, "step": 2145 }, { "epoch": 0.016689323499127426, "grad_norm": 0.17428456980601462, "learning_rate": 9.993254361107418e-05, "loss": 2.0999, "step": 2146 }, { "epoch": 0.016697100443908007, "grad_norm": 0.15577213355618627, "learning_rate": 9.993248015193615e-05, "loss": 2.1241, "step": 2147 }, { "epoch": 0.016704877388688588, "grad_norm": 0.15394515883235876, "learning_rate": 9.993241666298293e-05, "loss": 2.1445, "step": 2148 }, { "epoch": 0.01671265433346917, "grad_norm": 0.16907370851169728, "learning_rate": 9.993235314421459e-05, "loss": 2.1393, "step": 2149 }, { "epoch": 0.01672043127824975, "grad_norm": 0.1606296211718092, "learning_rate": 9.993228959563113e-05, "loss": 2.1602, "step": 2150 }, { "epoch": 0.01672820822303033, "grad_norm": 0.13028626836337573, "learning_rate": 9.993222601723262e-05, "loss": 2.1148, "step": 2151 }, { "epoch": 0.016735985167810913, "grad_norm": 0.16442337705692567, "learning_rate": 9.993216240901909e-05, "loss": 2.1732, "step": 2152 }, { "epoch": 0.016743762112591497, "grad_norm": 0.16481135292211208, "learning_rate": 9.993209877099056e-05, "loss": 2.1728, "step": 2153 }, { "epoch": 0.01675153905737208, "grad_norm": 0.1339883721229028, "learning_rate": 9.99320351031471e-05, "loss": 2.1618, "step": 2154 }, { "epoch": 0.01675931600215266, "grad_norm": 0.12194794550226956, "learning_rate": 9.993197140548874e-05, "loss": 2.183, "step": 2155 }, { "epoch": 0.01676709294693324, "grad_norm": 0.14037932524558097, "learning_rate": 9.993190767801547e-05, "loss": 2.193, "step": 2156 }, { "epoch": 0.016774869891713822, "grad_norm": 0.12202368179869774, "learning_rate": 9.993184392072738e-05, "loss": 2.189, "step": 2157 }, { "epoch": 0.016782646836494403, "grad_norm": 0.14017234295251643, "learning_rate": 9.993178013362452e-05, "loss": 2.2114, "step": 2158 }, { "epoch": 0.016790423781274984, "grad_norm": 0.13587349293397732, "learning_rate": 9.993171631670688e-05, "loss": 2.133, "step": 2159 }, { "epoch": 0.016798200726055566, "grad_norm": 0.13924943413484345, "learning_rate": 9.993165246997451e-05, "loss": 2.149, "step": 2160 }, { "epoch": 0.016805977670836147, "grad_norm": 0.12329808425559138, "learning_rate": 9.993158859342749e-05, "loss": 2.1561, "step": 2161 }, { "epoch": 0.016813754615616728, "grad_norm": 0.13424805013157481, "learning_rate": 9.99315246870658e-05, "loss": 2.2037, "step": 2162 }, { "epoch": 0.01682153156039731, "grad_norm": 0.13082626545092507, "learning_rate": 9.993146075088951e-05, "loss": 2.1715, "step": 2163 }, { "epoch": 0.01682930850517789, "grad_norm": 0.15334536476577557, "learning_rate": 9.993139678489865e-05, "loss": 2.1203, "step": 2164 }, { "epoch": 0.01683708544995847, "grad_norm": 0.1444627502738999, "learning_rate": 9.993133278909327e-05, "loss": 2.1548, "step": 2165 }, { "epoch": 0.016844862394739053, "grad_norm": 0.14525238991597683, "learning_rate": 9.99312687634734e-05, "loss": 2.1572, "step": 2166 }, { "epoch": 0.016852639339519634, "grad_norm": 0.14119267981030403, "learning_rate": 9.993120470803907e-05, "loss": 2.1485, "step": 2167 }, { "epoch": 0.016860416284300215, "grad_norm": 0.12774902467602733, "learning_rate": 9.993114062279032e-05, "loss": 2.1415, "step": 2168 }, { "epoch": 0.016868193229080796, "grad_norm": 0.11762111125218307, "learning_rate": 9.993107650772721e-05, "loss": 2.1139, "step": 2169 }, { "epoch": 0.016875970173861377, "grad_norm": 0.14153427686146, "learning_rate": 9.993101236284975e-05, "loss": 2.1719, "step": 2170 }, { "epoch": 0.01688374711864196, "grad_norm": 0.14609121174373596, "learning_rate": 9.993094818815801e-05, "loss": 2.1551, "step": 2171 }, { "epoch": 0.01689152406342254, "grad_norm": 0.1429906311417681, "learning_rate": 9.993088398365198e-05, "loss": 2.1245, "step": 2172 }, { "epoch": 0.01689930100820312, "grad_norm": 0.1371908475538024, "learning_rate": 9.993081974933174e-05, "loss": 2.1365, "step": 2173 }, { "epoch": 0.016907077952983702, "grad_norm": 0.1183166818098984, "learning_rate": 9.993075548519732e-05, "loss": 2.1499, "step": 2174 }, { "epoch": 0.016914854897764283, "grad_norm": 0.13775294858870482, "learning_rate": 9.993069119124875e-05, "loss": 2.1303, "step": 2175 }, { "epoch": 0.016922631842544864, "grad_norm": 0.1900943227709023, "learning_rate": 9.993062686748608e-05, "loss": 2.1607, "step": 2176 }, { "epoch": 0.016930408787325445, "grad_norm": 0.20416979627209186, "learning_rate": 9.993056251390933e-05, "loss": 2.137, "step": 2177 }, { "epoch": 0.016938185732106027, "grad_norm": 0.1527015443819497, "learning_rate": 9.993049813051855e-05, "loss": 2.2628, "step": 2178 }, { "epoch": 0.016945962676886608, "grad_norm": 0.1331690823305962, "learning_rate": 9.993043371731379e-05, "loss": 2.1763, "step": 2179 }, { "epoch": 0.01695373962166719, "grad_norm": 0.17456136801563787, "learning_rate": 9.993036927429506e-05, "loss": 2.1683, "step": 2180 }, { "epoch": 0.01696151656644777, "grad_norm": 0.23392173127345772, "learning_rate": 9.993030480146243e-05, "loss": 2.143, "step": 2181 }, { "epoch": 0.01696929351122835, "grad_norm": 0.24797389490818758, "learning_rate": 9.993024029881591e-05, "loss": 2.1636, "step": 2182 }, { "epoch": 0.016977070456008936, "grad_norm": 0.18277378879887812, "learning_rate": 9.993017576635554e-05, "loss": 2.1415, "step": 2183 }, { "epoch": 0.016984847400789517, "grad_norm": 0.13502485683038581, "learning_rate": 9.99301112040814e-05, "loss": 2.1733, "step": 2184 }, { "epoch": 0.016992624345570098, "grad_norm": 0.16820370656743489, "learning_rate": 9.99300466119935e-05, "loss": 2.1574, "step": 2185 }, { "epoch": 0.01700040129035068, "grad_norm": 0.14571640244600897, "learning_rate": 9.992998199009185e-05, "loss": 2.0833, "step": 2186 }, { "epoch": 0.01700817823513126, "grad_norm": 0.14309425377028276, "learning_rate": 9.992991733837654e-05, "loss": 2.1141, "step": 2187 }, { "epoch": 0.01701595517991184, "grad_norm": 0.23096037633935773, "learning_rate": 9.992985265684755e-05, "loss": 2.1723, "step": 2188 }, { "epoch": 0.017023732124692423, "grad_norm": 0.15584836377923547, "learning_rate": 9.992978794550498e-05, "loss": 2.1986, "step": 2189 }, { "epoch": 0.017031509069473004, "grad_norm": 0.15347671586863415, "learning_rate": 9.992972320434885e-05, "loss": 2.1481, "step": 2190 }, { "epoch": 0.017039286014253585, "grad_norm": 0.24510883216905788, "learning_rate": 9.992965843337917e-05, "loss": 2.2185, "step": 2191 }, { "epoch": 0.017047062959034166, "grad_norm": 0.21968594357630775, "learning_rate": 9.992959363259601e-05, "loss": 2.1657, "step": 2192 }, { "epoch": 0.017054839903814747, "grad_norm": 0.13410323076161054, "learning_rate": 9.992952880199939e-05, "loss": 2.1462, "step": 2193 }, { "epoch": 0.01706261684859533, "grad_norm": 0.2780657838858817, "learning_rate": 9.992946394158937e-05, "loss": 2.1713, "step": 2194 }, { "epoch": 0.01707039379337591, "grad_norm": 0.1537324055254683, "learning_rate": 9.992939905136595e-05, "loss": 2.1215, "step": 2195 }, { "epoch": 0.01707817073815649, "grad_norm": 0.1451857202350152, "learning_rate": 9.992933413132922e-05, "loss": 2.1698, "step": 2196 }, { "epoch": 0.017085947682937072, "grad_norm": 0.14165360795956378, "learning_rate": 9.992926918147917e-05, "loss": 2.1681, "step": 2197 }, { "epoch": 0.017093724627717653, "grad_norm": 0.12494518025909161, "learning_rate": 9.992920420181588e-05, "loss": 2.143, "step": 2198 }, { "epoch": 0.017101501572498234, "grad_norm": 0.37931594776030036, "learning_rate": 9.992913919233937e-05, "loss": 2.1744, "step": 2199 }, { "epoch": 0.017109278517278816, "grad_norm": 0.14133833813274024, "learning_rate": 9.992907415304968e-05, "loss": 2.1713, "step": 2200 }, { "epoch": 0.017117055462059397, "grad_norm": 0.1525536564194705, "learning_rate": 9.992900908394685e-05, "loss": 2.1589, "step": 2201 }, { "epoch": 0.017124832406839978, "grad_norm": 0.15750490953146656, "learning_rate": 9.99289439850309e-05, "loss": 2.1857, "step": 2202 }, { "epoch": 0.01713260935162056, "grad_norm": 0.1644016631779169, "learning_rate": 9.992887885630188e-05, "loss": 2.1243, "step": 2203 }, { "epoch": 0.01714038629640114, "grad_norm": 0.14176570015838486, "learning_rate": 9.992881369775986e-05, "loss": 2.1472, "step": 2204 }, { "epoch": 0.01714816324118172, "grad_norm": 0.15068027555243427, "learning_rate": 9.992874850940483e-05, "loss": 2.1563, "step": 2205 }, { "epoch": 0.017155940185962303, "grad_norm": 0.25588051937542206, "learning_rate": 9.992868329123687e-05, "loss": 2.1969, "step": 2206 }, { "epoch": 0.017163717130742884, "grad_norm": 0.2245727905435998, "learning_rate": 9.9928618043256e-05, "loss": 2.1714, "step": 2207 }, { "epoch": 0.017171494075523465, "grad_norm": 0.1580867628475782, "learning_rate": 9.992855276546225e-05, "loss": 2.1435, "step": 2208 }, { "epoch": 0.017179271020304046, "grad_norm": 0.1708674378103087, "learning_rate": 9.992848745785568e-05, "loss": 2.1673, "step": 2209 }, { "epoch": 0.017187047965084627, "grad_norm": 0.21097697557419062, "learning_rate": 9.992842212043631e-05, "loss": 2.135, "step": 2210 }, { "epoch": 0.01719482490986521, "grad_norm": 0.15429674188212236, "learning_rate": 9.99283567532042e-05, "loss": 2.189, "step": 2211 }, { "epoch": 0.01720260185464579, "grad_norm": 0.14331946724703554, "learning_rate": 9.992829135615936e-05, "loss": 2.1703, "step": 2212 }, { "epoch": 0.017210378799426374, "grad_norm": 0.21352351570072647, "learning_rate": 9.992822592930185e-05, "loss": 2.1138, "step": 2213 }, { "epoch": 0.017218155744206955, "grad_norm": 0.1896965452194138, "learning_rate": 9.992816047263172e-05, "loss": 2.2031, "step": 2214 }, { "epoch": 0.017225932688987537, "grad_norm": 0.1217163686318849, "learning_rate": 9.992809498614897e-05, "loss": 2.1677, "step": 2215 }, { "epoch": 0.017233709633768118, "grad_norm": 0.15098895913370242, "learning_rate": 9.992802946985366e-05, "loss": 2.1562, "step": 2216 }, { "epoch": 0.0172414865785487, "grad_norm": 0.14657390228813025, "learning_rate": 9.992796392374586e-05, "loss": 2.1723, "step": 2217 }, { "epoch": 0.01724926352332928, "grad_norm": 0.13920106727059572, "learning_rate": 9.992789834782557e-05, "loss": 2.1459, "step": 2218 }, { "epoch": 0.01725704046810986, "grad_norm": 0.1222106898103588, "learning_rate": 9.992783274209283e-05, "loss": 2.1913, "step": 2219 }, { "epoch": 0.017264817412890442, "grad_norm": 0.1374869931290009, "learning_rate": 9.992776710654768e-05, "loss": 2.1305, "step": 2220 }, { "epoch": 0.017272594357671023, "grad_norm": 0.1358538917193435, "learning_rate": 9.992770144119019e-05, "loss": 2.1813, "step": 2221 }, { "epoch": 0.017280371302451605, "grad_norm": 0.12967182888138623, "learning_rate": 9.992763574602037e-05, "loss": 2.107, "step": 2222 }, { "epoch": 0.017288148247232186, "grad_norm": 0.12162798479042522, "learning_rate": 9.992757002103827e-05, "loss": 2.1675, "step": 2223 }, { "epoch": 0.017295925192012767, "grad_norm": 0.12837889634179078, "learning_rate": 9.992750426624392e-05, "loss": 2.2008, "step": 2224 }, { "epoch": 0.017303702136793348, "grad_norm": 0.11656080713751162, "learning_rate": 9.992743848163737e-05, "loss": 2.1541, "step": 2225 }, { "epoch": 0.01731147908157393, "grad_norm": 0.11756206492045705, "learning_rate": 9.992737266721865e-05, "loss": 2.1856, "step": 2226 }, { "epoch": 0.01731925602635451, "grad_norm": 0.1193057616362778, "learning_rate": 9.992730682298781e-05, "loss": 2.1418, "step": 2227 }, { "epoch": 0.01732703297113509, "grad_norm": 0.1255539361214958, "learning_rate": 9.992724094894487e-05, "loss": 2.1233, "step": 2228 }, { "epoch": 0.017334809915915673, "grad_norm": 0.17361790516291578, "learning_rate": 9.992717504508991e-05, "loss": 2.183, "step": 2229 }, { "epoch": 0.017342586860696254, "grad_norm": 0.13003733034198534, "learning_rate": 9.992710911142291e-05, "loss": 2.1149, "step": 2230 }, { "epoch": 0.017350363805476835, "grad_norm": 0.16493818496433665, "learning_rate": 9.992704314794397e-05, "loss": 2.1937, "step": 2231 }, { "epoch": 0.017358140750257416, "grad_norm": 0.16653397256554248, "learning_rate": 9.992697715465308e-05, "loss": 2.1724, "step": 2232 }, { "epoch": 0.017365917695037997, "grad_norm": 0.13885863811536023, "learning_rate": 9.99269111315503e-05, "loss": 2.1229, "step": 2233 }, { "epoch": 0.01737369463981858, "grad_norm": 0.14333414904686845, "learning_rate": 9.992684507863568e-05, "loss": 2.107, "step": 2234 }, { "epoch": 0.01738147158459916, "grad_norm": 0.1742627842863702, "learning_rate": 9.992677899590925e-05, "loss": 2.1487, "step": 2235 }, { "epoch": 0.01738924852937974, "grad_norm": 0.13580558425408523, "learning_rate": 9.992671288337103e-05, "loss": 2.1391, "step": 2236 }, { "epoch": 0.017397025474160322, "grad_norm": 0.1581088332392453, "learning_rate": 9.992664674102111e-05, "loss": 2.1866, "step": 2237 }, { "epoch": 0.017404802418940903, "grad_norm": 0.1437310513261751, "learning_rate": 9.992658056885946e-05, "loss": 2.1714, "step": 2238 }, { "epoch": 0.017412579363721484, "grad_norm": 0.13505356922960643, "learning_rate": 9.992651436688618e-05, "loss": 2.1516, "step": 2239 }, { "epoch": 0.017420356308502066, "grad_norm": 0.14168852956967162, "learning_rate": 9.992644813510128e-05, "loss": 2.1941, "step": 2240 }, { "epoch": 0.017428133253282647, "grad_norm": 0.1443275646658841, "learning_rate": 9.99263818735048e-05, "loss": 2.1565, "step": 2241 }, { "epoch": 0.01743591019806323, "grad_norm": 0.1221965893109378, "learning_rate": 9.99263155820968e-05, "loss": 2.1315, "step": 2242 }, { "epoch": 0.017443687142843813, "grad_norm": 0.1332190807533203, "learning_rate": 9.99262492608773e-05, "loss": 2.1808, "step": 2243 }, { "epoch": 0.017451464087624394, "grad_norm": 0.14799247153461323, "learning_rate": 9.992618290984635e-05, "loss": 2.1356, "step": 2244 }, { "epoch": 0.017459241032404975, "grad_norm": 0.12108784078156395, "learning_rate": 9.992611652900397e-05, "loss": 2.1276, "step": 2245 }, { "epoch": 0.017467017977185556, "grad_norm": 0.14059255492110362, "learning_rate": 9.992605011835022e-05, "loss": 2.1557, "step": 2246 }, { "epoch": 0.017474794921966137, "grad_norm": 0.14664951982400193, "learning_rate": 9.992598367788514e-05, "loss": 2.1249, "step": 2247 }, { "epoch": 0.01748257186674672, "grad_norm": 0.12235630539765334, "learning_rate": 9.992591720760875e-05, "loss": 2.1613, "step": 2248 }, { "epoch": 0.0174903488115273, "grad_norm": 0.12996883424939046, "learning_rate": 9.992585070752111e-05, "loss": 2.199, "step": 2249 }, { "epoch": 0.01749812575630788, "grad_norm": 0.148209574014302, "learning_rate": 9.992578417762227e-05, "loss": 2.1978, "step": 2250 }, { "epoch": 0.017505902701088462, "grad_norm": 0.12227956038168561, "learning_rate": 9.992571761791223e-05, "loss": 2.126, "step": 2251 }, { "epoch": 0.017513679645869043, "grad_norm": 0.1306612055226158, "learning_rate": 9.992565102839106e-05, "loss": 2.1451, "step": 2252 }, { "epoch": 0.017521456590649624, "grad_norm": 0.12378996972441775, "learning_rate": 9.992558440905879e-05, "loss": 2.0949, "step": 2253 }, { "epoch": 0.017529233535430205, "grad_norm": 0.12001784650778871, "learning_rate": 9.992551775991546e-05, "loss": 2.1663, "step": 2254 }, { "epoch": 0.017537010480210787, "grad_norm": 0.1173623228780891, "learning_rate": 9.99254510809611e-05, "loss": 2.1096, "step": 2255 }, { "epoch": 0.017544787424991368, "grad_norm": 0.11983141270888159, "learning_rate": 9.992538437219579e-05, "loss": 2.1738, "step": 2256 }, { "epoch": 0.01755256436977195, "grad_norm": 0.11919963512344839, "learning_rate": 9.992531763361952e-05, "loss": 2.1421, "step": 2257 }, { "epoch": 0.01756034131455253, "grad_norm": 0.143542257267669, "learning_rate": 9.992525086523234e-05, "loss": 2.1344, "step": 2258 }, { "epoch": 0.01756811825933311, "grad_norm": 0.15776504310782802, "learning_rate": 9.992518406703432e-05, "loss": 2.1383, "step": 2259 }, { "epoch": 0.017575895204113692, "grad_norm": 0.1378778172964977, "learning_rate": 9.992511723902548e-05, "loss": 2.1253, "step": 2260 }, { "epoch": 0.017583672148894273, "grad_norm": 0.3026652254413277, "learning_rate": 9.992505038120587e-05, "loss": 2.1295, "step": 2261 }, { "epoch": 0.017591449093674855, "grad_norm": 0.12276948747858786, "learning_rate": 9.992498349357551e-05, "loss": 2.1349, "step": 2262 }, { "epoch": 0.017599226038455436, "grad_norm": 0.15668018862314276, "learning_rate": 9.992491657613444e-05, "loss": 2.1399, "step": 2263 }, { "epoch": 0.017607002983236017, "grad_norm": 0.1981467253603372, "learning_rate": 9.992484962888273e-05, "loss": 2.1585, "step": 2264 }, { "epoch": 0.017614779928016598, "grad_norm": 0.19688939784523046, "learning_rate": 9.992478265182039e-05, "loss": 2.1632, "step": 2265 }, { "epoch": 0.01762255687279718, "grad_norm": 0.1525716387226013, "learning_rate": 9.992471564494746e-05, "loss": 2.1607, "step": 2266 }, { "epoch": 0.01763033381757776, "grad_norm": 0.12449563448962708, "learning_rate": 9.992464860826401e-05, "loss": 2.1541, "step": 2267 }, { "epoch": 0.01763811076235834, "grad_norm": 0.1799038394785366, "learning_rate": 9.992458154177005e-05, "loss": 2.1173, "step": 2268 }, { "epoch": 0.017645887707138923, "grad_norm": 0.2144480423923714, "learning_rate": 9.992451444546562e-05, "loss": 2.1942, "step": 2269 }, { "epoch": 0.017653664651919504, "grad_norm": 0.17744810463334984, "learning_rate": 9.992444731935079e-05, "loss": 2.1566, "step": 2270 }, { "epoch": 0.017661441596700085, "grad_norm": 0.12472304296418296, "learning_rate": 9.992438016342557e-05, "loss": 2.1422, "step": 2271 }, { "epoch": 0.01766921854148067, "grad_norm": 0.15077035869380598, "learning_rate": 9.992431297769003e-05, "loss": 2.1708, "step": 2272 }, { "epoch": 0.01767699548626125, "grad_norm": 0.16332284358230234, "learning_rate": 9.992424576214417e-05, "loss": 2.2032, "step": 2273 }, { "epoch": 0.017684772431041832, "grad_norm": 0.27529839742135037, "learning_rate": 9.992417851678805e-05, "loss": 2.2073, "step": 2274 }, { "epoch": 0.017692549375822413, "grad_norm": 0.15206377409143945, "learning_rate": 9.992411124162171e-05, "loss": 2.1746, "step": 2275 }, { "epoch": 0.017700326320602994, "grad_norm": 0.19263098091793449, "learning_rate": 9.99240439366452e-05, "loss": 2.1634, "step": 2276 }, { "epoch": 0.017708103265383576, "grad_norm": 0.1924238767658572, "learning_rate": 9.992397660185854e-05, "loss": 2.1696, "step": 2277 }, { "epoch": 0.017715880210164157, "grad_norm": 0.1304201187657292, "learning_rate": 9.99239092372618e-05, "loss": 2.129, "step": 2278 }, { "epoch": 0.017723657154944738, "grad_norm": 0.12929699591684438, "learning_rate": 9.992384184285499e-05, "loss": 2.165, "step": 2279 }, { "epoch": 0.01773143409972532, "grad_norm": 0.16434509087129617, "learning_rate": 9.992377441863818e-05, "loss": 2.1222, "step": 2280 }, { "epoch": 0.0177392110445059, "grad_norm": 0.1525891395349937, "learning_rate": 9.992370696461138e-05, "loss": 2.163, "step": 2281 }, { "epoch": 0.01774698798928648, "grad_norm": 0.13201606788500242, "learning_rate": 9.992363948077463e-05, "loss": 2.1075, "step": 2282 }, { "epoch": 0.017754764934067063, "grad_norm": 0.1436155791834473, "learning_rate": 9.992357196712801e-05, "loss": 2.2036, "step": 2283 }, { "epoch": 0.017762541878847644, "grad_norm": 0.1764319752196338, "learning_rate": 9.99235044236715e-05, "loss": 2.1085, "step": 2284 }, { "epoch": 0.017770318823628225, "grad_norm": 0.13957397968114568, "learning_rate": 9.99234368504052e-05, "loss": 2.1311, "step": 2285 }, { "epoch": 0.017778095768408806, "grad_norm": 0.1233916059767179, "learning_rate": 9.992336924732911e-05, "loss": 2.1898, "step": 2286 }, { "epoch": 0.017785872713189387, "grad_norm": 0.14796214908892708, "learning_rate": 9.992330161444329e-05, "loss": 2.1375, "step": 2287 }, { "epoch": 0.01779364965796997, "grad_norm": 0.1483834358323459, "learning_rate": 9.992323395174777e-05, "loss": 2.151, "step": 2288 }, { "epoch": 0.01780142660275055, "grad_norm": 0.11954583650280004, "learning_rate": 9.99231662592426e-05, "loss": 2.1636, "step": 2289 }, { "epoch": 0.01780920354753113, "grad_norm": 0.27122619844109336, "learning_rate": 9.992309853692782e-05, "loss": 2.1325, "step": 2290 }, { "epoch": 0.017816980492311712, "grad_norm": 0.15963053136684466, "learning_rate": 9.992303078480346e-05, "loss": 2.1524, "step": 2291 }, { "epoch": 0.017824757437092293, "grad_norm": 0.14883352686662774, "learning_rate": 9.992296300286957e-05, "loss": 2.1438, "step": 2292 }, { "epoch": 0.017832534381872874, "grad_norm": 0.1363034780954492, "learning_rate": 9.992289519112619e-05, "loss": 2.1801, "step": 2293 }, { "epoch": 0.017840311326653455, "grad_norm": 0.16837758864114122, "learning_rate": 9.992282734957334e-05, "loss": 2.1561, "step": 2294 }, { "epoch": 0.017848088271434037, "grad_norm": 0.1655708972728389, "learning_rate": 9.99227594782111e-05, "loss": 2.1339, "step": 2295 }, { "epoch": 0.017855865216214618, "grad_norm": 0.12469327626731747, "learning_rate": 9.992269157703946e-05, "loss": 2.1245, "step": 2296 }, { "epoch": 0.0178636421609952, "grad_norm": 0.3542846133258874, "learning_rate": 9.992262364605851e-05, "loss": 2.1574, "step": 2297 }, { "epoch": 0.01787141910577578, "grad_norm": 0.2353628362859222, "learning_rate": 9.992255568526828e-05, "loss": 2.1792, "step": 2298 }, { "epoch": 0.01787919605055636, "grad_norm": 0.15676836068859754, "learning_rate": 9.992248769466877e-05, "loss": 2.176, "step": 2299 }, { "epoch": 0.017886972995336942, "grad_norm": 0.1376908009564574, "learning_rate": 9.992241967426008e-05, "loss": 2.1286, "step": 2300 }, { "epoch": 0.017894749940117523, "grad_norm": 0.1586406035261587, "learning_rate": 9.99223516240422e-05, "loss": 2.1509, "step": 2301 }, { "epoch": 0.017902526884898108, "grad_norm": 0.14837842215912486, "learning_rate": 9.992228354401521e-05, "loss": 2.1468, "step": 2302 }, { "epoch": 0.01791030382967869, "grad_norm": 0.13129905299033837, "learning_rate": 9.992221543417912e-05, "loss": 2.1286, "step": 2303 }, { "epoch": 0.01791808077445927, "grad_norm": 0.1393472160053323, "learning_rate": 9.992214729453399e-05, "loss": 2.1514, "step": 2304 }, { "epoch": 0.01792585771923985, "grad_norm": 0.12572279220947294, "learning_rate": 9.992207912507984e-05, "loss": 2.1652, "step": 2305 }, { "epoch": 0.017933634664020433, "grad_norm": 0.1834429947930622, "learning_rate": 9.992201092581674e-05, "loss": 2.1248, "step": 2306 }, { "epoch": 0.017941411608801014, "grad_norm": 0.14401362890589284, "learning_rate": 9.99219426967447e-05, "loss": 2.1538, "step": 2307 }, { "epoch": 0.017949188553581595, "grad_norm": 0.14006914567767706, "learning_rate": 9.99218744378638e-05, "loss": 2.1432, "step": 2308 }, { "epoch": 0.017956965498362176, "grad_norm": 0.14087718691513104, "learning_rate": 9.992180614917403e-05, "loss": 2.1669, "step": 2309 }, { "epoch": 0.017964742443142757, "grad_norm": 0.13682670206306946, "learning_rate": 9.992173783067548e-05, "loss": 2.1407, "step": 2310 }, { "epoch": 0.01797251938792334, "grad_norm": 0.14897778779112905, "learning_rate": 9.992166948236816e-05, "loss": 2.1789, "step": 2311 }, { "epoch": 0.01798029633270392, "grad_norm": 0.15952794003480456, "learning_rate": 9.992160110425213e-05, "loss": 2.1888, "step": 2312 }, { "epoch": 0.0179880732774845, "grad_norm": 0.1391228192201599, "learning_rate": 9.99215326963274e-05, "loss": 2.1805, "step": 2313 }, { "epoch": 0.017995850222265082, "grad_norm": 0.12930519544403238, "learning_rate": 9.992146425859405e-05, "loss": 2.1817, "step": 2314 }, { "epoch": 0.018003627167045663, "grad_norm": 0.14692983835700169, "learning_rate": 9.992139579105208e-05, "loss": 2.1916, "step": 2315 }, { "epoch": 0.018011404111826244, "grad_norm": 0.2890515450001521, "learning_rate": 9.992132729370158e-05, "loss": 2.164, "step": 2316 }, { "epoch": 0.018019181056606826, "grad_norm": 0.2083407913484195, "learning_rate": 9.992125876654253e-05, "loss": 2.1482, "step": 2317 }, { "epoch": 0.018026958001387407, "grad_norm": 0.1784766897523493, "learning_rate": 9.992119020957503e-05, "loss": 2.1749, "step": 2318 }, { "epoch": 0.018034734946167988, "grad_norm": 0.15721646090442912, "learning_rate": 9.992112162279909e-05, "loss": 2.1511, "step": 2319 }, { "epoch": 0.01804251189094857, "grad_norm": 0.18731521389872302, "learning_rate": 9.992105300621475e-05, "loss": 2.1631, "step": 2320 }, { "epoch": 0.01805028883572915, "grad_norm": 0.18856126009957375, "learning_rate": 9.992098435982207e-05, "loss": 2.1637, "step": 2321 }, { "epoch": 0.01805806578050973, "grad_norm": 0.141938719191385, "learning_rate": 9.992091568362107e-05, "loss": 2.1316, "step": 2322 }, { "epoch": 0.018065842725290313, "grad_norm": 0.12327539374832426, "learning_rate": 9.992084697761179e-05, "loss": 2.174, "step": 2323 }, { "epoch": 0.018073619670070894, "grad_norm": 0.14096029099844737, "learning_rate": 9.99207782417943e-05, "loss": 2.1553, "step": 2324 }, { "epoch": 0.018081396614851475, "grad_norm": 0.14050946503699174, "learning_rate": 9.99207094761686e-05, "loss": 2.128, "step": 2325 }, { "epoch": 0.018089173559632056, "grad_norm": 0.1232139855329435, "learning_rate": 9.992064068073478e-05, "loss": 2.1229, "step": 2326 }, { "epoch": 0.018096950504412637, "grad_norm": 0.13048789958585705, "learning_rate": 9.992057185549284e-05, "loss": 2.149, "step": 2327 }, { "epoch": 0.01810472744919322, "grad_norm": 0.14484517661903784, "learning_rate": 9.992050300044282e-05, "loss": 2.1201, "step": 2328 }, { "epoch": 0.0181125043939738, "grad_norm": 0.1390161716131947, "learning_rate": 9.992043411558481e-05, "loss": 2.1502, "step": 2329 }, { "epoch": 0.01812028133875438, "grad_norm": 0.14071358497189196, "learning_rate": 9.992036520091879e-05, "loss": 2.228, "step": 2330 }, { "epoch": 0.018128058283534965, "grad_norm": 0.11930891019427886, "learning_rate": 9.992029625644484e-05, "loss": 2.1174, "step": 2331 }, { "epoch": 0.018135835228315546, "grad_norm": 0.1337309775876694, "learning_rate": 9.992022728216297e-05, "loss": 2.1855, "step": 2332 }, { "epoch": 0.018143612173096128, "grad_norm": 0.14425615105698586, "learning_rate": 9.992015827807326e-05, "loss": 2.116, "step": 2333 }, { "epoch": 0.01815138911787671, "grad_norm": 0.13021846693241243, "learning_rate": 9.992008924417573e-05, "loss": 2.1762, "step": 2334 }, { "epoch": 0.01815916606265729, "grad_norm": 0.1215056673028662, "learning_rate": 9.99200201804704e-05, "loss": 2.1401, "step": 2335 }, { "epoch": 0.01816694300743787, "grad_norm": 0.14175165330134398, "learning_rate": 9.991995108695736e-05, "loss": 2.15, "step": 2336 }, { "epoch": 0.018174719952218452, "grad_norm": 0.13320442758128567, "learning_rate": 9.991988196363662e-05, "loss": 2.1755, "step": 2337 }, { "epoch": 0.018182496896999033, "grad_norm": 0.11864911793196935, "learning_rate": 9.991981281050823e-05, "loss": 2.1803, "step": 2338 }, { "epoch": 0.018190273841779615, "grad_norm": 0.1414442826689431, "learning_rate": 9.991974362757222e-05, "loss": 2.1354, "step": 2339 }, { "epoch": 0.018198050786560196, "grad_norm": 0.14688380971786189, "learning_rate": 9.991967441482863e-05, "loss": 2.1325, "step": 2340 }, { "epoch": 0.018205827731340777, "grad_norm": 0.13629978864761244, "learning_rate": 9.991960517227753e-05, "loss": 2.1384, "step": 2341 }, { "epoch": 0.018213604676121358, "grad_norm": 0.12584531346559055, "learning_rate": 9.991953589991894e-05, "loss": 2.1849, "step": 2342 }, { "epoch": 0.01822138162090194, "grad_norm": 0.15868634439610435, "learning_rate": 9.99194665977529e-05, "loss": 2.1771, "step": 2343 }, { "epoch": 0.01822915856568252, "grad_norm": 0.18927431933753894, "learning_rate": 9.991939726577945e-05, "loss": 2.1806, "step": 2344 }, { "epoch": 0.0182369355104631, "grad_norm": 0.1843343827780852, "learning_rate": 9.991932790399862e-05, "loss": 2.1483, "step": 2345 }, { "epoch": 0.018244712455243683, "grad_norm": 0.15783223347333977, "learning_rate": 9.991925851241051e-05, "loss": 2.1597, "step": 2346 }, { "epoch": 0.018252489400024264, "grad_norm": 0.12335346388534645, "learning_rate": 9.991918909101508e-05, "loss": 2.1384, "step": 2347 }, { "epoch": 0.018260266344804845, "grad_norm": 0.23953879666106165, "learning_rate": 9.991911963981242e-05, "loss": 2.1138, "step": 2348 }, { "epoch": 0.018268043289585426, "grad_norm": 0.1429944443153525, "learning_rate": 9.991905015880258e-05, "loss": 2.2013, "step": 2349 }, { "epoch": 0.018275820234366007, "grad_norm": 0.13297539382141207, "learning_rate": 9.991898064798557e-05, "loss": 2.1538, "step": 2350 }, { "epoch": 0.01828359717914659, "grad_norm": 0.13857452061204228, "learning_rate": 9.991891110736145e-05, "loss": 2.1452, "step": 2351 }, { "epoch": 0.01829137412392717, "grad_norm": 0.1351486409968896, "learning_rate": 9.991884153693026e-05, "loss": 2.1709, "step": 2352 }, { "epoch": 0.01829915106870775, "grad_norm": 0.13500297319104104, "learning_rate": 9.991877193669202e-05, "loss": 2.1432, "step": 2353 }, { "epoch": 0.018306928013488332, "grad_norm": 0.12195689548076746, "learning_rate": 9.99187023066468e-05, "loss": 2.1607, "step": 2354 }, { "epoch": 0.018314704958268913, "grad_norm": 0.18471320693580856, "learning_rate": 9.991863264679461e-05, "loss": 2.1372, "step": 2355 }, { "epoch": 0.018322481903049494, "grad_norm": 0.1337549830095234, "learning_rate": 9.991856295713555e-05, "loss": 2.143, "step": 2356 }, { "epoch": 0.018330258847830076, "grad_norm": 0.13769023758063414, "learning_rate": 9.99184932376696e-05, "loss": 2.1223, "step": 2357 }, { "epoch": 0.018338035792610657, "grad_norm": 0.12509817103998427, "learning_rate": 9.991842348839684e-05, "loss": 2.0863, "step": 2358 }, { "epoch": 0.018345812737391238, "grad_norm": 0.12511870900743027, "learning_rate": 9.991835370931728e-05, "loss": 2.112, "step": 2359 }, { "epoch": 0.01835358968217182, "grad_norm": 0.1247426129492696, "learning_rate": 9.9918283900431e-05, "loss": 2.1272, "step": 2360 }, { "epoch": 0.018361366626952404, "grad_norm": 0.16479389206989264, "learning_rate": 9.991821406173801e-05, "loss": 2.1836, "step": 2361 }, { "epoch": 0.018369143571732985, "grad_norm": 0.1625641529156566, "learning_rate": 9.991814419323836e-05, "loss": 2.1953, "step": 2362 }, { "epoch": 0.018376920516513566, "grad_norm": 0.1667544307223083, "learning_rate": 9.991807429493209e-05, "loss": 2.1701, "step": 2363 }, { "epoch": 0.018384697461294147, "grad_norm": 0.13801494211685356, "learning_rate": 9.991800436681926e-05, "loss": 2.1189, "step": 2364 }, { "epoch": 0.01839247440607473, "grad_norm": 0.12000391389193434, "learning_rate": 9.991793440889989e-05, "loss": 2.1956, "step": 2365 }, { "epoch": 0.01840025135085531, "grad_norm": 0.1333517738578382, "learning_rate": 9.991786442117403e-05, "loss": 2.1438, "step": 2366 }, { "epoch": 0.01840802829563589, "grad_norm": 0.14824499194611887, "learning_rate": 9.991779440364173e-05, "loss": 2.1462, "step": 2367 }, { "epoch": 0.018415805240416472, "grad_norm": 0.19997103981547823, "learning_rate": 9.9917724356303e-05, "loss": 2.1624, "step": 2368 }, { "epoch": 0.018423582185197053, "grad_norm": 0.12245056623125436, "learning_rate": 9.991765427915794e-05, "loss": 2.1247, "step": 2369 }, { "epoch": 0.018431359129977634, "grad_norm": 0.152312058517423, "learning_rate": 9.991758417220652e-05, "loss": 2.1765, "step": 2370 }, { "epoch": 0.018439136074758215, "grad_norm": 0.18124975223156176, "learning_rate": 9.991751403544885e-05, "loss": 2.1709, "step": 2371 }, { "epoch": 0.018446913019538796, "grad_norm": 0.15529422217254227, "learning_rate": 9.991744386888492e-05, "loss": 2.2041, "step": 2372 }, { "epoch": 0.018454689964319378, "grad_norm": 0.12941831624506261, "learning_rate": 9.991737367251479e-05, "loss": 2.1799, "step": 2373 }, { "epoch": 0.01846246690909996, "grad_norm": 0.12457876244190165, "learning_rate": 9.991730344633852e-05, "loss": 2.1676, "step": 2374 }, { "epoch": 0.01847024385388054, "grad_norm": 0.13124637928099733, "learning_rate": 9.991723319035613e-05, "loss": 2.1933, "step": 2375 }, { "epoch": 0.01847802079866112, "grad_norm": 0.11989016505926327, "learning_rate": 9.991716290456767e-05, "loss": 2.1314, "step": 2376 }, { "epoch": 0.018485797743441702, "grad_norm": 0.12729103844741316, "learning_rate": 9.991709258897318e-05, "loss": 2.1498, "step": 2377 }, { "epoch": 0.018493574688222283, "grad_norm": 0.148572392896585, "learning_rate": 9.991702224357269e-05, "loss": 2.1271, "step": 2378 }, { "epoch": 0.018501351633002865, "grad_norm": 0.14134394036519915, "learning_rate": 9.991695186836627e-05, "loss": 2.0948, "step": 2379 }, { "epoch": 0.018509128577783446, "grad_norm": 0.1294326449031851, "learning_rate": 9.991688146335394e-05, "loss": 2.1311, "step": 2380 }, { "epoch": 0.018516905522564027, "grad_norm": 0.13375893954036974, "learning_rate": 9.991681102853574e-05, "loss": 2.1326, "step": 2381 }, { "epoch": 0.018524682467344608, "grad_norm": 0.1270603761713249, "learning_rate": 9.991674056391173e-05, "loss": 2.1231, "step": 2382 }, { "epoch": 0.01853245941212519, "grad_norm": 0.11436489770270446, "learning_rate": 9.991667006948195e-05, "loss": 2.1129, "step": 2383 }, { "epoch": 0.01854023635690577, "grad_norm": 0.11998563787151094, "learning_rate": 9.991659954524643e-05, "loss": 2.1431, "step": 2384 }, { "epoch": 0.01854801330168635, "grad_norm": 0.12358056229334512, "learning_rate": 9.99165289912052e-05, "loss": 2.1863, "step": 2385 }, { "epoch": 0.018555790246466933, "grad_norm": 0.12861007336342614, "learning_rate": 9.991645840735833e-05, "loss": 2.1684, "step": 2386 }, { "epoch": 0.018563567191247514, "grad_norm": 0.1601578087505526, "learning_rate": 9.991638779370586e-05, "loss": 2.1318, "step": 2387 }, { "epoch": 0.018571344136028095, "grad_norm": 0.23301073376045925, "learning_rate": 9.991631715024782e-05, "loss": 2.1614, "step": 2388 }, { "epoch": 0.018579121080808676, "grad_norm": 0.23754948507158602, "learning_rate": 9.991624647698426e-05, "loss": 2.1648, "step": 2389 }, { "epoch": 0.018586898025589257, "grad_norm": 0.17470210700502312, "learning_rate": 9.991617577391518e-05, "loss": 2.1436, "step": 2390 }, { "epoch": 0.018594674970369842, "grad_norm": 0.14152625202722285, "learning_rate": 9.99161050410407e-05, "loss": 2.1135, "step": 2391 }, { "epoch": 0.018602451915150423, "grad_norm": 0.20461559379583327, "learning_rate": 9.991603427836081e-05, "loss": 2.162, "step": 2392 }, { "epoch": 0.018610228859931004, "grad_norm": 0.2307130590855755, "learning_rate": 9.991596348587558e-05, "loss": 2.1225, "step": 2393 }, { "epoch": 0.018618005804711586, "grad_norm": 0.5725271252314338, "learning_rate": 9.991589266358501e-05, "loss": 2.159, "step": 2394 }, { "epoch": 0.018625782749492167, "grad_norm": 0.1435147131136257, "learning_rate": 9.991582181148918e-05, "loss": 2.1358, "step": 2395 }, { "epoch": 0.018633559694272748, "grad_norm": 0.13221161745351534, "learning_rate": 9.991575092958812e-05, "loss": 2.1306, "step": 2396 }, { "epoch": 0.01864133663905333, "grad_norm": 0.19799687410995442, "learning_rate": 9.991568001788187e-05, "loss": 2.1934, "step": 2397 }, { "epoch": 0.01864911358383391, "grad_norm": 0.2098459463997017, "learning_rate": 9.991560907637047e-05, "loss": 2.1398, "step": 2398 }, { "epoch": 0.01865689052861449, "grad_norm": 0.16809713156259576, "learning_rate": 9.991553810505399e-05, "loss": 2.1161, "step": 2399 }, { "epoch": 0.018664667473395073, "grad_norm": 0.14319102706112002, "learning_rate": 9.991546710393242e-05, "loss": 2.1361, "step": 2400 }, { "epoch": 0.018672444418175654, "grad_norm": 0.1785876037587625, "learning_rate": 9.991539607300586e-05, "loss": 2.1903, "step": 2401 }, { "epoch": 0.018680221362956235, "grad_norm": 0.22344381817284764, "learning_rate": 9.991532501227432e-05, "loss": 2.1728, "step": 2402 }, { "epoch": 0.018687998307736816, "grad_norm": 0.2425339778963928, "learning_rate": 9.991525392173784e-05, "loss": 2.1958, "step": 2403 }, { "epoch": 0.018695775252517397, "grad_norm": 0.18681136178666477, "learning_rate": 9.991518280139646e-05, "loss": 2.1587, "step": 2404 }, { "epoch": 0.01870355219729798, "grad_norm": 0.1266213934439938, "learning_rate": 9.991511165125027e-05, "loss": 2.154, "step": 2405 }, { "epoch": 0.01871132914207856, "grad_norm": 0.15981678015695566, "learning_rate": 9.991504047129925e-05, "loss": 2.1525, "step": 2406 }, { "epoch": 0.01871910608685914, "grad_norm": 0.17701584934500675, "learning_rate": 9.991496926154347e-05, "loss": 2.1374, "step": 2407 }, { "epoch": 0.018726883031639722, "grad_norm": 0.13697583519196604, "learning_rate": 9.991489802198295e-05, "loss": 2.12, "step": 2408 }, { "epoch": 0.018734659976420303, "grad_norm": 0.1464066589808614, "learning_rate": 9.991482675261779e-05, "loss": 2.1791, "step": 2409 }, { "epoch": 0.018742436921200884, "grad_norm": 0.21291117848240484, "learning_rate": 9.991475545344797e-05, "loss": 2.1469, "step": 2410 }, { "epoch": 0.018750213865981465, "grad_norm": 0.1323416551591574, "learning_rate": 9.991468412447356e-05, "loss": 2.1539, "step": 2411 }, { "epoch": 0.018757990810762046, "grad_norm": 0.12628997024736865, "learning_rate": 9.991461276569461e-05, "loss": 2.1458, "step": 2412 }, { "epoch": 0.018765767755542628, "grad_norm": 0.12384171498929653, "learning_rate": 9.991454137711114e-05, "loss": 2.1599, "step": 2413 }, { "epoch": 0.01877354470032321, "grad_norm": 0.1804523838237584, "learning_rate": 9.991446995872322e-05, "loss": 2.1365, "step": 2414 }, { "epoch": 0.01878132164510379, "grad_norm": 0.12164123859010535, "learning_rate": 9.991439851053085e-05, "loss": 2.1531, "step": 2415 }, { "epoch": 0.01878909858988437, "grad_norm": 0.13022550818441203, "learning_rate": 9.991432703253414e-05, "loss": 2.1893, "step": 2416 }, { "epoch": 0.018796875534664952, "grad_norm": 0.1315849085079062, "learning_rate": 9.991425552473308e-05, "loss": 2.1706, "step": 2417 }, { "epoch": 0.018804652479445533, "grad_norm": 0.14848587067597713, "learning_rate": 9.99141839871277e-05, "loss": 2.1631, "step": 2418 }, { "epoch": 0.018812429424226115, "grad_norm": 0.1519420746205896, "learning_rate": 9.991411241971811e-05, "loss": 2.138, "step": 2419 }, { "epoch": 0.018820206369006696, "grad_norm": 0.14863392441656814, "learning_rate": 9.991404082250429e-05, "loss": 2.1557, "step": 2420 }, { "epoch": 0.01882798331378728, "grad_norm": 0.1253513018325483, "learning_rate": 9.99139691954863e-05, "loss": 2.1708, "step": 2421 }, { "epoch": 0.01883576025856786, "grad_norm": 0.1256669132026189, "learning_rate": 9.99138975386642e-05, "loss": 2.0993, "step": 2422 }, { "epoch": 0.018843537203348443, "grad_norm": 0.14378346979547824, "learning_rate": 9.991382585203802e-05, "loss": 2.1191, "step": 2423 }, { "epoch": 0.018851314148129024, "grad_norm": 0.14027279580418436, "learning_rate": 9.991375413560779e-05, "loss": 2.1286, "step": 2424 }, { "epoch": 0.018859091092909605, "grad_norm": 0.12524968932021727, "learning_rate": 9.991368238937358e-05, "loss": 2.2046, "step": 2425 }, { "epoch": 0.018866868037690186, "grad_norm": 0.1520774376136882, "learning_rate": 9.99136106133354e-05, "loss": 2.1622, "step": 2426 }, { "epoch": 0.018874644982470767, "grad_norm": 0.14773809781556302, "learning_rate": 9.991353880749332e-05, "loss": 2.1356, "step": 2427 }, { "epoch": 0.01888242192725135, "grad_norm": 0.12229621353222643, "learning_rate": 9.991346697184738e-05, "loss": 2.1639, "step": 2428 }, { "epoch": 0.01889019887203193, "grad_norm": 0.13153057619034011, "learning_rate": 9.991339510639761e-05, "loss": 2.1496, "step": 2429 }, { "epoch": 0.01889797581681251, "grad_norm": 0.15200753889619836, "learning_rate": 9.991332321114408e-05, "loss": 2.1217, "step": 2430 }, { "epoch": 0.018905752761593092, "grad_norm": 0.1474502548489185, "learning_rate": 9.991325128608679e-05, "loss": 2.1572, "step": 2431 }, { "epoch": 0.018913529706373673, "grad_norm": 0.11447641388268859, "learning_rate": 9.991317933122582e-05, "loss": 2.1353, "step": 2432 }, { "epoch": 0.018921306651154254, "grad_norm": 0.13759430299129133, "learning_rate": 9.991310734656118e-05, "loss": 2.1385, "step": 2433 }, { "epoch": 0.018929083595934836, "grad_norm": 0.19805226776482202, "learning_rate": 9.991303533209295e-05, "loss": 2.1623, "step": 2434 }, { "epoch": 0.018936860540715417, "grad_norm": 0.2738061301891357, "learning_rate": 9.991296328782113e-05, "loss": 2.1747, "step": 2435 }, { "epoch": 0.018944637485495998, "grad_norm": 0.16905563165093396, "learning_rate": 9.991289121374582e-05, "loss": 2.142, "step": 2436 }, { "epoch": 0.01895241443027658, "grad_norm": 0.14881304408342877, "learning_rate": 9.991281910986701e-05, "loss": 2.1576, "step": 2437 }, { "epoch": 0.01896019137505716, "grad_norm": 0.24407697927658414, "learning_rate": 9.991274697618478e-05, "loss": 2.2047, "step": 2438 }, { "epoch": 0.01896796831983774, "grad_norm": 0.30910409128414207, "learning_rate": 9.991267481269914e-05, "loss": 2.1482, "step": 2439 }, { "epoch": 0.018975745264618323, "grad_norm": 0.25648139606802806, "learning_rate": 9.991260261941016e-05, "loss": 2.0985, "step": 2440 }, { "epoch": 0.018983522209398904, "grad_norm": 0.13253867930940647, "learning_rate": 9.991253039631788e-05, "loss": 2.1498, "step": 2441 }, { "epoch": 0.018991299154179485, "grad_norm": 0.22647793234983057, "learning_rate": 9.991245814342232e-05, "loss": 2.1264, "step": 2442 }, { "epoch": 0.018999076098960066, "grad_norm": 0.31563556612049665, "learning_rate": 9.991238586072356e-05, "loss": 2.1041, "step": 2443 }, { "epoch": 0.019006853043740647, "grad_norm": 0.2649466171174294, "learning_rate": 9.99123135482216e-05, "loss": 2.1272, "step": 2444 }, { "epoch": 0.01901462998852123, "grad_norm": 0.13379656955038263, "learning_rate": 9.991224120591652e-05, "loss": 2.1502, "step": 2445 }, { "epoch": 0.01902240693330181, "grad_norm": 0.28181981266180745, "learning_rate": 9.991216883380834e-05, "loss": 2.1631, "step": 2446 }, { "epoch": 0.01903018387808239, "grad_norm": 0.3767099401638713, "learning_rate": 9.991209643189712e-05, "loss": 2.128, "step": 2447 }, { "epoch": 0.019037960822862972, "grad_norm": 0.2221065070148826, "learning_rate": 9.991202400018289e-05, "loss": 2.1902, "step": 2448 }, { "epoch": 0.019045737767643553, "grad_norm": 0.15690063366620327, "learning_rate": 9.99119515386657e-05, "loss": 2.1972, "step": 2449 }, { "epoch": 0.019053514712424138, "grad_norm": 0.2600212758514075, "learning_rate": 9.991187904734559e-05, "loss": 2.1851, "step": 2450 }, { "epoch": 0.01906129165720472, "grad_norm": 0.2050049494951091, "learning_rate": 9.991180652622261e-05, "loss": 2.1806, "step": 2451 }, { "epoch": 0.0190690686019853, "grad_norm": 0.11813758767697385, "learning_rate": 9.99117339752968e-05, "loss": 2.1366, "step": 2452 }, { "epoch": 0.01907684554676588, "grad_norm": 0.1694920494190201, "learning_rate": 9.99116613945682e-05, "loss": 2.144, "step": 2453 }, { "epoch": 0.019084622491546462, "grad_norm": 0.15916719284857628, "learning_rate": 9.991158878403685e-05, "loss": 2.092, "step": 2454 }, { "epoch": 0.019092399436327043, "grad_norm": 0.1198423950166701, "learning_rate": 9.99115161437028e-05, "loss": 2.0984, "step": 2455 }, { "epoch": 0.019100176381107625, "grad_norm": 0.14414559144461891, "learning_rate": 9.99114434735661e-05, "loss": 2.1613, "step": 2456 }, { "epoch": 0.019107953325888206, "grad_norm": 0.162261414025352, "learning_rate": 9.991137077362678e-05, "loss": 2.1248, "step": 2457 }, { "epoch": 0.019115730270668787, "grad_norm": 0.13356051996661217, "learning_rate": 9.991129804388489e-05, "loss": 2.1374, "step": 2458 }, { "epoch": 0.019123507215449368, "grad_norm": 0.12236648022104266, "learning_rate": 9.991122528434047e-05, "loss": 2.1474, "step": 2459 }, { "epoch": 0.01913128416022995, "grad_norm": 0.15272661780222138, "learning_rate": 9.991115249499356e-05, "loss": 2.148, "step": 2460 }, { "epoch": 0.01913906110501053, "grad_norm": 0.15212747863500667, "learning_rate": 9.991107967584422e-05, "loss": 2.1432, "step": 2461 }, { "epoch": 0.01914683804979111, "grad_norm": 0.11823530951206279, "learning_rate": 9.991100682689247e-05, "loss": 2.1785, "step": 2462 }, { "epoch": 0.019154614994571693, "grad_norm": 0.18587266333345379, "learning_rate": 9.991093394813839e-05, "loss": 2.1686, "step": 2463 }, { "epoch": 0.019162391939352274, "grad_norm": 0.249006521323332, "learning_rate": 9.991086103958198e-05, "loss": 2.1344, "step": 2464 }, { "epoch": 0.019170168884132855, "grad_norm": 0.19393311709993297, "learning_rate": 9.99107881012233e-05, "loss": 2.1918, "step": 2465 }, { "epoch": 0.019177945828913436, "grad_norm": 0.12047776097280836, "learning_rate": 9.991071513306241e-05, "loss": 2.1436, "step": 2466 }, { "epoch": 0.019185722773694017, "grad_norm": 0.2017028311922553, "learning_rate": 9.991064213509934e-05, "loss": 2.0974, "step": 2467 }, { "epoch": 0.0191934997184746, "grad_norm": 0.2496204979160858, "learning_rate": 9.991056910733412e-05, "loss": 2.0757, "step": 2468 }, { "epoch": 0.01920127666325518, "grad_norm": 0.18554684815522365, "learning_rate": 9.991049604976681e-05, "loss": 2.1579, "step": 2469 }, { "epoch": 0.01920905360803576, "grad_norm": 0.12484395490561248, "learning_rate": 9.991042296239747e-05, "loss": 2.119, "step": 2470 }, { "epoch": 0.019216830552816342, "grad_norm": 0.23161460780507137, "learning_rate": 9.991034984522611e-05, "loss": 2.1276, "step": 2471 }, { "epoch": 0.019224607497596923, "grad_norm": 0.2554095796035401, "learning_rate": 9.99102766982528e-05, "loss": 2.1883, "step": 2472 }, { "epoch": 0.019232384442377504, "grad_norm": 0.1384164850097075, "learning_rate": 9.991020352147755e-05, "loss": 2.182, "step": 2473 }, { "epoch": 0.019240161387158086, "grad_norm": 0.17243065707175384, "learning_rate": 9.991013031490044e-05, "loss": 2.164, "step": 2474 }, { "epoch": 0.019247938331938667, "grad_norm": 0.2039306846286029, "learning_rate": 9.991005707852149e-05, "loss": 2.0874, "step": 2475 }, { "epoch": 0.019255715276719248, "grad_norm": 0.12707649367759405, "learning_rate": 9.990998381234075e-05, "loss": 2.1547, "step": 2476 }, { "epoch": 0.01926349222149983, "grad_norm": 0.168581934719287, "learning_rate": 9.990991051635828e-05, "loss": 2.1486, "step": 2477 }, { "epoch": 0.01927126916628041, "grad_norm": 0.18920595159545242, "learning_rate": 9.990983719057412e-05, "loss": 2.1392, "step": 2478 }, { "epoch": 0.01927904611106099, "grad_norm": 0.13866083568239573, "learning_rate": 9.99097638349883e-05, "loss": 2.1349, "step": 2479 }, { "epoch": 0.019286823055841576, "grad_norm": 0.13017800236193677, "learning_rate": 9.990969044960085e-05, "loss": 2.1893, "step": 2480 }, { "epoch": 0.019294600000622157, "grad_norm": 0.15558318909351374, "learning_rate": 9.990961703441184e-05, "loss": 2.1534, "step": 2481 }, { "epoch": 0.01930237694540274, "grad_norm": 0.11849913462545877, "learning_rate": 9.99095435894213e-05, "loss": 2.1332, "step": 2482 }, { "epoch": 0.01931015389018332, "grad_norm": 0.20290523502274557, "learning_rate": 9.99094701146293e-05, "loss": 2.1297, "step": 2483 }, { "epoch": 0.0193179308349639, "grad_norm": 0.16962680676407216, "learning_rate": 9.990939661003584e-05, "loss": 2.1369, "step": 2484 }, { "epoch": 0.019325707779744482, "grad_norm": 0.12316278253987002, "learning_rate": 9.990932307564101e-05, "loss": 2.1501, "step": 2485 }, { "epoch": 0.019333484724525063, "grad_norm": 0.17588108031202215, "learning_rate": 9.990924951144483e-05, "loss": 2.132, "step": 2486 }, { "epoch": 0.019341261669305644, "grad_norm": 0.25333754506330886, "learning_rate": 9.990917591744733e-05, "loss": 2.1375, "step": 2487 }, { "epoch": 0.019349038614086225, "grad_norm": 0.13629665162566815, "learning_rate": 9.990910229364858e-05, "loss": 2.1278, "step": 2488 }, { "epoch": 0.019356815558866806, "grad_norm": 0.2215125642817452, "learning_rate": 9.99090286400486e-05, "loss": 2.1622, "step": 2489 }, { "epoch": 0.019364592503647388, "grad_norm": 0.18495896175266177, "learning_rate": 9.990895495664746e-05, "loss": 2.1379, "step": 2490 }, { "epoch": 0.01937236944842797, "grad_norm": 0.13271608820595446, "learning_rate": 9.990888124344518e-05, "loss": 2.1695, "step": 2491 }, { "epoch": 0.01938014639320855, "grad_norm": 0.2497050098594738, "learning_rate": 9.990880750044183e-05, "loss": 2.154, "step": 2492 }, { "epoch": 0.01938792333798913, "grad_norm": 0.20543999831769608, "learning_rate": 9.990873372763742e-05, "loss": 2.1521, "step": 2493 }, { "epoch": 0.019395700282769712, "grad_norm": 0.13888073046093877, "learning_rate": 9.990865992503204e-05, "loss": 2.1275, "step": 2494 }, { "epoch": 0.019403477227550293, "grad_norm": 0.2233029065123173, "learning_rate": 9.990858609262569e-05, "loss": 2.1055, "step": 2495 }, { "epoch": 0.019411254172330875, "grad_norm": 0.17404953257182051, "learning_rate": 9.990851223041843e-05, "loss": 2.1727, "step": 2496 }, { "epoch": 0.019419031117111456, "grad_norm": 0.13892129802426503, "learning_rate": 9.990843833841032e-05, "loss": 2.1848, "step": 2497 }, { "epoch": 0.019426808061892037, "grad_norm": 0.25699111384125944, "learning_rate": 9.990836441660138e-05, "loss": 2.1312, "step": 2498 }, { "epoch": 0.019434585006672618, "grad_norm": 0.1545057995563264, "learning_rate": 9.990829046499167e-05, "loss": 2.1415, "step": 2499 }, { "epoch": 0.0194423619514532, "grad_norm": 0.14400305178747938, "learning_rate": 9.990821648358122e-05, "loss": 2.1564, "step": 2500 }, { "epoch": 0.01945013889623378, "grad_norm": 0.15731267596764484, "learning_rate": 9.990814247237008e-05, "loss": 2.139, "step": 2501 }, { "epoch": 0.01945791584101436, "grad_norm": 0.1296509381323051, "learning_rate": 9.99080684313583e-05, "loss": 2.1797, "step": 2502 }, { "epoch": 0.019465692785794943, "grad_norm": 0.16172233086738247, "learning_rate": 9.990799436054592e-05, "loss": 2.1715, "step": 2503 }, { "epoch": 0.019473469730575524, "grad_norm": 0.1772660549299164, "learning_rate": 9.990792025993299e-05, "loss": 2.1622, "step": 2504 }, { "epoch": 0.019481246675356105, "grad_norm": 0.13287357910984815, "learning_rate": 9.990784612951954e-05, "loss": 2.1398, "step": 2505 }, { "epoch": 0.019489023620136686, "grad_norm": 0.17265473117540867, "learning_rate": 9.990777196930564e-05, "loss": 2.1516, "step": 2506 }, { "epoch": 0.019496800564917267, "grad_norm": 0.1358428911379162, "learning_rate": 9.99076977792913e-05, "loss": 2.075, "step": 2507 }, { "epoch": 0.01950457750969785, "grad_norm": 0.1506219841844603, "learning_rate": 9.990762355947659e-05, "loss": 2.2199, "step": 2508 }, { "epoch": 0.01951235445447843, "grad_norm": 0.1732246802811716, "learning_rate": 9.990754930986153e-05, "loss": 2.1431, "step": 2509 }, { "epoch": 0.019520131399259014, "grad_norm": 0.11769473387622677, "learning_rate": 9.990747503044619e-05, "loss": 2.1124, "step": 2510 }, { "epoch": 0.019527908344039595, "grad_norm": 0.1511224117209667, "learning_rate": 9.990740072123062e-05, "loss": 2.149, "step": 2511 }, { "epoch": 0.019535685288820177, "grad_norm": 0.15776437837593393, "learning_rate": 9.990732638221484e-05, "loss": 2.1361, "step": 2512 }, { "epoch": 0.019543462233600758, "grad_norm": 0.11635189236013901, "learning_rate": 9.99072520133989e-05, "loss": 2.1312, "step": 2513 }, { "epoch": 0.01955123917838134, "grad_norm": 0.18003897329801624, "learning_rate": 9.990717761478286e-05, "loss": 2.1575, "step": 2514 }, { "epoch": 0.01955901612316192, "grad_norm": 0.17179112312226932, "learning_rate": 9.990710318636674e-05, "loss": 2.213, "step": 2515 }, { "epoch": 0.0195667930679425, "grad_norm": 0.13469417263297537, "learning_rate": 9.99070287281506e-05, "loss": 2.1854, "step": 2516 }, { "epoch": 0.019574570012723082, "grad_norm": 0.1823251022317733, "learning_rate": 9.990695424013448e-05, "loss": 2.1187, "step": 2517 }, { "epoch": 0.019582346957503664, "grad_norm": 0.14456184392599367, "learning_rate": 9.990687972231843e-05, "loss": 2.1362, "step": 2518 }, { "epoch": 0.019590123902284245, "grad_norm": 0.14415823319691368, "learning_rate": 9.99068051747025e-05, "loss": 2.1539, "step": 2519 }, { "epoch": 0.019597900847064826, "grad_norm": 0.163065853292283, "learning_rate": 9.99067305972867e-05, "loss": 2.16, "step": 2520 }, { "epoch": 0.019605677791845407, "grad_norm": 0.1477992954263458, "learning_rate": 9.990665599007112e-05, "loss": 2.1526, "step": 2521 }, { "epoch": 0.01961345473662599, "grad_norm": 0.1509662475218402, "learning_rate": 9.990658135305577e-05, "loss": 2.1507, "step": 2522 }, { "epoch": 0.01962123168140657, "grad_norm": 0.16572212924344804, "learning_rate": 9.990650668624072e-05, "loss": 2.1491, "step": 2523 }, { "epoch": 0.01962900862618715, "grad_norm": 0.14194917661008769, "learning_rate": 9.9906431989626e-05, "loss": 2.1587, "step": 2524 }, { "epoch": 0.019636785570967732, "grad_norm": 0.14761617839766178, "learning_rate": 9.990635726321165e-05, "loss": 2.1613, "step": 2525 }, { "epoch": 0.019644562515748313, "grad_norm": 0.1342359768410953, "learning_rate": 9.990628250699774e-05, "loss": 2.1421, "step": 2526 }, { "epoch": 0.019652339460528894, "grad_norm": 0.12256306918954885, "learning_rate": 9.990620772098428e-05, "loss": 2.173, "step": 2527 }, { "epoch": 0.019660116405309475, "grad_norm": 0.14516816314745667, "learning_rate": 9.990613290517134e-05, "loss": 2.147, "step": 2528 }, { "epoch": 0.019667893350090056, "grad_norm": 0.1348672174560129, "learning_rate": 9.990605805955895e-05, "loss": 2.1665, "step": 2529 }, { "epoch": 0.019675670294870638, "grad_norm": 0.12899706576387227, "learning_rate": 9.990598318414717e-05, "loss": 2.1595, "step": 2530 }, { "epoch": 0.01968344723965122, "grad_norm": 0.13525751744889025, "learning_rate": 9.990590827893602e-05, "loss": 2.0742, "step": 2531 }, { "epoch": 0.0196912241844318, "grad_norm": 0.11598388796037967, "learning_rate": 9.990583334392558e-05, "loss": 2.1629, "step": 2532 }, { "epoch": 0.01969900112921238, "grad_norm": 0.1499611501925672, "learning_rate": 9.990575837911587e-05, "loss": 2.0895, "step": 2533 }, { "epoch": 0.019706778073992962, "grad_norm": 0.14231602189249642, "learning_rate": 9.990568338450693e-05, "loss": 2.1612, "step": 2534 }, { "epoch": 0.019714555018773543, "grad_norm": 0.12058384820863906, "learning_rate": 9.990560836009883e-05, "loss": 2.1461, "step": 2535 }, { "epoch": 0.019722331963554125, "grad_norm": 0.13660691306103234, "learning_rate": 9.990553330589158e-05, "loss": 2.1292, "step": 2536 }, { "epoch": 0.019730108908334706, "grad_norm": 0.12450522185325782, "learning_rate": 9.990545822188527e-05, "loss": 2.1444, "step": 2537 }, { "epoch": 0.019737885853115287, "grad_norm": 0.1256145560207347, "learning_rate": 9.990538310807989e-05, "loss": 2.11, "step": 2538 }, { "epoch": 0.01974566279789587, "grad_norm": 0.12858512791578405, "learning_rate": 9.990530796447552e-05, "loss": 2.127, "step": 2539 }, { "epoch": 0.019753439742676453, "grad_norm": 0.12929319672029224, "learning_rate": 9.990523279107223e-05, "loss": 2.1783, "step": 2540 }, { "epoch": 0.019761216687457034, "grad_norm": 0.13203392967676358, "learning_rate": 9.990515758787e-05, "loss": 2.1469, "step": 2541 }, { "epoch": 0.019768993632237615, "grad_norm": 0.11845309312188289, "learning_rate": 9.990508235486892e-05, "loss": 2.1457, "step": 2542 }, { "epoch": 0.019776770577018196, "grad_norm": 0.12571380281823002, "learning_rate": 9.990500709206902e-05, "loss": 2.174, "step": 2543 }, { "epoch": 0.019784547521798777, "grad_norm": 0.12301716671877963, "learning_rate": 9.990493179947036e-05, "loss": 2.1402, "step": 2544 }, { "epoch": 0.01979232446657936, "grad_norm": 0.11764178007796033, "learning_rate": 9.990485647707296e-05, "loss": 2.1292, "step": 2545 }, { "epoch": 0.01980010141135994, "grad_norm": 0.11730400390148879, "learning_rate": 9.99047811248769e-05, "loss": 2.1159, "step": 2546 }, { "epoch": 0.01980787835614052, "grad_norm": 0.1156447275546899, "learning_rate": 9.990470574288218e-05, "loss": 2.1038, "step": 2547 }, { "epoch": 0.019815655300921102, "grad_norm": 0.1267701027952618, "learning_rate": 9.990463033108886e-05, "loss": 2.1493, "step": 2548 }, { "epoch": 0.019823432245701683, "grad_norm": 0.1208775016712611, "learning_rate": 9.990455488949702e-05, "loss": 2.1243, "step": 2549 }, { "epoch": 0.019831209190482264, "grad_norm": 0.12635046486646387, "learning_rate": 9.990447941810667e-05, "loss": 2.075, "step": 2550 }, { "epoch": 0.019838986135262845, "grad_norm": 0.14238884122929577, "learning_rate": 9.990440391691787e-05, "loss": 2.0955, "step": 2551 }, { "epoch": 0.019846763080043427, "grad_norm": 0.1451109810117627, "learning_rate": 9.990432838593064e-05, "loss": 2.1547, "step": 2552 }, { "epoch": 0.019854540024824008, "grad_norm": 0.11545734635983153, "learning_rate": 9.990425282514507e-05, "loss": 2.1304, "step": 2553 }, { "epoch": 0.01986231696960459, "grad_norm": 0.13068264681786043, "learning_rate": 9.990417723456114e-05, "loss": 2.1665, "step": 2554 }, { "epoch": 0.01987009391438517, "grad_norm": 0.14354531685853145, "learning_rate": 9.990410161417897e-05, "loss": 2.1257, "step": 2555 }, { "epoch": 0.01987787085916575, "grad_norm": 0.13039883199324545, "learning_rate": 9.990402596399855e-05, "loss": 2.2047, "step": 2556 }, { "epoch": 0.019885647803946332, "grad_norm": 0.2380096001843265, "learning_rate": 9.990395028401995e-05, "loss": 2.163, "step": 2557 }, { "epoch": 0.019893424748726914, "grad_norm": 0.1366906996250499, "learning_rate": 9.990387457424322e-05, "loss": 2.1312, "step": 2558 }, { "epoch": 0.019901201693507495, "grad_norm": 0.13231173005313057, "learning_rate": 9.990379883466839e-05, "loss": 2.1338, "step": 2559 }, { "epoch": 0.019908978638288076, "grad_norm": 0.11586699338588581, "learning_rate": 9.99037230652955e-05, "loss": 2.1108, "step": 2560 }, { "epoch": 0.019916755583068657, "grad_norm": 0.12259942536253723, "learning_rate": 9.99036472661246e-05, "loss": 2.1178, "step": 2561 }, { "epoch": 0.01992453252784924, "grad_norm": 0.12403003935700667, "learning_rate": 9.990357143715577e-05, "loss": 2.118, "step": 2562 }, { "epoch": 0.01993230947262982, "grad_norm": 0.11868651826447923, "learning_rate": 9.9903495578389e-05, "loss": 2.1116, "step": 2563 }, { "epoch": 0.0199400864174104, "grad_norm": 0.11677741092704019, "learning_rate": 9.990341968982438e-05, "loss": 2.1674, "step": 2564 }, { "epoch": 0.019947863362190982, "grad_norm": 0.11645130630504688, "learning_rate": 9.990334377146191e-05, "loss": 2.1646, "step": 2565 }, { "epoch": 0.019955640306971563, "grad_norm": 0.13034518362583913, "learning_rate": 9.990326782330169e-05, "loss": 2.1564, "step": 2566 }, { "epoch": 0.019963417251752144, "grad_norm": 0.14184507560373474, "learning_rate": 9.990319184534371e-05, "loss": 2.1164, "step": 2567 }, { "epoch": 0.019971194196532725, "grad_norm": 0.12586404409059268, "learning_rate": 9.990311583758807e-05, "loss": 2.0647, "step": 2568 }, { "epoch": 0.01997897114131331, "grad_norm": 0.11704433621978087, "learning_rate": 9.990303980003476e-05, "loss": 2.0792, "step": 2569 }, { "epoch": 0.01998674808609389, "grad_norm": 0.1644428467014956, "learning_rate": 9.990296373268387e-05, "loss": 2.1613, "step": 2570 }, { "epoch": 0.019994525030874472, "grad_norm": 0.1743819790484633, "learning_rate": 9.990288763553543e-05, "loss": 2.1487, "step": 2571 }, { "epoch": 0.020002301975655053, "grad_norm": 0.12992051558139311, "learning_rate": 9.990281150858947e-05, "loss": 2.1011, "step": 2572 }, { "epoch": 0.020010078920435635, "grad_norm": 0.11963053618537517, "learning_rate": 9.990273535184607e-05, "loss": 2.1011, "step": 2573 }, { "epoch": 0.020017855865216216, "grad_norm": 0.2794592358915931, "learning_rate": 9.990265916530525e-05, "loss": 2.1689, "step": 2574 }, { "epoch": 0.020025632809996797, "grad_norm": 0.17582621714807758, "learning_rate": 9.990258294896705e-05, "loss": 2.1507, "step": 2575 }, { "epoch": 0.020033409754777378, "grad_norm": 0.13423416809814953, "learning_rate": 9.990250670283153e-05, "loss": 2.1352, "step": 2576 }, { "epoch": 0.02004118669955796, "grad_norm": 0.12161058355882255, "learning_rate": 9.990243042689873e-05, "loss": 2.0995, "step": 2577 }, { "epoch": 0.02004896364433854, "grad_norm": 0.15596146570953384, "learning_rate": 9.99023541211687e-05, "loss": 2.1681, "step": 2578 }, { "epoch": 0.02005674058911912, "grad_norm": 0.14760374168335133, "learning_rate": 9.99022777856415e-05, "loss": 2.1463, "step": 2579 }, { "epoch": 0.020064517533899703, "grad_norm": 0.12986125870514925, "learning_rate": 9.990220142031713e-05, "loss": 2.1819, "step": 2580 }, { "epoch": 0.020072294478680284, "grad_norm": 0.11644470277239292, "learning_rate": 9.990212502519567e-05, "loss": 2.1802, "step": 2581 }, { "epoch": 0.020080071423460865, "grad_norm": 0.12974433822918804, "learning_rate": 9.990204860027717e-05, "loss": 2.1194, "step": 2582 }, { "epoch": 0.020087848368241446, "grad_norm": 0.13735307331895752, "learning_rate": 9.990197214556166e-05, "loss": 2.1503, "step": 2583 }, { "epoch": 0.020095625313022027, "grad_norm": 0.11772803789092508, "learning_rate": 9.99018956610492e-05, "loss": 2.1685, "step": 2584 }, { "epoch": 0.02010340225780261, "grad_norm": 0.1279536374635822, "learning_rate": 9.990181914673982e-05, "loss": 2.1057, "step": 2585 }, { "epoch": 0.02011117920258319, "grad_norm": 0.1201181802845967, "learning_rate": 9.990174260263356e-05, "loss": 2.181, "step": 2586 }, { "epoch": 0.02011895614736377, "grad_norm": 0.1237541684738939, "learning_rate": 9.990166602873049e-05, "loss": 2.1567, "step": 2587 }, { "epoch": 0.020126733092144352, "grad_norm": 0.11870130121432478, "learning_rate": 9.990158942503064e-05, "loss": 2.1435, "step": 2588 }, { "epoch": 0.020134510036924933, "grad_norm": 0.11268284743536922, "learning_rate": 9.990151279153405e-05, "loss": 2.1677, "step": 2589 }, { "epoch": 0.020142286981705514, "grad_norm": 0.11455154763579574, "learning_rate": 9.990143612824079e-05, "loss": 2.1271, "step": 2590 }, { "epoch": 0.020150063926486095, "grad_norm": 0.11558705131401587, "learning_rate": 9.990135943515089e-05, "loss": 2.1423, "step": 2591 }, { "epoch": 0.020157840871266677, "grad_norm": 0.11961408760923779, "learning_rate": 9.99012827122644e-05, "loss": 2.1175, "step": 2592 }, { "epoch": 0.020165617816047258, "grad_norm": 0.13499998583083792, "learning_rate": 9.990120595958134e-05, "loss": 2.1346, "step": 2593 }, { "epoch": 0.02017339476082784, "grad_norm": 0.1345356995555066, "learning_rate": 9.99011291771018e-05, "loss": 2.1233, "step": 2594 }, { "epoch": 0.02018117170560842, "grad_norm": 0.12010464484651712, "learning_rate": 9.99010523648258e-05, "loss": 2.135, "step": 2595 }, { "epoch": 0.020188948650389, "grad_norm": 0.13904144732650808, "learning_rate": 9.990097552275339e-05, "loss": 2.1474, "step": 2596 }, { "epoch": 0.020196725595169582, "grad_norm": 0.1535556763839195, "learning_rate": 9.990089865088461e-05, "loss": 2.18, "step": 2597 }, { "epoch": 0.020204502539950164, "grad_norm": 0.1226166797064439, "learning_rate": 9.990082174921952e-05, "loss": 2.1596, "step": 2598 }, { "epoch": 0.020212279484730748, "grad_norm": 0.14853947777954468, "learning_rate": 9.990074481775816e-05, "loss": 2.1589, "step": 2599 }, { "epoch": 0.02022005642951133, "grad_norm": 0.16716090709827608, "learning_rate": 9.990066785650056e-05, "loss": 2.1486, "step": 2600 }, { "epoch": 0.02022783337429191, "grad_norm": 0.1322889934439434, "learning_rate": 9.990059086544678e-05, "loss": 2.0822, "step": 2601 }, { "epoch": 0.020235610319072492, "grad_norm": 0.13503796306158047, "learning_rate": 9.990051384459688e-05, "loss": 2.1582, "step": 2602 }, { "epoch": 0.020243387263853073, "grad_norm": 0.16411197898268223, "learning_rate": 9.990043679395088e-05, "loss": 2.1816, "step": 2603 }, { "epoch": 0.020251164208633654, "grad_norm": 0.14483267130575647, "learning_rate": 9.990035971350885e-05, "loss": 2.1425, "step": 2604 }, { "epoch": 0.020258941153414235, "grad_norm": 0.16270241157222345, "learning_rate": 9.99002826032708e-05, "loss": 2.1749, "step": 2605 }, { "epoch": 0.020266718098194816, "grad_norm": 0.12444997835060806, "learning_rate": 9.990020546323681e-05, "loss": 2.1828, "step": 2606 }, { "epoch": 0.020274495042975398, "grad_norm": 0.12441350114200919, "learning_rate": 9.990012829340693e-05, "loss": 2.1293, "step": 2607 }, { "epoch": 0.02028227198775598, "grad_norm": 0.19219077157966413, "learning_rate": 9.990005109378117e-05, "loss": 2.1283, "step": 2608 }, { "epoch": 0.02029004893253656, "grad_norm": 0.1279469407840623, "learning_rate": 9.989997386435962e-05, "loss": 2.1588, "step": 2609 }, { "epoch": 0.02029782587731714, "grad_norm": 0.15371020802252755, "learning_rate": 9.989989660514229e-05, "loss": 2.1685, "step": 2610 }, { "epoch": 0.020305602822097722, "grad_norm": 0.14812403445419042, "learning_rate": 9.989981931612924e-05, "loss": 2.0565, "step": 2611 }, { "epoch": 0.020313379766878303, "grad_norm": 0.11735324268381414, "learning_rate": 9.989974199732053e-05, "loss": 2.1504, "step": 2612 }, { "epoch": 0.020321156711658885, "grad_norm": 0.125485745313595, "learning_rate": 9.989966464871619e-05, "loss": 2.1359, "step": 2613 }, { "epoch": 0.020328933656439466, "grad_norm": 0.1242096335077398, "learning_rate": 9.989958727031625e-05, "loss": 2.0963, "step": 2614 }, { "epoch": 0.020336710601220047, "grad_norm": 0.23217974744218844, "learning_rate": 9.989950986212079e-05, "loss": 2.1717, "step": 2615 }, { "epoch": 0.020344487546000628, "grad_norm": 0.12144153734961038, "learning_rate": 9.989943242412983e-05, "loss": 2.136, "step": 2616 }, { "epoch": 0.02035226449078121, "grad_norm": 0.1192694428823151, "learning_rate": 9.989935495634344e-05, "loss": 2.0818, "step": 2617 }, { "epoch": 0.02036004143556179, "grad_norm": 0.11564580722507344, "learning_rate": 9.989927745876164e-05, "loss": 2.1034, "step": 2618 }, { "epoch": 0.02036781838034237, "grad_norm": 0.1165694743183819, "learning_rate": 9.98991999313845e-05, "loss": 2.1009, "step": 2619 }, { "epoch": 0.020375595325122953, "grad_norm": 0.12086733086433266, "learning_rate": 9.989912237421206e-05, "loss": 2.1404, "step": 2620 }, { "epoch": 0.020383372269903534, "grad_norm": 0.11506207931135609, "learning_rate": 9.989904478724435e-05, "loss": 2.1137, "step": 2621 }, { "epoch": 0.020391149214684115, "grad_norm": 0.12259470642099976, "learning_rate": 9.989896717048144e-05, "loss": 2.125, "step": 2622 }, { "epoch": 0.020398926159464696, "grad_norm": 0.11877327733627356, "learning_rate": 9.989888952392335e-05, "loss": 2.1365, "step": 2623 }, { "epoch": 0.020406703104245277, "grad_norm": 0.11409780185973628, "learning_rate": 9.989881184757015e-05, "loss": 2.1638, "step": 2624 }, { "epoch": 0.02041448004902586, "grad_norm": 0.11819496601832212, "learning_rate": 9.989873414142187e-05, "loss": 2.1683, "step": 2625 }, { "epoch": 0.02042225699380644, "grad_norm": 0.12752654767809266, "learning_rate": 9.989865640547858e-05, "loss": 2.1625, "step": 2626 }, { "epoch": 0.02043003393858702, "grad_norm": 0.239050556518731, "learning_rate": 9.989857863974031e-05, "loss": 2.118, "step": 2627 }, { "epoch": 0.020437810883367605, "grad_norm": 0.11866087602835697, "learning_rate": 9.989850084420712e-05, "loss": 2.1681, "step": 2628 }, { "epoch": 0.020445587828148187, "grad_norm": 0.1198533744984593, "learning_rate": 9.989842301887901e-05, "loss": 2.1844, "step": 2629 }, { "epoch": 0.020453364772928768, "grad_norm": 0.11678621738798318, "learning_rate": 9.989834516375607e-05, "loss": 2.1452, "step": 2630 }, { "epoch": 0.02046114171770935, "grad_norm": 0.11684742792659944, "learning_rate": 9.989826727883834e-05, "loss": 2.1249, "step": 2631 }, { "epoch": 0.02046891866248993, "grad_norm": 0.17508754071886312, "learning_rate": 9.989818936412587e-05, "loss": 2.11, "step": 2632 }, { "epoch": 0.02047669560727051, "grad_norm": 0.23678220752871498, "learning_rate": 9.989811141961869e-05, "loss": 2.1456, "step": 2633 }, { "epoch": 0.020484472552051092, "grad_norm": 0.22480958313848504, "learning_rate": 9.989803344531685e-05, "loss": 2.1379, "step": 2634 }, { "epoch": 0.020492249496831674, "grad_norm": 0.17675683704359918, "learning_rate": 9.989795544122043e-05, "loss": 2.1626, "step": 2635 }, { "epoch": 0.020500026441612255, "grad_norm": 0.24503528952289128, "learning_rate": 9.989787740732942e-05, "loss": 2.1744, "step": 2636 }, { "epoch": 0.020507803386392836, "grad_norm": 0.11343525915272441, "learning_rate": 9.989779934364391e-05, "loss": 2.0882, "step": 2637 }, { "epoch": 0.020515580331173417, "grad_norm": 0.6568500685610745, "learning_rate": 9.989772125016392e-05, "loss": 2.1792, "step": 2638 }, { "epoch": 0.020523357275953998, "grad_norm": 0.16297366724309276, "learning_rate": 9.989764312688953e-05, "loss": 2.0656, "step": 2639 }, { "epoch": 0.02053113422073458, "grad_norm": 0.1463085960969009, "learning_rate": 9.989756497382075e-05, "loss": 2.088, "step": 2640 }, { "epoch": 0.02053891116551516, "grad_norm": 0.14415621223073946, "learning_rate": 9.989748679095765e-05, "loss": 2.1659, "step": 2641 }, { "epoch": 0.020546688110295742, "grad_norm": 0.1240388572257345, "learning_rate": 9.989740857830027e-05, "loss": 2.1187, "step": 2642 }, { "epoch": 0.020554465055076323, "grad_norm": 0.12258960415776483, "learning_rate": 9.989733033584865e-05, "loss": 2.1489, "step": 2643 }, { "epoch": 0.020562241999856904, "grad_norm": 0.12980801637475287, "learning_rate": 9.989725206360284e-05, "loss": 2.1128, "step": 2644 }, { "epoch": 0.020570018944637485, "grad_norm": 0.12416960546599055, "learning_rate": 9.989717376156289e-05, "loss": 2.1602, "step": 2645 }, { "epoch": 0.020577795889418066, "grad_norm": 0.12015370216167516, "learning_rate": 9.989709542972885e-05, "loss": 2.1904, "step": 2646 }, { "epoch": 0.020585572834198648, "grad_norm": 0.12258166884476182, "learning_rate": 9.989701706810077e-05, "loss": 2.1432, "step": 2647 }, { "epoch": 0.02059334977897923, "grad_norm": 0.24303211413993872, "learning_rate": 9.989693867667867e-05, "loss": 2.1521, "step": 2648 }, { "epoch": 0.02060112672375981, "grad_norm": 0.1338152415725466, "learning_rate": 9.989686025546263e-05, "loss": 2.1459, "step": 2649 }, { "epoch": 0.02060890366854039, "grad_norm": 0.128510291692753, "learning_rate": 9.989678180445267e-05, "loss": 2.1089, "step": 2650 }, { "epoch": 0.020616680613320972, "grad_norm": 0.12204202103417616, "learning_rate": 9.989670332364888e-05, "loss": 2.1201, "step": 2651 }, { "epoch": 0.020624457558101553, "grad_norm": 0.12232628426338504, "learning_rate": 9.989662481305125e-05, "loss": 2.1669, "step": 2652 }, { "epoch": 0.020632234502882135, "grad_norm": 0.12911115126679512, "learning_rate": 9.989654627265986e-05, "loss": 2.1693, "step": 2653 }, { "epoch": 0.020640011447662716, "grad_norm": 0.12066595954958508, "learning_rate": 9.989646770247477e-05, "loss": 2.122, "step": 2654 }, { "epoch": 0.020647788392443297, "grad_norm": 0.1281087443985541, "learning_rate": 9.989638910249597e-05, "loss": 2.1477, "step": 2655 }, { "epoch": 0.020655565337223878, "grad_norm": 0.1333988725608131, "learning_rate": 9.989631047272356e-05, "loss": 2.1241, "step": 2656 }, { "epoch": 0.02066334228200446, "grad_norm": 0.1174248965510674, "learning_rate": 9.989623181315758e-05, "loss": 2.1248, "step": 2657 }, { "epoch": 0.020671119226785044, "grad_norm": 0.12576897511607552, "learning_rate": 9.989615312379807e-05, "loss": 2.1625, "step": 2658 }, { "epoch": 0.020678896171565625, "grad_norm": 0.1430460268906773, "learning_rate": 9.989607440464506e-05, "loss": 2.1175, "step": 2659 }, { "epoch": 0.020686673116346206, "grad_norm": 0.17048711155165122, "learning_rate": 9.989599565569863e-05, "loss": 2.1006, "step": 2660 }, { "epoch": 0.020694450061126787, "grad_norm": 0.17793109275852606, "learning_rate": 9.98959168769588e-05, "loss": 2.1636, "step": 2661 }, { "epoch": 0.02070222700590737, "grad_norm": 0.1497337329323097, "learning_rate": 9.989583806842563e-05, "loss": 2.1613, "step": 2662 }, { "epoch": 0.02071000395068795, "grad_norm": 0.1181943188917551, "learning_rate": 9.989575923009915e-05, "loss": 2.1308, "step": 2663 }, { "epoch": 0.02071778089546853, "grad_norm": 0.16026490674198826, "learning_rate": 9.989568036197943e-05, "loss": 2.1297, "step": 2664 }, { "epoch": 0.020725557840249112, "grad_norm": 0.1965601510901014, "learning_rate": 9.989560146406652e-05, "loss": 2.1299, "step": 2665 }, { "epoch": 0.020733334785029693, "grad_norm": 0.15807264707962396, "learning_rate": 9.989552253636045e-05, "loss": 2.1046, "step": 2666 }, { "epoch": 0.020741111729810274, "grad_norm": 0.11898302485694838, "learning_rate": 9.989544357886126e-05, "loss": 2.0877, "step": 2667 }, { "epoch": 0.020748888674590855, "grad_norm": 0.1419984126009016, "learning_rate": 9.9895364591569e-05, "loss": 2.1409, "step": 2668 }, { "epoch": 0.020756665619371437, "grad_norm": 0.16793952707921206, "learning_rate": 9.989528557448376e-05, "loss": 2.1035, "step": 2669 }, { "epoch": 0.020764442564152018, "grad_norm": 0.1529882296523148, "learning_rate": 9.989520652760553e-05, "loss": 2.1729, "step": 2670 }, { "epoch": 0.0207722195089326, "grad_norm": 0.139091174123897, "learning_rate": 9.989512745093439e-05, "loss": 2.13, "step": 2671 }, { "epoch": 0.02077999645371318, "grad_norm": 0.1224508053216399, "learning_rate": 9.989504834447036e-05, "loss": 2.0713, "step": 2672 }, { "epoch": 0.02078777339849376, "grad_norm": 0.13137515540909758, "learning_rate": 9.989496920821353e-05, "loss": 2.1765, "step": 2673 }, { "epoch": 0.020795550343274342, "grad_norm": 0.12933708088142612, "learning_rate": 9.98948900421639e-05, "loss": 2.0791, "step": 2674 }, { "epoch": 0.020803327288054924, "grad_norm": 0.13271706749583997, "learning_rate": 9.989481084632155e-05, "loss": 2.1881, "step": 2675 }, { "epoch": 0.020811104232835505, "grad_norm": 0.1377246931337986, "learning_rate": 9.989473162068652e-05, "loss": 2.1471, "step": 2676 }, { "epoch": 0.020818881177616086, "grad_norm": 0.1391012996501279, "learning_rate": 9.989465236525886e-05, "loss": 2.1402, "step": 2677 }, { "epoch": 0.020826658122396667, "grad_norm": 0.127858392773834, "learning_rate": 9.98945730800386e-05, "loss": 2.1466, "step": 2678 }, { "epoch": 0.020834435067177248, "grad_norm": 0.12878943845616123, "learning_rate": 9.98944937650258e-05, "loss": 2.1649, "step": 2679 }, { "epoch": 0.02084221201195783, "grad_norm": 0.11196425188522154, "learning_rate": 9.98944144202205e-05, "loss": 2.1522, "step": 2680 }, { "epoch": 0.02084998895673841, "grad_norm": 0.16760115093595507, "learning_rate": 9.989433504562277e-05, "loss": 2.1309, "step": 2681 }, { "epoch": 0.020857765901518992, "grad_norm": 0.12722315055810987, "learning_rate": 9.989425564123263e-05, "loss": 2.1952, "step": 2682 }, { "epoch": 0.020865542846299573, "grad_norm": 0.12189334793313368, "learning_rate": 9.989417620705013e-05, "loss": 2.1024, "step": 2683 }, { "epoch": 0.020873319791080154, "grad_norm": 0.13208668898989764, "learning_rate": 9.989409674307535e-05, "loss": 2.1885, "step": 2684 }, { "epoch": 0.020881096735860735, "grad_norm": 0.1607755099621941, "learning_rate": 9.98940172493083e-05, "loss": 2.1027, "step": 2685 }, { "epoch": 0.020888873680641316, "grad_norm": 0.18101244967783384, "learning_rate": 9.989393772574903e-05, "loss": 2.1333, "step": 2686 }, { "epoch": 0.020896650625421898, "grad_norm": 0.1554753029890062, "learning_rate": 9.98938581723976e-05, "loss": 2.1699, "step": 2687 }, { "epoch": 0.020904427570202482, "grad_norm": 0.1321419070967589, "learning_rate": 9.989377858925408e-05, "loss": 2.1462, "step": 2688 }, { "epoch": 0.020912204514983063, "grad_norm": 0.11869404648942854, "learning_rate": 9.989369897631848e-05, "loss": 2.1793, "step": 2689 }, { "epoch": 0.020919981459763645, "grad_norm": 0.16369751971670027, "learning_rate": 9.989361933359086e-05, "loss": 2.127, "step": 2690 }, { "epoch": 0.020927758404544226, "grad_norm": 0.142857301966563, "learning_rate": 9.989353966107126e-05, "loss": 2.1003, "step": 2691 }, { "epoch": 0.020935535349324807, "grad_norm": 0.12071341742518724, "learning_rate": 9.989345995875973e-05, "loss": 2.1393, "step": 2692 }, { "epoch": 0.020943312294105388, "grad_norm": 0.12148175742341563, "learning_rate": 9.989338022665635e-05, "loss": 2.1586, "step": 2693 }, { "epoch": 0.02095108923888597, "grad_norm": 0.14252016481402124, "learning_rate": 9.989330046476113e-05, "loss": 2.0711, "step": 2694 }, { "epoch": 0.02095886618366655, "grad_norm": 0.13127458678318835, "learning_rate": 9.989322067307412e-05, "loss": 2.1596, "step": 2695 }, { "epoch": 0.02096664312844713, "grad_norm": 0.13405211911104617, "learning_rate": 9.989314085159538e-05, "loss": 2.1576, "step": 2696 }, { "epoch": 0.020974420073227713, "grad_norm": 0.12402614483795894, "learning_rate": 9.989306100032496e-05, "loss": 2.1474, "step": 2697 }, { "epoch": 0.020982197018008294, "grad_norm": 0.11875026236201155, "learning_rate": 9.98929811192629e-05, "loss": 2.1525, "step": 2698 }, { "epoch": 0.020989973962788875, "grad_norm": 0.11580916535257083, "learning_rate": 9.989290120840923e-05, "loss": 2.21, "step": 2699 }, { "epoch": 0.020997750907569456, "grad_norm": 0.12381458818108766, "learning_rate": 9.989282126776405e-05, "loss": 2.2468, "step": 2700 }, { "epoch": 0.021005527852350037, "grad_norm": 0.12004885937557785, "learning_rate": 9.989274129732735e-05, "loss": 2.1604, "step": 2701 }, { "epoch": 0.02101330479713062, "grad_norm": 0.118218777999041, "learning_rate": 9.989266129709922e-05, "loss": 2.1349, "step": 2702 }, { "epoch": 0.0210210817419112, "grad_norm": 0.13553158901678616, "learning_rate": 9.989258126707967e-05, "loss": 2.1159, "step": 2703 }, { "epoch": 0.02102885868669178, "grad_norm": 0.15133571422355907, "learning_rate": 9.989250120726879e-05, "loss": 2.0952, "step": 2704 }, { "epoch": 0.021036635631472362, "grad_norm": 0.15514179399024555, "learning_rate": 9.989242111766659e-05, "loss": 2.1339, "step": 2705 }, { "epoch": 0.021044412576252943, "grad_norm": 0.13884904788575805, "learning_rate": 9.989234099827316e-05, "loss": 2.1575, "step": 2706 }, { "epoch": 0.021052189521033524, "grad_norm": 0.11837280412821162, "learning_rate": 9.989226084908849e-05, "loss": 2.1656, "step": 2707 }, { "epoch": 0.021059966465814105, "grad_norm": 0.1185603602387159, "learning_rate": 9.989218067011267e-05, "loss": 2.1412, "step": 2708 }, { "epoch": 0.021067743410594687, "grad_norm": 0.1430044034989132, "learning_rate": 9.989210046134574e-05, "loss": 2.1521, "step": 2709 }, { "epoch": 0.021075520355375268, "grad_norm": 0.17018946144131636, "learning_rate": 9.989202022278774e-05, "loss": 2.1434, "step": 2710 }, { "epoch": 0.02108329730015585, "grad_norm": 0.18449195748573108, "learning_rate": 9.989193995443873e-05, "loss": 2.1302, "step": 2711 }, { "epoch": 0.02109107424493643, "grad_norm": 0.17395337907916422, "learning_rate": 9.989185965629875e-05, "loss": 2.1053, "step": 2712 }, { "epoch": 0.02109885118971701, "grad_norm": 0.18188497997061256, "learning_rate": 9.989177932836784e-05, "loss": 2.1362, "step": 2713 }, { "epoch": 0.021106628134497592, "grad_norm": 0.1318819098347269, "learning_rate": 9.989169897064607e-05, "loss": 2.1566, "step": 2714 }, { "epoch": 0.021114405079278174, "grad_norm": 0.17521707164630368, "learning_rate": 9.989161858313347e-05, "loss": 2.1525, "step": 2715 }, { "epoch": 0.021122182024058755, "grad_norm": 0.19018581165121937, "learning_rate": 9.989153816583009e-05, "loss": 2.0978, "step": 2716 }, { "epoch": 0.02112995896883934, "grad_norm": 0.1451999341047759, "learning_rate": 9.989145771873598e-05, "loss": 2.1315, "step": 2717 }, { "epoch": 0.02113773591361992, "grad_norm": 0.1171134928202439, "learning_rate": 9.98913772418512e-05, "loss": 2.1104, "step": 2718 }, { "epoch": 0.0211455128584005, "grad_norm": 0.15600025079852722, "learning_rate": 9.989129673517577e-05, "loss": 2.0913, "step": 2719 }, { "epoch": 0.021153289803181083, "grad_norm": 0.14062549022846577, "learning_rate": 9.989121619870977e-05, "loss": 2.1626, "step": 2720 }, { "epoch": 0.021161066747961664, "grad_norm": 0.11350645794797501, "learning_rate": 9.989113563245322e-05, "loss": 2.1213, "step": 2721 }, { "epoch": 0.021168843692742245, "grad_norm": 0.1642306022926931, "learning_rate": 9.98910550364062e-05, "loss": 2.1366, "step": 2722 }, { "epoch": 0.021176620637522826, "grad_norm": 0.19421415248081636, "learning_rate": 9.989097441056873e-05, "loss": 2.1332, "step": 2723 }, { "epoch": 0.021184397582303408, "grad_norm": 0.1566121797689557, "learning_rate": 9.989089375494086e-05, "loss": 2.1579, "step": 2724 }, { "epoch": 0.02119217452708399, "grad_norm": 0.11562349462625567, "learning_rate": 9.989081306952266e-05, "loss": 2.1793, "step": 2725 }, { "epoch": 0.02119995147186457, "grad_norm": 0.18637077301264568, "learning_rate": 9.989073235431415e-05, "loss": 2.1464, "step": 2726 }, { "epoch": 0.02120772841664515, "grad_norm": 0.21739534592270854, "learning_rate": 9.98906516093154e-05, "loss": 2.0777, "step": 2727 }, { "epoch": 0.021215505361425732, "grad_norm": 0.3467137275982058, "learning_rate": 9.989057083452644e-05, "loss": 2.1598, "step": 2728 }, { "epoch": 0.021223282306206313, "grad_norm": 0.1288199978123545, "learning_rate": 9.989049002994735e-05, "loss": 2.1394, "step": 2729 }, { "epoch": 0.021231059250986895, "grad_norm": 0.12795094902407866, "learning_rate": 9.989040919557815e-05, "loss": 2.1744, "step": 2730 }, { "epoch": 0.021238836195767476, "grad_norm": 0.14287256273780957, "learning_rate": 9.989032833141888e-05, "loss": 2.0801, "step": 2731 }, { "epoch": 0.021246613140548057, "grad_norm": 0.15006687437663102, "learning_rate": 9.989024743746962e-05, "loss": 2.1788, "step": 2732 }, { "epoch": 0.021254390085328638, "grad_norm": 0.1308143426265282, "learning_rate": 9.989016651373038e-05, "loss": 2.1675, "step": 2733 }, { "epoch": 0.02126216703010922, "grad_norm": 0.14528281630627246, "learning_rate": 9.989008556020125e-05, "loss": 2.0754, "step": 2734 }, { "epoch": 0.0212699439748898, "grad_norm": 0.20402695307655175, "learning_rate": 9.989000457688226e-05, "loss": 2.1625, "step": 2735 }, { "epoch": 0.02127772091967038, "grad_norm": 0.2109202522678951, "learning_rate": 9.988992356377345e-05, "loss": 2.1335, "step": 2736 }, { "epoch": 0.021285497864450963, "grad_norm": 0.15186849879009331, "learning_rate": 9.988984252087488e-05, "loss": 2.1261, "step": 2737 }, { "epoch": 0.021293274809231544, "grad_norm": 0.1356535401417027, "learning_rate": 9.988976144818659e-05, "loss": 2.1694, "step": 2738 }, { "epoch": 0.021301051754012125, "grad_norm": 0.17137623052509984, "learning_rate": 9.988968034570862e-05, "loss": 2.1233, "step": 2739 }, { "epoch": 0.021308828698792706, "grad_norm": 0.16245852896042, "learning_rate": 9.988959921344104e-05, "loss": 2.1765, "step": 2740 }, { "epoch": 0.021316605643573287, "grad_norm": 0.1336985524661067, "learning_rate": 9.988951805138389e-05, "loss": 2.1233, "step": 2741 }, { "epoch": 0.02132438258835387, "grad_norm": 0.14454197121343235, "learning_rate": 9.988943685953722e-05, "loss": 2.1111, "step": 2742 }, { "epoch": 0.02133215953313445, "grad_norm": 0.13828884162013533, "learning_rate": 9.988935563790108e-05, "loss": 2.149, "step": 2743 }, { "epoch": 0.02133993647791503, "grad_norm": 0.13574949442375164, "learning_rate": 9.98892743864755e-05, "loss": 2.1558, "step": 2744 }, { "epoch": 0.021347713422695612, "grad_norm": 0.1365397438952446, "learning_rate": 9.988919310526055e-05, "loss": 2.1489, "step": 2745 }, { "epoch": 0.021355490367476193, "grad_norm": 0.1296408913077532, "learning_rate": 9.988911179425626e-05, "loss": 2.0697, "step": 2746 }, { "epoch": 0.021363267312256778, "grad_norm": 0.13034366384522314, "learning_rate": 9.98890304534627e-05, "loss": 2.1031, "step": 2747 }, { "epoch": 0.02137104425703736, "grad_norm": 0.11463588033799774, "learning_rate": 9.988894908287992e-05, "loss": 2.0659, "step": 2748 }, { "epoch": 0.02137882120181794, "grad_norm": 0.11929037435334874, "learning_rate": 9.988886768250793e-05, "loss": 2.1433, "step": 2749 }, { "epoch": 0.02138659814659852, "grad_norm": 0.12264969015625027, "learning_rate": 9.988878625234683e-05, "loss": 2.1141, "step": 2750 }, { "epoch": 0.021394375091379102, "grad_norm": 0.11564193829857355, "learning_rate": 9.988870479239663e-05, "loss": 2.1618, "step": 2751 }, { "epoch": 0.021402152036159684, "grad_norm": 0.12580590211925663, "learning_rate": 9.98886233026574e-05, "loss": 2.1926, "step": 2752 }, { "epoch": 0.021409928980940265, "grad_norm": 0.11886303822813754, "learning_rate": 9.988854178312919e-05, "loss": 2.1541, "step": 2753 }, { "epoch": 0.021417705925720846, "grad_norm": 0.13377605874560194, "learning_rate": 9.988846023381201e-05, "loss": 2.1334, "step": 2754 }, { "epoch": 0.021425482870501427, "grad_norm": 0.150606999476985, "learning_rate": 9.988837865470597e-05, "loss": 2.1548, "step": 2755 }, { "epoch": 0.021433259815282008, "grad_norm": 0.1269359441007118, "learning_rate": 9.988829704581109e-05, "loss": 2.1139, "step": 2756 }, { "epoch": 0.02144103676006259, "grad_norm": 0.1172729799125463, "learning_rate": 9.98882154071274e-05, "loss": 2.1295, "step": 2757 }, { "epoch": 0.02144881370484317, "grad_norm": 0.19185372059714703, "learning_rate": 9.988813373865496e-05, "loss": 2.1781, "step": 2758 }, { "epoch": 0.02145659064962375, "grad_norm": 0.12435613947339146, "learning_rate": 9.988805204039383e-05, "loss": 2.1432, "step": 2759 }, { "epoch": 0.021464367594404333, "grad_norm": 0.11657135940378902, "learning_rate": 9.988797031234406e-05, "loss": 2.1548, "step": 2760 }, { "epoch": 0.021472144539184914, "grad_norm": 0.11747899993028234, "learning_rate": 9.988788855450568e-05, "loss": 2.1323, "step": 2761 }, { "epoch": 0.021479921483965495, "grad_norm": 0.13576833502041066, "learning_rate": 9.988780676687877e-05, "loss": 2.14, "step": 2762 }, { "epoch": 0.021487698428746076, "grad_norm": 0.15273558073723156, "learning_rate": 9.988772494946334e-05, "loss": 2.1382, "step": 2763 }, { "epoch": 0.021495475373526658, "grad_norm": 0.13208457820989464, "learning_rate": 9.988764310225946e-05, "loss": 2.1304, "step": 2764 }, { "epoch": 0.02150325231830724, "grad_norm": 0.1220112581184959, "learning_rate": 9.988756122526717e-05, "loss": 2.145, "step": 2765 }, { "epoch": 0.02151102926308782, "grad_norm": 0.16307299579108567, "learning_rate": 9.988747931848654e-05, "loss": 2.1392, "step": 2766 }, { "epoch": 0.0215188062078684, "grad_norm": 0.18032879483736536, "learning_rate": 9.98873973819176e-05, "loss": 2.1642, "step": 2767 }, { "epoch": 0.021526583152648982, "grad_norm": 0.34154776501125267, "learning_rate": 9.98873154155604e-05, "loss": 2.1128, "step": 2768 }, { "epoch": 0.021534360097429563, "grad_norm": 0.13536074271574502, "learning_rate": 9.988723341941501e-05, "loss": 2.1284, "step": 2769 }, { "epoch": 0.021542137042210144, "grad_norm": 0.13328631378060793, "learning_rate": 9.988715139348144e-05, "loss": 2.1594, "step": 2770 }, { "epoch": 0.021549913986990726, "grad_norm": 0.4714543382214588, "learning_rate": 9.988706933775978e-05, "loss": 2.0799, "step": 2771 }, { "epoch": 0.021557690931771307, "grad_norm": 0.14585601609531249, "learning_rate": 9.988698725225003e-05, "loss": 2.1717, "step": 2772 }, { "epoch": 0.021565467876551888, "grad_norm": 0.1367080924099921, "learning_rate": 9.988690513695229e-05, "loss": 2.1677, "step": 2773 }, { "epoch": 0.02157324482133247, "grad_norm": 0.13803860088590655, "learning_rate": 9.988682299186658e-05, "loss": 2.1118, "step": 2774 }, { "epoch": 0.02158102176611305, "grad_norm": 0.17069888091651245, "learning_rate": 9.988674081699297e-05, "loss": 2.1471, "step": 2775 }, { "epoch": 0.02158879871089363, "grad_norm": 0.19040039876490492, "learning_rate": 9.988665861233149e-05, "loss": 2.1344, "step": 2776 }, { "epoch": 0.021596575655674216, "grad_norm": 0.20570214315720728, "learning_rate": 9.988657637788217e-05, "loss": 2.1888, "step": 2777 }, { "epoch": 0.021604352600454797, "grad_norm": 0.1889485310336693, "learning_rate": 9.988649411364511e-05, "loss": 2.1133, "step": 2778 }, { "epoch": 0.02161212954523538, "grad_norm": 0.15518603709061937, "learning_rate": 9.988641181962033e-05, "loss": 2.1136, "step": 2779 }, { "epoch": 0.02161990649001596, "grad_norm": 0.1210145803617002, "learning_rate": 9.988632949580787e-05, "loss": 2.199, "step": 2780 }, { "epoch": 0.02162768343479654, "grad_norm": 0.1785625117364361, "learning_rate": 9.988624714220781e-05, "loss": 2.1204, "step": 2781 }, { "epoch": 0.021635460379577122, "grad_norm": 0.19685740934428517, "learning_rate": 9.988616475882018e-05, "loss": 2.1026, "step": 2782 }, { "epoch": 0.021643237324357703, "grad_norm": 0.1508769894871914, "learning_rate": 9.9886082345645e-05, "loss": 2.1126, "step": 2783 }, { "epoch": 0.021651014269138284, "grad_norm": 0.1259245516365, "learning_rate": 9.988599990268236e-05, "loss": 2.1577, "step": 2784 }, { "epoch": 0.021658791213918865, "grad_norm": 0.15188505329982344, "learning_rate": 9.988591742993231e-05, "loss": 2.1894, "step": 2785 }, { "epoch": 0.021666568158699447, "grad_norm": 0.1856237414960377, "learning_rate": 9.988583492739489e-05, "loss": 2.1331, "step": 2786 }, { "epoch": 0.021674345103480028, "grad_norm": 0.18946090506882882, "learning_rate": 9.988575239507014e-05, "loss": 2.1251, "step": 2787 }, { "epoch": 0.02168212204826061, "grad_norm": 0.16311202810863337, "learning_rate": 9.988566983295813e-05, "loss": 2.1551, "step": 2788 }, { "epoch": 0.02168989899304119, "grad_norm": 0.145054279224432, "learning_rate": 9.988558724105888e-05, "loss": 2.1982, "step": 2789 }, { "epoch": 0.02169767593782177, "grad_norm": 0.12363695600336107, "learning_rate": 9.988550461937245e-05, "loss": 2.1115, "step": 2790 }, { "epoch": 0.021705452882602352, "grad_norm": 0.16078534061853192, "learning_rate": 9.988542196789889e-05, "loss": 2.1299, "step": 2791 }, { "epoch": 0.021713229827382934, "grad_norm": 0.19694675701146108, "learning_rate": 9.988533928663826e-05, "loss": 2.117, "step": 2792 }, { "epoch": 0.021721006772163515, "grad_norm": 0.1483609313012862, "learning_rate": 9.988525657559062e-05, "loss": 2.1525, "step": 2793 }, { "epoch": 0.021728783716944096, "grad_norm": 0.13590625193090952, "learning_rate": 9.988517383475598e-05, "loss": 2.0987, "step": 2794 }, { "epoch": 0.021736560661724677, "grad_norm": 0.21715898567879388, "learning_rate": 9.988509106413442e-05, "loss": 2.1274, "step": 2795 }, { "epoch": 0.021744337606505258, "grad_norm": 0.25647650140764194, "learning_rate": 9.988500826372598e-05, "loss": 2.0856, "step": 2796 }, { "epoch": 0.02175211455128584, "grad_norm": 0.19753548939137736, "learning_rate": 9.98849254335307e-05, "loss": 2.0828, "step": 2797 }, { "epoch": 0.02175989149606642, "grad_norm": 0.12661594174729027, "learning_rate": 9.988484257354866e-05, "loss": 2.1283, "step": 2798 }, { "epoch": 0.021767668440847, "grad_norm": 0.18128855323723095, "learning_rate": 9.988475968377987e-05, "loss": 2.1122, "step": 2799 }, { "epoch": 0.021775445385627583, "grad_norm": 0.24890539135891143, "learning_rate": 9.988467676422441e-05, "loss": 2.0624, "step": 2800 }, { "epoch": 0.021783222330408164, "grad_norm": 0.15837717718121558, "learning_rate": 9.988459381488231e-05, "loss": 2.1566, "step": 2801 }, { "epoch": 0.021790999275188745, "grad_norm": 0.13611092095223665, "learning_rate": 9.988451083575363e-05, "loss": 2.1048, "step": 2802 }, { "epoch": 0.021798776219969326, "grad_norm": 0.18245873654684416, "learning_rate": 9.988442782683842e-05, "loss": 2.1474, "step": 2803 }, { "epoch": 0.021806553164749908, "grad_norm": 0.15356376830672985, "learning_rate": 9.988434478813673e-05, "loss": 2.1463, "step": 2804 }, { "epoch": 0.02181433010953049, "grad_norm": 0.13101429091877517, "learning_rate": 9.988426171964859e-05, "loss": 2.156, "step": 2805 }, { "epoch": 0.02182210705431107, "grad_norm": 0.17665951827466236, "learning_rate": 9.988417862137407e-05, "loss": 2.1205, "step": 2806 }, { "epoch": 0.021829883999091654, "grad_norm": 0.17223267458683797, "learning_rate": 9.988409549331323e-05, "loss": 2.1224, "step": 2807 }, { "epoch": 0.021837660943872236, "grad_norm": 0.12152080388539224, "learning_rate": 9.98840123354661e-05, "loss": 2.1132, "step": 2808 }, { "epoch": 0.021845437888652817, "grad_norm": 0.14248355748717986, "learning_rate": 9.988392914783274e-05, "loss": 2.0968, "step": 2809 }, { "epoch": 0.021853214833433398, "grad_norm": 0.5609130416332327, "learning_rate": 9.988384593041318e-05, "loss": 2.1654, "step": 2810 }, { "epoch": 0.02186099177821398, "grad_norm": 0.38788279196359915, "learning_rate": 9.988376268320749e-05, "loss": 2.1214, "step": 2811 }, { "epoch": 0.02186876872299456, "grad_norm": 0.5279570619582197, "learning_rate": 9.988367940621571e-05, "loss": 2.1345, "step": 2812 }, { "epoch": 0.02187654566777514, "grad_norm": 0.3560118984248632, "learning_rate": 9.988359609943791e-05, "loss": 2.1277, "step": 2813 }, { "epoch": 0.021884322612555723, "grad_norm": 0.14512853307373028, "learning_rate": 9.98835127628741e-05, "loss": 2.1336, "step": 2814 }, { "epoch": 0.021892099557336304, "grad_norm": 0.2599872017844304, "learning_rate": 9.988342939652438e-05, "loss": 2.121, "step": 2815 }, { "epoch": 0.021899876502116885, "grad_norm": 0.3336661029465906, "learning_rate": 9.988334600038875e-05, "loss": 2.1029, "step": 2816 }, { "epoch": 0.021907653446897466, "grad_norm": 0.29366280643618625, "learning_rate": 9.988326257446729e-05, "loss": 2.1903, "step": 2817 }, { "epoch": 0.021915430391678047, "grad_norm": 0.17886761524698744, "learning_rate": 9.988317911876005e-05, "loss": 2.1194, "step": 2818 }, { "epoch": 0.02192320733645863, "grad_norm": 0.15179235546541472, "learning_rate": 9.988309563326706e-05, "loss": 2.1509, "step": 2819 }, { "epoch": 0.02193098428123921, "grad_norm": 0.16236251200093083, "learning_rate": 9.988301211798838e-05, "loss": 2.1808, "step": 2820 }, { "epoch": 0.02193876122601979, "grad_norm": 0.1317691775763766, "learning_rate": 9.988292857292407e-05, "loss": 2.115, "step": 2821 }, { "epoch": 0.021946538170800372, "grad_norm": 0.14260518185935897, "learning_rate": 9.988284499807416e-05, "loss": 2.1412, "step": 2822 }, { "epoch": 0.021954315115580953, "grad_norm": 0.1471782799700863, "learning_rate": 9.988276139343873e-05, "loss": 2.1841, "step": 2823 }, { "epoch": 0.021962092060361534, "grad_norm": 0.2131291327792581, "learning_rate": 9.98826777590178e-05, "loss": 2.1639, "step": 2824 }, { "epoch": 0.021969869005142115, "grad_norm": 0.26303828494587717, "learning_rate": 9.988259409481145e-05, "loss": 2.1624, "step": 2825 }, { "epoch": 0.021977645949922697, "grad_norm": 0.22349717293625704, "learning_rate": 9.988251040081969e-05, "loss": 2.1562, "step": 2826 }, { "epoch": 0.021985422894703278, "grad_norm": 0.13695856540724108, "learning_rate": 9.988242667704259e-05, "loss": 2.1813, "step": 2827 }, { "epoch": 0.02199319983948386, "grad_norm": 0.18134563245025692, "learning_rate": 9.988234292348022e-05, "loss": 2.1663, "step": 2828 }, { "epoch": 0.02200097678426444, "grad_norm": 0.2518351661193138, "learning_rate": 9.98822591401326e-05, "loss": 2.1267, "step": 2829 }, { "epoch": 0.02200875372904502, "grad_norm": 0.24079724125834284, "learning_rate": 9.988217532699979e-05, "loss": 2.1386, "step": 2830 }, { "epoch": 0.022016530673825602, "grad_norm": 0.18556436673168286, "learning_rate": 9.988209148408186e-05, "loss": 2.1503, "step": 2831 }, { "epoch": 0.022024307618606184, "grad_norm": 0.16356890638952606, "learning_rate": 9.988200761137883e-05, "loss": 2.109, "step": 2832 }, { "epoch": 0.022032084563386765, "grad_norm": 0.22284764280615732, "learning_rate": 9.988192370889075e-05, "loss": 2.1501, "step": 2833 }, { "epoch": 0.022039861508167346, "grad_norm": 0.20549388644362354, "learning_rate": 9.988183977661769e-05, "loss": 2.1305, "step": 2834 }, { "epoch": 0.022047638452947927, "grad_norm": 0.13695785548340342, "learning_rate": 9.98817558145597e-05, "loss": 2.1283, "step": 2835 }, { "epoch": 0.02205541539772851, "grad_norm": 0.1813325311987872, "learning_rate": 9.98816718227168e-05, "loss": 2.125, "step": 2836 }, { "epoch": 0.022063192342509093, "grad_norm": 0.18160812021292205, "learning_rate": 9.988158780108908e-05, "loss": 2.1423, "step": 2837 }, { "epoch": 0.022070969287289674, "grad_norm": 0.12863219104198084, "learning_rate": 9.988150374967658e-05, "loss": 2.1163, "step": 2838 }, { "epoch": 0.022078746232070255, "grad_norm": 0.19049750840407081, "learning_rate": 9.988141966847933e-05, "loss": 2.1662, "step": 2839 }, { "epoch": 0.022086523176850836, "grad_norm": 0.16192320595399204, "learning_rate": 9.988133555749738e-05, "loss": 2.096, "step": 2840 }, { "epoch": 0.022094300121631417, "grad_norm": 0.1544579182633771, "learning_rate": 9.988125141673082e-05, "loss": 2.131, "step": 2841 }, { "epoch": 0.022102077066412, "grad_norm": 0.11823803456035081, "learning_rate": 9.988116724617966e-05, "loss": 2.1009, "step": 2842 }, { "epoch": 0.02210985401119258, "grad_norm": 0.2214109858237938, "learning_rate": 9.988108304584397e-05, "loss": 2.1235, "step": 2843 }, { "epoch": 0.02211763095597316, "grad_norm": 0.3379507084314648, "learning_rate": 9.988099881572379e-05, "loss": 2.1972, "step": 2844 }, { "epoch": 0.022125407900753742, "grad_norm": 0.11675406538989865, "learning_rate": 9.988091455581918e-05, "loss": 2.084, "step": 2845 }, { "epoch": 0.022133184845534323, "grad_norm": 0.18871729930374842, "learning_rate": 9.988083026613017e-05, "loss": 2.145, "step": 2846 }, { "epoch": 0.022140961790314904, "grad_norm": 0.209049998112955, "learning_rate": 9.988074594665683e-05, "loss": 2.1538, "step": 2847 }, { "epoch": 0.022148738735095486, "grad_norm": 0.1428196305959334, "learning_rate": 9.988066159739922e-05, "loss": 2.0908, "step": 2848 }, { "epoch": 0.022156515679876067, "grad_norm": 0.23482489328273484, "learning_rate": 9.988057721835735e-05, "loss": 2.1432, "step": 2849 }, { "epoch": 0.022164292624656648, "grad_norm": 0.19432504873553116, "learning_rate": 9.988049280953131e-05, "loss": 2.1285, "step": 2850 }, { "epoch": 0.02217206956943723, "grad_norm": 0.14463849740041898, "learning_rate": 9.988040837092114e-05, "loss": 2.1379, "step": 2851 }, { "epoch": 0.02217984651421781, "grad_norm": 0.1348954677481285, "learning_rate": 9.988032390252688e-05, "loss": 2.1621, "step": 2852 }, { "epoch": 0.02218762345899839, "grad_norm": 0.1766530542466337, "learning_rate": 9.988023940434859e-05, "loss": 2.1649, "step": 2853 }, { "epoch": 0.022195400403778973, "grad_norm": 0.1816114194201105, "learning_rate": 9.98801548763863e-05, "loss": 2.0653, "step": 2854 }, { "epoch": 0.022203177348559554, "grad_norm": 0.11203670904331843, "learning_rate": 9.988007031864011e-05, "loss": 2.123, "step": 2855 }, { "epoch": 0.022210954293340135, "grad_norm": 0.14381355973810178, "learning_rate": 9.987998573111003e-05, "loss": 2.1372, "step": 2856 }, { "epoch": 0.022218731238120716, "grad_norm": 0.14119781802775053, "learning_rate": 9.98799011137961e-05, "loss": 2.1198, "step": 2857 }, { "epoch": 0.022226508182901297, "grad_norm": 0.12262588861962, "learning_rate": 9.987981646669842e-05, "loss": 2.1549, "step": 2858 }, { "epoch": 0.02223428512768188, "grad_norm": 0.1278263003332909, "learning_rate": 9.987973178981697e-05, "loss": 2.1304, "step": 2859 }, { "epoch": 0.02224206207246246, "grad_norm": 0.15614026574692327, "learning_rate": 9.987964708315187e-05, "loss": 2.1625, "step": 2860 }, { "epoch": 0.02224983901724304, "grad_norm": 0.15052196175439342, "learning_rate": 9.987956234670315e-05, "loss": 2.0898, "step": 2861 }, { "epoch": 0.022257615962023622, "grad_norm": 0.12285445593530238, "learning_rate": 9.987947758047083e-05, "loss": 2.1002, "step": 2862 }, { "epoch": 0.022265392906804203, "grad_norm": 0.12666583176518478, "learning_rate": 9.9879392784455e-05, "loss": 2.1126, "step": 2863 }, { "epoch": 0.022273169851584784, "grad_norm": 0.14628872580383, "learning_rate": 9.987930795865567e-05, "loss": 2.108, "step": 2864 }, { "epoch": 0.022280946796365365, "grad_norm": 0.14510707973802903, "learning_rate": 9.987922310307293e-05, "loss": 2.1791, "step": 2865 }, { "epoch": 0.02228872374114595, "grad_norm": 0.1260405511643196, "learning_rate": 9.987913821770681e-05, "loss": 2.1614, "step": 2866 }, { "epoch": 0.02229650068592653, "grad_norm": 0.12997218776770816, "learning_rate": 9.987905330255735e-05, "loss": 2.1595, "step": 2867 }, { "epoch": 0.022304277630707112, "grad_norm": 0.1742852969400213, "learning_rate": 9.987896835762465e-05, "loss": 2.0884, "step": 2868 }, { "epoch": 0.022312054575487694, "grad_norm": 0.1320454299878025, "learning_rate": 9.987888338290869e-05, "loss": 2.1062, "step": 2869 }, { "epoch": 0.022319831520268275, "grad_norm": 0.12338577436809105, "learning_rate": 9.987879837840959e-05, "loss": 2.0994, "step": 2870 }, { "epoch": 0.022327608465048856, "grad_norm": 0.14849743949391692, "learning_rate": 9.987871334412735e-05, "loss": 2.091, "step": 2871 }, { "epoch": 0.022335385409829437, "grad_norm": 0.17020767432978665, "learning_rate": 9.987862828006204e-05, "loss": 2.1012, "step": 2872 }, { "epoch": 0.022343162354610018, "grad_norm": 0.12421646658942313, "learning_rate": 9.987854318621372e-05, "loss": 2.123, "step": 2873 }, { "epoch": 0.0223509392993906, "grad_norm": 0.12236846589340225, "learning_rate": 9.987845806258242e-05, "loss": 2.1395, "step": 2874 }, { "epoch": 0.02235871624417118, "grad_norm": 0.23751734396729668, "learning_rate": 9.98783729091682e-05, "loss": 2.1135, "step": 2875 }, { "epoch": 0.02236649318895176, "grad_norm": 0.16765605268148276, "learning_rate": 9.987828772597112e-05, "loss": 2.1369, "step": 2876 }, { "epoch": 0.022374270133732343, "grad_norm": 0.17304514513533478, "learning_rate": 9.987820251299122e-05, "loss": 2.1269, "step": 2877 }, { "epoch": 0.022382047078512924, "grad_norm": 0.12373711984553132, "learning_rate": 9.987811727022856e-05, "loss": 2.1259, "step": 2878 }, { "epoch": 0.022389824023293505, "grad_norm": 0.14745475799315866, "learning_rate": 9.987803199768317e-05, "loss": 2.1514, "step": 2879 }, { "epoch": 0.022397600968074086, "grad_norm": 0.13409976901345397, "learning_rate": 9.987794669535512e-05, "loss": 2.1147, "step": 2880 }, { "epoch": 0.022405377912854667, "grad_norm": 0.11704474214974393, "learning_rate": 9.987786136324446e-05, "loss": 2.1341, "step": 2881 }, { "epoch": 0.02241315485763525, "grad_norm": 0.12720393820707093, "learning_rate": 9.987777600135124e-05, "loss": 2.1289, "step": 2882 }, { "epoch": 0.02242093180241583, "grad_norm": 0.13359459517378366, "learning_rate": 9.987769060967552e-05, "loss": 2.166, "step": 2883 }, { "epoch": 0.02242870874719641, "grad_norm": 0.1137258879929694, "learning_rate": 9.987760518821732e-05, "loss": 2.1358, "step": 2884 }, { "epoch": 0.022436485691976992, "grad_norm": 0.16049088123242056, "learning_rate": 9.98775197369767e-05, "loss": 2.1207, "step": 2885 }, { "epoch": 0.022444262636757573, "grad_norm": 0.20759852485794653, "learning_rate": 9.987743425595375e-05, "loss": 2.1534, "step": 2886 }, { "epoch": 0.022452039581538154, "grad_norm": 0.16439658836013368, "learning_rate": 9.987734874514847e-05, "loss": 2.1101, "step": 2887 }, { "epoch": 0.022459816526318736, "grad_norm": 0.24182975549148708, "learning_rate": 9.987726320456095e-05, "loss": 2.1562, "step": 2888 }, { "epoch": 0.022467593471099317, "grad_norm": 0.1615012252725805, "learning_rate": 9.987717763419122e-05, "loss": 2.1281, "step": 2889 }, { "epoch": 0.022475370415879898, "grad_norm": 0.2219074597752777, "learning_rate": 9.987709203403932e-05, "loss": 2.1493, "step": 2890 }, { "epoch": 0.02248314736066048, "grad_norm": 0.22148849558127803, "learning_rate": 9.987700640410533e-05, "loss": 2.1163, "step": 2891 }, { "epoch": 0.02249092430544106, "grad_norm": 0.18666812753664788, "learning_rate": 9.98769207443893e-05, "loss": 2.1083, "step": 2892 }, { "epoch": 0.02249870125022164, "grad_norm": 0.13385750973010727, "learning_rate": 9.987683505489126e-05, "loss": 2.1225, "step": 2893 }, { "epoch": 0.022506478195002223, "grad_norm": 0.1734089495347478, "learning_rate": 9.987674933561127e-05, "loss": 2.1746, "step": 2894 }, { "epoch": 0.022514255139782804, "grad_norm": 0.16338213613103733, "learning_rate": 9.987666358654937e-05, "loss": 2.12, "step": 2895 }, { "epoch": 0.02252203208456339, "grad_norm": 0.12648711411151928, "learning_rate": 9.987657780770564e-05, "loss": 2.1249, "step": 2896 }, { "epoch": 0.02252980902934397, "grad_norm": 0.15082063139706148, "learning_rate": 9.987649199908011e-05, "loss": 2.1187, "step": 2897 }, { "epoch": 0.02253758597412455, "grad_norm": 0.18787364706981574, "learning_rate": 9.987640616067283e-05, "loss": 2.1701, "step": 2898 }, { "epoch": 0.022545362918905132, "grad_norm": 0.173055943840527, "learning_rate": 9.987632029248386e-05, "loss": 2.1098, "step": 2899 }, { "epoch": 0.022553139863685713, "grad_norm": 0.12245833169650883, "learning_rate": 9.987623439451324e-05, "loss": 2.1027, "step": 2900 }, { "epoch": 0.022560916808466294, "grad_norm": 0.1511108880878373, "learning_rate": 9.987614846676104e-05, "loss": 2.0938, "step": 2901 }, { "epoch": 0.022568693753246875, "grad_norm": 0.1636790160471088, "learning_rate": 9.987606250922729e-05, "loss": 2.1393, "step": 2902 }, { "epoch": 0.022576470698027457, "grad_norm": 0.18873533060046557, "learning_rate": 9.987597652191207e-05, "loss": 2.0676, "step": 2903 }, { "epoch": 0.022584247642808038, "grad_norm": 0.13362511285390544, "learning_rate": 9.987589050481541e-05, "loss": 2.1745, "step": 2904 }, { "epoch": 0.02259202458758862, "grad_norm": 0.16024065749283214, "learning_rate": 9.987580445793733e-05, "loss": 2.1015, "step": 2905 }, { "epoch": 0.0225998015323692, "grad_norm": 0.13273367252984045, "learning_rate": 9.987571838127796e-05, "loss": 2.1542, "step": 2906 }, { "epoch": 0.02260757847714978, "grad_norm": 0.12237469200172049, "learning_rate": 9.987563227483728e-05, "loss": 2.1669, "step": 2907 }, { "epoch": 0.022615355421930362, "grad_norm": 0.15091724664510475, "learning_rate": 9.987554613861539e-05, "loss": 2.1962, "step": 2908 }, { "epoch": 0.022623132366710944, "grad_norm": 0.16517556862774943, "learning_rate": 9.98754599726123e-05, "loss": 2.1315, "step": 2909 }, { "epoch": 0.022630909311491525, "grad_norm": 0.13704886254614249, "learning_rate": 9.987537377682808e-05, "loss": 2.1637, "step": 2910 }, { "epoch": 0.022638686256272106, "grad_norm": 0.12589012496232976, "learning_rate": 9.987528755126279e-05, "loss": 2.1655, "step": 2911 }, { "epoch": 0.022646463201052687, "grad_norm": 0.1657079512040069, "learning_rate": 9.987520129591647e-05, "loss": 2.1144, "step": 2912 }, { "epoch": 0.022654240145833268, "grad_norm": 0.16115935055440533, "learning_rate": 9.987511501078918e-05, "loss": 2.1109, "step": 2913 }, { "epoch": 0.02266201709061385, "grad_norm": 0.11884275835527422, "learning_rate": 9.987502869588099e-05, "loss": 2.1541, "step": 2914 }, { "epoch": 0.02266979403539443, "grad_norm": 0.15635118435329748, "learning_rate": 9.98749423511919e-05, "loss": 2.207, "step": 2915 }, { "epoch": 0.02267757098017501, "grad_norm": 0.18080890067458316, "learning_rate": 9.9874855976722e-05, "loss": 2.1793, "step": 2916 }, { "epoch": 0.022685347924955593, "grad_norm": 0.23616395943672017, "learning_rate": 9.987476957247133e-05, "loss": 2.1599, "step": 2917 }, { "epoch": 0.022693124869736174, "grad_norm": 0.11303334731804898, "learning_rate": 9.987468313843995e-05, "loss": 2.1107, "step": 2918 }, { "epoch": 0.022700901814516755, "grad_norm": 0.13736483257497537, "learning_rate": 9.987459667462789e-05, "loss": 2.0804, "step": 2919 }, { "epoch": 0.022708678759297336, "grad_norm": 0.12651016222806, "learning_rate": 9.987451018103524e-05, "loss": 2.1506, "step": 2920 }, { "epoch": 0.022716455704077917, "grad_norm": 0.18793084616634864, "learning_rate": 9.987442365766201e-05, "loss": 2.1178, "step": 2921 }, { "epoch": 0.0227242326488585, "grad_norm": 0.12832173456215712, "learning_rate": 9.987433710450826e-05, "loss": 2.1608, "step": 2922 }, { "epoch": 0.02273200959363908, "grad_norm": 0.1312840857757204, "learning_rate": 9.987425052157408e-05, "loss": 2.1227, "step": 2923 }, { "epoch": 0.02273978653841966, "grad_norm": 0.2647182081182881, "learning_rate": 9.987416390885948e-05, "loss": 2.1818, "step": 2924 }, { "epoch": 0.022747563483200246, "grad_norm": 0.12843176219561306, "learning_rate": 9.987407726636453e-05, "loss": 2.1322, "step": 2925 }, { "epoch": 0.022755340427980827, "grad_norm": 0.11778551532724499, "learning_rate": 9.987399059408927e-05, "loss": 2.1479, "step": 2926 }, { "epoch": 0.022763117372761408, "grad_norm": 0.12136947446495525, "learning_rate": 9.987390389203377e-05, "loss": 2.1395, "step": 2927 }, { "epoch": 0.02277089431754199, "grad_norm": 0.12450058344021478, "learning_rate": 9.987381716019805e-05, "loss": 2.1063, "step": 2928 }, { "epoch": 0.02277867126232257, "grad_norm": 0.12144419172744905, "learning_rate": 9.98737303985822e-05, "loss": 2.0729, "step": 2929 }, { "epoch": 0.02278644820710315, "grad_norm": 0.12053742344142722, "learning_rate": 9.987364360718625e-05, "loss": 2.1303, "step": 2930 }, { "epoch": 0.022794225151883733, "grad_norm": 0.13575089279144034, "learning_rate": 9.987355678601025e-05, "loss": 2.1093, "step": 2931 }, { "epoch": 0.022802002096664314, "grad_norm": 0.1251297120633131, "learning_rate": 9.987346993505426e-05, "loss": 2.1086, "step": 2932 }, { "epoch": 0.022809779041444895, "grad_norm": 0.15709469472988175, "learning_rate": 9.987338305431835e-05, "loss": 2.1103, "step": 2933 }, { "epoch": 0.022817555986225476, "grad_norm": 0.12684373269545993, "learning_rate": 9.987329614380253e-05, "loss": 2.1432, "step": 2934 }, { "epoch": 0.022825332931006057, "grad_norm": 0.12529723789795758, "learning_rate": 9.987320920350688e-05, "loss": 2.1305, "step": 2935 }, { "epoch": 0.02283310987578664, "grad_norm": 0.11782757167406806, "learning_rate": 9.987312223343144e-05, "loss": 2.1265, "step": 2936 }, { "epoch": 0.02284088682056722, "grad_norm": 0.12207156748942888, "learning_rate": 9.987303523357627e-05, "loss": 2.1406, "step": 2937 }, { "epoch": 0.0228486637653478, "grad_norm": 0.1539305564500783, "learning_rate": 9.987294820394141e-05, "loss": 2.1376, "step": 2938 }, { "epoch": 0.022856440710128382, "grad_norm": 0.17723814491885437, "learning_rate": 9.987286114452694e-05, "loss": 2.1779, "step": 2939 }, { "epoch": 0.022864217654908963, "grad_norm": 0.15808867187202177, "learning_rate": 9.987277405533289e-05, "loss": 2.1036, "step": 2940 }, { "epoch": 0.022871994599689544, "grad_norm": 0.12176936884443242, "learning_rate": 9.987268693635931e-05, "loss": 2.1113, "step": 2941 }, { "epoch": 0.022879771544470125, "grad_norm": 0.14987036086640523, "learning_rate": 9.987259978760625e-05, "loss": 2.1412, "step": 2942 }, { "epoch": 0.022887548489250707, "grad_norm": 0.17439259200120333, "learning_rate": 9.987251260907378e-05, "loss": 2.0825, "step": 2943 }, { "epoch": 0.022895325434031288, "grad_norm": 0.15175419563192763, "learning_rate": 9.987242540076193e-05, "loss": 2.0995, "step": 2944 }, { "epoch": 0.02290310237881187, "grad_norm": 0.11944390546726405, "learning_rate": 9.987233816267078e-05, "loss": 2.1189, "step": 2945 }, { "epoch": 0.02291087932359245, "grad_norm": 0.16375142784918548, "learning_rate": 9.987225089480037e-05, "loss": 2.1809, "step": 2946 }, { "epoch": 0.02291865626837303, "grad_norm": 0.19315048112405883, "learning_rate": 9.987216359715073e-05, "loss": 2.1598, "step": 2947 }, { "epoch": 0.022926433213153612, "grad_norm": 0.17735017950160847, "learning_rate": 9.987207626972192e-05, "loss": 2.169, "step": 2948 }, { "epoch": 0.022934210157934194, "grad_norm": 0.14731976029170396, "learning_rate": 9.987198891251403e-05, "loss": 2.1874, "step": 2949 }, { "epoch": 0.022941987102714775, "grad_norm": 0.1216041980455024, "learning_rate": 9.987190152552706e-05, "loss": 2.1067, "step": 2950 }, { "epoch": 0.022949764047495356, "grad_norm": 0.1400739279247627, "learning_rate": 9.98718141087611e-05, "loss": 2.1583, "step": 2951 }, { "epoch": 0.022957540992275937, "grad_norm": 0.1452790235032141, "learning_rate": 9.987172666221619e-05, "loss": 2.169, "step": 2952 }, { "epoch": 0.022965317937056518, "grad_norm": 0.11662255562263603, "learning_rate": 9.987163918589237e-05, "loss": 2.0929, "step": 2953 }, { "epoch": 0.0229730948818371, "grad_norm": 0.12702713247168784, "learning_rate": 9.987155167978972e-05, "loss": 2.1634, "step": 2954 }, { "epoch": 0.022980871826617684, "grad_norm": 0.12439927221635878, "learning_rate": 9.987146414390825e-05, "loss": 2.1128, "step": 2955 }, { "epoch": 0.022988648771398265, "grad_norm": 0.1262708597883172, "learning_rate": 9.987137657824806e-05, "loss": 2.1917, "step": 2956 }, { "epoch": 0.022996425716178846, "grad_norm": 0.12895703196426203, "learning_rate": 9.987128898280918e-05, "loss": 2.1171, "step": 2957 }, { "epoch": 0.023004202660959427, "grad_norm": 0.11614185639917435, "learning_rate": 9.987120135759164e-05, "loss": 2.1414, "step": 2958 }, { "epoch": 0.02301197960574001, "grad_norm": 0.13034467151505982, "learning_rate": 9.987111370259554e-05, "loss": 2.1323, "step": 2959 }, { "epoch": 0.02301975655052059, "grad_norm": 0.11995601725742865, "learning_rate": 9.987102601782088e-05, "loss": 2.1342, "step": 2960 }, { "epoch": 0.02302753349530117, "grad_norm": 0.1178681177354653, "learning_rate": 9.987093830326776e-05, "loss": 2.14, "step": 2961 }, { "epoch": 0.023035310440081752, "grad_norm": 0.12818943189779716, "learning_rate": 9.987085055893621e-05, "loss": 2.109, "step": 2962 }, { "epoch": 0.023043087384862333, "grad_norm": 0.15180901154779622, "learning_rate": 9.987076278482627e-05, "loss": 2.1472, "step": 2963 }, { "epoch": 0.023050864329642914, "grad_norm": 0.16142904626585078, "learning_rate": 9.987067498093802e-05, "loss": 2.1622, "step": 2964 }, { "epoch": 0.023058641274423496, "grad_norm": 0.15265079268007686, "learning_rate": 9.98705871472715e-05, "loss": 2.1341, "step": 2965 }, { "epoch": 0.023066418219204077, "grad_norm": 0.1212182646316636, "learning_rate": 9.987049928382674e-05, "loss": 2.1275, "step": 2966 }, { "epoch": 0.023074195163984658, "grad_norm": 0.15430103806974055, "learning_rate": 9.987041139060382e-05, "loss": 2.1109, "step": 2967 }, { "epoch": 0.02308197210876524, "grad_norm": 0.19449592570742655, "learning_rate": 9.98703234676028e-05, "loss": 2.1092, "step": 2968 }, { "epoch": 0.02308974905354582, "grad_norm": 0.1834515150905365, "learning_rate": 9.987023551482371e-05, "loss": 2.1116, "step": 2969 }, { "epoch": 0.0230975259983264, "grad_norm": 0.13819477865959837, "learning_rate": 9.987014753226663e-05, "loss": 2.1115, "step": 2970 }, { "epoch": 0.023105302943106983, "grad_norm": 0.123769324133944, "learning_rate": 9.987005951993156e-05, "loss": 2.1702, "step": 2971 }, { "epoch": 0.023113079887887564, "grad_norm": 0.13317143855343502, "learning_rate": 9.98699714778186e-05, "loss": 2.1137, "step": 2972 }, { "epoch": 0.023120856832668145, "grad_norm": 0.22797188751434275, "learning_rate": 9.98698834059278e-05, "loss": 2.16, "step": 2973 }, { "epoch": 0.023128633777448726, "grad_norm": 0.11566632129370821, "learning_rate": 9.986979530425919e-05, "loss": 2.1306, "step": 2974 }, { "epoch": 0.023136410722229307, "grad_norm": 0.17462061770582002, "learning_rate": 9.986970717281284e-05, "loss": 2.1301, "step": 2975 }, { "epoch": 0.02314418766700989, "grad_norm": 0.14059732242875006, "learning_rate": 9.98696190115888e-05, "loss": 2.1006, "step": 2976 }, { "epoch": 0.02315196461179047, "grad_norm": 0.12810089687447565, "learning_rate": 9.986953082058711e-05, "loss": 2.1333, "step": 2977 }, { "epoch": 0.02315974155657105, "grad_norm": 0.2483787356366566, "learning_rate": 9.986944259980784e-05, "loss": 2.0487, "step": 2978 }, { "epoch": 0.023167518501351632, "grad_norm": 0.13339355749110499, "learning_rate": 9.986935434925102e-05, "loss": 2.1708, "step": 2979 }, { "epoch": 0.023175295446132213, "grad_norm": 0.13345536184092513, "learning_rate": 9.986926606891674e-05, "loss": 2.1005, "step": 2980 }, { "epoch": 0.023183072390912794, "grad_norm": 0.11985264556934953, "learning_rate": 9.9869177758805e-05, "loss": 2.1676, "step": 2981 }, { "epoch": 0.023190849335693375, "grad_norm": 0.12344861849348496, "learning_rate": 9.98690894189159e-05, "loss": 2.1517, "step": 2982 }, { "epoch": 0.023198626280473957, "grad_norm": 0.11557994072785215, "learning_rate": 9.986900104924948e-05, "loss": 2.1165, "step": 2983 }, { "epoch": 0.023206403225254538, "grad_norm": 0.14035786671257344, "learning_rate": 9.986891264980579e-05, "loss": 2.1196, "step": 2984 }, { "epoch": 0.023214180170035122, "grad_norm": 0.1428467824451323, "learning_rate": 9.986882422058485e-05, "loss": 2.1122, "step": 2985 }, { "epoch": 0.023221957114815703, "grad_norm": 0.40996880311645906, "learning_rate": 9.986873576158677e-05, "loss": 2.0751, "step": 2986 }, { "epoch": 0.023229734059596285, "grad_norm": 0.1854028157984202, "learning_rate": 9.986864727281158e-05, "loss": 2.1417, "step": 2987 }, { "epoch": 0.023237511004376866, "grad_norm": 0.15218584965475096, "learning_rate": 9.986855875425932e-05, "loss": 2.1635, "step": 2988 }, { "epoch": 0.023245287949157447, "grad_norm": 0.11711456154233592, "learning_rate": 9.986847020593006e-05, "loss": 2.0817, "step": 2989 }, { "epoch": 0.023253064893938028, "grad_norm": 0.12161931837334816, "learning_rate": 9.986838162782384e-05, "loss": 2.1517, "step": 2990 }, { "epoch": 0.02326084183871861, "grad_norm": 0.1553314947295464, "learning_rate": 9.986829301994072e-05, "loss": 2.0792, "step": 2991 }, { "epoch": 0.02326861878349919, "grad_norm": 0.16313498715478225, "learning_rate": 9.986820438228075e-05, "loss": 2.1232, "step": 2992 }, { "epoch": 0.02327639572827977, "grad_norm": 0.3633986604268786, "learning_rate": 9.986811571484398e-05, "loss": 2.1276, "step": 2993 }, { "epoch": 0.023284172673060353, "grad_norm": 0.13436139212908518, "learning_rate": 9.986802701763047e-05, "loss": 2.1537, "step": 2994 }, { "epoch": 0.023291949617840934, "grad_norm": 0.12208860715100353, "learning_rate": 9.986793829064028e-05, "loss": 2.1113, "step": 2995 }, { "epoch": 0.023299726562621515, "grad_norm": 0.13240147836777716, "learning_rate": 9.986784953387342e-05, "loss": 2.1296, "step": 2996 }, { "epoch": 0.023307503507402096, "grad_norm": 0.1924769350399737, "learning_rate": 9.986776074733001e-05, "loss": 2.1592, "step": 2997 }, { "epoch": 0.023315280452182677, "grad_norm": 0.2179880333399188, "learning_rate": 9.986767193101006e-05, "loss": 2.1345, "step": 2998 }, { "epoch": 0.02332305739696326, "grad_norm": 0.13779839630430987, "learning_rate": 9.986758308491362e-05, "loss": 2.1531, "step": 2999 }, { "epoch": 0.02333083434174384, "grad_norm": 0.15388076198201814, "learning_rate": 9.986749420904077e-05, "loss": 2.1585, "step": 3000 }, { "epoch": 0.02333861128652442, "grad_norm": 0.243113679926718, "learning_rate": 9.986740530339154e-05, "loss": 2.1428, "step": 3001 }, { "epoch": 0.023346388231305002, "grad_norm": 0.28029177924694854, "learning_rate": 9.986731636796599e-05, "loss": 2.1395, "step": 3002 }, { "epoch": 0.023354165176085583, "grad_norm": 0.24472356748336271, "learning_rate": 9.986722740276418e-05, "loss": 2.154, "step": 3003 }, { "epoch": 0.023361942120866164, "grad_norm": 0.13869547591787598, "learning_rate": 9.986713840778614e-05, "loss": 2.1127, "step": 3004 }, { "epoch": 0.023369719065646746, "grad_norm": 0.14760082521127604, "learning_rate": 9.986704938303196e-05, "loss": 2.153, "step": 3005 }, { "epoch": 0.023377496010427327, "grad_norm": 0.17739693113990856, "learning_rate": 9.986696032850167e-05, "loss": 2.1312, "step": 3006 }, { "epoch": 0.023385272955207908, "grad_norm": 0.12789724517511988, "learning_rate": 9.986687124419533e-05, "loss": 2.1292, "step": 3007 }, { "epoch": 0.02339304989998849, "grad_norm": 0.12972191597309013, "learning_rate": 9.986678213011298e-05, "loss": 2.1486, "step": 3008 }, { "epoch": 0.02340082684476907, "grad_norm": 0.13902914487926216, "learning_rate": 9.986669298625468e-05, "loss": 2.141, "step": 3009 }, { "epoch": 0.02340860378954965, "grad_norm": 0.13977889661035484, "learning_rate": 9.986660381262049e-05, "loss": 2.119, "step": 3010 }, { "epoch": 0.023416380734330233, "grad_norm": 0.120912993057715, "learning_rate": 9.986651460921046e-05, "loss": 2.107, "step": 3011 }, { "epoch": 0.023424157679110814, "grad_norm": 0.11747014056164069, "learning_rate": 9.986642537602467e-05, "loss": 2.1183, "step": 3012 }, { "epoch": 0.023431934623891395, "grad_norm": 0.12153786619931142, "learning_rate": 9.98663361130631e-05, "loss": 2.155, "step": 3013 }, { "epoch": 0.02343971156867198, "grad_norm": 0.1354145301224728, "learning_rate": 9.986624682032588e-05, "loss": 2.0853, "step": 3014 }, { "epoch": 0.02344748851345256, "grad_norm": 0.1477062552545747, "learning_rate": 9.986615749781303e-05, "loss": 2.1594, "step": 3015 }, { "epoch": 0.023455265458233142, "grad_norm": 0.1729143712652148, "learning_rate": 9.986606814552459e-05, "loss": 2.1216, "step": 3016 }, { "epoch": 0.023463042403013723, "grad_norm": 0.15506248899041417, "learning_rate": 9.986597876346064e-05, "loss": 2.1068, "step": 3017 }, { "epoch": 0.023470819347794304, "grad_norm": 0.12529170458780292, "learning_rate": 9.986588935162122e-05, "loss": 2.1604, "step": 3018 }, { "epoch": 0.023478596292574885, "grad_norm": 0.197133270144163, "learning_rate": 9.986579991000639e-05, "loss": 2.1315, "step": 3019 }, { "epoch": 0.023486373237355467, "grad_norm": 0.2138715407226823, "learning_rate": 9.986571043861621e-05, "loss": 2.1423, "step": 3020 }, { "epoch": 0.023494150182136048, "grad_norm": 0.1412029213920174, "learning_rate": 9.986562093745072e-05, "loss": 2.1732, "step": 3021 }, { "epoch": 0.02350192712691663, "grad_norm": 0.13005246865033482, "learning_rate": 9.986553140650996e-05, "loss": 2.1656, "step": 3022 }, { "epoch": 0.02350970407169721, "grad_norm": 0.17818937508818275, "learning_rate": 9.986544184579401e-05, "loss": 2.1884, "step": 3023 }, { "epoch": 0.02351748101647779, "grad_norm": 0.19294341838609744, "learning_rate": 9.98653522553029e-05, "loss": 2.1459, "step": 3024 }, { "epoch": 0.023525257961258372, "grad_norm": 0.1247621566584058, "learning_rate": 9.98652626350367e-05, "loss": 2.1513, "step": 3025 }, { "epoch": 0.023533034906038953, "grad_norm": 0.1621123841163212, "learning_rate": 9.986517298499548e-05, "loss": 2.0957, "step": 3026 }, { "epoch": 0.023540811850819535, "grad_norm": 0.17273231783691345, "learning_rate": 9.986508330517925e-05, "loss": 2.1518, "step": 3027 }, { "epoch": 0.023548588795600116, "grad_norm": 0.1431214776495398, "learning_rate": 9.986499359558811e-05, "loss": 2.1608, "step": 3028 }, { "epoch": 0.023556365740380697, "grad_norm": 0.13156660799500022, "learning_rate": 9.986490385622208e-05, "loss": 2.1241, "step": 3029 }, { "epoch": 0.023564142685161278, "grad_norm": 0.11878179466319592, "learning_rate": 9.986481408708121e-05, "loss": 2.0991, "step": 3030 }, { "epoch": 0.02357191962994186, "grad_norm": 0.12590830816256549, "learning_rate": 9.98647242881656e-05, "loss": 2.1254, "step": 3031 }, { "epoch": 0.02357969657472244, "grad_norm": 0.134005161022152, "learning_rate": 9.986463445947525e-05, "loss": 2.1278, "step": 3032 }, { "epoch": 0.02358747351950302, "grad_norm": 0.12224298494034348, "learning_rate": 9.986454460101025e-05, "loss": 2.1203, "step": 3033 }, { "epoch": 0.023595250464283603, "grad_norm": 0.1284650458850817, "learning_rate": 9.986445471277062e-05, "loss": 2.1311, "step": 3034 }, { "epoch": 0.023603027409064184, "grad_norm": 0.19128434799404503, "learning_rate": 9.986436479475645e-05, "loss": 2.1627, "step": 3035 }, { "epoch": 0.023610804353844765, "grad_norm": 0.15281892459359211, "learning_rate": 9.986427484696777e-05, "loss": 2.1628, "step": 3036 }, { "epoch": 0.023618581298625346, "grad_norm": 0.13564425806176442, "learning_rate": 9.986418486940463e-05, "loss": 2.1438, "step": 3037 }, { "epoch": 0.023626358243405927, "grad_norm": 0.11395358043613873, "learning_rate": 9.986409486206711e-05, "loss": 2.111, "step": 3038 }, { "epoch": 0.02363413518818651, "grad_norm": 0.13424749961336585, "learning_rate": 9.986400482495523e-05, "loss": 2.1276, "step": 3039 }, { "epoch": 0.02364191213296709, "grad_norm": 0.1380266488867871, "learning_rate": 9.986391475806909e-05, "loss": 2.1644, "step": 3040 }, { "epoch": 0.02364968907774767, "grad_norm": 0.14194020063543486, "learning_rate": 9.986382466140869e-05, "loss": 2.1119, "step": 3041 }, { "epoch": 0.023657466022528252, "grad_norm": 0.13436728141812213, "learning_rate": 9.986373453497411e-05, "loss": 2.1038, "step": 3042 }, { "epoch": 0.023665242967308833, "grad_norm": 0.11252062588693176, "learning_rate": 9.986364437876541e-05, "loss": 2.1734, "step": 3043 }, { "epoch": 0.023673019912089418, "grad_norm": 0.13247247449150926, "learning_rate": 9.986355419278264e-05, "loss": 2.1244, "step": 3044 }, { "epoch": 0.02368079685687, "grad_norm": 0.16688474829915023, "learning_rate": 9.986346397702584e-05, "loss": 2.1054, "step": 3045 }, { "epoch": 0.02368857380165058, "grad_norm": 0.12874654192056129, "learning_rate": 9.98633737314951e-05, "loss": 2.0404, "step": 3046 }, { "epoch": 0.02369635074643116, "grad_norm": 0.1246466956790918, "learning_rate": 9.986328345619042e-05, "loss": 2.1475, "step": 3047 }, { "epoch": 0.023704127691211743, "grad_norm": 0.13462939667450877, "learning_rate": 9.986319315111189e-05, "loss": 2.1496, "step": 3048 }, { "epoch": 0.023711904635992324, "grad_norm": 0.14826839537525272, "learning_rate": 9.986310281625955e-05, "loss": 2.1098, "step": 3049 }, { "epoch": 0.023719681580772905, "grad_norm": 0.14271906708818377, "learning_rate": 9.986301245163348e-05, "loss": 2.1202, "step": 3050 }, { "epoch": 0.023727458525553486, "grad_norm": 0.1616243594082413, "learning_rate": 9.98629220572337e-05, "loss": 2.0912, "step": 3051 }, { "epoch": 0.023735235470334067, "grad_norm": 0.1613617561185001, "learning_rate": 9.986283163306027e-05, "loss": 2.122, "step": 3052 }, { "epoch": 0.02374301241511465, "grad_norm": 0.1476723912097113, "learning_rate": 9.986274117911327e-05, "loss": 2.1397, "step": 3053 }, { "epoch": 0.02375078935989523, "grad_norm": 0.11562531517628867, "learning_rate": 9.986265069539274e-05, "loss": 2.1315, "step": 3054 }, { "epoch": 0.02375856630467581, "grad_norm": 0.14356661054983005, "learning_rate": 9.98625601818987e-05, "loss": 2.0977, "step": 3055 }, { "epoch": 0.023766343249456392, "grad_norm": 0.1564435155443595, "learning_rate": 9.986246963863126e-05, "loss": 2.1315, "step": 3056 }, { "epoch": 0.023774120194236973, "grad_norm": 0.12854965880156047, "learning_rate": 9.986237906559044e-05, "loss": 2.1489, "step": 3057 }, { "epoch": 0.023781897139017554, "grad_norm": 0.12621779528457686, "learning_rate": 9.98622884627763e-05, "loss": 2.1675, "step": 3058 }, { "epoch": 0.023789674083798135, "grad_norm": 0.15824164101295587, "learning_rate": 9.98621978301889e-05, "loss": 2.1433, "step": 3059 }, { "epoch": 0.023797451028578716, "grad_norm": 0.1804999159022409, "learning_rate": 9.98621071678283e-05, "loss": 2.1383, "step": 3060 }, { "epoch": 0.023805227973359298, "grad_norm": 0.15870089099748014, "learning_rate": 9.986201647569452e-05, "loss": 2.1981, "step": 3061 }, { "epoch": 0.02381300491813988, "grad_norm": 0.13300860723407817, "learning_rate": 9.986192575378766e-05, "loss": 2.146, "step": 3062 }, { "epoch": 0.02382078186292046, "grad_norm": 0.12184099139504596, "learning_rate": 9.986183500210773e-05, "loss": 2.1748, "step": 3063 }, { "epoch": 0.02382855880770104, "grad_norm": 0.1330759538856865, "learning_rate": 9.986174422065483e-05, "loss": 2.2069, "step": 3064 }, { "epoch": 0.023836335752481622, "grad_norm": 0.1245014099700863, "learning_rate": 9.986165340942898e-05, "loss": 2.1848, "step": 3065 }, { "epoch": 0.023844112697262203, "grad_norm": 0.11511864998973308, "learning_rate": 9.986156256843025e-05, "loss": 2.1223, "step": 3066 }, { "epoch": 0.023851889642042785, "grad_norm": 0.11424696348181808, "learning_rate": 9.986147169765869e-05, "loss": 2.1202, "step": 3067 }, { "epoch": 0.023859666586823366, "grad_norm": 0.1181348281571488, "learning_rate": 9.986138079711435e-05, "loss": 2.1309, "step": 3068 }, { "epoch": 0.023867443531603947, "grad_norm": 0.11809117875369543, "learning_rate": 9.98612898667973e-05, "loss": 2.0778, "step": 3069 }, { "epoch": 0.023875220476384528, "grad_norm": 0.1349196840300944, "learning_rate": 9.986119890670756e-05, "loss": 2.0976, "step": 3070 }, { "epoch": 0.02388299742116511, "grad_norm": 0.13425108094401259, "learning_rate": 9.986110791684522e-05, "loss": 2.1558, "step": 3071 }, { "epoch": 0.02389077436594569, "grad_norm": 0.12166067809791012, "learning_rate": 9.986101689721033e-05, "loss": 2.1148, "step": 3072 }, { "epoch": 0.02389855131072627, "grad_norm": 0.12881111492688033, "learning_rate": 9.986092584780292e-05, "loss": 2.0795, "step": 3073 }, { "epoch": 0.023906328255506856, "grad_norm": 0.16706899839167508, "learning_rate": 9.986083476862306e-05, "loss": 2.1581, "step": 3074 }, { "epoch": 0.023914105200287437, "grad_norm": 0.15525745203299451, "learning_rate": 9.986074365967081e-05, "loss": 2.1225, "step": 3075 }, { "epoch": 0.02392188214506802, "grad_norm": 0.12853807542442136, "learning_rate": 9.986065252094623e-05, "loss": 2.2007, "step": 3076 }, { "epoch": 0.0239296590898486, "grad_norm": 0.13343557639257417, "learning_rate": 9.986056135244936e-05, "loss": 2.1396, "step": 3077 }, { "epoch": 0.02393743603462918, "grad_norm": 0.13223941105361672, "learning_rate": 9.986047015418025e-05, "loss": 2.1235, "step": 3078 }, { "epoch": 0.023945212979409762, "grad_norm": 0.18322237406957026, "learning_rate": 9.986037892613896e-05, "loss": 2.1344, "step": 3079 }, { "epoch": 0.023952989924190343, "grad_norm": 0.13346255112115052, "learning_rate": 9.986028766832554e-05, "loss": 2.1458, "step": 3080 }, { "epoch": 0.023960766868970924, "grad_norm": 0.11500918710338491, "learning_rate": 9.986019638074006e-05, "loss": 2.144, "step": 3081 }, { "epoch": 0.023968543813751506, "grad_norm": 0.13904894623584138, "learning_rate": 9.986010506338256e-05, "loss": 2.143, "step": 3082 }, { "epoch": 0.023976320758532087, "grad_norm": 0.16120203344676326, "learning_rate": 9.986001371625312e-05, "loss": 2.1491, "step": 3083 }, { "epoch": 0.023984097703312668, "grad_norm": 0.1543457847289743, "learning_rate": 9.985992233935175e-05, "loss": 2.1161, "step": 3084 }, { "epoch": 0.02399187464809325, "grad_norm": 0.12318670512692079, "learning_rate": 9.985983093267855e-05, "loss": 2.1375, "step": 3085 }, { "epoch": 0.02399965159287383, "grad_norm": 0.13619391732601305, "learning_rate": 9.985973949623356e-05, "loss": 2.144, "step": 3086 }, { "epoch": 0.02400742853765441, "grad_norm": 0.22698956254512476, "learning_rate": 9.98596480300168e-05, "loss": 2.1143, "step": 3087 }, { "epoch": 0.024015205482434993, "grad_norm": 0.16414962923579718, "learning_rate": 9.985955653402837e-05, "loss": 2.1152, "step": 3088 }, { "epoch": 0.024022982427215574, "grad_norm": 0.11856272007146373, "learning_rate": 9.98594650082683e-05, "loss": 2.0963, "step": 3089 }, { "epoch": 0.024030759371996155, "grad_norm": 0.15797195386704055, "learning_rate": 9.985937345273664e-05, "loss": 2.1559, "step": 3090 }, { "epoch": 0.024038536316776736, "grad_norm": 0.18222112817322564, "learning_rate": 9.985928186743347e-05, "loss": 2.0568, "step": 3091 }, { "epoch": 0.024046313261557317, "grad_norm": 0.15507527694759612, "learning_rate": 9.985919025235884e-05, "loss": 2.1234, "step": 3092 }, { "epoch": 0.0240540902063379, "grad_norm": 0.12438250763365165, "learning_rate": 9.985909860751279e-05, "loss": 2.0938, "step": 3093 }, { "epoch": 0.02406186715111848, "grad_norm": 0.12374607785127895, "learning_rate": 9.985900693289539e-05, "loss": 2.1362, "step": 3094 }, { "epoch": 0.02406964409589906, "grad_norm": 0.1422283522584331, "learning_rate": 9.985891522850666e-05, "loss": 2.1161, "step": 3095 }, { "epoch": 0.024077421040679642, "grad_norm": 0.14632667502042548, "learning_rate": 9.985882349434669e-05, "loss": 2.1367, "step": 3096 }, { "epoch": 0.024085197985460223, "grad_norm": 0.12640368022751405, "learning_rate": 9.985873173041555e-05, "loss": 2.122, "step": 3097 }, { "epoch": 0.024092974930240804, "grad_norm": 0.12561406361201524, "learning_rate": 9.985863993671324e-05, "loss": 2.1291, "step": 3098 }, { "epoch": 0.024100751875021385, "grad_norm": 0.13871174349722287, "learning_rate": 9.985854811323986e-05, "loss": 2.1316, "step": 3099 }, { "epoch": 0.024108528819801966, "grad_norm": 0.14337311815619494, "learning_rate": 9.985845625999545e-05, "loss": 2.1464, "step": 3100 }, { "epoch": 0.024116305764582548, "grad_norm": 0.11707700871232342, "learning_rate": 9.985836437698006e-05, "loss": 2.0729, "step": 3101 }, { "epoch": 0.02412408270936313, "grad_norm": 0.1423568529839556, "learning_rate": 9.985827246419375e-05, "loss": 2.168, "step": 3102 }, { "epoch": 0.024131859654143713, "grad_norm": 0.1892749410657127, "learning_rate": 9.985818052163658e-05, "loss": 2.132, "step": 3103 }, { "epoch": 0.024139636598924295, "grad_norm": 0.19927892044859213, "learning_rate": 9.985808854930858e-05, "loss": 2.1488, "step": 3104 }, { "epoch": 0.024147413543704876, "grad_norm": 0.15406948756860442, "learning_rate": 9.985799654720984e-05, "loss": 2.097, "step": 3105 }, { "epoch": 0.024155190488485457, "grad_norm": 0.12323800645122383, "learning_rate": 9.985790451534041e-05, "loss": 2.185, "step": 3106 }, { "epoch": 0.024162967433266038, "grad_norm": 0.12908779420877703, "learning_rate": 9.985781245370032e-05, "loss": 2.1971, "step": 3107 }, { "epoch": 0.02417074437804662, "grad_norm": 0.12703509962400242, "learning_rate": 9.985772036228962e-05, "loss": 2.1009, "step": 3108 }, { "epoch": 0.0241785213228272, "grad_norm": 0.1409933767354703, "learning_rate": 9.985762824110842e-05, "loss": 2.0893, "step": 3109 }, { "epoch": 0.02418629826760778, "grad_norm": 0.13086627612099436, "learning_rate": 9.985753609015671e-05, "loss": 2.1146, "step": 3110 }, { "epoch": 0.024194075212388363, "grad_norm": 0.1207750623426873, "learning_rate": 9.985744390943459e-05, "loss": 2.2106, "step": 3111 }, { "epoch": 0.024201852157168944, "grad_norm": 0.14944443379678812, "learning_rate": 9.985735169894211e-05, "loss": 2.0945, "step": 3112 }, { "epoch": 0.024209629101949525, "grad_norm": 0.15545590110876578, "learning_rate": 9.985725945867929e-05, "loss": 2.0779, "step": 3113 }, { "epoch": 0.024217406046730106, "grad_norm": 0.12747686696457863, "learning_rate": 9.985716718864622e-05, "loss": 2.1324, "step": 3114 }, { "epoch": 0.024225182991510687, "grad_norm": 0.12764535959300974, "learning_rate": 9.985707488884293e-05, "loss": 2.0447, "step": 3115 }, { "epoch": 0.02423295993629127, "grad_norm": 0.18768609542809186, "learning_rate": 9.985698255926952e-05, "loss": 2.1012, "step": 3116 }, { "epoch": 0.02424073688107185, "grad_norm": 0.18754700471053834, "learning_rate": 9.9856890199926e-05, "loss": 2.1012, "step": 3117 }, { "epoch": 0.02424851382585243, "grad_norm": 0.13382566980030108, "learning_rate": 9.985679781081244e-05, "loss": 2.117, "step": 3118 }, { "epoch": 0.024256290770633012, "grad_norm": 0.1309237742057889, "learning_rate": 9.985670539192888e-05, "loss": 2.1394, "step": 3119 }, { "epoch": 0.024264067715413593, "grad_norm": 0.14773547034028006, "learning_rate": 9.98566129432754e-05, "loss": 2.0919, "step": 3120 }, { "epoch": 0.024271844660194174, "grad_norm": 0.1205874883121051, "learning_rate": 9.985652046485206e-05, "loss": 2.1499, "step": 3121 }, { "epoch": 0.024279621604974756, "grad_norm": 0.13638957790906214, "learning_rate": 9.985642795665888e-05, "loss": 2.1061, "step": 3122 }, { "epoch": 0.024287398549755337, "grad_norm": 0.15085749484415964, "learning_rate": 9.985633541869595e-05, "loss": 2.1413, "step": 3123 }, { "epoch": 0.024295175494535918, "grad_norm": 0.1409331339937122, "learning_rate": 9.985624285096328e-05, "loss": 2.1386, "step": 3124 }, { "epoch": 0.0243029524393165, "grad_norm": 0.12339732601449055, "learning_rate": 9.985615025346099e-05, "loss": 2.1536, "step": 3125 }, { "epoch": 0.02431072938409708, "grad_norm": 0.12477680641620749, "learning_rate": 9.985605762618909e-05, "loss": 2.099, "step": 3126 }, { "epoch": 0.02431850632887766, "grad_norm": 0.12693278234656946, "learning_rate": 9.985596496914765e-05, "loss": 2.1287, "step": 3127 }, { "epoch": 0.024326283273658243, "grad_norm": 0.13045191333693368, "learning_rate": 9.985587228233673e-05, "loss": 2.1603, "step": 3128 }, { "epoch": 0.024334060218438824, "grad_norm": 0.12650952613714225, "learning_rate": 9.985577956575636e-05, "loss": 2.09, "step": 3129 }, { "epoch": 0.024341837163219405, "grad_norm": 0.12136662413969758, "learning_rate": 9.985568681940663e-05, "loss": 2.1406, "step": 3130 }, { "epoch": 0.024349614107999986, "grad_norm": 0.12311002887676774, "learning_rate": 9.985559404328755e-05, "loss": 2.1157, "step": 3131 }, { "epoch": 0.024357391052780567, "grad_norm": 0.12805797893606768, "learning_rate": 9.985550123739924e-05, "loss": 2.1328, "step": 3132 }, { "epoch": 0.024365167997561152, "grad_norm": 0.11701940217801421, "learning_rate": 9.98554084017417e-05, "loss": 2.0803, "step": 3133 }, { "epoch": 0.024372944942341733, "grad_norm": 0.11742983688856803, "learning_rate": 9.9855315536315e-05, "loss": 2.127, "step": 3134 }, { "epoch": 0.024380721887122314, "grad_norm": 0.132041373185037, "learning_rate": 9.98552226411192e-05, "loss": 2.1149, "step": 3135 }, { "epoch": 0.024388498831902895, "grad_norm": 0.17176660065625052, "learning_rate": 9.985512971615436e-05, "loss": 2.1436, "step": 3136 }, { "epoch": 0.024396275776683476, "grad_norm": 0.2358864870057917, "learning_rate": 9.985503676142054e-05, "loss": 2.0878, "step": 3137 }, { "epoch": 0.024404052721464058, "grad_norm": 0.2515523067982242, "learning_rate": 9.985494377691778e-05, "loss": 2.1728, "step": 3138 }, { "epoch": 0.02441182966624464, "grad_norm": 0.1695837667999934, "learning_rate": 9.985485076264613e-05, "loss": 2.0809, "step": 3139 }, { "epoch": 0.02441960661102522, "grad_norm": 0.12449246458750698, "learning_rate": 9.985475771860568e-05, "loss": 2.0877, "step": 3140 }, { "epoch": 0.0244273835558058, "grad_norm": 0.18841062393538208, "learning_rate": 9.985466464479644e-05, "loss": 2.1081, "step": 3141 }, { "epoch": 0.024435160500586382, "grad_norm": 0.20695706762418822, "learning_rate": 9.985457154121851e-05, "loss": 2.1597, "step": 3142 }, { "epoch": 0.024442937445366963, "grad_norm": 0.15096186441452156, "learning_rate": 9.985447840787191e-05, "loss": 2.1364, "step": 3143 }, { "epoch": 0.024450714390147545, "grad_norm": 0.12489096031974026, "learning_rate": 9.985438524475672e-05, "loss": 2.0979, "step": 3144 }, { "epoch": 0.024458491334928126, "grad_norm": 0.13721665807685837, "learning_rate": 9.985429205187298e-05, "loss": 2.1492, "step": 3145 }, { "epoch": 0.024466268279708707, "grad_norm": 0.14750695186570295, "learning_rate": 9.985419882922073e-05, "loss": 2.1333, "step": 3146 }, { "epoch": 0.024474045224489288, "grad_norm": 0.13853900018512616, "learning_rate": 9.985410557680008e-05, "loss": 2.1218, "step": 3147 }, { "epoch": 0.02448182216926987, "grad_norm": 0.11975974879946114, "learning_rate": 9.985401229461105e-05, "loss": 2.1186, "step": 3148 }, { "epoch": 0.02448959911405045, "grad_norm": 0.11478456488213759, "learning_rate": 9.985391898265368e-05, "loss": 2.0921, "step": 3149 }, { "epoch": 0.02449737605883103, "grad_norm": 0.12193393776959692, "learning_rate": 9.985382564092804e-05, "loss": 2.1634, "step": 3150 }, { "epoch": 0.024505153003611613, "grad_norm": 0.13022895263273562, "learning_rate": 9.985373226943421e-05, "loss": 2.1551, "step": 3151 }, { "epoch": 0.024512929948392194, "grad_norm": 0.13477870735144387, "learning_rate": 9.98536388681722e-05, "loss": 2.1228, "step": 3152 }, { "epoch": 0.024520706893172775, "grad_norm": 0.15089645766813167, "learning_rate": 9.985354543714212e-05, "loss": 2.0723, "step": 3153 }, { "epoch": 0.024528483837953356, "grad_norm": 0.12007160588095792, "learning_rate": 9.985345197634399e-05, "loss": 2.1267, "step": 3154 }, { "epoch": 0.024536260782733937, "grad_norm": 0.12394396748019884, "learning_rate": 9.985335848577785e-05, "loss": 2.0694, "step": 3155 }, { "epoch": 0.02454403772751452, "grad_norm": 0.1407291901583929, "learning_rate": 9.98532649654438e-05, "loss": 2.0885, "step": 3156 }, { "epoch": 0.0245518146722951, "grad_norm": 0.1401674403639953, "learning_rate": 9.985317141534188e-05, "loss": 2.1048, "step": 3157 }, { "epoch": 0.02455959161707568, "grad_norm": 0.13422395292154957, "learning_rate": 9.985307783547212e-05, "loss": 2.1118, "step": 3158 }, { "epoch": 0.024567368561856262, "grad_norm": 0.118765677522011, "learning_rate": 9.98529842258346e-05, "loss": 2.1525, "step": 3159 }, { "epoch": 0.024575145506636843, "grad_norm": 0.11247204056420337, "learning_rate": 9.985289058642938e-05, "loss": 2.0866, "step": 3160 }, { "epoch": 0.024582922451417424, "grad_norm": 0.11820743015893356, "learning_rate": 9.985279691725651e-05, "loss": 2.0703, "step": 3161 }, { "epoch": 0.024590699396198006, "grad_norm": 0.1213319054168717, "learning_rate": 9.985270321831604e-05, "loss": 2.1315, "step": 3162 }, { "epoch": 0.02459847634097859, "grad_norm": 0.11919353077812624, "learning_rate": 9.985260948960801e-05, "loss": 2.1363, "step": 3163 }, { "epoch": 0.02460625328575917, "grad_norm": 0.12154076577751126, "learning_rate": 9.985251573113253e-05, "loss": 2.1576, "step": 3164 }, { "epoch": 0.024614030230539753, "grad_norm": 0.11929130694755222, "learning_rate": 9.985242194288961e-05, "loss": 2.119, "step": 3165 }, { "epoch": 0.024621807175320334, "grad_norm": 0.11537924699195354, "learning_rate": 9.98523281248793e-05, "loss": 2.131, "step": 3166 }, { "epoch": 0.024629584120100915, "grad_norm": 0.12054309152999058, "learning_rate": 9.98522342771017e-05, "loss": 2.0684, "step": 3167 }, { "epoch": 0.024637361064881496, "grad_norm": 0.15104966037950535, "learning_rate": 9.985214039955683e-05, "loss": 2.1074, "step": 3168 }, { "epoch": 0.024645138009662077, "grad_norm": 0.16709529120339253, "learning_rate": 9.985204649224474e-05, "loss": 2.1679, "step": 3169 }, { "epoch": 0.02465291495444266, "grad_norm": 0.17258136713837285, "learning_rate": 9.985195255516553e-05, "loss": 2.0954, "step": 3170 }, { "epoch": 0.02466069189922324, "grad_norm": 0.5889220847065323, "learning_rate": 9.98518585883192e-05, "loss": 2.0873, "step": 3171 }, { "epoch": 0.02466846884400382, "grad_norm": 0.13983463798106882, "learning_rate": 9.985176459170584e-05, "loss": 2.1066, "step": 3172 }, { "epoch": 0.024676245788784402, "grad_norm": 0.1539916685483776, "learning_rate": 9.985167056532552e-05, "loss": 2.088, "step": 3173 }, { "epoch": 0.024684022733564983, "grad_norm": 0.21279337693622016, "learning_rate": 9.985157650917826e-05, "loss": 2.0979, "step": 3174 }, { "epoch": 0.024691799678345564, "grad_norm": 0.2952381690388895, "learning_rate": 9.985148242326414e-05, "loss": 2.1256, "step": 3175 }, { "epoch": 0.024699576623126145, "grad_norm": 0.5564460043501469, "learning_rate": 9.985138830758321e-05, "loss": 2.115, "step": 3176 }, { "epoch": 0.024707353567906726, "grad_norm": 0.8000756490506672, "learning_rate": 9.985129416213552e-05, "loss": 2.1098, "step": 3177 }, { "epoch": 0.024715130512687308, "grad_norm": 0.5892209766447866, "learning_rate": 9.985119998692112e-05, "loss": 2.1389, "step": 3178 }, { "epoch": 0.02472290745746789, "grad_norm": 0.32269533337142375, "learning_rate": 9.98511057819401e-05, "loss": 2.1261, "step": 3179 }, { "epoch": 0.02473068440224847, "grad_norm": 0.19925393416008425, "learning_rate": 9.985101154719247e-05, "loss": 2.1413, "step": 3180 }, { "epoch": 0.02473846134702905, "grad_norm": 0.7708929775081527, "learning_rate": 9.985091728267832e-05, "loss": 2.1551, "step": 3181 }, { "epoch": 0.024746238291809632, "grad_norm": 1.1747628845475062, "learning_rate": 9.98508229883977e-05, "loss": 2.1796, "step": 3182 }, { "epoch": 0.024754015236590213, "grad_norm": 0.7217374458363538, "learning_rate": 9.985072866435066e-05, "loss": 2.1403, "step": 3183 }, { "epoch": 0.024761792181370795, "grad_norm": 0.4061356239158808, "learning_rate": 9.985063431053726e-05, "loss": 2.1825, "step": 3184 }, { "epoch": 0.024769569126151376, "grad_norm": 0.2780672179480693, "learning_rate": 9.985053992695756e-05, "loss": 2.1556, "step": 3185 }, { "epoch": 0.024777346070931957, "grad_norm": 1.1644699252755253, "learning_rate": 9.98504455136116e-05, "loss": 2.1662, "step": 3186 }, { "epoch": 0.024785123015712538, "grad_norm": 1.5271706970179342, "learning_rate": 9.985035107049945e-05, "loss": 2.2161, "step": 3187 }, { "epoch": 0.02479289996049312, "grad_norm": 0.44910213781839986, "learning_rate": 9.985025659762118e-05, "loss": 2.087, "step": 3188 }, { "epoch": 0.0248006769052737, "grad_norm": 3.105750115318329, "learning_rate": 9.98501620949768e-05, "loss": 2.1601, "step": 3189 }, { "epoch": 0.02480845385005428, "grad_norm": 1.3101455481193576, "learning_rate": 9.985006756256642e-05, "loss": 2.215, "step": 3190 }, { "epoch": 0.024816230794834863, "grad_norm": 0.42167989696955144, "learning_rate": 9.984997300039007e-05, "loss": 2.149, "step": 3191 }, { "epoch": 0.024824007739615444, "grad_norm": 0.9400735280881967, "learning_rate": 9.984987840844781e-05, "loss": 2.1927, "step": 3192 }, { "epoch": 0.02483178468439603, "grad_norm": 1.3394385070460397, "learning_rate": 9.98497837867397e-05, "loss": 2.2508, "step": 3193 }, { "epoch": 0.02483956162917661, "grad_norm": 0.5269005338943437, "learning_rate": 9.984968913526578e-05, "loss": 2.1089, "step": 3194 }, { "epoch": 0.02484733857395719, "grad_norm": 6.428875167943296, "learning_rate": 9.984959445402612e-05, "loss": 2.4262, "step": 3195 }, { "epoch": 0.024855115518737772, "grad_norm": 1.0214689298845652, "learning_rate": 9.984949974302081e-05, "loss": 2.2349, "step": 3196 }, { "epoch": 0.024862892463518353, "grad_norm": 0.5046881679150764, "learning_rate": 9.984940500224983e-05, "loss": 2.1415, "step": 3197 }, { "epoch": 0.024870669408298934, "grad_norm": 0.34009825537438143, "learning_rate": 9.98493102317133e-05, "loss": 2.1951, "step": 3198 }, { "epoch": 0.024878446353079516, "grad_norm": 0.7064432159232426, "learning_rate": 9.984921543141125e-05, "loss": 2.1956, "step": 3199 }, { "epoch": 0.024886223297860097, "grad_norm": 1.6725487444163099, "learning_rate": 9.984912060134374e-05, "loss": 2.191, "step": 3200 }, { "epoch": 0.024894000242640678, "grad_norm": 1.2957222912033968, "learning_rate": 9.984902574151084e-05, "loss": 2.1854, "step": 3201 }, { "epoch": 0.02490177718742126, "grad_norm": 0.36364146511687584, "learning_rate": 9.984893085191258e-05, "loss": 2.1801, "step": 3202 }, { "epoch": 0.02490955413220184, "grad_norm": 1.9375452206504293, "learning_rate": 9.984883593254905e-05, "loss": 2.2132, "step": 3203 }, { "epoch": 0.02491733107698242, "grad_norm": 0.785598431711548, "learning_rate": 9.984874098342028e-05, "loss": 2.1247, "step": 3204 }, { "epoch": 0.024925108021763002, "grad_norm": 0.37635334547454546, "learning_rate": 9.984864600452633e-05, "loss": 2.1864, "step": 3205 }, { "epoch": 0.024932884966543584, "grad_norm": 1.0324131539062806, "learning_rate": 9.984855099586727e-05, "loss": 2.1547, "step": 3206 }, { "epoch": 0.024940661911324165, "grad_norm": 0.8971030499190931, "learning_rate": 9.984845595744314e-05, "loss": 2.1435, "step": 3207 }, { "epoch": 0.024948438856104746, "grad_norm": 0.3559299224139204, "learning_rate": 9.984836088925401e-05, "loss": 2.0767, "step": 3208 }, { "epoch": 0.024956215800885327, "grad_norm": 1.6003937549909582, "learning_rate": 9.984826579129994e-05, "loss": 2.2062, "step": 3209 }, { "epoch": 0.02496399274566591, "grad_norm": 0.6434603812894498, "learning_rate": 9.984817066358096e-05, "loss": 2.1739, "step": 3210 }, { "epoch": 0.02497176969044649, "grad_norm": 0.34450099108366866, "learning_rate": 9.984807550609717e-05, "loss": 2.1794, "step": 3211 }, { "epoch": 0.02497954663522707, "grad_norm": 1.2701656324299544, "learning_rate": 9.98479803188486e-05, "loss": 2.1491, "step": 3212 }, { "epoch": 0.024987323580007652, "grad_norm": 0.6486444607108038, "learning_rate": 9.98478851018353e-05, "loss": 2.1734, "step": 3213 }, { "epoch": 0.024995100524788233, "grad_norm": 0.3533246026089168, "learning_rate": 9.984778985505734e-05, "loss": 2.167, "step": 3214 }, { "epoch": 0.025002877469568814, "grad_norm": 0.9788759859461886, "learning_rate": 9.984769457851477e-05, "loss": 2.1687, "step": 3215 }, { "epoch": 0.025010654414349395, "grad_norm": 0.475379965521598, "learning_rate": 9.984759927220766e-05, "loss": 2.1294, "step": 3216 }, { "epoch": 0.025018431359129976, "grad_norm": 0.39700456348293683, "learning_rate": 9.984750393613605e-05, "loss": 2.2153, "step": 3217 }, { "epoch": 0.025026208303910558, "grad_norm": 0.7357438877036413, "learning_rate": 9.984740857030001e-05, "loss": 2.2007, "step": 3218 }, { "epoch": 0.02503398524869114, "grad_norm": 0.3039019120174499, "learning_rate": 9.984731317469957e-05, "loss": 2.1698, "step": 3219 }, { "epoch": 0.02504176219347172, "grad_norm": 0.33781556426762105, "learning_rate": 9.984721774933483e-05, "loss": 2.1409, "step": 3220 }, { "epoch": 0.0250495391382523, "grad_norm": 0.41079395515646644, "learning_rate": 9.984712229420583e-05, "loss": 2.1589, "step": 3221 }, { "epoch": 0.025057316083032886, "grad_norm": 0.1808983417632175, "learning_rate": 9.98470268093126e-05, "loss": 2.1524, "step": 3222 }, { "epoch": 0.025065093027813467, "grad_norm": 0.2307521120240684, "learning_rate": 9.984693129465524e-05, "loss": 2.1939, "step": 3223 }, { "epoch": 0.025072869972594048, "grad_norm": 0.18898457712522607, "learning_rate": 9.984683575023377e-05, "loss": 2.1533, "step": 3224 }, { "epoch": 0.02508064691737463, "grad_norm": 0.24234880282185275, "learning_rate": 9.984674017604827e-05, "loss": 2.1299, "step": 3225 }, { "epoch": 0.02508842386215521, "grad_norm": 0.16371692215444192, "learning_rate": 9.98466445720988e-05, "loss": 2.1576, "step": 3226 }, { "epoch": 0.02509620080693579, "grad_norm": 0.20932040896065068, "learning_rate": 9.98465489383854e-05, "loss": 2.0839, "step": 3227 }, { "epoch": 0.025103977751716373, "grad_norm": 0.17182335929789044, "learning_rate": 9.984645327490813e-05, "loss": 2.159, "step": 3228 }, { "epoch": 0.025111754696496954, "grad_norm": 0.21830012817899933, "learning_rate": 9.984635758166705e-05, "loss": 2.1146, "step": 3229 }, { "epoch": 0.025119531641277535, "grad_norm": 0.23052186886732895, "learning_rate": 9.984626185866224e-05, "loss": 2.1729, "step": 3230 }, { "epoch": 0.025127308586058116, "grad_norm": 0.14698714581843947, "learning_rate": 9.984616610589371e-05, "loss": 2.1451, "step": 3231 }, { "epoch": 0.025135085530838697, "grad_norm": 0.2665165411004979, "learning_rate": 9.984607032336155e-05, "loss": 2.1526, "step": 3232 }, { "epoch": 0.02514286247561928, "grad_norm": 0.15011594298529618, "learning_rate": 9.984597451106581e-05, "loss": 2.1493, "step": 3233 }, { "epoch": 0.02515063942039986, "grad_norm": 0.14982854701063186, "learning_rate": 9.984587866900655e-05, "loss": 2.0844, "step": 3234 }, { "epoch": 0.02515841636518044, "grad_norm": 0.13749171993227982, "learning_rate": 9.984578279718383e-05, "loss": 2.1155, "step": 3235 }, { "epoch": 0.025166193309961022, "grad_norm": 0.1403966739177616, "learning_rate": 9.98456868955977e-05, "loss": 2.1684, "step": 3236 }, { "epoch": 0.025173970254741603, "grad_norm": 0.1340817761673271, "learning_rate": 9.984559096424822e-05, "loss": 2.1614, "step": 3237 }, { "epoch": 0.025181747199522184, "grad_norm": 0.12565744480507224, "learning_rate": 9.984549500313544e-05, "loss": 2.0985, "step": 3238 }, { "epoch": 0.025189524144302766, "grad_norm": 0.12711312099086391, "learning_rate": 9.984539901225942e-05, "loss": 2.0642, "step": 3239 }, { "epoch": 0.025197301089083347, "grad_norm": 0.13610505514630028, "learning_rate": 9.984530299162023e-05, "loss": 2.0589, "step": 3240 }, { "epoch": 0.025205078033863928, "grad_norm": 0.1311640363428955, "learning_rate": 9.984520694121792e-05, "loss": 2.1235, "step": 3241 }, { "epoch": 0.02521285497864451, "grad_norm": 0.12184848663589198, "learning_rate": 9.984511086105255e-05, "loss": 2.0965, "step": 3242 }, { "epoch": 0.02522063192342509, "grad_norm": 0.13326754704547675, "learning_rate": 9.984501475112415e-05, "loss": 2.1199, "step": 3243 }, { "epoch": 0.02522840886820567, "grad_norm": 0.12792538284972435, "learning_rate": 9.984491861143283e-05, "loss": 2.0901, "step": 3244 }, { "epoch": 0.025236185812986252, "grad_norm": 0.12398339351034522, "learning_rate": 9.984482244197859e-05, "loss": 2.1037, "step": 3245 }, { "epoch": 0.025243962757766834, "grad_norm": 0.13552566906880223, "learning_rate": 9.984472624276153e-05, "loss": 2.0699, "step": 3246 }, { "epoch": 0.025251739702547415, "grad_norm": 0.1198354285863315, "learning_rate": 9.98446300137817e-05, "loss": 2.1355, "step": 3247 }, { "epoch": 0.025259516647327996, "grad_norm": 0.14936241669131275, "learning_rate": 9.984453375503915e-05, "loss": 2.1038, "step": 3248 }, { "epoch": 0.025267293592108577, "grad_norm": 0.12314226097070732, "learning_rate": 9.984443746653392e-05, "loss": 2.1548, "step": 3249 }, { "epoch": 0.02527507053688916, "grad_norm": 0.12292885647599489, "learning_rate": 9.98443411482661e-05, "loss": 2.098, "step": 3250 }, { "epoch": 0.02528284748166974, "grad_norm": 0.1387407876833618, "learning_rate": 9.984424480023573e-05, "loss": 2.122, "step": 3251 }, { "epoch": 0.025290624426450324, "grad_norm": 0.11313276851164791, "learning_rate": 9.984414842244288e-05, "loss": 2.1893, "step": 3252 }, { "epoch": 0.025298401371230905, "grad_norm": 0.15329626697899465, "learning_rate": 9.984405201488758e-05, "loss": 2.1597, "step": 3253 }, { "epoch": 0.025306178316011486, "grad_norm": 0.13857412192412474, "learning_rate": 9.984395557756991e-05, "loss": 2.0671, "step": 3254 }, { "epoch": 0.025313955260792068, "grad_norm": 0.1171046552467851, "learning_rate": 9.984385911048993e-05, "loss": 2.0756, "step": 3255 }, { "epoch": 0.02532173220557265, "grad_norm": 0.1483243344211937, "learning_rate": 9.984376261364769e-05, "loss": 2.11, "step": 3256 }, { "epoch": 0.02532950915035323, "grad_norm": 0.11344613155037717, "learning_rate": 9.984366608704323e-05, "loss": 2.0693, "step": 3257 }, { "epoch": 0.02533728609513381, "grad_norm": 0.13763978987024947, "learning_rate": 9.984356953067663e-05, "loss": 2.0902, "step": 3258 }, { "epoch": 0.025345063039914392, "grad_norm": 0.13619960578700555, "learning_rate": 9.984347294454797e-05, "loss": 2.1741, "step": 3259 }, { "epoch": 0.025352839984694973, "grad_norm": 0.10908257168550727, "learning_rate": 9.984337632865727e-05, "loss": 2.0993, "step": 3260 }, { "epoch": 0.025360616929475555, "grad_norm": 0.11828937253240081, "learning_rate": 9.984327968300459e-05, "loss": 2.1227, "step": 3261 }, { "epoch": 0.025368393874256136, "grad_norm": 0.10427303498985901, "learning_rate": 9.984318300759e-05, "loss": 2.1274, "step": 3262 }, { "epoch": 0.025376170819036717, "grad_norm": 0.176181565564238, "learning_rate": 9.984308630241355e-05, "loss": 2.0987, "step": 3263 }, { "epoch": 0.025383947763817298, "grad_norm": 0.12650808977938346, "learning_rate": 9.984298956747531e-05, "loss": 2.1314, "step": 3264 }, { "epoch": 0.02539172470859788, "grad_norm": 0.11341418533524594, "learning_rate": 9.984289280277534e-05, "loss": 2.1314, "step": 3265 }, { "epoch": 0.02539950165337846, "grad_norm": 0.12065245838979323, "learning_rate": 9.984279600831366e-05, "loss": 2.1301, "step": 3266 }, { "epoch": 0.02540727859815904, "grad_norm": 0.11202600508902431, "learning_rate": 9.984269918409037e-05, "loss": 2.1348, "step": 3267 }, { "epoch": 0.025415055542939623, "grad_norm": 0.1947713597299657, "learning_rate": 9.984260233010552e-05, "loss": 2.1583, "step": 3268 }, { "epoch": 0.025422832487720204, "grad_norm": 0.12193243922344188, "learning_rate": 9.984250544635916e-05, "loss": 2.1225, "step": 3269 }, { "epoch": 0.025430609432500785, "grad_norm": 0.1149105926749313, "learning_rate": 9.984240853285134e-05, "loss": 2.149, "step": 3270 }, { "epoch": 0.025438386377281366, "grad_norm": 0.11402478696548413, "learning_rate": 9.984231158958212e-05, "loss": 2.182, "step": 3271 }, { "epoch": 0.025446163322061947, "grad_norm": 0.12071489433935638, "learning_rate": 9.984221461655159e-05, "loss": 2.1254, "step": 3272 }, { "epoch": 0.02545394026684253, "grad_norm": 0.11012222857150457, "learning_rate": 9.984211761375977e-05, "loss": 2.1383, "step": 3273 }, { "epoch": 0.02546171721162311, "grad_norm": 0.12702673072109544, "learning_rate": 9.984202058120671e-05, "loss": 2.0673, "step": 3274 }, { "epoch": 0.02546949415640369, "grad_norm": 0.11164988024430259, "learning_rate": 9.984192351889253e-05, "loss": 2.1237, "step": 3275 }, { "epoch": 0.025477271101184272, "grad_norm": 0.11314926397793519, "learning_rate": 9.98418264268172e-05, "loss": 2.0926, "step": 3276 }, { "epoch": 0.025485048045964853, "grad_norm": 0.11206080336003138, "learning_rate": 9.984172930498086e-05, "loss": 2.0558, "step": 3277 }, { "epoch": 0.025492824990745434, "grad_norm": 0.11721918924099249, "learning_rate": 9.984163215338351e-05, "loss": 2.1604, "step": 3278 }, { "epoch": 0.025500601935526016, "grad_norm": 0.3349924943309776, "learning_rate": 9.984153497202524e-05, "loss": 2.1446, "step": 3279 }, { "epoch": 0.025508378880306597, "grad_norm": 0.11921182167640239, "learning_rate": 9.98414377609061e-05, "loss": 2.0816, "step": 3280 }, { "epoch": 0.025516155825087178, "grad_norm": 0.11734518108515797, "learning_rate": 9.984134052002615e-05, "loss": 2.1631, "step": 3281 }, { "epoch": 0.025523932769867762, "grad_norm": 0.12082574862745409, "learning_rate": 9.984124324938545e-05, "loss": 2.132, "step": 3282 }, { "epoch": 0.025531709714648344, "grad_norm": 0.12738181060146583, "learning_rate": 9.984114594898404e-05, "loss": 2.095, "step": 3283 }, { "epoch": 0.025539486659428925, "grad_norm": 0.128211693070843, "learning_rate": 9.984104861882199e-05, "loss": 2.1465, "step": 3284 }, { "epoch": 0.025547263604209506, "grad_norm": 0.1205442742728826, "learning_rate": 9.984095125889936e-05, "loss": 2.0654, "step": 3285 }, { "epoch": 0.025555040548990087, "grad_norm": 0.11691681250398966, "learning_rate": 9.98408538692162e-05, "loss": 2.0879, "step": 3286 }, { "epoch": 0.02556281749377067, "grad_norm": 0.12670874899306875, "learning_rate": 9.984075644977259e-05, "loss": 2.1118, "step": 3287 }, { "epoch": 0.02557059443855125, "grad_norm": 0.11395381345356126, "learning_rate": 9.984065900056856e-05, "loss": 2.1585, "step": 3288 }, { "epoch": 0.02557837138333183, "grad_norm": 0.12614990301286888, "learning_rate": 9.984056152160419e-05, "loss": 2.0906, "step": 3289 }, { "epoch": 0.025586148328112412, "grad_norm": 0.1108933143598248, "learning_rate": 9.984046401287954e-05, "loss": 2.0802, "step": 3290 }, { "epoch": 0.025593925272892993, "grad_norm": 0.11945909991203514, "learning_rate": 9.984036647439465e-05, "loss": 2.0849, "step": 3291 }, { "epoch": 0.025601702217673574, "grad_norm": 0.12419729375687519, "learning_rate": 9.984026890614956e-05, "loss": 2.1075, "step": 3292 }, { "epoch": 0.025609479162454155, "grad_norm": 0.11158503120610114, "learning_rate": 9.98401713081444e-05, "loss": 2.1347, "step": 3293 }, { "epoch": 0.025617256107234736, "grad_norm": 0.1208051217978285, "learning_rate": 9.984007368037916e-05, "loss": 2.0996, "step": 3294 }, { "epoch": 0.025625033052015318, "grad_norm": 0.11355589075574908, "learning_rate": 9.98399760228539e-05, "loss": 2.112, "step": 3295 }, { "epoch": 0.0256328099967959, "grad_norm": 0.10975486210104593, "learning_rate": 9.983987833556872e-05, "loss": 2.125, "step": 3296 }, { "epoch": 0.02564058694157648, "grad_norm": 0.1179463888506845, "learning_rate": 9.983978061852366e-05, "loss": 2.0407, "step": 3297 }, { "epoch": 0.02564836388635706, "grad_norm": 0.1181635605569955, "learning_rate": 9.983968287171878e-05, "loss": 2.1196, "step": 3298 }, { "epoch": 0.025656140831137642, "grad_norm": 0.10981066550322462, "learning_rate": 9.983958509515412e-05, "loss": 2.1144, "step": 3299 }, { "epoch": 0.025663917775918223, "grad_norm": 0.1144659028816394, "learning_rate": 9.983948728882977e-05, "loss": 2.1797, "step": 3300 }, { "epoch": 0.025671694720698805, "grad_norm": 0.11569254836511696, "learning_rate": 9.983938945274576e-05, "loss": 2.1215, "step": 3301 }, { "epoch": 0.025679471665479386, "grad_norm": 0.12043681731806136, "learning_rate": 9.983929158690216e-05, "loss": 2.1231, "step": 3302 }, { "epoch": 0.025687248610259967, "grad_norm": 0.11516103012556266, "learning_rate": 9.983919369129902e-05, "loss": 2.094, "step": 3303 }, { "epoch": 0.025695025555040548, "grad_norm": 0.11311318473066112, "learning_rate": 9.983909576593642e-05, "loss": 2.1266, "step": 3304 }, { "epoch": 0.02570280249982113, "grad_norm": 0.1140995895539863, "learning_rate": 9.98389978108144e-05, "loss": 2.1572, "step": 3305 }, { "epoch": 0.02571057944460171, "grad_norm": 0.11914292615844609, "learning_rate": 9.983889982593302e-05, "loss": 2.178, "step": 3306 }, { "epoch": 0.02571835638938229, "grad_norm": 0.12032387111705946, "learning_rate": 9.983880181129234e-05, "loss": 2.1082, "step": 3307 }, { "epoch": 0.025726133334162873, "grad_norm": 0.13003535740268557, "learning_rate": 9.983870376689243e-05, "loss": 2.1753, "step": 3308 }, { "epoch": 0.025733910278943454, "grad_norm": 0.22398139022783156, "learning_rate": 9.983860569273334e-05, "loss": 2.1114, "step": 3309 }, { "epoch": 0.025741687223724035, "grad_norm": 0.11859035221231788, "learning_rate": 9.983850758881512e-05, "loss": 2.0739, "step": 3310 }, { "epoch": 0.02574946416850462, "grad_norm": 0.11010439807111375, "learning_rate": 9.983840945513785e-05, "loss": 2.1183, "step": 3311 }, { "epoch": 0.0257572411132852, "grad_norm": 0.12115772571411748, "learning_rate": 9.983831129170156e-05, "loss": 2.0988, "step": 3312 }, { "epoch": 0.025765018058065782, "grad_norm": 0.11535628296385725, "learning_rate": 9.983821309850632e-05, "loss": 2.1629, "step": 3313 }, { "epoch": 0.025772795002846363, "grad_norm": 0.11467608647396825, "learning_rate": 9.98381148755522e-05, "loss": 2.1056, "step": 3314 }, { "epoch": 0.025780571947626944, "grad_norm": 0.12897137852978952, "learning_rate": 9.983801662283924e-05, "loss": 2.075, "step": 3315 }, { "epoch": 0.025788348892407525, "grad_norm": 0.1142395297022597, "learning_rate": 9.983791834036754e-05, "loss": 2.1219, "step": 3316 }, { "epoch": 0.025796125837188107, "grad_norm": 0.12313772025380312, "learning_rate": 9.98378200281371e-05, "loss": 2.0495, "step": 3317 }, { "epoch": 0.025803902781968688, "grad_norm": 0.11784738483253766, "learning_rate": 9.9837721686148e-05, "loss": 2.1155, "step": 3318 }, { "epoch": 0.02581167972674927, "grad_norm": 0.10996561443765668, "learning_rate": 9.983762331440032e-05, "loss": 2.105, "step": 3319 }, { "epoch": 0.02581945667152985, "grad_norm": 0.11058328444812199, "learning_rate": 9.983752491289412e-05, "loss": 2.1361, "step": 3320 }, { "epoch": 0.02582723361631043, "grad_norm": 0.11595457386857218, "learning_rate": 9.983742648162942e-05, "loss": 2.1052, "step": 3321 }, { "epoch": 0.025835010561091012, "grad_norm": 0.11317307656521702, "learning_rate": 9.98373280206063e-05, "loss": 2.097, "step": 3322 }, { "epoch": 0.025842787505871594, "grad_norm": 0.1181071875455708, "learning_rate": 9.983722952982483e-05, "loss": 2.0662, "step": 3323 }, { "epoch": 0.025850564450652175, "grad_norm": 0.1310741669404243, "learning_rate": 9.983713100928504e-05, "loss": 2.1284, "step": 3324 }, { "epoch": 0.025858341395432756, "grad_norm": 0.11686862377234894, "learning_rate": 9.983703245898704e-05, "loss": 2.139, "step": 3325 }, { "epoch": 0.025866118340213337, "grad_norm": 0.11734939452647, "learning_rate": 9.983693387893084e-05, "loss": 2.1289, "step": 3326 }, { "epoch": 0.02587389528499392, "grad_norm": 0.11669349489484342, "learning_rate": 9.983683526911651e-05, "loss": 2.1561, "step": 3327 }, { "epoch": 0.0258816722297745, "grad_norm": 0.14489491670946206, "learning_rate": 9.983673662954413e-05, "loss": 2.1582, "step": 3328 }, { "epoch": 0.02588944917455508, "grad_norm": 0.12209470982715281, "learning_rate": 9.983663796021375e-05, "loss": 2.1241, "step": 3329 }, { "epoch": 0.025897226119335662, "grad_norm": 0.12225526419811196, "learning_rate": 9.983653926112539e-05, "loss": 2.0523, "step": 3330 }, { "epoch": 0.025905003064116243, "grad_norm": 0.11439544187346506, "learning_rate": 9.983644053227916e-05, "loss": 2.1487, "step": 3331 }, { "epoch": 0.025912780008896824, "grad_norm": 0.10711769923476402, "learning_rate": 9.98363417736751e-05, "loss": 2.0924, "step": 3332 }, { "epoch": 0.025920556953677405, "grad_norm": 0.11277631318942888, "learning_rate": 9.983624298531326e-05, "loss": 2.1442, "step": 3333 }, { "epoch": 0.025928333898457986, "grad_norm": 0.11181781799907023, "learning_rate": 9.983614416719374e-05, "loss": 2.1408, "step": 3334 }, { "epoch": 0.025936110843238568, "grad_norm": 0.12284343430691373, "learning_rate": 9.983604531931653e-05, "loss": 2.091, "step": 3335 }, { "epoch": 0.02594388778801915, "grad_norm": 0.13013346398752673, "learning_rate": 9.983594644168175e-05, "loss": 2.1334, "step": 3336 }, { "epoch": 0.02595166473279973, "grad_norm": 0.11120307492470893, "learning_rate": 9.983584753428942e-05, "loss": 2.1318, "step": 3337 }, { "epoch": 0.02595944167758031, "grad_norm": 0.1171290744111938, "learning_rate": 9.983574859713962e-05, "loss": 2.1035, "step": 3338 }, { "epoch": 0.025967218622360892, "grad_norm": 0.110180108558344, "learning_rate": 9.98356496302324e-05, "loss": 2.1672, "step": 3339 }, { "epoch": 0.025974995567141473, "grad_norm": 0.10981075827544576, "learning_rate": 9.983555063356783e-05, "loss": 2.0815, "step": 3340 }, { "epoch": 0.025982772511922058, "grad_norm": 0.11258432442220224, "learning_rate": 9.983545160714596e-05, "loss": 2.1137, "step": 3341 }, { "epoch": 0.02599054945670264, "grad_norm": 0.10617748154246785, "learning_rate": 9.983535255096685e-05, "loss": 2.066, "step": 3342 }, { "epoch": 0.02599832640148322, "grad_norm": 0.1137739931861085, "learning_rate": 9.983525346503055e-05, "loss": 2.0847, "step": 3343 }, { "epoch": 0.0260061033462638, "grad_norm": 0.10756995511297616, "learning_rate": 9.983515434933714e-05, "loss": 2.1032, "step": 3344 }, { "epoch": 0.026013880291044383, "grad_norm": 0.11187141218663232, "learning_rate": 9.983505520388668e-05, "loss": 2.0946, "step": 3345 }, { "epoch": 0.026021657235824964, "grad_norm": 0.10718061199415059, "learning_rate": 9.98349560286792e-05, "loss": 2.08, "step": 3346 }, { "epoch": 0.026029434180605545, "grad_norm": 0.12275745465887347, "learning_rate": 9.983485682371478e-05, "loss": 2.1619, "step": 3347 }, { "epoch": 0.026037211125386126, "grad_norm": 0.10768957647911419, "learning_rate": 9.983475758899347e-05, "loss": 2.1312, "step": 3348 }, { "epoch": 0.026044988070166707, "grad_norm": 0.10730893860135603, "learning_rate": 9.983465832451535e-05, "loss": 2.136, "step": 3349 }, { "epoch": 0.02605276501494729, "grad_norm": 0.11216438813966688, "learning_rate": 9.983455903028045e-05, "loss": 2.0542, "step": 3350 }, { "epoch": 0.02606054195972787, "grad_norm": 0.11114994644571369, "learning_rate": 9.983445970628886e-05, "loss": 2.1114, "step": 3351 }, { "epoch": 0.02606831890450845, "grad_norm": 0.12531819393232407, "learning_rate": 9.98343603525406e-05, "loss": 2.1015, "step": 3352 }, { "epoch": 0.026076095849289032, "grad_norm": 0.1232877279369467, "learning_rate": 9.983426096903578e-05, "loss": 2.1127, "step": 3353 }, { "epoch": 0.026083872794069613, "grad_norm": 0.11632432120812677, "learning_rate": 9.983416155577441e-05, "loss": 2.1113, "step": 3354 }, { "epoch": 0.026091649738850194, "grad_norm": 0.17076168950170517, "learning_rate": 9.983406211275658e-05, "loss": 2.1094, "step": 3355 }, { "epoch": 0.026099426683630775, "grad_norm": 0.13583435622337697, "learning_rate": 9.983396263998235e-05, "loss": 2.0876, "step": 3356 }, { "epoch": 0.026107203628411357, "grad_norm": 0.11857065459988506, "learning_rate": 9.983386313745176e-05, "loss": 2.0778, "step": 3357 }, { "epoch": 0.026114980573191938, "grad_norm": 0.10765134982319931, "learning_rate": 9.983376360516488e-05, "loss": 2.136, "step": 3358 }, { "epoch": 0.02612275751797252, "grad_norm": 0.13133998004837832, "learning_rate": 9.983366404312175e-05, "loss": 2.1323, "step": 3359 }, { "epoch": 0.0261305344627531, "grad_norm": 0.11703911446975732, "learning_rate": 9.983356445132248e-05, "loss": 2.1528, "step": 3360 }, { "epoch": 0.02613831140753368, "grad_norm": 0.11367944222624513, "learning_rate": 9.983346482976708e-05, "loss": 2.1161, "step": 3361 }, { "epoch": 0.026146088352314262, "grad_norm": 0.11300783101997598, "learning_rate": 9.983336517845564e-05, "loss": 2.1758, "step": 3362 }, { "epoch": 0.026153865297094844, "grad_norm": 0.11284720237052595, "learning_rate": 9.983326549738818e-05, "loss": 2.1304, "step": 3363 }, { "epoch": 0.026161642241875425, "grad_norm": 0.12357862592235713, "learning_rate": 9.98331657865648e-05, "loss": 2.1661, "step": 3364 }, { "epoch": 0.026169419186656006, "grad_norm": 0.12239116282538684, "learning_rate": 9.983306604598557e-05, "loss": 2.1838, "step": 3365 }, { "epoch": 0.026177196131436587, "grad_norm": 0.12185212300335398, "learning_rate": 9.98329662756505e-05, "loss": 2.1328, "step": 3366 }, { "epoch": 0.02618497307621717, "grad_norm": 0.11894058916613173, "learning_rate": 9.983286647555968e-05, "loss": 2.126, "step": 3367 }, { "epoch": 0.02619275002099775, "grad_norm": 0.12073199056221808, "learning_rate": 9.983276664571317e-05, "loss": 2.099, "step": 3368 }, { "epoch": 0.02620052696577833, "grad_norm": 0.11766381964406024, "learning_rate": 9.9832666786111e-05, "loss": 2.0756, "step": 3369 }, { "epoch": 0.026208303910558912, "grad_norm": 0.11674486607362489, "learning_rate": 9.983256689675328e-05, "loss": 2.1173, "step": 3370 }, { "epoch": 0.026216080855339496, "grad_norm": 0.14129993129183846, "learning_rate": 9.983246697764004e-05, "loss": 2.0926, "step": 3371 }, { "epoch": 0.026223857800120078, "grad_norm": 0.113593646454088, "learning_rate": 9.983236702877135e-05, "loss": 2.0884, "step": 3372 }, { "epoch": 0.02623163474490066, "grad_norm": 0.12859012484970453, "learning_rate": 9.983226705014724e-05, "loss": 2.0768, "step": 3373 }, { "epoch": 0.02623941168968124, "grad_norm": 0.12199062972078628, "learning_rate": 9.98321670417678e-05, "loss": 2.1034, "step": 3374 }, { "epoch": 0.02624718863446182, "grad_norm": 0.11641704264152095, "learning_rate": 9.983206700363309e-05, "loss": 2.1414, "step": 3375 }, { "epoch": 0.026254965579242402, "grad_norm": 0.12115943050747559, "learning_rate": 9.983196693574316e-05, "loss": 2.1059, "step": 3376 }, { "epoch": 0.026262742524022983, "grad_norm": 0.11600690772897529, "learning_rate": 9.983186683809806e-05, "loss": 2.084, "step": 3377 }, { "epoch": 0.026270519468803565, "grad_norm": 0.11661164319507444, "learning_rate": 9.983176671069787e-05, "loss": 2.1702, "step": 3378 }, { "epoch": 0.026278296413584146, "grad_norm": 0.11013404009140296, "learning_rate": 9.983166655354264e-05, "loss": 2.1446, "step": 3379 }, { "epoch": 0.026286073358364727, "grad_norm": 0.11468008710192108, "learning_rate": 9.983156636663244e-05, "loss": 2.0887, "step": 3380 }, { "epoch": 0.026293850303145308, "grad_norm": 0.11052511401617508, "learning_rate": 9.983146614996731e-05, "loss": 2.1123, "step": 3381 }, { "epoch": 0.02630162724792589, "grad_norm": 0.11724099208310117, "learning_rate": 9.983136590354733e-05, "loss": 2.088, "step": 3382 }, { "epoch": 0.02630940419270647, "grad_norm": 0.11340391763635053, "learning_rate": 9.983126562737256e-05, "loss": 2.0266, "step": 3383 }, { "epoch": 0.02631718113748705, "grad_norm": 0.12126958903308752, "learning_rate": 9.9831165321443e-05, "loss": 2.0815, "step": 3384 }, { "epoch": 0.026324958082267633, "grad_norm": 0.11275418200420814, "learning_rate": 9.983106498575881e-05, "loss": 2.1102, "step": 3385 }, { "epoch": 0.026332735027048214, "grad_norm": 0.1150184412112124, "learning_rate": 9.983096462031999e-05, "loss": 2.0774, "step": 3386 }, { "epoch": 0.026340511971828795, "grad_norm": 0.18053921243656545, "learning_rate": 9.98308642251266e-05, "loss": 2.0904, "step": 3387 }, { "epoch": 0.026348288916609376, "grad_norm": 0.12178124698500586, "learning_rate": 9.983076380017872e-05, "loss": 2.0863, "step": 3388 }, { "epoch": 0.026356065861389957, "grad_norm": 0.11553289892396575, "learning_rate": 9.983066334547639e-05, "loss": 2.0478, "step": 3389 }, { "epoch": 0.02636384280617054, "grad_norm": 0.11677292073987813, "learning_rate": 9.983056286101969e-05, "loss": 2.148, "step": 3390 }, { "epoch": 0.02637161975095112, "grad_norm": 0.11136533628854066, "learning_rate": 9.983046234680866e-05, "loss": 2.0962, "step": 3391 }, { "epoch": 0.0263793966957317, "grad_norm": 0.2934330333356854, "learning_rate": 9.983036180284338e-05, "loss": 2.116, "step": 3392 }, { "epoch": 0.026387173640512282, "grad_norm": 0.11992161044859381, "learning_rate": 9.983026122912389e-05, "loss": 2.1132, "step": 3393 }, { "epoch": 0.026394950585292863, "grad_norm": 0.11019126152426686, "learning_rate": 9.983016062565027e-05, "loss": 2.024, "step": 3394 }, { "epoch": 0.026402727530073444, "grad_norm": 0.11713869979024603, "learning_rate": 9.983005999242257e-05, "loss": 2.0449, "step": 3395 }, { "epoch": 0.026410504474854025, "grad_norm": 0.12854124027519198, "learning_rate": 9.982995932944083e-05, "loss": 2.1231, "step": 3396 }, { "epoch": 0.026418281419634607, "grad_norm": 0.11614156965922, "learning_rate": 9.982985863670515e-05, "loss": 2.1316, "step": 3397 }, { "epoch": 0.026426058364415188, "grad_norm": 0.12456999760889095, "learning_rate": 9.982975791421557e-05, "loss": 2.1853, "step": 3398 }, { "epoch": 0.02643383530919577, "grad_norm": 0.13322102144530237, "learning_rate": 9.982965716197215e-05, "loss": 2.0913, "step": 3399 }, { "epoch": 0.026441612253976354, "grad_norm": 0.1147412681959683, "learning_rate": 9.982955637997495e-05, "loss": 2.0838, "step": 3400 }, { "epoch": 0.026449389198756935, "grad_norm": 0.1274304630557233, "learning_rate": 9.982945556822402e-05, "loss": 2.1064, "step": 3401 }, { "epoch": 0.026457166143537516, "grad_norm": 0.14170115980053824, "learning_rate": 9.982935472671944e-05, "loss": 2.1134, "step": 3402 }, { "epoch": 0.026464943088318097, "grad_norm": 0.11312500140925298, "learning_rate": 9.982925385546127e-05, "loss": 2.1159, "step": 3403 }, { "epoch": 0.026472720033098678, "grad_norm": 0.12327778454192846, "learning_rate": 9.982915295444954e-05, "loss": 2.0906, "step": 3404 }, { "epoch": 0.02648049697787926, "grad_norm": 0.13998275870162277, "learning_rate": 9.982905202368436e-05, "loss": 2.1104, "step": 3405 }, { "epoch": 0.02648827392265984, "grad_norm": 0.1218015990642212, "learning_rate": 9.982895106316574e-05, "loss": 2.1145, "step": 3406 }, { "epoch": 0.02649605086744042, "grad_norm": 0.1079079678045579, "learning_rate": 9.982885007289377e-05, "loss": 2.052, "step": 3407 }, { "epoch": 0.026503827812221003, "grad_norm": 0.1279695574344383, "learning_rate": 9.98287490528685e-05, "loss": 2.0429, "step": 3408 }, { "epoch": 0.026511604757001584, "grad_norm": 0.1251512198949358, "learning_rate": 9.982864800309e-05, "loss": 2.1449, "step": 3409 }, { "epoch": 0.026519381701782165, "grad_norm": 0.11168840889417923, "learning_rate": 9.982854692355832e-05, "loss": 2.0941, "step": 3410 }, { "epoch": 0.026527158646562746, "grad_norm": 0.11879622892011558, "learning_rate": 9.982844581427353e-05, "loss": 2.101, "step": 3411 }, { "epoch": 0.026534935591343328, "grad_norm": 0.12848469640194055, "learning_rate": 9.982834467523567e-05, "loss": 2.1068, "step": 3412 }, { "epoch": 0.02654271253612391, "grad_norm": 0.11553305411414286, "learning_rate": 9.982824350644482e-05, "loss": 2.0974, "step": 3413 }, { "epoch": 0.02655048948090449, "grad_norm": 0.1308447707475083, "learning_rate": 9.982814230790103e-05, "loss": 2.0883, "step": 3414 }, { "epoch": 0.02655826642568507, "grad_norm": 0.15942136101139248, "learning_rate": 9.982804107960438e-05, "loss": 2.1477, "step": 3415 }, { "epoch": 0.026566043370465652, "grad_norm": 0.13013585004329112, "learning_rate": 9.98279398215549e-05, "loss": 2.1153, "step": 3416 }, { "epoch": 0.026573820315246233, "grad_norm": 0.121368461239089, "learning_rate": 9.982783853375267e-05, "loss": 2.1469, "step": 3417 }, { "epoch": 0.026581597260026815, "grad_norm": 0.14098137604245065, "learning_rate": 9.982773721619776e-05, "loss": 2.0915, "step": 3418 }, { "epoch": 0.026589374204807396, "grad_norm": 0.12836312604556466, "learning_rate": 9.98276358688902e-05, "loss": 2.1224, "step": 3419 }, { "epoch": 0.026597151149587977, "grad_norm": 0.111651645734513, "learning_rate": 9.982753449183006e-05, "loss": 2.0749, "step": 3420 }, { "epoch": 0.026604928094368558, "grad_norm": 0.1287817884557003, "learning_rate": 9.982743308501742e-05, "loss": 2.0715, "step": 3421 }, { "epoch": 0.02661270503914914, "grad_norm": 0.12420814869792214, "learning_rate": 9.982733164845234e-05, "loss": 2.0803, "step": 3422 }, { "epoch": 0.02662048198392972, "grad_norm": 0.11205296078489446, "learning_rate": 9.982723018213487e-05, "loss": 2.1564, "step": 3423 }, { "epoch": 0.0266282589287103, "grad_norm": 0.14204477238674856, "learning_rate": 9.982712868606505e-05, "loss": 2.1267, "step": 3424 }, { "epoch": 0.026636035873490883, "grad_norm": 0.11945426841419587, "learning_rate": 9.982702716024299e-05, "loss": 2.1386, "step": 3425 }, { "epoch": 0.026643812818271464, "grad_norm": 0.12404408021744723, "learning_rate": 9.982692560466869e-05, "loss": 2.0916, "step": 3426 }, { "epoch": 0.026651589763052045, "grad_norm": 0.11751694869011574, "learning_rate": 9.982682401934225e-05, "loss": 2.1281, "step": 3427 }, { "epoch": 0.026659366707832626, "grad_norm": 0.11612670427661276, "learning_rate": 9.982672240426374e-05, "loss": 2.1647, "step": 3428 }, { "epoch": 0.026667143652613207, "grad_norm": 0.12494081078491015, "learning_rate": 9.982662075943318e-05, "loss": 2.1345, "step": 3429 }, { "epoch": 0.026674920597393792, "grad_norm": 0.11507413551183567, "learning_rate": 9.982651908485067e-05, "loss": 2.0653, "step": 3430 }, { "epoch": 0.026682697542174373, "grad_norm": 0.12292165937615673, "learning_rate": 9.982641738051625e-05, "loss": 2.0926, "step": 3431 }, { "epoch": 0.026690474486954954, "grad_norm": 0.1455089863960724, "learning_rate": 9.982631564642998e-05, "loss": 2.0331, "step": 3432 }, { "epoch": 0.026698251431735535, "grad_norm": 0.11821123992695586, "learning_rate": 9.982621388259192e-05, "loss": 2.1025, "step": 3433 }, { "epoch": 0.026706028376516117, "grad_norm": 0.11743720452748256, "learning_rate": 9.982611208900216e-05, "loss": 2.0773, "step": 3434 }, { "epoch": 0.026713805321296698, "grad_norm": 0.1307569591004161, "learning_rate": 9.982601026566072e-05, "loss": 2.0922, "step": 3435 }, { "epoch": 0.02672158226607728, "grad_norm": 0.11607465314723565, "learning_rate": 9.982590841256768e-05, "loss": 2.1076, "step": 3436 }, { "epoch": 0.02672935921085786, "grad_norm": 0.11735138009541615, "learning_rate": 9.98258065297231e-05, "loss": 2.1163, "step": 3437 }, { "epoch": 0.02673713615563844, "grad_norm": 0.11602595607161366, "learning_rate": 9.982570461712704e-05, "loss": 2.1337, "step": 3438 }, { "epoch": 0.026744913100419022, "grad_norm": 0.10764185634412099, "learning_rate": 9.982560267477956e-05, "loss": 2.1396, "step": 3439 }, { "epoch": 0.026752690045199604, "grad_norm": 0.12157301765550557, "learning_rate": 9.982550070268073e-05, "loss": 2.1142, "step": 3440 }, { "epoch": 0.026760466989980185, "grad_norm": 0.11255156583188705, "learning_rate": 9.98253987008306e-05, "loss": 2.1699, "step": 3441 }, { "epoch": 0.026768243934760766, "grad_norm": 0.11178399727597511, "learning_rate": 9.982529666922922e-05, "loss": 2.136, "step": 3442 }, { "epoch": 0.026776020879541347, "grad_norm": 0.1201471350939236, "learning_rate": 9.982519460787668e-05, "loss": 2.0911, "step": 3443 }, { "epoch": 0.026783797824321928, "grad_norm": 0.11949497260756593, "learning_rate": 9.982509251677302e-05, "loss": 2.1244, "step": 3444 }, { "epoch": 0.02679157476910251, "grad_norm": 0.1129041747049179, "learning_rate": 9.982499039591831e-05, "loss": 2.1369, "step": 3445 }, { "epoch": 0.02679935171388309, "grad_norm": 0.1111707491238205, "learning_rate": 9.98248882453126e-05, "loss": 2.1561, "step": 3446 }, { "epoch": 0.02680712865866367, "grad_norm": 0.11057925688237573, "learning_rate": 9.982478606495596e-05, "loss": 2.083, "step": 3447 }, { "epoch": 0.026814905603444253, "grad_norm": 0.11360872312883044, "learning_rate": 9.982468385484845e-05, "loss": 2.1163, "step": 3448 }, { "epoch": 0.026822682548224834, "grad_norm": 0.12017376239345295, "learning_rate": 9.982458161499014e-05, "loss": 2.1094, "step": 3449 }, { "epoch": 0.026830459493005415, "grad_norm": 0.11504509998887985, "learning_rate": 9.982447934538107e-05, "loss": 2.1226, "step": 3450 }, { "epoch": 0.026838236437785996, "grad_norm": 0.1147849622018097, "learning_rate": 9.982437704602132e-05, "loss": 2.1392, "step": 3451 }, { "epoch": 0.026846013382566578, "grad_norm": 0.1167580202235435, "learning_rate": 9.982427471691093e-05, "loss": 2.0834, "step": 3452 }, { "epoch": 0.02685379032734716, "grad_norm": 0.11296809419400777, "learning_rate": 9.982417235804999e-05, "loss": 2.0541, "step": 3453 }, { "epoch": 0.02686156727212774, "grad_norm": 0.11582382525559401, "learning_rate": 9.982406996943854e-05, "loss": 2.1035, "step": 3454 }, { "epoch": 0.02686934421690832, "grad_norm": 0.11520581257237439, "learning_rate": 9.982396755107665e-05, "loss": 2.1166, "step": 3455 }, { "epoch": 0.026877121161688902, "grad_norm": 0.1122714990659475, "learning_rate": 9.982386510296437e-05, "loss": 2.1634, "step": 3456 }, { "epoch": 0.026884898106469483, "grad_norm": 0.14876065141312023, "learning_rate": 9.982376262510178e-05, "loss": 2.0775, "step": 3457 }, { "epoch": 0.026892675051250065, "grad_norm": 0.14208531234608218, "learning_rate": 9.982366011748892e-05, "loss": 2.1833, "step": 3458 }, { "epoch": 0.026900451996030646, "grad_norm": 0.14819027510953645, "learning_rate": 9.982355758012587e-05, "loss": 2.1305, "step": 3459 }, { "epoch": 0.02690822894081123, "grad_norm": 0.1979177586095486, "learning_rate": 9.982345501301268e-05, "loss": 2.1076, "step": 3460 }, { "epoch": 0.02691600588559181, "grad_norm": 0.16652845598445914, "learning_rate": 9.982335241614941e-05, "loss": 2.2002, "step": 3461 }, { "epoch": 0.026923782830372393, "grad_norm": 0.12384004276070755, "learning_rate": 9.982324978953612e-05, "loss": 2.0999, "step": 3462 }, { "epoch": 0.026931559775152974, "grad_norm": 0.27225065536087995, "learning_rate": 9.982314713317288e-05, "loss": 2.0714, "step": 3463 }, { "epoch": 0.026939336719933555, "grad_norm": 0.18376328721844823, "learning_rate": 9.982304444705977e-05, "loss": 2.1263, "step": 3464 }, { "epoch": 0.026947113664714136, "grad_norm": 0.14687437659179325, "learning_rate": 9.982294173119682e-05, "loss": 2.0991, "step": 3465 }, { "epoch": 0.026954890609494717, "grad_norm": 0.3112637038772867, "learning_rate": 9.982283898558408e-05, "loss": 2.0864, "step": 3466 }, { "epoch": 0.0269626675542753, "grad_norm": 0.24692926823637065, "learning_rate": 9.982273621022165e-05, "loss": 2.0678, "step": 3467 }, { "epoch": 0.02697044449905588, "grad_norm": 0.14912410217831412, "learning_rate": 9.982263340510957e-05, "loss": 2.1457, "step": 3468 }, { "epoch": 0.02697822144383646, "grad_norm": 0.32153665343317467, "learning_rate": 9.982253057024791e-05, "loss": 2.1298, "step": 3469 }, { "epoch": 0.026985998388617042, "grad_norm": 0.17224689809025165, "learning_rate": 9.982242770563672e-05, "loss": 2.1386, "step": 3470 }, { "epoch": 0.026993775333397623, "grad_norm": 0.2373574044644908, "learning_rate": 9.982232481127605e-05, "loss": 2.1198, "step": 3471 }, { "epoch": 0.027001552278178204, "grad_norm": 0.28457089737527413, "learning_rate": 9.982222188716601e-05, "loss": 2.111, "step": 3472 }, { "epoch": 0.027009329222958785, "grad_norm": 0.1634782086844754, "learning_rate": 9.982211893330662e-05, "loss": 2.1063, "step": 3473 }, { "epoch": 0.027017106167739367, "grad_norm": 0.2374241365562277, "learning_rate": 9.982201594969794e-05, "loss": 2.1453, "step": 3474 }, { "epoch": 0.027024883112519948, "grad_norm": 0.12898517956108194, "learning_rate": 9.982191293634004e-05, "loss": 2.1109, "step": 3475 }, { "epoch": 0.02703266005730053, "grad_norm": 0.26508181601807074, "learning_rate": 9.982180989323301e-05, "loss": 2.0879, "step": 3476 }, { "epoch": 0.02704043700208111, "grad_norm": 0.12215774929791964, "learning_rate": 9.982170682037686e-05, "loss": 2.0957, "step": 3477 }, { "epoch": 0.02704821394686169, "grad_norm": 0.22938704188209827, "learning_rate": 9.98216037177717e-05, "loss": 2.1107, "step": 3478 }, { "epoch": 0.027055990891642272, "grad_norm": 0.17010278031624174, "learning_rate": 9.982150058541756e-05, "loss": 2.1801, "step": 3479 }, { "epoch": 0.027063767836422854, "grad_norm": 0.20682619664480775, "learning_rate": 9.98213974233145e-05, "loss": 2.052, "step": 3480 }, { "epoch": 0.027071544781203435, "grad_norm": 0.2489163173448496, "learning_rate": 9.982129423146263e-05, "loss": 2.1046, "step": 3481 }, { "epoch": 0.027079321725984016, "grad_norm": 0.1714823867872979, "learning_rate": 9.982119100986193e-05, "loss": 2.1499, "step": 3482 }, { "epoch": 0.027087098670764597, "grad_norm": 0.16186539597076388, "learning_rate": 9.982108775851254e-05, "loss": 2.1457, "step": 3483 }, { "epoch": 0.027094875615545178, "grad_norm": 0.1416461688641647, "learning_rate": 9.982098447741447e-05, "loss": 2.1465, "step": 3484 }, { "epoch": 0.02710265256032576, "grad_norm": 0.8169512575103958, "learning_rate": 9.98208811665678e-05, "loss": 2.1191, "step": 3485 }, { "epoch": 0.02711042950510634, "grad_norm": 0.1472505749813246, "learning_rate": 9.982077782597259e-05, "loss": 2.0931, "step": 3486 }, { "epoch": 0.02711820644988692, "grad_norm": 0.2742195496375704, "learning_rate": 9.982067445562891e-05, "loss": 2.1233, "step": 3487 }, { "epoch": 0.027125983394667503, "grad_norm": 0.1463060874087381, "learning_rate": 9.982057105553682e-05, "loss": 2.1127, "step": 3488 }, { "epoch": 0.027133760339448088, "grad_norm": 0.17436323722067992, "learning_rate": 9.982046762569636e-05, "loss": 2.1022, "step": 3489 }, { "epoch": 0.02714153728422867, "grad_norm": 0.26399461265711305, "learning_rate": 9.98203641661076e-05, "loss": 2.0999, "step": 3490 }, { "epoch": 0.02714931422900925, "grad_norm": 0.16054381462244666, "learning_rate": 9.982026067677063e-05, "loss": 2.1236, "step": 3491 }, { "epoch": 0.02715709117378983, "grad_norm": 0.16695094984558703, "learning_rate": 9.98201571576855e-05, "loss": 2.1184, "step": 3492 }, { "epoch": 0.027164868118570412, "grad_norm": 0.14812762863653903, "learning_rate": 9.982005360885224e-05, "loss": 2.124, "step": 3493 }, { "epoch": 0.027172645063350993, "grad_norm": 0.12986125921761146, "learning_rate": 9.981995003027094e-05, "loss": 2.1545, "step": 3494 }, { "epoch": 0.027180422008131574, "grad_norm": 0.25979002270158164, "learning_rate": 9.981984642194167e-05, "loss": 2.1209, "step": 3495 }, { "epoch": 0.027188198952912156, "grad_norm": 0.16541263210095886, "learning_rate": 9.981974278386447e-05, "loss": 2.1685, "step": 3496 }, { "epoch": 0.027195975897692737, "grad_norm": 0.1308803105690571, "learning_rate": 9.98196391160394e-05, "loss": 2.1505, "step": 3497 }, { "epoch": 0.027203752842473318, "grad_norm": 0.14235530322120077, "learning_rate": 9.981953541846655e-05, "loss": 2.1448, "step": 3498 }, { "epoch": 0.0272115297872539, "grad_norm": 0.12734131681158575, "learning_rate": 9.981943169114597e-05, "loss": 2.1486, "step": 3499 }, { "epoch": 0.02721930673203448, "grad_norm": 0.1474006297920618, "learning_rate": 9.98193279340777e-05, "loss": 2.1512, "step": 3500 }, { "epoch": 0.02722708367681506, "grad_norm": 0.12795381649803192, "learning_rate": 9.981922414726181e-05, "loss": 2.1154, "step": 3501 }, { "epoch": 0.027234860621595643, "grad_norm": 0.16125029309442251, "learning_rate": 9.98191203306984e-05, "loss": 2.1294, "step": 3502 }, { "epoch": 0.027242637566376224, "grad_norm": 0.1462965712951179, "learning_rate": 9.981901648438748e-05, "loss": 2.1029, "step": 3503 }, { "epoch": 0.027250414511156805, "grad_norm": 0.14199605100136778, "learning_rate": 9.981891260832914e-05, "loss": 2.0848, "step": 3504 }, { "epoch": 0.027258191455937386, "grad_norm": 0.2896104934046525, "learning_rate": 9.981880870252343e-05, "loss": 2.1146, "step": 3505 }, { "epoch": 0.027265968400717967, "grad_norm": 0.14926845640109385, "learning_rate": 9.981870476697042e-05, "loss": 2.1206, "step": 3506 }, { "epoch": 0.02727374534549855, "grad_norm": 0.13475868026741383, "learning_rate": 9.981860080167019e-05, "loss": 2.0533, "step": 3507 }, { "epoch": 0.02728152229027913, "grad_norm": 0.1803227581777444, "learning_rate": 9.981849680662276e-05, "loss": 2.1208, "step": 3508 }, { "epoch": 0.02728929923505971, "grad_norm": 0.1272019351221647, "learning_rate": 9.981839278182823e-05, "loss": 2.1604, "step": 3509 }, { "epoch": 0.027297076179840292, "grad_norm": 0.14044264539774523, "learning_rate": 9.981828872728663e-05, "loss": 2.1488, "step": 3510 }, { "epoch": 0.027304853124620873, "grad_norm": 0.12448357794477972, "learning_rate": 9.981818464299806e-05, "loss": 2.0917, "step": 3511 }, { "epoch": 0.027312630069401454, "grad_norm": 0.16314100534480755, "learning_rate": 9.981808052896254e-05, "loss": 2.1462, "step": 3512 }, { "epoch": 0.027320407014182035, "grad_norm": 0.12750745556911175, "learning_rate": 9.981797638518016e-05, "loss": 2.1479, "step": 3513 }, { "epoch": 0.027328183958962617, "grad_norm": 0.12642575159575561, "learning_rate": 9.981787221165098e-05, "loss": 2.0857, "step": 3514 }, { "epoch": 0.027335960903743198, "grad_norm": 0.21349240694349533, "learning_rate": 9.981776800837506e-05, "loss": 2.1597, "step": 3515 }, { "epoch": 0.02734373784852378, "grad_norm": 0.11536930669407974, "learning_rate": 9.981766377535246e-05, "loss": 2.1512, "step": 3516 }, { "epoch": 0.02735151479330436, "grad_norm": 0.1418088766081878, "learning_rate": 9.981755951258323e-05, "loss": 2.1511, "step": 3517 }, { "epoch": 0.02735929173808494, "grad_norm": 0.13940974194694133, "learning_rate": 9.981745522006746e-05, "loss": 2.1163, "step": 3518 }, { "epoch": 0.027367068682865526, "grad_norm": 0.4207025289311332, "learning_rate": 9.98173508978052e-05, "loss": 2.1192, "step": 3519 }, { "epoch": 0.027374845627646107, "grad_norm": 0.14056972647085617, "learning_rate": 9.981724654579648e-05, "loss": 2.0905, "step": 3520 }, { "epoch": 0.027382622572426688, "grad_norm": 0.13323400787728726, "learning_rate": 9.981714216404141e-05, "loss": 2.1224, "step": 3521 }, { "epoch": 0.02739039951720727, "grad_norm": 0.1362787348900065, "learning_rate": 9.981703775254004e-05, "loss": 2.1057, "step": 3522 }, { "epoch": 0.02739817646198785, "grad_norm": 0.1334514463212219, "learning_rate": 9.981693331129242e-05, "loss": 2.0807, "step": 3523 }, { "epoch": 0.02740595340676843, "grad_norm": 0.1315388039954107, "learning_rate": 9.981682884029862e-05, "loss": 2.1118, "step": 3524 }, { "epoch": 0.027413730351549013, "grad_norm": 0.15514379952057417, "learning_rate": 9.981672433955869e-05, "loss": 2.1604, "step": 3525 }, { "epoch": 0.027421507296329594, "grad_norm": 0.12848765684373054, "learning_rate": 9.981661980907271e-05, "loss": 2.109, "step": 3526 }, { "epoch": 0.027429284241110175, "grad_norm": 0.13176381744279356, "learning_rate": 9.981651524884075e-05, "loss": 2.1506, "step": 3527 }, { "epoch": 0.027437061185890756, "grad_norm": 0.15503589765188072, "learning_rate": 9.981641065886284e-05, "loss": 2.109, "step": 3528 }, { "epoch": 0.027444838130671338, "grad_norm": 0.12610741516239707, "learning_rate": 9.981630603913907e-05, "loss": 2.1434, "step": 3529 }, { "epoch": 0.02745261507545192, "grad_norm": 0.12558056892749372, "learning_rate": 9.981620138966949e-05, "loss": 2.117, "step": 3530 }, { "epoch": 0.0274603920202325, "grad_norm": 0.4837296024491928, "learning_rate": 9.981609671045416e-05, "loss": 2.1651, "step": 3531 }, { "epoch": 0.02746816896501308, "grad_norm": 0.3101741838107865, "learning_rate": 9.981599200149314e-05, "loss": 2.1086, "step": 3532 }, { "epoch": 0.027475945909793662, "grad_norm": 0.16049308987088315, "learning_rate": 9.981588726278652e-05, "loss": 2.1271, "step": 3533 }, { "epoch": 0.027483722854574243, "grad_norm": 0.14432246598863108, "learning_rate": 9.981578249433433e-05, "loss": 2.1067, "step": 3534 }, { "epoch": 0.027491499799354824, "grad_norm": 0.12439447519943704, "learning_rate": 9.981567769613664e-05, "loss": 2.1109, "step": 3535 }, { "epoch": 0.027499276744135406, "grad_norm": 0.12440477759348555, "learning_rate": 9.981557286819353e-05, "loss": 2.0975, "step": 3536 }, { "epoch": 0.027507053688915987, "grad_norm": 0.14836011524705098, "learning_rate": 9.981546801050506e-05, "loss": 2.1158, "step": 3537 }, { "epoch": 0.027514830633696568, "grad_norm": 0.12895868601900792, "learning_rate": 9.981536312307128e-05, "loss": 2.1104, "step": 3538 }, { "epoch": 0.02752260757847715, "grad_norm": 0.18993363949305928, "learning_rate": 9.981525820589223e-05, "loss": 2.1067, "step": 3539 }, { "epoch": 0.02753038452325773, "grad_norm": 0.1520957619486266, "learning_rate": 9.981515325896802e-05, "loss": 2.1117, "step": 3540 }, { "epoch": 0.02753816146803831, "grad_norm": 0.12242993011724196, "learning_rate": 9.98150482822987e-05, "loss": 2.1266, "step": 3541 }, { "epoch": 0.027545938412818893, "grad_norm": 0.1612058345817574, "learning_rate": 9.98149432758843e-05, "loss": 2.1974, "step": 3542 }, { "epoch": 0.027553715357599474, "grad_norm": 0.12751476909420906, "learning_rate": 9.981483823972492e-05, "loss": 2.0901, "step": 3543 }, { "epoch": 0.027561492302380055, "grad_norm": 0.14391033037079368, "learning_rate": 9.981473317382062e-05, "loss": 2.121, "step": 3544 }, { "epoch": 0.027569269247160636, "grad_norm": 0.1611126646430432, "learning_rate": 9.981462807817144e-05, "loss": 2.1204, "step": 3545 }, { "epoch": 0.027577046191941217, "grad_norm": 0.11476123498146126, "learning_rate": 9.981452295277744e-05, "loss": 2.1027, "step": 3546 }, { "epoch": 0.0275848231367218, "grad_norm": 0.1273658953563082, "learning_rate": 9.981441779763872e-05, "loss": 2.1324, "step": 3547 }, { "epoch": 0.02759260008150238, "grad_norm": 0.12575190989526316, "learning_rate": 9.98143126127553e-05, "loss": 2.1742, "step": 3548 }, { "epoch": 0.027600377026282964, "grad_norm": 0.12639866809325753, "learning_rate": 9.981420739812728e-05, "loss": 2.1202, "step": 3549 }, { "epoch": 0.027608153971063545, "grad_norm": 0.13022384721832203, "learning_rate": 9.981410215375471e-05, "loss": 2.1306, "step": 3550 }, { "epoch": 0.027615930915844127, "grad_norm": 0.11430555700241916, "learning_rate": 9.981399687963764e-05, "loss": 2.1323, "step": 3551 }, { "epoch": 0.027623707860624708, "grad_norm": 0.1500144467329738, "learning_rate": 9.981389157577614e-05, "loss": 2.1182, "step": 3552 }, { "epoch": 0.02763148480540529, "grad_norm": 0.12505206422234072, "learning_rate": 9.981378624217028e-05, "loss": 2.1169, "step": 3553 }, { "epoch": 0.02763926175018587, "grad_norm": 0.11528272350063602, "learning_rate": 9.981368087882011e-05, "loss": 2.1086, "step": 3554 }, { "epoch": 0.02764703869496645, "grad_norm": 0.12119155655001175, "learning_rate": 9.98135754857257e-05, "loss": 2.1724, "step": 3555 }, { "epoch": 0.027654815639747032, "grad_norm": 0.11449991470395315, "learning_rate": 9.981347006288712e-05, "loss": 2.0901, "step": 3556 }, { "epoch": 0.027662592584527614, "grad_norm": 0.1285003898176304, "learning_rate": 9.981336461030442e-05, "loss": 2.1275, "step": 3557 }, { "epoch": 0.027670369529308195, "grad_norm": 0.11298957362065104, "learning_rate": 9.981325912797768e-05, "loss": 2.0858, "step": 3558 }, { "epoch": 0.027678146474088776, "grad_norm": 0.11777682768680071, "learning_rate": 9.981315361590694e-05, "loss": 2.0954, "step": 3559 }, { "epoch": 0.027685923418869357, "grad_norm": 0.11276702181370757, "learning_rate": 9.981304807409229e-05, "loss": 2.1072, "step": 3560 }, { "epoch": 0.027693700363649938, "grad_norm": 0.11659085607621597, "learning_rate": 9.981294250253377e-05, "loss": 2.0621, "step": 3561 }, { "epoch": 0.02770147730843052, "grad_norm": 0.11616685586196633, "learning_rate": 9.981283690123145e-05, "loss": 2.0884, "step": 3562 }, { "epoch": 0.0277092542532111, "grad_norm": 0.10872810139802794, "learning_rate": 9.981273127018538e-05, "loss": 2.0876, "step": 3563 }, { "epoch": 0.02771703119799168, "grad_norm": 0.11429559427016495, "learning_rate": 9.981262560939565e-05, "loss": 2.1619, "step": 3564 }, { "epoch": 0.027724808142772263, "grad_norm": 0.11682181913708517, "learning_rate": 9.981251991886232e-05, "loss": 2.0505, "step": 3565 }, { "epoch": 0.027732585087552844, "grad_norm": 0.1222726972754923, "learning_rate": 9.981241419858544e-05, "loss": 2.1133, "step": 3566 }, { "epoch": 0.027740362032333425, "grad_norm": 0.12401108673294339, "learning_rate": 9.981230844856506e-05, "loss": 2.0745, "step": 3567 }, { "epoch": 0.027748138977114006, "grad_norm": 0.11552428563007353, "learning_rate": 9.981220266880128e-05, "loss": 2.0546, "step": 3568 }, { "epoch": 0.027755915921894588, "grad_norm": 0.11923399732614359, "learning_rate": 9.981209685929413e-05, "loss": 2.1, "step": 3569 }, { "epoch": 0.02776369286667517, "grad_norm": 0.11423559198754453, "learning_rate": 9.98119910200437e-05, "loss": 2.1573, "step": 3570 }, { "epoch": 0.02777146981145575, "grad_norm": 0.11638894896950677, "learning_rate": 9.981188515105003e-05, "loss": 2.1206, "step": 3571 }, { "epoch": 0.02777924675623633, "grad_norm": 0.1180696669055151, "learning_rate": 9.98117792523132e-05, "loss": 2.1197, "step": 3572 }, { "epoch": 0.027787023701016912, "grad_norm": 0.11521406981789567, "learning_rate": 9.981167332383325e-05, "loss": 2.1258, "step": 3573 }, { "epoch": 0.027794800645797493, "grad_norm": 0.12719731633076606, "learning_rate": 9.981156736561027e-05, "loss": 2.1312, "step": 3574 }, { "epoch": 0.027802577590578074, "grad_norm": 0.13103960735180303, "learning_rate": 9.981146137764431e-05, "loss": 2.0905, "step": 3575 }, { "epoch": 0.027810354535358656, "grad_norm": 0.11002819810824951, "learning_rate": 9.981135535993544e-05, "loss": 2.0854, "step": 3576 }, { "epoch": 0.027818131480139237, "grad_norm": 0.12366731061984967, "learning_rate": 9.981124931248371e-05, "loss": 2.1144, "step": 3577 }, { "epoch": 0.027825908424919818, "grad_norm": 0.14727114284406853, "learning_rate": 9.98111432352892e-05, "loss": 2.0832, "step": 3578 }, { "epoch": 0.027833685369700403, "grad_norm": 0.12281527615715444, "learning_rate": 9.981103712835196e-05, "loss": 2.1166, "step": 3579 }, { "epoch": 0.027841462314480984, "grad_norm": 0.12967885125327575, "learning_rate": 9.981093099167207e-05, "loss": 2.0618, "step": 3580 }, { "epoch": 0.027849239259261565, "grad_norm": 0.11358137307919765, "learning_rate": 9.981082482524957e-05, "loss": 2.1404, "step": 3581 }, { "epoch": 0.027857016204042146, "grad_norm": 0.13246411029158306, "learning_rate": 9.981071862908454e-05, "loss": 2.0682, "step": 3582 }, { "epoch": 0.027864793148822727, "grad_norm": 0.11969915946737013, "learning_rate": 9.981061240317703e-05, "loss": 2.0815, "step": 3583 }, { "epoch": 0.02787257009360331, "grad_norm": 0.12229290603524212, "learning_rate": 9.981050614752714e-05, "loss": 2.069, "step": 3584 }, { "epoch": 0.02788034703838389, "grad_norm": 0.15681963744424948, "learning_rate": 9.981039986213488e-05, "loss": 2.1262, "step": 3585 }, { "epoch": 0.02788812398316447, "grad_norm": 0.1254148766515382, "learning_rate": 9.981029354700035e-05, "loss": 2.1003, "step": 3586 }, { "epoch": 0.027895900927945052, "grad_norm": 0.14867286816947844, "learning_rate": 9.98101872021236e-05, "loss": 2.1177, "step": 3587 }, { "epoch": 0.027903677872725633, "grad_norm": 0.15616124560223457, "learning_rate": 9.981008082750469e-05, "loss": 2.0646, "step": 3588 }, { "epoch": 0.027911454817506214, "grad_norm": 0.11766324969720512, "learning_rate": 9.98099744231437e-05, "loss": 2.0777, "step": 3589 }, { "epoch": 0.027919231762286795, "grad_norm": 0.1394655072879839, "learning_rate": 9.980986798904069e-05, "loss": 2.0919, "step": 3590 }, { "epoch": 0.027927008707067377, "grad_norm": 0.12050461654865198, "learning_rate": 9.98097615251957e-05, "loss": 2.1194, "step": 3591 }, { "epoch": 0.027934785651847958, "grad_norm": 0.12505440819453265, "learning_rate": 9.980965503160882e-05, "loss": 2.1054, "step": 3592 }, { "epoch": 0.02794256259662854, "grad_norm": 0.12596012728179226, "learning_rate": 9.98095485082801e-05, "loss": 2.0904, "step": 3593 }, { "epoch": 0.02795033954140912, "grad_norm": 0.11364175586238753, "learning_rate": 9.980944195520962e-05, "loss": 2.0737, "step": 3594 }, { "epoch": 0.0279581164861897, "grad_norm": 0.12717822738937443, "learning_rate": 9.980933537239742e-05, "loss": 2.1317, "step": 3595 }, { "epoch": 0.027965893430970282, "grad_norm": 0.11025372854127144, "learning_rate": 9.980922875984358e-05, "loss": 2.1227, "step": 3596 }, { "epoch": 0.027973670375750864, "grad_norm": 0.12869014037121135, "learning_rate": 9.980912211754815e-05, "loss": 2.0986, "step": 3597 }, { "epoch": 0.027981447320531445, "grad_norm": 0.12237898677699616, "learning_rate": 9.98090154455112e-05, "loss": 2.1509, "step": 3598 }, { "epoch": 0.027989224265312026, "grad_norm": 0.11144995907173603, "learning_rate": 9.980890874373281e-05, "loss": 2.1327, "step": 3599 }, { "epoch": 0.027997001210092607, "grad_norm": 0.12755116412240797, "learning_rate": 9.980880201221303e-05, "loss": 2.1324, "step": 3600 }, { "epoch": 0.028004778154873188, "grad_norm": 0.21077956418026755, "learning_rate": 9.98086952509519e-05, "loss": 2.133, "step": 3601 }, { "epoch": 0.02801255509965377, "grad_norm": 0.1236082912331468, "learning_rate": 9.980858845994954e-05, "loss": 2.0802, "step": 3602 }, { "epoch": 0.02802033204443435, "grad_norm": 0.12251649509267161, "learning_rate": 9.980848163920597e-05, "loss": 2.1237, "step": 3603 }, { "epoch": 0.02802810898921493, "grad_norm": 0.12070917575823323, "learning_rate": 9.980837478872125e-05, "loss": 2.0782, "step": 3604 }, { "epoch": 0.028035885933995513, "grad_norm": 0.10976786357478571, "learning_rate": 9.980826790849548e-05, "loss": 2.1455, "step": 3605 }, { "epoch": 0.028043662878776094, "grad_norm": 0.13184422679753374, "learning_rate": 9.98081609985287e-05, "loss": 2.1299, "step": 3606 }, { "epoch": 0.028051439823556675, "grad_norm": 0.1154155473110161, "learning_rate": 9.980805405882097e-05, "loss": 2.1398, "step": 3607 }, { "epoch": 0.02805921676833726, "grad_norm": 0.11510357420242305, "learning_rate": 9.980794708937234e-05, "loss": 2.0625, "step": 3608 }, { "epoch": 0.02806699371311784, "grad_norm": 0.111169471919311, "learning_rate": 9.980784009018291e-05, "loss": 2.0633, "step": 3609 }, { "epoch": 0.028074770657898422, "grad_norm": 0.11385899153610517, "learning_rate": 9.980773306125273e-05, "loss": 2.0688, "step": 3610 }, { "epoch": 0.028082547602679003, "grad_norm": 0.12064549513063098, "learning_rate": 9.980762600258187e-05, "loss": 2.1143, "step": 3611 }, { "epoch": 0.028090324547459584, "grad_norm": 0.17251220331650952, "learning_rate": 9.980751891417038e-05, "loss": 2.1808, "step": 3612 }, { "epoch": 0.028098101492240166, "grad_norm": 0.14803001114841383, "learning_rate": 9.980741179601833e-05, "loss": 2.1012, "step": 3613 }, { "epoch": 0.028105878437020747, "grad_norm": 0.12594341109971563, "learning_rate": 9.980730464812578e-05, "loss": 2.1356, "step": 3614 }, { "epoch": 0.028113655381801328, "grad_norm": 0.11288268201584391, "learning_rate": 9.980719747049281e-05, "loss": 2.1752, "step": 3615 }, { "epoch": 0.02812143232658191, "grad_norm": 0.11897593541990925, "learning_rate": 9.980709026311945e-05, "loss": 2.1178, "step": 3616 }, { "epoch": 0.02812920927136249, "grad_norm": 0.1181885420592771, "learning_rate": 9.98069830260058e-05, "loss": 2.0374, "step": 3617 }, { "epoch": 0.02813698621614307, "grad_norm": 0.32355476004424927, "learning_rate": 9.980687575915191e-05, "loss": 2.0955, "step": 3618 }, { "epoch": 0.028144763160923653, "grad_norm": 0.16632347269633443, "learning_rate": 9.980676846255783e-05, "loss": 2.104, "step": 3619 }, { "epoch": 0.028152540105704234, "grad_norm": 0.15018639744592, "learning_rate": 9.980666113622366e-05, "loss": 2.1492, "step": 3620 }, { "epoch": 0.028160317050484815, "grad_norm": 0.12431878662865303, "learning_rate": 9.980655378014942e-05, "loss": 2.1296, "step": 3621 }, { "epoch": 0.028168093995265396, "grad_norm": 0.12442098781843314, "learning_rate": 9.98064463943352e-05, "loss": 2.1464, "step": 3622 }, { "epoch": 0.028175870940045977, "grad_norm": 0.12740932451941978, "learning_rate": 9.980633897878109e-05, "loss": 2.174, "step": 3623 }, { "epoch": 0.02818364788482656, "grad_norm": 0.12317000535074638, "learning_rate": 9.98062315334871e-05, "loss": 2.1491, "step": 3624 }, { "epoch": 0.02819142482960714, "grad_norm": 0.12726828690765166, "learning_rate": 9.980612405845332e-05, "loss": 2.1369, "step": 3625 }, { "epoch": 0.02819920177438772, "grad_norm": 0.1260407133316724, "learning_rate": 9.98060165536798e-05, "loss": 2.089, "step": 3626 }, { "epoch": 0.028206978719168302, "grad_norm": 0.12328339974840345, "learning_rate": 9.980590901916663e-05, "loss": 2.0745, "step": 3627 }, { "epoch": 0.028214755663948883, "grad_norm": 0.124780673989131, "learning_rate": 9.980580145491387e-05, "loss": 2.0686, "step": 3628 }, { "epoch": 0.028222532608729464, "grad_norm": 0.1496373799749933, "learning_rate": 9.980569386092156e-05, "loss": 2.1602, "step": 3629 }, { "epoch": 0.028230309553510045, "grad_norm": 0.13740711588631513, "learning_rate": 9.980558623718978e-05, "loss": 2.1138, "step": 3630 }, { "epoch": 0.028238086498290627, "grad_norm": 0.13859067288355403, "learning_rate": 9.98054785837186e-05, "loss": 2.072, "step": 3631 }, { "epoch": 0.028245863443071208, "grad_norm": 0.1416167324576331, "learning_rate": 9.980537090050807e-05, "loss": 2.1091, "step": 3632 }, { "epoch": 0.02825364038785179, "grad_norm": 0.11011937440202128, "learning_rate": 9.980526318755828e-05, "loss": 2.1226, "step": 3633 }, { "epoch": 0.02826141733263237, "grad_norm": 0.138816476714177, "learning_rate": 9.980515544486925e-05, "loss": 2.1132, "step": 3634 }, { "epoch": 0.02826919427741295, "grad_norm": 0.28133674741458886, "learning_rate": 9.980504767244109e-05, "loss": 2.1437, "step": 3635 }, { "epoch": 0.028276971222193532, "grad_norm": 0.1111546828276917, "learning_rate": 9.980493987027385e-05, "loss": 2.1121, "step": 3636 }, { "epoch": 0.028284748166974114, "grad_norm": 0.1478820024087962, "learning_rate": 9.980483203836757e-05, "loss": 2.1399, "step": 3637 }, { "epoch": 0.028292525111754698, "grad_norm": 0.13275643002563225, "learning_rate": 9.980472417672235e-05, "loss": 2.1163, "step": 3638 }, { "epoch": 0.02830030205653528, "grad_norm": 0.12416513507210603, "learning_rate": 9.980461628533824e-05, "loss": 2.1971, "step": 3639 }, { "epoch": 0.02830807900131586, "grad_norm": 0.16107508684071473, "learning_rate": 9.98045083642153e-05, "loss": 2.1352, "step": 3640 }, { "epoch": 0.02831585594609644, "grad_norm": 0.12029455436039053, "learning_rate": 9.980440041335359e-05, "loss": 2.1481, "step": 3641 }, { "epoch": 0.028323632890877023, "grad_norm": 0.14968374134890494, "learning_rate": 9.980429243275319e-05, "loss": 2.149, "step": 3642 }, { "epoch": 0.028331409835657604, "grad_norm": 0.13958814941133132, "learning_rate": 9.980418442241414e-05, "loss": 2.111, "step": 3643 }, { "epoch": 0.028339186780438185, "grad_norm": 0.1176821134803858, "learning_rate": 9.980407638233654e-05, "loss": 2.1479, "step": 3644 }, { "epoch": 0.028346963725218766, "grad_norm": 0.12670018197235222, "learning_rate": 9.980396831252044e-05, "loss": 2.0797, "step": 3645 }, { "epoch": 0.028354740669999347, "grad_norm": 0.14125379520604855, "learning_rate": 9.980386021296589e-05, "loss": 2.1079, "step": 3646 }, { "epoch": 0.02836251761477993, "grad_norm": 0.1543845173116536, "learning_rate": 9.980375208367295e-05, "loss": 2.0882, "step": 3647 }, { "epoch": 0.02837029455956051, "grad_norm": 0.12761137602025835, "learning_rate": 9.980364392464172e-05, "loss": 2.0719, "step": 3648 }, { "epoch": 0.02837807150434109, "grad_norm": 0.118215116037176, "learning_rate": 9.980353573587223e-05, "loss": 2.1028, "step": 3649 }, { "epoch": 0.028385848449121672, "grad_norm": 0.11894330752990329, "learning_rate": 9.980342751736456e-05, "loss": 2.072, "step": 3650 }, { "epoch": 0.028393625393902253, "grad_norm": 0.1163243267028064, "learning_rate": 9.980331926911879e-05, "loss": 2.1254, "step": 3651 }, { "epoch": 0.028401402338682834, "grad_norm": 0.12049430529414128, "learning_rate": 9.980321099113494e-05, "loss": 2.1199, "step": 3652 }, { "epoch": 0.028409179283463416, "grad_norm": 0.12027716635259172, "learning_rate": 9.980310268341312e-05, "loss": 2.0514, "step": 3653 }, { "epoch": 0.028416956228243997, "grad_norm": 0.12060259210659466, "learning_rate": 9.980299434595337e-05, "loss": 2.1652, "step": 3654 }, { "epoch": 0.028424733173024578, "grad_norm": 0.48254323102767055, "learning_rate": 9.980288597875576e-05, "loss": 2.1098, "step": 3655 }, { "epoch": 0.02843251011780516, "grad_norm": 0.12676175434688833, "learning_rate": 9.980277758182038e-05, "loss": 2.1472, "step": 3656 }, { "epoch": 0.02844028706258574, "grad_norm": 0.15778252048835914, "learning_rate": 9.980266915514724e-05, "loss": 2.0798, "step": 3657 }, { "epoch": 0.02844806400736632, "grad_norm": 0.12561660578303366, "learning_rate": 9.980256069873644e-05, "loss": 2.1299, "step": 3658 }, { "epoch": 0.028455840952146903, "grad_norm": 0.1242479717775548, "learning_rate": 9.980245221258805e-05, "loss": 2.1105, "step": 3659 }, { "epoch": 0.028463617896927484, "grad_norm": 0.13689180170550466, "learning_rate": 9.980234369670211e-05, "loss": 2.1378, "step": 3660 }, { "epoch": 0.028471394841708065, "grad_norm": 0.1303684003399269, "learning_rate": 9.980223515107872e-05, "loss": 2.1821, "step": 3661 }, { "epoch": 0.028479171786488646, "grad_norm": 0.1277298673507427, "learning_rate": 9.980212657571793e-05, "loss": 2.1276, "step": 3662 }, { "epoch": 0.028486948731269227, "grad_norm": 0.14052714561534632, "learning_rate": 9.980201797061977e-05, "loss": 2.1149, "step": 3663 }, { "epoch": 0.02849472567604981, "grad_norm": 0.15293322055110292, "learning_rate": 9.980190933578435e-05, "loss": 2.1218, "step": 3664 }, { "epoch": 0.02850250262083039, "grad_norm": 0.13224568888528662, "learning_rate": 9.980180067121172e-05, "loss": 2.0994, "step": 3665 }, { "epoch": 0.02851027956561097, "grad_norm": 0.12904955431668721, "learning_rate": 9.980169197690194e-05, "loss": 2.0695, "step": 3666 }, { "epoch": 0.028518056510391552, "grad_norm": 0.15822797264417846, "learning_rate": 9.980158325285509e-05, "loss": 2.109, "step": 3667 }, { "epoch": 0.028525833455172137, "grad_norm": 0.14567121701105068, "learning_rate": 9.980147449907122e-05, "loss": 2.1312, "step": 3668 }, { "epoch": 0.028533610399952718, "grad_norm": 0.11888001137176885, "learning_rate": 9.980136571555039e-05, "loss": 2.1406, "step": 3669 }, { "epoch": 0.0285413873447333, "grad_norm": 0.4942713359865094, "learning_rate": 9.980125690229268e-05, "loss": 2.1569, "step": 3670 }, { "epoch": 0.02854916428951388, "grad_norm": 0.13743296634034477, "learning_rate": 9.980114805929815e-05, "loss": 2.1306, "step": 3671 }, { "epoch": 0.02855694123429446, "grad_norm": 0.11585677226965986, "learning_rate": 9.980103918656688e-05, "loss": 2.1443, "step": 3672 }, { "epoch": 0.028564718179075042, "grad_norm": 0.134264991360884, "learning_rate": 9.980093028409889e-05, "loss": 2.1007, "step": 3673 }, { "epoch": 0.028572495123855624, "grad_norm": 0.15903729949519332, "learning_rate": 9.980082135189428e-05, "loss": 2.1161, "step": 3674 }, { "epoch": 0.028580272068636205, "grad_norm": 0.1342582478209204, "learning_rate": 9.980071238995313e-05, "loss": 2.1244, "step": 3675 }, { "epoch": 0.028588049013416786, "grad_norm": 0.13604357776011325, "learning_rate": 9.980060339827546e-05, "loss": 2.1272, "step": 3676 }, { "epoch": 0.028595825958197367, "grad_norm": 0.19554177034847828, "learning_rate": 9.980049437686136e-05, "loss": 2.1256, "step": 3677 }, { "epoch": 0.028603602902977948, "grad_norm": 0.17357241816083482, "learning_rate": 9.98003853257109e-05, "loss": 2.1357, "step": 3678 }, { "epoch": 0.02861137984775853, "grad_norm": 0.1208907756744858, "learning_rate": 9.980027624482415e-05, "loss": 2.1133, "step": 3679 }, { "epoch": 0.02861915679253911, "grad_norm": 0.17595450871607582, "learning_rate": 9.980016713420115e-05, "loss": 2.0718, "step": 3680 }, { "epoch": 0.02862693373731969, "grad_norm": 0.17010810087058909, "learning_rate": 9.980005799384197e-05, "loss": 2.0683, "step": 3681 }, { "epoch": 0.028634710682100273, "grad_norm": 0.11966069081826106, "learning_rate": 9.979994882374671e-05, "loss": 2.0947, "step": 3682 }, { "epoch": 0.028642487626880854, "grad_norm": 0.17036546122549837, "learning_rate": 9.979983962391541e-05, "loss": 2.0733, "step": 3683 }, { "epoch": 0.028650264571661435, "grad_norm": 0.17120581729500833, "learning_rate": 9.979973039434811e-05, "loss": 2.0846, "step": 3684 }, { "epoch": 0.028658041516442016, "grad_norm": 0.11508320340487468, "learning_rate": 9.979962113504491e-05, "loss": 2.0505, "step": 3685 }, { "epoch": 0.028665818461222597, "grad_norm": 0.189400175073648, "learning_rate": 9.979951184600587e-05, "loss": 2.0746, "step": 3686 }, { "epoch": 0.02867359540600318, "grad_norm": 0.15720471599462493, "learning_rate": 9.979940252723105e-05, "loss": 2.0814, "step": 3687 }, { "epoch": 0.02868137235078376, "grad_norm": 0.1155216595013773, "learning_rate": 9.979929317872052e-05, "loss": 2.1279, "step": 3688 }, { "epoch": 0.02868914929556434, "grad_norm": 0.30677676462089254, "learning_rate": 9.979918380047435e-05, "loss": 2.1609, "step": 3689 }, { "epoch": 0.028696926240344922, "grad_norm": 0.19147521019291625, "learning_rate": 9.979907439249258e-05, "loss": 2.1185, "step": 3690 }, { "epoch": 0.028704703185125503, "grad_norm": 0.9444017625174832, "learning_rate": 9.97989649547753e-05, "loss": 2.1142, "step": 3691 }, { "epoch": 0.028712480129906084, "grad_norm": 0.3521330805203631, "learning_rate": 9.979885548732257e-05, "loss": 2.1426, "step": 3692 }, { "epoch": 0.028720257074686666, "grad_norm": 0.1386288477117267, "learning_rate": 9.979874599013443e-05, "loss": 2.095, "step": 3693 }, { "epoch": 0.028728034019467247, "grad_norm": 0.27832148781556926, "learning_rate": 9.979863646321099e-05, "loss": 2.1051, "step": 3694 }, { "epoch": 0.028735810964247828, "grad_norm": 0.16687074642318422, "learning_rate": 9.97985269065523e-05, "loss": 2.0603, "step": 3695 }, { "epoch": 0.02874358790902841, "grad_norm": 0.15649338289465947, "learning_rate": 9.97984173201584e-05, "loss": 2.1241, "step": 3696 }, { "epoch": 0.028751364853808994, "grad_norm": 0.2431275800831902, "learning_rate": 9.979830770402938e-05, "loss": 2.129, "step": 3697 }, { "epoch": 0.028759141798589575, "grad_norm": 0.1716479465815719, "learning_rate": 9.979819805816531e-05, "loss": 2.0992, "step": 3698 }, { "epoch": 0.028766918743370156, "grad_norm": 0.14382818143507473, "learning_rate": 9.979808838256624e-05, "loss": 2.1277, "step": 3699 }, { "epoch": 0.028774695688150737, "grad_norm": 0.15466684460949823, "learning_rate": 9.979797867723224e-05, "loss": 2.1036, "step": 3700 }, { "epoch": 0.02878247263293132, "grad_norm": 0.8639829960212512, "learning_rate": 9.979786894216336e-05, "loss": 2.0993, "step": 3701 }, { "epoch": 0.0287902495777119, "grad_norm": 0.1516210410077584, "learning_rate": 9.979775917735971e-05, "loss": 2.1963, "step": 3702 }, { "epoch": 0.02879802652249248, "grad_norm": 0.14703032498436017, "learning_rate": 9.979764938282131e-05, "loss": 2.0893, "step": 3703 }, { "epoch": 0.028805803467273062, "grad_norm": 0.14410148494104258, "learning_rate": 9.979753955854825e-05, "loss": 2.1135, "step": 3704 }, { "epoch": 0.028813580412053643, "grad_norm": 0.14204094828944822, "learning_rate": 9.979742970454058e-05, "loss": 2.1631, "step": 3705 }, { "epoch": 0.028821357356834224, "grad_norm": 0.13973824294561055, "learning_rate": 9.979731982079839e-05, "loss": 2.1131, "step": 3706 }, { "epoch": 0.028829134301614805, "grad_norm": 0.13796022525770324, "learning_rate": 9.979720990732172e-05, "loss": 2.0884, "step": 3707 }, { "epoch": 0.028836911246395387, "grad_norm": 0.14110256087902723, "learning_rate": 9.979709996411065e-05, "loss": 2.0749, "step": 3708 }, { "epoch": 0.028844688191175968, "grad_norm": 0.12909364098183074, "learning_rate": 9.979698999116522e-05, "loss": 2.1025, "step": 3709 }, { "epoch": 0.02885246513595655, "grad_norm": 0.3601085381270825, "learning_rate": 9.979687998848553e-05, "loss": 2.13, "step": 3710 }, { "epoch": 0.02886024208073713, "grad_norm": 0.13870531543532613, "learning_rate": 9.979676995607163e-05, "loss": 2.1025, "step": 3711 }, { "epoch": 0.02886801902551771, "grad_norm": 0.11680780018238347, "learning_rate": 9.97966598939236e-05, "loss": 2.0608, "step": 3712 }, { "epoch": 0.028875795970298292, "grad_norm": 0.12140562217065455, "learning_rate": 9.979654980204149e-05, "loss": 2.0845, "step": 3713 }, { "epoch": 0.028883572915078874, "grad_norm": 0.12910395601412641, "learning_rate": 9.979643968042536e-05, "loss": 2.1192, "step": 3714 }, { "epoch": 0.028891349859859455, "grad_norm": 0.135642632097994, "learning_rate": 9.97963295290753e-05, "loss": 2.0815, "step": 3715 }, { "epoch": 0.028899126804640036, "grad_norm": 0.15099031988996814, "learning_rate": 9.979621934799135e-05, "loss": 2.1319, "step": 3716 }, { "epoch": 0.028906903749420617, "grad_norm": 0.13840576326469228, "learning_rate": 9.979610913717359e-05, "loss": 2.1021, "step": 3717 }, { "epoch": 0.028914680694201198, "grad_norm": 0.13705416487311162, "learning_rate": 9.979599889662208e-05, "loss": 2.1012, "step": 3718 }, { "epoch": 0.02892245763898178, "grad_norm": 0.12674186414380934, "learning_rate": 9.979588862633689e-05, "loss": 2.124, "step": 3719 }, { "epoch": 0.02893023458376236, "grad_norm": 0.14705819954054433, "learning_rate": 9.979577832631808e-05, "loss": 2.1137, "step": 3720 }, { "epoch": 0.02893801152854294, "grad_norm": 0.13320765930745218, "learning_rate": 9.979566799656573e-05, "loss": 2.1096, "step": 3721 }, { "epoch": 0.028945788473323523, "grad_norm": 0.12476384023042852, "learning_rate": 9.979555763707988e-05, "loss": 2.134, "step": 3722 }, { "epoch": 0.028953565418104104, "grad_norm": 0.1365435431526017, "learning_rate": 9.979544724786063e-05, "loss": 2.1064, "step": 3723 }, { "epoch": 0.028961342362884685, "grad_norm": 0.1500717278930907, "learning_rate": 9.979533682890803e-05, "loss": 2.0551, "step": 3724 }, { "epoch": 0.028969119307665266, "grad_norm": 0.14499977109613407, "learning_rate": 9.979522638022212e-05, "loss": 2.1254, "step": 3725 }, { "epoch": 0.028976896252445847, "grad_norm": 0.12843758902799052, "learning_rate": 9.979511590180301e-05, "loss": 2.126, "step": 3726 }, { "epoch": 0.028984673197226432, "grad_norm": 0.24838258959101575, "learning_rate": 9.979500539365074e-05, "loss": 2.1328, "step": 3727 }, { "epoch": 0.028992450142007013, "grad_norm": 0.16062892349636448, "learning_rate": 9.979489485576539e-05, "loss": 2.1144, "step": 3728 }, { "epoch": 0.029000227086787594, "grad_norm": 0.13982878861675835, "learning_rate": 9.979478428814699e-05, "loss": 2.1113, "step": 3729 }, { "epoch": 0.029008004031568176, "grad_norm": 0.14111320726206505, "learning_rate": 9.979467369079565e-05, "loss": 2.1094, "step": 3730 }, { "epoch": 0.029015780976348757, "grad_norm": 0.17369005579989055, "learning_rate": 9.979456306371143e-05, "loss": 2.1441, "step": 3731 }, { "epoch": 0.029023557921129338, "grad_norm": 0.204848793254881, "learning_rate": 9.979445240689438e-05, "loss": 2.1381, "step": 3732 }, { "epoch": 0.02903133486590992, "grad_norm": 0.1698924891166193, "learning_rate": 9.979434172034457e-05, "loss": 2.0933, "step": 3733 }, { "epoch": 0.0290391118106905, "grad_norm": 0.12002154887602463, "learning_rate": 9.979423100406207e-05, "loss": 2.0699, "step": 3734 }, { "epoch": 0.02904688875547108, "grad_norm": 0.23754506629616448, "learning_rate": 9.979412025804694e-05, "loss": 2.1164, "step": 3735 }, { "epoch": 0.029054665700251663, "grad_norm": 0.13837079959966306, "learning_rate": 9.979400948229925e-05, "loss": 2.1078, "step": 3736 }, { "epoch": 0.029062442645032244, "grad_norm": 0.13970038682510502, "learning_rate": 9.979389867681908e-05, "loss": 2.1363, "step": 3737 }, { "epoch": 0.029070219589812825, "grad_norm": 0.11360965704946281, "learning_rate": 9.979378784160647e-05, "loss": 2.142, "step": 3738 }, { "epoch": 0.029077996534593406, "grad_norm": 0.12947640295311735, "learning_rate": 9.979367697666148e-05, "loss": 2.1036, "step": 3739 }, { "epoch": 0.029085773479373987, "grad_norm": 0.14094691333285186, "learning_rate": 9.979356608198422e-05, "loss": 2.0891, "step": 3740 }, { "epoch": 0.02909355042415457, "grad_norm": 0.11464256734423277, "learning_rate": 9.979345515757473e-05, "loss": 2.0951, "step": 3741 }, { "epoch": 0.02910132736893515, "grad_norm": 0.1169056015293394, "learning_rate": 9.979334420343307e-05, "loss": 2.1164, "step": 3742 }, { "epoch": 0.02910910431371573, "grad_norm": 0.11803790333395227, "learning_rate": 9.979323321955932e-05, "loss": 2.0529, "step": 3743 }, { "epoch": 0.029116881258496312, "grad_norm": 0.11649463398840464, "learning_rate": 9.979312220595354e-05, "loss": 2.0682, "step": 3744 }, { "epoch": 0.029124658203276893, "grad_norm": 0.1271156275638468, "learning_rate": 9.979301116261579e-05, "loss": 2.1367, "step": 3745 }, { "epoch": 0.029132435148057474, "grad_norm": 0.4311896195222985, "learning_rate": 9.979290008954615e-05, "loss": 2.0709, "step": 3746 }, { "epoch": 0.029140212092838055, "grad_norm": 0.11653866126619769, "learning_rate": 9.979278898674467e-05, "loss": 2.1472, "step": 3747 }, { "epoch": 0.029147989037618637, "grad_norm": 0.12646349941986906, "learning_rate": 9.979267785421142e-05, "loss": 2.0037, "step": 3748 }, { "epoch": 0.029155765982399218, "grad_norm": 0.13629204963687794, "learning_rate": 9.97925666919465e-05, "loss": 2.1142, "step": 3749 }, { "epoch": 0.0291635429271798, "grad_norm": 0.12603253099882253, "learning_rate": 9.979245549994992e-05, "loss": 2.1472, "step": 3750 }, { "epoch": 0.02917131987196038, "grad_norm": 0.12600135182827865, "learning_rate": 9.979234427822179e-05, "loss": 2.1283, "step": 3751 }, { "epoch": 0.02917909681674096, "grad_norm": 0.21150190834804894, "learning_rate": 9.979223302676215e-05, "loss": 2.1456, "step": 3752 }, { "epoch": 0.029186873761521542, "grad_norm": 0.12308794748686192, "learning_rate": 9.979212174557109e-05, "loss": 2.1028, "step": 3753 }, { "epoch": 0.029194650706302123, "grad_norm": 0.12707234552184557, "learning_rate": 9.979201043464864e-05, "loss": 2.0467, "step": 3754 }, { "epoch": 0.029202427651082705, "grad_norm": 0.12044532683548617, "learning_rate": 9.979189909399491e-05, "loss": 2.0986, "step": 3755 }, { "epoch": 0.029210204595863286, "grad_norm": 0.1308153784953554, "learning_rate": 9.979178772360993e-05, "loss": 2.0974, "step": 3756 }, { "epoch": 0.02921798154064387, "grad_norm": 0.13570319206695844, "learning_rate": 9.97916763234938e-05, "loss": 2.1535, "step": 3757 }, { "epoch": 0.02922575848542445, "grad_norm": 0.12213918659304299, "learning_rate": 9.979156489364656e-05, "loss": 2.0999, "step": 3758 }, { "epoch": 0.029233535430205033, "grad_norm": 0.12654385787936984, "learning_rate": 9.97914534340683e-05, "loss": 2.0856, "step": 3759 }, { "epoch": 0.029241312374985614, "grad_norm": 0.1227656750186414, "learning_rate": 9.979134194475906e-05, "loss": 2.0766, "step": 3760 }, { "epoch": 0.029249089319766195, "grad_norm": 0.11788368323171312, "learning_rate": 9.979123042571892e-05, "loss": 2.111, "step": 3761 }, { "epoch": 0.029256866264546776, "grad_norm": 0.1136841070392902, "learning_rate": 9.979111887694794e-05, "loss": 2.1432, "step": 3762 }, { "epoch": 0.029264643209327357, "grad_norm": 0.11871782741021226, "learning_rate": 9.979100729844621e-05, "loss": 2.1114, "step": 3763 }, { "epoch": 0.02927242015410794, "grad_norm": 0.11588883962535951, "learning_rate": 9.979089569021376e-05, "loss": 2.1166, "step": 3764 }, { "epoch": 0.02928019709888852, "grad_norm": 0.11080040164930204, "learning_rate": 9.979078405225069e-05, "loss": 2.1218, "step": 3765 }, { "epoch": 0.0292879740436691, "grad_norm": 0.12192739590317282, "learning_rate": 9.979067238455704e-05, "loss": 2.1639, "step": 3766 }, { "epoch": 0.029295750988449682, "grad_norm": 0.1274603420048777, "learning_rate": 9.979056068713289e-05, "loss": 2.145, "step": 3767 }, { "epoch": 0.029303527933230263, "grad_norm": 0.11917045785560686, "learning_rate": 9.979044895997832e-05, "loss": 2.0674, "step": 3768 }, { "epoch": 0.029311304878010844, "grad_norm": 0.1170191082266535, "learning_rate": 9.979033720309336e-05, "loss": 2.1325, "step": 3769 }, { "epoch": 0.029319081822791426, "grad_norm": 0.11846402603549229, "learning_rate": 9.979022541647812e-05, "loss": 2.0801, "step": 3770 }, { "epoch": 0.029326858767572007, "grad_norm": 0.11524578598743879, "learning_rate": 9.979011360013264e-05, "loss": 2.0638, "step": 3771 }, { "epoch": 0.029334635712352588, "grad_norm": 0.11202595505220143, "learning_rate": 9.979000175405699e-05, "loss": 2.1055, "step": 3772 }, { "epoch": 0.02934241265713317, "grad_norm": 0.12095931621906941, "learning_rate": 9.978988987825124e-05, "loss": 2.1416, "step": 3773 }, { "epoch": 0.02935018960191375, "grad_norm": 0.12389595581943977, "learning_rate": 9.978977797271545e-05, "loss": 2.0907, "step": 3774 }, { "epoch": 0.02935796654669433, "grad_norm": 0.11513022001838473, "learning_rate": 9.97896660374497e-05, "loss": 2.0532, "step": 3775 }, { "epoch": 0.029365743491474913, "grad_norm": 0.11516047846389893, "learning_rate": 9.978955407245404e-05, "loss": 2.1657, "step": 3776 }, { "epoch": 0.029373520436255494, "grad_norm": 0.12299359044111816, "learning_rate": 9.978944207772858e-05, "loss": 2.1175, "step": 3777 }, { "epoch": 0.029381297381036075, "grad_norm": 0.11360654667687195, "learning_rate": 9.978933005327332e-05, "loss": 2.0593, "step": 3778 }, { "epoch": 0.029389074325816656, "grad_norm": 0.1500083156023022, "learning_rate": 9.978921799908837e-05, "loss": 2.1136, "step": 3779 }, { "epoch": 0.029396851270597237, "grad_norm": 0.11773116852539098, "learning_rate": 9.978910591517378e-05, "loss": 2.0887, "step": 3780 }, { "epoch": 0.02940462821537782, "grad_norm": 0.10992630849012826, "learning_rate": 9.978899380152964e-05, "loss": 2.0888, "step": 3781 }, { "epoch": 0.0294124051601584, "grad_norm": 0.12343179054982224, "learning_rate": 9.978888165815599e-05, "loss": 2.1016, "step": 3782 }, { "epoch": 0.02942018210493898, "grad_norm": 0.11906238231650096, "learning_rate": 9.97887694850529e-05, "loss": 2.0694, "step": 3783 }, { "epoch": 0.029427959049719562, "grad_norm": 0.11555992057385928, "learning_rate": 9.978865728222046e-05, "loss": 2.136, "step": 3784 }, { "epoch": 0.029435735994500143, "grad_norm": 0.12092939832974833, "learning_rate": 9.978854504965873e-05, "loss": 2.0614, "step": 3785 }, { "epoch": 0.029443512939280728, "grad_norm": 0.12343606787934147, "learning_rate": 9.978843278736775e-05, "loss": 2.0578, "step": 3786 }, { "epoch": 0.02945128988406131, "grad_norm": 0.11893208752276892, "learning_rate": 9.978832049534761e-05, "loss": 2.0856, "step": 3787 }, { "epoch": 0.02945906682884189, "grad_norm": 0.13103882871753453, "learning_rate": 9.97882081735984e-05, "loss": 2.0847, "step": 3788 }, { "epoch": 0.02946684377362247, "grad_norm": 0.12137948439407925, "learning_rate": 9.978809582212012e-05, "loss": 2.125, "step": 3789 }, { "epoch": 0.029474620718403052, "grad_norm": 0.11733447064891546, "learning_rate": 9.97879834409129e-05, "loss": 2.1211, "step": 3790 }, { "epoch": 0.029482397663183633, "grad_norm": 0.1549193109876141, "learning_rate": 9.978787102997678e-05, "loss": 2.1096, "step": 3791 }, { "epoch": 0.029490174607964215, "grad_norm": 0.19196204335844827, "learning_rate": 9.978775858931184e-05, "loss": 2.1433, "step": 3792 }, { "epoch": 0.029497951552744796, "grad_norm": 0.12133669861008334, "learning_rate": 9.978764611891813e-05, "loss": 2.084, "step": 3793 }, { "epoch": 0.029505728497525377, "grad_norm": 0.12479613578313602, "learning_rate": 9.978753361879571e-05, "loss": 2.1797, "step": 3794 }, { "epoch": 0.029513505442305958, "grad_norm": 0.1318365138177817, "learning_rate": 9.978742108894469e-05, "loss": 2.1008, "step": 3795 }, { "epoch": 0.02952128238708654, "grad_norm": 0.12868197673113946, "learning_rate": 9.97873085293651e-05, "loss": 2.1667, "step": 3796 }, { "epoch": 0.02952905933186712, "grad_norm": 0.4396743573929898, "learning_rate": 9.978719594005701e-05, "loss": 2.1611, "step": 3797 }, { "epoch": 0.0295368362766477, "grad_norm": 0.19747123654053922, "learning_rate": 9.978708332102052e-05, "loss": 2.1411, "step": 3798 }, { "epoch": 0.029544613221428283, "grad_norm": 0.12564020492050332, "learning_rate": 9.978697067225564e-05, "loss": 2.0572, "step": 3799 }, { "epoch": 0.029552390166208864, "grad_norm": 0.1285488237372415, "learning_rate": 9.978685799376249e-05, "loss": 2.1474, "step": 3800 }, { "epoch": 0.029560167110989445, "grad_norm": 0.13452578404584903, "learning_rate": 9.978674528554111e-05, "loss": 2.0914, "step": 3801 }, { "epoch": 0.029567944055770026, "grad_norm": 0.1488816545929283, "learning_rate": 9.978663254759157e-05, "loss": 2.1275, "step": 3802 }, { "epoch": 0.029575721000550607, "grad_norm": 0.11665331816110412, "learning_rate": 9.978651977991394e-05, "loss": 2.1283, "step": 3803 }, { "epoch": 0.02958349794533119, "grad_norm": 0.11690499633258136, "learning_rate": 9.978640698250829e-05, "loss": 2.065, "step": 3804 }, { "epoch": 0.02959127489011177, "grad_norm": 0.12507386430454193, "learning_rate": 9.978629415537471e-05, "loss": 2.1203, "step": 3805 }, { "epoch": 0.02959905183489235, "grad_norm": 0.1582946011901239, "learning_rate": 9.97861812985132e-05, "loss": 2.0621, "step": 3806 }, { "epoch": 0.029606828779672932, "grad_norm": 0.12118893585630967, "learning_rate": 9.978606841192391e-05, "loss": 2.1498, "step": 3807 }, { "epoch": 0.029614605724453513, "grad_norm": 0.12408327591479974, "learning_rate": 9.978595549560685e-05, "loss": 2.1593, "step": 3808 }, { "epoch": 0.029622382669234094, "grad_norm": 0.1206610803286067, "learning_rate": 9.978584254956211e-05, "loss": 2.0892, "step": 3809 }, { "epoch": 0.029630159614014676, "grad_norm": 0.2574256563249732, "learning_rate": 9.978572957378975e-05, "loss": 2.0904, "step": 3810 }, { "epoch": 0.029637936558795257, "grad_norm": 0.12521899846497617, "learning_rate": 9.978561656828984e-05, "loss": 2.1419, "step": 3811 }, { "epoch": 0.029645713503575838, "grad_norm": 0.11814843217341857, "learning_rate": 9.978550353306244e-05, "loss": 2.1276, "step": 3812 }, { "epoch": 0.02965349044835642, "grad_norm": 0.14798850200432676, "learning_rate": 9.978539046810764e-05, "loss": 2.1023, "step": 3813 }, { "epoch": 0.029661267393137, "grad_norm": 0.13954504451170535, "learning_rate": 9.978527737342549e-05, "loss": 2.0747, "step": 3814 }, { "epoch": 0.02966904433791758, "grad_norm": 0.12093940888066544, "learning_rate": 9.978516424901605e-05, "loss": 2.1287, "step": 3815 }, { "epoch": 0.029676821282698166, "grad_norm": 0.46344776906167634, "learning_rate": 9.97850510948794e-05, "loss": 2.1218, "step": 3816 }, { "epoch": 0.029684598227478747, "grad_norm": 0.15595061438740723, "learning_rate": 9.978493791101562e-05, "loss": 2.1705, "step": 3817 }, { "epoch": 0.02969237517225933, "grad_norm": 0.12936258441878137, "learning_rate": 9.978482469742474e-05, "loss": 2.1635, "step": 3818 }, { "epoch": 0.02970015211703991, "grad_norm": 0.13588847729613612, "learning_rate": 9.978471145410687e-05, "loss": 2.1134, "step": 3819 }, { "epoch": 0.02970792906182049, "grad_norm": 0.13457444249865738, "learning_rate": 9.978459818106205e-05, "loss": 2.133, "step": 3820 }, { "epoch": 0.029715706006601072, "grad_norm": 0.12454194668579381, "learning_rate": 9.978448487829036e-05, "loss": 2.0631, "step": 3821 }, { "epoch": 0.029723482951381653, "grad_norm": 0.1269256458794424, "learning_rate": 9.978437154579186e-05, "loss": 2.1735, "step": 3822 }, { "epoch": 0.029731259896162234, "grad_norm": 0.13298756879894963, "learning_rate": 9.978425818356663e-05, "loss": 2.0877, "step": 3823 }, { "epoch": 0.029739036840942815, "grad_norm": 0.1242611982590628, "learning_rate": 9.978414479161471e-05, "loss": 2.1004, "step": 3824 }, { "epoch": 0.029746813785723396, "grad_norm": 0.12081767323397373, "learning_rate": 9.978403136993621e-05, "loss": 2.1362, "step": 3825 }, { "epoch": 0.029754590730503978, "grad_norm": 0.12391053450620232, "learning_rate": 9.978391791853117e-05, "loss": 2.1417, "step": 3826 }, { "epoch": 0.02976236767528456, "grad_norm": 0.12218393951174465, "learning_rate": 9.978380443739964e-05, "loss": 2.0716, "step": 3827 }, { "epoch": 0.02977014462006514, "grad_norm": 0.17559587039584734, "learning_rate": 9.978369092654173e-05, "loss": 2.1239, "step": 3828 }, { "epoch": 0.02977792156484572, "grad_norm": 0.13901220027825575, "learning_rate": 9.978357738595749e-05, "loss": 2.1223, "step": 3829 }, { "epoch": 0.029785698509626302, "grad_norm": 0.17212187768933543, "learning_rate": 9.978346381564697e-05, "loss": 2.0883, "step": 3830 }, { "epoch": 0.029793475454406883, "grad_norm": 0.18660077101471867, "learning_rate": 9.978335021561026e-05, "loss": 2.0877, "step": 3831 }, { "epoch": 0.029801252399187465, "grad_norm": 0.2307188507867854, "learning_rate": 9.978323658584743e-05, "loss": 2.1535, "step": 3832 }, { "epoch": 0.029809029343968046, "grad_norm": 0.1252159706481377, "learning_rate": 9.978312292635852e-05, "loss": 2.1701, "step": 3833 }, { "epoch": 0.029816806288748627, "grad_norm": 0.15757070295760708, "learning_rate": 9.978300923714363e-05, "loss": 2.1279, "step": 3834 }, { "epoch": 0.029824583233529208, "grad_norm": 0.1770747333176484, "learning_rate": 9.97828955182028e-05, "loss": 2.1462, "step": 3835 }, { "epoch": 0.02983236017830979, "grad_norm": 0.2906453478606634, "learning_rate": 9.978278176953615e-05, "loss": 2.0944, "step": 3836 }, { "epoch": 0.02984013712309037, "grad_norm": 0.126978600678109, "learning_rate": 9.978266799114368e-05, "loss": 2.1056, "step": 3837 }, { "epoch": 0.02984791406787095, "grad_norm": 0.19057104822458848, "learning_rate": 9.97825541830255e-05, "loss": 2.1524, "step": 3838 }, { "epoch": 0.029855691012651533, "grad_norm": 0.12660145218500463, "learning_rate": 9.978244034518165e-05, "loss": 2.1038, "step": 3839 }, { "epoch": 0.029863467957432114, "grad_norm": 0.1342422333776976, "learning_rate": 9.978232647761223e-05, "loss": 2.07, "step": 3840 }, { "epoch": 0.029871244902212695, "grad_norm": 0.12842673885810418, "learning_rate": 9.978221258031728e-05, "loss": 2.057, "step": 3841 }, { "epoch": 0.029879021846993276, "grad_norm": 0.14539934193874698, "learning_rate": 9.978209865329689e-05, "loss": 2.1436, "step": 3842 }, { "epoch": 0.029886798791773857, "grad_norm": 0.13606459504936474, "learning_rate": 9.978198469655112e-05, "loss": 2.0958, "step": 3843 }, { "epoch": 0.02989457573655444, "grad_norm": 0.11863560046151479, "learning_rate": 9.978187071008003e-05, "loss": 2.171, "step": 3844 }, { "epoch": 0.02990235268133502, "grad_norm": 0.1255101089842697, "learning_rate": 9.97817566938837e-05, "loss": 2.0977, "step": 3845 }, { "epoch": 0.029910129626115604, "grad_norm": 0.12698023283607382, "learning_rate": 9.978164264796219e-05, "loss": 2.0866, "step": 3846 }, { "epoch": 0.029917906570896186, "grad_norm": 0.14842222328807336, "learning_rate": 9.978152857231557e-05, "loss": 2.1223, "step": 3847 }, { "epoch": 0.029925683515676767, "grad_norm": 0.3103490577989123, "learning_rate": 9.978141446694392e-05, "loss": 2.0935, "step": 3848 }, { "epoch": 0.029933460460457348, "grad_norm": 0.134064222893709, "learning_rate": 9.978130033184727e-05, "loss": 2.125, "step": 3849 }, { "epoch": 0.02994123740523793, "grad_norm": 0.1378492364266916, "learning_rate": 9.978118616702574e-05, "loss": 2.0721, "step": 3850 }, { "epoch": 0.02994901435001851, "grad_norm": 0.22493927273147993, "learning_rate": 9.978107197247937e-05, "loss": 2.1264, "step": 3851 }, { "epoch": 0.02995679129479909, "grad_norm": 0.5830925662783073, "learning_rate": 9.978095774820823e-05, "loss": 2.1243, "step": 3852 }, { "epoch": 0.029964568239579673, "grad_norm": 0.2680839177486098, "learning_rate": 9.978084349421237e-05, "loss": 2.158, "step": 3853 }, { "epoch": 0.029972345184360254, "grad_norm": 0.14622975426575685, "learning_rate": 9.978072921049191e-05, "loss": 2.1166, "step": 3854 }, { "epoch": 0.029980122129140835, "grad_norm": 0.22372674725976596, "learning_rate": 9.978061489704686e-05, "loss": 2.0711, "step": 3855 }, { "epoch": 0.029987899073921416, "grad_norm": 0.2141898400947825, "learning_rate": 9.978050055387733e-05, "loss": 2.05, "step": 3856 }, { "epoch": 0.029995676018701997, "grad_norm": 0.13783463990451802, "learning_rate": 9.978038618098336e-05, "loss": 2.1291, "step": 3857 }, { "epoch": 0.03000345296348258, "grad_norm": 0.2749060777438767, "learning_rate": 9.978027177836504e-05, "loss": 2.1138, "step": 3858 }, { "epoch": 0.03001122990826316, "grad_norm": 0.25052424687534103, "learning_rate": 9.978015734602244e-05, "loss": 2.1122, "step": 3859 }, { "epoch": 0.03001900685304374, "grad_norm": 0.14062810458451894, "learning_rate": 9.978004288395559e-05, "loss": 2.0688, "step": 3860 }, { "epoch": 0.030026783797824322, "grad_norm": 0.2146684635965837, "learning_rate": 9.977992839216461e-05, "loss": 2.1243, "step": 3861 }, { "epoch": 0.030034560742604903, "grad_norm": 0.21243743549957592, "learning_rate": 9.977981387064954e-05, "loss": 2.097, "step": 3862 }, { "epoch": 0.030042337687385484, "grad_norm": 0.1385135431553804, "learning_rate": 9.977969931941045e-05, "loss": 2.0801, "step": 3863 }, { "epoch": 0.030050114632166065, "grad_norm": 0.21352579239471853, "learning_rate": 9.97795847384474e-05, "loss": 2.1347, "step": 3864 }, { "epoch": 0.030057891576946646, "grad_norm": 0.18049990817555517, "learning_rate": 9.977947012776047e-05, "loss": 2.1006, "step": 3865 }, { "epoch": 0.030065668521727228, "grad_norm": 0.16931138862386919, "learning_rate": 9.977935548734974e-05, "loss": 2.1378, "step": 3866 }, { "epoch": 0.03007344546650781, "grad_norm": 0.20150466571065587, "learning_rate": 9.977924081721527e-05, "loss": 2.0416, "step": 3867 }, { "epoch": 0.03008122241128839, "grad_norm": 0.15950603559062637, "learning_rate": 9.977912611735711e-05, "loss": 2.1133, "step": 3868 }, { "epoch": 0.03008899935606897, "grad_norm": 0.18292108028436607, "learning_rate": 9.977901138777536e-05, "loss": 2.1052, "step": 3869 }, { "epoch": 0.030096776300849552, "grad_norm": 0.1622317181273038, "learning_rate": 9.977889662847006e-05, "loss": 2.1112, "step": 3870 }, { "epoch": 0.030104553245630133, "grad_norm": 1.0432440871013162, "learning_rate": 9.97787818394413e-05, "loss": 2.1023, "step": 3871 }, { "epoch": 0.030112330190410715, "grad_norm": 0.1698985876092127, "learning_rate": 9.977866702068914e-05, "loss": 2.0886, "step": 3872 }, { "epoch": 0.030120107135191296, "grad_norm": 2.3176805183779834, "learning_rate": 9.977855217221364e-05, "loss": 2.0767, "step": 3873 }, { "epoch": 0.030127884079971877, "grad_norm": 1.6982645019357663, "learning_rate": 9.977843729401486e-05, "loss": 2.1189, "step": 3874 }, { "epoch": 0.030135661024752458, "grad_norm": 0.33637215896237016, "learning_rate": 9.97783223860929e-05, "loss": 2.132, "step": 3875 }, { "epoch": 0.030143437969533043, "grad_norm": 0.39608626926724977, "learning_rate": 9.977820744844782e-05, "loss": 2.1463, "step": 3876 }, { "epoch": 0.030151214914313624, "grad_norm": 0.329359912527856, "learning_rate": 9.977809248107967e-05, "loss": 2.1609, "step": 3877 }, { "epoch": 0.030158991859094205, "grad_norm": 0.269499602113863, "learning_rate": 9.977797748398854e-05, "loss": 2.0944, "step": 3878 }, { "epoch": 0.030166768803874786, "grad_norm": 0.22550333728760236, "learning_rate": 9.977786245717449e-05, "loss": 2.0349, "step": 3879 }, { "epoch": 0.030174545748655367, "grad_norm": 0.2541382382904821, "learning_rate": 9.977774740063759e-05, "loss": 2.1531, "step": 3880 }, { "epoch": 0.03018232269343595, "grad_norm": 0.26207013366451937, "learning_rate": 9.977763231437789e-05, "loss": 2.0795, "step": 3881 }, { "epoch": 0.03019009963821653, "grad_norm": 0.2416567932447716, "learning_rate": 9.977751719839548e-05, "loss": 2.1565, "step": 3882 }, { "epoch": 0.03019787658299711, "grad_norm": 0.27533145975739626, "learning_rate": 9.977740205269043e-05, "loss": 2.0567, "step": 3883 }, { "epoch": 0.030205653527777692, "grad_norm": 0.1708620630644048, "learning_rate": 9.977728687726281e-05, "loss": 2.1274, "step": 3884 }, { "epoch": 0.030213430472558273, "grad_norm": 0.20590519504502441, "learning_rate": 9.977717167211267e-05, "loss": 2.1498, "step": 3885 }, { "epoch": 0.030221207417338854, "grad_norm": 0.2383655620250132, "learning_rate": 9.97770564372401e-05, "loss": 2.165, "step": 3886 }, { "epoch": 0.030228984362119436, "grad_norm": 0.15591353231824762, "learning_rate": 9.977694117264515e-05, "loss": 2.0873, "step": 3887 }, { "epoch": 0.030236761306900017, "grad_norm": 0.18216378277317422, "learning_rate": 9.977682587832791e-05, "loss": 2.077, "step": 3888 }, { "epoch": 0.030244538251680598, "grad_norm": 0.28832809450435626, "learning_rate": 9.977671055428842e-05, "loss": 2.1333, "step": 3889 }, { "epoch": 0.03025231519646118, "grad_norm": 0.13607271158633685, "learning_rate": 9.977659520052678e-05, "loss": 2.1122, "step": 3890 }, { "epoch": 0.03026009214124176, "grad_norm": 0.20012545637579002, "learning_rate": 9.977647981704303e-05, "loss": 2.1203, "step": 3891 }, { "epoch": 0.03026786908602234, "grad_norm": 0.18905762004986643, "learning_rate": 9.977636440383727e-05, "loss": 2.1169, "step": 3892 }, { "epoch": 0.030275646030802923, "grad_norm": 0.14808140634905395, "learning_rate": 9.977624896090953e-05, "loss": 2.0912, "step": 3893 }, { "epoch": 0.030283422975583504, "grad_norm": 0.20370341822990032, "learning_rate": 9.977613348825993e-05, "loss": 2.1213, "step": 3894 }, { "epoch": 0.030291199920364085, "grad_norm": 0.1701009960372909, "learning_rate": 9.97760179858885e-05, "loss": 2.1487, "step": 3895 }, { "epoch": 0.030298976865144666, "grad_norm": 0.13816222935207573, "learning_rate": 9.977590245379531e-05, "loss": 2.1013, "step": 3896 }, { "epoch": 0.030306753809925247, "grad_norm": 39.79656196753672, "learning_rate": 9.977578689198043e-05, "loss": 2.2377, "step": 3897 }, { "epoch": 0.03031453075470583, "grad_norm": 0.4929497572601059, "learning_rate": 9.977567130044396e-05, "loss": 2.1056, "step": 3898 }, { "epoch": 0.03032230769948641, "grad_norm": 0.5335195959939495, "learning_rate": 9.977555567918595e-05, "loss": 2.1125, "step": 3899 }, { "epoch": 0.03033008464426699, "grad_norm": 0.33274931950705894, "learning_rate": 9.977544002820644e-05, "loss": 2.1196, "step": 3900 }, { "epoch": 0.030337861589047572, "grad_norm": 0.7381067465309353, "learning_rate": 9.977532434750555e-05, "loss": 2.1281, "step": 3901 }, { "epoch": 0.030345638533828153, "grad_norm": 0.40670337943502877, "learning_rate": 9.977520863708329e-05, "loss": 2.1551, "step": 3902 }, { "epoch": 0.030353415478608734, "grad_norm": 0.43061649133651125, "learning_rate": 9.97750928969398e-05, "loss": 2.2006, "step": 3903 }, { "epoch": 0.030361192423389315, "grad_norm": 0.35963151737030336, "learning_rate": 9.977497712707509e-05, "loss": 2.122, "step": 3904 }, { "epoch": 0.0303689693681699, "grad_norm": 0.5652633226789289, "learning_rate": 9.977486132748926e-05, "loss": 2.1114, "step": 3905 }, { "epoch": 0.03037674631295048, "grad_norm": 0.3224804085352319, "learning_rate": 9.977474549818235e-05, "loss": 2.1281, "step": 3906 }, { "epoch": 0.030384523257731062, "grad_norm": 0.3446086741592027, "learning_rate": 9.977462963915446e-05, "loss": 2.1525, "step": 3907 }, { "epoch": 0.030392300202511643, "grad_norm": 0.3878312421774174, "learning_rate": 9.977451375040567e-05, "loss": 2.1381, "step": 3908 }, { "epoch": 0.030400077147292225, "grad_norm": 0.37164288259815065, "learning_rate": 9.977439783193601e-05, "loss": 2.1408, "step": 3909 }, { "epoch": 0.030407854092072806, "grad_norm": 0.27627261226447275, "learning_rate": 9.977428188374556e-05, "loss": 2.1012, "step": 3910 }, { "epoch": 0.030415631036853387, "grad_norm": 0.2660494344607653, "learning_rate": 9.977416590583441e-05, "loss": 2.1415, "step": 3911 }, { "epoch": 0.030423407981633968, "grad_norm": 0.22321046764016475, "learning_rate": 9.97740498982026e-05, "loss": 2.0842, "step": 3912 }, { "epoch": 0.03043118492641455, "grad_norm": 0.2735454840068359, "learning_rate": 9.977393386085023e-05, "loss": 2.1056, "step": 3913 }, { "epoch": 0.03043896187119513, "grad_norm": 0.2132691515115329, "learning_rate": 9.977381779377735e-05, "loss": 2.0645, "step": 3914 }, { "epoch": 0.03044673881597571, "grad_norm": 4.452120289521068, "learning_rate": 9.977370169698403e-05, "loss": 2.1736, "step": 3915 }, { "epoch": 0.030454515760756293, "grad_norm": 0.33451903538672684, "learning_rate": 9.977358557047034e-05, "loss": 2.1041, "step": 3916 }, { "epoch": 0.030462292705536874, "grad_norm": 0.24300185797106857, "learning_rate": 9.977346941423636e-05, "loss": 2.1195, "step": 3917 }, { "epoch": 0.030470069650317455, "grad_norm": 0.2781691118878444, "learning_rate": 9.977335322828214e-05, "loss": 2.135, "step": 3918 }, { "epoch": 0.030477846595098036, "grad_norm": 0.25180744828783896, "learning_rate": 9.977323701260775e-05, "loss": 2.114, "step": 3919 }, { "epoch": 0.030485623539878617, "grad_norm": 0.38673397930899805, "learning_rate": 9.977312076721328e-05, "loss": 2.0505, "step": 3920 }, { "epoch": 0.0304934004846592, "grad_norm": 0.25251098491039153, "learning_rate": 9.977300449209881e-05, "loss": 2.0912, "step": 3921 }, { "epoch": 0.03050117742943978, "grad_norm": 0.2032433167664642, "learning_rate": 9.977288818726436e-05, "loss": 2.1304, "step": 3922 }, { "epoch": 0.03050895437422036, "grad_norm": 0.20988958394886398, "learning_rate": 9.977277185271005e-05, "loss": 2.1142, "step": 3923 }, { "epoch": 0.030516731319000942, "grad_norm": 0.19750279511020477, "learning_rate": 9.97726554884359e-05, "loss": 2.1243, "step": 3924 }, { "epoch": 0.030524508263781523, "grad_norm": 0.22279718219365266, "learning_rate": 9.977253909444204e-05, "loss": 2.0842, "step": 3925 }, { "epoch": 0.030532285208562104, "grad_norm": 0.17291993072410114, "learning_rate": 9.977242267072849e-05, "loss": 2.1285, "step": 3926 }, { "epoch": 0.030540062153342686, "grad_norm": 0.17642563242028841, "learning_rate": 9.977230621729534e-05, "loss": 2.1496, "step": 3927 }, { "epoch": 0.030547839098123267, "grad_norm": 0.15722361723838008, "learning_rate": 9.977218973414264e-05, "loss": 2.1318, "step": 3928 }, { "epoch": 0.030555616042903848, "grad_norm": 0.15631174702318545, "learning_rate": 9.977207322127048e-05, "loss": 2.1208, "step": 3929 }, { "epoch": 0.03056339298768443, "grad_norm": 0.16508778716604122, "learning_rate": 9.977195667867895e-05, "loss": 2.1037, "step": 3930 }, { "epoch": 0.03057116993246501, "grad_norm": 0.19544199522476688, "learning_rate": 9.977184010636808e-05, "loss": 2.1253, "step": 3931 }, { "epoch": 0.03057894687724559, "grad_norm": 0.15014459740719718, "learning_rate": 9.977172350433795e-05, "loss": 2.1459, "step": 3932 }, { "epoch": 0.030586723822026173, "grad_norm": 0.14000935273166776, "learning_rate": 9.977160687258863e-05, "loss": 2.1325, "step": 3933 }, { "epoch": 0.030594500766806754, "grad_norm": 1.26441808199511, "learning_rate": 9.97714902111202e-05, "loss": 2.0907, "step": 3934 }, { "epoch": 0.03060227771158734, "grad_norm": 5.886936787547583, "learning_rate": 9.977137351993271e-05, "loss": 2.2018, "step": 3935 }, { "epoch": 0.03061005465636792, "grad_norm": 0.4947623008894854, "learning_rate": 9.977125679902626e-05, "loss": 2.0978, "step": 3936 }, { "epoch": 0.0306178316011485, "grad_norm": 0.21219846877236587, "learning_rate": 9.977114004840089e-05, "loss": 2.1686, "step": 3937 }, { "epoch": 0.030625608545929082, "grad_norm": 0.2695913295233087, "learning_rate": 9.97710232680567e-05, "loss": 2.1494, "step": 3938 }, { "epoch": 0.030633385490709663, "grad_norm": 0.22654353497413793, "learning_rate": 9.977090645799372e-05, "loss": 2.148, "step": 3939 }, { "epoch": 0.030641162435490244, "grad_norm": 0.179976753229906, "learning_rate": 9.977078961821205e-05, "loss": 2.0798, "step": 3940 }, { "epoch": 0.030648939380270825, "grad_norm": 0.2497066817939454, "learning_rate": 9.977067274871175e-05, "loss": 2.0687, "step": 3941 }, { "epoch": 0.030656716325051406, "grad_norm": 0.22589370543676746, "learning_rate": 9.97705558494929e-05, "loss": 2.0942, "step": 3942 }, { "epoch": 0.030664493269831988, "grad_norm": 0.18286406067708602, "learning_rate": 9.977043892055556e-05, "loss": 2.102, "step": 3943 }, { "epoch": 0.03067227021461257, "grad_norm": 0.1845749742112948, "learning_rate": 9.977032196189978e-05, "loss": 2.0867, "step": 3944 }, { "epoch": 0.03068004715939315, "grad_norm": 0.17588465843491607, "learning_rate": 9.977020497352568e-05, "loss": 2.0581, "step": 3945 }, { "epoch": 0.03068782410417373, "grad_norm": 0.16849751981059466, "learning_rate": 9.977008795543328e-05, "loss": 2.1853, "step": 3946 }, { "epoch": 0.030695601048954312, "grad_norm": 0.1625578358131339, "learning_rate": 9.976997090762265e-05, "loss": 2.1221, "step": 3947 }, { "epoch": 0.030703377993734893, "grad_norm": 0.16764420292666501, "learning_rate": 9.976985383009392e-05, "loss": 2.1268, "step": 3948 }, { "epoch": 0.030711154938515475, "grad_norm": 0.12470074641138774, "learning_rate": 9.97697367228471e-05, "loss": 2.0976, "step": 3949 }, { "epoch": 0.030718931883296056, "grad_norm": 1.2431079824301412, "learning_rate": 9.976961958588229e-05, "loss": 2.1062, "step": 3950 }, { "epoch": 0.030726708828076637, "grad_norm": 2.1929354844276228, "learning_rate": 9.976950241919953e-05, "loss": 2.0855, "step": 3951 }, { "epoch": 0.030734485772857218, "grad_norm": 0.7136227561327247, "learning_rate": 9.976938522279893e-05, "loss": 2.0354, "step": 3952 }, { "epoch": 0.0307422627176378, "grad_norm": 0.2600346691218791, "learning_rate": 9.976926799668054e-05, "loss": 2.129, "step": 3953 }, { "epoch": 0.03075003966241838, "grad_norm": 0.3318384709693428, "learning_rate": 9.976915074084442e-05, "loss": 2.0754, "step": 3954 }, { "epoch": 0.03075781660719896, "grad_norm": 0.2261944021271984, "learning_rate": 9.976903345529065e-05, "loss": 2.1092, "step": 3955 }, { "epoch": 0.030765593551979543, "grad_norm": 0.23561126387234227, "learning_rate": 9.976891614001931e-05, "loss": 2.0861, "step": 3956 }, { "epoch": 0.030773370496760124, "grad_norm": 0.1916975800879028, "learning_rate": 9.976879879503044e-05, "loss": 2.1359, "step": 3957 }, { "epoch": 0.030781147441540705, "grad_norm": 0.18456882107302336, "learning_rate": 9.976868142032415e-05, "loss": 2.1085, "step": 3958 }, { "epoch": 0.030788924386321286, "grad_norm": 0.18713464080713763, "learning_rate": 9.97685640159005e-05, "loss": 2.1194, "step": 3959 }, { "epoch": 0.030796701331101867, "grad_norm": 0.1589570624945589, "learning_rate": 9.976844658175953e-05, "loss": 2.1747, "step": 3960 }, { "epoch": 0.03080447827588245, "grad_norm": 0.16318833429704885, "learning_rate": 9.976832911790133e-05, "loss": 2.1278, "step": 3961 }, { "epoch": 0.03081225522066303, "grad_norm": 0.15434691920941188, "learning_rate": 9.976821162432597e-05, "loss": 2.0765, "step": 3962 }, { "epoch": 0.03082003216544361, "grad_norm": 0.1564837385872255, "learning_rate": 9.976809410103352e-05, "loss": 2.0843, "step": 3963 }, { "epoch": 0.030827809110224192, "grad_norm": 0.14643545121941606, "learning_rate": 9.976797654802405e-05, "loss": 2.1269, "step": 3964 }, { "epoch": 0.030835586055004777, "grad_norm": 0.1428608644376351, "learning_rate": 9.976785896529763e-05, "loss": 2.1547, "step": 3965 }, { "epoch": 0.030843362999785358, "grad_norm": 0.13958104704956834, "learning_rate": 9.976774135285433e-05, "loss": 2.1325, "step": 3966 }, { "epoch": 0.03085113994456594, "grad_norm": 0.31625708656439727, "learning_rate": 9.976762371069424e-05, "loss": 2.1066, "step": 3967 }, { "epoch": 0.03085891688934652, "grad_norm": 0.15863844462359306, "learning_rate": 9.97675060388174e-05, "loss": 2.1335, "step": 3968 }, { "epoch": 0.0308666938341271, "grad_norm": 0.15406574344922383, "learning_rate": 9.976738833722389e-05, "loss": 2.1265, "step": 3969 }, { "epoch": 0.030874470778907682, "grad_norm": 0.14050056064458635, "learning_rate": 9.976727060591377e-05, "loss": 2.1266, "step": 3970 }, { "epoch": 0.030882247723688264, "grad_norm": 0.15984488343973288, "learning_rate": 9.976715284488716e-05, "loss": 2.0716, "step": 3971 }, { "epoch": 0.030890024668468845, "grad_norm": 0.1300560558272403, "learning_rate": 9.976703505414406e-05, "loss": 2.1144, "step": 3972 }, { "epoch": 0.030897801613249426, "grad_norm": 0.15654528497896986, "learning_rate": 9.976691723368457e-05, "loss": 2.1357, "step": 3973 }, { "epoch": 0.030905578558030007, "grad_norm": 0.5531884777163986, "learning_rate": 9.976679938350879e-05, "loss": 2.1546, "step": 3974 }, { "epoch": 0.03091335550281059, "grad_norm": 0.24871580277683172, "learning_rate": 9.976668150361674e-05, "loss": 2.1309, "step": 3975 }, { "epoch": 0.03092113244759117, "grad_norm": 0.1771429248464406, "learning_rate": 9.976656359400852e-05, "loss": 2.1145, "step": 3976 }, { "epoch": 0.03092890939237175, "grad_norm": 0.1657397742155355, "learning_rate": 9.976644565468421e-05, "loss": 2.1317, "step": 3977 }, { "epoch": 0.030936686337152332, "grad_norm": 0.14261329859293742, "learning_rate": 9.976632768564384e-05, "loss": 2.0719, "step": 3978 }, { "epoch": 0.030944463281932913, "grad_norm": 0.27891497917196556, "learning_rate": 9.976620968688753e-05, "loss": 2.1797, "step": 3979 }, { "epoch": 0.030952240226713494, "grad_norm": 0.40564204626122424, "learning_rate": 9.976609165841531e-05, "loss": 2.0709, "step": 3980 }, { "epoch": 0.030960017171494075, "grad_norm": 0.14628894167087717, "learning_rate": 9.976597360022726e-05, "loss": 2.1064, "step": 3981 }, { "epoch": 0.030967794116274656, "grad_norm": 0.14830907633531046, "learning_rate": 9.976585551232346e-05, "loss": 2.0781, "step": 3982 }, { "epoch": 0.030975571061055238, "grad_norm": 0.33066902469366544, "learning_rate": 9.976573739470399e-05, "loss": 2.1165, "step": 3983 }, { "epoch": 0.03098334800583582, "grad_norm": 0.13158071721548095, "learning_rate": 9.976561924736891e-05, "loss": 2.1516, "step": 3984 }, { "epoch": 0.0309911249506164, "grad_norm": 0.1487744978666712, "learning_rate": 9.976550107031828e-05, "loss": 2.1025, "step": 3985 }, { "epoch": 0.03099890189539698, "grad_norm": 0.13782029638626944, "learning_rate": 9.976538286355217e-05, "loss": 2.1012, "step": 3986 }, { "epoch": 0.031006678840177562, "grad_norm": 0.14666991950601213, "learning_rate": 9.976526462707068e-05, "loss": 2.1186, "step": 3987 }, { "epoch": 0.031014455784958143, "grad_norm": 0.1300887132601071, "learning_rate": 9.976514636087384e-05, "loss": 2.1151, "step": 3988 }, { "epoch": 0.031022232729738725, "grad_norm": 0.5789709714769946, "learning_rate": 9.976502806496175e-05, "loss": 2.1994, "step": 3989 }, { "epoch": 0.031030009674519306, "grad_norm": 0.29671322436589576, "learning_rate": 9.976490973933447e-05, "loss": 2.0724, "step": 3990 }, { "epoch": 0.031037786619299887, "grad_norm": 0.39677512146109967, "learning_rate": 9.976479138399209e-05, "loss": 2.0961, "step": 3991 }, { "epoch": 0.031045563564080468, "grad_norm": 0.24357328660167432, "learning_rate": 9.976467299893463e-05, "loss": 2.1241, "step": 3992 }, { "epoch": 0.03105334050886105, "grad_norm": 0.3898588901846339, "learning_rate": 9.976455458416222e-05, "loss": 2.1628, "step": 3993 }, { "epoch": 0.031061117453641634, "grad_norm": 0.17233001118186278, "learning_rate": 9.976443613967489e-05, "loss": 2.1142, "step": 3994 }, { "epoch": 0.031068894398422215, "grad_norm": 0.29406193633839506, "learning_rate": 9.976431766547273e-05, "loss": 2.0824, "step": 3995 }, { "epoch": 0.031076671343202796, "grad_norm": 0.160155661829035, "learning_rate": 9.976419916155581e-05, "loss": 2.0998, "step": 3996 }, { "epoch": 0.031084448287983377, "grad_norm": 0.18261971726426013, "learning_rate": 9.976408062792418e-05, "loss": 2.129, "step": 3997 }, { "epoch": 0.03109222523276396, "grad_norm": 0.23756591189091833, "learning_rate": 9.976396206457795e-05, "loss": 2.0992, "step": 3998 }, { "epoch": 0.03110000217754454, "grad_norm": 0.17313535932549198, "learning_rate": 9.976384347151715e-05, "loss": 2.0985, "step": 3999 }, { "epoch": 0.03110777912232512, "grad_norm": 0.24072937332129754, "learning_rate": 9.976372484874187e-05, "loss": 2.1364, "step": 4000 }, { "epoch": 0.031115556067105702, "grad_norm": 0.15045157993130923, "learning_rate": 9.976360619625219e-05, "loss": 2.0309, "step": 4001 }, { "epoch": 0.031123333011886283, "grad_norm": 0.2001963180882103, "learning_rate": 9.976348751404816e-05, "loss": 2.1284, "step": 4002 }, { "epoch": 0.031131109956666864, "grad_norm": 0.14678125554882637, "learning_rate": 9.976336880212987e-05, "loss": 2.0908, "step": 4003 }, { "epoch": 0.031138886901447446, "grad_norm": 0.14625754405422328, "learning_rate": 9.976325006049739e-05, "loss": 2.0926, "step": 4004 }, { "epoch": 0.031146663846228027, "grad_norm": 0.44265227788049716, "learning_rate": 9.976313128915076e-05, "loss": 2.1056, "step": 4005 }, { "epoch": 0.031154440791008608, "grad_norm": 0.14286901910528307, "learning_rate": 9.976301248809008e-05, "loss": 2.1599, "step": 4006 }, { "epoch": 0.03116221773578919, "grad_norm": 0.21357949284926056, "learning_rate": 9.976289365731542e-05, "loss": 2.1638, "step": 4007 }, { "epoch": 0.03116999468056977, "grad_norm": 0.17997347750997014, "learning_rate": 9.976277479682685e-05, "loss": 2.1034, "step": 4008 }, { "epoch": 0.03117777162535035, "grad_norm": 0.15995136467611382, "learning_rate": 9.976265590662444e-05, "loss": 2.1059, "step": 4009 }, { "epoch": 0.031185548570130932, "grad_norm": 0.37460748202864613, "learning_rate": 9.976253698670824e-05, "loss": 2.0967, "step": 4010 }, { "epoch": 0.031193325514911514, "grad_norm": 0.13424970069419154, "learning_rate": 9.976241803707835e-05, "loss": 2.1396, "step": 4011 }, { "epoch": 0.031201102459692095, "grad_norm": 0.14785383037948358, "learning_rate": 9.976229905773485e-05, "loss": 2.1146, "step": 4012 }, { "epoch": 0.031208879404472676, "grad_norm": 0.11867073657456942, "learning_rate": 9.976218004867776e-05, "loss": 2.1251, "step": 4013 }, { "epoch": 0.031216656349253257, "grad_norm": 0.143421512713609, "learning_rate": 9.97620610099072e-05, "loss": 2.1631, "step": 4014 }, { "epoch": 0.03122443329403384, "grad_norm": 0.153252912181129, "learning_rate": 9.976194194142321e-05, "loss": 2.0872, "step": 4015 }, { "epoch": 0.03123221023881442, "grad_norm": 0.12245491768265927, "learning_rate": 9.976182284322588e-05, "loss": 2.1732, "step": 4016 }, { "epoch": 0.031239987183595, "grad_norm": 0.3182600583800142, "learning_rate": 9.976170371531528e-05, "loss": 2.0551, "step": 4017 }, { "epoch": 0.031247764128375582, "grad_norm": 0.1621730349085192, "learning_rate": 9.976158455769147e-05, "loss": 2.1067, "step": 4018 }, { "epoch": 0.031255541073156166, "grad_norm": 0.14674623192195982, "learning_rate": 9.976146537035453e-05, "loss": 2.1005, "step": 4019 }, { "epoch": 0.03126331801793675, "grad_norm": 0.2936182979615493, "learning_rate": 9.976134615330453e-05, "loss": 2.1554, "step": 4020 }, { "epoch": 0.03127109496271733, "grad_norm": 0.12944960005927827, "learning_rate": 9.976122690654153e-05, "loss": 2.1288, "step": 4021 }, { "epoch": 0.03127887190749791, "grad_norm": 0.13033498850528755, "learning_rate": 9.976110763006562e-05, "loss": 2.0848, "step": 4022 }, { "epoch": 0.03128664885227849, "grad_norm": 0.5230342555259039, "learning_rate": 9.976098832387686e-05, "loss": 2.1314, "step": 4023 }, { "epoch": 0.03129442579705907, "grad_norm": 0.17802014957198317, "learning_rate": 9.976086898797532e-05, "loss": 2.0768, "step": 4024 }, { "epoch": 0.03130220274183965, "grad_norm": 0.14811520584366708, "learning_rate": 9.976074962236109e-05, "loss": 2.1046, "step": 4025 }, { "epoch": 0.031309979686620235, "grad_norm": 0.1268983035554467, "learning_rate": 9.976063022703421e-05, "loss": 2.1196, "step": 4026 }, { "epoch": 0.031317756631400816, "grad_norm": 0.23972285654408085, "learning_rate": 9.976051080199477e-05, "loss": 2.1395, "step": 4027 }, { "epoch": 0.0313255335761814, "grad_norm": 0.18791917932311117, "learning_rate": 9.976039134724284e-05, "loss": 2.1219, "step": 4028 }, { "epoch": 0.03133331052096198, "grad_norm": 0.1321014049436228, "learning_rate": 9.97602718627785e-05, "loss": 2.1273, "step": 4029 }, { "epoch": 0.03134108746574256, "grad_norm": 0.14589269833298987, "learning_rate": 9.976015234860181e-05, "loss": 2.1151, "step": 4030 }, { "epoch": 0.03134886441052314, "grad_norm": 0.1744225062130343, "learning_rate": 9.976003280471284e-05, "loss": 2.1301, "step": 4031 }, { "epoch": 0.03135664135530372, "grad_norm": 0.1851480426302968, "learning_rate": 9.975991323111165e-05, "loss": 2.0824, "step": 4032 }, { "epoch": 0.0313644183000843, "grad_norm": 0.13256436619450787, "learning_rate": 9.975979362779834e-05, "loss": 2.1116, "step": 4033 }, { "epoch": 0.031372195244864884, "grad_norm": 0.16447416221759928, "learning_rate": 9.975967399477298e-05, "loss": 2.1039, "step": 4034 }, { "epoch": 0.031379972189645465, "grad_norm": 0.3730160824292035, "learning_rate": 9.97595543320356e-05, "loss": 2.1153, "step": 4035 }, { "epoch": 0.031387749134426046, "grad_norm": 0.1383230014584854, "learning_rate": 9.97594346395863e-05, "loss": 2.1345, "step": 4036 }, { "epoch": 0.03139552607920663, "grad_norm": 0.13134682561287586, "learning_rate": 9.975931491742518e-05, "loss": 2.1347, "step": 4037 }, { "epoch": 0.03140330302398721, "grad_norm": 0.1265772304795401, "learning_rate": 9.975919516555225e-05, "loss": 2.1392, "step": 4038 }, { "epoch": 0.03141107996876779, "grad_norm": 0.1264244621952013, "learning_rate": 9.975907538396765e-05, "loss": 2.1548, "step": 4039 }, { "epoch": 0.03141885691354837, "grad_norm": 0.12789612876539833, "learning_rate": 9.975895557267138e-05, "loss": 2.1182, "step": 4040 }, { "epoch": 0.03142663385832895, "grad_norm": 0.12500673417811586, "learning_rate": 9.975883573166356e-05, "loss": 2.1299, "step": 4041 }, { "epoch": 0.03143441080310953, "grad_norm": 0.11688073896584375, "learning_rate": 9.975871586094427e-05, "loss": 2.1308, "step": 4042 }, { "epoch": 0.031442187747890114, "grad_norm": 0.12997688023696746, "learning_rate": 9.975859596051354e-05, "loss": 2.0655, "step": 4043 }, { "epoch": 0.031449964692670695, "grad_norm": 0.1159556394523954, "learning_rate": 9.975847603037146e-05, "loss": 2.1247, "step": 4044 }, { "epoch": 0.03145774163745128, "grad_norm": 0.14591583659292698, "learning_rate": 9.975835607051811e-05, "loss": 2.1244, "step": 4045 }, { "epoch": 0.03146551858223186, "grad_norm": 0.131537219570094, "learning_rate": 9.975823608095356e-05, "loss": 2.0658, "step": 4046 }, { "epoch": 0.03147329552701244, "grad_norm": 0.12177048881994233, "learning_rate": 9.975811606167786e-05, "loss": 2.074, "step": 4047 }, { "epoch": 0.03148107247179302, "grad_norm": 0.11871506777014526, "learning_rate": 9.97579960126911e-05, "loss": 2.1412, "step": 4048 }, { "epoch": 0.0314888494165736, "grad_norm": 0.14042531097644517, "learning_rate": 9.975787593399338e-05, "loss": 2.0838, "step": 4049 }, { "epoch": 0.03149662636135418, "grad_norm": 0.11749864014950918, "learning_rate": 9.975775582558472e-05, "loss": 2.1101, "step": 4050 }, { "epoch": 0.031504403306134764, "grad_norm": 0.11688059149329072, "learning_rate": 9.97576356874652e-05, "loss": 2.1163, "step": 4051 }, { "epoch": 0.031512180250915345, "grad_norm": 0.11846105238900986, "learning_rate": 9.975751551963494e-05, "loss": 2.0643, "step": 4052 }, { "epoch": 0.031519957195695926, "grad_norm": 0.18561978374228563, "learning_rate": 9.975739532209396e-05, "loss": 2.1439, "step": 4053 }, { "epoch": 0.03152773414047651, "grad_norm": 0.13151965430563115, "learning_rate": 9.975727509484235e-05, "loss": 2.1114, "step": 4054 }, { "epoch": 0.03153551108525709, "grad_norm": 0.1487762569375989, "learning_rate": 9.975715483788018e-05, "loss": 2.0364, "step": 4055 }, { "epoch": 0.03154328803003767, "grad_norm": 0.13234046631753327, "learning_rate": 9.97570345512075e-05, "loss": 2.1094, "step": 4056 }, { "epoch": 0.03155106497481825, "grad_norm": 0.12342995333088039, "learning_rate": 9.975691423482444e-05, "loss": 2.1355, "step": 4057 }, { "epoch": 0.03155884191959883, "grad_norm": 0.11934459634441184, "learning_rate": 9.975679388873102e-05, "loss": 2.1775, "step": 4058 }, { "epoch": 0.03156661886437941, "grad_norm": 0.4243498650734064, "learning_rate": 9.975667351292734e-05, "loss": 2.0797, "step": 4059 }, { "epoch": 0.031574395809159994, "grad_norm": 0.11836662589817182, "learning_rate": 9.975655310741344e-05, "loss": 2.073, "step": 4060 }, { "epoch": 0.031582172753940575, "grad_norm": 0.13011983653814865, "learning_rate": 9.975643267218943e-05, "loss": 2.1032, "step": 4061 }, { "epoch": 0.031589949698721156, "grad_norm": 0.1288333051468071, "learning_rate": 9.975631220725535e-05, "loss": 2.1007, "step": 4062 }, { "epoch": 0.03159772664350174, "grad_norm": 0.19031627266974213, "learning_rate": 9.975619171261129e-05, "loss": 2.0894, "step": 4063 }, { "epoch": 0.03160550358828232, "grad_norm": 0.1274620630532487, "learning_rate": 9.975607118825731e-05, "loss": 2.1092, "step": 4064 }, { "epoch": 0.0316132805330629, "grad_norm": 0.7471673566518165, "learning_rate": 9.975595063419351e-05, "loss": 2.0628, "step": 4065 }, { "epoch": 0.03162105747784348, "grad_norm": 0.17066768992637546, "learning_rate": 9.975583005041993e-05, "loss": 2.1403, "step": 4066 }, { "epoch": 0.03162883442262406, "grad_norm": 0.43391280100868573, "learning_rate": 9.975570943693666e-05, "loss": 2.125, "step": 4067 }, { "epoch": 0.03163661136740464, "grad_norm": 0.25728507159885755, "learning_rate": 9.975558879374376e-05, "loss": 2.1196, "step": 4068 }, { "epoch": 0.03164438831218523, "grad_norm": 0.9577542223323636, "learning_rate": 9.975546812084131e-05, "loss": 2.1079, "step": 4069 }, { "epoch": 0.03165216525696581, "grad_norm": 4.143378414483156, "learning_rate": 9.975534741822937e-05, "loss": 2.1822, "step": 4070 }, { "epoch": 0.031659942201746394, "grad_norm": 0.4009738429233556, "learning_rate": 9.975522668590804e-05, "loss": 2.2103, "step": 4071 }, { "epoch": 0.031667719146526975, "grad_norm": 1.5011277079059469, "learning_rate": 9.975510592387736e-05, "loss": 2.1637, "step": 4072 }, { "epoch": 0.031675496091307556, "grad_norm": 0.3224170263086753, "learning_rate": 9.975498513213741e-05, "loss": 2.152, "step": 4073 }, { "epoch": 0.03168327303608814, "grad_norm": 0.28844502095865066, "learning_rate": 9.975486431068828e-05, "loss": 2.1509, "step": 4074 }, { "epoch": 0.03169104998086872, "grad_norm": 0.27358651422203123, "learning_rate": 9.975474345953003e-05, "loss": 2.1395, "step": 4075 }, { "epoch": 0.0316988269256493, "grad_norm": 0.2367429985485025, "learning_rate": 9.975462257866273e-05, "loss": 2.1368, "step": 4076 }, { "epoch": 0.03170660387042988, "grad_norm": 0.21845010177274535, "learning_rate": 9.975450166808646e-05, "loss": 2.1468, "step": 4077 }, { "epoch": 0.03171438081521046, "grad_norm": 0.35562346460972666, "learning_rate": 9.975438072780128e-05, "loss": 2.1027, "step": 4078 }, { "epoch": 0.03172215775999104, "grad_norm": 0.16372967639547584, "learning_rate": 9.975425975780727e-05, "loss": 2.1632, "step": 4079 }, { "epoch": 0.031729934704771624, "grad_norm": 0.21102838417371575, "learning_rate": 9.975413875810451e-05, "loss": 2.0885, "step": 4080 }, { "epoch": 0.031737711649552205, "grad_norm": 0.2109123519287789, "learning_rate": 9.975401772869305e-05, "loss": 2.1078, "step": 4081 }, { "epoch": 0.03174548859433279, "grad_norm": 0.1766226876423269, "learning_rate": 9.975389666957297e-05, "loss": 2.1379, "step": 4082 }, { "epoch": 0.03175326553911337, "grad_norm": 0.14761192295638265, "learning_rate": 9.975377558074437e-05, "loss": 2.102, "step": 4083 }, { "epoch": 0.03176104248389395, "grad_norm": 0.21304333257533023, "learning_rate": 9.97536544622073e-05, "loss": 2.1335, "step": 4084 }, { "epoch": 0.03176881942867453, "grad_norm": 0.22933063578195026, "learning_rate": 9.975353331396182e-05, "loss": 2.1411, "step": 4085 }, { "epoch": 0.03177659637345511, "grad_norm": 0.7889730928196022, "learning_rate": 9.975341213600801e-05, "loss": 2.1677, "step": 4086 }, { "epoch": 0.03178437331823569, "grad_norm": 0.18482709382364024, "learning_rate": 9.975329092834595e-05, "loss": 2.0749, "step": 4087 }, { "epoch": 0.031792150263016274, "grad_norm": 0.2595072784957245, "learning_rate": 9.975316969097573e-05, "loss": 2.0943, "step": 4088 }, { "epoch": 0.031799927207796855, "grad_norm": 0.14295780058427535, "learning_rate": 9.975304842389738e-05, "loss": 2.0724, "step": 4089 }, { "epoch": 0.031807704152577436, "grad_norm": 0.19895157927280518, "learning_rate": 9.975292712711101e-05, "loss": 2.1449, "step": 4090 }, { "epoch": 0.03181548109735802, "grad_norm": 0.21306505830978997, "learning_rate": 9.975280580061667e-05, "loss": 2.1571, "step": 4091 }, { "epoch": 0.0318232580421386, "grad_norm": 0.18663058146691242, "learning_rate": 9.975268444441444e-05, "loss": 2.1282, "step": 4092 }, { "epoch": 0.03183103498691918, "grad_norm": 0.20071801256166055, "learning_rate": 9.975256305850439e-05, "loss": 2.1497, "step": 4093 }, { "epoch": 0.03183881193169976, "grad_norm": 0.19005597018020068, "learning_rate": 9.97524416428866e-05, "loss": 2.1449, "step": 4094 }, { "epoch": 0.03184658887648034, "grad_norm": 0.14257786584479523, "learning_rate": 9.975232019756112e-05, "loss": 2.1145, "step": 4095 }, { "epoch": 0.03185436582126092, "grad_norm": 0.5241531665317725, "learning_rate": 9.975219872252806e-05, "loss": 2.1356, "step": 4096 }, { "epoch": 0.031862142766041504, "grad_norm": 0.2963507439816401, "learning_rate": 9.975207721778745e-05, "loss": 2.1083, "step": 4097 }, { "epoch": 0.031869919710822085, "grad_norm": 0.12957173271008873, "learning_rate": 9.975195568333942e-05, "loss": 2.0974, "step": 4098 }, { "epoch": 0.031877696655602666, "grad_norm": 0.23222046271261468, "learning_rate": 9.975183411918398e-05, "loss": 2.1262, "step": 4099 }, { "epoch": 0.03188547360038325, "grad_norm": 0.48543224419407366, "learning_rate": 9.975171252532123e-05, "loss": 2.1302, "step": 4100 }, { "epoch": 0.03189325054516383, "grad_norm": 0.148484803337327, "learning_rate": 9.975159090175125e-05, "loss": 2.0737, "step": 4101 }, { "epoch": 0.03190102748994441, "grad_norm": 0.22790455575303611, "learning_rate": 9.975146924847409e-05, "loss": 2.0884, "step": 4102 }, { "epoch": 0.03190880443472499, "grad_norm": 0.14395956029423318, "learning_rate": 9.975134756548984e-05, "loss": 2.1232, "step": 4103 }, { "epoch": 0.03191658137950557, "grad_norm": 0.25548409280094847, "learning_rate": 9.97512258527986e-05, "loss": 2.1185, "step": 4104 }, { "epoch": 0.03192435832428615, "grad_norm": 0.15419778860674, "learning_rate": 9.975110411040038e-05, "loss": 2.1247, "step": 4105 }, { "epoch": 0.031932135269066735, "grad_norm": 0.1386193396433695, "learning_rate": 9.97509823382953e-05, "loss": 2.0974, "step": 4106 }, { "epoch": 0.031939912213847316, "grad_norm": 0.1500645627832559, "learning_rate": 9.975086053648342e-05, "loss": 2.1555, "step": 4107 }, { "epoch": 0.0319476891586279, "grad_norm": 0.5442024844986286, "learning_rate": 9.97507387049648e-05, "loss": 2.1807, "step": 4108 }, { "epoch": 0.03195546610340848, "grad_norm": 0.14885564991627212, "learning_rate": 9.975061684373954e-05, "loss": 2.1241, "step": 4109 }, { "epoch": 0.03196324304818906, "grad_norm": 0.1685072228416568, "learning_rate": 9.975049495280769e-05, "loss": 2.1731, "step": 4110 }, { "epoch": 0.03197101999296964, "grad_norm": 1.202764953739782, "learning_rate": 9.975037303216933e-05, "loss": 2.0878, "step": 4111 }, { "epoch": 0.03197879693775022, "grad_norm": 0.1747620311913342, "learning_rate": 9.975025108182452e-05, "loss": 2.1754, "step": 4112 }, { "epoch": 0.0319865738825308, "grad_norm": 0.2125697118816237, "learning_rate": 9.975012910177337e-05, "loss": 2.1078, "step": 4113 }, { "epoch": 0.031994350827311384, "grad_norm": 0.18953623743445774, "learning_rate": 9.97500070920159e-05, "loss": 2.1392, "step": 4114 }, { "epoch": 0.032002127772091965, "grad_norm": 0.20078767479521167, "learning_rate": 9.974988505255223e-05, "loss": 2.1512, "step": 4115 }, { "epoch": 0.032009904716872546, "grad_norm": 0.1743690172099737, "learning_rate": 9.97497629833824e-05, "loss": 2.1341, "step": 4116 }, { "epoch": 0.03201768166165313, "grad_norm": 0.18149099956296202, "learning_rate": 9.974964088450651e-05, "loss": 2.1102, "step": 4117 }, { "epoch": 0.03202545860643371, "grad_norm": 0.36173883837759313, "learning_rate": 9.97495187559246e-05, "loss": 2.1364, "step": 4118 }, { "epoch": 0.03203323555121429, "grad_norm": 0.277486100253514, "learning_rate": 9.974939659763678e-05, "loss": 2.088, "step": 4119 }, { "epoch": 0.03204101249599487, "grad_norm": 0.16284340311651785, "learning_rate": 9.97492744096431e-05, "loss": 2.1167, "step": 4120 }, { "epoch": 0.03204878944077545, "grad_norm": 0.14205999791459978, "learning_rate": 9.974915219194363e-05, "loss": 2.1052, "step": 4121 }, { "epoch": 0.03205656638555603, "grad_norm": 0.20704317365841202, "learning_rate": 9.974902994453846e-05, "loss": 2.1163, "step": 4122 }, { "epoch": 0.032064343330336614, "grad_norm": 0.1467201358122988, "learning_rate": 9.974890766742766e-05, "loss": 2.1048, "step": 4123 }, { "epoch": 0.032072120275117195, "grad_norm": 0.14554071835614502, "learning_rate": 9.974878536061128e-05, "loss": 2.0981, "step": 4124 }, { "epoch": 0.03207989721989778, "grad_norm": 0.15806412141139076, "learning_rate": 9.974866302408943e-05, "loss": 2.1531, "step": 4125 }, { "epoch": 0.03208767416467836, "grad_norm": 0.15468992346828933, "learning_rate": 9.974854065786214e-05, "loss": 2.115, "step": 4126 }, { "epoch": 0.03209545110945894, "grad_norm": 0.4419939431959374, "learning_rate": 9.974841826192953e-05, "loss": 2.1385, "step": 4127 }, { "epoch": 0.03210322805423953, "grad_norm": 0.14870920247044284, "learning_rate": 9.974829583629164e-05, "loss": 2.083, "step": 4128 }, { "epoch": 0.03211100499902011, "grad_norm": 0.13654682179052038, "learning_rate": 9.974817338094855e-05, "loss": 2.1244, "step": 4129 }, { "epoch": 0.03211878194380069, "grad_norm": 0.13249631383642352, "learning_rate": 9.974805089590034e-05, "loss": 2.1188, "step": 4130 }, { "epoch": 0.03212655888858127, "grad_norm": 0.12827713898554083, "learning_rate": 9.974792838114707e-05, "loss": 2.1577, "step": 4131 }, { "epoch": 0.03213433583336185, "grad_norm": 0.13785744243093073, "learning_rate": 9.974780583668882e-05, "loss": 2.0707, "step": 4132 }, { "epoch": 0.03214211277814243, "grad_norm": 0.14030260468672584, "learning_rate": 9.974768326252568e-05, "loss": 2.0845, "step": 4133 }, { "epoch": 0.032149889722923014, "grad_norm": 0.13779190094078572, "learning_rate": 9.974756065865769e-05, "loss": 2.1223, "step": 4134 }, { "epoch": 0.032157666667703595, "grad_norm": 0.12955556004978933, "learning_rate": 9.974743802508496e-05, "loss": 2.1106, "step": 4135 }, { "epoch": 0.032165443612484176, "grad_norm": 0.13985420501364454, "learning_rate": 9.974731536180754e-05, "loss": 2.0902, "step": 4136 }, { "epoch": 0.03217322055726476, "grad_norm": 0.26822188371834166, "learning_rate": 9.97471926688255e-05, "loss": 2.1218, "step": 4137 }, { "epoch": 0.03218099750204534, "grad_norm": 0.1691503149120094, "learning_rate": 9.974706994613892e-05, "loss": 2.1368, "step": 4138 }, { "epoch": 0.03218877444682592, "grad_norm": 0.13915445094780182, "learning_rate": 9.974694719374789e-05, "loss": 2.0846, "step": 4139 }, { "epoch": 0.0321965513916065, "grad_norm": 0.13882321218816787, "learning_rate": 9.974682441165245e-05, "loss": 2.1282, "step": 4140 }, { "epoch": 0.03220432833638708, "grad_norm": 0.14553868938822728, "learning_rate": 9.97467015998527e-05, "loss": 2.0548, "step": 4141 }, { "epoch": 0.03221210528116766, "grad_norm": 7.4396931981222565, "learning_rate": 9.974657875834869e-05, "loss": 2.0536, "step": 4142 }, { "epoch": 0.032219882225948245, "grad_norm": 0.1835391295994652, "learning_rate": 9.974645588714053e-05, "loss": 2.1391, "step": 4143 }, { "epoch": 0.032227659170728826, "grad_norm": 0.18010368599315493, "learning_rate": 9.974633298622826e-05, "loss": 2.1404, "step": 4144 }, { "epoch": 0.03223543611550941, "grad_norm": 0.1373707885110019, "learning_rate": 9.974621005561197e-05, "loss": 2.1209, "step": 4145 }, { "epoch": 0.03224321306028999, "grad_norm": 0.3380768580038945, "learning_rate": 9.974608709529172e-05, "loss": 2.1598, "step": 4146 }, { "epoch": 0.03225099000507057, "grad_norm": 0.1793097329784831, "learning_rate": 9.97459641052676e-05, "loss": 2.0793, "step": 4147 }, { "epoch": 0.03225876694985115, "grad_norm": 0.1417907474522022, "learning_rate": 9.974584108553966e-05, "loss": 2.0905, "step": 4148 }, { "epoch": 0.03226654389463173, "grad_norm": 0.16265486340535937, "learning_rate": 9.9745718036108e-05, "loss": 2.1084, "step": 4149 }, { "epoch": 0.03227432083941231, "grad_norm": 0.14737543318642282, "learning_rate": 9.974559495697269e-05, "loss": 2.1248, "step": 4150 }, { "epoch": 0.032282097784192894, "grad_norm": 0.15073015049466107, "learning_rate": 9.974547184813378e-05, "loss": 2.1522, "step": 4151 }, { "epoch": 0.032289874728973475, "grad_norm": 0.18993468070296102, "learning_rate": 9.974534870959136e-05, "loss": 2.0919, "step": 4152 }, { "epoch": 0.032297651673754056, "grad_norm": 0.1519629257203931, "learning_rate": 9.974522554134552e-05, "loss": 2.1598, "step": 4153 }, { "epoch": 0.03230542861853464, "grad_norm": 0.1358715134478909, "learning_rate": 9.974510234339629e-05, "loss": 2.0799, "step": 4154 }, { "epoch": 0.03231320556331522, "grad_norm": 0.6059484371224358, "learning_rate": 9.974497911574379e-05, "loss": 2.1747, "step": 4155 }, { "epoch": 0.0323209825080958, "grad_norm": 0.14430383465388041, "learning_rate": 9.974485585838805e-05, "loss": 2.0862, "step": 4156 }, { "epoch": 0.03232875945287638, "grad_norm": 0.19739493034765224, "learning_rate": 9.974473257132919e-05, "loss": 2.1718, "step": 4157 }, { "epoch": 0.03233653639765696, "grad_norm": 0.1495641341361954, "learning_rate": 9.974460925456725e-05, "loss": 2.08, "step": 4158 }, { "epoch": 0.03234431334243754, "grad_norm": 0.13884170275362637, "learning_rate": 9.97444859081023e-05, "loss": 2.1258, "step": 4159 }, { "epoch": 0.032352090287218124, "grad_norm": 0.1563643072958706, "learning_rate": 9.974436253193444e-05, "loss": 2.1571, "step": 4160 }, { "epoch": 0.032359867231998705, "grad_norm": 0.5285065836331051, "learning_rate": 9.974423912606374e-05, "loss": 2.1368, "step": 4161 }, { "epoch": 0.03236764417677929, "grad_norm": 0.1584908375496909, "learning_rate": 9.974411569049025e-05, "loss": 2.1092, "step": 4162 }, { "epoch": 0.03237542112155987, "grad_norm": 0.12946211540829258, "learning_rate": 9.974399222521407e-05, "loss": 2.1251, "step": 4163 }, { "epoch": 0.03238319806634045, "grad_norm": 0.13912481968596294, "learning_rate": 9.974386873023526e-05, "loss": 2.1156, "step": 4164 }, { "epoch": 0.03239097501112103, "grad_norm": 0.1464181393498569, "learning_rate": 9.974374520555389e-05, "loss": 2.1448, "step": 4165 }, { "epoch": 0.03239875195590161, "grad_norm": 0.5251186194141825, "learning_rate": 9.974362165117005e-05, "loss": 2.0991, "step": 4166 }, { "epoch": 0.03240652890068219, "grad_norm": 0.16280843198515957, "learning_rate": 9.974349806708379e-05, "loss": 2.1247, "step": 4167 }, { "epoch": 0.032414305845462774, "grad_norm": 0.15225173530263372, "learning_rate": 9.97433744532952e-05, "loss": 2.1468, "step": 4168 }, { "epoch": 0.032422082790243355, "grad_norm": 0.12899253077661096, "learning_rate": 9.974325080980436e-05, "loss": 2.1257, "step": 4169 }, { "epoch": 0.032429859735023936, "grad_norm": 0.15047044707059673, "learning_rate": 9.974312713661133e-05, "loss": 2.0726, "step": 4170 }, { "epoch": 0.03243763667980452, "grad_norm": 0.1703326418650816, "learning_rate": 9.974300343371619e-05, "loss": 2.0969, "step": 4171 }, { "epoch": 0.0324454136245851, "grad_norm": 0.13437088635349276, "learning_rate": 9.9742879701119e-05, "loss": 2.1094, "step": 4172 }, { "epoch": 0.03245319056936568, "grad_norm": 0.13287602380148808, "learning_rate": 9.974275593881986e-05, "loss": 2.1367, "step": 4173 }, { "epoch": 0.03246096751414626, "grad_norm": 0.24784148129555938, "learning_rate": 9.974263214681883e-05, "loss": 2.083, "step": 4174 }, { "epoch": 0.03246874445892684, "grad_norm": 0.15881093021236312, "learning_rate": 9.974250832511598e-05, "loss": 2.0988, "step": 4175 }, { "epoch": 0.03247652140370742, "grad_norm": 0.1503576498332551, "learning_rate": 9.974238447371141e-05, "loss": 2.1534, "step": 4176 }, { "epoch": 0.032484298348488004, "grad_norm": 0.14400814066856138, "learning_rate": 9.974226059260515e-05, "loss": 2.1034, "step": 4177 }, { "epoch": 0.032492075293268585, "grad_norm": 0.1304486313780096, "learning_rate": 9.974213668179728e-05, "loss": 2.058, "step": 4178 }, { "epoch": 0.032499852238049166, "grad_norm": 0.14028535784789506, "learning_rate": 9.974201274128792e-05, "loss": 2.1498, "step": 4179 }, { "epoch": 0.03250762918282975, "grad_norm": 0.13194966369047917, "learning_rate": 9.97418887710771e-05, "loss": 2.1092, "step": 4180 }, { "epoch": 0.03251540612761033, "grad_norm": 0.12997812906541029, "learning_rate": 9.97417647711649e-05, "loss": 2.0935, "step": 4181 }, { "epoch": 0.03252318307239091, "grad_norm": 0.14923644930686616, "learning_rate": 9.974164074155142e-05, "loss": 2.1043, "step": 4182 }, { "epoch": 0.03253096001717149, "grad_norm": 0.1255706160893978, "learning_rate": 9.97415166822367e-05, "loss": 2.1115, "step": 4183 }, { "epoch": 0.03253873696195207, "grad_norm": 0.33636516323457394, "learning_rate": 9.974139259322085e-05, "loss": 2.0773, "step": 4184 }, { "epoch": 0.03254651390673265, "grad_norm": 0.13204123021343772, "learning_rate": 9.974126847450392e-05, "loss": 2.1151, "step": 4185 }, { "epoch": 0.032554290851513235, "grad_norm": 0.12006987027294581, "learning_rate": 9.974114432608597e-05, "loss": 2.0875, "step": 4186 }, { "epoch": 0.03256206779629382, "grad_norm": 0.11820835015333225, "learning_rate": 9.974102014796711e-05, "loss": 2.14, "step": 4187 }, { "epoch": 0.032569844741074404, "grad_norm": 0.12033359699185932, "learning_rate": 9.974089594014739e-05, "loss": 2.1244, "step": 4188 }, { "epoch": 0.032577621685854985, "grad_norm": 0.18160287421164636, "learning_rate": 9.97407717026269e-05, "loss": 2.1134, "step": 4189 }, { "epoch": 0.032585398630635566, "grad_norm": 0.12151841971398183, "learning_rate": 9.974064743540569e-05, "loss": 2.0684, "step": 4190 }, { "epoch": 0.03259317557541615, "grad_norm": 0.11555914502285972, "learning_rate": 9.974052313848386e-05, "loss": 2.1156, "step": 4191 }, { "epoch": 0.03260095252019673, "grad_norm": 0.1220521114232839, "learning_rate": 9.974039881186147e-05, "loss": 2.1201, "step": 4192 }, { "epoch": 0.03260872946497731, "grad_norm": 0.12452317711422989, "learning_rate": 9.97402744555386e-05, "loss": 2.0813, "step": 4193 }, { "epoch": 0.03261650640975789, "grad_norm": 0.24352007241249066, "learning_rate": 9.974015006951532e-05, "loss": 2.1614, "step": 4194 }, { "epoch": 0.03262428335453847, "grad_norm": 0.2823559741166856, "learning_rate": 9.97400256537917e-05, "loss": 2.0798, "step": 4195 }, { "epoch": 0.03263206029931905, "grad_norm": 0.1251812778380329, "learning_rate": 9.973990120836783e-05, "loss": 2.0566, "step": 4196 }, { "epoch": 0.032639837244099634, "grad_norm": 0.17716068580276606, "learning_rate": 9.973977673324378e-05, "loss": 2.0766, "step": 4197 }, { "epoch": 0.032647614188880215, "grad_norm": 0.12026185263425185, "learning_rate": 9.973965222841962e-05, "loss": 2.0909, "step": 4198 }, { "epoch": 0.0326553911336608, "grad_norm": 0.12805363946621642, "learning_rate": 9.97395276938954e-05, "loss": 2.1138, "step": 4199 }, { "epoch": 0.03266316807844138, "grad_norm": 0.17986496376797165, "learning_rate": 9.973940312967123e-05, "loss": 2.1239, "step": 4200 }, { "epoch": 0.03267094502322196, "grad_norm": 0.1336467601402065, "learning_rate": 9.973927853574719e-05, "loss": 2.1092, "step": 4201 }, { "epoch": 0.03267872196800254, "grad_norm": 0.12812500676834612, "learning_rate": 9.973915391212332e-05, "loss": 2.1012, "step": 4202 }, { "epoch": 0.03268649891278312, "grad_norm": 0.11989725496366609, "learning_rate": 9.97390292587997e-05, "loss": 2.1324, "step": 4203 }, { "epoch": 0.0326942758575637, "grad_norm": 0.12029680352331877, "learning_rate": 9.973890457577644e-05, "loss": 2.0887, "step": 4204 }, { "epoch": 0.032702052802344284, "grad_norm": 0.13340238816667072, "learning_rate": 9.973877986305358e-05, "loss": 2.0923, "step": 4205 }, { "epoch": 0.032709829747124865, "grad_norm": 0.14735731855384088, "learning_rate": 9.97386551206312e-05, "loss": 2.0853, "step": 4206 }, { "epoch": 0.032717606691905446, "grad_norm": 0.16171219654607277, "learning_rate": 9.973853034850938e-05, "loss": 2.1455, "step": 4207 }, { "epoch": 0.03272538363668603, "grad_norm": 0.15313538202760718, "learning_rate": 9.97384055466882e-05, "loss": 2.0884, "step": 4208 }, { "epoch": 0.03273316058146661, "grad_norm": 0.11728181323551128, "learning_rate": 9.973828071516773e-05, "loss": 2.0949, "step": 4209 }, { "epoch": 0.03274093752624719, "grad_norm": 0.12458384695688399, "learning_rate": 9.973815585394803e-05, "loss": 2.1196, "step": 4210 }, { "epoch": 0.03274871447102777, "grad_norm": 0.12343604473593223, "learning_rate": 9.97380309630292e-05, "loss": 2.1435, "step": 4211 }, { "epoch": 0.03275649141580835, "grad_norm": 0.11906670536064348, "learning_rate": 9.973790604241129e-05, "loss": 2.1427, "step": 4212 }, { "epoch": 0.03276426836058893, "grad_norm": 0.19763054322553938, "learning_rate": 9.973778109209439e-05, "loss": 2.1642, "step": 4213 }, { "epoch": 0.032772045305369514, "grad_norm": 0.11962049713414195, "learning_rate": 9.973765611207858e-05, "loss": 2.0697, "step": 4214 }, { "epoch": 0.032779822250150095, "grad_norm": 0.11325045032780205, "learning_rate": 9.973753110236392e-05, "loss": 2.1052, "step": 4215 }, { "epoch": 0.032787599194930676, "grad_norm": 0.12868152197554353, "learning_rate": 9.973740606295049e-05, "loss": 2.1389, "step": 4216 }, { "epoch": 0.03279537613971126, "grad_norm": 0.1411499805684094, "learning_rate": 9.973728099383837e-05, "loss": 2.0899, "step": 4217 }, { "epoch": 0.03280315308449184, "grad_norm": 0.12908305285668212, "learning_rate": 9.973715589502761e-05, "loss": 2.1133, "step": 4218 }, { "epoch": 0.03281093002927242, "grad_norm": 0.12525091482168008, "learning_rate": 9.973703076651832e-05, "loss": 2.0665, "step": 4219 }, { "epoch": 0.032818706974053, "grad_norm": 0.128886397402242, "learning_rate": 9.973690560831055e-05, "loss": 2.1158, "step": 4220 }, { "epoch": 0.03282648391883358, "grad_norm": 0.11048128605282921, "learning_rate": 9.97367804204044e-05, "loss": 2.0986, "step": 4221 }, { "epoch": 0.03283426086361416, "grad_norm": 0.12487988362697162, "learning_rate": 9.973665520279992e-05, "loss": 2.1109, "step": 4222 }, { "epoch": 0.032842037808394745, "grad_norm": 0.1414549293794646, "learning_rate": 9.973652995549718e-05, "loss": 2.0971, "step": 4223 }, { "epoch": 0.032849814753175326, "grad_norm": 0.12265704995266215, "learning_rate": 9.973640467849627e-05, "loss": 2.1193, "step": 4224 }, { "epoch": 0.03285759169795591, "grad_norm": 0.13329666759151151, "learning_rate": 9.973627937179728e-05, "loss": 2.135, "step": 4225 }, { "epoch": 0.03286536864273649, "grad_norm": 0.12409211932484279, "learning_rate": 9.973615403540026e-05, "loss": 2.0986, "step": 4226 }, { "epoch": 0.03287314558751707, "grad_norm": 0.13839745974600473, "learning_rate": 9.973602866930529e-05, "loss": 2.0675, "step": 4227 }, { "epoch": 0.03288092253229765, "grad_norm": 0.13483565582960555, "learning_rate": 9.973590327351246e-05, "loss": 2.1107, "step": 4228 }, { "epoch": 0.03288869947707823, "grad_norm": 0.11769076711985911, "learning_rate": 9.973577784802181e-05, "loss": 2.0632, "step": 4229 }, { "epoch": 0.03289647642185881, "grad_norm": 0.14527292175222312, "learning_rate": 9.973565239283346e-05, "loss": 2.115, "step": 4230 }, { "epoch": 0.032904253366639394, "grad_norm": 0.1365663858280876, "learning_rate": 9.973552690794746e-05, "loss": 2.1769, "step": 4231 }, { "epoch": 0.032912030311419975, "grad_norm": 0.11994760485461683, "learning_rate": 9.973540139336385e-05, "loss": 2.0874, "step": 4232 }, { "epoch": 0.032919807256200556, "grad_norm": 0.1834975386698263, "learning_rate": 9.973527584908277e-05, "loss": 2.102, "step": 4233 }, { "epoch": 0.03292758420098114, "grad_norm": 0.16366722031987288, "learning_rate": 9.973515027510427e-05, "loss": 2.1199, "step": 4234 }, { "epoch": 0.03293536114576172, "grad_norm": 0.11210549601925586, "learning_rate": 9.973502467142843e-05, "loss": 2.1124, "step": 4235 }, { "epoch": 0.0329431380905423, "grad_norm": 0.18249140159333632, "learning_rate": 9.973489903805531e-05, "loss": 2.0964, "step": 4236 }, { "epoch": 0.03295091503532288, "grad_norm": 0.15740854693668996, "learning_rate": 9.973477337498497e-05, "loss": 2.0606, "step": 4237 }, { "epoch": 0.03295869198010346, "grad_norm": 0.11349558769433153, "learning_rate": 9.973464768221753e-05, "loss": 2.1435, "step": 4238 }, { "epoch": 0.03296646892488404, "grad_norm": 0.1393210118768634, "learning_rate": 9.973452195975304e-05, "loss": 2.1094, "step": 4239 }, { "epoch": 0.032974245869664624, "grad_norm": 0.25657527744461134, "learning_rate": 9.973439620759156e-05, "loss": 2.1636, "step": 4240 }, { "epoch": 0.032982022814445205, "grad_norm": 0.17417453615862297, "learning_rate": 9.97342704257332e-05, "loss": 2.0735, "step": 4241 }, { "epoch": 0.03298979975922579, "grad_norm": 0.11618336072122054, "learning_rate": 9.9734144614178e-05, "loss": 2.1044, "step": 4242 }, { "epoch": 0.03299757670400637, "grad_norm": 0.12645187174577138, "learning_rate": 9.973401877292608e-05, "loss": 2.1231, "step": 4243 }, { "epoch": 0.03300535364878695, "grad_norm": 0.12211204396888653, "learning_rate": 9.973389290197747e-05, "loss": 2.1123, "step": 4244 }, { "epoch": 0.03301313059356753, "grad_norm": 0.12304913571509077, "learning_rate": 9.973376700133225e-05, "loss": 2.0804, "step": 4245 }, { "epoch": 0.03302090753834811, "grad_norm": 0.15780890766385228, "learning_rate": 9.973364107099052e-05, "loss": 2.113, "step": 4246 }, { "epoch": 0.0330286844831287, "grad_norm": 0.16152449545411385, "learning_rate": 9.973351511095235e-05, "loss": 2.0706, "step": 4247 }, { "epoch": 0.03303646142790928, "grad_norm": 0.11619987904360302, "learning_rate": 9.973338912121781e-05, "loss": 2.1211, "step": 4248 }, { "epoch": 0.03304423837268986, "grad_norm": 0.2603088153606894, "learning_rate": 9.973326310178697e-05, "loss": 2.0776, "step": 4249 }, { "epoch": 0.03305201531747044, "grad_norm": 0.210014906585264, "learning_rate": 9.97331370526599e-05, "loss": 2.1375, "step": 4250 }, { "epoch": 0.033059792262251024, "grad_norm": 0.14277832509065136, "learning_rate": 9.973301097383668e-05, "loss": 2.1192, "step": 4251 }, { "epoch": 0.033067569207031605, "grad_norm": 0.14091799666614238, "learning_rate": 9.97328848653174e-05, "loss": 2.1273, "step": 4252 }, { "epoch": 0.033075346151812186, "grad_norm": 0.18157909872200026, "learning_rate": 9.973275872710211e-05, "loss": 2.056, "step": 4253 }, { "epoch": 0.03308312309659277, "grad_norm": 0.11873741226700585, "learning_rate": 9.973263255919091e-05, "loss": 2.125, "step": 4254 }, { "epoch": 0.03309090004137335, "grad_norm": 0.16485143477798433, "learning_rate": 9.973250636158387e-05, "loss": 2.0961, "step": 4255 }, { "epoch": 0.03309867698615393, "grad_norm": 0.1638516835291708, "learning_rate": 9.973238013428105e-05, "loss": 2.0753, "step": 4256 }, { "epoch": 0.03310645393093451, "grad_norm": 0.11471029019994103, "learning_rate": 9.973225387728256e-05, "loss": 2.0974, "step": 4257 }, { "epoch": 0.03311423087571509, "grad_norm": 0.17425672129179925, "learning_rate": 9.973212759058843e-05, "loss": 2.1429, "step": 4258 }, { "epoch": 0.03312200782049567, "grad_norm": 0.3152442020718844, "learning_rate": 9.973200127419876e-05, "loss": 2.0917, "step": 4259 }, { "epoch": 0.033129784765276254, "grad_norm": 0.1225780318111406, "learning_rate": 9.973187492811363e-05, "loss": 2.0753, "step": 4260 }, { "epoch": 0.033137561710056836, "grad_norm": 0.20595015596877536, "learning_rate": 9.97317485523331e-05, "loss": 2.098, "step": 4261 }, { "epoch": 0.03314533865483742, "grad_norm": 0.18867414169449748, "learning_rate": 9.973162214685725e-05, "loss": 2.0685, "step": 4262 }, { "epoch": 0.033153115599618, "grad_norm": 0.1245771715175286, "learning_rate": 9.973149571168616e-05, "loss": 2.1118, "step": 4263 }, { "epoch": 0.03316089254439858, "grad_norm": 0.17784846912576502, "learning_rate": 9.97313692468199e-05, "loss": 2.1328, "step": 4264 }, { "epoch": 0.03316866948917916, "grad_norm": 0.17645622418508314, "learning_rate": 9.973124275225856e-05, "loss": 2.1287, "step": 4265 }, { "epoch": 0.03317644643395974, "grad_norm": 0.13296653305317616, "learning_rate": 9.97311162280022e-05, "loss": 2.1146, "step": 4266 }, { "epoch": 0.03318422337874032, "grad_norm": 0.164095039383513, "learning_rate": 9.97309896740509e-05, "loss": 2.1066, "step": 4267 }, { "epoch": 0.033192000323520904, "grad_norm": 0.12945745547079945, "learning_rate": 9.973086309040474e-05, "loss": 2.1543, "step": 4268 }, { "epoch": 0.033199777268301485, "grad_norm": 0.1516106303065718, "learning_rate": 9.973073647706378e-05, "loss": 2.1234, "step": 4269 }, { "epoch": 0.033207554213082066, "grad_norm": 0.13205747781980806, "learning_rate": 9.97306098340281e-05, "loss": 2.0796, "step": 4270 }, { "epoch": 0.03321533115786265, "grad_norm": 0.12871291495697262, "learning_rate": 9.973048316129782e-05, "loss": 2.0741, "step": 4271 }, { "epoch": 0.03322310810264323, "grad_norm": 0.13063588598160575, "learning_rate": 9.973035645887295e-05, "loss": 2.1138, "step": 4272 }, { "epoch": 0.03323088504742381, "grad_norm": 0.12113138009830741, "learning_rate": 9.97302297267536e-05, "loss": 2.0876, "step": 4273 }, { "epoch": 0.03323866199220439, "grad_norm": 0.12820516348031646, "learning_rate": 9.973010296493985e-05, "loss": 2.1127, "step": 4274 }, { "epoch": 0.03324643893698497, "grad_norm": 0.246659971952823, "learning_rate": 9.972997617343175e-05, "loss": 2.0628, "step": 4275 }, { "epoch": 0.03325421588176555, "grad_norm": 0.12607813447967267, "learning_rate": 9.97298493522294e-05, "loss": 2.1038, "step": 4276 }, { "epoch": 0.033261992826546134, "grad_norm": 0.1381168743573838, "learning_rate": 9.972972250133286e-05, "loss": 2.1401, "step": 4277 }, { "epoch": 0.033269769771326715, "grad_norm": 0.1298503295250748, "learning_rate": 9.972959562074223e-05, "loss": 2.0465, "step": 4278 }, { "epoch": 0.0332775467161073, "grad_norm": 0.6179485375377692, "learning_rate": 9.972946871045754e-05, "loss": 2.0417, "step": 4279 }, { "epoch": 0.03328532366088788, "grad_norm": 0.3508935694414775, "learning_rate": 9.972934177047892e-05, "loss": 2.1246, "step": 4280 }, { "epoch": 0.03329310060566846, "grad_norm": 0.16409850352995042, "learning_rate": 9.972921480080641e-05, "loss": 2.0846, "step": 4281 }, { "epoch": 0.03330087755044904, "grad_norm": 0.14657831147982128, "learning_rate": 9.97290878014401e-05, "loss": 2.0723, "step": 4282 }, { "epoch": 0.03330865449522962, "grad_norm": 0.1829250268444913, "learning_rate": 9.972896077238005e-05, "loss": 2.0763, "step": 4283 }, { "epoch": 0.0333164314400102, "grad_norm": 0.18204017540271145, "learning_rate": 9.972883371362635e-05, "loss": 2.0574, "step": 4284 }, { "epoch": 0.033324208384790784, "grad_norm": 0.16527520069372084, "learning_rate": 9.972870662517908e-05, "loss": 2.1391, "step": 4285 }, { "epoch": 0.033331985329571365, "grad_norm": 0.20171064311544107, "learning_rate": 9.972857950703831e-05, "loss": 2.0857, "step": 4286 }, { "epoch": 0.033339762274351946, "grad_norm": 0.1840102455018163, "learning_rate": 9.972845235920412e-05, "loss": 2.0855, "step": 4287 }, { "epoch": 0.03334753921913253, "grad_norm": 0.13738940391823332, "learning_rate": 9.972832518167658e-05, "loss": 2.0429, "step": 4288 }, { "epoch": 0.03335531616391311, "grad_norm": 0.2095431311728283, "learning_rate": 9.972819797445576e-05, "loss": 2.084, "step": 4289 }, { "epoch": 0.03336309310869369, "grad_norm": 0.16487623038820445, "learning_rate": 9.972807073754174e-05, "loss": 2.0283, "step": 4290 }, { "epoch": 0.03337087005347427, "grad_norm": 0.1269647902171371, "learning_rate": 9.972794347093461e-05, "loss": 2.0863, "step": 4291 }, { "epoch": 0.03337864699825485, "grad_norm": 0.14368938798304537, "learning_rate": 9.972781617463443e-05, "loss": 2.0692, "step": 4292 }, { "epoch": 0.03338642394303543, "grad_norm": 0.14396463437509766, "learning_rate": 9.972768884864127e-05, "loss": 2.0589, "step": 4293 }, { "epoch": 0.033394200887816014, "grad_norm": 0.12718779250062492, "learning_rate": 9.972756149295523e-05, "loss": 2.0794, "step": 4294 }, { "epoch": 0.033401977832596595, "grad_norm": 0.17894476859682856, "learning_rate": 9.972743410757637e-05, "loss": 2.0906, "step": 4295 }, { "epoch": 0.033409754777377176, "grad_norm": 0.1543298080969645, "learning_rate": 9.972730669250477e-05, "loss": 2.0793, "step": 4296 }, { "epoch": 0.03341753172215776, "grad_norm": 0.12271966565050797, "learning_rate": 9.972717924774051e-05, "loss": 2.0985, "step": 4297 }, { "epoch": 0.03342530866693834, "grad_norm": 0.1318774850065979, "learning_rate": 9.972705177328365e-05, "loss": 2.1208, "step": 4298 }, { "epoch": 0.03343308561171892, "grad_norm": 0.14412735356172438, "learning_rate": 9.972692426913428e-05, "loss": 2.092, "step": 4299 }, { "epoch": 0.0334408625564995, "grad_norm": 0.12778954187972963, "learning_rate": 9.972679673529249e-05, "loss": 2.1028, "step": 4300 }, { "epoch": 0.03344863950128008, "grad_norm": 0.19711632738122, "learning_rate": 9.972666917175833e-05, "loss": 2.0775, "step": 4301 }, { "epoch": 0.03345641644606066, "grad_norm": 0.18242837997393446, "learning_rate": 9.972654157853187e-05, "loss": 2.1585, "step": 4302 }, { "epoch": 0.033464193390841245, "grad_norm": 0.12329644946467476, "learning_rate": 9.972641395561322e-05, "loss": 2.1647, "step": 4303 }, { "epoch": 0.033471970335621826, "grad_norm": 0.12877237727666024, "learning_rate": 9.972628630300242e-05, "loss": 2.1338, "step": 4304 }, { "epoch": 0.03347974728040241, "grad_norm": 0.1425857926729362, "learning_rate": 9.972615862069958e-05, "loss": 2.0833, "step": 4305 }, { "epoch": 0.033487524225182995, "grad_norm": 0.2075611479553308, "learning_rate": 9.972603090870477e-05, "loss": 2.0068, "step": 4306 }, { "epoch": 0.033495301169963576, "grad_norm": 0.12055514035956115, "learning_rate": 9.972590316701805e-05, "loss": 2.0726, "step": 4307 }, { "epoch": 0.03350307811474416, "grad_norm": 0.11917952960662441, "learning_rate": 9.97257753956395e-05, "loss": 2.095, "step": 4308 }, { "epoch": 0.03351085505952474, "grad_norm": 0.11782469485499822, "learning_rate": 9.97256475945692e-05, "loss": 2.0997, "step": 4309 }, { "epoch": 0.03351863200430532, "grad_norm": 0.125365088000538, "learning_rate": 9.972551976380721e-05, "loss": 2.1062, "step": 4310 }, { "epoch": 0.0335264089490859, "grad_norm": 0.11469232915759421, "learning_rate": 9.972539190335364e-05, "loss": 2.1121, "step": 4311 }, { "epoch": 0.03353418589386648, "grad_norm": 0.1392938522242258, "learning_rate": 9.972526401320854e-05, "loss": 2.0495, "step": 4312 }, { "epoch": 0.03354196283864706, "grad_norm": 0.13718212221815698, "learning_rate": 9.9725136093372e-05, "loss": 2.1054, "step": 4313 }, { "epoch": 0.033549739783427644, "grad_norm": 0.13230232697122377, "learning_rate": 9.97250081438441e-05, "loss": 2.0524, "step": 4314 }, { "epoch": 0.033557516728208225, "grad_norm": 0.11956591111768532, "learning_rate": 9.97248801646249e-05, "loss": 2.0987, "step": 4315 }, { "epoch": 0.03356529367298881, "grad_norm": 0.12161209358759102, "learning_rate": 9.972475215571448e-05, "loss": 2.1098, "step": 4316 }, { "epoch": 0.03357307061776939, "grad_norm": 0.12180874012813163, "learning_rate": 9.972462411711291e-05, "loss": 2.1019, "step": 4317 }, { "epoch": 0.03358084756254997, "grad_norm": 0.5219652085370916, "learning_rate": 9.97244960488203e-05, "loss": 2.0925, "step": 4318 }, { "epoch": 0.03358862450733055, "grad_norm": 0.31583147631387093, "learning_rate": 9.97243679508367e-05, "loss": 2.1429, "step": 4319 }, { "epoch": 0.03359640145211113, "grad_norm": 0.13853190293031356, "learning_rate": 9.972423982316219e-05, "loss": 2.0748, "step": 4320 }, { "epoch": 0.03360417839689171, "grad_norm": 0.2894238493523563, "learning_rate": 9.972411166579684e-05, "loss": 2.1141, "step": 4321 }, { "epoch": 0.033611955341672294, "grad_norm": 0.13629728459516938, "learning_rate": 9.972398347874073e-05, "loss": 2.0459, "step": 4322 }, { "epoch": 0.033619732286452875, "grad_norm": 0.1636842621843874, "learning_rate": 9.972385526199395e-05, "loss": 2.1273, "step": 4323 }, { "epoch": 0.033627509231233456, "grad_norm": 0.14522251578397607, "learning_rate": 9.972372701555656e-05, "loss": 2.0372, "step": 4324 }, { "epoch": 0.03363528617601404, "grad_norm": 0.13343004599664665, "learning_rate": 9.972359873942864e-05, "loss": 2.0863, "step": 4325 }, { "epoch": 0.03364306312079462, "grad_norm": 0.12191611200373577, "learning_rate": 9.972347043361026e-05, "loss": 2.0822, "step": 4326 }, { "epoch": 0.0336508400655752, "grad_norm": 0.12501314137430217, "learning_rate": 9.972334209810152e-05, "loss": 2.1068, "step": 4327 }, { "epoch": 0.03365861701035578, "grad_norm": 0.12667948205164278, "learning_rate": 9.972321373290249e-05, "loss": 2.1036, "step": 4328 }, { "epoch": 0.03366639395513636, "grad_norm": 0.13179560194743373, "learning_rate": 9.972308533801322e-05, "loss": 2.0824, "step": 4329 }, { "epoch": 0.03367417089991694, "grad_norm": 0.22006071860008766, "learning_rate": 9.972295691343382e-05, "loss": 2.1266, "step": 4330 }, { "epoch": 0.033681947844697524, "grad_norm": 0.17333651687204554, "learning_rate": 9.972282845916434e-05, "loss": 2.11, "step": 4331 }, { "epoch": 0.033689724789478105, "grad_norm": 0.2009341673436472, "learning_rate": 9.972269997520489e-05, "loss": 2.0563, "step": 4332 }, { "epoch": 0.033697501734258686, "grad_norm": 0.2518677361417727, "learning_rate": 9.97225714615555e-05, "loss": 2.0632, "step": 4333 }, { "epoch": 0.03370527867903927, "grad_norm": 0.31321070421943353, "learning_rate": 9.97224429182163e-05, "loss": 2.1392, "step": 4334 }, { "epoch": 0.03371305562381985, "grad_norm": 0.3841424051018643, "learning_rate": 9.972231434518732e-05, "loss": 2.102, "step": 4335 }, { "epoch": 0.03372083256860043, "grad_norm": 0.28807434233733736, "learning_rate": 9.972218574246865e-05, "loss": 2.1212, "step": 4336 }, { "epoch": 0.03372860951338101, "grad_norm": 0.20839727491939528, "learning_rate": 9.972205711006039e-05, "loss": 2.0941, "step": 4337 }, { "epoch": 0.03373638645816159, "grad_norm": 0.13986957499959307, "learning_rate": 9.97219284479626e-05, "loss": 2.1127, "step": 4338 }, { "epoch": 0.03374416340294217, "grad_norm": 0.1656806638383342, "learning_rate": 9.972179975617535e-05, "loss": 2.0851, "step": 4339 }, { "epoch": 0.033751940347722754, "grad_norm": 0.23255556774165215, "learning_rate": 9.972167103469873e-05, "loss": 2.0635, "step": 4340 }, { "epoch": 0.033759717292503336, "grad_norm": 0.16641235235314217, "learning_rate": 9.972154228353279e-05, "loss": 2.1245, "step": 4341 }, { "epoch": 0.03376749423728392, "grad_norm": 0.13960546008100289, "learning_rate": 9.972141350267766e-05, "loss": 2.1, "step": 4342 }, { "epoch": 0.0337752711820645, "grad_norm": 0.49035355581993334, "learning_rate": 9.972128469213336e-05, "loss": 2.0949, "step": 4343 }, { "epoch": 0.03378304812684508, "grad_norm": 0.17744914816789215, "learning_rate": 9.972115585189999e-05, "loss": 2.0978, "step": 4344 }, { "epoch": 0.03379082507162566, "grad_norm": 0.16454154358209605, "learning_rate": 9.972102698197764e-05, "loss": 2.1028, "step": 4345 }, { "epoch": 0.03379860201640624, "grad_norm": 0.24502517887774064, "learning_rate": 9.972089808236638e-05, "loss": 2.0783, "step": 4346 }, { "epoch": 0.03380637896118682, "grad_norm": 0.4151406186991085, "learning_rate": 9.972076915306626e-05, "loss": 2.0783, "step": 4347 }, { "epoch": 0.033814155905967404, "grad_norm": 0.31659416900577125, "learning_rate": 9.972064019407738e-05, "loss": 2.1123, "step": 4348 }, { "epoch": 0.033821932850747985, "grad_norm": 0.16368812572694869, "learning_rate": 9.972051120539984e-05, "loss": 2.0619, "step": 4349 }, { "epoch": 0.033829709795528566, "grad_norm": 0.15560576903305223, "learning_rate": 9.972038218703368e-05, "loss": 2.0747, "step": 4350 }, { "epoch": 0.03383748674030915, "grad_norm": 0.15027735422853394, "learning_rate": 9.972025313897899e-05, "loss": 2.0948, "step": 4351 }, { "epoch": 0.03384526368508973, "grad_norm": 0.17152514259969573, "learning_rate": 9.972012406123584e-05, "loss": 2.1363, "step": 4352 }, { "epoch": 0.03385304062987031, "grad_norm": 0.47183741500001464, "learning_rate": 9.971999495380433e-05, "loss": 2.12, "step": 4353 }, { "epoch": 0.03386081757465089, "grad_norm": 0.12916407196719076, "learning_rate": 9.971986581668451e-05, "loss": 2.1056, "step": 4354 }, { "epoch": 0.03386859451943147, "grad_norm": 0.1466787526641777, "learning_rate": 9.971973664987646e-05, "loss": 2.0736, "step": 4355 }, { "epoch": 0.03387637146421205, "grad_norm": 0.151169097977775, "learning_rate": 9.971960745338028e-05, "loss": 2.0996, "step": 4356 }, { "epoch": 0.033884148408992634, "grad_norm": 0.4719189881015311, "learning_rate": 9.971947822719603e-05, "loss": 2.0646, "step": 4357 }, { "epoch": 0.033891925353773215, "grad_norm": 0.19661176193165086, "learning_rate": 9.971934897132379e-05, "loss": 2.1046, "step": 4358 }, { "epoch": 0.0338997022985538, "grad_norm": 0.1282606684585873, "learning_rate": 9.971921968576364e-05, "loss": 2.1065, "step": 4359 }, { "epoch": 0.03390747924333438, "grad_norm": 0.1309592474601484, "learning_rate": 9.971909037051565e-05, "loss": 2.0922, "step": 4360 }, { "epoch": 0.03391525618811496, "grad_norm": 0.15104633138070453, "learning_rate": 9.97189610255799e-05, "loss": 2.1023, "step": 4361 }, { "epoch": 0.03392303313289554, "grad_norm": 0.15729449682219185, "learning_rate": 9.971883165095646e-05, "loss": 2.0806, "step": 4362 }, { "epoch": 0.03393081007767612, "grad_norm": 0.14786110820274964, "learning_rate": 9.971870224664542e-05, "loss": 2.1174, "step": 4363 }, { "epoch": 0.0339385870224567, "grad_norm": 0.19692219812557105, "learning_rate": 9.971857281264687e-05, "loss": 2.1532, "step": 4364 }, { "epoch": 0.03394636396723729, "grad_norm": 0.7056231014615165, "learning_rate": 9.971844334896085e-05, "loss": 2.1379, "step": 4365 }, { "epoch": 0.03395414091201787, "grad_norm": 0.15354580709303917, "learning_rate": 9.971831385558746e-05, "loss": 2.0543, "step": 4366 }, { "epoch": 0.03396191785679845, "grad_norm": 0.17889267011608556, "learning_rate": 9.971818433252679e-05, "loss": 2.0995, "step": 4367 }, { "epoch": 0.033969694801579034, "grad_norm": 0.15081240120365652, "learning_rate": 9.97180547797789e-05, "loss": 2.1051, "step": 4368 }, { "epoch": 0.033977471746359615, "grad_norm": 0.1458456994505573, "learning_rate": 9.971792519734385e-05, "loss": 2.0991, "step": 4369 }, { "epoch": 0.033985248691140196, "grad_norm": 0.17736420340651227, "learning_rate": 9.971779558522175e-05, "loss": 2.1365, "step": 4370 }, { "epoch": 0.03399302563592078, "grad_norm": 0.16522418651642062, "learning_rate": 9.971766594341266e-05, "loss": 2.1329, "step": 4371 }, { "epoch": 0.03400080258070136, "grad_norm": 0.13560657591192504, "learning_rate": 9.971753627191667e-05, "loss": 2.136, "step": 4372 }, { "epoch": 0.03400857952548194, "grad_norm": 0.12689052763798225, "learning_rate": 9.971740657073383e-05, "loss": 2.0722, "step": 4373 }, { "epoch": 0.03401635647026252, "grad_norm": 0.1296484155615443, "learning_rate": 9.971727683986427e-05, "loss": 2.1258, "step": 4374 }, { "epoch": 0.0340241334150431, "grad_norm": 0.13399222865515656, "learning_rate": 9.971714707930801e-05, "loss": 2.1285, "step": 4375 }, { "epoch": 0.03403191035982368, "grad_norm": 0.11835518217687405, "learning_rate": 9.971701728906515e-05, "loss": 2.0668, "step": 4376 }, { "epoch": 0.034039687304604264, "grad_norm": 0.12610529087799488, "learning_rate": 9.971688746913578e-05, "loss": 2.1211, "step": 4377 }, { "epoch": 0.034047464249384846, "grad_norm": 0.14942266397221177, "learning_rate": 9.971675761951997e-05, "loss": 2.1038, "step": 4378 }, { "epoch": 0.03405524119416543, "grad_norm": 0.1865788265066603, "learning_rate": 9.971662774021778e-05, "loss": 2.0762, "step": 4379 }, { "epoch": 0.03406301813894601, "grad_norm": 0.1457327270941104, "learning_rate": 9.971649783122932e-05, "loss": 2.1025, "step": 4380 }, { "epoch": 0.03407079508372659, "grad_norm": 0.11613923386382045, "learning_rate": 9.971636789255463e-05, "loss": 2.0984, "step": 4381 }, { "epoch": 0.03407857202850717, "grad_norm": 0.14569287526251948, "learning_rate": 9.971623792419382e-05, "loss": 2.1363, "step": 4382 }, { "epoch": 0.03408634897328775, "grad_norm": 0.15192927061616682, "learning_rate": 9.971610792614693e-05, "loss": 2.1017, "step": 4383 }, { "epoch": 0.03409412591806833, "grad_norm": 0.12388566731335988, "learning_rate": 9.971597789841409e-05, "loss": 2.1178, "step": 4384 }, { "epoch": 0.034101902862848914, "grad_norm": 0.13180319287263983, "learning_rate": 9.971584784099534e-05, "loss": 2.0938, "step": 4385 }, { "epoch": 0.034109679807629495, "grad_norm": 0.14344657295195845, "learning_rate": 9.971571775389078e-05, "loss": 2.1001, "step": 4386 }, { "epoch": 0.034117456752410076, "grad_norm": 0.18471559090084722, "learning_rate": 9.971558763710046e-05, "loss": 2.1102, "step": 4387 }, { "epoch": 0.03412523369719066, "grad_norm": 0.1265760391520172, "learning_rate": 9.971545749062447e-05, "loss": 2.1553, "step": 4388 }, { "epoch": 0.03413301064197124, "grad_norm": 0.14817742731554964, "learning_rate": 9.971532731446292e-05, "loss": 2.1218, "step": 4389 }, { "epoch": 0.03414078758675182, "grad_norm": 0.13622809620671034, "learning_rate": 9.971519710861582e-05, "loss": 2.0649, "step": 4390 }, { "epoch": 0.0341485645315324, "grad_norm": 0.11579086681000023, "learning_rate": 9.97150668730833e-05, "loss": 2.0281, "step": 4391 }, { "epoch": 0.03415634147631298, "grad_norm": 0.12871155424905928, "learning_rate": 9.971493660786544e-05, "loss": 2.0332, "step": 4392 }, { "epoch": 0.03416411842109356, "grad_norm": 0.13557481515304837, "learning_rate": 9.971480631296228e-05, "loss": 2.096, "step": 4393 }, { "epoch": 0.034171895365874144, "grad_norm": 0.12651068041233918, "learning_rate": 9.971467598837392e-05, "loss": 2.1117, "step": 4394 }, { "epoch": 0.034179672310654725, "grad_norm": 0.11718918708355139, "learning_rate": 9.971454563410046e-05, "loss": 2.1038, "step": 4395 }, { "epoch": 0.03418744925543531, "grad_norm": 0.13242106554353836, "learning_rate": 9.971441525014193e-05, "loss": 2.1293, "step": 4396 }, { "epoch": 0.03419522620021589, "grad_norm": 0.12224818212874781, "learning_rate": 9.971428483649844e-05, "loss": 2.145, "step": 4397 }, { "epoch": 0.03420300314499647, "grad_norm": 0.12076190003701592, "learning_rate": 9.971415439317007e-05, "loss": 2.1337, "step": 4398 }, { "epoch": 0.03421078008977705, "grad_norm": 0.13050039592275758, "learning_rate": 9.971402392015687e-05, "loss": 2.106, "step": 4399 }, { "epoch": 0.03421855703455763, "grad_norm": 0.11880396179947021, "learning_rate": 9.971389341745895e-05, "loss": 2.0788, "step": 4400 }, { "epoch": 0.03422633397933821, "grad_norm": 0.14015467825307035, "learning_rate": 9.971376288507637e-05, "loss": 2.0884, "step": 4401 }, { "epoch": 0.034234110924118794, "grad_norm": 0.13256607899962683, "learning_rate": 9.97136323230092e-05, "loss": 2.1151, "step": 4402 }, { "epoch": 0.034241887868899375, "grad_norm": 0.12769792347984396, "learning_rate": 9.971350173125755e-05, "loss": 2.0894, "step": 4403 }, { "epoch": 0.034249664813679956, "grad_norm": 0.1859409553086703, "learning_rate": 9.971337110982149e-05, "loss": 2.1384, "step": 4404 }, { "epoch": 0.03425744175846054, "grad_norm": 0.19263729272282615, "learning_rate": 9.971324045870106e-05, "loss": 2.0655, "step": 4405 }, { "epoch": 0.03426521870324112, "grad_norm": 0.13481535386693383, "learning_rate": 9.971310977789638e-05, "loss": 2.0758, "step": 4406 }, { "epoch": 0.0342729956480217, "grad_norm": 0.13152760664838292, "learning_rate": 9.971297906740751e-05, "loss": 2.1341, "step": 4407 }, { "epoch": 0.03428077259280228, "grad_norm": 0.142347297629962, "learning_rate": 9.971284832723452e-05, "loss": 2.0144, "step": 4408 }, { "epoch": 0.03428854953758286, "grad_norm": 0.1257257024561322, "learning_rate": 9.97127175573775e-05, "loss": 2.1163, "step": 4409 }, { "epoch": 0.03429632648236344, "grad_norm": 0.13788715125550674, "learning_rate": 9.971258675783653e-05, "loss": 2.0229, "step": 4410 }, { "epoch": 0.034304103427144024, "grad_norm": 0.1421958169605758, "learning_rate": 9.971245592861169e-05, "loss": 2.0955, "step": 4411 }, { "epoch": 0.034311880371924605, "grad_norm": 0.12246583522396254, "learning_rate": 9.971232506970305e-05, "loss": 2.0796, "step": 4412 }, { "epoch": 0.034319657316705186, "grad_norm": 0.1599505510819018, "learning_rate": 9.971219418111068e-05, "loss": 2.1489, "step": 4413 }, { "epoch": 0.03432743426148577, "grad_norm": 0.14886773237525958, "learning_rate": 9.971206326283469e-05, "loss": 2.0922, "step": 4414 }, { "epoch": 0.03433521120626635, "grad_norm": 0.13438693349052885, "learning_rate": 9.971193231487512e-05, "loss": 2.1172, "step": 4415 }, { "epoch": 0.03434298815104693, "grad_norm": 0.12354950876057764, "learning_rate": 9.971180133723208e-05, "loss": 2.1625, "step": 4416 }, { "epoch": 0.03435076509582751, "grad_norm": 0.13706618293393408, "learning_rate": 9.971167032990563e-05, "loss": 2.1011, "step": 4417 }, { "epoch": 0.03435854204060809, "grad_norm": 0.12050477663649319, "learning_rate": 9.971153929289584e-05, "loss": 2.0545, "step": 4418 }, { "epoch": 0.03436631898538867, "grad_norm": 0.1376746739085468, "learning_rate": 9.971140822620281e-05, "loss": 2.1521, "step": 4419 }, { "epoch": 0.034374095930169254, "grad_norm": 0.15783035504370532, "learning_rate": 9.97112771298266e-05, "loss": 2.041, "step": 4420 }, { "epoch": 0.034381872874949836, "grad_norm": 0.1267084105857375, "learning_rate": 9.97111460037673e-05, "loss": 2.0816, "step": 4421 }, { "epoch": 0.03438964981973042, "grad_norm": 0.11259710534931335, "learning_rate": 9.971101484802499e-05, "loss": 2.1061, "step": 4422 }, { "epoch": 0.034397426764511, "grad_norm": 0.11388252394927358, "learning_rate": 9.971088366259974e-05, "loss": 2.0403, "step": 4423 }, { "epoch": 0.03440520370929158, "grad_norm": 0.11994509509961625, "learning_rate": 9.971075244749164e-05, "loss": 2.1056, "step": 4424 }, { "epoch": 0.03441298065407217, "grad_norm": 0.11152197146030135, "learning_rate": 9.971062120270074e-05, "loss": 2.1111, "step": 4425 }, { "epoch": 0.03442075759885275, "grad_norm": 0.11223493879786726, "learning_rate": 9.971048992822716e-05, "loss": 2.1192, "step": 4426 }, { "epoch": 0.03442853454363333, "grad_norm": 0.11240617884271056, "learning_rate": 9.971035862407094e-05, "loss": 2.1534, "step": 4427 }, { "epoch": 0.03443631148841391, "grad_norm": 0.14490240612989158, "learning_rate": 9.971022729023219e-05, "loss": 2.1027, "step": 4428 }, { "epoch": 0.03444408843319449, "grad_norm": 0.12843764568852903, "learning_rate": 9.971009592671097e-05, "loss": 2.0812, "step": 4429 }, { "epoch": 0.03445186537797507, "grad_norm": 0.12397618477991738, "learning_rate": 9.970996453350735e-05, "loss": 2.102, "step": 4430 }, { "epoch": 0.034459642322755654, "grad_norm": 0.11762042856673306, "learning_rate": 9.970983311062143e-05, "loss": 2.063, "step": 4431 }, { "epoch": 0.034467419267536235, "grad_norm": 0.15178649283516013, "learning_rate": 9.970970165805326e-05, "loss": 2.0792, "step": 4432 }, { "epoch": 0.034475196212316817, "grad_norm": 0.1247079105414353, "learning_rate": 9.970957017580297e-05, "loss": 2.1158, "step": 4433 }, { "epoch": 0.0344829731570974, "grad_norm": 0.12018858800140306, "learning_rate": 9.970943866387059e-05, "loss": 2.0472, "step": 4434 }, { "epoch": 0.03449075010187798, "grad_norm": 0.15701788557160243, "learning_rate": 9.97093071222562e-05, "loss": 2.0983, "step": 4435 }, { "epoch": 0.03449852704665856, "grad_norm": 0.1560533285766257, "learning_rate": 9.970917555095992e-05, "loss": 2.1365, "step": 4436 }, { "epoch": 0.03450630399143914, "grad_norm": 0.11957544739351889, "learning_rate": 9.97090439499818e-05, "loss": 2.0984, "step": 4437 }, { "epoch": 0.03451408093621972, "grad_norm": 0.15027459674237384, "learning_rate": 9.97089123193219e-05, "loss": 2.0886, "step": 4438 }, { "epoch": 0.034521857881000304, "grad_norm": 0.17773252360838265, "learning_rate": 9.970878065898033e-05, "loss": 2.0637, "step": 4439 }, { "epoch": 0.034529634825780885, "grad_norm": 0.12480896376601672, "learning_rate": 9.970864896895715e-05, "loss": 2.1218, "step": 4440 }, { "epoch": 0.034537411770561466, "grad_norm": 0.13738833967463057, "learning_rate": 9.970851724925245e-05, "loss": 2.1238, "step": 4441 }, { "epoch": 0.03454518871534205, "grad_norm": 0.16939082080140894, "learning_rate": 9.970838549986631e-05, "loss": 2.0685, "step": 4442 }, { "epoch": 0.03455296566012263, "grad_norm": 0.20176887669573626, "learning_rate": 9.97082537207988e-05, "loss": 2.0844, "step": 4443 }, { "epoch": 0.03456074260490321, "grad_norm": 0.13226774769408606, "learning_rate": 9.970812191205e-05, "loss": 2.1201, "step": 4444 }, { "epoch": 0.03456851954968379, "grad_norm": 0.17178372877436227, "learning_rate": 9.970799007362e-05, "loss": 2.0436, "step": 4445 }, { "epoch": 0.03457629649446437, "grad_norm": 0.1377568529796255, "learning_rate": 9.970785820550885e-05, "loss": 2.1361, "step": 4446 }, { "epoch": 0.03458407343924495, "grad_norm": 0.3095309533598023, "learning_rate": 9.970772630771666e-05, "loss": 2.0719, "step": 4447 }, { "epoch": 0.034591850384025534, "grad_norm": 0.1918505872692569, "learning_rate": 9.97075943802435e-05, "loss": 2.1599, "step": 4448 }, { "epoch": 0.034599627328806115, "grad_norm": 0.16510565151726708, "learning_rate": 9.970746242308944e-05, "loss": 2.0398, "step": 4449 }, { "epoch": 0.034607404273586696, "grad_norm": 0.12775577198521937, "learning_rate": 9.970733043625457e-05, "loss": 2.0992, "step": 4450 }, { "epoch": 0.03461518121836728, "grad_norm": 0.17545872633925674, "learning_rate": 9.970719841973896e-05, "loss": 2.0964, "step": 4451 }, { "epoch": 0.03462295816314786, "grad_norm": 0.14478209437647813, "learning_rate": 9.970706637354267e-05, "loss": 2.079, "step": 4452 }, { "epoch": 0.03463073510792844, "grad_norm": 0.1210676858443268, "learning_rate": 9.970693429766584e-05, "loss": 2.0113, "step": 4453 }, { "epoch": 0.03463851205270902, "grad_norm": 0.14391951851405732, "learning_rate": 9.970680219210849e-05, "loss": 2.0706, "step": 4454 }, { "epoch": 0.0346462889974896, "grad_norm": 0.13389761367971048, "learning_rate": 9.970667005687072e-05, "loss": 2.0876, "step": 4455 }, { "epoch": 0.03465406594227018, "grad_norm": 0.12474172408717099, "learning_rate": 9.970653789195261e-05, "loss": 2.0937, "step": 4456 }, { "epoch": 0.034661842887050764, "grad_norm": 0.1356263349118532, "learning_rate": 9.970640569735423e-05, "loss": 2.114, "step": 4457 }, { "epoch": 0.034669619831831346, "grad_norm": 0.14260120881033672, "learning_rate": 9.970627347307568e-05, "loss": 2.0985, "step": 4458 }, { "epoch": 0.03467739677661193, "grad_norm": 0.1205426570979729, "learning_rate": 9.9706141219117e-05, "loss": 2.1098, "step": 4459 }, { "epoch": 0.03468517372139251, "grad_norm": 0.14715589702775528, "learning_rate": 9.970600893547832e-05, "loss": 2.0569, "step": 4460 }, { "epoch": 0.03469295066617309, "grad_norm": 0.17484742403937042, "learning_rate": 9.970587662215969e-05, "loss": 2.127, "step": 4461 }, { "epoch": 0.03470072761095367, "grad_norm": 0.17353538311409417, "learning_rate": 9.970574427916118e-05, "loss": 2.0456, "step": 4462 }, { "epoch": 0.03470850455573425, "grad_norm": 0.12450871978065724, "learning_rate": 9.970561190648288e-05, "loss": 2.1157, "step": 4463 }, { "epoch": 0.03471628150051483, "grad_norm": 0.14533451008665121, "learning_rate": 9.970547950412488e-05, "loss": 2.1281, "step": 4464 }, { "epoch": 0.034724058445295414, "grad_norm": 0.1320824455620936, "learning_rate": 9.970534707208722e-05, "loss": 2.1251, "step": 4465 }, { "epoch": 0.034731835390075995, "grad_norm": 0.11667831745372695, "learning_rate": 9.970521461037004e-05, "loss": 2.0874, "step": 4466 }, { "epoch": 0.034739612334856576, "grad_norm": 0.1488403097361633, "learning_rate": 9.970508211897337e-05, "loss": 2.1315, "step": 4467 }, { "epoch": 0.03474738927963716, "grad_norm": 0.15684198893640933, "learning_rate": 9.970494959789731e-05, "loss": 2.0832, "step": 4468 }, { "epoch": 0.03475516622441774, "grad_norm": 0.11861975814449043, "learning_rate": 9.970481704714195e-05, "loss": 2.0529, "step": 4469 }, { "epoch": 0.03476294316919832, "grad_norm": 0.12546633906332075, "learning_rate": 9.970468446670733e-05, "loss": 2.1065, "step": 4470 }, { "epoch": 0.0347707201139789, "grad_norm": 0.12469869369963076, "learning_rate": 9.970455185659355e-05, "loss": 2.1252, "step": 4471 }, { "epoch": 0.03477849705875948, "grad_norm": 0.338320113471074, "learning_rate": 9.970441921680071e-05, "loss": 2.0499, "step": 4472 }, { "epoch": 0.03478627400354006, "grad_norm": 0.17373572259666256, "learning_rate": 9.970428654732887e-05, "loss": 2.0531, "step": 4473 }, { "epoch": 0.034794050948320644, "grad_norm": 0.12575692106169742, "learning_rate": 9.97041538481781e-05, "loss": 2.1036, "step": 4474 }, { "epoch": 0.034801827893101225, "grad_norm": 0.12910025402768335, "learning_rate": 9.970402111934852e-05, "loss": 2.1342, "step": 4475 }, { "epoch": 0.03480960483788181, "grad_norm": 0.12116557692692494, "learning_rate": 9.970388836084015e-05, "loss": 2.1283, "step": 4476 }, { "epoch": 0.03481738178266239, "grad_norm": 0.1234845519022928, "learning_rate": 9.97037555726531e-05, "loss": 2.0482, "step": 4477 }, { "epoch": 0.03482515872744297, "grad_norm": 0.12857049107438878, "learning_rate": 9.970362275478745e-05, "loss": 2.0767, "step": 4478 }, { "epoch": 0.03483293567222355, "grad_norm": 0.1227082417300715, "learning_rate": 9.970348990724328e-05, "loss": 2.058, "step": 4479 }, { "epoch": 0.03484071261700413, "grad_norm": 0.1201747839829788, "learning_rate": 9.970335703002068e-05, "loss": 2.0758, "step": 4480 }, { "epoch": 0.03484848956178471, "grad_norm": 0.12089605799941544, "learning_rate": 9.970322412311968e-05, "loss": 2.0795, "step": 4481 }, { "epoch": 0.034856266506565294, "grad_norm": 0.14160387718818637, "learning_rate": 9.970309118654042e-05, "loss": 2.0989, "step": 4482 }, { "epoch": 0.034864043451345875, "grad_norm": 0.24069149791086736, "learning_rate": 9.970295822028296e-05, "loss": 2.0774, "step": 4483 }, { "epoch": 0.03487182039612646, "grad_norm": 0.13990201624734014, "learning_rate": 9.970282522434737e-05, "loss": 2.0741, "step": 4484 }, { "epoch": 0.034879597340907044, "grad_norm": 0.18024246266766525, "learning_rate": 9.970269219873373e-05, "loss": 2.0505, "step": 4485 }, { "epoch": 0.034887374285687625, "grad_norm": 0.14929919005845196, "learning_rate": 9.97025591434421e-05, "loss": 2.0608, "step": 4486 }, { "epoch": 0.034895151230468206, "grad_norm": 0.1157192793674088, "learning_rate": 9.970242605847262e-05, "loss": 2.0715, "step": 4487 }, { "epoch": 0.03490292817524879, "grad_norm": 0.16176674098029564, "learning_rate": 9.970229294382531e-05, "loss": 2.091, "step": 4488 }, { "epoch": 0.03491070512002937, "grad_norm": 0.15007579651831432, "learning_rate": 9.970215979950028e-05, "loss": 2.0781, "step": 4489 }, { "epoch": 0.03491848206480995, "grad_norm": 0.11874635140169455, "learning_rate": 9.97020266254976e-05, "loss": 2.068, "step": 4490 }, { "epoch": 0.03492625900959053, "grad_norm": 0.14447176232219894, "learning_rate": 9.970189342181735e-05, "loss": 2.0964, "step": 4491 }, { "epoch": 0.03493403595437111, "grad_norm": 0.4346998631481485, "learning_rate": 9.97017601884596e-05, "loss": 2.0673, "step": 4492 }, { "epoch": 0.03494181289915169, "grad_norm": 0.12121011833832554, "learning_rate": 9.970162692542446e-05, "loss": 2.1612, "step": 4493 }, { "epoch": 0.034949589843932274, "grad_norm": 0.1486574276561758, "learning_rate": 9.970149363271198e-05, "loss": 2.0759, "step": 4494 }, { "epoch": 0.034957366788712856, "grad_norm": 0.14368504856456604, "learning_rate": 9.970136031032224e-05, "loss": 2.1411, "step": 4495 }, { "epoch": 0.03496514373349344, "grad_norm": 0.11942980073015583, "learning_rate": 9.970122695825533e-05, "loss": 2.1136, "step": 4496 }, { "epoch": 0.03497292067827402, "grad_norm": 0.13755858986449523, "learning_rate": 9.970109357651134e-05, "loss": 2.0868, "step": 4497 }, { "epoch": 0.0349806976230546, "grad_norm": 0.1588682612741881, "learning_rate": 9.970096016509032e-05, "loss": 2.0997, "step": 4498 }, { "epoch": 0.03498847456783518, "grad_norm": 0.1264835935781908, "learning_rate": 9.970082672399236e-05, "loss": 2.0784, "step": 4499 }, { "epoch": 0.03499625151261576, "grad_norm": 0.12329847307738322, "learning_rate": 9.970069325321758e-05, "loss": 2.0915, "step": 4500 }, { "epoch": 0.03500402845739634, "grad_norm": 0.12000852841198263, "learning_rate": 9.9700559752766e-05, "loss": 2.1559, "step": 4501 }, { "epoch": 0.035011805402176924, "grad_norm": 0.11871754066579812, "learning_rate": 9.970042622263773e-05, "loss": 2.0417, "step": 4502 }, { "epoch": 0.035019582346957505, "grad_norm": 0.11846279125054472, "learning_rate": 9.970029266283286e-05, "loss": 2.1356, "step": 4503 }, { "epoch": 0.035027359291738086, "grad_norm": 0.14282141554394198, "learning_rate": 9.970015907335144e-05, "loss": 2.1079, "step": 4504 }, { "epoch": 0.03503513623651867, "grad_norm": 0.11880684541771902, "learning_rate": 9.970002545419357e-05, "loss": 2.0717, "step": 4505 }, { "epoch": 0.03504291318129925, "grad_norm": 0.1205449550471388, "learning_rate": 9.969989180535933e-05, "loss": 2.1105, "step": 4506 }, { "epoch": 0.03505069012607983, "grad_norm": 0.11650147593745964, "learning_rate": 9.96997581268488e-05, "loss": 2.124, "step": 4507 }, { "epoch": 0.03505846707086041, "grad_norm": 0.12856186701677028, "learning_rate": 9.969962441866204e-05, "loss": 2.0937, "step": 4508 }, { "epoch": 0.03506624401564099, "grad_norm": 0.11795186681987375, "learning_rate": 9.969949068079916e-05, "loss": 2.0766, "step": 4509 }, { "epoch": 0.03507402096042157, "grad_norm": 0.11929859753178237, "learning_rate": 9.96993569132602e-05, "loss": 2.1291, "step": 4510 }, { "epoch": 0.035081797905202154, "grad_norm": 0.12677275604924368, "learning_rate": 9.969922311604529e-05, "loss": 2.0851, "step": 4511 }, { "epoch": 0.035089574849982735, "grad_norm": 0.1389638612372092, "learning_rate": 9.969908928915446e-05, "loss": 2.1267, "step": 4512 }, { "epoch": 0.035097351794763317, "grad_norm": 0.12762125258347892, "learning_rate": 9.969895543258784e-05, "loss": 2.0552, "step": 4513 }, { "epoch": 0.0351051287395439, "grad_norm": 0.14275421999654084, "learning_rate": 9.969882154634547e-05, "loss": 2.1503, "step": 4514 }, { "epoch": 0.03511290568432448, "grad_norm": 0.1273578002376959, "learning_rate": 9.969868763042744e-05, "loss": 2.1193, "step": 4515 }, { "epoch": 0.03512068262910506, "grad_norm": 0.11772663697995937, "learning_rate": 9.969855368483386e-05, "loss": 2.1086, "step": 4516 }, { "epoch": 0.03512845957388564, "grad_norm": 0.13067479542278482, "learning_rate": 9.969841970956474e-05, "loss": 2.1347, "step": 4517 }, { "epoch": 0.03513623651866622, "grad_norm": 0.1189028244921046, "learning_rate": 9.969828570462023e-05, "loss": 2.0638, "step": 4518 }, { "epoch": 0.035144013463446803, "grad_norm": 0.1336499257084487, "learning_rate": 9.969815167000038e-05, "loss": 2.1256, "step": 4519 }, { "epoch": 0.035151790408227385, "grad_norm": 0.1320251492129722, "learning_rate": 9.969801760570528e-05, "loss": 2.0963, "step": 4520 }, { "epoch": 0.035159567353007966, "grad_norm": 0.11602105705071694, "learning_rate": 9.9697883511735e-05, "loss": 2.0957, "step": 4521 }, { "epoch": 0.03516734429778855, "grad_norm": 0.1278809798102652, "learning_rate": 9.969774938808962e-05, "loss": 2.1008, "step": 4522 }, { "epoch": 0.03517512124256913, "grad_norm": 0.12002199297394692, "learning_rate": 9.969761523476924e-05, "loss": 2.0462, "step": 4523 }, { "epoch": 0.03518289818734971, "grad_norm": 0.1152081496593583, "learning_rate": 9.96974810517739e-05, "loss": 2.0235, "step": 4524 }, { "epoch": 0.03519067513213029, "grad_norm": 0.12642995234742369, "learning_rate": 9.969734683910372e-05, "loss": 2.1253, "step": 4525 }, { "epoch": 0.03519845207691087, "grad_norm": 0.1591256361494469, "learning_rate": 9.969721259675876e-05, "loss": 2.1379, "step": 4526 }, { "epoch": 0.03520622902169145, "grad_norm": 0.15953632907588677, "learning_rate": 9.969707832473911e-05, "loss": 2.1127, "step": 4527 }, { "epoch": 0.035214005966472034, "grad_norm": 0.42267066512323875, "learning_rate": 9.969694402304484e-05, "loss": 2.0711, "step": 4528 }, { "epoch": 0.035221782911252615, "grad_norm": 0.11411217070992713, "learning_rate": 9.969680969167604e-05, "loss": 2.0533, "step": 4529 }, { "epoch": 0.035229559856033196, "grad_norm": 0.14627831545049963, "learning_rate": 9.969667533063278e-05, "loss": 2.0406, "step": 4530 }, { "epoch": 0.03523733680081378, "grad_norm": 0.17611748791167567, "learning_rate": 9.969654093991514e-05, "loss": 2.1202, "step": 4531 }, { "epoch": 0.03524511374559436, "grad_norm": 0.14444601124945255, "learning_rate": 9.969640651952322e-05, "loss": 2.098, "step": 4532 }, { "epoch": 0.03525289069037494, "grad_norm": 0.12203557011992089, "learning_rate": 9.969627206945708e-05, "loss": 2.0782, "step": 4533 }, { "epoch": 0.03526066763515552, "grad_norm": 0.12282678904550036, "learning_rate": 9.96961375897168e-05, "loss": 2.0895, "step": 4534 }, { "epoch": 0.0352684445799361, "grad_norm": 0.1198522939040006, "learning_rate": 9.969600308030247e-05, "loss": 2.023, "step": 4535 }, { "epoch": 0.03527622152471668, "grad_norm": 0.13619548531307796, "learning_rate": 9.969586854121418e-05, "loss": 2.1198, "step": 4536 }, { "epoch": 0.035283998469497264, "grad_norm": 0.12418250283397929, "learning_rate": 9.969573397245198e-05, "loss": 2.1133, "step": 4537 }, { "epoch": 0.035291775414277846, "grad_norm": 0.13131217194197223, "learning_rate": 9.969559937401598e-05, "loss": 2.1632, "step": 4538 }, { "epoch": 0.03529955235905843, "grad_norm": 0.14206050984271062, "learning_rate": 9.969546474590623e-05, "loss": 2.1046, "step": 4539 }, { "epoch": 0.03530732930383901, "grad_norm": 0.1419505740146942, "learning_rate": 9.969533008812284e-05, "loss": 2.0467, "step": 4540 }, { "epoch": 0.03531510624861959, "grad_norm": 0.12280480938366294, "learning_rate": 9.969519540066589e-05, "loss": 2.0742, "step": 4541 }, { "epoch": 0.03532288319340017, "grad_norm": 0.11547452439684786, "learning_rate": 9.969506068353543e-05, "loss": 2.0042, "step": 4542 }, { "epoch": 0.03533066013818075, "grad_norm": 0.11636079968623728, "learning_rate": 9.969492593673157e-05, "loss": 2.0914, "step": 4543 }, { "epoch": 0.03533843708296134, "grad_norm": 0.11235566065424735, "learning_rate": 9.969479116025437e-05, "loss": 2.0621, "step": 4544 }, { "epoch": 0.03534621402774192, "grad_norm": 0.12056921382280046, "learning_rate": 9.969465635410393e-05, "loss": 2.0846, "step": 4545 }, { "epoch": 0.0353539909725225, "grad_norm": 0.12940530909205283, "learning_rate": 9.969452151828033e-05, "loss": 2.0975, "step": 4546 }, { "epoch": 0.03536176791730308, "grad_norm": 0.13484787369151682, "learning_rate": 9.969438665278363e-05, "loss": 2.0877, "step": 4547 }, { "epoch": 0.035369544862083664, "grad_norm": 0.12103784162697333, "learning_rate": 9.969425175761393e-05, "loss": 2.0371, "step": 4548 }, { "epoch": 0.035377321806864245, "grad_norm": 0.12733035690052988, "learning_rate": 9.969411683277128e-05, "loss": 2.1114, "step": 4549 }, { "epoch": 0.035385098751644826, "grad_norm": 0.12082146906485333, "learning_rate": 9.969398187825581e-05, "loss": 2.0729, "step": 4550 }, { "epoch": 0.03539287569642541, "grad_norm": 0.1168675660563536, "learning_rate": 9.969384689406755e-05, "loss": 2.0464, "step": 4551 }, { "epoch": 0.03540065264120599, "grad_norm": 0.11556117063076267, "learning_rate": 9.969371188020662e-05, "loss": 2.1154, "step": 4552 }, { "epoch": 0.03540842958598657, "grad_norm": 0.12165976662207793, "learning_rate": 9.969357683667308e-05, "loss": 2.175, "step": 4553 }, { "epoch": 0.03541620653076715, "grad_norm": 0.1185252048708846, "learning_rate": 9.969344176346702e-05, "loss": 2.0845, "step": 4554 }, { "epoch": 0.03542398347554773, "grad_norm": 0.1281019529830232, "learning_rate": 9.969330666058852e-05, "loss": 2.097, "step": 4555 }, { "epoch": 0.03543176042032831, "grad_norm": 0.13981443353295525, "learning_rate": 9.969317152803766e-05, "loss": 2.1252, "step": 4556 }, { "epoch": 0.035439537365108895, "grad_norm": 0.1404228559935453, "learning_rate": 9.96930363658145e-05, "loss": 2.1047, "step": 4557 }, { "epoch": 0.035447314309889476, "grad_norm": 0.11668795449518465, "learning_rate": 9.969290117391915e-05, "loss": 2.1132, "step": 4558 }, { "epoch": 0.03545509125467006, "grad_norm": 0.1409550160585045, "learning_rate": 9.969276595235167e-05, "loss": 2.0477, "step": 4559 }, { "epoch": 0.03546286819945064, "grad_norm": 0.1907292582581298, "learning_rate": 9.969263070111217e-05, "loss": 2.0354, "step": 4560 }, { "epoch": 0.03547064514423122, "grad_norm": 0.18099035689918935, "learning_rate": 9.969249542020069e-05, "loss": 2.1179, "step": 4561 }, { "epoch": 0.0354784220890118, "grad_norm": 0.11725392708170372, "learning_rate": 9.969236010961733e-05, "loss": 2.0281, "step": 4562 }, { "epoch": 0.03548619903379238, "grad_norm": 0.16155307004667455, "learning_rate": 9.969222476936219e-05, "loss": 2.0862, "step": 4563 }, { "epoch": 0.03549397597857296, "grad_norm": 0.19179984494046415, "learning_rate": 9.969208939943532e-05, "loss": 2.0888, "step": 4564 }, { "epoch": 0.035501752923353544, "grad_norm": 0.14085967044150818, "learning_rate": 9.969195399983682e-05, "loss": 2.129, "step": 4565 }, { "epoch": 0.035509529868134125, "grad_norm": 0.12361851359890288, "learning_rate": 9.969181857056675e-05, "loss": 2.1012, "step": 4566 }, { "epoch": 0.035517306812914706, "grad_norm": 0.1517115979140295, "learning_rate": 9.969168311162523e-05, "loss": 2.0956, "step": 4567 }, { "epoch": 0.03552508375769529, "grad_norm": 0.1271397562625075, "learning_rate": 9.969154762301229e-05, "loss": 2.1119, "step": 4568 }, { "epoch": 0.03553286070247587, "grad_norm": 0.12498824934530227, "learning_rate": 9.969141210472806e-05, "loss": 2.1333, "step": 4569 }, { "epoch": 0.03554063764725645, "grad_norm": 0.13749837458781747, "learning_rate": 9.969127655677259e-05, "loss": 2.0988, "step": 4570 }, { "epoch": 0.03554841459203703, "grad_norm": 0.12465368967787702, "learning_rate": 9.969114097914596e-05, "loss": 2.1088, "step": 4571 }, { "epoch": 0.03555619153681761, "grad_norm": 0.11802199537266392, "learning_rate": 9.969100537184828e-05, "loss": 2.0544, "step": 4572 }, { "epoch": 0.03556396848159819, "grad_norm": 0.12067016046865234, "learning_rate": 9.96908697348796e-05, "loss": 2.0426, "step": 4573 }, { "epoch": 0.035571745426378774, "grad_norm": 0.11935258347026284, "learning_rate": 9.969073406824001e-05, "loss": 2.0958, "step": 4574 }, { "epoch": 0.035579522371159356, "grad_norm": 0.12219665279800755, "learning_rate": 9.96905983719296e-05, "loss": 2.1136, "step": 4575 }, { "epoch": 0.03558729931593994, "grad_norm": 0.2599013867423358, "learning_rate": 9.969046264594844e-05, "loss": 2.118, "step": 4576 }, { "epoch": 0.03559507626072052, "grad_norm": 0.12760405248273468, "learning_rate": 9.969032689029661e-05, "loss": 2.0765, "step": 4577 }, { "epoch": 0.0356028532055011, "grad_norm": 0.11693958728369015, "learning_rate": 9.96901911049742e-05, "loss": 2.0907, "step": 4578 }, { "epoch": 0.03561063015028168, "grad_norm": 0.1442942080849105, "learning_rate": 9.96900552899813e-05, "loss": 2.112, "step": 4579 }, { "epoch": 0.03561840709506226, "grad_norm": 0.1329736522507025, "learning_rate": 9.968991944531796e-05, "loss": 2.0477, "step": 4580 }, { "epoch": 0.03562618403984284, "grad_norm": 0.11418609224746042, "learning_rate": 9.968978357098429e-05, "loss": 2.1145, "step": 4581 }, { "epoch": 0.035633960984623424, "grad_norm": 0.1320987667673579, "learning_rate": 9.968964766698035e-05, "loss": 2.0802, "step": 4582 }, { "epoch": 0.035641737929404005, "grad_norm": 0.19091054329973245, "learning_rate": 9.968951173330624e-05, "loss": 2.0531, "step": 4583 }, { "epoch": 0.035649514874184586, "grad_norm": 0.1150131414833324, "learning_rate": 9.968937576996204e-05, "loss": 2.0973, "step": 4584 }, { "epoch": 0.03565729181896517, "grad_norm": 0.12084321310863594, "learning_rate": 9.968923977694782e-05, "loss": 2.0899, "step": 4585 }, { "epoch": 0.03566506876374575, "grad_norm": 0.12815025593016371, "learning_rate": 9.968910375426366e-05, "loss": 2.1423, "step": 4586 }, { "epoch": 0.03567284570852633, "grad_norm": 0.14181309146420507, "learning_rate": 9.968896770190964e-05, "loss": 2.0902, "step": 4587 }, { "epoch": 0.03568062265330691, "grad_norm": 0.11920710793695977, "learning_rate": 9.968883161988584e-05, "loss": 2.1024, "step": 4588 }, { "epoch": 0.03568839959808749, "grad_norm": 0.15514287271919616, "learning_rate": 9.968869550819238e-05, "loss": 2.0825, "step": 4589 }, { "epoch": 0.03569617654286807, "grad_norm": 0.1909365960381656, "learning_rate": 9.968855936682929e-05, "loss": 2.0858, "step": 4590 }, { "epoch": 0.035703953487648654, "grad_norm": 0.14173386611406266, "learning_rate": 9.968842319579668e-05, "loss": 2.0756, "step": 4591 }, { "epoch": 0.035711730432429235, "grad_norm": 0.11733037978948802, "learning_rate": 9.968828699509463e-05, "loss": 2.1563, "step": 4592 }, { "epoch": 0.035719507377209817, "grad_norm": 0.14907413418355597, "learning_rate": 9.96881507647232e-05, "loss": 2.0773, "step": 4593 }, { "epoch": 0.0357272843219904, "grad_norm": 0.11881909623018497, "learning_rate": 9.968801450468249e-05, "loss": 2.1133, "step": 4594 }, { "epoch": 0.03573506126677098, "grad_norm": 0.13365509666855824, "learning_rate": 9.968787821497257e-05, "loss": 2.0553, "step": 4595 }, { "epoch": 0.03574283821155156, "grad_norm": 0.13403419763538799, "learning_rate": 9.968774189559354e-05, "loss": 2.0473, "step": 4596 }, { "epoch": 0.03575061515633214, "grad_norm": 0.12095865222781882, "learning_rate": 9.968760554654547e-05, "loss": 2.1013, "step": 4597 }, { "epoch": 0.03575839210111272, "grad_norm": 0.1465740746202237, "learning_rate": 9.968746916782843e-05, "loss": 2.1278, "step": 4598 }, { "epoch": 0.035766169045893303, "grad_norm": 0.1588504542616426, "learning_rate": 9.968733275944251e-05, "loss": 2.0532, "step": 4599 }, { "epoch": 0.035773945990673885, "grad_norm": 0.11792884239869998, "learning_rate": 9.968719632138781e-05, "loss": 2.0876, "step": 4600 }, { "epoch": 0.035781722935454466, "grad_norm": 0.25703260587569166, "learning_rate": 9.968705985366439e-05, "loss": 2.0662, "step": 4601 }, { "epoch": 0.03578949988023505, "grad_norm": 0.14799402313409904, "learning_rate": 9.968692335627235e-05, "loss": 2.1563, "step": 4602 }, { "epoch": 0.035797276825015635, "grad_norm": 0.11306035028116732, "learning_rate": 9.968678682921174e-05, "loss": 2.0883, "step": 4603 }, { "epoch": 0.035805053769796216, "grad_norm": 0.12480837693972098, "learning_rate": 9.968665027248265e-05, "loss": 2.0398, "step": 4604 }, { "epoch": 0.0358128307145768, "grad_norm": 0.11601550568995568, "learning_rate": 9.96865136860852e-05, "loss": 2.0481, "step": 4605 }, { "epoch": 0.03582060765935738, "grad_norm": 0.11782483317518584, "learning_rate": 9.968637707001942e-05, "loss": 2.145, "step": 4606 }, { "epoch": 0.03582838460413796, "grad_norm": 0.17698305175741208, "learning_rate": 9.968624042428543e-05, "loss": 2.0738, "step": 4607 }, { "epoch": 0.03583616154891854, "grad_norm": 0.11626758990972584, "learning_rate": 9.968610374888327e-05, "loss": 2.1431, "step": 4608 }, { "epoch": 0.03584393849369912, "grad_norm": 0.12096265849703662, "learning_rate": 9.968596704381307e-05, "loss": 2.082, "step": 4609 }, { "epoch": 0.0358517154384797, "grad_norm": 0.12690375892460587, "learning_rate": 9.96858303090749e-05, "loss": 2.0227, "step": 4610 }, { "epoch": 0.035859492383260284, "grad_norm": 0.13025574647957242, "learning_rate": 9.968569354466882e-05, "loss": 2.1097, "step": 4611 }, { "epoch": 0.035867269328040866, "grad_norm": 0.14192174026728793, "learning_rate": 9.968555675059492e-05, "loss": 2.0942, "step": 4612 }, { "epoch": 0.03587504627282145, "grad_norm": 0.15029433758592856, "learning_rate": 9.968541992685329e-05, "loss": 2.1386, "step": 4613 }, { "epoch": 0.03588282321760203, "grad_norm": 0.17586474692779447, "learning_rate": 9.9685283073444e-05, "loss": 2.0881, "step": 4614 }, { "epoch": 0.03589060016238261, "grad_norm": 0.21005953117019058, "learning_rate": 9.968514619036712e-05, "loss": 2.0752, "step": 4615 }, { "epoch": 0.03589837710716319, "grad_norm": 0.31524251218841776, "learning_rate": 9.968500927762278e-05, "loss": 2.0273, "step": 4616 }, { "epoch": 0.03590615405194377, "grad_norm": 0.13694689809584829, "learning_rate": 9.968487233521103e-05, "loss": 2.1357, "step": 4617 }, { "epoch": 0.03591393099672435, "grad_norm": 0.13408855481532705, "learning_rate": 9.968473536313195e-05, "loss": 2.0794, "step": 4618 }, { "epoch": 0.035921707941504934, "grad_norm": 0.120196545189667, "learning_rate": 9.968459836138562e-05, "loss": 2.1748, "step": 4619 }, { "epoch": 0.035929484886285515, "grad_norm": 0.5299690448332619, "learning_rate": 9.968446132997211e-05, "loss": 2.1283, "step": 4620 }, { "epoch": 0.035937261831066096, "grad_norm": 0.15403907360937868, "learning_rate": 9.968432426889154e-05, "loss": 2.0927, "step": 4621 }, { "epoch": 0.03594503877584668, "grad_norm": 0.12789220451210687, "learning_rate": 9.968418717814396e-05, "loss": 2.0828, "step": 4622 }, { "epoch": 0.03595281572062726, "grad_norm": 0.19089486895297325, "learning_rate": 9.968405005772948e-05, "loss": 2.1115, "step": 4623 }, { "epoch": 0.03596059266540784, "grad_norm": 0.158657347562153, "learning_rate": 9.968391290764815e-05, "loss": 2.0852, "step": 4624 }, { "epoch": 0.03596836961018842, "grad_norm": 0.13272336653022163, "learning_rate": 9.968377572790006e-05, "loss": 2.1425, "step": 4625 }, { "epoch": 0.035976146554969, "grad_norm": 0.15857550758763267, "learning_rate": 9.96836385184853e-05, "loss": 2.0904, "step": 4626 }, { "epoch": 0.03598392349974958, "grad_norm": 0.14688589102781668, "learning_rate": 9.968350127940397e-05, "loss": 2.1063, "step": 4627 }, { "epoch": 0.035991700444530164, "grad_norm": 0.13387525525910787, "learning_rate": 9.968336401065612e-05, "loss": 2.0523, "step": 4628 }, { "epoch": 0.035999477389310745, "grad_norm": 0.12594544638528074, "learning_rate": 9.968322671224184e-05, "loss": 2.0804, "step": 4629 }, { "epoch": 0.036007254334091326, "grad_norm": 0.13908954916971913, "learning_rate": 9.968308938416122e-05, "loss": 2.0833, "step": 4630 }, { "epoch": 0.03601503127887191, "grad_norm": 0.16217537473625618, "learning_rate": 9.968295202641434e-05, "loss": 2.0826, "step": 4631 }, { "epoch": 0.03602280822365249, "grad_norm": 0.11903202104090617, "learning_rate": 9.968281463900126e-05, "loss": 2.0799, "step": 4632 }, { "epoch": 0.03603058516843307, "grad_norm": 0.1265116076706323, "learning_rate": 9.968267722192211e-05, "loss": 2.0895, "step": 4633 }, { "epoch": 0.03603836211321365, "grad_norm": 0.12929933660923, "learning_rate": 9.968253977517692e-05, "loss": 2.1015, "step": 4634 }, { "epoch": 0.03604613905799423, "grad_norm": 0.12338171534156944, "learning_rate": 9.968240229876582e-05, "loss": 2.1373, "step": 4635 }, { "epoch": 0.03605391600277481, "grad_norm": 0.12226847348877014, "learning_rate": 9.968226479268885e-05, "loss": 2.0973, "step": 4636 }, { "epoch": 0.036061692947555395, "grad_norm": 0.14212351321413208, "learning_rate": 9.968212725694612e-05, "loss": 2.0542, "step": 4637 }, { "epoch": 0.036069469892335976, "grad_norm": 0.1266850560795752, "learning_rate": 9.968198969153769e-05, "loss": 2.135, "step": 4638 }, { "epoch": 0.03607724683711656, "grad_norm": 0.13063446065361561, "learning_rate": 9.968185209646367e-05, "loss": 2.1052, "step": 4639 }, { "epoch": 0.03608502378189714, "grad_norm": 0.14567720242974697, "learning_rate": 9.968171447172412e-05, "loss": 2.0729, "step": 4640 }, { "epoch": 0.03609280072667772, "grad_norm": 0.14532994201238705, "learning_rate": 9.968157681731913e-05, "loss": 2.1689, "step": 4641 }, { "epoch": 0.0361005776714583, "grad_norm": 0.11788980559440364, "learning_rate": 9.968143913324876e-05, "loss": 2.0355, "step": 4642 }, { "epoch": 0.03610835461623888, "grad_norm": 0.16075980359499784, "learning_rate": 9.968130141951315e-05, "loss": 2.0701, "step": 4643 }, { "epoch": 0.03611613156101946, "grad_norm": 0.18064800523949248, "learning_rate": 9.968116367611232e-05, "loss": 2.0898, "step": 4644 }, { "epoch": 0.036123908505800044, "grad_norm": 0.128611022464076, "learning_rate": 9.968102590304639e-05, "loss": 2.1109, "step": 4645 }, { "epoch": 0.036131685450580625, "grad_norm": 0.17044724074201106, "learning_rate": 9.968088810031542e-05, "loss": 2.1242, "step": 4646 }, { "epoch": 0.036139462395361206, "grad_norm": 0.19582272473021572, "learning_rate": 9.968075026791951e-05, "loss": 2.1138, "step": 4647 }, { "epoch": 0.03614723934014179, "grad_norm": 0.13618404448262075, "learning_rate": 9.968061240585873e-05, "loss": 2.1051, "step": 4648 }, { "epoch": 0.03615501628492237, "grad_norm": 0.1474978047320206, "learning_rate": 9.968047451413317e-05, "loss": 2.0896, "step": 4649 }, { "epoch": 0.03616279322970295, "grad_norm": 0.19097860591045646, "learning_rate": 9.96803365927429e-05, "loss": 2.0359, "step": 4650 }, { "epoch": 0.03617057017448353, "grad_norm": 0.14555572027830616, "learning_rate": 9.968019864168803e-05, "loss": 2.0732, "step": 4651 }, { "epoch": 0.03617834711926411, "grad_norm": 0.12315390608534092, "learning_rate": 9.96800606609686e-05, "loss": 2.0577, "step": 4652 }, { "epoch": 0.03618612406404469, "grad_norm": 0.1631094976725269, "learning_rate": 9.967992265058475e-05, "loss": 2.1312, "step": 4653 }, { "epoch": 0.036193901008825274, "grad_norm": 0.13039921513361014, "learning_rate": 9.967978461053651e-05, "loss": 2.0668, "step": 4654 }, { "epoch": 0.036201677953605856, "grad_norm": 0.13862364471417, "learning_rate": 9.967964654082396e-05, "loss": 2.1028, "step": 4655 }, { "epoch": 0.03620945489838644, "grad_norm": 0.16840193395782815, "learning_rate": 9.967950844144723e-05, "loss": 2.0636, "step": 4656 }, { "epoch": 0.03621723184316702, "grad_norm": 0.12491144824504745, "learning_rate": 9.967937031240637e-05, "loss": 2.0809, "step": 4657 }, { "epoch": 0.0362250087879476, "grad_norm": 0.14661278224466204, "learning_rate": 9.967923215370147e-05, "loss": 2.0779, "step": 4658 }, { "epoch": 0.03623278573272818, "grad_norm": 0.17408811702895308, "learning_rate": 9.967909396533262e-05, "loss": 2.0348, "step": 4659 }, { "epoch": 0.03624056267750876, "grad_norm": 0.17387002953605832, "learning_rate": 9.967895574729987e-05, "loss": 2.0308, "step": 4660 }, { "epoch": 0.03624833962228934, "grad_norm": 0.1617431195114592, "learning_rate": 9.967881749960333e-05, "loss": 2.0778, "step": 4661 }, { "epoch": 0.03625611656706993, "grad_norm": 0.20564604361839314, "learning_rate": 9.96786792222431e-05, "loss": 2.0516, "step": 4662 }, { "epoch": 0.03626389351185051, "grad_norm": 0.12276195648751018, "learning_rate": 9.967854091521922e-05, "loss": 2.1193, "step": 4663 }, { "epoch": 0.03627167045663109, "grad_norm": 0.22593572707097642, "learning_rate": 9.967840257853181e-05, "loss": 2.1147, "step": 4664 }, { "epoch": 0.036279447401411674, "grad_norm": 0.2953943316683342, "learning_rate": 9.967826421218093e-05, "loss": 2.0943, "step": 4665 }, { "epoch": 0.036287224346192255, "grad_norm": 0.1399614098287719, "learning_rate": 9.967812581616667e-05, "loss": 2.0834, "step": 4666 }, { "epoch": 0.036295001290972836, "grad_norm": 0.22986004548006386, "learning_rate": 9.96779873904891e-05, "loss": 2.1083, "step": 4667 }, { "epoch": 0.03630277823575342, "grad_norm": 0.23976012813361994, "learning_rate": 9.967784893514834e-05, "loss": 2.0479, "step": 4668 }, { "epoch": 0.036310555180534, "grad_norm": 0.12082376681621558, "learning_rate": 9.967771045014442e-05, "loss": 2.0673, "step": 4669 }, { "epoch": 0.03631833212531458, "grad_norm": 0.2511320054377802, "learning_rate": 9.967757193547747e-05, "loss": 2.0897, "step": 4670 }, { "epoch": 0.03632610907009516, "grad_norm": 0.20860654700431872, "learning_rate": 9.967743339114753e-05, "loss": 2.0825, "step": 4671 }, { "epoch": 0.03633388601487574, "grad_norm": 0.12294539067160258, "learning_rate": 9.967729481715473e-05, "loss": 2.0563, "step": 4672 }, { "epoch": 0.03634166295965632, "grad_norm": 0.22662650266095224, "learning_rate": 9.96771562134991e-05, "loss": 2.0568, "step": 4673 }, { "epoch": 0.036349439904436905, "grad_norm": 0.1783411313656249, "learning_rate": 9.967701758018077e-05, "loss": 2.0445, "step": 4674 }, { "epoch": 0.036357216849217486, "grad_norm": 0.12103983127323327, "learning_rate": 9.96768789171998e-05, "loss": 2.1007, "step": 4675 }, { "epoch": 0.03636499379399807, "grad_norm": 0.1708583721142642, "learning_rate": 9.967674022455628e-05, "loss": 2.097, "step": 4676 }, { "epoch": 0.03637277073877865, "grad_norm": 0.1748544012865692, "learning_rate": 9.967660150225028e-05, "loss": 2.0438, "step": 4677 }, { "epoch": 0.03638054768355923, "grad_norm": 0.1511541752452169, "learning_rate": 9.96764627502819e-05, "loss": 2.1146, "step": 4678 }, { "epoch": 0.03638832462833981, "grad_norm": 0.2593113994185541, "learning_rate": 9.967632396865121e-05, "loss": 2.0576, "step": 4679 }, { "epoch": 0.03639610157312039, "grad_norm": 0.1282643050042202, "learning_rate": 9.967618515735829e-05, "loss": 2.0267, "step": 4680 }, { "epoch": 0.03640387851790097, "grad_norm": 0.1570025737783829, "learning_rate": 9.967604631640324e-05, "loss": 2.0734, "step": 4681 }, { "epoch": 0.036411655462681554, "grad_norm": 0.18574971107657498, "learning_rate": 9.967590744578614e-05, "loss": 2.0733, "step": 4682 }, { "epoch": 0.036419432407462135, "grad_norm": 0.1259286824955409, "learning_rate": 9.967576854550706e-05, "loss": 2.114, "step": 4683 }, { "epoch": 0.036427209352242716, "grad_norm": 0.14237457324863576, "learning_rate": 9.967562961556607e-05, "loss": 2.108, "step": 4684 }, { "epoch": 0.0364349862970233, "grad_norm": 0.16728500585960346, "learning_rate": 9.96754906559633e-05, "loss": 2.0205, "step": 4685 }, { "epoch": 0.03644276324180388, "grad_norm": 0.1362715612587428, "learning_rate": 9.96753516666988e-05, "loss": 2.1269, "step": 4686 }, { "epoch": 0.03645054018658446, "grad_norm": 0.1170504904061279, "learning_rate": 9.967521264777264e-05, "loss": 2.1392, "step": 4687 }, { "epoch": 0.03645831713136504, "grad_norm": 0.14173687450477696, "learning_rate": 9.967507359918494e-05, "loss": 2.1093, "step": 4688 }, { "epoch": 0.03646609407614562, "grad_norm": 0.11471933078214362, "learning_rate": 9.967493452093577e-05, "loss": 2.0155, "step": 4689 }, { "epoch": 0.0364738710209262, "grad_norm": 0.12931596225834802, "learning_rate": 9.96747954130252e-05, "loss": 2.0618, "step": 4690 }, { "epoch": 0.036481647965706784, "grad_norm": 0.12989981716071938, "learning_rate": 9.96746562754533e-05, "loss": 2.1062, "step": 4691 }, { "epoch": 0.036489424910487366, "grad_norm": 0.11673474312884717, "learning_rate": 9.96745171082202e-05, "loss": 2.1217, "step": 4692 }, { "epoch": 0.03649720185526795, "grad_norm": 0.12177416109072599, "learning_rate": 9.967437791132594e-05, "loss": 2.1226, "step": 4693 }, { "epoch": 0.03650497880004853, "grad_norm": 0.11679433275457264, "learning_rate": 9.967423868477062e-05, "loss": 2.0832, "step": 4694 }, { "epoch": 0.03651275574482911, "grad_norm": 0.11404379919843328, "learning_rate": 9.967409942855432e-05, "loss": 2.065, "step": 4695 }, { "epoch": 0.03652053268960969, "grad_norm": 0.12279382052003866, "learning_rate": 9.967396014267715e-05, "loss": 2.0923, "step": 4696 }, { "epoch": 0.03652830963439027, "grad_norm": 0.11421176354633579, "learning_rate": 9.967382082713915e-05, "loss": 2.1207, "step": 4697 }, { "epoch": 0.03653608657917085, "grad_norm": 0.11720754331984849, "learning_rate": 9.967368148194042e-05, "loss": 2.1351, "step": 4698 }, { "epoch": 0.036543863523951434, "grad_norm": 0.1163197768237221, "learning_rate": 9.967354210708105e-05, "loss": 2.0318, "step": 4699 }, { "epoch": 0.036551640468732015, "grad_norm": 0.11322801462224104, "learning_rate": 9.967340270256111e-05, "loss": 2.0624, "step": 4700 }, { "epoch": 0.036559417413512596, "grad_norm": 0.13079856668333117, "learning_rate": 9.967326326838071e-05, "loss": 2.0187, "step": 4701 }, { "epoch": 0.03656719435829318, "grad_norm": 0.12303058207584656, "learning_rate": 9.967312380453991e-05, "loss": 2.0396, "step": 4702 }, { "epoch": 0.03657497130307376, "grad_norm": 0.11771290017978908, "learning_rate": 9.96729843110388e-05, "loss": 2.0561, "step": 4703 }, { "epoch": 0.03658274824785434, "grad_norm": 0.15326645371836545, "learning_rate": 9.967284478787745e-05, "loss": 2.119, "step": 4704 }, { "epoch": 0.03659052519263492, "grad_norm": 0.14154845804328808, "learning_rate": 9.967270523505596e-05, "loss": 2.1025, "step": 4705 }, { "epoch": 0.0365983021374155, "grad_norm": 0.11490481143891745, "learning_rate": 9.967256565257441e-05, "loss": 2.1044, "step": 4706 }, { "epoch": 0.03660607908219608, "grad_norm": 0.14961768703692602, "learning_rate": 9.967242604043288e-05, "loss": 2.1166, "step": 4707 }, { "epoch": 0.036613856026976664, "grad_norm": 0.14607385071605922, "learning_rate": 9.967228639863146e-05, "loss": 2.1189, "step": 4708 }, { "epoch": 0.036621632971757245, "grad_norm": 0.11285842032104465, "learning_rate": 9.967214672717023e-05, "loss": 2.1441, "step": 4709 }, { "epoch": 0.036629409916537826, "grad_norm": 0.1277618591995005, "learning_rate": 9.967200702604925e-05, "loss": 2.1038, "step": 4710 }, { "epoch": 0.03663718686131841, "grad_norm": 0.13204583725706798, "learning_rate": 9.967186729526864e-05, "loss": 2.0846, "step": 4711 }, { "epoch": 0.03664496380609899, "grad_norm": 0.1191298443134934, "learning_rate": 9.967172753482847e-05, "loss": 2.1416, "step": 4712 }, { "epoch": 0.03665274075087957, "grad_norm": 0.1523350930434485, "learning_rate": 9.967158774472882e-05, "loss": 2.1707, "step": 4713 }, { "epoch": 0.03666051769566015, "grad_norm": 0.13372077113403408, "learning_rate": 9.967144792496978e-05, "loss": 2.1142, "step": 4714 }, { "epoch": 0.03666829464044073, "grad_norm": 0.1272651457616384, "learning_rate": 9.967130807555143e-05, "loss": 2.1165, "step": 4715 }, { "epoch": 0.03667607158522131, "grad_norm": 0.1729232860783649, "learning_rate": 9.967116819647385e-05, "loss": 2.0566, "step": 4716 }, { "epoch": 0.036683848530001895, "grad_norm": 0.15928753325957504, "learning_rate": 9.967102828773713e-05, "loss": 2.0438, "step": 4717 }, { "epoch": 0.036691625474782476, "grad_norm": 0.11729768728970988, "learning_rate": 9.967088834934134e-05, "loss": 2.075, "step": 4718 }, { "epoch": 0.03669940241956306, "grad_norm": 0.16766182048847006, "learning_rate": 9.967074838128659e-05, "loss": 2.0516, "step": 4719 }, { "epoch": 0.03670717936434364, "grad_norm": 0.15702622837534227, "learning_rate": 9.967060838357292e-05, "loss": 2.1311, "step": 4720 }, { "epoch": 0.03671495630912422, "grad_norm": 0.11563846469922244, "learning_rate": 9.967046835620046e-05, "loss": 2.0654, "step": 4721 }, { "epoch": 0.03672273325390481, "grad_norm": 0.16995642224573113, "learning_rate": 9.967032829916926e-05, "loss": 2.131, "step": 4722 }, { "epoch": 0.03673051019868539, "grad_norm": 0.15862655619581978, "learning_rate": 9.967018821247944e-05, "loss": 2.067, "step": 4723 }, { "epoch": 0.03673828714346597, "grad_norm": 0.11484019997059933, "learning_rate": 9.967004809613104e-05, "loss": 2.0867, "step": 4724 }, { "epoch": 0.03674606408824655, "grad_norm": 0.17263654210913415, "learning_rate": 9.966990795012418e-05, "loss": 2.1346, "step": 4725 }, { "epoch": 0.03675384103302713, "grad_norm": 0.14842145788416383, "learning_rate": 9.966976777445892e-05, "loss": 2.0489, "step": 4726 }, { "epoch": 0.03676161797780771, "grad_norm": 0.11842931643914238, "learning_rate": 9.966962756913535e-05, "loss": 2.1142, "step": 4727 }, { "epoch": 0.036769394922588294, "grad_norm": 0.18239201534229404, "learning_rate": 9.966948733415355e-05, "loss": 2.0709, "step": 4728 }, { "epoch": 0.036777171867368876, "grad_norm": 0.1584491565235035, "learning_rate": 9.966934706951363e-05, "loss": 2.1261, "step": 4729 }, { "epoch": 0.03678494881214946, "grad_norm": 0.11625510864307495, "learning_rate": 9.966920677521565e-05, "loss": 2.0722, "step": 4730 }, { "epoch": 0.03679272575693004, "grad_norm": 0.17478128656401193, "learning_rate": 9.966906645125968e-05, "loss": 2.0589, "step": 4731 }, { "epoch": 0.03680050270171062, "grad_norm": 0.1681762447837659, "learning_rate": 9.966892609764584e-05, "loss": 2.0962, "step": 4732 }, { "epoch": 0.0368082796464912, "grad_norm": 0.11312547675466354, "learning_rate": 9.966878571437419e-05, "loss": 2.0985, "step": 4733 }, { "epoch": 0.03681605659127178, "grad_norm": 0.15266817130555374, "learning_rate": 9.96686453014448e-05, "loss": 2.0819, "step": 4734 }, { "epoch": 0.03682383353605236, "grad_norm": 0.1422219057903843, "learning_rate": 9.966850485885779e-05, "loss": 2.112, "step": 4735 }, { "epoch": 0.036831610480832944, "grad_norm": 0.1140048269595598, "learning_rate": 9.966836438661322e-05, "loss": 2.1278, "step": 4736 }, { "epoch": 0.036839387425613525, "grad_norm": 0.1619078049230878, "learning_rate": 9.96682238847112e-05, "loss": 2.0976, "step": 4737 }, { "epoch": 0.036847164370394106, "grad_norm": 0.14229968914070606, "learning_rate": 9.966808335315178e-05, "loss": 2.0988, "step": 4738 }, { "epoch": 0.03685494131517469, "grad_norm": 0.1417245883453647, "learning_rate": 9.966794279193505e-05, "loss": 2.0726, "step": 4739 }, { "epoch": 0.03686271825995527, "grad_norm": 0.21337627145984492, "learning_rate": 9.96678022010611e-05, "loss": 2.0411, "step": 4740 }, { "epoch": 0.03687049520473585, "grad_norm": 0.1916245928503509, "learning_rate": 9.966766158053003e-05, "loss": 2.0702, "step": 4741 }, { "epoch": 0.03687827214951643, "grad_norm": 0.1702354348662733, "learning_rate": 9.96675209303419e-05, "loss": 2.0687, "step": 4742 }, { "epoch": 0.03688604909429701, "grad_norm": 0.2194134359169569, "learning_rate": 9.966738025049683e-05, "loss": 2.1069, "step": 4743 }, { "epoch": 0.03689382603907759, "grad_norm": 0.17073187818794822, "learning_rate": 9.966723954099483e-05, "loss": 2.1021, "step": 4744 }, { "epoch": 0.036901602983858174, "grad_norm": 0.134573032072693, "learning_rate": 9.966709880183606e-05, "loss": 2.0403, "step": 4745 }, { "epoch": 0.036909379928638755, "grad_norm": 0.24577985707155164, "learning_rate": 9.966695803302058e-05, "loss": 2.1216, "step": 4746 }, { "epoch": 0.036917156873419336, "grad_norm": 0.16531936407966066, "learning_rate": 9.966681723454847e-05, "loss": 2.0558, "step": 4747 }, { "epoch": 0.03692493381819992, "grad_norm": 0.14943833746138951, "learning_rate": 9.96666764064198e-05, "loss": 2.1339, "step": 4748 }, { "epoch": 0.0369327107629805, "grad_norm": 0.23726519714090327, "learning_rate": 9.966653554863467e-05, "loss": 2.0586, "step": 4749 }, { "epoch": 0.03694048770776108, "grad_norm": 0.1638991055598552, "learning_rate": 9.966639466119319e-05, "loss": 2.1211, "step": 4750 }, { "epoch": 0.03694826465254166, "grad_norm": 0.13866251901142543, "learning_rate": 9.96662537440954e-05, "loss": 2.0635, "step": 4751 }, { "epoch": 0.03695604159732224, "grad_norm": 0.2899384541051894, "learning_rate": 9.966611279734138e-05, "loss": 2.0709, "step": 4752 }, { "epoch": 0.03696381854210282, "grad_norm": 0.11972103414595472, "learning_rate": 9.966597182093125e-05, "loss": 2.1048, "step": 4753 }, { "epoch": 0.036971595486883405, "grad_norm": 0.18089281547198158, "learning_rate": 9.96658308148651e-05, "loss": 2.0595, "step": 4754 }, { "epoch": 0.036979372431663986, "grad_norm": 0.15556766676868364, "learning_rate": 9.966568977914296e-05, "loss": 2.0739, "step": 4755 }, { "epoch": 0.03698714937644457, "grad_norm": 0.13939444808948245, "learning_rate": 9.966554871376495e-05, "loss": 2.1058, "step": 4756 }, { "epoch": 0.03699492632122515, "grad_norm": 0.19490620411006912, "learning_rate": 9.966540761873118e-05, "loss": 2.0796, "step": 4757 }, { "epoch": 0.03700270326600573, "grad_norm": 0.11488336688351591, "learning_rate": 9.966526649404168e-05, "loss": 2.0811, "step": 4758 }, { "epoch": 0.03701048021078631, "grad_norm": 0.16501224412397816, "learning_rate": 9.966512533969656e-05, "loss": 2.1129, "step": 4759 }, { "epoch": 0.03701825715556689, "grad_norm": 0.15843141395275742, "learning_rate": 9.966498415569592e-05, "loss": 2.1276, "step": 4760 }, { "epoch": 0.03702603410034747, "grad_norm": 0.12241312102782062, "learning_rate": 9.966484294203982e-05, "loss": 2.1144, "step": 4761 }, { "epoch": 0.037033811045128054, "grad_norm": 0.15179514674546746, "learning_rate": 9.966470169872835e-05, "loss": 2.1303, "step": 4762 }, { "epoch": 0.037041587989908635, "grad_norm": 0.11584330999473068, "learning_rate": 9.96645604257616e-05, "loss": 2.055, "step": 4763 }, { "epoch": 0.037049364934689216, "grad_norm": 0.1330927251349119, "learning_rate": 9.966441912313965e-05, "loss": 2.0633, "step": 4764 }, { "epoch": 0.0370571418794698, "grad_norm": 0.13391616269243825, "learning_rate": 9.966427779086259e-05, "loss": 2.0968, "step": 4765 }, { "epoch": 0.03706491882425038, "grad_norm": 0.11695209610631761, "learning_rate": 9.96641364289305e-05, "loss": 2.0759, "step": 4766 }, { "epoch": 0.03707269576903096, "grad_norm": 0.1508091274876541, "learning_rate": 9.966399503734347e-05, "loss": 2.1013, "step": 4767 }, { "epoch": 0.03708047271381154, "grad_norm": 0.13721209970936787, "learning_rate": 9.966385361610156e-05, "loss": 2.0958, "step": 4768 }, { "epoch": 0.03708824965859212, "grad_norm": 0.11814255501717931, "learning_rate": 9.96637121652049e-05, "loss": 2.0658, "step": 4769 }, { "epoch": 0.0370960266033727, "grad_norm": 0.1276052073076485, "learning_rate": 9.966357068465353e-05, "loss": 2.0668, "step": 4770 }, { "epoch": 0.037103803548153284, "grad_norm": 0.12263354761947713, "learning_rate": 9.966342917444755e-05, "loss": 2.0601, "step": 4771 }, { "epoch": 0.037111580492933866, "grad_norm": 0.11901962277663981, "learning_rate": 9.966328763458706e-05, "loss": 2.1064, "step": 4772 }, { "epoch": 0.03711935743771445, "grad_norm": 0.29156465607221177, "learning_rate": 9.966314606507212e-05, "loss": 2.0924, "step": 4773 }, { "epoch": 0.03712713438249503, "grad_norm": 0.12587174816386854, "learning_rate": 9.966300446590284e-05, "loss": 2.1129, "step": 4774 }, { "epoch": 0.03713491132727561, "grad_norm": 0.11908198198434908, "learning_rate": 9.966286283707929e-05, "loss": 2.146, "step": 4775 }, { "epoch": 0.03714268827205619, "grad_norm": 0.11709077130294043, "learning_rate": 9.966272117860153e-05, "loss": 2.1208, "step": 4776 }, { "epoch": 0.03715046521683677, "grad_norm": 0.12324153940610148, "learning_rate": 9.96625794904697e-05, "loss": 2.0512, "step": 4777 }, { "epoch": 0.03715824216161735, "grad_norm": 0.11691485298199805, "learning_rate": 9.966243777268385e-05, "loss": 2.1535, "step": 4778 }, { "epoch": 0.037166019106397934, "grad_norm": 0.11509363340616613, "learning_rate": 9.966229602524405e-05, "loss": 2.0621, "step": 4779 }, { "epoch": 0.037173796051178515, "grad_norm": 0.11626637479632115, "learning_rate": 9.966215424815042e-05, "loss": 2.0847, "step": 4780 }, { "epoch": 0.0371815729959591, "grad_norm": 0.11634641891389416, "learning_rate": 9.966201244140303e-05, "loss": 2.1047, "step": 4781 }, { "epoch": 0.037189349940739684, "grad_norm": 0.11695339926795748, "learning_rate": 9.966187060500196e-05, "loss": 2.1013, "step": 4782 }, { "epoch": 0.037197126885520265, "grad_norm": 0.1171699416414077, "learning_rate": 9.966172873894729e-05, "loss": 2.1369, "step": 4783 }, { "epoch": 0.037204903830300846, "grad_norm": 0.12178389600098616, "learning_rate": 9.966158684323912e-05, "loss": 2.0944, "step": 4784 }, { "epoch": 0.03721268077508143, "grad_norm": 0.11473446057228971, "learning_rate": 9.966144491787752e-05, "loss": 2.1589, "step": 4785 }, { "epoch": 0.03722045771986201, "grad_norm": 0.1278563405964415, "learning_rate": 9.966130296286258e-05, "loss": 2.1022, "step": 4786 }, { "epoch": 0.03722823466464259, "grad_norm": 0.15974561034429138, "learning_rate": 9.96611609781944e-05, "loss": 2.1654, "step": 4787 }, { "epoch": 0.03723601160942317, "grad_norm": 0.12563613845543212, "learning_rate": 9.966101896387304e-05, "loss": 2.0404, "step": 4788 }, { "epoch": 0.03724378855420375, "grad_norm": 0.11947220479831475, "learning_rate": 9.96608769198986e-05, "loss": 2.0915, "step": 4789 }, { "epoch": 0.03725156549898433, "grad_norm": 0.1281157004785992, "learning_rate": 9.966073484627116e-05, "loss": 2.1136, "step": 4790 }, { "epoch": 0.037259342443764915, "grad_norm": 0.12157480760176496, "learning_rate": 9.966059274299082e-05, "loss": 2.094, "step": 4791 }, { "epoch": 0.037267119388545496, "grad_norm": 0.1262235432734631, "learning_rate": 9.966045061005763e-05, "loss": 2.029, "step": 4792 }, { "epoch": 0.03727489633332608, "grad_norm": 0.1274802205164814, "learning_rate": 9.966030844747171e-05, "loss": 2.0884, "step": 4793 }, { "epoch": 0.03728267327810666, "grad_norm": 0.11355390327168959, "learning_rate": 9.966016625523312e-05, "loss": 2.0946, "step": 4794 }, { "epoch": 0.03729045022288724, "grad_norm": 0.11738666893726243, "learning_rate": 9.966002403334196e-05, "loss": 1.9912, "step": 4795 }, { "epoch": 0.03729822716766782, "grad_norm": 0.11764242465683095, "learning_rate": 9.965988178179831e-05, "loss": 2.066, "step": 4796 }, { "epoch": 0.0373060041124484, "grad_norm": 0.11802555470373916, "learning_rate": 9.965973950060226e-05, "loss": 2.1185, "step": 4797 }, { "epoch": 0.03731378105722898, "grad_norm": 0.12828699758206935, "learning_rate": 9.965959718975388e-05, "loss": 2.0691, "step": 4798 }, { "epoch": 0.037321558002009564, "grad_norm": 0.1131318191225322, "learning_rate": 9.965945484925328e-05, "loss": 2.1233, "step": 4799 }, { "epoch": 0.037329334946790145, "grad_norm": 0.11965130214689795, "learning_rate": 9.965931247910051e-05, "loss": 2.1208, "step": 4800 }, { "epoch": 0.037337111891570726, "grad_norm": 0.11503611466864753, "learning_rate": 9.96591700792957e-05, "loss": 2.1224, "step": 4801 }, { "epoch": 0.03734488883635131, "grad_norm": 0.11462148239906989, "learning_rate": 9.96590276498389e-05, "loss": 2.151, "step": 4802 }, { "epoch": 0.03735266578113189, "grad_norm": 0.12277226702660358, "learning_rate": 9.96588851907302e-05, "loss": 2.0907, "step": 4803 }, { "epoch": 0.03736044272591247, "grad_norm": 0.11559502665947965, "learning_rate": 9.965874270196968e-05, "loss": 2.0951, "step": 4804 }, { "epoch": 0.03736821967069305, "grad_norm": 0.11797305322322821, "learning_rate": 9.965860018355746e-05, "loss": 2.0788, "step": 4805 }, { "epoch": 0.03737599661547363, "grad_norm": 0.112516419038037, "learning_rate": 9.96584576354936e-05, "loss": 2.0892, "step": 4806 }, { "epoch": 0.03738377356025421, "grad_norm": 0.11800799237329075, "learning_rate": 9.965831505777817e-05, "loss": 2.0915, "step": 4807 }, { "epoch": 0.037391550505034794, "grad_norm": 0.11683881881185922, "learning_rate": 9.965817245041126e-05, "loss": 2.0773, "step": 4808 }, { "epoch": 0.037399327449815375, "grad_norm": 0.1125350057962804, "learning_rate": 9.965802981339299e-05, "loss": 2.1037, "step": 4809 }, { "epoch": 0.03740710439459596, "grad_norm": 0.120858833984995, "learning_rate": 9.965788714672342e-05, "loss": 2.1064, "step": 4810 }, { "epoch": 0.03741488133937654, "grad_norm": 0.11444276864325689, "learning_rate": 9.965774445040263e-05, "loss": 2.1092, "step": 4811 }, { "epoch": 0.03742265828415712, "grad_norm": 0.1206853914575035, "learning_rate": 9.965760172443071e-05, "loss": 2.0101, "step": 4812 }, { "epoch": 0.0374304352289377, "grad_norm": 0.12939745395996158, "learning_rate": 9.965745896880775e-05, "loss": 2.0401, "step": 4813 }, { "epoch": 0.03743821217371828, "grad_norm": 0.12657476290839256, "learning_rate": 9.965731618353382e-05, "loss": 2.0431, "step": 4814 }, { "epoch": 0.03744598911849886, "grad_norm": 0.12231958390278377, "learning_rate": 9.965717336860903e-05, "loss": 2.1104, "step": 4815 }, { "epoch": 0.037453766063279444, "grad_norm": 0.12604850525556902, "learning_rate": 9.965703052403345e-05, "loss": 2.1223, "step": 4816 }, { "epoch": 0.037461543008060025, "grad_norm": 0.12455714023010223, "learning_rate": 9.965688764980717e-05, "loss": 2.1191, "step": 4817 }, { "epoch": 0.037469319952840606, "grad_norm": 0.12131179569347465, "learning_rate": 9.965674474593027e-05, "loss": 2.1095, "step": 4818 }, { "epoch": 0.03747709689762119, "grad_norm": 0.18844204878103804, "learning_rate": 9.965660181240285e-05, "loss": 2.0624, "step": 4819 }, { "epoch": 0.03748487384240177, "grad_norm": 0.14208760869802073, "learning_rate": 9.965645884922497e-05, "loss": 2.057, "step": 4820 }, { "epoch": 0.03749265078718235, "grad_norm": 0.173486958201629, "learning_rate": 9.965631585639672e-05, "loss": 2.0357, "step": 4821 }, { "epoch": 0.03750042773196293, "grad_norm": 0.14527639600428885, "learning_rate": 9.965617283391822e-05, "loss": 2.0897, "step": 4822 }, { "epoch": 0.03750820467674351, "grad_norm": 0.11928629313013014, "learning_rate": 9.965602978178951e-05, "loss": 2.1157, "step": 4823 }, { "epoch": 0.03751598162152409, "grad_norm": 0.11423097440376216, "learning_rate": 9.96558867000107e-05, "loss": 2.0521, "step": 4824 }, { "epoch": 0.037523758566304674, "grad_norm": 0.13535158919418497, "learning_rate": 9.965574358858187e-05, "loss": 2.095, "step": 4825 }, { "epoch": 0.037531535511085255, "grad_norm": 0.11425580605895799, "learning_rate": 9.965560044750311e-05, "loss": 2.0847, "step": 4826 }, { "epoch": 0.037539312455865836, "grad_norm": 0.11791579688490741, "learning_rate": 9.96554572767745e-05, "loss": 2.1125, "step": 4827 }, { "epoch": 0.03754708940064642, "grad_norm": 0.11936252931820991, "learning_rate": 9.965531407639614e-05, "loss": 2.1148, "step": 4828 }, { "epoch": 0.037554866345427, "grad_norm": 0.11918198943521191, "learning_rate": 9.965517084636808e-05, "loss": 2.0499, "step": 4829 }, { "epoch": 0.03756264329020758, "grad_norm": 0.11604259892528904, "learning_rate": 9.965502758669045e-05, "loss": 2.0747, "step": 4830 }, { "epoch": 0.03757042023498816, "grad_norm": 0.11741840884066797, "learning_rate": 9.96548842973633e-05, "loss": 2.0972, "step": 4831 }, { "epoch": 0.03757819717976874, "grad_norm": 0.1382249840308211, "learning_rate": 9.965474097838673e-05, "loss": 2.1111, "step": 4832 }, { "epoch": 0.03758597412454932, "grad_norm": 0.15311010023065252, "learning_rate": 9.965459762976083e-05, "loss": 2.1305, "step": 4833 }, { "epoch": 0.037593751069329905, "grad_norm": 0.14930976483669461, "learning_rate": 9.965445425148567e-05, "loss": 2.0389, "step": 4834 }, { "epoch": 0.037601528014110486, "grad_norm": 0.12494272031357385, "learning_rate": 9.965431084356136e-05, "loss": 2.0778, "step": 4835 }, { "epoch": 0.03760930495889107, "grad_norm": 0.17111957097728162, "learning_rate": 9.965416740598796e-05, "loss": 2.1062, "step": 4836 }, { "epoch": 0.03761708190367165, "grad_norm": 0.19657406326303903, "learning_rate": 9.965402393876557e-05, "loss": 2.0729, "step": 4837 }, { "epoch": 0.03762485884845223, "grad_norm": 0.12149116870415204, "learning_rate": 9.96538804418943e-05, "loss": 2.092, "step": 4838 }, { "epoch": 0.03763263579323281, "grad_norm": 0.622682182516152, "learning_rate": 9.965373691537417e-05, "loss": 2.099, "step": 4839 }, { "epoch": 0.03764041273801339, "grad_norm": 0.25138317090231244, "learning_rate": 9.965359335920532e-05, "loss": 2.1556, "step": 4840 }, { "epoch": 0.03764818968279398, "grad_norm": 0.11924157709882065, "learning_rate": 9.965344977338782e-05, "loss": 2.1205, "step": 4841 }, { "epoch": 0.03765596662757456, "grad_norm": 0.2209925131258195, "learning_rate": 9.965330615792177e-05, "loss": 2.0961, "step": 4842 }, { "epoch": 0.03766374357235514, "grad_norm": 0.26082284403617134, "learning_rate": 9.965316251280722e-05, "loss": 2.1089, "step": 4843 }, { "epoch": 0.03767152051713572, "grad_norm": 0.1572115949160704, "learning_rate": 9.965301883804427e-05, "loss": 2.0776, "step": 4844 }, { "epoch": 0.037679297461916304, "grad_norm": 0.16753267587910847, "learning_rate": 9.965287513363304e-05, "loss": 2.073, "step": 4845 }, { "epoch": 0.037687074406696885, "grad_norm": 0.18822665564112956, "learning_rate": 9.965273139957358e-05, "loss": 2.1946, "step": 4846 }, { "epoch": 0.03769485135147747, "grad_norm": 0.11508266324199222, "learning_rate": 9.965258763586598e-05, "loss": 2.1068, "step": 4847 }, { "epoch": 0.03770262829625805, "grad_norm": 0.15232116403826446, "learning_rate": 9.965244384251034e-05, "loss": 2.0345, "step": 4848 }, { "epoch": 0.03771040524103863, "grad_norm": 0.13955058865828066, "learning_rate": 9.965230001950672e-05, "loss": 2.1067, "step": 4849 }, { "epoch": 0.03771818218581921, "grad_norm": 0.12853667504838065, "learning_rate": 9.965215616685523e-05, "loss": 2.0832, "step": 4850 }, { "epoch": 0.03772595913059979, "grad_norm": 0.12541603756157388, "learning_rate": 9.965201228455597e-05, "loss": 2.0967, "step": 4851 }, { "epoch": 0.03773373607538037, "grad_norm": 0.1323030635196227, "learning_rate": 9.965186837260897e-05, "loss": 2.1245, "step": 4852 }, { "epoch": 0.037741513020160954, "grad_norm": 0.12154983358958099, "learning_rate": 9.965172443101436e-05, "loss": 2.1135, "step": 4853 }, { "epoch": 0.037749289964941535, "grad_norm": 0.18582824755146, "learning_rate": 9.965158045977223e-05, "loss": 2.0849, "step": 4854 }, { "epoch": 0.037757066909722116, "grad_norm": 0.15882831622181584, "learning_rate": 9.965143645888264e-05, "loss": 2.1507, "step": 4855 }, { "epoch": 0.0377648438545027, "grad_norm": 0.1284169154813153, "learning_rate": 9.965129242834568e-05, "loss": 2.1187, "step": 4856 }, { "epoch": 0.03777262079928328, "grad_norm": 0.12082670177456896, "learning_rate": 9.965114836816148e-05, "loss": 2.077, "step": 4857 }, { "epoch": 0.03778039774406386, "grad_norm": 0.12779046187420381, "learning_rate": 9.965100427833005e-05, "loss": 2.0858, "step": 4858 }, { "epoch": 0.03778817468884444, "grad_norm": 0.11671637419303853, "learning_rate": 9.965086015885153e-05, "loss": 2.0868, "step": 4859 }, { "epoch": 0.03779595163362502, "grad_norm": 0.12425133994797291, "learning_rate": 9.965071600972599e-05, "loss": 2.0925, "step": 4860 }, { "epoch": 0.0378037285784056, "grad_norm": 0.12147298715242233, "learning_rate": 9.965057183095353e-05, "loss": 2.0997, "step": 4861 }, { "epoch": 0.037811505523186184, "grad_norm": 0.11334689197393417, "learning_rate": 9.965042762253422e-05, "loss": 2.0958, "step": 4862 }, { "epoch": 0.037819282467966765, "grad_norm": 0.1256727251846386, "learning_rate": 9.965028338446815e-05, "loss": 2.1436, "step": 4863 }, { "epoch": 0.037827059412747346, "grad_norm": 0.11642133871674114, "learning_rate": 9.96501391167554e-05, "loss": 2.0749, "step": 4864 }, { "epoch": 0.03783483635752793, "grad_norm": 0.2120999360883895, "learning_rate": 9.964999481939606e-05, "loss": 2.0857, "step": 4865 }, { "epoch": 0.03784261330230851, "grad_norm": 0.12352546848428307, "learning_rate": 9.964985049239023e-05, "loss": 2.118, "step": 4866 }, { "epoch": 0.03785039024708909, "grad_norm": 0.1215212463600904, "learning_rate": 9.964970613573798e-05, "loss": 2.1032, "step": 4867 }, { "epoch": 0.03785816719186967, "grad_norm": 0.11776479962125602, "learning_rate": 9.96495617494394e-05, "loss": 2.1583, "step": 4868 }, { "epoch": 0.03786594413665025, "grad_norm": 0.1195141003889412, "learning_rate": 9.964941733349458e-05, "loss": 2.0884, "step": 4869 }, { "epoch": 0.03787372108143083, "grad_norm": 0.2643991700291563, "learning_rate": 9.964927288790359e-05, "loss": 2.1049, "step": 4870 }, { "epoch": 0.037881498026211415, "grad_norm": 0.11231105942296804, "learning_rate": 9.964912841266656e-05, "loss": 2.1032, "step": 4871 }, { "epoch": 0.037889274970991996, "grad_norm": 0.11524020261948674, "learning_rate": 9.964898390778353e-05, "loss": 2.0461, "step": 4872 }, { "epoch": 0.03789705191577258, "grad_norm": 0.11327100003376885, "learning_rate": 9.96488393732546e-05, "loss": 2.0973, "step": 4873 }, { "epoch": 0.03790482886055316, "grad_norm": 0.11920522590722792, "learning_rate": 9.964869480907985e-05, "loss": 2.0824, "step": 4874 }, { "epoch": 0.03791260580533374, "grad_norm": 0.1579656192399772, "learning_rate": 9.96485502152594e-05, "loss": 2.1223, "step": 4875 }, { "epoch": 0.03792038275011432, "grad_norm": 0.11246526997527433, "learning_rate": 9.96484055917933e-05, "loss": 2.1242, "step": 4876 }, { "epoch": 0.0379281596948949, "grad_norm": 0.12054698546350752, "learning_rate": 9.964826093868165e-05, "loss": 2.1019, "step": 4877 }, { "epoch": 0.03793593663967548, "grad_norm": 0.12220250831747806, "learning_rate": 9.964811625592452e-05, "loss": 2.0929, "step": 4878 }, { "epoch": 0.037943713584456064, "grad_norm": 0.16715285093593632, "learning_rate": 9.964797154352203e-05, "loss": 2.0681, "step": 4879 }, { "epoch": 0.037951490529236645, "grad_norm": 0.14130231708787563, "learning_rate": 9.964782680147423e-05, "loss": 2.131, "step": 4880 }, { "epoch": 0.037959267474017226, "grad_norm": 0.17181365473482738, "learning_rate": 9.964768202978123e-05, "loss": 2.1485, "step": 4881 }, { "epoch": 0.03796704441879781, "grad_norm": 0.1770041947521624, "learning_rate": 9.96475372284431e-05, "loss": 2.0296, "step": 4882 }, { "epoch": 0.03797482136357839, "grad_norm": 0.12730519607278462, "learning_rate": 9.964739239745995e-05, "loss": 2.0344, "step": 4883 }, { "epoch": 0.03798259830835897, "grad_norm": 0.12430768377723501, "learning_rate": 9.964724753683186e-05, "loss": 1.9984, "step": 4884 }, { "epoch": 0.03799037525313955, "grad_norm": 0.17134189921927195, "learning_rate": 9.96471026465589e-05, "loss": 2.0951, "step": 4885 }, { "epoch": 0.03799815219792013, "grad_norm": 0.16378647651796296, "learning_rate": 9.964695772664117e-05, "loss": 2.1066, "step": 4886 }, { "epoch": 0.03800592914270071, "grad_norm": 0.11960317173299434, "learning_rate": 9.964681277707874e-05, "loss": 2.0805, "step": 4887 }, { "epoch": 0.038013706087481294, "grad_norm": 0.13172161546922492, "learning_rate": 9.964666779787172e-05, "loss": 2.1025, "step": 4888 }, { "epoch": 0.038021483032261875, "grad_norm": 0.14086636462673044, "learning_rate": 9.964652278902018e-05, "loss": 2.1219, "step": 4889 }, { "epoch": 0.03802925997704246, "grad_norm": 0.13077709799042792, "learning_rate": 9.964637775052421e-05, "loss": 2.0593, "step": 4890 }, { "epoch": 0.03803703692182304, "grad_norm": 0.11689881047812575, "learning_rate": 9.964623268238391e-05, "loss": 2.0967, "step": 4891 }, { "epoch": 0.03804481386660362, "grad_norm": 0.11942847119943117, "learning_rate": 9.964608758459935e-05, "loss": 2.1006, "step": 4892 }, { "epoch": 0.0380525908113842, "grad_norm": 0.14754639982067952, "learning_rate": 9.964594245717061e-05, "loss": 2.0949, "step": 4893 }, { "epoch": 0.03806036775616478, "grad_norm": 0.1525378334162843, "learning_rate": 9.96457973000978e-05, "loss": 2.0967, "step": 4894 }, { "epoch": 0.03806814470094536, "grad_norm": 0.11355600882855925, "learning_rate": 9.9645652113381e-05, "loss": 2.0998, "step": 4895 }, { "epoch": 0.038075921645725944, "grad_norm": 0.1548251251142548, "learning_rate": 9.964550689702027e-05, "loss": 2.0551, "step": 4896 }, { "epoch": 0.038083698590506525, "grad_norm": 0.17501265522340004, "learning_rate": 9.964536165101575e-05, "loss": 2.1186, "step": 4897 }, { "epoch": 0.038091475535287106, "grad_norm": 0.11951209993667669, "learning_rate": 9.964521637536748e-05, "loss": 2.1001, "step": 4898 }, { "epoch": 0.03809925248006769, "grad_norm": 0.1435113479418879, "learning_rate": 9.964507107007556e-05, "loss": 2.1018, "step": 4899 }, { "epoch": 0.038107029424848275, "grad_norm": 0.18017703678049657, "learning_rate": 9.964492573514009e-05, "loss": 2.1052, "step": 4900 }, { "epoch": 0.038114806369628856, "grad_norm": 0.1273425391887175, "learning_rate": 9.964478037056114e-05, "loss": 2.1037, "step": 4901 }, { "epoch": 0.03812258331440944, "grad_norm": 0.11669072428941243, "learning_rate": 9.964463497633879e-05, "loss": 2.0519, "step": 4902 }, { "epoch": 0.03813036025919002, "grad_norm": 0.19929440776165822, "learning_rate": 9.964448955247314e-05, "loss": 2.0846, "step": 4903 }, { "epoch": 0.0381381372039706, "grad_norm": 0.115736973250527, "learning_rate": 9.96443440989643e-05, "loss": 2.0964, "step": 4904 }, { "epoch": 0.03814591414875118, "grad_norm": 0.16887821324855723, "learning_rate": 9.964419861581231e-05, "loss": 2.1509, "step": 4905 }, { "epoch": 0.03815369109353176, "grad_norm": 0.17953582618383218, "learning_rate": 9.96440531030173e-05, "loss": 2.0994, "step": 4906 }, { "epoch": 0.03816146803831234, "grad_norm": 0.5031010750357652, "learning_rate": 9.964390756057931e-05, "loss": 2.0795, "step": 4907 }, { "epoch": 0.038169244983092925, "grad_norm": 0.15954740129240028, "learning_rate": 9.964376198849847e-05, "loss": 2.1362, "step": 4908 }, { "epoch": 0.038177021927873506, "grad_norm": 0.23394152198796206, "learning_rate": 9.964361638677485e-05, "loss": 2.1072, "step": 4909 }, { "epoch": 0.03818479887265409, "grad_norm": 0.1576139916906087, "learning_rate": 9.964347075540854e-05, "loss": 2.0556, "step": 4910 }, { "epoch": 0.03819257581743467, "grad_norm": 0.16468950647596448, "learning_rate": 9.964332509439962e-05, "loss": 2.1123, "step": 4911 }, { "epoch": 0.03820035276221525, "grad_norm": 0.24987211608817086, "learning_rate": 9.964317940374819e-05, "loss": 2.063, "step": 4912 }, { "epoch": 0.03820812970699583, "grad_norm": 0.3105289926201444, "learning_rate": 9.964303368345431e-05, "loss": 2.0654, "step": 4913 }, { "epoch": 0.03821590665177641, "grad_norm": 0.21270118418632677, "learning_rate": 9.96428879335181e-05, "loss": 2.081, "step": 4914 }, { "epoch": 0.03822368359655699, "grad_norm": 0.27784186665081206, "learning_rate": 9.964274215393963e-05, "loss": 2.1145, "step": 4915 }, { "epoch": 0.038231460541337574, "grad_norm": 0.12305089277122898, "learning_rate": 9.9642596344719e-05, "loss": 2.0415, "step": 4916 }, { "epoch": 0.038239237486118155, "grad_norm": 0.29748578004548165, "learning_rate": 9.964245050585627e-05, "loss": 2.0559, "step": 4917 }, { "epoch": 0.038247014430898736, "grad_norm": 0.2667016120964083, "learning_rate": 9.964230463735155e-05, "loss": 2.1314, "step": 4918 }, { "epoch": 0.03825479137567932, "grad_norm": 0.15544655808410643, "learning_rate": 9.964215873920492e-05, "loss": 2.1186, "step": 4919 }, { "epoch": 0.0382625683204599, "grad_norm": 0.3088928562403527, "learning_rate": 9.964201281141646e-05, "loss": 2.0596, "step": 4920 }, { "epoch": 0.03827034526524048, "grad_norm": 0.5017685647448971, "learning_rate": 9.964186685398627e-05, "loss": 2.1164, "step": 4921 }, { "epoch": 0.03827812221002106, "grad_norm": 0.2173518669593589, "learning_rate": 9.964172086691445e-05, "loss": 2.0372, "step": 4922 }, { "epoch": 0.03828589915480164, "grad_norm": 0.3185400495608179, "learning_rate": 9.964157485020104e-05, "loss": 2.0328, "step": 4923 }, { "epoch": 0.03829367609958222, "grad_norm": 0.19796303704646484, "learning_rate": 9.964142880384618e-05, "loss": 2.0389, "step": 4924 }, { "epoch": 0.038301453044362804, "grad_norm": 0.158817898412342, "learning_rate": 9.964128272784992e-05, "loss": 2.0821, "step": 4925 }, { "epoch": 0.038309229989143385, "grad_norm": 0.13330281379955805, "learning_rate": 9.964113662221237e-05, "loss": 2.0683, "step": 4926 }, { "epoch": 0.03831700693392397, "grad_norm": 0.17682122978928577, "learning_rate": 9.96409904869336e-05, "loss": 2.123, "step": 4927 }, { "epoch": 0.03832478387870455, "grad_norm": 0.18580468626599936, "learning_rate": 9.964084432201371e-05, "loss": 2.078, "step": 4928 }, { "epoch": 0.03833256082348513, "grad_norm": 0.13542460252503924, "learning_rate": 9.964069812745278e-05, "loss": 2.113, "step": 4929 }, { "epoch": 0.03834033776826571, "grad_norm": 0.1402382150277646, "learning_rate": 9.964055190325091e-05, "loss": 2.1095, "step": 4930 }, { "epoch": 0.03834811471304629, "grad_norm": 0.14291390401381784, "learning_rate": 9.964040564940818e-05, "loss": 2.0692, "step": 4931 }, { "epoch": 0.03835589165782687, "grad_norm": 0.12706123043706838, "learning_rate": 9.964025936592465e-05, "loss": 2.1283, "step": 4932 }, { "epoch": 0.038363668602607454, "grad_norm": 0.12805648760527777, "learning_rate": 9.964011305280045e-05, "loss": 2.1028, "step": 4933 }, { "epoch": 0.038371445547388035, "grad_norm": 0.1318855323008305, "learning_rate": 9.963996671003564e-05, "loss": 2.033, "step": 4934 }, { "epoch": 0.038379222492168616, "grad_norm": 0.12708555044667522, "learning_rate": 9.963982033763033e-05, "loss": 2.0759, "step": 4935 }, { "epoch": 0.0383869994369492, "grad_norm": 0.16708580653048347, "learning_rate": 9.963967393558458e-05, "loss": 2.0777, "step": 4936 }, { "epoch": 0.03839477638172978, "grad_norm": 0.15836919507338526, "learning_rate": 9.963952750389851e-05, "loss": 2.0987, "step": 4937 }, { "epoch": 0.03840255332651036, "grad_norm": 0.12760534062100204, "learning_rate": 9.963938104257218e-05, "loss": 2.1313, "step": 4938 }, { "epoch": 0.03841033027129094, "grad_norm": 0.13574847273266855, "learning_rate": 9.963923455160569e-05, "loss": 2.0804, "step": 4939 }, { "epoch": 0.03841810721607152, "grad_norm": 0.2145048953101021, "learning_rate": 9.96390880309991e-05, "loss": 2.1137, "step": 4940 }, { "epoch": 0.0384258841608521, "grad_norm": 0.12277169691135706, "learning_rate": 9.963894148075254e-05, "loss": 2.0422, "step": 4941 }, { "epoch": 0.038433661105632684, "grad_norm": 0.15978268933889697, "learning_rate": 9.963879490086609e-05, "loss": 2.1377, "step": 4942 }, { "epoch": 0.038441438050413265, "grad_norm": 0.13004296283151498, "learning_rate": 9.963864829133982e-05, "loss": 2.1037, "step": 4943 }, { "epoch": 0.038449214995193846, "grad_norm": 0.12072499465153666, "learning_rate": 9.96385016521738e-05, "loss": 2.0724, "step": 4944 }, { "epoch": 0.03845699193997443, "grad_norm": 0.12424722959757017, "learning_rate": 9.963835498336818e-05, "loss": 2.1035, "step": 4945 }, { "epoch": 0.03846476888475501, "grad_norm": 0.1427048492366682, "learning_rate": 9.963820828492298e-05, "loss": 2.1169, "step": 4946 }, { "epoch": 0.03847254582953559, "grad_norm": 0.12976333058332504, "learning_rate": 9.963806155683832e-05, "loss": 2.0669, "step": 4947 }, { "epoch": 0.03848032277431617, "grad_norm": 0.11989239530213175, "learning_rate": 9.96379147991143e-05, "loss": 2.1181, "step": 4948 }, { "epoch": 0.03848809971909675, "grad_norm": 0.12889660351089904, "learning_rate": 9.963776801175099e-05, "loss": 2.0941, "step": 4949 }, { "epoch": 0.03849587666387733, "grad_norm": 0.1304463900172026, "learning_rate": 9.963762119474847e-05, "loss": 2.0783, "step": 4950 }, { "epoch": 0.038503653608657915, "grad_norm": 0.13146224079727825, "learning_rate": 9.963747434810684e-05, "loss": 2.0817, "step": 4951 }, { "epoch": 0.038511430553438496, "grad_norm": 0.12737621376300903, "learning_rate": 9.96373274718262e-05, "loss": 2.0196, "step": 4952 }, { "epoch": 0.03851920749821908, "grad_norm": 0.12046992340427462, "learning_rate": 9.96371805659066e-05, "loss": 2.0487, "step": 4953 }, { "epoch": 0.03852698444299966, "grad_norm": 0.11943274628647957, "learning_rate": 9.963703363034815e-05, "loss": 2.0426, "step": 4954 }, { "epoch": 0.03853476138778024, "grad_norm": 0.15550689377909033, "learning_rate": 9.963688666515095e-05, "loss": 2.1624, "step": 4955 }, { "epoch": 0.03854253833256082, "grad_norm": 0.160464374607802, "learning_rate": 9.963673967031509e-05, "loss": 2.0468, "step": 4956 }, { "epoch": 0.0385503152773414, "grad_norm": 0.12335004393498078, "learning_rate": 9.963659264584061e-05, "loss": 2.0761, "step": 4957 }, { "epoch": 0.03855809222212198, "grad_norm": 0.14486394563256075, "learning_rate": 9.963644559172764e-05, "loss": 2.0944, "step": 4958 }, { "epoch": 0.03856586916690257, "grad_norm": 0.16827779859562225, "learning_rate": 9.963629850797628e-05, "loss": 2.1164, "step": 4959 }, { "epoch": 0.03857364611168315, "grad_norm": 0.12700383788542172, "learning_rate": 9.963615139458657e-05, "loss": 2.1026, "step": 4960 }, { "epoch": 0.03858142305646373, "grad_norm": 0.12788315819425486, "learning_rate": 9.963600425155865e-05, "loss": 2.1656, "step": 4961 }, { "epoch": 0.038589200001244314, "grad_norm": 0.14486273121210516, "learning_rate": 9.963585707889258e-05, "loss": 2.1022, "step": 4962 }, { "epoch": 0.038596976946024895, "grad_norm": 0.11790803502837793, "learning_rate": 9.963570987658844e-05, "loss": 2.0818, "step": 4963 }, { "epoch": 0.03860475389080548, "grad_norm": 0.16241854341205225, "learning_rate": 9.963556264464633e-05, "loss": 2.0494, "step": 4964 }, { "epoch": 0.03861253083558606, "grad_norm": 0.1425449943399895, "learning_rate": 9.963541538306633e-05, "loss": 2.0829, "step": 4965 }, { "epoch": 0.03862030778036664, "grad_norm": 0.12356740299448399, "learning_rate": 9.963526809184856e-05, "loss": 2.0635, "step": 4966 }, { "epoch": 0.03862808472514722, "grad_norm": 0.13973990352276794, "learning_rate": 9.963512077099305e-05, "loss": 2.1813, "step": 4967 }, { "epoch": 0.0386358616699278, "grad_norm": 0.14018925053072562, "learning_rate": 9.963497342049994e-05, "loss": 2.0937, "step": 4968 }, { "epoch": 0.03864363861470838, "grad_norm": 0.1274555400139991, "learning_rate": 9.963482604036929e-05, "loss": 2.116, "step": 4969 }, { "epoch": 0.038651415559488964, "grad_norm": 0.118811536421875, "learning_rate": 9.96346786306012e-05, "loss": 2.1241, "step": 4970 }, { "epoch": 0.038659192504269545, "grad_norm": 0.13154361363822792, "learning_rate": 9.963453119119575e-05, "loss": 2.114, "step": 4971 }, { "epoch": 0.038666969449050126, "grad_norm": 0.12450214168542416, "learning_rate": 9.963438372215304e-05, "loss": 2.1059, "step": 4972 }, { "epoch": 0.03867474639383071, "grad_norm": 0.14719960509296356, "learning_rate": 9.963423622347316e-05, "loss": 2.0438, "step": 4973 }, { "epoch": 0.03868252333861129, "grad_norm": 0.1313452685930018, "learning_rate": 9.963408869515617e-05, "loss": 2.0706, "step": 4974 }, { "epoch": 0.03869030028339187, "grad_norm": 0.12051563329669039, "learning_rate": 9.963394113720217e-05, "loss": 2.0823, "step": 4975 }, { "epoch": 0.03869807722817245, "grad_norm": 0.11928461479765977, "learning_rate": 9.963379354961128e-05, "loss": 2.1191, "step": 4976 }, { "epoch": 0.03870585417295303, "grad_norm": 0.12047088494539096, "learning_rate": 9.963364593238355e-05, "loss": 2.0693, "step": 4977 }, { "epoch": 0.03871363111773361, "grad_norm": 0.11669825146659185, "learning_rate": 9.963349828551907e-05, "loss": 2.1365, "step": 4978 }, { "epoch": 0.038721408062514194, "grad_norm": 0.11593645689885435, "learning_rate": 9.963335060901796e-05, "loss": 2.0419, "step": 4979 }, { "epoch": 0.038729185007294775, "grad_norm": 0.11780586518858212, "learning_rate": 9.963320290288027e-05, "loss": 2.0676, "step": 4980 }, { "epoch": 0.038736961952075356, "grad_norm": 0.12747100173571108, "learning_rate": 9.963305516710613e-05, "loss": 2.0672, "step": 4981 }, { "epoch": 0.03874473889685594, "grad_norm": 0.12433764010544975, "learning_rate": 9.963290740169557e-05, "loss": 2.1052, "step": 4982 }, { "epoch": 0.03875251584163652, "grad_norm": 0.11534332156830138, "learning_rate": 9.963275960664872e-05, "loss": 2.0897, "step": 4983 }, { "epoch": 0.0387602927864171, "grad_norm": 0.13734262005941872, "learning_rate": 9.963261178196567e-05, "loss": 2.1304, "step": 4984 }, { "epoch": 0.03876806973119768, "grad_norm": 0.11773927617522655, "learning_rate": 9.963246392764651e-05, "loss": 2.121, "step": 4985 }, { "epoch": 0.03877584667597826, "grad_norm": 0.11383043976048629, "learning_rate": 9.963231604369131e-05, "loss": 2.0288, "step": 4986 }, { "epoch": 0.03878362362075884, "grad_norm": 0.12504595800375093, "learning_rate": 9.963216813010016e-05, "loss": 2.0665, "step": 4987 }, { "epoch": 0.038791400565539425, "grad_norm": 0.12615880059693754, "learning_rate": 9.963202018687315e-05, "loss": 2.0915, "step": 4988 }, { "epoch": 0.038799177510320006, "grad_norm": 0.11657373811091976, "learning_rate": 9.963187221401038e-05, "loss": 2.0798, "step": 4989 }, { "epoch": 0.03880695445510059, "grad_norm": 0.13049839833404178, "learning_rate": 9.963172421151192e-05, "loss": 2.0635, "step": 4990 }, { "epoch": 0.03881473139988117, "grad_norm": 0.12190493896430067, "learning_rate": 9.963157617937789e-05, "loss": 2.0948, "step": 4991 }, { "epoch": 0.03882250834466175, "grad_norm": 0.1215539655233263, "learning_rate": 9.963142811760832e-05, "loss": 2.1107, "step": 4992 }, { "epoch": 0.03883028528944233, "grad_norm": 0.14571931452454445, "learning_rate": 9.963128002620337e-05, "loss": 2.1111, "step": 4993 }, { "epoch": 0.03883806223422291, "grad_norm": 0.14368776389352989, "learning_rate": 9.963113190516307e-05, "loss": 2.1197, "step": 4994 }, { "epoch": 0.03884583917900349, "grad_norm": 0.11703554175138373, "learning_rate": 9.963098375448753e-05, "loss": 2.0615, "step": 4995 }, { "epoch": 0.038853616123784074, "grad_norm": 0.12512998114732085, "learning_rate": 9.963083557417686e-05, "loss": 2.0058, "step": 4996 }, { "epoch": 0.038861393068564655, "grad_norm": 0.12763350782231134, "learning_rate": 9.963068736423112e-05, "loss": 2.0982, "step": 4997 }, { "epoch": 0.038869170013345236, "grad_norm": 0.11393598983246556, "learning_rate": 9.96305391246504e-05, "loss": 2.0494, "step": 4998 }, { "epoch": 0.03887694695812582, "grad_norm": 0.12261396568193571, "learning_rate": 9.963039085543482e-05, "loss": 2.0873, "step": 4999 }, { "epoch": 0.0388847239029064, "grad_norm": 0.1172054859301091, "learning_rate": 9.963024255658443e-05, "loss": 2.0322, "step": 5000 }, { "epoch": 0.03889250084768698, "grad_norm": 0.11663737487272928, "learning_rate": 9.963009422809933e-05, "loss": 2.1002, "step": 5001 }, { "epoch": 0.03890027779246756, "grad_norm": 0.11785659597627694, "learning_rate": 9.962994586997962e-05, "loss": 2.0531, "step": 5002 }, { "epoch": 0.03890805473724814, "grad_norm": 0.1400263992415459, "learning_rate": 9.962979748222538e-05, "loss": 2.1233, "step": 5003 }, { "epoch": 0.03891583168202872, "grad_norm": 0.15050037081758344, "learning_rate": 9.962964906483669e-05, "loss": 2.0299, "step": 5004 }, { "epoch": 0.038923608626809304, "grad_norm": 0.14091448865905906, "learning_rate": 9.962950061781365e-05, "loss": 2.0849, "step": 5005 }, { "epoch": 0.038931385571589885, "grad_norm": 0.1153769574187204, "learning_rate": 9.962935214115634e-05, "loss": 2.0598, "step": 5006 }, { "epoch": 0.03893916251637047, "grad_norm": 0.13293556070771156, "learning_rate": 9.962920363486486e-05, "loss": 2.0817, "step": 5007 }, { "epoch": 0.03894693946115105, "grad_norm": 0.13314245030674055, "learning_rate": 9.96290550989393e-05, "loss": 2.1077, "step": 5008 }, { "epoch": 0.03895471640593163, "grad_norm": 0.14142752403367825, "learning_rate": 9.962890653337974e-05, "loss": 2.067, "step": 5009 }, { "epoch": 0.03896249335071221, "grad_norm": 0.12633651925209988, "learning_rate": 9.962875793818627e-05, "loss": 2.0889, "step": 5010 }, { "epoch": 0.03897027029549279, "grad_norm": 0.11926746971436607, "learning_rate": 9.962860931335898e-05, "loss": 2.125, "step": 5011 }, { "epoch": 0.03897804724027337, "grad_norm": 0.1292234304203838, "learning_rate": 9.962846065889797e-05, "loss": 2.0928, "step": 5012 }, { "epoch": 0.038985824185053954, "grad_norm": 0.12572222397005914, "learning_rate": 9.962831197480329e-05, "loss": 2.0919, "step": 5013 }, { "epoch": 0.038993601129834535, "grad_norm": 0.11729845067916439, "learning_rate": 9.962816326107507e-05, "loss": 2.0875, "step": 5014 }, { "epoch": 0.039001378074615116, "grad_norm": 0.1180343167950436, "learning_rate": 9.962801451771338e-05, "loss": 2.0416, "step": 5015 }, { "epoch": 0.0390091550193957, "grad_norm": 0.123061533390317, "learning_rate": 9.962786574471833e-05, "loss": 2.1478, "step": 5016 }, { "epoch": 0.03901693196417628, "grad_norm": 0.12248356863951777, "learning_rate": 9.962771694208998e-05, "loss": 2.0752, "step": 5017 }, { "epoch": 0.03902470890895686, "grad_norm": 0.11422554325448515, "learning_rate": 9.962756810982843e-05, "loss": 2.1068, "step": 5018 }, { "epoch": 0.03903248585373745, "grad_norm": 0.12777059362507515, "learning_rate": 9.962741924793377e-05, "loss": 2.0914, "step": 5019 }, { "epoch": 0.03904026279851803, "grad_norm": 0.1250907032180404, "learning_rate": 9.962727035640609e-05, "loss": 2.1155, "step": 5020 }, { "epoch": 0.03904803974329861, "grad_norm": 0.1212855759604379, "learning_rate": 9.962712143524549e-05, "loss": 2.0549, "step": 5021 }, { "epoch": 0.03905581668807919, "grad_norm": 0.11537553043401635, "learning_rate": 9.962697248445203e-05, "loss": 2.0477, "step": 5022 }, { "epoch": 0.03906359363285977, "grad_norm": 0.12160964531340834, "learning_rate": 9.962682350402582e-05, "loss": 2.0761, "step": 5023 }, { "epoch": 0.03907137057764035, "grad_norm": 0.1693541380632191, "learning_rate": 9.962667449396696e-05, "loss": 2.0995, "step": 5024 }, { "epoch": 0.039079147522420934, "grad_norm": 0.15721750172745666, "learning_rate": 9.962652545427551e-05, "loss": 2.0832, "step": 5025 }, { "epoch": 0.039086924467201516, "grad_norm": 0.12755525917241736, "learning_rate": 9.962637638495156e-05, "loss": 2.1015, "step": 5026 }, { "epoch": 0.0390947014119821, "grad_norm": 0.14134385461739787, "learning_rate": 9.962622728599524e-05, "loss": 2.0969, "step": 5027 }, { "epoch": 0.03910247835676268, "grad_norm": 0.13409239862772382, "learning_rate": 9.962607815740658e-05, "loss": 2.0765, "step": 5028 }, { "epoch": 0.03911025530154326, "grad_norm": 0.11871820208960936, "learning_rate": 9.962592899918572e-05, "loss": 2.0886, "step": 5029 }, { "epoch": 0.03911803224632384, "grad_norm": 0.1358435262124343, "learning_rate": 9.962577981133272e-05, "loss": 2.039, "step": 5030 }, { "epoch": 0.03912580919110442, "grad_norm": 0.13637610170192566, "learning_rate": 9.962563059384768e-05, "loss": 2.0468, "step": 5031 }, { "epoch": 0.039133586135885, "grad_norm": 0.12929504099650205, "learning_rate": 9.962548134673071e-05, "loss": 2.0401, "step": 5032 }, { "epoch": 0.039141363080665584, "grad_norm": 0.11408341765745317, "learning_rate": 9.962533206998185e-05, "loss": 2.1303, "step": 5033 }, { "epoch": 0.039149140025446165, "grad_norm": 0.11428598695274271, "learning_rate": 9.962518276360122e-05, "loss": 2.1576, "step": 5034 }, { "epoch": 0.039156916970226746, "grad_norm": 0.11712645015957739, "learning_rate": 9.962503342758891e-05, "loss": 2.0839, "step": 5035 }, { "epoch": 0.03916469391500733, "grad_norm": 0.11528546527086704, "learning_rate": 9.9624884061945e-05, "loss": 2.134, "step": 5036 }, { "epoch": 0.03917247085978791, "grad_norm": 0.11850689199969988, "learning_rate": 9.962473466666957e-05, "loss": 2.1211, "step": 5037 }, { "epoch": 0.03918024780456849, "grad_norm": 0.11887783769392589, "learning_rate": 9.962458524176275e-05, "loss": 2.0722, "step": 5038 }, { "epoch": 0.03918802474934907, "grad_norm": 0.1125170815759668, "learning_rate": 9.962443578722458e-05, "loss": 2.0816, "step": 5039 }, { "epoch": 0.03919580169412965, "grad_norm": 0.13203950162970282, "learning_rate": 9.962428630305516e-05, "loss": 2.0914, "step": 5040 }, { "epoch": 0.03920357863891023, "grad_norm": 0.1424797429720863, "learning_rate": 9.962413678925461e-05, "loss": 2.0415, "step": 5041 }, { "epoch": 0.039211355583690814, "grad_norm": 0.11919275482088157, "learning_rate": 9.9623987245823e-05, "loss": 2.0935, "step": 5042 }, { "epoch": 0.039219132528471395, "grad_norm": 0.11590106691085517, "learning_rate": 9.962383767276041e-05, "loss": 2.1251, "step": 5043 }, { "epoch": 0.03922690947325198, "grad_norm": 0.12229582453785012, "learning_rate": 9.962368807006695e-05, "loss": 2.0503, "step": 5044 }, { "epoch": 0.03923468641803256, "grad_norm": 0.11654798938458642, "learning_rate": 9.962353843774268e-05, "loss": 2.0909, "step": 5045 }, { "epoch": 0.03924246336281314, "grad_norm": 0.1172343058944649, "learning_rate": 9.962338877578772e-05, "loss": 2.0749, "step": 5046 }, { "epoch": 0.03925024030759372, "grad_norm": 0.11167500052466528, "learning_rate": 9.962323908420213e-05, "loss": 2.0922, "step": 5047 }, { "epoch": 0.0392580172523743, "grad_norm": 0.11918096293002231, "learning_rate": 9.962308936298604e-05, "loss": 2.1091, "step": 5048 }, { "epoch": 0.03926579419715488, "grad_norm": 0.12619281495897305, "learning_rate": 9.96229396121395e-05, "loss": 2.0952, "step": 5049 }, { "epoch": 0.039273571141935464, "grad_norm": 0.12235119885638769, "learning_rate": 9.96227898316626e-05, "loss": 2.082, "step": 5050 }, { "epoch": 0.039281348086716045, "grad_norm": 0.11918955912443834, "learning_rate": 9.962264002155546e-05, "loss": 2.1095, "step": 5051 }, { "epoch": 0.039289125031496626, "grad_norm": 0.11566700518909932, "learning_rate": 9.962249018181815e-05, "loss": 2.0778, "step": 5052 }, { "epoch": 0.03929690197627721, "grad_norm": 0.12279719187812764, "learning_rate": 9.962234031245076e-05, "loss": 2.053, "step": 5053 }, { "epoch": 0.03930467892105779, "grad_norm": 0.1288536607072664, "learning_rate": 9.962219041345337e-05, "loss": 2.0106, "step": 5054 }, { "epoch": 0.03931245586583837, "grad_norm": 0.14434526897294014, "learning_rate": 9.96220404848261e-05, "loss": 2.0605, "step": 5055 }, { "epoch": 0.03932023281061895, "grad_norm": 0.15491420184736412, "learning_rate": 9.962189052656902e-05, "loss": 2.0817, "step": 5056 }, { "epoch": 0.03932800975539953, "grad_norm": 0.1347771476123101, "learning_rate": 9.96217405386822e-05, "loss": 2.0665, "step": 5057 }, { "epoch": 0.03933578670018011, "grad_norm": 0.12103748369724918, "learning_rate": 9.962159052116578e-05, "loss": 2.0387, "step": 5058 }, { "epoch": 0.039343563644960694, "grad_norm": 0.152424911855765, "learning_rate": 9.96214404740198e-05, "loss": 2.0807, "step": 5059 }, { "epoch": 0.039351340589741275, "grad_norm": 0.20581212560453493, "learning_rate": 9.962129039724436e-05, "loss": 2.0618, "step": 5060 }, { "epoch": 0.039359117534521856, "grad_norm": 0.2307134369958741, "learning_rate": 9.962114029083959e-05, "loss": 2.07, "step": 5061 }, { "epoch": 0.03936689447930244, "grad_norm": 0.19956664191907383, "learning_rate": 9.962099015480553e-05, "loss": 2.1061, "step": 5062 }, { "epoch": 0.03937467142408302, "grad_norm": 0.13290363571421998, "learning_rate": 9.962083998914229e-05, "loss": 2.0929, "step": 5063 }, { "epoch": 0.0393824483688636, "grad_norm": 0.14523107807796923, "learning_rate": 9.962068979384995e-05, "loss": 2.0482, "step": 5064 }, { "epoch": 0.03939022531364418, "grad_norm": 0.19929670913046302, "learning_rate": 9.962053956892861e-05, "loss": 2.0529, "step": 5065 }, { "epoch": 0.03939800225842476, "grad_norm": 0.18272395765660918, "learning_rate": 9.962038931437835e-05, "loss": 2.0415, "step": 5066 }, { "epoch": 0.03940577920320534, "grad_norm": 0.11435558503435138, "learning_rate": 9.962023903019929e-05, "loss": 2.0487, "step": 5067 }, { "epoch": 0.039413556147985924, "grad_norm": 0.15485028467437428, "learning_rate": 9.962008871639148e-05, "loss": 2.0733, "step": 5068 }, { "epoch": 0.039421333092766506, "grad_norm": 0.19057651683005639, "learning_rate": 9.961993837295502e-05, "loss": 2.1184, "step": 5069 }, { "epoch": 0.03942911003754709, "grad_norm": 0.1320508696271019, "learning_rate": 9.961978799989003e-05, "loss": 2.0296, "step": 5070 }, { "epoch": 0.03943688698232767, "grad_norm": 0.15290631938378363, "learning_rate": 9.961963759719655e-05, "loss": 2.0791, "step": 5071 }, { "epoch": 0.03944466392710825, "grad_norm": 0.2044793237144183, "learning_rate": 9.961948716487471e-05, "loss": 2.0812, "step": 5072 }, { "epoch": 0.03945244087188883, "grad_norm": 0.14590983620788892, "learning_rate": 9.961933670292458e-05, "loss": 2.082, "step": 5073 }, { "epoch": 0.03946021781666941, "grad_norm": 0.12603868257959933, "learning_rate": 9.961918621134627e-05, "loss": 2.1299, "step": 5074 }, { "epoch": 0.03946799476144999, "grad_norm": 0.1584168705843285, "learning_rate": 9.961903569013985e-05, "loss": 2.0959, "step": 5075 }, { "epoch": 0.039475771706230574, "grad_norm": 0.12706375438623246, "learning_rate": 9.961888513930539e-05, "loss": 2.08, "step": 5076 }, { "epoch": 0.039483548651011155, "grad_norm": 0.11668170907802677, "learning_rate": 9.961873455884303e-05, "loss": 2.0753, "step": 5077 }, { "epoch": 0.03949132559579174, "grad_norm": 0.6176369936864403, "learning_rate": 9.961858394875282e-05, "loss": 2.086, "step": 5078 }, { "epoch": 0.039499102540572324, "grad_norm": 0.12008427414973487, "learning_rate": 9.961843330903486e-05, "loss": 2.0664, "step": 5079 }, { "epoch": 0.039506879485352905, "grad_norm": 0.12808741198788431, "learning_rate": 9.961828263968926e-05, "loss": 2.0339, "step": 5080 }, { "epoch": 0.03951465643013349, "grad_norm": 0.12434080198702235, "learning_rate": 9.961813194071609e-05, "loss": 2.0757, "step": 5081 }, { "epoch": 0.03952243337491407, "grad_norm": 0.11836921318959165, "learning_rate": 9.961798121211546e-05, "loss": 2.0802, "step": 5082 }, { "epoch": 0.03953021031969465, "grad_norm": 0.11566433147369853, "learning_rate": 9.961783045388741e-05, "loss": 2.1137, "step": 5083 }, { "epoch": 0.03953798726447523, "grad_norm": 0.1181080356787633, "learning_rate": 9.961767966603209e-05, "loss": 2.0929, "step": 5084 }, { "epoch": 0.03954576420925581, "grad_norm": 0.12198501990555241, "learning_rate": 9.961752884854955e-05, "loss": 2.0763, "step": 5085 }, { "epoch": 0.03955354115403639, "grad_norm": 0.13094303912959696, "learning_rate": 9.961737800143991e-05, "loss": 2.0786, "step": 5086 }, { "epoch": 0.039561318098816974, "grad_norm": 0.13677538157024416, "learning_rate": 9.961722712470325e-05, "loss": 2.088, "step": 5087 }, { "epoch": 0.039569095043597555, "grad_norm": 0.1271189692033234, "learning_rate": 9.961707621833963e-05, "loss": 2.0917, "step": 5088 }, { "epoch": 0.039576871988378136, "grad_norm": 0.11796167249224661, "learning_rate": 9.961692528234918e-05, "loss": 2.0289, "step": 5089 }, { "epoch": 0.03958464893315872, "grad_norm": 0.2586861797036005, "learning_rate": 9.961677431673198e-05, "loss": 2.0717, "step": 5090 }, { "epoch": 0.0395924258779393, "grad_norm": 0.12230122483772271, "learning_rate": 9.96166233214881e-05, "loss": 2.0868, "step": 5091 }, { "epoch": 0.03960020282271988, "grad_norm": 0.11921283143528082, "learning_rate": 9.961647229661766e-05, "loss": 2.1136, "step": 5092 }, { "epoch": 0.03960797976750046, "grad_norm": 0.11885758763465294, "learning_rate": 9.961632124212073e-05, "loss": 2.1185, "step": 5093 }, { "epoch": 0.03961575671228104, "grad_norm": 0.11731335100952803, "learning_rate": 9.96161701579974e-05, "loss": 2.0886, "step": 5094 }, { "epoch": 0.03962353365706162, "grad_norm": 0.12116020745727232, "learning_rate": 9.961601904424778e-05, "loss": 2.1566, "step": 5095 }, { "epoch": 0.039631310601842204, "grad_norm": 0.1254607286872775, "learning_rate": 9.961586790087193e-05, "loss": 2.1347, "step": 5096 }, { "epoch": 0.039639087546622785, "grad_norm": 0.1332461210859082, "learning_rate": 9.961571672786995e-05, "loss": 2.1166, "step": 5097 }, { "epoch": 0.039646864491403366, "grad_norm": 0.14230750618523802, "learning_rate": 9.961556552524197e-05, "loss": 2.0545, "step": 5098 }, { "epoch": 0.03965464143618395, "grad_norm": 0.13315977472629115, "learning_rate": 9.961541429298802e-05, "loss": 2.0832, "step": 5099 }, { "epoch": 0.03966241838096453, "grad_norm": 0.12456122468240124, "learning_rate": 9.961526303110822e-05, "loss": 2.0805, "step": 5100 }, { "epoch": 0.03967019532574511, "grad_norm": 0.11764759576201574, "learning_rate": 9.961511173960267e-05, "loss": 2.0631, "step": 5101 }, { "epoch": 0.03967797227052569, "grad_norm": 0.1474680540539529, "learning_rate": 9.961496041847144e-05, "loss": 2.1126, "step": 5102 }, { "epoch": 0.03968574921530627, "grad_norm": 0.16679493756225702, "learning_rate": 9.961480906771462e-05, "loss": 2.0767, "step": 5103 }, { "epoch": 0.03969352616008685, "grad_norm": 0.12667672529225132, "learning_rate": 9.961465768733231e-05, "loss": 2.0832, "step": 5104 }, { "epoch": 0.039701303104867434, "grad_norm": 0.12141763467967148, "learning_rate": 9.961450627732461e-05, "loss": 2.0604, "step": 5105 }, { "epoch": 0.039709080049648016, "grad_norm": 0.1635476341342939, "learning_rate": 9.96143548376916e-05, "loss": 2.1048, "step": 5106 }, { "epoch": 0.0397168569944286, "grad_norm": 0.18996160346265906, "learning_rate": 9.961420336843337e-05, "loss": 2.0254, "step": 5107 }, { "epoch": 0.03972463393920918, "grad_norm": 0.15691286576469698, "learning_rate": 9.961405186955e-05, "loss": 2.0444, "step": 5108 }, { "epoch": 0.03973241088398976, "grad_norm": 0.12428061899853228, "learning_rate": 9.96139003410416e-05, "loss": 2.047, "step": 5109 }, { "epoch": 0.03974018782877034, "grad_norm": 0.15108189349213652, "learning_rate": 9.961374878290825e-05, "loss": 2.1239, "step": 5110 }, { "epoch": 0.03974796477355092, "grad_norm": 0.18360380503388646, "learning_rate": 9.961359719515004e-05, "loss": 2.1249, "step": 5111 }, { "epoch": 0.0397557417183315, "grad_norm": 0.14968111773771936, "learning_rate": 9.961344557776706e-05, "loss": 2.1085, "step": 5112 }, { "epoch": 0.039763518663112084, "grad_norm": 0.11770432169834424, "learning_rate": 9.96132939307594e-05, "loss": 2.125, "step": 5113 }, { "epoch": 0.039771295607892665, "grad_norm": 0.16866280246222512, "learning_rate": 9.961314225412716e-05, "loss": 2.0428, "step": 5114 }, { "epoch": 0.039779072552673246, "grad_norm": 0.16319837962958864, "learning_rate": 9.961299054787042e-05, "loss": 2.0748, "step": 5115 }, { "epoch": 0.03978684949745383, "grad_norm": 0.13797595119104758, "learning_rate": 9.961283881198929e-05, "loss": 2.0716, "step": 5116 }, { "epoch": 0.03979462644223441, "grad_norm": 0.12131726374413881, "learning_rate": 9.961268704648383e-05, "loss": 2.1537, "step": 5117 }, { "epoch": 0.03980240338701499, "grad_norm": 0.13790960009386177, "learning_rate": 9.961253525135414e-05, "loss": 2.0452, "step": 5118 }, { "epoch": 0.03981018033179557, "grad_norm": 0.12997048300220668, "learning_rate": 9.961238342660033e-05, "loss": 2.0866, "step": 5119 }, { "epoch": 0.03981795727657615, "grad_norm": 0.11792449513266581, "learning_rate": 9.961223157222247e-05, "loss": 2.1204, "step": 5120 }, { "epoch": 0.03982573422135673, "grad_norm": 0.13892217784421684, "learning_rate": 9.961207968822066e-05, "loss": 2.0811, "step": 5121 }, { "epoch": 0.039833511166137314, "grad_norm": 0.12166498580440237, "learning_rate": 9.961192777459498e-05, "loss": 2.0718, "step": 5122 }, { "epoch": 0.039841288110917895, "grad_norm": 0.1158992163259535, "learning_rate": 9.961177583134553e-05, "loss": 2.0623, "step": 5123 }, { "epoch": 0.03984906505569848, "grad_norm": 0.14821745346454504, "learning_rate": 9.961162385847241e-05, "loss": 2.0714, "step": 5124 }, { "epoch": 0.03985684200047906, "grad_norm": 0.1382708540862673, "learning_rate": 9.96114718559757e-05, "loss": 2.0844, "step": 5125 }, { "epoch": 0.03986461894525964, "grad_norm": 0.11853859087419624, "learning_rate": 9.961131982385549e-05, "loss": 2.0389, "step": 5126 }, { "epoch": 0.03987239589004022, "grad_norm": 0.11653208623268989, "learning_rate": 9.961116776211187e-05, "loss": 2.0803, "step": 5127 }, { "epoch": 0.0398801728348208, "grad_norm": 0.11212583607465136, "learning_rate": 9.961101567074494e-05, "loss": 2.1348, "step": 5128 }, { "epoch": 0.03988794977960138, "grad_norm": 0.11008273238961315, "learning_rate": 9.961086354975478e-05, "loss": 2.039, "step": 5129 }, { "epoch": 0.039895726724381964, "grad_norm": 0.12578315898776435, "learning_rate": 9.961071139914146e-05, "loss": 2.1269, "step": 5130 }, { "epoch": 0.039903503669162545, "grad_norm": 0.11546600287999192, "learning_rate": 9.961055921890513e-05, "loss": 2.1049, "step": 5131 }, { "epoch": 0.039911280613943126, "grad_norm": 0.12332979438591503, "learning_rate": 9.961040700904583e-05, "loss": 2.0718, "step": 5132 }, { "epoch": 0.03991905755872371, "grad_norm": 0.16228755285764526, "learning_rate": 9.961025476956367e-05, "loss": 2.0818, "step": 5133 }, { "epoch": 0.03992683450350429, "grad_norm": 0.1640437927731583, "learning_rate": 9.961010250045874e-05, "loss": 2.0575, "step": 5134 }, { "epoch": 0.03993461144828487, "grad_norm": 0.122144808814217, "learning_rate": 9.960995020173113e-05, "loss": 2.0205, "step": 5135 }, { "epoch": 0.03994238839306545, "grad_norm": 0.1599982865301066, "learning_rate": 9.960979787338093e-05, "loss": 2.1002, "step": 5136 }, { "epoch": 0.03995016533784604, "grad_norm": 0.16269441184171163, "learning_rate": 9.960964551540822e-05, "loss": 2.0336, "step": 5137 }, { "epoch": 0.03995794228262662, "grad_norm": 0.14410485607042117, "learning_rate": 9.960949312781311e-05, "loss": 2.0972, "step": 5138 }, { "epoch": 0.0399657192274072, "grad_norm": 0.11877934889689029, "learning_rate": 9.960934071059568e-05, "loss": 2.035, "step": 5139 }, { "epoch": 0.03997349617218778, "grad_norm": 0.11133556566390082, "learning_rate": 9.960918826375604e-05, "loss": 2.0373, "step": 5140 }, { "epoch": 0.03998127311696836, "grad_norm": 0.1153773000639069, "learning_rate": 9.960903578729424e-05, "loss": 1.9964, "step": 5141 }, { "epoch": 0.039989050061748944, "grad_norm": 0.11889906154256047, "learning_rate": 9.960888328121042e-05, "loss": 2.1049, "step": 5142 }, { "epoch": 0.039996827006529526, "grad_norm": 0.13142683562130497, "learning_rate": 9.960873074550463e-05, "loss": 2.165, "step": 5143 }, { "epoch": 0.04000460395131011, "grad_norm": 0.11775302388902174, "learning_rate": 9.960857818017699e-05, "loss": 2.1146, "step": 5144 }, { "epoch": 0.04001238089609069, "grad_norm": 0.12748696909612545, "learning_rate": 9.960842558522757e-05, "loss": 2.1069, "step": 5145 }, { "epoch": 0.04002015784087127, "grad_norm": 0.13026746984158297, "learning_rate": 9.960827296065646e-05, "loss": 2.0619, "step": 5146 }, { "epoch": 0.04002793478565185, "grad_norm": 0.12002545240118334, "learning_rate": 9.960812030646378e-05, "loss": 2.1171, "step": 5147 }, { "epoch": 0.04003571173043243, "grad_norm": 0.12178319815025379, "learning_rate": 9.96079676226496e-05, "loss": 2.0442, "step": 5148 }, { "epoch": 0.04004348867521301, "grad_norm": 0.1513675983803148, "learning_rate": 9.960781490921401e-05, "loss": 2.0734, "step": 5149 }, { "epoch": 0.040051265619993594, "grad_norm": 0.17260756416442838, "learning_rate": 9.960766216615711e-05, "loss": 2.041, "step": 5150 }, { "epoch": 0.040059042564774175, "grad_norm": 0.1425657306683362, "learning_rate": 9.960750939347899e-05, "loss": 2.0573, "step": 5151 }, { "epoch": 0.040066819509554756, "grad_norm": 0.11930949182791854, "learning_rate": 9.960735659117972e-05, "loss": 2.0703, "step": 5152 }, { "epoch": 0.04007459645433534, "grad_norm": 0.19472395490014635, "learning_rate": 9.960720375925942e-05, "loss": 2.0465, "step": 5153 }, { "epoch": 0.04008237339911592, "grad_norm": 0.12786343359439387, "learning_rate": 9.960705089771817e-05, "loss": 2.0931, "step": 5154 }, { "epoch": 0.0400901503438965, "grad_norm": 0.11856809046223249, "learning_rate": 9.960689800655605e-05, "loss": 2.0609, "step": 5155 }, { "epoch": 0.04009792728867708, "grad_norm": 0.12036257728294052, "learning_rate": 9.960674508577318e-05, "loss": 2.1363, "step": 5156 }, { "epoch": 0.04010570423345766, "grad_norm": 0.1336577036880374, "learning_rate": 9.960659213536962e-05, "loss": 2.0762, "step": 5157 }, { "epoch": 0.04011348117823824, "grad_norm": 0.1271091100188423, "learning_rate": 9.960643915534549e-05, "loss": 2.0592, "step": 5158 }, { "epoch": 0.040121258123018824, "grad_norm": 0.11422610912291005, "learning_rate": 9.960628614570084e-05, "loss": 2.0555, "step": 5159 }, { "epoch": 0.040129035067799405, "grad_norm": 0.17003052348879802, "learning_rate": 9.960613310643582e-05, "loss": 2.1543, "step": 5160 }, { "epoch": 0.04013681201257999, "grad_norm": 0.1864874031115013, "learning_rate": 9.960598003755047e-05, "loss": 2.13, "step": 5161 }, { "epoch": 0.04014458895736057, "grad_norm": 0.17653327553750786, "learning_rate": 9.96058269390449e-05, "loss": 2.0248, "step": 5162 }, { "epoch": 0.04015236590214115, "grad_norm": 0.1261272512537126, "learning_rate": 9.96056738109192e-05, "loss": 2.0794, "step": 5163 }, { "epoch": 0.04016014284692173, "grad_norm": 0.13595442756976875, "learning_rate": 9.960552065317348e-05, "loss": 2.062, "step": 5164 }, { "epoch": 0.04016791979170231, "grad_norm": 0.19596132314411507, "learning_rate": 9.96053674658078e-05, "loss": 2.0491, "step": 5165 }, { "epoch": 0.04017569673648289, "grad_norm": 0.22606741881624018, "learning_rate": 9.960521424882228e-05, "loss": 2.1397, "step": 5166 }, { "epoch": 0.040183473681263474, "grad_norm": 0.11221045901050435, "learning_rate": 9.960506100221698e-05, "loss": 2.0346, "step": 5167 }, { "epoch": 0.040191250626044055, "grad_norm": 0.16439881533323839, "learning_rate": 9.960490772599202e-05, "loss": 2.1168, "step": 5168 }, { "epoch": 0.040199027570824636, "grad_norm": 0.2306677919130171, "learning_rate": 9.960475442014748e-05, "loss": 2.0713, "step": 5169 }, { "epoch": 0.04020680451560522, "grad_norm": 0.21875798200474486, "learning_rate": 9.960460108468345e-05, "loss": 2.1103, "step": 5170 }, { "epoch": 0.0402145814603858, "grad_norm": 0.15905243503038669, "learning_rate": 9.960444771960002e-05, "loss": 2.1051, "step": 5171 }, { "epoch": 0.04022235840516638, "grad_norm": 0.11569428255965156, "learning_rate": 9.96042943248973e-05, "loss": 2.0562, "step": 5172 }, { "epoch": 0.04023013534994696, "grad_norm": 0.16553471863472743, "learning_rate": 9.960414090057536e-05, "loss": 2.0637, "step": 5173 }, { "epoch": 0.04023791229472754, "grad_norm": 0.15226281740384254, "learning_rate": 9.96039874466343e-05, "loss": 2.0471, "step": 5174 }, { "epoch": 0.04024568923950812, "grad_norm": 0.1160216278051244, "learning_rate": 9.96038339630742e-05, "loss": 2.0558, "step": 5175 }, { "epoch": 0.040253466184288704, "grad_norm": 0.15115989329142845, "learning_rate": 9.960368044989516e-05, "loss": 2.0846, "step": 5176 }, { "epoch": 0.040261243129069285, "grad_norm": 0.16065841136940948, "learning_rate": 9.960352690709729e-05, "loss": 2.086, "step": 5177 }, { "epoch": 0.040269020073849866, "grad_norm": 0.13135002791857428, "learning_rate": 9.960337333468066e-05, "loss": 2.0698, "step": 5178 }, { "epoch": 0.04027679701863045, "grad_norm": 0.12270033537988241, "learning_rate": 9.960321973264537e-05, "loss": 2.0888, "step": 5179 }, { "epoch": 0.04028457396341103, "grad_norm": 0.173294449945219, "learning_rate": 9.96030661009915e-05, "loss": 2.1254, "step": 5180 }, { "epoch": 0.04029235090819161, "grad_norm": 0.17368588357801035, "learning_rate": 9.960291243971916e-05, "loss": 2.0645, "step": 5181 }, { "epoch": 0.04030012785297219, "grad_norm": 0.12530588984992302, "learning_rate": 9.960275874882843e-05, "loss": 2.0402, "step": 5182 }, { "epoch": 0.04030790479775277, "grad_norm": 0.17287719773579646, "learning_rate": 9.96026050283194e-05, "loss": 2.11, "step": 5183 }, { "epoch": 0.04031568174253335, "grad_norm": 0.17006134145909355, "learning_rate": 9.960245127819217e-05, "loss": 2.1075, "step": 5184 }, { "epoch": 0.040323458687313934, "grad_norm": 0.14269170186820057, "learning_rate": 9.960229749844682e-05, "loss": 2.1174, "step": 5185 }, { "epoch": 0.040331235632094516, "grad_norm": 0.11457888291286168, "learning_rate": 9.960214368908344e-05, "loss": 2.1291, "step": 5186 }, { "epoch": 0.0403390125768751, "grad_norm": 0.14781889432345027, "learning_rate": 9.960198985010216e-05, "loss": 2.0709, "step": 5187 }, { "epoch": 0.04034678952165568, "grad_norm": 0.15497006410061406, "learning_rate": 9.960183598150303e-05, "loss": 2.1183, "step": 5188 }, { "epoch": 0.04035456646643626, "grad_norm": 0.12659700906079124, "learning_rate": 9.960168208328616e-05, "loss": 2.0426, "step": 5189 }, { "epoch": 0.04036234341121684, "grad_norm": 0.11360905134940216, "learning_rate": 9.960152815545164e-05, "loss": 2.0362, "step": 5190 }, { "epoch": 0.04037012035599742, "grad_norm": 0.12162067145711668, "learning_rate": 9.960137419799953e-05, "loss": 2.0861, "step": 5191 }, { "epoch": 0.040377897300778, "grad_norm": 0.11312865899858646, "learning_rate": 9.960122021092997e-05, "loss": 2.1121, "step": 5192 }, { "epoch": 0.040385674245558584, "grad_norm": 0.11649370225912026, "learning_rate": 9.960106619424304e-05, "loss": 2.0615, "step": 5193 }, { "epoch": 0.040393451190339165, "grad_norm": 0.14766639538404308, "learning_rate": 9.960091214793881e-05, "loss": 2.076, "step": 5194 }, { "epoch": 0.040401228135119746, "grad_norm": 0.15760365786816147, "learning_rate": 9.96007580720174e-05, "loss": 2.0361, "step": 5195 }, { "epoch": 0.04040900507990033, "grad_norm": 0.1234492597068775, "learning_rate": 9.960060396647888e-05, "loss": 2.0774, "step": 5196 }, { "epoch": 0.040416782024680915, "grad_norm": 0.13745179889323075, "learning_rate": 9.960044983132335e-05, "loss": 2.09, "step": 5197 }, { "epoch": 0.040424558969461497, "grad_norm": 0.16942766493132244, "learning_rate": 9.960029566655092e-05, "loss": 2.1145, "step": 5198 }, { "epoch": 0.04043233591424208, "grad_norm": 0.22122405432330045, "learning_rate": 9.960014147216163e-05, "loss": 2.124, "step": 5199 }, { "epoch": 0.04044011285902266, "grad_norm": 0.13063267847908291, "learning_rate": 9.959998724815564e-05, "loss": 2.1485, "step": 5200 }, { "epoch": 0.04044788980380324, "grad_norm": 0.13978222125973444, "learning_rate": 9.959983299453299e-05, "loss": 2.0674, "step": 5201 }, { "epoch": 0.04045566674858382, "grad_norm": 0.2009747484722993, "learning_rate": 9.959967871129381e-05, "loss": 2.0905, "step": 5202 }, { "epoch": 0.0404634436933644, "grad_norm": 0.12736006592056384, "learning_rate": 9.959952439843816e-05, "loss": 2.0831, "step": 5203 }, { "epoch": 0.040471220638144983, "grad_norm": 0.12315244217915632, "learning_rate": 9.959937005596614e-05, "loss": 2.0274, "step": 5204 }, { "epoch": 0.040478997582925565, "grad_norm": 0.19303718955400329, "learning_rate": 9.959921568387786e-05, "loss": 2.064, "step": 5205 }, { "epoch": 0.040486774527706146, "grad_norm": 0.12537588898670815, "learning_rate": 9.95990612821734e-05, "loss": 2.1018, "step": 5206 }, { "epoch": 0.04049455147248673, "grad_norm": 0.12163577074750825, "learning_rate": 9.959890685085284e-05, "loss": 1.9971, "step": 5207 }, { "epoch": 0.04050232841726731, "grad_norm": 0.152867629266479, "learning_rate": 9.95987523899163e-05, "loss": 2.1543, "step": 5208 }, { "epoch": 0.04051010536204789, "grad_norm": 0.3140140119059895, "learning_rate": 9.959859789936383e-05, "loss": 2.0828, "step": 5209 }, { "epoch": 0.04051788230682847, "grad_norm": 0.11404095685904642, "learning_rate": 9.959844337919557e-05, "loss": 2.0405, "step": 5210 }, { "epoch": 0.04052565925160905, "grad_norm": 0.20743995592972783, "learning_rate": 9.959828882941158e-05, "loss": 2.0877, "step": 5211 }, { "epoch": 0.04053343619638963, "grad_norm": 0.287636952694962, "learning_rate": 9.959813425001196e-05, "loss": 2.0827, "step": 5212 }, { "epoch": 0.040541213141170214, "grad_norm": 0.2426696246194093, "learning_rate": 9.959797964099682e-05, "loss": 2.0834, "step": 5213 }, { "epoch": 0.040548990085950795, "grad_norm": 0.14084297107730911, "learning_rate": 9.959782500236623e-05, "loss": 2.1115, "step": 5214 }, { "epoch": 0.040556767030731376, "grad_norm": 0.14146784602975374, "learning_rate": 9.95976703341203e-05, "loss": 2.1484, "step": 5215 }, { "epoch": 0.04056454397551196, "grad_norm": 0.1723321044237962, "learning_rate": 9.95975156362591e-05, "loss": 2.085, "step": 5216 }, { "epoch": 0.04057232092029254, "grad_norm": 0.14438096358222013, "learning_rate": 9.959736090878273e-05, "loss": 2.0649, "step": 5217 }, { "epoch": 0.04058009786507312, "grad_norm": 0.13854607054844148, "learning_rate": 9.95972061516913e-05, "loss": 2.0672, "step": 5218 }, { "epoch": 0.0405878748098537, "grad_norm": 0.11731332688952668, "learning_rate": 9.95970513649849e-05, "loss": 2.1261, "step": 5219 }, { "epoch": 0.04059565175463428, "grad_norm": 0.17742414790740008, "learning_rate": 9.959689654866358e-05, "loss": 2.0472, "step": 5220 }, { "epoch": 0.04060342869941486, "grad_norm": 0.12909746604435865, "learning_rate": 9.959674170272749e-05, "loss": 2.0785, "step": 5221 }, { "epoch": 0.040611205644195444, "grad_norm": 0.11964415045435506, "learning_rate": 9.95965868271767e-05, "loss": 2.0679, "step": 5222 }, { "epoch": 0.040618982588976026, "grad_norm": 0.41457699311597557, "learning_rate": 9.959643192201129e-05, "loss": 2.087, "step": 5223 }, { "epoch": 0.04062675953375661, "grad_norm": 0.11453159640163486, "learning_rate": 9.959627698723136e-05, "loss": 2.1195, "step": 5224 }, { "epoch": 0.04063453647853719, "grad_norm": 0.13564550238355372, "learning_rate": 9.959612202283698e-05, "loss": 2.0507, "step": 5225 }, { "epoch": 0.04064231342331777, "grad_norm": 0.12749413009744442, "learning_rate": 9.95959670288283e-05, "loss": 2.0295, "step": 5226 }, { "epoch": 0.04065009036809835, "grad_norm": 0.12106137214373464, "learning_rate": 9.959581200520538e-05, "loss": 2.0599, "step": 5227 }, { "epoch": 0.04065786731287893, "grad_norm": 0.13747279964318004, "learning_rate": 9.95956569519683e-05, "loss": 2.0842, "step": 5228 }, { "epoch": 0.04066564425765951, "grad_norm": 0.12094845521603591, "learning_rate": 9.959550186911717e-05, "loss": 2.0173, "step": 5229 }, { "epoch": 0.040673421202440094, "grad_norm": 0.12339872573929085, "learning_rate": 9.959534675665208e-05, "loss": 2.0717, "step": 5230 }, { "epoch": 0.040681198147220675, "grad_norm": 0.11856606509073417, "learning_rate": 9.95951916145731e-05, "loss": 2.0792, "step": 5231 }, { "epoch": 0.040688975092001256, "grad_norm": 0.12019056974804224, "learning_rate": 9.959503644288035e-05, "loss": 2.0673, "step": 5232 }, { "epoch": 0.04069675203678184, "grad_norm": 0.11784163659249894, "learning_rate": 9.959488124157392e-05, "loss": 2.0844, "step": 5233 }, { "epoch": 0.04070452898156242, "grad_norm": 0.19375982838557407, "learning_rate": 9.959472601065392e-05, "loss": 2.1134, "step": 5234 }, { "epoch": 0.040712305926343, "grad_norm": 0.11745913343433684, "learning_rate": 9.959457075012039e-05, "loss": 2.1124, "step": 5235 }, { "epoch": 0.04072008287112358, "grad_norm": 0.12149799594596304, "learning_rate": 9.959441545997347e-05, "loss": 2.0911, "step": 5236 }, { "epoch": 0.04072785981590416, "grad_norm": 0.11568003914418558, "learning_rate": 9.959426014021323e-05, "loss": 2.0916, "step": 5237 }, { "epoch": 0.04073563676068474, "grad_norm": 0.4219493955202549, "learning_rate": 9.959410479083977e-05, "loss": 2.0869, "step": 5238 }, { "epoch": 0.040743413705465324, "grad_norm": 0.11705017854194857, "learning_rate": 9.959394941185316e-05, "loss": 2.0941, "step": 5239 }, { "epoch": 0.040751190650245905, "grad_norm": 0.2836575391557583, "learning_rate": 9.959379400325355e-05, "loss": 2.0596, "step": 5240 }, { "epoch": 0.040758967595026487, "grad_norm": 0.12472978769130846, "learning_rate": 9.959363856504098e-05, "loss": 2.0393, "step": 5241 }, { "epoch": 0.04076674453980707, "grad_norm": 0.20122115199856896, "learning_rate": 9.959348309721556e-05, "loss": 2.0875, "step": 5242 }, { "epoch": 0.04077452148458765, "grad_norm": 0.8829254348601033, "learning_rate": 9.959332759977738e-05, "loss": 2.0838, "step": 5243 }, { "epoch": 0.04078229842936823, "grad_norm": 0.2841986050967996, "learning_rate": 9.959317207272653e-05, "loss": 2.0675, "step": 5244 }, { "epoch": 0.04079007537414881, "grad_norm": 0.40962273796538223, "learning_rate": 9.959301651606311e-05, "loss": 2.0812, "step": 5245 }, { "epoch": 0.04079785231892939, "grad_norm": 0.3092908884354573, "learning_rate": 9.959286092978724e-05, "loss": 2.0293, "step": 5246 }, { "epoch": 0.040805629263709974, "grad_norm": 0.15097936245080007, "learning_rate": 9.959270531389895e-05, "loss": 2.1048, "step": 5247 }, { "epoch": 0.040813406208490555, "grad_norm": 0.1700489702492148, "learning_rate": 9.959254966839838e-05, "loss": 2.111, "step": 5248 }, { "epoch": 0.040821183153271136, "grad_norm": 5.216997417444142, "learning_rate": 9.959239399328562e-05, "loss": 2.1476, "step": 5249 }, { "epoch": 0.04082896009805172, "grad_norm": 0.38266880239093815, "learning_rate": 9.959223828856074e-05, "loss": 2.1297, "step": 5250 }, { "epoch": 0.0408367370428323, "grad_norm": 0.5521904109481229, "learning_rate": 9.959208255422384e-05, "loss": 2.1124, "step": 5251 }, { "epoch": 0.04084451398761288, "grad_norm": 0.2005239718306122, "learning_rate": 9.959192679027502e-05, "loss": 2.0663, "step": 5252 }, { "epoch": 0.04085229093239346, "grad_norm": 0.44186856581300893, "learning_rate": 9.959177099671439e-05, "loss": 2.0439, "step": 5253 }, { "epoch": 0.04086006787717404, "grad_norm": 0.5777826105589446, "learning_rate": 9.9591615173542e-05, "loss": 2.0916, "step": 5254 }, { "epoch": 0.04086784482195462, "grad_norm": 0.18989726180306685, "learning_rate": 9.9591459320758e-05, "loss": 2.085, "step": 5255 }, { "epoch": 0.04087562176673521, "grad_norm": 0.6312764679145041, "learning_rate": 9.959130343836242e-05, "loss": 2.0638, "step": 5256 }, { "epoch": 0.04088339871151579, "grad_norm": 0.540241118910758, "learning_rate": 9.95911475263554e-05, "loss": 2.0815, "step": 5257 }, { "epoch": 0.04089117565629637, "grad_norm": 0.22511705396328094, "learning_rate": 9.959099158473701e-05, "loss": 2.0608, "step": 5258 }, { "epoch": 0.040898952601076954, "grad_norm": 0.7155612514303936, "learning_rate": 9.959083561350737e-05, "loss": 2.1108, "step": 5259 }, { "epoch": 0.040906729545857536, "grad_norm": 0.4250847306957652, "learning_rate": 9.959067961266653e-05, "loss": 2.0968, "step": 5260 }, { "epoch": 0.04091450649063812, "grad_norm": 0.32060736081240476, "learning_rate": 9.959052358221461e-05, "loss": 2.0611, "step": 5261 }, { "epoch": 0.0409222834354187, "grad_norm": 0.7602341919483471, "learning_rate": 9.959036752215172e-05, "loss": 2.0917, "step": 5262 }, { "epoch": 0.04093006038019928, "grad_norm": 0.5266691234763933, "learning_rate": 9.95902114324779e-05, "loss": 2.0494, "step": 5263 }, { "epoch": 0.04093783732497986, "grad_norm": 0.25201733405491983, "learning_rate": 9.95900553131933e-05, "loss": 2.1318, "step": 5264 }, { "epoch": 0.04094561426976044, "grad_norm": 0.8482238150476483, "learning_rate": 9.958989916429799e-05, "loss": 2.0475, "step": 5265 }, { "epoch": 0.04095339121454102, "grad_norm": 0.6525793139178444, "learning_rate": 9.958974298579204e-05, "loss": 2.0839, "step": 5266 }, { "epoch": 0.040961168159321604, "grad_norm": 0.24151124532297427, "learning_rate": 9.958958677767559e-05, "loss": 2.0589, "step": 5267 }, { "epoch": 0.040968945104102185, "grad_norm": 1.1074918903179511, "learning_rate": 9.95894305399487e-05, "loss": 2.0975, "step": 5268 }, { "epoch": 0.040976722048882766, "grad_norm": 0.8571001270146411, "learning_rate": 9.958927427261148e-05, "loss": 2.1332, "step": 5269 }, { "epoch": 0.04098449899366335, "grad_norm": 0.738756438725149, "learning_rate": 9.9589117975664e-05, "loss": 2.1371, "step": 5270 }, { "epoch": 0.04099227593844393, "grad_norm": 0.5664439271630313, "learning_rate": 9.958896164910638e-05, "loss": 2.0852, "step": 5271 }, { "epoch": 0.04100005288322451, "grad_norm": 0.9696445639060843, "learning_rate": 9.95888052929387e-05, "loss": 2.1244, "step": 5272 }, { "epoch": 0.04100782982800509, "grad_norm": 0.7056754696874706, "learning_rate": 9.958864890716106e-05, "loss": 2.1581, "step": 5273 }, { "epoch": 0.04101560677278567, "grad_norm": 0.4094991109893762, "learning_rate": 9.958849249177354e-05, "loss": 2.1056, "step": 5274 }, { "epoch": 0.04102338371756625, "grad_norm": 0.8273006257541748, "learning_rate": 9.958833604677626e-05, "loss": 2.0994, "step": 5275 }, { "epoch": 0.041031160662346834, "grad_norm": 0.8602066236607284, "learning_rate": 9.958817957216928e-05, "loss": 2.1244, "step": 5276 }, { "epoch": 0.041038937607127415, "grad_norm": 0.37420590500395423, "learning_rate": 9.958802306795272e-05, "loss": 2.082, "step": 5277 }, { "epoch": 0.041046714551907997, "grad_norm": 2.1789324059735455, "learning_rate": 9.958786653412665e-05, "loss": 2.1319, "step": 5278 }, { "epoch": 0.04105449149668858, "grad_norm": 0.7628333113746751, "learning_rate": 9.958770997069117e-05, "loss": 2.1279, "step": 5279 }, { "epoch": 0.04106226844146916, "grad_norm": 0.27031552474841725, "learning_rate": 9.958755337764641e-05, "loss": 2.0984, "step": 5280 }, { "epoch": 0.04107004538624974, "grad_norm": 1.4202068411945834, "learning_rate": 9.95873967549924e-05, "loss": 2.109, "step": 5281 }, { "epoch": 0.04107782233103032, "grad_norm": 0.9882838745065328, "learning_rate": 9.958724010272929e-05, "loss": 2.1389, "step": 5282 }, { "epoch": 0.0410855992758109, "grad_norm": 0.8422809254042463, "learning_rate": 9.958708342085714e-05, "loss": 2.084, "step": 5283 }, { "epoch": 0.041093376220591483, "grad_norm": 0.3296578337235466, "learning_rate": 9.958692670937606e-05, "loss": 2.0592, "step": 5284 }, { "epoch": 0.041101153165372065, "grad_norm": 0.38947504267326205, "learning_rate": 9.958676996828613e-05, "loss": 2.0757, "step": 5285 }, { "epoch": 0.041108930110152646, "grad_norm": 0.6000956159068142, "learning_rate": 9.958661319758747e-05, "loss": 2.1608, "step": 5286 }, { "epoch": 0.04111670705493323, "grad_norm": 2.1330888171909597, "learning_rate": 9.958645639728012e-05, "loss": 2.1364, "step": 5287 }, { "epoch": 0.04112448399971381, "grad_norm": 0.8047050746608678, "learning_rate": 9.958629956736425e-05, "loss": 2.1308, "step": 5288 }, { "epoch": 0.04113226094449439, "grad_norm": 0.3693713526129923, "learning_rate": 9.958614270783988e-05, "loss": 2.0978, "step": 5289 }, { "epoch": 0.04114003788927497, "grad_norm": 0.9956261806516563, "learning_rate": 9.958598581870715e-05, "loss": 2.1021, "step": 5290 }, { "epoch": 0.04114781483405555, "grad_norm": 0.6503856029531583, "learning_rate": 9.958582889996612e-05, "loss": 2.0549, "step": 5291 }, { "epoch": 0.04115559177883613, "grad_norm": 0.44127519283506583, "learning_rate": 9.958567195161694e-05, "loss": 2.0915, "step": 5292 }, { "epoch": 0.041163368723616714, "grad_norm": 2.573242136586867, "learning_rate": 9.958551497365964e-05, "loss": 2.1343, "step": 5293 }, { "epoch": 0.041171145668397295, "grad_norm": 0.34497849935891395, "learning_rate": 9.958535796609434e-05, "loss": 2.1226, "step": 5294 }, { "epoch": 0.041178922613177876, "grad_norm": 0.4583437291503154, "learning_rate": 9.958520092892114e-05, "loss": 2.1176, "step": 5295 }, { "epoch": 0.04118669955795846, "grad_norm": 0.42147640304764694, "learning_rate": 9.958504386214013e-05, "loss": 2.1367, "step": 5296 }, { "epoch": 0.04119447650273904, "grad_norm": 0.2591779176563114, "learning_rate": 9.958488676575139e-05, "loss": 2.1431, "step": 5297 }, { "epoch": 0.04120225344751962, "grad_norm": 0.38352288053450145, "learning_rate": 9.958472963975504e-05, "loss": 2.0914, "step": 5298 }, { "epoch": 0.0412100303923002, "grad_norm": 0.3195383942409473, "learning_rate": 9.958457248415116e-05, "loss": 2.0648, "step": 5299 }, { "epoch": 0.04121780733708078, "grad_norm": 0.23441056804578525, "learning_rate": 9.958441529893985e-05, "loss": 2.0574, "step": 5300 }, { "epoch": 0.04122558428186136, "grad_norm": 0.24235988768895078, "learning_rate": 9.958425808412117e-05, "loss": 2.0736, "step": 5301 }, { "epoch": 0.041233361226641944, "grad_norm": 0.7362410747293041, "learning_rate": 9.958410083969528e-05, "loss": 2.075, "step": 5302 }, { "epoch": 0.041241138171422526, "grad_norm": 0.3354467973171741, "learning_rate": 9.95839435656622e-05, "loss": 2.0571, "step": 5303 }, { "epoch": 0.04124891511620311, "grad_norm": 1.2881608850926098, "learning_rate": 9.958378626202207e-05, "loss": 2.1062, "step": 5304 }, { "epoch": 0.04125669206098369, "grad_norm": 2.4312854396711465, "learning_rate": 9.958362892877497e-05, "loss": 2.1003, "step": 5305 }, { "epoch": 0.04126446900576427, "grad_norm": 639.3466814392069, "learning_rate": 9.958347156592101e-05, "loss": 7.844, "step": 5306 }, { "epoch": 0.04127224595054485, "grad_norm": 13.135406540670926, "learning_rate": 9.958331417346025e-05, "loss": 2.547, "step": 5307 }, { "epoch": 0.04128002289532543, "grad_norm": 25.217610755070726, "learning_rate": 9.958315675139282e-05, "loss": 2.5814, "step": 5308 }, { "epoch": 0.04128779984010601, "grad_norm": 1.2503172058099339, "learning_rate": 9.958299929971879e-05, "loss": 2.1917, "step": 5309 }, { "epoch": 0.041295576784886594, "grad_norm": 1.2082248649007234, "learning_rate": 9.958284181843826e-05, "loss": 2.1756, "step": 5310 }, { "epoch": 0.041303353729667175, "grad_norm": 0.6541081174942062, "learning_rate": 9.958268430755133e-05, "loss": 2.107, "step": 5311 }, { "epoch": 0.041311130674447756, "grad_norm": 0.7184741806653228, "learning_rate": 9.958252676705811e-05, "loss": 2.167, "step": 5312 }, { "epoch": 0.04131890761922834, "grad_norm": 0.4664769678722144, "learning_rate": 9.958236919695865e-05, "loss": 2.0982, "step": 5313 }, { "epoch": 0.04132668456400892, "grad_norm": 0.322686274441521, "learning_rate": 9.958221159725308e-05, "loss": 2.1107, "step": 5314 }, { "epoch": 0.0413344615087895, "grad_norm": 0.4591063935902333, "learning_rate": 9.958205396794147e-05, "loss": 2.0962, "step": 5315 }, { "epoch": 0.04134223845357009, "grad_norm": 0.34626288468770855, "learning_rate": 9.958189630902393e-05, "loss": 2.116, "step": 5316 }, { "epoch": 0.04135001539835067, "grad_norm": 0.506325150845177, "learning_rate": 9.958173862050056e-05, "loss": 2.1064, "step": 5317 }, { "epoch": 0.04135779234313125, "grad_norm": 0.3940684878779358, "learning_rate": 9.958158090237145e-05, "loss": 2.1095, "step": 5318 }, { "epoch": 0.04136556928791183, "grad_norm": 0.2969602319128485, "learning_rate": 9.958142315463668e-05, "loss": 2.0862, "step": 5319 }, { "epoch": 0.04137334623269241, "grad_norm": 0.2668259745925739, "learning_rate": 9.958126537729634e-05, "loss": 2.094, "step": 5320 }, { "epoch": 0.04138112317747299, "grad_norm": 0.280865906678564, "learning_rate": 9.958110757035057e-05, "loss": 2.0817, "step": 5321 }, { "epoch": 0.041388900122253575, "grad_norm": 2.756459175703028, "learning_rate": 9.958094973379942e-05, "loss": 2.1466, "step": 5322 }, { "epoch": 0.041396677067034156, "grad_norm": 0.29699190635652045, "learning_rate": 9.9580791867643e-05, "loss": 2.1322, "step": 5323 }, { "epoch": 0.04140445401181474, "grad_norm": 0.26503412561630596, "learning_rate": 9.958063397188139e-05, "loss": 2.0985, "step": 5324 }, { "epoch": 0.04141223095659532, "grad_norm": 0.2631720440837844, "learning_rate": 9.95804760465147e-05, "loss": 2.0845, "step": 5325 }, { "epoch": 0.0414200079013759, "grad_norm": 0.22147999195180057, "learning_rate": 9.9580318091543e-05, "loss": 2.0832, "step": 5326 }, { "epoch": 0.04142778484615648, "grad_norm": 0.21852119286728613, "learning_rate": 9.958016010696644e-05, "loss": 2.1158, "step": 5327 }, { "epoch": 0.04143556179093706, "grad_norm": 0.19119764826180968, "learning_rate": 9.958000209278506e-05, "loss": 2.0885, "step": 5328 }, { "epoch": 0.04144333873571764, "grad_norm": 0.7366647340843888, "learning_rate": 9.957984404899896e-05, "loss": 2.0748, "step": 5329 }, { "epoch": 0.041451115680498224, "grad_norm": 0.21405052831812274, "learning_rate": 9.957968597560827e-05, "loss": 2.0662, "step": 5330 }, { "epoch": 0.041458892625278805, "grad_norm": 0.21811232155635657, "learning_rate": 9.957952787261304e-05, "loss": 2.1115, "step": 5331 }, { "epoch": 0.041466669570059386, "grad_norm": 0.22364737413779984, "learning_rate": 9.95793697400134e-05, "loss": 2.1359, "step": 5332 }, { "epoch": 0.04147444651483997, "grad_norm": 0.2069077909777288, "learning_rate": 9.957921157780943e-05, "loss": 2.1244, "step": 5333 }, { "epoch": 0.04148222345962055, "grad_norm": 0.1812498645346886, "learning_rate": 9.957905338600122e-05, "loss": 2.111, "step": 5334 }, { "epoch": 0.04149000040440113, "grad_norm": 0.27313882654280663, "learning_rate": 9.957889516458887e-05, "loss": 2.0691, "step": 5335 }, { "epoch": 0.04149777734918171, "grad_norm": 0.1624955936656773, "learning_rate": 9.957873691357247e-05, "loss": 2.138, "step": 5336 }, { "epoch": 0.04150555429396229, "grad_norm": 0.1651408809666404, "learning_rate": 9.957857863295211e-05, "loss": 2.0807, "step": 5337 }, { "epoch": 0.04151333123874287, "grad_norm": 0.1722923343752547, "learning_rate": 9.95784203227279e-05, "loss": 2.0528, "step": 5338 }, { "epoch": 0.041521108183523454, "grad_norm": 0.13223309484580165, "learning_rate": 9.957826198289993e-05, "loss": 2.0843, "step": 5339 }, { "epoch": 0.041528885128304036, "grad_norm": 0.14630862604815245, "learning_rate": 9.957810361346828e-05, "loss": 2.0913, "step": 5340 }, { "epoch": 0.04153666207308462, "grad_norm": 0.16336883910021618, "learning_rate": 9.957794521443308e-05, "loss": 2.0962, "step": 5341 }, { "epoch": 0.0415444390178652, "grad_norm": 0.12911780941539522, "learning_rate": 9.957778678579439e-05, "loss": 2.087, "step": 5342 }, { "epoch": 0.04155221596264578, "grad_norm": 0.1298595726643546, "learning_rate": 9.957762832755231e-05, "loss": 2.0875, "step": 5343 }, { "epoch": 0.04155999290742636, "grad_norm": 0.13926394996721525, "learning_rate": 9.957746983970692e-05, "loss": 2.1088, "step": 5344 }, { "epoch": 0.04156776985220694, "grad_norm": 0.12628764200737777, "learning_rate": 9.957731132225838e-05, "loss": 2.1044, "step": 5345 }, { "epoch": 0.04157554679698752, "grad_norm": 0.16166852522532704, "learning_rate": 9.957715277520671e-05, "loss": 2.0501, "step": 5346 }, { "epoch": 0.041583323741768104, "grad_norm": 0.22926576176296357, "learning_rate": 9.957699419855203e-05, "loss": 2.086, "step": 5347 }, { "epoch": 0.041591100686548685, "grad_norm": 0.11936287959058747, "learning_rate": 9.957683559229445e-05, "loss": 2.079, "step": 5348 }, { "epoch": 0.041598877631329266, "grad_norm": 0.12444472067926517, "learning_rate": 9.957667695643405e-05, "loss": 2.0573, "step": 5349 }, { "epoch": 0.04160665457610985, "grad_norm": 0.12517207910108424, "learning_rate": 9.957651829097092e-05, "loss": 2.0823, "step": 5350 }, { "epoch": 0.04161443152089043, "grad_norm": 0.12361352435401193, "learning_rate": 9.957635959590516e-05, "loss": 2.1478, "step": 5351 }, { "epoch": 0.04162220846567101, "grad_norm": 0.12439647316444132, "learning_rate": 9.957620087123689e-05, "loss": 2.1099, "step": 5352 }, { "epoch": 0.04162998541045159, "grad_norm": 0.11793529049008178, "learning_rate": 9.957604211696616e-05, "loss": 2.0559, "step": 5353 }, { "epoch": 0.04163776235523217, "grad_norm": 0.20918135162547347, "learning_rate": 9.95758833330931e-05, "loss": 2.1414, "step": 5354 }, { "epoch": 0.04164553930001275, "grad_norm": 0.13415067978921721, "learning_rate": 9.957572451961778e-05, "loss": 2.0885, "step": 5355 }, { "epoch": 0.041653316244793334, "grad_norm": 0.11906103363572158, "learning_rate": 9.957556567654031e-05, "loss": 2.0676, "step": 5356 }, { "epoch": 0.041661093189573915, "grad_norm": 0.4186662451046818, "learning_rate": 9.957540680386079e-05, "loss": 2.1428, "step": 5357 }, { "epoch": 0.041668870134354496, "grad_norm": 0.13626304031763983, "learning_rate": 9.95752479015793e-05, "loss": 2.1259, "step": 5358 }, { "epoch": 0.04167664707913508, "grad_norm": 0.7912564453431772, "learning_rate": 9.957508896969594e-05, "loss": 2.0999, "step": 5359 }, { "epoch": 0.04168442402391566, "grad_norm": 0.17036110633761922, "learning_rate": 9.957493000821082e-05, "loss": 2.0881, "step": 5360 }, { "epoch": 0.04169220096869624, "grad_norm": 0.20687638787137916, "learning_rate": 9.9574771017124e-05, "loss": 2.089, "step": 5361 }, { "epoch": 0.04169997791347682, "grad_norm": 0.22967333763227032, "learning_rate": 9.95746119964356e-05, "loss": 2.1219, "step": 5362 }, { "epoch": 0.0417077548582574, "grad_norm": 0.18751944918236196, "learning_rate": 9.957445294614571e-05, "loss": 2.1109, "step": 5363 }, { "epoch": 0.041715531803037983, "grad_norm": 0.16386413429441019, "learning_rate": 9.957429386625444e-05, "loss": 2.0954, "step": 5364 }, { "epoch": 0.041723308747818565, "grad_norm": 0.1866602944680709, "learning_rate": 9.957413475676187e-05, "loss": 2.0403, "step": 5365 }, { "epoch": 0.041731085692599146, "grad_norm": 0.12853315854730848, "learning_rate": 9.957397561766808e-05, "loss": 2.0503, "step": 5366 }, { "epoch": 0.04173886263737973, "grad_norm": 0.14788460577941004, "learning_rate": 9.957381644897318e-05, "loss": 2.0554, "step": 5367 }, { "epoch": 0.04174663958216031, "grad_norm": 0.1289329634958464, "learning_rate": 9.95736572506773e-05, "loss": 2.0984, "step": 5368 }, { "epoch": 0.04175441652694089, "grad_norm": 0.13724614657485856, "learning_rate": 9.957349802278045e-05, "loss": 2.0882, "step": 5369 }, { "epoch": 0.04176219347172147, "grad_norm": 0.14520470634893357, "learning_rate": 9.957333876528281e-05, "loss": 2.0713, "step": 5370 }, { "epoch": 0.04176997041650205, "grad_norm": 0.12140622610445773, "learning_rate": 9.957317947818443e-05, "loss": 2.0785, "step": 5371 }, { "epoch": 0.04177774736128263, "grad_norm": 0.28532713138429916, "learning_rate": 9.957302016148541e-05, "loss": 2.0637, "step": 5372 }, { "epoch": 0.041785524306063214, "grad_norm": 0.12426698930570952, "learning_rate": 9.957286081518587e-05, "loss": 2.0764, "step": 5373 }, { "epoch": 0.041793301250843795, "grad_norm": 0.12681400162007175, "learning_rate": 9.957270143928587e-05, "loss": 2.0447, "step": 5374 }, { "epoch": 0.04180107819562438, "grad_norm": 0.1201508798952417, "learning_rate": 9.957254203378552e-05, "loss": 2.0886, "step": 5375 }, { "epoch": 0.041808855140404964, "grad_norm": 0.1296057516962752, "learning_rate": 9.957238259868493e-05, "loss": 2.0764, "step": 5376 }, { "epoch": 0.041816632085185546, "grad_norm": 0.12769432426451774, "learning_rate": 9.957222313398418e-05, "loss": 2.1094, "step": 5377 }, { "epoch": 0.04182440902996613, "grad_norm": 0.12070229146546568, "learning_rate": 9.957206363968337e-05, "loss": 2.069, "step": 5378 }, { "epoch": 0.04183218597474671, "grad_norm": 0.12142005711064868, "learning_rate": 9.957190411578257e-05, "loss": 2.0573, "step": 5379 }, { "epoch": 0.04183996291952729, "grad_norm": 0.11627980341790838, "learning_rate": 9.95717445622819e-05, "loss": 2.071, "step": 5380 }, { "epoch": 0.04184773986430787, "grad_norm": 0.12147749525190388, "learning_rate": 9.957158497918148e-05, "loss": 2.0939, "step": 5381 }, { "epoch": 0.04185551680908845, "grad_norm": 0.12617629169160463, "learning_rate": 9.957142536648136e-05, "loss": 2.075, "step": 5382 }, { "epoch": 0.04186329375386903, "grad_norm": 0.11620422084447994, "learning_rate": 9.957126572418166e-05, "loss": 2.1086, "step": 5383 }, { "epoch": 0.041871070698649614, "grad_norm": 0.1247916511223848, "learning_rate": 9.957110605228247e-05, "loss": 2.1349, "step": 5384 }, { "epoch": 0.041878847643430195, "grad_norm": 0.1190940225847594, "learning_rate": 9.957094635078388e-05, "loss": 2.1041, "step": 5385 }, { "epoch": 0.041886624588210776, "grad_norm": 0.12372157828713282, "learning_rate": 9.9570786619686e-05, "loss": 2.0585, "step": 5386 }, { "epoch": 0.04189440153299136, "grad_norm": 0.11554833551516278, "learning_rate": 9.957062685898891e-05, "loss": 2.068, "step": 5387 }, { "epoch": 0.04190217847777194, "grad_norm": 0.12931659914466045, "learning_rate": 9.957046706869268e-05, "loss": 2.1243, "step": 5388 }, { "epoch": 0.04190995542255252, "grad_norm": 0.1125582521692339, "learning_rate": 9.957030724879747e-05, "loss": 2.0949, "step": 5389 }, { "epoch": 0.0419177323673331, "grad_norm": 0.1186040111764085, "learning_rate": 9.957014739930333e-05, "loss": 2.1303, "step": 5390 }, { "epoch": 0.04192550931211368, "grad_norm": 0.11837728302298237, "learning_rate": 9.956998752021038e-05, "loss": 2.1203, "step": 5391 }, { "epoch": 0.04193328625689426, "grad_norm": 0.12777838545217304, "learning_rate": 9.956982761151869e-05, "loss": 2.0692, "step": 5392 }, { "epoch": 0.041941063201674844, "grad_norm": 0.11282110829363315, "learning_rate": 9.956966767322836e-05, "loss": 2.0346, "step": 5393 }, { "epoch": 0.041948840146455425, "grad_norm": 0.11942096635336101, "learning_rate": 9.95695077053395e-05, "loss": 2.0961, "step": 5394 }, { "epoch": 0.041956617091236006, "grad_norm": 0.11665364215351058, "learning_rate": 9.95693477078522e-05, "loss": 2.0512, "step": 5395 }, { "epoch": 0.04196439403601659, "grad_norm": 0.11755569600539979, "learning_rate": 9.956918768076655e-05, "loss": 2.1113, "step": 5396 }, { "epoch": 0.04197217098079717, "grad_norm": 0.11877419022045171, "learning_rate": 9.956902762408266e-05, "loss": 2.1002, "step": 5397 }, { "epoch": 0.04197994792557775, "grad_norm": 0.11016278566809347, "learning_rate": 9.956886753780061e-05, "loss": 2.0819, "step": 5398 }, { "epoch": 0.04198772487035833, "grad_norm": 0.1264855960557374, "learning_rate": 9.95687074219205e-05, "loss": 2.1265, "step": 5399 }, { "epoch": 0.04199550181513891, "grad_norm": 0.11157569090081886, "learning_rate": 9.956854727644243e-05, "loss": 2.0883, "step": 5400 }, { "epoch": 0.04200327875991949, "grad_norm": 0.11906466621194652, "learning_rate": 9.956838710136648e-05, "loss": 2.0692, "step": 5401 }, { "epoch": 0.042011055704700075, "grad_norm": 0.11421099830567907, "learning_rate": 9.956822689669277e-05, "loss": 2.088, "step": 5402 }, { "epoch": 0.042018832649480656, "grad_norm": 0.12664755734867988, "learning_rate": 9.956806666242137e-05, "loss": 2.1288, "step": 5403 }, { "epoch": 0.04202660959426124, "grad_norm": 0.12330861248446828, "learning_rate": 9.95679063985524e-05, "loss": 2.0811, "step": 5404 }, { "epoch": 0.04203438653904182, "grad_norm": 0.11458728284591654, "learning_rate": 9.956774610508595e-05, "loss": 1.9921, "step": 5405 }, { "epoch": 0.0420421634838224, "grad_norm": 0.25050778530477114, "learning_rate": 9.95675857820221e-05, "loss": 2.1201, "step": 5406 }, { "epoch": 0.04204994042860298, "grad_norm": 0.11188137830079856, "learning_rate": 9.956742542936097e-05, "loss": 2.0709, "step": 5407 }, { "epoch": 0.04205771737338356, "grad_norm": 0.11699819888708311, "learning_rate": 9.956726504710264e-05, "loss": 2.0654, "step": 5408 }, { "epoch": 0.04206549431816414, "grad_norm": 0.15206764304317139, "learning_rate": 9.95671046352472e-05, "loss": 2.0879, "step": 5409 }, { "epoch": 0.042073271262944724, "grad_norm": 0.12028572233608677, "learning_rate": 9.956694419379473e-05, "loss": 2.0786, "step": 5410 }, { "epoch": 0.042081048207725305, "grad_norm": 0.14211665474228632, "learning_rate": 9.956678372274538e-05, "loss": 2.1221, "step": 5411 }, { "epoch": 0.042088825152505886, "grad_norm": 0.1230298738897527, "learning_rate": 9.956662322209921e-05, "loss": 2.0825, "step": 5412 }, { "epoch": 0.04209660209728647, "grad_norm": 0.13622092121577126, "learning_rate": 9.956646269185632e-05, "loss": 2.1196, "step": 5413 }, { "epoch": 0.04210437904206705, "grad_norm": 0.11495908374210267, "learning_rate": 9.956630213201681e-05, "loss": 2.1053, "step": 5414 }, { "epoch": 0.04211215598684763, "grad_norm": 0.13713607848348258, "learning_rate": 9.956614154258076e-05, "loss": 2.0449, "step": 5415 }, { "epoch": 0.04211993293162821, "grad_norm": 0.13663647749883992, "learning_rate": 9.956598092354828e-05, "loss": 2.1184, "step": 5416 }, { "epoch": 0.04212770987640879, "grad_norm": 0.11458583782087249, "learning_rate": 9.956582027491948e-05, "loss": 2.0694, "step": 5417 }, { "epoch": 0.04213548682118937, "grad_norm": 0.15254953517182426, "learning_rate": 9.956565959669443e-05, "loss": 2.0814, "step": 5418 }, { "epoch": 0.042143263765969954, "grad_norm": 0.11627429935576328, "learning_rate": 9.956549888887323e-05, "loss": 2.1567, "step": 5419 }, { "epoch": 0.042151040710750536, "grad_norm": 0.1256034270096694, "learning_rate": 9.956533815145599e-05, "loss": 2.1483, "step": 5420 }, { "epoch": 0.04215881765553112, "grad_norm": 0.12231312899975637, "learning_rate": 9.95651773844428e-05, "loss": 2.022, "step": 5421 }, { "epoch": 0.0421665946003117, "grad_norm": 0.11535551081878387, "learning_rate": 9.956501658783375e-05, "loss": 2.1189, "step": 5422 }, { "epoch": 0.04217437154509228, "grad_norm": 0.11828481351964289, "learning_rate": 9.956485576162896e-05, "loss": 2.0461, "step": 5423 }, { "epoch": 0.04218214848987286, "grad_norm": 0.11152270633820026, "learning_rate": 9.956469490582847e-05, "loss": 2.0988, "step": 5424 }, { "epoch": 0.04218992543465344, "grad_norm": 0.15282584848495043, "learning_rate": 9.956453402043245e-05, "loss": 2.0845, "step": 5425 }, { "epoch": 0.04219770237943402, "grad_norm": 0.126635640081839, "learning_rate": 9.956437310544092e-05, "loss": 2.1144, "step": 5426 }, { "epoch": 0.042205479324214604, "grad_norm": 0.12220548142819838, "learning_rate": 9.956421216085403e-05, "loss": 2.0813, "step": 5427 }, { "epoch": 0.042213256268995185, "grad_norm": 0.12748746182820414, "learning_rate": 9.956405118667187e-05, "loss": 2.1116, "step": 5428 }, { "epoch": 0.042221033213775766, "grad_norm": 0.11447734406976832, "learning_rate": 9.956389018289453e-05, "loss": 2.0316, "step": 5429 }, { "epoch": 0.04222881015855635, "grad_norm": 0.1442847121964791, "learning_rate": 9.956372914952209e-05, "loss": 2.0756, "step": 5430 }, { "epoch": 0.04223658710333693, "grad_norm": 0.11035126771429232, "learning_rate": 9.956356808655465e-05, "loss": 2.036, "step": 5431 }, { "epoch": 0.04224436404811751, "grad_norm": 0.13161542358416065, "learning_rate": 9.956340699399234e-05, "loss": 2.0573, "step": 5432 }, { "epoch": 0.04225214099289809, "grad_norm": 0.1355088381103301, "learning_rate": 9.956324587183522e-05, "loss": 2.0556, "step": 5433 }, { "epoch": 0.04225991793767868, "grad_norm": 0.12065679625435323, "learning_rate": 9.95630847200834e-05, "loss": 2.0623, "step": 5434 }, { "epoch": 0.04226769488245926, "grad_norm": 0.13869873073723998, "learning_rate": 9.956292353873697e-05, "loss": 2.1059, "step": 5435 }, { "epoch": 0.04227547182723984, "grad_norm": 0.11323259062115001, "learning_rate": 9.956276232779602e-05, "loss": 2.0919, "step": 5436 }, { "epoch": 0.04228324877202042, "grad_norm": 0.12466732057613701, "learning_rate": 9.956260108726067e-05, "loss": 2.1073, "step": 5437 }, { "epoch": 0.042291025716801, "grad_norm": 0.11270112450310406, "learning_rate": 9.9562439817131e-05, "loss": 2.1089, "step": 5438 }, { "epoch": 0.042298802661581585, "grad_norm": 0.120135239641412, "learning_rate": 9.956227851740711e-05, "loss": 2.0764, "step": 5439 }, { "epoch": 0.042306579606362166, "grad_norm": 0.11467266928735773, "learning_rate": 9.95621171880891e-05, "loss": 2.1017, "step": 5440 }, { "epoch": 0.04231435655114275, "grad_norm": 0.12231111396934856, "learning_rate": 9.956195582917706e-05, "loss": 2.1073, "step": 5441 }, { "epoch": 0.04232213349592333, "grad_norm": 0.1194765268021423, "learning_rate": 9.956179444067109e-05, "loss": 2.0893, "step": 5442 }, { "epoch": 0.04232991044070391, "grad_norm": 0.11753088877157031, "learning_rate": 9.956163302257127e-05, "loss": 2.0505, "step": 5443 }, { "epoch": 0.04233768738548449, "grad_norm": 0.11590753639670227, "learning_rate": 9.956147157487772e-05, "loss": 2.0452, "step": 5444 }, { "epoch": 0.04234546433026507, "grad_norm": 0.11289509900095275, "learning_rate": 9.956131009759053e-05, "loss": 2.0415, "step": 5445 }, { "epoch": 0.04235324127504565, "grad_norm": 0.11725015683098738, "learning_rate": 9.956114859070977e-05, "loss": 2.0701, "step": 5446 }, { "epoch": 0.042361018219826234, "grad_norm": 0.1124281614356118, "learning_rate": 9.956098705423559e-05, "loss": 2.0853, "step": 5447 }, { "epoch": 0.042368795164606815, "grad_norm": 0.12087726297928336, "learning_rate": 9.956082548816804e-05, "loss": 2.1029, "step": 5448 }, { "epoch": 0.042376572109387396, "grad_norm": 0.1195853238228292, "learning_rate": 9.956066389250724e-05, "loss": 2.0462, "step": 5449 }, { "epoch": 0.04238434905416798, "grad_norm": 0.11026786705028925, "learning_rate": 9.956050226725329e-05, "loss": 2.106, "step": 5450 }, { "epoch": 0.04239212599894856, "grad_norm": 0.13091839218415277, "learning_rate": 9.956034061240624e-05, "loss": 2.1013, "step": 5451 }, { "epoch": 0.04239990294372914, "grad_norm": 0.12009425582060888, "learning_rate": 9.956017892796625e-05, "loss": 2.0498, "step": 5452 }, { "epoch": 0.04240767988850972, "grad_norm": 0.13441148800826744, "learning_rate": 9.956001721393339e-05, "loss": 2.1073, "step": 5453 }, { "epoch": 0.0424154568332903, "grad_norm": 0.2739768360627498, "learning_rate": 9.955985547030775e-05, "loss": 2.0753, "step": 5454 }, { "epoch": 0.04242323377807088, "grad_norm": 0.13101458572580366, "learning_rate": 9.955969369708944e-05, "loss": 2.0325, "step": 5455 }, { "epoch": 0.042431010722851464, "grad_norm": 0.11876953596684132, "learning_rate": 9.955953189427854e-05, "loss": 2.0887, "step": 5456 }, { "epoch": 0.042438787667632046, "grad_norm": 0.1197163215663901, "learning_rate": 9.955937006187515e-05, "loss": 2.0637, "step": 5457 }, { "epoch": 0.04244656461241263, "grad_norm": 0.11325892831093384, "learning_rate": 9.955920819987939e-05, "loss": 2.0375, "step": 5458 }, { "epoch": 0.04245434155719321, "grad_norm": 0.12872687522047582, "learning_rate": 9.955904630829132e-05, "loss": 2.0747, "step": 5459 }, { "epoch": 0.04246211850197379, "grad_norm": 0.12682966642325677, "learning_rate": 9.955888438711105e-05, "loss": 2.0733, "step": 5460 }, { "epoch": 0.04246989544675437, "grad_norm": 0.1274191715202574, "learning_rate": 9.955872243633871e-05, "loss": 2.0373, "step": 5461 }, { "epoch": 0.04247767239153495, "grad_norm": 0.12864027170284106, "learning_rate": 9.955856045597434e-05, "loss": 2.0695, "step": 5462 }, { "epoch": 0.04248544933631553, "grad_norm": 0.11664233953111773, "learning_rate": 9.955839844601808e-05, "loss": 2.0819, "step": 5463 }, { "epoch": 0.042493226281096114, "grad_norm": 0.14039765229099282, "learning_rate": 9.955823640647e-05, "loss": 2.0469, "step": 5464 }, { "epoch": 0.042501003225876695, "grad_norm": 0.11132646944247702, "learning_rate": 9.955807433733022e-05, "loss": 2.0763, "step": 5465 }, { "epoch": 0.042508780170657276, "grad_norm": 0.12874146601754569, "learning_rate": 9.955791223859882e-05, "loss": 2.051, "step": 5466 }, { "epoch": 0.04251655711543786, "grad_norm": 0.11814440855151707, "learning_rate": 9.95577501102759e-05, "loss": 2.0158, "step": 5467 }, { "epoch": 0.04252433406021844, "grad_norm": 0.13354548136331126, "learning_rate": 9.955758795236157e-05, "loss": 2.0978, "step": 5468 }, { "epoch": 0.04253211100499902, "grad_norm": 0.1302027517072822, "learning_rate": 9.955742576485592e-05, "loss": 2.0706, "step": 5469 }, { "epoch": 0.0425398879497796, "grad_norm": 0.11739594939969746, "learning_rate": 9.955726354775902e-05, "loss": 2.087, "step": 5470 }, { "epoch": 0.04254766489456018, "grad_norm": 0.14445773941524592, "learning_rate": 9.955710130107102e-05, "loss": 2.1313, "step": 5471 }, { "epoch": 0.04255544183934076, "grad_norm": 0.11016060989671178, "learning_rate": 9.955693902479196e-05, "loss": 2.054, "step": 5472 }, { "epoch": 0.042563218784121344, "grad_norm": 0.12692540255379514, "learning_rate": 9.955677671892197e-05, "loss": 2.0567, "step": 5473 }, { "epoch": 0.042570995728901925, "grad_norm": 0.12347941350112644, "learning_rate": 9.955661438346116e-05, "loss": 2.0899, "step": 5474 }, { "epoch": 0.042578772673682506, "grad_norm": 0.11187259120892627, "learning_rate": 9.955645201840958e-05, "loss": 2.0808, "step": 5475 }, { "epoch": 0.04258654961846309, "grad_norm": 0.13544235706594887, "learning_rate": 9.955628962376737e-05, "loss": 2.073, "step": 5476 }, { "epoch": 0.04259432656324367, "grad_norm": 0.11221780333268622, "learning_rate": 9.95561271995346e-05, "loss": 2.0357, "step": 5477 }, { "epoch": 0.04260210350802425, "grad_norm": 0.17725914738736262, "learning_rate": 9.955596474571138e-05, "loss": 2.0765, "step": 5478 }, { "epoch": 0.04260988045280483, "grad_norm": 0.14859830555540104, "learning_rate": 9.955580226229782e-05, "loss": 2.0284, "step": 5479 }, { "epoch": 0.04261765739758541, "grad_norm": 0.11844187805982836, "learning_rate": 9.955563974929398e-05, "loss": 2.0862, "step": 5480 }, { "epoch": 0.04262543434236599, "grad_norm": 0.20051811778571127, "learning_rate": 9.95554772067e-05, "loss": 2.0348, "step": 5481 }, { "epoch": 0.042633211287146575, "grad_norm": 0.12123909826215076, "learning_rate": 9.955531463451596e-05, "loss": 2.0975, "step": 5482 }, { "epoch": 0.042640988231927156, "grad_norm": 0.1311089559004459, "learning_rate": 9.955515203274192e-05, "loss": 2.1122, "step": 5483 }, { "epoch": 0.04264876517670774, "grad_norm": 0.12056435290377171, "learning_rate": 9.955498940137804e-05, "loss": 2.1169, "step": 5484 }, { "epoch": 0.04265654212148832, "grad_norm": 0.3938477623483276, "learning_rate": 9.955482674042437e-05, "loss": 2.0842, "step": 5485 }, { "epoch": 0.0426643190662689, "grad_norm": 0.15445389084729952, "learning_rate": 9.955466404988104e-05, "loss": 2.0433, "step": 5486 }, { "epoch": 0.04267209601104948, "grad_norm": 0.12934062191116902, "learning_rate": 9.955450132974813e-05, "loss": 2.0738, "step": 5487 }, { "epoch": 0.04267987295583006, "grad_norm": 0.3644193048614725, "learning_rate": 9.955433858002574e-05, "loss": 2.0694, "step": 5488 }, { "epoch": 0.04268764990061064, "grad_norm": 0.13073560545742602, "learning_rate": 9.955417580071395e-05, "loss": 2.0856, "step": 5489 }, { "epoch": 0.042695426845391224, "grad_norm": 0.18713266989433094, "learning_rate": 9.955401299181288e-05, "loss": 2.1187, "step": 5490 }, { "epoch": 0.042703203790171805, "grad_norm": 0.12124959519079695, "learning_rate": 9.955385015332263e-05, "loss": 2.1031, "step": 5491 }, { "epoch": 0.042710980734952386, "grad_norm": 0.16767681304348225, "learning_rate": 9.955368728524328e-05, "loss": 2.0912, "step": 5492 }, { "epoch": 0.04271875767973297, "grad_norm": 0.11895434932855063, "learning_rate": 9.955352438757495e-05, "loss": 2.0218, "step": 5493 }, { "epoch": 0.042726534624513555, "grad_norm": 0.1979675662603144, "learning_rate": 9.95533614603177e-05, "loss": 2.1099, "step": 5494 }, { "epoch": 0.04273431156929414, "grad_norm": 0.12584738348344127, "learning_rate": 9.955319850347166e-05, "loss": 2.0485, "step": 5495 }, { "epoch": 0.04274208851407472, "grad_norm": 0.14182411799531985, "learning_rate": 9.955303551703692e-05, "loss": 2.1053, "step": 5496 }, { "epoch": 0.0427498654588553, "grad_norm": 0.26804959574089476, "learning_rate": 9.955287250101357e-05, "loss": 2.0354, "step": 5497 }, { "epoch": 0.04275764240363588, "grad_norm": 0.12431548108588075, "learning_rate": 9.955270945540172e-05, "loss": 2.0559, "step": 5498 }, { "epoch": 0.04276541934841646, "grad_norm": 0.1332992473036932, "learning_rate": 9.955254638020146e-05, "loss": 2.0632, "step": 5499 }, { "epoch": 0.04277319629319704, "grad_norm": 0.12461497095953675, "learning_rate": 9.955238327541287e-05, "loss": 2.0465, "step": 5500 }, { "epoch": 0.042780973237977624, "grad_norm": 0.43757421106974254, "learning_rate": 9.955222014103607e-05, "loss": 2.0784, "step": 5501 }, { "epoch": 0.042788750182758205, "grad_norm": 0.12335994883688971, "learning_rate": 9.955205697707114e-05, "loss": 2.0962, "step": 5502 }, { "epoch": 0.042796527127538786, "grad_norm": 0.13135956070434646, "learning_rate": 9.955189378351821e-05, "loss": 2.0723, "step": 5503 }, { "epoch": 0.04280430407231937, "grad_norm": 0.12047500809008178, "learning_rate": 9.955173056037735e-05, "loss": 2.0375, "step": 5504 }, { "epoch": 0.04281208101709995, "grad_norm": 0.12594817645292333, "learning_rate": 9.955156730764866e-05, "loss": 2.0338, "step": 5505 }, { "epoch": 0.04281985796188053, "grad_norm": 0.12718533926008793, "learning_rate": 9.955140402533226e-05, "loss": 2.0448, "step": 5506 }, { "epoch": 0.04282763490666111, "grad_norm": 0.6156352619724786, "learning_rate": 9.955124071342819e-05, "loss": 2.0801, "step": 5507 }, { "epoch": 0.04283541185144169, "grad_norm": 0.13658020180795263, "learning_rate": 9.955107737193661e-05, "loss": 2.1192, "step": 5508 }, { "epoch": 0.04284318879622227, "grad_norm": 0.15016190549699984, "learning_rate": 9.955091400085759e-05, "loss": 2.0564, "step": 5509 }, { "epoch": 0.042850965741002854, "grad_norm": 0.14324053782806617, "learning_rate": 9.955075060019123e-05, "loss": 2.0872, "step": 5510 }, { "epoch": 0.042858742685783435, "grad_norm": 0.17886197354973554, "learning_rate": 9.955058716993764e-05, "loss": 2.1323, "step": 5511 }, { "epoch": 0.042866519630564016, "grad_norm": 0.14699127392726521, "learning_rate": 9.955042371009688e-05, "loss": 2.0676, "step": 5512 }, { "epoch": 0.0428742965753446, "grad_norm": 0.19521600013128776, "learning_rate": 9.95502602206691e-05, "loss": 2.067, "step": 5513 }, { "epoch": 0.04288207352012518, "grad_norm": 0.24775176162155926, "learning_rate": 9.955009670165435e-05, "loss": 2.0879, "step": 5514 }, { "epoch": 0.04288985046490576, "grad_norm": 0.8263991553787862, "learning_rate": 9.954993315305276e-05, "loss": 2.077, "step": 5515 }, { "epoch": 0.04289762740968634, "grad_norm": 0.13152791538607875, "learning_rate": 9.954976957486442e-05, "loss": 2.0859, "step": 5516 }, { "epoch": 0.04290540435446692, "grad_norm": 0.13808766852176807, "learning_rate": 9.954960596708943e-05, "loss": 2.1026, "step": 5517 }, { "epoch": 0.0429131812992475, "grad_norm": 0.12797819609620043, "learning_rate": 9.954944232972788e-05, "loss": 2.1164, "step": 5518 }, { "epoch": 0.042920958244028085, "grad_norm": 0.15874287518759356, "learning_rate": 9.954927866277985e-05, "loss": 2.0897, "step": 5519 }, { "epoch": 0.042928735188808666, "grad_norm": 0.13812001869917898, "learning_rate": 9.954911496624549e-05, "loss": 2.0778, "step": 5520 }, { "epoch": 0.04293651213358925, "grad_norm": 0.4284868734745308, "learning_rate": 9.954895124012483e-05, "loss": 2.1136, "step": 5521 }, { "epoch": 0.04294428907836983, "grad_norm": 0.16001588138951583, "learning_rate": 9.954878748441802e-05, "loss": 2.0458, "step": 5522 }, { "epoch": 0.04295206602315041, "grad_norm": 0.13530527815045484, "learning_rate": 9.954862369912515e-05, "loss": 2.0479, "step": 5523 }, { "epoch": 0.04295984296793099, "grad_norm": 0.1645454309130878, "learning_rate": 9.954845988424629e-05, "loss": 2.0871, "step": 5524 }, { "epoch": 0.04296761991271157, "grad_norm": 0.14572163941916946, "learning_rate": 9.954829603978157e-05, "loss": 2.0708, "step": 5525 }, { "epoch": 0.04297539685749215, "grad_norm": 0.1369040536059909, "learning_rate": 9.954813216573107e-05, "loss": 2.1141, "step": 5526 }, { "epoch": 0.042983173802272734, "grad_norm": 0.21153952319733094, "learning_rate": 9.954796826209489e-05, "loss": 2.0964, "step": 5527 }, { "epoch": 0.042990950747053315, "grad_norm": 0.15723701672284862, "learning_rate": 9.954780432887315e-05, "loss": 2.0361, "step": 5528 }, { "epoch": 0.042998727691833896, "grad_norm": 0.12403671086533889, "learning_rate": 9.954764036606589e-05, "loss": 2.0427, "step": 5529 }, { "epoch": 0.04300650463661448, "grad_norm": 1.484960871770516, "learning_rate": 9.954747637367327e-05, "loss": 2.074, "step": 5530 }, { "epoch": 0.04301428158139506, "grad_norm": 0.12938394034188352, "learning_rate": 9.954731235169536e-05, "loss": 2.0471, "step": 5531 }, { "epoch": 0.04302205852617564, "grad_norm": 0.14895675705196185, "learning_rate": 9.954714830013225e-05, "loss": 2.0891, "step": 5532 }, { "epoch": 0.04302983547095622, "grad_norm": 0.18636108946637814, "learning_rate": 9.954698421898408e-05, "loss": 2.0484, "step": 5533 }, { "epoch": 0.0430376124157368, "grad_norm": 0.12413574996923617, "learning_rate": 9.954682010825088e-05, "loss": 2.0473, "step": 5534 }, { "epoch": 0.04304538936051738, "grad_norm": 0.1287614764650486, "learning_rate": 9.95466559679328e-05, "loss": 2.0946, "step": 5535 }, { "epoch": 0.043053166305297964, "grad_norm": 0.12880483960164968, "learning_rate": 9.954649179802992e-05, "loss": 2.0512, "step": 5536 }, { "epoch": 0.043060943250078546, "grad_norm": 0.1283005310324389, "learning_rate": 9.954632759854237e-05, "loss": 2.0754, "step": 5537 }, { "epoch": 0.04306872019485913, "grad_norm": 0.12870060464227426, "learning_rate": 9.954616336947019e-05, "loss": 2.0943, "step": 5538 }, { "epoch": 0.04307649713963971, "grad_norm": 0.20739643832348303, "learning_rate": 9.954599911081351e-05, "loss": 2.0234, "step": 5539 }, { "epoch": 0.04308427408442029, "grad_norm": 0.11515380799477902, "learning_rate": 9.954583482257244e-05, "loss": 2.0973, "step": 5540 }, { "epoch": 0.04309205102920087, "grad_norm": 0.134671117898664, "learning_rate": 9.954567050474706e-05, "loss": 2.1114, "step": 5541 }, { "epoch": 0.04309982797398145, "grad_norm": 0.18676696328141215, "learning_rate": 9.954550615733746e-05, "loss": 2.0788, "step": 5542 }, { "epoch": 0.04310760491876203, "grad_norm": 0.13951381024388798, "learning_rate": 9.954534178034376e-05, "loss": 2.1203, "step": 5543 }, { "epoch": 0.043115381863542614, "grad_norm": 1.6782414616160808, "learning_rate": 9.954517737376605e-05, "loss": 2.0257, "step": 5544 }, { "epoch": 0.043123158808323195, "grad_norm": 0.3314073664665929, "learning_rate": 9.954501293760442e-05, "loss": 2.0574, "step": 5545 }, { "epoch": 0.043130935753103776, "grad_norm": 0.2227384491447174, "learning_rate": 9.954484847185898e-05, "loss": 2.0971, "step": 5546 }, { "epoch": 0.04313871269788436, "grad_norm": 0.37922112938234687, "learning_rate": 9.954468397652981e-05, "loss": 2.1014, "step": 5547 }, { "epoch": 0.04314648964266494, "grad_norm": 0.15853593004827118, "learning_rate": 9.954451945161704e-05, "loss": 2.0725, "step": 5548 }, { "epoch": 0.04315426658744552, "grad_norm": 0.6671207434767498, "learning_rate": 9.954435489712073e-05, "loss": 2.112, "step": 5549 }, { "epoch": 0.0431620435322261, "grad_norm": 0.21327216137449537, "learning_rate": 9.954419031304101e-05, "loss": 2.0519, "step": 5550 }, { "epoch": 0.04316982047700668, "grad_norm": 0.24088016977628038, "learning_rate": 9.954402569937798e-05, "loss": 2.0881, "step": 5551 }, { "epoch": 0.04317759742178726, "grad_norm": 0.22383747479030455, "learning_rate": 9.954386105613169e-05, "loss": 2.0913, "step": 5552 }, { "epoch": 0.04318537436656785, "grad_norm": 0.6150381876442615, "learning_rate": 9.954369638330229e-05, "loss": 2.0492, "step": 5553 }, { "epoch": 0.04319315131134843, "grad_norm": 0.176037680077414, "learning_rate": 9.954353168088987e-05, "loss": 2.0699, "step": 5554 }, { "epoch": 0.04320092825612901, "grad_norm": 0.1954876914298661, "learning_rate": 9.954336694889451e-05, "loss": 2.1215, "step": 5555 }, { "epoch": 0.043208705200909595, "grad_norm": 0.16690251155648053, "learning_rate": 9.954320218731632e-05, "loss": 2.037, "step": 5556 }, { "epoch": 0.043216482145690176, "grad_norm": 0.17072573640074679, "learning_rate": 9.954303739615539e-05, "loss": 2.0723, "step": 5557 }, { "epoch": 0.04322425909047076, "grad_norm": 0.17055377544740555, "learning_rate": 9.954287257541183e-05, "loss": 2.1421, "step": 5558 }, { "epoch": 0.04323203603525134, "grad_norm": 0.2229846407763865, "learning_rate": 9.954270772508573e-05, "loss": 2.106, "step": 5559 }, { "epoch": 0.04323981298003192, "grad_norm": 0.13944099290607415, "learning_rate": 9.954254284517719e-05, "loss": 2.1075, "step": 5560 }, { "epoch": 0.0432475899248125, "grad_norm": 0.1379173523388515, "learning_rate": 9.95423779356863e-05, "loss": 2.1093, "step": 5561 }, { "epoch": 0.04325536686959308, "grad_norm": 0.14129777838819416, "learning_rate": 9.954221299661319e-05, "loss": 2.0874, "step": 5562 }, { "epoch": 0.04326314381437366, "grad_norm": 0.15216673516253099, "learning_rate": 9.954204802795793e-05, "loss": 2.1378, "step": 5563 }, { "epoch": 0.043270920759154244, "grad_norm": 0.12834673670520888, "learning_rate": 9.954188302972062e-05, "loss": 2.0421, "step": 5564 }, { "epoch": 0.043278697703934825, "grad_norm": 0.12931894493533075, "learning_rate": 9.954171800190136e-05, "loss": 2.0543, "step": 5565 }, { "epoch": 0.043286474648715406, "grad_norm": 0.446498136258935, "learning_rate": 9.954155294450025e-05, "loss": 2.0734, "step": 5566 }, { "epoch": 0.04329425159349599, "grad_norm": 0.2049102428182235, "learning_rate": 9.954138785751741e-05, "loss": 2.0614, "step": 5567 }, { "epoch": 0.04330202853827657, "grad_norm": 0.13436936914242842, "learning_rate": 9.95412227409529e-05, "loss": 2.0701, "step": 5568 }, { "epoch": 0.04330980548305715, "grad_norm": 0.19083560441168487, "learning_rate": 9.954105759480685e-05, "loss": 2.0992, "step": 5569 }, { "epoch": 0.04331758242783773, "grad_norm": 0.18259100878406553, "learning_rate": 9.954089241907933e-05, "loss": 2.0971, "step": 5570 }, { "epoch": 0.04332535937261831, "grad_norm": 0.14477154164200978, "learning_rate": 9.954072721377047e-05, "loss": 2.0931, "step": 5571 }, { "epoch": 0.04333313631739889, "grad_norm": 0.23608362420445203, "learning_rate": 9.954056197888035e-05, "loss": 2.0705, "step": 5572 }, { "epoch": 0.043340913262179474, "grad_norm": 0.23995237643395145, "learning_rate": 9.954039671440907e-05, "loss": 2.0835, "step": 5573 }, { "epoch": 0.043348690206960055, "grad_norm": 0.13534050971044534, "learning_rate": 9.954023142035673e-05, "loss": 2.0526, "step": 5574 }, { "epoch": 0.04335646715174064, "grad_norm": 0.23504586767277869, "learning_rate": 9.954006609672344e-05, "loss": 2.0984, "step": 5575 }, { "epoch": 0.04336424409652122, "grad_norm": 0.28022920188881867, "learning_rate": 9.953990074350928e-05, "loss": 2.0638, "step": 5576 }, { "epoch": 0.0433720210413018, "grad_norm": 0.33543087224642787, "learning_rate": 9.953973536071435e-05, "loss": 2.1553, "step": 5577 }, { "epoch": 0.04337979798608238, "grad_norm": 0.29763583389527704, "learning_rate": 9.953956994833878e-05, "loss": 2.1297, "step": 5578 }, { "epoch": 0.04338757493086296, "grad_norm": 0.3422638488133555, "learning_rate": 9.953940450638261e-05, "loss": 2.097, "step": 5579 }, { "epoch": 0.04339535187564354, "grad_norm": 0.15554851236064773, "learning_rate": 9.9539239034846e-05, "loss": 2.0357, "step": 5580 }, { "epoch": 0.043403128820424124, "grad_norm": 0.24997401971386368, "learning_rate": 9.953907353372902e-05, "loss": 2.0861, "step": 5581 }, { "epoch": 0.043410905765204705, "grad_norm": 0.19753052764910678, "learning_rate": 9.953890800303177e-05, "loss": 2.0768, "step": 5582 }, { "epoch": 0.043418682709985286, "grad_norm": 0.17356877463842807, "learning_rate": 9.953874244275434e-05, "loss": 2.1117, "step": 5583 }, { "epoch": 0.04342645965476587, "grad_norm": 0.22057190967029325, "learning_rate": 9.953857685289684e-05, "loss": 2.1496, "step": 5584 }, { "epoch": 0.04343423659954645, "grad_norm": 0.13099866203213573, "learning_rate": 9.953841123345938e-05, "loss": 2.0993, "step": 5585 }, { "epoch": 0.04344201354432703, "grad_norm": 0.16839616101222615, "learning_rate": 9.953824558444202e-05, "loss": 2.0328, "step": 5586 }, { "epoch": 0.04344979048910761, "grad_norm": 0.12985325595981195, "learning_rate": 9.953807990584492e-05, "loss": 2.0357, "step": 5587 }, { "epoch": 0.04345756743388819, "grad_norm": 0.19019031734656164, "learning_rate": 9.953791419766812e-05, "loss": 2.0718, "step": 5588 }, { "epoch": 0.04346534437866877, "grad_norm": 0.13688621742751414, "learning_rate": 9.953774845991176e-05, "loss": 2.0353, "step": 5589 }, { "epoch": 0.043473121323449354, "grad_norm": 0.15712547203985136, "learning_rate": 9.953758269257592e-05, "loss": 2.1025, "step": 5590 }, { "epoch": 0.043480898268229935, "grad_norm": 0.12219552457430126, "learning_rate": 9.953741689566069e-05, "loss": 2.0249, "step": 5591 }, { "epoch": 0.043488675213010516, "grad_norm": 0.13983331040724234, "learning_rate": 9.953725106916618e-05, "loss": 2.0463, "step": 5592 }, { "epoch": 0.0434964521577911, "grad_norm": 0.12123147265478748, "learning_rate": 9.95370852130925e-05, "loss": 2.0564, "step": 5593 }, { "epoch": 0.04350422910257168, "grad_norm": 0.11910004029103341, "learning_rate": 9.953691932743974e-05, "loss": 2.0603, "step": 5594 }, { "epoch": 0.04351200604735226, "grad_norm": 0.11477821233857267, "learning_rate": 9.953675341220798e-05, "loss": 2.0843, "step": 5595 }, { "epoch": 0.04351978299213284, "grad_norm": 0.11771649470638759, "learning_rate": 9.953658746739735e-05, "loss": 2.0989, "step": 5596 }, { "epoch": 0.04352755993691342, "grad_norm": 0.12764046215847027, "learning_rate": 9.953642149300793e-05, "loss": 2.1092, "step": 5597 }, { "epoch": 0.043535336881694, "grad_norm": 0.24584431406143248, "learning_rate": 9.953625548903983e-05, "loss": 2.0828, "step": 5598 }, { "epoch": 0.043543113826474585, "grad_norm": 0.12334344713505437, "learning_rate": 9.953608945549315e-05, "loss": 2.0543, "step": 5599 }, { "epoch": 0.043550890771255166, "grad_norm": 0.1184503741547418, "learning_rate": 9.953592339236797e-05, "loss": 2.1312, "step": 5600 }, { "epoch": 0.04355866771603575, "grad_norm": 0.1687239831107152, "learning_rate": 9.95357572996644e-05, "loss": 2.0432, "step": 5601 }, { "epoch": 0.04356644466081633, "grad_norm": 0.12367482770302772, "learning_rate": 9.953559117738255e-05, "loss": 2.0628, "step": 5602 }, { "epoch": 0.04357422160559691, "grad_norm": 0.14462216526450727, "learning_rate": 9.953542502552251e-05, "loss": 2.0735, "step": 5603 }, { "epoch": 0.04358199855037749, "grad_norm": 0.1249607564012577, "learning_rate": 9.953525884408437e-05, "loss": 2.143, "step": 5604 }, { "epoch": 0.04358977549515807, "grad_norm": 0.18415373945633587, "learning_rate": 9.953509263306825e-05, "loss": 2.0692, "step": 5605 }, { "epoch": 0.04359755243993865, "grad_norm": 0.14710134027082403, "learning_rate": 9.953492639247423e-05, "loss": 2.0995, "step": 5606 }, { "epoch": 0.043605329384719234, "grad_norm": 0.1237442063590897, "learning_rate": 9.953476012230244e-05, "loss": 2.117, "step": 5607 }, { "epoch": 0.043613106329499815, "grad_norm": 0.12157210189870754, "learning_rate": 9.953459382255292e-05, "loss": 2.048, "step": 5608 }, { "epoch": 0.043620883274280396, "grad_norm": 0.27578186204757443, "learning_rate": 9.953442749322584e-05, "loss": 2.134, "step": 5609 }, { "epoch": 0.04362866021906098, "grad_norm": 0.11473556427756167, "learning_rate": 9.953426113432125e-05, "loss": 2.1049, "step": 5610 }, { "epoch": 0.04363643716384156, "grad_norm": 0.11545854948268278, "learning_rate": 9.953409474583925e-05, "loss": 2.1041, "step": 5611 }, { "epoch": 0.04364421410862214, "grad_norm": 0.11467634765612908, "learning_rate": 9.953392832777997e-05, "loss": 2.1153, "step": 5612 }, { "epoch": 0.04365199105340273, "grad_norm": 0.12847789962065323, "learning_rate": 9.95337618801435e-05, "loss": 2.1446, "step": 5613 }, { "epoch": 0.04365976799818331, "grad_norm": 0.12623285051696798, "learning_rate": 9.953359540292992e-05, "loss": 2.1143, "step": 5614 }, { "epoch": 0.04366754494296389, "grad_norm": 0.12452214388849342, "learning_rate": 9.953342889613934e-05, "loss": 2.0967, "step": 5615 }, { "epoch": 0.04367532188774447, "grad_norm": 0.11966172749175433, "learning_rate": 9.953326235977188e-05, "loss": 2.0954, "step": 5616 }, { "epoch": 0.04368309883252505, "grad_norm": 0.12431879571357589, "learning_rate": 9.95330957938276e-05, "loss": 2.0798, "step": 5617 }, { "epoch": 0.043690875777305634, "grad_norm": 0.1136415535837574, "learning_rate": 9.953292919830665e-05, "loss": 2.0165, "step": 5618 }, { "epoch": 0.043698652722086215, "grad_norm": 0.37232592447681545, "learning_rate": 9.953276257320907e-05, "loss": 2.0229, "step": 5619 }, { "epoch": 0.043706429666866796, "grad_norm": 0.13011713317483425, "learning_rate": 9.953259591853501e-05, "loss": 2.099, "step": 5620 }, { "epoch": 0.04371420661164738, "grad_norm": 0.17698501867752195, "learning_rate": 9.953242923428454e-05, "loss": 2.09, "step": 5621 }, { "epoch": 0.04372198355642796, "grad_norm": 0.1865768022721582, "learning_rate": 9.953226252045776e-05, "loss": 2.0793, "step": 5622 }, { "epoch": 0.04372976050120854, "grad_norm": 0.1211790250640529, "learning_rate": 9.95320957770548e-05, "loss": 2.0773, "step": 5623 }, { "epoch": 0.04373753744598912, "grad_norm": 0.1435191528963972, "learning_rate": 9.953192900407571e-05, "loss": 2.1059, "step": 5624 }, { "epoch": 0.0437453143907697, "grad_norm": 0.1197312400599008, "learning_rate": 9.953176220152063e-05, "loss": 2.1217, "step": 5625 }, { "epoch": 0.04375309133555028, "grad_norm": 0.1348621059342558, "learning_rate": 9.953159536938965e-05, "loss": 2.1179, "step": 5626 }, { "epoch": 0.043760868280330864, "grad_norm": 0.14473611473078132, "learning_rate": 9.953142850768286e-05, "loss": 2.0924, "step": 5627 }, { "epoch": 0.043768645225111445, "grad_norm": 0.13730308608062725, "learning_rate": 9.953126161640037e-05, "loss": 2.0456, "step": 5628 }, { "epoch": 0.043776422169892026, "grad_norm": 0.1324721634941693, "learning_rate": 9.953109469554227e-05, "loss": 2.097, "step": 5629 }, { "epoch": 0.04378419911467261, "grad_norm": 0.12993532819346731, "learning_rate": 9.953092774510869e-05, "loss": 2.0621, "step": 5630 }, { "epoch": 0.04379197605945319, "grad_norm": 0.1517465401083135, "learning_rate": 9.953076076509967e-05, "loss": 2.0612, "step": 5631 }, { "epoch": 0.04379975300423377, "grad_norm": 0.11262941807139974, "learning_rate": 9.953059375551537e-05, "loss": 2.0872, "step": 5632 }, { "epoch": 0.04380752994901435, "grad_norm": 0.16600095727940808, "learning_rate": 9.953042671635584e-05, "loss": 2.1176, "step": 5633 }, { "epoch": 0.04381530689379493, "grad_norm": 0.12206024306546122, "learning_rate": 9.953025964762122e-05, "loss": 2.0754, "step": 5634 }, { "epoch": 0.04382308383857551, "grad_norm": 0.14399279949366975, "learning_rate": 9.95300925493116e-05, "loss": 2.0847, "step": 5635 }, { "epoch": 0.043830860783356095, "grad_norm": 0.15415197806814704, "learning_rate": 9.952992542142707e-05, "loss": 2.1026, "step": 5636 }, { "epoch": 0.043838637728136676, "grad_norm": 0.12210778927343942, "learning_rate": 9.952975826396773e-05, "loss": 2.0677, "step": 5637 }, { "epoch": 0.04384641467291726, "grad_norm": 0.14411347258143387, "learning_rate": 9.952959107693368e-05, "loss": 2.0725, "step": 5638 }, { "epoch": 0.04385419161769784, "grad_norm": 0.14479503267106159, "learning_rate": 9.952942386032503e-05, "loss": 2.0956, "step": 5639 }, { "epoch": 0.04386196856247842, "grad_norm": 0.13758393588379325, "learning_rate": 9.952925661414187e-05, "loss": 2.0758, "step": 5640 }, { "epoch": 0.043869745507259, "grad_norm": 0.11893238847395987, "learning_rate": 9.95290893383843e-05, "loss": 2.1042, "step": 5641 }, { "epoch": 0.04387752245203958, "grad_norm": 0.1287010129503243, "learning_rate": 9.952892203305243e-05, "loss": 2.0799, "step": 5642 }, { "epoch": 0.04388529939682016, "grad_norm": 0.12105390930341056, "learning_rate": 9.952875469814636e-05, "loss": 2.0321, "step": 5643 }, { "epoch": 0.043893076341600744, "grad_norm": 0.12353774410913032, "learning_rate": 9.952858733366616e-05, "loss": 2.0778, "step": 5644 }, { "epoch": 0.043900853286381325, "grad_norm": 0.140009592836269, "learning_rate": 9.952841993961196e-05, "loss": 2.1499, "step": 5645 }, { "epoch": 0.043908630231161906, "grad_norm": 0.1231318364747307, "learning_rate": 9.952825251598386e-05, "loss": 2.1262, "step": 5646 }, { "epoch": 0.04391640717594249, "grad_norm": 0.1167418850933133, "learning_rate": 9.952808506278195e-05, "loss": 2.0762, "step": 5647 }, { "epoch": 0.04392418412072307, "grad_norm": 0.14279779174779964, "learning_rate": 9.952791758000632e-05, "loss": 2.1006, "step": 5648 }, { "epoch": 0.04393196106550365, "grad_norm": 0.11966220068870657, "learning_rate": 9.95277500676571e-05, "loss": 2.0317, "step": 5649 }, { "epoch": 0.04393973801028423, "grad_norm": 0.12210321933479586, "learning_rate": 9.952758252573437e-05, "loss": 2.1243, "step": 5650 }, { "epoch": 0.04394751495506481, "grad_norm": 0.11428155415648258, "learning_rate": 9.952741495423823e-05, "loss": 2.1094, "step": 5651 }, { "epoch": 0.04395529189984539, "grad_norm": 0.11650227794797899, "learning_rate": 9.952724735316877e-05, "loss": 2.0461, "step": 5652 }, { "epoch": 0.043963068844625974, "grad_norm": 0.11183546119893409, "learning_rate": 9.952707972252612e-05, "loss": 2.0483, "step": 5653 }, { "epoch": 0.043970845789406555, "grad_norm": 0.11652372440281956, "learning_rate": 9.952691206231036e-05, "loss": 2.0547, "step": 5654 }, { "epoch": 0.04397862273418714, "grad_norm": 0.12518329512198412, "learning_rate": 9.952674437252158e-05, "loss": 2.0462, "step": 5655 }, { "epoch": 0.04398639967896772, "grad_norm": 0.11924324988578221, "learning_rate": 9.952657665315991e-05, "loss": 2.0744, "step": 5656 }, { "epoch": 0.0439941766237483, "grad_norm": 0.10944400922557884, "learning_rate": 9.952640890422542e-05, "loss": 2.1001, "step": 5657 }, { "epoch": 0.04400195356852888, "grad_norm": 0.11420670295839576, "learning_rate": 9.952624112571822e-05, "loss": 2.0945, "step": 5658 }, { "epoch": 0.04400973051330946, "grad_norm": 0.11582724979848068, "learning_rate": 9.952607331763844e-05, "loss": 2.0779, "step": 5659 }, { "epoch": 0.04401750745809004, "grad_norm": 0.12332867159300658, "learning_rate": 9.952590547998612e-05, "loss": 2.0994, "step": 5660 }, { "epoch": 0.044025284402870624, "grad_norm": 0.11633724212643562, "learning_rate": 9.952573761276142e-05, "loss": 2.0252, "step": 5661 }, { "epoch": 0.044033061347651205, "grad_norm": 0.12334384914711581, "learning_rate": 9.952556971596441e-05, "loss": 2.0918, "step": 5662 }, { "epoch": 0.044040838292431786, "grad_norm": 0.11920387687921097, "learning_rate": 9.952540178959517e-05, "loss": 2.0596, "step": 5663 }, { "epoch": 0.04404861523721237, "grad_norm": 0.13623721570812636, "learning_rate": 9.952523383365385e-05, "loss": 2.0856, "step": 5664 }, { "epoch": 0.04405639218199295, "grad_norm": 0.14660099370569293, "learning_rate": 9.952506584814052e-05, "loss": 2.0658, "step": 5665 }, { "epoch": 0.04406416912677353, "grad_norm": 0.11409149188755303, "learning_rate": 9.952489783305528e-05, "loss": 2.0511, "step": 5666 }, { "epoch": 0.04407194607155411, "grad_norm": 0.11676246707206615, "learning_rate": 9.952472978839824e-05, "loss": 2.0263, "step": 5667 }, { "epoch": 0.04407972301633469, "grad_norm": 0.11919028218247978, "learning_rate": 9.95245617141695e-05, "loss": 2.0609, "step": 5668 }, { "epoch": 0.04408749996111527, "grad_norm": 0.117972253552782, "learning_rate": 9.952439361036914e-05, "loss": 2.0863, "step": 5669 }, { "epoch": 0.044095276905895854, "grad_norm": 0.12057476725380778, "learning_rate": 9.95242254769973e-05, "loss": 2.0623, "step": 5670 }, { "epoch": 0.044103053850676435, "grad_norm": 0.11083432187299896, "learning_rate": 9.952405731405403e-05, "loss": 2.0838, "step": 5671 }, { "epoch": 0.04411083079545702, "grad_norm": 0.20980264857431524, "learning_rate": 9.952388912153948e-05, "loss": 2.109, "step": 5672 }, { "epoch": 0.044118607740237605, "grad_norm": 0.11432902478763805, "learning_rate": 9.952372089945371e-05, "loss": 2.1131, "step": 5673 }, { "epoch": 0.044126384685018186, "grad_norm": 0.11369384349992928, "learning_rate": 9.952355264779686e-05, "loss": 2.0286, "step": 5674 }, { "epoch": 0.04413416162979877, "grad_norm": 0.11087006676588643, "learning_rate": 9.9523384366569e-05, "loss": 2.0803, "step": 5675 }, { "epoch": 0.04414193857457935, "grad_norm": 0.11718364835733805, "learning_rate": 9.952321605577023e-05, "loss": 2.0886, "step": 5676 }, { "epoch": 0.04414971551935993, "grad_norm": 0.170052969231822, "learning_rate": 9.952304771540066e-05, "loss": 2.0456, "step": 5677 }, { "epoch": 0.04415749246414051, "grad_norm": 0.11684395823339941, "learning_rate": 9.952287934546041e-05, "loss": 2.0991, "step": 5678 }, { "epoch": 0.04416526940892109, "grad_norm": 0.15088177326981195, "learning_rate": 9.952271094594953e-05, "loss": 2.0498, "step": 5679 }, { "epoch": 0.04417304635370167, "grad_norm": 0.1574962775159719, "learning_rate": 9.952254251686818e-05, "loss": 2.0304, "step": 5680 }, { "epoch": 0.044180823298482254, "grad_norm": 0.13101996978015745, "learning_rate": 9.952237405821642e-05, "loss": 2.0077, "step": 5681 }, { "epoch": 0.044188600243262835, "grad_norm": 0.14272691759352965, "learning_rate": 9.952220556999436e-05, "loss": 2.0664, "step": 5682 }, { "epoch": 0.044196377188043416, "grad_norm": 0.13078327022624706, "learning_rate": 9.952203705220211e-05, "loss": 2.1023, "step": 5683 }, { "epoch": 0.044204154132824, "grad_norm": 0.12228669139216544, "learning_rate": 9.952186850483975e-05, "loss": 2.0395, "step": 5684 }, { "epoch": 0.04421193107760458, "grad_norm": 0.12578816940477391, "learning_rate": 9.95216999279074e-05, "loss": 2.0752, "step": 5685 }, { "epoch": 0.04421970802238516, "grad_norm": 0.12240041704967405, "learning_rate": 9.952153132140518e-05, "loss": 2.1248, "step": 5686 }, { "epoch": 0.04422748496716574, "grad_norm": 0.13212344745594531, "learning_rate": 9.952136268533314e-05, "loss": 2.1382, "step": 5687 }, { "epoch": 0.04423526191194632, "grad_norm": 0.11600030326683217, "learning_rate": 9.952119401969141e-05, "loss": 2.0936, "step": 5688 }, { "epoch": 0.0442430388567269, "grad_norm": 0.11797700988190422, "learning_rate": 9.952102532448009e-05, "loss": 2.0735, "step": 5689 }, { "epoch": 0.044250815801507484, "grad_norm": 0.12784499394865034, "learning_rate": 9.952085659969928e-05, "loss": 2.1395, "step": 5690 }, { "epoch": 0.044258592746288065, "grad_norm": 0.2958996820427068, "learning_rate": 9.952068784534907e-05, "loss": 2.049, "step": 5691 }, { "epoch": 0.04426636969106865, "grad_norm": 0.11367955939123495, "learning_rate": 9.952051906142958e-05, "loss": 2.0766, "step": 5692 }, { "epoch": 0.04427414663584923, "grad_norm": 0.1216564942964201, "learning_rate": 9.952035024794089e-05, "loss": 2.0267, "step": 5693 }, { "epoch": 0.04428192358062981, "grad_norm": 0.11458962415290387, "learning_rate": 9.952018140488312e-05, "loss": 2.1045, "step": 5694 }, { "epoch": 0.04428970052541039, "grad_norm": 0.2648431416849606, "learning_rate": 9.952001253225636e-05, "loss": 2.0927, "step": 5695 }, { "epoch": 0.04429747747019097, "grad_norm": 0.1446957317815595, "learning_rate": 9.95198436300607e-05, "loss": 2.078, "step": 5696 }, { "epoch": 0.04430525441497155, "grad_norm": 0.11831056002517575, "learning_rate": 9.951967469829626e-05, "loss": 2.0609, "step": 5697 }, { "epoch": 0.044313031359752134, "grad_norm": 0.11796939653745574, "learning_rate": 9.951950573696315e-05, "loss": 2.0632, "step": 5698 }, { "epoch": 0.044320808304532715, "grad_norm": 0.45352174676430074, "learning_rate": 9.951933674606144e-05, "loss": 2.0938, "step": 5699 }, { "epoch": 0.044328585249313296, "grad_norm": 0.11658351106470578, "learning_rate": 9.951916772559125e-05, "loss": 2.1274, "step": 5700 }, { "epoch": 0.04433636219409388, "grad_norm": 0.14048620810703463, "learning_rate": 9.951899867555267e-05, "loss": 2.1167, "step": 5701 }, { "epoch": 0.04434413913887446, "grad_norm": 0.13074822882191794, "learning_rate": 9.951882959594583e-05, "loss": 2.0679, "step": 5702 }, { "epoch": 0.04435191608365504, "grad_norm": 0.12141292383081238, "learning_rate": 9.951866048677079e-05, "loss": 2.0703, "step": 5703 }, { "epoch": 0.04435969302843562, "grad_norm": 0.15034721828306838, "learning_rate": 9.951849134802766e-05, "loss": 2.0479, "step": 5704 }, { "epoch": 0.0443674699732162, "grad_norm": 0.1547191630078198, "learning_rate": 9.951832217971657e-05, "loss": 2.1227, "step": 5705 }, { "epoch": 0.04437524691799678, "grad_norm": 0.11450225139239206, "learning_rate": 9.95181529818376e-05, "loss": 2.0678, "step": 5706 }, { "epoch": 0.044383023862777364, "grad_norm": 0.13508454456921246, "learning_rate": 9.951798375439086e-05, "loss": 2.0469, "step": 5707 }, { "epoch": 0.044390800807557945, "grad_norm": 0.1433808701898049, "learning_rate": 9.951781449737642e-05, "loss": 2.1126, "step": 5708 }, { "epoch": 0.044398577752338526, "grad_norm": 0.11891778528805537, "learning_rate": 9.951764521079442e-05, "loss": 2.0711, "step": 5709 }, { "epoch": 0.04440635469711911, "grad_norm": 0.1306520055600139, "learning_rate": 9.951747589464496e-05, "loss": 2.0466, "step": 5710 }, { "epoch": 0.04441413164189969, "grad_norm": 0.11514720731920314, "learning_rate": 9.95173065489281e-05, "loss": 2.11, "step": 5711 }, { "epoch": 0.04442190858668027, "grad_norm": 0.1339255445795203, "learning_rate": 9.951713717364397e-05, "loss": 2.0568, "step": 5712 }, { "epoch": 0.04442968553146085, "grad_norm": 0.1198343791851181, "learning_rate": 9.951696776879269e-05, "loss": 2.0767, "step": 5713 }, { "epoch": 0.04443746247624143, "grad_norm": 0.12418712318718725, "learning_rate": 9.951679833437433e-05, "loss": 2.1111, "step": 5714 }, { "epoch": 0.04444523942102201, "grad_norm": 0.1335372595813112, "learning_rate": 9.951662887038899e-05, "loss": 2.0764, "step": 5715 }, { "epoch": 0.044453016365802595, "grad_norm": 0.11261259366965576, "learning_rate": 9.95164593768368e-05, "loss": 2.0644, "step": 5716 }, { "epoch": 0.044460793310583176, "grad_norm": 0.629202003686183, "learning_rate": 9.951628985371785e-05, "loss": 2.0617, "step": 5717 }, { "epoch": 0.04446857025536376, "grad_norm": 0.12667737498318088, "learning_rate": 9.951612030103223e-05, "loss": 2.0365, "step": 5718 }, { "epoch": 0.04447634720014434, "grad_norm": 0.12843495522963255, "learning_rate": 9.951595071878004e-05, "loss": 2.1158, "step": 5719 }, { "epoch": 0.04448412414492492, "grad_norm": 0.11992974933098806, "learning_rate": 9.951578110696138e-05, "loss": 2.1085, "step": 5720 }, { "epoch": 0.0444919010897055, "grad_norm": 0.11881901419161316, "learning_rate": 9.951561146557637e-05, "loss": 2.0862, "step": 5721 }, { "epoch": 0.04449967803448608, "grad_norm": 0.13608312599254713, "learning_rate": 9.95154417946251e-05, "loss": 2.035, "step": 5722 }, { "epoch": 0.04450745497926666, "grad_norm": 0.12011653418699943, "learning_rate": 9.951527209410767e-05, "loss": 2.063, "step": 5723 }, { "epoch": 0.044515231924047244, "grad_norm": 0.11732969370541796, "learning_rate": 9.95151023640242e-05, "loss": 2.0228, "step": 5724 }, { "epoch": 0.044523008868827825, "grad_norm": 0.11297947229773989, "learning_rate": 9.951493260437474e-05, "loss": 2.034, "step": 5725 }, { "epoch": 0.044530785813608406, "grad_norm": 0.11850983485974606, "learning_rate": 9.951476281515945e-05, "loss": 2.0584, "step": 5726 }, { "epoch": 0.04453856275838899, "grad_norm": 0.11404838903737435, "learning_rate": 9.951459299637841e-05, "loss": 2.0588, "step": 5727 }, { "epoch": 0.04454633970316957, "grad_norm": 0.1177287208041828, "learning_rate": 9.95144231480317e-05, "loss": 2.0935, "step": 5728 }, { "epoch": 0.04455411664795015, "grad_norm": 0.1276847190151009, "learning_rate": 9.951425327011946e-05, "loss": 2.1284, "step": 5729 }, { "epoch": 0.04456189359273073, "grad_norm": 0.12886638246854565, "learning_rate": 9.951408336264176e-05, "loss": 2.1055, "step": 5730 }, { "epoch": 0.04456967053751132, "grad_norm": 0.1364676502935947, "learning_rate": 9.951391342559873e-05, "loss": 2.0684, "step": 5731 }, { "epoch": 0.0445774474822919, "grad_norm": 0.13066944975662442, "learning_rate": 9.951374345899042e-05, "loss": 2.0663, "step": 5732 }, { "epoch": 0.04458522442707248, "grad_norm": 0.12309016805032506, "learning_rate": 9.9513573462817e-05, "loss": 2.0917, "step": 5733 }, { "epoch": 0.04459300137185306, "grad_norm": 0.13225918528967218, "learning_rate": 9.951340343707852e-05, "loss": 2.0471, "step": 5734 }, { "epoch": 0.044600778316633644, "grad_norm": 0.11536241096792371, "learning_rate": 9.951323338177511e-05, "loss": 2.0888, "step": 5735 }, { "epoch": 0.044608555261414225, "grad_norm": 0.13009952319481144, "learning_rate": 9.951306329690684e-05, "loss": 2.0698, "step": 5736 }, { "epoch": 0.044616332206194806, "grad_norm": 0.20461701306079305, "learning_rate": 9.951289318247386e-05, "loss": 2.0524, "step": 5737 }, { "epoch": 0.04462410915097539, "grad_norm": 0.11376279229046023, "learning_rate": 9.951272303847622e-05, "loss": 2.045, "step": 5738 }, { "epoch": 0.04463188609575597, "grad_norm": 0.1394082418487363, "learning_rate": 9.951255286491405e-05, "loss": 2.0672, "step": 5739 }, { "epoch": 0.04463966304053655, "grad_norm": 0.14045389624426546, "learning_rate": 9.951238266178746e-05, "loss": 2.0357, "step": 5740 }, { "epoch": 0.04464743998531713, "grad_norm": 0.11195777321495777, "learning_rate": 9.951221242909654e-05, "loss": 2.015, "step": 5741 }, { "epoch": 0.04465521693009771, "grad_norm": 0.12707946330272316, "learning_rate": 9.951204216684137e-05, "loss": 2.0809, "step": 5742 }, { "epoch": 0.04466299387487829, "grad_norm": 0.16876521406294323, "learning_rate": 9.951187187502208e-05, "loss": 2.0536, "step": 5743 }, { "epoch": 0.044670770819658874, "grad_norm": 0.3682477432236014, "learning_rate": 9.951170155363875e-05, "loss": 2.0981, "step": 5744 }, { "epoch": 0.044678547764439455, "grad_norm": 0.12012426396376825, "learning_rate": 9.951153120269152e-05, "loss": 2.0885, "step": 5745 }, { "epoch": 0.044686324709220036, "grad_norm": 0.11668822031204225, "learning_rate": 9.951136082218046e-05, "loss": 2.0383, "step": 5746 }, { "epoch": 0.04469410165400062, "grad_norm": 0.11992800791315558, "learning_rate": 9.951119041210568e-05, "loss": 2.0691, "step": 5747 }, { "epoch": 0.0447018785987812, "grad_norm": 0.11842622072468852, "learning_rate": 9.951101997246727e-05, "loss": 2.0868, "step": 5748 }, { "epoch": 0.04470965554356178, "grad_norm": 0.11780788452967744, "learning_rate": 9.951084950326534e-05, "loss": 2.0117, "step": 5749 }, { "epoch": 0.04471743248834236, "grad_norm": 0.11921697512877238, "learning_rate": 9.951067900450002e-05, "loss": 2.0097, "step": 5750 }, { "epoch": 0.04472520943312294, "grad_norm": 0.11617897716084165, "learning_rate": 9.951050847617137e-05, "loss": 2.1072, "step": 5751 }, { "epoch": 0.04473298637790352, "grad_norm": 0.11810396398599357, "learning_rate": 9.95103379182795e-05, "loss": 2.0282, "step": 5752 }, { "epoch": 0.044740763322684104, "grad_norm": 0.11036885833810087, "learning_rate": 9.951016733082452e-05, "loss": 2.06, "step": 5753 }, { "epoch": 0.044748540267464686, "grad_norm": 0.11619527226887325, "learning_rate": 9.950999671380654e-05, "loss": 1.9706, "step": 5754 }, { "epoch": 0.04475631721224527, "grad_norm": 0.1152974959404145, "learning_rate": 9.950982606722565e-05, "loss": 2.0706, "step": 5755 }, { "epoch": 0.04476409415702585, "grad_norm": 0.11478921262006674, "learning_rate": 9.950965539108195e-05, "loss": 2.1001, "step": 5756 }, { "epoch": 0.04477187110180643, "grad_norm": 0.12031631060094673, "learning_rate": 9.950948468537556e-05, "loss": 2.1165, "step": 5757 }, { "epoch": 0.04477964804658701, "grad_norm": 0.12041877204786049, "learning_rate": 9.950931395010657e-05, "loss": 2.0718, "step": 5758 }, { "epoch": 0.04478742499136759, "grad_norm": 0.11304143204326522, "learning_rate": 9.950914318527506e-05, "loss": 2.046, "step": 5759 }, { "epoch": 0.04479520193614817, "grad_norm": 0.11289459036672661, "learning_rate": 9.950897239088118e-05, "loss": 2.0097, "step": 5760 }, { "epoch": 0.044802978880928754, "grad_norm": 0.12115578062646784, "learning_rate": 9.950880156692497e-05, "loss": 2.0523, "step": 5761 }, { "epoch": 0.044810755825709335, "grad_norm": 0.11447331478508772, "learning_rate": 9.950863071340659e-05, "loss": 2.0626, "step": 5762 }, { "epoch": 0.044818532770489916, "grad_norm": 0.11281822472916424, "learning_rate": 9.950845983032612e-05, "loss": 2.0964, "step": 5763 }, { "epoch": 0.0448263097152705, "grad_norm": 0.13440203815008991, "learning_rate": 9.950828891768365e-05, "loss": 2.0761, "step": 5764 }, { "epoch": 0.04483408666005108, "grad_norm": 0.11453756849168321, "learning_rate": 9.95081179754793e-05, "loss": 2.0513, "step": 5765 }, { "epoch": 0.04484186360483166, "grad_norm": 0.12019094766837878, "learning_rate": 9.950794700371317e-05, "loss": 2.0279, "step": 5766 }, { "epoch": 0.04484964054961224, "grad_norm": 0.11897045465446537, "learning_rate": 9.950777600238535e-05, "loss": 2.1129, "step": 5767 }, { "epoch": 0.04485741749439282, "grad_norm": 0.40320259894281946, "learning_rate": 9.950760497149595e-05, "loss": 2.0429, "step": 5768 }, { "epoch": 0.0448651944391734, "grad_norm": 0.11756842206951815, "learning_rate": 9.950743391104507e-05, "loss": 2.1299, "step": 5769 }, { "epoch": 0.044872971383953984, "grad_norm": 0.12103659532490836, "learning_rate": 9.950726282103282e-05, "loss": 2.0837, "step": 5770 }, { "epoch": 0.044880748328734565, "grad_norm": 0.12242730345540215, "learning_rate": 9.950709170145929e-05, "loss": 2.0723, "step": 5771 }, { "epoch": 0.04488852527351515, "grad_norm": 0.13365714473558113, "learning_rate": 9.950692055232457e-05, "loss": 2.0715, "step": 5772 }, { "epoch": 0.04489630221829573, "grad_norm": 0.1612827700276024, "learning_rate": 9.95067493736288e-05, "loss": 2.0382, "step": 5773 }, { "epoch": 0.04490407916307631, "grad_norm": 0.12754613989814173, "learning_rate": 9.950657816537208e-05, "loss": 2.1301, "step": 5774 }, { "epoch": 0.04491185610785689, "grad_norm": 0.12856334539823666, "learning_rate": 9.950640692755446e-05, "loss": 2.0787, "step": 5775 }, { "epoch": 0.04491963305263747, "grad_norm": 0.15665467665101285, "learning_rate": 9.95062356601761e-05, "loss": 2.0293, "step": 5776 }, { "epoch": 0.04492740999741805, "grad_norm": 0.1230870546484603, "learning_rate": 9.950606436323706e-05, "loss": 2.0406, "step": 5777 }, { "epoch": 0.044935186942198634, "grad_norm": 0.13321720169934256, "learning_rate": 9.950589303673747e-05, "loss": 2.0927, "step": 5778 }, { "epoch": 0.044942963886979215, "grad_norm": 0.16512697824725983, "learning_rate": 9.950572168067743e-05, "loss": 2.0909, "step": 5779 }, { "epoch": 0.044950740831759796, "grad_norm": 0.13506886740654622, "learning_rate": 9.950555029505704e-05, "loss": 2.0763, "step": 5780 }, { "epoch": 0.04495851777654038, "grad_norm": 0.12738613330687368, "learning_rate": 9.950537887987638e-05, "loss": 2.0779, "step": 5781 }, { "epoch": 0.04496629472132096, "grad_norm": 0.15852712395615, "learning_rate": 9.950520743513559e-05, "loss": 2.0865, "step": 5782 }, { "epoch": 0.04497407166610154, "grad_norm": 0.13133799915850478, "learning_rate": 9.950503596083474e-05, "loss": 2.1067, "step": 5783 }, { "epoch": 0.04498184861088212, "grad_norm": 0.16690216249567408, "learning_rate": 9.950486445697394e-05, "loss": 2.0624, "step": 5784 }, { "epoch": 0.0449896255556627, "grad_norm": 0.14709223063476295, "learning_rate": 9.950469292355331e-05, "loss": 2.0102, "step": 5785 }, { "epoch": 0.04499740250044328, "grad_norm": 0.11610988630473822, "learning_rate": 9.950452136057293e-05, "loss": 2.0767, "step": 5786 }, { "epoch": 0.045005179445223864, "grad_norm": 0.16682742867057804, "learning_rate": 9.950434976803294e-05, "loss": 2.058, "step": 5787 }, { "epoch": 0.045012956390004445, "grad_norm": 0.21625670207124825, "learning_rate": 9.950417814593338e-05, "loss": 2.0924, "step": 5788 }, { "epoch": 0.045020733334785026, "grad_norm": 0.155754299128243, "learning_rate": 9.95040064942744e-05, "loss": 2.0205, "step": 5789 }, { "epoch": 0.04502851027956561, "grad_norm": 0.13050150852375392, "learning_rate": 9.95038348130561e-05, "loss": 2.0888, "step": 5790 }, { "epoch": 0.045036287224346196, "grad_norm": 0.19421841634363912, "learning_rate": 9.950366310227856e-05, "loss": 2.0677, "step": 5791 }, { "epoch": 0.04504406416912678, "grad_norm": 0.19586231375003182, "learning_rate": 9.95034913619419e-05, "loss": 2.118, "step": 5792 }, { "epoch": 0.04505184111390736, "grad_norm": 0.18454921972355384, "learning_rate": 9.950331959204621e-05, "loss": 2.0911, "step": 5793 }, { "epoch": 0.04505961805868794, "grad_norm": 0.17892558109870418, "learning_rate": 9.95031477925916e-05, "loss": 2.0285, "step": 5794 }, { "epoch": 0.04506739500346852, "grad_norm": 0.11719924539116097, "learning_rate": 9.950297596357818e-05, "loss": 2.1791, "step": 5795 }, { "epoch": 0.0450751719482491, "grad_norm": 0.15246030854439202, "learning_rate": 9.950280410500605e-05, "loss": 2.0677, "step": 5796 }, { "epoch": 0.04508294889302968, "grad_norm": 0.15421385142004898, "learning_rate": 9.95026322168753e-05, "loss": 2.0803, "step": 5797 }, { "epoch": 0.045090725837810264, "grad_norm": 0.1384958970776858, "learning_rate": 9.950246029918604e-05, "loss": 2.094, "step": 5798 }, { "epoch": 0.045098502782590845, "grad_norm": 0.15131889915460237, "learning_rate": 9.950228835193838e-05, "loss": 2.0861, "step": 5799 }, { "epoch": 0.045106279727371426, "grad_norm": 0.12831467121720605, "learning_rate": 9.95021163751324e-05, "loss": 2.0475, "step": 5800 }, { "epoch": 0.04511405667215201, "grad_norm": 0.12207290470043158, "learning_rate": 9.950194436876824e-05, "loss": 2.1127, "step": 5801 }, { "epoch": 0.04512183361693259, "grad_norm": 0.1324368885878962, "learning_rate": 9.950177233284596e-05, "loss": 2.1069, "step": 5802 }, { "epoch": 0.04512961056171317, "grad_norm": 0.11592881347936217, "learning_rate": 9.95016002673657e-05, "loss": 2.0655, "step": 5803 }, { "epoch": 0.04513738750649375, "grad_norm": 0.12642025162977477, "learning_rate": 9.950142817232754e-05, "loss": 2.0413, "step": 5804 }, { "epoch": 0.04514516445127433, "grad_norm": 0.12082887179687987, "learning_rate": 9.950125604773161e-05, "loss": 2.0652, "step": 5805 }, { "epoch": 0.04515294139605491, "grad_norm": 0.11698642431151435, "learning_rate": 9.950108389357798e-05, "loss": 2.0535, "step": 5806 }, { "epoch": 0.045160718340835494, "grad_norm": 0.12420640440074028, "learning_rate": 9.950091170986675e-05, "loss": 2.0751, "step": 5807 }, { "epoch": 0.045168495285616075, "grad_norm": 0.11514476474821146, "learning_rate": 9.950073949659804e-05, "loss": 2.1124, "step": 5808 }, { "epoch": 0.04517627223039666, "grad_norm": 0.1143895136306671, "learning_rate": 9.950056725377197e-05, "loss": 2.0977, "step": 5809 }, { "epoch": 0.04518404917517724, "grad_norm": 0.11711125343770683, "learning_rate": 9.950039498138861e-05, "loss": 2.055, "step": 5810 }, { "epoch": 0.04519182611995782, "grad_norm": 0.11645699636481308, "learning_rate": 9.950022267944807e-05, "loss": 2.0752, "step": 5811 }, { "epoch": 0.0451996030647384, "grad_norm": 0.11455294424553301, "learning_rate": 9.950005034795047e-05, "loss": 2.0131, "step": 5812 }, { "epoch": 0.04520738000951898, "grad_norm": 0.1148310696418116, "learning_rate": 9.94998779868959e-05, "loss": 2.0681, "step": 5813 }, { "epoch": 0.04521515695429956, "grad_norm": 0.11208456835736527, "learning_rate": 9.949970559628446e-05, "loss": 2.0779, "step": 5814 }, { "epoch": 0.045222933899080144, "grad_norm": 0.10999362269264021, "learning_rate": 9.949953317611627e-05, "loss": 2.1099, "step": 5815 }, { "epoch": 0.045230710843860725, "grad_norm": 0.1118543981641631, "learning_rate": 9.949936072639142e-05, "loss": 2.0556, "step": 5816 }, { "epoch": 0.045238487788641306, "grad_norm": 0.11327174397450855, "learning_rate": 9.949918824711e-05, "loss": 2.0579, "step": 5817 }, { "epoch": 0.04524626473342189, "grad_norm": 0.11936827020679458, "learning_rate": 9.949901573827214e-05, "loss": 2.0994, "step": 5818 }, { "epoch": 0.04525404167820247, "grad_norm": 0.11747292003051396, "learning_rate": 9.949884319987792e-05, "loss": 2.0643, "step": 5819 }, { "epoch": 0.04526181862298305, "grad_norm": 0.11722104100735117, "learning_rate": 9.949867063192746e-05, "loss": 2.0968, "step": 5820 }, { "epoch": 0.04526959556776363, "grad_norm": 0.11661548003587643, "learning_rate": 9.949849803442086e-05, "loss": 2.0828, "step": 5821 }, { "epoch": 0.04527737251254421, "grad_norm": 0.11874029668524358, "learning_rate": 9.949832540735822e-05, "loss": 2.0753, "step": 5822 }, { "epoch": 0.04528514945732479, "grad_norm": 0.11153218408677162, "learning_rate": 9.949815275073963e-05, "loss": 2.0436, "step": 5823 }, { "epoch": 0.045292926402105374, "grad_norm": 0.17482296782410758, "learning_rate": 9.949798006456522e-05, "loss": 2.0218, "step": 5824 }, { "epoch": 0.045300703346885955, "grad_norm": 0.11056429022655131, "learning_rate": 9.949780734883506e-05, "loss": 2.0534, "step": 5825 }, { "epoch": 0.045308480291666536, "grad_norm": 0.11711394078568359, "learning_rate": 9.949763460354929e-05, "loss": 2.0228, "step": 5826 }, { "epoch": 0.04531625723644712, "grad_norm": 0.11522224238560289, "learning_rate": 9.949746182870798e-05, "loss": 2.0531, "step": 5827 }, { "epoch": 0.0453240341812277, "grad_norm": 0.11249448930953555, "learning_rate": 9.949728902431125e-05, "loss": 2.0029, "step": 5828 }, { "epoch": 0.04533181112600828, "grad_norm": 0.11652537284795197, "learning_rate": 9.94971161903592e-05, "loss": 2.1044, "step": 5829 }, { "epoch": 0.04533958807078886, "grad_norm": 0.11724614913598805, "learning_rate": 9.949694332685195e-05, "loss": 2.0877, "step": 5830 }, { "epoch": 0.04534736501556944, "grad_norm": 0.12874046642193687, "learning_rate": 9.949677043378957e-05, "loss": 2.0485, "step": 5831 }, { "epoch": 0.04535514196035002, "grad_norm": 0.11244418535965267, "learning_rate": 9.949659751117218e-05, "loss": 2.0768, "step": 5832 }, { "epoch": 0.045362918905130604, "grad_norm": 0.12225500240695927, "learning_rate": 9.94964245589999e-05, "loss": 2.0373, "step": 5833 }, { "epoch": 0.045370695849911186, "grad_norm": 0.1166612265040394, "learning_rate": 9.94962515772728e-05, "loss": 2.0385, "step": 5834 }, { "epoch": 0.04537847279469177, "grad_norm": 0.12271448401838318, "learning_rate": 9.949607856599102e-05, "loss": 1.9999, "step": 5835 }, { "epoch": 0.04538624973947235, "grad_norm": 0.11803525963651183, "learning_rate": 9.949590552515464e-05, "loss": 2.0721, "step": 5836 }, { "epoch": 0.04539402668425293, "grad_norm": 0.12391150171081426, "learning_rate": 9.949573245476376e-05, "loss": 2.0681, "step": 5837 }, { "epoch": 0.04540180362903351, "grad_norm": 0.13446284204148695, "learning_rate": 9.949555935481848e-05, "loss": 2.1139, "step": 5838 }, { "epoch": 0.04540958057381409, "grad_norm": 0.11617582951072887, "learning_rate": 9.949538622531893e-05, "loss": 2.0845, "step": 5839 }, { "epoch": 0.04541735751859467, "grad_norm": 0.12985238069694588, "learning_rate": 9.949521306626519e-05, "loss": 2.0855, "step": 5840 }, { "epoch": 0.045425134463375254, "grad_norm": 0.13034977945636445, "learning_rate": 9.949503987765736e-05, "loss": 2.0108, "step": 5841 }, { "epoch": 0.045432911408155835, "grad_norm": 0.12749296212793026, "learning_rate": 9.949486665949557e-05, "loss": 2.095, "step": 5842 }, { "epoch": 0.045440688352936416, "grad_norm": 0.12449622533316315, "learning_rate": 9.94946934117799e-05, "loss": 2.088, "step": 5843 }, { "epoch": 0.045448465297717, "grad_norm": 0.12387004636453824, "learning_rate": 9.949452013451047e-05, "loss": 2.1074, "step": 5844 }, { "epoch": 0.04545624224249758, "grad_norm": 0.21113633580142732, "learning_rate": 9.949434682768737e-05, "loss": 2.0539, "step": 5845 }, { "epoch": 0.04546401918727816, "grad_norm": 0.12125587186892953, "learning_rate": 9.949417349131072e-05, "loss": 2.0472, "step": 5846 }, { "epoch": 0.04547179613205874, "grad_norm": 0.12725314088239312, "learning_rate": 9.949400012538059e-05, "loss": 2.0626, "step": 5847 }, { "epoch": 0.04547957307683932, "grad_norm": 0.1212533950583352, "learning_rate": 9.949382672989711e-05, "loss": 2.0886, "step": 5848 }, { "epoch": 0.0454873500216199, "grad_norm": 0.11381642086480794, "learning_rate": 9.949365330486038e-05, "loss": 2.1064, "step": 5849 }, { "epoch": 0.04549512696640049, "grad_norm": 0.12689311098028555, "learning_rate": 9.949347985027051e-05, "loss": 2.1039, "step": 5850 }, { "epoch": 0.04550290391118107, "grad_norm": 0.1318225300917313, "learning_rate": 9.94933063661276e-05, "loss": 2.1117, "step": 5851 }, { "epoch": 0.045510680855961654, "grad_norm": 0.13078798811533196, "learning_rate": 9.949313285243172e-05, "loss": 2.0139, "step": 5852 }, { "epoch": 0.045518457800742235, "grad_norm": 0.24644548863576524, "learning_rate": 9.949295930918304e-05, "loss": 2.0426, "step": 5853 }, { "epoch": 0.045526234745522816, "grad_norm": 0.11720994481424851, "learning_rate": 9.949278573638159e-05, "loss": 2.0419, "step": 5854 }, { "epoch": 0.0455340116903034, "grad_norm": 0.11325398485344769, "learning_rate": 9.949261213402754e-05, "loss": 2.1399, "step": 5855 }, { "epoch": 0.04554178863508398, "grad_norm": 0.1185916442492441, "learning_rate": 9.949243850212094e-05, "loss": 2.1024, "step": 5856 }, { "epoch": 0.04554956557986456, "grad_norm": 0.1174319811474369, "learning_rate": 9.949226484066194e-05, "loss": 2.0952, "step": 5857 }, { "epoch": 0.04555734252464514, "grad_norm": 0.12007394614795712, "learning_rate": 9.94920911496506e-05, "loss": 2.0752, "step": 5858 }, { "epoch": 0.04556511946942572, "grad_norm": 0.12087902173166791, "learning_rate": 9.949191742908706e-05, "loss": 2.0247, "step": 5859 }, { "epoch": 0.0455728964142063, "grad_norm": 0.11761767682677542, "learning_rate": 9.94917436789714e-05, "loss": 2.0989, "step": 5860 }, { "epoch": 0.045580673358986884, "grad_norm": 0.12505738201256011, "learning_rate": 9.949156989930373e-05, "loss": 2.0989, "step": 5861 }, { "epoch": 0.045588450303767465, "grad_norm": 0.1379217502927547, "learning_rate": 9.949139609008417e-05, "loss": 2.0424, "step": 5862 }, { "epoch": 0.045596227248548046, "grad_norm": 0.1252295646018116, "learning_rate": 9.94912222513128e-05, "loss": 2.0348, "step": 5863 }, { "epoch": 0.04560400419332863, "grad_norm": 0.12114440354040687, "learning_rate": 9.949104838298973e-05, "loss": 2.057, "step": 5864 }, { "epoch": 0.04561178113810921, "grad_norm": 0.11452533574905426, "learning_rate": 9.949087448511507e-05, "loss": 2.0735, "step": 5865 }, { "epoch": 0.04561955808288979, "grad_norm": 0.12008173887172006, "learning_rate": 9.949070055768892e-05, "loss": 2.0146, "step": 5866 }, { "epoch": 0.04562733502767037, "grad_norm": 0.11966219643141594, "learning_rate": 9.94905266007114e-05, "loss": 2.1193, "step": 5867 }, { "epoch": 0.04563511197245095, "grad_norm": 0.11638483948988826, "learning_rate": 9.949035261418257e-05, "loss": 2.1283, "step": 5868 }, { "epoch": 0.04564288891723153, "grad_norm": 0.11259892085963091, "learning_rate": 9.949017859810259e-05, "loss": 2.0158, "step": 5869 }, { "epoch": 0.045650665862012114, "grad_norm": 0.11459854487296357, "learning_rate": 9.949000455247153e-05, "loss": 2.1632, "step": 5870 }, { "epoch": 0.045658442806792696, "grad_norm": 0.12043891192136877, "learning_rate": 9.94898304772895e-05, "loss": 2.0513, "step": 5871 }, { "epoch": 0.04566621975157328, "grad_norm": 0.12086288002559664, "learning_rate": 9.94896563725566e-05, "loss": 2.079, "step": 5872 }, { "epoch": 0.04567399669635386, "grad_norm": 0.11398391594680479, "learning_rate": 9.948948223827295e-05, "loss": 2.0637, "step": 5873 }, { "epoch": 0.04568177364113444, "grad_norm": 0.21192335524231865, "learning_rate": 9.948930807443862e-05, "loss": 2.095, "step": 5874 }, { "epoch": 0.04568955058591502, "grad_norm": 0.12341918896908859, "learning_rate": 9.948913388105377e-05, "loss": 2.1084, "step": 5875 }, { "epoch": 0.0456973275306956, "grad_norm": 0.1175642406818771, "learning_rate": 9.948895965811844e-05, "loss": 2.0546, "step": 5876 }, { "epoch": 0.04570510447547618, "grad_norm": 0.11552621166531561, "learning_rate": 9.948878540563278e-05, "loss": 1.9694, "step": 5877 }, { "epoch": 0.045712881420256764, "grad_norm": 0.13800594551977097, "learning_rate": 9.948861112359687e-05, "loss": 2.04, "step": 5878 }, { "epoch": 0.045720658365037345, "grad_norm": 0.13441789322213224, "learning_rate": 9.948843681201082e-05, "loss": 2.0465, "step": 5879 }, { "epoch": 0.045728435309817926, "grad_norm": 0.12077964724987592, "learning_rate": 9.948826247087475e-05, "loss": 2.1216, "step": 5880 }, { "epoch": 0.04573621225459851, "grad_norm": 0.12807898331247053, "learning_rate": 9.948808810018875e-05, "loss": 2.0977, "step": 5881 }, { "epoch": 0.04574398919937909, "grad_norm": 0.15366736062698133, "learning_rate": 9.948791369995292e-05, "loss": 2.1406, "step": 5882 }, { "epoch": 0.04575176614415967, "grad_norm": 0.13376486983786845, "learning_rate": 9.948773927016736e-05, "loss": 2.0744, "step": 5883 }, { "epoch": 0.04575954308894025, "grad_norm": 0.12379220787266645, "learning_rate": 9.94875648108322e-05, "loss": 2.0267, "step": 5884 }, { "epoch": 0.04576732003372083, "grad_norm": 0.1770047185746632, "learning_rate": 9.948739032194751e-05, "loss": 2.1105, "step": 5885 }, { "epoch": 0.04577509697850141, "grad_norm": 0.15759190827336358, "learning_rate": 9.948721580351342e-05, "loss": 2.077, "step": 5886 }, { "epoch": 0.045782873923281994, "grad_norm": 0.11155049240421487, "learning_rate": 9.948704125553003e-05, "loss": 2.0297, "step": 5887 }, { "epoch": 0.045790650868062575, "grad_norm": 0.1317038640556408, "learning_rate": 9.948686667799744e-05, "loss": 2.0825, "step": 5888 }, { "epoch": 0.04579842781284316, "grad_norm": 0.12965343711877783, "learning_rate": 9.948669207091574e-05, "loss": 2.0308, "step": 5889 }, { "epoch": 0.04580620475762374, "grad_norm": 0.11132759192630673, "learning_rate": 9.948651743428506e-05, "loss": 2.0612, "step": 5890 }, { "epoch": 0.04581398170240432, "grad_norm": 0.12239648913062022, "learning_rate": 9.94863427681055e-05, "loss": 2.062, "step": 5891 }, { "epoch": 0.0458217586471849, "grad_norm": 0.11944074769582606, "learning_rate": 9.948616807237714e-05, "loss": 2.0967, "step": 5892 }, { "epoch": 0.04582953559196548, "grad_norm": 0.11625800403631108, "learning_rate": 9.948599334710012e-05, "loss": 2.1136, "step": 5893 }, { "epoch": 0.04583731253674606, "grad_norm": 0.1495981973449099, "learning_rate": 9.948581859227453e-05, "loss": 2.0289, "step": 5894 }, { "epoch": 0.045845089481526644, "grad_norm": 0.12528086041502756, "learning_rate": 9.948564380790044e-05, "loss": 2.0733, "step": 5895 }, { "epoch": 0.045852866426307225, "grad_norm": 0.11083913054314742, "learning_rate": 9.948546899397801e-05, "loss": 2.1136, "step": 5896 }, { "epoch": 0.045860643371087806, "grad_norm": 0.12388960078438, "learning_rate": 9.948529415050731e-05, "loss": 2.0648, "step": 5897 }, { "epoch": 0.04586842031586839, "grad_norm": 0.1149928998481261, "learning_rate": 9.948511927748845e-05, "loss": 2.0953, "step": 5898 }, { "epoch": 0.04587619726064897, "grad_norm": 0.1284810219775292, "learning_rate": 9.948494437492155e-05, "loss": 2.0646, "step": 5899 }, { "epoch": 0.04588397420542955, "grad_norm": 0.15718577835483802, "learning_rate": 9.94847694428067e-05, "loss": 2.1103, "step": 5900 }, { "epoch": 0.04589175115021013, "grad_norm": 0.13081835486100035, "learning_rate": 9.9484594481144e-05, "loss": 2.001, "step": 5901 }, { "epoch": 0.04589952809499071, "grad_norm": 0.12912898434592152, "learning_rate": 9.948441948993355e-05, "loss": 2.1022, "step": 5902 }, { "epoch": 0.04590730503977129, "grad_norm": 0.19308409676086702, "learning_rate": 9.948424446917548e-05, "loss": 2.0612, "step": 5903 }, { "epoch": 0.045915081984551874, "grad_norm": 0.17845869234154624, "learning_rate": 9.948406941886987e-05, "loss": 2.0718, "step": 5904 }, { "epoch": 0.045922858929332455, "grad_norm": 0.11251745405156463, "learning_rate": 9.948389433901686e-05, "loss": 2.0147, "step": 5905 }, { "epoch": 0.045930635874113036, "grad_norm": 0.19197163298366, "learning_rate": 9.94837192296165e-05, "loss": 2.0513, "step": 5906 }, { "epoch": 0.04593841281889362, "grad_norm": 0.1799662435545276, "learning_rate": 9.948354409066893e-05, "loss": 2.1104, "step": 5907 }, { "epoch": 0.0459461897636742, "grad_norm": 0.11283865134857843, "learning_rate": 9.948336892217427e-05, "loss": 2.1071, "step": 5908 }, { "epoch": 0.04595396670845478, "grad_norm": 0.1741181288747634, "learning_rate": 9.948319372413257e-05, "loss": 2.0738, "step": 5909 }, { "epoch": 0.04596174365323537, "grad_norm": 0.15217124532049453, "learning_rate": 9.9483018496544e-05, "loss": 2.0423, "step": 5910 }, { "epoch": 0.04596952059801595, "grad_norm": 0.11570026662438039, "learning_rate": 9.94828432394086e-05, "loss": 2.0849, "step": 5911 }, { "epoch": 0.04597729754279653, "grad_norm": 0.1680956111925827, "learning_rate": 9.948266795272654e-05, "loss": 2.0429, "step": 5912 }, { "epoch": 0.04598507448757711, "grad_norm": 0.12695379170959634, "learning_rate": 9.948249263649787e-05, "loss": 2.0714, "step": 5913 }, { "epoch": 0.04599285143235769, "grad_norm": 0.11664131290914428, "learning_rate": 9.948231729072272e-05, "loss": 2.09, "step": 5914 }, { "epoch": 0.046000628377138274, "grad_norm": 0.1324861753867168, "learning_rate": 9.948214191540119e-05, "loss": 2.085, "step": 5915 }, { "epoch": 0.046008405321918855, "grad_norm": 0.1243733390860057, "learning_rate": 9.948196651053338e-05, "loss": 2.0724, "step": 5916 }, { "epoch": 0.046016182266699436, "grad_norm": 0.11156541356381657, "learning_rate": 9.948179107611941e-05, "loss": 2.1156, "step": 5917 }, { "epoch": 0.04602395921148002, "grad_norm": 0.12092361133909005, "learning_rate": 9.948161561215937e-05, "loss": 2.0567, "step": 5918 }, { "epoch": 0.0460317361562606, "grad_norm": 0.11474744043418635, "learning_rate": 9.948144011865337e-05, "loss": 2.053, "step": 5919 }, { "epoch": 0.04603951310104118, "grad_norm": 0.11653051788023529, "learning_rate": 9.948126459560152e-05, "loss": 2.1136, "step": 5920 }, { "epoch": 0.04604729004582176, "grad_norm": 0.11333405100069784, "learning_rate": 9.948108904300392e-05, "loss": 2.0693, "step": 5921 }, { "epoch": 0.04605506699060234, "grad_norm": 0.10900192492778074, "learning_rate": 9.948091346086067e-05, "loss": 2.0715, "step": 5922 }, { "epoch": 0.04606284393538292, "grad_norm": 0.1259650769436286, "learning_rate": 9.948073784917187e-05, "loss": 2.0797, "step": 5923 }, { "epoch": 0.046070620880163504, "grad_norm": 0.11706395180092849, "learning_rate": 9.948056220793765e-05, "loss": 2.0445, "step": 5924 }, { "epoch": 0.046078397824944085, "grad_norm": 0.11740700610375669, "learning_rate": 9.94803865371581e-05, "loss": 2.0167, "step": 5925 }, { "epoch": 0.04608617476972467, "grad_norm": 0.11352280797419238, "learning_rate": 9.94802108368333e-05, "loss": 2.0251, "step": 5926 }, { "epoch": 0.04609395171450525, "grad_norm": 0.14524507510458132, "learning_rate": 9.94800351069634e-05, "loss": 2.0732, "step": 5927 }, { "epoch": 0.04610172865928583, "grad_norm": 0.12724855927739967, "learning_rate": 9.947985934754848e-05, "loss": 2.0671, "step": 5928 }, { "epoch": 0.04610950560406641, "grad_norm": 0.11637563910262486, "learning_rate": 9.947968355858865e-05, "loss": 2.1156, "step": 5929 }, { "epoch": 0.04611728254884699, "grad_norm": 0.14745540537772697, "learning_rate": 9.947950774008401e-05, "loss": 2.024, "step": 5930 }, { "epoch": 0.04612505949362757, "grad_norm": 0.1439962605020495, "learning_rate": 9.947933189203466e-05, "loss": 2.0762, "step": 5931 }, { "epoch": 0.046132836438408154, "grad_norm": 0.11265707497664477, "learning_rate": 9.947915601444073e-05, "loss": 2.1078, "step": 5932 }, { "epoch": 0.046140613383188735, "grad_norm": 0.12833862884479671, "learning_rate": 9.94789801073023e-05, "loss": 2.0386, "step": 5933 }, { "epoch": 0.046148390327969316, "grad_norm": 0.1414600043251101, "learning_rate": 9.947880417061947e-05, "loss": 2.0659, "step": 5934 }, { "epoch": 0.0461561672727499, "grad_norm": 0.12087922916878323, "learning_rate": 9.947862820439238e-05, "loss": 2.1123, "step": 5935 }, { "epoch": 0.04616394421753048, "grad_norm": 0.12681421831995057, "learning_rate": 9.94784522086211e-05, "loss": 2.0732, "step": 5936 }, { "epoch": 0.04617172116231106, "grad_norm": 0.15511102389711326, "learning_rate": 9.947827618330576e-05, "loss": 2.0335, "step": 5937 }, { "epoch": 0.04617949810709164, "grad_norm": 0.12407158705197893, "learning_rate": 9.947810012844645e-05, "loss": 2.077, "step": 5938 }, { "epoch": 0.04618727505187222, "grad_norm": 0.16066456656447103, "learning_rate": 9.947792404404328e-05, "loss": 2.0173, "step": 5939 }, { "epoch": 0.0461950519966528, "grad_norm": 0.17180187126197588, "learning_rate": 9.947774793009634e-05, "loss": 2.0657, "step": 5940 }, { "epoch": 0.046202828941433384, "grad_norm": 0.1308525640637858, "learning_rate": 9.947757178660577e-05, "loss": 2.092, "step": 5941 }, { "epoch": 0.046210605886213965, "grad_norm": 0.1314264269185413, "learning_rate": 9.947739561357164e-05, "loss": 2.0899, "step": 5942 }, { "epoch": 0.046218382830994546, "grad_norm": 0.16202991699682098, "learning_rate": 9.947721941099408e-05, "loss": 2.0643, "step": 5943 }, { "epoch": 0.04622615977577513, "grad_norm": 0.1264243460961601, "learning_rate": 9.947704317887318e-05, "loss": 2.0475, "step": 5944 }, { "epoch": 0.04623393672055571, "grad_norm": 0.12198587047236914, "learning_rate": 9.947686691720906e-05, "loss": 2.1174, "step": 5945 }, { "epoch": 0.04624171366533629, "grad_norm": 0.12294774353107013, "learning_rate": 9.94766906260018e-05, "loss": 2.1245, "step": 5946 }, { "epoch": 0.04624949061011687, "grad_norm": 0.1201123672214559, "learning_rate": 9.947651430525152e-05, "loss": 2.0748, "step": 5947 }, { "epoch": 0.04625726755489745, "grad_norm": 0.1353050624770392, "learning_rate": 9.947633795495834e-05, "loss": 2.0957, "step": 5948 }, { "epoch": 0.04626504449967803, "grad_norm": 0.11400378148143105, "learning_rate": 9.947616157512234e-05, "loss": 2.072, "step": 5949 }, { "epoch": 0.046272821444458614, "grad_norm": 0.13860767827965448, "learning_rate": 9.947598516574364e-05, "loss": 2.0355, "step": 5950 }, { "epoch": 0.046280598389239196, "grad_norm": 0.13865493282008975, "learning_rate": 9.947580872682235e-05, "loss": 2.1222, "step": 5951 }, { "epoch": 0.04628837533401978, "grad_norm": 0.14449907587509261, "learning_rate": 9.947563225835855e-05, "loss": 2.0328, "step": 5952 }, { "epoch": 0.04629615227880036, "grad_norm": 0.11037712535179642, "learning_rate": 9.947545576035237e-05, "loss": 2.1075, "step": 5953 }, { "epoch": 0.04630392922358094, "grad_norm": 0.11969225860794484, "learning_rate": 9.94752792328039e-05, "loss": 2.0616, "step": 5954 }, { "epoch": 0.04631170616836152, "grad_norm": 0.16061962155367393, "learning_rate": 9.947510267571326e-05, "loss": 2.1077, "step": 5955 }, { "epoch": 0.0463194831131421, "grad_norm": 0.11198925735465559, "learning_rate": 9.947492608908056e-05, "loss": 2.1392, "step": 5956 }, { "epoch": 0.04632726005792268, "grad_norm": 0.12535273470544864, "learning_rate": 9.947474947290587e-05, "loss": 2.0168, "step": 5957 }, { "epoch": 0.046335037002703264, "grad_norm": 0.12052674495322804, "learning_rate": 9.947457282718935e-05, "loss": 2.0515, "step": 5958 }, { "epoch": 0.046342813947483845, "grad_norm": 0.11631561713384236, "learning_rate": 9.947439615193106e-05, "loss": 2.017, "step": 5959 }, { "epoch": 0.046350590892264426, "grad_norm": 0.11648214862392507, "learning_rate": 9.947421944713111e-05, "loss": 2.0573, "step": 5960 }, { "epoch": 0.04635836783704501, "grad_norm": 0.11493546358045478, "learning_rate": 9.947404271278963e-05, "loss": 2.0841, "step": 5961 }, { "epoch": 0.04636614478182559, "grad_norm": 0.11327834260740492, "learning_rate": 9.947386594890671e-05, "loss": 2.0129, "step": 5962 }, { "epoch": 0.04637392172660617, "grad_norm": 0.1189957100338574, "learning_rate": 9.947368915548245e-05, "loss": 2.0674, "step": 5963 }, { "epoch": 0.04638169867138675, "grad_norm": 0.11438953486565377, "learning_rate": 9.947351233251697e-05, "loss": 2.0992, "step": 5964 }, { "epoch": 0.04638947561616733, "grad_norm": 0.11652483921130966, "learning_rate": 9.947333548001036e-05, "loss": 2.0411, "step": 5965 }, { "epoch": 0.04639725256094791, "grad_norm": 0.12210227029938989, "learning_rate": 9.947315859796274e-05, "loss": 2.0643, "step": 5966 }, { "epoch": 0.046405029505728494, "grad_norm": 0.11398211738990374, "learning_rate": 9.947298168637423e-05, "loss": 2.0465, "step": 5967 }, { "epoch": 0.046412806450509075, "grad_norm": 0.11298307179655662, "learning_rate": 9.947280474524487e-05, "loss": 2.0389, "step": 5968 }, { "epoch": 0.046420583395289663, "grad_norm": 0.11352376005530708, "learning_rate": 9.947262777457485e-05, "loss": 2.0702, "step": 5969 }, { "epoch": 0.046428360340070245, "grad_norm": 0.11325467727105185, "learning_rate": 9.947245077436422e-05, "loss": 2.0651, "step": 5970 }, { "epoch": 0.046436137284850826, "grad_norm": 0.12043682431392784, "learning_rate": 9.94722737446131e-05, "loss": 2.0869, "step": 5971 }, { "epoch": 0.04644391422963141, "grad_norm": 0.12441908425386632, "learning_rate": 9.94720966853216e-05, "loss": 2.0771, "step": 5972 }, { "epoch": 0.04645169117441199, "grad_norm": 0.11983888386355598, "learning_rate": 9.947191959648983e-05, "loss": 2.0258, "step": 5973 }, { "epoch": 0.04645946811919257, "grad_norm": 0.19133036022561856, "learning_rate": 9.947174247811788e-05, "loss": 2.0488, "step": 5974 }, { "epoch": 0.04646724506397315, "grad_norm": 0.13508870140453877, "learning_rate": 9.947156533020588e-05, "loss": 2.1075, "step": 5975 }, { "epoch": 0.04647502200875373, "grad_norm": 0.13951547554565927, "learning_rate": 9.947138815275391e-05, "loss": 2.0357, "step": 5976 }, { "epoch": 0.04648279895353431, "grad_norm": 0.1128421021297627, "learning_rate": 9.947121094576209e-05, "loss": 2.0364, "step": 5977 }, { "epoch": 0.046490575898314894, "grad_norm": 0.1750473598535762, "learning_rate": 9.947103370923053e-05, "loss": 2.073, "step": 5978 }, { "epoch": 0.046498352843095475, "grad_norm": 0.1532189539755375, "learning_rate": 9.94708564431593e-05, "loss": 2.0742, "step": 5979 }, { "epoch": 0.046506129787876056, "grad_norm": 0.1393902207394874, "learning_rate": 9.947067914754856e-05, "loss": 2.0726, "step": 5980 }, { "epoch": 0.04651390673265664, "grad_norm": 0.27085917034005424, "learning_rate": 9.947050182239838e-05, "loss": 2.1053, "step": 5981 }, { "epoch": 0.04652168367743722, "grad_norm": 0.1517992578465455, "learning_rate": 9.947032446770888e-05, "loss": 2.0633, "step": 5982 }, { "epoch": 0.0465294606222178, "grad_norm": 0.13531074931912418, "learning_rate": 9.947014708348017e-05, "loss": 2.0945, "step": 5983 }, { "epoch": 0.04653723756699838, "grad_norm": 0.13043574552065215, "learning_rate": 9.946996966971234e-05, "loss": 2.1045, "step": 5984 }, { "epoch": 0.04654501451177896, "grad_norm": 0.15360419168270661, "learning_rate": 9.94697922264055e-05, "loss": 2.0471, "step": 5985 }, { "epoch": 0.04655279145655954, "grad_norm": 0.13762840335386559, "learning_rate": 9.946961475355976e-05, "loss": 2.0305, "step": 5986 }, { "epoch": 0.046560568401340124, "grad_norm": 0.12363240434036041, "learning_rate": 9.946943725117524e-05, "loss": 2.0715, "step": 5987 }, { "epoch": 0.046568345346120706, "grad_norm": 0.16566233059426427, "learning_rate": 9.946925971925201e-05, "loss": 2.118, "step": 5988 }, { "epoch": 0.04657612229090129, "grad_norm": 0.142782588938416, "learning_rate": 9.946908215779021e-05, "loss": 2.0511, "step": 5989 }, { "epoch": 0.04658389923568187, "grad_norm": 0.1208984470034649, "learning_rate": 9.946890456678992e-05, "loss": 2.0417, "step": 5990 }, { "epoch": 0.04659167618046245, "grad_norm": 0.22867673404354222, "learning_rate": 9.946872694625127e-05, "loss": 2.16, "step": 5991 }, { "epoch": 0.04659945312524303, "grad_norm": 0.1609940858581854, "learning_rate": 9.946854929617437e-05, "loss": 2.1273, "step": 5992 }, { "epoch": 0.04660723007002361, "grad_norm": 0.11979985393681034, "learning_rate": 9.946837161655929e-05, "loss": 2.036, "step": 5993 }, { "epoch": 0.04661500701480419, "grad_norm": 0.16904340855753952, "learning_rate": 9.946819390740617e-05, "loss": 2.0702, "step": 5994 }, { "epoch": 0.046622783959584774, "grad_norm": 0.16354598941807566, "learning_rate": 9.94680161687151e-05, "loss": 2.1043, "step": 5995 }, { "epoch": 0.046630560904365355, "grad_norm": 0.1985780473927748, "learning_rate": 9.946783840048619e-05, "loss": 2.0425, "step": 5996 }, { "epoch": 0.046638337849145936, "grad_norm": 0.13876101301140598, "learning_rate": 9.946766060271954e-05, "loss": 1.9587, "step": 5997 }, { "epoch": 0.04664611479392652, "grad_norm": 0.1395812895657225, "learning_rate": 9.946748277541528e-05, "loss": 2.0705, "step": 5998 }, { "epoch": 0.0466538917387071, "grad_norm": 0.11474455093563571, "learning_rate": 9.946730491857349e-05, "loss": 2.113, "step": 5999 }, { "epoch": 0.04666166868348768, "grad_norm": 0.15911541401836282, "learning_rate": 9.946712703219427e-05, "loss": 2.0557, "step": 6000 }, { "epoch": 0.04666944562826826, "grad_norm": 0.17165013960705058, "learning_rate": 9.946694911627776e-05, "loss": 2.0688, "step": 6001 }, { "epoch": 0.04667722257304884, "grad_norm": 0.12039566174654988, "learning_rate": 9.946677117082404e-05, "loss": 2.1165, "step": 6002 }, { "epoch": 0.04668499951782942, "grad_norm": 0.1232541224795124, "learning_rate": 9.946659319583322e-05, "loss": 2.128, "step": 6003 }, { "epoch": 0.046692776462610004, "grad_norm": 0.13953773774474462, "learning_rate": 9.946641519130542e-05, "loss": 1.9602, "step": 6004 }, { "epoch": 0.046700553407390585, "grad_norm": 0.1178134915514825, "learning_rate": 9.946623715724074e-05, "loss": 2.0691, "step": 6005 }, { "epoch": 0.046708330352171167, "grad_norm": 0.11967734921792648, "learning_rate": 9.946605909363927e-05, "loss": 2.0243, "step": 6006 }, { "epoch": 0.04671610729695175, "grad_norm": 0.11731572568795022, "learning_rate": 9.946588100050113e-05, "loss": 2.1013, "step": 6007 }, { "epoch": 0.04672388424173233, "grad_norm": 0.11859307469206137, "learning_rate": 9.946570287782643e-05, "loss": 2.0653, "step": 6008 }, { "epoch": 0.04673166118651291, "grad_norm": 0.1198039685463298, "learning_rate": 9.946552472561526e-05, "loss": 2.0235, "step": 6009 }, { "epoch": 0.04673943813129349, "grad_norm": 0.1343710121998838, "learning_rate": 9.946534654386775e-05, "loss": 2.0599, "step": 6010 }, { "epoch": 0.04674721507607407, "grad_norm": 0.11664631788957756, "learning_rate": 9.9465168332584e-05, "loss": 2.0649, "step": 6011 }, { "epoch": 0.046754992020854654, "grad_norm": 0.11223013131032648, "learning_rate": 9.94649900917641e-05, "loss": 2.0967, "step": 6012 }, { "epoch": 0.046762768965635235, "grad_norm": 0.11453097742371848, "learning_rate": 9.946481182140816e-05, "loss": 2.0516, "step": 6013 }, { "epoch": 0.046770545910415816, "grad_norm": 0.11358673563047104, "learning_rate": 9.946463352151632e-05, "loss": 2.0562, "step": 6014 }, { "epoch": 0.0467783228551964, "grad_norm": 0.12014256295351228, "learning_rate": 9.946445519208862e-05, "loss": 2.0511, "step": 6015 }, { "epoch": 0.04678609979997698, "grad_norm": 0.10953267363756858, "learning_rate": 9.946427683312525e-05, "loss": 2.0536, "step": 6016 }, { "epoch": 0.04679387674475756, "grad_norm": 0.12320746862636155, "learning_rate": 9.946409844462624e-05, "loss": 2.0461, "step": 6017 }, { "epoch": 0.04680165368953814, "grad_norm": 0.12477984722179765, "learning_rate": 9.946392002659174e-05, "loss": 2.0456, "step": 6018 }, { "epoch": 0.04680943063431872, "grad_norm": 0.11711983786234698, "learning_rate": 9.946374157902186e-05, "loss": 2.0282, "step": 6019 }, { "epoch": 0.0468172075790993, "grad_norm": 0.11699775264119419, "learning_rate": 9.946356310191666e-05, "loss": 2.0596, "step": 6020 }, { "epoch": 0.046824984523879884, "grad_norm": 0.121129337259683, "learning_rate": 9.946338459527631e-05, "loss": 2.1102, "step": 6021 }, { "epoch": 0.046832761468660465, "grad_norm": 0.1142633946282689, "learning_rate": 9.946320605910086e-05, "loss": 2.086, "step": 6022 }, { "epoch": 0.046840538413441046, "grad_norm": 0.1181337527200318, "learning_rate": 9.946302749339046e-05, "loss": 2.0491, "step": 6023 }, { "epoch": 0.04684831535822163, "grad_norm": 0.11690358068548501, "learning_rate": 9.946284889814521e-05, "loss": 2.0427, "step": 6024 }, { "epoch": 0.04685609230300221, "grad_norm": 0.11348606453136477, "learning_rate": 9.946267027336518e-05, "loss": 2.049, "step": 6025 }, { "epoch": 0.04686386924778279, "grad_norm": 0.11814702410191015, "learning_rate": 9.946249161905053e-05, "loss": 2.0643, "step": 6026 }, { "epoch": 0.04687164619256337, "grad_norm": 0.13353556316738174, "learning_rate": 9.946231293520132e-05, "loss": 2.0718, "step": 6027 }, { "epoch": 0.04687942313734396, "grad_norm": 0.12631036276280633, "learning_rate": 9.946213422181767e-05, "loss": 2.0331, "step": 6028 }, { "epoch": 0.04688720008212454, "grad_norm": 0.12397571286465985, "learning_rate": 9.94619554788997e-05, "loss": 2.074, "step": 6029 }, { "epoch": 0.04689497702690512, "grad_norm": 0.1261891047627695, "learning_rate": 9.94617767064475e-05, "loss": 2.1017, "step": 6030 }, { "epoch": 0.0469027539716857, "grad_norm": 0.11823594135987237, "learning_rate": 9.946159790446121e-05, "loss": 2.01, "step": 6031 }, { "epoch": 0.046910530916466284, "grad_norm": 0.12214894399047074, "learning_rate": 9.946141907294089e-05, "loss": 2.0405, "step": 6032 }, { "epoch": 0.046918307861246865, "grad_norm": 0.12331427816461775, "learning_rate": 9.946124021188668e-05, "loss": 2.0735, "step": 6033 }, { "epoch": 0.046926084806027446, "grad_norm": 0.11274150931421341, "learning_rate": 9.946106132129868e-05, "loss": 2.0795, "step": 6034 }, { "epoch": 0.04693386175080803, "grad_norm": 0.11635866017511454, "learning_rate": 9.946088240117697e-05, "loss": 2.0619, "step": 6035 }, { "epoch": 0.04694163869558861, "grad_norm": 0.11393451952073826, "learning_rate": 9.946070345152171e-05, "loss": 2.0687, "step": 6036 }, { "epoch": 0.04694941564036919, "grad_norm": 0.11875399472489265, "learning_rate": 9.946052447233297e-05, "loss": 2.0389, "step": 6037 }, { "epoch": 0.04695719258514977, "grad_norm": 0.14370654102120564, "learning_rate": 9.946034546361084e-05, "loss": 2.0891, "step": 6038 }, { "epoch": 0.04696496952993035, "grad_norm": 0.14477913980322443, "learning_rate": 9.946016642535547e-05, "loss": 2.0848, "step": 6039 }, { "epoch": 0.04697274647471093, "grad_norm": 0.1193896164722796, "learning_rate": 9.945998735756695e-05, "loss": 2.0409, "step": 6040 }, { "epoch": 0.046980523419491514, "grad_norm": 0.12546780039761013, "learning_rate": 9.945980826024537e-05, "loss": 2.1143, "step": 6041 }, { "epoch": 0.046988300364272095, "grad_norm": 0.1481840691621871, "learning_rate": 9.945962913339086e-05, "loss": 2.0761, "step": 6042 }, { "epoch": 0.046996077309052676, "grad_norm": 0.1381748405117692, "learning_rate": 9.945944997700353e-05, "loss": 2.0846, "step": 6043 }, { "epoch": 0.04700385425383326, "grad_norm": 0.119551589364498, "learning_rate": 9.945927079108346e-05, "loss": 2.0853, "step": 6044 }, { "epoch": 0.04701163119861384, "grad_norm": 0.11687402125089588, "learning_rate": 9.945909157563078e-05, "loss": 2.0334, "step": 6045 }, { "epoch": 0.04701940814339442, "grad_norm": 0.11215921942379893, "learning_rate": 9.945891233064557e-05, "loss": 2.0954, "step": 6046 }, { "epoch": 0.047027185088175, "grad_norm": 0.11718045182772693, "learning_rate": 9.945873305612799e-05, "loss": 2.0799, "step": 6047 }, { "epoch": 0.04703496203295558, "grad_norm": 0.13119980379878368, "learning_rate": 9.945855375207808e-05, "loss": 2.0191, "step": 6048 }, { "epoch": 0.047042738977736163, "grad_norm": 0.15673091319034615, "learning_rate": 9.945837441849599e-05, "loss": 2.0429, "step": 6049 }, { "epoch": 0.047050515922516745, "grad_norm": 0.1429256551260521, "learning_rate": 9.945819505538184e-05, "loss": 2.0564, "step": 6050 }, { "epoch": 0.047058292867297326, "grad_norm": 0.1129764366821004, "learning_rate": 9.945801566273568e-05, "loss": 1.9969, "step": 6051 }, { "epoch": 0.04706606981207791, "grad_norm": 0.12048097193015495, "learning_rate": 9.945783624055767e-05, "loss": 2.1106, "step": 6052 }, { "epoch": 0.04707384675685849, "grad_norm": 0.11626740238742539, "learning_rate": 9.94576567888479e-05, "loss": 2.0412, "step": 6053 }, { "epoch": 0.04708162370163907, "grad_norm": 0.7244709588414313, "learning_rate": 9.945747730760646e-05, "loss": 2.0631, "step": 6054 }, { "epoch": 0.04708940064641965, "grad_norm": 0.11896869589464226, "learning_rate": 9.945729779683349e-05, "loss": 2.0509, "step": 6055 }, { "epoch": 0.04709717759120023, "grad_norm": 0.11312241789044149, "learning_rate": 9.945711825652907e-05, "loss": 2.0847, "step": 6056 }, { "epoch": 0.04710495453598081, "grad_norm": 0.11529553603941001, "learning_rate": 9.945693868669332e-05, "loss": 2.0814, "step": 6057 }, { "epoch": 0.047112731480761394, "grad_norm": 0.12449758793818261, "learning_rate": 9.945675908732635e-05, "loss": 2.1097, "step": 6058 }, { "epoch": 0.047120508425541975, "grad_norm": 0.1294648086494306, "learning_rate": 9.945657945842827e-05, "loss": 2.0489, "step": 6059 }, { "epoch": 0.047128285370322556, "grad_norm": 0.11742761423117612, "learning_rate": 9.945639979999917e-05, "loss": 2.0177, "step": 6060 }, { "epoch": 0.04713606231510314, "grad_norm": 0.12221908617403898, "learning_rate": 9.945622011203917e-05, "loss": 2.041, "step": 6061 }, { "epoch": 0.04714383925988372, "grad_norm": 0.11914955983086165, "learning_rate": 9.945604039454836e-05, "loss": 2.0516, "step": 6062 }, { "epoch": 0.0471516162046643, "grad_norm": 0.11642114893460515, "learning_rate": 9.945586064752687e-05, "loss": 2.1184, "step": 6063 }, { "epoch": 0.04715939314944488, "grad_norm": 0.12389425419849534, "learning_rate": 9.945568087097478e-05, "loss": 2.1251, "step": 6064 }, { "epoch": 0.04716717009422546, "grad_norm": 0.1275231952251197, "learning_rate": 9.945550106489226e-05, "loss": 2.1182, "step": 6065 }, { "epoch": 0.04717494703900604, "grad_norm": 0.15060558403451738, "learning_rate": 9.945532122927934e-05, "loss": 2.0629, "step": 6066 }, { "epoch": 0.047182723983786624, "grad_norm": 0.14536508284273825, "learning_rate": 9.945514136413617e-05, "loss": 2.1029, "step": 6067 }, { "epoch": 0.047190500928567206, "grad_norm": 0.12097437582659695, "learning_rate": 9.945496146946284e-05, "loss": 2.0703, "step": 6068 }, { "epoch": 0.04719827787334779, "grad_norm": 0.16995436605360467, "learning_rate": 9.945478154525947e-05, "loss": 1.9789, "step": 6069 }, { "epoch": 0.04720605481812837, "grad_norm": 0.19054958436579256, "learning_rate": 9.945460159152615e-05, "loss": 2.053, "step": 6070 }, { "epoch": 0.04721383176290895, "grad_norm": 0.2732053080087463, "learning_rate": 9.945442160826303e-05, "loss": 2.1228, "step": 6071 }, { "epoch": 0.04722160870768953, "grad_norm": 0.1563074526595376, "learning_rate": 9.945424159547017e-05, "loss": 2.0426, "step": 6072 }, { "epoch": 0.04722938565247011, "grad_norm": 0.2021395124080678, "learning_rate": 9.94540615531477e-05, "loss": 2.1448, "step": 6073 }, { "epoch": 0.04723716259725069, "grad_norm": 0.1322235091605287, "learning_rate": 9.945388148129571e-05, "loss": 2.0879, "step": 6074 }, { "epoch": 0.047244939542031274, "grad_norm": 0.14659414941130086, "learning_rate": 9.945370137991432e-05, "loss": 2.0844, "step": 6075 }, { "epoch": 0.047252716486811855, "grad_norm": 0.17996630467790942, "learning_rate": 9.945352124900366e-05, "loss": 2.0473, "step": 6076 }, { "epoch": 0.047260493431592436, "grad_norm": 0.12638365536031948, "learning_rate": 9.945334108856379e-05, "loss": 2.0448, "step": 6077 }, { "epoch": 0.04726827037637302, "grad_norm": 0.14755806798256255, "learning_rate": 9.945316089859486e-05, "loss": 2.0533, "step": 6078 }, { "epoch": 0.0472760473211536, "grad_norm": 0.17049584935476522, "learning_rate": 9.945298067909695e-05, "loss": 2.1254, "step": 6079 }, { "epoch": 0.04728382426593418, "grad_norm": 0.12028228777778827, "learning_rate": 9.945280043007019e-05, "loss": 2.045, "step": 6080 }, { "epoch": 0.04729160121071476, "grad_norm": 0.17275216106507915, "learning_rate": 9.945262015151466e-05, "loss": 2.0864, "step": 6081 }, { "epoch": 0.04729937815549534, "grad_norm": 0.18515739740649115, "learning_rate": 9.945243984343049e-05, "loss": 2.0716, "step": 6082 }, { "epoch": 0.04730715510027592, "grad_norm": 0.11749327982965652, "learning_rate": 9.945225950581779e-05, "loss": 2.104, "step": 6083 }, { "epoch": 0.047314932045056504, "grad_norm": 0.17461999025949532, "learning_rate": 9.945207913867665e-05, "loss": 2.094, "step": 6084 }, { "epoch": 0.047322708989837085, "grad_norm": 0.2017918708596356, "learning_rate": 9.945189874200719e-05, "loss": 2.0933, "step": 6085 }, { "epoch": 0.047330485934617667, "grad_norm": 0.1334237346994892, "learning_rate": 9.945171831580951e-05, "loss": 2.0882, "step": 6086 }, { "epoch": 0.04733826287939825, "grad_norm": 0.13988809932621957, "learning_rate": 9.945153786008372e-05, "loss": 2.0672, "step": 6087 }, { "epoch": 0.047346039824178836, "grad_norm": 0.18373564756059166, "learning_rate": 9.945135737482993e-05, "loss": 2.0922, "step": 6088 }, { "epoch": 0.04735381676895942, "grad_norm": 0.12464287920446332, "learning_rate": 9.945117686004826e-05, "loss": 2.0537, "step": 6089 }, { "epoch": 0.04736159371374, "grad_norm": 0.1338812512417543, "learning_rate": 9.945099631573879e-05, "loss": 2.0945, "step": 6090 }, { "epoch": 0.04736937065852058, "grad_norm": 0.13134005928504144, "learning_rate": 9.945081574190164e-05, "loss": 2.0746, "step": 6091 }, { "epoch": 0.04737714760330116, "grad_norm": 0.12059930003487712, "learning_rate": 9.945063513853694e-05, "loss": 2.0257, "step": 6092 }, { "epoch": 0.04738492454808174, "grad_norm": 0.1415278850814147, "learning_rate": 9.945045450564477e-05, "loss": 2.0794, "step": 6093 }, { "epoch": 0.04739270149286232, "grad_norm": 0.12409837503257352, "learning_rate": 9.945027384322524e-05, "loss": 2.0918, "step": 6094 }, { "epoch": 0.047400478437642904, "grad_norm": 0.11516326280797523, "learning_rate": 9.945009315127847e-05, "loss": 2.0618, "step": 6095 }, { "epoch": 0.047408255382423485, "grad_norm": 0.13385779014195653, "learning_rate": 9.944991242980457e-05, "loss": 2.0623, "step": 6096 }, { "epoch": 0.047416032327204066, "grad_norm": 0.11110043833820432, "learning_rate": 9.944973167880362e-05, "loss": 2.112, "step": 6097 }, { "epoch": 0.04742380927198465, "grad_norm": 0.16807262064243453, "learning_rate": 9.944955089827576e-05, "loss": 2.0695, "step": 6098 }, { "epoch": 0.04743158621676523, "grad_norm": 0.17860257829115753, "learning_rate": 9.944937008822109e-05, "loss": 2.066, "step": 6099 }, { "epoch": 0.04743936316154581, "grad_norm": 0.11510234002725316, "learning_rate": 9.94491892486397e-05, "loss": 2.0314, "step": 6100 }, { "epoch": 0.04744714010632639, "grad_norm": 0.11478125176117508, "learning_rate": 9.944900837953172e-05, "loss": 2.1206, "step": 6101 }, { "epoch": 0.04745491705110697, "grad_norm": 0.11449346251510699, "learning_rate": 9.944882748089725e-05, "loss": 2.0993, "step": 6102 }, { "epoch": 0.04746269399588755, "grad_norm": 0.11478248913917587, "learning_rate": 9.94486465527364e-05, "loss": 2.0805, "step": 6103 }, { "epoch": 0.047470470940668134, "grad_norm": 0.11946528595139873, "learning_rate": 9.944846559504928e-05, "loss": 2.031, "step": 6104 }, { "epoch": 0.047478247885448716, "grad_norm": 0.12169081475269086, "learning_rate": 9.944828460783597e-05, "loss": 2.0556, "step": 6105 }, { "epoch": 0.0474860248302293, "grad_norm": 0.11952079269824202, "learning_rate": 9.944810359109663e-05, "loss": 2.0468, "step": 6106 }, { "epoch": 0.04749380177500988, "grad_norm": 0.11545122215768425, "learning_rate": 9.944792254483132e-05, "loss": 2.0789, "step": 6107 }, { "epoch": 0.04750157871979046, "grad_norm": 0.11176353691302034, "learning_rate": 9.944774146904019e-05, "loss": 2.0512, "step": 6108 }, { "epoch": 0.04750935566457104, "grad_norm": 0.1173744982431171, "learning_rate": 9.944756036372332e-05, "loss": 2.0296, "step": 6109 }, { "epoch": 0.04751713260935162, "grad_norm": 0.11227329694305133, "learning_rate": 9.944737922888081e-05, "loss": 2.1112, "step": 6110 }, { "epoch": 0.0475249095541322, "grad_norm": 0.11798189888183322, "learning_rate": 9.944719806451279e-05, "loss": 2.0267, "step": 6111 }, { "epoch": 0.047532686498912784, "grad_norm": 0.11606978268773828, "learning_rate": 9.944701687061937e-05, "loss": 2.052, "step": 6112 }, { "epoch": 0.047540463443693365, "grad_norm": 0.11557953353961323, "learning_rate": 9.944683564720064e-05, "loss": 2.0198, "step": 6113 }, { "epoch": 0.047548240388473946, "grad_norm": 0.11276786399746447, "learning_rate": 9.944665439425672e-05, "loss": 2.1129, "step": 6114 }, { "epoch": 0.04755601733325453, "grad_norm": 0.11556110446819239, "learning_rate": 9.944647311178773e-05, "loss": 2.1195, "step": 6115 }, { "epoch": 0.04756379427803511, "grad_norm": 0.12168485462461126, "learning_rate": 9.944629179979373e-05, "loss": 2.0559, "step": 6116 }, { "epoch": 0.04757157122281569, "grad_norm": 0.11633799515231756, "learning_rate": 9.944611045827487e-05, "loss": 2.1159, "step": 6117 }, { "epoch": 0.04757934816759627, "grad_norm": 0.12186372437594911, "learning_rate": 9.944592908723127e-05, "loss": 2.0948, "step": 6118 }, { "epoch": 0.04758712511237685, "grad_norm": 0.13976159553345663, "learning_rate": 9.944574768666301e-05, "loss": 2.0607, "step": 6119 }, { "epoch": 0.04759490205715743, "grad_norm": 0.12722108608055652, "learning_rate": 9.94455662565702e-05, "loss": 2.0717, "step": 6120 }, { "epoch": 0.047602679001938014, "grad_norm": 0.11878114303102345, "learning_rate": 9.944538479695298e-05, "loss": 1.9526, "step": 6121 }, { "epoch": 0.047610455946718595, "grad_norm": 0.12471751588119695, "learning_rate": 9.94452033078114e-05, "loss": 2.0501, "step": 6122 }, { "epoch": 0.047618232891499176, "grad_norm": 0.12255945257206685, "learning_rate": 9.944502178914564e-05, "loss": 2.0679, "step": 6123 }, { "epoch": 0.04762600983627976, "grad_norm": 0.1165810684314721, "learning_rate": 9.944484024095573e-05, "loss": 2.12, "step": 6124 }, { "epoch": 0.04763378678106034, "grad_norm": 0.12799065180843283, "learning_rate": 9.944465866324185e-05, "loss": 2.0685, "step": 6125 }, { "epoch": 0.04764156372584092, "grad_norm": 0.12311702478946378, "learning_rate": 9.944447705600405e-05, "loss": 2.0281, "step": 6126 }, { "epoch": 0.0476493406706215, "grad_norm": 0.11393054537454567, "learning_rate": 9.944429541924249e-05, "loss": 2.0284, "step": 6127 }, { "epoch": 0.04765711761540208, "grad_norm": 0.14388388412674855, "learning_rate": 9.944411375295724e-05, "loss": 2.0582, "step": 6128 }, { "epoch": 0.04766489456018266, "grad_norm": 0.15789784376730298, "learning_rate": 9.944393205714841e-05, "loss": 2.1022, "step": 6129 }, { "epoch": 0.047672671504963245, "grad_norm": 0.12787821941254573, "learning_rate": 9.944375033181615e-05, "loss": 2.0659, "step": 6130 }, { "epoch": 0.047680448449743826, "grad_norm": 0.12213175448543233, "learning_rate": 9.944356857696053e-05, "loss": 2.1041, "step": 6131 }, { "epoch": 0.04768822539452441, "grad_norm": 0.12811704310021838, "learning_rate": 9.944338679258166e-05, "loss": 2.1201, "step": 6132 }, { "epoch": 0.04769600233930499, "grad_norm": 0.11285796831783043, "learning_rate": 9.944320497867966e-05, "loss": 2.0425, "step": 6133 }, { "epoch": 0.04770377928408557, "grad_norm": 0.11862873926926133, "learning_rate": 9.944302313525465e-05, "loss": 2.0977, "step": 6134 }, { "epoch": 0.04771155622886615, "grad_norm": 0.1260011855462895, "learning_rate": 9.94428412623067e-05, "loss": 1.9984, "step": 6135 }, { "epoch": 0.04771933317364673, "grad_norm": 0.12244232270163714, "learning_rate": 9.944265935983594e-05, "loss": 2.0602, "step": 6136 }, { "epoch": 0.04772711011842731, "grad_norm": 0.121590280179743, "learning_rate": 9.94424774278425e-05, "loss": 2.1219, "step": 6137 }, { "epoch": 0.047734887063207894, "grad_norm": 0.11804295248799655, "learning_rate": 9.944229546632647e-05, "loss": 2.0494, "step": 6138 }, { "epoch": 0.047742664007988475, "grad_norm": 0.11398544856223856, "learning_rate": 9.944211347528795e-05, "loss": 2.1253, "step": 6139 }, { "epoch": 0.047750440952769056, "grad_norm": 0.13645263752082218, "learning_rate": 9.944193145472705e-05, "loss": 2.116, "step": 6140 }, { "epoch": 0.04775821789754964, "grad_norm": 0.13071901792175097, "learning_rate": 9.94417494046439e-05, "loss": 2.0671, "step": 6141 }, { "epoch": 0.04776599484233022, "grad_norm": 0.11126822336300295, "learning_rate": 9.944156732503859e-05, "loss": 2.0787, "step": 6142 }, { "epoch": 0.0477737717871108, "grad_norm": 0.12212363134528964, "learning_rate": 9.944138521591122e-05, "loss": 2.0973, "step": 6143 }, { "epoch": 0.04778154873189138, "grad_norm": 0.1238451571128956, "learning_rate": 9.944120307726193e-05, "loss": 2.0799, "step": 6144 }, { "epoch": 0.04778932567667196, "grad_norm": 0.11274467413095526, "learning_rate": 9.94410209090908e-05, "loss": 2.109, "step": 6145 }, { "epoch": 0.04779710262145254, "grad_norm": 0.12360826631023147, "learning_rate": 9.944083871139795e-05, "loss": 2.0731, "step": 6146 }, { "epoch": 0.04780487956623313, "grad_norm": 0.13011429817650008, "learning_rate": 9.94406564841835e-05, "loss": 2.056, "step": 6147 }, { "epoch": 0.04781265651101371, "grad_norm": 0.12061880261293667, "learning_rate": 9.944047422744754e-05, "loss": 2.0078, "step": 6148 }, { "epoch": 0.047820433455794294, "grad_norm": 0.1381238709646313, "learning_rate": 9.944029194119017e-05, "loss": 2.0537, "step": 6149 }, { "epoch": 0.047828210400574875, "grad_norm": 0.25288655765598433, "learning_rate": 9.944010962541154e-05, "loss": 2.0842, "step": 6150 }, { "epoch": 0.047835987345355456, "grad_norm": 0.12823150166163536, "learning_rate": 9.943992728011171e-05, "loss": 2.0854, "step": 6151 }, { "epoch": 0.04784376429013604, "grad_norm": 0.13090277189818111, "learning_rate": 9.943974490529083e-05, "loss": 2.0415, "step": 6152 }, { "epoch": 0.04785154123491662, "grad_norm": 0.16749845822826376, "learning_rate": 9.943956250094899e-05, "loss": 2.0866, "step": 6153 }, { "epoch": 0.0478593181796972, "grad_norm": 0.14134806790551932, "learning_rate": 9.943938006708629e-05, "loss": 2.0477, "step": 6154 }, { "epoch": 0.04786709512447778, "grad_norm": 0.13356888682402496, "learning_rate": 9.943919760370286e-05, "loss": 2.0483, "step": 6155 }, { "epoch": 0.04787487206925836, "grad_norm": 0.2156491655312504, "learning_rate": 9.943901511079878e-05, "loss": 2.0235, "step": 6156 }, { "epoch": 0.04788264901403894, "grad_norm": 0.18126051664152423, "learning_rate": 9.94388325883742e-05, "loss": 2.0382, "step": 6157 }, { "epoch": 0.047890425958819524, "grad_norm": 0.1191230971318089, "learning_rate": 9.943865003642919e-05, "loss": 2.0809, "step": 6158 }, { "epoch": 0.047898202903600105, "grad_norm": 0.18877023675125046, "learning_rate": 9.943846745496388e-05, "loss": 2.0739, "step": 6159 }, { "epoch": 0.047905979848380686, "grad_norm": 0.1945072197477742, "learning_rate": 9.943828484397837e-05, "loss": 2.0102, "step": 6160 }, { "epoch": 0.04791375679316127, "grad_norm": 0.18338630524160607, "learning_rate": 9.943810220347278e-05, "loss": 2.0608, "step": 6161 }, { "epoch": 0.04792153373794185, "grad_norm": 0.21296647763880702, "learning_rate": 9.94379195334472e-05, "loss": 2.0311, "step": 6162 }, { "epoch": 0.04792931068272243, "grad_norm": 0.22899885261114156, "learning_rate": 9.943773683390177e-05, "loss": 2.0807, "step": 6163 }, { "epoch": 0.04793708762750301, "grad_norm": 0.13288883047998348, "learning_rate": 9.943755410483658e-05, "loss": 2.0797, "step": 6164 }, { "epoch": 0.04794486457228359, "grad_norm": 0.13675414195851832, "learning_rate": 9.943737134625173e-05, "loss": 2.0894, "step": 6165 }, { "epoch": 0.04795264151706417, "grad_norm": 0.14659815451146005, "learning_rate": 9.943718855814733e-05, "loss": 2.116, "step": 6166 }, { "epoch": 0.047960418461844755, "grad_norm": 0.12050563354454345, "learning_rate": 9.94370057405235e-05, "loss": 2.0954, "step": 6167 }, { "epoch": 0.047968195406625336, "grad_norm": 0.140252829004475, "learning_rate": 9.943682289338036e-05, "loss": 2.0475, "step": 6168 }, { "epoch": 0.04797597235140592, "grad_norm": 0.12256589936007539, "learning_rate": 9.9436640016718e-05, "loss": 2.1381, "step": 6169 }, { "epoch": 0.0479837492961865, "grad_norm": 0.13250404727116713, "learning_rate": 9.943645711053653e-05, "loss": 2.0827, "step": 6170 }, { "epoch": 0.04799152624096708, "grad_norm": 0.13341111255633475, "learning_rate": 9.943627417483607e-05, "loss": 2.0587, "step": 6171 }, { "epoch": 0.04799930318574766, "grad_norm": 0.11928377473052523, "learning_rate": 9.943609120961672e-05, "loss": 2.033, "step": 6172 }, { "epoch": 0.04800708013052824, "grad_norm": 0.13706526706086142, "learning_rate": 9.94359082148786e-05, "loss": 2.0688, "step": 6173 }, { "epoch": 0.04801485707530882, "grad_norm": 0.12615101091836542, "learning_rate": 9.943572519062181e-05, "loss": 2.0732, "step": 6174 }, { "epoch": 0.048022634020089404, "grad_norm": 0.12260468632660165, "learning_rate": 9.943554213684646e-05, "loss": 2.0605, "step": 6175 }, { "epoch": 0.048030410964869985, "grad_norm": 0.12013256769117882, "learning_rate": 9.943535905355266e-05, "loss": 2.0692, "step": 6176 }, { "epoch": 0.048038187909650566, "grad_norm": 0.12026950910299769, "learning_rate": 9.943517594074052e-05, "loss": 2.0401, "step": 6177 }, { "epoch": 0.04804596485443115, "grad_norm": 0.1629661719704394, "learning_rate": 9.943499279841016e-05, "loss": 2.109, "step": 6178 }, { "epoch": 0.04805374179921173, "grad_norm": 0.12341072132111759, "learning_rate": 9.943480962656166e-05, "loss": 2.0745, "step": 6179 }, { "epoch": 0.04806151874399231, "grad_norm": 0.1307090585267939, "learning_rate": 9.943462642519515e-05, "loss": 2.1003, "step": 6180 }, { "epoch": 0.04806929568877289, "grad_norm": 0.15342582575071925, "learning_rate": 9.943444319431074e-05, "loss": 2.0651, "step": 6181 }, { "epoch": 0.04807707263355347, "grad_norm": 0.14045027443211636, "learning_rate": 9.943425993390854e-05, "loss": 2.0609, "step": 6182 }, { "epoch": 0.04808484957833405, "grad_norm": 0.11805026259942832, "learning_rate": 9.943407664398867e-05, "loss": 2.0425, "step": 6183 }, { "epoch": 0.048092626523114634, "grad_norm": 0.12370527182293274, "learning_rate": 9.94338933245512e-05, "loss": 1.9968, "step": 6184 }, { "epoch": 0.048100403467895216, "grad_norm": 0.12104986383995035, "learning_rate": 9.943370997559628e-05, "loss": 2.0487, "step": 6185 }, { "epoch": 0.0481081804126758, "grad_norm": 0.1193228597719592, "learning_rate": 9.9433526597124e-05, "loss": 2.0429, "step": 6186 }, { "epoch": 0.04811595735745638, "grad_norm": 0.11461817821532133, "learning_rate": 9.943334318913448e-05, "loss": 2.1399, "step": 6187 }, { "epoch": 0.04812373430223696, "grad_norm": 0.11314652117594597, "learning_rate": 9.943315975162781e-05, "loss": 2.0559, "step": 6188 }, { "epoch": 0.04813151124701754, "grad_norm": 0.12358858718519694, "learning_rate": 9.943297628460412e-05, "loss": 2.0689, "step": 6189 }, { "epoch": 0.04813928819179812, "grad_norm": 0.1199273585968785, "learning_rate": 9.943279278806353e-05, "loss": 1.9979, "step": 6190 }, { "epoch": 0.0481470651365787, "grad_norm": 0.11593257209787274, "learning_rate": 9.943260926200613e-05, "loss": 2.0297, "step": 6191 }, { "epoch": 0.048154842081359284, "grad_norm": 0.12278691975305805, "learning_rate": 9.943242570643201e-05, "loss": 2.0497, "step": 6192 }, { "epoch": 0.048162619026139865, "grad_norm": 0.1260984716911959, "learning_rate": 9.94322421213413e-05, "loss": 2.0479, "step": 6193 }, { "epoch": 0.048170395970920446, "grad_norm": 0.12041899060003904, "learning_rate": 9.943205850673413e-05, "loss": 2.1004, "step": 6194 }, { "epoch": 0.04817817291570103, "grad_norm": 0.12167874645557865, "learning_rate": 9.943187486261058e-05, "loss": 2.0735, "step": 6195 }, { "epoch": 0.04818594986048161, "grad_norm": 0.11322191261960625, "learning_rate": 9.943169118897078e-05, "loss": 2.0795, "step": 6196 }, { "epoch": 0.04819372680526219, "grad_norm": 0.11764196257422298, "learning_rate": 9.943150748581482e-05, "loss": 2.1263, "step": 6197 }, { "epoch": 0.04820150375004277, "grad_norm": 0.11386845888545022, "learning_rate": 9.943132375314283e-05, "loss": 1.9941, "step": 6198 }, { "epoch": 0.04820928069482335, "grad_norm": 0.12418213432977657, "learning_rate": 9.943113999095491e-05, "loss": 2.1014, "step": 6199 }, { "epoch": 0.04821705763960393, "grad_norm": 0.20903169384367964, "learning_rate": 9.943095619925116e-05, "loss": 2.0779, "step": 6200 }, { "epoch": 0.048224834584384514, "grad_norm": 0.11342755850018309, "learning_rate": 9.94307723780317e-05, "loss": 2.0486, "step": 6201 }, { "epoch": 0.048232611529165095, "grad_norm": 0.12092591026414931, "learning_rate": 9.943058852729665e-05, "loss": 2.0389, "step": 6202 }, { "epoch": 0.048240388473945676, "grad_norm": 0.1137855984291001, "learning_rate": 9.94304046470461e-05, "loss": 2.0783, "step": 6203 }, { "epoch": 0.04824816541872626, "grad_norm": 0.11876784009236921, "learning_rate": 9.943022073728017e-05, "loss": 2.1003, "step": 6204 }, { "epoch": 0.04825594236350684, "grad_norm": 0.3566847929534832, "learning_rate": 9.943003679799896e-05, "loss": 2.0458, "step": 6205 }, { "epoch": 0.04826371930828743, "grad_norm": 0.1132827321446967, "learning_rate": 9.94298528292026e-05, "loss": 2.0427, "step": 6206 }, { "epoch": 0.04827149625306801, "grad_norm": 0.11952881578914963, "learning_rate": 9.942966883089118e-05, "loss": 2.0649, "step": 6207 }, { "epoch": 0.04827927319784859, "grad_norm": 0.12247966559745571, "learning_rate": 9.942948480306483e-05, "loss": 2.0957, "step": 6208 }, { "epoch": 0.04828705014262917, "grad_norm": 0.12448049732997529, "learning_rate": 9.942930074572363e-05, "loss": 2.0816, "step": 6209 }, { "epoch": 0.04829482708740975, "grad_norm": 0.13360025002606216, "learning_rate": 9.942911665886772e-05, "loss": 2.0494, "step": 6210 }, { "epoch": 0.04830260403219033, "grad_norm": 0.1194906952695835, "learning_rate": 9.942893254249719e-05, "loss": 2.0775, "step": 6211 }, { "epoch": 0.048310380976970914, "grad_norm": 0.21713872792767983, "learning_rate": 9.942874839661216e-05, "loss": 2.0461, "step": 6212 }, { "epoch": 0.048318157921751495, "grad_norm": 0.12793945381263108, "learning_rate": 9.942856422121274e-05, "loss": 2.087, "step": 6213 }, { "epoch": 0.048325934866532076, "grad_norm": 0.11387502747547608, "learning_rate": 9.942838001629904e-05, "loss": 2.0176, "step": 6214 }, { "epoch": 0.04833371181131266, "grad_norm": 0.17760231639740348, "learning_rate": 9.942819578187115e-05, "loss": 2.1181, "step": 6215 }, { "epoch": 0.04834148875609324, "grad_norm": 0.2309139783085014, "learning_rate": 9.942801151792923e-05, "loss": 2.062, "step": 6216 }, { "epoch": 0.04834926570087382, "grad_norm": 0.21348320766880888, "learning_rate": 9.942782722447332e-05, "loss": 2.062, "step": 6217 }, { "epoch": 0.0483570426456544, "grad_norm": 0.1582209349146763, "learning_rate": 9.94276429015036e-05, "loss": 2.097, "step": 6218 }, { "epoch": 0.04836481959043498, "grad_norm": 0.11420035766244718, "learning_rate": 9.942745854902012e-05, "loss": 2.0796, "step": 6219 }, { "epoch": 0.04837259653521556, "grad_norm": 0.14087708369774574, "learning_rate": 9.942727416702304e-05, "loss": 2.0425, "step": 6220 }, { "epoch": 0.048380373479996144, "grad_norm": 0.15843845146240557, "learning_rate": 9.942708975551243e-05, "loss": 2.09, "step": 6221 }, { "epoch": 0.048388150424776726, "grad_norm": 0.11744500487025546, "learning_rate": 9.942690531448843e-05, "loss": 2.0801, "step": 6222 }, { "epoch": 0.04839592736955731, "grad_norm": 0.14773258188164928, "learning_rate": 9.942672084395113e-05, "loss": 2.1196, "step": 6223 }, { "epoch": 0.04840370431433789, "grad_norm": 0.16755138278588996, "learning_rate": 9.942653634390064e-05, "loss": 2.0339, "step": 6224 }, { "epoch": 0.04841148125911847, "grad_norm": 0.1612703683078731, "learning_rate": 9.942635181433711e-05, "loss": 2.0635, "step": 6225 }, { "epoch": 0.04841925820389905, "grad_norm": 0.11047847405274544, "learning_rate": 9.942616725526059e-05, "loss": 2.0081, "step": 6226 }, { "epoch": 0.04842703514867963, "grad_norm": 0.16767640719973972, "learning_rate": 9.942598266667122e-05, "loss": 2.0713, "step": 6227 }, { "epoch": 0.04843481209346021, "grad_norm": 0.1838622485283228, "learning_rate": 9.942579804856911e-05, "loss": 2.0194, "step": 6228 }, { "epoch": 0.048442589038240794, "grad_norm": 0.12345938209090523, "learning_rate": 9.942561340095437e-05, "loss": 2.0825, "step": 6229 }, { "epoch": 0.048450365983021375, "grad_norm": 0.12489137780260076, "learning_rate": 9.942542872382711e-05, "loss": 2.0577, "step": 6230 }, { "epoch": 0.048458142927801956, "grad_norm": 0.13130028090191645, "learning_rate": 9.942524401718746e-05, "loss": 2.0501, "step": 6231 }, { "epoch": 0.04846591987258254, "grad_norm": 0.24287595960571656, "learning_rate": 9.942505928103549e-05, "loss": 2.0946, "step": 6232 }, { "epoch": 0.04847369681736312, "grad_norm": 0.13600920253551035, "learning_rate": 9.942487451537133e-05, "loss": 2.1125, "step": 6233 }, { "epoch": 0.0484814737621437, "grad_norm": 0.13984620428068634, "learning_rate": 9.94246897201951e-05, "loss": 2.064, "step": 6234 }, { "epoch": 0.04848925070692428, "grad_norm": 0.1156741934453515, "learning_rate": 9.94245048955069e-05, "loss": 1.9887, "step": 6235 }, { "epoch": 0.04849702765170486, "grad_norm": 0.1313397978035359, "learning_rate": 9.942432004130683e-05, "loss": 2.0735, "step": 6236 }, { "epoch": 0.04850480459648544, "grad_norm": 0.1278194468463376, "learning_rate": 9.942413515759501e-05, "loss": 2.043, "step": 6237 }, { "epoch": 0.048512581541266024, "grad_norm": 0.1104908684323205, "learning_rate": 9.942395024437157e-05, "loss": 2.0531, "step": 6238 }, { "epoch": 0.048520358486046605, "grad_norm": 0.11729732773064386, "learning_rate": 9.942376530163658e-05, "loss": 2.1188, "step": 6239 }, { "epoch": 0.048528135430827186, "grad_norm": 0.11455006081138205, "learning_rate": 9.942358032939019e-05, "loss": 2.057, "step": 6240 }, { "epoch": 0.04853591237560777, "grad_norm": 0.12859146753195758, "learning_rate": 9.942339532763249e-05, "loss": 2.0892, "step": 6241 }, { "epoch": 0.04854368932038835, "grad_norm": 0.12995129253608503, "learning_rate": 9.942321029636359e-05, "loss": 2.0387, "step": 6242 }, { "epoch": 0.04855146626516893, "grad_norm": 0.11251341093066304, "learning_rate": 9.94230252355836e-05, "loss": 2.059, "step": 6243 }, { "epoch": 0.04855924320994951, "grad_norm": 0.185741114507008, "learning_rate": 9.942284014529263e-05, "loss": 2.1062, "step": 6244 }, { "epoch": 0.04856702015473009, "grad_norm": 0.13769164850525975, "learning_rate": 9.942265502549082e-05, "loss": 2.0905, "step": 6245 }, { "epoch": 0.04857479709951067, "grad_norm": 0.11056691943980776, "learning_rate": 9.942246987617823e-05, "loss": 2.0644, "step": 6246 }, { "epoch": 0.048582574044291255, "grad_norm": 0.13506400894085258, "learning_rate": 9.942228469735502e-05, "loss": 2.0412, "step": 6247 }, { "epoch": 0.048590350989071836, "grad_norm": 0.1324108858251841, "learning_rate": 9.942209948902125e-05, "loss": 2.0852, "step": 6248 }, { "epoch": 0.04859812793385242, "grad_norm": 0.11582406573421464, "learning_rate": 9.942191425117708e-05, "loss": 2.1009, "step": 6249 }, { "epoch": 0.048605904878633, "grad_norm": 0.11625075248960692, "learning_rate": 9.942172898382258e-05, "loss": 2.0449, "step": 6250 }, { "epoch": 0.04861368182341358, "grad_norm": 0.11447027161382405, "learning_rate": 9.942154368695789e-05, "loss": 2.0464, "step": 6251 }, { "epoch": 0.04862145876819416, "grad_norm": 0.1130656229349233, "learning_rate": 9.942135836058312e-05, "loss": 2.0815, "step": 6252 }, { "epoch": 0.04862923571297474, "grad_norm": 0.11549552323366073, "learning_rate": 9.942117300469836e-05, "loss": 2.0483, "step": 6253 }, { "epoch": 0.04863701265775532, "grad_norm": 0.12361533810123775, "learning_rate": 9.942098761930371e-05, "loss": 2.1186, "step": 6254 }, { "epoch": 0.048644789602535904, "grad_norm": 0.17497652702371363, "learning_rate": 9.942080220439934e-05, "loss": 2.0973, "step": 6255 }, { "epoch": 0.048652566547316485, "grad_norm": 0.11245730004454464, "learning_rate": 9.942061675998528e-05, "loss": 2.0512, "step": 6256 }, { "epoch": 0.048660343492097066, "grad_norm": 0.12663030198477973, "learning_rate": 9.942043128606171e-05, "loss": 2.027, "step": 6257 }, { "epoch": 0.04866812043687765, "grad_norm": 0.12178658528394022, "learning_rate": 9.942024578262871e-05, "loss": 2.0685, "step": 6258 }, { "epoch": 0.04867589738165823, "grad_norm": 0.11384623316884425, "learning_rate": 9.942006024968639e-05, "loss": 2.0458, "step": 6259 }, { "epoch": 0.04868367432643881, "grad_norm": 0.1167524457932749, "learning_rate": 9.941987468723488e-05, "loss": 2.1, "step": 6260 }, { "epoch": 0.04869145127121939, "grad_norm": 0.11744760696190715, "learning_rate": 9.941968909527426e-05, "loss": 2.0982, "step": 6261 }, { "epoch": 0.04869922821599997, "grad_norm": 0.13333330159838447, "learning_rate": 9.941950347380467e-05, "loss": 2.0618, "step": 6262 }, { "epoch": 0.04870700516078055, "grad_norm": 0.11625554446166561, "learning_rate": 9.941931782282618e-05, "loss": 2.0395, "step": 6263 }, { "epoch": 0.048714782105561134, "grad_norm": 0.1163151322351897, "learning_rate": 9.941913214233896e-05, "loss": 2.0135, "step": 6264 }, { "epoch": 0.048722559050341716, "grad_norm": 0.11136194010814698, "learning_rate": 9.941894643234306e-05, "loss": 2.0562, "step": 6265 }, { "epoch": 0.048730335995122304, "grad_norm": 0.11091106424517955, "learning_rate": 9.941876069283864e-05, "loss": 2.0567, "step": 6266 }, { "epoch": 0.048738112939902885, "grad_norm": 0.11072809018880665, "learning_rate": 9.94185749238258e-05, "loss": 2.0003, "step": 6267 }, { "epoch": 0.048745889884683466, "grad_norm": 0.11221061899873212, "learning_rate": 9.941838912530463e-05, "loss": 2.1023, "step": 6268 }, { "epoch": 0.04875366682946405, "grad_norm": 0.12012695341708687, "learning_rate": 9.941820329727524e-05, "loss": 2.0798, "step": 6269 }, { "epoch": 0.04876144377424463, "grad_norm": 0.11372437092558811, "learning_rate": 9.941801743973778e-05, "loss": 2.0586, "step": 6270 }, { "epoch": 0.04876922071902521, "grad_norm": 0.11376833524233886, "learning_rate": 9.94178315526923e-05, "loss": 2.0644, "step": 6271 }, { "epoch": 0.04877699766380579, "grad_norm": 0.22113753900643246, "learning_rate": 9.941764563613898e-05, "loss": 2.0519, "step": 6272 }, { "epoch": 0.04878477460858637, "grad_norm": 0.11293562053565787, "learning_rate": 9.941745969007789e-05, "loss": 2.1067, "step": 6273 }, { "epoch": 0.04879255155336695, "grad_norm": 0.1112174493326335, "learning_rate": 9.941727371450913e-05, "loss": 2.0226, "step": 6274 }, { "epoch": 0.048800328498147534, "grad_norm": 0.11818908096843886, "learning_rate": 9.941708770943284e-05, "loss": 2.0701, "step": 6275 }, { "epoch": 0.048808105442928115, "grad_norm": 0.11051950397631362, "learning_rate": 9.941690167484912e-05, "loss": 2.0633, "step": 6276 }, { "epoch": 0.048815882387708696, "grad_norm": 0.11792936079267609, "learning_rate": 9.94167156107581e-05, "loss": 2.0568, "step": 6277 }, { "epoch": 0.04882365933248928, "grad_norm": 0.11869661476424553, "learning_rate": 9.941652951715984e-05, "loss": 2.076, "step": 6278 }, { "epoch": 0.04883143627726986, "grad_norm": 0.12334379526616361, "learning_rate": 9.94163433940545e-05, "loss": 2.0236, "step": 6279 }, { "epoch": 0.04883921322205044, "grad_norm": 0.13502555347467, "learning_rate": 9.941615724144218e-05, "loss": 2.0848, "step": 6280 }, { "epoch": 0.04884699016683102, "grad_norm": 0.1186290039377152, "learning_rate": 9.941597105932297e-05, "loss": 2.0338, "step": 6281 }, { "epoch": 0.0488547671116116, "grad_norm": 0.11185815650784861, "learning_rate": 9.9415784847697e-05, "loss": 2.031, "step": 6282 }, { "epoch": 0.04886254405639218, "grad_norm": 0.11773137897882094, "learning_rate": 9.941559860656439e-05, "loss": 2.058, "step": 6283 }, { "epoch": 0.048870321001172765, "grad_norm": 0.11775724712463503, "learning_rate": 9.941541233592523e-05, "loss": 2.0908, "step": 6284 }, { "epoch": 0.048878097945953346, "grad_norm": 0.13775178520710435, "learning_rate": 9.941522603577963e-05, "loss": 2.0839, "step": 6285 }, { "epoch": 0.04888587489073393, "grad_norm": 0.16805004332591766, "learning_rate": 9.941503970612773e-05, "loss": 2.074, "step": 6286 }, { "epoch": 0.04889365183551451, "grad_norm": 0.21952148850639272, "learning_rate": 9.941485334696963e-05, "loss": 2.0517, "step": 6287 }, { "epoch": 0.04890142878029509, "grad_norm": 0.1146068871497443, "learning_rate": 9.941466695830542e-05, "loss": 2.0835, "step": 6288 }, { "epoch": 0.04890920572507567, "grad_norm": 0.16288931583257854, "learning_rate": 9.941448054013522e-05, "loss": 2.0013, "step": 6289 }, { "epoch": 0.04891698266985625, "grad_norm": 0.22620097400905667, "learning_rate": 9.941429409245915e-05, "loss": 2.0778, "step": 6290 }, { "epoch": 0.04892475961463683, "grad_norm": 0.2275885076931398, "learning_rate": 9.941410761527732e-05, "loss": 2.02, "step": 6291 }, { "epoch": 0.048932536559417414, "grad_norm": 0.11863398695629783, "learning_rate": 9.941392110858985e-05, "loss": 2.0643, "step": 6292 }, { "epoch": 0.048940313504197995, "grad_norm": 0.12621317788646816, "learning_rate": 9.941373457239683e-05, "loss": 2.1017, "step": 6293 }, { "epoch": 0.048948090448978576, "grad_norm": 0.13971298542197308, "learning_rate": 9.941354800669837e-05, "loss": 2.0677, "step": 6294 }, { "epoch": 0.04895586739375916, "grad_norm": 0.11415195254558463, "learning_rate": 9.941336141149462e-05, "loss": 2.0314, "step": 6295 }, { "epoch": 0.04896364433853974, "grad_norm": 0.1320923301462561, "learning_rate": 9.941317478678565e-05, "loss": 2.0048, "step": 6296 }, { "epoch": 0.04897142128332032, "grad_norm": 0.16443798999106363, "learning_rate": 9.941298813257158e-05, "loss": 2.0544, "step": 6297 }, { "epoch": 0.0489791982281009, "grad_norm": 0.19218657726436614, "learning_rate": 9.941280144885254e-05, "loss": 2.0832, "step": 6298 }, { "epoch": 0.04898697517288148, "grad_norm": 0.1227474238683691, "learning_rate": 9.941261473562864e-05, "loss": 2.0946, "step": 6299 }, { "epoch": 0.04899475211766206, "grad_norm": 0.14221127721435894, "learning_rate": 9.941242799289995e-05, "loss": 2.0537, "step": 6300 }, { "epoch": 0.049002529062442644, "grad_norm": 0.1483959668614048, "learning_rate": 9.941224122066664e-05, "loss": 2.0369, "step": 6301 }, { "epoch": 0.049010306007223225, "grad_norm": 0.11595481074483437, "learning_rate": 9.941205441892878e-05, "loss": 2.0539, "step": 6302 }, { "epoch": 0.04901808295200381, "grad_norm": 0.13577468992206476, "learning_rate": 9.941186758768651e-05, "loss": 2.0815, "step": 6303 }, { "epoch": 0.04902585989678439, "grad_norm": 0.14098584443154177, "learning_rate": 9.941168072693992e-05, "loss": 2.0612, "step": 6304 }, { "epoch": 0.04903363684156497, "grad_norm": 0.1278286518725701, "learning_rate": 9.941149383668911e-05, "loss": 2.0427, "step": 6305 }, { "epoch": 0.04904141378634555, "grad_norm": 0.1248146552723091, "learning_rate": 9.941130691693424e-05, "loss": 2.0, "step": 6306 }, { "epoch": 0.04904919073112613, "grad_norm": 0.12155354391889418, "learning_rate": 9.941111996767538e-05, "loss": 2.0716, "step": 6307 }, { "epoch": 0.04905696767590671, "grad_norm": 0.11737923963575521, "learning_rate": 9.941093298891265e-05, "loss": 2.0144, "step": 6308 }, { "epoch": 0.049064744620687294, "grad_norm": 0.11793198909410926, "learning_rate": 9.941074598064615e-05, "loss": 2.0268, "step": 6309 }, { "epoch": 0.049072521565467875, "grad_norm": 0.1143138215060493, "learning_rate": 9.941055894287603e-05, "loss": 2.0735, "step": 6310 }, { "epoch": 0.049080298510248456, "grad_norm": 0.13659936227703479, "learning_rate": 9.941037187560236e-05, "loss": 2.0228, "step": 6311 }, { "epoch": 0.04908807545502904, "grad_norm": 0.13835876270984931, "learning_rate": 9.941018477882528e-05, "loss": 2.0239, "step": 6312 }, { "epoch": 0.04909585239980962, "grad_norm": 0.11939812200797674, "learning_rate": 9.940999765254491e-05, "loss": 2.111, "step": 6313 }, { "epoch": 0.0491036293445902, "grad_norm": 0.1154277592250709, "learning_rate": 9.940981049676132e-05, "loss": 2.1372, "step": 6314 }, { "epoch": 0.04911140628937078, "grad_norm": 0.12316820294330225, "learning_rate": 9.940962331147466e-05, "loss": 2.0664, "step": 6315 }, { "epoch": 0.04911918323415136, "grad_norm": 0.11372655721446509, "learning_rate": 9.9409436096685e-05, "loss": 2.0617, "step": 6316 }, { "epoch": 0.04912696017893194, "grad_norm": 0.11484717049723973, "learning_rate": 9.94092488523925e-05, "loss": 2.05, "step": 6317 }, { "epoch": 0.049134737123712524, "grad_norm": 0.11968931416419758, "learning_rate": 9.940906157859727e-05, "loss": 2.0953, "step": 6318 }, { "epoch": 0.049142514068493105, "grad_norm": 0.1257081080504405, "learning_rate": 9.940887427529937e-05, "loss": 2.0139, "step": 6319 }, { "epoch": 0.049150291013273686, "grad_norm": 0.11966378242985008, "learning_rate": 9.940868694249896e-05, "loss": 2.0301, "step": 6320 }, { "epoch": 0.04915806795805427, "grad_norm": 0.11718885263950655, "learning_rate": 9.940849958019612e-05, "loss": 2.075, "step": 6321 }, { "epoch": 0.04916584490283485, "grad_norm": 0.12132712996435803, "learning_rate": 9.940831218839099e-05, "loss": 2.1024, "step": 6322 }, { "epoch": 0.04917362184761543, "grad_norm": 0.11789405735995488, "learning_rate": 9.940812476708368e-05, "loss": 2.0705, "step": 6323 }, { "epoch": 0.04918139879239601, "grad_norm": 0.11407880415655684, "learning_rate": 9.940793731627427e-05, "loss": 2.0787, "step": 6324 }, { "epoch": 0.0491891757371766, "grad_norm": 0.11852048788314015, "learning_rate": 9.94077498359629e-05, "loss": 2.0895, "step": 6325 }, { "epoch": 0.04919695268195718, "grad_norm": 0.11206599992015889, "learning_rate": 9.94075623261497e-05, "loss": 2.0629, "step": 6326 }, { "epoch": 0.04920472962673776, "grad_norm": 0.16161680483923238, "learning_rate": 9.940737478683472e-05, "loss": 2.0522, "step": 6327 }, { "epoch": 0.04921250657151834, "grad_norm": 0.11250346186724228, "learning_rate": 9.940718721801814e-05, "loss": 2.0472, "step": 6328 }, { "epoch": 0.049220283516298924, "grad_norm": 0.11152678721036997, "learning_rate": 9.940699961970003e-05, "loss": 2.0929, "step": 6329 }, { "epoch": 0.049228060461079505, "grad_norm": 0.20800372161427402, "learning_rate": 9.940681199188052e-05, "loss": 2.095, "step": 6330 }, { "epoch": 0.049235837405860086, "grad_norm": 0.12081363232130868, "learning_rate": 9.940662433455971e-05, "loss": 2.0485, "step": 6331 }, { "epoch": 0.04924361435064067, "grad_norm": 0.112859587953834, "learning_rate": 9.940643664773772e-05, "loss": 2.0025, "step": 6332 }, { "epoch": 0.04925139129542125, "grad_norm": 0.11725279591746289, "learning_rate": 9.940624893141466e-05, "loss": 2.0274, "step": 6333 }, { "epoch": 0.04925916824020183, "grad_norm": 0.15016183046732193, "learning_rate": 9.940606118559064e-05, "loss": 2.0919, "step": 6334 }, { "epoch": 0.04926694518498241, "grad_norm": 0.12106805357644777, "learning_rate": 9.940587341026577e-05, "loss": 2.069, "step": 6335 }, { "epoch": 0.04927472212976299, "grad_norm": 0.1251767279506454, "learning_rate": 9.940568560544017e-05, "loss": 2.0344, "step": 6336 }, { "epoch": 0.04928249907454357, "grad_norm": 0.12462382021339753, "learning_rate": 9.940549777111395e-05, "loss": 2.0692, "step": 6337 }, { "epoch": 0.049290276019324154, "grad_norm": 0.11453830706353771, "learning_rate": 9.940530990728721e-05, "loss": 2.0465, "step": 6338 }, { "epoch": 0.049298052964104735, "grad_norm": 0.13917563053481374, "learning_rate": 9.940512201396008e-05, "loss": 2.1229, "step": 6339 }, { "epoch": 0.04930582990888532, "grad_norm": 0.16575972629303282, "learning_rate": 9.940493409113268e-05, "loss": 2.0485, "step": 6340 }, { "epoch": 0.0493136068536659, "grad_norm": 0.16312529651513852, "learning_rate": 9.940474613880508e-05, "loss": 2.0588, "step": 6341 }, { "epoch": 0.04932138379844648, "grad_norm": 0.12190216748284759, "learning_rate": 9.940455815697745e-05, "loss": 2.0655, "step": 6342 }, { "epoch": 0.04932916074322706, "grad_norm": 0.1447990281677377, "learning_rate": 9.940437014564985e-05, "loss": 2.1307, "step": 6343 }, { "epoch": 0.04933693768800764, "grad_norm": 0.18572813735653976, "learning_rate": 9.94041821048224e-05, "loss": 2.0979, "step": 6344 }, { "epoch": 0.04934471463278822, "grad_norm": 0.13182383036682732, "learning_rate": 9.940399403449526e-05, "loss": 2.0684, "step": 6345 }, { "epoch": 0.049352491577568804, "grad_norm": 0.12177766143127644, "learning_rate": 9.940380593466849e-05, "loss": 2.1119, "step": 6346 }, { "epoch": 0.049360268522349385, "grad_norm": 0.15874045008863902, "learning_rate": 9.940361780534221e-05, "loss": 2.0637, "step": 6347 }, { "epoch": 0.049368045467129966, "grad_norm": 0.1365331637926519, "learning_rate": 9.940342964651658e-05, "loss": 2.0258, "step": 6348 }, { "epoch": 0.04937582241191055, "grad_norm": 0.11299425847885167, "learning_rate": 9.940324145819164e-05, "loss": 2.1022, "step": 6349 }, { "epoch": 0.04938359935669113, "grad_norm": 0.15152858639089148, "learning_rate": 9.940305324036756e-05, "loss": 2.0498, "step": 6350 }, { "epoch": 0.04939137630147171, "grad_norm": 0.14581831450713592, "learning_rate": 9.940286499304442e-05, "loss": 2.0391, "step": 6351 }, { "epoch": 0.04939915324625229, "grad_norm": 0.11517847454511483, "learning_rate": 9.940267671622235e-05, "loss": 2.0501, "step": 6352 }, { "epoch": 0.04940693019103287, "grad_norm": 0.13007230532740116, "learning_rate": 9.940248840990144e-05, "loss": 2.0164, "step": 6353 }, { "epoch": 0.04941470713581345, "grad_norm": 0.15105132137776428, "learning_rate": 9.940230007408183e-05, "loss": 2.0427, "step": 6354 }, { "epoch": 0.049422484080594034, "grad_norm": 0.2129126498689896, "learning_rate": 9.940211170876361e-05, "loss": 2.0778, "step": 6355 }, { "epoch": 0.049430261025374615, "grad_norm": 0.14031246487414675, "learning_rate": 9.940192331394691e-05, "loss": 2.0627, "step": 6356 }, { "epoch": 0.049438037970155196, "grad_norm": 0.19772131544577995, "learning_rate": 9.940173488963182e-05, "loss": 2.0515, "step": 6357 }, { "epoch": 0.04944581491493578, "grad_norm": 0.1796312859979271, "learning_rate": 9.940154643581849e-05, "loss": 2.0755, "step": 6358 }, { "epoch": 0.04945359185971636, "grad_norm": 0.11421178477383245, "learning_rate": 9.9401357952507e-05, "loss": 2.042, "step": 6359 }, { "epoch": 0.04946136880449694, "grad_norm": 0.17946330725886872, "learning_rate": 9.940116943969747e-05, "loss": 2.0839, "step": 6360 }, { "epoch": 0.04946914574927752, "grad_norm": 0.18952497366033894, "learning_rate": 9.940098089739002e-05, "loss": 2.0261, "step": 6361 }, { "epoch": 0.0494769226940581, "grad_norm": 0.12673368940343335, "learning_rate": 9.940079232558476e-05, "loss": 2.0462, "step": 6362 }, { "epoch": 0.04948469963883868, "grad_norm": 0.1339248031491723, "learning_rate": 9.940060372428179e-05, "loss": 2.0145, "step": 6363 }, { "epoch": 0.049492476583619265, "grad_norm": 0.1679761744506972, "learning_rate": 9.940041509348124e-05, "loss": 2.069, "step": 6364 }, { "epoch": 0.049500253528399846, "grad_norm": 0.12472187026958287, "learning_rate": 9.94002264331832e-05, "loss": 2.1036, "step": 6365 }, { "epoch": 0.04950803047318043, "grad_norm": 0.14127555087733648, "learning_rate": 9.940003774338782e-05, "loss": 2.0853, "step": 6366 }, { "epoch": 0.04951580741796101, "grad_norm": 0.17477032262227324, "learning_rate": 9.939984902409518e-05, "loss": 2.002, "step": 6367 }, { "epoch": 0.04952358436274159, "grad_norm": 0.13671400846074588, "learning_rate": 9.93996602753054e-05, "loss": 2.0839, "step": 6368 }, { "epoch": 0.04953136130752217, "grad_norm": 0.13261091262079444, "learning_rate": 9.939947149701861e-05, "loss": 2.0644, "step": 6369 }, { "epoch": 0.04953913825230275, "grad_norm": 0.16740693103673868, "learning_rate": 9.939928268923491e-05, "loss": 2.1147, "step": 6370 }, { "epoch": 0.04954691519708333, "grad_norm": 0.1232309851931935, "learning_rate": 9.939909385195441e-05, "loss": 2.0583, "step": 6371 }, { "epoch": 0.049554692141863914, "grad_norm": 0.1287748990933337, "learning_rate": 9.939890498517721e-05, "loss": 2.0631, "step": 6372 }, { "epoch": 0.049562469086644495, "grad_norm": 0.13670141963297922, "learning_rate": 9.939871608890344e-05, "loss": 2.0756, "step": 6373 }, { "epoch": 0.049570246031425076, "grad_norm": 0.12917612796583136, "learning_rate": 9.939852716313321e-05, "loss": 2.0749, "step": 6374 }, { "epoch": 0.04957802297620566, "grad_norm": 0.1387056745964983, "learning_rate": 9.939833820786663e-05, "loss": 2.0874, "step": 6375 }, { "epoch": 0.04958579992098624, "grad_norm": 0.13284877042552054, "learning_rate": 9.939814922310383e-05, "loss": 2.0134, "step": 6376 }, { "epoch": 0.04959357686576682, "grad_norm": 0.11876089157717011, "learning_rate": 9.939796020884491e-05, "loss": 2.0507, "step": 6377 }, { "epoch": 0.0496013538105474, "grad_norm": 0.1437385136264301, "learning_rate": 9.939777116508997e-05, "loss": 2.0757, "step": 6378 }, { "epoch": 0.04960913075532798, "grad_norm": 0.27086515937686256, "learning_rate": 9.939758209183914e-05, "loss": 2.0613, "step": 6379 }, { "epoch": 0.04961690770010856, "grad_norm": 0.11749902637784165, "learning_rate": 9.939739298909252e-05, "loss": 2.0657, "step": 6380 }, { "epoch": 0.049624684644889144, "grad_norm": 0.1169796121896769, "learning_rate": 9.939720385685023e-05, "loss": 2.0687, "step": 6381 }, { "epoch": 0.049632461589669725, "grad_norm": 0.11920978373618393, "learning_rate": 9.939701469511238e-05, "loss": 2.0124, "step": 6382 }, { "epoch": 0.04964023853445031, "grad_norm": 0.11411301620235467, "learning_rate": 9.93968255038791e-05, "loss": 2.079, "step": 6383 }, { "epoch": 0.04964801547923089, "grad_norm": 0.1172764828438437, "learning_rate": 9.939663628315048e-05, "loss": 2.0962, "step": 6384 }, { "epoch": 0.049655792424011476, "grad_norm": 0.19107685744995181, "learning_rate": 9.939644703292664e-05, "loss": 2.0994, "step": 6385 }, { "epoch": 0.04966356936879206, "grad_norm": 0.11925082270755409, "learning_rate": 9.939625775320769e-05, "loss": 2.0606, "step": 6386 }, { "epoch": 0.04967134631357264, "grad_norm": 0.140246147566127, "learning_rate": 9.939606844399377e-05, "loss": 2.0403, "step": 6387 }, { "epoch": 0.04967912325835322, "grad_norm": 0.14715441680472893, "learning_rate": 9.939587910528494e-05, "loss": 1.9999, "step": 6388 }, { "epoch": 0.0496869002031338, "grad_norm": 0.12250968429001954, "learning_rate": 9.939568973708137e-05, "loss": 2.0878, "step": 6389 }, { "epoch": 0.04969467714791438, "grad_norm": 0.11819671501932218, "learning_rate": 9.939550033938313e-05, "loss": 2.0827, "step": 6390 }, { "epoch": 0.04970245409269496, "grad_norm": 0.12934723914775406, "learning_rate": 9.939531091219037e-05, "loss": 1.9878, "step": 6391 }, { "epoch": 0.049710231037475544, "grad_norm": 0.11528157031686342, "learning_rate": 9.939512145550316e-05, "loss": 1.9788, "step": 6392 }, { "epoch": 0.049718007982256125, "grad_norm": 0.11084458843205472, "learning_rate": 9.939493196932165e-05, "loss": 2.0519, "step": 6393 }, { "epoch": 0.049725784927036706, "grad_norm": 0.1314843318342479, "learning_rate": 9.939474245364593e-05, "loss": 2.07, "step": 6394 }, { "epoch": 0.04973356187181729, "grad_norm": 0.12737267111588366, "learning_rate": 9.939455290847613e-05, "loss": 2.0654, "step": 6395 }, { "epoch": 0.04974133881659787, "grad_norm": 0.11070854744991637, "learning_rate": 9.939436333381235e-05, "loss": 2.0417, "step": 6396 }, { "epoch": 0.04974911576137845, "grad_norm": 0.11606571671977527, "learning_rate": 9.939417372965472e-05, "loss": 2.0503, "step": 6397 }, { "epoch": 0.04975689270615903, "grad_norm": 0.11573364145395375, "learning_rate": 9.939398409600334e-05, "loss": 2.0675, "step": 6398 }, { "epoch": 0.04976466965093961, "grad_norm": 0.11236710881280301, "learning_rate": 9.939379443285833e-05, "loss": 2.0773, "step": 6399 }, { "epoch": 0.04977244659572019, "grad_norm": 0.11562489258678274, "learning_rate": 9.939360474021978e-05, "loss": 2.0329, "step": 6400 }, { "epoch": 0.049780223540500775, "grad_norm": 0.12083801633019482, "learning_rate": 9.939341501808784e-05, "loss": 2.0462, "step": 6401 }, { "epoch": 0.049788000485281356, "grad_norm": 0.12165028296033853, "learning_rate": 9.939322526646258e-05, "loss": 2.1013, "step": 6402 }, { "epoch": 0.04979577743006194, "grad_norm": 0.1148369338398648, "learning_rate": 9.939303548534416e-05, "loss": 2.1181, "step": 6403 }, { "epoch": 0.04980355437484252, "grad_norm": 0.12428782634771844, "learning_rate": 9.939284567473266e-05, "loss": 2.1346, "step": 6404 }, { "epoch": 0.0498113313196231, "grad_norm": 0.4085402232115198, "learning_rate": 9.939265583462822e-05, "loss": 2.0307, "step": 6405 }, { "epoch": 0.04981910826440368, "grad_norm": 0.1139984910849491, "learning_rate": 9.939246596503094e-05, "loss": 2.0631, "step": 6406 }, { "epoch": 0.04982688520918426, "grad_norm": 0.11958865058337247, "learning_rate": 9.939227606594091e-05, "loss": 2.0597, "step": 6407 }, { "epoch": 0.04983466215396484, "grad_norm": 0.11804650015211172, "learning_rate": 9.939208613735829e-05, "loss": 2.0575, "step": 6408 }, { "epoch": 0.049842439098745424, "grad_norm": 0.11819681314154676, "learning_rate": 9.939189617928315e-05, "loss": 2.1276, "step": 6409 }, { "epoch": 0.049850216043526005, "grad_norm": 0.12131436904002121, "learning_rate": 9.939170619171563e-05, "loss": 2.0797, "step": 6410 }, { "epoch": 0.049857992988306586, "grad_norm": 0.13598485875396157, "learning_rate": 9.939151617465583e-05, "loss": 2.0861, "step": 6411 }, { "epoch": 0.04986576993308717, "grad_norm": 0.13201428935255696, "learning_rate": 9.939132612810388e-05, "loss": 2.0453, "step": 6412 }, { "epoch": 0.04987354687786775, "grad_norm": 0.1266710770300341, "learning_rate": 9.939113605205987e-05, "loss": 2.0606, "step": 6413 }, { "epoch": 0.04988132382264833, "grad_norm": 0.14345374605358566, "learning_rate": 9.939094594652393e-05, "loss": 2.1179, "step": 6414 }, { "epoch": 0.04988910076742891, "grad_norm": 0.19255271931598236, "learning_rate": 9.939075581149618e-05, "loss": 2.0949, "step": 6415 }, { "epoch": 0.04989687771220949, "grad_norm": 0.19827756888265036, "learning_rate": 9.939056564697671e-05, "loss": 2.0682, "step": 6416 }, { "epoch": 0.04990465465699007, "grad_norm": 0.1299634703729688, "learning_rate": 9.939037545296566e-05, "loss": 2.0899, "step": 6417 }, { "epoch": 0.049912431601770654, "grad_norm": 0.1312673262702714, "learning_rate": 9.939018522946311e-05, "loss": 2.073, "step": 6418 }, { "epoch": 0.049920208546551235, "grad_norm": 0.1709121463799564, "learning_rate": 9.938999497646921e-05, "loss": 2.0431, "step": 6419 }, { "epoch": 0.04992798549133182, "grad_norm": 0.13839766800683997, "learning_rate": 9.938980469398406e-05, "loss": 2.0463, "step": 6420 }, { "epoch": 0.0499357624361124, "grad_norm": 0.12118795639274771, "learning_rate": 9.938961438200776e-05, "loss": 2.0827, "step": 6421 }, { "epoch": 0.04994353938089298, "grad_norm": 0.13365982688807673, "learning_rate": 9.938942404054044e-05, "loss": 2.1374, "step": 6422 }, { "epoch": 0.04995131632567356, "grad_norm": 0.14238132321687758, "learning_rate": 9.938923366958221e-05, "loss": 2.0956, "step": 6423 }, { "epoch": 0.04995909327045414, "grad_norm": 0.12660302452382438, "learning_rate": 9.938904326913318e-05, "loss": 2.1038, "step": 6424 }, { "epoch": 0.04996687021523472, "grad_norm": 0.12119023977541081, "learning_rate": 9.938885283919346e-05, "loss": 2.0921, "step": 6425 }, { "epoch": 0.049974647160015304, "grad_norm": 0.1334834918617241, "learning_rate": 9.938866237976319e-05, "loss": 2.0695, "step": 6426 }, { "epoch": 0.049982424104795885, "grad_norm": 0.1304706015461504, "learning_rate": 9.938847189084243e-05, "loss": 2.0791, "step": 6427 }, { "epoch": 0.049990201049576466, "grad_norm": 0.12300798701772693, "learning_rate": 9.938828137243134e-05, "loss": 2.0571, "step": 6428 }, { "epoch": 0.04999797799435705, "grad_norm": 0.11250060516097829, "learning_rate": 9.938809082453003e-05, "loss": 2.0223, "step": 6429 }, { "epoch": 0.05000575493913763, "grad_norm": 0.15513096872501758, "learning_rate": 9.93879002471386e-05, "loss": 2.0654, "step": 6430 }, { "epoch": 0.05001353188391821, "grad_norm": 0.11399631227948101, "learning_rate": 9.938770964025716e-05, "loss": 2.0966, "step": 6431 }, { "epoch": 0.05002130882869879, "grad_norm": 0.12726512643532448, "learning_rate": 9.938751900388585e-05, "loss": 2.1056, "step": 6432 }, { "epoch": 0.05002908577347937, "grad_norm": 0.12234515416017235, "learning_rate": 9.938732833802475e-05, "loss": 2.1097, "step": 6433 }, { "epoch": 0.05003686271825995, "grad_norm": 0.13149769357491708, "learning_rate": 9.9387137642674e-05, "loss": 2.0888, "step": 6434 }, { "epoch": 0.050044639663040534, "grad_norm": 0.4137561288021537, "learning_rate": 9.93869469178337e-05, "loss": 2.0557, "step": 6435 }, { "epoch": 0.050052416607821115, "grad_norm": 0.1611702608527733, "learning_rate": 9.938675616350396e-05, "loss": 2.0652, "step": 6436 }, { "epoch": 0.050060193552601696, "grad_norm": 0.13050301178730053, "learning_rate": 9.938656537968491e-05, "loss": 2.1402, "step": 6437 }, { "epoch": 0.05006797049738228, "grad_norm": 0.13449271702591006, "learning_rate": 9.938637456637665e-05, "loss": 2.0363, "step": 6438 }, { "epoch": 0.05007574744216286, "grad_norm": 0.18501480792495237, "learning_rate": 9.93861837235793e-05, "loss": 2.0444, "step": 6439 }, { "epoch": 0.05008352438694344, "grad_norm": 0.19200411806671996, "learning_rate": 9.938599285129299e-05, "loss": 2.0571, "step": 6440 }, { "epoch": 0.05009130133172402, "grad_norm": 0.15071467766611454, "learning_rate": 9.93858019495178e-05, "loss": 2.0098, "step": 6441 }, { "epoch": 0.0500990782765046, "grad_norm": 0.13411177224627702, "learning_rate": 9.938561101825385e-05, "loss": 2.0978, "step": 6442 }, { "epoch": 0.05010685522128518, "grad_norm": 0.13515967431740555, "learning_rate": 9.938542005750129e-05, "loss": 2.0903, "step": 6443 }, { "epoch": 0.05011463216606577, "grad_norm": 0.11988760339748758, "learning_rate": 9.93852290672602e-05, "loss": 2.1039, "step": 6444 }, { "epoch": 0.05012240911084635, "grad_norm": 0.13584423559614267, "learning_rate": 9.93850380475307e-05, "loss": 2.0439, "step": 6445 }, { "epoch": 0.050130186055626934, "grad_norm": 0.23016263596357198, "learning_rate": 9.93848469983129e-05, "loss": 2.0637, "step": 6446 }, { "epoch": 0.050137963000407515, "grad_norm": 0.11728645959700276, "learning_rate": 9.938465591960693e-05, "loss": 2.0772, "step": 6447 }, { "epoch": 0.050145739945188096, "grad_norm": 0.15926658126072854, "learning_rate": 9.93844648114129e-05, "loss": 2.0105, "step": 6448 }, { "epoch": 0.05015351688996868, "grad_norm": 0.18779618923133082, "learning_rate": 9.93842736737309e-05, "loss": 2.0482, "step": 6449 }, { "epoch": 0.05016129383474926, "grad_norm": 0.16542003394120586, "learning_rate": 9.938408250656108e-05, "loss": 2.1035, "step": 6450 }, { "epoch": 0.05016907077952984, "grad_norm": 0.11810152515559094, "learning_rate": 9.938389130990355e-05, "loss": 2.0823, "step": 6451 }, { "epoch": 0.05017684772431042, "grad_norm": 0.14522104261339475, "learning_rate": 9.938370008375839e-05, "loss": 2.0185, "step": 6452 }, { "epoch": 0.050184624669091, "grad_norm": 0.16148178697296803, "learning_rate": 9.938350882812575e-05, "loss": 2.052, "step": 6453 }, { "epoch": 0.05019240161387158, "grad_norm": 0.14936040178209153, "learning_rate": 9.938331754300572e-05, "loss": 2.0658, "step": 6454 }, { "epoch": 0.050200178558652164, "grad_norm": 0.12627028582230115, "learning_rate": 9.938312622839843e-05, "loss": 2.0849, "step": 6455 }, { "epoch": 0.050207955503432745, "grad_norm": 0.12066981166942976, "learning_rate": 9.9382934884304e-05, "loss": 2.0759, "step": 6456 }, { "epoch": 0.05021573244821333, "grad_norm": 0.14723700583110563, "learning_rate": 9.93827435107225e-05, "loss": 2.0628, "step": 6457 }, { "epoch": 0.05022350939299391, "grad_norm": 0.16267711628563863, "learning_rate": 9.93825521076541e-05, "loss": 2.0609, "step": 6458 }, { "epoch": 0.05023128633777449, "grad_norm": 0.13233845188840146, "learning_rate": 9.93823606750989e-05, "loss": 2.0474, "step": 6459 }, { "epoch": 0.05023906328255507, "grad_norm": 0.3185962268699242, "learning_rate": 9.938216921305698e-05, "loss": 2.1004, "step": 6460 }, { "epoch": 0.05024684022733565, "grad_norm": 0.2191532802097097, "learning_rate": 9.938197772152851e-05, "loss": 2.0051, "step": 6461 }, { "epoch": 0.05025461717211623, "grad_norm": 0.18826499432514757, "learning_rate": 9.938178620051355e-05, "loss": 2.0804, "step": 6462 }, { "epoch": 0.050262394116896814, "grad_norm": 0.16108040250029682, "learning_rate": 9.938159465001224e-05, "loss": 2.0636, "step": 6463 }, { "epoch": 0.050270171061677395, "grad_norm": 0.11398290886023929, "learning_rate": 9.938140307002472e-05, "loss": 2.0368, "step": 6464 }, { "epoch": 0.050277948006457976, "grad_norm": 0.18156075903762528, "learning_rate": 9.938121146055106e-05, "loss": 2.0376, "step": 6465 }, { "epoch": 0.05028572495123856, "grad_norm": 0.19366695537159231, "learning_rate": 9.938101982159138e-05, "loss": 2.113, "step": 6466 }, { "epoch": 0.05029350189601914, "grad_norm": 0.1424431837787472, "learning_rate": 9.938082815314582e-05, "loss": 2.0681, "step": 6467 }, { "epoch": 0.05030127884079972, "grad_norm": 0.12843512794653156, "learning_rate": 9.938063645521449e-05, "loss": 2.0935, "step": 6468 }, { "epoch": 0.0503090557855803, "grad_norm": 0.1257170702637425, "learning_rate": 9.938044472779749e-05, "loss": 2.09, "step": 6469 }, { "epoch": 0.05031683273036088, "grad_norm": 0.14466526451787867, "learning_rate": 9.938025297089492e-05, "loss": 2.0488, "step": 6470 }, { "epoch": 0.05032460967514146, "grad_norm": 0.1402818913999215, "learning_rate": 9.938006118450694e-05, "loss": 2.1315, "step": 6471 }, { "epoch": 0.050332386619922044, "grad_norm": 0.13367783733938626, "learning_rate": 9.937986936863361e-05, "loss": 2.0117, "step": 6472 }, { "epoch": 0.050340163564702625, "grad_norm": 0.12581599696532775, "learning_rate": 9.937967752327511e-05, "loss": 2.0371, "step": 6473 }, { "epoch": 0.050347940509483206, "grad_norm": 0.12831823467709197, "learning_rate": 9.937948564843149e-05, "loss": 2.0394, "step": 6474 }, { "epoch": 0.05035571745426379, "grad_norm": 0.1285394450497416, "learning_rate": 9.93792937441029e-05, "loss": 2.0686, "step": 6475 }, { "epoch": 0.05036349439904437, "grad_norm": 0.12229746682501441, "learning_rate": 9.937910181028946e-05, "loss": 2.0663, "step": 6476 }, { "epoch": 0.05037127134382495, "grad_norm": 0.1155383455539986, "learning_rate": 9.937890984699126e-05, "loss": 2.1274, "step": 6477 }, { "epoch": 0.05037904828860553, "grad_norm": 0.26560856303458036, "learning_rate": 9.937871785420843e-05, "loss": 2.0564, "step": 6478 }, { "epoch": 0.05038682523338611, "grad_norm": 0.1257634789345292, "learning_rate": 9.937852583194107e-05, "loss": 1.9787, "step": 6479 }, { "epoch": 0.05039460217816669, "grad_norm": 0.12004795330648961, "learning_rate": 9.937833378018933e-05, "loss": 2.1046, "step": 6480 }, { "epoch": 0.050402379122947275, "grad_norm": 0.13237311469516952, "learning_rate": 9.937814169895328e-05, "loss": 2.0992, "step": 6481 }, { "epoch": 0.050410156067727856, "grad_norm": 0.13006002186534346, "learning_rate": 9.937794958823306e-05, "loss": 2.0681, "step": 6482 }, { "epoch": 0.05041793301250844, "grad_norm": 0.12462508931369982, "learning_rate": 9.937775744802877e-05, "loss": 1.9914, "step": 6483 }, { "epoch": 0.05042570995728902, "grad_norm": 0.12475137016021037, "learning_rate": 9.937756527834054e-05, "loss": 2.0823, "step": 6484 }, { "epoch": 0.0504334869020696, "grad_norm": 0.12293269239711027, "learning_rate": 9.937737307916849e-05, "loss": 2.0723, "step": 6485 }, { "epoch": 0.05044126384685018, "grad_norm": 0.12443616372707036, "learning_rate": 9.937718085051271e-05, "loss": 2.1118, "step": 6486 }, { "epoch": 0.05044904079163076, "grad_norm": 0.11762784905522944, "learning_rate": 9.937698859237335e-05, "loss": 2.1111, "step": 6487 }, { "epoch": 0.05045681773641134, "grad_norm": 0.12252359852727027, "learning_rate": 9.937679630475048e-05, "loss": 2.0495, "step": 6488 }, { "epoch": 0.050464594681191924, "grad_norm": 0.11849264790934805, "learning_rate": 9.937660398764426e-05, "loss": 2.0323, "step": 6489 }, { "epoch": 0.050472371625972505, "grad_norm": 0.1310147173246367, "learning_rate": 9.937641164105477e-05, "loss": 1.9862, "step": 6490 }, { "epoch": 0.050480148570753086, "grad_norm": 0.1402162073003461, "learning_rate": 9.937621926498214e-05, "loss": 2.0584, "step": 6491 }, { "epoch": 0.05048792551553367, "grad_norm": 0.129482671529179, "learning_rate": 9.937602685942649e-05, "loss": 2.0413, "step": 6492 }, { "epoch": 0.05049570246031425, "grad_norm": 0.12564461836827942, "learning_rate": 9.937583442438792e-05, "loss": 2.1218, "step": 6493 }, { "epoch": 0.05050347940509483, "grad_norm": 0.16997949532812487, "learning_rate": 9.937564195986655e-05, "loss": 2.0604, "step": 6494 }, { "epoch": 0.05051125634987541, "grad_norm": 0.1780037802588458, "learning_rate": 9.937544946586251e-05, "loss": 2.063, "step": 6495 }, { "epoch": 0.05051903329465599, "grad_norm": 0.15955265240559124, "learning_rate": 9.937525694237588e-05, "loss": 2.0339, "step": 6496 }, { "epoch": 0.05052681023943657, "grad_norm": 0.12162292020565017, "learning_rate": 9.937506438940681e-05, "loss": 2.0559, "step": 6497 }, { "epoch": 0.050534587184217154, "grad_norm": 0.12355231579063515, "learning_rate": 9.937487180695542e-05, "loss": 2.0839, "step": 6498 }, { "epoch": 0.050542364128997735, "grad_norm": 0.15009540042515118, "learning_rate": 9.937467919502179e-05, "loss": 2.0729, "step": 6499 }, { "epoch": 0.05055014107377832, "grad_norm": 0.15355787450047417, "learning_rate": 9.937448655360605e-05, "loss": 2.1043, "step": 6500 }, { "epoch": 0.0505579180185589, "grad_norm": 0.12648485139099308, "learning_rate": 9.937429388270832e-05, "loss": 2.082, "step": 6501 }, { "epoch": 0.05056569496333948, "grad_norm": 0.1119419560466701, "learning_rate": 9.937410118232871e-05, "loss": 2.0535, "step": 6502 }, { "epoch": 0.05057347190812007, "grad_norm": 0.13259766940989368, "learning_rate": 9.937390845246734e-05, "loss": 2.0488, "step": 6503 }, { "epoch": 0.05058124885290065, "grad_norm": 0.1522040280917495, "learning_rate": 9.937371569312434e-05, "loss": 2.1061, "step": 6504 }, { "epoch": 0.05058902579768123, "grad_norm": 0.1318601734326711, "learning_rate": 9.937352290429979e-05, "loss": 2.0611, "step": 6505 }, { "epoch": 0.05059680274246181, "grad_norm": 0.11531893444024448, "learning_rate": 9.937333008599383e-05, "loss": 2.0397, "step": 6506 }, { "epoch": 0.05060457968724239, "grad_norm": 0.1214426946872868, "learning_rate": 9.937313723820656e-05, "loss": 2.0611, "step": 6507 }, { "epoch": 0.05061235663202297, "grad_norm": 0.14386719384636337, "learning_rate": 9.937294436093812e-05, "loss": 2.038, "step": 6508 }, { "epoch": 0.050620133576803554, "grad_norm": 0.2663040359777839, "learning_rate": 9.937275145418858e-05, "loss": 2.1357, "step": 6509 }, { "epoch": 0.050627910521584135, "grad_norm": 0.12032389543924568, "learning_rate": 9.93725585179581e-05, "loss": 2.0639, "step": 6510 }, { "epoch": 0.050635687466364716, "grad_norm": 0.14844260893985212, "learning_rate": 9.937236555224678e-05, "loss": 2.0602, "step": 6511 }, { "epoch": 0.0506434644111453, "grad_norm": 0.19196763225094246, "learning_rate": 9.937217255705473e-05, "loss": 2.0585, "step": 6512 }, { "epoch": 0.05065124135592588, "grad_norm": 0.17331412041758165, "learning_rate": 9.937197953238208e-05, "loss": 2.0467, "step": 6513 }, { "epoch": 0.05065901830070646, "grad_norm": 0.1237251139863927, "learning_rate": 9.937178647822893e-05, "loss": 2.0641, "step": 6514 }, { "epoch": 0.05066679524548704, "grad_norm": 0.12234298867648406, "learning_rate": 9.937159339459539e-05, "loss": 2.0237, "step": 6515 }, { "epoch": 0.05067457219026762, "grad_norm": 0.1305783664405265, "learning_rate": 9.93714002814816e-05, "loss": 2.0661, "step": 6516 }, { "epoch": 0.0506823491350482, "grad_norm": 0.12857723792428088, "learning_rate": 9.937120713888766e-05, "loss": 2.0622, "step": 6517 }, { "epoch": 0.050690126079828784, "grad_norm": 0.12166717357742521, "learning_rate": 9.937101396681368e-05, "loss": 2.0653, "step": 6518 }, { "epoch": 0.050697903024609366, "grad_norm": 0.13726038276712066, "learning_rate": 9.937082076525976e-05, "loss": 2.0553, "step": 6519 }, { "epoch": 0.05070567996938995, "grad_norm": 0.14520317408782318, "learning_rate": 9.937062753422606e-05, "loss": 2.0767, "step": 6520 }, { "epoch": 0.05071345691417053, "grad_norm": 0.13933967382426288, "learning_rate": 9.937043427371266e-05, "loss": 2.0759, "step": 6521 }, { "epoch": 0.05072123385895111, "grad_norm": 0.14500734507017943, "learning_rate": 9.93702409837197e-05, "loss": 2.0046, "step": 6522 }, { "epoch": 0.05072901080373169, "grad_norm": 0.13505710247075006, "learning_rate": 9.937004766424728e-05, "loss": 2.0809, "step": 6523 }, { "epoch": 0.05073678774851227, "grad_norm": 0.11916303406665095, "learning_rate": 9.936985431529553e-05, "loss": 2.0586, "step": 6524 }, { "epoch": 0.05074456469329285, "grad_norm": 0.11789011690463261, "learning_rate": 9.936966093686453e-05, "loss": 2.1081, "step": 6525 }, { "epoch": 0.050752341638073434, "grad_norm": 0.12119349672256437, "learning_rate": 9.936946752895442e-05, "loss": 2.0338, "step": 6526 }, { "epoch": 0.050760118582854015, "grad_norm": 0.13896583312365626, "learning_rate": 9.936927409156534e-05, "loss": 2.0399, "step": 6527 }, { "epoch": 0.050767895527634596, "grad_norm": 0.13529128788696715, "learning_rate": 9.936908062469737e-05, "loss": 2.0788, "step": 6528 }, { "epoch": 0.05077567247241518, "grad_norm": 0.12579990256381898, "learning_rate": 9.936888712835061e-05, "loss": 2.0729, "step": 6529 }, { "epoch": 0.05078344941719576, "grad_norm": 0.11698129616148271, "learning_rate": 9.936869360252523e-05, "loss": 2.0367, "step": 6530 }, { "epoch": 0.05079122636197634, "grad_norm": 0.12964473656773348, "learning_rate": 9.93685000472213e-05, "loss": 1.9873, "step": 6531 }, { "epoch": 0.05079900330675692, "grad_norm": 0.11755521493218528, "learning_rate": 9.936830646243895e-05, "loss": 2.1028, "step": 6532 }, { "epoch": 0.0508067802515375, "grad_norm": 0.11569145709530303, "learning_rate": 9.936811284817831e-05, "loss": 2.0434, "step": 6533 }, { "epoch": 0.05081455719631808, "grad_norm": 0.11346726345596626, "learning_rate": 9.936791920443949e-05, "loss": 2.0566, "step": 6534 }, { "epoch": 0.050822334141098664, "grad_norm": 0.11694767886085138, "learning_rate": 9.936772553122258e-05, "loss": 2.0388, "step": 6535 }, { "epoch": 0.050830111085879245, "grad_norm": 0.11252015348420481, "learning_rate": 9.936753182852774e-05, "loss": 2.0866, "step": 6536 }, { "epoch": 0.05083788803065983, "grad_norm": 0.1260269928758307, "learning_rate": 9.936733809635503e-05, "loss": 2.0843, "step": 6537 }, { "epoch": 0.05084566497544041, "grad_norm": 0.13603727281753566, "learning_rate": 9.936714433470461e-05, "loss": 2.0473, "step": 6538 }, { "epoch": 0.05085344192022099, "grad_norm": 0.1206657776491765, "learning_rate": 9.936695054357659e-05, "loss": 2.018, "step": 6539 }, { "epoch": 0.05086121886500157, "grad_norm": 0.11586921698882562, "learning_rate": 9.936675672297107e-05, "loss": 2.024, "step": 6540 }, { "epoch": 0.05086899580978215, "grad_norm": 0.13740930489127662, "learning_rate": 9.936656287288818e-05, "loss": 2.0901, "step": 6541 }, { "epoch": 0.05087677275456273, "grad_norm": 0.14228126276206265, "learning_rate": 9.936636899332801e-05, "loss": 2.018, "step": 6542 }, { "epoch": 0.050884549699343314, "grad_norm": 0.1233616444714682, "learning_rate": 9.936617508429072e-05, "loss": 2.0915, "step": 6543 }, { "epoch": 0.050892326644123895, "grad_norm": 0.11808512111901887, "learning_rate": 9.936598114577638e-05, "loss": 2.0656, "step": 6544 }, { "epoch": 0.050900103588904476, "grad_norm": 0.1290975785986835, "learning_rate": 9.936578717778514e-05, "loss": 2.0812, "step": 6545 }, { "epoch": 0.05090788053368506, "grad_norm": 0.11715970312491855, "learning_rate": 9.936559318031709e-05, "loss": 2.0527, "step": 6546 }, { "epoch": 0.05091565747846564, "grad_norm": 0.11133492263233169, "learning_rate": 9.936539915337235e-05, "loss": 2.0772, "step": 6547 }, { "epoch": 0.05092343442324622, "grad_norm": 0.11688823149652067, "learning_rate": 9.936520509695107e-05, "loss": 2.0214, "step": 6548 }, { "epoch": 0.0509312113680268, "grad_norm": 0.11360735612126314, "learning_rate": 9.936501101105332e-05, "loss": 2.0754, "step": 6549 }, { "epoch": 0.05093898831280738, "grad_norm": 0.1165849960161159, "learning_rate": 9.936481689567924e-05, "loss": 2.0275, "step": 6550 }, { "epoch": 0.05094676525758796, "grad_norm": 0.12164306553200975, "learning_rate": 9.936462275082895e-05, "loss": 2.0294, "step": 6551 }, { "epoch": 0.050954542202368544, "grad_norm": 0.1442173602421666, "learning_rate": 9.936442857650254e-05, "loss": 2.0768, "step": 6552 }, { "epoch": 0.050962319147149125, "grad_norm": 0.16337929154346056, "learning_rate": 9.936423437270017e-05, "loss": 2.131, "step": 6553 }, { "epoch": 0.050970096091929706, "grad_norm": 0.15178745890613965, "learning_rate": 9.936404013942192e-05, "loss": 2.047, "step": 6554 }, { "epoch": 0.05097787303671029, "grad_norm": 0.12302574059514261, "learning_rate": 9.93638458766679e-05, "loss": 2.0275, "step": 6555 }, { "epoch": 0.05098564998149087, "grad_norm": 0.12153701688855288, "learning_rate": 9.936365158443825e-05, "loss": 2.1057, "step": 6556 }, { "epoch": 0.05099342692627145, "grad_norm": 0.13942228300640283, "learning_rate": 9.936345726273308e-05, "loss": 2.0403, "step": 6557 }, { "epoch": 0.05100120387105203, "grad_norm": 0.14119752014308973, "learning_rate": 9.936326291155249e-05, "loss": 2.1023, "step": 6558 }, { "epoch": 0.05100898081583261, "grad_norm": 0.1224310231323628, "learning_rate": 9.936306853089663e-05, "loss": 2.0163, "step": 6559 }, { "epoch": 0.05101675776061319, "grad_norm": 0.13183383231293347, "learning_rate": 9.936287412076559e-05, "loss": 2.061, "step": 6560 }, { "epoch": 0.051024534705393775, "grad_norm": 0.17681557838670733, "learning_rate": 9.936267968115949e-05, "loss": 2.0075, "step": 6561 }, { "epoch": 0.051032311650174356, "grad_norm": 0.2129125011525951, "learning_rate": 9.936248521207844e-05, "loss": 2.0841, "step": 6562 }, { "epoch": 0.051040088594954944, "grad_norm": 0.18095940509333908, "learning_rate": 9.936229071352257e-05, "loss": 2.113, "step": 6563 }, { "epoch": 0.051047865539735525, "grad_norm": 0.12544392769128326, "learning_rate": 9.936209618549199e-05, "loss": 2.0359, "step": 6564 }, { "epoch": 0.051055642484516106, "grad_norm": 0.1272024350058959, "learning_rate": 9.93619016279868e-05, "loss": 2.0715, "step": 6565 }, { "epoch": 0.05106341942929669, "grad_norm": 0.16296052715997833, "learning_rate": 9.936170704100716e-05, "loss": 2.0622, "step": 6566 }, { "epoch": 0.05107119637407727, "grad_norm": 0.16529772816546529, "learning_rate": 9.936151242455313e-05, "loss": 2.0274, "step": 6567 }, { "epoch": 0.05107897331885785, "grad_norm": 0.1546862803318321, "learning_rate": 9.936131777862488e-05, "loss": 2.0864, "step": 6568 }, { "epoch": 0.05108675026363843, "grad_norm": 0.11998773391578067, "learning_rate": 9.93611231032225e-05, "loss": 2.0361, "step": 6569 }, { "epoch": 0.05109452720841901, "grad_norm": 0.14153021543920946, "learning_rate": 9.936092839834609e-05, "loss": 2.0976, "step": 6570 }, { "epoch": 0.05110230415319959, "grad_norm": 0.11969675303671809, "learning_rate": 9.936073366399579e-05, "loss": 2.0725, "step": 6571 }, { "epoch": 0.051110081097980174, "grad_norm": 0.1249610844176352, "learning_rate": 9.936053890017172e-05, "loss": 2.067, "step": 6572 }, { "epoch": 0.051117858042760755, "grad_norm": 0.16802887174763267, "learning_rate": 9.936034410687397e-05, "loss": 2.0367, "step": 6573 }, { "epoch": 0.05112563498754134, "grad_norm": 0.17478338860819978, "learning_rate": 9.936014928410268e-05, "loss": 2.0097, "step": 6574 }, { "epoch": 0.05113341193232192, "grad_norm": 0.12590952769531896, "learning_rate": 9.935995443185795e-05, "loss": 2.0739, "step": 6575 }, { "epoch": 0.0511411888771025, "grad_norm": 0.16093108805330641, "learning_rate": 9.935975955013991e-05, "loss": 2.0487, "step": 6576 }, { "epoch": 0.05114896582188308, "grad_norm": 0.29676450621322753, "learning_rate": 9.935956463894868e-05, "loss": 2.0473, "step": 6577 }, { "epoch": 0.05115674276666366, "grad_norm": 0.12434876460884126, "learning_rate": 9.935936969828436e-05, "loss": 2.0572, "step": 6578 }, { "epoch": 0.05116451971144424, "grad_norm": 0.15022264485647477, "learning_rate": 9.935917472814708e-05, "loss": 2.0592, "step": 6579 }, { "epoch": 0.051172296656224824, "grad_norm": 0.19916237815169635, "learning_rate": 9.935897972853694e-05, "loss": 2.0235, "step": 6580 }, { "epoch": 0.051180073601005405, "grad_norm": 0.149513945414054, "learning_rate": 9.935878469945408e-05, "loss": 2.0548, "step": 6581 }, { "epoch": 0.051187850545785986, "grad_norm": 0.12312810605749673, "learning_rate": 9.935858964089859e-05, "loss": 2.0073, "step": 6582 }, { "epoch": 0.05119562749056657, "grad_norm": 0.1674152881374869, "learning_rate": 9.935839455287062e-05, "loss": 2.0758, "step": 6583 }, { "epoch": 0.05120340443534715, "grad_norm": 0.13692615906532726, "learning_rate": 9.935819943537023e-05, "loss": 2.0381, "step": 6584 }, { "epoch": 0.05121118138012773, "grad_norm": 0.12194392948735423, "learning_rate": 9.93580042883976e-05, "loss": 2.0479, "step": 6585 }, { "epoch": 0.05121895832490831, "grad_norm": 0.1482509930275712, "learning_rate": 9.935780911195283e-05, "loss": 2.0456, "step": 6586 }, { "epoch": 0.05122673526968889, "grad_norm": 0.1278535933117111, "learning_rate": 9.9357613906036e-05, "loss": 2.0872, "step": 6587 }, { "epoch": 0.05123451221446947, "grad_norm": 0.11518302441442867, "learning_rate": 9.935741867064727e-05, "loss": 2.0777, "step": 6588 }, { "epoch": 0.051242289159250054, "grad_norm": 0.13071850365571056, "learning_rate": 9.935722340578673e-05, "loss": 2.034, "step": 6589 }, { "epoch": 0.051250066104030635, "grad_norm": 0.13480030805200144, "learning_rate": 9.935702811145451e-05, "loss": 2.1031, "step": 6590 }, { "epoch": 0.051257843048811216, "grad_norm": 0.12328223239652146, "learning_rate": 9.93568327876507e-05, "loss": 2.104, "step": 6591 }, { "epoch": 0.0512656199935918, "grad_norm": 0.12143138918981207, "learning_rate": 9.935663743437547e-05, "loss": 2.0855, "step": 6592 }, { "epoch": 0.05127339693837238, "grad_norm": 0.12735894972631534, "learning_rate": 9.935644205162889e-05, "loss": 2.0802, "step": 6593 }, { "epoch": 0.05128117388315296, "grad_norm": 0.1374781622752093, "learning_rate": 9.93562466394111e-05, "loss": 2.0738, "step": 6594 }, { "epoch": 0.05128895082793354, "grad_norm": 0.12866847861701325, "learning_rate": 9.93560511977222e-05, "loss": 2.0381, "step": 6595 }, { "epoch": 0.05129672777271412, "grad_norm": 0.11866850106044238, "learning_rate": 9.935585572656232e-05, "loss": 2.0269, "step": 6596 }, { "epoch": 0.0513045047174947, "grad_norm": 0.11620755735936424, "learning_rate": 9.935566022593158e-05, "loss": 2.0638, "step": 6597 }, { "epoch": 0.051312281662275284, "grad_norm": 0.1372871623208259, "learning_rate": 9.935546469583007e-05, "loss": 2.08, "step": 6598 }, { "epoch": 0.051320058607055866, "grad_norm": 0.18545274141642765, "learning_rate": 9.935526913625794e-05, "loss": 2.0477, "step": 6599 }, { "epoch": 0.05132783555183645, "grad_norm": 0.11674647820259851, "learning_rate": 9.935507354721529e-05, "loss": 2.0755, "step": 6600 }, { "epoch": 0.05133561249661703, "grad_norm": 0.13282629970183082, "learning_rate": 9.935487792870225e-05, "loss": 2.0944, "step": 6601 }, { "epoch": 0.05134338944139761, "grad_norm": 0.1291643604916951, "learning_rate": 9.935468228071891e-05, "loss": 2.1005, "step": 6602 }, { "epoch": 0.05135116638617819, "grad_norm": 0.11749510361863294, "learning_rate": 9.935448660326541e-05, "loss": 2.0026, "step": 6603 }, { "epoch": 0.05135894333095877, "grad_norm": 0.12113294144857911, "learning_rate": 9.935429089634186e-05, "loss": 2.0785, "step": 6604 }, { "epoch": 0.05136672027573935, "grad_norm": 0.12080831483574948, "learning_rate": 9.935409515994837e-05, "loss": 2.0531, "step": 6605 }, { "epoch": 0.051374497220519934, "grad_norm": 0.12576803423450345, "learning_rate": 9.935389939408508e-05, "loss": 2.0633, "step": 6606 }, { "epoch": 0.051382274165300515, "grad_norm": 0.11511446041229866, "learning_rate": 9.935370359875206e-05, "loss": 2.0766, "step": 6607 }, { "epoch": 0.051390051110081096, "grad_norm": 0.11658911414743636, "learning_rate": 9.93535077739495e-05, "loss": 2.0898, "step": 6608 }, { "epoch": 0.05139782805486168, "grad_norm": 0.11324061405478127, "learning_rate": 9.935331191967745e-05, "loss": 2.1213, "step": 6609 }, { "epoch": 0.05140560499964226, "grad_norm": 0.12025875435613509, "learning_rate": 9.935311603593605e-05, "loss": 2.0408, "step": 6610 }, { "epoch": 0.05141338194442284, "grad_norm": 0.1346783181575128, "learning_rate": 9.935292012272541e-05, "loss": 2.0858, "step": 6611 }, { "epoch": 0.05142115888920342, "grad_norm": 0.1483303451820247, "learning_rate": 9.935272418004566e-05, "loss": 2.0773, "step": 6612 }, { "epoch": 0.051428935833984, "grad_norm": 0.1320436973953122, "learning_rate": 9.935252820789693e-05, "loss": 2.0487, "step": 6613 }, { "epoch": 0.05143671277876458, "grad_norm": 0.1208188726407199, "learning_rate": 9.935233220627931e-05, "loss": 2.0535, "step": 6614 }, { "epoch": 0.051444489723545164, "grad_norm": 0.11540191748316385, "learning_rate": 9.935213617519293e-05, "loss": 2.0682, "step": 6615 }, { "epoch": 0.051452266668325745, "grad_norm": 0.12121052160758801, "learning_rate": 9.935194011463789e-05, "loss": 2.125, "step": 6616 }, { "epoch": 0.05146004361310633, "grad_norm": 0.12206292441586401, "learning_rate": 9.935174402461432e-05, "loss": 2.0348, "step": 6617 }, { "epoch": 0.05146782055788691, "grad_norm": 0.12102504123860573, "learning_rate": 9.935154790512235e-05, "loss": 2.0413, "step": 6618 }, { "epoch": 0.05147559750266749, "grad_norm": 0.11409765928075082, "learning_rate": 9.935135175616207e-05, "loss": 1.9874, "step": 6619 }, { "epoch": 0.05148337444744807, "grad_norm": 0.11378755445146785, "learning_rate": 9.935115557773363e-05, "loss": 2.0973, "step": 6620 }, { "epoch": 0.05149115139222865, "grad_norm": 0.11776570852149494, "learning_rate": 9.935095936983712e-05, "loss": 2.0476, "step": 6621 }, { "epoch": 0.05149892833700924, "grad_norm": 0.13134643420159414, "learning_rate": 9.935076313247265e-05, "loss": 2.0273, "step": 6622 }, { "epoch": 0.05150670528178982, "grad_norm": 0.1266540008675062, "learning_rate": 9.935056686564037e-05, "loss": 2.0273, "step": 6623 }, { "epoch": 0.0515144822265704, "grad_norm": 0.12584309109029293, "learning_rate": 9.935037056934038e-05, "loss": 2.0568, "step": 6624 }, { "epoch": 0.05152225917135098, "grad_norm": 0.11684667670910587, "learning_rate": 9.93501742435728e-05, "loss": 2.0933, "step": 6625 }, { "epoch": 0.051530036116131564, "grad_norm": 0.11513279470271308, "learning_rate": 9.934997788833773e-05, "loss": 2.1059, "step": 6626 }, { "epoch": 0.051537813060912145, "grad_norm": 0.11849115381607422, "learning_rate": 9.934978150363532e-05, "loss": 2.1347, "step": 6627 }, { "epoch": 0.051545590005692726, "grad_norm": 0.1149727875519886, "learning_rate": 9.934958508946566e-05, "loss": 1.9887, "step": 6628 }, { "epoch": 0.05155336695047331, "grad_norm": 0.11734303155371624, "learning_rate": 9.934938864582887e-05, "loss": 2.0467, "step": 6629 }, { "epoch": 0.05156114389525389, "grad_norm": 0.12526315085629652, "learning_rate": 9.93491921727251e-05, "loss": 2.032, "step": 6630 }, { "epoch": 0.05156892084003447, "grad_norm": 0.1338095941559643, "learning_rate": 9.934899567015441e-05, "loss": 2.0788, "step": 6631 }, { "epoch": 0.05157669778481505, "grad_norm": 0.12464391958642033, "learning_rate": 9.934879913811697e-05, "loss": 2.0047, "step": 6632 }, { "epoch": 0.05158447472959563, "grad_norm": 0.116486391697374, "learning_rate": 9.934860257661287e-05, "loss": 2.1151, "step": 6633 }, { "epoch": 0.05159225167437621, "grad_norm": 0.1305937380484149, "learning_rate": 9.934840598564222e-05, "loss": 2.0337, "step": 6634 }, { "epoch": 0.051600028619156794, "grad_norm": 0.13646025219622912, "learning_rate": 9.934820936520517e-05, "loss": 2.09, "step": 6635 }, { "epoch": 0.051607805563937376, "grad_norm": 0.11980008994845476, "learning_rate": 9.93480127153018e-05, "loss": 2.0618, "step": 6636 }, { "epoch": 0.05161558250871796, "grad_norm": 0.12257277710620108, "learning_rate": 9.934781603593226e-05, "loss": 2.0906, "step": 6637 }, { "epoch": 0.05162335945349854, "grad_norm": 0.13723608235195858, "learning_rate": 9.934761932709664e-05, "loss": 2.0511, "step": 6638 }, { "epoch": 0.05163113639827912, "grad_norm": 0.15575293362412965, "learning_rate": 9.934742258879508e-05, "loss": 2.0729, "step": 6639 }, { "epoch": 0.0516389133430597, "grad_norm": 0.1510358391677951, "learning_rate": 9.934722582102769e-05, "loss": 2.0208, "step": 6640 }, { "epoch": 0.05164669028784028, "grad_norm": 0.12123735152960988, "learning_rate": 9.934702902379458e-05, "loss": 2.096, "step": 6641 }, { "epoch": 0.05165446723262086, "grad_norm": 0.12298815201205429, "learning_rate": 9.934683219709587e-05, "loss": 1.9876, "step": 6642 }, { "epoch": 0.051662244177401444, "grad_norm": 0.14766682527505953, "learning_rate": 9.934663534093169e-05, "loss": 2.0351, "step": 6643 }, { "epoch": 0.051670021122182025, "grad_norm": 0.12678995457594652, "learning_rate": 9.934643845530214e-05, "loss": 2.0386, "step": 6644 }, { "epoch": 0.051677798066962606, "grad_norm": 0.11549409234891328, "learning_rate": 9.934624154020736e-05, "loss": 2.0798, "step": 6645 }, { "epoch": 0.05168557501174319, "grad_norm": 0.11857046900385657, "learning_rate": 9.934604459564743e-05, "loss": 2.0849, "step": 6646 }, { "epoch": 0.05169335195652377, "grad_norm": 0.1213099027940713, "learning_rate": 9.93458476216225e-05, "loss": 2.0995, "step": 6647 }, { "epoch": 0.05170112890130435, "grad_norm": 0.12202743595762537, "learning_rate": 9.934565061813268e-05, "loss": 2.0826, "step": 6648 }, { "epoch": 0.05170890584608493, "grad_norm": 0.11510154930830906, "learning_rate": 9.934545358517809e-05, "loss": 2.0932, "step": 6649 }, { "epoch": 0.05171668279086551, "grad_norm": 0.11276567037278865, "learning_rate": 9.934525652275884e-05, "loss": 2.029, "step": 6650 }, { "epoch": 0.05172445973564609, "grad_norm": 0.13069002099128854, "learning_rate": 9.934505943087505e-05, "loss": 2.0683, "step": 6651 }, { "epoch": 0.051732236680426674, "grad_norm": 0.1252310911106676, "learning_rate": 9.934486230952683e-05, "loss": 2.0419, "step": 6652 }, { "epoch": 0.051740013625207255, "grad_norm": 0.11494380224665099, "learning_rate": 9.934466515871433e-05, "loss": 2.0432, "step": 6653 }, { "epoch": 0.05174779056998784, "grad_norm": 0.11538793232360288, "learning_rate": 9.934446797843762e-05, "loss": 2.0286, "step": 6654 }, { "epoch": 0.05175556751476842, "grad_norm": 0.13386583188793905, "learning_rate": 9.934427076869684e-05, "loss": 2.0808, "step": 6655 }, { "epoch": 0.051763344459549, "grad_norm": 0.13549234239177088, "learning_rate": 9.934407352949214e-05, "loss": 1.9923, "step": 6656 }, { "epoch": 0.05177112140432958, "grad_norm": 0.12336836489490045, "learning_rate": 9.934387626082357e-05, "loss": 2.0901, "step": 6657 }, { "epoch": 0.05177889834911016, "grad_norm": 0.11777113223823193, "learning_rate": 9.934367896269131e-05, "loss": 2.0562, "step": 6658 }, { "epoch": 0.05178667529389074, "grad_norm": 0.11910031475130654, "learning_rate": 9.934348163509545e-05, "loss": 2.0419, "step": 6659 }, { "epoch": 0.051794452238671324, "grad_norm": 0.13015679041329903, "learning_rate": 9.93432842780361e-05, "loss": 2.0331, "step": 6660 }, { "epoch": 0.051802229183451905, "grad_norm": 0.14109728505278166, "learning_rate": 9.934308689151341e-05, "loss": 2.0762, "step": 6661 }, { "epoch": 0.051810006128232486, "grad_norm": 0.1387605426251308, "learning_rate": 9.934288947552745e-05, "loss": 2.0965, "step": 6662 }, { "epoch": 0.05181778307301307, "grad_norm": 0.11975423507458932, "learning_rate": 9.934269203007838e-05, "loss": 2.0401, "step": 6663 }, { "epoch": 0.05182556001779365, "grad_norm": 0.12134496655034464, "learning_rate": 9.934249455516631e-05, "loss": 2.0657, "step": 6664 }, { "epoch": 0.05183333696257423, "grad_norm": 0.14422931073367634, "learning_rate": 9.934229705079134e-05, "loss": 2.0663, "step": 6665 }, { "epoch": 0.05184111390735481, "grad_norm": 0.13209343528385661, "learning_rate": 9.93420995169536e-05, "loss": 2.0797, "step": 6666 }, { "epoch": 0.05184889085213539, "grad_norm": 0.11460608243071202, "learning_rate": 9.93419019536532e-05, "loss": 2.0549, "step": 6667 }, { "epoch": 0.05185666779691597, "grad_norm": 0.1417479559948371, "learning_rate": 9.934170436089027e-05, "loss": 2.0534, "step": 6668 }, { "epoch": 0.051864444741696554, "grad_norm": 0.19689157344753958, "learning_rate": 9.934150673866493e-05, "loss": 2.0544, "step": 6669 }, { "epoch": 0.051872221686477135, "grad_norm": 0.2071300926115209, "learning_rate": 9.934130908697727e-05, "loss": 2.0546, "step": 6670 }, { "epoch": 0.051879998631257716, "grad_norm": 0.14539066840033307, "learning_rate": 9.934111140582744e-05, "loss": 2.0242, "step": 6671 }, { "epoch": 0.0518877755760383, "grad_norm": 0.11672296720450086, "learning_rate": 9.934091369521555e-05, "loss": 2.0969, "step": 6672 }, { "epoch": 0.05189555252081888, "grad_norm": 0.16488797811724692, "learning_rate": 9.934071595514171e-05, "loss": 2.0305, "step": 6673 }, { "epoch": 0.05190332946559946, "grad_norm": 0.17010473182588964, "learning_rate": 9.934051818560605e-05, "loss": 2.0599, "step": 6674 }, { "epoch": 0.05191110641038004, "grad_norm": 0.1211830857865394, "learning_rate": 9.934032038660868e-05, "loss": 2.044, "step": 6675 }, { "epoch": 0.05191888335516062, "grad_norm": 0.13426158873184302, "learning_rate": 9.934012255814971e-05, "loss": 2.0697, "step": 6676 }, { "epoch": 0.0519266602999412, "grad_norm": 0.16901579118975135, "learning_rate": 9.933992470022926e-05, "loss": 2.0432, "step": 6677 }, { "epoch": 0.051934437244721784, "grad_norm": 0.158570045053399, "learning_rate": 9.933972681284746e-05, "loss": 2.1235, "step": 6678 }, { "epoch": 0.051942214189502366, "grad_norm": 0.11909562994486457, "learning_rate": 9.933952889600443e-05, "loss": 1.9968, "step": 6679 }, { "epoch": 0.05194999113428295, "grad_norm": 0.14171714006076128, "learning_rate": 9.933933094970028e-05, "loss": 2.0454, "step": 6680 }, { "epoch": 0.05195776807906353, "grad_norm": 0.1735083265378975, "learning_rate": 9.933913297393513e-05, "loss": 2.1104, "step": 6681 }, { "epoch": 0.051965545023844116, "grad_norm": 0.12166916461292826, "learning_rate": 9.933893496870909e-05, "loss": 2.099, "step": 6682 }, { "epoch": 0.0519733219686247, "grad_norm": 0.13063142630170355, "learning_rate": 9.933873693402229e-05, "loss": 2.0525, "step": 6683 }, { "epoch": 0.05198109891340528, "grad_norm": 0.15369529049856043, "learning_rate": 9.933853886987486e-05, "loss": 2.0805, "step": 6684 }, { "epoch": 0.05198887585818586, "grad_norm": 0.12044399148502512, "learning_rate": 9.933834077626687e-05, "loss": 2.0597, "step": 6685 }, { "epoch": 0.05199665280296644, "grad_norm": 0.11639816723737155, "learning_rate": 9.93381426531985e-05, "loss": 2.1177, "step": 6686 }, { "epoch": 0.05200442974774702, "grad_norm": 0.12747217489486448, "learning_rate": 9.933794450066983e-05, "loss": 2.0145, "step": 6687 }, { "epoch": 0.0520122066925276, "grad_norm": 0.11813593309085078, "learning_rate": 9.933774631868098e-05, "loss": 2.0812, "step": 6688 }, { "epoch": 0.052019983637308184, "grad_norm": 0.11468820333600514, "learning_rate": 9.933754810723209e-05, "loss": 2.048, "step": 6689 }, { "epoch": 0.052027760582088765, "grad_norm": 0.11831695891801165, "learning_rate": 9.933734986632324e-05, "loss": 2.0802, "step": 6690 }, { "epoch": 0.052035537526869347, "grad_norm": 0.11714624464601898, "learning_rate": 9.933715159595458e-05, "loss": 2.0142, "step": 6691 }, { "epoch": 0.05204331447164993, "grad_norm": 0.13224960038805186, "learning_rate": 9.933695329612623e-05, "loss": 2.1371, "step": 6692 }, { "epoch": 0.05205109141643051, "grad_norm": 0.12769920830397438, "learning_rate": 9.93367549668383e-05, "loss": 2.0193, "step": 6693 }, { "epoch": 0.05205886836121109, "grad_norm": 0.11403404909582211, "learning_rate": 9.93365566080909e-05, "loss": 2.0585, "step": 6694 }, { "epoch": 0.05206664530599167, "grad_norm": 0.11848450440091839, "learning_rate": 9.933635821988416e-05, "loss": 2.0536, "step": 6695 }, { "epoch": 0.05207442225077225, "grad_norm": 0.11951308582090339, "learning_rate": 9.93361598022182e-05, "loss": 2.0434, "step": 6696 }, { "epoch": 0.052082199195552834, "grad_norm": 0.12085490758121806, "learning_rate": 9.933596135509312e-05, "loss": 2.1147, "step": 6697 }, { "epoch": 0.052089976140333415, "grad_norm": 0.1318897932150968, "learning_rate": 9.933576287850906e-05, "loss": 2.0567, "step": 6698 }, { "epoch": 0.052097753085113996, "grad_norm": 0.12905245967460832, "learning_rate": 9.933556437246613e-05, "loss": 2.0721, "step": 6699 }, { "epoch": 0.05210553002989458, "grad_norm": 0.12607856789783306, "learning_rate": 9.933536583696444e-05, "loss": 2.0812, "step": 6700 }, { "epoch": 0.05211330697467516, "grad_norm": 0.11606016734469611, "learning_rate": 9.933516727200413e-05, "loss": 2.033, "step": 6701 }, { "epoch": 0.05212108391945574, "grad_norm": 0.12584630429927124, "learning_rate": 9.93349686775853e-05, "loss": 2.0557, "step": 6702 }, { "epoch": 0.05212886086423632, "grad_norm": 0.1271533816812633, "learning_rate": 9.933477005370808e-05, "loss": 2.0931, "step": 6703 }, { "epoch": 0.0521366378090169, "grad_norm": 0.11405082579675963, "learning_rate": 9.933457140037257e-05, "loss": 2.0798, "step": 6704 }, { "epoch": 0.05214441475379748, "grad_norm": 0.13036601432523892, "learning_rate": 9.933437271757891e-05, "loss": 2.0605, "step": 6705 }, { "epoch": 0.052152191698578064, "grad_norm": 0.15500337007447193, "learning_rate": 9.93341740053272e-05, "loss": 2.0383, "step": 6706 }, { "epoch": 0.052159968643358645, "grad_norm": 0.132719605398974, "learning_rate": 9.933397526361759e-05, "loss": 2.0756, "step": 6707 }, { "epoch": 0.052167745588139226, "grad_norm": 0.11306767326596118, "learning_rate": 9.933377649245015e-05, "loss": 2.0057, "step": 6708 }, { "epoch": 0.05217552253291981, "grad_norm": 0.1259044241785148, "learning_rate": 9.933357769182505e-05, "loss": 2.1221, "step": 6709 }, { "epoch": 0.05218329947770039, "grad_norm": 0.11134920998441934, "learning_rate": 9.933337886174237e-05, "loss": 2.069, "step": 6710 }, { "epoch": 0.05219107642248097, "grad_norm": 0.11455904494270516, "learning_rate": 9.933318000220224e-05, "loss": 2.0481, "step": 6711 }, { "epoch": 0.05219885336726155, "grad_norm": 0.11798213977390401, "learning_rate": 9.933298111320479e-05, "loss": 2.0255, "step": 6712 }, { "epoch": 0.05220663031204213, "grad_norm": 0.11788697623940911, "learning_rate": 9.933278219475012e-05, "loss": 2.0088, "step": 6713 }, { "epoch": 0.05221440725682271, "grad_norm": 0.12713757325795402, "learning_rate": 9.933258324683839e-05, "loss": 2.0436, "step": 6714 }, { "epoch": 0.052222184201603294, "grad_norm": 0.12953270766022693, "learning_rate": 9.933238426946967e-05, "loss": 2.1411, "step": 6715 }, { "epoch": 0.052229961146383876, "grad_norm": 0.1319300401730366, "learning_rate": 9.93321852626441e-05, "loss": 2.0573, "step": 6716 }, { "epoch": 0.05223773809116446, "grad_norm": 0.12854227808139623, "learning_rate": 9.933198622636177e-05, "loss": 2.0826, "step": 6717 }, { "epoch": 0.05224551503594504, "grad_norm": 0.11927057538866848, "learning_rate": 9.933178716062286e-05, "loss": 2.1022, "step": 6718 }, { "epoch": 0.05225329198072562, "grad_norm": 0.11150182718637597, "learning_rate": 9.933158806542744e-05, "loss": 2.0932, "step": 6719 }, { "epoch": 0.0522610689255062, "grad_norm": 0.11949751975463302, "learning_rate": 9.933138894077565e-05, "loss": 2.0324, "step": 6720 }, { "epoch": 0.05226884587028678, "grad_norm": 0.12039252973093444, "learning_rate": 9.933118978666758e-05, "loss": 2.0855, "step": 6721 }, { "epoch": 0.05227662281506736, "grad_norm": 0.11472334449061021, "learning_rate": 9.93309906031034e-05, "loss": 2.0554, "step": 6722 }, { "epoch": 0.052284399759847944, "grad_norm": 0.11506693852578027, "learning_rate": 9.933079139008317e-05, "loss": 2.1166, "step": 6723 }, { "epoch": 0.052292176704628525, "grad_norm": 0.12098283314006808, "learning_rate": 9.933059214760707e-05, "loss": 2.0418, "step": 6724 }, { "epoch": 0.052299953649409106, "grad_norm": 0.11562814226282872, "learning_rate": 9.933039287567516e-05, "loss": 2.0444, "step": 6725 }, { "epoch": 0.05230773059418969, "grad_norm": 0.11626357489476954, "learning_rate": 9.93301935742876e-05, "loss": 2.0487, "step": 6726 }, { "epoch": 0.05231550753897027, "grad_norm": 0.11562686633455867, "learning_rate": 9.932999424344449e-05, "loss": 2.0486, "step": 6727 }, { "epoch": 0.05232328448375085, "grad_norm": 0.11527347466986874, "learning_rate": 9.932979488314596e-05, "loss": 2.0794, "step": 6728 }, { "epoch": 0.05233106142853143, "grad_norm": 0.12744609995694983, "learning_rate": 9.932959549339213e-05, "loss": 2.0619, "step": 6729 }, { "epoch": 0.05233883837331201, "grad_norm": 0.14703024028619255, "learning_rate": 9.93293960741831e-05, "loss": 2.0465, "step": 6730 }, { "epoch": 0.05234661531809259, "grad_norm": 0.16907723110164524, "learning_rate": 9.9329196625519e-05, "loss": 2.0678, "step": 6731 }, { "epoch": 0.052354392262873174, "grad_norm": 0.15119403544602747, "learning_rate": 9.932899714739996e-05, "loss": 2.0603, "step": 6732 }, { "epoch": 0.052362169207653755, "grad_norm": 0.12739944905976605, "learning_rate": 9.932879763982608e-05, "loss": 2.1134, "step": 6733 }, { "epoch": 0.05236994615243434, "grad_norm": 0.11516348994152222, "learning_rate": 9.93285981027975e-05, "loss": 2.0468, "step": 6734 }, { "epoch": 0.05237772309721492, "grad_norm": 0.1643544202467848, "learning_rate": 9.932839853631432e-05, "loss": 2.0901, "step": 6735 }, { "epoch": 0.0523855000419955, "grad_norm": 0.18405958565406733, "learning_rate": 9.932819894037667e-05, "loss": 2.1314, "step": 6736 }, { "epoch": 0.05239327698677608, "grad_norm": 0.14346148655593133, "learning_rate": 9.932799931498466e-05, "loss": 2.0711, "step": 6737 }, { "epoch": 0.05240105393155666, "grad_norm": 0.11803471703675777, "learning_rate": 9.932779966013843e-05, "loss": 2.06, "step": 6738 }, { "epoch": 0.05240883087633724, "grad_norm": 0.15022618101146357, "learning_rate": 9.932759997583807e-05, "loss": 2.0831, "step": 6739 }, { "epoch": 0.052416607821117824, "grad_norm": 0.16628922634531748, "learning_rate": 9.932740026208372e-05, "loss": 2.1201, "step": 6740 }, { "epoch": 0.05242438476589841, "grad_norm": 0.12336545353252883, "learning_rate": 9.932720051887549e-05, "loss": 2.0516, "step": 6741 }, { "epoch": 0.05243216171067899, "grad_norm": 0.12821557590130503, "learning_rate": 9.932700074621352e-05, "loss": 2.0771, "step": 6742 }, { "epoch": 0.052439938655459574, "grad_norm": 0.15464221150416865, "learning_rate": 9.93268009440979e-05, "loss": 2.0373, "step": 6743 }, { "epoch": 0.052447715600240155, "grad_norm": 0.15161285856695284, "learning_rate": 9.932660111252876e-05, "loss": 2.0418, "step": 6744 }, { "epoch": 0.052455492545020736, "grad_norm": 0.11866057896943734, "learning_rate": 9.932640125150621e-05, "loss": 2.1165, "step": 6745 }, { "epoch": 0.05246326948980132, "grad_norm": 0.11680876951958055, "learning_rate": 9.932620136103039e-05, "loss": 2.0655, "step": 6746 }, { "epoch": 0.0524710464345819, "grad_norm": 0.12403784828840675, "learning_rate": 9.932600144110142e-05, "loss": 1.9974, "step": 6747 }, { "epoch": 0.05247882337936248, "grad_norm": 0.11856643960955572, "learning_rate": 9.932580149171939e-05, "loss": 2.0266, "step": 6748 }, { "epoch": 0.05248660032414306, "grad_norm": 0.11789454860162002, "learning_rate": 9.932560151288445e-05, "loss": 2.0905, "step": 6749 }, { "epoch": 0.05249437726892364, "grad_norm": 0.1326361508034548, "learning_rate": 9.93254015045967e-05, "loss": 2.0423, "step": 6750 }, { "epoch": 0.05250215421370422, "grad_norm": 0.1338442350706483, "learning_rate": 9.932520146685626e-05, "loss": 2.06, "step": 6751 }, { "epoch": 0.052509931158484804, "grad_norm": 0.12702160993613315, "learning_rate": 9.932500139966328e-05, "loss": 2.0312, "step": 6752 }, { "epoch": 0.052517708103265386, "grad_norm": 0.11976376676241215, "learning_rate": 9.932480130301785e-05, "loss": 2.0927, "step": 6753 }, { "epoch": 0.05252548504804597, "grad_norm": 0.12112519386860642, "learning_rate": 9.932460117692008e-05, "loss": 2.0814, "step": 6754 }, { "epoch": 0.05253326199282655, "grad_norm": 0.11612961732612137, "learning_rate": 9.932440102137011e-05, "loss": 2.0784, "step": 6755 }, { "epoch": 0.05254103893760713, "grad_norm": 0.12909385476936458, "learning_rate": 9.932420083636807e-05, "loss": 2.053, "step": 6756 }, { "epoch": 0.05254881588238771, "grad_norm": 0.11943632887895657, "learning_rate": 9.932400062191403e-05, "loss": 2.0502, "step": 6757 }, { "epoch": 0.05255659282716829, "grad_norm": 0.12162743089263188, "learning_rate": 9.932380037800817e-05, "loss": 2.069, "step": 6758 }, { "epoch": 0.05256436977194887, "grad_norm": 0.11917798184223759, "learning_rate": 9.932360010465058e-05, "loss": 2.058, "step": 6759 }, { "epoch": 0.052572146716729454, "grad_norm": 0.13115506374024813, "learning_rate": 9.932339980184137e-05, "loss": 2.051, "step": 6760 }, { "epoch": 0.052579923661510035, "grad_norm": 0.1158428108695475, "learning_rate": 9.932319946958069e-05, "loss": 2.0282, "step": 6761 }, { "epoch": 0.052587700606290616, "grad_norm": 0.16148331599562615, "learning_rate": 9.932299910786862e-05, "loss": 2.019, "step": 6762 }, { "epoch": 0.0525954775510712, "grad_norm": 0.12824730003061946, "learning_rate": 9.93227987167053e-05, "loss": 2.0382, "step": 6763 }, { "epoch": 0.05260325449585178, "grad_norm": 0.12964089020964306, "learning_rate": 9.932259829609089e-05, "loss": 2.08, "step": 6764 }, { "epoch": 0.05261103144063236, "grad_norm": 0.1216658976144267, "learning_rate": 9.932239784602543e-05, "loss": 2.0449, "step": 6765 }, { "epoch": 0.05261880838541294, "grad_norm": 0.11626494337992142, "learning_rate": 9.93221973665091e-05, "loss": 2.0284, "step": 6766 }, { "epoch": 0.05262658533019352, "grad_norm": 0.12765720779754897, "learning_rate": 9.9321996857542e-05, "loss": 2.0503, "step": 6767 }, { "epoch": 0.0526343622749741, "grad_norm": 0.11533810397544746, "learning_rate": 9.932179631912424e-05, "loss": 2.1312, "step": 6768 }, { "epoch": 0.052642139219754684, "grad_norm": 0.11812036551518686, "learning_rate": 9.932159575125596e-05, "loss": 2.0574, "step": 6769 }, { "epoch": 0.052649916164535265, "grad_norm": 0.11970137977614556, "learning_rate": 9.932139515393726e-05, "loss": 2.0811, "step": 6770 }, { "epoch": 0.052657693109315847, "grad_norm": 0.12176163150216147, "learning_rate": 9.932119452716828e-05, "loss": 2.0408, "step": 6771 }, { "epoch": 0.05266547005409643, "grad_norm": 0.13481712024105796, "learning_rate": 9.932099387094911e-05, "loss": 2.1295, "step": 6772 }, { "epoch": 0.05267324699887701, "grad_norm": 0.12475815502946476, "learning_rate": 9.932079318527991e-05, "loss": 2.0501, "step": 6773 }, { "epoch": 0.05268102394365759, "grad_norm": 0.11613304488821263, "learning_rate": 9.932059247016077e-05, "loss": 2.0573, "step": 6774 }, { "epoch": 0.05268880088843817, "grad_norm": 0.11709245234320124, "learning_rate": 9.932039172559181e-05, "loss": 2.0329, "step": 6775 }, { "epoch": 0.05269657783321875, "grad_norm": 0.11955436271347368, "learning_rate": 9.932019095157316e-05, "loss": 1.9945, "step": 6776 }, { "epoch": 0.052704354777999333, "grad_norm": 0.13408316428017705, "learning_rate": 9.931999014810496e-05, "loss": 2.0233, "step": 6777 }, { "epoch": 0.052712131722779915, "grad_norm": 0.1394704706842766, "learning_rate": 9.931978931518728e-05, "loss": 2.0879, "step": 6778 }, { "epoch": 0.052719908667560496, "grad_norm": 0.13842508574610496, "learning_rate": 9.931958845282027e-05, "loss": 2.048, "step": 6779 }, { "epoch": 0.05272768561234108, "grad_norm": 0.12691198610133175, "learning_rate": 9.931938756100407e-05, "loss": 2.0915, "step": 6780 }, { "epoch": 0.05273546255712166, "grad_norm": 0.13413664327578378, "learning_rate": 9.931918663973875e-05, "loss": 2.0644, "step": 6781 }, { "epoch": 0.05274323950190224, "grad_norm": 0.12165324375070954, "learning_rate": 9.931898568902449e-05, "loss": 2.0542, "step": 6782 }, { "epoch": 0.05275101644668282, "grad_norm": 0.11464503527947448, "learning_rate": 9.931878470886135e-05, "loss": 2.0319, "step": 6783 }, { "epoch": 0.0527587933914634, "grad_norm": 0.13668989365589232, "learning_rate": 9.931858369924948e-05, "loss": 2.0785, "step": 6784 }, { "epoch": 0.05276657033624398, "grad_norm": 0.1665578352433923, "learning_rate": 9.931838266018903e-05, "loss": 1.9805, "step": 6785 }, { "epoch": 0.052774347281024564, "grad_norm": 0.18311171580409213, "learning_rate": 9.931818159168005e-05, "loss": 2.1185, "step": 6786 }, { "epoch": 0.052782124225805145, "grad_norm": 0.16918587059355988, "learning_rate": 9.931798049372272e-05, "loss": 2.1255, "step": 6787 }, { "epoch": 0.052789901170585726, "grad_norm": 0.1444702895116412, "learning_rate": 9.931777936631713e-05, "loss": 2.042, "step": 6788 }, { "epoch": 0.05279767811536631, "grad_norm": 0.11968829508138701, "learning_rate": 9.931757820946341e-05, "loss": 2.062, "step": 6789 }, { "epoch": 0.05280545506014689, "grad_norm": 0.3966905744621951, "learning_rate": 9.931737702316166e-05, "loss": 2.064, "step": 6790 }, { "epoch": 0.05281323200492747, "grad_norm": 0.20364964545837896, "learning_rate": 9.931717580741204e-05, "loss": 2.0492, "step": 6791 }, { "epoch": 0.05282100894970805, "grad_norm": 0.18042438424042515, "learning_rate": 9.931697456221464e-05, "loss": 2.0519, "step": 6792 }, { "epoch": 0.05282878589448863, "grad_norm": 0.1179247191547266, "learning_rate": 9.931677328756958e-05, "loss": 2.1267, "step": 6793 }, { "epoch": 0.05283656283926921, "grad_norm": 0.3449978208011738, "learning_rate": 9.931657198347702e-05, "loss": 2.0612, "step": 6794 }, { "epoch": 0.052844339784049794, "grad_norm": 0.20644431548568046, "learning_rate": 9.931637064993702e-05, "loss": 2.0533, "step": 6795 }, { "epoch": 0.052852116728830376, "grad_norm": 0.1933442339106175, "learning_rate": 9.931616928694973e-05, "loss": 2.0603, "step": 6796 }, { "epoch": 0.05285989367361096, "grad_norm": 0.5126083555960345, "learning_rate": 9.931596789451529e-05, "loss": 2.133, "step": 6797 }, { "epoch": 0.05286767061839154, "grad_norm": 0.16460907022710722, "learning_rate": 9.931576647263378e-05, "loss": 2.0406, "step": 6798 }, { "epoch": 0.05287544756317212, "grad_norm": 0.20402638083924432, "learning_rate": 9.931556502130534e-05, "loss": 2.009, "step": 6799 }, { "epoch": 0.05288322450795271, "grad_norm": 0.20557338347293871, "learning_rate": 9.93153635405301e-05, "loss": 2.0681, "step": 6800 }, { "epoch": 0.05289100145273329, "grad_norm": 0.16447434949850936, "learning_rate": 9.931516203030818e-05, "loss": 2.0493, "step": 6801 }, { "epoch": 0.05289877839751387, "grad_norm": 0.1279957887261404, "learning_rate": 9.931496049063968e-05, "loss": 2.0014, "step": 6802 }, { "epoch": 0.05290655534229445, "grad_norm": 0.13983914226848437, "learning_rate": 9.931475892152474e-05, "loss": 2.0765, "step": 6803 }, { "epoch": 0.05291433228707503, "grad_norm": 0.1313888850630484, "learning_rate": 9.931455732296345e-05, "loss": 2.058, "step": 6804 }, { "epoch": 0.05292210923185561, "grad_norm": 0.1313173388406565, "learning_rate": 9.931435569495596e-05, "loss": 2.0866, "step": 6805 }, { "epoch": 0.052929886176636194, "grad_norm": 0.1747466866481294, "learning_rate": 9.931415403750238e-05, "loss": 2.107, "step": 6806 }, { "epoch": 0.052937663121416775, "grad_norm": 0.17571248977079187, "learning_rate": 9.931395235060284e-05, "loss": 2.0612, "step": 6807 }, { "epoch": 0.052945440066197356, "grad_norm": 0.15428680679425527, "learning_rate": 9.931375063425745e-05, "loss": 2.0537, "step": 6808 }, { "epoch": 0.05295321701097794, "grad_norm": 0.1370991335929814, "learning_rate": 9.931354888846634e-05, "loss": 2.0497, "step": 6809 }, { "epoch": 0.05296099395575852, "grad_norm": 0.13164941536760105, "learning_rate": 9.931334711322961e-05, "loss": 2.0894, "step": 6810 }, { "epoch": 0.0529687709005391, "grad_norm": 0.1560256019430162, "learning_rate": 9.931314530854741e-05, "loss": 2.0642, "step": 6811 }, { "epoch": 0.05297654784531968, "grad_norm": 0.13271427142967204, "learning_rate": 9.931294347441983e-05, "loss": 2.0263, "step": 6812 }, { "epoch": 0.05298432479010026, "grad_norm": 0.1265853379846405, "learning_rate": 9.931274161084703e-05, "loss": 2.0274, "step": 6813 }, { "epoch": 0.05299210173488084, "grad_norm": 0.1960450367220802, "learning_rate": 9.931253971782909e-05, "loss": 2.0442, "step": 6814 }, { "epoch": 0.052999878679661425, "grad_norm": 0.29460311832883057, "learning_rate": 9.931233779536616e-05, "loss": 2.08, "step": 6815 }, { "epoch": 0.053007655624442006, "grad_norm": 1.7560898635526225, "learning_rate": 9.931213584345832e-05, "loss": 2.0772, "step": 6816 }, { "epoch": 0.05301543256922259, "grad_norm": 0.28528733424744823, "learning_rate": 9.931193386210573e-05, "loss": 2.0359, "step": 6817 }, { "epoch": 0.05302320951400317, "grad_norm": 0.8301862620060425, "learning_rate": 9.93117318513085e-05, "loss": 2.0312, "step": 6818 }, { "epoch": 0.05303098645878375, "grad_norm": 0.5295494870814491, "learning_rate": 9.931152981106675e-05, "loss": 2.0358, "step": 6819 }, { "epoch": 0.05303876340356433, "grad_norm": 0.2532367554948344, "learning_rate": 9.931132774138061e-05, "loss": 2.0929, "step": 6820 }, { "epoch": 0.05304654034834491, "grad_norm": 0.8524765086914342, "learning_rate": 9.931112564225018e-05, "loss": 2.0981, "step": 6821 }, { "epoch": 0.05305431729312549, "grad_norm": 0.5579484526216707, "learning_rate": 9.931092351367559e-05, "loss": 2.1191, "step": 6822 }, { "epoch": 0.053062094237906074, "grad_norm": 0.433161584450894, "learning_rate": 9.931072135565695e-05, "loss": 2.0846, "step": 6823 }, { "epoch": 0.053069871182686655, "grad_norm": 1.0260418117941126, "learning_rate": 9.931051916819442e-05, "loss": 2.0958, "step": 6824 }, { "epoch": 0.053077648127467236, "grad_norm": 0.5676489448554743, "learning_rate": 9.931031695128807e-05, "loss": 2.0511, "step": 6825 }, { "epoch": 0.05308542507224782, "grad_norm": 0.2508084288371093, "learning_rate": 9.931011470493805e-05, "loss": 2.0752, "step": 6826 }, { "epoch": 0.0530932020170284, "grad_norm": 1.9803858402732302, "learning_rate": 9.93099124291445e-05, "loss": 2.0518, "step": 6827 }, { "epoch": 0.05310097896180898, "grad_norm": 0.7537668759365177, "learning_rate": 9.930971012390748e-05, "loss": 2.0901, "step": 6828 }, { "epoch": 0.05310875590658956, "grad_norm": 0.46757757325171084, "learning_rate": 9.930950778922716e-05, "loss": 2.0302, "step": 6829 }, { "epoch": 0.05311653285137014, "grad_norm": 2.4088846811968163, "learning_rate": 9.930930542510363e-05, "loss": 2.1125, "step": 6830 }, { "epoch": 0.05312430979615072, "grad_norm": 0.49560153137078844, "learning_rate": 9.930910303153706e-05, "loss": 2.098, "step": 6831 }, { "epoch": 0.053132086740931304, "grad_norm": 0.3801683503818333, "learning_rate": 9.930890060852752e-05, "loss": 2.0654, "step": 6832 }, { "epoch": 0.053139863685711886, "grad_norm": 1.7016459960696835, "learning_rate": 9.930869815607515e-05, "loss": 2.0706, "step": 6833 }, { "epoch": 0.05314764063049247, "grad_norm": 0.4348178026375259, "learning_rate": 9.930849567418008e-05, "loss": 2.0541, "step": 6834 }, { "epoch": 0.05315541757527305, "grad_norm": 0.2955145717080175, "learning_rate": 9.930829316284241e-05, "loss": 2.0393, "step": 6835 }, { "epoch": 0.05316319452005363, "grad_norm": 0.8791591786080558, "learning_rate": 9.930809062206228e-05, "loss": 2.0931, "step": 6836 }, { "epoch": 0.05317097146483421, "grad_norm": 0.8175317694927176, "learning_rate": 9.930788805183979e-05, "loss": 2.0914, "step": 6837 }, { "epoch": 0.05317874840961479, "grad_norm": 0.3241350214291921, "learning_rate": 9.93076854521751e-05, "loss": 2.0583, "step": 6838 }, { "epoch": 0.05318652535439537, "grad_norm": 1.0926372074424773, "learning_rate": 9.930748282306827e-05, "loss": 2.0841, "step": 6839 }, { "epoch": 0.053194302299175954, "grad_norm": 0.8525029957013458, "learning_rate": 9.930728016451949e-05, "loss": 2.0844, "step": 6840 }, { "epoch": 0.053202079243956535, "grad_norm": 0.3057978970940213, "learning_rate": 9.930707747652883e-05, "loss": 2.0606, "step": 6841 }, { "epoch": 0.053209856188737116, "grad_norm": 1.2337105988187114, "learning_rate": 9.930687475909643e-05, "loss": 2.1714, "step": 6842 }, { "epoch": 0.0532176331335177, "grad_norm": 0.47117865158542815, "learning_rate": 9.930667201222241e-05, "loss": 2.1326, "step": 6843 }, { "epoch": 0.05322541007829828, "grad_norm": 0.34201502325331634, "learning_rate": 9.93064692359069e-05, "loss": 2.095, "step": 6844 }, { "epoch": 0.05323318702307886, "grad_norm": 1.720412218346394, "learning_rate": 9.930626643015e-05, "loss": 2.1516, "step": 6845 }, { "epoch": 0.05324096396785944, "grad_norm": 0.6165792816349641, "learning_rate": 9.930606359495184e-05, "loss": 2.1234, "step": 6846 }, { "epoch": 0.05324874091264002, "grad_norm": 0.36448306175642087, "learning_rate": 9.930586073031256e-05, "loss": 2.0744, "step": 6847 }, { "epoch": 0.0532565178574206, "grad_norm": 1.545268724911451, "learning_rate": 9.930565783623225e-05, "loss": 2.0941, "step": 6848 }, { "epoch": 0.053264294802201184, "grad_norm": 0.7211104518716848, "learning_rate": 9.930545491271104e-05, "loss": 2.075, "step": 6849 }, { "epoch": 0.053272071746981765, "grad_norm": 0.508614381347211, "learning_rate": 9.930525195974907e-05, "loss": 2.1197, "step": 6850 }, { "epoch": 0.053279848691762347, "grad_norm": 0.2507322786198837, "learning_rate": 9.930504897734644e-05, "loss": 2.0855, "step": 6851 }, { "epoch": 0.05328762563654293, "grad_norm": 0.3824290336927444, "learning_rate": 9.930484596550329e-05, "loss": 2.0683, "step": 6852 }, { "epoch": 0.05329540258132351, "grad_norm": 0.4008662510729157, "learning_rate": 9.930464292421973e-05, "loss": 2.1014, "step": 6853 }, { "epoch": 0.05330317952610409, "grad_norm": 0.41251232283121164, "learning_rate": 9.930443985349587e-05, "loss": 2.0527, "step": 6854 }, { "epoch": 0.05331095647088467, "grad_norm": 0.23552498584830298, "learning_rate": 9.930423675333185e-05, "loss": 2.1221, "step": 6855 }, { "epoch": 0.05331873341566525, "grad_norm": 0.2103583342108985, "learning_rate": 9.930403362372778e-05, "loss": 2.0735, "step": 6856 }, { "epoch": 0.053326510360445833, "grad_norm": 0.21789997762077873, "learning_rate": 9.93038304646838e-05, "loss": 2.1045, "step": 6857 }, { "epoch": 0.053334287305226415, "grad_norm": 0.2085036553695476, "learning_rate": 9.930362727620001e-05, "loss": 2.0929, "step": 6858 }, { "epoch": 0.053342064250006996, "grad_norm": 0.26438568825666425, "learning_rate": 9.930342405827653e-05, "loss": 2.0743, "step": 6859 }, { "epoch": 0.053349841194787584, "grad_norm": 0.3848454102630374, "learning_rate": 9.93032208109135e-05, "loss": 2.0019, "step": 6860 }, { "epoch": 0.053357618139568165, "grad_norm": 1.0235724762456837, "learning_rate": 9.930301753411103e-05, "loss": 2.0534, "step": 6861 }, { "epoch": 0.053365395084348746, "grad_norm": 0.9642920678497434, "learning_rate": 9.930281422786923e-05, "loss": 2.1151, "step": 6862 }, { "epoch": 0.05337317202912933, "grad_norm": 0.4854912182020143, "learning_rate": 9.930261089218824e-05, "loss": 2.0951, "step": 6863 }, { "epoch": 0.05338094897390991, "grad_norm": 1.0596088008945086, "learning_rate": 9.930240752706818e-05, "loss": 2.1324, "step": 6864 }, { "epoch": 0.05338872591869049, "grad_norm": 0.22093426165689092, "learning_rate": 9.930220413250917e-05, "loss": 2.0767, "step": 6865 }, { "epoch": 0.05339650286347107, "grad_norm": 0.5029978271864735, "learning_rate": 9.930200070851133e-05, "loss": 2.0325, "step": 6866 }, { "epoch": 0.05340427980825165, "grad_norm": 0.3980934486897264, "learning_rate": 9.930179725507477e-05, "loss": 2.0298, "step": 6867 }, { "epoch": 0.05341205675303223, "grad_norm": 0.4246891929872024, "learning_rate": 9.930159377219962e-05, "loss": 2.0979, "step": 6868 }, { "epoch": 0.053419833697812814, "grad_norm": 0.6062083757094576, "learning_rate": 9.9301390259886e-05, "loss": 2.0748, "step": 6869 }, { "epoch": 0.053427610642593396, "grad_norm": 0.3854859746754343, "learning_rate": 9.930118671813404e-05, "loss": 2.1014, "step": 6870 }, { "epoch": 0.05343538758737398, "grad_norm": 0.19856130791898946, "learning_rate": 9.930098314694388e-05, "loss": 2.0357, "step": 6871 }, { "epoch": 0.05344316453215456, "grad_norm": 0.3981060055408255, "learning_rate": 9.930077954631558e-05, "loss": 2.1161, "step": 6872 }, { "epoch": 0.05345094147693514, "grad_norm": 0.813612110763565, "learning_rate": 9.93005759162493e-05, "loss": 2.1032, "step": 6873 }, { "epoch": 0.05345871842171572, "grad_norm": 0.8331107087486521, "learning_rate": 9.930037225674518e-05, "loss": 2.0901, "step": 6874 }, { "epoch": 0.0534664953664963, "grad_norm": 0.37517719993283555, "learning_rate": 9.930016856780333e-05, "loss": 2.0071, "step": 6875 }, { "epoch": 0.05347427231127688, "grad_norm": 0.467807716385848, "learning_rate": 9.929996484942383e-05, "loss": 2.0541, "step": 6876 }, { "epoch": 0.053482049256057464, "grad_norm": 0.38617661108297274, "learning_rate": 9.929976110160685e-05, "loss": 2.113, "step": 6877 }, { "epoch": 0.053489826200838045, "grad_norm": 0.25142245435090665, "learning_rate": 9.929955732435252e-05, "loss": 2.0499, "step": 6878 }, { "epoch": 0.053497603145618626, "grad_norm": 0.5225133826885677, "learning_rate": 9.92993535176609e-05, "loss": 2.1021, "step": 6879 }, { "epoch": 0.05350538009039921, "grad_norm": 0.19441379148800156, "learning_rate": 9.929914968153218e-05, "loss": 2.1223, "step": 6880 }, { "epoch": 0.05351315703517979, "grad_norm": 0.2496023243317786, "learning_rate": 9.929894581596644e-05, "loss": 2.0173, "step": 6881 }, { "epoch": 0.05352093397996037, "grad_norm": 0.22682385480845643, "learning_rate": 9.929874192096382e-05, "loss": 2.0718, "step": 6882 }, { "epoch": 0.05352871092474095, "grad_norm": 0.19380137603858258, "learning_rate": 9.929853799652443e-05, "loss": 2.0728, "step": 6883 }, { "epoch": 0.05353648786952153, "grad_norm": 0.1590254553662043, "learning_rate": 9.92983340426484e-05, "loss": 2.1118, "step": 6884 }, { "epoch": 0.05354426481430211, "grad_norm": 0.19821850195511054, "learning_rate": 9.929813005933584e-05, "loss": 2.0722, "step": 6885 }, { "epoch": 0.053552041759082694, "grad_norm": 0.16069133868377056, "learning_rate": 9.92979260465869e-05, "loss": 2.0625, "step": 6886 }, { "epoch": 0.053559818703863275, "grad_norm": 0.133029393558659, "learning_rate": 9.929772200440167e-05, "loss": 2.0706, "step": 6887 }, { "epoch": 0.053567595648643856, "grad_norm": 0.1630503696975849, "learning_rate": 9.929751793278028e-05, "loss": 2.0717, "step": 6888 }, { "epoch": 0.05357537259342444, "grad_norm": 0.15375687973465285, "learning_rate": 9.929731383172287e-05, "loss": 2.0577, "step": 6889 }, { "epoch": 0.05358314953820502, "grad_norm": 0.14326281725051165, "learning_rate": 9.929710970122954e-05, "loss": 2.1408, "step": 6890 }, { "epoch": 0.0535909264829856, "grad_norm": 0.1540475064588192, "learning_rate": 9.929690554130041e-05, "loss": 2.1398, "step": 6891 }, { "epoch": 0.05359870342776618, "grad_norm": 0.13782999552251363, "learning_rate": 9.929670135193563e-05, "loss": 2.0885, "step": 6892 }, { "epoch": 0.05360648037254676, "grad_norm": 0.13977951349021403, "learning_rate": 9.92964971331353e-05, "loss": 2.0921, "step": 6893 }, { "epoch": 0.05361425731732734, "grad_norm": 0.13317403729643887, "learning_rate": 9.929629288489954e-05, "loss": 2.1514, "step": 6894 }, { "epoch": 0.053622034262107925, "grad_norm": 0.1371620550009934, "learning_rate": 9.929608860722848e-05, "loss": 2.1052, "step": 6895 }, { "epoch": 0.053629811206888506, "grad_norm": 0.15063299041940578, "learning_rate": 9.929588430012223e-05, "loss": 2.1089, "step": 6896 }, { "epoch": 0.05363758815166909, "grad_norm": 0.12016641610171791, "learning_rate": 9.929567996358093e-05, "loss": 1.9973, "step": 6897 }, { "epoch": 0.05364536509644967, "grad_norm": 0.14120420437050468, "learning_rate": 9.92954755976047e-05, "loss": 2.0406, "step": 6898 }, { "epoch": 0.05365314204123025, "grad_norm": 0.12329304157914817, "learning_rate": 9.929527120219365e-05, "loss": 2.0535, "step": 6899 }, { "epoch": 0.05366091898601083, "grad_norm": 0.12927850917012948, "learning_rate": 9.92950667773479e-05, "loss": 2.0526, "step": 6900 }, { "epoch": 0.05366869593079141, "grad_norm": 0.12930484775094386, "learning_rate": 9.92948623230676e-05, "loss": 2.0347, "step": 6901 }, { "epoch": 0.05367647287557199, "grad_norm": 0.11808751098899176, "learning_rate": 9.929465783935283e-05, "loss": 2.0166, "step": 6902 }, { "epoch": 0.053684249820352574, "grad_norm": 0.1437432249850876, "learning_rate": 9.929445332620375e-05, "loss": 2.0849, "step": 6903 }, { "epoch": 0.053692026765133155, "grad_norm": 0.1460383465210679, "learning_rate": 9.929424878362046e-05, "loss": 2.0103, "step": 6904 }, { "epoch": 0.053699803709913736, "grad_norm": 0.1153834621532351, "learning_rate": 9.929404421160309e-05, "loss": 2.0972, "step": 6905 }, { "epoch": 0.05370758065469432, "grad_norm": 0.13710842212943178, "learning_rate": 9.929383961015176e-05, "loss": 1.996, "step": 6906 }, { "epoch": 0.0537153575994749, "grad_norm": 0.12314654489366876, "learning_rate": 9.929363497926661e-05, "loss": 2.0891, "step": 6907 }, { "epoch": 0.05372313454425548, "grad_norm": 0.12978974782912228, "learning_rate": 9.929343031894771e-05, "loss": 2.0737, "step": 6908 }, { "epoch": 0.05373091148903606, "grad_norm": 0.13899456528921011, "learning_rate": 9.929322562919524e-05, "loss": 2.0342, "step": 6909 }, { "epoch": 0.05373868843381664, "grad_norm": 0.11736526536201443, "learning_rate": 9.929302091000929e-05, "loss": 2.0843, "step": 6910 }, { "epoch": 0.05374646537859722, "grad_norm": 0.1229912807771812, "learning_rate": 9.929281616139e-05, "loss": 1.9915, "step": 6911 }, { "epoch": 0.053754242323377804, "grad_norm": 0.11740066475377087, "learning_rate": 9.929261138333748e-05, "loss": 2.1043, "step": 6912 }, { "epoch": 0.053762019268158386, "grad_norm": 0.12151656146964972, "learning_rate": 9.929240657585185e-05, "loss": 2.0785, "step": 6913 }, { "epoch": 0.05376979621293897, "grad_norm": 0.1166261563458835, "learning_rate": 9.929220173893325e-05, "loss": 2.0545, "step": 6914 }, { "epoch": 0.05377757315771955, "grad_norm": 0.12112345281435609, "learning_rate": 9.929199687258178e-05, "loss": 2.0203, "step": 6915 }, { "epoch": 0.05378535010250013, "grad_norm": 0.11248944655445631, "learning_rate": 9.929179197679759e-05, "loss": 2.0779, "step": 6916 }, { "epoch": 0.05379312704728071, "grad_norm": 0.12362967715998963, "learning_rate": 9.929158705158076e-05, "loss": 2.0797, "step": 6917 }, { "epoch": 0.05380090399206129, "grad_norm": 0.10919168357169835, "learning_rate": 9.929138209693146e-05, "loss": 2.0539, "step": 6918 }, { "epoch": 0.05380868093684188, "grad_norm": 0.11187443803926252, "learning_rate": 9.929117711284977e-05, "loss": 2.0938, "step": 6919 }, { "epoch": 0.05381645788162246, "grad_norm": 0.12651489825231402, "learning_rate": 9.929097209933585e-05, "loss": 2.0682, "step": 6920 }, { "epoch": 0.05382423482640304, "grad_norm": 0.11924843186059181, "learning_rate": 9.92907670563898e-05, "loss": 2.0481, "step": 6921 }, { "epoch": 0.05383201177118362, "grad_norm": 0.1153939109036651, "learning_rate": 9.929056198401174e-05, "loss": 2.0405, "step": 6922 }, { "epoch": 0.053839788715964204, "grad_norm": 0.11999319789229276, "learning_rate": 9.92903568822018e-05, "loss": 2.0487, "step": 6923 }, { "epoch": 0.053847565660744785, "grad_norm": 0.11946328880304018, "learning_rate": 9.929015175096013e-05, "loss": 2.0155, "step": 6924 }, { "epoch": 0.053855342605525366, "grad_norm": 0.11430116512060057, "learning_rate": 9.928994659028679e-05, "loss": 2.0753, "step": 6925 }, { "epoch": 0.05386311955030595, "grad_norm": 0.11268662483986777, "learning_rate": 9.928974140018195e-05, "loss": 2.0272, "step": 6926 }, { "epoch": 0.05387089649508653, "grad_norm": 0.11070219721174766, "learning_rate": 9.928953618064572e-05, "loss": 2.0216, "step": 6927 }, { "epoch": 0.05387867343986711, "grad_norm": 0.11314213611218875, "learning_rate": 9.928933093167822e-05, "loss": 2.0517, "step": 6928 }, { "epoch": 0.05388645038464769, "grad_norm": 0.13535308279771913, "learning_rate": 9.928912565327957e-05, "loss": 2.1247, "step": 6929 }, { "epoch": 0.05389422732942827, "grad_norm": 0.11440263614637604, "learning_rate": 9.928892034544992e-05, "loss": 2.0822, "step": 6930 }, { "epoch": 0.05390200427420885, "grad_norm": 0.11860663547088808, "learning_rate": 9.928871500818935e-05, "loss": 2.0908, "step": 6931 }, { "epoch": 0.053909781218989435, "grad_norm": 0.11507663725703177, "learning_rate": 9.9288509641498e-05, "loss": 2.0409, "step": 6932 }, { "epoch": 0.053917558163770016, "grad_norm": 0.11402412672922276, "learning_rate": 9.9288304245376e-05, "loss": 2.0943, "step": 6933 }, { "epoch": 0.0539253351085506, "grad_norm": 0.12332074424720413, "learning_rate": 9.928809881982349e-05, "loss": 2.0538, "step": 6934 }, { "epoch": 0.05393311205333118, "grad_norm": 0.115924872935478, "learning_rate": 9.928789336484055e-05, "loss": 2.046, "step": 6935 }, { "epoch": 0.05394088899811176, "grad_norm": 0.1149709747300878, "learning_rate": 9.928768788042732e-05, "loss": 2.0156, "step": 6936 }, { "epoch": 0.05394866594289234, "grad_norm": 0.1204253410390066, "learning_rate": 9.928748236658393e-05, "loss": 2.0362, "step": 6937 }, { "epoch": 0.05395644288767292, "grad_norm": 0.11201566241412884, "learning_rate": 9.928727682331049e-05, "loss": 2.0621, "step": 6938 }, { "epoch": 0.0539642198324535, "grad_norm": 0.1293426907924533, "learning_rate": 9.928707125060714e-05, "loss": 2.0304, "step": 6939 }, { "epoch": 0.053971996777234084, "grad_norm": 0.11672553801284923, "learning_rate": 9.9286865648474e-05, "loss": 2.0972, "step": 6940 }, { "epoch": 0.053979773722014665, "grad_norm": 0.11223613819666438, "learning_rate": 9.928666001691119e-05, "loss": 2.0945, "step": 6941 }, { "epoch": 0.053987550666795246, "grad_norm": 0.11658758010140169, "learning_rate": 9.928645435591881e-05, "loss": 2.0255, "step": 6942 }, { "epoch": 0.05399532761157583, "grad_norm": 0.11556128330212921, "learning_rate": 9.928624866549702e-05, "loss": 2.1057, "step": 6943 }, { "epoch": 0.05400310455635641, "grad_norm": 0.12197963834298066, "learning_rate": 9.928604294564592e-05, "loss": 2.0779, "step": 6944 }, { "epoch": 0.05401088150113699, "grad_norm": 0.11122251959651637, "learning_rate": 9.928583719636564e-05, "loss": 2.0216, "step": 6945 }, { "epoch": 0.05401865844591757, "grad_norm": 0.125660093231822, "learning_rate": 9.92856314176563e-05, "loss": 2.083, "step": 6946 }, { "epoch": 0.05402643539069815, "grad_norm": 0.13036253494640035, "learning_rate": 9.928542560951802e-05, "loss": 2.0637, "step": 6947 }, { "epoch": 0.05403421233547873, "grad_norm": 0.10814231982718679, "learning_rate": 9.928521977195092e-05, "loss": 2.086, "step": 6948 }, { "epoch": 0.054041989280259314, "grad_norm": 0.1257604742902358, "learning_rate": 9.928501390495515e-05, "loss": 2.0821, "step": 6949 }, { "epoch": 0.054049766225039896, "grad_norm": 0.12378322039442516, "learning_rate": 9.928480800853079e-05, "loss": 2.018, "step": 6950 }, { "epoch": 0.05405754316982048, "grad_norm": 0.11177223494727319, "learning_rate": 9.9284602082678e-05, "loss": 2.0506, "step": 6951 }, { "epoch": 0.05406532011460106, "grad_norm": 0.12966878305738772, "learning_rate": 9.928439612739688e-05, "loss": 2.1053, "step": 6952 }, { "epoch": 0.05407309705938164, "grad_norm": 0.12766466198613005, "learning_rate": 9.928419014268757e-05, "loss": 2.064, "step": 6953 }, { "epoch": 0.05408087400416222, "grad_norm": 0.12321937610862223, "learning_rate": 9.928398412855018e-05, "loss": 2.0309, "step": 6954 }, { "epoch": 0.0540886509489428, "grad_norm": 0.13854025592479502, "learning_rate": 9.928377808498485e-05, "loss": 2.0465, "step": 6955 }, { "epoch": 0.05409642789372338, "grad_norm": 0.11305766883253195, "learning_rate": 9.928357201199168e-05, "loss": 2.0557, "step": 6956 }, { "epoch": 0.054104204838503964, "grad_norm": 0.12690584659812285, "learning_rate": 9.92833659095708e-05, "loss": 2.0913, "step": 6957 }, { "epoch": 0.054111981783284545, "grad_norm": 0.12140944420324426, "learning_rate": 9.928315977772233e-05, "loss": 2.0886, "step": 6958 }, { "epoch": 0.054119758728065126, "grad_norm": 0.11481487241028557, "learning_rate": 9.928295361644642e-05, "loss": 2.0459, "step": 6959 }, { "epoch": 0.05412753567284571, "grad_norm": 0.12053637610623176, "learning_rate": 9.928274742574317e-05, "loss": 2.0758, "step": 6960 }, { "epoch": 0.05413531261762629, "grad_norm": 0.11583387216820072, "learning_rate": 9.928254120561269e-05, "loss": 1.9948, "step": 6961 }, { "epoch": 0.05414308956240687, "grad_norm": 0.12522459755030327, "learning_rate": 9.928233495605512e-05, "loss": 2.0043, "step": 6962 }, { "epoch": 0.05415086650718745, "grad_norm": 0.11118546059067616, "learning_rate": 9.92821286770706e-05, "loss": 2.0485, "step": 6963 }, { "epoch": 0.05415864345196803, "grad_norm": 0.11314675173056428, "learning_rate": 9.928192236865922e-05, "loss": 2.1121, "step": 6964 }, { "epoch": 0.05416642039674861, "grad_norm": 0.13608840470769876, "learning_rate": 9.928171603082113e-05, "loss": 2.0673, "step": 6965 }, { "epoch": 0.054174197341529194, "grad_norm": 0.11240658210117214, "learning_rate": 9.928150966355642e-05, "loss": 2.0591, "step": 6966 }, { "epoch": 0.054181974286309775, "grad_norm": 0.11804182021127602, "learning_rate": 9.928130326686526e-05, "loss": 2.0658, "step": 6967 }, { "epoch": 0.054189751231090356, "grad_norm": 0.11311868262181986, "learning_rate": 9.928109684074773e-05, "loss": 2.0512, "step": 6968 }, { "epoch": 0.05419752817587094, "grad_norm": 0.11968506624437833, "learning_rate": 9.928089038520399e-05, "loss": 2.0778, "step": 6969 }, { "epoch": 0.05420530512065152, "grad_norm": 0.13933932504454175, "learning_rate": 9.928068390023413e-05, "loss": 1.9972, "step": 6970 }, { "epoch": 0.0542130820654321, "grad_norm": 0.122554657544002, "learning_rate": 9.92804773858383e-05, "loss": 2.0497, "step": 6971 }, { "epoch": 0.05422085901021268, "grad_norm": 0.12819555263488883, "learning_rate": 9.92802708420166e-05, "loss": 2.0395, "step": 6972 }, { "epoch": 0.05422863595499326, "grad_norm": 0.17946580507069698, "learning_rate": 9.928006426876916e-05, "loss": 2.0587, "step": 6973 }, { "epoch": 0.05423641289977384, "grad_norm": 0.12293201881295375, "learning_rate": 9.927985766609613e-05, "loss": 2.0659, "step": 6974 }, { "epoch": 0.054244189844554425, "grad_norm": 0.13955742245689695, "learning_rate": 9.92796510339976e-05, "loss": 2.054, "step": 6975 }, { "epoch": 0.054251966789335006, "grad_norm": 0.14838969916538702, "learning_rate": 9.927944437247371e-05, "loss": 2.033, "step": 6976 }, { "epoch": 0.05425974373411559, "grad_norm": 0.12645646820762044, "learning_rate": 9.927923768152456e-05, "loss": 2.07, "step": 6977 }, { "epoch": 0.054267520678896175, "grad_norm": 0.168000346775367, "learning_rate": 9.92790309611503e-05, "loss": 2.0736, "step": 6978 }, { "epoch": 0.054275297623676756, "grad_norm": 0.11985358785176391, "learning_rate": 9.927882421135106e-05, "loss": 2.0459, "step": 6979 }, { "epoch": 0.05428307456845734, "grad_norm": 0.14157058368176356, "learning_rate": 9.927861743212694e-05, "loss": 2.0111, "step": 6980 }, { "epoch": 0.05429085151323792, "grad_norm": 0.13556297131554187, "learning_rate": 9.927841062347807e-05, "loss": 2.0517, "step": 6981 }, { "epoch": 0.0542986284580185, "grad_norm": 0.12458402079105085, "learning_rate": 9.927820378540457e-05, "loss": 2.0655, "step": 6982 }, { "epoch": 0.05430640540279908, "grad_norm": 0.141850583104614, "learning_rate": 9.927799691790658e-05, "loss": 2.0717, "step": 6983 }, { "epoch": 0.05431418234757966, "grad_norm": 0.11917274402262916, "learning_rate": 9.927779002098421e-05, "loss": 2.0523, "step": 6984 }, { "epoch": 0.05432195929236024, "grad_norm": 0.11407323172200576, "learning_rate": 9.927758309463757e-05, "loss": 2.0448, "step": 6985 }, { "epoch": 0.054329736237140824, "grad_norm": 0.11492404939019607, "learning_rate": 9.927737613886682e-05, "loss": 1.9887, "step": 6986 }, { "epoch": 0.054337513181921406, "grad_norm": 0.13043911062608876, "learning_rate": 9.927716915367206e-05, "loss": 2.1217, "step": 6987 }, { "epoch": 0.05434529012670199, "grad_norm": 0.11765074656859224, "learning_rate": 9.927696213905341e-05, "loss": 2.0481, "step": 6988 }, { "epoch": 0.05435306707148257, "grad_norm": 0.11805996355603181, "learning_rate": 9.9276755095011e-05, "loss": 2.1134, "step": 6989 }, { "epoch": 0.05436084401626315, "grad_norm": 0.12292339635977677, "learning_rate": 9.927654802154496e-05, "loss": 2.0685, "step": 6990 }, { "epoch": 0.05436862096104373, "grad_norm": 0.11009756024070796, "learning_rate": 9.927634091865541e-05, "loss": 2.0672, "step": 6991 }, { "epoch": 0.05437639790582431, "grad_norm": 0.11107899742513622, "learning_rate": 9.927613378634247e-05, "loss": 2.0821, "step": 6992 }, { "epoch": 0.05438417485060489, "grad_norm": 0.10721811949033727, "learning_rate": 9.927592662460626e-05, "loss": 2.0493, "step": 6993 }, { "epoch": 0.054391951795385474, "grad_norm": 0.11246723966207789, "learning_rate": 9.927571943344689e-05, "loss": 2.0348, "step": 6994 }, { "epoch": 0.054399728740166055, "grad_norm": 0.12203842890734595, "learning_rate": 9.927551221286453e-05, "loss": 2.091, "step": 6995 }, { "epoch": 0.054407505684946636, "grad_norm": 0.11590861559469323, "learning_rate": 9.927530496285926e-05, "loss": 2.1562, "step": 6996 }, { "epoch": 0.05441528262972722, "grad_norm": 0.1146633585514003, "learning_rate": 9.927509768343124e-05, "loss": 2.0048, "step": 6997 }, { "epoch": 0.0544230595745078, "grad_norm": 0.112572212340296, "learning_rate": 9.927489037458055e-05, "loss": 2.0588, "step": 6998 }, { "epoch": 0.05443083651928838, "grad_norm": 0.11599055474834835, "learning_rate": 9.927468303630736e-05, "loss": 2.0933, "step": 6999 }, { "epoch": 0.05443861346406896, "grad_norm": 0.11526472093857734, "learning_rate": 9.927447566861177e-05, "loss": 2.0391, "step": 7000 }, { "epoch": 0.05444639040884954, "grad_norm": 0.1111617616795621, "learning_rate": 9.927426827149389e-05, "loss": 2.0258, "step": 7001 }, { "epoch": 0.05445416735363012, "grad_norm": 0.11391416436901167, "learning_rate": 9.927406084495386e-05, "loss": 2.0948, "step": 7002 }, { "epoch": 0.054461944298410704, "grad_norm": 0.1158195777441804, "learning_rate": 9.92738533889918e-05, "loss": 2.0791, "step": 7003 }, { "epoch": 0.054469721243191285, "grad_norm": 0.1119109046190331, "learning_rate": 9.927364590360786e-05, "loss": 2.0431, "step": 7004 }, { "epoch": 0.054477498187971866, "grad_norm": 0.10947208249117259, "learning_rate": 9.927343838880212e-05, "loss": 2.0092, "step": 7005 }, { "epoch": 0.05448527513275245, "grad_norm": 0.12638126470405137, "learning_rate": 9.927323084457472e-05, "loss": 2.0888, "step": 7006 }, { "epoch": 0.05449305207753303, "grad_norm": 0.1183076208189382, "learning_rate": 9.92730232709258e-05, "loss": 2.1321, "step": 7007 }, { "epoch": 0.05450082902231361, "grad_norm": 0.11130408438526038, "learning_rate": 9.927281566785546e-05, "loss": 2.0064, "step": 7008 }, { "epoch": 0.05450860596709419, "grad_norm": 0.12319755194977168, "learning_rate": 9.927260803536387e-05, "loss": 2.0261, "step": 7009 }, { "epoch": 0.05451638291187477, "grad_norm": 0.11031940579959743, "learning_rate": 9.927240037345109e-05, "loss": 2.0513, "step": 7010 }, { "epoch": 0.05452415985665535, "grad_norm": 0.11871376786912219, "learning_rate": 9.927219268211729e-05, "loss": 2.0229, "step": 7011 }, { "epoch": 0.054531936801435935, "grad_norm": 0.11579636006963623, "learning_rate": 9.927198496136256e-05, "loss": 2.065, "step": 7012 }, { "epoch": 0.054539713746216516, "grad_norm": 0.11616047933057462, "learning_rate": 9.927177721118706e-05, "loss": 2.0361, "step": 7013 }, { "epoch": 0.0545474906909971, "grad_norm": 0.11751366328381378, "learning_rate": 9.927156943159089e-05, "loss": 2.0276, "step": 7014 }, { "epoch": 0.05455526763577768, "grad_norm": 0.11506879654045843, "learning_rate": 9.927136162257416e-05, "loss": 2.0642, "step": 7015 }, { "epoch": 0.05456304458055826, "grad_norm": 0.12078590649993177, "learning_rate": 9.927115378413704e-05, "loss": 2.0629, "step": 7016 }, { "epoch": 0.05457082152533884, "grad_norm": 0.11865331654156666, "learning_rate": 9.927094591627962e-05, "loss": 2.0355, "step": 7017 }, { "epoch": 0.05457859847011942, "grad_norm": 0.11536047364069678, "learning_rate": 9.927073801900203e-05, "loss": 2.0892, "step": 7018 }, { "epoch": 0.0545863754149, "grad_norm": 0.12113216572856032, "learning_rate": 9.92705300923044e-05, "loss": 2.071, "step": 7019 }, { "epoch": 0.054594152359680584, "grad_norm": 0.12993321608732988, "learning_rate": 9.927032213618686e-05, "loss": 2.1379, "step": 7020 }, { "epoch": 0.054601929304461165, "grad_norm": 0.11636273833729989, "learning_rate": 9.927011415064952e-05, "loss": 2.1005, "step": 7021 }, { "epoch": 0.054609706249241746, "grad_norm": 0.12719856855584732, "learning_rate": 9.926990613569251e-05, "loss": 2.0869, "step": 7022 }, { "epoch": 0.05461748319402233, "grad_norm": 0.13201192595597253, "learning_rate": 9.926969809131594e-05, "loss": 2.0522, "step": 7023 }, { "epoch": 0.05462526013880291, "grad_norm": 0.11329698354739832, "learning_rate": 9.926949001751997e-05, "loss": 2.0355, "step": 7024 }, { "epoch": 0.05463303708358349, "grad_norm": 0.14185682483137663, "learning_rate": 9.926928191430469e-05, "loss": 2.0222, "step": 7025 }, { "epoch": 0.05464081402836407, "grad_norm": 0.14281284402293276, "learning_rate": 9.926907378167023e-05, "loss": 2.0165, "step": 7026 }, { "epoch": 0.05464859097314465, "grad_norm": 0.14062124871394957, "learning_rate": 9.926886561961673e-05, "loss": 2.0446, "step": 7027 }, { "epoch": 0.05465636791792523, "grad_norm": 0.1534359116401353, "learning_rate": 9.926865742814431e-05, "loss": 2.0224, "step": 7028 }, { "epoch": 0.054664144862705814, "grad_norm": 0.12808476585597325, "learning_rate": 9.926844920725307e-05, "loss": 2.0426, "step": 7029 }, { "epoch": 0.054671921807486396, "grad_norm": 0.14234594570857, "learning_rate": 9.926824095694317e-05, "loss": 2.0632, "step": 7030 }, { "epoch": 0.05467969875226698, "grad_norm": 0.15843434284625463, "learning_rate": 9.926803267721472e-05, "loss": 2.0737, "step": 7031 }, { "epoch": 0.05468747569704756, "grad_norm": 0.11271534163986668, "learning_rate": 9.926782436806782e-05, "loss": 2.0513, "step": 7032 }, { "epoch": 0.05469525264182814, "grad_norm": 0.147420345334127, "learning_rate": 9.926761602950265e-05, "loss": 2.0617, "step": 7033 }, { "epoch": 0.05470302958660872, "grad_norm": 0.11776447936535155, "learning_rate": 9.926740766151927e-05, "loss": 2.0778, "step": 7034 }, { "epoch": 0.0547108065313893, "grad_norm": 0.1797384574648758, "learning_rate": 9.926719926411784e-05, "loss": 2.0633, "step": 7035 }, { "epoch": 0.05471858347616988, "grad_norm": 0.1193226641499756, "learning_rate": 9.92669908372985e-05, "loss": 2.0695, "step": 7036 }, { "epoch": 0.054726360420950464, "grad_norm": 0.175792629710724, "learning_rate": 9.926678238106135e-05, "loss": 2.0763, "step": 7037 }, { "epoch": 0.05473413736573105, "grad_norm": 0.14747287289338404, "learning_rate": 9.92665738954065e-05, "loss": 2.0505, "step": 7038 }, { "epoch": 0.05474191431051163, "grad_norm": 0.12803458727157024, "learning_rate": 9.926636538033411e-05, "loss": 2.0525, "step": 7039 }, { "epoch": 0.054749691255292214, "grad_norm": 0.14353727068180772, "learning_rate": 9.926615683584429e-05, "loss": 2.087, "step": 7040 }, { "epoch": 0.054757468200072795, "grad_norm": 0.11425550384421677, "learning_rate": 9.926594826193716e-05, "loss": 2.0564, "step": 7041 }, { "epoch": 0.054765245144853376, "grad_norm": 0.1509763741717138, "learning_rate": 9.926573965861283e-05, "loss": 1.9922, "step": 7042 }, { "epoch": 0.05477302208963396, "grad_norm": 0.11554483483880396, "learning_rate": 9.926553102587146e-05, "loss": 2.0277, "step": 7043 }, { "epoch": 0.05478079903441454, "grad_norm": 0.15192162432511166, "learning_rate": 9.926532236371315e-05, "loss": 2.0594, "step": 7044 }, { "epoch": 0.05478857597919512, "grad_norm": 0.11507089530407155, "learning_rate": 9.926511367213803e-05, "loss": 2.0962, "step": 7045 }, { "epoch": 0.0547963529239757, "grad_norm": 0.13984701477963343, "learning_rate": 9.926490495114622e-05, "loss": 2.0455, "step": 7046 }, { "epoch": 0.05480412986875628, "grad_norm": 0.1415041843525349, "learning_rate": 9.926469620073786e-05, "loss": 2.0558, "step": 7047 }, { "epoch": 0.05481190681353686, "grad_norm": 0.11585613733636718, "learning_rate": 9.926448742091307e-05, "loss": 2.0494, "step": 7048 }, { "epoch": 0.054819683758317445, "grad_norm": 0.13726045877756077, "learning_rate": 9.926427861167196e-05, "loss": 2.0679, "step": 7049 }, { "epoch": 0.054827460703098026, "grad_norm": 0.11734918433286144, "learning_rate": 9.926406977301465e-05, "loss": 2.0965, "step": 7050 }, { "epoch": 0.05483523764787861, "grad_norm": 0.11783791726267069, "learning_rate": 9.926386090494129e-05, "loss": 2.0819, "step": 7051 }, { "epoch": 0.05484301459265919, "grad_norm": 0.11702907033310075, "learning_rate": 9.9263652007452e-05, "loss": 2.0578, "step": 7052 }, { "epoch": 0.05485079153743977, "grad_norm": 0.1359992472582796, "learning_rate": 9.926344308054689e-05, "loss": 2.0475, "step": 7053 }, { "epoch": 0.05485856848222035, "grad_norm": 0.12170674175245376, "learning_rate": 9.92632341242261e-05, "loss": 2.0589, "step": 7054 }, { "epoch": 0.05486634542700093, "grad_norm": 0.11540623272929798, "learning_rate": 9.926302513848975e-05, "loss": 2.0263, "step": 7055 }, { "epoch": 0.05487412237178151, "grad_norm": 0.16730346786024322, "learning_rate": 9.926281612333795e-05, "loss": 2.0794, "step": 7056 }, { "epoch": 0.054881899316562094, "grad_norm": 0.11845850787323062, "learning_rate": 9.926260707877084e-05, "loss": 2.0684, "step": 7057 }, { "epoch": 0.054889676261342675, "grad_norm": 0.5646769046459742, "learning_rate": 9.926239800478856e-05, "loss": 2.064, "step": 7058 }, { "epoch": 0.054897453206123256, "grad_norm": 0.12019028107922593, "learning_rate": 9.926218890139119e-05, "loss": 2.0403, "step": 7059 }, { "epoch": 0.05490523015090384, "grad_norm": 0.11641138774301278, "learning_rate": 9.92619797685789e-05, "loss": 2.0373, "step": 7060 }, { "epoch": 0.05491300709568442, "grad_norm": 0.12235084145999524, "learning_rate": 9.926177060635178e-05, "loss": 2.0503, "step": 7061 }, { "epoch": 0.054920784040465, "grad_norm": 0.12686713445601358, "learning_rate": 9.926156141470999e-05, "loss": 2.0736, "step": 7062 }, { "epoch": 0.05492856098524558, "grad_norm": 0.12919395895367294, "learning_rate": 9.926135219365362e-05, "loss": 2.0738, "step": 7063 }, { "epoch": 0.05493633793002616, "grad_norm": 0.13867211488468628, "learning_rate": 9.926114294318283e-05, "loss": 2.0383, "step": 7064 }, { "epoch": 0.05494411487480674, "grad_norm": 0.3332799582038935, "learning_rate": 9.926093366329771e-05, "loss": 2.08, "step": 7065 }, { "epoch": 0.054951891819587324, "grad_norm": 0.1183346528612333, "learning_rate": 9.926072435399841e-05, "loss": 2.0542, "step": 7066 }, { "epoch": 0.054959668764367905, "grad_norm": 0.1280502258719635, "learning_rate": 9.926051501528504e-05, "loss": 2.087, "step": 7067 }, { "epoch": 0.05496744570914849, "grad_norm": 0.125451164390231, "learning_rate": 9.926030564715774e-05, "loss": 2.0753, "step": 7068 }, { "epoch": 0.05497522265392907, "grad_norm": 0.11413036172436355, "learning_rate": 9.92600962496166e-05, "loss": 2.0938, "step": 7069 }, { "epoch": 0.05498299959870965, "grad_norm": 0.12128451925725238, "learning_rate": 9.925988682266181e-05, "loss": 2.0817, "step": 7070 }, { "epoch": 0.05499077654349023, "grad_norm": 0.1262583875283905, "learning_rate": 9.925967736629343e-05, "loss": 2.0476, "step": 7071 }, { "epoch": 0.05499855348827081, "grad_norm": 0.1163669754421756, "learning_rate": 9.925946788051162e-05, "loss": 1.9756, "step": 7072 }, { "epoch": 0.05500633043305139, "grad_norm": 0.12184465487514717, "learning_rate": 9.925925836531649e-05, "loss": 2.0474, "step": 7073 }, { "epoch": 0.055014107377831974, "grad_norm": 0.14535112739504577, "learning_rate": 9.925904882070818e-05, "loss": 2.0332, "step": 7074 }, { "epoch": 0.055021884322612555, "grad_norm": 0.11908142709274092, "learning_rate": 9.925883924668681e-05, "loss": 2.0982, "step": 7075 }, { "epoch": 0.055029661267393136, "grad_norm": 0.1160812655717699, "learning_rate": 9.92586296432525e-05, "loss": 2.0392, "step": 7076 }, { "epoch": 0.05503743821217372, "grad_norm": 0.11876253855271504, "learning_rate": 9.925842001040538e-05, "loss": 2.0795, "step": 7077 }, { "epoch": 0.0550452151569543, "grad_norm": 0.11319900544622268, "learning_rate": 9.925821034814555e-05, "loss": 2.0786, "step": 7078 }, { "epoch": 0.05505299210173488, "grad_norm": 0.12238988817073831, "learning_rate": 9.925800065647317e-05, "loss": 2.1047, "step": 7079 }, { "epoch": 0.05506076904651546, "grad_norm": 0.11182620756191793, "learning_rate": 9.925779093538837e-05, "loss": 2.0576, "step": 7080 }, { "epoch": 0.05506854599129604, "grad_norm": 0.12230342646554809, "learning_rate": 9.925758118489123e-05, "loss": 2.0514, "step": 7081 }, { "epoch": 0.05507632293607662, "grad_norm": 0.11281690141891847, "learning_rate": 9.925737140498191e-05, "loss": 2.0478, "step": 7082 }, { "epoch": 0.055084099880857204, "grad_norm": 0.11857939920600029, "learning_rate": 9.925716159566055e-05, "loss": 2.0789, "step": 7083 }, { "epoch": 0.055091876825637785, "grad_norm": 0.11987617971362553, "learning_rate": 9.925695175692722e-05, "loss": 2.0097, "step": 7084 }, { "epoch": 0.055099653770418366, "grad_norm": 0.12269917315700075, "learning_rate": 9.925674188878208e-05, "loss": 2.0629, "step": 7085 }, { "epoch": 0.05510743071519895, "grad_norm": 0.13879583152183844, "learning_rate": 9.925653199122528e-05, "loss": 2.0459, "step": 7086 }, { "epoch": 0.05511520765997953, "grad_norm": 0.13662671684846558, "learning_rate": 9.925632206425691e-05, "loss": 2.1171, "step": 7087 }, { "epoch": 0.05512298460476011, "grad_norm": 0.11686289761123483, "learning_rate": 9.925611210787711e-05, "loss": 2.0415, "step": 7088 }, { "epoch": 0.05513076154954069, "grad_norm": 0.14957797330404435, "learning_rate": 9.9255902122086e-05, "loss": 2.055, "step": 7089 }, { "epoch": 0.05513853849432127, "grad_norm": 0.12838618145387068, "learning_rate": 9.92556921068837e-05, "loss": 2.0516, "step": 7090 }, { "epoch": 0.05514631543910185, "grad_norm": 0.11963262265610337, "learning_rate": 9.925548206227033e-05, "loss": 2.0453, "step": 7091 }, { "epoch": 0.055154092383882435, "grad_norm": 0.12933953812621807, "learning_rate": 9.925527198824605e-05, "loss": 2.0478, "step": 7092 }, { "epoch": 0.055161869328663016, "grad_norm": 0.10848884057577145, "learning_rate": 9.925506188481094e-05, "loss": 2.0881, "step": 7093 }, { "epoch": 0.0551696462734436, "grad_norm": 0.14400225399830804, "learning_rate": 9.925485175196516e-05, "loss": 2.0305, "step": 7094 }, { "epoch": 0.05517742321822418, "grad_norm": 0.12008797857286832, "learning_rate": 9.92546415897088e-05, "loss": 2.0856, "step": 7095 }, { "epoch": 0.05518520016300476, "grad_norm": 0.13445153586130934, "learning_rate": 9.925443139804205e-05, "loss": 2.0521, "step": 7096 }, { "epoch": 0.05519297710778535, "grad_norm": 0.13821114667272064, "learning_rate": 9.925422117696496e-05, "loss": 2.1071, "step": 7097 }, { "epoch": 0.05520075405256593, "grad_norm": 0.12264003961117495, "learning_rate": 9.925401092647771e-05, "loss": 2.0029, "step": 7098 }, { "epoch": 0.05520853099734651, "grad_norm": 0.15576450962507943, "learning_rate": 9.925380064658039e-05, "loss": 2.0551, "step": 7099 }, { "epoch": 0.05521630794212709, "grad_norm": 0.11099376589015376, "learning_rate": 9.925359033727314e-05, "loss": 2.021, "step": 7100 }, { "epoch": 0.05522408488690767, "grad_norm": 0.14072530427884333, "learning_rate": 9.925337999855608e-05, "loss": 2.0415, "step": 7101 }, { "epoch": 0.05523186183168825, "grad_norm": 0.11800257975657683, "learning_rate": 9.925316963042937e-05, "loss": 2.0449, "step": 7102 }, { "epoch": 0.055239638776468834, "grad_norm": 0.1505426999333216, "learning_rate": 9.925295923289309e-05, "loss": 2.1049, "step": 7103 }, { "epoch": 0.055247415721249415, "grad_norm": 0.1694737910602656, "learning_rate": 9.925274880594737e-05, "loss": 2.0665, "step": 7104 }, { "epoch": 0.05525519266603, "grad_norm": 0.11043617152370927, "learning_rate": 9.925253834959237e-05, "loss": 2.0293, "step": 7105 }, { "epoch": 0.05526296961081058, "grad_norm": 0.1628513894954803, "learning_rate": 9.925232786382819e-05, "loss": 2.0509, "step": 7106 }, { "epoch": 0.05527074655559116, "grad_norm": 0.12689429726147752, "learning_rate": 9.925211734865495e-05, "loss": 2.0919, "step": 7107 }, { "epoch": 0.05527852350037174, "grad_norm": 0.13222555162127958, "learning_rate": 9.925190680407279e-05, "loss": 2.0668, "step": 7108 }, { "epoch": 0.05528630044515232, "grad_norm": 0.1322531220893625, "learning_rate": 9.925169623008184e-05, "loss": 2.0228, "step": 7109 }, { "epoch": 0.0552940773899329, "grad_norm": 0.12060381123148999, "learning_rate": 9.925148562668221e-05, "loss": 2.0549, "step": 7110 }, { "epoch": 0.055301854334713484, "grad_norm": 0.34069106505579216, "learning_rate": 9.925127499387401e-05, "loss": 1.9907, "step": 7111 }, { "epoch": 0.055309631279494065, "grad_norm": 0.11715391185365906, "learning_rate": 9.925106433165742e-05, "loss": 2.0143, "step": 7112 }, { "epoch": 0.055317408224274646, "grad_norm": 0.3920507182724968, "learning_rate": 9.925085364003252e-05, "loss": 2.065, "step": 7113 }, { "epoch": 0.05532518516905523, "grad_norm": 0.141628554515153, "learning_rate": 9.925064291899946e-05, "loss": 2.0831, "step": 7114 }, { "epoch": 0.05533296211383581, "grad_norm": 0.12031193119826167, "learning_rate": 9.925043216855833e-05, "loss": 2.0543, "step": 7115 }, { "epoch": 0.05534073905861639, "grad_norm": 0.14531549957431736, "learning_rate": 9.925022138870931e-05, "loss": 2.0615, "step": 7116 }, { "epoch": 0.05534851600339697, "grad_norm": 0.11553848834877631, "learning_rate": 9.925001057945249e-05, "loss": 2.1024, "step": 7117 }, { "epoch": 0.05535629294817755, "grad_norm": 0.13076199929222163, "learning_rate": 9.924979974078801e-05, "loss": 2.0653, "step": 7118 }, { "epoch": 0.05536406989295813, "grad_norm": 0.11593596139999883, "learning_rate": 9.924958887271598e-05, "loss": 2.0599, "step": 7119 }, { "epoch": 0.055371846837738714, "grad_norm": 0.1512987362745905, "learning_rate": 9.924937797523654e-05, "loss": 2.0445, "step": 7120 }, { "epoch": 0.055379623782519295, "grad_norm": 0.12885455965040116, "learning_rate": 9.924916704834981e-05, "loss": 2.1056, "step": 7121 }, { "epoch": 0.055387400727299876, "grad_norm": 0.13216138330222585, "learning_rate": 9.924895609205592e-05, "loss": 2.0693, "step": 7122 }, { "epoch": 0.05539517767208046, "grad_norm": 0.13184585693129988, "learning_rate": 9.924874510635498e-05, "loss": 2.1093, "step": 7123 }, { "epoch": 0.05540295461686104, "grad_norm": 0.1129260683940112, "learning_rate": 9.924853409124715e-05, "loss": 2.0477, "step": 7124 }, { "epoch": 0.05541073156164162, "grad_norm": 0.5738866246989394, "learning_rate": 9.924832304673251e-05, "loss": 2.0851, "step": 7125 }, { "epoch": 0.0554185085064222, "grad_norm": 0.11627756151941131, "learning_rate": 9.924811197281123e-05, "loss": 2.0697, "step": 7126 }, { "epoch": 0.05542628545120278, "grad_norm": 0.1376755325808266, "learning_rate": 9.924790086948342e-05, "loss": 2.0748, "step": 7127 }, { "epoch": 0.05543406239598336, "grad_norm": 0.15820669308299293, "learning_rate": 9.924768973674918e-05, "loss": 2.0341, "step": 7128 }, { "epoch": 0.055441839340763945, "grad_norm": 0.12997729191617696, "learning_rate": 9.924747857460867e-05, "loss": 2.0487, "step": 7129 }, { "epoch": 0.055449616285544526, "grad_norm": 0.13130065804552293, "learning_rate": 9.924726738306201e-05, "loss": 2.0272, "step": 7130 }, { "epoch": 0.05545739323032511, "grad_norm": 0.12313923918006435, "learning_rate": 9.924705616210933e-05, "loss": 2.0611, "step": 7131 }, { "epoch": 0.05546517017510569, "grad_norm": 0.12473168456253125, "learning_rate": 9.924684491175073e-05, "loss": 2.1238, "step": 7132 }, { "epoch": 0.05547294711988627, "grad_norm": 0.13716898207761644, "learning_rate": 9.924663363198636e-05, "loss": 1.9823, "step": 7133 }, { "epoch": 0.05548072406466685, "grad_norm": 0.12242014683654895, "learning_rate": 9.924642232281634e-05, "loss": 2.0058, "step": 7134 }, { "epoch": 0.05548850100944743, "grad_norm": 0.138927840889947, "learning_rate": 9.924621098424081e-05, "loss": 2.0374, "step": 7135 }, { "epoch": 0.05549627795422801, "grad_norm": 0.12270907350710905, "learning_rate": 9.924599961625986e-05, "loss": 2.0695, "step": 7136 }, { "epoch": 0.055504054899008594, "grad_norm": 0.23966515802965105, "learning_rate": 9.924578821887365e-05, "loss": 2.0159, "step": 7137 }, { "epoch": 0.055511831843789175, "grad_norm": 0.13944798346238746, "learning_rate": 9.924557679208229e-05, "loss": 2.0913, "step": 7138 }, { "epoch": 0.055519608788569756, "grad_norm": 0.14425549270858057, "learning_rate": 9.924536533588592e-05, "loss": 2.0567, "step": 7139 }, { "epoch": 0.05552738573335034, "grad_norm": 0.12221629510359447, "learning_rate": 9.924515385028466e-05, "loss": 2.0215, "step": 7140 }, { "epoch": 0.05553516267813092, "grad_norm": 0.1496414755986292, "learning_rate": 9.924494233527862e-05, "loss": 2.0112, "step": 7141 }, { "epoch": 0.0555429396229115, "grad_norm": 0.1300582629479537, "learning_rate": 9.924473079086794e-05, "loss": 1.9787, "step": 7142 }, { "epoch": 0.05555071656769208, "grad_norm": 0.13194389311371735, "learning_rate": 9.924451921705275e-05, "loss": 2.022, "step": 7143 }, { "epoch": 0.05555849351247266, "grad_norm": 0.12443382224981414, "learning_rate": 9.924430761383317e-05, "loss": 2.0143, "step": 7144 }, { "epoch": 0.05556627045725324, "grad_norm": 0.1220829829741943, "learning_rate": 9.924409598120934e-05, "loss": 2.0404, "step": 7145 }, { "epoch": 0.055574047402033824, "grad_norm": 0.11524721871004129, "learning_rate": 9.924388431918136e-05, "loss": 2.0638, "step": 7146 }, { "epoch": 0.055581824346814405, "grad_norm": 0.124013267733421, "learning_rate": 9.924367262774938e-05, "loss": 2.0416, "step": 7147 }, { "epoch": 0.05558960129159499, "grad_norm": 0.13350644435053186, "learning_rate": 9.92434609069135e-05, "loss": 2.0874, "step": 7148 }, { "epoch": 0.05559737823637557, "grad_norm": 0.11573653935731747, "learning_rate": 9.924324915667389e-05, "loss": 2.0533, "step": 7149 }, { "epoch": 0.05560515518115615, "grad_norm": 0.1262187616339508, "learning_rate": 9.924303737703064e-05, "loss": 2.0417, "step": 7150 }, { "epoch": 0.05561293212593673, "grad_norm": 0.1204897435069588, "learning_rate": 9.92428255679839e-05, "loss": 2.0323, "step": 7151 }, { "epoch": 0.05562070907071731, "grad_norm": 0.13950962564823788, "learning_rate": 9.924261372953376e-05, "loss": 2.0359, "step": 7152 }, { "epoch": 0.05562848601549789, "grad_norm": 0.11395977534933661, "learning_rate": 9.924240186168039e-05, "loss": 1.97, "step": 7153 }, { "epoch": 0.055636262960278474, "grad_norm": 0.12220770015019143, "learning_rate": 9.92421899644239e-05, "loss": 2.0097, "step": 7154 }, { "epoch": 0.055644039905059055, "grad_norm": 0.11306065469466378, "learning_rate": 9.924197803776438e-05, "loss": 2.0399, "step": 7155 }, { "epoch": 0.055651816849839636, "grad_norm": 0.13844046064262974, "learning_rate": 9.924176608170202e-05, "loss": 2.0058, "step": 7156 }, { "epoch": 0.055659593794620224, "grad_norm": 0.12153000194307151, "learning_rate": 9.92415540962369e-05, "loss": 2.0781, "step": 7157 }, { "epoch": 0.055667370739400805, "grad_norm": 0.12485856419089092, "learning_rate": 9.924134208136918e-05, "loss": 2.0362, "step": 7158 }, { "epoch": 0.055675147684181386, "grad_norm": 0.12176451850507783, "learning_rate": 9.924113003709896e-05, "loss": 2.0378, "step": 7159 }, { "epoch": 0.05568292462896197, "grad_norm": 0.1307148524786903, "learning_rate": 9.924091796342639e-05, "loss": 2.0635, "step": 7160 }, { "epoch": 0.05569070157374255, "grad_norm": 0.11836235453526717, "learning_rate": 9.924070586035155e-05, "loss": 2.0606, "step": 7161 }, { "epoch": 0.05569847851852313, "grad_norm": 0.13905366609569614, "learning_rate": 9.924049372787463e-05, "loss": 2.1044, "step": 7162 }, { "epoch": 0.05570625546330371, "grad_norm": 0.14906077607834514, "learning_rate": 9.92402815659957e-05, "loss": 2.0399, "step": 7163 }, { "epoch": 0.05571403240808429, "grad_norm": 0.1157284746557063, "learning_rate": 9.924006937471494e-05, "loss": 2.0874, "step": 7164 }, { "epoch": 0.05572180935286487, "grad_norm": 0.1478693907556657, "learning_rate": 9.923985715403243e-05, "loss": 2.015, "step": 7165 }, { "epoch": 0.055729586297645455, "grad_norm": 0.11220757730922562, "learning_rate": 9.923964490394832e-05, "loss": 2.0383, "step": 7166 }, { "epoch": 0.055737363242426036, "grad_norm": 0.14369571741491555, "learning_rate": 9.923943262446275e-05, "loss": 2.0492, "step": 7167 }, { "epoch": 0.05574514018720662, "grad_norm": 0.14061714846180126, "learning_rate": 9.92392203155758e-05, "loss": 2.0827, "step": 7168 }, { "epoch": 0.0557529171319872, "grad_norm": 0.12244040077267947, "learning_rate": 9.923900797728764e-05, "loss": 2.0434, "step": 7169 }, { "epoch": 0.05576069407676778, "grad_norm": 0.1445381491850017, "learning_rate": 9.92387956095984e-05, "loss": 2.0875, "step": 7170 }, { "epoch": 0.05576847102154836, "grad_norm": 0.13218161022862168, "learning_rate": 9.923858321250817e-05, "loss": 2.0589, "step": 7171 }, { "epoch": 0.05577624796632894, "grad_norm": 0.11859597690201289, "learning_rate": 9.923837078601711e-05, "loss": 2.1172, "step": 7172 }, { "epoch": 0.05578402491110952, "grad_norm": 0.12731408781435896, "learning_rate": 9.923815833012532e-05, "loss": 2.0623, "step": 7173 }, { "epoch": 0.055791801855890104, "grad_norm": 0.11362169850647887, "learning_rate": 9.923794584483294e-05, "loss": 2.0514, "step": 7174 }, { "epoch": 0.055799578800670685, "grad_norm": 0.11454624811191848, "learning_rate": 9.923773333014011e-05, "loss": 2.0443, "step": 7175 }, { "epoch": 0.055807355745451266, "grad_norm": 0.11535998885108692, "learning_rate": 9.923752078604695e-05, "loss": 1.9957, "step": 7176 }, { "epoch": 0.05581513269023185, "grad_norm": 0.11419710891019694, "learning_rate": 9.923730821255357e-05, "loss": 2.1052, "step": 7177 }, { "epoch": 0.05582290963501243, "grad_norm": 0.11909615165502684, "learning_rate": 9.923709560966011e-05, "loss": 2.1144, "step": 7178 }, { "epoch": 0.05583068657979301, "grad_norm": 0.1144756699461067, "learning_rate": 9.923688297736669e-05, "loss": 2.0176, "step": 7179 }, { "epoch": 0.05583846352457359, "grad_norm": 0.1179726180041913, "learning_rate": 9.923667031567343e-05, "loss": 2.0959, "step": 7180 }, { "epoch": 0.05584624046935417, "grad_norm": 0.11501696365785853, "learning_rate": 9.923645762458049e-05, "loss": 2.0227, "step": 7181 }, { "epoch": 0.05585401741413475, "grad_norm": 0.11976241055109033, "learning_rate": 9.923624490408797e-05, "loss": 2.0302, "step": 7182 }, { "epoch": 0.055861794358915334, "grad_norm": 0.11536066740272283, "learning_rate": 9.923603215419601e-05, "loss": 2.0687, "step": 7183 }, { "epoch": 0.055869571303695915, "grad_norm": 0.11559200220861743, "learning_rate": 9.923581937490471e-05, "loss": 2.0569, "step": 7184 }, { "epoch": 0.0558773482484765, "grad_norm": 0.11465379626844319, "learning_rate": 9.923560656621424e-05, "loss": 2.1124, "step": 7185 }, { "epoch": 0.05588512519325708, "grad_norm": 0.11270598704353184, "learning_rate": 9.92353937281247e-05, "loss": 2.0391, "step": 7186 }, { "epoch": 0.05589290213803766, "grad_norm": 0.11314115205780412, "learning_rate": 9.923518086063621e-05, "loss": 2.0631, "step": 7187 }, { "epoch": 0.05590067908281824, "grad_norm": 0.1337510798191787, "learning_rate": 9.923496796374891e-05, "loss": 2.0988, "step": 7188 }, { "epoch": 0.05590845602759882, "grad_norm": 0.1298973534735035, "learning_rate": 9.923475503746294e-05, "loss": 2.0335, "step": 7189 }, { "epoch": 0.0559162329723794, "grad_norm": 0.10851425558291312, "learning_rate": 9.92345420817784e-05, "loss": 2.0774, "step": 7190 }, { "epoch": 0.055924009917159984, "grad_norm": 0.11290835694820714, "learning_rate": 9.923432909669543e-05, "loss": 2.0316, "step": 7191 }, { "epoch": 0.055931786861940565, "grad_norm": 0.12254311021272114, "learning_rate": 9.923411608221416e-05, "loss": 2.0105, "step": 7192 }, { "epoch": 0.055939563806721146, "grad_norm": 0.11103600240853273, "learning_rate": 9.92339030383347e-05, "loss": 2.0808, "step": 7193 }, { "epoch": 0.05594734075150173, "grad_norm": 0.12052697848702441, "learning_rate": 9.923368996505722e-05, "loss": 2.0589, "step": 7194 }, { "epoch": 0.05595511769628231, "grad_norm": 0.11764571989402185, "learning_rate": 9.92334768623818e-05, "loss": 2.0393, "step": 7195 }, { "epoch": 0.05596289464106289, "grad_norm": 0.11940529825738336, "learning_rate": 9.923326373030859e-05, "loss": 2.0283, "step": 7196 }, { "epoch": 0.05597067158584347, "grad_norm": 0.12211028695621201, "learning_rate": 9.923305056883772e-05, "loss": 2.0536, "step": 7197 }, { "epoch": 0.05597844853062405, "grad_norm": 0.11135653864722783, "learning_rate": 9.923283737796931e-05, "loss": 2.059, "step": 7198 }, { "epoch": 0.05598622547540463, "grad_norm": 0.11289440154044016, "learning_rate": 9.923262415770347e-05, "loss": 2.1418, "step": 7199 }, { "epoch": 0.055994002420185214, "grad_norm": 0.11212167598303996, "learning_rate": 9.923241090804037e-05, "loss": 2.0907, "step": 7200 }, { "epoch": 0.056001779364965795, "grad_norm": 0.11684960183782712, "learning_rate": 9.923219762898009e-05, "loss": 2.1, "step": 7201 }, { "epoch": 0.056009556309746376, "grad_norm": 0.1125521409594102, "learning_rate": 9.923198432052278e-05, "loss": 2.093, "step": 7202 }, { "epoch": 0.05601733325452696, "grad_norm": 0.11193701201551058, "learning_rate": 9.923177098266858e-05, "loss": 2.0553, "step": 7203 }, { "epoch": 0.05602511019930754, "grad_norm": 0.11439413508436162, "learning_rate": 9.92315576154176e-05, "loss": 2.0458, "step": 7204 }, { "epoch": 0.05603288714408812, "grad_norm": 0.10714327106458581, "learning_rate": 9.923134421876997e-05, "loss": 2.024, "step": 7205 }, { "epoch": 0.0560406640888687, "grad_norm": 0.11430453240552883, "learning_rate": 9.923113079272582e-05, "loss": 2.0807, "step": 7206 }, { "epoch": 0.05604844103364928, "grad_norm": 0.11129815971827932, "learning_rate": 9.923091733728529e-05, "loss": 2.0686, "step": 7207 }, { "epoch": 0.05605621797842986, "grad_norm": 0.11433062220866645, "learning_rate": 9.923070385244848e-05, "loss": 2.0081, "step": 7208 }, { "epoch": 0.056063994923210445, "grad_norm": 0.1146492466013055, "learning_rate": 9.923049033821553e-05, "loss": 2.0342, "step": 7209 }, { "epoch": 0.056071771867991026, "grad_norm": 0.12193067724465659, "learning_rate": 9.923027679458657e-05, "loss": 1.9909, "step": 7210 }, { "epoch": 0.05607954881277161, "grad_norm": 0.11511334260909184, "learning_rate": 9.923006322156173e-05, "loss": 2.1072, "step": 7211 }, { "epoch": 0.05608732575755219, "grad_norm": 0.11593677379047566, "learning_rate": 9.922984961914114e-05, "loss": 2.0041, "step": 7212 }, { "epoch": 0.05609510270233277, "grad_norm": 0.11227820350495318, "learning_rate": 9.92296359873249e-05, "loss": 2.0647, "step": 7213 }, { "epoch": 0.05610287964711335, "grad_norm": 0.11458535656341874, "learning_rate": 9.922942232611319e-05, "loss": 2.0521, "step": 7214 }, { "epoch": 0.05611065659189393, "grad_norm": 0.12412361055096063, "learning_rate": 9.922920863550608e-05, "loss": 2.0716, "step": 7215 }, { "epoch": 0.05611843353667452, "grad_norm": 0.11325628474438922, "learning_rate": 9.922899491550375e-05, "loss": 2.0562, "step": 7216 }, { "epoch": 0.0561262104814551, "grad_norm": 0.12293204902589905, "learning_rate": 9.922878116610628e-05, "loss": 2.046, "step": 7217 }, { "epoch": 0.05613398742623568, "grad_norm": 0.12505945579277153, "learning_rate": 9.922856738731383e-05, "loss": 2.0596, "step": 7218 }, { "epoch": 0.05614176437101626, "grad_norm": 0.11091696044418299, "learning_rate": 9.922835357912651e-05, "loss": 2.0976, "step": 7219 }, { "epoch": 0.056149541315796844, "grad_norm": 0.23997532847279557, "learning_rate": 9.922813974154447e-05, "loss": 2.0502, "step": 7220 }, { "epoch": 0.056157318260577425, "grad_norm": 0.11159005930649582, "learning_rate": 9.922792587456781e-05, "loss": 1.9942, "step": 7221 }, { "epoch": 0.05616509520535801, "grad_norm": 0.12280558730685803, "learning_rate": 9.922771197819669e-05, "loss": 2.075, "step": 7222 }, { "epoch": 0.05617287215013859, "grad_norm": 0.11395813954143455, "learning_rate": 9.922749805243118e-05, "loss": 2.0291, "step": 7223 }, { "epoch": 0.05618064909491917, "grad_norm": 0.1176089635274417, "learning_rate": 9.922728409727148e-05, "loss": 2.1147, "step": 7224 }, { "epoch": 0.05618842603969975, "grad_norm": 0.12502714743412147, "learning_rate": 9.922707011271768e-05, "loss": 2.0232, "step": 7225 }, { "epoch": 0.05619620298448033, "grad_norm": 0.12063039256211136, "learning_rate": 9.922685609876989e-05, "loss": 2.0548, "step": 7226 }, { "epoch": 0.05620397992926091, "grad_norm": 0.12367881939076215, "learning_rate": 9.922664205542828e-05, "loss": 2.0649, "step": 7227 }, { "epoch": 0.056211756874041494, "grad_norm": 0.12699716385056137, "learning_rate": 9.922642798269295e-05, "loss": 2.0245, "step": 7228 }, { "epoch": 0.056219533818822075, "grad_norm": 0.12310852122203873, "learning_rate": 9.922621388056403e-05, "loss": 2.0246, "step": 7229 }, { "epoch": 0.056227310763602656, "grad_norm": 0.15404295204605822, "learning_rate": 9.922599974904165e-05, "loss": 2.0907, "step": 7230 }, { "epoch": 0.05623508770838324, "grad_norm": 0.12780569211373255, "learning_rate": 9.922578558812596e-05, "loss": 2.0206, "step": 7231 }, { "epoch": 0.05624286465316382, "grad_norm": 0.11856138302245928, "learning_rate": 9.922557139781706e-05, "loss": 2.0573, "step": 7232 }, { "epoch": 0.0562506415979444, "grad_norm": 0.13486797614339466, "learning_rate": 9.922535717811507e-05, "loss": 2.0622, "step": 7233 }, { "epoch": 0.05625841854272498, "grad_norm": 0.117008308224257, "learning_rate": 9.922514292902015e-05, "loss": 2.0732, "step": 7234 }, { "epoch": 0.05626619548750556, "grad_norm": 0.12432081491472748, "learning_rate": 9.92249286505324e-05, "loss": 2.0479, "step": 7235 }, { "epoch": 0.05627397243228614, "grad_norm": 0.11740557544031555, "learning_rate": 9.922471434265198e-05, "loss": 2.086, "step": 7236 }, { "epoch": 0.056281749377066724, "grad_norm": 0.1160009045382732, "learning_rate": 9.922450000537899e-05, "loss": 2.025, "step": 7237 }, { "epoch": 0.056289526321847305, "grad_norm": 0.11950317487931555, "learning_rate": 9.922428563871355e-05, "loss": 2.0834, "step": 7238 }, { "epoch": 0.056297303266627886, "grad_norm": 0.12773016917920527, "learning_rate": 9.922407124265583e-05, "loss": 1.9931, "step": 7239 }, { "epoch": 0.05630508021140847, "grad_norm": 0.11514091423185054, "learning_rate": 9.922385681720591e-05, "loss": 2.0506, "step": 7240 }, { "epoch": 0.05631285715618905, "grad_norm": 0.11189774728828898, "learning_rate": 9.922364236236395e-05, "loss": 2.0388, "step": 7241 }, { "epoch": 0.05632063410096963, "grad_norm": 0.31226870464227124, "learning_rate": 9.922342787813008e-05, "loss": 2.105, "step": 7242 }, { "epoch": 0.05632841104575021, "grad_norm": 0.11955758227884522, "learning_rate": 9.922321336450439e-05, "loss": 2.042, "step": 7243 }, { "epoch": 0.05633618799053079, "grad_norm": 0.1102390341508, "learning_rate": 9.922299882148706e-05, "loss": 2.093, "step": 7244 }, { "epoch": 0.05634396493531137, "grad_norm": 0.11471478348310092, "learning_rate": 9.922278424907819e-05, "loss": 2.0672, "step": 7245 }, { "epoch": 0.056351741880091955, "grad_norm": 0.11438837525757808, "learning_rate": 9.92225696472779e-05, "loss": 2.1084, "step": 7246 }, { "epoch": 0.056359518824872536, "grad_norm": 0.10923571959940169, "learning_rate": 9.922235501608632e-05, "loss": 2.0672, "step": 7247 }, { "epoch": 0.05636729576965312, "grad_norm": 0.11777538374728509, "learning_rate": 9.922214035550359e-05, "loss": 2.0939, "step": 7248 }, { "epoch": 0.0563750727144337, "grad_norm": 0.11078047243633204, "learning_rate": 9.922192566552984e-05, "loss": 2.0551, "step": 7249 }, { "epoch": 0.05638284965921428, "grad_norm": 0.11251901514055769, "learning_rate": 9.92217109461652e-05, "loss": 2.0826, "step": 7250 }, { "epoch": 0.05639062660399486, "grad_norm": 0.12007375857151473, "learning_rate": 9.92214961974098e-05, "loss": 2.0322, "step": 7251 }, { "epoch": 0.05639840354877544, "grad_norm": 0.11116375960465422, "learning_rate": 9.922128141926375e-05, "loss": 2.0566, "step": 7252 }, { "epoch": 0.05640618049355602, "grad_norm": 0.1148930747031914, "learning_rate": 9.922106661172717e-05, "loss": 2.1253, "step": 7253 }, { "epoch": 0.056413957438336604, "grad_norm": 0.3876280415617357, "learning_rate": 9.922085177480023e-05, "loss": 2.1309, "step": 7254 }, { "epoch": 0.056421734383117185, "grad_norm": 0.3526588461652825, "learning_rate": 9.922063690848303e-05, "loss": 2.0534, "step": 7255 }, { "epoch": 0.056429511327897766, "grad_norm": 0.33393673854837597, "learning_rate": 9.922042201277568e-05, "loss": 2.0518, "step": 7256 }, { "epoch": 0.05643728827267835, "grad_norm": 0.18941877062426066, "learning_rate": 9.922020708767835e-05, "loss": 2.0823, "step": 7257 }, { "epoch": 0.05644506521745893, "grad_norm": 0.6165644571216887, "learning_rate": 9.921999213319115e-05, "loss": 2.0845, "step": 7258 }, { "epoch": 0.05645284216223951, "grad_norm": 0.5074037573181862, "learning_rate": 9.921977714931421e-05, "loss": 2.0591, "step": 7259 }, { "epoch": 0.05646061910702009, "grad_norm": 0.326514791404787, "learning_rate": 9.921956213604764e-05, "loss": 2.0633, "step": 7260 }, { "epoch": 0.05646839605180067, "grad_norm": 0.6563857559010065, "learning_rate": 9.92193470933916e-05, "loss": 2.0917, "step": 7261 }, { "epoch": 0.05647617299658125, "grad_norm": 0.16641179124072294, "learning_rate": 9.921913202134621e-05, "loss": 2.0851, "step": 7262 }, { "epoch": 0.056483949941361834, "grad_norm": 0.3838335197491559, "learning_rate": 9.921891691991158e-05, "loss": 2.0666, "step": 7263 }, { "epoch": 0.056491726886142415, "grad_norm": 0.1284960042543037, "learning_rate": 9.921870178908786e-05, "loss": 2.024, "step": 7264 }, { "epoch": 0.056499503830923, "grad_norm": 0.4467027328001456, "learning_rate": 9.921848662887515e-05, "loss": 2.0717, "step": 7265 }, { "epoch": 0.05650728077570358, "grad_norm": 0.12406453903264174, "learning_rate": 9.92182714392736e-05, "loss": 2.0717, "step": 7266 }, { "epoch": 0.05651505772048416, "grad_norm": 0.3249317307800039, "learning_rate": 9.921805622028335e-05, "loss": 2.0231, "step": 7267 }, { "epoch": 0.05652283466526474, "grad_norm": 0.12357965079143751, "learning_rate": 9.92178409719045e-05, "loss": 1.9879, "step": 7268 }, { "epoch": 0.05653061161004532, "grad_norm": 0.35228055714591106, "learning_rate": 9.921762569413719e-05, "loss": 2.0844, "step": 7269 }, { "epoch": 0.0565383885548259, "grad_norm": 0.12265508284579833, "learning_rate": 9.921741038698155e-05, "loss": 2.0641, "step": 7270 }, { "epoch": 0.056546165499606484, "grad_norm": 0.23084084755189804, "learning_rate": 9.921719505043772e-05, "loss": 2.0369, "step": 7271 }, { "epoch": 0.056553942444387065, "grad_norm": 0.1449312734822839, "learning_rate": 9.921697968450582e-05, "loss": 2.0953, "step": 7272 }, { "epoch": 0.056561719389167646, "grad_norm": 0.2565896329245288, "learning_rate": 9.921676428918598e-05, "loss": 2.0612, "step": 7273 }, { "epoch": 0.05656949633394823, "grad_norm": 0.1259385036737814, "learning_rate": 9.921654886447831e-05, "loss": 2.0339, "step": 7274 }, { "epoch": 0.056577273278728815, "grad_norm": 0.19296310597050534, "learning_rate": 9.921633341038297e-05, "loss": 2.048, "step": 7275 }, { "epoch": 0.056585050223509396, "grad_norm": 0.15304311156726397, "learning_rate": 9.921611792690005e-05, "loss": 2.0329, "step": 7276 }, { "epoch": 0.05659282716828998, "grad_norm": 0.21765107114152923, "learning_rate": 9.921590241402971e-05, "loss": 2.0058, "step": 7277 }, { "epoch": 0.05660060411307056, "grad_norm": 0.1393166206867217, "learning_rate": 9.921568687177208e-05, "loss": 2.0797, "step": 7278 }, { "epoch": 0.05660838105785114, "grad_norm": 0.1714338481943565, "learning_rate": 9.921547130012727e-05, "loss": 2.1049, "step": 7279 }, { "epoch": 0.05661615800263172, "grad_norm": 0.1681430036688178, "learning_rate": 9.921525569909542e-05, "loss": 2.0226, "step": 7280 }, { "epoch": 0.0566239349474123, "grad_norm": 0.1670322997337577, "learning_rate": 9.921504006867665e-05, "loss": 2.0124, "step": 7281 }, { "epoch": 0.05663171189219288, "grad_norm": 0.1496967844996762, "learning_rate": 9.921482440887111e-05, "loss": 2.0897, "step": 7282 }, { "epoch": 0.056639488836973464, "grad_norm": 0.14152664353110833, "learning_rate": 9.92146087196789e-05, "loss": 2.027, "step": 7283 }, { "epoch": 0.056647265781754046, "grad_norm": 0.13829981625565474, "learning_rate": 9.921439300110017e-05, "loss": 2.0714, "step": 7284 }, { "epoch": 0.05665504272653463, "grad_norm": 0.12005169014793339, "learning_rate": 9.921417725313504e-05, "loss": 2.016, "step": 7285 }, { "epoch": 0.05666281967131521, "grad_norm": 0.1554416221792662, "learning_rate": 9.921396147578363e-05, "loss": 2.0256, "step": 7286 }, { "epoch": 0.05667059661609579, "grad_norm": 0.12016523349285622, "learning_rate": 9.921374566904608e-05, "loss": 2.1217, "step": 7287 }, { "epoch": 0.05667837356087637, "grad_norm": 0.15075582746703906, "learning_rate": 9.921352983292253e-05, "loss": 2.0631, "step": 7288 }, { "epoch": 0.05668615050565695, "grad_norm": 0.12254498471071652, "learning_rate": 9.921331396741309e-05, "loss": 2.1203, "step": 7289 }, { "epoch": 0.05669392745043753, "grad_norm": 0.13892178466302887, "learning_rate": 9.92130980725179e-05, "loss": 2.0473, "step": 7290 }, { "epoch": 0.056701704395218114, "grad_norm": 0.11692635679541456, "learning_rate": 9.921288214823708e-05, "loss": 2.0395, "step": 7291 }, { "epoch": 0.056709481339998695, "grad_norm": 0.1278966904436155, "learning_rate": 9.921266619457075e-05, "loss": 2.0433, "step": 7292 }, { "epoch": 0.056717258284779276, "grad_norm": 0.12123531314408888, "learning_rate": 9.921245021151906e-05, "loss": 1.9895, "step": 7293 }, { "epoch": 0.05672503522955986, "grad_norm": 0.12838635613614863, "learning_rate": 9.921223419908215e-05, "loss": 1.9781, "step": 7294 }, { "epoch": 0.05673281217434044, "grad_norm": 0.1228888380086784, "learning_rate": 9.921201815726012e-05, "loss": 2.0379, "step": 7295 }, { "epoch": 0.05674058911912102, "grad_norm": 0.1289035115422876, "learning_rate": 9.92118020860531e-05, "loss": 2.0516, "step": 7296 }, { "epoch": 0.0567483660639016, "grad_norm": 0.11828268467867255, "learning_rate": 9.921158598546123e-05, "loss": 2.0468, "step": 7297 }, { "epoch": 0.05675614300868218, "grad_norm": 0.12081328754857211, "learning_rate": 9.921136985548465e-05, "loss": 2.0511, "step": 7298 }, { "epoch": 0.05676391995346276, "grad_norm": 0.11388144733367206, "learning_rate": 9.921115369612346e-05, "loss": 2.0534, "step": 7299 }, { "epoch": 0.056771696898243344, "grad_norm": 0.11084407997265094, "learning_rate": 9.921093750737781e-05, "loss": 1.9475, "step": 7300 }, { "epoch": 0.056779473843023925, "grad_norm": 0.11862980492715457, "learning_rate": 9.921072128924782e-05, "loss": 2.0143, "step": 7301 }, { "epoch": 0.05678725078780451, "grad_norm": 0.11060705285685339, "learning_rate": 9.921050504173363e-05, "loss": 2.0387, "step": 7302 }, { "epoch": 0.05679502773258509, "grad_norm": 0.12037430898991275, "learning_rate": 9.921028876483537e-05, "loss": 1.9815, "step": 7303 }, { "epoch": 0.05680280467736567, "grad_norm": 0.16783148493279845, "learning_rate": 9.921007245855315e-05, "loss": 2.0354, "step": 7304 }, { "epoch": 0.05681058162214625, "grad_norm": 0.1156878469554078, "learning_rate": 9.92098561228871e-05, "loss": 2.1103, "step": 7305 }, { "epoch": 0.05681835856692683, "grad_norm": 0.1197932548493435, "learning_rate": 9.92096397578374e-05, "loss": 2.0772, "step": 7306 }, { "epoch": 0.05682613551170741, "grad_norm": 0.11368811942389666, "learning_rate": 9.920942336340411e-05, "loss": 2.0248, "step": 7307 }, { "epoch": 0.056833912456487994, "grad_norm": 0.11020829447754413, "learning_rate": 9.920920693958739e-05, "loss": 2.1032, "step": 7308 }, { "epoch": 0.056841689401268575, "grad_norm": 0.11165972199348759, "learning_rate": 9.920899048638737e-05, "loss": 2.034, "step": 7309 }, { "epoch": 0.056849466346049156, "grad_norm": 0.11442508625385082, "learning_rate": 9.920877400380417e-05, "loss": 2.0588, "step": 7310 }, { "epoch": 0.05685724329082974, "grad_norm": 0.1125366294346133, "learning_rate": 9.920855749183793e-05, "loss": 2.0295, "step": 7311 }, { "epoch": 0.05686502023561032, "grad_norm": 0.11077536099805164, "learning_rate": 9.920834095048879e-05, "loss": 2.0457, "step": 7312 }, { "epoch": 0.0568727971803909, "grad_norm": 0.1209817760670034, "learning_rate": 9.920812437975685e-05, "loss": 2.0409, "step": 7313 }, { "epoch": 0.05688057412517148, "grad_norm": 0.12263097408773248, "learning_rate": 9.920790777964226e-05, "loss": 2.064, "step": 7314 }, { "epoch": 0.05688835106995206, "grad_norm": 0.11093708416775244, "learning_rate": 9.920769115014515e-05, "loss": 2.0678, "step": 7315 }, { "epoch": 0.05689612801473264, "grad_norm": 0.11637242685916009, "learning_rate": 9.920747449126563e-05, "loss": 2.0999, "step": 7316 }, { "epoch": 0.056903904959513224, "grad_norm": 0.1172373584564637, "learning_rate": 9.920725780300384e-05, "loss": 2.0599, "step": 7317 }, { "epoch": 0.056911681904293805, "grad_norm": 0.11383556142305615, "learning_rate": 9.920704108535991e-05, "loss": 2.0626, "step": 7318 }, { "epoch": 0.056919458849074386, "grad_norm": 0.12321514980795144, "learning_rate": 9.920682433833399e-05, "loss": 2.029, "step": 7319 }, { "epoch": 0.05692723579385497, "grad_norm": 0.11294156043873903, "learning_rate": 9.920660756192618e-05, "loss": 2.0732, "step": 7320 }, { "epoch": 0.05693501273863555, "grad_norm": 0.11213313728006, "learning_rate": 9.920639075613662e-05, "loss": 2.0108, "step": 7321 }, { "epoch": 0.05694278968341613, "grad_norm": 0.11384539231912759, "learning_rate": 9.920617392096544e-05, "loss": 2.0534, "step": 7322 }, { "epoch": 0.05695056662819671, "grad_norm": 0.11567046592003156, "learning_rate": 9.920595705641275e-05, "loss": 2.0446, "step": 7323 }, { "epoch": 0.05695834357297729, "grad_norm": 0.11374665628392087, "learning_rate": 9.920574016247872e-05, "loss": 2.0209, "step": 7324 }, { "epoch": 0.05696612051775787, "grad_norm": 0.15272672038955676, "learning_rate": 9.920552323916346e-05, "loss": 2.0753, "step": 7325 }, { "epoch": 0.056973897462538454, "grad_norm": 0.11507919520608599, "learning_rate": 9.920530628646709e-05, "loss": 2.022, "step": 7326 }, { "epoch": 0.056981674407319036, "grad_norm": 0.11876976482721463, "learning_rate": 9.920508930438975e-05, "loss": 2.0784, "step": 7327 }, { "epoch": 0.05698945135209962, "grad_norm": 0.11372184093642748, "learning_rate": 9.920487229293157e-05, "loss": 2.0418, "step": 7328 }, { "epoch": 0.0569972282968802, "grad_norm": 0.11455706449838261, "learning_rate": 9.920465525209266e-05, "loss": 2.0835, "step": 7329 }, { "epoch": 0.05700500524166078, "grad_norm": 0.1283700592413583, "learning_rate": 9.920443818187319e-05, "loss": 2.0533, "step": 7330 }, { "epoch": 0.05701278218644136, "grad_norm": 0.11410581173630052, "learning_rate": 9.920422108227324e-05, "loss": 2.0255, "step": 7331 }, { "epoch": 0.05702055913122194, "grad_norm": 0.13052160445508676, "learning_rate": 9.920400395329298e-05, "loss": 2.073, "step": 7332 }, { "epoch": 0.05702833607600252, "grad_norm": 0.1191613829376266, "learning_rate": 9.920378679493252e-05, "loss": 2.0645, "step": 7333 }, { "epoch": 0.057036113020783104, "grad_norm": 0.1251847331736679, "learning_rate": 9.9203569607192e-05, "loss": 2.0318, "step": 7334 }, { "epoch": 0.05704388996556369, "grad_norm": 0.1362684942901784, "learning_rate": 9.920335239007153e-05, "loss": 2.0997, "step": 7335 }, { "epoch": 0.05705166691034427, "grad_norm": 0.1483626027177011, "learning_rate": 9.920313514357125e-05, "loss": 2.0388, "step": 7336 }, { "epoch": 0.057059443855124854, "grad_norm": 0.139739633511739, "learning_rate": 9.920291786769132e-05, "loss": 2.0938, "step": 7337 }, { "epoch": 0.057067220799905435, "grad_norm": 0.11628844879068367, "learning_rate": 9.920270056243181e-05, "loss": 2.0659, "step": 7338 }, { "epoch": 0.05707499774468602, "grad_norm": 0.11635697287917668, "learning_rate": 9.920248322779291e-05, "loss": 2.0553, "step": 7339 }, { "epoch": 0.0570827746894666, "grad_norm": 0.11247505244324638, "learning_rate": 9.920226586377471e-05, "loss": 2.0416, "step": 7340 }, { "epoch": 0.05709055163424718, "grad_norm": 0.12716289634675992, "learning_rate": 9.920204847037735e-05, "loss": 2.0681, "step": 7341 }, { "epoch": 0.05709832857902776, "grad_norm": 0.11721140644762584, "learning_rate": 9.920183104760096e-05, "loss": 2.0312, "step": 7342 }, { "epoch": 0.05710610552380834, "grad_norm": 0.11454808101354799, "learning_rate": 9.92016135954457e-05, "loss": 2.085, "step": 7343 }, { "epoch": 0.05711388246858892, "grad_norm": 0.11389398561063355, "learning_rate": 9.920139611391165e-05, "loss": 1.9954, "step": 7344 }, { "epoch": 0.057121659413369504, "grad_norm": 0.1100369928738008, "learning_rate": 9.920117860299896e-05, "loss": 2.0504, "step": 7345 }, { "epoch": 0.057129436358150085, "grad_norm": 0.1114470868206167, "learning_rate": 9.920096106270777e-05, "loss": 2.0081, "step": 7346 }, { "epoch": 0.057137213302930666, "grad_norm": 0.2584036105101969, "learning_rate": 9.920074349303818e-05, "loss": 2.0338, "step": 7347 }, { "epoch": 0.05714499024771125, "grad_norm": 0.11140482288737534, "learning_rate": 9.920052589399035e-05, "loss": 2.056, "step": 7348 }, { "epoch": 0.05715276719249183, "grad_norm": 0.12949030590051222, "learning_rate": 9.920030826556443e-05, "loss": 2.0189, "step": 7349 }, { "epoch": 0.05716054413727241, "grad_norm": 0.11638984021622698, "learning_rate": 9.92000906077605e-05, "loss": 2.0189, "step": 7350 }, { "epoch": 0.05716832108205299, "grad_norm": 0.12059151284145692, "learning_rate": 9.919987292057871e-05, "loss": 1.9838, "step": 7351 }, { "epoch": 0.05717609802683357, "grad_norm": 0.1321681590316112, "learning_rate": 9.919965520401918e-05, "loss": 2.0461, "step": 7352 }, { "epoch": 0.05718387497161415, "grad_norm": 0.1618088931507233, "learning_rate": 9.919943745808207e-05, "loss": 2.1078, "step": 7353 }, { "epoch": 0.057191651916394734, "grad_norm": 0.13738953806289134, "learning_rate": 9.919921968276748e-05, "loss": 2.0286, "step": 7354 }, { "epoch": 0.057199428861175315, "grad_norm": 0.13034833055223238, "learning_rate": 9.919900187807556e-05, "loss": 2.1114, "step": 7355 }, { "epoch": 0.057207205805955896, "grad_norm": 0.11571767698202723, "learning_rate": 9.919878404400642e-05, "loss": 2.0337, "step": 7356 }, { "epoch": 0.05721498275073648, "grad_norm": 0.11434687978832768, "learning_rate": 9.919856618056021e-05, "loss": 2.0822, "step": 7357 }, { "epoch": 0.05722275969551706, "grad_norm": 0.1168790683216705, "learning_rate": 9.919834828773704e-05, "loss": 2.1036, "step": 7358 }, { "epoch": 0.05723053664029764, "grad_norm": 0.11969602685106125, "learning_rate": 9.919813036553707e-05, "loss": 2.0872, "step": 7359 }, { "epoch": 0.05723831358507822, "grad_norm": 0.11759263296641563, "learning_rate": 9.91979124139604e-05, "loss": 2.0498, "step": 7360 }, { "epoch": 0.0572460905298588, "grad_norm": 0.1173240704648181, "learning_rate": 9.919769443300717e-05, "loss": 2.0669, "step": 7361 }, { "epoch": 0.05725386747463938, "grad_norm": 0.11726156281265775, "learning_rate": 9.919747642267752e-05, "loss": 2.0495, "step": 7362 }, { "epoch": 0.057261644419419964, "grad_norm": 0.11583586647449166, "learning_rate": 9.919725838297157e-05, "loss": 2.0837, "step": 7363 }, { "epoch": 0.057269421364200546, "grad_norm": 0.1551462494881624, "learning_rate": 9.919704031388945e-05, "loss": 2.0542, "step": 7364 }, { "epoch": 0.05727719830898113, "grad_norm": 0.11938238117346305, "learning_rate": 9.91968222154313e-05, "loss": 2.0091, "step": 7365 }, { "epoch": 0.05728497525376171, "grad_norm": 0.11418844564239067, "learning_rate": 9.919660408759724e-05, "loss": 2.0097, "step": 7366 }, { "epoch": 0.05729275219854229, "grad_norm": 0.11551590910482153, "learning_rate": 9.919638593038738e-05, "loss": 2.0634, "step": 7367 }, { "epoch": 0.05730052914332287, "grad_norm": 0.11225490598287227, "learning_rate": 9.91961677438019e-05, "loss": 2.0752, "step": 7368 }, { "epoch": 0.05730830608810345, "grad_norm": 0.1251253118771811, "learning_rate": 9.919594952784089e-05, "loss": 2.08, "step": 7369 }, { "epoch": 0.05731608303288403, "grad_norm": 0.11675188814680479, "learning_rate": 9.91957312825045e-05, "loss": 2.0675, "step": 7370 }, { "epoch": 0.057323859977664614, "grad_norm": 0.11286038070159404, "learning_rate": 9.919551300779286e-05, "loss": 2.0529, "step": 7371 }, { "epoch": 0.057331636922445195, "grad_norm": 0.11115413437090019, "learning_rate": 9.919529470370609e-05, "loss": 2.0389, "step": 7372 }, { "epoch": 0.057339413867225776, "grad_norm": 0.11251395906200981, "learning_rate": 9.919507637024432e-05, "loss": 2.0484, "step": 7373 }, { "epoch": 0.05734719081200636, "grad_norm": 0.11543527599531565, "learning_rate": 9.919485800740769e-05, "loss": 2.0428, "step": 7374 }, { "epoch": 0.05735496775678694, "grad_norm": 0.12200432018391759, "learning_rate": 9.919463961519632e-05, "loss": 2.0618, "step": 7375 }, { "epoch": 0.05736274470156752, "grad_norm": 0.12361217426688366, "learning_rate": 9.919442119361036e-05, "loss": 2.073, "step": 7376 }, { "epoch": 0.0573705216463481, "grad_norm": 0.11381528719832507, "learning_rate": 9.91942027426499e-05, "loss": 2.0277, "step": 7377 }, { "epoch": 0.05737829859112868, "grad_norm": 0.1285549058386188, "learning_rate": 9.919398426231512e-05, "loss": 2.0585, "step": 7378 }, { "epoch": 0.05738607553590926, "grad_norm": 0.11215587107033251, "learning_rate": 9.919376575260612e-05, "loss": 2.0739, "step": 7379 }, { "epoch": 0.057393852480689844, "grad_norm": 0.13798844479940128, "learning_rate": 9.919354721352303e-05, "loss": 2.0262, "step": 7380 }, { "epoch": 0.057401629425470425, "grad_norm": 0.1307450747844466, "learning_rate": 9.9193328645066e-05, "loss": 2.0461, "step": 7381 }, { "epoch": 0.05740940637025101, "grad_norm": 0.11775454399062057, "learning_rate": 9.919311004723514e-05, "loss": 2.0422, "step": 7382 }, { "epoch": 0.05741718331503159, "grad_norm": 0.12709860946325643, "learning_rate": 9.919289142003061e-05, "loss": 2.0278, "step": 7383 }, { "epoch": 0.05742496025981217, "grad_norm": 0.11568089128221186, "learning_rate": 9.91926727634525e-05, "loss": 2.0139, "step": 7384 }, { "epoch": 0.05743273720459275, "grad_norm": 0.13484814364482667, "learning_rate": 9.919245407750096e-05, "loss": 2.0647, "step": 7385 }, { "epoch": 0.05744051414937333, "grad_norm": 0.13643274266102656, "learning_rate": 9.919223536217613e-05, "loss": 2.0982, "step": 7386 }, { "epoch": 0.05744829109415391, "grad_norm": 0.12299380222271464, "learning_rate": 9.919201661747813e-05, "loss": 2.0806, "step": 7387 }, { "epoch": 0.057456068038934494, "grad_norm": 0.10727472790292078, "learning_rate": 9.919179784340708e-05, "loss": 2.0173, "step": 7388 }, { "epoch": 0.057463844983715075, "grad_norm": 0.1464037992969777, "learning_rate": 9.919157903996315e-05, "loss": 2.0323, "step": 7389 }, { "epoch": 0.057471621928495656, "grad_norm": 0.11909209926726469, "learning_rate": 9.919136020714641e-05, "loss": 2.0261, "step": 7390 }, { "epoch": 0.05747939887327624, "grad_norm": 0.13363020069160697, "learning_rate": 9.919114134495704e-05, "loss": 2.0513, "step": 7391 }, { "epoch": 0.05748717581805682, "grad_norm": 0.12450764218955311, "learning_rate": 9.919092245339517e-05, "loss": 2.0827, "step": 7392 }, { "epoch": 0.0574949527628374, "grad_norm": 0.1265113316331916, "learning_rate": 9.919070353246089e-05, "loss": 2.0946, "step": 7393 }, { "epoch": 0.05750272970761799, "grad_norm": 0.1330919725038461, "learning_rate": 9.919048458215437e-05, "loss": 2.0554, "step": 7394 }, { "epoch": 0.05751050665239857, "grad_norm": 0.11908040800017261, "learning_rate": 9.919026560247573e-05, "loss": 2.04, "step": 7395 }, { "epoch": 0.05751828359717915, "grad_norm": 0.12304625302117735, "learning_rate": 9.919004659342507e-05, "loss": 2.0786, "step": 7396 }, { "epoch": 0.05752606054195973, "grad_norm": 0.11209304502093334, "learning_rate": 9.91898275550026e-05, "loss": 2.0476, "step": 7397 }, { "epoch": 0.05753383748674031, "grad_norm": 0.12489827168696976, "learning_rate": 9.918960848720835e-05, "loss": 2.0339, "step": 7398 }, { "epoch": 0.05754161443152089, "grad_norm": 0.11239857301249194, "learning_rate": 9.918938939004252e-05, "loss": 1.9989, "step": 7399 }, { "epoch": 0.057549391376301474, "grad_norm": 0.13956112703523751, "learning_rate": 9.918917026350523e-05, "loss": 2.0712, "step": 7400 }, { "epoch": 0.057557168321082056, "grad_norm": 0.1156093461538687, "learning_rate": 9.918895110759658e-05, "loss": 2.0185, "step": 7401 }, { "epoch": 0.05756494526586264, "grad_norm": 0.13822746518884105, "learning_rate": 9.918873192231674e-05, "loss": 2.0868, "step": 7402 }, { "epoch": 0.05757272221064322, "grad_norm": 0.14295649024872026, "learning_rate": 9.918851270766583e-05, "loss": 2.0311, "step": 7403 }, { "epoch": 0.0575804991554238, "grad_norm": 0.12245547645361807, "learning_rate": 9.918829346364396e-05, "loss": 2.0753, "step": 7404 }, { "epoch": 0.05758827610020438, "grad_norm": 0.14937515592405817, "learning_rate": 9.918807419025127e-05, "loss": 2.0668, "step": 7405 }, { "epoch": 0.05759605304498496, "grad_norm": 0.1110052027427475, "learning_rate": 9.91878548874879e-05, "loss": 1.9786, "step": 7406 }, { "epoch": 0.05760382998976554, "grad_norm": 0.13459778736412734, "learning_rate": 9.918763555535399e-05, "loss": 2.0039, "step": 7407 }, { "epoch": 0.057611606934546124, "grad_norm": 0.11601252728097008, "learning_rate": 9.918741619384965e-05, "loss": 2.0636, "step": 7408 }, { "epoch": 0.057619383879326705, "grad_norm": 0.13990459195285343, "learning_rate": 9.918719680297501e-05, "loss": 2.0339, "step": 7409 }, { "epoch": 0.057627160824107286, "grad_norm": 0.1148956164804826, "learning_rate": 9.918697738273023e-05, "loss": 2.0302, "step": 7410 }, { "epoch": 0.05763493776888787, "grad_norm": 0.1714833387315819, "learning_rate": 9.918675793311541e-05, "loss": 2.0546, "step": 7411 }, { "epoch": 0.05764271471366845, "grad_norm": 0.129534147224444, "learning_rate": 9.918653845413068e-05, "loss": 2.0266, "step": 7412 }, { "epoch": 0.05765049165844903, "grad_norm": 0.13482054653355088, "learning_rate": 9.918631894577621e-05, "loss": 2.0147, "step": 7413 }, { "epoch": 0.05765826860322961, "grad_norm": 0.14766620787441662, "learning_rate": 9.91860994080521e-05, "loss": 2.0001, "step": 7414 }, { "epoch": 0.05766604554801019, "grad_norm": 0.11211419398913348, "learning_rate": 9.918587984095847e-05, "loss": 2.0424, "step": 7415 }, { "epoch": 0.05767382249279077, "grad_norm": 0.1505532773244873, "learning_rate": 9.918566024449548e-05, "loss": 2.0477, "step": 7416 }, { "epoch": 0.057681599437571354, "grad_norm": 0.11511631120006412, "learning_rate": 9.918544061866323e-05, "loss": 2.0471, "step": 7417 }, { "epoch": 0.057689376382351935, "grad_norm": 0.12648095584596203, "learning_rate": 9.918522096346188e-05, "loss": 2.016, "step": 7418 }, { "epoch": 0.05769715332713252, "grad_norm": 0.11919815047256148, "learning_rate": 9.918500127889154e-05, "loss": 2.0355, "step": 7419 }, { "epoch": 0.0577049302719131, "grad_norm": 0.11460459111346208, "learning_rate": 9.918478156495237e-05, "loss": 2.021, "step": 7420 }, { "epoch": 0.05771270721669368, "grad_norm": 0.11857828069134872, "learning_rate": 9.918456182164446e-05, "loss": 2.0807, "step": 7421 }, { "epoch": 0.05772048416147426, "grad_norm": 0.11127831584477549, "learning_rate": 9.918434204896798e-05, "loss": 1.9857, "step": 7422 }, { "epoch": 0.05772826110625484, "grad_norm": 0.11355732528159698, "learning_rate": 9.918412224692304e-05, "loss": 2.086, "step": 7423 }, { "epoch": 0.05773603805103542, "grad_norm": 0.11506650217826062, "learning_rate": 9.918390241550979e-05, "loss": 2.065, "step": 7424 }, { "epoch": 0.057743814995816004, "grad_norm": 0.11752845605869577, "learning_rate": 9.918368255472833e-05, "loss": 2.0511, "step": 7425 }, { "epoch": 0.057751591940596585, "grad_norm": 0.11266408702026151, "learning_rate": 9.918346266457881e-05, "loss": 2.0115, "step": 7426 }, { "epoch": 0.057759368885377166, "grad_norm": 0.135846057630756, "learning_rate": 9.918324274506137e-05, "loss": 2.0733, "step": 7427 }, { "epoch": 0.05776714583015775, "grad_norm": 0.11792876164738589, "learning_rate": 9.918302279617612e-05, "loss": 2.0747, "step": 7428 }, { "epoch": 0.05777492277493833, "grad_norm": 0.13046735872397486, "learning_rate": 9.918280281792321e-05, "loss": 2.0602, "step": 7429 }, { "epoch": 0.05778269971971891, "grad_norm": 0.12396824489091762, "learning_rate": 9.918258281030274e-05, "loss": 2.0686, "step": 7430 }, { "epoch": 0.05779047666449949, "grad_norm": 0.12576660592563585, "learning_rate": 9.91823627733149e-05, "loss": 2.0554, "step": 7431 }, { "epoch": 0.05779825360928007, "grad_norm": 0.14512517172857306, "learning_rate": 9.918214270695977e-05, "loss": 2.034, "step": 7432 }, { "epoch": 0.05780603055406065, "grad_norm": 0.11230453088206468, "learning_rate": 9.918192261123751e-05, "loss": 2.064, "step": 7433 }, { "epoch": 0.057813807498841234, "grad_norm": 0.12225530805434075, "learning_rate": 9.918170248614822e-05, "loss": 2.0929, "step": 7434 }, { "epoch": 0.057821584443621815, "grad_norm": 0.11126855562833259, "learning_rate": 9.918148233169207e-05, "loss": 2.0411, "step": 7435 }, { "epoch": 0.057829361388402396, "grad_norm": 0.2772349443554211, "learning_rate": 9.918126214786915e-05, "loss": 2.1043, "step": 7436 }, { "epoch": 0.05783713833318298, "grad_norm": 0.11332904794155305, "learning_rate": 9.918104193467963e-05, "loss": 2.0327, "step": 7437 }, { "epoch": 0.05784491527796356, "grad_norm": 0.12339085548665948, "learning_rate": 9.918082169212363e-05, "loss": 2.0771, "step": 7438 }, { "epoch": 0.05785269222274414, "grad_norm": 0.11679594154852457, "learning_rate": 9.918060142020127e-05, "loss": 2.0729, "step": 7439 }, { "epoch": 0.05786046916752472, "grad_norm": 0.12249556957332705, "learning_rate": 9.918038111891268e-05, "loss": 2.0462, "step": 7440 }, { "epoch": 0.0578682461123053, "grad_norm": 0.1278286423030396, "learning_rate": 9.9180160788258e-05, "loss": 2.0096, "step": 7441 }, { "epoch": 0.05787602305708588, "grad_norm": 0.11497004007444218, "learning_rate": 9.917994042823737e-05, "loss": 2.078, "step": 7442 }, { "epoch": 0.057883800001866464, "grad_norm": 0.1238204565307336, "learning_rate": 9.917972003885092e-05, "loss": 2.0504, "step": 7443 }, { "epoch": 0.057891576946647046, "grad_norm": 0.11649228671089018, "learning_rate": 9.917949962009877e-05, "loss": 2.005, "step": 7444 }, { "epoch": 0.05789935389142763, "grad_norm": 0.11766204624510321, "learning_rate": 9.917927917198104e-05, "loss": 2.1392, "step": 7445 }, { "epoch": 0.05790713083620821, "grad_norm": 0.11722396382598498, "learning_rate": 9.91790586944979e-05, "loss": 2.0803, "step": 7446 }, { "epoch": 0.05791490778098879, "grad_norm": 0.12069108682427987, "learning_rate": 9.917883818764946e-05, "loss": 2.058, "step": 7447 }, { "epoch": 0.05792268472576937, "grad_norm": 0.11271090532988985, "learning_rate": 9.917861765143583e-05, "loss": 2.0613, "step": 7448 }, { "epoch": 0.05793046167054995, "grad_norm": 0.11606168468542749, "learning_rate": 9.917839708585717e-05, "loss": 2.0638, "step": 7449 }, { "epoch": 0.05793823861533053, "grad_norm": 0.12280092087069061, "learning_rate": 9.91781764909136e-05, "loss": 2.0294, "step": 7450 }, { "epoch": 0.057946015560111114, "grad_norm": 0.11013493334600334, "learning_rate": 9.917795586660527e-05, "loss": 2.0552, "step": 7451 }, { "epoch": 0.057953792504891695, "grad_norm": 0.11624169833959756, "learning_rate": 9.91777352129323e-05, "loss": 2.0601, "step": 7452 }, { "epoch": 0.057961569449672276, "grad_norm": 0.11666029826459097, "learning_rate": 9.917751452989481e-05, "loss": 1.9921, "step": 7453 }, { "epoch": 0.057969346394452864, "grad_norm": 0.11826569852992327, "learning_rate": 9.917729381749295e-05, "loss": 2.0212, "step": 7454 }, { "epoch": 0.057977123339233445, "grad_norm": 0.11823005786675675, "learning_rate": 9.917707307572683e-05, "loss": 2.0297, "step": 7455 }, { "epoch": 0.057984900284014027, "grad_norm": 0.11570130461942844, "learning_rate": 9.917685230459661e-05, "loss": 2.0363, "step": 7456 }, { "epoch": 0.05799267722879461, "grad_norm": 0.13099476359087858, "learning_rate": 9.91766315041024e-05, "loss": 2.0504, "step": 7457 }, { "epoch": 0.05800045417357519, "grad_norm": 0.14913949532467086, "learning_rate": 9.917641067424434e-05, "loss": 2.1069, "step": 7458 }, { "epoch": 0.05800823111835577, "grad_norm": 0.11288524803321565, "learning_rate": 9.917618981502256e-05, "loss": 2.062, "step": 7459 }, { "epoch": 0.05801600806313635, "grad_norm": 0.12820800860182088, "learning_rate": 9.91759689264372e-05, "loss": 2.0405, "step": 7460 }, { "epoch": 0.05802378500791693, "grad_norm": 0.11713504014992618, "learning_rate": 9.917574800848837e-05, "loss": 2.1043, "step": 7461 }, { "epoch": 0.058031561952697513, "grad_norm": 0.12782870946706087, "learning_rate": 9.917552706117622e-05, "loss": 2.0004, "step": 7462 }, { "epoch": 0.058039338897478095, "grad_norm": 0.15234833730053512, "learning_rate": 9.917530608450087e-05, "loss": 2.0364, "step": 7463 }, { "epoch": 0.058047115842258676, "grad_norm": 0.11909410126755171, "learning_rate": 9.917508507846248e-05, "loss": 2.0126, "step": 7464 }, { "epoch": 0.05805489278703926, "grad_norm": 0.1348132328639029, "learning_rate": 9.917486404306115e-05, "loss": 2.0511, "step": 7465 }, { "epoch": 0.05806266973181984, "grad_norm": 0.13652183755660866, "learning_rate": 9.917464297829704e-05, "loss": 2.0835, "step": 7466 }, { "epoch": 0.05807044667660042, "grad_norm": 0.11695793499633729, "learning_rate": 9.917442188417025e-05, "loss": 2.0878, "step": 7467 }, { "epoch": 0.058078223621381, "grad_norm": 0.12602253557772888, "learning_rate": 9.917420076068093e-05, "loss": 2.0602, "step": 7468 }, { "epoch": 0.05808600056616158, "grad_norm": 0.13783004207456612, "learning_rate": 9.917397960782922e-05, "loss": 2.0419, "step": 7469 }, { "epoch": 0.05809377751094216, "grad_norm": 0.13445494715666842, "learning_rate": 9.917375842561523e-05, "loss": 2.0656, "step": 7470 }, { "epoch": 0.058101554455722744, "grad_norm": 0.11627916665087934, "learning_rate": 9.91735372140391e-05, "loss": 2.096, "step": 7471 }, { "epoch": 0.058109331400503325, "grad_norm": 0.1231448359296893, "learning_rate": 9.917331597310098e-05, "loss": 2.0301, "step": 7472 }, { "epoch": 0.058117108345283906, "grad_norm": 0.11404501265347179, "learning_rate": 9.917309470280099e-05, "loss": 2.0177, "step": 7473 }, { "epoch": 0.05812488529006449, "grad_norm": 0.11798553074474671, "learning_rate": 9.917287340313926e-05, "loss": 2.054, "step": 7474 }, { "epoch": 0.05813266223484507, "grad_norm": 0.12343489334358156, "learning_rate": 9.91726520741159e-05, "loss": 2.0965, "step": 7475 }, { "epoch": 0.05814043917962565, "grad_norm": 0.118280085898773, "learning_rate": 9.917243071573109e-05, "loss": 2.0203, "step": 7476 }, { "epoch": 0.05814821612440623, "grad_norm": 0.11585472957738509, "learning_rate": 9.917220932798494e-05, "loss": 2.0738, "step": 7477 }, { "epoch": 0.05815599306918681, "grad_norm": 0.11213067556360237, "learning_rate": 9.917198791087756e-05, "loss": 2.0215, "step": 7478 }, { "epoch": 0.05816377001396739, "grad_norm": 0.12740295721945713, "learning_rate": 9.917176646440912e-05, "loss": 2.079, "step": 7479 }, { "epoch": 0.058171546958747974, "grad_norm": 0.11471153531832402, "learning_rate": 9.917154498857973e-05, "loss": 2.0859, "step": 7480 }, { "epoch": 0.058179323903528556, "grad_norm": 0.13008933725954638, "learning_rate": 9.91713234833895e-05, "loss": 2.0544, "step": 7481 }, { "epoch": 0.05818710084830914, "grad_norm": 0.1102034004217416, "learning_rate": 9.917110194883862e-05, "loss": 1.9946, "step": 7482 }, { "epoch": 0.05819487779308972, "grad_norm": 0.11966193397791643, "learning_rate": 9.917088038492718e-05, "loss": 2.0131, "step": 7483 }, { "epoch": 0.0582026547378703, "grad_norm": 0.11415026439800992, "learning_rate": 9.917065879165533e-05, "loss": 2.0455, "step": 7484 }, { "epoch": 0.05821043168265088, "grad_norm": 0.11716951365646963, "learning_rate": 9.917043716902318e-05, "loss": 2.0375, "step": 7485 }, { "epoch": 0.05821820862743146, "grad_norm": 0.11383305644448513, "learning_rate": 9.917021551703087e-05, "loss": 2.0304, "step": 7486 }, { "epoch": 0.05822598557221204, "grad_norm": 0.11868473200172013, "learning_rate": 9.916999383567856e-05, "loss": 2.0678, "step": 7487 }, { "epoch": 0.058233762516992624, "grad_norm": 0.11592959182103983, "learning_rate": 9.916977212496637e-05, "loss": 2.0278, "step": 7488 }, { "epoch": 0.058241539461773205, "grad_norm": 0.11464261505536193, "learning_rate": 9.916955038489441e-05, "loss": 2.0582, "step": 7489 }, { "epoch": 0.058249316406553786, "grad_norm": 0.11591892480289091, "learning_rate": 9.916932861546282e-05, "loss": 2.0595, "step": 7490 }, { "epoch": 0.05825709335133437, "grad_norm": 0.11047448513131954, "learning_rate": 9.916910681667176e-05, "loss": 2.0349, "step": 7491 }, { "epoch": 0.05826487029611495, "grad_norm": 0.12622742370441134, "learning_rate": 9.916888498852133e-05, "loss": 2.0436, "step": 7492 }, { "epoch": 0.05827264724089553, "grad_norm": 0.10963956701414064, "learning_rate": 9.916866313101167e-05, "loss": 1.9588, "step": 7493 }, { "epoch": 0.05828042418567611, "grad_norm": 0.11104588536033436, "learning_rate": 9.916844124414292e-05, "loss": 2.0432, "step": 7494 }, { "epoch": 0.05828820113045669, "grad_norm": 0.11576581576916865, "learning_rate": 9.91682193279152e-05, "loss": 2.0846, "step": 7495 }, { "epoch": 0.05829597807523727, "grad_norm": 0.11589216784717926, "learning_rate": 9.916799738232867e-05, "loss": 2.0446, "step": 7496 }, { "epoch": 0.058303755020017854, "grad_norm": 0.2163570268419858, "learning_rate": 9.916777540738345e-05, "loss": 2.0648, "step": 7497 }, { "epoch": 0.058311531964798435, "grad_norm": 0.11195926594485354, "learning_rate": 9.916755340307964e-05, "loss": 2.0245, "step": 7498 }, { "epoch": 0.05831930890957902, "grad_norm": 0.11997084920972488, "learning_rate": 9.916733136941741e-05, "loss": 2.0399, "step": 7499 }, { "epoch": 0.0583270858543596, "grad_norm": 0.11125614661600101, "learning_rate": 9.916710930639688e-05, "loss": 2.0659, "step": 7500 }, { "epoch": 0.05833486279914018, "grad_norm": 0.13117801317454117, "learning_rate": 9.91668872140182e-05, "loss": 2.0502, "step": 7501 }, { "epoch": 0.05834263974392076, "grad_norm": 0.11357263439803823, "learning_rate": 9.916666509228148e-05, "loss": 2.07, "step": 7502 }, { "epoch": 0.05835041668870134, "grad_norm": 0.123920779321248, "learning_rate": 9.916644294118685e-05, "loss": 2.0887, "step": 7503 }, { "epoch": 0.05835819363348192, "grad_norm": 0.13501702439880905, "learning_rate": 9.916622076073446e-05, "loss": 2.0582, "step": 7504 }, { "epoch": 0.058365970578262504, "grad_norm": 0.1103819914064411, "learning_rate": 9.916599855092443e-05, "loss": 2.0425, "step": 7505 }, { "epoch": 0.058373747523043085, "grad_norm": 0.12950449920848472, "learning_rate": 9.91657763117569e-05, "loss": 2.0678, "step": 7506 }, { "epoch": 0.058381524467823666, "grad_norm": 0.11820353933869587, "learning_rate": 9.916555404323202e-05, "loss": 2.0862, "step": 7507 }, { "epoch": 0.05838930141260425, "grad_norm": 0.12069252976540332, "learning_rate": 9.916533174534988e-05, "loss": 2.0315, "step": 7508 }, { "epoch": 0.05839707835738483, "grad_norm": 0.14287791677193287, "learning_rate": 9.916510941811063e-05, "loss": 2.0743, "step": 7509 }, { "epoch": 0.05840485530216541, "grad_norm": 0.13269052093646416, "learning_rate": 9.916488706151443e-05, "loss": 2.0026, "step": 7510 }, { "epoch": 0.05841263224694599, "grad_norm": 0.111866887099119, "learning_rate": 9.916466467556138e-05, "loss": 2.079, "step": 7511 }, { "epoch": 0.05842040919172657, "grad_norm": 0.12673923452145314, "learning_rate": 9.916444226025163e-05, "loss": 2.0089, "step": 7512 }, { "epoch": 0.05842818613650716, "grad_norm": 0.12185858117514284, "learning_rate": 9.91642198155853e-05, "loss": 2.0966, "step": 7513 }, { "epoch": 0.05843596308128774, "grad_norm": 0.11180420297939266, "learning_rate": 9.916399734156254e-05, "loss": 2.0671, "step": 7514 }, { "epoch": 0.05844374002606832, "grad_norm": 0.130820348627403, "learning_rate": 9.916377483818347e-05, "loss": 2.0014, "step": 7515 }, { "epoch": 0.0584515169708489, "grad_norm": 0.12454968799780716, "learning_rate": 9.916355230544823e-05, "loss": 2.0211, "step": 7516 }, { "epoch": 0.058459293915629484, "grad_norm": 0.11712522855180509, "learning_rate": 9.916332974335694e-05, "loss": 2.0798, "step": 7517 }, { "epoch": 0.058467070860410066, "grad_norm": 0.1233086911365633, "learning_rate": 9.916310715190973e-05, "loss": 1.9656, "step": 7518 }, { "epoch": 0.05847484780519065, "grad_norm": 0.11886523463666208, "learning_rate": 9.916288453110676e-05, "loss": 2.1218, "step": 7519 }, { "epoch": 0.05848262474997123, "grad_norm": 0.11826903366364958, "learning_rate": 9.916266188094814e-05, "loss": 2.0525, "step": 7520 }, { "epoch": 0.05849040169475181, "grad_norm": 0.11123011804560198, "learning_rate": 9.916243920143402e-05, "loss": 2.0984, "step": 7521 }, { "epoch": 0.05849817863953239, "grad_norm": 0.11651083752507603, "learning_rate": 9.916221649256451e-05, "loss": 2.0256, "step": 7522 }, { "epoch": 0.05850595558431297, "grad_norm": 0.11270303248343042, "learning_rate": 9.916199375433977e-05, "loss": 2.0568, "step": 7523 }, { "epoch": 0.05851373252909355, "grad_norm": 0.12076822177318329, "learning_rate": 9.916177098675992e-05, "loss": 2.0477, "step": 7524 }, { "epoch": 0.058521509473874134, "grad_norm": 0.12407959980726131, "learning_rate": 9.916154818982508e-05, "loss": 2.0528, "step": 7525 }, { "epoch": 0.058529286418654715, "grad_norm": 0.11145595774919988, "learning_rate": 9.91613253635354e-05, "loss": 2.0339, "step": 7526 }, { "epoch": 0.058537063363435296, "grad_norm": 0.11484255293388924, "learning_rate": 9.916110250789102e-05, "loss": 2.0845, "step": 7527 }, { "epoch": 0.05854484030821588, "grad_norm": 0.11631167692644523, "learning_rate": 9.916087962289204e-05, "loss": 2.053, "step": 7528 }, { "epoch": 0.05855261725299646, "grad_norm": 0.11355996702385265, "learning_rate": 9.916065670853863e-05, "loss": 2.069, "step": 7529 }, { "epoch": 0.05856039419777704, "grad_norm": 0.11433291395059941, "learning_rate": 9.916043376483089e-05, "loss": 2.0392, "step": 7530 }, { "epoch": 0.05856817114255762, "grad_norm": 0.1846347839327645, "learning_rate": 9.916021079176899e-05, "loss": 2.0355, "step": 7531 }, { "epoch": 0.0585759480873382, "grad_norm": 0.126868717476884, "learning_rate": 9.915998778935304e-05, "loss": 2.0253, "step": 7532 }, { "epoch": 0.05858372503211878, "grad_norm": 0.11957493569095279, "learning_rate": 9.915976475758318e-05, "loss": 2.0535, "step": 7533 }, { "epoch": 0.058591501976899364, "grad_norm": 0.13423046553616902, "learning_rate": 9.915954169645953e-05, "loss": 2.0416, "step": 7534 }, { "epoch": 0.058599278921679945, "grad_norm": 0.11197704682176271, "learning_rate": 9.915931860598223e-05, "loss": 2.0489, "step": 7535 }, { "epoch": 0.058607055866460527, "grad_norm": 0.14986315620559706, "learning_rate": 9.915909548615143e-05, "loss": 2.0609, "step": 7536 }, { "epoch": 0.05861483281124111, "grad_norm": 0.13103089772964355, "learning_rate": 9.915887233696723e-05, "loss": 2.0747, "step": 7537 }, { "epoch": 0.05862260975602169, "grad_norm": 0.19178522691751682, "learning_rate": 9.915864915842981e-05, "loss": 2.0435, "step": 7538 }, { "epoch": 0.05863038670080227, "grad_norm": 0.15269474923115747, "learning_rate": 9.915842595053926e-05, "loss": 2.0466, "step": 7539 }, { "epoch": 0.05863816364558285, "grad_norm": 0.13894721546841915, "learning_rate": 9.915820271329572e-05, "loss": 2.0291, "step": 7540 }, { "epoch": 0.05864594059036343, "grad_norm": 0.11859781203887625, "learning_rate": 9.915797944669935e-05, "loss": 2.0817, "step": 7541 }, { "epoch": 0.058653717535144013, "grad_norm": 0.11959049139757232, "learning_rate": 9.915775615075026e-05, "loss": 2.0311, "step": 7542 }, { "epoch": 0.058661494479924595, "grad_norm": 0.11624052292647837, "learning_rate": 9.91575328254486e-05, "loss": 2.0354, "step": 7543 }, { "epoch": 0.058669271424705176, "grad_norm": 0.11589791577818429, "learning_rate": 9.915730947079448e-05, "loss": 2.0477, "step": 7544 }, { "epoch": 0.05867704836948576, "grad_norm": 0.11347983987782555, "learning_rate": 9.915708608678805e-05, "loss": 2.0044, "step": 7545 }, { "epoch": 0.05868482531426634, "grad_norm": 0.11395774149830461, "learning_rate": 9.915686267342943e-05, "loss": 2.0008, "step": 7546 }, { "epoch": 0.05869260225904692, "grad_norm": 0.3957218456695842, "learning_rate": 9.915663923071878e-05, "loss": 2.0906, "step": 7547 }, { "epoch": 0.0587003792038275, "grad_norm": 0.12921110916838086, "learning_rate": 9.91564157586562e-05, "loss": 2.0454, "step": 7548 }, { "epoch": 0.05870815614860808, "grad_norm": 0.13883644660121944, "learning_rate": 9.915619225724186e-05, "loss": 2.0215, "step": 7549 }, { "epoch": 0.05871593309338866, "grad_norm": 0.11538572853157562, "learning_rate": 9.915596872647586e-05, "loss": 2.018, "step": 7550 }, { "epoch": 0.058723710038169244, "grad_norm": 0.14244868686008327, "learning_rate": 9.915574516635835e-05, "loss": 2.0535, "step": 7551 }, { "epoch": 0.058731486982949825, "grad_norm": 0.13562313818516086, "learning_rate": 9.915552157688945e-05, "loss": 2.0705, "step": 7552 }, { "epoch": 0.058739263927730406, "grad_norm": 0.11999163990050388, "learning_rate": 9.915529795806933e-05, "loss": 1.9842, "step": 7553 }, { "epoch": 0.05874704087251099, "grad_norm": 0.15447066600803636, "learning_rate": 9.915507430989806e-05, "loss": 2.0381, "step": 7554 }, { "epoch": 0.05875481781729157, "grad_norm": 0.12360698055745854, "learning_rate": 9.915485063237584e-05, "loss": 2.0125, "step": 7555 }, { "epoch": 0.05876259476207215, "grad_norm": 0.12493340958820462, "learning_rate": 9.915462692550277e-05, "loss": 2.0118, "step": 7556 }, { "epoch": 0.05877037170685273, "grad_norm": 0.19345708252518848, "learning_rate": 9.915440318927898e-05, "loss": 2.1108, "step": 7557 }, { "epoch": 0.05877814865163331, "grad_norm": 0.12427117680677137, "learning_rate": 9.915417942370462e-05, "loss": 2.0515, "step": 7558 }, { "epoch": 0.05878592559641389, "grad_norm": 0.12380420100696381, "learning_rate": 9.915395562877981e-05, "loss": 2.0865, "step": 7559 }, { "epoch": 0.058793702541194474, "grad_norm": 0.16714505352426492, "learning_rate": 9.915373180450468e-05, "loss": 2.0632, "step": 7560 }, { "epoch": 0.058801479485975056, "grad_norm": 0.11929836055945735, "learning_rate": 9.915350795087939e-05, "loss": 2.0752, "step": 7561 }, { "epoch": 0.05880925643075564, "grad_norm": 0.1248440368265182, "learning_rate": 9.915328406790404e-05, "loss": 2.0481, "step": 7562 }, { "epoch": 0.05881703337553622, "grad_norm": 0.20750008578442536, "learning_rate": 9.915306015557879e-05, "loss": 2.1096, "step": 7563 }, { "epoch": 0.0588248103203168, "grad_norm": 0.12070152580040389, "learning_rate": 9.915283621390376e-05, "loss": 2.086, "step": 7564 }, { "epoch": 0.05883258726509738, "grad_norm": 0.13652945429431904, "learning_rate": 9.915261224287908e-05, "loss": 2.0661, "step": 7565 }, { "epoch": 0.05884036420987796, "grad_norm": 0.1273334754487841, "learning_rate": 9.91523882425049e-05, "loss": 2.0541, "step": 7566 }, { "epoch": 0.05884814115465854, "grad_norm": 0.12832348005487443, "learning_rate": 9.915216421278136e-05, "loss": 2.0832, "step": 7567 }, { "epoch": 0.058855918099439124, "grad_norm": 0.14615004044999885, "learning_rate": 9.915194015370855e-05, "loss": 2.0591, "step": 7568 }, { "epoch": 0.058863695044219705, "grad_norm": 0.1460023171082959, "learning_rate": 9.915171606528664e-05, "loss": 2.0228, "step": 7569 }, { "epoch": 0.058871471989000286, "grad_norm": 0.13659033906810078, "learning_rate": 9.915149194751576e-05, "loss": 2.071, "step": 7570 }, { "epoch": 0.05887924893378087, "grad_norm": 0.14452598079721096, "learning_rate": 9.915126780039604e-05, "loss": 2.0806, "step": 7571 }, { "epoch": 0.058887025878561455, "grad_norm": 0.12136001638265244, "learning_rate": 9.915104362392762e-05, "loss": 2.0801, "step": 7572 }, { "epoch": 0.058894802823342036, "grad_norm": 0.13437590060496307, "learning_rate": 9.915081941811062e-05, "loss": 2.0305, "step": 7573 }, { "epoch": 0.05890257976812262, "grad_norm": 0.14272864538485347, "learning_rate": 9.915059518294518e-05, "loss": 2.1233, "step": 7574 }, { "epoch": 0.0589103567129032, "grad_norm": 0.11972432396865074, "learning_rate": 9.915037091843145e-05, "loss": 2.0467, "step": 7575 }, { "epoch": 0.05891813365768378, "grad_norm": 0.1277863889682795, "learning_rate": 9.915014662456953e-05, "loss": 2.0359, "step": 7576 }, { "epoch": 0.05892591060246436, "grad_norm": 0.14648687670566843, "learning_rate": 9.914992230135958e-05, "loss": 2.0537, "step": 7577 }, { "epoch": 0.05893368754724494, "grad_norm": 0.11705335645303025, "learning_rate": 9.914969794880174e-05, "loss": 2.0643, "step": 7578 }, { "epoch": 0.05894146449202552, "grad_norm": 0.15089901593895647, "learning_rate": 9.91494735668961e-05, "loss": 1.9875, "step": 7579 }, { "epoch": 0.058949241436806105, "grad_norm": 0.16026583435528308, "learning_rate": 9.914924915564285e-05, "loss": 2.0366, "step": 7580 }, { "epoch": 0.058957018381586686, "grad_norm": 0.12212169499161138, "learning_rate": 9.914902471504209e-05, "loss": 2.0099, "step": 7581 }, { "epoch": 0.05896479532636727, "grad_norm": 0.15003412082184026, "learning_rate": 9.914880024509397e-05, "loss": 2.0927, "step": 7582 }, { "epoch": 0.05897257227114785, "grad_norm": 0.17771332836924647, "learning_rate": 9.91485757457986e-05, "loss": 2.0585, "step": 7583 }, { "epoch": 0.05898034921592843, "grad_norm": 0.11753253848874982, "learning_rate": 9.914835121715615e-05, "loss": 2.0896, "step": 7584 }, { "epoch": 0.05898812616070901, "grad_norm": 0.16645733274023972, "learning_rate": 9.914812665916674e-05, "loss": 2.06, "step": 7585 }, { "epoch": 0.05899590310548959, "grad_norm": 0.15900255729796073, "learning_rate": 9.914790207183049e-05, "loss": 2.0645, "step": 7586 }, { "epoch": 0.05900368005027017, "grad_norm": 0.12826124421347485, "learning_rate": 9.914767745514755e-05, "loss": 2.0181, "step": 7587 }, { "epoch": 0.059011456995050754, "grad_norm": 0.1892204658489011, "learning_rate": 9.914745280911803e-05, "loss": 2.028, "step": 7588 }, { "epoch": 0.059019233939831335, "grad_norm": 0.11753211279533099, "learning_rate": 9.914722813374209e-05, "loss": 2.0579, "step": 7589 }, { "epoch": 0.059027010884611916, "grad_norm": 0.18426445699100874, "learning_rate": 9.914700342901986e-05, "loss": 2.0355, "step": 7590 }, { "epoch": 0.0590347878293925, "grad_norm": 0.13882857941967594, "learning_rate": 9.914677869495147e-05, "loss": 2.0427, "step": 7591 }, { "epoch": 0.05904256477417308, "grad_norm": 0.16460038718413694, "learning_rate": 9.914655393153705e-05, "loss": 2.0504, "step": 7592 }, { "epoch": 0.05905034171895366, "grad_norm": 0.1833353144361035, "learning_rate": 9.914632913877676e-05, "loss": 2.0742, "step": 7593 }, { "epoch": 0.05905811866373424, "grad_norm": 0.1194807630475359, "learning_rate": 9.914610431667069e-05, "loss": 2.0231, "step": 7594 }, { "epoch": 0.05906589560851482, "grad_norm": 0.15544760710052463, "learning_rate": 9.914587946521898e-05, "loss": 2.0417, "step": 7595 }, { "epoch": 0.0590736725532954, "grad_norm": 0.12122140661196666, "learning_rate": 9.914565458442181e-05, "loss": 2.0079, "step": 7596 }, { "epoch": 0.059081449498075984, "grad_norm": 0.14465401732416622, "learning_rate": 9.914542967427928e-05, "loss": 2.0163, "step": 7597 }, { "epoch": 0.059089226442856566, "grad_norm": 0.13233180869588126, "learning_rate": 9.914520473479153e-05, "loss": 2.0239, "step": 7598 }, { "epoch": 0.05909700338763715, "grad_norm": 0.13811789864792073, "learning_rate": 9.914497976595868e-05, "loss": 2.0822, "step": 7599 }, { "epoch": 0.05910478033241773, "grad_norm": 0.1408528633604503, "learning_rate": 9.914475476778089e-05, "loss": 2.0194, "step": 7600 }, { "epoch": 0.05911255727719831, "grad_norm": 0.11524226694854485, "learning_rate": 9.914452974025829e-05, "loss": 2.0391, "step": 7601 }, { "epoch": 0.05912033422197889, "grad_norm": 0.13612674809526282, "learning_rate": 9.9144304683391e-05, "loss": 2.0161, "step": 7602 }, { "epoch": 0.05912811116675947, "grad_norm": 0.12761430236246607, "learning_rate": 9.914407959717915e-05, "loss": 2.0971, "step": 7603 }, { "epoch": 0.05913588811154005, "grad_norm": 0.14295149738787327, "learning_rate": 9.91438544816229e-05, "loss": 2.0678, "step": 7604 }, { "epoch": 0.059143665056320634, "grad_norm": 0.2566389856217971, "learning_rate": 9.914362933672237e-05, "loss": 2.0448, "step": 7605 }, { "epoch": 0.059151442001101215, "grad_norm": 0.14338032117830687, "learning_rate": 9.914340416247769e-05, "loss": 2.08, "step": 7606 }, { "epoch": 0.059159218945881796, "grad_norm": 0.11289834766301886, "learning_rate": 9.9143178958889e-05, "loss": 2.0386, "step": 7607 }, { "epoch": 0.05916699589066238, "grad_norm": 0.1381761317858515, "learning_rate": 9.914295372595644e-05, "loss": 1.994, "step": 7608 }, { "epoch": 0.05917477283544296, "grad_norm": 0.12083104761321338, "learning_rate": 9.914272846368013e-05, "loss": 2.071, "step": 7609 }, { "epoch": 0.05918254978022354, "grad_norm": 0.11749384865549371, "learning_rate": 9.914250317206022e-05, "loss": 2.0613, "step": 7610 }, { "epoch": 0.05919032672500412, "grad_norm": 0.11872089957830202, "learning_rate": 9.914227785109682e-05, "loss": 1.9858, "step": 7611 }, { "epoch": 0.0591981036697847, "grad_norm": 0.11747045332708317, "learning_rate": 9.91420525007901e-05, "loss": 2.0278, "step": 7612 }, { "epoch": 0.05920588061456528, "grad_norm": 0.15801757636155203, "learning_rate": 9.914182712114018e-05, "loss": 2.0935, "step": 7613 }, { "epoch": 0.059213657559345864, "grad_norm": 0.15052139335651954, "learning_rate": 9.914160171214717e-05, "loss": 2.0513, "step": 7614 }, { "epoch": 0.059221434504126445, "grad_norm": 0.11693283638151852, "learning_rate": 9.914137627381124e-05, "loss": 2.031, "step": 7615 }, { "epoch": 0.059229211448907026, "grad_norm": 0.16905672112555314, "learning_rate": 9.91411508061325e-05, "loss": 2.0791, "step": 7616 }, { "epoch": 0.05923698839368761, "grad_norm": 0.25832107226620743, "learning_rate": 9.91409253091111e-05, "loss": 2.0698, "step": 7617 }, { "epoch": 0.05924476533846819, "grad_norm": 0.11307510213646299, "learning_rate": 9.914069978274719e-05, "loss": 2.077, "step": 7618 }, { "epoch": 0.05925254228324877, "grad_norm": 0.1160759788985589, "learning_rate": 9.914047422704085e-05, "loss": 2.0763, "step": 7619 }, { "epoch": 0.05926031922802935, "grad_norm": 0.11890251276934219, "learning_rate": 9.914024864199228e-05, "loss": 2.0169, "step": 7620 }, { "epoch": 0.05926809617280993, "grad_norm": 0.11618904963388081, "learning_rate": 9.914002302760154e-05, "loss": 2.045, "step": 7621 }, { "epoch": 0.059275873117590513, "grad_norm": 0.11606841041285455, "learning_rate": 9.913979738386885e-05, "loss": 2.0473, "step": 7622 }, { "epoch": 0.059283650062371095, "grad_norm": 0.12055226873000265, "learning_rate": 9.91395717107943e-05, "loss": 2.084, "step": 7623 }, { "epoch": 0.059291427007151676, "grad_norm": 0.1279288111538917, "learning_rate": 9.9139346008378e-05, "loss": 2.0026, "step": 7624 }, { "epoch": 0.05929920395193226, "grad_norm": 0.11805639706371716, "learning_rate": 9.913912027662014e-05, "loss": 2.043, "step": 7625 }, { "epoch": 0.05930698089671284, "grad_norm": 0.12451481001653268, "learning_rate": 9.91388945155208e-05, "loss": 2.0382, "step": 7626 }, { "epoch": 0.05931475784149342, "grad_norm": 0.12401348736905961, "learning_rate": 9.913866872508016e-05, "loss": 2.0156, "step": 7627 }, { "epoch": 0.059322534786274, "grad_norm": 0.12533955473523462, "learning_rate": 9.913844290529833e-05, "loss": 2.0682, "step": 7628 }, { "epoch": 0.05933031173105458, "grad_norm": 0.1202635933056959, "learning_rate": 9.913821705617545e-05, "loss": 2.1047, "step": 7629 }, { "epoch": 0.05933808867583516, "grad_norm": 0.12436141816108451, "learning_rate": 9.913799117771168e-05, "loss": 2.1152, "step": 7630 }, { "epoch": 0.059345865620615744, "grad_norm": 0.12494358225220954, "learning_rate": 9.91377652699071e-05, "loss": 2.0689, "step": 7631 }, { "epoch": 0.05935364256539633, "grad_norm": 0.1257730257228662, "learning_rate": 9.913753933276189e-05, "loss": 2.0935, "step": 7632 }, { "epoch": 0.05936141951017691, "grad_norm": 0.1347440879382252, "learning_rate": 9.913731336627617e-05, "loss": 1.9812, "step": 7633 }, { "epoch": 0.059369196454957494, "grad_norm": 0.1288301211412563, "learning_rate": 9.913708737045009e-05, "loss": 2.0426, "step": 7634 }, { "epoch": 0.059376973399738076, "grad_norm": 0.12464558876893406, "learning_rate": 9.913686134528375e-05, "loss": 2.0169, "step": 7635 }, { "epoch": 0.05938475034451866, "grad_norm": 0.14371582144294123, "learning_rate": 9.913663529077731e-05, "loss": 2.0627, "step": 7636 }, { "epoch": 0.05939252728929924, "grad_norm": 0.12207884121511857, "learning_rate": 9.913640920693091e-05, "loss": 1.9837, "step": 7637 }, { "epoch": 0.05940030423407982, "grad_norm": 0.15160376682842222, "learning_rate": 9.913618309374467e-05, "loss": 2.0014, "step": 7638 }, { "epoch": 0.0594080811788604, "grad_norm": 0.1377131569221161, "learning_rate": 9.913595695121874e-05, "loss": 2.0701, "step": 7639 }, { "epoch": 0.05941585812364098, "grad_norm": 0.11876288545951741, "learning_rate": 9.913573077935323e-05, "loss": 2.05, "step": 7640 }, { "epoch": 0.05942363506842156, "grad_norm": 0.17148276057906695, "learning_rate": 9.91355045781483e-05, "loss": 2.0514, "step": 7641 }, { "epoch": 0.059431412013202144, "grad_norm": 0.15658516461494554, "learning_rate": 9.913527834760407e-05, "loss": 2.0129, "step": 7642 }, { "epoch": 0.059439188957982725, "grad_norm": 0.12313388943898956, "learning_rate": 9.91350520877207e-05, "loss": 2.052, "step": 7643 }, { "epoch": 0.059446965902763306, "grad_norm": 0.15758350801966825, "learning_rate": 9.913482579849828e-05, "loss": 2.0781, "step": 7644 }, { "epoch": 0.05945474284754389, "grad_norm": 0.14054886791056095, "learning_rate": 9.913459947993698e-05, "loss": 2.0317, "step": 7645 }, { "epoch": 0.05946251979232447, "grad_norm": 0.1178212812799334, "learning_rate": 9.913437313203692e-05, "loss": 2.0834, "step": 7646 }, { "epoch": 0.05947029673710505, "grad_norm": 0.17709259899685875, "learning_rate": 9.913414675479826e-05, "loss": 1.9771, "step": 7647 }, { "epoch": 0.05947807368188563, "grad_norm": 0.16247959010429983, "learning_rate": 9.913392034822111e-05, "loss": 2.0958, "step": 7648 }, { "epoch": 0.05948585062666621, "grad_norm": 0.11748648208906484, "learning_rate": 9.913369391230563e-05, "loss": 2.0516, "step": 7649 }, { "epoch": 0.05949362757144679, "grad_norm": 0.15488873533851424, "learning_rate": 9.913346744705189e-05, "loss": 2.0378, "step": 7650 }, { "epoch": 0.059501404516227374, "grad_norm": 0.13892372666272734, "learning_rate": 9.913324095246012e-05, "loss": 2.0316, "step": 7651 }, { "epoch": 0.059509181461007955, "grad_norm": 0.11759479228877498, "learning_rate": 9.913301442853038e-05, "loss": 2.06, "step": 7652 }, { "epoch": 0.059516958405788536, "grad_norm": 0.13446944454772242, "learning_rate": 9.913278787526285e-05, "loss": 2.0363, "step": 7653 }, { "epoch": 0.05952473535056912, "grad_norm": 0.12596769254365525, "learning_rate": 9.913256129265764e-05, "loss": 2.0729, "step": 7654 }, { "epoch": 0.0595325122953497, "grad_norm": 0.1177876614547134, "learning_rate": 9.913233468071489e-05, "loss": 1.9886, "step": 7655 }, { "epoch": 0.05954028924013028, "grad_norm": 0.127306232035757, "learning_rate": 9.913210803943474e-05, "loss": 2.0392, "step": 7656 }, { "epoch": 0.05954806618491086, "grad_norm": 0.11089879956187318, "learning_rate": 9.913188136881732e-05, "loss": 2.0704, "step": 7657 }, { "epoch": 0.05955584312969144, "grad_norm": 0.12405336106654108, "learning_rate": 9.913165466886278e-05, "loss": 2.0325, "step": 7658 }, { "epoch": 0.05956362007447202, "grad_norm": 0.11710573339571412, "learning_rate": 9.913142793957124e-05, "loss": 2.0944, "step": 7659 }, { "epoch": 0.059571397019252605, "grad_norm": 0.12345728524386201, "learning_rate": 9.913120118094285e-05, "loss": 2.0038, "step": 7660 }, { "epoch": 0.059579173964033186, "grad_norm": 0.11907236465994628, "learning_rate": 9.913097439297773e-05, "loss": 1.9553, "step": 7661 }, { "epoch": 0.05958695090881377, "grad_norm": 0.11671357710959061, "learning_rate": 9.913074757567601e-05, "loss": 2.0688, "step": 7662 }, { "epoch": 0.05959472785359435, "grad_norm": 0.11611822490061147, "learning_rate": 9.913052072903785e-05, "loss": 2.0711, "step": 7663 }, { "epoch": 0.05960250479837493, "grad_norm": 0.11573727699937601, "learning_rate": 9.913029385306336e-05, "loss": 2.0221, "step": 7664 }, { "epoch": 0.05961028174315551, "grad_norm": 0.1501731860552154, "learning_rate": 9.91300669477527e-05, "loss": 2.0549, "step": 7665 }, { "epoch": 0.05961805868793609, "grad_norm": 0.12044165927671957, "learning_rate": 9.9129840013106e-05, "loss": 2.0872, "step": 7666 }, { "epoch": 0.05962583563271667, "grad_norm": 0.12114210371491457, "learning_rate": 9.912961304912335e-05, "loss": 1.997, "step": 7667 }, { "epoch": 0.059633612577497254, "grad_norm": 0.11363848973770882, "learning_rate": 9.912938605580496e-05, "loss": 2.0237, "step": 7668 }, { "epoch": 0.059641389522277835, "grad_norm": 0.11459117309316014, "learning_rate": 9.912915903315093e-05, "loss": 2.0354, "step": 7669 }, { "epoch": 0.059649166467058416, "grad_norm": 0.11782080299522027, "learning_rate": 9.912893198116137e-05, "loss": 2.0726, "step": 7670 }, { "epoch": 0.059656943411839, "grad_norm": 0.11307809934770875, "learning_rate": 9.912870489983645e-05, "loss": 2.0488, "step": 7671 }, { "epoch": 0.05966472035661958, "grad_norm": 0.11620285276066848, "learning_rate": 9.912847778917631e-05, "loss": 2.0427, "step": 7672 }, { "epoch": 0.05967249730140016, "grad_norm": 0.11130597683544953, "learning_rate": 9.912825064918106e-05, "loss": 2.0657, "step": 7673 }, { "epoch": 0.05968027424618074, "grad_norm": 0.1337394271846365, "learning_rate": 9.912802347985087e-05, "loss": 2.0803, "step": 7674 }, { "epoch": 0.05968805119096132, "grad_norm": 0.11314860251576339, "learning_rate": 9.912779628118582e-05, "loss": 2.0291, "step": 7675 }, { "epoch": 0.0596958281357419, "grad_norm": 0.11557555886861799, "learning_rate": 9.91275690531861e-05, "loss": 2.0387, "step": 7676 }, { "epoch": 0.059703605080522484, "grad_norm": 0.11997184365545846, "learning_rate": 9.912734179585181e-05, "loss": 2.0244, "step": 7677 }, { "epoch": 0.059711382025303066, "grad_norm": 0.1131493688929409, "learning_rate": 9.91271145091831e-05, "loss": 2.0402, "step": 7678 }, { "epoch": 0.05971915897008365, "grad_norm": 0.12382540576362917, "learning_rate": 9.912688719318013e-05, "loss": 2.0021, "step": 7679 }, { "epoch": 0.05972693591486423, "grad_norm": 0.12400692960905434, "learning_rate": 9.912665984784299e-05, "loss": 2.0507, "step": 7680 }, { "epoch": 0.05973471285964481, "grad_norm": 0.116180455126591, "learning_rate": 9.912643247317184e-05, "loss": 2.0633, "step": 7681 }, { "epoch": 0.05974248980442539, "grad_norm": 0.11155917291535362, "learning_rate": 9.91262050691668e-05, "loss": 2.0508, "step": 7682 }, { "epoch": 0.05975026674920597, "grad_norm": 0.11836056586260955, "learning_rate": 9.912597763582804e-05, "loss": 1.9713, "step": 7683 }, { "epoch": 0.05975804369398655, "grad_norm": 0.24248717730846084, "learning_rate": 9.912575017315569e-05, "loss": 2.0454, "step": 7684 }, { "epoch": 0.059765820638767134, "grad_norm": 0.1191329404693083, "learning_rate": 9.912552268114984e-05, "loss": 2.0872, "step": 7685 }, { "epoch": 0.059773597583547715, "grad_norm": 0.12083278428705511, "learning_rate": 9.912529515981067e-05, "loss": 2.0089, "step": 7686 }, { "epoch": 0.059781374528328296, "grad_norm": 0.12327558447912126, "learning_rate": 9.912506760913828e-05, "loss": 2.0496, "step": 7687 }, { "epoch": 0.05978915147310888, "grad_norm": 0.1216708152554488, "learning_rate": 9.912484002913286e-05, "loss": 2.0947, "step": 7688 }, { "epoch": 0.05979692841788946, "grad_norm": 0.12509211493675249, "learning_rate": 9.91246124197945e-05, "loss": 2.104, "step": 7689 }, { "epoch": 0.05980470536267004, "grad_norm": 0.12819832619416235, "learning_rate": 9.912438478112335e-05, "loss": 2.0566, "step": 7690 }, { "epoch": 0.05981248230745063, "grad_norm": 0.13313228465027147, "learning_rate": 9.912415711311955e-05, "loss": 2.0641, "step": 7691 }, { "epoch": 0.05982025925223121, "grad_norm": 0.11714861928754411, "learning_rate": 9.912392941578323e-05, "loss": 2.0088, "step": 7692 }, { "epoch": 0.05982803619701179, "grad_norm": 0.12350435367099413, "learning_rate": 9.912370168911452e-05, "loss": 2.0012, "step": 7693 }, { "epoch": 0.05983581314179237, "grad_norm": 0.11561136612464548, "learning_rate": 9.912347393311357e-05, "loss": 2.0576, "step": 7694 }, { "epoch": 0.05984359008657295, "grad_norm": 0.1121471041033827, "learning_rate": 9.912324614778051e-05, "loss": 2.0398, "step": 7695 }, { "epoch": 0.05985136703135353, "grad_norm": 0.11745127903946453, "learning_rate": 9.912301833311548e-05, "loss": 2.111, "step": 7696 }, { "epoch": 0.059859143976134115, "grad_norm": 0.11157551878422702, "learning_rate": 9.912279048911861e-05, "loss": 2.0293, "step": 7697 }, { "epoch": 0.059866920920914696, "grad_norm": 0.11403577446228033, "learning_rate": 9.912256261579003e-05, "loss": 2.0874, "step": 7698 }, { "epoch": 0.05987469786569528, "grad_norm": 0.11379453461222622, "learning_rate": 9.912233471312988e-05, "loss": 2.0366, "step": 7699 }, { "epoch": 0.05988247481047586, "grad_norm": 0.11618198583742849, "learning_rate": 9.912210678113831e-05, "loss": 2.0412, "step": 7700 }, { "epoch": 0.05989025175525644, "grad_norm": 0.11546205593926534, "learning_rate": 9.912187881981545e-05, "loss": 2.0479, "step": 7701 }, { "epoch": 0.05989802870003702, "grad_norm": 0.10963981690170505, "learning_rate": 9.912165082916143e-05, "loss": 1.9896, "step": 7702 }, { "epoch": 0.0599058056448176, "grad_norm": 0.11535705291145294, "learning_rate": 9.912142280917638e-05, "loss": 2.0372, "step": 7703 }, { "epoch": 0.05991358258959818, "grad_norm": 0.12086169133835985, "learning_rate": 9.912119475986046e-05, "loss": 2.0896, "step": 7704 }, { "epoch": 0.059921359534378764, "grad_norm": 0.1196598104239178, "learning_rate": 9.912096668121378e-05, "loss": 2.066, "step": 7705 }, { "epoch": 0.059929136479159345, "grad_norm": 0.11176892921935079, "learning_rate": 9.912073857323649e-05, "loss": 2.0495, "step": 7706 }, { "epoch": 0.059936913423939926, "grad_norm": 0.12109361283842494, "learning_rate": 9.912051043592871e-05, "loss": 2.0583, "step": 7707 }, { "epoch": 0.05994469036872051, "grad_norm": 0.10997478172355672, "learning_rate": 9.91202822692906e-05, "loss": 2.0299, "step": 7708 }, { "epoch": 0.05995246731350109, "grad_norm": 0.1158458047853159, "learning_rate": 9.912005407332229e-05, "loss": 2.1096, "step": 7709 }, { "epoch": 0.05996024425828167, "grad_norm": 0.11504057190491762, "learning_rate": 9.91198258480239e-05, "loss": 2.0119, "step": 7710 }, { "epoch": 0.05996802120306225, "grad_norm": 0.1112997282477414, "learning_rate": 9.91195975933956e-05, "loss": 2.0565, "step": 7711 }, { "epoch": 0.05997579814784283, "grad_norm": 0.11835306231558555, "learning_rate": 9.911936930943748e-05, "loss": 2.066, "step": 7712 }, { "epoch": 0.05998357509262341, "grad_norm": 0.11418098016225611, "learning_rate": 9.911914099614972e-05, "loss": 1.9903, "step": 7713 }, { "epoch": 0.059991352037403994, "grad_norm": 0.1217674597005745, "learning_rate": 9.911891265353241e-05, "loss": 2.023, "step": 7714 }, { "epoch": 0.059999128982184576, "grad_norm": 0.11800523596540351, "learning_rate": 9.911868428158574e-05, "loss": 2.0684, "step": 7715 }, { "epoch": 0.06000690592696516, "grad_norm": 0.11130675716620302, "learning_rate": 9.911845588030982e-05, "loss": 2.0506, "step": 7716 }, { "epoch": 0.06001468287174574, "grad_norm": 0.11816529861361498, "learning_rate": 9.911822744970476e-05, "loss": 2.0526, "step": 7717 }, { "epoch": 0.06002245981652632, "grad_norm": 0.11174620741641333, "learning_rate": 9.911799898977074e-05, "loss": 2.1144, "step": 7718 }, { "epoch": 0.0600302367613069, "grad_norm": 0.11729706182451485, "learning_rate": 9.911777050050787e-05, "loss": 2.0202, "step": 7719 }, { "epoch": 0.06003801370608748, "grad_norm": 0.11699871041028191, "learning_rate": 9.911754198191632e-05, "loss": 2.0598, "step": 7720 }, { "epoch": 0.06004579065086806, "grad_norm": 0.11488649792995113, "learning_rate": 9.911731343399617e-05, "loss": 2.0442, "step": 7721 }, { "epoch": 0.060053567595648644, "grad_norm": 0.1163544198853094, "learning_rate": 9.911708485674761e-05, "loss": 2.0306, "step": 7722 }, { "epoch": 0.060061344540429225, "grad_norm": 0.3541646824568526, "learning_rate": 9.911685625017075e-05, "loss": 2.0896, "step": 7723 }, { "epoch": 0.060069121485209806, "grad_norm": 0.116938314202259, "learning_rate": 9.911662761426572e-05, "loss": 2.0779, "step": 7724 }, { "epoch": 0.06007689842999039, "grad_norm": 0.1208135070321986, "learning_rate": 9.911639894903268e-05, "loss": 2.0501, "step": 7725 }, { "epoch": 0.06008467537477097, "grad_norm": 0.11559924967762174, "learning_rate": 9.911617025447175e-05, "loss": 2.0131, "step": 7726 }, { "epoch": 0.06009245231955155, "grad_norm": 0.1157469442950188, "learning_rate": 9.911594153058307e-05, "loss": 2.0058, "step": 7727 }, { "epoch": 0.06010022926433213, "grad_norm": 0.1196797853396458, "learning_rate": 9.911571277736678e-05, "loss": 2.03, "step": 7728 }, { "epoch": 0.06010800620911271, "grad_norm": 0.12424353248472486, "learning_rate": 9.911548399482302e-05, "loss": 1.9947, "step": 7729 }, { "epoch": 0.06011578315389329, "grad_norm": 0.11384013578038307, "learning_rate": 9.911525518295192e-05, "loss": 2.0005, "step": 7730 }, { "epoch": 0.060123560098673874, "grad_norm": 0.14766343174026628, "learning_rate": 9.91150263417536e-05, "loss": 2.0694, "step": 7731 }, { "epoch": 0.060131337043454455, "grad_norm": 0.3120415007553012, "learning_rate": 9.911479747122823e-05, "loss": 2.0327, "step": 7732 }, { "epoch": 0.060139113988235036, "grad_norm": 0.11877934830214654, "learning_rate": 9.911456857137593e-05, "loss": 2.0385, "step": 7733 }, { "epoch": 0.06014689093301562, "grad_norm": 0.1609480067834418, "learning_rate": 9.911433964219683e-05, "loss": 2.0526, "step": 7734 }, { "epoch": 0.0601546678777962, "grad_norm": 0.19220255203893, "learning_rate": 9.911411068369108e-05, "loss": 2.0006, "step": 7735 }, { "epoch": 0.06016244482257678, "grad_norm": 0.13861064287158623, "learning_rate": 9.91138816958588e-05, "loss": 2.0355, "step": 7736 }, { "epoch": 0.06017022176735736, "grad_norm": 0.13789556459176924, "learning_rate": 9.911365267870015e-05, "loss": 2.0336, "step": 7737 }, { "epoch": 0.06017799871213794, "grad_norm": 0.19439910948016806, "learning_rate": 9.911342363221526e-05, "loss": 2.1036, "step": 7738 }, { "epoch": 0.06018577565691852, "grad_norm": 0.14147185951350166, "learning_rate": 9.911319455640423e-05, "loss": 2.014, "step": 7739 }, { "epoch": 0.060193552601699105, "grad_norm": 0.12875884578316943, "learning_rate": 9.911296545126725e-05, "loss": 2.0055, "step": 7740 }, { "epoch": 0.060201329546479686, "grad_norm": 0.17956460042239744, "learning_rate": 9.911273631680444e-05, "loss": 2.0263, "step": 7741 }, { "epoch": 0.06020910649126027, "grad_norm": 0.15402741075265, "learning_rate": 9.911250715301592e-05, "loss": 2.0946, "step": 7742 }, { "epoch": 0.06021688343604085, "grad_norm": 0.11376448167949112, "learning_rate": 9.911227795990184e-05, "loss": 2.0758, "step": 7743 }, { "epoch": 0.06022466038082143, "grad_norm": 0.3408820006115367, "learning_rate": 9.911204873746233e-05, "loss": 2.0604, "step": 7744 }, { "epoch": 0.06023243732560201, "grad_norm": 0.12082806854813337, "learning_rate": 9.911181948569754e-05, "loss": 2.0159, "step": 7745 }, { "epoch": 0.06024021427038259, "grad_norm": 0.1329405810531614, "learning_rate": 9.911159020460759e-05, "loss": 2.1226, "step": 7746 }, { "epoch": 0.06024799121516317, "grad_norm": 0.1318649989636417, "learning_rate": 9.911136089419264e-05, "loss": 2.0519, "step": 7747 }, { "epoch": 0.060255768159943754, "grad_norm": 0.12108250164777179, "learning_rate": 9.911113155445279e-05, "loss": 2.088, "step": 7748 }, { "epoch": 0.060263545104724335, "grad_norm": 0.13473489937612831, "learning_rate": 9.911090218538824e-05, "loss": 1.9817, "step": 7749 }, { "epoch": 0.060271322049504916, "grad_norm": 0.17582026034280002, "learning_rate": 9.911067278699905e-05, "loss": 2.0815, "step": 7750 }, { "epoch": 0.060279098994285504, "grad_norm": 0.17294702371788326, "learning_rate": 9.911044335928539e-05, "loss": 2.0938, "step": 7751 }, { "epoch": 0.060286875939066085, "grad_norm": 0.1367197320222854, "learning_rate": 9.911021390224741e-05, "loss": 2.0429, "step": 7752 }, { "epoch": 0.06029465288384667, "grad_norm": 0.11462183766523416, "learning_rate": 9.910998441588526e-05, "loss": 2.0555, "step": 7753 }, { "epoch": 0.06030242982862725, "grad_norm": 0.13221047740743389, "learning_rate": 9.910975490019902e-05, "loss": 2.0789, "step": 7754 }, { "epoch": 0.06031020677340783, "grad_norm": 0.13403349397969141, "learning_rate": 9.910952535518889e-05, "loss": 2.0867, "step": 7755 }, { "epoch": 0.06031798371818841, "grad_norm": 0.11269562784475784, "learning_rate": 9.910929578085497e-05, "loss": 1.9952, "step": 7756 }, { "epoch": 0.06032576066296899, "grad_norm": 0.12936299584678906, "learning_rate": 9.910906617719739e-05, "loss": 1.9852, "step": 7757 }, { "epoch": 0.06033353760774957, "grad_norm": 0.13159240275878967, "learning_rate": 9.910883654421632e-05, "loss": 2.0099, "step": 7758 }, { "epoch": 0.060341314552530154, "grad_norm": 0.12610839066569668, "learning_rate": 9.910860688191189e-05, "loss": 2.0983, "step": 7759 }, { "epoch": 0.060349091497310735, "grad_norm": 0.11190395998016134, "learning_rate": 9.91083771902842e-05, "loss": 2.007, "step": 7760 }, { "epoch": 0.060356868442091316, "grad_norm": 0.1211037363408572, "learning_rate": 9.910814746933342e-05, "loss": 2.0828, "step": 7761 }, { "epoch": 0.0603646453868719, "grad_norm": 0.11983952549288024, "learning_rate": 9.910791771905969e-05, "loss": 2.0563, "step": 7762 }, { "epoch": 0.06037242233165248, "grad_norm": 0.11047570846152409, "learning_rate": 9.910768793946315e-05, "loss": 2.0336, "step": 7763 }, { "epoch": 0.06038019927643306, "grad_norm": 0.11699446700978634, "learning_rate": 9.910745813054389e-05, "loss": 2.0327, "step": 7764 }, { "epoch": 0.06038797622121364, "grad_norm": 0.1150939484644165, "learning_rate": 9.910722829230211e-05, "loss": 2.0677, "step": 7765 }, { "epoch": 0.06039575316599422, "grad_norm": 0.11217843748526878, "learning_rate": 9.910699842473791e-05, "loss": 2.0027, "step": 7766 }, { "epoch": 0.0604035301107748, "grad_norm": 0.1166555712879486, "learning_rate": 9.910676852785146e-05, "loss": 2.0509, "step": 7767 }, { "epoch": 0.060411307055555384, "grad_norm": 0.11459983698702449, "learning_rate": 9.910653860164286e-05, "loss": 2.0681, "step": 7768 }, { "epoch": 0.060419084000335965, "grad_norm": 0.143968333320923, "learning_rate": 9.910630864611224e-05, "loss": 2.0558, "step": 7769 }, { "epoch": 0.060426860945116546, "grad_norm": 0.126365135110872, "learning_rate": 9.910607866125978e-05, "loss": 2.0824, "step": 7770 }, { "epoch": 0.06043463788989713, "grad_norm": 0.11336673220680984, "learning_rate": 9.91058486470856e-05, "loss": 2.0203, "step": 7771 }, { "epoch": 0.06044241483467771, "grad_norm": 0.12375800513851587, "learning_rate": 9.910561860358983e-05, "loss": 2.0506, "step": 7772 }, { "epoch": 0.06045019177945829, "grad_norm": 0.12286606436687061, "learning_rate": 9.91053885307726e-05, "loss": 1.9956, "step": 7773 }, { "epoch": 0.06045796872423887, "grad_norm": 0.11235537227100852, "learning_rate": 9.910515842863408e-05, "loss": 2.0485, "step": 7774 }, { "epoch": 0.06046574566901945, "grad_norm": 0.11285287071309992, "learning_rate": 9.910492829717436e-05, "loss": 2.0236, "step": 7775 }, { "epoch": 0.06047352261380003, "grad_norm": 0.11781360650351254, "learning_rate": 9.910469813639363e-05, "loss": 1.9964, "step": 7776 }, { "epoch": 0.060481299558580615, "grad_norm": 0.14308779298327323, "learning_rate": 9.910446794629198e-05, "loss": 2.099, "step": 7777 }, { "epoch": 0.060489076503361196, "grad_norm": 0.13819367174842365, "learning_rate": 9.910423772686958e-05, "loss": 2.081, "step": 7778 }, { "epoch": 0.06049685344814178, "grad_norm": 0.12130685184363053, "learning_rate": 9.910400747812656e-05, "loss": 2.0548, "step": 7779 }, { "epoch": 0.06050463039292236, "grad_norm": 0.20777877963387398, "learning_rate": 9.910377720006304e-05, "loss": 2.0116, "step": 7780 }, { "epoch": 0.06051240733770294, "grad_norm": 0.18818789153625576, "learning_rate": 9.910354689267918e-05, "loss": 2.0775, "step": 7781 }, { "epoch": 0.06052018428248352, "grad_norm": 0.1194271966382707, "learning_rate": 9.910331655597511e-05, "loss": 2.1032, "step": 7782 }, { "epoch": 0.0605279612272641, "grad_norm": 0.1521351380595419, "learning_rate": 9.910308618995097e-05, "loss": 2.0192, "step": 7783 }, { "epoch": 0.06053573817204468, "grad_norm": 0.1275799636188882, "learning_rate": 9.910285579460688e-05, "loss": 2.0418, "step": 7784 }, { "epoch": 0.060543515116825264, "grad_norm": 0.12998515350586032, "learning_rate": 9.910262536994299e-05, "loss": 2.0306, "step": 7785 }, { "epoch": 0.060551292061605845, "grad_norm": 0.133135111812962, "learning_rate": 9.910239491595945e-05, "loss": 2.0382, "step": 7786 }, { "epoch": 0.060559069006386426, "grad_norm": 0.11772791152811314, "learning_rate": 9.910216443265638e-05, "loss": 2.0461, "step": 7787 }, { "epoch": 0.06056684595116701, "grad_norm": 0.11354963801456526, "learning_rate": 9.910193392003393e-05, "loss": 2.0339, "step": 7788 }, { "epoch": 0.06057462289594759, "grad_norm": 0.11524700355993915, "learning_rate": 9.910170337809222e-05, "loss": 2.0419, "step": 7789 }, { "epoch": 0.06058239984072817, "grad_norm": 0.11330420378825479, "learning_rate": 9.910147280683139e-05, "loss": 2.0911, "step": 7790 }, { "epoch": 0.06059017678550875, "grad_norm": 0.12338756897245749, "learning_rate": 9.910124220625161e-05, "loss": 2.0127, "step": 7791 }, { "epoch": 0.06059795373028933, "grad_norm": 0.12350833642533347, "learning_rate": 9.9101011576353e-05, "loss": 2.0483, "step": 7792 }, { "epoch": 0.06060573067506991, "grad_norm": 0.11939483483869766, "learning_rate": 9.910078091713567e-05, "loss": 1.984, "step": 7793 }, { "epoch": 0.060613507619850494, "grad_norm": 0.1284979257566201, "learning_rate": 9.910055022859978e-05, "loss": 2.1239, "step": 7794 }, { "epoch": 0.060621284564631076, "grad_norm": 0.12500991554560328, "learning_rate": 9.910031951074548e-05, "loss": 2.1137, "step": 7795 }, { "epoch": 0.06062906150941166, "grad_norm": 0.11372715024853941, "learning_rate": 9.91000887635729e-05, "loss": 2.0196, "step": 7796 }, { "epoch": 0.06063683845419224, "grad_norm": 0.11681107689829633, "learning_rate": 9.909985798708217e-05, "loss": 1.9546, "step": 7797 }, { "epoch": 0.06064461539897282, "grad_norm": 0.1296637016681058, "learning_rate": 9.909962718127342e-05, "loss": 2.047, "step": 7798 }, { "epoch": 0.0606523923437534, "grad_norm": 0.10944605055898714, "learning_rate": 9.909939634614681e-05, "loss": 2.0619, "step": 7799 }, { "epoch": 0.06066016928853398, "grad_norm": 0.11932190811014878, "learning_rate": 9.909916548170246e-05, "loss": 2.0561, "step": 7800 }, { "epoch": 0.06066794623331456, "grad_norm": 0.12419199277472645, "learning_rate": 9.909893458794052e-05, "loss": 2.0515, "step": 7801 }, { "epoch": 0.060675723178095144, "grad_norm": 0.11660729310425741, "learning_rate": 9.909870366486111e-05, "loss": 2.0308, "step": 7802 }, { "epoch": 0.060683500122875725, "grad_norm": 0.1923472826181688, "learning_rate": 9.909847271246439e-05, "loss": 2.0195, "step": 7803 }, { "epoch": 0.060691277067656306, "grad_norm": 0.12184019686075846, "learning_rate": 9.90982417307505e-05, "loss": 2.0821, "step": 7804 }, { "epoch": 0.06069905401243689, "grad_norm": 0.1242629815578223, "learning_rate": 9.909801071971955e-05, "loss": 2.0198, "step": 7805 }, { "epoch": 0.06070683095721747, "grad_norm": 0.12314717412906744, "learning_rate": 9.90977796793717e-05, "loss": 2.0622, "step": 7806 }, { "epoch": 0.06071460790199805, "grad_norm": 0.11731126042310651, "learning_rate": 9.909754860970708e-05, "loss": 2.0287, "step": 7807 }, { "epoch": 0.06072238484677863, "grad_norm": 0.12331794560571278, "learning_rate": 9.909731751072582e-05, "loss": 2.057, "step": 7808 }, { "epoch": 0.06073016179155921, "grad_norm": 0.13354936285314728, "learning_rate": 9.909708638242808e-05, "loss": 2.036, "step": 7809 }, { "epoch": 0.0607379387363398, "grad_norm": 0.12483899557544993, "learning_rate": 9.909685522481398e-05, "loss": 2.015, "step": 7810 }, { "epoch": 0.06074571568112038, "grad_norm": 0.13158353524952338, "learning_rate": 9.909662403788368e-05, "loss": 2.0479, "step": 7811 }, { "epoch": 0.06075349262590096, "grad_norm": 0.19080969301701836, "learning_rate": 9.909639282163728e-05, "loss": 2.0606, "step": 7812 }, { "epoch": 0.06076126957068154, "grad_norm": 0.16091684362083475, "learning_rate": 9.909616157607495e-05, "loss": 2.0186, "step": 7813 }, { "epoch": 0.060769046515462125, "grad_norm": 0.1164240926520438, "learning_rate": 9.909593030119682e-05, "loss": 2.0583, "step": 7814 }, { "epoch": 0.060776823460242706, "grad_norm": 0.1288766907183453, "learning_rate": 9.909569899700303e-05, "loss": 1.9645, "step": 7815 }, { "epoch": 0.06078460040502329, "grad_norm": 0.13477330611098198, "learning_rate": 9.909546766349372e-05, "loss": 2.0284, "step": 7816 }, { "epoch": 0.06079237734980387, "grad_norm": 0.11717988560798608, "learning_rate": 9.909523630066899e-05, "loss": 2.0795, "step": 7817 }, { "epoch": 0.06080015429458445, "grad_norm": 0.1314370307664571, "learning_rate": 9.909500490852904e-05, "loss": 2.0897, "step": 7818 }, { "epoch": 0.06080793123936503, "grad_norm": 0.13214951588021914, "learning_rate": 9.909477348707396e-05, "loss": 2.0629, "step": 7819 }, { "epoch": 0.06081570818414561, "grad_norm": 0.11805898353347039, "learning_rate": 9.909454203630393e-05, "loss": 2.105, "step": 7820 }, { "epoch": 0.06082348512892619, "grad_norm": 0.13272460288560958, "learning_rate": 9.909431055621904e-05, "loss": 2.0562, "step": 7821 }, { "epoch": 0.060831262073706774, "grad_norm": 0.12430018271809422, "learning_rate": 9.909407904681949e-05, "loss": 2.025, "step": 7822 }, { "epoch": 0.060839039018487355, "grad_norm": 0.11843935587015572, "learning_rate": 9.909384750810536e-05, "loss": 2.0436, "step": 7823 }, { "epoch": 0.060846815963267936, "grad_norm": 0.1227079398084992, "learning_rate": 9.90936159400768e-05, "loss": 2.0252, "step": 7824 }, { "epoch": 0.06085459290804852, "grad_norm": 0.11239228485279991, "learning_rate": 9.909338434273396e-05, "loss": 2.0901, "step": 7825 }, { "epoch": 0.0608623698528291, "grad_norm": 0.1269808792002928, "learning_rate": 9.9093152716077e-05, "loss": 2.0686, "step": 7826 }, { "epoch": 0.06087014679760968, "grad_norm": 0.12055812859159949, "learning_rate": 9.909292106010601e-05, "loss": 2.0638, "step": 7827 }, { "epoch": 0.06087792374239026, "grad_norm": 0.11214380656729492, "learning_rate": 9.909268937482116e-05, "loss": 2.0375, "step": 7828 }, { "epoch": 0.06088570068717084, "grad_norm": 0.12251612771571317, "learning_rate": 9.909245766022258e-05, "loss": 2.0946, "step": 7829 }, { "epoch": 0.06089347763195142, "grad_norm": 0.1244262783822946, "learning_rate": 9.909222591631042e-05, "loss": 2.0232, "step": 7830 }, { "epoch": 0.060901254576732004, "grad_norm": 0.12596781021697898, "learning_rate": 9.909199414308479e-05, "loss": 2.0245, "step": 7831 }, { "epoch": 0.060909031521512585, "grad_norm": 0.1265362865568072, "learning_rate": 9.909176234054586e-05, "loss": 2.0113, "step": 7832 }, { "epoch": 0.06091680846629317, "grad_norm": 0.11194155985720167, "learning_rate": 9.909153050869375e-05, "loss": 2.0228, "step": 7833 }, { "epoch": 0.06092458541107375, "grad_norm": 0.12388810798938897, "learning_rate": 9.909129864752861e-05, "loss": 2.0579, "step": 7834 }, { "epoch": 0.06093236235585433, "grad_norm": 0.12536728453058774, "learning_rate": 9.909106675705057e-05, "loss": 2.0201, "step": 7835 }, { "epoch": 0.06094013930063491, "grad_norm": 0.11659004034690879, "learning_rate": 9.909083483725978e-05, "loss": 2.0356, "step": 7836 }, { "epoch": 0.06094791624541549, "grad_norm": 0.13757046974938206, "learning_rate": 9.909060288815635e-05, "loss": 2.0652, "step": 7837 }, { "epoch": 0.06095569319019607, "grad_norm": 0.12347779824637, "learning_rate": 9.909037090974045e-05, "loss": 2.0279, "step": 7838 }, { "epoch": 0.060963470134976654, "grad_norm": 0.116571813893032, "learning_rate": 9.90901389020122e-05, "loss": 2.0423, "step": 7839 }, { "epoch": 0.060971247079757235, "grad_norm": 0.13404518472863267, "learning_rate": 9.908990686497173e-05, "loss": 2.041, "step": 7840 }, { "epoch": 0.060979024024537816, "grad_norm": 0.16261692373730094, "learning_rate": 9.908967479861922e-05, "loss": 2.043, "step": 7841 }, { "epoch": 0.0609868009693184, "grad_norm": 0.11997417287450074, "learning_rate": 9.908944270295477e-05, "loss": 2.0715, "step": 7842 }, { "epoch": 0.06099457791409898, "grad_norm": 0.11266340187832787, "learning_rate": 9.908921057797853e-05, "loss": 2.0362, "step": 7843 }, { "epoch": 0.06100235485887956, "grad_norm": 0.11369483825165283, "learning_rate": 9.908897842369064e-05, "loss": 1.9775, "step": 7844 }, { "epoch": 0.06101013180366014, "grad_norm": 0.11409412132101601, "learning_rate": 9.908874624009123e-05, "loss": 2.0648, "step": 7845 }, { "epoch": 0.06101790874844072, "grad_norm": 0.11718265588518297, "learning_rate": 9.908851402718045e-05, "loss": 2.054, "step": 7846 }, { "epoch": 0.0610256856932213, "grad_norm": 0.11982818237162632, "learning_rate": 9.908828178495844e-05, "loss": 2.0707, "step": 7847 }, { "epoch": 0.061033462638001884, "grad_norm": 0.11369688175662994, "learning_rate": 9.908804951342533e-05, "loss": 2.0423, "step": 7848 }, { "epoch": 0.061041239582782465, "grad_norm": 0.12122159704436665, "learning_rate": 9.908781721258126e-05, "loss": 2.0431, "step": 7849 }, { "epoch": 0.061049016527563046, "grad_norm": 0.11532507018259393, "learning_rate": 9.908758488242638e-05, "loss": 2.0448, "step": 7850 }, { "epoch": 0.06105679347234363, "grad_norm": 0.1126897633873174, "learning_rate": 9.908735252296081e-05, "loss": 2.0617, "step": 7851 }, { "epoch": 0.06106457041712421, "grad_norm": 0.12239273369353434, "learning_rate": 9.90871201341847e-05, "loss": 2.006, "step": 7852 }, { "epoch": 0.06107234736190479, "grad_norm": 0.11586192345727125, "learning_rate": 9.908688771609817e-05, "loss": 1.9791, "step": 7853 }, { "epoch": 0.06108012430668537, "grad_norm": 0.12379201987875395, "learning_rate": 9.908665526870141e-05, "loss": 2.0697, "step": 7854 }, { "epoch": 0.06108790125146595, "grad_norm": 0.131007961890072, "learning_rate": 9.908642279199451e-05, "loss": 2.0881, "step": 7855 }, { "epoch": 0.06109567819624653, "grad_norm": 0.12153957955145832, "learning_rate": 9.908619028597761e-05, "loss": 2.1048, "step": 7856 }, { "epoch": 0.061103455141027115, "grad_norm": 0.1142941476072874, "learning_rate": 9.908595775065089e-05, "loss": 2.0264, "step": 7857 }, { "epoch": 0.061111232085807696, "grad_norm": 0.11597814007371955, "learning_rate": 9.908572518601443e-05, "loss": 2.0242, "step": 7858 }, { "epoch": 0.06111900903058828, "grad_norm": 0.11660872365768078, "learning_rate": 9.90854925920684e-05, "loss": 2.0695, "step": 7859 }, { "epoch": 0.06112678597536886, "grad_norm": 0.13719965725779693, "learning_rate": 9.908525996881297e-05, "loss": 2.0902, "step": 7860 }, { "epoch": 0.06113456292014944, "grad_norm": 0.1440631260401776, "learning_rate": 9.908502731624823e-05, "loss": 2.0515, "step": 7861 }, { "epoch": 0.06114233986493002, "grad_norm": 0.1303859381998485, "learning_rate": 9.908479463437434e-05, "loss": 2.0803, "step": 7862 }, { "epoch": 0.0611501168097106, "grad_norm": 0.11661293067846971, "learning_rate": 9.908456192319142e-05, "loss": 2.0361, "step": 7863 }, { "epoch": 0.06115789375449118, "grad_norm": 0.14851333493951596, "learning_rate": 9.908432918269964e-05, "loss": 2.0802, "step": 7864 }, { "epoch": 0.061165670699271764, "grad_norm": 0.15651626529403317, "learning_rate": 9.908409641289912e-05, "loss": 2.0718, "step": 7865 }, { "epoch": 0.061173447644052345, "grad_norm": 0.13245757986032258, "learning_rate": 9.908386361379e-05, "loss": 1.9973, "step": 7866 }, { "epoch": 0.061181224588832926, "grad_norm": 0.1111511050536423, "learning_rate": 9.908363078537242e-05, "loss": 2.054, "step": 7867 }, { "epoch": 0.06118900153361351, "grad_norm": 0.13210387639619525, "learning_rate": 9.908339792764654e-05, "loss": 2.0729, "step": 7868 }, { "epoch": 0.061196778478394095, "grad_norm": 0.15171921448748632, "learning_rate": 9.908316504061246e-05, "loss": 2.0846, "step": 7869 }, { "epoch": 0.06120455542317468, "grad_norm": 0.13415824367427745, "learning_rate": 9.908293212427033e-05, "loss": 2.0474, "step": 7870 }, { "epoch": 0.06121233236795526, "grad_norm": 0.12611390690187696, "learning_rate": 9.908269917862033e-05, "loss": 2.0522, "step": 7871 }, { "epoch": 0.06122010931273584, "grad_norm": 0.1287487178554027, "learning_rate": 9.908246620366253e-05, "loss": 2.0507, "step": 7872 }, { "epoch": 0.06122788625751642, "grad_norm": 0.12627393243721483, "learning_rate": 9.908223319939713e-05, "loss": 2.1297, "step": 7873 }, { "epoch": 0.061235663202297, "grad_norm": 0.13567026131196858, "learning_rate": 9.908200016582423e-05, "loss": 2.0517, "step": 7874 }, { "epoch": 0.06124344014707758, "grad_norm": 0.11962168918255596, "learning_rate": 9.9081767102944e-05, "loss": 1.9787, "step": 7875 }, { "epoch": 0.061251217091858164, "grad_norm": 0.11809799427005636, "learning_rate": 9.908153401075655e-05, "loss": 2.0107, "step": 7876 }, { "epoch": 0.061258994036638745, "grad_norm": 0.133999285767103, "learning_rate": 9.908130088926204e-05, "loss": 2.063, "step": 7877 }, { "epoch": 0.061266770981419326, "grad_norm": 0.13664846560981028, "learning_rate": 9.90810677384606e-05, "loss": 2.0447, "step": 7878 }, { "epoch": 0.06127454792619991, "grad_norm": 0.12400819136220395, "learning_rate": 9.908083455835238e-05, "loss": 1.9959, "step": 7879 }, { "epoch": 0.06128232487098049, "grad_norm": 0.12992058161334602, "learning_rate": 9.908060134893749e-05, "loss": 2.0296, "step": 7880 }, { "epoch": 0.06129010181576107, "grad_norm": 0.16102063096132452, "learning_rate": 9.90803681102161e-05, "loss": 2.037, "step": 7881 }, { "epoch": 0.06129787876054165, "grad_norm": 0.13186919146077147, "learning_rate": 9.908013484218834e-05, "loss": 2.0235, "step": 7882 }, { "epoch": 0.06130565570532223, "grad_norm": 0.23408689838691307, "learning_rate": 9.907990154485435e-05, "loss": 2.1142, "step": 7883 }, { "epoch": 0.06131343265010281, "grad_norm": 0.14680902838212903, "learning_rate": 9.907966821821426e-05, "loss": 2.0953, "step": 7884 }, { "epoch": 0.061321209594883394, "grad_norm": 0.19204192548752294, "learning_rate": 9.907943486226823e-05, "loss": 2.0589, "step": 7885 }, { "epoch": 0.061328986539663975, "grad_norm": 0.12988475111570022, "learning_rate": 9.907920147701638e-05, "loss": 2.0765, "step": 7886 }, { "epoch": 0.061336763484444556, "grad_norm": 0.11959152367487957, "learning_rate": 9.907896806245884e-05, "loss": 2.021, "step": 7887 }, { "epoch": 0.06134454042922514, "grad_norm": 0.1417379905585926, "learning_rate": 9.907873461859577e-05, "loss": 2.0355, "step": 7888 }, { "epoch": 0.06135231737400572, "grad_norm": 0.128728214716075, "learning_rate": 9.907850114542731e-05, "loss": 1.967, "step": 7889 }, { "epoch": 0.0613600943187863, "grad_norm": 0.12755673143058185, "learning_rate": 9.90782676429536e-05, "loss": 2.0044, "step": 7890 }, { "epoch": 0.06136787126356688, "grad_norm": 0.15911702915029016, "learning_rate": 9.907803411117477e-05, "loss": 2.0399, "step": 7891 }, { "epoch": 0.06137564820834746, "grad_norm": 0.13354319587216024, "learning_rate": 9.907780055009095e-05, "loss": 2.0455, "step": 7892 }, { "epoch": 0.06138342515312804, "grad_norm": 0.1258953939352911, "learning_rate": 9.90775669597023e-05, "loss": 2.0669, "step": 7893 }, { "epoch": 0.061391202097908625, "grad_norm": 0.14418730388272652, "learning_rate": 9.907733334000896e-05, "loss": 2.0318, "step": 7894 }, { "epoch": 0.061398979042689206, "grad_norm": 0.118178146959762, "learning_rate": 9.907709969101105e-05, "loss": 2.0967, "step": 7895 }, { "epoch": 0.06140675598746979, "grad_norm": 0.1382018758787312, "learning_rate": 9.907686601270873e-05, "loss": 2.0449, "step": 7896 }, { "epoch": 0.06141453293225037, "grad_norm": 0.1367320037575436, "learning_rate": 9.907663230510211e-05, "loss": 2.0678, "step": 7897 }, { "epoch": 0.06142230987703095, "grad_norm": 0.12977003576683885, "learning_rate": 9.907639856819137e-05, "loss": 2.0507, "step": 7898 }, { "epoch": 0.06143008682181153, "grad_norm": 0.12072713939217407, "learning_rate": 9.907616480197662e-05, "loss": 2.0547, "step": 7899 }, { "epoch": 0.06143786376659211, "grad_norm": 0.14100608374802326, "learning_rate": 9.9075931006458e-05, "loss": 2.052, "step": 7900 }, { "epoch": 0.06144564071137269, "grad_norm": 0.13116381964478274, "learning_rate": 9.907569718163567e-05, "loss": 2.0236, "step": 7901 }, { "epoch": 0.061453417656153274, "grad_norm": 0.11807937424380477, "learning_rate": 9.907546332750976e-05, "loss": 2.0108, "step": 7902 }, { "epoch": 0.061461194600933855, "grad_norm": 0.12783496053078747, "learning_rate": 9.90752294440804e-05, "loss": 2.0566, "step": 7903 }, { "epoch": 0.061468971545714436, "grad_norm": 0.1433412922269692, "learning_rate": 9.907499553134774e-05, "loss": 2.0162, "step": 7904 }, { "epoch": 0.06147674849049502, "grad_norm": 0.12846989658919425, "learning_rate": 9.907476158931193e-05, "loss": 2.0871, "step": 7905 }, { "epoch": 0.0614845254352756, "grad_norm": 0.11606547134010857, "learning_rate": 9.907452761797307e-05, "loss": 2.0402, "step": 7906 }, { "epoch": 0.06149230238005618, "grad_norm": 0.12843604739514378, "learning_rate": 9.907429361733134e-05, "loss": 2.0173, "step": 7907 }, { "epoch": 0.06150007932483676, "grad_norm": 0.12009546966119297, "learning_rate": 9.907405958738687e-05, "loss": 2.0808, "step": 7908 }, { "epoch": 0.06150785626961734, "grad_norm": 0.12589120577062743, "learning_rate": 9.907382552813979e-05, "loss": 2.0793, "step": 7909 }, { "epoch": 0.06151563321439792, "grad_norm": 0.14497883608102052, "learning_rate": 9.907359143959025e-05, "loss": 2.0241, "step": 7910 }, { "epoch": 0.061523410159178504, "grad_norm": 0.11136915049897303, "learning_rate": 9.907335732173837e-05, "loss": 2.0146, "step": 7911 }, { "epoch": 0.061531187103959085, "grad_norm": 0.13326850401629237, "learning_rate": 9.907312317458432e-05, "loss": 2.0503, "step": 7912 }, { "epoch": 0.06153896404873967, "grad_norm": 0.12831482724252788, "learning_rate": 9.907288899812821e-05, "loss": 2.1028, "step": 7913 }, { "epoch": 0.06154674099352025, "grad_norm": 0.12685424021506686, "learning_rate": 9.90726547923702e-05, "loss": 2.0601, "step": 7914 }, { "epoch": 0.06155451793830083, "grad_norm": 0.15267113756892114, "learning_rate": 9.907242055731043e-05, "loss": 2.014, "step": 7915 }, { "epoch": 0.06156229488308141, "grad_norm": 0.1495471468361144, "learning_rate": 9.907218629294903e-05, "loss": 2.02, "step": 7916 }, { "epoch": 0.06157007182786199, "grad_norm": 0.11587972887770626, "learning_rate": 9.907195199928617e-05, "loss": 2.0593, "step": 7917 }, { "epoch": 0.06157784877264257, "grad_norm": 0.1449048402384508, "learning_rate": 9.907171767632192e-05, "loss": 2.0439, "step": 7918 }, { "epoch": 0.061585625717423154, "grad_norm": 0.15925903939645095, "learning_rate": 9.907148332405649e-05, "loss": 2.0371, "step": 7919 }, { "epoch": 0.061593402662203735, "grad_norm": 0.11352120627077146, "learning_rate": 9.907124894249e-05, "loss": 2.0201, "step": 7920 }, { "epoch": 0.061601179606984316, "grad_norm": 0.1651898422001601, "learning_rate": 9.907101453162257e-05, "loss": 2.0439, "step": 7921 }, { "epoch": 0.0616089565517649, "grad_norm": 0.20109851336523632, "learning_rate": 9.907078009145434e-05, "loss": 2.0664, "step": 7922 }, { "epoch": 0.06161673349654548, "grad_norm": 0.12625976849378748, "learning_rate": 9.907054562198548e-05, "loss": 2.0414, "step": 7923 }, { "epoch": 0.06162451044132606, "grad_norm": 0.15548802501017217, "learning_rate": 9.907031112321612e-05, "loss": 2.0062, "step": 7924 }, { "epoch": 0.06163228738610664, "grad_norm": 0.20540961491099152, "learning_rate": 9.907007659514639e-05, "loss": 2.0714, "step": 7925 }, { "epoch": 0.06164006433088722, "grad_norm": 0.1559992797036196, "learning_rate": 9.906984203777643e-05, "loss": 2.0701, "step": 7926 }, { "epoch": 0.0616478412756678, "grad_norm": 0.1159994811446556, "learning_rate": 9.906960745110638e-05, "loss": 2.0691, "step": 7927 }, { "epoch": 0.061655618220448384, "grad_norm": 0.14917337727145927, "learning_rate": 9.90693728351364e-05, "loss": 2.0423, "step": 7928 }, { "epoch": 0.06166339516522897, "grad_norm": 0.13997823858141056, "learning_rate": 9.90691381898666e-05, "loss": 2.0765, "step": 7929 }, { "epoch": 0.06167117211000955, "grad_norm": 0.11101797419092474, "learning_rate": 9.906890351529715e-05, "loss": 2.0469, "step": 7930 }, { "epoch": 0.061678949054790135, "grad_norm": 0.13469711037200868, "learning_rate": 9.906866881142815e-05, "loss": 2.0733, "step": 7931 }, { "epoch": 0.061686725999570716, "grad_norm": 0.12255800549588014, "learning_rate": 9.906843407825977e-05, "loss": 1.9837, "step": 7932 }, { "epoch": 0.0616945029443513, "grad_norm": 0.1524818210613081, "learning_rate": 9.906819931579215e-05, "loss": 2.0291, "step": 7933 }, { "epoch": 0.06170227988913188, "grad_norm": 0.1516398046608735, "learning_rate": 9.906796452402542e-05, "loss": 2.0191, "step": 7934 }, { "epoch": 0.06171005683391246, "grad_norm": 0.15005060749489493, "learning_rate": 9.906772970295975e-05, "loss": 2.0378, "step": 7935 }, { "epoch": 0.06171783377869304, "grad_norm": 0.14229134252368528, "learning_rate": 9.906749485259523e-05, "loss": 2.0169, "step": 7936 }, { "epoch": 0.06172561072347362, "grad_norm": 0.16863666147080647, "learning_rate": 9.906725997293202e-05, "loss": 2.0292, "step": 7937 }, { "epoch": 0.0617333876682542, "grad_norm": 0.11432112387849148, "learning_rate": 9.906702506397027e-05, "loss": 2.002, "step": 7938 }, { "epoch": 0.061741164613034784, "grad_norm": 0.15608916610707266, "learning_rate": 9.906679012571014e-05, "loss": 2.0448, "step": 7939 }, { "epoch": 0.061748941557815365, "grad_norm": 0.1471956864196441, "learning_rate": 9.906655515815173e-05, "loss": 2.0401, "step": 7940 }, { "epoch": 0.061756718502595946, "grad_norm": 0.11316379106166875, "learning_rate": 9.906632016129518e-05, "loss": 2.0212, "step": 7941 }, { "epoch": 0.06176449544737653, "grad_norm": 0.21291931964833724, "learning_rate": 9.906608513514067e-05, "loss": 2.0097, "step": 7942 }, { "epoch": 0.06177227239215711, "grad_norm": 0.15623662896728077, "learning_rate": 9.906585007968831e-05, "loss": 2.03, "step": 7943 }, { "epoch": 0.06178004933693769, "grad_norm": 0.12247930149577355, "learning_rate": 9.906561499493825e-05, "loss": 2.0462, "step": 7944 }, { "epoch": 0.06178782628171827, "grad_norm": 0.13744593829339644, "learning_rate": 9.906537988089062e-05, "loss": 2.1029, "step": 7945 }, { "epoch": 0.06179560322649885, "grad_norm": 0.16131577479626844, "learning_rate": 9.906514473754559e-05, "loss": 2.0836, "step": 7946 }, { "epoch": 0.06180338017127943, "grad_norm": 0.12420633890844802, "learning_rate": 9.906490956490324e-05, "loss": 2.0759, "step": 7947 }, { "epoch": 0.061811157116060014, "grad_norm": 0.1220997008018311, "learning_rate": 9.906467436296377e-05, "loss": 2.0755, "step": 7948 }, { "epoch": 0.061818934060840595, "grad_norm": 0.13967235054722418, "learning_rate": 9.90644391317273e-05, "loss": 2.0466, "step": 7949 }, { "epoch": 0.06182671100562118, "grad_norm": 0.11886755582440896, "learning_rate": 9.906420387119397e-05, "loss": 2.0365, "step": 7950 }, { "epoch": 0.06183448795040176, "grad_norm": 0.12324636859682882, "learning_rate": 9.906396858136393e-05, "loss": 2.0594, "step": 7951 }, { "epoch": 0.06184226489518234, "grad_norm": 0.1431899020098606, "learning_rate": 9.90637332622373e-05, "loss": 2.0852, "step": 7952 }, { "epoch": 0.06185004183996292, "grad_norm": 0.1207168176986498, "learning_rate": 9.906349791381423e-05, "loss": 1.977, "step": 7953 }, { "epoch": 0.0618578187847435, "grad_norm": 0.12195857161139999, "learning_rate": 9.906326253609486e-05, "loss": 2.0141, "step": 7954 }, { "epoch": 0.06186559572952408, "grad_norm": 0.1367230413950207, "learning_rate": 9.906302712907933e-05, "loss": 2.0335, "step": 7955 }, { "epoch": 0.061873372674304664, "grad_norm": 0.12667177589107415, "learning_rate": 9.90627916927678e-05, "loss": 2.0368, "step": 7956 }, { "epoch": 0.061881149619085245, "grad_norm": 0.11673141333689215, "learning_rate": 9.906255622716038e-05, "loss": 2.0913, "step": 7957 }, { "epoch": 0.061888926563865826, "grad_norm": 0.11797161267847299, "learning_rate": 9.906232073225724e-05, "loss": 2.0588, "step": 7958 }, { "epoch": 0.06189670350864641, "grad_norm": 0.11630926228277894, "learning_rate": 9.906208520805849e-05, "loss": 2.0239, "step": 7959 }, { "epoch": 0.06190448045342699, "grad_norm": 0.11310470162771959, "learning_rate": 9.906184965456428e-05, "loss": 2.0339, "step": 7960 }, { "epoch": 0.06191225739820757, "grad_norm": 0.11401739141546527, "learning_rate": 9.906161407177477e-05, "loss": 2.0034, "step": 7961 }, { "epoch": 0.06192003434298815, "grad_norm": 0.11703319450423201, "learning_rate": 9.906137845969008e-05, "loss": 2.075, "step": 7962 }, { "epoch": 0.06192781128776873, "grad_norm": 0.11486919183683277, "learning_rate": 9.906114281831037e-05, "loss": 2.0768, "step": 7963 }, { "epoch": 0.06193558823254931, "grad_norm": 0.11861006231949292, "learning_rate": 9.906090714763575e-05, "loss": 2.0278, "step": 7964 }, { "epoch": 0.061943365177329894, "grad_norm": 0.12186682751154657, "learning_rate": 9.906067144766639e-05, "loss": 1.9832, "step": 7965 }, { "epoch": 0.061951142122110475, "grad_norm": 0.11392354064638902, "learning_rate": 9.906043571840242e-05, "loss": 2.0643, "step": 7966 }, { "epoch": 0.061958919066891056, "grad_norm": 0.11716871219152034, "learning_rate": 9.906019995984397e-05, "loss": 2.0315, "step": 7967 }, { "epoch": 0.06196669601167164, "grad_norm": 0.12831458646901425, "learning_rate": 9.905996417199119e-05, "loss": 2.0914, "step": 7968 }, { "epoch": 0.06197447295645222, "grad_norm": 0.12435448063287331, "learning_rate": 9.905972835484424e-05, "loss": 2.0583, "step": 7969 }, { "epoch": 0.0619822499012328, "grad_norm": 0.11761942358796856, "learning_rate": 9.905949250840323e-05, "loss": 2.0503, "step": 7970 }, { "epoch": 0.06199002684601338, "grad_norm": 0.12200639391932423, "learning_rate": 9.905925663266832e-05, "loss": 2.0701, "step": 7971 }, { "epoch": 0.06199780379079396, "grad_norm": 0.11682954283800867, "learning_rate": 9.905902072763964e-05, "loss": 2.0468, "step": 7972 }, { "epoch": 0.06200558073557454, "grad_norm": 0.12811463217710686, "learning_rate": 9.905878479331734e-05, "loss": 2.0299, "step": 7973 }, { "epoch": 0.062013357680355125, "grad_norm": 0.15840081201389458, "learning_rate": 9.905854882970156e-05, "loss": 2.0366, "step": 7974 }, { "epoch": 0.062021134625135706, "grad_norm": 0.13509704790385327, "learning_rate": 9.905831283679243e-05, "loss": 2.0195, "step": 7975 }, { "epoch": 0.06202891156991629, "grad_norm": 0.11858685506572997, "learning_rate": 9.905807681459009e-05, "loss": 2.0422, "step": 7976 }, { "epoch": 0.06203668851469687, "grad_norm": 0.16333408609682956, "learning_rate": 9.90578407630947e-05, "loss": 2.1033, "step": 7977 }, { "epoch": 0.06204446545947745, "grad_norm": 0.15210407201058168, "learning_rate": 9.905760468230638e-05, "loss": 2.0063, "step": 7978 }, { "epoch": 0.06205224240425803, "grad_norm": 0.27265205548706367, "learning_rate": 9.90573685722253e-05, "loss": 2.0761, "step": 7979 }, { "epoch": 0.06206001934903861, "grad_norm": 0.15239213729749937, "learning_rate": 9.905713243285156e-05, "loss": 2.1055, "step": 7980 }, { "epoch": 0.06206779629381919, "grad_norm": 0.169434503014172, "learning_rate": 9.905689626418533e-05, "loss": 2.0771, "step": 7981 }, { "epoch": 0.062075573238599774, "grad_norm": 0.12720864591031303, "learning_rate": 9.905666006622676e-05, "loss": 2.011, "step": 7982 }, { "epoch": 0.062083350183380355, "grad_norm": 0.15652744294432977, "learning_rate": 9.905642383897596e-05, "loss": 2.1216, "step": 7983 }, { "epoch": 0.062091127128160936, "grad_norm": 0.1194628936254363, "learning_rate": 9.905618758243307e-05, "loss": 2.0573, "step": 7984 }, { "epoch": 0.06209890407294152, "grad_norm": 0.13627234189014353, "learning_rate": 9.905595129659828e-05, "loss": 2.0529, "step": 7985 }, { "epoch": 0.0621066810177221, "grad_norm": 0.15688820600891593, "learning_rate": 9.905571498147168e-05, "loss": 1.9991, "step": 7986 }, { "epoch": 0.06211445796250268, "grad_norm": 0.15207636320271334, "learning_rate": 9.905547863705343e-05, "loss": 2.0875, "step": 7987 }, { "epoch": 0.06212223490728327, "grad_norm": 0.11817295983222104, "learning_rate": 9.905524226334368e-05, "loss": 2.0651, "step": 7988 }, { "epoch": 0.06213001185206385, "grad_norm": 0.14060323691810878, "learning_rate": 9.905500586034256e-05, "loss": 2.0037, "step": 7989 }, { "epoch": 0.06213778879684443, "grad_norm": 0.15859606194387638, "learning_rate": 9.90547694280502e-05, "loss": 2.039, "step": 7990 }, { "epoch": 0.06214556574162501, "grad_norm": 0.1332798223260394, "learning_rate": 9.905453296646677e-05, "loss": 2.0825, "step": 7991 }, { "epoch": 0.06215334268640559, "grad_norm": 0.1326097792775174, "learning_rate": 9.905429647559237e-05, "loss": 2.1082, "step": 7992 }, { "epoch": 0.062161119631186174, "grad_norm": 0.17309234458006215, "learning_rate": 9.905405995542718e-05, "loss": 2.0052, "step": 7993 }, { "epoch": 0.062168896575966755, "grad_norm": 0.15324199252156273, "learning_rate": 9.905382340597134e-05, "loss": 2.0443, "step": 7994 }, { "epoch": 0.062176673520747336, "grad_norm": 0.29332522123586025, "learning_rate": 9.905358682722496e-05, "loss": 2.0793, "step": 7995 }, { "epoch": 0.06218445046552792, "grad_norm": 0.17299520238312055, "learning_rate": 9.90533502191882e-05, "loss": 2.0538, "step": 7996 }, { "epoch": 0.0621922274103085, "grad_norm": 0.15206545437219046, "learning_rate": 9.905311358186122e-05, "loss": 2.0685, "step": 7997 }, { "epoch": 0.06220000435508908, "grad_norm": 0.1350564872337157, "learning_rate": 9.905287691524414e-05, "loss": 2.0272, "step": 7998 }, { "epoch": 0.06220778129986966, "grad_norm": 0.1622415588133338, "learning_rate": 9.905264021933708e-05, "loss": 2.0891, "step": 7999 }, { "epoch": 0.06221555824465024, "grad_norm": 0.1563131458424237, "learning_rate": 9.905240349414024e-05, "loss": 2.083, "step": 8000 }, { "epoch": 0.06222333518943082, "grad_norm": 0.12475694408858849, "learning_rate": 9.90521667396537e-05, "loss": 2.0612, "step": 8001 }, { "epoch": 0.062231112134211404, "grad_norm": 0.12988890333232675, "learning_rate": 9.905192995587764e-05, "loss": 2.0932, "step": 8002 }, { "epoch": 0.062238889078991985, "grad_norm": 0.13596044446481292, "learning_rate": 9.905169314281219e-05, "loss": 2.074, "step": 8003 }, { "epoch": 0.062246666023772566, "grad_norm": 0.12110778938191874, "learning_rate": 9.905145630045749e-05, "loss": 2.0457, "step": 8004 }, { "epoch": 0.06225444296855315, "grad_norm": 0.12112451489601156, "learning_rate": 9.905121942881367e-05, "loss": 2.0151, "step": 8005 }, { "epoch": 0.06226221991333373, "grad_norm": 0.15994551128361711, "learning_rate": 9.90509825278809e-05, "loss": 2.0365, "step": 8006 }, { "epoch": 0.06226999685811431, "grad_norm": 0.1555517401228269, "learning_rate": 9.905074559765929e-05, "loss": 2.0161, "step": 8007 }, { "epoch": 0.06227777380289489, "grad_norm": 0.12589572687125472, "learning_rate": 9.9050508638149e-05, "loss": 2.0841, "step": 8008 }, { "epoch": 0.06228555074767547, "grad_norm": 0.12443101006342756, "learning_rate": 9.905027164935017e-05, "loss": 2.0407, "step": 8009 }, { "epoch": 0.06229332769245605, "grad_norm": 0.1394982371167229, "learning_rate": 9.905003463126294e-05, "loss": 2.0502, "step": 8010 }, { "epoch": 0.062301104637236634, "grad_norm": 0.11831450906760455, "learning_rate": 9.904979758388746e-05, "loss": 2.0812, "step": 8011 }, { "epoch": 0.062308881582017216, "grad_norm": 0.15073523322395038, "learning_rate": 9.904956050722385e-05, "loss": 2.0375, "step": 8012 }, { "epoch": 0.0623166585267978, "grad_norm": 0.1701711945971137, "learning_rate": 9.904932340127227e-05, "loss": 2.0364, "step": 8013 }, { "epoch": 0.06232443547157838, "grad_norm": 0.12575251012186464, "learning_rate": 9.904908626603286e-05, "loss": 2.0761, "step": 8014 }, { "epoch": 0.06233221241635896, "grad_norm": 0.12969627812288184, "learning_rate": 9.904884910150575e-05, "loss": 2.021, "step": 8015 }, { "epoch": 0.06233998936113954, "grad_norm": 0.1515367774611278, "learning_rate": 9.90486119076911e-05, "loss": 2.0556, "step": 8016 }, { "epoch": 0.06234776630592012, "grad_norm": 0.11985651016035015, "learning_rate": 9.904837468458902e-05, "loss": 2.032, "step": 8017 }, { "epoch": 0.0623555432507007, "grad_norm": 0.12397710762044639, "learning_rate": 9.904813743219969e-05, "loss": 2.0225, "step": 8018 }, { "epoch": 0.062363320195481284, "grad_norm": 0.1383426299898537, "learning_rate": 9.904790015052323e-05, "loss": 2.0694, "step": 8019 }, { "epoch": 0.062371097140261865, "grad_norm": 0.11500740134832857, "learning_rate": 9.904766283955977e-05, "loss": 2.0058, "step": 8020 }, { "epoch": 0.062378874085042446, "grad_norm": 0.1390462163859168, "learning_rate": 9.904742549930948e-05, "loss": 2.0064, "step": 8021 }, { "epoch": 0.06238665102982303, "grad_norm": 0.15603014827524242, "learning_rate": 9.904718812977249e-05, "loss": 2.0016, "step": 8022 }, { "epoch": 0.06239442797460361, "grad_norm": 0.1321562698904518, "learning_rate": 9.904695073094894e-05, "loss": 2.0119, "step": 8023 }, { "epoch": 0.06240220491938419, "grad_norm": 0.11612385537485556, "learning_rate": 9.904671330283898e-05, "loss": 2.0763, "step": 8024 }, { "epoch": 0.06240998186416477, "grad_norm": 0.11294638107144817, "learning_rate": 9.904647584544273e-05, "loss": 2.0434, "step": 8025 }, { "epoch": 0.06241775880894535, "grad_norm": 0.11532257928801198, "learning_rate": 9.904623835876037e-05, "loss": 2.0281, "step": 8026 }, { "epoch": 0.06242553575372593, "grad_norm": 0.1389442424764645, "learning_rate": 9.904600084279198e-05, "loss": 2.0184, "step": 8027 }, { "epoch": 0.062433312698506514, "grad_norm": 0.12333622588607525, "learning_rate": 9.904576329753778e-05, "loss": 2.018, "step": 8028 }, { "epoch": 0.062441089643287095, "grad_norm": 0.12327107233695643, "learning_rate": 9.904552572299785e-05, "loss": 2.0096, "step": 8029 }, { "epoch": 0.06244886658806768, "grad_norm": 0.15108544673702962, "learning_rate": 9.904528811917236e-05, "loss": 2.0737, "step": 8030 }, { "epoch": 0.06245664353284826, "grad_norm": 0.12907647630018196, "learning_rate": 9.904505048606143e-05, "loss": 2.0765, "step": 8031 }, { "epoch": 0.06246442047762884, "grad_norm": 0.11775439070227328, "learning_rate": 9.904481282366522e-05, "loss": 2.0776, "step": 8032 }, { "epoch": 0.06247219742240942, "grad_norm": 0.16502459636834008, "learning_rate": 9.904457513198389e-05, "loss": 2.0767, "step": 8033 }, { "epoch": 0.06247997436719, "grad_norm": 0.1315052201210226, "learning_rate": 9.904433741101755e-05, "loss": 2.0321, "step": 8034 }, { "epoch": 0.06248775131197058, "grad_norm": 0.11901276165509204, "learning_rate": 9.904409966076636e-05, "loss": 2.0566, "step": 8035 }, { "epoch": 0.062495528256751164, "grad_norm": 0.1961512888035365, "learning_rate": 9.904386188123043e-05, "loss": 2.0266, "step": 8036 }, { "epoch": 0.06250330520153174, "grad_norm": 0.12790398806156272, "learning_rate": 9.904362407240994e-05, "loss": 1.9996, "step": 8037 }, { "epoch": 0.06251108214631233, "grad_norm": 0.12687248828431336, "learning_rate": 9.904338623430503e-05, "loss": 2.0397, "step": 8038 }, { "epoch": 0.06251885909109291, "grad_norm": 0.12295182657626355, "learning_rate": 9.904314836691582e-05, "loss": 2.0847, "step": 8039 }, { "epoch": 0.0625266360358735, "grad_norm": 0.11814117781725053, "learning_rate": 9.904291047024247e-05, "loss": 2.0221, "step": 8040 }, { "epoch": 0.06253441298065407, "grad_norm": 0.11592876834321549, "learning_rate": 9.90426725442851e-05, "loss": 2.0355, "step": 8041 }, { "epoch": 0.06254218992543466, "grad_norm": 0.11561954533938566, "learning_rate": 9.904243458904389e-05, "loss": 2.0216, "step": 8042 }, { "epoch": 0.06254996687021523, "grad_norm": 0.11883803257196995, "learning_rate": 9.904219660451894e-05, "loss": 2.0191, "step": 8043 }, { "epoch": 0.06255774381499582, "grad_norm": 0.1196837136352626, "learning_rate": 9.904195859071042e-05, "loss": 2.0094, "step": 8044 }, { "epoch": 0.0625655207597764, "grad_norm": 0.12809978515200915, "learning_rate": 9.904172054761847e-05, "loss": 2.0258, "step": 8045 }, { "epoch": 0.06257329770455698, "grad_norm": 0.11851674595047199, "learning_rate": 9.90414824752432e-05, "loss": 2.0548, "step": 8046 }, { "epoch": 0.06258107464933756, "grad_norm": 0.12436277815909469, "learning_rate": 9.904124437358479e-05, "loss": 2.0189, "step": 8047 }, { "epoch": 0.06258885159411814, "grad_norm": 0.12102545449418484, "learning_rate": 9.904100624264338e-05, "loss": 2.0341, "step": 8048 }, { "epoch": 0.06259662853889872, "grad_norm": 0.11814909796138352, "learning_rate": 9.90407680824191e-05, "loss": 2.0393, "step": 8049 }, { "epoch": 0.0626044054836793, "grad_norm": 0.1208044173302723, "learning_rate": 9.904052989291209e-05, "loss": 2.1122, "step": 8050 }, { "epoch": 0.06261218242845988, "grad_norm": 0.11309735438903644, "learning_rate": 9.90402916741225e-05, "loss": 2.0281, "step": 8051 }, { "epoch": 0.06261995937324047, "grad_norm": 0.11644214690922752, "learning_rate": 9.904005342605048e-05, "loss": 2.0214, "step": 8052 }, { "epoch": 0.06262773631802104, "grad_norm": 0.17940295798470327, "learning_rate": 9.903981514869614e-05, "loss": 2.043, "step": 8053 }, { "epoch": 0.06263551326280163, "grad_norm": 0.11718713014478156, "learning_rate": 9.903957684205964e-05, "loss": 1.9985, "step": 8054 }, { "epoch": 0.0626432902075822, "grad_norm": 0.12473811324862968, "learning_rate": 9.903933850614114e-05, "loss": 2.042, "step": 8055 }, { "epoch": 0.0626510671523628, "grad_norm": 0.11150393067004315, "learning_rate": 9.903910014094076e-05, "loss": 2.033, "step": 8056 }, { "epoch": 0.06265884409714337, "grad_norm": 4.879786003673146, "learning_rate": 9.903886174645865e-05, "loss": 2.0986, "step": 8057 }, { "epoch": 0.06266662104192396, "grad_norm": 0.12874872150776534, "learning_rate": 9.903862332269496e-05, "loss": 2.0439, "step": 8058 }, { "epoch": 0.06267439798670453, "grad_norm": 0.11803646764738338, "learning_rate": 9.903838486964983e-05, "loss": 2.0355, "step": 8059 }, { "epoch": 0.06268217493148512, "grad_norm": 0.12268541951269152, "learning_rate": 9.903814638732339e-05, "loss": 1.9898, "step": 8060 }, { "epoch": 0.06268995187626569, "grad_norm": 0.1175250929437398, "learning_rate": 9.903790787571579e-05, "loss": 2.0899, "step": 8061 }, { "epoch": 0.06269772882104628, "grad_norm": 0.11771363868433049, "learning_rate": 9.903766933482715e-05, "loss": 2.0294, "step": 8062 }, { "epoch": 0.06270550576582685, "grad_norm": 0.13118621893946714, "learning_rate": 9.903743076465766e-05, "loss": 2.0359, "step": 8063 }, { "epoch": 0.06271328271060744, "grad_norm": 0.1486786304002917, "learning_rate": 9.903719216520743e-05, "loss": 2.0414, "step": 8064 }, { "epoch": 0.06272105965538802, "grad_norm": 0.16013557199080278, "learning_rate": 9.903695353647662e-05, "loss": 2.05, "step": 8065 }, { "epoch": 0.0627288366001686, "grad_norm": 0.1390750955533694, "learning_rate": 9.903671487846535e-05, "loss": 2.0161, "step": 8066 }, { "epoch": 0.06273661354494918, "grad_norm": 0.2778134679784484, "learning_rate": 9.903647619117378e-05, "loss": 2.0395, "step": 8067 }, { "epoch": 0.06274439048972977, "grad_norm": 0.1720831046647526, "learning_rate": 9.903623747460204e-05, "loss": 2.0568, "step": 8068 }, { "epoch": 0.06275216743451034, "grad_norm": 0.19246963937666287, "learning_rate": 9.903599872875028e-05, "loss": 2.0166, "step": 8069 }, { "epoch": 0.06275994437929093, "grad_norm": 0.13852215443274277, "learning_rate": 9.903575995361865e-05, "loss": 1.9814, "step": 8070 }, { "epoch": 0.0627677213240715, "grad_norm": 0.15514885341685958, "learning_rate": 9.903552114920727e-05, "loss": 2.0206, "step": 8071 }, { "epoch": 0.06277549826885209, "grad_norm": 0.23316224841151367, "learning_rate": 9.90352823155163e-05, "loss": 2.0433, "step": 8072 }, { "epoch": 0.06278327521363267, "grad_norm": 0.13582988096973947, "learning_rate": 9.90350434525459e-05, "loss": 2.0084, "step": 8073 }, { "epoch": 0.06279105215841325, "grad_norm": 0.12467299476838377, "learning_rate": 9.903480456029618e-05, "loss": 2.0194, "step": 8074 }, { "epoch": 0.06279882910319383, "grad_norm": 0.14992543745365372, "learning_rate": 9.90345656387673e-05, "loss": 2.0919, "step": 8075 }, { "epoch": 0.06280660604797442, "grad_norm": 0.1448424600897687, "learning_rate": 9.903432668795939e-05, "loss": 2.0284, "step": 8076 }, { "epoch": 0.062814382992755, "grad_norm": 0.1312585311708239, "learning_rate": 9.90340877078726e-05, "loss": 2.0587, "step": 8077 }, { "epoch": 0.06282215993753558, "grad_norm": 0.12156539248566729, "learning_rate": 9.903384869850707e-05, "loss": 2.0503, "step": 8078 }, { "epoch": 0.06282993688231617, "grad_norm": 0.12535336229085986, "learning_rate": 9.903360965986296e-05, "loss": 2.0335, "step": 8079 }, { "epoch": 0.06283771382709674, "grad_norm": 0.1170648745339032, "learning_rate": 9.90333705919404e-05, "loss": 2.0375, "step": 8080 }, { "epoch": 0.06284549077187733, "grad_norm": 0.12245089574891069, "learning_rate": 9.903313149473952e-05, "loss": 2.0336, "step": 8081 }, { "epoch": 0.0628532677166579, "grad_norm": 0.11780925533712493, "learning_rate": 9.903289236826047e-05, "loss": 2.0445, "step": 8082 }, { "epoch": 0.06286104466143849, "grad_norm": 0.11997581024615855, "learning_rate": 9.903265321250338e-05, "loss": 2.1101, "step": 8083 }, { "epoch": 0.06286882160621907, "grad_norm": 0.11234939294000591, "learning_rate": 9.903241402746846e-05, "loss": 2.0628, "step": 8084 }, { "epoch": 0.06287659855099965, "grad_norm": 0.25233600339344875, "learning_rate": 9.903217481315577e-05, "loss": 2.0499, "step": 8085 }, { "epoch": 0.06288437549578023, "grad_norm": 0.14065324155559888, "learning_rate": 9.90319355695655e-05, "loss": 2.0844, "step": 8086 }, { "epoch": 0.06289215244056082, "grad_norm": 0.1418667902188321, "learning_rate": 9.903169629669776e-05, "loss": 2.0757, "step": 8087 }, { "epoch": 0.06289992938534139, "grad_norm": 0.1536331100269119, "learning_rate": 9.903145699455272e-05, "loss": 2.0504, "step": 8088 }, { "epoch": 0.06290770633012198, "grad_norm": 0.15831335922002263, "learning_rate": 9.90312176631305e-05, "loss": 2.0378, "step": 8089 }, { "epoch": 0.06291548327490255, "grad_norm": 0.15321546051923712, "learning_rate": 9.903097830243127e-05, "loss": 2.0522, "step": 8090 }, { "epoch": 0.06292326021968314, "grad_norm": 0.12323119840900135, "learning_rate": 9.903073891245517e-05, "loss": 2.0634, "step": 8091 }, { "epoch": 0.06293103716446372, "grad_norm": 0.13500657768955798, "learning_rate": 9.903049949320233e-05, "loss": 2.0065, "step": 8092 }, { "epoch": 0.0629388141092443, "grad_norm": 0.18897544523674442, "learning_rate": 9.903026004467288e-05, "loss": 2.0643, "step": 8093 }, { "epoch": 0.06294659105402488, "grad_norm": 0.19491484354255934, "learning_rate": 9.903002056686698e-05, "loss": 2.0617, "step": 8094 }, { "epoch": 0.06295436799880547, "grad_norm": 0.15759080085192317, "learning_rate": 9.902978105978479e-05, "loss": 2.0636, "step": 8095 }, { "epoch": 0.06296214494358604, "grad_norm": 0.13080248673723743, "learning_rate": 9.902954152342642e-05, "loss": 1.9925, "step": 8096 }, { "epoch": 0.06296992188836663, "grad_norm": 0.13578853530029347, "learning_rate": 9.902930195779203e-05, "loss": 2.0919, "step": 8097 }, { "epoch": 0.0629776988331472, "grad_norm": 0.14504994519295383, "learning_rate": 9.902906236288176e-05, "loss": 2.0692, "step": 8098 }, { "epoch": 0.06298547577792779, "grad_norm": 0.139872303979997, "learning_rate": 9.902882273869576e-05, "loss": 2.02, "step": 8099 }, { "epoch": 0.06299325272270836, "grad_norm": 0.12267178275705973, "learning_rate": 9.902858308523416e-05, "loss": 2.0618, "step": 8100 }, { "epoch": 0.06300102966748895, "grad_norm": 0.11961106852851479, "learning_rate": 9.902834340249711e-05, "loss": 2.0345, "step": 8101 }, { "epoch": 0.06300880661226953, "grad_norm": 0.12676301803516443, "learning_rate": 9.902810369048476e-05, "loss": 2.0335, "step": 8102 }, { "epoch": 0.06301658355705012, "grad_norm": 0.1159732969365603, "learning_rate": 9.902786394919724e-05, "loss": 2.0857, "step": 8103 }, { "epoch": 0.06302436050183069, "grad_norm": 0.6597595891558155, "learning_rate": 9.902762417863471e-05, "loss": 2.0338, "step": 8104 }, { "epoch": 0.06303213744661128, "grad_norm": 0.12161310221673856, "learning_rate": 9.902738437879728e-05, "loss": 2.0348, "step": 8105 }, { "epoch": 0.06303991439139185, "grad_norm": 0.1298411736081657, "learning_rate": 9.902714454968514e-05, "loss": 2.0633, "step": 8106 }, { "epoch": 0.06304769133617244, "grad_norm": 0.2003642581805484, "learning_rate": 9.90269046912984e-05, "loss": 2.0246, "step": 8107 }, { "epoch": 0.06305546828095301, "grad_norm": 0.18945116596037465, "learning_rate": 9.90266648036372e-05, "loss": 2.0105, "step": 8108 }, { "epoch": 0.0630632452257336, "grad_norm": 0.15304942633290403, "learning_rate": 9.90264248867017e-05, "loss": 2.0446, "step": 8109 }, { "epoch": 0.06307102217051418, "grad_norm": 0.1490799317326979, "learning_rate": 9.902618494049205e-05, "loss": 2.0785, "step": 8110 }, { "epoch": 0.06307879911529476, "grad_norm": 0.13806957360935282, "learning_rate": 9.902594496500839e-05, "loss": 2.0313, "step": 8111 }, { "epoch": 0.06308657606007534, "grad_norm": 0.12998511452570147, "learning_rate": 9.902570496025083e-05, "loss": 2.1012, "step": 8112 }, { "epoch": 0.06309435300485593, "grad_norm": 0.11938786216652088, "learning_rate": 9.902546492621956e-05, "loss": 2.0733, "step": 8113 }, { "epoch": 0.0631021299496365, "grad_norm": 0.13544603405874203, "learning_rate": 9.902522486291469e-05, "loss": 2.023, "step": 8114 }, { "epoch": 0.06310990689441709, "grad_norm": 0.14182070974984215, "learning_rate": 9.902498477033637e-05, "loss": 2.0297, "step": 8115 }, { "epoch": 0.06311768383919766, "grad_norm": 0.12397374866634077, "learning_rate": 9.902474464848477e-05, "loss": 2.0926, "step": 8116 }, { "epoch": 0.06312546078397825, "grad_norm": 0.12480960387637603, "learning_rate": 9.902450449735999e-05, "loss": 2.0345, "step": 8117 }, { "epoch": 0.06313323772875883, "grad_norm": 0.13262534494939252, "learning_rate": 9.902426431696222e-05, "loss": 2.0497, "step": 8118 }, { "epoch": 0.06314101467353941, "grad_norm": 0.11279703999146654, "learning_rate": 9.902402410729154e-05, "loss": 2.0191, "step": 8119 }, { "epoch": 0.06314879161831999, "grad_norm": 0.11914838137246246, "learning_rate": 9.902378386834816e-05, "loss": 1.9757, "step": 8120 }, { "epoch": 0.06315656856310058, "grad_norm": 0.14343283014950384, "learning_rate": 9.90235436001322e-05, "loss": 2.0372, "step": 8121 }, { "epoch": 0.06316434550788115, "grad_norm": 0.11993917733106632, "learning_rate": 9.902330330264379e-05, "loss": 2.0384, "step": 8122 }, { "epoch": 0.06317212245266174, "grad_norm": 0.11812527250632668, "learning_rate": 9.902306297588308e-05, "loss": 2.077, "step": 8123 }, { "epoch": 0.06317989939744231, "grad_norm": 0.12075688831779122, "learning_rate": 9.902282261985021e-05, "loss": 2.0893, "step": 8124 }, { "epoch": 0.0631876763422229, "grad_norm": 0.13772634835498013, "learning_rate": 9.902258223454534e-05, "loss": 2.066, "step": 8125 }, { "epoch": 0.06319545328700348, "grad_norm": 0.12437763196124843, "learning_rate": 9.902234181996861e-05, "loss": 2.0852, "step": 8126 }, { "epoch": 0.06320323023178406, "grad_norm": 0.12605718599344373, "learning_rate": 9.902210137612013e-05, "loss": 2.0477, "step": 8127 }, { "epoch": 0.06321100717656464, "grad_norm": 0.12396932309096917, "learning_rate": 9.902186090300009e-05, "loss": 2.0933, "step": 8128 }, { "epoch": 0.06321878412134523, "grad_norm": 0.12228364716590205, "learning_rate": 9.902162040060863e-05, "loss": 2.0082, "step": 8129 }, { "epoch": 0.0632265610661258, "grad_norm": 0.13184282648948562, "learning_rate": 9.902137986894586e-05, "loss": 2.0168, "step": 8130 }, { "epoch": 0.06323433801090639, "grad_norm": 0.1443860572999566, "learning_rate": 9.902113930801193e-05, "loss": 1.9782, "step": 8131 }, { "epoch": 0.06324211495568696, "grad_norm": 0.1468553263379811, "learning_rate": 9.902089871780701e-05, "loss": 1.9913, "step": 8132 }, { "epoch": 0.06324989190046755, "grad_norm": 0.1469275069828923, "learning_rate": 9.902065809833123e-05, "loss": 2.0731, "step": 8133 }, { "epoch": 0.06325766884524812, "grad_norm": 0.11872357940429569, "learning_rate": 9.902041744958472e-05, "loss": 2.0733, "step": 8134 }, { "epoch": 0.06326544579002871, "grad_norm": 0.12443823116574595, "learning_rate": 9.902017677156764e-05, "loss": 2.0379, "step": 8135 }, { "epoch": 0.06327322273480929, "grad_norm": 0.13119553418011312, "learning_rate": 9.901993606428013e-05, "loss": 2.0718, "step": 8136 }, { "epoch": 0.06328099967958987, "grad_norm": 0.13013025713707302, "learning_rate": 9.901969532772233e-05, "loss": 2.0572, "step": 8137 }, { "epoch": 0.06328877662437046, "grad_norm": 0.11615776143061336, "learning_rate": 9.90194545618944e-05, "loss": 1.9993, "step": 8138 }, { "epoch": 0.06329655356915104, "grad_norm": 0.13035736276575743, "learning_rate": 9.901921376679645e-05, "loss": 2.0061, "step": 8139 }, { "epoch": 0.06330433051393163, "grad_norm": 0.14363942001241045, "learning_rate": 9.901897294242867e-05, "loss": 2.0898, "step": 8140 }, { "epoch": 0.0633121074587122, "grad_norm": 0.1351928378374331, "learning_rate": 9.901873208879115e-05, "loss": 2.0464, "step": 8141 }, { "epoch": 0.06331988440349279, "grad_norm": 0.11716950365792063, "learning_rate": 9.90184912058841e-05, "loss": 2.0034, "step": 8142 }, { "epoch": 0.06332766134827336, "grad_norm": 0.12489087769855006, "learning_rate": 9.901825029370759e-05, "loss": 2.0317, "step": 8143 }, { "epoch": 0.06333543829305395, "grad_norm": 0.14123242646419087, "learning_rate": 9.901800935226182e-05, "loss": 2.0372, "step": 8144 }, { "epoch": 0.06334321523783452, "grad_norm": 0.12290844715307025, "learning_rate": 9.901776838154691e-05, "loss": 2.0585, "step": 8145 }, { "epoch": 0.06335099218261511, "grad_norm": 0.11608263513284407, "learning_rate": 9.9017527381563e-05, "loss": 2.0447, "step": 8146 }, { "epoch": 0.06335876912739569, "grad_norm": 0.13186138790215635, "learning_rate": 9.901728635231025e-05, "loss": 2.0324, "step": 8147 }, { "epoch": 0.06336654607217627, "grad_norm": 0.13059644028658787, "learning_rate": 9.901704529378878e-05, "loss": 2.0065, "step": 8148 }, { "epoch": 0.06337432301695685, "grad_norm": 0.11799163532865385, "learning_rate": 9.901680420599875e-05, "loss": 2.0039, "step": 8149 }, { "epoch": 0.06338209996173744, "grad_norm": 0.1263159725961777, "learning_rate": 9.901656308894032e-05, "loss": 2.0437, "step": 8150 }, { "epoch": 0.06338987690651801, "grad_norm": 0.12589307879712416, "learning_rate": 9.901632194261362e-05, "loss": 2.0401, "step": 8151 }, { "epoch": 0.0633976538512986, "grad_norm": 0.1147519430396889, "learning_rate": 9.901608076701878e-05, "loss": 2.0245, "step": 8152 }, { "epoch": 0.06340543079607917, "grad_norm": 0.11976661388872963, "learning_rate": 9.901583956215595e-05, "loss": 2.0661, "step": 8153 }, { "epoch": 0.06341320774085976, "grad_norm": 0.13638470457118182, "learning_rate": 9.90155983280253e-05, "loss": 2.0193, "step": 8154 }, { "epoch": 0.06342098468564034, "grad_norm": 0.15699960547551509, "learning_rate": 9.901535706462693e-05, "loss": 2.0308, "step": 8155 }, { "epoch": 0.06342876163042092, "grad_norm": 0.12274972785718428, "learning_rate": 9.901511577196101e-05, "loss": 2.0047, "step": 8156 }, { "epoch": 0.0634365385752015, "grad_norm": 0.12158693448002499, "learning_rate": 9.90148744500277e-05, "loss": 2.0461, "step": 8157 }, { "epoch": 0.06344431551998209, "grad_norm": 0.12719417029227909, "learning_rate": 9.90146330988271e-05, "loss": 2.0911, "step": 8158 }, { "epoch": 0.06345209246476266, "grad_norm": 0.13110473975345505, "learning_rate": 9.901439171835941e-05, "loss": 2.0243, "step": 8159 }, { "epoch": 0.06345986940954325, "grad_norm": 0.12057000521570226, "learning_rate": 9.901415030862472e-05, "loss": 2.0339, "step": 8160 }, { "epoch": 0.06346764635432382, "grad_norm": 0.11196320280711974, "learning_rate": 9.901390886962321e-05, "loss": 2.0415, "step": 8161 }, { "epoch": 0.06347542329910441, "grad_norm": 0.11893152905085091, "learning_rate": 9.9013667401355e-05, "loss": 2.0828, "step": 8162 }, { "epoch": 0.06348320024388499, "grad_norm": 0.12301453829824119, "learning_rate": 9.901342590382025e-05, "loss": 2.017, "step": 8163 }, { "epoch": 0.06349097718866557, "grad_norm": 0.12397292535739345, "learning_rate": 9.901318437701911e-05, "loss": 2.0017, "step": 8164 }, { "epoch": 0.06349875413344615, "grad_norm": 0.20183115579007987, "learning_rate": 9.901294282095172e-05, "loss": 2.0545, "step": 8165 }, { "epoch": 0.06350653107822674, "grad_norm": 0.13935443218614715, "learning_rate": 9.90127012356182e-05, "loss": 2.0426, "step": 8166 }, { "epoch": 0.06351430802300731, "grad_norm": 0.12460425654938742, "learning_rate": 9.901245962101873e-05, "loss": 2.0073, "step": 8167 }, { "epoch": 0.0635220849677879, "grad_norm": 0.1405072760758143, "learning_rate": 9.901221797715342e-05, "loss": 2.0798, "step": 8168 }, { "epoch": 0.06352986191256847, "grad_norm": 0.14203949380234415, "learning_rate": 9.901197630402244e-05, "loss": 1.9865, "step": 8169 }, { "epoch": 0.06353763885734906, "grad_norm": 0.1194348674610696, "learning_rate": 9.901173460162594e-05, "loss": 2.0573, "step": 8170 }, { "epoch": 0.06354541580212963, "grad_norm": 0.12619327533123664, "learning_rate": 9.901149286996403e-05, "loss": 2.0733, "step": 8171 }, { "epoch": 0.06355319274691022, "grad_norm": 0.13269078611337218, "learning_rate": 9.901125110903689e-05, "loss": 2.104, "step": 8172 }, { "epoch": 0.0635609696916908, "grad_norm": 0.12640420319725926, "learning_rate": 9.901100931884464e-05, "loss": 2.0444, "step": 8173 }, { "epoch": 0.06356874663647138, "grad_norm": 0.1234819049243059, "learning_rate": 9.901076749938742e-05, "loss": 2.039, "step": 8174 }, { "epoch": 0.06357652358125196, "grad_norm": 0.17251412190425006, "learning_rate": 9.901052565066541e-05, "loss": 2.0595, "step": 8175 }, { "epoch": 0.06358430052603255, "grad_norm": 0.19960106275498202, "learning_rate": 9.901028377267872e-05, "loss": 2.0192, "step": 8176 }, { "epoch": 0.06359207747081312, "grad_norm": 0.18271393089244883, "learning_rate": 9.901004186542752e-05, "loss": 2.0676, "step": 8177 }, { "epoch": 0.06359985441559371, "grad_norm": 0.250414835809348, "learning_rate": 9.900979992891193e-05, "loss": 2.1036, "step": 8178 }, { "epoch": 0.06360763136037428, "grad_norm": 0.12205480989409297, "learning_rate": 9.90095579631321e-05, "loss": 1.9604, "step": 8179 }, { "epoch": 0.06361540830515487, "grad_norm": 0.1759276528705476, "learning_rate": 9.900931596808819e-05, "loss": 2.0682, "step": 8180 }, { "epoch": 0.06362318524993545, "grad_norm": 0.17027060416875128, "learning_rate": 9.900907394378034e-05, "loss": 1.9928, "step": 8181 }, { "epoch": 0.06363096219471603, "grad_norm": 0.11157690476964086, "learning_rate": 9.900883189020868e-05, "loss": 2.0763, "step": 8182 }, { "epoch": 0.06363873913949661, "grad_norm": 0.16033602497592125, "learning_rate": 9.900858980737336e-05, "loss": 2.0329, "step": 8183 }, { "epoch": 0.0636465160842772, "grad_norm": 0.1927482184075254, "learning_rate": 9.900834769527453e-05, "loss": 2.0476, "step": 8184 }, { "epoch": 0.06365429302905777, "grad_norm": 0.1394419586330642, "learning_rate": 9.900810555391234e-05, "loss": 2.1011, "step": 8185 }, { "epoch": 0.06366206997383836, "grad_norm": 0.12771963795804064, "learning_rate": 9.900786338328692e-05, "loss": 2.0712, "step": 8186 }, { "epoch": 0.06366984691861893, "grad_norm": 0.30187238899770824, "learning_rate": 9.900762118339843e-05, "loss": 2.0775, "step": 8187 }, { "epoch": 0.06367762386339952, "grad_norm": 0.14834369459606236, "learning_rate": 9.9007378954247e-05, "loss": 2.0479, "step": 8188 }, { "epoch": 0.0636854008081801, "grad_norm": 0.11733804713183328, "learning_rate": 9.900713669583279e-05, "loss": 2.05, "step": 8189 }, { "epoch": 0.06369317775296068, "grad_norm": 0.16180917330454603, "learning_rate": 9.900689440815593e-05, "loss": 2.0509, "step": 8190 }, { "epoch": 0.06370095469774126, "grad_norm": 0.1744350502615601, "learning_rate": 9.900665209121656e-05, "loss": 1.9946, "step": 8191 }, { "epoch": 0.06370873164252185, "grad_norm": 0.1342956556851076, "learning_rate": 9.900640974501485e-05, "loss": 2.0717, "step": 8192 }, { "epoch": 0.06371650858730242, "grad_norm": 0.12525684040927634, "learning_rate": 9.900616736955092e-05, "loss": 2.0208, "step": 8193 }, { "epoch": 0.06372428553208301, "grad_norm": 0.16246263484448026, "learning_rate": 9.900592496482494e-05, "loss": 2.0297, "step": 8194 }, { "epoch": 0.06373206247686358, "grad_norm": 0.12550201071468617, "learning_rate": 9.900568253083702e-05, "loss": 2.0387, "step": 8195 }, { "epoch": 0.06373983942164417, "grad_norm": 0.13552646629310075, "learning_rate": 9.900544006758734e-05, "loss": 2.0807, "step": 8196 }, { "epoch": 0.06374761636642476, "grad_norm": 0.17369542858615028, "learning_rate": 9.900519757507603e-05, "loss": 2.0601, "step": 8197 }, { "epoch": 0.06375539331120533, "grad_norm": 0.14223351533739925, "learning_rate": 9.900495505330322e-05, "loss": 2.029, "step": 8198 }, { "epoch": 0.06376317025598592, "grad_norm": 0.12593013330688999, "learning_rate": 9.900471250226907e-05, "loss": 2.026, "step": 8199 }, { "epoch": 0.0637709472007665, "grad_norm": 0.16961307440906498, "learning_rate": 9.900446992197373e-05, "loss": 2.0042, "step": 8200 }, { "epoch": 0.06377872414554708, "grad_norm": 0.16690565179035963, "learning_rate": 9.900422731241735e-05, "loss": 2.0496, "step": 8201 }, { "epoch": 0.06378650109032766, "grad_norm": 0.11508018884634695, "learning_rate": 9.900398467360004e-05, "loss": 2.0461, "step": 8202 }, { "epoch": 0.06379427803510825, "grad_norm": 0.16970709069081663, "learning_rate": 9.900374200552197e-05, "loss": 2.0326, "step": 8203 }, { "epoch": 0.06380205497988882, "grad_norm": 0.2195617478802018, "learning_rate": 9.900349930818329e-05, "loss": 2.0033, "step": 8204 }, { "epoch": 0.06380983192466941, "grad_norm": 0.1668471536130015, "learning_rate": 9.900325658158413e-05, "loss": 2.0531, "step": 8205 }, { "epoch": 0.06381760886944998, "grad_norm": 0.12241938622670268, "learning_rate": 9.900301382572465e-05, "loss": 2.0317, "step": 8206 }, { "epoch": 0.06382538581423057, "grad_norm": 0.17081290610994024, "learning_rate": 9.9002771040605e-05, "loss": 2.0632, "step": 8207 }, { "epoch": 0.06383316275901114, "grad_norm": 0.14006535661436778, "learning_rate": 9.900252822622529e-05, "loss": 2.0467, "step": 8208 }, { "epoch": 0.06384093970379173, "grad_norm": 0.11571379068055561, "learning_rate": 9.90022853825857e-05, "loss": 1.9905, "step": 8209 }, { "epoch": 0.0638487166485723, "grad_norm": 0.16914803854729468, "learning_rate": 9.900204250968636e-05, "loss": 2.0363, "step": 8210 }, { "epoch": 0.0638564935933529, "grad_norm": 0.12908219310079205, "learning_rate": 9.900179960752742e-05, "loss": 2.0589, "step": 8211 }, { "epoch": 0.06386427053813347, "grad_norm": 0.11442673187142365, "learning_rate": 9.900155667610901e-05, "loss": 1.9918, "step": 8212 }, { "epoch": 0.06387204748291406, "grad_norm": 0.1282802351463671, "learning_rate": 9.900131371543131e-05, "loss": 2.0287, "step": 8213 }, { "epoch": 0.06387982442769463, "grad_norm": 0.12461170247103066, "learning_rate": 9.900107072549443e-05, "loss": 2.0736, "step": 8214 }, { "epoch": 0.06388760137247522, "grad_norm": 0.17284797952595599, "learning_rate": 9.900082770629853e-05, "loss": 2.0206, "step": 8215 }, { "epoch": 0.0638953783172558, "grad_norm": 0.1233406924011316, "learning_rate": 9.900058465784375e-05, "loss": 2.0407, "step": 8216 }, { "epoch": 0.06390315526203638, "grad_norm": 0.1441479840512262, "learning_rate": 9.900034158013025e-05, "loss": 2.0946, "step": 8217 }, { "epoch": 0.06391093220681696, "grad_norm": 0.1360295577011683, "learning_rate": 9.900009847315814e-05, "loss": 1.9903, "step": 8218 }, { "epoch": 0.06391870915159754, "grad_norm": 0.11362718566392274, "learning_rate": 9.899985533692762e-05, "loss": 2.054, "step": 8219 }, { "epoch": 0.06392648609637812, "grad_norm": 0.17557967938335073, "learning_rate": 9.899961217143878e-05, "loss": 1.9979, "step": 8220 }, { "epoch": 0.0639342630411587, "grad_norm": 0.20556497715740418, "learning_rate": 9.89993689766918e-05, "loss": 2.0377, "step": 8221 }, { "epoch": 0.06394203998593928, "grad_norm": 0.1435731669830248, "learning_rate": 9.899912575268679e-05, "loss": 2.0622, "step": 8222 }, { "epoch": 0.06394981693071987, "grad_norm": 0.12303164287632583, "learning_rate": 9.899888249942395e-05, "loss": 2.0991, "step": 8223 }, { "epoch": 0.06395759387550044, "grad_norm": 0.13972142720197067, "learning_rate": 9.899863921690338e-05, "loss": 2.0541, "step": 8224 }, { "epoch": 0.06396537082028103, "grad_norm": 0.11680683868214821, "learning_rate": 9.899839590512526e-05, "loss": 2.0376, "step": 8225 }, { "epoch": 0.0639731477650616, "grad_norm": 0.17678130108511592, "learning_rate": 9.899815256408969e-05, "loss": 2.0298, "step": 8226 }, { "epoch": 0.0639809247098422, "grad_norm": 0.13249436813190016, "learning_rate": 9.899790919379683e-05, "loss": 2.0743, "step": 8227 }, { "epoch": 0.06398870165462277, "grad_norm": 0.11924742647242605, "learning_rate": 9.899766579424686e-05, "loss": 2.0243, "step": 8228 }, { "epoch": 0.06399647859940336, "grad_norm": 0.12190260471002114, "learning_rate": 9.899742236543989e-05, "loss": 2.0688, "step": 8229 }, { "epoch": 0.06400425554418393, "grad_norm": 0.12293532750596306, "learning_rate": 9.899717890737608e-05, "loss": 2.0528, "step": 8230 }, { "epoch": 0.06401203248896452, "grad_norm": 0.13390758616136372, "learning_rate": 9.899693542005558e-05, "loss": 2.0479, "step": 8231 }, { "epoch": 0.06401980943374509, "grad_norm": 0.11636533549606189, "learning_rate": 9.899669190347852e-05, "loss": 2.0589, "step": 8232 }, { "epoch": 0.06402758637852568, "grad_norm": 0.11849060492001297, "learning_rate": 9.899644835764505e-05, "loss": 2.0184, "step": 8233 }, { "epoch": 0.06403536332330625, "grad_norm": 0.11893346842566524, "learning_rate": 9.899620478255532e-05, "loss": 2.0253, "step": 8234 }, { "epoch": 0.06404314026808684, "grad_norm": 0.12791332518341286, "learning_rate": 9.899596117820949e-05, "loss": 2.0506, "step": 8235 }, { "epoch": 0.06405091721286742, "grad_norm": 0.11720525583606684, "learning_rate": 9.899571754460766e-05, "loss": 2.0637, "step": 8236 }, { "epoch": 0.064058694157648, "grad_norm": 0.12412242356290905, "learning_rate": 9.899547388175001e-05, "loss": 2.0294, "step": 8237 }, { "epoch": 0.06406647110242858, "grad_norm": 0.11674394764138889, "learning_rate": 9.899523018963669e-05, "loss": 2.0604, "step": 8238 }, { "epoch": 0.06407424804720917, "grad_norm": 0.1285375344066229, "learning_rate": 9.899498646826784e-05, "loss": 2.0012, "step": 8239 }, { "epoch": 0.06408202499198974, "grad_norm": 0.12607091002161608, "learning_rate": 9.899474271764358e-05, "loss": 2.0243, "step": 8240 }, { "epoch": 0.06408980193677033, "grad_norm": 0.12341806407833143, "learning_rate": 9.899449893776409e-05, "loss": 2.0684, "step": 8241 }, { "epoch": 0.0640975788815509, "grad_norm": 0.11853447207212278, "learning_rate": 9.899425512862951e-05, "loss": 2.0447, "step": 8242 }, { "epoch": 0.06410535582633149, "grad_norm": 0.14868774141279112, "learning_rate": 9.899401129023996e-05, "loss": 2.0636, "step": 8243 }, { "epoch": 0.06411313277111207, "grad_norm": 0.14244891183540942, "learning_rate": 9.899376742259562e-05, "loss": 2.083, "step": 8244 }, { "epoch": 0.06412090971589265, "grad_norm": 0.12142703859941764, "learning_rate": 9.899352352569661e-05, "loss": 2.0085, "step": 8245 }, { "epoch": 0.06412868666067323, "grad_norm": 0.14162784774055084, "learning_rate": 9.899327959954308e-05, "loss": 2.0231, "step": 8246 }, { "epoch": 0.06413646360545382, "grad_norm": 0.1274180606798738, "learning_rate": 9.899303564413519e-05, "loss": 2.0031, "step": 8247 }, { "epoch": 0.06414424055023439, "grad_norm": 0.11767470631863305, "learning_rate": 9.899279165947307e-05, "loss": 2.0187, "step": 8248 }, { "epoch": 0.06415201749501498, "grad_norm": 0.12114543200584373, "learning_rate": 9.899254764555688e-05, "loss": 2.0599, "step": 8249 }, { "epoch": 0.06415979443979555, "grad_norm": 0.12046820538296968, "learning_rate": 9.899230360238674e-05, "loss": 2.0815, "step": 8250 }, { "epoch": 0.06416757138457614, "grad_norm": 0.12729793710109738, "learning_rate": 9.899205952996282e-05, "loss": 2.0633, "step": 8251 }, { "epoch": 0.06417534832935672, "grad_norm": 0.12605928374032388, "learning_rate": 9.899181542828527e-05, "loss": 1.9981, "step": 8252 }, { "epoch": 0.0641831252741373, "grad_norm": 0.13796801603032408, "learning_rate": 9.899157129735422e-05, "loss": 2.0721, "step": 8253 }, { "epoch": 0.06419090221891788, "grad_norm": 0.11516152211221882, "learning_rate": 9.899132713716981e-05, "loss": 2.0376, "step": 8254 }, { "epoch": 0.06419867916369847, "grad_norm": 0.12769380494083976, "learning_rate": 9.89910829477322e-05, "loss": 2.0132, "step": 8255 }, { "epoch": 0.06420645610847905, "grad_norm": 0.13542574939583507, "learning_rate": 9.899083872904154e-05, "loss": 2.0514, "step": 8256 }, { "epoch": 0.06421423305325963, "grad_norm": 0.1246768295677121, "learning_rate": 9.899059448109795e-05, "loss": 2.0747, "step": 8257 }, { "epoch": 0.06422200999804022, "grad_norm": 0.12501507458388178, "learning_rate": 9.89903502039016e-05, "loss": 2.0968, "step": 8258 }, { "epoch": 0.06422978694282079, "grad_norm": 0.11699307180629989, "learning_rate": 9.899010589745263e-05, "loss": 2.0557, "step": 8259 }, { "epoch": 0.06423756388760138, "grad_norm": 0.12301795004326278, "learning_rate": 9.898986156175119e-05, "loss": 2.1211, "step": 8260 }, { "epoch": 0.06424534083238195, "grad_norm": 0.12280718484977184, "learning_rate": 9.898961719679742e-05, "loss": 2.0243, "step": 8261 }, { "epoch": 0.06425311777716254, "grad_norm": 0.11874106384721579, "learning_rate": 9.898937280259146e-05, "loss": 2.0107, "step": 8262 }, { "epoch": 0.06426089472194312, "grad_norm": 0.12401796517829214, "learning_rate": 9.898912837913348e-05, "loss": 2.069, "step": 8263 }, { "epoch": 0.0642686716667237, "grad_norm": 0.14859315299387596, "learning_rate": 9.898888392642359e-05, "loss": 2.1007, "step": 8264 }, { "epoch": 0.06427644861150428, "grad_norm": 0.15050377974850518, "learning_rate": 9.898863944446198e-05, "loss": 2.0909, "step": 8265 }, { "epoch": 0.06428422555628487, "grad_norm": 0.11640122232728414, "learning_rate": 9.898839493324874e-05, "loss": 2.0715, "step": 8266 }, { "epoch": 0.06429200250106544, "grad_norm": 0.12698360243460982, "learning_rate": 9.898815039278406e-05, "loss": 2.0116, "step": 8267 }, { "epoch": 0.06429977944584603, "grad_norm": 0.14125819989214433, "learning_rate": 9.898790582306808e-05, "loss": 2.0531, "step": 8268 }, { "epoch": 0.0643075563906266, "grad_norm": 0.16560191761263762, "learning_rate": 9.898766122410092e-05, "loss": 2.0377, "step": 8269 }, { "epoch": 0.06431533333540719, "grad_norm": 0.15309221458105632, "learning_rate": 9.898741659588276e-05, "loss": 2.0256, "step": 8270 }, { "epoch": 0.06432311028018776, "grad_norm": 0.11830637110643184, "learning_rate": 9.898717193841374e-05, "loss": 2.0371, "step": 8271 }, { "epoch": 0.06433088722496835, "grad_norm": 0.12304418295073052, "learning_rate": 9.898692725169397e-05, "loss": 2.0629, "step": 8272 }, { "epoch": 0.06433866416974893, "grad_norm": 0.15323062555354172, "learning_rate": 9.898668253572364e-05, "loss": 2.0528, "step": 8273 }, { "epoch": 0.06434644111452952, "grad_norm": 0.12914716246472485, "learning_rate": 9.898643779050287e-05, "loss": 2.0466, "step": 8274 }, { "epoch": 0.06435421805931009, "grad_norm": 0.12201423397833473, "learning_rate": 9.898619301603183e-05, "loss": 2.0048, "step": 8275 }, { "epoch": 0.06436199500409068, "grad_norm": 0.13619876438776252, "learning_rate": 9.898594821231065e-05, "loss": 2.0463, "step": 8276 }, { "epoch": 0.06436977194887125, "grad_norm": 0.1976705260970217, "learning_rate": 9.898570337933946e-05, "loss": 2.0686, "step": 8277 }, { "epoch": 0.06437754889365184, "grad_norm": 0.2080734853403485, "learning_rate": 9.898545851711844e-05, "loss": 2.1092, "step": 8278 }, { "epoch": 0.06438532583843241, "grad_norm": 0.14113302631922367, "learning_rate": 9.89852136256477e-05, "loss": 2.0019, "step": 8279 }, { "epoch": 0.064393102783213, "grad_norm": 0.14499952641624675, "learning_rate": 9.898496870492744e-05, "loss": 2.0569, "step": 8280 }, { "epoch": 0.06440087972799358, "grad_norm": 0.186819130074863, "learning_rate": 9.898472375495775e-05, "loss": 1.9943, "step": 8281 }, { "epoch": 0.06440865667277416, "grad_norm": 0.130199652307236, "learning_rate": 9.89844787757388e-05, "loss": 2.0263, "step": 8282 }, { "epoch": 0.06441643361755474, "grad_norm": 0.16135265359856937, "learning_rate": 9.898423376727075e-05, "loss": 2.0047, "step": 8283 }, { "epoch": 0.06442421056233533, "grad_norm": 0.18184309631717194, "learning_rate": 9.89839887295537e-05, "loss": 1.9991, "step": 8284 }, { "epoch": 0.0644319875071159, "grad_norm": 0.12768362425207638, "learning_rate": 9.898374366258786e-05, "loss": 2.0143, "step": 8285 }, { "epoch": 0.06443976445189649, "grad_norm": 0.13711336515205458, "learning_rate": 9.898349856637332e-05, "loss": 2.0217, "step": 8286 }, { "epoch": 0.06444754139667706, "grad_norm": 0.1599901685992752, "learning_rate": 9.898325344091026e-05, "loss": 2.1144, "step": 8287 }, { "epoch": 0.06445531834145765, "grad_norm": 0.12705997250979548, "learning_rate": 9.898300828619882e-05, "loss": 2.0664, "step": 8288 }, { "epoch": 0.06446309528623823, "grad_norm": 0.1254919991713794, "learning_rate": 9.898276310223915e-05, "loss": 1.9989, "step": 8289 }, { "epoch": 0.06447087223101881, "grad_norm": 0.14067575204645622, "learning_rate": 9.898251788903138e-05, "loss": 2.0657, "step": 8290 }, { "epoch": 0.06447864917579939, "grad_norm": 0.12135466823162393, "learning_rate": 9.898227264657567e-05, "loss": 2.0385, "step": 8291 }, { "epoch": 0.06448642612057998, "grad_norm": 0.13718647292631003, "learning_rate": 9.898202737487215e-05, "loss": 2.0815, "step": 8292 }, { "epoch": 0.06449420306536055, "grad_norm": 0.16497980577781418, "learning_rate": 9.8981782073921e-05, "loss": 2.0235, "step": 8293 }, { "epoch": 0.06450198001014114, "grad_norm": 0.1269807498294558, "learning_rate": 9.898153674372232e-05, "loss": 2.0388, "step": 8294 }, { "epoch": 0.06450975695492171, "grad_norm": 0.15408969949470339, "learning_rate": 9.898129138427629e-05, "loss": 1.9985, "step": 8295 }, { "epoch": 0.0645175338997023, "grad_norm": 0.18211341641510334, "learning_rate": 9.898104599558307e-05, "loss": 2.0084, "step": 8296 }, { "epoch": 0.06452531084448287, "grad_norm": 0.16283441025050016, "learning_rate": 9.898080057764276e-05, "loss": 2.0876, "step": 8297 }, { "epoch": 0.06453308778926346, "grad_norm": 0.12458680782312227, "learning_rate": 9.898055513045554e-05, "loss": 2.0191, "step": 8298 }, { "epoch": 0.06454086473404404, "grad_norm": 0.13353607016996966, "learning_rate": 9.898030965402156e-05, "loss": 2.0217, "step": 8299 }, { "epoch": 0.06454864167882463, "grad_norm": 0.16721170578620473, "learning_rate": 9.898006414834094e-05, "loss": 2.0179, "step": 8300 }, { "epoch": 0.0645564186236052, "grad_norm": 0.1389961288489003, "learning_rate": 9.897981861341383e-05, "loss": 2.0415, "step": 8301 }, { "epoch": 0.06456419556838579, "grad_norm": 0.12631647150127492, "learning_rate": 9.897957304924041e-05, "loss": 2.079, "step": 8302 }, { "epoch": 0.06457197251316636, "grad_norm": 0.22133895378610474, "learning_rate": 9.897932745582079e-05, "loss": 2.0755, "step": 8303 }, { "epoch": 0.06457974945794695, "grad_norm": 0.21949483592195002, "learning_rate": 9.897908183315514e-05, "loss": 2.0519, "step": 8304 }, { "epoch": 0.06458752640272752, "grad_norm": 0.11263926408977294, "learning_rate": 9.897883618124359e-05, "loss": 2.0372, "step": 8305 }, { "epoch": 0.06459530334750811, "grad_norm": 0.2472029084750822, "learning_rate": 9.897859050008631e-05, "loss": 2.0641, "step": 8306 }, { "epoch": 0.06460308029228869, "grad_norm": 0.24546719456003438, "learning_rate": 9.897834478968342e-05, "loss": 2.0418, "step": 8307 }, { "epoch": 0.06461085723706927, "grad_norm": 0.13404135224339714, "learning_rate": 9.897809905003507e-05, "loss": 2.0138, "step": 8308 }, { "epoch": 0.06461863418184985, "grad_norm": 0.37211906621441887, "learning_rate": 9.897785328114142e-05, "loss": 2.0068, "step": 8309 }, { "epoch": 0.06462641112663044, "grad_norm": 0.27822864378561474, "learning_rate": 9.897760748300262e-05, "loss": 1.9789, "step": 8310 }, { "epoch": 0.06463418807141101, "grad_norm": 0.18615137151451744, "learning_rate": 9.897736165561881e-05, "loss": 2.0446, "step": 8311 }, { "epoch": 0.0646419650161916, "grad_norm": 0.42503192307528703, "learning_rate": 9.897711579899012e-05, "loss": 1.9944, "step": 8312 }, { "epoch": 0.06464974196097217, "grad_norm": 0.21661443138225178, "learning_rate": 9.897686991311672e-05, "loss": 2.0236, "step": 8313 }, { "epoch": 0.06465751890575276, "grad_norm": 0.21007483184651807, "learning_rate": 9.897662399799873e-05, "loss": 1.982, "step": 8314 }, { "epoch": 0.06466529585053335, "grad_norm": 0.2500247269183148, "learning_rate": 9.897637805363634e-05, "loss": 2.006, "step": 8315 }, { "epoch": 0.06467307279531392, "grad_norm": 0.11590635878861556, "learning_rate": 9.897613208002968e-05, "loss": 2.0774, "step": 8316 }, { "epoch": 0.06468084974009451, "grad_norm": 0.21969422983828035, "learning_rate": 9.897588607717887e-05, "loss": 2.0666, "step": 8317 }, { "epoch": 0.06468862668487509, "grad_norm": 0.14168554273113146, "learning_rate": 9.897564004508408e-05, "loss": 2.0819, "step": 8318 }, { "epoch": 0.06469640362965567, "grad_norm": 0.15780595410443832, "learning_rate": 9.897539398374544e-05, "loss": 2.0574, "step": 8319 }, { "epoch": 0.06470418057443625, "grad_norm": 0.18440816843282862, "learning_rate": 9.897514789316312e-05, "loss": 2.0634, "step": 8320 }, { "epoch": 0.06471195751921684, "grad_norm": 0.1206403898741905, "learning_rate": 9.897490177333726e-05, "loss": 2.0488, "step": 8321 }, { "epoch": 0.06471973446399741, "grad_norm": 0.18555201059929421, "learning_rate": 9.8974655624268e-05, "loss": 2.0655, "step": 8322 }, { "epoch": 0.064727511408778, "grad_norm": 0.17042934693449555, "learning_rate": 9.89744094459555e-05, "loss": 2.0393, "step": 8323 }, { "epoch": 0.06473528835355857, "grad_norm": 0.12278800272434703, "learning_rate": 9.897416323839988e-05, "loss": 2.0616, "step": 8324 }, { "epoch": 0.06474306529833916, "grad_norm": 0.15621351618575785, "learning_rate": 9.897391700160132e-05, "loss": 1.9683, "step": 8325 }, { "epoch": 0.06475084224311974, "grad_norm": 0.15162810719936737, "learning_rate": 9.897367073555994e-05, "loss": 2.0259, "step": 8326 }, { "epoch": 0.06475861918790032, "grad_norm": 0.11561763220068902, "learning_rate": 9.897342444027591e-05, "loss": 2.0586, "step": 8327 }, { "epoch": 0.0647663961326809, "grad_norm": 0.12635949925845458, "learning_rate": 9.897317811574935e-05, "loss": 2.0748, "step": 8328 }, { "epoch": 0.06477417307746149, "grad_norm": 0.12663962976434032, "learning_rate": 9.897293176198045e-05, "loss": 2.0583, "step": 8329 }, { "epoch": 0.06478195002224206, "grad_norm": 0.11536590801751595, "learning_rate": 9.89726853789693e-05, "loss": 2.0449, "step": 8330 }, { "epoch": 0.06478972696702265, "grad_norm": 0.12372990197220002, "learning_rate": 9.89724389667161e-05, "loss": 2.1177, "step": 8331 }, { "epoch": 0.06479750391180322, "grad_norm": 0.13639006517242122, "learning_rate": 9.897219252522096e-05, "loss": 2.0336, "step": 8332 }, { "epoch": 0.06480528085658381, "grad_norm": 0.12418649007616125, "learning_rate": 9.897194605448406e-05, "loss": 2.0554, "step": 8333 }, { "epoch": 0.06481305780136438, "grad_norm": 0.1368927062203933, "learning_rate": 9.897169955450551e-05, "loss": 1.9739, "step": 8334 }, { "epoch": 0.06482083474614497, "grad_norm": 0.1292157471245687, "learning_rate": 9.897145302528548e-05, "loss": 2.0346, "step": 8335 }, { "epoch": 0.06482861169092555, "grad_norm": 0.12057300091503953, "learning_rate": 9.89712064668241e-05, "loss": 2.0692, "step": 8336 }, { "epoch": 0.06483638863570614, "grad_norm": 0.131617793734222, "learning_rate": 9.897095987912155e-05, "loss": 2.0367, "step": 8337 }, { "epoch": 0.06484416558048671, "grad_norm": 0.12428575578339991, "learning_rate": 9.897071326217795e-05, "loss": 2.0266, "step": 8338 }, { "epoch": 0.0648519425252673, "grad_norm": 0.1267321559215937, "learning_rate": 9.897046661599345e-05, "loss": 2.057, "step": 8339 }, { "epoch": 0.06485971947004787, "grad_norm": 0.1284704736128697, "learning_rate": 9.897021994056821e-05, "loss": 2.0938, "step": 8340 }, { "epoch": 0.06486749641482846, "grad_norm": 0.11377217328534606, "learning_rate": 9.896997323590237e-05, "loss": 2.0835, "step": 8341 }, { "epoch": 0.06487527335960903, "grad_norm": 0.12354036291275901, "learning_rate": 9.896972650199608e-05, "loss": 2.0122, "step": 8342 }, { "epoch": 0.06488305030438962, "grad_norm": 0.1388150724134291, "learning_rate": 9.896947973884947e-05, "loss": 2.0388, "step": 8343 }, { "epoch": 0.0648908272491702, "grad_norm": 0.11509012366938624, "learning_rate": 9.89692329464627e-05, "loss": 2.0088, "step": 8344 }, { "epoch": 0.06489860419395078, "grad_norm": 0.11740724274340515, "learning_rate": 9.896898612483594e-05, "loss": 2.0918, "step": 8345 }, { "epoch": 0.06490638113873136, "grad_norm": 0.12313030656853581, "learning_rate": 9.89687392739693e-05, "loss": 2.026, "step": 8346 }, { "epoch": 0.06491415808351195, "grad_norm": 0.11947104611756744, "learning_rate": 9.896849239386294e-05, "loss": 2.0773, "step": 8347 }, { "epoch": 0.06492193502829252, "grad_norm": 0.12105094842773943, "learning_rate": 9.896824548451702e-05, "loss": 2.0399, "step": 8348 }, { "epoch": 0.06492971197307311, "grad_norm": 0.11368381350742461, "learning_rate": 9.896799854593168e-05, "loss": 2.0701, "step": 8349 }, { "epoch": 0.06493748891785368, "grad_norm": 0.1389106617288571, "learning_rate": 9.896775157810704e-05, "loss": 2.0547, "step": 8350 }, { "epoch": 0.06494526586263427, "grad_norm": 0.13280488143185862, "learning_rate": 9.896750458104332e-05, "loss": 2.0742, "step": 8351 }, { "epoch": 0.06495304280741485, "grad_norm": 0.170831302083447, "learning_rate": 9.896725755474059e-05, "loss": 2.0682, "step": 8352 }, { "epoch": 0.06496081975219543, "grad_norm": 0.12883121587498014, "learning_rate": 9.896701049919903e-05, "loss": 2.0767, "step": 8353 }, { "epoch": 0.06496859669697601, "grad_norm": 0.1429428112096493, "learning_rate": 9.896676341441879e-05, "loss": 1.9932, "step": 8354 }, { "epoch": 0.0649763736417566, "grad_norm": 0.13835951996997997, "learning_rate": 9.89665163004e-05, "loss": 2.0182, "step": 8355 }, { "epoch": 0.06498415058653717, "grad_norm": 0.12054549445181104, "learning_rate": 9.896626915714285e-05, "loss": 2.0222, "step": 8356 }, { "epoch": 0.06499192753131776, "grad_norm": 0.12513494778966977, "learning_rate": 9.896602198464743e-05, "loss": 2.0627, "step": 8357 }, { "epoch": 0.06499970447609833, "grad_norm": 0.1266879213048604, "learning_rate": 9.896577478291392e-05, "loss": 2.0431, "step": 8358 }, { "epoch": 0.06500748142087892, "grad_norm": 0.12134233020381133, "learning_rate": 9.896552755194248e-05, "loss": 2.0668, "step": 8359 }, { "epoch": 0.0650152583656595, "grad_norm": 0.11561911551599956, "learning_rate": 9.896528029173322e-05, "loss": 2.0524, "step": 8360 }, { "epoch": 0.06502303531044008, "grad_norm": 0.11620795709286862, "learning_rate": 9.896503300228633e-05, "loss": 2.0983, "step": 8361 }, { "epoch": 0.06503081225522066, "grad_norm": 0.12829576483321664, "learning_rate": 9.896478568360191e-05, "loss": 2.0229, "step": 8362 }, { "epoch": 0.06503858920000125, "grad_norm": 0.19323599293124788, "learning_rate": 9.896453833568016e-05, "loss": 2.0221, "step": 8363 }, { "epoch": 0.06504636614478182, "grad_norm": 0.14195264845094085, "learning_rate": 9.896429095852119e-05, "loss": 2.0419, "step": 8364 }, { "epoch": 0.06505414308956241, "grad_norm": 0.12522273602212172, "learning_rate": 9.896404355212516e-05, "loss": 2.0309, "step": 8365 }, { "epoch": 0.06506192003434298, "grad_norm": 0.1274580042653655, "learning_rate": 9.896379611649221e-05, "loss": 2.0523, "step": 8366 }, { "epoch": 0.06506969697912357, "grad_norm": 0.15604758648299238, "learning_rate": 9.89635486516225e-05, "loss": 2.003, "step": 8367 }, { "epoch": 0.06507747392390414, "grad_norm": 0.11936664587174835, "learning_rate": 9.896330115751617e-05, "loss": 2.0288, "step": 8368 }, { "epoch": 0.06508525086868473, "grad_norm": 0.1305533043878788, "learning_rate": 9.896305363417339e-05, "loss": 2.0132, "step": 8369 }, { "epoch": 0.0650930278134653, "grad_norm": 0.12780850342778874, "learning_rate": 9.896280608159427e-05, "loss": 1.9719, "step": 8370 }, { "epoch": 0.0651008047582459, "grad_norm": 0.12712026422075162, "learning_rate": 9.896255849977898e-05, "loss": 2.0465, "step": 8371 }, { "epoch": 0.06510858170302647, "grad_norm": 0.17223847690773292, "learning_rate": 9.896231088872768e-05, "loss": 2.0268, "step": 8372 }, { "epoch": 0.06511635864780706, "grad_norm": 0.14201022391091062, "learning_rate": 9.896206324844047e-05, "loss": 2.0806, "step": 8373 }, { "epoch": 0.06512413559258765, "grad_norm": 0.1237327466548445, "learning_rate": 9.896181557891756e-05, "loss": 2.0634, "step": 8374 }, { "epoch": 0.06513191253736822, "grad_norm": 0.14588465096337633, "learning_rate": 9.896156788015905e-05, "loss": 2.0601, "step": 8375 }, { "epoch": 0.06513968948214881, "grad_norm": 0.15459510629266573, "learning_rate": 9.89613201521651e-05, "loss": 2.0647, "step": 8376 }, { "epoch": 0.06514746642692938, "grad_norm": 0.20710411024361572, "learning_rate": 9.896107239493587e-05, "loss": 2.0811, "step": 8377 }, { "epoch": 0.06515524337170997, "grad_norm": 0.11342318987961673, "learning_rate": 9.89608246084715e-05, "loss": 2.1077, "step": 8378 }, { "epoch": 0.06516302031649054, "grad_norm": 0.13477032292214086, "learning_rate": 9.896057679277216e-05, "loss": 2.0217, "step": 8379 }, { "epoch": 0.06517079726127113, "grad_norm": 0.1341185575588772, "learning_rate": 9.896032894783795e-05, "loss": 2.0031, "step": 8380 }, { "epoch": 0.0651785742060517, "grad_norm": 0.12094437911227504, "learning_rate": 9.896008107366905e-05, "loss": 2.0523, "step": 8381 }, { "epoch": 0.0651863511508323, "grad_norm": 0.12373155844443663, "learning_rate": 9.895983317026561e-05, "loss": 2.031, "step": 8382 }, { "epoch": 0.06519412809561287, "grad_norm": 0.1376084971486088, "learning_rate": 9.895958523762777e-05, "loss": 2.0155, "step": 8383 }, { "epoch": 0.06520190504039346, "grad_norm": 0.20411117385620905, "learning_rate": 9.895933727575568e-05, "loss": 2.1221, "step": 8384 }, { "epoch": 0.06520968198517403, "grad_norm": 0.1211315338209869, "learning_rate": 9.895908928464948e-05, "loss": 2.0417, "step": 8385 }, { "epoch": 0.06521745892995462, "grad_norm": 0.13661089834213078, "learning_rate": 9.895884126430934e-05, "loss": 2.0625, "step": 8386 }, { "epoch": 0.0652252358747352, "grad_norm": 0.1407440511144969, "learning_rate": 9.895859321473537e-05, "loss": 2.0308, "step": 8387 }, { "epoch": 0.06523301281951578, "grad_norm": 0.11731800824448388, "learning_rate": 9.895834513592777e-05, "loss": 1.9976, "step": 8388 }, { "epoch": 0.06524078976429636, "grad_norm": 0.13089284268973073, "learning_rate": 9.895809702788664e-05, "loss": 2.0488, "step": 8389 }, { "epoch": 0.06524856670907694, "grad_norm": 0.14372884594653035, "learning_rate": 9.895784889061216e-05, "loss": 2.0273, "step": 8390 }, { "epoch": 0.06525634365385752, "grad_norm": 0.12583916117717886, "learning_rate": 9.895760072410446e-05, "loss": 2.0066, "step": 8391 }, { "epoch": 0.0652641205986381, "grad_norm": 0.12540117114125915, "learning_rate": 9.895735252836369e-05, "loss": 2.0285, "step": 8392 }, { "epoch": 0.06527189754341868, "grad_norm": 0.11946014562721359, "learning_rate": 9.895710430339e-05, "loss": 2.0861, "step": 8393 }, { "epoch": 0.06527967448819927, "grad_norm": 0.1222388345950369, "learning_rate": 9.895685604918357e-05, "loss": 2.0599, "step": 8394 }, { "epoch": 0.06528745143297984, "grad_norm": 0.14182206964294397, "learning_rate": 9.895660776574449e-05, "loss": 2.0475, "step": 8395 }, { "epoch": 0.06529522837776043, "grad_norm": 0.12457337325271353, "learning_rate": 9.895635945307293e-05, "loss": 2.0839, "step": 8396 }, { "epoch": 0.065303005322541, "grad_norm": 0.11532721382372418, "learning_rate": 9.895611111116907e-05, "loss": 2.0166, "step": 8397 }, { "epoch": 0.0653107822673216, "grad_norm": 0.11951251888726191, "learning_rate": 9.895586274003302e-05, "loss": 2.0342, "step": 8398 }, { "epoch": 0.06531855921210217, "grad_norm": 0.11945374349799415, "learning_rate": 9.895561433966495e-05, "loss": 2.0178, "step": 8399 }, { "epoch": 0.06532633615688276, "grad_norm": 0.11352944159482015, "learning_rate": 9.895536591006499e-05, "loss": 2.0969, "step": 8400 }, { "epoch": 0.06533411310166333, "grad_norm": 0.14358297406429754, "learning_rate": 9.895511745123331e-05, "loss": 2.0564, "step": 8401 }, { "epoch": 0.06534189004644392, "grad_norm": 0.15262335335018665, "learning_rate": 9.895486896317003e-05, "loss": 2.0263, "step": 8402 }, { "epoch": 0.06534966699122449, "grad_norm": 0.13515386733231002, "learning_rate": 9.895462044587533e-05, "loss": 2.0556, "step": 8403 }, { "epoch": 0.06535744393600508, "grad_norm": 0.1332504890678089, "learning_rate": 9.895437189934936e-05, "loss": 2.0503, "step": 8404 }, { "epoch": 0.06536522088078565, "grad_norm": 0.17656391392876256, "learning_rate": 9.895412332359221e-05, "loss": 1.9865, "step": 8405 }, { "epoch": 0.06537299782556624, "grad_norm": 0.20346900335343004, "learning_rate": 9.89538747186041e-05, "loss": 2.0327, "step": 8406 }, { "epoch": 0.06538077477034682, "grad_norm": 0.13259914348404095, "learning_rate": 9.895362608438514e-05, "loss": 2.0278, "step": 8407 }, { "epoch": 0.0653885517151274, "grad_norm": 0.13706185753081326, "learning_rate": 9.895337742093549e-05, "loss": 1.9746, "step": 8408 }, { "epoch": 0.06539632865990798, "grad_norm": 0.18699099045243053, "learning_rate": 9.895312872825531e-05, "loss": 2.0217, "step": 8409 }, { "epoch": 0.06540410560468857, "grad_norm": 0.1479777788118621, "learning_rate": 9.89528800063447e-05, "loss": 1.9833, "step": 8410 }, { "epoch": 0.06541188254946914, "grad_norm": 0.11673238036585971, "learning_rate": 9.895263125520388e-05, "loss": 2.0719, "step": 8411 }, { "epoch": 0.06541965949424973, "grad_norm": 0.15204296561121428, "learning_rate": 9.895238247483294e-05, "loss": 2.0521, "step": 8412 }, { "epoch": 0.0654274364390303, "grad_norm": 0.14245685003617242, "learning_rate": 9.895213366523206e-05, "loss": 2.0367, "step": 8413 }, { "epoch": 0.06543521338381089, "grad_norm": 0.12000481676643869, "learning_rate": 9.895188482640137e-05, "loss": 2.0245, "step": 8414 }, { "epoch": 0.06544299032859147, "grad_norm": 0.1327611080446035, "learning_rate": 9.895163595834102e-05, "loss": 2.0349, "step": 8415 }, { "epoch": 0.06545076727337205, "grad_norm": 0.14998450200532737, "learning_rate": 9.895138706105119e-05, "loss": 2.0176, "step": 8416 }, { "epoch": 0.06545854421815263, "grad_norm": 0.12469451098975294, "learning_rate": 9.895113813453198e-05, "loss": 2.0466, "step": 8417 }, { "epoch": 0.06546632116293322, "grad_norm": 0.118730917618027, "learning_rate": 9.895088917878359e-05, "loss": 2.059, "step": 8418 }, { "epoch": 0.06547409810771379, "grad_norm": 0.12126783427053364, "learning_rate": 9.895064019380612e-05, "loss": 2.0586, "step": 8419 }, { "epoch": 0.06548187505249438, "grad_norm": 0.11069623649089483, "learning_rate": 9.895039117959975e-05, "loss": 2.0323, "step": 8420 }, { "epoch": 0.06548965199727495, "grad_norm": 0.1387279883434311, "learning_rate": 9.895014213616462e-05, "loss": 2.0605, "step": 8421 }, { "epoch": 0.06549742894205554, "grad_norm": 0.16275734072923112, "learning_rate": 9.894989306350087e-05, "loss": 2.086, "step": 8422 }, { "epoch": 0.06550520588683612, "grad_norm": 0.12687447163996715, "learning_rate": 9.894964396160866e-05, "loss": 2.0395, "step": 8423 }, { "epoch": 0.0655129828316167, "grad_norm": 0.13236157473628665, "learning_rate": 9.894939483048813e-05, "loss": 2.01, "step": 8424 }, { "epoch": 0.06552075977639728, "grad_norm": 0.16549415892701397, "learning_rate": 9.894914567013945e-05, "loss": 2.0084, "step": 8425 }, { "epoch": 0.06552853672117787, "grad_norm": 0.14737131243345167, "learning_rate": 9.894889648056273e-05, "loss": 2.0536, "step": 8426 }, { "epoch": 0.06553631366595844, "grad_norm": 0.11597666975550401, "learning_rate": 9.894864726175816e-05, "loss": 1.9985, "step": 8427 }, { "epoch": 0.06554409061073903, "grad_norm": 0.5324983624966838, "learning_rate": 9.894839801372586e-05, "loss": 2.0735, "step": 8428 }, { "epoch": 0.0655518675555196, "grad_norm": 0.1428788883189662, "learning_rate": 9.8948148736466e-05, "loss": 2.0904, "step": 8429 }, { "epoch": 0.06555964450030019, "grad_norm": 0.1333895062205421, "learning_rate": 9.894789942997871e-05, "loss": 2.0192, "step": 8430 }, { "epoch": 0.06556742144508076, "grad_norm": 0.19111174621590885, "learning_rate": 9.894765009426416e-05, "loss": 2.094, "step": 8431 }, { "epoch": 0.06557519838986135, "grad_norm": 0.1642013838981315, "learning_rate": 9.894740072932247e-05, "loss": 2.1048, "step": 8432 }, { "epoch": 0.06558297533464194, "grad_norm": 0.12088493289424244, "learning_rate": 9.894715133515381e-05, "loss": 2.0736, "step": 8433 }, { "epoch": 0.06559075227942252, "grad_norm": 0.4269538870671598, "learning_rate": 9.894690191175833e-05, "loss": 1.9984, "step": 8434 }, { "epoch": 0.0655985292242031, "grad_norm": 0.13614384630308676, "learning_rate": 9.894665245913616e-05, "loss": 2.0687, "step": 8435 }, { "epoch": 0.06560630616898368, "grad_norm": 0.12235826465964611, "learning_rate": 9.894640297728747e-05, "loss": 2.0433, "step": 8436 }, { "epoch": 0.06561408311376427, "grad_norm": 0.1221861639345736, "learning_rate": 9.89461534662124e-05, "loss": 2.0229, "step": 8437 }, { "epoch": 0.06562186005854484, "grad_norm": 0.14240030131848175, "learning_rate": 9.89459039259111e-05, "loss": 2.0648, "step": 8438 }, { "epoch": 0.06562963700332543, "grad_norm": 0.12546230809713585, "learning_rate": 9.894565435638372e-05, "loss": 1.9874, "step": 8439 }, { "epoch": 0.065637413948106, "grad_norm": 0.12428297011669273, "learning_rate": 9.894540475763039e-05, "loss": 2.0332, "step": 8440 }, { "epoch": 0.06564519089288659, "grad_norm": 0.1386783035702606, "learning_rate": 9.894515512965131e-05, "loss": 2.0661, "step": 8441 }, { "epoch": 0.06565296783766716, "grad_norm": 0.12916758767469916, "learning_rate": 9.894490547244657e-05, "loss": 2.0368, "step": 8442 }, { "epoch": 0.06566074478244775, "grad_norm": 0.12014587551175644, "learning_rate": 9.894465578601634e-05, "loss": 2.0853, "step": 8443 }, { "epoch": 0.06566852172722833, "grad_norm": 0.14091138253633254, "learning_rate": 9.89444060703608e-05, "loss": 2.0229, "step": 8444 }, { "epoch": 0.06567629867200891, "grad_norm": 0.1910691443171986, "learning_rate": 9.894415632548005e-05, "loss": 2.0531, "step": 8445 }, { "epoch": 0.06568407561678949, "grad_norm": 0.20734170381393247, "learning_rate": 9.894390655137427e-05, "loss": 2.0102, "step": 8446 }, { "epoch": 0.06569185256157008, "grad_norm": 0.34987905792717255, "learning_rate": 9.89436567480436e-05, "loss": 2.0289, "step": 8447 }, { "epoch": 0.06569962950635065, "grad_norm": 0.12303543951940557, "learning_rate": 9.89434069154882e-05, "loss": 2.0246, "step": 8448 }, { "epoch": 0.06570740645113124, "grad_norm": 0.15351432403203275, "learning_rate": 9.894315705370819e-05, "loss": 2.0334, "step": 8449 }, { "epoch": 0.06571518339591181, "grad_norm": 0.1561128787385808, "learning_rate": 9.894290716270376e-05, "loss": 2.0307, "step": 8450 }, { "epoch": 0.0657229603406924, "grad_norm": 0.14851192069558075, "learning_rate": 9.894265724247502e-05, "loss": 2.0275, "step": 8451 }, { "epoch": 0.06573073728547298, "grad_norm": 0.12546968128959224, "learning_rate": 9.894240729302215e-05, "loss": 1.9989, "step": 8452 }, { "epoch": 0.06573851423025356, "grad_norm": 0.12206107705977207, "learning_rate": 9.894215731434527e-05, "loss": 2.0281, "step": 8453 }, { "epoch": 0.06574629117503414, "grad_norm": 0.14835384699917475, "learning_rate": 9.894190730644458e-05, "loss": 2.057, "step": 8454 }, { "epoch": 0.06575406811981473, "grad_norm": 0.1433706277010375, "learning_rate": 9.894165726932015e-05, "loss": 2.0579, "step": 8455 }, { "epoch": 0.0657618450645953, "grad_norm": 0.1258979891119105, "learning_rate": 9.89414072029722e-05, "loss": 1.9714, "step": 8456 }, { "epoch": 0.06576962200937589, "grad_norm": 0.1172101510334289, "learning_rate": 9.894115710740085e-05, "loss": 1.9845, "step": 8457 }, { "epoch": 0.06577739895415646, "grad_norm": 0.11603134894020498, "learning_rate": 9.894090698260625e-05, "loss": 2.0506, "step": 8458 }, { "epoch": 0.06578517589893705, "grad_norm": 0.11135357024878494, "learning_rate": 9.894065682858856e-05, "loss": 2.0203, "step": 8459 }, { "epoch": 0.06579295284371763, "grad_norm": 0.11642848450982254, "learning_rate": 9.894040664534791e-05, "loss": 2.0118, "step": 8460 }, { "epoch": 0.06580072978849821, "grad_norm": 0.12190918110399321, "learning_rate": 9.894015643288448e-05, "loss": 1.9948, "step": 8461 }, { "epoch": 0.06580850673327879, "grad_norm": 0.21088993347115384, "learning_rate": 9.893990619119838e-05, "loss": 2.031, "step": 8462 }, { "epoch": 0.06581628367805938, "grad_norm": 0.13286629723573914, "learning_rate": 9.89396559202898e-05, "loss": 2.0382, "step": 8463 }, { "epoch": 0.06582406062283995, "grad_norm": 0.14112458026360145, "learning_rate": 9.893940562015885e-05, "loss": 2.0786, "step": 8464 }, { "epoch": 0.06583183756762054, "grad_norm": 0.1330080274969965, "learning_rate": 9.893915529080571e-05, "loss": 2.0754, "step": 8465 }, { "epoch": 0.06583961451240111, "grad_norm": 0.11503692904437156, "learning_rate": 9.89389049322305e-05, "loss": 2.0364, "step": 8466 }, { "epoch": 0.0658473914571817, "grad_norm": 0.12049929351857946, "learning_rate": 9.89386545444334e-05, "loss": 2.0955, "step": 8467 }, { "epoch": 0.06585516840196227, "grad_norm": 0.1349077239886133, "learning_rate": 9.893840412741455e-05, "loss": 2.036, "step": 8468 }, { "epoch": 0.06586294534674286, "grad_norm": 0.1433618406561431, "learning_rate": 9.89381536811741e-05, "loss": 2.0318, "step": 8469 }, { "epoch": 0.06587072229152344, "grad_norm": 0.14066707784332766, "learning_rate": 9.893790320571219e-05, "loss": 2.0418, "step": 8470 }, { "epoch": 0.06587849923630403, "grad_norm": 0.1386792052634645, "learning_rate": 9.8937652701029e-05, "loss": 2.0645, "step": 8471 }, { "epoch": 0.0658862761810846, "grad_norm": 0.12064981902703431, "learning_rate": 9.893740216712461e-05, "loss": 2.0385, "step": 8472 }, { "epoch": 0.06589405312586519, "grad_norm": 0.11704608778444467, "learning_rate": 9.893715160399924e-05, "loss": 2.059, "step": 8473 }, { "epoch": 0.06590183007064576, "grad_norm": 0.11747904665454639, "learning_rate": 9.893690101165303e-05, "loss": 2.0295, "step": 8474 }, { "epoch": 0.06590960701542635, "grad_norm": 0.11532278510329035, "learning_rate": 9.893665039008608e-05, "loss": 2.0191, "step": 8475 }, { "epoch": 0.06591738396020692, "grad_norm": 0.13910679460943853, "learning_rate": 9.89363997392986e-05, "loss": 1.9808, "step": 8476 }, { "epoch": 0.06592516090498751, "grad_norm": 0.10913633265711788, "learning_rate": 9.893614905929071e-05, "loss": 2.0507, "step": 8477 }, { "epoch": 0.06593293784976809, "grad_norm": 0.11839693611657677, "learning_rate": 9.893589835006255e-05, "loss": 1.9901, "step": 8478 }, { "epoch": 0.06594071479454867, "grad_norm": 0.19138272944814458, "learning_rate": 9.893564761161429e-05, "loss": 2.0462, "step": 8479 }, { "epoch": 0.06594849173932925, "grad_norm": 0.1213102165390148, "learning_rate": 9.893539684394608e-05, "loss": 2.032, "step": 8480 }, { "epoch": 0.06595626868410984, "grad_norm": 0.12050180561247849, "learning_rate": 9.893514604705805e-05, "loss": 2.0231, "step": 8481 }, { "epoch": 0.06596404562889041, "grad_norm": 0.4401518679382781, "learning_rate": 9.893489522095036e-05, "loss": 2.0475, "step": 8482 }, { "epoch": 0.065971822573671, "grad_norm": 0.1282039233353678, "learning_rate": 9.893464436562318e-05, "loss": 2.0553, "step": 8483 }, { "epoch": 0.06597959951845157, "grad_norm": 0.13445256739945124, "learning_rate": 9.893439348107663e-05, "loss": 2.0457, "step": 8484 }, { "epoch": 0.06598737646323216, "grad_norm": 0.12392939868294017, "learning_rate": 9.893414256731089e-05, "loss": 2.0071, "step": 8485 }, { "epoch": 0.06599515340801274, "grad_norm": 0.12702641136117282, "learning_rate": 9.893389162432607e-05, "loss": 2.0966, "step": 8486 }, { "epoch": 0.06600293035279332, "grad_norm": 0.12262815304439215, "learning_rate": 9.893364065212235e-05, "loss": 2.0207, "step": 8487 }, { "epoch": 0.0660107072975739, "grad_norm": 0.12716802872768698, "learning_rate": 9.893338965069984e-05, "loss": 2.0324, "step": 8488 }, { "epoch": 0.06601848424235449, "grad_norm": 0.1324178032825936, "learning_rate": 9.893313862005876e-05, "loss": 2.0702, "step": 8489 }, { "epoch": 0.06602626118713506, "grad_norm": 0.12339726410006151, "learning_rate": 9.89328875601992e-05, "loss": 2.1099, "step": 8490 }, { "epoch": 0.06603403813191565, "grad_norm": 0.1718619189244023, "learning_rate": 9.893263647112135e-05, "loss": 2.083, "step": 8491 }, { "epoch": 0.06604181507669622, "grad_norm": 0.22177144130681029, "learning_rate": 9.893238535282534e-05, "loss": 2.0414, "step": 8492 }, { "epoch": 0.06604959202147681, "grad_norm": 0.20971857080981493, "learning_rate": 9.893213420531131e-05, "loss": 2.1055, "step": 8493 }, { "epoch": 0.0660573689662574, "grad_norm": 0.14045764917179343, "learning_rate": 9.893188302857941e-05, "loss": 2.0553, "step": 8494 }, { "epoch": 0.06606514591103797, "grad_norm": 0.11669646359672195, "learning_rate": 9.893163182262981e-05, "loss": 2.0321, "step": 8495 }, { "epoch": 0.06607292285581856, "grad_norm": 0.16967655997783132, "learning_rate": 9.893138058746266e-05, "loss": 2.0215, "step": 8496 }, { "epoch": 0.06608069980059914, "grad_norm": 0.17711233463688986, "learning_rate": 9.893112932307809e-05, "loss": 2.0373, "step": 8497 }, { "epoch": 0.06608847674537972, "grad_norm": 0.13284016166725676, "learning_rate": 9.893087802947628e-05, "loss": 2.0413, "step": 8498 }, { "epoch": 0.0660962536901603, "grad_norm": 0.12677443052993098, "learning_rate": 9.893062670665733e-05, "loss": 1.995, "step": 8499 }, { "epoch": 0.06610403063494089, "grad_norm": 0.1585934425282612, "learning_rate": 9.893037535462144e-05, "loss": 2.0618, "step": 8500 } ], "logging_steps": 1, "max_steps": 128585, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 9176292519837696.0, "train_batch_size": 11, "trial_name": null, "trial_params": null }