{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 4154, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00024077529645458377, "grad_norm": 26.920116424560547, "learning_rate": 0.0, "loss": 4.5785, "step": 1 }, { "epoch": 0.00048155059290916753, "grad_norm": 8.383347511291504, "learning_rate": 1.6000000000000001e-06, "loss": 5.9464, "step": 2 }, { "epoch": 0.0007223258893637513, "grad_norm": 16.050535202026367, "learning_rate": 3.2000000000000003e-06, "loss": 7.4406, "step": 3 }, { "epoch": 0.0009631011858183351, "grad_norm": 20.88637351989746, "learning_rate": 4.800000000000001e-06, "loss": 9.6416, "step": 4 }, { "epoch": 0.0012038764822729189, "grad_norm": 10.887105941772461, "learning_rate": 6.4000000000000006e-06, "loss": 4.9843, "step": 5 }, { "epoch": 0.0014446517787275025, "grad_norm": 12.895423889160156, "learning_rate": 8.000000000000001e-06, "loss": 4.7898, "step": 6 }, { "epoch": 0.0016854270751820862, "grad_norm": 11.34997844696045, "learning_rate": 9.600000000000001e-06, "loss": 6.4794, "step": 7 }, { "epoch": 0.0019262023716366701, "grad_norm": 11.045844078063965, "learning_rate": 1.1200000000000001e-05, "loss": 5.5106, "step": 8 }, { "epoch": 0.002166977668091254, "grad_norm": 6.337793827056885, "learning_rate": 1.2800000000000001e-05, "loss": 4.1446, "step": 9 }, { "epoch": 0.0024077529645458377, "grad_norm": 16.309860229492188, "learning_rate": 1.44e-05, "loss": 6.5782, "step": 10 }, { "epoch": 0.002648528261000421, "grad_norm": 18.269319534301758, "learning_rate": 1.6000000000000003e-05, "loss": 7.8044, "step": 11 }, { "epoch": 0.002889303557455005, "grad_norm": 3.6918132305145264, "learning_rate": 1.76e-05, "loss": 2.8253, "step": 12 }, { "epoch": 0.003130078853909589, "grad_norm": 13.319107055664062, "learning_rate": 1.9200000000000003e-05, "loss": 5.9873, "step": 13 }, { "epoch": 0.0033708541503641725, "grad_norm": 16.075435638427734, "learning_rate": 2.08e-05, "loss": 5.123, "step": 14 }, { "epoch": 0.0036116294468187564, "grad_norm": 13.996861457824707, "learning_rate": 2.2400000000000002e-05, "loss": 6.5848, "step": 15 }, { "epoch": 0.0038524047432733403, "grad_norm": 23.012784957885742, "learning_rate": 2.4e-05, "loss": 9.885, "step": 16 }, { "epoch": 0.004093180039727924, "grad_norm": 8.31369686126709, "learning_rate": 2.5600000000000002e-05, "loss": 6.0589, "step": 17 }, { "epoch": 0.004333955336182508, "grad_norm": 20.620271682739258, "learning_rate": 2.7200000000000004e-05, "loss": 5.3127, "step": 18 }, { "epoch": 0.004574730632637091, "grad_norm": 13.044432640075684, "learning_rate": 2.88e-05, "loss": 5.1087, "step": 19 }, { "epoch": 0.004815505929091675, "grad_norm": 31.605579376220703, "learning_rate": 3.04e-05, "loss": 6.1106, "step": 20 }, { "epoch": 0.005056281225546259, "grad_norm": 8.28500747680664, "learning_rate": 3.2000000000000005e-05, "loss": 4.5685, "step": 21 }, { "epoch": 0.005297056522000842, "grad_norm": 14.513694763183594, "learning_rate": 3.3600000000000004e-05, "loss": 7.3525, "step": 22 }, { "epoch": 0.005537831818455427, "grad_norm": 10.957548141479492, "learning_rate": 3.52e-05, "loss": 6.184, "step": 23 }, { "epoch": 0.00577860711491001, "grad_norm": 6.602078914642334, "learning_rate": 3.68e-05, "loss": 3.2581, "step": 24 }, { "epoch": 0.006019382411364594, "grad_norm": 10.11325454711914, "learning_rate": 3.8400000000000005e-05, "loss": 7.9033, "step": 25 }, { "epoch": 0.006260157707819178, "grad_norm": 10.033835411071777, "learning_rate": 4e-05, "loss": 4.9469, "step": 26 }, { "epoch": 0.0065009330042737614, "grad_norm": 10.819141387939453, "learning_rate": 4.16e-05, "loss": 7.46, "step": 27 }, { "epoch": 0.006741708300728345, "grad_norm": 25.810640335083008, "learning_rate": 4.32e-05, "loss": 5.7763, "step": 28 }, { "epoch": 0.006982483597182929, "grad_norm": 10.680785179138184, "learning_rate": 4.4800000000000005e-05, "loss": 5.9753, "step": 29 }, { "epoch": 0.007223258893637513, "grad_norm": 14.47507381439209, "learning_rate": 4.64e-05, "loss": 5.2033, "step": 30 }, { "epoch": 0.007464034190092096, "grad_norm": 17.440105438232422, "learning_rate": 4.8e-05, "loss": 5.6511, "step": 31 }, { "epoch": 0.0077048094865466805, "grad_norm": 10.847347259521484, "learning_rate": 4.96e-05, "loss": 6.7464, "step": 32 }, { "epoch": 0.007945584783001265, "grad_norm": 9.08476734161377, "learning_rate": 5.1200000000000004e-05, "loss": 5.9733, "step": 33 }, { "epoch": 0.008186360079455848, "grad_norm": 6.558286190032959, "learning_rate": 5.28e-05, "loss": 4.6828, "step": 34 }, { "epoch": 0.008427135375910432, "grad_norm": 33.23648452758789, "learning_rate": 5.440000000000001e-05, "loss": 5.5094, "step": 35 }, { "epoch": 0.008667910672365015, "grad_norm": 8.349298477172852, "learning_rate": 5.6000000000000006e-05, "loss": 4.1564, "step": 36 }, { "epoch": 0.008908685968819599, "grad_norm": 42.14231491088867, "learning_rate": 5.76e-05, "loss": 4.5537, "step": 37 }, { "epoch": 0.009149461265274182, "grad_norm": 14.191291809082031, "learning_rate": 5.92e-05, "loss": 6.6075, "step": 38 }, { "epoch": 0.009390236561728767, "grad_norm": 73.26921844482422, "learning_rate": 6.08e-05, "loss": 6.399, "step": 39 }, { "epoch": 0.00963101185818335, "grad_norm": 14.339468002319336, "learning_rate": 6.24e-05, "loss": 5.774, "step": 40 }, { "epoch": 0.009871787154637934, "grad_norm": 15.463168144226074, "learning_rate": 6.400000000000001e-05, "loss": 5.1967, "step": 41 }, { "epoch": 0.010112562451092518, "grad_norm": 49.4256477355957, "learning_rate": 6.560000000000001e-05, "loss": 3.4196, "step": 42 }, { "epoch": 0.010353337747547101, "grad_norm": 28.241819381713867, "learning_rate": 6.720000000000001e-05, "loss": 5.9708, "step": 43 }, { "epoch": 0.010594113044001685, "grad_norm": 7.766085624694824, "learning_rate": 6.879999999999999e-05, "loss": 5.5671, "step": 44 }, { "epoch": 0.01083488834045627, "grad_norm": 6.781948566436768, "learning_rate": 7.04e-05, "loss": 3.8236, "step": 45 }, { "epoch": 0.011075663636910853, "grad_norm": 22.539283752441406, "learning_rate": 7.2e-05, "loss": 5.7164, "step": 46 }, { "epoch": 0.011316438933365437, "grad_norm": 138.10426330566406, "learning_rate": 7.36e-05, "loss": 4.4738, "step": 47 }, { "epoch": 0.01155721422982002, "grad_norm": 6.446707725524902, "learning_rate": 7.52e-05, "loss": 3.4618, "step": 48 }, { "epoch": 0.011797989526274604, "grad_norm": 11.496111869812012, "learning_rate": 7.680000000000001e-05, "loss": 7.4023, "step": 49 }, { "epoch": 0.012038764822729187, "grad_norm": 12.171050071716309, "learning_rate": 7.840000000000001e-05, "loss": 5.3548, "step": 50 }, { "epoch": 0.012279540119183772, "grad_norm": 6.225719928741455, "learning_rate": 8e-05, "loss": 3.7378, "step": 51 }, { "epoch": 0.012520315415638356, "grad_norm": 30.29821014404297, "learning_rate": 8.16e-05, "loss": 4.3953, "step": 52 }, { "epoch": 0.01276109071209294, "grad_norm": 8.839107513427734, "learning_rate": 8.32e-05, "loss": 6.1286, "step": 53 }, { "epoch": 0.013001866008547523, "grad_norm": 15.737375259399414, "learning_rate": 8.48e-05, "loss": 4.3585, "step": 54 }, { "epoch": 0.013242641305002106, "grad_norm": 13.612770080566406, "learning_rate": 8.64e-05, "loss": 5.2315, "step": 55 }, { "epoch": 0.01348341660145669, "grad_norm": 20.0008544921875, "learning_rate": 8.800000000000001e-05, "loss": 5.8228, "step": 56 }, { "epoch": 0.013724191897911275, "grad_norm": 5.86611795425415, "learning_rate": 8.960000000000001e-05, "loss": 4.7417, "step": 57 }, { "epoch": 0.013964967194365858, "grad_norm": 11.262532234191895, "learning_rate": 9.120000000000001e-05, "loss": 4.7822, "step": 58 }, { "epoch": 0.014205742490820442, "grad_norm": 12.375737190246582, "learning_rate": 9.28e-05, "loss": 5.4579, "step": 59 }, { "epoch": 0.014446517787275025, "grad_norm": 10.653188705444336, "learning_rate": 9.44e-05, "loss": 6.4277, "step": 60 }, { "epoch": 0.014687293083729609, "grad_norm": 39.603145599365234, "learning_rate": 9.6e-05, "loss": 6.2061, "step": 61 }, { "epoch": 0.014928068380184192, "grad_norm": 16.928340911865234, "learning_rate": 9.76e-05, "loss": 7.8526, "step": 62 }, { "epoch": 0.015168843676638778, "grad_norm": 13.248088836669922, "learning_rate": 9.92e-05, "loss": 3.6139, "step": 63 }, { "epoch": 0.015409618973093361, "grad_norm": 4.931760311126709, "learning_rate": 0.00010080000000000001, "loss": 3.006, "step": 64 }, { "epoch": 0.015650394269547944, "grad_norm": 11.34759521484375, "learning_rate": 0.00010240000000000001, "loss": 6.164, "step": 65 }, { "epoch": 0.01589116956600253, "grad_norm": 9.345470428466797, "learning_rate": 0.00010400000000000001, "loss": 4.9446, "step": 66 }, { "epoch": 0.01613194486245711, "grad_norm": 10.860252380371094, "learning_rate": 0.0001056, "loss": 3.235, "step": 67 }, { "epoch": 0.016372720158911697, "grad_norm": 22.121963500976562, "learning_rate": 0.00010720000000000002, "loss": 6.3519, "step": 68 }, { "epoch": 0.01661349545536628, "grad_norm": 23.994407653808594, "learning_rate": 0.00010880000000000002, "loss": 5.4503, "step": 69 }, { "epoch": 0.016854270751820864, "grad_norm": 5.721750736236572, "learning_rate": 0.00011040000000000001, "loss": 3.448, "step": 70 }, { "epoch": 0.017095046048275445, "grad_norm": 46.67560577392578, "learning_rate": 0.00011200000000000001, "loss": 3.8795, "step": 71 }, { "epoch": 0.01733582134473003, "grad_norm": 11.732275009155273, "learning_rate": 0.0001136, "loss": 3.7436, "step": 72 }, { "epoch": 0.017576596641184616, "grad_norm": 19.560314178466797, "learning_rate": 0.0001152, "loss": 5.2577, "step": 73 }, { "epoch": 0.017817371937639197, "grad_norm": 47.164306640625, "learning_rate": 0.00011679999999999999, "loss": 5.2273, "step": 74 }, { "epoch": 0.018058147234093783, "grad_norm": 25.40642738342285, "learning_rate": 0.0001184, "loss": 5.912, "step": 75 }, { "epoch": 0.018298922530548364, "grad_norm": 10.063149452209473, "learning_rate": 0.00012, "loss": 5.6455, "step": 76 }, { "epoch": 0.01853969782700295, "grad_norm": 15.481316566467285, "learning_rate": 0.0001216, "loss": 3.1296, "step": 77 }, { "epoch": 0.018780473123457535, "grad_norm": 10.528315544128418, "learning_rate": 0.0001232, "loss": 4.7821, "step": 78 }, { "epoch": 0.019021248419912117, "grad_norm": 17.255693435668945, "learning_rate": 0.0001248, "loss": 3.7784, "step": 79 }, { "epoch": 0.0192620237163667, "grad_norm": 26.67706298828125, "learning_rate": 0.0001264, "loss": 6.29, "step": 80 }, { "epoch": 0.019502799012821283, "grad_norm": 10.499022483825684, "learning_rate": 0.00012800000000000002, "loss": 5.3771, "step": 81 }, { "epoch": 0.01974357430927587, "grad_norm": 8.131221771240234, "learning_rate": 0.0001296, "loss": 4.0706, "step": 82 }, { "epoch": 0.01998434960573045, "grad_norm": 45.455230712890625, "learning_rate": 0.00013120000000000002, "loss": 3.3879, "step": 83 }, { "epoch": 0.020225124902185036, "grad_norm": 19.001127243041992, "learning_rate": 0.0001328, "loss": 7.6711, "step": 84 }, { "epoch": 0.02046590019863962, "grad_norm": 6.148112773895264, "learning_rate": 0.00013440000000000001, "loss": 3.472, "step": 85 }, { "epoch": 0.020706675495094203, "grad_norm": 11.255337715148926, "learning_rate": 0.00013600000000000003, "loss": 8.2827, "step": 86 }, { "epoch": 0.020947450791548788, "grad_norm": 9.724173545837402, "learning_rate": 0.00013759999999999998, "loss": 4.0604, "step": 87 }, { "epoch": 0.02118822608800337, "grad_norm": 58.57503128051758, "learning_rate": 0.0001392, "loss": 3.8051, "step": 88 }, { "epoch": 0.021429001384457955, "grad_norm": 19.386682510375977, "learning_rate": 0.0001408, "loss": 7.015, "step": 89 }, { "epoch": 0.02166977668091254, "grad_norm": 7.111973762512207, "learning_rate": 0.0001424, "loss": 2.9986, "step": 90 }, { "epoch": 0.02191055197736712, "grad_norm": 10.568584442138672, "learning_rate": 0.000144, "loss": 4.6358, "step": 91 }, { "epoch": 0.022151327273821707, "grad_norm": 10.066975593566895, "learning_rate": 0.00014560000000000002, "loss": 4.6346, "step": 92 }, { "epoch": 0.02239210257027629, "grad_norm": 25.012971878051758, "learning_rate": 0.0001472, "loss": 5.2808, "step": 93 }, { "epoch": 0.022632877866730874, "grad_norm": 8.683295249938965, "learning_rate": 0.0001488, "loss": 5.2811, "step": 94 }, { "epoch": 0.022873653163185455, "grad_norm": 5.454954624176025, "learning_rate": 0.0001504, "loss": 3.4226, "step": 95 }, { "epoch": 0.02311442845964004, "grad_norm": 5.077779293060303, "learning_rate": 0.000152, "loss": 2.7948, "step": 96 }, { "epoch": 0.023355203756094626, "grad_norm": 33.022857666015625, "learning_rate": 0.00015360000000000002, "loss": 7.5081, "step": 97 }, { "epoch": 0.023595979052549208, "grad_norm": 15.922677040100098, "learning_rate": 0.0001552, "loss": 5.6511, "step": 98 }, { "epoch": 0.023836754349003793, "grad_norm": 7.067165851593018, "learning_rate": 0.00015680000000000002, "loss": 3.4348, "step": 99 }, { "epoch": 0.024077529645458375, "grad_norm": 23.432310104370117, "learning_rate": 0.00015840000000000003, "loss": 4.1903, "step": 100 }, { "epoch": 0.02431830494191296, "grad_norm": 6.100611686706543, "learning_rate": 0.00016, "loss": 4.3271, "step": 101 }, { "epoch": 0.024559080238367545, "grad_norm": 32.087772369384766, "learning_rate": 0.00016160000000000002, "loss": 3.9315, "step": 102 }, { "epoch": 0.024799855534822127, "grad_norm": 6.975442886352539, "learning_rate": 0.0001632, "loss": 4.5229, "step": 103 }, { "epoch": 0.025040630831276712, "grad_norm": 15.898163795471191, "learning_rate": 0.0001648, "loss": 4.6573, "step": 104 }, { "epoch": 0.025281406127731294, "grad_norm": 22.853700637817383, "learning_rate": 0.0001664, "loss": 5.0038, "step": 105 }, { "epoch": 0.02552218142418588, "grad_norm": 11.595605850219727, "learning_rate": 0.000168, "loss": 2.2838, "step": 106 }, { "epoch": 0.025762956720640464, "grad_norm": 32.837886810302734, "learning_rate": 0.0001696, "loss": 3.5895, "step": 107 }, { "epoch": 0.026003732017095046, "grad_norm": 46.952144622802734, "learning_rate": 0.00017120000000000001, "loss": 2.7184, "step": 108 }, { "epoch": 0.02624450731354963, "grad_norm": 10.757477760314941, "learning_rate": 0.0001728, "loss": 5.1609, "step": 109 }, { "epoch": 0.026485282610004213, "grad_norm": 7.686053276062012, "learning_rate": 0.0001744, "loss": 4.8981, "step": 110 }, { "epoch": 0.026726057906458798, "grad_norm": 9.239624977111816, "learning_rate": 0.00017600000000000002, "loss": 4.208, "step": 111 }, { "epoch": 0.02696683320291338, "grad_norm": 12.09170150756836, "learning_rate": 0.0001776, "loss": 4.2583, "step": 112 }, { "epoch": 0.027207608499367965, "grad_norm": 9.89987850189209, "learning_rate": 0.00017920000000000002, "loss": 4.0053, "step": 113 }, { "epoch": 0.02744838379582255, "grad_norm": 10.28636360168457, "learning_rate": 0.0001808, "loss": 5.7005, "step": 114 }, { "epoch": 0.027689159092277132, "grad_norm": 4.695734024047852, "learning_rate": 0.00018240000000000002, "loss": 3.4277, "step": 115 }, { "epoch": 0.027929934388731717, "grad_norm": 105.01580810546875, "learning_rate": 0.00018400000000000003, "loss": 3.761, "step": 116 }, { "epoch": 0.0281707096851863, "grad_norm": 9.191910743713379, "learning_rate": 0.0001856, "loss": 4.2246, "step": 117 }, { "epoch": 0.028411484981640884, "grad_norm": 26.174537658691406, "learning_rate": 0.00018720000000000002, "loss": 2.9553, "step": 118 }, { "epoch": 0.02865226027809547, "grad_norm": 9.34518814086914, "learning_rate": 0.0001888, "loss": 3.3358, "step": 119 }, { "epoch": 0.02889303557455005, "grad_norm": 9.987439155578613, "learning_rate": 0.0001904, "loss": 3.6771, "step": 120 }, { "epoch": 0.029133810871004636, "grad_norm": 7.954049587249756, "learning_rate": 0.000192, "loss": 3.2811, "step": 121 }, { "epoch": 0.029374586167459218, "grad_norm": 8.947925567626953, "learning_rate": 0.00019360000000000002, "loss": 2.854, "step": 122 }, { "epoch": 0.029615361463913803, "grad_norm": 31.957183837890625, "learning_rate": 0.0001952, "loss": 7.1923, "step": 123 }, { "epoch": 0.029856136760368385, "grad_norm": 10.06078815460205, "learning_rate": 0.0001968, "loss": 3.0629, "step": 124 }, { "epoch": 0.03009691205682297, "grad_norm": 9.508298873901367, "learning_rate": 0.0001984, "loss": 4.4433, "step": 125 }, { "epoch": 0.030337687353277555, "grad_norm": 49.658111572265625, "learning_rate": 0.0002, "loss": 4.6153, "step": 126 }, { "epoch": 0.030578462649732137, "grad_norm": 21.386220932006836, "learning_rate": 0.00019999996959988735, "loss": 5.5672, "step": 127 }, { "epoch": 0.030819237946186722, "grad_norm": 122.65118408203125, "learning_rate": 0.0001999998783995679, "loss": 3.5313, "step": 128 }, { "epoch": 0.031060013242641304, "grad_norm": 13.517218589782715, "learning_rate": 0.00019999972639909706, "loss": 4.7874, "step": 129 }, { "epoch": 0.03130078853909589, "grad_norm": 18.364986419677734, "learning_rate": 0.00019999951359856726, "loss": 6.3622, "step": 130 }, { "epoch": 0.03154156383555047, "grad_norm": 10.25970458984375, "learning_rate": 0.0001999992399981079, "loss": 3.7715, "step": 131 }, { "epoch": 0.03178233913200506, "grad_norm": 15.492377281188965, "learning_rate": 0.0001999989055978853, "loss": 3.5824, "step": 132 }, { "epoch": 0.03202311442845964, "grad_norm": 28.90912437438965, "learning_rate": 0.00019999851039810283, "loss": 4.791, "step": 133 }, { "epoch": 0.03226388972491422, "grad_norm": 9.603219032287598, "learning_rate": 0.00019999805439900072, "loss": 3.1532, "step": 134 }, { "epoch": 0.032504665021368805, "grad_norm": 7.891742706298828, "learning_rate": 0.0001999975376008562, "loss": 2.59, "step": 135 }, { "epoch": 0.03274544031782339, "grad_norm": 14.559179306030273, "learning_rate": 0.0001999969600039836, "loss": 3.9376, "step": 136 }, { "epoch": 0.032986215614277975, "grad_norm": 17.962955474853516, "learning_rate": 0.00019999632160873398, "loss": 3.7606, "step": 137 }, { "epoch": 0.03322699091073256, "grad_norm": 13.648564338684082, "learning_rate": 0.0001999956224154955, "loss": 4.019, "step": 138 }, { "epoch": 0.033467766207187145, "grad_norm": 14.759313583374023, "learning_rate": 0.00019999486242469337, "loss": 3.5558, "step": 139 }, { "epoch": 0.03370854150364173, "grad_norm": 15.668071746826172, "learning_rate": 0.00019999404163678955, "loss": 3.7936, "step": 140 }, { "epoch": 0.03394931680009631, "grad_norm": 17.56260108947754, "learning_rate": 0.00019999316005228312, "loss": 2.4151, "step": 141 }, { "epoch": 0.03419009209655089, "grad_norm": 5.186138153076172, "learning_rate": 0.0001999922176717101, "loss": 2.7492, "step": 142 }, { "epoch": 0.03443086739300548, "grad_norm": 12.366766929626465, "learning_rate": 0.00019999121449564347, "loss": 3.1902, "step": 143 }, { "epoch": 0.03467164268946006, "grad_norm": 16.707490921020508, "learning_rate": 0.0001999901505246931, "loss": 2.3057, "step": 144 }, { "epoch": 0.03491241798591464, "grad_norm": 9.578150749206543, "learning_rate": 0.00019998902575950596, "loss": 4.3383, "step": 145 }, { "epoch": 0.03515319328236923, "grad_norm": 19.261411666870117, "learning_rate": 0.0001999878402007659, "loss": 5.0221, "step": 146 }, { "epoch": 0.03539396857882381, "grad_norm": 8.17841911315918, "learning_rate": 0.0001999865938491937, "loss": 2.2664, "step": 147 }, { "epoch": 0.035634743875278395, "grad_norm": 109.94926452636719, "learning_rate": 0.00019998528670554715, "loss": 6.5844, "step": 148 }, { "epoch": 0.035875519171732984, "grad_norm": 8.02511215209961, "learning_rate": 0.00019998391877062104, "loss": 3.7571, "step": 149 }, { "epoch": 0.036116294468187565, "grad_norm": 8.986191749572754, "learning_rate": 0.00019998249004524703, "loss": 5.3496, "step": 150 }, { "epoch": 0.03635706976464215, "grad_norm": 5.070540904998779, "learning_rate": 0.0001999810005302938, "loss": 2.581, "step": 151 }, { "epoch": 0.03659784506109673, "grad_norm": 10.687249183654785, "learning_rate": 0.00019997945022666701, "loss": 3.4334, "step": 152 }, { "epoch": 0.03683862035755132, "grad_norm": 7.046168327331543, "learning_rate": 0.00019997783913530923, "loss": 2.8572, "step": 153 }, { "epoch": 0.0370793956540059, "grad_norm": 4.520480632781982, "learning_rate": 0.0001999761672572, "loss": 2.4164, "step": 154 }, { "epoch": 0.03732017095046048, "grad_norm": 20.134994506835938, "learning_rate": 0.0001999744345933558, "loss": 5.1227, "step": 155 }, { "epoch": 0.03756094624691507, "grad_norm": 8.454794883728027, "learning_rate": 0.00019997264114483015, "loss": 4.3214, "step": 156 }, { "epoch": 0.03780172154336965, "grad_norm": 19.004796981811523, "learning_rate": 0.00019997078691271348, "loss": 2.8001, "step": 157 }, { "epoch": 0.03804249683982423, "grad_norm": 8.622836112976074, "learning_rate": 0.00019996887189813306, "loss": 2.6805, "step": 158 }, { "epoch": 0.038283272136278815, "grad_norm": 11.92911434173584, "learning_rate": 0.00019996689610225332, "loss": 3.4712, "step": 159 }, { "epoch": 0.0385240474327334, "grad_norm": 8.337674140930176, "learning_rate": 0.00019996485952627552, "loss": 3.6351, "step": 160 }, { "epoch": 0.038764822729187985, "grad_norm": 16.739974975585938, "learning_rate": 0.00019996276217143792, "loss": 2.8034, "step": 161 }, { "epoch": 0.03900559802564257, "grad_norm": 19.424123764038086, "learning_rate": 0.0001999606040390157, "loss": 3.6437, "step": 162 }, { "epoch": 0.039246373322097156, "grad_norm": 6.484769344329834, "learning_rate": 0.000199958385130321, "loss": 1.9908, "step": 163 }, { "epoch": 0.03948714861855174, "grad_norm": 12.377532005310059, "learning_rate": 0.0001999561054467029, "loss": 4.8526, "step": 164 }, { "epoch": 0.03972792391500632, "grad_norm": 13.827719688415527, "learning_rate": 0.00019995376498954754, "loss": 3.6073, "step": 165 }, { "epoch": 0.0399686992114609, "grad_norm": 7.668979167938232, "learning_rate": 0.00019995136376027786, "loss": 2.496, "step": 166 }, { "epoch": 0.04020947450791549, "grad_norm": 8.068209648132324, "learning_rate": 0.00019994890176035378, "loss": 4.0669, "step": 167 }, { "epoch": 0.04045024980437007, "grad_norm": 11.890876770019531, "learning_rate": 0.00019994637899127228, "loss": 2.6487, "step": 168 }, { "epoch": 0.04069102510082465, "grad_norm": 16.064224243164062, "learning_rate": 0.00019994379545456713, "loss": 2.9892, "step": 169 }, { "epoch": 0.04093180039727924, "grad_norm": 7.469193458557129, "learning_rate": 0.00019994115115180922, "loss": 3.3422, "step": 170 }, { "epoch": 0.04117257569373382, "grad_norm": 14.787521362304688, "learning_rate": 0.00019993844608460622, "loss": 3.911, "step": 171 }, { "epoch": 0.041413350990188405, "grad_norm": 24.229990005493164, "learning_rate": 0.00019993568025460283, "loss": 3.3516, "step": 172 }, { "epoch": 0.041654126286642994, "grad_norm": 4.197109222412109, "learning_rate": 0.0001999328536634807, "loss": 1.3666, "step": 173 }, { "epoch": 0.041894901583097575, "grad_norm": 9.006143569946289, "learning_rate": 0.00019992996631295836, "loss": 4.234, "step": 174 }, { "epoch": 0.04213567687955216, "grad_norm": 21.24369239807129, "learning_rate": 0.00019992701820479138, "loss": 3.2965, "step": 175 }, { "epoch": 0.04237645217600674, "grad_norm": 21.48784828186035, "learning_rate": 0.0001999240093407722, "loss": 1.7589, "step": 176 }, { "epoch": 0.04261722747246133, "grad_norm": 8.93320369720459, "learning_rate": 0.00019992093972273018, "loss": 1.9561, "step": 177 }, { "epoch": 0.04285800276891591, "grad_norm": 12.301058769226074, "learning_rate": 0.0001999178093525317, "loss": 2.0668, "step": 178 }, { "epoch": 0.04309877806537049, "grad_norm": 18.54864501953125, "learning_rate": 0.00019991461823208004, "loss": 3.1243, "step": 179 }, { "epoch": 0.04333955336182508, "grad_norm": 14.172440528869629, "learning_rate": 0.00019991136636331538, "loss": 2.7406, "step": 180 }, { "epoch": 0.04358032865827966, "grad_norm": 42.0859375, "learning_rate": 0.00019990805374821483, "loss": 1.4452, "step": 181 }, { "epoch": 0.04382110395473424, "grad_norm": 10.7669677734375, "learning_rate": 0.00019990468038879255, "loss": 3.331, "step": 182 }, { "epoch": 0.044061879251188825, "grad_norm": 11.51449966430664, "learning_rate": 0.0001999012462870995, "loss": 1.4512, "step": 183 }, { "epoch": 0.044302654547643414, "grad_norm": 21.03165054321289, "learning_rate": 0.00019989775144522358, "loss": 3.0687, "step": 184 }, { "epoch": 0.044543429844097995, "grad_norm": 11.455255508422852, "learning_rate": 0.00019989419586528975, "loss": 3.6598, "step": 185 }, { "epoch": 0.04478420514055258, "grad_norm": 10.61294174194336, "learning_rate": 0.00019989057954945976, "loss": 2.4758, "step": 186 }, { "epoch": 0.045024980437007166, "grad_norm": 96.17725372314453, "learning_rate": 0.00019988690249993235, "loss": 2.7045, "step": 187 }, { "epoch": 0.04526575573346175, "grad_norm": 8.609686851501465, "learning_rate": 0.00019988316471894314, "loss": 1.7687, "step": 188 }, { "epoch": 0.04550653102991633, "grad_norm": 7.83888053894043, "learning_rate": 0.00019987936620876478, "loss": 1.7098, "step": 189 }, { "epoch": 0.04574730632637091, "grad_norm": 6.7235941886901855, "learning_rate": 0.00019987550697170674, "loss": 1.6275, "step": 190 }, { "epoch": 0.0459880816228255, "grad_norm": 14.214694023132324, "learning_rate": 0.0001998715870101154, "loss": 4.1546, "step": 191 }, { "epoch": 0.04622885691928008, "grad_norm": 3.766120433807373, "learning_rate": 0.0001998676063263742, "loss": 2.2139, "step": 192 }, { "epoch": 0.04646963221573466, "grad_norm": 4.959268093109131, "learning_rate": 0.0001998635649229033, "loss": 1.4615, "step": 193 }, { "epoch": 0.04671040751218925, "grad_norm": 6.699900150299072, "learning_rate": 0.00019985946280215994, "loss": 1.9309, "step": 194 }, { "epoch": 0.046951182808643833, "grad_norm": 8.718276023864746, "learning_rate": 0.00019985529996663823, "loss": 2.1614, "step": 195 }, { "epoch": 0.047191958105098415, "grad_norm": 13.810513496398926, "learning_rate": 0.00019985107641886917, "loss": 3.8401, "step": 196 }, { "epoch": 0.047432733401553004, "grad_norm": 12.379217147827148, "learning_rate": 0.00019984679216142066, "loss": 1.5629, "step": 197 }, { "epoch": 0.047673508698007586, "grad_norm": 10.015958786010742, "learning_rate": 0.00019984244719689756, "loss": 1.6573, "step": 198 }, { "epoch": 0.04791428399446217, "grad_norm": 12.203784942626953, "learning_rate": 0.00019983804152794163, "loss": 1.9251, "step": 199 }, { "epoch": 0.04815505929091675, "grad_norm": 8.036340713500977, "learning_rate": 0.0001998335751572315, "loss": 1.7192, "step": 200 }, { "epoch": 0.04839583458737134, "grad_norm": 8.729804039001465, "learning_rate": 0.00019982904808748275, "loss": 0.9223, "step": 201 }, { "epoch": 0.04863660988382592, "grad_norm": 32.94856262207031, "learning_rate": 0.00019982446032144785, "loss": 3.5147, "step": 202 }, { "epoch": 0.0488773851802805, "grad_norm": 5.190202713012695, "learning_rate": 0.00019981981186191616, "loss": 1.0766, "step": 203 }, { "epoch": 0.04911816047673509, "grad_norm": 15.163110733032227, "learning_rate": 0.00019981510271171394, "loss": 3.0481, "step": 204 }, { "epoch": 0.04935893577318967, "grad_norm": 35.894718170166016, "learning_rate": 0.00019981033287370443, "loss": 3.3266, "step": 205 }, { "epoch": 0.04959971106964425, "grad_norm": 12.832849502563477, "learning_rate": 0.0001998055023507876, "loss": 1.614, "step": 206 }, { "epoch": 0.049840486366098835, "grad_norm": 12.771391868591309, "learning_rate": 0.00019980061114590055, "loss": 1.63, "step": 207 }, { "epoch": 0.050081261662553424, "grad_norm": 20.120861053466797, "learning_rate": 0.00019979565926201703, "loss": 3.5633, "step": 208 }, { "epoch": 0.050322036959008005, "grad_norm": 10.067777633666992, "learning_rate": 0.00019979064670214782, "loss": 1.7442, "step": 209 }, { "epoch": 0.05056281225546259, "grad_norm": 5.05864143371582, "learning_rate": 0.0001997855734693406, "loss": 2.6813, "step": 210 }, { "epoch": 0.050803587551917176, "grad_norm": 5.616927623748779, "learning_rate": 0.0001997804395666799, "loss": 1.4455, "step": 211 }, { "epoch": 0.05104436284837176, "grad_norm": 18.011022567749023, "learning_rate": 0.00019977524499728712, "loss": 1.095, "step": 212 }, { "epoch": 0.05128513814482634, "grad_norm": 4.923522472381592, "learning_rate": 0.0001997699897643206, "loss": 1.7786, "step": 213 }, { "epoch": 0.05152591344128093, "grad_norm": 8.910199165344238, "learning_rate": 0.00019976467387097552, "loss": 2.6016, "step": 214 }, { "epoch": 0.05176668873773551, "grad_norm": 6.376938343048096, "learning_rate": 0.00019975929732048394, "loss": 1.3324, "step": 215 }, { "epoch": 0.05200746403419009, "grad_norm": 25.141647338867188, "learning_rate": 0.00019975386011611483, "loss": 1.0559, "step": 216 }, { "epoch": 0.05224823933064467, "grad_norm": 69.8543472290039, "learning_rate": 0.00019974836226117405, "loss": 2.2873, "step": 217 }, { "epoch": 0.05248901462709926, "grad_norm": 13.50328254699707, "learning_rate": 0.00019974280375900424, "loss": 2.7354, "step": 218 }, { "epoch": 0.052729789923553844, "grad_norm": 9.282197952270508, "learning_rate": 0.00019973718461298502, "loss": 2.1698, "step": 219 }, { "epoch": 0.052970565220008425, "grad_norm": 12.128793716430664, "learning_rate": 0.00019973150482653287, "loss": 2.3485, "step": 220 }, { "epoch": 0.053211340516463014, "grad_norm": 6.763794422149658, "learning_rate": 0.00019972576440310105, "loss": 1.2761, "step": 221 }, { "epoch": 0.053452115812917596, "grad_norm": 4.751701354980469, "learning_rate": 0.00019971996334617985, "loss": 1.0254, "step": 222 }, { "epoch": 0.05369289110937218, "grad_norm": 10.452568054199219, "learning_rate": 0.00019971410165929622, "loss": 1.5987, "step": 223 }, { "epoch": 0.05393366640582676, "grad_norm": 5.189295768737793, "learning_rate": 0.00019970817934601413, "loss": 0.9901, "step": 224 }, { "epoch": 0.05417444170228135, "grad_norm": 6.027712821960449, "learning_rate": 0.00019970219640993438, "loss": 2.048, "step": 225 }, { "epoch": 0.05441521699873593, "grad_norm": 5.749260425567627, "learning_rate": 0.00019969615285469455, "loss": 1.1023, "step": 226 }, { "epoch": 0.05465599229519051, "grad_norm": 14.753028869628906, "learning_rate": 0.0001996900486839692, "loss": 1.657, "step": 227 }, { "epoch": 0.0548967675916451, "grad_norm": 19.311214447021484, "learning_rate": 0.0001996838839014696, "loss": 1.5839, "step": 228 }, { "epoch": 0.05513754288809968, "grad_norm": 2.4820916652679443, "learning_rate": 0.000199677658510944, "loss": 1.2341, "step": 229 }, { "epoch": 0.055378318184554264, "grad_norm": 16.423561096191406, "learning_rate": 0.0001996713725161775, "loss": 2.0571, "step": 230 }, { "epoch": 0.055619093481008845, "grad_norm": 10.467788696289062, "learning_rate": 0.00019966502592099188, "loss": 1.5059, "step": 231 }, { "epoch": 0.055859868777463434, "grad_norm": 3.1604106426239014, "learning_rate": 0.000199658618729246, "loss": 1.099, "step": 232 }, { "epoch": 0.056100644073918016, "grad_norm": 9.024856567382812, "learning_rate": 0.00019965215094483539, "loss": 1.2078, "step": 233 }, { "epoch": 0.0563414193703726, "grad_norm": 2.999100923538208, "learning_rate": 0.00019964562257169247, "loss": 0.792, "step": 234 }, { "epoch": 0.056582194666827186, "grad_norm": 16.542631149291992, "learning_rate": 0.00019963903361378655, "loss": 2.7538, "step": 235 }, { "epoch": 0.05682296996328177, "grad_norm": 5.134494781494141, "learning_rate": 0.00019963238407512366, "loss": 2.8096, "step": 236 }, { "epoch": 0.05706374525973635, "grad_norm": 7.121161460876465, "learning_rate": 0.0001996256739597468, "loss": 1.1293, "step": 237 }, { "epoch": 0.05730452055619094, "grad_norm": 3.483020544052124, "learning_rate": 0.00019961890327173574, "loss": 1.8818, "step": 238 }, { "epoch": 0.05754529585264552, "grad_norm": 4.338151454925537, "learning_rate": 0.00019961207201520703, "loss": 1.0518, "step": 239 }, { "epoch": 0.0577860711491001, "grad_norm": 6.399717807769775, "learning_rate": 0.00019960518019431408, "loss": 1.2687, "step": 240 }, { "epoch": 0.05802684644555468, "grad_norm": 1.5952820777893066, "learning_rate": 0.00019959822781324718, "loss": 0.6345, "step": 241 }, { "epoch": 0.05826762174200927, "grad_norm": 9.329618453979492, "learning_rate": 0.0001995912148762334, "loss": 1.3564, "step": 242 }, { "epoch": 0.058508397038463854, "grad_norm": 7.548645973205566, "learning_rate": 0.00019958414138753657, "loss": 1.0375, "step": 243 }, { "epoch": 0.058749172334918436, "grad_norm": 18.993824005126953, "learning_rate": 0.00019957700735145738, "loss": 2.458, "step": 244 }, { "epoch": 0.058989947631373024, "grad_norm": 20.46088981628418, "learning_rate": 0.0001995698127723334, "loss": 2.1789, "step": 245 }, { "epoch": 0.059230722927827606, "grad_norm": 6.136659145355225, "learning_rate": 0.00019956255765453892, "loss": 1.3776, "step": 246 }, { "epoch": 0.05947149822428219, "grad_norm": 61.323387145996094, "learning_rate": 0.00019955524200248505, "loss": 1.657, "step": 247 }, { "epoch": 0.05971227352073677, "grad_norm": 4.754699230194092, "learning_rate": 0.00019954786582061977, "loss": 1.0319, "step": 248 }, { "epoch": 0.05995304881719136, "grad_norm": 10.321673393249512, "learning_rate": 0.0001995404291134278, "loss": 2.1272, "step": 249 }, { "epoch": 0.06019382411364594, "grad_norm": 8.861504554748535, "learning_rate": 0.0001995329318854306, "loss": 1.4962, "step": 250 }, { "epoch": 0.06043459941010052, "grad_norm": 44.048126220703125, "learning_rate": 0.0001995253741411866, "loss": 2.3729, "step": 251 }, { "epoch": 0.06067537470655511, "grad_norm": 2.5337188243865967, "learning_rate": 0.0001995177558852909, "loss": 0.564, "step": 252 }, { "epoch": 0.06091615000300969, "grad_norm": 11.171781539916992, "learning_rate": 0.0001995100771223754, "loss": 1.2622, "step": 253 }, { "epoch": 0.061156925299464274, "grad_norm": 10.259223937988281, "learning_rate": 0.0001995023378571088, "loss": 1.9257, "step": 254 }, { "epoch": 0.061397700595918855, "grad_norm": 8.954612731933594, "learning_rate": 0.0001994945380941966, "loss": 1.008, "step": 255 }, { "epoch": 0.061638475892373444, "grad_norm": 26.720203399658203, "learning_rate": 0.0001994866778383811, "loss": 1.8667, "step": 256 }, { "epoch": 0.061879251188828026, "grad_norm": 8.071576118469238, "learning_rate": 0.00019947875709444131, "loss": 1.5516, "step": 257 }, { "epoch": 0.06212002648528261, "grad_norm": 1.8321843147277832, "learning_rate": 0.00019947077586719307, "loss": 0.8952, "step": 258 }, { "epoch": 0.062360801781737196, "grad_norm": 10.932100296020508, "learning_rate": 0.000199462734161489, "loss": 1.3651, "step": 259 }, { "epoch": 0.06260157707819178, "grad_norm": 4.2766828536987305, "learning_rate": 0.00019945463198221846, "loss": 1.0024, "step": 260 }, { "epoch": 0.06284235237464636, "grad_norm": 7.300168991088867, "learning_rate": 0.00019944646933430762, "loss": 1.1335, "step": 261 }, { "epoch": 0.06308312767110094, "grad_norm": 2.313037395477295, "learning_rate": 0.00019943824622271935, "loss": 1.1619, "step": 262 }, { "epoch": 0.06332390296755552, "grad_norm": 3.020617961883545, "learning_rate": 0.00019942996265245335, "loss": 1.0354, "step": 263 }, { "epoch": 0.06356467826401012, "grad_norm": 14.848864555358887, "learning_rate": 0.00019942161862854601, "loss": 1.4934, "step": 264 }, { "epoch": 0.0638054535604647, "grad_norm": 8.351217269897461, "learning_rate": 0.0001994132141560706, "loss": 1.5487, "step": 265 }, { "epoch": 0.06404622885691928, "grad_norm": 5.606395244598389, "learning_rate": 0.00019940474924013698, "loss": 1.6069, "step": 266 }, { "epoch": 0.06428700415337386, "grad_norm": 5.64864444732666, "learning_rate": 0.00019939622388589183, "loss": 1.0187, "step": 267 }, { "epoch": 0.06452777944982845, "grad_norm": 8.374507904052734, "learning_rate": 0.00019938763809851864, "loss": 1.2051, "step": 268 }, { "epoch": 0.06476855474628303, "grad_norm": 3.38839054107666, "learning_rate": 0.00019937899188323757, "loss": 0.6262, "step": 269 }, { "epoch": 0.06500933004273761, "grad_norm": 2.6535820960998535, "learning_rate": 0.00019937028524530552, "loss": 0.6246, "step": 270 }, { "epoch": 0.0652501053391922, "grad_norm": 5.218156337738037, "learning_rate": 0.00019936151819001618, "loss": 1.8041, "step": 271 }, { "epoch": 0.06549088063564679, "grad_norm": 6.953288555145264, "learning_rate": 0.00019935269072269987, "loss": 0.886, "step": 272 }, { "epoch": 0.06573165593210137, "grad_norm": 3.9206128120422363, "learning_rate": 0.00019934380284872377, "loss": 0.5613, "step": 273 }, { "epoch": 0.06597243122855595, "grad_norm": 3.6051864624023438, "learning_rate": 0.00019933485457349174, "loss": 0.8749, "step": 274 }, { "epoch": 0.06621320652501053, "grad_norm": 3.60562801361084, "learning_rate": 0.00019932584590244434, "loss": 0.892, "step": 275 }, { "epoch": 0.06645398182146511, "grad_norm": 3.6393070220947266, "learning_rate": 0.0001993167768410588, "loss": 0.8526, "step": 276 }, { "epoch": 0.0666947571179197, "grad_norm": 10.952275276184082, "learning_rate": 0.0001993076473948492, "loss": 1.1597, "step": 277 }, { "epoch": 0.06693553241437429, "grad_norm": 9.785892486572266, "learning_rate": 0.00019929845756936626, "loss": 1.1667, "step": 278 }, { "epoch": 0.06717630771082887, "grad_norm": 8.532158851623535, "learning_rate": 0.00019928920737019733, "loss": 1.4692, "step": 279 }, { "epoch": 0.06741708300728345, "grad_norm": 12.74774169921875, "learning_rate": 0.00019927989680296667, "loss": 2.1035, "step": 280 }, { "epoch": 0.06765785830373804, "grad_norm": 10.734175682067871, "learning_rate": 0.00019927052587333507, "loss": 1.8876, "step": 281 }, { "epoch": 0.06789863360019262, "grad_norm": 4.373108863830566, "learning_rate": 0.00019926109458700007, "loss": 0.9184, "step": 282 }, { "epoch": 0.0681394088966472, "grad_norm": 7.9593281745910645, "learning_rate": 0.00019925160294969593, "loss": 1.2637, "step": 283 }, { "epoch": 0.06838018419310178, "grad_norm": 5.800394058227539, "learning_rate": 0.0001992420509671936, "loss": 0.8262, "step": 284 }, { "epoch": 0.06862095948955638, "grad_norm": 5.995545864105225, "learning_rate": 0.00019923243864530064, "loss": 1.6762, "step": 285 }, { "epoch": 0.06886173478601096, "grad_norm": 21.66741371154785, "learning_rate": 0.00019922276598986145, "loss": 1.1287, "step": 286 }, { "epoch": 0.06910251008246554, "grad_norm": 12.231538772583008, "learning_rate": 0.00019921303300675697, "loss": 1.4966, "step": 287 }, { "epoch": 0.06934328537892012, "grad_norm": 19.181198120117188, "learning_rate": 0.00019920323970190487, "loss": 1.7811, "step": 288 }, { "epoch": 0.0695840606753747, "grad_norm": 4.649646282196045, "learning_rate": 0.00019919338608125956, "loss": 1.2632, "step": 289 }, { "epoch": 0.06982483597182929, "grad_norm": 5.04226541519165, "learning_rate": 0.00019918347215081204, "loss": 1.3552, "step": 290 }, { "epoch": 0.07006561126828387, "grad_norm": 4.240399360656738, "learning_rate": 0.00019917349791658996, "loss": 1.2266, "step": 291 }, { "epoch": 0.07030638656473846, "grad_norm": 13.989855766296387, "learning_rate": 0.0001991634633846577, "loss": 1.429, "step": 292 }, { "epoch": 0.07054716186119304, "grad_norm": 8.629983901977539, "learning_rate": 0.00019915336856111631, "loss": 1.0381, "step": 293 }, { "epoch": 0.07078793715764763, "grad_norm": 14.188498497009277, "learning_rate": 0.00019914321345210342, "loss": 2.8836, "step": 294 }, { "epoch": 0.07102871245410221, "grad_norm": 8.33694076538086, "learning_rate": 0.00019913299806379334, "loss": 0.6366, "step": 295 }, { "epoch": 0.07126948775055679, "grad_norm": 16.30498695373535, "learning_rate": 0.00019912272240239716, "loss": 1.3799, "step": 296 }, { "epoch": 0.07151026304701137, "grad_norm": 5.43389368057251, "learning_rate": 0.00019911238647416242, "loss": 1.1131, "step": 297 }, { "epoch": 0.07175103834346597, "grad_norm": 20.10192108154297, "learning_rate": 0.00019910199028537337, "loss": 1.1515, "step": 298 }, { "epoch": 0.07199181363992055, "grad_norm": 3.4195728302001953, "learning_rate": 0.00019909153384235095, "loss": 0.5817, "step": 299 }, { "epoch": 0.07223258893637513, "grad_norm": 6.387148857116699, "learning_rate": 0.00019908101715145272, "loss": 0.7634, "step": 300 }, { "epoch": 0.07247336423282971, "grad_norm": 4.05348539352417, "learning_rate": 0.00019907044021907281, "loss": 0.8352, "step": 301 }, { "epoch": 0.0727141395292843, "grad_norm": 2.757005214691162, "learning_rate": 0.00019905980305164205, "loss": 0.7532, "step": 302 }, { "epoch": 0.07295491482573888, "grad_norm": 2.14371919631958, "learning_rate": 0.00019904910565562785, "loss": 1.2168, "step": 303 }, { "epoch": 0.07319569012219346, "grad_norm": 5.939690589904785, "learning_rate": 0.00019903834803753425, "loss": 0.8704, "step": 304 }, { "epoch": 0.07343646541864805, "grad_norm": 7.156602382659912, "learning_rate": 0.0001990275302039019, "loss": 0.8243, "step": 305 }, { "epoch": 0.07367724071510263, "grad_norm": 3.6926629543304443, "learning_rate": 0.00019901665216130808, "loss": 0.8763, "step": 306 }, { "epoch": 0.07391801601155722, "grad_norm": 7.309814453125, "learning_rate": 0.00019900571391636665, "loss": 0.7731, "step": 307 }, { "epoch": 0.0741587913080118, "grad_norm": 12.59055233001709, "learning_rate": 0.00019899471547572811, "loss": 1.0003, "step": 308 }, { "epoch": 0.07439956660446638, "grad_norm": 3.9260809421539307, "learning_rate": 0.00019898365684607952, "loss": 0.9478, "step": 309 }, { "epoch": 0.07464034190092096, "grad_norm": 3.1046080589294434, "learning_rate": 0.00019897253803414456, "loss": 0.7514, "step": 310 }, { "epoch": 0.07488111719737554, "grad_norm": 2.8333990573883057, "learning_rate": 0.0001989613590466835, "loss": 0.4307, "step": 311 }, { "epoch": 0.07512189249383014, "grad_norm": 11.99578857421875, "learning_rate": 0.00019895011989049316, "loss": 1.0123, "step": 312 }, { "epoch": 0.07536266779028472, "grad_norm": 2.916750431060791, "learning_rate": 0.000198938820572407, "loss": 0.9809, "step": 313 }, { "epoch": 0.0756034430867393, "grad_norm": 3.6491167545318604, "learning_rate": 0.00019892746109929498, "loss": 0.3447, "step": 314 }, { "epoch": 0.07584421838319388, "grad_norm": 3.625203847885132, "learning_rate": 0.00019891604147806376, "loss": 1.0226, "step": 315 }, { "epoch": 0.07608499367964847, "grad_norm": 3.9918270111083984, "learning_rate": 0.00019890456171565643, "loss": 0.6953, "step": 316 }, { "epoch": 0.07632576897610305, "grad_norm": 7.0212554931640625, "learning_rate": 0.00019889302181905278, "loss": 1.1393, "step": 317 }, { "epoch": 0.07656654427255763, "grad_norm": 6.060014247894287, "learning_rate": 0.00019888142179526902, "loss": 0.9609, "step": 318 }, { "epoch": 0.07680731956901223, "grad_norm": 6.098717212677002, "learning_rate": 0.00019886976165135807, "loss": 1.3731, "step": 319 }, { "epoch": 0.0770480948654668, "grad_norm": 8.985902786254883, "learning_rate": 0.00019885804139440925, "loss": 1.4469, "step": 320 }, { "epoch": 0.07728887016192139, "grad_norm": 6.856400966644287, "learning_rate": 0.00019884626103154856, "loss": 1.4352, "step": 321 }, { "epoch": 0.07752964545837597, "grad_norm": 4.309900283813477, "learning_rate": 0.00019883442056993841, "loss": 0.4605, "step": 322 }, { "epoch": 0.07777042075483055, "grad_norm": 2.33298397064209, "learning_rate": 0.00019882252001677793, "loss": 1.2381, "step": 323 }, { "epoch": 0.07801119605128513, "grad_norm": 3.6052260398864746, "learning_rate": 0.0001988105593793026, "loss": 1.0468, "step": 324 }, { "epoch": 0.07825197134773972, "grad_norm": 4.753766059875488, "learning_rate": 0.00019879853866478455, "loss": 1.1693, "step": 325 }, { "epoch": 0.07849274664419431, "grad_norm": 3.6719765663146973, "learning_rate": 0.00019878645788053238, "loss": 0.7712, "step": 326 }, { "epoch": 0.07873352194064889, "grad_norm": 3.6164121627807617, "learning_rate": 0.00019877431703389128, "loss": 1.2832, "step": 327 }, { "epoch": 0.07897429723710347, "grad_norm": 9.66127872467041, "learning_rate": 0.00019876211613224288, "loss": 2.2482, "step": 328 }, { "epoch": 0.07921507253355806, "grad_norm": 2.208888053894043, "learning_rate": 0.00019874985518300532, "loss": 1.1646, "step": 329 }, { "epoch": 0.07945584783001264, "grad_norm": 1.7235151529312134, "learning_rate": 0.00019873753419363336, "loss": 0.5038, "step": 330 }, { "epoch": 0.07969662312646722, "grad_norm": 1.9844493865966797, "learning_rate": 0.00019872515317161812, "loss": 1.1001, "step": 331 }, { "epoch": 0.0799373984229218, "grad_norm": 7.393949508666992, "learning_rate": 0.00019871271212448734, "loss": 1.7001, "step": 332 }, { "epoch": 0.0801781737193764, "grad_norm": 10.367690086364746, "learning_rate": 0.00019870021105980522, "loss": 0.8829, "step": 333 }, { "epoch": 0.08041894901583098, "grad_norm": 6.111469745635986, "learning_rate": 0.00019868764998517236, "loss": 1.6088, "step": 334 }, { "epoch": 0.08065972431228556, "grad_norm": 4.986114978790283, "learning_rate": 0.00019867502890822598, "loss": 0.3513, "step": 335 }, { "epoch": 0.08090049960874014, "grad_norm": 4.137001037597656, "learning_rate": 0.00019866234783663968, "loss": 1.1246, "step": 336 }, { "epoch": 0.08114127490519472, "grad_norm": 2.4128201007843018, "learning_rate": 0.00019864960677812364, "loss": 0.7535, "step": 337 }, { "epoch": 0.0813820502016493, "grad_norm": 19.265674591064453, "learning_rate": 0.0001986368057404244, "loss": 1.0217, "step": 338 }, { "epoch": 0.0816228254981039, "grad_norm": 5.218925952911377, "learning_rate": 0.00019862394473132503, "loss": 0.6478, "step": 339 }, { "epoch": 0.08186360079455848, "grad_norm": 9.463326454162598, "learning_rate": 0.00019861102375864508, "loss": 0.4951, "step": 340 }, { "epoch": 0.08210437609101306, "grad_norm": 4.882657527923584, "learning_rate": 0.0001985980428302405, "loss": 0.5187, "step": 341 }, { "epoch": 0.08234515138746765, "grad_norm": 9.088946342468262, "learning_rate": 0.00019858500195400373, "loss": 1.6635, "step": 342 }, { "epoch": 0.08258592668392223, "grad_norm": 3.0154218673706055, "learning_rate": 0.0001985719011378637, "loss": 1.4851, "step": 343 }, { "epoch": 0.08282670198037681, "grad_norm": 8.918438911437988, "learning_rate": 0.00019855874038978563, "loss": 0.8483, "step": 344 }, { "epoch": 0.08306747727683139, "grad_norm": 3.460216760635376, "learning_rate": 0.00019854551971777137, "loss": 0.858, "step": 345 }, { "epoch": 0.08330825257328599, "grad_norm": 5.214385032653809, "learning_rate": 0.00019853223912985913, "loss": 0.6952, "step": 346 }, { "epoch": 0.08354902786974057, "grad_norm": 9.299979209899902, "learning_rate": 0.00019851889863412345, "loss": 0.5402, "step": 347 }, { "epoch": 0.08378980316619515, "grad_norm": 3.114903211593628, "learning_rate": 0.0001985054982386755, "loss": 0.5039, "step": 348 }, { "epoch": 0.08403057846264973, "grad_norm": 1.686824917793274, "learning_rate": 0.00019849203795166263, "loss": 0.5443, "step": 349 }, { "epoch": 0.08427135375910431, "grad_norm": 21.62729835510254, "learning_rate": 0.00019847851778126877, "loss": 0.9847, "step": 350 }, { "epoch": 0.0845121290555589, "grad_norm": 1.997676134109497, "learning_rate": 0.00019846493773571425, "loss": 0.2535, "step": 351 }, { "epoch": 0.08475290435201348, "grad_norm": 3.0039217472076416, "learning_rate": 0.0001984512978232558, "loss": 1.1073, "step": 352 }, { "epoch": 0.08499367964846807, "grad_norm": 1.8206866979599, "learning_rate": 0.00019843759805218637, "loss": 1.4459, "step": 353 }, { "epoch": 0.08523445494492266, "grad_norm": 2.975524663925171, "learning_rate": 0.0001984238384308356, "loss": 1.5481, "step": 354 }, { "epoch": 0.08547523024137724, "grad_norm": 2.0778095722198486, "learning_rate": 0.0001984100189675693, "loss": 0.8862, "step": 355 }, { "epoch": 0.08571600553783182, "grad_norm": 15.60510540008545, "learning_rate": 0.0001983961396707897, "loss": 0.6816, "step": 356 }, { "epoch": 0.0859567808342864, "grad_norm": 3.4831383228302, "learning_rate": 0.00019838220054893552, "loss": 0.6734, "step": 357 }, { "epoch": 0.08619755613074098, "grad_norm": 3.5622880458831787, "learning_rate": 0.00019836820161048176, "loss": 1.166, "step": 358 }, { "epoch": 0.08643833142719556, "grad_norm": 0.6584992408752441, "learning_rate": 0.00019835414286393979, "loss": 0.563, "step": 359 }, { "epoch": 0.08667910672365016, "grad_norm": 3.729058027267456, "learning_rate": 0.00019834002431785735, "loss": 0.8303, "step": 360 }, { "epoch": 0.08691988202010474, "grad_norm": 5.881722450256348, "learning_rate": 0.0001983258459808186, "loss": 1.3364, "step": 361 }, { "epoch": 0.08716065731655932, "grad_norm": 5.635914325714111, "learning_rate": 0.00019831160786144394, "loss": 0.7647, "step": 362 }, { "epoch": 0.0874014326130139, "grad_norm": 3.507514715194702, "learning_rate": 0.0001982973099683902, "loss": 0.5602, "step": 363 }, { "epoch": 0.08764220790946849, "grad_norm": 5.673732757568359, "learning_rate": 0.00019828295231035051, "loss": 0.946, "step": 364 }, { "epoch": 0.08788298320592307, "grad_norm": 2.3530821800231934, "learning_rate": 0.0001982685348960544, "loss": 0.3095, "step": 365 }, { "epoch": 0.08812375850237765, "grad_norm": 3.0282411575317383, "learning_rate": 0.00019825405773426767, "loss": 0.791, "step": 366 }, { "epoch": 0.08836453379883225, "grad_norm": 5.266041278839111, "learning_rate": 0.0001982395208337925, "loss": 1.4795, "step": 367 }, { "epoch": 0.08860530909528683, "grad_norm": 2.5949831008911133, "learning_rate": 0.0001982249242034673, "loss": 0.4774, "step": 368 }, { "epoch": 0.08884608439174141, "grad_norm": 2.186204195022583, "learning_rate": 0.00019821026785216687, "loss": 0.8617, "step": 369 }, { "epoch": 0.08908685968819599, "grad_norm": 18.886642456054688, "learning_rate": 0.00019819555178880234, "loss": 0.7616, "step": 370 }, { "epoch": 0.08932763498465057, "grad_norm": 2.88727068901062, "learning_rate": 0.00019818077602232106, "loss": 0.5059, "step": 371 }, { "epoch": 0.08956841028110515, "grad_norm": 6.770381927490234, "learning_rate": 0.00019816594056170676, "loss": 1.5388, "step": 372 }, { "epoch": 0.08980918557755974, "grad_norm": 4.677947044372559, "learning_rate": 0.00019815104541597944, "loss": 0.5632, "step": 373 }, { "epoch": 0.09004996087401433, "grad_norm": 2.849351406097412, "learning_rate": 0.00019813609059419538, "loss": 0.3689, "step": 374 }, { "epoch": 0.09029073617046891, "grad_norm": 2.1919734477996826, "learning_rate": 0.0001981210761054471, "loss": 1.003, "step": 375 }, { "epoch": 0.0905315114669235, "grad_norm": 3.296410083770752, "learning_rate": 0.0001981060019588635, "loss": 0.5615, "step": 376 }, { "epoch": 0.09077228676337808, "grad_norm": 2.373533248901367, "learning_rate": 0.00019809086816360968, "loss": 0.7389, "step": 377 }, { "epoch": 0.09101306205983266, "grad_norm": 4.461115837097168, "learning_rate": 0.00019807567472888702, "loss": 0.98, "step": 378 }, { "epoch": 0.09125383735628724, "grad_norm": 6.4342427253723145, "learning_rate": 0.00019806042166393314, "loss": 0.8969, "step": 379 }, { "epoch": 0.09149461265274182, "grad_norm": 2.5169475078582764, "learning_rate": 0.00019804510897802197, "loss": 0.3081, "step": 380 }, { "epoch": 0.09173538794919642, "grad_norm": 5.742027282714844, "learning_rate": 0.00019802973668046363, "loss": 1.2418, "step": 381 }, { "epoch": 0.091976163245651, "grad_norm": 1.0783274173736572, "learning_rate": 0.00019801430478060453, "loss": 0.4456, "step": 382 }, { "epoch": 0.09221693854210558, "grad_norm": 5.443319797515869, "learning_rate": 0.0001979988132878273, "loss": 1.0314, "step": 383 }, { "epoch": 0.09245771383856016, "grad_norm": 1.8633432388305664, "learning_rate": 0.00019798326221155078, "loss": 1.3362, "step": 384 }, { "epoch": 0.09269848913501474, "grad_norm": 8.395817756652832, "learning_rate": 0.00019796765156123008, "loss": 1.7206, "step": 385 }, { "epoch": 0.09293926443146933, "grad_norm": 0.9301803112030029, "learning_rate": 0.00019795198134635653, "loss": 0.3155, "step": 386 }, { "epoch": 0.09318003972792392, "grad_norm": 6.0776047706604, "learning_rate": 0.00019793625157645762, "loss": 0.7454, "step": 387 }, { "epoch": 0.0934208150243785, "grad_norm": 4.320910453796387, "learning_rate": 0.00019792046226109708, "loss": 0.9696, "step": 388 }, { "epoch": 0.09366159032083309, "grad_norm": 8.204424858093262, "learning_rate": 0.0001979046134098749, "loss": 1.1431, "step": 389 }, { "epoch": 0.09390236561728767, "grad_norm": 0.629797101020813, "learning_rate": 0.00019788870503242715, "loss": 0.4199, "step": 390 }, { "epoch": 0.09414314091374225, "grad_norm": 3.0499680042266846, "learning_rate": 0.00019787273713842623, "loss": 0.722, "step": 391 }, { "epoch": 0.09438391621019683, "grad_norm": 2.0613560676574707, "learning_rate": 0.00019785670973758058, "loss": 0.8111, "step": 392 }, { "epoch": 0.09462469150665141, "grad_norm": 14.847646713256836, "learning_rate": 0.00019784062283963495, "loss": 1.0207, "step": 393 }, { "epoch": 0.09486546680310601, "grad_norm": 2.1953060626983643, "learning_rate": 0.00019782447645437022, "loss": 0.3284, "step": 394 }, { "epoch": 0.09510624209956059, "grad_norm": 6.55955171585083, "learning_rate": 0.00019780827059160338, "loss": 1.3168, "step": 395 }, { "epoch": 0.09534701739601517, "grad_norm": 4.817495822906494, "learning_rate": 0.0001977920052611877, "loss": 0.6965, "step": 396 }, { "epoch": 0.09558779269246975, "grad_norm": 2.0958549976348877, "learning_rate": 0.00019777568047301243, "loss": 1.2996, "step": 397 }, { "epoch": 0.09582856798892433, "grad_norm": 3.6508209705352783, "learning_rate": 0.00019775929623700318, "loss": 0.4667, "step": 398 }, { "epoch": 0.09606934328537892, "grad_norm": 4.169986724853516, "learning_rate": 0.00019774285256312152, "loss": 1.0308, "step": 399 }, { "epoch": 0.0963101185818335, "grad_norm": 4.545289516448975, "learning_rate": 0.00019772634946136535, "loss": 1.4587, "step": 400 }, { "epoch": 0.0965508938782881, "grad_norm": 2.637938976287842, "learning_rate": 0.00019770978694176846, "loss": 0.7042, "step": 401 }, { "epoch": 0.09679166917474268, "grad_norm": 5.515408992767334, "learning_rate": 0.00019769316501440102, "loss": 1.0088, "step": 402 }, { "epoch": 0.09703244447119726, "grad_norm": 1.7717092037200928, "learning_rate": 0.00019767648368936914, "loss": 0.3585, "step": 403 }, { "epoch": 0.09727321976765184, "grad_norm": 5.126103401184082, "learning_rate": 0.0001976597429768151, "loss": 1.5234, "step": 404 }, { "epoch": 0.09751399506410642, "grad_norm": 0.473143607378006, "learning_rate": 0.00019764294288691727, "loss": 0.2934, "step": 405 }, { "epoch": 0.097754770360561, "grad_norm": 7.283068656921387, "learning_rate": 0.0001976260834298902, "loss": 1.1666, "step": 406 }, { "epoch": 0.09799554565701558, "grad_norm": 2.16549015045166, "learning_rate": 0.00019760916461598446, "loss": 0.4612, "step": 407 }, { "epoch": 0.09823632095347018, "grad_norm": 1.2254639863967896, "learning_rate": 0.0001975921864554867, "loss": 0.7512, "step": 408 }, { "epoch": 0.09847709624992476, "grad_norm": 1.8601148128509521, "learning_rate": 0.0001975751489587197, "loss": 0.8824, "step": 409 }, { "epoch": 0.09871787154637934, "grad_norm": 2.0946712493896484, "learning_rate": 0.0001975580521360423, "loss": 0.6299, "step": 410 }, { "epoch": 0.09895864684283392, "grad_norm": 5.10854434967041, "learning_rate": 0.00019754089599784938, "loss": 1.3609, "step": 411 }, { "epoch": 0.0991994221392885, "grad_norm": 2.166837453842163, "learning_rate": 0.00019752368055457197, "loss": 0.7314, "step": 412 }, { "epoch": 0.09944019743574309, "grad_norm": 3.2308640480041504, "learning_rate": 0.00019750640581667702, "loss": 1.679, "step": 413 }, { "epoch": 0.09968097273219767, "grad_norm": 3.175098180770874, "learning_rate": 0.00019748907179466767, "loss": 0.7682, "step": 414 }, { "epoch": 0.09992174802865227, "grad_norm": 6.487977981567383, "learning_rate": 0.00019747167849908304, "loss": 0.926, "step": 415 }, { "epoch": 0.10016252332510685, "grad_norm": 3.602936029434204, "learning_rate": 0.00019745422594049825, "loss": 1.0786, "step": 416 }, { "epoch": 0.10040329862156143, "grad_norm": 2.2537026405334473, "learning_rate": 0.00019743671412952453, "loss": 0.3749, "step": 417 }, { "epoch": 0.10064407391801601, "grad_norm": 3.4394688606262207, "learning_rate": 0.00019741914307680908, "loss": 0.6582, "step": 418 }, { "epoch": 0.10088484921447059, "grad_norm": 4.710788726806641, "learning_rate": 0.00019740151279303518, "loss": 1.0236, "step": 419 }, { "epoch": 0.10112562451092517, "grad_norm": 2.518106698989868, "learning_rate": 0.000197383823288922, "loss": 0.7708, "step": 420 }, { "epoch": 0.10136639980737976, "grad_norm": 2.9978835582733154, "learning_rate": 0.0001973660745752249, "loss": 0.4426, "step": 421 }, { "epoch": 0.10160717510383435, "grad_norm": 2.2193732261657715, "learning_rate": 0.0001973482666627351, "loss": 1.0488, "step": 422 }, { "epoch": 0.10184795040028893, "grad_norm": 2.385712146759033, "learning_rate": 0.0001973303995622798, "loss": 0.5798, "step": 423 }, { "epoch": 0.10208872569674352, "grad_norm": 6.944875240325928, "learning_rate": 0.00019731247328472228, "loss": 0.9012, "step": 424 }, { "epoch": 0.1023295009931981, "grad_norm": 1.5543016195297241, "learning_rate": 0.00019729448784096179, "loss": 0.5052, "step": 425 }, { "epoch": 0.10257027628965268, "grad_norm": 4.3643317222595215, "learning_rate": 0.00019727644324193347, "loss": 1.0582, "step": 426 }, { "epoch": 0.10281105158610726, "grad_norm": 3.4253134727478027, "learning_rate": 0.00019725833949860847, "loss": 0.8646, "step": 427 }, { "epoch": 0.10305182688256186, "grad_norm": 6.012450218200684, "learning_rate": 0.00019724017662199397, "loss": 0.9271, "step": 428 }, { "epoch": 0.10329260217901644, "grad_norm": 4.788900375366211, "learning_rate": 0.00019722195462313296, "loss": 0.6417, "step": 429 }, { "epoch": 0.10353337747547102, "grad_norm": 1.210336446762085, "learning_rate": 0.00019720367351310452, "loss": 0.6169, "step": 430 }, { "epoch": 0.1037741527719256, "grad_norm": 3.0183141231536865, "learning_rate": 0.00019718533330302358, "loss": 1.077, "step": 431 }, { "epoch": 0.10401492806838018, "grad_norm": 1.4695411920547485, "learning_rate": 0.000197166934004041, "loss": 0.423, "step": 432 }, { "epoch": 0.10425570336483476, "grad_norm": 3.7340753078460693, "learning_rate": 0.00019714847562734365, "loss": 0.506, "step": 433 }, { "epoch": 0.10449647866128935, "grad_norm": 0.8714501261711121, "learning_rate": 0.00019712995818415424, "loss": 0.3461, "step": 434 }, { "epoch": 0.10473725395774394, "grad_norm": 1.6766986846923828, "learning_rate": 0.00019711138168573142, "loss": 0.9932, "step": 435 }, { "epoch": 0.10497802925419852, "grad_norm": 2.76531720161438, "learning_rate": 0.00019709274614336975, "loss": 0.7046, "step": 436 }, { "epoch": 0.1052188045506531, "grad_norm": 6.036025524139404, "learning_rate": 0.00019707405156839966, "loss": 0.9637, "step": 437 }, { "epoch": 0.10545957984710769, "grad_norm": 4.022448539733887, "learning_rate": 0.0001970552979721875, "loss": 0.2239, "step": 438 }, { "epoch": 0.10570035514356227, "grad_norm": 2.6519360542297363, "learning_rate": 0.0001970364853661355, "loss": 1.0182, "step": 439 }, { "epoch": 0.10594113044001685, "grad_norm": 5.855311870574951, "learning_rate": 0.0001970176137616818, "loss": 0.7519, "step": 440 }, { "epoch": 0.10618190573647143, "grad_norm": 3.5491368770599365, "learning_rate": 0.00019699868317030035, "loss": 0.8588, "step": 441 }, { "epoch": 0.10642268103292603, "grad_norm": 4.17829704284668, "learning_rate": 0.00019697969360350098, "loss": 0.9785, "step": 442 }, { "epoch": 0.10666345632938061, "grad_norm": 4.346673488616943, "learning_rate": 0.00019696064507282937, "loss": 0.7598, "step": 443 }, { "epoch": 0.10690423162583519, "grad_norm": 2.906926155090332, "learning_rate": 0.00019694153758986714, "loss": 0.6547, "step": 444 }, { "epoch": 0.10714500692228977, "grad_norm": 1.955552339553833, "learning_rate": 0.00019692237116623163, "loss": 0.925, "step": 445 }, { "epoch": 0.10738578221874436, "grad_norm": 4.8115739822387695, "learning_rate": 0.00019690314581357607, "loss": 0.9647, "step": 446 }, { "epoch": 0.10762655751519894, "grad_norm": 2.199876308441162, "learning_rate": 0.00019688386154358955, "loss": 1.2637, "step": 447 }, { "epoch": 0.10786733281165352, "grad_norm": 8.052813529968262, "learning_rate": 0.0001968645183679969, "loss": 0.7113, "step": 448 }, { "epoch": 0.10810810810810811, "grad_norm": 6.857846260070801, "learning_rate": 0.00019684511629855888, "loss": 1.0796, "step": 449 }, { "epoch": 0.1083488834045627, "grad_norm": 3.3255105018615723, "learning_rate": 0.00019682565534707194, "loss": 0.4504, "step": 450 }, { "epoch": 0.10858965870101728, "grad_norm": 4.091807842254639, "learning_rate": 0.0001968061355253684, "loss": 0.9397, "step": 451 }, { "epoch": 0.10883043399747186, "grad_norm": 2.051816701889038, "learning_rate": 0.00019678655684531634, "loss": 0.5485, "step": 452 }, { "epoch": 0.10907120929392644, "grad_norm": 1.8907794952392578, "learning_rate": 0.00019676691931881968, "loss": 0.567, "step": 453 }, { "epoch": 0.10931198459038102, "grad_norm": 4.47649621963501, "learning_rate": 0.00019674722295781805, "loss": 0.8856, "step": 454 }, { "epoch": 0.1095527598868356, "grad_norm": 5.481165409088135, "learning_rate": 0.0001967274677742869, "loss": 0.4616, "step": 455 }, { "epoch": 0.1097935351832902, "grad_norm": 8.510377883911133, "learning_rate": 0.0001967076537802374, "loss": 0.3674, "step": 456 }, { "epoch": 0.11003431047974478, "grad_norm": 3.4752211570739746, "learning_rate": 0.00019668778098771647, "loss": 0.7903, "step": 457 }, { "epoch": 0.11027508577619936, "grad_norm": 3.52034330368042, "learning_rate": 0.00019666784940880691, "loss": 0.5652, "step": 458 }, { "epoch": 0.11051586107265395, "grad_norm": 4.425768852233887, "learning_rate": 0.0001966478590556271, "loss": 0.6404, "step": 459 }, { "epoch": 0.11075663636910853, "grad_norm": 9.201542854309082, "learning_rate": 0.00019662780994033125, "loss": 1.0613, "step": 460 }, { "epoch": 0.11099741166556311, "grad_norm": 3.8637278079986572, "learning_rate": 0.00019660770207510924, "loss": 1.1498, "step": 461 }, { "epoch": 0.11123818696201769, "grad_norm": 5.719259738922119, "learning_rate": 0.0001965875354721867, "loss": 1.0628, "step": 462 }, { "epoch": 0.11147896225847229, "grad_norm": 1.5758776664733887, "learning_rate": 0.00019656731014382501, "loss": 0.5364, "step": 463 }, { "epoch": 0.11171973755492687, "grad_norm": 7.384488582611084, "learning_rate": 0.00019654702610232114, "loss": 0.7939, "step": 464 }, { "epoch": 0.11196051285138145, "grad_norm": 5.359811782836914, "learning_rate": 0.0001965266833600079, "loss": 0.7241, "step": 465 }, { "epoch": 0.11220128814783603, "grad_norm": 3.234246015548706, "learning_rate": 0.0001965062819292537, "loss": 0.974, "step": 466 }, { "epoch": 0.11244206344429061, "grad_norm": 2.34318208694458, "learning_rate": 0.00019648582182246266, "loss": 0.3588, "step": 467 }, { "epoch": 0.1126828387407452, "grad_norm": 3.9500319957733154, "learning_rate": 0.0001964653030520746, "loss": 0.6119, "step": 468 }, { "epoch": 0.11292361403719978, "grad_norm": 2.85276198387146, "learning_rate": 0.00019644472563056485, "loss": 0.6573, "step": 469 }, { "epoch": 0.11316438933365437, "grad_norm": 1.7280099391937256, "learning_rate": 0.0001964240895704447, "loss": 0.8111, "step": 470 }, { "epoch": 0.11340516463010895, "grad_norm": 2.8521628379821777, "learning_rate": 0.00019640339488426084, "loss": 0.4068, "step": 471 }, { "epoch": 0.11364593992656354, "grad_norm": 3.4895570278167725, "learning_rate": 0.00019638264158459566, "loss": 0.8143, "step": 472 }, { "epoch": 0.11388671522301812, "grad_norm": 1.5952945947647095, "learning_rate": 0.00019636182968406726, "loss": 0.5789, "step": 473 }, { "epoch": 0.1141274905194727, "grad_norm": 3.6532886028289795, "learning_rate": 0.00019634095919532932, "loss": 0.4563, "step": 474 }, { "epoch": 0.11436826581592728, "grad_norm": 1.950562596321106, "learning_rate": 0.00019632003013107113, "loss": 0.6839, "step": 475 }, { "epoch": 0.11460904111238188, "grad_norm": 6.8443779945373535, "learning_rate": 0.00019629904250401757, "loss": 0.5238, "step": 476 }, { "epoch": 0.11484981640883646, "grad_norm": 3.7890400886535645, "learning_rate": 0.00019627799632692923, "loss": 0.6927, "step": 477 }, { "epoch": 0.11509059170529104, "grad_norm": 6.215263366699219, "learning_rate": 0.0001962568916126022, "loss": 0.962, "step": 478 }, { "epoch": 0.11533136700174562, "grad_norm": 2.3885769844055176, "learning_rate": 0.0001962357283738682, "loss": 0.5657, "step": 479 }, { "epoch": 0.1155721422982002, "grad_norm": 2.069955587387085, "learning_rate": 0.00019621450662359456, "loss": 0.5302, "step": 480 }, { "epoch": 0.11581291759465479, "grad_norm": 2.8343095779418945, "learning_rate": 0.0001961932263746841, "loss": 0.3862, "step": 481 }, { "epoch": 0.11605369289110937, "grad_norm": 2.8576223850250244, "learning_rate": 0.00019617188764007524, "loss": 1.4014, "step": 482 }, { "epoch": 0.11629446818756396, "grad_norm": 2.8722829818725586, "learning_rate": 0.00019615049043274205, "loss": 0.3474, "step": 483 }, { "epoch": 0.11653524348401854, "grad_norm": 3.647714376449585, "learning_rate": 0.00019612903476569406, "loss": 0.8658, "step": 484 }, { "epoch": 0.11677601878047313, "grad_norm": 6.923486232757568, "learning_rate": 0.00019610752065197634, "loss": 1.0191, "step": 485 }, { "epoch": 0.11701679407692771, "grad_norm": 2.716620683670044, "learning_rate": 0.0001960859481046695, "loss": 0.6586, "step": 486 }, { "epoch": 0.11725756937338229, "grad_norm": 3.657470941543579, "learning_rate": 0.00019606431713688975, "loss": 0.5454, "step": 487 }, { "epoch": 0.11749834466983687, "grad_norm": 4.070058345794678, "learning_rate": 0.00019604262776178876, "loss": 0.5342, "step": 488 }, { "epoch": 0.11773911996629145, "grad_norm": 5.016479015350342, "learning_rate": 0.0001960208799925537, "loss": 0.4871, "step": 489 }, { "epoch": 0.11797989526274605, "grad_norm": 0.8800312280654907, "learning_rate": 0.00019599907384240726, "loss": 0.7974, "step": 490 }, { "epoch": 0.11822067055920063, "grad_norm": 1.534217119216919, "learning_rate": 0.00019597720932460763, "loss": 0.5083, "step": 491 }, { "epoch": 0.11846144585565521, "grad_norm": 2.766813278198242, "learning_rate": 0.0001959552864524485, "loss": 0.7765, "step": 492 }, { "epoch": 0.1187022211521098, "grad_norm": 8.230446815490723, "learning_rate": 0.00019593330523925902, "loss": 0.6511, "step": 493 }, { "epoch": 0.11894299644856438, "grad_norm": 2.781522750854492, "learning_rate": 0.00019591126569840382, "loss": 0.4147, "step": 494 }, { "epoch": 0.11918377174501896, "grad_norm": 4.93475341796875, "learning_rate": 0.00019588916784328295, "loss": 0.4109, "step": 495 }, { "epoch": 0.11942454704147354, "grad_norm": 3.2182798385620117, "learning_rate": 0.00019586701168733202, "loss": 0.5109, "step": 496 }, { "epoch": 0.11966532233792813, "grad_norm": 5.978203773498535, "learning_rate": 0.00019584479724402197, "loss": 0.5933, "step": 497 }, { "epoch": 0.11990609763438272, "grad_norm": 2.445081949234009, "learning_rate": 0.00019582252452685927, "loss": 1.1266, "step": 498 }, { "epoch": 0.1201468729308373, "grad_norm": 2.4677765369415283, "learning_rate": 0.0001958001935493858, "loss": 0.4697, "step": 499 }, { "epoch": 0.12038764822729188, "grad_norm": 0.8610912561416626, "learning_rate": 0.00019577780432517879, "loss": 1.0763, "step": 500 }, { "epoch": 0.12062842352374646, "grad_norm": 1.9866464138031006, "learning_rate": 0.0001957553568678509, "loss": 0.729, "step": 501 }, { "epoch": 0.12086919882020104, "grad_norm": 2.3875463008880615, "learning_rate": 0.00019573285119105037, "loss": 0.8719, "step": 502 }, { "epoch": 0.12110997411665562, "grad_norm": 4.172793388366699, "learning_rate": 0.0001957102873084606, "loss": 1.0308, "step": 503 }, { "epoch": 0.12135074941311022, "grad_norm": 1.5716460943222046, "learning_rate": 0.0001956876652338005, "loss": 1.0994, "step": 504 }, { "epoch": 0.1215915247095648, "grad_norm": 8.024327278137207, "learning_rate": 0.00019566498498082438, "loss": 0.437, "step": 505 }, { "epoch": 0.12183230000601938, "grad_norm": 2.5161705017089844, "learning_rate": 0.0001956422465633218, "loss": 1.0868, "step": 506 }, { "epoch": 0.12207307530247397, "grad_norm": 4.083341598510742, "learning_rate": 0.0001956194499951179, "loss": 0.694, "step": 507 }, { "epoch": 0.12231385059892855, "grad_norm": 2.113607406616211, "learning_rate": 0.00019559659529007293, "loss": 0.8918, "step": 508 }, { "epoch": 0.12255462589538313, "grad_norm": 2.2010605335235596, "learning_rate": 0.00019557368246208263, "loss": 0.2703, "step": 509 }, { "epoch": 0.12279540119183771, "grad_norm": 2.9058799743652344, "learning_rate": 0.0001955507115250781, "loss": 1.207, "step": 510 }, { "epoch": 0.1230361764882923, "grad_norm": 7.344447612762451, "learning_rate": 0.00019552768249302566, "loss": 1.1835, "step": 511 }, { "epoch": 0.12327695178474689, "grad_norm": 1.3118301630020142, "learning_rate": 0.00019550459537992704, "loss": 0.5164, "step": 512 }, { "epoch": 0.12351772708120147, "grad_norm": 1.65935480594635, "learning_rate": 0.00019548145019981924, "loss": 0.7932, "step": 513 }, { "epoch": 0.12375850237765605, "grad_norm": 3.032277822494507, "learning_rate": 0.0001954582469667746, "loss": 0.2637, "step": 514 }, { "epoch": 0.12399927767411063, "grad_norm": 4.980113983154297, "learning_rate": 0.00019543498569490076, "loss": 0.8955, "step": 515 }, { "epoch": 0.12424005297056522, "grad_norm": 4.82036828994751, "learning_rate": 0.00019541166639834058, "loss": 1.1343, "step": 516 }, { "epoch": 0.1244808282670198, "grad_norm": 1.410509705543518, "learning_rate": 0.0001953882890912723, "loss": 0.8404, "step": 517 }, { "epoch": 0.12472160356347439, "grad_norm": 4.177162170410156, "learning_rate": 0.00019536485378790928, "loss": 1.0445, "step": 518 }, { "epoch": 0.12496237885992897, "grad_norm": 1.0933364629745483, "learning_rate": 0.00019534136050250033, "loss": 0.5387, "step": 519 }, { "epoch": 0.12520315415638356, "grad_norm": 1.1372244358062744, "learning_rate": 0.00019531780924932939, "loss": 0.5226, "step": 520 }, { "epoch": 0.12544392945283814, "grad_norm": 43.66477966308594, "learning_rate": 0.00019529420004271567, "loss": 0.2868, "step": 521 }, { "epoch": 0.12568470474929272, "grad_norm": 3.931898593902588, "learning_rate": 0.0001952705328970136, "loss": 0.5439, "step": 522 }, { "epoch": 0.1259254800457473, "grad_norm": 1.9407854080200195, "learning_rate": 0.00019524680782661294, "loss": 0.9395, "step": 523 }, { "epoch": 0.12616625534220188, "grad_norm": 3.817629814147949, "learning_rate": 0.0001952230248459385, "loss": 1.0245, "step": 524 }, { "epoch": 0.12640703063865646, "grad_norm": 6.78740119934082, "learning_rate": 0.0001951991839694504, "loss": 0.5898, "step": 525 }, { "epoch": 0.12664780593511105, "grad_norm": 3.067821979522705, "learning_rate": 0.00019517528521164395, "loss": 1.1765, "step": 526 }, { "epoch": 0.12688858123156563, "grad_norm": 3.173957109451294, "learning_rate": 0.00019515132858704965, "loss": 0.9526, "step": 527 }, { "epoch": 0.12712935652802024, "grad_norm": 2.5016558170318604, "learning_rate": 0.00019512731411023323, "loss": 1.0662, "step": 528 }, { "epoch": 0.12737013182447482, "grad_norm": 3.3116912841796875, "learning_rate": 0.00019510324179579548, "loss": 1.3049, "step": 529 }, { "epoch": 0.1276109071209294, "grad_norm": 4.086653709411621, "learning_rate": 0.00019507911165837248, "loss": 1.0897, "step": 530 }, { "epoch": 0.12785168241738398, "grad_norm": 7.5260329246521, "learning_rate": 0.00019505492371263533, "loss": 1.0542, "step": 531 }, { "epoch": 0.12809245771383856, "grad_norm": 4.74697208404541, "learning_rate": 0.00019503067797329044, "loss": 1.4031, "step": 532 }, { "epoch": 0.12833323301029315, "grad_norm": 3.090668201446533, "learning_rate": 0.0001950063744550792, "loss": 0.4726, "step": 533 }, { "epoch": 0.12857400830674773, "grad_norm": 2.889418840408325, "learning_rate": 0.00019498201317277828, "loss": 1.3182, "step": 534 }, { "epoch": 0.1288147836032023, "grad_norm": 2.548130989074707, "learning_rate": 0.00019495759414119932, "loss": 0.6617, "step": 535 }, { "epoch": 0.1290555588996569, "grad_norm": 2.9702346324920654, "learning_rate": 0.0001949331173751892, "loss": 0.7535, "step": 536 }, { "epoch": 0.12929633419611147, "grad_norm": 6.834994316101074, "learning_rate": 0.00019490858288962983, "loss": 1.2718, "step": 537 }, { "epoch": 0.12953710949256605, "grad_norm": 6.125328540802002, "learning_rate": 0.00019488399069943823, "loss": 0.6736, "step": 538 }, { "epoch": 0.12977788478902064, "grad_norm": 5.69896125793457, "learning_rate": 0.0001948593408195665, "loss": 0.6771, "step": 539 }, { "epoch": 0.13001866008547522, "grad_norm": 2.1542887687683105, "learning_rate": 0.0001948346332650018, "loss": 0.2843, "step": 540 }, { "epoch": 0.13025943538192983, "grad_norm": 4.776561737060547, "learning_rate": 0.0001948098680507665, "loss": 0.5372, "step": 541 }, { "epoch": 0.1305002106783844, "grad_norm": 1.1416128873825073, "learning_rate": 0.00019478504519191773, "loss": 0.7292, "step": 542 }, { "epoch": 0.130740985974839, "grad_norm": 1.7264859676361084, "learning_rate": 0.00019476016470354796, "loss": 0.5956, "step": 543 }, { "epoch": 0.13098176127129357, "grad_norm": 2.4325296878814697, "learning_rate": 0.00019473522660078455, "loss": 0.819, "step": 544 }, { "epoch": 0.13122253656774815, "grad_norm": 2.0552382469177246, "learning_rate": 0.00019471023089878995, "loss": 1.0633, "step": 545 }, { "epoch": 0.13146331186420274, "grad_norm": 6.430831432342529, "learning_rate": 0.00019468517761276154, "loss": 0.711, "step": 546 }, { "epoch": 0.13170408716065732, "grad_norm": 3.12066650390625, "learning_rate": 0.00019466006675793185, "loss": 0.525, "step": 547 }, { "epoch": 0.1319448624571119, "grad_norm": 18.034626007080078, "learning_rate": 0.00019463489834956827, "loss": 0.7595, "step": 548 }, { "epoch": 0.13218563775356648, "grad_norm": 17.345428466796875, "learning_rate": 0.0001946096724029733, "loss": 1.1251, "step": 549 }, { "epoch": 0.13242641305002106, "grad_norm": 1.709258794784546, "learning_rate": 0.00019458438893348433, "loss": 1.4069, "step": 550 }, { "epoch": 0.13266718834647565, "grad_norm": 2.9705605506896973, "learning_rate": 0.0001945590479564738, "loss": 0.8629, "step": 551 }, { "epoch": 0.13290796364293023, "grad_norm": 1.2169429063796997, "learning_rate": 0.00019453364948734906, "loss": 0.4889, "step": 552 }, { "epoch": 0.1331487389393848, "grad_norm": 2.59025502204895, "learning_rate": 0.00019450819354155244, "loss": 0.1758, "step": 553 }, { "epoch": 0.1333895142358394, "grad_norm": 1.7973146438598633, "learning_rate": 0.00019448268013456125, "loss": 0.9624, "step": 554 }, { "epoch": 0.133630289532294, "grad_norm": 1.6008778810501099, "learning_rate": 0.00019445710928188764, "loss": 0.8347, "step": 555 }, { "epoch": 0.13387106482874858, "grad_norm": 2.505977153778076, "learning_rate": 0.00019443148099907877, "loss": 0.3091, "step": 556 }, { "epoch": 0.13411184012520316, "grad_norm": 3.7619707584381104, "learning_rate": 0.0001944057953017167, "loss": 0.7405, "step": 557 }, { "epoch": 0.13435261542165775, "grad_norm": 2.6600496768951416, "learning_rate": 0.0001943800522054184, "loss": 0.5151, "step": 558 }, { "epoch": 0.13459339071811233, "grad_norm": 5.565666198730469, "learning_rate": 0.0001943542517258357, "loss": 1.0332, "step": 559 }, { "epoch": 0.1348341660145669, "grad_norm": 2.515794277191162, "learning_rate": 0.00019432839387865537, "loss": 1.1725, "step": 560 }, { "epoch": 0.1350749413110215, "grad_norm": 3.981748104095459, "learning_rate": 0.00019430247867959906, "loss": 0.5203, "step": 561 }, { "epoch": 0.13531571660747607, "grad_norm": 2.138054847717285, "learning_rate": 0.00019427650614442323, "loss": 0.0975, "step": 562 }, { "epoch": 0.13555649190393065, "grad_norm": 4.705209255218506, "learning_rate": 0.00019425047628891925, "loss": 0.8184, "step": 563 }, { "epoch": 0.13579726720038524, "grad_norm": 1.8869285583496094, "learning_rate": 0.00019422438912891337, "loss": 1.061, "step": 564 }, { "epoch": 0.13603804249683982, "grad_norm": 5.188673973083496, "learning_rate": 0.00019419824468026655, "loss": 0.9384, "step": 565 }, { "epoch": 0.1362788177932944, "grad_norm": 3.5460383892059326, "learning_rate": 0.0001941720429588748, "loss": 0.6326, "step": 566 }, { "epoch": 0.13651959308974898, "grad_norm": 3.3124594688415527, "learning_rate": 0.00019414578398066872, "loss": 1.625, "step": 567 }, { "epoch": 0.13676036838620356, "grad_norm": 1.9925857782363892, "learning_rate": 0.00019411946776161387, "loss": 0.86, "step": 568 }, { "epoch": 0.13700114368265817, "grad_norm": 2.7330362796783447, "learning_rate": 0.00019409309431771057, "loss": 0.5012, "step": 569 }, { "epoch": 0.13724191897911275, "grad_norm": 5.8978776931762695, "learning_rate": 0.00019406666366499393, "loss": 0.8465, "step": 570 }, { "epoch": 0.13748269427556734, "grad_norm": 1.46619713306427, "learning_rate": 0.00019404017581953385, "loss": 0.5121, "step": 571 }, { "epoch": 0.13772346957202192, "grad_norm": 3.0455288887023926, "learning_rate": 0.000194013630797435, "loss": 0.6288, "step": 572 }, { "epoch": 0.1379642448684765, "grad_norm": 2.932802677154541, "learning_rate": 0.00019398702861483678, "loss": 0.9645, "step": 573 }, { "epoch": 0.13820502016493108, "grad_norm": 4.07331657409668, "learning_rate": 0.00019396036928791345, "loss": 0.7568, "step": 574 }, { "epoch": 0.13844579546138566, "grad_norm": 2.658447027206421, "learning_rate": 0.00019393365283287386, "loss": 0.9391, "step": 575 }, { "epoch": 0.13868657075784024, "grad_norm": 1.1309797763824463, "learning_rate": 0.00019390687926596173, "loss": 0.8911, "step": 576 }, { "epoch": 0.13892734605429483, "grad_norm": 6.038357734680176, "learning_rate": 0.00019388004860345544, "loss": 0.4398, "step": 577 }, { "epoch": 0.1391681213507494, "grad_norm": 5.158764362335205, "learning_rate": 0.0001938531608616681, "loss": 0.5778, "step": 578 }, { "epoch": 0.139408896647204, "grad_norm": 0.5642886161804199, "learning_rate": 0.00019382621605694745, "loss": 0.5383, "step": 579 }, { "epoch": 0.13964967194365857, "grad_norm": 2.256866455078125, "learning_rate": 0.00019379921420567607, "loss": 0.5772, "step": 580 }, { "epoch": 0.13989044724011315, "grad_norm": 2.100571870803833, "learning_rate": 0.00019377215532427115, "loss": 0.9185, "step": 581 }, { "epoch": 0.14013122253656773, "grad_norm": 6.56969690322876, "learning_rate": 0.0001937450394291845, "loss": 1.0287, "step": 582 }, { "epoch": 0.14037199783302234, "grad_norm": 2.601640462875366, "learning_rate": 0.00019371786653690266, "loss": 1.5671, "step": 583 }, { "epoch": 0.14061277312947693, "grad_norm": 5.737114906311035, "learning_rate": 0.00019369063666394682, "loss": 0.9538, "step": 584 }, { "epoch": 0.1408535484259315, "grad_norm": 3.682819128036499, "learning_rate": 0.0001936633498268728, "loss": 0.8473, "step": 585 }, { "epoch": 0.1410943237223861, "grad_norm": 3.1207540035247803, "learning_rate": 0.00019363600604227105, "loss": 0.5173, "step": 586 }, { "epoch": 0.14133509901884067, "grad_norm": 2.8920652866363525, "learning_rate": 0.0001936086053267667, "loss": 0.7551, "step": 587 }, { "epoch": 0.14157587431529525, "grad_norm": 4.445816993713379, "learning_rate": 0.00019358114769701937, "loss": 0.6121, "step": 588 }, { "epoch": 0.14181664961174983, "grad_norm": 2.9601528644561768, "learning_rate": 0.00019355363316972342, "loss": 1.3002, "step": 589 }, { "epoch": 0.14205742490820442, "grad_norm": 6.101936340332031, "learning_rate": 0.0001935260617616077, "loss": 0.7764, "step": 590 }, { "epoch": 0.142298200204659, "grad_norm": 4.4996562004089355, "learning_rate": 0.00019349843348943574, "loss": 0.744, "step": 591 }, { "epoch": 0.14253897550111358, "grad_norm": 1.1355993747711182, "learning_rate": 0.00019347074837000554, "loss": 0.3457, "step": 592 }, { "epoch": 0.14277975079756816, "grad_norm": 5.778316497802734, "learning_rate": 0.00019344300642014974, "loss": 1.1338, "step": 593 }, { "epoch": 0.14302052609402274, "grad_norm": 1.4276717901229858, "learning_rate": 0.00019341520765673553, "loss": 0.4207, "step": 594 }, { "epoch": 0.14326130139047732, "grad_norm": 2.5159173011779785, "learning_rate": 0.00019338735209666457, "loss": 0.6524, "step": 595 }, { "epoch": 0.14350207668693193, "grad_norm": 1.4529104232788086, "learning_rate": 0.00019335943975687316, "loss": 0.4851, "step": 596 }, { "epoch": 0.14374285198338652, "grad_norm": 3.2440574169158936, "learning_rate": 0.000193331470654332, "loss": 0.8624, "step": 597 }, { "epoch": 0.1439836272798411, "grad_norm": 0.9937834739685059, "learning_rate": 0.00019330344480604646, "loss": 0.5917, "step": 598 }, { "epoch": 0.14422440257629568, "grad_norm": 2.456488609313965, "learning_rate": 0.00019327536222905623, "loss": 0.3061, "step": 599 }, { "epoch": 0.14446517787275026, "grad_norm": 4.499001979827881, "learning_rate": 0.00019324722294043558, "loss": 0.8591, "step": 600 }, { "epoch": 0.14470595316920484, "grad_norm": 1.5652315616607666, "learning_rate": 0.0001932190269572933, "loss": 0.1562, "step": 601 }, { "epoch": 0.14494672846565942, "grad_norm": 2.7683820724487305, "learning_rate": 0.00019319077429677268, "loss": 0.4619, "step": 602 }, { "epoch": 0.145187503762114, "grad_norm": 2.41717529296875, "learning_rate": 0.00019316246497605127, "loss": 0.8059, "step": 603 }, { "epoch": 0.1454282790585686, "grad_norm": 3.7864205837249756, "learning_rate": 0.00019313409901234127, "loss": 0.9378, "step": 604 }, { "epoch": 0.14566905435502317, "grad_norm": 2.745898723602295, "learning_rate": 0.00019310567642288922, "loss": 0.4861, "step": 605 }, { "epoch": 0.14590982965147775, "grad_norm": 2.9701218605041504, "learning_rate": 0.00019307719722497612, "loss": 0.9418, "step": 606 }, { "epoch": 0.14615060494793233, "grad_norm": 5.684365749359131, "learning_rate": 0.00019304866143591746, "loss": 0.7847, "step": 607 }, { "epoch": 0.14639138024438691, "grad_norm": 18.988853454589844, "learning_rate": 0.00019302006907306296, "loss": 1.1569, "step": 608 }, { "epoch": 0.1466321555408415, "grad_norm": 3.721798896789551, "learning_rate": 0.0001929914201537969, "loss": 0.8478, "step": 609 }, { "epoch": 0.1468729308372961, "grad_norm": 1.7376899719238281, "learning_rate": 0.00019296271469553786, "loss": 0.5951, "step": 610 }, { "epoch": 0.1471137061337507, "grad_norm": 13.979349136352539, "learning_rate": 0.00019293395271573885, "loss": 0.6307, "step": 611 }, { "epoch": 0.14735448143020527, "grad_norm": 3.307643175125122, "learning_rate": 0.00019290513423188724, "loss": 0.954, "step": 612 }, { "epoch": 0.14759525672665985, "grad_norm": 2.229158878326416, "learning_rate": 0.00019287625926150465, "loss": 0.9561, "step": 613 }, { "epoch": 0.14783603202311443, "grad_norm": 1.0365084409713745, "learning_rate": 0.0001928473278221472, "loss": 1.1344, "step": 614 }, { "epoch": 0.14807680731956901, "grad_norm": 2.6409239768981934, "learning_rate": 0.00019281833993140525, "loss": 0.657, "step": 615 }, { "epoch": 0.1483175826160236, "grad_norm": 1.3793067932128906, "learning_rate": 0.00019278929560690347, "loss": 0.6292, "step": 616 }, { "epoch": 0.14855835791247818, "grad_norm": 1.6385407447814941, "learning_rate": 0.00019276019486630093, "loss": 0.4795, "step": 617 }, { "epoch": 0.14879913320893276, "grad_norm": 4.453542232513428, "learning_rate": 0.00019273103772729093, "loss": 1.0279, "step": 618 }, { "epoch": 0.14903990850538734, "grad_norm": 0.5888392925262451, "learning_rate": 0.00019270182420760102, "loss": 0.3529, "step": 619 }, { "epoch": 0.14928068380184192, "grad_norm": 2.5179574489593506, "learning_rate": 0.00019267255432499318, "loss": 0.5329, "step": 620 }, { "epoch": 0.1495214590982965, "grad_norm": 10.619978904724121, "learning_rate": 0.0001926432280972635, "loss": 0.828, "step": 621 }, { "epoch": 0.1497622343947511, "grad_norm": 0.41896963119506836, "learning_rate": 0.0001926138455422424, "loss": 0.5892, "step": 622 }, { "epoch": 0.15000300969120567, "grad_norm": 3.307152032852173, "learning_rate": 0.00019258440667779456, "loss": 0.9538, "step": 623 }, { "epoch": 0.15024378498766028, "grad_norm": 1.9945799112319946, "learning_rate": 0.00019255491152181885, "loss": 0.3184, "step": 624 }, { "epoch": 0.15048456028411486, "grad_norm": 2.7123000621795654, "learning_rate": 0.00019252536009224845, "loss": 0.5069, "step": 625 }, { "epoch": 0.15072533558056944, "grad_norm": 2.0505239963531494, "learning_rate": 0.0001924957524070506, "loss": 0.6904, "step": 626 }, { "epoch": 0.15096611087702402, "grad_norm": 2.483839273452759, "learning_rate": 0.00019246608848422691, "loss": 1.4015, "step": 627 }, { "epoch": 0.1512068861734786, "grad_norm": 3.842451333999634, "learning_rate": 0.00019243636834181312, "loss": 0.5501, "step": 628 }, { "epoch": 0.1514476614699332, "grad_norm": 1.5859034061431885, "learning_rate": 0.00019240659199787908, "loss": 0.5125, "step": 629 }, { "epoch": 0.15168843676638777, "grad_norm": 1.8935115337371826, "learning_rate": 0.0001923767594705289, "loss": 0.4354, "step": 630 }, { "epoch": 0.15192921206284235, "grad_norm": 4.2323384284973145, "learning_rate": 0.00019234687077790085, "loss": 0.8988, "step": 631 }, { "epoch": 0.15216998735929693, "grad_norm": 1.3674668073654175, "learning_rate": 0.00019231692593816733, "loss": 0.3303, "step": 632 }, { "epoch": 0.1524107626557515, "grad_norm": 7.714446544647217, "learning_rate": 0.0001922869249695348, "loss": 0.2196, "step": 633 }, { "epoch": 0.1526515379522061, "grad_norm": 3.0279879570007324, "learning_rate": 0.00019225686789024402, "loss": 0.6256, "step": 634 }, { "epoch": 0.15289231324866068, "grad_norm": 4.163952350616455, "learning_rate": 0.0001922267547185697, "loss": 0.9441, "step": 635 }, { "epoch": 0.15313308854511526, "grad_norm": 1.383583426475525, "learning_rate": 0.00019219658547282067, "loss": 0.7899, "step": 636 }, { "epoch": 0.15337386384156987, "grad_norm": 1.438839077949524, "learning_rate": 0.00019216636017133998, "loss": 0.4349, "step": 637 }, { "epoch": 0.15361463913802445, "grad_norm": 7.890371322631836, "learning_rate": 0.00019213607883250466, "loss": 1.5545, "step": 638 }, { "epoch": 0.15385541443447903, "grad_norm": 6.0160746574401855, "learning_rate": 0.0001921057414747258, "loss": 1.8333, "step": 639 }, { "epoch": 0.1540961897309336, "grad_norm": 1.7680754661560059, "learning_rate": 0.00019207534811644864, "loss": 0.805, "step": 640 }, { "epoch": 0.1543369650273882, "grad_norm": 3.0242257118225098, "learning_rate": 0.00019204489877615237, "loss": 0.4745, "step": 641 }, { "epoch": 0.15457774032384278, "grad_norm": 1.6106970310211182, "learning_rate": 0.00019201439347235025, "loss": 0.5615, "step": 642 }, { "epoch": 0.15481851562029736, "grad_norm": 3.6016252040863037, "learning_rate": 0.0001919838322235896, "loss": 1.3254, "step": 643 }, { "epoch": 0.15505929091675194, "grad_norm": 6.142489433288574, "learning_rate": 0.00019195321504845173, "loss": 0.5939, "step": 644 }, { "epoch": 0.15530006621320652, "grad_norm": 2.9963788986206055, "learning_rate": 0.00019192254196555191, "loss": 0.8563, "step": 645 }, { "epoch": 0.1555408415096611, "grad_norm": 2.010145664215088, "learning_rate": 0.00019189181299353946, "loss": 0.6641, "step": 646 }, { "epoch": 0.15578161680611569, "grad_norm": 3.030747890472412, "learning_rate": 0.0001918610281510977, "loss": 1.0257, "step": 647 }, { "epoch": 0.15602239210257027, "grad_norm": 3.0926742553710938, "learning_rate": 0.0001918301874569439, "loss": 0.7438, "step": 648 }, { "epoch": 0.15626316739902485, "grad_norm": 3.063593864440918, "learning_rate": 0.00019179929092982912, "loss": 0.6192, "step": 649 }, { "epoch": 0.15650394269547943, "grad_norm": 1.6936414241790771, "learning_rate": 0.0001917683385885387, "loss": 0.3439, "step": 650 }, { "epoch": 0.15674471799193404, "grad_norm": 27.274925231933594, "learning_rate": 0.0001917373304518917, "loss": 0.8737, "step": 651 }, { "epoch": 0.15698549328838862, "grad_norm": 2.2580983638763428, "learning_rate": 0.000191706266538741, "loss": 0.9577, "step": 652 }, { "epoch": 0.1572262685848432, "grad_norm": 1.4257384538650513, "learning_rate": 0.00019167514686797369, "loss": 0.1513, "step": 653 }, { "epoch": 0.15746704388129779, "grad_norm": 2.24150013923645, "learning_rate": 0.00019164397145851055, "loss": 0.6569, "step": 654 }, { "epoch": 0.15770781917775237, "grad_norm": 5.1359758377075195, "learning_rate": 0.00019161274032930626, "loss": 0.9886, "step": 655 }, { "epoch": 0.15794859447420695, "grad_norm": 2.413954734802246, "learning_rate": 0.00019158145349934945, "loss": 0.2666, "step": 656 }, { "epoch": 0.15818936977066153, "grad_norm": 0.6739373803138733, "learning_rate": 0.00019155011098766255, "loss": 0.5449, "step": 657 }, { "epoch": 0.1584301450671161, "grad_norm": 0.7366794943809509, "learning_rate": 0.00019151871281330193, "loss": 0.2757, "step": 658 }, { "epoch": 0.1586709203635707, "grad_norm": 2.2127983570098877, "learning_rate": 0.00019148725899535774, "loss": 0.5392, "step": 659 }, { "epoch": 0.15891169566002528, "grad_norm": 1.907882571220398, "learning_rate": 0.00019145574955295395, "loss": 0.4752, "step": 660 }, { "epoch": 0.15915247095647986, "grad_norm": 4.098206520080566, "learning_rate": 0.00019142418450524836, "loss": 0.9706, "step": 661 }, { "epoch": 0.15939324625293444, "grad_norm": 3.782545804977417, "learning_rate": 0.00019139256387143262, "loss": 1.0815, "step": 662 }, { "epoch": 0.15963402154938902, "grad_norm": 2.8690521717071533, "learning_rate": 0.00019136088767073215, "loss": 1.0296, "step": 663 }, { "epoch": 0.1598747968458436, "grad_norm": 6.640118598937988, "learning_rate": 0.00019132915592240613, "loss": 0.6574, "step": 664 }, { "epoch": 0.1601155721422982, "grad_norm": 5.299488067626953, "learning_rate": 0.00019129736864574755, "loss": 0.9321, "step": 665 }, { "epoch": 0.1603563474387528, "grad_norm": 1.4800339937210083, "learning_rate": 0.0001912655258600831, "loss": 1.0515, "step": 666 }, { "epoch": 0.16059712273520738, "grad_norm": 4.096741199493408, "learning_rate": 0.00019123362758477334, "loss": 0.8097, "step": 667 }, { "epoch": 0.16083789803166196, "grad_norm": 1.2806522846221924, "learning_rate": 0.00019120167383921243, "loss": 0.5217, "step": 668 }, { "epoch": 0.16107867332811654, "grad_norm": 2.5771350860595703, "learning_rate": 0.0001911696646428284, "loss": 0.725, "step": 669 }, { "epoch": 0.16131944862457112, "grad_norm": 2.9327738285064697, "learning_rate": 0.0001911376000150828, "loss": 0.7475, "step": 670 }, { "epoch": 0.1615602239210257, "grad_norm": 3.3815646171569824, "learning_rate": 0.00019110547997547108, "loss": 0.935, "step": 671 }, { "epoch": 0.16180099921748028, "grad_norm": 7.282792568206787, "learning_rate": 0.00019107330454352228, "loss": 1.0584, "step": 672 }, { "epoch": 0.16204177451393487, "grad_norm": 12.47275447845459, "learning_rate": 0.00019104107373879909, "loss": 0.6211, "step": 673 }, { "epoch": 0.16228254981038945, "grad_norm": 1.406531572341919, "learning_rate": 0.00019100878758089798, "loss": 0.5329, "step": 674 }, { "epoch": 0.16252332510684403, "grad_norm": 2.693037748336792, "learning_rate": 0.00019097644608944897, "loss": 0.6528, "step": 675 }, { "epoch": 0.1627641004032986, "grad_norm": 0.5329806804656982, "learning_rate": 0.0001909440492841158, "loss": 0.4698, "step": 676 }, { "epoch": 0.1630048756997532, "grad_norm": 3.925929069519043, "learning_rate": 0.0001909115971845957, "loss": 0.6919, "step": 677 }, { "epoch": 0.1632456509962078, "grad_norm": 9.350509643554688, "learning_rate": 0.00019087908981061972, "loss": 1.1159, "step": 678 }, { "epoch": 0.16348642629266238, "grad_norm": 6.900551795959473, "learning_rate": 0.00019084652718195238, "loss": 0.5557, "step": 679 }, { "epoch": 0.16372720158911697, "grad_norm": 1.4014828205108643, "learning_rate": 0.00019081390931839181, "loss": 0.997, "step": 680 }, { "epoch": 0.16396797688557155, "grad_norm": 7.637568950653076, "learning_rate": 0.0001907812362397698, "loss": 1.3175, "step": 681 }, { "epoch": 0.16420875218202613, "grad_norm": 1.3787779808044434, "learning_rate": 0.00019074850796595163, "loss": 0.4951, "step": 682 }, { "epoch": 0.1644495274784807, "grad_norm": 3.6682255268096924, "learning_rate": 0.00019071572451683614, "loss": 1.0832, "step": 683 }, { "epoch": 0.1646903027749353, "grad_norm": 25.37391471862793, "learning_rate": 0.00019068288591235578, "loss": 0.6875, "step": 684 }, { "epoch": 0.16493107807138987, "grad_norm": 2.674971580505371, "learning_rate": 0.00019064999217247643, "loss": 0.9103, "step": 685 }, { "epoch": 0.16517185336784446, "grad_norm": 3.5297887325286865, "learning_rate": 0.00019061704331719764, "loss": 0.8173, "step": 686 }, { "epoch": 0.16541262866429904, "grad_norm": 1.2813355922698975, "learning_rate": 0.00019058403936655233, "loss": 0.3151, "step": 687 }, { "epoch": 0.16565340396075362, "grad_norm": 3.667281150817871, "learning_rate": 0.000190550980340607, "loss": 0.6559, "step": 688 }, { "epoch": 0.1658941792572082, "grad_norm": 2.3366219997406006, "learning_rate": 0.00019051786625946162, "loss": 0.5158, "step": 689 }, { "epoch": 0.16613495455366278, "grad_norm": 1.1751844882965088, "learning_rate": 0.00019048469714324958, "loss": 0.8607, "step": 690 }, { "epoch": 0.16637572985011737, "grad_norm": 3.535374164581299, "learning_rate": 0.00019045147301213788, "loss": 1.228, "step": 691 }, { "epoch": 0.16661650514657197, "grad_norm": 4.35559606552124, "learning_rate": 0.00019041819388632676, "loss": 0.8601, "step": 692 }, { "epoch": 0.16685728044302656, "grad_norm": 2.7030580043792725, "learning_rate": 0.00019038485978605004, "loss": 1.0164, "step": 693 }, { "epoch": 0.16709805573948114, "grad_norm": 3.0144922733306885, "learning_rate": 0.00019035147073157493, "loss": 0.8172, "step": 694 }, { "epoch": 0.16733883103593572, "grad_norm": 2.4854543209075928, "learning_rate": 0.00019031802674320206, "loss": 0.924, "step": 695 }, { "epoch": 0.1675796063323903, "grad_norm": 2.9239442348480225, "learning_rate": 0.00019028452784126542, "loss": 0.796, "step": 696 }, { "epoch": 0.16782038162884488, "grad_norm": 3.872009038925171, "learning_rate": 0.00019025097404613245, "loss": 0.4696, "step": 697 }, { "epoch": 0.16806115692529947, "grad_norm": 1.675231695175171, "learning_rate": 0.00019021736537820394, "loss": 0.4549, "step": 698 }, { "epoch": 0.16830193222175405, "grad_norm": 2.725574493408203, "learning_rate": 0.000190183701857914, "loss": 0.6834, "step": 699 }, { "epoch": 0.16854270751820863, "grad_norm": 2.2455711364746094, "learning_rate": 0.00019014998350573014, "loss": 0.4471, "step": 700 }, { "epoch": 0.1687834828146632, "grad_norm": 0.9234648942947388, "learning_rate": 0.00019011621034215322, "loss": 0.1788, "step": 701 }, { "epoch": 0.1690242581111178, "grad_norm": 1.5781611204147339, "learning_rate": 0.00019008238238771736, "loss": 0.244, "step": 702 }, { "epoch": 0.16926503340757237, "grad_norm": 5.697232246398926, "learning_rate": 0.00019004849966299005, "loss": 0.4329, "step": 703 }, { "epoch": 0.16950580870402696, "grad_norm": 4.987598896026611, "learning_rate": 0.00019001456218857208, "loss": 0.9072, "step": 704 }, { "epoch": 0.16974658400048154, "grad_norm": 2.579894781112671, "learning_rate": 0.00018998056998509747, "loss": 0.717, "step": 705 }, { "epoch": 0.16998735929693615, "grad_norm": 3.0871734619140625, "learning_rate": 0.00018994652307323363, "loss": 0.2763, "step": 706 }, { "epoch": 0.17022813459339073, "grad_norm": 2.6915767192840576, "learning_rate": 0.00018991242147368105, "loss": 0.8432, "step": 707 }, { "epoch": 0.1704689098898453, "grad_norm": 4.125692844390869, "learning_rate": 0.00018987826520717365, "loss": 1.2892, "step": 708 }, { "epoch": 0.1707096851862999, "grad_norm": 3.3036179542541504, "learning_rate": 0.00018984405429447852, "loss": 0.9282, "step": 709 }, { "epoch": 0.17095046048275447, "grad_norm": 2.7406651973724365, "learning_rate": 0.00018980978875639596, "loss": 1.1154, "step": 710 }, { "epoch": 0.17119123577920906, "grad_norm": 0.8988383412361145, "learning_rate": 0.00018977546861375947, "loss": 0.4264, "step": 711 }, { "epoch": 0.17143201107566364, "grad_norm": 0.4057740867137909, "learning_rate": 0.00018974109388743583, "loss": 0.9764, "step": 712 }, { "epoch": 0.17167278637211822, "grad_norm": 3.4650371074676514, "learning_rate": 0.0001897066645983249, "loss": 1.0979, "step": 713 }, { "epoch": 0.1719135616685728, "grad_norm": 4.947608947753906, "learning_rate": 0.00018967218076735976, "loss": 0.7168, "step": 714 }, { "epoch": 0.17215433696502738, "grad_norm": 1.033057451248169, "learning_rate": 0.0001896376424155067, "loss": 0.2137, "step": 715 }, { "epoch": 0.17239511226148196, "grad_norm": 5.465882778167725, "learning_rate": 0.00018960304956376511, "loss": 1.7501, "step": 716 }, { "epoch": 0.17263588755793655, "grad_norm": 3.3956429958343506, "learning_rate": 0.00018956840223316752, "loss": 0.5464, "step": 717 }, { "epoch": 0.17287666285439113, "grad_norm": 0.9355387687683105, "learning_rate": 0.00018953370044477955, "loss": 0.3183, "step": 718 }, { "epoch": 0.1731174381508457, "grad_norm": 0.6955990195274353, "learning_rate": 0.00018949894421969998, "loss": 0.4827, "step": 719 }, { "epoch": 0.17335821344730032, "grad_norm": 9.664114952087402, "learning_rate": 0.00018946413357906068, "loss": 0.8839, "step": 720 }, { "epoch": 0.1735989887437549, "grad_norm": 3.0460386276245117, "learning_rate": 0.0001894292685440266, "loss": 0.4881, "step": 721 }, { "epoch": 0.17383976404020948, "grad_norm": 3.0840280055999756, "learning_rate": 0.00018939434913579578, "loss": 1.0241, "step": 722 }, { "epoch": 0.17408053933666406, "grad_norm": 3.3748912811279297, "learning_rate": 0.00018935937537559926, "loss": 1.2437, "step": 723 }, { "epoch": 0.17432131463311865, "grad_norm": 10.365636825561523, "learning_rate": 0.00018932434728470118, "loss": 0.762, "step": 724 }, { "epoch": 0.17456208992957323, "grad_norm": 4.329830169677734, "learning_rate": 0.00018928926488439869, "loss": 0.7613, "step": 725 }, { "epoch": 0.1748028652260278, "grad_norm": 4.144877910614014, "learning_rate": 0.00018925412819602202, "loss": 1.1638, "step": 726 }, { "epoch": 0.1750436405224824, "grad_norm": 1.3736963272094727, "learning_rate": 0.00018921893724093428, "loss": 0.6176, "step": 727 }, { "epoch": 0.17528441581893697, "grad_norm": 0.9337141513824463, "learning_rate": 0.0001891836920405317, "loss": 0.2855, "step": 728 }, { "epoch": 0.17552519111539155, "grad_norm": 5.704214572906494, "learning_rate": 0.0001891483926162434, "loss": 0.5566, "step": 729 }, { "epoch": 0.17576596641184614, "grad_norm": 1.9563344717025757, "learning_rate": 0.00018911303898953158, "loss": 0.5568, "step": 730 }, { "epoch": 0.17600674170830072, "grad_norm": 5.422361850738525, "learning_rate": 0.00018907763118189124, "loss": 0.7783, "step": 731 }, { "epoch": 0.1762475170047553, "grad_norm": 3.7933502197265625, "learning_rate": 0.00018904216921485046, "loss": 1.178, "step": 732 }, { "epoch": 0.1764882923012099, "grad_norm": 2.3435802459716797, "learning_rate": 0.00018900665310997018, "loss": 0.5904, "step": 733 }, { "epoch": 0.1767290675976645, "grad_norm": 6.887885093688965, "learning_rate": 0.0001889710828888443, "loss": 1.1331, "step": 734 }, { "epoch": 0.17696984289411907, "grad_norm": 2.859257221221924, "learning_rate": 0.00018893545857309954, "loss": 0.8934, "step": 735 }, { "epoch": 0.17721061819057365, "grad_norm": 3.1216025352478027, "learning_rate": 0.0001888997801843956, "loss": 0.604, "step": 736 }, { "epoch": 0.17745139348702824, "grad_norm": 2.1345009803771973, "learning_rate": 0.00018886404774442502, "loss": 1.0628, "step": 737 }, { "epoch": 0.17769216878348282, "grad_norm": 3.882951021194458, "learning_rate": 0.0001888282612749132, "loss": 0.4992, "step": 738 }, { "epoch": 0.1779329440799374, "grad_norm": 6.192306041717529, "learning_rate": 0.0001887924207976184, "loss": 0.7377, "step": 739 }, { "epoch": 0.17817371937639198, "grad_norm": 7.351373672485352, "learning_rate": 0.00018875652633433166, "loss": 1.103, "step": 740 }, { "epoch": 0.17841449467284656, "grad_norm": 1.2278997898101807, "learning_rate": 0.00018872057790687697, "loss": 0.2774, "step": 741 }, { "epoch": 0.17865526996930114, "grad_norm": 2.035078525543213, "learning_rate": 0.00018868457553711102, "loss": 0.3135, "step": 742 }, { "epoch": 0.17889604526575573, "grad_norm": 3.5295181274414062, "learning_rate": 0.00018864851924692335, "loss": 0.8756, "step": 743 }, { "epoch": 0.1791368205622103, "grad_norm": 1.8237663507461548, "learning_rate": 0.00018861240905823623, "loss": 0.986, "step": 744 }, { "epoch": 0.1793775958586649, "grad_norm": 4.102538108825684, "learning_rate": 0.00018857624499300476, "loss": 0.3661, "step": 745 }, { "epoch": 0.17961837115511947, "grad_norm": 1.7040005922317505, "learning_rate": 0.0001885400270732168, "loss": 0.5499, "step": 746 }, { "epoch": 0.17985914645157408, "grad_norm": 1.8217339515686035, "learning_rate": 0.00018850375532089285, "loss": 0.3162, "step": 747 }, { "epoch": 0.18009992174802866, "grad_norm": 4.074040412902832, "learning_rate": 0.00018846742975808632, "loss": 1.4644, "step": 748 }, { "epoch": 0.18034069704448324, "grad_norm": 4.6111016273498535, "learning_rate": 0.00018843105040688312, "loss": 0.7778, "step": 749 }, { "epoch": 0.18058147234093783, "grad_norm": 2.9776699542999268, "learning_rate": 0.00018839461728940203, "loss": 0.7832, "step": 750 }, { "epoch": 0.1808222476373924, "grad_norm": 1.9872022867202759, "learning_rate": 0.0001883581304277945, "loss": 0.9256, "step": 751 }, { "epoch": 0.181063022933847, "grad_norm": 2.69476580619812, "learning_rate": 0.00018832158984424463, "loss": 0.9596, "step": 752 }, { "epoch": 0.18130379823030157, "grad_norm": 5.690935134887695, "learning_rate": 0.00018828499556096907, "loss": 0.9447, "step": 753 }, { "epoch": 0.18154457352675615, "grad_norm": 6.152745723724365, "learning_rate": 0.00018824834760021737, "loss": 1.0374, "step": 754 }, { "epoch": 0.18178534882321074, "grad_norm": 0.8274415135383606, "learning_rate": 0.00018821164598427145, "loss": 0.5589, "step": 755 }, { "epoch": 0.18202612411966532, "grad_norm": 0.797907829284668, "learning_rate": 0.00018817489073544609, "loss": 0.198, "step": 756 }, { "epoch": 0.1822668994161199, "grad_norm": 2.9858620166778564, "learning_rate": 0.00018813808187608845, "loss": 0.8879, "step": 757 }, { "epoch": 0.18250767471257448, "grad_norm": 3.2753536701202393, "learning_rate": 0.00018810121942857845, "loss": 0.9035, "step": 758 }, { "epoch": 0.18274845000902906, "grad_norm": 2.3199586868286133, "learning_rate": 0.00018806430341532858, "loss": 0.3536, "step": 759 }, { "epoch": 0.18298922530548364, "grad_norm": 2.436077833175659, "learning_rate": 0.0001880273338587838, "loss": 0.5789, "step": 760 }, { "epoch": 0.18323000060193825, "grad_norm": 4.57729959487915, "learning_rate": 0.0001879903107814217, "loss": 0.5619, "step": 761 }, { "epoch": 0.18347077589839283, "grad_norm": 2.3822367191314697, "learning_rate": 0.0001879532342057524, "loss": 0.6583, "step": 762 }, { "epoch": 0.18371155119484742, "grad_norm": 5.95395565032959, "learning_rate": 0.00018791610415431855, "loss": 0.9503, "step": 763 }, { "epoch": 0.183952326491302, "grad_norm": 10.346938133239746, "learning_rate": 0.0001878789206496953, "loss": 1.0378, "step": 764 }, { "epoch": 0.18419310178775658, "grad_norm": 2.6373162269592285, "learning_rate": 0.0001878416837144903, "loss": 0.2419, "step": 765 }, { "epoch": 0.18443387708421116, "grad_norm": 1.50508451461792, "learning_rate": 0.00018780439337134368, "loss": 0.5883, "step": 766 }, { "epoch": 0.18467465238066574, "grad_norm": 1.039527416229248, "learning_rate": 0.0001877670496429281, "loss": 0.586, "step": 767 }, { "epoch": 0.18491542767712033, "grad_norm": 3.885326862335205, "learning_rate": 0.00018772965255194857, "loss": 0.9222, "step": 768 }, { "epoch": 0.1851562029735749, "grad_norm": 5.3813605308532715, "learning_rate": 0.0001876922021211426, "loss": 0.7393, "step": 769 }, { "epoch": 0.1853969782700295, "grad_norm": 3.15456223487854, "learning_rate": 0.0001876546983732802, "loss": 0.7792, "step": 770 }, { "epoch": 0.18563775356648407, "grad_norm": 3.184206962585449, "learning_rate": 0.0001876171413311637, "loss": 1.2433, "step": 771 }, { "epoch": 0.18587852886293865, "grad_norm": 1.582762598991394, "learning_rate": 0.00018757953101762787, "loss": 0.5598, "step": 772 }, { "epoch": 0.18611930415939323, "grad_norm": 1.884548306465149, "learning_rate": 0.00018754186745553985, "loss": 0.4477, "step": 773 }, { "epoch": 0.18636007945584784, "grad_norm": 5.777435302734375, "learning_rate": 0.0001875041506677992, "loss": 0.4906, "step": 774 }, { "epoch": 0.18660085475230243, "grad_norm": 1.3165128231048584, "learning_rate": 0.00018746638067733778, "loss": 0.6351, "step": 775 }, { "epoch": 0.186841630048757, "grad_norm": 1.5441575050354004, "learning_rate": 0.00018742855750711988, "loss": 0.7108, "step": 776 }, { "epoch": 0.1870824053452116, "grad_norm": 2.326465606689453, "learning_rate": 0.00018739068118014198, "loss": 0.861, "step": 777 }, { "epoch": 0.18732318064166617, "grad_norm": 3.9939534664154053, "learning_rate": 0.00018735275171943307, "loss": 0.6814, "step": 778 }, { "epoch": 0.18756395593812075, "grad_norm": 1.1253992319107056, "learning_rate": 0.00018731476914805425, "loss": 0.1546, "step": 779 }, { "epoch": 0.18780473123457533, "grad_norm": 2.305006980895996, "learning_rate": 0.00018727673348909913, "loss": 1.0963, "step": 780 }, { "epoch": 0.18804550653102992, "grad_norm": 3.0463790893554688, "learning_rate": 0.0001872386447656934, "loss": 0.734, "step": 781 }, { "epoch": 0.1882862818274845, "grad_norm": 2.357088088989258, "learning_rate": 0.00018720050300099507, "loss": 0.7065, "step": 782 }, { "epoch": 0.18852705712393908, "grad_norm": 2.2680745124816895, "learning_rate": 0.0001871623082181945, "loss": 1.4469, "step": 783 }, { "epoch": 0.18876783242039366, "grad_norm": 2.114755392074585, "learning_rate": 0.0001871240604405141, "loss": 0.7899, "step": 784 }, { "epoch": 0.18900860771684824, "grad_norm": 1.0055882930755615, "learning_rate": 0.0001870857596912087, "loss": 0.1715, "step": 785 }, { "epoch": 0.18924938301330282, "grad_norm": 1.9801616668701172, "learning_rate": 0.00018704740599356518, "loss": 0.5179, "step": 786 }, { "epoch": 0.1894901583097574, "grad_norm": 2.5894370079040527, "learning_rate": 0.0001870089993709027, "loss": 0.4325, "step": 787 }, { "epoch": 0.18973093360621202, "grad_norm": 3.895353078842163, "learning_rate": 0.00018697053984657256, "loss": 0.3835, "step": 788 }, { "epoch": 0.1899717089026666, "grad_norm": 1.0935512781143188, "learning_rate": 0.00018693202744395827, "loss": 1.1042, "step": 789 }, { "epoch": 0.19021248419912118, "grad_norm": 1.6422269344329834, "learning_rate": 0.0001868934621864754, "loss": 0.718, "step": 790 }, { "epoch": 0.19045325949557576, "grad_norm": 2.844287633895874, "learning_rate": 0.00018685484409757178, "loss": 1.2023, "step": 791 }, { "epoch": 0.19069403479203034, "grad_norm": 1.130077600479126, "learning_rate": 0.00018681617320072725, "loss": 0.2922, "step": 792 }, { "epoch": 0.19093481008848492, "grad_norm": 2.1571900844573975, "learning_rate": 0.0001867774495194538, "loss": 0.7212, "step": 793 }, { "epoch": 0.1911755853849395, "grad_norm": 6.230739593505859, "learning_rate": 0.00018673867307729555, "loss": 0.8975, "step": 794 }, { "epoch": 0.1914163606813941, "grad_norm": 2.590592622756958, "learning_rate": 0.00018669984389782865, "loss": 0.3676, "step": 795 }, { "epoch": 0.19165713597784867, "grad_norm": 8.08610725402832, "learning_rate": 0.00018666096200466132, "loss": 0.7873, "step": 796 }, { "epoch": 0.19189791127430325, "grad_norm": 1.4064202308654785, "learning_rate": 0.00018662202742143383, "loss": 0.5145, "step": 797 }, { "epoch": 0.19213868657075783, "grad_norm": 1.37117338180542, "learning_rate": 0.0001865830401718185, "loss": 0.8417, "step": 798 }, { "epoch": 0.19237946186721241, "grad_norm": 2.1927073001861572, "learning_rate": 0.00018654400027951967, "loss": 0.9088, "step": 799 }, { "epoch": 0.192620237163667, "grad_norm": 2.8337302207946777, "learning_rate": 0.0001865049077682737, "loss": 0.5877, "step": 800 }, { "epoch": 0.19286101246012158, "grad_norm": 6.606812000274658, "learning_rate": 0.00018646576266184893, "loss": 0.9887, "step": 801 }, { "epoch": 0.1931017877565762, "grad_norm": 2.9909074306488037, "learning_rate": 0.00018642656498404564, "loss": 0.5693, "step": 802 }, { "epoch": 0.19334256305303077, "grad_norm": 0.7477906346321106, "learning_rate": 0.0001863873147586961, "loss": 0.2322, "step": 803 }, { "epoch": 0.19358333834948535, "grad_norm": 2.028005599975586, "learning_rate": 0.00018634801200966453, "loss": 0.3557, "step": 804 }, { "epoch": 0.19382411364593993, "grad_norm": 5.629332065582275, "learning_rate": 0.00018630865676084714, "loss": 0.6842, "step": 805 }, { "epoch": 0.19406488894239451, "grad_norm": 0.9226589202880859, "learning_rate": 0.000186269249036172, "loss": 0.2885, "step": 806 }, { "epoch": 0.1943056642388491, "grad_norm": 1.8051038980484009, "learning_rate": 0.00018622978885959906, "loss": 0.8416, "step": 807 }, { "epoch": 0.19454643953530368, "grad_norm": 4.140893936157227, "learning_rate": 0.0001861902762551202, "loss": 1.0417, "step": 808 }, { "epoch": 0.19478721483175826, "grad_norm": 7.981260776519775, "learning_rate": 0.0001861507112467592, "loss": 0.525, "step": 809 }, { "epoch": 0.19502799012821284, "grad_norm": 5.369372367858887, "learning_rate": 0.0001861110938585717, "loss": 0.5619, "step": 810 }, { "epoch": 0.19526876542466742, "grad_norm": 1.8795945644378662, "learning_rate": 0.0001860714241146451, "loss": 1.0825, "step": 811 }, { "epoch": 0.195509540721122, "grad_norm": 3.486668586730957, "learning_rate": 0.0001860317020390987, "loss": 0.3657, "step": 812 }, { "epoch": 0.1957503160175766, "grad_norm": 1.3779692649841309, "learning_rate": 0.00018599192765608364, "loss": 0.9127, "step": 813 }, { "epoch": 0.19599109131403117, "grad_norm": 2.563727617263794, "learning_rate": 0.00018595210098978283, "loss": 0.5109, "step": 814 }, { "epoch": 0.19623186661048578, "grad_norm": 0.7977485656738281, "learning_rate": 0.00018591222206441096, "loss": 0.5252, "step": 815 }, { "epoch": 0.19647264190694036, "grad_norm": 4.5069475173950195, "learning_rate": 0.0001858722909042145, "loss": 0.3426, "step": 816 }, { "epoch": 0.19671341720339494, "grad_norm": 6.430407524108887, "learning_rate": 0.00018583230753347173, "loss": 0.9264, "step": 817 }, { "epoch": 0.19695419249984952, "grad_norm": 2.3652713298797607, "learning_rate": 0.00018579227197649257, "loss": 0.6739, "step": 818 }, { "epoch": 0.1971949677963041, "grad_norm": 2.2648465633392334, "learning_rate": 0.00018575218425761876, "loss": 0.3986, "step": 819 }, { "epoch": 0.1974357430927587, "grad_norm": 2.1836869716644287, "learning_rate": 0.0001857120444012237, "loss": 0.2466, "step": 820 }, { "epoch": 0.19767651838921327, "grad_norm": 1.898180603981018, "learning_rate": 0.00018567185243171256, "loss": 0.5558, "step": 821 }, { "epoch": 0.19791729368566785, "grad_norm": 0.8913256525993347, "learning_rate": 0.00018563160837352212, "loss": 0.6096, "step": 822 }, { "epoch": 0.19815806898212243, "grad_norm": 3.458024024963379, "learning_rate": 0.00018559131225112085, "loss": 0.7502, "step": 823 }, { "epoch": 0.198398844278577, "grad_norm": 3.377265691757202, "learning_rate": 0.00018555096408900889, "loss": 0.9659, "step": 824 }, { "epoch": 0.1986396195750316, "grad_norm": 5.404399394989014, "learning_rate": 0.00018551056391171803, "loss": 0.8436, "step": 825 }, { "epoch": 0.19888039487148618, "grad_norm": 2.176090717315674, "learning_rate": 0.00018547011174381163, "loss": 0.6543, "step": 826 }, { "epoch": 0.19912117016794076, "grad_norm": 1.4764220714569092, "learning_rate": 0.00018542960760988475, "loss": 0.4371, "step": 827 }, { "epoch": 0.19936194546439534, "grad_norm": 4.111733913421631, "learning_rate": 0.00018538905153456394, "loss": 0.7307, "step": 828 }, { "epoch": 0.19960272076084995, "grad_norm": 3.4664177894592285, "learning_rate": 0.0001853484435425074, "loss": 0.8896, "step": 829 }, { "epoch": 0.19984349605730453, "grad_norm": 1.9064959287643433, "learning_rate": 0.00018530778365840497, "loss": 0.5491, "step": 830 }, { "epoch": 0.2000842713537591, "grad_norm": 1.8238356113433838, "learning_rate": 0.00018526707190697782, "loss": 0.564, "step": 831 }, { "epoch": 0.2003250466502137, "grad_norm": 1.4021512269973755, "learning_rate": 0.00018522630831297886, "loss": 0.2522, "step": 832 }, { "epoch": 0.20056582194666828, "grad_norm": 1.9710665941238403, "learning_rate": 0.0001851854929011924, "loss": 0.2168, "step": 833 }, { "epoch": 0.20080659724312286, "grad_norm": 1.932867407798767, "learning_rate": 0.00018514462569643435, "loss": 0.5669, "step": 834 }, { "epoch": 0.20104737253957744, "grad_norm": 1.412558674812317, "learning_rate": 0.00018510370672355204, "loss": 0.5655, "step": 835 }, { "epoch": 0.20128814783603202, "grad_norm": 5.750187873840332, "learning_rate": 0.00018506273600742433, "loss": 0.8122, "step": 836 }, { "epoch": 0.2015289231324866, "grad_norm": 4.016916275024414, "learning_rate": 0.00018502171357296144, "loss": 0.5478, "step": 837 }, { "epoch": 0.20176969842894119, "grad_norm": 1.5730372667312622, "learning_rate": 0.00018498063944510516, "loss": 0.3524, "step": 838 }, { "epoch": 0.20201047372539577, "grad_norm": 1.1213641166687012, "learning_rate": 0.0001849395136488286, "loss": 0.386, "step": 839 }, { "epoch": 0.20225124902185035, "grad_norm": 1.455862045288086, "learning_rate": 0.00018489833620913642, "loss": 0.2709, "step": 840 }, { "epoch": 0.20249202431830493, "grad_norm": 3.3921029567718506, "learning_rate": 0.0001848571071510645, "loss": 0.2738, "step": 841 }, { "epoch": 0.2027327996147595, "grad_norm": 1.9654597043991089, "learning_rate": 0.00018481582649968028, "loss": 0.5441, "step": 842 }, { "epoch": 0.20297357491121412, "grad_norm": 8.712904930114746, "learning_rate": 0.00018477449428008246, "loss": 0.5047, "step": 843 }, { "epoch": 0.2032143502076687, "grad_norm": 4.064781665802002, "learning_rate": 0.0001847331105174011, "loss": 0.6401, "step": 844 }, { "epoch": 0.20345512550412329, "grad_norm": 10.879172325134277, "learning_rate": 0.0001846916752367976, "loss": 0.7271, "step": 845 }, { "epoch": 0.20369590080057787, "grad_norm": 1.46236252784729, "learning_rate": 0.00018465018846346482, "loss": 0.3446, "step": 846 }, { "epoch": 0.20393667609703245, "grad_norm": 1.9737117290496826, "learning_rate": 0.0001846086502226267, "loss": 0.5821, "step": 847 }, { "epoch": 0.20417745139348703, "grad_norm": 2.094733715057373, "learning_rate": 0.00018456706053953862, "loss": 0.2923, "step": 848 }, { "epoch": 0.2044182266899416, "grad_norm": 1.962471842765808, "learning_rate": 0.0001845254194394872, "loss": 0.756, "step": 849 }, { "epoch": 0.2046590019863962, "grad_norm": 3.4438953399658203, "learning_rate": 0.00018448372694779034, "loss": 0.4609, "step": 850 }, { "epoch": 0.20489977728285078, "grad_norm": 1.1954097747802734, "learning_rate": 0.00018444198308979713, "loss": 0.6803, "step": 851 }, { "epoch": 0.20514055257930536, "grad_norm": 2.8534281253814697, "learning_rate": 0.00018440018789088794, "loss": 0.8631, "step": 852 }, { "epoch": 0.20538132787575994, "grad_norm": 0.7627564072608948, "learning_rate": 0.0001843583413764744, "loss": 0.3575, "step": 853 }, { "epoch": 0.20562210317221452, "grad_norm": 2.954674482345581, "learning_rate": 0.0001843164435719992, "loss": 1.2672, "step": 854 }, { "epoch": 0.2058628784686691, "grad_norm": 3.019871950149536, "learning_rate": 0.00018427449450293635, "loss": 0.5769, "step": 855 }, { "epoch": 0.2061036537651237, "grad_norm": 1.2849375009536743, "learning_rate": 0.00018423249419479099, "loss": 1.0092, "step": 856 }, { "epoch": 0.2063444290615783, "grad_norm": 2.783853054046631, "learning_rate": 0.00018419044267309939, "loss": 0.6801, "step": 857 }, { "epoch": 0.20658520435803288, "grad_norm": 3.1100003719329834, "learning_rate": 0.0001841483399634289, "loss": 1.2878, "step": 858 }, { "epoch": 0.20682597965448746, "grad_norm": 1.7785344123840332, "learning_rate": 0.00018410618609137816, "loss": 0.5104, "step": 859 }, { "epoch": 0.20706675495094204, "grad_norm": 1.5101239681243896, "learning_rate": 0.0001840639810825768, "loss": 0.6032, "step": 860 }, { "epoch": 0.20730753024739662, "grad_norm": 4.038559913635254, "learning_rate": 0.00018402172496268554, "loss": 0.6457, "step": 861 }, { "epoch": 0.2075483055438512, "grad_norm": 8.409773826599121, "learning_rate": 0.0001839794177573962, "loss": 1.5939, "step": 862 }, { "epoch": 0.20778908084030578, "grad_norm": 2.086423635482788, "learning_rate": 0.00018393705949243164, "loss": 0.7663, "step": 863 }, { "epoch": 0.20802985613676037, "grad_norm": 4.5612945556640625, "learning_rate": 0.00018389465019354577, "loss": 0.5459, "step": 864 }, { "epoch": 0.20827063143321495, "grad_norm": 1.9495208263397217, "learning_rate": 0.0001838521898865236, "loss": 0.1955, "step": 865 }, { "epoch": 0.20851140672966953, "grad_norm": 2.232084035873413, "learning_rate": 0.00018380967859718105, "loss": 0.798, "step": 866 }, { "epoch": 0.2087521820261241, "grad_norm": 5.387617111206055, "learning_rate": 0.0001837671163513651, "loss": 0.7414, "step": 867 }, { "epoch": 0.2089929573225787, "grad_norm": 7.861992359161377, "learning_rate": 0.00018372450317495365, "loss": 0.9128, "step": 868 }, { "epoch": 0.20923373261903327, "grad_norm": 2.3675897121429443, "learning_rate": 0.00018368183909385567, "loss": 0.6167, "step": 869 }, { "epoch": 0.20947450791548788, "grad_norm": 3.206550121307373, "learning_rate": 0.00018363912413401097, "loss": 0.918, "step": 870 }, { "epoch": 0.20971528321194247, "grad_norm": 11.829947471618652, "learning_rate": 0.00018359635832139034, "loss": 1.1065, "step": 871 }, { "epoch": 0.20995605850839705, "grad_norm": 2.4670798778533936, "learning_rate": 0.00018355354168199552, "loss": 0.52, "step": 872 }, { "epoch": 0.21019683380485163, "grad_norm": 2.387666702270508, "learning_rate": 0.00018351067424185913, "loss": 0.3961, "step": 873 }, { "epoch": 0.2104376091013062, "grad_norm": 0.41803881525993347, "learning_rate": 0.00018346775602704464, "loss": 0.1675, "step": 874 }, { "epoch": 0.2106783843977608, "grad_norm": 5.301272869110107, "learning_rate": 0.0001834247870636464, "loss": 1.194, "step": 875 }, { "epoch": 0.21091915969421537, "grad_norm": 2.4999866485595703, "learning_rate": 0.0001833817673777897, "loss": 0.1707, "step": 876 }, { "epoch": 0.21115993499066996, "grad_norm": 1.3982088565826416, "learning_rate": 0.00018333869699563055, "loss": 1.0266, "step": 877 }, { "epoch": 0.21140071028712454, "grad_norm": 3.187394380569458, "learning_rate": 0.00018329557594335585, "loss": 1.0817, "step": 878 }, { "epoch": 0.21164148558357912, "grad_norm": 3.2300422191619873, "learning_rate": 0.00018325240424718335, "loss": 0.6478, "step": 879 }, { "epoch": 0.2118822608800337, "grad_norm": 3.521116018295288, "learning_rate": 0.00018320918193336148, "loss": 0.8387, "step": 880 }, { "epoch": 0.21212303617648828, "grad_norm": 9.480287551879883, "learning_rate": 0.00018316590902816952, "loss": 0.9253, "step": 881 }, { "epoch": 0.21236381147294286, "grad_norm": 2.395949602127075, "learning_rate": 0.0001831225855579175, "loss": 0.8792, "step": 882 }, { "epoch": 0.21260458676939745, "grad_norm": 1.681579351425171, "learning_rate": 0.0001830792115489462, "loss": 0.9965, "step": 883 }, { "epoch": 0.21284536206585206, "grad_norm": 1.3200875520706177, "learning_rate": 0.00018303578702762705, "loss": 0.2478, "step": 884 }, { "epoch": 0.21308613736230664, "grad_norm": 2.904762029647827, "learning_rate": 0.00018299231202036233, "loss": 0.4818, "step": 885 }, { "epoch": 0.21332691265876122, "grad_norm": 2.1330971717834473, "learning_rate": 0.00018294878655358493, "loss": 0.1786, "step": 886 }, { "epoch": 0.2135676879552158, "grad_norm": 4.824681758880615, "learning_rate": 0.0001829052106537584, "loss": 0.8048, "step": 887 }, { "epoch": 0.21380846325167038, "grad_norm": 2.336089849472046, "learning_rate": 0.000182861584347377, "loss": 0.7041, "step": 888 }, { "epoch": 0.21404923854812496, "grad_norm": 2.5671005249023438, "learning_rate": 0.00018281790766096564, "loss": 0.6426, "step": 889 }, { "epoch": 0.21429001384457955, "grad_norm": 18.460041046142578, "learning_rate": 0.00018277418062107986, "loss": 0.9763, "step": 890 }, { "epoch": 0.21453078914103413, "grad_norm": 2.5273513793945312, "learning_rate": 0.00018273040325430574, "loss": 0.5831, "step": 891 }, { "epoch": 0.2147715644374887, "grad_norm": 1.38306725025177, "learning_rate": 0.00018268657558726003, "loss": 0.8044, "step": 892 }, { "epoch": 0.2150123397339433, "grad_norm": 1.9609812498092651, "learning_rate": 0.00018264269764659013, "loss": 0.3049, "step": 893 }, { "epoch": 0.21525311503039787, "grad_norm": 4.538389205932617, "learning_rate": 0.0001825987694589738, "loss": 0.8865, "step": 894 }, { "epoch": 0.21549389032685246, "grad_norm": 2.368454933166504, "learning_rate": 0.00018255479105111957, "loss": 1.0822, "step": 895 }, { "epoch": 0.21573466562330704, "grad_norm": 4.19332218170166, "learning_rate": 0.00018251076244976637, "loss": 1.0274, "step": 896 }, { "epoch": 0.21597544091976162, "grad_norm": 0.907124400138855, "learning_rate": 0.00018246668368168372, "loss": 0.5454, "step": 897 }, { "epoch": 0.21621621621621623, "grad_norm": 2.2195355892181396, "learning_rate": 0.0001824225547736716, "loss": 0.4168, "step": 898 }, { "epoch": 0.2164569915126708, "grad_norm": 4.278376579284668, "learning_rate": 0.00018237837575256044, "loss": 0.6395, "step": 899 }, { "epoch": 0.2166977668091254, "grad_norm": 3.1869797706604004, "learning_rate": 0.00018233414664521123, "loss": 0.9863, "step": 900 }, { "epoch": 0.21693854210557997, "grad_norm": 1.9933998584747314, "learning_rate": 0.00018228986747851537, "loss": 0.6143, "step": 901 }, { "epoch": 0.21717931740203456, "grad_norm": 1.5613797903060913, "learning_rate": 0.00018224553827939468, "loss": 0.4492, "step": 902 }, { "epoch": 0.21742009269848914, "grad_norm": 2.306579351425171, "learning_rate": 0.00018220115907480143, "loss": 0.5864, "step": 903 }, { "epoch": 0.21766086799494372, "grad_norm": 3.8171541690826416, "learning_rate": 0.00018215672989171824, "loss": 0.8157, "step": 904 }, { "epoch": 0.2179016432913983, "grad_norm": 1.4388493299484253, "learning_rate": 0.00018211225075715816, "loss": 0.8506, "step": 905 }, { "epoch": 0.21814241858785288, "grad_norm": 1.82477867603302, "learning_rate": 0.00018206772169816467, "loss": 0.7865, "step": 906 }, { "epoch": 0.21838319388430746, "grad_norm": 3.2749521732330322, "learning_rate": 0.00018202314274181144, "loss": 1.3825, "step": 907 }, { "epoch": 0.21862396918076205, "grad_norm": 1.8761945962905884, "learning_rate": 0.00018197851391520264, "loss": 0.8722, "step": 908 }, { "epoch": 0.21886474447721663, "grad_norm": 1.6125880479812622, "learning_rate": 0.0001819338352454727, "loss": 0.5524, "step": 909 }, { "epoch": 0.2191055197736712, "grad_norm": 1.2524000406265259, "learning_rate": 0.0001818891067597863, "loss": 0.8105, "step": 910 }, { "epoch": 0.21934629507012582, "grad_norm": 3.3656504154205322, "learning_rate": 0.0001818443284853385, "loss": 1.6029, "step": 911 }, { "epoch": 0.2195870703665804, "grad_norm": 1.9755463600158691, "learning_rate": 0.00018179950044935458, "loss": 0.401, "step": 912 }, { "epoch": 0.21982784566303498, "grad_norm": 3.240755081176758, "learning_rate": 0.0001817546226790901, "loss": 1.0564, "step": 913 }, { "epoch": 0.22006862095948956, "grad_norm": 5.947300910949707, "learning_rate": 0.00018170969520183084, "loss": 0.4548, "step": 914 }, { "epoch": 0.22030939625594415, "grad_norm": 3.0205721855163574, "learning_rate": 0.0001816647180448928, "loss": 0.8396, "step": 915 }, { "epoch": 0.22055017155239873, "grad_norm": 1.6607885360717773, "learning_rate": 0.0001816196912356222, "loss": 1.1035, "step": 916 }, { "epoch": 0.2207909468488533, "grad_norm": 1.3007737398147583, "learning_rate": 0.0001815746148013954, "loss": 0.1121, "step": 917 }, { "epoch": 0.2210317221453079, "grad_norm": 2.658994674682617, "learning_rate": 0.00018152948876961906, "loss": 0.3838, "step": 918 }, { "epoch": 0.22127249744176247, "grad_norm": 1.1010584831237793, "learning_rate": 0.00018148431316772983, "loss": 0.1575, "step": 919 }, { "epoch": 0.22151327273821705, "grad_norm": 4.701428413391113, "learning_rate": 0.0001814390880231946, "loss": 0.4306, "step": 920 }, { "epoch": 0.22175404803467164, "grad_norm": 3.2852671146392822, "learning_rate": 0.0001813938133635104, "loss": 0.2974, "step": 921 }, { "epoch": 0.22199482333112622, "grad_norm": 3.092611312866211, "learning_rate": 0.0001813484892162043, "loss": 0.7887, "step": 922 }, { "epoch": 0.2222355986275808, "grad_norm": 2.474486827850342, "learning_rate": 0.00018130311560883344, "loss": 0.7599, "step": 923 }, { "epoch": 0.22247637392403538, "grad_norm": 5.097280025482178, "learning_rate": 0.00018125769256898511, "loss": 0.6548, "step": 924 }, { "epoch": 0.22271714922049, "grad_norm": 3.1248862743377686, "learning_rate": 0.00018121222012427665, "loss": 1.0051, "step": 925 }, { "epoch": 0.22295792451694457, "grad_norm": 4.130378723144531, "learning_rate": 0.00018116669830235536, "loss": 0.8515, "step": 926 }, { "epoch": 0.22319869981339915, "grad_norm": 3.8639516830444336, "learning_rate": 0.00018112112713089863, "loss": 0.3418, "step": 927 }, { "epoch": 0.22343947510985374, "grad_norm": 5.733872890472412, "learning_rate": 0.00018107550663761386, "loss": 0.4249, "step": 928 }, { "epoch": 0.22368025040630832, "grad_norm": 2.717703104019165, "learning_rate": 0.0001810298368502384, "loss": 0.3455, "step": 929 }, { "epoch": 0.2239210257027629, "grad_norm": 4.0550689697265625, "learning_rate": 0.00018098411779653953, "loss": 0.6515, "step": 930 }, { "epoch": 0.22416180099921748, "grad_norm": 1.4261348247528076, "learning_rate": 0.00018093834950431458, "loss": 0.7618, "step": 931 }, { "epoch": 0.22440257629567206, "grad_norm": 1.7245268821716309, "learning_rate": 0.0001808925320013908, "loss": 0.7967, "step": 932 }, { "epoch": 0.22464335159212664, "grad_norm": 4.139218807220459, "learning_rate": 0.0001808466653156253, "loss": 0.7014, "step": 933 }, { "epoch": 0.22488412688858123, "grad_norm": 2.1172738075256348, "learning_rate": 0.00018080074947490516, "loss": 0.4765, "step": 934 }, { "epoch": 0.2251249021850358, "grad_norm": 4.761689186096191, "learning_rate": 0.00018075478450714724, "loss": 0.699, "step": 935 }, { "epoch": 0.2253656774814904, "grad_norm": 2.6363584995269775, "learning_rate": 0.00018070877044029846, "loss": 0.8263, "step": 936 }, { "epoch": 0.22560645277794497, "grad_norm": 1.930909276008606, "learning_rate": 0.00018066270730233538, "loss": 0.6952, "step": 937 }, { "epoch": 0.22584722807439955, "grad_norm": 0.8242762684822083, "learning_rate": 0.00018061659512126453, "loss": 0.5675, "step": 938 }, { "epoch": 0.22608800337085416, "grad_norm": 1.3294146060943604, "learning_rate": 0.0001805704339251222, "loss": 0.5123, "step": 939 }, { "epoch": 0.22632877866730874, "grad_norm": 0.8458835482597351, "learning_rate": 0.00018052422374197454, "loss": 0.2988, "step": 940 }, { "epoch": 0.22656955396376333, "grad_norm": 1.0856271982192993, "learning_rate": 0.00018047796459991742, "loss": 0.7522, "step": 941 }, { "epoch": 0.2268103292602179, "grad_norm": 5.306552410125732, "learning_rate": 0.00018043165652707649, "loss": 0.7063, "step": 942 }, { "epoch": 0.2270511045566725, "grad_norm": 5.354522228240967, "learning_rate": 0.00018038529955160718, "loss": 0.7462, "step": 943 }, { "epoch": 0.22729187985312707, "grad_norm": 1.556826114654541, "learning_rate": 0.00018033889370169465, "loss": 0.7949, "step": 944 }, { "epoch": 0.22753265514958165, "grad_norm": 0.9913277626037598, "learning_rate": 0.00018029243900555373, "loss": 0.5612, "step": 945 }, { "epoch": 0.22777343044603623, "grad_norm": 1.7368444204330444, "learning_rate": 0.000180245935491429, "loss": 0.3213, "step": 946 }, { "epoch": 0.22801420574249082, "grad_norm": 2.660506010055542, "learning_rate": 0.0001801993831875947, "loss": 0.39, "step": 947 }, { "epoch": 0.2282549810389454, "grad_norm": 3.1855568885803223, "learning_rate": 0.0001801527821223547, "loss": 0.4602, "step": 948 }, { "epoch": 0.22849575633539998, "grad_norm": 2.8115875720977783, "learning_rate": 0.0001801061323240426, "loss": 0.3065, "step": 949 }, { "epoch": 0.22873653163185456, "grad_norm": 18.071075439453125, "learning_rate": 0.00018005943382102158, "loss": 0.8023, "step": 950 }, { "epoch": 0.22897730692830914, "grad_norm": 1.1732177734375, "learning_rate": 0.00018001268664168439, "loss": 0.8773, "step": 951 }, { "epoch": 0.22921808222476375, "grad_norm": 2.2807600498199463, "learning_rate": 0.00017996589081445348, "loss": 0.7107, "step": 952 }, { "epoch": 0.22945885752121833, "grad_norm": 2.0999910831451416, "learning_rate": 0.00017991904636778077, "loss": 0.6253, "step": 953 }, { "epoch": 0.22969963281767292, "grad_norm": 13.20639419555664, "learning_rate": 0.00017987215333014782, "loss": 0.9696, "step": 954 }, { "epoch": 0.2299404081141275, "grad_norm": 1.124551773071289, "learning_rate": 0.00017982521173006568, "loss": 0.3418, "step": 955 }, { "epoch": 0.23018118341058208, "grad_norm": 0.3517683148384094, "learning_rate": 0.00017977822159607497, "loss": 0.2291, "step": 956 }, { "epoch": 0.23042195870703666, "grad_norm": 2.7812604904174805, "learning_rate": 0.0001797311829567458, "loss": 0.966, "step": 957 }, { "epoch": 0.23066273400349124, "grad_norm": 1.8114944696426392, "learning_rate": 0.0001796840958406777, "loss": 0.5787, "step": 958 }, { "epoch": 0.23090350929994582, "grad_norm": 2.012598991394043, "learning_rate": 0.00017963696027649986, "loss": 1.1201, "step": 959 }, { "epoch": 0.2311442845964004, "grad_norm": 1.5761219263076782, "learning_rate": 0.00017958977629287074, "loss": 0.9017, "step": 960 }, { "epoch": 0.231385059892855, "grad_norm": 1.2920587062835693, "learning_rate": 0.0001795425439184783, "loss": 0.319, "step": 961 }, { "epoch": 0.23162583518930957, "grad_norm": 6.733016014099121, "learning_rate": 0.00017949526318203997, "loss": 0.7354, "step": 962 }, { "epoch": 0.23186661048576415, "grad_norm": 1.5943965911865234, "learning_rate": 0.0001794479341123025, "loss": 0.2783, "step": 963 }, { "epoch": 0.23210738578221873, "grad_norm": 1.023605227470398, "learning_rate": 0.00017940055673804208, "loss": 0.5166, "step": 964 }, { "epoch": 0.23234816107867332, "grad_norm": 0.7512199282646179, "learning_rate": 0.00017935313108806427, "loss": 0.1101, "step": 965 }, { "epoch": 0.23258893637512792, "grad_norm": 3.7386422157287598, "learning_rate": 0.000179305657191204, "loss": 0.6783, "step": 966 }, { "epoch": 0.2328297116715825, "grad_norm": 1.3405836820602417, "learning_rate": 0.00017925813507632546, "loss": 0.5868, "step": 967 }, { "epoch": 0.2330704869680371, "grad_norm": 3.388740301132202, "learning_rate": 0.00017921056477232224, "loss": 0.5516, "step": 968 }, { "epoch": 0.23331126226449167, "grad_norm": 2.8512704372406006, "learning_rate": 0.00017916294630811717, "loss": 0.383, "step": 969 }, { "epoch": 0.23355203756094625, "grad_norm": 1.5921225547790527, "learning_rate": 0.00017911527971266238, "loss": 0.4268, "step": 970 }, { "epoch": 0.23379281285740083, "grad_norm": 8.35683536529541, "learning_rate": 0.00017906756501493925, "loss": 0.3925, "step": 971 }, { "epoch": 0.23403358815385542, "grad_norm": 1.589657187461853, "learning_rate": 0.0001790198022439585, "loss": 0.5233, "step": 972 }, { "epoch": 0.23427436345031, "grad_norm": 2.5263054370880127, "learning_rate": 0.00017897199142875994, "loss": 0.3526, "step": 973 }, { "epoch": 0.23451513874676458, "grad_norm": 1.696166753768921, "learning_rate": 0.00017892413259841265, "loss": 0.3805, "step": 974 }, { "epoch": 0.23475591404321916, "grad_norm": 3.3580451011657715, "learning_rate": 0.0001788762257820149, "loss": 0.66, "step": 975 }, { "epoch": 0.23499668933967374, "grad_norm": 2.4022610187530518, "learning_rate": 0.0001788282710086942, "loss": 0.4526, "step": 976 }, { "epoch": 0.23523746463612832, "grad_norm": 2.932914972305298, "learning_rate": 0.00017878026830760714, "loss": 0.8118, "step": 977 }, { "epoch": 0.2354782399325829, "grad_norm": 2.4748735427856445, "learning_rate": 0.00017873221770793943, "loss": 0.7625, "step": 978 }, { "epoch": 0.2357190152290375, "grad_norm": 7.512228488922119, "learning_rate": 0.00017868411923890597, "loss": 0.8987, "step": 979 }, { "epoch": 0.2359597905254921, "grad_norm": 1.6160115003585815, "learning_rate": 0.00017863597292975075, "loss": 0.5894, "step": 980 }, { "epoch": 0.23620056582194668, "grad_norm": 1.4038505554199219, "learning_rate": 0.00017858777880974677, "loss": 0.411, "step": 981 }, { "epoch": 0.23644134111840126, "grad_norm": 0.987040102481842, "learning_rate": 0.00017853953690819628, "loss": 0.4793, "step": 982 }, { "epoch": 0.23668211641485584, "grad_norm": 12.90198802947998, "learning_rate": 0.00017849124725443033, "loss": 0.7816, "step": 983 }, { "epoch": 0.23692289171131042, "grad_norm": 1.474013090133667, "learning_rate": 0.00017844290987780926, "loss": 0.8878, "step": 984 }, { "epoch": 0.237163667007765, "grad_norm": 4.9217963218688965, "learning_rate": 0.0001783945248077222, "loss": 0.7382, "step": 985 }, { "epoch": 0.2374044423042196, "grad_norm": 3.483311891555786, "learning_rate": 0.0001783460920735875, "loss": 1.2593, "step": 986 }, { "epoch": 0.23764521760067417, "grad_norm": 4.8503594398498535, "learning_rate": 0.00017829761170485228, "loss": 1.5008, "step": 987 }, { "epoch": 0.23788599289712875, "grad_norm": 1.4968628883361816, "learning_rate": 0.0001782490837309927, "loss": 0.62, "step": 988 }, { "epoch": 0.23812676819358333, "grad_norm": 2.4329562187194824, "learning_rate": 0.00017820050818151395, "loss": 0.7213, "step": 989 }, { "epoch": 0.23836754349003791, "grad_norm": 7.601263523101807, "learning_rate": 0.00017815188508595002, "loss": 0.4269, "step": 990 }, { "epoch": 0.2386083187864925, "grad_norm": 2.8010635375976562, "learning_rate": 0.00017810321447386387, "loss": 0.9812, "step": 991 }, { "epoch": 0.23884909408294708, "grad_norm": 4.355586051940918, "learning_rate": 0.0001780544963748474, "loss": 0.6753, "step": 992 }, { "epoch": 0.2390898693794017, "grad_norm": 1.6186625957489014, "learning_rate": 0.00017800573081852122, "loss": 0.5759, "step": 993 }, { "epoch": 0.23933064467585627, "grad_norm": 1.3594582080841064, "learning_rate": 0.000177956917834535, "loss": 0.6777, "step": 994 }, { "epoch": 0.23957141997231085, "grad_norm": 0.8430949449539185, "learning_rate": 0.00017790805745256704, "loss": 0.4463, "step": 995 }, { "epoch": 0.23981219526876543, "grad_norm": 3.564265012741089, "learning_rate": 0.00017785914970232467, "loss": 0.7162, "step": 996 }, { "epoch": 0.24005297056522001, "grad_norm": 2.442955255508423, "learning_rate": 0.00017781019461354385, "loss": 1.1975, "step": 997 }, { "epoch": 0.2402937458616746, "grad_norm": 2.008604049682617, "learning_rate": 0.00017776119221598938, "loss": 0.3523, "step": 998 }, { "epoch": 0.24053452115812918, "grad_norm": 5.036071300506592, "learning_rate": 0.00017771214253945488, "loss": 0.7299, "step": 999 }, { "epoch": 0.24077529645458376, "grad_norm": 2.059300661087036, "learning_rate": 0.0001776630456137626, "loss": 1.0976, "step": 1000 }, { "epoch": 0.24101607175103834, "grad_norm": 1.5523993968963623, "learning_rate": 0.0001776139014687636, "loss": 0.3973, "step": 1001 }, { "epoch": 0.24125684704749292, "grad_norm": 2.268207311630249, "learning_rate": 0.00017756471013433766, "loss": 0.6189, "step": 1002 }, { "epoch": 0.2414976223439475, "grad_norm": 1.0523104667663574, "learning_rate": 0.0001775154716403932, "loss": 0.6191, "step": 1003 }, { "epoch": 0.2417383976404021, "grad_norm": 1.8148690462112427, "learning_rate": 0.00017746618601686734, "loss": 0.5895, "step": 1004 }, { "epoch": 0.24197917293685667, "grad_norm": 2.2843098640441895, "learning_rate": 0.00017741685329372584, "loss": 0.4135, "step": 1005 }, { "epoch": 0.24221994823331125, "grad_norm": 1.5911093950271606, "learning_rate": 0.00017736747350096313, "loss": 0.3805, "step": 1006 }, { "epoch": 0.24246072352976586, "grad_norm": 1.609438180923462, "learning_rate": 0.00017731804666860218, "loss": 0.4508, "step": 1007 }, { "epoch": 0.24270149882622044, "grad_norm": 9.2236328125, "learning_rate": 0.0001772685728266947, "loss": 0.7403, "step": 1008 }, { "epoch": 0.24294227412267502, "grad_norm": 4.165558815002441, "learning_rate": 0.00017721905200532084, "loss": 0.4195, "step": 1009 }, { "epoch": 0.2431830494191296, "grad_norm": 2.679929494857788, "learning_rate": 0.00017716948423458938, "loss": 0.696, "step": 1010 }, { "epoch": 0.24342382471558419, "grad_norm": 2.558372974395752, "learning_rate": 0.00017711986954463765, "loss": 0.8344, "step": 1011 }, { "epoch": 0.24366460001203877, "grad_norm": 2.897308588027954, "learning_rate": 0.0001770702079656315, "loss": 0.4203, "step": 1012 }, { "epoch": 0.24390537530849335, "grad_norm": 3.2203593254089355, "learning_rate": 0.00017702049952776522, "loss": 0.7664, "step": 1013 }, { "epoch": 0.24414615060494793, "grad_norm": 4.204813480377197, "learning_rate": 0.00017697074426126173, "loss": 0.3801, "step": 1014 }, { "epoch": 0.2443869259014025, "grad_norm": 0.8308073878288269, "learning_rate": 0.0001769209421963723, "loss": 0.2596, "step": 1015 }, { "epoch": 0.2446277011978571, "grad_norm": 2.2909529209136963, "learning_rate": 0.00017687109336337673, "loss": 0.3914, "step": 1016 }, { "epoch": 0.24486847649431168, "grad_norm": 3.4535796642303467, "learning_rate": 0.00017682119779258317, "loss": 0.7128, "step": 1017 }, { "epoch": 0.24510925179076626, "grad_norm": 2.2746803760528564, "learning_rate": 0.0001767712555143283, "loss": 0.6153, "step": 1018 }, { "epoch": 0.24535002708722084, "grad_norm": 3.151444435119629, "learning_rate": 0.00017672126655897708, "loss": 0.8, "step": 1019 }, { "epoch": 0.24559080238367542, "grad_norm": 7.057896614074707, "learning_rate": 0.00017667123095692296, "loss": 0.4853, "step": 1020 }, { "epoch": 0.24583157768013003, "grad_norm": 1.5912202596664429, "learning_rate": 0.00017662114873858768, "loss": 0.5406, "step": 1021 }, { "epoch": 0.2460723529765846, "grad_norm": 4.36636209487915, "learning_rate": 0.00017657101993442132, "loss": 1.5037, "step": 1022 }, { "epoch": 0.2463131282730392, "grad_norm": 3.4972012042999268, "learning_rate": 0.00017652084457490233, "loss": 0.4583, "step": 1023 }, { "epoch": 0.24655390356949378, "grad_norm": 1.963361382484436, "learning_rate": 0.00017647062269053745, "loss": 0.5212, "step": 1024 }, { "epoch": 0.24679467886594836, "grad_norm": 8.170878410339355, "learning_rate": 0.00017642035431186166, "loss": 0.3219, "step": 1025 }, { "epoch": 0.24703545416240294, "grad_norm": 5.340506076812744, "learning_rate": 0.00017637003946943826, "loss": 0.8826, "step": 1026 }, { "epoch": 0.24727622945885752, "grad_norm": 0.9775887727737427, "learning_rate": 0.00017631967819385885, "loss": 0.7689, "step": 1027 }, { "epoch": 0.2475170047553121, "grad_norm": 5.842097759246826, "learning_rate": 0.0001762692705157431, "loss": 1.3133, "step": 1028 }, { "epoch": 0.24775778005176669, "grad_norm": 3.479212999343872, "learning_rate": 0.00017621881646573905, "loss": 0.6421, "step": 1029 }, { "epoch": 0.24799855534822127, "grad_norm": 2.3998911380767822, "learning_rate": 0.00017616831607452288, "loss": 0.9605, "step": 1030 }, { "epoch": 0.24823933064467585, "grad_norm": 2.134242057800293, "learning_rate": 0.00017611776937279894, "loss": 0.5968, "step": 1031 }, { "epoch": 0.24848010594113043, "grad_norm": 1.5552438497543335, "learning_rate": 0.00017606717639129967, "loss": 0.5313, "step": 1032 }, { "epoch": 0.248720881237585, "grad_norm": 1.7223352193832397, "learning_rate": 0.00017601653716078583, "loss": 0.6771, "step": 1033 }, { "epoch": 0.2489616565340396, "grad_norm": 1.0817844867706299, "learning_rate": 0.00017596585171204612, "loss": 0.0747, "step": 1034 }, { "epoch": 0.2492024318304942, "grad_norm": 16.8873291015625, "learning_rate": 0.0001759151200758974, "loss": 0.7068, "step": 1035 }, { "epoch": 0.24944320712694878, "grad_norm": 3.909327983856201, "learning_rate": 0.00017586434228318462, "loss": 1.1171, "step": 1036 }, { "epoch": 0.24968398242340337, "grad_norm": 1.0942474603652954, "learning_rate": 0.00017581351836478085, "loss": 0.3179, "step": 1037 }, { "epoch": 0.24992475771985795, "grad_norm": 1.4328174591064453, "learning_rate": 0.00017576264835158706, "loss": 0.5279, "step": 1038 }, { "epoch": 0.2501655330163125, "grad_norm": 1.4774179458618164, "learning_rate": 0.0001757117322745324, "loss": 0.5594, "step": 1039 }, { "epoch": 0.2504063083127671, "grad_norm": 5.494201183319092, "learning_rate": 0.00017566077016457394, "loss": 0.892, "step": 1040 }, { "epoch": 0.2506470836092217, "grad_norm": 0.7356401085853577, "learning_rate": 0.00017560976205269673, "loss": 0.3253, "step": 1041 }, { "epoch": 0.2508878589056763, "grad_norm": 1.2550084590911865, "learning_rate": 0.00017555870796991387, "loss": 0.4044, "step": 1042 }, { "epoch": 0.2511286342021309, "grad_norm": 5.170292377471924, "learning_rate": 0.00017550760794726633, "loss": 0.5862, "step": 1043 }, { "epoch": 0.25136940949858544, "grad_norm": 1.0770255327224731, "learning_rate": 0.00017545646201582303, "loss": 0.7886, "step": 1044 }, { "epoch": 0.25161018479504005, "grad_norm": 1.4369720220565796, "learning_rate": 0.0001754052702066808, "loss": 0.3612, "step": 1045 }, { "epoch": 0.2518509600914946, "grad_norm": 1.905137300491333, "learning_rate": 0.00017535403255096444, "loss": 0.9435, "step": 1046 }, { "epoch": 0.2520917353879492, "grad_norm": 16.227949142456055, "learning_rate": 0.00017530274907982647, "loss": 1.7011, "step": 1047 }, { "epoch": 0.25233251068440377, "grad_norm": 1.4642868041992188, "learning_rate": 0.0001752514198244474, "loss": 0.5992, "step": 1048 }, { "epoch": 0.2525732859808584, "grad_norm": 2.769197463989258, "learning_rate": 0.00017520004481603554, "loss": 1.3272, "step": 1049 }, { "epoch": 0.25281406127731293, "grad_norm": 2.604154586791992, "learning_rate": 0.00017514862408582701, "loss": 0.6135, "step": 1050 }, { "epoch": 0.25305483657376754, "grad_norm": 4.081873416900635, "learning_rate": 0.00017509715766508575, "loss": 1.1481, "step": 1051 }, { "epoch": 0.2532956118702221, "grad_norm": 5.054668426513672, "learning_rate": 0.0001750456455851034, "loss": 0.8622, "step": 1052 }, { "epoch": 0.2535363871666767, "grad_norm": 3.3023860454559326, "learning_rate": 0.00017499408787719945, "loss": 0.6033, "step": 1053 }, { "epoch": 0.25377716246313126, "grad_norm": 1.7069976329803467, "learning_rate": 0.00017494248457272112, "loss": 0.4344, "step": 1054 }, { "epoch": 0.25401793775958587, "grad_norm": 4.372264385223389, "learning_rate": 0.00017489083570304333, "loss": 1.2165, "step": 1055 }, { "epoch": 0.2542587130560405, "grad_norm": 3.081066608428955, "learning_rate": 0.00017483914129956868, "loss": 0.8693, "step": 1056 }, { "epoch": 0.25449948835249503, "grad_norm": 3.6236472129821777, "learning_rate": 0.00017478740139372753, "loss": 0.8538, "step": 1057 }, { "epoch": 0.25474026364894964, "grad_norm": 1.3119887113571167, "learning_rate": 0.00017473561601697783, "loss": 0.6279, "step": 1058 }, { "epoch": 0.2549810389454042, "grad_norm": 1.9135150909423828, "learning_rate": 0.0001746837852008052, "loss": 0.2616, "step": 1059 }, { "epoch": 0.2552218142418588, "grad_norm": 3.3330864906311035, "learning_rate": 0.0001746319089767229, "loss": 0.5011, "step": 1060 }, { "epoch": 0.25546258953831336, "grad_norm": 1.3728001117706299, "learning_rate": 0.00017457998737627182, "loss": 0.3416, "step": 1061 }, { "epoch": 0.25570336483476797, "grad_norm": 1.3205347061157227, "learning_rate": 0.00017452802043102034, "loss": 0.4671, "step": 1062 }, { "epoch": 0.2559441401312225, "grad_norm": 1.428043246269226, "learning_rate": 0.00017447600817256458, "loss": 0.9892, "step": 1063 }, { "epoch": 0.25618491542767713, "grad_norm": 6.333396911621094, "learning_rate": 0.000174423950632528, "loss": 0.2126, "step": 1064 }, { "epoch": 0.2564256907241317, "grad_norm": 4.501138210296631, "learning_rate": 0.00017437184784256177, "loss": 1.0723, "step": 1065 }, { "epoch": 0.2566664660205863, "grad_norm": 6.165459632873535, "learning_rate": 0.0001743196998343445, "loss": 0.7911, "step": 1066 }, { "epoch": 0.25690724131704085, "grad_norm": 2.045748710632324, "learning_rate": 0.00017426750663958231, "loss": 0.6512, "step": 1067 }, { "epoch": 0.25714801661349546, "grad_norm": 15.06201457977295, "learning_rate": 0.00017421526829000872, "loss": 1.1931, "step": 1068 }, { "epoch": 0.25738879190995007, "grad_norm": 1.8321692943572998, "learning_rate": 0.00017416298481738482, "loss": 0.5883, "step": 1069 }, { "epoch": 0.2576295672064046, "grad_norm": 1.6688590049743652, "learning_rate": 0.00017411065625349905, "loss": 0.2832, "step": 1070 }, { "epoch": 0.25787034250285923, "grad_norm": 3.1032514572143555, "learning_rate": 0.00017405828263016734, "loss": 0.6419, "step": 1071 }, { "epoch": 0.2581111177993138, "grad_norm": 1.9432801008224487, "learning_rate": 0.00017400586397923288, "loss": 0.5049, "step": 1072 }, { "epoch": 0.2583518930957684, "grad_norm": 1.8286429643630981, "learning_rate": 0.0001739534003325664, "loss": 0.9415, "step": 1073 }, { "epoch": 0.25859266839222295, "grad_norm": 1.2921900749206543, "learning_rate": 0.00017390089172206592, "loss": 0.1063, "step": 1074 }, { "epoch": 0.25883344368867756, "grad_norm": 2.301280975341797, "learning_rate": 0.00017384833817965674, "loss": 0.6128, "step": 1075 }, { "epoch": 0.2590742189851321, "grad_norm": 2.117572069168091, "learning_rate": 0.00017379573973729163, "loss": 0.2485, "step": 1076 }, { "epoch": 0.2593149942815867, "grad_norm": 44.768497467041016, "learning_rate": 0.0001737430964269504, "loss": 1.1191, "step": 1077 }, { "epoch": 0.2595557695780413, "grad_norm": 3.337317943572998, "learning_rate": 0.00017369040828064047, "loss": 0.372, "step": 1078 }, { "epoch": 0.2597965448744959, "grad_norm": 2.307708263397217, "learning_rate": 0.00017363767533039626, "loss": 0.9921, "step": 1079 }, { "epoch": 0.26003732017095044, "grad_norm": 1.9676494598388672, "learning_rate": 0.00017358489760827954, "loss": 0.1324, "step": 1080 }, { "epoch": 0.26027809546740505, "grad_norm": 2.810729503631592, "learning_rate": 0.00017353207514637928, "loss": 0.5826, "step": 1081 }, { "epoch": 0.26051887076385966, "grad_norm": 0.4161555767059326, "learning_rate": 0.00017347920797681165, "loss": 0.1594, "step": 1082 }, { "epoch": 0.2607596460603142, "grad_norm": 2.319537878036499, "learning_rate": 0.00017342629613172005, "loss": 1.0077, "step": 1083 }, { "epoch": 0.2610004213567688, "grad_norm": 6.007627487182617, "learning_rate": 0.00017337333964327493, "loss": 0.7686, "step": 1084 }, { "epoch": 0.2612411966532234, "grad_norm": 1.204407811164856, "learning_rate": 0.00017332033854367405, "loss": 0.3591, "step": 1085 }, { "epoch": 0.261481971949678, "grad_norm": 1.132603406906128, "learning_rate": 0.00017326729286514208, "loss": 0.5379, "step": 1086 }, { "epoch": 0.26172274724613254, "grad_norm": 2.5757410526275635, "learning_rate": 0.00017321420263993102, "loss": 0.4672, "step": 1087 }, { "epoch": 0.26196352254258715, "grad_norm": 4.104795932769775, "learning_rate": 0.0001731610679003198, "loss": 0.9948, "step": 1088 }, { "epoch": 0.2622042978390417, "grad_norm": 2.2880449295043945, "learning_rate": 0.00017310788867861446, "loss": 0.5483, "step": 1089 }, { "epoch": 0.2624450731354963, "grad_norm": 0.9389917254447937, "learning_rate": 0.00017305466500714808, "loss": 0.4569, "step": 1090 }, { "epoch": 0.26268584843195086, "grad_norm": 1.6185779571533203, "learning_rate": 0.00017300139691828076, "loss": 0.5776, "step": 1091 }, { "epoch": 0.2629266237284055, "grad_norm": 1.6307772397994995, "learning_rate": 0.00017294808444439966, "loss": 0.3469, "step": 1092 }, { "epoch": 0.26316739902486, "grad_norm": 1.6767011880874634, "learning_rate": 0.00017289472761791887, "loss": 0.4124, "step": 1093 }, { "epoch": 0.26340817432131464, "grad_norm": 3.5887739658355713, "learning_rate": 0.00017284132647127947, "loss": 0.7729, "step": 1094 }, { "epoch": 0.2636489496177692, "grad_norm": 2.98939847946167, "learning_rate": 0.00017278788103694943, "loss": 0.547, "step": 1095 }, { "epoch": 0.2638897249142238, "grad_norm": 3.3976495265960693, "learning_rate": 0.00017273439134742372, "loss": 0.9218, "step": 1096 }, { "epoch": 0.2641305002106784, "grad_norm": 4.070367813110352, "learning_rate": 0.00017268085743522423, "loss": 0.6816, "step": 1097 }, { "epoch": 0.26437127550713296, "grad_norm": 2.875377893447876, "learning_rate": 0.00017262727933289965, "loss": 0.3219, "step": 1098 }, { "epoch": 0.2646120508035876, "grad_norm": 2.2826290130615234, "learning_rate": 0.0001725736570730256, "loss": 0.4502, "step": 1099 }, { "epoch": 0.2648528261000421, "grad_norm": 1.6275043487548828, "learning_rate": 0.00017251999068820456, "loss": 0.2921, "step": 1100 }, { "epoch": 0.26509360139649674, "grad_norm": 7.831151962280273, "learning_rate": 0.00017246628021106577, "loss": 0.6283, "step": 1101 }, { "epoch": 0.2653343766929513, "grad_norm": 2.219731569290161, "learning_rate": 0.00017241252567426534, "loss": 0.4064, "step": 1102 }, { "epoch": 0.2655751519894059, "grad_norm": 3.912492036819458, "learning_rate": 0.00017235872711048617, "loss": 1.037, "step": 1103 }, { "epoch": 0.26581592728586045, "grad_norm": 2.191307783126831, "learning_rate": 0.00017230488455243788, "loss": 0.8365, "step": 1104 }, { "epoch": 0.26605670258231506, "grad_norm": 4.49420690536499, "learning_rate": 0.00017225099803285692, "loss": 0.6711, "step": 1105 }, { "epoch": 0.2662974778787696, "grad_norm": 3.0059874057769775, "learning_rate": 0.00017219706758450631, "loss": 0.8782, "step": 1106 }, { "epoch": 0.2665382531752242, "grad_norm": 0.6084616780281067, "learning_rate": 0.00017214309324017598, "loss": 0.146, "step": 1107 }, { "epoch": 0.2667790284716788, "grad_norm": 3.9558541774749756, "learning_rate": 0.0001720890750326824, "loss": 0.4695, "step": 1108 }, { "epoch": 0.2670198037681334, "grad_norm": 3.0784409046173096, "learning_rate": 0.00017203501299486881, "loss": 0.9544, "step": 1109 }, { "epoch": 0.267260579064588, "grad_norm": 0.6500839591026306, "learning_rate": 0.000171980907159605, "loss": 0.2016, "step": 1110 }, { "epoch": 0.26750135436104255, "grad_norm": 4.700311660766602, "learning_rate": 0.00017192675755978748, "loss": 1.1171, "step": 1111 }, { "epoch": 0.26774212965749716, "grad_norm": 1.3093310594558716, "learning_rate": 0.00017187256422833929, "loss": 0.2874, "step": 1112 }, { "epoch": 0.2679829049539517, "grad_norm": 2.463928461074829, "learning_rate": 0.0001718183271982101, "loss": 0.4196, "step": 1113 }, { "epoch": 0.2682236802504063, "grad_norm": 3.2348127365112305, "learning_rate": 0.0001717640465023762, "loss": 1.1128, "step": 1114 }, { "epoch": 0.2684644555468609, "grad_norm": 2.890456438064575, "learning_rate": 0.00017170972217384035, "loss": 0.5629, "step": 1115 }, { "epoch": 0.2687052308433155, "grad_norm": 4.826290607452393, "learning_rate": 0.00017165535424563185, "loss": 0.7627, "step": 1116 }, { "epoch": 0.26894600613977004, "grad_norm": 2.350214719772339, "learning_rate": 0.00017160094275080648, "loss": 0.8664, "step": 1117 }, { "epoch": 0.26918678143622465, "grad_norm": 2.407381772994995, "learning_rate": 0.00017154648772244664, "loss": 0.5608, "step": 1118 }, { "epoch": 0.2694275567326792, "grad_norm": 4.348508834838867, "learning_rate": 0.00017149198919366105, "loss": 0.7147, "step": 1119 }, { "epoch": 0.2696683320291338, "grad_norm": 2.3149821758270264, "learning_rate": 0.00017143744719758499, "loss": 0.8603, "step": 1120 }, { "epoch": 0.26990910732558837, "grad_norm": 2.070988893508911, "learning_rate": 0.00017138286176738006, "loss": 0.3237, "step": 1121 }, { "epoch": 0.270149882622043, "grad_norm": 2.3443868160247803, "learning_rate": 0.00017132823293623432, "loss": 0.313, "step": 1122 }, { "epoch": 0.27039065791849753, "grad_norm": 2.001828908920288, "learning_rate": 0.0001712735607373623, "loss": 0.665, "step": 1123 }, { "epoch": 0.27063143321495214, "grad_norm": 8.432289123535156, "learning_rate": 0.00017121884520400474, "loss": 0.7836, "step": 1124 }, { "epoch": 0.27087220851140675, "grad_norm": 2.163132429122925, "learning_rate": 0.00017116408636942888, "loss": 0.2619, "step": 1125 }, { "epoch": 0.2711129838078613, "grad_norm": 1.5865484476089478, "learning_rate": 0.0001711092842669281, "loss": 0.4622, "step": 1126 }, { "epoch": 0.2713537591043159, "grad_norm": 2.0779945850372314, "learning_rate": 0.0001710544389298223, "loss": 0.3762, "step": 1127 }, { "epoch": 0.27159453440077047, "grad_norm": 7.750448703765869, "learning_rate": 0.00017099955039145758, "loss": 0.7578, "step": 1128 }, { "epoch": 0.2718353096972251, "grad_norm": 1.3454210758209229, "learning_rate": 0.00017094461868520622, "loss": 0.5281, "step": 1129 }, { "epoch": 0.27207608499367963, "grad_norm": 1.0360485315322876, "learning_rate": 0.0001708896438444669, "loss": 0.7575, "step": 1130 }, { "epoch": 0.27231686029013424, "grad_norm": 1.6887176036834717, "learning_rate": 0.00017083462590266438, "loss": 0.4004, "step": 1131 }, { "epoch": 0.2725576355865888, "grad_norm": 2.2301809787750244, "learning_rate": 0.00017077956489324972, "loss": 0.4566, "step": 1132 }, { "epoch": 0.2727984108830434, "grad_norm": 1.984376311302185, "learning_rate": 0.00017072446084970014, "loss": 0.3397, "step": 1133 }, { "epoch": 0.27303918617949796, "grad_norm": 1.465584397315979, "learning_rate": 0.000170669313805519, "loss": 1.118, "step": 1134 }, { "epoch": 0.27327996147595257, "grad_norm": 3.79280948638916, "learning_rate": 0.00017061412379423588, "loss": 1.0574, "step": 1135 }, { "epoch": 0.2735207367724071, "grad_norm": 2.3764195442199707, "learning_rate": 0.00017055889084940638, "loss": 0.66, "step": 1136 }, { "epoch": 0.27376151206886173, "grad_norm": 4.677147388458252, "learning_rate": 0.00017050361500461225, "loss": 1.764, "step": 1137 }, { "epoch": 0.27400228736531634, "grad_norm": 0.9030484557151794, "learning_rate": 0.00017044829629346138, "loss": 0.6526, "step": 1138 }, { "epoch": 0.2742430626617709, "grad_norm": 0.627707302570343, "learning_rate": 0.00017039293474958766, "loss": 0.3727, "step": 1139 }, { "epoch": 0.2744838379582255, "grad_norm": 1.3778057098388672, "learning_rate": 0.00017033753040665098, "loss": 0.6493, "step": 1140 }, { "epoch": 0.27472461325468006, "grad_norm": 4.6707305908203125, "learning_rate": 0.00017028208329833734, "loss": 0.9048, "step": 1141 }, { "epoch": 0.27496538855113467, "grad_norm": 0.9319252967834473, "learning_rate": 0.00017022659345835873, "loss": 0.199, "step": 1142 }, { "epoch": 0.2752061638475892, "grad_norm": 4.65252685546875, "learning_rate": 0.00017017106092045308, "loss": 0.5977, "step": 1143 }, { "epoch": 0.27544693914404383, "grad_norm": 2.0846447944641113, "learning_rate": 0.00017011548571838425, "loss": 0.8822, "step": 1144 }, { "epoch": 0.2756877144404984, "grad_norm": 7.9237961769104, "learning_rate": 0.00017005986788594217, "loss": 0.9254, "step": 1145 }, { "epoch": 0.275928489736953, "grad_norm": 1.8218225240707397, "learning_rate": 0.00017000420745694254, "loss": 0.8144, "step": 1146 }, { "epoch": 0.27616926503340755, "grad_norm": 2.215475082397461, "learning_rate": 0.00016994850446522708, "loss": 0.5376, "step": 1147 }, { "epoch": 0.27641004032986216, "grad_norm": 2.7972052097320557, "learning_rate": 0.0001698927589446633, "loss": 0.7172, "step": 1148 }, { "epoch": 0.2766508156263167, "grad_norm": 3.2082738876342773, "learning_rate": 0.00016983697092914462, "loss": 0.7779, "step": 1149 }, { "epoch": 0.2768915909227713, "grad_norm": 7.983036041259766, "learning_rate": 0.00016978114045259024, "loss": 0.6586, "step": 1150 }, { "epoch": 0.27713236621922593, "grad_norm": 1.2389219999313354, "learning_rate": 0.00016972526754894526, "loss": 0.4504, "step": 1151 }, { "epoch": 0.2773731415156805, "grad_norm": 4.120885848999023, "learning_rate": 0.00016966935225218055, "loss": 0.9209, "step": 1152 }, { "epoch": 0.2776139168121351, "grad_norm": 1.707640528678894, "learning_rate": 0.0001696133945962927, "loss": 0.4769, "step": 1153 }, { "epoch": 0.27785469210858965, "grad_norm": 2.347038745880127, "learning_rate": 0.00016955739461530403, "loss": 0.6969, "step": 1154 }, { "epoch": 0.27809546740504426, "grad_norm": 1.5984582901000977, "learning_rate": 0.0001695013523432628, "loss": 0.7477, "step": 1155 }, { "epoch": 0.2783362427014988, "grad_norm": 3.808624267578125, "learning_rate": 0.0001694452678142427, "loss": 0.6706, "step": 1156 }, { "epoch": 0.2785770179979534, "grad_norm": 2.3861489295959473, "learning_rate": 0.00016938914106234333, "loss": 0.492, "step": 1157 }, { "epoch": 0.278817793294408, "grad_norm": 6.34063196182251, "learning_rate": 0.00016933297212168985, "loss": 0.9194, "step": 1158 }, { "epoch": 0.2790585685908626, "grad_norm": 2.32570743560791, "learning_rate": 0.0001692767610264331, "loss": 0.3936, "step": 1159 }, { "epoch": 0.27929934388731714, "grad_norm": 2.573622226715088, "learning_rate": 0.0001692205078107496, "loss": 0.8134, "step": 1160 }, { "epoch": 0.27954011918377175, "grad_norm": 2.499985933303833, "learning_rate": 0.00016916421250884138, "loss": 0.4928, "step": 1161 }, { "epoch": 0.2797808944802263, "grad_norm": 1.9372178316116333, "learning_rate": 0.00016910787515493611, "loss": 0.6883, "step": 1162 }, { "epoch": 0.2800216697766809, "grad_norm": 1.012056589126587, "learning_rate": 0.00016905149578328702, "loss": 0.3567, "step": 1163 }, { "epoch": 0.28026244507313547, "grad_norm": 1.383881688117981, "learning_rate": 0.00016899507442817298, "loss": 0.7005, "step": 1164 }, { "epoch": 0.2805032203695901, "grad_norm": 7.843169212341309, "learning_rate": 0.00016893861112389822, "loss": 0.59, "step": 1165 }, { "epoch": 0.2807439956660447, "grad_norm": 15.78963851928711, "learning_rate": 0.00016888210590479256, "loss": 0.7168, "step": 1166 }, { "epoch": 0.28098477096249924, "grad_norm": 1.2202370166778564, "learning_rate": 0.0001688255588052113, "loss": 0.2079, "step": 1167 }, { "epoch": 0.28122554625895385, "grad_norm": 1.067835807800293, "learning_rate": 0.0001687689698595353, "loss": 0.6354, "step": 1168 }, { "epoch": 0.2814663215554084, "grad_norm": 0.6400854587554932, "learning_rate": 0.0001687123391021706, "loss": 0.307, "step": 1169 }, { "epoch": 0.281707096851863, "grad_norm": 2.6087357997894287, "learning_rate": 0.00016865566656754896, "loss": 0.4111, "step": 1170 }, { "epoch": 0.28194787214831757, "grad_norm": 1.9883902072906494, "learning_rate": 0.00016859895229012737, "loss": 0.6824, "step": 1171 }, { "epoch": 0.2821886474447722, "grad_norm": 2.6531500816345215, "learning_rate": 0.00016854219630438818, "loss": 0.745, "step": 1172 }, { "epoch": 0.28242942274122673, "grad_norm": 0.8592819571495056, "learning_rate": 0.00016848539864483926, "loss": 0.7847, "step": 1173 }, { "epoch": 0.28267019803768134, "grad_norm": 4.981196880340576, "learning_rate": 0.00016842855934601366, "loss": 0.9405, "step": 1174 }, { "epoch": 0.2829109733341359, "grad_norm": 1.9096482992172241, "learning_rate": 0.0001683716784424698, "loss": 0.8852, "step": 1175 }, { "epoch": 0.2831517486305905, "grad_norm": 7.072299003601074, "learning_rate": 0.0001683147559687914, "loss": 1.6136, "step": 1176 }, { "epoch": 0.28339252392704506, "grad_norm": 18.518299102783203, "learning_rate": 0.00016825779195958745, "loss": 0.2307, "step": 1177 }, { "epoch": 0.28363329922349967, "grad_norm": 2.7872228622436523, "learning_rate": 0.0001682007864494922, "loss": 0.6282, "step": 1178 }, { "epoch": 0.2838740745199543, "grad_norm": 1.4213825464248657, "learning_rate": 0.00016814373947316512, "loss": 0.6838, "step": 1179 }, { "epoch": 0.28411484981640883, "grad_norm": 1.1344329118728638, "learning_rate": 0.00016808665106529094, "loss": 0.4394, "step": 1180 }, { "epoch": 0.28435562511286344, "grad_norm": 1.0440508127212524, "learning_rate": 0.0001680295212605795, "loss": 0.1343, "step": 1181 }, { "epoch": 0.284596400409318, "grad_norm": 3.40962553024292, "learning_rate": 0.00016797235009376586, "loss": 0.6312, "step": 1182 }, { "epoch": 0.2848371757057726, "grad_norm": 3.0211853981018066, "learning_rate": 0.0001679151375996102, "loss": 0.6371, "step": 1183 }, { "epoch": 0.28507795100222716, "grad_norm": 8.21009635925293, "learning_rate": 0.0001678578838128979, "loss": 0.7002, "step": 1184 }, { "epoch": 0.28531872629868177, "grad_norm": 2.1480865478515625, "learning_rate": 0.00016780058876843934, "loss": 0.4914, "step": 1185 }, { "epoch": 0.2855595015951363, "grad_norm": 1.2523528337478638, "learning_rate": 0.00016774325250107006, "loss": 0.5931, "step": 1186 }, { "epoch": 0.28580027689159093, "grad_norm": 1.5123728513717651, "learning_rate": 0.00016768587504565062, "loss": 0.439, "step": 1187 }, { "epoch": 0.2860410521880455, "grad_norm": 1.9221967458724976, "learning_rate": 0.00016762845643706665, "loss": 0.6541, "step": 1188 }, { "epoch": 0.2862818274845001, "grad_norm": 4.153512477874756, "learning_rate": 0.00016757099671022883, "loss": 0.7725, "step": 1189 }, { "epoch": 0.28652260278095465, "grad_norm": 1.0292513370513916, "learning_rate": 0.00016751349590007274, "loss": 0.5082, "step": 1190 }, { "epoch": 0.28676337807740926, "grad_norm": 4.168222904205322, "learning_rate": 0.00016745595404155905, "loss": 0.5705, "step": 1191 }, { "epoch": 0.28700415337386387, "grad_norm": 1.6598914861679077, "learning_rate": 0.00016739837116967328, "loss": 0.8381, "step": 1192 }, { "epoch": 0.2872449286703184, "grad_norm": 2.8263731002807617, "learning_rate": 0.00016734074731942605, "loss": 0.7783, "step": 1193 }, { "epoch": 0.28748570396677303, "grad_norm": 1.6634050607681274, "learning_rate": 0.00016728308252585267, "loss": 0.3698, "step": 1194 }, { "epoch": 0.2877264792632276, "grad_norm": 2.690964937210083, "learning_rate": 0.00016722537682401357, "loss": 0.4771, "step": 1195 }, { "epoch": 0.2879672545596822, "grad_norm": 0.9511985778808594, "learning_rate": 0.0001671676302489939, "loss": 0.2755, "step": 1196 }, { "epoch": 0.28820802985613675, "grad_norm": 2.063718557357788, "learning_rate": 0.0001671098428359037, "loss": 0.4661, "step": 1197 }, { "epoch": 0.28844880515259136, "grad_norm": 3.1178414821624756, "learning_rate": 0.00016705201461987782, "loss": 1.1358, "step": 1198 }, { "epoch": 0.2886895804490459, "grad_norm": 1.8301066160202026, "learning_rate": 0.00016699414563607601, "loss": 0.3741, "step": 1199 }, { "epoch": 0.2889303557455005, "grad_norm": 2.6910312175750732, "learning_rate": 0.00016693623591968273, "loss": 1.0457, "step": 1200 }, { "epoch": 0.2891711310419551, "grad_norm": 1.9413840770721436, "learning_rate": 0.0001668782855059072, "loss": 0.7107, "step": 1201 }, { "epoch": 0.2894119063384097, "grad_norm": 1.9084299802780151, "learning_rate": 0.00016682029442998338, "loss": 0.9563, "step": 1202 }, { "epoch": 0.28965268163486424, "grad_norm": 6.873541831970215, "learning_rate": 0.00016676226272717, "loss": 0.8658, "step": 1203 }, { "epoch": 0.28989345693131885, "grad_norm": 2.0159761905670166, "learning_rate": 0.00016670419043275048, "loss": 0.7841, "step": 1204 }, { "epoch": 0.2901342322277734, "grad_norm": 1.8797401189804077, "learning_rate": 0.00016664607758203287, "loss": 0.7343, "step": 1205 }, { "epoch": 0.290375007524228, "grad_norm": 1.6734647750854492, "learning_rate": 0.00016658792421034996, "loss": 0.4975, "step": 1206 }, { "epoch": 0.2906157828206826, "grad_norm": 0.8860729932785034, "learning_rate": 0.00016652973035305907, "loss": 0.4253, "step": 1207 }, { "epoch": 0.2908565581171372, "grad_norm": 0.7764965295791626, "learning_rate": 0.00016647149604554227, "loss": 0.7893, "step": 1208 }, { "epoch": 0.2910973334135918, "grad_norm": 6.640602111816406, "learning_rate": 0.0001664132213232061, "loss": 0.9636, "step": 1209 }, { "epoch": 0.29133810871004634, "grad_norm": 6.02003288269043, "learning_rate": 0.00016635490622148177, "loss": 0.8415, "step": 1210 }, { "epoch": 0.29157888400650095, "grad_norm": 1.2742475271224976, "learning_rate": 0.00016629655077582487, "loss": 0.2262, "step": 1211 }, { "epoch": 0.2918196593029555, "grad_norm": 0.7330831289291382, "learning_rate": 0.0001662381550217158, "loss": 0.4596, "step": 1212 }, { "epoch": 0.2920604345994101, "grad_norm": 5.310278415679932, "learning_rate": 0.00016617971899465922, "loss": 0.4937, "step": 1213 }, { "epoch": 0.29230120989586467, "grad_norm": 3.351181983947754, "learning_rate": 0.0001661212427301844, "loss": 0.3122, "step": 1214 }, { "epoch": 0.2925419851923193, "grad_norm": 2.28200101852417, "learning_rate": 0.000166062726263845, "loss": 1.5276, "step": 1215 }, { "epoch": 0.29278276048877383, "grad_norm": 4.403338432312012, "learning_rate": 0.0001660041696312192, "loss": 0.8055, "step": 1216 }, { "epoch": 0.29302353578522844, "grad_norm": 2.3211700916290283, "learning_rate": 0.00016594557286790957, "loss": 0.715, "step": 1217 }, { "epoch": 0.293264311081683, "grad_norm": 2.3568782806396484, "learning_rate": 0.00016588693600954306, "loss": 0.4839, "step": 1218 }, { "epoch": 0.2935050863781376, "grad_norm": 3.552236795425415, "learning_rate": 0.00016582825909177099, "loss": 0.8309, "step": 1219 }, { "epoch": 0.2937458616745922, "grad_norm": 1.4845949411392212, "learning_rate": 0.0001657695421502691, "loss": 0.3576, "step": 1220 }, { "epoch": 0.29398663697104677, "grad_norm": 4.3355607986450195, "learning_rate": 0.00016571078522073737, "loss": 0.4216, "step": 1221 }, { "epoch": 0.2942274122675014, "grad_norm": 2.5869123935699463, "learning_rate": 0.0001656519883389002, "loss": 1.0778, "step": 1222 }, { "epoch": 0.29446818756395593, "grad_norm": 3.6160268783569336, "learning_rate": 0.0001655931515405062, "loss": 0.6609, "step": 1223 }, { "epoch": 0.29470896286041054, "grad_norm": 2.8097994327545166, "learning_rate": 0.00016553427486132828, "loss": 0.6801, "step": 1224 }, { "epoch": 0.2949497381568651, "grad_norm": 2.5700998306274414, "learning_rate": 0.00016547535833716362, "loss": 0.3883, "step": 1225 }, { "epoch": 0.2951905134533197, "grad_norm": 0.5326368808746338, "learning_rate": 0.00016541640200383356, "loss": 0.2599, "step": 1226 }, { "epoch": 0.29543128874977426, "grad_norm": 4.097855567932129, "learning_rate": 0.00016535740589718366, "loss": 1.4335, "step": 1227 }, { "epoch": 0.29567206404622887, "grad_norm": 1.7571992874145508, "learning_rate": 0.00016529837005308375, "loss": 0.7812, "step": 1228 }, { "epoch": 0.2959128393426834, "grad_norm": 2.6337194442749023, "learning_rate": 0.00016523929450742774, "loss": 0.3936, "step": 1229 }, { "epoch": 0.29615361463913803, "grad_norm": 0.9062210917472839, "learning_rate": 0.00016518017929613367, "loss": 0.3914, "step": 1230 }, { "epoch": 0.2963943899355926, "grad_norm": 0.581713080406189, "learning_rate": 0.00016512102445514375, "loss": 0.8761, "step": 1231 }, { "epoch": 0.2966351652320472, "grad_norm": 1.9768112897872925, "learning_rate": 0.0001650618300204242, "loss": 0.5862, "step": 1232 }, { "epoch": 0.2968759405285018, "grad_norm": 1.7873097658157349, "learning_rate": 0.00016500259602796546, "loss": 0.1979, "step": 1233 }, { "epoch": 0.29711671582495636, "grad_norm": 2.351323366165161, "learning_rate": 0.00016494332251378187, "loss": 0.6285, "step": 1234 }, { "epoch": 0.29735749112141097, "grad_norm": 2.609557628631592, "learning_rate": 0.00016488400951391186, "loss": 0.5139, "step": 1235 }, { "epoch": 0.2975982664178655, "grad_norm": 2.986835241317749, "learning_rate": 0.0001648246570644179, "loss": 0.3242, "step": 1236 }, { "epoch": 0.29783904171432013, "grad_norm": 1.083709716796875, "learning_rate": 0.00016476526520138636, "loss": 0.7125, "step": 1237 }, { "epoch": 0.2980798170107747, "grad_norm": 4.175523281097412, "learning_rate": 0.0001647058339609277, "loss": 0.6407, "step": 1238 }, { "epoch": 0.2983205923072293, "grad_norm": 1.0320210456848145, "learning_rate": 0.00016464636337917618, "loss": 0.4267, "step": 1239 }, { "epoch": 0.29856136760368385, "grad_norm": 1.7650171518325806, "learning_rate": 0.0001645868534922901, "loss": 0.8656, "step": 1240 }, { "epoch": 0.29880214290013846, "grad_norm": 0.3890477418899536, "learning_rate": 0.00016452730433645153, "loss": 0.4355, "step": 1241 }, { "epoch": 0.299042918196593, "grad_norm": 3.933539390563965, "learning_rate": 0.0001644677159478666, "loss": 0.7368, "step": 1242 }, { "epoch": 0.2992836934930476, "grad_norm": 3.1213431358337402, "learning_rate": 0.00016440808836276508, "loss": 0.5998, "step": 1243 }, { "epoch": 0.2995244687895022, "grad_norm": 2.534736156463623, "learning_rate": 0.00016434842161740075, "loss": 0.6373, "step": 1244 }, { "epoch": 0.2997652440859568, "grad_norm": 1.6457316875457764, "learning_rate": 0.0001642887157480511, "loss": 0.8746, "step": 1245 }, { "epoch": 0.30000601938241134, "grad_norm": 2.4500882625579834, "learning_rate": 0.0001642289707910174, "loss": 0.3703, "step": 1246 }, { "epoch": 0.30024679467886595, "grad_norm": 6.751053810119629, "learning_rate": 0.0001641691867826248, "loss": 0.9699, "step": 1247 }, { "epoch": 0.30048756997532056, "grad_norm": 2.2047617435455322, "learning_rate": 0.000164109363759222, "loss": 0.7187, "step": 1248 }, { "epoch": 0.3007283452717751, "grad_norm": 1.771125316619873, "learning_rate": 0.00016404950175718166, "loss": 0.4576, "step": 1249 }, { "epoch": 0.3009691205682297, "grad_norm": 2.1661245822906494, "learning_rate": 0.0001639896008128999, "loss": 0.6159, "step": 1250 }, { "epoch": 0.3012098958646843, "grad_norm": 4.253533363342285, "learning_rate": 0.0001639296609627967, "loss": 0.7709, "step": 1251 }, { "epoch": 0.3014506711611389, "grad_norm": 3.3934977054595947, "learning_rate": 0.00016386968224331558, "loss": 1.3822, "step": 1252 }, { "epoch": 0.30169144645759344, "grad_norm": 4.271642684936523, "learning_rate": 0.00016380966469092378, "loss": 1.5999, "step": 1253 }, { "epoch": 0.30193222175404805, "grad_norm": 1.2420214414596558, "learning_rate": 0.00016374960834211204, "loss": 0.3992, "step": 1254 }, { "epoch": 0.3021729970505026, "grad_norm": 1.2237993478775024, "learning_rate": 0.00016368951323339484, "loss": 0.2898, "step": 1255 }, { "epoch": 0.3024137723469572, "grad_norm": 1.4050495624542236, "learning_rate": 0.00016362937940131008, "loss": 0.2777, "step": 1256 }, { "epoch": 0.30265454764341176, "grad_norm": 1.4772244691848755, "learning_rate": 0.0001635692068824193, "loss": 0.9147, "step": 1257 }, { "epoch": 0.3028953229398664, "grad_norm": 4.798654556274414, "learning_rate": 0.0001635089957133075, "loss": 0.3208, "step": 1258 }, { "epoch": 0.30313609823632093, "grad_norm": 2.012327194213867, "learning_rate": 0.0001634487459305832, "loss": 0.691, "step": 1259 }, { "epoch": 0.30337687353277554, "grad_norm": 7.864597797393799, "learning_rate": 0.00016338845757087847, "loss": 0.7949, "step": 1260 }, { "epoch": 0.30361764882923015, "grad_norm": 1.5631287097930908, "learning_rate": 0.0001633281306708487, "loss": 0.8655, "step": 1261 }, { "epoch": 0.3038584241256847, "grad_norm": 3.419724225997925, "learning_rate": 0.0001632677652671728, "loss": 0.7764, "step": 1262 }, { "epoch": 0.3040991994221393, "grad_norm": 4.057196617126465, "learning_rate": 0.00016320736139655305, "loss": 0.4629, "step": 1263 }, { "epoch": 0.30433997471859386, "grad_norm": 2.437304735183716, "learning_rate": 0.0001631469190957152, "loss": 0.6043, "step": 1264 }, { "epoch": 0.3045807500150485, "grad_norm": 3.452397108078003, "learning_rate": 0.00016308643840140828, "loss": 0.9057, "step": 1265 }, { "epoch": 0.304821525311503, "grad_norm": 2.3599209785461426, "learning_rate": 0.00016302591935040463, "loss": 0.4477, "step": 1266 }, { "epoch": 0.30506230060795764, "grad_norm": 2.7127840518951416, "learning_rate": 0.0001629653619795, "loss": 0.4695, "step": 1267 }, { "epoch": 0.3053030759044122, "grad_norm": 1.4742056131362915, "learning_rate": 0.00016290476632551347, "loss": 0.7507, "step": 1268 }, { "epoch": 0.3055438512008668, "grad_norm": 1.4544012546539307, "learning_rate": 0.0001628441324252873, "loss": 0.5332, "step": 1269 }, { "epoch": 0.30578462649732135, "grad_norm": 3.388953685760498, "learning_rate": 0.000162783460315687, "loss": 0.9343, "step": 1270 }, { "epoch": 0.30602540179377596, "grad_norm": 3.447437047958374, "learning_rate": 0.00016272275003360135, "loss": 0.7331, "step": 1271 }, { "epoch": 0.3062661770902305, "grad_norm": 2.9696388244628906, "learning_rate": 0.0001626620016159424, "loss": 0.2491, "step": 1272 }, { "epoch": 0.3065069523866851, "grad_norm": 0.8574854135513306, "learning_rate": 0.0001626012150996453, "loss": 0.5318, "step": 1273 }, { "epoch": 0.30674772768313974, "grad_norm": 2.6496622562408447, "learning_rate": 0.00016254039052166833, "loss": 0.725, "step": 1274 }, { "epoch": 0.3069885029795943, "grad_norm": 1.233094334602356, "learning_rate": 0.00016247952791899307, "loss": 0.7075, "step": 1275 }, { "epoch": 0.3072292782760489, "grad_norm": 1.2451717853546143, "learning_rate": 0.00016241862732862403, "loss": 0.8067, "step": 1276 }, { "epoch": 0.30747005357250345, "grad_norm": 2.2256247997283936, "learning_rate": 0.00016235768878758897, "loss": 0.389, "step": 1277 }, { "epoch": 0.30771082886895806, "grad_norm": 2.2310009002685547, "learning_rate": 0.00016229671233293863, "loss": 1.3423, "step": 1278 }, { "epoch": 0.3079516041654126, "grad_norm": 2.3196895122528076, "learning_rate": 0.0001622356980017468, "loss": 0.5485, "step": 1279 }, { "epoch": 0.3081923794618672, "grad_norm": 6.365363121032715, "learning_rate": 0.0001621746458311104, "loss": 0.6047, "step": 1280 }, { "epoch": 0.3084331547583218, "grad_norm": 2.375135898590088, "learning_rate": 0.00016211355585814925, "loss": 1.0309, "step": 1281 }, { "epoch": 0.3086739300547764, "grad_norm": 3.856171131134033, "learning_rate": 0.00016205242812000617, "loss": 0.5747, "step": 1282 }, { "epoch": 0.30891470535123094, "grad_norm": 1.3465646505355835, "learning_rate": 0.00016199126265384702, "loss": 0.6992, "step": 1283 }, { "epoch": 0.30915548064768555, "grad_norm": 3.8031649589538574, "learning_rate": 0.0001619300594968605, "loss": 0.6855, "step": 1284 }, { "epoch": 0.3093962559441401, "grad_norm": 6.7793169021606445, "learning_rate": 0.00016186881868625826, "loss": 0.7541, "step": 1285 }, { "epoch": 0.3096370312405947, "grad_norm": 4.37679386138916, "learning_rate": 0.00016180754025927488, "loss": 0.7391, "step": 1286 }, { "epoch": 0.30987780653704927, "grad_norm": 2.1541247367858887, "learning_rate": 0.00016174622425316776, "loss": 0.6678, "step": 1287 }, { "epoch": 0.3101185818335039, "grad_norm": 1.5806964635849, "learning_rate": 0.00016168487070521717, "loss": 0.6008, "step": 1288 }, { "epoch": 0.3103593571299585, "grad_norm": 2.3984477519989014, "learning_rate": 0.00016162347965272624, "loss": 0.431, "step": 1289 }, { "epoch": 0.31060013242641304, "grad_norm": 5.19956111907959, "learning_rate": 0.00016156205113302083, "loss": 1.1046, "step": 1290 }, { "epoch": 0.31084090772286765, "grad_norm": 1.4966849088668823, "learning_rate": 0.00016150058518344963, "loss": 0.4343, "step": 1291 }, { "epoch": 0.3110816830193222, "grad_norm": 1.530099868774414, "learning_rate": 0.00016143908184138408, "loss": 0.3569, "step": 1292 }, { "epoch": 0.3113224583157768, "grad_norm": 6.92020845413208, "learning_rate": 0.00016137754114421834, "loss": 0.4397, "step": 1293 }, { "epoch": 0.31156323361223137, "grad_norm": 4.40862512588501, "learning_rate": 0.0001613159631293693, "loss": 0.7515, "step": 1294 }, { "epoch": 0.311804008908686, "grad_norm": 6.846129894256592, "learning_rate": 0.00016125434783427654, "loss": 0.9461, "step": 1295 }, { "epoch": 0.31204478420514054, "grad_norm": 6.169475078582764, "learning_rate": 0.0001611926952964023, "loss": 1.5009, "step": 1296 }, { "epoch": 0.31228555950159514, "grad_norm": 2.2589635848999023, "learning_rate": 0.0001611310055532314, "loss": 0.6197, "step": 1297 }, { "epoch": 0.3125263347980497, "grad_norm": 5.03438663482666, "learning_rate": 0.00016106927864227143, "loss": 1.2404, "step": 1298 }, { "epoch": 0.3127671100945043, "grad_norm": 2.119262456893921, "learning_rate": 0.00016100751460105243, "loss": 0.3959, "step": 1299 }, { "epoch": 0.31300788539095886, "grad_norm": 1.556208610534668, "learning_rate": 0.00016094571346712716, "loss": 0.3569, "step": 1300 }, { "epoch": 0.31324866068741347, "grad_norm": 3.7477822303771973, "learning_rate": 0.0001608838752780707, "loss": 0.9697, "step": 1301 }, { "epoch": 0.3134894359838681, "grad_norm": 1.226062297821045, "learning_rate": 0.000160822000071481, "loss": 0.2165, "step": 1302 }, { "epoch": 0.31373021128032264, "grad_norm": 1.5284736156463623, "learning_rate": 0.00016076008788497816, "loss": 0.3499, "step": 1303 }, { "epoch": 0.31397098657677724, "grad_norm": 1.2165570259094238, "learning_rate": 0.00016069813875620498, "loss": 0.3322, "step": 1304 }, { "epoch": 0.3142117618732318, "grad_norm": 2.2660257816314697, "learning_rate": 0.00016063615272282673, "loss": 0.9303, "step": 1305 }, { "epoch": 0.3144525371696864, "grad_norm": 3.506263494491577, "learning_rate": 0.00016057412982253098, "loss": 0.3677, "step": 1306 }, { "epoch": 0.31469331246614096, "grad_norm": 2.1276533603668213, "learning_rate": 0.00016051207009302781, "loss": 0.8432, "step": 1307 }, { "epoch": 0.31493408776259557, "grad_norm": 4.875666618347168, "learning_rate": 0.00016044997357204973, "loss": 0.4637, "step": 1308 }, { "epoch": 0.3151748630590501, "grad_norm": 0.7586674690246582, "learning_rate": 0.0001603878402973515, "loss": 0.3961, "step": 1309 }, { "epoch": 0.31541563835550473, "grad_norm": 1.5257422924041748, "learning_rate": 0.0001603256703067103, "loss": 0.6829, "step": 1310 }, { "epoch": 0.3156564136519593, "grad_norm": 2.0728249549865723, "learning_rate": 0.00016026346363792567, "loss": 0.5977, "step": 1311 }, { "epoch": 0.3158971889484139, "grad_norm": 6.4057936668396, "learning_rate": 0.00016020122032881932, "loss": 1.2481, "step": 1312 }, { "epoch": 0.31613796424486845, "grad_norm": 0.8619070649147034, "learning_rate": 0.00016013894041723542, "loss": 1.0521, "step": 1313 }, { "epoch": 0.31637873954132306, "grad_norm": 2.2395753860473633, "learning_rate": 0.00016007662394104024, "loss": 0.4075, "step": 1314 }, { "epoch": 0.31661951483777767, "grad_norm": 1.7755603790283203, "learning_rate": 0.00016001427093812235, "loss": 0.6441, "step": 1315 }, { "epoch": 0.3168602901342322, "grad_norm": 0.9650968909263611, "learning_rate": 0.0001599518814463925, "loss": 0.1726, "step": 1316 }, { "epoch": 0.31710106543068683, "grad_norm": 1.7282532453536987, "learning_rate": 0.0001598894555037837, "loss": 0.2156, "step": 1317 }, { "epoch": 0.3173418407271414, "grad_norm": 1.8310699462890625, "learning_rate": 0.000159826993148251, "loss": 0.4061, "step": 1318 }, { "epoch": 0.317582616023596, "grad_norm": 2.200747489929199, "learning_rate": 0.00015976449441777163, "loss": 0.3275, "step": 1319 }, { "epoch": 0.31782339132005055, "grad_norm": 3.546372175216675, "learning_rate": 0.00015970195935034506, "loss": 1.1699, "step": 1320 }, { "epoch": 0.31806416661650516, "grad_norm": 4.240285873413086, "learning_rate": 0.00015963938798399267, "loss": 1.0385, "step": 1321 }, { "epoch": 0.3183049419129597, "grad_norm": 1.8230444192886353, "learning_rate": 0.00015957678035675806, "loss": 0.8566, "step": 1322 }, { "epoch": 0.3185457172094143, "grad_norm": 1.3038523197174072, "learning_rate": 0.00015951413650670669, "loss": 0.545, "step": 1323 }, { "epoch": 0.3187864925058689, "grad_norm": 2.877883195877075, "learning_rate": 0.00015945145647192627, "loss": 0.522, "step": 1324 }, { "epoch": 0.3190272678023235, "grad_norm": 2.4238524436950684, "learning_rate": 0.0001593887402905264, "loss": 1.1966, "step": 1325 }, { "epoch": 0.31926804309877804, "grad_norm": 6.8975982666015625, "learning_rate": 0.0001593259880006386, "loss": 0.3491, "step": 1326 }, { "epoch": 0.31950881839523265, "grad_norm": 2.4402458667755127, "learning_rate": 0.0001592631996404164, "loss": 0.8293, "step": 1327 }, { "epoch": 0.3197495936916872, "grad_norm": 2.575347900390625, "learning_rate": 0.00015920037524803538, "loss": 0.9677, "step": 1328 }, { "epoch": 0.3199903689881418, "grad_norm": 1.8966193199157715, "learning_rate": 0.00015913751486169275, "loss": 0.4061, "step": 1329 }, { "epoch": 0.3202311442845964, "grad_norm": 3.9115090370178223, "learning_rate": 0.0001590746185196079, "loss": 0.8245, "step": 1330 }, { "epoch": 0.320471919581051, "grad_norm": 2.3119075298309326, "learning_rate": 0.00015901168626002184, "loss": 0.8401, "step": 1331 }, { "epoch": 0.3207126948775056, "grad_norm": 3.0290722846984863, "learning_rate": 0.00015894871812119764, "loss": 0.2644, "step": 1332 }, { "epoch": 0.32095347017396014, "grad_norm": 1.3376718759536743, "learning_rate": 0.00015888571414141996, "loss": 0.7519, "step": 1333 }, { "epoch": 0.32119424547041475, "grad_norm": 3.897224187850952, "learning_rate": 0.00015882267435899543, "loss": 0.5062, "step": 1334 }, { "epoch": 0.3214350207668693, "grad_norm": 3.285747766494751, "learning_rate": 0.00015875959881225238, "loss": 0.6907, "step": 1335 }, { "epoch": 0.3216757960633239, "grad_norm": 0.9387348294258118, "learning_rate": 0.00015869648753954083, "loss": 0.3543, "step": 1336 }, { "epoch": 0.32191657135977847, "grad_norm": 0.8521896600723267, "learning_rate": 0.00015863334057923263, "loss": 0.6814, "step": 1337 }, { "epoch": 0.3221573466562331, "grad_norm": 3.792236328125, "learning_rate": 0.00015857015796972126, "loss": 0.361, "step": 1338 }, { "epoch": 0.32239812195268763, "grad_norm": 1.9048930406570435, "learning_rate": 0.00015850693974942188, "loss": 0.841, "step": 1339 }, { "epoch": 0.32263889724914224, "grad_norm": 4.0882744789123535, "learning_rate": 0.00015844368595677128, "loss": 1.4357, "step": 1340 }, { "epoch": 0.3228796725455968, "grad_norm": 2.6341850757598877, "learning_rate": 0.000158380396630228, "loss": 0.7541, "step": 1341 }, { "epoch": 0.3231204478420514, "grad_norm": 1.2838685512542725, "learning_rate": 0.000158317071808272, "loss": 0.3621, "step": 1342 }, { "epoch": 0.323361223138506, "grad_norm": 2.2688121795654297, "learning_rate": 0.000158253711529405, "loss": 0.4616, "step": 1343 }, { "epoch": 0.32360199843496057, "grad_norm": 1.7393488883972168, "learning_rate": 0.00015819031583215007, "loss": 0.9108, "step": 1344 }, { "epoch": 0.3238427737314152, "grad_norm": 2.279599666595459, "learning_rate": 0.00015812688475505201, "loss": 0.4704, "step": 1345 }, { "epoch": 0.32408354902786973, "grad_norm": 1.3711464405059814, "learning_rate": 0.0001580634183366771, "loss": 0.5435, "step": 1346 }, { "epoch": 0.32432432432432434, "grad_norm": 2.167222499847412, "learning_rate": 0.00015799991661561303, "loss": 0.2528, "step": 1347 }, { "epoch": 0.3245650996207789, "grad_norm": 6.101914405822754, "learning_rate": 0.00015793637963046897, "loss": 1.3281, "step": 1348 }, { "epoch": 0.3248058749172335, "grad_norm": 2.9374330043792725, "learning_rate": 0.00015787280741987557, "loss": 0.5171, "step": 1349 }, { "epoch": 0.32504665021368806, "grad_norm": 1.7808711528778076, "learning_rate": 0.00015780920002248484, "loss": 0.6773, "step": 1350 }, { "epoch": 0.32528742551014267, "grad_norm": 2.1058807373046875, "learning_rate": 0.00015774555747697025, "loss": 0.5836, "step": 1351 }, { "epoch": 0.3255282008065972, "grad_norm": 3.5313520431518555, "learning_rate": 0.00015768187982202666, "loss": 0.6518, "step": 1352 }, { "epoch": 0.32576897610305183, "grad_norm": 4.409549236297607, "learning_rate": 0.00015761816709637015, "loss": 0.8503, "step": 1353 }, { "epoch": 0.3260097513995064, "grad_norm": 1.0890048742294312, "learning_rate": 0.00015755441933873823, "loss": 0.2637, "step": 1354 }, { "epoch": 0.326250526695961, "grad_norm": 1.0471165180206299, "learning_rate": 0.00015749063658788967, "loss": 0.4454, "step": 1355 }, { "epoch": 0.3264913019924156, "grad_norm": 1.7348659038543701, "learning_rate": 0.00015742681888260455, "loss": 0.977, "step": 1356 }, { "epoch": 0.32673207728887016, "grad_norm": 2.9363324642181396, "learning_rate": 0.0001573629662616842, "loss": 0.5051, "step": 1357 }, { "epoch": 0.32697285258532477, "grad_norm": 2.2017180919647217, "learning_rate": 0.00015729907876395105, "loss": 0.6374, "step": 1358 }, { "epoch": 0.3272136278817793, "grad_norm": 1.8804614543914795, "learning_rate": 0.00015723515642824894, "loss": 0.5544, "step": 1359 }, { "epoch": 0.32745440317823393, "grad_norm": 1.586624264717102, "learning_rate": 0.00015717119929344278, "loss": 1.0256, "step": 1360 }, { "epoch": 0.3276951784746885, "grad_norm": 3.861217737197876, "learning_rate": 0.00015710720739841864, "loss": 0.6251, "step": 1361 }, { "epoch": 0.3279359537711431, "grad_norm": 1.4513386487960815, "learning_rate": 0.00015704318078208374, "loss": 0.7021, "step": 1362 }, { "epoch": 0.32817672906759765, "grad_norm": 1.8319506645202637, "learning_rate": 0.00015697911948336641, "loss": 0.5171, "step": 1363 }, { "epoch": 0.32841750436405226, "grad_norm": 1.2202706336975098, "learning_rate": 0.00015691502354121605, "loss": 0.3055, "step": 1364 }, { "epoch": 0.3286582796605068, "grad_norm": 3.2873902320861816, "learning_rate": 0.00015685089299460317, "loss": 0.9132, "step": 1365 }, { "epoch": 0.3288990549569614, "grad_norm": 2.2419869899749756, "learning_rate": 0.00015678672788251922, "loss": 0.5913, "step": 1366 }, { "epoch": 0.329139830253416, "grad_norm": 5.597873210906982, "learning_rate": 0.0001567225282439768, "loss": 0.8836, "step": 1367 }, { "epoch": 0.3293806055498706, "grad_norm": 1.8877670764923096, "learning_rate": 0.0001566582941180094, "loss": 0.486, "step": 1368 }, { "epoch": 0.32962138084632514, "grad_norm": 3.7749109268188477, "learning_rate": 0.00015659402554367153, "loss": 0.8683, "step": 1369 }, { "epoch": 0.32986215614277975, "grad_norm": 1.6134521961212158, "learning_rate": 0.00015652972256003864, "loss": 0.7984, "step": 1370 }, { "epoch": 0.33010293143923436, "grad_norm": 2.474909782409668, "learning_rate": 0.00015646538520620705, "loss": 0.5438, "step": 1371 }, { "epoch": 0.3303437067356889, "grad_norm": 4.5085368156433105, "learning_rate": 0.00015640101352129402, "loss": 0.8213, "step": 1372 }, { "epoch": 0.3305844820321435, "grad_norm": 1.169089913368225, "learning_rate": 0.00015633660754443772, "loss": 0.2603, "step": 1373 }, { "epoch": 0.3308252573285981, "grad_norm": 1.1745972633361816, "learning_rate": 0.0001562721673147971, "loss": 0.3671, "step": 1374 }, { "epoch": 0.3310660326250527, "grad_norm": 2.545999765396118, "learning_rate": 0.00015620769287155197, "loss": 0.8987, "step": 1375 }, { "epoch": 0.33130680792150724, "grad_norm": 3.68367338180542, "learning_rate": 0.00015614318425390296, "loss": 0.7555, "step": 1376 }, { "epoch": 0.33154758321796185, "grad_norm": 1.6733169555664062, "learning_rate": 0.0001560786415010714, "loss": 0.5136, "step": 1377 }, { "epoch": 0.3317883585144164, "grad_norm": 6.26981258392334, "learning_rate": 0.00015601406465229947, "loss": 0.7942, "step": 1378 }, { "epoch": 0.332029133810871, "grad_norm": 2.717362880706787, "learning_rate": 0.00015594945374685002, "loss": 0.7386, "step": 1379 }, { "epoch": 0.33226990910732557, "grad_norm": 1.8613269329071045, "learning_rate": 0.00015588480882400662, "loss": 0.5929, "step": 1380 }, { "epoch": 0.3325106844037802, "grad_norm": 4.205772876739502, "learning_rate": 0.0001558201299230736, "loss": 0.6058, "step": 1381 }, { "epoch": 0.33275145970023473, "grad_norm": 3.043046474456787, "learning_rate": 0.0001557554170833758, "loss": 0.3609, "step": 1382 }, { "epoch": 0.33299223499668934, "grad_norm": 2.67464542388916, "learning_rate": 0.00015569067034425878, "loss": 0.9453, "step": 1383 }, { "epoch": 0.33323301029314395, "grad_norm": 1.8980488777160645, "learning_rate": 0.00015562588974508872, "loss": 0.8145, "step": 1384 }, { "epoch": 0.3334737855895985, "grad_norm": 1.8081344366073608, "learning_rate": 0.00015556107532525238, "loss": 0.5141, "step": 1385 }, { "epoch": 0.3337145608860531, "grad_norm": 2.477198362350464, "learning_rate": 0.00015549622712415702, "loss": 0.8897, "step": 1386 }, { "epoch": 0.33395533618250767, "grad_norm": 2.74221134185791, "learning_rate": 0.0001554313451812306, "loss": 0.8277, "step": 1387 }, { "epoch": 0.3341961114789623, "grad_norm": 1.0981510877609253, "learning_rate": 0.0001553664295359214, "loss": 1.2254, "step": 1388 }, { "epoch": 0.33443688677541683, "grad_norm": 1.967882752418518, "learning_rate": 0.0001553014802276983, "loss": 0.8044, "step": 1389 }, { "epoch": 0.33467766207187144, "grad_norm": 2.7293615341186523, "learning_rate": 0.0001552364972960506, "loss": 0.3669, "step": 1390 }, { "epoch": 0.334918437368326, "grad_norm": 0.9830564260482788, "learning_rate": 0.00015517148078048808, "loss": 0.0667, "step": 1391 }, { "epoch": 0.3351592126647806, "grad_norm": 2.215790033340454, "learning_rate": 0.00015510643072054098, "loss": 0.7652, "step": 1392 }, { "epoch": 0.33539998796123516, "grad_norm": 3.3328158855438232, "learning_rate": 0.00015504134715575986, "loss": 0.7612, "step": 1393 }, { "epoch": 0.33564076325768977, "grad_norm": 0.497371107339859, "learning_rate": 0.00015497623012571566, "loss": 0.5093, "step": 1394 }, { "epoch": 0.3358815385541443, "grad_norm": 3.333343744277954, "learning_rate": 0.00015491107966999964, "loss": 1.1697, "step": 1395 }, { "epoch": 0.33612231385059893, "grad_norm": 1.7939079999923706, "learning_rate": 0.00015484589582822348, "loss": 0.8186, "step": 1396 }, { "epoch": 0.33636308914705354, "grad_norm": 0.5831475257873535, "learning_rate": 0.00015478067864001908, "loss": 0.0296, "step": 1397 }, { "epoch": 0.3366038644435081, "grad_norm": 1.5100713968276978, "learning_rate": 0.00015471542814503867, "loss": 0.7465, "step": 1398 }, { "epoch": 0.3368446397399627, "grad_norm": 3.8856029510498047, "learning_rate": 0.00015465014438295467, "loss": 0.5473, "step": 1399 }, { "epoch": 0.33708541503641726, "grad_norm": 1.1712760925292969, "learning_rate": 0.00015458482739345974, "loss": 0.3689, "step": 1400 }, { "epoch": 0.33732619033287187, "grad_norm": 4.496668338775635, "learning_rate": 0.00015451947721626676, "loss": 1.0972, "step": 1401 }, { "epoch": 0.3375669656293264, "grad_norm": 5.929965496063232, "learning_rate": 0.00015445409389110883, "loss": 0.6352, "step": 1402 }, { "epoch": 0.33780774092578103, "grad_norm": 1.9079606533050537, "learning_rate": 0.00015438867745773912, "loss": 0.5129, "step": 1403 }, { "epoch": 0.3380485162222356, "grad_norm": 3.6617226600646973, "learning_rate": 0.00015432322795593098, "loss": 0.4049, "step": 1404 }, { "epoch": 0.3382892915186902, "grad_norm": 5.218686580657959, "learning_rate": 0.00015425774542547784, "loss": 0.3238, "step": 1405 }, { "epoch": 0.33853006681514475, "grad_norm": 1.2463502883911133, "learning_rate": 0.00015419222990619322, "loss": 0.4756, "step": 1406 }, { "epoch": 0.33877084211159936, "grad_norm": 3.0786678791046143, "learning_rate": 0.00015412668143791075, "loss": 0.8815, "step": 1407 }, { "epoch": 0.3390116174080539, "grad_norm": 2.135958194732666, "learning_rate": 0.000154061100060484, "loss": 0.6417, "step": 1408 }, { "epoch": 0.3392523927045085, "grad_norm": 3.0606963634490967, "learning_rate": 0.00015399548581378664, "loss": 0.573, "step": 1409 }, { "epoch": 0.3394931680009631, "grad_norm": 1.5275843143463135, "learning_rate": 0.00015392983873771223, "loss": 0.5197, "step": 1410 }, { "epoch": 0.3397339432974177, "grad_norm": 2.3803906440734863, "learning_rate": 0.00015386415887217437, "loss": 0.543, "step": 1411 }, { "epoch": 0.3399747185938723, "grad_norm": 5.230526924133301, "learning_rate": 0.00015379844625710654, "loss": 0.5216, "step": 1412 }, { "epoch": 0.34021549389032685, "grad_norm": 1.887787938117981, "learning_rate": 0.0001537327009324622, "loss": 0.7004, "step": 1413 }, { "epoch": 0.34045626918678146, "grad_norm": 3.1152963638305664, "learning_rate": 0.0001536669229382146, "loss": 0.65, "step": 1414 }, { "epoch": 0.340697044483236, "grad_norm": 4.267107009887695, "learning_rate": 0.00015360111231435693, "loss": 0.7265, "step": 1415 }, { "epoch": 0.3409378197796906, "grad_norm": 1.1614614725112915, "learning_rate": 0.0001535352691009023, "loss": 0.443, "step": 1416 }, { "epoch": 0.3411785950761452, "grad_norm": 2.7194442749023438, "learning_rate": 0.00015346939333788336, "loss": 0.93, "step": 1417 }, { "epoch": 0.3414193703725998, "grad_norm": 1.5683730840682983, "learning_rate": 0.00015340348506535283, "loss": 0.665, "step": 1418 }, { "epoch": 0.34166014566905434, "grad_norm": 0.9245167970657349, "learning_rate": 0.00015333754432338302, "loss": 0.3983, "step": 1419 }, { "epoch": 0.34190092096550895, "grad_norm": 3.776094913482666, "learning_rate": 0.00015327157115206614, "loss": 0.6996, "step": 1420 }, { "epoch": 0.3421416962619635, "grad_norm": 3.2278683185577393, "learning_rate": 0.00015320556559151398, "loss": 0.614, "step": 1421 }, { "epoch": 0.3423824715584181, "grad_norm": 1.4512388706207275, "learning_rate": 0.00015313952768185803, "loss": 0.8104, "step": 1422 }, { "epoch": 0.34262324685487267, "grad_norm": 1.858079195022583, "learning_rate": 0.00015307345746324954, "loss": 0.8088, "step": 1423 }, { "epoch": 0.3428640221513273, "grad_norm": 0.6770870685577393, "learning_rate": 0.00015300735497585934, "loss": 0.6674, "step": 1424 }, { "epoch": 0.3431047974477819, "grad_norm": 1.5875935554504395, "learning_rate": 0.00015294122025987788, "loss": 0.5163, "step": 1425 }, { "epoch": 0.34334557274423644, "grad_norm": 1.7607767581939697, "learning_rate": 0.00015287505335551525, "loss": 0.5005, "step": 1426 }, { "epoch": 0.34358634804069105, "grad_norm": 4.621982574462891, "learning_rate": 0.000152808854303001, "loss": 0.7541, "step": 1427 }, { "epoch": 0.3438271233371456, "grad_norm": 3.1218035221099854, "learning_rate": 0.00015274262314258442, "loss": 0.5221, "step": 1428 }, { "epoch": 0.3440678986336002, "grad_norm": 4.2029924392700195, "learning_rate": 0.00015267635991453408, "loss": 0.6852, "step": 1429 }, { "epoch": 0.34430867393005476, "grad_norm": 5.702292442321777, "learning_rate": 0.00015261006465913828, "loss": 0.7622, "step": 1430 }, { "epoch": 0.3445494492265094, "grad_norm": 3.05202054977417, "learning_rate": 0.00015254373741670457, "loss": 0.6527, "step": 1431 }, { "epoch": 0.34479022452296393, "grad_norm": 3.4306201934814453, "learning_rate": 0.00015247737822756018, "loss": 1.2398, "step": 1432 }, { "epoch": 0.34503099981941854, "grad_norm": 2.056917667388916, "learning_rate": 0.0001524109871320516, "loss": 0.3902, "step": 1433 }, { "epoch": 0.3452717751158731, "grad_norm": 1.1819590330123901, "learning_rate": 0.00015234456417054476, "loss": 0.0903, "step": 1434 }, { "epoch": 0.3455125504123277, "grad_norm": 1.0845695734024048, "learning_rate": 0.00015227810938342492, "loss": 0.3635, "step": 1435 }, { "epoch": 0.34575332570878226, "grad_norm": 2.537416458129883, "learning_rate": 0.00015221162281109683, "loss": 0.4821, "step": 1436 }, { "epoch": 0.34599410100523686, "grad_norm": 1.1138862371444702, "learning_rate": 0.00015214510449398442, "loss": 0.5671, "step": 1437 }, { "epoch": 0.3462348763016914, "grad_norm": 2.424607276916504, "learning_rate": 0.00015207855447253103, "loss": 0.8349, "step": 1438 }, { "epoch": 0.34647565159814603, "grad_norm": 5.5035176277160645, "learning_rate": 0.00015201197278719915, "loss": 0.9117, "step": 1439 }, { "epoch": 0.34671642689460064, "grad_norm": 2.757199287414551, "learning_rate": 0.00015194535947847063, "loss": 0.4329, "step": 1440 }, { "epoch": 0.3469572021910552, "grad_norm": 6.263975143432617, "learning_rate": 0.00015187871458684655, "loss": 0.7671, "step": 1441 }, { "epoch": 0.3471979774875098, "grad_norm": 2.1420156955718994, "learning_rate": 0.00015181203815284707, "loss": 0.5561, "step": 1442 }, { "epoch": 0.34743875278396436, "grad_norm": 2.368563175201416, "learning_rate": 0.00015174533021701167, "loss": 0.479, "step": 1443 }, { "epoch": 0.34767952808041896, "grad_norm": 20.131282806396484, "learning_rate": 0.00015167859081989895, "loss": 0.9437, "step": 1444 }, { "epoch": 0.3479203033768735, "grad_norm": 1.1026864051818848, "learning_rate": 0.00015161182000208653, "loss": 0.0633, "step": 1445 }, { "epoch": 0.34816107867332813, "grad_norm": 1.3895201683044434, "learning_rate": 0.0001515450178041713, "loss": 0.8124, "step": 1446 }, { "epoch": 0.3484018539697827, "grad_norm": 3.4181928634643555, "learning_rate": 0.0001514781842667691, "loss": 0.8081, "step": 1447 }, { "epoch": 0.3486426292662373, "grad_norm": 3.2324140071868896, "learning_rate": 0.0001514113194305149, "loss": 1.6239, "step": 1448 }, { "epoch": 0.34888340456269185, "grad_norm": 0.9572200775146484, "learning_rate": 0.00015134442333606264, "loss": 0.7945, "step": 1449 }, { "epoch": 0.34912417985914646, "grad_norm": 3.1057350635528564, "learning_rate": 0.00015127749602408529, "loss": 0.3813, "step": 1450 }, { "epoch": 0.349364955155601, "grad_norm": 3.538774251937866, "learning_rate": 0.00015121053753527485, "loss": 0.6439, "step": 1451 }, { "epoch": 0.3496057304520556, "grad_norm": 1.1300746202468872, "learning_rate": 0.00015114354791034225, "loss": 0.1153, "step": 1452 }, { "epoch": 0.34984650574851023, "grad_norm": 6.388082027435303, "learning_rate": 0.00015107652719001724, "loss": 1.2515, "step": 1453 }, { "epoch": 0.3500872810449648, "grad_norm": 3.4116952419281006, "learning_rate": 0.00015100947541504863, "loss": 0.4694, "step": 1454 }, { "epoch": 0.3503280563414194, "grad_norm": 1.212721586227417, "learning_rate": 0.00015094239262620406, "loss": 0.4981, "step": 1455 }, { "epoch": 0.35056883163787395, "grad_norm": 4.218289375305176, "learning_rate": 0.00015087527886426997, "loss": 0.7434, "step": 1456 }, { "epoch": 0.35080960693432856, "grad_norm": 1.7133764028549194, "learning_rate": 0.00015080813417005172, "loss": 0.5143, "step": 1457 }, { "epoch": 0.3510503822307831, "grad_norm": 2.915750503540039, "learning_rate": 0.00015074095858437343, "loss": 0.6977, "step": 1458 }, { "epoch": 0.3512911575272377, "grad_norm": 1.0086733102798462, "learning_rate": 0.00015067375214807796, "loss": 0.4913, "step": 1459 }, { "epoch": 0.3515319328236923, "grad_norm": 2.266055107116699, "learning_rate": 0.0001506065149020271, "loss": 0.4658, "step": 1460 }, { "epoch": 0.3517727081201469, "grad_norm": 1.7266699075698853, "learning_rate": 0.0001505392468871011, "loss": 1.0223, "step": 1461 }, { "epoch": 0.35201348341660144, "grad_norm": 4.561027526855469, "learning_rate": 0.00015047194814419914, "loss": 0.4841, "step": 1462 }, { "epoch": 0.35225425871305605, "grad_norm": 2.0526604652404785, "learning_rate": 0.00015040461871423897, "loss": 0.0654, "step": 1463 }, { "epoch": 0.3524950340095106, "grad_norm": 2.187910556793213, "learning_rate": 0.0001503372586381571, "loss": 0.8852, "step": 1464 }, { "epoch": 0.3527358093059652, "grad_norm": 1.6966273784637451, "learning_rate": 0.00015026986795690857, "loss": 0.5213, "step": 1465 }, { "epoch": 0.3529765846024198, "grad_norm": 1.83759343624115, "learning_rate": 0.00015020244671146702, "loss": 0.5114, "step": 1466 }, { "epoch": 0.3532173598988744, "grad_norm": 6.522552490234375, "learning_rate": 0.00015013499494282478, "loss": 0.7191, "step": 1467 }, { "epoch": 0.353458135195329, "grad_norm": 1.2005650997161865, "learning_rate": 0.00015006751269199263, "loss": 0.4789, "step": 1468 }, { "epoch": 0.35369891049178354, "grad_norm": 11.688396453857422, "learning_rate": 0.00015000000000000001, "loss": 1.0394, "step": 1469 }, { "epoch": 0.35393968578823815, "grad_norm": 2.6536548137664795, "learning_rate": 0.0001499324569078947, "loss": 0.7116, "step": 1470 }, { "epoch": 0.3541804610846927, "grad_norm": 5.886802673339844, "learning_rate": 0.00014986488345674313, "loss": 0.8322, "step": 1471 }, { "epoch": 0.3544212363811473, "grad_norm": 3.7790753841400146, "learning_rate": 0.00014979727968763003, "loss": 0.6478, "step": 1472 }, { "epoch": 0.35466201167760186, "grad_norm": 2.2750492095947266, "learning_rate": 0.0001497296456416587, "loss": 0.5024, "step": 1473 }, { "epoch": 0.35490278697405647, "grad_norm": 4.089879035949707, "learning_rate": 0.0001496619813599508, "loss": 0.5536, "step": 1474 }, { "epoch": 0.355143562270511, "grad_norm": 4.2792558670043945, "learning_rate": 0.00014959428688364633, "loss": 0.8609, "step": 1475 }, { "epoch": 0.35538433756696564, "grad_norm": 1.6434048414230347, "learning_rate": 0.0001495265622539037, "loss": 0.7308, "step": 1476 }, { "epoch": 0.3556251128634202, "grad_norm": 4.617370128631592, "learning_rate": 0.00014945880751189965, "loss": 0.8816, "step": 1477 }, { "epoch": 0.3558658881598748, "grad_norm": 1.2536977529525757, "learning_rate": 0.0001493910226988292, "loss": 0.3698, "step": 1478 }, { "epoch": 0.35610666345632935, "grad_norm": 1.1533249616622925, "learning_rate": 0.00014932320785590562, "loss": 0.7943, "step": 1479 }, { "epoch": 0.35634743875278396, "grad_norm": 3.6115407943725586, "learning_rate": 0.00014925536302436057, "loss": 0.8441, "step": 1480 }, { "epoch": 0.35658821404923857, "grad_norm": 3.1211764812469482, "learning_rate": 0.0001491874882454438, "loss": 0.3908, "step": 1481 }, { "epoch": 0.3568289893456931, "grad_norm": 1.8636207580566406, "learning_rate": 0.00014911958356042342, "loss": 0.7643, "step": 1482 }, { "epoch": 0.35706976464214774, "grad_norm": 2.321810722351074, "learning_rate": 0.00014905164901058551, "loss": 0.3578, "step": 1483 }, { "epoch": 0.3573105399386023, "grad_norm": 2.7864739894866943, "learning_rate": 0.0001489836846372345, "loss": 0.6989, "step": 1484 }, { "epoch": 0.3575513152350569, "grad_norm": 2.2333052158355713, "learning_rate": 0.0001489156904816929, "loss": 0.7687, "step": 1485 }, { "epoch": 0.35779209053151145, "grad_norm": 3.294875383377075, "learning_rate": 0.00014884766658530125, "loss": 0.7011, "step": 1486 }, { "epoch": 0.35803286582796606, "grad_norm": 4.5237250328063965, "learning_rate": 0.00014877961298941824, "loss": 0.7226, "step": 1487 }, { "epoch": 0.3582736411244206, "grad_norm": 3.1399903297424316, "learning_rate": 0.00014871152973542067, "loss": 1.068, "step": 1488 }, { "epoch": 0.3585144164208752, "grad_norm": 3.0712268352508545, "learning_rate": 0.00014864341686470324, "loss": 0.3788, "step": 1489 }, { "epoch": 0.3587551917173298, "grad_norm": 2.3056137561798096, "learning_rate": 0.0001485752744186788, "loss": 0.7056, "step": 1490 }, { "epoch": 0.3589959670137844, "grad_norm": 2.5554864406585693, "learning_rate": 0.00014850710243877803, "loss": 0.8528, "step": 1491 }, { "epoch": 0.35923674231023894, "grad_norm": 2.9446706771850586, "learning_rate": 0.0001484389009664497, "loss": 0.6246, "step": 1492 }, { "epoch": 0.35947751760669355, "grad_norm": 2.1668176651000977, "learning_rate": 0.00014837067004316049, "loss": 0.6075, "step": 1493 }, { "epoch": 0.35971829290314816, "grad_norm": 3.2588088512420654, "learning_rate": 0.00014830240971039487, "loss": 0.6044, "step": 1494 }, { "epoch": 0.3599590681996027, "grad_norm": 2.2925832271575928, "learning_rate": 0.00014823412000965533, "loss": 0.5931, "step": 1495 }, { "epoch": 0.3601998434960573, "grad_norm": 1.257023811340332, "learning_rate": 0.00014816580098246215, "loss": 0.3431, "step": 1496 }, { "epoch": 0.3604406187925119, "grad_norm": 2.16398024559021, "learning_rate": 0.00014809745267035346, "loss": 0.5645, "step": 1497 }, { "epoch": 0.3606813940889665, "grad_norm": 3.020810604095459, "learning_rate": 0.0001480290751148852, "loss": 0.8691, "step": 1498 }, { "epoch": 0.36092216938542104, "grad_norm": 2.6079869270324707, "learning_rate": 0.00014796066835763103, "loss": 0.7485, "step": 1499 }, { "epoch": 0.36116294468187565, "grad_norm": 4.599701881408691, "learning_rate": 0.00014789223244018244, "loss": 0.7325, "step": 1500 }, { "epoch": 0.3614037199783302, "grad_norm": 3.939009428024292, "learning_rate": 0.00014782376740414863, "loss": 0.6157, "step": 1501 }, { "epoch": 0.3616444952747848, "grad_norm": 1.3872252702713013, "learning_rate": 0.0001477552732911565, "loss": 0.7243, "step": 1502 }, { "epoch": 0.36188527057123937, "grad_norm": 2.120624542236328, "learning_rate": 0.00014768675014285062, "loss": 0.8203, "step": 1503 }, { "epoch": 0.362126045867694, "grad_norm": 1.584861397743225, "learning_rate": 0.0001476181980008932, "loss": 0.5647, "step": 1504 }, { "epoch": 0.36236682116414853, "grad_norm": 2.499906063079834, "learning_rate": 0.0001475496169069641, "loss": 0.4923, "step": 1505 }, { "epoch": 0.36260759646060314, "grad_norm": 4.404402732849121, "learning_rate": 0.0001474810069027608, "loss": 0.8503, "step": 1506 }, { "epoch": 0.36284837175705775, "grad_norm": 1.0523654222488403, "learning_rate": 0.00014741236802999835, "loss": 0.6165, "step": 1507 }, { "epoch": 0.3630891470535123, "grad_norm": 4.341505527496338, "learning_rate": 0.00014734370033040928, "loss": 0.8027, "step": 1508 }, { "epoch": 0.3633299223499669, "grad_norm": 1.6273728609085083, "learning_rate": 0.00014727500384574375, "loss": 0.4199, "step": 1509 }, { "epoch": 0.36357069764642147, "grad_norm": 2.939532995223999, "learning_rate": 0.00014720627861776939, "loss": 1.1424, "step": 1510 }, { "epoch": 0.3638114729428761, "grad_norm": 1.7308731079101562, "learning_rate": 0.00014713752468827128, "loss": 0.2182, "step": 1511 }, { "epoch": 0.36405224823933063, "grad_norm": 8.7367582321167, "learning_rate": 0.00014706874209905192, "loss": 0.7563, "step": 1512 }, { "epoch": 0.36429302353578524, "grad_norm": 0.6256684064865112, "learning_rate": 0.00014699993089193134, "loss": 0.7837, "step": 1513 }, { "epoch": 0.3645337988322398, "grad_norm": 2.6632747650146484, "learning_rate": 0.00014693109110874687, "loss": 0.5477, "step": 1514 }, { "epoch": 0.3647745741286944, "grad_norm": 2.9854331016540527, "learning_rate": 0.00014686222279135328, "loss": 0.6202, "step": 1515 }, { "epoch": 0.36501534942514896, "grad_norm": 4.577966690063477, "learning_rate": 0.00014679332598162265, "loss": 0.4136, "step": 1516 }, { "epoch": 0.36525612472160357, "grad_norm": 0.8396844863891602, "learning_rate": 0.00014672440072144443, "loss": 0.8962, "step": 1517 }, { "epoch": 0.3654969000180581, "grad_norm": 2.5254063606262207, "learning_rate": 0.00014665544705272525, "loss": 0.9689, "step": 1518 }, { "epoch": 0.36573767531451273, "grad_norm": 2.8939788341522217, "learning_rate": 0.0001465864650173892, "loss": 0.6918, "step": 1519 }, { "epoch": 0.3659784506109673, "grad_norm": 2.4425880908966064, "learning_rate": 0.00014651745465737737, "loss": 0.6507, "step": 1520 }, { "epoch": 0.3662192259074219, "grad_norm": 3.3433775901794434, "learning_rate": 0.00014644841601464838, "loss": 0.7875, "step": 1521 }, { "epoch": 0.3664600012038765, "grad_norm": 1.840368390083313, "learning_rate": 0.00014637934913117777, "loss": 0.6712, "step": 1522 }, { "epoch": 0.36670077650033106, "grad_norm": 0.7851834297180176, "learning_rate": 0.0001463102540489584, "loss": 0.3682, "step": 1523 }, { "epoch": 0.36694155179678567, "grad_norm": 4.149460792541504, "learning_rate": 0.00014624113081000023, "loss": 0.5221, "step": 1524 }, { "epoch": 0.3671823270932402, "grad_norm": 1.0604087114334106, "learning_rate": 0.00014617197945633037, "loss": 0.6734, "step": 1525 }, { "epoch": 0.36742310238969483, "grad_norm": 0.5313230752944946, "learning_rate": 0.00014610280002999291, "loss": 0.1435, "step": 1526 }, { "epoch": 0.3676638776861494, "grad_norm": 9.519638061523438, "learning_rate": 0.00014603359257304925, "loss": 0.8516, "step": 1527 }, { "epoch": 0.367904652982604, "grad_norm": 1.3151129484176636, "learning_rate": 0.0001459643571275775, "loss": 0.5194, "step": 1528 }, { "epoch": 0.36814542827905855, "grad_norm": 1.3871126174926758, "learning_rate": 0.00014589509373567314, "loss": 0.3852, "step": 1529 }, { "epoch": 0.36838620357551316, "grad_norm": 1.881598949432373, "learning_rate": 0.00014582580243944836, "loss": 0.5607, "step": 1530 }, { "epoch": 0.3686269788719677, "grad_norm": 3.6260831356048584, "learning_rate": 0.0001457564832810324, "loss": 0.4408, "step": 1531 }, { "epoch": 0.3688677541684223, "grad_norm": 2.3478870391845703, "learning_rate": 0.00014568713630257155, "loss": 0.8691, "step": 1532 }, { "epoch": 0.3691085294648769, "grad_norm": 1.4225029945373535, "learning_rate": 0.00014561776154622892, "loss": 0.6029, "step": 1533 }, { "epoch": 0.3693493047613315, "grad_norm": 2.58164381980896, "learning_rate": 0.00014554835905418448, "loss": 0.6517, "step": 1534 }, { "epoch": 0.3695900800577861, "grad_norm": 1.5946727991104126, "learning_rate": 0.00014547892886863508, "loss": 0.3034, "step": 1535 }, { "epoch": 0.36983085535424065, "grad_norm": 1.9315208196640015, "learning_rate": 0.00014540947103179448, "loss": 0.1705, "step": 1536 }, { "epoch": 0.37007163065069526, "grad_norm": 1.584106683731079, "learning_rate": 0.0001453399855858932, "loss": 0.6266, "step": 1537 }, { "epoch": 0.3703124059471498, "grad_norm": 1.4017444849014282, "learning_rate": 0.00014527047257317853, "loss": 0.2253, "step": 1538 }, { "epoch": 0.3705531812436044, "grad_norm": 1.8468575477600098, "learning_rate": 0.00014520093203591452, "loss": 0.6682, "step": 1539 }, { "epoch": 0.370793956540059, "grad_norm": 3.008110761642456, "learning_rate": 0.000145131364016382, "loss": 0.5261, "step": 1540 }, { "epoch": 0.3710347318365136, "grad_norm": 4.193415641784668, "learning_rate": 0.00014506176855687847, "loss": 0.9067, "step": 1541 }, { "epoch": 0.37127550713296814, "grad_norm": 1.9622831344604492, "learning_rate": 0.00014499214569971814, "loss": 1.1056, "step": 1542 }, { "epoch": 0.37151628242942275, "grad_norm": 1.8108497858047485, "learning_rate": 0.00014492249548723188, "loss": 1.02, "step": 1543 }, { "epoch": 0.3717570577258773, "grad_norm": 3.2026660442352295, "learning_rate": 0.00014485281796176714, "loss": 0.675, "step": 1544 }, { "epoch": 0.3719978330223319, "grad_norm": 2.4484424591064453, "learning_rate": 0.00014478311316568797, "loss": 0.3577, "step": 1545 }, { "epoch": 0.37223860831878647, "grad_norm": 1.7263269424438477, "learning_rate": 0.00014471338114137517, "loss": 0.7703, "step": 1546 }, { "epoch": 0.3724793836152411, "grad_norm": 18.308795928955078, "learning_rate": 0.00014464362193122586, "loss": 0.6747, "step": 1547 }, { "epoch": 0.3727201589116957, "grad_norm": 2.601501941680908, "learning_rate": 0.00014457383557765386, "loss": 1.0085, "step": 1548 }, { "epoch": 0.37296093420815024, "grad_norm": 1.59297513961792, "learning_rate": 0.00014450402212308936, "loss": 0.5779, "step": 1549 }, { "epoch": 0.37320170950460485, "grad_norm": 2.8175299167633057, "learning_rate": 0.00014443418160997918, "loss": 0.5384, "step": 1550 }, { "epoch": 0.3734424848010594, "grad_norm": 4.92849063873291, "learning_rate": 0.00014436431408078643, "loss": 0.5555, "step": 1551 }, { "epoch": 0.373683260097514, "grad_norm": 2.1497936248779297, "learning_rate": 0.00014429441957799078, "loss": 0.6927, "step": 1552 }, { "epoch": 0.37392403539396857, "grad_norm": 0.8706673979759216, "learning_rate": 0.00014422449814408824, "loss": 0.2299, "step": 1553 }, { "epoch": 0.3741648106904232, "grad_norm": 1.8380601406097412, "learning_rate": 0.0001441545498215912, "loss": 0.6146, "step": 1554 }, { "epoch": 0.37440558598687773, "grad_norm": 1.3901095390319824, "learning_rate": 0.0001440845746530284, "loss": 0.3206, "step": 1555 }, { "epoch": 0.37464636128333234, "grad_norm": 1.46050226688385, "learning_rate": 0.00014401457268094483, "loss": 0.7791, "step": 1556 }, { "epoch": 0.3748871365797869, "grad_norm": 4.091619968414307, "learning_rate": 0.0001439445439479019, "loss": 0.5138, "step": 1557 }, { "epoch": 0.3751279118762415, "grad_norm": 1.6713485717773438, "learning_rate": 0.00014387448849647732, "loss": 0.3188, "step": 1558 }, { "epoch": 0.37536868717269606, "grad_norm": 3.4357035160064697, "learning_rate": 0.00014380440636926485, "loss": 0.6026, "step": 1559 }, { "epoch": 0.37560946246915067, "grad_norm": 3.0857181549072266, "learning_rate": 0.00014373429760887457, "loss": 0.7203, "step": 1560 }, { "epoch": 0.3758502377656052, "grad_norm": 3.1348352432250977, "learning_rate": 0.00014366416225793284, "loss": 0.611, "step": 1561 }, { "epoch": 0.37609101306205983, "grad_norm": 1.1630622148513794, "learning_rate": 0.0001435940003590821, "loss": 0.3327, "step": 1562 }, { "epoch": 0.37633178835851444, "grad_norm": 1.690561294555664, "learning_rate": 0.00014352381195498093, "loss": 0.6988, "step": 1563 }, { "epoch": 0.376572563654969, "grad_norm": 3.028482437133789, "learning_rate": 0.000143453597088304, "loss": 0.594, "step": 1564 }, { "epoch": 0.3768133389514236, "grad_norm": 0.997052013874054, "learning_rate": 0.00014338335580174212, "loss": 0.8037, "step": 1565 }, { "epoch": 0.37705411424787816, "grad_norm": 7.6312079429626465, "learning_rate": 0.00014331308813800222, "loss": 1.306, "step": 1566 }, { "epoch": 0.37729488954433277, "grad_norm": 2.4936201572418213, "learning_rate": 0.00014324279413980713, "loss": 0.4458, "step": 1567 }, { "epoch": 0.3775356648407873, "grad_norm": 2.824725389480591, "learning_rate": 0.00014317247384989577, "loss": 0.5562, "step": 1568 }, { "epoch": 0.37777644013724193, "grad_norm": 1.7765711545944214, "learning_rate": 0.00014310212731102304, "loss": 0.5947, "step": 1569 }, { "epoch": 0.3780172154336965, "grad_norm": 9.701804161071777, "learning_rate": 0.00014303175456595977, "loss": 1.0711, "step": 1570 }, { "epoch": 0.3782579907301511, "grad_norm": 2.2651548385620117, "learning_rate": 0.0001429613556574928, "loss": 0.7001, "step": 1571 }, { "epoch": 0.37849876602660565, "grad_norm": 1.20858895778656, "learning_rate": 0.0001428909306284248, "loss": 0.6168, "step": 1572 }, { "epoch": 0.37873954132306026, "grad_norm": 1.3196377754211426, "learning_rate": 0.00014282047952157432, "loss": 0.8402, "step": 1573 }, { "epoch": 0.3789803166195148, "grad_norm": 1.9669349193572998, "learning_rate": 0.00014275000237977582, "loss": 0.5, "step": 1574 }, { "epoch": 0.3792210919159694, "grad_norm": 2.7113590240478516, "learning_rate": 0.00014267949924587958, "loss": 0.2134, "step": 1575 }, { "epoch": 0.37946186721242403, "grad_norm": 7.937801837921143, "learning_rate": 0.00014260897016275166, "loss": 0.4475, "step": 1576 }, { "epoch": 0.3797026425088786, "grad_norm": 1.9907861948013306, "learning_rate": 0.00014253841517327382, "loss": 0.7746, "step": 1577 }, { "epoch": 0.3799434178053332, "grad_norm": 2.06160569190979, "learning_rate": 0.00014246783432034373, "loss": 0.9227, "step": 1578 }, { "epoch": 0.38018419310178775, "grad_norm": 2.051358461380005, "learning_rate": 0.00014239722764687474, "loss": 0.7264, "step": 1579 }, { "epoch": 0.38042496839824236, "grad_norm": 3.846851110458374, "learning_rate": 0.0001423265951957958, "loss": 0.9719, "step": 1580 }, { "epoch": 0.3806657436946969, "grad_norm": 1.47300124168396, "learning_rate": 0.00014225593701005157, "loss": 1.17, "step": 1581 }, { "epoch": 0.3809065189911515, "grad_norm": 4.379542827606201, "learning_rate": 0.0001421852531326025, "loss": 0.5917, "step": 1582 }, { "epoch": 0.3811472942876061, "grad_norm": 1.3115028142929077, "learning_rate": 0.00014211454360642443, "loss": 0.1916, "step": 1583 }, { "epoch": 0.3813880695840607, "grad_norm": 2.608750343322754, "learning_rate": 0.00014204380847450897, "loss": 0.8763, "step": 1584 }, { "epoch": 0.38162884488051524, "grad_norm": 3.5941126346588135, "learning_rate": 0.00014197304777986325, "loss": 0.1222, "step": 1585 }, { "epoch": 0.38186962017696985, "grad_norm": 4.869987487792969, "learning_rate": 0.0001419022615655099, "loss": 0.8706, "step": 1586 }, { "epoch": 0.3821103954734244, "grad_norm": 3.05656099319458, "learning_rate": 0.00014183144987448711, "loss": 0.5847, "step": 1587 }, { "epoch": 0.382351170769879, "grad_norm": 1.0079351663589478, "learning_rate": 0.00014176061274984858, "loss": 0.2984, "step": 1588 }, { "epoch": 0.3825919460663336, "grad_norm": 3.344771146774292, "learning_rate": 0.00014168975023466337, "loss": 0.5847, "step": 1589 }, { "epoch": 0.3828327213627882, "grad_norm": 2.857647657394409, "learning_rate": 0.00014161886237201612, "loss": 1.2925, "step": 1590 }, { "epoch": 0.3830734966592428, "grad_norm": 2.705115795135498, "learning_rate": 0.00014154794920500673, "loss": 0.5277, "step": 1591 }, { "epoch": 0.38331427195569734, "grad_norm": 1.9536807537078857, "learning_rate": 0.00014147701077675065, "loss": 0.5553, "step": 1592 }, { "epoch": 0.38355504725215195, "grad_norm": 1.2713546752929688, "learning_rate": 0.00014140604713037857, "loss": 0.507, "step": 1593 }, { "epoch": 0.3837958225486065, "grad_norm": 2.593982219696045, "learning_rate": 0.00014133505830903658, "loss": 0.3527, "step": 1594 }, { "epoch": 0.3840365978450611, "grad_norm": 0.6847010254859924, "learning_rate": 0.00014126404435588596, "loss": 0.1223, "step": 1595 }, { "epoch": 0.38427737314151567, "grad_norm": 1.8529340028762817, "learning_rate": 0.00014119300531410342, "loss": 0.44, "step": 1596 }, { "epoch": 0.3845181484379703, "grad_norm": 2.9218854904174805, "learning_rate": 0.0001411219412268808, "loss": 0.4854, "step": 1597 }, { "epoch": 0.38475892373442483, "grad_norm": 2.5640389919281006, "learning_rate": 0.00014105085213742533, "loss": 0.7238, "step": 1598 }, { "epoch": 0.38499969903087944, "grad_norm": 0.7277923822402954, "learning_rate": 0.00014097973808895926, "loss": 0.2205, "step": 1599 }, { "epoch": 0.385240474327334, "grad_norm": 1.501104474067688, "learning_rate": 0.00014090859912472005, "loss": 0.1477, "step": 1600 }, { "epoch": 0.3854812496237886, "grad_norm": 3.788515329360962, "learning_rate": 0.00014083743528796045, "loss": 1.0636, "step": 1601 }, { "epoch": 0.38572202492024316, "grad_norm": 2.322822332382202, "learning_rate": 0.00014076624662194816, "loss": 0.283, "step": 1602 }, { "epoch": 0.38596280021669777, "grad_norm": 1.9796638488769531, "learning_rate": 0.00014069503316996613, "loss": 0.3978, "step": 1603 }, { "epoch": 0.3862035755131524, "grad_norm": 0.6974928379058838, "learning_rate": 0.0001406237949753122, "loss": 0.8248, "step": 1604 }, { "epoch": 0.38644435080960693, "grad_norm": 0.8106366991996765, "learning_rate": 0.00014055253208129938, "loss": 0.5371, "step": 1605 }, { "epoch": 0.38668512610606154, "grad_norm": 1.5011775493621826, "learning_rate": 0.00014048124453125573, "loss": 0.2772, "step": 1606 }, { "epoch": 0.3869259014025161, "grad_norm": 1.6291502714157104, "learning_rate": 0.0001404099323685242, "loss": 0.1682, "step": 1607 }, { "epoch": 0.3871666766989707, "grad_norm": 1.8147183656692505, "learning_rate": 0.00014033859563646276, "loss": 0.1837, "step": 1608 }, { "epoch": 0.38740745199542526, "grad_norm": 2.469822645187378, "learning_rate": 0.00014026723437844421, "loss": 0.7883, "step": 1609 }, { "epoch": 0.38764822729187987, "grad_norm": 6.157069683074951, "learning_rate": 0.00014019584863785652, "loss": 0.3593, "step": 1610 }, { "epoch": 0.3878890025883344, "grad_norm": 1.2629841566085815, "learning_rate": 0.00014012443845810223, "loss": 0.4991, "step": 1611 }, { "epoch": 0.38812977788478903, "grad_norm": 2.7113308906555176, "learning_rate": 0.000140053003882599, "loss": 1.136, "step": 1612 }, { "epoch": 0.3883705531812436, "grad_norm": 3.3584749698638916, "learning_rate": 0.00013998154495477912, "loss": 0.1191, "step": 1613 }, { "epoch": 0.3886113284776982, "grad_norm": 2.5008931159973145, "learning_rate": 0.0001399100617180899, "loss": 0.6197, "step": 1614 }, { "epoch": 0.38885210377415275, "grad_norm": 1.7047406435012817, "learning_rate": 0.00013983855421599318, "loss": 0.6819, "step": 1615 }, { "epoch": 0.38909287907060736, "grad_norm": 1.2568997144699097, "learning_rate": 0.0001397670224919658, "loss": 0.4986, "step": 1616 }, { "epoch": 0.38933365436706197, "grad_norm": 4.387941360473633, "learning_rate": 0.0001396954665894991, "loss": 0.4947, "step": 1617 }, { "epoch": 0.3895744296635165, "grad_norm": 1.8967385292053223, "learning_rate": 0.00013962388655209927, "loss": 0.6985, "step": 1618 }, { "epoch": 0.38981520495997113, "grad_norm": 3.39685320854187, "learning_rate": 0.00013955228242328718, "loss": 1.0637, "step": 1619 }, { "epoch": 0.3900559802564257, "grad_norm": 4.821850299835205, "learning_rate": 0.00013948065424659824, "loss": 0.4031, "step": 1620 }, { "epoch": 0.3902967555528803, "grad_norm": 2.4104623794555664, "learning_rate": 0.00013940900206558257, "loss": 0.9255, "step": 1621 }, { "epoch": 0.39053753084933485, "grad_norm": 2.2007462978363037, "learning_rate": 0.00013933732592380483, "loss": 0.5469, "step": 1622 }, { "epoch": 0.39077830614578946, "grad_norm": 2.2772059440612793, "learning_rate": 0.00013926562586484434, "loss": 0.4233, "step": 1623 }, { "epoch": 0.391019081442244, "grad_norm": 2.6534852981567383, "learning_rate": 0.00013919390193229485, "loss": 0.3978, "step": 1624 }, { "epoch": 0.3912598567386986, "grad_norm": 0.3831101357936859, "learning_rate": 0.00013912215416976467, "loss": 0.2271, "step": 1625 }, { "epoch": 0.3915006320351532, "grad_norm": 1.9152987003326416, "learning_rate": 0.00013905038262087662, "loss": 0.522, "step": 1626 }, { "epoch": 0.3917414073316078, "grad_norm": 2.0952141284942627, "learning_rate": 0.00013897858732926793, "loss": 0.2229, "step": 1627 }, { "epoch": 0.39198218262806234, "grad_norm": 10.112699508666992, "learning_rate": 0.00013890676833859037, "loss": 1.0788, "step": 1628 }, { "epoch": 0.39222295792451695, "grad_norm": 2.1068572998046875, "learning_rate": 0.00013883492569250998, "loss": 0.5627, "step": 1629 }, { "epoch": 0.39246373322097156, "grad_norm": 2.1683926582336426, "learning_rate": 0.00013876305943470724, "loss": 1.0251, "step": 1630 }, { "epoch": 0.3927045085174261, "grad_norm": 5.917585372924805, "learning_rate": 0.00013869116960887708, "loss": 0.6836, "step": 1631 }, { "epoch": 0.3929452838138807, "grad_norm": 2.575009346008301, "learning_rate": 0.0001386192562587286, "loss": 0.8661, "step": 1632 }, { "epoch": 0.3931860591103353, "grad_norm": 2.4185233116149902, "learning_rate": 0.00013854731942798532, "loss": 0.7001, "step": 1633 }, { "epoch": 0.3934268344067899, "grad_norm": 1.6709206104278564, "learning_rate": 0.00013847535916038496, "loss": 0.364, "step": 1634 }, { "epoch": 0.39366760970324444, "grad_norm": 3.425093650817871, "learning_rate": 0.00013840337549967955, "loss": 0.3667, "step": 1635 }, { "epoch": 0.39390838499969905, "grad_norm": 1.7669458389282227, "learning_rate": 0.00013833136848963532, "loss": 0.733, "step": 1636 }, { "epoch": 0.3941491602961536, "grad_norm": 2.1822469234466553, "learning_rate": 0.00013825933817403267, "loss": 0.7814, "step": 1637 }, { "epoch": 0.3943899355926082, "grad_norm": 8.053266525268555, "learning_rate": 0.00013818728459666623, "loss": 0.9111, "step": 1638 }, { "epoch": 0.39463071088906276, "grad_norm": 1.4243130683898926, "learning_rate": 0.0001381152078013447, "loss": 0.4235, "step": 1639 }, { "epoch": 0.3948714861855174, "grad_norm": 1.732535481452942, "learning_rate": 0.00013804310783189098, "loss": 0.3293, "step": 1640 }, { "epoch": 0.3951122614819719, "grad_norm": 1.332587718963623, "learning_rate": 0.00013797098473214197, "loss": 0.6848, "step": 1641 }, { "epoch": 0.39535303677842654, "grad_norm": 1.3026105165481567, "learning_rate": 0.0001378988385459487, "loss": 0.7852, "step": 1642 }, { "epoch": 0.3955938120748811, "grad_norm": 2.118013620376587, "learning_rate": 0.0001378266693171762, "loss": 0.4796, "step": 1643 }, { "epoch": 0.3958345873713357, "grad_norm": 2.2776410579681396, "learning_rate": 0.00013775447708970351, "loss": 1.0214, "step": 1644 }, { "epoch": 0.3960753626677903, "grad_norm": 1.8297806978225708, "learning_rate": 0.0001376822619074237, "loss": 0.4031, "step": 1645 }, { "epoch": 0.39631613796424486, "grad_norm": 1.5983656644821167, "learning_rate": 0.0001376100238142438, "loss": 0.2453, "step": 1646 }, { "epoch": 0.3965569132606995, "grad_norm": 1.8416905403137207, "learning_rate": 0.00013753776285408464, "loss": 0.5695, "step": 1647 }, { "epoch": 0.396797688557154, "grad_norm": 2.1590733528137207, "learning_rate": 0.00013746547907088108, "loss": 0.1617, "step": 1648 }, { "epoch": 0.39703846385360864, "grad_norm": 2.4669997692108154, "learning_rate": 0.00013739317250858186, "loss": 0.5653, "step": 1649 }, { "epoch": 0.3972792391500632, "grad_norm": 1.7538673877716064, "learning_rate": 0.0001373208432111495, "loss": 0.16, "step": 1650 }, { "epoch": 0.3975200144465178, "grad_norm": 2.019120216369629, "learning_rate": 0.00013724849122256035, "loss": 0.6373, "step": 1651 }, { "epoch": 0.39776078974297235, "grad_norm": 1.4879308938980103, "learning_rate": 0.00013717611658680464, "loss": 0.8454, "step": 1652 }, { "epoch": 0.39800156503942696, "grad_norm": 0.9595705270767212, "learning_rate": 0.00013710371934788632, "loss": 0.532, "step": 1653 }, { "epoch": 0.3982423403358815, "grad_norm": 1.8083183765411377, "learning_rate": 0.00013703129954982299, "loss": 0.4841, "step": 1654 }, { "epoch": 0.3984831156323361, "grad_norm": 1.0364370346069336, "learning_rate": 0.00013695885723664616, "loss": 0.2084, "step": 1655 }, { "epoch": 0.3987238909287907, "grad_norm": 6.035412788391113, "learning_rate": 0.00013688639245240078, "loss": 0.7487, "step": 1656 }, { "epoch": 0.3989646662252453, "grad_norm": 1.0442893505096436, "learning_rate": 0.00013681390524114575, "loss": 0.422, "step": 1657 }, { "epoch": 0.3992054415216999, "grad_norm": 2.071849822998047, "learning_rate": 0.00013674139564695333, "loss": 0.5663, "step": 1658 }, { "epoch": 0.39944621681815445, "grad_norm": 2.249422311782837, "learning_rate": 0.00013666886371390967, "loss": 0.679, "step": 1659 }, { "epoch": 0.39968699211460906, "grad_norm": 5.166494369506836, "learning_rate": 0.0001365963094861142, "loss": 0.9236, "step": 1660 }, { "epoch": 0.3999277674110636, "grad_norm": 2.5879993438720703, "learning_rate": 0.0001365237330076801, "loss": 0.642, "step": 1661 }, { "epoch": 0.4001685427075182, "grad_norm": 2.8723905086517334, "learning_rate": 0.00013645113432273403, "loss": 0.7538, "step": 1662 }, { "epoch": 0.4004093180039728, "grad_norm": 1.0138564109802246, "learning_rate": 0.0001363785134754162, "loss": 0.511, "step": 1663 }, { "epoch": 0.4006500933004274, "grad_norm": 3.8104164600372314, "learning_rate": 0.00013630587050988022, "loss": 0.4648, "step": 1664 }, { "epoch": 0.40089086859688194, "grad_norm": 2.2068583965301514, "learning_rate": 0.00013623320547029316, "loss": 0.6258, "step": 1665 }, { "epoch": 0.40113164389333655, "grad_norm": 1.245370864868164, "learning_rate": 0.0001361605184008355, "loss": 0.4723, "step": 1666 }, { "epoch": 0.4013724191897911, "grad_norm": 1.0925084352493286, "learning_rate": 0.00013608780934570123, "loss": 0.5381, "step": 1667 }, { "epoch": 0.4016131944862457, "grad_norm": 6.653575897216797, "learning_rate": 0.00013601507834909757, "loss": 0.5606, "step": 1668 }, { "epoch": 0.40185396978270027, "grad_norm": 1.6157435178756714, "learning_rate": 0.0001359423254552451, "loss": 0.8517, "step": 1669 }, { "epoch": 0.4020947450791549, "grad_norm": 1.4830398559570312, "learning_rate": 0.00013586955070837777, "loss": 0.895, "step": 1670 }, { "epoch": 0.4023355203756095, "grad_norm": 0.835504949092865, "learning_rate": 0.00013579675415274284, "loss": 0.3608, "step": 1671 }, { "epoch": 0.40257629567206404, "grad_norm": 3.575409173965454, "learning_rate": 0.00013572393583260073, "loss": 0.985, "step": 1672 }, { "epoch": 0.40281707096851865, "grad_norm": 2.397228479385376, "learning_rate": 0.0001356510957922251, "loss": 0.5574, "step": 1673 }, { "epoch": 0.4030578462649732, "grad_norm": 1.162008285522461, "learning_rate": 0.00013557823407590294, "loss": 0.4828, "step": 1674 }, { "epoch": 0.4032986215614278, "grad_norm": 2.0564050674438477, "learning_rate": 0.00013550535072793428, "loss": 1.0467, "step": 1675 }, { "epoch": 0.40353939685788237, "grad_norm": 4.555008888244629, "learning_rate": 0.00013543244579263244, "loss": 0.645, "step": 1676 }, { "epoch": 0.403780172154337, "grad_norm": 1.655927062034607, "learning_rate": 0.00013535951931432366, "loss": 0.5477, "step": 1677 }, { "epoch": 0.40402094745079153, "grad_norm": 2.4142045974731445, "learning_rate": 0.0001352865713373475, "loss": 0.5651, "step": 1678 }, { "epoch": 0.40426172274724614, "grad_norm": 2.2285380363464355, "learning_rate": 0.00013521360190605646, "loss": 0.648, "step": 1679 }, { "epoch": 0.4045024980437007, "grad_norm": 3.8250715732574463, "learning_rate": 0.00013514061106481614, "loss": 0.9591, "step": 1680 }, { "epoch": 0.4047432733401553, "grad_norm": 0.9585970640182495, "learning_rate": 0.0001350675988580051, "loss": 0.3991, "step": 1681 }, { "epoch": 0.40498404863660986, "grad_norm": 9.034631729125977, "learning_rate": 0.00013499456533001497, "loss": 0.5749, "step": 1682 }, { "epoch": 0.40522482393306447, "grad_norm": 2.0019724369049072, "learning_rate": 0.00013492151052525023, "loss": 0.1236, "step": 1683 }, { "epoch": 0.405465599229519, "grad_norm": 11.653858184814453, "learning_rate": 0.00013484843448812844, "loss": 0.785, "step": 1684 }, { "epoch": 0.40570637452597363, "grad_norm": 2.2401812076568604, "learning_rate": 0.00013477533726308, "loss": 0.8912, "step": 1685 }, { "epoch": 0.40594714982242824, "grad_norm": 12.922853469848633, "learning_rate": 0.0001347022188945481, "loss": 0.6012, "step": 1686 }, { "epoch": 0.4061879251188828, "grad_norm": 1.3376822471618652, "learning_rate": 0.00013462907942698895, "loss": 1.2057, "step": 1687 }, { "epoch": 0.4064287004153374, "grad_norm": 6.5069708824157715, "learning_rate": 0.00013455591890487148, "loss": 0.8799, "step": 1688 }, { "epoch": 0.40666947571179196, "grad_norm": 1.1161401271820068, "learning_rate": 0.0001344827373726775, "loss": 1.4456, "step": 1689 }, { "epoch": 0.40691025100824657, "grad_norm": 0.9486348032951355, "learning_rate": 0.00013440953487490144, "loss": 0.5933, "step": 1690 }, { "epoch": 0.4071510263047011, "grad_norm": 1.8005541563034058, "learning_rate": 0.0001343363114560507, "loss": 0.7821, "step": 1691 }, { "epoch": 0.40739180160115573, "grad_norm": 2.908756732940674, "learning_rate": 0.0001342630671606452, "loss": 0.8259, "step": 1692 }, { "epoch": 0.4076325768976103, "grad_norm": 1.161380648612976, "learning_rate": 0.00013418980203321772, "loss": 0.7767, "step": 1693 }, { "epoch": 0.4078733521940649, "grad_norm": 2.2439661026000977, "learning_rate": 0.00013411651611831352, "loss": 0.3818, "step": 1694 }, { "epoch": 0.40811412749051945, "grad_norm": 2.2217512130737305, "learning_rate": 0.00013404320946049068, "loss": 0.2162, "step": 1695 }, { "epoch": 0.40835490278697406, "grad_norm": 2.809119462966919, "learning_rate": 0.00013396988210431977, "loss": 0.7169, "step": 1696 }, { "epoch": 0.4085956780834286, "grad_norm": 2.8886725902557373, "learning_rate": 0.00013389653409438406, "loss": 0.349, "step": 1697 }, { "epoch": 0.4088364533798832, "grad_norm": 1.2677594423294067, "learning_rate": 0.00013382316547527919, "loss": 0.2073, "step": 1698 }, { "epoch": 0.40907722867633783, "grad_norm": 6.638054370880127, "learning_rate": 0.00013374977629161355, "loss": 0.9768, "step": 1699 }, { "epoch": 0.4093180039727924, "grad_norm": 2.200249195098877, "learning_rate": 0.00013367636658800783, "loss": 0.4204, "step": 1700 }, { "epoch": 0.409558779269247, "grad_norm": 2.565556526184082, "learning_rate": 0.0001336029364090954, "loss": 0.8401, "step": 1701 }, { "epoch": 0.40979955456570155, "grad_norm": 1.9111295938491821, "learning_rate": 0.0001335294857995219, "loss": 1.2343, "step": 1702 }, { "epoch": 0.41004032986215616, "grad_norm": 5.341217041015625, "learning_rate": 0.0001334560148039455, "loss": 0.371, "step": 1703 }, { "epoch": 0.4102811051586107, "grad_norm": 1.3484272956848145, "learning_rate": 0.00013338252346703673, "loss": 0.7788, "step": 1704 }, { "epoch": 0.4105218804550653, "grad_norm": 2.7777099609375, "learning_rate": 0.00013330901183347847, "loss": 0.4438, "step": 1705 }, { "epoch": 0.4107626557515199, "grad_norm": 2.4722752571105957, "learning_rate": 0.00013323547994796597, "loss": 0.4454, "step": 1706 }, { "epoch": 0.4110034310479745, "grad_norm": 2.2678263187408447, "learning_rate": 0.0001331619278552068, "loss": 0.4712, "step": 1707 }, { "epoch": 0.41124420634442904, "grad_norm": 2.552933692932129, "learning_rate": 0.00013308835559992075, "loss": 0.6171, "step": 1708 }, { "epoch": 0.41148498164088365, "grad_norm": 4.140172958374023, "learning_rate": 0.00013301476322683997, "loss": 1.2291, "step": 1709 }, { "epoch": 0.4117257569373382, "grad_norm": 1.883234977722168, "learning_rate": 0.00013294115078070875, "loss": 0.3714, "step": 1710 }, { "epoch": 0.4119665322337928, "grad_norm": 1.5748333930969238, "learning_rate": 0.00013286751830628363, "loss": 0.891, "step": 1711 }, { "epoch": 0.4122073075302474, "grad_norm": 1.558668613433838, "learning_rate": 0.00013279386584833335, "loss": 0.5892, "step": 1712 }, { "epoch": 0.412448082826702, "grad_norm": 6.810975074768066, "learning_rate": 0.00013272019345163873, "loss": 0.5012, "step": 1713 }, { "epoch": 0.4126888581231566, "grad_norm": 2.5344254970550537, "learning_rate": 0.00013264650116099277, "loss": 0.6199, "step": 1714 }, { "epoch": 0.41292963341961114, "grad_norm": 1.2778170108795166, "learning_rate": 0.00013257278902120058, "loss": 0.8041, "step": 1715 }, { "epoch": 0.41317040871606575, "grad_norm": 1.3319803476333618, "learning_rate": 0.00013249905707707926, "loss": 1.0953, "step": 1716 }, { "epoch": 0.4134111840125203, "grad_norm": 4.633189678192139, "learning_rate": 0.000132425305373458, "loss": 0.9773, "step": 1717 }, { "epoch": 0.4136519593089749, "grad_norm": 1.2184745073318481, "learning_rate": 0.00013235153395517804, "loss": 0.7046, "step": 1718 }, { "epoch": 0.41389273460542947, "grad_norm": 1.2916301488876343, "learning_rate": 0.00013227774286709253, "loss": 0.2718, "step": 1719 }, { "epoch": 0.4141335099018841, "grad_norm": 1.1648756265640259, "learning_rate": 0.00013220393215406664, "loss": 0.446, "step": 1720 }, { "epoch": 0.41437428519833863, "grad_norm": 2.0171449184417725, "learning_rate": 0.00013213010186097744, "loss": 0.1262, "step": 1721 }, { "epoch": 0.41461506049479324, "grad_norm": 2.397416591644287, "learning_rate": 0.00013205625203271395, "loss": 0.7722, "step": 1722 }, { "epoch": 0.4148558357912478, "grad_norm": 0.6799049377441406, "learning_rate": 0.00013198238271417697, "loss": 0.6582, "step": 1723 }, { "epoch": 0.4150966110877024, "grad_norm": 2.0616261959075928, "learning_rate": 0.00013190849395027928, "loss": 1.2671, "step": 1724 }, { "epoch": 0.41533738638415696, "grad_norm": 0.9546332955360413, "learning_rate": 0.00013183458578594533, "loss": 0.2217, "step": 1725 }, { "epoch": 0.41557816168061157, "grad_norm": 4.271639823913574, "learning_rate": 0.0001317606582661115, "loss": 0.6956, "step": 1726 }, { "epoch": 0.4158189369770662, "grad_norm": 1.4144961833953857, "learning_rate": 0.0001316867114357259, "loss": 0.6413, "step": 1727 }, { "epoch": 0.41605971227352073, "grad_norm": 0.7294138073921204, "learning_rate": 0.00013161274533974836, "loss": 0.3907, "step": 1728 }, { "epoch": 0.41630048756997534, "grad_norm": 1.5399507284164429, "learning_rate": 0.00013153876002315045, "loss": 0.635, "step": 1729 }, { "epoch": 0.4165412628664299, "grad_norm": 1.771852731704712, "learning_rate": 0.00013146475553091536, "loss": 0.4428, "step": 1730 }, { "epoch": 0.4167820381628845, "grad_norm": 1.6749565601348877, "learning_rate": 0.000131390731908038, "loss": 0.5446, "step": 1731 }, { "epoch": 0.41702281345933906, "grad_norm": 1.4587945938110352, "learning_rate": 0.00013131668919952495, "loss": 0.8724, "step": 1732 }, { "epoch": 0.41726358875579367, "grad_norm": 2.2476232051849365, "learning_rate": 0.0001312426274503943, "loss": 0.184, "step": 1733 }, { "epoch": 0.4175043640522482, "grad_norm": 1.1287919282913208, "learning_rate": 0.00013116854670567577, "loss": 0.4209, "step": 1734 }, { "epoch": 0.41774513934870283, "grad_norm": 0.6734319925308228, "learning_rate": 0.00013109444701041057, "loss": 0.2422, "step": 1735 }, { "epoch": 0.4179859146451574, "grad_norm": 2.265183448791504, "learning_rate": 0.0001310203284096516, "loss": 0.6902, "step": 1736 }, { "epoch": 0.418226689941612, "grad_norm": 3.4037933349609375, "learning_rate": 0.00013094619094846304, "loss": 0.628, "step": 1737 }, { "epoch": 0.41846746523806655, "grad_norm": 4.971876621246338, "learning_rate": 0.00013087203467192067, "loss": 0.9363, "step": 1738 }, { "epoch": 0.41870824053452116, "grad_norm": 2.3928446769714355, "learning_rate": 0.00013079785962511164, "loss": 0.5608, "step": 1739 }, { "epoch": 0.41894901583097577, "grad_norm": 1.1700559854507446, "learning_rate": 0.0001307236658531346, "loss": 0.4105, "step": 1740 }, { "epoch": 0.4191897911274303, "grad_norm": 1.4026082754135132, "learning_rate": 0.00013064945340109948, "loss": 0.6884, "step": 1741 }, { "epoch": 0.41943056642388493, "grad_norm": 2.345377206802368, "learning_rate": 0.00013057522231412765, "loss": 0.6579, "step": 1742 }, { "epoch": 0.4196713417203395, "grad_norm": 0.7213815450668335, "learning_rate": 0.00013050097263735174, "loss": 0.4405, "step": 1743 }, { "epoch": 0.4199121170167941, "grad_norm": 0.8045918941497803, "learning_rate": 0.0001304267044159158, "loss": 0.4975, "step": 1744 }, { "epoch": 0.42015289231324865, "grad_norm": 1.4894392490386963, "learning_rate": 0.000130352417694975, "loss": 0.5714, "step": 1745 }, { "epoch": 0.42039366760970326, "grad_norm": 8.357844352722168, "learning_rate": 0.00013027811251969585, "loss": 0.6262, "step": 1746 }, { "epoch": 0.4206344429061578, "grad_norm": 1.4566922187805176, "learning_rate": 0.00013020378893525603, "loss": 0.1933, "step": 1747 }, { "epoch": 0.4208752182026124, "grad_norm": 0.6821098327636719, "learning_rate": 0.00013012944698684455, "loss": 0.4767, "step": 1748 }, { "epoch": 0.421115993499067, "grad_norm": 2.3084802627563477, "learning_rate": 0.00013005508671966141, "loss": 0.6336, "step": 1749 }, { "epoch": 0.4213567687955216, "grad_norm": 2.982093572616577, "learning_rate": 0.0001299807081789178, "loss": 0.5048, "step": 1750 }, { "epoch": 0.42159754409197614, "grad_norm": 1.2381023168563843, "learning_rate": 0.0001299063114098361, "loss": 0.6217, "step": 1751 }, { "epoch": 0.42183831938843075, "grad_norm": 2.4936861991882324, "learning_rate": 0.00012983189645764966, "loss": 0.5497, "step": 1752 }, { "epoch": 0.4220790946848853, "grad_norm": 0.9683302044868469, "learning_rate": 0.00012975746336760298, "loss": 0.3565, "step": 1753 }, { "epoch": 0.4223198699813399, "grad_norm": 3.492793083190918, "learning_rate": 0.00012968301218495152, "loss": 0.5133, "step": 1754 }, { "epoch": 0.4225606452777945, "grad_norm": 2.7869482040405273, "learning_rate": 0.00012960854295496178, "loss": 0.9106, "step": 1755 }, { "epoch": 0.4228014205742491, "grad_norm": 2.8004496097564697, "learning_rate": 0.00012953405572291117, "loss": 0.5493, "step": 1756 }, { "epoch": 0.4230421958707037, "grad_norm": 1.4663894176483154, "learning_rate": 0.0001294595505340882, "loss": 0.4555, "step": 1757 }, { "epoch": 0.42328297116715824, "grad_norm": 14.337491035461426, "learning_rate": 0.00012938502743379212, "loss": 0.7048, "step": 1758 }, { "epoch": 0.42352374646361285, "grad_norm": 1.1422491073608398, "learning_rate": 0.00012931048646733313, "loss": 0.6569, "step": 1759 }, { "epoch": 0.4237645217600674, "grad_norm": 0.4564094841480255, "learning_rate": 0.00012923592768003235, "loss": 0.1381, "step": 1760 }, { "epoch": 0.424005297056522, "grad_norm": 2.7814853191375732, "learning_rate": 0.00012916135111722165, "loss": 0.5271, "step": 1761 }, { "epoch": 0.42424607235297657, "grad_norm": 3.1444740295410156, "learning_rate": 0.0001290867568242438, "loss": 0.9703, "step": 1762 }, { "epoch": 0.4244868476494312, "grad_norm": 0.9466924071311951, "learning_rate": 0.00012901214484645226, "loss": 0.64, "step": 1763 }, { "epoch": 0.42472762294588573, "grad_norm": 0.9237557053565979, "learning_rate": 0.00012893751522921124, "loss": 0.6848, "step": 1764 }, { "epoch": 0.42496839824234034, "grad_norm": 2.4244697093963623, "learning_rate": 0.00012886286801789583, "loss": 0.6039, "step": 1765 }, { "epoch": 0.4252091735387949, "grad_norm": 9.26452922821045, "learning_rate": 0.00012878820325789162, "loss": 0.6834, "step": 1766 }, { "epoch": 0.4254499488352495, "grad_norm": 0.7539100646972656, "learning_rate": 0.00012871352099459496, "loss": 0.3441, "step": 1767 }, { "epoch": 0.4256907241317041, "grad_norm": 2.2698490619659424, "learning_rate": 0.00012863882127341284, "loss": 0.9277, "step": 1768 }, { "epoch": 0.42593149942815867, "grad_norm": 5.280149936676025, "learning_rate": 0.00012856410413976285, "loss": 0.9697, "step": 1769 }, { "epoch": 0.4261722747246133, "grad_norm": 3.395625591278076, "learning_rate": 0.0001284893696390732, "loss": 1.0316, "step": 1770 }, { "epoch": 0.42641305002106783, "grad_norm": 1.7464591264724731, "learning_rate": 0.00012841461781678263, "loss": 0.7941, "step": 1771 }, { "epoch": 0.42665382531752244, "grad_norm": 2.47660493850708, "learning_rate": 0.00012833984871834042, "loss": 0.9419, "step": 1772 }, { "epoch": 0.426894600613977, "grad_norm": 1.8424837589263916, "learning_rate": 0.00012826506238920632, "loss": 0.6514, "step": 1773 }, { "epoch": 0.4271353759104316, "grad_norm": 11.50127124786377, "learning_rate": 0.00012819025887485062, "loss": 1.3198, "step": 1774 }, { "epoch": 0.42737615120688616, "grad_norm": 1.003143548965454, "learning_rate": 0.00012811543822075397, "loss": 0.2809, "step": 1775 }, { "epoch": 0.42761692650334077, "grad_norm": 1.2120084762573242, "learning_rate": 0.00012804060047240756, "loss": 0.4469, "step": 1776 }, { "epoch": 0.4278577017997953, "grad_norm": 2.660773515701294, "learning_rate": 0.0001279657456753129, "loss": 0.8044, "step": 1777 }, { "epoch": 0.42809847709624993, "grad_norm": 3.651428461074829, "learning_rate": 0.00012789087387498187, "loss": 0.7613, "step": 1778 }, { "epoch": 0.4283392523927045, "grad_norm": 1.7895033359527588, "learning_rate": 0.00012781598511693666, "loss": 0.5028, "step": 1779 }, { "epoch": 0.4285800276891591, "grad_norm": 2.3747005462646484, "learning_rate": 0.00012774107944670983, "loss": 0.6884, "step": 1780 }, { "epoch": 0.4288208029856137, "grad_norm": 0.8780110478401184, "learning_rate": 0.00012766615690984422, "loss": 0.5869, "step": 1781 }, { "epoch": 0.42906157828206826, "grad_norm": 1.754726529121399, "learning_rate": 0.00012759121755189282, "loss": 0.883, "step": 1782 }, { "epoch": 0.42930235357852287, "grad_norm": 1.48545241355896, "learning_rate": 0.00012751626141841902, "loss": 0.6704, "step": 1783 }, { "epoch": 0.4295431288749774, "grad_norm": 1.908327579498291, "learning_rate": 0.0001274412885549963, "loss": 0.5986, "step": 1784 }, { "epoch": 0.42978390417143203, "grad_norm": 2.356943130493164, "learning_rate": 0.0001273662990072083, "loss": 0.5414, "step": 1785 }, { "epoch": 0.4300246794678866, "grad_norm": 2.7557711601257324, "learning_rate": 0.00012729129282064886, "loss": 0.7579, "step": 1786 }, { "epoch": 0.4302654547643412, "grad_norm": 1.6613632440567017, "learning_rate": 0.00012721627004092184, "loss": 0.7389, "step": 1787 }, { "epoch": 0.43050623006079575, "grad_norm": 0.23793041706085205, "learning_rate": 0.00012714123071364138, "loss": 0.3544, "step": 1788 }, { "epoch": 0.43074700535725036, "grad_norm": 0.7207126617431641, "learning_rate": 0.0001270661748844315, "loss": 0.4286, "step": 1789 }, { "epoch": 0.4309877806537049, "grad_norm": 2.2661566734313965, "learning_rate": 0.00012699110259892625, "loss": 0.8774, "step": 1790 }, { "epoch": 0.4312285559501595, "grad_norm": 2.410264730453491, "learning_rate": 0.00012691601390276983, "loss": 0.6303, "step": 1791 }, { "epoch": 0.4314693312466141, "grad_norm": 3.2463836669921875, "learning_rate": 0.00012684090884161636, "loss": 0.4901, "step": 1792 }, { "epoch": 0.4317101065430687, "grad_norm": 2.540635824203491, "learning_rate": 0.0001267657874611298, "loss": 0.4825, "step": 1793 }, { "epoch": 0.43195088183952324, "grad_norm": 0.8397485613822937, "learning_rate": 0.00012669064980698418, "loss": 0.1515, "step": 1794 }, { "epoch": 0.43219165713597785, "grad_norm": 3.4554481506347656, "learning_rate": 0.00012661549592486327, "loss": 0.8663, "step": 1795 }, { "epoch": 0.43243243243243246, "grad_norm": 2.4448556900024414, "learning_rate": 0.00012654032586046097, "loss": 0.2905, "step": 1796 }, { "epoch": 0.432673207728887, "grad_norm": 2.5620980262756348, "learning_rate": 0.0001264651396594807, "loss": 0.4889, "step": 1797 }, { "epoch": 0.4329139830253416, "grad_norm": 1.970119833946228, "learning_rate": 0.0001263899373676359, "loss": 0.6237, "step": 1798 }, { "epoch": 0.4331547583217962, "grad_norm": 1.576965093612671, "learning_rate": 0.00012631471903064973, "loss": 0.4789, "step": 1799 }, { "epoch": 0.4333955336182508, "grad_norm": 2.5547585487365723, "learning_rate": 0.0001262394846942551, "loss": 0.2264, "step": 1800 }, { "epoch": 0.43363630891470534, "grad_norm": 1.3450043201446533, "learning_rate": 0.00012616423440419468, "loss": 0.6556, "step": 1801 }, { "epoch": 0.43387708421115995, "grad_norm": 2.3657472133636475, "learning_rate": 0.00012608896820622077, "loss": 0.5502, "step": 1802 }, { "epoch": 0.4341178595076145, "grad_norm": 0.8529106974601746, "learning_rate": 0.0001260136861460954, "loss": 0.5755, "step": 1803 }, { "epoch": 0.4343586348040691, "grad_norm": 1.913244366645813, "learning_rate": 0.00012593838826959023, "loss": 0.4943, "step": 1804 }, { "epoch": 0.43459941010052366, "grad_norm": 2.5146071910858154, "learning_rate": 0.0001258630746224866, "loss": 0.7254, "step": 1805 }, { "epoch": 0.4348401853969783, "grad_norm": 1.7178691625595093, "learning_rate": 0.00012578774525057532, "loss": 0.5247, "step": 1806 }, { "epoch": 0.4350809606934328, "grad_norm": 3.7382612228393555, "learning_rate": 0.0001257124001996568, "loss": 0.6197, "step": 1807 }, { "epoch": 0.43532173598988744, "grad_norm": 4.024393558502197, "learning_rate": 0.00012563703951554102, "loss": 0.6969, "step": 1808 }, { "epoch": 0.43556251128634205, "grad_norm": 2.9647786617279053, "learning_rate": 0.0001255616632440475, "loss": 0.4495, "step": 1809 }, { "epoch": 0.4358032865827966, "grad_norm": 3.270671844482422, "learning_rate": 0.0001254862714310051, "loss": 1.3434, "step": 1810 }, { "epoch": 0.4360440618792512, "grad_norm": 2.660315752029419, "learning_rate": 0.00012541086412225225, "loss": 0.3752, "step": 1811 }, { "epoch": 0.43628483717570576, "grad_norm": 3.405566453933716, "learning_rate": 0.00012533544136363677, "loss": 0.6865, "step": 1812 }, { "epoch": 0.4365256124721604, "grad_norm": 1.3535075187683105, "learning_rate": 0.00012526000320101584, "loss": 0.5975, "step": 1813 }, { "epoch": 0.4367663877686149, "grad_norm": 7.082382678985596, "learning_rate": 0.0001251845496802561, "loss": 0.9345, "step": 1814 }, { "epoch": 0.43700716306506954, "grad_norm": 3.6921160221099854, "learning_rate": 0.00012510908084723335, "loss": 0.7298, "step": 1815 }, { "epoch": 0.4372479383615241, "grad_norm": 1.1721895933151245, "learning_rate": 0.00012503359674783293, "loss": 0.7013, "step": 1816 }, { "epoch": 0.4374887136579787, "grad_norm": 3.0077133178710938, "learning_rate": 0.00012495809742794927, "loss": 1.094, "step": 1817 }, { "epoch": 0.43772948895443325, "grad_norm": 0.8102177381515503, "learning_rate": 0.00012488258293348614, "loss": 0.5695, "step": 1818 }, { "epoch": 0.43797026425088786, "grad_norm": 2.2918097972869873, "learning_rate": 0.0001248070533103565, "loss": 0.3564, "step": 1819 }, { "epoch": 0.4382110395473424, "grad_norm": 0.9323842525482178, "learning_rate": 0.0001247315086044826, "loss": 0.38, "step": 1820 }, { "epoch": 0.438451814843797, "grad_norm": 3.6484107971191406, "learning_rate": 0.0001246559488617957, "loss": 0.4016, "step": 1821 }, { "epoch": 0.43869259014025164, "grad_norm": 2.140214204788208, "learning_rate": 0.0001245803741282364, "loss": 0.4333, "step": 1822 }, { "epoch": 0.4389333654367062, "grad_norm": 2.356504201889038, "learning_rate": 0.00012450478444975423, "loss": 0.3219, "step": 1823 }, { "epoch": 0.4391741407331608, "grad_norm": 1.8598113059997559, "learning_rate": 0.0001244291798723079, "loss": 0.4902, "step": 1824 }, { "epoch": 0.43941491602961535, "grad_norm": 3.3434224128723145, "learning_rate": 0.00012435356044186512, "loss": 0.63, "step": 1825 }, { "epoch": 0.43965569132606996, "grad_norm": 0.9114461541175842, "learning_rate": 0.00012427792620440278, "loss": 0.1613, "step": 1826 }, { "epoch": 0.4398964666225245, "grad_norm": 3.5121147632598877, "learning_rate": 0.00012420227720590657, "loss": 0.9353, "step": 1827 }, { "epoch": 0.4401372419189791, "grad_norm": 1.0356240272521973, "learning_rate": 0.00012412661349237134, "loss": 0.2213, "step": 1828 }, { "epoch": 0.4403780172154337, "grad_norm": 2.9189321994781494, "learning_rate": 0.00012405093510980072, "loss": 0.8359, "step": 1829 }, { "epoch": 0.4406187925118883, "grad_norm": 2.3521268367767334, "learning_rate": 0.00012397524210420736, "loss": 0.6263, "step": 1830 }, { "epoch": 0.44085956780834284, "grad_norm": 1.810509443283081, "learning_rate": 0.0001238995345216128, "loss": 0.415, "step": 1831 }, { "epoch": 0.44110034310479745, "grad_norm": 1.4703214168548584, "learning_rate": 0.0001238238124080474, "loss": 1.0236, "step": 1832 }, { "epoch": 0.441341118401252, "grad_norm": 1.8066413402557373, "learning_rate": 0.0001237480758095504, "loss": 0.5183, "step": 1833 }, { "epoch": 0.4415818936977066, "grad_norm": 2.030515670776367, "learning_rate": 0.00012367232477216973, "loss": 0.963, "step": 1834 }, { "epoch": 0.44182266899416117, "grad_norm": 4.339605808258057, "learning_rate": 0.00012359655934196236, "loss": 0.9798, "step": 1835 }, { "epoch": 0.4420634442906158, "grad_norm": 1.818166971206665, "learning_rate": 0.00012352077956499365, "loss": 0.4265, "step": 1836 }, { "epoch": 0.4423042195870704, "grad_norm": 3.2408132553100586, "learning_rate": 0.00012344498548733806, "loss": 0.9419, "step": 1837 }, { "epoch": 0.44254499488352494, "grad_norm": 2.5602407455444336, "learning_rate": 0.0001233691771550784, "loss": 0.7254, "step": 1838 }, { "epoch": 0.44278577017997955, "grad_norm": 2.216360330581665, "learning_rate": 0.0001232933546143064, "loss": 0.9503, "step": 1839 }, { "epoch": 0.4430265454764341, "grad_norm": 1.1760109663009644, "learning_rate": 0.00012321751791112234, "loss": 0.6887, "step": 1840 }, { "epoch": 0.4432673207728887, "grad_norm": 3.627732276916504, "learning_rate": 0.00012314166709163508, "loss": 0.5434, "step": 1841 }, { "epoch": 0.44350809606934327, "grad_norm": 2.6786983013153076, "learning_rate": 0.00012306580220196206, "loss": 1.46, "step": 1842 }, { "epoch": 0.4437488713657979, "grad_norm": 1.4511840343475342, "learning_rate": 0.00012298992328822937, "loss": 0.739, "step": 1843 }, { "epoch": 0.44398964666225244, "grad_norm": 6.163101673126221, "learning_rate": 0.00012291403039657147, "loss": 0.6288, "step": 1844 }, { "epoch": 0.44423042195870704, "grad_norm": 3.6202635765075684, "learning_rate": 0.00012283812357313152, "loss": 0.3465, "step": 1845 }, { "epoch": 0.4444711972551616, "grad_norm": 2.3283517360687256, "learning_rate": 0.00012276220286406097, "loss": 0.9489, "step": 1846 }, { "epoch": 0.4447119725516162, "grad_norm": 0.9239123463630676, "learning_rate": 0.00012268626831551978, "loss": 0.2347, "step": 1847 }, { "epoch": 0.44495274784807076, "grad_norm": 2.289092779159546, "learning_rate": 0.00012261031997367632, "loss": 0.6748, "step": 1848 }, { "epoch": 0.44519352314452537, "grad_norm": 3.025836706161499, "learning_rate": 0.0001225343578847074, "loss": 0.9556, "step": 1849 }, { "epoch": 0.44543429844098, "grad_norm": 2.054135322570801, "learning_rate": 0.00012245838209479812, "loss": 0.6107, "step": 1850 }, { "epoch": 0.44567507373743454, "grad_norm": 2.0344197750091553, "learning_rate": 0.0001223823926501419, "loss": 0.4859, "step": 1851 }, { "epoch": 0.44591584903388914, "grad_norm": 1.1552016735076904, "learning_rate": 0.00012230638959694054, "loss": 0.4992, "step": 1852 }, { "epoch": 0.4461566243303437, "grad_norm": 4.617137908935547, "learning_rate": 0.00012223037298140406, "loss": 0.4169, "step": 1853 }, { "epoch": 0.4463973996267983, "grad_norm": 1.8090236186981201, "learning_rate": 0.00012215434284975073, "loss": 1.1123, "step": 1854 }, { "epoch": 0.44663817492325286, "grad_norm": 1.46204674243927, "learning_rate": 0.000122078299248207, "loss": 0.3101, "step": 1855 }, { "epoch": 0.44687895021970747, "grad_norm": 2.3522417545318604, "learning_rate": 0.00012200224222300758, "loss": 0.3873, "step": 1856 }, { "epoch": 0.447119725516162, "grad_norm": 2.6018474102020264, "learning_rate": 0.00012192617182039534, "loss": 1.0129, "step": 1857 }, { "epoch": 0.44736050081261663, "grad_norm": 4.971423149108887, "learning_rate": 0.00012185008808662124, "loss": 0.7512, "step": 1858 }, { "epoch": 0.4476012761090712, "grad_norm": 1.7388516664505005, "learning_rate": 0.00012177399106794433, "loss": 0.9286, "step": 1859 }, { "epoch": 0.4478420514055258, "grad_norm": 2.4302382469177246, "learning_rate": 0.0001216978808106318, "loss": 1.2512, "step": 1860 }, { "epoch": 0.44808282670198035, "grad_norm": 2.7931926250457764, "learning_rate": 0.00012162175736095887, "loss": 0.5571, "step": 1861 }, { "epoch": 0.44832360199843496, "grad_norm": 1.046998381614685, "learning_rate": 0.00012154562076520874, "loss": 0.5747, "step": 1862 }, { "epoch": 0.44856437729488957, "grad_norm": 1.4877816438674927, "learning_rate": 0.00012146947106967266, "loss": 0.9024, "step": 1863 }, { "epoch": 0.4488051525913441, "grad_norm": 0.7219827175140381, "learning_rate": 0.00012139330832064974, "loss": 0.3575, "step": 1864 }, { "epoch": 0.44904592788779873, "grad_norm": 2.1786413192749023, "learning_rate": 0.00012131713256444722, "loss": 0.5989, "step": 1865 }, { "epoch": 0.4492867031842533, "grad_norm": 3.176419973373413, "learning_rate": 0.00012124094384738005, "loss": 0.6001, "step": 1866 }, { "epoch": 0.4495274784807079, "grad_norm": 4.134557723999023, "learning_rate": 0.00012116474221577116, "loss": 0.3355, "step": 1867 }, { "epoch": 0.44976825377716245, "grad_norm": 1.623186707496643, "learning_rate": 0.00012108852771595129, "loss": 0.4517, "step": 1868 }, { "epoch": 0.45000902907361706, "grad_norm": 0.6475129723548889, "learning_rate": 0.00012101230039425911, "loss": 0.3038, "step": 1869 }, { "epoch": 0.4502498043700716, "grad_norm": 1.8964297771453857, "learning_rate": 0.00012093606029704094, "loss": 0.1228, "step": 1870 }, { "epoch": 0.4504905796665262, "grad_norm": 3.343824625015259, "learning_rate": 0.00012085980747065093, "loss": 0.7346, "step": 1871 }, { "epoch": 0.4507313549629808, "grad_norm": 2.4471538066864014, "learning_rate": 0.00012078354196145099, "loss": 0.7198, "step": 1872 }, { "epoch": 0.4509721302594354, "grad_norm": 1.736475944519043, "learning_rate": 0.00012070726381581068, "loss": 1.0944, "step": 1873 }, { "epoch": 0.45121290555588994, "grad_norm": 2.5887935161590576, "learning_rate": 0.00012063097308010734, "loss": 0.7277, "step": 1874 }, { "epoch": 0.45145368085234455, "grad_norm": 2.683844804763794, "learning_rate": 0.0001205546698007259, "loss": 0.263, "step": 1875 }, { "epoch": 0.4516944561487991, "grad_norm": 2.049633741378784, "learning_rate": 0.00012047835402405887, "loss": 0.7888, "step": 1876 }, { "epoch": 0.4519352314452537, "grad_norm": 1.6313300132751465, "learning_rate": 0.00012040202579650648, "loss": 0.5099, "step": 1877 }, { "epoch": 0.4521760067417083, "grad_norm": 2.1740105152130127, "learning_rate": 0.00012032568516447645, "loss": 0.5025, "step": 1878 }, { "epoch": 0.4524167820381629, "grad_norm": 0.8934720754623413, "learning_rate": 0.00012024933217438403, "loss": 0.3097, "step": 1879 }, { "epoch": 0.4526575573346175, "grad_norm": 3.051832675933838, "learning_rate": 0.00012017296687265201, "loss": 0.5882, "step": 1880 }, { "epoch": 0.45289833263107204, "grad_norm": 0.8491730093955994, "learning_rate": 0.00012009658930571069, "loss": 0.3899, "step": 1881 }, { "epoch": 0.45313910792752665, "grad_norm": 1.7300267219543457, "learning_rate": 0.0001200201995199978, "loss": 0.6787, "step": 1882 }, { "epoch": 0.4533798832239812, "grad_norm": 1.650277853012085, "learning_rate": 0.00011994379756195852, "loss": 0.6297, "step": 1883 }, { "epoch": 0.4536206585204358, "grad_norm": 1.1548956632614136, "learning_rate": 0.00011986738347804536, "loss": 0.519, "step": 1884 }, { "epoch": 0.45386143381689037, "grad_norm": 1.0197851657867432, "learning_rate": 0.0001197909573147183, "loss": 0.6041, "step": 1885 }, { "epoch": 0.454102209113345, "grad_norm": 1.3757448196411133, "learning_rate": 0.00011971451911844457, "loss": 0.6228, "step": 1886 }, { "epoch": 0.45434298440979953, "grad_norm": 3.988311767578125, "learning_rate": 0.00011963806893569885, "loss": 0.5685, "step": 1887 }, { "epoch": 0.45458375970625414, "grad_norm": 1.5994011163711548, "learning_rate": 0.00011956160681296293, "loss": 0.5188, "step": 1888 }, { "epoch": 0.4548245350027087, "grad_norm": 1.7165995836257935, "learning_rate": 0.00011948513279672602, "loss": 0.8819, "step": 1889 }, { "epoch": 0.4550653102991633, "grad_norm": 1.711625337600708, "learning_rate": 0.00011940864693348444, "loss": 0.6248, "step": 1890 }, { "epoch": 0.4553060855956179, "grad_norm": 3.749361038208008, "learning_rate": 0.00011933214926974183, "loss": 0.6371, "step": 1891 }, { "epoch": 0.45554686089207247, "grad_norm": 0.7839668393135071, "learning_rate": 0.00011925563985200887, "loss": 0.3796, "step": 1892 }, { "epoch": 0.4557876361885271, "grad_norm": 1.5670320987701416, "learning_rate": 0.00011917911872680354, "loss": 0.3806, "step": 1893 }, { "epoch": 0.45602841148498163, "grad_norm": 1.0422892570495605, "learning_rate": 0.00011910258594065078, "loss": 0.0708, "step": 1894 }, { "epoch": 0.45626918678143624, "grad_norm": 3.335632562637329, "learning_rate": 0.00011902604154008274, "loss": 0.5238, "step": 1895 }, { "epoch": 0.4565099620778908, "grad_norm": 4.482006072998047, "learning_rate": 0.00011894948557163859, "loss": 0.7926, "step": 1896 }, { "epoch": 0.4567507373743454, "grad_norm": 2.2744340896606445, "learning_rate": 0.00011887291808186452, "loss": 1.2551, "step": 1897 }, { "epoch": 0.45699151267079996, "grad_norm": 1.9892547130584717, "learning_rate": 0.00011879633911731372, "loss": 0.6706, "step": 1898 }, { "epoch": 0.45723228796725457, "grad_norm": 0.8157358169555664, "learning_rate": 0.00011871974872454639, "loss": 0.2129, "step": 1899 }, { "epoch": 0.4574730632637091, "grad_norm": 0.6796861886978149, "learning_rate": 0.00011864314695012963, "loss": 0.5986, "step": 1900 }, { "epoch": 0.45771383856016373, "grad_norm": 1.5171664953231812, "learning_rate": 0.00011856653384063756, "loss": 0.4835, "step": 1901 }, { "epoch": 0.4579546138566183, "grad_norm": 1.7098067998886108, "learning_rate": 0.00011848990944265111, "loss": 0.4977, "step": 1902 }, { "epoch": 0.4581953891530729, "grad_norm": 1.370509386062622, "learning_rate": 0.00011841327380275799, "loss": 0.7172, "step": 1903 }, { "epoch": 0.4584361644495275, "grad_norm": 3.1343603134155273, "learning_rate": 0.00011833662696755295, "loss": 0.6409, "step": 1904 }, { "epoch": 0.45867693974598206, "grad_norm": 2.920408010482788, "learning_rate": 0.00011825996898363741, "loss": 0.496, "step": 1905 }, { "epoch": 0.45891771504243667, "grad_norm": 1.1497353315353394, "learning_rate": 0.00011818329989761959, "loss": 0.4073, "step": 1906 }, { "epoch": 0.4591584903388912, "grad_norm": 1.3592454195022583, "learning_rate": 0.00011810661975611444, "loss": 0.7055, "step": 1907 }, { "epoch": 0.45939926563534583, "grad_norm": 4.271294593811035, "learning_rate": 0.0001180299286057437, "loss": 0.6676, "step": 1908 }, { "epoch": 0.4596400409318004, "grad_norm": 3.51015567779541, "learning_rate": 0.00011795322649313574, "loss": 0.7051, "step": 1909 }, { "epoch": 0.459880816228255, "grad_norm": 0.6392609477043152, "learning_rate": 0.00011787651346492561, "loss": 0.2079, "step": 1910 }, { "epoch": 0.46012159152470955, "grad_norm": 5.496769905090332, "learning_rate": 0.00011779978956775506, "loss": 0.6687, "step": 1911 }, { "epoch": 0.46036236682116416, "grad_norm": 2.1029446125030518, "learning_rate": 0.00011772305484827231, "loss": 1.0162, "step": 1912 }, { "epoch": 0.4606031421176187, "grad_norm": 3.9741029739379883, "learning_rate": 0.00011764630935313228, "loss": 1.0211, "step": 1913 }, { "epoch": 0.4608439174140733, "grad_norm": 3.1617109775543213, "learning_rate": 0.00011756955312899642, "loss": 1.1738, "step": 1914 }, { "epoch": 0.4610846927105279, "grad_norm": 0.6556163430213928, "learning_rate": 0.00011749278622253268, "loss": 0.037, "step": 1915 }, { "epoch": 0.4613254680069825, "grad_norm": 3.8767170906066895, "learning_rate": 0.00011741600868041549, "loss": 0.8335, "step": 1916 }, { "epoch": 0.46156624330343704, "grad_norm": 2.683124542236328, "learning_rate": 0.00011733922054932577, "loss": 1.3765, "step": 1917 }, { "epoch": 0.46180701859989165, "grad_norm": 1.8836538791656494, "learning_rate": 0.00011726242187595091, "loss": 0.2703, "step": 1918 }, { "epoch": 0.46204779389634626, "grad_norm": 1.551708698272705, "learning_rate": 0.00011718561270698467, "loss": 0.1608, "step": 1919 }, { "epoch": 0.4622885691928008, "grad_norm": 2.271167278289795, "learning_rate": 0.00011710879308912717, "loss": 0.5994, "step": 1920 }, { "epoch": 0.4625293444892554, "grad_norm": 2.433912992477417, "learning_rate": 0.0001170319630690849, "loss": 0.756, "step": 1921 }, { "epoch": 0.46277011978571, "grad_norm": 0.924586296081543, "learning_rate": 0.00011695512269357076, "loss": 0.655, "step": 1922 }, { "epoch": 0.4630108950821646, "grad_norm": 2.5200753211975098, "learning_rate": 0.00011687827200930381, "loss": 0.5851, "step": 1923 }, { "epoch": 0.46325167037861914, "grad_norm": 1.0272419452667236, "learning_rate": 0.00011680141106300943, "loss": 0.3672, "step": 1924 }, { "epoch": 0.46349244567507375, "grad_norm": 5.197723865509033, "learning_rate": 0.00011672453990141927, "loss": 0.8146, "step": 1925 }, { "epoch": 0.4637332209715283, "grad_norm": 1.0704439878463745, "learning_rate": 0.00011664765857127118, "loss": 0.1931, "step": 1926 }, { "epoch": 0.4639739962679829, "grad_norm": 2.9581477642059326, "learning_rate": 0.00011657076711930919, "loss": 0.9693, "step": 1927 }, { "epoch": 0.46421477156443747, "grad_norm": 3.3531832695007324, "learning_rate": 0.00011649386559228341, "loss": 0.6575, "step": 1928 }, { "epoch": 0.4644555468608921, "grad_norm": 1.1153233051300049, "learning_rate": 0.00011641695403695021, "loss": 0.3665, "step": 1929 }, { "epoch": 0.46469632215734663, "grad_norm": 1.21510910987854, "learning_rate": 0.000116340032500072, "loss": 0.4691, "step": 1930 }, { "epoch": 0.46493709745380124, "grad_norm": 1.5340150594711304, "learning_rate": 0.00011626310102841718, "loss": 0.5084, "step": 1931 }, { "epoch": 0.46517787275025585, "grad_norm": 1.7374811172485352, "learning_rate": 0.0001161861596687603, "loss": 0.5055, "step": 1932 }, { "epoch": 0.4654186480467104, "grad_norm": 1.4668828248977661, "learning_rate": 0.00011610920846788184, "loss": 0.7268, "step": 1933 }, { "epoch": 0.465659423343165, "grad_norm": 2.211509943008423, "learning_rate": 0.0001160322474725684, "loss": 1.0503, "step": 1934 }, { "epoch": 0.46590019863961957, "grad_norm": 2.9162306785583496, "learning_rate": 0.00011595527672961235, "loss": 0.987, "step": 1935 }, { "epoch": 0.4661409739360742, "grad_norm": 0.8210351467132568, "learning_rate": 0.00011587829628581213, "loss": 0.2187, "step": 1936 }, { "epoch": 0.46638174923252873, "grad_norm": 2.2448573112487793, "learning_rate": 0.00011580130618797193, "loss": 0.7065, "step": 1937 }, { "epoch": 0.46662252452898334, "grad_norm": 2.9925882816314697, "learning_rate": 0.000115724306482902, "loss": 0.9504, "step": 1938 }, { "epoch": 0.4668632998254379, "grad_norm": 4.324154376983643, "learning_rate": 0.00011564729721741829, "loss": 0.8914, "step": 1939 }, { "epoch": 0.4671040751218925, "grad_norm": 0.7890626192092896, "learning_rate": 0.00011557027843834265, "loss": 0.4613, "step": 1940 }, { "epoch": 0.46734485041834706, "grad_norm": 0.8351976275444031, "learning_rate": 0.00011549325019250261, "loss": 0.262, "step": 1941 }, { "epoch": 0.46758562571480167, "grad_norm": 5.956714153289795, "learning_rate": 0.00011541621252673153, "loss": 0.5128, "step": 1942 }, { "epoch": 0.4678264010112562, "grad_norm": 1.704748511314392, "learning_rate": 0.00011533916548786857, "loss": 0.5645, "step": 1943 }, { "epoch": 0.46806717630771083, "grad_norm": 2.2155847549438477, "learning_rate": 0.00011526210912275836, "loss": 0.7139, "step": 1944 }, { "epoch": 0.46830795160416544, "grad_norm": 3.7036075592041016, "learning_rate": 0.00011518504347825145, "loss": 0.6394, "step": 1945 }, { "epoch": 0.46854872690062, "grad_norm": 1.530531406402588, "learning_rate": 0.00011510796860120388, "loss": 0.8155, "step": 1946 }, { "epoch": 0.4687895021970746, "grad_norm": 3.814300298690796, "learning_rate": 0.00011503088453847739, "loss": 0.9626, "step": 1947 }, { "epoch": 0.46903027749352916, "grad_norm": 2.3494253158569336, "learning_rate": 0.00011495379133693922, "loss": 0.3687, "step": 1948 }, { "epoch": 0.46927105278998377, "grad_norm": 2.5800893306732178, "learning_rate": 0.00011487668904346221, "loss": 0.8505, "step": 1949 }, { "epoch": 0.4695118280864383, "grad_norm": 1.1892086267471313, "learning_rate": 0.00011479957770492476, "loss": 0.8398, "step": 1950 }, { "epoch": 0.46975260338289293, "grad_norm": 4.8080034255981445, "learning_rate": 0.00011472245736821072, "loss": 0.8072, "step": 1951 }, { "epoch": 0.4699933786793475, "grad_norm": 4.775472164154053, "learning_rate": 0.00011464532808020943, "loss": 0.8344, "step": 1952 }, { "epoch": 0.4702341539758021, "grad_norm": 4.2183966636657715, "learning_rate": 0.00011456818988781565, "loss": 0.2391, "step": 1953 }, { "epoch": 0.47047492927225665, "grad_norm": 0.8757205605506897, "learning_rate": 0.00011449104283792964, "loss": 0.6855, "step": 1954 }, { "epoch": 0.47071570456871126, "grad_norm": 4.9031524658203125, "learning_rate": 0.0001144138869774569, "loss": 0.7411, "step": 1955 }, { "epoch": 0.4709564798651658, "grad_norm": 3.9868388175964355, "learning_rate": 0.0001143367223533084, "loss": 1.3733, "step": 1956 }, { "epoch": 0.4711972551616204, "grad_norm": 2.6897597312927246, "learning_rate": 0.0001142595490124004, "loss": 1.1645, "step": 1957 }, { "epoch": 0.471438030458075, "grad_norm": 0.6126354932785034, "learning_rate": 0.00011418236700165452, "loss": 0.4618, "step": 1958 }, { "epoch": 0.4716788057545296, "grad_norm": 0.5356245040893555, "learning_rate": 0.00011410517636799751, "loss": 0.805, "step": 1959 }, { "epoch": 0.4719195810509842, "grad_norm": 0.8628101944923401, "learning_rate": 0.00011402797715836153, "loss": 0.291, "step": 1960 }, { "epoch": 0.47216035634743875, "grad_norm": 1.2963393926620483, "learning_rate": 0.00011395076941968379, "loss": 0.7377, "step": 1961 }, { "epoch": 0.47240113164389336, "grad_norm": 1.1663508415222168, "learning_rate": 0.00011387355319890685, "loss": 0.1149, "step": 1962 }, { "epoch": 0.4726419069403479, "grad_norm": 1.1222305297851562, "learning_rate": 0.00011379632854297828, "loss": 0.8273, "step": 1963 }, { "epoch": 0.4728826822368025, "grad_norm": 1.7846665382385254, "learning_rate": 0.00011371909549885087, "loss": 0.5701, "step": 1964 }, { "epoch": 0.4731234575332571, "grad_norm": 2.4753174781799316, "learning_rate": 0.00011364185411348247, "loss": 0.6405, "step": 1965 }, { "epoch": 0.4733642328297117, "grad_norm": 2.678506374359131, "learning_rate": 0.00011356460443383607, "loss": 0.5651, "step": 1966 }, { "epoch": 0.47360500812616624, "grad_norm": 2.547746181488037, "learning_rate": 0.00011348734650687962, "loss": 0.4664, "step": 1967 }, { "epoch": 0.47384578342262085, "grad_norm": 2.901313304901123, "learning_rate": 0.00011341008037958607, "loss": 0.2748, "step": 1968 }, { "epoch": 0.4740865587190754, "grad_norm": 2.7114925384521484, "learning_rate": 0.00011333280609893344, "loss": 0.7454, "step": 1969 }, { "epoch": 0.47432733401553, "grad_norm": 2.3827106952667236, "learning_rate": 0.0001132555237119047, "loss": 1.0865, "step": 1970 }, { "epoch": 0.47456810931198457, "grad_norm": 1.0428249835968018, "learning_rate": 0.00011317823326548765, "loss": 0.6484, "step": 1971 }, { "epoch": 0.4748088846084392, "grad_norm": 2.74362850189209, "learning_rate": 0.00011310093480667507, "loss": 0.4957, "step": 1972 }, { "epoch": 0.4750496599048938, "grad_norm": 2.0484142303466797, "learning_rate": 0.00011302362838246463, "loss": 0.5256, "step": 1973 }, { "epoch": 0.47529043520134834, "grad_norm": 2.8634374141693115, "learning_rate": 0.0001129463140398588, "loss": 0.4557, "step": 1974 }, { "epoch": 0.47553121049780295, "grad_norm": 2.5055246353149414, "learning_rate": 0.00011286899182586485, "loss": 0.9222, "step": 1975 }, { "epoch": 0.4757719857942575, "grad_norm": 1.0836631059646606, "learning_rate": 0.00011279166178749489, "loss": 0.3692, "step": 1976 }, { "epoch": 0.4760127610907121, "grad_norm": 2.4149179458618164, "learning_rate": 0.0001127143239717657, "loss": 0.675, "step": 1977 }, { "epoch": 0.47625353638716666, "grad_norm": 1.899614930152893, "learning_rate": 0.00011263697842569894, "loss": 0.988, "step": 1978 }, { "epoch": 0.4764943116836213, "grad_norm": 3.755749464035034, "learning_rate": 0.00011255962519632081, "loss": 0.8779, "step": 1979 }, { "epoch": 0.47673508698007583, "grad_norm": 2.487436056137085, "learning_rate": 0.0001124822643306623, "loss": 0.4025, "step": 1980 }, { "epoch": 0.47697586227653044, "grad_norm": 0.9542964100837708, "learning_rate": 0.00011240489587575889, "loss": 0.5613, "step": 1981 }, { "epoch": 0.477216637572985, "grad_norm": 1.5301231145858765, "learning_rate": 0.00011232751987865084, "loss": 0.2873, "step": 1982 }, { "epoch": 0.4774574128694396, "grad_norm": 1.3803631067276, "learning_rate": 0.00011225013638638297, "loss": 0.2015, "step": 1983 }, { "epoch": 0.47769818816589416, "grad_norm": 2.5215346813201904, "learning_rate": 0.00011217274544600458, "loss": 0.7079, "step": 1984 }, { "epoch": 0.47793896346234876, "grad_norm": 0.8680809736251831, "learning_rate": 0.00011209534710456951, "loss": 0.8823, "step": 1985 }, { "epoch": 0.4781797387588034, "grad_norm": 1.0122793912887573, "learning_rate": 0.00011201794140913613, "loss": 0.3876, "step": 1986 }, { "epoch": 0.47842051405525793, "grad_norm": 2.795023202896118, "learning_rate": 0.00011194052840676735, "loss": 0.9525, "step": 1987 }, { "epoch": 0.47866128935171254, "grad_norm": 4.1220784187316895, "learning_rate": 0.00011186310814453035, "loss": 0.8727, "step": 1988 }, { "epoch": 0.4789020646481671, "grad_norm": 0.5881559252738953, "learning_rate": 0.00011178568066949688, "loss": 0.1987, "step": 1989 }, { "epoch": 0.4791428399446217, "grad_norm": 2.1437673568725586, "learning_rate": 0.00011170824602874301, "loss": 0.7281, "step": 1990 }, { "epoch": 0.47938361524107626, "grad_norm": 2.513075113296509, "learning_rate": 0.0001116308042693492, "loss": 0.6271, "step": 1991 }, { "epoch": 0.47962439053753086, "grad_norm": 2.7916321754455566, "learning_rate": 0.00011155335543840017, "loss": 0.4875, "step": 1992 }, { "epoch": 0.4798651658339854, "grad_norm": 1.7488362789154053, "learning_rate": 0.000111475899582985, "loss": 0.4013, "step": 1993 }, { "epoch": 0.48010594113044003, "grad_norm": 8.35679817199707, "learning_rate": 0.00011139843675019704, "loss": 0.7598, "step": 1994 }, { "epoch": 0.4803467164268946, "grad_norm": 1.7272447347640991, "learning_rate": 0.00011132096698713385, "loss": 0.4311, "step": 1995 }, { "epoch": 0.4805874917233492, "grad_norm": 2.1946487426757812, "learning_rate": 0.00011124349034089723, "loss": 0.6132, "step": 1996 }, { "epoch": 0.48082826701980375, "grad_norm": 1.6528022289276123, "learning_rate": 0.00011116600685859313, "loss": 0.738, "step": 1997 }, { "epoch": 0.48106904231625836, "grad_norm": 2.6232638359069824, "learning_rate": 0.0001110885165873317, "loss": 0.7694, "step": 1998 }, { "epoch": 0.4813098176127129, "grad_norm": 1.2000987529754639, "learning_rate": 0.00011101101957422723, "loss": 0.4693, "step": 1999 }, { "epoch": 0.4815505929091675, "grad_norm": 1.4360319375991821, "learning_rate": 0.00011093351586639806, "loss": 0.7783, "step": 2000 }, { "epoch": 0.48179136820562213, "grad_norm": 1.4586645364761353, "learning_rate": 0.00011085600551096657, "loss": 0.7863, "step": 2001 }, { "epoch": 0.4820321435020767, "grad_norm": 1.3981388807296753, "learning_rate": 0.0001107784885550593, "loss": 0.7304, "step": 2002 }, { "epoch": 0.4822729187985313, "grad_norm": 2.226198196411133, "learning_rate": 0.00011070096504580669, "loss": 0.5331, "step": 2003 }, { "epoch": 0.48251369409498585, "grad_norm": 1.673223614692688, "learning_rate": 0.00011062343503034325, "loss": 0.5965, "step": 2004 }, { "epoch": 0.48275446939144045, "grad_norm": 3.5956525802612305, "learning_rate": 0.00011054589855580732, "loss": 0.9231, "step": 2005 }, { "epoch": 0.482995244687895, "grad_norm": 2.030714273452759, "learning_rate": 0.00011046835566934138, "loss": 0.71, "step": 2006 }, { "epoch": 0.4832360199843496, "grad_norm": 6.168741226196289, "learning_rate": 0.00011039080641809154, "loss": 0.9501, "step": 2007 }, { "epoch": 0.4834767952808042, "grad_norm": 2.15983510017395, "learning_rate": 0.00011031325084920802, "loss": 1.0474, "step": 2008 }, { "epoch": 0.4837175705772588, "grad_norm": 3.2638587951660156, "learning_rate": 0.00011023568900984473, "loss": 0.4585, "step": 2009 }, { "epoch": 0.48395834587371334, "grad_norm": 0.8049036264419556, "learning_rate": 0.0001101581209471595, "loss": 0.3537, "step": 2010 }, { "epoch": 0.48419912117016795, "grad_norm": 2.6150450706481934, "learning_rate": 0.00011008054670831381, "loss": 0.4149, "step": 2011 }, { "epoch": 0.4844398964666225, "grad_norm": 1.5464622974395752, "learning_rate": 0.00011000296634047302, "loss": 0.1812, "step": 2012 }, { "epoch": 0.4846806717630771, "grad_norm": 1.843767762184143, "learning_rate": 0.00010992537989080618, "loss": 0.3838, "step": 2013 }, { "epoch": 0.4849214470595317, "grad_norm": 1.0260145664215088, "learning_rate": 0.00010984778740648598, "loss": 0.2033, "step": 2014 }, { "epoch": 0.4851622223559863, "grad_norm": 1.7894840240478516, "learning_rate": 0.00010977018893468884, "loss": 0.5762, "step": 2015 }, { "epoch": 0.4854029976524409, "grad_norm": 2.454301118850708, "learning_rate": 0.00010969258452259483, "loss": 0.8953, "step": 2016 }, { "epoch": 0.48564377294889544, "grad_norm": 0.5999788045883179, "learning_rate": 0.0001096149742173876, "loss": 0.4977, "step": 2017 }, { "epoch": 0.48588454824535005, "grad_norm": 2.6491451263427734, "learning_rate": 0.00010953735806625439, "loss": 1.0362, "step": 2018 }, { "epoch": 0.4861253235418046, "grad_norm": 2.1559669971466064, "learning_rate": 0.00010945973611638596, "loss": 0.4835, "step": 2019 }, { "epoch": 0.4863660988382592, "grad_norm": 4.336763381958008, "learning_rate": 0.00010938210841497667, "loss": 1.0278, "step": 2020 }, { "epoch": 0.48660687413471376, "grad_norm": 0.9082402586936951, "learning_rate": 0.00010930447500922433, "loss": 0.7064, "step": 2021 }, { "epoch": 0.48684764943116837, "grad_norm": 11.172735214233398, "learning_rate": 0.00010922683594633021, "loss": 0.9112, "step": 2022 }, { "epoch": 0.4870884247276229, "grad_norm": 1.7960487604141235, "learning_rate": 0.00010914919127349906, "loss": 0.4387, "step": 2023 }, { "epoch": 0.48732920002407754, "grad_norm": 2.4477851390838623, "learning_rate": 0.00010907154103793899, "loss": 0.2548, "step": 2024 }, { "epoch": 0.4875699753205321, "grad_norm": 1.2202852964401245, "learning_rate": 0.00010899388528686154, "loss": 0.4231, "step": 2025 }, { "epoch": 0.4878107506169867, "grad_norm": 2.1632204055786133, "learning_rate": 0.00010891622406748157, "loss": 0.5211, "step": 2026 }, { "epoch": 0.4880515259134413, "grad_norm": 2.416361093521118, "learning_rate": 0.00010883855742701727, "loss": 0.5395, "step": 2027 }, { "epoch": 0.48829230120989586, "grad_norm": 3.4709837436676025, "learning_rate": 0.00010876088541269014, "loss": 0.959, "step": 2028 }, { "epoch": 0.48853307650635047, "grad_norm": 4.083737373352051, "learning_rate": 0.00010868320807172496, "loss": 0.4737, "step": 2029 }, { "epoch": 0.488773851802805, "grad_norm": 2.2041704654693604, "learning_rate": 0.0001086055254513497, "loss": 0.7522, "step": 2030 }, { "epoch": 0.48901462709925964, "grad_norm": 1.5947551727294922, "learning_rate": 0.00010852783759879557, "loss": 0.1179, "step": 2031 }, { "epoch": 0.4892554023957142, "grad_norm": 2.6516928672790527, "learning_rate": 0.00010845014456129698, "loss": 0.9625, "step": 2032 }, { "epoch": 0.4894961776921688, "grad_norm": 3.6693668365478516, "learning_rate": 0.00010837244638609145, "loss": 0.4759, "step": 2033 }, { "epoch": 0.48973695298862335, "grad_norm": 5.227980613708496, "learning_rate": 0.00010829474312041963, "loss": 0.66, "step": 2034 }, { "epoch": 0.48997772828507796, "grad_norm": 1.135461688041687, "learning_rate": 0.00010821703481152534, "loss": 0.3187, "step": 2035 }, { "epoch": 0.4902185035815325, "grad_norm": 0.8220135569572449, "learning_rate": 0.00010813932150665538, "loss": 0.4416, "step": 2036 }, { "epoch": 0.4904592788779871, "grad_norm": 2.479522943496704, "learning_rate": 0.00010806160325305956, "loss": 0.6935, "step": 2037 }, { "epoch": 0.4907000541744417, "grad_norm": 3.262054920196533, "learning_rate": 0.00010798388009799084, "loss": 1.5196, "step": 2038 }, { "epoch": 0.4909408294708963, "grad_norm": 3.85654354095459, "learning_rate": 0.000107906152088705, "loss": 0.3237, "step": 2039 }, { "epoch": 0.49118160476735084, "grad_norm": 2.709144353866577, "learning_rate": 0.0001078284192724609, "loss": 0.806, "step": 2040 }, { "epoch": 0.49142238006380545, "grad_norm": 4.338006019592285, "learning_rate": 0.00010775068169652023, "loss": 0.6148, "step": 2041 }, { "epoch": 0.49166315536026006, "grad_norm": 1.0794256925582886, "learning_rate": 0.00010767293940814762, "loss": 0.3614, "step": 2042 }, { "epoch": 0.4919039306567146, "grad_norm": 0.8536688089370728, "learning_rate": 0.0001075951924546106, "loss": 0.3124, "step": 2043 }, { "epoch": 0.4921447059531692, "grad_norm": 2.3540027141571045, "learning_rate": 0.00010751744088317943, "loss": 0.8265, "step": 2044 }, { "epoch": 0.4923854812496238, "grad_norm": 1.918283462524414, "learning_rate": 0.00010743968474112728, "loss": 0.8652, "step": 2045 }, { "epoch": 0.4926262565460784, "grad_norm": 2.034250497817993, "learning_rate": 0.00010736192407573, "loss": 0.8617, "step": 2046 }, { "epoch": 0.49286703184253294, "grad_norm": 3.235872268676758, "learning_rate": 0.00010728415893426635, "loss": 0.4167, "step": 2047 }, { "epoch": 0.49310780713898755, "grad_norm": 1.3588740825653076, "learning_rate": 0.00010720638936401766, "loss": 0.8502, "step": 2048 }, { "epoch": 0.4933485824354421, "grad_norm": 2.3015613555908203, "learning_rate": 0.00010712861541226797, "loss": 0.8856, "step": 2049 }, { "epoch": 0.4935893577318967, "grad_norm": 3.480872631072998, "learning_rate": 0.00010705083712630401, "loss": 0.9697, "step": 2050 }, { "epoch": 0.49383013302835127, "grad_norm": 8.409546852111816, "learning_rate": 0.00010697305455341526, "loss": 0.3575, "step": 2051 }, { "epoch": 0.4940709083248059, "grad_norm": 1.3223494291305542, "learning_rate": 0.00010689526774089362, "loss": 0.3494, "step": 2052 }, { "epoch": 0.49431168362126043, "grad_norm": 1.0988234281539917, "learning_rate": 0.00010681747673603366, "loss": 0.2256, "step": 2053 }, { "epoch": 0.49455245891771504, "grad_norm": 1.517215609550476, "learning_rate": 0.00010673968158613243, "loss": 0.5634, "step": 2054 }, { "epoch": 0.49479323421416965, "grad_norm": 3.4470624923706055, "learning_rate": 0.00010666188233848967, "loss": 0.5364, "step": 2055 }, { "epoch": 0.4950340095106242, "grad_norm": 2.27813720703125, "learning_rate": 0.00010658407904040743, "loss": 0.7642, "step": 2056 }, { "epoch": 0.4952747848070788, "grad_norm": 0.9174807667732239, "learning_rate": 0.0001065062717391903, "loss": 0.4644, "step": 2057 }, { "epoch": 0.49551556010353337, "grad_norm": 1.2668373584747314, "learning_rate": 0.00010642846048214527, "loss": 0.3014, "step": 2058 }, { "epoch": 0.495756335399988, "grad_norm": 3.347287893295288, "learning_rate": 0.00010635064531658178, "loss": 0.283, "step": 2059 }, { "epoch": 0.49599711069644253, "grad_norm": 0.6961964964866638, "learning_rate": 0.00010627282628981165, "loss": 0.2452, "step": 2060 }, { "epoch": 0.49623788599289714, "grad_norm": 4.018993377685547, "learning_rate": 0.00010619500344914902, "loss": 0.7302, "step": 2061 }, { "epoch": 0.4964786612893517, "grad_norm": 1.290248990058899, "learning_rate": 0.0001061171768419103, "loss": 0.6912, "step": 2062 }, { "epoch": 0.4967194365858063, "grad_norm": 2.4808475971221924, "learning_rate": 0.00010603934651541427, "loss": 0.2996, "step": 2063 }, { "epoch": 0.49696021188226086, "grad_norm": 2.5349011421203613, "learning_rate": 0.00010596151251698199, "loss": 0.5265, "step": 2064 }, { "epoch": 0.49720098717871547, "grad_norm": 2.832211494445801, "learning_rate": 0.00010588367489393666, "loss": 0.9041, "step": 2065 }, { "epoch": 0.49744176247517, "grad_norm": 1.3861429691314697, "learning_rate": 0.00010580583369360373, "loss": 0.5222, "step": 2066 }, { "epoch": 0.49768253777162463, "grad_norm": 1.224226951599121, "learning_rate": 0.00010572798896331082, "loss": 0.5713, "step": 2067 }, { "epoch": 0.4979233130680792, "grad_norm": 2.4965927600860596, "learning_rate": 0.00010565014075038775, "loss": 0.3679, "step": 2068 }, { "epoch": 0.4981640883645338, "grad_norm": 2.0286030769348145, "learning_rate": 0.00010557228910216637, "loss": 0.3128, "step": 2069 }, { "epoch": 0.4984048636609884, "grad_norm": 1.7408385276794434, "learning_rate": 0.00010549443406598063, "loss": 0.6847, "step": 2070 }, { "epoch": 0.49864563895744296, "grad_norm": 2.918757915496826, "learning_rate": 0.00010541657568916661, "loss": 0.4012, "step": 2071 }, { "epoch": 0.49888641425389757, "grad_norm": 0.9126492142677307, "learning_rate": 0.00010533871401906237, "loss": 0.3021, "step": 2072 }, { "epoch": 0.4991271895503521, "grad_norm": 1.8159611225128174, "learning_rate": 0.00010526084910300798, "loss": 0.5893, "step": 2073 }, { "epoch": 0.49936796484680673, "grad_norm": 1.3606966733932495, "learning_rate": 0.00010518298098834547, "loss": 0.4645, "step": 2074 }, { "epoch": 0.4996087401432613, "grad_norm": 3.8433918952941895, "learning_rate": 0.00010510510972241887, "loss": 0.3448, "step": 2075 }, { "epoch": 0.4998495154397159, "grad_norm": 2.995986223220825, "learning_rate": 0.00010502723535257401, "loss": 0.6148, "step": 2076 }, { "epoch": 0.5000902907361705, "grad_norm": 2.552739381790161, "learning_rate": 0.00010494935792615879, "loss": 0.5938, "step": 2077 }, { "epoch": 0.500331066032625, "grad_norm": 2.2203798294067383, "learning_rate": 0.00010487147749052275, "loss": 0.8364, "step": 2078 }, { "epoch": 0.5005718413290796, "grad_norm": 1.519313097000122, "learning_rate": 0.00010479359409301745, "loss": 0.6105, "step": 2079 }, { "epoch": 0.5008126166255342, "grad_norm": 1.471633791923523, "learning_rate": 0.00010471570778099611, "loss": 0.8271, "step": 2080 }, { "epoch": 0.5010533919219888, "grad_norm": 3.146540880203247, "learning_rate": 0.00010463781860181385, "loss": 0.6785, "step": 2081 }, { "epoch": 0.5012941672184434, "grad_norm": 2.4692275524139404, "learning_rate": 0.00010455992660282741, "loss": 0.8448, "step": 2082 }, { "epoch": 0.5015349425148979, "grad_norm": 2.2308695316314697, "learning_rate": 0.00010448203183139533, "loss": 0.72, "step": 2083 }, { "epoch": 0.5017757178113526, "grad_norm": 0.7109373807907104, "learning_rate": 0.00010440413433487781, "loss": 0.1728, "step": 2084 }, { "epoch": 0.5020164931078072, "grad_norm": 5.810349464416504, "learning_rate": 0.00010432623416063667, "loss": 1.3146, "step": 2085 }, { "epoch": 0.5022572684042618, "grad_norm": 1.7806396484375, "learning_rate": 0.0001042483313560354, "loss": 0.224, "step": 2086 }, { "epoch": 0.5024980437007163, "grad_norm": 4.550583362579346, "learning_rate": 0.00010417042596843914, "loss": 0.5014, "step": 2087 }, { "epoch": 0.5027388189971709, "grad_norm": 0.9690256118774414, "learning_rate": 0.00010409251804521447, "loss": 0.2506, "step": 2088 }, { "epoch": 0.5029795942936255, "grad_norm": 1.3459006547927856, "learning_rate": 0.00010401460763372961, "loss": 0.212, "step": 2089 }, { "epoch": 0.5032203695900801, "grad_norm": 1.2357487678527832, "learning_rate": 0.00010393669478135426, "loss": 0.6829, "step": 2090 }, { "epoch": 0.5034611448865346, "grad_norm": 0.7511969804763794, "learning_rate": 0.00010385877953545961, "loss": 0.737, "step": 2091 }, { "epoch": 0.5037019201829892, "grad_norm": 1.3373340368270874, "learning_rate": 0.00010378086194341832, "loss": 0.9976, "step": 2092 }, { "epoch": 0.5039426954794438, "grad_norm": 2.1753182411193848, "learning_rate": 0.00010370294205260443, "loss": 0.3736, "step": 2093 }, { "epoch": 0.5041834707758984, "grad_norm": 0.6808569431304932, "learning_rate": 0.00010362501991039347, "loss": 0.4928, "step": 2094 }, { "epoch": 0.5044242460723529, "grad_norm": 5.135721683502197, "learning_rate": 0.00010354709556416218, "loss": 0.6557, "step": 2095 }, { "epoch": 0.5046650213688075, "grad_norm": 3.574115037918091, "learning_rate": 0.00010346916906128883, "loss": 0.6108, "step": 2096 }, { "epoch": 0.5049057966652621, "grad_norm": 5.21065092086792, "learning_rate": 0.0001033912404491529, "loss": 0.5354, "step": 2097 }, { "epoch": 0.5051465719617168, "grad_norm": 4.044327259063721, "learning_rate": 0.00010331330977513509, "loss": 0.4002, "step": 2098 }, { "epoch": 0.5053873472581714, "grad_norm": 1.0108164548873901, "learning_rate": 0.00010323537708661748, "loss": 0.5534, "step": 2099 }, { "epoch": 0.5056281225546259, "grad_norm": 0.433327317237854, "learning_rate": 0.00010315744243098333, "loss": 0.4697, "step": 2100 }, { "epoch": 0.5058688978510805, "grad_norm": 1.2929291725158691, "learning_rate": 0.00010307950585561706, "loss": 0.6741, "step": 2101 }, { "epoch": 0.5061096731475351, "grad_norm": 1.6541675329208374, "learning_rate": 0.00010300156740790427, "loss": 0.3582, "step": 2102 }, { "epoch": 0.5063504484439897, "grad_norm": 2.3018059730529785, "learning_rate": 0.00010292362713523176, "loss": 1.1002, "step": 2103 }, { "epoch": 0.5065912237404442, "grad_norm": 1.3195204734802246, "learning_rate": 0.00010284568508498735, "loss": 0.4559, "step": 2104 }, { "epoch": 0.5068319990368988, "grad_norm": 1.7798513174057007, "learning_rate": 0.00010276774130456001, "loss": 0.3002, "step": 2105 }, { "epoch": 0.5070727743333534, "grad_norm": 1.1935960054397583, "learning_rate": 0.00010268979584133971, "loss": 0.5571, "step": 2106 }, { "epoch": 0.507313549629808, "grad_norm": 2.004664421081543, "learning_rate": 0.00010261184874271748, "loss": 0.5307, "step": 2107 }, { "epoch": 0.5075543249262625, "grad_norm": 1.2251675128936768, "learning_rate": 0.00010253390005608534, "loss": 0.4798, "step": 2108 }, { "epoch": 0.5077951002227171, "grad_norm": 1.0275200605392456, "learning_rate": 0.00010245594982883626, "loss": 0.8242, "step": 2109 }, { "epoch": 0.5080358755191717, "grad_norm": 0.9734987616539001, "learning_rate": 0.00010237799810836413, "loss": 0.5406, "step": 2110 }, { "epoch": 0.5082766508156263, "grad_norm": 2.428023099899292, "learning_rate": 0.0001023000449420638, "loss": 0.3063, "step": 2111 }, { "epoch": 0.508517426112081, "grad_norm": 2.5370419025421143, "learning_rate": 0.00010222209037733097, "loss": 0.7001, "step": 2112 }, { "epoch": 0.5087582014085354, "grad_norm": 8.222167015075684, "learning_rate": 0.0001021441344615622, "loss": 1.3225, "step": 2113 }, { "epoch": 0.5089989767049901, "grad_norm": 8.197820663452148, "learning_rate": 0.00010206617724215481, "loss": 0.1596, "step": 2114 }, { "epoch": 0.5092397520014447, "grad_norm": 4.041478157043457, "learning_rate": 0.00010198821876650701, "loss": 0.4862, "step": 2115 }, { "epoch": 0.5094805272978993, "grad_norm": 1.4023808240890503, "learning_rate": 0.00010191025908201774, "loss": 0.337, "step": 2116 }, { "epoch": 0.5097213025943538, "grad_norm": 0.8638352751731873, "learning_rate": 0.00010183229823608665, "loss": 0.3498, "step": 2117 }, { "epoch": 0.5099620778908084, "grad_norm": 3.4716315269470215, "learning_rate": 0.00010175433627611408, "loss": 1.0205, "step": 2118 }, { "epoch": 0.510202853187263, "grad_norm": 9.713912010192871, "learning_rate": 0.0001016763732495011, "loss": 0.7581, "step": 2119 }, { "epoch": 0.5104436284837176, "grad_norm": 3.348017930984497, "learning_rate": 0.00010159840920364943, "loss": 0.1819, "step": 2120 }, { "epoch": 0.5106844037801721, "grad_norm": 9.675308227539062, "learning_rate": 0.00010152044418596136, "loss": 0.5749, "step": 2121 }, { "epoch": 0.5109251790766267, "grad_norm": 0.7371659278869629, "learning_rate": 0.00010144247824383979, "loss": 0.2887, "step": 2122 }, { "epoch": 0.5111659543730813, "grad_norm": 1.746598720550537, "learning_rate": 0.00010136451142468819, "loss": 0.9139, "step": 2123 }, { "epoch": 0.5114067296695359, "grad_norm": 0.3207070827484131, "learning_rate": 0.00010128654377591056, "loss": 0.3856, "step": 2124 }, { "epoch": 0.5116475049659905, "grad_norm": 1.747492790222168, "learning_rate": 0.00010120857534491144, "loss": 0.4888, "step": 2125 }, { "epoch": 0.511888280262445, "grad_norm": 1.8366111516952515, "learning_rate": 0.0001011306061790958, "loss": 0.8371, "step": 2126 }, { "epoch": 0.5121290555588996, "grad_norm": 2.3959193229675293, "learning_rate": 0.00010105263632586904, "loss": 1.0204, "step": 2127 }, { "epoch": 0.5123698308553543, "grad_norm": 1.2648195028305054, "learning_rate": 0.00010097466583263699, "loss": 0.3782, "step": 2128 }, { "epoch": 0.5126106061518089, "grad_norm": 3.5460050106048584, "learning_rate": 0.00010089669474680596, "loss": 0.697, "step": 2129 }, { "epoch": 0.5128513814482634, "grad_norm": 0.989863932132721, "learning_rate": 0.00010081872311578249, "loss": 0.217, "step": 2130 }, { "epoch": 0.513092156744718, "grad_norm": 5.3702921867370605, "learning_rate": 0.00010074075098697351, "loss": 0.7093, "step": 2131 }, { "epoch": 0.5133329320411726, "grad_norm": 8.320046424865723, "learning_rate": 0.00010066277840778626, "loss": 0.8629, "step": 2132 }, { "epoch": 0.5135737073376272, "grad_norm": 3.336007833480835, "learning_rate": 0.00010058480542562828, "loss": 1.1258, "step": 2133 }, { "epoch": 0.5138144826340817, "grad_norm": 0.6159772276878357, "learning_rate": 0.00010050683208790726, "loss": 0.3306, "step": 2134 }, { "epoch": 0.5140552579305363, "grad_norm": 1.654181957244873, "learning_rate": 0.00010042885844203119, "loss": 0.7766, "step": 2135 }, { "epoch": 0.5142960332269909, "grad_norm": 1.8773746490478516, "learning_rate": 0.00010035088453540822, "loss": 0.2017, "step": 2136 }, { "epoch": 0.5145368085234455, "grad_norm": 1.3991271257400513, "learning_rate": 0.00010027291041544664, "loss": 0.643, "step": 2137 }, { "epoch": 0.5147775838199001, "grad_norm": 2.1096439361572266, "learning_rate": 0.00010019493612955495, "loss": 0.6112, "step": 2138 }, { "epoch": 0.5150183591163546, "grad_norm": 2.802321195602417, "learning_rate": 0.00010011696172514162, "loss": 0.7492, "step": 2139 }, { "epoch": 0.5152591344128092, "grad_norm": 2.361962080001831, "learning_rate": 0.00010003898724961533, "loss": 0.2983, "step": 2140 }, { "epoch": 0.5154999097092638, "grad_norm": 2.6102824211120605, "learning_rate": 9.99610127503847e-05, "loss": 0.8425, "step": 2141 }, { "epoch": 0.5157406850057185, "grad_norm": 0.7321549654006958, "learning_rate": 9.988303827485839e-05, "loss": 0.2544, "step": 2142 }, { "epoch": 0.515981460302173, "grad_norm": 3.4591763019561768, "learning_rate": 9.980506387044508e-05, "loss": 0.6845, "step": 2143 }, { "epoch": 0.5162222355986276, "grad_norm": 6.815724849700928, "learning_rate": 9.972708958455337e-05, "loss": 0.6039, "step": 2144 }, { "epoch": 0.5164630108950822, "grad_norm": 3.7558867931365967, "learning_rate": 9.964911546459181e-05, "loss": 0.6514, "step": 2145 }, { "epoch": 0.5167037861915368, "grad_norm": 1.1329708099365234, "learning_rate": 9.957114155796884e-05, "loss": 1.0924, "step": 2146 }, { "epoch": 0.5169445614879913, "grad_norm": 2.772102117538452, "learning_rate": 9.949316791209275e-05, "loss": 0.3061, "step": 2147 }, { "epoch": 0.5171853367844459, "grad_norm": 1.8187817335128784, "learning_rate": 9.941519457437173e-05, "loss": 0.4169, "step": 2148 }, { "epoch": 0.5174261120809005, "grad_norm": 0.46912047266960144, "learning_rate": 9.933722159221376e-05, "loss": 0.336, "step": 2149 }, { "epoch": 0.5176668873773551, "grad_norm": 1.7679054737091064, "learning_rate": 9.925924901302651e-05, "loss": 0.4573, "step": 2150 }, { "epoch": 0.5179076626738097, "grad_norm": 1.68385648727417, "learning_rate": 9.918127688421755e-05, "loss": 0.529, "step": 2151 }, { "epoch": 0.5181484379702642, "grad_norm": 0.7433429956436157, "learning_rate": 9.910330525319406e-05, "loss": 0.3717, "step": 2152 }, { "epoch": 0.5183892132667188, "grad_norm": 1.197072148323059, "learning_rate": 9.902533416736302e-05, "loss": 0.2179, "step": 2153 }, { "epoch": 0.5186299885631734, "grad_norm": 1.157617211341858, "learning_rate": 9.894736367413102e-05, "loss": 0.6772, "step": 2154 }, { "epoch": 0.518870763859628, "grad_norm": 2.84462308883667, "learning_rate": 9.886939382090422e-05, "loss": 0.4376, "step": 2155 }, { "epoch": 0.5191115391560825, "grad_norm": 1.1269418001174927, "learning_rate": 9.879142465508856e-05, "loss": 0.5879, "step": 2156 }, { "epoch": 0.5193523144525372, "grad_norm": 1.6317634582519531, "learning_rate": 9.871345622408946e-05, "loss": 0.8341, "step": 2157 }, { "epoch": 0.5195930897489918, "grad_norm": 2.172504425048828, "learning_rate": 9.863548857531183e-05, "loss": 0.4717, "step": 2158 }, { "epoch": 0.5198338650454464, "grad_norm": 0.8946624994277954, "learning_rate": 9.855752175616025e-05, "loss": 0.9934, "step": 2159 }, { "epoch": 0.5200746403419009, "grad_norm": 1.0163549184799194, "learning_rate": 9.847955581403866e-05, "loss": 0.6364, "step": 2160 }, { "epoch": 0.5203154156383555, "grad_norm": 1.2340433597564697, "learning_rate": 9.840159079635057e-05, "loss": 0.681, "step": 2161 }, { "epoch": 0.5205561909348101, "grad_norm": 2.015260934829712, "learning_rate": 9.832362675049893e-05, "loss": 0.7061, "step": 2162 }, { "epoch": 0.5207969662312647, "grad_norm": 1.6834375858306885, "learning_rate": 9.824566372388596e-05, "loss": 0.6874, "step": 2163 }, { "epoch": 0.5210377415277193, "grad_norm": 2.863741874694824, "learning_rate": 9.81677017639134e-05, "loss": 0.1785, "step": 2164 }, { "epoch": 0.5212785168241738, "grad_norm": 0.741033673286438, "learning_rate": 9.808974091798227e-05, "loss": 0.3825, "step": 2165 }, { "epoch": 0.5215192921206284, "grad_norm": 2.9215714931488037, "learning_rate": 9.801178123349298e-05, "loss": 0.5243, "step": 2166 }, { "epoch": 0.521760067417083, "grad_norm": 2.389853000640869, "learning_rate": 9.793382275784521e-05, "loss": 0.5792, "step": 2167 }, { "epoch": 0.5220008427135376, "grad_norm": 4.854155540466309, "learning_rate": 9.785586553843781e-05, "loss": 0.7133, "step": 2168 }, { "epoch": 0.5222416180099921, "grad_norm": 1.7137115001678467, "learning_rate": 9.777790962266903e-05, "loss": 0.8245, "step": 2169 }, { "epoch": 0.5224823933064467, "grad_norm": 5.3910603523254395, "learning_rate": 9.769995505793622e-05, "loss": 0.5916, "step": 2170 }, { "epoch": 0.5227231686029014, "grad_norm": 9.350793838500977, "learning_rate": 9.762200189163588e-05, "loss": 0.7286, "step": 2171 }, { "epoch": 0.522963943899356, "grad_norm": 2.6609160900115967, "learning_rate": 9.754405017116379e-05, "loss": 0.5725, "step": 2172 }, { "epoch": 0.5232047191958105, "grad_norm": 2.54089617729187, "learning_rate": 9.746609994391468e-05, "loss": 0.7312, "step": 2173 }, { "epoch": 0.5234454944922651, "grad_norm": 1.6947931051254272, "learning_rate": 9.738815125728252e-05, "loss": 1.0029, "step": 2174 }, { "epoch": 0.5236862697887197, "grad_norm": 1.9103237390518188, "learning_rate": 9.73102041586603e-05, "loss": 0.6121, "step": 2175 }, { "epoch": 0.5239270450851743, "grad_norm": 3.6913580894470215, "learning_rate": 9.723225869544001e-05, "loss": 0.8657, "step": 2176 }, { "epoch": 0.5241678203816288, "grad_norm": 1.9038362503051758, "learning_rate": 9.715431491501269e-05, "loss": 0.5313, "step": 2177 }, { "epoch": 0.5244085956780834, "grad_norm": 3.199769973754883, "learning_rate": 9.707637286476827e-05, "loss": 0.7072, "step": 2178 }, { "epoch": 0.524649370974538, "grad_norm": 1.5751662254333496, "learning_rate": 9.699843259209574e-05, "loss": 0.2701, "step": 2179 }, { "epoch": 0.5248901462709926, "grad_norm": 1.8176679611206055, "learning_rate": 9.692049414438299e-05, "loss": 0.2336, "step": 2180 }, { "epoch": 0.5251309215674472, "grad_norm": 7.185880661010742, "learning_rate": 9.68425575690167e-05, "loss": 0.4916, "step": 2181 }, { "epoch": 0.5253716968639017, "grad_norm": 3.68613338470459, "learning_rate": 9.676462291338253e-05, "loss": 0.5863, "step": 2182 }, { "epoch": 0.5256124721603563, "grad_norm": 1.8995952606201172, "learning_rate": 9.668669022486494e-05, "loss": 0.1889, "step": 2183 }, { "epoch": 0.525853247456811, "grad_norm": 1.6753265857696533, "learning_rate": 9.660875955084713e-05, "loss": 0.539, "step": 2184 }, { "epoch": 0.5260940227532656, "grad_norm": 0.9983983039855957, "learning_rate": 9.65308309387112e-05, "loss": 0.3609, "step": 2185 }, { "epoch": 0.52633479804972, "grad_norm": 3.3040006160736084, "learning_rate": 9.645290443583785e-05, "loss": 1.2302, "step": 2186 }, { "epoch": 0.5265755733461747, "grad_norm": 2.018064498901367, "learning_rate": 9.637498008960657e-05, "loss": 0.443, "step": 2187 }, { "epoch": 0.5268163486426293, "grad_norm": 2.3584113121032715, "learning_rate": 9.629705794739558e-05, "loss": 0.8664, "step": 2188 }, { "epoch": 0.5270571239390839, "grad_norm": 0.6062427163124084, "learning_rate": 9.62191380565817e-05, "loss": 0.2761, "step": 2189 }, { "epoch": 0.5272978992355384, "grad_norm": 4.201809406280518, "learning_rate": 9.614122046454044e-05, "loss": 1.1502, "step": 2190 }, { "epoch": 0.527538674531993, "grad_norm": 6.053175449371338, "learning_rate": 9.606330521864576e-05, "loss": 0.465, "step": 2191 }, { "epoch": 0.5277794498284476, "grad_norm": 1.6828287839889526, "learning_rate": 9.59853923662704e-05, "loss": 0.7583, "step": 2192 }, { "epoch": 0.5280202251249022, "grad_norm": 2.127516746520996, "learning_rate": 9.590748195478557e-05, "loss": 0.581, "step": 2193 }, { "epoch": 0.5282610004213568, "grad_norm": 2.426520824432373, "learning_rate": 9.582957403156089e-05, "loss": 0.729, "step": 2194 }, { "epoch": 0.5285017757178113, "grad_norm": 0.5099361538887024, "learning_rate": 9.575166864396459e-05, "loss": 0.2235, "step": 2195 }, { "epoch": 0.5287425510142659, "grad_norm": 2.9863169193267822, "learning_rate": 9.567376583936335e-05, "loss": 0.5938, "step": 2196 }, { "epoch": 0.5289833263107205, "grad_norm": 1.6381510496139526, "learning_rate": 9.559586566512221e-05, "loss": 0.7708, "step": 2197 }, { "epoch": 0.5292241016071751, "grad_norm": 2.1702208518981934, "learning_rate": 9.551796816860471e-05, "loss": 0.2262, "step": 2198 }, { "epoch": 0.5294648769036296, "grad_norm": 1.5045363903045654, "learning_rate": 9.544007339717261e-05, "loss": 0.6521, "step": 2199 }, { "epoch": 0.5297056522000843, "grad_norm": 1.3283405303955078, "learning_rate": 9.536218139818614e-05, "loss": 0.386, "step": 2200 }, { "epoch": 0.5299464274965389, "grad_norm": 2.6849524974823, "learning_rate": 9.52842922190039e-05, "loss": 0.5514, "step": 2201 }, { "epoch": 0.5301872027929935, "grad_norm": 1.1004747152328491, "learning_rate": 9.520640590698258e-05, "loss": 0.5606, "step": 2202 }, { "epoch": 0.530427978089448, "grad_norm": 2.8887600898742676, "learning_rate": 9.512852250947727e-05, "loss": 0.7519, "step": 2203 }, { "epoch": 0.5306687533859026, "grad_norm": 2.1143975257873535, "learning_rate": 9.505064207384124e-05, "loss": 0.3216, "step": 2204 }, { "epoch": 0.5309095286823572, "grad_norm": 1.3769932985305786, "learning_rate": 9.497276464742598e-05, "loss": 0.2864, "step": 2205 }, { "epoch": 0.5311503039788118, "grad_norm": 1.131319284439087, "learning_rate": 9.489489027758118e-05, "loss": 0.5236, "step": 2206 }, { "epoch": 0.5313910792752664, "grad_norm": 1.2855147123336792, "learning_rate": 9.481701901165455e-05, "loss": 0.8535, "step": 2207 }, { "epoch": 0.5316318545717209, "grad_norm": 4.562783718109131, "learning_rate": 9.473915089699203e-05, "loss": 1.103, "step": 2208 }, { "epoch": 0.5318726298681755, "grad_norm": 1.491631269454956, "learning_rate": 9.466128598093767e-05, "loss": 0.328, "step": 2209 }, { "epoch": 0.5321134051646301, "grad_norm": 1.7544147968292236, "learning_rate": 9.458342431083342e-05, "loss": 0.0794, "step": 2210 }, { "epoch": 0.5323541804610847, "grad_norm": 1.3631882667541504, "learning_rate": 9.45055659340194e-05, "loss": 0.2153, "step": 2211 }, { "epoch": 0.5325949557575392, "grad_norm": 6.174732208251953, "learning_rate": 9.442771089783366e-05, "loss": 0.7058, "step": 2212 }, { "epoch": 0.5328357310539938, "grad_norm": 1.6120647192001343, "learning_rate": 9.434985924961226e-05, "loss": 0.5721, "step": 2213 }, { "epoch": 0.5330765063504485, "grad_norm": 0.5557000637054443, "learning_rate": 9.42720110366892e-05, "loss": 0.255, "step": 2214 }, { "epoch": 0.5333172816469031, "grad_norm": 3.7805826663970947, "learning_rate": 9.41941663063963e-05, "loss": 0.6903, "step": 2215 }, { "epoch": 0.5335580569433576, "grad_norm": 4.721010684967041, "learning_rate": 9.411632510606337e-05, "loss": 1.1333, "step": 2216 }, { "epoch": 0.5337988322398122, "grad_norm": 3.89003849029541, "learning_rate": 9.403848748301802e-05, "loss": 0.9563, "step": 2217 }, { "epoch": 0.5340396075362668, "grad_norm": 1.9357439279556274, "learning_rate": 9.396065348458571e-05, "loss": 0.6106, "step": 2218 }, { "epoch": 0.5342803828327214, "grad_norm": 1.0858145952224731, "learning_rate": 9.388282315808971e-05, "loss": 0.4984, "step": 2219 }, { "epoch": 0.534521158129176, "grad_norm": 2.763885259628296, "learning_rate": 9.3804996550851e-05, "loss": 0.3943, "step": 2220 }, { "epoch": 0.5347619334256305, "grad_norm": 0.8865588903427124, "learning_rate": 9.372717371018834e-05, "loss": 0.2669, "step": 2221 }, { "epoch": 0.5350027087220851, "grad_norm": 1.0072959661483765, "learning_rate": 9.364935468341824e-05, "loss": 0.2614, "step": 2222 }, { "epoch": 0.5352434840185397, "grad_norm": 1.3582466840744019, "learning_rate": 9.357153951785475e-05, "loss": 0.8149, "step": 2223 }, { "epoch": 0.5354842593149943, "grad_norm": 1.8487718105316162, "learning_rate": 9.349372826080974e-05, "loss": 1.013, "step": 2224 }, { "epoch": 0.5357250346114488, "grad_norm": 2.25203275680542, "learning_rate": 9.341592095959259e-05, "loss": 0.4711, "step": 2225 }, { "epoch": 0.5359658099079034, "grad_norm": 4.066526889801025, "learning_rate": 9.333811766151033e-05, "loss": 1.3851, "step": 2226 }, { "epoch": 0.536206585204358, "grad_norm": 3.2181577682495117, "learning_rate": 9.326031841386759e-05, "loss": 0.7188, "step": 2227 }, { "epoch": 0.5364473605008127, "grad_norm": 4.251607894897461, "learning_rate": 9.318252326396635e-05, "loss": 0.9096, "step": 2228 }, { "epoch": 0.5366881357972672, "grad_norm": 3.6044514179229736, "learning_rate": 9.310473225910641e-05, "loss": 0.4364, "step": 2229 }, { "epoch": 0.5369289110937218, "grad_norm": 0.8138754367828369, "learning_rate": 9.302694544658475e-05, "loss": 0.3227, "step": 2230 }, { "epoch": 0.5371696863901764, "grad_norm": 1.5204187631607056, "learning_rate": 9.294916287369597e-05, "loss": 0.3241, "step": 2231 }, { "epoch": 0.537410461686631, "grad_norm": 2.078233242034912, "learning_rate": 9.287138458773208e-05, "loss": 0.5936, "step": 2232 }, { "epoch": 0.5376512369830856, "grad_norm": 6.410951614379883, "learning_rate": 9.279361063598238e-05, "loss": 0.4392, "step": 2233 }, { "epoch": 0.5378920122795401, "grad_norm": 1.241186499595642, "learning_rate": 9.271584106573364e-05, "loss": 0.4729, "step": 2234 }, { "epoch": 0.5381327875759947, "grad_norm": 1.808719515800476, "learning_rate": 9.263807592427001e-05, "loss": 0.5305, "step": 2235 }, { "epoch": 0.5383735628724493, "grad_norm": 0.6988890171051025, "learning_rate": 9.256031525887273e-05, "loss": 0.5642, "step": 2236 }, { "epoch": 0.5386143381689039, "grad_norm": 2.4080259799957275, "learning_rate": 9.24825591168206e-05, "loss": 0.7976, "step": 2237 }, { "epoch": 0.5388551134653584, "grad_norm": 4.949229717254639, "learning_rate": 9.240480754538942e-05, "loss": 1.2054, "step": 2238 }, { "epoch": 0.539095888761813, "grad_norm": 1.403643250465393, "learning_rate": 9.232706059185236e-05, "loss": 0.9002, "step": 2239 }, { "epoch": 0.5393366640582676, "grad_norm": 2.1335864067077637, "learning_rate": 9.224931830347978e-05, "loss": 0.9663, "step": 2240 }, { "epoch": 0.5395774393547222, "grad_norm": 2.4091343879699707, "learning_rate": 9.21715807275391e-05, "loss": 0.9484, "step": 2241 }, { "epoch": 0.5398182146511767, "grad_norm": 2.391929864883423, "learning_rate": 9.209384791129504e-05, "loss": 0.6072, "step": 2242 }, { "epoch": 0.5400589899476314, "grad_norm": 5.663161754608154, "learning_rate": 9.20161199020092e-05, "loss": 0.3371, "step": 2243 }, { "epoch": 0.540299765244086, "grad_norm": 1.5023120641708374, "learning_rate": 9.193839674694046e-05, "loss": 0.7458, "step": 2244 }, { "epoch": 0.5405405405405406, "grad_norm": 2.3951783180236816, "learning_rate": 9.186067849334467e-05, "loss": 0.8693, "step": 2245 }, { "epoch": 0.5407813158369951, "grad_norm": 1.6337603330612183, "learning_rate": 9.178296518847467e-05, "loss": 0.8064, "step": 2246 }, { "epoch": 0.5410220911334497, "grad_norm": 4.101715564727783, "learning_rate": 9.170525687958035e-05, "loss": 0.7042, "step": 2247 }, { "epoch": 0.5412628664299043, "grad_norm": 0.9086791276931763, "learning_rate": 9.162755361390858e-05, "loss": 0.8873, "step": 2248 }, { "epoch": 0.5415036417263589, "grad_norm": 1.7184299230575562, "learning_rate": 9.154985543870304e-05, "loss": 0.8026, "step": 2249 }, { "epoch": 0.5417444170228135, "grad_norm": 2.9949686527252197, "learning_rate": 9.147216240120446e-05, "loss": 0.6126, "step": 2250 }, { "epoch": 0.541985192319268, "grad_norm": 2.2674872875213623, "learning_rate": 9.139447454865033e-05, "loss": 0.8358, "step": 2251 }, { "epoch": 0.5422259676157226, "grad_norm": 0.7034595012664795, "learning_rate": 9.131679192827506e-05, "loss": 0.4057, "step": 2252 }, { "epoch": 0.5424667429121772, "grad_norm": 3.044638156890869, "learning_rate": 9.123911458730988e-05, "loss": 0.7883, "step": 2253 }, { "epoch": 0.5427075182086318, "grad_norm": 4.1872239112854, "learning_rate": 9.116144257298274e-05, "loss": 1.4448, "step": 2254 }, { "epoch": 0.5429482935050863, "grad_norm": 1.9178543090820312, "learning_rate": 9.108377593251847e-05, "loss": 1.2404, "step": 2255 }, { "epoch": 0.5431890688015409, "grad_norm": 1.3553639650344849, "learning_rate": 9.100611471313849e-05, "loss": 0.4571, "step": 2256 }, { "epoch": 0.5434298440979956, "grad_norm": 5.682826042175293, "learning_rate": 9.092845896206102e-05, "loss": 0.6029, "step": 2257 }, { "epoch": 0.5436706193944502, "grad_norm": 3.233644485473633, "learning_rate": 9.085080872650098e-05, "loss": 0.7475, "step": 2258 }, { "epoch": 0.5439113946909047, "grad_norm": 0.8178972601890564, "learning_rate": 9.077316405366981e-05, "loss": 0.4826, "step": 2259 }, { "epoch": 0.5441521699873593, "grad_norm": 1.9637796878814697, "learning_rate": 9.069552499077569e-05, "loss": 0.7773, "step": 2260 }, { "epoch": 0.5443929452838139, "grad_norm": 4.2175188064575195, "learning_rate": 9.061789158502336e-05, "loss": 0.5585, "step": 2261 }, { "epoch": 0.5446337205802685, "grad_norm": 1.7888754606246948, "learning_rate": 9.054026388361405e-05, "loss": 0.5089, "step": 2262 }, { "epoch": 0.5448744958767231, "grad_norm": 1.9590795040130615, "learning_rate": 9.046264193374568e-05, "loss": 0.5263, "step": 2263 }, { "epoch": 0.5451152711731776, "grad_norm": 2.484314441680908, "learning_rate": 9.038502578261241e-05, "loss": 0.5187, "step": 2264 }, { "epoch": 0.5453560464696322, "grad_norm": 1.6243886947631836, "learning_rate": 9.030741547740517e-05, "loss": 0.6487, "step": 2265 }, { "epoch": 0.5455968217660868, "grad_norm": 3.200514793395996, "learning_rate": 9.022981106531119e-05, "loss": 0.4566, "step": 2266 }, { "epoch": 0.5458375970625414, "grad_norm": 2.8995554447174072, "learning_rate": 9.015221259351405e-05, "loss": 1.1906, "step": 2267 }, { "epoch": 0.5460783723589959, "grad_norm": 1.6960794925689697, "learning_rate": 9.007462010919386e-05, "loss": 0.8553, "step": 2268 }, { "epoch": 0.5463191476554505, "grad_norm": 0.8978815674781799, "learning_rate": 8.999703365952699e-05, "loss": 0.9352, "step": 2269 }, { "epoch": 0.5465599229519051, "grad_norm": 1.4150447845458984, "learning_rate": 8.99194532916862e-05, "loss": 0.3387, "step": 2270 }, { "epoch": 0.5468006982483598, "grad_norm": 1.1384726762771606, "learning_rate": 8.984187905284055e-05, "loss": 0.2762, "step": 2271 }, { "epoch": 0.5470414735448142, "grad_norm": 1.1837869882583618, "learning_rate": 8.976431099015528e-05, "loss": 0.43, "step": 2272 }, { "epoch": 0.5472822488412689, "grad_norm": 3.328984498977661, "learning_rate": 8.968674915079197e-05, "loss": 0.9047, "step": 2273 }, { "epoch": 0.5475230241377235, "grad_norm": 2.5467495918273926, "learning_rate": 8.960919358190848e-05, "loss": 0.7412, "step": 2274 }, { "epoch": 0.5477637994341781, "grad_norm": 0.357572466135025, "learning_rate": 8.953164433065866e-05, "loss": 0.2749, "step": 2275 }, { "epoch": 0.5480045747306327, "grad_norm": 1.4513580799102783, "learning_rate": 8.945410144419269e-05, "loss": 0.4484, "step": 2276 }, { "epoch": 0.5482453500270872, "grad_norm": 2.9045469760894775, "learning_rate": 8.937656496965678e-05, "loss": 0.8804, "step": 2277 }, { "epoch": 0.5484861253235418, "grad_norm": 2.212029218673706, "learning_rate": 8.929903495419331e-05, "loss": 0.5796, "step": 2278 }, { "epoch": 0.5487269006199964, "grad_norm": 5.107553482055664, "learning_rate": 8.922151144494072e-05, "loss": 0.6931, "step": 2279 }, { "epoch": 0.548967675916451, "grad_norm": 0.923570990562439, "learning_rate": 8.914399448903344e-05, "loss": 0.2629, "step": 2280 }, { "epoch": 0.5492084512129055, "grad_norm": 4.435163974761963, "learning_rate": 8.906648413360197e-05, "loss": 0.4986, "step": 2281 }, { "epoch": 0.5494492265093601, "grad_norm": 0.577694296836853, "learning_rate": 8.898898042577279e-05, "loss": 0.4683, "step": 2282 }, { "epoch": 0.5496900018058147, "grad_norm": 3.198882579803467, "learning_rate": 8.891148341266828e-05, "loss": 0.4887, "step": 2283 }, { "epoch": 0.5499307771022693, "grad_norm": 2.20881724357605, "learning_rate": 8.883399314140689e-05, "loss": 0.6167, "step": 2284 }, { "epoch": 0.5501715523987238, "grad_norm": 2.165309429168701, "learning_rate": 8.875650965910279e-05, "loss": 0.6205, "step": 2285 }, { "epoch": 0.5504123276951784, "grad_norm": 1.3588035106658936, "learning_rate": 8.867903301286616e-05, "loss": 0.3225, "step": 2286 }, { "epoch": 0.5506531029916331, "grad_norm": 1.6632091999053955, "learning_rate": 8.8601563249803e-05, "loss": 0.5279, "step": 2287 }, { "epoch": 0.5508938782880877, "grad_norm": 1.157415509223938, "learning_rate": 8.852410041701502e-05, "loss": 0.4965, "step": 2288 }, { "epoch": 0.5511346535845423, "grad_norm": 3.8233842849731445, "learning_rate": 8.844664456159985e-05, "loss": 0.7001, "step": 2289 }, { "epoch": 0.5513754288809968, "grad_norm": 1.0012489557266235, "learning_rate": 8.836919573065082e-05, "loss": 0.4657, "step": 2290 }, { "epoch": 0.5516162041774514, "grad_norm": 1.7905609607696533, "learning_rate": 8.829175397125698e-05, "loss": 0.3764, "step": 2291 }, { "epoch": 0.551856979473906, "grad_norm": 3.6006107330322266, "learning_rate": 8.821431933050313e-05, "loss": 0.7817, "step": 2292 }, { "epoch": 0.5520977547703606, "grad_norm": 0.9073820114135742, "learning_rate": 8.813689185546965e-05, "loss": 0.322, "step": 2293 }, { "epoch": 0.5523385300668151, "grad_norm": 3.195746660232544, "learning_rate": 8.80594715932327e-05, "loss": 0.6941, "step": 2294 }, { "epoch": 0.5525793053632697, "grad_norm": 1.6812855005264282, "learning_rate": 8.798205859086388e-05, "loss": 0.7138, "step": 2295 }, { "epoch": 0.5528200806597243, "grad_norm": 1.5866107940673828, "learning_rate": 8.790465289543051e-05, "loss": 0.4609, "step": 2296 }, { "epoch": 0.5530608559561789, "grad_norm": 1.2990373373031616, "learning_rate": 8.782725455399546e-05, "loss": 0.5497, "step": 2297 }, { "epoch": 0.5533016312526334, "grad_norm": 0.8197939395904541, "learning_rate": 8.774986361361705e-05, "loss": 0.3533, "step": 2298 }, { "epoch": 0.553542406549088, "grad_norm": 2.288421869277954, "learning_rate": 8.767248012134914e-05, "loss": 0.1527, "step": 2299 }, { "epoch": 0.5537831818455426, "grad_norm": 6.408196449279785, "learning_rate": 8.759510412424113e-05, "loss": 0.6184, "step": 2300 }, { "epoch": 0.5540239571419973, "grad_norm": 4.457020282745361, "learning_rate": 8.751773566933774e-05, "loss": 0.665, "step": 2301 }, { "epoch": 0.5542647324384519, "grad_norm": 2.0285515785217285, "learning_rate": 8.744037480367921e-05, "loss": 0.9767, "step": 2302 }, { "epoch": 0.5545055077349064, "grad_norm": 4.255732536315918, "learning_rate": 8.736302157430107e-05, "loss": 0.7522, "step": 2303 }, { "epoch": 0.554746283031361, "grad_norm": 1.1508095264434814, "learning_rate": 8.728567602823429e-05, "loss": 0.4259, "step": 2304 }, { "epoch": 0.5549870583278156, "grad_norm": 0.9924709796905518, "learning_rate": 8.720833821250513e-05, "loss": 1.6025, "step": 2305 }, { "epoch": 0.5552278336242702, "grad_norm": 1.755651593208313, "learning_rate": 8.713100817413516e-05, "loss": 0.3882, "step": 2306 }, { "epoch": 0.5554686089207247, "grad_norm": 1.430647850036621, "learning_rate": 8.705368596014125e-05, "loss": 0.5597, "step": 2307 }, { "epoch": 0.5557093842171793, "grad_norm": 1.2561583518981934, "learning_rate": 8.697637161753538e-05, "loss": 0.8822, "step": 2308 }, { "epoch": 0.5559501595136339, "grad_norm": 1.0225826501846313, "learning_rate": 8.689906519332491e-05, "loss": 0.8633, "step": 2309 }, { "epoch": 0.5561909348100885, "grad_norm": 1.079167366027832, "learning_rate": 8.682176673451239e-05, "loss": 0.4746, "step": 2310 }, { "epoch": 0.556431710106543, "grad_norm": 1.3175033330917358, "learning_rate": 8.674447628809533e-05, "loss": 0.4305, "step": 2311 }, { "epoch": 0.5566724854029976, "grad_norm": 4.170149326324463, "learning_rate": 8.666719390106655e-05, "loss": 0.8164, "step": 2312 }, { "epoch": 0.5569132606994522, "grad_norm": 0.9638872742652893, "learning_rate": 8.658991962041395e-05, "loss": 0.5429, "step": 2313 }, { "epoch": 0.5571540359959068, "grad_norm": 1.9414424896240234, "learning_rate": 8.65126534931204e-05, "loss": 0.9223, "step": 2314 }, { "epoch": 0.5573948112923615, "grad_norm": 1.570064902305603, "learning_rate": 8.643539556616397e-05, "loss": 1.0301, "step": 2315 }, { "epoch": 0.557635586588816, "grad_norm": 3.4186506271362305, "learning_rate": 8.635814588651754e-05, "loss": 0.8375, "step": 2316 }, { "epoch": 0.5578763618852706, "grad_norm": 2.636807441711426, "learning_rate": 8.628090450114916e-05, "loss": 0.4639, "step": 2317 }, { "epoch": 0.5581171371817252, "grad_norm": 4.225121974945068, "learning_rate": 8.620367145702177e-05, "loss": 0.5046, "step": 2318 }, { "epoch": 0.5583579124781798, "grad_norm": 0.9116895198822021, "learning_rate": 8.612644680109319e-05, "loss": 0.2553, "step": 2319 }, { "epoch": 0.5585986877746343, "grad_norm": 2.3729517459869385, "learning_rate": 8.604923058031624e-05, "loss": 0.592, "step": 2320 }, { "epoch": 0.5588394630710889, "grad_norm": 1.5719141960144043, "learning_rate": 8.59720228416385e-05, "loss": 0.9508, "step": 2321 }, { "epoch": 0.5590802383675435, "grad_norm": 3.1368796825408936, "learning_rate": 8.589482363200247e-05, "loss": 0.9687, "step": 2322 }, { "epoch": 0.5593210136639981, "grad_norm": 2.1668570041656494, "learning_rate": 8.581763299834551e-05, "loss": 0.0668, "step": 2323 }, { "epoch": 0.5595617889604526, "grad_norm": 0.7108801007270813, "learning_rate": 8.57404509875996e-05, "loss": 0.2144, "step": 2324 }, { "epoch": 0.5598025642569072, "grad_norm": 2.860525369644165, "learning_rate": 8.56632776466916e-05, "loss": 0.619, "step": 2325 }, { "epoch": 0.5600433395533618, "grad_norm": 4.221729278564453, "learning_rate": 8.558611302254314e-05, "loss": 0.828, "step": 2326 }, { "epoch": 0.5602841148498164, "grad_norm": 1.6991534233093262, "learning_rate": 8.55089571620704e-05, "loss": 0.909, "step": 2327 }, { "epoch": 0.5605248901462709, "grad_norm": 4.212416648864746, "learning_rate": 8.543181011218437e-05, "loss": 1.5328, "step": 2328 }, { "epoch": 0.5607656654427255, "grad_norm": 4.365540504455566, "learning_rate": 8.535467191979058e-05, "loss": 0.6489, "step": 2329 }, { "epoch": 0.5610064407391802, "grad_norm": 0.9320734143257141, "learning_rate": 8.527754263178929e-05, "loss": 0.6582, "step": 2330 }, { "epoch": 0.5612472160356348, "grad_norm": 4.166979789733887, "learning_rate": 8.520042229507528e-05, "loss": 0.5757, "step": 2331 }, { "epoch": 0.5614879913320894, "grad_norm": 3.1154069900512695, "learning_rate": 8.512331095653781e-05, "loss": 0.8792, "step": 2332 }, { "epoch": 0.5617287666285439, "grad_norm": 4.849252700805664, "learning_rate": 8.504620866306083e-05, "loss": 0.4272, "step": 2333 }, { "epoch": 0.5619695419249985, "grad_norm": 2.375708818435669, "learning_rate": 8.496911546152265e-05, "loss": 0.971, "step": 2334 }, { "epoch": 0.5622103172214531, "grad_norm": 2.0698773860931396, "learning_rate": 8.489203139879612e-05, "loss": 0.4473, "step": 2335 }, { "epoch": 0.5624510925179077, "grad_norm": 6.773448944091797, "learning_rate": 8.481495652174859e-05, "loss": 0.363, "step": 2336 }, { "epoch": 0.5626918678143622, "grad_norm": 5.320286750793457, "learning_rate": 8.473789087724165e-05, "loss": 0.5259, "step": 2337 }, { "epoch": 0.5629326431108168, "grad_norm": 2.9927375316619873, "learning_rate": 8.466083451213144e-05, "loss": 0.5302, "step": 2338 }, { "epoch": 0.5631734184072714, "grad_norm": 1.8399150371551514, "learning_rate": 8.458378747326848e-05, "loss": 0.9814, "step": 2339 }, { "epoch": 0.563414193703726, "grad_norm": 1.0915262699127197, "learning_rate": 8.450674980749742e-05, "loss": 0.2, "step": 2340 }, { "epoch": 0.5636549690001805, "grad_norm": 6.239700794219971, "learning_rate": 8.442972156165738e-05, "loss": 0.754, "step": 2341 }, { "epoch": 0.5638957442966351, "grad_norm": 3.9862194061279297, "learning_rate": 8.435270278258172e-05, "loss": 0.306, "step": 2342 }, { "epoch": 0.5641365195930897, "grad_norm": 3.2919952869415283, "learning_rate": 8.427569351709801e-05, "loss": 0.776, "step": 2343 }, { "epoch": 0.5643772948895444, "grad_norm": 1.855094075202942, "learning_rate": 8.41986938120281e-05, "loss": 0.6994, "step": 2344 }, { "epoch": 0.564618070185999, "grad_norm": 1.7668780088424683, "learning_rate": 8.41217037141879e-05, "loss": 0.3419, "step": 2345 }, { "epoch": 0.5648588454824535, "grad_norm": 3.746309280395508, "learning_rate": 8.404472327038768e-05, "loss": 1.1026, "step": 2346 }, { "epoch": 0.5650996207789081, "grad_norm": 2.670344591140747, "learning_rate": 8.396775252743162e-05, "loss": 0.7391, "step": 2347 }, { "epoch": 0.5653403960753627, "grad_norm": 1.6550657749176025, "learning_rate": 8.389079153211814e-05, "loss": 0.4773, "step": 2348 }, { "epoch": 0.5655811713718173, "grad_norm": 2.2174558639526367, "learning_rate": 8.381384033123974e-05, "loss": 0.6246, "step": 2349 }, { "epoch": 0.5658219466682718, "grad_norm": 0.4945906400680542, "learning_rate": 8.373689897158284e-05, "loss": 0.1936, "step": 2350 }, { "epoch": 0.5660627219647264, "grad_norm": 1.8350954055786133, "learning_rate": 8.365996749992801e-05, "loss": 0.3785, "step": 2351 }, { "epoch": 0.566303497261181, "grad_norm": 0.7016525864601135, "learning_rate": 8.358304596304982e-05, "loss": 0.567, "step": 2352 }, { "epoch": 0.5665442725576356, "grad_norm": 5.016156196594238, "learning_rate": 8.35061344077166e-05, "loss": 0.6756, "step": 2353 }, { "epoch": 0.5667850478540901, "grad_norm": 1.9168941974639893, "learning_rate": 8.342923288069086e-05, "loss": 0.888, "step": 2354 }, { "epoch": 0.5670258231505447, "grad_norm": 1.5404551029205322, "learning_rate": 8.335234142872885e-05, "loss": 0.4729, "step": 2355 }, { "epoch": 0.5672665984469993, "grad_norm": 1.9677037000656128, "learning_rate": 8.327546009858074e-05, "loss": 0.3468, "step": 2356 }, { "epoch": 0.567507373743454, "grad_norm": 1.9757428169250488, "learning_rate": 8.319858893699059e-05, "loss": 0.2262, "step": 2357 }, { "epoch": 0.5677481490399086, "grad_norm": 1.3826395273208618, "learning_rate": 8.312172799069621e-05, "loss": 0.5705, "step": 2358 }, { "epoch": 0.567988924336363, "grad_norm": 1.7746422290802002, "learning_rate": 8.304487730642929e-05, "loss": 0.7911, "step": 2359 }, { "epoch": 0.5682296996328177, "grad_norm": 1.2216047048568726, "learning_rate": 8.296803693091511e-05, "loss": 0.5022, "step": 2360 }, { "epoch": 0.5684704749292723, "grad_norm": 0.8310643434524536, "learning_rate": 8.289120691087285e-05, "loss": 0.3669, "step": 2361 }, { "epoch": 0.5687112502257269, "grad_norm": 0.5129712820053101, "learning_rate": 8.281438729301536e-05, "loss": 0.436, "step": 2362 }, { "epoch": 0.5689520255221814, "grad_norm": 3.883026599884033, "learning_rate": 8.27375781240491e-05, "loss": 1.1273, "step": 2363 }, { "epoch": 0.569192800818636, "grad_norm": 2.724834680557251, "learning_rate": 8.266077945067424e-05, "loss": 0.8467, "step": 2364 }, { "epoch": 0.5694335761150906, "grad_norm": 2.839754343032837, "learning_rate": 8.258399131958454e-05, "loss": 0.9973, "step": 2365 }, { "epoch": 0.5696743514115452, "grad_norm": 1.3639193773269653, "learning_rate": 8.250721377746734e-05, "loss": 0.3668, "step": 2366 }, { "epoch": 0.5699151267079997, "grad_norm": 4.23447322845459, "learning_rate": 8.243044687100363e-05, "loss": 0.3128, "step": 2367 }, { "epoch": 0.5701559020044543, "grad_norm": 1.0347940921783447, "learning_rate": 8.235369064686776e-05, "loss": 0.4905, "step": 2368 }, { "epoch": 0.5703966773009089, "grad_norm": 3.1089839935302734, "learning_rate": 8.227694515172773e-05, "loss": 0.4338, "step": 2369 }, { "epoch": 0.5706374525973635, "grad_norm": 4.172400951385498, "learning_rate": 8.2200210432245e-05, "loss": 1.1137, "step": 2370 }, { "epoch": 0.5708782278938181, "grad_norm": 3.9930694103240967, "learning_rate": 8.21234865350744e-05, "loss": 0.1863, "step": 2371 }, { "epoch": 0.5711190031902726, "grad_norm": 2.020798921585083, "learning_rate": 8.204677350686432e-05, "loss": 0.379, "step": 2372 }, { "epoch": 0.5713597784867273, "grad_norm": 3.6490232944488525, "learning_rate": 8.197007139425631e-05, "loss": 0.4755, "step": 2373 }, { "epoch": 0.5716005537831819, "grad_norm": 2.922484874725342, "learning_rate": 8.189338024388557e-05, "loss": 1.0381, "step": 2374 }, { "epoch": 0.5718413290796365, "grad_norm": 3.1068320274353027, "learning_rate": 8.181670010238046e-05, "loss": 0.8434, "step": 2375 }, { "epoch": 0.572082104376091, "grad_norm": 2.6153829097747803, "learning_rate": 8.174003101636261e-05, "loss": 1.216, "step": 2376 }, { "epoch": 0.5723228796725456, "grad_norm": 1.5444633960723877, "learning_rate": 8.166337303244705e-05, "loss": 0.5995, "step": 2377 }, { "epoch": 0.5725636549690002, "grad_norm": 4.170453071594238, "learning_rate": 8.158672619724203e-05, "loss": 0.6781, "step": 2378 }, { "epoch": 0.5728044302654548, "grad_norm": 4.247837543487549, "learning_rate": 8.151009055734893e-05, "loss": 0.3414, "step": 2379 }, { "epoch": 0.5730452055619093, "grad_norm": 1.5872865915298462, "learning_rate": 8.143346615936247e-05, "loss": 0.6584, "step": 2380 }, { "epoch": 0.5732859808583639, "grad_norm": 1.2567731142044067, "learning_rate": 8.135685304987039e-05, "loss": 0.6352, "step": 2381 }, { "epoch": 0.5735267561548185, "grad_norm": 3.6656978130340576, "learning_rate": 8.128025127545362e-05, "loss": 1.2404, "step": 2382 }, { "epoch": 0.5737675314512731, "grad_norm": 2.5888733863830566, "learning_rate": 8.120366088268632e-05, "loss": 0.3153, "step": 2383 }, { "epoch": 0.5740083067477277, "grad_norm": 0.779647171497345, "learning_rate": 8.112708191813552e-05, "loss": 0.4345, "step": 2384 }, { "epoch": 0.5742490820441822, "grad_norm": 0.7447169423103333, "learning_rate": 8.105051442836145e-05, "loss": 0.2654, "step": 2385 }, { "epoch": 0.5744898573406368, "grad_norm": 0.5837435722351074, "learning_rate": 8.097395845991727e-05, "loss": 0.5411, "step": 2386 }, { "epoch": 0.5747306326370915, "grad_norm": 1.1887192726135254, "learning_rate": 8.089741405934922e-05, "loss": 0.3803, "step": 2387 }, { "epoch": 0.5749714079335461, "grad_norm": 2.842036724090576, "learning_rate": 8.08208812731965e-05, "loss": 0.8952, "step": 2388 }, { "epoch": 0.5752121832300006, "grad_norm": 3.2157955169677734, "learning_rate": 8.074436014799114e-05, "loss": 0.2237, "step": 2389 }, { "epoch": 0.5754529585264552, "grad_norm": 3.5656988620758057, "learning_rate": 8.06678507302582e-05, "loss": 0.2823, "step": 2390 }, { "epoch": 0.5756937338229098, "grad_norm": 2.8901267051696777, "learning_rate": 8.059135306651557e-05, "loss": 0.6339, "step": 2391 }, { "epoch": 0.5759345091193644, "grad_norm": 1.380159854888916, "learning_rate": 8.0514867203274e-05, "loss": 1.2206, "step": 2392 }, { "epoch": 0.5761752844158189, "grad_norm": 5.57066011428833, "learning_rate": 8.043839318703709e-05, "loss": 0.932, "step": 2393 }, { "epoch": 0.5764160597122735, "grad_norm": 2.786633253097534, "learning_rate": 8.036193106430118e-05, "loss": 0.4513, "step": 2394 }, { "epoch": 0.5766568350087281, "grad_norm": 2.4537577629089355, "learning_rate": 8.028548088155542e-05, "loss": 0.8421, "step": 2395 }, { "epoch": 0.5768976103051827, "grad_norm": 3.0305957794189453, "learning_rate": 8.020904268528175e-05, "loss": 0.7525, "step": 2396 }, { "epoch": 0.5771383856016373, "grad_norm": 1.3954887390136719, "learning_rate": 8.013261652195466e-05, "loss": 0.3742, "step": 2397 }, { "epoch": 0.5773791608980918, "grad_norm": 2.359279155731201, "learning_rate": 8.00562024380415e-05, "loss": 1.0482, "step": 2398 }, { "epoch": 0.5776199361945464, "grad_norm": 4.190445899963379, "learning_rate": 7.99798004800022e-05, "loss": 1.031, "step": 2399 }, { "epoch": 0.577860711491001, "grad_norm": 3.663658618927002, "learning_rate": 7.990341069428931e-05, "loss": 0.4797, "step": 2400 }, { "epoch": 0.5781014867874557, "grad_norm": 1.6564805507659912, "learning_rate": 7.9827033127348e-05, "loss": 0.8746, "step": 2401 }, { "epoch": 0.5783422620839102, "grad_norm": 1.9383180141448975, "learning_rate": 7.9750667825616e-05, "loss": 0.7385, "step": 2402 }, { "epoch": 0.5785830373803648, "grad_norm": 0.28531309962272644, "learning_rate": 7.967431483552356e-05, "loss": 0.2861, "step": 2403 }, { "epoch": 0.5788238126768194, "grad_norm": 2.478971004486084, "learning_rate": 7.959797420349355e-05, "loss": 0.3581, "step": 2404 }, { "epoch": 0.579064587973274, "grad_norm": 3.229998826980591, "learning_rate": 7.952164597594115e-05, "loss": 0.7698, "step": 2405 }, { "epoch": 0.5793053632697285, "grad_norm": 1.7557350397109985, "learning_rate": 7.944533019927414e-05, "loss": 0.758, "step": 2406 }, { "epoch": 0.5795461385661831, "grad_norm": 7.241235256195068, "learning_rate": 7.936902691989267e-05, "loss": 1.014, "step": 2407 }, { "epoch": 0.5797869138626377, "grad_norm": 4.189211368560791, "learning_rate": 7.929273618418933e-05, "loss": 0.7462, "step": 2408 }, { "epoch": 0.5800276891590923, "grad_norm": 4.705471515655518, "learning_rate": 7.921645803854907e-05, "loss": 0.9831, "step": 2409 }, { "epoch": 0.5802684644555468, "grad_norm": 2.0091071128845215, "learning_rate": 7.914019252934908e-05, "loss": 0.8221, "step": 2410 }, { "epoch": 0.5805092397520014, "grad_norm": 2.49102783203125, "learning_rate": 7.906393970295905e-05, "loss": 0.5716, "step": 2411 }, { "epoch": 0.580750015048456, "grad_norm": 2.925053119659424, "learning_rate": 7.89876996057409e-05, "loss": 0.9088, "step": 2412 }, { "epoch": 0.5809907903449106, "grad_norm": 4.885961532592773, "learning_rate": 7.891147228404869e-05, "loss": 0.5873, "step": 2413 }, { "epoch": 0.5812315656413652, "grad_norm": 1.2176140546798706, "learning_rate": 7.883525778422887e-05, "loss": 0.8426, "step": 2414 }, { "epoch": 0.5814723409378197, "grad_norm": 1.188421368598938, "learning_rate": 7.875905615261997e-05, "loss": 0.6984, "step": 2415 }, { "epoch": 0.5817131162342744, "grad_norm": 3.3436102867126465, "learning_rate": 7.868286743555279e-05, "loss": 0.7285, "step": 2416 }, { "epoch": 0.581953891530729, "grad_norm": 2.9441144466400146, "learning_rate": 7.860669167935028e-05, "loss": 0.3281, "step": 2417 }, { "epoch": 0.5821946668271836, "grad_norm": 1.1844704151153564, "learning_rate": 7.853052893032736e-05, "loss": 0.3296, "step": 2418 }, { "epoch": 0.5824354421236381, "grad_norm": 1.122290849685669, "learning_rate": 7.84543792347913e-05, "loss": 0.3637, "step": 2419 }, { "epoch": 0.5826762174200927, "grad_norm": 0.8115438222885132, "learning_rate": 7.837824263904116e-05, "loss": 0.1266, "step": 2420 }, { "epoch": 0.5829169927165473, "grad_norm": 2.9317989349365234, "learning_rate": 7.83021191893682e-05, "loss": 0.7098, "step": 2421 }, { "epoch": 0.5831577680130019, "grad_norm": 2.4324686527252197, "learning_rate": 7.822600893205569e-05, "loss": 0.384, "step": 2422 }, { "epoch": 0.5833985433094564, "grad_norm": 1.8341871500015259, "learning_rate": 7.814991191337877e-05, "loss": 0.6857, "step": 2423 }, { "epoch": 0.583639318605911, "grad_norm": 0.9151331782341003, "learning_rate": 7.807382817960464e-05, "loss": 0.2521, "step": 2424 }, { "epoch": 0.5838800939023656, "grad_norm": 3.2553586959838867, "learning_rate": 7.799775777699243e-05, "loss": 0.277, "step": 2425 }, { "epoch": 0.5841208691988202, "grad_norm": 5.162132263183594, "learning_rate": 7.792170075179302e-05, "loss": 0.3815, "step": 2426 }, { "epoch": 0.5843616444952748, "grad_norm": 0.9806712865829468, "learning_rate": 7.784565715024932e-05, "loss": 0.6379, "step": 2427 }, { "epoch": 0.5846024197917293, "grad_norm": 2.116602897644043, "learning_rate": 7.776962701859596e-05, "loss": 0.3267, "step": 2428 }, { "epoch": 0.5848431950881839, "grad_norm": 2.120924472808838, "learning_rate": 7.769361040305944e-05, "loss": 0.5844, "step": 2429 }, { "epoch": 0.5850839703846386, "grad_norm": 1.5902043581008911, "learning_rate": 7.76176073498581e-05, "loss": 0.3459, "step": 2430 }, { "epoch": 0.5853247456810932, "grad_norm": 2.4817397594451904, "learning_rate": 7.75416179052019e-05, "loss": 0.4125, "step": 2431 }, { "epoch": 0.5855655209775477, "grad_norm": 1.6833219528198242, "learning_rate": 7.746564211529264e-05, "loss": 0.422, "step": 2432 }, { "epoch": 0.5858062962740023, "grad_norm": 9.756152153015137, "learning_rate": 7.73896800263237e-05, "loss": 1.028, "step": 2433 }, { "epoch": 0.5860470715704569, "grad_norm": 1.5745316743850708, "learning_rate": 7.731373168448027e-05, "loss": 0.6256, "step": 2434 }, { "epoch": 0.5862878468669115, "grad_norm": 3.160309314727783, "learning_rate": 7.723779713593908e-05, "loss": 0.4354, "step": 2435 }, { "epoch": 0.586528622163366, "grad_norm": 1.0206762552261353, "learning_rate": 7.716187642686851e-05, "loss": 0.7593, "step": 2436 }, { "epoch": 0.5867693974598206, "grad_norm": 2.535022020339966, "learning_rate": 7.708596960342852e-05, "loss": 0.5759, "step": 2437 }, { "epoch": 0.5870101727562752, "grad_norm": 1.5657432079315186, "learning_rate": 7.701007671177067e-05, "loss": 0.4912, "step": 2438 }, { "epoch": 0.5872509480527298, "grad_norm": 0.8812488317489624, "learning_rate": 7.693419779803794e-05, "loss": 0.3876, "step": 2439 }, { "epoch": 0.5874917233491844, "grad_norm": 2.174088954925537, "learning_rate": 7.685833290836497e-05, "loss": 0.6519, "step": 2440 }, { "epoch": 0.5877324986456389, "grad_norm": 1.4618853330612183, "learning_rate": 7.678248208887767e-05, "loss": 0.1547, "step": 2441 }, { "epoch": 0.5879732739420935, "grad_norm": 1.048917293548584, "learning_rate": 7.670664538569358e-05, "loss": 0.7463, "step": 2442 }, { "epoch": 0.5882140492385481, "grad_norm": 0.559017539024353, "learning_rate": 7.663082284492161e-05, "loss": 0.4063, "step": 2443 }, { "epoch": 0.5884548245350028, "grad_norm": 1.1241803169250488, "learning_rate": 7.655501451266197e-05, "loss": 0.6386, "step": 2444 }, { "epoch": 0.5886955998314573, "grad_norm": 1.063376545906067, "learning_rate": 7.647922043500637e-05, "loss": 0.7574, "step": 2445 }, { "epoch": 0.5889363751279119, "grad_norm": 0.6335359811782837, "learning_rate": 7.640344065803768e-05, "loss": 0.4932, "step": 2446 }, { "epoch": 0.5891771504243665, "grad_norm": 5.459598064422607, "learning_rate": 7.632767522783025e-05, "loss": 0.7634, "step": 2447 }, { "epoch": 0.5894179257208211, "grad_norm": 2.985459089279175, "learning_rate": 7.625192419044966e-05, "loss": 1.101, "step": 2448 }, { "epoch": 0.5896587010172756, "grad_norm": 1.2922786474227905, "learning_rate": 7.617618759195262e-05, "loss": 0.2012, "step": 2449 }, { "epoch": 0.5898994763137302, "grad_norm": 3.0108511447906494, "learning_rate": 7.61004654783872e-05, "loss": 0.4321, "step": 2450 }, { "epoch": 0.5901402516101848, "grad_norm": 1.4148467779159546, "learning_rate": 7.602475789579265e-05, "loss": 0.4081, "step": 2451 }, { "epoch": 0.5903810269066394, "grad_norm": 3.129077911376953, "learning_rate": 7.594906489019928e-05, "loss": 0.3434, "step": 2452 }, { "epoch": 0.590621802203094, "grad_norm": 2.397958993911743, "learning_rate": 7.58733865076287e-05, "loss": 0.6114, "step": 2453 }, { "epoch": 0.5908625774995485, "grad_norm": 1.5747435092926025, "learning_rate": 7.579772279409342e-05, "loss": 0.6804, "step": 2454 }, { "epoch": 0.5911033527960031, "grad_norm": 2.1680166721343994, "learning_rate": 7.572207379559721e-05, "loss": 0.5044, "step": 2455 }, { "epoch": 0.5913441280924577, "grad_norm": 0.6241942644119263, "learning_rate": 7.564643955813489e-05, "loss": 0.7446, "step": 2456 }, { "epoch": 0.5915849033889123, "grad_norm": 4.499767780303955, "learning_rate": 7.557082012769213e-05, "loss": 0.8841, "step": 2457 }, { "epoch": 0.5918256786853668, "grad_norm": 1.2002981901168823, "learning_rate": 7.549521555024582e-05, "loss": 0.2635, "step": 2458 }, { "epoch": 0.5920664539818215, "grad_norm": 1.4949264526367188, "learning_rate": 7.541962587176361e-05, "loss": 0.3554, "step": 2459 }, { "epoch": 0.5923072292782761, "grad_norm": 3.360037326812744, "learning_rate": 7.534405113820427e-05, "loss": 0.1464, "step": 2460 }, { "epoch": 0.5925480045747307, "grad_norm": 1.4905561208724976, "learning_rate": 7.526849139551744e-05, "loss": 0.3034, "step": 2461 }, { "epoch": 0.5927887798711852, "grad_norm": 1.9774373769760132, "learning_rate": 7.51929466896435e-05, "loss": 0.7877, "step": 2462 }, { "epoch": 0.5930295551676398, "grad_norm": 1.1401469707489014, "learning_rate": 7.511741706651384e-05, "loss": 0.7026, "step": 2463 }, { "epoch": 0.5932703304640944, "grad_norm": 2.067647695541382, "learning_rate": 7.504190257205075e-05, "loss": 0.5986, "step": 2464 }, { "epoch": 0.593511105760549, "grad_norm": 2.496720790863037, "learning_rate": 7.496640325216708e-05, "loss": 0.4588, "step": 2465 }, { "epoch": 0.5937518810570036, "grad_norm": 0.8668129444122314, "learning_rate": 7.489091915276664e-05, "loss": 0.2105, "step": 2466 }, { "epoch": 0.5939926563534581, "grad_norm": 2.7461793422698975, "learning_rate": 7.481545031974392e-05, "loss": 0.829, "step": 2467 }, { "epoch": 0.5942334316499127, "grad_norm": 2.240567207336426, "learning_rate": 7.473999679898414e-05, "loss": 0.3528, "step": 2468 }, { "epoch": 0.5944742069463673, "grad_norm": 3.95941162109375, "learning_rate": 7.466455863636326e-05, "loss": 0.5933, "step": 2469 }, { "epoch": 0.5947149822428219, "grad_norm": 3.9699573516845703, "learning_rate": 7.458913587774777e-05, "loss": 1.0409, "step": 2470 }, { "epoch": 0.5949557575392764, "grad_norm": 1.2216235399246216, "learning_rate": 7.451372856899494e-05, "loss": 0.2177, "step": 2471 }, { "epoch": 0.595196532835731, "grad_norm": 7.556828022003174, "learning_rate": 7.443833675595255e-05, "loss": 1.0671, "step": 2472 }, { "epoch": 0.5954373081321856, "grad_norm": 3.3300185203552246, "learning_rate": 7.436296048445899e-05, "loss": 0.4654, "step": 2473 }, { "epoch": 0.5956780834286403, "grad_norm": 1.8964426517486572, "learning_rate": 7.428759980034324e-05, "loss": 0.8291, "step": 2474 }, { "epoch": 0.5959188587250948, "grad_norm": 2.5427963733673096, "learning_rate": 7.421225474942472e-05, "loss": 0.7374, "step": 2475 }, { "epoch": 0.5961596340215494, "grad_norm": 2.4423563480377197, "learning_rate": 7.413692537751341e-05, "loss": 0.5469, "step": 2476 }, { "epoch": 0.596400409318004, "grad_norm": 0.9203125834465027, "learning_rate": 7.40616117304098e-05, "loss": 0.3301, "step": 2477 }, { "epoch": 0.5966411846144586, "grad_norm": 2.929774284362793, "learning_rate": 7.398631385390464e-05, "loss": 0.4724, "step": 2478 }, { "epoch": 0.5968819599109132, "grad_norm": 1.285556674003601, "learning_rate": 7.391103179377927e-05, "loss": 0.5685, "step": 2479 }, { "epoch": 0.5971227352073677, "grad_norm": 2.1298601627349854, "learning_rate": 7.383576559580537e-05, "loss": 0.4238, "step": 2480 }, { "epoch": 0.5973635105038223, "grad_norm": 3.565706968307495, "learning_rate": 7.37605153057449e-05, "loss": 0.8206, "step": 2481 }, { "epoch": 0.5976042858002769, "grad_norm": 1.6290020942687988, "learning_rate": 7.368528096935028e-05, "loss": 0.7003, "step": 2482 }, { "epoch": 0.5978450610967315, "grad_norm": 0.8494675755500793, "learning_rate": 7.361006263236409e-05, "loss": 0.4155, "step": 2483 }, { "epoch": 0.598085836393186, "grad_norm": 6.1263508796691895, "learning_rate": 7.353486034051933e-05, "loss": 0.3906, "step": 2484 }, { "epoch": 0.5983266116896406, "grad_norm": 2.4993395805358887, "learning_rate": 7.345967413953906e-05, "loss": 0.8447, "step": 2485 }, { "epoch": 0.5985673869860952, "grad_norm": 1.5568212270736694, "learning_rate": 7.338450407513671e-05, "loss": 0.3203, "step": 2486 }, { "epoch": 0.5988081622825498, "grad_norm": 1.6858243942260742, "learning_rate": 7.330935019301587e-05, "loss": 0.5842, "step": 2487 }, { "epoch": 0.5990489375790043, "grad_norm": 1.3046417236328125, "learning_rate": 7.323421253887022e-05, "loss": 0.4694, "step": 2488 }, { "epoch": 0.599289712875459, "grad_norm": 2.6327929496765137, "learning_rate": 7.315909115838367e-05, "loss": 1.0909, "step": 2489 }, { "epoch": 0.5995304881719136, "grad_norm": 3.176302909851074, "learning_rate": 7.308398609723019e-05, "loss": 0.6372, "step": 2490 }, { "epoch": 0.5997712634683682, "grad_norm": 0.5345314145088196, "learning_rate": 7.300889740107376e-05, "loss": 0.1974, "step": 2491 }, { "epoch": 0.6000120387648227, "grad_norm": 3.916313886642456, "learning_rate": 7.293382511556856e-05, "loss": 1.1176, "step": 2492 }, { "epoch": 0.6002528140612773, "grad_norm": 1.7653621435165405, "learning_rate": 7.285876928635864e-05, "loss": 0.6719, "step": 2493 }, { "epoch": 0.6004935893577319, "grad_norm": 1.351683497428894, "learning_rate": 7.278372995907815e-05, "loss": 0.4, "step": 2494 }, { "epoch": 0.6007343646541865, "grad_norm": 1.6684333086013794, "learning_rate": 7.270870717935119e-05, "loss": 0.5533, "step": 2495 }, { "epoch": 0.6009751399506411, "grad_norm": 2.9273340702056885, "learning_rate": 7.263370099279172e-05, "loss": 0.7937, "step": 2496 }, { "epoch": 0.6012159152470956, "grad_norm": 1.5765647888183594, "learning_rate": 7.255871144500375e-05, "loss": 1.2214, "step": 2497 }, { "epoch": 0.6014566905435502, "grad_norm": 0.8081079125404358, "learning_rate": 7.248373858158099e-05, "loss": 0.2137, "step": 2498 }, { "epoch": 0.6016974658400048, "grad_norm": 1.117992877960205, "learning_rate": 7.240878244810718e-05, "loss": 0.3442, "step": 2499 }, { "epoch": 0.6019382411364594, "grad_norm": 2.1289424896240234, "learning_rate": 7.233384309015584e-05, "loss": 0.673, "step": 2500 }, { "epoch": 0.6021790164329139, "grad_norm": 1.3627246618270874, "learning_rate": 7.22589205532902e-05, "loss": 0.6268, "step": 2501 }, { "epoch": 0.6024197917293685, "grad_norm": 1.519879937171936, "learning_rate": 7.218401488306337e-05, "loss": 0.272, "step": 2502 }, { "epoch": 0.6026605670258232, "grad_norm": 2.85306978225708, "learning_rate": 7.210912612501817e-05, "loss": 0.6996, "step": 2503 }, { "epoch": 0.6029013423222778, "grad_norm": 1.7936211824417114, "learning_rate": 7.20342543246871e-05, "loss": 0.4967, "step": 2504 }, { "epoch": 0.6031421176187323, "grad_norm": 3.3115148544311523, "learning_rate": 7.195939952759248e-05, "loss": 0.2885, "step": 2505 }, { "epoch": 0.6033828929151869, "grad_norm": 0.8520734906196594, "learning_rate": 7.188456177924605e-05, "loss": 0.7537, "step": 2506 }, { "epoch": 0.6036236682116415, "grad_norm": 0.48104944825172424, "learning_rate": 7.180974112514943e-05, "loss": 0.1885, "step": 2507 }, { "epoch": 0.6038644435080961, "grad_norm": 0.6579359769821167, "learning_rate": 7.173493761079372e-05, "loss": 0.5065, "step": 2508 }, { "epoch": 0.6041052188045507, "grad_norm": 1.0354386568069458, "learning_rate": 7.166015128165962e-05, "loss": 0.2026, "step": 2509 }, { "epoch": 0.6043459941010052, "grad_norm": 0.9975037574768066, "learning_rate": 7.158538218321739e-05, "loss": 0.3232, "step": 2510 }, { "epoch": 0.6045867693974598, "grad_norm": 1.1760191917419434, "learning_rate": 7.15106303609268e-05, "loss": 0.8285, "step": 2511 }, { "epoch": 0.6048275446939144, "grad_norm": 1.8876464366912842, "learning_rate": 7.143589586023715e-05, "loss": 0.3947, "step": 2512 }, { "epoch": 0.605068319990369, "grad_norm": 1.618282437324524, "learning_rate": 7.136117872658721e-05, "loss": 0.5223, "step": 2513 }, { "epoch": 0.6053090952868235, "grad_norm": 3.051154851913452, "learning_rate": 7.128647900540506e-05, "loss": 0.6019, "step": 2514 }, { "epoch": 0.6055498705832781, "grad_norm": 1.742828130722046, "learning_rate": 7.121179674210841e-05, "loss": 0.4666, "step": 2515 }, { "epoch": 0.6057906458797327, "grad_norm": 1.7815309762954712, "learning_rate": 7.11371319821042e-05, "loss": 0.9279, "step": 2516 }, { "epoch": 0.6060314211761874, "grad_norm": 2.8688342571258545, "learning_rate": 7.106248477078874e-05, "loss": 0.7174, "step": 2517 }, { "epoch": 0.6062721964726419, "grad_norm": 0.4165075123310089, "learning_rate": 7.09878551535478e-05, "loss": 0.286, "step": 2518 }, { "epoch": 0.6065129717690965, "grad_norm": 1.0387226343154907, "learning_rate": 7.091324317575623e-05, "loss": 0.5322, "step": 2519 }, { "epoch": 0.6067537470655511, "grad_norm": 3.8460330963134766, "learning_rate": 7.083864888277833e-05, "loss": 0.5769, "step": 2520 }, { "epoch": 0.6069945223620057, "grad_norm": 3.067915201187134, "learning_rate": 7.076407231996768e-05, "loss": 0.8518, "step": 2521 }, { "epoch": 0.6072352976584603, "grad_norm": 0.5079042315483093, "learning_rate": 7.06895135326669e-05, "loss": 0.7296, "step": 2522 }, { "epoch": 0.6074760729549148, "grad_norm": 2.1162655353546143, "learning_rate": 7.061497256620793e-05, "loss": 0.4852, "step": 2523 }, { "epoch": 0.6077168482513694, "grad_norm": 0.4626848101615906, "learning_rate": 7.054044946591184e-05, "loss": 0.4988, "step": 2524 }, { "epoch": 0.607957623547824, "grad_norm": 1.59532630443573, "learning_rate": 7.046594427708882e-05, "loss": 0.2568, "step": 2525 }, { "epoch": 0.6081983988442786, "grad_norm": 4.2776384353637695, "learning_rate": 7.039145704503829e-05, "loss": 0.9273, "step": 2526 }, { "epoch": 0.6084391741407331, "grad_norm": 2.9044902324676514, "learning_rate": 7.031698781504849e-05, "loss": 0.6728, "step": 2527 }, { "epoch": 0.6086799494371877, "grad_norm": 4.779917240142822, "learning_rate": 7.024253663239704e-05, "loss": 0.6652, "step": 2528 }, { "epoch": 0.6089207247336423, "grad_norm": 2.6108837127685547, "learning_rate": 7.016810354235038e-05, "loss": 0.6116, "step": 2529 }, { "epoch": 0.609161500030097, "grad_norm": 0.8134174942970276, "learning_rate": 7.009368859016393e-05, "loss": 0.5276, "step": 2530 }, { "epoch": 0.6094022753265514, "grad_norm": 0.9418330192565918, "learning_rate": 7.001929182108223e-05, "loss": 0.315, "step": 2531 }, { "epoch": 0.609643050623006, "grad_norm": 1.0904672145843506, "learning_rate": 6.994491328033862e-05, "loss": 0.4043, "step": 2532 }, { "epoch": 0.6098838259194607, "grad_norm": 3.003647565841675, "learning_rate": 6.987055301315546e-05, "loss": 1.1199, "step": 2533 }, { "epoch": 0.6101246012159153, "grad_norm": 5.428164958953857, "learning_rate": 6.979621106474399e-05, "loss": 0.6681, "step": 2534 }, { "epoch": 0.6103653765123699, "grad_norm": 3.2087454795837402, "learning_rate": 6.972188748030419e-05, "loss": 0.629, "step": 2535 }, { "epoch": 0.6106061518088244, "grad_norm": 4.468095779418945, "learning_rate": 6.964758230502503e-05, "loss": 0.9202, "step": 2536 }, { "epoch": 0.610846927105279, "grad_norm": 3.3546736240386963, "learning_rate": 6.957329558408423e-05, "loss": 0.1201, "step": 2537 }, { "epoch": 0.6110877024017336, "grad_norm": 1.5020190477371216, "learning_rate": 6.949902736264823e-05, "loss": 0.7108, "step": 2538 }, { "epoch": 0.6113284776981882, "grad_norm": 2.882939577102661, "learning_rate": 6.942477768587237e-05, "loss": 0.8403, "step": 2539 }, { "epoch": 0.6115692529946427, "grad_norm": 2.0902936458587646, "learning_rate": 6.935054659890052e-05, "loss": 0.5279, "step": 2540 }, { "epoch": 0.6118100282910973, "grad_norm": 0.7436076402664185, "learning_rate": 6.92763341468654e-05, "loss": 0.2855, "step": 2541 }, { "epoch": 0.6120508035875519, "grad_norm": 1.6438990831375122, "learning_rate": 6.920214037488837e-05, "loss": 0.9144, "step": 2542 }, { "epoch": 0.6122915788840065, "grad_norm": 4.628514766693115, "learning_rate": 6.912796532807934e-05, "loss": 0.4584, "step": 2543 }, { "epoch": 0.612532354180461, "grad_norm": 1.0494173765182495, "learning_rate": 6.905380905153699e-05, "loss": 0.8366, "step": 2544 }, { "epoch": 0.6127731294769156, "grad_norm": 1.7360765933990479, "learning_rate": 6.897967159034842e-05, "loss": 0.5281, "step": 2545 }, { "epoch": 0.6130139047733703, "grad_norm": 1.996323823928833, "learning_rate": 6.89055529895894e-05, "loss": 0.9566, "step": 2546 }, { "epoch": 0.6132546800698249, "grad_norm": 1.1495094299316406, "learning_rate": 6.883145329432427e-05, "loss": 0.2774, "step": 2547 }, { "epoch": 0.6134954553662795, "grad_norm": 1.1022604703903198, "learning_rate": 6.875737254960573e-05, "loss": 0.5837, "step": 2548 }, { "epoch": 0.613736230662734, "grad_norm": 1.230012059211731, "learning_rate": 6.86833108004751e-05, "loss": 0.3625, "step": 2549 }, { "epoch": 0.6139770059591886, "grad_norm": 2.1149914264678955, "learning_rate": 6.860926809196202e-05, "loss": 0.5104, "step": 2550 }, { "epoch": 0.6142177812556432, "grad_norm": 1.968151330947876, "learning_rate": 6.853524446908469e-05, "loss": 0.2779, "step": 2551 }, { "epoch": 0.6144585565520978, "grad_norm": 2.429940938949585, "learning_rate": 6.84612399768496e-05, "loss": 0.6746, "step": 2552 }, { "epoch": 0.6146993318485523, "grad_norm": 5.285004138946533, "learning_rate": 6.838725466025165e-05, "loss": 1.0503, "step": 2553 }, { "epoch": 0.6149401071450069, "grad_norm": 2.815894842147827, "learning_rate": 6.83132885642741e-05, "loss": 0.7085, "step": 2554 }, { "epoch": 0.6151808824414615, "grad_norm": 0.6075404286384583, "learning_rate": 6.823934173388851e-05, "loss": 0.772, "step": 2555 }, { "epoch": 0.6154216577379161, "grad_norm": 2.521939992904663, "learning_rate": 6.81654142140547e-05, "loss": 0.3452, "step": 2556 }, { "epoch": 0.6156624330343706, "grad_norm": 2.4346060752868652, "learning_rate": 6.809150604972079e-05, "loss": 0.4844, "step": 2557 }, { "epoch": 0.6159032083308252, "grad_norm": 3.9991800785064697, "learning_rate": 6.801761728582305e-05, "loss": 0.4445, "step": 2558 }, { "epoch": 0.6161439836272798, "grad_norm": 1.3528568744659424, "learning_rate": 6.794374796728606e-05, "loss": 0.558, "step": 2559 }, { "epoch": 0.6163847589237345, "grad_norm": 1.2810922861099243, "learning_rate": 6.786989813902256e-05, "loss": 0.1497, "step": 2560 }, { "epoch": 0.616625534220189, "grad_norm": 2.726228713989258, "learning_rate": 6.779606784593335e-05, "loss": 0.6956, "step": 2561 }, { "epoch": 0.6168663095166436, "grad_norm": 1.971341609954834, "learning_rate": 6.77222571329075e-05, "loss": 0.5738, "step": 2562 }, { "epoch": 0.6171070848130982, "grad_norm": 1.7685867547988892, "learning_rate": 6.764846604482198e-05, "loss": 0.596, "step": 2563 }, { "epoch": 0.6173478601095528, "grad_norm": 2.118589162826538, "learning_rate": 6.7574694626542e-05, "loss": 0.6178, "step": 2564 }, { "epoch": 0.6175886354060074, "grad_norm": 1.7449718713760376, "learning_rate": 6.750094292292077e-05, "loss": 0.478, "step": 2565 }, { "epoch": 0.6178294107024619, "grad_norm": 3.4893815517425537, "learning_rate": 6.742721097879944e-05, "loss": 0.7634, "step": 2566 }, { "epoch": 0.6180701859989165, "grad_norm": 1.0635044574737549, "learning_rate": 6.735349883900723e-05, "loss": 0.4162, "step": 2567 }, { "epoch": 0.6183109612953711, "grad_norm": 3.9489662647247314, "learning_rate": 6.727980654836128e-05, "loss": 0.3658, "step": 2568 }, { "epoch": 0.6185517365918257, "grad_norm": 0.335372656583786, "learning_rate": 6.720613415166666e-05, "loss": 0.3209, "step": 2569 }, { "epoch": 0.6187925118882802, "grad_norm": 5.411129474639893, "learning_rate": 6.71324816937164e-05, "loss": 0.9387, "step": 2570 }, { "epoch": 0.6190332871847348, "grad_norm": 3.5781424045562744, "learning_rate": 6.705884921929129e-05, "loss": 0.389, "step": 2571 }, { "epoch": 0.6192740624811894, "grad_norm": 1.238693356513977, "learning_rate": 6.698523677316005e-05, "loss": 0.2619, "step": 2572 }, { "epoch": 0.619514837777644, "grad_norm": 1.2711673974990845, "learning_rate": 6.691164440007927e-05, "loss": 0.5968, "step": 2573 }, { "epoch": 0.6197556130740985, "grad_norm": 1.706970453262329, "learning_rate": 6.683807214479323e-05, "loss": 0.512, "step": 2574 }, { "epoch": 0.6199963883705532, "grad_norm": 0.9450774192810059, "learning_rate": 6.676452005203406e-05, "loss": 0.1936, "step": 2575 }, { "epoch": 0.6202371636670078, "grad_norm": 2.2367053031921387, "learning_rate": 6.669098816652154e-05, "loss": 0.7918, "step": 2576 }, { "epoch": 0.6204779389634624, "grad_norm": 2.209228277206421, "learning_rate": 6.661747653296328e-05, "loss": 0.8082, "step": 2577 }, { "epoch": 0.620718714259917, "grad_norm": 2.171247720718384, "learning_rate": 6.654398519605453e-05, "loss": 0.1968, "step": 2578 }, { "epoch": 0.6209594895563715, "grad_norm": 1.3200645446777344, "learning_rate": 6.647051420047811e-05, "loss": 0.3753, "step": 2579 }, { "epoch": 0.6212002648528261, "grad_norm": 1.0066230297088623, "learning_rate": 6.63970635909046e-05, "loss": 0.2922, "step": 2580 }, { "epoch": 0.6214410401492807, "grad_norm": 1.6534557342529297, "learning_rate": 6.632363341199216e-05, "loss": 0.1991, "step": 2581 }, { "epoch": 0.6216818154457353, "grad_norm": 0.7948190569877625, "learning_rate": 6.625022370838649e-05, "loss": 0.6887, "step": 2582 }, { "epoch": 0.6219225907421898, "grad_norm": 0.8418195843696594, "learning_rate": 6.617683452472084e-05, "loss": 0.3701, "step": 2583 }, { "epoch": 0.6221633660386444, "grad_norm": 0.7696940898895264, "learning_rate": 6.610346590561597e-05, "loss": 0.2781, "step": 2584 }, { "epoch": 0.622404141335099, "grad_norm": 4.234709739685059, "learning_rate": 6.603011789568021e-05, "loss": 0.4584, "step": 2585 }, { "epoch": 0.6226449166315536, "grad_norm": 2.2976908683776855, "learning_rate": 6.595679053950933e-05, "loss": 0.9089, "step": 2586 }, { "epoch": 0.6228856919280081, "grad_norm": 3.831660270690918, "learning_rate": 6.588348388168649e-05, "loss": 1.088, "step": 2587 }, { "epoch": 0.6231264672244627, "grad_norm": 1.276307463645935, "learning_rate": 6.581019796678231e-05, "loss": 0.5542, "step": 2588 }, { "epoch": 0.6233672425209174, "grad_norm": 1.1596672534942627, "learning_rate": 6.57369328393548e-05, "loss": 0.5181, "step": 2589 }, { "epoch": 0.623608017817372, "grad_norm": 37.87815856933594, "learning_rate": 6.566368854394931e-05, "loss": 0.9077, "step": 2590 }, { "epoch": 0.6238487931138266, "grad_norm": 2.3513474464416504, "learning_rate": 6.55904651250986e-05, "loss": 0.4156, "step": 2591 }, { "epoch": 0.6240895684102811, "grad_norm": 2.0487992763519287, "learning_rate": 6.551726262732253e-05, "loss": 0.2226, "step": 2592 }, { "epoch": 0.6243303437067357, "grad_norm": 2.3378994464874268, "learning_rate": 6.54440810951285e-05, "loss": 0.1514, "step": 2593 }, { "epoch": 0.6245711190031903, "grad_norm": 0.9720037579536438, "learning_rate": 6.537092057301107e-05, "loss": 0.4916, "step": 2594 }, { "epoch": 0.6248118942996449, "grad_norm": 9.787001609802246, "learning_rate": 6.529778110545191e-05, "loss": 0.5348, "step": 2595 }, { "epoch": 0.6250526695960994, "grad_norm": 1.4247881174087524, "learning_rate": 6.522466273692006e-05, "loss": 0.6283, "step": 2596 }, { "epoch": 0.625293444892554, "grad_norm": 0.8598988652229309, "learning_rate": 6.515156551187156e-05, "loss": 0.4231, "step": 2597 }, { "epoch": 0.6255342201890086, "grad_norm": 1.4908655881881714, "learning_rate": 6.507848947474976e-05, "loss": 0.5314, "step": 2598 }, { "epoch": 0.6257749954854632, "grad_norm": 1.3599947690963745, "learning_rate": 6.500543466998508e-05, "loss": 0.6969, "step": 2599 }, { "epoch": 0.6260157707819177, "grad_norm": 2.1686439514160156, "learning_rate": 6.49324011419949e-05, "loss": 1.0609, "step": 2600 }, { "epoch": 0.6262565460783723, "grad_norm": 2.399899482727051, "learning_rate": 6.48593889351839e-05, "loss": 0.433, "step": 2601 }, { "epoch": 0.6264973213748269, "grad_norm": 2.7841548919677734, "learning_rate": 6.478639809394355e-05, "loss": 0.9582, "step": 2602 }, { "epoch": 0.6267380966712816, "grad_norm": 1.8142119646072388, "learning_rate": 6.471342866265251e-05, "loss": 0.978, "step": 2603 }, { "epoch": 0.6269788719677362, "grad_norm": 2.9826908111572266, "learning_rate": 6.464048068567637e-05, "loss": 1.4267, "step": 2604 }, { "epoch": 0.6272196472641907, "grad_norm": 1.7013874053955078, "learning_rate": 6.45675542073676e-05, "loss": 0.7787, "step": 2605 }, { "epoch": 0.6274604225606453, "grad_norm": 2.441843032836914, "learning_rate": 6.44946492720657e-05, "loss": 0.6874, "step": 2606 }, { "epoch": 0.6277011978570999, "grad_norm": 2.108856439590454, "learning_rate": 6.44217659240971e-05, "loss": 0.1561, "step": 2607 }, { "epoch": 0.6279419731535545, "grad_norm": 2.3653995990753174, "learning_rate": 6.434890420777491e-05, "loss": 1.1851, "step": 2608 }, { "epoch": 0.628182748450009, "grad_norm": 2.6391708850860596, "learning_rate": 6.427606416739932e-05, "loss": 0.7138, "step": 2609 }, { "epoch": 0.6284235237464636, "grad_norm": 2.129570484161377, "learning_rate": 6.420324584725719e-05, "loss": 0.5445, "step": 2610 }, { "epoch": 0.6286642990429182, "grad_norm": 2.4864790439605713, "learning_rate": 6.413044929162221e-05, "loss": 0.3383, "step": 2611 }, { "epoch": 0.6289050743393728, "grad_norm": 1.6433722972869873, "learning_rate": 6.405767454475492e-05, "loss": 0.5752, "step": 2612 }, { "epoch": 0.6291458496358273, "grad_norm": 1.772709846496582, "learning_rate": 6.398492165090246e-05, "loss": 0.4523, "step": 2613 }, { "epoch": 0.6293866249322819, "grad_norm": 1.9673948287963867, "learning_rate": 6.391219065429882e-05, "loss": 0.7943, "step": 2614 }, { "epoch": 0.6296274002287365, "grad_norm": 1.1356581449508667, "learning_rate": 6.383948159916453e-05, "loss": 0.4172, "step": 2615 }, { "epoch": 0.6298681755251911, "grad_norm": 1.4601411819458008, "learning_rate": 6.376679452970689e-05, "loss": 0.6976, "step": 2616 }, { "epoch": 0.6301089508216458, "grad_norm": 1.9286946058273315, "learning_rate": 6.369412949011983e-05, "loss": 0.4763, "step": 2617 }, { "epoch": 0.6303497261181003, "grad_norm": 44.064327239990234, "learning_rate": 6.362148652458382e-05, "loss": 0.3406, "step": 2618 }, { "epoch": 0.6305905014145549, "grad_norm": 1.2986640930175781, "learning_rate": 6.354886567726596e-05, "loss": 0.3705, "step": 2619 }, { "epoch": 0.6308312767110095, "grad_norm": 2.005955219268799, "learning_rate": 6.347626699231995e-05, "loss": 0.7816, "step": 2620 }, { "epoch": 0.6310720520074641, "grad_norm": 5.238460063934326, "learning_rate": 6.340369051388583e-05, "loss": 0.8997, "step": 2621 }, { "epoch": 0.6313128273039186, "grad_norm": 2.330120801925659, "learning_rate": 6.33311362860904e-05, "loss": 1.2146, "step": 2622 }, { "epoch": 0.6315536026003732, "grad_norm": 1.597697138786316, "learning_rate": 6.325860435304668e-05, "loss": 0.5971, "step": 2623 }, { "epoch": 0.6317943778968278, "grad_norm": 3.000946521759033, "learning_rate": 6.318609475885427e-05, "loss": 1.0077, "step": 2624 }, { "epoch": 0.6320351531932824, "grad_norm": 1.6477075815200806, "learning_rate": 6.311360754759923e-05, "loss": 0.2647, "step": 2625 }, { "epoch": 0.6322759284897369, "grad_norm": 3.0683581829071045, "learning_rate": 6.30411427633539e-05, "loss": 0.5957, "step": 2626 }, { "epoch": 0.6325167037861915, "grad_norm": 2.284569025039673, "learning_rate": 6.296870045017704e-05, "loss": 0.5671, "step": 2627 }, { "epoch": 0.6327574790826461, "grad_norm": 1.2169504165649414, "learning_rate": 6.28962806521137e-05, "loss": 0.8968, "step": 2628 }, { "epoch": 0.6329982543791007, "grad_norm": 0.7323624491691589, "learning_rate": 6.282388341319534e-05, "loss": 0.2924, "step": 2629 }, { "epoch": 0.6332390296755553, "grad_norm": 1.0401545763015747, "learning_rate": 6.275150877743968e-05, "loss": 0.4025, "step": 2630 }, { "epoch": 0.6334798049720098, "grad_norm": 1.7448832988739014, "learning_rate": 6.267915678885054e-05, "loss": 0.5393, "step": 2631 }, { "epoch": 0.6337205802684645, "grad_norm": 2.505074977874756, "learning_rate": 6.260682749141816e-05, "loss": 0.5271, "step": 2632 }, { "epoch": 0.6339613555649191, "grad_norm": 1.562775731086731, "learning_rate": 6.253452092911893e-05, "loss": 0.6703, "step": 2633 }, { "epoch": 0.6342021308613737, "grad_norm": 7.656639099121094, "learning_rate": 6.24622371459154e-05, "loss": 0.6791, "step": 2634 }, { "epoch": 0.6344429061578282, "grad_norm": 1.8805732727050781, "learning_rate": 6.238997618575625e-05, "loss": 0.5853, "step": 2635 }, { "epoch": 0.6346836814542828, "grad_norm": 2.444486141204834, "learning_rate": 6.231773809257631e-05, "loss": 0.463, "step": 2636 }, { "epoch": 0.6349244567507374, "grad_norm": 2.5845704078674316, "learning_rate": 6.224552291029648e-05, "loss": 1.1734, "step": 2637 }, { "epoch": 0.635165232047192, "grad_norm": 2.266542434692383, "learning_rate": 6.217333068282383e-05, "loss": 0.6734, "step": 2638 }, { "epoch": 0.6354060073436465, "grad_norm": 1.0296664237976074, "learning_rate": 6.210116145405132e-05, "loss": 0.5709, "step": 2639 }, { "epoch": 0.6356467826401011, "grad_norm": 1.0958337783813477, "learning_rate": 6.202901526785806e-05, "loss": 0.4469, "step": 2640 }, { "epoch": 0.6358875579365557, "grad_norm": 4.823302745819092, "learning_rate": 6.195689216810903e-05, "loss": 0.5806, "step": 2641 }, { "epoch": 0.6361283332330103, "grad_norm": 2.3272790908813477, "learning_rate": 6.188479219865529e-05, "loss": 0.7147, "step": 2642 }, { "epoch": 0.6363691085294648, "grad_norm": 1.8934577703475952, "learning_rate": 6.181271540333379e-05, "loss": 0.4858, "step": 2643 }, { "epoch": 0.6366098838259194, "grad_norm": 3.648973226547241, "learning_rate": 6.174066182596734e-05, "loss": 0.6624, "step": 2644 }, { "epoch": 0.636850659122374, "grad_norm": 1.5058507919311523, "learning_rate": 6.166863151036468e-05, "loss": 0.4902, "step": 2645 }, { "epoch": 0.6370914344188287, "grad_norm": 1.450817584991455, "learning_rate": 6.159662450032046e-05, "loss": 0.5773, "step": 2646 }, { "epoch": 0.6373322097152833, "grad_norm": 1.6548353433609009, "learning_rate": 6.152464083961506e-05, "loss": 0.7833, "step": 2647 }, { "epoch": 0.6375729850117378, "grad_norm": 2.030325174331665, "learning_rate": 6.145268057201473e-05, "loss": 0.3633, "step": 2648 }, { "epoch": 0.6378137603081924, "grad_norm": 0.9832929968833923, "learning_rate": 6.138074374127141e-05, "loss": 0.8045, "step": 2649 }, { "epoch": 0.638054535604647, "grad_norm": 3.2297143936157227, "learning_rate": 6.130883039112292e-05, "loss": 0.8928, "step": 2650 }, { "epoch": 0.6382953109011016, "grad_norm": 1.0606904029846191, "learning_rate": 6.123694056529277e-05, "loss": 0.2497, "step": 2651 }, { "epoch": 0.6385360861975561, "grad_norm": 1.3311941623687744, "learning_rate": 6.116507430749005e-05, "loss": 1.0808, "step": 2652 }, { "epoch": 0.6387768614940107, "grad_norm": 1.9613722562789917, "learning_rate": 6.109323166140968e-05, "loss": 1.0504, "step": 2653 }, { "epoch": 0.6390176367904653, "grad_norm": 1.6116615533828735, "learning_rate": 6.102141267073207e-05, "loss": 0.5613, "step": 2654 }, { "epoch": 0.6392584120869199, "grad_norm": 1.1715492010116577, "learning_rate": 6.094961737912339e-05, "loss": 0.3594, "step": 2655 }, { "epoch": 0.6394991873833744, "grad_norm": 1.0914250612258911, "learning_rate": 6.087784583023535e-05, "loss": 0.4884, "step": 2656 }, { "epoch": 0.639739962679829, "grad_norm": 3.499825954437256, "learning_rate": 6.080609806770516e-05, "loss": 1.0449, "step": 2657 }, { "epoch": 0.6399807379762836, "grad_norm": 1.0770723819732666, "learning_rate": 6.073437413515566e-05, "loss": 0.5479, "step": 2658 }, { "epoch": 0.6402215132727382, "grad_norm": 1.7570230960845947, "learning_rate": 6.0662674076195194e-05, "loss": 0.3681, "step": 2659 }, { "epoch": 0.6404622885691929, "grad_norm": 1.4941871166229248, "learning_rate": 6.059099793441746e-05, "loss": 0.4912, "step": 2660 }, { "epoch": 0.6407030638656473, "grad_norm": 2.4457526206970215, "learning_rate": 6.05193457534018e-05, "loss": 0.7875, "step": 2661 }, { "epoch": 0.640943839162102, "grad_norm": 0.5564669966697693, "learning_rate": 6.044771757671286e-05, "loss": 0.4049, "step": 2662 }, { "epoch": 0.6411846144585566, "grad_norm": 2.4950759410858154, "learning_rate": 6.037611344790073e-05, "loss": 0.7389, "step": 2663 }, { "epoch": 0.6414253897550112, "grad_norm": 4.833079814910889, "learning_rate": 6.030453341050093e-05, "loss": 0.9588, "step": 2664 }, { "epoch": 0.6416661650514657, "grad_norm": 5.021981716156006, "learning_rate": 6.023297750803423e-05, "loss": 0.9724, "step": 2665 }, { "epoch": 0.6419069403479203, "grad_norm": 1.1995772123336792, "learning_rate": 6.0161445784006845e-05, "loss": 0.6185, "step": 2666 }, { "epoch": 0.6421477156443749, "grad_norm": 1.3051928281784058, "learning_rate": 6.008993828191013e-05, "loss": 0.382, "step": 2667 }, { "epoch": 0.6423884909408295, "grad_norm": 3.9973623752593994, "learning_rate": 6.001845504522086e-05, "loss": 0.4625, "step": 2668 }, { "epoch": 0.642629266237284, "grad_norm": 1.3775721788406372, "learning_rate": 5.994699611740102e-05, "loss": 0.5969, "step": 2669 }, { "epoch": 0.6428700415337386, "grad_norm": 1.1527438163757324, "learning_rate": 5.987556154189777e-05, "loss": 0.6209, "step": 2670 }, { "epoch": 0.6431108168301932, "grad_norm": 1.8281904458999634, "learning_rate": 5.98041513621435e-05, "loss": 0.636, "step": 2671 }, { "epoch": 0.6433515921266478, "grad_norm": 2.8507192134857178, "learning_rate": 5.973276562155581e-05, "loss": 0.8928, "step": 2672 }, { "epoch": 0.6435923674231024, "grad_norm": 2.5010929107666016, "learning_rate": 5.9661404363537287e-05, "loss": 1.1794, "step": 2673 }, { "epoch": 0.6438331427195569, "grad_norm": 2.6095519065856934, "learning_rate": 5.959006763147584e-05, "loss": 1.2778, "step": 2674 }, { "epoch": 0.6440739180160115, "grad_norm": 1.2294814586639404, "learning_rate": 5.951875546874428e-05, "loss": 0.3187, "step": 2675 }, { "epoch": 0.6443146933124662, "grad_norm": 4.2607316970825195, "learning_rate": 5.9447467918700614e-05, "loss": 0.4591, "step": 2676 }, { "epoch": 0.6445554686089208, "grad_norm": 1.8376699686050415, "learning_rate": 5.9376205024687835e-05, "loss": 0.6134, "step": 2677 }, { "epoch": 0.6447962439053753, "grad_norm": 1.8157228231430054, "learning_rate": 5.9304966830033907e-05, "loss": 0.6103, "step": 2678 }, { "epoch": 0.6450370192018299, "grad_norm": 1.8555665016174316, "learning_rate": 5.923375337805186e-05, "loss": 0.5274, "step": 2679 }, { "epoch": 0.6452777944982845, "grad_norm": 1.7795042991638184, "learning_rate": 5.916256471203958e-05, "loss": 0.6227, "step": 2680 }, { "epoch": 0.6455185697947391, "grad_norm": 2.5843842029571533, "learning_rate": 5.909140087527996e-05, "loss": 1.0466, "step": 2681 }, { "epoch": 0.6457593450911936, "grad_norm": 3.1203784942626953, "learning_rate": 5.9020261911040796e-05, "loss": 0.874, "step": 2682 }, { "epoch": 0.6460001203876482, "grad_norm": 2.341921091079712, "learning_rate": 5.89491478625747e-05, "loss": 0.3058, "step": 2683 }, { "epoch": 0.6462408956841028, "grad_norm": 3.351788282394409, "learning_rate": 5.8878058773119185e-05, "loss": 0.7692, "step": 2684 }, { "epoch": 0.6464816709805574, "grad_norm": 3.12622332572937, "learning_rate": 5.880699468589661e-05, "loss": 1.2673, "step": 2685 }, { "epoch": 0.646722446277012, "grad_norm": 1.2435944080352783, "learning_rate": 5.8735955644114046e-05, "loss": 0.7739, "step": 2686 }, { "epoch": 0.6469632215734665, "grad_norm": 0.624167799949646, "learning_rate": 5.866494169096348e-05, "loss": 0.7543, "step": 2687 }, { "epoch": 0.6472039968699211, "grad_norm": 1.9075957536697388, "learning_rate": 5.8593952869621436e-05, "loss": 0.4008, "step": 2688 }, { "epoch": 0.6474447721663757, "grad_norm": 2.9206345081329346, "learning_rate": 5.852298922324935e-05, "loss": 1.4938, "step": 2689 }, { "epoch": 0.6476855474628304, "grad_norm": 2.629241704940796, "learning_rate": 5.8452050794993275e-05, "loss": 0.55, "step": 2690 }, { "epoch": 0.6479263227592849, "grad_norm": 1.5018374919891357, "learning_rate": 5.8381137627983915e-05, "loss": 0.6912, "step": 2691 }, { "epoch": 0.6481670980557395, "grad_norm": 4.23275089263916, "learning_rate": 5.831024976533668e-05, "loss": 0.6019, "step": 2692 }, { "epoch": 0.6484078733521941, "grad_norm": 1.633299708366394, "learning_rate": 5.823938725015148e-05, "loss": 0.5051, "step": 2693 }, { "epoch": 0.6486486486486487, "grad_norm": 7.270058631896973, "learning_rate": 5.816855012551291e-05, "loss": 0.6392, "step": 2694 }, { "epoch": 0.6488894239451032, "grad_norm": 1.3112475872039795, "learning_rate": 5.809773843449011e-05, "loss": 0.4469, "step": 2695 }, { "epoch": 0.6491301992415578, "grad_norm": 1.903131365776062, "learning_rate": 5.802695222013676e-05, "loss": 0.381, "step": 2696 }, { "epoch": 0.6493709745380124, "grad_norm": 2.8745779991149902, "learning_rate": 5.795619152549102e-05, "loss": 0.8397, "step": 2697 }, { "epoch": 0.649611749834467, "grad_norm": 5.189337730407715, "learning_rate": 5.78854563935756e-05, "loss": 0.4703, "step": 2698 }, { "epoch": 0.6498525251309216, "grad_norm": 1.0739030838012695, "learning_rate": 5.781474686739754e-05, "loss": 0.6484, "step": 2699 }, { "epoch": 0.6500933004273761, "grad_norm": 1.2836296558380127, "learning_rate": 5.7744062989948464e-05, "loss": 0.5722, "step": 2700 }, { "epoch": 0.6503340757238307, "grad_norm": 7.686650276184082, "learning_rate": 5.767340480420426e-05, "loss": 0.4963, "step": 2701 }, { "epoch": 0.6505748510202853, "grad_norm": 2.3548178672790527, "learning_rate": 5.760277235312529e-05, "loss": 0.757, "step": 2702 }, { "epoch": 0.65081562631674, "grad_norm": 1.2598122358322144, "learning_rate": 5.753216567965626e-05, "loss": 0.397, "step": 2703 }, { "epoch": 0.6510564016131944, "grad_norm": 2.0660560131073, "learning_rate": 5.746158482672617e-05, "loss": 0.8815, "step": 2704 }, { "epoch": 0.6512971769096491, "grad_norm": 1.225748896598816, "learning_rate": 5.73910298372484e-05, "loss": 0.8635, "step": 2705 }, { "epoch": 0.6515379522061037, "grad_norm": 1.400314211845398, "learning_rate": 5.7320500754120434e-05, "loss": 0.5746, "step": 2706 }, { "epoch": 0.6517787275025583, "grad_norm": 2.5364537239074707, "learning_rate": 5.724999762022416e-05, "loss": 0.657, "step": 2707 }, { "epoch": 0.6520195027990128, "grad_norm": 2.604710340499878, "learning_rate": 5.717952047842571e-05, "loss": 0.564, "step": 2708 }, { "epoch": 0.6522602780954674, "grad_norm": 1.6602225303649902, "learning_rate": 5.710906937157523e-05, "loss": 0.7511, "step": 2709 }, { "epoch": 0.652501053391922, "grad_norm": 1.7182340621948242, "learning_rate": 5.7038644342507205e-05, "loss": 0.3137, "step": 2710 }, { "epoch": 0.6527418286883766, "grad_norm": 1.6290512084960938, "learning_rate": 5.6968245434040225e-05, "loss": 0.8388, "step": 2711 }, { "epoch": 0.6529826039848312, "grad_norm": 1.5818265676498413, "learning_rate": 5.689787268897697e-05, "loss": 0.3487, "step": 2712 }, { "epoch": 0.6532233792812857, "grad_norm": 3.120393753051758, "learning_rate": 5.682752615010427e-05, "loss": 0.9698, "step": 2713 }, { "epoch": 0.6534641545777403, "grad_norm": 1.175986886024475, "learning_rate": 5.6757205860192905e-05, "loss": 0.44, "step": 2714 }, { "epoch": 0.6537049298741949, "grad_norm": 1.2981712818145752, "learning_rate": 5.6686911861997795e-05, "loss": 0.2365, "step": 2715 }, { "epoch": 0.6539457051706495, "grad_norm": 1.5336500406265259, "learning_rate": 5.66166441982579e-05, "loss": 0.9387, "step": 2716 }, { "epoch": 0.654186480467104, "grad_norm": 1.408539056777954, "learning_rate": 5.654640291169604e-05, "loss": 0.6778, "step": 2717 }, { "epoch": 0.6544272557635586, "grad_norm": 0.782248854637146, "learning_rate": 5.647618804501915e-05, "loss": 0.803, "step": 2718 }, { "epoch": 0.6546680310600133, "grad_norm": 2.8610918521881104, "learning_rate": 5.640599964091791e-05, "loss": 0.828, "step": 2719 }, { "epoch": 0.6549088063564679, "grad_norm": 2.616344690322876, "learning_rate": 5.6335837742067145e-05, "loss": 0.8763, "step": 2720 }, { "epoch": 0.6551495816529224, "grad_norm": 2.1979098320007324, "learning_rate": 5.6265702391125444e-05, "loss": 0.383, "step": 2721 }, { "epoch": 0.655390356949377, "grad_norm": 2.1095833778381348, "learning_rate": 5.6195593630735185e-05, "loss": 0.6834, "step": 2722 }, { "epoch": 0.6556311322458316, "grad_norm": 3.0333285331726074, "learning_rate": 5.61255115035227e-05, "loss": 0.4893, "step": 2723 }, { "epoch": 0.6558719075422862, "grad_norm": 3.033856153488159, "learning_rate": 5.60554560520981e-05, "loss": 0.6269, "step": 2724 }, { "epoch": 0.6561126828387407, "grad_norm": 1.2351047992706299, "learning_rate": 5.5985427319055195e-05, "loss": 0.2402, "step": 2725 }, { "epoch": 0.6563534581351953, "grad_norm": 0.572399914264679, "learning_rate": 5.5915425346971683e-05, "loss": 0.4192, "step": 2726 }, { "epoch": 0.6565942334316499, "grad_norm": 16.815656661987305, "learning_rate": 5.584545017840885e-05, "loss": 0.7806, "step": 2727 }, { "epoch": 0.6568350087281045, "grad_norm": 1.1614093780517578, "learning_rate": 5.577550185591174e-05, "loss": 0.7885, "step": 2728 }, { "epoch": 0.6570757840245591, "grad_norm": 0.8286291360855103, "learning_rate": 5.570558042200923e-05, "loss": 0.3164, "step": 2729 }, { "epoch": 0.6573165593210136, "grad_norm": 1.0357860326766968, "learning_rate": 5.563568591921358e-05, "loss": 0.2793, "step": 2730 }, { "epoch": 0.6575573346174682, "grad_norm": 1.7955443859100342, "learning_rate": 5.5565818390020886e-05, "loss": 0.4327, "step": 2731 }, { "epoch": 0.6577981099139228, "grad_norm": 25.76378631591797, "learning_rate": 5.5495977876910675e-05, "loss": 1.1611, "step": 2732 }, { "epoch": 0.6580388852103775, "grad_norm": 1.9328374862670898, "learning_rate": 5.542616442234618e-05, "loss": 0.4974, "step": 2733 }, { "epoch": 0.658279660506832, "grad_norm": 1.2386356592178345, "learning_rate": 5.535637806877419e-05, "loss": 0.8787, "step": 2734 }, { "epoch": 0.6585204358032866, "grad_norm": 3.446852684020996, "learning_rate": 5.5286618858624874e-05, "loss": 0.4847, "step": 2735 }, { "epoch": 0.6587612110997412, "grad_norm": 1.4139548540115356, "learning_rate": 5.5216886834312e-05, "loss": 0.6242, "step": 2736 }, { "epoch": 0.6590019863961958, "grad_norm": 0.9220794439315796, "learning_rate": 5.51471820382329e-05, "loss": 0.3538, "step": 2737 }, { "epoch": 0.6592427616926503, "grad_norm": 0.6702575087547302, "learning_rate": 5.507750451276814e-05, "loss": 0.4045, "step": 2738 }, { "epoch": 0.6594835369891049, "grad_norm": 0.9865245223045349, "learning_rate": 5.500785430028188e-05, "loss": 0.4853, "step": 2739 }, { "epoch": 0.6597243122855595, "grad_norm": 0.48536545038223267, "learning_rate": 5.4938231443121546e-05, "loss": 0.2637, "step": 2740 }, { "epoch": 0.6599650875820141, "grad_norm": 10.391051292419434, "learning_rate": 5.4868635983618014e-05, "loss": 0.4469, "step": 2741 }, { "epoch": 0.6602058628784687, "grad_norm": 1.1766020059585571, "learning_rate": 5.4799067964085526e-05, "loss": 0.3332, "step": 2742 }, { "epoch": 0.6604466381749232, "grad_norm": 1.8510931730270386, "learning_rate": 5.4729527426821514e-05, "loss": 0.6154, "step": 2743 }, { "epoch": 0.6606874134713778, "grad_norm": 1.353156328201294, "learning_rate": 5.466001441410682e-05, "loss": 0.3427, "step": 2744 }, { "epoch": 0.6609281887678324, "grad_norm": 1.8044596910476685, "learning_rate": 5.459052896820551e-05, "loss": 0.2089, "step": 2745 }, { "epoch": 0.661168964064287, "grad_norm": 2.0654067993164062, "learning_rate": 5.4521071131364906e-05, "loss": 0.781, "step": 2746 }, { "epoch": 0.6614097393607415, "grad_norm": 1.7442854642868042, "learning_rate": 5.4451640945815564e-05, "loss": 0.3432, "step": 2747 }, { "epoch": 0.6616505146571962, "grad_norm": 1.4665995836257935, "learning_rate": 5.438223845377111e-05, "loss": 0.5632, "step": 2748 }, { "epoch": 0.6618912899536508, "grad_norm": 3.585455894470215, "learning_rate": 5.431286369742844e-05, "loss": 1.3084, "step": 2749 }, { "epoch": 0.6621320652501054, "grad_norm": 3.5575718879699707, "learning_rate": 5.424351671896761e-05, "loss": 0.3184, "step": 2750 }, { "epoch": 0.6623728405465599, "grad_norm": 17.34886932373047, "learning_rate": 5.4174197560551685e-05, "loss": 1.1173, "step": 2751 }, { "epoch": 0.6626136158430145, "grad_norm": 2.6678974628448486, "learning_rate": 5.4104906264326884e-05, "loss": 0.6505, "step": 2752 }, { "epoch": 0.6628543911394691, "grad_norm": 4.722652912139893, "learning_rate": 5.403564287242248e-05, "loss": 1.3932, "step": 2753 }, { "epoch": 0.6630951664359237, "grad_norm": 1.4697628021240234, "learning_rate": 5.396640742695076e-05, "loss": 0.5875, "step": 2754 }, { "epoch": 0.6633359417323783, "grad_norm": 2.3093106746673584, "learning_rate": 5.389719997000708e-05, "loss": 0.8362, "step": 2755 }, { "epoch": 0.6635767170288328, "grad_norm": 1.6689960956573486, "learning_rate": 5.382802054366966e-05, "loss": 0.3395, "step": 2756 }, { "epoch": 0.6638174923252874, "grad_norm": 1.0874135494232178, "learning_rate": 5.37588691899998e-05, "loss": 0.757, "step": 2757 }, { "epoch": 0.664058267621742, "grad_norm": 1.348419427871704, "learning_rate": 5.3689745951041626e-05, "loss": 0.6488, "step": 2758 }, { "epoch": 0.6642990429181966, "grad_norm": 2.6158974170684814, "learning_rate": 5.3620650868822256e-05, "loss": 0.251, "step": 2759 }, { "epoch": 0.6645398182146511, "grad_norm": 1.7711026668548584, "learning_rate": 5.3551583985351636e-05, "loss": 0.607, "step": 2760 }, { "epoch": 0.6647805935111057, "grad_norm": 1.2356005907058716, "learning_rate": 5.348254534262262e-05, "loss": 0.2628, "step": 2761 }, { "epoch": 0.6650213688075604, "grad_norm": 4.986912727355957, "learning_rate": 5.3413534982610836e-05, "loss": 0.4364, "step": 2762 }, { "epoch": 0.665262144104015, "grad_norm": 2.007636785507202, "learning_rate": 5.3344552947274776e-05, "loss": 0.6385, "step": 2763 }, { "epoch": 0.6655029194004695, "grad_norm": 1.343567132949829, "learning_rate": 5.32755992785556e-05, "loss": 0.4478, "step": 2764 }, { "epoch": 0.6657436946969241, "grad_norm": 0.5190923810005188, "learning_rate": 5.320667401837738e-05, "loss": 0.5346, "step": 2765 }, { "epoch": 0.6659844699933787, "grad_norm": 1.11277437210083, "learning_rate": 5.313777720864674e-05, "loss": 0.2615, "step": 2766 }, { "epoch": 0.6662252452898333, "grad_norm": 1.8751391172409058, "learning_rate": 5.3068908891253134e-05, "loss": 0.632, "step": 2767 }, { "epoch": 0.6664660205862879, "grad_norm": 1.2252461910247803, "learning_rate": 5.3000069108068674e-05, "loss": 0.2374, "step": 2768 }, { "epoch": 0.6667067958827424, "grad_norm": 0.743096113204956, "learning_rate": 5.293125790094809e-05, "loss": 0.4381, "step": 2769 }, { "epoch": 0.666947571179197, "grad_norm": 1.818084478378296, "learning_rate": 5.286247531172877e-05, "loss": 0.3025, "step": 2770 }, { "epoch": 0.6671883464756516, "grad_norm": 1.5191317796707153, "learning_rate": 5.2793721382230624e-05, "loss": 0.8121, "step": 2771 }, { "epoch": 0.6674291217721062, "grad_norm": 1.5100692510604858, "learning_rate": 5.272499615425624e-05, "loss": 0.2726, "step": 2772 }, { "epoch": 0.6676698970685607, "grad_norm": 3.111989974975586, "learning_rate": 5.2656299669590756e-05, "loss": 0.9423, "step": 2773 }, { "epoch": 0.6679106723650153, "grad_norm": 2.021303415298462, "learning_rate": 5.2587631970001697e-05, "loss": 0.6443, "step": 2774 }, { "epoch": 0.6681514476614699, "grad_norm": 3.0573856830596924, "learning_rate": 5.251899309723921e-05, "loss": 0.7831, "step": 2775 }, { "epoch": 0.6683922229579246, "grad_norm": 3.3146016597747803, "learning_rate": 5.2450383093035905e-05, "loss": 0.4461, "step": 2776 }, { "epoch": 0.668632998254379, "grad_norm": 2.406733989715576, "learning_rate": 5.2381801999106806e-05, "loss": 0.9433, "step": 2777 }, { "epoch": 0.6688737735508337, "grad_norm": 1.2738914489746094, "learning_rate": 5.2313249857149414e-05, "loss": 0.5913, "step": 2778 }, { "epoch": 0.6691145488472883, "grad_norm": 1.1874566078186035, "learning_rate": 5.2244726708843516e-05, "loss": 0.6504, "step": 2779 }, { "epoch": 0.6693553241437429, "grad_norm": 2.1480422019958496, "learning_rate": 5.217623259585136e-05, "loss": 0.9203, "step": 2780 }, { "epoch": 0.6695960994401975, "grad_norm": 3.107542037963867, "learning_rate": 5.2107767559817586e-05, "loss": 0.4462, "step": 2781 }, { "epoch": 0.669836874736652, "grad_norm": 1.504056453704834, "learning_rate": 5.2039331642369004e-05, "loss": 0.6573, "step": 2782 }, { "epoch": 0.6700776500331066, "grad_norm": 0.6759874224662781, "learning_rate": 5.197092488511482e-05, "loss": 0.284, "step": 2783 }, { "epoch": 0.6703184253295612, "grad_norm": 0.775605320930481, "learning_rate": 5.1902547329646536e-05, "loss": 0.5416, "step": 2784 }, { "epoch": 0.6705592006260158, "grad_norm": 0.199252650141716, "learning_rate": 5.1834199017537834e-05, "loss": 0.4752, "step": 2785 }, { "epoch": 0.6707999759224703, "grad_norm": 3.240574359893799, "learning_rate": 5.176587999034468e-05, "loss": 0.6012, "step": 2786 }, { "epoch": 0.6710407512189249, "grad_norm": 3.1757545471191406, "learning_rate": 5.1697590289605136e-05, "loss": 1.1006, "step": 2787 }, { "epoch": 0.6712815265153795, "grad_norm": 1.6893806457519531, "learning_rate": 5.162932995683951e-05, "loss": 0.7829, "step": 2788 }, { "epoch": 0.6715223018118341, "grad_norm": 1.8709641695022583, "learning_rate": 5.156109903355031e-05, "loss": 0.9529, "step": 2789 }, { "epoch": 0.6717630771082886, "grad_norm": 1.4354405403137207, "learning_rate": 5.1492897561221976e-05, "loss": 0.3602, "step": 2790 }, { "epoch": 0.6720038524047433, "grad_norm": 1.0905989408493042, "learning_rate": 5.142472558132125e-05, "loss": 0.6715, "step": 2791 }, { "epoch": 0.6722446277011979, "grad_norm": 0.8870560526847839, "learning_rate": 5.1356583135296744e-05, "loss": 0.3977, "step": 2792 }, { "epoch": 0.6724854029976525, "grad_norm": 2.73464298248291, "learning_rate": 5.1288470264579327e-05, "loss": 0.8096, "step": 2793 }, { "epoch": 0.6727261782941071, "grad_norm": 3.191851854324341, "learning_rate": 5.122038701058176e-05, "loss": 0.8286, "step": 2794 }, { "epoch": 0.6729669535905616, "grad_norm": 0.7643956542015076, "learning_rate": 5.115233341469877e-05, "loss": 0.3281, "step": 2795 }, { "epoch": 0.6732077288870162, "grad_norm": 4.106560230255127, "learning_rate": 5.108430951830716e-05, "loss": 0.7662, "step": 2796 }, { "epoch": 0.6734485041834708, "grad_norm": 1.1065987348556519, "learning_rate": 5.101631536276552e-05, "loss": 0.9248, "step": 2797 }, { "epoch": 0.6736892794799254, "grad_norm": 2.439703941345215, "learning_rate": 5.094835098941451e-05, "loss": 1.0613, "step": 2798 }, { "epoch": 0.6739300547763799, "grad_norm": 1.7999013662338257, "learning_rate": 5.088041643957664e-05, "loss": 0.4121, "step": 2799 }, { "epoch": 0.6741708300728345, "grad_norm": 2.1599056720733643, "learning_rate": 5.081251175455617e-05, "loss": 0.3685, "step": 2800 }, { "epoch": 0.6744116053692891, "grad_norm": 3.4274089336395264, "learning_rate": 5.0744636975639424e-05, "loss": 0.3434, "step": 2801 }, { "epoch": 0.6746523806657437, "grad_norm": 1.6600308418273926, "learning_rate": 5.06767921440944e-05, "loss": 0.402, "step": 2802 }, { "epoch": 0.6748931559621982, "grad_norm": 1.426318883895874, "learning_rate": 5.0608977301170845e-05, "loss": 0.2329, "step": 2803 }, { "epoch": 0.6751339312586528, "grad_norm": 1.2392008304595947, "learning_rate": 5.05411924881004e-05, "loss": 0.5493, "step": 2804 }, { "epoch": 0.6753747065551075, "grad_norm": 0.7665623426437378, "learning_rate": 5.047343774609632e-05, "loss": 0.3614, "step": 2805 }, { "epoch": 0.6756154818515621, "grad_norm": 2.566469192504883, "learning_rate": 5.040571311635367e-05, "loss": 0.7117, "step": 2806 }, { "epoch": 0.6758562571480166, "grad_norm": 1.9703136682510376, "learning_rate": 5.033801864004923e-05, "loss": 0.7024, "step": 2807 }, { "epoch": 0.6760970324444712, "grad_norm": 0.3591214716434479, "learning_rate": 5.0270354358341307e-05, "loss": 0.1396, "step": 2808 }, { "epoch": 0.6763378077409258, "grad_norm": 3.92927622795105, "learning_rate": 5.020272031236996e-05, "loss": 0.7112, "step": 2809 }, { "epoch": 0.6765785830373804, "grad_norm": 10.165884971618652, "learning_rate": 5.013511654325689e-05, "loss": 0.3902, "step": 2810 }, { "epoch": 0.676819358333835, "grad_norm": 4.904458045959473, "learning_rate": 5.0067543092105284e-05, "loss": 0.9305, "step": 2811 }, { "epoch": 0.6770601336302895, "grad_norm": 3.849393844604492, "learning_rate": 5.000000000000002e-05, "loss": 0.9527, "step": 2812 }, { "epoch": 0.6773009089267441, "grad_norm": 1.6533546447753906, "learning_rate": 4.993248730800737e-05, "loss": 0.2365, "step": 2813 }, { "epoch": 0.6775416842231987, "grad_norm": 2.609186887741089, "learning_rate": 4.986500505717524e-05, "loss": 0.8003, "step": 2814 }, { "epoch": 0.6777824595196533, "grad_norm": 4.835500240325928, "learning_rate": 4.9797553288533036e-05, "loss": 1.3338, "step": 2815 }, { "epoch": 0.6780232348161078, "grad_norm": 4.737950325012207, "learning_rate": 4.9730132043091494e-05, "loss": 0.9067, "step": 2816 }, { "epoch": 0.6782640101125624, "grad_norm": 0.9720152616500854, "learning_rate": 4.9662741361842934e-05, "loss": 0.5726, "step": 2817 }, { "epoch": 0.678504785409017, "grad_norm": 2.1223130226135254, "learning_rate": 4.9595381285761036e-05, "loss": 0.2327, "step": 2818 }, { "epoch": 0.6787455607054717, "grad_norm": 1.109189748764038, "learning_rate": 4.9528051855800874e-05, "loss": 0.8678, "step": 2819 }, { "epoch": 0.6789863360019261, "grad_norm": 1.6633493900299072, "learning_rate": 4.946075311289894e-05, "loss": 0.1905, "step": 2820 }, { "epoch": 0.6792271112983808, "grad_norm": 2.95998477935791, "learning_rate": 4.939348509797293e-05, "loss": 0.6686, "step": 2821 }, { "epoch": 0.6794678865948354, "grad_norm": 1.455351710319519, "learning_rate": 4.932624785192206e-05, "loss": 0.3546, "step": 2822 }, { "epoch": 0.67970866189129, "grad_norm": 3.9884376525878906, "learning_rate": 4.9259041415626615e-05, "loss": 1.0358, "step": 2823 }, { "epoch": 0.6799494371877446, "grad_norm": 3.427591562271118, "learning_rate": 4.91918658299483e-05, "loss": 0.9694, "step": 2824 }, { "epoch": 0.6801902124841991, "grad_norm": 0.9723843336105347, "learning_rate": 4.912472113573005e-05, "loss": 0.0587, "step": 2825 }, { "epoch": 0.6804309877806537, "grad_norm": 2.2842836380004883, "learning_rate": 4.905760737379597e-05, "loss": 0.9764, "step": 2826 }, { "epoch": 0.6806717630771083, "grad_norm": 3.051936626434326, "learning_rate": 4.899052458495137e-05, "loss": 0.7743, "step": 2827 }, { "epoch": 0.6809125383735629, "grad_norm": 4.598084926605225, "learning_rate": 4.8923472809982795e-05, "loss": 0.9498, "step": 2828 }, { "epoch": 0.6811533136700174, "grad_norm": 5.4813947677612305, "learning_rate": 4.885645208965779e-05, "loss": 0.5918, "step": 2829 }, { "epoch": 0.681394088966472, "grad_norm": 5.616296768188477, "learning_rate": 4.8789462464725176e-05, "loss": 0.3233, "step": 2830 }, { "epoch": 0.6816348642629266, "grad_norm": 2.7440338134765625, "learning_rate": 4.8722503975914724e-05, "loss": 0.834, "step": 2831 }, { "epoch": 0.6818756395593812, "grad_norm": 2.0612759590148926, "learning_rate": 4.865557666393739e-05, "loss": 0.6478, "step": 2832 }, { "epoch": 0.6821164148558357, "grad_norm": 2.663640260696411, "learning_rate": 4.858868056948512e-05, "loss": 0.3945, "step": 2833 }, { "epoch": 0.6823571901522903, "grad_norm": 1.9246824979782104, "learning_rate": 4.8521815733230894e-05, "loss": 0.7715, "step": 2834 }, { "epoch": 0.682597965448745, "grad_norm": 1.4412755966186523, "learning_rate": 4.8454982195828725e-05, "loss": 0.1277, "step": 2835 }, { "epoch": 0.6828387407451996, "grad_norm": 2.150667190551758, "learning_rate": 4.838817999791348e-05, "loss": 0.694, "step": 2836 }, { "epoch": 0.6830795160416542, "grad_norm": 2.0565173625946045, "learning_rate": 4.832140918010107e-05, "loss": 0.4911, "step": 2837 }, { "epoch": 0.6833202913381087, "grad_norm": 2.131206750869751, "learning_rate": 4.825466978298835e-05, "loss": 0.3765, "step": 2838 }, { "epoch": 0.6835610666345633, "grad_norm": 1.4898698329925537, "learning_rate": 4.818796184715295e-05, "loss": 0.8131, "step": 2839 }, { "epoch": 0.6838018419310179, "grad_norm": 1.2278820276260376, "learning_rate": 4.812128541315348e-05, "loss": 0.6781, "step": 2840 }, { "epoch": 0.6840426172274725, "grad_norm": 1.6083769798278809, "learning_rate": 4.805464052152937e-05, "loss": 0.3348, "step": 2841 }, { "epoch": 0.684283392523927, "grad_norm": 2.421626091003418, "learning_rate": 4.7988027212800856e-05, "loss": 1.0312, "step": 2842 }, { "epoch": 0.6845241678203816, "grad_norm": 2.5446934700012207, "learning_rate": 4.7921445527469014e-05, "loss": 0.6185, "step": 2843 }, { "epoch": 0.6847649431168362, "grad_norm": 2.2218170166015625, "learning_rate": 4.7854895506015587e-05, "loss": 0.5071, "step": 2844 }, { "epoch": 0.6850057184132908, "grad_norm": 1.7741551399230957, "learning_rate": 4.7788377188903176e-05, "loss": 0.1996, "step": 2845 }, { "epoch": 0.6852464937097453, "grad_norm": 0.4389905035495758, "learning_rate": 4.7721890616575103e-05, "loss": 0.2294, "step": 2846 }, { "epoch": 0.6854872690061999, "grad_norm": 2.4608020782470703, "learning_rate": 4.76554358294553e-05, "loss": 0.4453, "step": 2847 }, { "epoch": 0.6857280443026545, "grad_norm": 1.8286683559417725, "learning_rate": 4.758901286794842e-05, "loss": 0.3692, "step": 2848 }, { "epoch": 0.6859688195991092, "grad_norm": 3.2650294303894043, "learning_rate": 4.7522621772439826e-05, "loss": 0.2029, "step": 2849 }, { "epoch": 0.6862095948955638, "grad_norm": 1.0558974742889404, "learning_rate": 4.7456262583295406e-05, "loss": 0.2386, "step": 2850 }, { "epoch": 0.6864503701920183, "grad_norm": 3.462625741958618, "learning_rate": 4.7389935340861766e-05, "loss": 0.5172, "step": 2851 }, { "epoch": 0.6866911454884729, "grad_norm": 1.7738730907440186, "learning_rate": 4.732364008546593e-05, "loss": 0.5665, "step": 2852 }, { "epoch": 0.6869319207849275, "grad_norm": 1.2178890705108643, "learning_rate": 4.72573768574156e-05, "loss": 0.4615, "step": 2853 }, { "epoch": 0.6871726960813821, "grad_norm": 4.225795745849609, "learning_rate": 4.719114569699902e-05, "loss": 1.0835, "step": 2854 }, { "epoch": 0.6874134713778366, "grad_norm": 3.400425434112549, "learning_rate": 4.712494664448479e-05, "loss": 0.8196, "step": 2855 }, { "epoch": 0.6876542466742912, "grad_norm": 4.688882827758789, "learning_rate": 4.705877974012213e-05, "loss": 0.9437, "step": 2856 }, { "epoch": 0.6878950219707458, "grad_norm": 1.8264660835266113, "learning_rate": 4.699264502414066e-05, "loss": 0.649, "step": 2857 }, { "epoch": 0.6881357972672004, "grad_norm": 4.103915214538574, "learning_rate": 4.6926542536750454e-05, "loss": 0.5432, "step": 2858 }, { "epoch": 0.6883765725636549, "grad_norm": 8.779718399047852, "learning_rate": 4.686047231814199e-05, "loss": 0.8389, "step": 2859 }, { "epoch": 0.6886173478601095, "grad_norm": 1.3146713972091675, "learning_rate": 4.6794434408486043e-05, "loss": 0.5095, "step": 2860 }, { "epoch": 0.6888581231565641, "grad_norm": 2.284715175628662, "learning_rate": 4.6728428847933893e-05, "loss": 0.7908, "step": 2861 }, { "epoch": 0.6890988984530187, "grad_norm": 1.923722267150879, "learning_rate": 4.666245567661699e-05, "loss": 0.7053, "step": 2862 }, { "epoch": 0.6893396737494734, "grad_norm": 1.3939085006713867, "learning_rate": 4.659651493464721e-05, "loss": 0.5569, "step": 2863 }, { "epoch": 0.6895804490459279, "grad_norm": 1.071938157081604, "learning_rate": 4.653060666211665e-05, "loss": 0.474, "step": 2864 }, { "epoch": 0.6898212243423825, "grad_norm": 2.894726514816284, "learning_rate": 4.646473089909772e-05, "loss": 0.5261, "step": 2865 }, { "epoch": 0.6900619996388371, "grad_norm": 2.7686641216278076, "learning_rate": 4.639888768564302e-05, "loss": 0.8032, "step": 2866 }, { "epoch": 0.6903027749352917, "grad_norm": 1.58405601978302, "learning_rate": 4.633307706178541e-05, "loss": 0.8255, "step": 2867 }, { "epoch": 0.6905435502317462, "grad_norm": 2.1864027976989746, "learning_rate": 4.626729906753782e-05, "loss": 0.5292, "step": 2868 }, { "epoch": 0.6907843255282008, "grad_norm": 2.6647164821624756, "learning_rate": 4.62015537428935e-05, "loss": 1.1816, "step": 2869 }, { "epoch": 0.6910251008246554, "grad_norm": 0.903181254863739, "learning_rate": 4.613584112782567e-05, "loss": 0.5345, "step": 2870 }, { "epoch": 0.69126587612111, "grad_norm": 1.2306076288223267, "learning_rate": 4.607016126228779e-05, "loss": 0.5126, "step": 2871 }, { "epoch": 0.6915066514175645, "grad_norm": 1.6878161430358887, "learning_rate": 4.600451418621341e-05, "loss": 0.5813, "step": 2872 }, { "epoch": 0.6917474267140191, "grad_norm": 1.6797889471054077, "learning_rate": 4.593889993951599e-05, "loss": 0.4037, "step": 2873 }, { "epoch": 0.6919882020104737, "grad_norm": 1.061113715171814, "learning_rate": 4.587331856208927e-05, "loss": 0.3819, "step": 2874 }, { "epoch": 0.6922289773069283, "grad_norm": 2.49900484085083, "learning_rate": 4.580777009380678e-05, "loss": 0.4709, "step": 2875 }, { "epoch": 0.6924697526033828, "grad_norm": 1.850058674812317, "learning_rate": 4.574225457452217e-05, "loss": 0.4061, "step": 2876 }, { "epoch": 0.6927105278998374, "grad_norm": 4.711109638214111, "learning_rate": 4.5676772044069064e-05, "loss": 0.6784, "step": 2877 }, { "epoch": 0.6929513031962921, "grad_norm": 2.337125778198242, "learning_rate": 4.5611322542260906e-05, "loss": 1.2925, "step": 2878 }, { "epoch": 0.6931920784927467, "grad_norm": 1.2772440910339355, "learning_rate": 4.554590610889118e-05, "loss": 0.453, "step": 2879 }, { "epoch": 0.6934328537892013, "grad_norm": 1.6413322687149048, "learning_rate": 4.548052278373327e-05, "loss": 0.509, "step": 2880 }, { "epoch": 0.6936736290856558, "grad_norm": 2.3125624656677246, "learning_rate": 4.54151726065403e-05, "loss": 0.68, "step": 2881 }, { "epoch": 0.6939144043821104, "grad_norm": 2.5944857597351074, "learning_rate": 4.534985561704537e-05, "loss": 0.9755, "step": 2882 }, { "epoch": 0.694155179678565, "grad_norm": 5.304381370544434, "learning_rate": 4.528457185496134e-05, "loss": 0.6764, "step": 2883 }, { "epoch": 0.6943959549750196, "grad_norm": 1.1039294004440308, "learning_rate": 4.521932135998092e-05, "loss": 0.4513, "step": 2884 }, { "epoch": 0.6946367302714741, "grad_norm": 4.094736576080322, "learning_rate": 4.5154104171776546e-05, "loss": 1.098, "step": 2885 }, { "epoch": 0.6948775055679287, "grad_norm": 2.167951822280884, "learning_rate": 4.5088920330000386e-05, "loss": 0.9008, "step": 2886 }, { "epoch": 0.6951182808643833, "grad_norm": 1.435927152633667, "learning_rate": 4.502376987428442e-05, "loss": 0.3153, "step": 2887 }, { "epoch": 0.6953590561608379, "grad_norm": 1.3964961767196655, "learning_rate": 4.495865284424018e-05, "loss": 0.9771, "step": 2888 }, { "epoch": 0.6955998314572924, "grad_norm": 1.8884319067001343, "learning_rate": 4.4893569279459034e-05, "loss": 0.5999, "step": 2889 }, { "epoch": 0.695840606753747, "grad_norm": 1.9889702796936035, "learning_rate": 4.4828519219511914e-05, "loss": 0.408, "step": 2890 }, { "epoch": 0.6960813820502016, "grad_norm": 0.6705599427223206, "learning_rate": 4.476350270394942e-05, "loss": 1.1983, "step": 2891 }, { "epoch": 0.6963221573466563, "grad_norm": 1.9054640531539917, "learning_rate": 4.469851977230173e-05, "loss": 0.6402, "step": 2892 }, { "epoch": 0.6965629326431109, "grad_norm": 0.7746975421905518, "learning_rate": 4.463357046407864e-05, "loss": 0.3632, "step": 2893 }, { "epoch": 0.6968037079395654, "grad_norm": 0.40516397356987, "learning_rate": 4.456865481876943e-05, "loss": 0.1903, "step": 2894 }, { "epoch": 0.69704448323602, "grad_norm": 3.2087621688842773, "learning_rate": 4.4503772875843e-05, "loss": 0.568, "step": 2895 }, { "epoch": 0.6972852585324746, "grad_norm": 2.681427478790283, "learning_rate": 4.4438924674747663e-05, "loss": 0.9806, "step": 2896 }, { "epoch": 0.6975260338289292, "grad_norm": 1.9525858163833618, "learning_rate": 4.4374110254911306e-05, "loss": 0.4023, "step": 2897 }, { "epoch": 0.6977668091253837, "grad_norm": 5.92275857925415, "learning_rate": 4.430932965574125e-05, "loss": 0.8938, "step": 2898 }, { "epoch": 0.6980075844218383, "grad_norm": 5.130187034606934, "learning_rate": 4.424458291662422e-05, "loss": 0.5991, "step": 2899 }, { "epoch": 0.6982483597182929, "grad_norm": 1.512535810470581, "learning_rate": 4.417987007692641e-05, "loss": 0.6119, "step": 2900 }, { "epoch": 0.6984891350147475, "grad_norm": 1.4863414764404297, "learning_rate": 4.4115191175993385e-05, "loss": 0.5287, "step": 2901 }, { "epoch": 0.698729910311202, "grad_norm": 0.5450987219810486, "learning_rate": 4.405054625314999e-05, "loss": 0.4031, "step": 2902 }, { "epoch": 0.6989706856076566, "grad_norm": 2.9713079929351807, "learning_rate": 4.398593534770058e-05, "loss": 0.8828, "step": 2903 }, { "epoch": 0.6992114609041112, "grad_norm": 1.440027117729187, "learning_rate": 4.3921358498928645e-05, "loss": 0.3911, "step": 2904 }, { "epoch": 0.6994522362005658, "grad_norm": 1.3297288417816162, "learning_rate": 4.385681574609708e-05, "loss": 0.3319, "step": 2905 }, { "epoch": 0.6996930114970205, "grad_norm": 0.7073665261268616, "learning_rate": 4.379230712844804e-05, "loss": 0.6385, "step": 2906 }, { "epoch": 0.699933786793475, "grad_norm": 2.934152126312256, "learning_rate": 4.37278326852029e-05, "loss": 1.4158, "step": 2907 }, { "epoch": 0.7001745620899296, "grad_norm": 2.335797071456909, "learning_rate": 4.36633924555623e-05, "loss": 0.9337, "step": 2908 }, { "epoch": 0.7004153373863842, "grad_norm": 1.474564552307129, "learning_rate": 4.359898647870599e-05, "loss": 0.5355, "step": 2909 }, { "epoch": 0.7006561126828388, "grad_norm": 1.9566766023635864, "learning_rate": 4.353461479379297e-05, "loss": 0.4216, "step": 2910 }, { "epoch": 0.7008968879792933, "grad_norm": 1.7746264934539795, "learning_rate": 4.34702774399614e-05, "loss": 0.5385, "step": 2911 }, { "epoch": 0.7011376632757479, "grad_norm": 2.327068567276001, "learning_rate": 4.340597445632849e-05, "loss": 0.1434, "step": 2912 }, { "epoch": 0.7013784385722025, "grad_norm": 0.7720171809196472, "learning_rate": 4.334170588199061e-05, "loss": 0.327, "step": 2913 }, { "epoch": 0.7016192138686571, "grad_norm": 0.9734980463981628, "learning_rate": 4.32774717560232e-05, "loss": 0.6511, "step": 2914 }, { "epoch": 0.7018599891651116, "grad_norm": 2.17838191986084, "learning_rate": 4.321327211748077e-05, "loss": 0.6218, "step": 2915 }, { "epoch": 0.7021007644615662, "grad_norm": 1.358054757118225, "learning_rate": 4.314910700539687e-05, "loss": 0.8311, "step": 2916 }, { "epoch": 0.7023415397580208, "grad_norm": 7.809467792510986, "learning_rate": 4.308497645878396e-05, "loss": 1.164, "step": 2917 }, { "epoch": 0.7025823150544754, "grad_norm": 2.3735713958740234, "learning_rate": 4.302088051663359e-05, "loss": 0.5243, "step": 2918 }, { "epoch": 0.70282309035093, "grad_norm": 1.5434727668762207, "learning_rate": 4.2956819217916275e-05, "loss": 0.2084, "step": 2919 }, { "epoch": 0.7030638656473845, "grad_norm": 2.580521821975708, "learning_rate": 4.289279260158137e-05, "loss": 0.666, "step": 2920 }, { "epoch": 0.7033046409438392, "grad_norm": 3.4632489681243896, "learning_rate": 4.282880070655723e-05, "loss": 0.7674, "step": 2921 }, { "epoch": 0.7035454162402938, "grad_norm": 3.7505438327789307, "learning_rate": 4.2764843571751046e-05, "loss": 0.8833, "step": 2922 }, { "epoch": 0.7037861915367484, "grad_norm": 1.1136095523834229, "learning_rate": 4.270092123604894e-05, "loss": 0.5675, "step": 2923 }, { "epoch": 0.7040269668332029, "grad_norm": 2.523184299468994, "learning_rate": 4.263703373831586e-05, "loss": 0.621, "step": 2924 }, { "epoch": 0.7042677421296575, "grad_norm": 1.7620470523834229, "learning_rate": 4.2573181117395455e-05, "loss": 0.2796, "step": 2925 }, { "epoch": 0.7045085174261121, "grad_norm": 0.7479153275489807, "learning_rate": 4.250936341211032e-05, "loss": 0.7364, "step": 2926 }, { "epoch": 0.7047492927225667, "grad_norm": 1.9243773221969604, "learning_rate": 4.2445580661261794e-05, "loss": 0.5447, "step": 2927 }, { "epoch": 0.7049900680190212, "grad_norm": 2.319751501083374, "learning_rate": 4.238183290362987e-05, "loss": 0.4302, "step": 2928 }, { "epoch": 0.7052308433154758, "grad_norm": 1.1263662576675415, "learning_rate": 4.231812017797335e-05, "loss": 0.5473, "step": 2929 }, { "epoch": 0.7054716186119304, "grad_norm": 1.7234480381011963, "learning_rate": 4.225444252302973e-05, "loss": 0.3453, "step": 2930 }, { "epoch": 0.705712393908385, "grad_norm": 1.5523897409439087, "learning_rate": 4.219079997751515e-05, "loss": 0.2537, "step": 2931 }, { "epoch": 0.7059531692048396, "grad_norm": 2.044769287109375, "learning_rate": 4.212719258012447e-05, "loss": 0.3151, "step": 2932 }, { "epoch": 0.7061939445012941, "grad_norm": 3.085174322128296, "learning_rate": 4.206362036953104e-05, "loss": 0.4571, "step": 2933 }, { "epoch": 0.7064347197977487, "grad_norm": 7.409231185913086, "learning_rate": 4.2000083384387e-05, "loss": 1.0109, "step": 2934 }, { "epoch": 0.7066754950942034, "grad_norm": 4.059498310089111, "learning_rate": 4.193658166332291e-05, "loss": 0.4508, "step": 2935 }, { "epoch": 0.706916270390658, "grad_norm": 3.295271873474121, "learning_rate": 4.187311524494798e-05, "loss": 0.5282, "step": 2936 }, { "epoch": 0.7071570456871125, "grad_norm": 1.7621487379074097, "learning_rate": 4.1809684167849936e-05, "loss": 1.1533, "step": 2937 }, { "epoch": 0.7073978209835671, "grad_norm": 3.32817006111145, "learning_rate": 4.1746288470595044e-05, "loss": 0.7824, "step": 2938 }, { "epoch": 0.7076385962800217, "grad_norm": 2.0236010551452637, "learning_rate": 4.1682928191727985e-05, "loss": 0.4317, "step": 2939 }, { "epoch": 0.7078793715764763, "grad_norm": 2.6577980518341064, "learning_rate": 4.161960336977203e-05, "loss": 0.5246, "step": 2940 }, { "epoch": 0.7081201468729308, "grad_norm": 45.47622299194336, "learning_rate": 4.1556314043228705e-05, "loss": 0.5691, "step": 2941 }, { "epoch": 0.7083609221693854, "grad_norm": 2.584383249282837, "learning_rate": 4.1493060250578165e-05, "loss": 0.4159, "step": 2942 }, { "epoch": 0.70860169746584, "grad_norm": 2.8023557662963867, "learning_rate": 4.1429842030278774e-05, "loss": 0.8909, "step": 2943 }, { "epoch": 0.7088424727622946, "grad_norm": 1.6718467473983765, "learning_rate": 4.1366659420767384e-05, "loss": 0.5008, "step": 2944 }, { "epoch": 0.7090832480587492, "grad_norm": 4.263134956359863, "learning_rate": 4.1303512460459214e-05, "loss": 0.6134, "step": 2945 }, { "epoch": 0.7093240233552037, "grad_norm": 3.2568228244781494, "learning_rate": 4.124040118774763e-05, "loss": 0.4874, "step": 2946 }, { "epoch": 0.7095647986516583, "grad_norm": 0.5476480722427368, "learning_rate": 4.1177325641004595e-05, "loss": 0.341, "step": 2947 }, { "epoch": 0.7098055739481129, "grad_norm": 2.8167431354522705, "learning_rate": 4.1114285858580045e-05, "loss": 0.3281, "step": 2948 }, { "epoch": 0.7100463492445676, "grad_norm": 4.154418468475342, "learning_rate": 4.105128187880238e-05, "loss": 0.5594, "step": 2949 }, { "epoch": 0.710287124541022, "grad_norm": 0.7136004567146301, "learning_rate": 4.098831373997818e-05, "loss": 0.4894, "step": 2950 }, { "epoch": 0.7105278998374767, "grad_norm": 2.967937707901001, "learning_rate": 4.0925381480392135e-05, "loss": 0.6342, "step": 2951 }, { "epoch": 0.7107686751339313, "grad_norm": 2.3439087867736816, "learning_rate": 4.086248513830725e-05, "loss": 0.514, "step": 2952 }, { "epoch": 0.7110094504303859, "grad_norm": 1.2852379083633423, "learning_rate": 4.079962475196468e-05, "loss": 0.9783, "step": 2953 }, { "epoch": 0.7112502257268404, "grad_norm": 3.063833713531494, "learning_rate": 4.0736800359583605e-05, "loss": 0.3231, "step": 2954 }, { "epoch": 0.711491001023295, "grad_norm": 0.8757096529006958, "learning_rate": 4.067401199936143e-05, "loss": 0.3563, "step": 2955 }, { "epoch": 0.7117317763197496, "grad_norm": 2.1515250205993652, "learning_rate": 4.061125970947363e-05, "loss": 0.5002, "step": 2956 }, { "epoch": 0.7119725516162042, "grad_norm": 1.7841241359710693, "learning_rate": 4.054854352807372e-05, "loss": 0.2222, "step": 2957 }, { "epoch": 0.7122133269126587, "grad_norm": 1.669628620147705, "learning_rate": 4.048586349329333e-05, "loss": 0.8098, "step": 2958 }, { "epoch": 0.7124541022091133, "grad_norm": 1.4398468732833862, "learning_rate": 4.0423219643241985e-05, "loss": 0.3151, "step": 2959 }, { "epoch": 0.7126948775055679, "grad_norm": 3.6351101398468018, "learning_rate": 4.036061201600737e-05, "loss": 0.3961, "step": 2960 }, { "epoch": 0.7129356528020225, "grad_norm": 0.8414926528930664, "learning_rate": 4.029804064965498e-05, "loss": 0.2666, "step": 2961 }, { "epoch": 0.7131764280984771, "grad_norm": 1.7037287950515747, "learning_rate": 4.023550558222837e-05, "loss": 0.4597, "step": 2962 }, { "epoch": 0.7134172033949316, "grad_norm": 2.4606921672821045, "learning_rate": 4.017300685174903e-05, "loss": 0.6738, "step": 2963 }, { "epoch": 0.7136579786913863, "grad_norm": 0.9210506677627563, "learning_rate": 4.011054449621632e-05, "loss": 0.5534, "step": 2964 }, { "epoch": 0.7138987539878409, "grad_norm": 1.8560645580291748, "learning_rate": 4.004811855360748e-05, "loss": 0.2176, "step": 2965 }, { "epoch": 0.7141395292842955, "grad_norm": 1.5320228338241577, "learning_rate": 3.998572906187767e-05, "loss": 0.6553, "step": 2966 }, { "epoch": 0.71438030458075, "grad_norm": 7.649412155151367, "learning_rate": 3.9923376058959774e-05, "loss": 0.8473, "step": 2967 }, { "epoch": 0.7146210798772046, "grad_norm": 0.8672193288803101, "learning_rate": 3.986105958276463e-05, "loss": 0.4563, "step": 2968 }, { "epoch": 0.7148618551736592, "grad_norm": 3.8993074893951416, "learning_rate": 3.97987796711807e-05, "loss": 0.5047, "step": 2969 }, { "epoch": 0.7151026304701138, "grad_norm": 2.695249319076538, "learning_rate": 3.973653636207437e-05, "loss": 0.8572, "step": 2970 }, { "epoch": 0.7153434057665683, "grad_norm": 1.0926902294158936, "learning_rate": 3.967432969328971e-05, "loss": 0.4632, "step": 2971 }, { "epoch": 0.7155841810630229, "grad_norm": 2.5427393913269043, "learning_rate": 3.961215970264852e-05, "loss": 0.4715, "step": 2972 }, { "epoch": 0.7158249563594775, "grad_norm": 1.8015666007995605, "learning_rate": 3.9550026427950315e-05, "loss": 0.4259, "step": 2973 }, { "epoch": 0.7160657316559321, "grad_norm": 2.0264315605163574, "learning_rate": 3.94879299069722e-05, "loss": 0.7189, "step": 2974 }, { "epoch": 0.7163065069523867, "grad_norm": 2.786452531814575, "learning_rate": 3.942587017746904e-05, "loss": 1.0023, "step": 2975 }, { "epoch": 0.7165472822488412, "grad_norm": 1.3321934938430786, "learning_rate": 3.936384727717332e-05, "loss": 0.5356, "step": 2976 }, { "epoch": 0.7167880575452958, "grad_norm": 1.1468703746795654, "learning_rate": 3.930186124379503e-05, "loss": 0.6806, "step": 2977 }, { "epoch": 0.7170288328417505, "grad_norm": 3.6442174911499023, "learning_rate": 3.923991211502187e-05, "loss": 0.2468, "step": 2978 }, { "epoch": 0.7172696081382051, "grad_norm": 1.8191343545913696, "learning_rate": 3.917799992851903e-05, "loss": 1.0023, "step": 2979 }, { "epoch": 0.7175103834346596, "grad_norm": 2.2116637229919434, "learning_rate": 3.911612472192927e-05, "loss": 0.3557, "step": 2980 }, { "epoch": 0.7177511587311142, "grad_norm": 0.8968959450721741, "learning_rate": 3.9054286532872884e-05, "loss": 0.3245, "step": 2981 }, { "epoch": 0.7179919340275688, "grad_norm": 1.421441674232483, "learning_rate": 3.899248539894757e-05, "loss": 0.3783, "step": 2982 }, { "epoch": 0.7182327093240234, "grad_norm": 2.168306827545166, "learning_rate": 3.8930721357728584e-05, "loss": 0.2099, "step": 2983 }, { "epoch": 0.7184734846204779, "grad_norm": 1.9068177938461304, "learning_rate": 3.886899444676863e-05, "loss": 0.8279, "step": 2984 }, { "epoch": 0.7187142599169325, "grad_norm": 1.3989911079406738, "learning_rate": 3.880730470359776e-05, "loss": 0.9995, "step": 2985 }, { "epoch": 0.7189550352133871, "grad_norm": 5.264814376831055, "learning_rate": 3.8745652165723486e-05, "loss": 0.9829, "step": 2986 }, { "epoch": 0.7191958105098417, "grad_norm": 2.0597469806671143, "learning_rate": 3.8684036870630705e-05, "loss": 0.2443, "step": 2987 }, { "epoch": 0.7194365858062963, "grad_norm": 1.258255958557129, "learning_rate": 3.862245885578166e-05, "loss": 0.7055, "step": 2988 }, { "epoch": 0.7196773611027508, "grad_norm": 3.225368022918701, "learning_rate": 3.856091815861595e-05, "loss": 0.3839, "step": 2989 }, { "epoch": 0.7199181363992054, "grad_norm": 2.35640025138855, "learning_rate": 3.8499414816550384e-05, "loss": 0.8443, "step": 2990 }, { "epoch": 0.72015891169566, "grad_norm": 2.4103639125823975, "learning_rate": 3.843794886697917e-05, "loss": 0.693, "step": 2991 }, { "epoch": 0.7203996869921147, "grad_norm": 0.8811191320419312, "learning_rate": 3.837652034727378e-05, "loss": 0.3415, "step": 2992 }, { "epoch": 0.7206404622885692, "grad_norm": 1.7266875505447388, "learning_rate": 3.8315129294782835e-05, "loss": 0.4295, "step": 2993 }, { "epoch": 0.7208812375850238, "grad_norm": 1.2905570268630981, "learning_rate": 3.8253775746832244e-05, "loss": 0.248, "step": 2994 }, { "epoch": 0.7211220128814784, "grad_norm": 0.3409409821033478, "learning_rate": 3.819245974072513e-05, "loss": 0.6092, "step": 2995 }, { "epoch": 0.721362788177933, "grad_norm": 1.4550303220748901, "learning_rate": 3.8131181313741735e-05, "loss": 0.6874, "step": 2996 }, { "epoch": 0.7216035634743875, "grad_norm": 0.9903691411018372, "learning_rate": 3.806994050313953e-05, "loss": 0.1963, "step": 2997 }, { "epoch": 0.7218443387708421, "grad_norm": 0.8208291530609131, "learning_rate": 3.800873734615299e-05, "loss": 0.3679, "step": 2998 }, { "epoch": 0.7220851140672967, "grad_norm": 1.0615532398223877, "learning_rate": 3.794757187999386e-05, "loss": 0.9426, "step": 2999 }, { "epoch": 0.7223258893637513, "grad_norm": 2.0494561195373535, "learning_rate": 3.788644414185078e-05, "loss": 0.2539, "step": 3000 }, { "epoch": 0.7225666646602059, "grad_norm": 2.378437042236328, "learning_rate": 3.782535416888963e-05, "loss": 0.7789, "step": 3001 }, { "epoch": 0.7228074399566604, "grad_norm": 1.714324951171875, "learning_rate": 3.776430199825321e-05, "loss": 0.9774, "step": 3002 }, { "epoch": 0.723048215253115, "grad_norm": 2.891805648803711, "learning_rate": 3.770328766706139e-05, "loss": 0.6982, "step": 3003 }, { "epoch": 0.7232889905495696, "grad_norm": 4.66194486618042, "learning_rate": 3.764231121241103e-05, "loss": 0.6659, "step": 3004 }, { "epoch": 0.7235297658460242, "grad_norm": 3.184102773666382, "learning_rate": 3.758137267137598e-05, "loss": 0.5286, "step": 3005 }, { "epoch": 0.7237705411424787, "grad_norm": 5.212895393371582, "learning_rate": 3.752047208100694e-05, "loss": 0.2767, "step": 3006 }, { "epoch": 0.7240113164389333, "grad_norm": 1.256901741027832, "learning_rate": 3.745960947833168e-05, "loss": 0.477, "step": 3007 }, { "epoch": 0.724252091735388, "grad_norm": 1.1887600421905518, "learning_rate": 3.739878490035473e-05, "loss": 0.7814, "step": 3008 }, { "epoch": 0.7244928670318426, "grad_norm": 2.5815846920013428, "learning_rate": 3.73379983840576e-05, "loss": 0.5839, "step": 3009 }, { "epoch": 0.7247336423282971, "grad_norm": 4.950305938720703, "learning_rate": 3.727724996639863e-05, "loss": 0.4643, "step": 3010 }, { "epoch": 0.7249744176247517, "grad_norm": 6.662084102630615, "learning_rate": 3.7216539684313004e-05, "loss": 0.5806, "step": 3011 }, { "epoch": 0.7252151929212063, "grad_norm": 2.3618359565734863, "learning_rate": 3.715586757471273e-05, "loss": 0.5451, "step": 3012 }, { "epoch": 0.7254559682176609, "grad_norm": 1.401696801185608, "learning_rate": 3.709523367448653e-05, "loss": 0.8228, "step": 3013 }, { "epoch": 0.7256967435141155, "grad_norm": 0.5292275547981262, "learning_rate": 3.7034638020499976e-05, "loss": 0.3713, "step": 3014 }, { "epoch": 0.72593751881057, "grad_norm": 0.8193963766098022, "learning_rate": 3.697408064959541e-05, "loss": 0.1659, "step": 3015 }, { "epoch": 0.7261782941070246, "grad_norm": 2.547407388687134, "learning_rate": 3.691356159859177e-05, "loss": 0.3945, "step": 3016 }, { "epoch": 0.7264190694034792, "grad_norm": 1.281667709350586, "learning_rate": 3.685308090428481e-05, "loss": 0.6264, "step": 3017 }, { "epoch": 0.7266598446999338, "grad_norm": 0.9349974989891052, "learning_rate": 3.6792638603446974e-05, "loss": 0.5355, "step": 3018 }, { "epoch": 0.7269006199963883, "grad_norm": 1.2555688619613647, "learning_rate": 3.67322347328272e-05, "loss": 0.1645, "step": 3019 }, { "epoch": 0.7271413952928429, "grad_norm": 2.40930438041687, "learning_rate": 3.667186932915133e-05, "loss": 0.3945, "step": 3020 }, { "epoch": 0.7273821705892975, "grad_norm": 1.6507692337036133, "learning_rate": 3.661154242912155e-05, "loss": 0.3394, "step": 3021 }, { "epoch": 0.7276229458857522, "grad_norm": 0.6924558877944946, "learning_rate": 3.6551254069416774e-05, "loss": 0.2132, "step": 3022 }, { "epoch": 0.7278637211822067, "grad_norm": 1.6599589586257935, "learning_rate": 3.649100428669253e-05, "loss": 0.5329, "step": 3023 }, { "epoch": 0.7281044964786613, "grad_norm": 1.3489158153533936, "learning_rate": 3.643079311758072e-05, "loss": 0.4529, "step": 3024 }, { "epoch": 0.7283452717751159, "grad_norm": 3.1767184734344482, "learning_rate": 3.637062059868996e-05, "loss": 1.4829, "step": 3025 }, { "epoch": 0.7285860470715705, "grad_norm": 2.8698904514312744, "learning_rate": 3.63104867666052e-05, "loss": 0.4736, "step": 3026 }, { "epoch": 0.7288268223680251, "grad_norm": 1.75603187084198, "learning_rate": 3.625039165788794e-05, "loss": 0.8231, "step": 3027 }, { "epoch": 0.7290675976644796, "grad_norm": 0.7908713221549988, "learning_rate": 3.619033530907625e-05, "loss": 0.5338, "step": 3028 }, { "epoch": 0.7293083729609342, "grad_norm": 1.7771409749984741, "learning_rate": 3.613031775668443e-05, "loss": 0.7482, "step": 3029 }, { "epoch": 0.7295491482573888, "grad_norm": 2.4424712657928467, "learning_rate": 3.6070339037203306e-05, "loss": 0.4881, "step": 3030 }, { "epoch": 0.7297899235538434, "grad_norm": 0.698549211025238, "learning_rate": 3.601039918710012e-05, "loss": 0.3092, "step": 3031 }, { "epoch": 0.7300306988502979, "grad_norm": 0.616523802280426, "learning_rate": 3.595049824281837e-05, "loss": 0.5394, "step": 3032 }, { "epoch": 0.7302714741467525, "grad_norm": 1.3015395402908325, "learning_rate": 3.589063624077802e-05, "loss": 0.5671, "step": 3033 }, { "epoch": 0.7305122494432071, "grad_norm": 0.953938364982605, "learning_rate": 3.583081321737525e-05, "loss": 0.7368, "step": 3034 }, { "epoch": 0.7307530247396617, "grad_norm": 0.6559523344039917, "learning_rate": 3.577102920898261e-05, "loss": 0.2857, "step": 3035 }, { "epoch": 0.7309938000361162, "grad_norm": 0.8794732689857483, "learning_rate": 3.5711284251948914e-05, "loss": 0.2559, "step": 3036 }, { "epoch": 0.7312345753325709, "grad_norm": 1.048971176147461, "learning_rate": 3.565157838259925e-05, "loss": 0.2112, "step": 3037 }, { "epoch": 0.7314753506290255, "grad_norm": 1.1826798915863037, "learning_rate": 3.5591911637234945e-05, "loss": 0.2799, "step": 3038 }, { "epoch": 0.7317161259254801, "grad_norm": 2.4413845539093018, "learning_rate": 3.5532284052133436e-05, "loss": 0.5779, "step": 3039 }, { "epoch": 0.7319569012219346, "grad_norm": 0.9847295880317688, "learning_rate": 3.547269566354847e-05, "loss": 0.6497, "step": 3040 }, { "epoch": 0.7321976765183892, "grad_norm": 1.833725094795227, "learning_rate": 3.541314650770996e-05, "loss": 0.3938, "step": 3041 }, { "epoch": 0.7324384518148438, "grad_norm": 2.012840747833252, "learning_rate": 3.535363662082385e-05, "loss": 0.4187, "step": 3042 }, { "epoch": 0.7326792271112984, "grad_norm": 3.2702102661132812, "learning_rate": 3.529416603907233e-05, "loss": 0.9575, "step": 3043 }, { "epoch": 0.732920002407753, "grad_norm": 1.4701731204986572, "learning_rate": 3.523473479861365e-05, "loss": 0.5232, "step": 3044 }, { "epoch": 0.7331607777042075, "grad_norm": 1.68658447265625, "learning_rate": 3.5175342935582114e-05, "loss": 0.6121, "step": 3045 }, { "epoch": 0.7334015530006621, "grad_norm": 1.9545087814331055, "learning_rate": 3.5115990486088166e-05, "loss": 0.31, "step": 3046 }, { "epoch": 0.7336423282971167, "grad_norm": 4.512576580047607, "learning_rate": 3.5056677486218145e-05, "loss": 0.8468, "step": 3047 }, { "epoch": 0.7338831035935713, "grad_norm": 2.4108033180236816, "learning_rate": 3.4997403972034546e-05, "loss": 0.826, "step": 3048 }, { "epoch": 0.7341238788900258, "grad_norm": 3.4939920902252197, "learning_rate": 3.493816997957582e-05, "loss": 0.4593, "step": 3049 }, { "epoch": 0.7343646541864804, "grad_norm": 2.438183307647705, "learning_rate": 3.487897554485628e-05, "loss": 0.6518, "step": 3050 }, { "epoch": 0.7346054294829351, "grad_norm": 3.4589779376983643, "learning_rate": 3.4819820703866344e-05, "loss": 0.6474, "step": 3051 }, { "epoch": 0.7348462047793897, "grad_norm": 4.573122978210449, "learning_rate": 3.4760705492572266e-05, "loss": 0.529, "step": 3052 }, { "epoch": 0.7350869800758442, "grad_norm": 0.7465322017669678, "learning_rate": 3.470162994691624e-05, "loss": 0.4171, "step": 3053 }, { "epoch": 0.7353277553722988, "grad_norm": 1.0964757204055786, "learning_rate": 3.464259410281635e-05, "loss": 0.4091, "step": 3054 }, { "epoch": 0.7355685306687534, "grad_norm": 3.490908145904541, "learning_rate": 3.458359799616647e-05, "loss": 1.0212, "step": 3055 }, { "epoch": 0.735809305965208, "grad_norm": 1.6229488849639893, "learning_rate": 3.45246416628364e-05, "loss": 0.5396, "step": 3056 }, { "epoch": 0.7360500812616626, "grad_norm": 2.6889917850494385, "learning_rate": 3.446572513867175e-05, "loss": 0.8915, "step": 3057 }, { "epoch": 0.7362908565581171, "grad_norm": 2.3369765281677246, "learning_rate": 3.4406848459493814e-05, "loss": 0.62, "step": 3058 }, { "epoch": 0.7365316318545717, "grad_norm": 1.6141836643218994, "learning_rate": 3.434801166109981e-05, "loss": 0.4647, "step": 3059 }, { "epoch": 0.7367724071510263, "grad_norm": 4.394378662109375, "learning_rate": 3.4289214779262636e-05, "loss": 1.081, "step": 3060 }, { "epoch": 0.7370131824474809, "grad_norm": 2.108896255493164, "learning_rate": 3.423045784973091e-05, "loss": 0.6174, "step": 3061 }, { "epoch": 0.7372539577439354, "grad_norm": 6.742406845092773, "learning_rate": 3.4171740908229044e-05, "loss": 1.3335, "step": 3062 }, { "epoch": 0.73749473304039, "grad_norm": 2.831634998321533, "learning_rate": 3.411306399045697e-05, "loss": 0.492, "step": 3063 }, { "epoch": 0.7377355083368446, "grad_norm": 0.8104602694511414, "learning_rate": 3.405442713209047e-05, "loss": 0.6458, "step": 3064 }, { "epoch": 0.7379762836332993, "grad_norm": 1.0663022994995117, "learning_rate": 3.3995830368780825e-05, "loss": 0.3529, "step": 3065 }, { "epoch": 0.7382170589297538, "grad_norm": 2.1759705543518066, "learning_rate": 3.393727373615503e-05, "loss": 0.7057, "step": 3066 }, { "epoch": 0.7384578342262084, "grad_norm": 2.893615245819092, "learning_rate": 3.387875726981563e-05, "loss": 0.7425, "step": 3067 }, { "epoch": 0.738698609522663, "grad_norm": 1.8920822143554688, "learning_rate": 3.3820281005340794e-05, "loss": 0.4257, "step": 3068 }, { "epoch": 0.7389393848191176, "grad_norm": 2.6992859840393066, "learning_rate": 3.3761844978284205e-05, "loss": 1.193, "step": 3069 }, { "epoch": 0.7391801601155722, "grad_norm": 2.974738836288452, "learning_rate": 3.370344922417513e-05, "loss": 1.1457, "step": 3070 }, { "epoch": 0.7394209354120267, "grad_norm": 0.7591432929039001, "learning_rate": 3.364509377851828e-05, "loss": 0.4777, "step": 3071 }, { "epoch": 0.7396617107084813, "grad_norm": 2.3580071926116943, "learning_rate": 3.358677867679394e-05, "loss": 0.5326, "step": 3072 }, { "epoch": 0.7399024860049359, "grad_norm": 2.4343063831329346, "learning_rate": 3.3528503954457756e-05, "loss": 0.4066, "step": 3073 }, { "epoch": 0.7401432613013905, "grad_norm": 1.2667893171310425, "learning_rate": 3.3470269646940935e-05, "loss": 0.423, "step": 3074 }, { "epoch": 0.740384036597845, "grad_norm": 2.454868793487549, "learning_rate": 3.341207578965005e-05, "loss": 0.2659, "step": 3075 }, { "epoch": 0.7406248118942996, "grad_norm": 1.9105570316314697, "learning_rate": 3.335392241796712e-05, "loss": 1.0031, "step": 3076 }, { "epoch": 0.7408655871907542, "grad_norm": 2.387080669403076, "learning_rate": 3.329580956724955e-05, "loss": 0.5239, "step": 3077 }, { "epoch": 0.7411063624872088, "grad_norm": 8.040419578552246, "learning_rate": 3.3237737272830013e-05, "loss": 0.4703, "step": 3078 }, { "epoch": 0.7413471377836633, "grad_norm": 1.0667513608932495, "learning_rate": 3.317970557001664e-05, "loss": 0.5395, "step": 3079 }, { "epoch": 0.741587913080118, "grad_norm": 0.727729082107544, "learning_rate": 3.312171449409285e-05, "loss": 0.2785, "step": 3080 }, { "epoch": 0.7418286883765726, "grad_norm": 1.5719585418701172, "learning_rate": 3.306376408031729e-05, "loss": 0.54, "step": 3081 }, { "epoch": 0.7420694636730272, "grad_norm": 2.5653600692749023, "learning_rate": 3.3005854363923995e-05, "loss": 0.2214, "step": 3082 }, { "epoch": 0.7423102389694818, "grad_norm": 1.5638865232467651, "learning_rate": 3.294798538012217e-05, "loss": 0.7477, "step": 3083 }, { "epoch": 0.7425510142659363, "grad_norm": 1.88933527469635, "learning_rate": 3.289015716409631e-05, "loss": 0.7616, "step": 3084 }, { "epoch": 0.7427917895623909, "grad_norm": 0.9233277440071106, "learning_rate": 3.283236975100613e-05, "loss": 0.3405, "step": 3085 }, { "epoch": 0.7430325648588455, "grad_norm": 2.3473784923553467, "learning_rate": 3.277462317598644e-05, "loss": 0.8511, "step": 3086 }, { "epoch": 0.7432733401553001, "grad_norm": 1.4704930782318115, "learning_rate": 3.271691747414731e-05, "loss": 0.5758, "step": 3087 }, { "epoch": 0.7435141154517546, "grad_norm": 1.2950267791748047, "learning_rate": 3.265925268057398e-05, "loss": 0.7987, "step": 3088 }, { "epoch": 0.7437548907482092, "grad_norm": 0.8450798392295837, "learning_rate": 3.2601628830326726e-05, "loss": 0.3298, "step": 3089 }, { "epoch": 0.7439956660446638, "grad_norm": 1.7670706510543823, "learning_rate": 3.2544045958441004e-05, "loss": 0.4484, "step": 3090 }, { "epoch": 0.7442364413411184, "grad_norm": 1.2544729709625244, "learning_rate": 3.248650409992726e-05, "loss": 0.4268, "step": 3091 }, { "epoch": 0.7444772166375729, "grad_norm": 1.582452416419983, "learning_rate": 3.2429003289771176e-05, "loss": 0.5207, "step": 3092 }, { "epoch": 0.7447179919340275, "grad_norm": 1.0165259838104248, "learning_rate": 3.237154356293336e-05, "loss": 0.7176, "step": 3093 }, { "epoch": 0.7449587672304822, "grad_norm": 1.914751410484314, "learning_rate": 3.231412495434939e-05, "loss": 0.5358, "step": 3094 }, { "epoch": 0.7451995425269368, "grad_norm": 4.326685428619385, "learning_rate": 3.225674749892994e-05, "loss": 0.7129, "step": 3095 }, { "epoch": 0.7454403178233914, "grad_norm": 0.8451967239379883, "learning_rate": 3.219941123156068e-05, "loss": 0.4402, "step": 3096 }, { "epoch": 0.7456810931198459, "grad_norm": 0.9839834570884705, "learning_rate": 3.214211618710211e-05, "loss": 0.3726, "step": 3097 }, { "epoch": 0.7459218684163005, "grad_norm": 1.0465095043182373, "learning_rate": 3.208486240038982e-05, "loss": 0.1241, "step": 3098 }, { "epoch": 0.7461626437127551, "grad_norm": 1.104686975479126, "learning_rate": 3.202764990623417e-05, "loss": 0.5279, "step": 3099 }, { "epoch": 0.7464034190092097, "grad_norm": 1.0594794750213623, "learning_rate": 3.1970478739420496e-05, "loss": 0.3273, "step": 3100 }, { "epoch": 0.7466441943056642, "grad_norm": 0.9185763597488403, "learning_rate": 3.191334893470907e-05, "loss": 0.3357, "step": 3101 }, { "epoch": 0.7468849696021188, "grad_norm": 2.1206271648406982, "learning_rate": 3.185626052683487e-05, "loss": 0.5291, "step": 3102 }, { "epoch": 0.7471257448985734, "grad_norm": 0.9549693465232849, "learning_rate": 3.1799213550507835e-05, "loss": 0.6672, "step": 3103 }, { "epoch": 0.747366520195028, "grad_norm": 1.769875407218933, "learning_rate": 3.174220804041258e-05, "loss": 0.9207, "step": 3104 }, { "epoch": 0.7476072954914825, "grad_norm": 10.123749732971191, "learning_rate": 3.168524403120863e-05, "loss": 0.9403, "step": 3105 }, { "epoch": 0.7478480707879371, "grad_norm": 3.946068525314331, "learning_rate": 3.1628321557530246e-05, "loss": 0.6703, "step": 3106 }, { "epoch": 0.7480888460843917, "grad_norm": 1.5204689502716064, "learning_rate": 3.157144065398638e-05, "loss": 0.6827, "step": 3107 }, { "epoch": 0.7483296213808464, "grad_norm": 0.9539960026741028, "learning_rate": 3.151460135516075e-05, "loss": 0.6948, "step": 3108 }, { "epoch": 0.748570396677301, "grad_norm": 2.0044784545898438, "learning_rate": 3.145780369561182e-05, "loss": 0.6487, "step": 3109 }, { "epoch": 0.7488111719737555, "grad_norm": 2.3419203758239746, "learning_rate": 3.140104770987265e-05, "loss": 0.4121, "step": 3110 }, { "epoch": 0.7490519472702101, "grad_norm": 2.2572646141052246, "learning_rate": 3.1344333432451066e-05, "loss": 0.2235, "step": 3111 }, { "epoch": 0.7492927225666647, "grad_norm": 1.7564064264297485, "learning_rate": 3.1287660897829404e-05, "loss": 0.2708, "step": 3112 }, { "epoch": 0.7495334978631193, "grad_norm": 1.6659893989562988, "learning_rate": 3.1231030140464736e-05, "loss": 0.7538, "step": 3113 }, { "epoch": 0.7497742731595738, "grad_norm": 4.775331497192383, "learning_rate": 3.117444119478871e-05, "loss": 1.2959, "step": 3114 }, { "epoch": 0.7500150484560284, "grad_norm": 4.739798545837402, "learning_rate": 3.111789409520746e-05, "loss": 0.5799, "step": 3115 }, { "epoch": 0.750255823752483, "grad_norm": 1.0320911407470703, "learning_rate": 3.1061388876101804e-05, "loss": 0.4581, "step": 3116 }, { "epoch": 0.7504965990489376, "grad_norm": 3.4287285804748535, "learning_rate": 3.1004925571827023e-05, "loss": 0.8336, "step": 3117 }, { "epoch": 0.7507373743453921, "grad_norm": 2.3229026794433594, "learning_rate": 3.094850421671295e-05, "loss": 0.591, "step": 3118 }, { "epoch": 0.7509781496418467, "grad_norm": 1.660323977470398, "learning_rate": 3.089212484506392e-05, "loss": 0.7506, "step": 3119 }, { "epoch": 0.7512189249383013, "grad_norm": 2.4399898052215576, "learning_rate": 3.083578749115865e-05, "loss": 0.7181, "step": 3120 }, { "epoch": 0.751459700234756, "grad_norm": 1.1477172374725342, "learning_rate": 3.0779492189250414e-05, "loss": 0.6411, "step": 3121 }, { "epoch": 0.7517004755312104, "grad_norm": 3.424316167831421, "learning_rate": 3.0723238973566925e-05, "loss": 0.6226, "step": 3122 }, { "epoch": 0.751941250827665, "grad_norm": 3.0182266235351562, "learning_rate": 3.066702787831017e-05, "loss": 0.3055, "step": 3123 }, { "epoch": 0.7521820261241197, "grad_norm": 4.055928707122803, "learning_rate": 3.06108589376567e-05, "loss": 0.9499, "step": 3124 }, { "epoch": 0.7524228014205743, "grad_norm": 2.966586112976074, "learning_rate": 3.0554732185757315e-05, "loss": 0.4065, "step": 3125 }, { "epoch": 0.7526635767170289, "grad_norm": 2.517282247543335, "learning_rate": 3.0498647656737223e-05, "loss": 0.5657, "step": 3126 }, { "epoch": 0.7529043520134834, "grad_norm": 5.178724765777588, "learning_rate": 3.0442605384695977e-05, "loss": 0.7705, "step": 3127 }, { "epoch": 0.753145127309938, "grad_norm": 2.8488965034484863, "learning_rate": 3.0386605403707346e-05, "loss": 0.4091, "step": 3128 }, { "epoch": 0.7533859026063926, "grad_norm": 0.804840087890625, "learning_rate": 3.0330647747819496e-05, "loss": 0.3117, "step": 3129 }, { "epoch": 0.7536266779028472, "grad_norm": 1.8321592807769775, "learning_rate": 3.0274732451054756e-05, "loss": 0.58, "step": 3130 }, { "epoch": 0.7538674531993017, "grad_norm": 0.4262060225009918, "learning_rate": 3.021885954740977e-05, "loss": 0.206, "step": 3131 }, { "epoch": 0.7541082284957563, "grad_norm": 1.1734882593154907, "learning_rate": 3.016302907085541e-05, "loss": 0.9527, "step": 3132 }, { "epoch": 0.7543490037922109, "grad_norm": 1.2724254131317139, "learning_rate": 3.010724105533671e-05, "loss": 0.7622, "step": 3133 }, { "epoch": 0.7545897790886655, "grad_norm": 1.9372936487197876, "learning_rate": 3.005149553477292e-05, "loss": 0.5003, "step": 3134 }, { "epoch": 0.75483055438512, "grad_norm": 4.942528247833252, "learning_rate": 2.9995792543057478e-05, "loss": 0.2299, "step": 3135 }, { "epoch": 0.7550713296815746, "grad_norm": 2.330275535583496, "learning_rate": 2.994013211405785e-05, "loss": 0.5149, "step": 3136 }, { "epoch": 0.7553121049780293, "grad_norm": 3.635746717453003, "learning_rate": 2.988451428161578e-05, "loss": 0.7856, "step": 3137 }, { "epoch": 0.7555528802744839, "grad_norm": 1.8431618213653564, "learning_rate": 2.982893907954697e-05, "loss": 0.3647, "step": 3138 }, { "epoch": 0.7557936555709385, "grad_norm": 3.0256638526916504, "learning_rate": 2.977340654164129e-05, "loss": 0.3034, "step": 3139 }, { "epoch": 0.756034430867393, "grad_norm": 1.4221413135528564, "learning_rate": 2.9717916701662662e-05, "loss": 0.4793, "step": 3140 }, { "epoch": 0.7562752061638476, "grad_norm": 1.2902501821517944, "learning_rate": 2.966246959334903e-05, "loss": 0.5462, "step": 3141 }, { "epoch": 0.7565159814603022, "grad_norm": 2.2602968215942383, "learning_rate": 2.960706525041238e-05, "loss": 0.5961, "step": 3142 }, { "epoch": 0.7567567567567568, "grad_norm": 2.0314295291900635, "learning_rate": 2.9551703706538623e-05, "loss": 0.9683, "step": 3143 }, { "epoch": 0.7569975320532113, "grad_norm": 1.4725910425186157, "learning_rate": 2.949638499538774e-05, "loss": 0.5248, "step": 3144 }, { "epoch": 0.7572383073496659, "grad_norm": 1.5069992542266846, "learning_rate": 2.944110915059366e-05, "loss": 0.5018, "step": 3145 }, { "epoch": 0.7574790826461205, "grad_norm": 1.0725562572479248, "learning_rate": 2.938587620576415e-05, "loss": 0.7976, "step": 3146 }, { "epoch": 0.7577198579425751, "grad_norm": 2.012692451477051, "learning_rate": 2.9330686194481006e-05, "loss": 0.5563, "step": 3147 }, { "epoch": 0.7579606332390296, "grad_norm": 1.9315499067306519, "learning_rate": 2.927553915029987e-05, "loss": 0.8436, "step": 3148 }, { "epoch": 0.7582014085354842, "grad_norm": 1.7731233835220337, "learning_rate": 2.9220435106750276e-05, "loss": 0.6159, "step": 3149 }, { "epoch": 0.7584421838319388, "grad_norm": 4.7184977531433105, "learning_rate": 2.9165374097335642e-05, "loss": 0.625, "step": 3150 }, { "epoch": 0.7586829591283935, "grad_norm": 3.9251320362091064, "learning_rate": 2.9110356155533113e-05, "loss": 0.6392, "step": 3151 }, { "epoch": 0.7589237344248481, "grad_norm": 2.422001600265503, "learning_rate": 2.905538131479376e-05, "loss": 0.4932, "step": 3152 }, { "epoch": 0.7591645097213026, "grad_norm": 3.3069140911102295, "learning_rate": 2.9000449608542447e-05, "loss": 0.7679, "step": 3153 }, { "epoch": 0.7594052850177572, "grad_norm": 2.4573240280151367, "learning_rate": 2.8945561070177696e-05, "loss": 0.8308, "step": 3154 }, { "epoch": 0.7596460603142118, "grad_norm": 1.1037508249282837, "learning_rate": 2.8890715733071927e-05, "loss": 0.4607, "step": 3155 }, { "epoch": 0.7598868356106664, "grad_norm": 1.992222785949707, "learning_rate": 2.8835913630571155e-05, "loss": 1.0511, "step": 3156 }, { "epoch": 0.7601276109071209, "grad_norm": 3.1501615047454834, "learning_rate": 2.8781154795995247e-05, "loss": 0.7244, "step": 3157 }, { "epoch": 0.7603683862035755, "grad_norm": 5.186891078948975, "learning_rate": 2.8726439262637727e-05, "loss": 0.5768, "step": 3158 }, { "epoch": 0.7606091615000301, "grad_norm": 3.4781057834625244, "learning_rate": 2.8671767063765676e-05, "loss": 0.4973, "step": 3159 }, { "epoch": 0.7608499367964847, "grad_norm": 0.9576385617256165, "learning_rate": 2.8617138232619955e-05, "loss": 0.6546, "step": 3160 }, { "epoch": 0.7610907120929392, "grad_norm": 1.434462070465088, "learning_rate": 2.8562552802415055e-05, "loss": 0.5047, "step": 3161 }, { "epoch": 0.7613314873893938, "grad_norm": 1.7557677030563354, "learning_rate": 2.850801080633896e-05, "loss": 0.6268, "step": 3162 }, { "epoch": 0.7615722626858484, "grad_norm": 1.2142372131347656, "learning_rate": 2.8453512277553406e-05, "loss": 0.5757, "step": 3163 }, { "epoch": 0.761813037982303, "grad_norm": 1.8882231712341309, "learning_rate": 2.8399057249193518e-05, "loss": 0.9265, "step": 3164 }, { "epoch": 0.7620538132787577, "grad_norm": 0.7379496693611145, "learning_rate": 2.8344645754368172e-05, "loss": 0.6167, "step": 3165 }, { "epoch": 0.7622945885752122, "grad_norm": 1.584207534790039, "learning_rate": 2.8290277826159683e-05, "loss": 0.7881, "step": 3166 }, { "epoch": 0.7625353638716668, "grad_norm": 2.564490556716919, "learning_rate": 2.8235953497623803e-05, "loss": 0.4444, "step": 3167 }, { "epoch": 0.7627761391681214, "grad_norm": 2.109895706176758, "learning_rate": 2.8181672801789917e-05, "loss": 0.5114, "step": 3168 }, { "epoch": 0.763016914464576, "grad_norm": 2.212892770767212, "learning_rate": 2.8127435771660747e-05, "loss": 0.9194, "step": 3169 }, { "epoch": 0.7632576897610305, "grad_norm": 0.5335499048233032, "learning_rate": 2.8073242440212556e-05, "loss": 0.2802, "step": 3170 }, { "epoch": 0.7634984650574851, "grad_norm": 0.7556986808776855, "learning_rate": 2.8019092840395044e-05, "loss": 0.4347, "step": 3171 }, { "epoch": 0.7637392403539397, "grad_norm": 4.619632244110107, "learning_rate": 2.796498700513124e-05, "loss": 0.8844, "step": 3172 }, { "epoch": 0.7639800156503943, "grad_norm": 4.419466018676758, "learning_rate": 2.7910924967317585e-05, "loss": 0.7078, "step": 3173 }, { "epoch": 0.7642207909468488, "grad_norm": 2.6079466342926025, "learning_rate": 2.785690675982404e-05, "loss": 0.3685, "step": 3174 }, { "epoch": 0.7644615662433034, "grad_norm": 2.4883298873901367, "learning_rate": 2.7802932415493698e-05, "loss": 1.2917, "step": 3175 }, { "epoch": 0.764702341539758, "grad_norm": 2.154827356338501, "learning_rate": 2.7749001967143128e-05, "loss": 1.0546, "step": 3176 }, { "epoch": 0.7649431168362126, "grad_norm": 1.366364598274231, "learning_rate": 2.7695115447562126e-05, "loss": 0.3194, "step": 3177 }, { "epoch": 0.7651838921326672, "grad_norm": 2.245346784591675, "learning_rate": 2.7641272889513837e-05, "loss": 0.5605, "step": 3178 }, { "epoch": 0.7654246674291217, "grad_norm": 8.141434669494629, "learning_rate": 2.7587474325734687e-05, "loss": 0.3617, "step": 3179 }, { "epoch": 0.7656654427255764, "grad_norm": 1.830678105354309, "learning_rate": 2.7533719788934255e-05, "loss": 0.6659, "step": 3180 }, { "epoch": 0.765906218022031, "grad_norm": 2.041790246963501, "learning_rate": 2.7480009311795473e-05, "loss": 0.7169, "step": 3181 }, { "epoch": 0.7661469933184856, "grad_norm": 2.693058967590332, "learning_rate": 2.7426342926974413e-05, "loss": 0.6781, "step": 3182 }, { "epoch": 0.7663877686149401, "grad_norm": 1.7061842679977417, "learning_rate": 2.737272066710036e-05, "loss": 0.5184, "step": 3183 }, { "epoch": 0.7666285439113947, "grad_norm": 1.023386001586914, "learning_rate": 2.73191425647758e-05, "loss": 0.2387, "step": 3184 }, { "epoch": 0.7668693192078493, "grad_norm": 0.27525773644447327, "learning_rate": 2.726560865257629e-05, "loss": 0.1579, "step": 3185 }, { "epoch": 0.7671100945043039, "grad_norm": 0.31351879239082336, "learning_rate": 2.7212118963050592e-05, "loss": 0.2697, "step": 3186 }, { "epoch": 0.7673508698007584, "grad_norm": 1.223887324333191, "learning_rate": 2.715867352872058e-05, "loss": 0.5606, "step": 3187 }, { "epoch": 0.767591645097213, "grad_norm": 4.273595333099365, "learning_rate": 2.710527238208116e-05, "loss": 0.6982, "step": 3188 }, { "epoch": 0.7678324203936676, "grad_norm": 3.2111504077911377, "learning_rate": 2.705191555560035e-05, "loss": 0.5278, "step": 3189 }, { "epoch": 0.7680731956901222, "grad_norm": 1.322572112083435, "learning_rate": 2.6998603081719243e-05, "loss": 0.6689, "step": 3190 }, { "epoch": 0.7683139709865767, "grad_norm": 2.3791556358337402, "learning_rate": 2.6945334992851933e-05, "loss": 0.3503, "step": 3191 }, { "epoch": 0.7685547462830313, "grad_norm": 3.1207807064056396, "learning_rate": 2.6892111321385584e-05, "loss": 0.6434, "step": 3192 }, { "epoch": 0.7687955215794859, "grad_norm": 1.937662124633789, "learning_rate": 2.6838932099680225e-05, "loss": 0.2284, "step": 3193 }, { "epoch": 0.7690362968759406, "grad_norm": 1.8253540992736816, "learning_rate": 2.678579736006901e-05, "loss": 0.4102, "step": 3194 }, { "epoch": 0.7692770721723952, "grad_norm": 1.696462869644165, "learning_rate": 2.6732707134857937e-05, "loss": 0.603, "step": 3195 }, { "epoch": 0.7695178474688497, "grad_norm": 1.8397753238677979, "learning_rate": 2.6679661456325988e-05, "loss": 0.3548, "step": 3196 }, { "epoch": 0.7697586227653043, "grad_norm": 2.073573350906372, "learning_rate": 2.6626660356725064e-05, "loss": 0.6005, "step": 3197 }, { "epoch": 0.7699993980617589, "grad_norm": 0.84525465965271, "learning_rate": 2.6573703868279963e-05, "loss": 0.3808, "step": 3198 }, { "epoch": 0.7702401733582135, "grad_norm": 2.4383602142333984, "learning_rate": 2.6520792023188333e-05, "loss": 0.8604, "step": 3199 }, { "epoch": 0.770480948654668, "grad_norm": 0.7531054019927979, "learning_rate": 2.646792485362074e-05, "loss": 0.5609, "step": 3200 }, { "epoch": 0.7707217239511226, "grad_norm": 0.8975092768669128, "learning_rate": 2.6415102391720482e-05, "loss": 0.4972, "step": 3201 }, { "epoch": 0.7709624992475772, "grad_norm": 0.7330169081687927, "learning_rate": 2.6362324669603776e-05, "loss": 0.3688, "step": 3202 }, { "epoch": 0.7712032745440318, "grad_norm": 1.5205063819885254, "learning_rate": 2.630959171935956e-05, "loss": 0.7152, "step": 3203 }, { "epoch": 0.7714440498404863, "grad_norm": 4.553707122802734, "learning_rate": 2.6256903573049597e-05, "loss": 0.5479, "step": 3204 }, { "epoch": 0.7716848251369409, "grad_norm": 0.9805248379707336, "learning_rate": 2.6204260262708403e-05, "loss": 0.655, "step": 3205 }, { "epoch": 0.7719256004333955, "grad_norm": 1.8487534523010254, "learning_rate": 2.6151661820343243e-05, "loss": 0.6114, "step": 3206 }, { "epoch": 0.7721663757298501, "grad_norm": 0.826151967048645, "learning_rate": 2.6099108277934103e-05, "loss": 0.2134, "step": 3207 }, { "epoch": 0.7724071510263048, "grad_norm": 3.3528854846954346, "learning_rate": 2.6046599667433603e-05, "loss": 0.8448, "step": 3208 }, { "epoch": 0.7726479263227592, "grad_norm": 1.5255182981491089, "learning_rate": 2.5994136020767124e-05, "loss": 0.5631, "step": 3209 }, { "epoch": 0.7728887016192139, "grad_norm": 2.4078643321990967, "learning_rate": 2.5941717369832707e-05, "loss": 0.5426, "step": 3210 }, { "epoch": 0.7731294769156685, "grad_norm": 4.288626670837402, "learning_rate": 2.588934374650096e-05, "loss": 0.4884, "step": 3211 }, { "epoch": 0.7733702522121231, "grad_norm": 3.0660624504089355, "learning_rate": 2.583701518261519e-05, "loss": 0.4575, "step": 3212 }, { "epoch": 0.7736110275085776, "grad_norm": 0.8354116678237915, "learning_rate": 2.5784731709991272e-05, "loss": 0.6711, "step": 3213 }, { "epoch": 0.7738518028050322, "grad_norm": 3.1987497806549072, "learning_rate": 2.57324933604177e-05, "loss": 0.6139, "step": 3214 }, { "epoch": 0.7740925781014868, "grad_norm": 0.9417548179626465, "learning_rate": 2.5680300165655503e-05, "loss": 0.6099, "step": 3215 }, { "epoch": 0.7743333533979414, "grad_norm": 1.8997162580490112, "learning_rate": 2.5628152157438222e-05, "loss": 0.6094, "step": 3216 }, { "epoch": 0.7745741286943959, "grad_norm": 1.4700846672058105, "learning_rate": 2.5576049367471998e-05, "loss": 0.2409, "step": 3217 }, { "epoch": 0.7748149039908505, "grad_norm": 7.270529747009277, "learning_rate": 2.5523991827435468e-05, "loss": 0.6279, "step": 3218 }, { "epoch": 0.7750556792873051, "grad_norm": 1.6653450727462769, "learning_rate": 2.5471979568979666e-05, "loss": 0.6544, "step": 3219 }, { "epoch": 0.7752964545837597, "grad_norm": 4.143406391143799, "learning_rate": 2.5420012623728208e-05, "loss": 0.733, "step": 3220 }, { "epoch": 0.7755372298802143, "grad_norm": 0.922996997833252, "learning_rate": 2.5368091023277096e-05, "loss": 0.3873, "step": 3221 }, { "epoch": 0.7757780051766688, "grad_norm": 1.7943379878997803, "learning_rate": 2.5316214799194805e-05, "loss": 0.1036, "step": 3222 }, { "epoch": 0.7760187804731234, "grad_norm": 3.269728422164917, "learning_rate": 2.5264383983022198e-05, "loss": 0.697, "step": 3223 }, { "epoch": 0.7762595557695781, "grad_norm": 1.3404314517974854, "learning_rate": 2.5212598606272486e-05, "loss": 0.7117, "step": 3224 }, { "epoch": 0.7765003310660327, "grad_norm": 3.5856986045837402, "learning_rate": 2.516085870043131e-05, "loss": 0.6111, "step": 3225 }, { "epoch": 0.7767411063624872, "grad_norm": 1.1721508502960205, "learning_rate": 2.51091642969567e-05, "loss": 0.2938, "step": 3226 }, { "epoch": 0.7769818816589418, "grad_norm": 2.648401975631714, "learning_rate": 2.50575154272789e-05, "loss": 0.8979, "step": 3227 }, { "epoch": 0.7772226569553964, "grad_norm": 4.080894947052002, "learning_rate": 2.5005912122800557e-05, "loss": 0.284, "step": 3228 }, { "epoch": 0.777463432251851, "grad_norm": 2.2749102115631104, "learning_rate": 2.495435441489661e-05, "loss": 0.6611, "step": 3229 }, { "epoch": 0.7777042075483055, "grad_norm": 2.709296226501465, "learning_rate": 2.4902842334914266e-05, "loss": 0.6276, "step": 3230 }, { "epoch": 0.7779449828447601, "grad_norm": 1.5649709701538086, "learning_rate": 2.4851375914173003e-05, "loss": 0.706, "step": 3231 }, { "epoch": 0.7781857581412147, "grad_norm": 1.1615535020828247, "learning_rate": 2.4799955183964463e-05, "loss": 0.4029, "step": 3232 }, { "epoch": 0.7784265334376693, "grad_norm": 1.1178641319274902, "learning_rate": 2.4748580175552627e-05, "loss": 0.6088, "step": 3233 }, { "epoch": 0.7786673087341239, "grad_norm": 1.7456036806106567, "learning_rate": 2.4697250920173566e-05, "loss": 0.6452, "step": 3234 }, { "epoch": 0.7789080840305784, "grad_norm": 1.5968141555786133, "learning_rate": 2.46459674490356e-05, "loss": 0.2782, "step": 3235 }, { "epoch": 0.779148859327033, "grad_norm": 1.4153774976730347, "learning_rate": 2.4594729793319227e-05, "loss": 0.9929, "step": 3236 }, { "epoch": 0.7793896346234876, "grad_norm": 4.274727821350098, "learning_rate": 2.4543537984176978e-05, "loss": 0.4176, "step": 3237 }, { "epoch": 0.7796304099199423, "grad_norm": 2.977787494659424, "learning_rate": 2.449239205273367e-05, "loss": 0.5403, "step": 3238 }, { "epoch": 0.7798711852163968, "grad_norm": 2.9022774696350098, "learning_rate": 2.4441292030086137e-05, "loss": 0.4639, "step": 3239 }, { "epoch": 0.7801119605128514, "grad_norm": 1.2932614088058472, "learning_rate": 2.439023794730326e-05, "loss": 0.6634, "step": 3240 }, { "epoch": 0.780352735809306, "grad_norm": 3.5876283645629883, "learning_rate": 2.433922983542609e-05, "loss": 1.0981, "step": 3241 }, { "epoch": 0.7805935111057606, "grad_norm": 1.4793999195098877, "learning_rate": 2.4288267725467618e-05, "loss": 0.398, "step": 3242 }, { "epoch": 0.7808342864022151, "grad_norm": 3.516136884689331, "learning_rate": 2.4237351648412942e-05, "loss": 0.6531, "step": 3243 }, { "epoch": 0.7810750616986697, "grad_norm": 0.46728962659835815, "learning_rate": 2.4186481635219193e-05, "loss": 0.0365, "step": 3244 }, { "epoch": 0.7813158369951243, "grad_norm": 5.394861221313477, "learning_rate": 2.4135657716815397e-05, "loss": 0.18, "step": 3245 }, { "epoch": 0.7815566122915789, "grad_norm": 2.0945961475372314, "learning_rate": 2.408487992410263e-05, "loss": 0.5442, "step": 3246 }, { "epoch": 0.7817973875880335, "grad_norm": 0.8790675401687622, "learning_rate": 2.4034148287953904e-05, "loss": 0.434, "step": 3247 }, { "epoch": 0.782038162884488, "grad_norm": 5.560616493225098, "learning_rate": 2.3983462839214177e-05, "loss": 0.4532, "step": 3248 }, { "epoch": 0.7822789381809426, "grad_norm": 1.3845301866531372, "learning_rate": 2.3932823608700338e-05, "loss": 0.6569, "step": 3249 }, { "epoch": 0.7825197134773972, "grad_norm": 4.446075916290283, "learning_rate": 2.3882230627201096e-05, "loss": 0.2362, "step": 3250 }, { "epoch": 0.7827604887738518, "grad_norm": 3.534898281097412, "learning_rate": 2.3831683925477134e-05, "loss": 0.3983, "step": 3251 }, { "epoch": 0.7830012640703063, "grad_norm": 2.6837666034698486, "learning_rate": 2.3781183534260975e-05, "loss": 0.6582, "step": 3252 }, { "epoch": 0.783242039366761, "grad_norm": 1.4857863187789917, "learning_rate": 2.373072948425692e-05, "loss": 0.3928, "step": 3253 }, { "epoch": 0.7834828146632156, "grad_norm": 7.157393932342529, "learning_rate": 2.368032180614118e-05, "loss": 0.7341, "step": 3254 }, { "epoch": 0.7837235899596702, "grad_norm": 1.6006975173950195, "learning_rate": 2.3629960530561736e-05, "loss": 0.8314, "step": 3255 }, { "epoch": 0.7839643652561247, "grad_norm": 2.4584901332855225, "learning_rate": 2.3579645688138352e-05, "loss": 0.4926, "step": 3256 }, { "epoch": 0.7842051405525793, "grad_norm": 2.698150396347046, "learning_rate": 2.3529377309462585e-05, "loss": 0.7207, "step": 3257 }, { "epoch": 0.7844459158490339, "grad_norm": 2.509859561920166, "learning_rate": 2.347915542509769e-05, "loss": 0.8804, "step": 3258 }, { "epoch": 0.7846866911454885, "grad_norm": 2.254075765609741, "learning_rate": 2.342898006557872e-05, "loss": 0.4099, "step": 3259 }, { "epoch": 0.7849274664419431, "grad_norm": 1.3479466438293457, "learning_rate": 2.337885126141236e-05, "loss": 0.4939, "step": 3260 }, { "epoch": 0.7851682417383976, "grad_norm": 1.9788506031036377, "learning_rate": 2.3328769043077058e-05, "loss": 0.6189, "step": 3261 }, { "epoch": 0.7854090170348522, "grad_norm": 1.8684098720550537, "learning_rate": 2.3278733441022925e-05, "loss": 0.8066, "step": 3262 }, { "epoch": 0.7856497923313068, "grad_norm": 3.792185068130493, "learning_rate": 2.3228744485671718e-05, "loss": 0.4835, "step": 3263 }, { "epoch": 0.7858905676277614, "grad_norm": 0.6826027035713196, "learning_rate": 2.3178802207416828e-05, "loss": 0.4087, "step": 3264 }, { "epoch": 0.7861313429242159, "grad_norm": 1.6336182355880737, "learning_rate": 2.3128906636623303e-05, "loss": 0.5187, "step": 3265 }, { "epoch": 0.7863721182206705, "grad_norm": 2.8685998916625977, "learning_rate": 2.3079057803627713e-05, "loss": 0.3996, "step": 3266 }, { "epoch": 0.7866128935171252, "grad_norm": 1.4814997911453247, "learning_rate": 2.3029255738738308e-05, "loss": 0.2919, "step": 3267 }, { "epoch": 0.7868536688135798, "grad_norm": 2.786038398742676, "learning_rate": 2.2979500472234806e-05, "loss": 0.62, "step": 3268 }, { "epoch": 0.7870944441100343, "grad_norm": 4.719537734985352, "learning_rate": 2.2929792034368535e-05, "loss": 0.822, "step": 3269 }, { "epoch": 0.7873352194064889, "grad_norm": 8.993035316467285, "learning_rate": 2.2880130455362358e-05, "loss": 0.4918, "step": 3270 }, { "epoch": 0.7875759947029435, "grad_norm": 1.7135777473449707, "learning_rate": 2.2830515765410622e-05, "loss": 0.4715, "step": 3271 }, { "epoch": 0.7878167699993981, "grad_norm": 2.256098508834839, "learning_rate": 2.278094799467918e-05, "loss": 1.0771, "step": 3272 }, { "epoch": 0.7880575452958526, "grad_norm": 1.1801178455352783, "learning_rate": 2.2731427173305307e-05, "loss": 0.6812, "step": 3273 }, { "epoch": 0.7882983205923072, "grad_norm": 1.6812212467193604, "learning_rate": 2.268195333139781e-05, "loss": 0.567, "step": 3274 }, { "epoch": 0.7885390958887618, "grad_norm": 2.234989881515503, "learning_rate": 2.263252649903691e-05, "loss": 0.5069, "step": 3275 }, { "epoch": 0.7887798711852164, "grad_norm": 1.5656296014785767, "learning_rate": 2.2583146706274184e-05, "loss": 0.3535, "step": 3276 }, { "epoch": 0.789020646481671, "grad_norm": 1.6030066013336182, "learning_rate": 2.253381398313269e-05, "loss": 0.9362, "step": 3277 }, { "epoch": 0.7892614217781255, "grad_norm": 1.6599286794662476, "learning_rate": 2.2484528359606816e-05, "loss": 0.2563, "step": 3278 }, { "epoch": 0.7895021970745801, "grad_norm": 3.2402637004852295, "learning_rate": 2.2435289865662344e-05, "loss": 0.8971, "step": 3279 }, { "epoch": 0.7897429723710347, "grad_norm": 3.251466751098633, "learning_rate": 2.2386098531236422e-05, "loss": 1.0431, "step": 3280 }, { "epoch": 0.7899837476674894, "grad_norm": 1.1390844583511353, "learning_rate": 2.233695438623743e-05, "loss": 0.381, "step": 3281 }, { "epoch": 0.7902245229639439, "grad_norm": 1.3459970951080322, "learning_rate": 2.228785746054515e-05, "loss": 0.5704, "step": 3282 }, { "epoch": 0.7904652982603985, "grad_norm": 6.388441562652588, "learning_rate": 2.223880778401065e-05, "loss": 0.7198, "step": 3283 }, { "epoch": 0.7907060735568531, "grad_norm": 2.3066797256469727, "learning_rate": 2.2189805386456186e-05, "loss": 0.3931, "step": 3284 }, { "epoch": 0.7909468488533077, "grad_norm": 4.641172409057617, "learning_rate": 2.2140850297675353e-05, "loss": 0.9101, "step": 3285 }, { "epoch": 0.7911876241497622, "grad_norm": 2.48939847946167, "learning_rate": 2.2091942547432955e-05, "loss": 0.5307, "step": 3286 }, { "epoch": 0.7914283994462168, "grad_norm": 1.4391555786132812, "learning_rate": 2.2043082165465023e-05, "loss": 0.3424, "step": 3287 }, { "epoch": 0.7916691747426714, "grad_norm": 2.05548357963562, "learning_rate": 2.19942691814788e-05, "loss": 0.5474, "step": 3288 }, { "epoch": 0.791909950039126, "grad_norm": 3.470940589904785, "learning_rate": 2.194550362515263e-05, "loss": 0.5817, "step": 3289 }, { "epoch": 0.7921507253355806, "grad_norm": 6.029779434204102, "learning_rate": 2.189678552613612e-05, "loss": 0.8264, "step": 3290 }, { "epoch": 0.7923915006320351, "grad_norm": 2.177302837371826, "learning_rate": 2.184811491405001e-05, "loss": 0.5883, "step": 3291 }, { "epoch": 0.7926322759284897, "grad_norm": 3.795201063156128, "learning_rate": 2.1799491818486083e-05, "loss": 0.8381, "step": 3292 }, { "epoch": 0.7928730512249443, "grad_norm": 2.861975908279419, "learning_rate": 2.1750916269007316e-05, "loss": 0.5125, "step": 3293 }, { "epoch": 0.793113826521399, "grad_norm": 2.659313917160034, "learning_rate": 2.1702388295147747e-05, "loss": 0.5038, "step": 3294 }, { "epoch": 0.7933546018178534, "grad_norm": 2.762467384338379, "learning_rate": 2.165390792641251e-05, "loss": 0.5655, "step": 3295 }, { "epoch": 0.793595377114308, "grad_norm": 1.6343928575515747, "learning_rate": 2.160547519227779e-05, "loss": 0.4066, "step": 3296 }, { "epoch": 0.7938361524107627, "grad_norm": 0.6321638822555542, "learning_rate": 2.155709012219076e-05, "loss": 0.3478, "step": 3297 }, { "epoch": 0.7940769277072173, "grad_norm": 4.435551643371582, "learning_rate": 2.1508752745569695e-05, "loss": 1.071, "step": 3298 }, { "epoch": 0.7943177030036718, "grad_norm": 11.227981567382812, "learning_rate": 2.1460463091803773e-05, "loss": 0.741, "step": 3299 }, { "epoch": 0.7945584783001264, "grad_norm": 1.14915931224823, "learning_rate": 2.1412221190253245e-05, "loss": 0.5523, "step": 3300 }, { "epoch": 0.794799253596581, "grad_norm": 0.4120855927467346, "learning_rate": 2.1364027070249282e-05, "loss": 0.1457, "step": 3301 }, { "epoch": 0.7950400288930356, "grad_norm": 3.5175342559814453, "learning_rate": 2.1315880761094044e-05, "loss": 1.1578, "step": 3302 }, { "epoch": 0.7952808041894902, "grad_norm": 1.621909260749817, "learning_rate": 2.126778229206058e-05, "loss": 1.1031, "step": 3303 }, { "epoch": 0.7955215794859447, "grad_norm": 1.1469271183013916, "learning_rate": 2.1219731692392887e-05, "loss": 0.16, "step": 3304 }, { "epoch": 0.7957623547823993, "grad_norm": 1.0267417430877686, "learning_rate": 2.1171728991305795e-05, "loss": 0.6588, "step": 3305 }, { "epoch": 0.7960031300788539, "grad_norm": 0.7822784781455994, "learning_rate": 2.1123774217985116e-05, "loss": 0.4397, "step": 3306 }, { "epoch": 0.7962439053753085, "grad_norm": 0.9245109558105469, "learning_rate": 2.107586740158738e-05, "loss": 0.2842, "step": 3307 }, { "epoch": 0.796484680671763, "grad_norm": 1.0245726108551025, "learning_rate": 2.1028008571240088e-05, "loss": 1.107, "step": 3308 }, { "epoch": 0.7967254559682176, "grad_norm": 4.2257466316223145, "learning_rate": 2.0980197756041542e-05, "loss": 0.7681, "step": 3309 }, { "epoch": 0.7969662312646723, "grad_norm": 1.4079909324645996, "learning_rate": 2.0932434985060733e-05, "loss": 0.3071, "step": 3310 }, { "epoch": 0.7972070065611269, "grad_norm": 2.9457712173461914, "learning_rate": 2.0884720287337657e-05, "loss": 0.5083, "step": 3311 }, { "epoch": 0.7974477818575814, "grad_norm": 1.8760308027267456, "learning_rate": 2.0837053691882856e-05, "loss": 0.1079, "step": 3312 }, { "epoch": 0.797688557154036, "grad_norm": 2.5826492309570312, "learning_rate": 2.0789435227677777e-05, "loss": 0.8308, "step": 3313 }, { "epoch": 0.7979293324504906, "grad_norm": 1.930856466293335, "learning_rate": 2.074186492367457e-05, "loss": 0.6475, "step": 3314 }, { "epoch": 0.7981701077469452, "grad_norm": 1.8756681680679321, "learning_rate": 2.069434280879603e-05, "loss": 0.4886, "step": 3315 }, { "epoch": 0.7984108830433998, "grad_norm": 3.851440668106079, "learning_rate": 2.0646868911935735e-05, "loss": 1.2528, "step": 3316 }, { "epoch": 0.7986516583398543, "grad_norm": 1.4915354251861572, "learning_rate": 2.0599443261957962e-05, "loss": 0.534, "step": 3317 }, { "epoch": 0.7988924336363089, "grad_norm": 0.9275015592575073, "learning_rate": 2.0552065887697546e-05, "loss": 0.3342, "step": 3318 }, { "epoch": 0.7991332089327635, "grad_norm": 2.0070860385894775, "learning_rate": 2.0504736817960068e-05, "loss": 0.7064, "step": 3319 }, { "epoch": 0.7993739842292181, "grad_norm": 2.1603689193725586, "learning_rate": 2.045745608152171e-05, "loss": 0.7129, "step": 3320 }, { "epoch": 0.7996147595256726, "grad_norm": 1.2571876049041748, "learning_rate": 2.0410223707129274e-05, "loss": 0.8612, "step": 3321 }, { "epoch": 0.7998555348221272, "grad_norm": 2.6326212882995605, "learning_rate": 2.0363039723500156e-05, "loss": 0.4104, "step": 3322 }, { "epoch": 0.8000963101185818, "grad_norm": 1.6665747165679932, "learning_rate": 2.0315904159322287e-05, "loss": 0.6619, "step": 3323 }, { "epoch": 0.8003370854150365, "grad_norm": 1.7550292015075684, "learning_rate": 2.026881704325425e-05, "loss": 0.8006, "step": 3324 }, { "epoch": 0.800577860711491, "grad_norm": 0.9334133267402649, "learning_rate": 2.0221778403925062e-05, "loss": 0.4847, "step": 3325 }, { "epoch": 0.8008186360079456, "grad_norm": 1.6787467002868652, "learning_rate": 2.0174788269934343e-05, "loss": 0.4084, "step": 3326 }, { "epoch": 0.8010594113044002, "grad_norm": 1.1613237857818604, "learning_rate": 2.01278466698522e-05, "loss": 0.4913, "step": 3327 }, { "epoch": 0.8013001866008548, "grad_norm": 1.7713500261306763, "learning_rate": 2.0080953632219247e-05, "loss": 0.3606, "step": 3328 }, { "epoch": 0.8015409618973094, "grad_norm": 2.774338960647583, "learning_rate": 2.0034109185546534e-05, "loss": 0.6157, "step": 3329 }, { "epoch": 0.8017817371937639, "grad_norm": 2.1375133991241455, "learning_rate": 1.9987313358315628e-05, "loss": 0.8029, "step": 3330 }, { "epoch": 0.8020225124902185, "grad_norm": 4.05165958404541, "learning_rate": 1.994056617897846e-05, "loss": 0.9467, "step": 3331 }, { "epoch": 0.8022632877866731, "grad_norm": 1.2718948125839233, "learning_rate": 1.9893867675957445e-05, "loss": 0.6438, "step": 3332 }, { "epoch": 0.8025040630831277, "grad_norm": 2.1101791858673096, "learning_rate": 1.984721787764534e-05, "loss": 0.6168, "step": 3333 }, { "epoch": 0.8027448383795822, "grad_norm": 3.8065285682678223, "learning_rate": 1.9800616812405348e-05, "loss": 0.331, "step": 3334 }, { "epoch": 0.8029856136760368, "grad_norm": 1.6323808431625366, "learning_rate": 1.9754064508571036e-05, "loss": 0.4418, "step": 3335 }, { "epoch": 0.8032263889724914, "grad_norm": 1.982974648475647, "learning_rate": 1.9707560994446284e-05, "loss": 0.6296, "step": 3336 }, { "epoch": 0.803467164268946, "grad_norm": 0.8455390334129333, "learning_rate": 1.9661106298305387e-05, "loss": 0.4286, "step": 3337 }, { "epoch": 0.8037079395654005, "grad_norm": 1.4520126581192017, "learning_rate": 1.9614700448392832e-05, "loss": 0.7171, "step": 3338 }, { "epoch": 0.8039487148618552, "grad_norm": 1.9037809371948242, "learning_rate": 1.9568343472923524e-05, "loss": 0.4656, "step": 3339 }, { "epoch": 0.8041894901583098, "grad_norm": 1.7248350381851196, "learning_rate": 1.9522035400082615e-05, "loss": 0.3961, "step": 3340 }, { "epoch": 0.8044302654547644, "grad_norm": 2.145430326461792, "learning_rate": 1.947577625802548e-05, "loss": 0.5493, "step": 3341 }, { "epoch": 0.804671040751219, "grad_norm": 0.38401633501052856, "learning_rate": 1.9429566074877816e-05, "loss": 0.4645, "step": 3342 }, { "epoch": 0.8049118160476735, "grad_norm": 4.1669840812683105, "learning_rate": 1.938340487873549e-05, "loss": 0.6155, "step": 3343 }, { "epoch": 0.8051525913441281, "grad_norm": 1.835777997970581, "learning_rate": 1.9337292697664633e-05, "loss": 0.7442, "step": 3344 }, { "epoch": 0.8053933666405827, "grad_norm": 1.9592784643173218, "learning_rate": 1.9291229559701572e-05, "loss": 0.6318, "step": 3345 }, { "epoch": 0.8056341419370373, "grad_norm": 2.0905535221099854, "learning_rate": 1.9245215492852766e-05, "loss": 0.5699, "step": 3346 }, { "epoch": 0.8058749172334918, "grad_norm": 1.7609286308288574, "learning_rate": 1.919925052509487e-05, "loss": 0.9583, "step": 3347 }, { "epoch": 0.8061156925299464, "grad_norm": 2.8224740028381348, "learning_rate": 1.9153334684374725e-05, "loss": 0.8957, "step": 3348 }, { "epoch": 0.806356467826401, "grad_norm": 0.9539978504180908, "learning_rate": 1.9107467998609228e-05, "loss": 0.7801, "step": 3349 }, { "epoch": 0.8065972431228556, "grad_norm": 3.402155637741089, "learning_rate": 1.9061650495685433e-05, "loss": 0.8503, "step": 3350 }, { "epoch": 0.8068380184193101, "grad_norm": 1.3554385900497437, "learning_rate": 1.9015882203460488e-05, "loss": 0.5393, "step": 3351 }, { "epoch": 0.8070787937157647, "grad_norm": 3.57460618019104, "learning_rate": 1.8970163149761634e-05, "loss": 0.9425, "step": 3352 }, { "epoch": 0.8073195690122194, "grad_norm": 7.111121654510498, "learning_rate": 1.8924493362386166e-05, "loss": 0.5293, "step": 3353 }, { "epoch": 0.807560344308674, "grad_norm": 2.1528825759887695, "learning_rate": 1.887887286910137e-05, "loss": 0.2559, "step": 3354 }, { "epoch": 0.8078011196051285, "grad_norm": 0.7006820440292358, "learning_rate": 1.8833301697644644e-05, "loss": 0.431, "step": 3355 }, { "epoch": 0.8080418949015831, "grad_norm": 2.09385085105896, "learning_rate": 1.878777987572339e-05, "loss": 1.159, "step": 3356 }, { "epoch": 0.8082826701980377, "grad_norm": 3.343334674835205, "learning_rate": 1.8742307431014905e-05, "loss": 1.0441, "step": 3357 }, { "epoch": 0.8085234454944923, "grad_norm": 4.183254241943359, "learning_rate": 1.869688439116659e-05, "loss": 1.0862, "step": 3358 }, { "epoch": 0.8087642207909469, "grad_norm": 2.1016793251037598, "learning_rate": 1.8651510783795734e-05, "loss": 0.5553, "step": 3359 }, { "epoch": 0.8090049960874014, "grad_norm": 0.7969531416893005, "learning_rate": 1.8606186636489596e-05, "loss": 0.54, "step": 3360 }, { "epoch": 0.809245771383856, "grad_norm": 1.4294320344924927, "learning_rate": 1.8560911976805405e-05, "loss": 0.4665, "step": 3361 }, { "epoch": 0.8094865466803106, "grad_norm": 3.3648643493652344, "learning_rate": 1.8515686832270184e-05, "loss": 1.236, "step": 3362 }, { "epoch": 0.8097273219767652, "grad_norm": 1.2407772541046143, "learning_rate": 1.8470511230380983e-05, "loss": 0.553, "step": 3363 }, { "epoch": 0.8099680972732197, "grad_norm": 3.098479986190796, "learning_rate": 1.8425385198604615e-05, "loss": 0.4733, "step": 3364 }, { "epoch": 0.8102088725696743, "grad_norm": 1.627521276473999, "learning_rate": 1.8380308764377842e-05, "loss": 0.4355, "step": 3365 }, { "epoch": 0.8104496478661289, "grad_norm": 1.2290712594985962, "learning_rate": 1.833528195510722e-05, "loss": 0.4626, "step": 3366 }, { "epoch": 0.8106904231625836, "grad_norm": 1.5985430479049683, "learning_rate": 1.8290304798169176e-05, "loss": 0.2771, "step": 3367 }, { "epoch": 0.810931198459038, "grad_norm": 2.063868522644043, "learning_rate": 1.8245377320909894e-05, "loss": 0.3625, "step": 3368 }, { "epoch": 0.8111719737554927, "grad_norm": 17.49557113647461, "learning_rate": 1.8200499550645433e-05, "loss": 1.085, "step": 3369 }, { "epoch": 0.8114127490519473, "grad_norm": 2.0841176509857178, "learning_rate": 1.815567151466151e-05, "loss": 0.722, "step": 3370 }, { "epoch": 0.8116535243484019, "grad_norm": 2.4572806358337402, "learning_rate": 1.8110893240213733e-05, "loss": 0.365, "step": 3371 }, { "epoch": 0.8118942996448565, "grad_norm": 0.8181131482124329, "learning_rate": 1.806616475452734e-05, "loss": 0.4139, "step": 3372 }, { "epoch": 0.812135074941311, "grad_norm": 2.0539329051971436, "learning_rate": 1.8021486084797368e-05, "loss": 0.8089, "step": 3373 }, { "epoch": 0.8123758502377656, "grad_norm": 1.458526611328125, "learning_rate": 1.797685725818856e-05, "loss": 0.7537, "step": 3374 }, { "epoch": 0.8126166255342202, "grad_norm": 0.4736323058605194, "learning_rate": 1.7932278301835347e-05, "loss": 0.1747, "step": 3375 }, { "epoch": 0.8128574008306748, "grad_norm": 2.3120036125183105, "learning_rate": 1.7887749242841844e-05, "loss": 0.4192, "step": 3376 }, { "epoch": 0.8130981761271293, "grad_norm": 15.435149192810059, "learning_rate": 1.7843270108281772e-05, "loss": 0.6507, "step": 3377 }, { "epoch": 0.8133389514235839, "grad_norm": 1.2103174924850464, "learning_rate": 1.779884092519859e-05, "loss": 0.6697, "step": 3378 }, { "epoch": 0.8135797267200385, "grad_norm": 1.7959200143814087, "learning_rate": 1.7754461720605342e-05, "loss": 0.4094, "step": 3379 }, { "epoch": 0.8138205020164931, "grad_norm": 2.2297651767730713, "learning_rate": 1.7710132521484646e-05, "loss": 1.1822, "step": 3380 }, { "epoch": 0.8140612773129476, "grad_norm": 1.7390764951705933, "learning_rate": 1.7665853354788774e-05, "loss": 0.6164, "step": 3381 }, { "epoch": 0.8143020526094022, "grad_norm": 1.3549991846084595, "learning_rate": 1.7621624247439594e-05, "loss": 0.3209, "step": 3382 }, { "epoch": 0.8145428279058569, "grad_norm": 0.4444354176521301, "learning_rate": 1.7577445226328425e-05, "loss": 0.3606, "step": 3383 }, { "epoch": 0.8147836032023115, "grad_norm": 10.032902717590332, "learning_rate": 1.7533316318316307e-05, "loss": 0.7676, "step": 3384 }, { "epoch": 0.8150243784987661, "grad_norm": 1.579328179359436, "learning_rate": 1.748923755023364e-05, "loss": 0.7432, "step": 3385 }, { "epoch": 0.8152651537952206, "grad_norm": 2.0636134147644043, "learning_rate": 1.7445208948880442e-05, "loss": 0.9798, "step": 3386 }, { "epoch": 0.8155059290916752, "grad_norm": 1.8931350708007812, "learning_rate": 1.7401230541026226e-05, "loss": 0.4045, "step": 3387 }, { "epoch": 0.8157467043881298, "grad_norm": 29.280317306518555, "learning_rate": 1.735730235340991e-05, "loss": 0.8831, "step": 3388 }, { "epoch": 0.8159874796845844, "grad_norm": 1.0704580545425415, "learning_rate": 1.7313424412739987e-05, "loss": 0.5779, "step": 3389 }, { "epoch": 0.8162282549810389, "grad_norm": 1.8337838649749756, "learning_rate": 1.7269596745694295e-05, "loss": 0.6469, "step": 3390 }, { "epoch": 0.8164690302774935, "grad_norm": 1.4514000415802002, "learning_rate": 1.722581937892015e-05, "loss": 0.4172, "step": 3391 }, { "epoch": 0.8167098055739481, "grad_norm": 2.778085231781006, "learning_rate": 1.718209233903436e-05, "loss": 0.622, "step": 3392 }, { "epoch": 0.8169505808704027, "grad_norm": 1.9710346460342407, "learning_rate": 1.7138415652622995e-05, "loss": 0.3566, "step": 3393 }, { "epoch": 0.8171913561668572, "grad_norm": 3.470649003982544, "learning_rate": 1.70947893462416e-05, "loss": 0.6804, "step": 3394 }, { "epoch": 0.8174321314633118, "grad_norm": 0.8175150752067566, "learning_rate": 1.7051213446415104e-05, "loss": 0.1711, "step": 3395 }, { "epoch": 0.8176729067597664, "grad_norm": 2.38569974899292, "learning_rate": 1.7007687979637687e-05, "loss": 1.0114, "step": 3396 }, { "epoch": 0.8179136820562211, "grad_norm": 1.0082086324691772, "learning_rate": 1.6964212972372995e-05, "loss": 0.6251, "step": 3397 }, { "epoch": 0.8181544573526757, "grad_norm": 3.498782157897949, "learning_rate": 1.692078845105386e-05, "loss": 1.078, "step": 3398 }, { "epoch": 0.8183952326491302, "grad_norm": 1.055105447769165, "learning_rate": 1.6877414442082528e-05, "loss": 0.45, "step": 3399 }, { "epoch": 0.8186360079455848, "grad_norm": 3.0139036178588867, "learning_rate": 1.6834090971830507e-05, "loss": 0.5409, "step": 3400 }, { "epoch": 0.8188767832420394, "grad_norm": 1.671664834022522, "learning_rate": 1.6790818066638536e-05, "loss": 0.5133, "step": 3401 }, { "epoch": 0.819117558538494, "grad_norm": 3.366199254989624, "learning_rate": 1.6747595752816658e-05, "loss": 0.6439, "step": 3402 }, { "epoch": 0.8193583338349485, "grad_norm": 1.2832306623458862, "learning_rate": 1.6704424056644154e-05, "loss": 0.7887, "step": 3403 }, { "epoch": 0.8195991091314031, "grad_norm": 1.5629595518112183, "learning_rate": 1.6661303004369468e-05, "loss": 0.5992, "step": 3404 }, { "epoch": 0.8198398844278577, "grad_norm": 1.5762869119644165, "learning_rate": 1.661823262221035e-05, "loss": 0.6254, "step": 3405 }, { "epoch": 0.8200806597243123, "grad_norm": 4.563362121582031, "learning_rate": 1.6575212936353625e-05, "loss": 0.9719, "step": 3406 }, { "epoch": 0.8203214350207668, "grad_norm": 0.8489302396774292, "learning_rate": 1.6532243972955398e-05, "loss": 0.3529, "step": 3407 }, { "epoch": 0.8205622103172214, "grad_norm": 4.638950824737549, "learning_rate": 1.6489325758140895e-05, "loss": 0.3409, "step": 3408 }, { "epoch": 0.820802985613676, "grad_norm": 1.3469526767730713, "learning_rate": 1.6446458318004477e-05, "loss": 0.5872, "step": 3409 }, { "epoch": 0.8210437609101306, "grad_norm": 0.8786214590072632, "learning_rate": 1.640364167860967e-05, "loss": 0.3984, "step": 3410 }, { "epoch": 0.8212845362065853, "grad_norm": 1.392174482345581, "learning_rate": 1.6360875865989046e-05, "loss": 0.293, "step": 3411 }, { "epoch": 0.8215253115030398, "grad_norm": 1.987600326538086, "learning_rate": 1.631816090614434e-05, "loss": 0.2162, "step": 3412 }, { "epoch": 0.8217660867994944, "grad_norm": 0.8357548713684082, "learning_rate": 1.6275496825046367e-05, "loss": 0.5193, "step": 3413 }, { "epoch": 0.822006862095949, "grad_norm": 2.0921974182128906, "learning_rate": 1.6232883648634933e-05, "loss": 0.5682, "step": 3414 }, { "epoch": 0.8222476373924036, "grad_norm": 2.1746203899383545, "learning_rate": 1.6190321402818963e-05, "loss": 0.6858, "step": 3415 }, { "epoch": 0.8224884126888581, "grad_norm": 4.1129584312438965, "learning_rate": 1.6147810113476413e-05, "loss": 0.6928, "step": 3416 }, { "epoch": 0.8227291879853127, "grad_norm": 3.485736846923828, "learning_rate": 1.610534980645423e-05, "loss": 0.9683, "step": 3417 }, { "epoch": 0.8229699632817673, "grad_norm": 1.9745628833770752, "learning_rate": 1.60629405075684e-05, "loss": 0.5169, "step": 3418 }, { "epoch": 0.8232107385782219, "grad_norm": 1.6666935682296753, "learning_rate": 1.6020582242603844e-05, "loss": 0.6159, "step": 3419 }, { "epoch": 0.8234515138746764, "grad_norm": 4.859829425811768, "learning_rate": 1.5978275037314482e-05, "loss": 0.9885, "step": 3420 }, { "epoch": 0.823692289171131, "grad_norm": 2.979112148284912, "learning_rate": 1.5936018917423236e-05, "loss": 0.6964, "step": 3421 }, { "epoch": 0.8239330644675856, "grad_norm": 2.812305450439453, "learning_rate": 1.5893813908621857e-05, "loss": 0.9054, "step": 3422 }, { "epoch": 0.8241738397640402, "grad_norm": 8.65831184387207, "learning_rate": 1.5851660036571115e-05, "loss": 1.0088, "step": 3423 }, { "epoch": 0.8244146150604948, "grad_norm": 0.610390305519104, "learning_rate": 1.5809557326900647e-05, "loss": 0.4041, "step": 3424 }, { "epoch": 0.8246553903569493, "grad_norm": 2.6373860836029053, "learning_rate": 1.5767505805209027e-05, "loss": 0.7424, "step": 3425 }, { "epoch": 0.824896165653404, "grad_norm": 2.2015843391418457, "learning_rate": 1.5725505497063664e-05, "loss": 0.9575, "step": 3426 }, { "epoch": 0.8251369409498586, "grad_norm": 2.0284852981567383, "learning_rate": 1.568355642800081e-05, "loss": 0.6706, "step": 3427 }, { "epoch": 0.8253777162463132, "grad_norm": 3.0381412506103516, "learning_rate": 1.5641658623525623e-05, "loss": 0.3887, "step": 3428 }, { "epoch": 0.8256184915427677, "grad_norm": 3.0191638469696045, "learning_rate": 1.5599812109112076e-05, "loss": 0.8594, "step": 3429 }, { "epoch": 0.8258592668392223, "grad_norm": 2.608114719390869, "learning_rate": 1.55580169102029e-05, "loss": 0.2382, "step": 3430 }, { "epoch": 0.8261000421356769, "grad_norm": 2.528092384338379, "learning_rate": 1.5516273052209683e-05, "loss": 0.6125, "step": 3431 }, { "epoch": 0.8263408174321315, "grad_norm": 1.7258909940719604, "learning_rate": 1.547458056051281e-05, "loss": 0.2416, "step": 3432 }, { "epoch": 0.826581592728586, "grad_norm": 1.4426395893096924, "learning_rate": 1.5432939460461384e-05, "loss": 0.5668, "step": 3433 }, { "epoch": 0.8268223680250406, "grad_norm": 1.0259637832641602, "learning_rate": 1.539134977737332e-05, "loss": 0.2167, "step": 3434 }, { "epoch": 0.8270631433214952, "grad_norm": 1.5782815217971802, "learning_rate": 1.5349811536535196e-05, "loss": 0.9795, "step": 3435 }, { "epoch": 0.8273039186179498, "grad_norm": 1.5504636764526367, "learning_rate": 1.5308324763202397e-05, "loss": 0.6967, "step": 3436 }, { "epoch": 0.8275446939144043, "grad_norm": 0.8018413186073303, "learning_rate": 1.5266889482598934e-05, "loss": 0.1634, "step": 3437 }, { "epoch": 0.8277854692108589, "grad_norm": 2.674348831176758, "learning_rate": 1.5225505719917577e-05, "loss": 0.8628, "step": 3438 }, { "epoch": 0.8280262445073135, "grad_norm": 1.5734491348266602, "learning_rate": 1.5184173500319731e-05, "loss": 0.6495, "step": 3439 }, { "epoch": 0.8282670198037682, "grad_norm": 3.662338972091675, "learning_rate": 1.5142892848935497e-05, "loss": 0.9257, "step": 3440 }, { "epoch": 0.8285077951002228, "grad_norm": 4.754350185394287, "learning_rate": 1.5101663790863596e-05, "loss": 0.2461, "step": 3441 }, { "epoch": 0.8287485703966773, "grad_norm": 3.429853677749634, "learning_rate": 1.5060486351171411e-05, "loss": 1.0115, "step": 3442 }, { "epoch": 0.8289893456931319, "grad_norm": 2.6595869064331055, "learning_rate": 1.5019360554894868e-05, "loss": 0.4968, "step": 3443 }, { "epoch": 0.8292301209895865, "grad_norm": 1.3187874555587769, "learning_rate": 1.4978286427038601e-05, "loss": 0.6117, "step": 3444 }, { "epoch": 0.8294708962860411, "grad_norm": 1.491859793663025, "learning_rate": 1.4937263992575712e-05, "loss": 0.4465, "step": 3445 }, { "epoch": 0.8297116715824956, "grad_norm": 2.291027784347534, "learning_rate": 1.489629327644797e-05, "loss": 0.8872, "step": 3446 }, { "epoch": 0.8299524468789502, "grad_norm": 1.3068852424621582, "learning_rate": 1.4855374303565662e-05, "loss": 0.358, "step": 3447 }, { "epoch": 0.8301932221754048, "grad_norm": 1.2296390533447266, "learning_rate": 1.4814507098807595e-05, "loss": 0.5845, "step": 3448 }, { "epoch": 0.8304339974718594, "grad_norm": 2.593040943145752, "learning_rate": 1.4773691687021174e-05, "loss": 0.6584, "step": 3449 }, { "epoch": 0.8306747727683139, "grad_norm": 1.2656725645065308, "learning_rate": 1.473292809302219e-05, "loss": 0.6618, "step": 3450 }, { "epoch": 0.8309155480647685, "grad_norm": 1.9369158744812012, "learning_rate": 1.4692216341595044e-05, "loss": 0.4147, "step": 3451 }, { "epoch": 0.8311563233612231, "grad_norm": 1.4402110576629639, "learning_rate": 1.4651556457492588e-05, "loss": 0.2083, "step": 3452 }, { "epoch": 0.8313970986576777, "grad_norm": 0.478405237197876, "learning_rate": 1.4610948465436069e-05, "loss": 0.2323, "step": 3453 }, { "epoch": 0.8316378739541324, "grad_norm": 2.096238613128662, "learning_rate": 1.4570392390115261e-05, "loss": 0.2422, "step": 3454 }, { "epoch": 0.8318786492505869, "grad_norm": 0.71112060546875, "learning_rate": 1.4529888256188363e-05, "loss": 0.2833, "step": 3455 }, { "epoch": 0.8321194245470415, "grad_norm": 4.690402984619141, "learning_rate": 1.448943608828197e-05, "loss": 1.0562, "step": 3456 }, { "epoch": 0.8323601998434961, "grad_norm": 1.7524763345718384, "learning_rate": 1.4449035910991115e-05, "loss": 0.087, "step": 3457 }, { "epoch": 0.8326009751399507, "grad_norm": 1.4712945222854614, "learning_rate": 1.4408687748879156e-05, "loss": 0.4935, "step": 3458 }, { "epoch": 0.8328417504364052, "grad_norm": 2.9652466773986816, "learning_rate": 1.4368391626477884e-05, "loss": 0.7135, "step": 3459 }, { "epoch": 0.8330825257328598, "grad_norm": 1.4162325859069824, "learning_rate": 1.4328147568287453e-05, "loss": 0.4518, "step": 3460 }, { "epoch": 0.8333233010293144, "grad_norm": 0.665979266166687, "learning_rate": 1.4287955598776304e-05, "loss": 0.3329, "step": 3461 }, { "epoch": 0.833564076325769, "grad_norm": 2.3450982570648193, "learning_rate": 1.4247815742381277e-05, "loss": 0.4297, "step": 3462 }, { "epoch": 0.8338048516222235, "grad_norm": 2.609652519226074, "learning_rate": 1.4207728023507471e-05, "loss": 0.595, "step": 3463 }, { "epoch": 0.8340456269186781, "grad_norm": 1.1684465408325195, "learning_rate": 1.4167692466528281e-05, "loss": 0.2792, "step": 3464 }, { "epoch": 0.8342864022151327, "grad_norm": 1.9600780010223389, "learning_rate": 1.4127709095785513e-05, "loss": 0.1156, "step": 3465 }, { "epoch": 0.8345271775115873, "grad_norm": 1.2096495628356934, "learning_rate": 1.4087777935589052e-05, "loss": 0.6391, "step": 3466 }, { "epoch": 0.834767952808042, "grad_norm": 2.56876540184021, "learning_rate": 1.404789901021717e-05, "loss": 0.5451, "step": 3467 }, { "epoch": 0.8350087281044964, "grad_norm": 2.7339911460876465, "learning_rate": 1.4008072343916379e-05, "loss": 0.7596, "step": 3468 }, { "epoch": 0.835249503400951, "grad_norm": 1.9011280536651611, "learning_rate": 1.396829796090131e-05, "loss": 0.9727, "step": 3469 }, { "epoch": 0.8354902786974057, "grad_norm": 1.2117639780044556, "learning_rate": 1.3928575885354933e-05, "loss": 0.2906, "step": 3470 }, { "epoch": 0.8357310539938603, "grad_norm": 1.8486530780792236, "learning_rate": 1.3888906141428325e-05, "loss": 0.395, "step": 3471 }, { "epoch": 0.8359718292903148, "grad_norm": 4.039324760437012, "learning_rate": 1.3849288753240786e-05, "loss": 0.4848, "step": 3472 }, { "epoch": 0.8362126045867694, "grad_norm": 1.2622008323669434, "learning_rate": 1.3809723744879788e-05, "loss": 0.6475, "step": 3473 }, { "epoch": 0.836453379883224, "grad_norm": 0.6000483632087708, "learning_rate": 1.3770211140400946e-05, "loss": 0.1693, "step": 3474 }, { "epoch": 0.8366941551796786, "grad_norm": 2.0849924087524414, "learning_rate": 1.3730750963828032e-05, "loss": 0.1984, "step": 3475 }, { "epoch": 0.8369349304761331, "grad_norm": 2.24556040763855, "learning_rate": 1.3691343239152864e-05, "loss": 0.4181, "step": 3476 }, { "epoch": 0.8371757057725877, "grad_norm": 1.9239386320114136, "learning_rate": 1.3651987990335469e-05, "loss": 0.4543, "step": 3477 }, { "epoch": 0.8374164810690423, "grad_norm": 1.7591582536697388, "learning_rate": 1.3612685241303947e-05, "loss": 0.5755, "step": 3478 }, { "epoch": 0.8376572563654969, "grad_norm": 0.7575153112411499, "learning_rate": 1.3573435015954406e-05, "loss": 0.0756, "step": 3479 }, { "epoch": 0.8378980316619515, "grad_norm": 1.6970840692520142, "learning_rate": 1.3534237338151102e-05, "loss": 0.2317, "step": 3480 }, { "epoch": 0.838138806958406, "grad_norm": 1.827329158782959, "learning_rate": 1.3495092231726304e-05, "loss": 0.7636, "step": 3481 }, { "epoch": 0.8383795822548606, "grad_norm": 2.604074239730835, "learning_rate": 1.3455999720480316e-05, "loss": 0.6374, "step": 3482 }, { "epoch": 0.8386203575513153, "grad_norm": 2.306910514831543, "learning_rate": 1.341695982818152e-05, "loss": 0.7725, "step": 3483 }, { "epoch": 0.8388611328477699, "grad_norm": 1.2861391305923462, "learning_rate": 1.337797257856619e-05, "loss": 0.6311, "step": 3484 }, { "epoch": 0.8391019081442244, "grad_norm": 2.032479763031006, "learning_rate": 1.33390379953387e-05, "loss": 0.4538, "step": 3485 }, { "epoch": 0.839342683440679, "grad_norm": 0.6007648706436157, "learning_rate": 1.3300156102171379e-05, "loss": 0.1634, "step": 3486 }, { "epoch": 0.8395834587371336, "grad_norm": 2.5209765434265137, "learning_rate": 1.3261326922704464e-05, "loss": 0.1593, "step": 3487 }, { "epoch": 0.8398242340335882, "grad_norm": 0.9033012390136719, "learning_rate": 1.3222550480546203e-05, "loss": 0.3819, "step": 3488 }, { "epoch": 0.8400650093300427, "grad_norm": 7.269277572631836, "learning_rate": 1.3183826799272758e-05, "loss": 0.5313, "step": 3489 }, { "epoch": 0.8403057846264973, "grad_norm": 2.5734009742736816, "learning_rate": 1.3145155902428219e-05, "loss": 0.6007, "step": 3490 }, { "epoch": 0.8405465599229519, "grad_norm": 2.136906147003174, "learning_rate": 1.310653781352461e-05, "loss": 0.7098, "step": 3491 }, { "epoch": 0.8407873352194065, "grad_norm": 1.183111310005188, "learning_rate": 1.3067972556041752e-05, "loss": 0.5929, "step": 3492 }, { "epoch": 0.8410281105158611, "grad_norm": 2.32429575920105, "learning_rate": 1.3029460153427442e-05, "loss": 0.6398, "step": 3493 }, { "epoch": 0.8412688858123156, "grad_norm": 2.2042758464813232, "learning_rate": 1.2991000629097328e-05, "loss": 0.4816, "step": 3494 }, { "epoch": 0.8415096611087702, "grad_norm": 2.461498498916626, "learning_rate": 1.2952594006434849e-05, "loss": 0.74, "step": 3495 }, { "epoch": 0.8417504364052248, "grad_norm": 1.1167387962341309, "learning_rate": 1.2914240308791326e-05, "loss": 1.0576, "step": 3496 }, { "epoch": 0.8419912117016795, "grad_norm": 1.2157386541366577, "learning_rate": 1.2875939559485905e-05, "loss": 0.5803, "step": 3497 }, { "epoch": 0.842231986998134, "grad_norm": 1.176676869392395, "learning_rate": 1.2837691781805516e-05, "loss": 0.7533, "step": 3498 }, { "epoch": 0.8424727622945886, "grad_norm": 1.823185920715332, "learning_rate": 1.2799496999004935e-05, "loss": 0.3445, "step": 3499 }, { "epoch": 0.8427135375910432, "grad_norm": 2.291487216949463, "learning_rate": 1.2761355234306626e-05, "loss": 0.6968, "step": 3500 }, { "epoch": 0.8429543128874978, "grad_norm": 3.8804900646209717, "learning_rate": 1.27232665109009e-05, "loss": 0.626, "step": 3501 }, { "epoch": 0.8431950881839523, "grad_norm": 3.4894497394561768, "learning_rate": 1.268523085194575e-05, "loss": 0.6506, "step": 3502 }, { "epoch": 0.8434358634804069, "grad_norm": 4.037177085876465, "learning_rate": 1.264724828056696e-05, "loss": 0.5966, "step": 3503 }, { "epoch": 0.8436766387768615, "grad_norm": 2.5394115447998047, "learning_rate": 1.2609318819858029e-05, "loss": 0.8267, "step": 3504 }, { "epoch": 0.8439174140733161, "grad_norm": 4.015261173248291, "learning_rate": 1.2571442492880159e-05, "loss": 0.9835, "step": 3505 }, { "epoch": 0.8441581893697706, "grad_norm": 3.58262300491333, "learning_rate": 1.2533619322662216e-05, "loss": 0.4845, "step": 3506 }, { "epoch": 0.8443989646662252, "grad_norm": 1.208625316619873, "learning_rate": 1.2495849332200815e-05, "loss": 0.4519, "step": 3507 }, { "epoch": 0.8446397399626798, "grad_norm": 6.022768497467041, "learning_rate": 1.2458132544460155e-05, "loss": 0.4975, "step": 3508 }, { "epoch": 0.8448805152591344, "grad_norm": 0.27254337072372437, "learning_rate": 1.2420468982372158e-05, "loss": 0.4787, "step": 3509 }, { "epoch": 0.845121290555589, "grad_norm": 3.7382776737213135, "learning_rate": 1.2382858668836317e-05, "loss": 0.7591, "step": 3510 }, { "epoch": 0.8453620658520435, "grad_norm": 1.5053765773773193, "learning_rate": 1.2345301626719808e-05, "loss": 0.4858, "step": 3511 }, { "epoch": 0.8456028411484982, "grad_norm": 2.0983424186706543, "learning_rate": 1.2307797878857396e-05, "loss": 0.4047, "step": 3512 }, { "epoch": 0.8458436164449528, "grad_norm": 1.0963115692138672, "learning_rate": 1.2270347448051456e-05, "loss": 0.6322, "step": 3513 }, { "epoch": 0.8460843917414074, "grad_norm": 2.32830548286438, "learning_rate": 1.2232950357071937e-05, "loss": 0.339, "step": 3514 }, { "epoch": 0.8463251670378619, "grad_norm": 2.1017560958862305, "learning_rate": 1.219560662865633e-05, "loss": 0.6883, "step": 3515 }, { "epoch": 0.8465659423343165, "grad_norm": 2.3535349369049072, "learning_rate": 1.2158316285509708e-05, "loss": 0.3383, "step": 3516 }, { "epoch": 0.8468067176307711, "grad_norm": 0.24175573885440826, "learning_rate": 1.2121079350304732e-05, "loss": 0.3707, "step": 3517 }, { "epoch": 0.8470474929272257, "grad_norm": 3.2046167850494385, "learning_rate": 1.208389584568147e-05, "loss": 0.7788, "step": 3518 }, { "epoch": 0.8472882682236802, "grad_norm": 0.8707532286643982, "learning_rate": 1.2046765794247604e-05, "loss": 0.5653, "step": 3519 }, { "epoch": 0.8475290435201348, "grad_norm": 5.456550598144531, "learning_rate": 1.2009689218578313e-05, "loss": 0.5814, "step": 3520 }, { "epoch": 0.8477698188165894, "grad_norm": 3.579458236694336, "learning_rate": 1.1972666141216215e-05, "loss": 0.3246, "step": 3521 }, { "epoch": 0.848010594113044, "grad_norm": 1.3261420726776123, "learning_rate": 1.1935696584671452e-05, "loss": 0.4498, "step": 3522 }, { "epoch": 0.8482513694094986, "grad_norm": 1.701804280281067, "learning_rate": 1.1898780571421552e-05, "loss": 0.5852, "step": 3523 }, { "epoch": 0.8484921447059531, "grad_norm": 3.848027229309082, "learning_rate": 1.1861918123911564e-05, "loss": 0.8602, "step": 3524 }, { "epoch": 0.8487329200024077, "grad_norm": 2.2513511180877686, "learning_rate": 1.1825109264553947e-05, "loss": 1.1043, "step": 3525 }, { "epoch": 0.8489736952988624, "grad_norm": 3.580653429031372, "learning_rate": 1.1788354015728543e-05, "loss": 0.3012, "step": 3526 }, { "epoch": 0.849214470595317, "grad_norm": 2.1842987537384033, "learning_rate": 1.1751652399782665e-05, "loss": 0.5502, "step": 3527 }, { "epoch": 0.8494552458917715, "grad_norm": 1.2588567733764648, "learning_rate": 1.1715004439030908e-05, "loss": 0.5083, "step": 3528 }, { "epoch": 0.8496960211882261, "grad_norm": 2.6544106006622314, "learning_rate": 1.1678410155755382e-05, "loss": 0.342, "step": 3529 }, { "epoch": 0.8499367964846807, "grad_norm": 0.7514671087265015, "learning_rate": 1.1641869572205489e-05, "loss": 0.323, "step": 3530 }, { "epoch": 0.8501775717811353, "grad_norm": 2.6009557247161865, "learning_rate": 1.1605382710597957e-05, "loss": 0.4687, "step": 3531 }, { "epoch": 0.8504183470775898, "grad_norm": 1.5390700101852417, "learning_rate": 1.1568949593116884e-05, "loss": 0.4879, "step": 3532 }, { "epoch": 0.8506591223740444, "grad_norm": 4.1019110679626465, "learning_rate": 1.1532570241913721e-05, "loss": 0.9231, "step": 3533 }, { "epoch": 0.850899897670499, "grad_norm": 1.2922954559326172, "learning_rate": 1.1496244679107148e-05, "loss": 0.546, "step": 3534 }, { "epoch": 0.8511406729669536, "grad_norm": 1.7373534440994263, "learning_rate": 1.1459972926783236e-05, "loss": 0.2165, "step": 3535 }, { "epoch": 0.8513814482634082, "grad_norm": 1.6212053298950195, "learning_rate": 1.1423755006995241e-05, "loss": 0.8171, "step": 3536 }, { "epoch": 0.8516222235598627, "grad_norm": 6.341080665588379, "learning_rate": 1.1387590941763749e-05, "loss": 0.8483, "step": 3537 }, { "epoch": 0.8518629988563173, "grad_norm": 0.9660471081733704, "learning_rate": 1.135148075307666e-05, "loss": 0.284, "step": 3538 }, { "epoch": 0.8521037741527719, "grad_norm": 1.2168993949890137, "learning_rate": 1.1315424462888968e-05, "loss": 0.3476, "step": 3539 }, { "epoch": 0.8523445494492266, "grad_norm": 2.224290609359741, "learning_rate": 1.1279422093123037e-05, "loss": 0.4849, "step": 3540 }, { "epoch": 0.852585324745681, "grad_norm": 4.049657821655273, "learning_rate": 1.1243473665668336e-05, "loss": 0.4054, "step": 3541 }, { "epoch": 0.8528261000421357, "grad_norm": 1.0879812240600586, "learning_rate": 1.1207579202381625e-05, "loss": 0.2502, "step": 3542 }, { "epoch": 0.8530668753385903, "grad_norm": 1.2523934841156006, "learning_rate": 1.1171738725086833e-05, "loss": 0.3098, "step": 3543 }, { "epoch": 0.8533076506350449, "grad_norm": 0.9704805612564087, "learning_rate": 1.1135952255574999e-05, "loss": 0.5721, "step": 3544 }, { "epoch": 0.8535484259314994, "grad_norm": 3.1758830547332764, "learning_rate": 1.1100219815604418e-05, "loss": 0.4685, "step": 3545 }, { "epoch": 0.853789201227954, "grad_norm": 0.9316069483757019, "learning_rate": 1.1064541426900476e-05, "loss": 0.575, "step": 3546 }, { "epoch": 0.8540299765244086, "grad_norm": 5.6592912673950195, "learning_rate": 1.1028917111155712e-05, "loss": 1.1386, "step": 3547 }, { "epoch": 0.8542707518208632, "grad_norm": 2.0701656341552734, "learning_rate": 1.0993346890029832e-05, "loss": 1.1186, "step": 3548 }, { "epoch": 0.8545115271173178, "grad_norm": 2.067758560180664, "learning_rate": 1.0957830785149548e-05, "loss": 0.7525, "step": 3549 }, { "epoch": 0.8547523024137723, "grad_norm": 1.7323333024978638, "learning_rate": 1.0922368818108774e-05, "loss": 0.4399, "step": 3550 }, { "epoch": 0.8549930777102269, "grad_norm": 1.6739343404769897, "learning_rate": 1.0886961010468466e-05, "loss": 0.4477, "step": 3551 }, { "epoch": 0.8552338530066815, "grad_norm": 1.9262574911117554, "learning_rate": 1.0851607383756612e-05, "loss": 0.733, "step": 3552 }, { "epoch": 0.8554746283031361, "grad_norm": 1.7514442205429077, "learning_rate": 1.081630795946833e-05, "loss": 0.7005, "step": 3553 }, { "epoch": 0.8557154035995906, "grad_norm": 1.4854047298431396, "learning_rate": 1.078106275906573e-05, "loss": 0.926, "step": 3554 }, { "epoch": 0.8559561788960452, "grad_norm": 1.791135549545288, "learning_rate": 1.0745871803978002e-05, "loss": 0.4316, "step": 3555 }, { "epoch": 0.8561969541924999, "grad_norm": 0.6483386754989624, "learning_rate": 1.0710735115601311e-05, "loss": 0.4356, "step": 3556 }, { "epoch": 0.8564377294889545, "grad_norm": 6.167766094207764, "learning_rate": 1.0675652715298835e-05, "loss": 0.8807, "step": 3557 }, { "epoch": 0.856678504785409, "grad_norm": 3.2649612426757812, "learning_rate": 1.0640624624400752e-05, "loss": 0.724, "step": 3558 }, { "epoch": 0.8569192800818636, "grad_norm": 1.8932918310165405, "learning_rate": 1.0605650864204252e-05, "loss": 0.6025, "step": 3559 }, { "epoch": 0.8571600553783182, "grad_norm": 1.1560626029968262, "learning_rate": 1.0570731455973414e-05, "loss": 0.4024, "step": 3560 }, { "epoch": 0.8574008306747728, "grad_norm": 0.9457545280456543, "learning_rate": 1.0535866420939332e-05, "loss": 0.2836, "step": 3561 }, { "epoch": 0.8576416059712274, "grad_norm": 1.101394772529602, "learning_rate": 1.050105578030003e-05, "loss": 0.4336, "step": 3562 }, { "epoch": 0.8578823812676819, "grad_norm": 2.297769784927368, "learning_rate": 1.046629955522046e-05, "loss": 1.2278, "step": 3563 }, { "epoch": 0.8581231565641365, "grad_norm": 6.07118034362793, "learning_rate": 1.0431597766832502e-05, "loss": 0.6956, "step": 3564 }, { "epoch": 0.8583639318605911, "grad_norm": 0.6239258646965027, "learning_rate": 1.0396950436234887e-05, "loss": 0.051, "step": 3565 }, { "epoch": 0.8586047071570457, "grad_norm": 3.072779417037964, "learning_rate": 1.0362357584493298e-05, "loss": 0.9781, "step": 3566 }, { "epoch": 0.8588454824535002, "grad_norm": 1.9962631464004517, "learning_rate": 1.0327819232640235e-05, "loss": 0.5025, "step": 3567 }, { "epoch": 0.8590862577499548, "grad_norm": 1.4594467878341675, "learning_rate": 1.029333540167512e-05, "loss": 0.4257, "step": 3568 }, { "epoch": 0.8593270330464094, "grad_norm": 3.9603610038757324, "learning_rate": 1.0258906112564181e-05, "loss": 0.3696, "step": 3569 }, { "epoch": 0.8595678083428641, "grad_norm": 3.076791286468506, "learning_rate": 1.0224531386240522e-05, "loss": 0.7587, "step": 3570 }, { "epoch": 0.8598085836393186, "grad_norm": 3.732264995574951, "learning_rate": 1.0190211243604043e-05, "loss": 0.5357, "step": 3571 }, { "epoch": 0.8600493589357732, "grad_norm": 1.9595264196395874, "learning_rate": 1.0155945705521486e-05, "loss": 0.4164, "step": 3572 }, { "epoch": 0.8602901342322278, "grad_norm": 1.8622673749923706, "learning_rate": 1.0121734792826353e-05, "loss": 0.5882, "step": 3573 }, { "epoch": 0.8605309095286824, "grad_norm": 2.3604211807250977, "learning_rate": 1.0087578526318975e-05, "loss": 0.7776, "step": 3574 }, { "epoch": 0.860771684825137, "grad_norm": 2.476921796798706, "learning_rate": 1.0053476926766414e-05, "loss": 0.5129, "step": 3575 }, { "epoch": 0.8610124601215915, "grad_norm": 0.9629519581794739, "learning_rate": 1.0019430014902531e-05, "loss": 0.6324, "step": 3576 }, { "epoch": 0.8612532354180461, "grad_norm": 1.2336691617965698, "learning_rate": 9.985437811427933e-06, "loss": 0.236, "step": 3577 }, { "epoch": 0.8614940107145007, "grad_norm": 1.3842549324035645, "learning_rate": 9.951500337009945e-06, "loss": 0.8148, "step": 3578 }, { "epoch": 0.8617347860109553, "grad_norm": 1.1485180854797363, "learning_rate": 9.917617612282648e-06, "loss": 0.4162, "step": 3579 }, { "epoch": 0.8619755613074098, "grad_norm": 1.5087698698043823, "learning_rate": 9.883789657846799e-06, "loss": 1.0833, "step": 3580 }, { "epoch": 0.8622163366038644, "grad_norm": 0.747292697429657, "learning_rate": 9.850016494269853e-06, "loss": 0.4187, "step": 3581 }, { "epoch": 0.862457111900319, "grad_norm": 0.6730207204818726, "learning_rate": 9.816298142086022e-06, "loss": 0.5837, "step": 3582 }, { "epoch": 0.8626978871967736, "grad_norm": 3.047215461730957, "learning_rate": 9.782634621796083e-06, "loss": 0.3071, "step": 3583 }, { "epoch": 0.8629386624932281, "grad_norm": 0.9594640135765076, "learning_rate": 9.749025953867552e-06, "loss": 0.2057, "step": 3584 }, { "epoch": 0.8631794377896828, "grad_norm": 1.894709587097168, "learning_rate": 9.715472158734585e-06, "loss": 0.4201, "step": 3585 }, { "epoch": 0.8634202130861374, "grad_norm": 1.63250732421875, "learning_rate": 9.68197325679795e-06, "loss": 0.7254, "step": 3586 }, { "epoch": 0.863660988382592, "grad_norm": 8.452190399169922, "learning_rate": 9.648529268425088e-06, "loss": 0.2811, "step": 3587 }, { "epoch": 0.8639017636790465, "grad_norm": 0.6347200870513916, "learning_rate": 9.61514021394998e-06, "loss": 0.355, "step": 3588 }, { "epoch": 0.8641425389755011, "grad_norm": 1.1674455404281616, "learning_rate": 9.581806113673253e-06, "loss": 0.5386, "step": 3589 }, { "epoch": 0.8643833142719557, "grad_norm": 1.5471045970916748, "learning_rate": 9.548526987862149e-06, "loss": 0.6897, "step": 3590 }, { "epoch": 0.8646240895684103, "grad_norm": 1.1853959560394287, "learning_rate": 9.515302856750408e-06, "loss": 0.7887, "step": 3591 }, { "epoch": 0.8648648648648649, "grad_norm": 0.8306871056556702, "learning_rate": 9.48213374053839e-06, "loss": 0.4411, "step": 3592 }, { "epoch": 0.8651056401613194, "grad_norm": 1.9106206893920898, "learning_rate": 9.449019659392999e-06, "loss": 0.4598, "step": 3593 }, { "epoch": 0.865346415457774, "grad_norm": 2.1204633712768555, "learning_rate": 9.415960633447674e-06, "loss": 0.6254, "step": 3594 }, { "epoch": 0.8655871907542286, "grad_norm": 1.09833562374115, "learning_rate": 9.382956682802379e-06, "loss": 0.4423, "step": 3595 }, { "epoch": 0.8658279660506832, "grad_norm": 1.1681790351867676, "learning_rate": 9.350007827523577e-06, "loss": 0.9621, "step": 3596 }, { "epoch": 0.8660687413471377, "grad_norm": 2.5067391395568848, "learning_rate": 9.317114087644252e-06, "loss": 0.5401, "step": 3597 }, { "epoch": 0.8663095166435923, "grad_norm": 2.4286880493164062, "learning_rate": 9.284275483163885e-06, "loss": 0.4498, "step": 3598 }, { "epoch": 0.866550291940047, "grad_norm": 3.0217394828796387, "learning_rate": 9.251492034048393e-06, "loss": 0.769, "step": 3599 }, { "epoch": 0.8667910672365016, "grad_norm": 0.8035622239112854, "learning_rate": 9.21876376023022e-06, "loss": 0.1272, "step": 3600 }, { "epoch": 0.8670318425329561, "grad_norm": 0.6220622658729553, "learning_rate": 9.186090681608173e-06, "loss": 0.2411, "step": 3601 }, { "epoch": 0.8672726178294107, "grad_norm": 2.2393176555633545, "learning_rate": 9.153472818047625e-06, "loss": 0.3123, "step": 3602 }, { "epoch": 0.8675133931258653, "grad_norm": 5.026528835296631, "learning_rate": 9.120910189380294e-06, "loss": 1.1053, "step": 3603 }, { "epoch": 0.8677541684223199, "grad_norm": 4.139969825744629, "learning_rate": 9.088402815404306e-06, "loss": 0.6011, "step": 3604 }, { "epoch": 0.8679949437187745, "grad_norm": 2.963592290878296, "learning_rate": 9.055950715884254e-06, "loss": 0.629, "step": 3605 }, { "epoch": 0.868235719015229, "grad_norm": 4.244201183319092, "learning_rate": 9.023553910551041e-06, "loss": 0.2936, "step": 3606 }, { "epoch": 0.8684764943116836, "grad_norm": 1.2388718128204346, "learning_rate": 8.991212419102025e-06, "loss": 0.6559, "step": 3607 }, { "epoch": 0.8687172696081382, "grad_norm": 1.3638206720352173, "learning_rate": 8.958926261200928e-06, "loss": 0.6455, "step": 3608 }, { "epoch": 0.8689580449045928, "grad_norm": 1.8364553451538086, "learning_rate": 8.926695456477751e-06, "loss": 0.325, "step": 3609 }, { "epoch": 0.8691988202010473, "grad_norm": 2.5603854656219482, "learning_rate": 8.894520024528918e-06, "loss": 0.4407, "step": 3610 }, { "epoch": 0.8694395954975019, "grad_norm": 1.9255558252334595, "learning_rate": 8.862399984917213e-06, "loss": 0.8542, "step": 3611 }, { "epoch": 0.8696803707939565, "grad_norm": 1.358974575996399, "learning_rate": 8.830335357171627e-06, "loss": 0.9968, "step": 3612 }, { "epoch": 0.8699211460904112, "grad_norm": 1.3834187984466553, "learning_rate": 8.798326160787573e-06, "loss": 0.9395, "step": 3613 }, { "epoch": 0.8701619213868657, "grad_norm": 2.3378701210021973, "learning_rate": 8.766372415226675e-06, "loss": 0.2571, "step": 3614 }, { "epoch": 0.8704026966833203, "grad_norm": 1.1119276285171509, "learning_rate": 8.734474139916903e-06, "loss": 0.4873, "step": 3615 }, { "epoch": 0.8706434719797749, "grad_norm": 0.519648551940918, "learning_rate": 8.702631354252489e-06, "loss": 0.4369, "step": 3616 }, { "epoch": 0.8708842472762295, "grad_norm": 6.27766227722168, "learning_rate": 8.670844077593899e-06, "loss": 0.6788, "step": 3617 }, { "epoch": 0.8711250225726841, "grad_norm": 1.280344009399414, "learning_rate": 8.639112329267862e-06, "loss": 0.6255, "step": 3618 }, { "epoch": 0.8713657978691386, "grad_norm": 4.4702067375183105, "learning_rate": 8.60743612856738e-06, "loss": 0.4173, "step": 3619 }, { "epoch": 0.8716065731655932, "grad_norm": 0.5965597033500671, "learning_rate": 8.575815494751637e-06, "loss": 0.1958, "step": 3620 }, { "epoch": 0.8718473484620478, "grad_norm": 1.7509022951126099, "learning_rate": 8.544250447046075e-06, "loss": 0.1665, "step": 3621 }, { "epoch": 0.8720881237585024, "grad_norm": 1.8660122156143188, "learning_rate": 8.512741004642277e-06, "loss": 0.3934, "step": 3622 }, { "epoch": 0.8723288990549569, "grad_norm": 0.804557204246521, "learning_rate": 8.481287186698061e-06, "loss": 0.383, "step": 3623 }, { "epoch": 0.8725696743514115, "grad_norm": 1.3217666149139404, "learning_rate": 8.449889012337453e-06, "loss": 0.1176, "step": 3624 }, { "epoch": 0.8728104496478661, "grad_norm": 3.857081651687622, "learning_rate": 8.418546500650582e-06, "loss": 0.5942, "step": 3625 }, { "epoch": 0.8730512249443207, "grad_norm": 1.6486952304840088, "learning_rate": 8.387259670693759e-06, "loss": 0.525, "step": 3626 }, { "epoch": 0.8732920002407752, "grad_norm": 0.7626795768737793, "learning_rate": 8.356028541489468e-06, "loss": 0.2785, "step": 3627 }, { "epoch": 0.8735327755372299, "grad_norm": 2.2588295936584473, "learning_rate": 8.3248531320263e-06, "loss": 0.3082, "step": 3628 }, { "epoch": 0.8737735508336845, "grad_norm": 6.79339075088501, "learning_rate": 8.293733461259002e-06, "loss": 0.9379, "step": 3629 }, { "epoch": 0.8740143261301391, "grad_norm": 1.2690285444259644, "learning_rate": 8.262669548108349e-06, "loss": 0.9261, "step": 3630 }, { "epoch": 0.8742551014265937, "grad_norm": 1.7360615730285645, "learning_rate": 8.23166141146131e-06, "loss": 0.3663, "step": 3631 }, { "epoch": 0.8744958767230482, "grad_norm": 1.55054771900177, "learning_rate": 8.200709070170876e-06, "loss": 0.4774, "step": 3632 }, { "epoch": 0.8747366520195028, "grad_norm": 7.235383987426758, "learning_rate": 8.169812543056155e-06, "loss": 0.4964, "step": 3633 }, { "epoch": 0.8749774273159574, "grad_norm": 1.6751036643981934, "learning_rate": 8.13897184890231e-06, "loss": 1.2376, "step": 3634 }, { "epoch": 0.875218202612412, "grad_norm": 2.576141119003296, "learning_rate": 8.108187006460533e-06, "loss": 0.543, "step": 3635 }, { "epoch": 0.8754589779088665, "grad_norm": 1.2057117223739624, "learning_rate": 8.077458034448105e-06, "loss": 0.2369, "step": 3636 }, { "epoch": 0.8756997532053211, "grad_norm": 1.8533183336257935, "learning_rate": 8.046784951548302e-06, "loss": 0.6765, "step": 3637 }, { "epoch": 0.8759405285017757, "grad_norm": 1.2572599649429321, "learning_rate": 8.01616777641041e-06, "loss": 0.5824, "step": 3638 }, { "epoch": 0.8761813037982303, "grad_norm": 1.9591710567474365, "learning_rate": 7.985606527649769e-06, "loss": 0.9164, "step": 3639 }, { "epoch": 0.8764220790946848, "grad_norm": 0.9350030422210693, "learning_rate": 7.955101223847649e-06, "loss": 0.1639, "step": 3640 }, { "epoch": 0.8766628543911394, "grad_norm": 1.2530356645584106, "learning_rate": 7.92465188355137e-06, "loss": 0.6552, "step": 3641 }, { "epoch": 0.876903629687594, "grad_norm": 2.239734649658203, "learning_rate": 7.894258525274189e-06, "loss": 0.9524, "step": 3642 }, { "epoch": 0.8771444049840487, "grad_norm": 1.1480050086975098, "learning_rate": 7.863921167495348e-06, "loss": 0.5416, "step": 3643 }, { "epoch": 0.8773851802805033, "grad_norm": 1.4632046222686768, "learning_rate": 7.833639828660033e-06, "loss": 0.756, "step": 3644 }, { "epoch": 0.8776259555769578, "grad_norm": 2.0627739429473877, "learning_rate": 7.803414527179343e-06, "loss": 0.2886, "step": 3645 }, { "epoch": 0.8778667308734124, "grad_norm": 1.060166835784912, "learning_rate": 7.77324528143033e-06, "loss": 0.4151, "step": 3646 }, { "epoch": 0.878107506169867, "grad_norm": 0.9938207864761353, "learning_rate": 7.743132109756001e-06, "loss": 0.4678, "step": 3647 }, { "epoch": 0.8783482814663216, "grad_norm": 3.3694393634796143, "learning_rate": 7.713075030465199e-06, "loss": 0.7547, "step": 3648 }, { "epoch": 0.8785890567627761, "grad_norm": 5.267509460449219, "learning_rate": 7.683074061832685e-06, "loss": 0.8142, "step": 3649 }, { "epoch": 0.8788298320592307, "grad_norm": 1.6653624773025513, "learning_rate": 7.653129222099143e-06, "loss": 0.5309, "step": 3650 }, { "epoch": 0.8790706073556853, "grad_norm": 6.179348468780518, "learning_rate": 7.623240529471099e-06, "loss": 0.5801, "step": 3651 }, { "epoch": 0.8793113826521399, "grad_norm": 1.773995041847229, "learning_rate": 7.5934080021209496e-06, "loss": 0.2403, "step": 3652 }, { "epoch": 0.8795521579485944, "grad_norm": 1.451350450515747, "learning_rate": 7.563631658186921e-06, "loss": 0.4058, "step": 3653 }, { "epoch": 0.879792933245049, "grad_norm": 1.8011319637298584, "learning_rate": 7.533911515773096e-06, "loss": 0.6627, "step": 3654 }, { "epoch": 0.8800337085415036, "grad_norm": 1.6576850414276123, "learning_rate": 7.5042475929494205e-06, "loss": 0.4088, "step": 3655 }, { "epoch": 0.8802744838379583, "grad_norm": 3.480985403060913, "learning_rate": 7.4746399077515905e-06, "loss": 0.6784, "step": 3656 }, { "epoch": 0.8805152591344129, "grad_norm": 6.059078693389893, "learning_rate": 7.445088478181151e-06, "loss": 0.6168, "step": 3657 }, { "epoch": 0.8807560344308674, "grad_norm": 2.815342903137207, "learning_rate": 7.4155933222054494e-06, "loss": 0.6268, "step": 3658 }, { "epoch": 0.880996809727322, "grad_norm": 1.5780977010726929, "learning_rate": 7.386154457757599e-06, "loss": 0.5302, "step": 3659 }, { "epoch": 0.8812375850237766, "grad_norm": 1.6322784423828125, "learning_rate": 7.356771902736514e-06, "loss": 0.8104, "step": 3660 }, { "epoch": 0.8814783603202312, "grad_norm": 2.2666542530059814, "learning_rate": 7.327445675006839e-06, "loss": 0.4232, "step": 3661 }, { "epoch": 0.8817191356166857, "grad_norm": 2.0304696559906006, "learning_rate": 7.2981757923989755e-06, "loss": 0.7694, "step": 3662 }, { "epoch": 0.8819599109131403, "grad_norm": 4.598212242126465, "learning_rate": 7.268962272709101e-06, "loss": 0.8297, "step": 3663 }, { "epoch": 0.8822006862095949, "grad_norm": 0.6841728687286377, "learning_rate": 7.239805133699085e-06, "loss": 0.4114, "step": 3664 }, { "epoch": 0.8824414615060495, "grad_norm": 0.9421213269233704, "learning_rate": 7.210704393096534e-06, "loss": 0.24, "step": 3665 }, { "epoch": 0.882682236802504, "grad_norm": 3.708446979522705, "learning_rate": 7.181660068594764e-06, "loss": 0.8693, "step": 3666 }, { "epoch": 0.8829230120989586, "grad_norm": 0.668267548084259, "learning_rate": 7.152672177852804e-06, "loss": 0.2204, "step": 3667 }, { "epoch": 0.8831637873954132, "grad_norm": 1.9290603399276733, "learning_rate": 7.1237407384953655e-06, "loss": 0.3814, "step": 3668 }, { "epoch": 0.8834045626918678, "grad_norm": 1.4112284183502197, "learning_rate": 7.09486576811278e-06, "loss": 0.5461, "step": 3669 }, { "epoch": 0.8836453379883223, "grad_norm": 2.2663886547088623, "learning_rate": 7.066047284261157e-06, "loss": 0.4108, "step": 3670 }, { "epoch": 0.883886113284777, "grad_norm": 1.4161934852600098, "learning_rate": 7.037285304462138e-06, "loss": 0.4483, "step": 3671 }, { "epoch": 0.8841268885812316, "grad_norm": 2.718825101852417, "learning_rate": 7.008579846203112e-06, "loss": 0.454, "step": 3672 }, { "epoch": 0.8843676638776862, "grad_norm": 1.7748380899429321, "learning_rate": 6.979930926937062e-06, "loss": 0.343, "step": 3673 }, { "epoch": 0.8846084391741408, "grad_norm": 3.08974552154541, "learning_rate": 6.951338564082555e-06, "loss": 0.9658, "step": 3674 }, { "epoch": 0.8848492144705953, "grad_norm": 1.58262300491333, "learning_rate": 6.922802775023862e-06, "loss": 0.7142, "step": 3675 }, { "epoch": 0.8850899897670499, "grad_norm": 2.7481048107147217, "learning_rate": 6.894323577110795e-06, "loss": 0.4091, "step": 3676 }, { "epoch": 0.8853307650635045, "grad_norm": 1.3223680257797241, "learning_rate": 6.865900987658758e-06, "loss": 0.6664, "step": 3677 }, { "epoch": 0.8855715403599591, "grad_norm": 2.7666354179382324, "learning_rate": 6.83753502394876e-06, "loss": 0.7086, "step": 3678 }, { "epoch": 0.8858123156564136, "grad_norm": 1.754913091659546, "learning_rate": 6.809225703227351e-06, "loss": 0.8025, "step": 3679 }, { "epoch": 0.8860530909528682, "grad_norm": 1.091379165649414, "learning_rate": 6.780973042706673e-06, "loss": 0.1816, "step": 3680 }, { "epoch": 0.8862938662493228, "grad_norm": 0.8506015539169312, "learning_rate": 6.75277705956443e-06, "loss": 0.3386, "step": 3681 }, { "epoch": 0.8865346415457774, "grad_norm": 2.4108307361602783, "learning_rate": 6.724637770943798e-06, "loss": 0.6603, "step": 3682 }, { "epoch": 0.8867754168422319, "grad_norm": 7.575955390930176, "learning_rate": 6.6965551939535795e-06, "loss": 0.7667, "step": 3683 }, { "epoch": 0.8870161921386865, "grad_norm": 1.8191972970962524, "learning_rate": 6.668529345667995e-06, "loss": 0.5581, "step": 3684 }, { "epoch": 0.8872569674351412, "grad_norm": 1.4198626279830933, "learning_rate": 6.640560243126859e-06, "loss": 0.3262, "step": 3685 }, { "epoch": 0.8874977427315958, "grad_norm": 1.5749354362487793, "learning_rate": 6.612647903335445e-06, "loss": 0.5869, "step": 3686 }, { "epoch": 0.8877385180280504, "grad_norm": 5.316013813018799, "learning_rate": 6.58479234326449e-06, "loss": 0.3689, "step": 3687 }, { "epoch": 0.8879792933245049, "grad_norm": 1.4162142276763916, "learning_rate": 6.556993579850268e-06, "loss": 0.4004, "step": 3688 }, { "epoch": 0.8882200686209595, "grad_norm": 2.608461618423462, "learning_rate": 6.529251629994482e-06, "loss": 0.4771, "step": 3689 }, { "epoch": 0.8884608439174141, "grad_norm": 4.454953670501709, "learning_rate": 6.501566510564295e-06, "loss": 1.0927, "step": 3690 }, { "epoch": 0.8887016192138687, "grad_norm": 5.660929203033447, "learning_rate": 6.4739382383923185e-06, "loss": 0.5887, "step": 3691 }, { "epoch": 0.8889423945103232, "grad_norm": 5.009692668914795, "learning_rate": 6.446366830276607e-06, "loss": 0.8515, "step": 3692 }, { "epoch": 0.8891831698067778, "grad_norm": 1.05977201461792, "learning_rate": 6.4188523029806495e-06, "loss": 0.5923, "step": 3693 }, { "epoch": 0.8894239451032324, "grad_norm": 2.408989906311035, "learning_rate": 6.3913946732333414e-06, "loss": 0.5728, "step": 3694 }, { "epoch": 0.889664720399687, "grad_norm": 1.7964873313903809, "learning_rate": 6.363993957728953e-06, "loss": 0.616, "step": 3695 }, { "epoch": 0.8899054956961415, "grad_norm": 1.842602252960205, "learning_rate": 6.336650173127223e-06, "loss": 0.5101, "step": 3696 }, { "epoch": 0.8901462709925961, "grad_norm": 1.9197190999984741, "learning_rate": 6.309363336053209e-06, "loss": 0.6051, "step": 3697 }, { "epoch": 0.8903870462890507, "grad_norm": 14.988290786743164, "learning_rate": 6.282133463097362e-06, "loss": 0.4667, "step": 3698 }, { "epoch": 0.8906278215855054, "grad_norm": 2.018468141555786, "learning_rate": 6.254960570815527e-06, "loss": 0.3706, "step": 3699 }, { "epoch": 0.89086859688196, "grad_norm": 1.9267723560333252, "learning_rate": 6.227844675728867e-06, "loss": 1.4215, "step": 3700 }, { "epoch": 0.8911093721784145, "grad_norm": 4.696993350982666, "learning_rate": 6.2007857943239155e-06, "loss": 0.7318, "step": 3701 }, { "epoch": 0.8913501474748691, "grad_norm": 2.4501404762268066, "learning_rate": 6.1737839430525575e-06, "loss": 0.7474, "step": 3702 }, { "epoch": 0.8915909227713237, "grad_norm": 3.1551716327667236, "learning_rate": 6.146839138331928e-06, "loss": 0.4585, "step": 3703 }, { "epoch": 0.8918316980677783, "grad_norm": 8.161409378051758, "learning_rate": 6.119951396544576e-06, "loss": 0.3907, "step": 3704 }, { "epoch": 0.8920724733642328, "grad_norm": 5.239265441894531, "learning_rate": 6.093120734038283e-06, "loss": 0.8809, "step": 3705 }, { "epoch": 0.8923132486606874, "grad_norm": 2.640469789505005, "learning_rate": 6.0663471671261515e-06, "loss": 0.5767, "step": 3706 }, { "epoch": 0.892554023957142, "grad_norm": 2.713616371154785, "learning_rate": 6.0396307120865746e-06, "loss": 1.0203, "step": 3707 }, { "epoch": 0.8927947992535966, "grad_norm": 1.149683952331543, "learning_rate": 6.012971385163224e-06, "loss": 0.481, "step": 3708 }, { "epoch": 0.8930355745500511, "grad_norm": 0.7889773845672607, "learning_rate": 5.986369202565034e-06, "loss": 0.4251, "step": 3709 }, { "epoch": 0.8932763498465057, "grad_norm": 0.5434550046920776, "learning_rate": 5.959824180466178e-06, "loss": 0.1314, "step": 3710 }, { "epoch": 0.8935171251429603, "grad_norm": 4.118932723999023, "learning_rate": 5.93333633500609e-06, "loss": 0.3042, "step": 3711 }, { "epoch": 0.8937579004394149, "grad_norm": 1.7867053747177124, "learning_rate": 5.906905682289465e-06, "loss": 0.6089, "step": 3712 }, { "epoch": 0.8939986757358696, "grad_norm": 3.315713405609131, "learning_rate": 5.880532238386161e-06, "loss": 0.8192, "step": 3713 }, { "epoch": 0.894239451032324, "grad_norm": 2.045057535171509, "learning_rate": 5.854216019331305e-06, "loss": 0.6398, "step": 3714 }, { "epoch": 0.8944802263287787, "grad_norm": 1.3669121265411377, "learning_rate": 5.8279570411252316e-06, "loss": 0.2998, "step": 3715 }, { "epoch": 0.8947210016252333, "grad_norm": 1.606748104095459, "learning_rate": 5.801755319733438e-06, "loss": 0.4933, "step": 3716 }, { "epoch": 0.8949617769216879, "grad_norm": 1.4545626640319824, "learning_rate": 5.775610871086667e-06, "loss": 0.7581, "step": 3717 }, { "epoch": 0.8952025522181424, "grad_norm": 2.5948798656463623, "learning_rate": 5.749523711080762e-06, "loss": 0.7313, "step": 3718 }, { "epoch": 0.895443327514597, "grad_norm": 3.4522511959075928, "learning_rate": 5.723493855576778e-06, "loss": 0.4838, "step": 3719 }, { "epoch": 0.8956841028110516, "grad_norm": 1.7841429710388184, "learning_rate": 5.697521320400967e-06, "loss": 0.5223, "step": 3720 }, { "epoch": 0.8959248781075062, "grad_norm": 2.9225831031799316, "learning_rate": 5.67160612134463e-06, "loss": 0.8148, "step": 3721 }, { "epoch": 0.8961656534039607, "grad_norm": 1.33102285861969, "learning_rate": 5.645748274164309e-06, "loss": 0.1499, "step": 3722 }, { "epoch": 0.8964064287004153, "grad_norm": 0.6809419989585876, "learning_rate": 5.619947794581615e-06, "loss": 0.1958, "step": 3723 }, { "epoch": 0.8966472039968699, "grad_norm": 2.1398706436157227, "learning_rate": 5.594204698283301e-06, "loss": 0.4343, "step": 3724 }, { "epoch": 0.8968879792933245, "grad_norm": 2.5033817291259766, "learning_rate": 5.568519000921235e-06, "loss": 0.7202, "step": 3725 }, { "epoch": 0.8971287545897791, "grad_norm": 1.002875566482544, "learning_rate": 5.54289071811237e-06, "loss": 0.479, "step": 3726 }, { "epoch": 0.8973695298862336, "grad_norm": 2.744338035583496, "learning_rate": 5.517319865438764e-06, "loss": 0.923, "step": 3727 }, { "epoch": 0.8976103051826883, "grad_norm": 2.405243396759033, "learning_rate": 5.491806458447557e-06, "loss": 1.0923, "step": 3728 }, { "epoch": 0.8978510804791429, "grad_norm": 2.02970814704895, "learning_rate": 5.466350512650953e-06, "loss": 0.9937, "step": 3729 }, { "epoch": 0.8980918557755975, "grad_norm": 1.6559299230575562, "learning_rate": 5.440952043526215e-06, "loss": 0.5801, "step": 3730 }, { "epoch": 0.898332631072052, "grad_norm": 4.650358200073242, "learning_rate": 5.41561106651568e-06, "loss": 0.812, "step": 3731 }, { "epoch": 0.8985734063685066, "grad_norm": 2.2116572856903076, "learning_rate": 5.390327597026712e-06, "loss": 0.5227, "step": 3732 }, { "epoch": 0.8988141816649612, "grad_norm": 2.8776440620422363, "learning_rate": 5.3651016504317475e-06, "loss": 1.0063, "step": 3733 }, { "epoch": 0.8990549569614158, "grad_norm": 2.672783136367798, "learning_rate": 5.339933242068174e-06, "loss": 0.5567, "step": 3734 }, { "epoch": 0.8992957322578703, "grad_norm": 2.6852715015411377, "learning_rate": 5.3148223872384715e-06, "loss": 0.4038, "step": 3735 }, { "epoch": 0.8995365075543249, "grad_norm": 4.501379013061523, "learning_rate": 5.289769101210074e-06, "loss": 0.6712, "step": 3736 }, { "epoch": 0.8997772828507795, "grad_norm": 1.5511711835861206, "learning_rate": 5.26477339921545e-06, "loss": 0.2077, "step": 3737 }, { "epoch": 0.9000180581472341, "grad_norm": 1.7630692720413208, "learning_rate": 5.239835296452045e-06, "loss": 0.4195, "step": 3738 }, { "epoch": 0.9002588334436887, "grad_norm": 1.3953133821487427, "learning_rate": 5.214954808082273e-06, "loss": 0.6487, "step": 3739 }, { "epoch": 0.9004996087401432, "grad_norm": 1.0030934810638428, "learning_rate": 5.190131949233523e-06, "loss": 0.6278, "step": 3740 }, { "epoch": 0.9007403840365978, "grad_norm": 0.8308902978897095, "learning_rate": 5.165366734998178e-06, "loss": 0.301, "step": 3741 }, { "epoch": 0.9009811593330525, "grad_norm": 2.4039227962493896, "learning_rate": 5.140659180433516e-06, "loss": 0.8232, "step": 3742 }, { "epoch": 0.9012219346295071, "grad_norm": 1.496519923210144, "learning_rate": 5.116009300561797e-06, "loss": 0.5532, "step": 3743 }, { "epoch": 0.9014627099259616, "grad_norm": 2.6047515869140625, "learning_rate": 5.0914171103701895e-06, "loss": 0.6708, "step": 3744 }, { "epoch": 0.9017034852224162, "grad_norm": 4.155404090881348, "learning_rate": 5.066882624810809e-06, "loss": 0.4486, "step": 3745 }, { "epoch": 0.9019442605188708, "grad_norm": 5.623289108276367, "learning_rate": 5.042405858800692e-06, "loss": 0.6778, "step": 3746 }, { "epoch": 0.9021850358153254, "grad_norm": 2.981544256210327, "learning_rate": 5.017986827221733e-06, "loss": 0.6458, "step": 3747 }, { "epoch": 0.9024258111117799, "grad_norm": 0.939237117767334, "learning_rate": 4.993625544920799e-06, "loss": 0.1, "step": 3748 }, { "epoch": 0.9026665864082345, "grad_norm": 1.9405542612075806, "learning_rate": 4.969322026709577e-06, "loss": 0.5831, "step": 3749 }, { "epoch": 0.9029073617046891, "grad_norm": 0.6850067973136902, "learning_rate": 4.945076287364669e-06, "loss": 0.3371, "step": 3750 }, { "epoch": 0.9031481370011437, "grad_norm": 2.4914636611938477, "learning_rate": 4.9208883416275495e-06, "loss": 0.726, "step": 3751 }, { "epoch": 0.9033889122975982, "grad_norm": 2.531623601913452, "learning_rate": 4.896758204204532e-06, "loss": 0.4892, "step": 3752 }, { "epoch": 0.9036296875940528, "grad_norm": 0.5930827856063843, "learning_rate": 4.8726858897667816e-06, "loss": 0.2482, "step": 3753 }, { "epoch": 0.9038704628905074, "grad_norm": 1.0176321268081665, "learning_rate": 4.8486714129503565e-06, "loss": 0.4158, "step": 3754 }, { "epoch": 0.904111238186962, "grad_norm": 2.1726293563842773, "learning_rate": 4.824714788356066e-06, "loss": 0.4867, "step": 3755 }, { "epoch": 0.9043520134834167, "grad_norm": 0.8435872793197632, "learning_rate": 4.800816030549638e-06, "loss": 0.5242, "step": 3756 }, { "epoch": 0.9045927887798711, "grad_norm": 3.269883394241333, "learning_rate": 4.776975154061536e-06, "loss": 1.0293, "step": 3757 }, { "epoch": 0.9048335640763258, "grad_norm": 1.6402075290679932, "learning_rate": 4.753192173387089e-06, "loss": 0.5345, "step": 3758 }, { "epoch": 0.9050743393727804, "grad_norm": 1.5480372905731201, "learning_rate": 4.729467102986396e-06, "loss": 0.3328, "step": 3759 }, { "epoch": 0.905315114669235, "grad_norm": 3.9539589881896973, "learning_rate": 4.705799957284351e-06, "loss": 0.5114, "step": 3760 }, { "epoch": 0.9055558899656895, "grad_norm": 1.5085039138793945, "learning_rate": 4.6821907506706345e-06, "loss": 0.928, "step": 3761 }, { "epoch": 0.9057966652621441, "grad_norm": 1.067598819732666, "learning_rate": 4.6586394974996836e-06, "loss": 0.9092, "step": 3762 }, { "epoch": 0.9060374405585987, "grad_norm": 2.00384783744812, "learning_rate": 4.635146212090735e-06, "loss": 0.5831, "step": 3763 }, { "epoch": 0.9062782158550533, "grad_norm": 3.4452266693115234, "learning_rate": 4.61171090872774e-06, "loss": 0.2842, "step": 3764 }, { "epoch": 0.9065189911515078, "grad_norm": 2.7296142578125, "learning_rate": 4.588333601659423e-06, "loss": 0.5461, "step": 3765 }, { "epoch": 0.9067597664479624, "grad_norm": 1.4393811225891113, "learning_rate": 4.565014305099247e-06, "loss": 0.329, "step": 3766 }, { "epoch": 0.907000541744417, "grad_norm": 1.5528326034545898, "learning_rate": 4.541753033225393e-06, "loss": 0.9322, "step": 3767 }, { "epoch": 0.9072413170408716, "grad_norm": 1.016589879989624, "learning_rate": 4.5185498001807605e-06, "loss": 0.4822, "step": 3768 }, { "epoch": 0.9074820923373262, "grad_norm": 1.8541332483291626, "learning_rate": 4.495404620072985e-06, "loss": 0.3784, "step": 3769 }, { "epoch": 0.9077228676337807, "grad_norm": 2.7582716941833496, "learning_rate": 4.472317506974366e-06, "loss": 0.2414, "step": 3770 }, { "epoch": 0.9079636429302353, "grad_norm": 1.1082451343536377, "learning_rate": 4.44928847492192e-06, "loss": 0.6802, "step": 3771 }, { "epoch": 0.90820441822669, "grad_norm": 1.2040634155273438, "learning_rate": 4.426317537917368e-06, "loss": 0.333, "step": 3772 }, { "epoch": 0.9084451935231446, "grad_norm": 2.672924280166626, "learning_rate": 4.403404709927084e-06, "loss": 0.5014, "step": 3773 }, { "epoch": 0.9086859688195991, "grad_norm": 3.4659922122955322, "learning_rate": 4.3805500048821225e-06, "loss": 0.2261, "step": 3774 }, { "epoch": 0.9089267441160537, "grad_norm": 2.050246477127075, "learning_rate": 4.35775343667818e-06, "loss": 0.7365, "step": 3775 }, { "epoch": 0.9091675194125083, "grad_norm": 0.8071643710136414, "learning_rate": 4.335015019175637e-06, "loss": 0.7056, "step": 3776 }, { "epoch": 0.9094082947089629, "grad_norm": 2.027353525161743, "learning_rate": 4.3123347661995105e-06, "loss": 0.2542, "step": 3777 }, { "epoch": 0.9096490700054174, "grad_norm": 2.125551700592041, "learning_rate": 4.289712691539416e-06, "loss": 0.1728, "step": 3778 }, { "epoch": 0.909889845301872, "grad_norm": 1.8031567335128784, "learning_rate": 4.267148808949639e-06, "loss": 0.4307, "step": 3779 }, { "epoch": 0.9101306205983266, "grad_norm": 0.9503381252288818, "learning_rate": 4.244643132149084e-06, "loss": 0.4191, "step": 3780 }, { "epoch": 0.9103713958947812, "grad_norm": 1.0291205644607544, "learning_rate": 4.2221956748212384e-06, "loss": 0.7377, "step": 3781 }, { "epoch": 0.9106121711912358, "grad_norm": 0.7489404082298279, "learning_rate": 4.19980645061423e-06, "loss": 0.5554, "step": 3782 }, { "epoch": 0.9108529464876903, "grad_norm": 1.0450713634490967, "learning_rate": 4.177475473140724e-06, "loss": 0.3164, "step": 3783 }, { "epoch": 0.9110937217841449, "grad_norm": 1.918267011642456, "learning_rate": 4.155202755978027e-06, "loss": 0.759, "step": 3784 }, { "epoch": 0.9113344970805995, "grad_norm": 1.4207653999328613, "learning_rate": 4.132988312667996e-06, "loss": 0.6942, "step": 3785 }, { "epoch": 0.9115752723770542, "grad_norm": 1.9554914236068726, "learning_rate": 4.110832156717059e-06, "loss": 0.6352, "step": 3786 }, { "epoch": 0.9118160476735087, "grad_norm": 2.0343658924102783, "learning_rate": 4.088734301596209e-06, "loss": 0.7284, "step": 3787 }, { "epoch": 0.9120568229699633, "grad_norm": 2.685506820678711, "learning_rate": 4.066694760740996e-06, "loss": 0.732, "step": 3788 }, { "epoch": 0.9122975982664179, "grad_norm": 0.8585965633392334, "learning_rate": 4.044713547551504e-06, "loss": 0.4282, "step": 3789 }, { "epoch": 0.9125383735628725, "grad_norm": 1.1643730401992798, "learning_rate": 4.022790675392385e-06, "loss": 0.3473, "step": 3790 }, { "epoch": 0.912779148859327, "grad_norm": 1.323641061782837, "learning_rate": 4.0009261575927545e-06, "loss": 0.3057, "step": 3791 }, { "epoch": 0.9130199241557816, "grad_norm": 0.6881577968597412, "learning_rate": 3.979120007446313e-06, "loss": 0.3613, "step": 3792 }, { "epoch": 0.9132606994522362, "grad_norm": 2.510683298110962, "learning_rate": 3.957372238211254e-06, "loss": 0.8894, "step": 3793 }, { "epoch": 0.9135014747486908, "grad_norm": 1.2805593013763428, "learning_rate": 3.935682863110246e-06, "loss": 0.3263, "step": 3794 }, { "epoch": 0.9137422500451454, "grad_norm": 0.9712691903114319, "learning_rate": 3.914051895330506e-06, "loss": 0.3099, "step": 3795 }, { "epoch": 0.9139830253415999, "grad_norm": 6.864174842834473, "learning_rate": 3.892479348023681e-06, "loss": 0.5236, "step": 3796 }, { "epoch": 0.9142238006380545, "grad_norm": 3.510481834411621, "learning_rate": 3.8709652343059565e-06, "loss": 0.6298, "step": 3797 }, { "epoch": 0.9144645759345091, "grad_norm": 1.1707494258880615, "learning_rate": 3.849509567257959e-06, "loss": 0.5126, "step": 3798 }, { "epoch": 0.9147053512309637, "grad_norm": 1.940595269203186, "learning_rate": 3.828112359924763e-06, "loss": 0.5885, "step": 3799 }, { "epoch": 0.9149461265274182, "grad_norm": 3.032635450363159, "learning_rate": 3.8067736253159404e-06, "loss": 0.4042, "step": 3800 }, { "epoch": 0.9151869018238729, "grad_norm": 1.1645691394805908, "learning_rate": 3.785493376405469e-06, "loss": 0.441, "step": 3801 }, { "epoch": 0.9154276771203275, "grad_norm": 4.984523296356201, "learning_rate": 3.764271626131799e-06, "loss": 0.4769, "step": 3802 }, { "epoch": 0.9156684524167821, "grad_norm": 1.2044367790222168, "learning_rate": 3.743108387397798e-06, "loss": 0.3186, "step": 3803 }, { "epoch": 0.9159092277132366, "grad_norm": 0.9009220004081726, "learning_rate": 3.722003673070773e-06, "loss": 0.3869, "step": 3804 }, { "epoch": 0.9161500030096912, "grad_norm": 3.3949239253997803, "learning_rate": 3.700957495982427e-06, "loss": 0.4886, "step": 3805 }, { "epoch": 0.9163907783061458, "grad_norm": 1.1531803607940674, "learning_rate": 3.6799698689289007e-06, "loss": 0.488, "step": 3806 }, { "epoch": 0.9166315536026004, "grad_norm": 1.1033868789672852, "learning_rate": 3.659040804670699e-06, "loss": 0.4561, "step": 3807 }, { "epoch": 0.916872328899055, "grad_norm": 0.8439015746116638, "learning_rate": 3.638170315932754e-06, "loss": 0.443, "step": 3808 }, { "epoch": 0.9171131041955095, "grad_norm": 3.8729732036590576, "learning_rate": 3.6173584154043484e-06, "loss": 0.4908, "step": 3809 }, { "epoch": 0.9173538794919641, "grad_norm": 1.3288518190383911, "learning_rate": 3.5966051157391824e-06, "loss": 0.5959, "step": 3810 }, { "epoch": 0.9175946547884187, "grad_norm": 1.7388828992843628, "learning_rate": 3.575910429555307e-06, "loss": 0.1869, "step": 3811 }, { "epoch": 0.9178354300848733, "grad_norm": 1.7654670476913452, "learning_rate": 3.5552743694351354e-06, "loss": 0.5691, "step": 3812 }, { "epoch": 0.9180762053813278, "grad_norm": 1.1300573348999023, "learning_rate": 3.5346969479254532e-06, "loss": 0.5942, "step": 3813 }, { "epoch": 0.9183169806777824, "grad_norm": 1.7620266675949097, "learning_rate": 3.5141781775373527e-06, "loss": 0.623, "step": 3814 }, { "epoch": 0.9185577559742371, "grad_norm": 2.159961223602295, "learning_rate": 3.493718070746299e-06, "loss": 0.5568, "step": 3815 }, { "epoch": 0.9187985312706917, "grad_norm": 1.376558542251587, "learning_rate": 3.473316639992108e-06, "loss": 0.4562, "step": 3816 }, { "epoch": 0.9190393065671462, "grad_norm": 2.9691762924194336, "learning_rate": 3.4529738976788574e-06, "loss": 0.2444, "step": 3817 }, { "epoch": 0.9192800818636008, "grad_norm": 2.1220481395721436, "learning_rate": 3.4326898561750087e-06, "loss": 0.7534, "step": 3818 }, { "epoch": 0.9195208571600554, "grad_norm": 3.4113810062408447, "learning_rate": 3.412464527813297e-06, "loss": 0.4182, "step": 3819 }, { "epoch": 0.91976163245651, "grad_norm": 0.48114293813705444, "learning_rate": 3.3922979248907638e-06, "loss": 0.3866, "step": 3820 }, { "epoch": 0.9200024077529645, "grad_norm": 0.9137384295463562, "learning_rate": 3.372190059668756e-06, "loss": 0.4303, "step": 3821 }, { "epoch": 0.9202431830494191, "grad_norm": 2.026947021484375, "learning_rate": 3.3521409443728947e-06, "loss": 0.813, "step": 3822 }, { "epoch": 0.9204839583458737, "grad_norm": 4.139359474182129, "learning_rate": 3.332150591193095e-06, "loss": 0.5473, "step": 3823 }, { "epoch": 0.9207247336423283, "grad_norm": 0.5374311208724976, "learning_rate": 3.312219012283535e-06, "loss": 0.3422, "step": 3824 }, { "epoch": 0.9209655089387829, "grad_norm": 1.6601024866104126, "learning_rate": 3.2923462197626433e-06, "loss": 0.777, "step": 3825 }, { "epoch": 0.9212062842352374, "grad_norm": 1.9013340473175049, "learning_rate": 3.272532225713143e-06, "loss": 0.8191, "step": 3826 }, { "epoch": 0.921447059531692, "grad_norm": 1.4580706357955933, "learning_rate": 3.252777042181976e-06, "loss": 0.5187, "step": 3827 }, { "epoch": 0.9216878348281466, "grad_norm": 4.314640522003174, "learning_rate": 3.233080681180323e-06, "loss": 1.1448, "step": 3828 }, { "epoch": 0.9219286101246013, "grad_norm": 0.6895533800125122, "learning_rate": 3.21344315468366e-06, "loss": 0.3816, "step": 3829 }, { "epoch": 0.9221693854210558, "grad_norm": 0.710097074508667, "learning_rate": 3.193864474631614e-06, "loss": 0.317, "step": 3830 }, { "epoch": 0.9224101607175104, "grad_norm": 3.7332968711853027, "learning_rate": 3.174344652928063e-06, "loss": 0.261, "step": 3831 }, { "epoch": 0.922650936013965, "grad_norm": 1.7163169384002686, "learning_rate": 3.1548837014411357e-06, "loss": 0.5922, "step": 3832 }, { "epoch": 0.9228917113104196, "grad_norm": 1.793519139289856, "learning_rate": 3.135481632003101e-06, "loss": 1.0053, "step": 3833 }, { "epoch": 0.9231324866068741, "grad_norm": 2.4100849628448486, "learning_rate": 3.116138456410478e-06, "loss": 0.5618, "step": 3834 }, { "epoch": 0.9233732619033287, "grad_norm": 1.54547119140625, "learning_rate": 3.0968541864239476e-06, "loss": 0.1529, "step": 3835 }, { "epoch": 0.9236140371997833, "grad_norm": 1.1684705018997192, "learning_rate": 3.0776288337683977e-06, "loss": 0.2276, "step": 3836 }, { "epoch": 0.9238548124962379, "grad_norm": 2.4300782680511475, "learning_rate": 3.0584624101328785e-06, "loss": 0.4722, "step": 3837 }, { "epoch": 0.9240955877926925, "grad_norm": 1.5282294750213623, "learning_rate": 3.0393549271706345e-06, "loss": 0.5003, "step": 3838 }, { "epoch": 0.924336363089147, "grad_norm": 2.3679680824279785, "learning_rate": 3.0203063964990617e-06, "loss": 0.6853, "step": 3839 }, { "epoch": 0.9245771383856016, "grad_norm": 0.8373381495475769, "learning_rate": 3.001316829699685e-06, "loss": 0.2153, "step": 3840 }, { "epoch": 0.9248179136820562, "grad_norm": 3.2224361896514893, "learning_rate": 2.982386238318213e-06, "loss": 0.4978, "step": 3841 }, { "epoch": 0.9250586889785108, "grad_norm": 3.300617218017578, "learning_rate": 2.963514633864506e-06, "loss": 0.747, "step": 3842 }, { "epoch": 0.9252994642749653, "grad_norm": 1.0810372829437256, "learning_rate": 2.9447020278125072e-06, "loss": 0.4873, "step": 3843 }, { "epoch": 0.92554023957142, "grad_norm": 1.0733656883239746, "learning_rate": 2.925948431600356e-06, "loss": 0.6448, "step": 3844 }, { "epoch": 0.9257810148678746, "grad_norm": 1.8431061506271362, "learning_rate": 2.9072538566302654e-06, "loss": 0.3739, "step": 3845 }, { "epoch": 0.9260217901643292, "grad_norm": 4.552158832550049, "learning_rate": 2.8886183142685763e-06, "loss": 1.5292, "step": 3846 }, { "epoch": 0.9262625654607837, "grad_norm": 1.8968464136123657, "learning_rate": 2.87004181584577e-06, "loss": 0.3096, "step": 3847 }, { "epoch": 0.9265033407572383, "grad_norm": 1.599402904510498, "learning_rate": 2.8515243726563557e-06, "loss": 0.8706, "step": 3848 }, { "epoch": 0.9267441160536929, "grad_norm": 1.9225425720214844, "learning_rate": 2.8330659959589946e-06, "loss": 0.5347, "step": 3849 }, { "epoch": 0.9269848913501475, "grad_norm": 2.9519357681274414, "learning_rate": 2.8146666969764535e-06, "loss": 0.7436, "step": 3850 }, { "epoch": 0.9272256666466021, "grad_norm": 0.88628089427948, "learning_rate": 2.7963264868955065e-06, "loss": 0.5481, "step": 3851 }, { "epoch": 0.9274664419430566, "grad_norm": 2.948660373687744, "learning_rate": 2.7780453768670557e-06, "loss": 0.8285, "step": 3852 }, { "epoch": 0.9277072172395112, "grad_norm": 0.3918326497077942, "learning_rate": 2.7598233780060543e-06, "loss": 0.3189, "step": 3853 }, { "epoch": 0.9279479925359658, "grad_norm": 0.8619207739830017, "learning_rate": 2.7416605013915297e-06, "loss": 0.2851, "step": 3854 }, { "epoch": 0.9281887678324204, "grad_norm": 2.232577323913574, "learning_rate": 2.7235567580665587e-06, "loss": 0.5436, "step": 3855 }, { "epoch": 0.9284295431288749, "grad_norm": 3.190218448638916, "learning_rate": 2.705512159038226e-06, "loss": 0.6544, "step": 3856 }, { "epoch": 0.9286703184253295, "grad_norm": 2.477781295776367, "learning_rate": 2.687526715277722e-06, "loss": 0.5965, "step": 3857 }, { "epoch": 0.9289110937217842, "grad_norm": 1.608775019645691, "learning_rate": 2.669600437720221e-06, "loss": 0.3984, "step": 3858 }, { "epoch": 0.9291518690182388, "grad_norm": 4.561281204223633, "learning_rate": 2.651733337264928e-06, "loss": 0.4893, "step": 3859 }, { "epoch": 0.9293926443146933, "grad_norm": 6.173032760620117, "learning_rate": 2.6339254247751078e-06, "loss": 0.7945, "step": 3860 }, { "epoch": 0.9296334196111479, "grad_norm": 1.251524806022644, "learning_rate": 2.616176711077989e-06, "loss": 0.6277, "step": 3861 }, { "epoch": 0.9298741949076025, "grad_norm": 1.0110701322555542, "learning_rate": 2.5984872069648393e-06, "loss": 0.5838, "step": 3862 }, { "epoch": 0.9301149702040571, "grad_norm": 2.0771772861480713, "learning_rate": 2.580856923190933e-06, "loss": 0.9169, "step": 3863 }, { "epoch": 0.9303557455005117, "grad_norm": 1.6572563648223877, "learning_rate": 2.5632858704754848e-06, "loss": 0.4677, "step": 3864 }, { "epoch": 0.9305965207969662, "grad_norm": 1.9874509572982788, "learning_rate": 2.5457740595017707e-06, "loss": 0.4567, "step": 3865 }, { "epoch": 0.9308372960934208, "grad_norm": 2.116501808166504, "learning_rate": 2.5283215009169857e-06, "loss": 0.4592, "step": 3866 }, { "epoch": 0.9310780713898754, "grad_norm": 2.0553879737854004, "learning_rate": 2.51092820533233e-06, "loss": 0.3902, "step": 3867 }, { "epoch": 0.93131884668633, "grad_norm": 2.063753604888916, "learning_rate": 2.4935941833229782e-06, "loss": 0.7985, "step": 3868 }, { "epoch": 0.9315596219827845, "grad_norm": 1.6043528318405151, "learning_rate": 2.4763194454280435e-06, "loss": 0.7647, "step": 3869 }, { "epoch": 0.9318003972792391, "grad_norm": 1.9053353071212769, "learning_rate": 2.4591040021506027e-06, "loss": 0.6018, "step": 3870 }, { "epoch": 0.9320411725756937, "grad_norm": 2.5057151317596436, "learning_rate": 2.4419478639577164e-06, "loss": 0.7383, "step": 3871 }, { "epoch": 0.9322819478721484, "grad_norm": 2.225681781768799, "learning_rate": 2.424851041280307e-06, "loss": 0.4332, "step": 3872 }, { "epoch": 0.9325227231686029, "grad_norm": 0.5860837697982788, "learning_rate": 2.4078135445133156e-06, "loss": 0.6404, "step": 3873 }, { "epoch": 0.9327634984650575, "grad_norm": 3.1039059162139893, "learning_rate": 2.390835384015555e-06, "loss": 0.6935, "step": 3874 }, { "epoch": 0.9330042737615121, "grad_norm": 2.5515451431274414, "learning_rate": 2.373916570109802e-06, "loss": 0.3266, "step": 3875 }, { "epoch": 0.9332450490579667, "grad_norm": 3.696157455444336, "learning_rate": 2.357057113082728e-06, "loss": 0.3768, "step": 3876 }, { "epoch": 0.9334858243544213, "grad_norm": 2.1884636878967285, "learning_rate": 2.340257023184922e-06, "loss": 0.2111, "step": 3877 }, { "epoch": 0.9337265996508758, "grad_norm": 4.436749458312988, "learning_rate": 2.323516310630891e-06, "loss": 0.9444, "step": 3878 }, { "epoch": 0.9339673749473304, "grad_norm": 1.7192350625991821, "learning_rate": 2.3068349855989936e-06, "loss": 0.3693, "step": 3879 }, { "epoch": 0.934208150243785, "grad_norm": 5.450645446777344, "learning_rate": 2.2902130582315274e-06, "loss": 0.4809, "step": 3880 }, { "epoch": 0.9344489255402396, "grad_norm": 1.2411659955978394, "learning_rate": 2.2736505386346863e-06, "loss": 1.0273, "step": 3881 }, { "epoch": 0.9346897008366941, "grad_norm": 0.8129162192344666, "learning_rate": 2.2571474368784707e-06, "loss": 0.0588, "step": 3882 }, { "epoch": 0.9349304761331487, "grad_norm": 1.5056270360946655, "learning_rate": 2.240703762996843e-06, "loss": 0.6161, "step": 3883 }, { "epoch": 0.9351712514296033, "grad_norm": 1.0241050720214844, "learning_rate": 2.224319526987584e-06, "loss": 0.4896, "step": 3884 }, { "epoch": 0.9354120267260579, "grad_norm": 0.8922635316848755, "learning_rate": 2.2079947388123356e-06, "loss": 0.5838, "step": 3885 }, { "epoch": 0.9356528020225124, "grad_norm": 3.8598411083221436, "learning_rate": 2.1917294083966254e-06, "loss": 0.5277, "step": 3886 }, { "epoch": 0.935893577318967, "grad_norm": 10.636117935180664, "learning_rate": 2.1755235456297986e-06, "loss": 0.9566, "step": 3887 }, { "epoch": 0.9361343526154217, "grad_norm": 0.4968515932559967, "learning_rate": 2.15937716036505e-06, "loss": 0.563, "step": 3888 }, { "epoch": 0.9363751279118763, "grad_norm": 8.35496711730957, "learning_rate": 2.1432902624194286e-06, "loss": 0.2234, "step": 3889 }, { "epoch": 0.9366159032083309, "grad_norm": 1.9201698303222656, "learning_rate": 2.1272628615737977e-06, "loss": 0.797, "step": 3890 }, { "epoch": 0.9368566785047854, "grad_norm": 3.2286055088043213, "learning_rate": 2.1112949675728743e-06, "loss": 0.6248, "step": 3891 }, { "epoch": 0.93709745380124, "grad_norm": 1.6388925313949585, "learning_rate": 2.0953865901251255e-06, "loss": 0.685, "step": 3892 }, { "epoch": 0.9373382290976946, "grad_norm": 1.8927644491195679, "learning_rate": 2.0795377389029257e-06, "loss": 0.3773, "step": 3893 }, { "epoch": 0.9375790043941492, "grad_norm": 11.435422897338867, "learning_rate": 2.063748423542411e-06, "loss": 0.7658, "step": 3894 }, { "epoch": 0.9378197796906037, "grad_norm": 1.6999096870422363, "learning_rate": 2.048018653643491e-06, "loss": 0.727, "step": 3895 }, { "epoch": 0.9380605549870583, "grad_norm": 0.23305965960025787, "learning_rate": 2.0323484387699264e-06, "loss": 0.1286, "step": 3896 }, { "epoch": 0.9383013302835129, "grad_norm": 1.2219979763031006, "learning_rate": 2.0167377884492412e-06, "loss": 0.3562, "step": 3897 }, { "epoch": 0.9385421055799675, "grad_norm": 1.201636552810669, "learning_rate": 2.0011867121727313e-06, "loss": 0.6063, "step": 3898 }, { "epoch": 0.938782880876422, "grad_norm": 9.849644660949707, "learning_rate": 1.9856952193955005e-06, "loss": 0.9423, "step": 3899 }, { "epoch": 0.9390236561728766, "grad_norm": 1.613932490348816, "learning_rate": 1.9702633195363917e-06, "loss": 0.522, "step": 3900 }, { "epoch": 0.9392644314693313, "grad_norm": 2.0753109455108643, "learning_rate": 1.954891021978045e-06, "loss": 0.844, "step": 3901 }, { "epoch": 0.9395052067657859, "grad_norm": 2.056060552597046, "learning_rate": 1.9395783360668718e-06, "loss": 0.5813, "step": 3902 }, { "epoch": 0.9397459820622404, "grad_norm": 1.1586860418319702, "learning_rate": 1.9243252711129923e-06, "loss": 0.8256, "step": 3903 }, { "epoch": 0.939986757358695, "grad_norm": 2.10019850730896, "learning_rate": 1.909131836390321e-06, "loss": 1.029, "step": 3904 }, { "epoch": 0.9402275326551496, "grad_norm": 0.8406896591186523, "learning_rate": 1.893998041136502e-06, "loss": 0.4226, "step": 3905 }, { "epoch": 0.9404683079516042, "grad_norm": 1.0460152626037598, "learning_rate": 1.8789238945528976e-06, "loss": 0.5012, "step": 3906 }, { "epoch": 0.9407090832480588, "grad_norm": 3.7730448246002197, "learning_rate": 1.8639094058046425e-06, "loss": 1.1832, "step": 3907 }, { "epoch": 0.9409498585445133, "grad_norm": 3.430011034011841, "learning_rate": 1.848954584020568e-06, "loss": 0.3868, "step": 3908 }, { "epoch": 0.9411906338409679, "grad_norm": 3.7194321155548096, "learning_rate": 1.834059438293234e-06, "loss": 0.8586, "step": 3909 }, { "epoch": 0.9414314091374225, "grad_norm": 1.3014260530471802, "learning_rate": 1.819223977678941e-06, "loss": 0.5398, "step": 3910 }, { "epoch": 0.9416721844338771, "grad_norm": 0.9454424381256104, "learning_rate": 1.8044482111976735e-06, "loss": 0.6537, "step": 3911 }, { "epoch": 0.9419129597303316, "grad_norm": 1.6029918193817139, "learning_rate": 1.7897321478331342e-06, "loss": 0.3148, "step": 3912 }, { "epoch": 0.9421537350267862, "grad_norm": 3.4292304515838623, "learning_rate": 1.7750757965327213e-06, "loss": 0.5388, "step": 3913 }, { "epoch": 0.9423945103232408, "grad_norm": 2.228184461593628, "learning_rate": 1.7604791662075181e-06, "loss": 0.7117, "step": 3914 }, { "epoch": 0.9426352856196955, "grad_norm": 0.6483622193336487, "learning_rate": 1.7459422657323254e-06, "loss": 0.4383, "step": 3915 }, { "epoch": 0.94287606091615, "grad_norm": 2.6446633338928223, "learning_rate": 1.7314651039455954e-06, "loss": 0.6101, "step": 3916 }, { "epoch": 0.9431168362126046, "grad_norm": 2.0396006107330322, "learning_rate": 1.717047689649487e-06, "loss": 0.5838, "step": 3917 }, { "epoch": 0.9433576115090592, "grad_norm": 0.8823184370994568, "learning_rate": 1.7026900316098215e-06, "loss": 0.3099, "step": 3918 }, { "epoch": 0.9435983868055138, "grad_norm": 2.7773776054382324, "learning_rate": 1.688392138556083e-06, "loss": 0.9675, "step": 3919 }, { "epoch": 0.9438391621019684, "grad_norm": 5.874734878540039, "learning_rate": 1.6741540191814287e-06, "loss": 0.3189, "step": 3920 }, { "epoch": 0.9440799373984229, "grad_norm": 3.0924699306488037, "learning_rate": 1.6599756821426449e-06, "loss": 0.1751, "step": 3921 }, { "epoch": 0.9443207126948775, "grad_norm": 0.5945261120796204, "learning_rate": 1.6458571360602248e-06, "loss": 0.6427, "step": 3922 }, { "epoch": 0.9445614879913321, "grad_norm": 4.261098861694336, "learning_rate": 1.6317983895182575e-06, "loss": 0.7257, "step": 3923 }, { "epoch": 0.9448022632877867, "grad_norm": 3.0185914039611816, "learning_rate": 1.6177994510644834e-06, "loss": 0.6177, "step": 3924 }, { "epoch": 0.9450430385842412, "grad_norm": 1.362781286239624, "learning_rate": 1.603860329210316e-06, "loss": 1.1531, "step": 3925 }, { "epoch": 0.9452838138806958, "grad_norm": 1.4825752973556519, "learning_rate": 1.589981032430743e-06, "loss": 0.7275, "step": 3926 }, { "epoch": 0.9455245891771504, "grad_norm": 1.4190683364868164, "learning_rate": 1.576161569164436e-06, "loss": 0.7995, "step": 3927 }, { "epoch": 0.945765364473605, "grad_norm": 1.807726263999939, "learning_rate": 1.5624019478136408e-06, "loss": 0.3198, "step": 3928 }, { "epoch": 0.9460061397700595, "grad_norm": 2.1518940925598145, "learning_rate": 1.5487021767442433e-06, "loss": 0.4084, "step": 3929 }, { "epoch": 0.9462469150665141, "grad_norm": 1.5749576091766357, "learning_rate": 1.535062264285736e-06, "loss": 0.6711, "step": 3930 }, { "epoch": 0.9464876903629688, "grad_norm": 0.5705631375312805, "learning_rate": 1.5214822187312294e-06, "loss": 0.3036, "step": 3931 }, { "epoch": 0.9467284656594234, "grad_norm": 0.7009626030921936, "learning_rate": 1.5079620483373857e-06, "loss": 0.5722, "step": 3932 }, { "epoch": 0.946969240955878, "grad_norm": 1.3820369243621826, "learning_rate": 1.4945017613245294e-06, "loss": 0.1829, "step": 3933 }, { "epoch": 0.9472100162523325, "grad_norm": 5.041447639465332, "learning_rate": 1.481101365876547e-06, "loss": 0.6447, "step": 3934 }, { "epoch": 0.9474507915487871, "grad_norm": 3.8214685916900635, "learning_rate": 1.4677608701408886e-06, "loss": 0.7699, "step": 3935 }, { "epoch": 0.9476915668452417, "grad_norm": 2.032578468322754, "learning_rate": 1.4544802822286318e-06, "loss": 0.5696, "step": 3936 }, { "epoch": 0.9479323421416963, "grad_norm": 0.9442195892333984, "learning_rate": 1.4412596102143738e-06, "loss": 0.2906, "step": 3937 }, { "epoch": 0.9481731174381508, "grad_norm": 1.667283535003662, "learning_rate": 1.42809886213634e-06, "loss": 0.4485, "step": 3938 }, { "epoch": 0.9484138927346054, "grad_norm": 2.698345184326172, "learning_rate": 1.4149980459962742e-06, "loss": 0.3603, "step": 3939 }, { "epoch": 0.94865466803106, "grad_norm": 1.8065334558486938, "learning_rate": 1.4019571697595156e-06, "loss": 0.2997, "step": 3940 }, { "epoch": 0.9488954433275146, "grad_norm": 2.3097822666168213, "learning_rate": 1.3889762413549333e-06, "loss": 0.5277, "step": 3941 }, { "epoch": 0.9491362186239691, "grad_norm": 2.765949249267578, "learning_rate": 1.3760552686749806e-06, "loss": 0.3425, "step": 3942 }, { "epoch": 0.9493769939204237, "grad_norm": 0.5022979378700256, "learning_rate": 1.3631942595756175e-06, "loss": 0.5907, "step": 3943 }, { "epoch": 0.9496177692168783, "grad_norm": 1.2168604135513306, "learning_rate": 1.3503932218763893e-06, "loss": 0.2498, "step": 3944 }, { "epoch": 0.949858544513333, "grad_norm": 0.7240424752235413, "learning_rate": 1.3376521633603256e-06, "loss": 0.2152, "step": 3945 }, { "epoch": 0.9500993198097876, "grad_norm": 5.824214935302734, "learning_rate": 1.324971091774052e-06, "loss": 0.7278, "step": 3946 }, { "epoch": 0.9503400951062421, "grad_norm": 2.0212886333465576, "learning_rate": 1.312350014827668e-06, "loss": 0.8276, "step": 3947 }, { "epoch": 0.9505808704026967, "grad_norm": 4.131972789764404, "learning_rate": 1.2997889401948126e-06, "loss": 0.4576, "step": 3948 }, { "epoch": 0.9508216456991513, "grad_norm": 0.5388569831848145, "learning_rate": 1.287287875512655e-06, "loss": 0.3775, "step": 3949 }, { "epoch": 0.9510624209956059, "grad_norm": 0.646866500377655, "learning_rate": 1.2748468283818815e-06, "loss": 0.2518, "step": 3950 }, { "epoch": 0.9513031962920604, "grad_norm": 1.9133360385894775, "learning_rate": 1.2624658063666639e-06, "loss": 0.6595, "step": 3951 }, { "epoch": 0.951543971588515, "grad_norm": 1.385985255241394, "learning_rate": 1.2501448169946916e-06, "loss": 0.6742, "step": 3952 }, { "epoch": 0.9517847468849696, "grad_norm": 2.3750483989715576, "learning_rate": 1.2378838677571503e-06, "loss": 0.9347, "step": 3953 }, { "epoch": 0.9520255221814242, "grad_norm": 2.106820583343506, "learning_rate": 1.2256829661087432e-06, "loss": 0.4332, "step": 3954 }, { "epoch": 0.9522662974778787, "grad_norm": 1.1812132596969604, "learning_rate": 1.2135421194676256e-06, "loss": 0.4027, "step": 3955 }, { "epoch": 0.9525070727743333, "grad_norm": 1.4752898216247559, "learning_rate": 1.2014613352154702e-06, "loss": 0.2301, "step": 3956 }, { "epoch": 0.9527478480707879, "grad_norm": 1.9427971839904785, "learning_rate": 1.189440620697424e-06, "loss": 0.6347, "step": 3957 }, { "epoch": 0.9529886233672425, "grad_norm": 2.3208911418914795, "learning_rate": 1.1774799832220961e-06, "loss": 0.9104, "step": 3958 }, { "epoch": 0.9532293986636972, "grad_norm": 1.113741397857666, "learning_rate": 1.1655794300615918e-06, "loss": 0.8192, "step": 3959 }, { "epoch": 0.9534701739601517, "grad_norm": 2.645212411880493, "learning_rate": 1.1537389684514787e-06, "loss": 0.7612, "step": 3960 }, { "epoch": 0.9537109492566063, "grad_norm": 4.026910781860352, "learning_rate": 1.141958605590765e-06, "loss": 0.8181, "step": 3961 }, { "epoch": 0.9539517245530609, "grad_norm": 3.301568031311035, "learning_rate": 1.1302383486419544e-06, "loss": 1.2559, "step": 3962 }, { "epoch": 0.9541924998495155, "grad_norm": 1.3520029783248901, "learning_rate": 1.11857820473098e-06, "loss": 0.6955, "step": 3963 }, { "epoch": 0.95443327514597, "grad_norm": 3.8628885746002197, "learning_rate": 1.106978180947238e-06, "loss": 0.9347, "step": 3964 }, { "epoch": 0.9546740504424246, "grad_norm": 1.2216380834579468, "learning_rate": 1.095438284343575e-06, "loss": 0.6449, "step": 3965 }, { "epoch": 0.9549148257388792, "grad_norm": 0.4715072214603424, "learning_rate": 1.083958521936257e-06, "loss": 0.2787, "step": 3966 }, { "epoch": 0.9551556010353338, "grad_norm": 2.9943132400512695, "learning_rate": 1.0725389007050446e-06, "loss": 0.8323, "step": 3967 }, { "epoch": 0.9553963763317883, "grad_norm": 3.0367226600646973, "learning_rate": 1.0611794275930399e-06, "loss": 0.8128, "step": 3968 }, { "epoch": 0.9556371516282429, "grad_norm": 1.5612680912017822, "learning_rate": 1.0498801095068733e-06, "loss": 0.2859, "step": 3969 }, { "epoch": 0.9558779269246975, "grad_norm": 1.2354720830917358, "learning_rate": 1.0386409533165276e-06, "loss": 0.6305, "step": 3970 }, { "epoch": 0.9561187022211521, "grad_norm": 3.0069570541381836, "learning_rate": 1.0274619658554475e-06, "loss": 0.6211, "step": 3971 }, { "epoch": 0.9563594775176067, "grad_norm": 2.1763761043548584, "learning_rate": 1.0163431539204847e-06, "loss": 0.7641, "step": 3972 }, { "epoch": 0.9566002528140612, "grad_norm": 2.459559202194214, "learning_rate": 1.005284524271899e-06, "loss": 0.822, "step": 3973 }, { "epoch": 0.9568410281105159, "grad_norm": 1.9608721733093262, "learning_rate": 9.942860836333445e-07, "loss": 0.8843, "step": 3974 }, { "epoch": 0.9570818034069705, "grad_norm": 2.488222599029541, "learning_rate": 9.833478386919282e-07, "loss": 0.9933, "step": 3975 }, { "epoch": 0.9573225787034251, "grad_norm": 1.750231146812439, "learning_rate": 9.724697960981077e-07, "loss": 0.3185, "step": 3976 }, { "epoch": 0.9575633539998796, "grad_norm": 1.300431489944458, "learning_rate": 9.616519624657706e-07, "loss": 0.4801, "step": 3977 }, { "epoch": 0.9578041292963342, "grad_norm": 0.6917396783828735, "learning_rate": 9.508943443721663e-07, "loss": 0.5667, "step": 3978 }, { "epoch": 0.9580449045927888, "grad_norm": 3.407341480255127, "learning_rate": 9.401969483579632e-07, "loss": 0.5509, "step": 3979 }, { "epoch": 0.9582856798892434, "grad_norm": 2.0416157245635986, "learning_rate": 9.295597809272028e-07, "loss": 0.1392, "step": 3980 }, { "epoch": 0.9585264551856979, "grad_norm": 2.7798619270324707, "learning_rate": 9.189828485473006e-07, "loss": 0.9261, "step": 3981 }, { "epoch": 0.9587672304821525, "grad_norm": 0.8763837218284607, "learning_rate": 9.084661576490461e-07, "loss": 0.398, "step": 3982 }, { "epoch": 0.9590080057786071, "grad_norm": 1.7817946672439575, "learning_rate": 8.980097146266464e-07, "loss": 0.4061, "step": 3983 }, { "epoch": 0.9592487810750617, "grad_norm": 1.229095458984375, "learning_rate": 8.876135258376051e-07, "loss": 0.4831, "step": 3984 }, { "epoch": 0.9594895563715162, "grad_norm": 3.4042162895202637, "learning_rate": 8.772775976028546e-07, "loss": 0.4386, "step": 3985 }, { "epoch": 0.9597303316679708, "grad_norm": 1.3494471311569214, "learning_rate": 8.670019362066461e-07, "loss": 0.1705, "step": 3986 }, { "epoch": 0.9599711069644254, "grad_norm": 1.1241267919540405, "learning_rate": 8.567865478966042e-07, "loss": 0.3146, "step": 3987 }, { "epoch": 0.9602118822608801, "grad_norm": 0.9588642120361328, "learning_rate": 8.466314388837271e-07, "loss": 0.2576, "step": 3988 }, { "epoch": 0.9604526575573347, "grad_norm": 0.49164265394210815, "learning_rate": 8.365366153423204e-07, "loss": 0.2641, "step": 3989 }, { "epoch": 0.9606934328537892, "grad_norm": 1.6935783624649048, "learning_rate": 8.265020834100635e-07, "loss": 0.5948, "step": 3990 }, { "epoch": 0.9609342081502438, "grad_norm": 1.3072270154953003, "learning_rate": 8.165278491879868e-07, "loss": 0.5994, "step": 3991 }, { "epoch": 0.9611749834466984, "grad_norm": 4.324315071105957, "learning_rate": 8.066139187404398e-07, "loss": 0.4573, "step": 3992 }, { "epoch": 0.961415758743153, "grad_norm": 6.5658087730407715, "learning_rate": 7.967602980951228e-07, "loss": 0.6555, "step": 3993 }, { "epoch": 0.9616565340396075, "grad_norm": 2.510852813720703, "learning_rate": 7.869669932430435e-07, "loss": 0.617, "step": 3994 }, { "epoch": 0.9618973093360621, "grad_norm": 1.054416298866272, "learning_rate": 7.772340101385611e-07, "loss": 0.7014, "step": 3995 }, { "epoch": 0.9621380846325167, "grad_norm": 3.1223275661468506, "learning_rate": 7.675613546993643e-07, "loss": 0.7056, "step": 3996 }, { "epoch": 0.9623788599289713, "grad_norm": 2.15596866607666, "learning_rate": 7.579490328064265e-07, "loss": 0.7612, "step": 3997 }, { "epoch": 0.9626196352254258, "grad_norm": 3.02179217338562, "learning_rate": 7.483970503040726e-07, "loss": 0.6353, "step": 3998 }, { "epoch": 0.9628604105218804, "grad_norm": 1.420333743095398, "learning_rate": 7.38905412999924e-07, "loss": 0.3904, "step": 3999 }, { "epoch": 0.963101185818335, "grad_norm": 2.189934253692627, "learning_rate": 7.294741266649307e-07, "loss": 0.4709, "step": 4000 }, { "epoch": 0.9633419611147896, "grad_norm": 3.9134743213653564, "learning_rate": 7.201031970333283e-07, "loss": 0.3967, "step": 4001 }, { "epoch": 0.9635827364112443, "grad_norm": 14.885796546936035, "learning_rate": 7.10792629802659e-07, "loss": 0.9829, "step": 4002 }, { "epoch": 0.9638235117076988, "grad_norm": 2.1734344959259033, "learning_rate": 7.015424306337725e-07, "loss": 0.3751, "step": 4003 }, { "epoch": 0.9640642870041534, "grad_norm": 2.0911247730255127, "learning_rate": 6.923526051508145e-07, "loss": 0.7239, "step": 4004 }, { "epoch": 0.964305062300608, "grad_norm": 2.525022029876709, "learning_rate": 6.832231589412042e-07, "loss": 0.2855, "step": 4005 }, { "epoch": 0.9645458375970626, "grad_norm": 1.239410161972046, "learning_rate": 6.741540975556903e-07, "loss": 0.3171, "step": 4006 }, { "epoch": 0.9647866128935171, "grad_norm": 1.3856205940246582, "learning_rate": 6.651454265082512e-07, "loss": 0.5041, "step": 4007 }, { "epoch": 0.9650273881899717, "grad_norm": 1.1256098747253418, "learning_rate": 6.561971512762055e-07, "loss": 0.4639, "step": 4008 }, { "epoch": 0.9652681634864263, "grad_norm": 1.6934860944747925, "learning_rate": 6.473092773001233e-07, "loss": 0.488, "step": 4009 }, { "epoch": 0.9655089387828809, "grad_norm": 3.070348024368286, "learning_rate": 6.384818099838374e-07, "loss": 0.9032, "step": 4010 }, { "epoch": 0.9657497140793354, "grad_norm": 1.458402395248413, "learning_rate": 6.297147546944882e-07, "loss": 0.4057, "step": 4011 }, { "epoch": 0.96599048937579, "grad_norm": 2.034212589263916, "learning_rate": 6.210081167624338e-07, "loss": 0.265, "step": 4012 }, { "epoch": 0.9662312646722446, "grad_norm": 1.1797361373901367, "learning_rate": 6.12361901481362e-07, "loss": 0.6248, "step": 4013 }, { "epoch": 0.9664720399686992, "grad_norm": 4.208076000213623, "learning_rate": 6.037761141081677e-07, "loss": 0.8388, "step": 4014 }, { "epoch": 0.9667128152651538, "grad_norm": 3.974991798400879, "learning_rate": 5.952507598630419e-07, "loss": 0.8926, "step": 4015 }, { "epoch": 0.9669535905616083, "grad_norm": 1.3050296306610107, "learning_rate": 5.86785843929416e-07, "loss": 0.438, "step": 4016 }, { "epoch": 0.967194365858063, "grad_norm": 4.134682655334473, "learning_rate": 5.783813714539731e-07, "loss": 1.4216, "step": 4017 }, { "epoch": 0.9674351411545176, "grad_norm": 1.5222718715667725, "learning_rate": 5.700373475466592e-07, "loss": 0.5619, "step": 4018 }, { "epoch": 0.9676759164509722, "grad_norm": 1.8690755367279053, "learning_rate": 5.617537772806602e-07, "loss": 0.45, "step": 4019 }, { "epoch": 0.9679166917474267, "grad_norm": 2.2518856525421143, "learning_rate": 5.535306656923922e-07, "loss": 0.2202, "step": 4020 }, { "epoch": 0.9681574670438813, "grad_norm": 0.996590793132782, "learning_rate": 5.453680177815445e-07, "loss": 0.599, "step": 4021 }, { "epoch": 0.9683982423403359, "grad_norm": 2.219210624694824, "learning_rate": 5.372658385110141e-07, "loss": 0.4227, "step": 4022 }, { "epoch": 0.9686390176367905, "grad_norm": 4.723870754241943, "learning_rate": 5.29224132806938e-07, "loss": 0.6357, "step": 4023 }, { "epoch": 0.968879792933245, "grad_norm": 3.3653030395507812, "learning_rate": 5.212429055587165e-07, "loss": 0.5787, "step": 4024 }, { "epoch": 0.9691205682296996, "grad_norm": 2.3071415424346924, "learning_rate": 5.133221616189232e-07, "loss": 0.6784, "step": 4025 }, { "epoch": 0.9693613435261542, "grad_norm": 2.037489414215088, "learning_rate": 5.054619058033949e-07, "loss": 0.7301, "step": 4026 }, { "epoch": 0.9696021188226088, "grad_norm": 0.8913125991821289, "learning_rate": 4.976621428912087e-07, "loss": 0.3828, "step": 4027 }, { "epoch": 0.9698428941190634, "grad_norm": 2.0508530139923096, "learning_rate": 4.899228776246157e-07, "loss": 0.4969, "step": 4028 }, { "epoch": 0.9700836694155179, "grad_norm": 2.9134552478790283, "learning_rate": 4.822441147091072e-07, "loss": 0.4914, "step": 4029 }, { "epoch": 0.9703244447119725, "grad_norm": 1.360295295715332, "learning_rate": 4.7462585881339337e-07, "loss": 0.5586, "step": 4030 }, { "epoch": 0.9705652200084272, "grad_norm": 0.46363896131515503, "learning_rate": 4.6706811456939116e-07, "loss": 0.1493, "step": 4031 }, { "epoch": 0.9708059953048818, "grad_norm": 1.6321947574615479, "learning_rate": 4.595708865722359e-07, "loss": 0.7034, "step": 4032 }, { "epoch": 0.9710467706013363, "grad_norm": 3.976177930831909, "learning_rate": 4.5213417938023693e-07, "loss": 0.6017, "step": 4033 }, { "epoch": 0.9712875458977909, "grad_norm": 3.0852105617523193, "learning_rate": 4.4475799751494405e-07, "loss": 0.8562, "step": 4034 }, { "epoch": 0.9715283211942455, "grad_norm": 1.4149786233901978, "learning_rate": 4.374423454610921e-07, "loss": 0.331, "step": 4035 }, { "epoch": 0.9717690964907001, "grad_norm": 3.825847625732422, "learning_rate": 4.3018722766661193e-07, "loss": 0.7702, "step": 4036 }, { "epoch": 0.9720098717871546, "grad_norm": 5.9385175704956055, "learning_rate": 4.2299264854263056e-07, "loss": 0.4582, "step": 4037 }, { "epoch": 0.9722506470836092, "grad_norm": 2.5086779594421387, "learning_rate": 4.1585861246346e-07, "loss": 0.4805, "step": 4038 }, { "epoch": 0.9724914223800638, "grad_norm": 1.3712728023529053, "learning_rate": 4.087851237666196e-07, "loss": 0.4194, "step": 4039 }, { "epoch": 0.9727321976765184, "grad_norm": 4.936484336853027, "learning_rate": 4.017721867528246e-07, "loss": 0.4498, "step": 4040 }, { "epoch": 0.972972972972973, "grad_norm": 1.3534749746322632, "learning_rate": 3.948198056859198e-07, "loss": 0.6054, "step": 4041 }, { "epoch": 0.9732137482694275, "grad_norm": 3.1654248237609863, "learning_rate": 3.8792798479299066e-07, "loss": 0.8157, "step": 4042 }, { "epoch": 0.9734545235658821, "grad_norm": 1.9799362421035767, "learning_rate": 3.810967282642741e-07, "loss": 0.5287, "step": 4043 }, { "epoch": 0.9736952988623367, "grad_norm": 0.713421642780304, "learning_rate": 3.743260402531923e-07, "loss": 0.4187, "step": 4044 }, { "epoch": 0.9739360741587914, "grad_norm": 2.7310409545898438, "learning_rate": 3.676159248763411e-07, "loss": 0.8605, "step": 4045 }, { "epoch": 0.9741768494552459, "grad_norm": 0.2964976131916046, "learning_rate": 3.6096638621346824e-07, "loss": 0.176, "step": 4046 }, { "epoch": 0.9744176247517005, "grad_norm": 4.847579002380371, "learning_rate": 3.543774283075396e-07, "loss": 0.6553, "step": 4047 }, { "epoch": 0.9746584000481551, "grad_norm": 1.6431396007537842, "learning_rate": 3.478490551646285e-07, "loss": 0.5625, "step": 4048 }, { "epoch": 0.9748991753446097, "grad_norm": 1.354458212852478, "learning_rate": 3.413812707540154e-07, "loss": 0.3545, "step": 4049 }, { "epoch": 0.9751399506410642, "grad_norm": 0.7734440565109253, "learning_rate": 3.3497407900812126e-07, "loss": 0.3589, "step": 4050 }, { "epoch": 0.9753807259375188, "grad_norm": 2.87133526802063, "learning_rate": 3.2862748382253006e-07, "loss": 0.2974, "step": 4051 }, { "epoch": 0.9756215012339734, "grad_norm": 8.928435325622559, "learning_rate": 3.223414890559995e-07, "loss": 0.4755, "step": 4052 }, { "epoch": 0.975862276530428, "grad_norm": 3.5270206928253174, "learning_rate": 3.161160985304168e-07, "loss": 0.4525, "step": 4053 }, { "epoch": 0.9761030518268826, "grad_norm": 1.8198820352554321, "learning_rate": 3.0995131603083205e-07, "loss": 0.7032, "step": 4054 }, { "epoch": 0.9763438271233371, "grad_norm": 2.2425918579101562, "learning_rate": 3.038471453054581e-07, "loss": 0.8367, "step": 4055 }, { "epoch": 0.9765846024197917, "grad_norm": 2.4997448921203613, "learning_rate": 2.978035900656373e-07, "loss": 0.7143, "step": 4056 }, { "epoch": 0.9768253777162463, "grad_norm": 2.2880537509918213, "learning_rate": 2.918206539858637e-07, "loss": 0.5019, "step": 4057 }, { "epoch": 0.9770661530127009, "grad_norm": 0.7689948081970215, "learning_rate": 2.8589834070378295e-07, "loss": 0.2252, "step": 4058 }, { "epoch": 0.9773069283091554, "grad_norm": 1.0523358583450317, "learning_rate": 2.800366538201593e-07, "loss": 0.3874, "step": 4059 }, { "epoch": 0.97754770360561, "grad_norm": 2.757550001144409, "learning_rate": 2.742355968989307e-07, "loss": 0.3381, "step": 4060 }, { "epoch": 0.9777884789020647, "grad_norm": 0.9217396378517151, "learning_rate": 2.684951734671426e-07, "loss": 0.1354, "step": 4061 }, { "epoch": 0.9780292541985193, "grad_norm": 0.9151739478111267, "learning_rate": 2.6281538701498075e-07, "loss": 0.4916, "step": 4062 }, { "epoch": 0.9782700294949738, "grad_norm": 1.2322125434875488, "learning_rate": 2.571962409957718e-07, "loss": 0.5954, "step": 4063 }, { "epoch": 0.9785108047914284, "grad_norm": 6.46744441986084, "learning_rate": 2.5163773882598274e-07, "loss": 0.917, "step": 4064 }, { "epoch": 0.978751580087883, "grad_norm": 2.882272958755493, "learning_rate": 2.4613988388517696e-07, "loss": 0.7739, "step": 4065 }, { "epoch": 0.9789923553843376, "grad_norm": 1.9530011415481567, "learning_rate": 2.407026795160694e-07, "loss": 0.8486, "step": 4066 }, { "epoch": 0.9792331306807921, "grad_norm": 2.2306883335113525, "learning_rate": 2.3532612902449346e-07, "loss": 0.5382, "step": 4067 }, { "epoch": 0.9794739059772467, "grad_norm": 2.601823568344116, "learning_rate": 2.3001023567941205e-07, "loss": 0.7243, "step": 4068 }, { "epoch": 0.9797146812737013, "grad_norm": 1.1144752502441406, "learning_rate": 2.247550027128842e-07, "loss": 0.8548, "step": 4069 }, { "epoch": 0.9799554565701559, "grad_norm": 10.666824340820312, "learning_rate": 2.1956043332010955e-07, "loss": 0.9193, "step": 4070 }, { "epoch": 0.9801962318666105, "grad_norm": 1.2598254680633545, "learning_rate": 2.144265306594062e-07, "loss": 0.3549, "step": 4071 }, { "epoch": 0.980437007163065, "grad_norm": 2.1722021102905273, "learning_rate": 2.093532978521884e-07, "loss": 0.5457, "step": 4072 }, { "epoch": 0.9806777824595196, "grad_norm": 3.0609018802642822, "learning_rate": 2.0434073798298869e-07, "loss": 0.9473, "step": 4073 }, { "epoch": 0.9809185577559743, "grad_norm": 1.4855046272277832, "learning_rate": 1.9938885409948038e-07, "loss": 0.428, "step": 4074 }, { "epoch": 0.9811593330524289, "grad_norm": 2.298407793045044, "learning_rate": 1.9449764921238845e-07, "loss": 1.0341, "step": 4075 }, { "epoch": 0.9814001083488834, "grad_norm": 1.82069730758667, "learning_rate": 1.8966712629558957e-07, "loss": 0.9768, "step": 4076 }, { "epoch": 0.981640883645338, "grad_norm": 3.1224253177642822, "learning_rate": 1.848972882860567e-07, "loss": 0.6785, "step": 4077 }, { "epoch": 0.9818816589417926, "grad_norm": 3.6556875705718994, "learning_rate": 1.8018813808385883e-07, "loss": 0.6481, "step": 4078 }, { "epoch": 0.9821224342382472, "grad_norm": 1.220012903213501, "learning_rate": 1.7553967855217235e-07, "loss": 0.9078, "step": 4079 }, { "epoch": 0.9823632095347017, "grad_norm": 0.8602136373519897, "learning_rate": 1.7095191251726982e-07, "loss": 0.0683, "step": 4080 }, { "epoch": 0.9826039848311563, "grad_norm": 2.155679941177368, "learning_rate": 1.6642484276852e-07, "loss": 0.7771, "step": 4081 }, { "epoch": 0.9828447601276109, "grad_norm": 2.459348440170288, "learning_rate": 1.6195847205838777e-07, "loss": 1.0204, "step": 4082 }, { "epoch": 0.9830855354240655, "grad_norm": 1.9554654359817505, "learning_rate": 1.5755280310244536e-07, "loss": 0.4039, "step": 4083 }, { "epoch": 0.9833263107205201, "grad_norm": 1.885136604309082, "learning_rate": 1.5320783857935005e-07, "loss": 0.4138, "step": 4084 }, { "epoch": 0.9835670860169746, "grad_norm": 1.192893385887146, "learning_rate": 1.4892358113084426e-07, "loss": 0.5029, "step": 4085 }, { "epoch": 0.9838078613134292, "grad_norm": 3.7182071208953857, "learning_rate": 1.447000333617665e-07, "loss": 1.1002, "step": 4086 }, { "epoch": 0.9840486366098838, "grad_norm": 1.4601658582687378, "learning_rate": 1.405371978400516e-07, "loss": 0.5054, "step": 4087 }, { "epoch": 0.9842894119063385, "grad_norm": 2.312633752822876, "learning_rate": 1.3643507709669713e-07, "loss": 0.4722, "step": 4088 }, { "epoch": 0.984530187202793, "grad_norm": 0.7593234181404114, "learning_rate": 1.3239367362581912e-07, "loss": 0.1804, "step": 4089 }, { "epoch": 0.9847709624992476, "grad_norm": 2.778722047805786, "learning_rate": 1.284129898845854e-07, "loss": 0.6931, "step": 4090 }, { "epoch": 0.9850117377957022, "grad_norm": 3.4330999851226807, "learning_rate": 1.2449302829327102e-07, "loss": 1.1589, "step": 4091 }, { "epoch": 0.9852525130921568, "grad_norm": 2.7237799167633057, "learning_rate": 1.20633791235214e-07, "loss": 0.58, "step": 4092 }, { "epoch": 0.9854932883886113, "grad_norm": 1.8853704929351807, "learning_rate": 1.1683528105684848e-07, "loss": 0.9436, "step": 4093 }, { "epoch": 0.9857340636850659, "grad_norm": 1.500649094581604, "learning_rate": 1.130975000676715e-07, "loss": 0.8359, "step": 4094 }, { "epoch": 0.9859748389815205, "grad_norm": 2.9082491397857666, "learning_rate": 1.0942045054025407e-07, "loss": 1.1474, "step": 4095 }, { "epoch": 0.9862156142779751, "grad_norm": 0.6749841570854187, "learning_rate": 1.058041347102634e-07, "loss": 0.5816, "step": 4096 }, { "epoch": 0.9864563895744297, "grad_norm": 1.8493642807006836, "learning_rate": 1.0224855477642959e-07, "loss": 1.1293, "step": 4097 }, { "epoch": 0.9866971648708842, "grad_norm": 0.32945817708969116, "learning_rate": 9.875371290053447e-08, "loss": 0.3363, "step": 4098 }, { "epoch": 0.9869379401673388, "grad_norm": 1.6765530109405518, "learning_rate": 9.531961120746724e-08, "loss": 0.892, "step": 4099 }, { "epoch": 0.9871787154637934, "grad_norm": 2.7227275371551514, "learning_rate": 9.19462517851688e-08, "loss": 0.8791, "step": 4100 }, { "epoch": 0.987419490760248, "grad_norm": 0.8839995265007019, "learning_rate": 8.863363668464297e-08, "loss": 0.5598, "step": 4101 }, { "epoch": 0.9876602660567025, "grad_norm": 1.8077160120010376, "learning_rate": 8.538176791996754e-08, "loss": 0.3711, "step": 4102 }, { "epoch": 0.9879010413531571, "grad_norm": 2.301443338394165, "learning_rate": 8.21906474682943e-08, "loss": 1.0025, "step": 4103 }, { "epoch": 0.9881418166496118, "grad_norm": 1.6749955415725708, "learning_rate": 7.906027726981568e-08, "loss": 1.0008, "step": 4104 }, { "epoch": 0.9883825919460664, "grad_norm": 2.291646957397461, "learning_rate": 7.599065922780924e-08, "loss": 0.4288, "step": 4105 }, { "epoch": 0.9886233672425209, "grad_norm": 3.5161843299865723, "learning_rate": 7.298179520862647e-08, "loss": 0.8569, "step": 4106 }, { "epoch": 0.9888641425389755, "grad_norm": 0.49535292387008667, "learning_rate": 7.003368704164847e-08, "loss": 0.3556, "step": 4107 }, { "epoch": 0.9891049178354301, "grad_norm": 2.1845948696136475, "learning_rate": 6.714633651931923e-08, "loss": 0.4408, "step": 4108 }, { "epoch": 0.9893456931318847, "grad_norm": 2.3166656494140625, "learning_rate": 6.431974539717888e-08, "loss": 0.4087, "step": 4109 }, { "epoch": 0.9895864684283393, "grad_norm": 0.6833885312080383, "learning_rate": 6.155391539379718e-08, "loss": 0.362, "step": 4110 }, { "epoch": 0.9898272437247938, "grad_norm": 1.6554492712020874, "learning_rate": 5.884884819079561e-08, "loss": 0.4277, "step": 4111 }, { "epoch": 0.9900680190212484, "grad_norm": 1.1396666765213013, "learning_rate": 5.620454543285858e-08, "loss": 0.5649, "step": 4112 }, { "epoch": 0.990308794317703, "grad_norm": 2.3340067863464355, "learning_rate": 5.362100872773334e-08, "loss": 0.5773, "step": 4113 }, { "epoch": 0.9905495696141576, "grad_norm": 2.0202200412750244, "learning_rate": 5.109823964621896e-08, "loss": 0.5604, "step": 4114 }, { "epoch": 0.9907903449106121, "grad_norm": 1.5799890756607056, "learning_rate": 4.863623972216624e-08, "loss": 0.5799, "step": 4115 }, { "epoch": 0.9910311202070667, "grad_norm": 3.0115244388580322, "learning_rate": 4.62350104524778e-08, "loss": 0.631, "step": 4116 }, { "epoch": 0.9912718955035213, "grad_norm": 1.998792290687561, "learning_rate": 4.3894553297085805e-08, "loss": 0.8206, "step": 4117 }, { "epoch": 0.991512670799976, "grad_norm": 1.306921124458313, "learning_rate": 4.161486967901862e-08, "loss": 0.6584, "step": 4118 }, { "epoch": 0.9917534460964305, "grad_norm": 2.2372684478759766, "learning_rate": 3.9395960984323076e-08, "loss": 0.4433, "step": 4119 }, { "epoch": 0.9919942213928851, "grad_norm": 1.8838356733322144, "learning_rate": 3.723782856208669e-08, "loss": 0.7484, "step": 4120 }, { "epoch": 0.9922349966893397, "grad_norm": 0.9679247140884399, "learning_rate": 3.5140473724482034e-08, "loss": 0.1638, "step": 4121 }, { "epoch": 0.9924757719857943, "grad_norm": 1.0013998746871948, "learning_rate": 3.3103897746689097e-08, "loss": 0.2034, "step": 4122 }, { "epoch": 0.9927165472822489, "grad_norm": 1.7386034727096558, "learning_rate": 3.11281018669507e-08, "loss": 0.6798, "step": 4123 }, { "epoch": 0.9929573225787034, "grad_norm": 1.7022814750671387, "learning_rate": 2.921308728656147e-08, "loss": 0.9177, "step": 4124 }, { "epoch": 0.993198097875158, "grad_norm": 1.6356173753738403, "learning_rate": 2.7358855169845598e-08, "loss": 0.397, "step": 4125 }, { "epoch": 0.9934388731716126, "grad_norm": 2.2541160583496094, "learning_rate": 2.556540664419016e-08, "loss": 0.3977, "step": 4126 }, { "epoch": 0.9936796484680672, "grad_norm": 4.046707630157471, "learning_rate": 2.38327428000118e-08, "loss": 0.4311, "step": 4127 }, { "epoch": 0.9939204237645217, "grad_norm": 3.0581321716308594, "learning_rate": 2.216086469077894e-08, "loss": 1.07, "step": 4128 }, { "epoch": 0.9941611990609763, "grad_norm": 1.8278604745864868, "learning_rate": 2.0549773332989575e-08, "loss": 0.2437, "step": 4129 }, { "epoch": 0.9944019743574309, "grad_norm": 1.3549528121948242, "learning_rate": 1.8999469706193484e-08, "loss": 0.3801, "step": 4130 }, { "epoch": 0.9946427496538855, "grad_norm": 2.3834619522094727, "learning_rate": 1.750995475299222e-08, "loss": 0.6792, "step": 4131 }, { "epoch": 0.99488352495034, "grad_norm": 2.10779070854187, "learning_rate": 1.6081229378983598e-08, "loss": 0.4779, "step": 4132 }, { "epoch": 0.9951243002467947, "grad_norm": 3.692774534225464, "learning_rate": 1.4713294452861626e-08, "loss": 0.9011, "step": 4133 }, { "epoch": 0.9953650755432493, "grad_norm": 1.1384838819503784, "learning_rate": 1.3406150806327678e-08, "loss": 0.4314, "step": 4134 }, { "epoch": 0.9956058508397039, "grad_norm": 1.2060233354568481, "learning_rate": 1.2159799234134905e-08, "loss": 0.4898, "step": 4135 }, { "epoch": 0.9958466261361584, "grad_norm": 2.4775209426879883, "learning_rate": 1.097424049404383e-08, "loss": 0.4144, "step": 4136 }, { "epoch": 0.996087401432613, "grad_norm": 6.3312177658081055, "learning_rate": 9.849475306900058e-09, "loss": 0.9015, "step": 4137 }, { "epoch": 0.9963281767290676, "grad_norm": 1.8564362525939941, "learning_rate": 8.785504356556563e-09, "loss": 0.3928, "step": 4138 }, { "epoch": 0.9965689520255222, "grad_norm": 1.6765766143798828, "learning_rate": 7.782328289906992e-09, "loss": 1.0499, "step": 4139 }, { "epoch": 0.9968097273219768, "grad_norm": 2.188523054122925, "learning_rate": 6.839947716885675e-09, "loss": 0.9049, "step": 4140 }, { "epoch": 0.9970505026184313, "grad_norm": 0.7010088562965393, "learning_rate": 5.95836321046761e-09, "loss": 0.4464, "step": 4141 }, { "epoch": 0.9972912779148859, "grad_norm": 1.611911416053772, "learning_rate": 5.137575306646269e-09, "loss": 0.3036, "step": 4142 }, { "epoch": 0.9975320532113405, "grad_norm": 2.555997371673584, "learning_rate": 4.377584504478005e-09, "loss": 0.4531, "step": 4143 }, { "epoch": 0.9977728285077951, "grad_norm": 1.4258787631988525, "learning_rate": 3.6783912660265372e-09, "loss": 0.4331, "step": 4144 }, { "epoch": 0.9980136038042496, "grad_norm": 1.0815415382385254, "learning_rate": 3.039996016407365e-09, "loss": 0.3503, "step": 4145 }, { "epoch": 0.9982543791007042, "grad_norm": 4.800786972045898, "learning_rate": 2.4623991437766614e-09, "loss": 0.6749, "step": 4146 }, { "epoch": 0.9984951543971589, "grad_norm": 1.161253809928894, "learning_rate": 1.9456009992979696e-09, "loss": 0.5973, "step": 4147 }, { "epoch": 0.9987359296936135, "grad_norm": 3.008234739303589, "learning_rate": 1.48960189718661e-09, "loss": 0.9873, "step": 4148 }, { "epoch": 0.998976704990068, "grad_norm": 1.616468071937561, "learning_rate": 1.0944021146985784e-09, "loss": 0.5877, "step": 4149 }, { "epoch": 0.9992174802865226, "grad_norm": 1.8020235300064087, "learning_rate": 7.60001892119444e-10, "loss": 0.6324, "step": 4150 }, { "epoch": 0.9994582555829772, "grad_norm": 2.7583658695220947, "learning_rate": 4.864014327532474e-10, "loss": 0.8355, "step": 4151 }, { "epoch": 0.9996990308794318, "grad_norm": 1.8322501182556152, "learning_rate": 2.7360090296690846e-10, "loss": 0.706, "step": 4152 }, { "epoch": 0.9999398061758864, "grad_norm": 1.8930530548095703, "learning_rate": 1.2160043212361417e-10, "loss": 0.6687, "step": 4153 }, { "epoch": 1.0, "grad_norm": 5.636228084564209, "learning_rate": 3.0400112649431325e-11, "loss": 0.9409, "step": 4154 }, { "epoch": 1.0, "step": 4154, "total_flos": 7.629858860247867e+17, "train_loss": 0.8297393302427364, "train_runtime": 10162.4154, "train_samples_per_second": 3.269, "train_steps_per_second": 0.409 } ], "logging_steps": 1, "max_steps": 4154, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 2400000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 7.629858860247867e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }