diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,29121 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 4154, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00024077529645458377, + "grad_norm": 26.920116424560547, + "learning_rate": 0.0, + "loss": 4.5785, + "step": 1 + }, + { + "epoch": 0.00048155059290916753, + "grad_norm": 8.383347511291504, + "learning_rate": 1.6000000000000001e-06, + "loss": 5.9464, + "step": 2 + }, + { + "epoch": 0.0007223258893637513, + "grad_norm": 16.050535202026367, + "learning_rate": 3.2000000000000003e-06, + "loss": 7.4406, + "step": 3 + }, + { + "epoch": 0.0009631011858183351, + "grad_norm": 20.88637351989746, + "learning_rate": 4.800000000000001e-06, + "loss": 9.6416, + "step": 4 + }, + { + "epoch": 0.0012038764822729189, + "grad_norm": 10.887105941772461, + "learning_rate": 6.4000000000000006e-06, + "loss": 4.9843, + "step": 5 + }, + { + "epoch": 0.0014446517787275025, + "grad_norm": 12.895423889160156, + "learning_rate": 8.000000000000001e-06, + "loss": 4.7898, + "step": 6 + }, + { + "epoch": 0.0016854270751820862, + "grad_norm": 11.34997844696045, + "learning_rate": 9.600000000000001e-06, + "loss": 6.4794, + "step": 7 + }, + { + "epoch": 0.0019262023716366701, + "grad_norm": 11.045844078063965, + "learning_rate": 1.1200000000000001e-05, + "loss": 5.5106, + "step": 8 + }, + { + "epoch": 0.002166977668091254, + "grad_norm": 6.337793827056885, + "learning_rate": 1.2800000000000001e-05, + "loss": 4.1446, + "step": 9 + }, + { + "epoch": 0.0024077529645458377, + "grad_norm": 16.309860229492188, + "learning_rate": 1.44e-05, + "loss": 6.5782, + "step": 10 + }, + { + "epoch": 0.002648528261000421, + "grad_norm": 18.269319534301758, + "learning_rate": 1.6000000000000003e-05, + "loss": 7.8044, + "step": 11 + }, + { + "epoch": 0.002889303557455005, + "grad_norm": 3.6918132305145264, + "learning_rate": 1.76e-05, + "loss": 2.8253, + "step": 12 + }, + { + "epoch": 0.003130078853909589, + "grad_norm": 13.319107055664062, + "learning_rate": 1.9200000000000003e-05, + "loss": 5.9873, + "step": 13 + }, + { + "epoch": 0.0033708541503641725, + "grad_norm": 16.075435638427734, + "learning_rate": 2.08e-05, + "loss": 5.123, + "step": 14 + }, + { + "epoch": 0.0036116294468187564, + "grad_norm": 13.996861457824707, + "learning_rate": 2.2400000000000002e-05, + "loss": 6.5848, + "step": 15 + }, + { + "epoch": 0.0038524047432733403, + "grad_norm": 23.012784957885742, + "learning_rate": 2.4e-05, + "loss": 9.885, + "step": 16 + }, + { + "epoch": 0.004093180039727924, + "grad_norm": 8.31369686126709, + "learning_rate": 2.5600000000000002e-05, + "loss": 6.0589, + "step": 17 + }, + { + "epoch": 0.004333955336182508, + "grad_norm": 20.620271682739258, + "learning_rate": 2.7200000000000004e-05, + "loss": 5.3127, + "step": 18 + }, + { + "epoch": 0.004574730632637091, + "grad_norm": 13.044432640075684, + "learning_rate": 2.88e-05, + "loss": 5.1087, + "step": 19 + }, + { + "epoch": 0.004815505929091675, + "grad_norm": 31.605579376220703, + "learning_rate": 3.04e-05, + "loss": 6.1106, + "step": 20 + }, + { + "epoch": 0.005056281225546259, + "grad_norm": 8.28500747680664, + "learning_rate": 3.2000000000000005e-05, + "loss": 4.5685, + "step": 21 + }, + { + "epoch": 0.005297056522000842, + "grad_norm": 14.513694763183594, + "learning_rate": 3.3600000000000004e-05, + "loss": 7.3525, + "step": 22 + }, + { + "epoch": 0.005537831818455427, + "grad_norm": 10.957548141479492, + "learning_rate": 3.52e-05, + "loss": 6.184, + "step": 23 + }, + { + "epoch": 0.00577860711491001, + "grad_norm": 6.602078914642334, + "learning_rate": 3.68e-05, + "loss": 3.2581, + "step": 24 + }, + { + "epoch": 0.006019382411364594, + "grad_norm": 10.11325454711914, + "learning_rate": 3.8400000000000005e-05, + "loss": 7.9033, + "step": 25 + }, + { + "epoch": 0.006260157707819178, + "grad_norm": 10.033835411071777, + "learning_rate": 4e-05, + "loss": 4.9469, + "step": 26 + }, + { + "epoch": 0.0065009330042737614, + "grad_norm": 10.819141387939453, + "learning_rate": 4.16e-05, + "loss": 7.46, + "step": 27 + }, + { + "epoch": 0.006741708300728345, + "grad_norm": 25.810640335083008, + "learning_rate": 4.32e-05, + "loss": 5.7763, + "step": 28 + }, + { + "epoch": 0.006982483597182929, + "grad_norm": 10.680785179138184, + "learning_rate": 4.4800000000000005e-05, + "loss": 5.9753, + "step": 29 + }, + { + "epoch": 0.007223258893637513, + "grad_norm": 14.47507381439209, + "learning_rate": 4.64e-05, + "loss": 5.2033, + "step": 30 + }, + { + "epoch": 0.007464034190092096, + "grad_norm": 17.440105438232422, + "learning_rate": 4.8e-05, + "loss": 5.6511, + "step": 31 + }, + { + "epoch": 0.0077048094865466805, + "grad_norm": 10.847347259521484, + "learning_rate": 4.96e-05, + "loss": 6.7464, + "step": 32 + }, + { + "epoch": 0.007945584783001265, + "grad_norm": 9.08476734161377, + "learning_rate": 5.1200000000000004e-05, + "loss": 5.9733, + "step": 33 + }, + { + "epoch": 0.008186360079455848, + "grad_norm": 6.558286190032959, + "learning_rate": 5.28e-05, + "loss": 4.6828, + "step": 34 + }, + { + "epoch": 0.008427135375910432, + "grad_norm": 33.23648452758789, + "learning_rate": 5.440000000000001e-05, + "loss": 5.5094, + "step": 35 + }, + { + "epoch": 0.008667910672365015, + "grad_norm": 8.349298477172852, + "learning_rate": 5.6000000000000006e-05, + "loss": 4.1564, + "step": 36 + }, + { + "epoch": 0.008908685968819599, + "grad_norm": 42.14231491088867, + "learning_rate": 5.76e-05, + "loss": 4.5537, + "step": 37 + }, + { + "epoch": 0.009149461265274182, + "grad_norm": 14.191291809082031, + "learning_rate": 5.92e-05, + "loss": 6.6075, + "step": 38 + }, + { + "epoch": 0.009390236561728767, + "grad_norm": 73.26921844482422, + "learning_rate": 6.08e-05, + "loss": 6.399, + "step": 39 + }, + { + "epoch": 0.00963101185818335, + "grad_norm": 14.339468002319336, + "learning_rate": 6.24e-05, + "loss": 5.774, + "step": 40 + }, + { + "epoch": 0.009871787154637934, + "grad_norm": 15.463168144226074, + "learning_rate": 6.400000000000001e-05, + "loss": 5.1967, + "step": 41 + }, + { + "epoch": 0.010112562451092518, + "grad_norm": 49.4256477355957, + "learning_rate": 6.560000000000001e-05, + "loss": 3.4196, + "step": 42 + }, + { + "epoch": 0.010353337747547101, + "grad_norm": 28.241819381713867, + "learning_rate": 6.720000000000001e-05, + "loss": 5.9708, + "step": 43 + }, + { + "epoch": 0.010594113044001685, + "grad_norm": 7.766085624694824, + "learning_rate": 6.879999999999999e-05, + "loss": 5.5671, + "step": 44 + }, + { + "epoch": 0.01083488834045627, + "grad_norm": 6.781948566436768, + "learning_rate": 7.04e-05, + "loss": 3.8236, + "step": 45 + }, + { + "epoch": 0.011075663636910853, + "grad_norm": 22.539283752441406, + "learning_rate": 7.2e-05, + "loss": 5.7164, + "step": 46 + }, + { + "epoch": 0.011316438933365437, + "grad_norm": 138.10426330566406, + "learning_rate": 7.36e-05, + "loss": 4.4738, + "step": 47 + }, + { + "epoch": 0.01155721422982002, + "grad_norm": 6.446707725524902, + "learning_rate": 7.52e-05, + "loss": 3.4618, + "step": 48 + }, + { + "epoch": 0.011797989526274604, + "grad_norm": 11.496111869812012, + "learning_rate": 7.680000000000001e-05, + "loss": 7.4023, + "step": 49 + }, + { + "epoch": 0.012038764822729187, + "grad_norm": 12.171050071716309, + "learning_rate": 7.840000000000001e-05, + "loss": 5.3548, + "step": 50 + }, + { + "epoch": 0.012279540119183772, + "grad_norm": 6.225719928741455, + "learning_rate": 8e-05, + "loss": 3.7378, + "step": 51 + }, + { + "epoch": 0.012520315415638356, + "grad_norm": 30.29821014404297, + "learning_rate": 8.16e-05, + "loss": 4.3953, + "step": 52 + }, + { + "epoch": 0.01276109071209294, + "grad_norm": 8.839107513427734, + "learning_rate": 8.32e-05, + "loss": 6.1286, + "step": 53 + }, + { + "epoch": 0.013001866008547523, + "grad_norm": 15.737375259399414, + "learning_rate": 8.48e-05, + "loss": 4.3585, + "step": 54 + }, + { + "epoch": 0.013242641305002106, + "grad_norm": 13.612770080566406, + "learning_rate": 8.64e-05, + "loss": 5.2315, + "step": 55 + }, + { + "epoch": 0.01348341660145669, + "grad_norm": 20.0008544921875, + "learning_rate": 8.800000000000001e-05, + "loss": 5.8228, + "step": 56 + }, + { + "epoch": 0.013724191897911275, + "grad_norm": 5.86611795425415, + "learning_rate": 8.960000000000001e-05, + "loss": 4.7417, + "step": 57 + }, + { + "epoch": 0.013964967194365858, + "grad_norm": 11.262532234191895, + "learning_rate": 9.120000000000001e-05, + "loss": 4.7822, + "step": 58 + }, + { + "epoch": 0.014205742490820442, + "grad_norm": 12.375737190246582, + "learning_rate": 9.28e-05, + "loss": 5.4579, + "step": 59 + }, + { + "epoch": 0.014446517787275025, + "grad_norm": 10.653188705444336, + "learning_rate": 9.44e-05, + "loss": 6.4277, + "step": 60 + }, + { + "epoch": 0.014687293083729609, + "grad_norm": 39.603145599365234, + "learning_rate": 9.6e-05, + "loss": 6.2061, + "step": 61 + }, + { + "epoch": 0.014928068380184192, + "grad_norm": 16.928340911865234, + "learning_rate": 9.76e-05, + "loss": 7.8526, + "step": 62 + }, + { + "epoch": 0.015168843676638778, + "grad_norm": 13.248088836669922, + "learning_rate": 9.92e-05, + "loss": 3.6139, + "step": 63 + }, + { + "epoch": 0.015409618973093361, + "grad_norm": 4.931760311126709, + "learning_rate": 0.00010080000000000001, + "loss": 3.006, + "step": 64 + }, + { + "epoch": 0.015650394269547944, + "grad_norm": 11.34759521484375, + "learning_rate": 0.00010240000000000001, + "loss": 6.164, + "step": 65 + }, + { + "epoch": 0.01589116956600253, + "grad_norm": 9.345470428466797, + "learning_rate": 0.00010400000000000001, + "loss": 4.9446, + "step": 66 + }, + { + "epoch": 0.01613194486245711, + "grad_norm": 10.860252380371094, + "learning_rate": 0.0001056, + "loss": 3.235, + "step": 67 + }, + { + "epoch": 0.016372720158911697, + "grad_norm": 22.121963500976562, + "learning_rate": 0.00010720000000000002, + "loss": 6.3519, + "step": 68 + }, + { + "epoch": 0.01661349545536628, + "grad_norm": 23.994407653808594, + "learning_rate": 0.00010880000000000002, + "loss": 5.4503, + "step": 69 + }, + { + "epoch": 0.016854270751820864, + "grad_norm": 5.721750736236572, + "learning_rate": 0.00011040000000000001, + "loss": 3.448, + "step": 70 + }, + { + "epoch": 0.017095046048275445, + "grad_norm": 46.67560577392578, + "learning_rate": 0.00011200000000000001, + "loss": 3.8795, + "step": 71 + }, + { + "epoch": 0.01733582134473003, + "grad_norm": 11.732275009155273, + "learning_rate": 0.0001136, + "loss": 3.7436, + "step": 72 + }, + { + "epoch": 0.017576596641184616, + "grad_norm": 19.560314178466797, + "learning_rate": 0.0001152, + "loss": 5.2577, + "step": 73 + }, + { + "epoch": 0.017817371937639197, + "grad_norm": 47.164306640625, + "learning_rate": 0.00011679999999999999, + "loss": 5.2273, + "step": 74 + }, + { + "epoch": 0.018058147234093783, + "grad_norm": 25.40642738342285, + "learning_rate": 0.0001184, + "loss": 5.912, + "step": 75 + }, + { + "epoch": 0.018298922530548364, + "grad_norm": 10.063149452209473, + "learning_rate": 0.00012, + "loss": 5.6455, + "step": 76 + }, + { + "epoch": 0.01853969782700295, + "grad_norm": 15.481316566467285, + "learning_rate": 0.0001216, + "loss": 3.1296, + "step": 77 + }, + { + "epoch": 0.018780473123457535, + "grad_norm": 10.528315544128418, + "learning_rate": 0.0001232, + "loss": 4.7821, + "step": 78 + }, + { + "epoch": 0.019021248419912117, + "grad_norm": 17.255693435668945, + "learning_rate": 0.0001248, + "loss": 3.7784, + "step": 79 + }, + { + "epoch": 0.0192620237163667, + "grad_norm": 26.67706298828125, + "learning_rate": 0.0001264, + "loss": 6.29, + "step": 80 + }, + { + "epoch": 0.019502799012821283, + "grad_norm": 10.499022483825684, + "learning_rate": 0.00012800000000000002, + "loss": 5.3771, + "step": 81 + }, + { + "epoch": 0.01974357430927587, + "grad_norm": 8.131221771240234, + "learning_rate": 0.0001296, + "loss": 4.0706, + "step": 82 + }, + { + "epoch": 0.01998434960573045, + "grad_norm": 45.455230712890625, + "learning_rate": 0.00013120000000000002, + "loss": 3.3879, + "step": 83 + }, + { + "epoch": 0.020225124902185036, + "grad_norm": 19.001127243041992, + "learning_rate": 0.0001328, + "loss": 7.6711, + "step": 84 + }, + { + "epoch": 0.02046590019863962, + "grad_norm": 6.148112773895264, + "learning_rate": 0.00013440000000000001, + "loss": 3.472, + "step": 85 + }, + { + "epoch": 0.020706675495094203, + "grad_norm": 11.255337715148926, + "learning_rate": 0.00013600000000000003, + "loss": 8.2827, + "step": 86 + }, + { + "epoch": 0.020947450791548788, + "grad_norm": 9.724173545837402, + "learning_rate": 0.00013759999999999998, + "loss": 4.0604, + "step": 87 + }, + { + "epoch": 0.02118822608800337, + "grad_norm": 58.57503128051758, + "learning_rate": 0.0001392, + "loss": 3.8051, + "step": 88 + }, + { + "epoch": 0.021429001384457955, + "grad_norm": 19.386682510375977, + "learning_rate": 0.0001408, + "loss": 7.015, + "step": 89 + }, + { + "epoch": 0.02166977668091254, + "grad_norm": 7.111973762512207, + "learning_rate": 0.0001424, + "loss": 2.9986, + "step": 90 + }, + { + "epoch": 0.02191055197736712, + "grad_norm": 10.568584442138672, + "learning_rate": 0.000144, + "loss": 4.6358, + "step": 91 + }, + { + "epoch": 0.022151327273821707, + "grad_norm": 10.066975593566895, + "learning_rate": 0.00014560000000000002, + "loss": 4.6346, + "step": 92 + }, + { + "epoch": 0.02239210257027629, + "grad_norm": 25.012971878051758, + "learning_rate": 0.0001472, + "loss": 5.2808, + "step": 93 + }, + { + "epoch": 0.022632877866730874, + "grad_norm": 8.683295249938965, + "learning_rate": 0.0001488, + "loss": 5.2811, + "step": 94 + }, + { + "epoch": 0.022873653163185455, + "grad_norm": 5.454954624176025, + "learning_rate": 0.0001504, + "loss": 3.4226, + "step": 95 + }, + { + "epoch": 0.02311442845964004, + "grad_norm": 5.077779293060303, + "learning_rate": 0.000152, + "loss": 2.7948, + "step": 96 + }, + { + "epoch": 0.023355203756094626, + "grad_norm": 33.022857666015625, + "learning_rate": 0.00015360000000000002, + "loss": 7.5081, + "step": 97 + }, + { + "epoch": 0.023595979052549208, + "grad_norm": 15.922677040100098, + "learning_rate": 0.0001552, + "loss": 5.6511, + "step": 98 + }, + { + "epoch": 0.023836754349003793, + "grad_norm": 7.067165851593018, + "learning_rate": 0.00015680000000000002, + "loss": 3.4348, + "step": 99 + }, + { + "epoch": 0.024077529645458375, + "grad_norm": 23.432310104370117, + "learning_rate": 0.00015840000000000003, + "loss": 4.1903, + "step": 100 + }, + { + "epoch": 0.02431830494191296, + "grad_norm": 6.100611686706543, + "learning_rate": 0.00016, + "loss": 4.3271, + "step": 101 + }, + { + "epoch": 0.024559080238367545, + "grad_norm": 32.087772369384766, + "learning_rate": 0.00016160000000000002, + "loss": 3.9315, + "step": 102 + }, + { + "epoch": 0.024799855534822127, + "grad_norm": 6.975442886352539, + "learning_rate": 0.0001632, + "loss": 4.5229, + "step": 103 + }, + { + "epoch": 0.025040630831276712, + "grad_norm": 15.898163795471191, + "learning_rate": 0.0001648, + "loss": 4.6573, + "step": 104 + }, + { + "epoch": 0.025281406127731294, + "grad_norm": 22.853700637817383, + "learning_rate": 0.0001664, + "loss": 5.0038, + "step": 105 + }, + { + "epoch": 0.02552218142418588, + "grad_norm": 11.595605850219727, + "learning_rate": 0.000168, + "loss": 2.2838, + "step": 106 + }, + { + "epoch": 0.025762956720640464, + "grad_norm": 32.837886810302734, + "learning_rate": 0.0001696, + "loss": 3.5895, + "step": 107 + }, + { + "epoch": 0.026003732017095046, + "grad_norm": 46.952144622802734, + "learning_rate": 0.00017120000000000001, + "loss": 2.7184, + "step": 108 + }, + { + "epoch": 0.02624450731354963, + "grad_norm": 10.757477760314941, + "learning_rate": 0.0001728, + "loss": 5.1609, + "step": 109 + }, + { + "epoch": 0.026485282610004213, + "grad_norm": 7.686053276062012, + "learning_rate": 0.0001744, + "loss": 4.8981, + "step": 110 + }, + { + "epoch": 0.026726057906458798, + "grad_norm": 9.239624977111816, + "learning_rate": 0.00017600000000000002, + "loss": 4.208, + "step": 111 + }, + { + "epoch": 0.02696683320291338, + "grad_norm": 12.09170150756836, + "learning_rate": 0.0001776, + "loss": 4.2583, + "step": 112 + }, + { + "epoch": 0.027207608499367965, + "grad_norm": 9.89987850189209, + "learning_rate": 0.00017920000000000002, + "loss": 4.0053, + "step": 113 + }, + { + "epoch": 0.02744838379582255, + "grad_norm": 10.28636360168457, + "learning_rate": 0.0001808, + "loss": 5.7005, + "step": 114 + }, + { + "epoch": 0.027689159092277132, + "grad_norm": 4.695734024047852, + "learning_rate": 0.00018240000000000002, + "loss": 3.4277, + "step": 115 + }, + { + "epoch": 0.027929934388731717, + "grad_norm": 105.01580810546875, + "learning_rate": 0.00018400000000000003, + "loss": 3.761, + "step": 116 + }, + { + "epoch": 0.0281707096851863, + "grad_norm": 9.191910743713379, + "learning_rate": 0.0001856, + "loss": 4.2246, + "step": 117 + }, + { + "epoch": 0.028411484981640884, + "grad_norm": 26.174537658691406, + "learning_rate": 0.00018720000000000002, + "loss": 2.9553, + "step": 118 + }, + { + "epoch": 0.02865226027809547, + "grad_norm": 9.34518814086914, + "learning_rate": 0.0001888, + "loss": 3.3358, + "step": 119 + }, + { + "epoch": 0.02889303557455005, + "grad_norm": 9.987439155578613, + "learning_rate": 0.0001904, + "loss": 3.6771, + "step": 120 + }, + { + "epoch": 0.029133810871004636, + "grad_norm": 7.954049587249756, + "learning_rate": 0.000192, + "loss": 3.2811, + "step": 121 + }, + { + "epoch": 0.029374586167459218, + "grad_norm": 8.947925567626953, + "learning_rate": 0.00019360000000000002, + "loss": 2.854, + "step": 122 + }, + { + "epoch": 0.029615361463913803, + "grad_norm": 31.957183837890625, + "learning_rate": 0.0001952, + "loss": 7.1923, + "step": 123 + }, + { + "epoch": 0.029856136760368385, + "grad_norm": 10.06078815460205, + "learning_rate": 0.0001968, + "loss": 3.0629, + "step": 124 + }, + { + "epoch": 0.03009691205682297, + "grad_norm": 9.508298873901367, + "learning_rate": 0.0001984, + "loss": 4.4433, + "step": 125 + }, + { + "epoch": 0.030337687353277555, + "grad_norm": 49.658111572265625, + "learning_rate": 0.0002, + "loss": 4.6153, + "step": 126 + }, + { + "epoch": 0.030578462649732137, + "grad_norm": 21.386220932006836, + "learning_rate": 0.00019999996959988735, + "loss": 5.5672, + "step": 127 + }, + { + "epoch": 0.030819237946186722, + "grad_norm": 122.65118408203125, + "learning_rate": 0.0001999998783995679, + "loss": 3.5313, + "step": 128 + }, + { + "epoch": 0.031060013242641304, + "grad_norm": 13.517218589782715, + "learning_rate": 0.00019999972639909706, + "loss": 4.7874, + "step": 129 + }, + { + "epoch": 0.03130078853909589, + "grad_norm": 18.364986419677734, + "learning_rate": 0.00019999951359856726, + "loss": 6.3622, + "step": 130 + }, + { + "epoch": 0.03154156383555047, + "grad_norm": 10.25970458984375, + "learning_rate": 0.0001999992399981079, + "loss": 3.7715, + "step": 131 + }, + { + "epoch": 0.03178233913200506, + "grad_norm": 15.492377281188965, + "learning_rate": 0.0001999989055978853, + "loss": 3.5824, + "step": 132 + }, + { + "epoch": 0.03202311442845964, + "grad_norm": 28.90912437438965, + "learning_rate": 0.00019999851039810283, + "loss": 4.791, + "step": 133 + }, + { + "epoch": 0.03226388972491422, + "grad_norm": 9.603219032287598, + "learning_rate": 0.00019999805439900072, + "loss": 3.1532, + "step": 134 + }, + { + "epoch": 0.032504665021368805, + "grad_norm": 7.891742706298828, + "learning_rate": 0.0001999975376008562, + "loss": 2.59, + "step": 135 + }, + { + "epoch": 0.03274544031782339, + "grad_norm": 14.559179306030273, + "learning_rate": 0.0001999969600039836, + "loss": 3.9376, + "step": 136 + }, + { + "epoch": 0.032986215614277975, + "grad_norm": 17.962955474853516, + "learning_rate": 0.00019999632160873398, + "loss": 3.7606, + "step": 137 + }, + { + "epoch": 0.03322699091073256, + "grad_norm": 13.648564338684082, + "learning_rate": 0.0001999956224154955, + "loss": 4.019, + "step": 138 + }, + { + "epoch": 0.033467766207187145, + "grad_norm": 14.759313583374023, + "learning_rate": 0.00019999486242469337, + "loss": 3.5558, + "step": 139 + }, + { + "epoch": 0.03370854150364173, + "grad_norm": 15.668071746826172, + "learning_rate": 0.00019999404163678955, + "loss": 3.7936, + "step": 140 + }, + { + "epoch": 0.03394931680009631, + "grad_norm": 17.56260108947754, + "learning_rate": 0.00019999316005228312, + "loss": 2.4151, + "step": 141 + }, + { + "epoch": 0.03419009209655089, + "grad_norm": 5.186138153076172, + "learning_rate": 0.0001999922176717101, + "loss": 2.7492, + "step": 142 + }, + { + "epoch": 0.03443086739300548, + "grad_norm": 12.366766929626465, + "learning_rate": 0.00019999121449564347, + "loss": 3.1902, + "step": 143 + }, + { + "epoch": 0.03467164268946006, + "grad_norm": 16.707490921020508, + "learning_rate": 0.0001999901505246931, + "loss": 2.3057, + "step": 144 + }, + { + "epoch": 0.03491241798591464, + "grad_norm": 9.578150749206543, + "learning_rate": 0.00019998902575950596, + "loss": 4.3383, + "step": 145 + }, + { + "epoch": 0.03515319328236923, + "grad_norm": 19.261411666870117, + "learning_rate": 0.0001999878402007659, + "loss": 5.0221, + "step": 146 + }, + { + "epoch": 0.03539396857882381, + "grad_norm": 8.17841911315918, + "learning_rate": 0.0001999865938491937, + "loss": 2.2664, + "step": 147 + }, + { + "epoch": 0.035634743875278395, + "grad_norm": 109.94926452636719, + "learning_rate": 0.00019998528670554715, + "loss": 6.5844, + "step": 148 + }, + { + "epoch": 0.035875519171732984, + "grad_norm": 8.02511215209961, + "learning_rate": 0.00019998391877062104, + "loss": 3.7571, + "step": 149 + }, + { + "epoch": 0.036116294468187565, + "grad_norm": 8.986191749572754, + "learning_rate": 0.00019998249004524703, + "loss": 5.3496, + "step": 150 + }, + { + "epoch": 0.03635706976464215, + "grad_norm": 5.070540904998779, + "learning_rate": 0.0001999810005302938, + "loss": 2.581, + "step": 151 + }, + { + "epoch": 0.03659784506109673, + "grad_norm": 10.687249183654785, + "learning_rate": 0.00019997945022666701, + "loss": 3.4334, + "step": 152 + }, + { + "epoch": 0.03683862035755132, + "grad_norm": 7.046168327331543, + "learning_rate": 0.00019997783913530923, + "loss": 2.8572, + "step": 153 + }, + { + "epoch": 0.0370793956540059, + "grad_norm": 4.520480632781982, + "learning_rate": 0.0001999761672572, + "loss": 2.4164, + "step": 154 + }, + { + "epoch": 0.03732017095046048, + "grad_norm": 20.134994506835938, + "learning_rate": 0.0001999744345933558, + "loss": 5.1227, + "step": 155 + }, + { + "epoch": 0.03756094624691507, + "grad_norm": 8.454794883728027, + "learning_rate": 0.00019997264114483015, + "loss": 4.3214, + "step": 156 + }, + { + "epoch": 0.03780172154336965, + "grad_norm": 19.004796981811523, + "learning_rate": 0.00019997078691271348, + "loss": 2.8001, + "step": 157 + }, + { + "epoch": 0.03804249683982423, + "grad_norm": 8.622836112976074, + "learning_rate": 0.00019996887189813306, + "loss": 2.6805, + "step": 158 + }, + { + "epoch": 0.038283272136278815, + "grad_norm": 11.92911434173584, + "learning_rate": 0.00019996689610225332, + "loss": 3.4712, + "step": 159 + }, + { + "epoch": 0.0385240474327334, + "grad_norm": 8.337674140930176, + "learning_rate": 0.00019996485952627552, + "loss": 3.6351, + "step": 160 + }, + { + "epoch": 0.038764822729187985, + "grad_norm": 16.739974975585938, + "learning_rate": 0.00019996276217143792, + "loss": 2.8034, + "step": 161 + }, + { + "epoch": 0.03900559802564257, + "grad_norm": 19.424123764038086, + "learning_rate": 0.0001999606040390157, + "loss": 3.6437, + "step": 162 + }, + { + "epoch": 0.039246373322097156, + "grad_norm": 6.484769344329834, + "learning_rate": 0.000199958385130321, + "loss": 1.9908, + "step": 163 + }, + { + "epoch": 0.03948714861855174, + "grad_norm": 12.377532005310059, + "learning_rate": 0.0001999561054467029, + "loss": 4.8526, + "step": 164 + }, + { + "epoch": 0.03972792391500632, + "grad_norm": 13.827719688415527, + "learning_rate": 0.00019995376498954754, + "loss": 3.6073, + "step": 165 + }, + { + "epoch": 0.0399686992114609, + "grad_norm": 7.668979167938232, + "learning_rate": 0.00019995136376027786, + "loss": 2.496, + "step": 166 + }, + { + "epoch": 0.04020947450791549, + "grad_norm": 8.068209648132324, + "learning_rate": 0.00019994890176035378, + "loss": 4.0669, + "step": 167 + }, + { + "epoch": 0.04045024980437007, + "grad_norm": 11.890876770019531, + "learning_rate": 0.00019994637899127228, + "loss": 2.6487, + "step": 168 + }, + { + "epoch": 0.04069102510082465, + "grad_norm": 16.064224243164062, + "learning_rate": 0.00019994379545456713, + "loss": 2.9892, + "step": 169 + }, + { + "epoch": 0.04093180039727924, + "grad_norm": 7.469193458557129, + "learning_rate": 0.00019994115115180922, + "loss": 3.3422, + "step": 170 + }, + { + "epoch": 0.04117257569373382, + "grad_norm": 14.787521362304688, + "learning_rate": 0.00019993844608460622, + "loss": 3.911, + "step": 171 + }, + { + "epoch": 0.041413350990188405, + "grad_norm": 24.229990005493164, + "learning_rate": 0.00019993568025460283, + "loss": 3.3516, + "step": 172 + }, + { + "epoch": 0.041654126286642994, + "grad_norm": 4.197109222412109, + "learning_rate": 0.0001999328536634807, + "loss": 1.3666, + "step": 173 + }, + { + "epoch": 0.041894901583097575, + "grad_norm": 9.006143569946289, + "learning_rate": 0.00019992996631295836, + "loss": 4.234, + "step": 174 + }, + { + "epoch": 0.04213567687955216, + "grad_norm": 21.24369239807129, + "learning_rate": 0.00019992701820479138, + "loss": 3.2965, + "step": 175 + }, + { + "epoch": 0.04237645217600674, + "grad_norm": 21.48784828186035, + "learning_rate": 0.0001999240093407722, + "loss": 1.7589, + "step": 176 + }, + { + "epoch": 0.04261722747246133, + "grad_norm": 8.93320369720459, + "learning_rate": 0.00019992093972273018, + "loss": 1.9561, + "step": 177 + }, + { + "epoch": 0.04285800276891591, + "grad_norm": 12.301058769226074, + "learning_rate": 0.0001999178093525317, + "loss": 2.0668, + "step": 178 + }, + { + "epoch": 0.04309877806537049, + "grad_norm": 18.54864501953125, + "learning_rate": 0.00019991461823208004, + "loss": 3.1243, + "step": 179 + }, + { + "epoch": 0.04333955336182508, + "grad_norm": 14.172440528869629, + "learning_rate": 0.00019991136636331538, + "loss": 2.7406, + "step": 180 + }, + { + "epoch": 0.04358032865827966, + "grad_norm": 42.0859375, + "learning_rate": 0.00019990805374821483, + "loss": 1.4452, + "step": 181 + }, + { + "epoch": 0.04382110395473424, + "grad_norm": 10.7669677734375, + "learning_rate": 0.00019990468038879255, + "loss": 3.331, + "step": 182 + }, + { + "epoch": 0.044061879251188825, + "grad_norm": 11.51449966430664, + "learning_rate": 0.0001999012462870995, + "loss": 1.4512, + "step": 183 + }, + { + "epoch": 0.044302654547643414, + "grad_norm": 21.03165054321289, + "learning_rate": 0.00019989775144522358, + "loss": 3.0687, + "step": 184 + }, + { + "epoch": 0.044543429844097995, + "grad_norm": 11.455255508422852, + "learning_rate": 0.00019989419586528975, + "loss": 3.6598, + "step": 185 + }, + { + "epoch": 0.04478420514055258, + "grad_norm": 10.61294174194336, + "learning_rate": 0.00019989057954945976, + "loss": 2.4758, + "step": 186 + }, + { + "epoch": 0.045024980437007166, + "grad_norm": 96.17725372314453, + "learning_rate": 0.00019988690249993235, + "loss": 2.7045, + "step": 187 + }, + { + "epoch": 0.04526575573346175, + "grad_norm": 8.609686851501465, + "learning_rate": 0.00019988316471894314, + "loss": 1.7687, + "step": 188 + }, + { + "epoch": 0.04550653102991633, + "grad_norm": 7.83888053894043, + "learning_rate": 0.00019987936620876478, + "loss": 1.7098, + "step": 189 + }, + { + "epoch": 0.04574730632637091, + "grad_norm": 6.7235941886901855, + "learning_rate": 0.00019987550697170674, + "loss": 1.6275, + "step": 190 + }, + { + "epoch": 0.0459880816228255, + "grad_norm": 14.214694023132324, + "learning_rate": 0.0001998715870101154, + "loss": 4.1546, + "step": 191 + }, + { + "epoch": 0.04622885691928008, + "grad_norm": 3.766120433807373, + "learning_rate": 0.0001998676063263742, + "loss": 2.2139, + "step": 192 + }, + { + "epoch": 0.04646963221573466, + "grad_norm": 4.959268093109131, + "learning_rate": 0.0001998635649229033, + "loss": 1.4615, + "step": 193 + }, + { + "epoch": 0.04671040751218925, + "grad_norm": 6.699900150299072, + "learning_rate": 0.00019985946280215994, + "loss": 1.9309, + "step": 194 + }, + { + "epoch": 0.046951182808643833, + "grad_norm": 8.718276023864746, + "learning_rate": 0.00019985529996663823, + "loss": 2.1614, + "step": 195 + }, + { + "epoch": 0.047191958105098415, + "grad_norm": 13.810513496398926, + "learning_rate": 0.00019985107641886917, + "loss": 3.8401, + "step": 196 + }, + { + "epoch": 0.047432733401553004, + "grad_norm": 12.379217147827148, + "learning_rate": 0.00019984679216142066, + "loss": 1.5629, + "step": 197 + }, + { + "epoch": 0.047673508698007586, + "grad_norm": 10.015958786010742, + "learning_rate": 0.00019984244719689756, + "loss": 1.6573, + "step": 198 + }, + { + "epoch": 0.04791428399446217, + "grad_norm": 12.203784942626953, + "learning_rate": 0.00019983804152794163, + "loss": 1.9251, + "step": 199 + }, + { + "epoch": 0.04815505929091675, + "grad_norm": 8.036340713500977, + "learning_rate": 0.0001998335751572315, + "loss": 1.7192, + "step": 200 + }, + { + "epoch": 0.04839583458737134, + "grad_norm": 8.729804039001465, + "learning_rate": 0.00019982904808748275, + "loss": 0.9223, + "step": 201 + }, + { + "epoch": 0.04863660988382592, + "grad_norm": 32.94856262207031, + "learning_rate": 0.00019982446032144785, + "loss": 3.5147, + "step": 202 + }, + { + "epoch": 0.0488773851802805, + "grad_norm": 5.190202713012695, + "learning_rate": 0.00019981981186191616, + "loss": 1.0766, + "step": 203 + }, + { + "epoch": 0.04911816047673509, + "grad_norm": 15.163110733032227, + "learning_rate": 0.00019981510271171394, + "loss": 3.0481, + "step": 204 + }, + { + "epoch": 0.04935893577318967, + "grad_norm": 35.894718170166016, + "learning_rate": 0.00019981033287370443, + "loss": 3.3266, + "step": 205 + }, + { + "epoch": 0.04959971106964425, + "grad_norm": 12.832849502563477, + "learning_rate": 0.0001998055023507876, + "loss": 1.614, + "step": 206 + }, + { + "epoch": 0.049840486366098835, + "grad_norm": 12.771391868591309, + "learning_rate": 0.00019980061114590055, + "loss": 1.63, + "step": 207 + }, + { + "epoch": 0.050081261662553424, + "grad_norm": 20.120861053466797, + "learning_rate": 0.00019979565926201703, + "loss": 3.5633, + "step": 208 + }, + { + "epoch": 0.050322036959008005, + "grad_norm": 10.067777633666992, + "learning_rate": 0.00019979064670214782, + "loss": 1.7442, + "step": 209 + }, + { + "epoch": 0.05056281225546259, + "grad_norm": 5.05864143371582, + "learning_rate": 0.0001997855734693406, + "loss": 2.6813, + "step": 210 + }, + { + "epoch": 0.050803587551917176, + "grad_norm": 5.616927623748779, + "learning_rate": 0.0001997804395666799, + "loss": 1.4455, + "step": 211 + }, + { + "epoch": 0.05104436284837176, + "grad_norm": 18.011022567749023, + "learning_rate": 0.00019977524499728712, + "loss": 1.095, + "step": 212 + }, + { + "epoch": 0.05128513814482634, + "grad_norm": 4.923522472381592, + "learning_rate": 0.0001997699897643206, + "loss": 1.7786, + "step": 213 + }, + { + "epoch": 0.05152591344128093, + "grad_norm": 8.910199165344238, + "learning_rate": 0.00019976467387097552, + "loss": 2.6016, + "step": 214 + }, + { + "epoch": 0.05176668873773551, + "grad_norm": 6.376938343048096, + "learning_rate": 0.00019975929732048394, + "loss": 1.3324, + "step": 215 + }, + { + "epoch": 0.05200746403419009, + "grad_norm": 25.141647338867188, + "learning_rate": 0.00019975386011611483, + "loss": 1.0559, + "step": 216 + }, + { + "epoch": 0.05224823933064467, + "grad_norm": 69.8543472290039, + "learning_rate": 0.00019974836226117405, + "loss": 2.2873, + "step": 217 + }, + { + "epoch": 0.05248901462709926, + "grad_norm": 13.50328254699707, + "learning_rate": 0.00019974280375900424, + "loss": 2.7354, + "step": 218 + }, + { + "epoch": 0.052729789923553844, + "grad_norm": 9.282197952270508, + "learning_rate": 0.00019973718461298502, + "loss": 2.1698, + "step": 219 + }, + { + "epoch": 0.052970565220008425, + "grad_norm": 12.128793716430664, + "learning_rate": 0.00019973150482653287, + "loss": 2.3485, + "step": 220 + }, + { + "epoch": 0.053211340516463014, + "grad_norm": 6.763794422149658, + "learning_rate": 0.00019972576440310105, + "loss": 1.2761, + "step": 221 + }, + { + "epoch": 0.053452115812917596, + "grad_norm": 4.751701354980469, + "learning_rate": 0.00019971996334617985, + "loss": 1.0254, + "step": 222 + }, + { + "epoch": 0.05369289110937218, + "grad_norm": 10.452568054199219, + "learning_rate": 0.00019971410165929622, + "loss": 1.5987, + "step": 223 + }, + { + "epoch": 0.05393366640582676, + "grad_norm": 5.189295768737793, + "learning_rate": 0.00019970817934601413, + "loss": 0.9901, + "step": 224 + }, + { + "epoch": 0.05417444170228135, + "grad_norm": 6.027712821960449, + "learning_rate": 0.00019970219640993438, + "loss": 2.048, + "step": 225 + }, + { + "epoch": 0.05441521699873593, + "grad_norm": 5.749260425567627, + "learning_rate": 0.00019969615285469455, + "loss": 1.1023, + "step": 226 + }, + { + "epoch": 0.05465599229519051, + "grad_norm": 14.753028869628906, + "learning_rate": 0.0001996900486839692, + "loss": 1.657, + "step": 227 + }, + { + "epoch": 0.0548967675916451, + "grad_norm": 19.311214447021484, + "learning_rate": 0.0001996838839014696, + "loss": 1.5839, + "step": 228 + }, + { + "epoch": 0.05513754288809968, + "grad_norm": 2.4820916652679443, + "learning_rate": 0.000199677658510944, + "loss": 1.2341, + "step": 229 + }, + { + "epoch": 0.055378318184554264, + "grad_norm": 16.423561096191406, + "learning_rate": 0.0001996713725161775, + "loss": 2.0571, + "step": 230 + }, + { + "epoch": 0.055619093481008845, + "grad_norm": 10.467788696289062, + "learning_rate": 0.00019966502592099188, + "loss": 1.5059, + "step": 231 + }, + { + "epoch": 0.055859868777463434, + "grad_norm": 3.1604106426239014, + "learning_rate": 0.000199658618729246, + "loss": 1.099, + "step": 232 + }, + { + "epoch": 0.056100644073918016, + "grad_norm": 9.024856567382812, + "learning_rate": 0.00019965215094483539, + "loss": 1.2078, + "step": 233 + }, + { + "epoch": 0.0563414193703726, + "grad_norm": 2.999100923538208, + "learning_rate": 0.00019964562257169247, + "loss": 0.792, + "step": 234 + }, + { + "epoch": 0.056582194666827186, + "grad_norm": 16.542631149291992, + "learning_rate": 0.00019963903361378655, + "loss": 2.7538, + "step": 235 + }, + { + "epoch": 0.05682296996328177, + "grad_norm": 5.134494781494141, + "learning_rate": 0.00019963238407512366, + "loss": 2.8096, + "step": 236 + }, + { + "epoch": 0.05706374525973635, + "grad_norm": 7.121161460876465, + "learning_rate": 0.0001996256739597468, + "loss": 1.1293, + "step": 237 + }, + { + "epoch": 0.05730452055619094, + "grad_norm": 3.483020544052124, + "learning_rate": 0.00019961890327173574, + "loss": 1.8818, + "step": 238 + }, + { + "epoch": 0.05754529585264552, + "grad_norm": 4.338151454925537, + "learning_rate": 0.00019961207201520703, + "loss": 1.0518, + "step": 239 + }, + { + "epoch": 0.0577860711491001, + "grad_norm": 6.399717807769775, + "learning_rate": 0.00019960518019431408, + "loss": 1.2687, + "step": 240 + }, + { + "epoch": 0.05802684644555468, + "grad_norm": 1.5952820777893066, + "learning_rate": 0.00019959822781324718, + "loss": 0.6345, + "step": 241 + }, + { + "epoch": 0.05826762174200927, + "grad_norm": 9.329618453979492, + "learning_rate": 0.0001995912148762334, + "loss": 1.3564, + "step": 242 + }, + { + "epoch": 0.058508397038463854, + "grad_norm": 7.548645973205566, + "learning_rate": 0.00019958414138753657, + "loss": 1.0375, + "step": 243 + }, + { + "epoch": 0.058749172334918436, + "grad_norm": 18.993824005126953, + "learning_rate": 0.00019957700735145738, + "loss": 2.458, + "step": 244 + }, + { + "epoch": 0.058989947631373024, + "grad_norm": 20.46088981628418, + "learning_rate": 0.0001995698127723334, + "loss": 2.1789, + "step": 245 + }, + { + "epoch": 0.059230722927827606, + "grad_norm": 6.136659145355225, + "learning_rate": 0.00019956255765453892, + "loss": 1.3776, + "step": 246 + }, + { + "epoch": 0.05947149822428219, + "grad_norm": 61.323387145996094, + "learning_rate": 0.00019955524200248505, + "loss": 1.657, + "step": 247 + }, + { + "epoch": 0.05971227352073677, + "grad_norm": 4.754699230194092, + "learning_rate": 0.00019954786582061977, + "loss": 1.0319, + "step": 248 + }, + { + "epoch": 0.05995304881719136, + "grad_norm": 10.321673393249512, + "learning_rate": 0.0001995404291134278, + "loss": 2.1272, + "step": 249 + }, + { + "epoch": 0.06019382411364594, + "grad_norm": 8.861504554748535, + "learning_rate": 0.0001995329318854306, + "loss": 1.4962, + "step": 250 + }, + { + "epoch": 0.06043459941010052, + "grad_norm": 44.048126220703125, + "learning_rate": 0.0001995253741411866, + "loss": 2.3729, + "step": 251 + }, + { + "epoch": 0.06067537470655511, + "grad_norm": 2.5337188243865967, + "learning_rate": 0.0001995177558852909, + "loss": 0.564, + "step": 252 + }, + { + "epoch": 0.06091615000300969, + "grad_norm": 11.171781539916992, + "learning_rate": 0.0001995100771223754, + "loss": 1.2622, + "step": 253 + }, + { + "epoch": 0.061156925299464274, + "grad_norm": 10.259223937988281, + "learning_rate": 0.0001995023378571088, + "loss": 1.9257, + "step": 254 + }, + { + "epoch": 0.061397700595918855, + "grad_norm": 8.954612731933594, + "learning_rate": 0.0001994945380941966, + "loss": 1.008, + "step": 255 + }, + { + "epoch": 0.061638475892373444, + "grad_norm": 26.720203399658203, + "learning_rate": 0.0001994866778383811, + "loss": 1.8667, + "step": 256 + }, + { + "epoch": 0.061879251188828026, + "grad_norm": 8.071576118469238, + "learning_rate": 0.00019947875709444131, + "loss": 1.5516, + "step": 257 + }, + { + "epoch": 0.06212002648528261, + "grad_norm": 1.8321843147277832, + "learning_rate": 0.00019947077586719307, + "loss": 0.8952, + "step": 258 + }, + { + "epoch": 0.062360801781737196, + "grad_norm": 10.932100296020508, + "learning_rate": 0.000199462734161489, + "loss": 1.3651, + "step": 259 + }, + { + "epoch": 0.06260157707819178, + "grad_norm": 4.2766828536987305, + "learning_rate": 0.00019945463198221846, + "loss": 1.0024, + "step": 260 + }, + { + "epoch": 0.06284235237464636, + "grad_norm": 7.300168991088867, + "learning_rate": 0.00019944646933430762, + "loss": 1.1335, + "step": 261 + }, + { + "epoch": 0.06308312767110094, + "grad_norm": 2.313037395477295, + "learning_rate": 0.00019943824622271935, + "loss": 1.1619, + "step": 262 + }, + { + "epoch": 0.06332390296755552, + "grad_norm": 3.020617961883545, + "learning_rate": 0.00019942996265245335, + "loss": 1.0354, + "step": 263 + }, + { + "epoch": 0.06356467826401012, + "grad_norm": 14.848864555358887, + "learning_rate": 0.00019942161862854601, + "loss": 1.4934, + "step": 264 + }, + { + "epoch": 0.0638054535604647, + "grad_norm": 8.351217269897461, + "learning_rate": 0.0001994132141560706, + "loss": 1.5487, + "step": 265 + }, + { + "epoch": 0.06404622885691928, + "grad_norm": 5.606395244598389, + "learning_rate": 0.00019940474924013698, + "loss": 1.6069, + "step": 266 + }, + { + "epoch": 0.06428700415337386, + "grad_norm": 5.64864444732666, + "learning_rate": 0.00019939622388589183, + "loss": 1.0187, + "step": 267 + }, + { + "epoch": 0.06452777944982845, + "grad_norm": 8.374507904052734, + "learning_rate": 0.00019938763809851864, + "loss": 1.2051, + "step": 268 + }, + { + "epoch": 0.06476855474628303, + "grad_norm": 3.38839054107666, + "learning_rate": 0.00019937899188323757, + "loss": 0.6262, + "step": 269 + }, + { + "epoch": 0.06500933004273761, + "grad_norm": 2.6535820960998535, + "learning_rate": 0.00019937028524530552, + "loss": 0.6246, + "step": 270 + }, + { + "epoch": 0.0652501053391922, + "grad_norm": 5.218156337738037, + "learning_rate": 0.00019936151819001618, + "loss": 1.8041, + "step": 271 + }, + { + "epoch": 0.06549088063564679, + "grad_norm": 6.953288555145264, + "learning_rate": 0.00019935269072269987, + "loss": 0.886, + "step": 272 + }, + { + "epoch": 0.06573165593210137, + "grad_norm": 3.9206128120422363, + "learning_rate": 0.00019934380284872377, + "loss": 0.5613, + "step": 273 + }, + { + "epoch": 0.06597243122855595, + "grad_norm": 3.6051864624023438, + "learning_rate": 0.00019933485457349174, + "loss": 0.8749, + "step": 274 + }, + { + "epoch": 0.06621320652501053, + "grad_norm": 3.60562801361084, + "learning_rate": 0.00019932584590244434, + "loss": 0.892, + "step": 275 + }, + { + "epoch": 0.06645398182146511, + "grad_norm": 3.6393070220947266, + "learning_rate": 0.0001993167768410588, + "loss": 0.8526, + "step": 276 + }, + { + "epoch": 0.0666947571179197, + "grad_norm": 10.952275276184082, + "learning_rate": 0.0001993076473948492, + "loss": 1.1597, + "step": 277 + }, + { + "epoch": 0.06693553241437429, + "grad_norm": 9.785892486572266, + "learning_rate": 0.00019929845756936626, + "loss": 1.1667, + "step": 278 + }, + { + "epoch": 0.06717630771082887, + "grad_norm": 8.532158851623535, + "learning_rate": 0.00019928920737019733, + "loss": 1.4692, + "step": 279 + }, + { + "epoch": 0.06741708300728345, + "grad_norm": 12.74774169921875, + "learning_rate": 0.00019927989680296667, + "loss": 2.1035, + "step": 280 + }, + { + "epoch": 0.06765785830373804, + "grad_norm": 10.734175682067871, + "learning_rate": 0.00019927052587333507, + "loss": 1.8876, + "step": 281 + }, + { + "epoch": 0.06789863360019262, + "grad_norm": 4.373108863830566, + "learning_rate": 0.00019926109458700007, + "loss": 0.9184, + "step": 282 + }, + { + "epoch": 0.0681394088966472, + "grad_norm": 7.9593281745910645, + "learning_rate": 0.00019925160294969593, + "loss": 1.2637, + "step": 283 + }, + { + "epoch": 0.06838018419310178, + "grad_norm": 5.800394058227539, + "learning_rate": 0.0001992420509671936, + "loss": 0.8262, + "step": 284 + }, + { + "epoch": 0.06862095948955638, + "grad_norm": 5.995545864105225, + "learning_rate": 0.00019923243864530064, + "loss": 1.6762, + "step": 285 + }, + { + "epoch": 0.06886173478601096, + "grad_norm": 21.66741371154785, + "learning_rate": 0.00019922276598986145, + "loss": 1.1287, + "step": 286 + }, + { + "epoch": 0.06910251008246554, + "grad_norm": 12.231538772583008, + "learning_rate": 0.00019921303300675697, + "loss": 1.4966, + "step": 287 + }, + { + "epoch": 0.06934328537892012, + "grad_norm": 19.181198120117188, + "learning_rate": 0.00019920323970190487, + "loss": 1.7811, + "step": 288 + }, + { + "epoch": 0.0695840606753747, + "grad_norm": 4.649646282196045, + "learning_rate": 0.00019919338608125956, + "loss": 1.2632, + "step": 289 + }, + { + "epoch": 0.06982483597182929, + "grad_norm": 5.04226541519165, + "learning_rate": 0.00019918347215081204, + "loss": 1.3552, + "step": 290 + }, + { + "epoch": 0.07006561126828387, + "grad_norm": 4.240399360656738, + "learning_rate": 0.00019917349791658996, + "loss": 1.2266, + "step": 291 + }, + { + "epoch": 0.07030638656473846, + "grad_norm": 13.989855766296387, + "learning_rate": 0.0001991634633846577, + "loss": 1.429, + "step": 292 + }, + { + "epoch": 0.07054716186119304, + "grad_norm": 8.629983901977539, + "learning_rate": 0.00019915336856111631, + "loss": 1.0381, + "step": 293 + }, + { + "epoch": 0.07078793715764763, + "grad_norm": 14.188498497009277, + "learning_rate": 0.00019914321345210342, + "loss": 2.8836, + "step": 294 + }, + { + "epoch": 0.07102871245410221, + "grad_norm": 8.33694076538086, + "learning_rate": 0.00019913299806379334, + "loss": 0.6366, + "step": 295 + }, + { + "epoch": 0.07126948775055679, + "grad_norm": 16.30498695373535, + "learning_rate": 0.00019912272240239716, + "loss": 1.3799, + "step": 296 + }, + { + "epoch": 0.07151026304701137, + "grad_norm": 5.43389368057251, + "learning_rate": 0.00019911238647416242, + "loss": 1.1131, + "step": 297 + }, + { + "epoch": 0.07175103834346597, + "grad_norm": 20.10192108154297, + "learning_rate": 0.00019910199028537337, + "loss": 1.1515, + "step": 298 + }, + { + "epoch": 0.07199181363992055, + "grad_norm": 3.4195728302001953, + "learning_rate": 0.00019909153384235095, + "loss": 0.5817, + "step": 299 + }, + { + "epoch": 0.07223258893637513, + "grad_norm": 6.387148857116699, + "learning_rate": 0.00019908101715145272, + "loss": 0.7634, + "step": 300 + }, + { + "epoch": 0.07247336423282971, + "grad_norm": 4.05348539352417, + "learning_rate": 0.00019907044021907281, + "loss": 0.8352, + "step": 301 + }, + { + "epoch": 0.0727141395292843, + "grad_norm": 2.757005214691162, + "learning_rate": 0.00019905980305164205, + "loss": 0.7532, + "step": 302 + }, + { + "epoch": 0.07295491482573888, + "grad_norm": 2.14371919631958, + "learning_rate": 0.00019904910565562785, + "loss": 1.2168, + "step": 303 + }, + { + "epoch": 0.07319569012219346, + "grad_norm": 5.939690589904785, + "learning_rate": 0.00019903834803753425, + "loss": 0.8704, + "step": 304 + }, + { + "epoch": 0.07343646541864805, + "grad_norm": 7.156602382659912, + "learning_rate": 0.0001990275302039019, + "loss": 0.8243, + "step": 305 + }, + { + "epoch": 0.07367724071510263, + "grad_norm": 3.6926629543304443, + "learning_rate": 0.00019901665216130808, + "loss": 0.8763, + "step": 306 + }, + { + "epoch": 0.07391801601155722, + "grad_norm": 7.309814453125, + "learning_rate": 0.00019900571391636665, + "loss": 0.7731, + "step": 307 + }, + { + "epoch": 0.0741587913080118, + "grad_norm": 12.59055233001709, + "learning_rate": 0.00019899471547572811, + "loss": 1.0003, + "step": 308 + }, + { + "epoch": 0.07439956660446638, + "grad_norm": 3.9260809421539307, + "learning_rate": 0.00019898365684607952, + "loss": 0.9478, + "step": 309 + }, + { + "epoch": 0.07464034190092096, + "grad_norm": 3.1046080589294434, + "learning_rate": 0.00019897253803414456, + "loss": 0.7514, + "step": 310 + }, + { + "epoch": 0.07488111719737554, + "grad_norm": 2.8333990573883057, + "learning_rate": 0.0001989613590466835, + "loss": 0.4307, + "step": 311 + }, + { + "epoch": 0.07512189249383014, + "grad_norm": 11.99578857421875, + "learning_rate": 0.00019895011989049316, + "loss": 1.0123, + "step": 312 + }, + { + "epoch": 0.07536266779028472, + "grad_norm": 2.916750431060791, + "learning_rate": 0.000198938820572407, + "loss": 0.9809, + "step": 313 + }, + { + "epoch": 0.0756034430867393, + "grad_norm": 3.6491167545318604, + "learning_rate": 0.00019892746109929498, + "loss": 0.3447, + "step": 314 + }, + { + "epoch": 0.07584421838319388, + "grad_norm": 3.625203847885132, + "learning_rate": 0.00019891604147806376, + "loss": 1.0226, + "step": 315 + }, + { + "epoch": 0.07608499367964847, + "grad_norm": 3.9918270111083984, + "learning_rate": 0.00019890456171565643, + "loss": 0.6953, + "step": 316 + }, + { + "epoch": 0.07632576897610305, + "grad_norm": 7.0212554931640625, + "learning_rate": 0.00019889302181905278, + "loss": 1.1393, + "step": 317 + }, + { + "epoch": 0.07656654427255763, + "grad_norm": 6.060014247894287, + "learning_rate": 0.00019888142179526902, + "loss": 0.9609, + "step": 318 + }, + { + "epoch": 0.07680731956901223, + "grad_norm": 6.098717212677002, + "learning_rate": 0.00019886976165135807, + "loss": 1.3731, + "step": 319 + }, + { + "epoch": 0.0770480948654668, + "grad_norm": 8.985902786254883, + "learning_rate": 0.00019885804139440925, + "loss": 1.4469, + "step": 320 + }, + { + "epoch": 0.07728887016192139, + "grad_norm": 6.856400966644287, + "learning_rate": 0.00019884626103154856, + "loss": 1.4352, + "step": 321 + }, + { + "epoch": 0.07752964545837597, + "grad_norm": 4.309900283813477, + "learning_rate": 0.00019883442056993841, + "loss": 0.4605, + "step": 322 + }, + { + "epoch": 0.07777042075483055, + "grad_norm": 2.33298397064209, + "learning_rate": 0.00019882252001677793, + "loss": 1.2381, + "step": 323 + }, + { + "epoch": 0.07801119605128513, + "grad_norm": 3.6052260398864746, + "learning_rate": 0.0001988105593793026, + "loss": 1.0468, + "step": 324 + }, + { + "epoch": 0.07825197134773972, + "grad_norm": 4.753766059875488, + "learning_rate": 0.00019879853866478455, + "loss": 1.1693, + "step": 325 + }, + { + "epoch": 0.07849274664419431, + "grad_norm": 3.6719765663146973, + "learning_rate": 0.00019878645788053238, + "loss": 0.7712, + "step": 326 + }, + { + "epoch": 0.07873352194064889, + "grad_norm": 3.6164121627807617, + "learning_rate": 0.00019877431703389128, + "loss": 1.2832, + "step": 327 + }, + { + "epoch": 0.07897429723710347, + "grad_norm": 9.66127872467041, + "learning_rate": 0.00019876211613224288, + "loss": 2.2482, + "step": 328 + }, + { + "epoch": 0.07921507253355806, + "grad_norm": 2.208888053894043, + "learning_rate": 0.00019874985518300532, + "loss": 1.1646, + "step": 329 + }, + { + "epoch": 0.07945584783001264, + "grad_norm": 1.7235151529312134, + "learning_rate": 0.00019873753419363336, + "loss": 0.5038, + "step": 330 + }, + { + "epoch": 0.07969662312646722, + "grad_norm": 1.9844493865966797, + "learning_rate": 0.00019872515317161812, + "loss": 1.1001, + "step": 331 + }, + { + "epoch": 0.0799373984229218, + "grad_norm": 7.393949508666992, + "learning_rate": 0.00019871271212448734, + "loss": 1.7001, + "step": 332 + }, + { + "epoch": 0.0801781737193764, + "grad_norm": 10.367690086364746, + "learning_rate": 0.00019870021105980522, + "loss": 0.8829, + "step": 333 + }, + { + "epoch": 0.08041894901583098, + "grad_norm": 6.111469745635986, + "learning_rate": 0.00019868764998517236, + "loss": 1.6088, + "step": 334 + }, + { + "epoch": 0.08065972431228556, + "grad_norm": 4.986114978790283, + "learning_rate": 0.00019867502890822598, + "loss": 0.3513, + "step": 335 + }, + { + "epoch": 0.08090049960874014, + "grad_norm": 4.137001037597656, + "learning_rate": 0.00019866234783663968, + "loss": 1.1246, + "step": 336 + }, + { + "epoch": 0.08114127490519472, + "grad_norm": 2.4128201007843018, + "learning_rate": 0.00019864960677812364, + "loss": 0.7535, + "step": 337 + }, + { + "epoch": 0.0813820502016493, + "grad_norm": 19.265674591064453, + "learning_rate": 0.0001986368057404244, + "loss": 1.0217, + "step": 338 + }, + { + "epoch": 0.0816228254981039, + "grad_norm": 5.218925952911377, + "learning_rate": 0.00019862394473132503, + "loss": 0.6478, + "step": 339 + }, + { + "epoch": 0.08186360079455848, + "grad_norm": 9.463326454162598, + "learning_rate": 0.00019861102375864508, + "loss": 0.4951, + "step": 340 + }, + { + "epoch": 0.08210437609101306, + "grad_norm": 4.882657527923584, + "learning_rate": 0.0001985980428302405, + "loss": 0.5187, + "step": 341 + }, + { + "epoch": 0.08234515138746765, + "grad_norm": 9.088946342468262, + "learning_rate": 0.00019858500195400373, + "loss": 1.6635, + "step": 342 + }, + { + "epoch": 0.08258592668392223, + "grad_norm": 3.0154218673706055, + "learning_rate": 0.0001985719011378637, + "loss": 1.4851, + "step": 343 + }, + { + "epoch": 0.08282670198037681, + "grad_norm": 8.918438911437988, + "learning_rate": 0.00019855874038978563, + "loss": 0.8483, + "step": 344 + }, + { + "epoch": 0.08306747727683139, + "grad_norm": 3.460216760635376, + "learning_rate": 0.00019854551971777137, + "loss": 0.858, + "step": 345 + }, + { + "epoch": 0.08330825257328599, + "grad_norm": 5.214385032653809, + "learning_rate": 0.00019853223912985913, + "loss": 0.6952, + "step": 346 + }, + { + "epoch": 0.08354902786974057, + "grad_norm": 9.299979209899902, + "learning_rate": 0.00019851889863412345, + "loss": 0.5402, + "step": 347 + }, + { + "epoch": 0.08378980316619515, + "grad_norm": 3.114903211593628, + "learning_rate": 0.0001985054982386755, + "loss": 0.5039, + "step": 348 + }, + { + "epoch": 0.08403057846264973, + "grad_norm": 1.686824917793274, + "learning_rate": 0.00019849203795166263, + "loss": 0.5443, + "step": 349 + }, + { + "epoch": 0.08427135375910431, + "grad_norm": 21.62729835510254, + "learning_rate": 0.00019847851778126877, + "loss": 0.9847, + "step": 350 + }, + { + "epoch": 0.0845121290555589, + "grad_norm": 1.997676134109497, + "learning_rate": 0.00019846493773571425, + "loss": 0.2535, + "step": 351 + }, + { + "epoch": 0.08475290435201348, + "grad_norm": 3.0039217472076416, + "learning_rate": 0.0001984512978232558, + "loss": 1.1073, + "step": 352 + }, + { + "epoch": 0.08499367964846807, + "grad_norm": 1.8206866979599, + "learning_rate": 0.00019843759805218637, + "loss": 1.4459, + "step": 353 + }, + { + "epoch": 0.08523445494492266, + "grad_norm": 2.975524663925171, + "learning_rate": 0.0001984238384308356, + "loss": 1.5481, + "step": 354 + }, + { + "epoch": 0.08547523024137724, + "grad_norm": 2.0778095722198486, + "learning_rate": 0.0001984100189675693, + "loss": 0.8862, + "step": 355 + }, + { + "epoch": 0.08571600553783182, + "grad_norm": 15.60510540008545, + "learning_rate": 0.0001983961396707897, + "loss": 0.6816, + "step": 356 + }, + { + "epoch": 0.0859567808342864, + "grad_norm": 3.4831383228302, + "learning_rate": 0.00019838220054893552, + "loss": 0.6734, + "step": 357 + }, + { + "epoch": 0.08619755613074098, + "grad_norm": 3.5622880458831787, + "learning_rate": 0.00019836820161048176, + "loss": 1.166, + "step": 358 + }, + { + "epoch": 0.08643833142719556, + "grad_norm": 0.6584992408752441, + "learning_rate": 0.00019835414286393979, + "loss": 0.563, + "step": 359 + }, + { + "epoch": 0.08667910672365016, + "grad_norm": 3.729058027267456, + "learning_rate": 0.00019834002431785735, + "loss": 0.8303, + "step": 360 + }, + { + "epoch": 0.08691988202010474, + "grad_norm": 5.881722450256348, + "learning_rate": 0.0001983258459808186, + "loss": 1.3364, + "step": 361 + }, + { + "epoch": 0.08716065731655932, + "grad_norm": 5.635914325714111, + "learning_rate": 0.00019831160786144394, + "loss": 0.7647, + "step": 362 + }, + { + "epoch": 0.0874014326130139, + "grad_norm": 3.507514715194702, + "learning_rate": 0.0001982973099683902, + "loss": 0.5602, + "step": 363 + }, + { + "epoch": 0.08764220790946849, + "grad_norm": 5.673732757568359, + "learning_rate": 0.00019828295231035051, + "loss": 0.946, + "step": 364 + }, + { + "epoch": 0.08788298320592307, + "grad_norm": 2.3530821800231934, + "learning_rate": 0.0001982685348960544, + "loss": 0.3095, + "step": 365 + }, + { + "epoch": 0.08812375850237765, + "grad_norm": 3.0282411575317383, + "learning_rate": 0.00019825405773426767, + "loss": 0.791, + "step": 366 + }, + { + "epoch": 0.08836453379883225, + "grad_norm": 5.266041278839111, + "learning_rate": 0.0001982395208337925, + "loss": 1.4795, + "step": 367 + }, + { + "epoch": 0.08860530909528683, + "grad_norm": 2.5949831008911133, + "learning_rate": 0.0001982249242034673, + "loss": 0.4774, + "step": 368 + }, + { + "epoch": 0.08884608439174141, + "grad_norm": 2.186204195022583, + "learning_rate": 0.00019821026785216687, + "loss": 0.8617, + "step": 369 + }, + { + "epoch": 0.08908685968819599, + "grad_norm": 18.886642456054688, + "learning_rate": 0.00019819555178880234, + "loss": 0.7616, + "step": 370 + }, + { + "epoch": 0.08932763498465057, + "grad_norm": 2.88727068901062, + "learning_rate": 0.00019818077602232106, + "loss": 0.5059, + "step": 371 + }, + { + "epoch": 0.08956841028110515, + "grad_norm": 6.770381927490234, + "learning_rate": 0.00019816594056170676, + "loss": 1.5388, + "step": 372 + }, + { + "epoch": 0.08980918557755974, + "grad_norm": 4.677947044372559, + "learning_rate": 0.00019815104541597944, + "loss": 0.5632, + "step": 373 + }, + { + "epoch": 0.09004996087401433, + "grad_norm": 2.849351406097412, + "learning_rate": 0.00019813609059419538, + "loss": 0.3689, + "step": 374 + }, + { + "epoch": 0.09029073617046891, + "grad_norm": 2.1919734477996826, + "learning_rate": 0.0001981210761054471, + "loss": 1.003, + "step": 375 + }, + { + "epoch": 0.0905315114669235, + "grad_norm": 3.296410083770752, + "learning_rate": 0.0001981060019588635, + "loss": 0.5615, + "step": 376 + }, + { + "epoch": 0.09077228676337808, + "grad_norm": 2.373533248901367, + "learning_rate": 0.00019809086816360968, + "loss": 0.7389, + "step": 377 + }, + { + "epoch": 0.09101306205983266, + "grad_norm": 4.461115837097168, + "learning_rate": 0.00019807567472888702, + "loss": 0.98, + "step": 378 + }, + { + "epoch": 0.09125383735628724, + "grad_norm": 6.4342427253723145, + "learning_rate": 0.00019806042166393314, + "loss": 0.8969, + "step": 379 + }, + { + "epoch": 0.09149461265274182, + "grad_norm": 2.5169475078582764, + "learning_rate": 0.00019804510897802197, + "loss": 0.3081, + "step": 380 + }, + { + "epoch": 0.09173538794919642, + "grad_norm": 5.742027282714844, + "learning_rate": 0.00019802973668046363, + "loss": 1.2418, + "step": 381 + }, + { + "epoch": 0.091976163245651, + "grad_norm": 1.0783274173736572, + "learning_rate": 0.00019801430478060453, + "loss": 0.4456, + "step": 382 + }, + { + "epoch": 0.09221693854210558, + "grad_norm": 5.443319797515869, + "learning_rate": 0.0001979988132878273, + "loss": 1.0314, + "step": 383 + }, + { + "epoch": 0.09245771383856016, + "grad_norm": 1.8633432388305664, + "learning_rate": 0.00019798326221155078, + "loss": 1.3362, + "step": 384 + }, + { + "epoch": 0.09269848913501474, + "grad_norm": 8.395817756652832, + "learning_rate": 0.00019796765156123008, + "loss": 1.7206, + "step": 385 + }, + { + "epoch": 0.09293926443146933, + "grad_norm": 0.9301803112030029, + "learning_rate": 0.00019795198134635653, + "loss": 0.3155, + "step": 386 + }, + { + "epoch": 0.09318003972792392, + "grad_norm": 6.0776047706604, + "learning_rate": 0.00019793625157645762, + "loss": 0.7454, + "step": 387 + }, + { + "epoch": 0.0934208150243785, + "grad_norm": 4.320910453796387, + "learning_rate": 0.00019792046226109708, + "loss": 0.9696, + "step": 388 + }, + { + "epoch": 0.09366159032083309, + "grad_norm": 8.204424858093262, + "learning_rate": 0.0001979046134098749, + "loss": 1.1431, + "step": 389 + }, + { + "epoch": 0.09390236561728767, + "grad_norm": 0.629797101020813, + "learning_rate": 0.00019788870503242715, + "loss": 0.4199, + "step": 390 + }, + { + "epoch": 0.09414314091374225, + "grad_norm": 3.0499680042266846, + "learning_rate": 0.00019787273713842623, + "loss": 0.722, + "step": 391 + }, + { + "epoch": 0.09438391621019683, + "grad_norm": 2.0613560676574707, + "learning_rate": 0.00019785670973758058, + "loss": 0.8111, + "step": 392 + }, + { + "epoch": 0.09462469150665141, + "grad_norm": 14.847646713256836, + "learning_rate": 0.00019784062283963495, + "loss": 1.0207, + "step": 393 + }, + { + "epoch": 0.09486546680310601, + "grad_norm": 2.1953060626983643, + "learning_rate": 0.00019782447645437022, + "loss": 0.3284, + "step": 394 + }, + { + "epoch": 0.09510624209956059, + "grad_norm": 6.55955171585083, + "learning_rate": 0.00019780827059160338, + "loss": 1.3168, + "step": 395 + }, + { + "epoch": 0.09534701739601517, + "grad_norm": 4.817495822906494, + "learning_rate": 0.0001977920052611877, + "loss": 0.6965, + "step": 396 + }, + { + "epoch": 0.09558779269246975, + "grad_norm": 2.0958549976348877, + "learning_rate": 0.00019777568047301243, + "loss": 1.2996, + "step": 397 + }, + { + "epoch": 0.09582856798892433, + "grad_norm": 3.6508209705352783, + "learning_rate": 0.00019775929623700318, + "loss": 0.4667, + "step": 398 + }, + { + "epoch": 0.09606934328537892, + "grad_norm": 4.169986724853516, + "learning_rate": 0.00019774285256312152, + "loss": 1.0308, + "step": 399 + }, + { + "epoch": 0.0963101185818335, + "grad_norm": 4.545289516448975, + "learning_rate": 0.00019772634946136535, + "loss": 1.4587, + "step": 400 + }, + { + "epoch": 0.0965508938782881, + "grad_norm": 2.637938976287842, + "learning_rate": 0.00019770978694176846, + "loss": 0.7042, + "step": 401 + }, + { + "epoch": 0.09679166917474268, + "grad_norm": 5.515408992767334, + "learning_rate": 0.00019769316501440102, + "loss": 1.0088, + "step": 402 + }, + { + "epoch": 0.09703244447119726, + "grad_norm": 1.7717092037200928, + "learning_rate": 0.00019767648368936914, + "loss": 0.3585, + "step": 403 + }, + { + "epoch": 0.09727321976765184, + "grad_norm": 5.126103401184082, + "learning_rate": 0.0001976597429768151, + "loss": 1.5234, + "step": 404 + }, + { + "epoch": 0.09751399506410642, + "grad_norm": 0.473143607378006, + "learning_rate": 0.00019764294288691727, + "loss": 0.2934, + "step": 405 + }, + { + "epoch": 0.097754770360561, + "grad_norm": 7.283068656921387, + "learning_rate": 0.0001976260834298902, + "loss": 1.1666, + "step": 406 + }, + { + "epoch": 0.09799554565701558, + "grad_norm": 2.16549015045166, + "learning_rate": 0.00019760916461598446, + "loss": 0.4612, + "step": 407 + }, + { + "epoch": 0.09823632095347018, + "grad_norm": 1.2254639863967896, + "learning_rate": 0.0001975921864554867, + "loss": 0.7512, + "step": 408 + }, + { + "epoch": 0.09847709624992476, + "grad_norm": 1.8601148128509521, + "learning_rate": 0.0001975751489587197, + "loss": 0.8824, + "step": 409 + }, + { + "epoch": 0.09871787154637934, + "grad_norm": 2.0946712493896484, + "learning_rate": 0.0001975580521360423, + "loss": 0.6299, + "step": 410 + }, + { + "epoch": 0.09895864684283392, + "grad_norm": 5.10854434967041, + "learning_rate": 0.00019754089599784938, + "loss": 1.3609, + "step": 411 + }, + { + "epoch": 0.0991994221392885, + "grad_norm": 2.166837453842163, + "learning_rate": 0.00019752368055457197, + "loss": 0.7314, + "step": 412 + }, + { + "epoch": 0.09944019743574309, + "grad_norm": 3.2308640480041504, + "learning_rate": 0.00019750640581667702, + "loss": 1.679, + "step": 413 + }, + { + "epoch": 0.09968097273219767, + "grad_norm": 3.175098180770874, + "learning_rate": 0.00019748907179466767, + "loss": 0.7682, + "step": 414 + }, + { + "epoch": 0.09992174802865227, + "grad_norm": 6.487977981567383, + "learning_rate": 0.00019747167849908304, + "loss": 0.926, + "step": 415 + }, + { + "epoch": 0.10016252332510685, + "grad_norm": 3.602936029434204, + "learning_rate": 0.00019745422594049825, + "loss": 1.0786, + "step": 416 + }, + { + "epoch": 0.10040329862156143, + "grad_norm": 2.2537026405334473, + "learning_rate": 0.00019743671412952453, + "loss": 0.3749, + "step": 417 + }, + { + "epoch": 0.10064407391801601, + "grad_norm": 3.4394688606262207, + "learning_rate": 0.00019741914307680908, + "loss": 0.6582, + "step": 418 + }, + { + "epoch": 0.10088484921447059, + "grad_norm": 4.710788726806641, + "learning_rate": 0.00019740151279303518, + "loss": 1.0236, + "step": 419 + }, + { + "epoch": 0.10112562451092517, + "grad_norm": 2.518106698989868, + "learning_rate": 0.000197383823288922, + "loss": 0.7708, + "step": 420 + }, + { + "epoch": 0.10136639980737976, + "grad_norm": 2.9978835582733154, + "learning_rate": 0.0001973660745752249, + "loss": 0.4426, + "step": 421 + }, + { + "epoch": 0.10160717510383435, + "grad_norm": 2.2193732261657715, + "learning_rate": 0.0001973482666627351, + "loss": 1.0488, + "step": 422 + }, + { + "epoch": 0.10184795040028893, + "grad_norm": 2.385712146759033, + "learning_rate": 0.0001973303995622798, + "loss": 0.5798, + "step": 423 + }, + { + "epoch": 0.10208872569674352, + "grad_norm": 6.944875240325928, + "learning_rate": 0.00019731247328472228, + "loss": 0.9012, + "step": 424 + }, + { + "epoch": 0.1023295009931981, + "grad_norm": 1.5543016195297241, + "learning_rate": 0.00019729448784096179, + "loss": 0.5052, + "step": 425 + }, + { + "epoch": 0.10257027628965268, + "grad_norm": 4.3643317222595215, + "learning_rate": 0.00019727644324193347, + "loss": 1.0582, + "step": 426 + }, + { + "epoch": 0.10281105158610726, + "grad_norm": 3.4253134727478027, + "learning_rate": 0.00019725833949860847, + "loss": 0.8646, + "step": 427 + }, + { + "epoch": 0.10305182688256186, + "grad_norm": 6.012450218200684, + "learning_rate": 0.00019724017662199397, + "loss": 0.9271, + "step": 428 + }, + { + "epoch": 0.10329260217901644, + "grad_norm": 4.788900375366211, + "learning_rate": 0.00019722195462313296, + "loss": 0.6417, + "step": 429 + }, + { + "epoch": 0.10353337747547102, + "grad_norm": 1.210336446762085, + "learning_rate": 0.00019720367351310452, + "loss": 0.6169, + "step": 430 + }, + { + "epoch": 0.1037741527719256, + "grad_norm": 3.0183141231536865, + "learning_rate": 0.00019718533330302358, + "loss": 1.077, + "step": 431 + }, + { + "epoch": 0.10401492806838018, + "grad_norm": 1.4695411920547485, + "learning_rate": 0.000197166934004041, + "loss": 0.423, + "step": 432 + }, + { + "epoch": 0.10425570336483476, + "grad_norm": 3.7340753078460693, + "learning_rate": 0.00019714847562734365, + "loss": 0.506, + "step": 433 + }, + { + "epoch": 0.10449647866128935, + "grad_norm": 0.8714501261711121, + "learning_rate": 0.00019712995818415424, + "loss": 0.3461, + "step": 434 + }, + { + "epoch": 0.10473725395774394, + "grad_norm": 1.6766986846923828, + "learning_rate": 0.00019711138168573142, + "loss": 0.9932, + "step": 435 + }, + { + "epoch": 0.10497802925419852, + "grad_norm": 2.76531720161438, + "learning_rate": 0.00019709274614336975, + "loss": 0.7046, + "step": 436 + }, + { + "epoch": 0.1052188045506531, + "grad_norm": 6.036025524139404, + "learning_rate": 0.00019707405156839966, + "loss": 0.9637, + "step": 437 + }, + { + "epoch": 0.10545957984710769, + "grad_norm": 4.022448539733887, + "learning_rate": 0.0001970552979721875, + "loss": 0.2239, + "step": 438 + }, + { + "epoch": 0.10570035514356227, + "grad_norm": 2.6519360542297363, + "learning_rate": 0.0001970364853661355, + "loss": 1.0182, + "step": 439 + }, + { + "epoch": 0.10594113044001685, + "grad_norm": 5.855311870574951, + "learning_rate": 0.0001970176137616818, + "loss": 0.7519, + "step": 440 + }, + { + "epoch": 0.10618190573647143, + "grad_norm": 3.5491368770599365, + "learning_rate": 0.00019699868317030035, + "loss": 0.8588, + "step": 441 + }, + { + "epoch": 0.10642268103292603, + "grad_norm": 4.17829704284668, + "learning_rate": 0.00019697969360350098, + "loss": 0.9785, + "step": 442 + }, + { + "epoch": 0.10666345632938061, + "grad_norm": 4.346673488616943, + "learning_rate": 0.00019696064507282937, + "loss": 0.7598, + "step": 443 + }, + { + "epoch": 0.10690423162583519, + "grad_norm": 2.906926155090332, + "learning_rate": 0.00019694153758986714, + "loss": 0.6547, + "step": 444 + }, + { + "epoch": 0.10714500692228977, + "grad_norm": 1.955552339553833, + "learning_rate": 0.00019692237116623163, + "loss": 0.925, + "step": 445 + }, + { + "epoch": 0.10738578221874436, + "grad_norm": 4.8115739822387695, + "learning_rate": 0.00019690314581357607, + "loss": 0.9647, + "step": 446 + }, + { + "epoch": 0.10762655751519894, + "grad_norm": 2.199876308441162, + "learning_rate": 0.00019688386154358955, + "loss": 1.2637, + "step": 447 + }, + { + "epoch": 0.10786733281165352, + "grad_norm": 8.052813529968262, + "learning_rate": 0.0001968645183679969, + "loss": 0.7113, + "step": 448 + }, + { + "epoch": 0.10810810810810811, + "grad_norm": 6.857846260070801, + "learning_rate": 0.00019684511629855888, + "loss": 1.0796, + "step": 449 + }, + { + "epoch": 0.1083488834045627, + "grad_norm": 3.3255105018615723, + "learning_rate": 0.00019682565534707194, + "loss": 0.4504, + "step": 450 + }, + { + "epoch": 0.10858965870101728, + "grad_norm": 4.091807842254639, + "learning_rate": 0.0001968061355253684, + "loss": 0.9397, + "step": 451 + }, + { + "epoch": 0.10883043399747186, + "grad_norm": 2.051816701889038, + "learning_rate": 0.00019678655684531634, + "loss": 0.5485, + "step": 452 + }, + { + "epoch": 0.10907120929392644, + "grad_norm": 1.8907794952392578, + "learning_rate": 0.00019676691931881968, + "loss": 0.567, + "step": 453 + }, + { + "epoch": 0.10931198459038102, + "grad_norm": 4.47649621963501, + "learning_rate": 0.00019674722295781805, + "loss": 0.8856, + "step": 454 + }, + { + "epoch": 0.1095527598868356, + "grad_norm": 5.481165409088135, + "learning_rate": 0.0001967274677742869, + "loss": 0.4616, + "step": 455 + }, + { + "epoch": 0.1097935351832902, + "grad_norm": 8.510377883911133, + "learning_rate": 0.0001967076537802374, + "loss": 0.3674, + "step": 456 + }, + { + "epoch": 0.11003431047974478, + "grad_norm": 3.4752211570739746, + "learning_rate": 0.00019668778098771647, + "loss": 0.7903, + "step": 457 + }, + { + "epoch": 0.11027508577619936, + "grad_norm": 3.52034330368042, + "learning_rate": 0.00019666784940880691, + "loss": 0.5652, + "step": 458 + }, + { + "epoch": 0.11051586107265395, + "grad_norm": 4.425768852233887, + "learning_rate": 0.0001966478590556271, + "loss": 0.6404, + "step": 459 + }, + { + "epoch": 0.11075663636910853, + "grad_norm": 9.201542854309082, + "learning_rate": 0.00019662780994033125, + "loss": 1.0613, + "step": 460 + }, + { + "epoch": 0.11099741166556311, + "grad_norm": 3.8637278079986572, + "learning_rate": 0.00019660770207510924, + "loss": 1.1498, + "step": 461 + }, + { + "epoch": 0.11123818696201769, + "grad_norm": 5.719259738922119, + "learning_rate": 0.0001965875354721867, + "loss": 1.0628, + "step": 462 + }, + { + "epoch": 0.11147896225847229, + "grad_norm": 1.5758776664733887, + "learning_rate": 0.00019656731014382501, + "loss": 0.5364, + "step": 463 + }, + { + "epoch": 0.11171973755492687, + "grad_norm": 7.384488582611084, + "learning_rate": 0.00019654702610232114, + "loss": 0.7939, + "step": 464 + }, + { + "epoch": 0.11196051285138145, + "grad_norm": 5.359811782836914, + "learning_rate": 0.0001965266833600079, + "loss": 0.7241, + "step": 465 + }, + { + "epoch": 0.11220128814783603, + "grad_norm": 3.234246015548706, + "learning_rate": 0.0001965062819292537, + "loss": 0.974, + "step": 466 + }, + { + "epoch": 0.11244206344429061, + "grad_norm": 2.34318208694458, + "learning_rate": 0.00019648582182246266, + "loss": 0.3588, + "step": 467 + }, + { + "epoch": 0.1126828387407452, + "grad_norm": 3.9500319957733154, + "learning_rate": 0.0001964653030520746, + "loss": 0.6119, + "step": 468 + }, + { + "epoch": 0.11292361403719978, + "grad_norm": 2.85276198387146, + "learning_rate": 0.00019644472563056485, + "loss": 0.6573, + "step": 469 + }, + { + "epoch": 0.11316438933365437, + "grad_norm": 1.7280099391937256, + "learning_rate": 0.0001964240895704447, + "loss": 0.8111, + "step": 470 + }, + { + "epoch": 0.11340516463010895, + "grad_norm": 2.8521628379821777, + "learning_rate": 0.00019640339488426084, + "loss": 0.4068, + "step": 471 + }, + { + "epoch": 0.11364593992656354, + "grad_norm": 3.4895570278167725, + "learning_rate": 0.00019638264158459566, + "loss": 0.8143, + "step": 472 + }, + { + "epoch": 0.11388671522301812, + "grad_norm": 1.5952945947647095, + "learning_rate": 0.00019636182968406726, + "loss": 0.5789, + "step": 473 + }, + { + "epoch": 0.1141274905194727, + "grad_norm": 3.6532886028289795, + "learning_rate": 0.00019634095919532932, + "loss": 0.4563, + "step": 474 + }, + { + "epoch": 0.11436826581592728, + "grad_norm": 1.950562596321106, + "learning_rate": 0.00019632003013107113, + "loss": 0.6839, + "step": 475 + }, + { + "epoch": 0.11460904111238188, + "grad_norm": 6.8443779945373535, + "learning_rate": 0.00019629904250401757, + "loss": 0.5238, + "step": 476 + }, + { + "epoch": 0.11484981640883646, + "grad_norm": 3.7890400886535645, + "learning_rate": 0.00019627799632692923, + "loss": 0.6927, + "step": 477 + }, + { + "epoch": 0.11509059170529104, + "grad_norm": 6.215263366699219, + "learning_rate": 0.0001962568916126022, + "loss": 0.962, + "step": 478 + }, + { + "epoch": 0.11533136700174562, + "grad_norm": 2.3885769844055176, + "learning_rate": 0.0001962357283738682, + "loss": 0.5657, + "step": 479 + }, + { + "epoch": 0.1155721422982002, + "grad_norm": 2.069955587387085, + "learning_rate": 0.00019621450662359456, + "loss": 0.5302, + "step": 480 + }, + { + "epoch": 0.11581291759465479, + "grad_norm": 2.8343095779418945, + "learning_rate": 0.0001961932263746841, + "loss": 0.3862, + "step": 481 + }, + { + "epoch": 0.11605369289110937, + "grad_norm": 2.8576223850250244, + "learning_rate": 0.00019617188764007524, + "loss": 1.4014, + "step": 482 + }, + { + "epoch": 0.11629446818756396, + "grad_norm": 2.8722829818725586, + "learning_rate": 0.00019615049043274205, + "loss": 0.3474, + "step": 483 + }, + { + "epoch": 0.11653524348401854, + "grad_norm": 3.647714376449585, + "learning_rate": 0.00019612903476569406, + "loss": 0.8658, + "step": 484 + }, + { + "epoch": 0.11677601878047313, + "grad_norm": 6.923486232757568, + "learning_rate": 0.00019610752065197634, + "loss": 1.0191, + "step": 485 + }, + { + "epoch": 0.11701679407692771, + "grad_norm": 2.716620683670044, + "learning_rate": 0.0001960859481046695, + "loss": 0.6586, + "step": 486 + }, + { + "epoch": 0.11725756937338229, + "grad_norm": 3.657470941543579, + "learning_rate": 0.00019606431713688975, + "loss": 0.5454, + "step": 487 + }, + { + "epoch": 0.11749834466983687, + "grad_norm": 4.070058345794678, + "learning_rate": 0.00019604262776178876, + "loss": 0.5342, + "step": 488 + }, + { + "epoch": 0.11773911996629145, + "grad_norm": 5.016479015350342, + "learning_rate": 0.0001960208799925537, + "loss": 0.4871, + "step": 489 + }, + { + "epoch": 0.11797989526274605, + "grad_norm": 0.8800312280654907, + "learning_rate": 0.00019599907384240726, + "loss": 0.7974, + "step": 490 + }, + { + "epoch": 0.11822067055920063, + "grad_norm": 1.534217119216919, + "learning_rate": 0.00019597720932460763, + "loss": 0.5083, + "step": 491 + }, + { + "epoch": 0.11846144585565521, + "grad_norm": 2.766813278198242, + "learning_rate": 0.0001959552864524485, + "loss": 0.7765, + "step": 492 + }, + { + "epoch": 0.1187022211521098, + "grad_norm": 8.230446815490723, + "learning_rate": 0.00019593330523925902, + "loss": 0.6511, + "step": 493 + }, + { + "epoch": 0.11894299644856438, + "grad_norm": 2.781522750854492, + "learning_rate": 0.00019591126569840382, + "loss": 0.4147, + "step": 494 + }, + { + "epoch": 0.11918377174501896, + "grad_norm": 4.93475341796875, + "learning_rate": 0.00019588916784328295, + "loss": 0.4109, + "step": 495 + }, + { + "epoch": 0.11942454704147354, + "grad_norm": 3.2182798385620117, + "learning_rate": 0.00019586701168733202, + "loss": 0.5109, + "step": 496 + }, + { + "epoch": 0.11966532233792813, + "grad_norm": 5.978203773498535, + "learning_rate": 0.00019584479724402197, + "loss": 0.5933, + "step": 497 + }, + { + "epoch": 0.11990609763438272, + "grad_norm": 2.445081949234009, + "learning_rate": 0.00019582252452685927, + "loss": 1.1266, + "step": 498 + }, + { + "epoch": 0.1201468729308373, + "grad_norm": 2.4677765369415283, + "learning_rate": 0.0001958001935493858, + "loss": 0.4697, + "step": 499 + }, + { + "epoch": 0.12038764822729188, + "grad_norm": 0.8610912561416626, + "learning_rate": 0.00019577780432517879, + "loss": 1.0763, + "step": 500 + }, + { + "epoch": 0.12062842352374646, + "grad_norm": 1.9866464138031006, + "learning_rate": 0.0001957553568678509, + "loss": 0.729, + "step": 501 + }, + { + "epoch": 0.12086919882020104, + "grad_norm": 2.3875463008880615, + "learning_rate": 0.00019573285119105037, + "loss": 0.8719, + "step": 502 + }, + { + "epoch": 0.12110997411665562, + "grad_norm": 4.172793388366699, + "learning_rate": 0.0001957102873084606, + "loss": 1.0308, + "step": 503 + }, + { + "epoch": 0.12135074941311022, + "grad_norm": 1.5716460943222046, + "learning_rate": 0.0001956876652338005, + "loss": 1.0994, + "step": 504 + }, + { + "epoch": 0.1215915247095648, + "grad_norm": 8.024327278137207, + "learning_rate": 0.00019566498498082438, + "loss": 0.437, + "step": 505 + }, + { + "epoch": 0.12183230000601938, + "grad_norm": 2.5161705017089844, + "learning_rate": 0.0001956422465633218, + "loss": 1.0868, + "step": 506 + }, + { + "epoch": 0.12207307530247397, + "grad_norm": 4.083341598510742, + "learning_rate": 0.0001956194499951179, + "loss": 0.694, + "step": 507 + }, + { + "epoch": 0.12231385059892855, + "grad_norm": 2.113607406616211, + "learning_rate": 0.00019559659529007293, + "loss": 0.8918, + "step": 508 + }, + { + "epoch": 0.12255462589538313, + "grad_norm": 2.2010605335235596, + "learning_rate": 0.00019557368246208263, + "loss": 0.2703, + "step": 509 + }, + { + "epoch": 0.12279540119183771, + "grad_norm": 2.9058799743652344, + "learning_rate": 0.0001955507115250781, + "loss": 1.207, + "step": 510 + }, + { + "epoch": 0.1230361764882923, + "grad_norm": 7.344447612762451, + "learning_rate": 0.00019552768249302566, + "loss": 1.1835, + "step": 511 + }, + { + "epoch": 0.12327695178474689, + "grad_norm": 1.3118301630020142, + "learning_rate": 0.00019550459537992704, + "loss": 0.5164, + "step": 512 + }, + { + "epoch": 0.12351772708120147, + "grad_norm": 1.65935480594635, + "learning_rate": 0.00019548145019981924, + "loss": 0.7932, + "step": 513 + }, + { + "epoch": 0.12375850237765605, + "grad_norm": 3.032277822494507, + "learning_rate": 0.0001954582469667746, + "loss": 0.2637, + "step": 514 + }, + { + "epoch": 0.12399927767411063, + "grad_norm": 4.980113983154297, + "learning_rate": 0.00019543498569490076, + "loss": 0.8955, + "step": 515 + }, + { + "epoch": 0.12424005297056522, + "grad_norm": 4.82036828994751, + "learning_rate": 0.00019541166639834058, + "loss": 1.1343, + "step": 516 + }, + { + "epoch": 0.1244808282670198, + "grad_norm": 1.410509705543518, + "learning_rate": 0.0001953882890912723, + "loss": 0.8404, + "step": 517 + }, + { + "epoch": 0.12472160356347439, + "grad_norm": 4.177162170410156, + "learning_rate": 0.00019536485378790928, + "loss": 1.0445, + "step": 518 + }, + { + "epoch": 0.12496237885992897, + "grad_norm": 1.0933364629745483, + "learning_rate": 0.00019534136050250033, + "loss": 0.5387, + "step": 519 + }, + { + "epoch": 0.12520315415638356, + "grad_norm": 1.1372244358062744, + "learning_rate": 0.00019531780924932939, + "loss": 0.5226, + "step": 520 + }, + { + "epoch": 0.12544392945283814, + "grad_norm": 43.66477966308594, + "learning_rate": 0.00019529420004271567, + "loss": 0.2868, + "step": 521 + }, + { + "epoch": 0.12568470474929272, + "grad_norm": 3.931898593902588, + "learning_rate": 0.0001952705328970136, + "loss": 0.5439, + "step": 522 + }, + { + "epoch": 0.1259254800457473, + "grad_norm": 1.9407854080200195, + "learning_rate": 0.00019524680782661294, + "loss": 0.9395, + "step": 523 + }, + { + "epoch": 0.12616625534220188, + "grad_norm": 3.817629814147949, + "learning_rate": 0.0001952230248459385, + "loss": 1.0245, + "step": 524 + }, + { + "epoch": 0.12640703063865646, + "grad_norm": 6.78740119934082, + "learning_rate": 0.0001951991839694504, + "loss": 0.5898, + "step": 525 + }, + { + "epoch": 0.12664780593511105, + "grad_norm": 3.067821979522705, + "learning_rate": 0.00019517528521164395, + "loss": 1.1765, + "step": 526 + }, + { + "epoch": 0.12688858123156563, + "grad_norm": 3.173957109451294, + "learning_rate": 0.00019515132858704965, + "loss": 0.9526, + "step": 527 + }, + { + "epoch": 0.12712935652802024, + "grad_norm": 2.5016558170318604, + "learning_rate": 0.00019512731411023323, + "loss": 1.0662, + "step": 528 + }, + { + "epoch": 0.12737013182447482, + "grad_norm": 3.3116912841796875, + "learning_rate": 0.00019510324179579548, + "loss": 1.3049, + "step": 529 + }, + { + "epoch": 0.1276109071209294, + "grad_norm": 4.086653709411621, + "learning_rate": 0.00019507911165837248, + "loss": 1.0897, + "step": 530 + }, + { + "epoch": 0.12785168241738398, + "grad_norm": 7.5260329246521, + "learning_rate": 0.00019505492371263533, + "loss": 1.0542, + "step": 531 + }, + { + "epoch": 0.12809245771383856, + "grad_norm": 4.74697208404541, + "learning_rate": 0.00019503067797329044, + "loss": 1.4031, + "step": 532 + }, + { + "epoch": 0.12833323301029315, + "grad_norm": 3.090668201446533, + "learning_rate": 0.0001950063744550792, + "loss": 0.4726, + "step": 533 + }, + { + "epoch": 0.12857400830674773, + "grad_norm": 2.889418840408325, + "learning_rate": 0.00019498201317277828, + "loss": 1.3182, + "step": 534 + }, + { + "epoch": 0.1288147836032023, + "grad_norm": 2.548130989074707, + "learning_rate": 0.00019495759414119932, + "loss": 0.6617, + "step": 535 + }, + { + "epoch": 0.1290555588996569, + "grad_norm": 2.9702346324920654, + "learning_rate": 0.0001949331173751892, + "loss": 0.7535, + "step": 536 + }, + { + "epoch": 0.12929633419611147, + "grad_norm": 6.834994316101074, + "learning_rate": 0.00019490858288962983, + "loss": 1.2718, + "step": 537 + }, + { + "epoch": 0.12953710949256605, + "grad_norm": 6.125328540802002, + "learning_rate": 0.00019488399069943823, + "loss": 0.6736, + "step": 538 + }, + { + "epoch": 0.12977788478902064, + "grad_norm": 5.69896125793457, + "learning_rate": 0.0001948593408195665, + "loss": 0.6771, + "step": 539 + }, + { + "epoch": 0.13001866008547522, + "grad_norm": 2.1542887687683105, + "learning_rate": 0.0001948346332650018, + "loss": 0.2843, + "step": 540 + }, + { + "epoch": 0.13025943538192983, + "grad_norm": 4.776561737060547, + "learning_rate": 0.0001948098680507665, + "loss": 0.5372, + "step": 541 + }, + { + "epoch": 0.1305002106783844, + "grad_norm": 1.1416128873825073, + "learning_rate": 0.00019478504519191773, + "loss": 0.7292, + "step": 542 + }, + { + "epoch": 0.130740985974839, + "grad_norm": 1.7264859676361084, + "learning_rate": 0.00019476016470354796, + "loss": 0.5956, + "step": 543 + }, + { + "epoch": 0.13098176127129357, + "grad_norm": 2.4325296878814697, + "learning_rate": 0.00019473522660078455, + "loss": 0.819, + "step": 544 + }, + { + "epoch": 0.13122253656774815, + "grad_norm": 2.0552382469177246, + "learning_rate": 0.00019471023089878995, + "loss": 1.0633, + "step": 545 + }, + { + "epoch": 0.13146331186420274, + "grad_norm": 6.430831432342529, + "learning_rate": 0.00019468517761276154, + "loss": 0.711, + "step": 546 + }, + { + "epoch": 0.13170408716065732, + "grad_norm": 3.12066650390625, + "learning_rate": 0.00019466006675793185, + "loss": 0.525, + "step": 547 + }, + { + "epoch": 0.1319448624571119, + "grad_norm": 18.034626007080078, + "learning_rate": 0.00019463489834956827, + "loss": 0.7595, + "step": 548 + }, + { + "epoch": 0.13218563775356648, + "grad_norm": 17.345428466796875, + "learning_rate": 0.0001946096724029733, + "loss": 1.1251, + "step": 549 + }, + { + "epoch": 0.13242641305002106, + "grad_norm": 1.709258794784546, + "learning_rate": 0.00019458438893348433, + "loss": 1.4069, + "step": 550 + }, + { + "epoch": 0.13266718834647565, + "grad_norm": 2.9705605506896973, + "learning_rate": 0.0001945590479564738, + "loss": 0.8629, + "step": 551 + }, + { + "epoch": 0.13290796364293023, + "grad_norm": 1.2169429063796997, + "learning_rate": 0.00019453364948734906, + "loss": 0.4889, + "step": 552 + }, + { + "epoch": 0.1331487389393848, + "grad_norm": 2.59025502204895, + "learning_rate": 0.00019450819354155244, + "loss": 0.1758, + "step": 553 + }, + { + "epoch": 0.1333895142358394, + "grad_norm": 1.7973146438598633, + "learning_rate": 0.00019448268013456125, + "loss": 0.9624, + "step": 554 + }, + { + "epoch": 0.133630289532294, + "grad_norm": 1.6008778810501099, + "learning_rate": 0.00019445710928188764, + "loss": 0.8347, + "step": 555 + }, + { + "epoch": 0.13387106482874858, + "grad_norm": 2.505977153778076, + "learning_rate": 0.00019443148099907877, + "loss": 0.3091, + "step": 556 + }, + { + "epoch": 0.13411184012520316, + "grad_norm": 3.7619707584381104, + "learning_rate": 0.0001944057953017167, + "loss": 0.7405, + "step": 557 + }, + { + "epoch": 0.13435261542165775, + "grad_norm": 2.6600496768951416, + "learning_rate": 0.0001943800522054184, + "loss": 0.5151, + "step": 558 + }, + { + "epoch": 0.13459339071811233, + "grad_norm": 5.565666198730469, + "learning_rate": 0.0001943542517258357, + "loss": 1.0332, + "step": 559 + }, + { + "epoch": 0.1348341660145669, + "grad_norm": 2.515794277191162, + "learning_rate": 0.00019432839387865537, + "loss": 1.1725, + "step": 560 + }, + { + "epoch": 0.1350749413110215, + "grad_norm": 3.981748104095459, + "learning_rate": 0.00019430247867959906, + "loss": 0.5203, + "step": 561 + }, + { + "epoch": 0.13531571660747607, + "grad_norm": 2.138054847717285, + "learning_rate": 0.00019427650614442323, + "loss": 0.0975, + "step": 562 + }, + { + "epoch": 0.13555649190393065, + "grad_norm": 4.705209255218506, + "learning_rate": 0.00019425047628891925, + "loss": 0.8184, + "step": 563 + }, + { + "epoch": 0.13579726720038524, + "grad_norm": 1.8869285583496094, + "learning_rate": 0.00019422438912891337, + "loss": 1.061, + "step": 564 + }, + { + "epoch": 0.13603804249683982, + "grad_norm": 5.188673973083496, + "learning_rate": 0.00019419824468026655, + "loss": 0.9384, + "step": 565 + }, + { + "epoch": 0.1362788177932944, + "grad_norm": 3.5460383892059326, + "learning_rate": 0.0001941720429588748, + "loss": 0.6326, + "step": 566 + }, + { + "epoch": 0.13651959308974898, + "grad_norm": 3.3124594688415527, + "learning_rate": 0.00019414578398066872, + "loss": 1.625, + "step": 567 + }, + { + "epoch": 0.13676036838620356, + "grad_norm": 1.9925857782363892, + "learning_rate": 0.00019411946776161387, + "loss": 0.86, + "step": 568 + }, + { + "epoch": 0.13700114368265817, + "grad_norm": 2.7330362796783447, + "learning_rate": 0.00019409309431771057, + "loss": 0.5012, + "step": 569 + }, + { + "epoch": 0.13724191897911275, + "grad_norm": 5.8978776931762695, + "learning_rate": 0.00019406666366499393, + "loss": 0.8465, + "step": 570 + }, + { + "epoch": 0.13748269427556734, + "grad_norm": 1.46619713306427, + "learning_rate": 0.00019404017581953385, + "loss": 0.5121, + "step": 571 + }, + { + "epoch": 0.13772346957202192, + "grad_norm": 3.0455288887023926, + "learning_rate": 0.000194013630797435, + "loss": 0.6288, + "step": 572 + }, + { + "epoch": 0.1379642448684765, + "grad_norm": 2.932802677154541, + "learning_rate": 0.00019398702861483678, + "loss": 0.9645, + "step": 573 + }, + { + "epoch": 0.13820502016493108, + "grad_norm": 4.07331657409668, + "learning_rate": 0.00019396036928791345, + "loss": 0.7568, + "step": 574 + }, + { + "epoch": 0.13844579546138566, + "grad_norm": 2.658447027206421, + "learning_rate": 0.00019393365283287386, + "loss": 0.9391, + "step": 575 + }, + { + "epoch": 0.13868657075784024, + "grad_norm": 1.1309797763824463, + "learning_rate": 0.00019390687926596173, + "loss": 0.8911, + "step": 576 + }, + { + "epoch": 0.13892734605429483, + "grad_norm": 6.038357734680176, + "learning_rate": 0.00019388004860345544, + "loss": 0.4398, + "step": 577 + }, + { + "epoch": 0.1391681213507494, + "grad_norm": 5.158764362335205, + "learning_rate": 0.0001938531608616681, + "loss": 0.5778, + "step": 578 + }, + { + "epoch": 0.139408896647204, + "grad_norm": 0.5642886161804199, + "learning_rate": 0.00019382621605694745, + "loss": 0.5383, + "step": 579 + }, + { + "epoch": 0.13964967194365857, + "grad_norm": 2.256866455078125, + "learning_rate": 0.00019379921420567607, + "loss": 0.5772, + "step": 580 + }, + { + "epoch": 0.13989044724011315, + "grad_norm": 2.100571870803833, + "learning_rate": 0.00019377215532427115, + "loss": 0.9185, + "step": 581 + }, + { + "epoch": 0.14013122253656773, + "grad_norm": 6.56969690322876, + "learning_rate": 0.0001937450394291845, + "loss": 1.0287, + "step": 582 + }, + { + "epoch": 0.14037199783302234, + "grad_norm": 2.601640462875366, + "learning_rate": 0.00019371786653690266, + "loss": 1.5671, + "step": 583 + }, + { + "epoch": 0.14061277312947693, + "grad_norm": 5.737114906311035, + "learning_rate": 0.00019369063666394682, + "loss": 0.9538, + "step": 584 + }, + { + "epoch": 0.1408535484259315, + "grad_norm": 3.682819128036499, + "learning_rate": 0.0001936633498268728, + "loss": 0.8473, + "step": 585 + }, + { + "epoch": 0.1410943237223861, + "grad_norm": 3.1207540035247803, + "learning_rate": 0.00019363600604227105, + "loss": 0.5173, + "step": 586 + }, + { + "epoch": 0.14133509901884067, + "grad_norm": 2.8920652866363525, + "learning_rate": 0.0001936086053267667, + "loss": 0.7551, + "step": 587 + }, + { + "epoch": 0.14157587431529525, + "grad_norm": 4.445816993713379, + "learning_rate": 0.00019358114769701937, + "loss": 0.6121, + "step": 588 + }, + { + "epoch": 0.14181664961174983, + "grad_norm": 2.9601528644561768, + "learning_rate": 0.00019355363316972342, + "loss": 1.3002, + "step": 589 + }, + { + "epoch": 0.14205742490820442, + "grad_norm": 6.101936340332031, + "learning_rate": 0.0001935260617616077, + "loss": 0.7764, + "step": 590 + }, + { + "epoch": 0.142298200204659, + "grad_norm": 4.4996562004089355, + "learning_rate": 0.00019349843348943574, + "loss": 0.744, + "step": 591 + }, + { + "epoch": 0.14253897550111358, + "grad_norm": 1.1355993747711182, + "learning_rate": 0.00019347074837000554, + "loss": 0.3457, + "step": 592 + }, + { + "epoch": 0.14277975079756816, + "grad_norm": 5.778316497802734, + "learning_rate": 0.00019344300642014974, + "loss": 1.1338, + "step": 593 + }, + { + "epoch": 0.14302052609402274, + "grad_norm": 1.4276717901229858, + "learning_rate": 0.00019341520765673553, + "loss": 0.4207, + "step": 594 + }, + { + "epoch": 0.14326130139047732, + "grad_norm": 2.5159173011779785, + "learning_rate": 0.00019338735209666457, + "loss": 0.6524, + "step": 595 + }, + { + "epoch": 0.14350207668693193, + "grad_norm": 1.4529104232788086, + "learning_rate": 0.00019335943975687316, + "loss": 0.4851, + "step": 596 + }, + { + "epoch": 0.14374285198338652, + "grad_norm": 3.2440574169158936, + "learning_rate": 0.000193331470654332, + "loss": 0.8624, + "step": 597 + }, + { + "epoch": 0.1439836272798411, + "grad_norm": 0.9937834739685059, + "learning_rate": 0.00019330344480604646, + "loss": 0.5917, + "step": 598 + }, + { + "epoch": 0.14422440257629568, + "grad_norm": 2.456488609313965, + "learning_rate": 0.00019327536222905623, + "loss": 0.3061, + "step": 599 + }, + { + "epoch": 0.14446517787275026, + "grad_norm": 4.499001979827881, + "learning_rate": 0.00019324722294043558, + "loss": 0.8591, + "step": 600 + }, + { + "epoch": 0.14470595316920484, + "grad_norm": 1.5652315616607666, + "learning_rate": 0.0001932190269572933, + "loss": 0.1562, + "step": 601 + }, + { + "epoch": 0.14494672846565942, + "grad_norm": 2.7683820724487305, + "learning_rate": 0.00019319077429677268, + "loss": 0.4619, + "step": 602 + }, + { + "epoch": 0.145187503762114, + "grad_norm": 2.41717529296875, + "learning_rate": 0.00019316246497605127, + "loss": 0.8059, + "step": 603 + }, + { + "epoch": 0.1454282790585686, + "grad_norm": 3.7864205837249756, + "learning_rate": 0.00019313409901234127, + "loss": 0.9378, + "step": 604 + }, + { + "epoch": 0.14566905435502317, + "grad_norm": 2.745898723602295, + "learning_rate": 0.00019310567642288922, + "loss": 0.4861, + "step": 605 + }, + { + "epoch": 0.14590982965147775, + "grad_norm": 2.9701218605041504, + "learning_rate": 0.00019307719722497612, + "loss": 0.9418, + "step": 606 + }, + { + "epoch": 0.14615060494793233, + "grad_norm": 5.684365749359131, + "learning_rate": 0.00019304866143591746, + "loss": 0.7847, + "step": 607 + }, + { + "epoch": 0.14639138024438691, + "grad_norm": 18.988853454589844, + "learning_rate": 0.00019302006907306296, + "loss": 1.1569, + "step": 608 + }, + { + "epoch": 0.1466321555408415, + "grad_norm": 3.721798896789551, + "learning_rate": 0.0001929914201537969, + "loss": 0.8478, + "step": 609 + }, + { + "epoch": 0.1468729308372961, + "grad_norm": 1.7376899719238281, + "learning_rate": 0.00019296271469553786, + "loss": 0.5951, + "step": 610 + }, + { + "epoch": 0.1471137061337507, + "grad_norm": 13.979349136352539, + "learning_rate": 0.00019293395271573885, + "loss": 0.6307, + "step": 611 + }, + { + "epoch": 0.14735448143020527, + "grad_norm": 3.307643175125122, + "learning_rate": 0.00019290513423188724, + "loss": 0.954, + "step": 612 + }, + { + "epoch": 0.14759525672665985, + "grad_norm": 2.229158878326416, + "learning_rate": 0.00019287625926150465, + "loss": 0.9561, + "step": 613 + }, + { + "epoch": 0.14783603202311443, + "grad_norm": 1.0365084409713745, + "learning_rate": 0.0001928473278221472, + "loss": 1.1344, + "step": 614 + }, + { + "epoch": 0.14807680731956901, + "grad_norm": 2.6409239768981934, + "learning_rate": 0.00019281833993140525, + "loss": 0.657, + "step": 615 + }, + { + "epoch": 0.1483175826160236, + "grad_norm": 1.3793067932128906, + "learning_rate": 0.00019278929560690347, + "loss": 0.6292, + "step": 616 + }, + { + "epoch": 0.14855835791247818, + "grad_norm": 1.6385407447814941, + "learning_rate": 0.00019276019486630093, + "loss": 0.4795, + "step": 617 + }, + { + "epoch": 0.14879913320893276, + "grad_norm": 4.453542232513428, + "learning_rate": 0.00019273103772729093, + "loss": 1.0279, + "step": 618 + }, + { + "epoch": 0.14903990850538734, + "grad_norm": 0.5888392925262451, + "learning_rate": 0.00019270182420760102, + "loss": 0.3529, + "step": 619 + }, + { + "epoch": 0.14928068380184192, + "grad_norm": 2.5179574489593506, + "learning_rate": 0.00019267255432499318, + "loss": 0.5329, + "step": 620 + }, + { + "epoch": 0.1495214590982965, + "grad_norm": 10.619978904724121, + "learning_rate": 0.0001926432280972635, + "loss": 0.828, + "step": 621 + }, + { + "epoch": 0.1497622343947511, + "grad_norm": 0.41896963119506836, + "learning_rate": 0.0001926138455422424, + "loss": 0.5892, + "step": 622 + }, + { + "epoch": 0.15000300969120567, + "grad_norm": 3.307152032852173, + "learning_rate": 0.00019258440667779456, + "loss": 0.9538, + "step": 623 + }, + { + "epoch": 0.15024378498766028, + "grad_norm": 1.9945799112319946, + "learning_rate": 0.00019255491152181885, + "loss": 0.3184, + "step": 624 + }, + { + "epoch": 0.15048456028411486, + "grad_norm": 2.7123000621795654, + "learning_rate": 0.00019252536009224845, + "loss": 0.5069, + "step": 625 + }, + { + "epoch": 0.15072533558056944, + "grad_norm": 2.0505239963531494, + "learning_rate": 0.0001924957524070506, + "loss": 0.6904, + "step": 626 + }, + { + "epoch": 0.15096611087702402, + "grad_norm": 2.483839273452759, + "learning_rate": 0.00019246608848422691, + "loss": 1.4015, + "step": 627 + }, + { + "epoch": 0.1512068861734786, + "grad_norm": 3.842451333999634, + "learning_rate": 0.00019243636834181312, + "loss": 0.5501, + "step": 628 + }, + { + "epoch": 0.1514476614699332, + "grad_norm": 1.5859034061431885, + "learning_rate": 0.00019240659199787908, + "loss": 0.5125, + "step": 629 + }, + { + "epoch": 0.15168843676638777, + "grad_norm": 1.8935115337371826, + "learning_rate": 0.0001923767594705289, + "loss": 0.4354, + "step": 630 + }, + { + "epoch": 0.15192921206284235, + "grad_norm": 4.2323384284973145, + "learning_rate": 0.00019234687077790085, + "loss": 0.8988, + "step": 631 + }, + { + "epoch": 0.15216998735929693, + "grad_norm": 1.3674668073654175, + "learning_rate": 0.00019231692593816733, + "loss": 0.3303, + "step": 632 + }, + { + "epoch": 0.1524107626557515, + "grad_norm": 7.714446544647217, + "learning_rate": 0.0001922869249695348, + "loss": 0.2196, + "step": 633 + }, + { + "epoch": 0.1526515379522061, + "grad_norm": 3.0279879570007324, + "learning_rate": 0.00019225686789024402, + "loss": 0.6256, + "step": 634 + }, + { + "epoch": 0.15289231324866068, + "grad_norm": 4.163952350616455, + "learning_rate": 0.0001922267547185697, + "loss": 0.9441, + "step": 635 + }, + { + "epoch": 0.15313308854511526, + "grad_norm": 1.383583426475525, + "learning_rate": 0.00019219658547282067, + "loss": 0.7899, + "step": 636 + }, + { + "epoch": 0.15337386384156987, + "grad_norm": 1.438839077949524, + "learning_rate": 0.00019216636017133998, + "loss": 0.4349, + "step": 637 + }, + { + "epoch": 0.15361463913802445, + "grad_norm": 7.890371322631836, + "learning_rate": 0.00019213607883250466, + "loss": 1.5545, + "step": 638 + }, + { + "epoch": 0.15385541443447903, + "grad_norm": 6.0160746574401855, + "learning_rate": 0.0001921057414747258, + "loss": 1.8333, + "step": 639 + }, + { + "epoch": 0.1540961897309336, + "grad_norm": 1.7680754661560059, + "learning_rate": 0.00019207534811644864, + "loss": 0.805, + "step": 640 + }, + { + "epoch": 0.1543369650273882, + "grad_norm": 3.0242257118225098, + "learning_rate": 0.00019204489877615237, + "loss": 0.4745, + "step": 641 + }, + { + "epoch": 0.15457774032384278, + "grad_norm": 1.6106970310211182, + "learning_rate": 0.00019201439347235025, + "loss": 0.5615, + "step": 642 + }, + { + "epoch": 0.15481851562029736, + "grad_norm": 3.6016252040863037, + "learning_rate": 0.0001919838322235896, + "loss": 1.3254, + "step": 643 + }, + { + "epoch": 0.15505929091675194, + "grad_norm": 6.142489433288574, + "learning_rate": 0.00019195321504845173, + "loss": 0.5939, + "step": 644 + }, + { + "epoch": 0.15530006621320652, + "grad_norm": 2.9963788986206055, + "learning_rate": 0.00019192254196555191, + "loss": 0.8563, + "step": 645 + }, + { + "epoch": 0.1555408415096611, + "grad_norm": 2.010145664215088, + "learning_rate": 0.00019189181299353946, + "loss": 0.6641, + "step": 646 + }, + { + "epoch": 0.15578161680611569, + "grad_norm": 3.030747890472412, + "learning_rate": 0.0001918610281510977, + "loss": 1.0257, + "step": 647 + }, + { + "epoch": 0.15602239210257027, + "grad_norm": 3.0926742553710938, + "learning_rate": 0.0001918301874569439, + "loss": 0.7438, + "step": 648 + }, + { + "epoch": 0.15626316739902485, + "grad_norm": 3.063593864440918, + "learning_rate": 0.00019179929092982912, + "loss": 0.6192, + "step": 649 + }, + { + "epoch": 0.15650394269547943, + "grad_norm": 1.6936414241790771, + "learning_rate": 0.0001917683385885387, + "loss": 0.3439, + "step": 650 + }, + { + "epoch": 0.15674471799193404, + "grad_norm": 27.274925231933594, + "learning_rate": 0.0001917373304518917, + "loss": 0.8737, + "step": 651 + }, + { + "epoch": 0.15698549328838862, + "grad_norm": 2.2580983638763428, + "learning_rate": 0.000191706266538741, + "loss": 0.9577, + "step": 652 + }, + { + "epoch": 0.1572262685848432, + "grad_norm": 1.4257384538650513, + "learning_rate": 0.00019167514686797369, + "loss": 0.1513, + "step": 653 + }, + { + "epoch": 0.15746704388129779, + "grad_norm": 2.24150013923645, + "learning_rate": 0.00019164397145851055, + "loss": 0.6569, + "step": 654 + }, + { + "epoch": 0.15770781917775237, + "grad_norm": 5.1359758377075195, + "learning_rate": 0.00019161274032930626, + "loss": 0.9886, + "step": 655 + }, + { + "epoch": 0.15794859447420695, + "grad_norm": 2.413954734802246, + "learning_rate": 0.00019158145349934945, + "loss": 0.2666, + "step": 656 + }, + { + "epoch": 0.15818936977066153, + "grad_norm": 0.6739373803138733, + "learning_rate": 0.00019155011098766255, + "loss": 0.5449, + "step": 657 + }, + { + "epoch": 0.1584301450671161, + "grad_norm": 0.7366794943809509, + "learning_rate": 0.00019151871281330193, + "loss": 0.2757, + "step": 658 + }, + { + "epoch": 0.1586709203635707, + "grad_norm": 2.2127983570098877, + "learning_rate": 0.00019148725899535774, + "loss": 0.5392, + "step": 659 + }, + { + "epoch": 0.15891169566002528, + "grad_norm": 1.907882571220398, + "learning_rate": 0.00019145574955295395, + "loss": 0.4752, + "step": 660 + }, + { + "epoch": 0.15915247095647986, + "grad_norm": 4.098206520080566, + "learning_rate": 0.00019142418450524836, + "loss": 0.9706, + "step": 661 + }, + { + "epoch": 0.15939324625293444, + "grad_norm": 3.782545804977417, + "learning_rate": 0.00019139256387143262, + "loss": 1.0815, + "step": 662 + }, + { + "epoch": 0.15963402154938902, + "grad_norm": 2.8690521717071533, + "learning_rate": 0.00019136088767073215, + "loss": 1.0296, + "step": 663 + }, + { + "epoch": 0.1598747968458436, + "grad_norm": 6.640118598937988, + "learning_rate": 0.00019132915592240613, + "loss": 0.6574, + "step": 664 + }, + { + "epoch": 0.1601155721422982, + "grad_norm": 5.299488067626953, + "learning_rate": 0.00019129736864574755, + "loss": 0.9321, + "step": 665 + }, + { + "epoch": 0.1603563474387528, + "grad_norm": 1.4800339937210083, + "learning_rate": 0.0001912655258600831, + "loss": 1.0515, + "step": 666 + }, + { + "epoch": 0.16059712273520738, + "grad_norm": 4.096741199493408, + "learning_rate": 0.00019123362758477334, + "loss": 0.8097, + "step": 667 + }, + { + "epoch": 0.16083789803166196, + "grad_norm": 1.2806522846221924, + "learning_rate": 0.00019120167383921243, + "loss": 0.5217, + "step": 668 + }, + { + "epoch": 0.16107867332811654, + "grad_norm": 2.5771350860595703, + "learning_rate": 0.0001911696646428284, + "loss": 0.725, + "step": 669 + }, + { + "epoch": 0.16131944862457112, + "grad_norm": 2.9327738285064697, + "learning_rate": 0.0001911376000150828, + "loss": 0.7475, + "step": 670 + }, + { + "epoch": 0.1615602239210257, + "grad_norm": 3.3815646171569824, + "learning_rate": 0.00019110547997547108, + "loss": 0.935, + "step": 671 + }, + { + "epoch": 0.16180099921748028, + "grad_norm": 7.282792568206787, + "learning_rate": 0.00019107330454352228, + "loss": 1.0584, + "step": 672 + }, + { + "epoch": 0.16204177451393487, + "grad_norm": 12.47275447845459, + "learning_rate": 0.00019104107373879909, + "loss": 0.6211, + "step": 673 + }, + { + "epoch": 0.16228254981038945, + "grad_norm": 1.406531572341919, + "learning_rate": 0.00019100878758089798, + "loss": 0.5329, + "step": 674 + }, + { + "epoch": 0.16252332510684403, + "grad_norm": 2.693037748336792, + "learning_rate": 0.00019097644608944897, + "loss": 0.6528, + "step": 675 + }, + { + "epoch": 0.1627641004032986, + "grad_norm": 0.5329806804656982, + "learning_rate": 0.0001909440492841158, + "loss": 0.4698, + "step": 676 + }, + { + "epoch": 0.1630048756997532, + "grad_norm": 3.925929069519043, + "learning_rate": 0.0001909115971845957, + "loss": 0.6919, + "step": 677 + }, + { + "epoch": 0.1632456509962078, + "grad_norm": 9.350509643554688, + "learning_rate": 0.00019087908981061972, + "loss": 1.1159, + "step": 678 + }, + { + "epoch": 0.16348642629266238, + "grad_norm": 6.900551795959473, + "learning_rate": 0.00019084652718195238, + "loss": 0.5557, + "step": 679 + }, + { + "epoch": 0.16372720158911697, + "grad_norm": 1.4014828205108643, + "learning_rate": 0.00019081390931839181, + "loss": 0.997, + "step": 680 + }, + { + "epoch": 0.16396797688557155, + "grad_norm": 7.637568950653076, + "learning_rate": 0.0001907812362397698, + "loss": 1.3175, + "step": 681 + }, + { + "epoch": 0.16420875218202613, + "grad_norm": 1.3787779808044434, + "learning_rate": 0.00019074850796595163, + "loss": 0.4951, + "step": 682 + }, + { + "epoch": 0.1644495274784807, + "grad_norm": 3.6682255268096924, + "learning_rate": 0.00019071572451683614, + "loss": 1.0832, + "step": 683 + }, + { + "epoch": 0.1646903027749353, + "grad_norm": 25.37391471862793, + "learning_rate": 0.00019068288591235578, + "loss": 0.6875, + "step": 684 + }, + { + "epoch": 0.16493107807138987, + "grad_norm": 2.674971580505371, + "learning_rate": 0.00019064999217247643, + "loss": 0.9103, + "step": 685 + }, + { + "epoch": 0.16517185336784446, + "grad_norm": 3.5297887325286865, + "learning_rate": 0.00019061704331719764, + "loss": 0.8173, + "step": 686 + }, + { + "epoch": 0.16541262866429904, + "grad_norm": 1.2813355922698975, + "learning_rate": 0.00019058403936655233, + "loss": 0.3151, + "step": 687 + }, + { + "epoch": 0.16565340396075362, + "grad_norm": 3.667281150817871, + "learning_rate": 0.000190550980340607, + "loss": 0.6559, + "step": 688 + }, + { + "epoch": 0.1658941792572082, + "grad_norm": 2.3366219997406006, + "learning_rate": 0.00019051786625946162, + "loss": 0.5158, + "step": 689 + }, + { + "epoch": 0.16613495455366278, + "grad_norm": 1.1751844882965088, + "learning_rate": 0.00019048469714324958, + "loss": 0.8607, + "step": 690 + }, + { + "epoch": 0.16637572985011737, + "grad_norm": 3.535374164581299, + "learning_rate": 0.00019045147301213788, + "loss": 1.228, + "step": 691 + }, + { + "epoch": 0.16661650514657197, + "grad_norm": 4.35559606552124, + "learning_rate": 0.00019041819388632676, + "loss": 0.8601, + "step": 692 + }, + { + "epoch": 0.16685728044302656, + "grad_norm": 2.7030580043792725, + "learning_rate": 0.00019038485978605004, + "loss": 1.0164, + "step": 693 + }, + { + "epoch": 0.16709805573948114, + "grad_norm": 3.0144922733306885, + "learning_rate": 0.00019035147073157493, + "loss": 0.8172, + "step": 694 + }, + { + "epoch": 0.16733883103593572, + "grad_norm": 2.4854543209075928, + "learning_rate": 0.00019031802674320206, + "loss": 0.924, + "step": 695 + }, + { + "epoch": 0.1675796063323903, + "grad_norm": 2.9239442348480225, + "learning_rate": 0.00019028452784126542, + "loss": 0.796, + "step": 696 + }, + { + "epoch": 0.16782038162884488, + "grad_norm": 3.872009038925171, + "learning_rate": 0.00019025097404613245, + "loss": 0.4696, + "step": 697 + }, + { + "epoch": 0.16806115692529947, + "grad_norm": 1.675231695175171, + "learning_rate": 0.00019021736537820394, + "loss": 0.4549, + "step": 698 + }, + { + "epoch": 0.16830193222175405, + "grad_norm": 2.725574493408203, + "learning_rate": 0.000190183701857914, + "loss": 0.6834, + "step": 699 + }, + { + "epoch": 0.16854270751820863, + "grad_norm": 2.2455711364746094, + "learning_rate": 0.00019014998350573014, + "loss": 0.4471, + "step": 700 + }, + { + "epoch": 0.1687834828146632, + "grad_norm": 0.9234648942947388, + "learning_rate": 0.00019011621034215322, + "loss": 0.1788, + "step": 701 + }, + { + "epoch": 0.1690242581111178, + "grad_norm": 1.5781611204147339, + "learning_rate": 0.00019008238238771736, + "loss": 0.244, + "step": 702 + }, + { + "epoch": 0.16926503340757237, + "grad_norm": 5.697232246398926, + "learning_rate": 0.00019004849966299005, + "loss": 0.4329, + "step": 703 + }, + { + "epoch": 0.16950580870402696, + "grad_norm": 4.987598896026611, + "learning_rate": 0.00019001456218857208, + "loss": 0.9072, + "step": 704 + }, + { + "epoch": 0.16974658400048154, + "grad_norm": 2.579894781112671, + "learning_rate": 0.00018998056998509747, + "loss": 0.717, + "step": 705 + }, + { + "epoch": 0.16998735929693615, + "grad_norm": 3.0871734619140625, + "learning_rate": 0.00018994652307323363, + "loss": 0.2763, + "step": 706 + }, + { + "epoch": 0.17022813459339073, + "grad_norm": 2.6915767192840576, + "learning_rate": 0.00018991242147368105, + "loss": 0.8432, + "step": 707 + }, + { + "epoch": 0.1704689098898453, + "grad_norm": 4.125692844390869, + "learning_rate": 0.00018987826520717365, + "loss": 1.2892, + "step": 708 + }, + { + "epoch": 0.1707096851862999, + "grad_norm": 3.3036179542541504, + "learning_rate": 0.00018984405429447852, + "loss": 0.9282, + "step": 709 + }, + { + "epoch": 0.17095046048275447, + "grad_norm": 2.7406651973724365, + "learning_rate": 0.00018980978875639596, + "loss": 1.1154, + "step": 710 + }, + { + "epoch": 0.17119123577920906, + "grad_norm": 0.8988383412361145, + "learning_rate": 0.00018977546861375947, + "loss": 0.4264, + "step": 711 + }, + { + "epoch": 0.17143201107566364, + "grad_norm": 0.4057740867137909, + "learning_rate": 0.00018974109388743583, + "loss": 0.9764, + "step": 712 + }, + { + "epoch": 0.17167278637211822, + "grad_norm": 3.4650371074676514, + "learning_rate": 0.0001897066645983249, + "loss": 1.0979, + "step": 713 + }, + { + "epoch": 0.1719135616685728, + "grad_norm": 4.947608947753906, + "learning_rate": 0.00018967218076735976, + "loss": 0.7168, + "step": 714 + }, + { + "epoch": 0.17215433696502738, + "grad_norm": 1.033057451248169, + "learning_rate": 0.0001896376424155067, + "loss": 0.2137, + "step": 715 + }, + { + "epoch": 0.17239511226148196, + "grad_norm": 5.465882778167725, + "learning_rate": 0.00018960304956376511, + "loss": 1.7501, + "step": 716 + }, + { + "epoch": 0.17263588755793655, + "grad_norm": 3.3956429958343506, + "learning_rate": 0.00018956840223316752, + "loss": 0.5464, + "step": 717 + }, + { + "epoch": 0.17287666285439113, + "grad_norm": 0.9355387687683105, + "learning_rate": 0.00018953370044477955, + "loss": 0.3183, + "step": 718 + }, + { + "epoch": 0.1731174381508457, + "grad_norm": 0.6955990195274353, + "learning_rate": 0.00018949894421969998, + "loss": 0.4827, + "step": 719 + }, + { + "epoch": 0.17335821344730032, + "grad_norm": 9.664114952087402, + "learning_rate": 0.00018946413357906068, + "loss": 0.8839, + "step": 720 + }, + { + "epoch": 0.1735989887437549, + "grad_norm": 3.0460386276245117, + "learning_rate": 0.0001894292685440266, + "loss": 0.4881, + "step": 721 + }, + { + "epoch": 0.17383976404020948, + "grad_norm": 3.0840280055999756, + "learning_rate": 0.00018939434913579578, + "loss": 1.0241, + "step": 722 + }, + { + "epoch": 0.17408053933666406, + "grad_norm": 3.3748912811279297, + "learning_rate": 0.00018935937537559926, + "loss": 1.2437, + "step": 723 + }, + { + "epoch": 0.17432131463311865, + "grad_norm": 10.365636825561523, + "learning_rate": 0.00018932434728470118, + "loss": 0.762, + "step": 724 + }, + { + "epoch": 0.17456208992957323, + "grad_norm": 4.329830169677734, + "learning_rate": 0.00018928926488439869, + "loss": 0.7613, + "step": 725 + }, + { + "epoch": 0.1748028652260278, + "grad_norm": 4.144877910614014, + "learning_rate": 0.00018925412819602202, + "loss": 1.1638, + "step": 726 + }, + { + "epoch": 0.1750436405224824, + "grad_norm": 1.3736963272094727, + "learning_rate": 0.00018921893724093428, + "loss": 0.6176, + "step": 727 + }, + { + "epoch": 0.17528441581893697, + "grad_norm": 0.9337141513824463, + "learning_rate": 0.0001891836920405317, + "loss": 0.2855, + "step": 728 + }, + { + "epoch": 0.17552519111539155, + "grad_norm": 5.704214572906494, + "learning_rate": 0.0001891483926162434, + "loss": 0.5566, + "step": 729 + }, + { + "epoch": 0.17576596641184614, + "grad_norm": 1.9563344717025757, + "learning_rate": 0.00018911303898953158, + "loss": 0.5568, + "step": 730 + }, + { + "epoch": 0.17600674170830072, + "grad_norm": 5.422361850738525, + "learning_rate": 0.00018907763118189124, + "loss": 0.7783, + "step": 731 + }, + { + "epoch": 0.1762475170047553, + "grad_norm": 3.7933502197265625, + "learning_rate": 0.00018904216921485046, + "loss": 1.178, + "step": 732 + }, + { + "epoch": 0.1764882923012099, + "grad_norm": 2.3435802459716797, + "learning_rate": 0.00018900665310997018, + "loss": 0.5904, + "step": 733 + }, + { + "epoch": 0.1767290675976645, + "grad_norm": 6.887885093688965, + "learning_rate": 0.0001889710828888443, + "loss": 1.1331, + "step": 734 + }, + { + "epoch": 0.17696984289411907, + "grad_norm": 2.859257221221924, + "learning_rate": 0.00018893545857309954, + "loss": 0.8934, + "step": 735 + }, + { + "epoch": 0.17721061819057365, + "grad_norm": 3.1216025352478027, + "learning_rate": 0.0001888997801843956, + "loss": 0.604, + "step": 736 + }, + { + "epoch": 0.17745139348702824, + "grad_norm": 2.1345009803771973, + "learning_rate": 0.00018886404774442502, + "loss": 1.0628, + "step": 737 + }, + { + "epoch": 0.17769216878348282, + "grad_norm": 3.882951021194458, + "learning_rate": 0.0001888282612749132, + "loss": 0.4992, + "step": 738 + }, + { + "epoch": 0.1779329440799374, + "grad_norm": 6.192306041717529, + "learning_rate": 0.0001887924207976184, + "loss": 0.7377, + "step": 739 + }, + { + "epoch": 0.17817371937639198, + "grad_norm": 7.351373672485352, + "learning_rate": 0.00018875652633433166, + "loss": 1.103, + "step": 740 + }, + { + "epoch": 0.17841449467284656, + "grad_norm": 1.2278997898101807, + "learning_rate": 0.00018872057790687697, + "loss": 0.2774, + "step": 741 + }, + { + "epoch": 0.17865526996930114, + "grad_norm": 2.035078525543213, + "learning_rate": 0.00018868457553711102, + "loss": 0.3135, + "step": 742 + }, + { + "epoch": 0.17889604526575573, + "grad_norm": 3.5295181274414062, + "learning_rate": 0.00018864851924692335, + "loss": 0.8756, + "step": 743 + }, + { + "epoch": 0.1791368205622103, + "grad_norm": 1.8237663507461548, + "learning_rate": 0.00018861240905823623, + "loss": 0.986, + "step": 744 + }, + { + "epoch": 0.1793775958586649, + "grad_norm": 4.102538108825684, + "learning_rate": 0.00018857624499300476, + "loss": 0.3661, + "step": 745 + }, + { + "epoch": 0.17961837115511947, + "grad_norm": 1.7040005922317505, + "learning_rate": 0.0001885400270732168, + "loss": 0.5499, + "step": 746 + }, + { + "epoch": 0.17985914645157408, + "grad_norm": 1.8217339515686035, + "learning_rate": 0.00018850375532089285, + "loss": 0.3162, + "step": 747 + }, + { + "epoch": 0.18009992174802866, + "grad_norm": 4.074040412902832, + "learning_rate": 0.00018846742975808632, + "loss": 1.4644, + "step": 748 + }, + { + "epoch": 0.18034069704448324, + "grad_norm": 4.6111016273498535, + "learning_rate": 0.00018843105040688312, + "loss": 0.7778, + "step": 749 + }, + { + "epoch": 0.18058147234093783, + "grad_norm": 2.9776699542999268, + "learning_rate": 0.00018839461728940203, + "loss": 0.7832, + "step": 750 + }, + { + "epoch": 0.1808222476373924, + "grad_norm": 1.9872022867202759, + "learning_rate": 0.0001883581304277945, + "loss": 0.9256, + "step": 751 + }, + { + "epoch": 0.181063022933847, + "grad_norm": 2.69476580619812, + "learning_rate": 0.00018832158984424463, + "loss": 0.9596, + "step": 752 + }, + { + "epoch": 0.18130379823030157, + "grad_norm": 5.690935134887695, + "learning_rate": 0.00018828499556096907, + "loss": 0.9447, + "step": 753 + }, + { + "epoch": 0.18154457352675615, + "grad_norm": 6.152745723724365, + "learning_rate": 0.00018824834760021737, + "loss": 1.0374, + "step": 754 + }, + { + "epoch": 0.18178534882321074, + "grad_norm": 0.8274415135383606, + "learning_rate": 0.00018821164598427145, + "loss": 0.5589, + "step": 755 + }, + { + "epoch": 0.18202612411966532, + "grad_norm": 0.797907829284668, + "learning_rate": 0.00018817489073544609, + "loss": 0.198, + "step": 756 + }, + { + "epoch": 0.1822668994161199, + "grad_norm": 2.9858620166778564, + "learning_rate": 0.00018813808187608845, + "loss": 0.8879, + "step": 757 + }, + { + "epoch": 0.18250767471257448, + "grad_norm": 3.2753536701202393, + "learning_rate": 0.00018810121942857845, + "loss": 0.9035, + "step": 758 + }, + { + "epoch": 0.18274845000902906, + "grad_norm": 2.3199586868286133, + "learning_rate": 0.00018806430341532858, + "loss": 0.3536, + "step": 759 + }, + { + "epoch": 0.18298922530548364, + "grad_norm": 2.436077833175659, + "learning_rate": 0.0001880273338587838, + "loss": 0.5789, + "step": 760 + }, + { + "epoch": 0.18323000060193825, + "grad_norm": 4.57729959487915, + "learning_rate": 0.0001879903107814217, + "loss": 0.5619, + "step": 761 + }, + { + "epoch": 0.18347077589839283, + "grad_norm": 2.3822367191314697, + "learning_rate": 0.0001879532342057524, + "loss": 0.6583, + "step": 762 + }, + { + "epoch": 0.18371155119484742, + "grad_norm": 5.95395565032959, + "learning_rate": 0.00018791610415431855, + "loss": 0.9503, + "step": 763 + }, + { + "epoch": 0.183952326491302, + "grad_norm": 10.346938133239746, + "learning_rate": 0.0001878789206496953, + "loss": 1.0378, + "step": 764 + }, + { + "epoch": 0.18419310178775658, + "grad_norm": 2.6373162269592285, + "learning_rate": 0.0001878416837144903, + "loss": 0.2419, + "step": 765 + }, + { + "epoch": 0.18443387708421116, + "grad_norm": 1.50508451461792, + "learning_rate": 0.00018780439337134368, + "loss": 0.5883, + "step": 766 + }, + { + "epoch": 0.18467465238066574, + "grad_norm": 1.039527416229248, + "learning_rate": 0.0001877670496429281, + "loss": 0.586, + "step": 767 + }, + { + "epoch": 0.18491542767712033, + "grad_norm": 3.885326862335205, + "learning_rate": 0.00018772965255194857, + "loss": 0.9222, + "step": 768 + }, + { + "epoch": 0.1851562029735749, + "grad_norm": 5.3813605308532715, + "learning_rate": 0.0001876922021211426, + "loss": 0.7393, + "step": 769 + }, + { + "epoch": 0.1853969782700295, + "grad_norm": 3.15456223487854, + "learning_rate": 0.0001876546983732802, + "loss": 0.7792, + "step": 770 + }, + { + "epoch": 0.18563775356648407, + "grad_norm": 3.184206962585449, + "learning_rate": 0.0001876171413311637, + "loss": 1.2433, + "step": 771 + }, + { + "epoch": 0.18587852886293865, + "grad_norm": 1.582762598991394, + "learning_rate": 0.00018757953101762787, + "loss": 0.5598, + "step": 772 + }, + { + "epoch": 0.18611930415939323, + "grad_norm": 1.884548306465149, + "learning_rate": 0.00018754186745553985, + "loss": 0.4477, + "step": 773 + }, + { + "epoch": 0.18636007945584784, + "grad_norm": 5.777435302734375, + "learning_rate": 0.0001875041506677992, + "loss": 0.4906, + "step": 774 + }, + { + "epoch": 0.18660085475230243, + "grad_norm": 1.3165128231048584, + "learning_rate": 0.00018746638067733778, + "loss": 0.6351, + "step": 775 + }, + { + "epoch": 0.186841630048757, + "grad_norm": 1.5441575050354004, + "learning_rate": 0.00018742855750711988, + "loss": 0.7108, + "step": 776 + }, + { + "epoch": 0.1870824053452116, + "grad_norm": 2.326465606689453, + "learning_rate": 0.00018739068118014198, + "loss": 0.861, + "step": 777 + }, + { + "epoch": 0.18732318064166617, + "grad_norm": 3.9939534664154053, + "learning_rate": 0.00018735275171943307, + "loss": 0.6814, + "step": 778 + }, + { + "epoch": 0.18756395593812075, + "grad_norm": 1.1253992319107056, + "learning_rate": 0.00018731476914805425, + "loss": 0.1546, + "step": 779 + }, + { + "epoch": 0.18780473123457533, + "grad_norm": 2.305006980895996, + "learning_rate": 0.00018727673348909913, + "loss": 1.0963, + "step": 780 + }, + { + "epoch": 0.18804550653102992, + "grad_norm": 3.0463790893554688, + "learning_rate": 0.0001872386447656934, + "loss": 0.734, + "step": 781 + }, + { + "epoch": 0.1882862818274845, + "grad_norm": 2.357088088989258, + "learning_rate": 0.00018720050300099507, + "loss": 0.7065, + "step": 782 + }, + { + "epoch": 0.18852705712393908, + "grad_norm": 2.2680745124816895, + "learning_rate": 0.0001871623082181945, + "loss": 1.4469, + "step": 783 + }, + { + "epoch": 0.18876783242039366, + "grad_norm": 2.114755392074585, + "learning_rate": 0.0001871240604405141, + "loss": 0.7899, + "step": 784 + }, + { + "epoch": 0.18900860771684824, + "grad_norm": 1.0055882930755615, + "learning_rate": 0.0001870857596912087, + "loss": 0.1715, + "step": 785 + }, + { + "epoch": 0.18924938301330282, + "grad_norm": 1.9801616668701172, + "learning_rate": 0.00018704740599356518, + "loss": 0.5179, + "step": 786 + }, + { + "epoch": 0.1894901583097574, + "grad_norm": 2.5894370079040527, + "learning_rate": 0.0001870089993709027, + "loss": 0.4325, + "step": 787 + }, + { + "epoch": 0.18973093360621202, + "grad_norm": 3.895353078842163, + "learning_rate": 0.00018697053984657256, + "loss": 0.3835, + "step": 788 + }, + { + "epoch": 0.1899717089026666, + "grad_norm": 1.0935512781143188, + "learning_rate": 0.00018693202744395827, + "loss": 1.1042, + "step": 789 + }, + { + "epoch": 0.19021248419912118, + "grad_norm": 1.6422269344329834, + "learning_rate": 0.0001868934621864754, + "loss": 0.718, + "step": 790 + }, + { + "epoch": 0.19045325949557576, + "grad_norm": 2.844287633895874, + "learning_rate": 0.00018685484409757178, + "loss": 1.2023, + "step": 791 + }, + { + "epoch": 0.19069403479203034, + "grad_norm": 1.130077600479126, + "learning_rate": 0.00018681617320072725, + "loss": 0.2922, + "step": 792 + }, + { + "epoch": 0.19093481008848492, + "grad_norm": 2.1571900844573975, + "learning_rate": 0.0001867774495194538, + "loss": 0.7212, + "step": 793 + }, + { + "epoch": 0.1911755853849395, + "grad_norm": 6.230739593505859, + "learning_rate": 0.00018673867307729555, + "loss": 0.8975, + "step": 794 + }, + { + "epoch": 0.1914163606813941, + "grad_norm": 2.590592622756958, + "learning_rate": 0.00018669984389782865, + "loss": 0.3676, + "step": 795 + }, + { + "epoch": 0.19165713597784867, + "grad_norm": 8.08610725402832, + "learning_rate": 0.00018666096200466132, + "loss": 0.7873, + "step": 796 + }, + { + "epoch": 0.19189791127430325, + "grad_norm": 1.4064202308654785, + "learning_rate": 0.00018662202742143383, + "loss": 0.5145, + "step": 797 + }, + { + "epoch": 0.19213868657075783, + "grad_norm": 1.37117338180542, + "learning_rate": 0.0001865830401718185, + "loss": 0.8417, + "step": 798 + }, + { + "epoch": 0.19237946186721241, + "grad_norm": 2.1927073001861572, + "learning_rate": 0.00018654400027951967, + "loss": 0.9088, + "step": 799 + }, + { + "epoch": 0.192620237163667, + "grad_norm": 2.8337302207946777, + "learning_rate": 0.0001865049077682737, + "loss": 0.5877, + "step": 800 + }, + { + "epoch": 0.19286101246012158, + "grad_norm": 6.606812000274658, + "learning_rate": 0.00018646576266184893, + "loss": 0.9887, + "step": 801 + }, + { + "epoch": 0.1931017877565762, + "grad_norm": 2.9909074306488037, + "learning_rate": 0.00018642656498404564, + "loss": 0.5693, + "step": 802 + }, + { + "epoch": 0.19334256305303077, + "grad_norm": 0.7477906346321106, + "learning_rate": 0.0001863873147586961, + "loss": 0.2322, + "step": 803 + }, + { + "epoch": 0.19358333834948535, + "grad_norm": 2.028005599975586, + "learning_rate": 0.00018634801200966453, + "loss": 0.3557, + "step": 804 + }, + { + "epoch": 0.19382411364593993, + "grad_norm": 5.629332065582275, + "learning_rate": 0.00018630865676084714, + "loss": 0.6842, + "step": 805 + }, + { + "epoch": 0.19406488894239451, + "grad_norm": 0.9226589202880859, + "learning_rate": 0.000186269249036172, + "loss": 0.2885, + "step": 806 + }, + { + "epoch": 0.1943056642388491, + "grad_norm": 1.8051038980484009, + "learning_rate": 0.00018622978885959906, + "loss": 0.8416, + "step": 807 + }, + { + "epoch": 0.19454643953530368, + "grad_norm": 4.140893936157227, + "learning_rate": 0.0001861902762551202, + "loss": 1.0417, + "step": 808 + }, + { + "epoch": 0.19478721483175826, + "grad_norm": 7.981260776519775, + "learning_rate": 0.0001861507112467592, + "loss": 0.525, + "step": 809 + }, + { + "epoch": 0.19502799012821284, + "grad_norm": 5.369372367858887, + "learning_rate": 0.0001861110938585717, + "loss": 0.5619, + "step": 810 + }, + { + "epoch": 0.19526876542466742, + "grad_norm": 1.8795945644378662, + "learning_rate": 0.0001860714241146451, + "loss": 1.0825, + "step": 811 + }, + { + "epoch": 0.195509540721122, + "grad_norm": 3.486668586730957, + "learning_rate": 0.0001860317020390987, + "loss": 0.3657, + "step": 812 + }, + { + "epoch": 0.1957503160175766, + "grad_norm": 1.3779692649841309, + "learning_rate": 0.00018599192765608364, + "loss": 0.9127, + "step": 813 + }, + { + "epoch": 0.19599109131403117, + "grad_norm": 2.563727617263794, + "learning_rate": 0.00018595210098978283, + "loss": 0.5109, + "step": 814 + }, + { + "epoch": 0.19623186661048578, + "grad_norm": 0.7977485656738281, + "learning_rate": 0.00018591222206441096, + "loss": 0.5252, + "step": 815 + }, + { + "epoch": 0.19647264190694036, + "grad_norm": 4.5069475173950195, + "learning_rate": 0.0001858722909042145, + "loss": 0.3426, + "step": 816 + }, + { + "epoch": 0.19671341720339494, + "grad_norm": 6.430407524108887, + "learning_rate": 0.00018583230753347173, + "loss": 0.9264, + "step": 817 + }, + { + "epoch": 0.19695419249984952, + "grad_norm": 2.3652713298797607, + "learning_rate": 0.00018579227197649257, + "loss": 0.6739, + "step": 818 + }, + { + "epoch": 0.1971949677963041, + "grad_norm": 2.2648465633392334, + "learning_rate": 0.00018575218425761876, + "loss": 0.3986, + "step": 819 + }, + { + "epoch": 0.1974357430927587, + "grad_norm": 2.1836869716644287, + "learning_rate": 0.0001857120444012237, + "loss": 0.2466, + "step": 820 + }, + { + "epoch": 0.19767651838921327, + "grad_norm": 1.898180603981018, + "learning_rate": 0.00018567185243171256, + "loss": 0.5558, + "step": 821 + }, + { + "epoch": 0.19791729368566785, + "grad_norm": 0.8913256525993347, + "learning_rate": 0.00018563160837352212, + "loss": 0.6096, + "step": 822 + }, + { + "epoch": 0.19815806898212243, + "grad_norm": 3.458024024963379, + "learning_rate": 0.00018559131225112085, + "loss": 0.7502, + "step": 823 + }, + { + "epoch": 0.198398844278577, + "grad_norm": 3.377265691757202, + "learning_rate": 0.00018555096408900889, + "loss": 0.9659, + "step": 824 + }, + { + "epoch": 0.1986396195750316, + "grad_norm": 5.404399394989014, + "learning_rate": 0.00018551056391171803, + "loss": 0.8436, + "step": 825 + }, + { + "epoch": 0.19888039487148618, + "grad_norm": 2.176090717315674, + "learning_rate": 0.00018547011174381163, + "loss": 0.6543, + "step": 826 + }, + { + "epoch": 0.19912117016794076, + "grad_norm": 1.4764220714569092, + "learning_rate": 0.00018542960760988475, + "loss": 0.4371, + "step": 827 + }, + { + "epoch": 0.19936194546439534, + "grad_norm": 4.111733913421631, + "learning_rate": 0.00018538905153456394, + "loss": 0.7307, + "step": 828 + }, + { + "epoch": 0.19960272076084995, + "grad_norm": 3.4664177894592285, + "learning_rate": 0.0001853484435425074, + "loss": 0.8896, + "step": 829 + }, + { + "epoch": 0.19984349605730453, + "grad_norm": 1.9064959287643433, + "learning_rate": 0.00018530778365840497, + "loss": 0.5491, + "step": 830 + }, + { + "epoch": 0.2000842713537591, + "grad_norm": 1.8238356113433838, + "learning_rate": 0.00018526707190697782, + "loss": 0.564, + "step": 831 + }, + { + "epoch": 0.2003250466502137, + "grad_norm": 1.4021512269973755, + "learning_rate": 0.00018522630831297886, + "loss": 0.2522, + "step": 832 + }, + { + "epoch": 0.20056582194666828, + "grad_norm": 1.9710665941238403, + "learning_rate": 0.0001851854929011924, + "loss": 0.2168, + "step": 833 + }, + { + "epoch": 0.20080659724312286, + "grad_norm": 1.932867407798767, + "learning_rate": 0.00018514462569643435, + "loss": 0.5669, + "step": 834 + }, + { + "epoch": 0.20104737253957744, + "grad_norm": 1.412558674812317, + "learning_rate": 0.00018510370672355204, + "loss": 0.5655, + "step": 835 + }, + { + "epoch": 0.20128814783603202, + "grad_norm": 5.750187873840332, + "learning_rate": 0.00018506273600742433, + "loss": 0.8122, + "step": 836 + }, + { + "epoch": 0.2015289231324866, + "grad_norm": 4.016916275024414, + "learning_rate": 0.00018502171357296144, + "loss": 0.5478, + "step": 837 + }, + { + "epoch": 0.20176969842894119, + "grad_norm": 1.5730372667312622, + "learning_rate": 0.00018498063944510516, + "loss": 0.3524, + "step": 838 + }, + { + "epoch": 0.20201047372539577, + "grad_norm": 1.1213641166687012, + "learning_rate": 0.0001849395136488286, + "loss": 0.386, + "step": 839 + }, + { + "epoch": 0.20225124902185035, + "grad_norm": 1.455862045288086, + "learning_rate": 0.00018489833620913642, + "loss": 0.2709, + "step": 840 + }, + { + "epoch": 0.20249202431830493, + "grad_norm": 3.3921029567718506, + "learning_rate": 0.0001848571071510645, + "loss": 0.2738, + "step": 841 + }, + { + "epoch": 0.2027327996147595, + "grad_norm": 1.9654597043991089, + "learning_rate": 0.00018481582649968028, + "loss": 0.5441, + "step": 842 + }, + { + "epoch": 0.20297357491121412, + "grad_norm": 8.712904930114746, + "learning_rate": 0.00018477449428008246, + "loss": 0.5047, + "step": 843 + }, + { + "epoch": 0.2032143502076687, + "grad_norm": 4.064781665802002, + "learning_rate": 0.0001847331105174011, + "loss": 0.6401, + "step": 844 + }, + { + "epoch": 0.20345512550412329, + "grad_norm": 10.879172325134277, + "learning_rate": 0.0001846916752367976, + "loss": 0.7271, + "step": 845 + }, + { + "epoch": 0.20369590080057787, + "grad_norm": 1.46236252784729, + "learning_rate": 0.00018465018846346482, + "loss": 0.3446, + "step": 846 + }, + { + "epoch": 0.20393667609703245, + "grad_norm": 1.9737117290496826, + "learning_rate": 0.0001846086502226267, + "loss": 0.5821, + "step": 847 + }, + { + "epoch": 0.20417745139348703, + "grad_norm": 2.094733715057373, + "learning_rate": 0.00018456706053953862, + "loss": 0.2923, + "step": 848 + }, + { + "epoch": 0.2044182266899416, + "grad_norm": 1.962471842765808, + "learning_rate": 0.0001845254194394872, + "loss": 0.756, + "step": 849 + }, + { + "epoch": 0.2046590019863962, + "grad_norm": 3.4438953399658203, + "learning_rate": 0.00018448372694779034, + "loss": 0.4609, + "step": 850 + }, + { + "epoch": 0.20489977728285078, + "grad_norm": 1.1954097747802734, + "learning_rate": 0.00018444198308979713, + "loss": 0.6803, + "step": 851 + }, + { + "epoch": 0.20514055257930536, + "grad_norm": 2.8534281253814697, + "learning_rate": 0.00018440018789088794, + "loss": 0.8631, + "step": 852 + }, + { + "epoch": 0.20538132787575994, + "grad_norm": 0.7627564072608948, + "learning_rate": 0.0001843583413764744, + "loss": 0.3575, + "step": 853 + }, + { + "epoch": 0.20562210317221452, + "grad_norm": 2.954674482345581, + "learning_rate": 0.0001843164435719992, + "loss": 1.2672, + "step": 854 + }, + { + "epoch": 0.2058628784686691, + "grad_norm": 3.019871950149536, + "learning_rate": 0.00018427449450293635, + "loss": 0.5769, + "step": 855 + }, + { + "epoch": 0.2061036537651237, + "grad_norm": 1.2849375009536743, + "learning_rate": 0.00018423249419479099, + "loss": 1.0092, + "step": 856 + }, + { + "epoch": 0.2063444290615783, + "grad_norm": 2.783853054046631, + "learning_rate": 0.00018419044267309939, + "loss": 0.6801, + "step": 857 + }, + { + "epoch": 0.20658520435803288, + "grad_norm": 3.1100003719329834, + "learning_rate": 0.0001841483399634289, + "loss": 1.2878, + "step": 858 + }, + { + "epoch": 0.20682597965448746, + "grad_norm": 1.7785344123840332, + "learning_rate": 0.00018410618609137816, + "loss": 0.5104, + "step": 859 + }, + { + "epoch": 0.20706675495094204, + "grad_norm": 1.5101239681243896, + "learning_rate": 0.0001840639810825768, + "loss": 0.6032, + "step": 860 + }, + { + "epoch": 0.20730753024739662, + "grad_norm": 4.038559913635254, + "learning_rate": 0.00018402172496268554, + "loss": 0.6457, + "step": 861 + }, + { + "epoch": 0.2075483055438512, + "grad_norm": 8.409773826599121, + "learning_rate": 0.0001839794177573962, + "loss": 1.5939, + "step": 862 + }, + { + "epoch": 0.20778908084030578, + "grad_norm": 2.086423635482788, + "learning_rate": 0.00018393705949243164, + "loss": 0.7663, + "step": 863 + }, + { + "epoch": 0.20802985613676037, + "grad_norm": 4.5612945556640625, + "learning_rate": 0.00018389465019354577, + "loss": 0.5459, + "step": 864 + }, + { + "epoch": 0.20827063143321495, + "grad_norm": 1.9495208263397217, + "learning_rate": 0.0001838521898865236, + "loss": 0.1955, + "step": 865 + }, + { + "epoch": 0.20851140672966953, + "grad_norm": 2.232084035873413, + "learning_rate": 0.00018380967859718105, + "loss": 0.798, + "step": 866 + }, + { + "epoch": 0.2087521820261241, + "grad_norm": 5.387617111206055, + "learning_rate": 0.0001837671163513651, + "loss": 0.7414, + "step": 867 + }, + { + "epoch": 0.2089929573225787, + "grad_norm": 7.861992359161377, + "learning_rate": 0.00018372450317495365, + "loss": 0.9128, + "step": 868 + }, + { + "epoch": 0.20923373261903327, + "grad_norm": 2.3675897121429443, + "learning_rate": 0.00018368183909385567, + "loss": 0.6167, + "step": 869 + }, + { + "epoch": 0.20947450791548788, + "grad_norm": 3.206550121307373, + "learning_rate": 0.00018363912413401097, + "loss": 0.918, + "step": 870 + }, + { + "epoch": 0.20971528321194247, + "grad_norm": 11.829947471618652, + "learning_rate": 0.00018359635832139034, + "loss": 1.1065, + "step": 871 + }, + { + "epoch": 0.20995605850839705, + "grad_norm": 2.4670798778533936, + "learning_rate": 0.00018355354168199552, + "loss": 0.52, + "step": 872 + }, + { + "epoch": 0.21019683380485163, + "grad_norm": 2.387666702270508, + "learning_rate": 0.00018351067424185913, + "loss": 0.3961, + "step": 873 + }, + { + "epoch": 0.2104376091013062, + "grad_norm": 0.41803881525993347, + "learning_rate": 0.00018346775602704464, + "loss": 0.1675, + "step": 874 + }, + { + "epoch": 0.2106783843977608, + "grad_norm": 5.301272869110107, + "learning_rate": 0.0001834247870636464, + "loss": 1.194, + "step": 875 + }, + { + "epoch": 0.21091915969421537, + "grad_norm": 2.4999866485595703, + "learning_rate": 0.0001833817673777897, + "loss": 0.1707, + "step": 876 + }, + { + "epoch": 0.21115993499066996, + "grad_norm": 1.3982088565826416, + "learning_rate": 0.00018333869699563055, + "loss": 1.0266, + "step": 877 + }, + { + "epoch": 0.21140071028712454, + "grad_norm": 3.187394380569458, + "learning_rate": 0.00018329557594335585, + "loss": 1.0817, + "step": 878 + }, + { + "epoch": 0.21164148558357912, + "grad_norm": 3.2300422191619873, + "learning_rate": 0.00018325240424718335, + "loss": 0.6478, + "step": 879 + }, + { + "epoch": 0.2118822608800337, + "grad_norm": 3.521116018295288, + "learning_rate": 0.00018320918193336148, + "loss": 0.8387, + "step": 880 + }, + { + "epoch": 0.21212303617648828, + "grad_norm": 9.480287551879883, + "learning_rate": 0.00018316590902816952, + "loss": 0.9253, + "step": 881 + }, + { + "epoch": 0.21236381147294286, + "grad_norm": 2.395949602127075, + "learning_rate": 0.0001831225855579175, + "loss": 0.8792, + "step": 882 + }, + { + "epoch": 0.21260458676939745, + "grad_norm": 1.681579351425171, + "learning_rate": 0.0001830792115489462, + "loss": 0.9965, + "step": 883 + }, + { + "epoch": 0.21284536206585206, + "grad_norm": 1.3200875520706177, + "learning_rate": 0.00018303578702762705, + "loss": 0.2478, + "step": 884 + }, + { + "epoch": 0.21308613736230664, + "grad_norm": 2.904762029647827, + "learning_rate": 0.00018299231202036233, + "loss": 0.4818, + "step": 885 + }, + { + "epoch": 0.21332691265876122, + "grad_norm": 2.1330971717834473, + "learning_rate": 0.00018294878655358493, + "loss": 0.1786, + "step": 886 + }, + { + "epoch": 0.2135676879552158, + "grad_norm": 4.824681758880615, + "learning_rate": 0.0001829052106537584, + "loss": 0.8048, + "step": 887 + }, + { + "epoch": 0.21380846325167038, + "grad_norm": 2.336089849472046, + "learning_rate": 0.000182861584347377, + "loss": 0.7041, + "step": 888 + }, + { + "epoch": 0.21404923854812496, + "grad_norm": 2.5671005249023438, + "learning_rate": 0.00018281790766096564, + "loss": 0.6426, + "step": 889 + }, + { + "epoch": 0.21429001384457955, + "grad_norm": 18.460041046142578, + "learning_rate": 0.00018277418062107986, + "loss": 0.9763, + "step": 890 + }, + { + "epoch": 0.21453078914103413, + "grad_norm": 2.5273513793945312, + "learning_rate": 0.00018273040325430574, + "loss": 0.5831, + "step": 891 + }, + { + "epoch": 0.2147715644374887, + "grad_norm": 1.38306725025177, + "learning_rate": 0.00018268657558726003, + "loss": 0.8044, + "step": 892 + }, + { + "epoch": 0.2150123397339433, + "grad_norm": 1.9609812498092651, + "learning_rate": 0.00018264269764659013, + "loss": 0.3049, + "step": 893 + }, + { + "epoch": 0.21525311503039787, + "grad_norm": 4.538389205932617, + "learning_rate": 0.0001825987694589738, + "loss": 0.8865, + "step": 894 + }, + { + "epoch": 0.21549389032685246, + "grad_norm": 2.368454933166504, + "learning_rate": 0.00018255479105111957, + "loss": 1.0822, + "step": 895 + }, + { + "epoch": 0.21573466562330704, + "grad_norm": 4.19332218170166, + "learning_rate": 0.00018251076244976637, + "loss": 1.0274, + "step": 896 + }, + { + "epoch": 0.21597544091976162, + "grad_norm": 0.907124400138855, + "learning_rate": 0.00018246668368168372, + "loss": 0.5454, + "step": 897 + }, + { + "epoch": 0.21621621621621623, + "grad_norm": 2.2195355892181396, + "learning_rate": 0.0001824225547736716, + "loss": 0.4168, + "step": 898 + }, + { + "epoch": 0.2164569915126708, + "grad_norm": 4.278376579284668, + "learning_rate": 0.00018237837575256044, + "loss": 0.6395, + "step": 899 + }, + { + "epoch": 0.2166977668091254, + "grad_norm": 3.1869797706604004, + "learning_rate": 0.00018233414664521123, + "loss": 0.9863, + "step": 900 + }, + { + "epoch": 0.21693854210557997, + "grad_norm": 1.9933998584747314, + "learning_rate": 0.00018228986747851537, + "loss": 0.6143, + "step": 901 + }, + { + "epoch": 0.21717931740203456, + "grad_norm": 1.5613797903060913, + "learning_rate": 0.00018224553827939468, + "loss": 0.4492, + "step": 902 + }, + { + "epoch": 0.21742009269848914, + "grad_norm": 2.306579351425171, + "learning_rate": 0.00018220115907480143, + "loss": 0.5864, + "step": 903 + }, + { + "epoch": 0.21766086799494372, + "grad_norm": 3.8171541690826416, + "learning_rate": 0.00018215672989171824, + "loss": 0.8157, + "step": 904 + }, + { + "epoch": 0.2179016432913983, + "grad_norm": 1.4388493299484253, + "learning_rate": 0.00018211225075715816, + "loss": 0.8506, + "step": 905 + }, + { + "epoch": 0.21814241858785288, + "grad_norm": 1.82477867603302, + "learning_rate": 0.00018206772169816467, + "loss": 0.7865, + "step": 906 + }, + { + "epoch": 0.21838319388430746, + "grad_norm": 3.2749521732330322, + "learning_rate": 0.00018202314274181144, + "loss": 1.3825, + "step": 907 + }, + { + "epoch": 0.21862396918076205, + "grad_norm": 1.8761945962905884, + "learning_rate": 0.00018197851391520264, + "loss": 0.8722, + "step": 908 + }, + { + "epoch": 0.21886474447721663, + "grad_norm": 1.6125880479812622, + "learning_rate": 0.0001819338352454727, + "loss": 0.5524, + "step": 909 + }, + { + "epoch": 0.2191055197736712, + "grad_norm": 1.2524000406265259, + "learning_rate": 0.0001818891067597863, + "loss": 0.8105, + "step": 910 + }, + { + "epoch": 0.21934629507012582, + "grad_norm": 3.3656504154205322, + "learning_rate": 0.0001818443284853385, + "loss": 1.6029, + "step": 911 + }, + { + "epoch": 0.2195870703665804, + "grad_norm": 1.9755463600158691, + "learning_rate": 0.00018179950044935458, + "loss": 0.401, + "step": 912 + }, + { + "epoch": 0.21982784566303498, + "grad_norm": 3.240755081176758, + "learning_rate": 0.0001817546226790901, + "loss": 1.0564, + "step": 913 + }, + { + "epoch": 0.22006862095948956, + "grad_norm": 5.947300910949707, + "learning_rate": 0.00018170969520183084, + "loss": 0.4548, + "step": 914 + }, + { + "epoch": 0.22030939625594415, + "grad_norm": 3.0205721855163574, + "learning_rate": 0.0001816647180448928, + "loss": 0.8396, + "step": 915 + }, + { + "epoch": 0.22055017155239873, + "grad_norm": 1.6607885360717773, + "learning_rate": 0.0001816196912356222, + "loss": 1.1035, + "step": 916 + }, + { + "epoch": 0.2207909468488533, + "grad_norm": 1.3007737398147583, + "learning_rate": 0.0001815746148013954, + "loss": 0.1121, + "step": 917 + }, + { + "epoch": 0.2210317221453079, + "grad_norm": 2.658994674682617, + "learning_rate": 0.00018152948876961906, + "loss": 0.3838, + "step": 918 + }, + { + "epoch": 0.22127249744176247, + "grad_norm": 1.1010584831237793, + "learning_rate": 0.00018148431316772983, + "loss": 0.1575, + "step": 919 + }, + { + "epoch": 0.22151327273821705, + "grad_norm": 4.701428413391113, + "learning_rate": 0.0001814390880231946, + "loss": 0.4306, + "step": 920 + }, + { + "epoch": 0.22175404803467164, + "grad_norm": 3.2852671146392822, + "learning_rate": 0.0001813938133635104, + "loss": 0.2974, + "step": 921 + }, + { + "epoch": 0.22199482333112622, + "grad_norm": 3.092611312866211, + "learning_rate": 0.0001813484892162043, + "loss": 0.7887, + "step": 922 + }, + { + "epoch": 0.2222355986275808, + "grad_norm": 2.474486827850342, + "learning_rate": 0.00018130311560883344, + "loss": 0.7599, + "step": 923 + }, + { + "epoch": 0.22247637392403538, + "grad_norm": 5.097280025482178, + "learning_rate": 0.00018125769256898511, + "loss": 0.6548, + "step": 924 + }, + { + "epoch": 0.22271714922049, + "grad_norm": 3.1248862743377686, + "learning_rate": 0.00018121222012427665, + "loss": 1.0051, + "step": 925 + }, + { + "epoch": 0.22295792451694457, + "grad_norm": 4.130378723144531, + "learning_rate": 0.00018116669830235536, + "loss": 0.8515, + "step": 926 + }, + { + "epoch": 0.22319869981339915, + "grad_norm": 3.8639516830444336, + "learning_rate": 0.00018112112713089863, + "loss": 0.3418, + "step": 927 + }, + { + "epoch": 0.22343947510985374, + "grad_norm": 5.733872890472412, + "learning_rate": 0.00018107550663761386, + "loss": 0.4249, + "step": 928 + }, + { + "epoch": 0.22368025040630832, + "grad_norm": 2.717703104019165, + "learning_rate": 0.0001810298368502384, + "loss": 0.3455, + "step": 929 + }, + { + "epoch": 0.2239210257027629, + "grad_norm": 4.0550689697265625, + "learning_rate": 0.00018098411779653953, + "loss": 0.6515, + "step": 930 + }, + { + "epoch": 0.22416180099921748, + "grad_norm": 1.4261348247528076, + "learning_rate": 0.00018093834950431458, + "loss": 0.7618, + "step": 931 + }, + { + "epoch": 0.22440257629567206, + "grad_norm": 1.7245268821716309, + "learning_rate": 0.0001808925320013908, + "loss": 0.7967, + "step": 932 + }, + { + "epoch": 0.22464335159212664, + "grad_norm": 4.139218807220459, + "learning_rate": 0.0001808466653156253, + "loss": 0.7014, + "step": 933 + }, + { + "epoch": 0.22488412688858123, + "grad_norm": 2.1172738075256348, + "learning_rate": 0.00018080074947490516, + "loss": 0.4765, + "step": 934 + }, + { + "epoch": 0.2251249021850358, + "grad_norm": 4.761689186096191, + "learning_rate": 0.00018075478450714724, + "loss": 0.699, + "step": 935 + }, + { + "epoch": 0.2253656774814904, + "grad_norm": 2.6363584995269775, + "learning_rate": 0.00018070877044029846, + "loss": 0.8263, + "step": 936 + }, + { + "epoch": 0.22560645277794497, + "grad_norm": 1.930909276008606, + "learning_rate": 0.00018066270730233538, + "loss": 0.6952, + "step": 937 + }, + { + "epoch": 0.22584722807439955, + "grad_norm": 0.8242762684822083, + "learning_rate": 0.00018061659512126453, + "loss": 0.5675, + "step": 938 + }, + { + "epoch": 0.22608800337085416, + "grad_norm": 1.3294146060943604, + "learning_rate": 0.0001805704339251222, + "loss": 0.5123, + "step": 939 + }, + { + "epoch": 0.22632877866730874, + "grad_norm": 0.8458835482597351, + "learning_rate": 0.00018052422374197454, + "loss": 0.2988, + "step": 940 + }, + { + "epoch": 0.22656955396376333, + "grad_norm": 1.0856271982192993, + "learning_rate": 0.00018047796459991742, + "loss": 0.7522, + "step": 941 + }, + { + "epoch": 0.2268103292602179, + "grad_norm": 5.306552410125732, + "learning_rate": 0.00018043165652707649, + "loss": 0.7063, + "step": 942 + }, + { + "epoch": 0.2270511045566725, + "grad_norm": 5.354522228240967, + "learning_rate": 0.00018038529955160718, + "loss": 0.7462, + "step": 943 + }, + { + "epoch": 0.22729187985312707, + "grad_norm": 1.556826114654541, + "learning_rate": 0.00018033889370169465, + "loss": 0.7949, + "step": 944 + }, + { + "epoch": 0.22753265514958165, + "grad_norm": 0.9913277626037598, + "learning_rate": 0.00018029243900555373, + "loss": 0.5612, + "step": 945 + }, + { + "epoch": 0.22777343044603623, + "grad_norm": 1.7368444204330444, + "learning_rate": 0.000180245935491429, + "loss": 0.3213, + "step": 946 + }, + { + "epoch": 0.22801420574249082, + "grad_norm": 2.660506010055542, + "learning_rate": 0.0001801993831875947, + "loss": 0.39, + "step": 947 + }, + { + "epoch": 0.2282549810389454, + "grad_norm": 3.1855568885803223, + "learning_rate": 0.0001801527821223547, + "loss": 0.4602, + "step": 948 + }, + { + "epoch": 0.22849575633539998, + "grad_norm": 2.8115875720977783, + "learning_rate": 0.0001801061323240426, + "loss": 0.3065, + "step": 949 + }, + { + "epoch": 0.22873653163185456, + "grad_norm": 18.071075439453125, + "learning_rate": 0.00018005943382102158, + "loss": 0.8023, + "step": 950 + }, + { + "epoch": 0.22897730692830914, + "grad_norm": 1.1732177734375, + "learning_rate": 0.00018001268664168439, + "loss": 0.8773, + "step": 951 + }, + { + "epoch": 0.22921808222476375, + "grad_norm": 2.2807600498199463, + "learning_rate": 0.00017996589081445348, + "loss": 0.7107, + "step": 952 + }, + { + "epoch": 0.22945885752121833, + "grad_norm": 2.0999910831451416, + "learning_rate": 0.00017991904636778077, + "loss": 0.6253, + "step": 953 + }, + { + "epoch": 0.22969963281767292, + "grad_norm": 13.20639419555664, + "learning_rate": 0.00017987215333014782, + "loss": 0.9696, + "step": 954 + }, + { + "epoch": 0.2299404081141275, + "grad_norm": 1.124551773071289, + "learning_rate": 0.00017982521173006568, + "loss": 0.3418, + "step": 955 + }, + { + "epoch": 0.23018118341058208, + "grad_norm": 0.3517683148384094, + "learning_rate": 0.00017977822159607497, + "loss": 0.2291, + "step": 956 + }, + { + "epoch": 0.23042195870703666, + "grad_norm": 2.7812604904174805, + "learning_rate": 0.0001797311829567458, + "loss": 0.966, + "step": 957 + }, + { + "epoch": 0.23066273400349124, + "grad_norm": 1.8114944696426392, + "learning_rate": 0.0001796840958406777, + "loss": 0.5787, + "step": 958 + }, + { + "epoch": 0.23090350929994582, + "grad_norm": 2.012598991394043, + "learning_rate": 0.00017963696027649986, + "loss": 1.1201, + "step": 959 + }, + { + "epoch": 0.2311442845964004, + "grad_norm": 1.5761219263076782, + "learning_rate": 0.00017958977629287074, + "loss": 0.9017, + "step": 960 + }, + { + "epoch": 0.231385059892855, + "grad_norm": 1.2920587062835693, + "learning_rate": 0.0001795425439184783, + "loss": 0.319, + "step": 961 + }, + { + "epoch": 0.23162583518930957, + "grad_norm": 6.733016014099121, + "learning_rate": 0.00017949526318203997, + "loss": 0.7354, + "step": 962 + }, + { + "epoch": 0.23186661048576415, + "grad_norm": 1.5943965911865234, + "learning_rate": 0.0001794479341123025, + "loss": 0.2783, + "step": 963 + }, + { + "epoch": 0.23210738578221873, + "grad_norm": 1.023605227470398, + "learning_rate": 0.00017940055673804208, + "loss": 0.5166, + "step": 964 + }, + { + "epoch": 0.23234816107867332, + "grad_norm": 0.7512199282646179, + "learning_rate": 0.00017935313108806427, + "loss": 0.1101, + "step": 965 + }, + { + "epoch": 0.23258893637512792, + "grad_norm": 3.7386422157287598, + "learning_rate": 0.000179305657191204, + "loss": 0.6783, + "step": 966 + }, + { + "epoch": 0.2328297116715825, + "grad_norm": 1.3405836820602417, + "learning_rate": 0.00017925813507632546, + "loss": 0.5868, + "step": 967 + }, + { + "epoch": 0.2330704869680371, + "grad_norm": 3.388740301132202, + "learning_rate": 0.00017921056477232224, + "loss": 0.5516, + "step": 968 + }, + { + "epoch": 0.23331126226449167, + "grad_norm": 2.8512704372406006, + "learning_rate": 0.00017916294630811717, + "loss": 0.383, + "step": 969 + }, + { + "epoch": 0.23355203756094625, + "grad_norm": 1.5921225547790527, + "learning_rate": 0.00017911527971266238, + "loss": 0.4268, + "step": 970 + }, + { + "epoch": 0.23379281285740083, + "grad_norm": 8.35683536529541, + "learning_rate": 0.00017906756501493925, + "loss": 0.3925, + "step": 971 + }, + { + "epoch": 0.23403358815385542, + "grad_norm": 1.589657187461853, + "learning_rate": 0.0001790198022439585, + "loss": 0.5233, + "step": 972 + }, + { + "epoch": 0.23427436345031, + "grad_norm": 2.5263054370880127, + "learning_rate": 0.00017897199142875994, + "loss": 0.3526, + "step": 973 + }, + { + "epoch": 0.23451513874676458, + "grad_norm": 1.696166753768921, + "learning_rate": 0.00017892413259841265, + "loss": 0.3805, + "step": 974 + }, + { + "epoch": 0.23475591404321916, + "grad_norm": 3.3580451011657715, + "learning_rate": 0.0001788762257820149, + "loss": 0.66, + "step": 975 + }, + { + "epoch": 0.23499668933967374, + "grad_norm": 2.4022610187530518, + "learning_rate": 0.0001788282710086942, + "loss": 0.4526, + "step": 976 + }, + { + "epoch": 0.23523746463612832, + "grad_norm": 2.932914972305298, + "learning_rate": 0.00017878026830760714, + "loss": 0.8118, + "step": 977 + }, + { + "epoch": 0.2354782399325829, + "grad_norm": 2.4748735427856445, + "learning_rate": 0.00017873221770793943, + "loss": 0.7625, + "step": 978 + }, + { + "epoch": 0.2357190152290375, + "grad_norm": 7.512228488922119, + "learning_rate": 0.00017868411923890597, + "loss": 0.8987, + "step": 979 + }, + { + "epoch": 0.2359597905254921, + "grad_norm": 1.6160115003585815, + "learning_rate": 0.00017863597292975075, + "loss": 0.5894, + "step": 980 + }, + { + "epoch": 0.23620056582194668, + "grad_norm": 1.4038505554199219, + "learning_rate": 0.00017858777880974677, + "loss": 0.411, + "step": 981 + }, + { + "epoch": 0.23644134111840126, + "grad_norm": 0.987040102481842, + "learning_rate": 0.00017853953690819628, + "loss": 0.4793, + "step": 982 + }, + { + "epoch": 0.23668211641485584, + "grad_norm": 12.90198802947998, + "learning_rate": 0.00017849124725443033, + "loss": 0.7816, + "step": 983 + }, + { + "epoch": 0.23692289171131042, + "grad_norm": 1.474013090133667, + "learning_rate": 0.00017844290987780926, + "loss": 0.8878, + "step": 984 + }, + { + "epoch": 0.237163667007765, + "grad_norm": 4.9217963218688965, + "learning_rate": 0.0001783945248077222, + "loss": 0.7382, + "step": 985 + }, + { + "epoch": 0.2374044423042196, + "grad_norm": 3.483311891555786, + "learning_rate": 0.0001783460920735875, + "loss": 1.2593, + "step": 986 + }, + { + "epoch": 0.23764521760067417, + "grad_norm": 4.8503594398498535, + "learning_rate": 0.00017829761170485228, + "loss": 1.5008, + "step": 987 + }, + { + "epoch": 0.23788599289712875, + "grad_norm": 1.4968628883361816, + "learning_rate": 0.0001782490837309927, + "loss": 0.62, + "step": 988 + }, + { + "epoch": 0.23812676819358333, + "grad_norm": 2.4329562187194824, + "learning_rate": 0.00017820050818151395, + "loss": 0.7213, + "step": 989 + }, + { + "epoch": 0.23836754349003791, + "grad_norm": 7.601263523101807, + "learning_rate": 0.00017815188508595002, + "loss": 0.4269, + "step": 990 + }, + { + "epoch": 0.2386083187864925, + "grad_norm": 2.8010635375976562, + "learning_rate": 0.00017810321447386387, + "loss": 0.9812, + "step": 991 + }, + { + "epoch": 0.23884909408294708, + "grad_norm": 4.355586051940918, + "learning_rate": 0.0001780544963748474, + "loss": 0.6753, + "step": 992 + }, + { + "epoch": 0.2390898693794017, + "grad_norm": 1.6186625957489014, + "learning_rate": 0.00017800573081852122, + "loss": 0.5759, + "step": 993 + }, + { + "epoch": 0.23933064467585627, + "grad_norm": 1.3594582080841064, + "learning_rate": 0.000177956917834535, + "loss": 0.6777, + "step": 994 + }, + { + "epoch": 0.23957141997231085, + "grad_norm": 0.8430949449539185, + "learning_rate": 0.00017790805745256704, + "loss": 0.4463, + "step": 995 + }, + { + "epoch": 0.23981219526876543, + "grad_norm": 3.564265012741089, + "learning_rate": 0.00017785914970232467, + "loss": 0.7162, + "step": 996 + }, + { + "epoch": 0.24005297056522001, + "grad_norm": 2.442955255508423, + "learning_rate": 0.00017781019461354385, + "loss": 1.1975, + "step": 997 + }, + { + "epoch": 0.2402937458616746, + "grad_norm": 2.008604049682617, + "learning_rate": 0.00017776119221598938, + "loss": 0.3523, + "step": 998 + }, + { + "epoch": 0.24053452115812918, + "grad_norm": 5.036071300506592, + "learning_rate": 0.00017771214253945488, + "loss": 0.7299, + "step": 999 + }, + { + "epoch": 0.24077529645458376, + "grad_norm": 2.059300661087036, + "learning_rate": 0.0001776630456137626, + "loss": 1.0976, + "step": 1000 + }, + { + "epoch": 0.24101607175103834, + "grad_norm": 1.5523993968963623, + "learning_rate": 0.0001776139014687636, + "loss": 0.3973, + "step": 1001 + }, + { + "epoch": 0.24125684704749292, + "grad_norm": 2.268207311630249, + "learning_rate": 0.00017756471013433766, + "loss": 0.6189, + "step": 1002 + }, + { + "epoch": 0.2414976223439475, + "grad_norm": 1.0523104667663574, + "learning_rate": 0.0001775154716403932, + "loss": 0.6191, + "step": 1003 + }, + { + "epoch": 0.2417383976404021, + "grad_norm": 1.8148690462112427, + "learning_rate": 0.00017746618601686734, + "loss": 0.5895, + "step": 1004 + }, + { + "epoch": 0.24197917293685667, + "grad_norm": 2.2843098640441895, + "learning_rate": 0.00017741685329372584, + "loss": 0.4135, + "step": 1005 + }, + { + "epoch": 0.24221994823331125, + "grad_norm": 1.5911093950271606, + "learning_rate": 0.00017736747350096313, + "loss": 0.3805, + "step": 1006 + }, + { + "epoch": 0.24246072352976586, + "grad_norm": 1.609438180923462, + "learning_rate": 0.00017731804666860218, + "loss": 0.4508, + "step": 1007 + }, + { + "epoch": 0.24270149882622044, + "grad_norm": 9.2236328125, + "learning_rate": 0.0001772685728266947, + "loss": 0.7403, + "step": 1008 + }, + { + "epoch": 0.24294227412267502, + "grad_norm": 4.165558815002441, + "learning_rate": 0.00017721905200532084, + "loss": 0.4195, + "step": 1009 + }, + { + "epoch": 0.2431830494191296, + "grad_norm": 2.679929494857788, + "learning_rate": 0.00017716948423458938, + "loss": 0.696, + "step": 1010 + }, + { + "epoch": 0.24342382471558419, + "grad_norm": 2.558372974395752, + "learning_rate": 0.00017711986954463765, + "loss": 0.8344, + "step": 1011 + }, + { + "epoch": 0.24366460001203877, + "grad_norm": 2.897308588027954, + "learning_rate": 0.0001770702079656315, + "loss": 0.4203, + "step": 1012 + }, + { + "epoch": 0.24390537530849335, + "grad_norm": 3.2203593254089355, + "learning_rate": 0.00017702049952776522, + "loss": 0.7664, + "step": 1013 + }, + { + "epoch": 0.24414615060494793, + "grad_norm": 4.204813480377197, + "learning_rate": 0.00017697074426126173, + "loss": 0.3801, + "step": 1014 + }, + { + "epoch": 0.2443869259014025, + "grad_norm": 0.8308073878288269, + "learning_rate": 0.0001769209421963723, + "loss": 0.2596, + "step": 1015 + }, + { + "epoch": 0.2446277011978571, + "grad_norm": 2.2909529209136963, + "learning_rate": 0.00017687109336337673, + "loss": 0.3914, + "step": 1016 + }, + { + "epoch": 0.24486847649431168, + "grad_norm": 3.4535796642303467, + "learning_rate": 0.00017682119779258317, + "loss": 0.7128, + "step": 1017 + }, + { + "epoch": 0.24510925179076626, + "grad_norm": 2.2746803760528564, + "learning_rate": 0.0001767712555143283, + "loss": 0.6153, + "step": 1018 + }, + { + "epoch": 0.24535002708722084, + "grad_norm": 3.151444435119629, + "learning_rate": 0.00017672126655897708, + "loss": 0.8, + "step": 1019 + }, + { + "epoch": 0.24559080238367542, + "grad_norm": 7.057896614074707, + "learning_rate": 0.00017667123095692296, + "loss": 0.4853, + "step": 1020 + }, + { + "epoch": 0.24583157768013003, + "grad_norm": 1.5912202596664429, + "learning_rate": 0.00017662114873858768, + "loss": 0.5406, + "step": 1021 + }, + { + "epoch": 0.2460723529765846, + "grad_norm": 4.36636209487915, + "learning_rate": 0.00017657101993442132, + "loss": 1.5037, + "step": 1022 + }, + { + "epoch": 0.2463131282730392, + "grad_norm": 3.4972012042999268, + "learning_rate": 0.00017652084457490233, + "loss": 0.4583, + "step": 1023 + }, + { + "epoch": 0.24655390356949378, + "grad_norm": 1.963361382484436, + "learning_rate": 0.00017647062269053745, + "loss": 0.5212, + "step": 1024 + }, + { + "epoch": 0.24679467886594836, + "grad_norm": 8.170878410339355, + "learning_rate": 0.00017642035431186166, + "loss": 0.3219, + "step": 1025 + }, + { + "epoch": 0.24703545416240294, + "grad_norm": 5.340506076812744, + "learning_rate": 0.00017637003946943826, + "loss": 0.8826, + "step": 1026 + }, + { + "epoch": 0.24727622945885752, + "grad_norm": 0.9775887727737427, + "learning_rate": 0.00017631967819385885, + "loss": 0.7689, + "step": 1027 + }, + { + "epoch": 0.2475170047553121, + "grad_norm": 5.842097759246826, + "learning_rate": 0.0001762692705157431, + "loss": 1.3133, + "step": 1028 + }, + { + "epoch": 0.24775778005176669, + "grad_norm": 3.479212999343872, + "learning_rate": 0.00017621881646573905, + "loss": 0.6421, + "step": 1029 + }, + { + "epoch": 0.24799855534822127, + "grad_norm": 2.3998911380767822, + "learning_rate": 0.00017616831607452288, + "loss": 0.9605, + "step": 1030 + }, + { + "epoch": 0.24823933064467585, + "grad_norm": 2.134242057800293, + "learning_rate": 0.00017611776937279894, + "loss": 0.5968, + "step": 1031 + }, + { + "epoch": 0.24848010594113043, + "grad_norm": 1.5552438497543335, + "learning_rate": 0.00017606717639129967, + "loss": 0.5313, + "step": 1032 + }, + { + "epoch": 0.248720881237585, + "grad_norm": 1.7223352193832397, + "learning_rate": 0.00017601653716078583, + "loss": 0.6771, + "step": 1033 + }, + { + "epoch": 0.2489616565340396, + "grad_norm": 1.0817844867706299, + "learning_rate": 0.00017596585171204612, + "loss": 0.0747, + "step": 1034 + }, + { + "epoch": 0.2492024318304942, + "grad_norm": 16.8873291015625, + "learning_rate": 0.0001759151200758974, + "loss": 0.7068, + "step": 1035 + }, + { + "epoch": 0.24944320712694878, + "grad_norm": 3.909327983856201, + "learning_rate": 0.00017586434228318462, + "loss": 1.1171, + "step": 1036 + }, + { + "epoch": 0.24968398242340337, + "grad_norm": 1.0942474603652954, + "learning_rate": 0.00017581351836478085, + "loss": 0.3179, + "step": 1037 + }, + { + "epoch": 0.24992475771985795, + "grad_norm": 1.4328174591064453, + "learning_rate": 0.00017576264835158706, + "loss": 0.5279, + "step": 1038 + }, + { + "epoch": 0.2501655330163125, + "grad_norm": 1.4774179458618164, + "learning_rate": 0.0001757117322745324, + "loss": 0.5594, + "step": 1039 + }, + { + "epoch": 0.2504063083127671, + "grad_norm": 5.494201183319092, + "learning_rate": 0.00017566077016457394, + "loss": 0.892, + "step": 1040 + }, + { + "epoch": 0.2506470836092217, + "grad_norm": 0.7356401085853577, + "learning_rate": 0.00017560976205269673, + "loss": 0.3253, + "step": 1041 + }, + { + "epoch": 0.2508878589056763, + "grad_norm": 1.2550084590911865, + "learning_rate": 0.00017555870796991387, + "loss": 0.4044, + "step": 1042 + }, + { + "epoch": 0.2511286342021309, + "grad_norm": 5.170292377471924, + "learning_rate": 0.00017550760794726633, + "loss": 0.5862, + "step": 1043 + }, + { + "epoch": 0.25136940949858544, + "grad_norm": 1.0770255327224731, + "learning_rate": 0.00017545646201582303, + "loss": 0.7886, + "step": 1044 + }, + { + "epoch": 0.25161018479504005, + "grad_norm": 1.4369720220565796, + "learning_rate": 0.0001754052702066808, + "loss": 0.3612, + "step": 1045 + }, + { + "epoch": 0.2518509600914946, + "grad_norm": 1.905137300491333, + "learning_rate": 0.00017535403255096444, + "loss": 0.9435, + "step": 1046 + }, + { + "epoch": 0.2520917353879492, + "grad_norm": 16.227949142456055, + "learning_rate": 0.00017530274907982647, + "loss": 1.7011, + "step": 1047 + }, + { + "epoch": 0.25233251068440377, + "grad_norm": 1.4642868041992188, + "learning_rate": 0.0001752514198244474, + "loss": 0.5992, + "step": 1048 + }, + { + "epoch": 0.2525732859808584, + "grad_norm": 2.769197463989258, + "learning_rate": 0.00017520004481603554, + "loss": 1.3272, + "step": 1049 + }, + { + "epoch": 0.25281406127731293, + "grad_norm": 2.604154586791992, + "learning_rate": 0.00017514862408582701, + "loss": 0.6135, + "step": 1050 + }, + { + "epoch": 0.25305483657376754, + "grad_norm": 4.081873416900635, + "learning_rate": 0.00017509715766508575, + "loss": 1.1481, + "step": 1051 + }, + { + "epoch": 0.2532956118702221, + "grad_norm": 5.054668426513672, + "learning_rate": 0.0001750456455851034, + "loss": 0.8622, + "step": 1052 + }, + { + "epoch": 0.2535363871666767, + "grad_norm": 3.3023860454559326, + "learning_rate": 0.00017499408787719945, + "loss": 0.6033, + "step": 1053 + }, + { + "epoch": 0.25377716246313126, + "grad_norm": 1.7069976329803467, + "learning_rate": 0.00017494248457272112, + "loss": 0.4344, + "step": 1054 + }, + { + "epoch": 0.25401793775958587, + "grad_norm": 4.372264385223389, + "learning_rate": 0.00017489083570304333, + "loss": 1.2165, + "step": 1055 + }, + { + "epoch": 0.2542587130560405, + "grad_norm": 3.081066608428955, + "learning_rate": 0.00017483914129956868, + "loss": 0.8693, + "step": 1056 + }, + { + "epoch": 0.25449948835249503, + "grad_norm": 3.6236472129821777, + "learning_rate": 0.00017478740139372753, + "loss": 0.8538, + "step": 1057 + }, + { + "epoch": 0.25474026364894964, + "grad_norm": 1.3119887113571167, + "learning_rate": 0.00017473561601697783, + "loss": 0.6279, + "step": 1058 + }, + { + "epoch": 0.2549810389454042, + "grad_norm": 1.9135150909423828, + "learning_rate": 0.0001746837852008052, + "loss": 0.2616, + "step": 1059 + }, + { + "epoch": 0.2552218142418588, + "grad_norm": 3.3330864906311035, + "learning_rate": 0.0001746319089767229, + "loss": 0.5011, + "step": 1060 + }, + { + "epoch": 0.25546258953831336, + "grad_norm": 1.3728001117706299, + "learning_rate": 0.00017457998737627182, + "loss": 0.3416, + "step": 1061 + }, + { + "epoch": 0.25570336483476797, + "grad_norm": 1.3205347061157227, + "learning_rate": 0.00017452802043102034, + "loss": 0.4671, + "step": 1062 + }, + { + "epoch": 0.2559441401312225, + "grad_norm": 1.428043246269226, + "learning_rate": 0.00017447600817256458, + "loss": 0.9892, + "step": 1063 + }, + { + "epoch": 0.25618491542767713, + "grad_norm": 6.333396911621094, + "learning_rate": 0.000174423950632528, + "loss": 0.2126, + "step": 1064 + }, + { + "epoch": 0.2564256907241317, + "grad_norm": 4.501138210296631, + "learning_rate": 0.00017437184784256177, + "loss": 1.0723, + "step": 1065 + }, + { + "epoch": 0.2566664660205863, + "grad_norm": 6.165459632873535, + "learning_rate": 0.0001743196998343445, + "loss": 0.7911, + "step": 1066 + }, + { + "epoch": 0.25690724131704085, + "grad_norm": 2.045748710632324, + "learning_rate": 0.00017426750663958231, + "loss": 0.6512, + "step": 1067 + }, + { + "epoch": 0.25714801661349546, + "grad_norm": 15.06201457977295, + "learning_rate": 0.00017421526829000872, + "loss": 1.1931, + "step": 1068 + }, + { + "epoch": 0.25738879190995007, + "grad_norm": 1.8321692943572998, + "learning_rate": 0.00017416298481738482, + "loss": 0.5883, + "step": 1069 + }, + { + "epoch": 0.2576295672064046, + "grad_norm": 1.6688590049743652, + "learning_rate": 0.00017411065625349905, + "loss": 0.2832, + "step": 1070 + }, + { + "epoch": 0.25787034250285923, + "grad_norm": 3.1032514572143555, + "learning_rate": 0.00017405828263016734, + "loss": 0.6419, + "step": 1071 + }, + { + "epoch": 0.2581111177993138, + "grad_norm": 1.9432801008224487, + "learning_rate": 0.00017400586397923288, + "loss": 0.5049, + "step": 1072 + }, + { + "epoch": 0.2583518930957684, + "grad_norm": 1.8286429643630981, + "learning_rate": 0.0001739534003325664, + "loss": 0.9415, + "step": 1073 + }, + { + "epoch": 0.25859266839222295, + "grad_norm": 1.2921900749206543, + "learning_rate": 0.00017390089172206592, + "loss": 0.1063, + "step": 1074 + }, + { + "epoch": 0.25883344368867756, + "grad_norm": 2.301280975341797, + "learning_rate": 0.00017384833817965674, + "loss": 0.6128, + "step": 1075 + }, + { + "epoch": 0.2590742189851321, + "grad_norm": 2.117572069168091, + "learning_rate": 0.00017379573973729163, + "loss": 0.2485, + "step": 1076 + }, + { + "epoch": 0.2593149942815867, + "grad_norm": 44.768497467041016, + "learning_rate": 0.0001737430964269504, + "loss": 1.1191, + "step": 1077 + }, + { + "epoch": 0.2595557695780413, + "grad_norm": 3.337317943572998, + "learning_rate": 0.00017369040828064047, + "loss": 0.372, + "step": 1078 + }, + { + "epoch": 0.2597965448744959, + "grad_norm": 2.307708263397217, + "learning_rate": 0.00017363767533039626, + "loss": 0.9921, + "step": 1079 + }, + { + "epoch": 0.26003732017095044, + "grad_norm": 1.9676494598388672, + "learning_rate": 0.00017358489760827954, + "loss": 0.1324, + "step": 1080 + }, + { + "epoch": 0.26027809546740505, + "grad_norm": 2.810729503631592, + "learning_rate": 0.00017353207514637928, + "loss": 0.5826, + "step": 1081 + }, + { + "epoch": 0.26051887076385966, + "grad_norm": 0.4161555767059326, + "learning_rate": 0.00017347920797681165, + "loss": 0.1594, + "step": 1082 + }, + { + "epoch": 0.2607596460603142, + "grad_norm": 2.319537878036499, + "learning_rate": 0.00017342629613172005, + "loss": 1.0077, + "step": 1083 + }, + { + "epoch": 0.2610004213567688, + "grad_norm": 6.007627487182617, + "learning_rate": 0.00017337333964327493, + "loss": 0.7686, + "step": 1084 + }, + { + "epoch": 0.2612411966532234, + "grad_norm": 1.204407811164856, + "learning_rate": 0.00017332033854367405, + "loss": 0.3591, + "step": 1085 + }, + { + "epoch": 0.261481971949678, + "grad_norm": 1.132603406906128, + "learning_rate": 0.00017326729286514208, + "loss": 0.5379, + "step": 1086 + }, + { + "epoch": 0.26172274724613254, + "grad_norm": 2.5757410526275635, + "learning_rate": 0.00017321420263993102, + "loss": 0.4672, + "step": 1087 + }, + { + "epoch": 0.26196352254258715, + "grad_norm": 4.104795932769775, + "learning_rate": 0.0001731610679003198, + "loss": 0.9948, + "step": 1088 + }, + { + "epoch": 0.2622042978390417, + "grad_norm": 2.2880449295043945, + "learning_rate": 0.00017310788867861446, + "loss": 0.5483, + "step": 1089 + }, + { + "epoch": 0.2624450731354963, + "grad_norm": 0.9389917254447937, + "learning_rate": 0.00017305466500714808, + "loss": 0.4569, + "step": 1090 + }, + { + "epoch": 0.26268584843195086, + "grad_norm": 1.6185779571533203, + "learning_rate": 0.00017300139691828076, + "loss": 0.5776, + "step": 1091 + }, + { + "epoch": 0.2629266237284055, + "grad_norm": 1.6307772397994995, + "learning_rate": 0.00017294808444439966, + "loss": 0.3469, + "step": 1092 + }, + { + "epoch": 0.26316739902486, + "grad_norm": 1.6767011880874634, + "learning_rate": 0.00017289472761791887, + "loss": 0.4124, + "step": 1093 + }, + { + "epoch": 0.26340817432131464, + "grad_norm": 3.5887739658355713, + "learning_rate": 0.00017284132647127947, + "loss": 0.7729, + "step": 1094 + }, + { + "epoch": 0.2636489496177692, + "grad_norm": 2.98939847946167, + "learning_rate": 0.00017278788103694943, + "loss": 0.547, + "step": 1095 + }, + { + "epoch": 0.2638897249142238, + "grad_norm": 3.3976495265960693, + "learning_rate": 0.00017273439134742372, + "loss": 0.9218, + "step": 1096 + }, + { + "epoch": 0.2641305002106784, + "grad_norm": 4.070367813110352, + "learning_rate": 0.00017268085743522423, + "loss": 0.6816, + "step": 1097 + }, + { + "epoch": 0.26437127550713296, + "grad_norm": 2.875377893447876, + "learning_rate": 0.00017262727933289965, + "loss": 0.3219, + "step": 1098 + }, + { + "epoch": 0.2646120508035876, + "grad_norm": 2.2826290130615234, + "learning_rate": 0.0001725736570730256, + "loss": 0.4502, + "step": 1099 + }, + { + "epoch": 0.2648528261000421, + "grad_norm": 1.6275043487548828, + "learning_rate": 0.00017251999068820456, + "loss": 0.2921, + "step": 1100 + }, + { + "epoch": 0.26509360139649674, + "grad_norm": 7.831151962280273, + "learning_rate": 0.00017246628021106577, + "loss": 0.6283, + "step": 1101 + }, + { + "epoch": 0.2653343766929513, + "grad_norm": 2.219731569290161, + "learning_rate": 0.00017241252567426534, + "loss": 0.4064, + "step": 1102 + }, + { + "epoch": 0.2655751519894059, + "grad_norm": 3.912492036819458, + "learning_rate": 0.00017235872711048617, + "loss": 1.037, + "step": 1103 + }, + { + "epoch": 0.26581592728586045, + "grad_norm": 2.191307783126831, + "learning_rate": 0.00017230488455243788, + "loss": 0.8365, + "step": 1104 + }, + { + "epoch": 0.26605670258231506, + "grad_norm": 4.49420690536499, + "learning_rate": 0.00017225099803285692, + "loss": 0.6711, + "step": 1105 + }, + { + "epoch": 0.2662974778787696, + "grad_norm": 3.0059874057769775, + "learning_rate": 0.00017219706758450631, + "loss": 0.8782, + "step": 1106 + }, + { + "epoch": 0.2665382531752242, + "grad_norm": 0.6084616780281067, + "learning_rate": 0.00017214309324017598, + "loss": 0.146, + "step": 1107 + }, + { + "epoch": 0.2667790284716788, + "grad_norm": 3.9558541774749756, + "learning_rate": 0.0001720890750326824, + "loss": 0.4695, + "step": 1108 + }, + { + "epoch": 0.2670198037681334, + "grad_norm": 3.0784409046173096, + "learning_rate": 0.00017203501299486881, + "loss": 0.9544, + "step": 1109 + }, + { + "epoch": 0.267260579064588, + "grad_norm": 0.6500839591026306, + "learning_rate": 0.000171980907159605, + "loss": 0.2016, + "step": 1110 + }, + { + "epoch": 0.26750135436104255, + "grad_norm": 4.700311660766602, + "learning_rate": 0.00017192675755978748, + "loss": 1.1171, + "step": 1111 + }, + { + "epoch": 0.26774212965749716, + "grad_norm": 1.3093310594558716, + "learning_rate": 0.00017187256422833929, + "loss": 0.2874, + "step": 1112 + }, + { + "epoch": 0.2679829049539517, + "grad_norm": 2.463928461074829, + "learning_rate": 0.0001718183271982101, + "loss": 0.4196, + "step": 1113 + }, + { + "epoch": 0.2682236802504063, + "grad_norm": 3.2348127365112305, + "learning_rate": 0.0001717640465023762, + "loss": 1.1128, + "step": 1114 + }, + { + "epoch": 0.2684644555468609, + "grad_norm": 2.890456438064575, + "learning_rate": 0.00017170972217384035, + "loss": 0.5629, + "step": 1115 + }, + { + "epoch": 0.2687052308433155, + "grad_norm": 4.826290607452393, + "learning_rate": 0.00017165535424563185, + "loss": 0.7627, + "step": 1116 + }, + { + "epoch": 0.26894600613977004, + "grad_norm": 2.350214719772339, + "learning_rate": 0.00017160094275080648, + "loss": 0.8664, + "step": 1117 + }, + { + "epoch": 0.26918678143622465, + "grad_norm": 2.407381772994995, + "learning_rate": 0.00017154648772244664, + "loss": 0.5608, + "step": 1118 + }, + { + "epoch": 0.2694275567326792, + "grad_norm": 4.348508834838867, + "learning_rate": 0.00017149198919366105, + "loss": 0.7147, + "step": 1119 + }, + { + "epoch": 0.2696683320291338, + "grad_norm": 2.3149821758270264, + "learning_rate": 0.00017143744719758499, + "loss": 0.8603, + "step": 1120 + }, + { + "epoch": 0.26990910732558837, + "grad_norm": 2.070988893508911, + "learning_rate": 0.00017138286176738006, + "loss": 0.3237, + "step": 1121 + }, + { + "epoch": 0.270149882622043, + "grad_norm": 2.3443868160247803, + "learning_rate": 0.00017132823293623432, + "loss": 0.313, + "step": 1122 + }, + { + "epoch": 0.27039065791849753, + "grad_norm": 2.001828908920288, + "learning_rate": 0.0001712735607373623, + "loss": 0.665, + "step": 1123 + }, + { + "epoch": 0.27063143321495214, + "grad_norm": 8.432289123535156, + "learning_rate": 0.00017121884520400474, + "loss": 0.7836, + "step": 1124 + }, + { + "epoch": 0.27087220851140675, + "grad_norm": 2.163132429122925, + "learning_rate": 0.00017116408636942888, + "loss": 0.2619, + "step": 1125 + }, + { + "epoch": 0.2711129838078613, + "grad_norm": 1.5865484476089478, + "learning_rate": 0.0001711092842669281, + "loss": 0.4622, + "step": 1126 + }, + { + "epoch": 0.2713537591043159, + "grad_norm": 2.0779945850372314, + "learning_rate": 0.0001710544389298223, + "loss": 0.3762, + "step": 1127 + }, + { + "epoch": 0.27159453440077047, + "grad_norm": 7.750448703765869, + "learning_rate": 0.00017099955039145758, + "loss": 0.7578, + "step": 1128 + }, + { + "epoch": 0.2718353096972251, + "grad_norm": 1.3454210758209229, + "learning_rate": 0.00017094461868520622, + "loss": 0.5281, + "step": 1129 + }, + { + "epoch": 0.27207608499367963, + "grad_norm": 1.0360485315322876, + "learning_rate": 0.0001708896438444669, + "loss": 0.7575, + "step": 1130 + }, + { + "epoch": 0.27231686029013424, + "grad_norm": 1.6887176036834717, + "learning_rate": 0.00017083462590266438, + "loss": 0.4004, + "step": 1131 + }, + { + "epoch": 0.2725576355865888, + "grad_norm": 2.2301809787750244, + "learning_rate": 0.00017077956489324972, + "loss": 0.4566, + "step": 1132 + }, + { + "epoch": 0.2727984108830434, + "grad_norm": 1.984376311302185, + "learning_rate": 0.00017072446084970014, + "loss": 0.3397, + "step": 1133 + }, + { + "epoch": 0.27303918617949796, + "grad_norm": 1.465584397315979, + "learning_rate": 0.000170669313805519, + "loss": 1.118, + "step": 1134 + }, + { + "epoch": 0.27327996147595257, + "grad_norm": 3.79280948638916, + "learning_rate": 0.00017061412379423588, + "loss": 1.0574, + "step": 1135 + }, + { + "epoch": 0.2735207367724071, + "grad_norm": 2.3764195442199707, + "learning_rate": 0.00017055889084940638, + "loss": 0.66, + "step": 1136 + }, + { + "epoch": 0.27376151206886173, + "grad_norm": 4.677147388458252, + "learning_rate": 0.00017050361500461225, + "loss": 1.764, + "step": 1137 + }, + { + "epoch": 0.27400228736531634, + "grad_norm": 0.9030484557151794, + "learning_rate": 0.00017044829629346138, + "loss": 0.6526, + "step": 1138 + }, + { + "epoch": 0.2742430626617709, + "grad_norm": 0.627707302570343, + "learning_rate": 0.00017039293474958766, + "loss": 0.3727, + "step": 1139 + }, + { + "epoch": 0.2744838379582255, + "grad_norm": 1.3778057098388672, + "learning_rate": 0.00017033753040665098, + "loss": 0.6493, + "step": 1140 + }, + { + "epoch": 0.27472461325468006, + "grad_norm": 4.6707305908203125, + "learning_rate": 0.00017028208329833734, + "loss": 0.9048, + "step": 1141 + }, + { + "epoch": 0.27496538855113467, + "grad_norm": 0.9319252967834473, + "learning_rate": 0.00017022659345835873, + "loss": 0.199, + "step": 1142 + }, + { + "epoch": 0.2752061638475892, + "grad_norm": 4.65252685546875, + "learning_rate": 0.00017017106092045308, + "loss": 0.5977, + "step": 1143 + }, + { + "epoch": 0.27544693914404383, + "grad_norm": 2.0846447944641113, + "learning_rate": 0.00017011548571838425, + "loss": 0.8822, + "step": 1144 + }, + { + "epoch": 0.2756877144404984, + "grad_norm": 7.9237961769104, + "learning_rate": 0.00017005986788594217, + "loss": 0.9254, + "step": 1145 + }, + { + "epoch": 0.275928489736953, + "grad_norm": 1.8218225240707397, + "learning_rate": 0.00017000420745694254, + "loss": 0.8144, + "step": 1146 + }, + { + "epoch": 0.27616926503340755, + "grad_norm": 2.215475082397461, + "learning_rate": 0.00016994850446522708, + "loss": 0.5376, + "step": 1147 + }, + { + "epoch": 0.27641004032986216, + "grad_norm": 2.7972052097320557, + "learning_rate": 0.0001698927589446633, + "loss": 0.7172, + "step": 1148 + }, + { + "epoch": 0.2766508156263167, + "grad_norm": 3.2082738876342773, + "learning_rate": 0.00016983697092914462, + "loss": 0.7779, + "step": 1149 + }, + { + "epoch": 0.2768915909227713, + "grad_norm": 7.983036041259766, + "learning_rate": 0.00016978114045259024, + "loss": 0.6586, + "step": 1150 + }, + { + "epoch": 0.27713236621922593, + "grad_norm": 1.2389219999313354, + "learning_rate": 0.00016972526754894526, + "loss": 0.4504, + "step": 1151 + }, + { + "epoch": 0.2773731415156805, + "grad_norm": 4.120885848999023, + "learning_rate": 0.00016966935225218055, + "loss": 0.9209, + "step": 1152 + }, + { + "epoch": 0.2776139168121351, + "grad_norm": 1.707640528678894, + "learning_rate": 0.0001696133945962927, + "loss": 0.4769, + "step": 1153 + }, + { + "epoch": 0.27785469210858965, + "grad_norm": 2.347038745880127, + "learning_rate": 0.00016955739461530403, + "loss": 0.6969, + "step": 1154 + }, + { + "epoch": 0.27809546740504426, + "grad_norm": 1.5984582901000977, + "learning_rate": 0.0001695013523432628, + "loss": 0.7477, + "step": 1155 + }, + { + "epoch": 0.2783362427014988, + "grad_norm": 3.808624267578125, + "learning_rate": 0.0001694452678142427, + "loss": 0.6706, + "step": 1156 + }, + { + "epoch": 0.2785770179979534, + "grad_norm": 2.3861489295959473, + "learning_rate": 0.00016938914106234333, + "loss": 0.492, + "step": 1157 + }, + { + "epoch": 0.278817793294408, + "grad_norm": 6.34063196182251, + "learning_rate": 0.00016933297212168985, + "loss": 0.9194, + "step": 1158 + }, + { + "epoch": 0.2790585685908626, + "grad_norm": 2.32570743560791, + "learning_rate": 0.0001692767610264331, + "loss": 0.3936, + "step": 1159 + }, + { + "epoch": 0.27929934388731714, + "grad_norm": 2.573622226715088, + "learning_rate": 0.0001692205078107496, + "loss": 0.8134, + "step": 1160 + }, + { + "epoch": 0.27954011918377175, + "grad_norm": 2.499985933303833, + "learning_rate": 0.00016916421250884138, + "loss": 0.4928, + "step": 1161 + }, + { + "epoch": 0.2797808944802263, + "grad_norm": 1.9372178316116333, + "learning_rate": 0.00016910787515493611, + "loss": 0.6883, + "step": 1162 + }, + { + "epoch": 0.2800216697766809, + "grad_norm": 1.012056589126587, + "learning_rate": 0.00016905149578328702, + "loss": 0.3567, + "step": 1163 + }, + { + "epoch": 0.28026244507313547, + "grad_norm": 1.383881688117981, + "learning_rate": 0.00016899507442817298, + "loss": 0.7005, + "step": 1164 + }, + { + "epoch": 0.2805032203695901, + "grad_norm": 7.843169212341309, + "learning_rate": 0.00016893861112389822, + "loss": 0.59, + "step": 1165 + }, + { + "epoch": 0.2807439956660447, + "grad_norm": 15.78963851928711, + "learning_rate": 0.00016888210590479256, + "loss": 0.7168, + "step": 1166 + }, + { + "epoch": 0.28098477096249924, + "grad_norm": 1.2202370166778564, + "learning_rate": 0.0001688255588052113, + "loss": 0.2079, + "step": 1167 + }, + { + "epoch": 0.28122554625895385, + "grad_norm": 1.067835807800293, + "learning_rate": 0.0001687689698595353, + "loss": 0.6354, + "step": 1168 + }, + { + "epoch": 0.2814663215554084, + "grad_norm": 0.6400854587554932, + "learning_rate": 0.0001687123391021706, + "loss": 0.307, + "step": 1169 + }, + { + "epoch": 0.281707096851863, + "grad_norm": 2.6087357997894287, + "learning_rate": 0.00016865566656754896, + "loss": 0.4111, + "step": 1170 + }, + { + "epoch": 0.28194787214831757, + "grad_norm": 1.9883902072906494, + "learning_rate": 0.00016859895229012737, + "loss": 0.6824, + "step": 1171 + }, + { + "epoch": 0.2821886474447722, + "grad_norm": 2.6531500816345215, + "learning_rate": 0.00016854219630438818, + "loss": 0.745, + "step": 1172 + }, + { + "epoch": 0.28242942274122673, + "grad_norm": 0.8592819571495056, + "learning_rate": 0.00016848539864483926, + "loss": 0.7847, + "step": 1173 + }, + { + "epoch": 0.28267019803768134, + "grad_norm": 4.981196880340576, + "learning_rate": 0.00016842855934601366, + "loss": 0.9405, + "step": 1174 + }, + { + "epoch": 0.2829109733341359, + "grad_norm": 1.9096482992172241, + "learning_rate": 0.0001683716784424698, + "loss": 0.8852, + "step": 1175 + }, + { + "epoch": 0.2831517486305905, + "grad_norm": 7.072299003601074, + "learning_rate": 0.0001683147559687914, + "loss": 1.6136, + "step": 1176 + }, + { + "epoch": 0.28339252392704506, + "grad_norm": 18.518299102783203, + "learning_rate": 0.00016825779195958745, + "loss": 0.2307, + "step": 1177 + }, + { + "epoch": 0.28363329922349967, + "grad_norm": 2.7872228622436523, + "learning_rate": 0.0001682007864494922, + "loss": 0.6282, + "step": 1178 + }, + { + "epoch": 0.2838740745199543, + "grad_norm": 1.4213825464248657, + "learning_rate": 0.00016814373947316512, + "loss": 0.6838, + "step": 1179 + }, + { + "epoch": 0.28411484981640883, + "grad_norm": 1.1344329118728638, + "learning_rate": 0.00016808665106529094, + "loss": 0.4394, + "step": 1180 + }, + { + "epoch": 0.28435562511286344, + "grad_norm": 1.0440508127212524, + "learning_rate": 0.0001680295212605795, + "loss": 0.1343, + "step": 1181 + }, + { + "epoch": 0.284596400409318, + "grad_norm": 3.40962553024292, + "learning_rate": 0.00016797235009376586, + "loss": 0.6312, + "step": 1182 + }, + { + "epoch": 0.2848371757057726, + "grad_norm": 3.0211853981018066, + "learning_rate": 0.0001679151375996102, + "loss": 0.6371, + "step": 1183 + }, + { + "epoch": 0.28507795100222716, + "grad_norm": 8.21009635925293, + "learning_rate": 0.0001678578838128979, + "loss": 0.7002, + "step": 1184 + }, + { + "epoch": 0.28531872629868177, + "grad_norm": 2.1480865478515625, + "learning_rate": 0.00016780058876843934, + "loss": 0.4914, + "step": 1185 + }, + { + "epoch": 0.2855595015951363, + "grad_norm": 1.2523528337478638, + "learning_rate": 0.00016774325250107006, + "loss": 0.5931, + "step": 1186 + }, + { + "epoch": 0.28580027689159093, + "grad_norm": 1.5123728513717651, + "learning_rate": 0.00016768587504565062, + "loss": 0.439, + "step": 1187 + }, + { + "epoch": 0.2860410521880455, + "grad_norm": 1.9221967458724976, + "learning_rate": 0.00016762845643706665, + "loss": 0.6541, + "step": 1188 + }, + { + "epoch": 0.2862818274845001, + "grad_norm": 4.153512477874756, + "learning_rate": 0.00016757099671022883, + "loss": 0.7725, + "step": 1189 + }, + { + "epoch": 0.28652260278095465, + "grad_norm": 1.0292513370513916, + "learning_rate": 0.00016751349590007274, + "loss": 0.5082, + "step": 1190 + }, + { + "epoch": 0.28676337807740926, + "grad_norm": 4.168222904205322, + "learning_rate": 0.00016745595404155905, + "loss": 0.5705, + "step": 1191 + }, + { + "epoch": 0.28700415337386387, + "grad_norm": 1.6598914861679077, + "learning_rate": 0.00016739837116967328, + "loss": 0.8381, + "step": 1192 + }, + { + "epoch": 0.2872449286703184, + "grad_norm": 2.8263731002807617, + "learning_rate": 0.00016734074731942605, + "loss": 0.7783, + "step": 1193 + }, + { + "epoch": 0.28748570396677303, + "grad_norm": 1.6634050607681274, + "learning_rate": 0.00016728308252585267, + "loss": 0.3698, + "step": 1194 + }, + { + "epoch": 0.2877264792632276, + "grad_norm": 2.690964937210083, + "learning_rate": 0.00016722537682401357, + "loss": 0.4771, + "step": 1195 + }, + { + "epoch": 0.2879672545596822, + "grad_norm": 0.9511985778808594, + "learning_rate": 0.0001671676302489939, + "loss": 0.2755, + "step": 1196 + }, + { + "epoch": 0.28820802985613675, + "grad_norm": 2.063718557357788, + "learning_rate": 0.0001671098428359037, + "loss": 0.4661, + "step": 1197 + }, + { + "epoch": 0.28844880515259136, + "grad_norm": 3.1178414821624756, + "learning_rate": 0.00016705201461987782, + "loss": 1.1358, + "step": 1198 + }, + { + "epoch": 0.2886895804490459, + "grad_norm": 1.8301066160202026, + "learning_rate": 0.00016699414563607601, + "loss": 0.3741, + "step": 1199 + }, + { + "epoch": 0.2889303557455005, + "grad_norm": 2.6910312175750732, + "learning_rate": 0.00016693623591968273, + "loss": 1.0457, + "step": 1200 + }, + { + "epoch": 0.2891711310419551, + "grad_norm": 1.9413840770721436, + "learning_rate": 0.0001668782855059072, + "loss": 0.7107, + "step": 1201 + }, + { + "epoch": 0.2894119063384097, + "grad_norm": 1.9084299802780151, + "learning_rate": 0.00016682029442998338, + "loss": 0.9563, + "step": 1202 + }, + { + "epoch": 0.28965268163486424, + "grad_norm": 6.873541831970215, + "learning_rate": 0.00016676226272717, + "loss": 0.8658, + "step": 1203 + }, + { + "epoch": 0.28989345693131885, + "grad_norm": 2.0159761905670166, + "learning_rate": 0.00016670419043275048, + "loss": 0.7841, + "step": 1204 + }, + { + "epoch": 0.2901342322277734, + "grad_norm": 1.8797401189804077, + "learning_rate": 0.00016664607758203287, + "loss": 0.7343, + "step": 1205 + }, + { + "epoch": 0.290375007524228, + "grad_norm": 1.6734647750854492, + "learning_rate": 0.00016658792421034996, + "loss": 0.4975, + "step": 1206 + }, + { + "epoch": 0.2906157828206826, + "grad_norm": 0.8860729932785034, + "learning_rate": 0.00016652973035305907, + "loss": 0.4253, + "step": 1207 + }, + { + "epoch": 0.2908565581171372, + "grad_norm": 0.7764965295791626, + "learning_rate": 0.00016647149604554227, + "loss": 0.7893, + "step": 1208 + }, + { + "epoch": 0.2910973334135918, + "grad_norm": 6.640602111816406, + "learning_rate": 0.0001664132213232061, + "loss": 0.9636, + "step": 1209 + }, + { + "epoch": 0.29133810871004634, + "grad_norm": 6.02003288269043, + "learning_rate": 0.00016635490622148177, + "loss": 0.8415, + "step": 1210 + }, + { + "epoch": 0.29157888400650095, + "grad_norm": 1.2742475271224976, + "learning_rate": 0.00016629655077582487, + "loss": 0.2262, + "step": 1211 + }, + { + "epoch": 0.2918196593029555, + "grad_norm": 0.7330831289291382, + "learning_rate": 0.0001662381550217158, + "loss": 0.4596, + "step": 1212 + }, + { + "epoch": 0.2920604345994101, + "grad_norm": 5.310278415679932, + "learning_rate": 0.00016617971899465922, + "loss": 0.4937, + "step": 1213 + }, + { + "epoch": 0.29230120989586467, + "grad_norm": 3.351181983947754, + "learning_rate": 0.0001661212427301844, + "loss": 0.3122, + "step": 1214 + }, + { + "epoch": 0.2925419851923193, + "grad_norm": 2.28200101852417, + "learning_rate": 0.000166062726263845, + "loss": 1.5276, + "step": 1215 + }, + { + "epoch": 0.29278276048877383, + "grad_norm": 4.403338432312012, + "learning_rate": 0.0001660041696312192, + "loss": 0.8055, + "step": 1216 + }, + { + "epoch": 0.29302353578522844, + "grad_norm": 2.3211700916290283, + "learning_rate": 0.00016594557286790957, + "loss": 0.715, + "step": 1217 + }, + { + "epoch": 0.293264311081683, + "grad_norm": 2.3568782806396484, + "learning_rate": 0.00016588693600954306, + "loss": 0.4839, + "step": 1218 + }, + { + "epoch": 0.2935050863781376, + "grad_norm": 3.552236795425415, + "learning_rate": 0.00016582825909177099, + "loss": 0.8309, + "step": 1219 + }, + { + "epoch": 0.2937458616745922, + "grad_norm": 1.4845949411392212, + "learning_rate": 0.0001657695421502691, + "loss": 0.3576, + "step": 1220 + }, + { + "epoch": 0.29398663697104677, + "grad_norm": 4.3355607986450195, + "learning_rate": 0.00016571078522073737, + "loss": 0.4216, + "step": 1221 + }, + { + "epoch": 0.2942274122675014, + "grad_norm": 2.5869123935699463, + "learning_rate": 0.0001656519883389002, + "loss": 1.0778, + "step": 1222 + }, + { + "epoch": 0.29446818756395593, + "grad_norm": 3.6160268783569336, + "learning_rate": 0.0001655931515405062, + "loss": 0.6609, + "step": 1223 + }, + { + "epoch": 0.29470896286041054, + "grad_norm": 2.8097994327545166, + "learning_rate": 0.00016553427486132828, + "loss": 0.6801, + "step": 1224 + }, + { + "epoch": 0.2949497381568651, + "grad_norm": 2.5700998306274414, + "learning_rate": 0.00016547535833716362, + "loss": 0.3883, + "step": 1225 + }, + { + "epoch": 0.2951905134533197, + "grad_norm": 0.5326368808746338, + "learning_rate": 0.00016541640200383356, + "loss": 0.2599, + "step": 1226 + }, + { + "epoch": 0.29543128874977426, + "grad_norm": 4.097855567932129, + "learning_rate": 0.00016535740589718366, + "loss": 1.4335, + "step": 1227 + }, + { + "epoch": 0.29567206404622887, + "grad_norm": 1.7571992874145508, + "learning_rate": 0.00016529837005308375, + "loss": 0.7812, + "step": 1228 + }, + { + "epoch": 0.2959128393426834, + "grad_norm": 2.6337194442749023, + "learning_rate": 0.00016523929450742774, + "loss": 0.3936, + "step": 1229 + }, + { + "epoch": 0.29615361463913803, + "grad_norm": 0.9062210917472839, + "learning_rate": 0.00016518017929613367, + "loss": 0.3914, + "step": 1230 + }, + { + "epoch": 0.2963943899355926, + "grad_norm": 0.581713080406189, + "learning_rate": 0.00016512102445514375, + "loss": 0.8761, + "step": 1231 + }, + { + "epoch": 0.2966351652320472, + "grad_norm": 1.9768112897872925, + "learning_rate": 0.0001650618300204242, + "loss": 0.5862, + "step": 1232 + }, + { + "epoch": 0.2968759405285018, + "grad_norm": 1.7873097658157349, + "learning_rate": 0.00016500259602796546, + "loss": 0.1979, + "step": 1233 + }, + { + "epoch": 0.29711671582495636, + "grad_norm": 2.351323366165161, + "learning_rate": 0.00016494332251378187, + "loss": 0.6285, + "step": 1234 + }, + { + "epoch": 0.29735749112141097, + "grad_norm": 2.609557628631592, + "learning_rate": 0.00016488400951391186, + "loss": 0.5139, + "step": 1235 + }, + { + "epoch": 0.2975982664178655, + "grad_norm": 2.986835241317749, + "learning_rate": 0.0001648246570644179, + "loss": 0.3242, + "step": 1236 + }, + { + "epoch": 0.29783904171432013, + "grad_norm": 1.083709716796875, + "learning_rate": 0.00016476526520138636, + "loss": 0.7125, + "step": 1237 + }, + { + "epoch": 0.2980798170107747, + "grad_norm": 4.175523281097412, + "learning_rate": 0.0001647058339609277, + "loss": 0.6407, + "step": 1238 + }, + { + "epoch": 0.2983205923072293, + "grad_norm": 1.0320210456848145, + "learning_rate": 0.00016464636337917618, + "loss": 0.4267, + "step": 1239 + }, + { + "epoch": 0.29856136760368385, + "grad_norm": 1.7650171518325806, + "learning_rate": 0.0001645868534922901, + "loss": 0.8656, + "step": 1240 + }, + { + "epoch": 0.29880214290013846, + "grad_norm": 0.3890477418899536, + "learning_rate": 0.00016452730433645153, + "loss": 0.4355, + "step": 1241 + }, + { + "epoch": 0.299042918196593, + "grad_norm": 3.933539390563965, + "learning_rate": 0.0001644677159478666, + "loss": 0.7368, + "step": 1242 + }, + { + "epoch": 0.2992836934930476, + "grad_norm": 3.1213431358337402, + "learning_rate": 0.00016440808836276508, + "loss": 0.5998, + "step": 1243 + }, + { + "epoch": 0.2995244687895022, + "grad_norm": 2.534736156463623, + "learning_rate": 0.00016434842161740075, + "loss": 0.6373, + "step": 1244 + }, + { + "epoch": 0.2997652440859568, + "grad_norm": 1.6457316875457764, + "learning_rate": 0.0001642887157480511, + "loss": 0.8746, + "step": 1245 + }, + { + "epoch": 0.30000601938241134, + "grad_norm": 2.4500882625579834, + "learning_rate": 0.0001642289707910174, + "loss": 0.3703, + "step": 1246 + }, + { + "epoch": 0.30024679467886595, + "grad_norm": 6.751053810119629, + "learning_rate": 0.0001641691867826248, + "loss": 0.9699, + "step": 1247 + }, + { + "epoch": 0.30048756997532056, + "grad_norm": 2.2047617435455322, + "learning_rate": 0.000164109363759222, + "loss": 0.7187, + "step": 1248 + }, + { + "epoch": 0.3007283452717751, + "grad_norm": 1.771125316619873, + "learning_rate": 0.00016404950175718166, + "loss": 0.4576, + "step": 1249 + }, + { + "epoch": 0.3009691205682297, + "grad_norm": 2.1661245822906494, + "learning_rate": 0.0001639896008128999, + "loss": 0.6159, + "step": 1250 + }, + { + "epoch": 0.3012098958646843, + "grad_norm": 4.253533363342285, + "learning_rate": 0.0001639296609627967, + "loss": 0.7709, + "step": 1251 + }, + { + "epoch": 0.3014506711611389, + "grad_norm": 3.3934977054595947, + "learning_rate": 0.00016386968224331558, + "loss": 1.3822, + "step": 1252 + }, + { + "epoch": 0.30169144645759344, + "grad_norm": 4.271642684936523, + "learning_rate": 0.00016380966469092378, + "loss": 1.5999, + "step": 1253 + }, + { + "epoch": 0.30193222175404805, + "grad_norm": 1.2420214414596558, + "learning_rate": 0.00016374960834211204, + "loss": 0.3992, + "step": 1254 + }, + { + "epoch": 0.3021729970505026, + "grad_norm": 1.2237993478775024, + "learning_rate": 0.00016368951323339484, + "loss": 0.2898, + "step": 1255 + }, + { + "epoch": 0.3024137723469572, + "grad_norm": 1.4050495624542236, + "learning_rate": 0.00016362937940131008, + "loss": 0.2777, + "step": 1256 + }, + { + "epoch": 0.30265454764341176, + "grad_norm": 1.4772244691848755, + "learning_rate": 0.0001635692068824193, + "loss": 0.9147, + "step": 1257 + }, + { + "epoch": 0.3028953229398664, + "grad_norm": 4.798654556274414, + "learning_rate": 0.0001635089957133075, + "loss": 0.3208, + "step": 1258 + }, + { + "epoch": 0.30313609823632093, + "grad_norm": 2.012327194213867, + "learning_rate": 0.0001634487459305832, + "loss": 0.691, + "step": 1259 + }, + { + "epoch": 0.30337687353277554, + "grad_norm": 7.864597797393799, + "learning_rate": 0.00016338845757087847, + "loss": 0.7949, + "step": 1260 + }, + { + "epoch": 0.30361764882923015, + "grad_norm": 1.5631287097930908, + "learning_rate": 0.0001633281306708487, + "loss": 0.8655, + "step": 1261 + }, + { + "epoch": 0.3038584241256847, + "grad_norm": 3.419724225997925, + "learning_rate": 0.0001632677652671728, + "loss": 0.7764, + "step": 1262 + }, + { + "epoch": 0.3040991994221393, + "grad_norm": 4.057196617126465, + "learning_rate": 0.00016320736139655305, + "loss": 0.4629, + "step": 1263 + }, + { + "epoch": 0.30433997471859386, + "grad_norm": 2.437304735183716, + "learning_rate": 0.0001631469190957152, + "loss": 0.6043, + "step": 1264 + }, + { + "epoch": 0.3045807500150485, + "grad_norm": 3.452397108078003, + "learning_rate": 0.00016308643840140828, + "loss": 0.9057, + "step": 1265 + }, + { + "epoch": 0.304821525311503, + "grad_norm": 2.3599209785461426, + "learning_rate": 0.00016302591935040463, + "loss": 0.4477, + "step": 1266 + }, + { + "epoch": 0.30506230060795764, + "grad_norm": 2.7127840518951416, + "learning_rate": 0.0001629653619795, + "loss": 0.4695, + "step": 1267 + }, + { + "epoch": 0.3053030759044122, + "grad_norm": 1.4742056131362915, + "learning_rate": 0.00016290476632551347, + "loss": 0.7507, + "step": 1268 + }, + { + "epoch": 0.3055438512008668, + "grad_norm": 1.4544012546539307, + "learning_rate": 0.0001628441324252873, + "loss": 0.5332, + "step": 1269 + }, + { + "epoch": 0.30578462649732135, + "grad_norm": 3.388953685760498, + "learning_rate": 0.000162783460315687, + "loss": 0.9343, + "step": 1270 + }, + { + "epoch": 0.30602540179377596, + "grad_norm": 3.447437047958374, + "learning_rate": 0.00016272275003360135, + "loss": 0.7331, + "step": 1271 + }, + { + "epoch": 0.3062661770902305, + "grad_norm": 2.9696388244628906, + "learning_rate": 0.0001626620016159424, + "loss": 0.2491, + "step": 1272 + }, + { + "epoch": 0.3065069523866851, + "grad_norm": 0.8574854135513306, + "learning_rate": 0.0001626012150996453, + "loss": 0.5318, + "step": 1273 + }, + { + "epoch": 0.30674772768313974, + "grad_norm": 2.6496622562408447, + "learning_rate": 0.00016254039052166833, + "loss": 0.725, + "step": 1274 + }, + { + "epoch": 0.3069885029795943, + "grad_norm": 1.233094334602356, + "learning_rate": 0.00016247952791899307, + "loss": 0.7075, + "step": 1275 + }, + { + "epoch": 0.3072292782760489, + "grad_norm": 1.2451717853546143, + "learning_rate": 0.00016241862732862403, + "loss": 0.8067, + "step": 1276 + }, + { + "epoch": 0.30747005357250345, + "grad_norm": 2.2256247997283936, + "learning_rate": 0.00016235768878758897, + "loss": 0.389, + "step": 1277 + }, + { + "epoch": 0.30771082886895806, + "grad_norm": 2.2310009002685547, + "learning_rate": 0.00016229671233293863, + "loss": 1.3423, + "step": 1278 + }, + { + "epoch": 0.3079516041654126, + "grad_norm": 2.3196895122528076, + "learning_rate": 0.0001622356980017468, + "loss": 0.5485, + "step": 1279 + }, + { + "epoch": 0.3081923794618672, + "grad_norm": 6.365363121032715, + "learning_rate": 0.0001621746458311104, + "loss": 0.6047, + "step": 1280 + }, + { + "epoch": 0.3084331547583218, + "grad_norm": 2.375135898590088, + "learning_rate": 0.00016211355585814925, + "loss": 1.0309, + "step": 1281 + }, + { + "epoch": 0.3086739300547764, + "grad_norm": 3.856171131134033, + "learning_rate": 0.00016205242812000617, + "loss": 0.5747, + "step": 1282 + }, + { + "epoch": 0.30891470535123094, + "grad_norm": 1.3465646505355835, + "learning_rate": 0.00016199126265384702, + "loss": 0.6992, + "step": 1283 + }, + { + "epoch": 0.30915548064768555, + "grad_norm": 3.8031649589538574, + "learning_rate": 0.0001619300594968605, + "loss": 0.6855, + "step": 1284 + }, + { + "epoch": 0.3093962559441401, + "grad_norm": 6.7793169021606445, + "learning_rate": 0.00016186881868625826, + "loss": 0.7541, + "step": 1285 + }, + { + "epoch": 0.3096370312405947, + "grad_norm": 4.37679386138916, + "learning_rate": 0.00016180754025927488, + "loss": 0.7391, + "step": 1286 + }, + { + "epoch": 0.30987780653704927, + "grad_norm": 2.1541247367858887, + "learning_rate": 0.00016174622425316776, + "loss": 0.6678, + "step": 1287 + }, + { + "epoch": 0.3101185818335039, + "grad_norm": 1.5806964635849, + "learning_rate": 0.00016168487070521717, + "loss": 0.6008, + "step": 1288 + }, + { + "epoch": 0.3103593571299585, + "grad_norm": 2.3984477519989014, + "learning_rate": 0.00016162347965272624, + "loss": 0.431, + "step": 1289 + }, + { + "epoch": 0.31060013242641304, + "grad_norm": 5.19956111907959, + "learning_rate": 0.00016156205113302083, + "loss": 1.1046, + "step": 1290 + }, + { + "epoch": 0.31084090772286765, + "grad_norm": 1.4966849088668823, + "learning_rate": 0.00016150058518344963, + "loss": 0.4343, + "step": 1291 + }, + { + "epoch": 0.3110816830193222, + "grad_norm": 1.530099868774414, + "learning_rate": 0.00016143908184138408, + "loss": 0.3569, + "step": 1292 + }, + { + "epoch": 0.3113224583157768, + "grad_norm": 6.92020845413208, + "learning_rate": 0.00016137754114421834, + "loss": 0.4397, + "step": 1293 + }, + { + "epoch": 0.31156323361223137, + "grad_norm": 4.40862512588501, + "learning_rate": 0.0001613159631293693, + "loss": 0.7515, + "step": 1294 + }, + { + "epoch": 0.311804008908686, + "grad_norm": 6.846129894256592, + "learning_rate": 0.00016125434783427654, + "loss": 0.9461, + "step": 1295 + }, + { + "epoch": 0.31204478420514054, + "grad_norm": 6.169475078582764, + "learning_rate": 0.0001611926952964023, + "loss": 1.5009, + "step": 1296 + }, + { + "epoch": 0.31228555950159514, + "grad_norm": 2.2589635848999023, + "learning_rate": 0.0001611310055532314, + "loss": 0.6197, + "step": 1297 + }, + { + "epoch": 0.3125263347980497, + "grad_norm": 5.03438663482666, + "learning_rate": 0.00016106927864227143, + "loss": 1.2404, + "step": 1298 + }, + { + "epoch": 0.3127671100945043, + "grad_norm": 2.119262456893921, + "learning_rate": 0.00016100751460105243, + "loss": 0.3959, + "step": 1299 + }, + { + "epoch": 0.31300788539095886, + "grad_norm": 1.556208610534668, + "learning_rate": 0.00016094571346712716, + "loss": 0.3569, + "step": 1300 + }, + { + "epoch": 0.31324866068741347, + "grad_norm": 3.7477822303771973, + "learning_rate": 0.0001608838752780707, + "loss": 0.9697, + "step": 1301 + }, + { + "epoch": 0.3134894359838681, + "grad_norm": 1.226062297821045, + "learning_rate": 0.000160822000071481, + "loss": 0.2165, + "step": 1302 + }, + { + "epoch": 0.31373021128032264, + "grad_norm": 1.5284736156463623, + "learning_rate": 0.00016076008788497816, + "loss": 0.3499, + "step": 1303 + }, + { + "epoch": 0.31397098657677724, + "grad_norm": 1.2165570259094238, + "learning_rate": 0.00016069813875620498, + "loss": 0.3322, + "step": 1304 + }, + { + "epoch": 0.3142117618732318, + "grad_norm": 2.2660257816314697, + "learning_rate": 0.00016063615272282673, + "loss": 0.9303, + "step": 1305 + }, + { + "epoch": 0.3144525371696864, + "grad_norm": 3.506263494491577, + "learning_rate": 0.00016057412982253098, + "loss": 0.3677, + "step": 1306 + }, + { + "epoch": 0.31469331246614096, + "grad_norm": 2.1276533603668213, + "learning_rate": 0.00016051207009302781, + "loss": 0.8432, + "step": 1307 + }, + { + "epoch": 0.31493408776259557, + "grad_norm": 4.875666618347168, + "learning_rate": 0.00016044997357204973, + "loss": 0.4637, + "step": 1308 + }, + { + "epoch": 0.3151748630590501, + "grad_norm": 0.7586674690246582, + "learning_rate": 0.0001603878402973515, + "loss": 0.3961, + "step": 1309 + }, + { + "epoch": 0.31541563835550473, + "grad_norm": 1.5257422924041748, + "learning_rate": 0.0001603256703067103, + "loss": 0.6829, + "step": 1310 + }, + { + "epoch": 0.3156564136519593, + "grad_norm": 2.0728249549865723, + "learning_rate": 0.00016026346363792567, + "loss": 0.5977, + "step": 1311 + }, + { + "epoch": 0.3158971889484139, + "grad_norm": 6.4057936668396, + "learning_rate": 0.00016020122032881932, + "loss": 1.2481, + "step": 1312 + }, + { + "epoch": 0.31613796424486845, + "grad_norm": 0.8619070649147034, + "learning_rate": 0.00016013894041723542, + "loss": 1.0521, + "step": 1313 + }, + { + "epoch": 0.31637873954132306, + "grad_norm": 2.2395753860473633, + "learning_rate": 0.00016007662394104024, + "loss": 0.4075, + "step": 1314 + }, + { + "epoch": 0.31661951483777767, + "grad_norm": 1.7755603790283203, + "learning_rate": 0.00016001427093812235, + "loss": 0.6441, + "step": 1315 + }, + { + "epoch": 0.3168602901342322, + "grad_norm": 0.9650968909263611, + "learning_rate": 0.0001599518814463925, + "loss": 0.1726, + "step": 1316 + }, + { + "epoch": 0.31710106543068683, + "grad_norm": 1.7282532453536987, + "learning_rate": 0.0001598894555037837, + "loss": 0.2156, + "step": 1317 + }, + { + "epoch": 0.3173418407271414, + "grad_norm": 1.8310699462890625, + "learning_rate": 0.000159826993148251, + "loss": 0.4061, + "step": 1318 + }, + { + "epoch": 0.317582616023596, + "grad_norm": 2.200747489929199, + "learning_rate": 0.00015976449441777163, + "loss": 0.3275, + "step": 1319 + }, + { + "epoch": 0.31782339132005055, + "grad_norm": 3.546372175216675, + "learning_rate": 0.00015970195935034506, + "loss": 1.1699, + "step": 1320 + }, + { + "epoch": 0.31806416661650516, + "grad_norm": 4.240285873413086, + "learning_rate": 0.00015963938798399267, + "loss": 1.0385, + "step": 1321 + }, + { + "epoch": 0.3183049419129597, + "grad_norm": 1.8230444192886353, + "learning_rate": 0.00015957678035675806, + "loss": 0.8566, + "step": 1322 + }, + { + "epoch": 0.3185457172094143, + "grad_norm": 1.3038523197174072, + "learning_rate": 0.00015951413650670669, + "loss": 0.545, + "step": 1323 + }, + { + "epoch": 0.3187864925058689, + "grad_norm": 2.877883195877075, + "learning_rate": 0.00015945145647192627, + "loss": 0.522, + "step": 1324 + }, + { + "epoch": 0.3190272678023235, + "grad_norm": 2.4238524436950684, + "learning_rate": 0.0001593887402905264, + "loss": 1.1966, + "step": 1325 + }, + { + "epoch": 0.31926804309877804, + "grad_norm": 6.8975982666015625, + "learning_rate": 0.0001593259880006386, + "loss": 0.3491, + "step": 1326 + }, + { + "epoch": 0.31950881839523265, + "grad_norm": 2.4402458667755127, + "learning_rate": 0.0001592631996404164, + "loss": 0.8293, + "step": 1327 + }, + { + "epoch": 0.3197495936916872, + "grad_norm": 2.575347900390625, + "learning_rate": 0.00015920037524803538, + "loss": 0.9677, + "step": 1328 + }, + { + "epoch": 0.3199903689881418, + "grad_norm": 1.8966193199157715, + "learning_rate": 0.00015913751486169275, + "loss": 0.4061, + "step": 1329 + }, + { + "epoch": 0.3202311442845964, + "grad_norm": 3.9115090370178223, + "learning_rate": 0.0001590746185196079, + "loss": 0.8245, + "step": 1330 + }, + { + "epoch": 0.320471919581051, + "grad_norm": 2.3119075298309326, + "learning_rate": 0.00015901168626002184, + "loss": 0.8401, + "step": 1331 + }, + { + "epoch": 0.3207126948775056, + "grad_norm": 3.0290722846984863, + "learning_rate": 0.00015894871812119764, + "loss": 0.2644, + "step": 1332 + }, + { + "epoch": 0.32095347017396014, + "grad_norm": 1.3376718759536743, + "learning_rate": 0.00015888571414141996, + "loss": 0.7519, + "step": 1333 + }, + { + "epoch": 0.32119424547041475, + "grad_norm": 3.897224187850952, + "learning_rate": 0.00015882267435899543, + "loss": 0.5062, + "step": 1334 + }, + { + "epoch": 0.3214350207668693, + "grad_norm": 3.285747766494751, + "learning_rate": 0.00015875959881225238, + "loss": 0.6907, + "step": 1335 + }, + { + "epoch": 0.3216757960633239, + "grad_norm": 0.9387348294258118, + "learning_rate": 0.00015869648753954083, + "loss": 0.3543, + "step": 1336 + }, + { + "epoch": 0.32191657135977847, + "grad_norm": 0.8521896600723267, + "learning_rate": 0.00015863334057923263, + "loss": 0.6814, + "step": 1337 + }, + { + "epoch": 0.3221573466562331, + "grad_norm": 3.792236328125, + "learning_rate": 0.00015857015796972126, + "loss": 0.361, + "step": 1338 + }, + { + "epoch": 0.32239812195268763, + "grad_norm": 1.9048930406570435, + "learning_rate": 0.00015850693974942188, + "loss": 0.841, + "step": 1339 + }, + { + "epoch": 0.32263889724914224, + "grad_norm": 4.0882744789123535, + "learning_rate": 0.00015844368595677128, + "loss": 1.4357, + "step": 1340 + }, + { + "epoch": 0.3228796725455968, + "grad_norm": 2.6341850757598877, + "learning_rate": 0.000158380396630228, + "loss": 0.7541, + "step": 1341 + }, + { + "epoch": 0.3231204478420514, + "grad_norm": 1.2838685512542725, + "learning_rate": 0.000158317071808272, + "loss": 0.3621, + "step": 1342 + }, + { + "epoch": 0.323361223138506, + "grad_norm": 2.2688121795654297, + "learning_rate": 0.000158253711529405, + "loss": 0.4616, + "step": 1343 + }, + { + "epoch": 0.32360199843496057, + "grad_norm": 1.7393488883972168, + "learning_rate": 0.00015819031583215007, + "loss": 0.9108, + "step": 1344 + }, + { + "epoch": 0.3238427737314152, + "grad_norm": 2.279599666595459, + "learning_rate": 0.00015812688475505201, + "loss": 0.4704, + "step": 1345 + }, + { + "epoch": 0.32408354902786973, + "grad_norm": 1.3711464405059814, + "learning_rate": 0.0001580634183366771, + "loss": 0.5435, + "step": 1346 + }, + { + "epoch": 0.32432432432432434, + "grad_norm": 2.167222499847412, + "learning_rate": 0.00015799991661561303, + "loss": 0.2528, + "step": 1347 + }, + { + "epoch": 0.3245650996207789, + "grad_norm": 6.101914405822754, + "learning_rate": 0.00015793637963046897, + "loss": 1.3281, + "step": 1348 + }, + { + "epoch": 0.3248058749172335, + "grad_norm": 2.9374330043792725, + "learning_rate": 0.00015787280741987557, + "loss": 0.5171, + "step": 1349 + }, + { + "epoch": 0.32504665021368806, + "grad_norm": 1.7808711528778076, + "learning_rate": 0.00015780920002248484, + "loss": 0.6773, + "step": 1350 + }, + { + "epoch": 0.32528742551014267, + "grad_norm": 2.1058807373046875, + "learning_rate": 0.00015774555747697025, + "loss": 0.5836, + "step": 1351 + }, + { + "epoch": 0.3255282008065972, + "grad_norm": 3.5313520431518555, + "learning_rate": 0.00015768187982202666, + "loss": 0.6518, + "step": 1352 + }, + { + "epoch": 0.32576897610305183, + "grad_norm": 4.409549236297607, + "learning_rate": 0.00015761816709637015, + "loss": 0.8503, + "step": 1353 + }, + { + "epoch": 0.3260097513995064, + "grad_norm": 1.0890048742294312, + "learning_rate": 0.00015755441933873823, + "loss": 0.2637, + "step": 1354 + }, + { + "epoch": 0.326250526695961, + "grad_norm": 1.0471165180206299, + "learning_rate": 0.00015749063658788967, + "loss": 0.4454, + "step": 1355 + }, + { + "epoch": 0.3264913019924156, + "grad_norm": 1.7348659038543701, + "learning_rate": 0.00015742681888260455, + "loss": 0.977, + "step": 1356 + }, + { + "epoch": 0.32673207728887016, + "grad_norm": 2.9363324642181396, + "learning_rate": 0.0001573629662616842, + "loss": 0.5051, + "step": 1357 + }, + { + "epoch": 0.32697285258532477, + "grad_norm": 2.2017180919647217, + "learning_rate": 0.00015729907876395105, + "loss": 0.6374, + "step": 1358 + }, + { + "epoch": 0.3272136278817793, + "grad_norm": 1.8804614543914795, + "learning_rate": 0.00015723515642824894, + "loss": 0.5544, + "step": 1359 + }, + { + "epoch": 0.32745440317823393, + "grad_norm": 1.586624264717102, + "learning_rate": 0.00015717119929344278, + "loss": 1.0256, + "step": 1360 + }, + { + "epoch": 0.3276951784746885, + "grad_norm": 3.861217737197876, + "learning_rate": 0.00015710720739841864, + "loss": 0.6251, + "step": 1361 + }, + { + "epoch": 0.3279359537711431, + "grad_norm": 1.4513386487960815, + "learning_rate": 0.00015704318078208374, + "loss": 0.7021, + "step": 1362 + }, + { + "epoch": 0.32817672906759765, + "grad_norm": 1.8319506645202637, + "learning_rate": 0.00015697911948336641, + "loss": 0.5171, + "step": 1363 + }, + { + "epoch": 0.32841750436405226, + "grad_norm": 1.2202706336975098, + "learning_rate": 0.00015691502354121605, + "loss": 0.3055, + "step": 1364 + }, + { + "epoch": 0.3286582796605068, + "grad_norm": 3.2873902320861816, + "learning_rate": 0.00015685089299460317, + "loss": 0.9132, + "step": 1365 + }, + { + "epoch": 0.3288990549569614, + "grad_norm": 2.2419869899749756, + "learning_rate": 0.00015678672788251922, + "loss": 0.5913, + "step": 1366 + }, + { + "epoch": 0.329139830253416, + "grad_norm": 5.597873210906982, + "learning_rate": 0.0001567225282439768, + "loss": 0.8836, + "step": 1367 + }, + { + "epoch": 0.3293806055498706, + "grad_norm": 1.8877670764923096, + "learning_rate": 0.0001566582941180094, + "loss": 0.486, + "step": 1368 + }, + { + "epoch": 0.32962138084632514, + "grad_norm": 3.7749109268188477, + "learning_rate": 0.00015659402554367153, + "loss": 0.8683, + "step": 1369 + }, + { + "epoch": 0.32986215614277975, + "grad_norm": 1.6134521961212158, + "learning_rate": 0.00015652972256003864, + "loss": 0.7984, + "step": 1370 + }, + { + "epoch": 0.33010293143923436, + "grad_norm": 2.474909782409668, + "learning_rate": 0.00015646538520620705, + "loss": 0.5438, + "step": 1371 + }, + { + "epoch": 0.3303437067356889, + "grad_norm": 4.5085368156433105, + "learning_rate": 0.00015640101352129402, + "loss": 0.8213, + "step": 1372 + }, + { + "epoch": 0.3305844820321435, + "grad_norm": 1.169089913368225, + "learning_rate": 0.00015633660754443772, + "loss": 0.2603, + "step": 1373 + }, + { + "epoch": 0.3308252573285981, + "grad_norm": 1.1745972633361816, + "learning_rate": 0.0001562721673147971, + "loss": 0.3671, + "step": 1374 + }, + { + "epoch": 0.3310660326250527, + "grad_norm": 2.545999765396118, + "learning_rate": 0.00015620769287155197, + "loss": 0.8987, + "step": 1375 + }, + { + "epoch": 0.33130680792150724, + "grad_norm": 3.68367338180542, + "learning_rate": 0.00015614318425390296, + "loss": 0.7555, + "step": 1376 + }, + { + "epoch": 0.33154758321796185, + "grad_norm": 1.6733169555664062, + "learning_rate": 0.0001560786415010714, + "loss": 0.5136, + "step": 1377 + }, + { + "epoch": 0.3317883585144164, + "grad_norm": 6.26981258392334, + "learning_rate": 0.00015601406465229947, + "loss": 0.7942, + "step": 1378 + }, + { + "epoch": 0.332029133810871, + "grad_norm": 2.717362880706787, + "learning_rate": 0.00015594945374685002, + "loss": 0.7386, + "step": 1379 + }, + { + "epoch": 0.33226990910732557, + "grad_norm": 1.8613269329071045, + "learning_rate": 0.00015588480882400662, + "loss": 0.5929, + "step": 1380 + }, + { + "epoch": 0.3325106844037802, + "grad_norm": 4.205772876739502, + "learning_rate": 0.0001558201299230736, + "loss": 0.6058, + "step": 1381 + }, + { + "epoch": 0.33275145970023473, + "grad_norm": 3.043046474456787, + "learning_rate": 0.0001557554170833758, + "loss": 0.3609, + "step": 1382 + }, + { + "epoch": 0.33299223499668934, + "grad_norm": 2.67464542388916, + "learning_rate": 0.00015569067034425878, + "loss": 0.9453, + "step": 1383 + }, + { + "epoch": 0.33323301029314395, + "grad_norm": 1.8980488777160645, + "learning_rate": 0.00015562588974508872, + "loss": 0.8145, + "step": 1384 + }, + { + "epoch": 0.3334737855895985, + "grad_norm": 1.8081344366073608, + "learning_rate": 0.00015556107532525238, + "loss": 0.5141, + "step": 1385 + }, + { + "epoch": 0.3337145608860531, + "grad_norm": 2.477198362350464, + "learning_rate": 0.00015549622712415702, + "loss": 0.8897, + "step": 1386 + }, + { + "epoch": 0.33395533618250767, + "grad_norm": 2.74221134185791, + "learning_rate": 0.0001554313451812306, + "loss": 0.8277, + "step": 1387 + }, + { + "epoch": 0.3341961114789623, + "grad_norm": 1.0981510877609253, + "learning_rate": 0.0001553664295359214, + "loss": 1.2254, + "step": 1388 + }, + { + "epoch": 0.33443688677541683, + "grad_norm": 1.967882752418518, + "learning_rate": 0.0001553014802276983, + "loss": 0.8044, + "step": 1389 + }, + { + "epoch": 0.33467766207187144, + "grad_norm": 2.7293615341186523, + "learning_rate": 0.0001552364972960506, + "loss": 0.3669, + "step": 1390 + }, + { + "epoch": 0.334918437368326, + "grad_norm": 0.9830564260482788, + "learning_rate": 0.00015517148078048808, + "loss": 0.0667, + "step": 1391 + }, + { + "epoch": 0.3351592126647806, + "grad_norm": 2.215790033340454, + "learning_rate": 0.00015510643072054098, + "loss": 0.7652, + "step": 1392 + }, + { + "epoch": 0.33539998796123516, + "grad_norm": 3.3328158855438232, + "learning_rate": 0.00015504134715575986, + "loss": 0.7612, + "step": 1393 + }, + { + "epoch": 0.33564076325768977, + "grad_norm": 0.497371107339859, + "learning_rate": 0.00015497623012571566, + "loss": 0.5093, + "step": 1394 + }, + { + "epoch": 0.3358815385541443, + "grad_norm": 3.333343744277954, + "learning_rate": 0.00015491107966999964, + "loss": 1.1697, + "step": 1395 + }, + { + "epoch": 0.33612231385059893, + "grad_norm": 1.7939079999923706, + "learning_rate": 0.00015484589582822348, + "loss": 0.8186, + "step": 1396 + }, + { + "epoch": 0.33636308914705354, + "grad_norm": 0.5831475257873535, + "learning_rate": 0.00015478067864001908, + "loss": 0.0296, + "step": 1397 + }, + { + "epoch": 0.3366038644435081, + "grad_norm": 1.5100713968276978, + "learning_rate": 0.00015471542814503867, + "loss": 0.7465, + "step": 1398 + }, + { + "epoch": 0.3368446397399627, + "grad_norm": 3.8856029510498047, + "learning_rate": 0.00015465014438295467, + "loss": 0.5473, + "step": 1399 + }, + { + "epoch": 0.33708541503641726, + "grad_norm": 1.1712760925292969, + "learning_rate": 0.00015458482739345974, + "loss": 0.3689, + "step": 1400 + }, + { + "epoch": 0.33732619033287187, + "grad_norm": 4.496668338775635, + "learning_rate": 0.00015451947721626676, + "loss": 1.0972, + "step": 1401 + }, + { + "epoch": 0.3375669656293264, + "grad_norm": 5.929965496063232, + "learning_rate": 0.00015445409389110883, + "loss": 0.6352, + "step": 1402 + }, + { + "epoch": 0.33780774092578103, + "grad_norm": 1.9079606533050537, + "learning_rate": 0.00015438867745773912, + "loss": 0.5129, + "step": 1403 + }, + { + "epoch": 0.3380485162222356, + "grad_norm": 3.6617226600646973, + "learning_rate": 0.00015432322795593098, + "loss": 0.4049, + "step": 1404 + }, + { + "epoch": 0.3382892915186902, + "grad_norm": 5.218686580657959, + "learning_rate": 0.00015425774542547784, + "loss": 0.3238, + "step": 1405 + }, + { + "epoch": 0.33853006681514475, + "grad_norm": 1.2463502883911133, + "learning_rate": 0.00015419222990619322, + "loss": 0.4756, + "step": 1406 + }, + { + "epoch": 0.33877084211159936, + "grad_norm": 3.0786678791046143, + "learning_rate": 0.00015412668143791075, + "loss": 0.8815, + "step": 1407 + }, + { + "epoch": 0.3390116174080539, + "grad_norm": 2.135958194732666, + "learning_rate": 0.000154061100060484, + "loss": 0.6417, + "step": 1408 + }, + { + "epoch": 0.3392523927045085, + "grad_norm": 3.0606963634490967, + "learning_rate": 0.00015399548581378664, + "loss": 0.573, + "step": 1409 + }, + { + "epoch": 0.3394931680009631, + "grad_norm": 1.5275843143463135, + "learning_rate": 0.00015392983873771223, + "loss": 0.5197, + "step": 1410 + }, + { + "epoch": 0.3397339432974177, + "grad_norm": 2.3803906440734863, + "learning_rate": 0.00015386415887217437, + "loss": 0.543, + "step": 1411 + }, + { + "epoch": 0.3399747185938723, + "grad_norm": 5.230526924133301, + "learning_rate": 0.00015379844625710654, + "loss": 0.5216, + "step": 1412 + }, + { + "epoch": 0.34021549389032685, + "grad_norm": 1.887787938117981, + "learning_rate": 0.0001537327009324622, + "loss": 0.7004, + "step": 1413 + }, + { + "epoch": 0.34045626918678146, + "grad_norm": 3.1152963638305664, + "learning_rate": 0.0001536669229382146, + "loss": 0.65, + "step": 1414 + }, + { + "epoch": 0.340697044483236, + "grad_norm": 4.267107009887695, + "learning_rate": 0.00015360111231435693, + "loss": 0.7265, + "step": 1415 + }, + { + "epoch": 0.3409378197796906, + "grad_norm": 1.1614614725112915, + "learning_rate": 0.0001535352691009023, + "loss": 0.443, + "step": 1416 + }, + { + "epoch": 0.3411785950761452, + "grad_norm": 2.7194442749023438, + "learning_rate": 0.00015346939333788336, + "loss": 0.93, + "step": 1417 + }, + { + "epoch": 0.3414193703725998, + "grad_norm": 1.5683730840682983, + "learning_rate": 0.00015340348506535283, + "loss": 0.665, + "step": 1418 + }, + { + "epoch": 0.34166014566905434, + "grad_norm": 0.9245167970657349, + "learning_rate": 0.00015333754432338302, + "loss": 0.3983, + "step": 1419 + }, + { + "epoch": 0.34190092096550895, + "grad_norm": 3.776094913482666, + "learning_rate": 0.00015327157115206614, + "loss": 0.6996, + "step": 1420 + }, + { + "epoch": 0.3421416962619635, + "grad_norm": 3.2278683185577393, + "learning_rate": 0.00015320556559151398, + "loss": 0.614, + "step": 1421 + }, + { + "epoch": 0.3423824715584181, + "grad_norm": 1.4512388706207275, + "learning_rate": 0.00015313952768185803, + "loss": 0.8104, + "step": 1422 + }, + { + "epoch": 0.34262324685487267, + "grad_norm": 1.858079195022583, + "learning_rate": 0.00015307345746324954, + "loss": 0.8088, + "step": 1423 + }, + { + "epoch": 0.3428640221513273, + "grad_norm": 0.6770870685577393, + "learning_rate": 0.00015300735497585934, + "loss": 0.6674, + "step": 1424 + }, + { + "epoch": 0.3431047974477819, + "grad_norm": 1.5875935554504395, + "learning_rate": 0.00015294122025987788, + "loss": 0.5163, + "step": 1425 + }, + { + "epoch": 0.34334557274423644, + "grad_norm": 1.7607767581939697, + "learning_rate": 0.00015287505335551525, + "loss": 0.5005, + "step": 1426 + }, + { + "epoch": 0.34358634804069105, + "grad_norm": 4.621982574462891, + "learning_rate": 0.000152808854303001, + "loss": 0.7541, + "step": 1427 + }, + { + "epoch": 0.3438271233371456, + "grad_norm": 3.1218035221099854, + "learning_rate": 0.00015274262314258442, + "loss": 0.5221, + "step": 1428 + }, + { + "epoch": 0.3440678986336002, + "grad_norm": 4.2029924392700195, + "learning_rate": 0.00015267635991453408, + "loss": 0.6852, + "step": 1429 + }, + { + "epoch": 0.34430867393005476, + "grad_norm": 5.702292442321777, + "learning_rate": 0.00015261006465913828, + "loss": 0.7622, + "step": 1430 + }, + { + "epoch": 0.3445494492265094, + "grad_norm": 3.05202054977417, + "learning_rate": 0.00015254373741670457, + "loss": 0.6527, + "step": 1431 + }, + { + "epoch": 0.34479022452296393, + "grad_norm": 3.4306201934814453, + "learning_rate": 0.00015247737822756018, + "loss": 1.2398, + "step": 1432 + }, + { + "epoch": 0.34503099981941854, + "grad_norm": 2.056917667388916, + "learning_rate": 0.0001524109871320516, + "loss": 0.3902, + "step": 1433 + }, + { + "epoch": 0.3452717751158731, + "grad_norm": 1.1819590330123901, + "learning_rate": 0.00015234456417054476, + "loss": 0.0903, + "step": 1434 + }, + { + "epoch": 0.3455125504123277, + "grad_norm": 1.0845695734024048, + "learning_rate": 0.00015227810938342492, + "loss": 0.3635, + "step": 1435 + }, + { + "epoch": 0.34575332570878226, + "grad_norm": 2.537416458129883, + "learning_rate": 0.00015221162281109683, + "loss": 0.4821, + "step": 1436 + }, + { + "epoch": 0.34599410100523686, + "grad_norm": 1.1138862371444702, + "learning_rate": 0.00015214510449398442, + "loss": 0.5671, + "step": 1437 + }, + { + "epoch": 0.3462348763016914, + "grad_norm": 2.424607276916504, + "learning_rate": 0.00015207855447253103, + "loss": 0.8349, + "step": 1438 + }, + { + "epoch": 0.34647565159814603, + "grad_norm": 5.5035176277160645, + "learning_rate": 0.00015201197278719915, + "loss": 0.9117, + "step": 1439 + }, + { + "epoch": 0.34671642689460064, + "grad_norm": 2.757199287414551, + "learning_rate": 0.00015194535947847063, + "loss": 0.4329, + "step": 1440 + }, + { + "epoch": 0.3469572021910552, + "grad_norm": 6.263975143432617, + "learning_rate": 0.00015187871458684655, + "loss": 0.7671, + "step": 1441 + }, + { + "epoch": 0.3471979774875098, + "grad_norm": 2.1420156955718994, + "learning_rate": 0.00015181203815284707, + "loss": 0.5561, + "step": 1442 + }, + { + "epoch": 0.34743875278396436, + "grad_norm": 2.368563175201416, + "learning_rate": 0.00015174533021701167, + "loss": 0.479, + "step": 1443 + }, + { + "epoch": 0.34767952808041896, + "grad_norm": 20.131282806396484, + "learning_rate": 0.00015167859081989895, + "loss": 0.9437, + "step": 1444 + }, + { + "epoch": 0.3479203033768735, + "grad_norm": 1.1026864051818848, + "learning_rate": 0.00015161182000208653, + "loss": 0.0633, + "step": 1445 + }, + { + "epoch": 0.34816107867332813, + "grad_norm": 1.3895201683044434, + "learning_rate": 0.0001515450178041713, + "loss": 0.8124, + "step": 1446 + }, + { + "epoch": 0.3484018539697827, + "grad_norm": 3.4181928634643555, + "learning_rate": 0.0001514781842667691, + "loss": 0.8081, + "step": 1447 + }, + { + "epoch": 0.3486426292662373, + "grad_norm": 3.2324140071868896, + "learning_rate": 0.0001514113194305149, + "loss": 1.6239, + "step": 1448 + }, + { + "epoch": 0.34888340456269185, + "grad_norm": 0.9572200775146484, + "learning_rate": 0.00015134442333606264, + "loss": 0.7945, + "step": 1449 + }, + { + "epoch": 0.34912417985914646, + "grad_norm": 3.1057350635528564, + "learning_rate": 0.00015127749602408529, + "loss": 0.3813, + "step": 1450 + }, + { + "epoch": 0.349364955155601, + "grad_norm": 3.538774251937866, + "learning_rate": 0.00015121053753527485, + "loss": 0.6439, + "step": 1451 + }, + { + "epoch": 0.3496057304520556, + "grad_norm": 1.1300746202468872, + "learning_rate": 0.00015114354791034225, + "loss": 0.1153, + "step": 1452 + }, + { + "epoch": 0.34984650574851023, + "grad_norm": 6.388082027435303, + "learning_rate": 0.00015107652719001724, + "loss": 1.2515, + "step": 1453 + }, + { + "epoch": 0.3500872810449648, + "grad_norm": 3.4116952419281006, + "learning_rate": 0.00015100947541504863, + "loss": 0.4694, + "step": 1454 + }, + { + "epoch": 0.3503280563414194, + "grad_norm": 1.212721586227417, + "learning_rate": 0.00015094239262620406, + "loss": 0.4981, + "step": 1455 + }, + { + "epoch": 0.35056883163787395, + "grad_norm": 4.218289375305176, + "learning_rate": 0.00015087527886426997, + "loss": 0.7434, + "step": 1456 + }, + { + "epoch": 0.35080960693432856, + "grad_norm": 1.7133764028549194, + "learning_rate": 0.00015080813417005172, + "loss": 0.5143, + "step": 1457 + }, + { + "epoch": 0.3510503822307831, + "grad_norm": 2.915750503540039, + "learning_rate": 0.00015074095858437343, + "loss": 0.6977, + "step": 1458 + }, + { + "epoch": 0.3512911575272377, + "grad_norm": 1.0086733102798462, + "learning_rate": 0.00015067375214807796, + "loss": 0.4913, + "step": 1459 + }, + { + "epoch": 0.3515319328236923, + "grad_norm": 2.266055107116699, + "learning_rate": 0.0001506065149020271, + "loss": 0.4658, + "step": 1460 + }, + { + "epoch": 0.3517727081201469, + "grad_norm": 1.7266699075698853, + "learning_rate": 0.0001505392468871011, + "loss": 1.0223, + "step": 1461 + }, + { + "epoch": 0.35201348341660144, + "grad_norm": 4.561027526855469, + "learning_rate": 0.00015047194814419914, + "loss": 0.4841, + "step": 1462 + }, + { + "epoch": 0.35225425871305605, + "grad_norm": 2.0526604652404785, + "learning_rate": 0.00015040461871423897, + "loss": 0.0654, + "step": 1463 + }, + { + "epoch": 0.3524950340095106, + "grad_norm": 2.187910556793213, + "learning_rate": 0.0001503372586381571, + "loss": 0.8852, + "step": 1464 + }, + { + "epoch": 0.3527358093059652, + "grad_norm": 1.6966273784637451, + "learning_rate": 0.00015026986795690857, + "loss": 0.5213, + "step": 1465 + }, + { + "epoch": 0.3529765846024198, + "grad_norm": 1.83759343624115, + "learning_rate": 0.00015020244671146702, + "loss": 0.5114, + "step": 1466 + }, + { + "epoch": 0.3532173598988744, + "grad_norm": 6.522552490234375, + "learning_rate": 0.00015013499494282478, + "loss": 0.7191, + "step": 1467 + }, + { + "epoch": 0.353458135195329, + "grad_norm": 1.2005650997161865, + "learning_rate": 0.00015006751269199263, + "loss": 0.4789, + "step": 1468 + }, + { + "epoch": 0.35369891049178354, + "grad_norm": 11.688396453857422, + "learning_rate": 0.00015000000000000001, + "loss": 1.0394, + "step": 1469 + }, + { + "epoch": 0.35393968578823815, + "grad_norm": 2.6536548137664795, + "learning_rate": 0.0001499324569078947, + "loss": 0.7116, + "step": 1470 + }, + { + "epoch": 0.3541804610846927, + "grad_norm": 5.886802673339844, + "learning_rate": 0.00014986488345674313, + "loss": 0.8322, + "step": 1471 + }, + { + "epoch": 0.3544212363811473, + "grad_norm": 3.7790753841400146, + "learning_rate": 0.00014979727968763003, + "loss": 0.6478, + "step": 1472 + }, + { + "epoch": 0.35466201167760186, + "grad_norm": 2.2750492095947266, + "learning_rate": 0.0001497296456416587, + "loss": 0.5024, + "step": 1473 + }, + { + "epoch": 0.35490278697405647, + "grad_norm": 4.089879035949707, + "learning_rate": 0.0001496619813599508, + "loss": 0.5536, + "step": 1474 + }, + { + "epoch": 0.355143562270511, + "grad_norm": 4.2792558670043945, + "learning_rate": 0.00014959428688364633, + "loss": 0.8609, + "step": 1475 + }, + { + "epoch": 0.35538433756696564, + "grad_norm": 1.6434048414230347, + "learning_rate": 0.0001495265622539037, + "loss": 0.7308, + "step": 1476 + }, + { + "epoch": 0.3556251128634202, + "grad_norm": 4.617370128631592, + "learning_rate": 0.00014945880751189965, + "loss": 0.8816, + "step": 1477 + }, + { + "epoch": 0.3558658881598748, + "grad_norm": 1.2536977529525757, + "learning_rate": 0.0001493910226988292, + "loss": 0.3698, + "step": 1478 + }, + { + "epoch": 0.35610666345632935, + "grad_norm": 1.1533249616622925, + "learning_rate": 0.00014932320785590562, + "loss": 0.7943, + "step": 1479 + }, + { + "epoch": 0.35634743875278396, + "grad_norm": 3.6115407943725586, + "learning_rate": 0.00014925536302436057, + "loss": 0.8441, + "step": 1480 + }, + { + "epoch": 0.35658821404923857, + "grad_norm": 3.1211764812469482, + "learning_rate": 0.0001491874882454438, + "loss": 0.3908, + "step": 1481 + }, + { + "epoch": 0.3568289893456931, + "grad_norm": 1.8636207580566406, + "learning_rate": 0.00014911958356042342, + "loss": 0.7643, + "step": 1482 + }, + { + "epoch": 0.35706976464214774, + "grad_norm": 2.321810722351074, + "learning_rate": 0.00014905164901058551, + "loss": 0.3578, + "step": 1483 + }, + { + "epoch": 0.3573105399386023, + "grad_norm": 2.7864739894866943, + "learning_rate": 0.0001489836846372345, + "loss": 0.6989, + "step": 1484 + }, + { + "epoch": 0.3575513152350569, + "grad_norm": 2.2333052158355713, + "learning_rate": 0.0001489156904816929, + "loss": 0.7687, + "step": 1485 + }, + { + "epoch": 0.35779209053151145, + "grad_norm": 3.294875383377075, + "learning_rate": 0.00014884766658530125, + "loss": 0.7011, + "step": 1486 + }, + { + "epoch": 0.35803286582796606, + "grad_norm": 4.5237250328063965, + "learning_rate": 0.00014877961298941824, + "loss": 0.7226, + "step": 1487 + }, + { + "epoch": 0.3582736411244206, + "grad_norm": 3.1399903297424316, + "learning_rate": 0.00014871152973542067, + "loss": 1.068, + "step": 1488 + }, + { + "epoch": 0.3585144164208752, + "grad_norm": 3.0712268352508545, + "learning_rate": 0.00014864341686470324, + "loss": 0.3788, + "step": 1489 + }, + { + "epoch": 0.3587551917173298, + "grad_norm": 2.3056137561798096, + "learning_rate": 0.0001485752744186788, + "loss": 0.7056, + "step": 1490 + }, + { + "epoch": 0.3589959670137844, + "grad_norm": 2.5554864406585693, + "learning_rate": 0.00014850710243877803, + "loss": 0.8528, + "step": 1491 + }, + { + "epoch": 0.35923674231023894, + "grad_norm": 2.9446706771850586, + "learning_rate": 0.0001484389009664497, + "loss": 0.6246, + "step": 1492 + }, + { + "epoch": 0.35947751760669355, + "grad_norm": 2.1668176651000977, + "learning_rate": 0.00014837067004316049, + "loss": 0.6075, + "step": 1493 + }, + { + "epoch": 0.35971829290314816, + "grad_norm": 3.2588088512420654, + "learning_rate": 0.00014830240971039487, + "loss": 0.6044, + "step": 1494 + }, + { + "epoch": 0.3599590681996027, + "grad_norm": 2.2925832271575928, + "learning_rate": 0.00014823412000965533, + "loss": 0.5931, + "step": 1495 + }, + { + "epoch": 0.3601998434960573, + "grad_norm": 1.257023811340332, + "learning_rate": 0.00014816580098246215, + "loss": 0.3431, + "step": 1496 + }, + { + "epoch": 0.3604406187925119, + "grad_norm": 2.16398024559021, + "learning_rate": 0.00014809745267035346, + "loss": 0.5645, + "step": 1497 + }, + { + "epoch": 0.3606813940889665, + "grad_norm": 3.020810604095459, + "learning_rate": 0.0001480290751148852, + "loss": 0.8691, + "step": 1498 + }, + { + "epoch": 0.36092216938542104, + "grad_norm": 2.6079869270324707, + "learning_rate": 0.00014796066835763103, + "loss": 0.7485, + "step": 1499 + }, + { + "epoch": 0.36116294468187565, + "grad_norm": 4.599701881408691, + "learning_rate": 0.00014789223244018244, + "loss": 0.7325, + "step": 1500 + }, + { + "epoch": 0.3614037199783302, + "grad_norm": 3.939009428024292, + "learning_rate": 0.00014782376740414863, + "loss": 0.6157, + "step": 1501 + }, + { + "epoch": 0.3616444952747848, + "grad_norm": 1.3872252702713013, + "learning_rate": 0.0001477552732911565, + "loss": 0.7243, + "step": 1502 + }, + { + "epoch": 0.36188527057123937, + "grad_norm": 2.120624542236328, + "learning_rate": 0.00014768675014285062, + "loss": 0.8203, + "step": 1503 + }, + { + "epoch": 0.362126045867694, + "grad_norm": 1.584861397743225, + "learning_rate": 0.0001476181980008932, + "loss": 0.5647, + "step": 1504 + }, + { + "epoch": 0.36236682116414853, + "grad_norm": 2.499906063079834, + "learning_rate": 0.0001475496169069641, + "loss": 0.4923, + "step": 1505 + }, + { + "epoch": 0.36260759646060314, + "grad_norm": 4.404402732849121, + "learning_rate": 0.0001474810069027608, + "loss": 0.8503, + "step": 1506 + }, + { + "epoch": 0.36284837175705775, + "grad_norm": 1.0523654222488403, + "learning_rate": 0.00014741236802999835, + "loss": 0.6165, + "step": 1507 + }, + { + "epoch": 0.3630891470535123, + "grad_norm": 4.341505527496338, + "learning_rate": 0.00014734370033040928, + "loss": 0.8027, + "step": 1508 + }, + { + "epoch": 0.3633299223499669, + "grad_norm": 1.6273728609085083, + "learning_rate": 0.00014727500384574375, + "loss": 0.4199, + "step": 1509 + }, + { + "epoch": 0.36357069764642147, + "grad_norm": 2.939532995223999, + "learning_rate": 0.00014720627861776939, + "loss": 1.1424, + "step": 1510 + }, + { + "epoch": 0.3638114729428761, + "grad_norm": 1.7308731079101562, + "learning_rate": 0.00014713752468827128, + "loss": 0.2182, + "step": 1511 + }, + { + "epoch": 0.36405224823933063, + "grad_norm": 8.7367582321167, + "learning_rate": 0.00014706874209905192, + "loss": 0.7563, + "step": 1512 + }, + { + "epoch": 0.36429302353578524, + "grad_norm": 0.6256684064865112, + "learning_rate": 0.00014699993089193134, + "loss": 0.7837, + "step": 1513 + }, + { + "epoch": 0.3645337988322398, + "grad_norm": 2.6632747650146484, + "learning_rate": 0.00014693109110874687, + "loss": 0.5477, + "step": 1514 + }, + { + "epoch": 0.3647745741286944, + "grad_norm": 2.9854331016540527, + "learning_rate": 0.00014686222279135328, + "loss": 0.6202, + "step": 1515 + }, + { + "epoch": 0.36501534942514896, + "grad_norm": 4.577966690063477, + "learning_rate": 0.00014679332598162265, + "loss": 0.4136, + "step": 1516 + }, + { + "epoch": 0.36525612472160357, + "grad_norm": 0.8396844863891602, + "learning_rate": 0.00014672440072144443, + "loss": 0.8962, + "step": 1517 + }, + { + "epoch": 0.3654969000180581, + "grad_norm": 2.5254063606262207, + "learning_rate": 0.00014665544705272525, + "loss": 0.9689, + "step": 1518 + }, + { + "epoch": 0.36573767531451273, + "grad_norm": 2.8939788341522217, + "learning_rate": 0.0001465864650173892, + "loss": 0.6918, + "step": 1519 + }, + { + "epoch": 0.3659784506109673, + "grad_norm": 2.4425880908966064, + "learning_rate": 0.00014651745465737737, + "loss": 0.6507, + "step": 1520 + }, + { + "epoch": 0.3662192259074219, + "grad_norm": 3.3433775901794434, + "learning_rate": 0.00014644841601464838, + "loss": 0.7875, + "step": 1521 + }, + { + "epoch": 0.3664600012038765, + "grad_norm": 1.840368390083313, + "learning_rate": 0.00014637934913117777, + "loss": 0.6712, + "step": 1522 + }, + { + "epoch": 0.36670077650033106, + "grad_norm": 0.7851834297180176, + "learning_rate": 0.0001463102540489584, + "loss": 0.3682, + "step": 1523 + }, + { + "epoch": 0.36694155179678567, + "grad_norm": 4.149460792541504, + "learning_rate": 0.00014624113081000023, + "loss": 0.5221, + "step": 1524 + }, + { + "epoch": 0.3671823270932402, + "grad_norm": 1.0604087114334106, + "learning_rate": 0.00014617197945633037, + "loss": 0.6734, + "step": 1525 + }, + { + "epoch": 0.36742310238969483, + "grad_norm": 0.5313230752944946, + "learning_rate": 0.00014610280002999291, + "loss": 0.1435, + "step": 1526 + }, + { + "epoch": 0.3676638776861494, + "grad_norm": 9.519638061523438, + "learning_rate": 0.00014603359257304925, + "loss": 0.8516, + "step": 1527 + }, + { + "epoch": 0.367904652982604, + "grad_norm": 1.3151129484176636, + "learning_rate": 0.0001459643571275775, + "loss": 0.5194, + "step": 1528 + }, + { + "epoch": 0.36814542827905855, + "grad_norm": 1.3871126174926758, + "learning_rate": 0.00014589509373567314, + "loss": 0.3852, + "step": 1529 + }, + { + "epoch": 0.36838620357551316, + "grad_norm": 1.881598949432373, + "learning_rate": 0.00014582580243944836, + "loss": 0.5607, + "step": 1530 + }, + { + "epoch": 0.3686269788719677, + "grad_norm": 3.6260831356048584, + "learning_rate": 0.0001457564832810324, + "loss": 0.4408, + "step": 1531 + }, + { + "epoch": 0.3688677541684223, + "grad_norm": 2.3478870391845703, + "learning_rate": 0.00014568713630257155, + "loss": 0.8691, + "step": 1532 + }, + { + "epoch": 0.3691085294648769, + "grad_norm": 1.4225029945373535, + "learning_rate": 0.00014561776154622892, + "loss": 0.6029, + "step": 1533 + }, + { + "epoch": 0.3693493047613315, + "grad_norm": 2.58164381980896, + "learning_rate": 0.00014554835905418448, + "loss": 0.6517, + "step": 1534 + }, + { + "epoch": 0.3695900800577861, + "grad_norm": 1.5946727991104126, + "learning_rate": 0.00014547892886863508, + "loss": 0.3034, + "step": 1535 + }, + { + "epoch": 0.36983085535424065, + "grad_norm": 1.9315208196640015, + "learning_rate": 0.00014540947103179448, + "loss": 0.1705, + "step": 1536 + }, + { + "epoch": 0.37007163065069526, + "grad_norm": 1.584106683731079, + "learning_rate": 0.0001453399855858932, + "loss": 0.6266, + "step": 1537 + }, + { + "epoch": 0.3703124059471498, + "grad_norm": 1.4017444849014282, + "learning_rate": 0.00014527047257317853, + "loss": 0.2253, + "step": 1538 + }, + { + "epoch": 0.3705531812436044, + "grad_norm": 1.8468575477600098, + "learning_rate": 0.00014520093203591452, + "loss": 0.6682, + "step": 1539 + }, + { + "epoch": 0.370793956540059, + "grad_norm": 3.008110761642456, + "learning_rate": 0.000145131364016382, + "loss": 0.5261, + "step": 1540 + }, + { + "epoch": 0.3710347318365136, + "grad_norm": 4.193415641784668, + "learning_rate": 0.00014506176855687847, + "loss": 0.9067, + "step": 1541 + }, + { + "epoch": 0.37127550713296814, + "grad_norm": 1.9622831344604492, + "learning_rate": 0.00014499214569971814, + "loss": 1.1056, + "step": 1542 + }, + { + "epoch": 0.37151628242942275, + "grad_norm": 1.8108497858047485, + "learning_rate": 0.00014492249548723188, + "loss": 1.02, + "step": 1543 + }, + { + "epoch": 0.3717570577258773, + "grad_norm": 3.2026660442352295, + "learning_rate": 0.00014485281796176714, + "loss": 0.675, + "step": 1544 + }, + { + "epoch": 0.3719978330223319, + "grad_norm": 2.4484424591064453, + "learning_rate": 0.00014478311316568797, + "loss": 0.3577, + "step": 1545 + }, + { + "epoch": 0.37223860831878647, + "grad_norm": 1.7263269424438477, + "learning_rate": 0.00014471338114137517, + "loss": 0.7703, + "step": 1546 + }, + { + "epoch": 0.3724793836152411, + "grad_norm": 18.308795928955078, + "learning_rate": 0.00014464362193122586, + "loss": 0.6747, + "step": 1547 + }, + { + "epoch": 0.3727201589116957, + "grad_norm": 2.601501941680908, + "learning_rate": 0.00014457383557765386, + "loss": 1.0085, + "step": 1548 + }, + { + "epoch": 0.37296093420815024, + "grad_norm": 1.59297513961792, + "learning_rate": 0.00014450402212308936, + "loss": 0.5779, + "step": 1549 + }, + { + "epoch": 0.37320170950460485, + "grad_norm": 2.8175299167633057, + "learning_rate": 0.00014443418160997918, + "loss": 0.5384, + "step": 1550 + }, + { + "epoch": 0.3734424848010594, + "grad_norm": 4.92849063873291, + "learning_rate": 0.00014436431408078643, + "loss": 0.5555, + "step": 1551 + }, + { + "epoch": 0.373683260097514, + "grad_norm": 2.1497936248779297, + "learning_rate": 0.00014429441957799078, + "loss": 0.6927, + "step": 1552 + }, + { + "epoch": 0.37392403539396857, + "grad_norm": 0.8706673979759216, + "learning_rate": 0.00014422449814408824, + "loss": 0.2299, + "step": 1553 + }, + { + "epoch": 0.3741648106904232, + "grad_norm": 1.8380601406097412, + "learning_rate": 0.0001441545498215912, + "loss": 0.6146, + "step": 1554 + }, + { + "epoch": 0.37440558598687773, + "grad_norm": 1.3901095390319824, + "learning_rate": 0.0001440845746530284, + "loss": 0.3206, + "step": 1555 + }, + { + "epoch": 0.37464636128333234, + "grad_norm": 1.46050226688385, + "learning_rate": 0.00014401457268094483, + "loss": 0.7791, + "step": 1556 + }, + { + "epoch": 0.3748871365797869, + "grad_norm": 4.091619968414307, + "learning_rate": 0.0001439445439479019, + "loss": 0.5138, + "step": 1557 + }, + { + "epoch": 0.3751279118762415, + "grad_norm": 1.6713485717773438, + "learning_rate": 0.00014387448849647732, + "loss": 0.3188, + "step": 1558 + }, + { + "epoch": 0.37536868717269606, + "grad_norm": 3.4357035160064697, + "learning_rate": 0.00014380440636926485, + "loss": 0.6026, + "step": 1559 + }, + { + "epoch": 0.37560946246915067, + "grad_norm": 3.0857181549072266, + "learning_rate": 0.00014373429760887457, + "loss": 0.7203, + "step": 1560 + }, + { + "epoch": 0.3758502377656052, + "grad_norm": 3.1348352432250977, + "learning_rate": 0.00014366416225793284, + "loss": 0.611, + "step": 1561 + }, + { + "epoch": 0.37609101306205983, + "grad_norm": 1.1630622148513794, + "learning_rate": 0.0001435940003590821, + "loss": 0.3327, + "step": 1562 + }, + { + "epoch": 0.37633178835851444, + "grad_norm": 1.690561294555664, + "learning_rate": 0.00014352381195498093, + "loss": 0.6988, + "step": 1563 + }, + { + "epoch": 0.376572563654969, + "grad_norm": 3.028482437133789, + "learning_rate": 0.000143453597088304, + "loss": 0.594, + "step": 1564 + }, + { + "epoch": 0.3768133389514236, + "grad_norm": 0.997052013874054, + "learning_rate": 0.00014338335580174212, + "loss": 0.8037, + "step": 1565 + }, + { + "epoch": 0.37705411424787816, + "grad_norm": 7.6312079429626465, + "learning_rate": 0.00014331308813800222, + "loss": 1.306, + "step": 1566 + }, + { + "epoch": 0.37729488954433277, + "grad_norm": 2.4936201572418213, + "learning_rate": 0.00014324279413980713, + "loss": 0.4458, + "step": 1567 + }, + { + "epoch": 0.3775356648407873, + "grad_norm": 2.824725389480591, + "learning_rate": 0.00014317247384989577, + "loss": 0.5562, + "step": 1568 + }, + { + "epoch": 0.37777644013724193, + "grad_norm": 1.7765711545944214, + "learning_rate": 0.00014310212731102304, + "loss": 0.5947, + "step": 1569 + }, + { + "epoch": 0.3780172154336965, + "grad_norm": 9.701804161071777, + "learning_rate": 0.00014303175456595977, + "loss": 1.0711, + "step": 1570 + }, + { + "epoch": 0.3782579907301511, + "grad_norm": 2.2651548385620117, + "learning_rate": 0.0001429613556574928, + "loss": 0.7001, + "step": 1571 + }, + { + "epoch": 0.37849876602660565, + "grad_norm": 1.20858895778656, + "learning_rate": 0.0001428909306284248, + "loss": 0.6168, + "step": 1572 + }, + { + "epoch": 0.37873954132306026, + "grad_norm": 1.3196377754211426, + "learning_rate": 0.00014282047952157432, + "loss": 0.8402, + "step": 1573 + }, + { + "epoch": 0.3789803166195148, + "grad_norm": 1.9669349193572998, + "learning_rate": 0.00014275000237977582, + "loss": 0.5, + "step": 1574 + }, + { + "epoch": 0.3792210919159694, + "grad_norm": 2.7113590240478516, + "learning_rate": 0.00014267949924587958, + "loss": 0.2134, + "step": 1575 + }, + { + "epoch": 0.37946186721242403, + "grad_norm": 7.937801837921143, + "learning_rate": 0.00014260897016275166, + "loss": 0.4475, + "step": 1576 + }, + { + "epoch": 0.3797026425088786, + "grad_norm": 1.9907861948013306, + "learning_rate": 0.00014253841517327382, + "loss": 0.7746, + "step": 1577 + }, + { + "epoch": 0.3799434178053332, + "grad_norm": 2.06160569190979, + "learning_rate": 0.00014246783432034373, + "loss": 0.9227, + "step": 1578 + }, + { + "epoch": 0.38018419310178775, + "grad_norm": 2.051358461380005, + "learning_rate": 0.00014239722764687474, + "loss": 0.7264, + "step": 1579 + }, + { + "epoch": 0.38042496839824236, + "grad_norm": 3.846851110458374, + "learning_rate": 0.0001423265951957958, + "loss": 0.9719, + "step": 1580 + }, + { + "epoch": 0.3806657436946969, + "grad_norm": 1.47300124168396, + "learning_rate": 0.00014225593701005157, + "loss": 1.17, + "step": 1581 + }, + { + "epoch": 0.3809065189911515, + "grad_norm": 4.379542827606201, + "learning_rate": 0.0001421852531326025, + "loss": 0.5917, + "step": 1582 + }, + { + "epoch": 0.3811472942876061, + "grad_norm": 1.3115028142929077, + "learning_rate": 0.00014211454360642443, + "loss": 0.1916, + "step": 1583 + }, + { + "epoch": 0.3813880695840607, + "grad_norm": 2.608750343322754, + "learning_rate": 0.00014204380847450897, + "loss": 0.8763, + "step": 1584 + }, + { + "epoch": 0.38162884488051524, + "grad_norm": 3.5941126346588135, + "learning_rate": 0.00014197304777986325, + "loss": 0.1222, + "step": 1585 + }, + { + "epoch": 0.38186962017696985, + "grad_norm": 4.869987487792969, + "learning_rate": 0.0001419022615655099, + "loss": 0.8706, + "step": 1586 + }, + { + "epoch": 0.3821103954734244, + "grad_norm": 3.05656099319458, + "learning_rate": 0.00014183144987448711, + "loss": 0.5847, + "step": 1587 + }, + { + "epoch": 0.382351170769879, + "grad_norm": 1.0079351663589478, + "learning_rate": 0.00014176061274984858, + "loss": 0.2984, + "step": 1588 + }, + { + "epoch": 0.3825919460663336, + "grad_norm": 3.344771146774292, + "learning_rate": 0.00014168975023466337, + "loss": 0.5847, + "step": 1589 + }, + { + "epoch": 0.3828327213627882, + "grad_norm": 2.857647657394409, + "learning_rate": 0.00014161886237201612, + "loss": 1.2925, + "step": 1590 + }, + { + "epoch": 0.3830734966592428, + "grad_norm": 2.705115795135498, + "learning_rate": 0.00014154794920500673, + "loss": 0.5277, + "step": 1591 + }, + { + "epoch": 0.38331427195569734, + "grad_norm": 1.9536807537078857, + "learning_rate": 0.00014147701077675065, + "loss": 0.5553, + "step": 1592 + }, + { + "epoch": 0.38355504725215195, + "grad_norm": 1.2713546752929688, + "learning_rate": 0.00014140604713037857, + "loss": 0.507, + "step": 1593 + }, + { + "epoch": 0.3837958225486065, + "grad_norm": 2.593982219696045, + "learning_rate": 0.00014133505830903658, + "loss": 0.3527, + "step": 1594 + }, + { + "epoch": 0.3840365978450611, + "grad_norm": 0.6847010254859924, + "learning_rate": 0.00014126404435588596, + "loss": 0.1223, + "step": 1595 + }, + { + "epoch": 0.38427737314151567, + "grad_norm": 1.8529340028762817, + "learning_rate": 0.00014119300531410342, + "loss": 0.44, + "step": 1596 + }, + { + "epoch": 0.3845181484379703, + "grad_norm": 2.9218854904174805, + "learning_rate": 0.0001411219412268808, + "loss": 0.4854, + "step": 1597 + }, + { + "epoch": 0.38475892373442483, + "grad_norm": 2.5640389919281006, + "learning_rate": 0.00014105085213742533, + "loss": 0.7238, + "step": 1598 + }, + { + "epoch": 0.38499969903087944, + "grad_norm": 0.7277923822402954, + "learning_rate": 0.00014097973808895926, + "loss": 0.2205, + "step": 1599 + }, + { + "epoch": 0.385240474327334, + "grad_norm": 1.501104474067688, + "learning_rate": 0.00014090859912472005, + "loss": 0.1477, + "step": 1600 + }, + { + "epoch": 0.3854812496237886, + "grad_norm": 3.788515329360962, + "learning_rate": 0.00014083743528796045, + "loss": 1.0636, + "step": 1601 + }, + { + "epoch": 0.38572202492024316, + "grad_norm": 2.322822332382202, + "learning_rate": 0.00014076624662194816, + "loss": 0.283, + "step": 1602 + }, + { + "epoch": 0.38596280021669777, + "grad_norm": 1.9796638488769531, + "learning_rate": 0.00014069503316996613, + "loss": 0.3978, + "step": 1603 + }, + { + "epoch": 0.3862035755131524, + "grad_norm": 0.6974928379058838, + "learning_rate": 0.0001406237949753122, + "loss": 0.8248, + "step": 1604 + }, + { + "epoch": 0.38644435080960693, + "grad_norm": 0.8106366991996765, + "learning_rate": 0.00014055253208129938, + "loss": 0.5371, + "step": 1605 + }, + { + "epoch": 0.38668512610606154, + "grad_norm": 1.5011775493621826, + "learning_rate": 0.00014048124453125573, + "loss": 0.2772, + "step": 1606 + }, + { + "epoch": 0.3869259014025161, + "grad_norm": 1.6291502714157104, + "learning_rate": 0.0001404099323685242, + "loss": 0.1682, + "step": 1607 + }, + { + "epoch": 0.3871666766989707, + "grad_norm": 1.8147183656692505, + "learning_rate": 0.00014033859563646276, + "loss": 0.1837, + "step": 1608 + }, + { + "epoch": 0.38740745199542526, + "grad_norm": 2.469822645187378, + "learning_rate": 0.00014026723437844421, + "loss": 0.7883, + "step": 1609 + }, + { + "epoch": 0.38764822729187987, + "grad_norm": 6.157069683074951, + "learning_rate": 0.00014019584863785652, + "loss": 0.3593, + "step": 1610 + }, + { + "epoch": 0.3878890025883344, + "grad_norm": 1.2629841566085815, + "learning_rate": 0.00014012443845810223, + "loss": 0.4991, + "step": 1611 + }, + { + "epoch": 0.38812977788478903, + "grad_norm": 2.7113308906555176, + "learning_rate": 0.000140053003882599, + "loss": 1.136, + "step": 1612 + }, + { + "epoch": 0.3883705531812436, + "grad_norm": 3.3584749698638916, + "learning_rate": 0.00013998154495477912, + "loss": 0.1191, + "step": 1613 + }, + { + "epoch": 0.3886113284776982, + "grad_norm": 2.5008931159973145, + "learning_rate": 0.0001399100617180899, + "loss": 0.6197, + "step": 1614 + }, + { + "epoch": 0.38885210377415275, + "grad_norm": 1.7047406435012817, + "learning_rate": 0.00013983855421599318, + "loss": 0.6819, + "step": 1615 + }, + { + "epoch": 0.38909287907060736, + "grad_norm": 1.2568997144699097, + "learning_rate": 0.0001397670224919658, + "loss": 0.4986, + "step": 1616 + }, + { + "epoch": 0.38933365436706197, + "grad_norm": 4.387941360473633, + "learning_rate": 0.0001396954665894991, + "loss": 0.4947, + "step": 1617 + }, + { + "epoch": 0.3895744296635165, + "grad_norm": 1.8967385292053223, + "learning_rate": 0.00013962388655209927, + "loss": 0.6985, + "step": 1618 + }, + { + "epoch": 0.38981520495997113, + "grad_norm": 3.39685320854187, + "learning_rate": 0.00013955228242328718, + "loss": 1.0637, + "step": 1619 + }, + { + "epoch": 0.3900559802564257, + "grad_norm": 4.821850299835205, + "learning_rate": 0.00013948065424659824, + "loss": 0.4031, + "step": 1620 + }, + { + "epoch": 0.3902967555528803, + "grad_norm": 2.4104623794555664, + "learning_rate": 0.00013940900206558257, + "loss": 0.9255, + "step": 1621 + }, + { + "epoch": 0.39053753084933485, + "grad_norm": 2.2007462978363037, + "learning_rate": 0.00013933732592380483, + "loss": 0.5469, + "step": 1622 + }, + { + "epoch": 0.39077830614578946, + "grad_norm": 2.2772059440612793, + "learning_rate": 0.00013926562586484434, + "loss": 0.4233, + "step": 1623 + }, + { + "epoch": 0.391019081442244, + "grad_norm": 2.6534852981567383, + "learning_rate": 0.00013919390193229485, + "loss": 0.3978, + "step": 1624 + }, + { + "epoch": 0.3912598567386986, + "grad_norm": 0.3831101357936859, + "learning_rate": 0.00013912215416976467, + "loss": 0.2271, + "step": 1625 + }, + { + "epoch": 0.3915006320351532, + "grad_norm": 1.9152987003326416, + "learning_rate": 0.00013905038262087662, + "loss": 0.522, + "step": 1626 + }, + { + "epoch": 0.3917414073316078, + "grad_norm": 2.0952141284942627, + "learning_rate": 0.00013897858732926793, + "loss": 0.2229, + "step": 1627 + }, + { + "epoch": 0.39198218262806234, + "grad_norm": 10.112699508666992, + "learning_rate": 0.00013890676833859037, + "loss": 1.0788, + "step": 1628 + }, + { + "epoch": 0.39222295792451695, + "grad_norm": 2.1068572998046875, + "learning_rate": 0.00013883492569250998, + "loss": 0.5627, + "step": 1629 + }, + { + "epoch": 0.39246373322097156, + "grad_norm": 2.1683926582336426, + "learning_rate": 0.00013876305943470724, + "loss": 1.0251, + "step": 1630 + }, + { + "epoch": 0.3927045085174261, + "grad_norm": 5.917585372924805, + "learning_rate": 0.00013869116960887708, + "loss": 0.6836, + "step": 1631 + }, + { + "epoch": 0.3929452838138807, + "grad_norm": 2.575009346008301, + "learning_rate": 0.0001386192562587286, + "loss": 0.8661, + "step": 1632 + }, + { + "epoch": 0.3931860591103353, + "grad_norm": 2.4185233116149902, + "learning_rate": 0.00013854731942798532, + "loss": 0.7001, + "step": 1633 + }, + { + "epoch": 0.3934268344067899, + "grad_norm": 1.6709206104278564, + "learning_rate": 0.00013847535916038496, + "loss": 0.364, + "step": 1634 + }, + { + "epoch": 0.39366760970324444, + "grad_norm": 3.425093650817871, + "learning_rate": 0.00013840337549967955, + "loss": 0.3667, + "step": 1635 + }, + { + "epoch": 0.39390838499969905, + "grad_norm": 1.7669458389282227, + "learning_rate": 0.00013833136848963532, + "loss": 0.733, + "step": 1636 + }, + { + "epoch": 0.3941491602961536, + "grad_norm": 2.1822469234466553, + "learning_rate": 0.00013825933817403267, + "loss": 0.7814, + "step": 1637 + }, + { + "epoch": 0.3943899355926082, + "grad_norm": 8.053266525268555, + "learning_rate": 0.00013818728459666623, + "loss": 0.9111, + "step": 1638 + }, + { + "epoch": 0.39463071088906276, + "grad_norm": 1.4243130683898926, + "learning_rate": 0.0001381152078013447, + "loss": 0.4235, + "step": 1639 + }, + { + "epoch": 0.3948714861855174, + "grad_norm": 1.732535481452942, + "learning_rate": 0.00013804310783189098, + "loss": 0.3293, + "step": 1640 + }, + { + "epoch": 0.3951122614819719, + "grad_norm": 1.332587718963623, + "learning_rate": 0.00013797098473214197, + "loss": 0.6848, + "step": 1641 + }, + { + "epoch": 0.39535303677842654, + "grad_norm": 1.3026105165481567, + "learning_rate": 0.0001378988385459487, + "loss": 0.7852, + "step": 1642 + }, + { + "epoch": 0.3955938120748811, + "grad_norm": 2.118013620376587, + "learning_rate": 0.0001378266693171762, + "loss": 0.4796, + "step": 1643 + }, + { + "epoch": 0.3958345873713357, + "grad_norm": 2.2776410579681396, + "learning_rate": 0.00013775447708970351, + "loss": 1.0214, + "step": 1644 + }, + { + "epoch": 0.3960753626677903, + "grad_norm": 1.8297806978225708, + "learning_rate": 0.0001376822619074237, + "loss": 0.4031, + "step": 1645 + }, + { + "epoch": 0.39631613796424486, + "grad_norm": 1.5983656644821167, + "learning_rate": 0.0001376100238142438, + "loss": 0.2453, + "step": 1646 + }, + { + "epoch": 0.3965569132606995, + "grad_norm": 1.8416905403137207, + "learning_rate": 0.00013753776285408464, + "loss": 0.5695, + "step": 1647 + }, + { + "epoch": 0.396797688557154, + "grad_norm": 2.1590733528137207, + "learning_rate": 0.00013746547907088108, + "loss": 0.1617, + "step": 1648 + }, + { + "epoch": 0.39703846385360864, + "grad_norm": 2.4669997692108154, + "learning_rate": 0.00013739317250858186, + "loss": 0.5653, + "step": 1649 + }, + { + "epoch": 0.3972792391500632, + "grad_norm": 1.7538673877716064, + "learning_rate": 0.0001373208432111495, + "loss": 0.16, + "step": 1650 + }, + { + "epoch": 0.3975200144465178, + "grad_norm": 2.019120216369629, + "learning_rate": 0.00013724849122256035, + "loss": 0.6373, + "step": 1651 + }, + { + "epoch": 0.39776078974297235, + "grad_norm": 1.4879308938980103, + "learning_rate": 0.00013717611658680464, + "loss": 0.8454, + "step": 1652 + }, + { + "epoch": 0.39800156503942696, + "grad_norm": 0.9595705270767212, + "learning_rate": 0.00013710371934788632, + "loss": 0.532, + "step": 1653 + }, + { + "epoch": 0.3982423403358815, + "grad_norm": 1.8083183765411377, + "learning_rate": 0.00013703129954982299, + "loss": 0.4841, + "step": 1654 + }, + { + "epoch": 0.3984831156323361, + "grad_norm": 1.0364370346069336, + "learning_rate": 0.00013695885723664616, + "loss": 0.2084, + "step": 1655 + }, + { + "epoch": 0.3987238909287907, + "grad_norm": 6.035412788391113, + "learning_rate": 0.00013688639245240078, + "loss": 0.7487, + "step": 1656 + }, + { + "epoch": 0.3989646662252453, + "grad_norm": 1.0442893505096436, + "learning_rate": 0.00013681390524114575, + "loss": 0.422, + "step": 1657 + }, + { + "epoch": 0.3992054415216999, + "grad_norm": 2.071849822998047, + "learning_rate": 0.00013674139564695333, + "loss": 0.5663, + "step": 1658 + }, + { + "epoch": 0.39944621681815445, + "grad_norm": 2.249422311782837, + "learning_rate": 0.00013666886371390967, + "loss": 0.679, + "step": 1659 + }, + { + "epoch": 0.39968699211460906, + "grad_norm": 5.166494369506836, + "learning_rate": 0.0001365963094861142, + "loss": 0.9236, + "step": 1660 + }, + { + "epoch": 0.3999277674110636, + "grad_norm": 2.5879993438720703, + "learning_rate": 0.0001365237330076801, + "loss": 0.642, + "step": 1661 + }, + { + "epoch": 0.4001685427075182, + "grad_norm": 2.8723905086517334, + "learning_rate": 0.00013645113432273403, + "loss": 0.7538, + "step": 1662 + }, + { + "epoch": 0.4004093180039728, + "grad_norm": 1.0138564109802246, + "learning_rate": 0.0001363785134754162, + "loss": 0.511, + "step": 1663 + }, + { + "epoch": 0.4006500933004274, + "grad_norm": 3.8104164600372314, + "learning_rate": 0.00013630587050988022, + "loss": 0.4648, + "step": 1664 + }, + { + "epoch": 0.40089086859688194, + "grad_norm": 2.2068583965301514, + "learning_rate": 0.00013623320547029316, + "loss": 0.6258, + "step": 1665 + }, + { + "epoch": 0.40113164389333655, + "grad_norm": 1.245370864868164, + "learning_rate": 0.0001361605184008355, + "loss": 0.4723, + "step": 1666 + }, + { + "epoch": 0.4013724191897911, + "grad_norm": 1.0925084352493286, + "learning_rate": 0.00013608780934570123, + "loss": 0.5381, + "step": 1667 + }, + { + "epoch": 0.4016131944862457, + "grad_norm": 6.653575897216797, + "learning_rate": 0.00013601507834909757, + "loss": 0.5606, + "step": 1668 + }, + { + "epoch": 0.40185396978270027, + "grad_norm": 1.6157435178756714, + "learning_rate": 0.0001359423254552451, + "loss": 0.8517, + "step": 1669 + }, + { + "epoch": 0.4020947450791549, + "grad_norm": 1.4830398559570312, + "learning_rate": 0.00013586955070837777, + "loss": 0.895, + "step": 1670 + }, + { + "epoch": 0.4023355203756095, + "grad_norm": 0.835504949092865, + "learning_rate": 0.00013579675415274284, + "loss": 0.3608, + "step": 1671 + }, + { + "epoch": 0.40257629567206404, + "grad_norm": 3.575409173965454, + "learning_rate": 0.00013572393583260073, + "loss": 0.985, + "step": 1672 + }, + { + "epoch": 0.40281707096851865, + "grad_norm": 2.397228479385376, + "learning_rate": 0.0001356510957922251, + "loss": 0.5574, + "step": 1673 + }, + { + "epoch": 0.4030578462649732, + "grad_norm": 1.162008285522461, + "learning_rate": 0.00013557823407590294, + "loss": 0.4828, + "step": 1674 + }, + { + "epoch": 0.4032986215614278, + "grad_norm": 2.0564050674438477, + "learning_rate": 0.00013550535072793428, + "loss": 1.0467, + "step": 1675 + }, + { + "epoch": 0.40353939685788237, + "grad_norm": 4.555008888244629, + "learning_rate": 0.00013543244579263244, + "loss": 0.645, + "step": 1676 + }, + { + "epoch": 0.403780172154337, + "grad_norm": 1.655927062034607, + "learning_rate": 0.00013535951931432366, + "loss": 0.5477, + "step": 1677 + }, + { + "epoch": 0.40402094745079153, + "grad_norm": 2.4142045974731445, + "learning_rate": 0.0001352865713373475, + "loss": 0.5651, + "step": 1678 + }, + { + "epoch": 0.40426172274724614, + "grad_norm": 2.2285380363464355, + "learning_rate": 0.00013521360190605646, + "loss": 0.648, + "step": 1679 + }, + { + "epoch": 0.4045024980437007, + "grad_norm": 3.8250715732574463, + "learning_rate": 0.00013514061106481614, + "loss": 0.9591, + "step": 1680 + }, + { + "epoch": 0.4047432733401553, + "grad_norm": 0.9585970640182495, + "learning_rate": 0.0001350675988580051, + "loss": 0.3991, + "step": 1681 + }, + { + "epoch": 0.40498404863660986, + "grad_norm": 9.034631729125977, + "learning_rate": 0.00013499456533001497, + "loss": 0.5749, + "step": 1682 + }, + { + "epoch": 0.40522482393306447, + "grad_norm": 2.0019724369049072, + "learning_rate": 0.00013492151052525023, + "loss": 0.1236, + "step": 1683 + }, + { + "epoch": 0.405465599229519, + "grad_norm": 11.653858184814453, + "learning_rate": 0.00013484843448812844, + "loss": 0.785, + "step": 1684 + }, + { + "epoch": 0.40570637452597363, + "grad_norm": 2.2401812076568604, + "learning_rate": 0.00013477533726308, + "loss": 0.8912, + "step": 1685 + }, + { + "epoch": 0.40594714982242824, + "grad_norm": 12.922853469848633, + "learning_rate": 0.0001347022188945481, + "loss": 0.6012, + "step": 1686 + }, + { + "epoch": 0.4061879251188828, + "grad_norm": 1.3376822471618652, + "learning_rate": 0.00013462907942698895, + "loss": 1.2057, + "step": 1687 + }, + { + "epoch": 0.4064287004153374, + "grad_norm": 6.5069708824157715, + "learning_rate": 0.00013455591890487148, + "loss": 0.8799, + "step": 1688 + }, + { + "epoch": 0.40666947571179196, + "grad_norm": 1.1161401271820068, + "learning_rate": 0.0001344827373726775, + "loss": 1.4456, + "step": 1689 + }, + { + "epoch": 0.40691025100824657, + "grad_norm": 0.9486348032951355, + "learning_rate": 0.00013440953487490144, + "loss": 0.5933, + "step": 1690 + }, + { + "epoch": 0.4071510263047011, + "grad_norm": 1.8005541563034058, + "learning_rate": 0.0001343363114560507, + "loss": 0.7821, + "step": 1691 + }, + { + "epoch": 0.40739180160115573, + "grad_norm": 2.908756732940674, + "learning_rate": 0.0001342630671606452, + "loss": 0.8259, + "step": 1692 + }, + { + "epoch": 0.4076325768976103, + "grad_norm": 1.161380648612976, + "learning_rate": 0.00013418980203321772, + "loss": 0.7767, + "step": 1693 + }, + { + "epoch": 0.4078733521940649, + "grad_norm": 2.2439661026000977, + "learning_rate": 0.00013411651611831352, + "loss": 0.3818, + "step": 1694 + }, + { + "epoch": 0.40811412749051945, + "grad_norm": 2.2217512130737305, + "learning_rate": 0.00013404320946049068, + "loss": 0.2162, + "step": 1695 + }, + { + "epoch": 0.40835490278697406, + "grad_norm": 2.809119462966919, + "learning_rate": 0.00013396988210431977, + "loss": 0.7169, + "step": 1696 + }, + { + "epoch": 0.4085956780834286, + "grad_norm": 2.8886725902557373, + "learning_rate": 0.00013389653409438406, + "loss": 0.349, + "step": 1697 + }, + { + "epoch": 0.4088364533798832, + "grad_norm": 1.2677594423294067, + "learning_rate": 0.00013382316547527919, + "loss": 0.2073, + "step": 1698 + }, + { + "epoch": 0.40907722867633783, + "grad_norm": 6.638054370880127, + "learning_rate": 0.00013374977629161355, + "loss": 0.9768, + "step": 1699 + }, + { + "epoch": 0.4093180039727924, + "grad_norm": 2.200249195098877, + "learning_rate": 0.00013367636658800783, + "loss": 0.4204, + "step": 1700 + }, + { + "epoch": 0.409558779269247, + "grad_norm": 2.565556526184082, + "learning_rate": 0.0001336029364090954, + "loss": 0.8401, + "step": 1701 + }, + { + "epoch": 0.40979955456570155, + "grad_norm": 1.9111295938491821, + "learning_rate": 0.0001335294857995219, + "loss": 1.2343, + "step": 1702 + }, + { + "epoch": 0.41004032986215616, + "grad_norm": 5.341217041015625, + "learning_rate": 0.0001334560148039455, + "loss": 0.371, + "step": 1703 + }, + { + "epoch": 0.4102811051586107, + "grad_norm": 1.3484272956848145, + "learning_rate": 0.00013338252346703673, + "loss": 0.7788, + "step": 1704 + }, + { + "epoch": 0.4105218804550653, + "grad_norm": 2.7777099609375, + "learning_rate": 0.00013330901183347847, + "loss": 0.4438, + "step": 1705 + }, + { + "epoch": 0.4107626557515199, + "grad_norm": 2.4722752571105957, + "learning_rate": 0.00013323547994796597, + "loss": 0.4454, + "step": 1706 + }, + { + "epoch": 0.4110034310479745, + "grad_norm": 2.2678263187408447, + "learning_rate": 0.0001331619278552068, + "loss": 0.4712, + "step": 1707 + }, + { + "epoch": 0.41124420634442904, + "grad_norm": 2.552933692932129, + "learning_rate": 0.00013308835559992075, + "loss": 0.6171, + "step": 1708 + }, + { + "epoch": 0.41148498164088365, + "grad_norm": 4.140172958374023, + "learning_rate": 0.00013301476322683997, + "loss": 1.2291, + "step": 1709 + }, + { + "epoch": 0.4117257569373382, + "grad_norm": 1.883234977722168, + "learning_rate": 0.00013294115078070875, + "loss": 0.3714, + "step": 1710 + }, + { + "epoch": 0.4119665322337928, + "grad_norm": 1.5748333930969238, + "learning_rate": 0.00013286751830628363, + "loss": 0.891, + "step": 1711 + }, + { + "epoch": 0.4122073075302474, + "grad_norm": 1.558668613433838, + "learning_rate": 0.00013279386584833335, + "loss": 0.5892, + "step": 1712 + }, + { + "epoch": 0.412448082826702, + "grad_norm": 6.810975074768066, + "learning_rate": 0.00013272019345163873, + "loss": 0.5012, + "step": 1713 + }, + { + "epoch": 0.4126888581231566, + "grad_norm": 2.5344254970550537, + "learning_rate": 0.00013264650116099277, + "loss": 0.6199, + "step": 1714 + }, + { + "epoch": 0.41292963341961114, + "grad_norm": 1.2778170108795166, + "learning_rate": 0.00013257278902120058, + "loss": 0.8041, + "step": 1715 + }, + { + "epoch": 0.41317040871606575, + "grad_norm": 1.3319803476333618, + "learning_rate": 0.00013249905707707926, + "loss": 1.0953, + "step": 1716 + }, + { + "epoch": 0.4134111840125203, + "grad_norm": 4.633189678192139, + "learning_rate": 0.000132425305373458, + "loss": 0.9773, + "step": 1717 + }, + { + "epoch": 0.4136519593089749, + "grad_norm": 1.2184745073318481, + "learning_rate": 0.00013235153395517804, + "loss": 0.7046, + "step": 1718 + }, + { + "epoch": 0.41389273460542947, + "grad_norm": 1.2916301488876343, + "learning_rate": 0.00013227774286709253, + "loss": 0.2718, + "step": 1719 + }, + { + "epoch": 0.4141335099018841, + "grad_norm": 1.1648756265640259, + "learning_rate": 0.00013220393215406664, + "loss": 0.446, + "step": 1720 + }, + { + "epoch": 0.41437428519833863, + "grad_norm": 2.0171449184417725, + "learning_rate": 0.00013213010186097744, + "loss": 0.1262, + "step": 1721 + }, + { + "epoch": 0.41461506049479324, + "grad_norm": 2.397416591644287, + "learning_rate": 0.00013205625203271395, + "loss": 0.7722, + "step": 1722 + }, + { + "epoch": 0.4148558357912478, + "grad_norm": 0.6799049377441406, + "learning_rate": 0.00013198238271417697, + "loss": 0.6582, + "step": 1723 + }, + { + "epoch": 0.4150966110877024, + "grad_norm": 2.0616261959075928, + "learning_rate": 0.00013190849395027928, + "loss": 1.2671, + "step": 1724 + }, + { + "epoch": 0.41533738638415696, + "grad_norm": 0.9546332955360413, + "learning_rate": 0.00013183458578594533, + "loss": 0.2217, + "step": 1725 + }, + { + "epoch": 0.41557816168061157, + "grad_norm": 4.271639823913574, + "learning_rate": 0.0001317606582661115, + "loss": 0.6956, + "step": 1726 + }, + { + "epoch": 0.4158189369770662, + "grad_norm": 1.4144961833953857, + "learning_rate": 0.0001316867114357259, + "loss": 0.6413, + "step": 1727 + }, + { + "epoch": 0.41605971227352073, + "grad_norm": 0.7294138073921204, + "learning_rate": 0.00013161274533974836, + "loss": 0.3907, + "step": 1728 + }, + { + "epoch": 0.41630048756997534, + "grad_norm": 1.5399507284164429, + "learning_rate": 0.00013153876002315045, + "loss": 0.635, + "step": 1729 + }, + { + "epoch": 0.4165412628664299, + "grad_norm": 1.771852731704712, + "learning_rate": 0.00013146475553091536, + "loss": 0.4428, + "step": 1730 + }, + { + "epoch": 0.4167820381628845, + "grad_norm": 1.6749565601348877, + "learning_rate": 0.000131390731908038, + "loss": 0.5446, + "step": 1731 + }, + { + "epoch": 0.41702281345933906, + "grad_norm": 1.4587945938110352, + "learning_rate": 0.00013131668919952495, + "loss": 0.8724, + "step": 1732 + }, + { + "epoch": 0.41726358875579367, + "grad_norm": 2.2476232051849365, + "learning_rate": 0.0001312426274503943, + "loss": 0.184, + "step": 1733 + }, + { + "epoch": 0.4175043640522482, + "grad_norm": 1.1287919282913208, + "learning_rate": 0.00013116854670567577, + "loss": 0.4209, + "step": 1734 + }, + { + "epoch": 0.41774513934870283, + "grad_norm": 0.6734319925308228, + "learning_rate": 0.00013109444701041057, + "loss": 0.2422, + "step": 1735 + }, + { + "epoch": 0.4179859146451574, + "grad_norm": 2.265183448791504, + "learning_rate": 0.0001310203284096516, + "loss": 0.6902, + "step": 1736 + }, + { + "epoch": 0.418226689941612, + "grad_norm": 3.4037933349609375, + "learning_rate": 0.00013094619094846304, + "loss": 0.628, + "step": 1737 + }, + { + "epoch": 0.41846746523806655, + "grad_norm": 4.971876621246338, + "learning_rate": 0.00013087203467192067, + "loss": 0.9363, + "step": 1738 + }, + { + "epoch": 0.41870824053452116, + "grad_norm": 2.3928446769714355, + "learning_rate": 0.00013079785962511164, + "loss": 0.5608, + "step": 1739 + }, + { + "epoch": 0.41894901583097577, + "grad_norm": 1.1700559854507446, + "learning_rate": 0.0001307236658531346, + "loss": 0.4105, + "step": 1740 + }, + { + "epoch": 0.4191897911274303, + "grad_norm": 1.4026082754135132, + "learning_rate": 0.00013064945340109948, + "loss": 0.6884, + "step": 1741 + }, + { + "epoch": 0.41943056642388493, + "grad_norm": 2.345377206802368, + "learning_rate": 0.00013057522231412765, + "loss": 0.6579, + "step": 1742 + }, + { + "epoch": 0.4196713417203395, + "grad_norm": 0.7213815450668335, + "learning_rate": 0.00013050097263735174, + "loss": 0.4405, + "step": 1743 + }, + { + "epoch": 0.4199121170167941, + "grad_norm": 0.8045918941497803, + "learning_rate": 0.0001304267044159158, + "loss": 0.4975, + "step": 1744 + }, + { + "epoch": 0.42015289231324865, + "grad_norm": 1.4894392490386963, + "learning_rate": 0.000130352417694975, + "loss": 0.5714, + "step": 1745 + }, + { + "epoch": 0.42039366760970326, + "grad_norm": 8.357844352722168, + "learning_rate": 0.00013027811251969585, + "loss": 0.6262, + "step": 1746 + }, + { + "epoch": 0.4206344429061578, + "grad_norm": 1.4566922187805176, + "learning_rate": 0.00013020378893525603, + "loss": 0.1933, + "step": 1747 + }, + { + "epoch": 0.4208752182026124, + "grad_norm": 0.6821098327636719, + "learning_rate": 0.00013012944698684455, + "loss": 0.4767, + "step": 1748 + }, + { + "epoch": 0.421115993499067, + "grad_norm": 2.3084802627563477, + "learning_rate": 0.00013005508671966141, + "loss": 0.6336, + "step": 1749 + }, + { + "epoch": 0.4213567687955216, + "grad_norm": 2.982093572616577, + "learning_rate": 0.0001299807081789178, + "loss": 0.5048, + "step": 1750 + }, + { + "epoch": 0.42159754409197614, + "grad_norm": 1.2381023168563843, + "learning_rate": 0.0001299063114098361, + "loss": 0.6217, + "step": 1751 + }, + { + "epoch": 0.42183831938843075, + "grad_norm": 2.4936861991882324, + "learning_rate": 0.00012983189645764966, + "loss": 0.5497, + "step": 1752 + }, + { + "epoch": 0.4220790946848853, + "grad_norm": 0.9683302044868469, + "learning_rate": 0.00012975746336760298, + "loss": 0.3565, + "step": 1753 + }, + { + "epoch": 0.4223198699813399, + "grad_norm": 3.492793083190918, + "learning_rate": 0.00012968301218495152, + "loss": 0.5133, + "step": 1754 + }, + { + "epoch": 0.4225606452777945, + "grad_norm": 2.7869482040405273, + "learning_rate": 0.00012960854295496178, + "loss": 0.9106, + "step": 1755 + }, + { + "epoch": 0.4228014205742491, + "grad_norm": 2.8004496097564697, + "learning_rate": 0.00012953405572291117, + "loss": 0.5493, + "step": 1756 + }, + { + "epoch": 0.4230421958707037, + "grad_norm": 1.4663894176483154, + "learning_rate": 0.0001294595505340882, + "loss": 0.4555, + "step": 1757 + }, + { + "epoch": 0.42328297116715824, + "grad_norm": 14.337491035461426, + "learning_rate": 0.00012938502743379212, + "loss": 0.7048, + "step": 1758 + }, + { + "epoch": 0.42352374646361285, + "grad_norm": 1.1422491073608398, + "learning_rate": 0.00012931048646733313, + "loss": 0.6569, + "step": 1759 + }, + { + "epoch": 0.4237645217600674, + "grad_norm": 0.4564094841480255, + "learning_rate": 0.00012923592768003235, + "loss": 0.1381, + "step": 1760 + }, + { + "epoch": 0.424005297056522, + "grad_norm": 2.7814853191375732, + "learning_rate": 0.00012916135111722165, + "loss": 0.5271, + "step": 1761 + }, + { + "epoch": 0.42424607235297657, + "grad_norm": 3.1444740295410156, + "learning_rate": 0.0001290867568242438, + "loss": 0.9703, + "step": 1762 + }, + { + "epoch": 0.4244868476494312, + "grad_norm": 0.9466924071311951, + "learning_rate": 0.00012901214484645226, + "loss": 0.64, + "step": 1763 + }, + { + "epoch": 0.42472762294588573, + "grad_norm": 0.9237557053565979, + "learning_rate": 0.00012893751522921124, + "loss": 0.6848, + "step": 1764 + }, + { + "epoch": 0.42496839824234034, + "grad_norm": 2.4244697093963623, + "learning_rate": 0.00012886286801789583, + "loss": 0.6039, + "step": 1765 + }, + { + "epoch": 0.4252091735387949, + "grad_norm": 9.26452922821045, + "learning_rate": 0.00012878820325789162, + "loss": 0.6834, + "step": 1766 + }, + { + "epoch": 0.4254499488352495, + "grad_norm": 0.7539100646972656, + "learning_rate": 0.00012871352099459496, + "loss": 0.3441, + "step": 1767 + }, + { + "epoch": 0.4256907241317041, + "grad_norm": 2.2698490619659424, + "learning_rate": 0.00012863882127341284, + "loss": 0.9277, + "step": 1768 + }, + { + "epoch": 0.42593149942815867, + "grad_norm": 5.280149936676025, + "learning_rate": 0.00012856410413976285, + "loss": 0.9697, + "step": 1769 + }, + { + "epoch": 0.4261722747246133, + "grad_norm": 3.395625591278076, + "learning_rate": 0.0001284893696390732, + "loss": 1.0316, + "step": 1770 + }, + { + "epoch": 0.42641305002106783, + "grad_norm": 1.7464591264724731, + "learning_rate": 0.00012841461781678263, + "loss": 0.7941, + "step": 1771 + }, + { + "epoch": 0.42665382531752244, + "grad_norm": 2.47660493850708, + "learning_rate": 0.00012833984871834042, + "loss": 0.9419, + "step": 1772 + }, + { + "epoch": 0.426894600613977, + "grad_norm": 1.8424837589263916, + "learning_rate": 0.00012826506238920632, + "loss": 0.6514, + "step": 1773 + }, + { + "epoch": 0.4271353759104316, + "grad_norm": 11.50127124786377, + "learning_rate": 0.00012819025887485062, + "loss": 1.3198, + "step": 1774 + }, + { + "epoch": 0.42737615120688616, + "grad_norm": 1.003143548965454, + "learning_rate": 0.00012811543822075397, + "loss": 0.2809, + "step": 1775 + }, + { + "epoch": 0.42761692650334077, + "grad_norm": 1.2120084762573242, + "learning_rate": 0.00012804060047240756, + "loss": 0.4469, + "step": 1776 + }, + { + "epoch": 0.4278577017997953, + "grad_norm": 2.660773515701294, + "learning_rate": 0.0001279657456753129, + "loss": 0.8044, + "step": 1777 + }, + { + "epoch": 0.42809847709624993, + "grad_norm": 3.651428461074829, + "learning_rate": 0.00012789087387498187, + "loss": 0.7613, + "step": 1778 + }, + { + "epoch": 0.4283392523927045, + "grad_norm": 1.7895033359527588, + "learning_rate": 0.00012781598511693666, + "loss": 0.5028, + "step": 1779 + }, + { + "epoch": 0.4285800276891591, + "grad_norm": 2.3747005462646484, + "learning_rate": 0.00012774107944670983, + "loss": 0.6884, + "step": 1780 + }, + { + "epoch": 0.4288208029856137, + "grad_norm": 0.8780110478401184, + "learning_rate": 0.00012766615690984422, + "loss": 0.5869, + "step": 1781 + }, + { + "epoch": 0.42906157828206826, + "grad_norm": 1.754726529121399, + "learning_rate": 0.00012759121755189282, + "loss": 0.883, + "step": 1782 + }, + { + "epoch": 0.42930235357852287, + "grad_norm": 1.48545241355896, + "learning_rate": 0.00012751626141841902, + "loss": 0.6704, + "step": 1783 + }, + { + "epoch": 0.4295431288749774, + "grad_norm": 1.908327579498291, + "learning_rate": 0.0001274412885549963, + "loss": 0.5986, + "step": 1784 + }, + { + "epoch": 0.42978390417143203, + "grad_norm": 2.356943130493164, + "learning_rate": 0.0001273662990072083, + "loss": 0.5414, + "step": 1785 + }, + { + "epoch": 0.4300246794678866, + "grad_norm": 2.7557711601257324, + "learning_rate": 0.00012729129282064886, + "loss": 0.7579, + "step": 1786 + }, + { + "epoch": 0.4302654547643412, + "grad_norm": 1.6613632440567017, + "learning_rate": 0.00012721627004092184, + "loss": 0.7389, + "step": 1787 + }, + { + "epoch": 0.43050623006079575, + "grad_norm": 0.23793041706085205, + "learning_rate": 0.00012714123071364138, + "loss": 0.3544, + "step": 1788 + }, + { + "epoch": 0.43074700535725036, + "grad_norm": 0.7207126617431641, + "learning_rate": 0.0001270661748844315, + "loss": 0.4286, + "step": 1789 + }, + { + "epoch": 0.4309877806537049, + "grad_norm": 2.2661566734313965, + "learning_rate": 0.00012699110259892625, + "loss": 0.8774, + "step": 1790 + }, + { + "epoch": 0.4312285559501595, + "grad_norm": 2.410264730453491, + "learning_rate": 0.00012691601390276983, + "loss": 0.6303, + "step": 1791 + }, + { + "epoch": 0.4314693312466141, + "grad_norm": 3.2463836669921875, + "learning_rate": 0.00012684090884161636, + "loss": 0.4901, + "step": 1792 + }, + { + "epoch": 0.4317101065430687, + "grad_norm": 2.540635824203491, + "learning_rate": 0.0001267657874611298, + "loss": 0.4825, + "step": 1793 + }, + { + "epoch": 0.43195088183952324, + "grad_norm": 0.8397485613822937, + "learning_rate": 0.00012669064980698418, + "loss": 0.1515, + "step": 1794 + }, + { + "epoch": 0.43219165713597785, + "grad_norm": 3.4554481506347656, + "learning_rate": 0.00012661549592486327, + "loss": 0.8663, + "step": 1795 + }, + { + "epoch": 0.43243243243243246, + "grad_norm": 2.4448556900024414, + "learning_rate": 0.00012654032586046097, + "loss": 0.2905, + "step": 1796 + }, + { + "epoch": 0.432673207728887, + "grad_norm": 2.5620980262756348, + "learning_rate": 0.0001264651396594807, + "loss": 0.4889, + "step": 1797 + }, + { + "epoch": 0.4329139830253416, + "grad_norm": 1.970119833946228, + "learning_rate": 0.0001263899373676359, + "loss": 0.6237, + "step": 1798 + }, + { + "epoch": 0.4331547583217962, + "grad_norm": 1.576965093612671, + "learning_rate": 0.00012631471903064973, + "loss": 0.4789, + "step": 1799 + }, + { + "epoch": 0.4333955336182508, + "grad_norm": 2.5547585487365723, + "learning_rate": 0.0001262394846942551, + "loss": 0.2264, + "step": 1800 + }, + { + "epoch": 0.43363630891470534, + "grad_norm": 1.3450043201446533, + "learning_rate": 0.00012616423440419468, + "loss": 0.6556, + "step": 1801 + }, + { + "epoch": 0.43387708421115995, + "grad_norm": 2.3657472133636475, + "learning_rate": 0.00012608896820622077, + "loss": 0.5502, + "step": 1802 + }, + { + "epoch": 0.4341178595076145, + "grad_norm": 0.8529106974601746, + "learning_rate": 0.0001260136861460954, + "loss": 0.5755, + "step": 1803 + }, + { + "epoch": 0.4343586348040691, + "grad_norm": 1.913244366645813, + "learning_rate": 0.00012593838826959023, + "loss": 0.4943, + "step": 1804 + }, + { + "epoch": 0.43459941010052366, + "grad_norm": 2.5146071910858154, + "learning_rate": 0.0001258630746224866, + "loss": 0.7254, + "step": 1805 + }, + { + "epoch": 0.4348401853969783, + "grad_norm": 1.7178691625595093, + "learning_rate": 0.00012578774525057532, + "loss": 0.5247, + "step": 1806 + }, + { + "epoch": 0.4350809606934328, + "grad_norm": 3.7382612228393555, + "learning_rate": 0.0001257124001996568, + "loss": 0.6197, + "step": 1807 + }, + { + "epoch": 0.43532173598988744, + "grad_norm": 4.024393558502197, + "learning_rate": 0.00012563703951554102, + "loss": 0.6969, + "step": 1808 + }, + { + "epoch": 0.43556251128634205, + "grad_norm": 2.9647786617279053, + "learning_rate": 0.0001255616632440475, + "loss": 0.4495, + "step": 1809 + }, + { + "epoch": 0.4358032865827966, + "grad_norm": 3.270671844482422, + "learning_rate": 0.0001254862714310051, + "loss": 1.3434, + "step": 1810 + }, + { + "epoch": 0.4360440618792512, + "grad_norm": 2.660315752029419, + "learning_rate": 0.00012541086412225225, + "loss": 0.3752, + "step": 1811 + }, + { + "epoch": 0.43628483717570576, + "grad_norm": 3.405566453933716, + "learning_rate": 0.00012533544136363677, + "loss": 0.6865, + "step": 1812 + }, + { + "epoch": 0.4365256124721604, + "grad_norm": 1.3535075187683105, + "learning_rate": 0.00012526000320101584, + "loss": 0.5975, + "step": 1813 + }, + { + "epoch": 0.4367663877686149, + "grad_norm": 7.082382678985596, + "learning_rate": 0.0001251845496802561, + "loss": 0.9345, + "step": 1814 + }, + { + "epoch": 0.43700716306506954, + "grad_norm": 3.6921160221099854, + "learning_rate": 0.00012510908084723335, + "loss": 0.7298, + "step": 1815 + }, + { + "epoch": 0.4372479383615241, + "grad_norm": 1.1721895933151245, + "learning_rate": 0.00012503359674783293, + "loss": 0.7013, + "step": 1816 + }, + { + "epoch": 0.4374887136579787, + "grad_norm": 3.0077133178710938, + "learning_rate": 0.00012495809742794927, + "loss": 1.094, + "step": 1817 + }, + { + "epoch": 0.43772948895443325, + "grad_norm": 0.8102177381515503, + "learning_rate": 0.00012488258293348614, + "loss": 0.5695, + "step": 1818 + }, + { + "epoch": 0.43797026425088786, + "grad_norm": 2.2918097972869873, + "learning_rate": 0.0001248070533103565, + "loss": 0.3564, + "step": 1819 + }, + { + "epoch": 0.4382110395473424, + "grad_norm": 0.9323842525482178, + "learning_rate": 0.0001247315086044826, + "loss": 0.38, + "step": 1820 + }, + { + "epoch": 0.438451814843797, + "grad_norm": 3.6484107971191406, + "learning_rate": 0.0001246559488617957, + "loss": 0.4016, + "step": 1821 + }, + { + "epoch": 0.43869259014025164, + "grad_norm": 2.140214204788208, + "learning_rate": 0.0001245803741282364, + "loss": 0.4333, + "step": 1822 + }, + { + "epoch": 0.4389333654367062, + "grad_norm": 2.356504201889038, + "learning_rate": 0.00012450478444975423, + "loss": 0.3219, + "step": 1823 + }, + { + "epoch": 0.4391741407331608, + "grad_norm": 1.8598113059997559, + "learning_rate": 0.0001244291798723079, + "loss": 0.4902, + "step": 1824 + }, + { + "epoch": 0.43941491602961535, + "grad_norm": 3.3434224128723145, + "learning_rate": 0.00012435356044186512, + "loss": 0.63, + "step": 1825 + }, + { + "epoch": 0.43965569132606996, + "grad_norm": 0.9114461541175842, + "learning_rate": 0.00012427792620440278, + "loss": 0.1613, + "step": 1826 + }, + { + "epoch": 0.4398964666225245, + "grad_norm": 3.5121147632598877, + "learning_rate": 0.00012420227720590657, + "loss": 0.9353, + "step": 1827 + }, + { + "epoch": 0.4401372419189791, + "grad_norm": 1.0356240272521973, + "learning_rate": 0.00012412661349237134, + "loss": 0.2213, + "step": 1828 + }, + { + "epoch": 0.4403780172154337, + "grad_norm": 2.9189321994781494, + "learning_rate": 0.00012405093510980072, + "loss": 0.8359, + "step": 1829 + }, + { + "epoch": 0.4406187925118883, + "grad_norm": 2.3521268367767334, + "learning_rate": 0.00012397524210420736, + "loss": 0.6263, + "step": 1830 + }, + { + "epoch": 0.44085956780834284, + "grad_norm": 1.810509443283081, + "learning_rate": 0.0001238995345216128, + "loss": 0.415, + "step": 1831 + }, + { + "epoch": 0.44110034310479745, + "grad_norm": 1.4703214168548584, + "learning_rate": 0.0001238238124080474, + "loss": 1.0236, + "step": 1832 + }, + { + "epoch": 0.441341118401252, + "grad_norm": 1.8066413402557373, + "learning_rate": 0.0001237480758095504, + "loss": 0.5183, + "step": 1833 + }, + { + "epoch": 0.4415818936977066, + "grad_norm": 2.030515670776367, + "learning_rate": 0.00012367232477216973, + "loss": 0.963, + "step": 1834 + }, + { + "epoch": 0.44182266899416117, + "grad_norm": 4.339605808258057, + "learning_rate": 0.00012359655934196236, + "loss": 0.9798, + "step": 1835 + }, + { + "epoch": 0.4420634442906158, + "grad_norm": 1.818166971206665, + "learning_rate": 0.00012352077956499365, + "loss": 0.4265, + "step": 1836 + }, + { + "epoch": 0.4423042195870704, + "grad_norm": 3.2408132553100586, + "learning_rate": 0.00012344498548733806, + "loss": 0.9419, + "step": 1837 + }, + { + "epoch": 0.44254499488352494, + "grad_norm": 2.5602407455444336, + "learning_rate": 0.0001233691771550784, + "loss": 0.7254, + "step": 1838 + }, + { + "epoch": 0.44278577017997955, + "grad_norm": 2.216360330581665, + "learning_rate": 0.0001232933546143064, + "loss": 0.9503, + "step": 1839 + }, + { + "epoch": 0.4430265454764341, + "grad_norm": 1.1760109663009644, + "learning_rate": 0.00012321751791112234, + "loss": 0.6887, + "step": 1840 + }, + { + "epoch": 0.4432673207728887, + "grad_norm": 3.627732276916504, + "learning_rate": 0.00012314166709163508, + "loss": 0.5434, + "step": 1841 + }, + { + "epoch": 0.44350809606934327, + "grad_norm": 2.6786983013153076, + "learning_rate": 0.00012306580220196206, + "loss": 1.46, + "step": 1842 + }, + { + "epoch": 0.4437488713657979, + "grad_norm": 1.4511840343475342, + "learning_rate": 0.00012298992328822937, + "loss": 0.739, + "step": 1843 + }, + { + "epoch": 0.44398964666225244, + "grad_norm": 6.163101673126221, + "learning_rate": 0.00012291403039657147, + "loss": 0.6288, + "step": 1844 + }, + { + "epoch": 0.44423042195870704, + "grad_norm": 3.6202635765075684, + "learning_rate": 0.00012283812357313152, + "loss": 0.3465, + "step": 1845 + }, + { + "epoch": 0.4444711972551616, + "grad_norm": 2.3283517360687256, + "learning_rate": 0.00012276220286406097, + "loss": 0.9489, + "step": 1846 + }, + { + "epoch": 0.4447119725516162, + "grad_norm": 0.9239123463630676, + "learning_rate": 0.00012268626831551978, + "loss": 0.2347, + "step": 1847 + }, + { + "epoch": 0.44495274784807076, + "grad_norm": 2.289092779159546, + "learning_rate": 0.00012261031997367632, + "loss": 0.6748, + "step": 1848 + }, + { + "epoch": 0.44519352314452537, + "grad_norm": 3.025836706161499, + "learning_rate": 0.0001225343578847074, + "loss": 0.9556, + "step": 1849 + }, + { + "epoch": 0.44543429844098, + "grad_norm": 2.054135322570801, + "learning_rate": 0.00012245838209479812, + "loss": 0.6107, + "step": 1850 + }, + { + "epoch": 0.44567507373743454, + "grad_norm": 2.0344197750091553, + "learning_rate": 0.0001223823926501419, + "loss": 0.4859, + "step": 1851 + }, + { + "epoch": 0.44591584903388914, + "grad_norm": 1.1552016735076904, + "learning_rate": 0.00012230638959694054, + "loss": 0.4992, + "step": 1852 + }, + { + "epoch": 0.4461566243303437, + "grad_norm": 4.617137908935547, + "learning_rate": 0.00012223037298140406, + "loss": 0.4169, + "step": 1853 + }, + { + "epoch": 0.4463973996267983, + "grad_norm": 1.8090236186981201, + "learning_rate": 0.00012215434284975073, + "loss": 1.1123, + "step": 1854 + }, + { + "epoch": 0.44663817492325286, + "grad_norm": 1.46204674243927, + "learning_rate": 0.000122078299248207, + "loss": 0.3101, + "step": 1855 + }, + { + "epoch": 0.44687895021970747, + "grad_norm": 2.3522417545318604, + "learning_rate": 0.00012200224222300758, + "loss": 0.3873, + "step": 1856 + }, + { + "epoch": 0.447119725516162, + "grad_norm": 2.6018474102020264, + "learning_rate": 0.00012192617182039534, + "loss": 1.0129, + "step": 1857 + }, + { + "epoch": 0.44736050081261663, + "grad_norm": 4.971423149108887, + "learning_rate": 0.00012185008808662124, + "loss": 0.7512, + "step": 1858 + }, + { + "epoch": 0.4476012761090712, + "grad_norm": 1.7388516664505005, + "learning_rate": 0.00012177399106794433, + "loss": 0.9286, + "step": 1859 + }, + { + "epoch": 0.4478420514055258, + "grad_norm": 2.4302382469177246, + "learning_rate": 0.0001216978808106318, + "loss": 1.2512, + "step": 1860 + }, + { + "epoch": 0.44808282670198035, + "grad_norm": 2.7931926250457764, + "learning_rate": 0.00012162175736095887, + "loss": 0.5571, + "step": 1861 + }, + { + "epoch": 0.44832360199843496, + "grad_norm": 1.046998381614685, + "learning_rate": 0.00012154562076520874, + "loss": 0.5747, + "step": 1862 + }, + { + "epoch": 0.44856437729488957, + "grad_norm": 1.4877816438674927, + "learning_rate": 0.00012146947106967266, + "loss": 0.9024, + "step": 1863 + }, + { + "epoch": 0.4488051525913441, + "grad_norm": 0.7219827175140381, + "learning_rate": 0.00012139330832064974, + "loss": 0.3575, + "step": 1864 + }, + { + "epoch": 0.44904592788779873, + "grad_norm": 2.1786413192749023, + "learning_rate": 0.00012131713256444722, + "loss": 0.5989, + "step": 1865 + }, + { + "epoch": 0.4492867031842533, + "grad_norm": 3.176419973373413, + "learning_rate": 0.00012124094384738005, + "loss": 0.6001, + "step": 1866 + }, + { + "epoch": 0.4495274784807079, + "grad_norm": 4.134557723999023, + "learning_rate": 0.00012116474221577116, + "loss": 0.3355, + "step": 1867 + }, + { + "epoch": 0.44976825377716245, + "grad_norm": 1.623186707496643, + "learning_rate": 0.00012108852771595129, + "loss": 0.4517, + "step": 1868 + }, + { + "epoch": 0.45000902907361706, + "grad_norm": 0.6475129723548889, + "learning_rate": 0.00012101230039425911, + "loss": 0.3038, + "step": 1869 + }, + { + "epoch": 0.4502498043700716, + "grad_norm": 1.8964297771453857, + "learning_rate": 0.00012093606029704094, + "loss": 0.1228, + "step": 1870 + }, + { + "epoch": 0.4504905796665262, + "grad_norm": 3.343824625015259, + "learning_rate": 0.00012085980747065093, + "loss": 0.7346, + "step": 1871 + }, + { + "epoch": 0.4507313549629808, + "grad_norm": 2.4471538066864014, + "learning_rate": 0.00012078354196145099, + "loss": 0.7198, + "step": 1872 + }, + { + "epoch": 0.4509721302594354, + "grad_norm": 1.736475944519043, + "learning_rate": 0.00012070726381581068, + "loss": 1.0944, + "step": 1873 + }, + { + "epoch": 0.45121290555588994, + "grad_norm": 2.5887935161590576, + "learning_rate": 0.00012063097308010734, + "loss": 0.7277, + "step": 1874 + }, + { + "epoch": 0.45145368085234455, + "grad_norm": 2.683844804763794, + "learning_rate": 0.0001205546698007259, + "loss": 0.263, + "step": 1875 + }, + { + "epoch": 0.4516944561487991, + "grad_norm": 2.049633741378784, + "learning_rate": 0.00012047835402405887, + "loss": 0.7888, + "step": 1876 + }, + { + "epoch": 0.4519352314452537, + "grad_norm": 1.6313300132751465, + "learning_rate": 0.00012040202579650648, + "loss": 0.5099, + "step": 1877 + }, + { + "epoch": 0.4521760067417083, + "grad_norm": 2.1740105152130127, + "learning_rate": 0.00012032568516447645, + "loss": 0.5025, + "step": 1878 + }, + { + "epoch": 0.4524167820381629, + "grad_norm": 0.8934720754623413, + "learning_rate": 0.00012024933217438403, + "loss": 0.3097, + "step": 1879 + }, + { + "epoch": 0.4526575573346175, + "grad_norm": 3.051832675933838, + "learning_rate": 0.00012017296687265201, + "loss": 0.5882, + "step": 1880 + }, + { + "epoch": 0.45289833263107204, + "grad_norm": 0.8491730093955994, + "learning_rate": 0.00012009658930571069, + "loss": 0.3899, + "step": 1881 + }, + { + "epoch": 0.45313910792752665, + "grad_norm": 1.7300267219543457, + "learning_rate": 0.0001200201995199978, + "loss": 0.6787, + "step": 1882 + }, + { + "epoch": 0.4533798832239812, + "grad_norm": 1.650277853012085, + "learning_rate": 0.00011994379756195852, + "loss": 0.6297, + "step": 1883 + }, + { + "epoch": 0.4536206585204358, + "grad_norm": 1.1548956632614136, + "learning_rate": 0.00011986738347804536, + "loss": 0.519, + "step": 1884 + }, + { + "epoch": 0.45386143381689037, + "grad_norm": 1.0197851657867432, + "learning_rate": 0.0001197909573147183, + "loss": 0.6041, + "step": 1885 + }, + { + "epoch": 0.454102209113345, + "grad_norm": 1.3757448196411133, + "learning_rate": 0.00011971451911844457, + "loss": 0.6228, + "step": 1886 + }, + { + "epoch": 0.45434298440979953, + "grad_norm": 3.988311767578125, + "learning_rate": 0.00011963806893569885, + "loss": 0.5685, + "step": 1887 + }, + { + "epoch": 0.45458375970625414, + "grad_norm": 1.5994011163711548, + "learning_rate": 0.00011956160681296293, + "loss": 0.5188, + "step": 1888 + }, + { + "epoch": 0.4548245350027087, + "grad_norm": 1.7165995836257935, + "learning_rate": 0.00011948513279672602, + "loss": 0.8819, + "step": 1889 + }, + { + "epoch": 0.4550653102991633, + "grad_norm": 1.711625337600708, + "learning_rate": 0.00011940864693348444, + "loss": 0.6248, + "step": 1890 + }, + { + "epoch": 0.4553060855956179, + "grad_norm": 3.749361038208008, + "learning_rate": 0.00011933214926974183, + "loss": 0.6371, + "step": 1891 + }, + { + "epoch": 0.45554686089207247, + "grad_norm": 0.7839668393135071, + "learning_rate": 0.00011925563985200887, + "loss": 0.3796, + "step": 1892 + }, + { + "epoch": 0.4557876361885271, + "grad_norm": 1.5670320987701416, + "learning_rate": 0.00011917911872680354, + "loss": 0.3806, + "step": 1893 + }, + { + "epoch": 0.45602841148498163, + "grad_norm": 1.0422892570495605, + "learning_rate": 0.00011910258594065078, + "loss": 0.0708, + "step": 1894 + }, + { + "epoch": 0.45626918678143624, + "grad_norm": 3.335632562637329, + "learning_rate": 0.00011902604154008274, + "loss": 0.5238, + "step": 1895 + }, + { + "epoch": 0.4565099620778908, + "grad_norm": 4.482006072998047, + "learning_rate": 0.00011894948557163859, + "loss": 0.7926, + "step": 1896 + }, + { + "epoch": 0.4567507373743454, + "grad_norm": 2.2744340896606445, + "learning_rate": 0.00011887291808186452, + "loss": 1.2551, + "step": 1897 + }, + { + "epoch": 0.45699151267079996, + "grad_norm": 1.9892547130584717, + "learning_rate": 0.00011879633911731372, + "loss": 0.6706, + "step": 1898 + }, + { + "epoch": 0.45723228796725457, + "grad_norm": 0.8157358169555664, + "learning_rate": 0.00011871974872454639, + "loss": 0.2129, + "step": 1899 + }, + { + "epoch": 0.4574730632637091, + "grad_norm": 0.6796861886978149, + "learning_rate": 0.00011864314695012963, + "loss": 0.5986, + "step": 1900 + }, + { + "epoch": 0.45771383856016373, + "grad_norm": 1.5171664953231812, + "learning_rate": 0.00011856653384063756, + "loss": 0.4835, + "step": 1901 + }, + { + "epoch": 0.4579546138566183, + "grad_norm": 1.7098067998886108, + "learning_rate": 0.00011848990944265111, + "loss": 0.4977, + "step": 1902 + }, + { + "epoch": 0.4581953891530729, + "grad_norm": 1.370509386062622, + "learning_rate": 0.00011841327380275799, + "loss": 0.7172, + "step": 1903 + }, + { + "epoch": 0.4584361644495275, + "grad_norm": 3.1343603134155273, + "learning_rate": 0.00011833662696755295, + "loss": 0.6409, + "step": 1904 + }, + { + "epoch": 0.45867693974598206, + "grad_norm": 2.920408010482788, + "learning_rate": 0.00011825996898363741, + "loss": 0.496, + "step": 1905 + }, + { + "epoch": 0.45891771504243667, + "grad_norm": 1.1497353315353394, + "learning_rate": 0.00011818329989761959, + "loss": 0.4073, + "step": 1906 + }, + { + "epoch": 0.4591584903388912, + "grad_norm": 1.3592454195022583, + "learning_rate": 0.00011810661975611444, + "loss": 0.7055, + "step": 1907 + }, + { + "epoch": 0.45939926563534583, + "grad_norm": 4.271294593811035, + "learning_rate": 0.0001180299286057437, + "loss": 0.6676, + "step": 1908 + }, + { + "epoch": 0.4596400409318004, + "grad_norm": 3.51015567779541, + "learning_rate": 0.00011795322649313574, + "loss": 0.7051, + "step": 1909 + }, + { + "epoch": 0.459880816228255, + "grad_norm": 0.6392609477043152, + "learning_rate": 0.00011787651346492561, + "loss": 0.2079, + "step": 1910 + }, + { + "epoch": 0.46012159152470955, + "grad_norm": 5.496769905090332, + "learning_rate": 0.00011779978956775506, + "loss": 0.6687, + "step": 1911 + }, + { + "epoch": 0.46036236682116416, + "grad_norm": 2.1029446125030518, + "learning_rate": 0.00011772305484827231, + "loss": 1.0162, + "step": 1912 + }, + { + "epoch": 0.4606031421176187, + "grad_norm": 3.9741029739379883, + "learning_rate": 0.00011764630935313228, + "loss": 1.0211, + "step": 1913 + }, + { + "epoch": 0.4608439174140733, + "grad_norm": 3.1617109775543213, + "learning_rate": 0.00011756955312899642, + "loss": 1.1738, + "step": 1914 + }, + { + "epoch": 0.4610846927105279, + "grad_norm": 0.6556163430213928, + "learning_rate": 0.00011749278622253268, + "loss": 0.037, + "step": 1915 + }, + { + "epoch": 0.4613254680069825, + "grad_norm": 3.8767170906066895, + "learning_rate": 0.00011741600868041549, + "loss": 0.8335, + "step": 1916 + }, + { + "epoch": 0.46156624330343704, + "grad_norm": 2.683124542236328, + "learning_rate": 0.00011733922054932577, + "loss": 1.3765, + "step": 1917 + }, + { + "epoch": 0.46180701859989165, + "grad_norm": 1.8836538791656494, + "learning_rate": 0.00011726242187595091, + "loss": 0.2703, + "step": 1918 + }, + { + "epoch": 0.46204779389634626, + "grad_norm": 1.551708698272705, + "learning_rate": 0.00011718561270698467, + "loss": 0.1608, + "step": 1919 + }, + { + "epoch": 0.4622885691928008, + "grad_norm": 2.271167278289795, + "learning_rate": 0.00011710879308912717, + "loss": 0.5994, + "step": 1920 + }, + { + "epoch": 0.4625293444892554, + "grad_norm": 2.433912992477417, + "learning_rate": 0.0001170319630690849, + "loss": 0.756, + "step": 1921 + }, + { + "epoch": 0.46277011978571, + "grad_norm": 0.924586296081543, + "learning_rate": 0.00011695512269357076, + "loss": 0.655, + "step": 1922 + }, + { + "epoch": 0.4630108950821646, + "grad_norm": 2.5200753211975098, + "learning_rate": 0.00011687827200930381, + "loss": 0.5851, + "step": 1923 + }, + { + "epoch": 0.46325167037861914, + "grad_norm": 1.0272419452667236, + "learning_rate": 0.00011680141106300943, + "loss": 0.3672, + "step": 1924 + }, + { + "epoch": 0.46349244567507375, + "grad_norm": 5.197723865509033, + "learning_rate": 0.00011672453990141927, + "loss": 0.8146, + "step": 1925 + }, + { + "epoch": 0.4637332209715283, + "grad_norm": 1.0704439878463745, + "learning_rate": 0.00011664765857127118, + "loss": 0.1931, + "step": 1926 + }, + { + "epoch": 0.4639739962679829, + "grad_norm": 2.9581477642059326, + "learning_rate": 0.00011657076711930919, + "loss": 0.9693, + "step": 1927 + }, + { + "epoch": 0.46421477156443747, + "grad_norm": 3.3531832695007324, + "learning_rate": 0.00011649386559228341, + "loss": 0.6575, + "step": 1928 + }, + { + "epoch": 0.4644555468608921, + "grad_norm": 1.1153233051300049, + "learning_rate": 0.00011641695403695021, + "loss": 0.3665, + "step": 1929 + }, + { + "epoch": 0.46469632215734663, + "grad_norm": 1.21510910987854, + "learning_rate": 0.000116340032500072, + "loss": 0.4691, + "step": 1930 + }, + { + "epoch": 0.46493709745380124, + "grad_norm": 1.5340150594711304, + "learning_rate": 0.00011626310102841718, + "loss": 0.5084, + "step": 1931 + }, + { + "epoch": 0.46517787275025585, + "grad_norm": 1.7374811172485352, + "learning_rate": 0.0001161861596687603, + "loss": 0.5055, + "step": 1932 + }, + { + "epoch": 0.4654186480467104, + "grad_norm": 1.4668828248977661, + "learning_rate": 0.00011610920846788184, + "loss": 0.7268, + "step": 1933 + }, + { + "epoch": 0.465659423343165, + "grad_norm": 2.211509943008423, + "learning_rate": 0.0001160322474725684, + "loss": 1.0503, + "step": 1934 + }, + { + "epoch": 0.46590019863961957, + "grad_norm": 2.9162306785583496, + "learning_rate": 0.00011595527672961235, + "loss": 0.987, + "step": 1935 + }, + { + "epoch": 0.4661409739360742, + "grad_norm": 0.8210351467132568, + "learning_rate": 0.00011587829628581213, + "loss": 0.2187, + "step": 1936 + }, + { + "epoch": 0.46638174923252873, + "grad_norm": 2.2448573112487793, + "learning_rate": 0.00011580130618797193, + "loss": 0.7065, + "step": 1937 + }, + { + "epoch": 0.46662252452898334, + "grad_norm": 2.9925882816314697, + "learning_rate": 0.000115724306482902, + "loss": 0.9504, + "step": 1938 + }, + { + "epoch": 0.4668632998254379, + "grad_norm": 4.324154376983643, + "learning_rate": 0.00011564729721741829, + "loss": 0.8914, + "step": 1939 + }, + { + "epoch": 0.4671040751218925, + "grad_norm": 0.7890626192092896, + "learning_rate": 0.00011557027843834265, + "loss": 0.4613, + "step": 1940 + }, + { + "epoch": 0.46734485041834706, + "grad_norm": 0.8351976275444031, + "learning_rate": 0.00011549325019250261, + "loss": 0.262, + "step": 1941 + }, + { + "epoch": 0.46758562571480167, + "grad_norm": 5.956714153289795, + "learning_rate": 0.00011541621252673153, + "loss": 0.5128, + "step": 1942 + }, + { + "epoch": 0.4678264010112562, + "grad_norm": 1.704748511314392, + "learning_rate": 0.00011533916548786857, + "loss": 0.5645, + "step": 1943 + }, + { + "epoch": 0.46806717630771083, + "grad_norm": 2.2155847549438477, + "learning_rate": 0.00011526210912275836, + "loss": 0.7139, + "step": 1944 + }, + { + "epoch": 0.46830795160416544, + "grad_norm": 3.7036075592041016, + "learning_rate": 0.00011518504347825145, + "loss": 0.6394, + "step": 1945 + }, + { + "epoch": 0.46854872690062, + "grad_norm": 1.530531406402588, + "learning_rate": 0.00011510796860120388, + "loss": 0.8155, + "step": 1946 + }, + { + "epoch": 0.4687895021970746, + "grad_norm": 3.814300298690796, + "learning_rate": 0.00011503088453847739, + "loss": 0.9626, + "step": 1947 + }, + { + "epoch": 0.46903027749352916, + "grad_norm": 2.3494253158569336, + "learning_rate": 0.00011495379133693922, + "loss": 0.3687, + "step": 1948 + }, + { + "epoch": 0.46927105278998377, + "grad_norm": 2.5800893306732178, + "learning_rate": 0.00011487668904346221, + "loss": 0.8505, + "step": 1949 + }, + { + "epoch": 0.4695118280864383, + "grad_norm": 1.1892086267471313, + "learning_rate": 0.00011479957770492476, + "loss": 0.8398, + "step": 1950 + }, + { + "epoch": 0.46975260338289293, + "grad_norm": 4.8080034255981445, + "learning_rate": 0.00011472245736821072, + "loss": 0.8072, + "step": 1951 + }, + { + "epoch": 0.4699933786793475, + "grad_norm": 4.775472164154053, + "learning_rate": 0.00011464532808020943, + "loss": 0.8344, + "step": 1952 + }, + { + "epoch": 0.4702341539758021, + "grad_norm": 4.2183966636657715, + "learning_rate": 0.00011456818988781565, + "loss": 0.2391, + "step": 1953 + }, + { + "epoch": 0.47047492927225665, + "grad_norm": 0.8757205605506897, + "learning_rate": 0.00011449104283792964, + "loss": 0.6855, + "step": 1954 + }, + { + "epoch": 0.47071570456871126, + "grad_norm": 4.9031524658203125, + "learning_rate": 0.0001144138869774569, + "loss": 0.7411, + "step": 1955 + }, + { + "epoch": 0.4709564798651658, + "grad_norm": 3.9868388175964355, + "learning_rate": 0.0001143367223533084, + "loss": 1.3733, + "step": 1956 + }, + { + "epoch": 0.4711972551616204, + "grad_norm": 2.6897597312927246, + "learning_rate": 0.0001142595490124004, + "loss": 1.1645, + "step": 1957 + }, + { + "epoch": 0.471438030458075, + "grad_norm": 0.6126354932785034, + "learning_rate": 0.00011418236700165452, + "loss": 0.4618, + "step": 1958 + }, + { + "epoch": 0.4716788057545296, + "grad_norm": 0.5356245040893555, + "learning_rate": 0.00011410517636799751, + "loss": 0.805, + "step": 1959 + }, + { + "epoch": 0.4719195810509842, + "grad_norm": 0.8628101944923401, + "learning_rate": 0.00011402797715836153, + "loss": 0.291, + "step": 1960 + }, + { + "epoch": 0.47216035634743875, + "grad_norm": 1.2963393926620483, + "learning_rate": 0.00011395076941968379, + "loss": 0.7377, + "step": 1961 + }, + { + "epoch": 0.47240113164389336, + "grad_norm": 1.1663508415222168, + "learning_rate": 0.00011387355319890685, + "loss": 0.1149, + "step": 1962 + }, + { + "epoch": 0.4726419069403479, + "grad_norm": 1.1222305297851562, + "learning_rate": 0.00011379632854297828, + "loss": 0.8273, + "step": 1963 + }, + { + "epoch": 0.4728826822368025, + "grad_norm": 1.7846665382385254, + "learning_rate": 0.00011371909549885087, + "loss": 0.5701, + "step": 1964 + }, + { + "epoch": 0.4731234575332571, + "grad_norm": 2.4753174781799316, + "learning_rate": 0.00011364185411348247, + "loss": 0.6405, + "step": 1965 + }, + { + "epoch": 0.4733642328297117, + "grad_norm": 2.678506374359131, + "learning_rate": 0.00011356460443383607, + "loss": 0.5651, + "step": 1966 + }, + { + "epoch": 0.47360500812616624, + "grad_norm": 2.547746181488037, + "learning_rate": 0.00011348734650687962, + "loss": 0.4664, + "step": 1967 + }, + { + "epoch": 0.47384578342262085, + "grad_norm": 2.901313304901123, + "learning_rate": 0.00011341008037958607, + "loss": 0.2748, + "step": 1968 + }, + { + "epoch": 0.4740865587190754, + "grad_norm": 2.7114925384521484, + "learning_rate": 0.00011333280609893344, + "loss": 0.7454, + "step": 1969 + }, + { + "epoch": 0.47432733401553, + "grad_norm": 2.3827106952667236, + "learning_rate": 0.0001132555237119047, + "loss": 1.0865, + "step": 1970 + }, + { + "epoch": 0.47456810931198457, + "grad_norm": 1.0428249835968018, + "learning_rate": 0.00011317823326548765, + "loss": 0.6484, + "step": 1971 + }, + { + "epoch": 0.4748088846084392, + "grad_norm": 2.74362850189209, + "learning_rate": 0.00011310093480667507, + "loss": 0.4957, + "step": 1972 + }, + { + "epoch": 0.4750496599048938, + "grad_norm": 2.0484142303466797, + "learning_rate": 0.00011302362838246463, + "loss": 0.5256, + "step": 1973 + }, + { + "epoch": 0.47529043520134834, + "grad_norm": 2.8634374141693115, + "learning_rate": 0.0001129463140398588, + "loss": 0.4557, + "step": 1974 + }, + { + "epoch": 0.47553121049780295, + "grad_norm": 2.5055246353149414, + "learning_rate": 0.00011286899182586485, + "loss": 0.9222, + "step": 1975 + }, + { + "epoch": 0.4757719857942575, + "grad_norm": 1.0836631059646606, + "learning_rate": 0.00011279166178749489, + "loss": 0.3692, + "step": 1976 + }, + { + "epoch": 0.4760127610907121, + "grad_norm": 2.4149179458618164, + "learning_rate": 0.0001127143239717657, + "loss": 0.675, + "step": 1977 + }, + { + "epoch": 0.47625353638716666, + "grad_norm": 1.899614930152893, + "learning_rate": 0.00011263697842569894, + "loss": 0.988, + "step": 1978 + }, + { + "epoch": 0.4764943116836213, + "grad_norm": 3.755749464035034, + "learning_rate": 0.00011255962519632081, + "loss": 0.8779, + "step": 1979 + }, + { + "epoch": 0.47673508698007583, + "grad_norm": 2.487436056137085, + "learning_rate": 0.0001124822643306623, + "loss": 0.4025, + "step": 1980 + }, + { + "epoch": 0.47697586227653044, + "grad_norm": 0.9542964100837708, + "learning_rate": 0.00011240489587575889, + "loss": 0.5613, + "step": 1981 + }, + { + "epoch": 0.477216637572985, + "grad_norm": 1.5301231145858765, + "learning_rate": 0.00011232751987865084, + "loss": 0.2873, + "step": 1982 + }, + { + "epoch": 0.4774574128694396, + "grad_norm": 1.3803631067276, + "learning_rate": 0.00011225013638638297, + "loss": 0.2015, + "step": 1983 + }, + { + "epoch": 0.47769818816589416, + "grad_norm": 2.5215346813201904, + "learning_rate": 0.00011217274544600458, + "loss": 0.7079, + "step": 1984 + }, + { + "epoch": 0.47793896346234876, + "grad_norm": 0.8680809736251831, + "learning_rate": 0.00011209534710456951, + "loss": 0.8823, + "step": 1985 + }, + { + "epoch": 0.4781797387588034, + "grad_norm": 1.0122793912887573, + "learning_rate": 0.00011201794140913613, + "loss": 0.3876, + "step": 1986 + }, + { + "epoch": 0.47842051405525793, + "grad_norm": 2.795023202896118, + "learning_rate": 0.00011194052840676735, + "loss": 0.9525, + "step": 1987 + }, + { + "epoch": 0.47866128935171254, + "grad_norm": 4.1220784187316895, + "learning_rate": 0.00011186310814453035, + "loss": 0.8727, + "step": 1988 + }, + { + "epoch": 0.4789020646481671, + "grad_norm": 0.5881559252738953, + "learning_rate": 0.00011178568066949688, + "loss": 0.1987, + "step": 1989 + }, + { + "epoch": 0.4791428399446217, + "grad_norm": 2.1437673568725586, + "learning_rate": 0.00011170824602874301, + "loss": 0.7281, + "step": 1990 + }, + { + "epoch": 0.47938361524107626, + "grad_norm": 2.513075113296509, + "learning_rate": 0.0001116308042693492, + "loss": 0.6271, + "step": 1991 + }, + { + "epoch": 0.47962439053753086, + "grad_norm": 2.7916321754455566, + "learning_rate": 0.00011155335543840017, + "loss": 0.4875, + "step": 1992 + }, + { + "epoch": 0.4798651658339854, + "grad_norm": 1.7488362789154053, + "learning_rate": 0.000111475899582985, + "loss": 0.4013, + "step": 1993 + }, + { + "epoch": 0.48010594113044003, + "grad_norm": 8.35679817199707, + "learning_rate": 0.00011139843675019704, + "loss": 0.7598, + "step": 1994 + }, + { + "epoch": 0.4803467164268946, + "grad_norm": 1.7272447347640991, + "learning_rate": 0.00011132096698713385, + "loss": 0.4311, + "step": 1995 + }, + { + "epoch": 0.4805874917233492, + "grad_norm": 2.1946487426757812, + "learning_rate": 0.00011124349034089723, + "loss": 0.6132, + "step": 1996 + }, + { + "epoch": 0.48082826701980375, + "grad_norm": 1.6528022289276123, + "learning_rate": 0.00011116600685859313, + "loss": 0.738, + "step": 1997 + }, + { + "epoch": 0.48106904231625836, + "grad_norm": 2.6232638359069824, + "learning_rate": 0.0001110885165873317, + "loss": 0.7694, + "step": 1998 + }, + { + "epoch": 0.4813098176127129, + "grad_norm": 1.2000987529754639, + "learning_rate": 0.00011101101957422723, + "loss": 0.4693, + "step": 1999 + }, + { + "epoch": 0.4815505929091675, + "grad_norm": 1.4360319375991821, + "learning_rate": 0.00011093351586639806, + "loss": 0.7783, + "step": 2000 + }, + { + "epoch": 0.48179136820562213, + "grad_norm": 1.4586645364761353, + "learning_rate": 0.00011085600551096657, + "loss": 0.7863, + "step": 2001 + }, + { + "epoch": 0.4820321435020767, + "grad_norm": 1.3981388807296753, + "learning_rate": 0.0001107784885550593, + "loss": 0.7304, + "step": 2002 + }, + { + "epoch": 0.4822729187985313, + "grad_norm": 2.226198196411133, + "learning_rate": 0.00011070096504580669, + "loss": 0.5331, + "step": 2003 + }, + { + "epoch": 0.48251369409498585, + "grad_norm": 1.673223614692688, + "learning_rate": 0.00011062343503034325, + "loss": 0.5965, + "step": 2004 + }, + { + "epoch": 0.48275446939144045, + "grad_norm": 3.5956525802612305, + "learning_rate": 0.00011054589855580732, + "loss": 0.9231, + "step": 2005 + }, + { + "epoch": 0.482995244687895, + "grad_norm": 2.030714273452759, + "learning_rate": 0.00011046835566934138, + "loss": 0.71, + "step": 2006 + }, + { + "epoch": 0.4832360199843496, + "grad_norm": 6.168741226196289, + "learning_rate": 0.00011039080641809154, + "loss": 0.9501, + "step": 2007 + }, + { + "epoch": 0.4834767952808042, + "grad_norm": 2.15983510017395, + "learning_rate": 0.00011031325084920802, + "loss": 1.0474, + "step": 2008 + }, + { + "epoch": 0.4837175705772588, + "grad_norm": 3.2638587951660156, + "learning_rate": 0.00011023568900984473, + "loss": 0.4585, + "step": 2009 + }, + { + "epoch": 0.48395834587371334, + "grad_norm": 0.8049036264419556, + "learning_rate": 0.0001101581209471595, + "loss": 0.3537, + "step": 2010 + }, + { + "epoch": 0.48419912117016795, + "grad_norm": 2.6150450706481934, + "learning_rate": 0.00011008054670831381, + "loss": 0.4149, + "step": 2011 + }, + { + "epoch": 0.4844398964666225, + "grad_norm": 1.5464622974395752, + "learning_rate": 0.00011000296634047302, + "loss": 0.1812, + "step": 2012 + }, + { + "epoch": 0.4846806717630771, + "grad_norm": 1.843767762184143, + "learning_rate": 0.00010992537989080618, + "loss": 0.3838, + "step": 2013 + }, + { + "epoch": 0.4849214470595317, + "grad_norm": 1.0260145664215088, + "learning_rate": 0.00010984778740648598, + "loss": 0.2033, + "step": 2014 + }, + { + "epoch": 0.4851622223559863, + "grad_norm": 1.7894840240478516, + "learning_rate": 0.00010977018893468884, + "loss": 0.5762, + "step": 2015 + }, + { + "epoch": 0.4854029976524409, + "grad_norm": 2.454301118850708, + "learning_rate": 0.00010969258452259483, + "loss": 0.8953, + "step": 2016 + }, + { + "epoch": 0.48564377294889544, + "grad_norm": 0.5999788045883179, + "learning_rate": 0.0001096149742173876, + "loss": 0.4977, + "step": 2017 + }, + { + "epoch": 0.48588454824535005, + "grad_norm": 2.6491451263427734, + "learning_rate": 0.00010953735806625439, + "loss": 1.0362, + "step": 2018 + }, + { + "epoch": 0.4861253235418046, + "grad_norm": 2.1559669971466064, + "learning_rate": 0.00010945973611638596, + "loss": 0.4835, + "step": 2019 + }, + { + "epoch": 0.4863660988382592, + "grad_norm": 4.336763381958008, + "learning_rate": 0.00010938210841497667, + "loss": 1.0278, + "step": 2020 + }, + { + "epoch": 0.48660687413471376, + "grad_norm": 0.9082402586936951, + "learning_rate": 0.00010930447500922433, + "loss": 0.7064, + "step": 2021 + }, + { + "epoch": 0.48684764943116837, + "grad_norm": 11.172735214233398, + "learning_rate": 0.00010922683594633021, + "loss": 0.9112, + "step": 2022 + }, + { + "epoch": 0.4870884247276229, + "grad_norm": 1.7960487604141235, + "learning_rate": 0.00010914919127349906, + "loss": 0.4387, + "step": 2023 + }, + { + "epoch": 0.48732920002407754, + "grad_norm": 2.4477851390838623, + "learning_rate": 0.00010907154103793899, + "loss": 0.2548, + "step": 2024 + }, + { + "epoch": 0.4875699753205321, + "grad_norm": 1.2202852964401245, + "learning_rate": 0.00010899388528686154, + "loss": 0.4231, + "step": 2025 + }, + { + "epoch": 0.4878107506169867, + "grad_norm": 2.1632204055786133, + "learning_rate": 0.00010891622406748157, + "loss": 0.5211, + "step": 2026 + }, + { + "epoch": 0.4880515259134413, + "grad_norm": 2.416361093521118, + "learning_rate": 0.00010883855742701727, + "loss": 0.5395, + "step": 2027 + }, + { + "epoch": 0.48829230120989586, + "grad_norm": 3.4709837436676025, + "learning_rate": 0.00010876088541269014, + "loss": 0.959, + "step": 2028 + }, + { + "epoch": 0.48853307650635047, + "grad_norm": 4.083737373352051, + "learning_rate": 0.00010868320807172496, + "loss": 0.4737, + "step": 2029 + }, + { + "epoch": 0.488773851802805, + "grad_norm": 2.2041704654693604, + "learning_rate": 0.0001086055254513497, + "loss": 0.7522, + "step": 2030 + }, + { + "epoch": 0.48901462709925964, + "grad_norm": 1.5947551727294922, + "learning_rate": 0.00010852783759879557, + "loss": 0.1179, + "step": 2031 + }, + { + "epoch": 0.4892554023957142, + "grad_norm": 2.6516928672790527, + "learning_rate": 0.00010845014456129698, + "loss": 0.9625, + "step": 2032 + }, + { + "epoch": 0.4894961776921688, + "grad_norm": 3.6693668365478516, + "learning_rate": 0.00010837244638609145, + "loss": 0.4759, + "step": 2033 + }, + { + "epoch": 0.48973695298862335, + "grad_norm": 5.227980613708496, + "learning_rate": 0.00010829474312041963, + "loss": 0.66, + "step": 2034 + }, + { + "epoch": 0.48997772828507796, + "grad_norm": 1.135461688041687, + "learning_rate": 0.00010821703481152534, + "loss": 0.3187, + "step": 2035 + }, + { + "epoch": 0.4902185035815325, + "grad_norm": 0.8220135569572449, + "learning_rate": 0.00010813932150665538, + "loss": 0.4416, + "step": 2036 + }, + { + "epoch": 0.4904592788779871, + "grad_norm": 2.479522943496704, + "learning_rate": 0.00010806160325305956, + "loss": 0.6935, + "step": 2037 + }, + { + "epoch": 0.4907000541744417, + "grad_norm": 3.262054920196533, + "learning_rate": 0.00010798388009799084, + "loss": 1.5196, + "step": 2038 + }, + { + "epoch": 0.4909408294708963, + "grad_norm": 3.85654354095459, + "learning_rate": 0.000107906152088705, + "loss": 0.3237, + "step": 2039 + }, + { + "epoch": 0.49118160476735084, + "grad_norm": 2.709144353866577, + "learning_rate": 0.0001078284192724609, + "loss": 0.806, + "step": 2040 + }, + { + "epoch": 0.49142238006380545, + "grad_norm": 4.338006019592285, + "learning_rate": 0.00010775068169652023, + "loss": 0.6148, + "step": 2041 + }, + { + "epoch": 0.49166315536026006, + "grad_norm": 1.0794256925582886, + "learning_rate": 0.00010767293940814762, + "loss": 0.3614, + "step": 2042 + }, + { + "epoch": 0.4919039306567146, + "grad_norm": 0.8536688089370728, + "learning_rate": 0.0001075951924546106, + "loss": 0.3124, + "step": 2043 + }, + { + "epoch": 0.4921447059531692, + "grad_norm": 2.3540027141571045, + "learning_rate": 0.00010751744088317943, + "loss": 0.8265, + "step": 2044 + }, + { + "epoch": 0.4923854812496238, + "grad_norm": 1.918283462524414, + "learning_rate": 0.00010743968474112728, + "loss": 0.8652, + "step": 2045 + }, + { + "epoch": 0.4926262565460784, + "grad_norm": 2.034250497817993, + "learning_rate": 0.00010736192407573, + "loss": 0.8617, + "step": 2046 + }, + { + "epoch": 0.49286703184253294, + "grad_norm": 3.235872268676758, + "learning_rate": 0.00010728415893426635, + "loss": 0.4167, + "step": 2047 + }, + { + "epoch": 0.49310780713898755, + "grad_norm": 1.3588740825653076, + "learning_rate": 0.00010720638936401766, + "loss": 0.8502, + "step": 2048 + }, + { + "epoch": 0.4933485824354421, + "grad_norm": 2.3015613555908203, + "learning_rate": 0.00010712861541226797, + "loss": 0.8856, + "step": 2049 + }, + { + "epoch": 0.4935893577318967, + "grad_norm": 3.480872631072998, + "learning_rate": 0.00010705083712630401, + "loss": 0.9697, + "step": 2050 + }, + { + "epoch": 0.49383013302835127, + "grad_norm": 8.409546852111816, + "learning_rate": 0.00010697305455341526, + "loss": 0.3575, + "step": 2051 + }, + { + "epoch": 0.4940709083248059, + "grad_norm": 1.3223494291305542, + "learning_rate": 0.00010689526774089362, + "loss": 0.3494, + "step": 2052 + }, + { + "epoch": 0.49431168362126043, + "grad_norm": 1.0988234281539917, + "learning_rate": 0.00010681747673603366, + "loss": 0.2256, + "step": 2053 + }, + { + "epoch": 0.49455245891771504, + "grad_norm": 1.517215609550476, + "learning_rate": 0.00010673968158613243, + "loss": 0.5634, + "step": 2054 + }, + { + "epoch": 0.49479323421416965, + "grad_norm": 3.4470624923706055, + "learning_rate": 0.00010666188233848967, + "loss": 0.5364, + "step": 2055 + }, + { + "epoch": 0.4950340095106242, + "grad_norm": 2.27813720703125, + "learning_rate": 0.00010658407904040743, + "loss": 0.7642, + "step": 2056 + }, + { + "epoch": 0.4952747848070788, + "grad_norm": 0.9174807667732239, + "learning_rate": 0.0001065062717391903, + "loss": 0.4644, + "step": 2057 + }, + { + "epoch": 0.49551556010353337, + "grad_norm": 1.2668373584747314, + "learning_rate": 0.00010642846048214527, + "loss": 0.3014, + "step": 2058 + }, + { + "epoch": 0.495756335399988, + "grad_norm": 3.347287893295288, + "learning_rate": 0.00010635064531658178, + "loss": 0.283, + "step": 2059 + }, + { + "epoch": 0.49599711069644253, + "grad_norm": 0.6961964964866638, + "learning_rate": 0.00010627282628981165, + "loss": 0.2452, + "step": 2060 + }, + { + "epoch": 0.49623788599289714, + "grad_norm": 4.018993377685547, + "learning_rate": 0.00010619500344914902, + "loss": 0.7302, + "step": 2061 + }, + { + "epoch": 0.4964786612893517, + "grad_norm": 1.290248990058899, + "learning_rate": 0.0001061171768419103, + "loss": 0.6912, + "step": 2062 + }, + { + "epoch": 0.4967194365858063, + "grad_norm": 2.4808475971221924, + "learning_rate": 0.00010603934651541427, + "loss": 0.2996, + "step": 2063 + }, + { + "epoch": 0.49696021188226086, + "grad_norm": 2.5349011421203613, + "learning_rate": 0.00010596151251698199, + "loss": 0.5265, + "step": 2064 + }, + { + "epoch": 0.49720098717871547, + "grad_norm": 2.832211494445801, + "learning_rate": 0.00010588367489393666, + "loss": 0.9041, + "step": 2065 + }, + { + "epoch": 0.49744176247517, + "grad_norm": 1.3861429691314697, + "learning_rate": 0.00010580583369360373, + "loss": 0.5222, + "step": 2066 + }, + { + "epoch": 0.49768253777162463, + "grad_norm": 1.224226951599121, + "learning_rate": 0.00010572798896331082, + "loss": 0.5713, + "step": 2067 + }, + { + "epoch": 0.4979233130680792, + "grad_norm": 2.4965927600860596, + "learning_rate": 0.00010565014075038775, + "loss": 0.3679, + "step": 2068 + }, + { + "epoch": 0.4981640883645338, + "grad_norm": 2.0286030769348145, + "learning_rate": 0.00010557228910216637, + "loss": 0.3128, + "step": 2069 + }, + { + "epoch": 0.4984048636609884, + "grad_norm": 1.7408385276794434, + "learning_rate": 0.00010549443406598063, + "loss": 0.6847, + "step": 2070 + }, + { + "epoch": 0.49864563895744296, + "grad_norm": 2.918757915496826, + "learning_rate": 0.00010541657568916661, + "loss": 0.4012, + "step": 2071 + }, + { + "epoch": 0.49888641425389757, + "grad_norm": 0.9126492142677307, + "learning_rate": 0.00010533871401906237, + "loss": 0.3021, + "step": 2072 + }, + { + "epoch": 0.4991271895503521, + "grad_norm": 1.8159611225128174, + "learning_rate": 0.00010526084910300798, + "loss": 0.5893, + "step": 2073 + }, + { + "epoch": 0.49936796484680673, + "grad_norm": 1.3606966733932495, + "learning_rate": 0.00010518298098834547, + "loss": 0.4645, + "step": 2074 + }, + { + "epoch": 0.4996087401432613, + "grad_norm": 3.8433918952941895, + "learning_rate": 0.00010510510972241887, + "loss": 0.3448, + "step": 2075 + }, + { + "epoch": 0.4998495154397159, + "grad_norm": 2.995986223220825, + "learning_rate": 0.00010502723535257401, + "loss": 0.6148, + "step": 2076 + }, + { + "epoch": 0.5000902907361705, + "grad_norm": 2.552739381790161, + "learning_rate": 0.00010494935792615879, + "loss": 0.5938, + "step": 2077 + }, + { + "epoch": 0.500331066032625, + "grad_norm": 2.2203798294067383, + "learning_rate": 0.00010487147749052275, + "loss": 0.8364, + "step": 2078 + }, + { + "epoch": 0.5005718413290796, + "grad_norm": 1.519313097000122, + "learning_rate": 0.00010479359409301745, + "loss": 0.6105, + "step": 2079 + }, + { + "epoch": 0.5008126166255342, + "grad_norm": 1.471633791923523, + "learning_rate": 0.00010471570778099611, + "loss": 0.8271, + "step": 2080 + }, + { + "epoch": 0.5010533919219888, + "grad_norm": 3.146540880203247, + "learning_rate": 0.00010463781860181385, + "loss": 0.6785, + "step": 2081 + }, + { + "epoch": 0.5012941672184434, + "grad_norm": 2.4692275524139404, + "learning_rate": 0.00010455992660282741, + "loss": 0.8448, + "step": 2082 + }, + { + "epoch": 0.5015349425148979, + "grad_norm": 2.2308695316314697, + "learning_rate": 0.00010448203183139533, + "loss": 0.72, + "step": 2083 + }, + { + "epoch": 0.5017757178113526, + "grad_norm": 0.7109373807907104, + "learning_rate": 0.00010440413433487781, + "loss": 0.1728, + "step": 2084 + }, + { + "epoch": 0.5020164931078072, + "grad_norm": 5.810349464416504, + "learning_rate": 0.00010432623416063667, + "loss": 1.3146, + "step": 2085 + }, + { + "epoch": 0.5022572684042618, + "grad_norm": 1.7806396484375, + "learning_rate": 0.0001042483313560354, + "loss": 0.224, + "step": 2086 + }, + { + "epoch": 0.5024980437007163, + "grad_norm": 4.550583362579346, + "learning_rate": 0.00010417042596843914, + "loss": 0.5014, + "step": 2087 + }, + { + "epoch": 0.5027388189971709, + "grad_norm": 0.9690256118774414, + "learning_rate": 0.00010409251804521447, + "loss": 0.2506, + "step": 2088 + }, + { + "epoch": 0.5029795942936255, + "grad_norm": 1.3459006547927856, + "learning_rate": 0.00010401460763372961, + "loss": 0.212, + "step": 2089 + }, + { + "epoch": 0.5032203695900801, + "grad_norm": 1.2357487678527832, + "learning_rate": 0.00010393669478135426, + "loss": 0.6829, + "step": 2090 + }, + { + "epoch": 0.5034611448865346, + "grad_norm": 0.7511969804763794, + "learning_rate": 0.00010385877953545961, + "loss": 0.737, + "step": 2091 + }, + { + "epoch": 0.5037019201829892, + "grad_norm": 1.3373340368270874, + "learning_rate": 0.00010378086194341832, + "loss": 0.9976, + "step": 2092 + }, + { + "epoch": 0.5039426954794438, + "grad_norm": 2.1753182411193848, + "learning_rate": 0.00010370294205260443, + "loss": 0.3736, + "step": 2093 + }, + { + "epoch": 0.5041834707758984, + "grad_norm": 0.6808569431304932, + "learning_rate": 0.00010362501991039347, + "loss": 0.4928, + "step": 2094 + }, + { + "epoch": 0.5044242460723529, + "grad_norm": 5.135721683502197, + "learning_rate": 0.00010354709556416218, + "loss": 0.6557, + "step": 2095 + }, + { + "epoch": 0.5046650213688075, + "grad_norm": 3.574115037918091, + "learning_rate": 0.00010346916906128883, + "loss": 0.6108, + "step": 2096 + }, + { + "epoch": 0.5049057966652621, + "grad_norm": 5.21065092086792, + "learning_rate": 0.0001033912404491529, + "loss": 0.5354, + "step": 2097 + }, + { + "epoch": 0.5051465719617168, + "grad_norm": 4.044327259063721, + "learning_rate": 0.00010331330977513509, + "loss": 0.4002, + "step": 2098 + }, + { + "epoch": 0.5053873472581714, + "grad_norm": 1.0108164548873901, + "learning_rate": 0.00010323537708661748, + "loss": 0.5534, + "step": 2099 + }, + { + "epoch": 0.5056281225546259, + "grad_norm": 0.433327317237854, + "learning_rate": 0.00010315744243098333, + "loss": 0.4697, + "step": 2100 + }, + { + "epoch": 0.5058688978510805, + "grad_norm": 1.2929291725158691, + "learning_rate": 0.00010307950585561706, + "loss": 0.6741, + "step": 2101 + }, + { + "epoch": 0.5061096731475351, + "grad_norm": 1.6541675329208374, + "learning_rate": 0.00010300156740790427, + "loss": 0.3582, + "step": 2102 + }, + { + "epoch": 0.5063504484439897, + "grad_norm": 2.3018059730529785, + "learning_rate": 0.00010292362713523176, + "loss": 1.1002, + "step": 2103 + }, + { + "epoch": 0.5065912237404442, + "grad_norm": 1.3195204734802246, + "learning_rate": 0.00010284568508498735, + "loss": 0.4559, + "step": 2104 + }, + { + "epoch": 0.5068319990368988, + "grad_norm": 1.7798513174057007, + "learning_rate": 0.00010276774130456001, + "loss": 0.3002, + "step": 2105 + }, + { + "epoch": 0.5070727743333534, + "grad_norm": 1.1935960054397583, + "learning_rate": 0.00010268979584133971, + "loss": 0.5571, + "step": 2106 + }, + { + "epoch": 0.507313549629808, + "grad_norm": 2.004664421081543, + "learning_rate": 0.00010261184874271748, + "loss": 0.5307, + "step": 2107 + }, + { + "epoch": 0.5075543249262625, + "grad_norm": 1.2251675128936768, + "learning_rate": 0.00010253390005608534, + "loss": 0.4798, + "step": 2108 + }, + { + "epoch": 0.5077951002227171, + "grad_norm": 1.0275200605392456, + "learning_rate": 0.00010245594982883626, + "loss": 0.8242, + "step": 2109 + }, + { + "epoch": 0.5080358755191717, + "grad_norm": 0.9734987616539001, + "learning_rate": 0.00010237799810836413, + "loss": 0.5406, + "step": 2110 + }, + { + "epoch": 0.5082766508156263, + "grad_norm": 2.428023099899292, + "learning_rate": 0.0001023000449420638, + "loss": 0.3063, + "step": 2111 + }, + { + "epoch": 0.508517426112081, + "grad_norm": 2.5370419025421143, + "learning_rate": 0.00010222209037733097, + "loss": 0.7001, + "step": 2112 + }, + { + "epoch": 0.5087582014085354, + "grad_norm": 8.222167015075684, + "learning_rate": 0.0001021441344615622, + "loss": 1.3225, + "step": 2113 + }, + { + "epoch": 0.5089989767049901, + "grad_norm": 8.197820663452148, + "learning_rate": 0.00010206617724215481, + "loss": 0.1596, + "step": 2114 + }, + { + "epoch": 0.5092397520014447, + "grad_norm": 4.041478157043457, + "learning_rate": 0.00010198821876650701, + "loss": 0.4862, + "step": 2115 + }, + { + "epoch": 0.5094805272978993, + "grad_norm": 1.4023808240890503, + "learning_rate": 0.00010191025908201774, + "loss": 0.337, + "step": 2116 + }, + { + "epoch": 0.5097213025943538, + "grad_norm": 0.8638352751731873, + "learning_rate": 0.00010183229823608665, + "loss": 0.3498, + "step": 2117 + }, + { + "epoch": 0.5099620778908084, + "grad_norm": 3.4716315269470215, + "learning_rate": 0.00010175433627611408, + "loss": 1.0205, + "step": 2118 + }, + { + "epoch": 0.510202853187263, + "grad_norm": 9.713912010192871, + "learning_rate": 0.0001016763732495011, + "loss": 0.7581, + "step": 2119 + }, + { + "epoch": 0.5104436284837176, + "grad_norm": 3.348017930984497, + "learning_rate": 0.00010159840920364943, + "loss": 0.1819, + "step": 2120 + }, + { + "epoch": 0.5106844037801721, + "grad_norm": 9.675308227539062, + "learning_rate": 0.00010152044418596136, + "loss": 0.5749, + "step": 2121 + }, + { + "epoch": 0.5109251790766267, + "grad_norm": 0.7371659278869629, + "learning_rate": 0.00010144247824383979, + "loss": 0.2887, + "step": 2122 + }, + { + "epoch": 0.5111659543730813, + "grad_norm": 1.746598720550537, + "learning_rate": 0.00010136451142468819, + "loss": 0.9139, + "step": 2123 + }, + { + "epoch": 0.5114067296695359, + "grad_norm": 0.3207070827484131, + "learning_rate": 0.00010128654377591056, + "loss": 0.3856, + "step": 2124 + }, + { + "epoch": 0.5116475049659905, + "grad_norm": 1.747492790222168, + "learning_rate": 0.00010120857534491144, + "loss": 0.4888, + "step": 2125 + }, + { + "epoch": 0.511888280262445, + "grad_norm": 1.8366111516952515, + "learning_rate": 0.0001011306061790958, + "loss": 0.8371, + "step": 2126 + }, + { + "epoch": 0.5121290555588996, + "grad_norm": 2.3959193229675293, + "learning_rate": 0.00010105263632586904, + "loss": 1.0204, + "step": 2127 + }, + { + "epoch": 0.5123698308553543, + "grad_norm": 1.2648195028305054, + "learning_rate": 0.00010097466583263699, + "loss": 0.3782, + "step": 2128 + }, + { + "epoch": 0.5126106061518089, + "grad_norm": 3.5460050106048584, + "learning_rate": 0.00010089669474680596, + "loss": 0.697, + "step": 2129 + }, + { + "epoch": 0.5128513814482634, + "grad_norm": 0.989863932132721, + "learning_rate": 0.00010081872311578249, + "loss": 0.217, + "step": 2130 + }, + { + "epoch": 0.513092156744718, + "grad_norm": 5.3702921867370605, + "learning_rate": 0.00010074075098697351, + "loss": 0.7093, + "step": 2131 + }, + { + "epoch": 0.5133329320411726, + "grad_norm": 8.320046424865723, + "learning_rate": 0.00010066277840778626, + "loss": 0.8629, + "step": 2132 + }, + { + "epoch": 0.5135737073376272, + "grad_norm": 3.336007833480835, + "learning_rate": 0.00010058480542562828, + "loss": 1.1258, + "step": 2133 + }, + { + "epoch": 0.5138144826340817, + "grad_norm": 0.6159772276878357, + "learning_rate": 0.00010050683208790726, + "loss": 0.3306, + "step": 2134 + }, + { + "epoch": 0.5140552579305363, + "grad_norm": 1.654181957244873, + "learning_rate": 0.00010042885844203119, + "loss": 0.7766, + "step": 2135 + }, + { + "epoch": 0.5142960332269909, + "grad_norm": 1.8773746490478516, + "learning_rate": 0.00010035088453540822, + "loss": 0.2017, + "step": 2136 + }, + { + "epoch": 0.5145368085234455, + "grad_norm": 1.3991271257400513, + "learning_rate": 0.00010027291041544664, + "loss": 0.643, + "step": 2137 + }, + { + "epoch": 0.5147775838199001, + "grad_norm": 2.1096439361572266, + "learning_rate": 0.00010019493612955495, + "loss": 0.6112, + "step": 2138 + }, + { + "epoch": 0.5150183591163546, + "grad_norm": 2.802321195602417, + "learning_rate": 0.00010011696172514162, + "loss": 0.7492, + "step": 2139 + }, + { + "epoch": 0.5152591344128092, + "grad_norm": 2.361962080001831, + "learning_rate": 0.00010003898724961533, + "loss": 0.2983, + "step": 2140 + }, + { + "epoch": 0.5154999097092638, + "grad_norm": 2.6102824211120605, + "learning_rate": 9.99610127503847e-05, + "loss": 0.8425, + "step": 2141 + }, + { + "epoch": 0.5157406850057185, + "grad_norm": 0.7321549654006958, + "learning_rate": 9.988303827485839e-05, + "loss": 0.2544, + "step": 2142 + }, + { + "epoch": 0.515981460302173, + "grad_norm": 3.4591763019561768, + "learning_rate": 9.980506387044508e-05, + "loss": 0.6845, + "step": 2143 + }, + { + "epoch": 0.5162222355986276, + "grad_norm": 6.815724849700928, + "learning_rate": 9.972708958455337e-05, + "loss": 0.6039, + "step": 2144 + }, + { + "epoch": 0.5164630108950822, + "grad_norm": 3.7558867931365967, + "learning_rate": 9.964911546459181e-05, + "loss": 0.6514, + "step": 2145 + }, + { + "epoch": 0.5167037861915368, + "grad_norm": 1.1329708099365234, + "learning_rate": 9.957114155796884e-05, + "loss": 1.0924, + "step": 2146 + }, + { + "epoch": 0.5169445614879913, + "grad_norm": 2.772102117538452, + "learning_rate": 9.949316791209275e-05, + "loss": 0.3061, + "step": 2147 + }, + { + "epoch": 0.5171853367844459, + "grad_norm": 1.8187817335128784, + "learning_rate": 9.941519457437173e-05, + "loss": 0.4169, + "step": 2148 + }, + { + "epoch": 0.5174261120809005, + "grad_norm": 0.46912047266960144, + "learning_rate": 9.933722159221376e-05, + "loss": 0.336, + "step": 2149 + }, + { + "epoch": 0.5176668873773551, + "grad_norm": 1.7679054737091064, + "learning_rate": 9.925924901302651e-05, + "loss": 0.4573, + "step": 2150 + }, + { + "epoch": 0.5179076626738097, + "grad_norm": 1.68385648727417, + "learning_rate": 9.918127688421755e-05, + "loss": 0.529, + "step": 2151 + }, + { + "epoch": 0.5181484379702642, + "grad_norm": 0.7433429956436157, + "learning_rate": 9.910330525319406e-05, + "loss": 0.3717, + "step": 2152 + }, + { + "epoch": 0.5183892132667188, + "grad_norm": 1.197072148323059, + "learning_rate": 9.902533416736302e-05, + "loss": 0.2179, + "step": 2153 + }, + { + "epoch": 0.5186299885631734, + "grad_norm": 1.157617211341858, + "learning_rate": 9.894736367413102e-05, + "loss": 0.6772, + "step": 2154 + }, + { + "epoch": 0.518870763859628, + "grad_norm": 2.84462308883667, + "learning_rate": 9.886939382090422e-05, + "loss": 0.4376, + "step": 2155 + }, + { + "epoch": 0.5191115391560825, + "grad_norm": 1.1269418001174927, + "learning_rate": 9.879142465508856e-05, + "loss": 0.5879, + "step": 2156 + }, + { + "epoch": 0.5193523144525372, + "grad_norm": 1.6317634582519531, + "learning_rate": 9.871345622408946e-05, + "loss": 0.8341, + "step": 2157 + }, + { + "epoch": 0.5195930897489918, + "grad_norm": 2.172504425048828, + "learning_rate": 9.863548857531183e-05, + "loss": 0.4717, + "step": 2158 + }, + { + "epoch": 0.5198338650454464, + "grad_norm": 0.8946624994277954, + "learning_rate": 9.855752175616025e-05, + "loss": 0.9934, + "step": 2159 + }, + { + "epoch": 0.5200746403419009, + "grad_norm": 1.0163549184799194, + "learning_rate": 9.847955581403866e-05, + "loss": 0.6364, + "step": 2160 + }, + { + "epoch": 0.5203154156383555, + "grad_norm": 1.2340433597564697, + "learning_rate": 9.840159079635057e-05, + "loss": 0.681, + "step": 2161 + }, + { + "epoch": 0.5205561909348101, + "grad_norm": 2.015260934829712, + "learning_rate": 9.832362675049893e-05, + "loss": 0.7061, + "step": 2162 + }, + { + "epoch": 0.5207969662312647, + "grad_norm": 1.6834375858306885, + "learning_rate": 9.824566372388596e-05, + "loss": 0.6874, + "step": 2163 + }, + { + "epoch": 0.5210377415277193, + "grad_norm": 2.863741874694824, + "learning_rate": 9.81677017639134e-05, + "loss": 0.1785, + "step": 2164 + }, + { + "epoch": 0.5212785168241738, + "grad_norm": 0.741033673286438, + "learning_rate": 9.808974091798227e-05, + "loss": 0.3825, + "step": 2165 + }, + { + "epoch": 0.5215192921206284, + "grad_norm": 2.9215714931488037, + "learning_rate": 9.801178123349298e-05, + "loss": 0.5243, + "step": 2166 + }, + { + "epoch": 0.521760067417083, + "grad_norm": 2.389853000640869, + "learning_rate": 9.793382275784521e-05, + "loss": 0.5792, + "step": 2167 + }, + { + "epoch": 0.5220008427135376, + "grad_norm": 4.854155540466309, + "learning_rate": 9.785586553843781e-05, + "loss": 0.7133, + "step": 2168 + }, + { + "epoch": 0.5222416180099921, + "grad_norm": 1.7137115001678467, + "learning_rate": 9.777790962266903e-05, + "loss": 0.8245, + "step": 2169 + }, + { + "epoch": 0.5224823933064467, + "grad_norm": 5.3910603523254395, + "learning_rate": 9.769995505793622e-05, + "loss": 0.5916, + "step": 2170 + }, + { + "epoch": 0.5227231686029014, + "grad_norm": 9.350793838500977, + "learning_rate": 9.762200189163588e-05, + "loss": 0.7286, + "step": 2171 + }, + { + "epoch": 0.522963943899356, + "grad_norm": 2.6609160900115967, + "learning_rate": 9.754405017116379e-05, + "loss": 0.5725, + "step": 2172 + }, + { + "epoch": 0.5232047191958105, + "grad_norm": 2.54089617729187, + "learning_rate": 9.746609994391468e-05, + "loss": 0.7312, + "step": 2173 + }, + { + "epoch": 0.5234454944922651, + "grad_norm": 1.6947931051254272, + "learning_rate": 9.738815125728252e-05, + "loss": 1.0029, + "step": 2174 + }, + { + "epoch": 0.5236862697887197, + "grad_norm": 1.9103237390518188, + "learning_rate": 9.73102041586603e-05, + "loss": 0.6121, + "step": 2175 + }, + { + "epoch": 0.5239270450851743, + "grad_norm": 3.6913580894470215, + "learning_rate": 9.723225869544001e-05, + "loss": 0.8657, + "step": 2176 + }, + { + "epoch": 0.5241678203816288, + "grad_norm": 1.9038362503051758, + "learning_rate": 9.715431491501269e-05, + "loss": 0.5313, + "step": 2177 + }, + { + "epoch": 0.5244085956780834, + "grad_norm": 3.199769973754883, + "learning_rate": 9.707637286476827e-05, + "loss": 0.7072, + "step": 2178 + }, + { + "epoch": 0.524649370974538, + "grad_norm": 1.5751662254333496, + "learning_rate": 9.699843259209574e-05, + "loss": 0.2701, + "step": 2179 + }, + { + "epoch": 0.5248901462709926, + "grad_norm": 1.8176679611206055, + "learning_rate": 9.692049414438299e-05, + "loss": 0.2336, + "step": 2180 + }, + { + "epoch": 0.5251309215674472, + "grad_norm": 7.185880661010742, + "learning_rate": 9.68425575690167e-05, + "loss": 0.4916, + "step": 2181 + }, + { + "epoch": 0.5253716968639017, + "grad_norm": 3.68613338470459, + "learning_rate": 9.676462291338253e-05, + "loss": 0.5863, + "step": 2182 + }, + { + "epoch": 0.5256124721603563, + "grad_norm": 1.8995952606201172, + "learning_rate": 9.668669022486494e-05, + "loss": 0.1889, + "step": 2183 + }, + { + "epoch": 0.525853247456811, + "grad_norm": 1.6753265857696533, + "learning_rate": 9.660875955084713e-05, + "loss": 0.539, + "step": 2184 + }, + { + "epoch": 0.5260940227532656, + "grad_norm": 0.9983983039855957, + "learning_rate": 9.65308309387112e-05, + "loss": 0.3609, + "step": 2185 + }, + { + "epoch": 0.52633479804972, + "grad_norm": 3.3040006160736084, + "learning_rate": 9.645290443583785e-05, + "loss": 1.2302, + "step": 2186 + }, + { + "epoch": 0.5265755733461747, + "grad_norm": 2.018064498901367, + "learning_rate": 9.637498008960657e-05, + "loss": 0.443, + "step": 2187 + }, + { + "epoch": 0.5268163486426293, + "grad_norm": 2.3584113121032715, + "learning_rate": 9.629705794739558e-05, + "loss": 0.8664, + "step": 2188 + }, + { + "epoch": 0.5270571239390839, + "grad_norm": 0.6062427163124084, + "learning_rate": 9.62191380565817e-05, + "loss": 0.2761, + "step": 2189 + }, + { + "epoch": 0.5272978992355384, + "grad_norm": 4.201809406280518, + "learning_rate": 9.614122046454044e-05, + "loss": 1.1502, + "step": 2190 + }, + { + "epoch": 0.527538674531993, + "grad_norm": 6.053175449371338, + "learning_rate": 9.606330521864576e-05, + "loss": 0.465, + "step": 2191 + }, + { + "epoch": 0.5277794498284476, + "grad_norm": 1.6828287839889526, + "learning_rate": 9.59853923662704e-05, + "loss": 0.7583, + "step": 2192 + }, + { + "epoch": 0.5280202251249022, + "grad_norm": 2.127516746520996, + "learning_rate": 9.590748195478557e-05, + "loss": 0.581, + "step": 2193 + }, + { + "epoch": 0.5282610004213568, + "grad_norm": 2.426520824432373, + "learning_rate": 9.582957403156089e-05, + "loss": 0.729, + "step": 2194 + }, + { + "epoch": 0.5285017757178113, + "grad_norm": 0.5099361538887024, + "learning_rate": 9.575166864396459e-05, + "loss": 0.2235, + "step": 2195 + }, + { + "epoch": 0.5287425510142659, + "grad_norm": 2.9863169193267822, + "learning_rate": 9.567376583936335e-05, + "loss": 0.5938, + "step": 2196 + }, + { + "epoch": 0.5289833263107205, + "grad_norm": 1.6381510496139526, + "learning_rate": 9.559586566512221e-05, + "loss": 0.7708, + "step": 2197 + }, + { + "epoch": 0.5292241016071751, + "grad_norm": 2.1702208518981934, + "learning_rate": 9.551796816860471e-05, + "loss": 0.2262, + "step": 2198 + }, + { + "epoch": 0.5294648769036296, + "grad_norm": 1.5045363903045654, + "learning_rate": 9.544007339717261e-05, + "loss": 0.6521, + "step": 2199 + }, + { + "epoch": 0.5297056522000843, + "grad_norm": 1.3283405303955078, + "learning_rate": 9.536218139818614e-05, + "loss": 0.386, + "step": 2200 + }, + { + "epoch": 0.5299464274965389, + "grad_norm": 2.6849524974823, + "learning_rate": 9.52842922190039e-05, + "loss": 0.5514, + "step": 2201 + }, + { + "epoch": 0.5301872027929935, + "grad_norm": 1.1004747152328491, + "learning_rate": 9.520640590698258e-05, + "loss": 0.5606, + "step": 2202 + }, + { + "epoch": 0.530427978089448, + "grad_norm": 2.8887600898742676, + "learning_rate": 9.512852250947727e-05, + "loss": 0.7519, + "step": 2203 + }, + { + "epoch": 0.5306687533859026, + "grad_norm": 2.1143975257873535, + "learning_rate": 9.505064207384124e-05, + "loss": 0.3216, + "step": 2204 + }, + { + "epoch": 0.5309095286823572, + "grad_norm": 1.3769932985305786, + "learning_rate": 9.497276464742598e-05, + "loss": 0.2864, + "step": 2205 + }, + { + "epoch": 0.5311503039788118, + "grad_norm": 1.131319284439087, + "learning_rate": 9.489489027758118e-05, + "loss": 0.5236, + "step": 2206 + }, + { + "epoch": 0.5313910792752664, + "grad_norm": 1.2855147123336792, + "learning_rate": 9.481701901165455e-05, + "loss": 0.8535, + "step": 2207 + }, + { + "epoch": 0.5316318545717209, + "grad_norm": 4.562783718109131, + "learning_rate": 9.473915089699203e-05, + "loss": 1.103, + "step": 2208 + }, + { + "epoch": 0.5318726298681755, + "grad_norm": 1.491631269454956, + "learning_rate": 9.466128598093767e-05, + "loss": 0.328, + "step": 2209 + }, + { + "epoch": 0.5321134051646301, + "grad_norm": 1.7544147968292236, + "learning_rate": 9.458342431083342e-05, + "loss": 0.0794, + "step": 2210 + }, + { + "epoch": 0.5323541804610847, + "grad_norm": 1.3631882667541504, + "learning_rate": 9.45055659340194e-05, + "loss": 0.2153, + "step": 2211 + }, + { + "epoch": 0.5325949557575392, + "grad_norm": 6.174732208251953, + "learning_rate": 9.442771089783366e-05, + "loss": 0.7058, + "step": 2212 + }, + { + "epoch": 0.5328357310539938, + "grad_norm": 1.6120647192001343, + "learning_rate": 9.434985924961226e-05, + "loss": 0.5721, + "step": 2213 + }, + { + "epoch": 0.5330765063504485, + "grad_norm": 0.5557000637054443, + "learning_rate": 9.42720110366892e-05, + "loss": 0.255, + "step": 2214 + }, + { + "epoch": 0.5333172816469031, + "grad_norm": 3.7805826663970947, + "learning_rate": 9.41941663063963e-05, + "loss": 0.6903, + "step": 2215 + }, + { + "epoch": 0.5335580569433576, + "grad_norm": 4.721010684967041, + "learning_rate": 9.411632510606337e-05, + "loss": 1.1333, + "step": 2216 + }, + { + "epoch": 0.5337988322398122, + "grad_norm": 3.89003849029541, + "learning_rate": 9.403848748301802e-05, + "loss": 0.9563, + "step": 2217 + }, + { + "epoch": 0.5340396075362668, + "grad_norm": 1.9357439279556274, + "learning_rate": 9.396065348458571e-05, + "loss": 0.6106, + "step": 2218 + }, + { + "epoch": 0.5342803828327214, + "grad_norm": 1.0858145952224731, + "learning_rate": 9.388282315808971e-05, + "loss": 0.4984, + "step": 2219 + }, + { + "epoch": 0.534521158129176, + "grad_norm": 2.763885259628296, + "learning_rate": 9.3804996550851e-05, + "loss": 0.3943, + "step": 2220 + }, + { + "epoch": 0.5347619334256305, + "grad_norm": 0.8865588903427124, + "learning_rate": 9.372717371018834e-05, + "loss": 0.2669, + "step": 2221 + }, + { + "epoch": 0.5350027087220851, + "grad_norm": 1.0072959661483765, + "learning_rate": 9.364935468341824e-05, + "loss": 0.2614, + "step": 2222 + }, + { + "epoch": 0.5352434840185397, + "grad_norm": 1.3582466840744019, + "learning_rate": 9.357153951785475e-05, + "loss": 0.8149, + "step": 2223 + }, + { + "epoch": 0.5354842593149943, + "grad_norm": 1.8487718105316162, + "learning_rate": 9.349372826080974e-05, + "loss": 1.013, + "step": 2224 + }, + { + "epoch": 0.5357250346114488, + "grad_norm": 2.25203275680542, + "learning_rate": 9.341592095959259e-05, + "loss": 0.4711, + "step": 2225 + }, + { + "epoch": 0.5359658099079034, + "grad_norm": 4.066526889801025, + "learning_rate": 9.333811766151033e-05, + "loss": 1.3851, + "step": 2226 + }, + { + "epoch": 0.536206585204358, + "grad_norm": 3.2181577682495117, + "learning_rate": 9.326031841386759e-05, + "loss": 0.7188, + "step": 2227 + }, + { + "epoch": 0.5364473605008127, + "grad_norm": 4.251607894897461, + "learning_rate": 9.318252326396635e-05, + "loss": 0.9096, + "step": 2228 + }, + { + "epoch": 0.5366881357972672, + "grad_norm": 3.6044514179229736, + "learning_rate": 9.310473225910641e-05, + "loss": 0.4364, + "step": 2229 + }, + { + "epoch": 0.5369289110937218, + "grad_norm": 0.8138754367828369, + "learning_rate": 9.302694544658475e-05, + "loss": 0.3227, + "step": 2230 + }, + { + "epoch": 0.5371696863901764, + "grad_norm": 1.5204187631607056, + "learning_rate": 9.294916287369597e-05, + "loss": 0.3241, + "step": 2231 + }, + { + "epoch": 0.537410461686631, + "grad_norm": 2.078233242034912, + "learning_rate": 9.287138458773208e-05, + "loss": 0.5936, + "step": 2232 + }, + { + "epoch": 0.5376512369830856, + "grad_norm": 6.410951614379883, + "learning_rate": 9.279361063598238e-05, + "loss": 0.4392, + "step": 2233 + }, + { + "epoch": 0.5378920122795401, + "grad_norm": 1.241186499595642, + "learning_rate": 9.271584106573364e-05, + "loss": 0.4729, + "step": 2234 + }, + { + "epoch": 0.5381327875759947, + "grad_norm": 1.808719515800476, + "learning_rate": 9.263807592427001e-05, + "loss": 0.5305, + "step": 2235 + }, + { + "epoch": 0.5383735628724493, + "grad_norm": 0.6988890171051025, + "learning_rate": 9.256031525887273e-05, + "loss": 0.5642, + "step": 2236 + }, + { + "epoch": 0.5386143381689039, + "grad_norm": 2.4080259799957275, + "learning_rate": 9.24825591168206e-05, + "loss": 0.7976, + "step": 2237 + }, + { + "epoch": 0.5388551134653584, + "grad_norm": 4.949229717254639, + "learning_rate": 9.240480754538942e-05, + "loss": 1.2054, + "step": 2238 + }, + { + "epoch": 0.539095888761813, + "grad_norm": 1.403643250465393, + "learning_rate": 9.232706059185236e-05, + "loss": 0.9002, + "step": 2239 + }, + { + "epoch": 0.5393366640582676, + "grad_norm": 2.1335864067077637, + "learning_rate": 9.224931830347978e-05, + "loss": 0.9663, + "step": 2240 + }, + { + "epoch": 0.5395774393547222, + "grad_norm": 2.4091343879699707, + "learning_rate": 9.21715807275391e-05, + "loss": 0.9484, + "step": 2241 + }, + { + "epoch": 0.5398182146511767, + "grad_norm": 2.391929864883423, + "learning_rate": 9.209384791129504e-05, + "loss": 0.6072, + "step": 2242 + }, + { + "epoch": 0.5400589899476314, + "grad_norm": 5.663161754608154, + "learning_rate": 9.20161199020092e-05, + "loss": 0.3371, + "step": 2243 + }, + { + "epoch": 0.540299765244086, + "grad_norm": 1.5023120641708374, + "learning_rate": 9.193839674694046e-05, + "loss": 0.7458, + "step": 2244 + }, + { + "epoch": 0.5405405405405406, + "grad_norm": 2.3951783180236816, + "learning_rate": 9.186067849334467e-05, + "loss": 0.8693, + "step": 2245 + }, + { + "epoch": 0.5407813158369951, + "grad_norm": 1.6337603330612183, + "learning_rate": 9.178296518847467e-05, + "loss": 0.8064, + "step": 2246 + }, + { + "epoch": 0.5410220911334497, + "grad_norm": 4.101715564727783, + "learning_rate": 9.170525687958035e-05, + "loss": 0.7042, + "step": 2247 + }, + { + "epoch": 0.5412628664299043, + "grad_norm": 0.9086791276931763, + "learning_rate": 9.162755361390858e-05, + "loss": 0.8873, + "step": 2248 + }, + { + "epoch": 0.5415036417263589, + "grad_norm": 1.7184299230575562, + "learning_rate": 9.154985543870304e-05, + "loss": 0.8026, + "step": 2249 + }, + { + "epoch": 0.5417444170228135, + "grad_norm": 2.9949686527252197, + "learning_rate": 9.147216240120446e-05, + "loss": 0.6126, + "step": 2250 + }, + { + "epoch": 0.541985192319268, + "grad_norm": 2.2674872875213623, + "learning_rate": 9.139447454865033e-05, + "loss": 0.8358, + "step": 2251 + }, + { + "epoch": 0.5422259676157226, + "grad_norm": 0.7034595012664795, + "learning_rate": 9.131679192827506e-05, + "loss": 0.4057, + "step": 2252 + }, + { + "epoch": 0.5424667429121772, + "grad_norm": 3.044638156890869, + "learning_rate": 9.123911458730988e-05, + "loss": 0.7883, + "step": 2253 + }, + { + "epoch": 0.5427075182086318, + "grad_norm": 4.1872239112854, + "learning_rate": 9.116144257298274e-05, + "loss": 1.4448, + "step": 2254 + }, + { + "epoch": 0.5429482935050863, + "grad_norm": 1.9178543090820312, + "learning_rate": 9.108377593251847e-05, + "loss": 1.2404, + "step": 2255 + }, + { + "epoch": 0.5431890688015409, + "grad_norm": 1.3553639650344849, + "learning_rate": 9.100611471313849e-05, + "loss": 0.4571, + "step": 2256 + }, + { + "epoch": 0.5434298440979956, + "grad_norm": 5.682826042175293, + "learning_rate": 9.092845896206102e-05, + "loss": 0.6029, + "step": 2257 + }, + { + "epoch": 0.5436706193944502, + "grad_norm": 3.233644485473633, + "learning_rate": 9.085080872650098e-05, + "loss": 0.7475, + "step": 2258 + }, + { + "epoch": 0.5439113946909047, + "grad_norm": 0.8178972601890564, + "learning_rate": 9.077316405366981e-05, + "loss": 0.4826, + "step": 2259 + }, + { + "epoch": 0.5441521699873593, + "grad_norm": 1.9637796878814697, + "learning_rate": 9.069552499077569e-05, + "loss": 0.7773, + "step": 2260 + }, + { + "epoch": 0.5443929452838139, + "grad_norm": 4.2175188064575195, + "learning_rate": 9.061789158502336e-05, + "loss": 0.5585, + "step": 2261 + }, + { + "epoch": 0.5446337205802685, + "grad_norm": 1.7888754606246948, + "learning_rate": 9.054026388361405e-05, + "loss": 0.5089, + "step": 2262 + }, + { + "epoch": 0.5448744958767231, + "grad_norm": 1.9590795040130615, + "learning_rate": 9.046264193374568e-05, + "loss": 0.5263, + "step": 2263 + }, + { + "epoch": 0.5451152711731776, + "grad_norm": 2.484314441680908, + "learning_rate": 9.038502578261241e-05, + "loss": 0.5187, + "step": 2264 + }, + { + "epoch": 0.5453560464696322, + "grad_norm": 1.6243886947631836, + "learning_rate": 9.030741547740517e-05, + "loss": 0.6487, + "step": 2265 + }, + { + "epoch": 0.5455968217660868, + "grad_norm": 3.200514793395996, + "learning_rate": 9.022981106531119e-05, + "loss": 0.4566, + "step": 2266 + }, + { + "epoch": 0.5458375970625414, + "grad_norm": 2.8995554447174072, + "learning_rate": 9.015221259351405e-05, + "loss": 1.1906, + "step": 2267 + }, + { + "epoch": 0.5460783723589959, + "grad_norm": 1.6960794925689697, + "learning_rate": 9.007462010919386e-05, + "loss": 0.8553, + "step": 2268 + }, + { + "epoch": 0.5463191476554505, + "grad_norm": 0.8978815674781799, + "learning_rate": 8.999703365952699e-05, + "loss": 0.9352, + "step": 2269 + }, + { + "epoch": 0.5465599229519051, + "grad_norm": 1.4150447845458984, + "learning_rate": 8.99194532916862e-05, + "loss": 0.3387, + "step": 2270 + }, + { + "epoch": 0.5468006982483598, + "grad_norm": 1.1384726762771606, + "learning_rate": 8.984187905284055e-05, + "loss": 0.2762, + "step": 2271 + }, + { + "epoch": 0.5470414735448142, + "grad_norm": 1.1837869882583618, + "learning_rate": 8.976431099015528e-05, + "loss": 0.43, + "step": 2272 + }, + { + "epoch": 0.5472822488412689, + "grad_norm": 3.328984498977661, + "learning_rate": 8.968674915079197e-05, + "loss": 0.9047, + "step": 2273 + }, + { + "epoch": 0.5475230241377235, + "grad_norm": 2.5467495918273926, + "learning_rate": 8.960919358190848e-05, + "loss": 0.7412, + "step": 2274 + }, + { + "epoch": 0.5477637994341781, + "grad_norm": 0.357572466135025, + "learning_rate": 8.953164433065866e-05, + "loss": 0.2749, + "step": 2275 + }, + { + "epoch": 0.5480045747306327, + "grad_norm": 1.4513580799102783, + "learning_rate": 8.945410144419269e-05, + "loss": 0.4484, + "step": 2276 + }, + { + "epoch": 0.5482453500270872, + "grad_norm": 2.9045469760894775, + "learning_rate": 8.937656496965678e-05, + "loss": 0.8804, + "step": 2277 + }, + { + "epoch": 0.5484861253235418, + "grad_norm": 2.212029218673706, + "learning_rate": 8.929903495419331e-05, + "loss": 0.5796, + "step": 2278 + }, + { + "epoch": 0.5487269006199964, + "grad_norm": 5.107553482055664, + "learning_rate": 8.922151144494072e-05, + "loss": 0.6931, + "step": 2279 + }, + { + "epoch": 0.548967675916451, + "grad_norm": 0.923570990562439, + "learning_rate": 8.914399448903344e-05, + "loss": 0.2629, + "step": 2280 + }, + { + "epoch": 0.5492084512129055, + "grad_norm": 4.435163974761963, + "learning_rate": 8.906648413360197e-05, + "loss": 0.4986, + "step": 2281 + }, + { + "epoch": 0.5494492265093601, + "grad_norm": 0.577694296836853, + "learning_rate": 8.898898042577279e-05, + "loss": 0.4683, + "step": 2282 + }, + { + "epoch": 0.5496900018058147, + "grad_norm": 3.198882579803467, + "learning_rate": 8.891148341266828e-05, + "loss": 0.4887, + "step": 2283 + }, + { + "epoch": 0.5499307771022693, + "grad_norm": 2.20881724357605, + "learning_rate": 8.883399314140689e-05, + "loss": 0.6167, + "step": 2284 + }, + { + "epoch": 0.5501715523987238, + "grad_norm": 2.165309429168701, + "learning_rate": 8.875650965910279e-05, + "loss": 0.6205, + "step": 2285 + }, + { + "epoch": 0.5504123276951784, + "grad_norm": 1.3588035106658936, + "learning_rate": 8.867903301286616e-05, + "loss": 0.3225, + "step": 2286 + }, + { + "epoch": 0.5506531029916331, + "grad_norm": 1.6632091999053955, + "learning_rate": 8.8601563249803e-05, + "loss": 0.5279, + "step": 2287 + }, + { + "epoch": 0.5508938782880877, + "grad_norm": 1.157415509223938, + "learning_rate": 8.852410041701502e-05, + "loss": 0.4965, + "step": 2288 + }, + { + "epoch": 0.5511346535845423, + "grad_norm": 3.8233842849731445, + "learning_rate": 8.844664456159985e-05, + "loss": 0.7001, + "step": 2289 + }, + { + "epoch": 0.5513754288809968, + "grad_norm": 1.0012489557266235, + "learning_rate": 8.836919573065082e-05, + "loss": 0.4657, + "step": 2290 + }, + { + "epoch": 0.5516162041774514, + "grad_norm": 1.7905609607696533, + "learning_rate": 8.829175397125698e-05, + "loss": 0.3764, + "step": 2291 + }, + { + "epoch": 0.551856979473906, + "grad_norm": 3.6006107330322266, + "learning_rate": 8.821431933050313e-05, + "loss": 0.7817, + "step": 2292 + }, + { + "epoch": 0.5520977547703606, + "grad_norm": 0.9073820114135742, + "learning_rate": 8.813689185546965e-05, + "loss": 0.322, + "step": 2293 + }, + { + "epoch": 0.5523385300668151, + "grad_norm": 3.195746660232544, + "learning_rate": 8.80594715932327e-05, + "loss": 0.6941, + "step": 2294 + }, + { + "epoch": 0.5525793053632697, + "grad_norm": 1.6812855005264282, + "learning_rate": 8.798205859086388e-05, + "loss": 0.7138, + "step": 2295 + }, + { + "epoch": 0.5528200806597243, + "grad_norm": 1.5866107940673828, + "learning_rate": 8.790465289543051e-05, + "loss": 0.4609, + "step": 2296 + }, + { + "epoch": 0.5530608559561789, + "grad_norm": 1.2990373373031616, + "learning_rate": 8.782725455399546e-05, + "loss": 0.5497, + "step": 2297 + }, + { + "epoch": 0.5533016312526334, + "grad_norm": 0.8197939395904541, + "learning_rate": 8.774986361361705e-05, + "loss": 0.3533, + "step": 2298 + }, + { + "epoch": 0.553542406549088, + "grad_norm": 2.288421869277954, + "learning_rate": 8.767248012134914e-05, + "loss": 0.1527, + "step": 2299 + }, + { + "epoch": 0.5537831818455426, + "grad_norm": 6.408196449279785, + "learning_rate": 8.759510412424113e-05, + "loss": 0.6184, + "step": 2300 + }, + { + "epoch": 0.5540239571419973, + "grad_norm": 4.457020282745361, + "learning_rate": 8.751773566933774e-05, + "loss": 0.665, + "step": 2301 + }, + { + "epoch": 0.5542647324384519, + "grad_norm": 2.0285515785217285, + "learning_rate": 8.744037480367921e-05, + "loss": 0.9767, + "step": 2302 + }, + { + "epoch": 0.5545055077349064, + "grad_norm": 4.255732536315918, + "learning_rate": 8.736302157430107e-05, + "loss": 0.7522, + "step": 2303 + }, + { + "epoch": 0.554746283031361, + "grad_norm": 1.1508095264434814, + "learning_rate": 8.728567602823429e-05, + "loss": 0.4259, + "step": 2304 + }, + { + "epoch": 0.5549870583278156, + "grad_norm": 0.9924709796905518, + "learning_rate": 8.720833821250513e-05, + "loss": 1.6025, + "step": 2305 + }, + { + "epoch": 0.5552278336242702, + "grad_norm": 1.755651593208313, + "learning_rate": 8.713100817413516e-05, + "loss": 0.3882, + "step": 2306 + }, + { + "epoch": 0.5554686089207247, + "grad_norm": 1.430647850036621, + "learning_rate": 8.705368596014125e-05, + "loss": 0.5597, + "step": 2307 + }, + { + "epoch": 0.5557093842171793, + "grad_norm": 1.2561583518981934, + "learning_rate": 8.697637161753538e-05, + "loss": 0.8822, + "step": 2308 + }, + { + "epoch": 0.5559501595136339, + "grad_norm": 1.0225826501846313, + "learning_rate": 8.689906519332491e-05, + "loss": 0.8633, + "step": 2309 + }, + { + "epoch": 0.5561909348100885, + "grad_norm": 1.079167366027832, + "learning_rate": 8.682176673451239e-05, + "loss": 0.4746, + "step": 2310 + }, + { + "epoch": 0.556431710106543, + "grad_norm": 1.3175033330917358, + "learning_rate": 8.674447628809533e-05, + "loss": 0.4305, + "step": 2311 + }, + { + "epoch": 0.5566724854029976, + "grad_norm": 4.170149326324463, + "learning_rate": 8.666719390106655e-05, + "loss": 0.8164, + "step": 2312 + }, + { + "epoch": 0.5569132606994522, + "grad_norm": 0.9638872742652893, + "learning_rate": 8.658991962041395e-05, + "loss": 0.5429, + "step": 2313 + }, + { + "epoch": 0.5571540359959068, + "grad_norm": 1.9414424896240234, + "learning_rate": 8.65126534931204e-05, + "loss": 0.9223, + "step": 2314 + }, + { + "epoch": 0.5573948112923615, + "grad_norm": 1.570064902305603, + "learning_rate": 8.643539556616397e-05, + "loss": 1.0301, + "step": 2315 + }, + { + "epoch": 0.557635586588816, + "grad_norm": 3.4186506271362305, + "learning_rate": 8.635814588651754e-05, + "loss": 0.8375, + "step": 2316 + }, + { + "epoch": 0.5578763618852706, + "grad_norm": 2.636807441711426, + "learning_rate": 8.628090450114916e-05, + "loss": 0.4639, + "step": 2317 + }, + { + "epoch": 0.5581171371817252, + "grad_norm": 4.225121974945068, + "learning_rate": 8.620367145702177e-05, + "loss": 0.5046, + "step": 2318 + }, + { + "epoch": 0.5583579124781798, + "grad_norm": 0.9116895198822021, + "learning_rate": 8.612644680109319e-05, + "loss": 0.2553, + "step": 2319 + }, + { + "epoch": 0.5585986877746343, + "grad_norm": 2.3729517459869385, + "learning_rate": 8.604923058031624e-05, + "loss": 0.592, + "step": 2320 + }, + { + "epoch": 0.5588394630710889, + "grad_norm": 1.5719141960144043, + "learning_rate": 8.59720228416385e-05, + "loss": 0.9508, + "step": 2321 + }, + { + "epoch": 0.5590802383675435, + "grad_norm": 3.1368796825408936, + "learning_rate": 8.589482363200247e-05, + "loss": 0.9687, + "step": 2322 + }, + { + "epoch": 0.5593210136639981, + "grad_norm": 2.1668570041656494, + "learning_rate": 8.581763299834551e-05, + "loss": 0.0668, + "step": 2323 + }, + { + "epoch": 0.5595617889604526, + "grad_norm": 0.7108801007270813, + "learning_rate": 8.57404509875996e-05, + "loss": 0.2144, + "step": 2324 + }, + { + "epoch": 0.5598025642569072, + "grad_norm": 2.860525369644165, + "learning_rate": 8.56632776466916e-05, + "loss": 0.619, + "step": 2325 + }, + { + "epoch": 0.5600433395533618, + "grad_norm": 4.221729278564453, + "learning_rate": 8.558611302254314e-05, + "loss": 0.828, + "step": 2326 + }, + { + "epoch": 0.5602841148498164, + "grad_norm": 1.6991534233093262, + "learning_rate": 8.55089571620704e-05, + "loss": 0.909, + "step": 2327 + }, + { + "epoch": 0.5605248901462709, + "grad_norm": 4.212416648864746, + "learning_rate": 8.543181011218437e-05, + "loss": 1.5328, + "step": 2328 + }, + { + "epoch": 0.5607656654427255, + "grad_norm": 4.365540504455566, + "learning_rate": 8.535467191979058e-05, + "loss": 0.6489, + "step": 2329 + }, + { + "epoch": 0.5610064407391802, + "grad_norm": 0.9320734143257141, + "learning_rate": 8.527754263178929e-05, + "loss": 0.6582, + "step": 2330 + }, + { + "epoch": 0.5612472160356348, + "grad_norm": 4.166979789733887, + "learning_rate": 8.520042229507528e-05, + "loss": 0.5757, + "step": 2331 + }, + { + "epoch": 0.5614879913320894, + "grad_norm": 3.1154069900512695, + "learning_rate": 8.512331095653781e-05, + "loss": 0.8792, + "step": 2332 + }, + { + "epoch": 0.5617287666285439, + "grad_norm": 4.849252700805664, + "learning_rate": 8.504620866306083e-05, + "loss": 0.4272, + "step": 2333 + }, + { + "epoch": 0.5619695419249985, + "grad_norm": 2.375708818435669, + "learning_rate": 8.496911546152265e-05, + "loss": 0.971, + "step": 2334 + }, + { + "epoch": 0.5622103172214531, + "grad_norm": 2.0698773860931396, + "learning_rate": 8.489203139879612e-05, + "loss": 0.4473, + "step": 2335 + }, + { + "epoch": 0.5624510925179077, + "grad_norm": 6.773448944091797, + "learning_rate": 8.481495652174859e-05, + "loss": 0.363, + "step": 2336 + }, + { + "epoch": 0.5626918678143622, + "grad_norm": 5.320286750793457, + "learning_rate": 8.473789087724165e-05, + "loss": 0.5259, + "step": 2337 + }, + { + "epoch": 0.5629326431108168, + "grad_norm": 2.9927375316619873, + "learning_rate": 8.466083451213144e-05, + "loss": 0.5302, + "step": 2338 + }, + { + "epoch": 0.5631734184072714, + "grad_norm": 1.8399150371551514, + "learning_rate": 8.458378747326848e-05, + "loss": 0.9814, + "step": 2339 + }, + { + "epoch": 0.563414193703726, + "grad_norm": 1.0915262699127197, + "learning_rate": 8.450674980749742e-05, + "loss": 0.2, + "step": 2340 + }, + { + "epoch": 0.5636549690001805, + "grad_norm": 6.239700794219971, + "learning_rate": 8.442972156165738e-05, + "loss": 0.754, + "step": 2341 + }, + { + "epoch": 0.5638957442966351, + "grad_norm": 3.9862194061279297, + "learning_rate": 8.435270278258172e-05, + "loss": 0.306, + "step": 2342 + }, + { + "epoch": 0.5641365195930897, + "grad_norm": 3.2919952869415283, + "learning_rate": 8.427569351709801e-05, + "loss": 0.776, + "step": 2343 + }, + { + "epoch": 0.5643772948895444, + "grad_norm": 1.855094075202942, + "learning_rate": 8.41986938120281e-05, + "loss": 0.6994, + "step": 2344 + }, + { + "epoch": 0.564618070185999, + "grad_norm": 1.7668780088424683, + "learning_rate": 8.41217037141879e-05, + "loss": 0.3419, + "step": 2345 + }, + { + "epoch": 0.5648588454824535, + "grad_norm": 3.746309280395508, + "learning_rate": 8.404472327038768e-05, + "loss": 1.1026, + "step": 2346 + }, + { + "epoch": 0.5650996207789081, + "grad_norm": 2.670344591140747, + "learning_rate": 8.396775252743162e-05, + "loss": 0.7391, + "step": 2347 + }, + { + "epoch": 0.5653403960753627, + "grad_norm": 1.6550657749176025, + "learning_rate": 8.389079153211814e-05, + "loss": 0.4773, + "step": 2348 + }, + { + "epoch": 0.5655811713718173, + "grad_norm": 2.2174558639526367, + "learning_rate": 8.381384033123974e-05, + "loss": 0.6246, + "step": 2349 + }, + { + "epoch": 0.5658219466682718, + "grad_norm": 0.4945906400680542, + "learning_rate": 8.373689897158284e-05, + "loss": 0.1936, + "step": 2350 + }, + { + "epoch": 0.5660627219647264, + "grad_norm": 1.8350954055786133, + "learning_rate": 8.365996749992801e-05, + "loss": 0.3785, + "step": 2351 + }, + { + "epoch": 0.566303497261181, + "grad_norm": 0.7016525864601135, + "learning_rate": 8.358304596304982e-05, + "loss": 0.567, + "step": 2352 + }, + { + "epoch": 0.5665442725576356, + "grad_norm": 5.016156196594238, + "learning_rate": 8.35061344077166e-05, + "loss": 0.6756, + "step": 2353 + }, + { + "epoch": 0.5667850478540901, + "grad_norm": 1.9168941974639893, + "learning_rate": 8.342923288069086e-05, + "loss": 0.888, + "step": 2354 + }, + { + "epoch": 0.5670258231505447, + "grad_norm": 1.5404551029205322, + "learning_rate": 8.335234142872885e-05, + "loss": 0.4729, + "step": 2355 + }, + { + "epoch": 0.5672665984469993, + "grad_norm": 1.9677037000656128, + "learning_rate": 8.327546009858074e-05, + "loss": 0.3468, + "step": 2356 + }, + { + "epoch": 0.567507373743454, + "grad_norm": 1.9757428169250488, + "learning_rate": 8.319858893699059e-05, + "loss": 0.2262, + "step": 2357 + }, + { + "epoch": 0.5677481490399086, + "grad_norm": 1.3826395273208618, + "learning_rate": 8.312172799069621e-05, + "loss": 0.5705, + "step": 2358 + }, + { + "epoch": 0.567988924336363, + "grad_norm": 1.7746422290802002, + "learning_rate": 8.304487730642929e-05, + "loss": 0.7911, + "step": 2359 + }, + { + "epoch": 0.5682296996328177, + "grad_norm": 1.2216047048568726, + "learning_rate": 8.296803693091511e-05, + "loss": 0.5022, + "step": 2360 + }, + { + "epoch": 0.5684704749292723, + "grad_norm": 0.8310643434524536, + "learning_rate": 8.289120691087285e-05, + "loss": 0.3669, + "step": 2361 + }, + { + "epoch": 0.5687112502257269, + "grad_norm": 0.5129712820053101, + "learning_rate": 8.281438729301536e-05, + "loss": 0.436, + "step": 2362 + }, + { + "epoch": 0.5689520255221814, + "grad_norm": 3.883026599884033, + "learning_rate": 8.27375781240491e-05, + "loss": 1.1273, + "step": 2363 + }, + { + "epoch": 0.569192800818636, + "grad_norm": 2.724834680557251, + "learning_rate": 8.266077945067424e-05, + "loss": 0.8467, + "step": 2364 + }, + { + "epoch": 0.5694335761150906, + "grad_norm": 2.839754343032837, + "learning_rate": 8.258399131958454e-05, + "loss": 0.9973, + "step": 2365 + }, + { + "epoch": 0.5696743514115452, + "grad_norm": 1.3639193773269653, + "learning_rate": 8.250721377746734e-05, + "loss": 0.3668, + "step": 2366 + }, + { + "epoch": 0.5699151267079997, + "grad_norm": 4.23447322845459, + "learning_rate": 8.243044687100363e-05, + "loss": 0.3128, + "step": 2367 + }, + { + "epoch": 0.5701559020044543, + "grad_norm": 1.0347940921783447, + "learning_rate": 8.235369064686776e-05, + "loss": 0.4905, + "step": 2368 + }, + { + "epoch": 0.5703966773009089, + "grad_norm": 3.1089839935302734, + "learning_rate": 8.227694515172773e-05, + "loss": 0.4338, + "step": 2369 + }, + { + "epoch": 0.5706374525973635, + "grad_norm": 4.172400951385498, + "learning_rate": 8.2200210432245e-05, + "loss": 1.1137, + "step": 2370 + }, + { + "epoch": 0.5708782278938181, + "grad_norm": 3.9930694103240967, + "learning_rate": 8.21234865350744e-05, + "loss": 0.1863, + "step": 2371 + }, + { + "epoch": 0.5711190031902726, + "grad_norm": 2.020798921585083, + "learning_rate": 8.204677350686432e-05, + "loss": 0.379, + "step": 2372 + }, + { + "epoch": 0.5713597784867273, + "grad_norm": 3.6490232944488525, + "learning_rate": 8.197007139425631e-05, + "loss": 0.4755, + "step": 2373 + }, + { + "epoch": 0.5716005537831819, + "grad_norm": 2.922484874725342, + "learning_rate": 8.189338024388557e-05, + "loss": 1.0381, + "step": 2374 + }, + { + "epoch": 0.5718413290796365, + "grad_norm": 3.1068320274353027, + "learning_rate": 8.181670010238046e-05, + "loss": 0.8434, + "step": 2375 + }, + { + "epoch": 0.572082104376091, + "grad_norm": 2.6153829097747803, + "learning_rate": 8.174003101636261e-05, + "loss": 1.216, + "step": 2376 + }, + { + "epoch": 0.5723228796725456, + "grad_norm": 1.5444633960723877, + "learning_rate": 8.166337303244705e-05, + "loss": 0.5995, + "step": 2377 + }, + { + "epoch": 0.5725636549690002, + "grad_norm": 4.170453071594238, + "learning_rate": 8.158672619724203e-05, + "loss": 0.6781, + "step": 2378 + }, + { + "epoch": 0.5728044302654548, + "grad_norm": 4.247837543487549, + "learning_rate": 8.151009055734893e-05, + "loss": 0.3414, + "step": 2379 + }, + { + "epoch": 0.5730452055619093, + "grad_norm": 1.5872865915298462, + "learning_rate": 8.143346615936247e-05, + "loss": 0.6584, + "step": 2380 + }, + { + "epoch": 0.5732859808583639, + "grad_norm": 1.2567731142044067, + "learning_rate": 8.135685304987039e-05, + "loss": 0.6352, + "step": 2381 + }, + { + "epoch": 0.5735267561548185, + "grad_norm": 3.6656978130340576, + "learning_rate": 8.128025127545362e-05, + "loss": 1.2404, + "step": 2382 + }, + { + "epoch": 0.5737675314512731, + "grad_norm": 2.5888733863830566, + "learning_rate": 8.120366088268632e-05, + "loss": 0.3153, + "step": 2383 + }, + { + "epoch": 0.5740083067477277, + "grad_norm": 0.779647171497345, + "learning_rate": 8.112708191813552e-05, + "loss": 0.4345, + "step": 2384 + }, + { + "epoch": 0.5742490820441822, + "grad_norm": 0.7447169423103333, + "learning_rate": 8.105051442836145e-05, + "loss": 0.2654, + "step": 2385 + }, + { + "epoch": 0.5744898573406368, + "grad_norm": 0.5837435722351074, + "learning_rate": 8.097395845991727e-05, + "loss": 0.5411, + "step": 2386 + }, + { + "epoch": 0.5747306326370915, + "grad_norm": 1.1887192726135254, + "learning_rate": 8.089741405934922e-05, + "loss": 0.3803, + "step": 2387 + }, + { + "epoch": 0.5749714079335461, + "grad_norm": 2.842036724090576, + "learning_rate": 8.08208812731965e-05, + "loss": 0.8952, + "step": 2388 + }, + { + "epoch": 0.5752121832300006, + "grad_norm": 3.2157955169677734, + "learning_rate": 8.074436014799114e-05, + "loss": 0.2237, + "step": 2389 + }, + { + "epoch": 0.5754529585264552, + "grad_norm": 3.5656988620758057, + "learning_rate": 8.06678507302582e-05, + "loss": 0.2823, + "step": 2390 + }, + { + "epoch": 0.5756937338229098, + "grad_norm": 2.8901267051696777, + "learning_rate": 8.059135306651557e-05, + "loss": 0.6339, + "step": 2391 + }, + { + "epoch": 0.5759345091193644, + "grad_norm": 1.380159854888916, + "learning_rate": 8.0514867203274e-05, + "loss": 1.2206, + "step": 2392 + }, + { + "epoch": 0.5761752844158189, + "grad_norm": 5.57066011428833, + "learning_rate": 8.043839318703709e-05, + "loss": 0.932, + "step": 2393 + }, + { + "epoch": 0.5764160597122735, + "grad_norm": 2.786633253097534, + "learning_rate": 8.036193106430118e-05, + "loss": 0.4513, + "step": 2394 + }, + { + "epoch": 0.5766568350087281, + "grad_norm": 2.4537577629089355, + "learning_rate": 8.028548088155542e-05, + "loss": 0.8421, + "step": 2395 + }, + { + "epoch": 0.5768976103051827, + "grad_norm": 3.0305957794189453, + "learning_rate": 8.020904268528175e-05, + "loss": 0.7525, + "step": 2396 + }, + { + "epoch": 0.5771383856016373, + "grad_norm": 1.3954887390136719, + "learning_rate": 8.013261652195466e-05, + "loss": 0.3742, + "step": 2397 + }, + { + "epoch": 0.5773791608980918, + "grad_norm": 2.359279155731201, + "learning_rate": 8.00562024380415e-05, + "loss": 1.0482, + "step": 2398 + }, + { + "epoch": 0.5776199361945464, + "grad_norm": 4.190445899963379, + "learning_rate": 7.99798004800022e-05, + "loss": 1.031, + "step": 2399 + }, + { + "epoch": 0.577860711491001, + "grad_norm": 3.663658618927002, + "learning_rate": 7.990341069428931e-05, + "loss": 0.4797, + "step": 2400 + }, + { + "epoch": 0.5781014867874557, + "grad_norm": 1.6564805507659912, + "learning_rate": 7.9827033127348e-05, + "loss": 0.8746, + "step": 2401 + }, + { + "epoch": 0.5783422620839102, + "grad_norm": 1.9383180141448975, + "learning_rate": 7.9750667825616e-05, + "loss": 0.7385, + "step": 2402 + }, + { + "epoch": 0.5785830373803648, + "grad_norm": 0.28531309962272644, + "learning_rate": 7.967431483552356e-05, + "loss": 0.2861, + "step": 2403 + }, + { + "epoch": 0.5788238126768194, + "grad_norm": 2.478971004486084, + "learning_rate": 7.959797420349355e-05, + "loss": 0.3581, + "step": 2404 + }, + { + "epoch": 0.579064587973274, + "grad_norm": 3.229998826980591, + "learning_rate": 7.952164597594115e-05, + "loss": 0.7698, + "step": 2405 + }, + { + "epoch": 0.5793053632697285, + "grad_norm": 1.7557350397109985, + "learning_rate": 7.944533019927414e-05, + "loss": 0.758, + "step": 2406 + }, + { + "epoch": 0.5795461385661831, + "grad_norm": 7.241235256195068, + "learning_rate": 7.936902691989267e-05, + "loss": 1.014, + "step": 2407 + }, + { + "epoch": 0.5797869138626377, + "grad_norm": 4.189211368560791, + "learning_rate": 7.929273618418933e-05, + "loss": 0.7462, + "step": 2408 + }, + { + "epoch": 0.5800276891590923, + "grad_norm": 4.705471515655518, + "learning_rate": 7.921645803854907e-05, + "loss": 0.9831, + "step": 2409 + }, + { + "epoch": 0.5802684644555468, + "grad_norm": 2.0091071128845215, + "learning_rate": 7.914019252934908e-05, + "loss": 0.8221, + "step": 2410 + }, + { + "epoch": 0.5805092397520014, + "grad_norm": 2.49102783203125, + "learning_rate": 7.906393970295905e-05, + "loss": 0.5716, + "step": 2411 + }, + { + "epoch": 0.580750015048456, + "grad_norm": 2.925053119659424, + "learning_rate": 7.89876996057409e-05, + "loss": 0.9088, + "step": 2412 + }, + { + "epoch": 0.5809907903449106, + "grad_norm": 4.885961532592773, + "learning_rate": 7.891147228404869e-05, + "loss": 0.5873, + "step": 2413 + }, + { + "epoch": 0.5812315656413652, + "grad_norm": 1.2176140546798706, + "learning_rate": 7.883525778422887e-05, + "loss": 0.8426, + "step": 2414 + }, + { + "epoch": 0.5814723409378197, + "grad_norm": 1.188421368598938, + "learning_rate": 7.875905615261997e-05, + "loss": 0.6984, + "step": 2415 + }, + { + "epoch": 0.5817131162342744, + "grad_norm": 3.3436102867126465, + "learning_rate": 7.868286743555279e-05, + "loss": 0.7285, + "step": 2416 + }, + { + "epoch": 0.581953891530729, + "grad_norm": 2.9441144466400146, + "learning_rate": 7.860669167935028e-05, + "loss": 0.3281, + "step": 2417 + }, + { + "epoch": 0.5821946668271836, + "grad_norm": 1.1844704151153564, + "learning_rate": 7.853052893032736e-05, + "loss": 0.3296, + "step": 2418 + }, + { + "epoch": 0.5824354421236381, + "grad_norm": 1.122290849685669, + "learning_rate": 7.84543792347913e-05, + "loss": 0.3637, + "step": 2419 + }, + { + "epoch": 0.5826762174200927, + "grad_norm": 0.8115438222885132, + "learning_rate": 7.837824263904116e-05, + "loss": 0.1266, + "step": 2420 + }, + { + "epoch": 0.5829169927165473, + "grad_norm": 2.9317989349365234, + "learning_rate": 7.83021191893682e-05, + "loss": 0.7098, + "step": 2421 + }, + { + "epoch": 0.5831577680130019, + "grad_norm": 2.4324686527252197, + "learning_rate": 7.822600893205569e-05, + "loss": 0.384, + "step": 2422 + }, + { + "epoch": 0.5833985433094564, + "grad_norm": 1.8341871500015259, + "learning_rate": 7.814991191337877e-05, + "loss": 0.6857, + "step": 2423 + }, + { + "epoch": 0.583639318605911, + "grad_norm": 0.9151331782341003, + "learning_rate": 7.807382817960464e-05, + "loss": 0.2521, + "step": 2424 + }, + { + "epoch": 0.5838800939023656, + "grad_norm": 3.2553586959838867, + "learning_rate": 7.799775777699243e-05, + "loss": 0.277, + "step": 2425 + }, + { + "epoch": 0.5841208691988202, + "grad_norm": 5.162132263183594, + "learning_rate": 7.792170075179302e-05, + "loss": 0.3815, + "step": 2426 + }, + { + "epoch": 0.5843616444952748, + "grad_norm": 0.9806712865829468, + "learning_rate": 7.784565715024932e-05, + "loss": 0.6379, + "step": 2427 + }, + { + "epoch": 0.5846024197917293, + "grad_norm": 2.116602897644043, + "learning_rate": 7.776962701859596e-05, + "loss": 0.3267, + "step": 2428 + }, + { + "epoch": 0.5848431950881839, + "grad_norm": 2.120924472808838, + "learning_rate": 7.769361040305944e-05, + "loss": 0.5844, + "step": 2429 + }, + { + "epoch": 0.5850839703846386, + "grad_norm": 1.5902043581008911, + "learning_rate": 7.76176073498581e-05, + "loss": 0.3459, + "step": 2430 + }, + { + "epoch": 0.5853247456810932, + "grad_norm": 2.4817397594451904, + "learning_rate": 7.75416179052019e-05, + "loss": 0.4125, + "step": 2431 + }, + { + "epoch": 0.5855655209775477, + "grad_norm": 1.6833219528198242, + "learning_rate": 7.746564211529264e-05, + "loss": 0.422, + "step": 2432 + }, + { + "epoch": 0.5858062962740023, + "grad_norm": 9.756152153015137, + "learning_rate": 7.73896800263237e-05, + "loss": 1.028, + "step": 2433 + }, + { + "epoch": 0.5860470715704569, + "grad_norm": 1.5745316743850708, + "learning_rate": 7.731373168448027e-05, + "loss": 0.6256, + "step": 2434 + }, + { + "epoch": 0.5862878468669115, + "grad_norm": 3.160309314727783, + "learning_rate": 7.723779713593908e-05, + "loss": 0.4354, + "step": 2435 + }, + { + "epoch": 0.586528622163366, + "grad_norm": 1.0206762552261353, + "learning_rate": 7.716187642686851e-05, + "loss": 0.7593, + "step": 2436 + }, + { + "epoch": 0.5867693974598206, + "grad_norm": 2.535022020339966, + "learning_rate": 7.708596960342852e-05, + "loss": 0.5759, + "step": 2437 + }, + { + "epoch": 0.5870101727562752, + "grad_norm": 1.5657432079315186, + "learning_rate": 7.701007671177067e-05, + "loss": 0.4912, + "step": 2438 + }, + { + "epoch": 0.5872509480527298, + "grad_norm": 0.8812488317489624, + "learning_rate": 7.693419779803794e-05, + "loss": 0.3876, + "step": 2439 + }, + { + "epoch": 0.5874917233491844, + "grad_norm": 2.174088954925537, + "learning_rate": 7.685833290836497e-05, + "loss": 0.6519, + "step": 2440 + }, + { + "epoch": 0.5877324986456389, + "grad_norm": 1.4618853330612183, + "learning_rate": 7.678248208887767e-05, + "loss": 0.1547, + "step": 2441 + }, + { + "epoch": 0.5879732739420935, + "grad_norm": 1.048917293548584, + "learning_rate": 7.670664538569358e-05, + "loss": 0.7463, + "step": 2442 + }, + { + "epoch": 0.5882140492385481, + "grad_norm": 0.559017539024353, + "learning_rate": 7.663082284492161e-05, + "loss": 0.4063, + "step": 2443 + }, + { + "epoch": 0.5884548245350028, + "grad_norm": 1.1241803169250488, + "learning_rate": 7.655501451266197e-05, + "loss": 0.6386, + "step": 2444 + }, + { + "epoch": 0.5886955998314573, + "grad_norm": 1.063376545906067, + "learning_rate": 7.647922043500637e-05, + "loss": 0.7574, + "step": 2445 + }, + { + "epoch": 0.5889363751279119, + "grad_norm": 0.6335359811782837, + "learning_rate": 7.640344065803768e-05, + "loss": 0.4932, + "step": 2446 + }, + { + "epoch": 0.5891771504243665, + "grad_norm": 5.459598064422607, + "learning_rate": 7.632767522783025e-05, + "loss": 0.7634, + "step": 2447 + }, + { + "epoch": 0.5894179257208211, + "grad_norm": 2.985459089279175, + "learning_rate": 7.625192419044966e-05, + "loss": 1.101, + "step": 2448 + }, + { + "epoch": 0.5896587010172756, + "grad_norm": 1.2922786474227905, + "learning_rate": 7.617618759195262e-05, + "loss": 0.2012, + "step": 2449 + }, + { + "epoch": 0.5898994763137302, + "grad_norm": 3.0108511447906494, + "learning_rate": 7.61004654783872e-05, + "loss": 0.4321, + "step": 2450 + }, + { + "epoch": 0.5901402516101848, + "grad_norm": 1.4148467779159546, + "learning_rate": 7.602475789579265e-05, + "loss": 0.4081, + "step": 2451 + }, + { + "epoch": 0.5903810269066394, + "grad_norm": 3.129077911376953, + "learning_rate": 7.594906489019928e-05, + "loss": 0.3434, + "step": 2452 + }, + { + "epoch": 0.590621802203094, + "grad_norm": 2.397958993911743, + "learning_rate": 7.58733865076287e-05, + "loss": 0.6114, + "step": 2453 + }, + { + "epoch": 0.5908625774995485, + "grad_norm": 1.5747435092926025, + "learning_rate": 7.579772279409342e-05, + "loss": 0.6804, + "step": 2454 + }, + { + "epoch": 0.5911033527960031, + "grad_norm": 2.1680166721343994, + "learning_rate": 7.572207379559721e-05, + "loss": 0.5044, + "step": 2455 + }, + { + "epoch": 0.5913441280924577, + "grad_norm": 0.6241942644119263, + "learning_rate": 7.564643955813489e-05, + "loss": 0.7446, + "step": 2456 + }, + { + "epoch": 0.5915849033889123, + "grad_norm": 4.499767780303955, + "learning_rate": 7.557082012769213e-05, + "loss": 0.8841, + "step": 2457 + }, + { + "epoch": 0.5918256786853668, + "grad_norm": 1.2002981901168823, + "learning_rate": 7.549521555024582e-05, + "loss": 0.2635, + "step": 2458 + }, + { + "epoch": 0.5920664539818215, + "grad_norm": 1.4949264526367188, + "learning_rate": 7.541962587176361e-05, + "loss": 0.3554, + "step": 2459 + }, + { + "epoch": 0.5923072292782761, + "grad_norm": 3.360037326812744, + "learning_rate": 7.534405113820427e-05, + "loss": 0.1464, + "step": 2460 + }, + { + "epoch": 0.5925480045747307, + "grad_norm": 1.4905561208724976, + "learning_rate": 7.526849139551744e-05, + "loss": 0.3034, + "step": 2461 + }, + { + "epoch": 0.5927887798711852, + "grad_norm": 1.9774373769760132, + "learning_rate": 7.51929466896435e-05, + "loss": 0.7877, + "step": 2462 + }, + { + "epoch": 0.5930295551676398, + "grad_norm": 1.1401469707489014, + "learning_rate": 7.511741706651384e-05, + "loss": 0.7026, + "step": 2463 + }, + { + "epoch": 0.5932703304640944, + "grad_norm": 2.067647695541382, + "learning_rate": 7.504190257205075e-05, + "loss": 0.5986, + "step": 2464 + }, + { + "epoch": 0.593511105760549, + "grad_norm": 2.496720790863037, + "learning_rate": 7.496640325216708e-05, + "loss": 0.4588, + "step": 2465 + }, + { + "epoch": 0.5937518810570036, + "grad_norm": 0.8668129444122314, + "learning_rate": 7.489091915276664e-05, + "loss": 0.2105, + "step": 2466 + }, + { + "epoch": 0.5939926563534581, + "grad_norm": 2.7461793422698975, + "learning_rate": 7.481545031974392e-05, + "loss": 0.829, + "step": 2467 + }, + { + "epoch": 0.5942334316499127, + "grad_norm": 2.240567207336426, + "learning_rate": 7.473999679898414e-05, + "loss": 0.3528, + "step": 2468 + }, + { + "epoch": 0.5944742069463673, + "grad_norm": 3.95941162109375, + "learning_rate": 7.466455863636326e-05, + "loss": 0.5933, + "step": 2469 + }, + { + "epoch": 0.5947149822428219, + "grad_norm": 3.9699573516845703, + "learning_rate": 7.458913587774777e-05, + "loss": 1.0409, + "step": 2470 + }, + { + "epoch": 0.5949557575392764, + "grad_norm": 1.2216235399246216, + "learning_rate": 7.451372856899494e-05, + "loss": 0.2177, + "step": 2471 + }, + { + "epoch": 0.595196532835731, + "grad_norm": 7.556828022003174, + "learning_rate": 7.443833675595255e-05, + "loss": 1.0671, + "step": 2472 + }, + { + "epoch": 0.5954373081321856, + "grad_norm": 3.3300185203552246, + "learning_rate": 7.436296048445899e-05, + "loss": 0.4654, + "step": 2473 + }, + { + "epoch": 0.5956780834286403, + "grad_norm": 1.8964426517486572, + "learning_rate": 7.428759980034324e-05, + "loss": 0.8291, + "step": 2474 + }, + { + "epoch": 0.5959188587250948, + "grad_norm": 2.5427963733673096, + "learning_rate": 7.421225474942472e-05, + "loss": 0.7374, + "step": 2475 + }, + { + "epoch": 0.5961596340215494, + "grad_norm": 2.4423563480377197, + "learning_rate": 7.413692537751341e-05, + "loss": 0.5469, + "step": 2476 + }, + { + "epoch": 0.596400409318004, + "grad_norm": 0.9203125834465027, + "learning_rate": 7.40616117304098e-05, + "loss": 0.3301, + "step": 2477 + }, + { + "epoch": 0.5966411846144586, + "grad_norm": 2.929774284362793, + "learning_rate": 7.398631385390464e-05, + "loss": 0.4724, + "step": 2478 + }, + { + "epoch": 0.5968819599109132, + "grad_norm": 1.285556674003601, + "learning_rate": 7.391103179377927e-05, + "loss": 0.5685, + "step": 2479 + }, + { + "epoch": 0.5971227352073677, + "grad_norm": 2.1298601627349854, + "learning_rate": 7.383576559580537e-05, + "loss": 0.4238, + "step": 2480 + }, + { + "epoch": 0.5973635105038223, + "grad_norm": 3.565706968307495, + "learning_rate": 7.37605153057449e-05, + "loss": 0.8206, + "step": 2481 + }, + { + "epoch": 0.5976042858002769, + "grad_norm": 1.6290020942687988, + "learning_rate": 7.368528096935028e-05, + "loss": 0.7003, + "step": 2482 + }, + { + "epoch": 0.5978450610967315, + "grad_norm": 0.8494675755500793, + "learning_rate": 7.361006263236409e-05, + "loss": 0.4155, + "step": 2483 + }, + { + "epoch": 0.598085836393186, + "grad_norm": 6.1263508796691895, + "learning_rate": 7.353486034051933e-05, + "loss": 0.3906, + "step": 2484 + }, + { + "epoch": 0.5983266116896406, + "grad_norm": 2.4993395805358887, + "learning_rate": 7.345967413953906e-05, + "loss": 0.8447, + "step": 2485 + }, + { + "epoch": 0.5985673869860952, + "grad_norm": 1.5568212270736694, + "learning_rate": 7.338450407513671e-05, + "loss": 0.3203, + "step": 2486 + }, + { + "epoch": 0.5988081622825498, + "grad_norm": 1.6858243942260742, + "learning_rate": 7.330935019301587e-05, + "loss": 0.5842, + "step": 2487 + }, + { + "epoch": 0.5990489375790043, + "grad_norm": 1.3046417236328125, + "learning_rate": 7.323421253887022e-05, + "loss": 0.4694, + "step": 2488 + }, + { + "epoch": 0.599289712875459, + "grad_norm": 2.6327929496765137, + "learning_rate": 7.315909115838367e-05, + "loss": 1.0909, + "step": 2489 + }, + { + "epoch": 0.5995304881719136, + "grad_norm": 3.176302909851074, + "learning_rate": 7.308398609723019e-05, + "loss": 0.6372, + "step": 2490 + }, + { + "epoch": 0.5997712634683682, + "grad_norm": 0.5345314145088196, + "learning_rate": 7.300889740107376e-05, + "loss": 0.1974, + "step": 2491 + }, + { + "epoch": 0.6000120387648227, + "grad_norm": 3.916313886642456, + "learning_rate": 7.293382511556856e-05, + "loss": 1.1176, + "step": 2492 + }, + { + "epoch": 0.6002528140612773, + "grad_norm": 1.7653621435165405, + "learning_rate": 7.285876928635864e-05, + "loss": 0.6719, + "step": 2493 + }, + { + "epoch": 0.6004935893577319, + "grad_norm": 1.351683497428894, + "learning_rate": 7.278372995907815e-05, + "loss": 0.4, + "step": 2494 + }, + { + "epoch": 0.6007343646541865, + "grad_norm": 1.6684333086013794, + "learning_rate": 7.270870717935119e-05, + "loss": 0.5533, + "step": 2495 + }, + { + "epoch": 0.6009751399506411, + "grad_norm": 2.9273340702056885, + "learning_rate": 7.263370099279172e-05, + "loss": 0.7937, + "step": 2496 + }, + { + "epoch": 0.6012159152470956, + "grad_norm": 1.5765647888183594, + "learning_rate": 7.255871144500375e-05, + "loss": 1.2214, + "step": 2497 + }, + { + "epoch": 0.6014566905435502, + "grad_norm": 0.8081079125404358, + "learning_rate": 7.248373858158099e-05, + "loss": 0.2137, + "step": 2498 + }, + { + "epoch": 0.6016974658400048, + "grad_norm": 1.117992877960205, + "learning_rate": 7.240878244810718e-05, + "loss": 0.3442, + "step": 2499 + }, + { + "epoch": 0.6019382411364594, + "grad_norm": 2.1289424896240234, + "learning_rate": 7.233384309015584e-05, + "loss": 0.673, + "step": 2500 + }, + { + "epoch": 0.6021790164329139, + "grad_norm": 1.3627246618270874, + "learning_rate": 7.22589205532902e-05, + "loss": 0.6268, + "step": 2501 + }, + { + "epoch": 0.6024197917293685, + "grad_norm": 1.519879937171936, + "learning_rate": 7.218401488306337e-05, + "loss": 0.272, + "step": 2502 + }, + { + "epoch": 0.6026605670258232, + "grad_norm": 2.85306978225708, + "learning_rate": 7.210912612501817e-05, + "loss": 0.6996, + "step": 2503 + }, + { + "epoch": 0.6029013423222778, + "grad_norm": 1.7936211824417114, + "learning_rate": 7.20342543246871e-05, + "loss": 0.4967, + "step": 2504 + }, + { + "epoch": 0.6031421176187323, + "grad_norm": 3.3115148544311523, + "learning_rate": 7.195939952759248e-05, + "loss": 0.2885, + "step": 2505 + }, + { + "epoch": 0.6033828929151869, + "grad_norm": 0.8520734906196594, + "learning_rate": 7.188456177924605e-05, + "loss": 0.7537, + "step": 2506 + }, + { + "epoch": 0.6036236682116415, + "grad_norm": 0.48104944825172424, + "learning_rate": 7.180974112514943e-05, + "loss": 0.1885, + "step": 2507 + }, + { + "epoch": 0.6038644435080961, + "grad_norm": 0.6579359769821167, + "learning_rate": 7.173493761079372e-05, + "loss": 0.5065, + "step": 2508 + }, + { + "epoch": 0.6041052188045507, + "grad_norm": 1.0354386568069458, + "learning_rate": 7.166015128165962e-05, + "loss": 0.2026, + "step": 2509 + }, + { + "epoch": 0.6043459941010052, + "grad_norm": 0.9975037574768066, + "learning_rate": 7.158538218321739e-05, + "loss": 0.3232, + "step": 2510 + }, + { + "epoch": 0.6045867693974598, + "grad_norm": 1.1760191917419434, + "learning_rate": 7.15106303609268e-05, + "loss": 0.8285, + "step": 2511 + }, + { + "epoch": 0.6048275446939144, + "grad_norm": 1.8876464366912842, + "learning_rate": 7.143589586023715e-05, + "loss": 0.3947, + "step": 2512 + }, + { + "epoch": 0.605068319990369, + "grad_norm": 1.618282437324524, + "learning_rate": 7.136117872658721e-05, + "loss": 0.5223, + "step": 2513 + }, + { + "epoch": 0.6053090952868235, + "grad_norm": 3.051154851913452, + "learning_rate": 7.128647900540506e-05, + "loss": 0.6019, + "step": 2514 + }, + { + "epoch": 0.6055498705832781, + "grad_norm": 1.742828130722046, + "learning_rate": 7.121179674210841e-05, + "loss": 0.4666, + "step": 2515 + }, + { + "epoch": 0.6057906458797327, + "grad_norm": 1.7815309762954712, + "learning_rate": 7.11371319821042e-05, + "loss": 0.9279, + "step": 2516 + }, + { + "epoch": 0.6060314211761874, + "grad_norm": 2.8688342571258545, + "learning_rate": 7.106248477078874e-05, + "loss": 0.7174, + "step": 2517 + }, + { + "epoch": 0.6062721964726419, + "grad_norm": 0.4165075123310089, + "learning_rate": 7.09878551535478e-05, + "loss": 0.286, + "step": 2518 + }, + { + "epoch": 0.6065129717690965, + "grad_norm": 1.0387226343154907, + "learning_rate": 7.091324317575623e-05, + "loss": 0.5322, + "step": 2519 + }, + { + "epoch": 0.6067537470655511, + "grad_norm": 3.8460330963134766, + "learning_rate": 7.083864888277833e-05, + "loss": 0.5769, + "step": 2520 + }, + { + "epoch": 0.6069945223620057, + "grad_norm": 3.067915201187134, + "learning_rate": 7.076407231996768e-05, + "loss": 0.8518, + "step": 2521 + }, + { + "epoch": 0.6072352976584603, + "grad_norm": 0.5079042315483093, + "learning_rate": 7.06895135326669e-05, + "loss": 0.7296, + "step": 2522 + }, + { + "epoch": 0.6074760729549148, + "grad_norm": 2.1162655353546143, + "learning_rate": 7.061497256620793e-05, + "loss": 0.4852, + "step": 2523 + }, + { + "epoch": 0.6077168482513694, + "grad_norm": 0.4626848101615906, + "learning_rate": 7.054044946591184e-05, + "loss": 0.4988, + "step": 2524 + }, + { + "epoch": 0.607957623547824, + "grad_norm": 1.59532630443573, + "learning_rate": 7.046594427708882e-05, + "loss": 0.2568, + "step": 2525 + }, + { + "epoch": 0.6081983988442786, + "grad_norm": 4.2776384353637695, + "learning_rate": 7.039145704503829e-05, + "loss": 0.9273, + "step": 2526 + }, + { + "epoch": 0.6084391741407331, + "grad_norm": 2.9044902324676514, + "learning_rate": 7.031698781504849e-05, + "loss": 0.6728, + "step": 2527 + }, + { + "epoch": 0.6086799494371877, + "grad_norm": 4.779917240142822, + "learning_rate": 7.024253663239704e-05, + "loss": 0.6652, + "step": 2528 + }, + { + "epoch": 0.6089207247336423, + "grad_norm": 2.6108837127685547, + "learning_rate": 7.016810354235038e-05, + "loss": 0.6116, + "step": 2529 + }, + { + "epoch": 0.609161500030097, + "grad_norm": 0.8134174942970276, + "learning_rate": 7.009368859016393e-05, + "loss": 0.5276, + "step": 2530 + }, + { + "epoch": 0.6094022753265514, + "grad_norm": 0.9418330192565918, + "learning_rate": 7.001929182108223e-05, + "loss": 0.315, + "step": 2531 + }, + { + "epoch": 0.609643050623006, + "grad_norm": 1.0904672145843506, + "learning_rate": 6.994491328033862e-05, + "loss": 0.4043, + "step": 2532 + }, + { + "epoch": 0.6098838259194607, + "grad_norm": 3.003647565841675, + "learning_rate": 6.987055301315546e-05, + "loss": 1.1199, + "step": 2533 + }, + { + "epoch": 0.6101246012159153, + "grad_norm": 5.428164958953857, + "learning_rate": 6.979621106474399e-05, + "loss": 0.6681, + "step": 2534 + }, + { + "epoch": 0.6103653765123699, + "grad_norm": 3.2087454795837402, + "learning_rate": 6.972188748030419e-05, + "loss": 0.629, + "step": 2535 + }, + { + "epoch": 0.6106061518088244, + "grad_norm": 4.468095779418945, + "learning_rate": 6.964758230502503e-05, + "loss": 0.9202, + "step": 2536 + }, + { + "epoch": 0.610846927105279, + "grad_norm": 3.3546736240386963, + "learning_rate": 6.957329558408423e-05, + "loss": 0.1201, + "step": 2537 + }, + { + "epoch": 0.6110877024017336, + "grad_norm": 1.5020190477371216, + "learning_rate": 6.949902736264823e-05, + "loss": 0.7108, + "step": 2538 + }, + { + "epoch": 0.6113284776981882, + "grad_norm": 2.882939577102661, + "learning_rate": 6.942477768587237e-05, + "loss": 0.8403, + "step": 2539 + }, + { + "epoch": 0.6115692529946427, + "grad_norm": 2.0902936458587646, + "learning_rate": 6.935054659890052e-05, + "loss": 0.5279, + "step": 2540 + }, + { + "epoch": 0.6118100282910973, + "grad_norm": 0.7436076402664185, + "learning_rate": 6.92763341468654e-05, + "loss": 0.2855, + "step": 2541 + }, + { + "epoch": 0.6120508035875519, + "grad_norm": 1.6438990831375122, + "learning_rate": 6.920214037488837e-05, + "loss": 0.9144, + "step": 2542 + }, + { + "epoch": 0.6122915788840065, + "grad_norm": 4.628514766693115, + "learning_rate": 6.912796532807934e-05, + "loss": 0.4584, + "step": 2543 + }, + { + "epoch": 0.612532354180461, + "grad_norm": 1.0494173765182495, + "learning_rate": 6.905380905153699e-05, + "loss": 0.8366, + "step": 2544 + }, + { + "epoch": 0.6127731294769156, + "grad_norm": 1.7360765933990479, + "learning_rate": 6.897967159034842e-05, + "loss": 0.5281, + "step": 2545 + }, + { + "epoch": 0.6130139047733703, + "grad_norm": 1.996323823928833, + "learning_rate": 6.89055529895894e-05, + "loss": 0.9566, + "step": 2546 + }, + { + "epoch": 0.6132546800698249, + "grad_norm": 1.1495094299316406, + "learning_rate": 6.883145329432427e-05, + "loss": 0.2774, + "step": 2547 + }, + { + "epoch": 0.6134954553662795, + "grad_norm": 1.1022604703903198, + "learning_rate": 6.875737254960573e-05, + "loss": 0.5837, + "step": 2548 + }, + { + "epoch": 0.613736230662734, + "grad_norm": 1.230012059211731, + "learning_rate": 6.86833108004751e-05, + "loss": 0.3625, + "step": 2549 + }, + { + "epoch": 0.6139770059591886, + "grad_norm": 2.1149914264678955, + "learning_rate": 6.860926809196202e-05, + "loss": 0.5104, + "step": 2550 + }, + { + "epoch": 0.6142177812556432, + "grad_norm": 1.968151330947876, + "learning_rate": 6.853524446908469e-05, + "loss": 0.2779, + "step": 2551 + }, + { + "epoch": 0.6144585565520978, + "grad_norm": 2.429940938949585, + "learning_rate": 6.84612399768496e-05, + "loss": 0.6746, + "step": 2552 + }, + { + "epoch": 0.6146993318485523, + "grad_norm": 5.285004138946533, + "learning_rate": 6.838725466025165e-05, + "loss": 1.0503, + "step": 2553 + }, + { + "epoch": 0.6149401071450069, + "grad_norm": 2.815894842147827, + "learning_rate": 6.83132885642741e-05, + "loss": 0.7085, + "step": 2554 + }, + { + "epoch": 0.6151808824414615, + "grad_norm": 0.6075404286384583, + "learning_rate": 6.823934173388851e-05, + "loss": 0.772, + "step": 2555 + }, + { + "epoch": 0.6154216577379161, + "grad_norm": 2.521939992904663, + "learning_rate": 6.81654142140547e-05, + "loss": 0.3452, + "step": 2556 + }, + { + "epoch": 0.6156624330343706, + "grad_norm": 2.4346060752868652, + "learning_rate": 6.809150604972079e-05, + "loss": 0.4844, + "step": 2557 + }, + { + "epoch": 0.6159032083308252, + "grad_norm": 3.9991800785064697, + "learning_rate": 6.801761728582305e-05, + "loss": 0.4445, + "step": 2558 + }, + { + "epoch": 0.6161439836272798, + "grad_norm": 1.3528568744659424, + "learning_rate": 6.794374796728606e-05, + "loss": 0.558, + "step": 2559 + }, + { + "epoch": 0.6163847589237345, + "grad_norm": 1.2810922861099243, + "learning_rate": 6.786989813902256e-05, + "loss": 0.1497, + "step": 2560 + }, + { + "epoch": 0.616625534220189, + "grad_norm": 2.726228713989258, + "learning_rate": 6.779606784593335e-05, + "loss": 0.6956, + "step": 2561 + }, + { + "epoch": 0.6168663095166436, + "grad_norm": 1.971341609954834, + "learning_rate": 6.77222571329075e-05, + "loss": 0.5738, + "step": 2562 + }, + { + "epoch": 0.6171070848130982, + "grad_norm": 1.7685867547988892, + "learning_rate": 6.764846604482198e-05, + "loss": 0.596, + "step": 2563 + }, + { + "epoch": 0.6173478601095528, + "grad_norm": 2.118589162826538, + "learning_rate": 6.7574694626542e-05, + "loss": 0.6178, + "step": 2564 + }, + { + "epoch": 0.6175886354060074, + "grad_norm": 1.7449718713760376, + "learning_rate": 6.750094292292077e-05, + "loss": 0.478, + "step": 2565 + }, + { + "epoch": 0.6178294107024619, + "grad_norm": 3.4893815517425537, + "learning_rate": 6.742721097879944e-05, + "loss": 0.7634, + "step": 2566 + }, + { + "epoch": 0.6180701859989165, + "grad_norm": 1.0635044574737549, + "learning_rate": 6.735349883900723e-05, + "loss": 0.4162, + "step": 2567 + }, + { + "epoch": 0.6183109612953711, + "grad_norm": 3.9489662647247314, + "learning_rate": 6.727980654836128e-05, + "loss": 0.3658, + "step": 2568 + }, + { + "epoch": 0.6185517365918257, + "grad_norm": 0.335372656583786, + "learning_rate": 6.720613415166666e-05, + "loss": 0.3209, + "step": 2569 + }, + { + "epoch": 0.6187925118882802, + "grad_norm": 5.411129474639893, + "learning_rate": 6.71324816937164e-05, + "loss": 0.9387, + "step": 2570 + }, + { + "epoch": 0.6190332871847348, + "grad_norm": 3.5781424045562744, + "learning_rate": 6.705884921929129e-05, + "loss": 0.389, + "step": 2571 + }, + { + "epoch": 0.6192740624811894, + "grad_norm": 1.238693356513977, + "learning_rate": 6.698523677316005e-05, + "loss": 0.2619, + "step": 2572 + }, + { + "epoch": 0.619514837777644, + "grad_norm": 1.2711673974990845, + "learning_rate": 6.691164440007927e-05, + "loss": 0.5968, + "step": 2573 + }, + { + "epoch": 0.6197556130740985, + "grad_norm": 1.706970453262329, + "learning_rate": 6.683807214479323e-05, + "loss": 0.512, + "step": 2574 + }, + { + "epoch": 0.6199963883705532, + "grad_norm": 0.9450774192810059, + "learning_rate": 6.676452005203406e-05, + "loss": 0.1936, + "step": 2575 + }, + { + "epoch": 0.6202371636670078, + "grad_norm": 2.2367053031921387, + "learning_rate": 6.669098816652154e-05, + "loss": 0.7918, + "step": 2576 + }, + { + "epoch": 0.6204779389634624, + "grad_norm": 2.209228277206421, + "learning_rate": 6.661747653296328e-05, + "loss": 0.8082, + "step": 2577 + }, + { + "epoch": 0.620718714259917, + "grad_norm": 2.171247720718384, + "learning_rate": 6.654398519605453e-05, + "loss": 0.1968, + "step": 2578 + }, + { + "epoch": 0.6209594895563715, + "grad_norm": 1.3200645446777344, + "learning_rate": 6.647051420047811e-05, + "loss": 0.3753, + "step": 2579 + }, + { + "epoch": 0.6212002648528261, + "grad_norm": 1.0066230297088623, + "learning_rate": 6.63970635909046e-05, + "loss": 0.2922, + "step": 2580 + }, + { + "epoch": 0.6214410401492807, + "grad_norm": 1.6534557342529297, + "learning_rate": 6.632363341199216e-05, + "loss": 0.1991, + "step": 2581 + }, + { + "epoch": 0.6216818154457353, + "grad_norm": 0.7948190569877625, + "learning_rate": 6.625022370838649e-05, + "loss": 0.6887, + "step": 2582 + }, + { + "epoch": 0.6219225907421898, + "grad_norm": 0.8418195843696594, + "learning_rate": 6.617683452472084e-05, + "loss": 0.3701, + "step": 2583 + }, + { + "epoch": 0.6221633660386444, + "grad_norm": 0.7696940898895264, + "learning_rate": 6.610346590561597e-05, + "loss": 0.2781, + "step": 2584 + }, + { + "epoch": 0.622404141335099, + "grad_norm": 4.234709739685059, + "learning_rate": 6.603011789568021e-05, + "loss": 0.4584, + "step": 2585 + }, + { + "epoch": 0.6226449166315536, + "grad_norm": 2.2976908683776855, + "learning_rate": 6.595679053950933e-05, + "loss": 0.9089, + "step": 2586 + }, + { + "epoch": 0.6228856919280081, + "grad_norm": 3.831660270690918, + "learning_rate": 6.588348388168649e-05, + "loss": 1.088, + "step": 2587 + }, + { + "epoch": 0.6231264672244627, + "grad_norm": 1.276307463645935, + "learning_rate": 6.581019796678231e-05, + "loss": 0.5542, + "step": 2588 + }, + { + "epoch": 0.6233672425209174, + "grad_norm": 1.1596672534942627, + "learning_rate": 6.57369328393548e-05, + "loss": 0.5181, + "step": 2589 + }, + { + "epoch": 0.623608017817372, + "grad_norm": 37.87815856933594, + "learning_rate": 6.566368854394931e-05, + "loss": 0.9077, + "step": 2590 + }, + { + "epoch": 0.6238487931138266, + "grad_norm": 2.3513474464416504, + "learning_rate": 6.55904651250986e-05, + "loss": 0.4156, + "step": 2591 + }, + { + "epoch": 0.6240895684102811, + "grad_norm": 2.0487992763519287, + "learning_rate": 6.551726262732253e-05, + "loss": 0.2226, + "step": 2592 + }, + { + "epoch": 0.6243303437067357, + "grad_norm": 2.3378994464874268, + "learning_rate": 6.54440810951285e-05, + "loss": 0.1514, + "step": 2593 + }, + { + "epoch": 0.6245711190031903, + "grad_norm": 0.9720037579536438, + "learning_rate": 6.537092057301107e-05, + "loss": 0.4916, + "step": 2594 + }, + { + "epoch": 0.6248118942996449, + "grad_norm": 9.787001609802246, + "learning_rate": 6.529778110545191e-05, + "loss": 0.5348, + "step": 2595 + }, + { + "epoch": 0.6250526695960994, + "grad_norm": 1.4247881174087524, + "learning_rate": 6.522466273692006e-05, + "loss": 0.6283, + "step": 2596 + }, + { + "epoch": 0.625293444892554, + "grad_norm": 0.8598988652229309, + "learning_rate": 6.515156551187156e-05, + "loss": 0.4231, + "step": 2597 + }, + { + "epoch": 0.6255342201890086, + "grad_norm": 1.4908655881881714, + "learning_rate": 6.507848947474976e-05, + "loss": 0.5314, + "step": 2598 + }, + { + "epoch": 0.6257749954854632, + "grad_norm": 1.3599947690963745, + "learning_rate": 6.500543466998508e-05, + "loss": 0.6969, + "step": 2599 + }, + { + "epoch": 0.6260157707819177, + "grad_norm": 2.1686439514160156, + "learning_rate": 6.49324011419949e-05, + "loss": 1.0609, + "step": 2600 + }, + { + "epoch": 0.6262565460783723, + "grad_norm": 2.399899482727051, + "learning_rate": 6.48593889351839e-05, + "loss": 0.433, + "step": 2601 + }, + { + "epoch": 0.6264973213748269, + "grad_norm": 2.7841548919677734, + "learning_rate": 6.478639809394355e-05, + "loss": 0.9582, + "step": 2602 + }, + { + "epoch": 0.6267380966712816, + "grad_norm": 1.8142119646072388, + "learning_rate": 6.471342866265251e-05, + "loss": 0.978, + "step": 2603 + }, + { + "epoch": 0.6269788719677362, + "grad_norm": 2.9826908111572266, + "learning_rate": 6.464048068567637e-05, + "loss": 1.4267, + "step": 2604 + }, + { + "epoch": 0.6272196472641907, + "grad_norm": 1.7013874053955078, + "learning_rate": 6.45675542073676e-05, + "loss": 0.7787, + "step": 2605 + }, + { + "epoch": 0.6274604225606453, + "grad_norm": 2.441843032836914, + "learning_rate": 6.44946492720657e-05, + "loss": 0.6874, + "step": 2606 + }, + { + "epoch": 0.6277011978570999, + "grad_norm": 2.108856439590454, + "learning_rate": 6.44217659240971e-05, + "loss": 0.1561, + "step": 2607 + }, + { + "epoch": 0.6279419731535545, + "grad_norm": 2.3653995990753174, + "learning_rate": 6.434890420777491e-05, + "loss": 1.1851, + "step": 2608 + }, + { + "epoch": 0.628182748450009, + "grad_norm": 2.6391708850860596, + "learning_rate": 6.427606416739932e-05, + "loss": 0.7138, + "step": 2609 + }, + { + "epoch": 0.6284235237464636, + "grad_norm": 2.129570484161377, + "learning_rate": 6.420324584725719e-05, + "loss": 0.5445, + "step": 2610 + }, + { + "epoch": 0.6286642990429182, + "grad_norm": 2.4864790439605713, + "learning_rate": 6.413044929162221e-05, + "loss": 0.3383, + "step": 2611 + }, + { + "epoch": 0.6289050743393728, + "grad_norm": 1.6433722972869873, + "learning_rate": 6.405767454475492e-05, + "loss": 0.5752, + "step": 2612 + }, + { + "epoch": 0.6291458496358273, + "grad_norm": 1.772709846496582, + "learning_rate": 6.398492165090246e-05, + "loss": 0.4523, + "step": 2613 + }, + { + "epoch": 0.6293866249322819, + "grad_norm": 1.9673948287963867, + "learning_rate": 6.391219065429882e-05, + "loss": 0.7943, + "step": 2614 + }, + { + "epoch": 0.6296274002287365, + "grad_norm": 1.1356581449508667, + "learning_rate": 6.383948159916453e-05, + "loss": 0.4172, + "step": 2615 + }, + { + "epoch": 0.6298681755251911, + "grad_norm": 1.4601411819458008, + "learning_rate": 6.376679452970689e-05, + "loss": 0.6976, + "step": 2616 + }, + { + "epoch": 0.6301089508216458, + "grad_norm": 1.9286946058273315, + "learning_rate": 6.369412949011983e-05, + "loss": 0.4763, + "step": 2617 + }, + { + "epoch": 0.6303497261181003, + "grad_norm": 44.064327239990234, + "learning_rate": 6.362148652458382e-05, + "loss": 0.3406, + "step": 2618 + }, + { + "epoch": 0.6305905014145549, + "grad_norm": 1.2986640930175781, + "learning_rate": 6.354886567726596e-05, + "loss": 0.3705, + "step": 2619 + }, + { + "epoch": 0.6308312767110095, + "grad_norm": 2.005955219268799, + "learning_rate": 6.347626699231995e-05, + "loss": 0.7816, + "step": 2620 + }, + { + "epoch": 0.6310720520074641, + "grad_norm": 5.238460063934326, + "learning_rate": 6.340369051388583e-05, + "loss": 0.8997, + "step": 2621 + }, + { + "epoch": 0.6313128273039186, + "grad_norm": 2.330120801925659, + "learning_rate": 6.33311362860904e-05, + "loss": 1.2146, + "step": 2622 + }, + { + "epoch": 0.6315536026003732, + "grad_norm": 1.597697138786316, + "learning_rate": 6.325860435304668e-05, + "loss": 0.5971, + "step": 2623 + }, + { + "epoch": 0.6317943778968278, + "grad_norm": 3.000946521759033, + "learning_rate": 6.318609475885427e-05, + "loss": 1.0077, + "step": 2624 + }, + { + "epoch": 0.6320351531932824, + "grad_norm": 1.6477075815200806, + "learning_rate": 6.311360754759923e-05, + "loss": 0.2647, + "step": 2625 + }, + { + "epoch": 0.6322759284897369, + "grad_norm": 3.0683581829071045, + "learning_rate": 6.30411427633539e-05, + "loss": 0.5957, + "step": 2626 + }, + { + "epoch": 0.6325167037861915, + "grad_norm": 2.284569025039673, + "learning_rate": 6.296870045017704e-05, + "loss": 0.5671, + "step": 2627 + }, + { + "epoch": 0.6327574790826461, + "grad_norm": 1.2169504165649414, + "learning_rate": 6.28962806521137e-05, + "loss": 0.8968, + "step": 2628 + }, + { + "epoch": 0.6329982543791007, + "grad_norm": 0.7323624491691589, + "learning_rate": 6.282388341319534e-05, + "loss": 0.2924, + "step": 2629 + }, + { + "epoch": 0.6332390296755553, + "grad_norm": 1.0401545763015747, + "learning_rate": 6.275150877743968e-05, + "loss": 0.4025, + "step": 2630 + }, + { + "epoch": 0.6334798049720098, + "grad_norm": 1.7448832988739014, + "learning_rate": 6.267915678885054e-05, + "loss": 0.5393, + "step": 2631 + }, + { + "epoch": 0.6337205802684645, + "grad_norm": 2.505074977874756, + "learning_rate": 6.260682749141816e-05, + "loss": 0.5271, + "step": 2632 + }, + { + "epoch": 0.6339613555649191, + "grad_norm": 1.562775731086731, + "learning_rate": 6.253452092911893e-05, + "loss": 0.6703, + "step": 2633 + }, + { + "epoch": 0.6342021308613737, + "grad_norm": 7.656639099121094, + "learning_rate": 6.24622371459154e-05, + "loss": 0.6791, + "step": 2634 + }, + { + "epoch": 0.6344429061578282, + "grad_norm": 1.8805732727050781, + "learning_rate": 6.238997618575625e-05, + "loss": 0.5853, + "step": 2635 + }, + { + "epoch": 0.6346836814542828, + "grad_norm": 2.444486141204834, + "learning_rate": 6.231773809257631e-05, + "loss": 0.463, + "step": 2636 + }, + { + "epoch": 0.6349244567507374, + "grad_norm": 2.5845704078674316, + "learning_rate": 6.224552291029648e-05, + "loss": 1.1734, + "step": 2637 + }, + { + "epoch": 0.635165232047192, + "grad_norm": 2.266542434692383, + "learning_rate": 6.217333068282383e-05, + "loss": 0.6734, + "step": 2638 + }, + { + "epoch": 0.6354060073436465, + "grad_norm": 1.0296664237976074, + "learning_rate": 6.210116145405132e-05, + "loss": 0.5709, + "step": 2639 + }, + { + "epoch": 0.6356467826401011, + "grad_norm": 1.0958337783813477, + "learning_rate": 6.202901526785806e-05, + "loss": 0.4469, + "step": 2640 + }, + { + "epoch": 0.6358875579365557, + "grad_norm": 4.823302745819092, + "learning_rate": 6.195689216810903e-05, + "loss": 0.5806, + "step": 2641 + }, + { + "epoch": 0.6361283332330103, + "grad_norm": 2.3272790908813477, + "learning_rate": 6.188479219865529e-05, + "loss": 0.7147, + "step": 2642 + }, + { + "epoch": 0.6363691085294648, + "grad_norm": 1.8934577703475952, + "learning_rate": 6.181271540333379e-05, + "loss": 0.4858, + "step": 2643 + }, + { + "epoch": 0.6366098838259194, + "grad_norm": 3.648973226547241, + "learning_rate": 6.174066182596734e-05, + "loss": 0.6624, + "step": 2644 + }, + { + "epoch": 0.636850659122374, + "grad_norm": 1.5058507919311523, + "learning_rate": 6.166863151036468e-05, + "loss": 0.4902, + "step": 2645 + }, + { + "epoch": 0.6370914344188287, + "grad_norm": 1.450817584991455, + "learning_rate": 6.159662450032046e-05, + "loss": 0.5773, + "step": 2646 + }, + { + "epoch": 0.6373322097152833, + "grad_norm": 1.6548353433609009, + "learning_rate": 6.152464083961506e-05, + "loss": 0.7833, + "step": 2647 + }, + { + "epoch": 0.6375729850117378, + "grad_norm": 2.030325174331665, + "learning_rate": 6.145268057201473e-05, + "loss": 0.3633, + "step": 2648 + }, + { + "epoch": 0.6378137603081924, + "grad_norm": 0.9832929968833923, + "learning_rate": 6.138074374127141e-05, + "loss": 0.8045, + "step": 2649 + }, + { + "epoch": 0.638054535604647, + "grad_norm": 3.2297143936157227, + "learning_rate": 6.130883039112292e-05, + "loss": 0.8928, + "step": 2650 + }, + { + "epoch": 0.6382953109011016, + "grad_norm": 1.0606904029846191, + "learning_rate": 6.123694056529277e-05, + "loss": 0.2497, + "step": 2651 + }, + { + "epoch": 0.6385360861975561, + "grad_norm": 1.3311941623687744, + "learning_rate": 6.116507430749005e-05, + "loss": 1.0808, + "step": 2652 + }, + { + "epoch": 0.6387768614940107, + "grad_norm": 1.9613722562789917, + "learning_rate": 6.109323166140968e-05, + "loss": 1.0504, + "step": 2653 + }, + { + "epoch": 0.6390176367904653, + "grad_norm": 1.6116615533828735, + "learning_rate": 6.102141267073207e-05, + "loss": 0.5613, + "step": 2654 + }, + { + "epoch": 0.6392584120869199, + "grad_norm": 1.1715492010116577, + "learning_rate": 6.094961737912339e-05, + "loss": 0.3594, + "step": 2655 + }, + { + "epoch": 0.6394991873833744, + "grad_norm": 1.0914250612258911, + "learning_rate": 6.087784583023535e-05, + "loss": 0.4884, + "step": 2656 + }, + { + "epoch": 0.639739962679829, + "grad_norm": 3.499825954437256, + "learning_rate": 6.080609806770516e-05, + "loss": 1.0449, + "step": 2657 + }, + { + "epoch": 0.6399807379762836, + "grad_norm": 1.0770723819732666, + "learning_rate": 6.073437413515566e-05, + "loss": 0.5479, + "step": 2658 + }, + { + "epoch": 0.6402215132727382, + "grad_norm": 1.7570230960845947, + "learning_rate": 6.0662674076195194e-05, + "loss": 0.3681, + "step": 2659 + }, + { + "epoch": 0.6404622885691929, + "grad_norm": 1.4941871166229248, + "learning_rate": 6.059099793441746e-05, + "loss": 0.4912, + "step": 2660 + }, + { + "epoch": 0.6407030638656473, + "grad_norm": 2.4457526206970215, + "learning_rate": 6.05193457534018e-05, + "loss": 0.7875, + "step": 2661 + }, + { + "epoch": 0.640943839162102, + "grad_norm": 0.5564669966697693, + "learning_rate": 6.044771757671286e-05, + "loss": 0.4049, + "step": 2662 + }, + { + "epoch": 0.6411846144585566, + "grad_norm": 2.4950759410858154, + "learning_rate": 6.037611344790073e-05, + "loss": 0.7389, + "step": 2663 + }, + { + "epoch": 0.6414253897550112, + "grad_norm": 4.833079814910889, + "learning_rate": 6.030453341050093e-05, + "loss": 0.9588, + "step": 2664 + }, + { + "epoch": 0.6416661650514657, + "grad_norm": 5.021981716156006, + "learning_rate": 6.023297750803423e-05, + "loss": 0.9724, + "step": 2665 + }, + { + "epoch": 0.6419069403479203, + "grad_norm": 1.1995772123336792, + "learning_rate": 6.0161445784006845e-05, + "loss": 0.6185, + "step": 2666 + }, + { + "epoch": 0.6421477156443749, + "grad_norm": 1.3051928281784058, + "learning_rate": 6.008993828191013e-05, + "loss": 0.382, + "step": 2667 + }, + { + "epoch": 0.6423884909408295, + "grad_norm": 3.9973623752593994, + "learning_rate": 6.001845504522086e-05, + "loss": 0.4625, + "step": 2668 + }, + { + "epoch": 0.642629266237284, + "grad_norm": 1.3775721788406372, + "learning_rate": 5.994699611740102e-05, + "loss": 0.5969, + "step": 2669 + }, + { + "epoch": 0.6428700415337386, + "grad_norm": 1.1527438163757324, + "learning_rate": 5.987556154189777e-05, + "loss": 0.6209, + "step": 2670 + }, + { + "epoch": 0.6431108168301932, + "grad_norm": 1.8281904458999634, + "learning_rate": 5.98041513621435e-05, + "loss": 0.636, + "step": 2671 + }, + { + "epoch": 0.6433515921266478, + "grad_norm": 2.8507192134857178, + "learning_rate": 5.973276562155581e-05, + "loss": 0.8928, + "step": 2672 + }, + { + "epoch": 0.6435923674231024, + "grad_norm": 2.5010929107666016, + "learning_rate": 5.9661404363537287e-05, + "loss": 1.1794, + "step": 2673 + }, + { + "epoch": 0.6438331427195569, + "grad_norm": 2.6095519065856934, + "learning_rate": 5.959006763147584e-05, + "loss": 1.2778, + "step": 2674 + }, + { + "epoch": 0.6440739180160115, + "grad_norm": 1.2294814586639404, + "learning_rate": 5.951875546874428e-05, + "loss": 0.3187, + "step": 2675 + }, + { + "epoch": 0.6443146933124662, + "grad_norm": 4.2607316970825195, + "learning_rate": 5.9447467918700614e-05, + "loss": 0.4591, + "step": 2676 + }, + { + "epoch": 0.6445554686089208, + "grad_norm": 1.8376699686050415, + "learning_rate": 5.9376205024687835e-05, + "loss": 0.6134, + "step": 2677 + }, + { + "epoch": 0.6447962439053753, + "grad_norm": 1.8157228231430054, + "learning_rate": 5.9304966830033907e-05, + "loss": 0.6103, + "step": 2678 + }, + { + "epoch": 0.6450370192018299, + "grad_norm": 1.8555665016174316, + "learning_rate": 5.923375337805186e-05, + "loss": 0.5274, + "step": 2679 + }, + { + "epoch": 0.6452777944982845, + "grad_norm": 1.7795042991638184, + "learning_rate": 5.916256471203958e-05, + "loss": 0.6227, + "step": 2680 + }, + { + "epoch": 0.6455185697947391, + "grad_norm": 2.5843842029571533, + "learning_rate": 5.909140087527996e-05, + "loss": 1.0466, + "step": 2681 + }, + { + "epoch": 0.6457593450911936, + "grad_norm": 3.1203784942626953, + "learning_rate": 5.9020261911040796e-05, + "loss": 0.874, + "step": 2682 + }, + { + "epoch": 0.6460001203876482, + "grad_norm": 2.341921091079712, + "learning_rate": 5.89491478625747e-05, + "loss": 0.3058, + "step": 2683 + }, + { + "epoch": 0.6462408956841028, + "grad_norm": 3.351788282394409, + "learning_rate": 5.8878058773119185e-05, + "loss": 0.7692, + "step": 2684 + }, + { + "epoch": 0.6464816709805574, + "grad_norm": 3.12622332572937, + "learning_rate": 5.880699468589661e-05, + "loss": 1.2673, + "step": 2685 + }, + { + "epoch": 0.646722446277012, + "grad_norm": 1.2435944080352783, + "learning_rate": 5.8735955644114046e-05, + "loss": 0.7739, + "step": 2686 + }, + { + "epoch": 0.6469632215734665, + "grad_norm": 0.624167799949646, + "learning_rate": 5.866494169096348e-05, + "loss": 0.7543, + "step": 2687 + }, + { + "epoch": 0.6472039968699211, + "grad_norm": 1.9075957536697388, + "learning_rate": 5.8593952869621436e-05, + "loss": 0.4008, + "step": 2688 + }, + { + "epoch": 0.6474447721663757, + "grad_norm": 2.9206345081329346, + "learning_rate": 5.852298922324935e-05, + "loss": 1.4938, + "step": 2689 + }, + { + "epoch": 0.6476855474628304, + "grad_norm": 2.629241704940796, + "learning_rate": 5.8452050794993275e-05, + "loss": 0.55, + "step": 2690 + }, + { + "epoch": 0.6479263227592849, + "grad_norm": 1.5018374919891357, + "learning_rate": 5.8381137627983915e-05, + "loss": 0.6912, + "step": 2691 + }, + { + "epoch": 0.6481670980557395, + "grad_norm": 4.23275089263916, + "learning_rate": 5.831024976533668e-05, + "loss": 0.6019, + "step": 2692 + }, + { + "epoch": 0.6484078733521941, + "grad_norm": 1.633299708366394, + "learning_rate": 5.823938725015148e-05, + "loss": 0.5051, + "step": 2693 + }, + { + "epoch": 0.6486486486486487, + "grad_norm": 7.270058631896973, + "learning_rate": 5.816855012551291e-05, + "loss": 0.6392, + "step": 2694 + }, + { + "epoch": 0.6488894239451032, + "grad_norm": 1.3112475872039795, + "learning_rate": 5.809773843449011e-05, + "loss": 0.4469, + "step": 2695 + }, + { + "epoch": 0.6491301992415578, + "grad_norm": 1.903131365776062, + "learning_rate": 5.802695222013676e-05, + "loss": 0.381, + "step": 2696 + }, + { + "epoch": 0.6493709745380124, + "grad_norm": 2.8745779991149902, + "learning_rate": 5.795619152549102e-05, + "loss": 0.8397, + "step": 2697 + }, + { + "epoch": 0.649611749834467, + "grad_norm": 5.189337730407715, + "learning_rate": 5.78854563935756e-05, + "loss": 0.4703, + "step": 2698 + }, + { + "epoch": 0.6498525251309216, + "grad_norm": 1.0739030838012695, + "learning_rate": 5.781474686739754e-05, + "loss": 0.6484, + "step": 2699 + }, + { + "epoch": 0.6500933004273761, + "grad_norm": 1.2836296558380127, + "learning_rate": 5.7744062989948464e-05, + "loss": 0.5722, + "step": 2700 + }, + { + "epoch": 0.6503340757238307, + "grad_norm": 7.686650276184082, + "learning_rate": 5.767340480420426e-05, + "loss": 0.4963, + "step": 2701 + }, + { + "epoch": 0.6505748510202853, + "grad_norm": 2.3548178672790527, + "learning_rate": 5.760277235312529e-05, + "loss": 0.757, + "step": 2702 + }, + { + "epoch": 0.65081562631674, + "grad_norm": 1.2598122358322144, + "learning_rate": 5.753216567965626e-05, + "loss": 0.397, + "step": 2703 + }, + { + "epoch": 0.6510564016131944, + "grad_norm": 2.0660560131073, + "learning_rate": 5.746158482672617e-05, + "loss": 0.8815, + "step": 2704 + }, + { + "epoch": 0.6512971769096491, + "grad_norm": 1.225748896598816, + "learning_rate": 5.73910298372484e-05, + "loss": 0.8635, + "step": 2705 + }, + { + "epoch": 0.6515379522061037, + "grad_norm": 1.400314211845398, + "learning_rate": 5.7320500754120434e-05, + "loss": 0.5746, + "step": 2706 + }, + { + "epoch": 0.6517787275025583, + "grad_norm": 2.5364537239074707, + "learning_rate": 5.724999762022416e-05, + "loss": 0.657, + "step": 2707 + }, + { + "epoch": 0.6520195027990128, + "grad_norm": 2.604710340499878, + "learning_rate": 5.717952047842571e-05, + "loss": 0.564, + "step": 2708 + }, + { + "epoch": 0.6522602780954674, + "grad_norm": 1.6602225303649902, + "learning_rate": 5.710906937157523e-05, + "loss": 0.7511, + "step": 2709 + }, + { + "epoch": 0.652501053391922, + "grad_norm": 1.7182340621948242, + "learning_rate": 5.7038644342507205e-05, + "loss": 0.3137, + "step": 2710 + }, + { + "epoch": 0.6527418286883766, + "grad_norm": 1.6290512084960938, + "learning_rate": 5.6968245434040225e-05, + "loss": 0.8388, + "step": 2711 + }, + { + "epoch": 0.6529826039848312, + "grad_norm": 1.5818265676498413, + "learning_rate": 5.689787268897697e-05, + "loss": 0.3487, + "step": 2712 + }, + { + "epoch": 0.6532233792812857, + "grad_norm": 3.120393753051758, + "learning_rate": 5.682752615010427e-05, + "loss": 0.9698, + "step": 2713 + }, + { + "epoch": 0.6534641545777403, + "grad_norm": 1.175986886024475, + "learning_rate": 5.6757205860192905e-05, + "loss": 0.44, + "step": 2714 + }, + { + "epoch": 0.6537049298741949, + "grad_norm": 1.2981712818145752, + "learning_rate": 5.6686911861997795e-05, + "loss": 0.2365, + "step": 2715 + }, + { + "epoch": 0.6539457051706495, + "grad_norm": 1.5336500406265259, + "learning_rate": 5.66166441982579e-05, + "loss": 0.9387, + "step": 2716 + }, + { + "epoch": 0.654186480467104, + "grad_norm": 1.408539056777954, + "learning_rate": 5.654640291169604e-05, + "loss": 0.6778, + "step": 2717 + }, + { + "epoch": 0.6544272557635586, + "grad_norm": 0.782248854637146, + "learning_rate": 5.647618804501915e-05, + "loss": 0.803, + "step": 2718 + }, + { + "epoch": 0.6546680310600133, + "grad_norm": 2.8610918521881104, + "learning_rate": 5.640599964091791e-05, + "loss": 0.828, + "step": 2719 + }, + { + "epoch": 0.6549088063564679, + "grad_norm": 2.616344690322876, + "learning_rate": 5.6335837742067145e-05, + "loss": 0.8763, + "step": 2720 + }, + { + "epoch": 0.6551495816529224, + "grad_norm": 2.1979098320007324, + "learning_rate": 5.6265702391125444e-05, + "loss": 0.383, + "step": 2721 + }, + { + "epoch": 0.655390356949377, + "grad_norm": 2.1095833778381348, + "learning_rate": 5.6195593630735185e-05, + "loss": 0.6834, + "step": 2722 + }, + { + "epoch": 0.6556311322458316, + "grad_norm": 3.0333285331726074, + "learning_rate": 5.61255115035227e-05, + "loss": 0.4893, + "step": 2723 + }, + { + "epoch": 0.6558719075422862, + "grad_norm": 3.033856153488159, + "learning_rate": 5.60554560520981e-05, + "loss": 0.6269, + "step": 2724 + }, + { + "epoch": 0.6561126828387407, + "grad_norm": 1.2351047992706299, + "learning_rate": 5.5985427319055195e-05, + "loss": 0.2402, + "step": 2725 + }, + { + "epoch": 0.6563534581351953, + "grad_norm": 0.572399914264679, + "learning_rate": 5.5915425346971683e-05, + "loss": 0.4192, + "step": 2726 + }, + { + "epoch": 0.6565942334316499, + "grad_norm": 16.815656661987305, + "learning_rate": 5.584545017840885e-05, + "loss": 0.7806, + "step": 2727 + }, + { + "epoch": 0.6568350087281045, + "grad_norm": 1.1614093780517578, + "learning_rate": 5.577550185591174e-05, + "loss": 0.7885, + "step": 2728 + }, + { + "epoch": 0.6570757840245591, + "grad_norm": 0.8286291360855103, + "learning_rate": 5.570558042200923e-05, + "loss": 0.3164, + "step": 2729 + }, + { + "epoch": 0.6573165593210136, + "grad_norm": 1.0357860326766968, + "learning_rate": 5.563568591921358e-05, + "loss": 0.2793, + "step": 2730 + }, + { + "epoch": 0.6575573346174682, + "grad_norm": 1.7955443859100342, + "learning_rate": 5.5565818390020886e-05, + "loss": 0.4327, + "step": 2731 + }, + { + "epoch": 0.6577981099139228, + "grad_norm": 25.76378631591797, + "learning_rate": 5.5495977876910675e-05, + "loss": 1.1611, + "step": 2732 + }, + { + "epoch": 0.6580388852103775, + "grad_norm": 1.9328374862670898, + "learning_rate": 5.542616442234618e-05, + "loss": 0.4974, + "step": 2733 + }, + { + "epoch": 0.658279660506832, + "grad_norm": 1.2386356592178345, + "learning_rate": 5.535637806877419e-05, + "loss": 0.8787, + "step": 2734 + }, + { + "epoch": 0.6585204358032866, + "grad_norm": 3.446852684020996, + "learning_rate": 5.5286618858624874e-05, + "loss": 0.4847, + "step": 2735 + }, + { + "epoch": 0.6587612110997412, + "grad_norm": 1.4139548540115356, + "learning_rate": 5.5216886834312e-05, + "loss": 0.6242, + "step": 2736 + }, + { + "epoch": 0.6590019863961958, + "grad_norm": 0.9220794439315796, + "learning_rate": 5.51471820382329e-05, + "loss": 0.3538, + "step": 2737 + }, + { + "epoch": 0.6592427616926503, + "grad_norm": 0.6702575087547302, + "learning_rate": 5.507750451276814e-05, + "loss": 0.4045, + "step": 2738 + }, + { + "epoch": 0.6594835369891049, + "grad_norm": 0.9865245223045349, + "learning_rate": 5.500785430028188e-05, + "loss": 0.4853, + "step": 2739 + }, + { + "epoch": 0.6597243122855595, + "grad_norm": 0.48536545038223267, + "learning_rate": 5.4938231443121546e-05, + "loss": 0.2637, + "step": 2740 + }, + { + "epoch": 0.6599650875820141, + "grad_norm": 10.391051292419434, + "learning_rate": 5.4868635983618014e-05, + "loss": 0.4469, + "step": 2741 + }, + { + "epoch": 0.6602058628784687, + "grad_norm": 1.1766020059585571, + "learning_rate": 5.4799067964085526e-05, + "loss": 0.3332, + "step": 2742 + }, + { + "epoch": 0.6604466381749232, + "grad_norm": 1.8510931730270386, + "learning_rate": 5.4729527426821514e-05, + "loss": 0.6154, + "step": 2743 + }, + { + "epoch": 0.6606874134713778, + "grad_norm": 1.353156328201294, + "learning_rate": 5.466001441410682e-05, + "loss": 0.3427, + "step": 2744 + }, + { + "epoch": 0.6609281887678324, + "grad_norm": 1.8044596910476685, + "learning_rate": 5.459052896820551e-05, + "loss": 0.2089, + "step": 2745 + }, + { + "epoch": 0.661168964064287, + "grad_norm": 2.0654067993164062, + "learning_rate": 5.4521071131364906e-05, + "loss": 0.781, + "step": 2746 + }, + { + "epoch": 0.6614097393607415, + "grad_norm": 1.7442854642868042, + "learning_rate": 5.4451640945815564e-05, + "loss": 0.3432, + "step": 2747 + }, + { + "epoch": 0.6616505146571962, + "grad_norm": 1.4665995836257935, + "learning_rate": 5.438223845377111e-05, + "loss": 0.5632, + "step": 2748 + }, + { + "epoch": 0.6618912899536508, + "grad_norm": 3.585455894470215, + "learning_rate": 5.431286369742844e-05, + "loss": 1.3084, + "step": 2749 + }, + { + "epoch": 0.6621320652501054, + "grad_norm": 3.5575718879699707, + "learning_rate": 5.424351671896761e-05, + "loss": 0.3184, + "step": 2750 + }, + { + "epoch": 0.6623728405465599, + "grad_norm": 17.34886932373047, + "learning_rate": 5.4174197560551685e-05, + "loss": 1.1173, + "step": 2751 + }, + { + "epoch": 0.6626136158430145, + "grad_norm": 2.6678974628448486, + "learning_rate": 5.4104906264326884e-05, + "loss": 0.6505, + "step": 2752 + }, + { + "epoch": 0.6628543911394691, + "grad_norm": 4.722652912139893, + "learning_rate": 5.403564287242248e-05, + "loss": 1.3932, + "step": 2753 + }, + { + "epoch": 0.6630951664359237, + "grad_norm": 1.4697628021240234, + "learning_rate": 5.396640742695076e-05, + "loss": 0.5875, + "step": 2754 + }, + { + "epoch": 0.6633359417323783, + "grad_norm": 2.3093106746673584, + "learning_rate": 5.389719997000708e-05, + "loss": 0.8362, + "step": 2755 + }, + { + "epoch": 0.6635767170288328, + "grad_norm": 1.6689960956573486, + "learning_rate": 5.382802054366966e-05, + "loss": 0.3395, + "step": 2756 + }, + { + "epoch": 0.6638174923252874, + "grad_norm": 1.0874135494232178, + "learning_rate": 5.37588691899998e-05, + "loss": 0.757, + "step": 2757 + }, + { + "epoch": 0.664058267621742, + "grad_norm": 1.348419427871704, + "learning_rate": 5.3689745951041626e-05, + "loss": 0.6488, + "step": 2758 + }, + { + "epoch": 0.6642990429181966, + "grad_norm": 2.6158974170684814, + "learning_rate": 5.3620650868822256e-05, + "loss": 0.251, + "step": 2759 + }, + { + "epoch": 0.6645398182146511, + "grad_norm": 1.7711026668548584, + "learning_rate": 5.3551583985351636e-05, + "loss": 0.607, + "step": 2760 + }, + { + "epoch": 0.6647805935111057, + "grad_norm": 1.2356005907058716, + "learning_rate": 5.348254534262262e-05, + "loss": 0.2628, + "step": 2761 + }, + { + "epoch": 0.6650213688075604, + "grad_norm": 4.986912727355957, + "learning_rate": 5.3413534982610836e-05, + "loss": 0.4364, + "step": 2762 + }, + { + "epoch": 0.665262144104015, + "grad_norm": 2.007636785507202, + "learning_rate": 5.3344552947274776e-05, + "loss": 0.6385, + "step": 2763 + }, + { + "epoch": 0.6655029194004695, + "grad_norm": 1.343567132949829, + "learning_rate": 5.32755992785556e-05, + "loss": 0.4478, + "step": 2764 + }, + { + "epoch": 0.6657436946969241, + "grad_norm": 0.5190923810005188, + "learning_rate": 5.320667401837738e-05, + "loss": 0.5346, + "step": 2765 + }, + { + "epoch": 0.6659844699933787, + "grad_norm": 1.11277437210083, + "learning_rate": 5.313777720864674e-05, + "loss": 0.2615, + "step": 2766 + }, + { + "epoch": 0.6662252452898333, + "grad_norm": 1.8751391172409058, + "learning_rate": 5.3068908891253134e-05, + "loss": 0.632, + "step": 2767 + }, + { + "epoch": 0.6664660205862879, + "grad_norm": 1.2252461910247803, + "learning_rate": 5.3000069108068674e-05, + "loss": 0.2374, + "step": 2768 + }, + { + "epoch": 0.6667067958827424, + "grad_norm": 0.743096113204956, + "learning_rate": 5.293125790094809e-05, + "loss": 0.4381, + "step": 2769 + }, + { + "epoch": 0.666947571179197, + "grad_norm": 1.818084478378296, + "learning_rate": 5.286247531172877e-05, + "loss": 0.3025, + "step": 2770 + }, + { + "epoch": 0.6671883464756516, + "grad_norm": 1.5191317796707153, + "learning_rate": 5.2793721382230624e-05, + "loss": 0.8121, + "step": 2771 + }, + { + "epoch": 0.6674291217721062, + "grad_norm": 1.5100692510604858, + "learning_rate": 5.272499615425624e-05, + "loss": 0.2726, + "step": 2772 + }, + { + "epoch": 0.6676698970685607, + "grad_norm": 3.111989974975586, + "learning_rate": 5.2656299669590756e-05, + "loss": 0.9423, + "step": 2773 + }, + { + "epoch": 0.6679106723650153, + "grad_norm": 2.021303415298462, + "learning_rate": 5.2587631970001697e-05, + "loss": 0.6443, + "step": 2774 + }, + { + "epoch": 0.6681514476614699, + "grad_norm": 3.0573856830596924, + "learning_rate": 5.251899309723921e-05, + "loss": 0.7831, + "step": 2775 + }, + { + "epoch": 0.6683922229579246, + "grad_norm": 3.3146016597747803, + "learning_rate": 5.2450383093035905e-05, + "loss": 0.4461, + "step": 2776 + }, + { + "epoch": 0.668632998254379, + "grad_norm": 2.406733989715576, + "learning_rate": 5.2381801999106806e-05, + "loss": 0.9433, + "step": 2777 + }, + { + "epoch": 0.6688737735508337, + "grad_norm": 1.2738914489746094, + "learning_rate": 5.2313249857149414e-05, + "loss": 0.5913, + "step": 2778 + }, + { + "epoch": 0.6691145488472883, + "grad_norm": 1.1874566078186035, + "learning_rate": 5.2244726708843516e-05, + "loss": 0.6504, + "step": 2779 + }, + { + "epoch": 0.6693553241437429, + "grad_norm": 2.1480422019958496, + "learning_rate": 5.217623259585136e-05, + "loss": 0.9203, + "step": 2780 + }, + { + "epoch": 0.6695960994401975, + "grad_norm": 3.107542037963867, + "learning_rate": 5.2107767559817586e-05, + "loss": 0.4462, + "step": 2781 + }, + { + "epoch": 0.669836874736652, + "grad_norm": 1.504056453704834, + "learning_rate": 5.2039331642369004e-05, + "loss": 0.6573, + "step": 2782 + }, + { + "epoch": 0.6700776500331066, + "grad_norm": 0.6759874224662781, + "learning_rate": 5.197092488511482e-05, + "loss": 0.284, + "step": 2783 + }, + { + "epoch": 0.6703184253295612, + "grad_norm": 0.775605320930481, + "learning_rate": 5.1902547329646536e-05, + "loss": 0.5416, + "step": 2784 + }, + { + "epoch": 0.6705592006260158, + "grad_norm": 0.199252650141716, + "learning_rate": 5.1834199017537834e-05, + "loss": 0.4752, + "step": 2785 + }, + { + "epoch": 0.6707999759224703, + "grad_norm": 3.240574359893799, + "learning_rate": 5.176587999034468e-05, + "loss": 0.6012, + "step": 2786 + }, + { + "epoch": 0.6710407512189249, + "grad_norm": 3.1757545471191406, + "learning_rate": 5.1697590289605136e-05, + "loss": 1.1006, + "step": 2787 + }, + { + "epoch": 0.6712815265153795, + "grad_norm": 1.6893806457519531, + "learning_rate": 5.162932995683951e-05, + "loss": 0.7829, + "step": 2788 + }, + { + "epoch": 0.6715223018118341, + "grad_norm": 1.8709641695022583, + "learning_rate": 5.156109903355031e-05, + "loss": 0.9529, + "step": 2789 + }, + { + "epoch": 0.6717630771082886, + "grad_norm": 1.4354405403137207, + "learning_rate": 5.1492897561221976e-05, + "loss": 0.3602, + "step": 2790 + }, + { + "epoch": 0.6720038524047433, + "grad_norm": 1.0905989408493042, + "learning_rate": 5.142472558132125e-05, + "loss": 0.6715, + "step": 2791 + }, + { + "epoch": 0.6722446277011979, + "grad_norm": 0.8870560526847839, + "learning_rate": 5.1356583135296744e-05, + "loss": 0.3977, + "step": 2792 + }, + { + "epoch": 0.6724854029976525, + "grad_norm": 2.73464298248291, + "learning_rate": 5.1288470264579327e-05, + "loss": 0.8096, + "step": 2793 + }, + { + "epoch": 0.6727261782941071, + "grad_norm": 3.191851854324341, + "learning_rate": 5.122038701058176e-05, + "loss": 0.8286, + "step": 2794 + }, + { + "epoch": 0.6729669535905616, + "grad_norm": 0.7643956542015076, + "learning_rate": 5.115233341469877e-05, + "loss": 0.3281, + "step": 2795 + }, + { + "epoch": 0.6732077288870162, + "grad_norm": 4.106560230255127, + "learning_rate": 5.108430951830716e-05, + "loss": 0.7662, + "step": 2796 + }, + { + "epoch": 0.6734485041834708, + "grad_norm": 1.1065987348556519, + "learning_rate": 5.101631536276552e-05, + "loss": 0.9248, + "step": 2797 + }, + { + "epoch": 0.6736892794799254, + "grad_norm": 2.439703941345215, + "learning_rate": 5.094835098941451e-05, + "loss": 1.0613, + "step": 2798 + }, + { + "epoch": 0.6739300547763799, + "grad_norm": 1.7999013662338257, + "learning_rate": 5.088041643957664e-05, + "loss": 0.4121, + "step": 2799 + }, + { + "epoch": 0.6741708300728345, + "grad_norm": 2.1599056720733643, + "learning_rate": 5.081251175455617e-05, + "loss": 0.3685, + "step": 2800 + }, + { + "epoch": 0.6744116053692891, + "grad_norm": 3.4274089336395264, + "learning_rate": 5.0744636975639424e-05, + "loss": 0.3434, + "step": 2801 + }, + { + "epoch": 0.6746523806657437, + "grad_norm": 1.6600308418273926, + "learning_rate": 5.06767921440944e-05, + "loss": 0.402, + "step": 2802 + }, + { + "epoch": 0.6748931559621982, + "grad_norm": 1.426318883895874, + "learning_rate": 5.0608977301170845e-05, + "loss": 0.2329, + "step": 2803 + }, + { + "epoch": 0.6751339312586528, + "grad_norm": 1.2392008304595947, + "learning_rate": 5.05411924881004e-05, + "loss": 0.5493, + "step": 2804 + }, + { + "epoch": 0.6753747065551075, + "grad_norm": 0.7665623426437378, + "learning_rate": 5.047343774609632e-05, + "loss": 0.3614, + "step": 2805 + }, + { + "epoch": 0.6756154818515621, + "grad_norm": 2.566469192504883, + "learning_rate": 5.040571311635367e-05, + "loss": 0.7117, + "step": 2806 + }, + { + "epoch": 0.6758562571480166, + "grad_norm": 1.9703136682510376, + "learning_rate": 5.033801864004923e-05, + "loss": 0.7024, + "step": 2807 + }, + { + "epoch": 0.6760970324444712, + "grad_norm": 0.3591214716434479, + "learning_rate": 5.0270354358341307e-05, + "loss": 0.1396, + "step": 2808 + }, + { + "epoch": 0.6763378077409258, + "grad_norm": 3.92927622795105, + "learning_rate": 5.020272031236996e-05, + "loss": 0.7112, + "step": 2809 + }, + { + "epoch": 0.6765785830373804, + "grad_norm": 10.165884971618652, + "learning_rate": 5.013511654325689e-05, + "loss": 0.3902, + "step": 2810 + }, + { + "epoch": 0.676819358333835, + "grad_norm": 4.904458045959473, + "learning_rate": 5.0067543092105284e-05, + "loss": 0.9305, + "step": 2811 + }, + { + "epoch": 0.6770601336302895, + "grad_norm": 3.849393844604492, + "learning_rate": 5.000000000000002e-05, + "loss": 0.9527, + "step": 2812 + }, + { + "epoch": 0.6773009089267441, + "grad_norm": 1.6533546447753906, + "learning_rate": 4.993248730800737e-05, + "loss": 0.2365, + "step": 2813 + }, + { + "epoch": 0.6775416842231987, + "grad_norm": 2.609186887741089, + "learning_rate": 4.986500505717524e-05, + "loss": 0.8003, + "step": 2814 + }, + { + "epoch": 0.6777824595196533, + "grad_norm": 4.835500240325928, + "learning_rate": 4.9797553288533036e-05, + "loss": 1.3338, + "step": 2815 + }, + { + "epoch": 0.6780232348161078, + "grad_norm": 4.737950325012207, + "learning_rate": 4.9730132043091494e-05, + "loss": 0.9067, + "step": 2816 + }, + { + "epoch": 0.6782640101125624, + "grad_norm": 0.9720152616500854, + "learning_rate": 4.9662741361842934e-05, + "loss": 0.5726, + "step": 2817 + }, + { + "epoch": 0.678504785409017, + "grad_norm": 2.1223130226135254, + "learning_rate": 4.9595381285761036e-05, + "loss": 0.2327, + "step": 2818 + }, + { + "epoch": 0.6787455607054717, + "grad_norm": 1.109189748764038, + "learning_rate": 4.9528051855800874e-05, + "loss": 0.8678, + "step": 2819 + }, + { + "epoch": 0.6789863360019261, + "grad_norm": 1.6633493900299072, + "learning_rate": 4.946075311289894e-05, + "loss": 0.1905, + "step": 2820 + }, + { + "epoch": 0.6792271112983808, + "grad_norm": 2.95998477935791, + "learning_rate": 4.939348509797293e-05, + "loss": 0.6686, + "step": 2821 + }, + { + "epoch": 0.6794678865948354, + "grad_norm": 1.455351710319519, + "learning_rate": 4.932624785192206e-05, + "loss": 0.3546, + "step": 2822 + }, + { + "epoch": 0.67970866189129, + "grad_norm": 3.9884376525878906, + "learning_rate": 4.9259041415626615e-05, + "loss": 1.0358, + "step": 2823 + }, + { + "epoch": 0.6799494371877446, + "grad_norm": 3.427591562271118, + "learning_rate": 4.91918658299483e-05, + "loss": 0.9694, + "step": 2824 + }, + { + "epoch": 0.6801902124841991, + "grad_norm": 0.9723843336105347, + "learning_rate": 4.912472113573005e-05, + "loss": 0.0587, + "step": 2825 + }, + { + "epoch": 0.6804309877806537, + "grad_norm": 2.2842836380004883, + "learning_rate": 4.905760737379597e-05, + "loss": 0.9764, + "step": 2826 + }, + { + "epoch": 0.6806717630771083, + "grad_norm": 3.051936626434326, + "learning_rate": 4.899052458495137e-05, + "loss": 0.7743, + "step": 2827 + }, + { + "epoch": 0.6809125383735629, + "grad_norm": 4.598084926605225, + "learning_rate": 4.8923472809982795e-05, + "loss": 0.9498, + "step": 2828 + }, + { + "epoch": 0.6811533136700174, + "grad_norm": 5.4813947677612305, + "learning_rate": 4.885645208965779e-05, + "loss": 0.5918, + "step": 2829 + }, + { + "epoch": 0.681394088966472, + "grad_norm": 5.616296768188477, + "learning_rate": 4.8789462464725176e-05, + "loss": 0.3233, + "step": 2830 + }, + { + "epoch": 0.6816348642629266, + "grad_norm": 2.7440338134765625, + "learning_rate": 4.8722503975914724e-05, + "loss": 0.834, + "step": 2831 + }, + { + "epoch": 0.6818756395593812, + "grad_norm": 2.0612759590148926, + "learning_rate": 4.865557666393739e-05, + "loss": 0.6478, + "step": 2832 + }, + { + "epoch": 0.6821164148558357, + "grad_norm": 2.663640260696411, + "learning_rate": 4.858868056948512e-05, + "loss": 0.3945, + "step": 2833 + }, + { + "epoch": 0.6823571901522903, + "grad_norm": 1.9246824979782104, + "learning_rate": 4.8521815733230894e-05, + "loss": 0.7715, + "step": 2834 + }, + { + "epoch": 0.682597965448745, + "grad_norm": 1.4412755966186523, + "learning_rate": 4.8454982195828725e-05, + "loss": 0.1277, + "step": 2835 + }, + { + "epoch": 0.6828387407451996, + "grad_norm": 2.150667190551758, + "learning_rate": 4.838817999791348e-05, + "loss": 0.694, + "step": 2836 + }, + { + "epoch": 0.6830795160416542, + "grad_norm": 2.0565173625946045, + "learning_rate": 4.832140918010107e-05, + "loss": 0.4911, + "step": 2837 + }, + { + "epoch": 0.6833202913381087, + "grad_norm": 2.131206750869751, + "learning_rate": 4.825466978298835e-05, + "loss": 0.3765, + "step": 2838 + }, + { + "epoch": 0.6835610666345633, + "grad_norm": 1.4898698329925537, + "learning_rate": 4.818796184715295e-05, + "loss": 0.8131, + "step": 2839 + }, + { + "epoch": 0.6838018419310179, + "grad_norm": 1.2278820276260376, + "learning_rate": 4.812128541315348e-05, + "loss": 0.6781, + "step": 2840 + }, + { + "epoch": 0.6840426172274725, + "grad_norm": 1.6083769798278809, + "learning_rate": 4.805464052152937e-05, + "loss": 0.3348, + "step": 2841 + }, + { + "epoch": 0.684283392523927, + "grad_norm": 2.421626091003418, + "learning_rate": 4.7988027212800856e-05, + "loss": 1.0312, + "step": 2842 + }, + { + "epoch": 0.6845241678203816, + "grad_norm": 2.5446934700012207, + "learning_rate": 4.7921445527469014e-05, + "loss": 0.6185, + "step": 2843 + }, + { + "epoch": 0.6847649431168362, + "grad_norm": 2.2218170166015625, + "learning_rate": 4.7854895506015587e-05, + "loss": 0.5071, + "step": 2844 + }, + { + "epoch": 0.6850057184132908, + "grad_norm": 1.7741551399230957, + "learning_rate": 4.7788377188903176e-05, + "loss": 0.1996, + "step": 2845 + }, + { + "epoch": 0.6852464937097453, + "grad_norm": 0.4389905035495758, + "learning_rate": 4.7721890616575103e-05, + "loss": 0.2294, + "step": 2846 + }, + { + "epoch": 0.6854872690061999, + "grad_norm": 2.4608020782470703, + "learning_rate": 4.76554358294553e-05, + "loss": 0.4453, + "step": 2847 + }, + { + "epoch": 0.6857280443026545, + "grad_norm": 1.8286683559417725, + "learning_rate": 4.758901286794842e-05, + "loss": 0.3692, + "step": 2848 + }, + { + "epoch": 0.6859688195991092, + "grad_norm": 3.2650294303894043, + "learning_rate": 4.7522621772439826e-05, + "loss": 0.2029, + "step": 2849 + }, + { + "epoch": 0.6862095948955638, + "grad_norm": 1.0558974742889404, + "learning_rate": 4.7456262583295406e-05, + "loss": 0.2386, + "step": 2850 + }, + { + "epoch": 0.6864503701920183, + "grad_norm": 3.462625741958618, + "learning_rate": 4.7389935340861766e-05, + "loss": 0.5172, + "step": 2851 + }, + { + "epoch": 0.6866911454884729, + "grad_norm": 1.7738730907440186, + "learning_rate": 4.732364008546593e-05, + "loss": 0.5665, + "step": 2852 + }, + { + "epoch": 0.6869319207849275, + "grad_norm": 1.2178890705108643, + "learning_rate": 4.72573768574156e-05, + "loss": 0.4615, + "step": 2853 + }, + { + "epoch": 0.6871726960813821, + "grad_norm": 4.225795745849609, + "learning_rate": 4.719114569699902e-05, + "loss": 1.0835, + "step": 2854 + }, + { + "epoch": 0.6874134713778366, + "grad_norm": 3.400425434112549, + "learning_rate": 4.712494664448479e-05, + "loss": 0.8196, + "step": 2855 + }, + { + "epoch": 0.6876542466742912, + "grad_norm": 4.688882827758789, + "learning_rate": 4.705877974012213e-05, + "loss": 0.9437, + "step": 2856 + }, + { + "epoch": 0.6878950219707458, + "grad_norm": 1.8264660835266113, + "learning_rate": 4.699264502414066e-05, + "loss": 0.649, + "step": 2857 + }, + { + "epoch": 0.6881357972672004, + "grad_norm": 4.103915214538574, + "learning_rate": 4.6926542536750454e-05, + "loss": 0.5432, + "step": 2858 + }, + { + "epoch": 0.6883765725636549, + "grad_norm": 8.779718399047852, + "learning_rate": 4.686047231814199e-05, + "loss": 0.8389, + "step": 2859 + }, + { + "epoch": 0.6886173478601095, + "grad_norm": 1.3146713972091675, + "learning_rate": 4.6794434408486043e-05, + "loss": 0.5095, + "step": 2860 + }, + { + "epoch": 0.6888581231565641, + "grad_norm": 2.284715175628662, + "learning_rate": 4.6728428847933893e-05, + "loss": 0.7908, + "step": 2861 + }, + { + "epoch": 0.6890988984530187, + "grad_norm": 1.923722267150879, + "learning_rate": 4.666245567661699e-05, + "loss": 0.7053, + "step": 2862 + }, + { + "epoch": 0.6893396737494734, + "grad_norm": 1.3939085006713867, + "learning_rate": 4.659651493464721e-05, + "loss": 0.5569, + "step": 2863 + }, + { + "epoch": 0.6895804490459279, + "grad_norm": 1.071938157081604, + "learning_rate": 4.653060666211665e-05, + "loss": 0.474, + "step": 2864 + }, + { + "epoch": 0.6898212243423825, + "grad_norm": 2.894726514816284, + "learning_rate": 4.646473089909772e-05, + "loss": 0.5261, + "step": 2865 + }, + { + "epoch": 0.6900619996388371, + "grad_norm": 2.7686641216278076, + "learning_rate": 4.639888768564302e-05, + "loss": 0.8032, + "step": 2866 + }, + { + "epoch": 0.6903027749352917, + "grad_norm": 1.58405601978302, + "learning_rate": 4.633307706178541e-05, + "loss": 0.8255, + "step": 2867 + }, + { + "epoch": 0.6905435502317462, + "grad_norm": 2.1864027976989746, + "learning_rate": 4.626729906753782e-05, + "loss": 0.5292, + "step": 2868 + }, + { + "epoch": 0.6907843255282008, + "grad_norm": 2.6647164821624756, + "learning_rate": 4.62015537428935e-05, + "loss": 1.1816, + "step": 2869 + }, + { + "epoch": 0.6910251008246554, + "grad_norm": 0.903181254863739, + "learning_rate": 4.613584112782567e-05, + "loss": 0.5345, + "step": 2870 + }, + { + "epoch": 0.69126587612111, + "grad_norm": 1.2306076288223267, + "learning_rate": 4.607016126228779e-05, + "loss": 0.5126, + "step": 2871 + }, + { + "epoch": 0.6915066514175645, + "grad_norm": 1.6878161430358887, + "learning_rate": 4.600451418621341e-05, + "loss": 0.5813, + "step": 2872 + }, + { + "epoch": 0.6917474267140191, + "grad_norm": 1.6797889471054077, + "learning_rate": 4.593889993951599e-05, + "loss": 0.4037, + "step": 2873 + }, + { + "epoch": 0.6919882020104737, + "grad_norm": 1.061113715171814, + "learning_rate": 4.587331856208927e-05, + "loss": 0.3819, + "step": 2874 + }, + { + "epoch": 0.6922289773069283, + "grad_norm": 2.49900484085083, + "learning_rate": 4.580777009380678e-05, + "loss": 0.4709, + "step": 2875 + }, + { + "epoch": 0.6924697526033828, + "grad_norm": 1.850058674812317, + "learning_rate": 4.574225457452217e-05, + "loss": 0.4061, + "step": 2876 + }, + { + "epoch": 0.6927105278998374, + "grad_norm": 4.711109638214111, + "learning_rate": 4.5676772044069064e-05, + "loss": 0.6784, + "step": 2877 + }, + { + "epoch": 0.6929513031962921, + "grad_norm": 2.337125778198242, + "learning_rate": 4.5611322542260906e-05, + "loss": 1.2925, + "step": 2878 + }, + { + "epoch": 0.6931920784927467, + "grad_norm": 1.2772440910339355, + "learning_rate": 4.554590610889118e-05, + "loss": 0.453, + "step": 2879 + }, + { + "epoch": 0.6934328537892013, + "grad_norm": 1.6413322687149048, + "learning_rate": 4.548052278373327e-05, + "loss": 0.509, + "step": 2880 + }, + { + "epoch": 0.6936736290856558, + "grad_norm": 2.3125624656677246, + "learning_rate": 4.54151726065403e-05, + "loss": 0.68, + "step": 2881 + }, + { + "epoch": 0.6939144043821104, + "grad_norm": 2.5944857597351074, + "learning_rate": 4.534985561704537e-05, + "loss": 0.9755, + "step": 2882 + }, + { + "epoch": 0.694155179678565, + "grad_norm": 5.304381370544434, + "learning_rate": 4.528457185496134e-05, + "loss": 0.6764, + "step": 2883 + }, + { + "epoch": 0.6943959549750196, + "grad_norm": 1.1039294004440308, + "learning_rate": 4.521932135998092e-05, + "loss": 0.4513, + "step": 2884 + }, + { + "epoch": 0.6946367302714741, + "grad_norm": 4.094736576080322, + "learning_rate": 4.5154104171776546e-05, + "loss": 1.098, + "step": 2885 + }, + { + "epoch": 0.6948775055679287, + "grad_norm": 2.167951822280884, + "learning_rate": 4.5088920330000386e-05, + "loss": 0.9008, + "step": 2886 + }, + { + "epoch": 0.6951182808643833, + "grad_norm": 1.435927152633667, + "learning_rate": 4.502376987428442e-05, + "loss": 0.3153, + "step": 2887 + }, + { + "epoch": 0.6953590561608379, + "grad_norm": 1.3964961767196655, + "learning_rate": 4.495865284424018e-05, + "loss": 0.9771, + "step": 2888 + }, + { + "epoch": 0.6955998314572924, + "grad_norm": 1.8884319067001343, + "learning_rate": 4.4893569279459034e-05, + "loss": 0.5999, + "step": 2889 + }, + { + "epoch": 0.695840606753747, + "grad_norm": 1.9889702796936035, + "learning_rate": 4.4828519219511914e-05, + "loss": 0.408, + "step": 2890 + }, + { + "epoch": 0.6960813820502016, + "grad_norm": 0.6705599427223206, + "learning_rate": 4.476350270394942e-05, + "loss": 1.1983, + "step": 2891 + }, + { + "epoch": 0.6963221573466563, + "grad_norm": 1.9054640531539917, + "learning_rate": 4.469851977230173e-05, + "loss": 0.6402, + "step": 2892 + }, + { + "epoch": 0.6965629326431109, + "grad_norm": 0.7746975421905518, + "learning_rate": 4.463357046407864e-05, + "loss": 0.3632, + "step": 2893 + }, + { + "epoch": 0.6968037079395654, + "grad_norm": 0.40516397356987, + "learning_rate": 4.456865481876943e-05, + "loss": 0.1903, + "step": 2894 + }, + { + "epoch": 0.69704448323602, + "grad_norm": 3.2087621688842773, + "learning_rate": 4.4503772875843e-05, + "loss": 0.568, + "step": 2895 + }, + { + "epoch": 0.6972852585324746, + "grad_norm": 2.681427478790283, + "learning_rate": 4.4438924674747663e-05, + "loss": 0.9806, + "step": 2896 + }, + { + "epoch": 0.6975260338289292, + "grad_norm": 1.9525858163833618, + "learning_rate": 4.4374110254911306e-05, + "loss": 0.4023, + "step": 2897 + }, + { + "epoch": 0.6977668091253837, + "grad_norm": 5.92275857925415, + "learning_rate": 4.430932965574125e-05, + "loss": 0.8938, + "step": 2898 + }, + { + "epoch": 0.6980075844218383, + "grad_norm": 5.130187034606934, + "learning_rate": 4.424458291662422e-05, + "loss": 0.5991, + "step": 2899 + }, + { + "epoch": 0.6982483597182929, + "grad_norm": 1.512535810470581, + "learning_rate": 4.417987007692641e-05, + "loss": 0.6119, + "step": 2900 + }, + { + "epoch": 0.6984891350147475, + "grad_norm": 1.4863414764404297, + "learning_rate": 4.4115191175993385e-05, + "loss": 0.5287, + "step": 2901 + }, + { + "epoch": 0.698729910311202, + "grad_norm": 0.5450987219810486, + "learning_rate": 4.405054625314999e-05, + "loss": 0.4031, + "step": 2902 + }, + { + "epoch": 0.6989706856076566, + "grad_norm": 2.9713079929351807, + "learning_rate": 4.398593534770058e-05, + "loss": 0.8828, + "step": 2903 + }, + { + "epoch": 0.6992114609041112, + "grad_norm": 1.440027117729187, + "learning_rate": 4.3921358498928645e-05, + "loss": 0.3911, + "step": 2904 + }, + { + "epoch": 0.6994522362005658, + "grad_norm": 1.3297288417816162, + "learning_rate": 4.385681574609708e-05, + "loss": 0.3319, + "step": 2905 + }, + { + "epoch": 0.6996930114970205, + "grad_norm": 0.7073665261268616, + "learning_rate": 4.379230712844804e-05, + "loss": 0.6385, + "step": 2906 + }, + { + "epoch": 0.699933786793475, + "grad_norm": 2.934152126312256, + "learning_rate": 4.37278326852029e-05, + "loss": 1.4158, + "step": 2907 + }, + { + "epoch": 0.7001745620899296, + "grad_norm": 2.335797071456909, + "learning_rate": 4.36633924555623e-05, + "loss": 0.9337, + "step": 2908 + }, + { + "epoch": 0.7004153373863842, + "grad_norm": 1.474564552307129, + "learning_rate": 4.359898647870599e-05, + "loss": 0.5355, + "step": 2909 + }, + { + "epoch": 0.7006561126828388, + "grad_norm": 1.9566766023635864, + "learning_rate": 4.353461479379297e-05, + "loss": 0.4216, + "step": 2910 + }, + { + "epoch": 0.7008968879792933, + "grad_norm": 1.7746264934539795, + "learning_rate": 4.34702774399614e-05, + "loss": 0.5385, + "step": 2911 + }, + { + "epoch": 0.7011376632757479, + "grad_norm": 2.327068567276001, + "learning_rate": 4.340597445632849e-05, + "loss": 0.1434, + "step": 2912 + }, + { + "epoch": 0.7013784385722025, + "grad_norm": 0.7720171809196472, + "learning_rate": 4.334170588199061e-05, + "loss": 0.327, + "step": 2913 + }, + { + "epoch": 0.7016192138686571, + "grad_norm": 0.9734980463981628, + "learning_rate": 4.32774717560232e-05, + "loss": 0.6511, + "step": 2914 + }, + { + "epoch": 0.7018599891651116, + "grad_norm": 2.17838191986084, + "learning_rate": 4.321327211748077e-05, + "loss": 0.6218, + "step": 2915 + }, + { + "epoch": 0.7021007644615662, + "grad_norm": 1.358054757118225, + "learning_rate": 4.314910700539687e-05, + "loss": 0.8311, + "step": 2916 + }, + { + "epoch": 0.7023415397580208, + "grad_norm": 7.809467792510986, + "learning_rate": 4.308497645878396e-05, + "loss": 1.164, + "step": 2917 + }, + { + "epoch": 0.7025823150544754, + "grad_norm": 2.3735713958740234, + "learning_rate": 4.302088051663359e-05, + "loss": 0.5243, + "step": 2918 + }, + { + "epoch": 0.70282309035093, + "grad_norm": 1.5434727668762207, + "learning_rate": 4.2956819217916275e-05, + "loss": 0.2084, + "step": 2919 + }, + { + "epoch": 0.7030638656473845, + "grad_norm": 2.580521821975708, + "learning_rate": 4.289279260158137e-05, + "loss": 0.666, + "step": 2920 + }, + { + "epoch": 0.7033046409438392, + "grad_norm": 3.4632489681243896, + "learning_rate": 4.282880070655723e-05, + "loss": 0.7674, + "step": 2921 + }, + { + "epoch": 0.7035454162402938, + "grad_norm": 3.7505438327789307, + "learning_rate": 4.2764843571751046e-05, + "loss": 0.8833, + "step": 2922 + }, + { + "epoch": 0.7037861915367484, + "grad_norm": 1.1136095523834229, + "learning_rate": 4.270092123604894e-05, + "loss": 0.5675, + "step": 2923 + }, + { + "epoch": 0.7040269668332029, + "grad_norm": 2.523184299468994, + "learning_rate": 4.263703373831586e-05, + "loss": 0.621, + "step": 2924 + }, + { + "epoch": 0.7042677421296575, + "grad_norm": 1.7620470523834229, + "learning_rate": 4.2573181117395455e-05, + "loss": 0.2796, + "step": 2925 + }, + { + "epoch": 0.7045085174261121, + "grad_norm": 0.7479153275489807, + "learning_rate": 4.250936341211032e-05, + "loss": 0.7364, + "step": 2926 + }, + { + "epoch": 0.7047492927225667, + "grad_norm": 1.9243773221969604, + "learning_rate": 4.2445580661261794e-05, + "loss": 0.5447, + "step": 2927 + }, + { + "epoch": 0.7049900680190212, + "grad_norm": 2.319751501083374, + "learning_rate": 4.238183290362987e-05, + "loss": 0.4302, + "step": 2928 + }, + { + "epoch": 0.7052308433154758, + "grad_norm": 1.1263662576675415, + "learning_rate": 4.231812017797335e-05, + "loss": 0.5473, + "step": 2929 + }, + { + "epoch": 0.7054716186119304, + "grad_norm": 1.7234480381011963, + "learning_rate": 4.225444252302973e-05, + "loss": 0.3453, + "step": 2930 + }, + { + "epoch": 0.705712393908385, + "grad_norm": 1.5523897409439087, + "learning_rate": 4.219079997751515e-05, + "loss": 0.2537, + "step": 2931 + }, + { + "epoch": 0.7059531692048396, + "grad_norm": 2.044769287109375, + "learning_rate": 4.212719258012447e-05, + "loss": 0.3151, + "step": 2932 + }, + { + "epoch": 0.7061939445012941, + "grad_norm": 3.085174322128296, + "learning_rate": 4.206362036953104e-05, + "loss": 0.4571, + "step": 2933 + }, + { + "epoch": 0.7064347197977487, + "grad_norm": 7.409231185913086, + "learning_rate": 4.2000083384387e-05, + "loss": 1.0109, + "step": 2934 + }, + { + "epoch": 0.7066754950942034, + "grad_norm": 4.059498310089111, + "learning_rate": 4.193658166332291e-05, + "loss": 0.4508, + "step": 2935 + }, + { + "epoch": 0.706916270390658, + "grad_norm": 3.295271873474121, + "learning_rate": 4.187311524494798e-05, + "loss": 0.5282, + "step": 2936 + }, + { + "epoch": 0.7071570456871125, + "grad_norm": 1.7621487379074097, + "learning_rate": 4.1809684167849936e-05, + "loss": 1.1533, + "step": 2937 + }, + { + "epoch": 0.7073978209835671, + "grad_norm": 3.32817006111145, + "learning_rate": 4.1746288470595044e-05, + "loss": 0.7824, + "step": 2938 + }, + { + "epoch": 0.7076385962800217, + "grad_norm": 2.0236010551452637, + "learning_rate": 4.1682928191727985e-05, + "loss": 0.4317, + "step": 2939 + }, + { + "epoch": 0.7078793715764763, + "grad_norm": 2.6577980518341064, + "learning_rate": 4.161960336977203e-05, + "loss": 0.5246, + "step": 2940 + }, + { + "epoch": 0.7081201468729308, + "grad_norm": 45.47622299194336, + "learning_rate": 4.1556314043228705e-05, + "loss": 0.5691, + "step": 2941 + }, + { + "epoch": 0.7083609221693854, + "grad_norm": 2.584383249282837, + "learning_rate": 4.1493060250578165e-05, + "loss": 0.4159, + "step": 2942 + }, + { + "epoch": 0.70860169746584, + "grad_norm": 2.8023557662963867, + "learning_rate": 4.1429842030278774e-05, + "loss": 0.8909, + "step": 2943 + }, + { + "epoch": 0.7088424727622946, + "grad_norm": 1.6718467473983765, + "learning_rate": 4.1366659420767384e-05, + "loss": 0.5008, + "step": 2944 + }, + { + "epoch": 0.7090832480587492, + "grad_norm": 4.263134956359863, + "learning_rate": 4.1303512460459214e-05, + "loss": 0.6134, + "step": 2945 + }, + { + "epoch": 0.7093240233552037, + "grad_norm": 3.2568228244781494, + "learning_rate": 4.124040118774763e-05, + "loss": 0.4874, + "step": 2946 + }, + { + "epoch": 0.7095647986516583, + "grad_norm": 0.5476480722427368, + "learning_rate": 4.1177325641004595e-05, + "loss": 0.341, + "step": 2947 + }, + { + "epoch": 0.7098055739481129, + "grad_norm": 2.8167431354522705, + "learning_rate": 4.1114285858580045e-05, + "loss": 0.3281, + "step": 2948 + }, + { + "epoch": 0.7100463492445676, + "grad_norm": 4.154418468475342, + "learning_rate": 4.105128187880238e-05, + "loss": 0.5594, + "step": 2949 + }, + { + "epoch": 0.710287124541022, + "grad_norm": 0.7136004567146301, + "learning_rate": 4.098831373997818e-05, + "loss": 0.4894, + "step": 2950 + }, + { + "epoch": 0.7105278998374767, + "grad_norm": 2.967937707901001, + "learning_rate": 4.0925381480392135e-05, + "loss": 0.6342, + "step": 2951 + }, + { + "epoch": 0.7107686751339313, + "grad_norm": 2.3439087867736816, + "learning_rate": 4.086248513830725e-05, + "loss": 0.514, + "step": 2952 + }, + { + "epoch": 0.7110094504303859, + "grad_norm": 1.2852379083633423, + "learning_rate": 4.079962475196468e-05, + "loss": 0.9783, + "step": 2953 + }, + { + "epoch": 0.7112502257268404, + "grad_norm": 3.063833713531494, + "learning_rate": 4.0736800359583605e-05, + "loss": 0.3231, + "step": 2954 + }, + { + "epoch": 0.711491001023295, + "grad_norm": 0.8757096529006958, + "learning_rate": 4.067401199936143e-05, + "loss": 0.3563, + "step": 2955 + }, + { + "epoch": 0.7117317763197496, + "grad_norm": 2.1515250205993652, + "learning_rate": 4.061125970947363e-05, + "loss": 0.5002, + "step": 2956 + }, + { + "epoch": 0.7119725516162042, + "grad_norm": 1.7841241359710693, + "learning_rate": 4.054854352807372e-05, + "loss": 0.2222, + "step": 2957 + }, + { + "epoch": 0.7122133269126587, + "grad_norm": 1.669628620147705, + "learning_rate": 4.048586349329333e-05, + "loss": 0.8098, + "step": 2958 + }, + { + "epoch": 0.7124541022091133, + "grad_norm": 1.4398468732833862, + "learning_rate": 4.0423219643241985e-05, + "loss": 0.3151, + "step": 2959 + }, + { + "epoch": 0.7126948775055679, + "grad_norm": 3.6351101398468018, + "learning_rate": 4.036061201600737e-05, + "loss": 0.3961, + "step": 2960 + }, + { + "epoch": 0.7129356528020225, + "grad_norm": 0.8414926528930664, + "learning_rate": 4.029804064965498e-05, + "loss": 0.2666, + "step": 2961 + }, + { + "epoch": 0.7131764280984771, + "grad_norm": 1.7037287950515747, + "learning_rate": 4.023550558222837e-05, + "loss": 0.4597, + "step": 2962 + }, + { + "epoch": 0.7134172033949316, + "grad_norm": 2.4606921672821045, + "learning_rate": 4.017300685174903e-05, + "loss": 0.6738, + "step": 2963 + }, + { + "epoch": 0.7136579786913863, + "grad_norm": 0.9210506677627563, + "learning_rate": 4.011054449621632e-05, + "loss": 0.5534, + "step": 2964 + }, + { + "epoch": 0.7138987539878409, + "grad_norm": 1.8560645580291748, + "learning_rate": 4.004811855360748e-05, + "loss": 0.2176, + "step": 2965 + }, + { + "epoch": 0.7141395292842955, + "grad_norm": 1.5320228338241577, + "learning_rate": 3.998572906187767e-05, + "loss": 0.6553, + "step": 2966 + }, + { + "epoch": 0.71438030458075, + "grad_norm": 7.649412155151367, + "learning_rate": 3.9923376058959774e-05, + "loss": 0.8473, + "step": 2967 + }, + { + "epoch": 0.7146210798772046, + "grad_norm": 0.8672193288803101, + "learning_rate": 3.986105958276463e-05, + "loss": 0.4563, + "step": 2968 + }, + { + "epoch": 0.7148618551736592, + "grad_norm": 3.8993074893951416, + "learning_rate": 3.97987796711807e-05, + "loss": 0.5047, + "step": 2969 + }, + { + "epoch": 0.7151026304701138, + "grad_norm": 2.695249319076538, + "learning_rate": 3.973653636207437e-05, + "loss": 0.8572, + "step": 2970 + }, + { + "epoch": 0.7153434057665683, + "grad_norm": 1.0926902294158936, + "learning_rate": 3.967432969328971e-05, + "loss": 0.4632, + "step": 2971 + }, + { + "epoch": 0.7155841810630229, + "grad_norm": 2.5427393913269043, + "learning_rate": 3.961215970264852e-05, + "loss": 0.4715, + "step": 2972 + }, + { + "epoch": 0.7158249563594775, + "grad_norm": 1.8015666007995605, + "learning_rate": 3.9550026427950315e-05, + "loss": 0.4259, + "step": 2973 + }, + { + "epoch": 0.7160657316559321, + "grad_norm": 2.0264315605163574, + "learning_rate": 3.94879299069722e-05, + "loss": 0.7189, + "step": 2974 + }, + { + "epoch": 0.7163065069523867, + "grad_norm": 2.786452531814575, + "learning_rate": 3.942587017746904e-05, + "loss": 1.0023, + "step": 2975 + }, + { + "epoch": 0.7165472822488412, + "grad_norm": 1.3321934938430786, + "learning_rate": 3.936384727717332e-05, + "loss": 0.5356, + "step": 2976 + }, + { + "epoch": 0.7167880575452958, + "grad_norm": 1.1468703746795654, + "learning_rate": 3.930186124379503e-05, + "loss": 0.6806, + "step": 2977 + }, + { + "epoch": 0.7170288328417505, + "grad_norm": 3.6442174911499023, + "learning_rate": 3.923991211502187e-05, + "loss": 0.2468, + "step": 2978 + }, + { + "epoch": 0.7172696081382051, + "grad_norm": 1.8191343545913696, + "learning_rate": 3.917799992851903e-05, + "loss": 1.0023, + "step": 2979 + }, + { + "epoch": 0.7175103834346596, + "grad_norm": 2.2116637229919434, + "learning_rate": 3.911612472192927e-05, + "loss": 0.3557, + "step": 2980 + }, + { + "epoch": 0.7177511587311142, + "grad_norm": 0.8968959450721741, + "learning_rate": 3.9054286532872884e-05, + "loss": 0.3245, + "step": 2981 + }, + { + "epoch": 0.7179919340275688, + "grad_norm": 1.421441674232483, + "learning_rate": 3.899248539894757e-05, + "loss": 0.3783, + "step": 2982 + }, + { + "epoch": 0.7182327093240234, + "grad_norm": 2.168306827545166, + "learning_rate": 3.8930721357728584e-05, + "loss": 0.2099, + "step": 2983 + }, + { + "epoch": 0.7184734846204779, + "grad_norm": 1.9068177938461304, + "learning_rate": 3.886899444676863e-05, + "loss": 0.8279, + "step": 2984 + }, + { + "epoch": 0.7187142599169325, + "grad_norm": 1.3989911079406738, + "learning_rate": 3.880730470359776e-05, + "loss": 0.9995, + "step": 2985 + }, + { + "epoch": 0.7189550352133871, + "grad_norm": 5.264814376831055, + "learning_rate": 3.8745652165723486e-05, + "loss": 0.9829, + "step": 2986 + }, + { + "epoch": 0.7191958105098417, + "grad_norm": 2.0597469806671143, + "learning_rate": 3.8684036870630705e-05, + "loss": 0.2443, + "step": 2987 + }, + { + "epoch": 0.7194365858062963, + "grad_norm": 1.258255958557129, + "learning_rate": 3.862245885578166e-05, + "loss": 0.7055, + "step": 2988 + }, + { + "epoch": 0.7196773611027508, + "grad_norm": 3.225368022918701, + "learning_rate": 3.856091815861595e-05, + "loss": 0.3839, + "step": 2989 + }, + { + "epoch": 0.7199181363992054, + "grad_norm": 2.35640025138855, + "learning_rate": 3.8499414816550384e-05, + "loss": 0.8443, + "step": 2990 + }, + { + "epoch": 0.72015891169566, + "grad_norm": 2.4103639125823975, + "learning_rate": 3.843794886697917e-05, + "loss": 0.693, + "step": 2991 + }, + { + "epoch": 0.7203996869921147, + "grad_norm": 0.8811191320419312, + "learning_rate": 3.837652034727378e-05, + "loss": 0.3415, + "step": 2992 + }, + { + "epoch": 0.7206404622885692, + "grad_norm": 1.7266875505447388, + "learning_rate": 3.8315129294782835e-05, + "loss": 0.4295, + "step": 2993 + }, + { + "epoch": 0.7208812375850238, + "grad_norm": 1.2905570268630981, + "learning_rate": 3.8253775746832244e-05, + "loss": 0.248, + "step": 2994 + }, + { + "epoch": 0.7211220128814784, + "grad_norm": 0.3409409821033478, + "learning_rate": 3.819245974072513e-05, + "loss": 0.6092, + "step": 2995 + }, + { + "epoch": 0.721362788177933, + "grad_norm": 1.4550303220748901, + "learning_rate": 3.8131181313741735e-05, + "loss": 0.6874, + "step": 2996 + }, + { + "epoch": 0.7216035634743875, + "grad_norm": 0.9903691411018372, + "learning_rate": 3.806994050313953e-05, + "loss": 0.1963, + "step": 2997 + }, + { + "epoch": 0.7218443387708421, + "grad_norm": 0.8208291530609131, + "learning_rate": 3.800873734615299e-05, + "loss": 0.3679, + "step": 2998 + }, + { + "epoch": 0.7220851140672967, + "grad_norm": 1.0615532398223877, + "learning_rate": 3.794757187999386e-05, + "loss": 0.9426, + "step": 2999 + }, + { + "epoch": 0.7223258893637513, + "grad_norm": 2.0494561195373535, + "learning_rate": 3.788644414185078e-05, + "loss": 0.2539, + "step": 3000 + }, + { + "epoch": 0.7225666646602059, + "grad_norm": 2.378437042236328, + "learning_rate": 3.782535416888963e-05, + "loss": 0.7789, + "step": 3001 + }, + { + "epoch": 0.7228074399566604, + "grad_norm": 1.714324951171875, + "learning_rate": 3.776430199825321e-05, + "loss": 0.9774, + "step": 3002 + }, + { + "epoch": 0.723048215253115, + "grad_norm": 2.891805648803711, + "learning_rate": 3.770328766706139e-05, + "loss": 0.6982, + "step": 3003 + }, + { + "epoch": 0.7232889905495696, + "grad_norm": 4.66194486618042, + "learning_rate": 3.764231121241103e-05, + "loss": 0.6659, + "step": 3004 + }, + { + "epoch": 0.7235297658460242, + "grad_norm": 3.184102773666382, + "learning_rate": 3.758137267137598e-05, + "loss": 0.5286, + "step": 3005 + }, + { + "epoch": 0.7237705411424787, + "grad_norm": 5.212895393371582, + "learning_rate": 3.752047208100694e-05, + "loss": 0.2767, + "step": 3006 + }, + { + "epoch": 0.7240113164389333, + "grad_norm": 1.256901741027832, + "learning_rate": 3.745960947833168e-05, + "loss": 0.477, + "step": 3007 + }, + { + "epoch": 0.724252091735388, + "grad_norm": 1.1887600421905518, + "learning_rate": 3.739878490035473e-05, + "loss": 0.7814, + "step": 3008 + }, + { + "epoch": 0.7244928670318426, + "grad_norm": 2.5815846920013428, + "learning_rate": 3.73379983840576e-05, + "loss": 0.5839, + "step": 3009 + }, + { + "epoch": 0.7247336423282971, + "grad_norm": 4.950305938720703, + "learning_rate": 3.727724996639863e-05, + "loss": 0.4643, + "step": 3010 + }, + { + "epoch": 0.7249744176247517, + "grad_norm": 6.662084102630615, + "learning_rate": 3.7216539684313004e-05, + "loss": 0.5806, + "step": 3011 + }, + { + "epoch": 0.7252151929212063, + "grad_norm": 2.3618359565734863, + "learning_rate": 3.715586757471273e-05, + "loss": 0.5451, + "step": 3012 + }, + { + "epoch": 0.7254559682176609, + "grad_norm": 1.401696801185608, + "learning_rate": 3.709523367448653e-05, + "loss": 0.8228, + "step": 3013 + }, + { + "epoch": 0.7256967435141155, + "grad_norm": 0.5292275547981262, + "learning_rate": 3.7034638020499976e-05, + "loss": 0.3713, + "step": 3014 + }, + { + "epoch": 0.72593751881057, + "grad_norm": 0.8193963766098022, + "learning_rate": 3.697408064959541e-05, + "loss": 0.1659, + "step": 3015 + }, + { + "epoch": 0.7261782941070246, + "grad_norm": 2.547407388687134, + "learning_rate": 3.691356159859177e-05, + "loss": 0.3945, + "step": 3016 + }, + { + "epoch": 0.7264190694034792, + "grad_norm": 1.281667709350586, + "learning_rate": 3.685308090428481e-05, + "loss": 0.6264, + "step": 3017 + }, + { + "epoch": 0.7266598446999338, + "grad_norm": 0.9349974989891052, + "learning_rate": 3.6792638603446974e-05, + "loss": 0.5355, + "step": 3018 + }, + { + "epoch": 0.7269006199963883, + "grad_norm": 1.2555688619613647, + "learning_rate": 3.67322347328272e-05, + "loss": 0.1645, + "step": 3019 + }, + { + "epoch": 0.7271413952928429, + "grad_norm": 2.40930438041687, + "learning_rate": 3.667186932915133e-05, + "loss": 0.3945, + "step": 3020 + }, + { + "epoch": 0.7273821705892975, + "grad_norm": 1.6507692337036133, + "learning_rate": 3.661154242912155e-05, + "loss": 0.3394, + "step": 3021 + }, + { + "epoch": 0.7276229458857522, + "grad_norm": 0.6924558877944946, + "learning_rate": 3.6551254069416774e-05, + "loss": 0.2132, + "step": 3022 + }, + { + "epoch": 0.7278637211822067, + "grad_norm": 1.6599589586257935, + "learning_rate": 3.649100428669253e-05, + "loss": 0.5329, + "step": 3023 + }, + { + "epoch": 0.7281044964786613, + "grad_norm": 1.3489158153533936, + "learning_rate": 3.643079311758072e-05, + "loss": 0.4529, + "step": 3024 + }, + { + "epoch": 0.7283452717751159, + "grad_norm": 3.1767184734344482, + "learning_rate": 3.637062059868996e-05, + "loss": 1.4829, + "step": 3025 + }, + { + "epoch": 0.7285860470715705, + "grad_norm": 2.8698904514312744, + "learning_rate": 3.63104867666052e-05, + "loss": 0.4736, + "step": 3026 + }, + { + "epoch": 0.7288268223680251, + "grad_norm": 1.75603187084198, + "learning_rate": 3.625039165788794e-05, + "loss": 0.8231, + "step": 3027 + }, + { + "epoch": 0.7290675976644796, + "grad_norm": 0.7908713221549988, + "learning_rate": 3.619033530907625e-05, + "loss": 0.5338, + "step": 3028 + }, + { + "epoch": 0.7293083729609342, + "grad_norm": 1.7771409749984741, + "learning_rate": 3.613031775668443e-05, + "loss": 0.7482, + "step": 3029 + }, + { + "epoch": 0.7295491482573888, + "grad_norm": 2.4424712657928467, + "learning_rate": 3.6070339037203306e-05, + "loss": 0.4881, + "step": 3030 + }, + { + "epoch": 0.7297899235538434, + "grad_norm": 0.698549211025238, + "learning_rate": 3.601039918710012e-05, + "loss": 0.3092, + "step": 3031 + }, + { + "epoch": 0.7300306988502979, + "grad_norm": 0.616523802280426, + "learning_rate": 3.595049824281837e-05, + "loss": 0.5394, + "step": 3032 + }, + { + "epoch": 0.7302714741467525, + "grad_norm": 1.3015395402908325, + "learning_rate": 3.589063624077802e-05, + "loss": 0.5671, + "step": 3033 + }, + { + "epoch": 0.7305122494432071, + "grad_norm": 0.953938364982605, + "learning_rate": 3.583081321737525e-05, + "loss": 0.7368, + "step": 3034 + }, + { + "epoch": 0.7307530247396617, + "grad_norm": 0.6559523344039917, + "learning_rate": 3.577102920898261e-05, + "loss": 0.2857, + "step": 3035 + }, + { + "epoch": 0.7309938000361162, + "grad_norm": 0.8794732689857483, + "learning_rate": 3.5711284251948914e-05, + "loss": 0.2559, + "step": 3036 + }, + { + "epoch": 0.7312345753325709, + "grad_norm": 1.048971176147461, + "learning_rate": 3.565157838259925e-05, + "loss": 0.2112, + "step": 3037 + }, + { + "epoch": 0.7314753506290255, + "grad_norm": 1.1826798915863037, + "learning_rate": 3.5591911637234945e-05, + "loss": 0.2799, + "step": 3038 + }, + { + "epoch": 0.7317161259254801, + "grad_norm": 2.4413845539093018, + "learning_rate": 3.5532284052133436e-05, + "loss": 0.5779, + "step": 3039 + }, + { + "epoch": 0.7319569012219346, + "grad_norm": 0.9847295880317688, + "learning_rate": 3.547269566354847e-05, + "loss": 0.6497, + "step": 3040 + }, + { + "epoch": 0.7321976765183892, + "grad_norm": 1.833725094795227, + "learning_rate": 3.541314650770996e-05, + "loss": 0.3938, + "step": 3041 + }, + { + "epoch": 0.7324384518148438, + "grad_norm": 2.012840747833252, + "learning_rate": 3.535363662082385e-05, + "loss": 0.4187, + "step": 3042 + }, + { + "epoch": 0.7326792271112984, + "grad_norm": 3.2702102661132812, + "learning_rate": 3.529416603907233e-05, + "loss": 0.9575, + "step": 3043 + }, + { + "epoch": 0.732920002407753, + "grad_norm": 1.4701731204986572, + "learning_rate": 3.523473479861365e-05, + "loss": 0.5232, + "step": 3044 + }, + { + "epoch": 0.7331607777042075, + "grad_norm": 1.68658447265625, + "learning_rate": 3.5175342935582114e-05, + "loss": 0.6121, + "step": 3045 + }, + { + "epoch": 0.7334015530006621, + "grad_norm": 1.9545087814331055, + "learning_rate": 3.5115990486088166e-05, + "loss": 0.31, + "step": 3046 + }, + { + "epoch": 0.7336423282971167, + "grad_norm": 4.512576580047607, + "learning_rate": 3.5056677486218145e-05, + "loss": 0.8468, + "step": 3047 + }, + { + "epoch": 0.7338831035935713, + "grad_norm": 2.4108033180236816, + "learning_rate": 3.4997403972034546e-05, + "loss": 0.826, + "step": 3048 + }, + { + "epoch": 0.7341238788900258, + "grad_norm": 3.4939920902252197, + "learning_rate": 3.493816997957582e-05, + "loss": 0.4593, + "step": 3049 + }, + { + "epoch": 0.7343646541864804, + "grad_norm": 2.438183307647705, + "learning_rate": 3.487897554485628e-05, + "loss": 0.6518, + "step": 3050 + }, + { + "epoch": 0.7346054294829351, + "grad_norm": 3.4589779376983643, + "learning_rate": 3.4819820703866344e-05, + "loss": 0.6474, + "step": 3051 + }, + { + "epoch": 0.7348462047793897, + "grad_norm": 4.573122978210449, + "learning_rate": 3.4760705492572266e-05, + "loss": 0.529, + "step": 3052 + }, + { + "epoch": 0.7350869800758442, + "grad_norm": 0.7465322017669678, + "learning_rate": 3.470162994691624e-05, + "loss": 0.4171, + "step": 3053 + }, + { + "epoch": 0.7353277553722988, + "grad_norm": 1.0964757204055786, + "learning_rate": 3.464259410281635e-05, + "loss": 0.4091, + "step": 3054 + }, + { + "epoch": 0.7355685306687534, + "grad_norm": 3.490908145904541, + "learning_rate": 3.458359799616647e-05, + "loss": 1.0212, + "step": 3055 + }, + { + "epoch": 0.735809305965208, + "grad_norm": 1.6229488849639893, + "learning_rate": 3.45246416628364e-05, + "loss": 0.5396, + "step": 3056 + }, + { + "epoch": 0.7360500812616626, + "grad_norm": 2.6889917850494385, + "learning_rate": 3.446572513867175e-05, + "loss": 0.8915, + "step": 3057 + }, + { + "epoch": 0.7362908565581171, + "grad_norm": 2.3369765281677246, + "learning_rate": 3.4406848459493814e-05, + "loss": 0.62, + "step": 3058 + }, + { + "epoch": 0.7365316318545717, + "grad_norm": 1.6141836643218994, + "learning_rate": 3.434801166109981e-05, + "loss": 0.4647, + "step": 3059 + }, + { + "epoch": 0.7367724071510263, + "grad_norm": 4.394378662109375, + "learning_rate": 3.4289214779262636e-05, + "loss": 1.081, + "step": 3060 + }, + { + "epoch": 0.7370131824474809, + "grad_norm": 2.108896255493164, + "learning_rate": 3.423045784973091e-05, + "loss": 0.6174, + "step": 3061 + }, + { + "epoch": 0.7372539577439354, + "grad_norm": 6.742406845092773, + "learning_rate": 3.4171740908229044e-05, + "loss": 1.3335, + "step": 3062 + }, + { + "epoch": 0.73749473304039, + "grad_norm": 2.831634998321533, + "learning_rate": 3.411306399045697e-05, + "loss": 0.492, + "step": 3063 + }, + { + "epoch": 0.7377355083368446, + "grad_norm": 0.8104602694511414, + "learning_rate": 3.405442713209047e-05, + "loss": 0.6458, + "step": 3064 + }, + { + "epoch": 0.7379762836332993, + "grad_norm": 1.0663022994995117, + "learning_rate": 3.3995830368780825e-05, + "loss": 0.3529, + "step": 3065 + }, + { + "epoch": 0.7382170589297538, + "grad_norm": 2.1759705543518066, + "learning_rate": 3.393727373615503e-05, + "loss": 0.7057, + "step": 3066 + }, + { + "epoch": 0.7384578342262084, + "grad_norm": 2.893615245819092, + "learning_rate": 3.387875726981563e-05, + "loss": 0.7425, + "step": 3067 + }, + { + "epoch": 0.738698609522663, + "grad_norm": 1.8920822143554688, + "learning_rate": 3.3820281005340794e-05, + "loss": 0.4257, + "step": 3068 + }, + { + "epoch": 0.7389393848191176, + "grad_norm": 2.6992859840393066, + "learning_rate": 3.3761844978284205e-05, + "loss": 1.193, + "step": 3069 + }, + { + "epoch": 0.7391801601155722, + "grad_norm": 2.974738836288452, + "learning_rate": 3.370344922417513e-05, + "loss": 1.1457, + "step": 3070 + }, + { + "epoch": 0.7394209354120267, + "grad_norm": 0.7591432929039001, + "learning_rate": 3.364509377851828e-05, + "loss": 0.4777, + "step": 3071 + }, + { + "epoch": 0.7396617107084813, + "grad_norm": 2.3580071926116943, + "learning_rate": 3.358677867679394e-05, + "loss": 0.5326, + "step": 3072 + }, + { + "epoch": 0.7399024860049359, + "grad_norm": 2.4343063831329346, + "learning_rate": 3.3528503954457756e-05, + "loss": 0.4066, + "step": 3073 + }, + { + "epoch": 0.7401432613013905, + "grad_norm": 1.2667893171310425, + "learning_rate": 3.3470269646940935e-05, + "loss": 0.423, + "step": 3074 + }, + { + "epoch": 0.740384036597845, + "grad_norm": 2.454868793487549, + "learning_rate": 3.341207578965005e-05, + "loss": 0.2659, + "step": 3075 + }, + { + "epoch": 0.7406248118942996, + "grad_norm": 1.9105570316314697, + "learning_rate": 3.335392241796712e-05, + "loss": 1.0031, + "step": 3076 + }, + { + "epoch": 0.7408655871907542, + "grad_norm": 2.387080669403076, + "learning_rate": 3.329580956724955e-05, + "loss": 0.5239, + "step": 3077 + }, + { + "epoch": 0.7411063624872088, + "grad_norm": 8.040419578552246, + "learning_rate": 3.3237737272830013e-05, + "loss": 0.4703, + "step": 3078 + }, + { + "epoch": 0.7413471377836633, + "grad_norm": 1.0667513608932495, + "learning_rate": 3.317970557001664e-05, + "loss": 0.5395, + "step": 3079 + }, + { + "epoch": 0.741587913080118, + "grad_norm": 0.727729082107544, + "learning_rate": 3.312171449409285e-05, + "loss": 0.2785, + "step": 3080 + }, + { + "epoch": 0.7418286883765726, + "grad_norm": 1.5719585418701172, + "learning_rate": 3.306376408031729e-05, + "loss": 0.54, + "step": 3081 + }, + { + "epoch": 0.7420694636730272, + "grad_norm": 2.5653600692749023, + "learning_rate": 3.3005854363923995e-05, + "loss": 0.2214, + "step": 3082 + }, + { + "epoch": 0.7423102389694818, + "grad_norm": 1.5638865232467651, + "learning_rate": 3.294798538012217e-05, + "loss": 0.7477, + "step": 3083 + }, + { + "epoch": 0.7425510142659363, + "grad_norm": 1.88933527469635, + "learning_rate": 3.289015716409631e-05, + "loss": 0.7616, + "step": 3084 + }, + { + "epoch": 0.7427917895623909, + "grad_norm": 0.9233277440071106, + "learning_rate": 3.283236975100613e-05, + "loss": 0.3405, + "step": 3085 + }, + { + "epoch": 0.7430325648588455, + "grad_norm": 2.3473784923553467, + "learning_rate": 3.277462317598644e-05, + "loss": 0.8511, + "step": 3086 + }, + { + "epoch": 0.7432733401553001, + "grad_norm": 1.4704930782318115, + "learning_rate": 3.271691747414731e-05, + "loss": 0.5758, + "step": 3087 + }, + { + "epoch": 0.7435141154517546, + "grad_norm": 1.2950267791748047, + "learning_rate": 3.265925268057398e-05, + "loss": 0.7987, + "step": 3088 + }, + { + "epoch": 0.7437548907482092, + "grad_norm": 0.8450798392295837, + "learning_rate": 3.2601628830326726e-05, + "loss": 0.3298, + "step": 3089 + }, + { + "epoch": 0.7439956660446638, + "grad_norm": 1.7670706510543823, + "learning_rate": 3.2544045958441004e-05, + "loss": 0.4484, + "step": 3090 + }, + { + "epoch": 0.7442364413411184, + "grad_norm": 1.2544729709625244, + "learning_rate": 3.248650409992726e-05, + "loss": 0.4268, + "step": 3091 + }, + { + "epoch": 0.7444772166375729, + "grad_norm": 1.582452416419983, + "learning_rate": 3.2429003289771176e-05, + "loss": 0.5207, + "step": 3092 + }, + { + "epoch": 0.7447179919340275, + "grad_norm": 1.0165259838104248, + "learning_rate": 3.237154356293336e-05, + "loss": 0.7176, + "step": 3093 + }, + { + "epoch": 0.7449587672304822, + "grad_norm": 1.914751410484314, + "learning_rate": 3.231412495434939e-05, + "loss": 0.5358, + "step": 3094 + }, + { + "epoch": 0.7451995425269368, + "grad_norm": 4.326685428619385, + "learning_rate": 3.225674749892994e-05, + "loss": 0.7129, + "step": 3095 + }, + { + "epoch": 0.7454403178233914, + "grad_norm": 0.8451967239379883, + "learning_rate": 3.219941123156068e-05, + "loss": 0.4402, + "step": 3096 + }, + { + "epoch": 0.7456810931198459, + "grad_norm": 0.9839834570884705, + "learning_rate": 3.214211618710211e-05, + "loss": 0.3726, + "step": 3097 + }, + { + "epoch": 0.7459218684163005, + "grad_norm": 1.0465095043182373, + "learning_rate": 3.208486240038982e-05, + "loss": 0.1241, + "step": 3098 + }, + { + "epoch": 0.7461626437127551, + "grad_norm": 1.104686975479126, + "learning_rate": 3.202764990623417e-05, + "loss": 0.5279, + "step": 3099 + }, + { + "epoch": 0.7464034190092097, + "grad_norm": 1.0594794750213623, + "learning_rate": 3.1970478739420496e-05, + "loss": 0.3273, + "step": 3100 + }, + { + "epoch": 0.7466441943056642, + "grad_norm": 0.9185763597488403, + "learning_rate": 3.191334893470907e-05, + "loss": 0.3357, + "step": 3101 + }, + { + "epoch": 0.7468849696021188, + "grad_norm": 2.1206271648406982, + "learning_rate": 3.185626052683487e-05, + "loss": 0.5291, + "step": 3102 + }, + { + "epoch": 0.7471257448985734, + "grad_norm": 0.9549693465232849, + "learning_rate": 3.1799213550507835e-05, + "loss": 0.6672, + "step": 3103 + }, + { + "epoch": 0.747366520195028, + "grad_norm": 1.769875407218933, + "learning_rate": 3.174220804041258e-05, + "loss": 0.9207, + "step": 3104 + }, + { + "epoch": 0.7476072954914825, + "grad_norm": 10.123749732971191, + "learning_rate": 3.168524403120863e-05, + "loss": 0.9403, + "step": 3105 + }, + { + "epoch": 0.7478480707879371, + "grad_norm": 3.946068525314331, + "learning_rate": 3.1628321557530246e-05, + "loss": 0.6703, + "step": 3106 + }, + { + "epoch": 0.7480888460843917, + "grad_norm": 1.5204689502716064, + "learning_rate": 3.157144065398638e-05, + "loss": 0.6827, + "step": 3107 + }, + { + "epoch": 0.7483296213808464, + "grad_norm": 0.9539960026741028, + "learning_rate": 3.151460135516075e-05, + "loss": 0.6948, + "step": 3108 + }, + { + "epoch": 0.748570396677301, + "grad_norm": 2.0044784545898438, + "learning_rate": 3.145780369561182e-05, + "loss": 0.6487, + "step": 3109 + }, + { + "epoch": 0.7488111719737555, + "grad_norm": 2.3419203758239746, + "learning_rate": 3.140104770987265e-05, + "loss": 0.4121, + "step": 3110 + }, + { + "epoch": 0.7490519472702101, + "grad_norm": 2.2572646141052246, + "learning_rate": 3.1344333432451066e-05, + "loss": 0.2235, + "step": 3111 + }, + { + "epoch": 0.7492927225666647, + "grad_norm": 1.7564064264297485, + "learning_rate": 3.1287660897829404e-05, + "loss": 0.2708, + "step": 3112 + }, + { + "epoch": 0.7495334978631193, + "grad_norm": 1.6659893989562988, + "learning_rate": 3.1231030140464736e-05, + "loss": 0.7538, + "step": 3113 + }, + { + "epoch": 0.7497742731595738, + "grad_norm": 4.775331497192383, + "learning_rate": 3.117444119478871e-05, + "loss": 1.2959, + "step": 3114 + }, + { + "epoch": 0.7500150484560284, + "grad_norm": 4.739798545837402, + "learning_rate": 3.111789409520746e-05, + "loss": 0.5799, + "step": 3115 + }, + { + "epoch": 0.750255823752483, + "grad_norm": 1.0320911407470703, + "learning_rate": 3.1061388876101804e-05, + "loss": 0.4581, + "step": 3116 + }, + { + "epoch": 0.7504965990489376, + "grad_norm": 3.4287285804748535, + "learning_rate": 3.1004925571827023e-05, + "loss": 0.8336, + "step": 3117 + }, + { + "epoch": 0.7507373743453921, + "grad_norm": 2.3229026794433594, + "learning_rate": 3.094850421671295e-05, + "loss": 0.591, + "step": 3118 + }, + { + "epoch": 0.7509781496418467, + "grad_norm": 1.660323977470398, + "learning_rate": 3.089212484506392e-05, + "loss": 0.7506, + "step": 3119 + }, + { + "epoch": 0.7512189249383013, + "grad_norm": 2.4399898052215576, + "learning_rate": 3.083578749115865e-05, + "loss": 0.7181, + "step": 3120 + }, + { + "epoch": 0.751459700234756, + "grad_norm": 1.1477172374725342, + "learning_rate": 3.0779492189250414e-05, + "loss": 0.6411, + "step": 3121 + }, + { + "epoch": 0.7517004755312104, + "grad_norm": 3.424316167831421, + "learning_rate": 3.0723238973566925e-05, + "loss": 0.6226, + "step": 3122 + }, + { + "epoch": 0.751941250827665, + "grad_norm": 3.0182266235351562, + "learning_rate": 3.066702787831017e-05, + "loss": 0.3055, + "step": 3123 + }, + { + "epoch": 0.7521820261241197, + "grad_norm": 4.055928707122803, + "learning_rate": 3.06108589376567e-05, + "loss": 0.9499, + "step": 3124 + }, + { + "epoch": 0.7524228014205743, + "grad_norm": 2.966586112976074, + "learning_rate": 3.0554732185757315e-05, + "loss": 0.4065, + "step": 3125 + }, + { + "epoch": 0.7526635767170289, + "grad_norm": 2.517282247543335, + "learning_rate": 3.0498647656737223e-05, + "loss": 0.5657, + "step": 3126 + }, + { + "epoch": 0.7529043520134834, + "grad_norm": 5.178724765777588, + "learning_rate": 3.0442605384695977e-05, + "loss": 0.7705, + "step": 3127 + }, + { + "epoch": 0.753145127309938, + "grad_norm": 2.8488965034484863, + "learning_rate": 3.0386605403707346e-05, + "loss": 0.4091, + "step": 3128 + }, + { + "epoch": 0.7533859026063926, + "grad_norm": 0.804840087890625, + "learning_rate": 3.0330647747819496e-05, + "loss": 0.3117, + "step": 3129 + }, + { + "epoch": 0.7536266779028472, + "grad_norm": 1.8321592807769775, + "learning_rate": 3.0274732451054756e-05, + "loss": 0.58, + "step": 3130 + }, + { + "epoch": 0.7538674531993017, + "grad_norm": 0.4262060225009918, + "learning_rate": 3.021885954740977e-05, + "loss": 0.206, + "step": 3131 + }, + { + "epoch": 0.7541082284957563, + "grad_norm": 1.1734882593154907, + "learning_rate": 3.016302907085541e-05, + "loss": 0.9527, + "step": 3132 + }, + { + "epoch": 0.7543490037922109, + "grad_norm": 1.2724254131317139, + "learning_rate": 3.010724105533671e-05, + "loss": 0.7622, + "step": 3133 + }, + { + "epoch": 0.7545897790886655, + "grad_norm": 1.9372936487197876, + "learning_rate": 3.005149553477292e-05, + "loss": 0.5003, + "step": 3134 + }, + { + "epoch": 0.75483055438512, + "grad_norm": 4.942528247833252, + "learning_rate": 2.9995792543057478e-05, + "loss": 0.2299, + "step": 3135 + }, + { + "epoch": 0.7550713296815746, + "grad_norm": 2.330275535583496, + "learning_rate": 2.994013211405785e-05, + "loss": 0.5149, + "step": 3136 + }, + { + "epoch": 0.7553121049780293, + "grad_norm": 3.635746717453003, + "learning_rate": 2.988451428161578e-05, + "loss": 0.7856, + "step": 3137 + }, + { + "epoch": 0.7555528802744839, + "grad_norm": 1.8431618213653564, + "learning_rate": 2.982893907954697e-05, + "loss": 0.3647, + "step": 3138 + }, + { + "epoch": 0.7557936555709385, + "grad_norm": 3.0256638526916504, + "learning_rate": 2.977340654164129e-05, + "loss": 0.3034, + "step": 3139 + }, + { + "epoch": 0.756034430867393, + "grad_norm": 1.4221413135528564, + "learning_rate": 2.9717916701662662e-05, + "loss": 0.4793, + "step": 3140 + }, + { + "epoch": 0.7562752061638476, + "grad_norm": 1.2902501821517944, + "learning_rate": 2.966246959334903e-05, + "loss": 0.5462, + "step": 3141 + }, + { + "epoch": 0.7565159814603022, + "grad_norm": 2.2602968215942383, + "learning_rate": 2.960706525041238e-05, + "loss": 0.5961, + "step": 3142 + }, + { + "epoch": 0.7567567567567568, + "grad_norm": 2.0314295291900635, + "learning_rate": 2.9551703706538623e-05, + "loss": 0.9683, + "step": 3143 + }, + { + "epoch": 0.7569975320532113, + "grad_norm": 1.4725910425186157, + "learning_rate": 2.949638499538774e-05, + "loss": 0.5248, + "step": 3144 + }, + { + "epoch": 0.7572383073496659, + "grad_norm": 1.5069992542266846, + "learning_rate": 2.944110915059366e-05, + "loss": 0.5018, + "step": 3145 + }, + { + "epoch": 0.7574790826461205, + "grad_norm": 1.0725562572479248, + "learning_rate": 2.938587620576415e-05, + "loss": 0.7976, + "step": 3146 + }, + { + "epoch": 0.7577198579425751, + "grad_norm": 2.012692451477051, + "learning_rate": 2.9330686194481006e-05, + "loss": 0.5563, + "step": 3147 + }, + { + "epoch": 0.7579606332390296, + "grad_norm": 1.9315499067306519, + "learning_rate": 2.927553915029987e-05, + "loss": 0.8436, + "step": 3148 + }, + { + "epoch": 0.7582014085354842, + "grad_norm": 1.7731233835220337, + "learning_rate": 2.9220435106750276e-05, + "loss": 0.6159, + "step": 3149 + }, + { + "epoch": 0.7584421838319388, + "grad_norm": 4.7184977531433105, + "learning_rate": 2.9165374097335642e-05, + "loss": 0.625, + "step": 3150 + }, + { + "epoch": 0.7586829591283935, + "grad_norm": 3.9251320362091064, + "learning_rate": 2.9110356155533113e-05, + "loss": 0.6392, + "step": 3151 + }, + { + "epoch": 0.7589237344248481, + "grad_norm": 2.422001600265503, + "learning_rate": 2.905538131479376e-05, + "loss": 0.4932, + "step": 3152 + }, + { + "epoch": 0.7591645097213026, + "grad_norm": 3.3069140911102295, + "learning_rate": 2.9000449608542447e-05, + "loss": 0.7679, + "step": 3153 + }, + { + "epoch": 0.7594052850177572, + "grad_norm": 2.4573240280151367, + "learning_rate": 2.8945561070177696e-05, + "loss": 0.8308, + "step": 3154 + }, + { + "epoch": 0.7596460603142118, + "grad_norm": 1.1037508249282837, + "learning_rate": 2.8890715733071927e-05, + "loss": 0.4607, + "step": 3155 + }, + { + "epoch": 0.7598868356106664, + "grad_norm": 1.992222785949707, + "learning_rate": 2.8835913630571155e-05, + "loss": 1.0511, + "step": 3156 + }, + { + "epoch": 0.7601276109071209, + "grad_norm": 3.1501615047454834, + "learning_rate": 2.8781154795995247e-05, + "loss": 0.7244, + "step": 3157 + }, + { + "epoch": 0.7603683862035755, + "grad_norm": 5.186891078948975, + "learning_rate": 2.8726439262637727e-05, + "loss": 0.5768, + "step": 3158 + }, + { + "epoch": 0.7606091615000301, + "grad_norm": 3.4781057834625244, + "learning_rate": 2.8671767063765676e-05, + "loss": 0.4973, + "step": 3159 + }, + { + "epoch": 0.7608499367964847, + "grad_norm": 0.9576385617256165, + "learning_rate": 2.8617138232619955e-05, + "loss": 0.6546, + "step": 3160 + }, + { + "epoch": 0.7610907120929392, + "grad_norm": 1.434462070465088, + "learning_rate": 2.8562552802415055e-05, + "loss": 0.5047, + "step": 3161 + }, + { + "epoch": 0.7613314873893938, + "grad_norm": 1.7557677030563354, + "learning_rate": 2.850801080633896e-05, + "loss": 0.6268, + "step": 3162 + }, + { + "epoch": 0.7615722626858484, + "grad_norm": 1.2142372131347656, + "learning_rate": 2.8453512277553406e-05, + "loss": 0.5757, + "step": 3163 + }, + { + "epoch": 0.761813037982303, + "grad_norm": 1.8882231712341309, + "learning_rate": 2.8399057249193518e-05, + "loss": 0.9265, + "step": 3164 + }, + { + "epoch": 0.7620538132787577, + "grad_norm": 0.7379496693611145, + "learning_rate": 2.8344645754368172e-05, + "loss": 0.6167, + "step": 3165 + }, + { + "epoch": 0.7622945885752122, + "grad_norm": 1.584207534790039, + "learning_rate": 2.8290277826159683e-05, + "loss": 0.7881, + "step": 3166 + }, + { + "epoch": 0.7625353638716668, + "grad_norm": 2.564490556716919, + "learning_rate": 2.8235953497623803e-05, + "loss": 0.4444, + "step": 3167 + }, + { + "epoch": 0.7627761391681214, + "grad_norm": 2.109895706176758, + "learning_rate": 2.8181672801789917e-05, + "loss": 0.5114, + "step": 3168 + }, + { + "epoch": 0.763016914464576, + "grad_norm": 2.212892770767212, + "learning_rate": 2.8127435771660747e-05, + "loss": 0.9194, + "step": 3169 + }, + { + "epoch": 0.7632576897610305, + "grad_norm": 0.5335499048233032, + "learning_rate": 2.8073242440212556e-05, + "loss": 0.2802, + "step": 3170 + }, + { + "epoch": 0.7634984650574851, + "grad_norm": 0.7556986808776855, + "learning_rate": 2.8019092840395044e-05, + "loss": 0.4347, + "step": 3171 + }, + { + "epoch": 0.7637392403539397, + "grad_norm": 4.619632244110107, + "learning_rate": 2.796498700513124e-05, + "loss": 0.8844, + "step": 3172 + }, + { + "epoch": 0.7639800156503943, + "grad_norm": 4.419466018676758, + "learning_rate": 2.7910924967317585e-05, + "loss": 0.7078, + "step": 3173 + }, + { + "epoch": 0.7642207909468488, + "grad_norm": 2.6079466342926025, + "learning_rate": 2.785690675982404e-05, + "loss": 0.3685, + "step": 3174 + }, + { + "epoch": 0.7644615662433034, + "grad_norm": 2.4883298873901367, + "learning_rate": 2.7802932415493698e-05, + "loss": 1.2917, + "step": 3175 + }, + { + "epoch": 0.764702341539758, + "grad_norm": 2.154827356338501, + "learning_rate": 2.7749001967143128e-05, + "loss": 1.0546, + "step": 3176 + }, + { + "epoch": 0.7649431168362126, + "grad_norm": 1.366364598274231, + "learning_rate": 2.7695115447562126e-05, + "loss": 0.3194, + "step": 3177 + }, + { + "epoch": 0.7651838921326672, + "grad_norm": 2.245346784591675, + "learning_rate": 2.7641272889513837e-05, + "loss": 0.5605, + "step": 3178 + }, + { + "epoch": 0.7654246674291217, + "grad_norm": 8.141434669494629, + "learning_rate": 2.7587474325734687e-05, + "loss": 0.3617, + "step": 3179 + }, + { + "epoch": 0.7656654427255764, + "grad_norm": 1.830678105354309, + "learning_rate": 2.7533719788934255e-05, + "loss": 0.6659, + "step": 3180 + }, + { + "epoch": 0.765906218022031, + "grad_norm": 2.041790246963501, + "learning_rate": 2.7480009311795473e-05, + "loss": 0.7169, + "step": 3181 + }, + { + "epoch": 0.7661469933184856, + "grad_norm": 2.693058967590332, + "learning_rate": 2.7426342926974413e-05, + "loss": 0.6781, + "step": 3182 + }, + { + "epoch": 0.7663877686149401, + "grad_norm": 1.7061842679977417, + "learning_rate": 2.737272066710036e-05, + "loss": 0.5184, + "step": 3183 + }, + { + "epoch": 0.7666285439113947, + "grad_norm": 1.023386001586914, + "learning_rate": 2.73191425647758e-05, + "loss": 0.2387, + "step": 3184 + }, + { + "epoch": 0.7668693192078493, + "grad_norm": 0.27525773644447327, + "learning_rate": 2.726560865257629e-05, + "loss": 0.1579, + "step": 3185 + }, + { + "epoch": 0.7671100945043039, + "grad_norm": 0.31351879239082336, + "learning_rate": 2.7212118963050592e-05, + "loss": 0.2697, + "step": 3186 + }, + { + "epoch": 0.7673508698007584, + "grad_norm": 1.223887324333191, + "learning_rate": 2.715867352872058e-05, + "loss": 0.5606, + "step": 3187 + }, + { + "epoch": 0.767591645097213, + "grad_norm": 4.273595333099365, + "learning_rate": 2.710527238208116e-05, + "loss": 0.6982, + "step": 3188 + }, + { + "epoch": 0.7678324203936676, + "grad_norm": 3.2111504077911377, + "learning_rate": 2.705191555560035e-05, + "loss": 0.5278, + "step": 3189 + }, + { + "epoch": 0.7680731956901222, + "grad_norm": 1.322572112083435, + "learning_rate": 2.6998603081719243e-05, + "loss": 0.6689, + "step": 3190 + }, + { + "epoch": 0.7683139709865767, + "grad_norm": 2.3791556358337402, + "learning_rate": 2.6945334992851933e-05, + "loss": 0.3503, + "step": 3191 + }, + { + "epoch": 0.7685547462830313, + "grad_norm": 3.1207807064056396, + "learning_rate": 2.6892111321385584e-05, + "loss": 0.6434, + "step": 3192 + }, + { + "epoch": 0.7687955215794859, + "grad_norm": 1.937662124633789, + "learning_rate": 2.6838932099680225e-05, + "loss": 0.2284, + "step": 3193 + }, + { + "epoch": 0.7690362968759406, + "grad_norm": 1.8253540992736816, + "learning_rate": 2.678579736006901e-05, + "loss": 0.4102, + "step": 3194 + }, + { + "epoch": 0.7692770721723952, + "grad_norm": 1.696462869644165, + "learning_rate": 2.6732707134857937e-05, + "loss": 0.603, + "step": 3195 + }, + { + "epoch": 0.7695178474688497, + "grad_norm": 1.8397753238677979, + "learning_rate": 2.6679661456325988e-05, + "loss": 0.3548, + "step": 3196 + }, + { + "epoch": 0.7697586227653043, + "grad_norm": 2.073573350906372, + "learning_rate": 2.6626660356725064e-05, + "loss": 0.6005, + "step": 3197 + }, + { + "epoch": 0.7699993980617589, + "grad_norm": 0.84525465965271, + "learning_rate": 2.6573703868279963e-05, + "loss": 0.3808, + "step": 3198 + }, + { + "epoch": 0.7702401733582135, + "grad_norm": 2.4383602142333984, + "learning_rate": 2.6520792023188333e-05, + "loss": 0.8604, + "step": 3199 + }, + { + "epoch": 0.770480948654668, + "grad_norm": 0.7531054019927979, + "learning_rate": 2.646792485362074e-05, + "loss": 0.5609, + "step": 3200 + }, + { + "epoch": 0.7707217239511226, + "grad_norm": 0.8975092768669128, + "learning_rate": 2.6415102391720482e-05, + "loss": 0.4972, + "step": 3201 + }, + { + "epoch": 0.7709624992475772, + "grad_norm": 0.7330169081687927, + "learning_rate": 2.6362324669603776e-05, + "loss": 0.3688, + "step": 3202 + }, + { + "epoch": 0.7712032745440318, + "grad_norm": 1.5205063819885254, + "learning_rate": 2.630959171935956e-05, + "loss": 0.7152, + "step": 3203 + }, + { + "epoch": 0.7714440498404863, + "grad_norm": 4.553707122802734, + "learning_rate": 2.6256903573049597e-05, + "loss": 0.5479, + "step": 3204 + }, + { + "epoch": 0.7716848251369409, + "grad_norm": 0.9805248379707336, + "learning_rate": 2.6204260262708403e-05, + "loss": 0.655, + "step": 3205 + }, + { + "epoch": 0.7719256004333955, + "grad_norm": 1.8487534523010254, + "learning_rate": 2.6151661820343243e-05, + "loss": 0.6114, + "step": 3206 + }, + { + "epoch": 0.7721663757298501, + "grad_norm": 0.826151967048645, + "learning_rate": 2.6099108277934103e-05, + "loss": 0.2134, + "step": 3207 + }, + { + "epoch": 0.7724071510263048, + "grad_norm": 3.3528854846954346, + "learning_rate": 2.6046599667433603e-05, + "loss": 0.8448, + "step": 3208 + }, + { + "epoch": 0.7726479263227592, + "grad_norm": 1.5255182981491089, + "learning_rate": 2.5994136020767124e-05, + "loss": 0.5631, + "step": 3209 + }, + { + "epoch": 0.7728887016192139, + "grad_norm": 2.4078643321990967, + "learning_rate": 2.5941717369832707e-05, + "loss": 0.5426, + "step": 3210 + }, + { + "epoch": 0.7731294769156685, + "grad_norm": 4.288626670837402, + "learning_rate": 2.588934374650096e-05, + "loss": 0.4884, + "step": 3211 + }, + { + "epoch": 0.7733702522121231, + "grad_norm": 3.0660624504089355, + "learning_rate": 2.583701518261519e-05, + "loss": 0.4575, + "step": 3212 + }, + { + "epoch": 0.7736110275085776, + "grad_norm": 0.8354116678237915, + "learning_rate": 2.5784731709991272e-05, + "loss": 0.6711, + "step": 3213 + }, + { + "epoch": 0.7738518028050322, + "grad_norm": 3.1987497806549072, + "learning_rate": 2.57324933604177e-05, + "loss": 0.6139, + "step": 3214 + }, + { + "epoch": 0.7740925781014868, + "grad_norm": 0.9417548179626465, + "learning_rate": 2.5680300165655503e-05, + "loss": 0.6099, + "step": 3215 + }, + { + "epoch": 0.7743333533979414, + "grad_norm": 1.8997162580490112, + "learning_rate": 2.5628152157438222e-05, + "loss": 0.6094, + "step": 3216 + }, + { + "epoch": 0.7745741286943959, + "grad_norm": 1.4700846672058105, + "learning_rate": 2.5576049367471998e-05, + "loss": 0.2409, + "step": 3217 + }, + { + "epoch": 0.7748149039908505, + "grad_norm": 7.270529747009277, + "learning_rate": 2.5523991827435468e-05, + "loss": 0.6279, + "step": 3218 + }, + { + "epoch": 0.7750556792873051, + "grad_norm": 1.6653450727462769, + "learning_rate": 2.5471979568979666e-05, + "loss": 0.6544, + "step": 3219 + }, + { + "epoch": 0.7752964545837597, + "grad_norm": 4.143406391143799, + "learning_rate": 2.5420012623728208e-05, + "loss": 0.733, + "step": 3220 + }, + { + "epoch": 0.7755372298802143, + "grad_norm": 0.922996997833252, + "learning_rate": 2.5368091023277096e-05, + "loss": 0.3873, + "step": 3221 + }, + { + "epoch": 0.7757780051766688, + "grad_norm": 1.7943379878997803, + "learning_rate": 2.5316214799194805e-05, + "loss": 0.1036, + "step": 3222 + }, + { + "epoch": 0.7760187804731234, + "grad_norm": 3.269728422164917, + "learning_rate": 2.5264383983022198e-05, + "loss": 0.697, + "step": 3223 + }, + { + "epoch": 0.7762595557695781, + "grad_norm": 1.3404314517974854, + "learning_rate": 2.5212598606272486e-05, + "loss": 0.7117, + "step": 3224 + }, + { + "epoch": 0.7765003310660327, + "grad_norm": 3.5856986045837402, + "learning_rate": 2.516085870043131e-05, + "loss": 0.6111, + "step": 3225 + }, + { + "epoch": 0.7767411063624872, + "grad_norm": 1.1721508502960205, + "learning_rate": 2.51091642969567e-05, + "loss": 0.2938, + "step": 3226 + }, + { + "epoch": 0.7769818816589418, + "grad_norm": 2.648401975631714, + "learning_rate": 2.50575154272789e-05, + "loss": 0.8979, + "step": 3227 + }, + { + "epoch": 0.7772226569553964, + "grad_norm": 4.080894947052002, + "learning_rate": 2.5005912122800557e-05, + "loss": 0.284, + "step": 3228 + }, + { + "epoch": 0.777463432251851, + "grad_norm": 2.2749102115631104, + "learning_rate": 2.495435441489661e-05, + "loss": 0.6611, + "step": 3229 + }, + { + "epoch": 0.7777042075483055, + "grad_norm": 2.709296226501465, + "learning_rate": 2.4902842334914266e-05, + "loss": 0.6276, + "step": 3230 + }, + { + "epoch": 0.7779449828447601, + "grad_norm": 1.5649709701538086, + "learning_rate": 2.4851375914173003e-05, + "loss": 0.706, + "step": 3231 + }, + { + "epoch": 0.7781857581412147, + "grad_norm": 1.1615535020828247, + "learning_rate": 2.4799955183964463e-05, + "loss": 0.4029, + "step": 3232 + }, + { + "epoch": 0.7784265334376693, + "grad_norm": 1.1178641319274902, + "learning_rate": 2.4748580175552627e-05, + "loss": 0.6088, + "step": 3233 + }, + { + "epoch": 0.7786673087341239, + "grad_norm": 1.7456036806106567, + "learning_rate": 2.4697250920173566e-05, + "loss": 0.6452, + "step": 3234 + }, + { + "epoch": 0.7789080840305784, + "grad_norm": 1.5968141555786133, + "learning_rate": 2.46459674490356e-05, + "loss": 0.2782, + "step": 3235 + }, + { + "epoch": 0.779148859327033, + "grad_norm": 1.4153774976730347, + "learning_rate": 2.4594729793319227e-05, + "loss": 0.9929, + "step": 3236 + }, + { + "epoch": 0.7793896346234876, + "grad_norm": 4.274727821350098, + "learning_rate": 2.4543537984176978e-05, + "loss": 0.4176, + "step": 3237 + }, + { + "epoch": 0.7796304099199423, + "grad_norm": 2.977787494659424, + "learning_rate": 2.449239205273367e-05, + "loss": 0.5403, + "step": 3238 + }, + { + "epoch": 0.7798711852163968, + "grad_norm": 2.9022774696350098, + "learning_rate": 2.4441292030086137e-05, + "loss": 0.4639, + "step": 3239 + }, + { + "epoch": 0.7801119605128514, + "grad_norm": 1.2932614088058472, + "learning_rate": 2.439023794730326e-05, + "loss": 0.6634, + "step": 3240 + }, + { + "epoch": 0.780352735809306, + "grad_norm": 3.5876283645629883, + "learning_rate": 2.433922983542609e-05, + "loss": 1.0981, + "step": 3241 + }, + { + "epoch": 0.7805935111057606, + "grad_norm": 1.4793999195098877, + "learning_rate": 2.4288267725467618e-05, + "loss": 0.398, + "step": 3242 + }, + { + "epoch": 0.7808342864022151, + "grad_norm": 3.516136884689331, + "learning_rate": 2.4237351648412942e-05, + "loss": 0.6531, + "step": 3243 + }, + { + "epoch": 0.7810750616986697, + "grad_norm": 0.46728962659835815, + "learning_rate": 2.4186481635219193e-05, + "loss": 0.0365, + "step": 3244 + }, + { + "epoch": 0.7813158369951243, + "grad_norm": 5.394861221313477, + "learning_rate": 2.4135657716815397e-05, + "loss": 0.18, + "step": 3245 + }, + { + "epoch": 0.7815566122915789, + "grad_norm": 2.0945961475372314, + "learning_rate": 2.408487992410263e-05, + "loss": 0.5442, + "step": 3246 + }, + { + "epoch": 0.7817973875880335, + "grad_norm": 0.8790675401687622, + "learning_rate": 2.4034148287953904e-05, + "loss": 0.434, + "step": 3247 + }, + { + "epoch": 0.782038162884488, + "grad_norm": 5.560616493225098, + "learning_rate": 2.3983462839214177e-05, + "loss": 0.4532, + "step": 3248 + }, + { + "epoch": 0.7822789381809426, + "grad_norm": 1.3845301866531372, + "learning_rate": 2.3932823608700338e-05, + "loss": 0.6569, + "step": 3249 + }, + { + "epoch": 0.7825197134773972, + "grad_norm": 4.446075916290283, + "learning_rate": 2.3882230627201096e-05, + "loss": 0.2362, + "step": 3250 + }, + { + "epoch": 0.7827604887738518, + "grad_norm": 3.534898281097412, + "learning_rate": 2.3831683925477134e-05, + "loss": 0.3983, + "step": 3251 + }, + { + "epoch": 0.7830012640703063, + "grad_norm": 2.6837666034698486, + "learning_rate": 2.3781183534260975e-05, + "loss": 0.6582, + "step": 3252 + }, + { + "epoch": 0.783242039366761, + "grad_norm": 1.4857863187789917, + "learning_rate": 2.373072948425692e-05, + "loss": 0.3928, + "step": 3253 + }, + { + "epoch": 0.7834828146632156, + "grad_norm": 7.157393932342529, + "learning_rate": 2.368032180614118e-05, + "loss": 0.7341, + "step": 3254 + }, + { + "epoch": 0.7837235899596702, + "grad_norm": 1.6006975173950195, + "learning_rate": 2.3629960530561736e-05, + "loss": 0.8314, + "step": 3255 + }, + { + "epoch": 0.7839643652561247, + "grad_norm": 2.4584901332855225, + "learning_rate": 2.3579645688138352e-05, + "loss": 0.4926, + "step": 3256 + }, + { + "epoch": 0.7842051405525793, + "grad_norm": 2.698150396347046, + "learning_rate": 2.3529377309462585e-05, + "loss": 0.7207, + "step": 3257 + }, + { + "epoch": 0.7844459158490339, + "grad_norm": 2.509859561920166, + "learning_rate": 2.347915542509769e-05, + "loss": 0.8804, + "step": 3258 + }, + { + "epoch": 0.7846866911454885, + "grad_norm": 2.254075765609741, + "learning_rate": 2.342898006557872e-05, + "loss": 0.4099, + "step": 3259 + }, + { + "epoch": 0.7849274664419431, + "grad_norm": 1.3479466438293457, + "learning_rate": 2.337885126141236e-05, + "loss": 0.4939, + "step": 3260 + }, + { + "epoch": 0.7851682417383976, + "grad_norm": 1.9788506031036377, + "learning_rate": 2.3328769043077058e-05, + "loss": 0.6189, + "step": 3261 + }, + { + "epoch": 0.7854090170348522, + "grad_norm": 1.8684098720550537, + "learning_rate": 2.3278733441022925e-05, + "loss": 0.8066, + "step": 3262 + }, + { + "epoch": 0.7856497923313068, + "grad_norm": 3.792185068130493, + "learning_rate": 2.3228744485671718e-05, + "loss": 0.4835, + "step": 3263 + }, + { + "epoch": 0.7858905676277614, + "grad_norm": 0.6826027035713196, + "learning_rate": 2.3178802207416828e-05, + "loss": 0.4087, + "step": 3264 + }, + { + "epoch": 0.7861313429242159, + "grad_norm": 1.6336182355880737, + "learning_rate": 2.3128906636623303e-05, + "loss": 0.5187, + "step": 3265 + }, + { + "epoch": 0.7863721182206705, + "grad_norm": 2.8685998916625977, + "learning_rate": 2.3079057803627713e-05, + "loss": 0.3996, + "step": 3266 + }, + { + "epoch": 0.7866128935171252, + "grad_norm": 1.4814997911453247, + "learning_rate": 2.3029255738738308e-05, + "loss": 0.2919, + "step": 3267 + }, + { + "epoch": 0.7868536688135798, + "grad_norm": 2.786038398742676, + "learning_rate": 2.2979500472234806e-05, + "loss": 0.62, + "step": 3268 + }, + { + "epoch": 0.7870944441100343, + "grad_norm": 4.719537734985352, + "learning_rate": 2.2929792034368535e-05, + "loss": 0.822, + "step": 3269 + }, + { + "epoch": 0.7873352194064889, + "grad_norm": 8.993035316467285, + "learning_rate": 2.2880130455362358e-05, + "loss": 0.4918, + "step": 3270 + }, + { + "epoch": 0.7875759947029435, + "grad_norm": 1.7135777473449707, + "learning_rate": 2.2830515765410622e-05, + "loss": 0.4715, + "step": 3271 + }, + { + "epoch": 0.7878167699993981, + "grad_norm": 2.256098508834839, + "learning_rate": 2.278094799467918e-05, + "loss": 1.0771, + "step": 3272 + }, + { + "epoch": 0.7880575452958526, + "grad_norm": 1.1801178455352783, + "learning_rate": 2.2731427173305307e-05, + "loss": 0.6812, + "step": 3273 + }, + { + "epoch": 0.7882983205923072, + "grad_norm": 1.6812212467193604, + "learning_rate": 2.268195333139781e-05, + "loss": 0.567, + "step": 3274 + }, + { + "epoch": 0.7885390958887618, + "grad_norm": 2.234989881515503, + "learning_rate": 2.263252649903691e-05, + "loss": 0.5069, + "step": 3275 + }, + { + "epoch": 0.7887798711852164, + "grad_norm": 1.5656296014785767, + "learning_rate": 2.2583146706274184e-05, + "loss": 0.3535, + "step": 3276 + }, + { + "epoch": 0.789020646481671, + "grad_norm": 1.6030066013336182, + "learning_rate": 2.253381398313269e-05, + "loss": 0.9362, + "step": 3277 + }, + { + "epoch": 0.7892614217781255, + "grad_norm": 1.6599286794662476, + "learning_rate": 2.2484528359606816e-05, + "loss": 0.2563, + "step": 3278 + }, + { + "epoch": 0.7895021970745801, + "grad_norm": 3.2402637004852295, + "learning_rate": 2.2435289865662344e-05, + "loss": 0.8971, + "step": 3279 + }, + { + "epoch": 0.7897429723710347, + "grad_norm": 3.251466751098633, + "learning_rate": 2.2386098531236422e-05, + "loss": 1.0431, + "step": 3280 + }, + { + "epoch": 0.7899837476674894, + "grad_norm": 1.1390844583511353, + "learning_rate": 2.233695438623743e-05, + "loss": 0.381, + "step": 3281 + }, + { + "epoch": 0.7902245229639439, + "grad_norm": 1.3459970951080322, + "learning_rate": 2.228785746054515e-05, + "loss": 0.5704, + "step": 3282 + }, + { + "epoch": 0.7904652982603985, + "grad_norm": 6.388441562652588, + "learning_rate": 2.223880778401065e-05, + "loss": 0.7198, + "step": 3283 + }, + { + "epoch": 0.7907060735568531, + "grad_norm": 2.3066797256469727, + "learning_rate": 2.2189805386456186e-05, + "loss": 0.3931, + "step": 3284 + }, + { + "epoch": 0.7909468488533077, + "grad_norm": 4.641172409057617, + "learning_rate": 2.2140850297675353e-05, + "loss": 0.9101, + "step": 3285 + }, + { + "epoch": 0.7911876241497622, + "grad_norm": 2.48939847946167, + "learning_rate": 2.2091942547432955e-05, + "loss": 0.5307, + "step": 3286 + }, + { + "epoch": 0.7914283994462168, + "grad_norm": 1.4391555786132812, + "learning_rate": 2.2043082165465023e-05, + "loss": 0.3424, + "step": 3287 + }, + { + "epoch": 0.7916691747426714, + "grad_norm": 2.05548357963562, + "learning_rate": 2.19942691814788e-05, + "loss": 0.5474, + "step": 3288 + }, + { + "epoch": 0.791909950039126, + "grad_norm": 3.470940589904785, + "learning_rate": 2.194550362515263e-05, + "loss": 0.5817, + "step": 3289 + }, + { + "epoch": 0.7921507253355806, + "grad_norm": 6.029779434204102, + "learning_rate": 2.189678552613612e-05, + "loss": 0.8264, + "step": 3290 + }, + { + "epoch": 0.7923915006320351, + "grad_norm": 2.177302837371826, + "learning_rate": 2.184811491405001e-05, + "loss": 0.5883, + "step": 3291 + }, + { + "epoch": 0.7926322759284897, + "grad_norm": 3.795201063156128, + "learning_rate": 2.1799491818486083e-05, + "loss": 0.8381, + "step": 3292 + }, + { + "epoch": 0.7928730512249443, + "grad_norm": 2.861975908279419, + "learning_rate": 2.1750916269007316e-05, + "loss": 0.5125, + "step": 3293 + }, + { + "epoch": 0.793113826521399, + "grad_norm": 2.659313917160034, + "learning_rate": 2.1702388295147747e-05, + "loss": 0.5038, + "step": 3294 + }, + { + "epoch": 0.7933546018178534, + "grad_norm": 2.762467384338379, + "learning_rate": 2.165390792641251e-05, + "loss": 0.5655, + "step": 3295 + }, + { + "epoch": 0.793595377114308, + "grad_norm": 1.6343928575515747, + "learning_rate": 2.160547519227779e-05, + "loss": 0.4066, + "step": 3296 + }, + { + "epoch": 0.7938361524107627, + "grad_norm": 0.6321638822555542, + "learning_rate": 2.155709012219076e-05, + "loss": 0.3478, + "step": 3297 + }, + { + "epoch": 0.7940769277072173, + "grad_norm": 4.435551643371582, + "learning_rate": 2.1508752745569695e-05, + "loss": 1.071, + "step": 3298 + }, + { + "epoch": 0.7943177030036718, + "grad_norm": 11.227981567382812, + "learning_rate": 2.1460463091803773e-05, + "loss": 0.741, + "step": 3299 + }, + { + "epoch": 0.7945584783001264, + "grad_norm": 1.14915931224823, + "learning_rate": 2.1412221190253245e-05, + "loss": 0.5523, + "step": 3300 + }, + { + "epoch": 0.794799253596581, + "grad_norm": 0.4120855927467346, + "learning_rate": 2.1364027070249282e-05, + "loss": 0.1457, + "step": 3301 + }, + { + "epoch": 0.7950400288930356, + "grad_norm": 3.5175342559814453, + "learning_rate": 2.1315880761094044e-05, + "loss": 1.1578, + "step": 3302 + }, + { + "epoch": 0.7952808041894902, + "grad_norm": 1.621909260749817, + "learning_rate": 2.126778229206058e-05, + "loss": 1.1031, + "step": 3303 + }, + { + "epoch": 0.7955215794859447, + "grad_norm": 1.1469271183013916, + "learning_rate": 2.1219731692392887e-05, + "loss": 0.16, + "step": 3304 + }, + { + "epoch": 0.7957623547823993, + "grad_norm": 1.0267417430877686, + "learning_rate": 2.1171728991305795e-05, + "loss": 0.6588, + "step": 3305 + }, + { + "epoch": 0.7960031300788539, + "grad_norm": 0.7822784781455994, + "learning_rate": 2.1123774217985116e-05, + "loss": 0.4397, + "step": 3306 + }, + { + "epoch": 0.7962439053753085, + "grad_norm": 0.9245109558105469, + "learning_rate": 2.107586740158738e-05, + "loss": 0.2842, + "step": 3307 + }, + { + "epoch": 0.796484680671763, + "grad_norm": 1.0245726108551025, + "learning_rate": 2.1028008571240088e-05, + "loss": 1.107, + "step": 3308 + }, + { + "epoch": 0.7967254559682176, + "grad_norm": 4.2257466316223145, + "learning_rate": 2.0980197756041542e-05, + "loss": 0.7681, + "step": 3309 + }, + { + "epoch": 0.7969662312646723, + "grad_norm": 1.4079909324645996, + "learning_rate": 2.0932434985060733e-05, + "loss": 0.3071, + "step": 3310 + }, + { + "epoch": 0.7972070065611269, + "grad_norm": 2.9457712173461914, + "learning_rate": 2.0884720287337657e-05, + "loss": 0.5083, + "step": 3311 + }, + { + "epoch": 0.7974477818575814, + "grad_norm": 1.8760308027267456, + "learning_rate": 2.0837053691882856e-05, + "loss": 0.1079, + "step": 3312 + }, + { + "epoch": 0.797688557154036, + "grad_norm": 2.5826492309570312, + "learning_rate": 2.0789435227677777e-05, + "loss": 0.8308, + "step": 3313 + }, + { + "epoch": 0.7979293324504906, + "grad_norm": 1.930856466293335, + "learning_rate": 2.074186492367457e-05, + "loss": 0.6475, + "step": 3314 + }, + { + "epoch": 0.7981701077469452, + "grad_norm": 1.8756681680679321, + "learning_rate": 2.069434280879603e-05, + "loss": 0.4886, + "step": 3315 + }, + { + "epoch": 0.7984108830433998, + "grad_norm": 3.851440668106079, + "learning_rate": 2.0646868911935735e-05, + "loss": 1.2528, + "step": 3316 + }, + { + "epoch": 0.7986516583398543, + "grad_norm": 1.4915354251861572, + "learning_rate": 2.0599443261957962e-05, + "loss": 0.534, + "step": 3317 + }, + { + "epoch": 0.7988924336363089, + "grad_norm": 0.9275015592575073, + "learning_rate": 2.0552065887697546e-05, + "loss": 0.3342, + "step": 3318 + }, + { + "epoch": 0.7991332089327635, + "grad_norm": 2.0070860385894775, + "learning_rate": 2.0504736817960068e-05, + "loss": 0.7064, + "step": 3319 + }, + { + "epoch": 0.7993739842292181, + "grad_norm": 2.1603689193725586, + "learning_rate": 2.045745608152171e-05, + "loss": 0.7129, + "step": 3320 + }, + { + "epoch": 0.7996147595256726, + "grad_norm": 1.2571876049041748, + "learning_rate": 2.0410223707129274e-05, + "loss": 0.8612, + "step": 3321 + }, + { + "epoch": 0.7998555348221272, + "grad_norm": 2.6326212882995605, + "learning_rate": 2.0363039723500156e-05, + "loss": 0.4104, + "step": 3322 + }, + { + "epoch": 0.8000963101185818, + "grad_norm": 1.6665747165679932, + "learning_rate": 2.0315904159322287e-05, + "loss": 0.6619, + "step": 3323 + }, + { + "epoch": 0.8003370854150365, + "grad_norm": 1.7550292015075684, + "learning_rate": 2.026881704325425e-05, + "loss": 0.8006, + "step": 3324 + }, + { + "epoch": 0.800577860711491, + "grad_norm": 0.9334133267402649, + "learning_rate": 2.0221778403925062e-05, + "loss": 0.4847, + "step": 3325 + }, + { + "epoch": 0.8008186360079456, + "grad_norm": 1.6787467002868652, + "learning_rate": 2.0174788269934343e-05, + "loss": 0.4084, + "step": 3326 + }, + { + "epoch": 0.8010594113044002, + "grad_norm": 1.1613237857818604, + "learning_rate": 2.01278466698522e-05, + "loss": 0.4913, + "step": 3327 + }, + { + "epoch": 0.8013001866008548, + "grad_norm": 1.7713500261306763, + "learning_rate": 2.0080953632219247e-05, + "loss": 0.3606, + "step": 3328 + }, + { + "epoch": 0.8015409618973094, + "grad_norm": 2.774338960647583, + "learning_rate": 2.0034109185546534e-05, + "loss": 0.6157, + "step": 3329 + }, + { + "epoch": 0.8017817371937639, + "grad_norm": 2.1375133991241455, + "learning_rate": 1.9987313358315628e-05, + "loss": 0.8029, + "step": 3330 + }, + { + "epoch": 0.8020225124902185, + "grad_norm": 4.05165958404541, + "learning_rate": 1.994056617897846e-05, + "loss": 0.9467, + "step": 3331 + }, + { + "epoch": 0.8022632877866731, + "grad_norm": 1.2718948125839233, + "learning_rate": 1.9893867675957445e-05, + "loss": 0.6438, + "step": 3332 + }, + { + "epoch": 0.8025040630831277, + "grad_norm": 2.1101791858673096, + "learning_rate": 1.984721787764534e-05, + "loss": 0.6168, + "step": 3333 + }, + { + "epoch": 0.8027448383795822, + "grad_norm": 3.8065285682678223, + "learning_rate": 1.9800616812405348e-05, + "loss": 0.331, + "step": 3334 + }, + { + "epoch": 0.8029856136760368, + "grad_norm": 1.6323808431625366, + "learning_rate": 1.9754064508571036e-05, + "loss": 0.4418, + "step": 3335 + }, + { + "epoch": 0.8032263889724914, + "grad_norm": 1.982974648475647, + "learning_rate": 1.9707560994446284e-05, + "loss": 0.6296, + "step": 3336 + }, + { + "epoch": 0.803467164268946, + "grad_norm": 0.8455390334129333, + "learning_rate": 1.9661106298305387e-05, + "loss": 0.4286, + "step": 3337 + }, + { + "epoch": 0.8037079395654005, + "grad_norm": 1.4520126581192017, + "learning_rate": 1.9614700448392832e-05, + "loss": 0.7171, + "step": 3338 + }, + { + "epoch": 0.8039487148618552, + "grad_norm": 1.9037809371948242, + "learning_rate": 1.9568343472923524e-05, + "loss": 0.4656, + "step": 3339 + }, + { + "epoch": 0.8041894901583098, + "grad_norm": 1.7248350381851196, + "learning_rate": 1.9522035400082615e-05, + "loss": 0.3961, + "step": 3340 + }, + { + "epoch": 0.8044302654547644, + "grad_norm": 2.145430326461792, + "learning_rate": 1.947577625802548e-05, + "loss": 0.5493, + "step": 3341 + }, + { + "epoch": 0.804671040751219, + "grad_norm": 0.38401633501052856, + "learning_rate": 1.9429566074877816e-05, + "loss": 0.4645, + "step": 3342 + }, + { + "epoch": 0.8049118160476735, + "grad_norm": 4.1669840812683105, + "learning_rate": 1.938340487873549e-05, + "loss": 0.6155, + "step": 3343 + }, + { + "epoch": 0.8051525913441281, + "grad_norm": 1.835777997970581, + "learning_rate": 1.9337292697664633e-05, + "loss": 0.7442, + "step": 3344 + }, + { + "epoch": 0.8053933666405827, + "grad_norm": 1.9592784643173218, + "learning_rate": 1.9291229559701572e-05, + "loss": 0.6318, + "step": 3345 + }, + { + "epoch": 0.8056341419370373, + "grad_norm": 2.0905535221099854, + "learning_rate": 1.9245215492852766e-05, + "loss": 0.5699, + "step": 3346 + }, + { + "epoch": 0.8058749172334918, + "grad_norm": 1.7609286308288574, + "learning_rate": 1.919925052509487e-05, + "loss": 0.9583, + "step": 3347 + }, + { + "epoch": 0.8061156925299464, + "grad_norm": 2.8224740028381348, + "learning_rate": 1.9153334684374725e-05, + "loss": 0.8957, + "step": 3348 + }, + { + "epoch": 0.806356467826401, + "grad_norm": 0.9539978504180908, + "learning_rate": 1.9107467998609228e-05, + "loss": 0.7801, + "step": 3349 + }, + { + "epoch": 0.8065972431228556, + "grad_norm": 3.402155637741089, + "learning_rate": 1.9061650495685433e-05, + "loss": 0.8503, + "step": 3350 + }, + { + "epoch": 0.8068380184193101, + "grad_norm": 1.3554385900497437, + "learning_rate": 1.9015882203460488e-05, + "loss": 0.5393, + "step": 3351 + }, + { + "epoch": 0.8070787937157647, + "grad_norm": 3.57460618019104, + "learning_rate": 1.8970163149761634e-05, + "loss": 0.9425, + "step": 3352 + }, + { + "epoch": 0.8073195690122194, + "grad_norm": 7.111121654510498, + "learning_rate": 1.8924493362386166e-05, + "loss": 0.5293, + "step": 3353 + }, + { + "epoch": 0.807560344308674, + "grad_norm": 2.1528825759887695, + "learning_rate": 1.887887286910137e-05, + "loss": 0.2559, + "step": 3354 + }, + { + "epoch": 0.8078011196051285, + "grad_norm": 0.7006820440292358, + "learning_rate": 1.8833301697644644e-05, + "loss": 0.431, + "step": 3355 + }, + { + "epoch": 0.8080418949015831, + "grad_norm": 2.09385085105896, + "learning_rate": 1.878777987572339e-05, + "loss": 1.159, + "step": 3356 + }, + { + "epoch": 0.8082826701980377, + "grad_norm": 3.343334674835205, + "learning_rate": 1.8742307431014905e-05, + "loss": 1.0441, + "step": 3357 + }, + { + "epoch": 0.8085234454944923, + "grad_norm": 4.183254241943359, + "learning_rate": 1.869688439116659e-05, + "loss": 1.0862, + "step": 3358 + }, + { + "epoch": 0.8087642207909469, + "grad_norm": 2.1016793251037598, + "learning_rate": 1.8651510783795734e-05, + "loss": 0.5553, + "step": 3359 + }, + { + "epoch": 0.8090049960874014, + "grad_norm": 0.7969531416893005, + "learning_rate": 1.8606186636489596e-05, + "loss": 0.54, + "step": 3360 + }, + { + "epoch": 0.809245771383856, + "grad_norm": 1.4294320344924927, + "learning_rate": 1.8560911976805405e-05, + "loss": 0.4665, + "step": 3361 + }, + { + "epoch": 0.8094865466803106, + "grad_norm": 3.3648643493652344, + "learning_rate": 1.8515686832270184e-05, + "loss": 1.236, + "step": 3362 + }, + { + "epoch": 0.8097273219767652, + "grad_norm": 1.2407772541046143, + "learning_rate": 1.8470511230380983e-05, + "loss": 0.553, + "step": 3363 + }, + { + "epoch": 0.8099680972732197, + "grad_norm": 3.098479986190796, + "learning_rate": 1.8425385198604615e-05, + "loss": 0.4733, + "step": 3364 + }, + { + "epoch": 0.8102088725696743, + "grad_norm": 1.627521276473999, + "learning_rate": 1.8380308764377842e-05, + "loss": 0.4355, + "step": 3365 + }, + { + "epoch": 0.8104496478661289, + "grad_norm": 1.2290712594985962, + "learning_rate": 1.833528195510722e-05, + "loss": 0.4626, + "step": 3366 + }, + { + "epoch": 0.8106904231625836, + "grad_norm": 1.5985430479049683, + "learning_rate": 1.8290304798169176e-05, + "loss": 0.2771, + "step": 3367 + }, + { + "epoch": 0.810931198459038, + "grad_norm": 2.063868522644043, + "learning_rate": 1.8245377320909894e-05, + "loss": 0.3625, + "step": 3368 + }, + { + "epoch": 0.8111719737554927, + "grad_norm": 17.49557113647461, + "learning_rate": 1.8200499550645433e-05, + "loss": 1.085, + "step": 3369 + }, + { + "epoch": 0.8114127490519473, + "grad_norm": 2.0841176509857178, + "learning_rate": 1.815567151466151e-05, + "loss": 0.722, + "step": 3370 + }, + { + "epoch": 0.8116535243484019, + "grad_norm": 2.4572806358337402, + "learning_rate": 1.8110893240213733e-05, + "loss": 0.365, + "step": 3371 + }, + { + "epoch": 0.8118942996448565, + "grad_norm": 0.8181131482124329, + "learning_rate": 1.806616475452734e-05, + "loss": 0.4139, + "step": 3372 + }, + { + "epoch": 0.812135074941311, + "grad_norm": 2.0539329051971436, + "learning_rate": 1.8021486084797368e-05, + "loss": 0.8089, + "step": 3373 + }, + { + "epoch": 0.8123758502377656, + "grad_norm": 1.458526611328125, + "learning_rate": 1.797685725818856e-05, + "loss": 0.7537, + "step": 3374 + }, + { + "epoch": 0.8126166255342202, + "grad_norm": 0.4736323058605194, + "learning_rate": 1.7932278301835347e-05, + "loss": 0.1747, + "step": 3375 + }, + { + "epoch": 0.8128574008306748, + "grad_norm": 2.3120036125183105, + "learning_rate": 1.7887749242841844e-05, + "loss": 0.4192, + "step": 3376 + }, + { + "epoch": 0.8130981761271293, + "grad_norm": 15.435149192810059, + "learning_rate": 1.7843270108281772e-05, + "loss": 0.6507, + "step": 3377 + }, + { + "epoch": 0.8133389514235839, + "grad_norm": 1.2103174924850464, + "learning_rate": 1.779884092519859e-05, + "loss": 0.6697, + "step": 3378 + }, + { + "epoch": 0.8135797267200385, + "grad_norm": 1.7959200143814087, + "learning_rate": 1.7754461720605342e-05, + "loss": 0.4094, + "step": 3379 + }, + { + "epoch": 0.8138205020164931, + "grad_norm": 2.2297651767730713, + "learning_rate": 1.7710132521484646e-05, + "loss": 1.1822, + "step": 3380 + }, + { + "epoch": 0.8140612773129476, + "grad_norm": 1.7390764951705933, + "learning_rate": 1.7665853354788774e-05, + "loss": 0.6164, + "step": 3381 + }, + { + "epoch": 0.8143020526094022, + "grad_norm": 1.3549991846084595, + "learning_rate": 1.7621624247439594e-05, + "loss": 0.3209, + "step": 3382 + }, + { + "epoch": 0.8145428279058569, + "grad_norm": 0.4444354176521301, + "learning_rate": 1.7577445226328425e-05, + "loss": 0.3606, + "step": 3383 + }, + { + "epoch": 0.8147836032023115, + "grad_norm": 10.032902717590332, + "learning_rate": 1.7533316318316307e-05, + "loss": 0.7676, + "step": 3384 + }, + { + "epoch": 0.8150243784987661, + "grad_norm": 1.579328179359436, + "learning_rate": 1.748923755023364e-05, + "loss": 0.7432, + "step": 3385 + }, + { + "epoch": 0.8152651537952206, + "grad_norm": 2.0636134147644043, + "learning_rate": 1.7445208948880442e-05, + "loss": 0.9798, + "step": 3386 + }, + { + "epoch": 0.8155059290916752, + "grad_norm": 1.8931350708007812, + "learning_rate": 1.7401230541026226e-05, + "loss": 0.4045, + "step": 3387 + }, + { + "epoch": 0.8157467043881298, + "grad_norm": 29.280317306518555, + "learning_rate": 1.735730235340991e-05, + "loss": 0.8831, + "step": 3388 + }, + { + "epoch": 0.8159874796845844, + "grad_norm": 1.0704580545425415, + "learning_rate": 1.7313424412739987e-05, + "loss": 0.5779, + "step": 3389 + }, + { + "epoch": 0.8162282549810389, + "grad_norm": 1.8337838649749756, + "learning_rate": 1.7269596745694295e-05, + "loss": 0.6469, + "step": 3390 + }, + { + "epoch": 0.8164690302774935, + "grad_norm": 1.4514000415802002, + "learning_rate": 1.722581937892015e-05, + "loss": 0.4172, + "step": 3391 + }, + { + "epoch": 0.8167098055739481, + "grad_norm": 2.778085231781006, + "learning_rate": 1.718209233903436e-05, + "loss": 0.622, + "step": 3392 + }, + { + "epoch": 0.8169505808704027, + "grad_norm": 1.9710346460342407, + "learning_rate": 1.7138415652622995e-05, + "loss": 0.3566, + "step": 3393 + }, + { + "epoch": 0.8171913561668572, + "grad_norm": 3.470649003982544, + "learning_rate": 1.70947893462416e-05, + "loss": 0.6804, + "step": 3394 + }, + { + "epoch": 0.8174321314633118, + "grad_norm": 0.8175150752067566, + "learning_rate": 1.7051213446415104e-05, + "loss": 0.1711, + "step": 3395 + }, + { + "epoch": 0.8176729067597664, + "grad_norm": 2.38569974899292, + "learning_rate": 1.7007687979637687e-05, + "loss": 1.0114, + "step": 3396 + }, + { + "epoch": 0.8179136820562211, + "grad_norm": 1.0082086324691772, + "learning_rate": 1.6964212972372995e-05, + "loss": 0.6251, + "step": 3397 + }, + { + "epoch": 0.8181544573526757, + "grad_norm": 3.498782157897949, + "learning_rate": 1.692078845105386e-05, + "loss": 1.078, + "step": 3398 + }, + { + "epoch": 0.8183952326491302, + "grad_norm": 1.055105447769165, + "learning_rate": 1.6877414442082528e-05, + "loss": 0.45, + "step": 3399 + }, + { + "epoch": 0.8186360079455848, + "grad_norm": 3.0139036178588867, + "learning_rate": 1.6834090971830507e-05, + "loss": 0.5409, + "step": 3400 + }, + { + "epoch": 0.8188767832420394, + "grad_norm": 1.671664834022522, + "learning_rate": 1.6790818066638536e-05, + "loss": 0.5133, + "step": 3401 + }, + { + "epoch": 0.819117558538494, + "grad_norm": 3.366199254989624, + "learning_rate": 1.6747595752816658e-05, + "loss": 0.6439, + "step": 3402 + }, + { + "epoch": 0.8193583338349485, + "grad_norm": 1.2832306623458862, + "learning_rate": 1.6704424056644154e-05, + "loss": 0.7887, + "step": 3403 + }, + { + "epoch": 0.8195991091314031, + "grad_norm": 1.5629595518112183, + "learning_rate": 1.6661303004369468e-05, + "loss": 0.5992, + "step": 3404 + }, + { + "epoch": 0.8198398844278577, + "grad_norm": 1.5762869119644165, + "learning_rate": 1.661823262221035e-05, + "loss": 0.6254, + "step": 3405 + }, + { + "epoch": 0.8200806597243123, + "grad_norm": 4.563362121582031, + "learning_rate": 1.6575212936353625e-05, + "loss": 0.9719, + "step": 3406 + }, + { + "epoch": 0.8203214350207668, + "grad_norm": 0.8489302396774292, + "learning_rate": 1.6532243972955398e-05, + "loss": 0.3529, + "step": 3407 + }, + { + "epoch": 0.8205622103172214, + "grad_norm": 4.638950824737549, + "learning_rate": 1.6489325758140895e-05, + "loss": 0.3409, + "step": 3408 + }, + { + "epoch": 0.820802985613676, + "grad_norm": 1.3469526767730713, + "learning_rate": 1.6446458318004477e-05, + "loss": 0.5872, + "step": 3409 + }, + { + "epoch": 0.8210437609101306, + "grad_norm": 0.8786214590072632, + "learning_rate": 1.640364167860967e-05, + "loss": 0.3984, + "step": 3410 + }, + { + "epoch": 0.8212845362065853, + "grad_norm": 1.392174482345581, + "learning_rate": 1.6360875865989046e-05, + "loss": 0.293, + "step": 3411 + }, + { + "epoch": 0.8215253115030398, + "grad_norm": 1.987600326538086, + "learning_rate": 1.631816090614434e-05, + "loss": 0.2162, + "step": 3412 + }, + { + "epoch": 0.8217660867994944, + "grad_norm": 0.8357548713684082, + "learning_rate": 1.6275496825046367e-05, + "loss": 0.5193, + "step": 3413 + }, + { + "epoch": 0.822006862095949, + "grad_norm": 2.0921974182128906, + "learning_rate": 1.6232883648634933e-05, + "loss": 0.5682, + "step": 3414 + }, + { + "epoch": 0.8222476373924036, + "grad_norm": 2.1746203899383545, + "learning_rate": 1.6190321402818963e-05, + "loss": 0.6858, + "step": 3415 + }, + { + "epoch": 0.8224884126888581, + "grad_norm": 4.1129584312438965, + "learning_rate": 1.6147810113476413e-05, + "loss": 0.6928, + "step": 3416 + }, + { + "epoch": 0.8227291879853127, + "grad_norm": 3.485736846923828, + "learning_rate": 1.610534980645423e-05, + "loss": 0.9683, + "step": 3417 + }, + { + "epoch": 0.8229699632817673, + "grad_norm": 1.9745628833770752, + "learning_rate": 1.60629405075684e-05, + "loss": 0.5169, + "step": 3418 + }, + { + "epoch": 0.8232107385782219, + "grad_norm": 1.6666935682296753, + "learning_rate": 1.6020582242603844e-05, + "loss": 0.6159, + "step": 3419 + }, + { + "epoch": 0.8234515138746764, + "grad_norm": 4.859829425811768, + "learning_rate": 1.5978275037314482e-05, + "loss": 0.9885, + "step": 3420 + }, + { + "epoch": 0.823692289171131, + "grad_norm": 2.979112148284912, + "learning_rate": 1.5936018917423236e-05, + "loss": 0.6964, + "step": 3421 + }, + { + "epoch": 0.8239330644675856, + "grad_norm": 2.812305450439453, + "learning_rate": 1.5893813908621857e-05, + "loss": 0.9054, + "step": 3422 + }, + { + "epoch": 0.8241738397640402, + "grad_norm": 8.65831184387207, + "learning_rate": 1.5851660036571115e-05, + "loss": 1.0088, + "step": 3423 + }, + { + "epoch": 0.8244146150604948, + "grad_norm": 0.610390305519104, + "learning_rate": 1.5809557326900647e-05, + "loss": 0.4041, + "step": 3424 + }, + { + "epoch": 0.8246553903569493, + "grad_norm": 2.6373860836029053, + "learning_rate": 1.5767505805209027e-05, + "loss": 0.7424, + "step": 3425 + }, + { + "epoch": 0.824896165653404, + "grad_norm": 2.2015843391418457, + "learning_rate": 1.5725505497063664e-05, + "loss": 0.9575, + "step": 3426 + }, + { + "epoch": 0.8251369409498586, + "grad_norm": 2.0284852981567383, + "learning_rate": 1.568355642800081e-05, + "loss": 0.6706, + "step": 3427 + }, + { + "epoch": 0.8253777162463132, + "grad_norm": 3.0381412506103516, + "learning_rate": 1.5641658623525623e-05, + "loss": 0.3887, + "step": 3428 + }, + { + "epoch": 0.8256184915427677, + "grad_norm": 3.0191638469696045, + "learning_rate": 1.5599812109112076e-05, + "loss": 0.8594, + "step": 3429 + }, + { + "epoch": 0.8258592668392223, + "grad_norm": 2.608114719390869, + "learning_rate": 1.55580169102029e-05, + "loss": 0.2382, + "step": 3430 + }, + { + "epoch": 0.8261000421356769, + "grad_norm": 2.528092384338379, + "learning_rate": 1.5516273052209683e-05, + "loss": 0.6125, + "step": 3431 + }, + { + "epoch": 0.8263408174321315, + "grad_norm": 1.7258909940719604, + "learning_rate": 1.547458056051281e-05, + "loss": 0.2416, + "step": 3432 + }, + { + "epoch": 0.826581592728586, + "grad_norm": 1.4426395893096924, + "learning_rate": 1.5432939460461384e-05, + "loss": 0.5668, + "step": 3433 + }, + { + "epoch": 0.8268223680250406, + "grad_norm": 1.0259637832641602, + "learning_rate": 1.539134977737332e-05, + "loss": 0.2167, + "step": 3434 + }, + { + "epoch": 0.8270631433214952, + "grad_norm": 1.5782815217971802, + "learning_rate": 1.5349811536535196e-05, + "loss": 0.9795, + "step": 3435 + }, + { + "epoch": 0.8273039186179498, + "grad_norm": 1.5504636764526367, + "learning_rate": 1.5308324763202397e-05, + "loss": 0.6967, + "step": 3436 + }, + { + "epoch": 0.8275446939144043, + "grad_norm": 0.8018413186073303, + "learning_rate": 1.5266889482598934e-05, + "loss": 0.1634, + "step": 3437 + }, + { + "epoch": 0.8277854692108589, + "grad_norm": 2.674348831176758, + "learning_rate": 1.5225505719917577e-05, + "loss": 0.8628, + "step": 3438 + }, + { + "epoch": 0.8280262445073135, + "grad_norm": 1.5734491348266602, + "learning_rate": 1.5184173500319731e-05, + "loss": 0.6495, + "step": 3439 + }, + { + "epoch": 0.8282670198037682, + "grad_norm": 3.662338972091675, + "learning_rate": 1.5142892848935497e-05, + "loss": 0.9257, + "step": 3440 + }, + { + "epoch": 0.8285077951002228, + "grad_norm": 4.754350185394287, + "learning_rate": 1.5101663790863596e-05, + "loss": 0.2461, + "step": 3441 + }, + { + "epoch": 0.8287485703966773, + "grad_norm": 3.429853677749634, + "learning_rate": 1.5060486351171411e-05, + "loss": 1.0115, + "step": 3442 + }, + { + "epoch": 0.8289893456931319, + "grad_norm": 2.6595869064331055, + "learning_rate": 1.5019360554894868e-05, + "loss": 0.4968, + "step": 3443 + }, + { + "epoch": 0.8292301209895865, + "grad_norm": 1.3187874555587769, + "learning_rate": 1.4978286427038601e-05, + "loss": 0.6117, + "step": 3444 + }, + { + "epoch": 0.8294708962860411, + "grad_norm": 1.491859793663025, + "learning_rate": 1.4937263992575712e-05, + "loss": 0.4465, + "step": 3445 + }, + { + "epoch": 0.8297116715824956, + "grad_norm": 2.291027784347534, + "learning_rate": 1.489629327644797e-05, + "loss": 0.8872, + "step": 3446 + }, + { + "epoch": 0.8299524468789502, + "grad_norm": 1.3068852424621582, + "learning_rate": 1.4855374303565662e-05, + "loss": 0.358, + "step": 3447 + }, + { + "epoch": 0.8301932221754048, + "grad_norm": 1.2296390533447266, + "learning_rate": 1.4814507098807595e-05, + "loss": 0.5845, + "step": 3448 + }, + { + "epoch": 0.8304339974718594, + "grad_norm": 2.593040943145752, + "learning_rate": 1.4773691687021174e-05, + "loss": 0.6584, + "step": 3449 + }, + { + "epoch": 0.8306747727683139, + "grad_norm": 1.2656725645065308, + "learning_rate": 1.473292809302219e-05, + "loss": 0.6618, + "step": 3450 + }, + { + "epoch": 0.8309155480647685, + "grad_norm": 1.9369158744812012, + "learning_rate": 1.4692216341595044e-05, + "loss": 0.4147, + "step": 3451 + }, + { + "epoch": 0.8311563233612231, + "grad_norm": 1.4402110576629639, + "learning_rate": 1.4651556457492588e-05, + "loss": 0.2083, + "step": 3452 + }, + { + "epoch": 0.8313970986576777, + "grad_norm": 0.478405237197876, + "learning_rate": 1.4610948465436069e-05, + "loss": 0.2323, + "step": 3453 + }, + { + "epoch": 0.8316378739541324, + "grad_norm": 2.096238613128662, + "learning_rate": 1.4570392390115261e-05, + "loss": 0.2422, + "step": 3454 + }, + { + "epoch": 0.8318786492505869, + "grad_norm": 0.71112060546875, + "learning_rate": 1.4529888256188363e-05, + "loss": 0.2833, + "step": 3455 + }, + { + "epoch": 0.8321194245470415, + "grad_norm": 4.690402984619141, + "learning_rate": 1.448943608828197e-05, + "loss": 1.0562, + "step": 3456 + }, + { + "epoch": 0.8323601998434961, + "grad_norm": 1.7524763345718384, + "learning_rate": 1.4449035910991115e-05, + "loss": 0.087, + "step": 3457 + }, + { + "epoch": 0.8326009751399507, + "grad_norm": 1.4712945222854614, + "learning_rate": 1.4408687748879156e-05, + "loss": 0.4935, + "step": 3458 + }, + { + "epoch": 0.8328417504364052, + "grad_norm": 2.9652466773986816, + "learning_rate": 1.4368391626477884e-05, + "loss": 0.7135, + "step": 3459 + }, + { + "epoch": 0.8330825257328598, + "grad_norm": 1.4162325859069824, + "learning_rate": 1.4328147568287453e-05, + "loss": 0.4518, + "step": 3460 + }, + { + "epoch": 0.8333233010293144, + "grad_norm": 0.665979266166687, + "learning_rate": 1.4287955598776304e-05, + "loss": 0.3329, + "step": 3461 + }, + { + "epoch": 0.833564076325769, + "grad_norm": 2.3450982570648193, + "learning_rate": 1.4247815742381277e-05, + "loss": 0.4297, + "step": 3462 + }, + { + "epoch": 0.8338048516222235, + "grad_norm": 2.609652519226074, + "learning_rate": 1.4207728023507471e-05, + "loss": 0.595, + "step": 3463 + }, + { + "epoch": 0.8340456269186781, + "grad_norm": 1.1684465408325195, + "learning_rate": 1.4167692466528281e-05, + "loss": 0.2792, + "step": 3464 + }, + { + "epoch": 0.8342864022151327, + "grad_norm": 1.9600780010223389, + "learning_rate": 1.4127709095785513e-05, + "loss": 0.1156, + "step": 3465 + }, + { + "epoch": 0.8345271775115873, + "grad_norm": 1.2096495628356934, + "learning_rate": 1.4087777935589052e-05, + "loss": 0.6391, + "step": 3466 + }, + { + "epoch": 0.834767952808042, + "grad_norm": 2.56876540184021, + "learning_rate": 1.404789901021717e-05, + "loss": 0.5451, + "step": 3467 + }, + { + "epoch": 0.8350087281044964, + "grad_norm": 2.7339911460876465, + "learning_rate": 1.4008072343916379e-05, + "loss": 0.7596, + "step": 3468 + }, + { + "epoch": 0.835249503400951, + "grad_norm": 1.9011280536651611, + "learning_rate": 1.396829796090131e-05, + "loss": 0.9727, + "step": 3469 + }, + { + "epoch": 0.8354902786974057, + "grad_norm": 1.2117639780044556, + "learning_rate": 1.3928575885354933e-05, + "loss": 0.2906, + "step": 3470 + }, + { + "epoch": 0.8357310539938603, + "grad_norm": 1.8486530780792236, + "learning_rate": 1.3888906141428325e-05, + "loss": 0.395, + "step": 3471 + }, + { + "epoch": 0.8359718292903148, + "grad_norm": 4.039324760437012, + "learning_rate": 1.3849288753240786e-05, + "loss": 0.4848, + "step": 3472 + }, + { + "epoch": 0.8362126045867694, + "grad_norm": 1.2622008323669434, + "learning_rate": 1.3809723744879788e-05, + "loss": 0.6475, + "step": 3473 + }, + { + "epoch": 0.836453379883224, + "grad_norm": 0.6000483632087708, + "learning_rate": 1.3770211140400946e-05, + "loss": 0.1693, + "step": 3474 + }, + { + "epoch": 0.8366941551796786, + "grad_norm": 2.0849924087524414, + "learning_rate": 1.3730750963828032e-05, + "loss": 0.1984, + "step": 3475 + }, + { + "epoch": 0.8369349304761331, + "grad_norm": 2.24556040763855, + "learning_rate": 1.3691343239152864e-05, + "loss": 0.4181, + "step": 3476 + }, + { + "epoch": 0.8371757057725877, + "grad_norm": 1.9239386320114136, + "learning_rate": 1.3651987990335469e-05, + "loss": 0.4543, + "step": 3477 + }, + { + "epoch": 0.8374164810690423, + "grad_norm": 1.7591582536697388, + "learning_rate": 1.3612685241303947e-05, + "loss": 0.5755, + "step": 3478 + }, + { + "epoch": 0.8376572563654969, + "grad_norm": 0.7575153112411499, + "learning_rate": 1.3573435015954406e-05, + "loss": 0.0756, + "step": 3479 + }, + { + "epoch": 0.8378980316619515, + "grad_norm": 1.6970840692520142, + "learning_rate": 1.3534237338151102e-05, + "loss": 0.2317, + "step": 3480 + }, + { + "epoch": 0.838138806958406, + "grad_norm": 1.827329158782959, + "learning_rate": 1.3495092231726304e-05, + "loss": 0.7636, + "step": 3481 + }, + { + "epoch": 0.8383795822548606, + "grad_norm": 2.604074239730835, + "learning_rate": 1.3455999720480316e-05, + "loss": 0.6374, + "step": 3482 + }, + { + "epoch": 0.8386203575513153, + "grad_norm": 2.306910514831543, + "learning_rate": 1.341695982818152e-05, + "loss": 0.7725, + "step": 3483 + }, + { + "epoch": 0.8388611328477699, + "grad_norm": 1.2861391305923462, + "learning_rate": 1.337797257856619e-05, + "loss": 0.6311, + "step": 3484 + }, + { + "epoch": 0.8391019081442244, + "grad_norm": 2.032479763031006, + "learning_rate": 1.33390379953387e-05, + "loss": 0.4538, + "step": 3485 + }, + { + "epoch": 0.839342683440679, + "grad_norm": 0.6007648706436157, + "learning_rate": 1.3300156102171379e-05, + "loss": 0.1634, + "step": 3486 + }, + { + "epoch": 0.8395834587371336, + "grad_norm": 2.5209765434265137, + "learning_rate": 1.3261326922704464e-05, + "loss": 0.1593, + "step": 3487 + }, + { + "epoch": 0.8398242340335882, + "grad_norm": 0.9033012390136719, + "learning_rate": 1.3222550480546203e-05, + "loss": 0.3819, + "step": 3488 + }, + { + "epoch": 0.8400650093300427, + "grad_norm": 7.269277572631836, + "learning_rate": 1.3183826799272758e-05, + "loss": 0.5313, + "step": 3489 + }, + { + "epoch": 0.8403057846264973, + "grad_norm": 2.5734009742736816, + "learning_rate": 1.3145155902428219e-05, + "loss": 0.6007, + "step": 3490 + }, + { + "epoch": 0.8405465599229519, + "grad_norm": 2.136906147003174, + "learning_rate": 1.310653781352461e-05, + "loss": 0.7098, + "step": 3491 + }, + { + "epoch": 0.8407873352194065, + "grad_norm": 1.183111310005188, + "learning_rate": 1.3067972556041752e-05, + "loss": 0.5929, + "step": 3492 + }, + { + "epoch": 0.8410281105158611, + "grad_norm": 2.32429575920105, + "learning_rate": 1.3029460153427442e-05, + "loss": 0.6398, + "step": 3493 + }, + { + "epoch": 0.8412688858123156, + "grad_norm": 2.2042758464813232, + "learning_rate": 1.2991000629097328e-05, + "loss": 0.4816, + "step": 3494 + }, + { + "epoch": 0.8415096611087702, + "grad_norm": 2.461498498916626, + "learning_rate": 1.2952594006434849e-05, + "loss": 0.74, + "step": 3495 + }, + { + "epoch": 0.8417504364052248, + "grad_norm": 1.1167387962341309, + "learning_rate": 1.2914240308791326e-05, + "loss": 1.0576, + "step": 3496 + }, + { + "epoch": 0.8419912117016795, + "grad_norm": 1.2157386541366577, + "learning_rate": 1.2875939559485905e-05, + "loss": 0.5803, + "step": 3497 + }, + { + "epoch": 0.842231986998134, + "grad_norm": 1.176676869392395, + "learning_rate": 1.2837691781805516e-05, + "loss": 0.7533, + "step": 3498 + }, + { + "epoch": 0.8424727622945886, + "grad_norm": 1.823185920715332, + "learning_rate": 1.2799496999004935e-05, + "loss": 0.3445, + "step": 3499 + }, + { + "epoch": 0.8427135375910432, + "grad_norm": 2.291487216949463, + "learning_rate": 1.2761355234306626e-05, + "loss": 0.6968, + "step": 3500 + }, + { + "epoch": 0.8429543128874978, + "grad_norm": 3.8804900646209717, + "learning_rate": 1.27232665109009e-05, + "loss": 0.626, + "step": 3501 + }, + { + "epoch": 0.8431950881839523, + "grad_norm": 3.4894497394561768, + "learning_rate": 1.268523085194575e-05, + "loss": 0.6506, + "step": 3502 + }, + { + "epoch": 0.8434358634804069, + "grad_norm": 4.037177085876465, + "learning_rate": 1.264724828056696e-05, + "loss": 0.5966, + "step": 3503 + }, + { + "epoch": 0.8436766387768615, + "grad_norm": 2.5394115447998047, + "learning_rate": 1.2609318819858029e-05, + "loss": 0.8267, + "step": 3504 + }, + { + "epoch": 0.8439174140733161, + "grad_norm": 4.015261173248291, + "learning_rate": 1.2571442492880159e-05, + "loss": 0.9835, + "step": 3505 + }, + { + "epoch": 0.8441581893697706, + "grad_norm": 3.58262300491333, + "learning_rate": 1.2533619322662216e-05, + "loss": 0.4845, + "step": 3506 + }, + { + "epoch": 0.8443989646662252, + "grad_norm": 1.208625316619873, + "learning_rate": 1.2495849332200815e-05, + "loss": 0.4519, + "step": 3507 + }, + { + "epoch": 0.8446397399626798, + "grad_norm": 6.022768497467041, + "learning_rate": 1.2458132544460155e-05, + "loss": 0.4975, + "step": 3508 + }, + { + "epoch": 0.8448805152591344, + "grad_norm": 0.27254337072372437, + "learning_rate": 1.2420468982372158e-05, + "loss": 0.4787, + "step": 3509 + }, + { + "epoch": 0.845121290555589, + "grad_norm": 3.7382776737213135, + "learning_rate": 1.2382858668836317e-05, + "loss": 0.7591, + "step": 3510 + }, + { + "epoch": 0.8453620658520435, + "grad_norm": 1.5053765773773193, + "learning_rate": 1.2345301626719808e-05, + "loss": 0.4858, + "step": 3511 + }, + { + "epoch": 0.8456028411484982, + "grad_norm": 2.0983424186706543, + "learning_rate": 1.2307797878857396e-05, + "loss": 0.4047, + "step": 3512 + }, + { + "epoch": 0.8458436164449528, + "grad_norm": 1.0963115692138672, + "learning_rate": 1.2270347448051456e-05, + "loss": 0.6322, + "step": 3513 + }, + { + "epoch": 0.8460843917414074, + "grad_norm": 2.32830548286438, + "learning_rate": 1.2232950357071937e-05, + "loss": 0.339, + "step": 3514 + }, + { + "epoch": 0.8463251670378619, + "grad_norm": 2.1017560958862305, + "learning_rate": 1.219560662865633e-05, + "loss": 0.6883, + "step": 3515 + }, + { + "epoch": 0.8465659423343165, + "grad_norm": 2.3535349369049072, + "learning_rate": 1.2158316285509708e-05, + "loss": 0.3383, + "step": 3516 + }, + { + "epoch": 0.8468067176307711, + "grad_norm": 0.24175573885440826, + "learning_rate": 1.2121079350304732e-05, + "loss": 0.3707, + "step": 3517 + }, + { + "epoch": 0.8470474929272257, + "grad_norm": 3.2046167850494385, + "learning_rate": 1.208389584568147e-05, + "loss": 0.7788, + "step": 3518 + }, + { + "epoch": 0.8472882682236802, + "grad_norm": 0.8707532286643982, + "learning_rate": 1.2046765794247604e-05, + "loss": 0.5653, + "step": 3519 + }, + { + "epoch": 0.8475290435201348, + "grad_norm": 5.456550598144531, + "learning_rate": 1.2009689218578313e-05, + "loss": 0.5814, + "step": 3520 + }, + { + "epoch": 0.8477698188165894, + "grad_norm": 3.579458236694336, + "learning_rate": 1.1972666141216215e-05, + "loss": 0.3246, + "step": 3521 + }, + { + "epoch": 0.848010594113044, + "grad_norm": 1.3261420726776123, + "learning_rate": 1.1935696584671452e-05, + "loss": 0.4498, + "step": 3522 + }, + { + "epoch": 0.8482513694094986, + "grad_norm": 1.701804280281067, + "learning_rate": 1.1898780571421552e-05, + "loss": 0.5852, + "step": 3523 + }, + { + "epoch": 0.8484921447059531, + "grad_norm": 3.848027229309082, + "learning_rate": 1.1861918123911564e-05, + "loss": 0.8602, + "step": 3524 + }, + { + "epoch": 0.8487329200024077, + "grad_norm": 2.2513511180877686, + "learning_rate": 1.1825109264553947e-05, + "loss": 1.1043, + "step": 3525 + }, + { + "epoch": 0.8489736952988624, + "grad_norm": 3.580653429031372, + "learning_rate": 1.1788354015728543e-05, + "loss": 0.3012, + "step": 3526 + }, + { + "epoch": 0.849214470595317, + "grad_norm": 2.1842987537384033, + "learning_rate": 1.1751652399782665e-05, + "loss": 0.5502, + "step": 3527 + }, + { + "epoch": 0.8494552458917715, + "grad_norm": 1.2588567733764648, + "learning_rate": 1.1715004439030908e-05, + "loss": 0.5083, + "step": 3528 + }, + { + "epoch": 0.8496960211882261, + "grad_norm": 2.6544106006622314, + "learning_rate": 1.1678410155755382e-05, + "loss": 0.342, + "step": 3529 + }, + { + "epoch": 0.8499367964846807, + "grad_norm": 0.7514671087265015, + "learning_rate": 1.1641869572205489e-05, + "loss": 0.323, + "step": 3530 + }, + { + "epoch": 0.8501775717811353, + "grad_norm": 2.6009557247161865, + "learning_rate": 1.1605382710597957e-05, + "loss": 0.4687, + "step": 3531 + }, + { + "epoch": 0.8504183470775898, + "grad_norm": 1.5390700101852417, + "learning_rate": 1.1568949593116884e-05, + "loss": 0.4879, + "step": 3532 + }, + { + "epoch": 0.8506591223740444, + "grad_norm": 4.1019110679626465, + "learning_rate": 1.1532570241913721e-05, + "loss": 0.9231, + "step": 3533 + }, + { + "epoch": 0.850899897670499, + "grad_norm": 1.2922954559326172, + "learning_rate": 1.1496244679107148e-05, + "loss": 0.546, + "step": 3534 + }, + { + "epoch": 0.8511406729669536, + "grad_norm": 1.7373534440994263, + "learning_rate": 1.1459972926783236e-05, + "loss": 0.2165, + "step": 3535 + }, + { + "epoch": 0.8513814482634082, + "grad_norm": 1.6212053298950195, + "learning_rate": 1.1423755006995241e-05, + "loss": 0.8171, + "step": 3536 + }, + { + "epoch": 0.8516222235598627, + "grad_norm": 6.341080665588379, + "learning_rate": 1.1387590941763749e-05, + "loss": 0.8483, + "step": 3537 + }, + { + "epoch": 0.8518629988563173, + "grad_norm": 0.9660471081733704, + "learning_rate": 1.135148075307666e-05, + "loss": 0.284, + "step": 3538 + }, + { + "epoch": 0.8521037741527719, + "grad_norm": 1.2168993949890137, + "learning_rate": 1.1315424462888968e-05, + "loss": 0.3476, + "step": 3539 + }, + { + "epoch": 0.8523445494492266, + "grad_norm": 2.224290609359741, + "learning_rate": 1.1279422093123037e-05, + "loss": 0.4849, + "step": 3540 + }, + { + "epoch": 0.852585324745681, + "grad_norm": 4.049657821655273, + "learning_rate": 1.1243473665668336e-05, + "loss": 0.4054, + "step": 3541 + }, + { + "epoch": 0.8528261000421357, + "grad_norm": 1.0879812240600586, + "learning_rate": 1.1207579202381625e-05, + "loss": 0.2502, + "step": 3542 + }, + { + "epoch": 0.8530668753385903, + "grad_norm": 1.2523934841156006, + "learning_rate": 1.1171738725086833e-05, + "loss": 0.3098, + "step": 3543 + }, + { + "epoch": 0.8533076506350449, + "grad_norm": 0.9704805612564087, + "learning_rate": 1.1135952255574999e-05, + "loss": 0.5721, + "step": 3544 + }, + { + "epoch": 0.8535484259314994, + "grad_norm": 3.1758830547332764, + "learning_rate": 1.1100219815604418e-05, + "loss": 0.4685, + "step": 3545 + }, + { + "epoch": 0.853789201227954, + "grad_norm": 0.9316069483757019, + "learning_rate": 1.1064541426900476e-05, + "loss": 0.575, + "step": 3546 + }, + { + "epoch": 0.8540299765244086, + "grad_norm": 5.6592912673950195, + "learning_rate": 1.1028917111155712e-05, + "loss": 1.1386, + "step": 3547 + }, + { + "epoch": 0.8542707518208632, + "grad_norm": 2.0701656341552734, + "learning_rate": 1.0993346890029832e-05, + "loss": 1.1186, + "step": 3548 + }, + { + "epoch": 0.8545115271173178, + "grad_norm": 2.067758560180664, + "learning_rate": 1.0957830785149548e-05, + "loss": 0.7525, + "step": 3549 + }, + { + "epoch": 0.8547523024137723, + "grad_norm": 1.7323333024978638, + "learning_rate": 1.0922368818108774e-05, + "loss": 0.4399, + "step": 3550 + }, + { + "epoch": 0.8549930777102269, + "grad_norm": 1.6739343404769897, + "learning_rate": 1.0886961010468466e-05, + "loss": 0.4477, + "step": 3551 + }, + { + "epoch": 0.8552338530066815, + "grad_norm": 1.9262574911117554, + "learning_rate": 1.0851607383756612e-05, + "loss": 0.733, + "step": 3552 + }, + { + "epoch": 0.8554746283031361, + "grad_norm": 1.7514442205429077, + "learning_rate": 1.081630795946833e-05, + "loss": 0.7005, + "step": 3553 + }, + { + "epoch": 0.8557154035995906, + "grad_norm": 1.4854047298431396, + "learning_rate": 1.078106275906573e-05, + "loss": 0.926, + "step": 3554 + }, + { + "epoch": 0.8559561788960452, + "grad_norm": 1.791135549545288, + "learning_rate": 1.0745871803978002e-05, + "loss": 0.4316, + "step": 3555 + }, + { + "epoch": 0.8561969541924999, + "grad_norm": 0.6483386754989624, + "learning_rate": 1.0710735115601311e-05, + "loss": 0.4356, + "step": 3556 + }, + { + "epoch": 0.8564377294889545, + "grad_norm": 6.167766094207764, + "learning_rate": 1.0675652715298835e-05, + "loss": 0.8807, + "step": 3557 + }, + { + "epoch": 0.856678504785409, + "grad_norm": 3.2649612426757812, + "learning_rate": 1.0640624624400752e-05, + "loss": 0.724, + "step": 3558 + }, + { + "epoch": 0.8569192800818636, + "grad_norm": 1.8932918310165405, + "learning_rate": 1.0605650864204252e-05, + "loss": 0.6025, + "step": 3559 + }, + { + "epoch": 0.8571600553783182, + "grad_norm": 1.1560626029968262, + "learning_rate": 1.0570731455973414e-05, + "loss": 0.4024, + "step": 3560 + }, + { + "epoch": 0.8574008306747728, + "grad_norm": 0.9457545280456543, + "learning_rate": 1.0535866420939332e-05, + "loss": 0.2836, + "step": 3561 + }, + { + "epoch": 0.8576416059712274, + "grad_norm": 1.101394772529602, + "learning_rate": 1.050105578030003e-05, + "loss": 0.4336, + "step": 3562 + }, + { + "epoch": 0.8578823812676819, + "grad_norm": 2.297769784927368, + "learning_rate": 1.046629955522046e-05, + "loss": 1.2278, + "step": 3563 + }, + { + "epoch": 0.8581231565641365, + "grad_norm": 6.07118034362793, + "learning_rate": 1.0431597766832502e-05, + "loss": 0.6956, + "step": 3564 + }, + { + "epoch": 0.8583639318605911, + "grad_norm": 0.6239258646965027, + "learning_rate": 1.0396950436234887e-05, + "loss": 0.051, + "step": 3565 + }, + { + "epoch": 0.8586047071570457, + "grad_norm": 3.072779417037964, + "learning_rate": 1.0362357584493298e-05, + "loss": 0.9781, + "step": 3566 + }, + { + "epoch": 0.8588454824535002, + "grad_norm": 1.9962631464004517, + "learning_rate": 1.0327819232640235e-05, + "loss": 0.5025, + "step": 3567 + }, + { + "epoch": 0.8590862577499548, + "grad_norm": 1.4594467878341675, + "learning_rate": 1.029333540167512e-05, + "loss": 0.4257, + "step": 3568 + }, + { + "epoch": 0.8593270330464094, + "grad_norm": 3.9603610038757324, + "learning_rate": 1.0258906112564181e-05, + "loss": 0.3696, + "step": 3569 + }, + { + "epoch": 0.8595678083428641, + "grad_norm": 3.076791286468506, + "learning_rate": 1.0224531386240522e-05, + "loss": 0.7587, + "step": 3570 + }, + { + "epoch": 0.8598085836393186, + "grad_norm": 3.732264995574951, + "learning_rate": 1.0190211243604043e-05, + "loss": 0.5357, + "step": 3571 + }, + { + "epoch": 0.8600493589357732, + "grad_norm": 1.9595264196395874, + "learning_rate": 1.0155945705521486e-05, + "loss": 0.4164, + "step": 3572 + }, + { + "epoch": 0.8602901342322278, + "grad_norm": 1.8622673749923706, + "learning_rate": 1.0121734792826353e-05, + "loss": 0.5882, + "step": 3573 + }, + { + "epoch": 0.8605309095286824, + "grad_norm": 2.3604211807250977, + "learning_rate": 1.0087578526318975e-05, + "loss": 0.7776, + "step": 3574 + }, + { + "epoch": 0.860771684825137, + "grad_norm": 2.476921796798706, + "learning_rate": 1.0053476926766414e-05, + "loss": 0.5129, + "step": 3575 + }, + { + "epoch": 0.8610124601215915, + "grad_norm": 0.9629519581794739, + "learning_rate": 1.0019430014902531e-05, + "loss": 0.6324, + "step": 3576 + }, + { + "epoch": 0.8612532354180461, + "grad_norm": 1.2336691617965698, + "learning_rate": 9.985437811427933e-06, + "loss": 0.236, + "step": 3577 + }, + { + "epoch": 0.8614940107145007, + "grad_norm": 1.3842549324035645, + "learning_rate": 9.951500337009945e-06, + "loss": 0.8148, + "step": 3578 + }, + { + "epoch": 0.8617347860109553, + "grad_norm": 1.1485180854797363, + "learning_rate": 9.917617612282648e-06, + "loss": 0.4162, + "step": 3579 + }, + { + "epoch": 0.8619755613074098, + "grad_norm": 1.5087698698043823, + "learning_rate": 9.883789657846799e-06, + "loss": 1.0833, + "step": 3580 + }, + { + "epoch": 0.8622163366038644, + "grad_norm": 0.747292697429657, + "learning_rate": 9.850016494269853e-06, + "loss": 0.4187, + "step": 3581 + }, + { + "epoch": 0.862457111900319, + "grad_norm": 0.6730207204818726, + "learning_rate": 9.816298142086022e-06, + "loss": 0.5837, + "step": 3582 + }, + { + "epoch": 0.8626978871967736, + "grad_norm": 3.047215461730957, + "learning_rate": 9.782634621796083e-06, + "loss": 0.3071, + "step": 3583 + }, + { + "epoch": 0.8629386624932281, + "grad_norm": 0.9594640135765076, + "learning_rate": 9.749025953867552e-06, + "loss": 0.2057, + "step": 3584 + }, + { + "epoch": 0.8631794377896828, + "grad_norm": 1.894709587097168, + "learning_rate": 9.715472158734585e-06, + "loss": 0.4201, + "step": 3585 + }, + { + "epoch": 0.8634202130861374, + "grad_norm": 1.63250732421875, + "learning_rate": 9.68197325679795e-06, + "loss": 0.7254, + "step": 3586 + }, + { + "epoch": 0.863660988382592, + "grad_norm": 8.452190399169922, + "learning_rate": 9.648529268425088e-06, + "loss": 0.2811, + "step": 3587 + }, + { + "epoch": 0.8639017636790465, + "grad_norm": 0.6347200870513916, + "learning_rate": 9.61514021394998e-06, + "loss": 0.355, + "step": 3588 + }, + { + "epoch": 0.8641425389755011, + "grad_norm": 1.1674455404281616, + "learning_rate": 9.581806113673253e-06, + "loss": 0.5386, + "step": 3589 + }, + { + "epoch": 0.8643833142719557, + "grad_norm": 1.5471045970916748, + "learning_rate": 9.548526987862149e-06, + "loss": 0.6897, + "step": 3590 + }, + { + "epoch": 0.8646240895684103, + "grad_norm": 1.1853959560394287, + "learning_rate": 9.515302856750408e-06, + "loss": 0.7887, + "step": 3591 + }, + { + "epoch": 0.8648648648648649, + "grad_norm": 0.8306871056556702, + "learning_rate": 9.48213374053839e-06, + "loss": 0.4411, + "step": 3592 + }, + { + "epoch": 0.8651056401613194, + "grad_norm": 1.9106206893920898, + "learning_rate": 9.449019659392999e-06, + "loss": 0.4598, + "step": 3593 + }, + { + "epoch": 0.865346415457774, + "grad_norm": 2.1204633712768555, + "learning_rate": 9.415960633447674e-06, + "loss": 0.6254, + "step": 3594 + }, + { + "epoch": 0.8655871907542286, + "grad_norm": 1.09833562374115, + "learning_rate": 9.382956682802379e-06, + "loss": 0.4423, + "step": 3595 + }, + { + "epoch": 0.8658279660506832, + "grad_norm": 1.1681790351867676, + "learning_rate": 9.350007827523577e-06, + "loss": 0.9621, + "step": 3596 + }, + { + "epoch": 0.8660687413471377, + "grad_norm": 2.5067391395568848, + "learning_rate": 9.317114087644252e-06, + "loss": 0.5401, + "step": 3597 + }, + { + "epoch": 0.8663095166435923, + "grad_norm": 2.4286880493164062, + "learning_rate": 9.284275483163885e-06, + "loss": 0.4498, + "step": 3598 + }, + { + "epoch": 0.866550291940047, + "grad_norm": 3.0217394828796387, + "learning_rate": 9.251492034048393e-06, + "loss": 0.769, + "step": 3599 + }, + { + "epoch": 0.8667910672365016, + "grad_norm": 0.8035622239112854, + "learning_rate": 9.21876376023022e-06, + "loss": 0.1272, + "step": 3600 + }, + { + "epoch": 0.8670318425329561, + "grad_norm": 0.6220622658729553, + "learning_rate": 9.186090681608173e-06, + "loss": 0.2411, + "step": 3601 + }, + { + "epoch": 0.8672726178294107, + "grad_norm": 2.2393176555633545, + "learning_rate": 9.153472818047625e-06, + "loss": 0.3123, + "step": 3602 + }, + { + "epoch": 0.8675133931258653, + "grad_norm": 5.026528835296631, + "learning_rate": 9.120910189380294e-06, + "loss": 1.1053, + "step": 3603 + }, + { + "epoch": 0.8677541684223199, + "grad_norm": 4.139969825744629, + "learning_rate": 9.088402815404306e-06, + "loss": 0.6011, + "step": 3604 + }, + { + "epoch": 0.8679949437187745, + "grad_norm": 2.963592290878296, + "learning_rate": 9.055950715884254e-06, + "loss": 0.629, + "step": 3605 + }, + { + "epoch": 0.868235719015229, + "grad_norm": 4.244201183319092, + "learning_rate": 9.023553910551041e-06, + "loss": 0.2936, + "step": 3606 + }, + { + "epoch": 0.8684764943116836, + "grad_norm": 1.2388718128204346, + "learning_rate": 8.991212419102025e-06, + "loss": 0.6559, + "step": 3607 + }, + { + "epoch": 0.8687172696081382, + "grad_norm": 1.3638206720352173, + "learning_rate": 8.958926261200928e-06, + "loss": 0.6455, + "step": 3608 + }, + { + "epoch": 0.8689580449045928, + "grad_norm": 1.8364553451538086, + "learning_rate": 8.926695456477751e-06, + "loss": 0.325, + "step": 3609 + }, + { + "epoch": 0.8691988202010473, + "grad_norm": 2.5603854656219482, + "learning_rate": 8.894520024528918e-06, + "loss": 0.4407, + "step": 3610 + }, + { + "epoch": 0.8694395954975019, + "grad_norm": 1.9255558252334595, + "learning_rate": 8.862399984917213e-06, + "loss": 0.8542, + "step": 3611 + }, + { + "epoch": 0.8696803707939565, + "grad_norm": 1.358974575996399, + "learning_rate": 8.830335357171627e-06, + "loss": 0.9968, + "step": 3612 + }, + { + "epoch": 0.8699211460904112, + "grad_norm": 1.3834187984466553, + "learning_rate": 8.798326160787573e-06, + "loss": 0.9395, + "step": 3613 + }, + { + "epoch": 0.8701619213868657, + "grad_norm": 2.3378701210021973, + "learning_rate": 8.766372415226675e-06, + "loss": 0.2571, + "step": 3614 + }, + { + "epoch": 0.8704026966833203, + "grad_norm": 1.1119276285171509, + "learning_rate": 8.734474139916903e-06, + "loss": 0.4873, + "step": 3615 + }, + { + "epoch": 0.8706434719797749, + "grad_norm": 0.519648551940918, + "learning_rate": 8.702631354252489e-06, + "loss": 0.4369, + "step": 3616 + }, + { + "epoch": 0.8708842472762295, + "grad_norm": 6.27766227722168, + "learning_rate": 8.670844077593899e-06, + "loss": 0.6788, + "step": 3617 + }, + { + "epoch": 0.8711250225726841, + "grad_norm": 1.280344009399414, + "learning_rate": 8.639112329267862e-06, + "loss": 0.6255, + "step": 3618 + }, + { + "epoch": 0.8713657978691386, + "grad_norm": 4.4702067375183105, + "learning_rate": 8.60743612856738e-06, + "loss": 0.4173, + "step": 3619 + }, + { + "epoch": 0.8716065731655932, + "grad_norm": 0.5965597033500671, + "learning_rate": 8.575815494751637e-06, + "loss": 0.1958, + "step": 3620 + }, + { + "epoch": 0.8718473484620478, + "grad_norm": 1.7509022951126099, + "learning_rate": 8.544250447046075e-06, + "loss": 0.1665, + "step": 3621 + }, + { + "epoch": 0.8720881237585024, + "grad_norm": 1.8660122156143188, + "learning_rate": 8.512741004642277e-06, + "loss": 0.3934, + "step": 3622 + }, + { + "epoch": 0.8723288990549569, + "grad_norm": 0.804557204246521, + "learning_rate": 8.481287186698061e-06, + "loss": 0.383, + "step": 3623 + }, + { + "epoch": 0.8725696743514115, + "grad_norm": 1.3217666149139404, + "learning_rate": 8.449889012337453e-06, + "loss": 0.1176, + "step": 3624 + }, + { + "epoch": 0.8728104496478661, + "grad_norm": 3.857081651687622, + "learning_rate": 8.418546500650582e-06, + "loss": 0.5942, + "step": 3625 + }, + { + "epoch": 0.8730512249443207, + "grad_norm": 1.6486952304840088, + "learning_rate": 8.387259670693759e-06, + "loss": 0.525, + "step": 3626 + }, + { + "epoch": 0.8732920002407752, + "grad_norm": 0.7626795768737793, + "learning_rate": 8.356028541489468e-06, + "loss": 0.2785, + "step": 3627 + }, + { + "epoch": 0.8735327755372299, + "grad_norm": 2.2588295936584473, + "learning_rate": 8.3248531320263e-06, + "loss": 0.3082, + "step": 3628 + }, + { + "epoch": 0.8737735508336845, + "grad_norm": 6.79339075088501, + "learning_rate": 8.293733461259002e-06, + "loss": 0.9379, + "step": 3629 + }, + { + "epoch": 0.8740143261301391, + "grad_norm": 1.2690285444259644, + "learning_rate": 8.262669548108349e-06, + "loss": 0.9261, + "step": 3630 + }, + { + "epoch": 0.8742551014265937, + "grad_norm": 1.7360615730285645, + "learning_rate": 8.23166141146131e-06, + "loss": 0.3663, + "step": 3631 + }, + { + "epoch": 0.8744958767230482, + "grad_norm": 1.55054771900177, + "learning_rate": 8.200709070170876e-06, + "loss": 0.4774, + "step": 3632 + }, + { + "epoch": 0.8747366520195028, + "grad_norm": 7.235383987426758, + "learning_rate": 8.169812543056155e-06, + "loss": 0.4964, + "step": 3633 + }, + { + "epoch": 0.8749774273159574, + "grad_norm": 1.6751036643981934, + "learning_rate": 8.13897184890231e-06, + "loss": 1.2376, + "step": 3634 + }, + { + "epoch": 0.875218202612412, + "grad_norm": 2.576141119003296, + "learning_rate": 8.108187006460533e-06, + "loss": 0.543, + "step": 3635 + }, + { + "epoch": 0.8754589779088665, + "grad_norm": 1.2057117223739624, + "learning_rate": 8.077458034448105e-06, + "loss": 0.2369, + "step": 3636 + }, + { + "epoch": 0.8756997532053211, + "grad_norm": 1.8533183336257935, + "learning_rate": 8.046784951548302e-06, + "loss": 0.6765, + "step": 3637 + }, + { + "epoch": 0.8759405285017757, + "grad_norm": 1.2572599649429321, + "learning_rate": 8.01616777641041e-06, + "loss": 0.5824, + "step": 3638 + }, + { + "epoch": 0.8761813037982303, + "grad_norm": 1.9591710567474365, + "learning_rate": 7.985606527649769e-06, + "loss": 0.9164, + "step": 3639 + }, + { + "epoch": 0.8764220790946848, + "grad_norm": 0.9350030422210693, + "learning_rate": 7.955101223847649e-06, + "loss": 0.1639, + "step": 3640 + }, + { + "epoch": 0.8766628543911394, + "grad_norm": 1.2530356645584106, + "learning_rate": 7.92465188355137e-06, + "loss": 0.6552, + "step": 3641 + }, + { + "epoch": 0.876903629687594, + "grad_norm": 2.239734649658203, + "learning_rate": 7.894258525274189e-06, + "loss": 0.9524, + "step": 3642 + }, + { + "epoch": 0.8771444049840487, + "grad_norm": 1.1480050086975098, + "learning_rate": 7.863921167495348e-06, + "loss": 0.5416, + "step": 3643 + }, + { + "epoch": 0.8773851802805033, + "grad_norm": 1.4632046222686768, + "learning_rate": 7.833639828660033e-06, + "loss": 0.756, + "step": 3644 + }, + { + "epoch": 0.8776259555769578, + "grad_norm": 2.0627739429473877, + "learning_rate": 7.803414527179343e-06, + "loss": 0.2886, + "step": 3645 + }, + { + "epoch": 0.8778667308734124, + "grad_norm": 1.060166835784912, + "learning_rate": 7.77324528143033e-06, + "loss": 0.4151, + "step": 3646 + }, + { + "epoch": 0.878107506169867, + "grad_norm": 0.9938207864761353, + "learning_rate": 7.743132109756001e-06, + "loss": 0.4678, + "step": 3647 + }, + { + "epoch": 0.8783482814663216, + "grad_norm": 3.3694393634796143, + "learning_rate": 7.713075030465199e-06, + "loss": 0.7547, + "step": 3648 + }, + { + "epoch": 0.8785890567627761, + "grad_norm": 5.267509460449219, + "learning_rate": 7.683074061832685e-06, + "loss": 0.8142, + "step": 3649 + }, + { + "epoch": 0.8788298320592307, + "grad_norm": 1.6653624773025513, + "learning_rate": 7.653129222099143e-06, + "loss": 0.5309, + "step": 3650 + }, + { + "epoch": 0.8790706073556853, + "grad_norm": 6.179348468780518, + "learning_rate": 7.623240529471099e-06, + "loss": 0.5801, + "step": 3651 + }, + { + "epoch": 0.8793113826521399, + "grad_norm": 1.773995041847229, + "learning_rate": 7.5934080021209496e-06, + "loss": 0.2403, + "step": 3652 + }, + { + "epoch": 0.8795521579485944, + "grad_norm": 1.451350450515747, + "learning_rate": 7.563631658186921e-06, + "loss": 0.4058, + "step": 3653 + }, + { + "epoch": 0.879792933245049, + "grad_norm": 1.8011319637298584, + "learning_rate": 7.533911515773096e-06, + "loss": 0.6627, + "step": 3654 + }, + { + "epoch": 0.8800337085415036, + "grad_norm": 1.6576850414276123, + "learning_rate": 7.5042475929494205e-06, + "loss": 0.4088, + "step": 3655 + }, + { + "epoch": 0.8802744838379583, + "grad_norm": 3.480985403060913, + "learning_rate": 7.4746399077515905e-06, + "loss": 0.6784, + "step": 3656 + }, + { + "epoch": 0.8805152591344129, + "grad_norm": 6.059078693389893, + "learning_rate": 7.445088478181151e-06, + "loss": 0.6168, + "step": 3657 + }, + { + "epoch": 0.8807560344308674, + "grad_norm": 2.815342903137207, + "learning_rate": 7.4155933222054494e-06, + "loss": 0.6268, + "step": 3658 + }, + { + "epoch": 0.880996809727322, + "grad_norm": 1.5780977010726929, + "learning_rate": 7.386154457757599e-06, + "loss": 0.5302, + "step": 3659 + }, + { + "epoch": 0.8812375850237766, + "grad_norm": 1.6322784423828125, + "learning_rate": 7.356771902736514e-06, + "loss": 0.8104, + "step": 3660 + }, + { + "epoch": 0.8814783603202312, + "grad_norm": 2.2666542530059814, + "learning_rate": 7.327445675006839e-06, + "loss": 0.4232, + "step": 3661 + }, + { + "epoch": 0.8817191356166857, + "grad_norm": 2.0304696559906006, + "learning_rate": 7.2981757923989755e-06, + "loss": 0.7694, + "step": 3662 + }, + { + "epoch": 0.8819599109131403, + "grad_norm": 4.598212242126465, + "learning_rate": 7.268962272709101e-06, + "loss": 0.8297, + "step": 3663 + }, + { + "epoch": 0.8822006862095949, + "grad_norm": 0.6841728687286377, + "learning_rate": 7.239805133699085e-06, + "loss": 0.4114, + "step": 3664 + }, + { + "epoch": 0.8824414615060495, + "grad_norm": 0.9421213269233704, + "learning_rate": 7.210704393096534e-06, + "loss": 0.24, + "step": 3665 + }, + { + "epoch": 0.882682236802504, + "grad_norm": 3.708446979522705, + "learning_rate": 7.181660068594764e-06, + "loss": 0.8693, + "step": 3666 + }, + { + "epoch": 0.8829230120989586, + "grad_norm": 0.668267548084259, + "learning_rate": 7.152672177852804e-06, + "loss": 0.2204, + "step": 3667 + }, + { + "epoch": 0.8831637873954132, + "grad_norm": 1.9290603399276733, + "learning_rate": 7.1237407384953655e-06, + "loss": 0.3814, + "step": 3668 + }, + { + "epoch": 0.8834045626918678, + "grad_norm": 1.4112284183502197, + "learning_rate": 7.09486576811278e-06, + "loss": 0.5461, + "step": 3669 + }, + { + "epoch": 0.8836453379883223, + "grad_norm": 2.2663886547088623, + "learning_rate": 7.066047284261157e-06, + "loss": 0.4108, + "step": 3670 + }, + { + "epoch": 0.883886113284777, + "grad_norm": 1.4161934852600098, + "learning_rate": 7.037285304462138e-06, + "loss": 0.4483, + "step": 3671 + }, + { + "epoch": 0.8841268885812316, + "grad_norm": 2.718825101852417, + "learning_rate": 7.008579846203112e-06, + "loss": 0.454, + "step": 3672 + }, + { + "epoch": 0.8843676638776862, + "grad_norm": 1.7748380899429321, + "learning_rate": 6.979930926937062e-06, + "loss": 0.343, + "step": 3673 + }, + { + "epoch": 0.8846084391741408, + "grad_norm": 3.08974552154541, + "learning_rate": 6.951338564082555e-06, + "loss": 0.9658, + "step": 3674 + }, + { + "epoch": 0.8848492144705953, + "grad_norm": 1.58262300491333, + "learning_rate": 6.922802775023862e-06, + "loss": 0.7142, + "step": 3675 + }, + { + "epoch": 0.8850899897670499, + "grad_norm": 2.7481048107147217, + "learning_rate": 6.894323577110795e-06, + "loss": 0.4091, + "step": 3676 + }, + { + "epoch": 0.8853307650635045, + "grad_norm": 1.3223680257797241, + "learning_rate": 6.865900987658758e-06, + "loss": 0.6664, + "step": 3677 + }, + { + "epoch": 0.8855715403599591, + "grad_norm": 2.7666354179382324, + "learning_rate": 6.83753502394876e-06, + "loss": 0.7086, + "step": 3678 + }, + { + "epoch": 0.8858123156564136, + "grad_norm": 1.754913091659546, + "learning_rate": 6.809225703227351e-06, + "loss": 0.8025, + "step": 3679 + }, + { + "epoch": 0.8860530909528682, + "grad_norm": 1.091379165649414, + "learning_rate": 6.780973042706673e-06, + "loss": 0.1816, + "step": 3680 + }, + { + "epoch": 0.8862938662493228, + "grad_norm": 0.8506015539169312, + "learning_rate": 6.75277705956443e-06, + "loss": 0.3386, + "step": 3681 + }, + { + "epoch": 0.8865346415457774, + "grad_norm": 2.4108307361602783, + "learning_rate": 6.724637770943798e-06, + "loss": 0.6603, + "step": 3682 + }, + { + "epoch": 0.8867754168422319, + "grad_norm": 7.575955390930176, + "learning_rate": 6.6965551939535795e-06, + "loss": 0.7667, + "step": 3683 + }, + { + "epoch": 0.8870161921386865, + "grad_norm": 1.8191972970962524, + "learning_rate": 6.668529345667995e-06, + "loss": 0.5581, + "step": 3684 + }, + { + "epoch": 0.8872569674351412, + "grad_norm": 1.4198626279830933, + "learning_rate": 6.640560243126859e-06, + "loss": 0.3262, + "step": 3685 + }, + { + "epoch": 0.8874977427315958, + "grad_norm": 1.5749354362487793, + "learning_rate": 6.612647903335445e-06, + "loss": 0.5869, + "step": 3686 + }, + { + "epoch": 0.8877385180280504, + "grad_norm": 5.316013813018799, + "learning_rate": 6.58479234326449e-06, + "loss": 0.3689, + "step": 3687 + }, + { + "epoch": 0.8879792933245049, + "grad_norm": 1.4162142276763916, + "learning_rate": 6.556993579850268e-06, + "loss": 0.4004, + "step": 3688 + }, + { + "epoch": 0.8882200686209595, + "grad_norm": 2.608461618423462, + "learning_rate": 6.529251629994482e-06, + "loss": 0.4771, + "step": 3689 + }, + { + "epoch": 0.8884608439174141, + "grad_norm": 4.454953670501709, + "learning_rate": 6.501566510564295e-06, + "loss": 1.0927, + "step": 3690 + }, + { + "epoch": 0.8887016192138687, + "grad_norm": 5.660929203033447, + "learning_rate": 6.4739382383923185e-06, + "loss": 0.5887, + "step": 3691 + }, + { + "epoch": 0.8889423945103232, + "grad_norm": 5.009692668914795, + "learning_rate": 6.446366830276607e-06, + "loss": 0.8515, + "step": 3692 + }, + { + "epoch": 0.8891831698067778, + "grad_norm": 1.05977201461792, + "learning_rate": 6.4188523029806495e-06, + "loss": 0.5923, + "step": 3693 + }, + { + "epoch": 0.8894239451032324, + "grad_norm": 2.408989906311035, + "learning_rate": 6.3913946732333414e-06, + "loss": 0.5728, + "step": 3694 + }, + { + "epoch": 0.889664720399687, + "grad_norm": 1.7964873313903809, + "learning_rate": 6.363993957728953e-06, + "loss": 0.616, + "step": 3695 + }, + { + "epoch": 0.8899054956961415, + "grad_norm": 1.842602252960205, + "learning_rate": 6.336650173127223e-06, + "loss": 0.5101, + "step": 3696 + }, + { + "epoch": 0.8901462709925961, + "grad_norm": 1.9197190999984741, + "learning_rate": 6.309363336053209e-06, + "loss": 0.6051, + "step": 3697 + }, + { + "epoch": 0.8903870462890507, + "grad_norm": 14.988290786743164, + "learning_rate": 6.282133463097362e-06, + "loss": 0.4667, + "step": 3698 + }, + { + "epoch": 0.8906278215855054, + "grad_norm": 2.018468141555786, + "learning_rate": 6.254960570815527e-06, + "loss": 0.3706, + "step": 3699 + }, + { + "epoch": 0.89086859688196, + "grad_norm": 1.9267723560333252, + "learning_rate": 6.227844675728867e-06, + "loss": 1.4215, + "step": 3700 + }, + { + "epoch": 0.8911093721784145, + "grad_norm": 4.696993350982666, + "learning_rate": 6.2007857943239155e-06, + "loss": 0.7318, + "step": 3701 + }, + { + "epoch": 0.8913501474748691, + "grad_norm": 2.4501404762268066, + "learning_rate": 6.1737839430525575e-06, + "loss": 0.7474, + "step": 3702 + }, + { + "epoch": 0.8915909227713237, + "grad_norm": 3.1551716327667236, + "learning_rate": 6.146839138331928e-06, + "loss": 0.4585, + "step": 3703 + }, + { + "epoch": 0.8918316980677783, + "grad_norm": 8.161409378051758, + "learning_rate": 6.119951396544576e-06, + "loss": 0.3907, + "step": 3704 + }, + { + "epoch": 0.8920724733642328, + "grad_norm": 5.239265441894531, + "learning_rate": 6.093120734038283e-06, + "loss": 0.8809, + "step": 3705 + }, + { + "epoch": 0.8923132486606874, + "grad_norm": 2.640469789505005, + "learning_rate": 6.0663471671261515e-06, + "loss": 0.5767, + "step": 3706 + }, + { + "epoch": 0.892554023957142, + "grad_norm": 2.713616371154785, + "learning_rate": 6.0396307120865746e-06, + "loss": 1.0203, + "step": 3707 + }, + { + "epoch": 0.8927947992535966, + "grad_norm": 1.149683952331543, + "learning_rate": 6.012971385163224e-06, + "loss": 0.481, + "step": 3708 + }, + { + "epoch": 0.8930355745500511, + "grad_norm": 0.7889773845672607, + "learning_rate": 5.986369202565034e-06, + "loss": 0.4251, + "step": 3709 + }, + { + "epoch": 0.8932763498465057, + "grad_norm": 0.5434550046920776, + "learning_rate": 5.959824180466178e-06, + "loss": 0.1314, + "step": 3710 + }, + { + "epoch": 0.8935171251429603, + "grad_norm": 4.118932723999023, + "learning_rate": 5.93333633500609e-06, + "loss": 0.3042, + "step": 3711 + }, + { + "epoch": 0.8937579004394149, + "grad_norm": 1.7867053747177124, + "learning_rate": 5.906905682289465e-06, + "loss": 0.6089, + "step": 3712 + }, + { + "epoch": 0.8939986757358696, + "grad_norm": 3.315713405609131, + "learning_rate": 5.880532238386161e-06, + "loss": 0.8192, + "step": 3713 + }, + { + "epoch": 0.894239451032324, + "grad_norm": 2.045057535171509, + "learning_rate": 5.854216019331305e-06, + "loss": 0.6398, + "step": 3714 + }, + { + "epoch": 0.8944802263287787, + "grad_norm": 1.3669121265411377, + "learning_rate": 5.8279570411252316e-06, + "loss": 0.2998, + "step": 3715 + }, + { + "epoch": 0.8947210016252333, + "grad_norm": 1.606748104095459, + "learning_rate": 5.801755319733438e-06, + "loss": 0.4933, + "step": 3716 + }, + { + "epoch": 0.8949617769216879, + "grad_norm": 1.4545626640319824, + "learning_rate": 5.775610871086667e-06, + "loss": 0.7581, + "step": 3717 + }, + { + "epoch": 0.8952025522181424, + "grad_norm": 2.5948798656463623, + "learning_rate": 5.749523711080762e-06, + "loss": 0.7313, + "step": 3718 + }, + { + "epoch": 0.895443327514597, + "grad_norm": 3.4522511959075928, + "learning_rate": 5.723493855576778e-06, + "loss": 0.4838, + "step": 3719 + }, + { + "epoch": 0.8956841028110516, + "grad_norm": 1.7841429710388184, + "learning_rate": 5.697521320400967e-06, + "loss": 0.5223, + "step": 3720 + }, + { + "epoch": 0.8959248781075062, + "grad_norm": 2.9225831031799316, + "learning_rate": 5.67160612134463e-06, + "loss": 0.8148, + "step": 3721 + }, + { + "epoch": 0.8961656534039607, + "grad_norm": 1.33102285861969, + "learning_rate": 5.645748274164309e-06, + "loss": 0.1499, + "step": 3722 + }, + { + "epoch": 0.8964064287004153, + "grad_norm": 0.6809419989585876, + "learning_rate": 5.619947794581615e-06, + "loss": 0.1958, + "step": 3723 + }, + { + "epoch": 0.8966472039968699, + "grad_norm": 2.1398706436157227, + "learning_rate": 5.594204698283301e-06, + "loss": 0.4343, + "step": 3724 + }, + { + "epoch": 0.8968879792933245, + "grad_norm": 2.5033817291259766, + "learning_rate": 5.568519000921235e-06, + "loss": 0.7202, + "step": 3725 + }, + { + "epoch": 0.8971287545897791, + "grad_norm": 1.002875566482544, + "learning_rate": 5.54289071811237e-06, + "loss": 0.479, + "step": 3726 + }, + { + "epoch": 0.8973695298862336, + "grad_norm": 2.744338035583496, + "learning_rate": 5.517319865438764e-06, + "loss": 0.923, + "step": 3727 + }, + { + "epoch": 0.8976103051826883, + "grad_norm": 2.405243396759033, + "learning_rate": 5.491806458447557e-06, + "loss": 1.0923, + "step": 3728 + }, + { + "epoch": 0.8978510804791429, + "grad_norm": 2.02970814704895, + "learning_rate": 5.466350512650953e-06, + "loss": 0.9937, + "step": 3729 + }, + { + "epoch": 0.8980918557755975, + "grad_norm": 1.6559299230575562, + "learning_rate": 5.440952043526215e-06, + "loss": 0.5801, + "step": 3730 + }, + { + "epoch": 0.898332631072052, + "grad_norm": 4.650358200073242, + "learning_rate": 5.41561106651568e-06, + "loss": 0.812, + "step": 3731 + }, + { + "epoch": 0.8985734063685066, + "grad_norm": 2.2116572856903076, + "learning_rate": 5.390327597026712e-06, + "loss": 0.5227, + "step": 3732 + }, + { + "epoch": 0.8988141816649612, + "grad_norm": 2.8776440620422363, + "learning_rate": 5.3651016504317475e-06, + "loss": 1.0063, + "step": 3733 + }, + { + "epoch": 0.8990549569614158, + "grad_norm": 2.672783136367798, + "learning_rate": 5.339933242068174e-06, + "loss": 0.5567, + "step": 3734 + }, + { + "epoch": 0.8992957322578703, + "grad_norm": 2.6852715015411377, + "learning_rate": 5.3148223872384715e-06, + "loss": 0.4038, + "step": 3735 + }, + { + "epoch": 0.8995365075543249, + "grad_norm": 4.501379013061523, + "learning_rate": 5.289769101210074e-06, + "loss": 0.6712, + "step": 3736 + }, + { + "epoch": 0.8997772828507795, + "grad_norm": 1.5511711835861206, + "learning_rate": 5.26477339921545e-06, + "loss": 0.2077, + "step": 3737 + }, + { + "epoch": 0.9000180581472341, + "grad_norm": 1.7630692720413208, + "learning_rate": 5.239835296452045e-06, + "loss": 0.4195, + "step": 3738 + }, + { + "epoch": 0.9002588334436887, + "grad_norm": 1.3953133821487427, + "learning_rate": 5.214954808082273e-06, + "loss": 0.6487, + "step": 3739 + }, + { + "epoch": 0.9004996087401432, + "grad_norm": 1.0030934810638428, + "learning_rate": 5.190131949233523e-06, + "loss": 0.6278, + "step": 3740 + }, + { + "epoch": 0.9007403840365978, + "grad_norm": 0.8308902978897095, + "learning_rate": 5.165366734998178e-06, + "loss": 0.301, + "step": 3741 + }, + { + "epoch": 0.9009811593330525, + "grad_norm": 2.4039227962493896, + "learning_rate": 5.140659180433516e-06, + "loss": 0.8232, + "step": 3742 + }, + { + "epoch": 0.9012219346295071, + "grad_norm": 1.496519923210144, + "learning_rate": 5.116009300561797e-06, + "loss": 0.5532, + "step": 3743 + }, + { + "epoch": 0.9014627099259616, + "grad_norm": 2.6047515869140625, + "learning_rate": 5.0914171103701895e-06, + "loss": 0.6708, + "step": 3744 + }, + { + "epoch": 0.9017034852224162, + "grad_norm": 4.155404090881348, + "learning_rate": 5.066882624810809e-06, + "loss": 0.4486, + "step": 3745 + }, + { + "epoch": 0.9019442605188708, + "grad_norm": 5.623289108276367, + "learning_rate": 5.042405858800692e-06, + "loss": 0.6778, + "step": 3746 + }, + { + "epoch": 0.9021850358153254, + "grad_norm": 2.981544256210327, + "learning_rate": 5.017986827221733e-06, + "loss": 0.6458, + "step": 3747 + }, + { + "epoch": 0.9024258111117799, + "grad_norm": 0.939237117767334, + "learning_rate": 4.993625544920799e-06, + "loss": 0.1, + "step": 3748 + }, + { + "epoch": 0.9026665864082345, + "grad_norm": 1.9405542612075806, + "learning_rate": 4.969322026709577e-06, + "loss": 0.5831, + "step": 3749 + }, + { + "epoch": 0.9029073617046891, + "grad_norm": 0.6850067973136902, + "learning_rate": 4.945076287364669e-06, + "loss": 0.3371, + "step": 3750 + }, + { + "epoch": 0.9031481370011437, + "grad_norm": 2.4914636611938477, + "learning_rate": 4.9208883416275495e-06, + "loss": 0.726, + "step": 3751 + }, + { + "epoch": 0.9033889122975982, + "grad_norm": 2.531623601913452, + "learning_rate": 4.896758204204532e-06, + "loss": 0.4892, + "step": 3752 + }, + { + "epoch": 0.9036296875940528, + "grad_norm": 0.5930827856063843, + "learning_rate": 4.8726858897667816e-06, + "loss": 0.2482, + "step": 3753 + }, + { + "epoch": 0.9038704628905074, + "grad_norm": 1.0176321268081665, + "learning_rate": 4.8486714129503565e-06, + "loss": 0.4158, + "step": 3754 + }, + { + "epoch": 0.904111238186962, + "grad_norm": 2.1726293563842773, + "learning_rate": 4.824714788356066e-06, + "loss": 0.4867, + "step": 3755 + }, + { + "epoch": 0.9043520134834167, + "grad_norm": 0.8435872793197632, + "learning_rate": 4.800816030549638e-06, + "loss": 0.5242, + "step": 3756 + }, + { + "epoch": 0.9045927887798711, + "grad_norm": 3.269883394241333, + "learning_rate": 4.776975154061536e-06, + "loss": 1.0293, + "step": 3757 + }, + { + "epoch": 0.9048335640763258, + "grad_norm": 1.6402075290679932, + "learning_rate": 4.753192173387089e-06, + "loss": 0.5345, + "step": 3758 + }, + { + "epoch": 0.9050743393727804, + "grad_norm": 1.5480372905731201, + "learning_rate": 4.729467102986396e-06, + "loss": 0.3328, + "step": 3759 + }, + { + "epoch": 0.905315114669235, + "grad_norm": 3.9539589881896973, + "learning_rate": 4.705799957284351e-06, + "loss": 0.5114, + "step": 3760 + }, + { + "epoch": 0.9055558899656895, + "grad_norm": 1.5085039138793945, + "learning_rate": 4.6821907506706345e-06, + "loss": 0.928, + "step": 3761 + }, + { + "epoch": 0.9057966652621441, + "grad_norm": 1.067598819732666, + "learning_rate": 4.6586394974996836e-06, + "loss": 0.9092, + "step": 3762 + }, + { + "epoch": 0.9060374405585987, + "grad_norm": 2.00384783744812, + "learning_rate": 4.635146212090735e-06, + "loss": 0.5831, + "step": 3763 + }, + { + "epoch": 0.9062782158550533, + "grad_norm": 3.4452266693115234, + "learning_rate": 4.61171090872774e-06, + "loss": 0.2842, + "step": 3764 + }, + { + "epoch": 0.9065189911515078, + "grad_norm": 2.7296142578125, + "learning_rate": 4.588333601659423e-06, + "loss": 0.5461, + "step": 3765 + }, + { + "epoch": 0.9067597664479624, + "grad_norm": 1.4393811225891113, + "learning_rate": 4.565014305099247e-06, + "loss": 0.329, + "step": 3766 + }, + { + "epoch": 0.907000541744417, + "grad_norm": 1.5528326034545898, + "learning_rate": 4.541753033225393e-06, + "loss": 0.9322, + "step": 3767 + }, + { + "epoch": 0.9072413170408716, + "grad_norm": 1.016589879989624, + "learning_rate": 4.5185498001807605e-06, + "loss": 0.4822, + "step": 3768 + }, + { + "epoch": 0.9074820923373262, + "grad_norm": 1.8541332483291626, + "learning_rate": 4.495404620072985e-06, + "loss": 0.3784, + "step": 3769 + }, + { + "epoch": 0.9077228676337807, + "grad_norm": 2.7582716941833496, + "learning_rate": 4.472317506974366e-06, + "loss": 0.2414, + "step": 3770 + }, + { + "epoch": 0.9079636429302353, + "grad_norm": 1.1082451343536377, + "learning_rate": 4.44928847492192e-06, + "loss": 0.6802, + "step": 3771 + }, + { + "epoch": 0.90820441822669, + "grad_norm": 1.2040634155273438, + "learning_rate": 4.426317537917368e-06, + "loss": 0.333, + "step": 3772 + }, + { + "epoch": 0.9084451935231446, + "grad_norm": 2.672924280166626, + "learning_rate": 4.403404709927084e-06, + "loss": 0.5014, + "step": 3773 + }, + { + "epoch": 0.9086859688195991, + "grad_norm": 3.4659922122955322, + "learning_rate": 4.3805500048821225e-06, + "loss": 0.2261, + "step": 3774 + }, + { + "epoch": 0.9089267441160537, + "grad_norm": 2.050246477127075, + "learning_rate": 4.35775343667818e-06, + "loss": 0.7365, + "step": 3775 + }, + { + "epoch": 0.9091675194125083, + "grad_norm": 0.8071643710136414, + "learning_rate": 4.335015019175637e-06, + "loss": 0.7056, + "step": 3776 + }, + { + "epoch": 0.9094082947089629, + "grad_norm": 2.027353525161743, + "learning_rate": 4.3123347661995105e-06, + "loss": 0.2542, + "step": 3777 + }, + { + "epoch": 0.9096490700054174, + "grad_norm": 2.125551700592041, + "learning_rate": 4.289712691539416e-06, + "loss": 0.1728, + "step": 3778 + }, + { + "epoch": 0.909889845301872, + "grad_norm": 1.8031567335128784, + "learning_rate": 4.267148808949639e-06, + "loss": 0.4307, + "step": 3779 + }, + { + "epoch": 0.9101306205983266, + "grad_norm": 0.9503381252288818, + "learning_rate": 4.244643132149084e-06, + "loss": 0.4191, + "step": 3780 + }, + { + "epoch": 0.9103713958947812, + "grad_norm": 1.0291205644607544, + "learning_rate": 4.2221956748212384e-06, + "loss": 0.7377, + "step": 3781 + }, + { + "epoch": 0.9106121711912358, + "grad_norm": 0.7489404082298279, + "learning_rate": 4.19980645061423e-06, + "loss": 0.5554, + "step": 3782 + }, + { + "epoch": 0.9108529464876903, + "grad_norm": 1.0450713634490967, + "learning_rate": 4.177475473140724e-06, + "loss": 0.3164, + "step": 3783 + }, + { + "epoch": 0.9110937217841449, + "grad_norm": 1.918267011642456, + "learning_rate": 4.155202755978027e-06, + "loss": 0.759, + "step": 3784 + }, + { + "epoch": 0.9113344970805995, + "grad_norm": 1.4207653999328613, + "learning_rate": 4.132988312667996e-06, + "loss": 0.6942, + "step": 3785 + }, + { + "epoch": 0.9115752723770542, + "grad_norm": 1.9554914236068726, + "learning_rate": 4.110832156717059e-06, + "loss": 0.6352, + "step": 3786 + }, + { + "epoch": 0.9118160476735087, + "grad_norm": 2.0343658924102783, + "learning_rate": 4.088734301596209e-06, + "loss": 0.7284, + "step": 3787 + }, + { + "epoch": 0.9120568229699633, + "grad_norm": 2.685506820678711, + "learning_rate": 4.066694760740996e-06, + "loss": 0.732, + "step": 3788 + }, + { + "epoch": 0.9122975982664179, + "grad_norm": 0.8585965633392334, + "learning_rate": 4.044713547551504e-06, + "loss": 0.4282, + "step": 3789 + }, + { + "epoch": 0.9125383735628725, + "grad_norm": 1.1643730401992798, + "learning_rate": 4.022790675392385e-06, + "loss": 0.3473, + "step": 3790 + }, + { + "epoch": 0.912779148859327, + "grad_norm": 1.323641061782837, + "learning_rate": 4.0009261575927545e-06, + "loss": 0.3057, + "step": 3791 + }, + { + "epoch": 0.9130199241557816, + "grad_norm": 0.6881577968597412, + "learning_rate": 3.979120007446313e-06, + "loss": 0.3613, + "step": 3792 + }, + { + "epoch": 0.9132606994522362, + "grad_norm": 2.510683298110962, + "learning_rate": 3.957372238211254e-06, + "loss": 0.8894, + "step": 3793 + }, + { + "epoch": 0.9135014747486908, + "grad_norm": 1.2805593013763428, + "learning_rate": 3.935682863110246e-06, + "loss": 0.3263, + "step": 3794 + }, + { + "epoch": 0.9137422500451454, + "grad_norm": 0.9712691903114319, + "learning_rate": 3.914051895330506e-06, + "loss": 0.3099, + "step": 3795 + }, + { + "epoch": 0.9139830253415999, + "grad_norm": 6.864174842834473, + "learning_rate": 3.892479348023681e-06, + "loss": 0.5236, + "step": 3796 + }, + { + "epoch": 0.9142238006380545, + "grad_norm": 3.510481834411621, + "learning_rate": 3.8709652343059565e-06, + "loss": 0.6298, + "step": 3797 + }, + { + "epoch": 0.9144645759345091, + "grad_norm": 1.1707494258880615, + "learning_rate": 3.849509567257959e-06, + "loss": 0.5126, + "step": 3798 + }, + { + "epoch": 0.9147053512309637, + "grad_norm": 1.940595269203186, + "learning_rate": 3.828112359924763e-06, + "loss": 0.5885, + "step": 3799 + }, + { + "epoch": 0.9149461265274182, + "grad_norm": 3.032635450363159, + "learning_rate": 3.8067736253159404e-06, + "loss": 0.4042, + "step": 3800 + }, + { + "epoch": 0.9151869018238729, + "grad_norm": 1.1645691394805908, + "learning_rate": 3.785493376405469e-06, + "loss": 0.441, + "step": 3801 + }, + { + "epoch": 0.9154276771203275, + "grad_norm": 4.984523296356201, + "learning_rate": 3.764271626131799e-06, + "loss": 0.4769, + "step": 3802 + }, + { + "epoch": 0.9156684524167821, + "grad_norm": 1.2044367790222168, + "learning_rate": 3.743108387397798e-06, + "loss": 0.3186, + "step": 3803 + }, + { + "epoch": 0.9159092277132366, + "grad_norm": 0.9009220004081726, + "learning_rate": 3.722003673070773e-06, + "loss": 0.3869, + "step": 3804 + }, + { + "epoch": 0.9161500030096912, + "grad_norm": 3.3949239253997803, + "learning_rate": 3.700957495982427e-06, + "loss": 0.4886, + "step": 3805 + }, + { + "epoch": 0.9163907783061458, + "grad_norm": 1.1531803607940674, + "learning_rate": 3.6799698689289007e-06, + "loss": 0.488, + "step": 3806 + }, + { + "epoch": 0.9166315536026004, + "grad_norm": 1.1033868789672852, + "learning_rate": 3.659040804670699e-06, + "loss": 0.4561, + "step": 3807 + }, + { + "epoch": 0.916872328899055, + "grad_norm": 0.8439015746116638, + "learning_rate": 3.638170315932754e-06, + "loss": 0.443, + "step": 3808 + }, + { + "epoch": 0.9171131041955095, + "grad_norm": 3.8729732036590576, + "learning_rate": 3.6173584154043484e-06, + "loss": 0.4908, + "step": 3809 + }, + { + "epoch": 0.9173538794919641, + "grad_norm": 1.3288518190383911, + "learning_rate": 3.5966051157391824e-06, + "loss": 0.5959, + "step": 3810 + }, + { + "epoch": 0.9175946547884187, + "grad_norm": 1.7388828992843628, + "learning_rate": 3.575910429555307e-06, + "loss": 0.1869, + "step": 3811 + }, + { + "epoch": 0.9178354300848733, + "grad_norm": 1.7654670476913452, + "learning_rate": 3.5552743694351354e-06, + "loss": 0.5691, + "step": 3812 + }, + { + "epoch": 0.9180762053813278, + "grad_norm": 1.1300573348999023, + "learning_rate": 3.5346969479254532e-06, + "loss": 0.5942, + "step": 3813 + }, + { + "epoch": 0.9183169806777824, + "grad_norm": 1.7620266675949097, + "learning_rate": 3.5141781775373527e-06, + "loss": 0.623, + "step": 3814 + }, + { + "epoch": 0.9185577559742371, + "grad_norm": 2.159961223602295, + "learning_rate": 3.493718070746299e-06, + "loss": 0.5568, + "step": 3815 + }, + { + "epoch": 0.9187985312706917, + "grad_norm": 1.376558542251587, + "learning_rate": 3.473316639992108e-06, + "loss": 0.4562, + "step": 3816 + }, + { + "epoch": 0.9190393065671462, + "grad_norm": 2.9691762924194336, + "learning_rate": 3.4529738976788574e-06, + "loss": 0.2444, + "step": 3817 + }, + { + "epoch": 0.9192800818636008, + "grad_norm": 2.1220481395721436, + "learning_rate": 3.4326898561750087e-06, + "loss": 0.7534, + "step": 3818 + }, + { + "epoch": 0.9195208571600554, + "grad_norm": 3.4113810062408447, + "learning_rate": 3.412464527813297e-06, + "loss": 0.4182, + "step": 3819 + }, + { + "epoch": 0.91976163245651, + "grad_norm": 0.48114293813705444, + "learning_rate": 3.3922979248907638e-06, + "loss": 0.3866, + "step": 3820 + }, + { + "epoch": 0.9200024077529645, + "grad_norm": 0.9137384295463562, + "learning_rate": 3.372190059668756e-06, + "loss": 0.4303, + "step": 3821 + }, + { + "epoch": 0.9202431830494191, + "grad_norm": 2.026947021484375, + "learning_rate": 3.3521409443728947e-06, + "loss": 0.813, + "step": 3822 + }, + { + "epoch": 0.9204839583458737, + "grad_norm": 4.139359474182129, + "learning_rate": 3.332150591193095e-06, + "loss": 0.5473, + "step": 3823 + }, + { + "epoch": 0.9207247336423283, + "grad_norm": 0.5374311208724976, + "learning_rate": 3.312219012283535e-06, + "loss": 0.3422, + "step": 3824 + }, + { + "epoch": 0.9209655089387829, + "grad_norm": 1.6601024866104126, + "learning_rate": 3.2923462197626433e-06, + "loss": 0.777, + "step": 3825 + }, + { + "epoch": 0.9212062842352374, + "grad_norm": 1.9013340473175049, + "learning_rate": 3.272532225713143e-06, + "loss": 0.8191, + "step": 3826 + }, + { + "epoch": 0.921447059531692, + "grad_norm": 1.4580706357955933, + "learning_rate": 3.252777042181976e-06, + "loss": 0.5187, + "step": 3827 + }, + { + "epoch": 0.9216878348281466, + "grad_norm": 4.314640522003174, + "learning_rate": 3.233080681180323e-06, + "loss": 1.1448, + "step": 3828 + }, + { + "epoch": 0.9219286101246013, + "grad_norm": 0.6895533800125122, + "learning_rate": 3.21344315468366e-06, + "loss": 0.3816, + "step": 3829 + }, + { + "epoch": 0.9221693854210558, + "grad_norm": 0.710097074508667, + "learning_rate": 3.193864474631614e-06, + "loss": 0.317, + "step": 3830 + }, + { + "epoch": 0.9224101607175104, + "grad_norm": 3.7332968711853027, + "learning_rate": 3.174344652928063e-06, + "loss": 0.261, + "step": 3831 + }, + { + "epoch": 0.922650936013965, + "grad_norm": 1.7163169384002686, + "learning_rate": 3.1548837014411357e-06, + "loss": 0.5922, + "step": 3832 + }, + { + "epoch": 0.9228917113104196, + "grad_norm": 1.793519139289856, + "learning_rate": 3.135481632003101e-06, + "loss": 1.0053, + "step": 3833 + }, + { + "epoch": 0.9231324866068741, + "grad_norm": 2.4100849628448486, + "learning_rate": 3.116138456410478e-06, + "loss": 0.5618, + "step": 3834 + }, + { + "epoch": 0.9233732619033287, + "grad_norm": 1.54547119140625, + "learning_rate": 3.0968541864239476e-06, + "loss": 0.1529, + "step": 3835 + }, + { + "epoch": 0.9236140371997833, + "grad_norm": 1.1684705018997192, + "learning_rate": 3.0776288337683977e-06, + "loss": 0.2276, + "step": 3836 + }, + { + "epoch": 0.9238548124962379, + "grad_norm": 2.4300782680511475, + "learning_rate": 3.0584624101328785e-06, + "loss": 0.4722, + "step": 3837 + }, + { + "epoch": 0.9240955877926925, + "grad_norm": 1.5282294750213623, + "learning_rate": 3.0393549271706345e-06, + "loss": 0.5003, + "step": 3838 + }, + { + "epoch": 0.924336363089147, + "grad_norm": 2.3679680824279785, + "learning_rate": 3.0203063964990617e-06, + "loss": 0.6853, + "step": 3839 + }, + { + "epoch": 0.9245771383856016, + "grad_norm": 0.8373381495475769, + "learning_rate": 3.001316829699685e-06, + "loss": 0.2153, + "step": 3840 + }, + { + "epoch": 0.9248179136820562, + "grad_norm": 3.2224361896514893, + "learning_rate": 2.982386238318213e-06, + "loss": 0.4978, + "step": 3841 + }, + { + "epoch": 0.9250586889785108, + "grad_norm": 3.300617218017578, + "learning_rate": 2.963514633864506e-06, + "loss": 0.747, + "step": 3842 + }, + { + "epoch": 0.9252994642749653, + "grad_norm": 1.0810372829437256, + "learning_rate": 2.9447020278125072e-06, + "loss": 0.4873, + "step": 3843 + }, + { + "epoch": 0.92554023957142, + "grad_norm": 1.0733656883239746, + "learning_rate": 2.925948431600356e-06, + "loss": 0.6448, + "step": 3844 + }, + { + "epoch": 0.9257810148678746, + "grad_norm": 1.8431061506271362, + "learning_rate": 2.9072538566302654e-06, + "loss": 0.3739, + "step": 3845 + }, + { + "epoch": 0.9260217901643292, + "grad_norm": 4.552158832550049, + "learning_rate": 2.8886183142685763e-06, + "loss": 1.5292, + "step": 3846 + }, + { + "epoch": 0.9262625654607837, + "grad_norm": 1.8968464136123657, + "learning_rate": 2.87004181584577e-06, + "loss": 0.3096, + "step": 3847 + }, + { + "epoch": 0.9265033407572383, + "grad_norm": 1.599402904510498, + "learning_rate": 2.8515243726563557e-06, + "loss": 0.8706, + "step": 3848 + }, + { + "epoch": 0.9267441160536929, + "grad_norm": 1.9225425720214844, + "learning_rate": 2.8330659959589946e-06, + "loss": 0.5347, + "step": 3849 + }, + { + "epoch": 0.9269848913501475, + "grad_norm": 2.9519357681274414, + "learning_rate": 2.8146666969764535e-06, + "loss": 0.7436, + "step": 3850 + }, + { + "epoch": 0.9272256666466021, + "grad_norm": 0.88628089427948, + "learning_rate": 2.7963264868955065e-06, + "loss": 0.5481, + "step": 3851 + }, + { + "epoch": 0.9274664419430566, + "grad_norm": 2.948660373687744, + "learning_rate": 2.7780453768670557e-06, + "loss": 0.8285, + "step": 3852 + }, + { + "epoch": 0.9277072172395112, + "grad_norm": 0.3918326497077942, + "learning_rate": 2.7598233780060543e-06, + "loss": 0.3189, + "step": 3853 + }, + { + "epoch": 0.9279479925359658, + "grad_norm": 0.8619207739830017, + "learning_rate": 2.7416605013915297e-06, + "loss": 0.2851, + "step": 3854 + }, + { + "epoch": 0.9281887678324204, + "grad_norm": 2.232577323913574, + "learning_rate": 2.7235567580665587e-06, + "loss": 0.5436, + "step": 3855 + }, + { + "epoch": 0.9284295431288749, + "grad_norm": 3.190218448638916, + "learning_rate": 2.705512159038226e-06, + "loss": 0.6544, + "step": 3856 + }, + { + "epoch": 0.9286703184253295, + "grad_norm": 2.477781295776367, + "learning_rate": 2.687526715277722e-06, + "loss": 0.5965, + "step": 3857 + }, + { + "epoch": 0.9289110937217842, + "grad_norm": 1.608775019645691, + "learning_rate": 2.669600437720221e-06, + "loss": 0.3984, + "step": 3858 + }, + { + "epoch": 0.9291518690182388, + "grad_norm": 4.561281204223633, + "learning_rate": 2.651733337264928e-06, + "loss": 0.4893, + "step": 3859 + }, + { + "epoch": 0.9293926443146933, + "grad_norm": 6.173032760620117, + "learning_rate": 2.6339254247751078e-06, + "loss": 0.7945, + "step": 3860 + }, + { + "epoch": 0.9296334196111479, + "grad_norm": 1.251524806022644, + "learning_rate": 2.616176711077989e-06, + "loss": 0.6277, + "step": 3861 + }, + { + "epoch": 0.9298741949076025, + "grad_norm": 1.0110701322555542, + "learning_rate": 2.5984872069648393e-06, + "loss": 0.5838, + "step": 3862 + }, + { + "epoch": 0.9301149702040571, + "grad_norm": 2.0771772861480713, + "learning_rate": 2.580856923190933e-06, + "loss": 0.9169, + "step": 3863 + }, + { + "epoch": 0.9303557455005117, + "grad_norm": 1.6572563648223877, + "learning_rate": 2.5632858704754848e-06, + "loss": 0.4677, + "step": 3864 + }, + { + "epoch": 0.9305965207969662, + "grad_norm": 1.9874509572982788, + "learning_rate": 2.5457740595017707e-06, + "loss": 0.4567, + "step": 3865 + }, + { + "epoch": 0.9308372960934208, + "grad_norm": 2.116501808166504, + "learning_rate": 2.5283215009169857e-06, + "loss": 0.4592, + "step": 3866 + }, + { + "epoch": 0.9310780713898754, + "grad_norm": 2.0553879737854004, + "learning_rate": 2.51092820533233e-06, + "loss": 0.3902, + "step": 3867 + }, + { + "epoch": 0.93131884668633, + "grad_norm": 2.063753604888916, + "learning_rate": 2.4935941833229782e-06, + "loss": 0.7985, + "step": 3868 + }, + { + "epoch": 0.9315596219827845, + "grad_norm": 1.6043528318405151, + "learning_rate": 2.4763194454280435e-06, + "loss": 0.7647, + "step": 3869 + }, + { + "epoch": 0.9318003972792391, + "grad_norm": 1.9053353071212769, + "learning_rate": 2.4591040021506027e-06, + "loss": 0.6018, + "step": 3870 + }, + { + "epoch": 0.9320411725756937, + "grad_norm": 2.5057151317596436, + "learning_rate": 2.4419478639577164e-06, + "loss": 0.7383, + "step": 3871 + }, + { + "epoch": 0.9322819478721484, + "grad_norm": 2.225681781768799, + "learning_rate": 2.424851041280307e-06, + "loss": 0.4332, + "step": 3872 + }, + { + "epoch": 0.9325227231686029, + "grad_norm": 0.5860837697982788, + "learning_rate": 2.4078135445133156e-06, + "loss": 0.6404, + "step": 3873 + }, + { + "epoch": 0.9327634984650575, + "grad_norm": 3.1039059162139893, + "learning_rate": 2.390835384015555e-06, + "loss": 0.6935, + "step": 3874 + }, + { + "epoch": 0.9330042737615121, + "grad_norm": 2.5515451431274414, + "learning_rate": 2.373916570109802e-06, + "loss": 0.3266, + "step": 3875 + }, + { + "epoch": 0.9332450490579667, + "grad_norm": 3.696157455444336, + "learning_rate": 2.357057113082728e-06, + "loss": 0.3768, + "step": 3876 + }, + { + "epoch": 0.9334858243544213, + "grad_norm": 2.1884636878967285, + "learning_rate": 2.340257023184922e-06, + "loss": 0.2111, + "step": 3877 + }, + { + "epoch": 0.9337265996508758, + "grad_norm": 4.436749458312988, + "learning_rate": 2.323516310630891e-06, + "loss": 0.9444, + "step": 3878 + }, + { + "epoch": 0.9339673749473304, + "grad_norm": 1.7192350625991821, + "learning_rate": 2.3068349855989936e-06, + "loss": 0.3693, + "step": 3879 + }, + { + "epoch": 0.934208150243785, + "grad_norm": 5.450645446777344, + "learning_rate": 2.2902130582315274e-06, + "loss": 0.4809, + "step": 3880 + }, + { + "epoch": 0.9344489255402396, + "grad_norm": 1.2411659955978394, + "learning_rate": 2.2736505386346863e-06, + "loss": 1.0273, + "step": 3881 + }, + { + "epoch": 0.9346897008366941, + "grad_norm": 0.8129162192344666, + "learning_rate": 2.2571474368784707e-06, + "loss": 0.0588, + "step": 3882 + }, + { + "epoch": 0.9349304761331487, + "grad_norm": 1.5056270360946655, + "learning_rate": 2.240703762996843e-06, + "loss": 0.6161, + "step": 3883 + }, + { + "epoch": 0.9351712514296033, + "grad_norm": 1.0241050720214844, + "learning_rate": 2.224319526987584e-06, + "loss": 0.4896, + "step": 3884 + }, + { + "epoch": 0.9354120267260579, + "grad_norm": 0.8922635316848755, + "learning_rate": 2.2079947388123356e-06, + "loss": 0.5838, + "step": 3885 + }, + { + "epoch": 0.9356528020225124, + "grad_norm": 3.8598411083221436, + "learning_rate": 2.1917294083966254e-06, + "loss": 0.5277, + "step": 3886 + }, + { + "epoch": 0.935893577318967, + "grad_norm": 10.636117935180664, + "learning_rate": 2.1755235456297986e-06, + "loss": 0.9566, + "step": 3887 + }, + { + "epoch": 0.9361343526154217, + "grad_norm": 0.4968515932559967, + "learning_rate": 2.15937716036505e-06, + "loss": 0.563, + "step": 3888 + }, + { + "epoch": 0.9363751279118763, + "grad_norm": 8.35496711730957, + "learning_rate": 2.1432902624194286e-06, + "loss": 0.2234, + "step": 3889 + }, + { + "epoch": 0.9366159032083309, + "grad_norm": 1.9201698303222656, + "learning_rate": 2.1272628615737977e-06, + "loss": 0.797, + "step": 3890 + }, + { + "epoch": 0.9368566785047854, + "grad_norm": 3.2286055088043213, + "learning_rate": 2.1112949675728743e-06, + "loss": 0.6248, + "step": 3891 + }, + { + "epoch": 0.93709745380124, + "grad_norm": 1.6388925313949585, + "learning_rate": 2.0953865901251255e-06, + "loss": 0.685, + "step": 3892 + }, + { + "epoch": 0.9373382290976946, + "grad_norm": 1.8927644491195679, + "learning_rate": 2.0795377389029257e-06, + "loss": 0.3773, + "step": 3893 + }, + { + "epoch": 0.9375790043941492, + "grad_norm": 11.435422897338867, + "learning_rate": 2.063748423542411e-06, + "loss": 0.7658, + "step": 3894 + }, + { + "epoch": 0.9378197796906037, + "grad_norm": 1.6999096870422363, + "learning_rate": 2.048018653643491e-06, + "loss": 0.727, + "step": 3895 + }, + { + "epoch": 0.9380605549870583, + "grad_norm": 0.23305965960025787, + "learning_rate": 2.0323484387699264e-06, + "loss": 0.1286, + "step": 3896 + }, + { + "epoch": 0.9383013302835129, + "grad_norm": 1.2219979763031006, + "learning_rate": 2.0167377884492412e-06, + "loss": 0.3562, + "step": 3897 + }, + { + "epoch": 0.9385421055799675, + "grad_norm": 1.201636552810669, + "learning_rate": 2.0011867121727313e-06, + "loss": 0.6063, + "step": 3898 + }, + { + "epoch": 0.938782880876422, + "grad_norm": 9.849644660949707, + "learning_rate": 1.9856952193955005e-06, + "loss": 0.9423, + "step": 3899 + }, + { + "epoch": 0.9390236561728766, + "grad_norm": 1.613932490348816, + "learning_rate": 1.9702633195363917e-06, + "loss": 0.522, + "step": 3900 + }, + { + "epoch": 0.9392644314693313, + "grad_norm": 2.0753109455108643, + "learning_rate": 1.954891021978045e-06, + "loss": 0.844, + "step": 3901 + }, + { + "epoch": 0.9395052067657859, + "grad_norm": 2.056060552597046, + "learning_rate": 1.9395783360668718e-06, + "loss": 0.5813, + "step": 3902 + }, + { + "epoch": 0.9397459820622404, + "grad_norm": 1.1586860418319702, + "learning_rate": 1.9243252711129923e-06, + "loss": 0.8256, + "step": 3903 + }, + { + "epoch": 0.939986757358695, + "grad_norm": 2.10019850730896, + "learning_rate": 1.909131836390321e-06, + "loss": 1.029, + "step": 3904 + }, + { + "epoch": 0.9402275326551496, + "grad_norm": 0.8406896591186523, + "learning_rate": 1.893998041136502e-06, + "loss": 0.4226, + "step": 3905 + }, + { + "epoch": 0.9404683079516042, + "grad_norm": 1.0460152626037598, + "learning_rate": 1.8789238945528976e-06, + "loss": 0.5012, + "step": 3906 + }, + { + "epoch": 0.9407090832480588, + "grad_norm": 3.7730448246002197, + "learning_rate": 1.8639094058046425e-06, + "loss": 1.1832, + "step": 3907 + }, + { + "epoch": 0.9409498585445133, + "grad_norm": 3.430011034011841, + "learning_rate": 1.848954584020568e-06, + "loss": 0.3868, + "step": 3908 + }, + { + "epoch": 0.9411906338409679, + "grad_norm": 3.7194321155548096, + "learning_rate": 1.834059438293234e-06, + "loss": 0.8586, + "step": 3909 + }, + { + "epoch": 0.9414314091374225, + "grad_norm": 1.3014260530471802, + "learning_rate": 1.819223977678941e-06, + "loss": 0.5398, + "step": 3910 + }, + { + "epoch": 0.9416721844338771, + "grad_norm": 0.9454424381256104, + "learning_rate": 1.8044482111976735e-06, + "loss": 0.6537, + "step": 3911 + }, + { + "epoch": 0.9419129597303316, + "grad_norm": 1.6029918193817139, + "learning_rate": 1.7897321478331342e-06, + "loss": 0.3148, + "step": 3912 + }, + { + "epoch": 0.9421537350267862, + "grad_norm": 3.4292304515838623, + "learning_rate": 1.7750757965327213e-06, + "loss": 0.5388, + "step": 3913 + }, + { + "epoch": 0.9423945103232408, + "grad_norm": 2.228184461593628, + "learning_rate": 1.7604791662075181e-06, + "loss": 0.7117, + "step": 3914 + }, + { + "epoch": 0.9426352856196955, + "grad_norm": 0.6483622193336487, + "learning_rate": 1.7459422657323254e-06, + "loss": 0.4383, + "step": 3915 + }, + { + "epoch": 0.94287606091615, + "grad_norm": 2.6446633338928223, + "learning_rate": 1.7314651039455954e-06, + "loss": 0.6101, + "step": 3916 + }, + { + "epoch": 0.9431168362126046, + "grad_norm": 2.0396006107330322, + "learning_rate": 1.717047689649487e-06, + "loss": 0.5838, + "step": 3917 + }, + { + "epoch": 0.9433576115090592, + "grad_norm": 0.8823184370994568, + "learning_rate": 1.7026900316098215e-06, + "loss": 0.3099, + "step": 3918 + }, + { + "epoch": 0.9435983868055138, + "grad_norm": 2.7773776054382324, + "learning_rate": 1.688392138556083e-06, + "loss": 0.9675, + "step": 3919 + }, + { + "epoch": 0.9438391621019684, + "grad_norm": 5.874734878540039, + "learning_rate": 1.6741540191814287e-06, + "loss": 0.3189, + "step": 3920 + }, + { + "epoch": 0.9440799373984229, + "grad_norm": 3.0924699306488037, + "learning_rate": 1.6599756821426449e-06, + "loss": 0.1751, + "step": 3921 + }, + { + "epoch": 0.9443207126948775, + "grad_norm": 0.5945261120796204, + "learning_rate": 1.6458571360602248e-06, + "loss": 0.6427, + "step": 3922 + }, + { + "epoch": 0.9445614879913321, + "grad_norm": 4.261098861694336, + "learning_rate": 1.6317983895182575e-06, + "loss": 0.7257, + "step": 3923 + }, + { + "epoch": 0.9448022632877867, + "grad_norm": 3.0185914039611816, + "learning_rate": 1.6177994510644834e-06, + "loss": 0.6177, + "step": 3924 + }, + { + "epoch": 0.9450430385842412, + "grad_norm": 1.362781286239624, + "learning_rate": 1.603860329210316e-06, + "loss": 1.1531, + "step": 3925 + }, + { + "epoch": 0.9452838138806958, + "grad_norm": 1.4825752973556519, + "learning_rate": 1.589981032430743e-06, + "loss": 0.7275, + "step": 3926 + }, + { + "epoch": 0.9455245891771504, + "grad_norm": 1.4190683364868164, + "learning_rate": 1.576161569164436e-06, + "loss": 0.7995, + "step": 3927 + }, + { + "epoch": 0.945765364473605, + "grad_norm": 1.807726263999939, + "learning_rate": 1.5624019478136408e-06, + "loss": 0.3198, + "step": 3928 + }, + { + "epoch": 0.9460061397700595, + "grad_norm": 2.1518940925598145, + "learning_rate": 1.5487021767442433e-06, + "loss": 0.4084, + "step": 3929 + }, + { + "epoch": 0.9462469150665141, + "grad_norm": 1.5749576091766357, + "learning_rate": 1.535062264285736e-06, + "loss": 0.6711, + "step": 3930 + }, + { + "epoch": 0.9464876903629688, + "grad_norm": 0.5705631375312805, + "learning_rate": 1.5214822187312294e-06, + "loss": 0.3036, + "step": 3931 + }, + { + "epoch": 0.9467284656594234, + "grad_norm": 0.7009626030921936, + "learning_rate": 1.5079620483373857e-06, + "loss": 0.5722, + "step": 3932 + }, + { + "epoch": 0.946969240955878, + "grad_norm": 1.3820369243621826, + "learning_rate": 1.4945017613245294e-06, + "loss": 0.1829, + "step": 3933 + }, + { + "epoch": 0.9472100162523325, + "grad_norm": 5.041447639465332, + "learning_rate": 1.481101365876547e-06, + "loss": 0.6447, + "step": 3934 + }, + { + "epoch": 0.9474507915487871, + "grad_norm": 3.8214685916900635, + "learning_rate": 1.4677608701408886e-06, + "loss": 0.7699, + "step": 3935 + }, + { + "epoch": 0.9476915668452417, + "grad_norm": 2.032578468322754, + "learning_rate": 1.4544802822286318e-06, + "loss": 0.5696, + "step": 3936 + }, + { + "epoch": 0.9479323421416963, + "grad_norm": 0.9442195892333984, + "learning_rate": 1.4412596102143738e-06, + "loss": 0.2906, + "step": 3937 + }, + { + "epoch": 0.9481731174381508, + "grad_norm": 1.667283535003662, + "learning_rate": 1.42809886213634e-06, + "loss": 0.4485, + "step": 3938 + }, + { + "epoch": 0.9484138927346054, + "grad_norm": 2.698345184326172, + "learning_rate": 1.4149980459962742e-06, + "loss": 0.3603, + "step": 3939 + }, + { + "epoch": 0.94865466803106, + "grad_norm": 1.8065334558486938, + "learning_rate": 1.4019571697595156e-06, + "loss": 0.2997, + "step": 3940 + }, + { + "epoch": 0.9488954433275146, + "grad_norm": 2.3097822666168213, + "learning_rate": 1.3889762413549333e-06, + "loss": 0.5277, + "step": 3941 + }, + { + "epoch": 0.9491362186239691, + "grad_norm": 2.765949249267578, + "learning_rate": 1.3760552686749806e-06, + "loss": 0.3425, + "step": 3942 + }, + { + "epoch": 0.9493769939204237, + "grad_norm": 0.5022979378700256, + "learning_rate": 1.3631942595756175e-06, + "loss": 0.5907, + "step": 3943 + }, + { + "epoch": 0.9496177692168783, + "grad_norm": 1.2168604135513306, + "learning_rate": 1.3503932218763893e-06, + "loss": 0.2498, + "step": 3944 + }, + { + "epoch": 0.949858544513333, + "grad_norm": 0.7240424752235413, + "learning_rate": 1.3376521633603256e-06, + "loss": 0.2152, + "step": 3945 + }, + { + "epoch": 0.9500993198097876, + "grad_norm": 5.824214935302734, + "learning_rate": 1.324971091774052e-06, + "loss": 0.7278, + "step": 3946 + }, + { + "epoch": 0.9503400951062421, + "grad_norm": 2.0212886333465576, + "learning_rate": 1.312350014827668e-06, + "loss": 0.8276, + "step": 3947 + }, + { + "epoch": 0.9505808704026967, + "grad_norm": 4.131972789764404, + "learning_rate": 1.2997889401948126e-06, + "loss": 0.4576, + "step": 3948 + }, + { + "epoch": 0.9508216456991513, + "grad_norm": 0.5388569831848145, + "learning_rate": 1.287287875512655e-06, + "loss": 0.3775, + "step": 3949 + }, + { + "epoch": 0.9510624209956059, + "grad_norm": 0.646866500377655, + "learning_rate": 1.2748468283818815e-06, + "loss": 0.2518, + "step": 3950 + }, + { + "epoch": 0.9513031962920604, + "grad_norm": 1.9133360385894775, + "learning_rate": 1.2624658063666639e-06, + "loss": 0.6595, + "step": 3951 + }, + { + "epoch": 0.951543971588515, + "grad_norm": 1.385985255241394, + "learning_rate": 1.2501448169946916e-06, + "loss": 0.6742, + "step": 3952 + }, + { + "epoch": 0.9517847468849696, + "grad_norm": 2.3750483989715576, + "learning_rate": 1.2378838677571503e-06, + "loss": 0.9347, + "step": 3953 + }, + { + "epoch": 0.9520255221814242, + "grad_norm": 2.106820583343506, + "learning_rate": 1.2256829661087432e-06, + "loss": 0.4332, + "step": 3954 + }, + { + "epoch": 0.9522662974778787, + "grad_norm": 1.1812132596969604, + "learning_rate": 1.2135421194676256e-06, + "loss": 0.4027, + "step": 3955 + }, + { + "epoch": 0.9525070727743333, + "grad_norm": 1.4752898216247559, + "learning_rate": 1.2014613352154702e-06, + "loss": 0.2301, + "step": 3956 + }, + { + "epoch": 0.9527478480707879, + "grad_norm": 1.9427971839904785, + "learning_rate": 1.189440620697424e-06, + "loss": 0.6347, + "step": 3957 + }, + { + "epoch": 0.9529886233672425, + "grad_norm": 2.3208911418914795, + "learning_rate": 1.1774799832220961e-06, + "loss": 0.9104, + "step": 3958 + }, + { + "epoch": 0.9532293986636972, + "grad_norm": 1.113741397857666, + "learning_rate": 1.1655794300615918e-06, + "loss": 0.8192, + "step": 3959 + }, + { + "epoch": 0.9534701739601517, + "grad_norm": 2.645212411880493, + "learning_rate": 1.1537389684514787e-06, + "loss": 0.7612, + "step": 3960 + }, + { + "epoch": 0.9537109492566063, + "grad_norm": 4.026910781860352, + "learning_rate": 1.141958605590765e-06, + "loss": 0.8181, + "step": 3961 + }, + { + "epoch": 0.9539517245530609, + "grad_norm": 3.301568031311035, + "learning_rate": 1.1302383486419544e-06, + "loss": 1.2559, + "step": 3962 + }, + { + "epoch": 0.9541924998495155, + "grad_norm": 1.3520029783248901, + "learning_rate": 1.11857820473098e-06, + "loss": 0.6955, + "step": 3963 + }, + { + "epoch": 0.95443327514597, + "grad_norm": 3.8628885746002197, + "learning_rate": 1.106978180947238e-06, + "loss": 0.9347, + "step": 3964 + }, + { + "epoch": 0.9546740504424246, + "grad_norm": 1.2216380834579468, + "learning_rate": 1.095438284343575e-06, + "loss": 0.6449, + "step": 3965 + }, + { + "epoch": 0.9549148257388792, + "grad_norm": 0.4715072214603424, + "learning_rate": 1.083958521936257e-06, + "loss": 0.2787, + "step": 3966 + }, + { + "epoch": 0.9551556010353338, + "grad_norm": 2.9943132400512695, + "learning_rate": 1.0725389007050446e-06, + "loss": 0.8323, + "step": 3967 + }, + { + "epoch": 0.9553963763317883, + "grad_norm": 3.0367226600646973, + "learning_rate": 1.0611794275930399e-06, + "loss": 0.8128, + "step": 3968 + }, + { + "epoch": 0.9556371516282429, + "grad_norm": 1.5612680912017822, + "learning_rate": 1.0498801095068733e-06, + "loss": 0.2859, + "step": 3969 + }, + { + "epoch": 0.9558779269246975, + "grad_norm": 1.2354720830917358, + "learning_rate": 1.0386409533165276e-06, + "loss": 0.6305, + "step": 3970 + }, + { + "epoch": 0.9561187022211521, + "grad_norm": 3.0069570541381836, + "learning_rate": 1.0274619658554475e-06, + "loss": 0.6211, + "step": 3971 + }, + { + "epoch": 0.9563594775176067, + "grad_norm": 2.1763761043548584, + "learning_rate": 1.0163431539204847e-06, + "loss": 0.7641, + "step": 3972 + }, + { + "epoch": 0.9566002528140612, + "grad_norm": 2.459559202194214, + "learning_rate": 1.005284524271899e-06, + "loss": 0.822, + "step": 3973 + }, + { + "epoch": 0.9568410281105159, + "grad_norm": 1.9608721733093262, + "learning_rate": 9.942860836333445e-07, + "loss": 0.8843, + "step": 3974 + }, + { + "epoch": 0.9570818034069705, + "grad_norm": 2.488222599029541, + "learning_rate": 9.833478386919282e-07, + "loss": 0.9933, + "step": 3975 + }, + { + "epoch": 0.9573225787034251, + "grad_norm": 1.750231146812439, + "learning_rate": 9.724697960981077e-07, + "loss": 0.3185, + "step": 3976 + }, + { + "epoch": 0.9575633539998796, + "grad_norm": 1.300431489944458, + "learning_rate": 9.616519624657706e-07, + "loss": 0.4801, + "step": 3977 + }, + { + "epoch": 0.9578041292963342, + "grad_norm": 0.6917396783828735, + "learning_rate": 9.508943443721663e-07, + "loss": 0.5667, + "step": 3978 + }, + { + "epoch": 0.9580449045927888, + "grad_norm": 3.407341480255127, + "learning_rate": 9.401969483579632e-07, + "loss": 0.5509, + "step": 3979 + }, + { + "epoch": 0.9582856798892434, + "grad_norm": 2.0416157245635986, + "learning_rate": 9.295597809272028e-07, + "loss": 0.1392, + "step": 3980 + }, + { + "epoch": 0.9585264551856979, + "grad_norm": 2.7798619270324707, + "learning_rate": 9.189828485473006e-07, + "loss": 0.9261, + "step": 3981 + }, + { + "epoch": 0.9587672304821525, + "grad_norm": 0.8763837218284607, + "learning_rate": 9.084661576490461e-07, + "loss": 0.398, + "step": 3982 + }, + { + "epoch": 0.9590080057786071, + "grad_norm": 1.7817946672439575, + "learning_rate": 8.980097146266464e-07, + "loss": 0.4061, + "step": 3983 + }, + { + "epoch": 0.9592487810750617, + "grad_norm": 1.229095458984375, + "learning_rate": 8.876135258376051e-07, + "loss": 0.4831, + "step": 3984 + }, + { + "epoch": 0.9594895563715162, + "grad_norm": 3.4042162895202637, + "learning_rate": 8.772775976028546e-07, + "loss": 0.4386, + "step": 3985 + }, + { + "epoch": 0.9597303316679708, + "grad_norm": 1.3494471311569214, + "learning_rate": 8.670019362066461e-07, + "loss": 0.1705, + "step": 3986 + }, + { + "epoch": 0.9599711069644254, + "grad_norm": 1.1241267919540405, + "learning_rate": 8.567865478966042e-07, + "loss": 0.3146, + "step": 3987 + }, + { + "epoch": 0.9602118822608801, + "grad_norm": 0.9588642120361328, + "learning_rate": 8.466314388837271e-07, + "loss": 0.2576, + "step": 3988 + }, + { + "epoch": 0.9604526575573347, + "grad_norm": 0.49164265394210815, + "learning_rate": 8.365366153423204e-07, + "loss": 0.2641, + "step": 3989 + }, + { + "epoch": 0.9606934328537892, + "grad_norm": 1.6935783624649048, + "learning_rate": 8.265020834100635e-07, + "loss": 0.5948, + "step": 3990 + }, + { + "epoch": 0.9609342081502438, + "grad_norm": 1.3072270154953003, + "learning_rate": 8.165278491879868e-07, + "loss": 0.5994, + "step": 3991 + }, + { + "epoch": 0.9611749834466984, + "grad_norm": 4.324315071105957, + "learning_rate": 8.066139187404398e-07, + "loss": 0.4573, + "step": 3992 + }, + { + "epoch": 0.961415758743153, + "grad_norm": 6.5658087730407715, + "learning_rate": 7.967602980951228e-07, + "loss": 0.6555, + "step": 3993 + }, + { + "epoch": 0.9616565340396075, + "grad_norm": 2.510852813720703, + "learning_rate": 7.869669932430435e-07, + "loss": 0.617, + "step": 3994 + }, + { + "epoch": 0.9618973093360621, + "grad_norm": 1.054416298866272, + "learning_rate": 7.772340101385611e-07, + "loss": 0.7014, + "step": 3995 + }, + { + "epoch": 0.9621380846325167, + "grad_norm": 3.1223275661468506, + "learning_rate": 7.675613546993643e-07, + "loss": 0.7056, + "step": 3996 + }, + { + "epoch": 0.9623788599289713, + "grad_norm": 2.15596866607666, + "learning_rate": 7.579490328064265e-07, + "loss": 0.7612, + "step": 3997 + }, + { + "epoch": 0.9626196352254258, + "grad_norm": 3.02179217338562, + "learning_rate": 7.483970503040726e-07, + "loss": 0.6353, + "step": 3998 + }, + { + "epoch": 0.9628604105218804, + "grad_norm": 1.420333743095398, + "learning_rate": 7.38905412999924e-07, + "loss": 0.3904, + "step": 3999 + }, + { + "epoch": 0.963101185818335, + "grad_norm": 2.189934253692627, + "learning_rate": 7.294741266649307e-07, + "loss": 0.4709, + "step": 4000 + }, + { + "epoch": 0.9633419611147896, + "grad_norm": 3.9134743213653564, + "learning_rate": 7.201031970333283e-07, + "loss": 0.3967, + "step": 4001 + }, + { + "epoch": 0.9635827364112443, + "grad_norm": 14.885796546936035, + "learning_rate": 7.10792629802659e-07, + "loss": 0.9829, + "step": 4002 + }, + { + "epoch": 0.9638235117076988, + "grad_norm": 2.1734344959259033, + "learning_rate": 7.015424306337725e-07, + "loss": 0.3751, + "step": 4003 + }, + { + "epoch": 0.9640642870041534, + "grad_norm": 2.0911247730255127, + "learning_rate": 6.923526051508145e-07, + "loss": 0.7239, + "step": 4004 + }, + { + "epoch": 0.964305062300608, + "grad_norm": 2.525022029876709, + "learning_rate": 6.832231589412042e-07, + "loss": 0.2855, + "step": 4005 + }, + { + "epoch": 0.9645458375970626, + "grad_norm": 1.239410161972046, + "learning_rate": 6.741540975556903e-07, + "loss": 0.3171, + "step": 4006 + }, + { + "epoch": 0.9647866128935171, + "grad_norm": 1.3856205940246582, + "learning_rate": 6.651454265082512e-07, + "loss": 0.5041, + "step": 4007 + }, + { + "epoch": 0.9650273881899717, + "grad_norm": 1.1256098747253418, + "learning_rate": 6.561971512762055e-07, + "loss": 0.4639, + "step": 4008 + }, + { + "epoch": 0.9652681634864263, + "grad_norm": 1.6934860944747925, + "learning_rate": 6.473092773001233e-07, + "loss": 0.488, + "step": 4009 + }, + { + "epoch": 0.9655089387828809, + "grad_norm": 3.070348024368286, + "learning_rate": 6.384818099838374e-07, + "loss": 0.9032, + "step": 4010 + }, + { + "epoch": 0.9657497140793354, + "grad_norm": 1.458402395248413, + "learning_rate": 6.297147546944882e-07, + "loss": 0.4057, + "step": 4011 + }, + { + "epoch": 0.96599048937579, + "grad_norm": 2.034212589263916, + "learning_rate": 6.210081167624338e-07, + "loss": 0.265, + "step": 4012 + }, + { + "epoch": 0.9662312646722446, + "grad_norm": 1.1797361373901367, + "learning_rate": 6.12361901481362e-07, + "loss": 0.6248, + "step": 4013 + }, + { + "epoch": 0.9664720399686992, + "grad_norm": 4.208076000213623, + "learning_rate": 6.037761141081677e-07, + "loss": 0.8388, + "step": 4014 + }, + { + "epoch": 0.9667128152651538, + "grad_norm": 3.974991798400879, + "learning_rate": 5.952507598630419e-07, + "loss": 0.8926, + "step": 4015 + }, + { + "epoch": 0.9669535905616083, + "grad_norm": 1.3050296306610107, + "learning_rate": 5.86785843929416e-07, + "loss": 0.438, + "step": 4016 + }, + { + "epoch": 0.967194365858063, + "grad_norm": 4.134682655334473, + "learning_rate": 5.783813714539731e-07, + "loss": 1.4216, + "step": 4017 + }, + { + "epoch": 0.9674351411545176, + "grad_norm": 1.5222718715667725, + "learning_rate": 5.700373475466592e-07, + "loss": 0.5619, + "step": 4018 + }, + { + "epoch": 0.9676759164509722, + "grad_norm": 1.8690755367279053, + "learning_rate": 5.617537772806602e-07, + "loss": 0.45, + "step": 4019 + }, + { + "epoch": 0.9679166917474267, + "grad_norm": 2.2518856525421143, + "learning_rate": 5.535306656923922e-07, + "loss": 0.2202, + "step": 4020 + }, + { + "epoch": 0.9681574670438813, + "grad_norm": 0.996590793132782, + "learning_rate": 5.453680177815445e-07, + "loss": 0.599, + "step": 4021 + }, + { + "epoch": 0.9683982423403359, + "grad_norm": 2.219210624694824, + "learning_rate": 5.372658385110141e-07, + "loss": 0.4227, + "step": 4022 + }, + { + "epoch": 0.9686390176367905, + "grad_norm": 4.723870754241943, + "learning_rate": 5.29224132806938e-07, + "loss": 0.6357, + "step": 4023 + }, + { + "epoch": 0.968879792933245, + "grad_norm": 3.3653030395507812, + "learning_rate": 5.212429055587165e-07, + "loss": 0.5787, + "step": 4024 + }, + { + "epoch": 0.9691205682296996, + "grad_norm": 2.3071415424346924, + "learning_rate": 5.133221616189232e-07, + "loss": 0.6784, + "step": 4025 + }, + { + "epoch": 0.9693613435261542, + "grad_norm": 2.037489414215088, + "learning_rate": 5.054619058033949e-07, + "loss": 0.7301, + "step": 4026 + }, + { + "epoch": 0.9696021188226088, + "grad_norm": 0.8913125991821289, + "learning_rate": 4.976621428912087e-07, + "loss": 0.3828, + "step": 4027 + }, + { + "epoch": 0.9698428941190634, + "grad_norm": 2.0508530139923096, + "learning_rate": 4.899228776246157e-07, + "loss": 0.4969, + "step": 4028 + }, + { + "epoch": 0.9700836694155179, + "grad_norm": 2.9134552478790283, + "learning_rate": 4.822441147091072e-07, + "loss": 0.4914, + "step": 4029 + }, + { + "epoch": 0.9703244447119725, + "grad_norm": 1.360295295715332, + "learning_rate": 4.7462585881339337e-07, + "loss": 0.5586, + "step": 4030 + }, + { + "epoch": 0.9705652200084272, + "grad_norm": 0.46363896131515503, + "learning_rate": 4.6706811456939116e-07, + "loss": 0.1493, + "step": 4031 + }, + { + "epoch": 0.9708059953048818, + "grad_norm": 1.6321947574615479, + "learning_rate": 4.595708865722359e-07, + "loss": 0.7034, + "step": 4032 + }, + { + "epoch": 0.9710467706013363, + "grad_norm": 3.976177930831909, + "learning_rate": 4.5213417938023693e-07, + "loss": 0.6017, + "step": 4033 + }, + { + "epoch": 0.9712875458977909, + "grad_norm": 3.0852105617523193, + "learning_rate": 4.4475799751494405e-07, + "loss": 0.8562, + "step": 4034 + }, + { + "epoch": 0.9715283211942455, + "grad_norm": 1.4149786233901978, + "learning_rate": 4.374423454610921e-07, + "loss": 0.331, + "step": 4035 + }, + { + "epoch": 0.9717690964907001, + "grad_norm": 3.825847625732422, + "learning_rate": 4.3018722766661193e-07, + "loss": 0.7702, + "step": 4036 + }, + { + "epoch": 0.9720098717871546, + "grad_norm": 5.9385175704956055, + "learning_rate": 4.2299264854263056e-07, + "loss": 0.4582, + "step": 4037 + }, + { + "epoch": 0.9722506470836092, + "grad_norm": 2.5086779594421387, + "learning_rate": 4.1585861246346e-07, + "loss": 0.4805, + "step": 4038 + }, + { + "epoch": 0.9724914223800638, + "grad_norm": 1.3712728023529053, + "learning_rate": 4.087851237666196e-07, + "loss": 0.4194, + "step": 4039 + }, + { + "epoch": 0.9727321976765184, + "grad_norm": 4.936484336853027, + "learning_rate": 4.017721867528246e-07, + "loss": 0.4498, + "step": 4040 + }, + { + "epoch": 0.972972972972973, + "grad_norm": 1.3534749746322632, + "learning_rate": 3.948198056859198e-07, + "loss": 0.6054, + "step": 4041 + }, + { + "epoch": 0.9732137482694275, + "grad_norm": 3.1654248237609863, + "learning_rate": 3.8792798479299066e-07, + "loss": 0.8157, + "step": 4042 + }, + { + "epoch": 0.9734545235658821, + "grad_norm": 1.9799362421035767, + "learning_rate": 3.810967282642741e-07, + "loss": 0.5287, + "step": 4043 + }, + { + "epoch": 0.9736952988623367, + "grad_norm": 0.713421642780304, + "learning_rate": 3.743260402531923e-07, + "loss": 0.4187, + "step": 4044 + }, + { + "epoch": 0.9739360741587914, + "grad_norm": 2.7310409545898438, + "learning_rate": 3.676159248763411e-07, + "loss": 0.8605, + "step": 4045 + }, + { + "epoch": 0.9741768494552459, + "grad_norm": 0.2964976131916046, + "learning_rate": 3.6096638621346824e-07, + "loss": 0.176, + "step": 4046 + }, + { + "epoch": 0.9744176247517005, + "grad_norm": 4.847579002380371, + "learning_rate": 3.543774283075396e-07, + "loss": 0.6553, + "step": 4047 + }, + { + "epoch": 0.9746584000481551, + "grad_norm": 1.6431396007537842, + "learning_rate": 3.478490551646285e-07, + "loss": 0.5625, + "step": 4048 + }, + { + "epoch": 0.9748991753446097, + "grad_norm": 1.354458212852478, + "learning_rate": 3.413812707540154e-07, + "loss": 0.3545, + "step": 4049 + }, + { + "epoch": 0.9751399506410642, + "grad_norm": 0.7734440565109253, + "learning_rate": 3.3497407900812126e-07, + "loss": 0.3589, + "step": 4050 + }, + { + "epoch": 0.9753807259375188, + "grad_norm": 2.87133526802063, + "learning_rate": 3.2862748382253006e-07, + "loss": 0.2974, + "step": 4051 + }, + { + "epoch": 0.9756215012339734, + "grad_norm": 8.928435325622559, + "learning_rate": 3.223414890559995e-07, + "loss": 0.4755, + "step": 4052 + }, + { + "epoch": 0.975862276530428, + "grad_norm": 3.5270206928253174, + "learning_rate": 3.161160985304168e-07, + "loss": 0.4525, + "step": 4053 + }, + { + "epoch": 0.9761030518268826, + "grad_norm": 1.8198820352554321, + "learning_rate": 3.0995131603083205e-07, + "loss": 0.7032, + "step": 4054 + }, + { + "epoch": 0.9763438271233371, + "grad_norm": 2.2425918579101562, + "learning_rate": 3.038471453054581e-07, + "loss": 0.8367, + "step": 4055 + }, + { + "epoch": 0.9765846024197917, + "grad_norm": 2.4997448921203613, + "learning_rate": 2.978035900656373e-07, + "loss": 0.7143, + "step": 4056 + }, + { + "epoch": 0.9768253777162463, + "grad_norm": 2.2880537509918213, + "learning_rate": 2.918206539858637e-07, + "loss": 0.5019, + "step": 4057 + }, + { + "epoch": 0.9770661530127009, + "grad_norm": 0.7689948081970215, + "learning_rate": 2.8589834070378295e-07, + "loss": 0.2252, + "step": 4058 + }, + { + "epoch": 0.9773069283091554, + "grad_norm": 1.0523358583450317, + "learning_rate": 2.800366538201593e-07, + "loss": 0.3874, + "step": 4059 + }, + { + "epoch": 0.97754770360561, + "grad_norm": 2.757550001144409, + "learning_rate": 2.742355968989307e-07, + "loss": 0.3381, + "step": 4060 + }, + { + "epoch": 0.9777884789020647, + "grad_norm": 0.9217396378517151, + "learning_rate": 2.684951734671426e-07, + "loss": 0.1354, + "step": 4061 + }, + { + "epoch": 0.9780292541985193, + "grad_norm": 0.9151739478111267, + "learning_rate": 2.6281538701498075e-07, + "loss": 0.4916, + "step": 4062 + }, + { + "epoch": 0.9782700294949738, + "grad_norm": 1.2322125434875488, + "learning_rate": 2.571962409957718e-07, + "loss": 0.5954, + "step": 4063 + }, + { + "epoch": 0.9785108047914284, + "grad_norm": 6.46744441986084, + "learning_rate": 2.5163773882598274e-07, + "loss": 0.917, + "step": 4064 + }, + { + "epoch": 0.978751580087883, + "grad_norm": 2.882272958755493, + "learning_rate": 2.4613988388517696e-07, + "loss": 0.7739, + "step": 4065 + }, + { + "epoch": 0.9789923553843376, + "grad_norm": 1.9530011415481567, + "learning_rate": 2.407026795160694e-07, + "loss": 0.8486, + "step": 4066 + }, + { + "epoch": 0.9792331306807921, + "grad_norm": 2.2306883335113525, + "learning_rate": 2.3532612902449346e-07, + "loss": 0.5382, + "step": 4067 + }, + { + "epoch": 0.9794739059772467, + "grad_norm": 2.601823568344116, + "learning_rate": 2.3001023567941205e-07, + "loss": 0.7243, + "step": 4068 + }, + { + "epoch": 0.9797146812737013, + "grad_norm": 1.1144752502441406, + "learning_rate": 2.247550027128842e-07, + "loss": 0.8548, + "step": 4069 + }, + { + "epoch": 0.9799554565701559, + "grad_norm": 10.666824340820312, + "learning_rate": 2.1956043332010955e-07, + "loss": 0.9193, + "step": 4070 + }, + { + "epoch": 0.9801962318666105, + "grad_norm": 1.2598254680633545, + "learning_rate": 2.144265306594062e-07, + "loss": 0.3549, + "step": 4071 + }, + { + "epoch": 0.980437007163065, + "grad_norm": 2.1722021102905273, + "learning_rate": 2.093532978521884e-07, + "loss": 0.5457, + "step": 4072 + }, + { + "epoch": 0.9806777824595196, + "grad_norm": 3.0609018802642822, + "learning_rate": 2.0434073798298869e-07, + "loss": 0.9473, + "step": 4073 + }, + { + "epoch": 0.9809185577559743, + "grad_norm": 1.4855046272277832, + "learning_rate": 1.9938885409948038e-07, + "loss": 0.428, + "step": 4074 + }, + { + "epoch": 0.9811593330524289, + "grad_norm": 2.298407793045044, + "learning_rate": 1.9449764921238845e-07, + "loss": 1.0341, + "step": 4075 + }, + { + "epoch": 0.9814001083488834, + "grad_norm": 1.82069730758667, + "learning_rate": 1.8966712629558957e-07, + "loss": 0.9768, + "step": 4076 + }, + { + "epoch": 0.981640883645338, + "grad_norm": 3.1224253177642822, + "learning_rate": 1.848972882860567e-07, + "loss": 0.6785, + "step": 4077 + }, + { + "epoch": 0.9818816589417926, + "grad_norm": 3.6556875705718994, + "learning_rate": 1.8018813808385883e-07, + "loss": 0.6481, + "step": 4078 + }, + { + "epoch": 0.9821224342382472, + "grad_norm": 1.220012903213501, + "learning_rate": 1.7553967855217235e-07, + "loss": 0.9078, + "step": 4079 + }, + { + "epoch": 0.9823632095347017, + "grad_norm": 0.8602136373519897, + "learning_rate": 1.7095191251726982e-07, + "loss": 0.0683, + "step": 4080 + }, + { + "epoch": 0.9826039848311563, + "grad_norm": 2.155679941177368, + "learning_rate": 1.6642484276852e-07, + "loss": 0.7771, + "step": 4081 + }, + { + "epoch": 0.9828447601276109, + "grad_norm": 2.459348440170288, + "learning_rate": 1.6195847205838777e-07, + "loss": 1.0204, + "step": 4082 + }, + { + "epoch": 0.9830855354240655, + "grad_norm": 1.9554654359817505, + "learning_rate": 1.5755280310244536e-07, + "loss": 0.4039, + "step": 4083 + }, + { + "epoch": 0.9833263107205201, + "grad_norm": 1.885136604309082, + "learning_rate": 1.5320783857935005e-07, + "loss": 0.4138, + "step": 4084 + }, + { + "epoch": 0.9835670860169746, + "grad_norm": 1.192893385887146, + "learning_rate": 1.4892358113084426e-07, + "loss": 0.5029, + "step": 4085 + }, + { + "epoch": 0.9838078613134292, + "grad_norm": 3.7182071208953857, + "learning_rate": 1.447000333617665e-07, + "loss": 1.1002, + "step": 4086 + }, + { + "epoch": 0.9840486366098838, + "grad_norm": 1.4601658582687378, + "learning_rate": 1.405371978400516e-07, + "loss": 0.5054, + "step": 4087 + }, + { + "epoch": 0.9842894119063385, + "grad_norm": 2.312633752822876, + "learning_rate": 1.3643507709669713e-07, + "loss": 0.4722, + "step": 4088 + }, + { + "epoch": 0.984530187202793, + "grad_norm": 0.7593234181404114, + "learning_rate": 1.3239367362581912e-07, + "loss": 0.1804, + "step": 4089 + }, + { + "epoch": 0.9847709624992476, + "grad_norm": 2.778722047805786, + "learning_rate": 1.284129898845854e-07, + "loss": 0.6931, + "step": 4090 + }, + { + "epoch": 0.9850117377957022, + "grad_norm": 3.4330999851226807, + "learning_rate": 1.2449302829327102e-07, + "loss": 1.1589, + "step": 4091 + }, + { + "epoch": 0.9852525130921568, + "grad_norm": 2.7237799167633057, + "learning_rate": 1.20633791235214e-07, + "loss": 0.58, + "step": 4092 + }, + { + "epoch": 0.9854932883886113, + "grad_norm": 1.8853704929351807, + "learning_rate": 1.1683528105684848e-07, + "loss": 0.9436, + "step": 4093 + }, + { + "epoch": 0.9857340636850659, + "grad_norm": 1.500649094581604, + "learning_rate": 1.130975000676715e-07, + "loss": 0.8359, + "step": 4094 + }, + { + "epoch": 0.9859748389815205, + "grad_norm": 2.9082491397857666, + "learning_rate": 1.0942045054025407e-07, + "loss": 1.1474, + "step": 4095 + }, + { + "epoch": 0.9862156142779751, + "grad_norm": 0.6749841570854187, + "learning_rate": 1.058041347102634e-07, + "loss": 0.5816, + "step": 4096 + }, + { + "epoch": 0.9864563895744297, + "grad_norm": 1.8493642807006836, + "learning_rate": 1.0224855477642959e-07, + "loss": 1.1293, + "step": 4097 + }, + { + "epoch": 0.9866971648708842, + "grad_norm": 0.32945817708969116, + "learning_rate": 9.875371290053447e-08, + "loss": 0.3363, + "step": 4098 + }, + { + "epoch": 0.9869379401673388, + "grad_norm": 1.6765530109405518, + "learning_rate": 9.531961120746724e-08, + "loss": 0.892, + "step": 4099 + }, + { + "epoch": 0.9871787154637934, + "grad_norm": 2.7227275371551514, + "learning_rate": 9.19462517851688e-08, + "loss": 0.8791, + "step": 4100 + }, + { + "epoch": 0.987419490760248, + "grad_norm": 0.8839995265007019, + "learning_rate": 8.863363668464297e-08, + "loss": 0.5598, + "step": 4101 + }, + { + "epoch": 0.9876602660567025, + "grad_norm": 1.8077160120010376, + "learning_rate": 8.538176791996754e-08, + "loss": 0.3711, + "step": 4102 + }, + { + "epoch": 0.9879010413531571, + "grad_norm": 2.301443338394165, + "learning_rate": 8.21906474682943e-08, + "loss": 1.0025, + "step": 4103 + }, + { + "epoch": 0.9881418166496118, + "grad_norm": 1.6749955415725708, + "learning_rate": 7.906027726981568e-08, + "loss": 1.0008, + "step": 4104 + }, + { + "epoch": 0.9883825919460664, + "grad_norm": 2.291646957397461, + "learning_rate": 7.599065922780924e-08, + "loss": 0.4288, + "step": 4105 + }, + { + "epoch": 0.9886233672425209, + "grad_norm": 3.5161843299865723, + "learning_rate": 7.298179520862647e-08, + "loss": 0.8569, + "step": 4106 + }, + { + "epoch": 0.9888641425389755, + "grad_norm": 0.49535292387008667, + "learning_rate": 7.003368704164847e-08, + "loss": 0.3556, + "step": 4107 + }, + { + "epoch": 0.9891049178354301, + "grad_norm": 2.1845948696136475, + "learning_rate": 6.714633651931923e-08, + "loss": 0.4408, + "step": 4108 + }, + { + "epoch": 0.9893456931318847, + "grad_norm": 2.3166656494140625, + "learning_rate": 6.431974539717888e-08, + "loss": 0.4087, + "step": 4109 + }, + { + "epoch": 0.9895864684283393, + "grad_norm": 0.6833885312080383, + "learning_rate": 6.155391539379718e-08, + "loss": 0.362, + "step": 4110 + }, + { + "epoch": 0.9898272437247938, + "grad_norm": 1.6554492712020874, + "learning_rate": 5.884884819079561e-08, + "loss": 0.4277, + "step": 4111 + }, + { + "epoch": 0.9900680190212484, + "grad_norm": 1.1396666765213013, + "learning_rate": 5.620454543285858e-08, + "loss": 0.5649, + "step": 4112 + }, + { + "epoch": 0.990308794317703, + "grad_norm": 2.3340067863464355, + "learning_rate": 5.362100872773334e-08, + "loss": 0.5773, + "step": 4113 + }, + { + "epoch": 0.9905495696141576, + "grad_norm": 2.0202200412750244, + "learning_rate": 5.109823964621896e-08, + "loss": 0.5604, + "step": 4114 + }, + { + "epoch": 0.9907903449106121, + "grad_norm": 1.5799890756607056, + "learning_rate": 4.863623972216624e-08, + "loss": 0.5799, + "step": 4115 + }, + { + "epoch": 0.9910311202070667, + "grad_norm": 3.0115244388580322, + "learning_rate": 4.62350104524778e-08, + "loss": 0.631, + "step": 4116 + }, + { + "epoch": 0.9912718955035213, + "grad_norm": 1.998792290687561, + "learning_rate": 4.3894553297085805e-08, + "loss": 0.8206, + "step": 4117 + }, + { + "epoch": 0.991512670799976, + "grad_norm": 1.306921124458313, + "learning_rate": 4.161486967901862e-08, + "loss": 0.6584, + "step": 4118 + }, + { + "epoch": 0.9917534460964305, + "grad_norm": 2.2372684478759766, + "learning_rate": 3.9395960984323076e-08, + "loss": 0.4433, + "step": 4119 + }, + { + "epoch": 0.9919942213928851, + "grad_norm": 1.8838356733322144, + "learning_rate": 3.723782856208669e-08, + "loss": 0.7484, + "step": 4120 + }, + { + "epoch": 0.9922349966893397, + "grad_norm": 0.9679247140884399, + "learning_rate": 3.5140473724482034e-08, + "loss": 0.1638, + "step": 4121 + }, + { + "epoch": 0.9924757719857943, + "grad_norm": 1.0013998746871948, + "learning_rate": 3.3103897746689097e-08, + "loss": 0.2034, + "step": 4122 + }, + { + "epoch": 0.9927165472822489, + "grad_norm": 1.7386034727096558, + "learning_rate": 3.11281018669507e-08, + "loss": 0.6798, + "step": 4123 + }, + { + "epoch": 0.9929573225787034, + "grad_norm": 1.7022814750671387, + "learning_rate": 2.921308728656147e-08, + "loss": 0.9177, + "step": 4124 + }, + { + "epoch": 0.993198097875158, + "grad_norm": 1.6356173753738403, + "learning_rate": 2.7358855169845598e-08, + "loss": 0.397, + "step": 4125 + }, + { + "epoch": 0.9934388731716126, + "grad_norm": 2.2541160583496094, + "learning_rate": 2.556540664419016e-08, + "loss": 0.3977, + "step": 4126 + }, + { + "epoch": 0.9936796484680672, + "grad_norm": 4.046707630157471, + "learning_rate": 2.38327428000118e-08, + "loss": 0.4311, + "step": 4127 + }, + { + "epoch": 0.9939204237645217, + "grad_norm": 3.0581321716308594, + "learning_rate": 2.216086469077894e-08, + "loss": 1.07, + "step": 4128 + }, + { + "epoch": 0.9941611990609763, + "grad_norm": 1.8278604745864868, + "learning_rate": 2.0549773332989575e-08, + "loss": 0.2437, + "step": 4129 + }, + { + "epoch": 0.9944019743574309, + "grad_norm": 1.3549528121948242, + "learning_rate": 1.8999469706193484e-08, + "loss": 0.3801, + "step": 4130 + }, + { + "epoch": 0.9946427496538855, + "grad_norm": 2.3834619522094727, + "learning_rate": 1.750995475299222e-08, + "loss": 0.6792, + "step": 4131 + }, + { + "epoch": 0.99488352495034, + "grad_norm": 2.10779070854187, + "learning_rate": 1.6081229378983598e-08, + "loss": 0.4779, + "step": 4132 + }, + { + "epoch": 0.9951243002467947, + "grad_norm": 3.692774534225464, + "learning_rate": 1.4713294452861626e-08, + "loss": 0.9011, + "step": 4133 + }, + { + "epoch": 0.9953650755432493, + "grad_norm": 1.1384838819503784, + "learning_rate": 1.3406150806327678e-08, + "loss": 0.4314, + "step": 4134 + }, + { + "epoch": 0.9956058508397039, + "grad_norm": 1.2060233354568481, + "learning_rate": 1.2159799234134905e-08, + "loss": 0.4898, + "step": 4135 + }, + { + "epoch": 0.9958466261361584, + "grad_norm": 2.4775209426879883, + "learning_rate": 1.097424049404383e-08, + "loss": 0.4144, + "step": 4136 + }, + { + "epoch": 0.996087401432613, + "grad_norm": 6.3312177658081055, + "learning_rate": 9.849475306900058e-09, + "loss": 0.9015, + "step": 4137 + }, + { + "epoch": 0.9963281767290676, + "grad_norm": 1.8564362525939941, + "learning_rate": 8.785504356556563e-09, + "loss": 0.3928, + "step": 4138 + }, + { + "epoch": 0.9965689520255222, + "grad_norm": 1.6765766143798828, + "learning_rate": 7.782328289906992e-09, + "loss": 1.0499, + "step": 4139 + }, + { + "epoch": 0.9968097273219768, + "grad_norm": 2.188523054122925, + "learning_rate": 6.839947716885675e-09, + "loss": 0.9049, + "step": 4140 + }, + { + "epoch": 0.9970505026184313, + "grad_norm": 0.7010088562965393, + "learning_rate": 5.95836321046761e-09, + "loss": 0.4464, + "step": 4141 + }, + { + "epoch": 0.9972912779148859, + "grad_norm": 1.611911416053772, + "learning_rate": 5.137575306646269e-09, + "loss": 0.3036, + "step": 4142 + }, + { + "epoch": 0.9975320532113405, + "grad_norm": 2.555997371673584, + "learning_rate": 4.377584504478005e-09, + "loss": 0.4531, + "step": 4143 + }, + { + "epoch": 0.9977728285077951, + "grad_norm": 1.4258787631988525, + "learning_rate": 3.6783912660265372e-09, + "loss": 0.4331, + "step": 4144 + }, + { + "epoch": 0.9980136038042496, + "grad_norm": 1.0815415382385254, + "learning_rate": 3.039996016407365e-09, + "loss": 0.3503, + "step": 4145 + }, + { + "epoch": 0.9982543791007042, + "grad_norm": 4.800786972045898, + "learning_rate": 2.4623991437766614e-09, + "loss": 0.6749, + "step": 4146 + }, + { + "epoch": 0.9984951543971589, + "grad_norm": 1.161253809928894, + "learning_rate": 1.9456009992979696e-09, + "loss": 0.5973, + "step": 4147 + }, + { + "epoch": 0.9987359296936135, + "grad_norm": 3.008234739303589, + "learning_rate": 1.48960189718661e-09, + "loss": 0.9873, + "step": 4148 + }, + { + "epoch": 0.998976704990068, + "grad_norm": 1.616468071937561, + "learning_rate": 1.0944021146985784e-09, + "loss": 0.5877, + "step": 4149 + }, + { + "epoch": 0.9992174802865226, + "grad_norm": 1.8020235300064087, + "learning_rate": 7.60001892119444e-10, + "loss": 0.6324, + "step": 4150 + }, + { + "epoch": 0.9994582555829772, + "grad_norm": 2.7583658695220947, + "learning_rate": 4.864014327532474e-10, + "loss": 0.8355, + "step": 4151 + }, + { + "epoch": 0.9996990308794318, + "grad_norm": 1.8322501182556152, + "learning_rate": 2.7360090296690846e-10, + "loss": 0.706, + "step": 4152 + }, + { + "epoch": 0.9999398061758864, + "grad_norm": 1.8930530548095703, + "learning_rate": 1.2160043212361417e-10, + "loss": 0.6687, + "step": 4153 + }, + { + "epoch": 1.0, + "grad_norm": 5.636228084564209, + "learning_rate": 3.0400112649431325e-11, + "loss": 0.9409, + "step": 4154 + }, + { + "epoch": 1.0, + "step": 4154, + "total_flos": 7.629858860247867e+17, + "train_loss": 0.8297393302427364, + "train_runtime": 10162.4154, + "train_samples_per_second": 3.269, + "train_steps_per_second": 0.409 + } + ], + "logging_steps": 1, + "max_steps": 4154, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 2400000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 7.629858860247867e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}