{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 5.0, "eval_steps": 500, "global_step": 19560, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.002556237218813906, "grad_norm": 86.6442642211914, "learning_rate": 4.601226993865031e-08, "loss": 4.7575, "step": 10 }, { "epoch": 0.005112474437627812, "grad_norm": 90.80802154541016, "learning_rate": 9.713701431492844e-08, "loss": 5.2498, "step": 20 }, { "epoch": 0.007668711656441718, "grad_norm": 79.19834899902344, "learning_rate": 1.4826175869120655e-07, "loss": 4.8172, "step": 30 }, { "epoch": 0.010224948875255624, "grad_norm": 93.67054748535156, "learning_rate": 1.9938650306748468e-07, "loss": 4.7357, "step": 40 }, { "epoch": 0.01278118609406953, "grad_norm": 84.51275634765625, "learning_rate": 2.505112474437628e-07, "loss": 4.5906, "step": 50 }, { "epoch": 0.015337423312883436, "grad_norm": 160.59327697753906, "learning_rate": 3.0163599182004093e-07, "loss": 4.4981, "step": 60 }, { "epoch": 0.01789366053169734, "grad_norm": 54.63095474243164, "learning_rate": 3.52760736196319e-07, "loss": 3.8254, "step": 70 }, { "epoch": 0.02044989775051125, "grad_norm": 56.51041793823242, "learning_rate": 4.038854805725972e-07, "loss": 3.2408, "step": 80 }, { "epoch": 0.023006134969325152, "grad_norm": 54.57204818725586, "learning_rate": 4.5501022494887533e-07, "loss": 3.1371, "step": 90 }, { "epoch": 0.02556237218813906, "grad_norm": 17.450857162475586, "learning_rate": 5.061349693251534e-07, "loss": 3.0649, "step": 100 }, { "epoch": 0.028118609406952964, "grad_norm": 14.947675704956055, "learning_rate": 5.572597137014316e-07, "loss": 2.6627, "step": 110 }, { "epoch": 0.03067484662576687, "grad_norm": 21.680395126342773, "learning_rate": 6.083844580777097e-07, "loss": 2.6963, "step": 120 }, { "epoch": 0.033231083844580775, "grad_norm": 14.042488098144531, "learning_rate": 6.595092024539878e-07, "loss": 2.7294, "step": 130 }, { "epoch": 0.03578732106339468, "grad_norm": 15.667428016662598, "learning_rate": 7.106339468302658e-07, "loss": 2.688, "step": 140 }, { "epoch": 0.03834355828220859, "grad_norm": 15.774909973144531, "learning_rate": 7.61758691206544e-07, "loss": 2.5642, "step": 150 }, { "epoch": 0.0408997955010225, "grad_norm": 17.561765670776367, "learning_rate": 8.128834355828222e-07, "loss": 2.621, "step": 160 }, { "epoch": 0.0434560327198364, "grad_norm": 22.855037689208984, "learning_rate": 8.640081799591003e-07, "loss": 2.5726, "step": 170 }, { "epoch": 0.046012269938650305, "grad_norm": 21.334442138671875, "learning_rate": 9.151329243353784e-07, "loss": 2.5354, "step": 180 }, { "epoch": 0.04856850715746421, "grad_norm": 16.87196159362793, "learning_rate": 9.662576687116565e-07, "loss": 2.5895, "step": 190 }, { "epoch": 0.05112474437627812, "grad_norm": 13.189834594726562, "learning_rate": 1.0173824130879346e-06, "loss": 2.5198, "step": 200 }, { "epoch": 0.05368098159509203, "grad_norm": 32.269676208496094, "learning_rate": 1.0685071574642128e-06, "loss": 2.4075, "step": 210 }, { "epoch": 0.05623721881390593, "grad_norm": 35.43799591064453, "learning_rate": 1.119631901840491e-06, "loss": 2.3543, "step": 220 }, { "epoch": 0.058793456032719835, "grad_norm": 15.105649948120117, "learning_rate": 1.170756646216769e-06, "loss": 2.4616, "step": 230 }, { "epoch": 0.06134969325153374, "grad_norm": 19.825071334838867, "learning_rate": 1.221881390593047e-06, "loss": 2.3646, "step": 240 }, { "epoch": 0.06390593047034765, "grad_norm": 14.074588775634766, "learning_rate": 1.2730061349693252e-06, "loss": 2.1941, "step": 250 }, { "epoch": 0.06646216768916155, "grad_norm": 21.8532772064209, "learning_rate": 1.3241308793456035e-06, "loss": 2.2606, "step": 260 }, { "epoch": 0.06901840490797546, "grad_norm": 22.797710418701172, "learning_rate": 1.3752556237218813e-06, "loss": 2.3052, "step": 270 }, { "epoch": 0.07157464212678936, "grad_norm": 19.603389739990234, "learning_rate": 1.4263803680981596e-06, "loss": 2.1969, "step": 280 }, { "epoch": 0.07413087934560328, "grad_norm": 17.005781173706055, "learning_rate": 1.4775051124744377e-06, "loss": 2.3629, "step": 290 }, { "epoch": 0.07668711656441718, "grad_norm": 15.876945495605469, "learning_rate": 1.5286298568507158e-06, "loss": 2.5044, "step": 300 }, { "epoch": 0.07924335378323108, "grad_norm": 17.874849319458008, "learning_rate": 1.579754601226994e-06, "loss": 2.145, "step": 310 }, { "epoch": 0.081799591002045, "grad_norm": 22.35564422607422, "learning_rate": 1.630879345603272e-06, "loss": 2.1845, "step": 320 }, { "epoch": 0.0843558282208589, "grad_norm": 16.786012649536133, "learning_rate": 1.6820040899795503e-06, "loss": 2.1084, "step": 330 }, { "epoch": 0.0869120654396728, "grad_norm": 28.834739685058594, "learning_rate": 1.7331288343558283e-06, "loss": 2.2875, "step": 340 }, { "epoch": 0.08946830265848671, "grad_norm": 14.567386627197266, "learning_rate": 1.7842535787321064e-06, "loss": 2.3375, "step": 350 }, { "epoch": 0.09202453987730061, "grad_norm": 31.9110164642334, "learning_rate": 1.8353783231083847e-06, "loss": 2.1919, "step": 360 }, { "epoch": 0.09458077709611452, "grad_norm": 14.252608299255371, "learning_rate": 1.8865030674846626e-06, "loss": 1.9781, "step": 370 }, { "epoch": 0.09713701431492842, "grad_norm": 24.873756408691406, "learning_rate": 1.937627811860941e-06, "loss": 2.3583, "step": 380 }, { "epoch": 0.09969325153374232, "grad_norm": 28.036256790161133, "learning_rate": 1.988752556237219e-06, "loss": 2.136, "step": 390 }, { "epoch": 0.10224948875255624, "grad_norm": 31.46503257751465, "learning_rate": 2.039877300613497e-06, "loss": 2.206, "step": 400 }, { "epoch": 0.10480572597137014, "grad_norm": 26.446863174438477, "learning_rate": 2.091002044989775e-06, "loss": 2.1021, "step": 410 }, { "epoch": 0.10736196319018405, "grad_norm": 14.209206581115723, "learning_rate": 2.142126789366053e-06, "loss": 2.058, "step": 420 }, { "epoch": 0.10991820040899795, "grad_norm": 16.250686645507812, "learning_rate": 2.1932515337423317e-06, "loss": 2.1551, "step": 430 }, { "epoch": 0.11247443762781185, "grad_norm": 14.691299438476562, "learning_rate": 2.24437627811861e-06, "loss": 1.9728, "step": 440 }, { "epoch": 0.11503067484662577, "grad_norm": 15.899717330932617, "learning_rate": 2.2955010224948875e-06, "loss": 1.9192, "step": 450 }, { "epoch": 0.11758691206543967, "grad_norm": 13.063642501831055, "learning_rate": 2.346625766871166e-06, "loss": 2.1218, "step": 460 }, { "epoch": 0.12014314928425358, "grad_norm": 13.922229766845703, "learning_rate": 2.397750511247444e-06, "loss": 1.8393, "step": 470 }, { "epoch": 0.12269938650306748, "grad_norm": 20.130887985229492, "learning_rate": 2.448875255623722e-06, "loss": 2.0518, "step": 480 }, { "epoch": 0.1252556237218814, "grad_norm": 20.779491424560547, "learning_rate": 2.5e-06, "loss": 2.0944, "step": 490 }, { "epoch": 0.1278118609406953, "grad_norm": 16.71235466003418, "learning_rate": 2.5511247443762783e-06, "loss": 1.9393, "step": 500 }, { "epoch": 0.1303680981595092, "grad_norm": 29.568498611450195, "learning_rate": 2.6022494887525564e-06, "loss": 2.05, "step": 510 }, { "epoch": 0.1329243353783231, "grad_norm": 13.172797203063965, "learning_rate": 2.653374233128835e-06, "loss": 2.1483, "step": 520 }, { "epoch": 0.13548057259713703, "grad_norm": 19.142757415771484, "learning_rate": 2.704498977505113e-06, "loss": 1.804, "step": 530 }, { "epoch": 0.13803680981595093, "grad_norm": 11.22604751586914, "learning_rate": 2.7556237218813906e-06, "loss": 1.8299, "step": 540 }, { "epoch": 0.14059304703476483, "grad_norm": 18.316205978393555, "learning_rate": 2.8067484662576687e-06, "loss": 2.0066, "step": 550 }, { "epoch": 0.14314928425357873, "grad_norm": 31.03098487854004, "learning_rate": 2.8578732106339468e-06, "loss": 2.1166, "step": 560 }, { "epoch": 0.14570552147239263, "grad_norm": 13.343217849731445, "learning_rate": 2.9089979550102253e-06, "loss": 1.9187, "step": 570 }, { "epoch": 0.14826175869120656, "grad_norm": 16.278240203857422, "learning_rate": 2.9601226993865034e-06, "loss": 1.8516, "step": 580 }, { "epoch": 0.15081799591002046, "grad_norm": 14.405373573303223, "learning_rate": 3.0112474437627814e-06, "loss": 1.8636, "step": 590 }, { "epoch": 0.15337423312883436, "grad_norm": 28.38698387145996, "learning_rate": 3.0623721881390595e-06, "loss": 1.897, "step": 600 }, { "epoch": 0.15593047034764826, "grad_norm": 15.13376522064209, "learning_rate": 3.1134969325153376e-06, "loss": 1.7026, "step": 610 }, { "epoch": 0.15848670756646216, "grad_norm": 21.627859115600586, "learning_rate": 3.164621676891616e-06, "loss": 1.7341, "step": 620 }, { "epoch": 0.16104294478527606, "grad_norm": 34.46842956542969, "learning_rate": 3.215746421267894e-06, "loss": 1.668, "step": 630 }, { "epoch": 0.16359918200409, "grad_norm": 15.768085479736328, "learning_rate": 3.266871165644172e-06, "loss": 1.5321, "step": 640 }, { "epoch": 0.1661554192229039, "grad_norm": 15.477701187133789, "learning_rate": 3.31799591002045e-06, "loss": 1.7124, "step": 650 }, { "epoch": 0.1687116564417178, "grad_norm": 14.961771011352539, "learning_rate": 3.369120654396728e-06, "loss": 1.7718, "step": 660 }, { "epoch": 0.1712678936605317, "grad_norm": 16.937185287475586, "learning_rate": 3.4202453987730065e-06, "loss": 1.7621, "step": 670 }, { "epoch": 0.1738241308793456, "grad_norm": 13.276283264160156, "learning_rate": 3.4713701431492846e-06, "loss": 1.8156, "step": 680 }, { "epoch": 0.17638036809815952, "grad_norm": 20.783721923828125, "learning_rate": 3.5224948875255627e-06, "loss": 1.8182, "step": 690 }, { "epoch": 0.17893660531697342, "grad_norm": 15.780200004577637, "learning_rate": 3.5736196319018408e-06, "loss": 1.7411, "step": 700 }, { "epoch": 0.18149284253578732, "grad_norm": 14.318155288696289, "learning_rate": 3.624744376278119e-06, "loss": 1.8282, "step": 710 }, { "epoch": 0.18404907975460122, "grad_norm": 17.099267959594727, "learning_rate": 3.6758691206543974e-06, "loss": 1.8697, "step": 720 }, { "epoch": 0.18660531697341512, "grad_norm": 24.751697540283203, "learning_rate": 3.7269938650306754e-06, "loss": 1.585, "step": 730 }, { "epoch": 0.18916155419222905, "grad_norm": 14.778061866760254, "learning_rate": 3.778118609406953e-06, "loss": 1.5203, "step": 740 }, { "epoch": 0.19171779141104295, "grad_norm": 11.545260429382324, "learning_rate": 3.829243353783232e-06, "loss": 1.85, "step": 750 }, { "epoch": 0.19427402862985685, "grad_norm": 15.764128684997559, "learning_rate": 3.880368098159509e-06, "loss": 1.7218, "step": 760 }, { "epoch": 0.19683026584867075, "grad_norm": 19.65780258178711, "learning_rate": 3.931492842535788e-06, "loss": 1.7144, "step": 770 }, { "epoch": 0.19938650306748465, "grad_norm": 11.911704063415527, "learning_rate": 3.982617586912066e-06, "loss": 1.4344, "step": 780 }, { "epoch": 0.20194274028629858, "grad_norm": 28.547578811645508, "learning_rate": 4.033742331288344e-06, "loss": 1.6595, "step": 790 }, { "epoch": 0.20449897750511248, "grad_norm": 21.17897605895996, "learning_rate": 4.084867075664622e-06, "loss": 1.5372, "step": 800 }, { "epoch": 0.20705521472392638, "grad_norm": 32.14210510253906, "learning_rate": 4.1359918200409e-06, "loss": 1.7575, "step": 810 }, { "epoch": 0.20961145194274028, "grad_norm": 16.999343872070312, "learning_rate": 4.187116564417179e-06, "loss": 1.4743, "step": 820 }, { "epoch": 0.21216768916155418, "grad_norm": 16.725637435913086, "learning_rate": 4.238241308793456e-06, "loss": 1.7326, "step": 830 }, { "epoch": 0.2147239263803681, "grad_norm": 17.674165725708008, "learning_rate": 4.289366053169735e-06, "loss": 1.7185, "step": 840 }, { "epoch": 0.217280163599182, "grad_norm": 14.320542335510254, "learning_rate": 4.3404907975460124e-06, "loss": 1.7974, "step": 850 }, { "epoch": 0.2198364008179959, "grad_norm": 15.723745346069336, "learning_rate": 4.391615541922291e-06, "loss": 1.475, "step": 860 }, { "epoch": 0.2223926380368098, "grad_norm": 17.50130844116211, "learning_rate": 4.4427402862985694e-06, "loss": 1.5188, "step": 870 }, { "epoch": 0.2249488752556237, "grad_norm": 20.599023818969727, "learning_rate": 4.493865030674847e-06, "loss": 1.3638, "step": 880 }, { "epoch": 0.22750511247443764, "grad_norm": 20.347301483154297, "learning_rate": 4.544989775051125e-06, "loss": 1.6373, "step": 890 }, { "epoch": 0.23006134969325154, "grad_norm": 16.79341697692871, "learning_rate": 4.596114519427403e-06, "loss": 1.6475, "step": 900 }, { "epoch": 0.23261758691206544, "grad_norm": 38.66135787963867, "learning_rate": 4.647239263803681e-06, "loss": 1.4448, "step": 910 }, { "epoch": 0.23517382413087934, "grad_norm": 39.84195327758789, "learning_rate": 4.6983640081799594e-06, "loss": 1.5184, "step": 920 }, { "epoch": 0.23773006134969324, "grad_norm": 13.964972496032715, "learning_rate": 4.749488752556238e-06, "loss": 1.506, "step": 930 }, { "epoch": 0.24028629856850717, "grad_norm": 41.46586608886719, "learning_rate": 4.800613496932516e-06, "loss": 1.552, "step": 940 }, { "epoch": 0.24284253578732107, "grad_norm": 14.640779495239258, "learning_rate": 4.851738241308794e-06, "loss": 1.3879, "step": 950 }, { "epoch": 0.24539877300613497, "grad_norm": 15.459932327270508, "learning_rate": 4.902862985685072e-06, "loss": 1.3674, "step": 960 }, { "epoch": 0.24795501022494887, "grad_norm": 15.347355842590332, "learning_rate": 4.95398773006135e-06, "loss": 1.1746, "step": 970 }, { "epoch": 0.2505112474437628, "grad_norm": 28.36712074279785, "learning_rate": 5.005112474437628e-06, "loss": 1.5726, "step": 980 }, { "epoch": 0.25306748466257667, "grad_norm": 19.360950469970703, "learning_rate": 5.0562372188139064e-06, "loss": 1.6336, "step": 990 }, { "epoch": 0.2556237218813906, "grad_norm": 14.269163131713867, "learning_rate": 5.107361963190185e-06, "loss": 1.4811, "step": 1000 }, { "epoch": 0.2581799591002045, "grad_norm": 13.799911499023438, "learning_rate": 5.158486707566463e-06, "loss": 1.5457, "step": 1010 }, { "epoch": 0.2607361963190184, "grad_norm": 14.226543426513672, "learning_rate": 5.209611451942741e-06, "loss": 1.292, "step": 1020 }, { "epoch": 0.2632924335378323, "grad_norm": 17.43229103088379, "learning_rate": 5.260736196319019e-06, "loss": 1.4887, "step": 1030 }, { "epoch": 0.2658486707566462, "grad_norm": 14.904557228088379, "learning_rate": 5.311860940695297e-06, "loss": 1.4825, "step": 1040 }, { "epoch": 0.2684049079754601, "grad_norm": 14.981825828552246, "learning_rate": 5.362985685071576e-06, "loss": 1.3343, "step": 1050 }, { "epoch": 0.27096114519427406, "grad_norm": 17.142181396484375, "learning_rate": 5.4141104294478534e-06, "loss": 1.5909, "step": 1060 }, { "epoch": 0.27351738241308793, "grad_norm": 25.113466262817383, "learning_rate": 5.465235173824132e-06, "loss": 1.4104, "step": 1070 }, { "epoch": 0.27607361963190186, "grad_norm": 15.018096923828125, "learning_rate": 5.516359918200409e-06, "loss": 1.6476, "step": 1080 }, { "epoch": 0.27862985685071573, "grad_norm": 20.576635360717773, "learning_rate": 5.567484662576687e-06, "loss": 1.402, "step": 1090 }, { "epoch": 0.28118609406952966, "grad_norm": 22.463077545166016, "learning_rate": 5.618609406952967e-06, "loss": 1.4635, "step": 1100 }, { "epoch": 0.2837423312883436, "grad_norm": 24.64898109436035, "learning_rate": 5.669734151329243e-06, "loss": 1.4447, "step": 1110 }, { "epoch": 0.28629856850715746, "grad_norm": 13.527003288269043, "learning_rate": 5.720858895705522e-06, "loss": 1.3098, "step": 1120 }, { "epoch": 0.2888548057259714, "grad_norm": 28.426868438720703, "learning_rate": 5.7719836400817996e-06, "loss": 1.2572, "step": 1130 }, { "epoch": 0.29141104294478526, "grad_norm": 17.067176818847656, "learning_rate": 5.823108384458078e-06, "loss": 1.1472, "step": 1140 }, { "epoch": 0.2939672801635992, "grad_norm": 13.91565990447998, "learning_rate": 5.874233128834357e-06, "loss": 1.4355, "step": 1150 }, { "epoch": 0.2965235173824131, "grad_norm": 19.00754165649414, "learning_rate": 5.925357873210634e-06, "loss": 1.4044, "step": 1160 }, { "epoch": 0.299079754601227, "grad_norm": 23.37032127380371, "learning_rate": 5.976482617586913e-06, "loss": 1.0234, "step": 1170 }, { "epoch": 0.3016359918200409, "grad_norm": 16.511402130126953, "learning_rate": 6.02760736196319e-06, "loss": 1.3716, "step": 1180 }, { "epoch": 0.3041922290388548, "grad_norm": 27.51507568359375, "learning_rate": 6.078732106339469e-06, "loss": 1.19, "step": 1190 }, { "epoch": 0.3067484662576687, "grad_norm": 9.414546012878418, "learning_rate": 6.129856850715747e-06, "loss": 1.1535, "step": 1200 }, { "epoch": 0.30930470347648265, "grad_norm": 11.353070259094238, "learning_rate": 6.180981595092025e-06, "loss": 1.2848, "step": 1210 }, { "epoch": 0.3118609406952965, "grad_norm": 13.55284595489502, "learning_rate": 6.232106339468304e-06, "loss": 1.1883, "step": 1220 }, { "epoch": 0.31441717791411045, "grad_norm": 22.225780487060547, "learning_rate": 6.283231083844581e-06, "loss": 1.5084, "step": 1230 }, { "epoch": 0.3169734151329243, "grad_norm": 31.59255027770996, "learning_rate": 6.33435582822086e-06, "loss": 1.4056, "step": 1240 }, { "epoch": 0.31952965235173825, "grad_norm": 37.905452728271484, "learning_rate": 6.385480572597138e-06, "loss": 1.5661, "step": 1250 }, { "epoch": 0.3220858895705521, "grad_norm": 18.418148040771484, "learning_rate": 6.436605316973416e-06, "loss": 1.4067, "step": 1260 }, { "epoch": 0.32464212678936605, "grad_norm": 14.485339164733887, "learning_rate": 6.487730061349694e-06, "loss": 1.0165, "step": 1270 }, { "epoch": 0.32719836400818, "grad_norm": 30.809120178222656, "learning_rate": 6.538854805725971e-06, "loss": 1.2704, "step": 1280 }, { "epoch": 0.32975460122699385, "grad_norm": 19.595361709594727, "learning_rate": 6.58997955010225e-06, "loss": 1.2991, "step": 1290 }, { "epoch": 0.3323108384458078, "grad_norm": 10.220748901367188, "learning_rate": 6.641104294478529e-06, "loss": 1.1867, "step": 1300 }, { "epoch": 0.33486707566462165, "grad_norm": 12.308537483215332, "learning_rate": 6.692229038854806e-06, "loss": 1.2807, "step": 1310 }, { "epoch": 0.3374233128834356, "grad_norm": 21.010303497314453, "learning_rate": 6.743353783231084e-06, "loss": 1.1327, "step": 1320 }, { "epoch": 0.3399795501022495, "grad_norm": 13.279821395874023, "learning_rate": 6.794478527607362e-06, "loss": 1.113, "step": 1330 }, { "epoch": 0.3425357873210634, "grad_norm": 16.616683959960938, "learning_rate": 6.8456032719836406e-06, "loss": 1.2778, "step": 1340 }, { "epoch": 0.3450920245398773, "grad_norm": 14.577863693237305, "learning_rate": 6.896728016359919e-06, "loss": 1.1638, "step": 1350 }, { "epoch": 0.3476482617586912, "grad_norm": 19.595500946044922, "learning_rate": 6.947852760736197e-06, "loss": 1.374, "step": 1360 }, { "epoch": 0.3502044989775051, "grad_norm": 19.486886978149414, "learning_rate": 6.998977505112475e-06, "loss": 1.2494, "step": 1370 }, { "epoch": 0.35276073619631904, "grad_norm": 15.864728927612305, "learning_rate": 7.050102249488753e-06, "loss": 1.2167, "step": 1380 }, { "epoch": 0.3553169734151329, "grad_norm": 11.7051362991333, "learning_rate": 7.101226993865031e-06, "loss": 1.2047, "step": 1390 }, { "epoch": 0.35787321063394684, "grad_norm": 27.072895050048828, "learning_rate": 7.15235173824131e-06, "loss": 1.1739, "step": 1400 }, { "epoch": 0.3604294478527607, "grad_norm": 13.395477294921875, "learning_rate": 7.2034764826175876e-06, "loss": 1.4076, "step": 1410 }, { "epoch": 0.36298568507157464, "grad_norm": 11.141236305236816, "learning_rate": 7.254601226993866e-06, "loss": 1.2486, "step": 1420 }, { "epoch": 0.36554192229038857, "grad_norm": 29.229612350463867, "learning_rate": 7.305725971370144e-06, "loss": 1.2913, "step": 1430 }, { "epoch": 0.36809815950920244, "grad_norm": 13.788121223449707, "learning_rate": 7.356850715746422e-06, "loss": 1.2888, "step": 1440 }, { "epoch": 0.37065439672801637, "grad_norm": 22.21321678161621, "learning_rate": 7.407975460122701e-06, "loss": 1.2711, "step": 1450 }, { "epoch": 0.37321063394683024, "grad_norm": 15.443243980407715, "learning_rate": 7.459100204498978e-06, "loss": 1.0954, "step": 1460 }, { "epoch": 0.37576687116564417, "grad_norm": 16.390304565429688, "learning_rate": 7.510224948875257e-06, "loss": 1.267, "step": 1470 }, { "epoch": 0.3783231083844581, "grad_norm": 25.774921417236328, "learning_rate": 7.561349693251534e-06, "loss": 1.4799, "step": 1480 }, { "epoch": 0.38087934560327197, "grad_norm": 12.54340648651123, "learning_rate": 7.612474437627812e-06, "loss": 1.3772, "step": 1490 }, { "epoch": 0.3834355828220859, "grad_norm": 22.544086456298828, "learning_rate": 7.663599182004092e-06, "loss": 1.0647, "step": 1500 }, { "epoch": 0.38599182004089977, "grad_norm": 18.576513290405273, "learning_rate": 7.714723926380368e-06, "loss": 1.0959, "step": 1510 }, { "epoch": 0.3885480572597137, "grad_norm": 29.345508575439453, "learning_rate": 7.765848670756647e-06, "loss": 0.7637, "step": 1520 }, { "epoch": 0.3911042944785276, "grad_norm": 17.49864387512207, "learning_rate": 7.816973415132925e-06, "loss": 1.0717, "step": 1530 }, { "epoch": 0.3936605316973415, "grad_norm": 16.895015716552734, "learning_rate": 7.868098159509204e-06, "loss": 1.1233, "step": 1540 }, { "epoch": 0.3962167689161554, "grad_norm": 13.488862991333008, "learning_rate": 7.919222903885482e-06, "loss": 1.2759, "step": 1550 }, { "epoch": 0.3987730061349693, "grad_norm": 32.994239807128906, "learning_rate": 7.97034764826176e-06, "loss": 1.3437, "step": 1560 }, { "epoch": 0.4013292433537832, "grad_norm": 16.069793701171875, "learning_rate": 8.021472392638038e-06, "loss": 1.2101, "step": 1570 }, { "epoch": 0.40388548057259716, "grad_norm": 12.347966194152832, "learning_rate": 8.072597137014315e-06, "loss": 0.8976, "step": 1580 }, { "epoch": 0.40644171779141103, "grad_norm": 19.300352096557617, "learning_rate": 8.123721881390593e-06, "loss": 1.0809, "step": 1590 }, { "epoch": 0.40899795501022496, "grad_norm": 18.28475570678711, "learning_rate": 8.174846625766872e-06, "loss": 1.1435, "step": 1600 }, { "epoch": 0.41155419222903883, "grad_norm": 32.143680572509766, "learning_rate": 8.22597137014315e-06, "loss": 1.1703, "step": 1610 }, { "epoch": 0.41411042944785276, "grad_norm": 16.422698974609375, "learning_rate": 8.277096114519429e-06, "loss": 0.8745, "step": 1620 }, { "epoch": 0.4166666666666667, "grad_norm": 29.04837989807129, "learning_rate": 8.328220858895705e-06, "loss": 0.929, "step": 1630 }, { "epoch": 0.41922290388548056, "grad_norm": 18.266582489013672, "learning_rate": 8.379345603271984e-06, "loss": 1.0091, "step": 1640 }, { "epoch": 0.4217791411042945, "grad_norm": 15.355749130249023, "learning_rate": 8.430470347648262e-06, "loss": 0.9601, "step": 1650 }, { "epoch": 0.42433537832310836, "grad_norm": 11.973981857299805, "learning_rate": 8.481595092024541e-06, "loss": 1.0835, "step": 1660 }, { "epoch": 0.4268916155419223, "grad_norm": 17.572921752929688, "learning_rate": 8.53271983640082e-06, "loss": 1.0341, "step": 1670 }, { "epoch": 0.4294478527607362, "grad_norm": 13.629963874816895, "learning_rate": 8.583844580777096e-06, "loss": 1.1951, "step": 1680 }, { "epoch": 0.4320040899795501, "grad_norm": 10.527235984802246, "learning_rate": 8.634969325153375e-06, "loss": 1.0702, "step": 1690 }, { "epoch": 0.434560327198364, "grad_norm": 17.04031753540039, "learning_rate": 8.686094069529653e-06, "loss": 1.2097, "step": 1700 }, { "epoch": 0.4371165644171779, "grad_norm": 11.430649757385254, "learning_rate": 8.737218813905932e-06, "loss": 0.7584, "step": 1710 }, { "epoch": 0.4396728016359918, "grad_norm": 10.45757007598877, "learning_rate": 8.78834355828221e-06, "loss": 1.1054, "step": 1720 }, { "epoch": 0.44222903885480574, "grad_norm": 17.184608459472656, "learning_rate": 8.839468302658487e-06, "loss": 0.8942, "step": 1730 }, { "epoch": 0.4447852760736196, "grad_norm": 11.653769493103027, "learning_rate": 8.890593047034766e-06, "loss": 1.2842, "step": 1740 }, { "epoch": 0.44734151329243355, "grad_norm": 17.205242156982422, "learning_rate": 8.941717791411042e-06, "loss": 0.9786, "step": 1750 }, { "epoch": 0.4498977505112474, "grad_norm": 27.1918888092041, "learning_rate": 8.992842535787321e-06, "loss": 1.2404, "step": 1760 }, { "epoch": 0.45245398773006135, "grad_norm": 14.006787300109863, "learning_rate": 9.043967280163601e-06, "loss": 1.0146, "step": 1770 }, { "epoch": 0.4550102249488753, "grad_norm": 21.269569396972656, "learning_rate": 9.095092024539878e-06, "loss": 0.9494, "step": 1780 }, { "epoch": 0.45756646216768915, "grad_norm": 13.8292236328125, "learning_rate": 9.146216768916156e-06, "loss": 1.2056, "step": 1790 }, { "epoch": 0.4601226993865031, "grad_norm": 11.28924560546875, "learning_rate": 9.197341513292433e-06, "loss": 1.1219, "step": 1800 }, { "epoch": 0.46267893660531695, "grad_norm": 24.358989715576172, "learning_rate": 9.248466257668712e-06, "loss": 1.0259, "step": 1810 }, { "epoch": 0.4652351738241309, "grad_norm": 16.623613357543945, "learning_rate": 9.29959100204499e-06, "loss": 0.8764, "step": 1820 }, { "epoch": 0.4677914110429448, "grad_norm": 11.915813446044922, "learning_rate": 9.350715746421269e-06, "loss": 0.9114, "step": 1830 }, { "epoch": 0.4703476482617587, "grad_norm": 14.000443458557129, "learning_rate": 9.401840490797547e-06, "loss": 0.9903, "step": 1840 }, { "epoch": 0.4729038854805726, "grad_norm": 9.23658561706543, "learning_rate": 9.452965235173824e-06, "loss": 1.1503, "step": 1850 }, { "epoch": 0.4754601226993865, "grad_norm": 14.627740859985352, "learning_rate": 9.504089979550103e-06, "loss": 1.0605, "step": 1860 }, { "epoch": 0.4780163599182004, "grad_norm": 13.077226638793945, "learning_rate": 9.555214723926381e-06, "loss": 0.9759, "step": 1870 }, { "epoch": 0.48057259713701433, "grad_norm": 9.975872993469238, "learning_rate": 9.60633946830266e-06, "loss": 0.9908, "step": 1880 }, { "epoch": 0.4831288343558282, "grad_norm": 15.750456809997559, "learning_rate": 9.657464212678938e-06, "loss": 1.0758, "step": 1890 }, { "epoch": 0.48568507157464214, "grad_norm": 10.907366752624512, "learning_rate": 9.708588957055215e-06, "loss": 0.8757, "step": 1900 }, { "epoch": 0.488241308793456, "grad_norm": 26.87792205810547, "learning_rate": 9.759713701431493e-06, "loss": 0.9745, "step": 1910 }, { "epoch": 0.49079754601226994, "grad_norm": 10.880130767822266, "learning_rate": 9.810838445807772e-06, "loss": 0.9391, "step": 1920 }, { "epoch": 0.49335378323108386, "grad_norm": 19.826669692993164, "learning_rate": 9.86196319018405e-06, "loss": 1.1917, "step": 1930 }, { "epoch": 0.49591002044989774, "grad_norm": 11.035025596618652, "learning_rate": 9.913087934560329e-06, "loss": 0.7836, "step": 1940 }, { "epoch": 0.49846625766871167, "grad_norm": 13.407333374023438, "learning_rate": 9.964212678936606e-06, "loss": 1.1624, "step": 1950 }, { "epoch": 0.5010224948875256, "grad_norm": 18.5594482421875, "learning_rate": 9.999999283428496e-06, "loss": 1.0359, "step": 1960 }, { "epoch": 0.5035787321063395, "grad_norm": 30.378826141357422, "learning_rate": 9.999986544385255e-06, "loss": 0.9342, "step": 1970 }, { "epoch": 0.5061349693251533, "grad_norm": 26.27793312072754, "learning_rate": 9.99995788157752e-06, "loss": 0.7684, "step": 1980 }, { "epoch": 0.5086912065439673, "grad_norm": 17.525869369506836, "learning_rate": 9.999913295096573e-06, "loss": 1.2072, "step": 1990 }, { "epoch": 0.5112474437627812, "grad_norm": 19.318090438842773, "learning_rate": 9.999852785084414e-06, "loss": 0.9006, "step": 2000 }, { "epoch": 0.5138036809815951, "grad_norm": 11.649446487426758, "learning_rate": 9.999776351733751e-06, "loss": 0.831, "step": 2010 }, { "epoch": 0.516359918200409, "grad_norm": 18.077003479003906, "learning_rate": 9.999683995288008e-06, "loss": 0.8372, "step": 2020 }, { "epoch": 0.5189161554192229, "grad_norm": 24.69324493408203, "learning_rate": 9.999575716041316e-06, "loss": 1.0961, "step": 2030 }, { "epoch": 0.5214723926380368, "grad_norm": 10.308004379272461, "learning_rate": 9.99945151433852e-06, "loss": 1.0896, "step": 2040 }, { "epoch": 0.5240286298568507, "grad_norm": 14.579326629638672, "learning_rate": 9.99931139057517e-06, "loss": 0.8101, "step": 2050 }, { "epoch": 0.5265848670756647, "grad_norm": 19.19144630432129, "learning_rate": 9.999155345197531e-06, "loss": 0.9718, "step": 2060 }, { "epoch": 0.5291411042944786, "grad_norm": 14.424161911010742, "learning_rate": 9.99898337870257e-06, "loss": 1.1082, "step": 2070 }, { "epoch": 0.5316973415132924, "grad_norm": 11.568525314331055, "learning_rate": 9.998795491637956e-06, "loss": 0.9928, "step": 2080 }, { "epoch": 0.5342535787321063, "grad_norm": 28.195453643798828, "learning_rate": 9.998591684602065e-06, "loss": 0.967, "step": 2090 }, { "epoch": 0.5368098159509203, "grad_norm": 11.809616088867188, "learning_rate": 9.998371958243977e-06, "loss": 0.8879, "step": 2100 }, { "epoch": 0.5393660531697342, "grad_norm": 11.77135944366455, "learning_rate": 9.998136313263465e-06, "loss": 1.0883, "step": 2110 }, { "epoch": 0.5419222903885481, "grad_norm": 17.555498123168945, "learning_rate": 9.997884750411004e-06, "loss": 1.0922, "step": 2120 }, { "epoch": 0.5444785276073619, "grad_norm": 11.646632194519043, "learning_rate": 9.997617270487761e-06, "loss": 0.831, "step": 2130 }, { "epoch": 0.5470347648261759, "grad_norm": 11.330808639526367, "learning_rate": 9.997333874345594e-06, "loss": 1.1629, "step": 2140 }, { "epoch": 0.5495910020449898, "grad_norm": 12.656023979187012, "learning_rate": 9.997034562887054e-06, "loss": 1.1112, "step": 2150 }, { "epoch": 0.5521472392638037, "grad_norm": 10.297701835632324, "learning_rate": 9.996719337065376e-06, "loss": 0.9942, "step": 2160 }, { "epoch": 0.5547034764826176, "grad_norm": 20.408578872680664, "learning_rate": 9.99638819788448e-06, "loss": 0.7756, "step": 2170 }, { "epoch": 0.5572597137014315, "grad_norm": 13.656134605407715, "learning_rate": 9.996041146398963e-06, "loss": 1.2323, "step": 2180 }, { "epoch": 0.5598159509202454, "grad_norm": 10.573500633239746, "learning_rate": 9.995678183714104e-06, "loss": 0.9494, "step": 2190 }, { "epoch": 0.5623721881390593, "grad_norm": 30.932117462158203, "learning_rate": 9.99529931098585e-06, "loss": 0.9215, "step": 2200 }, { "epoch": 0.5649284253578732, "grad_norm": 12.926258087158203, "learning_rate": 9.994904529420824e-06, "loss": 1.151, "step": 2210 }, { "epoch": 0.5674846625766872, "grad_norm": 9.75345516204834, "learning_rate": 9.994493840276308e-06, "loss": 1.0613, "step": 2220 }, { "epoch": 0.570040899795501, "grad_norm": 15.309710502624512, "learning_rate": 9.99406724486025e-06, "loss": 1.1024, "step": 2230 }, { "epoch": 0.5725971370143149, "grad_norm": 13.060432434082031, "learning_rate": 9.993624744531253e-06, "loss": 0.8317, "step": 2240 }, { "epoch": 0.5751533742331288, "grad_norm": 22.823984146118164, "learning_rate": 9.993166340698577e-06, "loss": 0.9703, "step": 2250 }, { "epoch": 0.5777096114519428, "grad_norm": 11.097712516784668, "learning_rate": 9.992692034822127e-06, "loss": 0.9237, "step": 2260 }, { "epoch": 0.5802658486707567, "grad_norm": 14.171446800231934, "learning_rate": 9.992201828412458e-06, "loss": 0.9436, "step": 2270 }, { "epoch": 0.5828220858895705, "grad_norm": 10.901077270507812, "learning_rate": 9.991695723030755e-06, "loss": 0.9086, "step": 2280 }, { "epoch": 0.5853783231083844, "grad_norm": 44.82511901855469, "learning_rate": 9.991173720288847e-06, "loss": 0.7686, "step": 2290 }, { "epoch": 0.5879345603271984, "grad_norm": 8.220059394836426, "learning_rate": 9.990635821849187e-06, "loss": 0.7624, "step": 2300 }, { "epoch": 0.5904907975460123, "grad_norm": 11.58703327178955, "learning_rate": 9.990082029424852e-06, "loss": 0.7953, "step": 2310 }, { "epoch": 0.5930470347648262, "grad_norm": 18.552797317504883, "learning_rate": 9.989512344779541e-06, "loss": 0.7791, "step": 2320 }, { "epoch": 0.59560327198364, "grad_norm": 16.435989379882812, "learning_rate": 9.988926769727563e-06, "loss": 1.1133, "step": 2330 }, { "epoch": 0.598159509202454, "grad_norm": 9.04973316192627, "learning_rate": 9.988325306133832e-06, "loss": 0.8, "step": 2340 }, { "epoch": 0.6007157464212679, "grad_norm": 9.818502426147461, "learning_rate": 9.987707955913873e-06, "loss": 0.7636, "step": 2350 }, { "epoch": 0.6032719836400818, "grad_norm": 8.12960147857666, "learning_rate": 9.98707472103379e-06, "loss": 0.7332, "step": 2360 }, { "epoch": 0.6058282208588958, "grad_norm": 14.352721214294434, "learning_rate": 9.986425603510292e-06, "loss": 0.7819, "step": 2370 }, { "epoch": 0.6083844580777096, "grad_norm": 6.8704986572265625, "learning_rate": 9.985760605410662e-06, "loss": 0.7691, "step": 2380 }, { "epoch": 0.6109406952965235, "grad_norm": 10.685389518737793, "learning_rate": 9.985079728852759e-06, "loss": 0.8252, "step": 2390 }, { "epoch": 0.6134969325153374, "grad_norm": 16.207923889160156, "learning_rate": 9.98438297600501e-06, "loss": 0.9821, "step": 2400 }, { "epoch": 0.6160531697341514, "grad_norm": 15.584657669067383, "learning_rate": 9.983670349086413e-06, "loss": 0.876, "step": 2410 }, { "epoch": 0.6186094069529653, "grad_norm": 15.134186744689941, "learning_rate": 9.982941850366513e-06, "loss": 0.5934, "step": 2420 }, { "epoch": 0.6211656441717791, "grad_norm": 28.123193740844727, "learning_rate": 9.982197482165398e-06, "loss": 0.7742, "step": 2430 }, { "epoch": 0.623721881390593, "grad_norm": 17.409650802612305, "learning_rate": 9.981437246853712e-06, "loss": 0.7065, "step": 2440 }, { "epoch": 0.626278118609407, "grad_norm": 13.156755447387695, "learning_rate": 9.980661146852619e-06, "loss": 0.6499, "step": 2450 }, { "epoch": 0.6288343558282209, "grad_norm": 20.250652313232422, "learning_rate": 9.979869184633812e-06, "loss": 0.7821, "step": 2460 }, { "epoch": 0.6313905930470347, "grad_norm": 52.275699615478516, "learning_rate": 9.979061362719502e-06, "loss": 0.8, "step": 2470 }, { "epoch": 0.6339468302658486, "grad_norm": 10.591206550598145, "learning_rate": 9.97823768368241e-06, "loss": 1.0135, "step": 2480 }, { "epoch": 0.6365030674846626, "grad_norm": 20.04345703125, "learning_rate": 9.977398150145758e-06, "loss": 0.9202, "step": 2490 }, { "epoch": 0.6390593047034765, "grad_norm": 15.350805282592773, "learning_rate": 9.976542764783256e-06, "loss": 1.0958, "step": 2500 }, { "epoch": 0.6416155419222904, "grad_norm": 10.294832229614258, "learning_rate": 9.97567153031911e-06, "loss": 0.9347, "step": 2510 }, { "epoch": 0.6441717791411042, "grad_norm": 18.00196075439453, "learning_rate": 9.974784449527984e-06, "loss": 0.776, "step": 2520 }, { "epoch": 0.6467280163599182, "grad_norm": 15.802022933959961, "learning_rate": 9.973881525235028e-06, "loss": 0.7016, "step": 2530 }, { "epoch": 0.6492842535787321, "grad_norm": 16.474000930786133, "learning_rate": 9.972962760315834e-06, "loss": 0.9632, "step": 2540 }, { "epoch": 0.651840490797546, "grad_norm": 20.025535583496094, "learning_rate": 9.972028157696452e-06, "loss": 0.9582, "step": 2550 }, { "epoch": 0.65439672801636, "grad_norm": 20.044818878173828, "learning_rate": 9.971077720353368e-06, "loss": 0.8913, "step": 2560 }, { "epoch": 0.6569529652351738, "grad_norm": 10.750015258789062, "learning_rate": 9.970111451313498e-06, "loss": 0.9251, "step": 2570 }, { "epoch": 0.6595092024539877, "grad_norm": 13.033714294433594, "learning_rate": 9.969129353654179e-06, "loss": 0.8761, "step": 2580 }, { "epoch": 0.6620654396728016, "grad_norm": 9.243477821350098, "learning_rate": 9.968131430503157e-06, "loss": 0.5353, "step": 2590 }, { "epoch": 0.6646216768916156, "grad_norm": 8.169621467590332, "learning_rate": 9.96711768503858e-06, "loss": 0.6617, "step": 2600 }, { "epoch": 0.6671779141104295, "grad_norm": 21.10552406311035, "learning_rate": 9.966088120488985e-06, "loss": 0.5695, "step": 2610 }, { "epoch": 0.6697341513292433, "grad_norm": 9.105271339416504, "learning_rate": 9.96504274013329e-06, "loss": 0.9342, "step": 2620 }, { "epoch": 0.6722903885480572, "grad_norm": 12.127760887145996, "learning_rate": 9.96398154730078e-06, "loss": 0.8841, "step": 2630 }, { "epoch": 0.6748466257668712, "grad_norm": 6.325476169586182, "learning_rate": 9.962904545371104e-06, "loss": 0.6288, "step": 2640 }, { "epoch": 0.6774028629856851, "grad_norm": 36.65105438232422, "learning_rate": 9.961811737774256e-06, "loss": 0.7858, "step": 2650 }, { "epoch": 0.679959100204499, "grad_norm": 12.881020545959473, "learning_rate": 9.960703127990564e-06, "loss": 0.6614, "step": 2660 }, { "epoch": 0.6825153374233128, "grad_norm": 9.100659370422363, "learning_rate": 9.959578719550689e-06, "loss": 0.753, "step": 2670 }, { "epoch": 0.6850715746421268, "grad_norm": 6.299210071563721, "learning_rate": 9.958438516035604e-06, "loss": 0.7298, "step": 2680 }, { "epoch": 0.6876278118609407, "grad_norm": 10.514267921447754, "learning_rate": 9.957282521076583e-06, "loss": 0.7337, "step": 2690 }, { "epoch": 0.6901840490797546, "grad_norm": 6.144178867340088, "learning_rate": 9.956110738355197e-06, "loss": 0.7576, "step": 2700 }, { "epoch": 0.6927402862985685, "grad_norm": 7.862902641296387, "learning_rate": 9.95492317160329e-06, "loss": 0.8132, "step": 2710 }, { "epoch": 0.6952965235173824, "grad_norm": 15.029640197753906, "learning_rate": 9.953719824602982e-06, "loss": 0.7462, "step": 2720 }, { "epoch": 0.6978527607361963, "grad_norm": 13.379220008850098, "learning_rate": 9.952500701186649e-06, "loss": 0.4353, "step": 2730 }, { "epoch": 0.7004089979550102, "grad_norm": 8.90844440460205, "learning_rate": 9.951265805236903e-06, "loss": 0.6655, "step": 2740 }, { "epoch": 0.7029652351738241, "grad_norm": 14.42451000213623, "learning_rate": 9.950015140686595e-06, "loss": 0.6928, "step": 2750 }, { "epoch": 0.7055214723926381, "grad_norm": 9.552287101745605, "learning_rate": 9.948748711518792e-06, "loss": 0.5294, "step": 2760 }, { "epoch": 0.7080777096114519, "grad_norm": 12.426175117492676, "learning_rate": 9.947466521766772e-06, "loss": 0.7148, "step": 2770 }, { "epoch": 0.7106339468302658, "grad_norm": 16.0783748626709, "learning_rate": 9.946168575514e-06, "loss": 0.6684, "step": 2780 }, { "epoch": 0.7131901840490797, "grad_norm": 10.560613632202148, "learning_rate": 9.94485487689413e-06, "loss": 0.7561, "step": 2790 }, { "epoch": 0.7157464212678937, "grad_norm": 13.276518821716309, "learning_rate": 9.943525430090973e-06, "loss": 0.5811, "step": 2800 }, { "epoch": 0.7183026584867076, "grad_norm": 13.999181747436523, "learning_rate": 9.942180239338503e-06, "loss": 0.5591, "step": 2810 }, { "epoch": 0.7208588957055214, "grad_norm": 12.428943634033203, "learning_rate": 9.940819308920832e-06, "loss": 0.7026, "step": 2820 }, { "epoch": 0.7234151329243353, "grad_norm": 7.707891941070557, "learning_rate": 9.939442643172197e-06, "loss": 0.7179, "step": 2830 }, { "epoch": 0.7259713701431493, "grad_norm": 7.399072170257568, "learning_rate": 9.93805024647695e-06, "loss": 0.664, "step": 2840 }, { "epoch": 0.7285276073619632, "grad_norm": 23.526582717895508, "learning_rate": 9.936642123269546e-06, "loss": 0.7611, "step": 2850 }, { "epoch": 0.7310838445807771, "grad_norm": 9.424376487731934, "learning_rate": 9.93521827803452e-06, "loss": 0.7113, "step": 2860 }, { "epoch": 0.733640081799591, "grad_norm": 13.683032989501953, "learning_rate": 9.933778715306474e-06, "loss": 0.4565, "step": 2870 }, { "epoch": 0.7361963190184049, "grad_norm": 8.428793907165527, "learning_rate": 9.932323439670079e-06, "loss": 0.6818, "step": 2880 }, { "epoch": 0.7387525562372188, "grad_norm": 20.064414978027344, "learning_rate": 9.930852455760039e-06, "loss": 0.6954, "step": 2890 }, { "epoch": 0.7413087934560327, "grad_norm": 12.071993827819824, "learning_rate": 9.929365768261085e-06, "loss": 0.8114, "step": 2900 }, { "epoch": 0.7438650306748467, "grad_norm": 10.930386543273926, "learning_rate": 9.927863381907963e-06, "loss": 0.7282, "step": 2910 }, { "epoch": 0.7464212678936605, "grad_norm": 10.170836448669434, "learning_rate": 9.926345301485414e-06, "loss": 0.9321, "step": 2920 }, { "epoch": 0.7489775051124744, "grad_norm": 6.6626129150390625, "learning_rate": 9.924811531828164e-06, "loss": 0.7144, "step": 2930 }, { "epoch": 0.7515337423312883, "grad_norm": 8.486347198486328, "learning_rate": 9.923262077820903e-06, "loss": 0.5393, "step": 2940 }, { "epoch": 0.7540899795501023, "grad_norm": 12.877697944641113, "learning_rate": 9.921696944398274e-06, "loss": 0.4268, "step": 2950 }, { "epoch": 0.7566462167689162, "grad_norm": 11.594487190246582, "learning_rate": 9.920116136544849e-06, "loss": 0.5911, "step": 2960 }, { "epoch": 0.75920245398773, "grad_norm": 15.745911598205566, "learning_rate": 9.918519659295127e-06, "loss": 0.7711, "step": 2970 }, { "epoch": 0.7617586912065439, "grad_norm": 13.972307205200195, "learning_rate": 9.916907517733508e-06, "loss": 0.5574, "step": 2980 }, { "epoch": 0.7643149284253579, "grad_norm": 6.976569175720215, "learning_rate": 9.915279716994276e-06, "loss": 0.4998, "step": 2990 }, { "epoch": 0.7668711656441718, "grad_norm": 6.9776787757873535, "learning_rate": 9.913636262261592e-06, "loss": 0.4069, "step": 3000 }, { "epoch": 0.7694274028629857, "grad_norm": 12.110786437988281, "learning_rate": 9.911977158769461e-06, "loss": 0.6704, "step": 3010 }, { "epoch": 0.7719836400817995, "grad_norm": 6.544830799102783, "learning_rate": 9.910302411801738e-06, "loss": 0.5889, "step": 3020 }, { "epoch": 0.7745398773006135, "grad_norm": 8.968564987182617, "learning_rate": 9.90861202669209e-06, "loss": 0.4109, "step": 3030 }, { "epoch": 0.7770961145194274, "grad_norm": 16.600383758544922, "learning_rate": 9.906906008823989e-06, "loss": 0.9562, "step": 3040 }, { "epoch": 0.7796523517382413, "grad_norm": 21.926057815551758, "learning_rate": 9.905184363630698e-06, "loss": 0.5117, "step": 3050 }, { "epoch": 0.7822085889570553, "grad_norm": 13.331565856933594, "learning_rate": 9.903447096595245e-06, "loss": 0.5186, "step": 3060 }, { "epoch": 0.7847648261758691, "grad_norm": 10.782326698303223, "learning_rate": 9.90169421325041e-06, "loss": 0.5799, "step": 3070 }, { "epoch": 0.787321063394683, "grad_norm": 20.489850997924805, "learning_rate": 9.89992571917871e-06, "loss": 0.4487, "step": 3080 }, { "epoch": 0.7898773006134969, "grad_norm": 12.216683387756348, "learning_rate": 9.898141620012374e-06, "loss": 0.636, "step": 3090 }, { "epoch": 0.7924335378323109, "grad_norm": 8.060449600219727, "learning_rate": 9.896341921433337e-06, "loss": 0.6251, "step": 3100 }, { "epoch": 0.7949897750511248, "grad_norm": 5.005650997161865, "learning_rate": 9.894526629173204e-06, "loss": 0.6748, "step": 3110 }, { "epoch": 0.7975460122699386, "grad_norm": 11.046931266784668, "learning_rate": 9.892695749013253e-06, "loss": 0.599, "step": 3120 }, { "epoch": 0.8001022494887525, "grad_norm": 11.397811889648438, "learning_rate": 9.890849286784398e-06, "loss": 0.7874, "step": 3130 }, { "epoch": 0.8026584867075665, "grad_norm": 8.473251342773438, "learning_rate": 9.888987248367181e-06, "loss": 0.6328, "step": 3140 }, { "epoch": 0.8052147239263804, "grad_norm": 11.444445610046387, "learning_rate": 9.88710963969175e-06, "loss": 0.5749, "step": 3150 }, { "epoch": 0.8077709611451943, "grad_norm": 8.93635082244873, "learning_rate": 9.885216466737843e-06, "loss": 0.7803, "step": 3160 }, { "epoch": 0.8103271983640081, "grad_norm": 8.53089714050293, "learning_rate": 9.883307735534761e-06, "loss": 0.6362, "step": 3170 }, { "epoch": 0.8128834355828221, "grad_norm": 4.943642616271973, "learning_rate": 9.88138345216136e-06, "loss": 0.6297, "step": 3180 }, { "epoch": 0.815439672801636, "grad_norm": 10.993963241577148, "learning_rate": 9.87944362274602e-06, "loss": 0.4654, "step": 3190 }, { "epoch": 0.8179959100204499, "grad_norm": 20.30816650390625, "learning_rate": 9.87748825346664e-06, "loss": 0.5197, "step": 3200 }, { "epoch": 0.8205521472392638, "grad_norm": 10.663908004760742, "learning_rate": 9.875517350550601e-06, "loss": 0.6027, "step": 3210 }, { "epoch": 0.8231083844580777, "grad_norm": 7.101048469543457, "learning_rate": 9.873530920274761e-06, "loss": 0.5027, "step": 3220 }, { "epoch": 0.8256646216768916, "grad_norm": 16.21637725830078, "learning_rate": 9.871528968965426e-06, "loss": 0.6488, "step": 3230 }, { "epoch": 0.8282208588957055, "grad_norm": 11.160218238830566, "learning_rate": 9.86951150299833e-06, "loss": 0.6848, "step": 3240 }, { "epoch": 0.8307770961145194, "grad_norm": 7.589058876037598, "learning_rate": 9.867478528798625e-06, "loss": 0.3006, "step": 3250 }, { "epoch": 0.8333333333333334, "grad_norm": 13.101618766784668, "learning_rate": 9.865430052840849e-06, "loss": 0.6459, "step": 3260 }, { "epoch": 0.8358895705521472, "grad_norm": 6.775156021118164, "learning_rate": 9.863366081648907e-06, "loss": 0.5887, "step": 3270 }, { "epoch": 0.8384458077709611, "grad_norm": 14.762919425964355, "learning_rate": 9.861286621796056e-06, "loss": 0.4892, "step": 3280 }, { "epoch": 0.841002044989775, "grad_norm": 22.660533905029297, "learning_rate": 9.85919167990488e-06, "loss": 0.5762, "step": 3290 }, { "epoch": 0.843558282208589, "grad_norm": 12.753227233886719, "learning_rate": 9.857081262647269e-06, "loss": 0.6596, "step": 3300 }, { "epoch": 0.8461145194274029, "grad_norm": 14.134135246276855, "learning_rate": 9.854955376744397e-06, "loss": 0.5865, "step": 3310 }, { "epoch": 0.8486707566462167, "grad_norm": 7.306004047393799, "learning_rate": 9.852814028966706e-06, "loss": 0.5196, "step": 3320 }, { "epoch": 0.8512269938650306, "grad_norm": 12.3103609085083, "learning_rate": 9.850657226133878e-06, "loss": 0.605, "step": 3330 }, { "epoch": 0.8537832310838446, "grad_norm": 7.823228359222412, "learning_rate": 9.848484975114812e-06, "loss": 0.6368, "step": 3340 }, { "epoch": 0.8563394683026585, "grad_norm": 11.120277404785156, "learning_rate": 9.846297282827612e-06, "loss": 0.4841, "step": 3350 }, { "epoch": 0.8588957055214724, "grad_norm": 8.988906860351562, "learning_rate": 9.844094156239557e-06, "loss": 0.5918, "step": 3360 }, { "epoch": 0.8614519427402862, "grad_norm": 14.820247650146484, "learning_rate": 9.841875602367079e-06, "loss": 0.4307, "step": 3370 }, { "epoch": 0.8640081799591002, "grad_norm": 7.334587097167969, "learning_rate": 9.83964162827574e-06, "loss": 0.564, "step": 3380 }, { "epoch": 0.8665644171779141, "grad_norm": 11.864500999450684, "learning_rate": 9.837392241080218e-06, "loss": 0.5235, "step": 3390 }, { "epoch": 0.869120654396728, "grad_norm": 10.920977592468262, "learning_rate": 9.835127447944274e-06, "loss": 0.4475, "step": 3400 }, { "epoch": 0.871676891615542, "grad_norm": 8.427702903747559, "learning_rate": 9.832847256080734e-06, "loss": 0.5594, "step": 3410 }, { "epoch": 0.8742331288343558, "grad_norm": 9.778414726257324, "learning_rate": 9.830551672751463e-06, "loss": 0.6194, "step": 3420 }, { "epoch": 0.8767893660531697, "grad_norm": 8.027331352233887, "learning_rate": 9.82824070526735e-06, "loss": 0.5957, "step": 3430 }, { "epoch": 0.8793456032719836, "grad_norm": 6.331071376800537, "learning_rate": 9.825914360988271e-06, "loss": 0.5145, "step": 3440 }, { "epoch": 0.8819018404907976, "grad_norm": 8.607481956481934, "learning_rate": 9.82357264732308e-06, "loss": 0.5986, "step": 3450 }, { "epoch": 0.8844580777096115, "grad_norm": 6.551468849182129, "learning_rate": 9.821215571729578e-06, "loss": 0.5461, "step": 3460 }, { "epoch": 0.8870143149284253, "grad_norm": 6.835443496704102, "learning_rate": 9.818843141714486e-06, "loss": 0.7021, "step": 3470 }, { "epoch": 0.8895705521472392, "grad_norm": 7.249754428863525, "learning_rate": 9.81645536483343e-06, "loss": 0.6188, "step": 3480 }, { "epoch": 0.8921267893660532, "grad_norm": 7.487998962402344, "learning_rate": 9.814052248690906e-06, "loss": 0.4203, "step": 3490 }, { "epoch": 0.8946830265848671, "grad_norm": 17.97199821472168, "learning_rate": 9.81163380094027e-06, "loss": 0.4725, "step": 3500 }, { "epoch": 0.897239263803681, "grad_norm": 15.719616889953613, "learning_rate": 9.809200029283698e-06, "loss": 0.5723, "step": 3510 }, { "epoch": 0.8997955010224948, "grad_norm": 9.500740051269531, "learning_rate": 9.806750941472175e-06, "loss": 0.417, "step": 3520 }, { "epoch": 0.9023517382413088, "grad_norm": 7.425899505615234, "learning_rate": 9.804286545305456e-06, "loss": 0.4884, "step": 3530 }, { "epoch": 0.9049079754601227, "grad_norm": 8.523987770080566, "learning_rate": 9.801806848632062e-06, "loss": 0.4925, "step": 3540 }, { "epoch": 0.9074642126789366, "grad_norm": 13.769088745117188, "learning_rate": 9.799311859349235e-06, "loss": 0.3849, "step": 3550 }, { "epoch": 0.9100204498977505, "grad_norm": 7.716251850128174, "learning_rate": 9.796801585402913e-06, "loss": 0.4594, "step": 3560 }, { "epoch": 0.9125766871165644, "grad_norm": 10.922795295715332, "learning_rate": 9.79427603478773e-06, "loss": 0.4632, "step": 3570 }, { "epoch": 0.9151329243353783, "grad_norm": 8.93303108215332, "learning_rate": 9.791735215546953e-06, "loss": 0.453, "step": 3580 }, { "epoch": 0.9176891615541922, "grad_norm": 6.447891712188721, "learning_rate": 9.78917913577249e-06, "loss": 0.3284, "step": 3590 }, { "epoch": 0.9202453987730062, "grad_norm": 8.590970993041992, "learning_rate": 9.786607803604844e-06, "loss": 0.4445, "step": 3600 }, { "epoch": 0.9228016359918201, "grad_norm": 9.189178466796875, "learning_rate": 9.784021227233097e-06, "loss": 0.5768, "step": 3610 }, { "epoch": 0.9253578732106339, "grad_norm": 8.67251968383789, "learning_rate": 9.781419414894877e-06, "loss": 0.4507, "step": 3620 }, { "epoch": 0.9279141104294478, "grad_norm": 10.756339073181152, "learning_rate": 9.778802374876332e-06, "loss": 0.4278, "step": 3630 }, { "epoch": 0.9304703476482618, "grad_norm": 9.680365562438965, "learning_rate": 9.776170115512115e-06, "loss": 0.3831, "step": 3640 }, { "epoch": 0.9330265848670757, "grad_norm": 16.632375717163086, "learning_rate": 9.773522645185342e-06, "loss": 0.5033, "step": 3650 }, { "epoch": 0.9355828220858896, "grad_norm": 7.6330695152282715, "learning_rate": 9.770859972327575e-06, "loss": 0.3978, "step": 3660 }, { "epoch": 0.9381390593047034, "grad_norm": 8.260819435119629, "learning_rate": 9.768182105418791e-06, "loss": 0.5457, "step": 3670 }, { "epoch": 0.9406952965235174, "grad_norm": 18.994287490844727, "learning_rate": 9.765489052987357e-06, "loss": 0.5469, "step": 3680 }, { "epoch": 0.9432515337423313, "grad_norm": 8.636393547058105, "learning_rate": 9.762780823610006e-06, "loss": 0.4657, "step": 3690 }, { "epoch": 0.9458077709611452, "grad_norm": 16.197158813476562, "learning_rate": 9.760057425911797e-06, "loss": 0.3715, "step": 3700 }, { "epoch": 0.9483640081799591, "grad_norm": 28.646278381347656, "learning_rate": 9.757318868566107e-06, "loss": 0.3147, "step": 3710 }, { "epoch": 0.950920245398773, "grad_norm": 9.230977058410645, "learning_rate": 9.754565160294587e-06, "loss": 0.6337, "step": 3720 }, { "epoch": 0.9534764826175869, "grad_norm": 17.38115882873535, "learning_rate": 9.751796309867139e-06, "loss": 0.4393, "step": 3730 }, { "epoch": 0.9560327198364008, "grad_norm": 15.209970474243164, "learning_rate": 9.749012326101891e-06, "loss": 0.4759, "step": 3740 }, { "epoch": 0.9585889570552147, "grad_norm": 15.37113094329834, "learning_rate": 9.74621321786517e-06, "loss": 0.493, "step": 3750 }, { "epoch": 0.9611451942740287, "grad_norm": 9.076826095581055, "learning_rate": 9.743398994071467e-06, "loss": 0.2903, "step": 3760 }, { "epoch": 0.9637014314928425, "grad_norm": 6.899563312530518, "learning_rate": 9.740569663683413e-06, "loss": 0.3847, "step": 3770 }, { "epoch": 0.9662576687116564, "grad_norm": 14.622838973999023, "learning_rate": 9.73772523571175e-06, "loss": 0.3528, "step": 3780 }, { "epoch": 0.9688139059304703, "grad_norm": 11.762303352355957, "learning_rate": 9.734865719215303e-06, "loss": 0.4437, "step": 3790 }, { "epoch": 0.9713701431492843, "grad_norm": 11.108593940734863, "learning_rate": 9.73199112330095e-06, "loss": 0.2947, "step": 3800 }, { "epoch": 0.9739263803680982, "grad_norm": 7.895074367523193, "learning_rate": 9.729101457123593e-06, "loss": 0.4659, "step": 3810 }, { "epoch": 0.976482617586912, "grad_norm": 10.534423828125, "learning_rate": 9.72619672988613e-06, "loss": 0.5034, "step": 3820 }, { "epoch": 0.9790388548057259, "grad_norm": 6.145469665527344, "learning_rate": 9.723276950839425e-06, "loss": 0.4708, "step": 3830 }, { "epoch": 0.9815950920245399, "grad_norm": 5.333863258361816, "learning_rate": 9.720342129282277e-06, "loss": 0.5987, "step": 3840 }, { "epoch": 0.9841513292433538, "grad_norm": 11.559300422668457, "learning_rate": 9.717392274561392e-06, "loss": 0.5316, "step": 3850 }, { "epoch": 0.9867075664621677, "grad_norm": 7.202635288238525, "learning_rate": 9.714427396071354e-06, "loss": 0.3995, "step": 3860 }, { "epoch": 0.9892638036809815, "grad_norm": 9.292013168334961, "learning_rate": 9.711447503254595e-06, "loss": 0.5362, "step": 3870 }, { "epoch": 0.9918200408997955, "grad_norm": 15.875975608825684, "learning_rate": 9.708452605601361e-06, "loss": 0.3956, "step": 3880 }, { "epoch": 0.9943762781186094, "grad_norm": 5.166224002838135, "learning_rate": 9.705442712649688e-06, "loss": 0.4298, "step": 3890 }, { "epoch": 0.9969325153374233, "grad_norm": 28.647296905517578, "learning_rate": 9.702417833985367e-06, "loss": 0.5758, "step": 3900 }, { "epoch": 0.9994887525562373, "grad_norm": 7.455996990203857, "learning_rate": 9.699377979241915e-06, "loss": 0.4445, "step": 3910 }, { "epoch": 1.0020449897750512, "grad_norm": 8.313132286071777, "learning_rate": 9.696323158100543e-06, "loss": 0.3661, "step": 3920 }, { "epoch": 1.0046012269938651, "grad_norm": 2.6401190757751465, "learning_rate": 9.69325338029013e-06, "loss": 0.4446, "step": 3930 }, { "epoch": 1.007157464212679, "grad_norm": 8.16818904876709, "learning_rate": 9.690168655587184e-06, "loss": 0.298, "step": 3940 }, { "epoch": 1.0097137014314927, "grad_norm": 9.28429889678955, "learning_rate": 9.687068993815819e-06, "loss": 0.2262, "step": 3950 }, { "epoch": 1.0122699386503067, "grad_norm": 6.392743110656738, "learning_rate": 9.683954404847715e-06, "loss": 0.2432, "step": 3960 }, { "epoch": 1.0148261758691206, "grad_norm": 6.890766620635986, "learning_rate": 9.6808248986021e-06, "loss": 0.4461, "step": 3970 }, { "epoch": 1.0173824130879345, "grad_norm": 10.436578750610352, "learning_rate": 9.6776804850457e-06, "loss": 0.3529, "step": 3980 }, { "epoch": 1.0199386503067485, "grad_norm": 7.264800071716309, "learning_rate": 9.674521174192726e-06, "loss": 0.2966, "step": 3990 }, { "epoch": 1.0224948875255624, "grad_norm": 10.522168159484863, "learning_rate": 9.671346976104828e-06, "loss": 0.2953, "step": 4000 }, { "epoch": 1.0250511247443763, "grad_norm": 5.65585994720459, "learning_rate": 9.668157900891069e-06, "loss": 0.3308, "step": 4010 }, { "epoch": 1.0276073619631902, "grad_norm": 9.439372062683105, "learning_rate": 9.664953958707892e-06, "loss": 0.2545, "step": 4020 }, { "epoch": 1.0301635991820042, "grad_norm": 3.5625405311584473, "learning_rate": 9.661735159759093e-06, "loss": 0.2846, "step": 4030 }, { "epoch": 1.032719836400818, "grad_norm": 26.94212532043457, "learning_rate": 9.658501514295775e-06, "loss": 0.205, "step": 4040 }, { "epoch": 1.0352760736196318, "grad_norm": 11.873112678527832, "learning_rate": 9.655253032616327e-06, "loss": 0.3401, "step": 4050 }, { "epoch": 1.0378323108384457, "grad_norm": 7.584825038909912, "learning_rate": 9.651989725066393e-06, "loss": 0.2991, "step": 4060 }, { "epoch": 1.0403885480572597, "grad_norm": 7.558630466461182, "learning_rate": 9.648711602038823e-06, "loss": 0.3096, "step": 4070 }, { "epoch": 1.0429447852760736, "grad_norm": 24.522443771362305, "learning_rate": 9.64541867397366e-06, "loss": 0.4115, "step": 4080 }, { "epoch": 1.0455010224948875, "grad_norm": 5.4436354637146, "learning_rate": 9.642110951358097e-06, "loss": 0.2687, "step": 4090 }, { "epoch": 1.0480572597137015, "grad_norm": 9.708597183227539, "learning_rate": 9.638788444726437e-06, "loss": 0.2038, "step": 4100 }, { "epoch": 1.0506134969325154, "grad_norm": 5.303321361541748, "learning_rate": 9.635451164660073e-06, "loss": 0.3039, "step": 4110 }, { "epoch": 1.0531697341513293, "grad_norm": 7.557952404022217, "learning_rate": 9.632099121787445e-06, "loss": 0.3325, "step": 4120 }, { "epoch": 1.0557259713701432, "grad_norm": 5.638031005859375, "learning_rate": 9.628732326784014e-06, "loss": 0.3189, "step": 4130 }, { "epoch": 1.058282208588957, "grad_norm": 1.7007097005844116, "learning_rate": 9.625350790372214e-06, "loss": 0.3178, "step": 4140 }, { "epoch": 1.0608384458077709, "grad_norm": 8.193168640136719, "learning_rate": 9.621954523321434e-06, "loss": 0.307, "step": 4150 }, { "epoch": 1.0633946830265848, "grad_norm": 15.883909225463867, "learning_rate": 9.618543536447974e-06, "loss": 0.2642, "step": 4160 }, { "epoch": 1.0659509202453987, "grad_norm": 13.922346115112305, "learning_rate": 9.615117840615011e-06, "loss": 0.3466, "step": 4170 }, { "epoch": 1.0685071574642127, "grad_norm": 21.666532516479492, "learning_rate": 9.611677446732576e-06, "loss": 0.2475, "step": 4180 }, { "epoch": 1.0710633946830266, "grad_norm": 14.09211540222168, "learning_rate": 9.608222365757498e-06, "loss": 0.2698, "step": 4190 }, { "epoch": 1.0736196319018405, "grad_norm": 9.652295112609863, "learning_rate": 9.604752608693384e-06, "loss": 0.2477, "step": 4200 }, { "epoch": 1.0761758691206544, "grad_norm": 5.439416408538818, "learning_rate": 9.601268186590587e-06, "loss": 0.2024, "step": 4210 }, { "epoch": 1.0787321063394684, "grad_norm": 3.458691358566284, "learning_rate": 9.597769110546158e-06, "loss": 0.2974, "step": 4220 }, { "epoch": 1.0812883435582823, "grad_norm": 8.662911415100098, "learning_rate": 9.594255391703821e-06, "loss": 0.2053, "step": 4230 }, { "epoch": 1.0838445807770962, "grad_norm": 9.305736541748047, "learning_rate": 9.59072704125393e-06, "loss": 0.3785, "step": 4240 }, { "epoch": 1.08640081799591, "grad_norm": 8.057384490966797, "learning_rate": 9.587184070433442e-06, "loss": 0.239, "step": 4250 }, { "epoch": 1.0889570552147239, "grad_norm": 11.628586769104004, "learning_rate": 9.583626490525872e-06, "loss": 0.3451, "step": 4260 }, { "epoch": 1.0915132924335378, "grad_norm": 5.124874591827393, "learning_rate": 9.580054312861264e-06, "loss": 0.3267, "step": 4270 }, { "epoch": 1.0940695296523517, "grad_norm": 8.520767211914062, "learning_rate": 9.576467548816154e-06, "loss": 0.2843, "step": 4280 }, { "epoch": 1.0966257668711656, "grad_norm": 13.09350872039795, "learning_rate": 9.572866209813525e-06, "loss": 0.2522, "step": 4290 }, { "epoch": 1.0991820040899796, "grad_norm": 6.647915840148926, "learning_rate": 9.569250307322788e-06, "loss": 0.3104, "step": 4300 }, { "epoch": 1.1017382413087935, "grad_norm": 10.310320854187012, "learning_rate": 9.565619852859727e-06, "loss": 0.2137, "step": 4310 }, { "epoch": 1.1042944785276074, "grad_norm": 6.362160682678223, "learning_rate": 9.561974857986472e-06, "loss": 0.1824, "step": 4320 }, { "epoch": 1.1068507157464214, "grad_norm": 16.666887283325195, "learning_rate": 9.558315334311467e-06, "loss": 0.3631, "step": 4330 }, { "epoch": 1.109406952965235, "grad_norm": 2.7935502529144287, "learning_rate": 9.554641293489419e-06, "loss": 0.2915, "step": 4340 }, { "epoch": 1.111963190184049, "grad_norm": 15.494998931884766, "learning_rate": 9.55095274722127e-06, "loss": 0.2922, "step": 4350 }, { "epoch": 1.114519427402863, "grad_norm": 6.94740629196167, "learning_rate": 9.547249707254166e-06, "loss": 0.264, "step": 4360 }, { "epoch": 1.1170756646216768, "grad_norm": 7.18923807144165, "learning_rate": 9.543532185381397e-06, "loss": 0.3097, "step": 4370 }, { "epoch": 1.1196319018404908, "grad_norm": 10.083481788635254, "learning_rate": 9.53980019344239e-06, "loss": 0.2706, "step": 4380 }, { "epoch": 1.1221881390593047, "grad_norm": 7.783493995666504, "learning_rate": 9.53605374332265e-06, "loss": 0.1824, "step": 4390 }, { "epoch": 1.1247443762781186, "grad_norm": 10.747809410095215, "learning_rate": 9.532292846953723e-06, "loss": 0.3375, "step": 4400 }, { "epoch": 1.1273006134969326, "grad_norm": 11.694700241088867, "learning_rate": 9.528517516313167e-06, "loss": 0.2018, "step": 4410 }, { "epoch": 1.1298568507157465, "grad_norm": 6.256073474884033, "learning_rate": 9.524727763424513e-06, "loss": 0.1545, "step": 4420 }, { "epoch": 1.1324130879345604, "grad_norm": 6.233736991882324, "learning_rate": 9.520923600357217e-06, "loss": 0.2827, "step": 4430 }, { "epoch": 1.1349693251533743, "grad_norm": 8.213584899902344, "learning_rate": 9.517105039226632e-06, "loss": 0.315, "step": 4440 }, { "epoch": 1.137525562372188, "grad_norm": 12.951038360595703, "learning_rate": 9.513272092193965e-06, "loss": 0.2061, "step": 4450 }, { "epoch": 1.140081799591002, "grad_norm": 5.706482410430908, "learning_rate": 9.509424771466236e-06, "loss": 0.2526, "step": 4460 }, { "epoch": 1.142638036809816, "grad_norm": 6.124299049377441, "learning_rate": 9.505563089296246e-06, "loss": 0.3302, "step": 4470 }, { "epoch": 1.1451942740286298, "grad_norm": 11.08293342590332, "learning_rate": 9.501687057982531e-06, "loss": 0.2411, "step": 4480 }, { "epoch": 1.1477505112474438, "grad_norm": 8.393287658691406, "learning_rate": 9.497796689869324e-06, "loss": 0.3682, "step": 4490 }, { "epoch": 1.1503067484662577, "grad_norm": 0.49787667393684387, "learning_rate": 9.493891997346522e-06, "loss": 0.176, "step": 4500 }, { "epoch": 1.1528629856850716, "grad_norm": 6.434317588806152, "learning_rate": 9.489972992849641e-06, "loss": 0.2696, "step": 4510 }, { "epoch": 1.1554192229038855, "grad_norm": 8.729398727416992, "learning_rate": 9.486039688859772e-06, "loss": 0.2838, "step": 4520 }, { "epoch": 1.1579754601226995, "grad_norm": 9.446803092956543, "learning_rate": 9.482092097903551e-06, "loss": 0.3253, "step": 4530 }, { "epoch": 1.1605316973415132, "grad_norm": 6.4901957511901855, "learning_rate": 9.478130232553111e-06, "loss": 0.3429, "step": 4540 }, { "epoch": 1.163087934560327, "grad_norm": 9.026398658752441, "learning_rate": 9.474154105426055e-06, "loss": 0.3302, "step": 4550 }, { "epoch": 1.165644171779141, "grad_norm": 6.108066082000732, "learning_rate": 9.470163729185392e-06, "loss": 0.1702, "step": 4560 }, { "epoch": 1.168200408997955, "grad_norm": 10.425956726074219, "learning_rate": 9.466159116539523e-06, "loss": 0.3008, "step": 4570 }, { "epoch": 1.170756646216769, "grad_norm": 4.817817211151123, "learning_rate": 9.462140280242182e-06, "loss": 0.3151, "step": 4580 }, { "epoch": 1.1733128834355828, "grad_norm": 17.882158279418945, "learning_rate": 9.458107233092406e-06, "loss": 0.23, "step": 4590 }, { "epoch": 1.1758691206543967, "grad_norm": 5.028483867645264, "learning_rate": 9.454059987934487e-06, "loss": 0.2413, "step": 4600 }, { "epoch": 1.1784253578732107, "grad_norm": 9.872651100158691, "learning_rate": 9.449998557657936e-06, "loss": 0.1329, "step": 4610 }, { "epoch": 1.1809815950920246, "grad_norm": 5.998063087463379, "learning_rate": 9.445922955197437e-06, "loss": 0.2879, "step": 4620 }, { "epoch": 1.1835378323108385, "grad_norm": 8.390649795532227, "learning_rate": 9.441833193532817e-06, "loss": 0.2824, "step": 4630 }, { "epoch": 1.1860940695296525, "grad_norm": 6.652390003204346, "learning_rate": 9.437729285688986e-06, "loss": 0.3389, "step": 4640 }, { "epoch": 1.1886503067484662, "grad_norm": 10.573369026184082, "learning_rate": 9.433611244735914e-06, "loss": 0.3841, "step": 4650 }, { "epoch": 1.19120654396728, "grad_norm": 10.0396146774292, "learning_rate": 9.429479083788578e-06, "loss": 0.2638, "step": 4660 }, { "epoch": 1.193762781186094, "grad_norm": 11.902812004089355, "learning_rate": 9.425332816006927e-06, "loss": 0.4186, "step": 4670 }, { "epoch": 1.196319018404908, "grad_norm": 10.162897109985352, "learning_rate": 9.421172454595834e-06, "loss": 0.3057, "step": 4680 }, { "epoch": 1.1988752556237219, "grad_norm": 11.278912544250488, "learning_rate": 9.416998012805057e-06, "loss": 0.3223, "step": 4690 }, { "epoch": 1.2014314928425358, "grad_norm": 8.295330047607422, "learning_rate": 9.412809503929198e-06, "loss": 0.2588, "step": 4700 }, { "epoch": 1.2039877300613497, "grad_norm": 7.55431604385376, "learning_rate": 9.408606941307658e-06, "loss": 0.3087, "step": 4710 }, { "epoch": 1.2065439672801637, "grad_norm": 3.9323720932006836, "learning_rate": 9.404390338324599e-06, "loss": 0.3091, "step": 4720 }, { "epoch": 1.2091002044989776, "grad_norm": 7.560153007507324, "learning_rate": 9.400159708408892e-06, "loss": 0.2096, "step": 4730 }, { "epoch": 1.2116564417177913, "grad_norm": 9.517462730407715, "learning_rate": 9.395915065034085e-06, "loss": 0.1582, "step": 4740 }, { "epoch": 1.2142126789366052, "grad_norm": 5.7381720542907715, "learning_rate": 9.391656421718356e-06, "loss": 0.1742, "step": 4750 }, { "epoch": 1.2167689161554192, "grad_norm": 7.014863014221191, "learning_rate": 9.387383792024469e-06, "loss": 0.2988, "step": 4760 }, { "epoch": 1.219325153374233, "grad_norm": 12.077631950378418, "learning_rate": 9.383097189559728e-06, "loss": 0.254, "step": 4770 }, { "epoch": 1.221881390593047, "grad_norm": 8.781020164489746, "learning_rate": 9.37879662797594e-06, "loss": 0.2946, "step": 4780 }, { "epoch": 1.224437627811861, "grad_norm": 9.89029312133789, "learning_rate": 9.37448212096937e-06, "loss": 0.2043, "step": 4790 }, { "epoch": 1.2269938650306749, "grad_norm": 7.694300174713135, "learning_rate": 9.370153682280692e-06, "loss": 0.138, "step": 4800 }, { "epoch": 1.2295501022494888, "grad_norm": 8.310929298400879, "learning_rate": 9.365811325694949e-06, "loss": 0.2311, "step": 4810 }, { "epoch": 1.2321063394683027, "grad_norm": 12.575085639953613, "learning_rate": 9.361455065041514e-06, "loss": 0.2834, "step": 4820 }, { "epoch": 1.2346625766871167, "grad_norm": 10.732074737548828, "learning_rate": 9.357084914194036e-06, "loss": 0.2134, "step": 4830 }, { "epoch": 1.2372188139059306, "grad_norm": 10.34244441986084, "learning_rate": 9.352700887070403e-06, "loss": 0.3486, "step": 4840 }, { "epoch": 1.2397750511247443, "grad_norm": 10.497349739074707, "learning_rate": 9.348302997632699e-06, "loss": 0.3058, "step": 4850 }, { "epoch": 1.2423312883435582, "grad_norm": 14.589156150817871, "learning_rate": 9.343891259887148e-06, "loss": 0.2331, "step": 4860 }, { "epoch": 1.2448875255623721, "grad_norm": 5.382908344268799, "learning_rate": 9.339465687884086e-06, "loss": 0.3091, "step": 4870 }, { "epoch": 1.247443762781186, "grad_norm": 16.56047821044922, "learning_rate": 9.335026295717902e-06, "loss": 0.2812, "step": 4880 }, { "epoch": 1.25, "grad_norm": 5.166291236877441, "learning_rate": 9.330573097527002e-06, "loss": 0.2357, "step": 4890 }, { "epoch": 1.252556237218814, "grad_norm": 6.794707775115967, "learning_rate": 9.326106107493762e-06, "loss": 0.2503, "step": 4900 }, { "epoch": 1.2551124744376279, "grad_norm": 6.429582118988037, "learning_rate": 9.321625339844476e-06, "loss": 0.1967, "step": 4910 }, { "epoch": 1.2576687116564418, "grad_norm": 21.49854278564453, "learning_rate": 9.317130808849322e-06, "loss": 0.3339, "step": 4920 }, { "epoch": 1.2602249488752557, "grad_norm": 6.054262161254883, "learning_rate": 9.312622528822308e-06, "loss": 0.1903, "step": 4930 }, { "epoch": 1.2627811860940694, "grad_norm": 13.686524391174316, "learning_rate": 9.308100514121233e-06, "loss": 0.1497, "step": 4940 }, { "epoch": 1.2653374233128836, "grad_norm": 18.514162063598633, "learning_rate": 9.303564779147634e-06, "loss": 0.2372, "step": 4950 }, { "epoch": 1.2678936605316973, "grad_norm": 6.550439357757568, "learning_rate": 9.299015338346745e-06, "loss": 0.2101, "step": 4960 }, { "epoch": 1.2704498977505112, "grad_norm": 9.836435317993164, "learning_rate": 9.294452206207448e-06, "loss": 0.1643, "step": 4970 }, { "epoch": 1.2730061349693251, "grad_norm": 7.0567307472229, "learning_rate": 9.289875397262234e-06, "loss": 0.1969, "step": 4980 }, { "epoch": 1.275562372188139, "grad_norm": 8.437677383422852, "learning_rate": 9.285284926087144e-06, "loss": 0.3502, "step": 4990 }, { "epoch": 1.278118609406953, "grad_norm": 7.982880592346191, "learning_rate": 9.280680807301735e-06, "loss": 0.1473, "step": 5000 }, { "epoch": 1.280674846625767, "grad_norm": 6.814586162567139, "learning_rate": 9.276063055569029e-06, "loss": 0.2684, "step": 5010 }, { "epoch": 1.2832310838445808, "grad_norm": 5.944293022155762, "learning_rate": 9.271431685595461e-06, "loss": 0.1763, "step": 5020 }, { "epoch": 1.2857873210633946, "grad_norm": 5.889406204223633, "learning_rate": 9.266786712130842e-06, "loss": 0.1852, "step": 5030 }, { "epoch": 1.2883435582822087, "grad_norm": 5.56532096862793, "learning_rate": 9.262128149968304e-06, "loss": 0.3474, "step": 5040 }, { "epoch": 1.2908997955010224, "grad_norm": 6.4994049072265625, "learning_rate": 9.257456013944255e-06, "loss": 0.1804, "step": 5050 }, { "epoch": 1.2934560327198363, "grad_norm": 6.235182285308838, "learning_rate": 9.252770318938334e-06, "loss": 0.2414, "step": 5060 }, { "epoch": 1.2960122699386503, "grad_norm": 5.915652275085449, "learning_rate": 9.248071079873362e-06, "loss": 0.2333, "step": 5070 }, { "epoch": 1.2985685071574642, "grad_norm": 9.032744407653809, "learning_rate": 9.243358311715298e-06, "loss": 0.2185, "step": 5080 }, { "epoch": 1.3011247443762781, "grad_norm": 7.362344264984131, "learning_rate": 9.238632029473178e-06, "loss": 0.2571, "step": 5090 }, { "epoch": 1.303680981595092, "grad_norm": 9.257672309875488, "learning_rate": 9.23389224819909e-06, "loss": 0.2363, "step": 5100 }, { "epoch": 1.306237218813906, "grad_norm": 8.25611400604248, "learning_rate": 9.229138982988102e-06, "loss": 0.1432, "step": 5110 }, { "epoch": 1.30879345603272, "grad_norm": 9.176118850708008, "learning_rate": 9.224372248978231e-06, "loss": 0.2158, "step": 5120 }, { "epoch": 1.3113496932515338, "grad_norm": 3.796792984008789, "learning_rate": 9.21959206135039e-06, "loss": 0.1544, "step": 5130 }, { "epoch": 1.3139059304703475, "grad_norm": 6.011196613311768, "learning_rate": 9.214798435328334e-06, "loss": 0.3326, "step": 5140 }, { "epoch": 1.3164621676891617, "grad_norm": 16.793350219726562, "learning_rate": 9.209991386178621e-06, "loss": 0.2056, "step": 5150 }, { "epoch": 1.3190184049079754, "grad_norm": 7.064115047454834, "learning_rate": 9.205170929210552e-06, "loss": 0.3113, "step": 5160 }, { "epoch": 1.3215746421267893, "grad_norm": 19.5340518951416, "learning_rate": 9.200337079776136e-06, "loss": 0.1886, "step": 5170 }, { "epoch": 1.3241308793456033, "grad_norm": 12.674887657165527, "learning_rate": 9.195489853270029e-06, "loss": 0.4599, "step": 5180 }, { "epoch": 1.3266871165644172, "grad_norm": 13.094590187072754, "learning_rate": 9.190629265129492e-06, "loss": 0.2936, "step": 5190 }, { "epoch": 1.329243353783231, "grad_norm": 9.762693405151367, "learning_rate": 9.185755330834338e-06, "loss": 0.2078, "step": 5200 }, { "epoch": 1.331799591002045, "grad_norm": 7.909463405609131, "learning_rate": 9.180868065906884e-06, "loss": 0.2288, "step": 5210 }, { "epoch": 1.334355828220859, "grad_norm": 7.411076545715332, "learning_rate": 9.175967485911907e-06, "loss": 0.2717, "step": 5220 }, { "epoch": 1.3369120654396727, "grad_norm": 6.424882411956787, "learning_rate": 9.171053606456582e-06, "loss": 0.1745, "step": 5230 }, { "epoch": 1.3394683026584868, "grad_norm": 6.506113052368164, "learning_rate": 9.166126443190443e-06, "loss": 0.1601, "step": 5240 }, { "epoch": 1.3420245398773005, "grad_norm": 9.06916332244873, "learning_rate": 9.161186011805332e-06, "loss": 0.3146, "step": 5250 }, { "epoch": 1.3445807770961145, "grad_norm": 10.523892402648926, "learning_rate": 9.156232328035342e-06, "loss": 0.2956, "step": 5260 }, { "epoch": 1.3471370143149284, "grad_norm": 8.017621994018555, "learning_rate": 9.151265407656775e-06, "loss": 0.2294, "step": 5270 }, { "epoch": 1.3496932515337423, "grad_norm": 14.679991722106934, "learning_rate": 9.146285266488088e-06, "loss": 0.2024, "step": 5280 }, { "epoch": 1.3522494887525562, "grad_norm": 0.9324799180030823, "learning_rate": 9.141291920389843e-06, "loss": 0.1614, "step": 5290 }, { "epoch": 1.3548057259713702, "grad_norm": 5.870517253875732, "learning_rate": 9.136285385264655e-06, "loss": 0.2225, "step": 5300 }, { "epoch": 1.357361963190184, "grad_norm": 11.407279014587402, "learning_rate": 9.131265677057146e-06, "loss": 0.1872, "step": 5310 }, { "epoch": 1.359918200408998, "grad_norm": 17.659618377685547, "learning_rate": 9.12623281175389e-06, "loss": 0.2171, "step": 5320 }, { "epoch": 1.362474437627812, "grad_norm": 12.906618118286133, "learning_rate": 9.121186805383358e-06, "loss": 0.2759, "step": 5330 }, { "epoch": 1.3650306748466257, "grad_norm": 6.954870223999023, "learning_rate": 9.11612767401588e-06, "loss": 0.2188, "step": 5340 }, { "epoch": 1.3675869120654398, "grad_norm": 4.730753421783447, "learning_rate": 9.111055433763582e-06, "loss": 0.2126, "step": 5350 }, { "epoch": 1.3701431492842535, "grad_norm": 13.265816688537598, "learning_rate": 9.105970100780341e-06, "loss": 0.2904, "step": 5360 }, { "epoch": 1.3726993865030674, "grad_norm": 3.0092155933380127, "learning_rate": 9.100871691261728e-06, "loss": 0.1578, "step": 5370 }, { "epoch": 1.3752556237218814, "grad_norm": 6.426031112670898, "learning_rate": 9.09576022144496e-06, "loss": 0.2037, "step": 5380 }, { "epoch": 1.3778118609406953, "grad_norm": 8.25606918334961, "learning_rate": 9.09063570760885e-06, "loss": 0.1798, "step": 5390 }, { "epoch": 1.3803680981595092, "grad_norm": 20.269100189208984, "learning_rate": 9.085498166073755e-06, "loss": 0.3306, "step": 5400 }, { "epoch": 1.3829243353783232, "grad_norm": 7.950530529022217, "learning_rate": 9.080347613201513e-06, "loss": 0.2489, "step": 5410 }, { "epoch": 1.385480572597137, "grad_norm": 11.141780853271484, "learning_rate": 9.075184065395413e-06, "loss": 0.2043, "step": 5420 }, { "epoch": 1.3880368098159508, "grad_norm": 4.896001815795898, "learning_rate": 9.070007539100118e-06, "loss": 0.3356, "step": 5430 }, { "epoch": 1.390593047034765, "grad_norm": 11.557963371276855, "learning_rate": 9.064818050801634e-06, "loss": 0.1741, "step": 5440 }, { "epoch": 1.3931492842535786, "grad_norm": 6.800997734069824, "learning_rate": 9.05961561702724e-06, "loss": 0.1887, "step": 5450 }, { "epoch": 1.3957055214723926, "grad_norm": 6.017879009246826, "learning_rate": 9.054400254345448e-06, "loss": 0.2398, "step": 5460 }, { "epoch": 1.3982617586912065, "grad_norm": 6.6386189460754395, "learning_rate": 9.049171979365945e-06, "loss": 0.1465, "step": 5470 }, { "epoch": 1.4008179959100204, "grad_norm": 4.621875762939453, "learning_rate": 9.043930808739537e-06, "loss": 0.335, "step": 5480 }, { "epoch": 1.4033742331288344, "grad_norm": 6.274672508239746, "learning_rate": 9.038676759158105e-06, "loss": 0.1384, "step": 5490 }, { "epoch": 1.4059304703476483, "grad_norm": 2.794377565383911, "learning_rate": 9.033409847354542e-06, "loss": 0.2304, "step": 5500 }, { "epoch": 1.4084867075664622, "grad_norm": 10.634669303894043, "learning_rate": 9.028130090102706e-06, "loss": 0.3528, "step": 5510 }, { "epoch": 1.4110429447852761, "grad_norm": 6.818256855010986, "learning_rate": 9.022837504217366e-06, "loss": 0.1227, "step": 5520 }, { "epoch": 1.41359918200409, "grad_norm": 8.108813285827637, "learning_rate": 9.017532106554143e-06, "loss": 0.2864, "step": 5530 }, { "epoch": 1.4161554192229038, "grad_norm": 8.222419738769531, "learning_rate": 9.012213914009464e-06, "loss": 0.251, "step": 5540 }, { "epoch": 1.418711656441718, "grad_norm": 9.900671005249023, "learning_rate": 9.006882943520506e-06, "loss": 0.2974, "step": 5550 }, { "epoch": 1.4212678936605316, "grad_norm": 4.816144943237305, "learning_rate": 9.001539212065136e-06, "loss": 0.2626, "step": 5560 }, { "epoch": 1.4238241308793456, "grad_norm": 3.0924923419952393, "learning_rate": 8.996182736661863e-06, "loss": 0.1263, "step": 5570 }, { "epoch": 1.4263803680981595, "grad_norm": 5.688522815704346, "learning_rate": 8.990813534369787e-06, "loss": 0.2336, "step": 5580 }, { "epoch": 1.4289366053169734, "grad_norm": 10.940909385681152, "learning_rate": 8.985431622288533e-06, "loss": 0.2868, "step": 5590 }, { "epoch": 1.4314928425357873, "grad_norm": 13.232209205627441, "learning_rate": 8.98003701755821e-06, "loss": 0.2469, "step": 5600 }, { "epoch": 1.4340490797546013, "grad_norm": 7.461823463439941, "learning_rate": 8.974629737359348e-06, "loss": 0.2405, "step": 5610 }, { "epoch": 1.4366053169734152, "grad_norm": 3.547605037689209, "learning_rate": 8.96920979891284e-06, "loss": 0.1996, "step": 5620 }, { "epoch": 1.439161554192229, "grad_norm": 6.454622745513916, "learning_rate": 8.963777219479902e-06, "loss": 0.2072, "step": 5630 }, { "epoch": 1.441717791411043, "grad_norm": 6.902385711669922, "learning_rate": 8.958332016362e-06, "loss": 0.0997, "step": 5640 }, { "epoch": 1.4442740286298568, "grad_norm": 7.078310489654541, "learning_rate": 8.952874206900809e-06, "loss": 0.1943, "step": 5650 }, { "epoch": 1.4468302658486707, "grad_norm": 5.974771976470947, "learning_rate": 8.94740380847815e-06, "loss": 0.3048, "step": 5660 }, { "epoch": 1.4493865030674846, "grad_norm": 7.63726806640625, "learning_rate": 8.941920838515936e-06, "loss": 0.1593, "step": 5670 }, { "epoch": 1.4519427402862985, "grad_norm": 5.1760430335998535, "learning_rate": 8.936425314476121e-06, "loss": 0.1877, "step": 5680 }, { "epoch": 1.4544989775051125, "grad_norm": 8.131750106811523, "learning_rate": 8.930917253860637e-06, "loss": 0.2409, "step": 5690 }, { "epoch": 1.4570552147239264, "grad_norm": 6.002188205718994, "learning_rate": 8.925396674211341e-06, "loss": 0.2159, "step": 5700 }, { "epoch": 1.4596114519427403, "grad_norm": 12.237569808959961, "learning_rate": 8.919863593109967e-06, "loss": 0.2005, "step": 5710 }, { "epoch": 1.4621676891615543, "grad_norm": 14.401376724243164, "learning_rate": 8.914318028178055e-06, "loss": 0.3153, "step": 5720 }, { "epoch": 1.4647239263803682, "grad_norm": 5.81574821472168, "learning_rate": 8.908759997076909e-06, "loss": 0.1836, "step": 5730 }, { "epoch": 1.467280163599182, "grad_norm": 6.657829761505127, "learning_rate": 8.903189517507527e-06, "loss": 0.2741, "step": 5740 }, { "epoch": 1.469836400817996, "grad_norm": 4.597752094268799, "learning_rate": 8.897606607210563e-06, "loss": 0.1928, "step": 5750 }, { "epoch": 1.4723926380368098, "grad_norm": 7.948934555053711, "learning_rate": 8.892011283966253e-06, "loss": 0.1889, "step": 5760 }, { "epoch": 1.4749488752556237, "grad_norm": 14.392995834350586, "learning_rate": 8.886403565594367e-06, "loss": 0.2368, "step": 5770 }, { "epoch": 1.4775051124744376, "grad_norm": 7.179086685180664, "learning_rate": 8.88078346995415e-06, "loss": 0.2, "step": 5780 }, { "epoch": 1.4800613496932515, "grad_norm": 7.146066665649414, "learning_rate": 8.875151014944267e-06, "loss": 0.1678, "step": 5790 }, { "epoch": 1.4826175869120655, "grad_norm": 9.944082260131836, "learning_rate": 8.869506218502742e-06, "loss": 0.1642, "step": 5800 }, { "epoch": 1.4851738241308794, "grad_norm": 12.250117301940918, "learning_rate": 8.863849098606907e-06, "loss": 0.2266, "step": 5810 }, { "epoch": 1.4877300613496933, "grad_norm": 29.39047622680664, "learning_rate": 8.858179673273337e-06, "loss": 0.2813, "step": 5820 }, { "epoch": 1.490286298568507, "grad_norm": 11.951471328735352, "learning_rate": 8.852497960557804e-06, "loss": 0.3751, "step": 5830 }, { "epoch": 1.4928425357873212, "grad_norm": 1.6928082704544067, "learning_rate": 8.846803978555203e-06, "loss": 0.1711, "step": 5840 }, { "epoch": 1.4953987730061349, "grad_norm": 8.914717674255371, "learning_rate": 8.84109774539951e-06, "loss": 0.2084, "step": 5850 }, { "epoch": 1.4979550102249488, "grad_norm": 9.57482624053955, "learning_rate": 8.835379279263718e-06, "loss": 0.2722, "step": 5860 }, { "epoch": 1.5005112474437627, "grad_norm": 11.420355796813965, "learning_rate": 8.829648598359775e-06, "loss": 0.1593, "step": 5870 }, { "epoch": 1.5030674846625767, "grad_norm": 4.315236568450928, "learning_rate": 8.823905720938534e-06, "loss": 0.1693, "step": 5880 }, { "epoch": 1.5056237218813906, "grad_norm": 4.3361945152282715, "learning_rate": 8.81815066528969e-06, "loss": 0.164, "step": 5890 }, { "epoch": 1.5081799591002045, "grad_norm": 9.296090126037598, "learning_rate": 8.812383449741724e-06, "loss": 0.1611, "step": 5900 }, { "epoch": 1.5107361963190185, "grad_norm": 16.11349105834961, "learning_rate": 8.806604092661839e-06, "loss": 0.1636, "step": 5910 }, { "epoch": 1.5132924335378322, "grad_norm": 12.905272483825684, "learning_rate": 8.800812612455909e-06, "loss": 0.1995, "step": 5920 }, { "epoch": 1.5158486707566463, "grad_norm": 4.522705554962158, "learning_rate": 8.79500902756842e-06, "loss": 0.115, "step": 5930 }, { "epoch": 1.51840490797546, "grad_norm": 8.156167984008789, "learning_rate": 8.789193356482401e-06, "loss": 0.1444, "step": 5940 }, { "epoch": 1.5209611451942742, "grad_norm": 6.0793328285217285, "learning_rate": 8.783365617719382e-06, "loss": 0.1781, "step": 5950 }, { "epoch": 1.5235173824130879, "grad_norm": 8.255613327026367, "learning_rate": 8.777525829839317e-06, "loss": 0.2307, "step": 5960 }, { "epoch": 1.5260736196319018, "grad_norm": 13.122941017150879, "learning_rate": 8.77167401144054e-06, "loss": 0.1803, "step": 5970 }, { "epoch": 1.5286298568507157, "grad_norm": 4.706987380981445, "learning_rate": 8.765810181159696e-06, "loss": 0.1343, "step": 5980 }, { "epoch": 1.5311860940695297, "grad_norm": 4.327836990356445, "learning_rate": 8.759934357671685e-06, "loss": 0.2642, "step": 5990 }, { "epoch": 1.5337423312883436, "grad_norm": 5.442415714263916, "learning_rate": 8.754046559689602e-06, "loss": 0.2007, "step": 6000 }, { "epoch": 1.5362985685071575, "grad_norm": 12.884740829467773, "learning_rate": 8.748146805964683e-06, "loss": 0.2029, "step": 6010 }, { "epoch": 1.5388548057259714, "grad_norm": 7.4214582443237305, "learning_rate": 8.742235115286232e-06, "loss": 0.131, "step": 6020 }, { "epoch": 1.5414110429447851, "grad_norm": 5.057283878326416, "learning_rate": 8.736311506481579e-06, "loss": 0.2342, "step": 6030 }, { "epoch": 1.5439672801635993, "grad_norm": 11.823676109313965, "learning_rate": 8.730375998416e-06, "loss": 0.145, "step": 6040 }, { "epoch": 1.546523517382413, "grad_norm": 8.330456733703613, "learning_rate": 8.724428609992675e-06, "loss": 0.1139, "step": 6050 }, { "epoch": 1.5490797546012272, "grad_norm": 11.217977523803711, "learning_rate": 8.718469360152617e-06, "loss": 0.2302, "step": 6060 }, { "epoch": 1.5516359918200409, "grad_norm": 7.306154251098633, "learning_rate": 8.712498267874615e-06, "loss": 0.1695, "step": 6070 }, { "epoch": 1.5541922290388548, "grad_norm": 5.975497722625732, "learning_rate": 8.706515352175173e-06, "loss": 0.2389, "step": 6080 }, { "epoch": 1.5567484662576687, "grad_norm": 7.292505264282227, "learning_rate": 8.700520632108453e-06, "loss": 0.305, "step": 6090 }, { "epoch": 1.5593047034764826, "grad_norm": 12.038248062133789, "learning_rate": 8.694514126766205e-06, "loss": 0.1872, "step": 6100 }, { "epoch": 1.5618609406952966, "grad_norm": 5.702522277832031, "learning_rate": 8.688495855277718e-06, "loss": 0.1847, "step": 6110 }, { "epoch": 1.5644171779141103, "grad_norm": 6.972240447998047, "learning_rate": 8.68246583680975e-06, "loss": 0.177, "step": 6120 }, { "epoch": 1.5669734151329244, "grad_norm": 5.465381145477295, "learning_rate": 8.676424090566473e-06, "loss": 0.2276, "step": 6130 }, { "epoch": 1.5695296523517381, "grad_norm": 3.666998863220215, "learning_rate": 8.670370635789407e-06, "loss": 0.2746, "step": 6140 }, { "epoch": 1.5720858895705523, "grad_norm": 1.9799798727035522, "learning_rate": 8.66430549175736e-06, "loss": 0.1176, "step": 6150 }, { "epoch": 1.574642126789366, "grad_norm": 5.453342437744141, "learning_rate": 8.65822867778637e-06, "loss": 0.2283, "step": 6160 }, { "epoch": 1.57719836400818, "grad_norm": 5.7280683517456055, "learning_rate": 8.652140213229642e-06, "loss": 0.1838, "step": 6170 }, { "epoch": 1.5797546012269938, "grad_norm": 5.071581840515137, "learning_rate": 8.64604011747748e-06, "loss": 0.179, "step": 6180 }, { "epoch": 1.5823108384458078, "grad_norm": 1.5993189811706543, "learning_rate": 8.639928409957236e-06, "loss": 0.222, "step": 6190 }, { "epoch": 1.5848670756646217, "grad_norm": 5.141691207885742, "learning_rate": 8.63380511013324e-06, "loss": 0.2307, "step": 6200 }, { "epoch": 1.5874233128834356, "grad_norm": 8.022561073303223, "learning_rate": 8.627670237506742e-06, "loss": 0.2617, "step": 6210 }, { "epoch": 1.5899795501022496, "grad_norm": 7.5429301261901855, "learning_rate": 8.621523811615848e-06, "loss": 0.1311, "step": 6220 }, { "epoch": 1.5925357873210633, "grad_norm": 6.324619293212891, "learning_rate": 8.615365852035456e-06, "loss": 0.2665, "step": 6230 }, { "epoch": 1.5950920245398774, "grad_norm": 5.001183032989502, "learning_rate": 8.609196378377203e-06, "loss": 0.205, "step": 6240 }, { "epoch": 1.5976482617586911, "grad_norm": 7.617444038391113, "learning_rate": 8.603015410289387e-06, "loss": 0.4019, "step": 6250 }, { "epoch": 1.6002044989775053, "grad_norm": 4.471902847290039, "learning_rate": 8.596822967456915e-06, "loss": 0.1962, "step": 6260 }, { "epoch": 1.602760736196319, "grad_norm": 6.265940189361572, "learning_rate": 8.590619069601247e-06, "loss": 0.139, "step": 6270 }, { "epoch": 1.605316973415133, "grad_norm": 6.503332614898682, "learning_rate": 8.584403736480313e-06, "loss": 0.1892, "step": 6280 }, { "epoch": 1.6078732106339468, "grad_norm": 4.576842784881592, "learning_rate": 8.57817698788847e-06, "loss": 0.1271, "step": 6290 }, { "epoch": 1.6104294478527608, "grad_norm": 7.434634685516357, "learning_rate": 8.571938843656422e-06, "loss": 0.2066, "step": 6300 }, { "epoch": 1.6129856850715747, "grad_norm": 8.325051307678223, "learning_rate": 8.565689323651174e-06, "loss": 0.1975, "step": 6310 }, { "epoch": 1.6155419222903884, "grad_norm": 7.133656978607178, "learning_rate": 8.559428447775956e-06, "loss": 0.1116, "step": 6320 }, { "epoch": 1.6180981595092025, "grad_norm": 7.880911827087402, "learning_rate": 8.553156235970163e-06, "loss": 0.1743, "step": 6330 }, { "epoch": 1.6206543967280163, "grad_norm": 20.269716262817383, "learning_rate": 8.546872708209297e-06, "loss": 0.1994, "step": 6340 }, { "epoch": 1.6232106339468304, "grad_norm": 8.107951164245605, "learning_rate": 8.54057788450489e-06, "loss": 0.1642, "step": 6350 }, { "epoch": 1.6257668711656441, "grad_norm": 5.440578937530518, "learning_rate": 8.534271784904457e-06, "loss": 0.1593, "step": 6360 }, { "epoch": 1.628323108384458, "grad_norm": 3.178661584854126, "learning_rate": 8.527954429491422e-06, "loss": 0.2159, "step": 6370 }, { "epoch": 1.630879345603272, "grad_norm": 5.0311055183410645, "learning_rate": 8.521625838385052e-06, "loss": 0.2587, "step": 6380 }, { "epoch": 1.633435582822086, "grad_norm": 1.3832993507385254, "learning_rate": 8.515286031740403e-06, "loss": 0.1799, "step": 6390 }, { "epoch": 1.6359918200408998, "grad_norm": 8.102804183959961, "learning_rate": 8.508935029748244e-06, "loss": 0.1516, "step": 6400 }, { "epoch": 1.6385480572597138, "grad_norm": 6.02394437789917, "learning_rate": 8.502572852635005e-06, "loss": 0.179, "step": 6410 }, { "epoch": 1.6411042944785277, "grad_norm": 6.3991312980651855, "learning_rate": 8.4961995206627e-06, "loss": 0.2349, "step": 6420 }, { "epoch": 1.6436605316973414, "grad_norm": 5.750975608825684, "learning_rate": 8.489815054128874e-06, "loss": 0.1607, "step": 6430 }, { "epoch": 1.6462167689161555, "grad_norm": 4.242618560791016, "learning_rate": 8.483419473366525e-06, "loss": 0.1986, "step": 6440 }, { "epoch": 1.6487730061349692, "grad_norm": 9.25927734375, "learning_rate": 8.477012798744056e-06, "loss": 0.1515, "step": 6450 }, { "epoch": 1.6513292433537834, "grad_norm": 0.4773276150226593, "learning_rate": 8.470595050665196e-06, "loss": 0.1506, "step": 6460 }, { "epoch": 1.653885480572597, "grad_norm": 9.461527824401855, "learning_rate": 8.464166249568944e-06, "loss": 0.2223, "step": 6470 }, { "epoch": 1.656441717791411, "grad_norm": 4.911471843719482, "learning_rate": 8.457726415929494e-06, "loss": 0.1179, "step": 6480 }, { "epoch": 1.658997955010225, "grad_norm": 5.247636318206787, "learning_rate": 8.451275570256183e-06, "loss": 0.1667, "step": 6490 }, { "epoch": 1.6615541922290389, "grad_norm": 7.205673694610596, "learning_rate": 8.444813733093416e-06, "loss": 0.184, "step": 6500 }, { "epoch": 1.6641104294478528, "grad_norm": 12.158601760864258, "learning_rate": 8.4383409250206e-06, "loss": 0.1431, "step": 6510 }, { "epoch": 1.6666666666666665, "grad_norm": 7.19647741317749, "learning_rate": 8.43185716665209e-06, "loss": 0.1936, "step": 6520 }, { "epoch": 1.6692229038854807, "grad_norm": 7.732553958892822, "learning_rate": 8.425362478637105e-06, "loss": 0.1933, "step": 6530 }, { "epoch": 1.6717791411042944, "grad_norm": 8.475358009338379, "learning_rate": 8.418856881659677e-06, "loss": 0.2284, "step": 6540 }, { "epoch": 1.6743353783231085, "grad_norm": 11.112258911132812, "learning_rate": 8.412340396438587e-06, "loss": 0.1528, "step": 6550 }, { "epoch": 1.6768916155419222, "grad_norm": 11.443809509277344, "learning_rate": 8.405813043727279e-06, "loss": 0.1782, "step": 6560 }, { "epoch": 1.6794478527607362, "grad_norm": 0.7984766960144043, "learning_rate": 8.399274844313816e-06, "loss": 0.1205, "step": 6570 }, { "epoch": 1.68200408997955, "grad_norm": 0.6593146324157715, "learning_rate": 8.392725819020806e-06, "loss": 0.0928, "step": 6580 }, { "epoch": 1.684560327198364, "grad_norm": 7.761658668518066, "learning_rate": 8.38616598870533e-06, "loss": 0.1637, "step": 6590 }, { "epoch": 1.687116564417178, "grad_norm": 6.802185535430908, "learning_rate": 8.379595374258883e-06, "loss": 0.3094, "step": 6600 }, { "epoch": 1.6896728016359919, "grad_norm": 7.621953964233398, "learning_rate": 8.373013996607309e-06, "loss": 0.1235, "step": 6610 }, { "epoch": 1.6922290388548058, "grad_norm": 5.766721248626709, "learning_rate": 8.36642187671072e-06, "loss": 0.1979, "step": 6620 }, { "epoch": 1.6947852760736195, "grad_norm": 7.573540687561035, "learning_rate": 8.359819035563447e-06, "loss": 0.1544, "step": 6630 }, { "epoch": 1.6973415132924337, "grad_norm": 7.856776237487793, "learning_rate": 8.353205494193965e-06, "loss": 0.2178, "step": 6640 }, { "epoch": 1.6998977505112474, "grad_norm": 6.826193332672119, "learning_rate": 8.346581273664826e-06, "loss": 0.1453, "step": 6650 }, { "epoch": 1.7024539877300615, "grad_norm": 3.6651082038879395, "learning_rate": 8.339946395072593e-06, "loss": 0.1316, "step": 6660 }, { "epoch": 1.7050102249488752, "grad_norm": 13.016592025756836, "learning_rate": 8.33330087954777e-06, "loss": 0.2319, "step": 6670 }, { "epoch": 1.7075664621676891, "grad_norm": 2.1794581413269043, "learning_rate": 8.32664474825474e-06, "loss": 0.096, "step": 6680 }, { "epoch": 1.710122699386503, "grad_norm": 6.232535362243652, "learning_rate": 8.319978022391692e-06, "loss": 0.1157, "step": 6690 }, { "epoch": 1.712678936605317, "grad_norm": 11.268756866455078, "learning_rate": 8.313300723190561e-06, "loss": 0.1155, "step": 6700 }, { "epoch": 1.715235173824131, "grad_norm": 7.64271879196167, "learning_rate": 8.306612871916946e-06, "loss": 0.1295, "step": 6710 }, { "epoch": 1.7177914110429446, "grad_norm": 3.3692967891693115, "learning_rate": 8.299914489870065e-06, "loss": 0.1837, "step": 6720 }, { "epoch": 1.7203476482617588, "grad_norm": 3.621946096420288, "learning_rate": 8.293205598382662e-06, "loss": 0.116, "step": 6730 }, { "epoch": 1.7229038854805725, "grad_norm": 7.414484024047852, "learning_rate": 8.28648621882096e-06, "loss": 0.2422, "step": 6740 }, { "epoch": 1.7254601226993866, "grad_norm": 8.968006134033203, "learning_rate": 8.279756372584575e-06, "loss": 0.1423, "step": 6750 }, { "epoch": 1.7280163599182004, "grad_norm": 5.072629451751709, "learning_rate": 8.273016081106468e-06, "loss": 0.1433, "step": 6760 }, { "epoch": 1.7305725971370143, "grad_norm": 8.455986976623535, "learning_rate": 8.266265365852854e-06, "loss": 0.2221, "step": 6770 }, { "epoch": 1.7331288343558282, "grad_norm": 7.337911128997803, "learning_rate": 8.259504248323155e-06, "loss": 0.0976, "step": 6780 }, { "epoch": 1.7356850715746421, "grad_norm": 7.0469207763671875, "learning_rate": 8.252732750049918e-06, "loss": 0.1134, "step": 6790 }, { "epoch": 1.738241308793456, "grad_norm": 6.939335823059082, "learning_rate": 8.245950892598746e-06, "loss": 0.1975, "step": 6800 }, { "epoch": 1.74079754601227, "grad_norm": 3.959833860397339, "learning_rate": 8.23915869756824e-06, "loss": 0.1229, "step": 6810 }, { "epoch": 1.743353783231084, "grad_norm": 9.389518737792969, "learning_rate": 8.23235618658992e-06, "loss": 0.1164, "step": 6820 }, { "epoch": 1.7459100204498976, "grad_norm": 3.3109939098358154, "learning_rate": 8.225543381328162e-06, "loss": 0.1659, "step": 6830 }, { "epoch": 1.7484662576687118, "grad_norm": 4.770479202270508, "learning_rate": 8.218720303480124e-06, "loss": 0.1385, "step": 6840 }, { "epoch": 1.7510224948875255, "grad_norm": 3.3656115531921387, "learning_rate": 8.211886974775682e-06, "loss": 0.2088, "step": 6850 }, { "epoch": 1.7535787321063396, "grad_norm": 5.787675857543945, "learning_rate": 8.205043416977358e-06, "loss": 0.0627, "step": 6860 }, { "epoch": 1.7561349693251533, "grad_norm": 5.655759334564209, "learning_rate": 8.198189651880253e-06, "loss": 0.1626, "step": 6870 }, { "epoch": 1.7586912065439673, "grad_norm": 5.212615966796875, "learning_rate": 8.191325701311971e-06, "loss": 0.1073, "step": 6880 }, { "epoch": 1.7612474437627812, "grad_norm": 5.487759113311768, "learning_rate": 8.18445158713256e-06, "loss": 0.1968, "step": 6890 }, { "epoch": 1.7638036809815951, "grad_norm": 13.81961727142334, "learning_rate": 8.17756733123443e-06, "loss": 0.1275, "step": 6900 }, { "epoch": 1.766359918200409, "grad_norm": 5.11100959777832, "learning_rate": 8.170672955542299e-06, "loss": 0.183, "step": 6910 }, { "epoch": 1.7689161554192228, "grad_norm": 1.606713056564331, "learning_rate": 8.163768482013106e-06, "loss": 0.0828, "step": 6920 }, { "epoch": 1.771472392638037, "grad_norm": 5.141575813293457, "learning_rate": 8.156853932635955e-06, "loss": 0.1193, "step": 6930 }, { "epoch": 1.7740286298568506, "grad_norm": 11.083499908447266, "learning_rate": 8.149929329432032e-06, "loss": 0.2004, "step": 6940 }, { "epoch": 1.7765848670756648, "grad_norm": 10.328533172607422, "learning_rate": 8.14299469445455e-06, "loss": 0.0874, "step": 6950 }, { "epoch": 1.7791411042944785, "grad_norm": 6.226305961608887, "learning_rate": 8.136050049788666e-06, "loss": 0.103, "step": 6960 }, { "epoch": 1.7816973415132924, "grad_norm": 8.67745590209961, "learning_rate": 8.129095417551416e-06, "loss": 0.1642, "step": 6970 }, { "epoch": 1.7842535787321063, "grad_norm": 9.080946922302246, "learning_rate": 8.122130819891645e-06, "loss": 0.14, "step": 6980 }, { "epoch": 1.7868098159509203, "grad_norm": 4.160292625427246, "learning_rate": 8.115156278989938e-06, "loss": 0.0769, "step": 6990 }, { "epoch": 1.7893660531697342, "grad_norm": 4.340435028076172, "learning_rate": 8.10817181705854e-06, "loss": 0.0904, "step": 7000 }, { "epoch": 1.7919222903885481, "grad_norm": 5.093479156494141, "learning_rate": 8.101177456341301e-06, "loss": 0.1122, "step": 7010 }, { "epoch": 1.794478527607362, "grad_norm": 7.038718223571777, "learning_rate": 8.094173219113589e-06, "loss": 0.1572, "step": 7020 }, { "epoch": 1.7970347648261757, "grad_norm": 4.94278621673584, "learning_rate": 8.087159127682227e-06, "loss": 0.1477, "step": 7030 }, { "epoch": 1.79959100204499, "grad_norm": 1.7163784503936768, "learning_rate": 8.080135204385425e-06, "loss": 0.2002, "step": 7040 }, { "epoch": 1.8021472392638036, "grad_norm": 8.449196815490723, "learning_rate": 8.073101471592702e-06, "loss": 0.2222, "step": 7050 }, { "epoch": 1.8047034764826178, "grad_norm": 6.09740686416626, "learning_rate": 8.066057951704821e-06, "loss": 0.14, "step": 7060 }, { "epoch": 1.8072597137014315, "grad_norm": 13.180371284484863, "learning_rate": 8.059004667153713e-06, "loss": 0.0977, "step": 7070 }, { "epoch": 1.8098159509202454, "grad_norm": 7.9253058433532715, "learning_rate": 8.051941640402406e-06, "loss": 0.1332, "step": 7080 }, { "epoch": 1.8123721881390593, "grad_norm": 8.333995819091797, "learning_rate": 8.044868893944955e-06, "loss": 0.1297, "step": 7090 }, { "epoch": 1.8149284253578732, "grad_norm": 8.638833045959473, "learning_rate": 8.03778645030637e-06, "loss": 0.101, "step": 7100 }, { "epoch": 1.8174846625766872, "grad_norm": 6.839685916900635, "learning_rate": 8.030694332042548e-06, "loss": 0.0693, "step": 7110 }, { "epoch": 1.8200408997955009, "grad_norm": 7.357212066650391, "learning_rate": 8.02359256174019e-06, "loss": 0.066, "step": 7120 }, { "epoch": 1.822597137014315, "grad_norm": 4.24409294128418, "learning_rate": 8.01648116201674e-06, "loss": 0.1837, "step": 7130 }, { "epoch": 1.8251533742331287, "grad_norm": 8.311896324157715, "learning_rate": 8.009360155520313e-06, "loss": 0.1389, "step": 7140 }, { "epoch": 1.8277096114519429, "grad_norm": 12.251752853393555, "learning_rate": 8.002229564929616e-06, "loss": 0.111, "step": 7150 }, { "epoch": 1.8302658486707566, "grad_norm": 5.574610233306885, "learning_rate": 7.995089412953875e-06, "loss": 0.1158, "step": 7160 }, { "epoch": 1.8328220858895705, "grad_norm": 8.057143211364746, "learning_rate": 7.987939722332776e-06, "loss": 0.094, "step": 7170 }, { "epoch": 1.8353783231083844, "grad_norm": 11.000237464904785, "learning_rate": 7.980780515836377e-06, "loss": 0.1, "step": 7180 }, { "epoch": 1.8379345603271984, "grad_norm": 5.534488201141357, "learning_rate": 7.97361181626504e-06, "loss": 0.2236, "step": 7190 }, { "epoch": 1.8404907975460123, "grad_norm": 6.447413444519043, "learning_rate": 7.966433646449364e-06, "loss": 0.2489, "step": 7200 }, { "epoch": 1.8430470347648262, "grad_norm": 2.375591516494751, "learning_rate": 7.959246029250112e-06, "loss": 0.0896, "step": 7210 }, { "epoch": 1.8456032719836402, "grad_norm": 7.849663734436035, "learning_rate": 7.952048987558126e-06, "loss": 0.2143, "step": 7220 }, { "epoch": 1.8481595092024539, "grad_norm": 9.33170223236084, "learning_rate": 7.944842544294268e-06, "loss": 0.1366, "step": 7230 }, { "epoch": 1.850715746421268, "grad_norm": 7.391844749450684, "learning_rate": 7.937626722409342e-06, "loss": 0.1979, "step": 7240 }, { "epoch": 1.8532719836400817, "grad_norm": 0.42054474353790283, "learning_rate": 7.930401544884017e-06, "loss": 0.0991, "step": 7250 }, { "epoch": 1.8558282208588959, "grad_norm": 6.135138511657715, "learning_rate": 7.923167034728763e-06, "loss": 0.0628, "step": 7260 }, { "epoch": 1.8583844580777096, "grad_norm": 9.923365592956543, "learning_rate": 7.915923214983767e-06, "loss": 0.1159, "step": 7270 }, { "epoch": 1.8609406952965235, "grad_norm": 7.890591144561768, "learning_rate": 7.908670108718868e-06, "loss": 0.1056, "step": 7280 }, { "epoch": 1.8634969325153374, "grad_norm": 1.3051115274429321, "learning_rate": 7.90140773903348e-06, "loss": 0.1308, "step": 7290 }, { "epoch": 1.8660531697341514, "grad_norm": 7.580386161804199, "learning_rate": 7.894136129056516e-06, "loss": 0.1585, "step": 7300 }, { "epoch": 1.8686094069529653, "grad_norm": 4.543681621551514, "learning_rate": 7.886855301946322e-06, "loss": 0.0982, "step": 7310 }, { "epoch": 1.871165644171779, "grad_norm": 8.670321464538574, "learning_rate": 7.879565280890593e-06, "loss": 0.1984, "step": 7320 }, { "epoch": 1.8737218813905931, "grad_norm": 10.790763854980469, "learning_rate": 7.872266089106309e-06, "loss": 0.0939, "step": 7330 }, { "epoch": 1.8762781186094069, "grad_norm": 5.62462854385376, "learning_rate": 7.864957749839653e-06, "loss": 0.125, "step": 7340 }, { "epoch": 1.878834355828221, "grad_norm": 5.0767717361450195, "learning_rate": 7.857640286365946e-06, "loss": 0.1439, "step": 7350 }, { "epoch": 1.8813905930470347, "grad_norm": 4.852501392364502, "learning_rate": 7.850313721989558e-06, "loss": 0.2335, "step": 7360 }, { "epoch": 1.8839468302658486, "grad_norm": 8.364299774169922, "learning_rate": 7.842978080043855e-06, "loss": 0.138, "step": 7370 }, { "epoch": 1.8865030674846626, "grad_norm": 16.219741821289062, "learning_rate": 7.835633383891102e-06, "loss": 0.1868, "step": 7380 }, { "epoch": 1.8890593047034765, "grad_norm": 6.5828375816345215, "learning_rate": 7.828279656922408e-06, "loss": 0.1366, "step": 7390 }, { "epoch": 1.8916155419222904, "grad_norm": 11.096482276916504, "learning_rate": 7.820916922557636e-06, "loss": 0.0636, "step": 7400 }, { "epoch": 1.8941717791411041, "grad_norm": 7.485594749450684, "learning_rate": 7.813545204245341e-06, "loss": 0.2255, "step": 7410 }, { "epoch": 1.8967280163599183, "grad_norm": 8.4011869430542, "learning_rate": 7.806164525462687e-06, "loss": 0.1484, "step": 7420 }, { "epoch": 1.899284253578732, "grad_norm": 7.423107624053955, "learning_rate": 7.798774909715374e-06, "loss": 0.1592, "step": 7430 }, { "epoch": 1.9018404907975461, "grad_norm": 5.52902364730835, "learning_rate": 7.791376380537567e-06, "loss": 0.0735, "step": 7440 }, { "epoch": 1.9043967280163598, "grad_norm": 3.4649152755737305, "learning_rate": 7.783968961491818e-06, "loss": 0.1479, "step": 7450 }, { "epoch": 1.9069529652351738, "grad_norm": 7.678995132446289, "learning_rate": 7.776552676168987e-06, "loss": 0.2274, "step": 7460 }, { "epoch": 1.9095092024539877, "grad_norm": 0.06970912218093872, "learning_rate": 7.769127548188174e-06, "loss": 0.1003, "step": 7470 }, { "epoch": 1.9120654396728016, "grad_norm": 17.272754669189453, "learning_rate": 7.761693601196642e-06, "loss": 0.0924, "step": 7480 }, { "epoch": 1.9146216768916156, "grad_norm": 5.135831832885742, "learning_rate": 7.75425085886974e-06, "loss": 0.1176, "step": 7490 }, { "epoch": 1.9171779141104295, "grad_norm": 5.651144981384277, "learning_rate": 7.746799344910822e-06, "loss": 0.1398, "step": 7500 }, { "epoch": 1.9197341513292434, "grad_norm": 6.184920787811279, "learning_rate": 7.739339083051186e-06, "loss": 0.1766, "step": 7510 }, { "epoch": 1.9222903885480571, "grad_norm": 6.632026672363281, "learning_rate": 7.73187009704999e-06, "loss": 0.1427, "step": 7520 }, { "epoch": 1.9248466257668713, "grad_norm": 8.19317626953125, "learning_rate": 7.724392410694167e-06, "loss": 0.1126, "step": 7530 }, { "epoch": 1.927402862985685, "grad_norm": 1.4213460683822632, "learning_rate": 7.716906047798364e-06, "loss": 0.1248, "step": 7540 }, { "epoch": 1.9299591002044991, "grad_norm": 8.21669864654541, "learning_rate": 7.709411032204868e-06, "loss": 0.1148, "step": 7550 }, { "epoch": 1.9325153374233128, "grad_norm": 6.994448661804199, "learning_rate": 7.701907387783509e-06, "loss": 0.1548, "step": 7560 }, { "epoch": 1.9350715746421268, "grad_norm": 7.743505954742432, "learning_rate": 7.694395138431608e-06, "loss": 0.1274, "step": 7570 }, { "epoch": 1.9376278118609407, "grad_norm": 5.287206172943115, "learning_rate": 7.686874308073885e-06, "loss": 0.0779, "step": 7580 }, { "epoch": 1.9401840490797546, "grad_norm": 15.280442237854004, "learning_rate": 7.679344920662394e-06, "loss": 0.0718, "step": 7590 }, { "epoch": 1.9427402862985685, "grad_norm": 9.01176929473877, "learning_rate": 7.671807000176434e-06, "loss": 0.2102, "step": 7600 }, { "epoch": 1.9452965235173822, "grad_norm": 4.649341583251953, "learning_rate": 7.664260570622487e-06, "loss": 0.1391, "step": 7610 }, { "epoch": 1.9478527607361964, "grad_norm": 9.841882705688477, "learning_rate": 7.656705656034132e-06, "loss": 0.1092, "step": 7620 }, { "epoch": 1.95040899795501, "grad_norm": 4.586430072784424, "learning_rate": 7.649142280471964e-06, "loss": 0.1478, "step": 7630 }, { "epoch": 1.9529652351738243, "grad_norm": 3.7405037879943848, "learning_rate": 7.641570468023536e-06, "loss": 0.157, "step": 7640 }, { "epoch": 1.955521472392638, "grad_norm": 4.911379337310791, "learning_rate": 7.633990242803263e-06, "loss": 0.0739, "step": 7650 }, { "epoch": 1.9580777096114519, "grad_norm": 14.748944282531738, "learning_rate": 7.626401628952352e-06, "loss": 0.1426, "step": 7660 }, { "epoch": 1.9606339468302658, "grad_norm": 7.726930141448975, "learning_rate": 7.61880465063873e-06, "loss": 0.1107, "step": 7670 }, { "epoch": 1.9631901840490797, "grad_norm": 7.120655059814453, "learning_rate": 7.61119933205696e-06, "loss": 0.044, "step": 7680 }, { "epoch": 1.9657464212678937, "grad_norm": 1.3341773748397827, "learning_rate": 7.603585697428169e-06, "loss": 0.0847, "step": 7690 }, { "epoch": 1.9683026584867076, "grad_norm": 5.155945777893066, "learning_rate": 7.595963770999966e-06, "loss": 0.2069, "step": 7700 }, { "epoch": 1.9708588957055215, "grad_norm": 6.961178302764893, "learning_rate": 7.588333577046368e-06, "loss": 0.1673, "step": 7710 }, { "epoch": 1.9734151329243352, "grad_norm": 5.766995906829834, "learning_rate": 7.5806951398677255e-06, "loss": 0.1469, "step": 7720 }, { "epoch": 1.9759713701431494, "grad_norm": 11.985013961791992, "learning_rate": 7.573048483790635e-06, "loss": 0.1621, "step": 7730 }, { "epoch": 1.978527607361963, "grad_norm": 0.5396488308906555, "learning_rate": 7.565393633167876e-06, "loss": 0.0574, "step": 7740 }, { "epoch": 1.9810838445807772, "grad_norm": 11.865923881530762, "learning_rate": 7.557730612378318e-06, "loss": 0.1207, "step": 7750 }, { "epoch": 1.983640081799591, "grad_norm": 4.756693363189697, "learning_rate": 7.5500594458268576e-06, "loss": 0.1147, "step": 7760 }, { "epoch": 1.9861963190184049, "grad_norm": 4.860601425170898, "learning_rate": 7.542380157944328e-06, "loss": 0.0956, "step": 7770 }, { "epoch": 1.9887525562372188, "grad_norm": 14.664186477661133, "learning_rate": 7.534692773187431e-06, "loss": 0.1399, "step": 7780 }, { "epoch": 1.9913087934560327, "grad_norm": 4.663970470428467, "learning_rate": 7.526997316038654e-06, "loss": 0.0859, "step": 7790 }, { "epoch": 1.9938650306748467, "grad_norm": 0.8506277203559875, "learning_rate": 7.519293811006187e-06, "loss": 0.136, "step": 7800 }, { "epoch": 1.9964212678936604, "grad_norm": 5.4818644523620605, "learning_rate": 7.511582282623865e-06, "loss": 0.0835, "step": 7810 }, { "epoch": 1.9989775051124745, "grad_norm": 5.375784397125244, "learning_rate": 7.503862755451059e-06, "loss": 0.1255, "step": 7820 }, { "epoch": 2.0015337423312882, "grad_norm": 1.3432427644729614, "learning_rate": 7.4961352540726274e-06, "loss": 0.0644, "step": 7830 }, { "epoch": 2.0040899795501024, "grad_norm": 8.615415573120117, "learning_rate": 7.4883998030988136e-06, "loss": 0.1136, "step": 7840 }, { "epoch": 2.006646216768916, "grad_norm": 7.158458232879639, "learning_rate": 7.480656427165187e-06, "loss": 0.09, "step": 7850 }, { "epoch": 2.0092024539877302, "grad_norm": 8.66907024383545, "learning_rate": 7.47290515093255e-06, "loss": 0.0703, "step": 7860 }, { "epoch": 2.011758691206544, "grad_norm": 25.180543899536133, "learning_rate": 7.465145999086874e-06, "loss": 0.1314, "step": 7870 }, { "epoch": 2.014314928425358, "grad_norm": 0.7909819483757019, "learning_rate": 7.457378996339201e-06, "loss": 0.0538, "step": 7880 }, { "epoch": 2.016871165644172, "grad_norm": 0.6326285600662231, "learning_rate": 7.4496041674255834e-06, "loss": 0.0545, "step": 7890 }, { "epoch": 2.0194274028629855, "grad_norm": 6.992855548858643, "learning_rate": 7.441821537107e-06, "loss": 0.0811, "step": 7900 }, { "epoch": 2.0219836400817996, "grad_norm": 4.8581743240356445, "learning_rate": 7.434031130169268e-06, "loss": 0.0897, "step": 7910 }, { "epoch": 2.0245398773006134, "grad_norm": 0.3861932158470154, "learning_rate": 7.42623297142298e-06, "loss": 0.0795, "step": 7920 }, { "epoch": 2.0270961145194275, "grad_norm": 4.8933424949646, "learning_rate": 7.418427085703406e-06, "loss": 0.0746, "step": 7930 }, { "epoch": 2.029652351738241, "grad_norm": 7.480552673339844, "learning_rate": 7.410613497870432e-06, "loss": 0.0816, "step": 7940 }, { "epoch": 2.0322085889570554, "grad_norm": 5.1835126876831055, "learning_rate": 7.402792232808474e-06, "loss": 0.1248, "step": 7950 }, { "epoch": 2.034764826175869, "grad_norm": 0.7514427304267883, "learning_rate": 7.394963315426393e-06, "loss": 0.077, "step": 7960 }, { "epoch": 2.037321063394683, "grad_norm": 5.26667594909668, "learning_rate": 7.387126770657423e-06, "loss": 0.0694, "step": 7970 }, { "epoch": 2.039877300613497, "grad_norm": 8.795965194702148, "learning_rate": 7.379282623459093e-06, "loss": 0.0845, "step": 7980 }, { "epoch": 2.0424335378323106, "grad_norm": 5.604037284851074, "learning_rate": 7.371430898813137e-06, "loss": 0.0753, "step": 7990 }, { "epoch": 2.044989775051125, "grad_norm": 2.7282750606536865, "learning_rate": 7.363571621725427e-06, "loss": 0.031, "step": 8000 }, { "epoch": 2.0475460122699385, "grad_norm": 5.139689922332764, "learning_rate": 7.355704817225886e-06, "loss": 0.1, "step": 8010 }, { "epoch": 2.0501022494887526, "grad_norm": 7.020951271057129, "learning_rate": 7.347830510368409e-06, "loss": 0.0798, "step": 8020 }, { "epoch": 2.0526584867075663, "grad_norm": 2.1761505603790283, "learning_rate": 7.3399487262307866e-06, "loss": 0.0768, "step": 8030 }, { "epoch": 2.0552147239263805, "grad_norm": 5.854605197906494, "learning_rate": 7.332059489914619e-06, "loss": 0.0601, "step": 8040 }, { "epoch": 2.057770961145194, "grad_norm": 0.5980772376060486, "learning_rate": 7.324162826545245e-06, "loss": 0.0586, "step": 8050 }, { "epoch": 2.0603271983640083, "grad_norm": 4.8323235511779785, "learning_rate": 7.316258761271651e-06, "loss": 0.0578, "step": 8060 }, { "epoch": 2.062883435582822, "grad_norm": 8.097885131835938, "learning_rate": 7.308347319266401e-06, "loss": 0.0469, "step": 8070 }, { "epoch": 2.065439672801636, "grad_norm": 5.477297782897949, "learning_rate": 7.300428525725549e-06, "loss": 0.0597, "step": 8080 }, { "epoch": 2.06799591002045, "grad_norm": 2.20831298828125, "learning_rate": 7.2925024058685664e-06, "loss": 0.0512, "step": 8090 }, { "epoch": 2.0705521472392636, "grad_norm": 6.855231761932373, "learning_rate": 7.2845689849382514e-06, "loss": 0.0787, "step": 8100 }, { "epoch": 2.0731083844580778, "grad_norm": 9.492572784423828, "learning_rate": 7.27662828820066e-06, "loss": 0.104, "step": 8110 }, { "epoch": 2.0756646216768915, "grad_norm": 7.048098087310791, "learning_rate": 7.268680340945016e-06, "loss": 0.1052, "step": 8120 }, { "epoch": 2.0782208588957056, "grad_norm": 7.1551594734191895, "learning_rate": 7.260725168483634e-06, "loss": 0.0538, "step": 8130 }, { "epoch": 2.0807770961145193, "grad_norm": 3.1020727157592773, "learning_rate": 7.252762796151843e-06, "loss": 0.0923, "step": 8140 }, { "epoch": 2.0833333333333335, "grad_norm": 6.914649963378906, "learning_rate": 7.2447932493079e-06, "loss": 0.0458, "step": 8150 }, { "epoch": 2.085889570552147, "grad_norm": 1.941754698753357, "learning_rate": 7.236816553332909e-06, "loss": 0.0847, "step": 8160 }, { "epoch": 2.0884458077709613, "grad_norm": 3.0333592891693115, "learning_rate": 7.228832733630742e-06, "loss": 0.0318, "step": 8170 }, { "epoch": 2.091002044989775, "grad_norm": 2.743631601333618, "learning_rate": 7.220841815627966e-06, "loss": 0.0935, "step": 8180 }, { "epoch": 2.0935582822085887, "grad_norm": 6.149184226989746, "learning_rate": 7.212843824773745e-06, "loss": 0.1325, "step": 8190 }, { "epoch": 2.096114519427403, "grad_norm": 9.376814842224121, "learning_rate": 7.204838786539772e-06, "loss": 0.0287, "step": 8200 }, { "epoch": 2.0986707566462166, "grad_norm": 6.627695560455322, "learning_rate": 7.196826726420185e-06, "loss": 0.1187, "step": 8210 }, { "epoch": 2.1012269938650308, "grad_norm": 7.894048690795898, "learning_rate": 7.188807669931486e-06, "loss": 0.078, "step": 8220 }, { "epoch": 2.1037832310838445, "grad_norm": 6.502098083496094, "learning_rate": 7.180781642612453e-06, "loss": 0.0647, "step": 8230 }, { "epoch": 2.1063394683026586, "grad_norm": 5.958528995513916, "learning_rate": 7.172748670024073e-06, "loss": 0.0945, "step": 8240 }, { "epoch": 2.1088957055214723, "grad_norm": 0.8460894823074341, "learning_rate": 7.164708777749445e-06, "loss": 0.0558, "step": 8250 }, { "epoch": 2.1114519427402865, "grad_norm": 5.054858207702637, "learning_rate": 7.1566619913937105e-06, "loss": 0.1047, "step": 8260 }, { "epoch": 2.1140081799591, "grad_norm": 9.798078536987305, "learning_rate": 7.148608336583961e-06, "loss": 0.0616, "step": 8270 }, { "epoch": 2.116564417177914, "grad_norm": 2.237877607345581, "learning_rate": 7.140547838969168e-06, "loss": 0.0827, "step": 8280 }, { "epoch": 2.119120654396728, "grad_norm": 0.3861876428127289, "learning_rate": 7.1324805242200956e-06, "loss": 0.0635, "step": 8290 }, { "epoch": 2.1216768916155417, "grad_norm": 6.713496685028076, "learning_rate": 7.1244064180292134e-06, "loss": 0.0663, "step": 8300 }, { "epoch": 2.124233128834356, "grad_norm": 5.54539155960083, "learning_rate": 7.116325546110628e-06, "loss": 0.0446, "step": 8310 }, { "epoch": 2.1267893660531696, "grad_norm": 5.177070617675781, "learning_rate": 7.108237934199983e-06, "loss": 0.0517, "step": 8320 }, { "epoch": 2.1293456032719837, "grad_norm": 0.7041372060775757, "learning_rate": 7.1001436080544e-06, "loss": 0.0289, "step": 8330 }, { "epoch": 2.1319018404907975, "grad_norm": 6.997579574584961, "learning_rate": 7.0920425934523705e-06, "loss": 0.0502, "step": 8340 }, { "epoch": 2.1344580777096116, "grad_norm": 9.049081802368164, "learning_rate": 7.083934916193698e-06, "loss": 0.0795, "step": 8350 }, { "epoch": 2.1370143149284253, "grad_norm": 3.9479804039001465, "learning_rate": 7.075820602099399e-06, "loss": 0.0659, "step": 8360 }, { "epoch": 2.1395705521472395, "grad_norm": 7.389666557312012, "learning_rate": 7.0676996770116294e-06, "loss": 0.0533, "step": 8370 }, { "epoch": 2.142126789366053, "grad_norm": 5.052390098571777, "learning_rate": 7.059572166793598e-06, "loss": 0.075, "step": 8380 }, { "epoch": 2.144683026584867, "grad_norm": 8.923999786376953, "learning_rate": 7.051438097329485e-06, "loss": 0.0782, "step": 8390 }, { "epoch": 2.147239263803681, "grad_norm": 9.955076217651367, "learning_rate": 7.043297494524364e-06, "loss": 0.0648, "step": 8400 }, { "epoch": 2.1497955010224947, "grad_norm": 14.11737060546875, "learning_rate": 7.03515038430411e-06, "loss": 0.0368, "step": 8410 }, { "epoch": 2.152351738241309, "grad_norm": 4.5228590965271, "learning_rate": 7.026996792615328e-06, "loss": 0.0758, "step": 8420 }, { "epoch": 2.1549079754601226, "grad_norm": 6.418432712554932, "learning_rate": 7.0188367454252624e-06, "loss": 0.0705, "step": 8430 }, { "epoch": 2.1574642126789367, "grad_norm": 5.669915676116943, "learning_rate": 7.010670268721718e-06, "loss": 0.1191, "step": 8440 }, { "epoch": 2.1600204498977504, "grad_norm": 5.414175033569336, "learning_rate": 7.002497388512971e-06, "loss": 0.0665, "step": 8450 }, { "epoch": 2.1625766871165646, "grad_norm": 3.9066929817199707, "learning_rate": 6.9943181308277e-06, "loss": 0.0625, "step": 8460 }, { "epoch": 2.1651329243353783, "grad_norm": 0.23331096768379211, "learning_rate": 6.986132521714888e-06, "loss": 0.0674, "step": 8470 }, { "epoch": 2.1676891615541924, "grad_norm": 3.160121440887451, "learning_rate": 6.977940587243745e-06, "loss": 0.0834, "step": 8480 }, { "epoch": 2.170245398773006, "grad_norm": 4.351058483123779, "learning_rate": 6.969742353503635e-06, "loss": 0.0386, "step": 8490 }, { "epoch": 2.17280163599182, "grad_norm": 9.820882797241211, "learning_rate": 6.96153784660397e-06, "loss": 0.0672, "step": 8500 }, { "epoch": 2.175357873210634, "grad_norm": 15.702372550964355, "learning_rate": 6.9533270926741506e-06, "loss": 0.0749, "step": 8510 }, { "epoch": 2.1779141104294477, "grad_norm": 5.2401509284973145, "learning_rate": 6.945110117863469e-06, "loss": 0.0703, "step": 8520 }, { "epoch": 2.180470347648262, "grad_norm": 5.104111194610596, "learning_rate": 6.936886948341029e-06, "loss": 0.091, "step": 8530 }, { "epoch": 2.1830265848670756, "grad_norm": 12.119421005249023, "learning_rate": 6.928657610295666e-06, "loss": 0.045, "step": 8540 }, { "epoch": 2.1855828220858897, "grad_norm": 3.9862372875213623, "learning_rate": 6.920422129935859e-06, "loss": 0.0863, "step": 8550 }, { "epoch": 2.1881390593047034, "grad_norm": 8.157721519470215, "learning_rate": 6.912180533489645e-06, "loss": 0.0649, "step": 8560 }, { "epoch": 2.1906952965235176, "grad_norm": 12.727168083190918, "learning_rate": 6.903932847204548e-06, "loss": 0.0839, "step": 8570 }, { "epoch": 2.1932515337423313, "grad_norm": 2.1506459712982178, "learning_rate": 6.895679097347476e-06, "loss": 0.0704, "step": 8580 }, { "epoch": 2.195807770961145, "grad_norm": 3.3470993041992188, "learning_rate": 6.887419310204657e-06, "loss": 0.0637, "step": 8590 }, { "epoch": 2.198364008179959, "grad_norm": 4.4683356285095215, "learning_rate": 6.879153512081542e-06, "loss": 0.0556, "step": 8600 }, { "epoch": 2.200920245398773, "grad_norm": 5.225627899169922, "learning_rate": 6.870881729302728e-06, "loss": 0.0467, "step": 8610 }, { "epoch": 2.203476482617587, "grad_norm": 4.654438018798828, "learning_rate": 6.862603988211866e-06, "loss": 0.117, "step": 8620 }, { "epoch": 2.2060327198364007, "grad_norm": 5.765674114227295, "learning_rate": 6.854320315171591e-06, "loss": 0.0833, "step": 8630 }, { "epoch": 2.208588957055215, "grad_norm": 4.424642086029053, "learning_rate": 6.8460307365634225e-06, "loss": 0.0879, "step": 8640 }, { "epoch": 2.2111451942740286, "grad_norm": 0.613856315612793, "learning_rate": 6.837735278787694e-06, "loss": 0.0309, "step": 8650 }, { "epoch": 2.2137014314928427, "grad_norm": 6.912576675415039, "learning_rate": 6.829433968263458e-06, "loss": 0.0571, "step": 8660 }, { "epoch": 2.2162576687116564, "grad_norm": 4.189841270446777, "learning_rate": 6.821126831428408e-06, "loss": 0.0856, "step": 8670 }, { "epoch": 2.21881390593047, "grad_norm": 2.174213171005249, "learning_rate": 6.8128138947387966e-06, "loss": 0.0573, "step": 8680 }, { "epoch": 2.2213701431492843, "grad_norm": 9.983304023742676, "learning_rate": 6.80449518466934e-06, "loss": 0.0715, "step": 8690 }, { "epoch": 2.223926380368098, "grad_norm": 5.989863872528076, "learning_rate": 6.796170727713147e-06, "loss": 0.0759, "step": 8700 }, { "epoch": 2.226482617586912, "grad_norm": 2.114159345626831, "learning_rate": 6.787840550381628e-06, "loss": 0.0244, "step": 8710 }, { "epoch": 2.229038854805726, "grad_norm": 7.211903095245361, "learning_rate": 6.779504679204412e-06, "loss": 0.0973, "step": 8720 }, { "epoch": 2.23159509202454, "grad_norm": 5.518008232116699, "learning_rate": 6.771163140729257e-06, "loss": 0.1189, "step": 8730 }, { "epoch": 2.2341513292433537, "grad_norm": 7.159096717834473, "learning_rate": 6.762815961521976e-06, "loss": 0.0472, "step": 8740 }, { "epoch": 2.236707566462168, "grad_norm": 5.628960132598877, "learning_rate": 6.754463168166342e-06, "loss": 0.0646, "step": 8750 }, { "epoch": 2.2392638036809815, "grad_norm": 2.7872536182403564, "learning_rate": 6.746104787264011e-06, "loss": 0.0603, "step": 8760 }, { "epoch": 2.2418200408997957, "grad_norm": 4.476940155029297, "learning_rate": 6.737740845434432e-06, "loss": 0.0635, "step": 8770 }, { "epoch": 2.2443762781186094, "grad_norm": 4.041048049926758, "learning_rate": 6.7293713693147635e-06, "loss": 0.0462, "step": 8780 }, { "epoch": 2.246932515337423, "grad_norm": 3.1163430213928223, "learning_rate": 6.720996385559793e-06, "loss": 0.0552, "step": 8790 }, { "epoch": 2.2494887525562373, "grad_norm": 5.569158554077148, "learning_rate": 6.712615920841843e-06, "loss": 0.0689, "step": 8800 }, { "epoch": 2.252044989775051, "grad_norm": 5.894398212432861, "learning_rate": 6.704230001850696e-06, "loss": 0.0531, "step": 8810 }, { "epoch": 2.254601226993865, "grad_norm": 6.340700149536133, "learning_rate": 6.695838655293505e-06, "loss": 0.0568, "step": 8820 }, { "epoch": 2.257157464212679, "grad_norm": 4.014859199523926, "learning_rate": 6.6874419078947076e-06, "loss": 0.0613, "step": 8830 }, { "epoch": 2.259713701431493, "grad_norm": 10.440202713012695, "learning_rate": 6.679039786395936e-06, "loss": 0.0497, "step": 8840 }, { "epoch": 2.2622699386503067, "grad_norm": 9.94273567199707, "learning_rate": 6.6706323175559504e-06, "loss": 0.0866, "step": 8850 }, { "epoch": 2.264826175869121, "grad_norm": 1.108022689819336, "learning_rate": 6.662219528150529e-06, "loss": 0.0504, "step": 8860 }, { "epoch": 2.2673824130879345, "grad_norm": 3.8868322372436523, "learning_rate": 6.653801444972398e-06, "loss": 0.0675, "step": 8870 }, { "epoch": 2.2699386503067487, "grad_norm": 3.9967801570892334, "learning_rate": 6.64537809483115e-06, "loss": 0.0846, "step": 8880 }, { "epoch": 2.2724948875255624, "grad_norm": 4.602581024169922, "learning_rate": 6.63694950455314e-06, "loss": 0.0553, "step": 8890 }, { "epoch": 2.275051124744376, "grad_norm": 0.8123490810394287, "learning_rate": 6.628515700981424e-06, "loss": 0.0463, "step": 8900 }, { "epoch": 2.2776073619631902, "grad_norm": 0.3097759783267975, "learning_rate": 6.620076710975648e-06, "loss": 0.0754, "step": 8910 }, { "epoch": 2.280163599182004, "grad_norm": 4.588552474975586, "learning_rate": 6.611632561411987e-06, "loss": 0.078, "step": 8920 }, { "epoch": 2.282719836400818, "grad_norm": 7.720053195953369, "learning_rate": 6.603183279183041e-06, "loss": 0.0946, "step": 8930 }, { "epoch": 2.285276073619632, "grad_norm": 6.0510945320129395, "learning_rate": 6.594728891197758e-06, "loss": 0.0565, "step": 8940 }, { "epoch": 2.287832310838446, "grad_norm": 9.991423606872559, "learning_rate": 6.586269424381349e-06, "loss": 0.0585, "step": 8950 }, { "epoch": 2.2903885480572597, "grad_norm": 0.4170069098472595, "learning_rate": 6.577804905675196e-06, "loss": 0.0552, "step": 8960 }, { "epoch": 2.292944785276074, "grad_norm": 4.831167221069336, "learning_rate": 6.569335362036773e-06, "loss": 0.0477, "step": 8970 }, { "epoch": 2.2955010224948875, "grad_norm": 2.8550946712493896, "learning_rate": 6.560860820439557e-06, "loss": 0.0386, "step": 8980 }, { "epoch": 2.2980572597137012, "grad_norm": 6.75853967666626, "learning_rate": 6.55238130787294e-06, "loss": 0.0722, "step": 8990 }, { "epoch": 2.3006134969325154, "grad_norm": 7.144791603088379, "learning_rate": 6.543896851342148e-06, "loss": 0.0713, "step": 9000 }, { "epoch": 2.303169734151329, "grad_norm": 0.21670909225940704, "learning_rate": 6.535407477868151e-06, "loss": 0.0809, "step": 9010 }, { "epoch": 2.3057259713701432, "grad_norm": 4.624710559844971, "learning_rate": 6.526913214487578e-06, "loss": 0.0727, "step": 9020 }, { "epoch": 2.308282208588957, "grad_norm": 0.8301081657409668, "learning_rate": 6.518414088252632e-06, "loss": 0.0522, "step": 9030 }, { "epoch": 2.310838445807771, "grad_norm": 0.1696474701166153, "learning_rate": 6.509910126231003e-06, "loss": 0.0482, "step": 9040 }, { "epoch": 2.313394683026585, "grad_norm": 2.2531793117523193, "learning_rate": 6.501401355505782e-06, "loss": 0.0557, "step": 9050 }, { "epoch": 2.315950920245399, "grad_norm": 4.994187831878662, "learning_rate": 6.492887803175374e-06, "loss": 0.0938, "step": 9060 }, { "epoch": 2.3185071574642127, "grad_norm": 6.800015926361084, "learning_rate": 6.484369496353412e-06, "loss": 0.061, "step": 9070 }, { "epoch": 2.3210633946830264, "grad_norm": 10.822134017944336, "learning_rate": 6.4758464621686715e-06, "loss": 0.0584, "step": 9080 }, { "epoch": 2.3236196319018405, "grad_norm": 2.0743789672851562, "learning_rate": 6.467318727764983e-06, "loss": 0.0489, "step": 9090 }, { "epoch": 2.326175869120654, "grad_norm": 6.299098968505859, "learning_rate": 6.458786320301146e-06, "loss": 0.0832, "step": 9100 }, { "epoch": 2.3287321063394684, "grad_norm": 0.3529964089393616, "learning_rate": 6.450249266950846e-06, "loss": 0.0281, "step": 9110 }, { "epoch": 2.331288343558282, "grad_norm": 1.3568120002746582, "learning_rate": 6.4417075949025575e-06, "loss": 0.0326, "step": 9120 }, { "epoch": 2.3338445807770962, "grad_norm": 5.631685256958008, "learning_rate": 6.43316133135947e-06, "loss": 0.0618, "step": 9130 }, { "epoch": 2.33640081799591, "grad_norm": 0.5822303295135498, "learning_rate": 6.4246105035393965e-06, "loss": 0.0483, "step": 9140 }, { "epoch": 2.338957055214724, "grad_norm": 0.39102593064308167, "learning_rate": 6.416055138674682e-06, "loss": 0.0429, "step": 9150 }, { "epoch": 2.341513292433538, "grad_norm": 4.372485637664795, "learning_rate": 6.4074952640121226e-06, "loss": 0.0795, "step": 9160 }, { "epoch": 2.3440695296523515, "grad_norm": 0.6967136263847351, "learning_rate": 6.398930906812877e-06, "loss": 0.0307, "step": 9170 }, { "epoch": 2.3466257668711656, "grad_norm": 5.449244022369385, "learning_rate": 6.390362094352382e-06, "loss": 0.0729, "step": 9180 }, { "epoch": 2.34918200408998, "grad_norm": 5.0675482749938965, "learning_rate": 6.3817888539202595e-06, "loss": 0.0707, "step": 9190 }, { "epoch": 2.3517382413087935, "grad_norm": 2.9963107109069824, "learning_rate": 6.373211212820237e-06, "loss": 0.0545, "step": 9200 }, { "epoch": 2.354294478527607, "grad_norm": 3.9092769622802734, "learning_rate": 6.364629198370054e-06, "loss": 0.0281, "step": 9210 }, { "epoch": 2.3568507157464214, "grad_norm": 8.632110595703125, "learning_rate": 6.3560428379013795e-06, "loss": 0.0994, "step": 9220 }, { "epoch": 2.359406952965235, "grad_norm": 3.0046439170837402, "learning_rate": 6.3474521587597234e-06, "loss": 0.0505, "step": 9230 }, { "epoch": 2.361963190184049, "grad_norm": 2.9390039443969727, "learning_rate": 6.3388571883043505e-06, "loss": 0.0561, "step": 9240 }, { "epoch": 2.364519427402863, "grad_norm": 7.934990406036377, "learning_rate": 6.330257953908192e-06, "loss": 0.0442, "step": 9250 }, { "epoch": 2.367075664621677, "grad_norm": 3.6421031951904297, "learning_rate": 6.321654482957756e-06, "loss": 0.0761, "step": 9260 }, { "epoch": 2.3696319018404908, "grad_norm": 2.067728042602539, "learning_rate": 6.313046802853047e-06, "loss": 0.0361, "step": 9270 }, { "epoch": 2.372188139059305, "grad_norm": 0.5931568741798401, "learning_rate": 6.304434941007473e-06, "loss": 0.0441, "step": 9280 }, { "epoch": 2.3747443762781186, "grad_norm": 7.320766925811768, "learning_rate": 6.295818924847761e-06, "loss": 0.0736, "step": 9290 }, { "epoch": 2.3773006134969323, "grad_norm": 5.4987688064575195, "learning_rate": 6.2871987818138626e-06, "loss": 0.0694, "step": 9300 }, { "epoch": 2.3798568507157465, "grad_norm": 7.312312602996826, "learning_rate": 6.2785745393588815e-06, "loss": 0.0698, "step": 9310 }, { "epoch": 2.38241308793456, "grad_norm": 8.894052505493164, "learning_rate": 6.2699462249489715e-06, "loss": 0.0651, "step": 9320 }, { "epoch": 2.3849693251533743, "grad_norm": 0.4445403516292572, "learning_rate": 6.261313866063257e-06, "loss": 0.0271, "step": 9330 }, { "epoch": 2.387525562372188, "grad_norm": 3.842348575592041, "learning_rate": 6.252677490193739e-06, "loss": 0.0625, "step": 9340 }, { "epoch": 2.390081799591002, "grad_norm": 3.183258295059204, "learning_rate": 6.244037124845217e-06, "loss": 0.0454, "step": 9350 }, { "epoch": 2.392638036809816, "grad_norm": 3.39320969581604, "learning_rate": 6.235392797535193e-06, "loss": 0.0615, "step": 9360 }, { "epoch": 2.39519427402863, "grad_norm": 10.20765495300293, "learning_rate": 6.226744535793788e-06, "loss": 0.0808, "step": 9370 }, { "epoch": 2.3977505112474438, "grad_norm": 0.8380181789398193, "learning_rate": 6.2180923671636524e-06, "loss": 0.0485, "step": 9380 }, { "epoch": 2.4003067484662575, "grad_norm": 0.444444477558136, "learning_rate": 6.20943631919988e-06, "loss": 0.02, "step": 9390 }, { "epoch": 2.4028629856850716, "grad_norm": 8.41584587097168, "learning_rate": 6.200776419469918e-06, "loss": 0.054, "step": 9400 }, { "epoch": 2.4054192229038853, "grad_norm": 5.808600425720215, "learning_rate": 6.192112695553483e-06, "loss": 0.0671, "step": 9410 }, { "epoch": 2.4079754601226995, "grad_norm": 3.8908348083496094, "learning_rate": 6.183445175042466e-06, "loss": 0.0618, "step": 9420 }, { "epoch": 2.410531697341513, "grad_norm": 4.925373077392578, "learning_rate": 6.174773885540855e-06, "loss": 0.0512, "step": 9430 }, { "epoch": 2.4130879345603273, "grad_norm": 0.03155489265918732, "learning_rate": 6.166098854664638e-06, "loss": 0.0356, "step": 9440 }, { "epoch": 2.415644171779141, "grad_norm": 0.08001308143138885, "learning_rate": 6.157420110041719e-06, "loss": 0.031, "step": 9450 }, { "epoch": 2.418200408997955, "grad_norm": 6.34970760345459, "learning_rate": 6.1487376793118285e-06, "loss": 0.0595, "step": 9460 }, { "epoch": 2.420756646216769, "grad_norm": 15.385396957397461, "learning_rate": 6.140051590126439e-06, "loss": 0.0452, "step": 9470 }, { "epoch": 2.4233128834355826, "grad_norm": 5.20993185043335, "learning_rate": 6.131361870148672e-06, "loss": 0.0745, "step": 9480 }, { "epoch": 2.4258691206543967, "grad_norm": 4.343068599700928, "learning_rate": 6.1226685470532125e-06, "loss": 0.0639, "step": 9490 }, { "epoch": 2.4284253578732105, "grad_norm": 4.774913787841797, "learning_rate": 6.113971648526222e-06, "loss": 0.0416, "step": 9500 }, { "epoch": 2.4309815950920246, "grad_norm": 0.5611134767532349, "learning_rate": 6.105271202265246e-06, "loss": 0.0636, "step": 9510 }, { "epoch": 2.4335378323108383, "grad_norm": 6.5504279136657715, "learning_rate": 6.096567235979133e-06, "loss": 0.0537, "step": 9520 }, { "epoch": 2.4360940695296525, "grad_norm": 0.9646693468093872, "learning_rate": 6.0878597773879376e-06, "loss": 0.0512, "step": 9530 }, { "epoch": 2.438650306748466, "grad_norm": 4.056527614593506, "learning_rate": 6.079148854222839e-06, "loss": 0.0451, "step": 9540 }, { "epoch": 2.4412065439672803, "grad_norm": 5.754093170166016, "learning_rate": 6.07043449422605e-06, "loss": 0.0635, "step": 9550 }, { "epoch": 2.443762781186094, "grad_norm": 7.742176532745361, "learning_rate": 6.061716725150727e-06, "loss": 0.0305, "step": 9560 }, { "epoch": 2.4463190184049077, "grad_norm": 7.218969345092773, "learning_rate": 6.052995574760887e-06, "loss": 0.0615, "step": 9570 }, { "epoch": 2.448875255623722, "grad_norm": 3.2471063137054443, "learning_rate": 6.044271070831312e-06, "loss": 0.0568, "step": 9580 }, { "epoch": 2.451431492842536, "grad_norm": 0.23956027626991272, "learning_rate": 6.035543241147469e-06, "loss": 0.0468, "step": 9590 }, { "epoch": 2.4539877300613497, "grad_norm": 0.8415837287902832, "learning_rate": 6.026812113505409e-06, "loss": 0.0366, "step": 9600 }, { "epoch": 2.4565439672801634, "grad_norm": 4.5563154220581055, "learning_rate": 6.018077715711695e-06, "loss": 0.0611, "step": 9610 }, { "epoch": 2.4591002044989776, "grad_norm": 9.952324867248535, "learning_rate": 6.009340075583299e-06, "loss": 0.0504, "step": 9620 }, { "epoch": 2.4616564417177913, "grad_norm": 3.28727126121521, "learning_rate": 6.00059922094752e-06, "loss": 0.0563, "step": 9630 }, { "epoch": 2.4642126789366054, "grad_norm": 4.564260959625244, "learning_rate": 5.991855179641896e-06, "loss": 0.0354, "step": 9640 }, { "epoch": 2.466768916155419, "grad_norm": 5.473964214324951, "learning_rate": 5.983107979514112e-06, "loss": 0.0389, "step": 9650 }, { "epoch": 2.4693251533742333, "grad_norm": 3.674219846725464, "learning_rate": 5.974357648421916e-06, "loss": 0.0745, "step": 9660 }, { "epoch": 2.471881390593047, "grad_norm": 0.6603105068206787, "learning_rate": 5.965604214233022e-06, "loss": 0.0572, "step": 9670 }, { "epoch": 2.474437627811861, "grad_norm": 4.627801895141602, "learning_rate": 5.956847704825033e-06, "loss": 0.0395, "step": 9680 }, { "epoch": 2.476993865030675, "grad_norm": 2.601986885070801, "learning_rate": 5.94808814808534e-06, "loss": 0.0775, "step": 9690 }, { "epoch": 2.4795501022494886, "grad_norm": 5.2239460945129395, "learning_rate": 5.9393255719110455e-06, "loss": 0.057, "step": 9700 }, { "epoch": 2.4821063394683027, "grad_norm": 0.7189023494720459, "learning_rate": 5.9305600042088595e-06, "loss": 0.0669, "step": 9710 }, { "epoch": 2.4846625766871164, "grad_norm": 4.483514308929443, "learning_rate": 5.9217914728950286e-06, "loss": 0.0511, "step": 9720 }, { "epoch": 2.4872188139059306, "grad_norm": 1.7777493000030518, "learning_rate": 5.913020005895232e-06, "loss": 0.0491, "step": 9730 }, { "epoch": 2.4897750511247443, "grad_norm": 6.096456050872803, "learning_rate": 5.904245631144498e-06, "loss": 0.0772, "step": 9740 }, { "epoch": 2.4923312883435584, "grad_norm": 6.093538761138916, "learning_rate": 5.895468376587121e-06, "loss": 0.0738, "step": 9750 }, { "epoch": 2.494887525562372, "grad_norm": 0.5627290606498718, "learning_rate": 5.8866882701765605e-06, "loss": 0.0428, "step": 9760 }, { "epoch": 2.4974437627811863, "grad_norm": 2.499333143234253, "learning_rate": 5.877905339875363e-06, "loss": 0.0465, "step": 9770 }, { "epoch": 2.5, "grad_norm": 2.476902723312378, "learning_rate": 5.869119613655062e-06, "loss": 0.033, "step": 9780 }, { "epoch": 2.5025562372188137, "grad_norm": 3.85345458984375, "learning_rate": 5.860331119496106e-06, "loss": 0.0589, "step": 9790 }, { "epoch": 2.505112474437628, "grad_norm": 0.05737360939383507, "learning_rate": 5.851539885387748e-06, "loss": 0.0693, "step": 9800 }, { "epoch": 2.5076687116564416, "grad_norm": 5.509619235992432, "learning_rate": 5.8427459393279736e-06, "loss": 0.0514, "step": 9810 }, { "epoch": 2.5102249488752557, "grad_norm": 5.018087863922119, "learning_rate": 5.8339493093234025e-06, "loss": 0.0638, "step": 9820 }, { "epoch": 2.5127811860940694, "grad_norm": 5.845489501953125, "learning_rate": 5.825150023389203e-06, "loss": 0.0408, "step": 9830 }, { "epoch": 2.5153374233128836, "grad_norm": 3.2593860626220703, "learning_rate": 5.816348109549005e-06, "loss": 0.0141, "step": 9840 }, { "epoch": 2.5178936605316973, "grad_norm": 5.271510124206543, "learning_rate": 5.807543595834799e-06, "loss": 0.0526, "step": 9850 }, { "epoch": 2.5204498977505114, "grad_norm": 0.1252453476190567, "learning_rate": 5.798736510286866e-06, "loss": 0.0522, "step": 9860 }, { "epoch": 2.523006134969325, "grad_norm": 4.33157205581665, "learning_rate": 5.7899268809536705e-06, "loss": 0.0888, "step": 9870 }, { "epoch": 2.525562372188139, "grad_norm": 6.989223480224609, "learning_rate": 5.781114735891781e-06, "loss": 0.0413, "step": 9880 }, { "epoch": 2.528118609406953, "grad_norm": 4.28364896774292, "learning_rate": 5.772300103165777e-06, "loss": 0.0438, "step": 9890 }, { "epoch": 2.530674846625767, "grad_norm": 0.42973408102989197, "learning_rate": 5.763483010848161e-06, "loss": 0.0537, "step": 9900 }, { "epoch": 2.533231083844581, "grad_norm": 3.0371270179748535, "learning_rate": 5.7546634870192695e-06, "loss": 0.0482, "step": 9910 }, { "epoch": 2.5357873210633946, "grad_norm": 10.468814849853516, "learning_rate": 5.745841559767182e-06, "loss": 0.0593, "step": 9920 }, { "epoch": 2.5383435582822087, "grad_norm": 1.1547623872756958, "learning_rate": 5.737017257187634e-06, "loss": 0.0457, "step": 9930 }, { "epoch": 2.5408997955010224, "grad_norm": 5.563620567321777, "learning_rate": 5.728190607383921e-06, "loss": 0.0876, "step": 9940 }, { "epoch": 2.5434560327198366, "grad_norm": 1.5379348993301392, "learning_rate": 5.719361638466819e-06, "loss": 0.0441, "step": 9950 }, { "epoch": 2.5460122699386503, "grad_norm": 4.261902809143066, "learning_rate": 5.7105303785544894e-06, "loss": 0.0243, "step": 9960 }, { "epoch": 2.548568507157464, "grad_norm": 5.261180400848389, "learning_rate": 5.7016968557723874e-06, "loss": 0.0309, "step": 9970 }, { "epoch": 2.551124744376278, "grad_norm": 2.3447492122650146, "learning_rate": 5.692861098253174e-06, "loss": 0.0348, "step": 9980 }, { "epoch": 2.5536809815950923, "grad_norm": 2.6790072917938232, "learning_rate": 5.684023134136634e-06, "loss": 0.0353, "step": 9990 }, { "epoch": 2.556237218813906, "grad_norm": 3.520054817199707, "learning_rate": 5.67518299156957e-06, "loss": 0.0852, "step": 10000 }, { "epoch": 2.5587934560327197, "grad_norm": 4.114648342132568, "learning_rate": 5.66634069870573e-06, "loss": 0.0708, "step": 10010 }, { "epoch": 2.561349693251534, "grad_norm": 1.8037816286087036, "learning_rate": 5.657496283705708e-06, "loss": 0.0496, "step": 10020 }, { "epoch": 2.5639059304703475, "grad_norm": 4.1163716316223145, "learning_rate": 5.648649774736855e-06, "loss": 0.0555, "step": 10030 }, { "epoch": 2.5664621676891617, "grad_norm": 3.350024700164795, "learning_rate": 5.639801199973191e-06, "loss": 0.0262, "step": 10040 }, { "epoch": 2.5690184049079754, "grad_norm": 7.735722541809082, "learning_rate": 5.630950587595319e-06, "loss": 0.0463, "step": 10050 }, { "epoch": 2.571574642126789, "grad_norm": 4.3258538246154785, "learning_rate": 5.622097965790325e-06, "loss": 0.0553, "step": 10060 }, { "epoch": 2.5741308793456033, "grad_norm": 4.328603744506836, "learning_rate": 5.6132433627517005e-06, "loss": 0.0632, "step": 10070 }, { "epoch": 2.5766871165644174, "grad_norm": 4.42746114730835, "learning_rate": 5.6043868066792415e-06, "loss": 0.0503, "step": 10080 }, { "epoch": 2.579243353783231, "grad_norm": 3.8148934841156006, "learning_rate": 5.595528325778968e-06, "loss": 0.0607, "step": 10090 }, { "epoch": 2.581799591002045, "grad_norm": 3.635321617126465, "learning_rate": 5.58666794826303e-06, "loss": 0.0448, "step": 10100 }, { "epoch": 2.584355828220859, "grad_norm": 0.01259413082152605, "learning_rate": 5.577805702349614e-06, "loss": 0.0408, "step": 10110 }, { "epoch": 2.5869120654396727, "grad_norm": 3.7854018211364746, "learning_rate": 5.568941616262861e-06, "loss": 0.0585, "step": 10120 }, { "epoch": 2.589468302658487, "grad_norm": 1.9844086170196533, "learning_rate": 5.5600757182327695e-06, "loss": 0.0263, "step": 10130 }, { "epoch": 2.5920245398773005, "grad_norm": 5.477379322052002, "learning_rate": 5.5512080364951105e-06, "loss": 0.0553, "step": 10140 }, { "epoch": 2.5945807770961147, "grad_norm": 6.039186954498291, "learning_rate": 5.542338599291335e-06, "loss": 0.0379, "step": 10150 }, { "epoch": 2.5971370143149284, "grad_norm": 0.8108224272727966, "learning_rate": 5.533467434868486e-06, "loss": 0.0534, "step": 10160 }, { "epoch": 2.5996932515337425, "grad_norm": 4.296036243438721, "learning_rate": 5.524594571479104e-06, "loss": 0.036, "step": 10170 }, { "epoch": 2.6022494887525562, "grad_norm": 3.003478765487671, "learning_rate": 5.515720037381144e-06, "loss": 0.0471, "step": 10180 }, { "epoch": 2.60480572597137, "grad_norm": 4.551526069641113, "learning_rate": 5.50684386083788e-06, "loss": 0.0675, "step": 10190 }, { "epoch": 2.607361963190184, "grad_norm": 2.661856174468994, "learning_rate": 5.497966070117816e-06, "loss": 0.0298, "step": 10200 }, { "epoch": 2.609918200408998, "grad_norm": 3.3979713916778564, "learning_rate": 5.4890866934946e-06, "loss": 0.0422, "step": 10210 }, { "epoch": 2.612474437627812, "grad_norm": 5.5186896324157715, "learning_rate": 5.480205759246926e-06, "loss": 0.0471, "step": 10220 }, { "epoch": 2.6150306748466257, "grad_norm": 3.5918192863464355, "learning_rate": 5.471323295658455e-06, "loss": 0.0692, "step": 10230 }, { "epoch": 2.61758691206544, "grad_norm": 4.837007999420166, "learning_rate": 5.462439331017711e-06, "loss": 0.0464, "step": 10240 }, { "epoch": 2.6201431492842535, "grad_norm": 1.8546375036239624, "learning_rate": 5.453553893618003e-06, "loss": 0.0397, "step": 10250 }, { "epoch": 2.6226993865030677, "grad_norm": 7.079483985900879, "learning_rate": 5.44466701175733e-06, "loss": 0.0308, "step": 10260 }, { "epoch": 2.6252556237218814, "grad_norm": 0.1995091438293457, "learning_rate": 5.435778713738292e-06, "loss": 0.0247, "step": 10270 }, { "epoch": 2.627811860940695, "grad_norm": 5.363643169403076, "learning_rate": 5.426889027867997e-06, "loss": 0.0418, "step": 10280 }, { "epoch": 2.6303680981595092, "grad_norm": 1.1156824827194214, "learning_rate": 5.417997982457974e-06, "loss": 0.0631, "step": 10290 }, { "epoch": 2.6329243353783234, "grad_norm": 0.43824976682662964, "learning_rate": 5.409105605824082e-06, "loss": 0.0433, "step": 10300 }, { "epoch": 2.635480572597137, "grad_norm": 0.8822130560874939, "learning_rate": 5.400211926286421e-06, "loss": 0.0247, "step": 10310 }, { "epoch": 2.638036809815951, "grad_norm": 3.7047805786132812, "learning_rate": 5.391316972169236e-06, "loss": 0.039, "step": 10320 }, { "epoch": 2.640593047034765, "grad_norm": 3.4349169731140137, "learning_rate": 5.382420771800836e-06, "loss": 0.0148, "step": 10330 }, { "epoch": 2.6431492842535786, "grad_norm": 4.191125392913818, "learning_rate": 5.373523353513498e-06, "loss": 0.0671, "step": 10340 }, { "epoch": 2.645705521472393, "grad_norm": 8.565727233886719, "learning_rate": 5.364624745643375e-06, "loss": 0.0534, "step": 10350 }, { "epoch": 2.6482617586912065, "grad_norm": 5.6679840087890625, "learning_rate": 5.35572497653041e-06, "loss": 0.0493, "step": 10360 }, { "epoch": 2.65081799591002, "grad_norm": 2.3933236598968506, "learning_rate": 5.346824074518246e-06, "loss": 0.05, "step": 10370 }, { "epoch": 2.6533742331288344, "grad_norm": 0.3358931839466095, "learning_rate": 5.337922067954136e-06, "loss": 0.0137, "step": 10380 }, { "epoch": 2.6559304703476485, "grad_norm": 2.881453275680542, "learning_rate": 5.329018985188841e-06, "loss": 0.0689, "step": 10390 }, { "epoch": 2.658486707566462, "grad_norm": 3.27288818359375, "learning_rate": 5.320114854576559e-06, "loss": 0.0297, "step": 10400 }, { "epoch": 2.661042944785276, "grad_norm": 3.823456287384033, "learning_rate": 5.3112097044748235e-06, "loss": 0.0607, "step": 10410 }, { "epoch": 2.66359918200409, "grad_norm": 3.608356475830078, "learning_rate": 5.302303563244413e-06, "loss": 0.0381, "step": 10420 }, { "epoch": 2.6661554192229038, "grad_norm": 1.3827208280563354, "learning_rate": 5.2933964592492614e-06, "loss": 0.05, "step": 10430 }, { "epoch": 2.668711656441718, "grad_norm": 0.04524281620979309, "learning_rate": 5.284488420856372e-06, "loss": 0.0268, "step": 10440 }, { "epoch": 2.6712678936605316, "grad_norm": 7.237791538238525, "learning_rate": 5.275579476435719e-06, "loss": 0.0239, "step": 10450 }, { "epoch": 2.6738241308793453, "grad_norm": 0.08604143559932709, "learning_rate": 5.2666696543601696e-06, "loss": 0.0819, "step": 10460 }, { "epoch": 2.6763803680981595, "grad_norm": 4.662979602813721, "learning_rate": 5.25775898300538e-06, "loss": 0.0539, "step": 10470 }, { "epoch": 2.6789366053169736, "grad_norm": 0.7715989947319031, "learning_rate": 5.248847490749711e-06, "loss": 0.0375, "step": 10480 }, { "epoch": 2.6814928425357873, "grad_norm": 5.067183971405029, "learning_rate": 5.239935205974145e-06, "loss": 0.0205, "step": 10490 }, { "epoch": 2.684049079754601, "grad_norm": 5.718189716339111, "learning_rate": 5.231022157062177e-06, "loss": 0.0898, "step": 10500 }, { "epoch": 2.686605316973415, "grad_norm": 14.444259643554688, "learning_rate": 5.222108372399746e-06, "loss": 0.043, "step": 10510 }, { "epoch": 2.689161554192229, "grad_norm": 0.2056499719619751, "learning_rate": 5.213193880375127e-06, "loss": 0.0639, "step": 10520 }, { "epoch": 2.691717791411043, "grad_norm": 0.02217238023877144, "learning_rate": 5.204278709378854e-06, "loss": 0.0177, "step": 10530 }, { "epoch": 2.6942740286298568, "grad_norm": 1.9676077365875244, "learning_rate": 5.195362887803617e-06, "loss": 0.0495, "step": 10540 }, { "epoch": 2.696830265848671, "grad_norm": 2.4766979217529297, "learning_rate": 5.186446444044184e-06, "loss": 0.0572, "step": 10550 }, { "epoch": 2.6993865030674846, "grad_norm": 0.9265226721763611, "learning_rate": 5.177529406497298e-06, "loss": 0.0192, "step": 10560 }, { "epoch": 2.7019427402862988, "grad_norm": 6.686746597290039, "learning_rate": 5.168611803561599e-06, "loss": 0.0632, "step": 10570 }, { "epoch": 2.7044989775051125, "grad_norm": 4.72622013092041, "learning_rate": 5.159693663637525e-06, "loss": 0.0499, "step": 10580 }, { "epoch": 2.707055214723926, "grad_norm": 4.173243045806885, "learning_rate": 5.150775015127224e-06, "loss": 0.0343, "step": 10590 }, { "epoch": 2.7096114519427403, "grad_norm": 0.10401232540607452, "learning_rate": 5.1418558864344645e-06, "loss": 0.0417, "step": 10600 }, { "epoch": 2.712167689161554, "grad_norm": 4.092282772064209, "learning_rate": 5.132936305964543e-06, "loss": 0.0335, "step": 10610 }, { "epoch": 2.714723926380368, "grad_norm": 8.394328117370605, "learning_rate": 5.1240163021241975e-06, "loss": 0.0785, "step": 10620 }, { "epoch": 2.717280163599182, "grad_norm": 3.676940441131592, "learning_rate": 5.1150959033215104e-06, "loss": 0.0382, "step": 10630 }, { "epoch": 2.719836400817996, "grad_norm": 0.23662449419498444, "learning_rate": 5.106175137965826e-06, "loss": 0.0467, "step": 10640 }, { "epoch": 2.7223926380368098, "grad_norm": 6.808079719543457, "learning_rate": 5.097254034467652e-06, "loss": 0.0348, "step": 10650 }, { "epoch": 2.724948875255624, "grad_norm": 0.04969576373696327, "learning_rate": 5.0883326212385775e-06, "loss": 0.031, "step": 10660 }, { "epoch": 2.7275051124744376, "grad_norm": 6.316954612731934, "learning_rate": 5.079410926691174e-06, "loss": 0.053, "step": 10670 }, { "epoch": 2.7300613496932513, "grad_norm": 4.699779987335205, "learning_rate": 5.07048897923891e-06, "loss": 0.0328, "step": 10680 }, { "epoch": 2.7326175869120655, "grad_norm": 2.899876117706299, "learning_rate": 5.061566807296062e-06, "loss": 0.0537, "step": 10690 }, { "epoch": 2.7351738241308796, "grad_norm": 1.7334074974060059, "learning_rate": 5.052644439277617e-06, "loss": 0.036, "step": 10700 }, { "epoch": 2.7377300613496933, "grad_norm": 0.5449509024620056, "learning_rate": 5.043721903599193e-06, "loss": 0.0199, "step": 10710 }, { "epoch": 2.740286298568507, "grad_norm": 0.7619210481643677, "learning_rate": 5.0347992286769324e-06, "loss": 0.0349, "step": 10720 }, { "epoch": 2.742842535787321, "grad_norm": 0.09413593262434006, "learning_rate": 5.025876442927429e-06, "loss": 0.0579, "step": 10730 }, { "epoch": 2.745398773006135, "grad_norm": 2.7584242820739746, "learning_rate": 5.016953574767629e-06, "loss": 0.0824, "step": 10740 }, { "epoch": 2.747955010224949, "grad_norm": 3.956817626953125, "learning_rate": 5.008030652614737e-06, "loss": 0.0461, "step": 10750 }, { "epoch": 2.7505112474437627, "grad_norm": 0.14918692409992218, "learning_rate": 4.99910770488613e-06, "loss": 0.0116, "step": 10760 }, { "epoch": 2.7530674846625764, "grad_norm": 4.674230098724365, "learning_rate": 4.990184759999271e-06, "loss": 0.0704, "step": 10770 }, { "epoch": 2.7556237218813906, "grad_norm": 4.550516128540039, "learning_rate": 4.981261846371612e-06, "loss": 0.0328, "step": 10780 }, { "epoch": 2.7581799591002047, "grad_norm": 5.67306661605835, "learning_rate": 4.972338992420501e-06, "loss": 0.0425, "step": 10790 }, { "epoch": 2.7607361963190185, "grad_norm": 3.2620246410369873, "learning_rate": 4.9634162265631016e-06, "loss": 0.0281, "step": 10800 }, { "epoch": 2.763292433537832, "grad_norm": 5.77325963973999, "learning_rate": 4.954493577216294e-06, "loss": 0.0263, "step": 10810 }, { "epoch": 2.7658486707566463, "grad_norm": 7.105217933654785, "learning_rate": 4.9455710727965886e-06, "loss": 0.0971, "step": 10820 }, { "epoch": 2.76840490797546, "grad_norm": 8.464949607849121, "learning_rate": 4.936648741720032e-06, "loss": 0.0459, "step": 10830 }, { "epoch": 2.770961145194274, "grad_norm": 9.054972648620605, "learning_rate": 4.9277266124021245e-06, "loss": 0.0335, "step": 10840 }, { "epoch": 2.773517382413088, "grad_norm": 1.2454347610473633, "learning_rate": 4.918804713257715e-06, "loss": 0.0471, "step": 10850 }, { "epoch": 2.7760736196319016, "grad_norm": 2.4472923278808594, "learning_rate": 4.909883072700928e-06, "loss": 0.0462, "step": 10860 }, { "epoch": 2.7786298568507157, "grad_norm": 0.04563615098595619, "learning_rate": 4.900961719145056e-06, "loss": 0.0167, "step": 10870 }, { "epoch": 2.78118609406953, "grad_norm": 5.5846734046936035, "learning_rate": 4.892040681002488e-06, "loss": 0.0578, "step": 10880 }, { "epoch": 2.7837423312883436, "grad_norm": 4.339868068695068, "learning_rate": 4.883119986684596e-06, "loss": 0.0273, "step": 10890 }, { "epoch": 2.7862985685071573, "grad_norm": 4.785184383392334, "learning_rate": 4.87419966460167e-06, "loss": 0.076, "step": 10900 }, { "epoch": 2.7888548057259714, "grad_norm": 0.035292405635118484, "learning_rate": 4.865279743162804e-06, "loss": 0.0462, "step": 10910 }, { "epoch": 2.791411042944785, "grad_norm": 3.155709743499756, "learning_rate": 4.856360250775821e-06, "loss": 0.036, "step": 10920 }, { "epoch": 2.7939672801635993, "grad_norm": 0.7432450652122498, "learning_rate": 4.847441215847177e-06, "loss": 0.0619, "step": 10930 }, { "epoch": 2.796523517382413, "grad_norm": 5.327913761138916, "learning_rate": 4.838522666781871e-06, "loss": 0.0647, "step": 10940 }, { "epoch": 2.799079754601227, "grad_norm": 2.4953877925872803, "learning_rate": 4.829604631983353e-06, "loss": 0.0392, "step": 10950 }, { "epoch": 2.801635991820041, "grad_norm": 3.6846439838409424, "learning_rate": 4.8206871398534385e-06, "loss": 0.0368, "step": 10960 }, { "epoch": 2.804192229038855, "grad_norm": 5.844122886657715, "learning_rate": 4.811770218792212e-06, "loss": 0.0476, "step": 10970 }, { "epoch": 2.8067484662576687, "grad_norm": 4.004204273223877, "learning_rate": 4.80285389719794e-06, "loss": 0.0589, "step": 10980 }, { "epoch": 2.8093047034764824, "grad_norm": 0.9968608021736145, "learning_rate": 4.793938203466979e-06, "loss": 0.0448, "step": 10990 }, { "epoch": 2.8118609406952966, "grad_norm": 6.936352252960205, "learning_rate": 4.78502316599369e-06, "loss": 0.0447, "step": 11000 }, { "epoch": 2.8144171779141103, "grad_norm": 4.1466383934021, "learning_rate": 4.776108813170337e-06, "loss": 0.0406, "step": 11010 }, { "epoch": 2.8169734151329244, "grad_norm": 12.088165283203125, "learning_rate": 4.76719517338701e-06, "loss": 0.0544, "step": 11020 }, { "epoch": 2.819529652351738, "grad_norm": 3.7247049808502197, "learning_rate": 4.758282275031524e-06, "loss": 0.0304, "step": 11030 }, { "epoch": 2.8220858895705523, "grad_norm": 5.583109378814697, "learning_rate": 4.7493701464893366e-06, "loss": 0.0326, "step": 11040 }, { "epoch": 2.824642126789366, "grad_norm": 1.8860771656036377, "learning_rate": 4.740458816143447e-06, "loss": 0.0268, "step": 11050 }, { "epoch": 2.82719836400818, "grad_norm": 2.164116144180298, "learning_rate": 4.731548312374323e-06, "loss": 0.0403, "step": 11060 }, { "epoch": 2.829754601226994, "grad_norm": 3.961606740951538, "learning_rate": 4.722638663559787e-06, "loss": 0.039, "step": 11070 }, { "epoch": 2.8323108384458076, "grad_norm": 0.07476239651441574, "learning_rate": 4.713729898074949e-06, "loss": 0.0522, "step": 11080 }, { "epoch": 2.8348670756646217, "grad_norm": 4.681721210479736, "learning_rate": 4.704822044292103e-06, "loss": 0.0413, "step": 11090 }, { "epoch": 2.837423312883436, "grad_norm": 4.108366012573242, "learning_rate": 4.695915130580636e-06, "loss": 0.0305, "step": 11100 }, { "epoch": 2.8399795501022496, "grad_norm": 0.2699336111545563, "learning_rate": 4.687009185306945e-06, "loss": 0.0495, "step": 11110 }, { "epoch": 2.8425357873210633, "grad_norm": 3.466141939163208, "learning_rate": 4.678104236834341e-06, "loss": 0.0725, "step": 11120 }, { "epoch": 2.8450920245398774, "grad_norm": 3.030548334121704, "learning_rate": 4.6692003135229606e-06, "loss": 0.0405, "step": 11130 }, { "epoch": 2.847648261758691, "grad_norm": 4.3781938552856445, "learning_rate": 4.660297443729675e-06, "loss": 0.0209, "step": 11140 }, { "epoch": 2.8502044989775053, "grad_norm": 0.2208949774503708, "learning_rate": 4.6513956558080034e-06, "loss": 0.0237, "step": 11150 }, { "epoch": 2.852760736196319, "grad_norm": 4.45728874206543, "learning_rate": 4.642494978108014e-06, "loss": 0.0528, "step": 11160 }, { "epoch": 2.8553169734151327, "grad_norm": 6.202856063842773, "learning_rate": 4.633595438976244e-06, "loss": 0.0534, "step": 11170 }, { "epoch": 2.857873210633947, "grad_norm": 3.93393874168396, "learning_rate": 4.624697066755602e-06, "loss": 0.0261, "step": 11180 }, { "epoch": 2.860429447852761, "grad_norm": 1.9619215726852417, "learning_rate": 4.6157998897852815e-06, "loss": 0.0429, "step": 11190 }, { "epoch": 2.8629856850715747, "grad_norm": 5.04984188079834, "learning_rate": 4.606903936400667e-06, "loss": 0.0428, "step": 11200 }, { "epoch": 2.8655419222903884, "grad_norm": 3.097203254699707, "learning_rate": 4.5980092349332525e-06, "loss": 0.0336, "step": 11210 }, { "epoch": 2.8680981595092025, "grad_norm": 1.7928495407104492, "learning_rate": 4.589115813710535e-06, "loss": 0.0516, "step": 11220 }, { "epoch": 2.8706543967280163, "grad_norm": 3.5692665576934814, "learning_rate": 4.580223701055945e-06, "loss": 0.0328, "step": 11230 }, { "epoch": 2.8732106339468304, "grad_norm": 1.9397566318511963, "learning_rate": 4.571332925288735e-06, "loss": 0.0255, "step": 11240 }, { "epoch": 2.875766871165644, "grad_norm": 3.0860631465911865, "learning_rate": 4.562443514723911e-06, "loss": 0.0356, "step": 11250 }, { "epoch": 2.878323108384458, "grad_norm": 3.6334643363952637, "learning_rate": 4.553555497672119e-06, "loss": 0.0535, "step": 11260 }, { "epoch": 2.880879345603272, "grad_norm": 5.285019397735596, "learning_rate": 4.544668902439577e-06, "loss": 0.073, "step": 11270 }, { "epoch": 2.883435582822086, "grad_norm": 0.21129778027534485, "learning_rate": 4.53578375732797e-06, "loss": 0.0175, "step": 11280 }, { "epoch": 2.8859918200409, "grad_norm": 0.07329968363046646, "learning_rate": 4.526900090634368e-06, "loss": 0.0222, "step": 11290 }, { "epoch": 2.8885480572597135, "grad_norm": 2.5236427783966064, "learning_rate": 4.518017930651128e-06, "loss": 0.0439, "step": 11300 }, { "epoch": 2.8911042944785277, "grad_norm": 0.4075072407722473, "learning_rate": 4.509137305665812e-06, "loss": 0.0405, "step": 11310 }, { "epoch": 2.8936605316973414, "grad_norm": 1.6199369430541992, "learning_rate": 4.5002582439610895e-06, "loss": 0.019, "step": 11320 }, { "epoch": 2.8962167689161555, "grad_norm": 0.04643448814749718, "learning_rate": 4.491380773814659e-06, "loss": 0.0212, "step": 11330 }, { "epoch": 2.8987730061349692, "grad_norm": 1.4235713481903076, "learning_rate": 4.4825049234991405e-06, "loss": 0.0105, "step": 11340 }, { "epoch": 2.9013292433537834, "grad_norm": 0.04633248969912529, "learning_rate": 4.473630721282004e-06, "loss": 0.0261, "step": 11350 }, { "epoch": 2.903885480572597, "grad_norm": 5.469078063964844, "learning_rate": 4.464758195425464e-06, "loss": 0.0275, "step": 11360 }, { "epoch": 2.9064417177914113, "grad_norm": 0.15273931622505188, "learning_rate": 4.455887374186401e-06, "loss": 0.0297, "step": 11370 }, { "epoch": 2.908997955010225, "grad_norm": 0.10551747679710388, "learning_rate": 4.447018285816263e-06, "loss": 0.0285, "step": 11380 }, { "epoch": 2.9115541922290387, "grad_norm": 0.063129723072052, "learning_rate": 4.438150958560983e-06, "loss": 0.028, "step": 11390 }, { "epoch": 2.914110429447853, "grad_norm": 0.8330835700035095, "learning_rate": 4.42928542066088e-06, "loss": 0.0227, "step": 11400 }, { "epoch": 2.9166666666666665, "grad_norm": 5.185162544250488, "learning_rate": 4.420421700350581e-06, "loss": 0.0378, "step": 11410 }, { "epoch": 2.9192229038854807, "grad_norm": 0.602056622505188, "learning_rate": 4.4115598258589165e-06, "loss": 0.0259, "step": 11420 }, { "epoch": 2.9217791411042944, "grad_norm": 3.8201723098754883, "learning_rate": 4.402699825408849e-06, "loss": 0.0373, "step": 11430 }, { "epoch": 2.9243353783231085, "grad_norm": 0.2384403496980667, "learning_rate": 4.393841727217361e-06, "loss": 0.0158, "step": 11440 }, { "epoch": 2.9268916155419222, "grad_norm": 2.9862217903137207, "learning_rate": 4.384985559495387e-06, "loss": 0.0573, "step": 11450 }, { "epoch": 2.9294478527607364, "grad_norm": 5.518589019775391, "learning_rate": 4.376131350447703e-06, "loss": 0.0331, "step": 11460 }, { "epoch": 2.93200408997955, "grad_norm": 6.048367500305176, "learning_rate": 4.36727912827286e-06, "loss": 0.0422, "step": 11470 }, { "epoch": 2.934560327198364, "grad_norm": 5.123732089996338, "learning_rate": 4.358428921163066e-06, "loss": 0.0287, "step": 11480 }, { "epoch": 2.937116564417178, "grad_norm": 4.53354549407959, "learning_rate": 4.349580757304127e-06, "loss": 0.0191, "step": 11490 }, { "epoch": 2.939672801635992, "grad_norm": 1.6047019958496094, "learning_rate": 4.34073466487533e-06, "loss": 0.0529, "step": 11500 }, { "epoch": 2.942229038854806, "grad_norm": 0.1400771290063858, "learning_rate": 4.331890672049371e-06, "loss": 0.029, "step": 11510 }, { "epoch": 2.9447852760736195, "grad_norm": 4.497285842895508, "learning_rate": 4.323048806992257e-06, "loss": 0.031, "step": 11520 }, { "epoch": 2.9473415132924337, "grad_norm": 5.1836442947387695, "learning_rate": 4.31420909786322e-06, "loss": 0.0347, "step": 11530 }, { "epoch": 2.9498977505112474, "grad_norm": 0.12893950939178467, "learning_rate": 4.305371572814623e-06, "loss": 0.0141, "step": 11540 }, { "epoch": 2.9524539877300615, "grad_norm": 5.480885028839111, "learning_rate": 4.296536259991876e-06, "loss": 0.0223, "step": 11550 }, { "epoch": 2.955010224948875, "grad_norm": 15.032180786132812, "learning_rate": 4.287703187533346e-06, "loss": 0.0722, "step": 11560 }, { "epoch": 2.957566462167689, "grad_norm": 2.98856520652771, "learning_rate": 4.278872383570256e-06, "loss": 0.0248, "step": 11570 }, { "epoch": 2.960122699386503, "grad_norm": 3.5357167720794678, "learning_rate": 4.270043876226616e-06, "loss": 0.0385, "step": 11580 }, { "epoch": 2.9626789366053172, "grad_norm": 1.0948529243469238, "learning_rate": 4.2612176936191104e-06, "loss": 0.0293, "step": 11590 }, { "epoch": 2.965235173824131, "grad_norm": 1.0036929845809937, "learning_rate": 4.252393863857033e-06, "loss": 0.0598, "step": 11600 }, { "epoch": 2.9677914110429446, "grad_norm": 5.068575382232666, "learning_rate": 4.243572415042168e-06, "loss": 0.0479, "step": 11610 }, { "epoch": 2.970347648261759, "grad_norm": 2.0871167182922363, "learning_rate": 4.2347533752687335e-06, "loss": 0.0228, "step": 11620 }, { "epoch": 2.9729038854805725, "grad_norm": 0.04474279657006264, "learning_rate": 4.225936772623262e-06, "loss": 0.0119, "step": 11630 }, { "epoch": 2.9754601226993866, "grad_norm": 3.878139019012451, "learning_rate": 4.217122635184532e-06, "loss": 0.0333, "step": 11640 }, { "epoch": 2.9780163599182004, "grad_norm": 0.04483529180288315, "learning_rate": 4.208310991023469e-06, "loss": 0.0411, "step": 11650 }, { "epoch": 2.980572597137014, "grad_norm": 0.06956873834133148, "learning_rate": 4.199501868203059e-06, "loss": 0.015, "step": 11660 }, { "epoch": 2.983128834355828, "grad_norm": 4.716834545135498, "learning_rate": 4.190695294778254e-06, "loss": 0.0272, "step": 11670 }, { "epoch": 2.9856850715746424, "grad_norm": 4.978919506072998, "learning_rate": 4.1818912987958935e-06, "loss": 0.0349, "step": 11680 }, { "epoch": 2.988241308793456, "grad_norm": 4.98551607131958, "learning_rate": 4.1730899082946e-06, "loss": 0.0391, "step": 11690 }, { "epoch": 2.9907975460122698, "grad_norm": 0.028066415339708328, "learning_rate": 4.164291151304707e-06, "loss": 0.0366, "step": 11700 }, { "epoch": 2.993353783231084, "grad_norm": 3.7603607177734375, "learning_rate": 4.155495055848154e-06, "loss": 0.0309, "step": 11710 }, { "epoch": 2.9959100204498976, "grad_norm": 6.2368621826171875, "learning_rate": 4.146701649938409e-06, "loss": 0.0526, "step": 11720 }, { "epoch": 2.9984662576687118, "grad_norm": 1.746232032775879, "learning_rate": 4.13791096158037e-06, "loss": 0.018, "step": 11730 }, { "epoch": 3.0010224948875255, "grad_norm": 3.928952693939209, "learning_rate": 4.129123018770285e-06, "loss": 0.0108, "step": 11740 }, { "epoch": 3.0035787321063396, "grad_norm": 0.7030458450317383, "learning_rate": 4.120337849495654e-06, "loss": 0.019, "step": 11750 }, { "epoch": 3.0061349693251533, "grad_norm": 1.5258599519729614, "learning_rate": 4.111555481735147e-06, "loss": 0.0215, "step": 11760 }, { "epoch": 3.0086912065439675, "grad_norm": 3.1201798915863037, "learning_rate": 4.102775943458508e-06, "loss": 0.015, "step": 11770 }, { "epoch": 3.011247443762781, "grad_norm": 2.5468101501464844, "learning_rate": 4.093999262626474e-06, "loss": 0.0092, "step": 11780 }, { "epoch": 3.013803680981595, "grad_norm": 4.258352279663086, "learning_rate": 4.0852254671906794e-06, "loss": 0.0111, "step": 11790 }, { "epoch": 3.016359918200409, "grad_norm": 4.136040210723877, "learning_rate": 4.076454585093572e-06, "loss": 0.0247, "step": 11800 }, { "epoch": 3.0189161554192228, "grad_norm": 0.01770654506981373, "learning_rate": 4.067686644268316e-06, "loss": 0.0168, "step": 11810 }, { "epoch": 3.021472392638037, "grad_norm": 3.165257453918457, "learning_rate": 4.0589216726387146e-06, "loss": 0.0157, "step": 11820 }, { "epoch": 3.0240286298568506, "grad_norm": 1.5152426958084106, "learning_rate": 4.050159698119107e-06, "loss": 0.0113, "step": 11830 }, { "epoch": 3.0265848670756648, "grad_norm": 0.025976594537496567, "learning_rate": 4.0414007486142985e-06, "loss": 0.0072, "step": 11840 }, { "epoch": 3.0291411042944785, "grad_norm": 4.125540256500244, "learning_rate": 4.032644852019447e-06, "loss": 0.0118, "step": 11850 }, { "epoch": 3.0316973415132926, "grad_norm": 0.026777638122439384, "learning_rate": 4.023892036220001e-06, "loss": 0.001, "step": 11860 }, { "epoch": 3.0342535787321063, "grad_norm": 3.001214027404785, "learning_rate": 4.015142329091587e-06, "loss": 0.0372, "step": 11870 }, { "epoch": 3.03680981595092, "grad_norm": 0.012349724769592285, "learning_rate": 4.006395758499937e-06, "loss": 0.0242, "step": 11880 }, { "epoch": 3.039366053169734, "grad_norm": 0.48854807019233704, "learning_rate": 3.99765235230079e-06, "loss": 0.0202, "step": 11890 }, { "epoch": 3.041922290388548, "grad_norm": 7.029765605926514, "learning_rate": 3.988912138339812e-06, "loss": 0.0228, "step": 11900 }, { "epoch": 3.044478527607362, "grad_norm": 2.26522159576416, "learning_rate": 3.980175144452496e-06, "loss": 0.0152, "step": 11910 }, { "epoch": 3.0470347648261757, "grad_norm": 5.204248905181885, "learning_rate": 3.971441398464088e-06, "loss": 0.021, "step": 11920 }, { "epoch": 3.04959100204499, "grad_norm": 2.968381881713867, "learning_rate": 3.962710928189481e-06, "loss": 0.0234, "step": 11930 }, { "epoch": 3.0521472392638036, "grad_norm": 3.710779905319214, "learning_rate": 3.953983761433144e-06, "loss": 0.0067, "step": 11940 }, { "epoch": 3.0547034764826178, "grad_norm": 2.136486530303955, "learning_rate": 3.94525992598902e-06, "loss": 0.0096, "step": 11950 }, { "epoch": 3.0572597137014315, "grad_norm": 0.898169219493866, "learning_rate": 3.936539449640445e-06, "loss": 0.007, "step": 11960 }, { "epoch": 3.0598159509202456, "grad_norm": 7.237276077270508, "learning_rate": 3.927822360160053e-06, "loss": 0.0261, "step": 11970 }, { "epoch": 3.0623721881390593, "grad_norm": 2.5147705078125, "learning_rate": 3.919108685309699e-06, "loss": 0.014, "step": 11980 }, { "epoch": 3.064928425357873, "grad_norm": 3.493708372116089, "learning_rate": 3.9103984528403555e-06, "loss": 0.0213, "step": 11990 }, { "epoch": 3.067484662576687, "grad_norm": 1.2625579833984375, "learning_rate": 3.901691690492035e-06, "loss": 0.0161, "step": 12000 }, { "epoch": 3.070040899795501, "grad_norm": 3.3386011123657227, "learning_rate": 3.892988425993703e-06, "loss": 0.004, "step": 12010 }, { "epoch": 3.072597137014315, "grad_norm": 1.1990747451782227, "learning_rate": 3.884288687063177e-06, "loss": 0.0109, "step": 12020 }, { "epoch": 3.0751533742331287, "grad_norm": 1.3895822763442993, "learning_rate": 3.875592501407052e-06, "loss": 0.0272, "step": 12030 }, { "epoch": 3.077709611451943, "grad_norm": 9.504667282104492, "learning_rate": 3.866899896720604e-06, "loss": 0.0211, "step": 12040 }, { "epoch": 3.0802658486707566, "grad_norm": 10.509309768676758, "learning_rate": 3.858210900687707e-06, "loss": 0.0174, "step": 12050 }, { "epoch": 3.0828220858895707, "grad_norm": 0.08506203442811966, "learning_rate": 3.849525540980739e-06, "loss": 0.0087, "step": 12060 }, { "epoch": 3.0853783231083844, "grad_norm": 1.2189379930496216, "learning_rate": 3.840843845260501e-06, "loss": 0.0119, "step": 12070 }, { "epoch": 3.087934560327198, "grad_norm": 0.03395168483257294, "learning_rate": 3.832165841176121e-06, "loss": 0.0163, "step": 12080 }, { "epoch": 3.0904907975460123, "grad_norm": 4.858822345733643, "learning_rate": 3.823491556364973e-06, "loss": 0.0104, "step": 12090 }, { "epoch": 3.093047034764826, "grad_norm": 0.15337003767490387, "learning_rate": 3.814821018452583e-06, "loss": 0.0249, "step": 12100 }, { "epoch": 3.09560327198364, "grad_norm": 6.41199254989624, "learning_rate": 3.806154255052551e-06, "loss": 0.0067, "step": 12110 }, { "epoch": 3.098159509202454, "grad_norm": 1.0053160190582275, "learning_rate": 3.7974912937664455e-06, "loss": 0.0299, "step": 12120 }, { "epoch": 3.100715746421268, "grad_norm": 1.6271339654922485, "learning_rate": 3.7888321621837363e-06, "loss": 0.0053, "step": 12130 }, { "epoch": 3.1032719836400817, "grad_norm": 0.03732278570532799, "learning_rate": 3.7801768878816892e-06, "loss": 0.0089, "step": 12140 }, { "epoch": 3.105828220858896, "grad_norm": 4.223018646240234, "learning_rate": 3.771525498425289e-06, "loss": 0.0107, "step": 12150 }, { "epoch": 3.1083844580777096, "grad_norm": 1.2061896324157715, "learning_rate": 3.762878021367148e-06, "loss": 0.0154, "step": 12160 }, { "epoch": 3.1109406952965237, "grad_norm": 2.464517831802368, "learning_rate": 3.754234484247418e-06, "loss": 0.0078, "step": 12170 }, { "epoch": 3.1134969325153374, "grad_norm": 0.042976122349500656, "learning_rate": 3.745594914593701e-06, "loss": 0.0114, "step": 12180 }, { "epoch": 3.116053169734151, "grad_norm": 0.11069530993700027, "learning_rate": 3.7369593399209704e-06, "loss": 0.0111, "step": 12190 }, { "epoch": 3.1186094069529653, "grad_norm": 0.14891409873962402, "learning_rate": 3.728327787731465e-06, "loss": 0.0084, "step": 12200 }, { "epoch": 3.121165644171779, "grad_norm": 0.02942030318081379, "learning_rate": 3.7197002855146257e-06, "loss": 0.011, "step": 12210 }, { "epoch": 3.123721881390593, "grad_norm": 3.233976364135742, "learning_rate": 3.7110768607469842e-06, "loss": 0.0082, "step": 12220 }, { "epoch": 3.126278118609407, "grad_norm": 3.62264084815979, "learning_rate": 3.7024575408920958e-06, "loss": 0.009, "step": 12230 }, { "epoch": 3.128834355828221, "grad_norm": 0.051736973226070404, "learning_rate": 3.693842353400435e-06, "loss": 0.0276, "step": 12240 }, { "epoch": 3.1313905930470347, "grad_norm": 1.5636509656906128, "learning_rate": 3.6852313257093214e-06, "loss": 0.0283, "step": 12250 }, { "epoch": 3.133946830265849, "grad_norm": 3.639524221420288, "learning_rate": 3.6766244852428218e-06, "loss": 0.0209, "step": 12260 }, { "epoch": 3.1365030674846626, "grad_norm": 2.127938985824585, "learning_rate": 3.6680218594116725e-06, "loss": 0.0079, "step": 12270 }, { "epoch": 3.1390593047034763, "grad_norm": 5.6783447265625, "learning_rate": 3.6594234756131826e-06, "loss": 0.0194, "step": 12280 }, { "epoch": 3.1416155419222904, "grad_norm": 0.3146345615386963, "learning_rate": 3.6508293612311552e-06, "loss": 0.0153, "step": 12290 }, { "epoch": 3.144171779141104, "grad_norm": 0.37290289998054504, "learning_rate": 3.642239543635793e-06, "loss": 0.0235, "step": 12300 }, { "epoch": 3.1467280163599183, "grad_norm": 0.22575929760932922, "learning_rate": 3.6336540501836185e-06, "loss": 0.0109, "step": 12310 }, { "epoch": 3.149284253578732, "grad_norm": 3.687939405441284, "learning_rate": 3.625072908217378e-06, "loss": 0.0177, "step": 12320 }, { "epoch": 3.151840490797546, "grad_norm": 0.08439797908067703, "learning_rate": 3.6164961450659634e-06, "loss": 0.0045, "step": 12330 }, { "epoch": 3.15439672801636, "grad_norm": 2.362006425857544, "learning_rate": 3.6079237880443186e-06, "loss": 0.0142, "step": 12340 }, { "epoch": 3.156952965235174, "grad_norm": 0.579308807849884, "learning_rate": 3.599355864453357e-06, "loss": 0.0074, "step": 12350 }, { "epoch": 3.1595092024539877, "grad_norm": 0.3662513494491577, "learning_rate": 3.5907924015798697e-06, "loss": 0.0133, "step": 12360 }, { "epoch": 3.1620654396728014, "grad_norm": 0.22020725905895233, "learning_rate": 3.5822334266964454e-06, "loss": 0.0245, "step": 12370 }, { "epoch": 3.1646216768916156, "grad_norm": 0.26042699813842773, "learning_rate": 3.573678967061374e-06, "loss": 0.0039, "step": 12380 }, { "epoch": 3.1671779141104293, "grad_norm": 4.502334117889404, "learning_rate": 3.5651290499185752e-06, "loss": 0.0135, "step": 12390 }, { "epoch": 3.1697341513292434, "grad_norm": 0.07907534390687943, "learning_rate": 3.556583702497489e-06, "loss": 0.0058, "step": 12400 }, { "epoch": 3.172290388548057, "grad_norm": 0.012879629619419575, "learning_rate": 3.5480429520130144e-06, "loss": 0.018, "step": 12410 }, { "epoch": 3.1748466257668713, "grad_norm": 0.1027621328830719, "learning_rate": 3.5395068256653984e-06, "loss": 0.0055, "step": 12420 }, { "epoch": 3.177402862985685, "grad_norm": 2.4270403385162354, "learning_rate": 3.5309753506401747e-06, "loss": 0.0186, "step": 12430 }, { "epoch": 3.179959100204499, "grad_norm": 0.0203610397875309, "learning_rate": 3.5224485541080476e-06, "loss": 0.011, "step": 12440 }, { "epoch": 3.182515337423313, "grad_norm": 3.286555528640747, "learning_rate": 3.513926463224836e-06, "loss": 0.0051, "step": 12450 }, { "epoch": 3.185071574642127, "grad_norm": 0.15632130205631256, "learning_rate": 3.5054091051313666e-06, "loss": 0.0061, "step": 12460 }, { "epoch": 3.1876278118609407, "grad_norm": 1.8245761394500732, "learning_rate": 3.49689650695339e-06, "loss": 0.0151, "step": 12470 }, { "epoch": 3.190184049079755, "grad_norm": 0.6735230088233948, "learning_rate": 3.4883886958015046e-06, "loss": 0.0129, "step": 12480 }, { "epoch": 3.1927402862985685, "grad_norm": 1.4515380859375, "learning_rate": 3.4798856987710574e-06, "loss": 0.0222, "step": 12490 }, { "epoch": 3.1952965235173822, "grad_norm": 0.036662183701992035, "learning_rate": 3.4713875429420656e-06, "loss": 0.0235, "step": 12500 }, { "epoch": 3.1978527607361964, "grad_norm": 2.479926109313965, "learning_rate": 3.4628942553791285e-06, "loss": 0.0075, "step": 12510 }, { "epoch": 3.20040899795501, "grad_norm": 0.033283405005931854, "learning_rate": 3.4544058631313427e-06, "loss": 0.0105, "step": 12520 }, { "epoch": 3.2029652351738243, "grad_norm": 0.06612569093704224, "learning_rate": 3.44592239323221e-06, "loss": 0.0143, "step": 12530 }, { "epoch": 3.205521472392638, "grad_norm": 0.0648500844836235, "learning_rate": 3.4374438726995614e-06, "loss": 0.0086, "step": 12540 }, { "epoch": 3.208077709611452, "grad_norm": 0.08395984768867493, "learning_rate": 3.4289703285354587e-06, "loss": 0.0105, "step": 12550 }, { "epoch": 3.210633946830266, "grad_norm": 1.128602147102356, "learning_rate": 3.4205017877261244e-06, "loss": 0.0157, "step": 12560 }, { "epoch": 3.21319018404908, "grad_norm": 0.026443956419825554, "learning_rate": 3.4120382772418346e-06, "loss": 0.0075, "step": 12570 }, { "epoch": 3.2157464212678937, "grad_norm": 0.2616029679775238, "learning_rate": 3.4035798240368578e-06, "loss": 0.0085, "step": 12580 }, { "epoch": 3.2183026584867074, "grad_norm": 3.7674038410186768, "learning_rate": 3.3951264550493433e-06, "loss": 0.0166, "step": 12590 }, { "epoch": 3.2208588957055215, "grad_norm": 4.487175941467285, "learning_rate": 3.3866781972012602e-06, "loss": 0.0082, "step": 12600 }, { "epoch": 3.2234151329243352, "grad_norm": 4.683178424835205, "learning_rate": 3.378235077398292e-06, "loss": 0.0081, "step": 12610 }, { "epoch": 3.2259713701431494, "grad_norm": 0.07378882169723511, "learning_rate": 3.369797122529762e-06, "loss": 0.0126, "step": 12620 }, { "epoch": 3.228527607361963, "grad_norm": 0.04591992124915123, "learning_rate": 3.3613643594685436e-06, "loss": 0.0069, "step": 12630 }, { "epoch": 3.2310838445807772, "grad_norm": 0.039997998625040054, "learning_rate": 3.3529368150709762e-06, "loss": 0.0084, "step": 12640 }, { "epoch": 3.233640081799591, "grad_norm": 0.03221229463815689, "learning_rate": 3.344514516176778e-06, "loss": 0.0148, "step": 12650 }, { "epoch": 3.236196319018405, "grad_norm": 2.5336620807647705, "learning_rate": 3.336097489608962e-06, "loss": 0.0144, "step": 12660 }, { "epoch": 3.238752556237219, "grad_norm": 0.19575847685337067, "learning_rate": 3.3276857621737495e-06, "loss": 0.009, "step": 12670 }, { "epoch": 3.2413087934560325, "grad_norm": 4.261199951171875, "learning_rate": 3.3192793606604877e-06, "loss": 0.0123, "step": 12680 }, { "epoch": 3.2438650306748467, "grad_norm": 3.218693733215332, "learning_rate": 3.3108783118415583e-06, "loss": 0.0124, "step": 12690 }, { "epoch": 3.2464212678936604, "grad_norm": 0.16256259381771088, "learning_rate": 3.3024826424722993e-06, "loss": 0.0139, "step": 12700 }, { "epoch": 3.2489775051124745, "grad_norm": 3.9794180393218994, "learning_rate": 3.2940923792909134e-06, "loss": 0.0163, "step": 12710 }, { "epoch": 3.2515337423312882, "grad_norm": 0.19562911987304688, "learning_rate": 3.28570754901839e-06, "loss": 0.0087, "step": 12720 }, { "epoch": 3.2540899795501024, "grad_norm": 1.9741108417510986, "learning_rate": 3.2773281783584104e-06, "loss": 0.0221, "step": 12730 }, { "epoch": 3.256646216768916, "grad_norm": 5.769931793212891, "learning_rate": 3.2689542939972742e-06, "loss": 0.0191, "step": 12740 }, { "epoch": 3.2592024539877302, "grad_norm": 3.071200370788574, "learning_rate": 3.2605859226038038e-06, "loss": 0.0333, "step": 12750 }, { "epoch": 3.261758691206544, "grad_norm": 0.05902179330587387, "learning_rate": 3.2522230908292674e-06, "loss": 0.0056, "step": 12760 }, { "epoch": 3.2643149284253576, "grad_norm": 4.801800727844238, "learning_rate": 3.243865825307286e-06, "loss": 0.03, "step": 12770 }, { "epoch": 3.266871165644172, "grad_norm": 0.9842033386230469, "learning_rate": 3.2355141526537636e-06, "loss": 0.0188, "step": 12780 }, { "epoch": 3.2694274028629855, "grad_norm": 0.02561868727207184, "learning_rate": 3.2271680994667776e-06, "loss": 0.0064, "step": 12790 }, { "epoch": 3.2719836400817996, "grad_norm": 0.22291143238544464, "learning_rate": 3.2188276923265237e-06, "loss": 0.0054, "step": 12800 }, { "epoch": 3.2745398773006134, "grad_norm": 2.7589244842529297, "learning_rate": 3.2104929577952028e-06, "loss": 0.0211, "step": 12810 }, { "epoch": 3.2770961145194275, "grad_norm": 0.16009178757667542, "learning_rate": 3.2021639224169615e-06, "loss": 0.0069, "step": 12820 }, { "epoch": 3.279652351738241, "grad_norm": 0.02730882354080677, "learning_rate": 3.1938406127177878e-06, "loss": 0.0145, "step": 12830 }, { "epoch": 3.2822085889570554, "grad_norm": 0.1984373927116394, "learning_rate": 3.1855230552054395e-06, "loss": 0.0114, "step": 12840 }, { "epoch": 3.284764826175869, "grad_norm": 1.7443758249282837, "learning_rate": 3.177211276369351e-06, "loss": 0.0084, "step": 12850 }, { "epoch": 3.287321063394683, "grad_norm": 1.0765767097473145, "learning_rate": 3.1689053026805573e-06, "loss": 0.0055, "step": 12860 }, { "epoch": 3.289877300613497, "grad_norm": 0.25870829820632935, "learning_rate": 3.160605160591602e-06, "loss": 0.0189, "step": 12870 }, { "epoch": 3.292433537832311, "grad_norm": 0.007034212350845337, "learning_rate": 3.1523108765364598e-06, "loss": 0.0059, "step": 12880 }, { "epoch": 3.294989775051125, "grad_norm": 3.2303411960601807, "learning_rate": 3.1440224769304446e-06, "loss": 0.009, "step": 12890 }, { "epoch": 3.2975460122699385, "grad_norm": 3.338958978652954, "learning_rate": 3.1357399881701326e-06, "loss": 0.0126, "step": 12900 }, { "epoch": 3.3001022494887526, "grad_norm": 0.04261789843440056, "learning_rate": 3.1274634366332775e-06, "loss": 0.004, "step": 12910 }, { "epoch": 3.3026584867075663, "grad_norm": 0.30257269740104675, "learning_rate": 3.119192848678717e-06, "loss": 0.0025, "step": 12920 }, { "epoch": 3.3052147239263805, "grad_norm": 0.821662962436676, "learning_rate": 3.110928250646307e-06, "loss": 0.0129, "step": 12930 }, { "epoch": 3.307770961145194, "grad_norm": 0.3390525281429291, "learning_rate": 3.1026696688568137e-06, "loss": 0.0106, "step": 12940 }, { "epoch": 3.3103271983640083, "grad_norm": 0.07365961372852325, "learning_rate": 3.0944171296118574e-06, "loss": 0.0271, "step": 12950 }, { "epoch": 3.312883435582822, "grad_norm": 0.03872542828321457, "learning_rate": 3.0861706591938013e-06, "loss": 0.0106, "step": 12960 }, { "epoch": 3.315439672801636, "grad_norm": 0.08862084150314331, "learning_rate": 3.0779302838656906e-06, "loss": 0.0046, "step": 12970 }, { "epoch": 3.31799591002045, "grad_norm": 4.23959493637085, "learning_rate": 3.0696960298711525e-06, "loss": 0.0028, "step": 12980 }, { "epoch": 3.3205521472392636, "grad_norm": 0.6766570806503296, "learning_rate": 3.0614679234343242e-06, "loss": 0.0076, "step": 12990 }, { "epoch": 3.3231083844580778, "grad_norm": 6.0040130615234375, "learning_rate": 3.05324599075976e-06, "loss": 0.0292, "step": 13000 }, { "epoch": 3.3256646216768915, "grad_norm": 1.5274661779403687, "learning_rate": 3.0450302580323553e-06, "loss": 0.0104, "step": 13010 }, { "epoch": 3.3282208588957056, "grad_norm": 2.2926924228668213, "learning_rate": 3.036820751417259e-06, "loss": 0.038, "step": 13020 }, { "epoch": 3.3307770961145193, "grad_norm": 2.260282278060913, "learning_rate": 3.0286174970597916e-06, "loss": 0.0122, "step": 13030 }, { "epoch": 3.3333333333333335, "grad_norm": 0.12744282186031342, "learning_rate": 3.02042052108536e-06, "loss": 0.0079, "step": 13040 }, { "epoch": 3.335889570552147, "grad_norm": 0.016303053125739098, "learning_rate": 3.0122298495993803e-06, "loss": 0.0297, "step": 13050 }, { "epoch": 3.3384458077709613, "grad_norm": 0.024792036041617393, "learning_rate": 3.0040455086871846e-06, "loss": 0.015, "step": 13060 }, { "epoch": 3.341002044989775, "grad_norm": 3.460494041442871, "learning_rate": 2.995867524413949e-06, "loss": 0.0163, "step": 13070 }, { "epoch": 3.3435582822085887, "grad_norm": 0.18595871329307556, "learning_rate": 2.9876959228246006e-06, "loss": 0.0047, "step": 13080 }, { "epoch": 3.346114519427403, "grad_norm": 3.1711583137512207, "learning_rate": 2.9795307299437425e-06, "loss": 0.0171, "step": 13090 }, { "epoch": 3.3486707566462166, "grad_norm": 0.08736535161733627, "learning_rate": 2.971371971775565e-06, "loss": 0.0196, "step": 13100 }, { "epoch": 3.3512269938650308, "grad_norm": 0.05008082464337349, "learning_rate": 2.96321967430377e-06, "loss": 0.0042, "step": 13110 }, { "epoch": 3.3537832310838445, "grad_norm": 2.4688546657562256, "learning_rate": 2.9550738634914765e-06, "loss": 0.0086, "step": 13120 }, { "epoch": 3.3563394683026586, "grad_norm": 2.240190267562866, "learning_rate": 2.946934565281151e-06, "loss": 0.0203, "step": 13130 }, { "epoch": 3.3588957055214723, "grad_norm": 0.22563564777374268, "learning_rate": 2.9388018055945157e-06, "loss": 0.0028, "step": 13140 }, { "epoch": 3.3614519427402865, "grad_norm": 3.0712578296661377, "learning_rate": 2.930675610332473e-06, "loss": 0.019, "step": 13150 }, { "epoch": 3.3640081799591, "grad_norm": 2.705103635787964, "learning_rate": 2.9225560053750113e-06, "loss": 0.0041, "step": 13160 }, { "epoch": 3.366564417177914, "grad_norm": 0.05551149323582649, "learning_rate": 2.9144430165811423e-06, "loss": 0.0132, "step": 13170 }, { "epoch": 3.369120654396728, "grad_norm": 0.1891528069972992, "learning_rate": 2.9063366697887947e-06, "loss": 0.0135, "step": 13180 }, { "epoch": 3.3716768916155417, "grad_norm": 0.014974648132920265, "learning_rate": 2.898236990814751e-06, "loss": 0.0119, "step": 13190 }, { "epoch": 3.374233128834356, "grad_norm": 3.4752650260925293, "learning_rate": 2.890144005454557e-06, "loss": 0.0181, "step": 13200 }, { "epoch": 3.3767893660531696, "grad_norm": 2.042994976043701, "learning_rate": 2.8820577394824433e-06, "loss": 0.0029, "step": 13210 }, { "epoch": 3.3793456032719837, "grad_norm": 7.3295769691467285, "learning_rate": 2.873978218651233e-06, "loss": 0.0173, "step": 13220 }, { "epoch": 3.3819018404907975, "grad_norm": 0.04749654605984688, "learning_rate": 2.8659054686922757e-06, "loss": 0.0123, "step": 13230 }, { "epoch": 3.3844580777096116, "grad_norm": 0.024615732952952385, "learning_rate": 2.8578395153153536e-06, "loss": 0.0077, "step": 13240 }, { "epoch": 3.3870143149284253, "grad_norm": 0.014985025860369205, "learning_rate": 2.849780384208607e-06, "loss": 0.0039, "step": 13250 }, { "epoch": 3.3895705521472395, "grad_norm": 0.13005146384239197, "learning_rate": 2.8417281010384396e-06, "loss": 0.0251, "step": 13260 }, { "epoch": 3.392126789366053, "grad_norm": 0.07327497750520706, "learning_rate": 2.8336826914494607e-06, "loss": 0.0027, "step": 13270 }, { "epoch": 3.3946830265848673, "grad_norm": 0.07815208286046982, "learning_rate": 2.8256441810643755e-06, "loss": 0.0119, "step": 13280 }, { "epoch": 3.397239263803681, "grad_norm": 1.9264451265335083, "learning_rate": 2.8176125954839247e-06, "loss": 0.0107, "step": 13290 }, { "epoch": 3.3997955010224947, "grad_norm": 3.673927068710327, "learning_rate": 2.8095879602867877e-06, "loss": 0.0077, "step": 13300 }, { "epoch": 3.402351738241309, "grad_norm": 2.514970064163208, "learning_rate": 2.8015703010295214e-06, "loss": 0.0301, "step": 13310 }, { "epoch": 3.4049079754601226, "grad_norm": 2.072049379348755, "learning_rate": 2.793559643246451e-06, "loss": 0.0028, "step": 13320 }, { "epoch": 3.4074642126789367, "grad_norm": 2.3494277000427246, "learning_rate": 2.7855560124496146e-06, "loss": 0.0079, "step": 13330 }, { "epoch": 3.4100204498977504, "grad_norm": 2.0031983852386475, "learning_rate": 2.777559434128666e-06, "loss": 0.0137, "step": 13340 }, { "epoch": 3.4125766871165646, "grad_norm": 4.773671627044678, "learning_rate": 2.7695699337507996e-06, "loss": 0.0102, "step": 13350 }, { "epoch": 3.4151329243353783, "grad_norm": 0.5617696642875671, "learning_rate": 2.7615875367606704e-06, "loss": 0.0155, "step": 13360 }, { "epoch": 3.4176891615541924, "grad_norm": 5.82913875579834, "learning_rate": 2.753612268580306e-06, "loss": 0.0117, "step": 13370 }, { "epoch": 3.420245398773006, "grad_norm": 0.17889423668384552, "learning_rate": 2.7456441546090335e-06, "loss": 0.0077, "step": 13380 }, { "epoch": 3.42280163599182, "grad_norm": 3.2761387825012207, "learning_rate": 2.7376832202233962e-06, "loss": 0.0039, "step": 13390 }, { "epoch": 3.425357873210634, "grad_norm": 0.0072940983809530735, "learning_rate": 2.7297294907770735e-06, "loss": 0.0059, "step": 13400 }, { "epoch": 3.4279141104294477, "grad_norm": 0.05972537025809288, "learning_rate": 2.7217829916007888e-06, "loss": 0.0119, "step": 13410 }, { "epoch": 3.430470347648262, "grad_norm": 1.3629683256149292, "learning_rate": 2.713843748002256e-06, "loss": 0.0102, "step": 13420 }, { "epoch": 3.4330265848670756, "grad_norm": 2.336515188217163, "learning_rate": 2.7059117852660667e-06, "loss": 0.0082, "step": 13430 }, { "epoch": 3.4355828220858897, "grad_norm": 0.766312837600708, "learning_rate": 2.697987128653633e-06, "loss": 0.0148, "step": 13440 }, { "epoch": 3.4381390593047034, "grad_norm": 0.01915799267590046, "learning_rate": 2.6900698034030904e-06, "loss": 0.0027, "step": 13450 }, { "epoch": 3.4406952965235176, "grad_norm": 6.4156646728515625, "learning_rate": 2.6821598347292387e-06, "loss": 0.0227, "step": 13460 }, { "epoch": 3.4432515337423313, "grad_norm": 1.6114623546600342, "learning_rate": 2.6742572478234363e-06, "loss": 0.0045, "step": 13470 }, { "epoch": 3.445807770961145, "grad_norm": 0.04842757061123848, "learning_rate": 2.6663620678535396e-06, "loss": 0.0031, "step": 13480 }, { "epoch": 3.448364008179959, "grad_norm": 4.460205554962158, "learning_rate": 2.658474319963812e-06, "loss": 0.0242, "step": 13490 }, { "epoch": 3.450920245398773, "grad_norm": 1.2775579690933228, "learning_rate": 2.650594029274853e-06, "loss": 0.0083, "step": 13500 }, { "epoch": 3.453476482617587, "grad_norm": 8.932818412780762, "learning_rate": 2.642721220883503e-06, "loss": 0.0197, "step": 13510 }, { "epoch": 3.4560327198364007, "grad_norm": 2.6447277069091797, "learning_rate": 2.634855919862782e-06, "loss": 0.0086, "step": 13520 }, { "epoch": 3.458588957055215, "grad_norm": 4.694246292114258, "learning_rate": 2.626998151261798e-06, "loss": 0.0063, "step": 13530 }, { "epoch": 3.4611451942740286, "grad_norm": 5.1632914543151855, "learning_rate": 2.61914794010567e-06, "loss": 0.0071, "step": 13540 }, { "epoch": 3.4637014314928427, "grad_norm": 0.45551520586013794, "learning_rate": 2.6113053113954456e-06, "loss": 0.0198, "step": 13550 }, { "epoch": 3.4662576687116564, "grad_norm": 0.023942044004797935, "learning_rate": 2.6034702901080278e-06, "loss": 0.0098, "step": 13560 }, { "epoch": 3.46881390593047, "grad_norm": 1.2750016450881958, "learning_rate": 2.5956429011960905e-06, "loss": 0.0101, "step": 13570 }, { "epoch": 3.4713701431492843, "grad_norm": 4.26313591003418, "learning_rate": 2.5878231695880023e-06, "loss": 0.0115, "step": 13580 }, { "epoch": 3.473926380368098, "grad_norm": 0.28257378935813904, "learning_rate": 2.5800111201877397e-06, "loss": 0.0079, "step": 13590 }, { "epoch": 3.476482617586912, "grad_norm": 0.5308012962341309, "learning_rate": 2.572206777874818e-06, "loss": 0.0096, "step": 13600 }, { "epoch": 3.479038854805726, "grad_norm": 4.8633341789245605, "learning_rate": 2.5644101675042066e-06, "loss": 0.021, "step": 13610 }, { "epoch": 3.48159509202454, "grad_norm": 2.2458882331848145, "learning_rate": 2.5566213139062502e-06, "loss": 0.0071, "step": 13620 }, { "epoch": 3.4841513292433537, "grad_norm": 0.1293790638446808, "learning_rate": 2.5488402418865854e-06, "loss": 0.0114, "step": 13630 }, { "epoch": 3.486707566462168, "grad_norm": 0.014333824627101421, "learning_rate": 2.5410669762260788e-06, "loss": 0.0146, "step": 13640 }, { "epoch": 3.4892638036809815, "grad_norm": 4.425572395324707, "learning_rate": 2.5333015416807192e-06, "loss": 0.0093, "step": 13650 }, { "epoch": 3.4918200408997953, "grad_norm": 0.04234839603304863, "learning_rate": 2.525543962981569e-06, "loss": 0.0049, "step": 13660 }, { "epoch": 3.4943762781186094, "grad_norm": 0.4814109802246094, "learning_rate": 2.5177942648346597e-06, "loss": 0.0059, "step": 13670 }, { "epoch": 3.4969325153374236, "grad_norm": 0.25284790992736816, "learning_rate": 2.5100524719209387e-06, "loss": 0.0086, "step": 13680 }, { "epoch": 3.4994887525562373, "grad_norm": 2.6780126094818115, "learning_rate": 2.502318608896165e-06, "loss": 0.0078, "step": 13690 }, { "epoch": 3.502044989775051, "grad_norm": 1.6357485055923462, "learning_rate": 2.494592700390848e-06, "loss": 0.0047, "step": 13700 }, { "epoch": 3.504601226993865, "grad_norm": 0.4582887887954712, "learning_rate": 2.4868747710101647e-06, "loss": 0.0093, "step": 13710 }, { "epoch": 3.507157464212679, "grad_norm": 1.8089367151260376, "learning_rate": 2.479164845333881e-06, "loss": 0.0039, "step": 13720 }, { "epoch": 3.509713701431493, "grad_norm": 3.7371037006378174, "learning_rate": 2.471462947916267e-06, "loss": 0.0095, "step": 13730 }, { "epoch": 3.5122699386503067, "grad_norm": 0.04978760704398155, "learning_rate": 2.4637691032860306e-06, "loss": 0.0093, "step": 13740 }, { "epoch": 3.5148261758691204, "grad_norm": 0.17964474856853485, "learning_rate": 2.456083335946232e-06, "loss": 0.0245, "step": 13750 }, { "epoch": 3.5173824130879345, "grad_norm": 0.01520370040088892, "learning_rate": 2.4484056703742083e-06, "loss": 0.01, "step": 13760 }, { "epoch": 3.5199386503067487, "grad_norm": 0.04782997816801071, "learning_rate": 2.4407361310214893e-06, "loss": 0.0102, "step": 13770 }, { "epoch": 3.5224948875255624, "grad_norm": 0.04237792268395424, "learning_rate": 2.4330747423137314e-06, "loss": 0.0059, "step": 13780 }, { "epoch": 3.525051124744376, "grad_norm": 0.24677464365959167, "learning_rate": 2.4254215286506287e-06, "loss": 0.0035, "step": 13790 }, { "epoch": 3.5276073619631902, "grad_norm": 2.3235230445861816, "learning_rate": 2.4177765144058424e-06, "loss": 0.008, "step": 13800 }, { "epoch": 3.530163599182004, "grad_norm": 0.09863277524709702, "learning_rate": 2.4101397239269202e-06, "loss": 0.0169, "step": 13810 }, { "epoch": 3.532719836400818, "grad_norm": 0.050358258187770844, "learning_rate": 2.402511181535213e-06, "loss": 0.0032, "step": 13820 }, { "epoch": 3.535276073619632, "grad_norm": 0.08366558700799942, "learning_rate": 2.3948909115258163e-06, "loss": 0.005, "step": 13830 }, { "epoch": 3.537832310838446, "grad_norm": 0.028095854446291924, "learning_rate": 2.3872789381674665e-06, "loss": 0.0131, "step": 13840 }, { "epoch": 3.5403885480572597, "grad_norm": 0.010922097600996494, "learning_rate": 2.3796752857024854e-06, "loss": 0.0127, "step": 13850 }, { "epoch": 3.542944785276074, "grad_norm": 5.2768330574035645, "learning_rate": 2.372079978346691e-06, "loss": 0.004, "step": 13860 }, { "epoch": 3.5455010224948875, "grad_norm": 5.860825061798096, "learning_rate": 2.3644930402893297e-06, "loss": 0.0121, "step": 13870 }, { "epoch": 3.5480572597137012, "grad_norm": 0.030172038823366165, "learning_rate": 2.356914495692984e-06, "loss": 0.0014, "step": 13880 }, { "epoch": 3.5506134969325154, "grad_norm": 0.023287015035748482, "learning_rate": 2.349344368693513e-06, "loss": 0.0078, "step": 13890 }, { "epoch": 3.553169734151329, "grad_norm": 0.010513374581933022, "learning_rate": 2.3417826833999657e-06, "loss": 0.0075, "step": 13900 }, { "epoch": 3.5557259713701432, "grad_norm": 3.824662923812866, "learning_rate": 2.3342294638945077e-06, "loss": 0.0234, "step": 13910 }, { "epoch": 3.558282208588957, "grad_norm": 1.5583800077438354, "learning_rate": 2.3266847342323377e-06, "loss": 0.0024, "step": 13920 }, { "epoch": 3.560838445807771, "grad_norm": 0.9682608842849731, "learning_rate": 2.319148518441622e-06, "loss": 0.0043, "step": 13930 }, { "epoch": 3.563394683026585, "grad_norm": 0.0384635366499424, "learning_rate": 2.3116208405234107e-06, "loss": 0.006, "step": 13940 }, { "epoch": 3.565950920245399, "grad_norm": 0.4134227931499481, "learning_rate": 2.304101724451564e-06, "loss": 0.0118, "step": 13950 }, { "epoch": 3.5685071574642127, "grad_norm": 0.014091679826378822, "learning_rate": 2.2965911941726687e-06, "loss": 0.0034, "step": 13960 }, { "epoch": 3.5710633946830264, "grad_norm": 0.21840809285640717, "learning_rate": 2.289089273605975e-06, "loss": 0.0055, "step": 13970 }, { "epoch": 3.5736196319018405, "grad_norm": 0.015261857770383358, "learning_rate": 2.2815959866433096e-06, "loss": 0.0019, "step": 13980 }, { "epoch": 3.5761758691206547, "grad_norm": 4.033803939819336, "learning_rate": 2.2741113571490066e-06, "loss": 0.0131, "step": 13990 }, { "epoch": 3.5787321063394684, "grad_norm": 0.08580244332551956, "learning_rate": 2.2666354089598198e-06, "loss": 0.0133, "step": 14000 }, { "epoch": 3.581288343558282, "grad_norm": 0.17088328301906586, "learning_rate": 2.2591681658848686e-06, "loss": 0.0047, "step": 14010 }, { "epoch": 3.5838445807770962, "grad_norm": 2.8940188884735107, "learning_rate": 2.251709651705535e-06, "loss": 0.0082, "step": 14020 }, { "epoch": 3.58640081799591, "grad_norm": 1.2774847745895386, "learning_rate": 2.244259890175412e-06, "loss": 0.0128, "step": 14030 }, { "epoch": 3.588957055214724, "grad_norm": 0.8745086789131165, "learning_rate": 2.236818905020207e-06, "loss": 0.0056, "step": 14040 }, { "epoch": 3.591513292433538, "grad_norm": 0.05803001672029495, "learning_rate": 2.22938671993769e-06, "loss": 0.0036, "step": 14050 }, { "epoch": 3.5940695296523515, "grad_norm": 3.186616897583008, "learning_rate": 2.221963358597593e-06, "loss": 0.0049, "step": 14060 }, { "epoch": 3.5966257668711656, "grad_norm": 0.13081157207489014, "learning_rate": 2.214548844641552e-06, "loss": 0.0159, "step": 14070 }, { "epoch": 3.59918200408998, "grad_norm": 0.5573609471321106, "learning_rate": 2.2071432016830257e-06, "loss": 0.0063, "step": 14080 }, { "epoch": 3.6017382413087935, "grad_norm": 0.11412039399147034, "learning_rate": 2.1997464533072232e-06, "loss": 0.0092, "step": 14090 }, { "epoch": 3.604294478527607, "grad_norm": 2.3137636184692383, "learning_rate": 2.1923586230710185e-06, "loss": 0.0082, "step": 14100 }, { "epoch": 3.6068507157464214, "grad_norm": 0.7297873497009277, "learning_rate": 2.1849797345028917e-06, "loss": 0.0057, "step": 14110 }, { "epoch": 3.609406952965235, "grad_norm": 0.14575114846229553, "learning_rate": 2.1776098111028427e-06, "loss": 0.0122, "step": 14120 }, { "epoch": 3.611963190184049, "grad_norm": 0.20701062679290771, "learning_rate": 2.1702488763423206e-06, "loss": 0.0116, "step": 14130 }, { "epoch": 3.614519427402863, "grad_norm": 2.8510355949401855, "learning_rate": 2.1628969536641436e-06, "loss": 0.0094, "step": 14140 }, { "epoch": 3.6170756646216766, "grad_norm": 0.13213932514190674, "learning_rate": 2.1555540664824337e-06, "loss": 0.0136, "step": 14150 }, { "epoch": 3.6196319018404908, "grad_norm": 0.011733833700418472, "learning_rate": 2.1482202381825356e-06, "loss": 0.0049, "step": 14160 }, { "epoch": 3.622188139059305, "grad_norm": 0.06473023444414139, "learning_rate": 2.1408954921209435e-06, "loss": 0.007, "step": 14170 }, { "epoch": 3.6247443762781186, "grad_norm": 0.029512058943510056, "learning_rate": 2.1335798516252243e-06, "loss": 0.0187, "step": 14180 }, { "epoch": 3.6273006134969323, "grad_norm": 4.00309944152832, "learning_rate": 2.126273339993949e-06, "loss": 0.0142, "step": 14190 }, { "epoch": 3.6298568507157465, "grad_norm": 1.9352320432662964, "learning_rate": 2.1189759804966142e-06, "loss": 0.0048, "step": 14200 }, { "epoch": 3.63241308793456, "grad_norm": 2.03886079788208, "learning_rate": 2.1116877963735714e-06, "loss": 0.0007, "step": 14210 }, { "epoch": 3.6349693251533743, "grad_norm": 2.063149929046631, "learning_rate": 2.1044088108359433e-06, "loss": 0.0113, "step": 14220 }, { "epoch": 3.637525562372188, "grad_norm": 0.1273782104253769, "learning_rate": 2.0971390470655693e-06, "loss": 0.008, "step": 14230 }, { "epoch": 3.640081799591002, "grad_norm": 0.050878312438726425, "learning_rate": 2.089878528214908e-06, "loss": 0.0002, "step": 14240 }, { "epoch": 3.642638036809816, "grad_norm": 0.3995646834373474, "learning_rate": 2.082627277406983e-06, "loss": 0.0134, "step": 14250 }, { "epoch": 3.64519427402863, "grad_norm": 2.8083791732788086, "learning_rate": 2.0753853177352945e-06, "loss": 0.0122, "step": 14260 }, { "epoch": 3.6477505112474438, "grad_norm": 0.38471710681915283, "learning_rate": 2.0681526722637603e-06, "loss": 0.0061, "step": 14270 }, { "epoch": 3.6503067484662575, "grad_norm": 1.0761078596115112, "learning_rate": 2.060929364026632e-06, "loss": 0.0071, "step": 14280 }, { "epoch": 3.6528629856850716, "grad_norm": 4.6696319580078125, "learning_rate": 2.05371541602842e-06, "loss": 0.015, "step": 14290 }, { "epoch": 3.6554192229038853, "grad_norm": 1.2931352853775024, "learning_rate": 2.0465108512438285e-06, "loss": 0.0105, "step": 14300 }, { "epoch": 3.6579754601226995, "grad_norm": 0.30030888319015503, "learning_rate": 2.0393156926176796e-06, "loss": 0.0035, "step": 14310 }, { "epoch": 3.660531697341513, "grad_norm": 1.4162043333053589, "learning_rate": 2.0321299630648374e-06, "loss": 0.007, "step": 14320 }, { "epoch": 3.6630879345603273, "grad_norm": 1.6966540813446045, "learning_rate": 2.0249536854701335e-06, "loss": 0.0022, "step": 14330 }, { "epoch": 3.665644171779141, "grad_norm": 2.748809337615967, "learning_rate": 2.017786882688303e-06, "loss": 0.0059, "step": 14340 }, { "epoch": 3.668200408997955, "grad_norm": 3.920806646347046, "learning_rate": 2.0106295775439018e-06, "loss": 0.0024, "step": 14350 }, { "epoch": 3.670756646216769, "grad_norm": 4.018367290496826, "learning_rate": 2.003481792831242e-06, "loss": 0.0134, "step": 14360 }, { "epoch": 3.6733128834355826, "grad_norm": 0.7412097454071045, "learning_rate": 1.9963435513143076e-06, "loss": 0.0061, "step": 14370 }, { "epoch": 3.6758691206543967, "grad_norm": 0.00914350152015686, "learning_rate": 1.989214875726702e-06, "loss": 0.0037, "step": 14380 }, { "epoch": 3.678425357873211, "grad_norm": 0.7989885210990906, "learning_rate": 1.982095788771552e-06, "loss": 0.0081, "step": 14390 }, { "epoch": 3.6809815950920246, "grad_norm": 0.09935598075389862, "learning_rate": 1.9749863131214543e-06, "loss": 0.0057, "step": 14400 }, { "epoch": 3.6835378323108383, "grad_norm": 0.021534953266382217, "learning_rate": 1.9678864714183877e-06, "loss": 0.0009, "step": 14410 }, { "epoch": 3.6860940695296525, "grad_norm": 1.7669703960418701, "learning_rate": 1.9607962862736617e-06, "loss": 0.004, "step": 14420 }, { "epoch": 3.688650306748466, "grad_norm": 3.493924856185913, "learning_rate": 1.9537157802678196e-06, "loss": 0.0012, "step": 14430 }, { "epoch": 3.6912065439672803, "grad_norm": 0.002254684455692768, "learning_rate": 1.9466449759505856e-06, "loss": 0.0053, "step": 14440 }, { "epoch": 3.693762781186094, "grad_norm": 3.5533618927001953, "learning_rate": 1.939583895840785e-06, "loss": 0.0053, "step": 14450 }, { "epoch": 3.6963190184049077, "grad_norm": 6.355319976806641, "learning_rate": 1.932532562426275e-06, "loss": 0.0086, "step": 14460 }, { "epoch": 3.698875255623722, "grad_norm": 0.021470896899700165, "learning_rate": 1.925490998163868e-06, "loss": 0.0097, "step": 14470 }, { "epoch": 3.701431492842536, "grad_norm": 2.308654308319092, "learning_rate": 1.918459225479268e-06, "loss": 0.0156, "step": 14480 }, { "epoch": 3.7039877300613497, "grad_norm": 2.9286420345306396, "learning_rate": 1.911437266766993e-06, "loss": 0.0076, "step": 14490 }, { "epoch": 3.7065439672801634, "grad_norm": 0.0710514560341835, "learning_rate": 1.9044251443903088e-06, "loss": 0.0009, "step": 14500 }, { "epoch": 3.7091002044989776, "grad_norm": 0.029081158339977264, "learning_rate": 1.8974228806811496e-06, "loss": 0.0007, "step": 14510 }, { "epoch": 3.7116564417177913, "grad_norm": 4.481345176696777, "learning_rate": 1.8904304979400557e-06, "loss": 0.0094, "step": 14520 }, { "epoch": 3.7142126789366054, "grad_norm": 0.005593888461589813, "learning_rate": 1.8834480184360987e-06, "loss": 0.0025, "step": 14530 }, { "epoch": 3.716768916155419, "grad_norm": 0.050757136195898056, "learning_rate": 1.8764754644068122e-06, "loss": 0.0052, "step": 14540 }, { "epoch": 3.719325153374233, "grad_norm": 0.02077576145529747, "learning_rate": 1.8695128580581146e-06, "loss": 0.0015, "step": 14550 }, { "epoch": 3.721881390593047, "grad_norm": 0.041414808481931686, "learning_rate": 1.862560221564247e-06, "loss": 0.0077, "step": 14560 }, { "epoch": 3.724437627811861, "grad_norm": 0.014929791912436485, "learning_rate": 1.8556175770676987e-06, "loss": 0.0033, "step": 14570 }, { "epoch": 3.726993865030675, "grad_norm": 0.21779873967170715, "learning_rate": 1.8486849466791385e-06, "loss": 0.0058, "step": 14580 }, { "epoch": 3.7295501022494886, "grad_norm": 0.025204051285982132, "learning_rate": 1.8417623524773343e-06, "loss": 0.0102, "step": 14590 }, { "epoch": 3.7321063394683027, "grad_norm": 0.015351396054029465, "learning_rate": 1.8348498165091056e-06, "loss": 0.0017, "step": 14600 }, { "epoch": 3.7346625766871164, "grad_norm": 0.05748201906681061, "learning_rate": 1.827947360789225e-06, "loss": 0.0054, "step": 14610 }, { "epoch": 3.7372188139059306, "grad_norm": 1.63164484500885, "learning_rate": 1.8210550073003701e-06, "loss": 0.0067, "step": 14620 }, { "epoch": 3.7397750511247443, "grad_norm": 0.021220263093709946, "learning_rate": 1.814172777993039e-06, "loss": 0.0112, "step": 14630 }, { "epoch": 3.7423312883435584, "grad_norm": 1.962134599685669, "learning_rate": 1.807300694785496e-06, "loss": 0.0066, "step": 14640 }, { "epoch": 3.744887525562372, "grad_norm": 0.02569643221795559, "learning_rate": 1.800438779563683e-06, "loss": 0.01, "step": 14650 }, { "epoch": 3.7474437627811863, "grad_norm": 0.10192188620567322, "learning_rate": 1.7935870541811633e-06, "loss": 0.0025, "step": 14660 }, { "epoch": 3.75, "grad_norm": 0.06718003004789352, "learning_rate": 1.7867455404590495e-06, "loss": 0.0014, "step": 14670 }, { "epoch": 3.7525562372188137, "grad_norm": 0.01870041899383068, "learning_rate": 1.7799142601859322e-06, "loss": 0.0062, "step": 14680 }, { "epoch": 3.755112474437628, "grad_norm": 3.080137014389038, "learning_rate": 1.7730932351178055e-06, "loss": 0.0049, "step": 14690 }, { "epoch": 3.7576687116564416, "grad_norm": 0.33492809534072876, "learning_rate": 1.7662824869780094e-06, "loss": 0.0088, "step": 14700 }, { "epoch": 3.7602249488752557, "grad_norm": 0.7548993825912476, "learning_rate": 1.759482037457152e-06, "loss": 0.0021, "step": 14710 }, { "epoch": 3.7627811860940694, "grad_norm": 0.2977140247821808, "learning_rate": 1.7526919082130434e-06, "loss": 0.0089, "step": 14720 }, { "epoch": 3.7653374233128836, "grad_norm": 0.009994206950068474, "learning_rate": 1.7459121208706264e-06, "loss": 0.0069, "step": 14730 }, { "epoch": 3.7678936605316973, "grad_norm": 1.6630052328109741, "learning_rate": 1.7391426970219021e-06, "loss": 0.0103, "step": 14740 }, { "epoch": 3.7704498977505114, "grad_norm": 1.2915098667144775, "learning_rate": 1.7323836582258774e-06, "loss": 0.0079, "step": 14750 }, { "epoch": 3.773006134969325, "grad_norm": 3.242319345474243, "learning_rate": 1.7256350260084736e-06, "loss": 0.0069, "step": 14760 }, { "epoch": 3.775562372188139, "grad_norm": 0.026173055171966553, "learning_rate": 1.718896821862478e-06, "loss": 0.0011, "step": 14770 }, { "epoch": 3.778118609406953, "grad_norm": 0.021731965243816376, "learning_rate": 1.7121690672474577e-06, "loss": 0.0042, "step": 14780 }, { "epoch": 3.780674846625767, "grad_norm": 4.898509502410889, "learning_rate": 1.7054517835897144e-06, "loss": 0.0178, "step": 14790 }, { "epoch": 3.783231083844581, "grad_norm": 5.831714630126953, "learning_rate": 1.6987449922821887e-06, "loss": 0.006, "step": 14800 }, { "epoch": 3.7857873210633946, "grad_norm": 0.009105149656534195, "learning_rate": 1.6920487146844117e-06, "loss": 0.0012, "step": 14810 }, { "epoch": 3.7883435582822087, "grad_norm": 0.0681765154004097, "learning_rate": 1.6853629721224318e-06, "loss": 0.0064, "step": 14820 }, { "epoch": 3.7908997955010224, "grad_norm": 0.09596231579780579, "learning_rate": 1.6786877858887457e-06, "loss": 0.0036, "step": 14830 }, { "epoch": 3.7934560327198366, "grad_norm": 0.2018987387418747, "learning_rate": 1.6720231772422251e-06, "loss": 0.0041, "step": 14840 }, { "epoch": 3.7960122699386503, "grad_norm": 0.034721288830041885, "learning_rate": 1.665369167408062e-06, "loss": 0.0083, "step": 14850 }, { "epoch": 3.798568507157464, "grad_norm": 0.009844356216490269, "learning_rate": 1.6587257775776889e-06, "loss": 0.0047, "step": 14860 }, { "epoch": 3.801124744376278, "grad_norm": 0.014034909196197987, "learning_rate": 1.6520930289087206e-06, "loss": 0.0053, "step": 14870 }, { "epoch": 3.8036809815950923, "grad_norm": 0.03924409672617912, "learning_rate": 1.6454709425248754e-06, "loss": 0.0053, "step": 14880 }, { "epoch": 3.806237218813906, "grad_norm": 0.03811722993850708, "learning_rate": 1.6388595395159207e-06, "loss": 0.0107, "step": 14890 }, { "epoch": 3.8087934560327197, "grad_norm": 3.9966225624084473, "learning_rate": 1.632258840937599e-06, "loss": 0.0111, "step": 14900 }, { "epoch": 3.811349693251534, "grad_norm": 0.009593687951564789, "learning_rate": 1.6256688678115607e-06, "loss": 0.0138, "step": 14910 }, { "epoch": 3.8139059304703475, "grad_norm": 0.011800892651081085, "learning_rate": 1.6190896411252966e-06, "loss": 0.0066, "step": 14920 }, { "epoch": 3.8164621676891617, "grad_norm": 0.02664501592516899, "learning_rate": 1.612521181832075e-06, "loss": 0.0053, "step": 14930 }, { "epoch": 3.8190184049079754, "grad_norm": 2.8575503826141357, "learning_rate": 1.6059635108508731e-06, "loss": 0.0082, "step": 14940 }, { "epoch": 3.821574642126789, "grad_norm": 1.9057544469833374, "learning_rate": 1.5994166490663087e-06, "loss": 0.0026, "step": 14950 }, { "epoch": 3.8241308793456033, "grad_norm": 0.012572694569826126, "learning_rate": 1.5928806173285716e-06, "loss": 0.0035, "step": 14960 }, { "epoch": 3.8266871165644174, "grad_norm": 3.401106595993042, "learning_rate": 1.58635543645337e-06, "loss": 0.0068, "step": 14970 }, { "epoch": 3.829243353783231, "grad_norm": 0.008161719888448715, "learning_rate": 1.5798411272218427e-06, "loss": 0.0048, "step": 14980 }, { "epoch": 3.831799591002045, "grad_norm": 4.691705703735352, "learning_rate": 1.5733377103805154e-06, "loss": 0.0045, "step": 14990 }, { "epoch": 3.834355828220859, "grad_norm": 0.011220389977097511, "learning_rate": 1.5668452066412137e-06, "loss": 0.0004, "step": 15000 }, { "epoch": 3.8369120654396727, "grad_norm": 0.5998366475105286, "learning_rate": 1.56036363668102e-06, "loss": 0.0055, "step": 15010 }, { "epoch": 3.839468302658487, "grad_norm": 3.6791257858276367, "learning_rate": 1.5538930211421839e-06, "loss": 0.0094, "step": 15020 }, { "epoch": 3.8420245398773005, "grad_norm": 0.025082003325223923, "learning_rate": 1.5474333806320735e-06, "loss": 0.004, "step": 15030 }, { "epoch": 3.8445807770961147, "grad_norm": 0.01650637947022915, "learning_rate": 1.540984735723104e-06, "loss": 0.0042, "step": 15040 }, { "epoch": 3.8471370143149284, "grad_norm": 0.943402111530304, "learning_rate": 1.5345471069526718e-06, "loss": 0.0047, "step": 15050 }, { "epoch": 3.8496932515337425, "grad_norm": 0.018428007140755653, "learning_rate": 1.5281205148230866e-06, "loss": 0.0187, "step": 15060 }, { "epoch": 3.8522494887525562, "grad_norm": 0.013692053034901619, "learning_rate": 1.5217049798015127e-06, "loss": 0.0018, "step": 15070 }, { "epoch": 3.85480572597137, "grad_norm": 0.01854683831334114, "learning_rate": 1.5153005223198986e-06, "loss": 0.0011, "step": 15080 }, { "epoch": 3.857361963190184, "grad_norm": 0.07175463438034058, "learning_rate": 1.5089071627749157e-06, "loss": 0.0003, "step": 15090 }, { "epoch": 3.859918200408998, "grad_norm": 0.3006972074508667, "learning_rate": 1.5025249215278852e-06, "loss": 0.0027, "step": 15100 }, { "epoch": 3.862474437627812, "grad_norm": 0.5995022058486938, "learning_rate": 1.4961538189047258e-06, "loss": 0.0079, "step": 15110 }, { "epoch": 3.8650306748466257, "grad_norm": 0.03315654397010803, "learning_rate": 1.489793875195879e-06, "loss": 0.0002, "step": 15120 }, { "epoch": 3.86758691206544, "grad_norm": 0.01580039970576763, "learning_rate": 1.4834451106562502e-06, "loss": 0.0002, "step": 15130 }, { "epoch": 3.8701431492842535, "grad_norm": 0.047737788408994675, "learning_rate": 1.477107545505137e-06, "loss": 0.0041, "step": 15140 }, { "epoch": 3.8726993865030677, "grad_norm": 2.45046329498291, "learning_rate": 1.470781199926174e-06, "loss": 0.0075, "step": 15150 }, { "epoch": 3.8752556237218814, "grad_norm": 3.830009698867798, "learning_rate": 1.4644660940672628e-06, "loss": 0.0058, "step": 15160 }, { "epoch": 3.877811860940695, "grad_norm": 0.03586220741271973, "learning_rate": 1.4581622480405095e-06, "loss": 0.0055, "step": 15170 }, { "epoch": 3.8803680981595092, "grad_norm": 0.048213325440883636, "learning_rate": 1.45186968192216e-06, "loss": 0.0135, "step": 15180 }, { "epoch": 3.8829243353783234, "grad_norm": 0.011242564767599106, "learning_rate": 1.4455884157525369e-06, "loss": 0.0049, "step": 15190 }, { "epoch": 3.885480572597137, "grad_norm": 0.26863622665405273, "learning_rate": 1.4393184695359752e-06, "loss": 0.0038, "step": 15200 }, { "epoch": 3.888036809815951, "grad_norm": 0.09017948806285858, "learning_rate": 1.4330598632407554e-06, "loss": 0.0018, "step": 15210 }, { "epoch": 3.890593047034765, "grad_norm": 0.21921706199645996, "learning_rate": 1.4268126167990475e-06, "loss": 0.0051, "step": 15220 }, { "epoch": 3.8931492842535786, "grad_norm": 0.048430170863866806, "learning_rate": 1.4205767501068413e-06, "loss": 0.0027, "step": 15230 }, { "epoch": 3.895705521472393, "grad_norm": 0.03785645216703415, "learning_rate": 1.4143522830238855e-06, "loss": 0.0022, "step": 15240 }, { "epoch": 3.8982617586912065, "grad_norm": 0.018065497279167175, "learning_rate": 1.4081392353736206e-06, "loss": 0.0075, "step": 15250 }, { "epoch": 3.90081799591002, "grad_norm": 1.4500396251678467, "learning_rate": 1.4019376269431229e-06, "loss": 0.0034, "step": 15260 }, { "epoch": 3.9033742331288344, "grad_norm": 0.04054681211709976, "learning_rate": 1.395747477483036e-06, "loss": 0.0021, "step": 15270 }, { "epoch": 3.9059304703476485, "grad_norm": 1.107225775718689, "learning_rate": 1.3895688067075109e-06, "loss": 0.0012, "step": 15280 }, { "epoch": 3.908486707566462, "grad_norm": 0.14938171207904816, "learning_rate": 1.3834016342941364e-06, "loss": 0.0052, "step": 15290 }, { "epoch": 3.911042944785276, "grad_norm": 0.2700784504413605, "learning_rate": 1.3772459798838884e-06, "loss": 0.022, "step": 15300 }, { "epoch": 3.91359918200409, "grad_norm": 0.010788323357701302, "learning_rate": 1.3711018630810568e-06, "loss": 0.0127, "step": 15310 }, { "epoch": 3.9161554192229038, "grad_norm": 0.17254537343978882, "learning_rate": 1.3649693034531908e-06, "loss": 0.0026, "step": 15320 }, { "epoch": 3.918711656441718, "grad_norm": 2.0272927284240723, "learning_rate": 1.3588483205310238e-06, "loss": 0.0028, "step": 15330 }, { "epoch": 3.9212678936605316, "grad_norm": 0.7689258456230164, "learning_rate": 1.352738933808434e-06, "loss": 0.0046, "step": 15340 }, { "epoch": 3.9238241308793453, "grad_norm": 0.09393978118896484, "learning_rate": 1.3466411627423553e-06, "loss": 0.0058, "step": 15350 }, { "epoch": 3.9263803680981595, "grad_norm": 0.02193518355488777, "learning_rate": 1.3405550267527373e-06, "loss": 0.0118, "step": 15360 }, { "epoch": 3.9289366053169736, "grad_norm": 1.4280931949615479, "learning_rate": 1.3344805452224668e-06, "loss": 0.0055, "step": 15370 }, { "epoch": 3.9314928425357873, "grad_norm": 0.017598293721675873, "learning_rate": 1.3284177374973252e-06, "loss": 0.0001, "step": 15380 }, { "epoch": 3.934049079754601, "grad_norm": 0.017776915803551674, "learning_rate": 1.3223666228859034e-06, "loss": 0.0089, "step": 15390 }, { "epoch": 3.936605316973415, "grad_norm": 0.025338156148791313, "learning_rate": 1.3163272206595607e-06, "loss": 0.0101, "step": 15400 }, { "epoch": 3.939161554192229, "grad_norm": 0.00857964251190424, "learning_rate": 1.3102995500523513e-06, "loss": 0.0002, "step": 15410 }, { "epoch": 3.941717791411043, "grad_norm": 0.17898601293563843, "learning_rate": 1.3042836302609707e-06, "loss": 0.0083, "step": 15420 }, { "epoch": 3.9442740286298568, "grad_norm": 0.01416697259992361, "learning_rate": 1.2982794804446858e-06, "loss": 0.0031, "step": 15430 }, { "epoch": 3.946830265848671, "grad_norm": 0.03067069500684738, "learning_rate": 1.2922871197252818e-06, "loss": 0.0027, "step": 15440 }, { "epoch": 3.9493865030674846, "grad_norm": 0.013419978320598602, "learning_rate": 1.2863065671869995e-06, "loss": 0.0004, "step": 15450 }, { "epoch": 3.9519427402862988, "grad_norm": 0.36794596910476685, "learning_rate": 1.2803378418764728e-06, "loss": 0.0034, "step": 15460 }, { "epoch": 3.9544989775051125, "grad_norm": 0.014534058049321175, "learning_rate": 1.274380962802666e-06, "loss": 0.0006, "step": 15470 }, { "epoch": 3.957055214723926, "grad_norm": 0.953043520450592, "learning_rate": 1.2684359489368186e-06, "loss": 0.0097, "step": 15480 }, { "epoch": 3.9596114519427403, "grad_norm": 0.0640961155295372, "learning_rate": 1.2625028192123822e-06, "loss": 0.0076, "step": 15490 }, { "epoch": 3.962167689161554, "grad_norm": 0.026453586295247078, "learning_rate": 1.2565815925249613e-06, "loss": 0.0042, "step": 15500 }, { "epoch": 3.964723926380368, "grad_norm": 0.02020988054573536, "learning_rate": 1.250672287732247e-06, "loss": 0.0005, "step": 15510 }, { "epoch": 3.967280163599182, "grad_norm": 0.8880366683006287, "learning_rate": 1.2447749236539674e-06, "loss": 0.0122, "step": 15520 }, { "epoch": 3.969836400817996, "grad_norm": 0.06537387520074844, "learning_rate": 1.2388895190718209e-06, "loss": 0.0043, "step": 15530 }, { "epoch": 3.9723926380368098, "grad_norm": 0.03674660250544548, "learning_rate": 1.2330160927294178e-06, "loss": 0.001, "step": 15540 }, { "epoch": 3.974948875255624, "grad_norm": 0.06352321058511734, "learning_rate": 1.2271546633322157e-06, "loss": 0.0007, "step": 15550 }, { "epoch": 3.9775051124744376, "grad_norm": 2.053643226623535, "learning_rate": 1.2213052495474759e-06, "loss": 0.0064, "step": 15560 }, { "epoch": 3.9800613496932513, "grad_norm": 0.006071037612855434, "learning_rate": 1.2154678700041805e-06, "loss": 0.0061, "step": 15570 }, { "epoch": 3.9826175869120655, "grad_norm": 0.032529015094041824, "learning_rate": 1.2096425432929943e-06, "loss": 0.0042, "step": 15580 }, { "epoch": 3.9851738241308796, "grad_norm": 0.14356601238250732, "learning_rate": 1.2038292879661896e-06, "loss": 0.0025, "step": 15590 }, { "epoch": 3.9877300613496933, "grad_norm": 0.009755146689713001, "learning_rate": 1.1980281225376029e-06, "loss": 0.0123, "step": 15600 }, { "epoch": 3.990286298568507, "grad_norm": 0.026369577273726463, "learning_rate": 1.1922390654825582e-06, "loss": 0.002, "step": 15610 }, { "epoch": 3.992842535787321, "grad_norm": 1.3809363842010498, "learning_rate": 1.186462135237823e-06, "loss": 0.0037, "step": 15620 }, { "epoch": 3.995398773006135, "grad_norm": 0.06871844828128815, "learning_rate": 1.1806973502015423e-06, "loss": 0.0076, "step": 15630 }, { "epoch": 3.997955010224949, "grad_norm": 0.01024967897683382, "learning_rate": 1.1749447287331805e-06, "loss": 0.0064, "step": 15640 }, { "epoch": 4.000511247443763, "grad_norm": 0.010401812382042408, "learning_rate": 1.1692042891534677e-06, "loss": 0.0001, "step": 15650 }, { "epoch": 4.0030674846625764, "grad_norm": 0.019328856840729713, "learning_rate": 1.1634760497443308e-06, "loss": 0.0011, "step": 15660 }, { "epoch": 4.00562372188139, "grad_norm": 2.2071306705474854, "learning_rate": 1.1577600287488472e-06, "loss": 0.0046, "step": 15670 }, { "epoch": 4.008179959100205, "grad_norm": 0.2718164026737213, "learning_rate": 1.1520562443711813e-06, "loss": 0.0002, "step": 15680 }, { "epoch": 4.0107361963190185, "grad_norm": 0.0036849735770374537, "learning_rate": 1.1463647147765262e-06, "loss": 0.0024, "step": 15690 }, { "epoch": 4.013292433537832, "grad_norm": 0.026393355801701546, "learning_rate": 1.1406854580910426e-06, "loss": 0.0003, "step": 15700 }, { "epoch": 4.015848670756646, "grad_norm": 0.02888057939708233, "learning_rate": 1.1350184924018137e-06, "loss": 0.001, "step": 15710 }, { "epoch": 4.0184049079754605, "grad_norm": 4.927444934844971, "learning_rate": 1.1293638357567692e-06, "loss": 0.0046, "step": 15720 }, { "epoch": 4.020961145194274, "grad_norm": 1.5200058221817017, "learning_rate": 1.1237215061646446e-06, "loss": 0.0009, "step": 15730 }, { "epoch": 4.023517382413088, "grad_norm": 0.006760958582162857, "learning_rate": 1.118091521594909e-06, "loss": 0.0001, "step": 15740 }, { "epoch": 4.026073619631902, "grad_norm": 0.0747433677315712, "learning_rate": 1.1124738999777268e-06, "loss": 0.0004, "step": 15750 }, { "epoch": 4.028629856850716, "grad_norm": 0.018624255433678627, "learning_rate": 1.1068686592038786e-06, "loss": 0.0011, "step": 15760 }, { "epoch": 4.03118609406953, "grad_norm": 0.018480489030480385, "learning_rate": 1.10127581712472e-06, "loss": 0.0029, "step": 15770 }, { "epoch": 4.033742331288344, "grad_norm": 0.08727142959833145, "learning_rate": 1.0956953915521196e-06, "loss": 0.0015, "step": 15780 }, { "epoch": 4.036298568507157, "grad_norm": 0.17428268492221832, "learning_rate": 1.0901274002584029e-06, "loss": 0.0003, "step": 15790 }, { "epoch": 4.038854805725971, "grad_norm": 0.006692373659461737, "learning_rate": 1.0845718609762912e-06, "loss": 0.0016, "step": 15800 }, { "epoch": 4.041411042944786, "grad_norm": 0.03485719487071037, "learning_rate": 1.0790287913988533e-06, "loss": 0.0028, "step": 15810 }, { "epoch": 4.043967280163599, "grad_norm": 0.1434144675731659, "learning_rate": 1.0734982091794439e-06, "loss": 0.0014, "step": 15820 }, { "epoch": 4.046523517382413, "grad_norm": 0.025571011006832123, "learning_rate": 1.067980131931649e-06, "loss": 0.0043, "step": 15830 }, { "epoch": 4.049079754601227, "grad_norm": 0.022263115271925926, "learning_rate": 1.0624745772292262e-06, "loss": 0.0001, "step": 15840 }, { "epoch": 4.051635991820041, "grad_norm": 0.026498448103666306, "learning_rate": 1.0569815626060553e-06, "loss": 0.0014, "step": 15850 }, { "epoch": 4.054192229038855, "grad_norm": 0.11276555806398392, "learning_rate": 1.051501105556077e-06, "loss": 0.0012, "step": 15860 }, { "epoch": 4.056748466257669, "grad_norm": 0.03225693851709366, "learning_rate": 1.0460332235332421e-06, "loss": 0.0019, "step": 15870 }, { "epoch": 4.059304703476482, "grad_norm": 0.6012237071990967, "learning_rate": 1.0405779339514466e-06, "loss": 0.0028, "step": 15880 }, { "epoch": 4.061860940695296, "grad_norm": 1.399053931236267, "learning_rate": 1.0351352541844895e-06, "loss": 0.0005, "step": 15890 }, { "epoch": 4.064417177914111, "grad_norm": 0.004602417815476656, "learning_rate": 1.0297052015660065e-06, "loss": 0.0018, "step": 15900 }, { "epoch": 4.066973415132924, "grad_norm": 0.011124187149107456, "learning_rate": 1.0242877933894212e-06, "loss": 0.0026, "step": 15910 }, { "epoch": 4.069529652351738, "grad_norm": 0.012430194765329361, "learning_rate": 1.0188830469078832e-06, "loss": 0.0008, "step": 15920 }, { "epoch": 4.072085889570552, "grad_norm": 0.00355120119638741, "learning_rate": 1.0134909793342251e-06, "loss": 0.0014, "step": 15930 }, { "epoch": 4.074642126789366, "grad_norm": 0.010579611174762249, "learning_rate": 1.0081116078408932e-06, "loss": 0.0002, "step": 15940 }, { "epoch": 4.07719836400818, "grad_norm": 0.13667239248752594, "learning_rate": 1.0027449495599045e-06, "loss": 0.0002, "step": 15950 }, { "epoch": 4.079754601226994, "grad_norm": 0.013385191559791565, "learning_rate": 9.97391021582782e-07, "loss": 0.0018, "step": 15960 }, { "epoch": 4.0823108384458076, "grad_norm": 0.09518828243017197, "learning_rate": 9.92049840960514e-07, "loss": 0.002, "step": 15970 }, { "epoch": 4.084867075664621, "grad_norm": 0.017801359295845032, "learning_rate": 9.86721424703483e-07, "loss": 0.0005, "step": 15980 }, { "epoch": 4.087423312883436, "grad_norm": 0.021596604958176613, "learning_rate": 9.81405789781425e-07, "loss": 0.0011, "step": 15990 }, { "epoch": 4.08997955010225, "grad_norm": 2.4400172233581543, "learning_rate": 9.76102953123369e-07, "loss": 0.0041, "step": 16000 }, { "epoch": 4.092535787321063, "grad_norm": 0.07604683190584183, "learning_rate": 9.708129316175875e-07, "loss": 0.0009, "step": 16010 }, { "epoch": 4.095092024539877, "grad_norm": 0.00839927326887846, "learning_rate": 9.655357421115324e-07, "loss": 0.0001, "step": 16020 }, { "epoch": 4.097648261758692, "grad_norm": 0.8300763964653015, "learning_rate": 9.60271401411797e-07, "loss": 0.0009, "step": 16030 }, { "epoch": 4.100204498977505, "grad_norm": 0.036536745727062225, "learning_rate": 9.550199262840494e-07, "loss": 0.0004, "step": 16040 }, { "epoch": 4.102760736196319, "grad_norm": 0.013892588205635548, "learning_rate": 9.49781333452987e-07, "loss": 0.0009, "step": 16050 }, { "epoch": 4.105316973415133, "grad_norm": 0.27383795380592346, "learning_rate": 9.445556396022754e-07, "loss": 0.0005, "step": 16060 }, { "epoch": 4.107873210633947, "grad_norm": 0.009578673169016838, "learning_rate": 9.393428613745036e-07, "loss": 0.0036, "step": 16070 }, { "epoch": 4.110429447852761, "grad_norm": 0.42609789967536926, "learning_rate": 9.341430153711306e-07, "loss": 0.0049, "step": 16080 }, { "epoch": 4.112985685071575, "grad_norm": 0.12703081965446472, "learning_rate": 9.289561181524214e-07, "loss": 0.0037, "step": 16090 }, { "epoch": 4.115541922290388, "grad_norm": 0.13383671641349792, "learning_rate": 9.237821862374092e-07, "loss": 0.0022, "step": 16100 }, { "epoch": 4.118098159509202, "grad_norm": 0.011773956939578056, "learning_rate": 9.186212361038288e-07, "loss": 0.0002, "step": 16110 }, { "epoch": 4.120654396728017, "grad_norm": 2.616377115249634, "learning_rate": 9.134732841880811e-07, "loss": 0.003, "step": 16120 }, { "epoch": 4.12321063394683, "grad_norm": 0.012035293504595757, "learning_rate": 9.083383468851609e-07, "loss": 0.0079, "step": 16130 }, { "epoch": 4.125766871165644, "grad_norm": 0.2209741622209549, "learning_rate": 9.032164405486193e-07, "loss": 0.0047, "step": 16140 }, { "epoch": 4.128323108384458, "grad_norm": 0.00564198475331068, "learning_rate": 8.981075814905077e-07, "loss": 0.0009, "step": 16150 }, { "epoch": 4.130879345603272, "grad_norm": 0.021742451936006546, "learning_rate": 8.930117859813236e-07, "loss": 0.0009, "step": 16160 }, { "epoch": 4.133435582822086, "grad_norm": 0.011637063696980476, "learning_rate": 8.879290702499576e-07, "loss": 0.0025, "step": 16170 }, { "epoch": 4.1359918200409, "grad_norm": 0.44723692536354065, "learning_rate": 8.828594504836491e-07, "loss": 0.0012, "step": 16180 }, { "epoch": 4.1385480572597135, "grad_norm": 2.4436564445495605, "learning_rate": 8.778029428279278e-07, "loss": 0.0014, "step": 16190 }, { "epoch": 4.141104294478527, "grad_norm": 0.3361468017101288, "learning_rate": 8.727595633865643e-07, "loss": 0.0013, "step": 16200 }, { "epoch": 4.143660531697342, "grad_norm": 1.743153691291809, "learning_rate": 8.677293282215182e-07, "loss": 0.0022, "step": 16210 }, { "epoch": 4.1462167689161555, "grad_norm": 0.13101418316364288, "learning_rate": 8.627122533528892e-07, "loss": 0.0001, "step": 16220 }, { "epoch": 4.148773006134969, "grad_norm": 0.00849368516355753, "learning_rate": 8.577083547588638e-07, "loss": 0.0001, "step": 16230 }, { "epoch": 4.151329243353783, "grad_norm": 0.005825154948979616, "learning_rate": 8.527176483756671e-07, "loss": 0.0004, "step": 16240 }, { "epoch": 4.1538854805725975, "grad_norm": 0.012522836215794086, "learning_rate": 8.477401500975063e-07, "loss": 0.0005, "step": 16250 }, { "epoch": 4.156441717791411, "grad_norm": 0.1203024610877037, "learning_rate": 8.427758757765264e-07, "loss": 0.0029, "step": 16260 }, { "epoch": 4.158997955010225, "grad_norm": 0.046782489866018295, "learning_rate": 8.378248412227574e-07, "loss": 0.0016, "step": 16270 }, { "epoch": 4.161554192229039, "grad_norm": 0.02540050819516182, "learning_rate": 8.328870622040652e-07, "loss": 0.0001, "step": 16280 }, { "epoch": 4.164110429447852, "grad_norm": 0.00631357915699482, "learning_rate": 8.27962554446094e-07, "loss": 0.0001, "step": 16290 }, { "epoch": 4.166666666666667, "grad_norm": 0.02206520363688469, "learning_rate": 8.23051333632231e-07, "loss": 0.0001, "step": 16300 }, { "epoch": 4.169222903885481, "grad_norm": 0.02339177392423153, "learning_rate": 8.181534154035398e-07, "loss": 0.0012, "step": 16310 }, { "epoch": 4.171779141104294, "grad_norm": 0.11059535294771194, "learning_rate": 8.132688153587237e-07, "loss": 0.0002, "step": 16320 }, { "epoch": 4.174335378323108, "grad_norm": 0.04154384881258011, "learning_rate": 8.083975490540658e-07, "loss": 0.0003, "step": 16330 }, { "epoch": 4.176891615541923, "grad_norm": 0.004592495039105415, "learning_rate": 8.035396320033911e-07, "loss": 0.0022, "step": 16340 }, { "epoch": 4.179447852760736, "grad_norm": 0.005622932221740484, "learning_rate": 7.98695079678004e-07, "loss": 0.0001, "step": 16350 }, { "epoch": 4.18200408997955, "grad_norm": 0.008403644897043705, "learning_rate": 7.93863907506649e-07, "loss": 0.0007, "step": 16360 }, { "epoch": 4.184560327198364, "grad_norm": 0.04854018986225128, "learning_rate": 7.890461308754565e-07, "loss": 0.0025, "step": 16370 }, { "epoch": 4.1871165644171775, "grad_norm": 0.007120284251868725, "learning_rate": 7.842417651278978e-07, "loss": 0.0041, "step": 16380 }, { "epoch": 4.189672801635992, "grad_norm": 0.005977618508040905, "learning_rate": 7.794508255647293e-07, "loss": 0.0005, "step": 16390 }, { "epoch": 4.192229038854806, "grad_norm": 0.01604490913450718, "learning_rate": 7.746733274439517e-07, "loss": 0.0005, "step": 16400 }, { "epoch": 4.1947852760736195, "grad_norm": 0.04814684018492699, "learning_rate": 7.699092859807566e-07, "loss": 0.0006, "step": 16410 }, { "epoch": 4.197341513292433, "grad_norm": 0.9865232706069946, "learning_rate": 7.651587163474822e-07, "loss": 0.0002, "step": 16420 }, { "epoch": 4.199897750511248, "grad_norm": 0.014311658218502998, "learning_rate": 7.604216336735554e-07, "loss": 0.0016, "step": 16430 }, { "epoch": 4.2024539877300615, "grad_norm": 0.07794589549303055, "learning_rate": 7.556980530454571e-07, "loss": 0.001, "step": 16440 }, { "epoch": 4.205010224948875, "grad_norm": 0.1298278272151947, "learning_rate": 7.509879895066652e-07, "loss": 0.0025, "step": 16450 }, { "epoch": 4.207566462167689, "grad_norm": 0.026249248534440994, "learning_rate": 7.462914580576081e-07, "loss": 0.0028, "step": 16460 }, { "epoch": 4.210122699386503, "grad_norm": 0.008666482754051685, "learning_rate": 7.416084736556173e-07, "loss": 0.0007, "step": 16470 }, { "epoch": 4.212678936605317, "grad_norm": 0.37366658449172974, "learning_rate": 7.369390512148816e-07, "loss": 0.0008, "step": 16480 }, { "epoch": 4.215235173824131, "grad_norm": 0.007286503445357084, "learning_rate": 7.322832056063978e-07, "loss": 0.0003, "step": 16490 }, { "epoch": 4.217791411042945, "grad_norm": 1.1950106620788574, "learning_rate": 7.276409516579252e-07, "loss": 0.0024, "step": 16500 }, { "epoch": 4.220347648261758, "grad_norm": 0.008293128572404385, "learning_rate": 7.23012304153931e-07, "loss": 0.0001, "step": 16510 }, { "epoch": 4.222903885480573, "grad_norm": 0.009917296469211578, "learning_rate": 7.183972778355586e-07, "loss": 0.0006, "step": 16520 }, { "epoch": 4.225460122699387, "grad_norm": 0.013085747137665749, "learning_rate": 7.137958874005629e-07, "loss": 0.003, "step": 16530 }, { "epoch": 4.2280163599182, "grad_norm": 0.09627839922904968, "learning_rate": 7.092081475032753e-07, "loss": 0.0006, "step": 16540 }, { "epoch": 4.230572597137014, "grad_norm": 0.007571856956928968, "learning_rate": 7.046340727545531e-07, "loss": 0.0001, "step": 16550 }, { "epoch": 4.233128834355828, "grad_norm": 0.005919609218835831, "learning_rate": 7.000736777217332e-07, "loss": 0.0015, "step": 16560 }, { "epoch": 4.235685071574642, "grad_norm": 0.016153009608387947, "learning_rate": 6.955269769285877e-07, "loss": 0.0001, "step": 16570 }, { "epoch": 4.238241308793456, "grad_norm": 0.016689471900463104, "learning_rate": 6.909939848552722e-07, "loss": 0.0015, "step": 16580 }, { "epoch": 4.24079754601227, "grad_norm": 0.012151209637522697, "learning_rate": 6.864747159382851e-07, "loss": 0.0056, "step": 16590 }, { "epoch": 4.2433537832310835, "grad_norm": 0.006638688966631889, "learning_rate": 6.819691845704207e-07, "loss": 0.002, "step": 16600 }, { "epoch": 4.245910020449898, "grad_norm": 0.026410236954689026, "learning_rate": 6.774774051007227e-07, "loss": 0.0006, "step": 16610 }, { "epoch": 4.248466257668712, "grad_norm": 0.03302263841032982, "learning_rate": 6.729993918344347e-07, "loss": 0.0001, "step": 16620 }, { "epoch": 4.2510224948875255, "grad_norm": 0.010984467342495918, "learning_rate": 6.685351590329625e-07, "loss": 0.0033, "step": 16630 }, { "epoch": 4.253578732106339, "grad_norm": 0.01048219483345747, "learning_rate": 6.640847209138224e-07, "loss": 0.0024, "step": 16640 }, { "epoch": 4.256134969325154, "grad_norm": 0.005691261030733585, "learning_rate": 6.596480916505993e-07, "loss": 0.0034, "step": 16650 }, { "epoch": 4.2586912065439675, "grad_norm": 0.018030589446425438, "learning_rate": 6.552252853728958e-07, "loss": 0.0003, "step": 16660 }, { "epoch": 4.261247443762781, "grad_norm": 0.5344707369804382, "learning_rate": 6.508163161662994e-07, "loss": 0.001, "step": 16670 }, { "epoch": 4.263803680981595, "grad_norm": 0.01154622994363308, "learning_rate": 6.464211980723223e-07, "loss": 0.0011, "step": 16680 }, { "epoch": 4.266359918200409, "grad_norm": 0.014204679057002068, "learning_rate": 6.42039945088369e-07, "loss": 0.0008, "step": 16690 }, { "epoch": 4.268916155419223, "grad_norm": 0.17623376846313477, "learning_rate": 6.376725711676829e-07, "loss": 0.0004, "step": 16700 }, { "epoch": 4.271472392638037, "grad_norm": 0.15494291484355927, "learning_rate": 6.33319090219311e-07, "loss": 0.0002, "step": 16710 }, { "epoch": 4.274028629856851, "grad_norm": 0.007995120249688625, "learning_rate": 6.289795161080492e-07, "loss": 0.0005, "step": 16720 }, { "epoch": 4.276584867075664, "grad_norm": 0.006873907521367073, "learning_rate": 6.246538626544074e-07, "loss": 0.0021, "step": 16730 }, { "epoch": 4.279141104294479, "grad_norm": 0.8828302621841431, "learning_rate": 6.203421436345597e-07, "loss": 0.0017, "step": 16740 }, { "epoch": 4.281697341513293, "grad_norm": 0.027829930186271667, "learning_rate": 6.160443727803034e-07, "loss": 0.0083, "step": 16750 }, { "epoch": 4.284253578732106, "grad_norm": 0.010724814608693123, "learning_rate": 6.11760563779012e-07, "loss": 0.001, "step": 16760 }, { "epoch": 4.28680981595092, "grad_norm": 0.005555745679885149, "learning_rate": 6.07490730273596e-07, "loss": 0.0013, "step": 16770 }, { "epoch": 4.289366053169734, "grad_norm": 0.5492291450500488, "learning_rate": 6.03234885862457e-07, "loss": 0.0018, "step": 16780 }, { "epoch": 4.291922290388548, "grad_norm": 0.01208993699401617, "learning_rate": 5.989930440994451e-07, "loss": 0.001, "step": 16790 }, { "epoch": 4.294478527607362, "grad_norm": 0.06638287752866745, "learning_rate": 5.947652184938124e-07, "loss": 0.0055, "step": 16800 }, { "epoch": 4.297034764826176, "grad_norm": 0.021167244762182236, "learning_rate": 5.905514225101761e-07, "loss": 0.0027, "step": 16810 }, { "epoch": 4.2995910020449895, "grad_norm": 0.01520733255892992, "learning_rate": 5.863516695684713e-07, "loss": 0.0002, "step": 16820 }, { "epoch": 4.302147239263804, "grad_norm": 0.41557273268699646, "learning_rate": 5.8216597304391e-07, "loss": 0.0003, "step": 16830 }, { "epoch": 4.304703476482618, "grad_norm": 0.0344698503613472, "learning_rate": 5.779943462669357e-07, "loss": 0.0006, "step": 16840 }, { "epoch": 4.3072597137014315, "grad_norm": 0.5560601353645325, "learning_rate": 5.738368025231856e-07, "loss": 0.0053, "step": 16850 }, { "epoch": 4.309815950920245, "grad_norm": 0.014782343059778214, "learning_rate": 5.696933550534445e-07, "loss": 0.0003, "step": 16860 }, { "epoch": 4.31237218813906, "grad_norm": 0.018144864588975906, "learning_rate": 5.655640170536053e-07, "loss": 0.0014, "step": 16870 }, { "epoch": 4.3149284253578735, "grad_norm": 0.0022675390355288982, "learning_rate": 5.614488016746216e-07, "loss": 0.0007, "step": 16880 }, { "epoch": 4.317484662576687, "grad_norm": 0.22662724554538727, "learning_rate": 5.573477220224777e-07, "loss": 0.0006, "step": 16890 }, { "epoch": 4.320040899795501, "grad_norm": 0.011389585211873055, "learning_rate": 5.532607911581294e-07, "loss": 0.0022, "step": 16900 }, { "epoch": 4.322597137014315, "grad_norm": 0.06751693785190582, "learning_rate": 5.491880220974799e-07, "loss": 0.0005, "step": 16910 }, { "epoch": 4.325153374233129, "grad_norm": 0.006349935662001371, "learning_rate": 5.451294278113234e-07, "loss": 0.0002, "step": 16920 }, { "epoch": 4.327709611451943, "grad_norm": 0.013773099519312382, "learning_rate": 5.410850212253193e-07, "loss": 0.0001, "step": 16930 }, { "epoch": 4.330265848670757, "grad_norm": 0.010721610859036446, "learning_rate": 5.37054815219934e-07, "loss": 0.0001, "step": 16940 }, { "epoch": 4.33282208588957, "grad_norm": 0.3423142433166504, "learning_rate": 5.330388226304145e-07, "loss": 0.0005, "step": 16950 }, { "epoch": 4.335378323108385, "grad_norm": 1.396784782409668, "learning_rate": 5.290370562467378e-07, "loss": 0.0034, "step": 16960 }, { "epoch": 4.337934560327199, "grad_norm": 0.018410976976156235, "learning_rate": 5.250495288135776e-07, "loss": 0.001, "step": 16970 }, { "epoch": 4.340490797546012, "grad_norm": 0.0063552772626280785, "learning_rate": 5.210762530302554e-07, "loss": 0.0005, "step": 16980 }, { "epoch": 4.343047034764826, "grad_norm": 0.010990941897034645, "learning_rate": 5.17117241550707e-07, "loss": 0.0022, "step": 16990 }, { "epoch": 4.34560327198364, "grad_norm": 0.009601407684385777, "learning_rate": 5.131725069834403e-07, "loss": 0.0004, "step": 17000 }, { "epoch": 4.348159509202454, "grad_norm": 0.010526538826525211, "learning_rate": 5.092420618914934e-07, "loss": 0.0007, "step": 17010 }, { "epoch": 4.350715746421268, "grad_norm": 0.02961459383368492, "learning_rate": 5.053259187923981e-07, "loss": 0.0008, "step": 17020 }, { "epoch": 4.353271983640082, "grad_norm": 0.9297602772712708, "learning_rate": 5.01424090158133e-07, "loss": 0.0008, "step": 17030 }, { "epoch": 4.355828220858895, "grad_norm": 0.12763309478759766, "learning_rate": 4.975365884150951e-07, "loss": 0.0002, "step": 17040 }, { "epoch": 4.35838445807771, "grad_norm": 0.005043504294008017, "learning_rate": 4.93663425944047e-07, "loss": 0.001, "step": 17050 }, { "epoch": 4.360940695296524, "grad_norm": 0.7221952676773071, "learning_rate": 4.8980461508009e-07, "loss": 0.0012, "step": 17060 }, { "epoch": 4.363496932515337, "grad_norm": 0.00512282457202673, "learning_rate": 4.85960168112613e-07, "loss": 0.0041, "step": 17070 }, { "epoch": 4.366053169734151, "grad_norm": 0.27002206444740295, "learning_rate": 4.821300972852666e-07, "loss": 0.002, "step": 17080 }, { "epoch": 4.368609406952965, "grad_norm": 0.0055974265560507774, "learning_rate": 4.783144147959096e-07, "loss": 0.0001, "step": 17090 }, { "epoch": 4.371165644171779, "grad_norm": 0.011708883568644524, "learning_rate": 4.745131327965818e-07, "loss": 0.0006, "step": 17100 }, { "epoch": 4.373721881390593, "grad_norm": 0.47823408246040344, "learning_rate": 4.7072626339345896e-07, "loss": 0.0006, "step": 17110 }, { "epoch": 4.376278118609407, "grad_norm": 0.01948222517967224, "learning_rate": 4.669538186468192e-07, "loss": 0.0007, "step": 17120 }, { "epoch": 4.378834355828221, "grad_norm": 0.006773567758500576, "learning_rate": 4.6319581057099604e-07, "loss": 0.0009, "step": 17130 }, { "epoch": 4.381390593047035, "grad_norm": 0.009596975520253181, "learning_rate": 4.5945225113435024e-07, "loss": 0.0005, "step": 17140 }, { "epoch": 4.383946830265849, "grad_norm": 0.007052087225019932, "learning_rate": 4.557231522592254e-07, "loss": 0.0102, "step": 17150 }, { "epoch": 4.386503067484663, "grad_norm": 0.00172753247898072, "learning_rate": 4.520085258219131e-07, "loss": 0.0011, "step": 17160 }, { "epoch": 4.389059304703476, "grad_norm": 0.0074590700678527355, "learning_rate": 4.4830838365261086e-07, "loss": 0.0003, "step": 17170 }, { "epoch": 4.39161554192229, "grad_norm": 0.011804984882473946, "learning_rate": 4.446227375353895e-07, "loss": 0.0001, "step": 17180 }, { "epoch": 4.394171779141105, "grad_norm": 0.011131849139928818, "learning_rate": 4.4095159920815254e-07, "loss": 0.0064, "step": 17190 }, { "epoch": 4.396728016359918, "grad_norm": 0.01048702746629715, "learning_rate": 4.3729498036260144e-07, "loss": 0.0001, "step": 17200 }, { "epoch": 4.399284253578732, "grad_norm": 0.005339341703802347, "learning_rate": 4.336528926441924e-07, "loss": 0.0013, "step": 17210 }, { "epoch": 4.401840490797546, "grad_norm": 0.005323043093085289, "learning_rate": 4.300253476521077e-07, "loss": 0.0006, "step": 17220 }, { "epoch": 4.40439672801636, "grad_norm": 4.231276512145996, "learning_rate": 4.2641235693921257e-07, "loss": 0.0021, "step": 17230 }, { "epoch": 4.406952965235174, "grad_norm": 0.007030295208096504, "learning_rate": 4.228139320120211e-07, "loss": 0.0004, "step": 17240 }, { "epoch": 4.409509202453988, "grad_norm": 0.0034739875700324774, "learning_rate": 4.1923008433065627e-07, "loss": 0.0014, "step": 17250 }, { "epoch": 4.412065439672801, "grad_norm": 0.008167327381670475, "learning_rate": 4.1566082530882126e-07, "loss": 0.0004, "step": 17260 }, { "epoch": 4.414621676891615, "grad_norm": 0.008502046577632427, "learning_rate": 4.1210616631375267e-07, "loss": 0.0003, "step": 17270 }, { "epoch": 4.41717791411043, "grad_norm": 0.011800073087215424, "learning_rate": 4.085661186661921e-07, "loss": 0.0001, "step": 17280 }, { "epoch": 4.419734151329243, "grad_norm": 0.013263700529932976, "learning_rate": 4.050406936403456e-07, "loss": 0.0001, "step": 17290 }, { "epoch": 4.422290388548057, "grad_norm": 2.6378958225250244, "learning_rate": 4.015299024638536e-07, "loss": 0.001, "step": 17300 }, { "epoch": 4.424846625766871, "grad_norm": 0.012993947602808475, "learning_rate": 3.9803375631774555e-07, "loss": 0.0008, "step": 17310 }, { "epoch": 4.427402862985685, "grad_norm": 0.01075220387428999, "learning_rate": 3.945522663364154e-07, "loss": 0.0001, "step": 17320 }, { "epoch": 4.429959100204499, "grad_norm": 0.2742706537246704, "learning_rate": 3.910854436075767e-07, "loss": 0.0002, "step": 17330 }, { "epoch": 4.432515337423313, "grad_norm": 0.007065648213028908, "learning_rate": 3.876332991722348e-07, "loss": 0.0016, "step": 17340 }, { "epoch": 4.4350715746421265, "grad_norm": 0.020136894658207893, "learning_rate": 3.84195844024644e-07, "loss": 0.0005, "step": 17350 }, { "epoch": 4.43762781186094, "grad_norm": 0.027455536648631096, "learning_rate": 3.8077308911227964e-07, "loss": 0.0006, "step": 17360 }, { "epoch": 4.440184049079755, "grad_norm": 0.05177016928792, "learning_rate": 3.773650453358008e-07, "loss": 0.0005, "step": 17370 }, { "epoch": 4.4427402862985685, "grad_norm": 1.8424170017242432, "learning_rate": 3.739717235490137e-07, "loss": 0.0013, "step": 17380 }, { "epoch": 4.445296523517382, "grad_norm": 0.1712723970413208, "learning_rate": 3.705931345588376e-07, "loss": 0.0003, "step": 17390 }, { "epoch": 4.447852760736196, "grad_norm": 0.09310045093297958, "learning_rate": 3.672292891252732e-07, "loss": 0.0001, "step": 17400 }, { "epoch": 4.4504089979550105, "grad_norm": 0.0639791414141655, "learning_rate": 3.6388019796136654e-07, "loss": 0.001, "step": 17410 }, { "epoch": 4.452965235173824, "grad_norm": 0.0685553252696991, "learning_rate": 3.605458717331739e-07, "loss": 0.0006, "step": 17420 }, { "epoch": 4.455521472392638, "grad_norm": 0.034271907061338425, "learning_rate": 3.5722632105972765e-07, "loss": 0.0007, "step": 17430 }, { "epoch": 4.458077709611452, "grad_norm": 0.04286443442106247, "learning_rate": 3.539215565130055e-07, "loss": 0.0007, "step": 17440 }, { "epoch": 4.460633946830266, "grad_norm": 0.02046520821750164, "learning_rate": 3.506315886178957e-07, "loss": 0.0006, "step": 17450 }, { "epoch": 4.46319018404908, "grad_norm": 0.012680341489613056, "learning_rate": 3.4735642785215963e-07, "loss": 0.0041, "step": 17460 }, { "epoch": 4.465746421267894, "grad_norm": 0.005123642738908529, "learning_rate": 3.4409608464640366e-07, "loss": 0.0006, "step": 17470 }, { "epoch": 4.468302658486707, "grad_norm": 0.003161899745464325, "learning_rate": 3.4085056938404303e-07, "loss": 0.0009, "step": 17480 }, { "epoch": 4.470858895705521, "grad_norm": 0.009017124772071838, "learning_rate": 3.376198924012708e-07, "loss": 0.001, "step": 17490 }, { "epoch": 4.473415132924336, "grad_norm": 0.04075018689036369, "learning_rate": 3.3440406398702055e-07, "loss": 0.0024, "step": 17500 }, { "epoch": 4.475971370143149, "grad_norm": 3.3212058544158936, "learning_rate": 3.3120309438293973e-07, "loss": 0.0038, "step": 17510 }, { "epoch": 4.478527607361963, "grad_norm": 0.15001584589481354, "learning_rate": 3.2801699378335274e-07, "loss": 0.0001, "step": 17520 }, { "epoch": 4.481083844580777, "grad_norm": 0.054374609142541885, "learning_rate": 3.248457723352316e-07, "loss": 0.0008, "step": 17530 }, { "epoch": 4.483640081799591, "grad_norm": 0.01903144083917141, "learning_rate": 3.2168944013815764e-07, "loss": 0.0002, "step": 17540 }, { "epoch": 4.486196319018405, "grad_norm": 0.012091502547264099, "learning_rate": 3.1854800724429703e-07, "loss": 0.0003, "step": 17550 }, { "epoch": 4.488752556237219, "grad_norm": 0.0041188085451722145, "learning_rate": 3.1542148365836465e-07, "loss": 0.0001, "step": 17560 }, { "epoch": 4.4913087934560325, "grad_norm": 0.004006090573966503, "learning_rate": 3.123098793375928e-07, "loss": 0.0055, "step": 17570 }, { "epoch": 4.493865030674846, "grad_norm": 4.543797016143799, "learning_rate": 3.092132041916979e-07, "loss": 0.0013, "step": 17580 }, { "epoch": 4.496421267893661, "grad_norm": 0.006038820371031761, "learning_rate": 3.06131468082852e-07, "loss": 0.0002, "step": 17590 }, { "epoch": 4.4989775051124745, "grad_norm": 0.06375231593847275, "learning_rate": 3.0306468082564933e-07, "loss": 0.0013, "step": 17600 }, { "epoch": 4.501533742331288, "grad_norm": 0.010167334228754044, "learning_rate": 3.000128521870771e-07, "loss": 0.0001, "step": 17610 }, { "epoch": 4.504089979550102, "grad_norm": 0.3248971700668335, "learning_rate": 2.969759918864784e-07, "loss": 0.0001, "step": 17620 }, { "epoch": 4.5066462167689165, "grad_norm": 0.0035574256908148527, "learning_rate": 2.939541095955334e-07, "loss": 0.0016, "step": 17630 }, { "epoch": 4.50920245398773, "grad_norm": 0.0564039871096611, "learning_rate": 2.9094721493821255e-07, "loss": 0.0009, "step": 17640 }, { "epoch": 4.511758691206544, "grad_norm": 0.0021962756291031837, "learning_rate": 2.8795531749076067e-07, "loss": 0.0008, "step": 17650 }, { "epoch": 4.514314928425358, "grad_norm": 0.00536281056702137, "learning_rate": 2.8497842678165467e-07, "loss": 0.0006, "step": 17660 }, { "epoch": 4.516871165644172, "grad_norm": 0.007071573752909899, "learning_rate": 2.8201655229158465e-07, "loss": 0.0002, "step": 17670 }, { "epoch": 4.519427402862986, "grad_norm": 6.006162643432617, "learning_rate": 2.7906970345341177e-07, "loss": 0.0027, "step": 17680 }, { "epoch": 4.5219836400818, "grad_norm": 0.013232548721134663, "learning_rate": 2.761378896521477e-07, "loss": 0.0009, "step": 17690 }, { "epoch": 4.524539877300613, "grad_norm": 0.007870076224207878, "learning_rate": 2.732211202249202e-07, "loss": 0.0004, "step": 17700 }, { "epoch": 4.527096114519427, "grad_norm": 0.001714581623673439, "learning_rate": 2.7031940446094475e-07, "loss": 0.0005, "step": 17710 }, { "epoch": 4.529652351738242, "grad_norm": 0.7307052612304688, "learning_rate": 2.674327516014924e-07, "loss": 0.0004, "step": 17720 }, { "epoch": 4.532208588957055, "grad_norm": 0.15873846411705017, "learning_rate": 2.6456117083986487e-07, "loss": 0.0001, "step": 17730 }, { "epoch": 4.534764826175869, "grad_norm": 0.02356737293303013, "learning_rate": 2.617046713213617e-07, "loss": 0.0005, "step": 17740 }, { "epoch": 4.537321063394683, "grad_norm": 0.005058961920440197, "learning_rate": 2.5886326214325297e-07, "loss": 0.0061, "step": 17750 }, { "epoch": 4.539877300613497, "grad_norm": 0.007151829544454813, "learning_rate": 2.560369523547485e-07, "loss": 0.0011, "step": 17760 }, { "epoch": 4.542433537832311, "grad_norm": 0.008786034770309925, "learning_rate": 2.5322575095697077e-07, "loss": 0.0012, "step": 17770 }, { "epoch": 4.544989775051125, "grad_norm": 0.01927710324525833, "learning_rate": 2.50429666902926e-07, "loss": 0.0001, "step": 17780 }, { "epoch": 4.5475460122699385, "grad_norm": 0.008998622186481953, "learning_rate": 2.476487090974755e-07, "loss": 0.0002, "step": 17790 }, { "epoch": 4.550102249488752, "grad_norm": 0.007529801689088345, "learning_rate": 2.448828863973052e-07, "loss": 0.0006, "step": 17800 }, { "epoch": 4.552658486707567, "grad_norm": 0.039857879281044006, "learning_rate": 2.4213220761090173e-07, "loss": 0.0069, "step": 17810 }, { "epoch": 4.5552147239263805, "grad_norm": 0.008589456789195538, "learning_rate": 2.3939668149852046e-07, "loss": 0.0001, "step": 17820 }, { "epoch": 4.557770961145194, "grad_norm": 0.005125945899635553, "learning_rate": 2.366763167721603e-07, "loss": 0.0003, "step": 17830 }, { "epoch": 4.560327198364008, "grad_norm": 0.004016405437141657, "learning_rate": 2.3397112209553207e-07, "loss": 0.0002, "step": 17840 }, { "epoch": 4.5628834355828225, "grad_norm": 0.08685509860515594, "learning_rate": 2.312811060840381e-07, "loss": 0.0026, "step": 17850 }, { "epoch": 4.565439672801636, "grad_norm": 0.0353722870349884, "learning_rate": 2.286062773047354e-07, "loss": 0.0001, "step": 17860 }, { "epoch": 4.56799591002045, "grad_norm": 0.021159430965781212, "learning_rate": 2.2594664427631807e-07, "loss": 0.0001, "step": 17870 }, { "epoch": 4.570552147239264, "grad_norm": 0.02464616298675537, "learning_rate": 2.2330221546908005e-07, "loss": 0.0002, "step": 17880 }, { "epoch": 4.573108384458077, "grad_norm": 0.042522210627794266, "learning_rate": 2.2067299930489838e-07, "loss": 0.0002, "step": 17890 }, { "epoch": 4.575664621676892, "grad_norm": 0.03682945668697357, "learning_rate": 2.180590041571995e-07, "loss": 0.0001, "step": 17900 }, { "epoch": 4.578220858895706, "grad_norm": 0.006650723051279783, "learning_rate": 2.15460238350933e-07, "loss": 0.0001, "step": 17910 }, { "epoch": 4.580777096114519, "grad_norm": 0.008757648058235645, "learning_rate": 2.1287671016254897e-07, "loss": 0.0002, "step": 17920 }, { "epoch": 4.583333333333333, "grad_norm": 0.17380449175834656, "learning_rate": 2.1030842781996796e-07, "loss": 0.0002, "step": 17930 }, { "epoch": 4.585889570552148, "grad_norm": 0.010563348419964314, "learning_rate": 2.0775539950255774e-07, "loss": 0.0002, "step": 17940 }, { "epoch": 4.588445807770961, "grad_norm": 0.1902349591255188, "learning_rate": 2.0521763334110324e-07, "loss": 0.0002, "step": 17950 }, { "epoch": 4.591002044989775, "grad_norm": 0.012331271544098854, "learning_rate": 2.0269513741778492e-07, "loss": 0.0002, "step": 17960 }, { "epoch": 4.593558282208589, "grad_norm": 0.03246452286839485, "learning_rate": 2.0018791976615048e-07, "loss": 0.0001, "step": 17970 }, { "epoch": 4.5961145194274025, "grad_norm": 0.03161914646625519, "learning_rate": 1.9769598837109105e-07, "loss": 0.0051, "step": 17980 }, { "epoch": 4.598670756646217, "grad_norm": 0.0297340489923954, "learning_rate": 1.9521935116881107e-07, "loss": 0.0002, "step": 17990 }, { "epoch": 4.601226993865031, "grad_norm": 3.1284525394439697, "learning_rate": 1.9275801604681232e-07, "loss": 0.0012, "step": 18000 }, { "epoch": 4.6037832310838445, "grad_norm": 0.005135530140250921, "learning_rate": 1.9031199084385833e-07, "loss": 0.0006, "step": 18010 }, { "epoch": 4.606339468302658, "grad_norm": 0.005707759875804186, "learning_rate": 1.8788128334995715e-07, "loss": 0.0001, "step": 18020 }, { "epoch": 4.608895705521473, "grad_norm": 0.008373766206204891, "learning_rate": 1.8546590130633035e-07, "loss": 0.0011, "step": 18030 }, { "epoch": 4.6114519427402865, "grad_norm": 0.013387088663876057, "learning_rate": 1.8306585240539576e-07, "loss": 0.0, "step": 18040 }, { "epoch": 4.6140081799591, "grad_norm": 0.0016770199872553349, "learning_rate": 1.8068114429073524e-07, "loss": 0.0002, "step": 18050 }, { "epoch": 4.616564417177914, "grad_norm": 0.005193398799747229, "learning_rate": 1.7831178455707533e-07, "loss": 0.0001, "step": 18060 }, { "epoch": 4.619120654396728, "grad_norm": 0.008146319538354874, "learning_rate": 1.759577807502627e-07, "loss": 0.001, "step": 18070 }, { "epoch": 4.621676891615542, "grad_norm": 0.00837623793631792, "learning_rate": 1.736191403672377e-07, "loss": 0.0028, "step": 18080 }, { "epoch": 4.624233128834356, "grad_norm": 0.0052582272328436375, "learning_rate": 1.7129587085601084e-07, "loss": 0.0031, "step": 18090 }, { "epoch": 4.62678936605317, "grad_norm": 0.004001881927251816, "learning_rate": 1.689879796156424e-07, "loss": 0.0001, "step": 18100 }, { "epoch": 4.629345603271983, "grad_norm": 0.00547180837020278, "learning_rate": 1.6669547399621567e-07, "loss": 0.0012, "step": 18110 }, { "epoch": 4.631901840490798, "grad_norm": 0.01456737145781517, "learning_rate": 1.6441836129881427e-07, "loss": 0.0002, "step": 18120 }, { "epoch": 4.634458077709612, "grad_norm": 0.02086499147117138, "learning_rate": 1.6215664877549774e-07, "loss": 0.0005, "step": 18130 }, { "epoch": 4.637014314928425, "grad_norm": 0.012769919820129871, "learning_rate": 1.5991034362928204e-07, "loss": 0.0002, "step": 18140 }, { "epoch": 4.639570552147239, "grad_norm": 0.18312056362628937, "learning_rate": 1.576794530141129e-07, "loss": 0.0016, "step": 18150 }, { "epoch": 4.642126789366053, "grad_norm": 0.0038857313338667154, "learning_rate": 1.5546398403484542e-07, "loss": 0.0028, "step": 18160 }, { "epoch": 4.644683026584867, "grad_norm": 0.011599598452448845, "learning_rate": 1.5326394374721887e-07, "loss": 0.0001, "step": 18170 }, { "epoch": 4.647239263803681, "grad_norm": 0.01293268147855997, "learning_rate": 1.5107933915783745e-07, "loss": 0.0008, "step": 18180 }, { "epoch": 4.649795501022495, "grad_norm": 0.012520798482000828, "learning_rate": 1.4891017722414525e-07, "loss": 0.0012, "step": 18190 }, { "epoch": 4.652351738241308, "grad_norm": 1.7857009172439575, "learning_rate": 1.467564648544062e-07, "loss": 0.0014, "step": 18200 }, { "epoch": 4.654907975460123, "grad_norm": 0.0051713059656322, "learning_rate": 1.4461820890767976e-07, "loss": 0.0001, "step": 18210 }, { "epoch": 4.657464212678937, "grad_norm": 0.27911797165870667, "learning_rate": 1.424954161938019e-07, "loss": 0.0002, "step": 18220 }, { "epoch": 4.66002044989775, "grad_norm": 0.030297599732875824, "learning_rate": 1.4038809347336036e-07, "loss": 0.0001, "step": 18230 }, { "epoch": 4.662576687116564, "grad_norm": 0.008832601830363274, "learning_rate": 1.38296247457676e-07, "loss": 0.0002, "step": 18240 }, { "epoch": 4.665132924335378, "grad_norm": 0.015586239285767078, "learning_rate": 1.3621988480877812e-07, "loss": 0.0001, "step": 18250 }, { "epoch": 4.6676891615541924, "grad_norm": 0.010234987363219261, "learning_rate": 1.341590121393882e-07, "loss": 0.0001, "step": 18260 }, { "epoch": 4.670245398773006, "grad_norm": 0.0027397891972213984, "learning_rate": 1.3211363601289273e-07, "loss": 0.0033, "step": 18270 }, { "epoch": 4.67280163599182, "grad_norm": 0.009892730042338371, "learning_rate": 1.3008376294332715e-07, "loss": 0.0001, "step": 18280 }, { "epoch": 4.675357873210634, "grad_norm": 0.03709021210670471, "learning_rate": 1.2806939939535358e-07, "loss": 0.0001, "step": 18290 }, { "epoch": 4.677914110429448, "grad_norm": 0.006908297538757324, "learning_rate": 1.2607055178423978e-07, "loss": 0.0001, "step": 18300 }, { "epoch": 4.680470347648262, "grad_norm": 0.009108340367674828, "learning_rate": 1.2408722647583692e-07, "loss": 0.0001, "step": 18310 }, { "epoch": 4.683026584867076, "grad_norm": 0.009007609449326992, "learning_rate": 1.221194297865641e-07, "loss": 0.0006, "step": 18320 }, { "epoch": 4.685582822085889, "grad_norm": 0.16736240684986115, "learning_rate": 1.2016716798338436e-07, "loss": 0.0004, "step": 18330 }, { "epoch": 4.688139059304703, "grad_norm": 0.015963025391101837, "learning_rate": 1.182304472837853e-07, "loss": 0.0001, "step": 18340 }, { "epoch": 4.690695296523518, "grad_norm": 2.035236358642578, "learning_rate": 1.1630927385576196e-07, "loss": 0.0034, "step": 18350 }, { "epoch": 4.693251533742331, "grad_norm": 0.017824502661824226, "learning_rate": 1.1440365381779117e-07, "loss": 0.0003, "step": 18360 }, { "epoch": 4.695807770961145, "grad_norm": 0.021831089630723, "learning_rate": 1.1251359323881994e-07, "loss": 0.0004, "step": 18370 }, { "epoch": 4.69836400817996, "grad_norm": 0.016453437507152557, "learning_rate": 1.1063909813823992e-07, "loss": 0.0002, "step": 18380 }, { "epoch": 4.700920245398773, "grad_norm": 0.0010980580700561404, "learning_rate": 1.0878017448587075e-07, "loss": 0.0007, "step": 18390 }, { "epoch": 4.703476482617587, "grad_norm": 0.015402301214635372, "learning_rate": 1.0693682820194062e-07, "loss": 0.0001, "step": 18400 }, { "epoch": 4.706032719836401, "grad_norm": 0.00750606507062912, "learning_rate": 1.0510906515706798e-07, "loss": 0.0001, "step": 18410 }, { "epoch": 4.708588957055214, "grad_norm": 0.007333151530474424, "learning_rate": 1.0329689117224262e-07, "loss": 0.0001, "step": 18420 }, { "epoch": 4.711145194274029, "grad_norm": 0.043747998774051666, "learning_rate": 1.0150031201880573e-07, "loss": 0.0017, "step": 18430 }, { "epoch": 4.713701431492843, "grad_norm": 0.038498375564813614, "learning_rate": 9.97193334184332e-08, "loss": 0.0005, "step": 18440 }, { "epoch": 4.716257668711656, "grad_norm": 0.01316425297409296, "learning_rate": 9.79539610431185e-08, "loss": 0.0001, "step": 18450 }, { "epoch": 4.71881390593047, "grad_norm": 0.014060962945222855, "learning_rate": 9.620420051514978e-08, "loss": 0.0002, "step": 18460 }, { "epoch": 4.721370143149285, "grad_norm": 0.014505197294056416, "learning_rate": 9.44700574070978e-08, "loss": 0.002, "step": 18470 }, { "epoch": 4.723926380368098, "grad_norm": 0.006656293291598558, "learning_rate": 9.275153724179475e-08, "loss": 0.001, "step": 18480 }, { "epoch": 4.726482617586912, "grad_norm": 0.009168506599962711, "learning_rate": 9.104864549231706e-08, "loss": 0.0005, "step": 18490 }, { "epoch": 4.729038854805726, "grad_norm": 0.016905134543776512, "learning_rate": 8.936138758196933e-08, "loss": 0.0001, "step": 18500 }, { "epoch": 4.7315950920245395, "grad_norm": 0.012130284681916237, "learning_rate": 8.768976888426484e-08, "loss": 0.0001, "step": 18510 }, { "epoch": 4.734151329243354, "grad_norm": 0.08311706781387329, "learning_rate": 8.603379472291118e-08, "loss": 0.0006, "step": 18520 }, { "epoch": 4.736707566462168, "grad_norm": 0.006343462970107794, "learning_rate": 8.43934703717908e-08, "loss": 0.0001, "step": 18530 }, { "epoch": 4.7392638036809815, "grad_norm": 0.0163714736700058, "learning_rate": 8.27688010549449e-08, "loss": 0.0001, "step": 18540 }, { "epoch": 4.741820040899795, "grad_norm": 0.004221266135573387, "learning_rate": 8.115979194655843e-08, "loss": 0.0006, "step": 18550 }, { "epoch": 4.74437627811861, "grad_norm": 0.21704137325286865, "learning_rate": 7.956644817094072e-08, "loss": 0.0005, "step": 18560 }, { "epoch": 4.7469325153374236, "grad_norm": 0.010948584415018559, "learning_rate": 7.798877480251321e-08, "loss": 0.0002, "step": 18570 }, { "epoch": 4.749488752556237, "grad_norm": 0.004772350657731295, "learning_rate": 7.642677686578726e-08, "loss": 0.0001, "step": 18580 }, { "epoch": 4.752044989775051, "grad_norm": 0.005361333955079317, "learning_rate": 7.488045933535582e-08, "loss": 0.0004, "step": 18590 }, { "epoch": 4.754601226993865, "grad_norm": 0.014715801924467087, "learning_rate": 7.334982713586958e-08, "loss": 0.0013, "step": 18600 }, { "epoch": 4.757157464212679, "grad_norm": 0.16467002034187317, "learning_rate": 7.183488514202863e-08, "loss": 0.0002, "step": 18610 }, { "epoch": 4.759713701431493, "grad_norm": 0.01957176998257637, "learning_rate": 7.03356381785597e-08, "loss": 0.0001, "step": 18620 }, { "epoch": 4.762269938650307, "grad_norm": 1.6649690866470337, "learning_rate": 6.885209102020896e-08, "loss": 0.0014, "step": 18630 }, { "epoch": 4.76482617586912, "grad_norm": 0.3953768014907837, "learning_rate": 6.73842483917192e-08, "loss": 0.0001, "step": 18640 }, { "epoch": 4.767382413087935, "grad_norm": 0.14624808728694916, "learning_rate": 6.593211496781881e-08, "loss": 0.0004, "step": 18650 }, { "epoch": 4.769938650306749, "grad_norm": 0.1692701280117035, "learning_rate": 6.449569537320677e-08, "loss": 0.0002, "step": 18660 }, { "epoch": 4.772494887525562, "grad_norm": 0.009253126569092274, "learning_rate": 6.307499418253705e-08, "loss": 0.0001, "step": 18670 }, { "epoch": 4.775051124744376, "grad_norm": 0.011899994686245918, "learning_rate": 6.167001592040367e-08, "loss": 0.0001, "step": 18680 }, { "epoch": 4.77760736196319, "grad_norm": 0.0060684094205498695, "learning_rate": 6.028076506132741e-08, "loss": 0.0014, "step": 18690 }, { "epoch": 4.780163599182004, "grad_norm": 0.004709139931946993, "learning_rate": 5.890724602974074e-08, "loss": 0.0001, "step": 18700 }, { "epoch": 4.782719836400818, "grad_norm": 0.01010716613382101, "learning_rate": 5.7549463199974566e-08, "loss": 0.0001, "step": 18710 }, { "epoch": 4.785276073619632, "grad_norm": 0.016905710101127625, "learning_rate": 5.6207420896242646e-08, "loss": 0.0001, "step": 18720 }, { "epoch": 4.7878323108384455, "grad_norm": 0.016229376196861267, "learning_rate": 5.488112339263052e-08, "loss": 0.0002, "step": 18730 }, { "epoch": 4.79038854805726, "grad_norm": 0.05223598703742027, "learning_rate": 5.3570574913078264e-08, "loss": 0.0006, "step": 18740 }, { "epoch": 4.792944785276074, "grad_norm": 2.0258493423461914, "learning_rate": 5.2275779631371646e-08, "loss": 0.0006, "step": 18750 }, { "epoch": 4.7955010224948875, "grad_norm": 3.3475382328033447, "learning_rate": 5.0996741671123226e-08, "loss": 0.0017, "step": 18760 }, { "epoch": 4.798057259713701, "grad_norm": 0.05064383149147034, "learning_rate": 4.97334651057646e-08, "loss": 0.0011, "step": 18770 }, { "epoch": 4.800613496932515, "grad_norm": 0.023241933435201645, "learning_rate": 4.8485953958530286e-08, "loss": 0.0001, "step": 18780 }, { "epoch": 4.8031697341513295, "grad_norm": 0.016580374911427498, "learning_rate": 4.725421220244553e-08, "loss": 0.0005, "step": 18790 }, { "epoch": 4.805725971370143, "grad_norm": 0.029072437435388565, "learning_rate": 4.603824376031407e-08, "loss": 0.0009, "step": 18800 }, { "epoch": 4.808282208588957, "grad_norm": 0.012765723280608654, "learning_rate": 4.4838052504705406e-08, "loss": 0.0001, "step": 18810 }, { "epoch": 4.810838445807771, "grad_norm": 0.10864470154047012, "learning_rate": 4.3653642257943105e-08, "loss": 0.0001, "step": 18820 }, { "epoch": 4.813394683026585, "grad_norm": 0.876848578453064, "learning_rate": 4.248501679208983e-08, "loss": 0.0015, "step": 18830 }, { "epoch": 4.815950920245399, "grad_norm": 2.7491190433502197, "learning_rate": 4.133217982894011e-08, "loss": 0.0034, "step": 18840 }, { "epoch": 4.818507157464213, "grad_norm": 0.014472966082394123, "learning_rate": 4.019513504000372e-08, "loss": 0.002, "step": 18850 }, { "epoch": 4.821063394683026, "grad_norm": 2.2643988132476807, "learning_rate": 3.907388604649842e-08, "loss": 0.0031, "step": 18860 }, { "epoch": 4.82361963190184, "grad_norm": 1.9290136098861694, "learning_rate": 3.796843641933334e-08, "loss": 0.0006, "step": 18870 }, { "epoch": 4.826175869120655, "grad_norm": 0.0031455198768526316, "learning_rate": 3.687878967910285e-08, "loss": 0.0023, "step": 18880 }, { "epoch": 4.828732106339468, "grad_norm": 0.07761465013027191, "learning_rate": 3.580494929607159e-08, "loss": 0.0017, "step": 18890 }, { "epoch": 4.831288343558282, "grad_norm": 0.007843953557312489, "learning_rate": 3.4746918690165e-08, "loss": 0.0001, "step": 18900 }, { "epoch": 4.833844580777096, "grad_norm": 0.012562461197376251, "learning_rate": 3.370470123095826e-08, "loss": 0.0, "step": 18910 }, { "epoch": 4.83640081799591, "grad_norm": 0.030938010662794113, "learning_rate": 3.267830023766516e-08, "loss": 0.0003, "step": 18920 }, { "epoch": 4.838957055214724, "grad_norm": 0.011501450091600418, "learning_rate": 3.166771897912868e-08, "loss": 0.0, "step": 18930 }, { "epoch": 4.841513292433538, "grad_norm": 0.020580632612109184, "learning_rate": 3.0672960673808205e-08, "loss": 0.0001, "step": 18940 }, { "epoch": 4.8440695296523515, "grad_norm": 0.9514909386634827, "learning_rate": 2.969402848977232e-08, "loss": 0.0004, "step": 18950 }, { "epoch": 4.846625766871165, "grad_norm": 0.01408900786191225, "learning_rate": 2.873092554468604e-08, "loss": 0.0017, "step": 18960 }, { "epoch": 4.84918200408998, "grad_norm": 0.007879039272665977, "learning_rate": 2.7783654905803036e-08, "loss": 0.0007, "step": 18970 }, { "epoch": 4.8517382413087935, "grad_norm": 0.00814911350607872, "learning_rate": 2.6852219589953986e-08, "loss": 0.0012, "step": 18980 }, { "epoch": 4.854294478527607, "grad_norm": 0.006910000462085009, "learning_rate": 2.5936622563537685e-08, "loss": 0.0, "step": 18990 }, { "epoch": 4.856850715746421, "grad_norm": 0.01161187607795, "learning_rate": 2.503686674251382e-08, "loss": 0.0002, "step": 19000 }, { "epoch": 4.8594069529652355, "grad_norm": 0.4636043906211853, "learning_rate": 2.4152954992388565e-08, "loss": 0.0005, "step": 19010 }, { "epoch": 4.861963190184049, "grad_norm": 0.003940473776310682, "learning_rate": 2.328489012821067e-08, "loss": 0.0001, "step": 19020 }, { "epoch": 4.864519427402863, "grad_norm": 0.01835138350725174, "learning_rate": 2.243267491455925e-08, "loss": 0.0018, "step": 19030 }, { "epoch": 4.867075664621677, "grad_norm": 0.003659491892904043, "learning_rate": 2.159631206553714e-08, "loss": 0.0004, "step": 19040 }, { "epoch": 4.86963190184049, "grad_norm": 0.010196760296821594, "learning_rate": 2.077580424475978e-08, "loss": 0.0007, "step": 19050 }, { "epoch": 4.872188139059305, "grad_norm": 0.40100085735321045, "learning_rate": 1.9971154065349108e-08, "loss": 0.0008, "step": 19060 }, { "epoch": 4.874744376278119, "grad_norm": 0.0034521687775850296, "learning_rate": 1.9182364089924134e-08, "loss": 0.0001, "step": 19070 }, { "epoch": 4.877300613496932, "grad_norm": 0.019500114023685455, "learning_rate": 1.8409436830593152e-08, "loss": 0.0002, "step": 19080 }, { "epoch": 4.879856850715746, "grad_norm": 0.006733261980116367, "learning_rate": 1.765237474894488e-08, "loss": 0.0046, "step": 19090 }, { "epoch": 4.882413087934561, "grad_norm": 0.02504415065050125, "learning_rate": 1.691118025604066e-08, "loss": 0.0001, "step": 19100 }, { "epoch": 4.884969325153374, "grad_norm": 0.017662404105067253, "learning_rate": 1.618585571240949e-08, "loss": 0.0006, "step": 19110 }, { "epoch": 4.887525562372188, "grad_norm": 0.005395929794758558, "learning_rate": 1.5476403428035803e-08, "loss": 0.0001, "step": 19120 }, { "epoch": 4.890081799591002, "grad_norm": 0.004282295238226652, "learning_rate": 1.478282566235667e-08, "loss": 0.0013, "step": 19130 }, { "epoch": 4.8926380368098155, "grad_norm": 0.006876455619931221, "learning_rate": 1.4105124624251843e-08, "loss": 0.0001, "step": 19140 }, { "epoch": 4.89519427402863, "grad_norm": 2.063361406326294, "learning_rate": 1.3443302472036513e-08, "loss": 0.0009, "step": 19150 }, { "epoch": 4.897750511247444, "grad_norm": 0.34553998708724976, "learning_rate": 1.279736131345799e-08, "loss": 0.0013, "step": 19160 }, { "epoch": 4.9003067484662575, "grad_norm": 0.18306072056293488, "learning_rate": 1.2167303205682934e-08, "loss": 0.001, "step": 19170 }, { "epoch": 4.902862985685072, "grad_norm": 0.007264157757163048, "learning_rate": 1.1553130155297908e-08, "loss": 0.0066, "step": 19180 }, { "epoch": 4.905419222903886, "grad_norm": 0.006425623781979084, "learning_rate": 1.0954844118296614e-08, "loss": 0.0032, "step": 19190 }, { "epoch": 4.9079754601226995, "grad_norm": 0.014158538542687893, "learning_rate": 1.0372447000077113e-08, "loss": 0.0001, "step": 19200 }, { "epoch": 4.910531697341513, "grad_norm": 0.011595489457249641, "learning_rate": 9.805940655436274e-09, "loss": 0.0003, "step": 19210 }, { "epoch": 4.913087934560327, "grad_norm": 3.0229008197784424, "learning_rate": 9.2553268885609e-09, "loss": 0.0012, "step": 19220 }, { "epoch": 4.9156441717791415, "grad_norm": 0.048986513167619705, "learning_rate": 8.720607453024388e-09, "loss": 0.0009, "step": 19230 }, { "epoch": 4.918200408997955, "grad_norm": 0.010676774196326733, "learning_rate": 8.20178405178118e-09, "loss": 0.0008, "step": 19240 }, { "epoch": 4.920756646216769, "grad_norm": 0.006409882567822933, "learning_rate": 7.698858337159553e-09, "loss": 0.0005, "step": 19250 }, { "epoch": 4.923312883435583, "grad_norm": 0.019153757020831108, "learning_rate": 7.2118319108582805e-09, "loss": 0.0002, "step": 19260 }, { "epoch": 4.925869120654397, "grad_norm": 0.15216705203056335, "learning_rate": 6.7407063239405264e-09, "loss": 0.0005, "step": 19270 }, { "epoch": 4.928425357873211, "grad_norm": 0.005260101519525051, "learning_rate": 6.285483076828858e-09, "loss": 0.0002, "step": 19280 }, { "epoch": 4.930981595092025, "grad_norm": 0.012515813112258911, "learning_rate": 5.846163619300238e-09, "loss": 0.0006, "step": 19290 }, { "epoch": 4.933537832310838, "grad_norm": 0.2751409411430359, "learning_rate": 5.422749350482148e-09, "loss": 0.0025, "step": 19300 }, { "epoch": 4.936094069529652, "grad_norm": 0.006292239762842655, "learning_rate": 5.015241618849254e-09, "loss": 0.0017, "step": 19310 }, { "epoch": 4.938650306748467, "grad_norm": 0.006389266811311245, "learning_rate": 4.623641722215077e-09, "loss": 0.0003, "step": 19320 }, { "epoch": 4.94120654396728, "grad_norm": 0.00943728256970644, "learning_rate": 4.247950907733112e-09, "loss": 0.0002, "step": 19330 }, { "epoch": 4.943762781186094, "grad_norm": 0.10176153481006622, "learning_rate": 3.888170371887934e-09, "loss": 0.0009, "step": 19340 }, { "epoch": 4.946319018404908, "grad_norm": 0.5887343287467957, "learning_rate": 3.5443012604957638e-09, "loss": 0.0003, "step": 19350 }, { "epoch": 4.948875255623722, "grad_norm": 0.013615554198622704, "learning_rate": 3.2163446686966913e-09, "loss": 0.0013, "step": 19360 }, { "epoch": 4.951431492842536, "grad_norm": 2.769341230392456, "learning_rate": 2.9043016409552317e-09, "loss": 0.0029, "step": 19370 }, { "epoch": 4.95398773006135, "grad_norm": 0.010591404512524605, "learning_rate": 2.6081731710531076e-09, "loss": 0.0022, "step": 19380 }, { "epoch": 4.956543967280163, "grad_norm": 0.02332007698714733, "learning_rate": 2.3279602020892522e-09, "loss": 0.0001, "step": 19390 }, { "epoch": 4.959100204498977, "grad_norm": 0.5851395130157471, "learning_rate": 2.06366362647481e-09, "loss": 0.0002, "step": 19400 }, { "epoch": 4.961656441717792, "grad_norm": 0.0056032175198197365, "learning_rate": 1.8152842859320286e-09, "loss": 0.0001, "step": 19410 }, { "epoch": 4.9642126789366054, "grad_norm": 0.011081293225288391, "learning_rate": 1.5828229714892619e-09, "loss": 0.0001, "step": 19420 }, { "epoch": 4.966768916155419, "grad_norm": 0.00719296932220459, "learning_rate": 1.366280423480415e-09, "loss": 0.0012, "step": 19430 }, { "epoch": 4.969325153374233, "grad_norm": 0.0061414423398673534, "learning_rate": 1.1656573315421693e-09, "loss": 0.0002, "step": 19440 }, { "epoch": 4.9718813905930475, "grad_norm": 0.006891654338687658, "learning_rate": 9.80954334611206e-10, "loss": 0.0003, "step": 19450 }, { "epoch": 4.974437627811861, "grad_norm": 0.35890260338783264, "learning_rate": 8.121720209219864e-10, "loss": 0.0025, "step": 19460 }, { "epoch": 4.976993865030675, "grad_norm": 0.010992944240570068, "learning_rate": 6.59310928006196e-10, "loss": 0.0001, "step": 19470 }, { "epoch": 4.979550102249489, "grad_norm": 0.010081345215439796, "learning_rate": 5.2237154268997e-10, "loss": 0.0003, "step": 19480 }, { "epoch": 4.982106339468302, "grad_norm": 0.027087528258562088, "learning_rate": 4.013543010927823e-10, "loss": 0.0005, "step": 19490 }, { "epoch": 4.984662576687117, "grad_norm": 0.005823803599923849, "learning_rate": 2.962595886257802e-10, "loss": 0.0006, "step": 19500 }, { "epoch": 4.987218813905931, "grad_norm": 0.10786204785108566, "learning_rate": 2.0708773999011945e-10, "loss": 0.0003, "step": 19510 }, { "epoch": 4.989775051124744, "grad_norm": 0.31824302673339844, "learning_rate": 1.3383903917696394e-10, "loss": 0.0009, "step": 19520 }, { "epoch": 4.992331288343558, "grad_norm": 0.01443443726748228, "learning_rate": 7.651371946637565e-11, "loss": 0.0014, "step": 19530 }, { "epoch": 4.994887525562373, "grad_norm": 0.014061720110476017, "learning_rate": 3.511196342509404e-11, "loss": 0.0015, "step": 19540 }, { "epoch": 4.997443762781186, "grad_norm": 0.10343813896179199, "learning_rate": 9.633902908201542e-12, "loss": 0.0003, "step": 19550 }, { "epoch": 5.0, "grad_norm": 0.009711273945868015, "learning_rate": 7.961905745812459e-14, "loss": 0.0002, "step": 19560 }, { "epoch": 5.0, "step": 19560, "total_flos": 1.975874722701312e+17, "train_loss": 0.2911138574734241, "train_runtime": 11035.9124, "train_samples_per_second": 1.772, "train_steps_per_second": 1.772 } ], "logging_steps": 10, "max_steps": 19560, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.975874722701312e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }