diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,13735 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 5.0, + "eval_steps": 500, + "global_step": 19560, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.002556237218813906, + "grad_norm": 86.6442642211914, + "learning_rate": 4.601226993865031e-08, + "loss": 4.7575, + "step": 10 + }, + { + "epoch": 0.005112474437627812, + "grad_norm": 90.80802154541016, + "learning_rate": 9.713701431492844e-08, + "loss": 5.2498, + "step": 20 + }, + { + "epoch": 0.007668711656441718, + "grad_norm": 79.19834899902344, + "learning_rate": 1.4826175869120655e-07, + "loss": 4.8172, + "step": 30 + }, + { + "epoch": 0.010224948875255624, + "grad_norm": 93.67054748535156, + "learning_rate": 1.9938650306748468e-07, + "loss": 4.7357, + "step": 40 + }, + { + "epoch": 0.01278118609406953, + "grad_norm": 84.51275634765625, + "learning_rate": 2.505112474437628e-07, + "loss": 4.5906, + "step": 50 + }, + { + "epoch": 0.015337423312883436, + "grad_norm": 160.59327697753906, + "learning_rate": 3.0163599182004093e-07, + "loss": 4.4981, + "step": 60 + }, + { + "epoch": 0.01789366053169734, + "grad_norm": 54.63095474243164, + "learning_rate": 3.52760736196319e-07, + "loss": 3.8254, + "step": 70 + }, + { + "epoch": 0.02044989775051125, + "grad_norm": 56.51041793823242, + "learning_rate": 4.038854805725972e-07, + "loss": 3.2408, + "step": 80 + }, + { + "epoch": 0.023006134969325152, + "grad_norm": 54.57204818725586, + "learning_rate": 4.5501022494887533e-07, + "loss": 3.1371, + "step": 90 + }, + { + "epoch": 0.02556237218813906, + "grad_norm": 17.450857162475586, + "learning_rate": 5.061349693251534e-07, + "loss": 3.0649, + "step": 100 + }, + { + "epoch": 0.028118609406952964, + "grad_norm": 14.947675704956055, + "learning_rate": 5.572597137014316e-07, + "loss": 2.6627, + "step": 110 + }, + { + "epoch": 0.03067484662576687, + "grad_norm": 21.680395126342773, + "learning_rate": 6.083844580777097e-07, + "loss": 2.6963, + "step": 120 + }, + { + "epoch": 0.033231083844580775, + "grad_norm": 14.042488098144531, + "learning_rate": 6.595092024539878e-07, + "loss": 2.7294, + "step": 130 + }, + { + "epoch": 0.03578732106339468, + "grad_norm": 15.667428016662598, + "learning_rate": 7.106339468302658e-07, + "loss": 2.688, + "step": 140 + }, + { + "epoch": 0.03834355828220859, + "grad_norm": 15.774909973144531, + "learning_rate": 7.61758691206544e-07, + "loss": 2.5642, + "step": 150 + }, + { + "epoch": 0.0408997955010225, + "grad_norm": 17.561765670776367, + "learning_rate": 8.128834355828222e-07, + "loss": 2.621, + "step": 160 + }, + { + "epoch": 0.0434560327198364, + "grad_norm": 22.855037689208984, + "learning_rate": 8.640081799591003e-07, + "loss": 2.5726, + "step": 170 + }, + { + "epoch": 0.046012269938650305, + "grad_norm": 21.334442138671875, + "learning_rate": 9.151329243353784e-07, + "loss": 2.5354, + "step": 180 + }, + { + "epoch": 0.04856850715746421, + "grad_norm": 16.87196159362793, + "learning_rate": 9.662576687116565e-07, + "loss": 2.5895, + "step": 190 + }, + { + "epoch": 0.05112474437627812, + "grad_norm": 13.189834594726562, + "learning_rate": 1.0173824130879346e-06, + "loss": 2.5198, + "step": 200 + }, + { + "epoch": 0.05368098159509203, + "grad_norm": 32.269676208496094, + "learning_rate": 1.0685071574642128e-06, + "loss": 2.4075, + "step": 210 + }, + { + "epoch": 0.05623721881390593, + "grad_norm": 35.43799591064453, + "learning_rate": 1.119631901840491e-06, + "loss": 2.3543, + "step": 220 + }, + { + "epoch": 0.058793456032719835, + "grad_norm": 15.105649948120117, + "learning_rate": 1.170756646216769e-06, + "loss": 2.4616, + "step": 230 + }, + { + "epoch": 0.06134969325153374, + "grad_norm": 19.825071334838867, + "learning_rate": 1.221881390593047e-06, + "loss": 2.3646, + "step": 240 + }, + { + "epoch": 0.06390593047034765, + "grad_norm": 14.074588775634766, + "learning_rate": 1.2730061349693252e-06, + "loss": 2.1941, + "step": 250 + }, + { + "epoch": 0.06646216768916155, + "grad_norm": 21.8532772064209, + "learning_rate": 1.3241308793456035e-06, + "loss": 2.2606, + "step": 260 + }, + { + "epoch": 0.06901840490797546, + "grad_norm": 22.797710418701172, + "learning_rate": 1.3752556237218813e-06, + "loss": 2.3052, + "step": 270 + }, + { + "epoch": 0.07157464212678936, + "grad_norm": 19.603389739990234, + "learning_rate": 1.4263803680981596e-06, + "loss": 2.1969, + "step": 280 + }, + { + "epoch": 0.07413087934560328, + "grad_norm": 17.005781173706055, + "learning_rate": 1.4775051124744377e-06, + "loss": 2.3629, + "step": 290 + }, + { + "epoch": 0.07668711656441718, + "grad_norm": 15.876945495605469, + "learning_rate": 1.5286298568507158e-06, + "loss": 2.5044, + "step": 300 + }, + { + "epoch": 0.07924335378323108, + "grad_norm": 17.874849319458008, + "learning_rate": 1.579754601226994e-06, + "loss": 2.145, + "step": 310 + }, + { + "epoch": 0.081799591002045, + "grad_norm": 22.35564422607422, + "learning_rate": 1.630879345603272e-06, + "loss": 2.1845, + "step": 320 + }, + { + "epoch": 0.0843558282208589, + "grad_norm": 16.786012649536133, + "learning_rate": 1.6820040899795503e-06, + "loss": 2.1084, + "step": 330 + }, + { + "epoch": 0.0869120654396728, + "grad_norm": 28.834739685058594, + "learning_rate": 1.7331288343558283e-06, + "loss": 2.2875, + "step": 340 + }, + { + "epoch": 0.08946830265848671, + "grad_norm": 14.567386627197266, + "learning_rate": 1.7842535787321064e-06, + "loss": 2.3375, + "step": 350 + }, + { + "epoch": 0.09202453987730061, + "grad_norm": 31.9110164642334, + "learning_rate": 1.8353783231083847e-06, + "loss": 2.1919, + "step": 360 + }, + { + "epoch": 0.09458077709611452, + "grad_norm": 14.252608299255371, + "learning_rate": 1.8865030674846626e-06, + "loss": 1.9781, + "step": 370 + }, + { + "epoch": 0.09713701431492842, + "grad_norm": 24.873756408691406, + "learning_rate": 1.937627811860941e-06, + "loss": 2.3583, + "step": 380 + }, + { + "epoch": 0.09969325153374232, + "grad_norm": 28.036256790161133, + "learning_rate": 1.988752556237219e-06, + "loss": 2.136, + "step": 390 + }, + { + "epoch": 0.10224948875255624, + "grad_norm": 31.46503257751465, + "learning_rate": 2.039877300613497e-06, + "loss": 2.206, + "step": 400 + }, + { + "epoch": 0.10480572597137014, + "grad_norm": 26.446863174438477, + "learning_rate": 2.091002044989775e-06, + "loss": 2.1021, + "step": 410 + }, + { + "epoch": 0.10736196319018405, + "grad_norm": 14.209206581115723, + "learning_rate": 2.142126789366053e-06, + "loss": 2.058, + "step": 420 + }, + { + "epoch": 0.10991820040899795, + "grad_norm": 16.250686645507812, + "learning_rate": 2.1932515337423317e-06, + "loss": 2.1551, + "step": 430 + }, + { + "epoch": 0.11247443762781185, + "grad_norm": 14.691299438476562, + "learning_rate": 2.24437627811861e-06, + "loss": 1.9728, + "step": 440 + }, + { + "epoch": 0.11503067484662577, + "grad_norm": 15.899717330932617, + "learning_rate": 2.2955010224948875e-06, + "loss": 1.9192, + "step": 450 + }, + { + "epoch": 0.11758691206543967, + "grad_norm": 13.063642501831055, + "learning_rate": 2.346625766871166e-06, + "loss": 2.1218, + "step": 460 + }, + { + "epoch": 0.12014314928425358, + "grad_norm": 13.922229766845703, + "learning_rate": 2.397750511247444e-06, + "loss": 1.8393, + "step": 470 + }, + { + "epoch": 0.12269938650306748, + "grad_norm": 20.130887985229492, + "learning_rate": 2.448875255623722e-06, + "loss": 2.0518, + "step": 480 + }, + { + "epoch": 0.1252556237218814, + "grad_norm": 20.779491424560547, + "learning_rate": 2.5e-06, + "loss": 2.0944, + "step": 490 + }, + { + "epoch": 0.1278118609406953, + "grad_norm": 16.71235466003418, + "learning_rate": 2.5511247443762783e-06, + "loss": 1.9393, + "step": 500 + }, + { + "epoch": 0.1303680981595092, + "grad_norm": 29.568498611450195, + "learning_rate": 2.6022494887525564e-06, + "loss": 2.05, + "step": 510 + }, + { + "epoch": 0.1329243353783231, + "grad_norm": 13.172797203063965, + "learning_rate": 2.653374233128835e-06, + "loss": 2.1483, + "step": 520 + }, + { + "epoch": 0.13548057259713703, + "grad_norm": 19.142757415771484, + "learning_rate": 2.704498977505113e-06, + "loss": 1.804, + "step": 530 + }, + { + "epoch": 0.13803680981595093, + "grad_norm": 11.22604751586914, + "learning_rate": 2.7556237218813906e-06, + "loss": 1.8299, + "step": 540 + }, + { + "epoch": 0.14059304703476483, + "grad_norm": 18.316205978393555, + "learning_rate": 2.8067484662576687e-06, + "loss": 2.0066, + "step": 550 + }, + { + "epoch": 0.14314928425357873, + "grad_norm": 31.03098487854004, + "learning_rate": 2.8578732106339468e-06, + "loss": 2.1166, + "step": 560 + }, + { + "epoch": 0.14570552147239263, + "grad_norm": 13.343217849731445, + "learning_rate": 2.9089979550102253e-06, + "loss": 1.9187, + "step": 570 + }, + { + "epoch": 0.14826175869120656, + "grad_norm": 16.278240203857422, + "learning_rate": 2.9601226993865034e-06, + "loss": 1.8516, + "step": 580 + }, + { + "epoch": 0.15081799591002046, + "grad_norm": 14.405373573303223, + "learning_rate": 3.0112474437627814e-06, + "loss": 1.8636, + "step": 590 + }, + { + "epoch": 0.15337423312883436, + "grad_norm": 28.38698387145996, + "learning_rate": 3.0623721881390595e-06, + "loss": 1.897, + "step": 600 + }, + { + "epoch": 0.15593047034764826, + "grad_norm": 15.13376522064209, + "learning_rate": 3.1134969325153376e-06, + "loss": 1.7026, + "step": 610 + }, + { + "epoch": 0.15848670756646216, + "grad_norm": 21.627859115600586, + "learning_rate": 3.164621676891616e-06, + "loss": 1.7341, + "step": 620 + }, + { + "epoch": 0.16104294478527606, + "grad_norm": 34.46842956542969, + "learning_rate": 3.215746421267894e-06, + "loss": 1.668, + "step": 630 + }, + { + "epoch": 0.16359918200409, + "grad_norm": 15.768085479736328, + "learning_rate": 3.266871165644172e-06, + "loss": 1.5321, + "step": 640 + }, + { + "epoch": 0.1661554192229039, + "grad_norm": 15.477701187133789, + "learning_rate": 3.31799591002045e-06, + "loss": 1.7124, + "step": 650 + }, + { + "epoch": 0.1687116564417178, + "grad_norm": 14.961771011352539, + "learning_rate": 3.369120654396728e-06, + "loss": 1.7718, + "step": 660 + }, + { + "epoch": 0.1712678936605317, + "grad_norm": 16.937185287475586, + "learning_rate": 3.4202453987730065e-06, + "loss": 1.7621, + "step": 670 + }, + { + "epoch": 0.1738241308793456, + "grad_norm": 13.276283264160156, + "learning_rate": 3.4713701431492846e-06, + "loss": 1.8156, + "step": 680 + }, + { + "epoch": 0.17638036809815952, + "grad_norm": 20.783721923828125, + "learning_rate": 3.5224948875255627e-06, + "loss": 1.8182, + "step": 690 + }, + { + "epoch": 0.17893660531697342, + "grad_norm": 15.780200004577637, + "learning_rate": 3.5736196319018408e-06, + "loss": 1.7411, + "step": 700 + }, + { + "epoch": 0.18149284253578732, + "grad_norm": 14.318155288696289, + "learning_rate": 3.624744376278119e-06, + "loss": 1.8282, + "step": 710 + }, + { + "epoch": 0.18404907975460122, + "grad_norm": 17.099267959594727, + "learning_rate": 3.6758691206543974e-06, + "loss": 1.8697, + "step": 720 + }, + { + "epoch": 0.18660531697341512, + "grad_norm": 24.751697540283203, + "learning_rate": 3.7269938650306754e-06, + "loss": 1.585, + "step": 730 + }, + { + "epoch": 0.18916155419222905, + "grad_norm": 14.778061866760254, + "learning_rate": 3.778118609406953e-06, + "loss": 1.5203, + "step": 740 + }, + { + "epoch": 0.19171779141104295, + "grad_norm": 11.545260429382324, + "learning_rate": 3.829243353783232e-06, + "loss": 1.85, + "step": 750 + }, + { + "epoch": 0.19427402862985685, + "grad_norm": 15.764128684997559, + "learning_rate": 3.880368098159509e-06, + "loss": 1.7218, + "step": 760 + }, + { + "epoch": 0.19683026584867075, + "grad_norm": 19.65780258178711, + "learning_rate": 3.931492842535788e-06, + "loss": 1.7144, + "step": 770 + }, + { + "epoch": 0.19938650306748465, + "grad_norm": 11.911704063415527, + "learning_rate": 3.982617586912066e-06, + "loss": 1.4344, + "step": 780 + }, + { + "epoch": 0.20194274028629858, + "grad_norm": 28.547578811645508, + "learning_rate": 4.033742331288344e-06, + "loss": 1.6595, + "step": 790 + }, + { + "epoch": 0.20449897750511248, + "grad_norm": 21.17897605895996, + "learning_rate": 4.084867075664622e-06, + "loss": 1.5372, + "step": 800 + }, + { + "epoch": 0.20705521472392638, + "grad_norm": 32.14210510253906, + "learning_rate": 4.1359918200409e-06, + "loss": 1.7575, + "step": 810 + }, + { + "epoch": 0.20961145194274028, + "grad_norm": 16.999343872070312, + "learning_rate": 4.187116564417179e-06, + "loss": 1.4743, + "step": 820 + }, + { + "epoch": 0.21216768916155418, + "grad_norm": 16.725637435913086, + "learning_rate": 4.238241308793456e-06, + "loss": 1.7326, + "step": 830 + }, + { + "epoch": 0.2147239263803681, + "grad_norm": 17.674165725708008, + "learning_rate": 4.289366053169735e-06, + "loss": 1.7185, + "step": 840 + }, + { + "epoch": 0.217280163599182, + "grad_norm": 14.320542335510254, + "learning_rate": 4.3404907975460124e-06, + "loss": 1.7974, + "step": 850 + }, + { + "epoch": 0.2198364008179959, + "grad_norm": 15.723745346069336, + "learning_rate": 4.391615541922291e-06, + "loss": 1.475, + "step": 860 + }, + { + "epoch": 0.2223926380368098, + "grad_norm": 17.50130844116211, + "learning_rate": 4.4427402862985694e-06, + "loss": 1.5188, + "step": 870 + }, + { + "epoch": 0.2249488752556237, + "grad_norm": 20.599023818969727, + "learning_rate": 4.493865030674847e-06, + "loss": 1.3638, + "step": 880 + }, + { + "epoch": 0.22750511247443764, + "grad_norm": 20.347301483154297, + "learning_rate": 4.544989775051125e-06, + "loss": 1.6373, + "step": 890 + }, + { + "epoch": 0.23006134969325154, + "grad_norm": 16.79341697692871, + "learning_rate": 4.596114519427403e-06, + "loss": 1.6475, + "step": 900 + }, + { + "epoch": 0.23261758691206544, + "grad_norm": 38.66135787963867, + "learning_rate": 4.647239263803681e-06, + "loss": 1.4448, + "step": 910 + }, + { + "epoch": 0.23517382413087934, + "grad_norm": 39.84195327758789, + "learning_rate": 4.6983640081799594e-06, + "loss": 1.5184, + "step": 920 + }, + { + "epoch": 0.23773006134969324, + "grad_norm": 13.964972496032715, + "learning_rate": 4.749488752556238e-06, + "loss": 1.506, + "step": 930 + }, + { + "epoch": 0.24028629856850717, + "grad_norm": 41.46586608886719, + "learning_rate": 4.800613496932516e-06, + "loss": 1.552, + "step": 940 + }, + { + "epoch": 0.24284253578732107, + "grad_norm": 14.640779495239258, + "learning_rate": 4.851738241308794e-06, + "loss": 1.3879, + "step": 950 + }, + { + "epoch": 0.24539877300613497, + "grad_norm": 15.459932327270508, + "learning_rate": 4.902862985685072e-06, + "loss": 1.3674, + "step": 960 + }, + { + "epoch": 0.24795501022494887, + "grad_norm": 15.347355842590332, + "learning_rate": 4.95398773006135e-06, + "loss": 1.1746, + "step": 970 + }, + { + "epoch": 0.2505112474437628, + "grad_norm": 28.36712074279785, + "learning_rate": 5.005112474437628e-06, + "loss": 1.5726, + "step": 980 + }, + { + "epoch": 0.25306748466257667, + "grad_norm": 19.360950469970703, + "learning_rate": 5.0562372188139064e-06, + "loss": 1.6336, + "step": 990 + }, + { + "epoch": 0.2556237218813906, + "grad_norm": 14.269163131713867, + "learning_rate": 5.107361963190185e-06, + "loss": 1.4811, + "step": 1000 + }, + { + "epoch": 0.2581799591002045, + "grad_norm": 13.799911499023438, + "learning_rate": 5.158486707566463e-06, + "loss": 1.5457, + "step": 1010 + }, + { + "epoch": 0.2607361963190184, + "grad_norm": 14.226543426513672, + "learning_rate": 5.209611451942741e-06, + "loss": 1.292, + "step": 1020 + }, + { + "epoch": 0.2632924335378323, + "grad_norm": 17.43229103088379, + "learning_rate": 5.260736196319019e-06, + "loss": 1.4887, + "step": 1030 + }, + { + "epoch": 0.2658486707566462, + "grad_norm": 14.904557228088379, + "learning_rate": 5.311860940695297e-06, + "loss": 1.4825, + "step": 1040 + }, + { + "epoch": 0.2684049079754601, + "grad_norm": 14.981825828552246, + "learning_rate": 5.362985685071576e-06, + "loss": 1.3343, + "step": 1050 + }, + { + "epoch": 0.27096114519427406, + "grad_norm": 17.142181396484375, + "learning_rate": 5.4141104294478534e-06, + "loss": 1.5909, + "step": 1060 + }, + { + "epoch": 0.27351738241308793, + "grad_norm": 25.113466262817383, + "learning_rate": 5.465235173824132e-06, + "loss": 1.4104, + "step": 1070 + }, + { + "epoch": 0.27607361963190186, + "grad_norm": 15.018096923828125, + "learning_rate": 5.516359918200409e-06, + "loss": 1.6476, + "step": 1080 + }, + { + "epoch": 0.27862985685071573, + "grad_norm": 20.576635360717773, + "learning_rate": 5.567484662576687e-06, + "loss": 1.402, + "step": 1090 + }, + { + "epoch": 0.28118609406952966, + "grad_norm": 22.463077545166016, + "learning_rate": 5.618609406952967e-06, + "loss": 1.4635, + "step": 1100 + }, + { + "epoch": 0.2837423312883436, + "grad_norm": 24.64898109436035, + "learning_rate": 5.669734151329243e-06, + "loss": 1.4447, + "step": 1110 + }, + { + "epoch": 0.28629856850715746, + "grad_norm": 13.527003288269043, + "learning_rate": 5.720858895705522e-06, + "loss": 1.3098, + "step": 1120 + }, + { + "epoch": 0.2888548057259714, + "grad_norm": 28.426868438720703, + "learning_rate": 5.7719836400817996e-06, + "loss": 1.2572, + "step": 1130 + }, + { + "epoch": 0.29141104294478526, + "grad_norm": 17.067176818847656, + "learning_rate": 5.823108384458078e-06, + "loss": 1.1472, + "step": 1140 + }, + { + "epoch": 0.2939672801635992, + "grad_norm": 13.91565990447998, + "learning_rate": 5.874233128834357e-06, + "loss": 1.4355, + "step": 1150 + }, + { + "epoch": 0.2965235173824131, + "grad_norm": 19.00754165649414, + "learning_rate": 5.925357873210634e-06, + "loss": 1.4044, + "step": 1160 + }, + { + "epoch": 0.299079754601227, + "grad_norm": 23.37032127380371, + "learning_rate": 5.976482617586913e-06, + "loss": 1.0234, + "step": 1170 + }, + { + "epoch": 0.3016359918200409, + "grad_norm": 16.511402130126953, + "learning_rate": 6.02760736196319e-06, + "loss": 1.3716, + "step": 1180 + }, + { + "epoch": 0.3041922290388548, + "grad_norm": 27.51507568359375, + "learning_rate": 6.078732106339469e-06, + "loss": 1.19, + "step": 1190 + }, + { + "epoch": 0.3067484662576687, + "grad_norm": 9.414546012878418, + "learning_rate": 6.129856850715747e-06, + "loss": 1.1535, + "step": 1200 + }, + { + "epoch": 0.30930470347648265, + "grad_norm": 11.353070259094238, + "learning_rate": 6.180981595092025e-06, + "loss": 1.2848, + "step": 1210 + }, + { + "epoch": 0.3118609406952965, + "grad_norm": 13.55284595489502, + "learning_rate": 6.232106339468304e-06, + "loss": 1.1883, + "step": 1220 + }, + { + "epoch": 0.31441717791411045, + "grad_norm": 22.225780487060547, + "learning_rate": 6.283231083844581e-06, + "loss": 1.5084, + "step": 1230 + }, + { + "epoch": 0.3169734151329243, + "grad_norm": 31.59255027770996, + "learning_rate": 6.33435582822086e-06, + "loss": 1.4056, + "step": 1240 + }, + { + "epoch": 0.31952965235173825, + "grad_norm": 37.905452728271484, + "learning_rate": 6.385480572597138e-06, + "loss": 1.5661, + "step": 1250 + }, + { + "epoch": 0.3220858895705521, + "grad_norm": 18.418148040771484, + "learning_rate": 6.436605316973416e-06, + "loss": 1.4067, + "step": 1260 + }, + { + "epoch": 0.32464212678936605, + "grad_norm": 14.485339164733887, + "learning_rate": 6.487730061349694e-06, + "loss": 1.0165, + "step": 1270 + }, + { + "epoch": 0.32719836400818, + "grad_norm": 30.809120178222656, + "learning_rate": 6.538854805725971e-06, + "loss": 1.2704, + "step": 1280 + }, + { + "epoch": 0.32975460122699385, + "grad_norm": 19.595361709594727, + "learning_rate": 6.58997955010225e-06, + "loss": 1.2991, + "step": 1290 + }, + { + "epoch": 0.3323108384458078, + "grad_norm": 10.220748901367188, + "learning_rate": 6.641104294478529e-06, + "loss": 1.1867, + "step": 1300 + }, + { + "epoch": 0.33486707566462165, + "grad_norm": 12.308537483215332, + "learning_rate": 6.692229038854806e-06, + "loss": 1.2807, + "step": 1310 + }, + { + "epoch": 0.3374233128834356, + "grad_norm": 21.010303497314453, + "learning_rate": 6.743353783231084e-06, + "loss": 1.1327, + "step": 1320 + }, + { + "epoch": 0.3399795501022495, + "grad_norm": 13.279821395874023, + "learning_rate": 6.794478527607362e-06, + "loss": 1.113, + "step": 1330 + }, + { + "epoch": 0.3425357873210634, + "grad_norm": 16.616683959960938, + "learning_rate": 6.8456032719836406e-06, + "loss": 1.2778, + "step": 1340 + }, + { + "epoch": 0.3450920245398773, + "grad_norm": 14.577863693237305, + "learning_rate": 6.896728016359919e-06, + "loss": 1.1638, + "step": 1350 + }, + { + "epoch": 0.3476482617586912, + "grad_norm": 19.595500946044922, + "learning_rate": 6.947852760736197e-06, + "loss": 1.374, + "step": 1360 + }, + { + "epoch": 0.3502044989775051, + "grad_norm": 19.486886978149414, + "learning_rate": 6.998977505112475e-06, + "loss": 1.2494, + "step": 1370 + }, + { + "epoch": 0.35276073619631904, + "grad_norm": 15.864728927612305, + "learning_rate": 7.050102249488753e-06, + "loss": 1.2167, + "step": 1380 + }, + { + "epoch": 0.3553169734151329, + "grad_norm": 11.7051362991333, + "learning_rate": 7.101226993865031e-06, + "loss": 1.2047, + "step": 1390 + }, + { + "epoch": 0.35787321063394684, + "grad_norm": 27.072895050048828, + "learning_rate": 7.15235173824131e-06, + "loss": 1.1739, + "step": 1400 + }, + { + "epoch": 0.3604294478527607, + "grad_norm": 13.395477294921875, + "learning_rate": 7.2034764826175876e-06, + "loss": 1.4076, + "step": 1410 + }, + { + "epoch": 0.36298568507157464, + "grad_norm": 11.141236305236816, + "learning_rate": 7.254601226993866e-06, + "loss": 1.2486, + "step": 1420 + }, + { + "epoch": 0.36554192229038857, + "grad_norm": 29.229612350463867, + "learning_rate": 7.305725971370144e-06, + "loss": 1.2913, + "step": 1430 + }, + { + "epoch": 0.36809815950920244, + "grad_norm": 13.788121223449707, + "learning_rate": 7.356850715746422e-06, + "loss": 1.2888, + "step": 1440 + }, + { + "epoch": 0.37065439672801637, + "grad_norm": 22.21321678161621, + "learning_rate": 7.407975460122701e-06, + "loss": 1.2711, + "step": 1450 + }, + { + "epoch": 0.37321063394683024, + "grad_norm": 15.443243980407715, + "learning_rate": 7.459100204498978e-06, + "loss": 1.0954, + "step": 1460 + }, + { + "epoch": 0.37576687116564417, + "grad_norm": 16.390304565429688, + "learning_rate": 7.510224948875257e-06, + "loss": 1.267, + "step": 1470 + }, + { + "epoch": 0.3783231083844581, + "grad_norm": 25.774921417236328, + "learning_rate": 7.561349693251534e-06, + "loss": 1.4799, + "step": 1480 + }, + { + "epoch": 0.38087934560327197, + "grad_norm": 12.54340648651123, + "learning_rate": 7.612474437627812e-06, + "loss": 1.3772, + "step": 1490 + }, + { + "epoch": 0.3834355828220859, + "grad_norm": 22.544086456298828, + "learning_rate": 7.663599182004092e-06, + "loss": 1.0647, + "step": 1500 + }, + { + "epoch": 0.38599182004089977, + "grad_norm": 18.576513290405273, + "learning_rate": 7.714723926380368e-06, + "loss": 1.0959, + "step": 1510 + }, + { + "epoch": 0.3885480572597137, + "grad_norm": 29.345508575439453, + "learning_rate": 7.765848670756647e-06, + "loss": 0.7637, + "step": 1520 + }, + { + "epoch": 0.3911042944785276, + "grad_norm": 17.49864387512207, + "learning_rate": 7.816973415132925e-06, + "loss": 1.0717, + "step": 1530 + }, + { + "epoch": 0.3936605316973415, + "grad_norm": 16.895015716552734, + "learning_rate": 7.868098159509204e-06, + "loss": 1.1233, + "step": 1540 + }, + { + "epoch": 0.3962167689161554, + "grad_norm": 13.488862991333008, + "learning_rate": 7.919222903885482e-06, + "loss": 1.2759, + "step": 1550 + }, + { + "epoch": 0.3987730061349693, + "grad_norm": 32.994239807128906, + "learning_rate": 7.97034764826176e-06, + "loss": 1.3437, + "step": 1560 + }, + { + "epoch": 0.4013292433537832, + "grad_norm": 16.069793701171875, + "learning_rate": 8.021472392638038e-06, + "loss": 1.2101, + "step": 1570 + }, + { + "epoch": 0.40388548057259716, + "grad_norm": 12.347966194152832, + "learning_rate": 8.072597137014315e-06, + "loss": 0.8976, + "step": 1580 + }, + { + "epoch": 0.40644171779141103, + "grad_norm": 19.300352096557617, + "learning_rate": 8.123721881390593e-06, + "loss": 1.0809, + "step": 1590 + }, + { + "epoch": 0.40899795501022496, + "grad_norm": 18.28475570678711, + "learning_rate": 8.174846625766872e-06, + "loss": 1.1435, + "step": 1600 + }, + { + "epoch": 0.41155419222903883, + "grad_norm": 32.143680572509766, + "learning_rate": 8.22597137014315e-06, + "loss": 1.1703, + "step": 1610 + }, + { + "epoch": 0.41411042944785276, + "grad_norm": 16.422698974609375, + "learning_rate": 8.277096114519429e-06, + "loss": 0.8745, + "step": 1620 + }, + { + "epoch": 0.4166666666666667, + "grad_norm": 29.04837989807129, + "learning_rate": 8.328220858895705e-06, + "loss": 0.929, + "step": 1630 + }, + { + "epoch": 0.41922290388548056, + "grad_norm": 18.266582489013672, + "learning_rate": 8.379345603271984e-06, + "loss": 1.0091, + "step": 1640 + }, + { + "epoch": 0.4217791411042945, + "grad_norm": 15.355749130249023, + "learning_rate": 8.430470347648262e-06, + "loss": 0.9601, + "step": 1650 + }, + { + "epoch": 0.42433537832310836, + "grad_norm": 11.973981857299805, + "learning_rate": 8.481595092024541e-06, + "loss": 1.0835, + "step": 1660 + }, + { + "epoch": 0.4268916155419223, + "grad_norm": 17.572921752929688, + "learning_rate": 8.53271983640082e-06, + "loss": 1.0341, + "step": 1670 + }, + { + "epoch": 0.4294478527607362, + "grad_norm": 13.629963874816895, + "learning_rate": 8.583844580777096e-06, + "loss": 1.1951, + "step": 1680 + }, + { + "epoch": 0.4320040899795501, + "grad_norm": 10.527235984802246, + "learning_rate": 8.634969325153375e-06, + "loss": 1.0702, + "step": 1690 + }, + { + "epoch": 0.434560327198364, + "grad_norm": 17.04031753540039, + "learning_rate": 8.686094069529653e-06, + "loss": 1.2097, + "step": 1700 + }, + { + "epoch": 0.4371165644171779, + "grad_norm": 11.430649757385254, + "learning_rate": 8.737218813905932e-06, + "loss": 0.7584, + "step": 1710 + }, + { + "epoch": 0.4396728016359918, + "grad_norm": 10.45757007598877, + "learning_rate": 8.78834355828221e-06, + "loss": 1.1054, + "step": 1720 + }, + { + "epoch": 0.44222903885480574, + "grad_norm": 17.184608459472656, + "learning_rate": 8.839468302658487e-06, + "loss": 0.8942, + "step": 1730 + }, + { + "epoch": 0.4447852760736196, + "grad_norm": 11.653769493103027, + "learning_rate": 8.890593047034766e-06, + "loss": 1.2842, + "step": 1740 + }, + { + "epoch": 0.44734151329243355, + "grad_norm": 17.205242156982422, + "learning_rate": 8.941717791411042e-06, + "loss": 0.9786, + "step": 1750 + }, + { + "epoch": 0.4498977505112474, + "grad_norm": 27.1918888092041, + "learning_rate": 8.992842535787321e-06, + "loss": 1.2404, + "step": 1760 + }, + { + "epoch": 0.45245398773006135, + "grad_norm": 14.006787300109863, + "learning_rate": 9.043967280163601e-06, + "loss": 1.0146, + "step": 1770 + }, + { + "epoch": 0.4550102249488753, + "grad_norm": 21.269569396972656, + "learning_rate": 9.095092024539878e-06, + "loss": 0.9494, + "step": 1780 + }, + { + "epoch": 0.45756646216768915, + "grad_norm": 13.8292236328125, + "learning_rate": 9.146216768916156e-06, + "loss": 1.2056, + "step": 1790 + }, + { + "epoch": 0.4601226993865031, + "grad_norm": 11.28924560546875, + "learning_rate": 9.197341513292433e-06, + "loss": 1.1219, + "step": 1800 + }, + { + "epoch": 0.46267893660531695, + "grad_norm": 24.358989715576172, + "learning_rate": 9.248466257668712e-06, + "loss": 1.0259, + "step": 1810 + }, + { + "epoch": 0.4652351738241309, + "grad_norm": 16.623613357543945, + "learning_rate": 9.29959100204499e-06, + "loss": 0.8764, + "step": 1820 + }, + { + "epoch": 0.4677914110429448, + "grad_norm": 11.915813446044922, + "learning_rate": 9.350715746421269e-06, + "loss": 0.9114, + "step": 1830 + }, + { + "epoch": 0.4703476482617587, + "grad_norm": 14.000443458557129, + "learning_rate": 9.401840490797547e-06, + "loss": 0.9903, + "step": 1840 + }, + { + "epoch": 0.4729038854805726, + "grad_norm": 9.23658561706543, + "learning_rate": 9.452965235173824e-06, + "loss": 1.1503, + "step": 1850 + }, + { + "epoch": 0.4754601226993865, + "grad_norm": 14.627740859985352, + "learning_rate": 9.504089979550103e-06, + "loss": 1.0605, + "step": 1860 + }, + { + "epoch": 0.4780163599182004, + "grad_norm": 13.077226638793945, + "learning_rate": 9.555214723926381e-06, + "loss": 0.9759, + "step": 1870 + }, + { + "epoch": 0.48057259713701433, + "grad_norm": 9.975872993469238, + "learning_rate": 9.60633946830266e-06, + "loss": 0.9908, + "step": 1880 + }, + { + "epoch": 0.4831288343558282, + "grad_norm": 15.750456809997559, + "learning_rate": 9.657464212678938e-06, + "loss": 1.0758, + "step": 1890 + }, + { + "epoch": 0.48568507157464214, + "grad_norm": 10.907366752624512, + "learning_rate": 9.708588957055215e-06, + "loss": 0.8757, + "step": 1900 + }, + { + "epoch": 0.488241308793456, + "grad_norm": 26.87792205810547, + "learning_rate": 9.759713701431493e-06, + "loss": 0.9745, + "step": 1910 + }, + { + "epoch": 0.49079754601226994, + "grad_norm": 10.880130767822266, + "learning_rate": 9.810838445807772e-06, + "loss": 0.9391, + "step": 1920 + }, + { + "epoch": 0.49335378323108386, + "grad_norm": 19.826669692993164, + "learning_rate": 9.86196319018405e-06, + "loss": 1.1917, + "step": 1930 + }, + { + "epoch": 0.49591002044989774, + "grad_norm": 11.035025596618652, + "learning_rate": 9.913087934560329e-06, + "loss": 0.7836, + "step": 1940 + }, + { + "epoch": 0.49846625766871167, + "grad_norm": 13.407333374023438, + "learning_rate": 9.964212678936606e-06, + "loss": 1.1624, + "step": 1950 + }, + { + "epoch": 0.5010224948875256, + "grad_norm": 18.5594482421875, + "learning_rate": 9.999999283428496e-06, + "loss": 1.0359, + "step": 1960 + }, + { + "epoch": 0.5035787321063395, + "grad_norm": 30.378826141357422, + "learning_rate": 9.999986544385255e-06, + "loss": 0.9342, + "step": 1970 + }, + { + "epoch": 0.5061349693251533, + "grad_norm": 26.27793312072754, + "learning_rate": 9.99995788157752e-06, + "loss": 0.7684, + "step": 1980 + }, + { + "epoch": 0.5086912065439673, + "grad_norm": 17.525869369506836, + "learning_rate": 9.999913295096573e-06, + "loss": 1.2072, + "step": 1990 + }, + { + "epoch": 0.5112474437627812, + "grad_norm": 19.318090438842773, + "learning_rate": 9.999852785084414e-06, + "loss": 0.9006, + "step": 2000 + }, + { + "epoch": 0.5138036809815951, + "grad_norm": 11.649446487426758, + "learning_rate": 9.999776351733751e-06, + "loss": 0.831, + "step": 2010 + }, + { + "epoch": 0.516359918200409, + "grad_norm": 18.077003479003906, + "learning_rate": 9.999683995288008e-06, + "loss": 0.8372, + "step": 2020 + }, + { + "epoch": 0.5189161554192229, + "grad_norm": 24.69324493408203, + "learning_rate": 9.999575716041316e-06, + "loss": 1.0961, + "step": 2030 + }, + { + "epoch": 0.5214723926380368, + "grad_norm": 10.308004379272461, + "learning_rate": 9.99945151433852e-06, + "loss": 1.0896, + "step": 2040 + }, + { + "epoch": 0.5240286298568507, + "grad_norm": 14.579326629638672, + "learning_rate": 9.99931139057517e-06, + "loss": 0.8101, + "step": 2050 + }, + { + "epoch": 0.5265848670756647, + "grad_norm": 19.19144630432129, + "learning_rate": 9.999155345197531e-06, + "loss": 0.9718, + "step": 2060 + }, + { + "epoch": 0.5291411042944786, + "grad_norm": 14.424161911010742, + "learning_rate": 9.99898337870257e-06, + "loss": 1.1082, + "step": 2070 + }, + { + "epoch": 0.5316973415132924, + "grad_norm": 11.568525314331055, + "learning_rate": 9.998795491637956e-06, + "loss": 0.9928, + "step": 2080 + }, + { + "epoch": 0.5342535787321063, + "grad_norm": 28.195453643798828, + "learning_rate": 9.998591684602065e-06, + "loss": 0.967, + "step": 2090 + }, + { + "epoch": 0.5368098159509203, + "grad_norm": 11.809616088867188, + "learning_rate": 9.998371958243977e-06, + "loss": 0.8879, + "step": 2100 + }, + { + "epoch": 0.5393660531697342, + "grad_norm": 11.77135944366455, + "learning_rate": 9.998136313263465e-06, + "loss": 1.0883, + "step": 2110 + }, + { + "epoch": 0.5419222903885481, + "grad_norm": 17.555498123168945, + "learning_rate": 9.997884750411004e-06, + "loss": 1.0922, + "step": 2120 + }, + { + "epoch": 0.5444785276073619, + "grad_norm": 11.646632194519043, + "learning_rate": 9.997617270487761e-06, + "loss": 0.831, + "step": 2130 + }, + { + "epoch": 0.5470347648261759, + "grad_norm": 11.330808639526367, + "learning_rate": 9.997333874345594e-06, + "loss": 1.1629, + "step": 2140 + }, + { + "epoch": 0.5495910020449898, + "grad_norm": 12.656023979187012, + "learning_rate": 9.997034562887054e-06, + "loss": 1.1112, + "step": 2150 + }, + { + "epoch": 0.5521472392638037, + "grad_norm": 10.297701835632324, + "learning_rate": 9.996719337065376e-06, + "loss": 0.9942, + "step": 2160 + }, + { + "epoch": 0.5547034764826176, + "grad_norm": 20.408578872680664, + "learning_rate": 9.99638819788448e-06, + "loss": 0.7756, + "step": 2170 + }, + { + "epoch": 0.5572597137014315, + "grad_norm": 13.656134605407715, + "learning_rate": 9.996041146398963e-06, + "loss": 1.2323, + "step": 2180 + }, + { + "epoch": 0.5598159509202454, + "grad_norm": 10.573500633239746, + "learning_rate": 9.995678183714104e-06, + "loss": 0.9494, + "step": 2190 + }, + { + "epoch": 0.5623721881390593, + "grad_norm": 30.932117462158203, + "learning_rate": 9.99529931098585e-06, + "loss": 0.9215, + "step": 2200 + }, + { + "epoch": 0.5649284253578732, + "grad_norm": 12.926258087158203, + "learning_rate": 9.994904529420824e-06, + "loss": 1.151, + "step": 2210 + }, + { + "epoch": 0.5674846625766872, + "grad_norm": 9.75345516204834, + "learning_rate": 9.994493840276308e-06, + "loss": 1.0613, + "step": 2220 + }, + { + "epoch": 0.570040899795501, + "grad_norm": 15.309710502624512, + "learning_rate": 9.99406724486025e-06, + "loss": 1.1024, + "step": 2230 + }, + { + "epoch": 0.5725971370143149, + "grad_norm": 13.060432434082031, + "learning_rate": 9.993624744531253e-06, + "loss": 0.8317, + "step": 2240 + }, + { + "epoch": 0.5751533742331288, + "grad_norm": 22.823984146118164, + "learning_rate": 9.993166340698577e-06, + "loss": 0.9703, + "step": 2250 + }, + { + "epoch": 0.5777096114519428, + "grad_norm": 11.097712516784668, + "learning_rate": 9.992692034822127e-06, + "loss": 0.9237, + "step": 2260 + }, + { + "epoch": 0.5802658486707567, + "grad_norm": 14.171446800231934, + "learning_rate": 9.992201828412458e-06, + "loss": 0.9436, + "step": 2270 + }, + { + "epoch": 0.5828220858895705, + "grad_norm": 10.901077270507812, + "learning_rate": 9.991695723030755e-06, + "loss": 0.9086, + "step": 2280 + }, + { + "epoch": 0.5853783231083844, + "grad_norm": 44.82511901855469, + "learning_rate": 9.991173720288847e-06, + "loss": 0.7686, + "step": 2290 + }, + { + "epoch": 0.5879345603271984, + "grad_norm": 8.220059394836426, + "learning_rate": 9.990635821849187e-06, + "loss": 0.7624, + "step": 2300 + }, + { + "epoch": 0.5904907975460123, + "grad_norm": 11.58703327178955, + "learning_rate": 9.990082029424852e-06, + "loss": 0.7953, + "step": 2310 + }, + { + "epoch": 0.5930470347648262, + "grad_norm": 18.552797317504883, + "learning_rate": 9.989512344779541e-06, + "loss": 0.7791, + "step": 2320 + }, + { + "epoch": 0.59560327198364, + "grad_norm": 16.435989379882812, + "learning_rate": 9.988926769727563e-06, + "loss": 1.1133, + "step": 2330 + }, + { + "epoch": 0.598159509202454, + "grad_norm": 9.04973316192627, + "learning_rate": 9.988325306133832e-06, + "loss": 0.8, + "step": 2340 + }, + { + "epoch": 0.6007157464212679, + "grad_norm": 9.818502426147461, + "learning_rate": 9.987707955913873e-06, + "loss": 0.7636, + "step": 2350 + }, + { + "epoch": 0.6032719836400818, + "grad_norm": 8.12960147857666, + "learning_rate": 9.98707472103379e-06, + "loss": 0.7332, + "step": 2360 + }, + { + "epoch": 0.6058282208588958, + "grad_norm": 14.352721214294434, + "learning_rate": 9.986425603510292e-06, + "loss": 0.7819, + "step": 2370 + }, + { + "epoch": 0.6083844580777096, + "grad_norm": 6.8704986572265625, + "learning_rate": 9.985760605410662e-06, + "loss": 0.7691, + "step": 2380 + }, + { + "epoch": 0.6109406952965235, + "grad_norm": 10.685389518737793, + "learning_rate": 9.985079728852759e-06, + "loss": 0.8252, + "step": 2390 + }, + { + "epoch": 0.6134969325153374, + "grad_norm": 16.207923889160156, + "learning_rate": 9.98438297600501e-06, + "loss": 0.9821, + "step": 2400 + }, + { + "epoch": 0.6160531697341514, + "grad_norm": 15.584657669067383, + "learning_rate": 9.983670349086413e-06, + "loss": 0.876, + "step": 2410 + }, + { + "epoch": 0.6186094069529653, + "grad_norm": 15.134186744689941, + "learning_rate": 9.982941850366513e-06, + "loss": 0.5934, + "step": 2420 + }, + { + "epoch": 0.6211656441717791, + "grad_norm": 28.123193740844727, + "learning_rate": 9.982197482165398e-06, + "loss": 0.7742, + "step": 2430 + }, + { + "epoch": 0.623721881390593, + "grad_norm": 17.409650802612305, + "learning_rate": 9.981437246853712e-06, + "loss": 0.7065, + "step": 2440 + }, + { + "epoch": 0.626278118609407, + "grad_norm": 13.156755447387695, + "learning_rate": 9.980661146852619e-06, + "loss": 0.6499, + "step": 2450 + }, + { + "epoch": 0.6288343558282209, + "grad_norm": 20.250652313232422, + "learning_rate": 9.979869184633812e-06, + "loss": 0.7821, + "step": 2460 + }, + { + "epoch": 0.6313905930470347, + "grad_norm": 52.275699615478516, + "learning_rate": 9.979061362719502e-06, + "loss": 0.8, + "step": 2470 + }, + { + "epoch": 0.6339468302658486, + "grad_norm": 10.591206550598145, + "learning_rate": 9.97823768368241e-06, + "loss": 1.0135, + "step": 2480 + }, + { + "epoch": 0.6365030674846626, + "grad_norm": 20.04345703125, + "learning_rate": 9.977398150145758e-06, + "loss": 0.9202, + "step": 2490 + }, + { + "epoch": 0.6390593047034765, + "grad_norm": 15.350805282592773, + "learning_rate": 9.976542764783256e-06, + "loss": 1.0958, + "step": 2500 + }, + { + "epoch": 0.6416155419222904, + "grad_norm": 10.294832229614258, + "learning_rate": 9.97567153031911e-06, + "loss": 0.9347, + "step": 2510 + }, + { + "epoch": 0.6441717791411042, + "grad_norm": 18.00196075439453, + "learning_rate": 9.974784449527984e-06, + "loss": 0.776, + "step": 2520 + }, + { + "epoch": 0.6467280163599182, + "grad_norm": 15.802022933959961, + "learning_rate": 9.973881525235028e-06, + "loss": 0.7016, + "step": 2530 + }, + { + "epoch": 0.6492842535787321, + "grad_norm": 16.474000930786133, + "learning_rate": 9.972962760315834e-06, + "loss": 0.9632, + "step": 2540 + }, + { + "epoch": 0.651840490797546, + "grad_norm": 20.025535583496094, + "learning_rate": 9.972028157696452e-06, + "loss": 0.9582, + "step": 2550 + }, + { + "epoch": 0.65439672801636, + "grad_norm": 20.044818878173828, + "learning_rate": 9.971077720353368e-06, + "loss": 0.8913, + "step": 2560 + }, + { + "epoch": 0.6569529652351738, + "grad_norm": 10.750015258789062, + "learning_rate": 9.970111451313498e-06, + "loss": 0.9251, + "step": 2570 + }, + { + "epoch": 0.6595092024539877, + "grad_norm": 13.033714294433594, + "learning_rate": 9.969129353654179e-06, + "loss": 0.8761, + "step": 2580 + }, + { + "epoch": 0.6620654396728016, + "grad_norm": 9.243477821350098, + "learning_rate": 9.968131430503157e-06, + "loss": 0.5353, + "step": 2590 + }, + { + "epoch": 0.6646216768916156, + "grad_norm": 8.169621467590332, + "learning_rate": 9.96711768503858e-06, + "loss": 0.6617, + "step": 2600 + }, + { + "epoch": 0.6671779141104295, + "grad_norm": 21.10552406311035, + "learning_rate": 9.966088120488985e-06, + "loss": 0.5695, + "step": 2610 + }, + { + "epoch": 0.6697341513292433, + "grad_norm": 9.105271339416504, + "learning_rate": 9.96504274013329e-06, + "loss": 0.9342, + "step": 2620 + }, + { + "epoch": 0.6722903885480572, + "grad_norm": 12.127760887145996, + "learning_rate": 9.96398154730078e-06, + "loss": 0.8841, + "step": 2630 + }, + { + "epoch": 0.6748466257668712, + "grad_norm": 6.325476169586182, + "learning_rate": 9.962904545371104e-06, + "loss": 0.6288, + "step": 2640 + }, + { + "epoch": 0.6774028629856851, + "grad_norm": 36.65105438232422, + "learning_rate": 9.961811737774256e-06, + "loss": 0.7858, + "step": 2650 + }, + { + "epoch": 0.679959100204499, + "grad_norm": 12.881020545959473, + "learning_rate": 9.960703127990564e-06, + "loss": 0.6614, + "step": 2660 + }, + { + "epoch": 0.6825153374233128, + "grad_norm": 9.100659370422363, + "learning_rate": 9.959578719550689e-06, + "loss": 0.753, + "step": 2670 + }, + { + "epoch": 0.6850715746421268, + "grad_norm": 6.299210071563721, + "learning_rate": 9.958438516035604e-06, + "loss": 0.7298, + "step": 2680 + }, + { + "epoch": 0.6876278118609407, + "grad_norm": 10.514267921447754, + "learning_rate": 9.957282521076583e-06, + "loss": 0.7337, + "step": 2690 + }, + { + "epoch": 0.6901840490797546, + "grad_norm": 6.144178867340088, + "learning_rate": 9.956110738355197e-06, + "loss": 0.7576, + "step": 2700 + }, + { + "epoch": 0.6927402862985685, + "grad_norm": 7.862902641296387, + "learning_rate": 9.95492317160329e-06, + "loss": 0.8132, + "step": 2710 + }, + { + "epoch": 0.6952965235173824, + "grad_norm": 15.029640197753906, + "learning_rate": 9.953719824602982e-06, + "loss": 0.7462, + "step": 2720 + }, + { + "epoch": 0.6978527607361963, + "grad_norm": 13.379220008850098, + "learning_rate": 9.952500701186649e-06, + "loss": 0.4353, + "step": 2730 + }, + { + "epoch": 0.7004089979550102, + "grad_norm": 8.90844440460205, + "learning_rate": 9.951265805236903e-06, + "loss": 0.6655, + "step": 2740 + }, + { + "epoch": 0.7029652351738241, + "grad_norm": 14.42451000213623, + "learning_rate": 9.950015140686595e-06, + "loss": 0.6928, + "step": 2750 + }, + { + "epoch": 0.7055214723926381, + "grad_norm": 9.552287101745605, + "learning_rate": 9.948748711518792e-06, + "loss": 0.5294, + "step": 2760 + }, + { + "epoch": 0.7080777096114519, + "grad_norm": 12.426175117492676, + "learning_rate": 9.947466521766772e-06, + "loss": 0.7148, + "step": 2770 + }, + { + "epoch": 0.7106339468302658, + "grad_norm": 16.0783748626709, + "learning_rate": 9.946168575514e-06, + "loss": 0.6684, + "step": 2780 + }, + { + "epoch": 0.7131901840490797, + "grad_norm": 10.560613632202148, + "learning_rate": 9.94485487689413e-06, + "loss": 0.7561, + "step": 2790 + }, + { + "epoch": 0.7157464212678937, + "grad_norm": 13.276518821716309, + "learning_rate": 9.943525430090973e-06, + "loss": 0.5811, + "step": 2800 + }, + { + "epoch": 0.7183026584867076, + "grad_norm": 13.999181747436523, + "learning_rate": 9.942180239338503e-06, + "loss": 0.5591, + "step": 2810 + }, + { + "epoch": 0.7208588957055214, + "grad_norm": 12.428943634033203, + "learning_rate": 9.940819308920832e-06, + "loss": 0.7026, + "step": 2820 + }, + { + "epoch": 0.7234151329243353, + "grad_norm": 7.707891941070557, + "learning_rate": 9.939442643172197e-06, + "loss": 0.7179, + "step": 2830 + }, + { + "epoch": 0.7259713701431493, + "grad_norm": 7.399072170257568, + "learning_rate": 9.93805024647695e-06, + "loss": 0.664, + "step": 2840 + }, + { + "epoch": 0.7285276073619632, + "grad_norm": 23.526582717895508, + "learning_rate": 9.936642123269546e-06, + "loss": 0.7611, + "step": 2850 + }, + { + "epoch": 0.7310838445807771, + "grad_norm": 9.424376487731934, + "learning_rate": 9.93521827803452e-06, + "loss": 0.7113, + "step": 2860 + }, + { + "epoch": 0.733640081799591, + "grad_norm": 13.683032989501953, + "learning_rate": 9.933778715306474e-06, + "loss": 0.4565, + "step": 2870 + }, + { + "epoch": 0.7361963190184049, + "grad_norm": 8.428793907165527, + "learning_rate": 9.932323439670079e-06, + "loss": 0.6818, + "step": 2880 + }, + { + "epoch": 0.7387525562372188, + "grad_norm": 20.064414978027344, + "learning_rate": 9.930852455760039e-06, + "loss": 0.6954, + "step": 2890 + }, + { + "epoch": 0.7413087934560327, + "grad_norm": 12.071993827819824, + "learning_rate": 9.929365768261085e-06, + "loss": 0.8114, + "step": 2900 + }, + { + "epoch": 0.7438650306748467, + "grad_norm": 10.930386543273926, + "learning_rate": 9.927863381907963e-06, + "loss": 0.7282, + "step": 2910 + }, + { + "epoch": 0.7464212678936605, + "grad_norm": 10.170836448669434, + "learning_rate": 9.926345301485414e-06, + "loss": 0.9321, + "step": 2920 + }, + { + "epoch": 0.7489775051124744, + "grad_norm": 6.6626129150390625, + "learning_rate": 9.924811531828164e-06, + "loss": 0.7144, + "step": 2930 + }, + { + "epoch": 0.7515337423312883, + "grad_norm": 8.486347198486328, + "learning_rate": 9.923262077820903e-06, + "loss": 0.5393, + "step": 2940 + }, + { + "epoch": 0.7540899795501023, + "grad_norm": 12.877697944641113, + "learning_rate": 9.921696944398274e-06, + "loss": 0.4268, + "step": 2950 + }, + { + "epoch": 0.7566462167689162, + "grad_norm": 11.594487190246582, + "learning_rate": 9.920116136544849e-06, + "loss": 0.5911, + "step": 2960 + }, + { + "epoch": 0.75920245398773, + "grad_norm": 15.745911598205566, + "learning_rate": 9.918519659295127e-06, + "loss": 0.7711, + "step": 2970 + }, + { + "epoch": 0.7617586912065439, + "grad_norm": 13.972307205200195, + "learning_rate": 9.916907517733508e-06, + "loss": 0.5574, + "step": 2980 + }, + { + "epoch": 0.7643149284253579, + "grad_norm": 6.976569175720215, + "learning_rate": 9.915279716994276e-06, + "loss": 0.4998, + "step": 2990 + }, + { + "epoch": 0.7668711656441718, + "grad_norm": 6.9776787757873535, + "learning_rate": 9.913636262261592e-06, + "loss": 0.4069, + "step": 3000 + }, + { + "epoch": 0.7694274028629857, + "grad_norm": 12.110786437988281, + "learning_rate": 9.911977158769461e-06, + "loss": 0.6704, + "step": 3010 + }, + { + "epoch": 0.7719836400817995, + "grad_norm": 6.544830799102783, + "learning_rate": 9.910302411801738e-06, + "loss": 0.5889, + "step": 3020 + }, + { + "epoch": 0.7745398773006135, + "grad_norm": 8.968564987182617, + "learning_rate": 9.90861202669209e-06, + "loss": 0.4109, + "step": 3030 + }, + { + "epoch": 0.7770961145194274, + "grad_norm": 16.600383758544922, + "learning_rate": 9.906906008823989e-06, + "loss": 0.9562, + "step": 3040 + }, + { + "epoch": 0.7796523517382413, + "grad_norm": 21.926057815551758, + "learning_rate": 9.905184363630698e-06, + "loss": 0.5117, + "step": 3050 + }, + { + "epoch": 0.7822085889570553, + "grad_norm": 13.331565856933594, + "learning_rate": 9.903447096595245e-06, + "loss": 0.5186, + "step": 3060 + }, + { + "epoch": 0.7847648261758691, + "grad_norm": 10.782326698303223, + "learning_rate": 9.90169421325041e-06, + "loss": 0.5799, + "step": 3070 + }, + { + "epoch": 0.787321063394683, + "grad_norm": 20.489850997924805, + "learning_rate": 9.89992571917871e-06, + "loss": 0.4487, + "step": 3080 + }, + { + "epoch": 0.7898773006134969, + "grad_norm": 12.216683387756348, + "learning_rate": 9.898141620012374e-06, + "loss": 0.636, + "step": 3090 + }, + { + "epoch": 0.7924335378323109, + "grad_norm": 8.060449600219727, + "learning_rate": 9.896341921433337e-06, + "loss": 0.6251, + "step": 3100 + }, + { + "epoch": 0.7949897750511248, + "grad_norm": 5.005650997161865, + "learning_rate": 9.894526629173204e-06, + "loss": 0.6748, + "step": 3110 + }, + { + "epoch": 0.7975460122699386, + "grad_norm": 11.046931266784668, + "learning_rate": 9.892695749013253e-06, + "loss": 0.599, + "step": 3120 + }, + { + "epoch": 0.8001022494887525, + "grad_norm": 11.397811889648438, + "learning_rate": 9.890849286784398e-06, + "loss": 0.7874, + "step": 3130 + }, + { + "epoch": 0.8026584867075665, + "grad_norm": 8.473251342773438, + "learning_rate": 9.888987248367181e-06, + "loss": 0.6328, + "step": 3140 + }, + { + "epoch": 0.8052147239263804, + "grad_norm": 11.444445610046387, + "learning_rate": 9.88710963969175e-06, + "loss": 0.5749, + "step": 3150 + }, + { + "epoch": 0.8077709611451943, + "grad_norm": 8.93635082244873, + "learning_rate": 9.885216466737843e-06, + "loss": 0.7803, + "step": 3160 + }, + { + "epoch": 0.8103271983640081, + "grad_norm": 8.53089714050293, + "learning_rate": 9.883307735534761e-06, + "loss": 0.6362, + "step": 3170 + }, + { + "epoch": 0.8128834355828221, + "grad_norm": 4.943642616271973, + "learning_rate": 9.88138345216136e-06, + "loss": 0.6297, + "step": 3180 + }, + { + "epoch": 0.815439672801636, + "grad_norm": 10.993963241577148, + "learning_rate": 9.87944362274602e-06, + "loss": 0.4654, + "step": 3190 + }, + { + "epoch": 0.8179959100204499, + "grad_norm": 20.30816650390625, + "learning_rate": 9.87748825346664e-06, + "loss": 0.5197, + "step": 3200 + }, + { + "epoch": 0.8205521472392638, + "grad_norm": 10.663908004760742, + "learning_rate": 9.875517350550601e-06, + "loss": 0.6027, + "step": 3210 + }, + { + "epoch": 0.8231083844580777, + "grad_norm": 7.101048469543457, + "learning_rate": 9.873530920274761e-06, + "loss": 0.5027, + "step": 3220 + }, + { + "epoch": 0.8256646216768916, + "grad_norm": 16.21637725830078, + "learning_rate": 9.871528968965426e-06, + "loss": 0.6488, + "step": 3230 + }, + { + "epoch": 0.8282208588957055, + "grad_norm": 11.160218238830566, + "learning_rate": 9.86951150299833e-06, + "loss": 0.6848, + "step": 3240 + }, + { + "epoch": 0.8307770961145194, + "grad_norm": 7.589058876037598, + "learning_rate": 9.867478528798625e-06, + "loss": 0.3006, + "step": 3250 + }, + { + "epoch": 0.8333333333333334, + "grad_norm": 13.101618766784668, + "learning_rate": 9.865430052840849e-06, + "loss": 0.6459, + "step": 3260 + }, + { + "epoch": 0.8358895705521472, + "grad_norm": 6.775156021118164, + "learning_rate": 9.863366081648907e-06, + "loss": 0.5887, + "step": 3270 + }, + { + "epoch": 0.8384458077709611, + "grad_norm": 14.762919425964355, + "learning_rate": 9.861286621796056e-06, + "loss": 0.4892, + "step": 3280 + }, + { + "epoch": 0.841002044989775, + "grad_norm": 22.660533905029297, + "learning_rate": 9.85919167990488e-06, + "loss": 0.5762, + "step": 3290 + }, + { + "epoch": 0.843558282208589, + "grad_norm": 12.753227233886719, + "learning_rate": 9.857081262647269e-06, + "loss": 0.6596, + "step": 3300 + }, + { + "epoch": 0.8461145194274029, + "grad_norm": 14.134135246276855, + "learning_rate": 9.854955376744397e-06, + "loss": 0.5865, + "step": 3310 + }, + { + "epoch": 0.8486707566462167, + "grad_norm": 7.306004047393799, + "learning_rate": 9.852814028966706e-06, + "loss": 0.5196, + "step": 3320 + }, + { + "epoch": 0.8512269938650306, + "grad_norm": 12.3103609085083, + "learning_rate": 9.850657226133878e-06, + "loss": 0.605, + "step": 3330 + }, + { + "epoch": 0.8537832310838446, + "grad_norm": 7.823228359222412, + "learning_rate": 9.848484975114812e-06, + "loss": 0.6368, + "step": 3340 + }, + { + "epoch": 0.8563394683026585, + "grad_norm": 11.120277404785156, + "learning_rate": 9.846297282827612e-06, + "loss": 0.4841, + "step": 3350 + }, + { + "epoch": 0.8588957055214724, + "grad_norm": 8.988906860351562, + "learning_rate": 9.844094156239557e-06, + "loss": 0.5918, + "step": 3360 + }, + { + "epoch": 0.8614519427402862, + "grad_norm": 14.820247650146484, + "learning_rate": 9.841875602367079e-06, + "loss": 0.4307, + "step": 3370 + }, + { + "epoch": 0.8640081799591002, + "grad_norm": 7.334587097167969, + "learning_rate": 9.83964162827574e-06, + "loss": 0.564, + "step": 3380 + }, + { + "epoch": 0.8665644171779141, + "grad_norm": 11.864500999450684, + "learning_rate": 9.837392241080218e-06, + "loss": 0.5235, + "step": 3390 + }, + { + "epoch": 0.869120654396728, + "grad_norm": 10.920977592468262, + "learning_rate": 9.835127447944274e-06, + "loss": 0.4475, + "step": 3400 + }, + { + "epoch": 0.871676891615542, + "grad_norm": 8.427702903747559, + "learning_rate": 9.832847256080734e-06, + "loss": 0.5594, + "step": 3410 + }, + { + "epoch": 0.8742331288343558, + "grad_norm": 9.778414726257324, + "learning_rate": 9.830551672751463e-06, + "loss": 0.6194, + "step": 3420 + }, + { + "epoch": 0.8767893660531697, + "grad_norm": 8.027331352233887, + "learning_rate": 9.82824070526735e-06, + "loss": 0.5957, + "step": 3430 + }, + { + "epoch": 0.8793456032719836, + "grad_norm": 6.331071376800537, + "learning_rate": 9.825914360988271e-06, + "loss": 0.5145, + "step": 3440 + }, + { + "epoch": 0.8819018404907976, + "grad_norm": 8.607481956481934, + "learning_rate": 9.82357264732308e-06, + "loss": 0.5986, + "step": 3450 + }, + { + "epoch": 0.8844580777096115, + "grad_norm": 6.551468849182129, + "learning_rate": 9.821215571729578e-06, + "loss": 0.5461, + "step": 3460 + }, + { + "epoch": 0.8870143149284253, + "grad_norm": 6.835443496704102, + "learning_rate": 9.818843141714486e-06, + "loss": 0.7021, + "step": 3470 + }, + { + "epoch": 0.8895705521472392, + "grad_norm": 7.249754428863525, + "learning_rate": 9.81645536483343e-06, + "loss": 0.6188, + "step": 3480 + }, + { + "epoch": 0.8921267893660532, + "grad_norm": 7.487998962402344, + "learning_rate": 9.814052248690906e-06, + "loss": 0.4203, + "step": 3490 + }, + { + "epoch": 0.8946830265848671, + "grad_norm": 17.97199821472168, + "learning_rate": 9.81163380094027e-06, + "loss": 0.4725, + "step": 3500 + }, + { + "epoch": 0.897239263803681, + "grad_norm": 15.719616889953613, + "learning_rate": 9.809200029283698e-06, + "loss": 0.5723, + "step": 3510 + }, + { + "epoch": 0.8997955010224948, + "grad_norm": 9.500740051269531, + "learning_rate": 9.806750941472175e-06, + "loss": 0.417, + "step": 3520 + }, + { + "epoch": 0.9023517382413088, + "grad_norm": 7.425899505615234, + "learning_rate": 9.804286545305456e-06, + "loss": 0.4884, + "step": 3530 + }, + { + "epoch": 0.9049079754601227, + "grad_norm": 8.523987770080566, + "learning_rate": 9.801806848632062e-06, + "loss": 0.4925, + "step": 3540 + }, + { + "epoch": 0.9074642126789366, + "grad_norm": 13.769088745117188, + "learning_rate": 9.799311859349235e-06, + "loss": 0.3849, + "step": 3550 + }, + { + "epoch": 0.9100204498977505, + "grad_norm": 7.716251850128174, + "learning_rate": 9.796801585402913e-06, + "loss": 0.4594, + "step": 3560 + }, + { + "epoch": 0.9125766871165644, + "grad_norm": 10.922795295715332, + "learning_rate": 9.79427603478773e-06, + "loss": 0.4632, + "step": 3570 + }, + { + "epoch": 0.9151329243353783, + "grad_norm": 8.93303108215332, + "learning_rate": 9.791735215546953e-06, + "loss": 0.453, + "step": 3580 + }, + { + "epoch": 0.9176891615541922, + "grad_norm": 6.447891712188721, + "learning_rate": 9.78917913577249e-06, + "loss": 0.3284, + "step": 3590 + }, + { + "epoch": 0.9202453987730062, + "grad_norm": 8.590970993041992, + "learning_rate": 9.786607803604844e-06, + "loss": 0.4445, + "step": 3600 + }, + { + "epoch": 0.9228016359918201, + "grad_norm": 9.189178466796875, + "learning_rate": 9.784021227233097e-06, + "loss": 0.5768, + "step": 3610 + }, + { + "epoch": 0.9253578732106339, + "grad_norm": 8.67251968383789, + "learning_rate": 9.781419414894877e-06, + "loss": 0.4507, + "step": 3620 + }, + { + "epoch": 0.9279141104294478, + "grad_norm": 10.756339073181152, + "learning_rate": 9.778802374876332e-06, + "loss": 0.4278, + "step": 3630 + }, + { + "epoch": 0.9304703476482618, + "grad_norm": 9.680365562438965, + "learning_rate": 9.776170115512115e-06, + "loss": 0.3831, + "step": 3640 + }, + { + "epoch": 0.9330265848670757, + "grad_norm": 16.632375717163086, + "learning_rate": 9.773522645185342e-06, + "loss": 0.5033, + "step": 3650 + }, + { + "epoch": 0.9355828220858896, + "grad_norm": 7.6330695152282715, + "learning_rate": 9.770859972327575e-06, + "loss": 0.3978, + "step": 3660 + }, + { + "epoch": 0.9381390593047034, + "grad_norm": 8.260819435119629, + "learning_rate": 9.768182105418791e-06, + "loss": 0.5457, + "step": 3670 + }, + { + "epoch": 0.9406952965235174, + "grad_norm": 18.994287490844727, + "learning_rate": 9.765489052987357e-06, + "loss": 0.5469, + "step": 3680 + }, + { + "epoch": 0.9432515337423313, + "grad_norm": 8.636393547058105, + "learning_rate": 9.762780823610006e-06, + "loss": 0.4657, + "step": 3690 + }, + { + "epoch": 0.9458077709611452, + "grad_norm": 16.197158813476562, + "learning_rate": 9.760057425911797e-06, + "loss": 0.3715, + "step": 3700 + }, + { + "epoch": 0.9483640081799591, + "grad_norm": 28.646278381347656, + "learning_rate": 9.757318868566107e-06, + "loss": 0.3147, + "step": 3710 + }, + { + "epoch": 0.950920245398773, + "grad_norm": 9.230977058410645, + "learning_rate": 9.754565160294587e-06, + "loss": 0.6337, + "step": 3720 + }, + { + "epoch": 0.9534764826175869, + "grad_norm": 17.38115882873535, + "learning_rate": 9.751796309867139e-06, + "loss": 0.4393, + "step": 3730 + }, + { + "epoch": 0.9560327198364008, + "grad_norm": 15.209970474243164, + "learning_rate": 9.749012326101891e-06, + "loss": 0.4759, + "step": 3740 + }, + { + "epoch": 0.9585889570552147, + "grad_norm": 15.37113094329834, + "learning_rate": 9.74621321786517e-06, + "loss": 0.493, + "step": 3750 + }, + { + "epoch": 0.9611451942740287, + "grad_norm": 9.076826095581055, + "learning_rate": 9.743398994071467e-06, + "loss": 0.2903, + "step": 3760 + }, + { + "epoch": 0.9637014314928425, + "grad_norm": 6.899563312530518, + "learning_rate": 9.740569663683413e-06, + "loss": 0.3847, + "step": 3770 + }, + { + "epoch": 0.9662576687116564, + "grad_norm": 14.622838973999023, + "learning_rate": 9.73772523571175e-06, + "loss": 0.3528, + "step": 3780 + }, + { + "epoch": 0.9688139059304703, + "grad_norm": 11.762303352355957, + "learning_rate": 9.734865719215303e-06, + "loss": 0.4437, + "step": 3790 + }, + { + "epoch": 0.9713701431492843, + "grad_norm": 11.108593940734863, + "learning_rate": 9.73199112330095e-06, + "loss": 0.2947, + "step": 3800 + }, + { + "epoch": 0.9739263803680982, + "grad_norm": 7.895074367523193, + "learning_rate": 9.729101457123593e-06, + "loss": 0.4659, + "step": 3810 + }, + { + "epoch": 0.976482617586912, + "grad_norm": 10.534423828125, + "learning_rate": 9.72619672988613e-06, + "loss": 0.5034, + "step": 3820 + }, + { + "epoch": 0.9790388548057259, + "grad_norm": 6.145469665527344, + "learning_rate": 9.723276950839425e-06, + "loss": 0.4708, + "step": 3830 + }, + { + "epoch": 0.9815950920245399, + "grad_norm": 5.333863258361816, + "learning_rate": 9.720342129282277e-06, + "loss": 0.5987, + "step": 3840 + }, + { + "epoch": 0.9841513292433538, + "grad_norm": 11.559300422668457, + "learning_rate": 9.717392274561392e-06, + "loss": 0.5316, + "step": 3850 + }, + { + "epoch": 0.9867075664621677, + "grad_norm": 7.202635288238525, + "learning_rate": 9.714427396071354e-06, + "loss": 0.3995, + "step": 3860 + }, + { + "epoch": 0.9892638036809815, + "grad_norm": 9.292013168334961, + "learning_rate": 9.711447503254595e-06, + "loss": 0.5362, + "step": 3870 + }, + { + "epoch": 0.9918200408997955, + "grad_norm": 15.875975608825684, + "learning_rate": 9.708452605601361e-06, + "loss": 0.3956, + "step": 3880 + }, + { + "epoch": 0.9943762781186094, + "grad_norm": 5.166224002838135, + "learning_rate": 9.705442712649688e-06, + "loss": 0.4298, + "step": 3890 + }, + { + "epoch": 0.9969325153374233, + "grad_norm": 28.647296905517578, + "learning_rate": 9.702417833985367e-06, + "loss": 0.5758, + "step": 3900 + }, + { + "epoch": 0.9994887525562373, + "grad_norm": 7.455996990203857, + "learning_rate": 9.699377979241915e-06, + "loss": 0.4445, + "step": 3910 + }, + { + "epoch": 1.0020449897750512, + "grad_norm": 8.313132286071777, + "learning_rate": 9.696323158100543e-06, + "loss": 0.3661, + "step": 3920 + }, + { + "epoch": 1.0046012269938651, + "grad_norm": 2.6401190757751465, + "learning_rate": 9.69325338029013e-06, + "loss": 0.4446, + "step": 3930 + }, + { + "epoch": 1.007157464212679, + "grad_norm": 8.16818904876709, + "learning_rate": 9.690168655587184e-06, + "loss": 0.298, + "step": 3940 + }, + { + "epoch": 1.0097137014314927, + "grad_norm": 9.28429889678955, + "learning_rate": 9.687068993815819e-06, + "loss": 0.2262, + "step": 3950 + }, + { + "epoch": 1.0122699386503067, + "grad_norm": 6.392743110656738, + "learning_rate": 9.683954404847715e-06, + "loss": 0.2432, + "step": 3960 + }, + { + "epoch": 1.0148261758691206, + "grad_norm": 6.890766620635986, + "learning_rate": 9.6808248986021e-06, + "loss": 0.4461, + "step": 3970 + }, + { + "epoch": 1.0173824130879345, + "grad_norm": 10.436578750610352, + "learning_rate": 9.6776804850457e-06, + "loss": 0.3529, + "step": 3980 + }, + { + "epoch": 1.0199386503067485, + "grad_norm": 7.264800071716309, + "learning_rate": 9.674521174192726e-06, + "loss": 0.2966, + "step": 3990 + }, + { + "epoch": 1.0224948875255624, + "grad_norm": 10.522168159484863, + "learning_rate": 9.671346976104828e-06, + "loss": 0.2953, + "step": 4000 + }, + { + "epoch": 1.0250511247443763, + "grad_norm": 5.65585994720459, + "learning_rate": 9.668157900891069e-06, + "loss": 0.3308, + "step": 4010 + }, + { + "epoch": 1.0276073619631902, + "grad_norm": 9.439372062683105, + "learning_rate": 9.664953958707892e-06, + "loss": 0.2545, + "step": 4020 + }, + { + "epoch": 1.0301635991820042, + "grad_norm": 3.5625405311584473, + "learning_rate": 9.661735159759093e-06, + "loss": 0.2846, + "step": 4030 + }, + { + "epoch": 1.032719836400818, + "grad_norm": 26.94212532043457, + "learning_rate": 9.658501514295775e-06, + "loss": 0.205, + "step": 4040 + }, + { + "epoch": 1.0352760736196318, + "grad_norm": 11.873112678527832, + "learning_rate": 9.655253032616327e-06, + "loss": 0.3401, + "step": 4050 + }, + { + "epoch": 1.0378323108384457, + "grad_norm": 7.584825038909912, + "learning_rate": 9.651989725066393e-06, + "loss": 0.2991, + "step": 4060 + }, + { + "epoch": 1.0403885480572597, + "grad_norm": 7.558630466461182, + "learning_rate": 9.648711602038823e-06, + "loss": 0.3096, + "step": 4070 + }, + { + "epoch": 1.0429447852760736, + "grad_norm": 24.522443771362305, + "learning_rate": 9.64541867397366e-06, + "loss": 0.4115, + "step": 4080 + }, + { + "epoch": 1.0455010224948875, + "grad_norm": 5.4436354637146, + "learning_rate": 9.642110951358097e-06, + "loss": 0.2687, + "step": 4090 + }, + { + "epoch": 1.0480572597137015, + "grad_norm": 9.708597183227539, + "learning_rate": 9.638788444726437e-06, + "loss": 0.2038, + "step": 4100 + }, + { + "epoch": 1.0506134969325154, + "grad_norm": 5.303321361541748, + "learning_rate": 9.635451164660073e-06, + "loss": 0.3039, + "step": 4110 + }, + { + "epoch": 1.0531697341513293, + "grad_norm": 7.557952404022217, + "learning_rate": 9.632099121787445e-06, + "loss": 0.3325, + "step": 4120 + }, + { + "epoch": 1.0557259713701432, + "grad_norm": 5.638031005859375, + "learning_rate": 9.628732326784014e-06, + "loss": 0.3189, + "step": 4130 + }, + { + "epoch": 1.058282208588957, + "grad_norm": 1.7007097005844116, + "learning_rate": 9.625350790372214e-06, + "loss": 0.3178, + "step": 4140 + }, + { + "epoch": 1.0608384458077709, + "grad_norm": 8.193168640136719, + "learning_rate": 9.621954523321434e-06, + "loss": 0.307, + "step": 4150 + }, + { + "epoch": 1.0633946830265848, + "grad_norm": 15.883909225463867, + "learning_rate": 9.618543536447974e-06, + "loss": 0.2642, + "step": 4160 + }, + { + "epoch": 1.0659509202453987, + "grad_norm": 13.922346115112305, + "learning_rate": 9.615117840615011e-06, + "loss": 0.3466, + "step": 4170 + }, + { + "epoch": 1.0685071574642127, + "grad_norm": 21.666532516479492, + "learning_rate": 9.611677446732576e-06, + "loss": 0.2475, + "step": 4180 + }, + { + "epoch": 1.0710633946830266, + "grad_norm": 14.09211540222168, + "learning_rate": 9.608222365757498e-06, + "loss": 0.2698, + "step": 4190 + }, + { + "epoch": 1.0736196319018405, + "grad_norm": 9.652295112609863, + "learning_rate": 9.604752608693384e-06, + "loss": 0.2477, + "step": 4200 + }, + { + "epoch": 1.0761758691206544, + "grad_norm": 5.439416408538818, + "learning_rate": 9.601268186590587e-06, + "loss": 0.2024, + "step": 4210 + }, + { + "epoch": 1.0787321063394684, + "grad_norm": 3.458691358566284, + "learning_rate": 9.597769110546158e-06, + "loss": 0.2974, + "step": 4220 + }, + { + "epoch": 1.0812883435582823, + "grad_norm": 8.662911415100098, + "learning_rate": 9.594255391703821e-06, + "loss": 0.2053, + "step": 4230 + }, + { + "epoch": 1.0838445807770962, + "grad_norm": 9.305736541748047, + "learning_rate": 9.59072704125393e-06, + "loss": 0.3785, + "step": 4240 + }, + { + "epoch": 1.08640081799591, + "grad_norm": 8.057384490966797, + "learning_rate": 9.587184070433442e-06, + "loss": 0.239, + "step": 4250 + }, + { + "epoch": 1.0889570552147239, + "grad_norm": 11.628586769104004, + "learning_rate": 9.583626490525872e-06, + "loss": 0.3451, + "step": 4260 + }, + { + "epoch": 1.0915132924335378, + "grad_norm": 5.124874591827393, + "learning_rate": 9.580054312861264e-06, + "loss": 0.3267, + "step": 4270 + }, + { + "epoch": 1.0940695296523517, + "grad_norm": 8.520767211914062, + "learning_rate": 9.576467548816154e-06, + "loss": 0.2843, + "step": 4280 + }, + { + "epoch": 1.0966257668711656, + "grad_norm": 13.09350872039795, + "learning_rate": 9.572866209813525e-06, + "loss": 0.2522, + "step": 4290 + }, + { + "epoch": 1.0991820040899796, + "grad_norm": 6.647915840148926, + "learning_rate": 9.569250307322788e-06, + "loss": 0.3104, + "step": 4300 + }, + { + "epoch": 1.1017382413087935, + "grad_norm": 10.310320854187012, + "learning_rate": 9.565619852859727e-06, + "loss": 0.2137, + "step": 4310 + }, + { + "epoch": 1.1042944785276074, + "grad_norm": 6.362160682678223, + "learning_rate": 9.561974857986472e-06, + "loss": 0.1824, + "step": 4320 + }, + { + "epoch": 1.1068507157464214, + "grad_norm": 16.666887283325195, + "learning_rate": 9.558315334311467e-06, + "loss": 0.3631, + "step": 4330 + }, + { + "epoch": 1.109406952965235, + "grad_norm": 2.7935502529144287, + "learning_rate": 9.554641293489419e-06, + "loss": 0.2915, + "step": 4340 + }, + { + "epoch": 1.111963190184049, + "grad_norm": 15.494998931884766, + "learning_rate": 9.55095274722127e-06, + "loss": 0.2922, + "step": 4350 + }, + { + "epoch": 1.114519427402863, + "grad_norm": 6.94740629196167, + "learning_rate": 9.547249707254166e-06, + "loss": 0.264, + "step": 4360 + }, + { + "epoch": 1.1170756646216768, + "grad_norm": 7.18923807144165, + "learning_rate": 9.543532185381397e-06, + "loss": 0.3097, + "step": 4370 + }, + { + "epoch": 1.1196319018404908, + "grad_norm": 10.083481788635254, + "learning_rate": 9.53980019344239e-06, + "loss": 0.2706, + "step": 4380 + }, + { + "epoch": 1.1221881390593047, + "grad_norm": 7.783493995666504, + "learning_rate": 9.53605374332265e-06, + "loss": 0.1824, + "step": 4390 + }, + { + "epoch": 1.1247443762781186, + "grad_norm": 10.747809410095215, + "learning_rate": 9.532292846953723e-06, + "loss": 0.3375, + "step": 4400 + }, + { + "epoch": 1.1273006134969326, + "grad_norm": 11.694700241088867, + "learning_rate": 9.528517516313167e-06, + "loss": 0.2018, + "step": 4410 + }, + { + "epoch": 1.1298568507157465, + "grad_norm": 6.256073474884033, + "learning_rate": 9.524727763424513e-06, + "loss": 0.1545, + "step": 4420 + }, + { + "epoch": 1.1324130879345604, + "grad_norm": 6.233736991882324, + "learning_rate": 9.520923600357217e-06, + "loss": 0.2827, + "step": 4430 + }, + { + "epoch": 1.1349693251533743, + "grad_norm": 8.213584899902344, + "learning_rate": 9.517105039226632e-06, + "loss": 0.315, + "step": 4440 + }, + { + "epoch": 1.137525562372188, + "grad_norm": 12.951038360595703, + "learning_rate": 9.513272092193965e-06, + "loss": 0.2061, + "step": 4450 + }, + { + "epoch": 1.140081799591002, + "grad_norm": 5.706482410430908, + "learning_rate": 9.509424771466236e-06, + "loss": 0.2526, + "step": 4460 + }, + { + "epoch": 1.142638036809816, + "grad_norm": 6.124299049377441, + "learning_rate": 9.505563089296246e-06, + "loss": 0.3302, + "step": 4470 + }, + { + "epoch": 1.1451942740286298, + "grad_norm": 11.08293342590332, + "learning_rate": 9.501687057982531e-06, + "loss": 0.2411, + "step": 4480 + }, + { + "epoch": 1.1477505112474438, + "grad_norm": 8.393287658691406, + "learning_rate": 9.497796689869324e-06, + "loss": 0.3682, + "step": 4490 + }, + { + "epoch": 1.1503067484662577, + "grad_norm": 0.49787667393684387, + "learning_rate": 9.493891997346522e-06, + "loss": 0.176, + "step": 4500 + }, + { + "epoch": 1.1528629856850716, + "grad_norm": 6.434317588806152, + "learning_rate": 9.489972992849641e-06, + "loss": 0.2696, + "step": 4510 + }, + { + "epoch": 1.1554192229038855, + "grad_norm": 8.729398727416992, + "learning_rate": 9.486039688859772e-06, + "loss": 0.2838, + "step": 4520 + }, + { + "epoch": 1.1579754601226995, + "grad_norm": 9.446803092956543, + "learning_rate": 9.482092097903551e-06, + "loss": 0.3253, + "step": 4530 + }, + { + "epoch": 1.1605316973415132, + "grad_norm": 6.4901957511901855, + "learning_rate": 9.478130232553111e-06, + "loss": 0.3429, + "step": 4540 + }, + { + "epoch": 1.163087934560327, + "grad_norm": 9.026398658752441, + "learning_rate": 9.474154105426055e-06, + "loss": 0.3302, + "step": 4550 + }, + { + "epoch": 1.165644171779141, + "grad_norm": 6.108066082000732, + "learning_rate": 9.470163729185392e-06, + "loss": 0.1702, + "step": 4560 + }, + { + "epoch": 1.168200408997955, + "grad_norm": 10.425956726074219, + "learning_rate": 9.466159116539523e-06, + "loss": 0.3008, + "step": 4570 + }, + { + "epoch": 1.170756646216769, + "grad_norm": 4.817817211151123, + "learning_rate": 9.462140280242182e-06, + "loss": 0.3151, + "step": 4580 + }, + { + "epoch": 1.1733128834355828, + "grad_norm": 17.882158279418945, + "learning_rate": 9.458107233092406e-06, + "loss": 0.23, + "step": 4590 + }, + { + "epoch": 1.1758691206543967, + "grad_norm": 5.028483867645264, + "learning_rate": 9.454059987934487e-06, + "loss": 0.2413, + "step": 4600 + }, + { + "epoch": 1.1784253578732107, + "grad_norm": 9.872651100158691, + "learning_rate": 9.449998557657936e-06, + "loss": 0.1329, + "step": 4610 + }, + { + "epoch": 1.1809815950920246, + "grad_norm": 5.998063087463379, + "learning_rate": 9.445922955197437e-06, + "loss": 0.2879, + "step": 4620 + }, + { + "epoch": 1.1835378323108385, + "grad_norm": 8.390649795532227, + "learning_rate": 9.441833193532817e-06, + "loss": 0.2824, + "step": 4630 + }, + { + "epoch": 1.1860940695296525, + "grad_norm": 6.652390003204346, + "learning_rate": 9.437729285688986e-06, + "loss": 0.3389, + "step": 4640 + }, + { + "epoch": 1.1886503067484662, + "grad_norm": 10.573369026184082, + "learning_rate": 9.433611244735914e-06, + "loss": 0.3841, + "step": 4650 + }, + { + "epoch": 1.19120654396728, + "grad_norm": 10.0396146774292, + "learning_rate": 9.429479083788578e-06, + "loss": 0.2638, + "step": 4660 + }, + { + "epoch": 1.193762781186094, + "grad_norm": 11.902812004089355, + "learning_rate": 9.425332816006927e-06, + "loss": 0.4186, + "step": 4670 + }, + { + "epoch": 1.196319018404908, + "grad_norm": 10.162897109985352, + "learning_rate": 9.421172454595834e-06, + "loss": 0.3057, + "step": 4680 + }, + { + "epoch": 1.1988752556237219, + "grad_norm": 11.278912544250488, + "learning_rate": 9.416998012805057e-06, + "loss": 0.3223, + "step": 4690 + }, + { + "epoch": 1.2014314928425358, + "grad_norm": 8.295330047607422, + "learning_rate": 9.412809503929198e-06, + "loss": 0.2588, + "step": 4700 + }, + { + "epoch": 1.2039877300613497, + "grad_norm": 7.55431604385376, + "learning_rate": 9.408606941307658e-06, + "loss": 0.3087, + "step": 4710 + }, + { + "epoch": 1.2065439672801637, + "grad_norm": 3.9323720932006836, + "learning_rate": 9.404390338324599e-06, + "loss": 0.3091, + "step": 4720 + }, + { + "epoch": 1.2091002044989776, + "grad_norm": 7.560153007507324, + "learning_rate": 9.400159708408892e-06, + "loss": 0.2096, + "step": 4730 + }, + { + "epoch": 1.2116564417177913, + "grad_norm": 9.517462730407715, + "learning_rate": 9.395915065034085e-06, + "loss": 0.1582, + "step": 4740 + }, + { + "epoch": 1.2142126789366052, + "grad_norm": 5.7381720542907715, + "learning_rate": 9.391656421718356e-06, + "loss": 0.1742, + "step": 4750 + }, + { + "epoch": 1.2167689161554192, + "grad_norm": 7.014863014221191, + "learning_rate": 9.387383792024469e-06, + "loss": 0.2988, + "step": 4760 + }, + { + "epoch": 1.219325153374233, + "grad_norm": 12.077631950378418, + "learning_rate": 9.383097189559728e-06, + "loss": 0.254, + "step": 4770 + }, + { + "epoch": 1.221881390593047, + "grad_norm": 8.781020164489746, + "learning_rate": 9.37879662797594e-06, + "loss": 0.2946, + "step": 4780 + }, + { + "epoch": 1.224437627811861, + "grad_norm": 9.89029312133789, + "learning_rate": 9.37448212096937e-06, + "loss": 0.2043, + "step": 4790 + }, + { + "epoch": 1.2269938650306749, + "grad_norm": 7.694300174713135, + "learning_rate": 9.370153682280692e-06, + "loss": 0.138, + "step": 4800 + }, + { + "epoch": 1.2295501022494888, + "grad_norm": 8.310929298400879, + "learning_rate": 9.365811325694949e-06, + "loss": 0.2311, + "step": 4810 + }, + { + "epoch": 1.2321063394683027, + "grad_norm": 12.575085639953613, + "learning_rate": 9.361455065041514e-06, + "loss": 0.2834, + "step": 4820 + }, + { + "epoch": 1.2346625766871167, + "grad_norm": 10.732074737548828, + "learning_rate": 9.357084914194036e-06, + "loss": 0.2134, + "step": 4830 + }, + { + "epoch": 1.2372188139059306, + "grad_norm": 10.34244441986084, + "learning_rate": 9.352700887070403e-06, + "loss": 0.3486, + "step": 4840 + }, + { + "epoch": 1.2397750511247443, + "grad_norm": 10.497349739074707, + "learning_rate": 9.348302997632699e-06, + "loss": 0.3058, + "step": 4850 + }, + { + "epoch": 1.2423312883435582, + "grad_norm": 14.589156150817871, + "learning_rate": 9.343891259887148e-06, + "loss": 0.2331, + "step": 4860 + }, + { + "epoch": 1.2448875255623721, + "grad_norm": 5.382908344268799, + "learning_rate": 9.339465687884086e-06, + "loss": 0.3091, + "step": 4870 + }, + { + "epoch": 1.247443762781186, + "grad_norm": 16.56047821044922, + "learning_rate": 9.335026295717902e-06, + "loss": 0.2812, + "step": 4880 + }, + { + "epoch": 1.25, + "grad_norm": 5.166291236877441, + "learning_rate": 9.330573097527002e-06, + "loss": 0.2357, + "step": 4890 + }, + { + "epoch": 1.252556237218814, + "grad_norm": 6.794707775115967, + "learning_rate": 9.326106107493762e-06, + "loss": 0.2503, + "step": 4900 + }, + { + "epoch": 1.2551124744376279, + "grad_norm": 6.429582118988037, + "learning_rate": 9.321625339844476e-06, + "loss": 0.1967, + "step": 4910 + }, + { + "epoch": 1.2576687116564418, + "grad_norm": 21.49854278564453, + "learning_rate": 9.317130808849322e-06, + "loss": 0.3339, + "step": 4920 + }, + { + "epoch": 1.2602249488752557, + "grad_norm": 6.054262161254883, + "learning_rate": 9.312622528822308e-06, + "loss": 0.1903, + "step": 4930 + }, + { + "epoch": 1.2627811860940694, + "grad_norm": 13.686524391174316, + "learning_rate": 9.308100514121233e-06, + "loss": 0.1497, + "step": 4940 + }, + { + "epoch": 1.2653374233128836, + "grad_norm": 18.514162063598633, + "learning_rate": 9.303564779147634e-06, + "loss": 0.2372, + "step": 4950 + }, + { + "epoch": 1.2678936605316973, + "grad_norm": 6.550439357757568, + "learning_rate": 9.299015338346745e-06, + "loss": 0.2101, + "step": 4960 + }, + { + "epoch": 1.2704498977505112, + "grad_norm": 9.836435317993164, + "learning_rate": 9.294452206207448e-06, + "loss": 0.1643, + "step": 4970 + }, + { + "epoch": 1.2730061349693251, + "grad_norm": 7.0567307472229, + "learning_rate": 9.289875397262234e-06, + "loss": 0.1969, + "step": 4980 + }, + { + "epoch": 1.275562372188139, + "grad_norm": 8.437677383422852, + "learning_rate": 9.285284926087144e-06, + "loss": 0.3502, + "step": 4990 + }, + { + "epoch": 1.278118609406953, + "grad_norm": 7.982880592346191, + "learning_rate": 9.280680807301735e-06, + "loss": 0.1473, + "step": 5000 + }, + { + "epoch": 1.280674846625767, + "grad_norm": 6.814586162567139, + "learning_rate": 9.276063055569029e-06, + "loss": 0.2684, + "step": 5010 + }, + { + "epoch": 1.2832310838445808, + "grad_norm": 5.944293022155762, + "learning_rate": 9.271431685595461e-06, + "loss": 0.1763, + "step": 5020 + }, + { + "epoch": 1.2857873210633946, + "grad_norm": 5.889406204223633, + "learning_rate": 9.266786712130842e-06, + "loss": 0.1852, + "step": 5030 + }, + { + "epoch": 1.2883435582822087, + "grad_norm": 5.56532096862793, + "learning_rate": 9.262128149968304e-06, + "loss": 0.3474, + "step": 5040 + }, + { + "epoch": 1.2908997955010224, + "grad_norm": 6.4994049072265625, + "learning_rate": 9.257456013944255e-06, + "loss": 0.1804, + "step": 5050 + }, + { + "epoch": 1.2934560327198363, + "grad_norm": 6.235182285308838, + "learning_rate": 9.252770318938334e-06, + "loss": 0.2414, + "step": 5060 + }, + { + "epoch": 1.2960122699386503, + "grad_norm": 5.915652275085449, + "learning_rate": 9.248071079873362e-06, + "loss": 0.2333, + "step": 5070 + }, + { + "epoch": 1.2985685071574642, + "grad_norm": 9.032744407653809, + "learning_rate": 9.243358311715298e-06, + "loss": 0.2185, + "step": 5080 + }, + { + "epoch": 1.3011247443762781, + "grad_norm": 7.362344264984131, + "learning_rate": 9.238632029473178e-06, + "loss": 0.2571, + "step": 5090 + }, + { + "epoch": 1.303680981595092, + "grad_norm": 9.257672309875488, + "learning_rate": 9.23389224819909e-06, + "loss": 0.2363, + "step": 5100 + }, + { + "epoch": 1.306237218813906, + "grad_norm": 8.25611400604248, + "learning_rate": 9.229138982988102e-06, + "loss": 0.1432, + "step": 5110 + }, + { + "epoch": 1.30879345603272, + "grad_norm": 9.176118850708008, + "learning_rate": 9.224372248978231e-06, + "loss": 0.2158, + "step": 5120 + }, + { + "epoch": 1.3113496932515338, + "grad_norm": 3.796792984008789, + "learning_rate": 9.21959206135039e-06, + "loss": 0.1544, + "step": 5130 + }, + { + "epoch": 1.3139059304703475, + "grad_norm": 6.011196613311768, + "learning_rate": 9.214798435328334e-06, + "loss": 0.3326, + "step": 5140 + }, + { + "epoch": 1.3164621676891617, + "grad_norm": 16.793350219726562, + "learning_rate": 9.209991386178621e-06, + "loss": 0.2056, + "step": 5150 + }, + { + "epoch": 1.3190184049079754, + "grad_norm": 7.064115047454834, + "learning_rate": 9.205170929210552e-06, + "loss": 0.3113, + "step": 5160 + }, + { + "epoch": 1.3215746421267893, + "grad_norm": 19.5340518951416, + "learning_rate": 9.200337079776136e-06, + "loss": 0.1886, + "step": 5170 + }, + { + "epoch": 1.3241308793456033, + "grad_norm": 12.674887657165527, + "learning_rate": 9.195489853270029e-06, + "loss": 0.4599, + "step": 5180 + }, + { + "epoch": 1.3266871165644172, + "grad_norm": 13.094590187072754, + "learning_rate": 9.190629265129492e-06, + "loss": 0.2936, + "step": 5190 + }, + { + "epoch": 1.329243353783231, + "grad_norm": 9.762693405151367, + "learning_rate": 9.185755330834338e-06, + "loss": 0.2078, + "step": 5200 + }, + { + "epoch": 1.331799591002045, + "grad_norm": 7.909463405609131, + "learning_rate": 9.180868065906884e-06, + "loss": 0.2288, + "step": 5210 + }, + { + "epoch": 1.334355828220859, + "grad_norm": 7.411076545715332, + "learning_rate": 9.175967485911907e-06, + "loss": 0.2717, + "step": 5220 + }, + { + "epoch": 1.3369120654396727, + "grad_norm": 6.424882411956787, + "learning_rate": 9.171053606456582e-06, + "loss": 0.1745, + "step": 5230 + }, + { + "epoch": 1.3394683026584868, + "grad_norm": 6.506113052368164, + "learning_rate": 9.166126443190443e-06, + "loss": 0.1601, + "step": 5240 + }, + { + "epoch": 1.3420245398773005, + "grad_norm": 9.06916332244873, + "learning_rate": 9.161186011805332e-06, + "loss": 0.3146, + "step": 5250 + }, + { + "epoch": 1.3445807770961145, + "grad_norm": 10.523892402648926, + "learning_rate": 9.156232328035342e-06, + "loss": 0.2956, + "step": 5260 + }, + { + "epoch": 1.3471370143149284, + "grad_norm": 8.017621994018555, + "learning_rate": 9.151265407656775e-06, + "loss": 0.2294, + "step": 5270 + }, + { + "epoch": 1.3496932515337423, + "grad_norm": 14.679991722106934, + "learning_rate": 9.146285266488088e-06, + "loss": 0.2024, + "step": 5280 + }, + { + "epoch": 1.3522494887525562, + "grad_norm": 0.9324799180030823, + "learning_rate": 9.141291920389843e-06, + "loss": 0.1614, + "step": 5290 + }, + { + "epoch": 1.3548057259713702, + "grad_norm": 5.870517253875732, + "learning_rate": 9.136285385264655e-06, + "loss": 0.2225, + "step": 5300 + }, + { + "epoch": 1.357361963190184, + "grad_norm": 11.407279014587402, + "learning_rate": 9.131265677057146e-06, + "loss": 0.1872, + "step": 5310 + }, + { + "epoch": 1.359918200408998, + "grad_norm": 17.659618377685547, + "learning_rate": 9.12623281175389e-06, + "loss": 0.2171, + "step": 5320 + }, + { + "epoch": 1.362474437627812, + "grad_norm": 12.906618118286133, + "learning_rate": 9.121186805383358e-06, + "loss": 0.2759, + "step": 5330 + }, + { + "epoch": 1.3650306748466257, + "grad_norm": 6.954870223999023, + "learning_rate": 9.11612767401588e-06, + "loss": 0.2188, + "step": 5340 + }, + { + "epoch": 1.3675869120654398, + "grad_norm": 4.730753421783447, + "learning_rate": 9.111055433763582e-06, + "loss": 0.2126, + "step": 5350 + }, + { + "epoch": 1.3701431492842535, + "grad_norm": 13.265816688537598, + "learning_rate": 9.105970100780341e-06, + "loss": 0.2904, + "step": 5360 + }, + { + "epoch": 1.3726993865030674, + "grad_norm": 3.0092155933380127, + "learning_rate": 9.100871691261728e-06, + "loss": 0.1578, + "step": 5370 + }, + { + "epoch": 1.3752556237218814, + "grad_norm": 6.426031112670898, + "learning_rate": 9.09576022144496e-06, + "loss": 0.2037, + "step": 5380 + }, + { + "epoch": 1.3778118609406953, + "grad_norm": 8.25606918334961, + "learning_rate": 9.09063570760885e-06, + "loss": 0.1798, + "step": 5390 + }, + { + "epoch": 1.3803680981595092, + "grad_norm": 20.269100189208984, + "learning_rate": 9.085498166073755e-06, + "loss": 0.3306, + "step": 5400 + }, + { + "epoch": 1.3829243353783232, + "grad_norm": 7.950530529022217, + "learning_rate": 9.080347613201513e-06, + "loss": 0.2489, + "step": 5410 + }, + { + "epoch": 1.385480572597137, + "grad_norm": 11.141780853271484, + "learning_rate": 9.075184065395413e-06, + "loss": 0.2043, + "step": 5420 + }, + { + "epoch": 1.3880368098159508, + "grad_norm": 4.896001815795898, + "learning_rate": 9.070007539100118e-06, + "loss": 0.3356, + "step": 5430 + }, + { + "epoch": 1.390593047034765, + "grad_norm": 11.557963371276855, + "learning_rate": 9.064818050801634e-06, + "loss": 0.1741, + "step": 5440 + }, + { + "epoch": 1.3931492842535786, + "grad_norm": 6.800997734069824, + "learning_rate": 9.05961561702724e-06, + "loss": 0.1887, + "step": 5450 + }, + { + "epoch": 1.3957055214723926, + "grad_norm": 6.017879009246826, + "learning_rate": 9.054400254345448e-06, + "loss": 0.2398, + "step": 5460 + }, + { + "epoch": 1.3982617586912065, + "grad_norm": 6.6386189460754395, + "learning_rate": 9.049171979365945e-06, + "loss": 0.1465, + "step": 5470 + }, + { + "epoch": 1.4008179959100204, + "grad_norm": 4.621875762939453, + "learning_rate": 9.043930808739537e-06, + "loss": 0.335, + "step": 5480 + }, + { + "epoch": 1.4033742331288344, + "grad_norm": 6.274672508239746, + "learning_rate": 9.038676759158105e-06, + "loss": 0.1384, + "step": 5490 + }, + { + "epoch": 1.4059304703476483, + "grad_norm": 2.794377565383911, + "learning_rate": 9.033409847354542e-06, + "loss": 0.2304, + "step": 5500 + }, + { + "epoch": 1.4084867075664622, + "grad_norm": 10.634669303894043, + "learning_rate": 9.028130090102706e-06, + "loss": 0.3528, + "step": 5510 + }, + { + "epoch": 1.4110429447852761, + "grad_norm": 6.818256855010986, + "learning_rate": 9.022837504217366e-06, + "loss": 0.1227, + "step": 5520 + }, + { + "epoch": 1.41359918200409, + "grad_norm": 8.108813285827637, + "learning_rate": 9.017532106554143e-06, + "loss": 0.2864, + "step": 5530 + }, + { + "epoch": 1.4161554192229038, + "grad_norm": 8.222419738769531, + "learning_rate": 9.012213914009464e-06, + "loss": 0.251, + "step": 5540 + }, + { + "epoch": 1.418711656441718, + "grad_norm": 9.900671005249023, + "learning_rate": 9.006882943520506e-06, + "loss": 0.2974, + "step": 5550 + }, + { + "epoch": 1.4212678936605316, + "grad_norm": 4.816144943237305, + "learning_rate": 9.001539212065136e-06, + "loss": 0.2626, + "step": 5560 + }, + { + "epoch": 1.4238241308793456, + "grad_norm": 3.0924923419952393, + "learning_rate": 8.996182736661863e-06, + "loss": 0.1263, + "step": 5570 + }, + { + "epoch": 1.4263803680981595, + "grad_norm": 5.688522815704346, + "learning_rate": 8.990813534369787e-06, + "loss": 0.2336, + "step": 5580 + }, + { + "epoch": 1.4289366053169734, + "grad_norm": 10.940909385681152, + "learning_rate": 8.985431622288533e-06, + "loss": 0.2868, + "step": 5590 + }, + { + "epoch": 1.4314928425357873, + "grad_norm": 13.232209205627441, + "learning_rate": 8.98003701755821e-06, + "loss": 0.2469, + "step": 5600 + }, + { + "epoch": 1.4340490797546013, + "grad_norm": 7.461823463439941, + "learning_rate": 8.974629737359348e-06, + "loss": 0.2405, + "step": 5610 + }, + { + "epoch": 1.4366053169734152, + "grad_norm": 3.547605037689209, + "learning_rate": 8.96920979891284e-06, + "loss": 0.1996, + "step": 5620 + }, + { + "epoch": 1.439161554192229, + "grad_norm": 6.454622745513916, + "learning_rate": 8.963777219479902e-06, + "loss": 0.2072, + "step": 5630 + }, + { + "epoch": 1.441717791411043, + "grad_norm": 6.902385711669922, + "learning_rate": 8.958332016362e-06, + "loss": 0.0997, + "step": 5640 + }, + { + "epoch": 1.4442740286298568, + "grad_norm": 7.078310489654541, + "learning_rate": 8.952874206900809e-06, + "loss": 0.1943, + "step": 5650 + }, + { + "epoch": 1.4468302658486707, + "grad_norm": 5.974771976470947, + "learning_rate": 8.94740380847815e-06, + "loss": 0.3048, + "step": 5660 + }, + { + "epoch": 1.4493865030674846, + "grad_norm": 7.63726806640625, + "learning_rate": 8.941920838515936e-06, + "loss": 0.1593, + "step": 5670 + }, + { + "epoch": 1.4519427402862985, + "grad_norm": 5.1760430335998535, + "learning_rate": 8.936425314476121e-06, + "loss": 0.1877, + "step": 5680 + }, + { + "epoch": 1.4544989775051125, + "grad_norm": 8.131750106811523, + "learning_rate": 8.930917253860637e-06, + "loss": 0.2409, + "step": 5690 + }, + { + "epoch": 1.4570552147239264, + "grad_norm": 6.002188205718994, + "learning_rate": 8.925396674211341e-06, + "loss": 0.2159, + "step": 5700 + }, + { + "epoch": 1.4596114519427403, + "grad_norm": 12.237569808959961, + "learning_rate": 8.919863593109967e-06, + "loss": 0.2005, + "step": 5710 + }, + { + "epoch": 1.4621676891615543, + "grad_norm": 14.401376724243164, + "learning_rate": 8.914318028178055e-06, + "loss": 0.3153, + "step": 5720 + }, + { + "epoch": 1.4647239263803682, + "grad_norm": 5.81574821472168, + "learning_rate": 8.908759997076909e-06, + "loss": 0.1836, + "step": 5730 + }, + { + "epoch": 1.467280163599182, + "grad_norm": 6.657829761505127, + "learning_rate": 8.903189517507527e-06, + "loss": 0.2741, + "step": 5740 + }, + { + "epoch": 1.469836400817996, + "grad_norm": 4.597752094268799, + "learning_rate": 8.897606607210563e-06, + "loss": 0.1928, + "step": 5750 + }, + { + "epoch": 1.4723926380368098, + "grad_norm": 7.948934555053711, + "learning_rate": 8.892011283966253e-06, + "loss": 0.1889, + "step": 5760 + }, + { + "epoch": 1.4749488752556237, + "grad_norm": 14.392995834350586, + "learning_rate": 8.886403565594367e-06, + "loss": 0.2368, + "step": 5770 + }, + { + "epoch": 1.4775051124744376, + "grad_norm": 7.179086685180664, + "learning_rate": 8.88078346995415e-06, + "loss": 0.2, + "step": 5780 + }, + { + "epoch": 1.4800613496932515, + "grad_norm": 7.146066665649414, + "learning_rate": 8.875151014944267e-06, + "loss": 0.1678, + "step": 5790 + }, + { + "epoch": 1.4826175869120655, + "grad_norm": 9.944082260131836, + "learning_rate": 8.869506218502742e-06, + "loss": 0.1642, + "step": 5800 + }, + { + "epoch": 1.4851738241308794, + "grad_norm": 12.250117301940918, + "learning_rate": 8.863849098606907e-06, + "loss": 0.2266, + "step": 5810 + }, + { + "epoch": 1.4877300613496933, + "grad_norm": 29.39047622680664, + "learning_rate": 8.858179673273337e-06, + "loss": 0.2813, + "step": 5820 + }, + { + "epoch": 1.490286298568507, + "grad_norm": 11.951471328735352, + "learning_rate": 8.852497960557804e-06, + "loss": 0.3751, + "step": 5830 + }, + { + "epoch": 1.4928425357873212, + "grad_norm": 1.6928082704544067, + "learning_rate": 8.846803978555203e-06, + "loss": 0.1711, + "step": 5840 + }, + { + "epoch": 1.4953987730061349, + "grad_norm": 8.914717674255371, + "learning_rate": 8.84109774539951e-06, + "loss": 0.2084, + "step": 5850 + }, + { + "epoch": 1.4979550102249488, + "grad_norm": 9.57482624053955, + "learning_rate": 8.835379279263718e-06, + "loss": 0.2722, + "step": 5860 + }, + { + "epoch": 1.5005112474437627, + "grad_norm": 11.420355796813965, + "learning_rate": 8.829648598359775e-06, + "loss": 0.1593, + "step": 5870 + }, + { + "epoch": 1.5030674846625767, + "grad_norm": 4.315236568450928, + "learning_rate": 8.823905720938534e-06, + "loss": 0.1693, + "step": 5880 + }, + { + "epoch": 1.5056237218813906, + "grad_norm": 4.3361945152282715, + "learning_rate": 8.81815066528969e-06, + "loss": 0.164, + "step": 5890 + }, + { + "epoch": 1.5081799591002045, + "grad_norm": 9.296090126037598, + "learning_rate": 8.812383449741724e-06, + "loss": 0.1611, + "step": 5900 + }, + { + "epoch": 1.5107361963190185, + "grad_norm": 16.11349105834961, + "learning_rate": 8.806604092661839e-06, + "loss": 0.1636, + "step": 5910 + }, + { + "epoch": 1.5132924335378322, + "grad_norm": 12.905272483825684, + "learning_rate": 8.800812612455909e-06, + "loss": 0.1995, + "step": 5920 + }, + { + "epoch": 1.5158486707566463, + "grad_norm": 4.522705554962158, + "learning_rate": 8.79500902756842e-06, + "loss": 0.115, + "step": 5930 + }, + { + "epoch": 1.51840490797546, + "grad_norm": 8.156167984008789, + "learning_rate": 8.789193356482401e-06, + "loss": 0.1444, + "step": 5940 + }, + { + "epoch": 1.5209611451942742, + "grad_norm": 6.0793328285217285, + "learning_rate": 8.783365617719382e-06, + "loss": 0.1781, + "step": 5950 + }, + { + "epoch": 1.5235173824130879, + "grad_norm": 8.255613327026367, + "learning_rate": 8.777525829839317e-06, + "loss": 0.2307, + "step": 5960 + }, + { + "epoch": 1.5260736196319018, + "grad_norm": 13.122941017150879, + "learning_rate": 8.77167401144054e-06, + "loss": 0.1803, + "step": 5970 + }, + { + "epoch": 1.5286298568507157, + "grad_norm": 4.706987380981445, + "learning_rate": 8.765810181159696e-06, + "loss": 0.1343, + "step": 5980 + }, + { + "epoch": 1.5311860940695297, + "grad_norm": 4.327836990356445, + "learning_rate": 8.759934357671685e-06, + "loss": 0.2642, + "step": 5990 + }, + { + "epoch": 1.5337423312883436, + "grad_norm": 5.442415714263916, + "learning_rate": 8.754046559689602e-06, + "loss": 0.2007, + "step": 6000 + }, + { + "epoch": 1.5362985685071575, + "grad_norm": 12.884740829467773, + "learning_rate": 8.748146805964683e-06, + "loss": 0.2029, + "step": 6010 + }, + { + "epoch": 1.5388548057259714, + "grad_norm": 7.4214582443237305, + "learning_rate": 8.742235115286232e-06, + "loss": 0.131, + "step": 6020 + }, + { + "epoch": 1.5414110429447851, + "grad_norm": 5.057283878326416, + "learning_rate": 8.736311506481579e-06, + "loss": 0.2342, + "step": 6030 + }, + { + "epoch": 1.5439672801635993, + "grad_norm": 11.823676109313965, + "learning_rate": 8.730375998416e-06, + "loss": 0.145, + "step": 6040 + }, + { + "epoch": 1.546523517382413, + "grad_norm": 8.330456733703613, + "learning_rate": 8.724428609992675e-06, + "loss": 0.1139, + "step": 6050 + }, + { + "epoch": 1.5490797546012272, + "grad_norm": 11.217977523803711, + "learning_rate": 8.718469360152617e-06, + "loss": 0.2302, + "step": 6060 + }, + { + "epoch": 1.5516359918200409, + "grad_norm": 7.306154251098633, + "learning_rate": 8.712498267874615e-06, + "loss": 0.1695, + "step": 6070 + }, + { + "epoch": 1.5541922290388548, + "grad_norm": 5.975497722625732, + "learning_rate": 8.706515352175173e-06, + "loss": 0.2389, + "step": 6080 + }, + { + "epoch": 1.5567484662576687, + "grad_norm": 7.292505264282227, + "learning_rate": 8.700520632108453e-06, + "loss": 0.305, + "step": 6090 + }, + { + "epoch": 1.5593047034764826, + "grad_norm": 12.038248062133789, + "learning_rate": 8.694514126766205e-06, + "loss": 0.1872, + "step": 6100 + }, + { + "epoch": 1.5618609406952966, + "grad_norm": 5.702522277832031, + "learning_rate": 8.688495855277718e-06, + "loss": 0.1847, + "step": 6110 + }, + { + "epoch": 1.5644171779141103, + "grad_norm": 6.972240447998047, + "learning_rate": 8.68246583680975e-06, + "loss": 0.177, + "step": 6120 + }, + { + "epoch": 1.5669734151329244, + "grad_norm": 5.465381145477295, + "learning_rate": 8.676424090566473e-06, + "loss": 0.2276, + "step": 6130 + }, + { + "epoch": 1.5695296523517381, + "grad_norm": 3.666998863220215, + "learning_rate": 8.670370635789407e-06, + "loss": 0.2746, + "step": 6140 + }, + { + "epoch": 1.5720858895705523, + "grad_norm": 1.9799798727035522, + "learning_rate": 8.66430549175736e-06, + "loss": 0.1176, + "step": 6150 + }, + { + "epoch": 1.574642126789366, + "grad_norm": 5.453342437744141, + "learning_rate": 8.65822867778637e-06, + "loss": 0.2283, + "step": 6160 + }, + { + "epoch": 1.57719836400818, + "grad_norm": 5.7280683517456055, + "learning_rate": 8.652140213229642e-06, + "loss": 0.1838, + "step": 6170 + }, + { + "epoch": 1.5797546012269938, + "grad_norm": 5.071581840515137, + "learning_rate": 8.64604011747748e-06, + "loss": 0.179, + "step": 6180 + }, + { + "epoch": 1.5823108384458078, + "grad_norm": 1.5993189811706543, + "learning_rate": 8.639928409957236e-06, + "loss": 0.222, + "step": 6190 + }, + { + "epoch": 1.5848670756646217, + "grad_norm": 5.141691207885742, + "learning_rate": 8.63380511013324e-06, + "loss": 0.2307, + "step": 6200 + }, + { + "epoch": 1.5874233128834356, + "grad_norm": 8.022561073303223, + "learning_rate": 8.627670237506742e-06, + "loss": 0.2617, + "step": 6210 + }, + { + "epoch": 1.5899795501022496, + "grad_norm": 7.5429301261901855, + "learning_rate": 8.621523811615848e-06, + "loss": 0.1311, + "step": 6220 + }, + { + "epoch": 1.5925357873210633, + "grad_norm": 6.324619293212891, + "learning_rate": 8.615365852035456e-06, + "loss": 0.2665, + "step": 6230 + }, + { + "epoch": 1.5950920245398774, + "grad_norm": 5.001183032989502, + "learning_rate": 8.609196378377203e-06, + "loss": 0.205, + "step": 6240 + }, + { + "epoch": 1.5976482617586911, + "grad_norm": 7.617444038391113, + "learning_rate": 8.603015410289387e-06, + "loss": 0.4019, + "step": 6250 + }, + { + "epoch": 1.6002044989775053, + "grad_norm": 4.471902847290039, + "learning_rate": 8.596822967456915e-06, + "loss": 0.1962, + "step": 6260 + }, + { + "epoch": 1.602760736196319, + "grad_norm": 6.265940189361572, + "learning_rate": 8.590619069601247e-06, + "loss": 0.139, + "step": 6270 + }, + { + "epoch": 1.605316973415133, + "grad_norm": 6.503332614898682, + "learning_rate": 8.584403736480313e-06, + "loss": 0.1892, + "step": 6280 + }, + { + "epoch": 1.6078732106339468, + "grad_norm": 4.576842784881592, + "learning_rate": 8.57817698788847e-06, + "loss": 0.1271, + "step": 6290 + }, + { + "epoch": 1.6104294478527608, + "grad_norm": 7.434634685516357, + "learning_rate": 8.571938843656422e-06, + "loss": 0.2066, + "step": 6300 + }, + { + "epoch": 1.6129856850715747, + "grad_norm": 8.325051307678223, + "learning_rate": 8.565689323651174e-06, + "loss": 0.1975, + "step": 6310 + }, + { + "epoch": 1.6155419222903884, + "grad_norm": 7.133656978607178, + "learning_rate": 8.559428447775956e-06, + "loss": 0.1116, + "step": 6320 + }, + { + "epoch": 1.6180981595092025, + "grad_norm": 7.880911827087402, + "learning_rate": 8.553156235970163e-06, + "loss": 0.1743, + "step": 6330 + }, + { + "epoch": 1.6206543967280163, + "grad_norm": 20.269716262817383, + "learning_rate": 8.546872708209297e-06, + "loss": 0.1994, + "step": 6340 + }, + { + "epoch": 1.6232106339468304, + "grad_norm": 8.107951164245605, + "learning_rate": 8.54057788450489e-06, + "loss": 0.1642, + "step": 6350 + }, + { + "epoch": 1.6257668711656441, + "grad_norm": 5.440578937530518, + "learning_rate": 8.534271784904457e-06, + "loss": 0.1593, + "step": 6360 + }, + { + "epoch": 1.628323108384458, + "grad_norm": 3.178661584854126, + "learning_rate": 8.527954429491422e-06, + "loss": 0.2159, + "step": 6370 + }, + { + "epoch": 1.630879345603272, + "grad_norm": 5.0311055183410645, + "learning_rate": 8.521625838385052e-06, + "loss": 0.2587, + "step": 6380 + }, + { + "epoch": 1.633435582822086, + "grad_norm": 1.3832993507385254, + "learning_rate": 8.515286031740403e-06, + "loss": 0.1799, + "step": 6390 + }, + { + "epoch": 1.6359918200408998, + "grad_norm": 8.102804183959961, + "learning_rate": 8.508935029748244e-06, + "loss": 0.1516, + "step": 6400 + }, + { + "epoch": 1.6385480572597138, + "grad_norm": 6.02394437789917, + "learning_rate": 8.502572852635005e-06, + "loss": 0.179, + "step": 6410 + }, + { + "epoch": 1.6411042944785277, + "grad_norm": 6.3991312980651855, + "learning_rate": 8.4961995206627e-06, + "loss": 0.2349, + "step": 6420 + }, + { + "epoch": 1.6436605316973414, + "grad_norm": 5.750975608825684, + "learning_rate": 8.489815054128874e-06, + "loss": 0.1607, + "step": 6430 + }, + { + "epoch": 1.6462167689161555, + "grad_norm": 4.242618560791016, + "learning_rate": 8.483419473366525e-06, + "loss": 0.1986, + "step": 6440 + }, + { + "epoch": 1.6487730061349692, + "grad_norm": 9.25927734375, + "learning_rate": 8.477012798744056e-06, + "loss": 0.1515, + "step": 6450 + }, + { + "epoch": 1.6513292433537834, + "grad_norm": 0.4773276150226593, + "learning_rate": 8.470595050665196e-06, + "loss": 0.1506, + "step": 6460 + }, + { + "epoch": 1.653885480572597, + "grad_norm": 9.461527824401855, + "learning_rate": 8.464166249568944e-06, + "loss": 0.2223, + "step": 6470 + }, + { + "epoch": 1.656441717791411, + "grad_norm": 4.911471843719482, + "learning_rate": 8.457726415929494e-06, + "loss": 0.1179, + "step": 6480 + }, + { + "epoch": 1.658997955010225, + "grad_norm": 5.247636318206787, + "learning_rate": 8.451275570256183e-06, + "loss": 0.1667, + "step": 6490 + }, + { + "epoch": 1.6615541922290389, + "grad_norm": 7.205673694610596, + "learning_rate": 8.444813733093416e-06, + "loss": 0.184, + "step": 6500 + }, + { + "epoch": 1.6641104294478528, + "grad_norm": 12.158601760864258, + "learning_rate": 8.4383409250206e-06, + "loss": 0.1431, + "step": 6510 + }, + { + "epoch": 1.6666666666666665, + "grad_norm": 7.19647741317749, + "learning_rate": 8.43185716665209e-06, + "loss": 0.1936, + "step": 6520 + }, + { + "epoch": 1.6692229038854807, + "grad_norm": 7.732553958892822, + "learning_rate": 8.425362478637105e-06, + "loss": 0.1933, + "step": 6530 + }, + { + "epoch": 1.6717791411042944, + "grad_norm": 8.475358009338379, + "learning_rate": 8.418856881659677e-06, + "loss": 0.2284, + "step": 6540 + }, + { + "epoch": 1.6743353783231085, + "grad_norm": 11.112258911132812, + "learning_rate": 8.412340396438587e-06, + "loss": 0.1528, + "step": 6550 + }, + { + "epoch": 1.6768916155419222, + "grad_norm": 11.443809509277344, + "learning_rate": 8.405813043727279e-06, + "loss": 0.1782, + "step": 6560 + }, + { + "epoch": 1.6794478527607362, + "grad_norm": 0.7984766960144043, + "learning_rate": 8.399274844313816e-06, + "loss": 0.1205, + "step": 6570 + }, + { + "epoch": 1.68200408997955, + "grad_norm": 0.6593146324157715, + "learning_rate": 8.392725819020806e-06, + "loss": 0.0928, + "step": 6580 + }, + { + "epoch": 1.684560327198364, + "grad_norm": 7.761658668518066, + "learning_rate": 8.38616598870533e-06, + "loss": 0.1637, + "step": 6590 + }, + { + "epoch": 1.687116564417178, + "grad_norm": 6.802185535430908, + "learning_rate": 8.379595374258883e-06, + "loss": 0.3094, + "step": 6600 + }, + { + "epoch": 1.6896728016359919, + "grad_norm": 7.621953964233398, + "learning_rate": 8.373013996607309e-06, + "loss": 0.1235, + "step": 6610 + }, + { + "epoch": 1.6922290388548058, + "grad_norm": 5.766721248626709, + "learning_rate": 8.36642187671072e-06, + "loss": 0.1979, + "step": 6620 + }, + { + "epoch": 1.6947852760736195, + "grad_norm": 7.573540687561035, + "learning_rate": 8.359819035563447e-06, + "loss": 0.1544, + "step": 6630 + }, + { + "epoch": 1.6973415132924337, + "grad_norm": 7.856776237487793, + "learning_rate": 8.353205494193965e-06, + "loss": 0.2178, + "step": 6640 + }, + { + "epoch": 1.6998977505112474, + "grad_norm": 6.826193332672119, + "learning_rate": 8.346581273664826e-06, + "loss": 0.1453, + "step": 6650 + }, + { + "epoch": 1.7024539877300615, + "grad_norm": 3.6651082038879395, + "learning_rate": 8.339946395072593e-06, + "loss": 0.1316, + "step": 6660 + }, + { + "epoch": 1.7050102249488752, + "grad_norm": 13.016592025756836, + "learning_rate": 8.33330087954777e-06, + "loss": 0.2319, + "step": 6670 + }, + { + "epoch": 1.7075664621676891, + "grad_norm": 2.1794581413269043, + "learning_rate": 8.32664474825474e-06, + "loss": 0.096, + "step": 6680 + }, + { + "epoch": 1.710122699386503, + "grad_norm": 6.232535362243652, + "learning_rate": 8.319978022391692e-06, + "loss": 0.1157, + "step": 6690 + }, + { + "epoch": 1.712678936605317, + "grad_norm": 11.268756866455078, + "learning_rate": 8.313300723190561e-06, + "loss": 0.1155, + "step": 6700 + }, + { + "epoch": 1.715235173824131, + "grad_norm": 7.64271879196167, + "learning_rate": 8.306612871916946e-06, + "loss": 0.1295, + "step": 6710 + }, + { + "epoch": 1.7177914110429446, + "grad_norm": 3.3692967891693115, + "learning_rate": 8.299914489870065e-06, + "loss": 0.1837, + "step": 6720 + }, + { + "epoch": 1.7203476482617588, + "grad_norm": 3.621946096420288, + "learning_rate": 8.293205598382662e-06, + "loss": 0.116, + "step": 6730 + }, + { + "epoch": 1.7229038854805725, + "grad_norm": 7.414484024047852, + "learning_rate": 8.28648621882096e-06, + "loss": 0.2422, + "step": 6740 + }, + { + "epoch": 1.7254601226993866, + "grad_norm": 8.968006134033203, + "learning_rate": 8.279756372584575e-06, + "loss": 0.1423, + "step": 6750 + }, + { + "epoch": 1.7280163599182004, + "grad_norm": 5.072629451751709, + "learning_rate": 8.273016081106468e-06, + "loss": 0.1433, + "step": 6760 + }, + { + "epoch": 1.7305725971370143, + "grad_norm": 8.455986976623535, + "learning_rate": 8.266265365852854e-06, + "loss": 0.2221, + "step": 6770 + }, + { + "epoch": 1.7331288343558282, + "grad_norm": 7.337911128997803, + "learning_rate": 8.259504248323155e-06, + "loss": 0.0976, + "step": 6780 + }, + { + "epoch": 1.7356850715746421, + "grad_norm": 7.0469207763671875, + "learning_rate": 8.252732750049918e-06, + "loss": 0.1134, + "step": 6790 + }, + { + "epoch": 1.738241308793456, + "grad_norm": 6.939335823059082, + "learning_rate": 8.245950892598746e-06, + "loss": 0.1975, + "step": 6800 + }, + { + "epoch": 1.74079754601227, + "grad_norm": 3.959833860397339, + "learning_rate": 8.23915869756824e-06, + "loss": 0.1229, + "step": 6810 + }, + { + "epoch": 1.743353783231084, + "grad_norm": 9.389518737792969, + "learning_rate": 8.23235618658992e-06, + "loss": 0.1164, + "step": 6820 + }, + { + "epoch": 1.7459100204498976, + "grad_norm": 3.3109939098358154, + "learning_rate": 8.225543381328162e-06, + "loss": 0.1659, + "step": 6830 + }, + { + "epoch": 1.7484662576687118, + "grad_norm": 4.770479202270508, + "learning_rate": 8.218720303480124e-06, + "loss": 0.1385, + "step": 6840 + }, + { + "epoch": 1.7510224948875255, + "grad_norm": 3.3656115531921387, + "learning_rate": 8.211886974775682e-06, + "loss": 0.2088, + "step": 6850 + }, + { + "epoch": 1.7535787321063396, + "grad_norm": 5.787675857543945, + "learning_rate": 8.205043416977358e-06, + "loss": 0.0627, + "step": 6860 + }, + { + "epoch": 1.7561349693251533, + "grad_norm": 5.655759334564209, + "learning_rate": 8.198189651880253e-06, + "loss": 0.1626, + "step": 6870 + }, + { + "epoch": 1.7586912065439673, + "grad_norm": 5.212615966796875, + "learning_rate": 8.191325701311971e-06, + "loss": 0.1073, + "step": 6880 + }, + { + "epoch": 1.7612474437627812, + "grad_norm": 5.487759113311768, + "learning_rate": 8.18445158713256e-06, + "loss": 0.1968, + "step": 6890 + }, + { + "epoch": 1.7638036809815951, + "grad_norm": 13.81961727142334, + "learning_rate": 8.17756733123443e-06, + "loss": 0.1275, + "step": 6900 + }, + { + "epoch": 1.766359918200409, + "grad_norm": 5.11100959777832, + "learning_rate": 8.170672955542299e-06, + "loss": 0.183, + "step": 6910 + }, + { + "epoch": 1.7689161554192228, + "grad_norm": 1.606713056564331, + "learning_rate": 8.163768482013106e-06, + "loss": 0.0828, + "step": 6920 + }, + { + "epoch": 1.771472392638037, + "grad_norm": 5.141575813293457, + "learning_rate": 8.156853932635955e-06, + "loss": 0.1193, + "step": 6930 + }, + { + "epoch": 1.7740286298568506, + "grad_norm": 11.083499908447266, + "learning_rate": 8.149929329432032e-06, + "loss": 0.2004, + "step": 6940 + }, + { + "epoch": 1.7765848670756648, + "grad_norm": 10.328533172607422, + "learning_rate": 8.14299469445455e-06, + "loss": 0.0874, + "step": 6950 + }, + { + "epoch": 1.7791411042944785, + "grad_norm": 6.226305961608887, + "learning_rate": 8.136050049788666e-06, + "loss": 0.103, + "step": 6960 + }, + { + "epoch": 1.7816973415132924, + "grad_norm": 8.67745590209961, + "learning_rate": 8.129095417551416e-06, + "loss": 0.1642, + "step": 6970 + }, + { + "epoch": 1.7842535787321063, + "grad_norm": 9.080946922302246, + "learning_rate": 8.122130819891645e-06, + "loss": 0.14, + "step": 6980 + }, + { + "epoch": 1.7868098159509203, + "grad_norm": 4.160292625427246, + "learning_rate": 8.115156278989938e-06, + "loss": 0.0769, + "step": 6990 + }, + { + "epoch": 1.7893660531697342, + "grad_norm": 4.340435028076172, + "learning_rate": 8.10817181705854e-06, + "loss": 0.0904, + "step": 7000 + }, + { + "epoch": 1.7919222903885481, + "grad_norm": 5.093479156494141, + "learning_rate": 8.101177456341301e-06, + "loss": 0.1122, + "step": 7010 + }, + { + "epoch": 1.794478527607362, + "grad_norm": 7.038718223571777, + "learning_rate": 8.094173219113589e-06, + "loss": 0.1572, + "step": 7020 + }, + { + "epoch": 1.7970347648261757, + "grad_norm": 4.94278621673584, + "learning_rate": 8.087159127682227e-06, + "loss": 0.1477, + "step": 7030 + }, + { + "epoch": 1.79959100204499, + "grad_norm": 1.7163784503936768, + "learning_rate": 8.080135204385425e-06, + "loss": 0.2002, + "step": 7040 + }, + { + "epoch": 1.8021472392638036, + "grad_norm": 8.449196815490723, + "learning_rate": 8.073101471592702e-06, + "loss": 0.2222, + "step": 7050 + }, + { + "epoch": 1.8047034764826178, + "grad_norm": 6.09740686416626, + "learning_rate": 8.066057951704821e-06, + "loss": 0.14, + "step": 7060 + }, + { + "epoch": 1.8072597137014315, + "grad_norm": 13.180371284484863, + "learning_rate": 8.059004667153713e-06, + "loss": 0.0977, + "step": 7070 + }, + { + "epoch": 1.8098159509202454, + "grad_norm": 7.9253058433532715, + "learning_rate": 8.051941640402406e-06, + "loss": 0.1332, + "step": 7080 + }, + { + "epoch": 1.8123721881390593, + "grad_norm": 8.333995819091797, + "learning_rate": 8.044868893944955e-06, + "loss": 0.1297, + "step": 7090 + }, + { + "epoch": 1.8149284253578732, + "grad_norm": 8.638833045959473, + "learning_rate": 8.03778645030637e-06, + "loss": 0.101, + "step": 7100 + }, + { + "epoch": 1.8174846625766872, + "grad_norm": 6.839685916900635, + "learning_rate": 8.030694332042548e-06, + "loss": 0.0693, + "step": 7110 + }, + { + "epoch": 1.8200408997955009, + "grad_norm": 7.357212066650391, + "learning_rate": 8.02359256174019e-06, + "loss": 0.066, + "step": 7120 + }, + { + "epoch": 1.822597137014315, + "grad_norm": 4.24409294128418, + "learning_rate": 8.01648116201674e-06, + "loss": 0.1837, + "step": 7130 + }, + { + "epoch": 1.8251533742331287, + "grad_norm": 8.311896324157715, + "learning_rate": 8.009360155520313e-06, + "loss": 0.1389, + "step": 7140 + }, + { + "epoch": 1.8277096114519429, + "grad_norm": 12.251752853393555, + "learning_rate": 8.002229564929616e-06, + "loss": 0.111, + "step": 7150 + }, + { + "epoch": 1.8302658486707566, + "grad_norm": 5.574610233306885, + "learning_rate": 7.995089412953875e-06, + "loss": 0.1158, + "step": 7160 + }, + { + "epoch": 1.8328220858895705, + "grad_norm": 8.057143211364746, + "learning_rate": 7.987939722332776e-06, + "loss": 0.094, + "step": 7170 + }, + { + "epoch": 1.8353783231083844, + "grad_norm": 11.000237464904785, + "learning_rate": 7.980780515836377e-06, + "loss": 0.1, + "step": 7180 + }, + { + "epoch": 1.8379345603271984, + "grad_norm": 5.534488201141357, + "learning_rate": 7.97361181626504e-06, + "loss": 0.2236, + "step": 7190 + }, + { + "epoch": 1.8404907975460123, + "grad_norm": 6.447413444519043, + "learning_rate": 7.966433646449364e-06, + "loss": 0.2489, + "step": 7200 + }, + { + "epoch": 1.8430470347648262, + "grad_norm": 2.375591516494751, + "learning_rate": 7.959246029250112e-06, + "loss": 0.0896, + "step": 7210 + }, + { + "epoch": 1.8456032719836402, + "grad_norm": 7.849663734436035, + "learning_rate": 7.952048987558126e-06, + "loss": 0.2143, + "step": 7220 + }, + { + "epoch": 1.8481595092024539, + "grad_norm": 9.33170223236084, + "learning_rate": 7.944842544294268e-06, + "loss": 0.1366, + "step": 7230 + }, + { + "epoch": 1.850715746421268, + "grad_norm": 7.391844749450684, + "learning_rate": 7.937626722409342e-06, + "loss": 0.1979, + "step": 7240 + }, + { + "epoch": 1.8532719836400817, + "grad_norm": 0.42054474353790283, + "learning_rate": 7.930401544884017e-06, + "loss": 0.0991, + "step": 7250 + }, + { + "epoch": 1.8558282208588959, + "grad_norm": 6.135138511657715, + "learning_rate": 7.923167034728763e-06, + "loss": 0.0628, + "step": 7260 + }, + { + "epoch": 1.8583844580777096, + "grad_norm": 9.923365592956543, + "learning_rate": 7.915923214983767e-06, + "loss": 0.1159, + "step": 7270 + }, + { + "epoch": 1.8609406952965235, + "grad_norm": 7.890591144561768, + "learning_rate": 7.908670108718868e-06, + "loss": 0.1056, + "step": 7280 + }, + { + "epoch": 1.8634969325153374, + "grad_norm": 1.3051115274429321, + "learning_rate": 7.90140773903348e-06, + "loss": 0.1308, + "step": 7290 + }, + { + "epoch": 1.8660531697341514, + "grad_norm": 7.580386161804199, + "learning_rate": 7.894136129056516e-06, + "loss": 0.1585, + "step": 7300 + }, + { + "epoch": 1.8686094069529653, + "grad_norm": 4.543681621551514, + "learning_rate": 7.886855301946322e-06, + "loss": 0.0982, + "step": 7310 + }, + { + "epoch": 1.871165644171779, + "grad_norm": 8.670321464538574, + "learning_rate": 7.879565280890593e-06, + "loss": 0.1984, + "step": 7320 + }, + { + "epoch": 1.8737218813905931, + "grad_norm": 10.790763854980469, + "learning_rate": 7.872266089106309e-06, + "loss": 0.0939, + "step": 7330 + }, + { + "epoch": 1.8762781186094069, + "grad_norm": 5.62462854385376, + "learning_rate": 7.864957749839653e-06, + "loss": 0.125, + "step": 7340 + }, + { + "epoch": 1.878834355828221, + "grad_norm": 5.0767717361450195, + "learning_rate": 7.857640286365946e-06, + "loss": 0.1439, + "step": 7350 + }, + { + "epoch": 1.8813905930470347, + "grad_norm": 4.852501392364502, + "learning_rate": 7.850313721989558e-06, + "loss": 0.2335, + "step": 7360 + }, + { + "epoch": 1.8839468302658486, + "grad_norm": 8.364299774169922, + "learning_rate": 7.842978080043855e-06, + "loss": 0.138, + "step": 7370 + }, + { + "epoch": 1.8865030674846626, + "grad_norm": 16.219741821289062, + "learning_rate": 7.835633383891102e-06, + "loss": 0.1868, + "step": 7380 + }, + { + "epoch": 1.8890593047034765, + "grad_norm": 6.5828375816345215, + "learning_rate": 7.828279656922408e-06, + "loss": 0.1366, + "step": 7390 + }, + { + "epoch": 1.8916155419222904, + "grad_norm": 11.096482276916504, + "learning_rate": 7.820916922557636e-06, + "loss": 0.0636, + "step": 7400 + }, + { + "epoch": 1.8941717791411041, + "grad_norm": 7.485594749450684, + "learning_rate": 7.813545204245341e-06, + "loss": 0.2255, + "step": 7410 + }, + { + "epoch": 1.8967280163599183, + "grad_norm": 8.4011869430542, + "learning_rate": 7.806164525462687e-06, + "loss": 0.1484, + "step": 7420 + }, + { + "epoch": 1.899284253578732, + "grad_norm": 7.423107624053955, + "learning_rate": 7.798774909715374e-06, + "loss": 0.1592, + "step": 7430 + }, + { + "epoch": 1.9018404907975461, + "grad_norm": 5.52902364730835, + "learning_rate": 7.791376380537567e-06, + "loss": 0.0735, + "step": 7440 + }, + { + "epoch": 1.9043967280163598, + "grad_norm": 3.4649152755737305, + "learning_rate": 7.783968961491818e-06, + "loss": 0.1479, + "step": 7450 + }, + { + "epoch": 1.9069529652351738, + "grad_norm": 7.678995132446289, + "learning_rate": 7.776552676168987e-06, + "loss": 0.2274, + "step": 7460 + }, + { + "epoch": 1.9095092024539877, + "grad_norm": 0.06970912218093872, + "learning_rate": 7.769127548188174e-06, + "loss": 0.1003, + "step": 7470 + }, + { + "epoch": 1.9120654396728016, + "grad_norm": 17.272754669189453, + "learning_rate": 7.761693601196642e-06, + "loss": 0.0924, + "step": 7480 + }, + { + "epoch": 1.9146216768916156, + "grad_norm": 5.135831832885742, + "learning_rate": 7.75425085886974e-06, + "loss": 0.1176, + "step": 7490 + }, + { + "epoch": 1.9171779141104295, + "grad_norm": 5.651144981384277, + "learning_rate": 7.746799344910822e-06, + "loss": 0.1398, + "step": 7500 + }, + { + "epoch": 1.9197341513292434, + "grad_norm": 6.184920787811279, + "learning_rate": 7.739339083051186e-06, + "loss": 0.1766, + "step": 7510 + }, + { + "epoch": 1.9222903885480571, + "grad_norm": 6.632026672363281, + "learning_rate": 7.73187009704999e-06, + "loss": 0.1427, + "step": 7520 + }, + { + "epoch": 1.9248466257668713, + "grad_norm": 8.19317626953125, + "learning_rate": 7.724392410694167e-06, + "loss": 0.1126, + "step": 7530 + }, + { + "epoch": 1.927402862985685, + "grad_norm": 1.4213460683822632, + "learning_rate": 7.716906047798364e-06, + "loss": 0.1248, + "step": 7540 + }, + { + "epoch": 1.9299591002044991, + "grad_norm": 8.21669864654541, + "learning_rate": 7.709411032204868e-06, + "loss": 0.1148, + "step": 7550 + }, + { + "epoch": 1.9325153374233128, + "grad_norm": 6.994448661804199, + "learning_rate": 7.701907387783509e-06, + "loss": 0.1548, + "step": 7560 + }, + { + "epoch": 1.9350715746421268, + "grad_norm": 7.743505954742432, + "learning_rate": 7.694395138431608e-06, + "loss": 0.1274, + "step": 7570 + }, + { + "epoch": 1.9376278118609407, + "grad_norm": 5.287206172943115, + "learning_rate": 7.686874308073885e-06, + "loss": 0.0779, + "step": 7580 + }, + { + "epoch": 1.9401840490797546, + "grad_norm": 15.280442237854004, + "learning_rate": 7.679344920662394e-06, + "loss": 0.0718, + "step": 7590 + }, + { + "epoch": 1.9427402862985685, + "grad_norm": 9.01176929473877, + "learning_rate": 7.671807000176434e-06, + "loss": 0.2102, + "step": 7600 + }, + { + "epoch": 1.9452965235173822, + "grad_norm": 4.649341583251953, + "learning_rate": 7.664260570622487e-06, + "loss": 0.1391, + "step": 7610 + }, + { + "epoch": 1.9478527607361964, + "grad_norm": 9.841882705688477, + "learning_rate": 7.656705656034132e-06, + "loss": 0.1092, + "step": 7620 + }, + { + "epoch": 1.95040899795501, + "grad_norm": 4.586430072784424, + "learning_rate": 7.649142280471964e-06, + "loss": 0.1478, + "step": 7630 + }, + { + "epoch": 1.9529652351738243, + "grad_norm": 3.7405037879943848, + "learning_rate": 7.641570468023536e-06, + "loss": 0.157, + "step": 7640 + }, + { + "epoch": 1.955521472392638, + "grad_norm": 4.911379337310791, + "learning_rate": 7.633990242803263e-06, + "loss": 0.0739, + "step": 7650 + }, + { + "epoch": 1.9580777096114519, + "grad_norm": 14.748944282531738, + "learning_rate": 7.626401628952352e-06, + "loss": 0.1426, + "step": 7660 + }, + { + "epoch": 1.9606339468302658, + "grad_norm": 7.726930141448975, + "learning_rate": 7.61880465063873e-06, + "loss": 0.1107, + "step": 7670 + }, + { + "epoch": 1.9631901840490797, + "grad_norm": 7.120655059814453, + "learning_rate": 7.61119933205696e-06, + "loss": 0.044, + "step": 7680 + }, + { + "epoch": 1.9657464212678937, + "grad_norm": 1.3341773748397827, + "learning_rate": 7.603585697428169e-06, + "loss": 0.0847, + "step": 7690 + }, + { + "epoch": 1.9683026584867076, + "grad_norm": 5.155945777893066, + "learning_rate": 7.595963770999966e-06, + "loss": 0.2069, + "step": 7700 + }, + { + "epoch": 1.9708588957055215, + "grad_norm": 6.961178302764893, + "learning_rate": 7.588333577046368e-06, + "loss": 0.1673, + "step": 7710 + }, + { + "epoch": 1.9734151329243352, + "grad_norm": 5.766995906829834, + "learning_rate": 7.5806951398677255e-06, + "loss": 0.1469, + "step": 7720 + }, + { + "epoch": 1.9759713701431494, + "grad_norm": 11.985013961791992, + "learning_rate": 7.573048483790635e-06, + "loss": 0.1621, + "step": 7730 + }, + { + "epoch": 1.978527607361963, + "grad_norm": 0.5396488308906555, + "learning_rate": 7.565393633167876e-06, + "loss": 0.0574, + "step": 7740 + }, + { + "epoch": 1.9810838445807772, + "grad_norm": 11.865923881530762, + "learning_rate": 7.557730612378318e-06, + "loss": 0.1207, + "step": 7750 + }, + { + "epoch": 1.983640081799591, + "grad_norm": 4.756693363189697, + "learning_rate": 7.5500594458268576e-06, + "loss": 0.1147, + "step": 7760 + }, + { + "epoch": 1.9861963190184049, + "grad_norm": 4.860601425170898, + "learning_rate": 7.542380157944328e-06, + "loss": 0.0956, + "step": 7770 + }, + { + "epoch": 1.9887525562372188, + "grad_norm": 14.664186477661133, + "learning_rate": 7.534692773187431e-06, + "loss": 0.1399, + "step": 7780 + }, + { + "epoch": 1.9913087934560327, + "grad_norm": 4.663970470428467, + "learning_rate": 7.526997316038654e-06, + "loss": 0.0859, + "step": 7790 + }, + { + "epoch": 1.9938650306748467, + "grad_norm": 0.8506277203559875, + "learning_rate": 7.519293811006187e-06, + "loss": 0.136, + "step": 7800 + }, + { + "epoch": 1.9964212678936604, + "grad_norm": 5.4818644523620605, + "learning_rate": 7.511582282623865e-06, + "loss": 0.0835, + "step": 7810 + }, + { + "epoch": 1.9989775051124745, + "grad_norm": 5.375784397125244, + "learning_rate": 7.503862755451059e-06, + "loss": 0.1255, + "step": 7820 + }, + { + "epoch": 2.0015337423312882, + "grad_norm": 1.3432427644729614, + "learning_rate": 7.4961352540726274e-06, + "loss": 0.0644, + "step": 7830 + }, + { + "epoch": 2.0040899795501024, + "grad_norm": 8.615415573120117, + "learning_rate": 7.4883998030988136e-06, + "loss": 0.1136, + "step": 7840 + }, + { + "epoch": 2.006646216768916, + "grad_norm": 7.158458232879639, + "learning_rate": 7.480656427165187e-06, + "loss": 0.09, + "step": 7850 + }, + { + "epoch": 2.0092024539877302, + "grad_norm": 8.66907024383545, + "learning_rate": 7.47290515093255e-06, + "loss": 0.0703, + "step": 7860 + }, + { + "epoch": 2.011758691206544, + "grad_norm": 25.180543899536133, + "learning_rate": 7.465145999086874e-06, + "loss": 0.1314, + "step": 7870 + }, + { + "epoch": 2.014314928425358, + "grad_norm": 0.7909819483757019, + "learning_rate": 7.457378996339201e-06, + "loss": 0.0538, + "step": 7880 + }, + { + "epoch": 2.016871165644172, + "grad_norm": 0.6326285600662231, + "learning_rate": 7.4496041674255834e-06, + "loss": 0.0545, + "step": 7890 + }, + { + "epoch": 2.0194274028629855, + "grad_norm": 6.992855548858643, + "learning_rate": 7.441821537107e-06, + "loss": 0.0811, + "step": 7900 + }, + { + "epoch": 2.0219836400817996, + "grad_norm": 4.8581743240356445, + "learning_rate": 7.434031130169268e-06, + "loss": 0.0897, + "step": 7910 + }, + { + "epoch": 2.0245398773006134, + "grad_norm": 0.3861932158470154, + "learning_rate": 7.42623297142298e-06, + "loss": 0.0795, + "step": 7920 + }, + { + "epoch": 2.0270961145194275, + "grad_norm": 4.8933424949646, + "learning_rate": 7.418427085703406e-06, + "loss": 0.0746, + "step": 7930 + }, + { + "epoch": 2.029652351738241, + "grad_norm": 7.480552673339844, + "learning_rate": 7.410613497870432e-06, + "loss": 0.0816, + "step": 7940 + }, + { + "epoch": 2.0322085889570554, + "grad_norm": 5.1835126876831055, + "learning_rate": 7.402792232808474e-06, + "loss": 0.1248, + "step": 7950 + }, + { + "epoch": 2.034764826175869, + "grad_norm": 0.7514427304267883, + "learning_rate": 7.394963315426393e-06, + "loss": 0.077, + "step": 7960 + }, + { + "epoch": 2.037321063394683, + "grad_norm": 5.26667594909668, + "learning_rate": 7.387126770657423e-06, + "loss": 0.0694, + "step": 7970 + }, + { + "epoch": 2.039877300613497, + "grad_norm": 8.795965194702148, + "learning_rate": 7.379282623459093e-06, + "loss": 0.0845, + "step": 7980 + }, + { + "epoch": 2.0424335378323106, + "grad_norm": 5.604037284851074, + "learning_rate": 7.371430898813137e-06, + "loss": 0.0753, + "step": 7990 + }, + { + "epoch": 2.044989775051125, + "grad_norm": 2.7282750606536865, + "learning_rate": 7.363571621725427e-06, + "loss": 0.031, + "step": 8000 + }, + { + "epoch": 2.0475460122699385, + "grad_norm": 5.139689922332764, + "learning_rate": 7.355704817225886e-06, + "loss": 0.1, + "step": 8010 + }, + { + "epoch": 2.0501022494887526, + "grad_norm": 7.020951271057129, + "learning_rate": 7.347830510368409e-06, + "loss": 0.0798, + "step": 8020 + }, + { + "epoch": 2.0526584867075663, + "grad_norm": 2.1761505603790283, + "learning_rate": 7.3399487262307866e-06, + "loss": 0.0768, + "step": 8030 + }, + { + "epoch": 2.0552147239263805, + "grad_norm": 5.854605197906494, + "learning_rate": 7.332059489914619e-06, + "loss": 0.0601, + "step": 8040 + }, + { + "epoch": 2.057770961145194, + "grad_norm": 0.5980772376060486, + "learning_rate": 7.324162826545245e-06, + "loss": 0.0586, + "step": 8050 + }, + { + "epoch": 2.0603271983640083, + "grad_norm": 4.8323235511779785, + "learning_rate": 7.316258761271651e-06, + "loss": 0.0578, + "step": 8060 + }, + { + "epoch": 2.062883435582822, + "grad_norm": 8.097885131835938, + "learning_rate": 7.308347319266401e-06, + "loss": 0.0469, + "step": 8070 + }, + { + "epoch": 2.065439672801636, + "grad_norm": 5.477297782897949, + "learning_rate": 7.300428525725549e-06, + "loss": 0.0597, + "step": 8080 + }, + { + "epoch": 2.06799591002045, + "grad_norm": 2.20831298828125, + "learning_rate": 7.2925024058685664e-06, + "loss": 0.0512, + "step": 8090 + }, + { + "epoch": 2.0705521472392636, + "grad_norm": 6.855231761932373, + "learning_rate": 7.2845689849382514e-06, + "loss": 0.0787, + "step": 8100 + }, + { + "epoch": 2.0731083844580778, + "grad_norm": 9.492572784423828, + "learning_rate": 7.27662828820066e-06, + "loss": 0.104, + "step": 8110 + }, + { + "epoch": 2.0756646216768915, + "grad_norm": 7.048098087310791, + "learning_rate": 7.268680340945016e-06, + "loss": 0.1052, + "step": 8120 + }, + { + "epoch": 2.0782208588957056, + "grad_norm": 7.1551594734191895, + "learning_rate": 7.260725168483634e-06, + "loss": 0.0538, + "step": 8130 + }, + { + "epoch": 2.0807770961145193, + "grad_norm": 3.1020727157592773, + "learning_rate": 7.252762796151843e-06, + "loss": 0.0923, + "step": 8140 + }, + { + "epoch": 2.0833333333333335, + "grad_norm": 6.914649963378906, + "learning_rate": 7.2447932493079e-06, + "loss": 0.0458, + "step": 8150 + }, + { + "epoch": 2.085889570552147, + "grad_norm": 1.941754698753357, + "learning_rate": 7.236816553332909e-06, + "loss": 0.0847, + "step": 8160 + }, + { + "epoch": 2.0884458077709613, + "grad_norm": 3.0333592891693115, + "learning_rate": 7.228832733630742e-06, + "loss": 0.0318, + "step": 8170 + }, + { + "epoch": 2.091002044989775, + "grad_norm": 2.743631601333618, + "learning_rate": 7.220841815627966e-06, + "loss": 0.0935, + "step": 8180 + }, + { + "epoch": 2.0935582822085887, + "grad_norm": 6.149184226989746, + "learning_rate": 7.212843824773745e-06, + "loss": 0.1325, + "step": 8190 + }, + { + "epoch": 2.096114519427403, + "grad_norm": 9.376814842224121, + "learning_rate": 7.204838786539772e-06, + "loss": 0.0287, + "step": 8200 + }, + { + "epoch": 2.0986707566462166, + "grad_norm": 6.627695560455322, + "learning_rate": 7.196826726420185e-06, + "loss": 0.1187, + "step": 8210 + }, + { + "epoch": 2.1012269938650308, + "grad_norm": 7.894048690795898, + "learning_rate": 7.188807669931486e-06, + "loss": 0.078, + "step": 8220 + }, + { + "epoch": 2.1037832310838445, + "grad_norm": 6.502098083496094, + "learning_rate": 7.180781642612453e-06, + "loss": 0.0647, + "step": 8230 + }, + { + "epoch": 2.1063394683026586, + "grad_norm": 5.958528995513916, + "learning_rate": 7.172748670024073e-06, + "loss": 0.0945, + "step": 8240 + }, + { + "epoch": 2.1088957055214723, + "grad_norm": 0.8460894823074341, + "learning_rate": 7.164708777749445e-06, + "loss": 0.0558, + "step": 8250 + }, + { + "epoch": 2.1114519427402865, + "grad_norm": 5.054858207702637, + "learning_rate": 7.1566619913937105e-06, + "loss": 0.1047, + "step": 8260 + }, + { + "epoch": 2.1140081799591, + "grad_norm": 9.798078536987305, + "learning_rate": 7.148608336583961e-06, + "loss": 0.0616, + "step": 8270 + }, + { + "epoch": 2.116564417177914, + "grad_norm": 2.237877607345581, + "learning_rate": 7.140547838969168e-06, + "loss": 0.0827, + "step": 8280 + }, + { + "epoch": 2.119120654396728, + "grad_norm": 0.3861876428127289, + "learning_rate": 7.1324805242200956e-06, + "loss": 0.0635, + "step": 8290 + }, + { + "epoch": 2.1216768916155417, + "grad_norm": 6.713496685028076, + "learning_rate": 7.1244064180292134e-06, + "loss": 0.0663, + "step": 8300 + }, + { + "epoch": 2.124233128834356, + "grad_norm": 5.54539155960083, + "learning_rate": 7.116325546110628e-06, + "loss": 0.0446, + "step": 8310 + }, + { + "epoch": 2.1267893660531696, + "grad_norm": 5.177070617675781, + "learning_rate": 7.108237934199983e-06, + "loss": 0.0517, + "step": 8320 + }, + { + "epoch": 2.1293456032719837, + "grad_norm": 0.7041372060775757, + "learning_rate": 7.1001436080544e-06, + "loss": 0.0289, + "step": 8330 + }, + { + "epoch": 2.1319018404907975, + "grad_norm": 6.997579574584961, + "learning_rate": 7.0920425934523705e-06, + "loss": 0.0502, + "step": 8340 + }, + { + "epoch": 2.1344580777096116, + "grad_norm": 9.049081802368164, + "learning_rate": 7.083934916193698e-06, + "loss": 0.0795, + "step": 8350 + }, + { + "epoch": 2.1370143149284253, + "grad_norm": 3.9479804039001465, + "learning_rate": 7.075820602099399e-06, + "loss": 0.0659, + "step": 8360 + }, + { + "epoch": 2.1395705521472395, + "grad_norm": 7.389666557312012, + "learning_rate": 7.0676996770116294e-06, + "loss": 0.0533, + "step": 8370 + }, + { + "epoch": 2.142126789366053, + "grad_norm": 5.052390098571777, + "learning_rate": 7.059572166793598e-06, + "loss": 0.075, + "step": 8380 + }, + { + "epoch": 2.144683026584867, + "grad_norm": 8.923999786376953, + "learning_rate": 7.051438097329485e-06, + "loss": 0.0782, + "step": 8390 + }, + { + "epoch": 2.147239263803681, + "grad_norm": 9.955076217651367, + "learning_rate": 7.043297494524364e-06, + "loss": 0.0648, + "step": 8400 + }, + { + "epoch": 2.1497955010224947, + "grad_norm": 14.11737060546875, + "learning_rate": 7.03515038430411e-06, + "loss": 0.0368, + "step": 8410 + }, + { + "epoch": 2.152351738241309, + "grad_norm": 4.5228590965271, + "learning_rate": 7.026996792615328e-06, + "loss": 0.0758, + "step": 8420 + }, + { + "epoch": 2.1549079754601226, + "grad_norm": 6.418432712554932, + "learning_rate": 7.0188367454252624e-06, + "loss": 0.0705, + "step": 8430 + }, + { + "epoch": 2.1574642126789367, + "grad_norm": 5.669915676116943, + "learning_rate": 7.010670268721718e-06, + "loss": 0.1191, + "step": 8440 + }, + { + "epoch": 2.1600204498977504, + "grad_norm": 5.414175033569336, + "learning_rate": 7.002497388512971e-06, + "loss": 0.0665, + "step": 8450 + }, + { + "epoch": 2.1625766871165646, + "grad_norm": 3.9066929817199707, + "learning_rate": 6.9943181308277e-06, + "loss": 0.0625, + "step": 8460 + }, + { + "epoch": 2.1651329243353783, + "grad_norm": 0.23331096768379211, + "learning_rate": 6.986132521714888e-06, + "loss": 0.0674, + "step": 8470 + }, + { + "epoch": 2.1676891615541924, + "grad_norm": 3.160121440887451, + "learning_rate": 6.977940587243745e-06, + "loss": 0.0834, + "step": 8480 + }, + { + "epoch": 2.170245398773006, + "grad_norm": 4.351058483123779, + "learning_rate": 6.969742353503635e-06, + "loss": 0.0386, + "step": 8490 + }, + { + "epoch": 2.17280163599182, + "grad_norm": 9.820882797241211, + "learning_rate": 6.96153784660397e-06, + "loss": 0.0672, + "step": 8500 + }, + { + "epoch": 2.175357873210634, + "grad_norm": 15.702372550964355, + "learning_rate": 6.9533270926741506e-06, + "loss": 0.0749, + "step": 8510 + }, + { + "epoch": 2.1779141104294477, + "grad_norm": 5.2401509284973145, + "learning_rate": 6.945110117863469e-06, + "loss": 0.0703, + "step": 8520 + }, + { + "epoch": 2.180470347648262, + "grad_norm": 5.104111194610596, + "learning_rate": 6.936886948341029e-06, + "loss": 0.091, + "step": 8530 + }, + { + "epoch": 2.1830265848670756, + "grad_norm": 12.119421005249023, + "learning_rate": 6.928657610295666e-06, + "loss": 0.045, + "step": 8540 + }, + { + "epoch": 2.1855828220858897, + "grad_norm": 3.9862372875213623, + "learning_rate": 6.920422129935859e-06, + "loss": 0.0863, + "step": 8550 + }, + { + "epoch": 2.1881390593047034, + "grad_norm": 8.157721519470215, + "learning_rate": 6.912180533489645e-06, + "loss": 0.0649, + "step": 8560 + }, + { + "epoch": 2.1906952965235176, + "grad_norm": 12.727168083190918, + "learning_rate": 6.903932847204548e-06, + "loss": 0.0839, + "step": 8570 + }, + { + "epoch": 2.1932515337423313, + "grad_norm": 2.1506459712982178, + "learning_rate": 6.895679097347476e-06, + "loss": 0.0704, + "step": 8580 + }, + { + "epoch": 2.195807770961145, + "grad_norm": 3.3470993041992188, + "learning_rate": 6.887419310204657e-06, + "loss": 0.0637, + "step": 8590 + }, + { + "epoch": 2.198364008179959, + "grad_norm": 4.4683356285095215, + "learning_rate": 6.879153512081542e-06, + "loss": 0.0556, + "step": 8600 + }, + { + "epoch": 2.200920245398773, + "grad_norm": 5.225627899169922, + "learning_rate": 6.870881729302728e-06, + "loss": 0.0467, + "step": 8610 + }, + { + "epoch": 2.203476482617587, + "grad_norm": 4.654438018798828, + "learning_rate": 6.862603988211866e-06, + "loss": 0.117, + "step": 8620 + }, + { + "epoch": 2.2060327198364007, + "grad_norm": 5.765674114227295, + "learning_rate": 6.854320315171591e-06, + "loss": 0.0833, + "step": 8630 + }, + { + "epoch": 2.208588957055215, + "grad_norm": 4.424642086029053, + "learning_rate": 6.8460307365634225e-06, + "loss": 0.0879, + "step": 8640 + }, + { + "epoch": 2.2111451942740286, + "grad_norm": 0.613856315612793, + "learning_rate": 6.837735278787694e-06, + "loss": 0.0309, + "step": 8650 + }, + { + "epoch": 2.2137014314928427, + "grad_norm": 6.912576675415039, + "learning_rate": 6.829433968263458e-06, + "loss": 0.0571, + "step": 8660 + }, + { + "epoch": 2.2162576687116564, + "grad_norm": 4.189841270446777, + "learning_rate": 6.821126831428408e-06, + "loss": 0.0856, + "step": 8670 + }, + { + "epoch": 2.21881390593047, + "grad_norm": 2.174213171005249, + "learning_rate": 6.8128138947387966e-06, + "loss": 0.0573, + "step": 8680 + }, + { + "epoch": 2.2213701431492843, + "grad_norm": 9.983304023742676, + "learning_rate": 6.80449518466934e-06, + "loss": 0.0715, + "step": 8690 + }, + { + "epoch": 2.223926380368098, + "grad_norm": 5.989863872528076, + "learning_rate": 6.796170727713147e-06, + "loss": 0.0759, + "step": 8700 + }, + { + "epoch": 2.226482617586912, + "grad_norm": 2.114159345626831, + "learning_rate": 6.787840550381628e-06, + "loss": 0.0244, + "step": 8710 + }, + { + "epoch": 2.229038854805726, + "grad_norm": 7.211903095245361, + "learning_rate": 6.779504679204412e-06, + "loss": 0.0973, + "step": 8720 + }, + { + "epoch": 2.23159509202454, + "grad_norm": 5.518008232116699, + "learning_rate": 6.771163140729257e-06, + "loss": 0.1189, + "step": 8730 + }, + { + "epoch": 2.2341513292433537, + "grad_norm": 7.159096717834473, + "learning_rate": 6.762815961521976e-06, + "loss": 0.0472, + "step": 8740 + }, + { + "epoch": 2.236707566462168, + "grad_norm": 5.628960132598877, + "learning_rate": 6.754463168166342e-06, + "loss": 0.0646, + "step": 8750 + }, + { + "epoch": 2.2392638036809815, + "grad_norm": 2.7872536182403564, + "learning_rate": 6.746104787264011e-06, + "loss": 0.0603, + "step": 8760 + }, + { + "epoch": 2.2418200408997957, + "grad_norm": 4.476940155029297, + "learning_rate": 6.737740845434432e-06, + "loss": 0.0635, + "step": 8770 + }, + { + "epoch": 2.2443762781186094, + "grad_norm": 4.041048049926758, + "learning_rate": 6.7293713693147635e-06, + "loss": 0.0462, + "step": 8780 + }, + { + "epoch": 2.246932515337423, + "grad_norm": 3.1163430213928223, + "learning_rate": 6.720996385559793e-06, + "loss": 0.0552, + "step": 8790 + }, + { + "epoch": 2.2494887525562373, + "grad_norm": 5.569158554077148, + "learning_rate": 6.712615920841843e-06, + "loss": 0.0689, + "step": 8800 + }, + { + "epoch": 2.252044989775051, + "grad_norm": 5.894398212432861, + "learning_rate": 6.704230001850696e-06, + "loss": 0.0531, + "step": 8810 + }, + { + "epoch": 2.254601226993865, + "grad_norm": 6.340700149536133, + "learning_rate": 6.695838655293505e-06, + "loss": 0.0568, + "step": 8820 + }, + { + "epoch": 2.257157464212679, + "grad_norm": 4.014859199523926, + "learning_rate": 6.6874419078947076e-06, + "loss": 0.0613, + "step": 8830 + }, + { + "epoch": 2.259713701431493, + "grad_norm": 10.440202713012695, + "learning_rate": 6.679039786395936e-06, + "loss": 0.0497, + "step": 8840 + }, + { + "epoch": 2.2622699386503067, + "grad_norm": 9.94273567199707, + "learning_rate": 6.6706323175559504e-06, + "loss": 0.0866, + "step": 8850 + }, + { + "epoch": 2.264826175869121, + "grad_norm": 1.108022689819336, + "learning_rate": 6.662219528150529e-06, + "loss": 0.0504, + "step": 8860 + }, + { + "epoch": 2.2673824130879345, + "grad_norm": 3.8868322372436523, + "learning_rate": 6.653801444972398e-06, + "loss": 0.0675, + "step": 8870 + }, + { + "epoch": 2.2699386503067487, + "grad_norm": 3.9967801570892334, + "learning_rate": 6.64537809483115e-06, + "loss": 0.0846, + "step": 8880 + }, + { + "epoch": 2.2724948875255624, + "grad_norm": 4.602581024169922, + "learning_rate": 6.63694950455314e-06, + "loss": 0.0553, + "step": 8890 + }, + { + "epoch": 2.275051124744376, + "grad_norm": 0.8123490810394287, + "learning_rate": 6.628515700981424e-06, + "loss": 0.0463, + "step": 8900 + }, + { + "epoch": 2.2776073619631902, + "grad_norm": 0.3097759783267975, + "learning_rate": 6.620076710975648e-06, + "loss": 0.0754, + "step": 8910 + }, + { + "epoch": 2.280163599182004, + "grad_norm": 4.588552474975586, + "learning_rate": 6.611632561411987e-06, + "loss": 0.078, + "step": 8920 + }, + { + "epoch": 2.282719836400818, + "grad_norm": 7.720053195953369, + "learning_rate": 6.603183279183041e-06, + "loss": 0.0946, + "step": 8930 + }, + { + "epoch": 2.285276073619632, + "grad_norm": 6.0510945320129395, + "learning_rate": 6.594728891197758e-06, + "loss": 0.0565, + "step": 8940 + }, + { + "epoch": 2.287832310838446, + "grad_norm": 9.991423606872559, + "learning_rate": 6.586269424381349e-06, + "loss": 0.0585, + "step": 8950 + }, + { + "epoch": 2.2903885480572597, + "grad_norm": 0.4170069098472595, + "learning_rate": 6.577804905675196e-06, + "loss": 0.0552, + "step": 8960 + }, + { + "epoch": 2.292944785276074, + "grad_norm": 4.831167221069336, + "learning_rate": 6.569335362036773e-06, + "loss": 0.0477, + "step": 8970 + }, + { + "epoch": 2.2955010224948875, + "grad_norm": 2.8550946712493896, + "learning_rate": 6.560860820439557e-06, + "loss": 0.0386, + "step": 8980 + }, + { + "epoch": 2.2980572597137012, + "grad_norm": 6.75853967666626, + "learning_rate": 6.55238130787294e-06, + "loss": 0.0722, + "step": 8990 + }, + { + "epoch": 2.3006134969325154, + "grad_norm": 7.144791603088379, + "learning_rate": 6.543896851342148e-06, + "loss": 0.0713, + "step": 9000 + }, + { + "epoch": 2.303169734151329, + "grad_norm": 0.21670909225940704, + "learning_rate": 6.535407477868151e-06, + "loss": 0.0809, + "step": 9010 + }, + { + "epoch": 2.3057259713701432, + "grad_norm": 4.624710559844971, + "learning_rate": 6.526913214487578e-06, + "loss": 0.0727, + "step": 9020 + }, + { + "epoch": 2.308282208588957, + "grad_norm": 0.8301081657409668, + "learning_rate": 6.518414088252632e-06, + "loss": 0.0522, + "step": 9030 + }, + { + "epoch": 2.310838445807771, + "grad_norm": 0.1696474701166153, + "learning_rate": 6.509910126231003e-06, + "loss": 0.0482, + "step": 9040 + }, + { + "epoch": 2.313394683026585, + "grad_norm": 2.2531793117523193, + "learning_rate": 6.501401355505782e-06, + "loss": 0.0557, + "step": 9050 + }, + { + "epoch": 2.315950920245399, + "grad_norm": 4.994187831878662, + "learning_rate": 6.492887803175374e-06, + "loss": 0.0938, + "step": 9060 + }, + { + "epoch": 2.3185071574642127, + "grad_norm": 6.800015926361084, + "learning_rate": 6.484369496353412e-06, + "loss": 0.061, + "step": 9070 + }, + { + "epoch": 2.3210633946830264, + "grad_norm": 10.822134017944336, + "learning_rate": 6.4758464621686715e-06, + "loss": 0.0584, + "step": 9080 + }, + { + "epoch": 2.3236196319018405, + "grad_norm": 2.0743789672851562, + "learning_rate": 6.467318727764983e-06, + "loss": 0.0489, + "step": 9090 + }, + { + "epoch": 2.326175869120654, + "grad_norm": 6.299098968505859, + "learning_rate": 6.458786320301146e-06, + "loss": 0.0832, + "step": 9100 + }, + { + "epoch": 2.3287321063394684, + "grad_norm": 0.3529964089393616, + "learning_rate": 6.450249266950846e-06, + "loss": 0.0281, + "step": 9110 + }, + { + "epoch": 2.331288343558282, + "grad_norm": 1.3568120002746582, + "learning_rate": 6.4417075949025575e-06, + "loss": 0.0326, + "step": 9120 + }, + { + "epoch": 2.3338445807770962, + "grad_norm": 5.631685256958008, + "learning_rate": 6.43316133135947e-06, + "loss": 0.0618, + "step": 9130 + }, + { + "epoch": 2.33640081799591, + "grad_norm": 0.5822303295135498, + "learning_rate": 6.4246105035393965e-06, + "loss": 0.0483, + "step": 9140 + }, + { + "epoch": 2.338957055214724, + "grad_norm": 0.39102593064308167, + "learning_rate": 6.416055138674682e-06, + "loss": 0.0429, + "step": 9150 + }, + { + "epoch": 2.341513292433538, + "grad_norm": 4.372485637664795, + "learning_rate": 6.4074952640121226e-06, + "loss": 0.0795, + "step": 9160 + }, + { + "epoch": 2.3440695296523515, + "grad_norm": 0.6967136263847351, + "learning_rate": 6.398930906812877e-06, + "loss": 0.0307, + "step": 9170 + }, + { + "epoch": 2.3466257668711656, + "grad_norm": 5.449244022369385, + "learning_rate": 6.390362094352382e-06, + "loss": 0.0729, + "step": 9180 + }, + { + "epoch": 2.34918200408998, + "grad_norm": 5.0675482749938965, + "learning_rate": 6.3817888539202595e-06, + "loss": 0.0707, + "step": 9190 + }, + { + "epoch": 2.3517382413087935, + "grad_norm": 2.9963107109069824, + "learning_rate": 6.373211212820237e-06, + "loss": 0.0545, + "step": 9200 + }, + { + "epoch": 2.354294478527607, + "grad_norm": 3.9092769622802734, + "learning_rate": 6.364629198370054e-06, + "loss": 0.0281, + "step": 9210 + }, + { + "epoch": 2.3568507157464214, + "grad_norm": 8.632110595703125, + "learning_rate": 6.3560428379013795e-06, + "loss": 0.0994, + "step": 9220 + }, + { + "epoch": 2.359406952965235, + "grad_norm": 3.0046439170837402, + "learning_rate": 6.3474521587597234e-06, + "loss": 0.0505, + "step": 9230 + }, + { + "epoch": 2.361963190184049, + "grad_norm": 2.9390039443969727, + "learning_rate": 6.3388571883043505e-06, + "loss": 0.0561, + "step": 9240 + }, + { + "epoch": 2.364519427402863, + "grad_norm": 7.934990406036377, + "learning_rate": 6.330257953908192e-06, + "loss": 0.0442, + "step": 9250 + }, + { + "epoch": 2.367075664621677, + "grad_norm": 3.6421031951904297, + "learning_rate": 6.321654482957756e-06, + "loss": 0.0761, + "step": 9260 + }, + { + "epoch": 2.3696319018404908, + "grad_norm": 2.067728042602539, + "learning_rate": 6.313046802853047e-06, + "loss": 0.0361, + "step": 9270 + }, + { + "epoch": 2.372188139059305, + "grad_norm": 0.5931568741798401, + "learning_rate": 6.304434941007473e-06, + "loss": 0.0441, + "step": 9280 + }, + { + "epoch": 2.3747443762781186, + "grad_norm": 7.320766925811768, + "learning_rate": 6.295818924847761e-06, + "loss": 0.0736, + "step": 9290 + }, + { + "epoch": 2.3773006134969323, + "grad_norm": 5.4987688064575195, + "learning_rate": 6.2871987818138626e-06, + "loss": 0.0694, + "step": 9300 + }, + { + "epoch": 2.3798568507157465, + "grad_norm": 7.312312602996826, + "learning_rate": 6.2785745393588815e-06, + "loss": 0.0698, + "step": 9310 + }, + { + "epoch": 2.38241308793456, + "grad_norm": 8.894052505493164, + "learning_rate": 6.2699462249489715e-06, + "loss": 0.0651, + "step": 9320 + }, + { + "epoch": 2.3849693251533743, + "grad_norm": 0.4445403516292572, + "learning_rate": 6.261313866063257e-06, + "loss": 0.0271, + "step": 9330 + }, + { + "epoch": 2.387525562372188, + "grad_norm": 3.842348575592041, + "learning_rate": 6.252677490193739e-06, + "loss": 0.0625, + "step": 9340 + }, + { + "epoch": 2.390081799591002, + "grad_norm": 3.183258295059204, + "learning_rate": 6.244037124845217e-06, + "loss": 0.0454, + "step": 9350 + }, + { + "epoch": 2.392638036809816, + "grad_norm": 3.39320969581604, + "learning_rate": 6.235392797535193e-06, + "loss": 0.0615, + "step": 9360 + }, + { + "epoch": 2.39519427402863, + "grad_norm": 10.20765495300293, + "learning_rate": 6.226744535793788e-06, + "loss": 0.0808, + "step": 9370 + }, + { + "epoch": 2.3977505112474438, + "grad_norm": 0.8380181789398193, + "learning_rate": 6.2180923671636524e-06, + "loss": 0.0485, + "step": 9380 + }, + { + "epoch": 2.4003067484662575, + "grad_norm": 0.444444477558136, + "learning_rate": 6.20943631919988e-06, + "loss": 0.02, + "step": 9390 + }, + { + "epoch": 2.4028629856850716, + "grad_norm": 8.41584587097168, + "learning_rate": 6.200776419469918e-06, + "loss": 0.054, + "step": 9400 + }, + { + "epoch": 2.4054192229038853, + "grad_norm": 5.808600425720215, + "learning_rate": 6.192112695553483e-06, + "loss": 0.0671, + "step": 9410 + }, + { + "epoch": 2.4079754601226995, + "grad_norm": 3.8908348083496094, + "learning_rate": 6.183445175042466e-06, + "loss": 0.0618, + "step": 9420 + }, + { + "epoch": 2.410531697341513, + "grad_norm": 4.925373077392578, + "learning_rate": 6.174773885540855e-06, + "loss": 0.0512, + "step": 9430 + }, + { + "epoch": 2.4130879345603273, + "grad_norm": 0.03155489265918732, + "learning_rate": 6.166098854664638e-06, + "loss": 0.0356, + "step": 9440 + }, + { + "epoch": 2.415644171779141, + "grad_norm": 0.08001308143138885, + "learning_rate": 6.157420110041719e-06, + "loss": 0.031, + "step": 9450 + }, + { + "epoch": 2.418200408997955, + "grad_norm": 6.34970760345459, + "learning_rate": 6.1487376793118285e-06, + "loss": 0.0595, + "step": 9460 + }, + { + "epoch": 2.420756646216769, + "grad_norm": 15.385396957397461, + "learning_rate": 6.140051590126439e-06, + "loss": 0.0452, + "step": 9470 + }, + { + "epoch": 2.4233128834355826, + "grad_norm": 5.20993185043335, + "learning_rate": 6.131361870148672e-06, + "loss": 0.0745, + "step": 9480 + }, + { + "epoch": 2.4258691206543967, + "grad_norm": 4.343068599700928, + "learning_rate": 6.1226685470532125e-06, + "loss": 0.0639, + "step": 9490 + }, + { + "epoch": 2.4284253578732105, + "grad_norm": 4.774913787841797, + "learning_rate": 6.113971648526222e-06, + "loss": 0.0416, + "step": 9500 + }, + { + "epoch": 2.4309815950920246, + "grad_norm": 0.5611134767532349, + "learning_rate": 6.105271202265246e-06, + "loss": 0.0636, + "step": 9510 + }, + { + "epoch": 2.4335378323108383, + "grad_norm": 6.5504279136657715, + "learning_rate": 6.096567235979133e-06, + "loss": 0.0537, + "step": 9520 + }, + { + "epoch": 2.4360940695296525, + "grad_norm": 0.9646693468093872, + "learning_rate": 6.0878597773879376e-06, + "loss": 0.0512, + "step": 9530 + }, + { + "epoch": 2.438650306748466, + "grad_norm": 4.056527614593506, + "learning_rate": 6.079148854222839e-06, + "loss": 0.0451, + "step": 9540 + }, + { + "epoch": 2.4412065439672803, + "grad_norm": 5.754093170166016, + "learning_rate": 6.07043449422605e-06, + "loss": 0.0635, + "step": 9550 + }, + { + "epoch": 2.443762781186094, + "grad_norm": 7.742176532745361, + "learning_rate": 6.061716725150727e-06, + "loss": 0.0305, + "step": 9560 + }, + { + "epoch": 2.4463190184049077, + "grad_norm": 7.218969345092773, + "learning_rate": 6.052995574760887e-06, + "loss": 0.0615, + "step": 9570 + }, + { + "epoch": 2.448875255623722, + "grad_norm": 3.2471063137054443, + "learning_rate": 6.044271070831312e-06, + "loss": 0.0568, + "step": 9580 + }, + { + "epoch": 2.451431492842536, + "grad_norm": 0.23956027626991272, + "learning_rate": 6.035543241147469e-06, + "loss": 0.0468, + "step": 9590 + }, + { + "epoch": 2.4539877300613497, + "grad_norm": 0.8415837287902832, + "learning_rate": 6.026812113505409e-06, + "loss": 0.0366, + "step": 9600 + }, + { + "epoch": 2.4565439672801634, + "grad_norm": 4.5563154220581055, + "learning_rate": 6.018077715711695e-06, + "loss": 0.0611, + "step": 9610 + }, + { + "epoch": 2.4591002044989776, + "grad_norm": 9.952324867248535, + "learning_rate": 6.009340075583299e-06, + "loss": 0.0504, + "step": 9620 + }, + { + "epoch": 2.4616564417177913, + "grad_norm": 3.28727126121521, + "learning_rate": 6.00059922094752e-06, + "loss": 0.0563, + "step": 9630 + }, + { + "epoch": 2.4642126789366054, + "grad_norm": 4.564260959625244, + "learning_rate": 5.991855179641896e-06, + "loss": 0.0354, + "step": 9640 + }, + { + "epoch": 2.466768916155419, + "grad_norm": 5.473964214324951, + "learning_rate": 5.983107979514112e-06, + "loss": 0.0389, + "step": 9650 + }, + { + "epoch": 2.4693251533742333, + "grad_norm": 3.674219846725464, + "learning_rate": 5.974357648421916e-06, + "loss": 0.0745, + "step": 9660 + }, + { + "epoch": 2.471881390593047, + "grad_norm": 0.6603105068206787, + "learning_rate": 5.965604214233022e-06, + "loss": 0.0572, + "step": 9670 + }, + { + "epoch": 2.474437627811861, + "grad_norm": 4.627801895141602, + "learning_rate": 5.956847704825033e-06, + "loss": 0.0395, + "step": 9680 + }, + { + "epoch": 2.476993865030675, + "grad_norm": 2.601986885070801, + "learning_rate": 5.94808814808534e-06, + "loss": 0.0775, + "step": 9690 + }, + { + "epoch": 2.4795501022494886, + "grad_norm": 5.2239460945129395, + "learning_rate": 5.9393255719110455e-06, + "loss": 0.057, + "step": 9700 + }, + { + "epoch": 2.4821063394683027, + "grad_norm": 0.7189023494720459, + "learning_rate": 5.9305600042088595e-06, + "loss": 0.0669, + "step": 9710 + }, + { + "epoch": 2.4846625766871164, + "grad_norm": 4.483514308929443, + "learning_rate": 5.9217914728950286e-06, + "loss": 0.0511, + "step": 9720 + }, + { + "epoch": 2.4872188139059306, + "grad_norm": 1.7777493000030518, + "learning_rate": 5.913020005895232e-06, + "loss": 0.0491, + "step": 9730 + }, + { + "epoch": 2.4897750511247443, + "grad_norm": 6.096456050872803, + "learning_rate": 5.904245631144498e-06, + "loss": 0.0772, + "step": 9740 + }, + { + "epoch": 2.4923312883435584, + "grad_norm": 6.093538761138916, + "learning_rate": 5.895468376587121e-06, + "loss": 0.0738, + "step": 9750 + }, + { + "epoch": 2.494887525562372, + "grad_norm": 0.5627290606498718, + "learning_rate": 5.8866882701765605e-06, + "loss": 0.0428, + "step": 9760 + }, + { + "epoch": 2.4974437627811863, + "grad_norm": 2.499333143234253, + "learning_rate": 5.877905339875363e-06, + "loss": 0.0465, + "step": 9770 + }, + { + "epoch": 2.5, + "grad_norm": 2.476902723312378, + "learning_rate": 5.869119613655062e-06, + "loss": 0.033, + "step": 9780 + }, + { + "epoch": 2.5025562372188137, + "grad_norm": 3.85345458984375, + "learning_rate": 5.860331119496106e-06, + "loss": 0.0589, + "step": 9790 + }, + { + "epoch": 2.505112474437628, + "grad_norm": 0.05737360939383507, + "learning_rate": 5.851539885387748e-06, + "loss": 0.0693, + "step": 9800 + }, + { + "epoch": 2.5076687116564416, + "grad_norm": 5.509619235992432, + "learning_rate": 5.8427459393279736e-06, + "loss": 0.0514, + "step": 9810 + }, + { + "epoch": 2.5102249488752557, + "grad_norm": 5.018087863922119, + "learning_rate": 5.8339493093234025e-06, + "loss": 0.0638, + "step": 9820 + }, + { + "epoch": 2.5127811860940694, + "grad_norm": 5.845489501953125, + "learning_rate": 5.825150023389203e-06, + "loss": 0.0408, + "step": 9830 + }, + { + "epoch": 2.5153374233128836, + "grad_norm": 3.2593860626220703, + "learning_rate": 5.816348109549005e-06, + "loss": 0.0141, + "step": 9840 + }, + { + "epoch": 2.5178936605316973, + "grad_norm": 5.271510124206543, + "learning_rate": 5.807543595834799e-06, + "loss": 0.0526, + "step": 9850 + }, + { + "epoch": 2.5204498977505114, + "grad_norm": 0.1252453476190567, + "learning_rate": 5.798736510286866e-06, + "loss": 0.0522, + "step": 9860 + }, + { + "epoch": 2.523006134969325, + "grad_norm": 4.33157205581665, + "learning_rate": 5.7899268809536705e-06, + "loss": 0.0888, + "step": 9870 + }, + { + "epoch": 2.525562372188139, + "grad_norm": 6.989223480224609, + "learning_rate": 5.781114735891781e-06, + "loss": 0.0413, + "step": 9880 + }, + { + "epoch": 2.528118609406953, + "grad_norm": 4.28364896774292, + "learning_rate": 5.772300103165777e-06, + "loss": 0.0438, + "step": 9890 + }, + { + "epoch": 2.530674846625767, + "grad_norm": 0.42973408102989197, + "learning_rate": 5.763483010848161e-06, + "loss": 0.0537, + "step": 9900 + }, + { + "epoch": 2.533231083844581, + "grad_norm": 3.0371270179748535, + "learning_rate": 5.7546634870192695e-06, + "loss": 0.0482, + "step": 9910 + }, + { + "epoch": 2.5357873210633946, + "grad_norm": 10.468814849853516, + "learning_rate": 5.745841559767182e-06, + "loss": 0.0593, + "step": 9920 + }, + { + "epoch": 2.5383435582822087, + "grad_norm": 1.1547623872756958, + "learning_rate": 5.737017257187634e-06, + "loss": 0.0457, + "step": 9930 + }, + { + "epoch": 2.5408997955010224, + "grad_norm": 5.563620567321777, + "learning_rate": 5.728190607383921e-06, + "loss": 0.0876, + "step": 9940 + }, + { + "epoch": 2.5434560327198366, + "grad_norm": 1.5379348993301392, + "learning_rate": 5.719361638466819e-06, + "loss": 0.0441, + "step": 9950 + }, + { + "epoch": 2.5460122699386503, + "grad_norm": 4.261902809143066, + "learning_rate": 5.7105303785544894e-06, + "loss": 0.0243, + "step": 9960 + }, + { + "epoch": 2.548568507157464, + "grad_norm": 5.261180400848389, + "learning_rate": 5.7016968557723874e-06, + "loss": 0.0309, + "step": 9970 + }, + { + "epoch": 2.551124744376278, + "grad_norm": 2.3447492122650146, + "learning_rate": 5.692861098253174e-06, + "loss": 0.0348, + "step": 9980 + }, + { + "epoch": 2.5536809815950923, + "grad_norm": 2.6790072917938232, + "learning_rate": 5.684023134136634e-06, + "loss": 0.0353, + "step": 9990 + }, + { + "epoch": 2.556237218813906, + "grad_norm": 3.520054817199707, + "learning_rate": 5.67518299156957e-06, + "loss": 0.0852, + "step": 10000 + }, + { + "epoch": 2.5587934560327197, + "grad_norm": 4.114648342132568, + "learning_rate": 5.66634069870573e-06, + "loss": 0.0708, + "step": 10010 + }, + { + "epoch": 2.561349693251534, + "grad_norm": 1.8037816286087036, + "learning_rate": 5.657496283705708e-06, + "loss": 0.0496, + "step": 10020 + }, + { + "epoch": 2.5639059304703475, + "grad_norm": 4.1163716316223145, + "learning_rate": 5.648649774736855e-06, + "loss": 0.0555, + "step": 10030 + }, + { + "epoch": 2.5664621676891617, + "grad_norm": 3.350024700164795, + "learning_rate": 5.639801199973191e-06, + "loss": 0.0262, + "step": 10040 + }, + { + "epoch": 2.5690184049079754, + "grad_norm": 7.735722541809082, + "learning_rate": 5.630950587595319e-06, + "loss": 0.0463, + "step": 10050 + }, + { + "epoch": 2.571574642126789, + "grad_norm": 4.3258538246154785, + "learning_rate": 5.622097965790325e-06, + "loss": 0.0553, + "step": 10060 + }, + { + "epoch": 2.5741308793456033, + "grad_norm": 4.328603744506836, + "learning_rate": 5.6132433627517005e-06, + "loss": 0.0632, + "step": 10070 + }, + { + "epoch": 2.5766871165644174, + "grad_norm": 4.42746114730835, + "learning_rate": 5.6043868066792415e-06, + "loss": 0.0503, + "step": 10080 + }, + { + "epoch": 2.579243353783231, + "grad_norm": 3.8148934841156006, + "learning_rate": 5.595528325778968e-06, + "loss": 0.0607, + "step": 10090 + }, + { + "epoch": 2.581799591002045, + "grad_norm": 3.635321617126465, + "learning_rate": 5.58666794826303e-06, + "loss": 0.0448, + "step": 10100 + }, + { + "epoch": 2.584355828220859, + "grad_norm": 0.01259413082152605, + "learning_rate": 5.577805702349614e-06, + "loss": 0.0408, + "step": 10110 + }, + { + "epoch": 2.5869120654396727, + "grad_norm": 3.7854018211364746, + "learning_rate": 5.568941616262861e-06, + "loss": 0.0585, + "step": 10120 + }, + { + "epoch": 2.589468302658487, + "grad_norm": 1.9844086170196533, + "learning_rate": 5.5600757182327695e-06, + "loss": 0.0263, + "step": 10130 + }, + { + "epoch": 2.5920245398773005, + "grad_norm": 5.477379322052002, + "learning_rate": 5.5512080364951105e-06, + "loss": 0.0553, + "step": 10140 + }, + { + "epoch": 2.5945807770961147, + "grad_norm": 6.039186954498291, + "learning_rate": 5.542338599291335e-06, + "loss": 0.0379, + "step": 10150 + }, + { + "epoch": 2.5971370143149284, + "grad_norm": 0.8108224272727966, + "learning_rate": 5.533467434868486e-06, + "loss": 0.0534, + "step": 10160 + }, + { + "epoch": 2.5996932515337425, + "grad_norm": 4.296036243438721, + "learning_rate": 5.524594571479104e-06, + "loss": 0.036, + "step": 10170 + }, + { + "epoch": 2.6022494887525562, + "grad_norm": 3.003478765487671, + "learning_rate": 5.515720037381144e-06, + "loss": 0.0471, + "step": 10180 + }, + { + "epoch": 2.60480572597137, + "grad_norm": 4.551526069641113, + "learning_rate": 5.50684386083788e-06, + "loss": 0.0675, + "step": 10190 + }, + { + "epoch": 2.607361963190184, + "grad_norm": 2.661856174468994, + "learning_rate": 5.497966070117816e-06, + "loss": 0.0298, + "step": 10200 + }, + { + "epoch": 2.609918200408998, + "grad_norm": 3.3979713916778564, + "learning_rate": 5.4890866934946e-06, + "loss": 0.0422, + "step": 10210 + }, + { + "epoch": 2.612474437627812, + "grad_norm": 5.5186896324157715, + "learning_rate": 5.480205759246926e-06, + "loss": 0.0471, + "step": 10220 + }, + { + "epoch": 2.6150306748466257, + "grad_norm": 3.5918192863464355, + "learning_rate": 5.471323295658455e-06, + "loss": 0.0692, + "step": 10230 + }, + { + "epoch": 2.61758691206544, + "grad_norm": 4.837007999420166, + "learning_rate": 5.462439331017711e-06, + "loss": 0.0464, + "step": 10240 + }, + { + "epoch": 2.6201431492842535, + "grad_norm": 1.8546375036239624, + "learning_rate": 5.453553893618003e-06, + "loss": 0.0397, + "step": 10250 + }, + { + "epoch": 2.6226993865030677, + "grad_norm": 7.079483985900879, + "learning_rate": 5.44466701175733e-06, + "loss": 0.0308, + "step": 10260 + }, + { + "epoch": 2.6252556237218814, + "grad_norm": 0.1995091438293457, + "learning_rate": 5.435778713738292e-06, + "loss": 0.0247, + "step": 10270 + }, + { + "epoch": 2.627811860940695, + "grad_norm": 5.363643169403076, + "learning_rate": 5.426889027867997e-06, + "loss": 0.0418, + "step": 10280 + }, + { + "epoch": 2.6303680981595092, + "grad_norm": 1.1156824827194214, + "learning_rate": 5.417997982457974e-06, + "loss": 0.0631, + "step": 10290 + }, + { + "epoch": 2.6329243353783234, + "grad_norm": 0.43824976682662964, + "learning_rate": 5.409105605824082e-06, + "loss": 0.0433, + "step": 10300 + }, + { + "epoch": 2.635480572597137, + "grad_norm": 0.8822130560874939, + "learning_rate": 5.400211926286421e-06, + "loss": 0.0247, + "step": 10310 + }, + { + "epoch": 2.638036809815951, + "grad_norm": 3.7047805786132812, + "learning_rate": 5.391316972169236e-06, + "loss": 0.039, + "step": 10320 + }, + { + "epoch": 2.640593047034765, + "grad_norm": 3.4349169731140137, + "learning_rate": 5.382420771800836e-06, + "loss": 0.0148, + "step": 10330 + }, + { + "epoch": 2.6431492842535786, + "grad_norm": 4.191125392913818, + "learning_rate": 5.373523353513498e-06, + "loss": 0.0671, + "step": 10340 + }, + { + "epoch": 2.645705521472393, + "grad_norm": 8.565727233886719, + "learning_rate": 5.364624745643375e-06, + "loss": 0.0534, + "step": 10350 + }, + { + "epoch": 2.6482617586912065, + "grad_norm": 5.6679840087890625, + "learning_rate": 5.35572497653041e-06, + "loss": 0.0493, + "step": 10360 + }, + { + "epoch": 2.65081799591002, + "grad_norm": 2.3933236598968506, + "learning_rate": 5.346824074518246e-06, + "loss": 0.05, + "step": 10370 + }, + { + "epoch": 2.6533742331288344, + "grad_norm": 0.3358931839466095, + "learning_rate": 5.337922067954136e-06, + "loss": 0.0137, + "step": 10380 + }, + { + "epoch": 2.6559304703476485, + "grad_norm": 2.881453275680542, + "learning_rate": 5.329018985188841e-06, + "loss": 0.0689, + "step": 10390 + }, + { + "epoch": 2.658486707566462, + "grad_norm": 3.27288818359375, + "learning_rate": 5.320114854576559e-06, + "loss": 0.0297, + "step": 10400 + }, + { + "epoch": 2.661042944785276, + "grad_norm": 3.823456287384033, + "learning_rate": 5.3112097044748235e-06, + "loss": 0.0607, + "step": 10410 + }, + { + "epoch": 2.66359918200409, + "grad_norm": 3.608356475830078, + "learning_rate": 5.302303563244413e-06, + "loss": 0.0381, + "step": 10420 + }, + { + "epoch": 2.6661554192229038, + "grad_norm": 1.3827208280563354, + "learning_rate": 5.2933964592492614e-06, + "loss": 0.05, + "step": 10430 + }, + { + "epoch": 2.668711656441718, + "grad_norm": 0.04524281620979309, + "learning_rate": 5.284488420856372e-06, + "loss": 0.0268, + "step": 10440 + }, + { + "epoch": 2.6712678936605316, + "grad_norm": 7.237791538238525, + "learning_rate": 5.275579476435719e-06, + "loss": 0.0239, + "step": 10450 + }, + { + "epoch": 2.6738241308793453, + "grad_norm": 0.08604143559932709, + "learning_rate": 5.2666696543601696e-06, + "loss": 0.0819, + "step": 10460 + }, + { + "epoch": 2.6763803680981595, + "grad_norm": 4.662979602813721, + "learning_rate": 5.25775898300538e-06, + "loss": 0.0539, + "step": 10470 + }, + { + "epoch": 2.6789366053169736, + "grad_norm": 0.7715989947319031, + "learning_rate": 5.248847490749711e-06, + "loss": 0.0375, + "step": 10480 + }, + { + "epoch": 2.6814928425357873, + "grad_norm": 5.067183971405029, + "learning_rate": 5.239935205974145e-06, + "loss": 0.0205, + "step": 10490 + }, + { + "epoch": 2.684049079754601, + "grad_norm": 5.718189716339111, + "learning_rate": 5.231022157062177e-06, + "loss": 0.0898, + "step": 10500 + }, + { + "epoch": 2.686605316973415, + "grad_norm": 14.444259643554688, + "learning_rate": 5.222108372399746e-06, + "loss": 0.043, + "step": 10510 + }, + { + "epoch": 2.689161554192229, + "grad_norm": 0.2056499719619751, + "learning_rate": 5.213193880375127e-06, + "loss": 0.0639, + "step": 10520 + }, + { + "epoch": 2.691717791411043, + "grad_norm": 0.02217238023877144, + "learning_rate": 5.204278709378854e-06, + "loss": 0.0177, + "step": 10530 + }, + { + "epoch": 2.6942740286298568, + "grad_norm": 1.9676077365875244, + "learning_rate": 5.195362887803617e-06, + "loss": 0.0495, + "step": 10540 + }, + { + "epoch": 2.696830265848671, + "grad_norm": 2.4766979217529297, + "learning_rate": 5.186446444044184e-06, + "loss": 0.0572, + "step": 10550 + }, + { + "epoch": 2.6993865030674846, + "grad_norm": 0.9265226721763611, + "learning_rate": 5.177529406497298e-06, + "loss": 0.0192, + "step": 10560 + }, + { + "epoch": 2.7019427402862988, + "grad_norm": 6.686746597290039, + "learning_rate": 5.168611803561599e-06, + "loss": 0.0632, + "step": 10570 + }, + { + "epoch": 2.7044989775051125, + "grad_norm": 4.72622013092041, + "learning_rate": 5.159693663637525e-06, + "loss": 0.0499, + "step": 10580 + }, + { + "epoch": 2.707055214723926, + "grad_norm": 4.173243045806885, + "learning_rate": 5.150775015127224e-06, + "loss": 0.0343, + "step": 10590 + }, + { + "epoch": 2.7096114519427403, + "grad_norm": 0.10401232540607452, + "learning_rate": 5.1418558864344645e-06, + "loss": 0.0417, + "step": 10600 + }, + { + "epoch": 2.712167689161554, + "grad_norm": 4.092282772064209, + "learning_rate": 5.132936305964543e-06, + "loss": 0.0335, + "step": 10610 + }, + { + "epoch": 2.714723926380368, + "grad_norm": 8.394328117370605, + "learning_rate": 5.1240163021241975e-06, + "loss": 0.0785, + "step": 10620 + }, + { + "epoch": 2.717280163599182, + "grad_norm": 3.676940441131592, + "learning_rate": 5.1150959033215104e-06, + "loss": 0.0382, + "step": 10630 + }, + { + "epoch": 2.719836400817996, + "grad_norm": 0.23662449419498444, + "learning_rate": 5.106175137965826e-06, + "loss": 0.0467, + "step": 10640 + }, + { + "epoch": 2.7223926380368098, + "grad_norm": 6.808079719543457, + "learning_rate": 5.097254034467652e-06, + "loss": 0.0348, + "step": 10650 + }, + { + "epoch": 2.724948875255624, + "grad_norm": 0.04969576373696327, + "learning_rate": 5.0883326212385775e-06, + "loss": 0.031, + "step": 10660 + }, + { + "epoch": 2.7275051124744376, + "grad_norm": 6.316954612731934, + "learning_rate": 5.079410926691174e-06, + "loss": 0.053, + "step": 10670 + }, + { + "epoch": 2.7300613496932513, + "grad_norm": 4.699779987335205, + "learning_rate": 5.07048897923891e-06, + "loss": 0.0328, + "step": 10680 + }, + { + "epoch": 2.7326175869120655, + "grad_norm": 2.899876117706299, + "learning_rate": 5.061566807296062e-06, + "loss": 0.0537, + "step": 10690 + }, + { + "epoch": 2.7351738241308796, + "grad_norm": 1.7334074974060059, + "learning_rate": 5.052644439277617e-06, + "loss": 0.036, + "step": 10700 + }, + { + "epoch": 2.7377300613496933, + "grad_norm": 0.5449509024620056, + "learning_rate": 5.043721903599193e-06, + "loss": 0.0199, + "step": 10710 + }, + { + "epoch": 2.740286298568507, + "grad_norm": 0.7619210481643677, + "learning_rate": 5.0347992286769324e-06, + "loss": 0.0349, + "step": 10720 + }, + { + "epoch": 2.742842535787321, + "grad_norm": 0.09413593262434006, + "learning_rate": 5.025876442927429e-06, + "loss": 0.0579, + "step": 10730 + }, + { + "epoch": 2.745398773006135, + "grad_norm": 2.7584242820739746, + "learning_rate": 5.016953574767629e-06, + "loss": 0.0824, + "step": 10740 + }, + { + "epoch": 2.747955010224949, + "grad_norm": 3.956817626953125, + "learning_rate": 5.008030652614737e-06, + "loss": 0.0461, + "step": 10750 + }, + { + "epoch": 2.7505112474437627, + "grad_norm": 0.14918692409992218, + "learning_rate": 4.99910770488613e-06, + "loss": 0.0116, + "step": 10760 + }, + { + "epoch": 2.7530674846625764, + "grad_norm": 4.674230098724365, + "learning_rate": 4.990184759999271e-06, + "loss": 0.0704, + "step": 10770 + }, + { + "epoch": 2.7556237218813906, + "grad_norm": 4.550516128540039, + "learning_rate": 4.981261846371612e-06, + "loss": 0.0328, + "step": 10780 + }, + { + "epoch": 2.7581799591002047, + "grad_norm": 5.67306661605835, + "learning_rate": 4.972338992420501e-06, + "loss": 0.0425, + "step": 10790 + }, + { + "epoch": 2.7607361963190185, + "grad_norm": 3.2620246410369873, + "learning_rate": 4.9634162265631016e-06, + "loss": 0.0281, + "step": 10800 + }, + { + "epoch": 2.763292433537832, + "grad_norm": 5.77325963973999, + "learning_rate": 4.954493577216294e-06, + "loss": 0.0263, + "step": 10810 + }, + { + "epoch": 2.7658486707566463, + "grad_norm": 7.105217933654785, + "learning_rate": 4.9455710727965886e-06, + "loss": 0.0971, + "step": 10820 + }, + { + "epoch": 2.76840490797546, + "grad_norm": 8.464949607849121, + "learning_rate": 4.936648741720032e-06, + "loss": 0.0459, + "step": 10830 + }, + { + "epoch": 2.770961145194274, + "grad_norm": 9.054972648620605, + "learning_rate": 4.9277266124021245e-06, + "loss": 0.0335, + "step": 10840 + }, + { + "epoch": 2.773517382413088, + "grad_norm": 1.2454347610473633, + "learning_rate": 4.918804713257715e-06, + "loss": 0.0471, + "step": 10850 + }, + { + "epoch": 2.7760736196319016, + "grad_norm": 2.4472923278808594, + "learning_rate": 4.909883072700928e-06, + "loss": 0.0462, + "step": 10860 + }, + { + "epoch": 2.7786298568507157, + "grad_norm": 0.04563615098595619, + "learning_rate": 4.900961719145056e-06, + "loss": 0.0167, + "step": 10870 + }, + { + "epoch": 2.78118609406953, + "grad_norm": 5.5846734046936035, + "learning_rate": 4.892040681002488e-06, + "loss": 0.0578, + "step": 10880 + }, + { + "epoch": 2.7837423312883436, + "grad_norm": 4.339868068695068, + "learning_rate": 4.883119986684596e-06, + "loss": 0.0273, + "step": 10890 + }, + { + "epoch": 2.7862985685071573, + "grad_norm": 4.785184383392334, + "learning_rate": 4.87419966460167e-06, + "loss": 0.076, + "step": 10900 + }, + { + "epoch": 2.7888548057259714, + "grad_norm": 0.035292405635118484, + "learning_rate": 4.865279743162804e-06, + "loss": 0.0462, + "step": 10910 + }, + { + "epoch": 2.791411042944785, + "grad_norm": 3.155709743499756, + "learning_rate": 4.856360250775821e-06, + "loss": 0.036, + "step": 10920 + }, + { + "epoch": 2.7939672801635993, + "grad_norm": 0.7432450652122498, + "learning_rate": 4.847441215847177e-06, + "loss": 0.0619, + "step": 10930 + }, + { + "epoch": 2.796523517382413, + "grad_norm": 5.327913761138916, + "learning_rate": 4.838522666781871e-06, + "loss": 0.0647, + "step": 10940 + }, + { + "epoch": 2.799079754601227, + "grad_norm": 2.4953877925872803, + "learning_rate": 4.829604631983353e-06, + "loss": 0.0392, + "step": 10950 + }, + { + "epoch": 2.801635991820041, + "grad_norm": 3.6846439838409424, + "learning_rate": 4.8206871398534385e-06, + "loss": 0.0368, + "step": 10960 + }, + { + "epoch": 2.804192229038855, + "grad_norm": 5.844122886657715, + "learning_rate": 4.811770218792212e-06, + "loss": 0.0476, + "step": 10970 + }, + { + "epoch": 2.8067484662576687, + "grad_norm": 4.004204273223877, + "learning_rate": 4.80285389719794e-06, + "loss": 0.0589, + "step": 10980 + }, + { + "epoch": 2.8093047034764824, + "grad_norm": 0.9968608021736145, + "learning_rate": 4.793938203466979e-06, + "loss": 0.0448, + "step": 10990 + }, + { + "epoch": 2.8118609406952966, + "grad_norm": 6.936352252960205, + "learning_rate": 4.78502316599369e-06, + "loss": 0.0447, + "step": 11000 + }, + { + "epoch": 2.8144171779141103, + "grad_norm": 4.1466383934021, + "learning_rate": 4.776108813170337e-06, + "loss": 0.0406, + "step": 11010 + }, + { + "epoch": 2.8169734151329244, + "grad_norm": 12.088165283203125, + "learning_rate": 4.76719517338701e-06, + "loss": 0.0544, + "step": 11020 + }, + { + "epoch": 2.819529652351738, + "grad_norm": 3.7247049808502197, + "learning_rate": 4.758282275031524e-06, + "loss": 0.0304, + "step": 11030 + }, + { + "epoch": 2.8220858895705523, + "grad_norm": 5.583109378814697, + "learning_rate": 4.7493701464893366e-06, + "loss": 0.0326, + "step": 11040 + }, + { + "epoch": 2.824642126789366, + "grad_norm": 1.8860771656036377, + "learning_rate": 4.740458816143447e-06, + "loss": 0.0268, + "step": 11050 + }, + { + "epoch": 2.82719836400818, + "grad_norm": 2.164116144180298, + "learning_rate": 4.731548312374323e-06, + "loss": 0.0403, + "step": 11060 + }, + { + "epoch": 2.829754601226994, + "grad_norm": 3.961606740951538, + "learning_rate": 4.722638663559787e-06, + "loss": 0.039, + "step": 11070 + }, + { + "epoch": 2.8323108384458076, + "grad_norm": 0.07476239651441574, + "learning_rate": 4.713729898074949e-06, + "loss": 0.0522, + "step": 11080 + }, + { + "epoch": 2.8348670756646217, + "grad_norm": 4.681721210479736, + "learning_rate": 4.704822044292103e-06, + "loss": 0.0413, + "step": 11090 + }, + { + "epoch": 2.837423312883436, + "grad_norm": 4.108366012573242, + "learning_rate": 4.695915130580636e-06, + "loss": 0.0305, + "step": 11100 + }, + { + "epoch": 2.8399795501022496, + "grad_norm": 0.2699336111545563, + "learning_rate": 4.687009185306945e-06, + "loss": 0.0495, + "step": 11110 + }, + { + "epoch": 2.8425357873210633, + "grad_norm": 3.466141939163208, + "learning_rate": 4.678104236834341e-06, + "loss": 0.0725, + "step": 11120 + }, + { + "epoch": 2.8450920245398774, + "grad_norm": 3.030548334121704, + "learning_rate": 4.6692003135229606e-06, + "loss": 0.0405, + "step": 11130 + }, + { + "epoch": 2.847648261758691, + "grad_norm": 4.3781938552856445, + "learning_rate": 4.660297443729675e-06, + "loss": 0.0209, + "step": 11140 + }, + { + "epoch": 2.8502044989775053, + "grad_norm": 0.2208949774503708, + "learning_rate": 4.6513956558080034e-06, + "loss": 0.0237, + "step": 11150 + }, + { + "epoch": 2.852760736196319, + "grad_norm": 4.45728874206543, + "learning_rate": 4.642494978108014e-06, + "loss": 0.0528, + "step": 11160 + }, + { + "epoch": 2.8553169734151327, + "grad_norm": 6.202856063842773, + "learning_rate": 4.633595438976244e-06, + "loss": 0.0534, + "step": 11170 + }, + { + "epoch": 2.857873210633947, + "grad_norm": 3.93393874168396, + "learning_rate": 4.624697066755602e-06, + "loss": 0.0261, + "step": 11180 + }, + { + "epoch": 2.860429447852761, + "grad_norm": 1.9619215726852417, + "learning_rate": 4.6157998897852815e-06, + "loss": 0.0429, + "step": 11190 + }, + { + "epoch": 2.8629856850715747, + "grad_norm": 5.04984188079834, + "learning_rate": 4.606903936400667e-06, + "loss": 0.0428, + "step": 11200 + }, + { + "epoch": 2.8655419222903884, + "grad_norm": 3.097203254699707, + "learning_rate": 4.5980092349332525e-06, + "loss": 0.0336, + "step": 11210 + }, + { + "epoch": 2.8680981595092025, + "grad_norm": 1.7928495407104492, + "learning_rate": 4.589115813710535e-06, + "loss": 0.0516, + "step": 11220 + }, + { + "epoch": 2.8706543967280163, + "grad_norm": 3.5692665576934814, + "learning_rate": 4.580223701055945e-06, + "loss": 0.0328, + "step": 11230 + }, + { + "epoch": 2.8732106339468304, + "grad_norm": 1.9397566318511963, + "learning_rate": 4.571332925288735e-06, + "loss": 0.0255, + "step": 11240 + }, + { + "epoch": 2.875766871165644, + "grad_norm": 3.0860631465911865, + "learning_rate": 4.562443514723911e-06, + "loss": 0.0356, + "step": 11250 + }, + { + "epoch": 2.878323108384458, + "grad_norm": 3.6334643363952637, + "learning_rate": 4.553555497672119e-06, + "loss": 0.0535, + "step": 11260 + }, + { + "epoch": 2.880879345603272, + "grad_norm": 5.285019397735596, + "learning_rate": 4.544668902439577e-06, + "loss": 0.073, + "step": 11270 + }, + { + "epoch": 2.883435582822086, + "grad_norm": 0.21129778027534485, + "learning_rate": 4.53578375732797e-06, + "loss": 0.0175, + "step": 11280 + }, + { + "epoch": 2.8859918200409, + "grad_norm": 0.07329968363046646, + "learning_rate": 4.526900090634368e-06, + "loss": 0.0222, + "step": 11290 + }, + { + "epoch": 2.8885480572597135, + "grad_norm": 2.5236427783966064, + "learning_rate": 4.518017930651128e-06, + "loss": 0.0439, + "step": 11300 + }, + { + "epoch": 2.8911042944785277, + "grad_norm": 0.4075072407722473, + "learning_rate": 4.509137305665812e-06, + "loss": 0.0405, + "step": 11310 + }, + { + "epoch": 2.8936605316973414, + "grad_norm": 1.6199369430541992, + "learning_rate": 4.5002582439610895e-06, + "loss": 0.019, + "step": 11320 + }, + { + "epoch": 2.8962167689161555, + "grad_norm": 0.04643448814749718, + "learning_rate": 4.491380773814659e-06, + "loss": 0.0212, + "step": 11330 + }, + { + "epoch": 2.8987730061349692, + "grad_norm": 1.4235713481903076, + "learning_rate": 4.4825049234991405e-06, + "loss": 0.0105, + "step": 11340 + }, + { + "epoch": 2.9013292433537834, + "grad_norm": 0.04633248969912529, + "learning_rate": 4.473630721282004e-06, + "loss": 0.0261, + "step": 11350 + }, + { + "epoch": 2.903885480572597, + "grad_norm": 5.469078063964844, + "learning_rate": 4.464758195425464e-06, + "loss": 0.0275, + "step": 11360 + }, + { + "epoch": 2.9064417177914113, + "grad_norm": 0.15273931622505188, + "learning_rate": 4.455887374186401e-06, + "loss": 0.0297, + "step": 11370 + }, + { + "epoch": 2.908997955010225, + "grad_norm": 0.10551747679710388, + "learning_rate": 4.447018285816263e-06, + "loss": 0.0285, + "step": 11380 + }, + { + "epoch": 2.9115541922290387, + "grad_norm": 0.063129723072052, + "learning_rate": 4.438150958560983e-06, + "loss": 0.028, + "step": 11390 + }, + { + "epoch": 2.914110429447853, + "grad_norm": 0.8330835700035095, + "learning_rate": 4.42928542066088e-06, + "loss": 0.0227, + "step": 11400 + }, + { + "epoch": 2.9166666666666665, + "grad_norm": 5.185162544250488, + "learning_rate": 4.420421700350581e-06, + "loss": 0.0378, + "step": 11410 + }, + { + "epoch": 2.9192229038854807, + "grad_norm": 0.602056622505188, + "learning_rate": 4.4115598258589165e-06, + "loss": 0.0259, + "step": 11420 + }, + { + "epoch": 2.9217791411042944, + "grad_norm": 3.8201723098754883, + "learning_rate": 4.402699825408849e-06, + "loss": 0.0373, + "step": 11430 + }, + { + "epoch": 2.9243353783231085, + "grad_norm": 0.2384403496980667, + "learning_rate": 4.393841727217361e-06, + "loss": 0.0158, + "step": 11440 + }, + { + "epoch": 2.9268916155419222, + "grad_norm": 2.9862217903137207, + "learning_rate": 4.384985559495387e-06, + "loss": 0.0573, + "step": 11450 + }, + { + "epoch": 2.9294478527607364, + "grad_norm": 5.518589019775391, + "learning_rate": 4.376131350447703e-06, + "loss": 0.0331, + "step": 11460 + }, + { + "epoch": 2.93200408997955, + "grad_norm": 6.048367500305176, + "learning_rate": 4.36727912827286e-06, + "loss": 0.0422, + "step": 11470 + }, + { + "epoch": 2.934560327198364, + "grad_norm": 5.123732089996338, + "learning_rate": 4.358428921163066e-06, + "loss": 0.0287, + "step": 11480 + }, + { + "epoch": 2.937116564417178, + "grad_norm": 4.53354549407959, + "learning_rate": 4.349580757304127e-06, + "loss": 0.0191, + "step": 11490 + }, + { + "epoch": 2.939672801635992, + "grad_norm": 1.6047019958496094, + "learning_rate": 4.34073466487533e-06, + "loss": 0.0529, + "step": 11500 + }, + { + "epoch": 2.942229038854806, + "grad_norm": 0.1400771290063858, + "learning_rate": 4.331890672049371e-06, + "loss": 0.029, + "step": 11510 + }, + { + "epoch": 2.9447852760736195, + "grad_norm": 4.497285842895508, + "learning_rate": 4.323048806992257e-06, + "loss": 0.031, + "step": 11520 + }, + { + "epoch": 2.9473415132924337, + "grad_norm": 5.1836442947387695, + "learning_rate": 4.31420909786322e-06, + "loss": 0.0347, + "step": 11530 + }, + { + "epoch": 2.9498977505112474, + "grad_norm": 0.12893950939178467, + "learning_rate": 4.305371572814623e-06, + "loss": 0.0141, + "step": 11540 + }, + { + "epoch": 2.9524539877300615, + "grad_norm": 5.480885028839111, + "learning_rate": 4.296536259991876e-06, + "loss": 0.0223, + "step": 11550 + }, + { + "epoch": 2.955010224948875, + "grad_norm": 15.032180786132812, + "learning_rate": 4.287703187533346e-06, + "loss": 0.0722, + "step": 11560 + }, + { + "epoch": 2.957566462167689, + "grad_norm": 2.98856520652771, + "learning_rate": 4.278872383570256e-06, + "loss": 0.0248, + "step": 11570 + }, + { + "epoch": 2.960122699386503, + "grad_norm": 3.5357167720794678, + "learning_rate": 4.270043876226616e-06, + "loss": 0.0385, + "step": 11580 + }, + { + "epoch": 2.9626789366053172, + "grad_norm": 1.0948529243469238, + "learning_rate": 4.2612176936191104e-06, + "loss": 0.0293, + "step": 11590 + }, + { + "epoch": 2.965235173824131, + "grad_norm": 1.0036929845809937, + "learning_rate": 4.252393863857033e-06, + "loss": 0.0598, + "step": 11600 + }, + { + "epoch": 2.9677914110429446, + "grad_norm": 5.068575382232666, + "learning_rate": 4.243572415042168e-06, + "loss": 0.0479, + "step": 11610 + }, + { + "epoch": 2.970347648261759, + "grad_norm": 2.0871167182922363, + "learning_rate": 4.2347533752687335e-06, + "loss": 0.0228, + "step": 11620 + }, + { + "epoch": 2.9729038854805725, + "grad_norm": 0.04474279657006264, + "learning_rate": 4.225936772623262e-06, + "loss": 0.0119, + "step": 11630 + }, + { + "epoch": 2.9754601226993866, + "grad_norm": 3.878139019012451, + "learning_rate": 4.217122635184532e-06, + "loss": 0.0333, + "step": 11640 + }, + { + "epoch": 2.9780163599182004, + "grad_norm": 0.04483529180288315, + "learning_rate": 4.208310991023469e-06, + "loss": 0.0411, + "step": 11650 + }, + { + "epoch": 2.980572597137014, + "grad_norm": 0.06956873834133148, + "learning_rate": 4.199501868203059e-06, + "loss": 0.015, + "step": 11660 + }, + { + "epoch": 2.983128834355828, + "grad_norm": 4.716834545135498, + "learning_rate": 4.190695294778254e-06, + "loss": 0.0272, + "step": 11670 + }, + { + "epoch": 2.9856850715746424, + "grad_norm": 4.978919506072998, + "learning_rate": 4.1818912987958935e-06, + "loss": 0.0349, + "step": 11680 + }, + { + "epoch": 2.988241308793456, + "grad_norm": 4.98551607131958, + "learning_rate": 4.1730899082946e-06, + "loss": 0.0391, + "step": 11690 + }, + { + "epoch": 2.9907975460122698, + "grad_norm": 0.028066415339708328, + "learning_rate": 4.164291151304707e-06, + "loss": 0.0366, + "step": 11700 + }, + { + "epoch": 2.993353783231084, + "grad_norm": 3.7603607177734375, + "learning_rate": 4.155495055848154e-06, + "loss": 0.0309, + "step": 11710 + }, + { + "epoch": 2.9959100204498976, + "grad_norm": 6.2368621826171875, + "learning_rate": 4.146701649938409e-06, + "loss": 0.0526, + "step": 11720 + }, + { + "epoch": 2.9984662576687118, + "grad_norm": 1.746232032775879, + "learning_rate": 4.13791096158037e-06, + "loss": 0.018, + "step": 11730 + }, + { + "epoch": 3.0010224948875255, + "grad_norm": 3.928952693939209, + "learning_rate": 4.129123018770285e-06, + "loss": 0.0108, + "step": 11740 + }, + { + "epoch": 3.0035787321063396, + "grad_norm": 0.7030458450317383, + "learning_rate": 4.120337849495654e-06, + "loss": 0.019, + "step": 11750 + }, + { + "epoch": 3.0061349693251533, + "grad_norm": 1.5258599519729614, + "learning_rate": 4.111555481735147e-06, + "loss": 0.0215, + "step": 11760 + }, + { + "epoch": 3.0086912065439675, + "grad_norm": 3.1201798915863037, + "learning_rate": 4.102775943458508e-06, + "loss": 0.015, + "step": 11770 + }, + { + "epoch": 3.011247443762781, + "grad_norm": 2.5468101501464844, + "learning_rate": 4.093999262626474e-06, + "loss": 0.0092, + "step": 11780 + }, + { + "epoch": 3.013803680981595, + "grad_norm": 4.258352279663086, + "learning_rate": 4.0852254671906794e-06, + "loss": 0.0111, + "step": 11790 + }, + { + "epoch": 3.016359918200409, + "grad_norm": 4.136040210723877, + "learning_rate": 4.076454585093572e-06, + "loss": 0.0247, + "step": 11800 + }, + { + "epoch": 3.0189161554192228, + "grad_norm": 0.01770654506981373, + "learning_rate": 4.067686644268316e-06, + "loss": 0.0168, + "step": 11810 + }, + { + "epoch": 3.021472392638037, + "grad_norm": 3.165257453918457, + "learning_rate": 4.0589216726387146e-06, + "loss": 0.0157, + "step": 11820 + }, + { + "epoch": 3.0240286298568506, + "grad_norm": 1.5152426958084106, + "learning_rate": 4.050159698119107e-06, + "loss": 0.0113, + "step": 11830 + }, + { + "epoch": 3.0265848670756648, + "grad_norm": 0.025976594537496567, + "learning_rate": 4.0414007486142985e-06, + "loss": 0.0072, + "step": 11840 + }, + { + "epoch": 3.0291411042944785, + "grad_norm": 4.125540256500244, + "learning_rate": 4.032644852019447e-06, + "loss": 0.0118, + "step": 11850 + }, + { + "epoch": 3.0316973415132926, + "grad_norm": 0.026777638122439384, + "learning_rate": 4.023892036220001e-06, + "loss": 0.001, + "step": 11860 + }, + { + "epoch": 3.0342535787321063, + "grad_norm": 3.001214027404785, + "learning_rate": 4.015142329091587e-06, + "loss": 0.0372, + "step": 11870 + }, + { + "epoch": 3.03680981595092, + "grad_norm": 0.012349724769592285, + "learning_rate": 4.006395758499937e-06, + "loss": 0.0242, + "step": 11880 + }, + { + "epoch": 3.039366053169734, + "grad_norm": 0.48854807019233704, + "learning_rate": 3.99765235230079e-06, + "loss": 0.0202, + "step": 11890 + }, + { + "epoch": 3.041922290388548, + "grad_norm": 7.029765605926514, + "learning_rate": 3.988912138339812e-06, + "loss": 0.0228, + "step": 11900 + }, + { + "epoch": 3.044478527607362, + "grad_norm": 2.26522159576416, + "learning_rate": 3.980175144452496e-06, + "loss": 0.0152, + "step": 11910 + }, + { + "epoch": 3.0470347648261757, + "grad_norm": 5.204248905181885, + "learning_rate": 3.971441398464088e-06, + "loss": 0.021, + "step": 11920 + }, + { + "epoch": 3.04959100204499, + "grad_norm": 2.968381881713867, + "learning_rate": 3.962710928189481e-06, + "loss": 0.0234, + "step": 11930 + }, + { + "epoch": 3.0521472392638036, + "grad_norm": 3.710779905319214, + "learning_rate": 3.953983761433144e-06, + "loss": 0.0067, + "step": 11940 + }, + { + "epoch": 3.0547034764826178, + "grad_norm": 2.136486530303955, + "learning_rate": 3.94525992598902e-06, + "loss": 0.0096, + "step": 11950 + }, + { + "epoch": 3.0572597137014315, + "grad_norm": 0.898169219493866, + "learning_rate": 3.936539449640445e-06, + "loss": 0.007, + "step": 11960 + }, + { + "epoch": 3.0598159509202456, + "grad_norm": 7.237276077270508, + "learning_rate": 3.927822360160053e-06, + "loss": 0.0261, + "step": 11970 + }, + { + "epoch": 3.0623721881390593, + "grad_norm": 2.5147705078125, + "learning_rate": 3.919108685309699e-06, + "loss": 0.014, + "step": 11980 + }, + { + "epoch": 3.064928425357873, + "grad_norm": 3.493708372116089, + "learning_rate": 3.9103984528403555e-06, + "loss": 0.0213, + "step": 11990 + }, + { + "epoch": 3.067484662576687, + "grad_norm": 1.2625579833984375, + "learning_rate": 3.901691690492035e-06, + "loss": 0.0161, + "step": 12000 + }, + { + "epoch": 3.070040899795501, + "grad_norm": 3.3386011123657227, + "learning_rate": 3.892988425993703e-06, + "loss": 0.004, + "step": 12010 + }, + { + "epoch": 3.072597137014315, + "grad_norm": 1.1990747451782227, + "learning_rate": 3.884288687063177e-06, + "loss": 0.0109, + "step": 12020 + }, + { + "epoch": 3.0751533742331287, + "grad_norm": 1.3895822763442993, + "learning_rate": 3.875592501407052e-06, + "loss": 0.0272, + "step": 12030 + }, + { + "epoch": 3.077709611451943, + "grad_norm": 9.504667282104492, + "learning_rate": 3.866899896720604e-06, + "loss": 0.0211, + "step": 12040 + }, + { + "epoch": 3.0802658486707566, + "grad_norm": 10.509309768676758, + "learning_rate": 3.858210900687707e-06, + "loss": 0.0174, + "step": 12050 + }, + { + "epoch": 3.0828220858895707, + "grad_norm": 0.08506203442811966, + "learning_rate": 3.849525540980739e-06, + "loss": 0.0087, + "step": 12060 + }, + { + "epoch": 3.0853783231083844, + "grad_norm": 1.2189379930496216, + "learning_rate": 3.840843845260501e-06, + "loss": 0.0119, + "step": 12070 + }, + { + "epoch": 3.087934560327198, + "grad_norm": 0.03395168483257294, + "learning_rate": 3.832165841176121e-06, + "loss": 0.0163, + "step": 12080 + }, + { + "epoch": 3.0904907975460123, + "grad_norm": 4.858822345733643, + "learning_rate": 3.823491556364973e-06, + "loss": 0.0104, + "step": 12090 + }, + { + "epoch": 3.093047034764826, + "grad_norm": 0.15337003767490387, + "learning_rate": 3.814821018452583e-06, + "loss": 0.0249, + "step": 12100 + }, + { + "epoch": 3.09560327198364, + "grad_norm": 6.41199254989624, + "learning_rate": 3.806154255052551e-06, + "loss": 0.0067, + "step": 12110 + }, + { + "epoch": 3.098159509202454, + "grad_norm": 1.0053160190582275, + "learning_rate": 3.7974912937664455e-06, + "loss": 0.0299, + "step": 12120 + }, + { + "epoch": 3.100715746421268, + "grad_norm": 1.6271339654922485, + "learning_rate": 3.7888321621837363e-06, + "loss": 0.0053, + "step": 12130 + }, + { + "epoch": 3.1032719836400817, + "grad_norm": 0.03732278570532799, + "learning_rate": 3.7801768878816892e-06, + "loss": 0.0089, + "step": 12140 + }, + { + "epoch": 3.105828220858896, + "grad_norm": 4.223018646240234, + "learning_rate": 3.771525498425289e-06, + "loss": 0.0107, + "step": 12150 + }, + { + "epoch": 3.1083844580777096, + "grad_norm": 1.2061896324157715, + "learning_rate": 3.762878021367148e-06, + "loss": 0.0154, + "step": 12160 + }, + { + "epoch": 3.1109406952965237, + "grad_norm": 2.464517831802368, + "learning_rate": 3.754234484247418e-06, + "loss": 0.0078, + "step": 12170 + }, + { + "epoch": 3.1134969325153374, + "grad_norm": 0.042976122349500656, + "learning_rate": 3.745594914593701e-06, + "loss": 0.0114, + "step": 12180 + }, + { + "epoch": 3.116053169734151, + "grad_norm": 0.11069530993700027, + "learning_rate": 3.7369593399209704e-06, + "loss": 0.0111, + "step": 12190 + }, + { + "epoch": 3.1186094069529653, + "grad_norm": 0.14891409873962402, + "learning_rate": 3.728327787731465e-06, + "loss": 0.0084, + "step": 12200 + }, + { + "epoch": 3.121165644171779, + "grad_norm": 0.02942030318081379, + "learning_rate": 3.7197002855146257e-06, + "loss": 0.011, + "step": 12210 + }, + { + "epoch": 3.123721881390593, + "grad_norm": 3.233976364135742, + "learning_rate": 3.7110768607469842e-06, + "loss": 0.0082, + "step": 12220 + }, + { + "epoch": 3.126278118609407, + "grad_norm": 3.62264084815979, + "learning_rate": 3.7024575408920958e-06, + "loss": 0.009, + "step": 12230 + }, + { + "epoch": 3.128834355828221, + "grad_norm": 0.051736973226070404, + "learning_rate": 3.693842353400435e-06, + "loss": 0.0276, + "step": 12240 + }, + { + "epoch": 3.1313905930470347, + "grad_norm": 1.5636509656906128, + "learning_rate": 3.6852313257093214e-06, + "loss": 0.0283, + "step": 12250 + }, + { + "epoch": 3.133946830265849, + "grad_norm": 3.639524221420288, + "learning_rate": 3.6766244852428218e-06, + "loss": 0.0209, + "step": 12260 + }, + { + "epoch": 3.1365030674846626, + "grad_norm": 2.127938985824585, + "learning_rate": 3.6680218594116725e-06, + "loss": 0.0079, + "step": 12270 + }, + { + "epoch": 3.1390593047034763, + "grad_norm": 5.6783447265625, + "learning_rate": 3.6594234756131826e-06, + "loss": 0.0194, + "step": 12280 + }, + { + "epoch": 3.1416155419222904, + "grad_norm": 0.3146345615386963, + "learning_rate": 3.6508293612311552e-06, + "loss": 0.0153, + "step": 12290 + }, + { + "epoch": 3.144171779141104, + "grad_norm": 0.37290289998054504, + "learning_rate": 3.642239543635793e-06, + "loss": 0.0235, + "step": 12300 + }, + { + "epoch": 3.1467280163599183, + "grad_norm": 0.22575929760932922, + "learning_rate": 3.6336540501836185e-06, + "loss": 0.0109, + "step": 12310 + }, + { + "epoch": 3.149284253578732, + "grad_norm": 3.687939405441284, + "learning_rate": 3.625072908217378e-06, + "loss": 0.0177, + "step": 12320 + }, + { + "epoch": 3.151840490797546, + "grad_norm": 0.08439797908067703, + "learning_rate": 3.6164961450659634e-06, + "loss": 0.0045, + "step": 12330 + }, + { + "epoch": 3.15439672801636, + "grad_norm": 2.362006425857544, + "learning_rate": 3.6079237880443186e-06, + "loss": 0.0142, + "step": 12340 + }, + { + "epoch": 3.156952965235174, + "grad_norm": 0.579308807849884, + "learning_rate": 3.599355864453357e-06, + "loss": 0.0074, + "step": 12350 + }, + { + "epoch": 3.1595092024539877, + "grad_norm": 0.3662513494491577, + "learning_rate": 3.5907924015798697e-06, + "loss": 0.0133, + "step": 12360 + }, + { + "epoch": 3.1620654396728014, + "grad_norm": 0.22020725905895233, + "learning_rate": 3.5822334266964454e-06, + "loss": 0.0245, + "step": 12370 + }, + { + "epoch": 3.1646216768916156, + "grad_norm": 0.26042699813842773, + "learning_rate": 3.573678967061374e-06, + "loss": 0.0039, + "step": 12380 + }, + { + "epoch": 3.1671779141104293, + "grad_norm": 4.502334117889404, + "learning_rate": 3.5651290499185752e-06, + "loss": 0.0135, + "step": 12390 + }, + { + "epoch": 3.1697341513292434, + "grad_norm": 0.07907534390687943, + "learning_rate": 3.556583702497489e-06, + "loss": 0.0058, + "step": 12400 + }, + { + "epoch": 3.172290388548057, + "grad_norm": 0.012879629619419575, + "learning_rate": 3.5480429520130144e-06, + "loss": 0.018, + "step": 12410 + }, + { + "epoch": 3.1748466257668713, + "grad_norm": 0.1027621328830719, + "learning_rate": 3.5395068256653984e-06, + "loss": 0.0055, + "step": 12420 + }, + { + "epoch": 3.177402862985685, + "grad_norm": 2.4270403385162354, + "learning_rate": 3.5309753506401747e-06, + "loss": 0.0186, + "step": 12430 + }, + { + "epoch": 3.179959100204499, + "grad_norm": 0.0203610397875309, + "learning_rate": 3.5224485541080476e-06, + "loss": 0.011, + "step": 12440 + }, + { + "epoch": 3.182515337423313, + "grad_norm": 3.286555528640747, + "learning_rate": 3.513926463224836e-06, + "loss": 0.0051, + "step": 12450 + }, + { + "epoch": 3.185071574642127, + "grad_norm": 0.15632130205631256, + "learning_rate": 3.5054091051313666e-06, + "loss": 0.0061, + "step": 12460 + }, + { + "epoch": 3.1876278118609407, + "grad_norm": 1.8245761394500732, + "learning_rate": 3.49689650695339e-06, + "loss": 0.0151, + "step": 12470 + }, + { + "epoch": 3.190184049079755, + "grad_norm": 0.6735230088233948, + "learning_rate": 3.4883886958015046e-06, + "loss": 0.0129, + "step": 12480 + }, + { + "epoch": 3.1927402862985685, + "grad_norm": 1.4515380859375, + "learning_rate": 3.4798856987710574e-06, + "loss": 0.0222, + "step": 12490 + }, + { + "epoch": 3.1952965235173822, + "grad_norm": 0.036662183701992035, + "learning_rate": 3.4713875429420656e-06, + "loss": 0.0235, + "step": 12500 + }, + { + "epoch": 3.1978527607361964, + "grad_norm": 2.479926109313965, + "learning_rate": 3.4628942553791285e-06, + "loss": 0.0075, + "step": 12510 + }, + { + "epoch": 3.20040899795501, + "grad_norm": 0.033283405005931854, + "learning_rate": 3.4544058631313427e-06, + "loss": 0.0105, + "step": 12520 + }, + { + "epoch": 3.2029652351738243, + "grad_norm": 0.06612569093704224, + "learning_rate": 3.44592239323221e-06, + "loss": 0.0143, + "step": 12530 + }, + { + "epoch": 3.205521472392638, + "grad_norm": 0.0648500844836235, + "learning_rate": 3.4374438726995614e-06, + "loss": 0.0086, + "step": 12540 + }, + { + "epoch": 3.208077709611452, + "grad_norm": 0.08395984768867493, + "learning_rate": 3.4289703285354587e-06, + "loss": 0.0105, + "step": 12550 + }, + { + "epoch": 3.210633946830266, + "grad_norm": 1.128602147102356, + "learning_rate": 3.4205017877261244e-06, + "loss": 0.0157, + "step": 12560 + }, + { + "epoch": 3.21319018404908, + "grad_norm": 0.026443956419825554, + "learning_rate": 3.4120382772418346e-06, + "loss": 0.0075, + "step": 12570 + }, + { + "epoch": 3.2157464212678937, + "grad_norm": 0.2616029679775238, + "learning_rate": 3.4035798240368578e-06, + "loss": 0.0085, + "step": 12580 + }, + { + "epoch": 3.2183026584867074, + "grad_norm": 3.7674038410186768, + "learning_rate": 3.3951264550493433e-06, + "loss": 0.0166, + "step": 12590 + }, + { + "epoch": 3.2208588957055215, + "grad_norm": 4.487175941467285, + "learning_rate": 3.3866781972012602e-06, + "loss": 0.0082, + "step": 12600 + }, + { + "epoch": 3.2234151329243352, + "grad_norm": 4.683178424835205, + "learning_rate": 3.378235077398292e-06, + "loss": 0.0081, + "step": 12610 + }, + { + "epoch": 3.2259713701431494, + "grad_norm": 0.07378882169723511, + "learning_rate": 3.369797122529762e-06, + "loss": 0.0126, + "step": 12620 + }, + { + "epoch": 3.228527607361963, + "grad_norm": 0.04591992124915123, + "learning_rate": 3.3613643594685436e-06, + "loss": 0.0069, + "step": 12630 + }, + { + "epoch": 3.2310838445807772, + "grad_norm": 0.039997998625040054, + "learning_rate": 3.3529368150709762e-06, + "loss": 0.0084, + "step": 12640 + }, + { + "epoch": 3.233640081799591, + "grad_norm": 0.03221229463815689, + "learning_rate": 3.344514516176778e-06, + "loss": 0.0148, + "step": 12650 + }, + { + "epoch": 3.236196319018405, + "grad_norm": 2.5336620807647705, + "learning_rate": 3.336097489608962e-06, + "loss": 0.0144, + "step": 12660 + }, + { + "epoch": 3.238752556237219, + "grad_norm": 0.19575847685337067, + "learning_rate": 3.3276857621737495e-06, + "loss": 0.009, + "step": 12670 + }, + { + "epoch": 3.2413087934560325, + "grad_norm": 4.261199951171875, + "learning_rate": 3.3192793606604877e-06, + "loss": 0.0123, + "step": 12680 + }, + { + "epoch": 3.2438650306748467, + "grad_norm": 3.218693733215332, + "learning_rate": 3.3108783118415583e-06, + "loss": 0.0124, + "step": 12690 + }, + { + "epoch": 3.2464212678936604, + "grad_norm": 0.16256259381771088, + "learning_rate": 3.3024826424722993e-06, + "loss": 0.0139, + "step": 12700 + }, + { + "epoch": 3.2489775051124745, + "grad_norm": 3.9794180393218994, + "learning_rate": 3.2940923792909134e-06, + "loss": 0.0163, + "step": 12710 + }, + { + "epoch": 3.2515337423312882, + "grad_norm": 0.19562911987304688, + "learning_rate": 3.28570754901839e-06, + "loss": 0.0087, + "step": 12720 + }, + { + "epoch": 3.2540899795501024, + "grad_norm": 1.9741108417510986, + "learning_rate": 3.2773281783584104e-06, + "loss": 0.0221, + "step": 12730 + }, + { + "epoch": 3.256646216768916, + "grad_norm": 5.769931793212891, + "learning_rate": 3.2689542939972742e-06, + "loss": 0.0191, + "step": 12740 + }, + { + "epoch": 3.2592024539877302, + "grad_norm": 3.071200370788574, + "learning_rate": 3.2605859226038038e-06, + "loss": 0.0333, + "step": 12750 + }, + { + "epoch": 3.261758691206544, + "grad_norm": 0.05902179330587387, + "learning_rate": 3.2522230908292674e-06, + "loss": 0.0056, + "step": 12760 + }, + { + "epoch": 3.2643149284253576, + "grad_norm": 4.801800727844238, + "learning_rate": 3.243865825307286e-06, + "loss": 0.03, + "step": 12770 + }, + { + "epoch": 3.266871165644172, + "grad_norm": 0.9842033386230469, + "learning_rate": 3.2355141526537636e-06, + "loss": 0.0188, + "step": 12780 + }, + { + "epoch": 3.2694274028629855, + "grad_norm": 0.02561868727207184, + "learning_rate": 3.2271680994667776e-06, + "loss": 0.0064, + "step": 12790 + }, + { + "epoch": 3.2719836400817996, + "grad_norm": 0.22291143238544464, + "learning_rate": 3.2188276923265237e-06, + "loss": 0.0054, + "step": 12800 + }, + { + "epoch": 3.2745398773006134, + "grad_norm": 2.7589244842529297, + "learning_rate": 3.2104929577952028e-06, + "loss": 0.0211, + "step": 12810 + }, + { + "epoch": 3.2770961145194275, + "grad_norm": 0.16009178757667542, + "learning_rate": 3.2021639224169615e-06, + "loss": 0.0069, + "step": 12820 + }, + { + "epoch": 3.279652351738241, + "grad_norm": 0.02730882354080677, + "learning_rate": 3.1938406127177878e-06, + "loss": 0.0145, + "step": 12830 + }, + { + "epoch": 3.2822085889570554, + "grad_norm": 0.1984373927116394, + "learning_rate": 3.1855230552054395e-06, + "loss": 0.0114, + "step": 12840 + }, + { + "epoch": 3.284764826175869, + "grad_norm": 1.7443758249282837, + "learning_rate": 3.177211276369351e-06, + "loss": 0.0084, + "step": 12850 + }, + { + "epoch": 3.287321063394683, + "grad_norm": 1.0765767097473145, + "learning_rate": 3.1689053026805573e-06, + "loss": 0.0055, + "step": 12860 + }, + { + "epoch": 3.289877300613497, + "grad_norm": 0.25870829820632935, + "learning_rate": 3.160605160591602e-06, + "loss": 0.0189, + "step": 12870 + }, + { + "epoch": 3.292433537832311, + "grad_norm": 0.007034212350845337, + "learning_rate": 3.1523108765364598e-06, + "loss": 0.0059, + "step": 12880 + }, + { + "epoch": 3.294989775051125, + "grad_norm": 3.2303411960601807, + "learning_rate": 3.1440224769304446e-06, + "loss": 0.009, + "step": 12890 + }, + { + "epoch": 3.2975460122699385, + "grad_norm": 3.338958978652954, + "learning_rate": 3.1357399881701326e-06, + "loss": 0.0126, + "step": 12900 + }, + { + "epoch": 3.3001022494887526, + "grad_norm": 0.04261789843440056, + "learning_rate": 3.1274634366332775e-06, + "loss": 0.004, + "step": 12910 + }, + { + "epoch": 3.3026584867075663, + "grad_norm": 0.30257269740104675, + "learning_rate": 3.119192848678717e-06, + "loss": 0.0025, + "step": 12920 + }, + { + "epoch": 3.3052147239263805, + "grad_norm": 0.821662962436676, + "learning_rate": 3.110928250646307e-06, + "loss": 0.0129, + "step": 12930 + }, + { + "epoch": 3.307770961145194, + "grad_norm": 0.3390525281429291, + "learning_rate": 3.1026696688568137e-06, + "loss": 0.0106, + "step": 12940 + }, + { + "epoch": 3.3103271983640083, + "grad_norm": 0.07365961372852325, + "learning_rate": 3.0944171296118574e-06, + "loss": 0.0271, + "step": 12950 + }, + { + "epoch": 3.312883435582822, + "grad_norm": 0.03872542828321457, + "learning_rate": 3.0861706591938013e-06, + "loss": 0.0106, + "step": 12960 + }, + { + "epoch": 3.315439672801636, + "grad_norm": 0.08862084150314331, + "learning_rate": 3.0779302838656906e-06, + "loss": 0.0046, + "step": 12970 + }, + { + "epoch": 3.31799591002045, + "grad_norm": 4.23959493637085, + "learning_rate": 3.0696960298711525e-06, + "loss": 0.0028, + "step": 12980 + }, + { + "epoch": 3.3205521472392636, + "grad_norm": 0.6766570806503296, + "learning_rate": 3.0614679234343242e-06, + "loss": 0.0076, + "step": 12990 + }, + { + "epoch": 3.3231083844580778, + "grad_norm": 6.0040130615234375, + "learning_rate": 3.05324599075976e-06, + "loss": 0.0292, + "step": 13000 + }, + { + "epoch": 3.3256646216768915, + "grad_norm": 1.5274661779403687, + "learning_rate": 3.0450302580323553e-06, + "loss": 0.0104, + "step": 13010 + }, + { + "epoch": 3.3282208588957056, + "grad_norm": 2.2926924228668213, + "learning_rate": 3.036820751417259e-06, + "loss": 0.038, + "step": 13020 + }, + { + "epoch": 3.3307770961145193, + "grad_norm": 2.260282278060913, + "learning_rate": 3.0286174970597916e-06, + "loss": 0.0122, + "step": 13030 + }, + { + "epoch": 3.3333333333333335, + "grad_norm": 0.12744282186031342, + "learning_rate": 3.02042052108536e-06, + "loss": 0.0079, + "step": 13040 + }, + { + "epoch": 3.335889570552147, + "grad_norm": 0.016303053125739098, + "learning_rate": 3.0122298495993803e-06, + "loss": 0.0297, + "step": 13050 + }, + { + "epoch": 3.3384458077709613, + "grad_norm": 0.024792036041617393, + "learning_rate": 3.0040455086871846e-06, + "loss": 0.015, + "step": 13060 + }, + { + "epoch": 3.341002044989775, + "grad_norm": 3.460494041442871, + "learning_rate": 2.995867524413949e-06, + "loss": 0.0163, + "step": 13070 + }, + { + "epoch": 3.3435582822085887, + "grad_norm": 0.18595871329307556, + "learning_rate": 2.9876959228246006e-06, + "loss": 0.0047, + "step": 13080 + }, + { + "epoch": 3.346114519427403, + "grad_norm": 3.1711583137512207, + "learning_rate": 2.9795307299437425e-06, + "loss": 0.0171, + "step": 13090 + }, + { + "epoch": 3.3486707566462166, + "grad_norm": 0.08736535161733627, + "learning_rate": 2.971371971775565e-06, + "loss": 0.0196, + "step": 13100 + }, + { + "epoch": 3.3512269938650308, + "grad_norm": 0.05008082464337349, + "learning_rate": 2.96321967430377e-06, + "loss": 0.0042, + "step": 13110 + }, + { + "epoch": 3.3537832310838445, + "grad_norm": 2.4688546657562256, + "learning_rate": 2.9550738634914765e-06, + "loss": 0.0086, + "step": 13120 + }, + { + "epoch": 3.3563394683026586, + "grad_norm": 2.240190267562866, + "learning_rate": 2.946934565281151e-06, + "loss": 0.0203, + "step": 13130 + }, + { + "epoch": 3.3588957055214723, + "grad_norm": 0.22563564777374268, + "learning_rate": 2.9388018055945157e-06, + "loss": 0.0028, + "step": 13140 + }, + { + "epoch": 3.3614519427402865, + "grad_norm": 3.0712578296661377, + "learning_rate": 2.930675610332473e-06, + "loss": 0.019, + "step": 13150 + }, + { + "epoch": 3.3640081799591, + "grad_norm": 2.705103635787964, + "learning_rate": 2.9225560053750113e-06, + "loss": 0.0041, + "step": 13160 + }, + { + "epoch": 3.366564417177914, + "grad_norm": 0.05551149323582649, + "learning_rate": 2.9144430165811423e-06, + "loss": 0.0132, + "step": 13170 + }, + { + "epoch": 3.369120654396728, + "grad_norm": 0.1891528069972992, + "learning_rate": 2.9063366697887947e-06, + "loss": 0.0135, + "step": 13180 + }, + { + "epoch": 3.3716768916155417, + "grad_norm": 0.014974648132920265, + "learning_rate": 2.898236990814751e-06, + "loss": 0.0119, + "step": 13190 + }, + { + "epoch": 3.374233128834356, + "grad_norm": 3.4752650260925293, + "learning_rate": 2.890144005454557e-06, + "loss": 0.0181, + "step": 13200 + }, + { + "epoch": 3.3767893660531696, + "grad_norm": 2.042994976043701, + "learning_rate": 2.8820577394824433e-06, + "loss": 0.0029, + "step": 13210 + }, + { + "epoch": 3.3793456032719837, + "grad_norm": 7.3295769691467285, + "learning_rate": 2.873978218651233e-06, + "loss": 0.0173, + "step": 13220 + }, + { + "epoch": 3.3819018404907975, + "grad_norm": 0.04749654605984688, + "learning_rate": 2.8659054686922757e-06, + "loss": 0.0123, + "step": 13230 + }, + { + "epoch": 3.3844580777096116, + "grad_norm": 0.024615732952952385, + "learning_rate": 2.8578395153153536e-06, + "loss": 0.0077, + "step": 13240 + }, + { + "epoch": 3.3870143149284253, + "grad_norm": 0.014985025860369205, + "learning_rate": 2.849780384208607e-06, + "loss": 0.0039, + "step": 13250 + }, + { + "epoch": 3.3895705521472395, + "grad_norm": 0.13005146384239197, + "learning_rate": 2.8417281010384396e-06, + "loss": 0.0251, + "step": 13260 + }, + { + "epoch": 3.392126789366053, + "grad_norm": 0.07327497750520706, + "learning_rate": 2.8336826914494607e-06, + "loss": 0.0027, + "step": 13270 + }, + { + "epoch": 3.3946830265848673, + "grad_norm": 0.07815208286046982, + "learning_rate": 2.8256441810643755e-06, + "loss": 0.0119, + "step": 13280 + }, + { + "epoch": 3.397239263803681, + "grad_norm": 1.9264451265335083, + "learning_rate": 2.8176125954839247e-06, + "loss": 0.0107, + "step": 13290 + }, + { + "epoch": 3.3997955010224947, + "grad_norm": 3.673927068710327, + "learning_rate": 2.8095879602867877e-06, + "loss": 0.0077, + "step": 13300 + }, + { + "epoch": 3.402351738241309, + "grad_norm": 2.514970064163208, + "learning_rate": 2.8015703010295214e-06, + "loss": 0.0301, + "step": 13310 + }, + { + "epoch": 3.4049079754601226, + "grad_norm": 2.072049379348755, + "learning_rate": 2.793559643246451e-06, + "loss": 0.0028, + "step": 13320 + }, + { + "epoch": 3.4074642126789367, + "grad_norm": 2.3494277000427246, + "learning_rate": 2.7855560124496146e-06, + "loss": 0.0079, + "step": 13330 + }, + { + "epoch": 3.4100204498977504, + "grad_norm": 2.0031983852386475, + "learning_rate": 2.777559434128666e-06, + "loss": 0.0137, + "step": 13340 + }, + { + "epoch": 3.4125766871165646, + "grad_norm": 4.773671627044678, + "learning_rate": 2.7695699337507996e-06, + "loss": 0.0102, + "step": 13350 + }, + { + "epoch": 3.4151329243353783, + "grad_norm": 0.5617696642875671, + "learning_rate": 2.7615875367606704e-06, + "loss": 0.0155, + "step": 13360 + }, + { + "epoch": 3.4176891615541924, + "grad_norm": 5.82913875579834, + "learning_rate": 2.753612268580306e-06, + "loss": 0.0117, + "step": 13370 + }, + { + "epoch": 3.420245398773006, + "grad_norm": 0.17889423668384552, + "learning_rate": 2.7456441546090335e-06, + "loss": 0.0077, + "step": 13380 + }, + { + "epoch": 3.42280163599182, + "grad_norm": 3.2761387825012207, + "learning_rate": 2.7376832202233962e-06, + "loss": 0.0039, + "step": 13390 + }, + { + "epoch": 3.425357873210634, + "grad_norm": 0.0072940983809530735, + "learning_rate": 2.7297294907770735e-06, + "loss": 0.0059, + "step": 13400 + }, + { + "epoch": 3.4279141104294477, + "grad_norm": 0.05972537025809288, + "learning_rate": 2.7217829916007888e-06, + "loss": 0.0119, + "step": 13410 + }, + { + "epoch": 3.430470347648262, + "grad_norm": 1.3629683256149292, + "learning_rate": 2.713843748002256e-06, + "loss": 0.0102, + "step": 13420 + }, + { + "epoch": 3.4330265848670756, + "grad_norm": 2.336515188217163, + "learning_rate": 2.7059117852660667e-06, + "loss": 0.0082, + "step": 13430 + }, + { + "epoch": 3.4355828220858897, + "grad_norm": 0.766312837600708, + "learning_rate": 2.697987128653633e-06, + "loss": 0.0148, + "step": 13440 + }, + { + "epoch": 3.4381390593047034, + "grad_norm": 0.01915799267590046, + "learning_rate": 2.6900698034030904e-06, + "loss": 0.0027, + "step": 13450 + }, + { + "epoch": 3.4406952965235176, + "grad_norm": 6.4156646728515625, + "learning_rate": 2.6821598347292387e-06, + "loss": 0.0227, + "step": 13460 + }, + { + "epoch": 3.4432515337423313, + "grad_norm": 1.6114623546600342, + "learning_rate": 2.6742572478234363e-06, + "loss": 0.0045, + "step": 13470 + }, + { + "epoch": 3.445807770961145, + "grad_norm": 0.04842757061123848, + "learning_rate": 2.6663620678535396e-06, + "loss": 0.0031, + "step": 13480 + }, + { + "epoch": 3.448364008179959, + "grad_norm": 4.460205554962158, + "learning_rate": 2.658474319963812e-06, + "loss": 0.0242, + "step": 13490 + }, + { + "epoch": 3.450920245398773, + "grad_norm": 1.2775579690933228, + "learning_rate": 2.650594029274853e-06, + "loss": 0.0083, + "step": 13500 + }, + { + "epoch": 3.453476482617587, + "grad_norm": 8.932818412780762, + "learning_rate": 2.642721220883503e-06, + "loss": 0.0197, + "step": 13510 + }, + { + "epoch": 3.4560327198364007, + "grad_norm": 2.6447277069091797, + "learning_rate": 2.634855919862782e-06, + "loss": 0.0086, + "step": 13520 + }, + { + "epoch": 3.458588957055215, + "grad_norm": 4.694246292114258, + "learning_rate": 2.626998151261798e-06, + "loss": 0.0063, + "step": 13530 + }, + { + "epoch": 3.4611451942740286, + "grad_norm": 5.1632914543151855, + "learning_rate": 2.61914794010567e-06, + "loss": 0.0071, + "step": 13540 + }, + { + "epoch": 3.4637014314928427, + "grad_norm": 0.45551520586013794, + "learning_rate": 2.6113053113954456e-06, + "loss": 0.0198, + "step": 13550 + }, + { + "epoch": 3.4662576687116564, + "grad_norm": 0.023942044004797935, + "learning_rate": 2.6034702901080278e-06, + "loss": 0.0098, + "step": 13560 + }, + { + "epoch": 3.46881390593047, + "grad_norm": 1.2750016450881958, + "learning_rate": 2.5956429011960905e-06, + "loss": 0.0101, + "step": 13570 + }, + { + "epoch": 3.4713701431492843, + "grad_norm": 4.26313591003418, + "learning_rate": 2.5878231695880023e-06, + "loss": 0.0115, + "step": 13580 + }, + { + "epoch": 3.473926380368098, + "grad_norm": 0.28257378935813904, + "learning_rate": 2.5800111201877397e-06, + "loss": 0.0079, + "step": 13590 + }, + { + "epoch": 3.476482617586912, + "grad_norm": 0.5308012962341309, + "learning_rate": 2.572206777874818e-06, + "loss": 0.0096, + "step": 13600 + }, + { + "epoch": 3.479038854805726, + "grad_norm": 4.8633341789245605, + "learning_rate": 2.5644101675042066e-06, + "loss": 0.021, + "step": 13610 + }, + { + "epoch": 3.48159509202454, + "grad_norm": 2.2458882331848145, + "learning_rate": 2.5566213139062502e-06, + "loss": 0.0071, + "step": 13620 + }, + { + "epoch": 3.4841513292433537, + "grad_norm": 0.1293790638446808, + "learning_rate": 2.5488402418865854e-06, + "loss": 0.0114, + "step": 13630 + }, + { + "epoch": 3.486707566462168, + "grad_norm": 0.014333824627101421, + "learning_rate": 2.5410669762260788e-06, + "loss": 0.0146, + "step": 13640 + }, + { + "epoch": 3.4892638036809815, + "grad_norm": 4.425572395324707, + "learning_rate": 2.5333015416807192e-06, + "loss": 0.0093, + "step": 13650 + }, + { + "epoch": 3.4918200408997953, + "grad_norm": 0.04234839603304863, + "learning_rate": 2.525543962981569e-06, + "loss": 0.0049, + "step": 13660 + }, + { + "epoch": 3.4943762781186094, + "grad_norm": 0.4814109802246094, + "learning_rate": 2.5177942648346597e-06, + "loss": 0.0059, + "step": 13670 + }, + { + "epoch": 3.4969325153374236, + "grad_norm": 0.25284790992736816, + "learning_rate": 2.5100524719209387e-06, + "loss": 0.0086, + "step": 13680 + }, + { + "epoch": 3.4994887525562373, + "grad_norm": 2.6780126094818115, + "learning_rate": 2.502318608896165e-06, + "loss": 0.0078, + "step": 13690 + }, + { + "epoch": 3.502044989775051, + "grad_norm": 1.6357485055923462, + "learning_rate": 2.494592700390848e-06, + "loss": 0.0047, + "step": 13700 + }, + { + "epoch": 3.504601226993865, + "grad_norm": 0.4582887887954712, + "learning_rate": 2.4868747710101647e-06, + "loss": 0.0093, + "step": 13710 + }, + { + "epoch": 3.507157464212679, + "grad_norm": 1.8089367151260376, + "learning_rate": 2.479164845333881e-06, + "loss": 0.0039, + "step": 13720 + }, + { + "epoch": 3.509713701431493, + "grad_norm": 3.7371037006378174, + "learning_rate": 2.471462947916267e-06, + "loss": 0.0095, + "step": 13730 + }, + { + "epoch": 3.5122699386503067, + "grad_norm": 0.04978760704398155, + "learning_rate": 2.4637691032860306e-06, + "loss": 0.0093, + "step": 13740 + }, + { + "epoch": 3.5148261758691204, + "grad_norm": 0.17964474856853485, + "learning_rate": 2.456083335946232e-06, + "loss": 0.0245, + "step": 13750 + }, + { + "epoch": 3.5173824130879345, + "grad_norm": 0.01520370040088892, + "learning_rate": 2.4484056703742083e-06, + "loss": 0.01, + "step": 13760 + }, + { + "epoch": 3.5199386503067487, + "grad_norm": 0.04782997816801071, + "learning_rate": 2.4407361310214893e-06, + "loss": 0.0102, + "step": 13770 + }, + { + "epoch": 3.5224948875255624, + "grad_norm": 0.04237792268395424, + "learning_rate": 2.4330747423137314e-06, + "loss": 0.0059, + "step": 13780 + }, + { + "epoch": 3.525051124744376, + "grad_norm": 0.24677464365959167, + "learning_rate": 2.4254215286506287e-06, + "loss": 0.0035, + "step": 13790 + }, + { + "epoch": 3.5276073619631902, + "grad_norm": 2.3235230445861816, + "learning_rate": 2.4177765144058424e-06, + "loss": 0.008, + "step": 13800 + }, + { + "epoch": 3.530163599182004, + "grad_norm": 0.09863277524709702, + "learning_rate": 2.4101397239269202e-06, + "loss": 0.0169, + "step": 13810 + }, + { + "epoch": 3.532719836400818, + "grad_norm": 0.050358258187770844, + "learning_rate": 2.402511181535213e-06, + "loss": 0.0032, + "step": 13820 + }, + { + "epoch": 3.535276073619632, + "grad_norm": 0.08366558700799942, + "learning_rate": 2.3948909115258163e-06, + "loss": 0.005, + "step": 13830 + }, + { + "epoch": 3.537832310838446, + "grad_norm": 0.028095854446291924, + "learning_rate": 2.3872789381674665e-06, + "loss": 0.0131, + "step": 13840 + }, + { + "epoch": 3.5403885480572597, + "grad_norm": 0.010922097600996494, + "learning_rate": 2.3796752857024854e-06, + "loss": 0.0127, + "step": 13850 + }, + { + "epoch": 3.542944785276074, + "grad_norm": 5.2768330574035645, + "learning_rate": 2.372079978346691e-06, + "loss": 0.004, + "step": 13860 + }, + { + "epoch": 3.5455010224948875, + "grad_norm": 5.860825061798096, + "learning_rate": 2.3644930402893297e-06, + "loss": 0.0121, + "step": 13870 + }, + { + "epoch": 3.5480572597137012, + "grad_norm": 0.030172038823366165, + "learning_rate": 2.356914495692984e-06, + "loss": 0.0014, + "step": 13880 + }, + { + "epoch": 3.5506134969325154, + "grad_norm": 0.023287015035748482, + "learning_rate": 2.349344368693513e-06, + "loss": 0.0078, + "step": 13890 + }, + { + "epoch": 3.553169734151329, + "grad_norm": 0.010513374581933022, + "learning_rate": 2.3417826833999657e-06, + "loss": 0.0075, + "step": 13900 + }, + { + "epoch": 3.5557259713701432, + "grad_norm": 3.824662923812866, + "learning_rate": 2.3342294638945077e-06, + "loss": 0.0234, + "step": 13910 + }, + { + "epoch": 3.558282208588957, + "grad_norm": 1.5583800077438354, + "learning_rate": 2.3266847342323377e-06, + "loss": 0.0024, + "step": 13920 + }, + { + "epoch": 3.560838445807771, + "grad_norm": 0.9682608842849731, + "learning_rate": 2.319148518441622e-06, + "loss": 0.0043, + "step": 13930 + }, + { + "epoch": 3.563394683026585, + "grad_norm": 0.0384635366499424, + "learning_rate": 2.3116208405234107e-06, + "loss": 0.006, + "step": 13940 + }, + { + "epoch": 3.565950920245399, + "grad_norm": 0.4134227931499481, + "learning_rate": 2.304101724451564e-06, + "loss": 0.0118, + "step": 13950 + }, + { + "epoch": 3.5685071574642127, + "grad_norm": 0.014091679826378822, + "learning_rate": 2.2965911941726687e-06, + "loss": 0.0034, + "step": 13960 + }, + { + "epoch": 3.5710633946830264, + "grad_norm": 0.21840809285640717, + "learning_rate": 2.289089273605975e-06, + "loss": 0.0055, + "step": 13970 + }, + { + "epoch": 3.5736196319018405, + "grad_norm": 0.015261857770383358, + "learning_rate": 2.2815959866433096e-06, + "loss": 0.0019, + "step": 13980 + }, + { + "epoch": 3.5761758691206547, + "grad_norm": 4.033803939819336, + "learning_rate": 2.2741113571490066e-06, + "loss": 0.0131, + "step": 13990 + }, + { + "epoch": 3.5787321063394684, + "grad_norm": 0.08580244332551956, + "learning_rate": 2.2666354089598198e-06, + "loss": 0.0133, + "step": 14000 + }, + { + "epoch": 3.581288343558282, + "grad_norm": 0.17088328301906586, + "learning_rate": 2.2591681658848686e-06, + "loss": 0.0047, + "step": 14010 + }, + { + "epoch": 3.5838445807770962, + "grad_norm": 2.8940188884735107, + "learning_rate": 2.251709651705535e-06, + "loss": 0.0082, + "step": 14020 + }, + { + "epoch": 3.58640081799591, + "grad_norm": 1.2774847745895386, + "learning_rate": 2.244259890175412e-06, + "loss": 0.0128, + "step": 14030 + }, + { + "epoch": 3.588957055214724, + "grad_norm": 0.8745086789131165, + "learning_rate": 2.236818905020207e-06, + "loss": 0.0056, + "step": 14040 + }, + { + "epoch": 3.591513292433538, + "grad_norm": 0.05803001672029495, + "learning_rate": 2.22938671993769e-06, + "loss": 0.0036, + "step": 14050 + }, + { + "epoch": 3.5940695296523515, + "grad_norm": 3.186616897583008, + "learning_rate": 2.221963358597593e-06, + "loss": 0.0049, + "step": 14060 + }, + { + "epoch": 3.5966257668711656, + "grad_norm": 0.13081157207489014, + "learning_rate": 2.214548844641552e-06, + "loss": 0.0159, + "step": 14070 + }, + { + "epoch": 3.59918200408998, + "grad_norm": 0.5573609471321106, + "learning_rate": 2.2071432016830257e-06, + "loss": 0.0063, + "step": 14080 + }, + { + "epoch": 3.6017382413087935, + "grad_norm": 0.11412039399147034, + "learning_rate": 2.1997464533072232e-06, + "loss": 0.0092, + "step": 14090 + }, + { + "epoch": 3.604294478527607, + "grad_norm": 2.3137636184692383, + "learning_rate": 2.1923586230710185e-06, + "loss": 0.0082, + "step": 14100 + }, + { + "epoch": 3.6068507157464214, + "grad_norm": 0.7297873497009277, + "learning_rate": 2.1849797345028917e-06, + "loss": 0.0057, + "step": 14110 + }, + { + "epoch": 3.609406952965235, + "grad_norm": 0.14575114846229553, + "learning_rate": 2.1776098111028427e-06, + "loss": 0.0122, + "step": 14120 + }, + { + "epoch": 3.611963190184049, + "grad_norm": 0.20701062679290771, + "learning_rate": 2.1702488763423206e-06, + "loss": 0.0116, + "step": 14130 + }, + { + "epoch": 3.614519427402863, + "grad_norm": 2.8510355949401855, + "learning_rate": 2.1628969536641436e-06, + "loss": 0.0094, + "step": 14140 + }, + { + "epoch": 3.6170756646216766, + "grad_norm": 0.13213932514190674, + "learning_rate": 2.1555540664824337e-06, + "loss": 0.0136, + "step": 14150 + }, + { + "epoch": 3.6196319018404908, + "grad_norm": 0.011733833700418472, + "learning_rate": 2.1482202381825356e-06, + "loss": 0.0049, + "step": 14160 + }, + { + "epoch": 3.622188139059305, + "grad_norm": 0.06473023444414139, + "learning_rate": 2.1408954921209435e-06, + "loss": 0.007, + "step": 14170 + }, + { + "epoch": 3.6247443762781186, + "grad_norm": 0.029512058943510056, + "learning_rate": 2.1335798516252243e-06, + "loss": 0.0187, + "step": 14180 + }, + { + "epoch": 3.6273006134969323, + "grad_norm": 4.00309944152832, + "learning_rate": 2.126273339993949e-06, + "loss": 0.0142, + "step": 14190 + }, + { + "epoch": 3.6298568507157465, + "grad_norm": 1.9352320432662964, + "learning_rate": 2.1189759804966142e-06, + "loss": 0.0048, + "step": 14200 + }, + { + "epoch": 3.63241308793456, + "grad_norm": 2.03886079788208, + "learning_rate": 2.1116877963735714e-06, + "loss": 0.0007, + "step": 14210 + }, + { + "epoch": 3.6349693251533743, + "grad_norm": 2.063149929046631, + "learning_rate": 2.1044088108359433e-06, + "loss": 0.0113, + "step": 14220 + }, + { + "epoch": 3.637525562372188, + "grad_norm": 0.1273782104253769, + "learning_rate": 2.0971390470655693e-06, + "loss": 0.008, + "step": 14230 + }, + { + "epoch": 3.640081799591002, + "grad_norm": 0.050878312438726425, + "learning_rate": 2.089878528214908e-06, + "loss": 0.0002, + "step": 14240 + }, + { + "epoch": 3.642638036809816, + "grad_norm": 0.3995646834373474, + "learning_rate": 2.082627277406983e-06, + "loss": 0.0134, + "step": 14250 + }, + { + "epoch": 3.64519427402863, + "grad_norm": 2.8083791732788086, + "learning_rate": 2.0753853177352945e-06, + "loss": 0.0122, + "step": 14260 + }, + { + "epoch": 3.6477505112474438, + "grad_norm": 0.38471710681915283, + "learning_rate": 2.0681526722637603e-06, + "loss": 0.0061, + "step": 14270 + }, + { + "epoch": 3.6503067484662575, + "grad_norm": 1.0761078596115112, + "learning_rate": 2.060929364026632e-06, + "loss": 0.0071, + "step": 14280 + }, + { + "epoch": 3.6528629856850716, + "grad_norm": 4.6696319580078125, + "learning_rate": 2.05371541602842e-06, + "loss": 0.015, + "step": 14290 + }, + { + "epoch": 3.6554192229038853, + "grad_norm": 1.2931352853775024, + "learning_rate": 2.0465108512438285e-06, + "loss": 0.0105, + "step": 14300 + }, + { + "epoch": 3.6579754601226995, + "grad_norm": 0.30030888319015503, + "learning_rate": 2.0393156926176796e-06, + "loss": 0.0035, + "step": 14310 + }, + { + "epoch": 3.660531697341513, + "grad_norm": 1.4162043333053589, + "learning_rate": 2.0321299630648374e-06, + "loss": 0.007, + "step": 14320 + }, + { + "epoch": 3.6630879345603273, + "grad_norm": 1.6966540813446045, + "learning_rate": 2.0249536854701335e-06, + "loss": 0.0022, + "step": 14330 + }, + { + "epoch": 3.665644171779141, + "grad_norm": 2.748809337615967, + "learning_rate": 2.017786882688303e-06, + "loss": 0.0059, + "step": 14340 + }, + { + "epoch": 3.668200408997955, + "grad_norm": 3.920806646347046, + "learning_rate": 2.0106295775439018e-06, + "loss": 0.0024, + "step": 14350 + }, + { + "epoch": 3.670756646216769, + "grad_norm": 4.018367290496826, + "learning_rate": 2.003481792831242e-06, + "loss": 0.0134, + "step": 14360 + }, + { + "epoch": 3.6733128834355826, + "grad_norm": 0.7412097454071045, + "learning_rate": 1.9963435513143076e-06, + "loss": 0.0061, + "step": 14370 + }, + { + "epoch": 3.6758691206543967, + "grad_norm": 0.00914350152015686, + "learning_rate": 1.989214875726702e-06, + "loss": 0.0037, + "step": 14380 + }, + { + "epoch": 3.678425357873211, + "grad_norm": 0.7989885210990906, + "learning_rate": 1.982095788771552e-06, + "loss": 0.0081, + "step": 14390 + }, + { + "epoch": 3.6809815950920246, + "grad_norm": 0.09935598075389862, + "learning_rate": 1.9749863131214543e-06, + "loss": 0.0057, + "step": 14400 + }, + { + "epoch": 3.6835378323108383, + "grad_norm": 0.021534953266382217, + "learning_rate": 1.9678864714183877e-06, + "loss": 0.0009, + "step": 14410 + }, + { + "epoch": 3.6860940695296525, + "grad_norm": 1.7669703960418701, + "learning_rate": 1.9607962862736617e-06, + "loss": 0.004, + "step": 14420 + }, + { + "epoch": 3.688650306748466, + "grad_norm": 3.493924856185913, + "learning_rate": 1.9537157802678196e-06, + "loss": 0.0012, + "step": 14430 + }, + { + "epoch": 3.6912065439672803, + "grad_norm": 0.002254684455692768, + "learning_rate": 1.9466449759505856e-06, + "loss": 0.0053, + "step": 14440 + }, + { + "epoch": 3.693762781186094, + "grad_norm": 3.5533618927001953, + "learning_rate": 1.939583895840785e-06, + "loss": 0.0053, + "step": 14450 + }, + { + "epoch": 3.6963190184049077, + "grad_norm": 6.355319976806641, + "learning_rate": 1.932532562426275e-06, + "loss": 0.0086, + "step": 14460 + }, + { + "epoch": 3.698875255623722, + "grad_norm": 0.021470896899700165, + "learning_rate": 1.925490998163868e-06, + "loss": 0.0097, + "step": 14470 + }, + { + "epoch": 3.701431492842536, + "grad_norm": 2.308654308319092, + "learning_rate": 1.918459225479268e-06, + "loss": 0.0156, + "step": 14480 + }, + { + "epoch": 3.7039877300613497, + "grad_norm": 2.9286420345306396, + "learning_rate": 1.911437266766993e-06, + "loss": 0.0076, + "step": 14490 + }, + { + "epoch": 3.7065439672801634, + "grad_norm": 0.0710514560341835, + "learning_rate": 1.9044251443903088e-06, + "loss": 0.0009, + "step": 14500 + }, + { + "epoch": 3.7091002044989776, + "grad_norm": 0.029081158339977264, + "learning_rate": 1.8974228806811496e-06, + "loss": 0.0007, + "step": 14510 + }, + { + "epoch": 3.7116564417177913, + "grad_norm": 4.481345176696777, + "learning_rate": 1.8904304979400557e-06, + "loss": 0.0094, + "step": 14520 + }, + { + "epoch": 3.7142126789366054, + "grad_norm": 0.005593888461589813, + "learning_rate": 1.8834480184360987e-06, + "loss": 0.0025, + "step": 14530 + }, + { + "epoch": 3.716768916155419, + "grad_norm": 0.050757136195898056, + "learning_rate": 1.8764754644068122e-06, + "loss": 0.0052, + "step": 14540 + }, + { + "epoch": 3.719325153374233, + "grad_norm": 0.02077576145529747, + "learning_rate": 1.8695128580581146e-06, + "loss": 0.0015, + "step": 14550 + }, + { + "epoch": 3.721881390593047, + "grad_norm": 0.041414808481931686, + "learning_rate": 1.862560221564247e-06, + "loss": 0.0077, + "step": 14560 + }, + { + "epoch": 3.724437627811861, + "grad_norm": 0.014929791912436485, + "learning_rate": 1.8556175770676987e-06, + "loss": 0.0033, + "step": 14570 + }, + { + "epoch": 3.726993865030675, + "grad_norm": 0.21779873967170715, + "learning_rate": 1.8486849466791385e-06, + "loss": 0.0058, + "step": 14580 + }, + { + "epoch": 3.7295501022494886, + "grad_norm": 0.025204051285982132, + "learning_rate": 1.8417623524773343e-06, + "loss": 0.0102, + "step": 14590 + }, + { + "epoch": 3.7321063394683027, + "grad_norm": 0.015351396054029465, + "learning_rate": 1.8348498165091056e-06, + "loss": 0.0017, + "step": 14600 + }, + { + "epoch": 3.7346625766871164, + "grad_norm": 0.05748201906681061, + "learning_rate": 1.827947360789225e-06, + "loss": 0.0054, + "step": 14610 + }, + { + "epoch": 3.7372188139059306, + "grad_norm": 1.63164484500885, + "learning_rate": 1.8210550073003701e-06, + "loss": 0.0067, + "step": 14620 + }, + { + "epoch": 3.7397750511247443, + "grad_norm": 0.021220263093709946, + "learning_rate": 1.814172777993039e-06, + "loss": 0.0112, + "step": 14630 + }, + { + "epoch": 3.7423312883435584, + "grad_norm": 1.962134599685669, + "learning_rate": 1.807300694785496e-06, + "loss": 0.0066, + "step": 14640 + }, + { + "epoch": 3.744887525562372, + "grad_norm": 0.02569643221795559, + "learning_rate": 1.800438779563683e-06, + "loss": 0.01, + "step": 14650 + }, + { + "epoch": 3.7474437627811863, + "grad_norm": 0.10192188620567322, + "learning_rate": 1.7935870541811633e-06, + "loss": 0.0025, + "step": 14660 + }, + { + "epoch": 3.75, + "grad_norm": 0.06718003004789352, + "learning_rate": 1.7867455404590495e-06, + "loss": 0.0014, + "step": 14670 + }, + { + "epoch": 3.7525562372188137, + "grad_norm": 0.01870041899383068, + "learning_rate": 1.7799142601859322e-06, + "loss": 0.0062, + "step": 14680 + }, + { + "epoch": 3.755112474437628, + "grad_norm": 3.080137014389038, + "learning_rate": 1.7730932351178055e-06, + "loss": 0.0049, + "step": 14690 + }, + { + "epoch": 3.7576687116564416, + "grad_norm": 0.33492809534072876, + "learning_rate": 1.7662824869780094e-06, + "loss": 0.0088, + "step": 14700 + }, + { + "epoch": 3.7602249488752557, + "grad_norm": 0.7548993825912476, + "learning_rate": 1.759482037457152e-06, + "loss": 0.0021, + "step": 14710 + }, + { + "epoch": 3.7627811860940694, + "grad_norm": 0.2977140247821808, + "learning_rate": 1.7526919082130434e-06, + "loss": 0.0089, + "step": 14720 + }, + { + "epoch": 3.7653374233128836, + "grad_norm": 0.009994206950068474, + "learning_rate": 1.7459121208706264e-06, + "loss": 0.0069, + "step": 14730 + }, + { + "epoch": 3.7678936605316973, + "grad_norm": 1.6630052328109741, + "learning_rate": 1.7391426970219021e-06, + "loss": 0.0103, + "step": 14740 + }, + { + "epoch": 3.7704498977505114, + "grad_norm": 1.2915098667144775, + "learning_rate": 1.7323836582258774e-06, + "loss": 0.0079, + "step": 14750 + }, + { + "epoch": 3.773006134969325, + "grad_norm": 3.242319345474243, + "learning_rate": 1.7256350260084736e-06, + "loss": 0.0069, + "step": 14760 + }, + { + "epoch": 3.775562372188139, + "grad_norm": 0.026173055171966553, + "learning_rate": 1.718896821862478e-06, + "loss": 0.0011, + "step": 14770 + }, + { + "epoch": 3.778118609406953, + "grad_norm": 0.021731965243816376, + "learning_rate": 1.7121690672474577e-06, + "loss": 0.0042, + "step": 14780 + }, + { + "epoch": 3.780674846625767, + "grad_norm": 4.898509502410889, + "learning_rate": 1.7054517835897144e-06, + "loss": 0.0178, + "step": 14790 + }, + { + "epoch": 3.783231083844581, + "grad_norm": 5.831714630126953, + "learning_rate": 1.6987449922821887e-06, + "loss": 0.006, + "step": 14800 + }, + { + "epoch": 3.7857873210633946, + "grad_norm": 0.009105149656534195, + "learning_rate": 1.6920487146844117e-06, + "loss": 0.0012, + "step": 14810 + }, + { + "epoch": 3.7883435582822087, + "grad_norm": 0.0681765154004097, + "learning_rate": 1.6853629721224318e-06, + "loss": 0.0064, + "step": 14820 + }, + { + "epoch": 3.7908997955010224, + "grad_norm": 0.09596231579780579, + "learning_rate": 1.6786877858887457e-06, + "loss": 0.0036, + "step": 14830 + }, + { + "epoch": 3.7934560327198366, + "grad_norm": 0.2018987387418747, + "learning_rate": 1.6720231772422251e-06, + "loss": 0.0041, + "step": 14840 + }, + { + "epoch": 3.7960122699386503, + "grad_norm": 0.034721288830041885, + "learning_rate": 1.665369167408062e-06, + "loss": 0.0083, + "step": 14850 + }, + { + "epoch": 3.798568507157464, + "grad_norm": 0.009844356216490269, + "learning_rate": 1.6587257775776889e-06, + "loss": 0.0047, + "step": 14860 + }, + { + "epoch": 3.801124744376278, + "grad_norm": 0.014034909196197987, + "learning_rate": 1.6520930289087206e-06, + "loss": 0.0053, + "step": 14870 + }, + { + "epoch": 3.8036809815950923, + "grad_norm": 0.03924409672617912, + "learning_rate": 1.6454709425248754e-06, + "loss": 0.0053, + "step": 14880 + }, + { + "epoch": 3.806237218813906, + "grad_norm": 0.03811722993850708, + "learning_rate": 1.6388595395159207e-06, + "loss": 0.0107, + "step": 14890 + }, + { + "epoch": 3.8087934560327197, + "grad_norm": 3.9966225624084473, + "learning_rate": 1.632258840937599e-06, + "loss": 0.0111, + "step": 14900 + }, + { + "epoch": 3.811349693251534, + "grad_norm": 0.009593687951564789, + "learning_rate": 1.6256688678115607e-06, + "loss": 0.0138, + "step": 14910 + }, + { + "epoch": 3.8139059304703475, + "grad_norm": 0.011800892651081085, + "learning_rate": 1.6190896411252966e-06, + "loss": 0.0066, + "step": 14920 + }, + { + "epoch": 3.8164621676891617, + "grad_norm": 0.02664501592516899, + "learning_rate": 1.612521181832075e-06, + "loss": 0.0053, + "step": 14930 + }, + { + "epoch": 3.8190184049079754, + "grad_norm": 2.8575503826141357, + "learning_rate": 1.6059635108508731e-06, + "loss": 0.0082, + "step": 14940 + }, + { + "epoch": 3.821574642126789, + "grad_norm": 1.9057544469833374, + "learning_rate": 1.5994166490663087e-06, + "loss": 0.0026, + "step": 14950 + }, + { + "epoch": 3.8241308793456033, + "grad_norm": 0.012572694569826126, + "learning_rate": 1.5928806173285716e-06, + "loss": 0.0035, + "step": 14960 + }, + { + "epoch": 3.8266871165644174, + "grad_norm": 3.401106595993042, + "learning_rate": 1.58635543645337e-06, + "loss": 0.0068, + "step": 14970 + }, + { + "epoch": 3.829243353783231, + "grad_norm": 0.008161719888448715, + "learning_rate": 1.5798411272218427e-06, + "loss": 0.0048, + "step": 14980 + }, + { + "epoch": 3.831799591002045, + "grad_norm": 4.691705703735352, + "learning_rate": 1.5733377103805154e-06, + "loss": 0.0045, + "step": 14990 + }, + { + "epoch": 3.834355828220859, + "grad_norm": 0.011220389977097511, + "learning_rate": 1.5668452066412137e-06, + "loss": 0.0004, + "step": 15000 + }, + { + "epoch": 3.8369120654396727, + "grad_norm": 0.5998366475105286, + "learning_rate": 1.56036363668102e-06, + "loss": 0.0055, + "step": 15010 + }, + { + "epoch": 3.839468302658487, + "grad_norm": 3.6791257858276367, + "learning_rate": 1.5538930211421839e-06, + "loss": 0.0094, + "step": 15020 + }, + { + "epoch": 3.8420245398773005, + "grad_norm": 0.025082003325223923, + "learning_rate": 1.5474333806320735e-06, + "loss": 0.004, + "step": 15030 + }, + { + "epoch": 3.8445807770961147, + "grad_norm": 0.01650637947022915, + "learning_rate": 1.540984735723104e-06, + "loss": 0.0042, + "step": 15040 + }, + { + "epoch": 3.8471370143149284, + "grad_norm": 0.943402111530304, + "learning_rate": 1.5345471069526718e-06, + "loss": 0.0047, + "step": 15050 + }, + { + "epoch": 3.8496932515337425, + "grad_norm": 0.018428007140755653, + "learning_rate": 1.5281205148230866e-06, + "loss": 0.0187, + "step": 15060 + }, + { + "epoch": 3.8522494887525562, + "grad_norm": 0.013692053034901619, + "learning_rate": 1.5217049798015127e-06, + "loss": 0.0018, + "step": 15070 + }, + { + "epoch": 3.85480572597137, + "grad_norm": 0.01854683831334114, + "learning_rate": 1.5153005223198986e-06, + "loss": 0.0011, + "step": 15080 + }, + { + "epoch": 3.857361963190184, + "grad_norm": 0.07175463438034058, + "learning_rate": 1.5089071627749157e-06, + "loss": 0.0003, + "step": 15090 + }, + { + "epoch": 3.859918200408998, + "grad_norm": 0.3006972074508667, + "learning_rate": 1.5025249215278852e-06, + "loss": 0.0027, + "step": 15100 + }, + { + "epoch": 3.862474437627812, + "grad_norm": 0.5995022058486938, + "learning_rate": 1.4961538189047258e-06, + "loss": 0.0079, + "step": 15110 + }, + { + "epoch": 3.8650306748466257, + "grad_norm": 0.03315654397010803, + "learning_rate": 1.489793875195879e-06, + "loss": 0.0002, + "step": 15120 + }, + { + "epoch": 3.86758691206544, + "grad_norm": 0.01580039970576763, + "learning_rate": 1.4834451106562502e-06, + "loss": 0.0002, + "step": 15130 + }, + { + "epoch": 3.8701431492842535, + "grad_norm": 0.047737788408994675, + "learning_rate": 1.477107545505137e-06, + "loss": 0.0041, + "step": 15140 + }, + { + "epoch": 3.8726993865030677, + "grad_norm": 2.45046329498291, + "learning_rate": 1.470781199926174e-06, + "loss": 0.0075, + "step": 15150 + }, + { + "epoch": 3.8752556237218814, + "grad_norm": 3.830009698867798, + "learning_rate": 1.4644660940672628e-06, + "loss": 0.0058, + "step": 15160 + }, + { + "epoch": 3.877811860940695, + "grad_norm": 0.03586220741271973, + "learning_rate": 1.4581622480405095e-06, + "loss": 0.0055, + "step": 15170 + }, + { + "epoch": 3.8803680981595092, + "grad_norm": 0.048213325440883636, + "learning_rate": 1.45186968192216e-06, + "loss": 0.0135, + "step": 15180 + }, + { + "epoch": 3.8829243353783234, + "grad_norm": 0.011242564767599106, + "learning_rate": 1.4455884157525369e-06, + "loss": 0.0049, + "step": 15190 + }, + { + "epoch": 3.885480572597137, + "grad_norm": 0.26863622665405273, + "learning_rate": 1.4393184695359752e-06, + "loss": 0.0038, + "step": 15200 + }, + { + "epoch": 3.888036809815951, + "grad_norm": 0.09017948806285858, + "learning_rate": 1.4330598632407554e-06, + "loss": 0.0018, + "step": 15210 + }, + { + "epoch": 3.890593047034765, + "grad_norm": 0.21921706199645996, + "learning_rate": 1.4268126167990475e-06, + "loss": 0.0051, + "step": 15220 + }, + { + "epoch": 3.8931492842535786, + "grad_norm": 0.048430170863866806, + "learning_rate": 1.4205767501068413e-06, + "loss": 0.0027, + "step": 15230 + }, + { + "epoch": 3.895705521472393, + "grad_norm": 0.03785645216703415, + "learning_rate": 1.4143522830238855e-06, + "loss": 0.0022, + "step": 15240 + }, + { + "epoch": 3.8982617586912065, + "grad_norm": 0.018065497279167175, + "learning_rate": 1.4081392353736206e-06, + "loss": 0.0075, + "step": 15250 + }, + { + "epoch": 3.90081799591002, + "grad_norm": 1.4500396251678467, + "learning_rate": 1.4019376269431229e-06, + "loss": 0.0034, + "step": 15260 + }, + { + "epoch": 3.9033742331288344, + "grad_norm": 0.04054681211709976, + "learning_rate": 1.395747477483036e-06, + "loss": 0.0021, + "step": 15270 + }, + { + "epoch": 3.9059304703476485, + "grad_norm": 1.107225775718689, + "learning_rate": 1.3895688067075109e-06, + "loss": 0.0012, + "step": 15280 + }, + { + "epoch": 3.908486707566462, + "grad_norm": 0.14938171207904816, + "learning_rate": 1.3834016342941364e-06, + "loss": 0.0052, + "step": 15290 + }, + { + "epoch": 3.911042944785276, + "grad_norm": 0.2700784504413605, + "learning_rate": 1.3772459798838884e-06, + "loss": 0.022, + "step": 15300 + }, + { + "epoch": 3.91359918200409, + "grad_norm": 0.010788323357701302, + "learning_rate": 1.3711018630810568e-06, + "loss": 0.0127, + "step": 15310 + }, + { + "epoch": 3.9161554192229038, + "grad_norm": 0.17254537343978882, + "learning_rate": 1.3649693034531908e-06, + "loss": 0.0026, + "step": 15320 + }, + { + "epoch": 3.918711656441718, + "grad_norm": 2.0272927284240723, + "learning_rate": 1.3588483205310238e-06, + "loss": 0.0028, + "step": 15330 + }, + { + "epoch": 3.9212678936605316, + "grad_norm": 0.7689258456230164, + "learning_rate": 1.352738933808434e-06, + "loss": 0.0046, + "step": 15340 + }, + { + "epoch": 3.9238241308793453, + "grad_norm": 0.09393978118896484, + "learning_rate": 1.3466411627423553e-06, + "loss": 0.0058, + "step": 15350 + }, + { + "epoch": 3.9263803680981595, + "grad_norm": 0.02193518355488777, + "learning_rate": 1.3405550267527373e-06, + "loss": 0.0118, + "step": 15360 + }, + { + "epoch": 3.9289366053169736, + "grad_norm": 1.4280931949615479, + "learning_rate": 1.3344805452224668e-06, + "loss": 0.0055, + "step": 15370 + }, + { + "epoch": 3.9314928425357873, + "grad_norm": 0.017598293721675873, + "learning_rate": 1.3284177374973252e-06, + "loss": 0.0001, + "step": 15380 + }, + { + "epoch": 3.934049079754601, + "grad_norm": 0.017776915803551674, + "learning_rate": 1.3223666228859034e-06, + "loss": 0.0089, + "step": 15390 + }, + { + "epoch": 3.936605316973415, + "grad_norm": 0.025338156148791313, + "learning_rate": 1.3163272206595607e-06, + "loss": 0.0101, + "step": 15400 + }, + { + "epoch": 3.939161554192229, + "grad_norm": 0.00857964251190424, + "learning_rate": 1.3102995500523513e-06, + "loss": 0.0002, + "step": 15410 + }, + { + "epoch": 3.941717791411043, + "grad_norm": 0.17898601293563843, + "learning_rate": 1.3042836302609707e-06, + "loss": 0.0083, + "step": 15420 + }, + { + "epoch": 3.9442740286298568, + "grad_norm": 0.01416697259992361, + "learning_rate": 1.2982794804446858e-06, + "loss": 0.0031, + "step": 15430 + }, + { + "epoch": 3.946830265848671, + "grad_norm": 0.03067069500684738, + "learning_rate": 1.2922871197252818e-06, + "loss": 0.0027, + "step": 15440 + }, + { + "epoch": 3.9493865030674846, + "grad_norm": 0.013419978320598602, + "learning_rate": 1.2863065671869995e-06, + "loss": 0.0004, + "step": 15450 + }, + { + "epoch": 3.9519427402862988, + "grad_norm": 0.36794596910476685, + "learning_rate": 1.2803378418764728e-06, + "loss": 0.0034, + "step": 15460 + }, + { + "epoch": 3.9544989775051125, + "grad_norm": 0.014534058049321175, + "learning_rate": 1.274380962802666e-06, + "loss": 0.0006, + "step": 15470 + }, + { + "epoch": 3.957055214723926, + "grad_norm": 0.953043520450592, + "learning_rate": 1.2684359489368186e-06, + "loss": 0.0097, + "step": 15480 + }, + { + "epoch": 3.9596114519427403, + "grad_norm": 0.0640961155295372, + "learning_rate": 1.2625028192123822e-06, + "loss": 0.0076, + "step": 15490 + }, + { + "epoch": 3.962167689161554, + "grad_norm": 0.026453586295247078, + "learning_rate": 1.2565815925249613e-06, + "loss": 0.0042, + "step": 15500 + }, + { + "epoch": 3.964723926380368, + "grad_norm": 0.02020988054573536, + "learning_rate": 1.250672287732247e-06, + "loss": 0.0005, + "step": 15510 + }, + { + "epoch": 3.967280163599182, + "grad_norm": 0.8880366683006287, + "learning_rate": 1.2447749236539674e-06, + "loss": 0.0122, + "step": 15520 + }, + { + "epoch": 3.969836400817996, + "grad_norm": 0.06537387520074844, + "learning_rate": 1.2388895190718209e-06, + "loss": 0.0043, + "step": 15530 + }, + { + "epoch": 3.9723926380368098, + "grad_norm": 0.03674660250544548, + "learning_rate": 1.2330160927294178e-06, + "loss": 0.001, + "step": 15540 + }, + { + "epoch": 3.974948875255624, + "grad_norm": 0.06352321058511734, + "learning_rate": 1.2271546633322157e-06, + "loss": 0.0007, + "step": 15550 + }, + { + "epoch": 3.9775051124744376, + "grad_norm": 2.053643226623535, + "learning_rate": 1.2213052495474759e-06, + "loss": 0.0064, + "step": 15560 + }, + { + "epoch": 3.9800613496932513, + "grad_norm": 0.006071037612855434, + "learning_rate": 1.2154678700041805e-06, + "loss": 0.0061, + "step": 15570 + }, + { + "epoch": 3.9826175869120655, + "grad_norm": 0.032529015094041824, + "learning_rate": 1.2096425432929943e-06, + "loss": 0.0042, + "step": 15580 + }, + { + "epoch": 3.9851738241308796, + "grad_norm": 0.14356601238250732, + "learning_rate": 1.2038292879661896e-06, + "loss": 0.0025, + "step": 15590 + }, + { + "epoch": 3.9877300613496933, + "grad_norm": 0.009755146689713001, + "learning_rate": 1.1980281225376029e-06, + "loss": 0.0123, + "step": 15600 + }, + { + "epoch": 3.990286298568507, + "grad_norm": 0.026369577273726463, + "learning_rate": 1.1922390654825582e-06, + "loss": 0.002, + "step": 15610 + }, + { + "epoch": 3.992842535787321, + "grad_norm": 1.3809363842010498, + "learning_rate": 1.186462135237823e-06, + "loss": 0.0037, + "step": 15620 + }, + { + "epoch": 3.995398773006135, + "grad_norm": 0.06871844828128815, + "learning_rate": 1.1806973502015423e-06, + "loss": 0.0076, + "step": 15630 + }, + { + "epoch": 3.997955010224949, + "grad_norm": 0.01024967897683382, + "learning_rate": 1.1749447287331805e-06, + "loss": 0.0064, + "step": 15640 + }, + { + "epoch": 4.000511247443763, + "grad_norm": 0.010401812382042408, + "learning_rate": 1.1692042891534677e-06, + "loss": 0.0001, + "step": 15650 + }, + { + "epoch": 4.0030674846625764, + "grad_norm": 0.019328856840729713, + "learning_rate": 1.1634760497443308e-06, + "loss": 0.0011, + "step": 15660 + }, + { + "epoch": 4.00562372188139, + "grad_norm": 2.2071306705474854, + "learning_rate": 1.1577600287488472e-06, + "loss": 0.0046, + "step": 15670 + }, + { + "epoch": 4.008179959100205, + "grad_norm": 0.2718164026737213, + "learning_rate": 1.1520562443711813e-06, + "loss": 0.0002, + "step": 15680 + }, + { + "epoch": 4.0107361963190185, + "grad_norm": 0.0036849735770374537, + "learning_rate": 1.1463647147765262e-06, + "loss": 0.0024, + "step": 15690 + }, + { + "epoch": 4.013292433537832, + "grad_norm": 0.026393355801701546, + "learning_rate": 1.1406854580910426e-06, + "loss": 0.0003, + "step": 15700 + }, + { + "epoch": 4.015848670756646, + "grad_norm": 0.02888057939708233, + "learning_rate": 1.1350184924018137e-06, + "loss": 0.001, + "step": 15710 + }, + { + "epoch": 4.0184049079754605, + "grad_norm": 4.927444934844971, + "learning_rate": 1.1293638357567692e-06, + "loss": 0.0046, + "step": 15720 + }, + { + "epoch": 4.020961145194274, + "grad_norm": 1.5200058221817017, + "learning_rate": 1.1237215061646446e-06, + "loss": 0.0009, + "step": 15730 + }, + { + "epoch": 4.023517382413088, + "grad_norm": 0.006760958582162857, + "learning_rate": 1.118091521594909e-06, + "loss": 0.0001, + "step": 15740 + }, + { + "epoch": 4.026073619631902, + "grad_norm": 0.0747433677315712, + "learning_rate": 1.1124738999777268e-06, + "loss": 0.0004, + "step": 15750 + }, + { + "epoch": 4.028629856850716, + "grad_norm": 0.018624255433678627, + "learning_rate": 1.1068686592038786e-06, + "loss": 0.0011, + "step": 15760 + }, + { + "epoch": 4.03118609406953, + "grad_norm": 0.018480489030480385, + "learning_rate": 1.10127581712472e-06, + "loss": 0.0029, + "step": 15770 + }, + { + "epoch": 4.033742331288344, + "grad_norm": 0.08727142959833145, + "learning_rate": 1.0956953915521196e-06, + "loss": 0.0015, + "step": 15780 + }, + { + "epoch": 4.036298568507157, + "grad_norm": 0.17428268492221832, + "learning_rate": 1.0901274002584029e-06, + "loss": 0.0003, + "step": 15790 + }, + { + "epoch": 4.038854805725971, + "grad_norm": 0.006692373659461737, + "learning_rate": 1.0845718609762912e-06, + "loss": 0.0016, + "step": 15800 + }, + { + "epoch": 4.041411042944786, + "grad_norm": 0.03485719487071037, + "learning_rate": 1.0790287913988533e-06, + "loss": 0.0028, + "step": 15810 + }, + { + "epoch": 4.043967280163599, + "grad_norm": 0.1434144675731659, + "learning_rate": 1.0734982091794439e-06, + "loss": 0.0014, + "step": 15820 + }, + { + "epoch": 4.046523517382413, + "grad_norm": 0.025571011006832123, + "learning_rate": 1.067980131931649e-06, + "loss": 0.0043, + "step": 15830 + }, + { + "epoch": 4.049079754601227, + "grad_norm": 0.022263115271925926, + "learning_rate": 1.0624745772292262e-06, + "loss": 0.0001, + "step": 15840 + }, + { + "epoch": 4.051635991820041, + "grad_norm": 0.026498448103666306, + "learning_rate": 1.0569815626060553e-06, + "loss": 0.0014, + "step": 15850 + }, + { + "epoch": 4.054192229038855, + "grad_norm": 0.11276555806398392, + "learning_rate": 1.051501105556077e-06, + "loss": 0.0012, + "step": 15860 + }, + { + "epoch": 4.056748466257669, + "grad_norm": 0.03225693851709366, + "learning_rate": 1.0460332235332421e-06, + "loss": 0.0019, + "step": 15870 + }, + { + "epoch": 4.059304703476482, + "grad_norm": 0.6012237071990967, + "learning_rate": 1.0405779339514466e-06, + "loss": 0.0028, + "step": 15880 + }, + { + "epoch": 4.061860940695296, + "grad_norm": 1.399053931236267, + "learning_rate": 1.0351352541844895e-06, + "loss": 0.0005, + "step": 15890 + }, + { + "epoch": 4.064417177914111, + "grad_norm": 0.004602417815476656, + "learning_rate": 1.0297052015660065e-06, + "loss": 0.0018, + "step": 15900 + }, + { + "epoch": 4.066973415132924, + "grad_norm": 0.011124187149107456, + "learning_rate": 1.0242877933894212e-06, + "loss": 0.0026, + "step": 15910 + }, + { + "epoch": 4.069529652351738, + "grad_norm": 0.012430194765329361, + "learning_rate": 1.0188830469078832e-06, + "loss": 0.0008, + "step": 15920 + }, + { + "epoch": 4.072085889570552, + "grad_norm": 0.00355120119638741, + "learning_rate": 1.0134909793342251e-06, + "loss": 0.0014, + "step": 15930 + }, + { + "epoch": 4.074642126789366, + "grad_norm": 0.010579611174762249, + "learning_rate": 1.0081116078408932e-06, + "loss": 0.0002, + "step": 15940 + }, + { + "epoch": 4.07719836400818, + "grad_norm": 0.13667239248752594, + "learning_rate": 1.0027449495599045e-06, + "loss": 0.0002, + "step": 15950 + }, + { + "epoch": 4.079754601226994, + "grad_norm": 0.013385191559791565, + "learning_rate": 9.97391021582782e-07, + "loss": 0.0018, + "step": 15960 + }, + { + "epoch": 4.0823108384458076, + "grad_norm": 0.09518828243017197, + "learning_rate": 9.92049840960514e-07, + "loss": 0.002, + "step": 15970 + }, + { + "epoch": 4.084867075664621, + "grad_norm": 0.017801359295845032, + "learning_rate": 9.86721424703483e-07, + "loss": 0.0005, + "step": 15980 + }, + { + "epoch": 4.087423312883436, + "grad_norm": 0.021596604958176613, + "learning_rate": 9.81405789781425e-07, + "loss": 0.0011, + "step": 15990 + }, + { + "epoch": 4.08997955010225, + "grad_norm": 2.4400172233581543, + "learning_rate": 9.76102953123369e-07, + "loss": 0.0041, + "step": 16000 + }, + { + "epoch": 4.092535787321063, + "grad_norm": 0.07604683190584183, + "learning_rate": 9.708129316175875e-07, + "loss": 0.0009, + "step": 16010 + }, + { + "epoch": 4.095092024539877, + "grad_norm": 0.00839927326887846, + "learning_rate": 9.655357421115324e-07, + "loss": 0.0001, + "step": 16020 + }, + { + "epoch": 4.097648261758692, + "grad_norm": 0.8300763964653015, + "learning_rate": 9.60271401411797e-07, + "loss": 0.0009, + "step": 16030 + }, + { + "epoch": 4.100204498977505, + "grad_norm": 0.036536745727062225, + "learning_rate": 9.550199262840494e-07, + "loss": 0.0004, + "step": 16040 + }, + { + "epoch": 4.102760736196319, + "grad_norm": 0.013892588205635548, + "learning_rate": 9.49781333452987e-07, + "loss": 0.0009, + "step": 16050 + }, + { + "epoch": 4.105316973415133, + "grad_norm": 0.27383795380592346, + "learning_rate": 9.445556396022754e-07, + "loss": 0.0005, + "step": 16060 + }, + { + "epoch": 4.107873210633947, + "grad_norm": 0.009578673169016838, + "learning_rate": 9.393428613745036e-07, + "loss": 0.0036, + "step": 16070 + }, + { + "epoch": 4.110429447852761, + "grad_norm": 0.42609789967536926, + "learning_rate": 9.341430153711306e-07, + "loss": 0.0049, + "step": 16080 + }, + { + "epoch": 4.112985685071575, + "grad_norm": 0.12703081965446472, + "learning_rate": 9.289561181524214e-07, + "loss": 0.0037, + "step": 16090 + }, + { + "epoch": 4.115541922290388, + "grad_norm": 0.13383671641349792, + "learning_rate": 9.237821862374092e-07, + "loss": 0.0022, + "step": 16100 + }, + { + "epoch": 4.118098159509202, + "grad_norm": 0.011773956939578056, + "learning_rate": 9.186212361038288e-07, + "loss": 0.0002, + "step": 16110 + }, + { + "epoch": 4.120654396728017, + "grad_norm": 2.616377115249634, + "learning_rate": 9.134732841880811e-07, + "loss": 0.003, + "step": 16120 + }, + { + "epoch": 4.12321063394683, + "grad_norm": 0.012035293504595757, + "learning_rate": 9.083383468851609e-07, + "loss": 0.0079, + "step": 16130 + }, + { + "epoch": 4.125766871165644, + "grad_norm": 0.2209741622209549, + "learning_rate": 9.032164405486193e-07, + "loss": 0.0047, + "step": 16140 + }, + { + "epoch": 4.128323108384458, + "grad_norm": 0.00564198475331068, + "learning_rate": 8.981075814905077e-07, + "loss": 0.0009, + "step": 16150 + }, + { + "epoch": 4.130879345603272, + "grad_norm": 0.021742451936006546, + "learning_rate": 8.930117859813236e-07, + "loss": 0.0009, + "step": 16160 + }, + { + "epoch": 4.133435582822086, + "grad_norm": 0.011637063696980476, + "learning_rate": 8.879290702499576e-07, + "loss": 0.0025, + "step": 16170 + }, + { + "epoch": 4.1359918200409, + "grad_norm": 0.44723692536354065, + "learning_rate": 8.828594504836491e-07, + "loss": 0.0012, + "step": 16180 + }, + { + "epoch": 4.1385480572597135, + "grad_norm": 2.4436564445495605, + "learning_rate": 8.778029428279278e-07, + "loss": 0.0014, + "step": 16190 + }, + { + "epoch": 4.141104294478527, + "grad_norm": 0.3361468017101288, + "learning_rate": 8.727595633865643e-07, + "loss": 0.0013, + "step": 16200 + }, + { + "epoch": 4.143660531697342, + "grad_norm": 1.743153691291809, + "learning_rate": 8.677293282215182e-07, + "loss": 0.0022, + "step": 16210 + }, + { + "epoch": 4.1462167689161555, + "grad_norm": 0.13101418316364288, + "learning_rate": 8.627122533528892e-07, + "loss": 0.0001, + "step": 16220 + }, + { + "epoch": 4.148773006134969, + "grad_norm": 0.00849368516355753, + "learning_rate": 8.577083547588638e-07, + "loss": 0.0001, + "step": 16230 + }, + { + "epoch": 4.151329243353783, + "grad_norm": 0.005825154948979616, + "learning_rate": 8.527176483756671e-07, + "loss": 0.0004, + "step": 16240 + }, + { + "epoch": 4.1538854805725975, + "grad_norm": 0.012522836215794086, + "learning_rate": 8.477401500975063e-07, + "loss": 0.0005, + "step": 16250 + }, + { + "epoch": 4.156441717791411, + "grad_norm": 0.1203024610877037, + "learning_rate": 8.427758757765264e-07, + "loss": 0.0029, + "step": 16260 + }, + { + "epoch": 4.158997955010225, + "grad_norm": 0.046782489866018295, + "learning_rate": 8.378248412227574e-07, + "loss": 0.0016, + "step": 16270 + }, + { + "epoch": 4.161554192229039, + "grad_norm": 0.02540050819516182, + "learning_rate": 8.328870622040652e-07, + "loss": 0.0001, + "step": 16280 + }, + { + "epoch": 4.164110429447852, + "grad_norm": 0.00631357915699482, + "learning_rate": 8.27962554446094e-07, + "loss": 0.0001, + "step": 16290 + }, + { + "epoch": 4.166666666666667, + "grad_norm": 0.02206520363688469, + "learning_rate": 8.23051333632231e-07, + "loss": 0.0001, + "step": 16300 + }, + { + "epoch": 4.169222903885481, + "grad_norm": 0.02339177392423153, + "learning_rate": 8.181534154035398e-07, + "loss": 0.0012, + "step": 16310 + }, + { + "epoch": 4.171779141104294, + "grad_norm": 0.11059535294771194, + "learning_rate": 8.132688153587237e-07, + "loss": 0.0002, + "step": 16320 + }, + { + "epoch": 4.174335378323108, + "grad_norm": 0.04154384881258011, + "learning_rate": 8.083975490540658e-07, + "loss": 0.0003, + "step": 16330 + }, + { + "epoch": 4.176891615541923, + "grad_norm": 0.004592495039105415, + "learning_rate": 8.035396320033911e-07, + "loss": 0.0022, + "step": 16340 + }, + { + "epoch": 4.179447852760736, + "grad_norm": 0.005622932221740484, + "learning_rate": 7.98695079678004e-07, + "loss": 0.0001, + "step": 16350 + }, + { + "epoch": 4.18200408997955, + "grad_norm": 0.008403644897043705, + "learning_rate": 7.93863907506649e-07, + "loss": 0.0007, + "step": 16360 + }, + { + "epoch": 4.184560327198364, + "grad_norm": 0.04854018986225128, + "learning_rate": 7.890461308754565e-07, + "loss": 0.0025, + "step": 16370 + }, + { + "epoch": 4.1871165644171775, + "grad_norm": 0.007120284251868725, + "learning_rate": 7.842417651278978e-07, + "loss": 0.0041, + "step": 16380 + }, + { + "epoch": 4.189672801635992, + "grad_norm": 0.005977618508040905, + "learning_rate": 7.794508255647293e-07, + "loss": 0.0005, + "step": 16390 + }, + { + "epoch": 4.192229038854806, + "grad_norm": 0.01604490913450718, + "learning_rate": 7.746733274439517e-07, + "loss": 0.0005, + "step": 16400 + }, + { + "epoch": 4.1947852760736195, + "grad_norm": 0.04814684018492699, + "learning_rate": 7.699092859807566e-07, + "loss": 0.0006, + "step": 16410 + }, + { + "epoch": 4.197341513292433, + "grad_norm": 0.9865232706069946, + "learning_rate": 7.651587163474822e-07, + "loss": 0.0002, + "step": 16420 + }, + { + "epoch": 4.199897750511248, + "grad_norm": 0.014311658218502998, + "learning_rate": 7.604216336735554e-07, + "loss": 0.0016, + "step": 16430 + }, + { + "epoch": 4.2024539877300615, + "grad_norm": 0.07794589549303055, + "learning_rate": 7.556980530454571e-07, + "loss": 0.001, + "step": 16440 + }, + { + "epoch": 4.205010224948875, + "grad_norm": 0.1298278272151947, + "learning_rate": 7.509879895066652e-07, + "loss": 0.0025, + "step": 16450 + }, + { + "epoch": 4.207566462167689, + "grad_norm": 0.026249248534440994, + "learning_rate": 7.462914580576081e-07, + "loss": 0.0028, + "step": 16460 + }, + { + "epoch": 4.210122699386503, + "grad_norm": 0.008666482754051685, + "learning_rate": 7.416084736556173e-07, + "loss": 0.0007, + "step": 16470 + }, + { + "epoch": 4.212678936605317, + "grad_norm": 0.37366658449172974, + "learning_rate": 7.369390512148816e-07, + "loss": 0.0008, + "step": 16480 + }, + { + "epoch": 4.215235173824131, + "grad_norm": 0.007286503445357084, + "learning_rate": 7.322832056063978e-07, + "loss": 0.0003, + "step": 16490 + }, + { + "epoch": 4.217791411042945, + "grad_norm": 1.1950106620788574, + "learning_rate": 7.276409516579252e-07, + "loss": 0.0024, + "step": 16500 + }, + { + "epoch": 4.220347648261758, + "grad_norm": 0.008293128572404385, + "learning_rate": 7.23012304153931e-07, + "loss": 0.0001, + "step": 16510 + }, + { + "epoch": 4.222903885480573, + "grad_norm": 0.009917296469211578, + "learning_rate": 7.183972778355586e-07, + "loss": 0.0006, + "step": 16520 + }, + { + "epoch": 4.225460122699387, + "grad_norm": 0.013085747137665749, + "learning_rate": 7.137958874005629e-07, + "loss": 0.003, + "step": 16530 + }, + { + "epoch": 4.2280163599182, + "grad_norm": 0.09627839922904968, + "learning_rate": 7.092081475032753e-07, + "loss": 0.0006, + "step": 16540 + }, + { + "epoch": 4.230572597137014, + "grad_norm": 0.007571856956928968, + "learning_rate": 7.046340727545531e-07, + "loss": 0.0001, + "step": 16550 + }, + { + "epoch": 4.233128834355828, + "grad_norm": 0.005919609218835831, + "learning_rate": 7.000736777217332e-07, + "loss": 0.0015, + "step": 16560 + }, + { + "epoch": 4.235685071574642, + "grad_norm": 0.016153009608387947, + "learning_rate": 6.955269769285877e-07, + "loss": 0.0001, + "step": 16570 + }, + { + "epoch": 4.238241308793456, + "grad_norm": 0.016689471900463104, + "learning_rate": 6.909939848552722e-07, + "loss": 0.0015, + "step": 16580 + }, + { + "epoch": 4.24079754601227, + "grad_norm": 0.012151209637522697, + "learning_rate": 6.864747159382851e-07, + "loss": 0.0056, + "step": 16590 + }, + { + "epoch": 4.2433537832310835, + "grad_norm": 0.006638688966631889, + "learning_rate": 6.819691845704207e-07, + "loss": 0.002, + "step": 16600 + }, + { + "epoch": 4.245910020449898, + "grad_norm": 0.026410236954689026, + "learning_rate": 6.774774051007227e-07, + "loss": 0.0006, + "step": 16610 + }, + { + "epoch": 4.248466257668712, + "grad_norm": 0.03302263841032982, + "learning_rate": 6.729993918344347e-07, + "loss": 0.0001, + "step": 16620 + }, + { + "epoch": 4.2510224948875255, + "grad_norm": 0.010984467342495918, + "learning_rate": 6.685351590329625e-07, + "loss": 0.0033, + "step": 16630 + }, + { + "epoch": 4.253578732106339, + "grad_norm": 0.01048219483345747, + "learning_rate": 6.640847209138224e-07, + "loss": 0.0024, + "step": 16640 + }, + { + "epoch": 4.256134969325154, + "grad_norm": 0.005691261030733585, + "learning_rate": 6.596480916505993e-07, + "loss": 0.0034, + "step": 16650 + }, + { + "epoch": 4.2586912065439675, + "grad_norm": 0.018030589446425438, + "learning_rate": 6.552252853728958e-07, + "loss": 0.0003, + "step": 16660 + }, + { + "epoch": 4.261247443762781, + "grad_norm": 0.5344707369804382, + "learning_rate": 6.508163161662994e-07, + "loss": 0.001, + "step": 16670 + }, + { + "epoch": 4.263803680981595, + "grad_norm": 0.01154622994363308, + "learning_rate": 6.464211980723223e-07, + "loss": 0.0011, + "step": 16680 + }, + { + "epoch": 4.266359918200409, + "grad_norm": 0.014204679057002068, + "learning_rate": 6.42039945088369e-07, + "loss": 0.0008, + "step": 16690 + }, + { + "epoch": 4.268916155419223, + "grad_norm": 0.17623376846313477, + "learning_rate": 6.376725711676829e-07, + "loss": 0.0004, + "step": 16700 + }, + { + "epoch": 4.271472392638037, + "grad_norm": 0.15494291484355927, + "learning_rate": 6.33319090219311e-07, + "loss": 0.0002, + "step": 16710 + }, + { + "epoch": 4.274028629856851, + "grad_norm": 0.007995120249688625, + "learning_rate": 6.289795161080492e-07, + "loss": 0.0005, + "step": 16720 + }, + { + "epoch": 4.276584867075664, + "grad_norm": 0.006873907521367073, + "learning_rate": 6.246538626544074e-07, + "loss": 0.0021, + "step": 16730 + }, + { + "epoch": 4.279141104294479, + "grad_norm": 0.8828302621841431, + "learning_rate": 6.203421436345597e-07, + "loss": 0.0017, + "step": 16740 + }, + { + "epoch": 4.281697341513293, + "grad_norm": 0.027829930186271667, + "learning_rate": 6.160443727803034e-07, + "loss": 0.0083, + "step": 16750 + }, + { + "epoch": 4.284253578732106, + "grad_norm": 0.010724814608693123, + "learning_rate": 6.11760563779012e-07, + "loss": 0.001, + "step": 16760 + }, + { + "epoch": 4.28680981595092, + "grad_norm": 0.005555745679885149, + "learning_rate": 6.07490730273596e-07, + "loss": 0.0013, + "step": 16770 + }, + { + "epoch": 4.289366053169734, + "grad_norm": 0.5492291450500488, + "learning_rate": 6.03234885862457e-07, + "loss": 0.0018, + "step": 16780 + }, + { + "epoch": 4.291922290388548, + "grad_norm": 0.01208993699401617, + "learning_rate": 5.989930440994451e-07, + "loss": 0.001, + "step": 16790 + }, + { + "epoch": 4.294478527607362, + "grad_norm": 0.06638287752866745, + "learning_rate": 5.947652184938124e-07, + "loss": 0.0055, + "step": 16800 + }, + { + "epoch": 4.297034764826176, + "grad_norm": 0.021167244762182236, + "learning_rate": 5.905514225101761e-07, + "loss": 0.0027, + "step": 16810 + }, + { + "epoch": 4.2995910020449895, + "grad_norm": 0.01520733255892992, + "learning_rate": 5.863516695684713e-07, + "loss": 0.0002, + "step": 16820 + }, + { + "epoch": 4.302147239263804, + "grad_norm": 0.41557273268699646, + "learning_rate": 5.8216597304391e-07, + "loss": 0.0003, + "step": 16830 + }, + { + "epoch": 4.304703476482618, + "grad_norm": 0.0344698503613472, + "learning_rate": 5.779943462669357e-07, + "loss": 0.0006, + "step": 16840 + }, + { + "epoch": 4.3072597137014315, + "grad_norm": 0.5560601353645325, + "learning_rate": 5.738368025231856e-07, + "loss": 0.0053, + "step": 16850 + }, + { + "epoch": 4.309815950920245, + "grad_norm": 0.014782343059778214, + "learning_rate": 5.696933550534445e-07, + "loss": 0.0003, + "step": 16860 + }, + { + "epoch": 4.31237218813906, + "grad_norm": 0.018144864588975906, + "learning_rate": 5.655640170536053e-07, + "loss": 0.0014, + "step": 16870 + }, + { + "epoch": 4.3149284253578735, + "grad_norm": 0.0022675390355288982, + "learning_rate": 5.614488016746216e-07, + "loss": 0.0007, + "step": 16880 + }, + { + "epoch": 4.317484662576687, + "grad_norm": 0.22662724554538727, + "learning_rate": 5.573477220224777e-07, + "loss": 0.0006, + "step": 16890 + }, + { + "epoch": 4.320040899795501, + "grad_norm": 0.011389585211873055, + "learning_rate": 5.532607911581294e-07, + "loss": 0.0022, + "step": 16900 + }, + { + "epoch": 4.322597137014315, + "grad_norm": 0.06751693785190582, + "learning_rate": 5.491880220974799e-07, + "loss": 0.0005, + "step": 16910 + }, + { + "epoch": 4.325153374233129, + "grad_norm": 0.006349935662001371, + "learning_rate": 5.451294278113234e-07, + "loss": 0.0002, + "step": 16920 + }, + { + "epoch": 4.327709611451943, + "grad_norm": 0.013773099519312382, + "learning_rate": 5.410850212253193e-07, + "loss": 0.0001, + "step": 16930 + }, + { + "epoch": 4.330265848670757, + "grad_norm": 0.010721610859036446, + "learning_rate": 5.37054815219934e-07, + "loss": 0.0001, + "step": 16940 + }, + { + "epoch": 4.33282208588957, + "grad_norm": 0.3423142433166504, + "learning_rate": 5.330388226304145e-07, + "loss": 0.0005, + "step": 16950 + }, + { + "epoch": 4.335378323108385, + "grad_norm": 1.396784782409668, + "learning_rate": 5.290370562467378e-07, + "loss": 0.0034, + "step": 16960 + }, + { + "epoch": 4.337934560327199, + "grad_norm": 0.018410976976156235, + "learning_rate": 5.250495288135776e-07, + "loss": 0.001, + "step": 16970 + }, + { + "epoch": 4.340490797546012, + "grad_norm": 0.0063552772626280785, + "learning_rate": 5.210762530302554e-07, + "loss": 0.0005, + "step": 16980 + }, + { + "epoch": 4.343047034764826, + "grad_norm": 0.010990941897034645, + "learning_rate": 5.17117241550707e-07, + "loss": 0.0022, + "step": 16990 + }, + { + "epoch": 4.34560327198364, + "grad_norm": 0.009601407684385777, + "learning_rate": 5.131725069834403e-07, + "loss": 0.0004, + "step": 17000 + }, + { + "epoch": 4.348159509202454, + "grad_norm": 0.010526538826525211, + "learning_rate": 5.092420618914934e-07, + "loss": 0.0007, + "step": 17010 + }, + { + "epoch": 4.350715746421268, + "grad_norm": 0.02961459383368492, + "learning_rate": 5.053259187923981e-07, + "loss": 0.0008, + "step": 17020 + }, + { + "epoch": 4.353271983640082, + "grad_norm": 0.9297602772712708, + "learning_rate": 5.01424090158133e-07, + "loss": 0.0008, + "step": 17030 + }, + { + "epoch": 4.355828220858895, + "grad_norm": 0.12763309478759766, + "learning_rate": 4.975365884150951e-07, + "loss": 0.0002, + "step": 17040 + }, + { + "epoch": 4.35838445807771, + "grad_norm": 0.005043504294008017, + "learning_rate": 4.93663425944047e-07, + "loss": 0.001, + "step": 17050 + }, + { + "epoch": 4.360940695296524, + "grad_norm": 0.7221952676773071, + "learning_rate": 4.8980461508009e-07, + "loss": 0.0012, + "step": 17060 + }, + { + "epoch": 4.363496932515337, + "grad_norm": 0.00512282457202673, + "learning_rate": 4.85960168112613e-07, + "loss": 0.0041, + "step": 17070 + }, + { + "epoch": 4.366053169734151, + "grad_norm": 0.27002206444740295, + "learning_rate": 4.821300972852666e-07, + "loss": 0.002, + "step": 17080 + }, + { + "epoch": 4.368609406952965, + "grad_norm": 0.0055974265560507774, + "learning_rate": 4.783144147959096e-07, + "loss": 0.0001, + "step": 17090 + }, + { + "epoch": 4.371165644171779, + "grad_norm": 0.011708883568644524, + "learning_rate": 4.745131327965818e-07, + "loss": 0.0006, + "step": 17100 + }, + { + "epoch": 4.373721881390593, + "grad_norm": 0.47823408246040344, + "learning_rate": 4.7072626339345896e-07, + "loss": 0.0006, + "step": 17110 + }, + { + "epoch": 4.376278118609407, + "grad_norm": 0.01948222517967224, + "learning_rate": 4.669538186468192e-07, + "loss": 0.0007, + "step": 17120 + }, + { + "epoch": 4.378834355828221, + "grad_norm": 0.006773567758500576, + "learning_rate": 4.6319581057099604e-07, + "loss": 0.0009, + "step": 17130 + }, + { + "epoch": 4.381390593047035, + "grad_norm": 0.009596975520253181, + "learning_rate": 4.5945225113435024e-07, + "loss": 0.0005, + "step": 17140 + }, + { + "epoch": 4.383946830265849, + "grad_norm": 0.007052087225019932, + "learning_rate": 4.557231522592254e-07, + "loss": 0.0102, + "step": 17150 + }, + { + "epoch": 4.386503067484663, + "grad_norm": 0.00172753247898072, + "learning_rate": 4.520085258219131e-07, + "loss": 0.0011, + "step": 17160 + }, + { + "epoch": 4.389059304703476, + "grad_norm": 0.0074590700678527355, + "learning_rate": 4.4830838365261086e-07, + "loss": 0.0003, + "step": 17170 + }, + { + "epoch": 4.39161554192229, + "grad_norm": 0.011804984882473946, + "learning_rate": 4.446227375353895e-07, + "loss": 0.0001, + "step": 17180 + }, + { + "epoch": 4.394171779141105, + "grad_norm": 0.011131849139928818, + "learning_rate": 4.4095159920815254e-07, + "loss": 0.0064, + "step": 17190 + }, + { + "epoch": 4.396728016359918, + "grad_norm": 0.01048702746629715, + "learning_rate": 4.3729498036260144e-07, + "loss": 0.0001, + "step": 17200 + }, + { + "epoch": 4.399284253578732, + "grad_norm": 0.005339341703802347, + "learning_rate": 4.336528926441924e-07, + "loss": 0.0013, + "step": 17210 + }, + { + "epoch": 4.401840490797546, + "grad_norm": 0.005323043093085289, + "learning_rate": 4.300253476521077e-07, + "loss": 0.0006, + "step": 17220 + }, + { + "epoch": 4.40439672801636, + "grad_norm": 4.231276512145996, + "learning_rate": 4.2641235693921257e-07, + "loss": 0.0021, + "step": 17230 + }, + { + "epoch": 4.406952965235174, + "grad_norm": 0.007030295208096504, + "learning_rate": 4.228139320120211e-07, + "loss": 0.0004, + "step": 17240 + }, + { + "epoch": 4.409509202453988, + "grad_norm": 0.0034739875700324774, + "learning_rate": 4.1923008433065627e-07, + "loss": 0.0014, + "step": 17250 + }, + { + "epoch": 4.412065439672801, + "grad_norm": 0.008167327381670475, + "learning_rate": 4.1566082530882126e-07, + "loss": 0.0004, + "step": 17260 + }, + { + "epoch": 4.414621676891615, + "grad_norm": 0.008502046577632427, + "learning_rate": 4.1210616631375267e-07, + "loss": 0.0003, + "step": 17270 + }, + { + "epoch": 4.41717791411043, + "grad_norm": 0.011800073087215424, + "learning_rate": 4.085661186661921e-07, + "loss": 0.0001, + "step": 17280 + }, + { + "epoch": 4.419734151329243, + "grad_norm": 0.013263700529932976, + "learning_rate": 4.050406936403456e-07, + "loss": 0.0001, + "step": 17290 + }, + { + "epoch": 4.422290388548057, + "grad_norm": 2.6378958225250244, + "learning_rate": 4.015299024638536e-07, + "loss": 0.001, + "step": 17300 + }, + { + "epoch": 4.424846625766871, + "grad_norm": 0.012993947602808475, + "learning_rate": 3.9803375631774555e-07, + "loss": 0.0008, + "step": 17310 + }, + { + "epoch": 4.427402862985685, + "grad_norm": 0.01075220387428999, + "learning_rate": 3.945522663364154e-07, + "loss": 0.0001, + "step": 17320 + }, + { + "epoch": 4.429959100204499, + "grad_norm": 0.2742706537246704, + "learning_rate": 3.910854436075767e-07, + "loss": 0.0002, + "step": 17330 + }, + { + "epoch": 4.432515337423313, + "grad_norm": 0.007065648213028908, + "learning_rate": 3.876332991722348e-07, + "loss": 0.0016, + "step": 17340 + }, + { + "epoch": 4.4350715746421265, + "grad_norm": 0.020136894658207893, + "learning_rate": 3.84195844024644e-07, + "loss": 0.0005, + "step": 17350 + }, + { + "epoch": 4.43762781186094, + "grad_norm": 0.027455536648631096, + "learning_rate": 3.8077308911227964e-07, + "loss": 0.0006, + "step": 17360 + }, + { + "epoch": 4.440184049079755, + "grad_norm": 0.05177016928792, + "learning_rate": 3.773650453358008e-07, + "loss": 0.0005, + "step": 17370 + }, + { + "epoch": 4.4427402862985685, + "grad_norm": 1.8424170017242432, + "learning_rate": 3.739717235490137e-07, + "loss": 0.0013, + "step": 17380 + }, + { + "epoch": 4.445296523517382, + "grad_norm": 0.1712723970413208, + "learning_rate": 3.705931345588376e-07, + "loss": 0.0003, + "step": 17390 + }, + { + "epoch": 4.447852760736196, + "grad_norm": 0.09310045093297958, + "learning_rate": 3.672292891252732e-07, + "loss": 0.0001, + "step": 17400 + }, + { + "epoch": 4.4504089979550105, + "grad_norm": 0.0639791414141655, + "learning_rate": 3.6388019796136654e-07, + "loss": 0.001, + "step": 17410 + }, + { + "epoch": 4.452965235173824, + "grad_norm": 0.0685553252696991, + "learning_rate": 3.605458717331739e-07, + "loss": 0.0006, + "step": 17420 + }, + { + "epoch": 4.455521472392638, + "grad_norm": 0.034271907061338425, + "learning_rate": 3.5722632105972765e-07, + "loss": 0.0007, + "step": 17430 + }, + { + "epoch": 4.458077709611452, + "grad_norm": 0.04286443442106247, + "learning_rate": 3.539215565130055e-07, + "loss": 0.0007, + "step": 17440 + }, + { + "epoch": 4.460633946830266, + "grad_norm": 0.02046520821750164, + "learning_rate": 3.506315886178957e-07, + "loss": 0.0006, + "step": 17450 + }, + { + "epoch": 4.46319018404908, + "grad_norm": 0.012680341489613056, + "learning_rate": 3.4735642785215963e-07, + "loss": 0.0041, + "step": 17460 + }, + { + "epoch": 4.465746421267894, + "grad_norm": 0.005123642738908529, + "learning_rate": 3.4409608464640366e-07, + "loss": 0.0006, + "step": 17470 + }, + { + "epoch": 4.468302658486707, + "grad_norm": 0.003161899745464325, + "learning_rate": 3.4085056938404303e-07, + "loss": 0.0009, + "step": 17480 + }, + { + "epoch": 4.470858895705521, + "grad_norm": 0.009017124772071838, + "learning_rate": 3.376198924012708e-07, + "loss": 0.001, + "step": 17490 + }, + { + "epoch": 4.473415132924336, + "grad_norm": 0.04075018689036369, + "learning_rate": 3.3440406398702055e-07, + "loss": 0.0024, + "step": 17500 + }, + { + "epoch": 4.475971370143149, + "grad_norm": 3.3212058544158936, + "learning_rate": 3.3120309438293973e-07, + "loss": 0.0038, + "step": 17510 + }, + { + "epoch": 4.478527607361963, + "grad_norm": 0.15001584589481354, + "learning_rate": 3.2801699378335274e-07, + "loss": 0.0001, + "step": 17520 + }, + { + "epoch": 4.481083844580777, + "grad_norm": 0.054374609142541885, + "learning_rate": 3.248457723352316e-07, + "loss": 0.0008, + "step": 17530 + }, + { + "epoch": 4.483640081799591, + "grad_norm": 0.01903144083917141, + "learning_rate": 3.2168944013815764e-07, + "loss": 0.0002, + "step": 17540 + }, + { + "epoch": 4.486196319018405, + "grad_norm": 0.012091502547264099, + "learning_rate": 3.1854800724429703e-07, + "loss": 0.0003, + "step": 17550 + }, + { + "epoch": 4.488752556237219, + "grad_norm": 0.0041188085451722145, + "learning_rate": 3.1542148365836465e-07, + "loss": 0.0001, + "step": 17560 + }, + { + "epoch": 4.4913087934560325, + "grad_norm": 0.004006090573966503, + "learning_rate": 3.123098793375928e-07, + "loss": 0.0055, + "step": 17570 + }, + { + "epoch": 4.493865030674846, + "grad_norm": 4.543797016143799, + "learning_rate": 3.092132041916979e-07, + "loss": 0.0013, + "step": 17580 + }, + { + "epoch": 4.496421267893661, + "grad_norm": 0.006038820371031761, + "learning_rate": 3.06131468082852e-07, + "loss": 0.0002, + "step": 17590 + }, + { + "epoch": 4.4989775051124745, + "grad_norm": 0.06375231593847275, + "learning_rate": 3.0306468082564933e-07, + "loss": 0.0013, + "step": 17600 + }, + { + "epoch": 4.501533742331288, + "grad_norm": 0.010167334228754044, + "learning_rate": 3.000128521870771e-07, + "loss": 0.0001, + "step": 17610 + }, + { + "epoch": 4.504089979550102, + "grad_norm": 0.3248971700668335, + "learning_rate": 2.969759918864784e-07, + "loss": 0.0001, + "step": 17620 + }, + { + "epoch": 4.5066462167689165, + "grad_norm": 0.0035574256908148527, + "learning_rate": 2.939541095955334e-07, + "loss": 0.0016, + "step": 17630 + }, + { + "epoch": 4.50920245398773, + "grad_norm": 0.0564039871096611, + "learning_rate": 2.9094721493821255e-07, + "loss": 0.0009, + "step": 17640 + }, + { + "epoch": 4.511758691206544, + "grad_norm": 0.0021962756291031837, + "learning_rate": 2.8795531749076067e-07, + "loss": 0.0008, + "step": 17650 + }, + { + "epoch": 4.514314928425358, + "grad_norm": 0.00536281056702137, + "learning_rate": 2.8497842678165467e-07, + "loss": 0.0006, + "step": 17660 + }, + { + "epoch": 4.516871165644172, + "grad_norm": 0.007071573752909899, + "learning_rate": 2.8201655229158465e-07, + "loss": 0.0002, + "step": 17670 + }, + { + "epoch": 4.519427402862986, + "grad_norm": 6.006162643432617, + "learning_rate": 2.7906970345341177e-07, + "loss": 0.0027, + "step": 17680 + }, + { + "epoch": 4.5219836400818, + "grad_norm": 0.013232548721134663, + "learning_rate": 2.761378896521477e-07, + "loss": 0.0009, + "step": 17690 + }, + { + "epoch": 4.524539877300613, + "grad_norm": 0.007870076224207878, + "learning_rate": 2.732211202249202e-07, + "loss": 0.0004, + "step": 17700 + }, + { + "epoch": 4.527096114519427, + "grad_norm": 0.001714581623673439, + "learning_rate": 2.7031940446094475e-07, + "loss": 0.0005, + "step": 17710 + }, + { + "epoch": 4.529652351738242, + "grad_norm": 0.7307052612304688, + "learning_rate": 2.674327516014924e-07, + "loss": 0.0004, + "step": 17720 + }, + { + "epoch": 4.532208588957055, + "grad_norm": 0.15873846411705017, + "learning_rate": 2.6456117083986487e-07, + "loss": 0.0001, + "step": 17730 + }, + { + "epoch": 4.534764826175869, + "grad_norm": 0.02356737293303013, + "learning_rate": 2.617046713213617e-07, + "loss": 0.0005, + "step": 17740 + }, + { + "epoch": 4.537321063394683, + "grad_norm": 0.005058961920440197, + "learning_rate": 2.5886326214325297e-07, + "loss": 0.0061, + "step": 17750 + }, + { + "epoch": 4.539877300613497, + "grad_norm": 0.007151829544454813, + "learning_rate": 2.560369523547485e-07, + "loss": 0.0011, + "step": 17760 + }, + { + "epoch": 4.542433537832311, + "grad_norm": 0.008786034770309925, + "learning_rate": 2.5322575095697077e-07, + "loss": 0.0012, + "step": 17770 + }, + { + "epoch": 4.544989775051125, + "grad_norm": 0.01927710324525833, + "learning_rate": 2.50429666902926e-07, + "loss": 0.0001, + "step": 17780 + }, + { + "epoch": 4.5475460122699385, + "grad_norm": 0.008998622186481953, + "learning_rate": 2.476487090974755e-07, + "loss": 0.0002, + "step": 17790 + }, + { + "epoch": 4.550102249488752, + "grad_norm": 0.007529801689088345, + "learning_rate": 2.448828863973052e-07, + "loss": 0.0006, + "step": 17800 + }, + { + "epoch": 4.552658486707567, + "grad_norm": 0.039857879281044006, + "learning_rate": 2.4213220761090173e-07, + "loss": 0.0069, + "step": 17810 + }, + { + "epoch": 4.5552147239263805, + "grad_norm": 0.008589456789195538, + "learning_rate": 2.3939668149852046e-07, + "loss": 0.0001, + "step": 17820 + }, + { + "epoch": 4.557770961145194, + "grad_norm": 0.005125945899635553, + "learning_rate": 2.366763167721603e-07, + "loss": 0.0003, + "step": 17830 + }, + { + "epoch": 4.560327198364008, + "grad_norm": 0.004016405437141657, + "learning_rate": 2.3397112209553207e-07, + "loss": 0.0002, + "step": 17840 + }, + { + "epoch": 4.5628834355828225, + "grad_norm": 0.08685509860515594, + "learning_rate": 2.312811060840381e-07, + "loss": 0.0026, + "step": 17850 + }, + { + "epoch": 4.565439672801636, + "grad_norm": 0.0353722870349884, + "learning_rate": 2.286062773047354e-07, + "loss": 0.0001, + "step": 17860 + }, + { + "epoch": 4.56799591002045, + "grad_norm": 0.021159430965781212, + "learning_rate": 2.2594664427631807e-07, + "loss": 0.0001, + "step": 17870 + }, + { + "epoch": 4.570552147239264, + "grad_norm": 0.02464616298675537, + "learning_rate": 2.2330221546908005e-07, + "loss": 0.0002, + "step": 17880 + }, + { + "epoch": 4.573108384458077, + "grad_norm": 0.042522210627794266, + "learning_rate": 2.2067299930489838e-07, + "loss": 0.0002, + "step": 17890 + }, + { + "epoch": 4.575664621676892, + "grad_norm": 0.03682945668697357, + "learning_rate": 2.180590041571995e-07, + "loss": 0.0001, + "step": 17900 + }, + { + "epoch": 4.578220858895706, + "grad_norm": 0.006650723051279783, + "learning_rate": 2.15460238350933e-07, + "loss": 0.0001, + "step": 17910 + }, + { + "epoch": 4.580777096114519, + "grad_norm": 0.008757648058235645, + "learning_rate": 2.1287671016254897e-07, + "loss": 0.0002, + "step": 17920 + }, + { + "epoch": 4.583333333333333, + "grad_norm": 0.17380449175834656, + "learning_rate": 2.1030842781996796e-07, + "loss": 0.0002, + "step": 17930 + }, + { + "epoch": 4.585889570552148, + "grad_norm": 0.010563348419964314, + "learning_rate": 2.0775539950255774e-07, + "loss": 0.0002, + "step": 17940 + }, + { + "epoch": 4.588445807770961, + "grad_norm": 0.1902349591255188, + "learning_rate": 2.0521763334110324e-07, + "loss": 0.0002, + "step": 17950 + }, + { + "epoch": 4.591002044989775, + "grad_norm": 0.012331271544098854, + "learning_rate": 2.0269513741778492e-07, + "loss": 0.0002, + "step": 17960 + }, + { + "epoch": 4.593558282208589, + "grad_norm": 0.03246452286839485, + "learning_rate": 2.0018791976615048e-07, + "loss": 0.0001, + "step": 17970 + }, + { + "epoch": 4.5961145194274025, + "grad_norm": 0.03161914646625519, + "learning_rate": 1.9769598837109105e-07, + "loss": 0.0051, + "step": 17980 + }, + { + "epoch": 4.598670756646217, + "grad_norm": 0.0297340489923954, + "learning_rate": 1.9521935116881107e-07, + "loss": 0.0002, + "step": 17990 + }, + { + "epoch": 4.601226993865031, + "grad_norm": 3.1284525394439697, + "learning_rate": 1.9275801604681232e-07, + "loss": 0.0012, + "step": 18000 + }, + { + "epoch": 4.6037832310838445, + "grad_norm": 0.005135530140250921, + "learning_rate": 1.9031199084385833e-07, + "loss": 0.0006, + "step": 18010 + }, + { + "epoch": 4.606339468302658, + "grad_norm": 0.005707759875804186, + "learning_rate": 1.8788128334995715e-07, + "loss": 0.0001, + "step": 18020 + }, + { + "epoch": 4.608895705521473, + "grad_norm": 0.008373766206204891, + "learning_rate": 1.8546590130633035e-07, + "loss": 0.0011, + "step": 18030 + }, + { + "epoch": 4.6114519427402865, + "grad_norm": 0.013387088663876057, + "learning_rate": 1.8306585240539576e-07, + "loss": 0.0, + "step": 18040 + }, + { + "epoch": 4.6140081799591, + "grad_norm": 0.0016770199872553349, + "learning_rate": 1.8068114429073524e-07, + "loss": 0.0002, + "step": 18050 + }, + { + "epoch": 4.616564417177914, + "grad_norm": 0.005193398799747229, + "learning_rate": 1.7831178455707533e-07, + "loss": 0.0001, + "step": 18060 + }, + { + "epoch": 4.619120654396728, + "grad_norm": 0.008146319538354874, + "learning_rate": 1.759577807502627e-07, + "loss": 0.001, + "step": 18070 + }, + { + "epoch": 4.621676891615542, + "grad_norm": 0.00837623793631792, + "learning_rate": 1.736191403672377e-07, + "loss": 0.0028, + "step": 18080 + }, + { + "epoch": 4.624233128834356, + "grad_norm": 0.0052582272328436375, + "learning_rate": 1.7129587085601084e-07, + "loss": 0.0031, + "step": 18090 + }, + { + "epoch": 4.62678936605317, + "grad_norm": 0.004001881927251816, + "learning_rate": 1.689879796156424e-07, + "loss": 0.0001, + "step": 18100 + }, + { + "epoch": 4.629345603271983, + "grad_norm": 0.00547180837020278, + "learning_rate": 1.6669547399621567e-07, + "loss": 0.0012, + "step": 18110 + }, + { + "epoch": 4.631901840490798, + "grad_norm": 0.01456737145781517, + "learning_rate": 1.6441836129881427e-07, + "loss": 0.0002, + "step": 18120 + }, + { + "epoch": 4.634458077709612, + "grad_norm": 0.02086499147117138, + "learning_rate": 1.6215664877549774e-07, + "loss": 0.0005, + "step": 18130 + }, + { + "epoch": 4.637014314928425, + "grad_norm": 0.012769919820129871, + "learning_rate": 1.5991034362928204e-07, + "loss": 0.0002, + "step": 18140 + }, + { + "epoch": 4.639570552147239, + "grad_norm": 0.18312056362628937, + "learning_rate": 1.576794530141129e-07, + "loss": 0.0016, + "step": 18150 + }, + { + "epoch": 4.642126789366053, + "grad_norm": 0.0038857313338667154, + "learning_rate": 1.5546398403484542e-07, + "loss": 0.0028, + "step": 18160 + }, + { + "epoch": 4.644683026584867, + "grad_norm": 0.011599598452448845, + "learning_rate": 1.5326394374721887e-07, + "loss": 0.0001, + "step": 18170 + }, + { + "epoch": 4.647239263803681, + "grad_norm": 0.01293268147855997, + "learning_rate": 1.5107933915783745e-07, + "loss": 0.0008, + "step": 18180 + }, + { + "epoch": 4.649795501022495, + "grad_norm": 0.012520798482000828, + "learning_rate": 1.4891017722414525e-07, + "loss": 0.0012, + "step": 18190 + }, + { + "epoch": 4.652351738241308, + "grad_norm": 1.7857009172439575, + "learning_rate": 1.467564648544062e-07, + "loss": 0.0014, + "step": 18200 + }, + { + "epoch": 4.654907975460123, + "grad_norm": 0.0051713059656322, + "learning_rate": 1.4461820890767976e-07, + "loss": 0.0001, + "step": 18210 + }, + { + "epoch": 4.657464212678937, + "grad_norm": 0.27911797165870667, + "learning_rate": 1.424954161938019e-07, + "loss": 0.0002, + "step": 18220 + }, + { + "epoch": 4.66002044989775, + "grad_norm": 0.030297599732875824, + "learning_rate": 1.4038809347336036e-07, + "loss": 0.0001, + "step": 18230 + }, + { + "epoch": 4.662576687116564, + "grad_norm": 0.008832601830363274, + "learning_rate": 1.38296247457676e-07, + "loss": 0.0002, + "step": 18240 + }, + { + "epoch": 4.665132924335378, + "grad_norm": 0.015586239285767078, + "learning_rate": 1.3621988480877812e-07, + "loss": 0.0001, + "step": 18250 + }, + { + "epoch": 4.6676891615541924, + "grad_norm": 0.010234987363219261, + "learning_rate": 1.341590121393882e-07, + "loss": 0.0001, + "step": 18260 + }, + { + "epoch": 4.670245398773006, + "grad_norm": 0.0027397891972213984, + "learning_rate": 1.3211363601289273e-07, + "loss": 0.0033, + "step": 18270 + }, + { + "epoch": 4.67280163599182, + "grad_norm": 0.009892730042338371, + "learning_rate": 1.3008376294332715e-07, + "loss": 0.0001, + "step": 18280 + }, + { + "epoch": 4.675357873210634, + "grad_norm": 0.03709021210670471, + "learning_rate": 1.2806939939535358e-07, + "loss": 0.0001, + "step": 18290 + }, + { + "epoch": 4.677914110429448, + "grad_norm": 0.006908297538757324, + "learning_rate": 1.2607055178423978e-07, + "loss": 0.0001, + "step": 18300 + }, + { + "epoch": 4.680470347648262, + "grad_norm": 0.009108340367674828, + "learning_rate": 1.2408722647583692e-07, + "loss": 0.0001, + "step": 18310 + }, + { + "epoch": 4.683026584867076, + "grad_norm": 0.009007609449326992, + "learning_rate": 1.221194297865641e-07, + "loss": 0.0006, + "step": 18320 + }, + { + "epoch": 4.685582822085889, + "grad_norm": 0.16736240684986115, + "learning_rate": 1.2016716798338436e-07, + "loss": 0.0004, + "step": 18330 + }, + { + "epoch": 4.688139059304703, + "grad_norm": 0.015963025391101837, + "learning_rate": 1.182304472837853e-07, + "loss": 0.0001, + "step": 18340 + }, + { + "epoch": 4.690695296523518, + "grad_norm": 2.035236358642578, + "learning_rate": 1.1630927385576196e-07, + "loss": 0.0034, + "step": 18350 + }, + { + "epoch": 4.693251533742331, + "grad_norm": 0.017824502661824226, + "learning_rate": 1.1440365381779117e-07, + "loss": 0.0003, + "step": 18360 + }, + { + "epoch": 4.695807770961145, + "grad_norm": 0.021831089630723, + "learning_rate": 1.1251359323881994e-07, + "loss": 0.0004, + "step": 18370 + }, + { + "epoch": 4.69836400817996, + "grad_norm": 0.016453437507152557, + "learning_rate": 1.1063909813823992e-07, + "loss": 0.0002, + "step": 18380 + }, + { + "epoch": 4.700920245398773, + "grad_norm": 0.0010980580700561404, + "learning_rate": 1.0878017448587075e-07, + "loss": 0.0007, + "step": 18390 + }, + { + "epoch": 4.703476482617587, + "grad_norm": 0.015402301214635372, + "learning_rate": 1.0693682820194062e-07, + "loss": 0.0001, + "step": 18400 + }, + { + "epoch": 4.706032719836401, + "grad_norm": 0.00750606507062912, + "learning_rate": 1.0510906515706798e-07, + "loss": 0.0001, + "step": 18410 + }, + { + "epoch": 4.708588957055214, + "grad_norm": 0.007333151530474424, + "learning_rate": 1.0329689117224262e-07, + "loss": 0.0001, + "step": 18420 + }, + { + "epoch": 4.711145194274029, + "grad_norm": 0.043747998774051666, + "learning_rate": 1.0150031201880573e-07, + "loss": 0.0017, + "step": 18430 + }, + { + "epoch": 4.713701431492843, + "grad_norm": 0.038498375564813614, + "learning_rate": 9.97193334184332e-08, + "loss": 0.0005, + "step": 18440 + }, + { + "epoch": 4.716257668711656, + "grad_norm": 0.01316425297409296, + "learning_rate": 9.79539610431185e-08, + "loss": 0.0001, + "step": 18450 + }, + { + "epoch": 4.71881390593047, + "grad_norm": 0.014060962945222855, + "learning_rate": 9.620420051514978e-08, + "loss": 0.0002, + "step": 18460 + }, + { + "epoch": 4.721370143149285, + "grad_norm": 0.014505197294056416, + "learning_rate": 9.44700574070978e-08, + "loss": 0.002, + "step": 18470 + }, + { + "epoch": 4.723926380368098, + "grad_norm": 0.006656293291598558, + "learning_rate": 9.275153724179475e-08, + "loss": 0.001, + "step": 18480 + }, + { + "epoch": 4.726482617586912, + "grad_norm": 0.009168506599962711, + "learning_rate": 9.104864549231706e-08, + "loss": 0.0005, + "step": 18490 + }, + { + "epoch": 4.729038854805726, + "grad_norm": 0.016905134543776512, + "learning_rate": 8.936138758196933e-08, + "loss": 0.0001, + "step": 18500 + }, + { + "epoch": 4.7315950920245395, + "grad_norm": 0.012130284681916237, + "learning_rate": 8.768976888426484e-08, + "loss": 0.0001, + "step": 18510 + }, + { + "epoch": 4.734151329243354, + "grad_norm": 0.08311706781387329, + "learning_rate": 8.603379472291118e-08, + "loss": 0.0006, + "step": 18520 + }, + { + "epoch": 4.736707566462168, + "grad_norm": 0.006343462970107794, + "learning_rate": 8.43934703717908e-08, + "loss": 0.0001, + "step": 18530 + }, + { + "epoch": 4.7392638036809815, + "grad_norm": 0.0163714736700058, + "learning_rate": 8.27688010549449e-08, + "loss": 0.0001, + "step": 18540 + }, + { + "epoch": 4.741820040899795, + "grad_norm": 0.004221266135573387, + "learning_rate": 8.115979194655843e-08, + "loss": 0.0006, + "step": 18550 + }, + { + "epoch": 4.74437627811861, + "grad_norm": 0.21704137325286865, + "learning_rate": 7.956644817094072e-08, + "loss": 0.0005, + "step": 18560 + }, + { + "epoch": 4.7469325153374236, + "grad_norm": 0.010948584415018559, + "learning_rate": 7.798877480251321e-08, + "loss": 0.0002, + "step": 18570 + }, + { + "epoch": 4.749488752556237, + "grad_norm": 0.004772350657731295, + "learning_rate": 7.642677686578726e-08, + "loss": 0.0001, + "step": 18580 + }, + { + "epoch": 4.752044989775051, + "grad_norm": 0.005361333955079317, + "learning_rate": 7.488045933535582e-08, + "loss": 0.0004, + "step": 18590 + }, + { + "epoch": 4.754601226993865, + "grad_norm": 0.014715801924467087, + "learning_rate": 7.334982713586958e-08, + "loss": 0.0013, + "step": 18600 + }, + { + "epoch": 4.757157464212679, + "grad_norm": 0.16467002034187317, + "learning_rate": 7.183488514202863e-08, + "loss": 0.0002, + "step": 18610 + }, + { + "epoch": 4.759713701431493, + "grad_norm": 0.01957176998257637, + "learning_rate": 7.03356381785597e-08, + "loss": 0.0001, + "step": 18620 + }, + { + "epoch": 4.762269938650307, + "grad_norm": 1.6649690866470337, + "learning_rate": 6.885209102020896e-08, + "loss": 0.0014, + "step": 18630 + }, + { + "epoch": 4.76482617586912, + "grad_norm": 0.3953768014907837, + "learning_rate": 6.73842483917192e-08, + "loss": 0.0001, + "step": 18640 + }, + { + "epoch": 4.767382413087935, + "grad_norm": 0.14624808728694916, + "learning_rate": 6.593211496781881e-08, + "loss": 0.0004, + "step": 18650 + }, + { + "epoch": 4.769938650306749, + "grad_norm": 0.1692701280117035, + "learning_rate": 6.449569537320677e-08, + "loss": 0.0002, + "step": 18660 + }, + { + "epoch": 4.772494887525562, + "grad_norm": 0.009253126569092274, + "learning_rate": 6.307499418253705e-08, + "loss": 0.0001, + "step": 18670 + }, + { + "epoch": 4.775051124744376, + "grad_norm": 0.011899994686245918, + "learning_rate": 6.167001592040367e-08, + "loss": 0.0001, + "step": 18680 + }, + { + "epoch": 4.77760736196319, + "grad_norm": 0.0060684094205498695, + "learning_rate": 6.028076506132741e-08, + "loss": 0.0014, + "step": 18690 + }, + { + "epoch": 4.780163599182004, + "grad_norm": 0.004709139931946993, + "learning_rate": 5.890724602974074e-08, + "loss": 0.0001, + "step": 18700 + }, + { + "epoch": 4.782719836400818, + "grad_norm": 0.01010716613382101, + "learning_rate": 5.7549463199974566e-08, + "loss": 0.0001, + "step": 18710 + }, + { + "epoch": 4.785276073619632, + "grad_norm": 0.016905710101127625, + "learning_rate": 5.6207420896242646e-08, + "loss": 0.0001, + "step": 18720 + }, + { + "epoch": 4.7878323108384455, + "grad_norm": 0.016229376196861267, + "learning_rate": 5.488112339263052e-08, + "loss": 0.0002, + "step": 18730 + }, + { + "epoch": 4.79038854805726, + "grad_norm": 0.05223598703742027, + "learning_rate": 5.3570574913078264e-08, + "loss": 0.0006, + "step": 18740 + }, + { + "epoch": 4.792944785276074, + "grad_norm": 2.0258493423461914, + "learning_rate": 5.2275779631371646e-08, + "loss": 0.0006, + "step": 18750 + }, + { + "epoch": 4.7955010224948875, + "grad_norm": 3.3475382328033447, + "learning_rate": 5.0996741671123226e-08, + "loss": 0.0017, + "step": 18760 + }, + { + "epoch": 4.798057259713701, + "grad_norm": 0.05064383149147034, + "learning_rate": 4.97334651057646e-08, + "loss": 0.0011, + "step": 18770 + }, + { + "epoch": 4.800613496932515, + "grad_norm": 0.023241933435201645, + "learning_rate": 4.8485953958530286e-08, + "loss": 0.0001, + "step": 18780 + }, + { + "epoch": 4.8031697341513295, + "grad_norm": 0.016580374911427498, + "learning_rate": 4.725421220244553e-08, + "loss": 0.0005, + "step": 18790 + }, + { + "epoch": 4.805725971370143, + "grad_norm": 0.029072437435388565, + "learning_rate": 4.603824376031407e-08, + "loss": 0.0009, + "step": 18800 + }, + { + "epoch": 4.808282208588957, + "grad_norm": 0.012765723280608654, + "learning_rate": 4.4838052504705406e-08, + "loss": 0.0001, + "step": 18810 + }, + { + "epoch": 4.810838445807771, + "grad_norm": 0.10864470154047012, + "learning_rate": 4.3653642257943105e-08, + "loss": 0.0001, + "step": 18820 + }, + { + "epoch": 4.813394683026585, + "grad_norm": 0.876848578453064, + "learning_rate": 4.248501679208983e-08, + "loss": 0.0015, + "step": 18830 + }, + { + "epoch": 4.815950920245399, + "grad_norm": 2.7491190433502197, + "learning_rate": 4.133217982894011e-08, + "loss": 0.0034, + "step": 18840 + }, + { + "epoch": 4.818507157464213, + "grad_norm": 0.014472966082394123, + "learning_rate": 4.019513504000372e-08, + "loss": 0.002, + "step": 18850 + }, + { + "epoch": 4.821063394683026, + "grad_norm": 2.2643988132476807, + "learning_rate": 3.907388604649842e-08, + "loss": 0.0031, + "step": 18860 + }, + { + "epoch": 4.82361963190184, + "grad_norm": 1.9290136098861694, + "learning_rate": 3.796843641933334e-08, + "loss": 0.0006, + "step": 18870 + }, + { + "epoch": 4.826175869120655, + "grad_norm": 0.0031455198768526316, + "learning_rate": 3.687878967910285e-08, + "loss": 0.0023, + "step": 18880 + }, + { + "epoch": 4.828732106339468, + "grad_norm": 0.07761465013027191, + "learning_rate": 3.580494929607159e-08, + "loss": 0.0017, + "step": 18890 + }, + { + "epoch": 4.831288343558282, + "grad_norm": 0.007843953557312489, + "learning_rate": 3.4746918690165e-08, + "loss": 0.0001, + "step": 18900 + }, + { + "epoch": 4.833844580777096, + "grad_norm": 0.012562461197376251, + "learning_rate": 3.370470123095826e-08, + "loss": 0.0, + "step": 18910 + }, + { + "epoch": 4.83640081799591, + "grad_norm": 0.030938010662794113, + "learning_rate": 3.267830023766516e-08, + "loss": 0.0003, + "step": 18920 + }, + { + "epoch": 4.838957055214724, + "grad_norm": 0.011501450091600418, + "learning_rate": 3.166771897912868e-08, + "loss": 0.0, + "step": 18930 + }, + { + "epoch": 4.841513292433538, + "grad_norm": 0.020580632612109184, + "learning_rate": 3.0672960673808205e-08, + "loss": 0.0001, + "step": 18940 + }, + { + "epoch": 4.8440695296523515, + "grad_norm": 0.9514909386634827, + "learning_rate": 2.969402848977232e-08, + "loss": 0.0004, + "step": 18950 + }, + { + "epoch": 4.846625766871165, + "grad_norm": 0.01408900786191225, + "learning_rate": 2.873092554468604e-08, + "loss": 0.0017, + "step": 18960 + }, + { + "epoch": 4.84918200408998, + "grad_norm": 0.007879039272665977, + "learning_rate": 2.7783654905803036e-08, + "loss": 0.0007, + "step": 18970 + }, + { + "epoch": 4.8517382413087935, + "grad_norm": 0.00814911350607872, + "learning_rate": 2.6852219589953986e-08, + "loss": 0.0012, + "step": 18980 + }, + { + "epoch": 4.854294478527607, + "grad_norm": 0.006910000462085009, + "learning_rate": 2.5936622563537685e-08, + "loss": 0.0, + "step": 18990 + }, + { + "epoch": 4.856850715746421, + "grad_norm": 0.01161187607795, + "learning_rate": 2.503686674251382e-08, + "loss": 0.0002, + "step": 19000 + }, + { + "epoch": 4.8594069529652355, + "grad_norm": 0.4636043906211853, + "learning_rate": 2.4152954992388565e-08, + "loss": 0.0005, + "step": 19010 + }, + { + "epoch": 4.861963190184049, + "grad_norm": 0.003940473776310682, + "learning_rate": 2.328489012821067e-08, + "loss": 0.0001, + "step": 19020 + }, + { + "epoch": 4.864519427402863, + "grad_norm": 0.01835138350725174, + "learning_rate": 2.243267491455925e-08, + "loss": 0.0018, + "step": 19030 + }, + { + "epoch": 4.867075664621677, + "grad_norm": 0.003659491892904043, + "learning_rate": 2.159631206553714e-08, + "loss": 0.0004, + "step": 19040 + }, + { + "epoch": 4.86963190184049, + "grad_norm": 0.010196760296821594, + "learning_rate": 2.077580424475978e-08, + "loss": 0.0007, + "step": 19050 + }, + { + "epoch": 4.872188139059305, + "grad_norm": 0.40100085735321045, + "learning_rate": 1.9971154065349108e-08, + "loss": 0.0008, + "step": 19060 + }, + { + "epoch": 4.874744376278119, + "grad_norm": 0.0034521687775850296, + "learning_rate": 1.9182364089924134e-08, + "loss": 0.0001, + "step": 19070 + }, + { + "epoch": 4.877300613496932, + "grad_norm": 0.019500114023685455, + "learning_rate": 1.8409436830593152e-08, + "loss": 0.0002, + "step": 19080 + }, + { + "epoch": 4.879856850715746, + "grad_norm": 0.006733261980116367, + "learning_rate": 1.765237474894488e-08, + "loss": 0.0046, + "step": 19090 + }, + { + "epoch": 4.882413087934561, + "grad_norm": 0.02504415065050125, + "learning_rate": 1.691118025604066e-08, + "loss": 0.0001, + "step": 19100 + }, + { + "epoch": 4.884969325153374, + "grad_norm": 0.017662404105067253, + "learning_rate": 1.618585571240949e-08, + "loss": 0.0006, + "step": 19110 + }, + { + "epoch": 4.887525562372188, + "grad_norm": 0.005395929794758558, + "learning_rate": 1.5476403428035803e-08, + "loss": 0.0001, + "step": 19120 + }, + { + "epoch": 4.890081799591002, + "grad_norm": 0.004282295238226652, + "learning_rate": 1.478282566235667e-08, + "loss": 0.0013, + "step": 19130 + }, + { + "epoch": 4.8926380368098155, + "grad_norm": 0.006876455619931221, + "learning_rate": 1.4105124624251843e-08, + "loss": 0.0001, + "step": 19140 + }, + { + "epoch": 4.89519427402863, + "grad_norm": 2.063361406326294, + "learning_rate": 1.3443302472036513e-08, + "loss": 0.0009, + "step": 19150 + }, + { + "epoch": 4.897750511247444, + "grad_norm": 0.34553998708724976, + "learning_rate": 1.279736131345799e-08, + "loss": 0.0013, + "step": 19160 + }, + { + "epoch": 4.9003067484662575, + "grad_norm": 0.18306072056293488, + "learning_rate": 1.2167303205682934e-08, + "loss": 0.001, + "step": 19170 + }, + { + "epoch": 4.902862985685072, + "grad_norm": 0.007264157757163048, + "learning_rate": 1.1553130155297908e-08, + "loss": 0.0066, + "step": 19180 + }, + { + "epoch": 4.905419222903886, + "grad_norm": 0.006425623781979084, + "learning_rate": 1.0954844118296614e-08, + "loss": 0.0032, + "step": 19190 + }, + { + "epoch": 4.9079754601226995, + "grad_norm": 0.014158538542687893, + "learning_rate": 1.0372447000077113e-08, + "loss": 0.0001, + "step": 19200 + }, + { + "epoch": 4.910531697341513, + "grad_norm": 0.011595489457249641, + "learning_rate": 9.805940655436274e-09, + "loss": 0.0003, + "step": 19210 + }, + { + "epoch": 4.913087934560327, + "grad_norm": 3.0229008197784424, + "learning_rate": 9.2553268885609e-09, + "loss": 0.0012, + "step": 19220 + }, + { + "epoch": 4.9156441717791415, + "grad_norm": 0.048986513167619705, + "learning_rate": 8.720607453024388e-09, + "loss": 0.0009, + "step": 19230 + }, + { + "epoch": 4.918200408997955, + "grad_norm": 0.010676774196326733, + "learning_rate": 8.20178405178118e-09, + "loss": 0.0008, + "step": 19240 + }, + { + "epoch": 4.920756646216769, + "grad_norm": 0.006409882567822933, + "learning_rate": 7.698858337159553e-09, + "loss": 0.0005, + "step": 19250 + }, + { + "epoch": 4.923312883435583, + "grad_norm": 0.019153757020831108, + "learning_rate": 7.2118319108582805e-09, + "loss": 0.0002, + "step": 19260 + }, + { + "epoch": 4.925869120654397, + "grad_norm": 0.15216705203056335, + "learning_rate": 6.7407063239405264e-09, + "loss": 0.0005, + "step": 19270 + }, + { + "epoch": 4.928425357873211, + "grad_norm": 0.005260101519525051, + "learning_rate": 6.285483076828858e-09, + "loss": 0.0002, + "step": 19280 + }, + { + "epoch": 4.930981595092025, + "grad_norm": 0.012515813112258911, + "learning_rate": 5.846163619300238e-09, + "loss": 0.0006, + "step": 19290 + }, + { + "epoch": 4.933537832310838, + "grad_norm": 0.2751409411430359, + "learning_rate": 5.422749350482148e-09, + "loss": 0.0025, + "step": 19300 + }, + { + "epoch": 4.936094069529652, + "grad_norm": 0.006292239762842655, + "learning_rate": 5.015241618849254e-09, + "loss": 0.0017, + "step": 19310 + }, + { + "epoch": 4.938650306748467, + "grad_norm": 0.006389266811311245, + "learning_rate": 4.623641722215077e-09, + "loss": 0.0003, + "step": 19320 + }, + { + "epoch": 4.94120654396728, + "grad_norm": 0.00943728256970644, + "learning_rate": 4.247950907733112e-09, + "loss": 0.0002, + "step": 19330 + }, + { + "epoch": 4.943762781186094, + "grad_norm": 0.10176153481006622, + "learning_rate": 3.888170371887934e-09, + "loss": 0.0009, + "step": 19340 + }, + { + "epoch": 4.946319018404908, + "grad_norm": 0.5887343287467957, + "learning_rate": 3.5443012604957638e-09, + "loss": 0.0003, + "step": 19350 + }, + { + "epoch": 4.948875255623722, + "grad_norm": 0.013615554198622704, + "learning_rate": 3.2163446686966913e-09, + "loss": 0.0013, + "step": 19360 + }, + { + "epoch": 4.951431492842536, + "grad_norm": 2.769341230392456, + "learning_rate": 2.9043016409552317e-09, + "loss": 0.0029, + "step": 19370 + }, + { + "epoch": 4.95398773006135, + "grad_norm": 0.010591404512524605, + "learning_rate": 2.6081731710531076e-09, + "loss": 0.0022, + "step": 19380 + }, + { + "epoch": 4.956543967280163, + "grad_norm": 0.02332007698714733, + "learning_rate": 2.3279602020892522e-09, + "loss": 0.0001, + "step": 19390 + }, + { + "epoch": 4.959100204498977, + "grad_norm": 0.5851395130157471, + "learning_rate": 2.06366362647481e-09, + "loss": 0.0002, + "step": 19400 + }, + { + "epoch": 4.961656441717792, + "grad_norm": 0.0056032175198197365, + "learning_rate": 1.8152842859320286e-09, + "loss": 0.0001, + "step": 19410 + }, + { + "epoch": 4.9642126789366054, + "grad_norm": 0.011081293225288391, + "learning_rate": 1.5828229714892619e-09, + "loss": 0.0001, + "step": 19420 + }, + { + "epoch": 4.966768916155419, + "grad_norm": 0.00719296932220459, + "learning_rate": 1.366280423480415e-09, + "loss": 0.0012, + "step": 19430 + }, + { + "epoch": 4.969325153374233, + "grad_norm": 0.0061414423398673534, + "learning_rate": 1.1656573315421693e-09, + "loss": 0.0002, + "step": 19440 + }, + { + "epoch": 4.9718813905930475, + "grad_norm": 0.006891654338687658, + "learning_rate": 9.80954334611206e-10, + "loss": 0.0003, + "step": 19450 + }, + { + "epoch": 4.974437627811861, + "grad_norm": 0.35890260338783264, + "learning_rate": 8.121720209219864e-10, + "loss": 0.0025, + "step": 19460 + }, + { + "epoch": 4.976993865030675, + "grad_norm": 0.010992944240570068, + "learning_rate": 6.59310928006196e-10, + "loss": 0.0001, + "step": 19470 + }, + { + "epoch": 4.979550102249489, + "grad_norm": 0.010081345215439796, + "learning_rate": 5.2237154268997e-10, + "loss": 0.0003, + "step": 19480 + }, + { + "epoch": 4.982106339468302, + "grad_norm": 0.027087528258562088, + "learning_rate": 4.013543010927823e-10, + "loss": 0.0005, + "step": 19490 + }, + { + "epoch": 4.984662576687117, + "grad_norm": 0.005823803599923849, + "learning_rate": 2.962595886257802e-10, + "loss": 0.0006, + "step": 19500 + }, + { + "epoch": 4.987218813905931, + "grad_norm": 0.10786204785108566, + "learning_rate": 2.0708773999011945e-10, + "loss": 0.0003, + "step": 19510 + }, + { + "epoch": 4.989775051124744, + "grad_norm": 0.31824302673339844, + "learning_rate": 1.3383903917696394e-10, + "loss": 0.0009, + "step": 19520 + }, + { + "epoch": 4.992331288343558, + "grad_norm": 0.01443443726748228, + "learning_rate": 7.651371946637565e-11, + "loss": 0.0014, + "step": 19530 + }, + { + "epoch": 4.994887525562373, + "grad_norm": 0.014061720110476017, + "learning_rate": 3.511196342509404e-11, + "loss": 0.0015, + "step": 19540 + }, + { + "epoch": 4.997443762781186, + "grad_norm": 0.10343813896179199, + "learning_rate": 9.633902908201542e-12, + "loss": 0.0003, + "step": 19550 + }, + { + "epoch": 5.0, + "grad_norm": 0.009711273945868015, + "learning_rate": 7.961905745812459e-14, + "loss": 0.0002, + "step": 19560 + }, + { + "epoch": 5.0, + "step": 19560, + "total_flos": 1.975874722701312e+17, + "train_loss": 0.2911138574734241, + "train_runtime": 11035.9124, + "train_samples_per_second": 1.772, + "train_steps_per_second": 1.772 + } + ], + "logging_steps": 10, + "max_steps": 19560, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 1000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.975874722701312e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}