diff --git "a/ESICA/trainer_state.json" "b/ESICA/trainer_state.json" new file mode 100644--- /dev/null +++ "b/ESICA/trainer_state.json" @@ -0,0 +1,6805 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 5.0, + "eval_steps": 500, + "global_step": 7730, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00517464424320828, + "grad_norm": 2.9354467391967773, + "learning_rate": 3.0172413793103453e-07, + "loss": 0.4804, + "step": 8 + }, + { + "epoch": 0.01034928848641656, + "grad_norm": 2.4340202808380127, + "learning_rate": 6.465517241379311e-07, + "loss": 0.4851, + "step": 16 + }, + { + "epoch": 0.015523932729624839, + "grad_norm": 2.8747549057006836, + "learning_rate": 9.913793103448276e-07, + "loss": 0.4678, + "step": 24 + }, + { + "epoch": 0.02069857697283312, + "grad_norm": 3.4659008979797363, + "learning_rate": 1.336206896551724e-06, + "loss": 0.4884, + "step": 32 + }, + { + "epoch": 0.0258732212160414, + "grad_norm": 2.2758100032806396, + "learning_rate": 1.681034482758621e-06, + "loss": 0.4822, + "step": 40 + }, + { + "epoch": 0.031047865459249677, + "grad_norm": 1.6406975984573364, + "learning_rate": 2.025862068965517e-06, + "loss": 0.4722, + "step": 48 + }, + { + "epoch": 0.03622250970245795, + "grad_norm": 3.5947091579437256, + "learning_rate": 2.370689655172414e-06, + "loss": 0.4996, + "step": 56 + }, + { + "epoch": 0.04139715394566624, + "grad_norm": 2.5726499557495117, + "learning_rate": 2.7155172413793105e-06, + "loss": 0.5066, + "step": 64 + }, + { + "epoch": 0.04657179818887452, + "grad_norm": 3.21673583984375, + "learning_rate": 3.0603448275862068e-06, + "loss": 0.4978, + "step": 72 + }, + { + "epoch": 0.0517464424320828, + "grad_norm": 2.6083199977874756, + "learning_rate": 3.4051724137931034e-06, + "loss": 0.473, + "step": 80 + }, + { + "epoch": 0.056921086675291076, + "grad_norm": 12.260250091552734, + "learning_rate": 3.7500000000000005e-06, + "loss": 0.4753, + "step": 88 + }, + { + "epoch": 0.062095730918499355, + "grad_norm": 5.023367881774902, + "learning_rate": 4.094827586206897e-06, + "loss": 0.4865, + "step": 96 + }, + { + "epoch": 0.06727037516170763, + "grad_norm": 1.0768873691558838, + "learning_rate": 4.439655172413794e-06, + "loss": 0.4705, + "step": 104 + }, + { + "epoch": 0.0724450194049159, + "grad_norm": 2.1622188091278076, + "learning_rate": 4.78448275862069e-06, + "loss": 0.4595, + "step": 112 + }, + { + "epoch": 0.07761966364812418, + "grad_norm": 1.117838740348816, + "learning_rate": 5.129310344827587e-06, + "loss": 0.4365, + "step": 120 + }, + { + "epoch": 0.08279430789133248, + "grad_norm": 7.942878246307373, + "learning_rate": 5.474137931034483e-06, + "loss": 0.4567, + "step": 128 + }, + { + "epoch": 0.08796895213454076, + "grad_norm": 2.440915822982788, + "learning_rate": 5.81896551724138e-06, + "loss": 0.4358, + "step": 136 + }, + { + "epoch": 0.09314359637774904, + "grad_norm": 4.037357330322266, + "learning_rate": 6.163793103448276e-06, + "loss": 0.4732, + "step": 144 + }, + { + "epoch": 0.09831824062095731, + "grad_norm": 7.871211528778076, + "learning_rate": 6.508620689655173e-06, + "loss": 0.4649, + "step": 152 + }, + { + "epoch": 0.1034928848641656, + "grad_norm": 3.0454769134521484, + "learning_rate": 6.853448275862069e-06, + "loss": 0.454, + "step": 160 + }, + { + "epoch": 0.10866752910737387, + "grad_norm": 2.9666481018066406, + "learning_rate": 7.198275862068966e-06, + "loss": 0.4459, + "step": 168 + }, + { + "epoch": 0.11384217335058215, + "grad_norm": 1.0525585412979126, + "learning_rate": 7.543103448275862e-06, + "loss": 0.4661, + "step": 176 + }, + { + "epoch": 0.11901681759379043, + "grad_norm": 2.428351879119873, + "learning_rate": 7.88793103448276e-06, + "loss": 0.4544, + "step": 184 + }, + { + "epoch": 0.12419146183699871, + "grad_norm": 2.5685620307922363, + "learning_rate": 8.232758620689656e-06, + "loss": 0.4625, + "step": 192 + }, + { + "epoch": 0.129366106080207, + "grad_norm": 2.816399574279785, + "learning_rate": 8.577586206896551e-06, + "loss": 0.4499, + "step": 200 + }, + { + "epoch": 0.13454075032341525, + "grad_norm": 6.485348701477051, + "learning_rate": 8.922413793103449e-06, + "loss": 0.4631, + "step": 208 + }, + { + "epoch": 0.13971539456662355, + "grad_norm": 1.7010209560394287, + "learning_rate": 9.267241379310346e-06, + "loss": 0.4509, + "step": 216 + }, + { + "epoch": 0.1448900388098318, + "grad_norm": 2.7307119369506836, + "learning_rate": 9.612068965517242e-06, + "loss": 0.4337, + "step": 224 + }, + { + "epoch": 0.1500646830530401, + "grad_norm": 1.4600287675857544, + "learning_rate": 9.95689655172414e-06, + "loss": 0.4639, + "step": 232 + }, + { + "epoch": 0.15523932729624837, + "grad_norm": 1.5604009628295898, + "learning_rate": 9.999978494742326e-06, + "loss": 0.4425, + "step": 240 + }, + { + "epoch": 0.16041397153945666, + "grad_norm": 1.2792322635650635, + "learning_rate": 9.999901251622079e-06, + "loss": 0.4235, + "step": 248 + }, + { + "epoch": 0.16558861578266496, + "grad_norm": 4.630190372467041, + "learning_rate": 9.999767832624e-06, + "loss": 0.4398, + "step": 256 + }, + { + "epoch": 0.17076326002587322, + "grad_norm": 1.252773404121399, + "learning_rate": 9.999578239247104e-06, + "loss": 0.4511, + "step": 264 + }, + { + "epoch": 0.1759379042690815, + "grad_norm": 6.371535301208496, + "learning_rate": 9.999332473621544e-06, + "loss": 0.4544, + "step": 272 + }, + { + "epoch": 0.18111254851228978, + "grad_norm": 15.354849815368652, + "learning_rate": 9.999030538508598e-06, + "loss": 0.4566, + "step": 280 + }, + { + "epoch": 0.18628719275549807, + "grad_norm": 3.446639060974121, + "learning_rate": 9.99867243730063e-06, + "loss": 0.4573, + "step": 288 + }, + { + "epoch": 0.19146183699870634, + "grad_norm": 2.354877233505249, + "learning_rate": 9.998258174021043e-06, + "loss": 0.4526, + "step": 296 + }, + { + "epoch": 0.19663648124191463, + "grad_norm": 4.430312633514404, + "learning_rate": 9.997787753324253e-06, + "loss": 0.4681, + "step": 304 + }, + { + "epoch": 0.2018111254851229, + "grad_norm": 2.4858336448669434, + "learning_rate": 9.997261180495623e-06, + "loss": 0.4467, + "step": 312 + }, + { + "epoch": 0.2069857697283312, + "grad_norm": 1.4998490810394287, + "learning_rate": 9.996678461451408e-06, + "loss": 0.4495, + "step": 320 + }, + { + "epoch": 0.21216041397153945, + "grad_norm": 2.127995014190674, + "learning_rate": 9.996039602738688e-06, + "loss": 0.439, + "step": 328 + }, + { + "epoch": 0.21733505821474774, + "grad_norm": 1.3331767320632935, + "learning_rate": 9.995344611535295e-06, + "loss": 0.4307, + "step": 336 + }, + { + "epoch": 0.222509702457956, + "grad_norm": 3.9961822032928467, + "learning_rate": 9.994593495649733e-06, + "loss": 0.441, + "step": 344 + }, + { + "epoch": 0.2276843467011643, + "grad_norm": 1.733726143836975, + "learning_rate": 9.993786263521083e-06, + "loss": 0.4522, + "step": 352 + }, + { + "epoch": 0.23285899094437257, + "grad_norm": 1.6327801942825317, + "learning_rate": 9.992922924218924e-06, + "loss": 0.4302, + "step": 360 + }, + { + "epoch": 0.23803363518758086, + "grad_norm": 1.1273424625396729, + "learning_rate": 9.99200348744321e-06, + "loss": 0.4719, + "step": 368 + }, + { + "epoch": 0.24320827943078913, + "grad_norm": 1.8268440961837769, + "learning_rate": 9.991027963524188e-06, + "loss": 0.4492, + "step": 376 + }, + { + "epoch": 0.24838292367399742, + "grad_norm": 1.284851312637329, + "learning_rate": 9.989996363422246e-06, + "loss": 0.4354, + "step": 384 + }, + { + "epoch": 0.2535575679172057, + "grad_norm": 1.4397149085998535, + "learning_rate": 9.988908698727828e-06, + "loss": 0.4405, + "step": 392 + }, + { + "epoch": 0.258732212160414, + "grad_norm": 1.2019370794296265, + "learning_rate": 9.987764981661278e-06, + "loss": 0.4497, + "step": 400 + }, + { + "epoch": 0.26390685640362227, + "grad_norm": 2.0720713138580322, + "learning_rate": 9.986565225072713e-06, + "loss": 0.4473, + "step": 408 + }, + { + "epoch": 0.2690815006468305, + "grad_norm": 1.9331440925598145, + "learning_rate": 9.98530944244187e-06, + "loss": 0.4244, + "step": 416 + }, + { + "epoch": 0.2742561448900388, + "grad_norm": 1.0095131397247314, + "learning_rate": 9.983997647877973e-06, + "loss": 0.4344, + "step": 424 + }, + { + "epoch": 0.2794307891332471, + "grad_norm": 2.3902952671051025, + "learning_rate": 9.98262985611955e-06, + "loss": 0.4318, + "step": 432 + }, + { + "epoch": 0.2846054333764554, + "grad_norm": 4.7941508293151855, + "learning_rate": 9.981206082534287e-06, + "loss": 0.4504, + "step": 440 + }, + { + "epoch": 0.2897800776196636, + "grad_norm": 9.00587272644043, + "learning_rate": 9.979726343118847e-06, + "loss": 0.4541, + "step": 448 + }, + { + "epoch": 0.2949547218628719, + "grad_norm": 2.8924694061279297, + "learning_rate": 9.978190654498687e-06, + "loss": 0.473, + "step": 456 + }, + { + "epoch": 0.3001293661060802, + "grad_norm": 3.2436609268188477, + "learning_rate": 9.976599033927884e-06, + "loss": 0.4578, + "step": 464 + }, + { + "epoch": 0.3053040103492885, + "grad_norm": 1.6975555419921875, + "learning_rate": 9.974951499288925e-06, + "loss": 0.4377, + "step": 472 + }, + { + "epoch": 0.31047865459249674, + "grad_norm": 1.182005524635315, + "learning_rate": 9.973248069092516e-06, + "loss": 0.443, + "step": 480 + }, + { + "epoch": 0.31565329883570503, + "grad_norm": 4.843271732330322, + "learning_rate": 9.971488762477373e-06, + "loss": 0.4347, + "step": 488 + }, + { + "epoch": 0.3208279430789133, + "grad_norm": 1.5591872930526733, + "learning_rate": 9.969673599210006e-06, + "loss": 0.4555, + "step": 496 + }, + { + "epoch": 0.3260025873221216, + "grad_norm": 1.4283549785614014, + "learning_rate": 9.967802599684494e-06, + "loss": 0.4414, + "step": 504 + }, + { + "epoch": 0.3311772315653299, + "grad_norm": 9.921428680419922, + "learning_rate": 9.965875784922261e-06, + "loss": 0.4393, + "step": 512 + }, + { + "epoch": 0.33635187580853815, + "grad_norm": 8.30053424835205, + "learning_rate": 9.963893176571836e-06, + "loss": 0.4407, + "step": 520 + }, + { + "epoch": 0.34152652005174644, + "grad_norm": 3.877694606781006, + "learning_rate": 9.961854796908615e-06, + "loss": 0.4533, + "step": 528 + }, + { + "epoch": 0.34670116429495473, + "grad_norm": 5.783639907836914, + "learning_rate": 9.959760668834601e-06, + "loss": 0.4517, + "step": 536 + }, + { + "epoch": 0.351875808538163, + "grad_norm": 1.8837212324142456, + "learning_rate": 9.957610815878156e-06, + "loss": 0.4505, + "step": 544 + }, + { + "epoch": 0.35705045278137126, + "grad_norm": 3.0668551921844482, + "learning_rate": 9.955405262193731e-06, + "loss": 0.4569, + "step": 552 + }, + { + "epoch": 0.36222509702457956, + "grad_norm": 1.1842219829559326, + "learning_rate": 9.9531440325616e-06, + "loss": 0.4263, + "step": 560 + }, + { + "epoch": 0.36739974126778785, + "grad_norm": 4.542625427246094, + "learning_rate": 9.950827152387575e-06, + "loss": 0.4186, + "step": 568 + }, + { + "epoch": 0.37257438551099614, + "grad_norm": 3.0496370792388916, + "learning_rate": 9.948454647702727e-06, + "loss": 0.4382, + "step": 576 + }, + { + "epoch": 0.3777490297542044, + "grad_norm": 2.0867106914520264, + "learning_rate": 9.94602654516309e-06, + "loss": 0.4496, + "step": 584 + }, + { + "epoch": 0.3829236739974127, + "grad_norm": 2.7117843627929688, + "learning_rate": 9.94354287204936e-06, + "loss": 0.4379, + "step": 592 + }, + { + "epoch": 0.38809831824062097, + "grad_norm": 1.1659321784973145, + "learning_rate": 9.941003656266589e-06, + "loss": 0.4386, + "step": 600 + }, + { + "epoch": 0.39327296248382926, + "grad_norm": 2.4552273750305176, + "learning_rate": 9.93840892634388e-06, + "loss": 0.4521, + "step": 608 + }, + { + "epoch": 0.3984476067270375, + "grad_norm": 1.1509175300598145, + "learning_rate": 9.935758711434052e-06, + "loss": 0.4509, + "step": 616 + }, + { + "epoch": 0.4036222509702458, + "grad_norm": 10.790364265441895, + "learning_rate": 9.933053041313325e-06, + "loss": 0.4647, + "step": 624 + }, + { + "epoch": 0.4087968952134541, + "grad_norm": 4.1285576820373535, + "learning_rate": 9.930291946380977e-06, + "loss": 0.4545, + "step": 632 + }, + { + "epoch": 0.4139715394566624, + "grad_norm": 3.7522051334381104, + "learning_rate": 9.927475457659007e-06, + "loss": 0.4469, + "step": 640 + }, + { + "epoch": 0.4191461836998706, + "grad_norm": 5.767664432525635, + "learning_rate": 9.924603606791786e-06, + "loss": 0.4441, + "step": 648 + }, + { + "epoch": 0.4243208279430789, + "grad_norm": 2.0059242248535156, + "learning_rate": 9.921676426045698e-06, + "loss": 0.4409, + "step": 656 + }, + { + "epoch": 0.4294954721862872, + "grad_norm": 2.6766843795776367, + "learning_rate": 9.918693948308783e-06, + "loss": 0.4381, + "step": 664 + }, + { + "epoch": 0.4346701164294955, + "grad_norm": 12.32884407043457, + "learning_rate": 9.915656207090367e-06, + "loss": 0.4454, + "step": 672 + }, + { + "epoch": 0.4398447606727037, + "grad_norm": 1.6368271112442017, + "learning_rate": 9.912563236520675e-06, + "loss": 0.4433, + "step": 680 + }, + { + "epoch": 0.445019404915912, + "grad_norm": 4.221009731292725, + "learning_rate": 9.909415071350464e-06, + "loss": 0.4281, + "step": 688 + }, + { + "epoch": 0.4501940491591203, + "grad_norm": 3.8563995361328125, + "learning_rate": 9.90621174695062e-06, + "loss": 0.3926, + "step": 696 + }, + { + "epoch": 0.4553686934023286, + "grad_norm": 53.29820251464844, + "learning_rate": 9.902953299311763e-06, + "loss": 0.4306, + "step": 704 + }, + { + "epoch": 0.46054333764553684, + "grad_norm": 13.51191234588623, + "learning_rate": 9.899639765043854e-06, + "loss": 0.4272, + "step": 712 + }, + { + "epoch": 0.46571798188874514, + "grad_norm": 6.6518659591674805, + "learning_rate": 9.89627118137576e-06, + "loss": 0.3784, + "step": 720 + }, + { + "epoch": 0.47089262613195343, + "grad_norm": 20.379615783691406, + "learning_rate": 9.892847586154863e-06, + "loss": 0.3522, + "step": 728 + }, + { + "epoch": 0.4760672703751617, + "grad_norm": 4.289416790008545, + "learning_rate": 9.889369017846616e-06, + "loss": 0.322, + "step": 736 + }, + { + "epoch": 0.48124191461837, + "grad_norm": 3.6690123081207275, + "learning_rate": 9.88583551553411e-06, + "loss": 0.3021, + "step": 744 + }, + { + "epoch": 0.48641655886157825, + "grad_norm": 14.723472595214844, + "learning_rate": 9.882247118917656e-06, + "loss": 0.3019, + "step": 752 + }, + { + "epoch": 0.49159120310478654, + "grad_norm": 8.286908149719238, + "learning_rate": 9.87860386831431e-06, + "loss": 0.2976, + "step": 760 + }, + { + "epoch": 0.49676584734799484, + "grad_norm": 8.500956535339355, + "learning_rate": 9.874905804657445e-06, + "loss": 0.2595, + "step": 768 + }, + { + "epoch": 0.5019404915912031, + "grad_norm": 5.236771583557129, + "learning_rate": 9.871152969496274e-06, + "loss": 0.2585, + "step": 776 + }, + { + "epoch": 0.5071151358344114, + "grad_norm": 10.104601860046387, + "learning_rate": 9.867345404995393e-06, + "loss": 0.2601, + "step": 784 + }, + { + "epoch": 0.5122897800776197, + "grad_norm": 3.540522813796997, + "learning_rate": 9.8634831539343e-06, + "loss": 0.2653, + "step": 792 + }, + { + "epoch": 0.517464424320828, + "grad_norm": 4.285573482513428, + "learning_rate": 9.85956625970692e-06, + "loss": 0.2589, + "step": 800 + }, + { + "epoch": 0.5226390685640362, + "grad_norm": 16.070560455322266, + "learning_rate": 9.855594766321122e-06, + "loss": 0.2469, + "step": 808 + }, + { + "epoch": 0.5278137128072445, + "grad_norm": 7.80849552154541, + "learning_rate": 9.85156871839821e-06, + "loss": 0.2395, + "step": 816 + }, + { + "epoch": 0.5329883570504528, + "grad_norm": 3.7642133235931396, + "learning_rate": 9.847488161172429e-06, + "loss": 0.2636, + "step": 824 + }, + { + "epoch": 0.538163001293661, + "grad_norm": 21.90070152282715, + "learning_rate": 9.843353140490466e-06, + "loss": 0.2591, + "step": 832 + }, + { + "epoch": 0.5433376455368694, + "grad_norm": 16.849538803100586, + "learning_rate": 9.839163702810922e-06, + "loss": 0.2453, + "step": 840 + }, + { + "epoch": 0.5485122897800776, + "grad_norm": 8.792280197143555, + "learning_rate": 9.834919895203789e-06, + "loss": 0.2567, + "step": 848 + }, + { + "epoch": 0.553686934023286, + "grad_norm": 10.0724458694458, + "learning_rate": 9.83062176534994e-06, + "loss": 0.2452, + "step": 856 + }, + { + "epoch": 0.5588615782664942, + "grad_norm": 6.677402019500732, + "learning_rate": 9.826269361540565e-06, + "loss": 0.2541, + "step": 864 + }, + { + "epoch": 0.5640362225097024, + "grad_norm": 3.9658732414245605, + "learning_rate": 9.821862732676655e-06, + "loss": 0.2533, + "step": 872 + }, + { + "epoch": 0.5692108667529108, + "grad_norm": 30.570205688476562, + "learning_rate": 9.817401928268435e-06, + "loss": 0.2255, + "step": 880 + }, + { + "epoch": 0.574385510996119, + "grad_norm": 15.49843978881836, + "learning_rate": 9.812886998434817e-06, + "loss": 0.2407, + "step": 888 + }, + { + "epoch": 0.5795601552393272, + "grad_norm": 6.582603454589844, + "learning_rate": 9.80831799390283e-06, + "loss": 0.2614, + "step": 896 + }, + { + "epoch": 0.5847347994825356, + "grad_norm": 3.3082640171051025, + "learning_rate": 9.803694966007059e-06, + "loss": 0.236, + "step": 904 + }, + { + "epoch": 0.5899094437257438, + "grad_norm": 88.54562377929688, + "learning_rate": 9.799017966689057e-06, + "loss": 0.2349, + "step": 912 + }, + { + "epoch": 0.5950840879689522, + "grad_norm": 3.6538026332855225, + "learning_rate": 9.794287048496771e-06, + "loss": 0.2495, + "step": 920 + }, + { + "epoch": 0.6002587322121604, + "grad_norm": 6.745138168334961, + "learning_rate": 9.789502264583949e-06, + "loss": 0.2718, + "step": 928 + }, + { + "epoch": 0.6054333764553687, + "grad_norm": 14.57174015045166, + "learning_rate": 9.784663668709537e-06, + "loss": 0.2563, + "step": 936 + }, + { + "epoch": 0.610608020698577, + "grad_norm": 5.400656223297119, + "learning_rate": 9.779771315237086e-06, + "loss": 0.2311, + "step": 944 + }, + { + "epoch": 0.6157826649417852, + "grad_norm": 9.440869331359863, + "learning_rate": 9.77482525913413e-06, + "loss": 0.246, + "step": 952 + }, + { + "epoch": 0.6209573091849935, + "grad_norm": 6.850522994995117, + "learning_rate": 9.769825555971575e-06, + "loss": 0.2565, + "step": 960 + }, + { + "epoch": 0.6261319534282018, + "grad_norm": 10.268706321716309, + "learning_rate": 9.764772261923074e-06, + "loss": 0.2559, + "step": 968 + }, + { + "epoch": 0.6313065976714101, + "grad_norm": 10.18468189239502, + "learning_rate": 9.759665433764393e-06, + "loss": 0.244, + "step": 976 + }, + { + "epoch": 0.6364812419146184, + "grad_norm": 11.332158088684082, + "learning_rate": 9.754505128872778e-06, + "loss": 0.2345, + "step": 984 + }, + { + "epoch": 0.6416558861578266, + "grad_norm": 7.11050271987915, + "learning_rate": 9.749291405226304e-06, + "loss": 0.2377, + "step": 992 + }, + { + "epoch": 0.6468305304010349, + "grad_norm": 5.708597660064697, + "learning_rate": 9.744024321403229e-06, + "loss": 0.2182, + "step": 1000 + }, + { + "epoch": 0.6520051746442432, + "grad_norm": 9.726438522338867, + "learning_rate": 9.738703936581333e-06, + "loss": 0.2481, + "step": 1008 + }, + { + "epoch": 0.6571798188874515, + "grad_norm": 26.72545051574707, + "learning_rate": 9.733330310537255e-06, + "loss": 0.2464, + "step": 1016 + }, + { + "epoch": 0.6623544631306598, + "grad_norm": 9.903454780578613, + "learning_rate": 9.727903503645818e-06, + "loss": 0.251, + "step": 1024 + }, + { + "epoch": 0.6675291073738681, + "grad_norm": 1.9352082014083862, + "learning_rate": 9.722423576879354e-06, + "loss": 0.257, + "step": 1032 + }, + { + "epoch": 0.6727037516170763, + "grad_norm": 2.8848230838775635, + "learning_rate": 9.71689059180702e-06, + "loss": 0.2522, + "step": 1040 + }, + { + "epoch": 0.6778783958602846, + "grad_norm": 1.9029700756072998, + "learning_rate": 9.711304610594104e-06, + "loss": 0.2436, + "step": 1048 + }, + { + "epoch": 0.6830530401034929, + "grad_norm": 2.7303895950317383, + "learning_rate": 9.70566569600132e-06, + "loss": 0.2328, + "step": 1056 + }, + { + "epoch": 0.6882276843467011, + "grad_norm": 3.8664939403533936, + "learning_rate": 9.699973911384119e-06, + "loss": 0.2416, + "step": 1064 + }, + { + "epoch": 0.6934023285899095, + "grad_norm": 803.8958129882812, + "learning_rate": 9.694229320691961e-06, + "loss": 0.2433, + "step": 1072 + }, + { + "epoch": 0.6985769728331177, + "grad_norm": 6.5076584815979, + "learning_rate": 9.688431988467609e-06, + "loss": 0.2594, + "step": 1080 + }, + { + "epoch": 0.703751617076326, + "grad_norm": 7.428098201751709, + "learning_rate": 9.682581979846388e-06, + "loss": 0.2655, + "step": 1088 + }, + { + "epoch": 0.7089262613195343, + "grad_norm": 6.998285293579102, + "learning_rate": 9.676679360555479e-06, + "loss": 0.2484, + "step": 1096 + }, + { + "epoch": 0.7141009055627425, + "grad_norm": 2.3249993324279785, + "learning_rate": 9.670724196913149e-06, + "loss": 0.2339, + "step": 1104 + }, + { + "epoch": 0.7192755498059509, + "grad_norm": 4.36381721496582, + "learning_rate": 9.66471655582803e-06, + "loss": 0.2566, + "step": 1112 + }, + { + "epoch": 0.7244501940491591, + "grad_norm": 2.4100117683410645, + "learning_rate": 9.658656504798361e-06, + "loss": 0.2687, + "step": 1120 + }, + { + "epoch": 0.7296248382923674, + "grad_norm": 28.909257888793945, + "learning_rate": 9.652544111911218e-06, + "loss": 0.2624, + "step": 1128 + }, + { + "epoch": 0.7347994825355757, + "grad_norm": 4.873836517333984, + "learning_rate": 9.646379445841769e-06, + "loss": 0.2547, + "step": 1136 + }, + { + "epoch": 0.7399741267787839, + "grad_norm": 2.3288071155548096, + "learning_rate": 9.640162575852487e-06, + "loss": 0.2409, + "step": 1144 + }, + { + "epoch": 0.7451487710219923, + "grad_norm": 39.58846664428711, + "learning_rate": 9.633893571792375e-06, + "loss": 0.2365, + "step": 1152 + }, + { + "epoch": 0.7503234152652005, + "grad_norm": 20.86064338684082, + "learning_rate": 9.627572504096188e-06, + "loss": 0.2441, + "step": 1160 + }, + { + "epoch": 0.7554980595084088, + "grad_norm": 17.1488094329834, + "learning_rate": 9.621199443783633e-06, + "loss": 0.2325, + "step": 1168 + }, + { + "epoch": 0.7606727037516171, + "grad_norm": 1.877516269683838, + "learning_rate": 9.614774462458573e-06, + "loss": 0.2309, + "step": 1176 + }, + { + "epoch": 0.7658473479948253, + "grad_norm": 4.672080039978027, + "learning_rate": 9.608297632308233e-06, + "loss": 0.2261, + "step": 1184 + }, + { + "epoch": 0.7710219922380336, + "grad_norm": 7.2915940284729, + "learning_rate": 9.601769026102368e-06, + "loss": 0.2492, + "step": 1192 + }, + { + "epoch": 0.7761966364812419, + "grad_norm": 3.54653263092041, + "learning_rate": 9.595188717192466e-06, + "loss": 0.262, + "step": 1200 + }, + { + "epoch": 0.7813712807244502, + "grad_norm": 2.2927441596984863, + "learning_rate": 9.58855677951091e-06, + "loss": 0.232, + "step": 1208 + }, + { + "epoch": 0.7865459249676585, + "grad_norm": 128.51602172851562, + "learning_rate": 9.581873287570164e-06, + "loss": 0.2534, + "step": 1216 + }, + { + "epoch": 0.7917205692108668, + "grad_norm": 16.090349197387695, + "learning_rate": 9.575138316461909e-06, + "loss": 0.2525, + "step": 1224 + }, + { + "epoch": 0.796895213454075, + "grad_norm": 24.87708282470703, + "learning_rate": 9.568351941856223e-06, + "loss": 0.2509, + "step": 1232 + }, + { + "epoch": 0.8020698576972833, + "grad_norm": 5.203312397003174, + "learning_rate": 9.561514240000724e-06, + "loss": 0.2411, + "step": 1240 + }, + { + "epoch": 0.8072445019404916, + "grad_norm": 1.168088436126709, + "learning_rate": 9.554625287719711e-06, + "loss": 0.2389, + "step": 1248 + }, + { + "epoch": 0.8124191461836999, + "grad_norm": 5.142014980316162, + "learning_rate": 9.547685162413298e-06, + "loss": 0.2297, + "step": 1256 + }, + { + "epoch": 0.8175937904269082, + "grad_norm": 44.04859161376953, + "learning_rate": 9.540693942056553e-06, + "loss": 0.2433, + "step": 1264 + }, + { + "epoch": 0.8227684346701164, + "grad_norm": 4.631281852722168, + "learning_rate": 9.533651705198616e-06, + "loss": 0.2493, + "step": 1272 + }, + { + "epoch": 0.8279430789133247, + "grad_norm": 53.435794830322266, + "learning_rate": 9.526558530961817e-06, + "loss": 0.2367, + "step": 1280 + }, + { + "epoch": 0.833117723156533, + "grad_norm": 1.1324574947357178, + "learning_rate": 9.519414499040785e-06, + "loss": 0.25, + "step": 1288 + }, + { + "epoch": 0.8382923673997412, + "grad_norm": 7.66749382019043, + "learning_rate": 9.51221968970156e-06, + "loss": 0.2287, + "step": 1296 + }, + { + "epoch": 0.8434670116429496, + "grad_norm": 4.866576194763184, + "learning_rate": 9.504974183780686e-06, + "loss": 0.2452, + "step": 1304 + }, + { + "epoch": 0.8486416558861578, + "grad_norm": 14.431381225585938, + "learning_rate": 9.497678062684301e-06, + "loss": 0.2477, + "step": 1312 + }, + { + "epoch": 0.8538163001293662, + "grad_norm": 2.9705538749694824, + "learning_rate": 9.490331408387225e-06, + "loss": 0.2316, + "step": 1320 + }, + { + "epoch": 0.8589909443725744, + "grad_norm": 5.7500762939453125, + "learning_rate": 9.482934303432038e-06, + "loss": 0.2312, + "step": 1328 + }, + { + "epoch": 0.8641655886157826, + "grad_norm": 4.217560291290283, + "learning_rate": 9.475486830928155e-06, + "loss": 0.2505, + "step": 1336 + }, + { + "epoch": 0.869340232858991, + "grad_norm": 6.206101417541504, + "learning_rate": 9.467989074550891e-06, + "loss": 0.2518, + "step": 1344 + }, + { + "epoch": 0.8745148771021992, + "grad_norm": 11.412181854248047, + "learning_rate": 9.46044111854052e-06, + "loss": 0.2378, + "step": 1352 + }, + { + "epoch": 0.8796895213454075, + "grad_norm": 1.595080852508545, + "learning_rate": 9.452843047701324e-06, + "loss": 0.2467, + "step": 1360 + }, + { + "epoch": 0.8848641655886158, + "grad_norm": 17.12870979309082, + "learning_rate": 9.44519494740065e-06, + "loss": 0.2489, + "step": 1368 + }, + { + "epoch": 0.890038809831824, + "grad_norm": 5.790205478668213, + "learning_rate": 9.437496903567946e-06, + "loss": 0.2359, + "step": 1376 + }, + { + "epoch": 0.8952134540750324, + "grad_norm": 44.45295333862305, + "learning_rate": 9.429749002693793e-06, + "loss": 0.2584, + "step": 1384 + }, + { + "epoch": 0.9003880983182406, + "grad_norm": 2.3127973079681396, + "learning_rate": 9.421951331828938e-06, + "loss": 0.2376, + "step": 1392 + }, + { + "epoch": 0.9055627425614489, + "grad_norm": 8.570551872253418, + "learning_rate": 9.414103978583312e-06, + "loss": 0.2288, + "step": 1400 + }, + { + "epoch": 0.9107373868046572, + "grad_norm": 5.7028045654296875, + "learning_rate": 9.406207031125048e-06, + "loss": 0.2503, + "step": 1408 + }, + { + "epoch": 0.9159120310478654, + "grad_norm": 16.7283878326416, + "learning_rate": 9.398260578179487e-06, + "loss": 0.2321, + "step": 1416 + }, + { + "epoch": 0.9210866752910737, + "grad_norm": 16.976455688476562, + "learning_rate": 9.390264709028189e-06, + "loss": 0.2387, + "step": 1424 + }, + { + "epoch": 0.926261319534282, + "grad_norm": 3.8708393573760986, + "learning_rate": 9.382219513507922e-06, + "loss": 0.2416, + "step": 1432 + }, + { + "epoch": 0.9314359637774903, + "grad_norm": 13.31892204284668, + "learning_rate": 9.374125082009654e-06, + "loss": 0.231, + "step": 1440 + }, + { + "epoch": 0.9366106080206986, + "grad_norm": 7.283811569213867, + "learning_rate": 9.365981505477541e-06, + "loss": 0.2375, + "step": 1448 + }, + { + "epoch": 0.9417852522639069, + "grad_norm": 3.824449062347412, + "learning_rate": 9.3577888754079e-06, + "loss": 0.2402, + "step": 1456 + }, + { + "epoch": 0.9469598965071151, + "grad_norm": 1.6422468423843384, + "learning_rate": 9.34954728384819e-06, + "loss": 0.2372, + "step": 1464 + }, + { + "epoch": 0.9521345407503234, + "grad_norm": 9.619569778442383, + "learning_rate": 9.341256823395965e-06, + "loss": 0.2472, + "step": 1472 + }, + { + "epoch": 0.9573091849935317, + "grad_norm": 2.6928234100341797, + "learning_rate": 9.332917587197844e-06, + "loss": 0.2175, + "step": 1480 + }, + { + "epoch": 0.96248382923674, + "grad_norm": 6.168208599090576, + "learning_rate": 9.324529668948459e-06, + "loss": 0.2186, + "step": 1488 + }, + { + "epoch": 0.9676584734799483, + "grad_norm": 8.888847351074219, + "learning_rate": 9.316093162889407e-06, + "loss": 0.2247, + "step": 1496 + }, + { + "epoch": 0.9728331177231565, + "grad_norm": 1.2777928113937378, + "learning_rate": 9.307608163808189e-06, + "loss": 0.2553, + "step": 1504 + }, + { + "epoch": 0.9780077619663649, + "grad_norm": 6.637683868408203, + "learning_rate": 9.299074767037137e-06, + "loss": 0.2408, + "step": 1512 + }, + { + "epoch": 0.9831824062095731, + "grad_norm": 3.551591396331787, + "learning_rate": 9.290493068452357e-06, + "loss": 0.2514, + "step": 1520 + }, + { + "epoch": 0.9883570504527813, + "grad_norm": 13.958417892456055, + "learning_rate": 9.281863164472647e-06, + "loss": 0.2473, + "step": 1528 + }, + { + "epoch": 0.9935316946959897, + "grad_norm": 4.702348709106445, + "learning_rate": 9.273185152058406e-06, + "loss": 0.2342, + "step": 1536 + }, + { + "epoch": 0.9987063389391979, + "grad_norm": 3.8143277168273926, + "learning_rate": 9.26445912871055e-06, + "loss": 0.2404, + "step": 1544 + }, + { + "epoch": 1.0038809831824063, + "grad_norm": 1.0366370677947998, + "learning_rate": 9.255685192469424e-06, + "loss": 0.234, + "step": 1552 + }, + { + "epoch": 1.0090556274256144, + "grad_norm": 7.876848220825195, + "learning_rate": 9.246863441913685e-06, + "loss": 0.2419, + "step": 1560 + }, + { + "epoch": 1.0142302716688227, + "grad_norm": 2.356088161468506, + "learning_rate": 9.237993976159211e-06, + "loss": 0.2445, + "step": 1568 + }, + { + "epoch": 1.019404915912031, + "grad_norm": 3.279806137084961, + "learning_rate": 9.229076894857973e-06, + "loss": 0.234, + "step": 1576 + }, + { + "epoch": 1.0245795601552394, + "grad_norm": 13.705038070678711, + "learning_rate": 9.220112298196922e-06, + "loss": 0.2418, + "step": 1584 + }, + { + "epoch": 1.0297542043984476, + "grad_norm": 1.1529611349105835, + "learning_rate": 9.211100286896865e-06, + "loss": 0.2339, + "step": 1592 + }, + { + "epoch": 1.034928848641656, + "grad_norm": 4.895956516265869, + "learning_rate": 9.202040962211334e-06, + "loss": 0.2465, + "step": 1600 + }, + { + "epoch": 1.0401034928848643, + "grad_norm": 1.908892035484314, + "learning_rate": 9.19293442592544e-06, + "loss": 0.2291, + "step": 1608 + }, + { + "epoch": 1.0452781371280724, + "grad_norm": 3.374854803085327, + "learning_rate": 9.183780780354736e-06, + "loss": 0.2211, + "step": 1616 + }, + { + "epoch": 1.0504527813712807, + "grad_norm": 5.483238697052002, + "learning_rate": 9.174580128344073e-06, + "loss": 0.2289, + "step": 1624 + }, + { + "epoch": 1.055627425614489, + "grad_norm": 34.095176696777344, + "learning_rate": 9.16533257326643e-06, + "loss": 0.2178, + "step": 1632 + }, + { + "epoch": 1.0608020698576972, + "grad_norm": 29.359207153320312, + "learning_rate": 9.156038219021764e-06, + "loss": 0.2288, + "step": 1640 + }, + { + "epoch": 1.0659767141009056, + "grad_norm": 31.014923095703125, + "learning_rate": 9.146697170035839e-06, + "loss": 0.2373, + "step": 1648 + }, + { + "epoch": 1.071151358344114, + "grad_norm": 6.621888160705566, + "learning_rate": 9.137309531259054e-06, + "loss": 0.2237, + "step": 1656 + }, + { + "epoch": 1.076326002587322, + "grad_norm": 36.402793884277344, + "learning_rate": 9.127875408165261e-06, + "loss": 0.2329, + "step": 1664 + }, + { + "epoch": 1.0815006468305304, + "grad_norm": 3.9474239349365234, + "learning_rate": 9.118394906750585e-06, + "loss": 0.2252, + "step": 1672 + }, + { + "epoch": 1.0866752910737387, + "grad_norm": 4.111749649047852, + "learning_rate": 9.108868133532224e-06, + "loss": 0.2385, + "step": 1680 + }, + { + "epoch": 1.0918499353169469, + "grad_norm": 15.675889015197754, + "learning_rate": 9.099295195547264e-06, + "loss": 0.2455, + "step": 1688 + }, + { + "epoch": 1.0970245795601552, + "grad_norm": 2.580955982208252, + "learning_rate": 9.089676200351467e-06, + "loss": 0.2488, + "step": 1696 + }, + { + "epoch": 1.1021992238033635, + "grad_norm": 6.403937816619873, + "learning_rate": 9.08001125601807e-06, + "loss": 0.2286, + "step": 1704 + }, + { + "epoch": 1.107373868046572, + "grad_norm": 15.213994979858398, + "learning_rate": 9.07030047113656e-06, + "loss": 0.2188, + "step": 1712 + }, + { + "epoch": 1.11254851228978, + "grad_norm": 25.329008102416992, + "learning_rate": 9.060543954811464e-06, + "loss": 0.225, + "step": 1720 + }, + { + "epoch": 1.1177231565329884, + "grad_norm": 1.8411587476730347, + "learning_rate": 9.050741816661128e-06, + "loss": 0.2456, + "step": 1728 + }, + { + "epoch": 1.1228978007761967, + "grad_norm": 1.8786582946777344, + "learning_rate": 9.040894166816461e-06, + "loss": 0.2286, + "step": 1736 + }, + { + "epoch": 1.1280724450194048, + "grad_norm": 4.067444801330566, + "learning_rate": 9.031001115919732e-06, + "loss": 0.2522, + "step": 1744 + }, + { + "epoch": 1.1332470892626132, + "grad_norm": 3.8344902992248535, + "learning_rate": 9.02106277512329e-06, + "loss": 0.237, + "step": 1752 + }, + { + "epoch": 1.1384217335058215, + "grad_norm": 3.862894058227539, + "learning_rate": 9.011079256088355e-06, + "loss": 0.2522, + "step": 1760 + }, + { + "epoch": 1.1435963777490297, + "grad_norm": 116.52717590332031, + "learning_rate": 9.001050670983721e-06, + "loss": 0.2311, + "step": 1768 + }, + { + "epoch": 1.148771021992238, + "grad_norm": 2.6656832695007324, + "learning_rate": 8.990977132484535e-06, + "loss": 0.2315, + "step": 1776 + }, + { + "epoch": 1.1539456662354464, + "grad_norm": 1.0956850051879883, + "learning_rate": 8.980858753771002e-06, + "loss": 0.2296, + "step": 1784 + }, + { + "epoch": 1.1591203104786545, + "grad_norm": 1.0202746391296387, + "learning_rate": 8.970695648527132e-06, + "loss": 0.2239, + "step": 1792 + }, + { + "epoch": 1.1642949547218628, + "grad_norm": 2.6931161880493164, + "learning_rate": 8.96048793093945e-06, + "loss": 0.2408, + "step": 1800 + }, + { + "epoch": 1.1694695989650712, + "grad_norm": 5.132647514343262, + "learning_rate": 8.950235715695717e-06, + "loss": 0.2194, + "step": 1808 + }, + { + "epoch": 1.1746442432082795, + "grad_norm": 2.618748664855957, + "learning_rate": 8.93993911798365e-06, + "loss": 0.2515, + "step": 1816 + }, + { + "epoch": 1.1798188874514877, + "grad_norm": 14.242859840393066, + "learning_rate": 8.929598253489617e-06, + "loss": 0.249, + "step": 1824 + }, + { + "epoch": 1.184993531694696, + "grad_norm": 9.176653861999512, + "learning_rate": 8.91921323839734e-06, + "loss": 0.2334, + "step": 1832 + }, + { + "epoch": 1.1901681759379044, + "grad_norm": 8.211836814880371, + "learning_rate": 8.908784189386589e-06, + "loss": 0.2558, + "step": 1840 + }, + { + "epoch": 1.1953428201811125, + "grad_norm": 2.718625783920288, + "learning_rate": 8.898311223631876e-06, + "loss": 0.2168, + "step": 1848 + }, + { + "epoch": 1.2005174644243208, + "grad_norm": 7.304298400878906, + "learning_rate": 8.887794458801137e-06, + "loss": 0.2232, + "step": 1856 + }, + { + "epoch": 1.2056921086675292, + "grad_norm": 3.7061972618103027, + "learning_rate": 8.8772340130544e-06, + "loss": 0.2266, + "step": 1864 + }, + { + "epoch": 1.2108667529107373, + "grad_norm": 12.395085334777832, + "learning_rate": 8.866630005042476e-06, + "loss": 0.2383, + "step": 1872 + }, + { + "epoch": 1.2160413971539457, + "grad_norm": 3.222179889678955, + "learning_rate": 8.855982553905604e-06, + "loss": 0.2394, + "step": 1880 + }, + { + "epoch": 1.221216041397154, + "grad_norm": 2.2755377292633057, + "learning_rate": 8.845291779272131e-06, + "loss": 0.2308, + "step": 1888 + }, + { + "epoch": 1.2263906856403621, + "grad_norm": 29.07979393005371, + "learning_rate": 8.834557801257162e-06, + "loss": 0.2298, + "step": 1896 + }, + { + "epoch": 1.2315653298835705, + "grad_norm": 6.345997333526611, + "learning_rate": 8.823780740461204e-06, + "loss": 0.2326, + "step": 1904 + }, + { + "epoch": 1.2367399741267788, + "grad_norm": 11.137429237365723, + "learning_rate": 8.81296071796882e-06, + "loss": 0.2374, + "step": 1912 + }, + { + "epoch": 1.2419146183699872, + "grad_norm": 9.395957946777344, + "learning_rate": 8.80209785534726e-06, + "loss": 0.2498, + "step": 1920 + }, + { + "epoch": 1.2470892626131953, + "grad_norm": 17.85895347595215, + "learning_rate": 8.791192274645107e-06, + "loss": 0.2275, + "step": 1928 + }, + { + "epoch": 1.2522639068564037, + "grad_norm": 7.938706874847412, + "learning_rate": 8.780244098390891e-06, + "loss": 0.2405, + "step": 1936 + }, + { + "epoch": 1.2574385510996118, + "grad_norm": 14.045018196105957, + "learning_rate": 8.769253449591728e-06, + "loss": 0.2308, + "step": 1944 + }, + { + "epoch": 1.2626131953428201, + "grad_norm": 20.312881469726562, + "learning_rate": 8.758220451731922e-06, + "loss": 0.2309, + "step": 1952 + }, + { + "epoch": 1.2677878395860285, + "grad_norm": 1.4342230558395386, + "learning_rate": 8.74714522877159e-06, + "loss": 0.2287, + "step": 1960 + }, + { + "epoch": 1.2729624838292368, + "grad_norm": 4.709366321563721, + "learning_rate": 8.736027905145265e-06, + "loss": 0.229, + "step": 1968 + }, + { + "epoch": 1.278137128072445, + "grad_norm": 13.358282089233398, + "learning_rate": 8.724868605760497e-06, + "loss": 0.2243, + "step": 1976 + }, + { + "epoch": 1.2833117723156533, + "grad_norm": 1.1019079685211182, + "learning_rate": 8.713667455996449e-06, + "loss": 0.2292, + "step": 1984 + }, + { + "epoch": 1.2884864165588616, + "grad_norm": 16.261205673217773, + "learning_rate": 8.70242458170249e-06, + "loss": 0.2515, + "step": 1992 + }, + { + "epoch": 1.2936610608020698, + "grad_norm": 7.513927936553955, + "learning_rate": 8.691140109196782e-06, + "loss": 0.2329, + "step": 2000 + }, + { + "epoch": 1.2988357050452781, + "grad_norm": 7.099719047546387, + "learning_rate": 8.67981416526486e-06, + "loss": 0.2327, + "step": 2008 + }, + { + "epoch": 1.3040103492884865, + "grad_norm": 0.7779621481895447, + "learning_rate": 8.668446877158205e-06, + "loss": 0.2254, + "step": 2016 + }, + { + "epoch": 1.3091849935316948, + "grad_norm": 3.191051959991455, + "learning_rate": 8.657038372592815e-06, + "loss": 0.2376, + "step": 2024 + }, + { + "epoch": 1.314359637774903, + "grad_norm": 3.789489984512329, + "learning_rate": 8.645588779747775e-06, + "loss": 0.2205, + "step": 2032 + }, + { + "epoch": 1.3195342820181113, + "grad_norm": 3.229055166244507, + "learning_rate": 8.634098227263809e-06, + "loss": 0.2204, + "step": 2040 + }, + { + "epoch": 1.3247089262613194, + "grad_norm": 2.834246873855591, + "learning_rate": 8.622566844241846e-06, + "loss": 0.2356, + "step": 2048 + }, + { + "epoch": 1.3298835705045278, + "grad_norm": 20.905750274658203, + "learning_rate": 8.610994760241555e-06, + "loss": 0.2336, + "step": 2056 + }, + { + "epoch": 1.3350582147477361, + "grad_norm": 10.509840965270996, + "learning_rate": 8.599382105279899e-06, + "loss": 0.2268, + "step": 2064 + }, + { + "epoch": 1.3402328589909445, + "grad_norm": 11.719096183776855, + "learning_rate": 8.58772900982967e-06, + "loss": 0.2149, + "step": 2072 + }, + { + "epoch": 1.3454075032341526, + "grad_norm": 3.012690544128418, + "learning_rate": 8.576035604818031e-06, + "loss": 0.2217, + "step": 2080 + }, + { + "epoch": 1.350582147477361, + "grad_norm": 26.176471710205078, + "learning_rate": 8.564302021625033e-06, + "loss": 0.2153, + "step": 2088 + }, + { + "epoch": 1.3557567917205693, + "grad_norm": 7.365968704223633, + "learning_rate": 8.552528392082147e-06, + "loss": 0.2427, + "step": 2096 + }, + { + "epoch": 1.3609314359637774, + "grad_norm": 18.66005516052246, + "learning_rate": 8.54071484847078e-06, + "loss": 0.2114, + "step": 2104 + }, + { + "epoch": 1.3661060802069858, + "grad_norm": 8.920136451721191, + "learning_rate": 8.528861523520792e-06, + "loss": 0.2311, + "step": 2112 + }, + { + "epoch": 1.371280724450194, + "grad_norm": 8.041257858276367, + "learning_rate": 8.516968550408998e-06, + "loss": 0.2127, + "step": 2120 + }, + { + "epoch": 1.3764553686934025, + "grad_norm": 5.114807605743408, + "learning_rate": 8.505036062757677e-06, + "loss": 0.2535, + "step": 2128 + }, + { + "epoch": 1.3816300129366106, + "grad_norm": 9.047259330749512, + "learning_rate": 8.493064194633072e-06, + "loss": 0.2284, + "step": 2136 + }, + { + "epoch": 1.386804657179819, + "grad_norm": 12.650711059570312, + "learning_rate": 8.481053080543879e-06, + "loss": 0.2553, + "step": 2144 + }, + { + "epoch": 1.391979301423027, + "grad_norm": 1.5026906728744507, + "learning_rate": 8.469002855439741e-06, + "loss": 0.2348, + "step": 2152 + }, + { + "epoch": 1.3971539456662354, + "grad_norm": 18.557418823242188, + "learning_rate": 8.456913654709725e-06, + "loss": 0.23, + "step": 2160 + }, + { + "epoch": 1.4023285899094438, + "grad_norm": 2.7122020721435547, + "learning_rate": 8.444785614180807e-06, + "loss": 0.2419, + "step": 2168 + }, + { + "epoch": 1.407503234152652, + "grad_norm": 3.2189199924468994, + "learning_rate": 8.432618870116339e-06, + "loss": 0.2381, + "step": 2176 + }, + { + "epoch": 1.4126778783958602, + "grad_norm": 3.4176528453826904, + "learning_rate": 8.42041355921453e-06, + "loss": 0.2425, + "step": 2184 + }, + { + "epoch": 1.4178525226390686, + "grad_norm": 29.597209930419922, + "learning_rate": 8.4081698186069e-06, + "loss": 0.2207, + "step": 2192 + }, + { + "epoch": 1.4230271668822767, + "grad_norm": 28.842697143554688, + "learning_rate": 8.39588778585674e-06, + "loss": 0.2419, + "step": 2200 + }, + { + "epoch": 1.428201811125485, + "grad_norm": 6.279779434204102, + "learning_rate": 8.383567598957567e-06, + "loss": 0.2383, + "step": 2208 + }, + { + "epoch": 1.4333764553686934, + "grad_norm": 21.101608276367188, + "learning_rate": 8.37120939633158e-06, + "loss": 0.2286, + "step": 2216 + }, + { + "epoch": 1.4385510996119018, + "grad_norm": 2.4211056232452393, + "learning_rate": 8.358813316828097e-06, + "loss": 0.2461, + "step": 2224 + }, + { + "epoch": 1.44372574385511, + "grad_norm": 9.471773147583008, + "learning_rate": 8.346379499722e-06, + "loss": 0.2282, + "step": 2232 + }, + { + "epoch": 1.4489003880983182, + "grad_norm": 7.390787124633789, + "learning_rate": 8.333908084712163e-06, + "loss": 0.2244, + "step": 2240 + }, + { + "epoch": 1.4540750323415266, + "grad_norm": 3.4226720333099365, + "learning_rate": 8.321399211919893e-06, + "loss": 0.2293, + "step": 2248 + }, + { + "epoch": 1.4592496765847347, + "grad_norm": 4.221699237823486, + "learning_rate": 8.308853021887346e-06, + "loss": 0.2384, + "step": 2256 + }, + { + "epoch": 1.464424320827943, + "grad_norm": 8.037599563598633, + "learning_rate": 8.296269655575956e-06, + "loss": 0.2307, + "step": 2264 + }, + { + "epoch": 1.4695989650711514, + "grad_norm": 18.835407257080078, + "learning_rate": 8.283649254364843e-06, + "loss": 0.228, + "step": 2272 + }, + { + "epoch": 1.4747736093143597, + "grad_norm": 6.508788585662842, + "learning_rate": 8.270991960049231e-06, + "loss": 0.2262, + "step": 2280 + }, + { + "epoch": 1.4799482535575679, + "grad_norm": 1.9109559059143066, + "learning_rate": 8.25829791483885e-06, + "loss": 0.2302, + "step": 2288 + }, + { + "epoch": 1.4851228978007762, + "grad_norm": 15.648486137390137, + "learning_rate": 8.245567261356347e-06, + "loss": 0.2323, + "step": 2296 + }, + { + "epoch": 1.4902975420439843, + "grad_norm": 9.713397979736328, + "learning_rate": 8.232800142635675e-06, + "loss": 0.2273, + "step": 2304 + }, + { + "epoch": 1.4954721862871927, + "grad_norm": 13.52708625793457, + "learning_rate": 8.219996702120482e-06, + "loss": 0.231, + "step": 2312 + }, + { + "epoch": 1.500646830530401, + "grad_norm": 1.5020825862884521, + "learning_rate": 8.207157083662516e-06, + "loss": 0.2344, + "step": 2320 + }, + { + "epoch": 1.5058214747736094, + "grad_norm": 0.9299488067626953, + "learning_rate": 8.19428143151999e-06, + "loss": 0.2308, + "step": 2328 + }, + { + "epoch": 1.5109961190168177, + "grad_norm": 26.19788932800293, + "learning_rate": 8.181369890355975e-06, + "loss": 0.254, + "step": 2336 + }, + { + "epoch": 1.5161707632600259, + "grad_norm": 1.185628056526184, + "learning_rate": 8.16842260523677e-06, + "loss": 0.2329, + "step": 2344 + }, + { + "epoch": 1.521345407503234, + "grad_norm": 8.439960479736328, + "learning_rate": 8.155439721630265e-06, + "loss": 0.2089, + "step": 2352 + }, + { + "epoch": 1.5265200517464423, + "grad_norm": 128.4012451171875, + "learning_rate": 8.14242138540432e-06, + "loss": 0.2227, + "step": 2360 + }, + { + "epoch": 1.5316946959896507, + "grad_norm": 1.9064865112304688, + "learning_rate": 8.129367742825117e-06, + "loss": 0.2285, + "step": 2368 + }, + { + "epoch": 1.536869340232859, + "grad_norm": 1.233069896697998, + "learning_rate": 8.116278940555517e-06, + "loss": 0.2178, + "step": 2376 + }, + { + "epoch": 1.5420439844760674, + "grad_norm": 3.824653148651123, + "learning_rate": 8.103155125653419e-06, + "loss": 0.2332, + "step": 2384 + }, + { + "epoch": 1.5472186287192755, + "grad_norm": 7.783995628356934, + "learning_rate": 8.089996445570097e-06, + "loss": 0.2175, + "step": 2392 + }, + { + "epoch": 1.5523932729624839, + "grad_norm": 3.668410301208496, + "learning_rate": 8.076803048148553e-06, + "loss": 0.2355, + "step": 2400 + }, + { + "epoch": 1.557567917205692, + "grad_norm": 9.429797172546387, + "learning_rate": 8.06357508162185e-06, + "loss": 0.2244, + "step": 2408 + }, + { + "epoch": 1.5627425614489003, + "grad_norm": 1.2597167491912842, + "learning_rate": 8.050312694611451e-06, + "loss": 0.2274, + "step": 2416 + }, + { + "epoch": 1.5679172056921087, + "grad_norm": 22.601144790649414, + "learning_rate": 8.037016036125542e-06, + "loss": 0.2134, + "step": 2424 + }, + { + "epoch": 1.573091849935317, + "grad_norm": 12.697772979736328, + "learning_rate": 8.023685255557368e-06, + "loss": 0.242, + "step": 2432 + }, + { + "epoch": 1.5782664941785254, + "grad_norm": 9.506535530090332, + "learning_rate": 8.010320502683549e-06, + "loss": 0.2277, + "step": 2440 + }, + { + "epoch": 1.5834411384217335, + "grad_norm": 2.1831676959991455, + "learning_rate": 7.996921927662395e-06, + "loss": 0.2292, + "step": 2448 + }, + { + "epoch": 1.5886157826649416, + "grad_norm": 3.2136082649230957, + "learning_rate": 7.983489681032219e-06, + "loss": 0.2349, + "step": 2456 + }, + { + "epoch": 1.59379042690815, + "grad_norm": 12.306464195251465, + "learning_rate": 7.970023913709652e-06, + "loss": 0.2303, + "step": 2464 + }, + { + "epoch": 1.5989650711513583, + "grad_norm": 17.164098739624023, + "learning_rate": 7.956524776987945e-06, + "loss": 0.2178, + "step": 2472 + }, + { + "epoch": 1.6041397153945667, + "grad_norm": 3.7992491722106934, + "learning_rate": 7.94299242253526e-06, + "loss": 0.228, + "step": 2480 + }, + { + "epoch": 1.609314359637775, + "grad_norm": 3.3547708988189697, + "learning_rate": 7.929427002392981e-06, + "loss": 0.2387, + "step": 2488 + }, + { + "epoch": 1.6144890038809832, + "grad_norm": 1.9732433557510376, + "learning_rate": 7.915828668973992e-06, + "loss": 0.2367, + "step": 2496 + }, + { + "epoch": 1.6196636481241915, + "grad_norm": 9.517621994018555, + "learning_rate": 7.902197575060978e-06, + "loss": 0.2296, + "step": 2504 + }, + { + "epoch": 1.6248382923673996, + "grad_norm": 1.9416531324386597, + "learning_rate": 7.888533873804693e-06, + "loss": 0.2267, + "step": 2512 + }, + { + "epoch": 1.630012936610608, + "grad_norm": 1.1089810132980347, + "learning_rate": 7.874837718722254e-06, + "loss": 0.2394, + "step": 2520 + }, + { + "epoch": 1.6351875808538163, + "grad_norm": 6.213425159454346, + "learning_rate": 7.861109263695405e-06, + "loss": 0.2302, + "step": 2528 + }, + { + "epoch": 1.6403622250970247, + "grad_norm": 16.181215286254883, + "learning_rate": 7.847348662968796e-06, + "loss": 0.2337, + "step": 2536 + }, + { + "epoch": 1.645536869340233, + "grad_norm": 2.1743736267089844, + "learning_rate": 7.833556071148245e-06, + "loss": 0.2249, + "step": 2544 + }, + { + "epoch": 1.6507115135834411, + "grad_norm": 54.79741668701172, + "learning_rate": 7.819731643199006e-06, + "loss": 0.2229, + "step": 2552 + }, + { + "epoch": 1.6558861578266493, + "grad_norm": 2.49298095703125, + "learning_rate": 7.805875534444016e-06, + "loss": 0.232, + "step": 2560 + }, + { + "epoch": 1.6610608020698576, + "grad_norm": 68.19276428222656, + "learning_rate": 7.79198790056217e-06, + "loss": 0.244, + "step": 2568 + }, + { + "epoch": 1.666235446313066, + "grad_norm": 2.6579787731170654, + "learning_rate": 7.77806889758655e-06, + "loss": 0.2314, + "step": 2576 + }, + { + "epoch": 1.6714100905562743, + "grad_norm": 1.3463815450668335, + "learning_rate": 7.764118681902688e-06, + "loss": 0.236, + "step": 2584 + }, + { + "epoch": 1.6765847347994827, + "grad_norm": 11.474882125854492, + "learning_rate": 7.750137410246803e-06, + "loss": 0.2229, + "step": 2592 + }, + { + "epoch": 1.6817593790426908, + "grad_norm": 33.29436492919922, + "learning_rate": 7.73612523970404e-06, + "loss": 0.218, + "step": 2600 + }, + { + "epoch": 1.6869340232858991, + "grad_norm": 17.358701705932617, + "learning_rate": 7.722082327706701e-06, + "loss": 0.2313, + "step": 2608 + }, + { + "epoch": 1.6921086675291073, + "grad_norm": 3.2574429512023926, + "learning_rate": 7.708008832032485e-06, + "loss": 0.2373, + "step": 2616 + }, + { + "epoch": 1.6972833117723156, + "grad_norm": 4.210421562194824, + "learning_rate": 7.693904910802712e-06, + "loss": 0.2358, + "step": 2624 + }, + { + "epoch": 1.702457956015524, + "grad_norm": 1.4634445905685425, + "learning_rate": 7.679770722480539e-06, + "loss": 0.2105, + "step": 2632 + }, + { + "epoch": 1.7076326002587323, + "grad_norm": 5.840834140777588, + "learning_rate": 7.665606425869194e-06, + "loss": 0.2274, + "step": 2640 + }, + { + "epoch": 1.7128072445019404, + "grad_norm": 3.880319118499756, + "learning_rate": 7.651412180110176e-06, + "loss": 0.2254, + "step": 2648 + }, + { + "epoch": 1.7179818887451488, + "grad_norm": 2.0179333686828613, + "learning_rate": 7.637188144681478e-06, + "loss": 0.2219, + "step": 2656 + }, + { + "epoch": 1.723156532988357, + "grad_norm": 4.8886332511901855, + "learning_rate": 7.622934479395792e-06, + "loss": 0.2358, + "step": 2664 + }, + { + "epoch": 1.7283311772315653, + "grad_norm": 1.195976972579956, + "learning_rate": 7.608651344398713e-06, + "loss": 0.2285, + "step": 2672 + }, + { + "epoch": 1.7335058214747736, + "grad_norm": 5.361800193786621, + "learning_rate": 7.5943389001669395e-06, + "loss": 0.2164, + "step": 2680 + }, + { + "epoch": 1.738680465717982, + "grad_norm": 13.222526550292969, + "learning_rate": 7.579997307506472e-06, + "loss": 0.2213, + "step": 2688 + }, + { + "epoch": 1.7438551099611903, + "grad_norm": 10.024150848388672, + "learning_rate": 7.565626727550804e-06, + "loss": 0.2318, + "step": 2696 + }, + { + "epoch": 1.7490297542043984, + "grad_norm": 2.918762445449829, + "learning_rate": 7.551227321759111e-06, + "loss": 0.2264, + "step": 2704 + }, + { + "epoch": 1.7542043984476066, + "grad_norm": 1.1094213724136353, + "learning_rate": 7.536799251914442e-06, + "loss": 0.2345, + "step": 2712 + }, + { + "epoch": 1.759379042690815, + "grad_norm": 0.8953060507774353, + "learning_rate": 7.522342680121897e-06, + "loss": 0.236, + "step": 2720 + }, + { + "epoch": 1.7645536869340233, + "grad_norm": 11.926855087280273, + "learning_rate": 7.507857768806803e-06, + "loss": 0.2237, + "step": 2728 + }, + { + "epoch": 1.7697283311772316, + "grad_norm": 72.2110824584961, + "learning_rate": 7.4933446807129e-06, + "loss": 0.216, + "step": 2736 + }, + { + "epoch": 1.77490297542044, + "grad_norm": 30.90595054626465, + "learning_rate": 7.4788035789005e-06, + "loss": 0.2334, + "step": 2744 + }, + { + "epoch": 1.780077619663648, + "grad_norm": 2.09355092048645, + "learning_rate": 7.464234626744659e-06, + "loss": 0.2455, + "step": 2752 + }, + { + "epoch": 1.7852522639068564, + "grad_norm": 1.883202314376831, + "learning_rate": 7.449637987933347e-06, + "loss": 0.2208, + "step": 2760 + }, + { + "epoch": 1.7904269081500646, + "grad_norm": 3.159433364868164, + "learning_rate": 7.435013826465601e-06, + "loss": 0.231, + "step": 2768 + }, + { + "epoch": 1.795601552393273, + "grad_norm": 2.3597705364227295, + "learning_rate": 7.420362306649691e-06, + "loss": 0.219, + "step": 2776 + }, + { + "epoch": 1.8007761966364813, + "grad_norm": 30.12824058532715, + "learning_rate": 7.405683593101263e-06, + "loss": 0.2193, + "step": 2784 + }, + { + "epoch": 1.8059508408796896, + "grad_norm": 4.709564685821533, + "learning_rate": 7.390977850741498e-06, + "loss": 0.2252, + "step": 2792 + }, + { + "epoch": 1.811125485122898, + "grad_norm": 2.6210618019104004, + "learning_rate": 7.376245244795255e-06, + "loss": 0.2217, + "step": 2800 + }, + { + "epoch": 1.816300129366106, + "grad_norm": 19.381088256835938, + "learning_rate": 7.361485940789221e-06, + "loss": 0.2084, + "step": 2808 + }, + { + "epoch": 1.8214747736093142, + "grad_norm": 7.356868267059326, + "learning_rate": 7.346700104550042e-06, + "loss": 0.2243, + "step": 2816 + }, + { + "epoch": 1.8266494178525226, + "grad_norm": 4.240950584411621, + "learning_rate": 7.331887902202463e-06, + "loss": 0.2204, + "step": 2824 + }, + { + "epoch": 1.831824062095731, + "grad_norm": 0.8546013832092285, + "learning_rate": 7.317049500167466e-06, + "loss": 0.2159, + "step": 2832 + }, + { + "epoch": 1.8369987063389392, + "grad_norm": 1.1202447414398193, + "learning_rate": 7.3021850651603955e-06, + "loss": 0.2429, + "step": 2840 + }, + { + "epoch": 1.8421733505821476, + "grad_norm": 27.37849998474121, + "learning_rate": 7.2872947641890854e-06, + "loss": 0.2432, + "step": 2848 + }, + { + "epoch": 1.8473479948253557, + "grad_norm": 2.847451686859131, + "learning_rate": 7.272378764551988e-06, + "loss": 0.2142, + "step": 2856 + }, + { + "epoch": 1.852522639068564, + "grad_norm": 3.85857892036438, + "learning_rate": 7.257437233836285e-06, + "loss": 0.2268, + "step": 2864 + }, + { + "epoch": 1.8576972833117722, + "grad_norm": 2.2850306034088135, + "learning_rate": 7.242470339916014e-06, + "loss": 0.2252, + "step": 2872 + }, + { + "epoch": 1.8628719275549805, + "grad_norm": 2.8729026317596436, + "learning_rate": 7.227478250950178e-06, + "loss": 0.2154, + "step": 2880 + }, + { + "epoch": 1.868046571798189, + "grad_norm": 4.433546543121338, + "learning_rate": 7.212461135380855e-06, + "loss": 0.2289, + "step": 2888 + }, + { + "epoch": 1.8732212160413972, + "grad_norm": 7.162495136260986, + "learning_rate": 7.197419161931305e-06, + "loss": 0.2229, + "step": 2896 + }, + { + "epoch": 1.8783958602846056, + "grad_norm": 2.4063501358032227, + "learning_rate": 7.182352499604081e-06, + "loss": 0.2286, + "step": 2904 + }, + { + "epoch": 1.8835705045278137, + "grad_norm": 15.924505233764648, + "learning_rate": 7.167261317679121e-06, + "loss": 0.214, + "step": 2912 + }, + { + "epoch": 1.8887451487710218, + "grad_norm": 2.0503506660461426, + "learning_rate": 7.1521457857118525e-06, + "loss": 0.2263, + "step": 2920 + }, + { + "epoch": 1.8939197930142302, + "grad_norm": 1.1639388799667358, + "learning_rate": 7.137006073531285e-06, + "loss": 0.2373, + "step": 2928 + }, + { + "epoch": 1.8990944372574385, + "grad_norm": 10.224421501159668, + "learning_rate": 7.121842351238102e-06, + "loss": 0.2182, + "step": 2936 + }, + { + "epoch": 1.9042690815006469, + "grad_norm": 1.769492268562317, + "learning_rate": 7.106654789202751e-06, + "loss": 0.2257, + "step": 2944 + }, + { + "epoch": 1.9094437257438552, + "grad_norm": 6.07066535949707, + "learning_rate": 7.0914435580635286e-06, + "loss": 0.2245, + "step": 2952 + }, + { + "epoch": 1.9146183699870634, + "grad_norm": 3.925419807434082, + "learning_rate": 7.076208828724661e-06, + "loss": 0.2408, + "step": 2960 + }, + { + "epoch": 1.9197930142302717, + "grad_norm": 11.967470169067383, + "learning_rate": 7.060950772354389e-06, + "loss": 0.2307, + "step": 2968 + }, + { + "epoch": 1.9249676584734798, + "grad_norm": 13.755480766296387, + "learning_rate": 7.045669560383039e-06, + "loss": 0.2325, + "step": 2976 + }, + { + "epoch": 1.9301423027166882, + "grad_norm": 2.514618158340454, + "learning_rate": 7.030365364501104e-06, + "loss": 0.2497, + "step": 2984 + }, + { + "epoch": 1.9353169469598965, + "grad_norm": 87.96125030517578, + "learning_rate": 7.015038356657303e-06, + "loss": 0.2279, + "step": 2992 + }, + { + "epoch": 1.9404915912031049, + "grad_norm": 5.449705600738525, + "learning_rate": 6.9996887090566645e-06, + "loss": 0.2299, + "step": 3000 + }, + { + "epoch": 1.9456662354463132, + "grad_norm": 2.127930164337158, + "learning_rate": 6.98431659415858e-06, + "loss": 0.2237, + "step": 3008 + }, + { + "epoch": 1.9508408796895214, + "grad_norm": 9.181459426879883, + "learning_rate": 6.968922184674868e-06, + "loss": 0.213, + "step": 3016 + }, + { + "epoch": 1.9560155239327295, + "grad_norm": 4.903986930847168, + "learning_rate": 6.95350565356784e-06, + "loss": 0.2141, + "step": 3024 + }, + { + "epoch": 1.9611901681759378, + "grad_norm": 1.7192250490188599, + "learning_rate": 6.93806717404835e-06, + "loss": 0.2118, + "step": 3032 + }, + { + "epoch": 1.9663648124191462, + "grad_norm": 11.83716869354248, + "learning_rate": 6.922606919573851e-06, + "loss": 0.2261, + "step": 3040 + }, + { + "epoch": 1.9715394566623545, + "grad_norm": 2.4812798500061035, + "learning_rate": 6.907125063846447e-06, + "loss": 0.2121, + "step": 3048 + }, + { + "epoch": 1.9767141009055629, + "grad_norm": 1.210487961769104, + "learning_rate": 6.891621780810941e-06, + "loss": 0.2225, + "step": 3056 + }, + { + "epoch": 1.981888745148771, + "grad_norm": 13.787434577941895, + "learning_rate": 6.876097244652879e-06, + "loss": 0.2178, + "step": 3064 + }, + { + "epoch": 1.9870633893919794, + "grad_norm": 6.533021450042725, + "learning_rate": 6.860551629796597e-06, + "loss": 0.2216, + "step": 3072 + }, + { + "epoch": 1.9922380336351875, + "grad_norm": 16.69125747680664, + "learning_rate": 6.844985110903255e-06, + "loss": 0.2197, + "step": 3080 + }, + { + "epoch": 1.9974126778783958, + "grad_norm": 8.427518844604492, + "learning_rate": 6.829397862868878e-06, + "loss": 0.202, + "step": 3088 + }, + { + "epoch": 2.002587322121604, + "grad_norm": 6.225667476654053, + "learning_rate": 6.8137900608223985e-06, + "loss": 0.2248, + "step": 3096 + }, + { + "epoch": 2.0077619663648125, + "grad_norm": 5.35168981552124, + "learning_rate": 6.798161880123671e-06, + "loss": 0.2238, + "step": 3104 + }, + { + "epoch": 2.012936610608021, + "grad_norm": 3.4669506549835205, + "learning_rate": 6.78251349636152e-06, + "loss": 0.2087, + "step": 3112 + }, + { + "epoch": 2.0181112548512288, + "grad_norm": 33.86760330200195, + "learning_rate": 6.766845085351755e-06, + "loss": 0.2332, + "step": 3120 + }, + { + "epoch": 2.023285899094437, + "grad_norm": 13.199527740478516, + "learning_rate": 6.751156823135203e-06, + "loss": 0.2139, + "step": 3128 + }, + { + "epoch": 2.0284605433376455, + "grad_norm": 6.058915615081787, + "learning_rate": 6.735448885975724e-06, + "loss": 0.2194, + "step": 3136 + }, + { + "epoch": 2.033635187580854, + "grad_norm": 1.5237799882888794, + "learning_rate": 6.7197214503582355e-06, + "loss": 0.2227, + "step": 3144 + }, + { + "epoch": 2.038809831824062, + "grad_norm": 19.020872116088867, + "learning_rate": 6.703974692986729e-06, + "loss": 0.2257, + "step": 3152 + }, + { + "epoch": 2.0439844760672705, + "grad_norm": 1.8353416919708252, + "learning_rate": 6.68820879078228e-06, + "loss": 0.217, + "step": 3160 + }, + { + "epoch": 2.049159120310479, + "grad_norm": 8.923017501831055, + "learning_rate": 6.672423920881068e-06, + "loss": 0.2086, + "step": 3168 + }, + { + "epoch": 2.0543337645536868, + "grad_norm": 10.713179588317871, + "learning_rate": 6.6566202606323806e-06, + "loss": 0.2177, + "step": 3176 + }, + { + "epoch": 2.059508408796895, + "grad_norm": 1.580041766166687, + "learning_rate": 6.640797987596621e-06, + "loss": 0.2231, + "step": 3184 + }, + { + "epoch": 2.0646830530401035, + "grad_norm": 2.9307405948638916, + "learning_rate": 6.6249572795433155e-06, + "loss": 0.2089, + "step": 3192 + }, + { + "epoch": 2.069857697283312, + "grad_norm": 4.403675079345703, + "learning_rate": 6.609098314449116e-06, + "loss": 0.2261, + "step": 3200 + }, + { + "epoch": 2.07503234152652, + "grad_norm": 5.07964563369751, + "learning_rate": 6.593221270495797e-06, + "loss": 0.2172, + "step": 3208 + }, + { + "epoch": 2.0802069857697285, + "grad_norm": 67.94288635253906, + "learning_rate": 6.5773263260682595e-06, + "loss": 0.226, + "step": 3216 + }, + { + "epoch": 2.0853816300129364, + "grad_norm": 40.04513168334961, + "learning_rate": 6.561413659752521e-06, + "loss": 0.2459, + "step": 3224 + }, + { + "epoch": 2.0905562742561448, + "grad_norm": 43.539859771728516, + "learning_rate": 6.545483450333712e-06, + "loss": 0.2203, + "step": 3232 + }, + { + "epoch": 2.095730918499353, + "grad_norm": 1.605956792831421, + "learning_rate": 6.529535876794069e-06, + "loss": 0.2147, + "step": 3240 + }, + { + "epoch": 2.1009055627425615, + "grad_norm": 11.918776512145996, + "learning_rate": 6.5135711183109156e-06, + "loss": 0.2252, + "step": 3248 + }, + { + "epoch": 2.10608020698577, + "grad_norm": 39.16691207885742, + "learning_rate": 6.497589354254662e-06, + "loss": 0.2245, + "step": 3256 + }, + { + "epoch": 2.111254851228978, + "grad_norm": 1.8998316526412964, + "learning_rate": 6.481590764186778e-06, + "loss": 0.2227, + "step": 3264 + }, + { + "epoch": 2.116429495472186, + "grad_norm": 0.9274665117263794, + "learning_rate": 6.465575527857781e-06, + "loss": 0.2145, + "step": 3272 + }, + { + "epoch": 2.1216041397153944, + "grad_norm": 2.3836865425109863, + "learning_rate": 6.44954382520522e-06, + "loss": 0.2096, + "step": 3280 + }, + { + "epoch": 2.1267787839586028, + "grad_norm": 1.166344165802002, + "learning_rate": 6.433495836351643e-06, + "loss": 0.2026, + "step": 3288 + }, + { + "epoch": 2.131953428201811, + "grad_norm": 1.88438081741333, + "learning_rate": 6.417431741602585e-06, + "loss": 0.2217, + "step": 3296 + }, + { + "epoch": 2.1371280724450195, + "grad_norm": 2.4005556106567383, + "learning_rate": 6.401351721444533e-06, + "loss": 0.2145, + "step": 3304 + }, + { + "epoch": 2.142302716688228, + "grad_norm": 2.708996295928955, + "learning_rate": 6.385255956542907e-06, + "loss": 0.2327, + "step": 3312 + }, + { + "epoch": 2.147477360931436, + "grad_norm": 3.4204115867614746, + "learning_rate": 6.369144627740023e-06, + "loss": 0.2156, + "step": 3320 + }, + { + "epoch": 2.152652005174644, + "grad_norm": 6.446617603302002, + "learning_rate": 6.353017916053063e-06, + "loss": 0.2138, + "step": 3328 + }, + { + "epoch": 2.1578266494178524, + "grad_norm": 14.877888679504395, + "learning_rate": 6.336876002672042e-06, + "loss": 0.2305, + "step": 3336 + }, + { + "epoch": 2.1630012936610608, + "grad_norm": 20.82187843322754, + "learning_rate": 6.3207190689577745e-06, + "loss": 0.2134, + "step": 3344 + }, + { + "epoch": 2.168175937904269, + "grad_norm": 1.6044968366622925, + "learning_rate": 6.304547296439831e-06, + "loss": 0.226, + "step": 3352 + }, + { + "epoch": 2.1733505821474774, + "grad_norm": 1.2519872188568115, + "learning_rate": 6.288360866814504e-06, + "loss": 0.2263, + "step": 3360 + }, + { + "epoch": 2.178525226390686, + "grad_norm": 2.0602993965148926, + "learning_rate": 6.272159961942764e-06, + "loss": 0.2354, + "step": 3368 + }, + { + "epoch": 2.1836998706338937, + "grad_norm": 8.087594032287598, + "learning_rate": 6.255944763848215e-06, + "loss": 0.2276, + "step": 3376 + }, + { + "epoch": 2.188874514877102, + "grad_norm": 2.5950396060943604, + "learning_rate": 6.239715454715054e-06, + "loss": 0.2205, + "step": 3384 + }, + { + "epoch": 2.1940491591203104, + "grad_norm": 10.110027313232422, + "learning_rate": 6.223472216886021e-06, + "loss": 0.2275, + "step": 3392 + }, + { + "epoch": 2.1992238033635187, + "grad_norm": 8.663797378540039, + "learning_rate": 6.2072152328603464e-06, + "loss": 0.2356, + "step": 3400 + }, + { + "epoch": 2.204398447606727, + "grad_norm": 5.048397541046143, + "learning_rate": 6.190944685291708e-06, + "loss": 0.2147, + "step": 3408 + }, + { + "epoch": 2.2095730918499354, + "grad_norm": 6.550983428955078, + "learning_rate": 6.174660756986175e-06, + "loss": 0.2289, + "step": 3416 + }, + { + "epoch": 2.214747736093144, + "grad_norm": 2.499985933303833, + "learning_rate": 6.158363630900155e-06, + "loss": 0.2248, + "step": 3424 + }, + { + "epoch": 2.2199223803363517, + "grad_norm": 1.5437711477279663, + "learning_rate": 6.142053490138335e-06, + "loss": 0.2065, + "step": 3432 + }, + { + "epoch": 2.22509702457956, + "grad_norm": 2.1307685375213623, + "learning_rate": 6.1257305179516315e-06, + "loss": 0.2172, + "step": 3440 + }, + { + "epoch": 2.2302716688227684, + "grad_norm": 31.46668815612793, + "learning_rate": 6.109394897735121e-06, + "loss": 0.2192, + "step": 3448 + }, + { + "epoch": 2.2354463130659767, + "grad_norm": 1.232292890548706, + "learning_rate": 6.093046813025995e-06, + "loss": 0.2076, + "step": 3456 + }, + { + "epoch": 2.240620957309185, + "grad_norm": 12.811654090881348, + "learning_rate": 6.0766864475014785e-06, + "loss": 0.2437, + "step": 3464 + }, + { + "epoch": 2.2457956015523934, + "grad_norm": 76.99649047851562, + "learning_rate": 6.060313984976783e-06, + "loss": 0.2206, + "step": 3472 + }, + { + "epoch": 2.2509702457956013, + "grad_norm": 7.426888942718506, + "learning_rate": 6.043929609403032e-06, + "loss": 0.2169, + "step": 3480 + }, + { + "epoch": 2.2561448900388097, + "grad_norm": 4.467635154724121, + "learning_rate": 6.027533504865196e-06, + "loss": 0.2332, + "step": 3488 + }, + { + "epoch": 2.261319534282018, + "grad_norm": 3.1688578128814697, + "learning_rate": 6.011125855580026e-06, + "loss": 0.2198, + "step": 3496 + }, + { + "epoch": 2.2664941785252264, + "grad_norm": 1.5323104858398438, + "learning_rate": 5.994706845893986e-06, + "loss": 0.2221, + "step": 3504 + }, + { + "epoch": 2.2716688227684347, + "grad_norm": 6.551446914672852, + "learning_rate": 5.978276660281174e-06, + "loss": 0.2285, + "step": 3512 + }, + { + "epoch": 2.276843467011643, + "grad_norm": 1.8108479976654053, + "learning_rate": 5.961835483341255e-06, + "loss": 0.2216, + "step": 3520 + }, + { + "epoch": 2.2820181112548514, + "grad_norm": 0.735333263874054, + "learning_rate": 5.945383499797388e-06, + "loss": 0.2379, + "step": 3528 + }, + { + "epoch": 2.2871927554980593, + "grad_norm": 0.6946657299995422, + "learning_rate": 5.928920894494147e-06, + "loss": 0.2168, + "step": 3536 + }, + { + "epoch": 2.2923673997412677, + "grad_norm": 48.75246810913086, + "learning_rate": 5.912447852395444e-06, + "loss": 0.2224, + "step": 3544 + }, + { + "epoch": 2.297542043984476, + "grad_norm": 3.300706148147583, + "learning_rate": 5.8959645585824575e-06, + "loss": 0.2235, + "step": 3552 + }, + { + "epoch": 2.3027166882276844, + "grad_norm": 1.0665665864944458, + "learning_rate": 5.879471198251544e-06, + "loss": 0.2149, + "step": 3560 + }, + { + "epoch": 2.3078913324708927, + "grad_norm": 8.621994018554688, + "learning_rate": 5.86296795671216e-06, + "loss": 0.2134, + "step": 3568 + }, + { + "epoch": 2.313065976714101, + "grad_norm": 11.574270248413086, + "learning_rate": 5.846455019384787e-06, + "loss": 0.2019, + "step": 3576 + }, + { + "epoch": 2.318240620957309, + "grad_norm": 4.1682515144348145, + "learning_rate": 5.8299325717988355e-06, + "loss": 0.2288, + "step": 3584 + }, + { + "epoch": 2.3234152652005173, + "grad_norm": 42.42872619628906, + "learning_rate": 5.813400799590573e-06, + "loss": 0.2191, + "step": 3592 + }, + { + "epoch": 2.3285899094437257, + "grad_norm": 13.773611068725586, + "learning_rate": 5.7968598885010315e-06, + "loss": 0.2215, + "step": 3600 + }, + { + "epoch": 2.333764553686934, + "grad_norm": 6.93135404586792, + "learning_rate": 5.780310024373923e-06, + "loss": 0.2263, + "step": 3608 + }, + { + "epoch": 2.3389391979301424, + "grad_norm": 8.114331245422363, + "learning_rate": 5.763751393153545e-06, + "loss": 0.2228, + "step": 3616 + }, + { + "epoch": 2.3441138421733507, + "grad_norm": 1.8138705492019653, + "learning_rate": 5.747184180882704e-06, + "loss": 0.2057, + "step": 3624 + }, + { + "epoch": 2.349288486416559, + "grad_norm": 75.94891357421875, + "learning_rate": 5.730608573700613e-06, + "loss": 0.2174, + "step": 3632 + }, + { + "epoch": 2.354463130659767, + "grad_norm": 10.848607063293457, + "learning_rate": 5.714024757840806e-06, + "loss": 0.2248, + "step": 3640 + }, + { + "epoch": 2.3596377749029753, + "grad_norm": 5.006667613983154, + "learning_rate": 5.697432919629048e-06, + "loss": 0.209, + "step": 3648 + }, + { + "epoch": 2.3648124191461837, + "grad_norm": 1.7111903429031372, + "learning_rate": 5.680833245481234e-06, + "loss": 0.2026, + "step": 3656 + }, + { + "epoch": 2.369987063389392, + "grad_norm": 7.416234970092773, + "learning_rate": 5.664225921901302e-06, + "loss": 0.2339, + "step": 3664 + }, + { + "epoch": 2.3751617076326004, + "grad_norm": 1.52064049243927, + "learning_rate": 5.647611135479133e-06, + "loss": 0.216, + "step": 3672 + }, + { + "epoch": 2.3803363518758087, + "grad_norm": 1.1007732152938843, + "learning_rate": 5.6309890728884555e-06, + "loss": 0.2238, + "step": 3680 + }, + { + "epoch": 2.3855109961190166, + "grad_norm": 1.1339306831359863, + "learning_rate": 5.614359920884751e-06, + "loss": 0.2305, + "step": 3688 + }, + { + "epoch": 2.390685640362225, + "grad_norm": 1.9651095867156982, + "learning_rate": 5.5977238663031495e-06, + "loss": 0.217, + "step": 3696 + }, + { + "epoch": 2.3958602846054333, + "grad_norm": 8.74808120727539, + "learning_rate": 5.581081096056337e-06, + "loss": 0.2328, + "step": 3704 + }, + { + "epoch": 2.4010349288486417, + "grad_norm": 1.7282960414886475, + "learning_rate": 5.564431797132454e-06, + "loss": 0.2172, + "step": 3712 + }, + { + "epoch": 2.40620957309185, + "grad_norm": 2.679291248321533, + "learning_rate": 5.547776156592989e-06, + "loss": 0.2316, + "step": 3720 + }, + { + "epoch": 2.4113842173350584, + "grad_norm": 5.007349491119385, + "learning_rate": 5.531114361570684e-06, + "loss": 0.2344, + "step": 3728 + }, + { + "epoch": 2.4165588615782667, + "grad_norm": 91.39422607421875, + "learning_rate": 5.514446599267429e-06, + "loss": 0.2316, + "step": 3736 + }, + { + "epoch": 2.4217335058214746, + "grad_norm": 2.13769268989563, + "learning_rate": 5.497773056952159e-06, + "loss": 0.2272, + "step": 3744 + }, + { + "epoch": 2.426908150064683, + "grad_norm": 3.045867681503296, + "learning_rate": 5.481093921958749e-06, + "loss": 0.2093, + "step": 3752 + }, + { + "epoch": 2.4320827943078913, + "grad_norm": 1.7974780797958374, + "learning_rate": 5.4644093816839086e-06, + "loss": 0.221, + "step": 3760 + }, + { + "epoch": 2.4372574385510997, + "grad_norm": 2.6184771060943604, + "learning_rate": 5.44771962358508e-06, + "loss": 0.2214, + "step": 3768 + }, + { + "epoch": 2.442432082794308, + "grad_norm": 1.2994199991226196, + "learning_rate": 5.4310248351783264e-06, + "loss": 0.2394, + "step": 3776 + }, + { + "epoch": 2.4476067270375164, + "grad_norm": 4.988973140716553, + "learning_rate": 5.414325204036237e-06, + "loss": 0.2176, + "step": 3784 + }, + { + "epoch": 2.4527813712807243, + "grad_norm": 2.890683650970459, + "learning_rate": 5.397620917785799e-06, + "loss": 0.2182, + "step": 3792 + }, + { + "epoch": 2.4579560155239326, + "grad_norm": 9.80825424194336, + "learning_rate": 5.380912164106312e-06, + "loss": 0.2228, + "step": 3800 + }, + { + "epoch": 2.463130659767141, + "grad_norm": 6.544095516204834, + "learning_rate": 5.364199130727262e-06, + "loss": 0.2336, + "step": 3808 + }, + { + "epoch": 2.4683053040103493, + "grad_norm": 4.200817584991455, + "learning_rate": 5.347482005426224e-06, + "loss": 0.218, + "step": 3816 + }, + { + "epoch": 2.4734799482535577, + "grad_norm": 2.199859142303467, + "learning_rate": 5.330760976026744e-06, + "loss": 0.228, + "step": 3824 + }, + { + "epoch": 2.478654592496766, + "grad_norm": 4.835768222808838, + "learning_rate": 5.314036230396233e-06, + "loss": 0.2106, + "step": 3832 + }, + { + "epoch": 2.4838292367399744, + "grad_norm": 1.3598271608352661, + "learning_rate": 5.297307956443856e-06, + "loss": 0.2349, + "step": 3840 + }, + { + "epoch": 2.4890038809831823, + "grad_norm": 36.20537185668945, + "learning_rate": 5.28057634211842e-06, + "loss": 0.2099, + "step": 3848 + }, + { + "epoch": 2.4941785252263906, + "grad_norm": 1.2210248708724976, + "learning_rate": 5.2638415754062625e-06, + "loss": 0.212, + "step": 3856 + }, + { + "epoch": 2.499353169469599, + "grad_norm": 12.795808792114258, + "learning_rate": 5.247103844329137e-06, + "loss": 0.2252, + "step": 3864 + }, + { + "epoch": 2.5045278137128073, + "grad_norm": 7.502784252166748, + "learning_rate": 5.230363336942105e-06, + "loss": 0.2284, + "step": 3872 + }, + { + "epoch": 2.5097024579560157, + "grad_norm": 2.0604794025421143, + "learning_rate": 5.213620241331424e-06, + "loss": 0.2173, + "step": 3880 + }, + { + "epoch": 2.5148771021992236, + "grad_norm": 35.510223388671875, + "learning_rate": 5.196874745612425e-06, + "loss": 0.2195, + "step": 3888 + }, + { + "epoch": 2.520051746442432, + "grad_norm": 4.822657108306885, + "learning_rate": 5.180127037927408e-06, + "loss": 0.2274, + "step": 3896 + }, + { + "epoch": 2.5252263906856403, + "grad_norm": 3.2674267292022705, + "learning_rate": 5.163377306443527e-06, + "loss": 0.2128, + "step": 3904 + }, + { + "epoch": 2.5304010349288486, + "grad_norm": 17.1621036529541, + "learning_rate": 5.146625739350671e-06, + "loss": 0.2209, + "step": 3912 + }, + { + "epoch": 2.535575679172057, + "grad_norm": 3.881488561630249, + "learning_rate": 5.129872524859356e-06, + "loss": 0.2337, + "step": 3920 + }, + { + "epoch": 2.5407503234152653, + "grad_norm": 0.8638431429862976, + "learning_rate": 5.1131178511986045e-06, + "loss": 0.2165, + "step": 3928 + }, + { + "epoch": 2.5459249676584736, + "grad_norm": 30.481639862060547, + "learning_rate": 5.096361906613836e-06, + "loss": 0.2212, + "step": 3936 + }, + { + "epoch": 2.551099611901682, + "grad_norm": 13.07479190826416, + "learning_rate": 5.079604879364746e-06, + "loss": 0.2081, + "step": 3944 + }, + { + "epoch": 2.55627425614489, + "grad_norm": 2.632080316543579, + "learning_rate": 5.062846957723194e-06, + "loss": 0.2131, + "step": 3952 + }, + { + "epoch": 2.5614489003880982, + "grad_norm": 21.21263885498047, + "learning_rate": 5.046088329971095e-06, + "loss": 0.219, + "step": 3960 + }, + { + "epoch": 2.5666235446313066, + "grad_norm": 4.239492416381836, + "learning_rate": 5.0293291843982896e-06, + "loss": 0.2351, + "step": 3968 + }, + { + "epoch": 2.571798188874515, + "grad_norm": 2.2403526306152344, + "learning_rate": 5.012569709300441e-06, + "loss": 0.2104, + "step": 3976 + }, + { + "epoch": 2.5769728331177233, + "grad_norm": 44.84512710571289, + "learning_rate": 4.995810092976912e-06, + "loss": 0.2097, + "step": 3984 + }, + { + "epoch": 2.582147477360931, + "grad_norm": 411.47149658203125, + "learning_rate": 4.979050523728654e-06, + "loss": 0.2123, + "step": 3992 + }, + { + "epoch": 2.5873221216041395, + "grad_norm": 19.23361587524414, + "learning_rate": 4.962291189856089e-06, + "loss": 0.2208, + "step": 4000 + }, + { + "epoch": 2.592496765847348, + "grad_norm": 1.0302085876464844, + "learning_rate": 4.945532279656993e-06, + "loss": 0.2157, + "step": 4008 + }, + { + "epoch": 2.5976714100905562, + "grad_norm": 16.12313461303711, + "learning_rate": 4.9287739814243835e-06, + "loss": 0.2165, + "step": 4016 + }, + { + "epoch": 2.6028460543337646, + "grad_norm": 1.375899076461792, + "learning_rate": 4.912016483444403e-06, + "loss": 0.2186, + "step": 4024 + }, + { + "epoch": 2.608020698576973, + "grad_norm": 24.06150245666504, + "learning_rate": 4.8952599739942015e-06, + "loss": 0.2256, + "step": 4032 + }, + { + "epoch": 2.6131953428201813, + "grad_norm": 3.1488006114959717, + "learning_rate": 4.878504641339822e-06, + "loss": 0.2298, + "step": 4040 + }, + { + "epoch": 2.6183699870633896, + "grad_norm": 38.62849044799805, + "learning_rate": 4.861750673734085e-06, + "loss": 0.2238, + "step": 4048 + }, + { + "epoch": 2.6235446313065975, + "grad_norm": 1.4817951917648315, + "learning_rate": 4.8449982594144786e-06, + "loss": 0.2226, + "step": 4056 + }, + { + "epoch": 2.628719275549806, + "grad_norm": 28.119951248168945, + "learning_rate": 4.828247586601035e-06, + "loss": 0.2168, + "step": 4064 + }, + { + "epoch": 2.6338939197930142, + "grad_norm": 8.89055347442627, + "learning_rate": 4.811498843494222e-06, + "loss": 0.2061, + "step": 4072 + }, + { + "epoch": 2.6390685640362226, + "grad_norm": 1.4245612621307373, + "learning_rate": 4.794752218272824e-06, + "loss": 0.2239, + "step": 4080 + }, + { + "epoch": 2.644243208279431, + "grad_norm": 1.8925325870513916, + "learning_rate": 4.7780078990918326e-06, + "loss": 0.2189, + "step": 4088 + }, + { + "epoch": 2.649417852522639, + "grad_norm": 2.8567874431610107, + "learning_rate": 4.761266074080326e-06, + "loss": 0.213, + "step": 4096 + }, + { + "epoch": 2.654592496765847, + "grad_norm": 21.136762619018555, + "learning_rate": 4.744526931339367e-06, + "loss": 0.2169, + "step": 4104 + }, + { + "epoch": 2.6597671410090555, + "grad_norm": 27.49756622314453, + "learning_rate": 4.727790658939875e-06, + "loss": 0.223, + "step": 4112 + }, + { + "epoch": 2.664941785252264, + "grad_norm": 38.296661376953125, + "learning_rate": 4.711057444920522e-06, + "loss": 0.2048, + "step": 4120 + }, + { + "epoch": 2.6701164294954722, + "grad_norm": 2.039231061935425, + "learning_rate": 4.694327477285619e-06, + "loss": 0.2286, + "step": 4128 + }, + { + "epoch": 2.6752910737386806, + "grad_norm": 1.4154362678527832, + "learning_rate": 4.6776009440030035e-06, + "loss": 0.2223, + "step": 4136 + }, + { + "epoch": 2.680465717981889, + "grad_norm": 27.39082908630371, + "learning_rate": 4.660878033001922e-06, + "loss": 0.2154, + "step": 4144 + }, + { + "epoch": 2.6856403622250973, + "grad_norm": 2.18965482711792, + "learning_rate": 4.644158932170929e-06, + "loss": 0.2162, + "step": 4152 + }, + { + "epoch": 2.690815006468305, + "grad_norm": 37.81782531738281, + "learning_rate": 4.627443829355765e-06, + "loss": 0.2317, + "step": 4160 + }, + { + "epoch": 2.6959896507115135, + "grad_norm": 3.4519877433776855, + "learning_rate": 4.610732912357256e-06, + "loss": 0.229, + "step": 4168 + }, + { + "epoch": 2.701164294954722, + "grad_norm": 7.153350353240967, + "learning_rate": 4.5940263689291955e-06, + "loss": 0.2298, + "step": 4176 + }, + { + "epoch": 2.7063389391979302, + "grad_norm": 2.97613263130188, + "learning_rate": 4.57732438677624e-06, + "loss": 0.22, + "step": 4184 + }, + { + "epoch": 2.7115135834411386, + "grad_norm": 15.217869758605957, + "learning_rate": 4.560627153551795e-06, + "loss": 0.2013, + "step": 4192 + }, + { + "epoch": 2.7166882276843465, + "grad_norm": 17.354280471801758, + "learning_rate": 4.543934856855913e-06, + "loss": 0.2089, + "step": 4200 + }, + { + "epoch": 2.721862871927555, + "grad_norm": 3.547731876373291, + "learning_rate": 4.527247684233185e-06, + "loss": 0.2166, + "step": 4208 + }, + { + "epoch": 2.727037516170763, + "grad_norm": 3.7181508541107178, + "learning_rate": 4.510565823170625e-06, + "loss": 0.2223, + "step": 4216 + }, + { + "epoch": 2.7322121604139715, + "grad_norm": 1.831517219543457, + "learning_rate": 4.493889461095574e-06, + "loss": 0.2136, + "step": 4224 + }, + { + "epoch": 2.73738680465718, + "grad_norm": 2.9298593997955322, + "learning_rate": 4.477218785373587e-06, + "loss": 0.2184, + "step": 4232 + }, + { + "epoch": 2.742561448900388, + "grad_norm": 4.073368072509766, + "learning_rate": 4.460553983306332e-06, + "loss": 0.2076, + "step": 4240 + }, + { + "epoch": 2.7477360931435966, + "grad_norm": 1.481608510017395, + "learning_rate": 4.443895242129484e-06, + "loss": 0.214, + "step": 4248 + }, + { + "epoch": 2.752910737386805, + "grad_norm": 8.85819149017334, + "learning_rate": 4.4272427490106215e-06, + "loss": 0.2211, + "step": 4256 + }, + { + "epoch": 2.758085381630013, + "grad_norm": 15.898992538452148, + "learning_rate": 4.410596691047123e-06, + "loss": 0.215, + "step": 4264 + }, + { + "epoch": 2.763260025873221, + "grad_norm": 1.9979652166366577, + "learning_rate": 4.3939572552640645e-06, + "loss": 0.2083, + "step": 4272 + }, + { + "epoch": 2.7684346701164295, + "grad_norm": 1.3245702981948853, + "learning_rate": 4.377324628612123e-06, + "loss": 0.216, + "step": 4280 + }, + { + "epoch": 2.773609314359638, + "grad_norm": 4.14132022857666, + "learning_rate": 4.36069899796547e-06, + "loss": 0.2299, + "step": 4288 + }, + { + "epoch": 2.778783958602846, + "grad_norm": 22.5025691986084, + "learning_rate": 4.344080550119672e-06, + "loss": 0.2223, + "step": 4296 + }, + { + "epoch": 2.783958602846054, + "grad_norm": 4.249485492706299, + "learning_rate": 4.327469471789597e-06, + "loss": 0.2112, + "step": 4304 + }, + { + "epoch": 2.7891332470892625, + "grad_norm": 26.41204071044922, + "learning_rate": 4.310865949607311e-06, + "loss": 0.2171, + "step": 4312 + }, + { + "epoch": 2.794307891332471, + "grad_norm": 1.7798364162445068, + "learning_rate": 4.294270170119987e-06, + "loss": 0.2243, + "step": 4320 + }, + { + "epoch": 2.799482535575679, + "grad_norm": 36.40846252441406, + "learning_rate": 4.277682319787802e-06, + "loss": 0.22, + "step": 4328 + }, + { + "epoch": 2.8046571798188875, + "grad_norm": 2.6109695434570312, + "learning_rate": 4.261102584981848e-06, + "loss": 0.214, + "step": 4336 + }, + { + "epoch": 2.809831824062096, + "grad_norm": 3.024670124053955, + "learning_rate": 4.244531151982034e-06, + "loss": 0.2237, + "step": 4344 + }, + { + "epoch": 2.815006468305304, + "grad_norm": 8.061532974243164, + "learning_rate": 4.227968206974999e-06, + "loss": 0.223, + "step": 4352 + }, + { + "epoch": 2.8201811125485126, + "grad_norm": 1.6048274040222168, + "learning_rate": 4.211413936052013e-06, + "loss": 0.2302, + "step": 4360 + }, + { + "epoch": 2.8253557567917205, + "grad_norm": 1.1506285667419434, + "learning_rate": 4.194868525206887e-06, + "loss": 0.2187, + "step": 4368 + }, + { + "epoch": 2.830530401034929, + "grad_norm": 2.458400011062622, + "learning_rate": 4.178332160333891e-06, + "loss": 0.2229, + "step": 4376 + }, + { + "epoch": 2.835705045278137, + "grad_norm": 1.0294382572174072, + "learning_rate": 4.161805027225655e-06, + "loss": 0.2025, + "step": 4384 + }, + { + "epoch": 2.8408796895213455, + "grad_norm": 7.586164474487305, + "learning_rate": 4.145287311571089e-06, + "loss": 0.2221, + "step": 4392 + }, + { + "epoch": 2.8460543337645534, + "grad_norm": 1.346274733543396, + "learning_rate": 4.1287791989532935e-06, + "loss": 0.2107, + "step": 4400 + }, + { + "epoch": 2.8512289780077618, + "grad_norm": 11.22722053527832, + "learning_rate": 4.1122808748474745e-06, + "loss": 0.2115, + "step": 4408 + }, + { + "epoch": 2.85640362225097, + "grad_norm": 11.525304794311523, + "learning_rate": 4.095792524618861e-06, + "loss": 0.2143, + "step": 4416 + }, + { + "epoch": 2.8615782664941785, + "grad_norm": 1.2492775917053223, + "learning_rate": 4.079314333520623e-06, + "loss": 0.2164, + "step": 4424 + }, + { + "epoch": 2.866752910737387, + "grad_norm": 20.382579803466797, + "learning_rate": 4.062846486691784e-06, + "loss": 0.224, + "step": 4432 + }, + { + "epoch": 2.871927554980595, + "grad_norm": 1.4670060873031616, + "learning_rate": 4.04638916915515e-06, + "loss": 0.2262, + "step": 4440 + }, + { + "epoch": 2.8771021992238035, + "grad_norm": 1.079577922821045, + "learning_rate": 4.0299425658152255e-06, + "loss": 0.2128, + "step": 4448 + }, + { + "epoch": 2.882276843467012, + "grad_norm": 2.4892783164978027, + "learning_rate": 4.013506861456136e-06, + "loss": 0.2148, + "step": 4456 + }, + { + "epoch": 2.88745148771022, + "grad_norm": 43.330230712890625, + "learning_rate": 3.997082240739551e-06, + "loss": 0.1989, + "step": 4464 + }, + { + "epoch": 2.892626131953428, + "grad_norm": 1.0373423099517822, + "learning_rate": 3.9806688882026125e-06, + "loss": 0.2078, + "step": 4472 + }, + { + "epoch": 2.8978007761966365, + "grad_norm": 35.63648223876953, + "learning_rate": 3.964266988255861e-06, + "loss": 0.2311, + "step": 4480 + }, + { + "epoch": 2.902975420439845, + "grad_norm": 1.692650556564331, + "learning_rate": 3.94787672518116e-06, + "loss": 0.2041, + "step": 4488 + }, + { + "epoch": 2.908150064683053, + "grad_norm": 11.523728370666504, + "learning_rate": 3.931498283129631e-06, + "loss": 0.2085, + "step": 4496 + }, + { + "epoch": 2.913324708926261, + "grad_norm": 1.7369672060012817, + "learning_rate": 3.915131846119581e-06, + "loss": 0.2117, + "step": 4504 + }, + { + "epoch": 2.9184993531694694, + "grad_norm": 561.2520141601562, + "learning_rate": 3.898777598034434e-06, + "loss": 0.2014, + "step": 4512 + }, + { + "epoch": 2.9236739974126777, + "grad_norm": 11.942583084106445, + "learning_rate": 3.882435722620667e-06, + "loss": 0.2121, + "step": 4520 + }, + { + "epoch": 2.928848641655886, + "grad_norm": 4.81027364730835, + "learning_rate": 3.866106403485745e-06, + "loss": 0.2115, + "step": 4528 + }, + { + "epoch": 2.9340232858990944, + "grad_norm": 13.312249183654785, + "learning_rate": 3.849789824096061e-06, + "loss": 0.2025, + "step": 4536 + }, + { + "epoch": 2.939197930142303, + "grad_norm": 7.112905502319336, + "learning_rate": 3.833486167774867e-06, + "loss": 0.2168, + "step": 4544 + }, + { + "epoch": 2.944372574385511, + "grad_norm": 1.4722397327423096, + "learning_rate": 3.817195617700224e-06, + "loss": 0.2208, + "step": 4552 + }, + { + "epoch": 2.9495472186287195, + "grad_norm": 2.662931442260742, + "learning_rate": 3.800918356902936e-06, + "loss": 0.2087, + "step": 4560 + }, + { + "epoch": 2.9547218628719274, + "grad_norm": 1.9063646793365479, + "learning_rate": 3.784654568264497e-06, + "loss": 0.2112, + "step": 4568 + }, + { + "epoch": 2.9598965071151357, + "grad_norm": 3.9283578395843506, + "learning_rate": 3.768404434515038e-06, + "loss": 0.2125, + "step": 4576 + }, + { + "epoch": 2.965071151358344, + "grad_norm": 1.8885002136230469, + "learning_rate": 3.7521681382312693e-06, + "loss": 0.2217, + "step": 4584 + }, + { + "epoch": 2.9702457956015524, + "grad_norm": 7.40506649017334, + "learning_rate": 3.735945861834434e-06, + "loss": 0.2288, + "step": 4592 + }, + { + "epoch": 2.975420439844761, + "grad_norm": 3.7346508502960205, + "learning_rate": 3.7197377875882547e-06, + "loss": 0.2006, + "step": 4600 + }, + { + "epoch": 2.9805950840879687, + "grad_norm": 13.738978385925293, + "learning_rate": 3.703544097596887e-06, + "loss": 0.2208, + "step": 4608 + }, + { + "epoch": 2.985769728331177, + "grad_norm": 25.822072982788086, + "learning_rate": 3.6873649738028737e-06, + "loss": 0.2057, + "step": 4616 + }, + { + "epoch": 2.9909443725743854, + "grad_norm": 6.863162040710449, + "learning_rate": 3.671200597985104e-06, + "loss": 0.214, + "step": 4624 + }, + { + "epoch": 2.9961190168175937, + "grad_norm": 2.6450467109680176, + "learning_rate": 3.655051151756762e-06, + "loss": 0.2111, + "step": 4632 + }, + { + "epoch": 3.001293661060802, + "grad_norm": 4.778412818908691, + "learning_rate": 3.638916816563298e-06, + "loss": 0.2038, + "step": 4640 + }, + { + "epoch": 3.0064683053040104, + "grad_norm": 4.276455879211426, + "learning_rate": 3.622797773680379e-06, + "loss": 0.2208, + "step": 4648 + }, + { + "epoch": 3.011642949547219, + "grad_norm": 11.780566215515137, + "learning_rate": 3.6066942042118568e-06, + "loss": 0.2253, + "step": 4656 + }, + { + "epoch": 3.0168175937904267, + "grad_norm": 16.244783401489258, + "learning_rate": 3.5906062890877368e-06, + "loss": 0.2178, + "step": 4664 + }, + { + "epoch": 3.021992238033635, + "grad_norm": 9.579275131225586, + "learning_rate": 3.5745342090621406e-06, + "loss": 0.2088, + "step": 4672 + }, + { + "epoch": 3.0271668822768434, + "grad_norm": 2.0552473068237305, + "learning_rate": 3.5584781447112737e-06, + "loss": 0.1995, + "step": 4680 + }, + { + "epoch": 3.0323415265200517, + "grad_norm": 5.175484657287598, + "learning_rate": 3.542438276431401e-06, + "loss": 0.2117, + "step": 4688 + }, + { + "epoch": 3.03751617076326, + "grad_norm": 2.5752906799316406, + "learning_rate": 3.526414784436819e-06, + "loss": 0.2138, + "step": 4696 + }, + { + "epoch": 3.0426908150064684, + "grad_norm": 15.387189865112305, + "learning_rate": 3.510407848757828e-06, + "loss": 0.213, + "step": 4704 + }, + { + "epoch": 3.047865459249677, + "grad_norm": 1.8179436922073364, + "learning_rate": 3.494417649238713e-06, + "loss": 0.227, + "step": 4712 + }, + { + "epoch": 3.0530401034928847, + "grad_norm": 5.298356056213379, + "learning_rate": 3.47844436553572e-06, + "loss": 0.2273, + "step": 4720 + }, + { + "epoch": 3.058214747736093, + "grad_norm": 1.73037588596344, + "learning_rate": 3.462488177115041e-06, + "loss": 0.223, + "step": 4728 + }, + { + "epoch": 3.0633893919793014, + "grad_norm": 107.42707824707031, + "learning_rate": 3.4465492632507946e-06, + "loss": 0.2007, + "step": 4736 + }, + { + "epoch": 3.0685640362225097, + "grad_norm": 1.1304751634597778, + "learning_rate": 3.4306278030230143e-06, + "loss": 0.2251, + "step": 4744 + }, + { + "epoch": 3.073738680465718, + "grad_norm": 3.318490505218506, + "learning_rate": 3.4147239753156324e-06, + "loss": 0.2042, + "step": 4752 + }, + { + "epoch": 3.0789133247089264, + "grad_norm": 7.975718975067139, + "learning_rate": 3.398837958814475e-06, + "loss": 0.2036, + "step": 4760 + }, + { + "epoch": 3.0840879689521343, + "grad_norm": 6.05825662612915, + "learning_rate": 3.382969932005252e-06, + "loss": 0.2255, + "step": 4768 + }, + { + "epoch": 3.0892626131953427, + "grad_norm": 4.792303562164307, + "learning_rate": 3.367120073171548e-06, + "loss": 0.2107, + "step": 4776 + }, + { + "epoch": 3.094437257438551, + "grad_norm": 46.59291076660156, + "learning_rate": 3.351288560392833e-06, + "loss": 0.2162, + "step": 4784 + }, + { + "epoch": 3.0996119016817594, + "grad_norm": 1.4341033697128296, + "learning_rate": 3.335475571542442e-06, + "loss": 0.2031, + "step": 4792 + }, + { + "epoch": 3.1047865459249677, + "grad_norm": 0.8310636878013611, + "learning_rate": 3.3196812842855895e-06, + "loss": 0.1967, + "step": 4800 + }, + { + "epoch": 3.109961190168176, + "grad_norm": 2.6309316158294678, + "learning_rate": 3.303905876077372e-06, + "loss": 0.2109, + "step": 4808 + }, + { + "epoch": 3.1151358344113844, + "grad_norm": 1.6860774755477905, + "learning_rate": 3.28814952416077e-06, + "loss": 0.2186, + "step": 4816 + }, + { + "epoch": 3.1203104786545923, + "grad_norm": 11.298639297485352, + "learning_rate": 3.272412405564659e-06, + "loss": 0.2198, + "step": 4824 + }, + { + "epoch": 3.1254851228978007, + "grad_norm": 12.061873435974121, + "learning_rate": 3.2566946971018225e-06, + "loss": 0.2182, + "step": 4832 + }, + { + "epoch": 3.130659767141009, + "grad_norm": 13.391987800598145, + "learning_rate": 3.240996575366961e-06, + "loss": 0.2411, + "step": 4840 + }, + { + "epoch": 3.1358344113842174, + "grad_norm": 1.7577292919158936, + "learning_rate": 3.225318216734713e-06, + "loss": 0.2034, + "step": 4848 + }, + { + "epoch": 3.1410090556274257, + "grad_norm": 15.407753944396973, + "learning_rate": 3.209659797357669e-06, + "loss": 0.2177, + "step": 4856 + }, + { + "epoch": 3.146183699870634, + "grad_norm": 17.410030364990234, + "learning_rate": 3.1940214931643945e-06, + "loss": 0.2206, + "step": 4864 + }, + { + "epoch": 3.151358344113842, + "grad_norm": 24.095252990722656, + "learning_rate": 3.1784034798574514e-06, + "loss": 0.2121, + "step": 4872 + }, + { + "epoch": 3.1565329883570503, + "grad_norm": 1.7420412302017212, + "learning_rate": 3.1628059329114286e-06, + "loss": 0.2287, + "step": 4880 + }, + { + "epoch": 3.1617076326002587, + "grad_norm": 12.788291931152344, + "learning_rate": 3.1472290275709642e-06, + "loss": 0.2116, + "step": 4888 + }, + { + "epoch": 3.166882276843467, + "grad_norm": 12.375226974487305, + "learning_rate": 3.1316729388487815e-06, + "loss": 0.211, + "step": 4896 + }, + { + "epoch": 3.1720569210866754, + "grad_norm": 3.030954360961914, + "learning_rate": 3.1161378415237197e-06, + "loss": 0.2159, + "step": 4904 + }, + { + "epoch": 3.1772315653298837, + "grad_norm": 1.1043621301651, + "learning_rate": 3.1006239101387725e-06, + "loss": 0.2259, + "step": 4912 + }, + { + "epoch": 3.1824062095730916, + "grad_norm": 5.325335502624512, + "learning_rate": 3.0851313189991226e-06, + "loss": 0.2146, + "step": 4920 + }, + { + "epoch": 3.1875808538163, + "grad_norm": 2.782697916030884, + "learning_rate": 3.0696602421701943e-06, + "loss": 0.2013, + "step": 4928 + }, + { + "epoch": 3.1927554980595083, + "grad_norm": 1.1667276620864868, + "learning_rate": 3.054210853475682e-06, + "loss": 0.2264, + "step": 4936 + }, + { + "epoch": 3.1979301423027167, + "grad_norm": 0.9541177153587341, + "learning_rate": 3.0387833264956078e-06, + "loss": 0.215, + "step": 4944 + }, + { + "epoch": 3.203104786545925, + "grad_norm": 3.777299404144287, + "learning_rate": 3.02337783456437e-06, + "loss": 0.2156, + "step": 4952 + }, + { + "epoch": 3.2082794307891334, + "grad_norm": 1.8934595584869385, + "learning_rate": 3.007994550768793e-06, + "loss": 0.2144, + "step": 4960 + }, + { + "epoch": 3.2134540750323417, + "grad_norm": 1.6408132314682007, + "learning_rate": 2.9926336479461846e-06, + "loss": 0.2023, + "step": 4968 + }, + { + "epoch": 3.2186287192755496, + "grad_norm": 1.0243244171142578, + "learning_rate": 2.9772952986823943e-06, + "loss": 0.2096, + "step": 4976 + }, + { + "epoch": 3.223803363518758, + "grad_norm": 2.1264467239379883, + "learning_rate": 2.9619796753098716e-06, + "loss": 0.215, + "step": 4984 + }, + { + "epoch": 3.2289780077619663, + "grad_norm": 3.805931568145752, + "learning_rate": 2.946686949905733e-06, + "loss": 0.2156, + "step": 4992 + }, + { + "epoch": 3.2341526520051747, + "grad_norm": 4.528544902801514, + "learning_rate": 2.9314172942898257e-06, + "loss": 0.2003, + "step": 5000 + }, + { + "epoch": 3.239327296248383, + "grad_norm": 3.4873783588409424, + "learning_rate": 2.9161708800228e-06, + "loss": 0.2179, + "step": 5008 + }, + { + "epoch": 3.2445019404915914, + "grad_norm": 13.625129699707031, + "learning_rate": 2.900947878404181e-06, + "loss": 0.2089, + "step": 5016 + }, + { + "epoch": 3.2496765847347993, + "grad_norm": 0.8940442800521851, + "learning_rate": 2.8857484604704415e-06, + "loss": 0.2057, + "step": 5024 + }, + { + "epoch": 3.2548512289780076, + "grad_norm": 4.131524085998535, + "learning_rate": 2.870572796993084e-06, + "loss": 0.1919, + "step": 5032 + }, + { + "epoch": 3.260025873221216, + "grad_norm": 1.8720024824142456, + "learning_rate": 2.8554210584767188e-06, + "loss": 0.2197, + "step": 5040 + }, + { + "epoch": 3.2652005174644243, + "grad_norm": 0.6342570781707764, + "learning_rate": 2.8402934151571505e-06, + "loss": 0.2223, + "step": 5048 + }, + { + "epoch": 3.2703751617076326, + "grad_norm": 1.085699200630188, + "learning_rate": 2.8251900369994645e-06, + "loss": 0.2088, + "step": 5056 + }, + { + "epoch": 3.275549805950841, + "grad_norm": 1.371040940284729, + "learning_rate": 2.8101110936961153e-06, + "loss": 0.2165, + "step": 5064 + }, + { + "epoch": 3.2807244501940493, + "grad_norm": 1.3010331392288208, + "learning_rate": 2.795056754665028e-06, + "loss": 0.2177, + "step": 5072 + }, + { + "epoch": 3.2858990944372573, + "grad_norm": 5.235978603363037, + "learning_rate": 2.7800271890476836e-06, + "loss": 0.2108, + "step": 5080 + }, + { + "epoch": 3.2910737386804656, + "grad_norm": 1.792107105255127, + "learning_rate": 2.765022565707226e-06, + "loss": 0.237, + "step": 5088 + }, + { + "epoch": 3.296248382923674, + "grad_norm": 7.266729831695557, + "learning_rate": 2.750043053226561e-06, + "loss": 0.221, + "step": 5096 + }, + { + "epoch": 3.3014230271668823, + "grad_norm": 2.3540456295013428, + "learning_rate": 2.735088819906465e-06, + "loss": 0.2029, + "step": 5104 + }, + { + "epoch": 3.3065976714100906, + "grad_norm": 0.9405587911605835, + "learning_rate": 2.7201600337636946e-06, + "loss": 0.2356, + "step": 5112 + }, + { + "epoch": 3.311772315653299, + "grad_norm": 4.883279323577881, + "learning_rate": 2.7052568625290955e-06, + "loss": 0.208, + "step": 5120 + }, + { + "epoch": 3.316946959896507, + "grad_norm": 3.857858419418335, + "learning_rate": 2.690379473645718e-06, + "loss": 0.2149, + "step": 5128 + }, + { + "epoch": 3.3221216041397152, + "grad_norm": 1.8659663200378418, + "learning_rate": 2.675528034266941e-06, + "loss": 0.2059, + "step": 5136 + }, + { + "epoch": 3.3272962483829236, + "grad_norm": 2.2125329971313477, + "learning_rate": 2.6607027112545893e-06, + "loss": 0.2195, + "step": 5144 + }, + { + "epoch": 3.332470892626132, + "grad_norm": 2.2989087104797363, + "learning_rate": 2.645903671177058e-06, + "loss": 0.2113, + "step": 5152 + }, + { + "epoch": 3.3376455368693403, + "grad_norm": 12.40267562866211, + "learning_rate": 2.631131080307445e-06, + "loss": 0.2188, + "step": 5160 + }, + { + "epoch": 3.3428201811125486, + "grad_norm": 1.3983057737350464, + "learning_rate": 2.6163851046216813e-06, + "loss": 0.2183, + "step": 5168 + }, + { + "epoch": 3.347994825355757, + "grad_norm": 1.027703046798706, + "learning_rate": 2.6016659097966636e-06, + "loss": 0.2204, + "step": 5176 + }, + { + "epoch": 3.353169469598965, + "grad_norm": 14.928071022033691, + "learning_rate": 2.5869736612083955e-06, + "loss": 0.2067, + "step": 5184 + }, + { + "epoch": 3.3583441138421732, + "grad_norm": 2.262359857559204, + "learning_rate": 2.572308523930131e-06, + "loss": 0.2205, + "step": 5192 + }, + { + "epoch": 3.3635187580853816, + "grad_norm": 8.3280668258667, + "learning_rate": 2.557670662730515e-06, + "loss": 0.2014, + "step": 5200 + }, + { + "epoch": 3.36869340232859, + "grad_norm": 1.9972519874572754, + "learning_rate": 2.5430602420717355e-06, + "loss": 0.2165, + "step": 5208 + }, + { + "epoch": 3.3738680465717983, + "grad_norm": 1.4478007555007935, + "learning_rate": 2.528477426107678e-06, + "loss": 0.2106, + "step": 5216 + }, + { + "epoch": 3.3790426908150066, + "grad_norm": 9.525103569030762, + "learning_rate": 2.513922378682075e-06, + "loss": 0.2216, + "step": 5224 + }, + { + "epoch": 3.3842173350582145, + "grad_norm": 6.314258098602295, + "learning_rate": 2.499395263326669e-06, + "loss": 0.2161, + "step": 5232 + }, + { + "epoch": 3.389391979301423, + "grad_norm": 1.9323687553405762, + "learning_rate": 2.484896243259375e-06, + "loss": 0.2078, + "step": 5240 + }, + { + "epoch": 3.3945666235446312, + "grad_norm": 12.306028366088867, + "learning_rate": 2.470425481382447e-06, + "loss": 0.2104, + "step": 5248 + }, + { + "epoch": 3.3997412677878396, + "grad_norm": 3.2243802547454834, + "learning_rate": 2.4559831402806454e-06, + "loss": 0.2259, + "step": 5256 + }, + { + "epoch": 3.404915912031048, + "grad_norm": 9.889036178588867, + "learning_rate": 2.441569382219413e-06, + "loss": 0.2024, + "step": 5264 + }, + { + "epoch": 3.4100905562742563, + "grad_norm": 66.57870483398438, + "learning_rate": 2.427184369143051e-06, + "loss": 0.2176, + "step": 5272 + }, + { + "epoch": 3.4152652005174646, + "grad_norm": 1.0559253692626953, + "learning_rate": 2.4128282626728985e-06, + "loss": 0.2063, + "step": 5280 + }, + { + "epoch": 3.4204398447606725, + "grad_norm": 7.3964948654174805, + "learning_rate": 2.398501224105517e-06, + "loss": 0.2192, + "step": 5288 + }, + { + "epoch": 3.425614489003881, + "grad_norm": 3.4036178588867188, + "learning_rate": 2.384203414410878e-06, + "loss": 0.2082, + "step": 5296 + }, + { + "epoch": 3.4307891332470892, + "grad_norm": 1.0019922256469727, + "learning_rate": 2.3699349942305603e-06, + "loss": 0.2045, + "step": 5304 + }, + { + "epoch": 3.4359637774902976, + "grad_norm": 5.541652679443359, + "learning_rate": 2.355696123875934e-06, + "loss": 0.203, + "step": 5312 + }, + { + "epoch": 3.441138421733506, + "grad_norm": 2.078033447265625, + "learning_rate": 2.341486963326366e-06, + "loss": 0.2122, + "step": 5320 + }, + { + "epoch": 3.4463130659767143, + "grad_norm": 6.079925537109375, + "learning_rate": 2.3273076722274233e-06, + "loss": 0.196, + "step": 5328 + }, + { + "epoch": 3.451487710219922, + "grad_norm": 7.444606781005859, + "learning_rate": 2.3131584098890775e-06, + "loss": 0.2193, + "step": 5336 + }, + { + "epoch": 3.4566623544631305, + "grad_norm": 11.668577194213867, + "learning_rate": 2.299039335283914e-06, + "loss": 0.2137, + "step": 5344 + }, + { + "epoch": 3.461836998706339, + "grad_norm": 8.687463760375977, + "learning_rate": 2.2849506070453466e-06, + "loss": 0.2013, + "step": 5352 + }, + { + "epoch": 3.4670116429495472, + "grad_norm": 2.5783653259277344, + "learning_rate": 2.27089238346584e-06, + "loss": 0.1905, + "step": 5360 + }, + { + "epoch": 3.4721862871927556, + "grad_norm": 1.5522114038467407, + "learning_rate": 2.2568648224951217e-06, + "loss": 0.2213, + "step": 5368 + }, + { + "epoch": 3.477360931435964, + "grad_norm": 80.04730987548828, + "learning_rate": 2.2428680817384153e-06, + "loss": 0.2131, + "step": 5376 + }, + { + "epoch": 3.4825355756791723, + "grad_norm": 0.8574538230895996, + "learning_rate": 2.228902318454666e-06, + "loss": 0.2126, + "step": 5384 + }, + { + "epoch": 3.48771021992238, + "grad_norm": 1.0680707693099976, + "learning_rate": 2.214967689554775e-06, + "loss": 0.2076, + "step": 5392 + }, + { + "epoch": 3.4928848641655885, + "grad_norm": 0.7886340618133545, + "learning_rate": 2.201064351599837e-06, + "loss": 0.2122, + "step": 5400 + }, + { + "epoch": 3.498059508408797, + "grad_norm": 123.42720031738281, + "learning_rate": 2.18719246079938e-06, + "loss": 0.2136, + "step": 5408 + }, + { + "epoch": 3.503234152652005, + "grad_norm": 1.1535052061080933, + "learning_rate": 2.17335217300961e-06, + "loss": 0.2202, + "step": 5416 + }, + { + "epoch": 3.5084087968952136, + "grad_norm": 1.2697583436965942, + "learning_rate": 2.1595436437316614e-06, + "loss": 0.2115, + "step": 5424 + }, + { + "epoch": 3.5135834411384215, + "grad_norm": 32.163143157958984, + "learning_rate": 2.1457670281098493e-06, + "loss": 0.2159, + "step": 5432 + }, + { + "epoch": 3.51875808538163, + "grad_norm": 1.4220489263534546, + "learning_rate": 2.132022480929926e-06, + "loss": 0.2117, + "step": 5440 + }, + { + "epoch": 3.523932729624838, + "grad_norm": 5.474082946777344, + "learning_rate": 2.118310156617342e-06, + "loss": 0.2136, + "step": 5448 + }, + { + "epoch": 3.5291073738680465, + "grad_norm": 1.7098948955535889, + "learning_rate": 2.1046302092355107e-06, + "loss": 0.2094, + "step": 5456 + }, + { + "epoch": 3.534282018111255, + "grad_norm": 1.005296230316162, + "learning_rate": 2.0909827924840787e-06, + "loss": 0.2145, + "step": 5464 + }, + { + "epoch": 3.539456662354463, + "grad_norm": 0.7779061794281006, + "learning_rate": 2.0773680596971976e-06, + "loss": 0.2018, + "step": 5472 + }, + { + "epoch": 3.5446313065976716, + "grad_norm": 6.462289333343506, + "learning_rate": 2.0637861638418003e-06, + "loss": 0.2165, + "step": 5480 + }, + { + "epoch": 3.54980595084088, + "grad_norm": 1.7986319065093994, + "learning_rate": 2.0502372575158865e-06, + "loss": 0.2066, + "step": 5488 + }, + { + "epoch": 3.554980595084088, + "grad_norm": 1.3168617486953735, + "learning_rate": 2.0367214929468036e-06, + "loss": 0.203, + "step": 5496 + }, + { + "epoch": 3.560155239327296, + "grad_norm": 1.542401671409607, + "learning_rate": 2.0232390219895364e-06, + "loss": 0.2134, + "step": 5504 + }, + { + "epoch": 3.5653298835705045, + "grad_norm": 1.3326711654663086, + "learning_rate": 2.009789996125009e-06, + "loss": 0.2285, + "step": 5512 + }, + { + "epoch": 3.570504527813713, + "grad_norm": 6.727497100830078, + "learning_rate": 1.99637456645837e-06, + "loss": 0.2263, + "step": 5520 + }, + { + "epoch": 3.575679172056921, + "grad_norm": 1.8087265491485596, + "learning_rate": 1.982992883717304e-06, + "loss": 0.2017, + "step": 5528 + }, + { + "epoch": 3.580853816300129, + "grad_norm": 26.89634132385254, + "learning_rate": 1.9696450982503356e-06, + "loss": 0.2256, + "step": 5536 + }, + { + "epoch": 3.5860284605433375, + "grad_norm": 5.575590133666992, + "learning_rate": 1.95633136002514e-06, + "loss": 0.2184, + "step": 5544 + }, + { + "epoch": 3.591203104786546, + "grad_norm": 4.252037525177002, + "learning_rate": 1.943051818626857e-06, + "loss": 0.2112, + "step": 5552 + }, + { + "epoch": 3.596377749029754, + "grad_norm": 3.719069480895996, + "learning_rate": 1.9298066232564135e-06, + "loss": 0.206, + "step": 5560 + }, + { + "epoch": 3.6015523932729625, + "grad_norm": 1.1236941814422607, + "learning_rate": 1.916595922728843e-06, + "loss": 0.2156, + "step": 5568 + }, + { + "epoch": 3.606727037516171, + "grad_norm": 26.330717086791992, + "learning_rate": 1.9034198654716163e-06, + "loss": 0.2223, + "step": 5576 + }, + { + "epoch": 3.611901681759379, + "grad_norm": 1.9026235342025757, + "learning_rate": 1.890278599522975e-06, + "loss": 0.2032, + "step": 5584 + }, + { + "epoch": 3.6170763260025875, + "grad_norm": 5.766218662261963, + "learning_rate": 1.8771722725302644e-06, + "loss": 0.2259, + "step": 5592 + }, + { + "epoch": 3.6222509702457955, + "grad_norm": 1.3441072702407837, + "learning_rate": 1.864101031748277e-06, + "loss": 0.2295, + "step": 5600 + }, + { + "epoch": 3.627425614489004, + "grad_norm": 2.0641391277313232, + "learning_rate": 1.8510650240376e-06, + "loss": 0.2177, + "step": 5608 + }, + { + "epoch": 3.632600258732212, + "grad_norm": 0.6838395595550537, + "learning_rate": 1.8380643958629596e-06, + "loss": 0.2103, + "step": 5616 + }, + { + "epoch": 3.6377749029754205, + "grad_norm": 1.7583774328231812, + "learning_rate": 1.8250992932915811e-06, + "loss": 0.2074, + "step": 5624 + }, + { + "epoch": 3.642949547218629, + "grad_norm": 4.7582879066467285, + "learning_rate": 1.8121698619915457e-06, + "loss": 0.2259, + "step": 5632 + }, + { + "epoch": 3.6481241914618368, + "grad_norm": 2116.8251953125, + "learning_rate": 1.7992762472301511e-06, + "loss": 0.1989, + "step": 5640 + }, + { + "epoch": 3.653298835705045, + "grad_norm": 4.6195526123046875, + "learning_rate": 1.7864185938722868e-06, + "loss": 0.22, + "step": 5648 + }, + { + "epoch": 3.6584734799482534, + "grad_norm": 1.0362610816955566, + "learning_rate": 1.7735970463787967e-06, + "loss": 0.22, + "step": 5656 + }, + { + "epoch": 3.663648124191462, + "grad_norm": 2.249736785888672, + "learning_rate": 1.7608117488048636e-06, + "loss": 0.2207, + "step": 5664 + }, + { + "epoch": 3.66882276843467, + "grad_norm": 1.8665297031402588, + "learning_rate": 1.7480628447983878e-06, + "loss": 0.2017, + "step": 5672 + }, + { + "epoch": 3.6739974126778785, + "grad_norm": 1.555904746055603, + "learning_rate": 1.735350477598372e-06, + "loss": 0.225, + "step": 5680 + }, + { + "epoch": 3.679172056921087, + "grad_norm": 18.196121215820312, + "learning_rate": 1.7226747900333135e-06, + "loss": 0.2245, + "step": 5688 + }, + { + "epoch": 3.684346701164295, + "grad_norm": 2.0208940505981445, + "learning_rate": 1.7100359245196035e-06, + "loss": 0.2216, + "step": 5696 + }, + { + "epoch": 3.689521345407503, + "grad_norm": 10.489842414855957, + "learning_rate": 1.6974340230599173e-06, + "loss": 0.2157, + "step": 5704 + }, + { + "epoch": 3.6946959896507114, + "grad_norm": 24.985515594482422, + "learning_rate": 1.6848692272416268e-06, + "loss": 0.2045, + "step": 5712 + }, + { + "epoch": 3.69987063389392, + "grad_norm": 3.337186574935913, + "learning_rate": 1.6723416782352076e-06, + "loss": 0.2204, + "step": 5720 + }, + { + "epoch": 3.705045278137128, + "grad_norm": 1.8910478353500366, + "learning_rate": 1.659851516792651e-06, + "loss": 0.1995, + "step": 5728 + }, + { + "epoch": 3.7102199223803365, + "grad_norm": 52.98031234741211, + "learning_rate": 1.647398883245886e-06, + "loss": 0.2094, + "step": 5736 + }, + { + "epoch": 3.7153945666235444, + "grad_norm": 5.706784248352051, + "learning_rate": 1.6349839175051995e-06, + "loss": 0.2059, + "step": 5744 + }, + { + "epoch": 3.7205692108667527, + "grad_norm": 13.464763641357422, + "learning_rate": 1.622606759057666e-06, + "loss": 0.2075, + "step": 5752 + }, + { + "epoch": 3.725743855109961, + "grad_norm": 2.077737331390381, + "learning_rate": 1.610267546965581e-06, + "loss": 0.2226, + "step": 5760 + }, + { + "epoch": 3.7309184993531694, + "grad_norm": 2.427469253540039, + "learning_rate": 1.5979664198648959e-06, + "loss": 0.2196, + "step": 5768 + }, + { + "epoch": 3.736093143596378, + "grad_norm": 1.3271838426589966, + "learning_rate": 1.5857035159636625e-06, + "loss": 0.2108, + "step": 5776 + }, + { + "epoch": 3.741267787839586, + "grad_norm": 1.4195632934570312, + "learning_rate": 1.5734789730404815e-06, + "loss": 0.2141, + "step": 5784 + }, + { + "epoch": 3.7464424320827945, + "grad_norm": 9.39681625366211, + "learning_rate": 1.5612929284429484e-06, + "loss": 0.2122, + "step": 5792 + }, + { + "epoch": 3.751617076326003, + "grad_norm": 9.311136245727539, + "learning_rate": 1.549145519086122e-06, + "loss": 0.212, + "step": 5800 + }, + { + "epoch": 3.7567917205692107, + "grad_norm": 0.5969098210334778, + "learning_rate": 1.5370368814509727e-06, + "loss": 0.2131, + "step": 5808 + }, + { + "epoch": 3.761966364812419, + "grad_norm": 9.47191047668457, + "learning_rate": 1.5249671515828569e-06, + "loss": 0.2289, + "step": 5816 + }, + { + "epoch": 3.7671410090556274, + "grad_norm": 1.388187050819397, + "learning_rate": 1.5129364650899869e-06, + "loss": 0.206, + "step": 5824 + }, + { + "epoch": 3.772315653298836, + "grad_norm": 1.3709971904754639, + "learning_rate": 1.5009449571419077e-06, + "loss": 0.2053, + "step": 5832 + }, + { + "epoch": 3.777490297542044, + "grad_norm": 8.477566719055176, + "learning_rate": 1.4889927624679762e-06, + "loss": 0.2137, + "step": 5840 + }, + { + "epoch": 3.782664941785252, + "grad_norm": 17.933977127075195, + "learning_rate": 1.4770800153558513e-06, + "loss": 0.2123, + "step": 5848 + }, + { + "epoch": 3.7878395860284604, + "grad_norm": 1.4228250980377197, + "learning_rate": 1.4652068496499804e-06, + "loss": 0.2051, + "step": 5856 + }, + { + "epoch": 3.7930142302716687, + "grad_norm": 9.45233154296875, + "learning_rate": 1.4533733987501004e-06, + "loss": 0.2242, + "step": 5864 + }, + { + "epoch": 3.798188874514877, + "grad_norm": 1.5488440990447998, + "learning_rate": 1.4415797956097356e-06, + "loss": 0.2251, + "step": 5872 + }, + { + "epoch": 3.8033635187580854, + "grad_norm": 2.1524927616119385, + "learning_rate": 1.4298261727347034e-06, + "loss": 0.2213, + "step": 5880 + }, + { + "epoch": 3.8085381630012938, + "grad_norm": 3.4990813732147217, + "learning_rate": 1.41811266218163e-06, + "loss": 0.2038, + "step": 5888 + }, + { + "epoch": 3.813712807244502, + "grad_norm": 8.138912200927734, + "learning_rate": 1.4064393955564615e-06, + "loss": 0.2139, + "step": 5896 + }, + { + "epoch": 3.8188874514877105, + "grad_norm": 19.545347213745117, + "learning_rate": 1.3948065040129882e-06, + "loss": 0.2099, + "step": 5904 + }, + { + "epoch": 3.8240620957309184, + "grad_norm": 2.30070161819458, + "learning_rate": 1.3832141182513699e-06, + "loss": 0.2154, + "step": 5912 + }, + { + "epoch": 3.8292367399741267, + "grad_norm": 1.3020079135894775, + "learning_rate": 1.3716623685166685e-06, + "loss": 0.2164, + "step": 5920 + }, + { + "epoch": 3.834411384217335, + "grad_norm": 1.0856869220733643, + "learning_rate": 1.3601513845973835e-06, + "loss": 0.2108, + "step": 5928 + }, + { + "epoch": 3.8395860284605434, + "grad_norm": 4.268250465393066, + "learning_rate": 1.3486812958239931e-06, + "loss": 0.2157, + "step": 5936 + }, + { + "epoch": 3.8447606727037518, + "grad_norm": 8.244180679321289, + "learning_rate": 1.3372522310675063e-06, + "loss": 0.2101, + "step": 5944 + }, + { + "epoch": 3.8499353169469597, + "grad_norm": 5.211091041564941, + "learning_rate": 1.3258643187380071e-06, + "loss": 0.2058, + "step": 5952 + }, + { + "epoch": 3.855109961190168, + "grad_norm": 8.61664867401123, + "learning_rate": 1.3145176867832165e-06, + "loss": 0.2121, + "step": 5960 + }, + { + "epoch": 3.8602846054333764, + "grad_norm": 0.9246022701263428, + "learning_rate": 1.3032124626870546e-06, + "loss": 0.2136, + "step": 5968 + }, + { + "epoch": 3.8654592496765847, + "grad_norm": 4.238160133361816, + "learning_rate": 1.2919487734682073e-06, + "loss": 0.2043, + "step": 5976 + }, + { + "epoch": 3.870633893919793, + "grad_norm": 14.910839080810547, + "learning_rate": 1.2807267456787004e-06, + "loss": 0.2241, + "step": 5984 + }, + { + "epoch": 3.8758085381630014, + "grad_norm": 9.794402122497559, + "learning_rate": 1.2695465054024752e-06, + "loss": 0.221, + "step": 5992 + }, + { + "epoch": 3.8809831824062098, + "grad_norm": 0.7135350108146667, + "learning_rate": 1.2584081782539764e-06, + "loss": 0.2041, + "step": 6000 + }, + { + "epoch": 3.886157826649418, + "grad_norm": 5.769599914550781, + "learning_rate": 1.247311889376736e-06, + "loss": 0.2096, + "step": 6008 + }, + { + "epoch": 3.891332470892626, + "grad_norm": 2.1306018829345703, + "learning_rate": 1.2362577634419692e-06, + "loss": 0.1968, + "step": 6016 + }, + { + "epoch": 3.8965071151358344, + "grad_norm": 3.706986427307129, + "learning_rate": 1.2252459246471754e-06, + "loss": 0.2189, + "step": 6024 + }, + { + "epoch": 3.9016817593790427, + "grad_norm": 5.8149309158325195, + "learning_rate": 1.2142764967147385e-06, + "loss": 0.216, + "step": 6032 + }, + { + "epoch": 3.906856403622251, + "grad_norm": 27.880945205688477, + "learning_rate": 1.2033496028905445e-06, + "loss": 0.2085, + "step": 6040 + }, + { + "epoch": 3.9120310478654594, + "grad_norm": 9.760760307312012, + "learning_rate": 1.1924653659425862e-06, + "loss": 0.2212, + "step": 6048 + }, + { + "epoch": 3.9172056921086673, + "grad_norm": 3.9804182052612305, + "learning_rate": 1.1816239081595926e-06, + "loss": 0.202, + "step": 6056 + }, + { + "epoch": 3.9223803363518757, + "grad_norm": 0.8538228273391724, + "learning_rate": 1.1708253513496504e-06, + "loss": 0.2036, + "step": 6064 + }, + { + "epoch": 3.927554980595084, + "grad_norm": 0.9339475035667419, + "learning_rate": 1.160069816838838e-06, + "loss": 0.2068, + "step": 6072 + }, + { + "epoch": 3.9327296248382924, + "grad_norm": 30.274492263793945, + "learning_rate": 1.1493574254698598e-06, + "loss": 0.1915, + "step": 6080 + }, + { + "epoch": 3.9379042690815007, + "grad_norm": 30.7927303314209, + "learning_rate": 1.1386882976006897e-06, + "loss": 0.2005, + "step": 6088 + }, + { + "epoch": 3.943078913324709, + "grad_norm": 6.506274700164795, + "learning_rate": 1.128062553103223e-06, + "loss": 0.2143, + "step": 6096 + }, + { + "epoch": 3.9482535575679174, + "grad_norm": 1.3298203945159912, + "learning_rate": 1.1174803113619204e-06, + "loss": 0.1939, + "step": 6104 + }, + { + "epoch": 3.9534282018111258, + "grad_norm": 48.32373809814453, + "learning_rate": 1.106941691272474e-06, + "loss": 0.2189, + "step": 6112 + }, + { + "epoch": 3.9586028460543337, + "grad_norm": 1.674280047416687, + "learning_rate": 1.0964468112404691e-06, + "loss": 0.2052, + "step": 6120 + }, + { + "epoch": 3.963777490297542, + "grad_norm": 44.19257736206055, + "learning_rate": 1.0859957891800548e-06, + "loss": 0.2268, + "step": 6128 + }, + { + "epoch": 3.9689521345407504, + "grad_norm": 32.45912170410156, + "learning_rate": 1.075588742512617e-06, + "loss": 0.2087, + "step": 6136 + }, + { + "epoch": 3.9741267787839587, + "grad_norm": 18.419063568115234, + "learning_rate": 1.0652257881654625e-06, + "loss": 0.215, + "step": 6144 + }, + { + "epoch": 3.9793014230271666, + "grad_norm": 1.7867515087127686, + "learning_rate": 1.0549070425705017e-06, + "loss": 0.2029, + "step": 6152 + }, + { + "epoch": 3.984476067270375, + "grad_norm": 10.934088706970215, + "learning_rate": 1.0446326216629422e-06, + "loss": 0.2314, + "step": 6160 + }, + { + "epoch": 3.9896507115135833, + "grad_norm": 0.9720279574394226, + "learning_rate": 1.0344026408799868e-06, + "loss": 0.2087, + "step": 6168 + }, + { + "epoch": 3.9948253557567917, + "grad_norm": 0.8267812728881836, + "learning_rate": 1.0242172151595365e-06, + "loss": 0.2066, + "step": 6176 + }, + { + "epoch": 4.0, + "grad_norm": 1.8741275072097778, + "learning_rate": 1.0140764589388963e-06, + "loss": 0.2009, + "step": 6184 + }, + { + "epoch": 4.005174644243208, + "grad_norm": 1.0776174068450928, + "learning_rate": 1.003980486153494e-06, + "loss": 0.2154, + "step": 6192 + }, + { + "epoch": 4.010349288486417, + "grad_norm": 24.826271057128906, + "learning_rate": 9.939294102355957e-07, + "loss": 0.1968, + "step": 6200 + }, + { + "epoch": 4.015523932729625, + "grad_norm": 1.9425315856933594, + "learning_rate": 9.839233441130353e-07, + "loss": 0.2168, + "step": 6208 + }, + { + "epoch": 4.020698576972833, + "grad_norm": 1.4220765829086304, + "learning_rate": 9.739624002079412e-07, + "loss": 0.2137, + "step": 6216 + }, + { + "epoch": 4.025873221216042, + "grad_norm": 3.797436237335205, + "learning_rate": 9.640466904354778e-07, + "loss": 0.2009, + "step": 6224 + }, + { + "epoch": 4.03104786545925, + "grad_norm": 10.675097465515137, + "learning_rate": 9.541763262025866e-07, + "loss": 0.1992, + "step": 6232 + }, + { + "epoch": 4.0362225097024576, + "grad_norm": 16.90818214416504, + "learning_rate": 9.443514184067326e-07, + "loss": 0.201, + "step": 6240 + }, + { + "epoch": 4.041397153945666, + "grad_norm": 5.427804470062256, + "learning_rate": 9.345720774346589e-07, + "loss": 0.2199, + "step": 6248 + }, + { + "epoch": 4.046571798188874, + "grad_norm": 10.079002380371094, + "learning_rate": 9.248384131611493e-07, + "loss": 0.2155, + "step": 6256 + }, + { + "epoch": 4.051746442432083, + "grad_norm": 2.3871824741363525, + "learning_rate": 9.151505349477901e-07, + "loss": 0.1933, + "step": 6264 + }, + { + "epoch": 4.056921086675291, + "grad_norm": 4.908697605133057, + "learning_rate": 9.055085516417439e-07, + "loss": 0.212, + "step": 6272 + }, + { + "epoch": 4.062095730918499, + "grad_norm": 0.8784133791923523, + "learning_rate": 8.959125715745248e-07, + "loss": 0.2121, + "step": 6280 + }, + { + "epoch": 4.067270375161708, + "grad_norm": 4.069620132446289, + "learning_rate": 8.863627025607835e-07, + "loss": 0.2138, + "step": 6288 + }, + { + "epoch": 4.072445019404916, + "grad_norm": 14.950401306152344, + "learning_rate": 8.768590518970938e-07, + "loss": 0.2108, + "step": 6296 + }, + { + "epoch": 4.077619663648124, + "grad_norm": 10.423088073730469, + "learning_rate": 8.674017263607488e-07, + "loss": 0.2062, + "step": 6304 + }, + { + "epoch": 4.082794307891333, + "grad_norm": 39.73945617675781, + "learning_rate": 8.57990832208559e-07, + "loss": 0.2094, + "step": 6312 + }, + { + "epoch": 4.087968952134541, + "grad_norm": 8.358613967895508, + "learning_rate": 8.486264751756607e-07, + "loss": 0.2154, + "step": 6320 + }, + { + "epoch": 4.093143596377749, + "grad_norm": 1.2010987997055054, + "learning_rate": 8.393087604743283e-07, + "loss": 0.2063, + "step": 6328 + }, + { + "epoch": 4.098318240620958, + "grad_norm": 7.8026323318481445, + "learning_rate": 8.300377927927888e-07, + "loss": 0.2106, + "step": 6336 + }, + { + "epoch": 4.103492884864165, + "grad_norm": 3.521782159805298, + "learning_rate": 8.208136762940489e-07, + "loss": 0.2258, + "step": 6344 + }, + { + "epoch": 4.1086675291073735, + "grad_norm": 2.0065267086029053, + "learning_rate": 8.116365146147243e-07, + "loss": 0.2187, + "step": 6352 + }, + { + "epoch": 4.113842173350582, + "grad_norm": 1.1906496286392212, + "learning_rate": 8.025064108638742e-07, + "loss": 0.1985, + "step": 6360 + }, + { + "epoch": 4.11901681759379, + "grad_norm": 10.423641204833984, + "learning_rate": 7.934234676218411e-07, + "loss": 0.2114, + "step": 6368 + }, + { + "epoch": 4.124191461836999, + "grad_norm": 3.763667345046997, + "learning_rate": 7.843877869391053e-07, + "loss": 0.2217, + "step": 6376 + }, + { + "epoch": 4.129366106080207, + "grad_norm": 3.3933229446411133, + "learning_rate": 7.753994703351298e-07, + "loss": 0.2197, + "step": 6384 + }, + { + "epoch": 4.134540750323415, + "grad_norm": 2.327582836151123, + "learning_rate": 7.664586187972234e-07, + "loss": 0.2126, + "step": 6392 + }, + { + "epoch": 4.139715394566624, + "grad_norm": 85.6529541015625, + "learning_rate": 7.575653327794075e-07, + "loss": 0.2228, + "step": 6400 + }, + { + "epoch": 4.144890038809832, + "grad_norm": 33.25102615356445, + "learning_rate": 7.48719712201284e-07, + "loss": 0.2031, + "step": 6408 + }, + { + "epoch": 4.15006468305304, + "grad_norm": 8.674571990966797, + "learning_rate": 7.399218564469174e-07, + "loss": 0.2038, + "step": 6416 + }, + { + "epoch": 4.155239327296249, + "grad_norm": 10.107747077941895, + "learning_rate": 7.311718643637134e-07, + "loss": 0.199, + "step": 6424 + }, + { + "epoch": 4.160413971539457, + "grad_norm": 3.600902795791626, + "learning_rate": 7.224698342613096e-07, + "loss": 0.2008, + "step": 6432 + }, + { + "epoch": 4.165588615782665, + "grad_norm": 0.6561605334281921, + "learning_rate": 7.138158639104748e-07, + "loss": 0.2107, + "step": 6440 + }, + { + "epoch": 4.170763260025873, + "grad_norm": 1.277238130569458, + "learning_rate": 7.052100505420051e-07, + "loss": 0.2072, + "step": 6448 + }, + { + "epoch": 4.175937904269081, + "grad_norm": 12.70361614227295, + "learning_rate": 6.96652490845634e-07, + "loss": 0.2003, + "step": 6456 + }, + { + "epoch": 4.1811125485122895, + "grad_norm": 45.80218505859375, + "learning_rate": 6.881432809689459e-07, + "loss": 0.2014, + "step": 6464 + }, + { + "epoch": 4.186287192755498, + "grad_norm": 8.82374095916748, + "learning_rate": 6.796825165162951e-07, + "loss": 0.2092, + "step": 6472 + }, + { + "epoch": 4.191461836998706, + "grad_norm": 1.1906932592391968, + "learning_rate": 6.712702925477343e-07, + "loss": 0.2058, + "step": 6480 + }, + { + "epoch": 4.196636481241915, + "grad_norm": 19.203519821166992, + "learning_rate": 6.62906703577943e-07, + "loss": 0.2051, + "step": 6488 + }, + { + "epoch": 4.201811125485123, + "grad_norm": 3.1207361221313477, + "learning_rate": 6.545918435751669e-07, + "loss": 0.1995, + "step": 6496 + }, + { + "epoch": 4.206985769728331, + "grad_norm": 0.9524929523468018, + "learning_rate": 6.463258059601635e-07, + "loss": 0.2109, + "step": 6504 + }, + { + "epoch": 4.21216041397154, + "grad_norm": 1.476508378982544, + "learning_rate": 6.381086836051498e-07, + "loss": 0.2117, + "step": 6512 + }, + { + "epoch": 4.217335058214748, + "grad_norm": 2.5996196269989014, + "learning_rate": 6.299405688327631e-07, + "loss": 0.1894, + "step": 6520 + }, + { + "epoch": 4.222509702457956, + "grad_norm": 3.0487098693847656, + "learning_rate": 6.218215534150185e-07, + "loss": 0.2065, + "step": 6528 + }, + { + "epoch": 4.227684346701165, + "grad_norm": 4.056415557861328, + "learning_rate": 6.137517285722816e-07, + "loss": 0.2122, + "step": 6536 + }, + { + "epoch": 4.232858990944372, + "grad_norm": 111.7614974975586, + "learning_rate": 6.057311849722419e-07, + "loss": 0.2201, + "step": 6544 + }, + { + "epoch": 4.2380336351875805, + "grad_norm": 2.33253812789917, + "learning_rate": 5.977600127288941e-07, + "loss": 0.2137, + "step": 6552 + }, + { + "epoch": 4.243208279430789, + "grad_norm": 2.283048391342163, + "learning_rate": 5.898383014015275e-07, + "loss": 0.2056, + "step": 6560 + }, + { + "epoch": 4.248382923673997, + "grad_norm": 3.730635643005371, + "learning_rate": 5.81966139993716e-07, + "loss": 0.2076, + "step": 6568 + }, + { + "epoch": 4.2535575679172055, + "grad_norm": 9.798884391784668, + "learning_rate": 5.741436169523234e-07, + "loss": 0.2268, + "step": 6576 + }, + { + "epoch": 4.258732212160414, + "grad_norm": 0.8087504506111145, + "learning_rate": 5.663708201665041e-07, + "loss": 0.2049, + "step": 6584 + }, + { + "epoch": 4.263906856403622, + "grad_norm": 1.3564224243164062, + "learning_rate": 5.586478369667203e-07, + "loss": 0.2138, + "step": 6592 + }, + { + "epoch": 4.269081500646831, + "grad_norm": 3.182577610015869, + "learning_rate": 5.50974754123757e-07, + "loss": 0.2055, + "step": 6600 + }, + { + "epoch": 4.274256144890039, + "grad_norm": 21.311397552490234, + "learning_rate": 5.433516578477504e-07, + "loss": 0.2154, + "step": 6608 + }, + { + "epoch": 4.279430789133247, + "grad_norm": 1.5569988489151, + "learning_rate": 5.357786337872168e-07, + "loss": 0.2202, + "step": 6616 + }, + { + "epoch": 4.284605433376456, + "grad_norm": 1.3208539485931396, + "learning_rate": 5.282557670280914e-07, + "loss": 0.2076, + "step": 6624 + }, + { + "epoch": 4.289780077619664, + "grad_norm": 4.689513206481934, + "learning_rate": 5.207831420927722e-07, + "loss": 0.207, + "step": 6632 + }, + { + "epoch": 4.294954721862872, + "grad_norm": 2.186889410018921, + "learning_rate": 5.133608429391706e-07, + "loss": 0.2065, + "step": 6640 + }, + { + "epoch": 4.300129366106081, + "grad_norm": 59.678680419921875, + "learning_rate": 5.059889529597678e-07, + "loss": 0.2117, + "step": 6648 + }, + { + "epoch": 4.305304010349288, + "grad_norm": 2.851491689682007, + "learning_rate": 4.986675549806769e-07, + "loss": 0.2017, + "step": 6656 + }, + { + "epoch": 4.3104786545924965, + "grad_norm": 3.045703172683716, + "learning_rate": 4.913967312607154e-07, + "loss": 0.1905, + "step": 6664 + }, + { + "epoch": 4.315653298835705, + "grad_norm": 13.042987823486328, + "learning_rate": 4.841765634904777e-07, + "loss": 0.2081, + "step": 6672 + }, + { + "epoch": 4.320827943078913, + "grad_norm": 2.1927504539489746, + "learning_rate": 4.770071327914177e-07, + "loss": 0.211, + "step": 6680 + }, + { + "epoch": 4.3260025873221215, + "grad_norm": 2.328122615814209, + "learning_rate": 4.6988851971493886e-07, + "loss": 0.2201, + "step": 6688 + }, + { + "epoch": 4.33117723156533, + "grad_norm": 1.7505789995193481, + "learning_rate": 4.628208042414889e-07, + "loss": 0.211, + "step": 6696 + }, + { + "epoch": 4.336351875808538, + "grad_norm": 1.5832010507583618, + "learning_rate": 4.558040657796603e-07, + "loss": 0.2, + "step": 6704 + }, + { + "epoch": 4.3415265200517466, + "grad_norm": 1.8618698120117188, + "learning_rate": 4.4883838316529816e-07, + "loss": 0.2205, + "step": 6712 + }, + { + "epoch": 4.346701164294955, + "grad_norm": 22.6337833404541, + "learning_rate": 4.4192383466061583e-07, + "loss": 0.2088, + "step": 6720 + }, + { + "epoch": 4.351875808538163, + "grad_norm": 35.79507827758789, + "learning_rate": 4.350604979533135e-07, + "loss": 0.2025, + "step": 6728 + }, + { + "epoch": 4.357050452781372, + "grad_norm": 4.392690181732178, + "learning_rate": 4.2824845015570713e-07, + "loss": 0.2048, + "step": 6736 + }, + { + "epoch": 4.36222509702458, + "grad_norm": 4.461722373962402, + "learning_rate": 4.214877678038609e-07, + "loss": 0.1996, + "step": 6744 + }, + { + "epoch": 4.367399741267787, + "grad_norm": 1.0779790878295898, + "learning_rate": 4.1477852685672895e-07, + "loss": 0.2073, + "step": 6752 + }, + { + "epoch": 4.372574385510996, + "grad_norm": 1.209999680519104, + "learning_rate": 4.0812080269529983e-07, + "loss": 0.2101, + "step": 6760 + }, + { + "epoch": 4.377749029754204, + "grad_norm": 5.77162504196167, + "learning_rate": 4.015146701217493e-07, + "loss": 0.2166, + "step": 6768 + }, + { + "epoch": 4.3829236739974125, + "grad_norm": 1.2013427019119263, + "learning_rate": 3.949602033586047e-07, + "loss": 0.2081, + "step": 6776 + }, + { + "epoch": 4.388098318240621, + "grad_norm": 0.7678247690200806, + "learning_rate": 3.884574760479037e-07, + "loss": 0.2123, + "step": 6784 + }, + { + "epoch": 4.393272962483829, + "grad_norm": 3.831841468811035, + "learning_rate": 3.820065612503732e-07, + "loss": 0.2085, + "step": 6792 + }, + { + "epoch": 4.3984476067270375, + "grad_norm": 0.9764569401741028, + "learning_rate": 3.756075314446045e-07, + "loss": 0.2101, + "step": 6800 + }, + { + "epoch": 4.403622250970246, + "grad_norm": 13.396934509277344, + "learning_rate": 3.6926045852624106e-07, + "loss": 0.1976, + "step": 6808 + }, + { + "epoch": 4.408796895213454, + "grad_norm": 5.996229648590088, + "learning_rate": 3.629654138071692e-07, + "loss": 0.2099, + "step": 6816 + }, + { + "epoch": 4.4139715394566625, + "grad_norm": 4.443068981170654, + "learning_rate": 3.56722468014718e-07, + "loss": 0.2184, + "step": 6824 + }, + { + "epoch": 4.419146183699871, + "grad_norm": 5.215079307556152, + "learning_rate": 3.505316912908668e-07, + "loss": 0.2063, + "step": 6832 + }, + { + "epoch": 4.424320827943079, + "grad_norm": 32.11725616455078, + "learning_rate": 3.443931531914507e-07, + "loss": 0.2137, + "step": 6840 + }, + { + "epoch": 4.429495472186288, + "grad_norm": 7.726386547088623, + "learning_rate": 3.3830692268538637e-07, + "loss": 0.2148, + "step": 6848 + }, + { + "epoch": 4.434670116429496, + "grad_norm": 11.44247055053711, + "learning_rate": 3.3227306815389213e-07, + "loss": 0.2148, + "step": 6856 + }, + { + "epoch": 4.439844760672703, + "grad_norm": 4.2190022468566895, + "learning_rate": 3.262916573897218e-07, + "loss": 0.2155, + "step": 6864 + }, + { + "epoch": 4.445019404915912, + "grad_norm": 7.056035995483398, + "learning_rate": 3.2036275759640245e-07, + "loss": 0.2086, + "step": 6872 + }, + { + "epoch": 4.45019404915912, + "grad_norm": 2.7552056312561035, + "learning_rate": 3.1448643538748045e-07, + "loss": 0.2033, + "step": 6880 + }, + { + "epoch": 4.455368693402328, + "grad_norm": 12.676017761230469, + "learning_rate": 3.086627567857703e-07, + "loss": 0.2127, + "step": 6888 + }, + { + "epoch": 4.460543337645537, + "grad_norm": 15.750905990600586, + "learning_rate": 3.0289178722261726e-07, + "loss": 0.2152, + "step": 6896 + }, + { + "epoch": 4.465717981888745, + "grad_norm": 83.92919921875, + "learning_rate": 2.9717359153715707e-07, + "loss": 0.2194, + "step": 6904 + }, + { + "epoch": 4.4708926261319535, + "grad_norm": 26.0535945892334, + "learning_rate": 2.9150823397559094e-07, + "loss": 0.1975, + "step": 6912 + }, + { + "epoch": 4.476067270375162, + "grad_norm": 3.255311965942383, + "learning_rate": 2.8589577819046364e-07, + "loss": 0.1951, + "step": 6920 + }, + { + "epoch": 4.48124191461837, + "grad_norm": 4.512423515319824, + "learning_rate": 2.8033628723994623e-07, + "loss": 0.2066, + "step": 6928 + }, + { + "epoch": 4.4864165588615785, + "grad_norm": 1.1333099603652954, + "learning_rate": 2.7482982358712885e-07, + "loss": 0.2114, + "step": 6936 + }, + { + "epoch": 4.491591203104787, + "grad_norm": 1.0661486387252808, + "learning_rate": 2.6937644909931893e-07, + "loss": 0.2166, + "step": 6944 + }, + { + "epoch": 4.496765847347995, + "grad_norm": 2.0188798904418945, + "learning_rate": 2.639762250473482e-07, + "loss": 0.2005, + "step": 6952 + }, + { + "epoch": 4.501940491591203, + "grad_norm": 5.197384357452393, + "learning_rate": 2.5862921210487833e-07, + "loss": 0.2232, + "step": 6960 + }, + { + "epoch": 4.507115135834411, + "grad_norm": 5.651261806488037, + "learning_rate": 2.5333547034772645e-07, + "loss": 0.2152, + "step": 6968 + }, + { + "epoch": 4.512289780077619, + "grad_norm": 0.9246218800544739, + "learning_rate": 2.480950592531844e-07, + "loss": 0.2058, + "step": 6976 + }, + { + "epoch": 4.517464424320828, + "grad_norm": 1.335443377494812, + "learning_rate": 2.429080376993537e-07, + "loss": 0.2143, + "step": 6984 + }, + { + "epoch": 4.522639068564036, + "grad_norm": 1.77558434009552, + "learning_rate": 2.37774463964483e-07, + "loss": 0.2133, + "step": 6992 + }, + { + "epoch": 4.527813712807244, + "grad_norm": 1.7683552503585815, + "learning_rate": 2.3269439572631448e-07, + "loss": 0.2065, + "step": 7000 + }, + { + "epoch": 4.532988357050453, + "grad_norm": 2.1491098403930664, + "learning_rate": 2.2766789006143265e-07, + "loss": 0.2064, + "step": 7008 + }, + { + "epoch": 4.538163001293661, + "grad_norm": 1.7803951501846313, + "learning_rate": 2.226950034446279e-07, + "loss": 0.1952, + "step": 7016 + }, + { + "epoch": 4.5433376455368695, + "grad_norm": 3.2635340690612793, + "learning_rate": 2.1777579174825703e-07, + "loss": 0.2197, + "step": 7024 + }, + { + "epoch": 4.548512289780078, + "grad_norm": 9.03862476348877, + "learning_rate": 2.1291031024161856e-07, + "loss": 0.2275, + "step": 7032 + }, + { + "epoch": 4.553686934023286, + "grad_norm": 5.261979103088379, + "learning_rate": 2.0809861359033124e-07, + "loss": 0.2115, + "step": 7040 + }, + { + "epoch": 4.5588615782664945, + "grad_norm": 64.26140594482422, + "learning_rate": 2.0334075585571988e-07, + "loss": 0.2294, + "step": 7048 + }, + { + "epoch": 4.564036222509703, + "grad_norm": 0.9967195391654968, + "learning_rate": 1.986367904942066e-07, + "loss": 0.2011, + "step": 7056 + }, + { + "epoch": 4.569210866752911, + "grad_norm": 4.485716342926025, + "learning_rate": 1.9398677035671222e-07, + "loss": 0.2163, + "step": 7064 + }, + { + "epoch": 4.574385510996119, + "grad_norm": 2.1471633911132812, + "learning_rate": 1.8939074768806076e-07, + "loss": 0.2028, + "step": 7072 + }, + { + "epoch": 4.579560155239327, + "grad_norm": 15.670969009399414, + "learning_rate": 1.8484877412639435e-07, + "loss": 0.1971, + "step": 7080 + }, + { + "epoch": 4.584734799482535, + "grad_norm": 0.8100490570068359, + "learning_rate": 1.8036090070259026e-07, + "loss": 0.2079, + "step": 7088 + }, + { + "epoch": 4.589909443725744, + "grad_norm": 5.267513751983643, + "learning_rate": 1.7592717783969094e-07, + "loss": 0.2117, + "step": 7096 + }, + { + "epoch": 4.595084087968952, + "grad_norm": 1.9715381860733032, + "learning_rate": 1.7154765535233486e-07, + "loss": 0.2013, + "step": 7104 + }, + { + "epoch": 4.60025873221216, + "grad_norm": 2.9178831577301025, + "learning_rate": 1.6722238244619827e-07, + "loss": 0.205, + "step": 7112 + }, + { + "epoch": 4.605433376455369, + "grad_norm": 2.994739294052124, + "learning_rate": 1.6295140771744044e-07, + "loss": 0.2133, + "step": 7120 + }, + { + "epoch": 4.610608020698577, + "grad_norm": 6.191929817199707, + "learning_rate": 1.587347791521604e-07, + "loss": 0.2123, + "step": 7128 + }, + { + "epoch": 4.6157826649417855, + "grad_norm": 4.549292087554932, + "learning_rate": 1.5457254412585666e-07, + "loss": 0.2049, + "step": 7136 + }, + { + "epoch": 4.620957309184994, + "grad_norm": 2.3262717723846436, + "learning_rate": 1.5046474940289268e-07, + "loss": 0.2017, + "step": 7144 + }, + { + "epoch": 4.626131953428202, + "grad_norm": 2.1860039234161377, + "learning_rate": 1.4641144113597628e-07, + "loss": 0.2262, + "step": 7152 + }, + { + "epoch": 4.63130659767141, + "grad_norm": 15.974281311035156, + "learning_rate": 1.4241266486563654e-07, + "loss": 0.2189, + "step": 7160 + }, + { + "epoch": 4.636481241914618, + "grad_norm": 2.774641275405884, + "learning_rate": 1.3846846551971272e-07, + "loss": 0.216, + "step": 7168 + }, + { + "epoch": 4.641655886157826, + "grad_norm": 2.4483981132507324, + "learning_rate": 1.3457888741285452e-07, + "loss": 0.2103, + "step": 7176 + }, + { + "epoch": 4.646830530401035, + "grad_norm": 0.8397948145866394, + "learning_rate": 1.307439742460165e-07, + "loss": 0.2079, + "step": 7184 + }, + { + "epoch": 4.652005174644243, + "grad_norm": 1.7461739778518677, + "learning_rate": 1.2696376910597275e-07, + "loss": 0.2079, + "step": 7192 + }, + { + "epoch": 4.657179818887451, + "grad_norm": 2.657538414001465, + "learning_rate": 1.2323831446483025e-07, + "loss": 0.208, + "step": 7200 + }, + { + "epoch": 4.66235446313066, + "grad_norm": 1.223616361618042, + "learning_rate": 1.1956765217955302e-07, + "loss": 0.2003, + "step": 7208 + }, + { + "epoch": 4.667529107373868, + "grad_norm": 1.7629609107971191, + "learning_rate": 1.1595182349149026e-07, + "loss": 0.2013, + "step": 7216 + }, + { + "epoch": 4.672703751617076, + "grad_norm": 18.01417350769043, + "learning_rate": 1.1239086902591512e-07, + "loss": 0.2096, + "step": 7224 + }, + { + "epoch": 4.677878395860285, + "grad_norm": 14.346609115600586, + "learning_rate": 1.0888482879156503e-07, + "loss": 0.2015, + "step": 7232 + }, + { + "epoch": 4.683053040103493, + "grad_norm": 1.5055747032165527, + "learning_rate": 1.0543374218019708e-07, + "loss": 0.2072, + "step": 7240 + }, + { + "epoch": 4.6882276843467015, + "grad_norm": 31.9080867767334, + "learning_rate": 1.0203764796614057e-07, + "loss": 0.2101, + "step": 7248 + }, + { + "epoch": 4.69340232858991, + "grad_norm": 57.54270553588867, + "learning_rate": 9.869658430586349e-08, + "loss": 0.2121, + "step": 7256 + }, + { + "epoch": 4.698576972833118, + "grad_norm": 1.9144995212554932, + "learning_rate": 9.541058873754394e-08, + "loss": 0.2115, + "step": 7264 + }, + { + "epoch": 4.7037516170763265, + "grad_norm": 1.482640266418457, + "learning_rate": 9.217969818064832e-08, + "loss": 0.2121, + "step": 7272 + }, + { + "epoch": 4.708926261319534, + "grad_norm": 1.4592971801757812, + "learning_rate": 8.900394893551655e-08, + "loss": 0.2152, + "step": 7280 + }, + { + "epoch": 4.714100905562742, + "grad_norm": 14.836435317993164, + "learning_rate": 8.588337668295366e-08, + "loss": 0.2044, + "step": 7288 + }, + { + "epoch": 4.719275549805951, + "grad_norm": 1.025329828262329, + "learning_rate": 8.28180164838288e-08, + "loss": 0.1941, + "step": 7296 + }, + { + "epoch": 4.724450194049159, + "grad_norm": 2.9841601848602295, + "learning_rate": 7.980790277868189e-08, + "loss": 0.2092, + "step": 7304 + }, + { + "epoch": 4.729624838292367, + "grad_norm": 2.577059507369995, + "learning_rate": 7.685306938733761e-08, + "loss": 0.2256, + "step": 7312 + }, + { + "epoch": 4.734799482535576, + "grad_norm": 8.968996047973633, + "learning_rate": 7.395354950852307e-08, + "loss": 0.2021, + "step": 7320 + }, + { + "epoch": 4.739974126778784, + "grad_norm": 111.47150421142578, + "learning_rate": 7.110937571949639e-08, + "loss": 0.2084, + "step": 7328 + }, + { + "epoch": 4.745148771021992, + "grad_norm": 18.689817428588867, + "learning_rate": 6.832057997568087e-08, + "loss": 0.2121, + "step": 7336 + }, + { + "epoch": 4.750323415265201, + "grad_norm": 1.8758889436721802, + "learning_rate": 6.55871936103053e-08, + "loss": 0.2213, + "step": 7344 + }, + { + "epoch": 4.755498059508409, + "grad_norm": 7.350550651550293, + "learning_rate": 6.290924733405201e-08, + "loss": 0.2219, + "step": 7352 + }, + { + "epoch": 4.760672703751617, + "grad_norm": 1.5531758069992065, + "learning_rate": 6.028677123471105e-08, + "loss": 0.2062, + "step": 7360 + }, + { + "epoch": 4.765847347994825, + "grad_norm": 15.50673770904541, + "learning_rate": 5.771979477684375e-08, + "loss": 0.2054, + "step": 7368 + }, + { + "epoch": 4.771021992238033, + "grad_norm": 2.5559699535369873, + "learning_rate": 5.5208346801451376e-08, + "loss": 0.2015, + "step": 7376 + }, + { + "epoch": 4.776196636481242, + "grad_norm": 8.223630905151367, + "learning_rate": 5.2752455525650334e-08, + "loss": 0.2108, + "step": 7384 + }, + { + "epoch": 4.78137128072445, + "grad_norm": 6.698090553283691, + "learning_rate": 5.035214854235526e-08, + "loss": 0.2053, + "step": 7392 + }, + { + "epoch": 4.786545924967658, + "grad_norm": 1.6960841417312622, + "learning_rate": 4.8007452819968107e-08, + "loss": 0.1979, + "step": 7400 + }, + { + "epoch": 4.791720569210867, + "grad_norm": 1.6891340017318726, + "learning_rate": 4.571839470207839e-08, + "loss": 0.2112, + "step": 7408 + }, + { + "epoch": 4.796895213454075, + "grad_norm": 3.160813093185425, + "learning_rate": 4.3484999907163484e-08, + "loss": 0.2083, + "step": 7416 + }, + { + "epoch": 4.802069857697283, + "grad_norm": 1.1303309202194214, + "learning_rate": 4.130729352830154e-08, + "loss": 0.2076, + "step": 7424 + }, + { + "epoch": 4.807244501940492, + "grad_norm": 1.805098533630371, + "learning_rate": 3.9185300032889005e-08, + "loss": 0.1997, + "step": 7432 + }, + { + "epoch": 4.8124191461837, + "grad_norm": 8.593647956848145, + "learning_rate": 3.711904326236693e-08, + "loss": 0.2029, + "step": 7440 + }, + { + "epoch": 4.817593790426908, + "grad_norm": 0.8849772214889526, + "learning_rate": 3.510854643195061e-08, + "loss": 0.2274, + "step": 7448 + }, + { + "epoch": 4.822768434670117, + "grad_norm": 0.9237291216850281, + "learning_rate": 3.3153832130371486e-08, + "loss": 0.2077, + "step": 7456 + }, + { + "epoch": 4.827943078913325, + "grad_norm": 1.9521716833114624, + "learning_rate": 3.1254922319621794e-08, + "loss": 0.2049, + "step": 7464 + }, + { + "epoch": 4.833117723156533, + "grad_norm": 1.4542557001113892, + "learning_rate": 2.941183833470751e-08, + "loss": 0.2142, + "step": 7472 + }, + { + "epoch": 4.838292367399741, + "grad_norm": 4.287179946899414, + "learning_rate": 2.7624600883410235e-08, + "loss": 0.2088, + "step": 7480 + }, + { + "epoch": 4.843467011642949, + "grad_norm": 35.568233489990234, + "learning_rate": 2.589323004605293e-08, + "loss": 0.1929, + "step": 7488 + }, + { + "epoch": 4.848641655886158, + "grad_norm": 0.8782773017883301, + "learning_rate": 2.4217745275275094e-08, + "loss": 0.2058, + "step": 7496 + }, + { + "epoch": 4.853816300129366, + "grad_norm": 26.239900588989258, + "learning_rate": 2.2598165395813498e-08, + "loss": 0.2132, + "step": 7504 + }, + { + "epoch": 4.858990944372574, + "grad_norm": 124.2812728881836, + "learning_rate": 2.1034508604292904e-08, + "loss": 0.2025, + "step": 7512 + }, + { + "epoch": 4.864165588615783, + "grad_norm": 1.2773438692092896, + "learning_rate": 1.9526792469017896e-08, + "loss": 0.2107, + "step": 7520 + }, + { + "epoch": 4.869340232858991, + "grad_norm": 1.78557288646698, + "learning_rate": 1.807503392977916e-08, + "loss": 0.21, + "step": 7528 + }, + { + "epoch": 4.874514877102199, + "grad_norm": 1.3910415172576904, + "learning_rate": 1.6679249297660847e-08, + "loss": 0.2093, + "step": 7536 + }, + { + "epoch": 4.879689521345408, + "grad_norm": 1.938033938407898, + "learning_rate": 1.533945425485739e-08, + "loss": 0.2087, + "step": 7544 + }, + { + "epoch": 4.884864165588616, + "grad_norm": 6.586126804351807, + "learning_rate": 1.405566385449919e-08, + "loss": 0.2114, + "step": 7552 + }, + { + "epoch": 4.890038809831824, + "grad_norm": 1.0857036113739014, + "learning_rate": 1.2827892520481667e-08, + "loss": 0.2242, + "step": 7560 + }, + { + "epoch": 4.895213454075033, + "grad_norm": 4.703243255615234, + "learning_rate": 1.1656154047303691e-08, + "loss": 0.2132, + "step": 7568 + }, + { + "epoch": 4.90038809831824, + "grad_norm": 2.6706597805023193, + "learning_rate": 1.0540461599913287e-08, + "loss": 0.2061, + "step": 7576 + }, + { + "epoch": 4.9055627425614485, + "grad_norm": 1.3466987609863281, + "learning_rate": 9.480827713557183e-09, + "loss": 0.1993, + "step": 7584 + }, + { + "epoch": 4.910737386804657, + "grad_norm": 15.963964462280273, + "learning_rate": 8.47726429364426e-09, + "loss": 0.218, + "step": 7592 + }, + { + "epoch": 4.915912031047865, + "grad_norm": 2.146188259124756, + "learning_rate": 7.529782615608439e-09, + "loss": 0.2096, + "step": 7600 + }, + { + "epoch": 4.921086675291074, + "grad_norm": 27.721698760986328, + "learning_rate": 6.638393324782111e-09, + "loss": 0.2174, + "step": 7608 + }, + { + "epoch": 4.926261319534282, + "grad_norm": 2.0490379333496094, + "learning_rate": 5.803106436279571e-09, + "loss": 0.2079, + "step": 7616 + }, + { + "epoch": 4.93143596377749, + "grad_norm": 36.78939437866211, + "learning_rate": 5.023931334879883e-09, + "loss": 0.2093, + "step": 7624 + }, + { + "epoch": 4.936610608020699, + "grad_norm": 1.1963229179382324, + "learning_rate": 4.3008767749253e-09, + "loss": 0.2183, + "step": 7632 + }, + { + "epoch": 4.941785252263907, + "grad_norm": 2.2999484539031982, + "learning_rate": 3.6339508802213374e-09, + "loss": 0.2119, + "step": 7640 + }, + { + "epoch": 4.946959896507115, + "grad_norm": 1.0794548988342285, + "learning_rate": 3.0231611439457407e-09, + "loss": 0.2043, + "step": 7648 + }, + { + "epoch": 4.952134540750324, + "grad_norm": 18.64220428466797, + "learning_rate": 2.468514428563551e-09, + "loss": 0.211, + "step": 7656 + }, + { + "epoch": 4.957309184993532, + "grad_norm": 3.2397313117980957, + "learning_rate": 1.9700169657510537e-09, + "loss": 0.2075, + "step": 7664 + }, + { + "epoch": 4.96248382923674, + "grad_norm": 1.9977914094924927, + "learning_rate": 1.5276743563258367e-09, + "loss": 0.2065, + "step": 7672 + }, + { + "epoch": 4.967658473479949, + "grad_norm": 25.819950103759766, + "learning_rate": 1.141491570182396e-09, + "loss": 0.2122, + "step": 7680 + }, + { + "epoch": 4.972833117723156, + "grad_norm": 1.0757025480270386, + "learning_rate": 8.114729462377346e-10, + "loss": 0.2269, + "step": 7688 + }, + { + "epoch": 4.9780077619663645, + "grad_norm": 3.0153450965881348, + "learning_rate": 5.376221923830694e-10, + "loss": 0.2125, + "step": 7696 + }, + { + "epoch": 4.983182406209573, + "grad_norm": 1.0717418193817139, + "learning_rate": 3.1994238543997526e-10, + "loss": 0.2222, + "step": 7704 + }, + { + "epoch": 4.988357050452781, + "grad_norm": 1.5106489658355713, + "learning_rate": 1.5843597112707997e-10, + "loss": 0.2066, + "step": 7712 + }, + { + "epoch": 4.99353169469599, + "grad_norm": 9.808573722839355, + "learning_rate": 5.3104764033973245e-11, + "loss": 0.2174, + "step": 7720 + }, + { + "epoch": 4.998706338939198, + "grad_norm": 0.9095555543899536, + "learning_rate": 3.949947598447246e-12, + "loss": 0.2172, + "step": 7728 + }, + { + "epoch": 5.0, + "step": 7730, + "total_flos": 4.245357569192755e+16, + "train_loss": 0.24287281370255492, + "train_runtime": 14386.9094, + "train_samples_per_second": 68.735, + "train_steps_per_second": 0.537 + } + ], + "logging_steps": 8, + "max_steps": 7730, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 387, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 4.245357569192755e+16, + "train_batch_size": 16, + "trial_name": null, + "trial_params": null +}